diff --git a/crawlab/bin/run_worker.py b/crawlab/bin/run_worker.py
index 69edcbc6..07f36396 100644
--- a/crawlab/bin/run_worker.py
+++ b/crawlab/bin/run_worker.py
@@ -13,7 +13,7 @@ import tasks.spider
 import tasks.deploy
 
 if __name__ == '__main__':
-    if 'win' in sys.platform:
+    if 'win32' in sys.platform:
         celery_app.start(argv=['tasks', 'worker', '-P', 'eventlet', '-E', '-l', 'INFO'])
     else:
         celery_app.start(argv=['tasks', 'worker', '-E', '-l', 'INFO'])
diff --git a/crawlab/routes/spiders.py b/crawlab/routes/spiders.py
index 243f11a1..5fabde73 100644
--- a/crawlab/routes/spiders.py
+++ b/crawlab/routes/spiders.py
@@ -479,7 +479,9 @@ class SpiderApi(BaseApi):
             }, 400
 
         try:
-            r = requests.get(spider['start_url'])
+            r = requests.get(spider['start_url'], headers={
+                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
+            })
         except Exception as err:
             return {
                 'status': 'ok',
diff --git a/crawlab/routes/tasks.py b/crawlab/routes/tasks.py
index e0cdd0e7..e531e81d 100644
--- a/crawlab/routes/tasks.py
+++ b/crawlab/routes/tasks.py
@@ -1,4 +1,7 @@
 import json
+import os
+import sys
+from _signal import SIGINT, SIGKILL
 from datetime import datetime
 
 import requests
@@ -189,10 +192,21 @@ class TaskApi(BaseApi):
         :param id:
         :return:
         """
+        task = db_manager.get('tasks', id=id)
         celery_app.control.revoke(id, terminate=True)
         db_manager.update_one('tasks', id=id, values={
             'status': TaskStatus.REVOKED
         })
+
+        # kill process
+        if task.get('pid'):
+            pid = task.get('pid')
+            if 'win32' in sys.platform:
+                os.popen('taskkill /pid:' + str(pid))
+            else:
+                # unix system
+                os.kill(pid, SIGKILL)
+
         return {
             'id': id,
             'status': 'ok',
diff --git a/crawlab/spiders/spiders/utils.py b/crawlab/spiders/spiders/utils.py
new file mode 100644
index 00000000..0fc60188
--- /dev/null
+++ b/crawlab/spiders/spiders/utils.py
@@ -0,0 +1,55 @@
+import itertools
+import re
+
+
+def generate_urls(base_url: str) -> str:
+    url = base_url
+
+    # number range list
+    list_arr = []
+    for i, res in enumerate(re.findall(r'{(\d+),(\d+)}', base_url)):
+        try:
+            _min = int(res[0])
+            _max = int(res[1])
+        except ValueError as err:
+            raise ValueError(f'{base_url} is not a valid URL pattern')
+
+        # list
+        _list = range(_min, _max + 1)
+
+        # key
+        _key = f'n{i}'
+
+        # append list and key
+        list_arr.append((_list, _key))
+
+        # replace url placeholder with key
+        url = url.replace('{' + res[0] + ',' + res[1] + '}', '{' + _key + '}', 1)
+
+    # string list
+    for i, res in enumerate(re.findall(r'\[([\w\-,]+)\]', base_url)):
+        # list
+        _list = res.split(',')
+
+        # key
+        _key = f's{i}'
+
+        # append list and key
+        list_arr.append((_list, _key))
+
+        # replace url placeholder with key
+        url = url.replace('[' + ','.join(_list) + ']', '{' + _key + '}', 1)
+
+    # combine together
+    _list_arr = []
+    for res in itertools.product(*map(lambda x: x[0], list_arr)):
+        _url = url
+        for _arr, _rep in zip(list_arr, res):
+            _list, _key = _arr
+            _url = _url.replace('{' + _key + '}', str(_rep), 1)
+        yield _url
+
+#
+# base_url = 'http://[baidu,ali].com/page-{1,10}-[1,2,3]'
+# for url in generate_urls(base_url):
+#     print(url)
diff --git a/spiders/sites_inspector/sites_inspector.py b/spiders/sites_inspector/sites_inspector.py
index 72d2184b..b6e264c7 100644
--- a/spiders/sites_inspector/sites_inspector.py
+++ b/spiders/sites_inspector/sites_inspector.py
@@ -63,7 +63,8 @@ async def request_site_home_page(url: str, semophore):
 
 async def run():
     semaphore = asyncio.Semaphore(50)  # limit concurrency to 50
-    sites = [site for site in col.find({'rank': {'$lte': 5000}})]
+    # sites = [site for site in col.find({'rank': {'$lte': 5000}})]
+    sites = [site for site in col.find({'rank': {'$lte': 100}})]
     urls = [site['_id'] for site in sites]
     to_get = [request_site(url, semaphore) for url in urls]
     to_get += [request_site_home_page(url, semaphore) for url in urls]
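
Note: the new stop logic in crawlab/routes/tasks.py imports SIGKILL at module level (from _signal import SIGINT, SIGKILL). SIGKILL is only defined on Unix-like platforms, so that import fails on the Windows path the code otherwise handles. A minimal sketch of a portable variant; the helper name and the taskkill flags below are assumptions for illustration, not part of this diff:

    import os
    import subprocess
    import sys
    import signal


    def kill_task_process(pid: int) -> None:
        # Hypothetical helper mirroring the revoke/kill branch in routes/tasks.py.
        if sys.platform == 'win32':
            # /T terminates the child process tree, /F forces termination.
            subprocess.run(['taskkill', '/PID', str(pid), '/T', '/F'])
        else:
            # signal.SIGKILL exists only on Unix, so it is looked up on this branch only.
            os.kill(pid, signal.SIGKILL)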
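
Note: crawlab/spiders/spiders/utils.py adds generate_urls, which expands a URL pattern where {min,max} is an inclusive numeric range and [a,b,c] is a list of string alternatives, yielding one URL per combination. A small usage sketch; the import path and the example pattern are illustrative assumptions:

    # Assumes utils.py is importable as spiders.utils; adjust to the actual package layout.
    from spiders.utils import generate_urls

    base_url = 'http://example.com/category-[books,music]/page-{1,3}'
    for url in generate_urls(base_url):
        print(url)
    # Yields 6 URLs, from .../category-books/page-1 through .../category-music/page-3.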