Mirror of https://github.com/crawlab-team/crawlab.git (synced 2026-01-21 17:21:09 +01:00)
added generate_urls function
@@ -13,7 +13,7 @@ import tasks.spider
 import tasks.deploy

 if __name__ == '__main__':
-    if 'win' in sys.platform:
+    if 'win32' in sys.platform:
         celery_app.start(argv=['tasks', 'worker', '-P', 'eventlet', '-E', '-l', 'INFO'])
     else:
         celery_app.start(argv=['tasks', 'worker', '-E', '-l', 'INFO'])
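The platform check is tightened because `sys.platform` is 'darwin' on macOS, which contains the substring 'win'; the old test therefore selected the Windows-only eventlet pool on Macs as well, presumably kept for Windows because Celery's default prefork pool is unreliable there. A minimal sketch illustrating the difference (standard library only, nothing repo-specific):

import sys

# sys.platform is 'win32' on Windows, 'darwin' on macOS, 'linux' on Linux.
# 'win' is a substring of 'darwin', so the old check misfired on macOS:
assert 'win' in 'darwin'            # True -- macOS would have been treated as Windows
assert 'win32' not in 'darwin'      # the new check matches only real Windows

is_windows = sys.platform == 'win32'   # equivalent, slightly stricter spelling
print('windows' if is_windows else 'posix-like')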
@@ -479,7 +479,9 @@ class SpiderApi(BaseApi):
             }, 400

         try:
-            r = requests.get(spider['start_url'])
+            r = requests.get(spider['start_url'], headers={
+                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
+            })
         except Exception as err:
             return {
                 'status': 'ok',
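The default User-Agent that requests sends (python-requests/x.y.z) is blocked or served differently by some sites, so the commit now fetches start_url with a desktop Chrome user agent. A small sketch of the same idea with the header pulled out into a constant; the constant and helper names are illustrative, not from the repo:

import requests

# Illustrative constant -- the diff inlines this string instead.
DEFAULT_HEADERS = {
    'user-agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
    )
}

def fetch_start_url(start_url: str, timeout: float = 10.0) -> requests.Response:
    """Fetch a spider's start URL with a browser-like User-Agent (illustrative helper)."""
    return requests.get(start_url, headers=DEFAULT_HEADERS, timeout=timeout)

# Example:
# r = fetch_start_url('https://example.com')
# print(r.status_code)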
@@ -1,4 +1,7 @@
 import json
+import os
+import sys
+from _signal import SIGINT, SIGKILL
 from datetime import datetime

 import requests
@@ -189,10 +192,21 @@ class TaskApi(BaseApi):
         :param id:
         :return:
         """
+        task = db_manager.get('tasks', id=id)
         celery_app.control.revoke(id, terminate=True)
         db_manager.update_one('tasks', id=id, values={
             'status': TaskStatus.REVOKED
         })
+
+        # kill process
+        if task.get('pid'):
+            pid = task.get('pid')
+            if 'win32' in sys.platform:
+                os.popen('taskkill /pid:' + str(pid))
+            else:
+                # unix system
+                os.kill(pid, SIGKILL)
+
         return {
             'id': id,
             'status': 'ok',
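Revoking the Celery task does not stop a spider process that is already running, so the handler now also kills the recorded pid: taskkill via os.popen on Windows, SIGKILL via os.kill elsewhere (the commit imports SIGKILL from the private _signal module; the public signal module exposes the same constant on POSIX). A self-contained sketch of the same cross-platform kill step, using subprocess.run with explicit taskkill flags instead of os.popen; the helper name is illustrative:

import os
import signal
import subprocess
import sys

def kill_process(pid: int) -> None:
    """Force-kill a process by PID (illustrative helper, not from the repo)."""
    if sys.platform == 'win32':
        # /F forces termination, /T also terminates child processes.
        subprocess.run(['taskkill', '/F', '/T', '/PID', str(pid)], check=False)
    else:
        os.kill(pid, signal.SIGKILL)  # SIGKILL only exists on POSIX platforms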
crawlab/spiders/spiders/utils.py (new file, +50 lines)
@@ -0,0 +1,50 @@
+import itertools
+import re
+
+
+def generate_urls(base_url: str) -> str:
+    url = base_url
+
+    # number range list
+    list_arr = []
+    for i, res in enumerate(re.findall(r'{(\d+),(\d+)}', base_url)):
+        try:
+            _min = int(res[0])
+            _max = int(res[1])
+        except ValueError as err:
+            raise ValueError(f'{base_url} is not a valid URL pattern')
+
+        # list
+        _list = range(_min, _max + 1)
+
+        # key
+        _key = f'n{i}'
+
+        # append list and key
+        list_arr.append((_list, _key))
+
+        # replace url placeholder with key
+        url = url.replace('{' + res[0] + ',' + res[1] + '}', '{' + _key + '}', 1)
+
+    # string list
+    for i, res in enumerate(re.findall(r'\[(.+)\]', base_url)):
+        # list
+        _list = res.split(',')
+
+        # key
+        _key = f's{i}'
+
+        # append list and key
+        list_arr.append((_list, _key))
+
+        # replace url placeholder with key
+        url = url.replace('[' + ','.join(_list) + ']', '{' + _key + '}', 1)
+
+    # combine together
+    _list_arr = []
+    for res in itertools.product(*map(lambda x: x[0], list_arr)):
+        _url = url
+        for _arr, _rep in zip(list_arr, res):
+            _list, _key = _arr
+            _url = _url.replace('{' + _key + '}', str(_rep), 1)
+        yield _url
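generate_urls expands a URL pattern into concrete URLs: a {a,b} placeholder becomes the inclusive numeric range a..b, an [x,y,z] placeholder becomes the listed strings, and the function yields every combination of all placeholders (it is a generator, despite the -> str annotation). A usage sketch with a made-up pattern and an assumed import path for the new module:

from spiders.utils import generate_urls   # assumed import path for crawlab/spiders/spiders/utils.py

# Hypothetical pattern: one string-list placeholder and one inclusive numeric range.
pattern = 'https://example.com/[news,blog]/page/{1,3}'

for url in generate_urls(pattern):
    print(url)

# Expected output -- itertools.product order, where the later-added (string) placeholder varies fastest:
# https://example.com/news/page/1
# https://example.com/blog/page/1
# https://example.com/news/page/2
# https://example.com/blog/page/2
# https://example.com/news/page/3
# https://example.com/blog/page/3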
@@ -63,7 +63,8 @@ async def request_site_home_page(url: str, semophore):

 async def run():
     semaphore = asyncio.Semaphore(50)  # limit concurrency to 50
-    sites = [site for site in col.find({'rank': {'$lte': 5000}})]
+    # sites = [site for site in col.find({'rank': {'$lte': 5000}})]
+    sites = [site for site in col.find({'rank': {'$lte': 100}})]
     urls = [site['_id'] for site in sites]
     to_get = [request_site(url, semaphore) for url in urls]
     to_get += [request_site_home_page(url, semaphore) for url in urls]
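The benchmark is narrowed from the 5,000 highest-ranked sites to the top 100, while asyncio.Semaphore(50) keeps at most 50 requests in flight at a time. A minimal, self-contained sketch of that throttling pattern; the real script builds its URL list from a MongoDB collection (col.find) and issues real HTTP requests, both replaced with stand-ins here:

import asyncio

async def request_site(url: str, semaphore: asyncio.Semaphore) -> None:
    """Stand-in for the real fetch coroutine; only the throttling pattern matters."""
    async with semaphore:           # at most 50 coroutines get past this point at once
        await asyncio.sleep(0.1)    # placeholder for the actual HTTP request
        print('done', url)

async def run() -> None:
    semaphore = asyncio.Semaphore(50)                          # limit concurrency to 50
    urls = [f'https://example.com/{i}' for i in range(100)]    # stand-in for the MongoDB query
    await asyncio.gather(*(request_site(url, semaphore) for url in urls))

if __name__ == '__main__':
    asyncio.run(run())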