Merge pull request #39 from tikazyq/develop

Develop
This commit is contained in:
Marvin Zhang
2019-05-29 06:28:24 +08:00
committed by GitHub
20 changed files with 94 additions and 15 deletions

View File

@@ -1,4 +1,7 @@
# Crawlab
![](https://img.shields.io/badge/版本-v0.2.1-blue.svg)
Celery-based distributed spider management platform, supporting multiple programming languages and spider frameworks.
[View Demo](http://139.129.230.98:8080)
@@ -48,19 +51,20 @@ npm run serve
## Screenshots
#### Home Page
![home](./docs/img/screenshot-home.png)
![](https://user-gold-cdn.xitu.io/2019/3/6/169524d4c7f117f7?imageView2/0/w/1280/h/960/format/webp/ignore-error/1)
#### Spider List
![spider-list](./docs/img/screenshot-spiders.png)
![](https://user-gold-cdn.xitu.io/2019/3/6/169524daf9c8ccef?imageView2/0/w/1280/h/960/format/webp/ignore-error/1)
#### Spider Detail - Overview
![spider-list](./docs/img/screenshot-spider-detail-overview.png)
![](https://user-gold-cdn.xitu.io/2019/3/6/169524e0794d6be1?imageView2/0/w/1280/h/960/format/webp/ignore-error/1)
#### Task Detail - Results
![spider-list](./docs/img/screenshot-task-detail-results.png)
![](https://user-gold-cdn.xitu.io/2019/3/6/169524e4064c7f0a?imageView2/0/w/1280/h/960/format/webp/ignore-error/1)
## Usage Workflow

View File

@@ -1,5 +1,7 @@
# Crawlab
![](https://img.shields.io/badge/version-v0.2.1-blue.svg)
Celery-based admin platform for managing distributed web spiders, regardless of language or framework.
[Demo](http://139.129.230.98:8080)
@@ -49,19 +51,20 @@ npm run serve
## Screenshots
#### Home Page
![home](./docs/img/screenshot-home.png)
![](https://user-gold-cdn.xitu.io/2019/3/6/169524d4c7f117f7?imageView2/0/w/1280/h/960/format/webp/ignore-error/1)
#### Spider List
![spider-list](./docs/img/screenshot-spiders.png)
![](https://user-gold-cdn.xitu.io/2019/3/6/169524daf9c8ccef?imageView2/0/w/1280/h/960/format/webp/ignore-error/1)
#### Spider Detail - Overview
![spider-list](./docs/img/screenshot-spider-detail-overview.png)
![](https://user-gold-cdn.xitu.io/2019/3/6/169524e0794d6be1?imageView2/0/w/1280/h/960/format/webp/ignore-error/1)
#### Task Detail - Results
![spider-list](./docs/img/screenshot-task-detail-results.png)
![](https://user-gold-cdn.xitu.io/2019/3/6/169524e4064c7f0a?imageView2/0/w/1280/h/960/format/webp/ignore-error/1)
## Architecture

View File

@@ -13,7 +13,7 @@ import tasks.spider
import tasks.deploy
if __name__ == '__main__':
    if 'win' in sys.platform:
    if 'win32' in sys.platform:
        celery_app.start(argv=['tasks', 'worker', '-P', 'eventlet', '-E', '-l', 'INFO'])
    else:
        celery_app.start(argv=['tasks', 'worker', '-E', '-l', 'INFO'])
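An aside on the change above: the looser check `'win' in sys.platform` also matches macOS, whose platform string is `'darwin'`, which is presumably why it was tightened to `'win32'`; the `-P eventlet` pool is a common workaround since Celery 4 no longer officially supports its default prefork pool on Windows. A minimal sketch illustrating the substring issue (not part of the commit):

import sys

# macOS reports sys.platform == 'darwin', which contains the substring 'win',
# so the old check would wrongly take the Windows branch there.
print('win' in 'darwin')      # True
print('win32' in 'darwin')    # False
# On Windows itself sys.platform == 'win32', so both checks match as intended.
print(sys.platform)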

View File

@@ -1,7 +1,6 @@
aiohttp==3.5.4
amqp==2.4.2
aniso8601==6.0.0
Appium-Python-Client==0.40
APScheduler==3.6.0
asn1crypto==0.24.0
async-timeout==3.0.1
@@ -59,7 +58,6 @@ pytz==2018.9
queuelib==1.5.0
redis==3.2.1
redisbeat==1.1.4
reppy==0.4.12
requests==2.21.0
Scrapy==1.6.0
selenium==3.141.0

View File

@@ -479,7 +479,9 @@ class SpiderApi(BaseApi):
            }, 400
        try:
            r = requests.get(spider['start_url'])
            r = requests.get(spider['start_url'], headers={
                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
            })
        except Exception as err:
            return {
                'status': 'ok',
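For context on the added header: without it, requests sends a default User-Agent such as `python-requests/2.21.0`, which some sites block or serve differently. A small sketch of the same idea (the target URL below is a placeholder, not from the commit):

import requests

# The library's default identification string:
print(requests.utils.default_headers()['User-Agent'])   # e.g. 'python-requests/2.21.0'

# Overriding it with a browser-like string, as the route above now does:
browser_ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
r = requests.get('http://example.com', headers={'user-agent': browser_ua})
print(r.status_code)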

View File

@@ -1,5 +1,11 @@
import json
from datetime import datetime
import os
import sys
try:
    from _signal import SIGKILL
except ImportError:
    pass
import requests
from bson import ObjectId
@@ -10,7 +16,6 @@ from db.manager import db_manager
from routes.base import BaseApi
from utils import jsonify
from utils.spider import get_spider_col_fields
from utils.log import other
class TaskApi(BaseApi):
@@ -189,10 +194,21 @@ class TaskApi(BaseApi):
        :param id:
        :return:
        """
        task = db_manager.get('tasks', id=id)
        celery_app.control.revoke(id, terminate=True)
        db_manager.update_one('tasks', id=id, values={
            'status': TaskStatus.REVOKED
        })
        # kill process
        if task.get('pid'):
            pid = task.get('pid')
            if 'win32' in sys.platform:
                os.popen('taskkill /pid:' + str(pid))
            else:
                # unix system
                os.kill(pid, SIGKILL)
        return {
            'id': id,
            'status': 'ok',
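On the guarded `from _signal import SIGKILL` above: SIGKILL is not defined on Windows, hence the `ImportError` fallback and the separate `taskkill` branch. A self-contained sketch of the same cross-platform idea (the helper name `kill_process` is hypothetical, not part of the commit):

import os
import signal
import subprocess
import sys

def kill_process(pid: int) -> None:
    if sys.platform == 'win32':
        # Windows has no SIGKILL; terminate via taskkill instead.
        subprocess.run(['taskkill', '/F', '/PID', str(pid)], check=False)
    else:
        # On Unix-like systems, SIGKILL forcibly terminates the process.
        os.kill(pid, signal.SIGKILL)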

View File

@@ -0,0 +1,55 @@
import itertools
import re


def generate_urls(base_url: str) -> str:
    url = base_url

    # number range list
    list_arr = []
    for i, res in enumerate(re.findall(r'{(\d+),(\d+)}', base_url)):
        try:
            _min = int(res[0])
            _max = int(res[1])
        except ValueError as err:
            raise ValueError(f'{base_url} is not a valid URL pattern')

        # list
        _list = range(_min, _max + 1)

        # key
        _key = f'n{i}'

        # append list and key
        list_arr.append((_list, _key))

        # replace url placeholder with key
        url = url.replace('{' + res[0] + ',' + res[1] + '}', '{' + _key + '}', 1)

    # string list
    for i, res in enumerate(re.findall(r'\[([\w\-,]+)\]', base_url)):
        # list
        _list = res.split(',')

        # key
        _key = f's{i}'

        # append list and key
        list_arr.append((_list, _key))

        # replace url placeholder with key
        url = url.replace('[' + ','.join(_list) + ']', '{' + _key + '}', 1)

    # combine together
    _list_arr = []
    for res in itertools.product(*map(lambda x: x[0], list_arr)):
        _url = url
        for _arr, _rep in zip(list_arr, res):
            _list, _key = _arr
            _url = _url.replace('{' + _key + '}', str(_rep), 1)
        yield _url

#
# base_url = 'http://[baidu,ali].com/page-{1,10}-[1,2,3]'
# for url in generate_urls(base_url):
#     print(url)
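A usage sketch for `generate_urls` (the pattern below is invented for illustration): a `{min,max}` placeholder expands to a numeric range and a `[a,b,...]` placeholder to a string list, with `itertools.product` combining them.

for url in generate_urls('http://example.com/list-{1,2}-[a,b]'):
    print(url)

# Expected output, in itertools.product order:
# http://example.com/list-1-a
# http://example.com/list-1-b
# http://example.com/list-2-a
# http://example.com/list-2-b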

BIN docs/.DS_Store (vendored): binary file not shown.

11 further binary image files (previews not shown; 'Before' sizes ranged from 47 KiB to 542 KiB).

View File

@@ -63,7 +63,8 @@ async def request_site_home_page(url: str, semophore):
async def run():
    semaphore = asyncio.Semaphore(50)  # limit concurrency to 50
    sites = [site for site in col.find({'rank': {'$lte': 5000}})]
    # sites = [site for site in col.find({'rank': {'$lte': 5000}})]
    sites = [site for site in col.find({'rank': {'$lte': 100}})]
    urls = [site['_id'] for site in sites]
    to_get = [request_site(url, semaphore) for url in urls]
    to_get += [request_site_home_page(url, semaphore) for url in urls]
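The hunk above bounds request concurrency with an `asyncio.Semaphore`. A minimal self-contained sketch of that pattern using aiohttp (which is already in requirements.txt); the URLs and function names here are invented for illustration, not taken from the file:

import asyncio
import aiohttp

async def fetch(url: str, semaphore: asyncio.Semaphore) -> int:
    async with semaphore:  # at most N requests in flight at once
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as resp:
                return resp.status

async def main():
    semaphore = asyncio.Semaphore(50)  # limit concurrency to 50
    urls = ['http://example.com'] * 5
    statuses = await asyncio.gather(*[fetch(u, semaphore) for u in urls])
    print(statuses)

asyncio.get_event_loop().run_until_complete(main())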