diff --git a/README-zh.md b/README-zh.md
index 42f850ac..0c3e8e2f 100644
--- a/README-zh.md
+++ b/README-zh.md
@@ -1,4 +1,7 @@
 # Crawlab
+
+![](https://img.shields.io/badge/版本-v0.2.1-blue.svg)
+
 基于Celery的爬虫分布式爬虫管理平台,支持多种编程语言以及多种爬虫框架.
 
 [查看演示 Demo](http://139.129.230.98:8080)
@@ -48,19 +51,20 @@ npm run serve
 ## 截图
 
 #### 首页
 
-![home](./docs/img/screenshot-home.png)
+
+![](https://user-gold-cdn.xitu.io/2019/3/6/169524d4c7f117f7?imageView2/0/w/1280/h/960/format/webp/ignore-error/1)
 
 #### 爬虫列表
 
-![spider-list](./docs/img/screenshot-spiders.png)
+![](https://user-gold-cdn.xitu.io/2019/3/6/169524daf9c8ccef?imageView2/0/w/1280/h/960/format/webp/ignore-error/1)
 
 #### 爬虫详情 - 概览
 
-![spider-list](./docs/img/screenshot-spider-detail-overview.png)
+![](https://user-gold-cdn.xitu.io/2019/3/6/169524e0794d6be1?imageView2/0/w/1280/h/960/format/webp/ignore-error/1)
 
 #### 任务详情 - 抓取结果
 
-![spider-list](./docs/img/screenshot-task-detail-results.png)
+![](https://user-gold-cdn.xitu.io/2019/3/6/169524e4064c7f0a?imageView2/0/w/1280/h/960/format/webp/ignore-error/1)
 
 ## 使用流程
diff --git a/README.md b/README.md
index eeea04bb..aff91cba 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,7 @@
 # Crawlab
 
+![](https://img.shields.io/badge/version-v0.2.1-blue.svg)
+
 Celery-based web crawler admin platform for managing distributed web spiders regardless of languages and frameworks.
 
 [Demo](http://139.129.230.98:8080)
@@ -49,19 +51,20 @@ npm run serve
 ## Screenshot
 
 #### Home Page
 
-![home](./docs/img/screenshot-home.png)
+
+![](https://user-gold-cdn.xitu.io/2019/3/6/169524d4c7f117f7?imageView2/0/w/1280/h/960/format/webp/ignore-error/1)
 
 #### Spider List
 
-![spider-list](./docs/img/screenshot-spiders.png)
+![](https://user-gold-cdn.xitu.io/2019/3/6/169524daf9c8ccef?imageView2/0/w/1280/h/960/format/webp/ignore-error/1)
 
 #### Spider Detail - Overview
 
-![spider-list](./docs/img/screenshot-spider-detail-overview.png)
+![](https://user-gold-cdn.xitu.io/2019/3/6/169524e0794d6be1?imageView2/0/w/1280/h/960/format/webp/ignore-error/1)
 
 #### Task Detail - Results
 
-![spider-list](./docs/img/screenshot-task-detail-results.png)
+![](https://user-gold-cdn.xitu.io/2019/3/6/169524e4064c7f0a?imageView2/0/w/1280/h/960/format/webp/ignore-error/1)
 
 ## Architecture
diff --git a/crawlab/bin/run_worker.py b/crawlab/bin/run_worker.py
index 69edcbc6..07f36396 100644
--- a/crawlab/bin/run_worker.py
+++ b/crawlab/bin/run_worker.py
@@ -13,7 +13,7 @@ import tasks.spider
 import tasks.deploy
 
 if __name__ == '__main__':
-    if 'win' in sys.platform:
+    if 'win32' in sys.platform:
         celery_app.start(argv=['tasks', 'worker', '-P', 'eventlet', '-E', '-l', 'INFO'])
     else:
         celery_app.start(argv=['tasks', 'worker', '-E', '-l', 'INFO'])
diff --git a/crawlab/requirements.txt b/crawlab/requirements.txt
index 11af6f69..8b13446c 100644
--- a/crawlab/requirements.txt
+++ b/crawlab/requirements.txt
@@ -1,7 +1,6 @@
 aiohttp==3.5.4
 amqp==2.4.2
 aniso8601==6.0.0
-Appium-Python-Client==0.40
 APScheduler==3.6.0
 asn1crypto==0.24.0
 async-timeout==3.0.1
@@ -59,7 +58,6 @@ pytz==2018.9
 queuelib==1.5.0
 redis==3.2.1
 redisbeat==1.1.4
-reppy==0.4.12
 requests==2.21.0
 Scrapy==1.6.0
 selenium==3.141.0
diff --git a/crawlab/routes/spiders.py b/crawlab/routes/spiders.py
index 243f11a1..5fabde73 100644
--- a/crawlab/routes/spiders.py
+++ b/crawlab/routes/spiders.py
@@ -479,7 +479,9 @@ class SpiderApi(BaseApi):
             }, 400
 
         try:
-            r = requests.get(spider['start_url'])
+            r = requests.get(spider['start_url'], headers={
+                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
+            })
         except Exception as err:
             return {
                 'status': 'ok',
diff --git a/crawlab/routes/tasks.py b/crawlab/routes/tasks.py
index e0cdd0e7..5ae7648b 100644
--- a/crawlab/routes/tasks.py
+++ b/crawlab/routes/tasks.py
@@ -1,5 +1,11 @@
 import json
-from datetime import datetime
+import os
+import sys
+
+try:
+    from _signal import SIGKILL
+except ImportError:
+    pass
 
 import requests
 from bson import ObjectId
@@ -10,7 +16,6 @@ from db.manager import db_manager
 from routes.base import BaseApi
 from utils import jsonify
 from utils.spider import get_spider_col_fields
-from utils.log import other
 
 
 class TaskApi(BaseApi):
@@ -189,10 +194,21 @@ class TaskApi(BaseApi):
         :param id:
         :return:
         """
+        task = db_manager.get('tasks', id=id)
         celery_app.control.revoke(id, terminate=True)
         db_manager.update_one('tasks', id=id, values={
             'status': TaskStatus.REVOKED
         })
+
+        # kill process
+        if task.get('pid'):
+            pid = task.get('pid')
+            if 'win32' in sys.platform:
+                os.popen('taskkill /pid:' + str(pid))
+            else:
+                # unix system
+                os.kill(pid, SIGKILL)
+
         return {
             'id': id,
             'status': 'ok',
diff --git a/crawlab/spiders/spiders/utils.py b/crawlab/spiders/spiders/utils.py
new file mode 100644
index 00000000..0fc60188
--- /dev/null
+++ b/crawlab/spiders/spiders/utils.py
@@ -0,0 +1,55 @@
+import itertools
+import re
+
+
+def generate_urls(base_url: str) -> str:
+    url = base_url
+
+    # number range list
+    list_arr = []
+    for i, res in enumerate(re.findall(r'{(\d+),(\d+)}', base_url)):
+        try:
+            _min = int(res[0])
+            _max = int(res[1])
+        except ValueError as err:
+            raise ValueError(f'{base_url} is not a valid URL pattern')
+
+        # list
+        _list = range(_min, _max + 1)
+
+        # key
+        _key = f'n{i}'
+
+        # append list and key
+        list_arr.append((_list, _key))
+
+        # replace url placeholder with key
+        url = url.replace('{' + res[0] + ',' + res[1] + '}', '{' + _key + '}', 1)
+
+    # string list
+    for i, res in enumerate(re.findall(r'\[([\w\-,]+)\]', base_url)):
+        # list
+        _list = res.split(',')
+
+        # key
+        _key = f's{i}'
+
+        # append list and key
+        list_arr.append((_list, _key))
+
+        # replace url placeholder with key
+        url = url.replace('[' + ','.join(_list) + ']', '{' + _key + '}', 1)
+
+    # combine together
+    _list_arr = []
+    for res in itertools.product(*map(lambda x: x[0], list_arr)):
+        _url = url
+        for _arr, _rep in zip(list_arr, res):
+            _list, _key = _arr
+            _url = _url.replace('{' + _key + '}', str(_rep), 1)
+        yield _url
+
+#
+# base_url = 'http://[baidu,ali].com/page-{1,10}-[1,2,3]'
+# for url in generate_urls(base_url):
+#     print(url)
diff --git a/docs/.DS_Store b/docs/.DS_Store
index 328c974d..9cbf3ccd 100644
Binary files a/docs/.DS_Store and b/docs/.DS_Store differ
diff --git a/docs/img/crawlab-architecture 2.png b/docs/img/crawlab-architecture 2.png
deleted file mode 100644
index fcac460f..00000000
Binary files a/docs/img/crawlab-architecture 2.png and /dev/null differ
diff --git a/docs/img/screenshot-home.png b/docs/img/screenshot-home.png
deleted file mode 100644
index 650dca47..00000000
Binary files a/docs/img/screenshot-home.png and /dev/null differ
diff --git a/docs/img/screenshot-node-detail.png b/docs/img/screenshot-node-detail.png
deleted file mode 100644
index 3d323172..00000000
Binary files a/docs/img/screenshot-node-detail.png and /dev/null differ
diff --git a/docs/img/screenshot-nodes.png b/docs/img/screenshot-nodes.png
deleted file mode 100644
index 88fc7489..00000000
Binary files a/docs/img/screenshot-nodes.png and /dev/null differ
diff --git a/docs/img/screenshot-spider-detail-overview.png b/docs/img/screenshot-spider-detail-overview.png
deleted file mode 100644
index 8745c451..00000000
Binary files a/docs/img/screenshot-spider-detail-overview.png and /dev/null differ
diff --git a/docs/img/screenshot-spider-import.png b/docs/img/screenshot-spider-import.png
deleted file mode 100644
index d2ca7c17..00000000
Binary files a/docs/img/screenshot-spider-import.png and /dev/null differ
diff --git a/docs/img/screenshot-spiders.png b/docs/img/screenshot-spiders.png
deleted file mode 100644
index b23310d7..00000000
Binary files a/docs/img/screenshot-spiders.png and /dev/null differ
diff --git a/docs/img/screenshot-task-detail-log.png b/docs/img/screenshot-task-detail-log.png
deleted file mode 100644
index 7e3ee387..00000000
Binary files a/docs/img/screenshot-task-detail-log.png and /dev/null differ
diff --git a/docs/img/screenshot-task-detail-overview.png b/docs/img/screenshot-task-detail-overview.png
deleted file mode 100644
index fbb339e8..00000000
Binary files a/docs/img/screenshot-task-detail-overview.png and /dev/null differ
diff --git a/docs/img/screenshot-task-detail-results.png b/docs/img/screenshot-task-detail-results.png
deleted file mode 100644
index 8623fb33..00000000
Binary files a/docs/img/screenshot-task-detail-results.png and /dev/null differ
diff --git a/docs/img/screenshot-tasks.png b/docs/img/screenshot-tasks.png
deleted file mode 100644
index ab5585da..00000000
Binary files a/docs/img/screenshot-tasks.png and /dev/null differ
diff --git a/spiders/sites_inspector/sites_inspector.py b/spiders/sites_inspector/sites_inspector.py
index 72d2184b..b6e264c7 100644
--- a/spiders/sites_inspector/sites_inspector.py
+++ b/spiders/sites_inspector/sites_inspector.py
@@ -63,7 +63,8 @@ async def request_site_home_page(url: str, semophore):
 
 async def run():
     semaphore = asyncio.Semaphore(50)  # 限制并发量为50
-    sites = [site for site in col.find({'rank': {'$lte': 5000}})]
+    # sites = [site for site in col.find({'rank': {'$lte': 5000}})]
+    sites = [site for site in col.find({'rank': {'$lte': 100}})]
    urls = [site['_id'] for site in sites]
     to_get = [request_site(url, semaphore) for url in urls]
     to_get += [request_site_home_page(url, semaphore) for url in urls]
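
A quick usage sketch of the new `generate_urls()` helper added in `crawlab/spiders/spiders/utils.py`, expanding on the commented-out example at the bottom of that file. The pattern below is made up for illustration: `{min,max}` expands to an inclusive numeric range and `[a,b,...]` to each comma-separated token. The bare `utils` import path is an assumption about how the module ends up on `sys.path`.

```python
from utils import generate_urls  # assumed importable; file lives at crawlab/spiders/spiders/utils.py

# '{1,3}' becomes each of 1..3 (inclusive); '[a,b]' becomes each listed token.
for url in generate_urls('http://example.com/page-{1,3}-[a,b]'):
    print(url)

# Prints six URLs; the string-list values vary fastest because
# generate_urls() appends all {min,max} groups to the product first:
# http://example.com/page-1-a
# http://example.com/page-1-b
# http://example.com/page-2-a
# http://example.com/page-2-b
# http://example.com/page-3-a
# http://example.com/page-3-b
```

Since the function yields rather than returns, its `-> str` annotation would read more accurately as `Iterator[str]`.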
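
The reworked stop endpoint in `crawlab/routes/tasks.py` now revokes the Celery task and then hard-kills the worker process recorded in the task's `pid` field. Below is a minimal standalone sketch of just the kill step, under our own naming: `kill_task_process` is hypothetical, the import comes from the public `signal` module rather than the internal `_signal`, and the `taskkill` invocation spells out the documented flags.

```python
import os
import signal
import sys


def kill_task_process(pid: int) -> None:
    """Best-effort, cross-platform hard kill of a spider process by PID."""
    if sys.platform == 'win32':
        # Windows has no SIGKILL; shell out to taskkill instead
        # (/F forces termination, /PID selects the target process).
        os.popen('taskkill /F /PID ' + str(pid))
    else:
        # POSIX: SIGKILL cannot be caught or ignored by the target.
        os.kill(pid, signal.SIGKILL)
```

Note that `signal.SIGKILL` is only looked up inside the non-Windows branch, which never runs on Windows, so the import-time `try/except ImportError` guard used in the patch becomes unnecessary in this form.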
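
On the `run_worker.py` change: the old `'win' in sys.platform` test also matched macOS, whose `sys.platform` is `'darwin'` and therefore contains `'win'`, so Mac workers were incorrectly started with the eventlet pool (a common workaround for Celery's default prefork pool not supporting Windows). A small illustration of the safer exact comparison, as a hypothetical helper:

```python
import sys


def is_windows() -> bool:
    # 'win' in sys.platform is a trap: 'darwin' (macOS) contains 'win'.
    # 'win32' in sys.platform works, but an exact match is the usual idiom;
    # sys.platform stays 'win32' even on 64-bit Windows.
    return sys.platform == 'win32'
```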