Merge pull request #39 from tikazyq/develop

Develop
This commit is contained in:
Marvin Zhang
2019-05-29 06:28:24 +08:00
committed by GitHub
20 changed files with 94 additions and 15 deletions

View File

@@ -1,4 +1,7 @@
# Crawlab
![](https://img.shields.io/badge/版本-v0.2.1-blue.svg)
Celery-based distributed spider management platform, supporting multiple programming languages and spider frameworks.
[View Demo](http://139.129.230.98:8080)
@@ -48,19 +51,20 @@ npm run serve
## Screenshots
#### Home Page
![home](./docs/img/screenshot-home.png)
![](https://user-gold-cdn.xitu.io/2019/3/6/169524d4c7f117f7?imageView2/0/w/1280/h/960/format/webp/ignore-error/1)
#### Spider List
![spider-list](./docs/img/screenshot-spiders.png)
![](https://user-gold-cdn.xitu.io/2019/3/6/169524daf9c8ccef?imageView2/0/w/1280/h/960/format/webp/ignore-error/1)
#### Spider Detail - Overview
![spider-list](./docs/img/screenshot-spider-detail-overview.png)
![](https://user-gold-cdn.xitu.io/2019/3/6/169524e0794d6be1?imageView2/0/w/1280/h/960/format/webp/ignore-error/1)
#### Task Detail - Results
![spider-list](./docs/img/screenshot-task-detail-results.png)
![](https://user-gold-cdn.xitu.io/2019/3/6/169524e4064c7f0a?imageView2/0/w/1280/h/960/format/webp/ignore-error/1)
## Usage Workflow

View File

@@ -1,5 +1,7 @@
# Crawlab
![](https://img.shields.io/badge/version-v0.2.1-blue.svg)
Celery-based admin platform for managing distributed web spiders, regardless of language or framework.
[Demo](http://139.129.230.98:8080)
@@ -49,19 +51,20 @@ npm run serve
## Screenshots
#### Home Page
![home](./docs/img/screenshot-home.png)
![](https://user-gold-cdn.xitu.io/2019/3/6/169524d4c7f117f7?imageView2/0/w/1280/h/960/format/webp/ignore-error/1)
#### Spider List
![spider-list](./docs/img/screenshot-spiders.png)
![](https://user-gold-cdn.xitu.io/2019/3/6/169524daf9c8ccef?imageView2/0/w/1280/h/960/format/webp/ignore-error/1)
#### Spider Detail - Overview
![spider-list](./docs/img/screenshot-spider-detail-overview.png)
![](https://user-gold-cdn.xitu.io/2019/3/6/169524e0794d6be1?imageView2/0/w/1280/h/960/format/webp/ignore-error/1)
#### Task Detail - Results
![spider-list](./docs/img/screenshot-task-detail-results.png)
![](https://user-gold-cdn.xitu.io/2019/3/6/169524e4064c7f0a?imageView2/0/w/1280/h/960/format/webp/ignore-error/1)
## Architecture

View File

@@ -13,7 +13,7 @@ import tasks.spider
import tasks.deploy
if __name__ == '__main__':
    if 'win' in sys.platform:
    if 'win32' in sys.platform:
        celery_app.start(argv=['tasks', 'worker', '-P', 'eventlet', '-E', '-l', 'INFO'])
    else:
        celery_app.start(argv=['tasks', 'worker', '-E', '-l', 'INFO'])
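An aside on the change above: the looser check `'win' in sys.platform` also matches macOS, whose platform string is `'darwin'`, which is presumably why it was tightened to `'win32'`; the `-P eventlet` pool is a common workaround since Celery 4 no longer officially supports its default prefork pool on Windows. A minimal sketch illustrating the substring issue (not part of the commit):

import sys

# macOS reports sys.platform == 'darwin', which contains the substring 'win',
# so the old check would wrongly take the Windows branch there.
print('win' in 'darwin')      # True
print('win32' in 'darwin')    # False
# On Windows itself sys.platform == 'win32', so both checks match as intended.
print(sys.platform)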

View File

@@ -1,7 +1,6 @@
aiohttp==3.5.4
amqp==2.4.2
aniso8601==6.0.0
Appium-Python-Client==0.40
APScheduler==3.6.0
asn1crypto==0.24.0
async-timeout==3.0.1
@@ -59,7 +58,6 @@ pytz==2018.9
queuelib==1.5.0
redis==3.2.1
redisbeat==1.1.4
reppy==0.4.12
requests==2.21.0
Scrapy==1.6.0
selenium==3.141.0

View File

@@ -479,7 +479,9 @@ class SpiderApi(BaseApi):
            }, 400
        try:
            r = requests.get(spider['start_url'])
            r = requests.get(spider['start_url'], headers={
                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
            })
        except Exception as err:
            return {
                'status': 'ok',
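For context on the added header: without it, requests sends a default User-Agent such as `python-requests/2.21.0`, which some sites block or serve differently. A small sketch of the same idea (the target URL below is a placeholder, not from the commit):

import requests

# The library's default identification string:
print(requests.utils.default_headers()['User-Agent'])   # e.g. 'python-requests/2.21.0'

# Overriding it with a browser-like string, as the route above now does:
browser_ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
r = requests.get('http://example.com', headers={'user-agent': browser_ua})
print(r.status_code)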

View File

@@ -1,5 +1,11 @@
import json
from datetime import datetime
import os
import sys
try:
    from _signal import SIGKILL
except ImportError:
    pass
import requests
from bson import ObjectId
@@ -10,7 +16,6 @@ from db.manager import db_manager
from routes.base import BaseApi
from utils import jsonify
from utils.spider import get_spider_col_fields
from utils.log import other
class TaskApi(BaseApi):
@@ -189,10 +194,21 @@ class TaskApi(BaseApi):
        :param id:
        :return:
        """
        task = db_manager.get('tasks', id=id)
        celery_app.control.revoke(id, terminate=True)
        db_manager.update_one('tasks', id=id, values={
            'status': TaskStatus.REVOKED
        })
        # kill process
        if task.get('pid'):
            pid = task.get('pid')
            if 'win32' in sys.platform:
                os.popen('taskkill /pid:' + str(pid))
            else:
                # unix system
                os.kill(pid, SIGKILL)
        return {
            'id': id,
            'status': 'ok',
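On the guarded `from _signal import SIGKILL` above: SIGKILL is not defined on Windows, hence the `ImportError` fallback and the separate `taskkill` branch. A self-contained sketch of the same cross-platform idea (the helper name `kill_process` is hypothetical, not part of the commit):

import os
import signal
import subprocess
import sys

def kill_process(pid: int) -> None:
    if sys.platform == 'win32':
        # Windows has no SIGKILL; terminate via taskkill instead.
        subprocess.run(['taskkill', '/F', '/PID', str(pid)], check=False)
    else:
        # On Unix-like systems, SIGKILL forcibly terminates the process.
        os.kill(pid, signal.SIGKILL)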

View File

@@ -0,0 +1,55 @@
import itertools
import re


def generate_urls(base_url: str) -> str:
    url = base_url

    # number range list
    list_arr = []
    for i, res in enumerate(re.findall(r'{(\d+),(\d+)}', base_url)):
        try:
            _min = int(res[0])
            _max = int(res[1])
        except ValueError as err:
            raise ValueError(f'{base_url} is not a valid URL pattern')

        # list
        _list = range(_min, _max + 1)

        # key
        _key = f'n{i}'

        # append list and key
        list_arr.append((_list, _key))

        # replace url placeholder with key
        url = url.replace('{' + res[0] + ',' + res[1] + '}', '{' + _key + '}', 1)

    # string list
    for i, res in enumerate(re.findall(r'\[([\w\-,]+)\]', base_url)):
        # list
        _list = res.split(',')

        # key
        _key = f's{i}'

        # append list and key
        list_arr.append((_list, _key))

        # replace url placeholder with key
        url = url.replace('[' + ','.join(_list) + ']', '{' + _key + '}', 1)

    # combine together
    _list_arr = []
    for res in itertools.product(*map(lambda x: x[0], list_arr)):
        _url = url
        for _arr, _rep in zip(list_arr, res):
            _list, _key = _arr
            _url = _url.replace('{' + _key + '}', str(_rep), 1)
        yield _url

#
# base_url = 'http://[baidu,ali].com/page-{1,10}-[1,2,3]'
# for url in generate_urls(base_url):
#     print(url)
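A usage sketch for `generate_urls` (the pattern below is invented for illustration): a `{min,max}` placeholder expands to a numeric range and a `[a,b,...]` placeholder to a string list, with `itertools.product` combining them.

for url in generate_urls('http://example.com/list-{1,2}-[a,b]'):
    print(url)

# Expected output, in itertools.product order:
# http://example.com/list-1-a
# http://example.com/list-1-b
# http://example.com/list-2-a
# http://example.com/list-2-b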

BIN docs/.DS_Store (vendored): binary file not shown.

11 further binary image files (previews not shown; 'Before' sizes ranged from 47 KiB to 542 KiB).

View File

@@ -63,7 +63,8 @@ async def request_site_home_page(url: str, semophore):
async def run():
    semaphore = asyncio.Semaphore(50)  # limit concurrency to 50
    sites = [site for site in col.find({'rank': {'$lte': 5000}})]
    # sites = [site for site in col.find({'rank': {'$lte': 5000}})]
    sites = [site for site in col.find({'rank': {'$lte': 100}})]
    urls = [site['_id'] for site in sites]
    to_get = [request_site(url, semaphore) for url in urls]
    to_get += [request_site_home_page(url, semaphore) for url in urls]
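The hunk above bounds request concurrency with an `asyncio.Semaphore`. A minimal self-contained sketch of that pattern using aiohttp (which is already in requirements.txt); the URLs and function names here are invented for illustration, not taken from the file:

import asyncio
import aiohttp

async def fetch(url: str, semaphore: asyncio.Semaphore) -> int:
    async with semaphore:  # at most N requests in flight at once
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as resp:
                return resp.status

async def main():
    semaphore = asyncio.Semaphore(50)  # limit concurrency to 50
    urls = ['http://example.com'] * 5
    statuses = await asyncio.gather(*[fetch(u, semaphore) for u in urls])
    print(statuses)

asyncio.get_event_loop().run_until_complete(main())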