README-zh.md
@@ -1,4 +1,7 @@
# Crawlab

![](https://img.shields.io/badge/版本-v0.1-blue.svg)

Celery-based distributed spider management platform that supports multiple programming languages and spider frameworks.

[View Demo](http://139.129.230.98:8080)
@@ -48,19 +51,20 @@ npm run serve

## Screenshots

#### Home Page

![home](./docs/img/screenshot-home.png)

![home](https://user-images.githubusercontent.com/3393101/58044829-cc4a0600-7b6b-11e9-8ccf-066bf78bca58.png)

#### Spider List

![spider-list](./docs/img/screenshot-spiders.png)

![spider-list](https://user-images.githubusercontent.com/3393101/58044830-cc4a0600-7b6b-11e9-9f2a-296ccd3b5a47.png)

#### Spider Detail - Overview

![spider-detail-overview](./docs/img/screenshot-spider-detail-overview.png)

![spider-detail-overview](https://user-images.githubusercontent.com/3393101/58044831-cc4a0600-7b6b-11e9-9abd-0b4a1e2f53c7.png)

#### Task Detail - Results

![task-detail-results](./docs/img/screenshot-task-detail-results.png)

![task-detail-results](https://user-images.githubusercontent.com/3393101/58044832-cce29c80-7b6b-11e9-89a0-3d3a4a1b4f52.png)

## Usage Workflow
README.md
@@ -1,5 +1,7 @@
# Crawlab

![](https://img.shields.io/badge/version-v0.1-blue.svg)

Celery-based web crawler admin platform for managing distributed web spiders regardless of languages and frameworks.

[Demo](http://139.129.230.98:8080)
@@ -49,19 +51,20 @@ npm run serve

## Screenshot

#### Home Page

![home](./docs/img/screenshot-home.png)

![home](https://user-images.githubusercontent.com/3393101/58044829-cc4a0600-7b6b-11e9-8ccf-066bf78bca58.png)

#### Spider List

![spider-list](./docs/img/screenshot-spiders.png)

![spider-list](https://user-images.githubusercontent.com/3393101/58044830-cc4a0600-7b6b-11e9-9f2a-296ccd3b5a47.png)

#### Spider Detail - Overview

![spider-detail-overview](./docs/img/screenshot-spider-detail-overview.png)

![spider-detail-overview](https://user-images.githubusercontent.com/3393101/58044831-cc4a0600-7b6b-11e9-9abd-0b4a1e2f53c7.png)

#### Task Detail - Results

![task-detail-results](./docs/img/screenshot-task-detail-results.png)

![task-detail-results](https://user-images.githubusercontent.com/3393101/58044832-cce29c80-7b6b-11e9-89a0-3d3a4a1b4f52.png)

## Architecture
@@ -13,7 +13,7 @@ import tasks.spider
import tasks.deploy

if __name__ == '__main__':
    if 'win' in sys.platform:
    if 'win32' in sys.platform:
        celery_app.start(argv=['tasks', 'worker', '-P', 'eventlet', '-E', '-l', 'INFO'])
    else:
        celery_app.start(argv=['tasks', 'worker', '-E', '-l', 'INFO'])
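For context on the one-line change above: `sys.platform` is `'darwin'` on macOS, so the old substring test `'win' in sys.platform` also matched macOS and forced the eventlet pool where it is not needed; the narrower `'win32'` check only matches Windows. A quick illustration:

```python
import sys

# The old check misfires on macOS because 'darwin' contains 'win';
# the new check only matches the Windows platform string.
print('win' in 'darwin')     # True  -> old check treated macOS as Windows
print('win32' in 'darwin')   # False -> new check does not
print('win32' in 'win32')    # True  -> still matches Windows
print(sys.platform)          # e.g. 'win32', 'darwin', or 'linux'
```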
@@ -1,7 +1,6 @@
aiohttp==3.5.4
amqp==2.4.2
aniso8601==6.0.0
Appium-Python-Client==0.40
APScheduler==3.6.0
asn1crypto==0.24.0
async-timeout==3.0.1
@@ -59,7 +58,6 @@ pytz==2018.9
queuelib==1.5.0
redis==3.2.1
redisbeat==1.1.4
reppy==0.4.12
requests==2.21.0
Scrapy==1.6.0
selenium==3.141.0
@@ -479,7 +479,9 @@ class SpiderApi(BaseApi):
            }, 400

        try:
            r = requests.get(spider['start_url'])
            r = requests.get(spider['start_url'], headers={
                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
            })
        except Exception as err:
            return {
                'status': 'ok',
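The hunk above pins a browser-like User-Agent on the `start_url` request, presumably so sites that reject the default `python-requests` User-Agent still respond. A minimal standalone sketch of the same idea using a shared `requests.Session` (the `fetch` helper and the session object are illustrative, not part of this change):

```python
import requests

# Shared session that always sends a browser-like User-Agent,
# so callers do not have to repeat the header on every request.
session = requests.Session()
session.headers.update({
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
})


def fetch(url: str, timeout: float = 10.0) -> requests.Response:
    """Fetch a URL with the shared headers; raise on HTTP error statuses."""
    r = session.get(url, timeout=timeout)
    r.raise_for_status()
    return r
```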
@@ -1,5 +1,11 @@
import json
from datetime import datetime
import os
import sys

try:
    from _signal import SIGKILL
except ImportError:
    pass

import requests
from bson import ObjectId
@@ -10,7 +16,6 @@ from db.manager import db_manager
from routes.base import BaseApi
from utils import jsonify
from utils.spider import get_spider_col_fields
from utils.log import other


class TaskApi(BaseApi):
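The guarded `from _signal import SIGKILL` exists because Windows has no SIGKILL, so an unconditional import would fail there. A hedged alternative sketch that keeps a single cross-platform name (the `KILL_SIGNAL` constant is illustrative, not what this module does):

```python
import signal

# SIGKILL only exists on Unix-like platforms; fall back to SIGTERM on Windows
# so the rest of the module can reference one name (illustrative sketch).
KILL_SIGNAL = getattr(signal, 'SIGKILL', signal.SIGTERM)
```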
@@ -189,10 +194,21 @@ class TaskApi(BaseApi):
        :param id:
        :return:
        """
        task = db_manager.get('tasks', id=id)
        celery_app.control.revoke(id, terminate=True)
        db_manager.update_one('tasks', id=id, values={
            'status': TaskStatus.REVOKED
        })

        # kill process
        if task.get('pid'):
            pid = task.get('pid')
            if 'win32' in sys.platform:
                os.popen('taskkill /pid:' + str(pid))
            else:
                # unix system
                os.kill(pid, SIGKILL)

        return {
            'id': id,
            'status': 'ok',
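The revoke handler above now also kills the spider's OS process: `taskkill` on Windows, `SIGKILL` elsewhere. A self-contained sketch of that cross-platform kill logic (the `kill_process` name and the `/F` flag are illustrative, not what the handler uses):

```python
import os
import signal
import sys


def kill_process(pid: int) -> None:
    """Best-effort termination of a spider process by PID (illustrative sketch)."""
    if sys.platform == 'win32':
        # Windows has no SIGKILL; taskkill forces termination of the PID.
        os.system('taskkill /F /PID {}'.format(pid))
    else:
        try:
            os.kill(pid, signal.SIGKILL)
        except ProcessLookupError:
            pass  # the process already exited
```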
crawlab/spiders/spiders/utils.py
@@ -0,0 +1,55 @@
import itertools
import re


def generate_urls(base_url: str) -> str:
    url = base_url

    # number range list
    list_arr = []
    for i, res in enumerate(re.findall(r'{(\d+),(\d+)}', base_url)):
        try:
            _min = int(res[0])
            _max = int(res[1])
        except ValueError as err:
            raise ValueError(f'{base_url} is not a valid URL pattern')

        # list
        _list = range(_min, _max + 1)

        # key
        _key = f'n{i}'

        # append list and key
        list_arr.append((_list, _key))

        # replace url placeholder with key
        url = url.replace('{' + res[0] + ',' + res[1] + '}', '{' + _key + '}', 1)

    # string list
    for i, res in enumerate(re.findall(r'\[([\w\-,]+)\]', base_url)):
        # list
        _list = res.split(',')

        # key
        _key = f's{i}'

        # append list and key
        list_arr.append((_list, _key))

        # replace url placeholder with key
        url = url.replace('[' + ','.join(_list) + ']', '{' + _key + '}', 1)

    # combine together
    _list_arr = []
    for res in itertools.product(*map(lambda x: x[0], list_arr)):
        _url = url
        for _arr, _rep in zip(list_arr, res):
            _list, _key = _arr
            _url = _url.replace('{' + _key + '}', str(_rep), 1)
        yield _url

#
# base_url = 'http://[baidu,ali].com/page-{1,10}-[1,2,3]'
# for url in generate_urls(base_url):
#     print(url)
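A short usage sketch of the new `generate_urls` helper, assuming the module is importable as shown (the import path, example domain, and path segments are illustrative): `{1,2}` expands to the number range 1..2, `[news,blog]` to the listed strings, and every combination is yielded.

```python
from spiders.utils import generate_urls  # import path assumed

for url in generate_urls('http://example.com/[news,blog]/page-{1,2}'):
    print(url)

# Expected output (order follows itertools.product over the collected lists):
#   http://example.com/news/page-1
#   http://example.com/blog/page-1
#   http://example.com/news/page-2
#   http://example.com/blog/page-2
```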
docs/.DS_Store (binary)
(11 binary image diffs follow; only the images' previous sizes, 47 KiB to 542 KiB, were captured in this view)
@@ -63,7 +63,8 @@ async def request_site_home_page(url: str, semophore):

async def run():
    semaphore = asyncio.Semaphore(50)  # cap concurrency at 50
    sites = [site for site in col.find({'rank': {'$lte': 5000}})]
    # sites = [site for site in col.find({'rank': {'$lte': 5000}})]
    sites = [site for site in col.find({'rank': {'$lte': 100}})]
    urls = [site['_id'] for site in sites]
    to_get = [request_site(url, semaphore) for url in urls]
    to_get += [request_site_home_page(url, semaphore) for url in urls]
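The hunk above caps concurrency with `asyncio.Semaphore(50)` and narrows the site query from rank ≤ 5000 to rank ≤ 100. A self-contained sketch of the semaphore-bounded fetch pattern with aiohttp (the URLs, helper names, and the 50-task cap are illustrative):

```python
import asyncio

import aiohttp


async def fetch(session: aiohttp.ClientSession, url: str,
                semaphore: asyncio.Semaphore) -> int:
    # The semaphore ensures that at most 50 requests are in flight at once.
    async with semaphore:
        async with session.get(url) as resp:
            await resp.read()
            return resp.status


async def main(urls):
    semaphore = asyncio.Semaphore(50)
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(
            *(fetch(session, url, semaphore) for url in urls),
            return_exceptions=True,
        )


# statuses = asyncio.get_event_loop().run_until_complete(
#     main(['http://example.com', 'http://example.org']))
```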