Merge remote-tracking branch 'origin/develop' into develop

Marvin Zhang
2019-05-29 14:45:30 +08:00
28 changed files with 108 additions and 225 deletions

View File

@@ -1,4 +1,7 @@
# Crawlab
![](https://img.shields.io/badge/版本-v0.2.1-blue.svg)
A Celery-based distributed spider management platform that supports multiple programming languages and spider frameworks.
[View Demo](http://139.129.230.98:8080)
@@ -48,19 +51,20 @@ npm run serve
## Screenshots
#### Home Page
![home](./docs/img/screenshot-home.png)
![](https://user-gold-cdn.xitu.io/2019/3/6/169524d4c7f117f7?imageView2/0/w/1280/h/960/format/webp/ignore-error/1)
#### Spider List
![spider-list](./docs/img/screenshot-spiders.png)
![](https://user-gold-cdn.xitu.io/2019/3/6/169524daf9c8ccef?imageView2/0/w/1280/h/960/format/webp/ignore-error/1)
#### Spider Detail - Overview
![spider-detail-overview](./docs/img/screenshot-spider-detail-overview.png)
![](https://user-gold-cdn.xitu.io/2019/3/6/169524e0794d6be1?imageView2/0/w/1280/h/960/format/webp/ignore-error/1)
#### Task Detail - Results
![task-detail-results](./docs/img/screenshot-task-detail-results.png)
![](https://user-gold-cdn.xitu.io/2019/3/6/169524e4064c7f0a?imageView2/0/w/1280/h/960/format/webp/ignore-error/1)
## Usage Workflow
@@ -170,6 +174,9 @@ Crawlab is easy to use and general enough to be applied to almost any mainstream
- [ ] Login & User Management
- [ ] Global Search
If you like Crawlab or would like to contribute to it, please add the author's WeChat account tikazyq1 and note "Crawlab"; the author will invite you into the discussion group.
If you feel Crawlab helps your daily development or your company, please add the author's WeChat account tikazyq1 and note "Crawlab"; the author will invite you into the discussion group. Alternatively, you can scan the Alipay QR code below to leave a tip, which helps the author upgrade the team's collaboration software or buy a cup of coffee.
![](https://user-gold-cdn.xitu.io/2019/3/15/169814cbd5e600e9?imageslim)
<p align="center">
<img src="https://user-gold-cdn.xitu.io/2019/3/15/169814cbd5e600e9?imageslim" height="360">
<img src="https://raw.githubusercontent.com/tikazyq/crawlab/master/docs/img/payment.jpg" height="360">
</p>

View File

@@ -1,5 +1,7 @@
# Crawlab
![](https://img.shields.io/badge/version-v0.2.1-blue.svg)
Celery-based web crawler admin platform for managing distributed web spiders, regardless of language or framework.
[Demo](http://139.129.230.98:8080)
@@ -49,19 +51,20 @@ npm run serve
## Screenshots
#### Home Page
![home](./docs/img/screenshot-home.png)
![](https://user-gold-cdn.xitu.io/2019/3/6/169524d4c7f117f7?imageView2/0/w/1280/h/960/format/webp/ignore-error/1)
#### Spider List
![spider-list](./docs/img/screenshot-spiders.png)
![](https://user-gold-cdn.xitu.io/2019/3/6/169524daf9c8ccef?imageView2/0/w/1280/h/960/format/webp/ignore-error/1)
#### Spider Detail - Overview
![spider-detail-overview](./docs/img/screenshot-spider-detail-overview.png)
![](https://user-gold-cdn.xitu.io/2019/3/6/169524e0794d6be1?imageView2/0/w/1280/h/960/format/webp/ignore-error/1)
#### Task Detail - Results
![task-detail-results](./docs/img/screenshot-task-detail-results.png)
![](https://user-gold-cdn.xitu.io/2019/3/6/169524e4064c7f0a?imageView2/0/w/1280/h/960/format/webp/ignore-error/1)
## Architecture
@@ -166,6 +169,9 @@ Crawlab is easy to use, general enough to adapt spiders in any language and any
- [ ] Login & User Management
- [ ] General Search
If you like Crawlab or would like to contribute to it, please add the author's WeChat account (tikazyq1), noting "Crawlab", to join the discussion group.
If you feel Crawlab could benefit your daily work or your company, please add the author's WeChat account (tikazyq1), noting "Crawlab", to join the discussion group. Alternatively, you can scan the Alipay QR code below to leave a tip, which helps us upgrade our collaboration software or buy a cup of coffee.
![](https://user-gold-cdn.xitu.io/2019/3/15/169814cbd5e600e9?imageslim)
<p align="center">
<img src="https://user-gold-cdn.xitu.io/2019/3/15/169814cbd5e600e9?imageslim" height="360">
<img src="https://raw.githubusercontent.com/tikazyq/crawlab/master/docs/img/payment.jpg" height="360">
</p>

View File

@@ -1,10 +1,7 @@
import os
import subprocess
import sys
from multiprocessing import Process
import click
from celery import Celery
from flask import Flask
from flask_cors import CORS
from flask_restful import Api

View File

@@ -1,125 +0,0 @@
import os
import subprocess
import sys
from multiprocessing import Process
import click
from flask import Flask
from flask_cors import CORS
from flask_restful import Api
from routes.schedules import ScheduleApi
from tasks.scheduler import scheduler
file_dir = os.path.dirname(os.path.realpath(__file__))
root_path = os.path.abspath(os.path.join(file_dir, '.'))
sys.path.append(root_path)
from config import FLASK_HOST, FLASK_PORT, PROJECT_LOGS_FOLDER, BROKER_URL
from constants.manage import ActionType
from routes.deploys import DeployApi
from routes.files import FileApi
from routes.nodes import NodeApi
from routes.spiders import SpiderApi, SpiderImportApi, SpiderManageApi
from routes.stats import StatsApi
from routes.tasks import TaskApi
from tasks.celery import celery_app
from utils.log import other
# flask app instance
app = Flask(__name__)
app.config.from_object('config')
# init flask api instance
api = Api(app)
# cors support
CORS(app, supports_credentials=True)
# reference api routes
api.add_resource(NodeApi,
                 '/api/nodes',
                 '/api/nodes/<string:id>',
                 '/api/nodes/<string:id>/<string:action>')
api.add_resource(SpiderImportApi,
                 '/api/spiders/import/<string:platform>')
api.add_resource(SpiderManageApi,
                 '/api/spiders/manage/<string:action>')
api.add_resource(SpiderApi,
                 '/api/spiders',
                 '/api/spiders/<string:id>',
                 '/api/spiders/<string:id>/<string:action>')
api.add_resource(DeployApi,
                 '/api/deploys',
                 '/api/deploys/<string:id>',
                 '/api/deploys/<string:id>/<string:action>')
api.add_resource(TaskApi,
                 '/api/tasks',
                 '/api/tasks/<string:id>',
                 '/api/tasks/<string:id>/<string:action>'
                 )
api.add_resource(FileApi,
                 '/api/files',
                 '/api/files/<string:action>')
api.add_resource(StatsApi,
                 '/api/stats',
                 '/api/stats/<string:action>')
api.add_resource(ScheduleApi,
                 '/api/schedules',
                 '/api/schedules/<string:id>')
def run_app():
    # create folder if it does not exist
    if not os.path.exists(PROJECT_LOGS_FOLDER):
        os.makedirs(PROJECT_LOGS_FOLDER)
    # run app instance
    app.run(host=FLASK_HOST, port=FLASK_PORT)
def run_flower():
    p = subprocess.Popen(['celery', 'flower', '-b', BROKER_URL], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    for line in iter(p.stdout.readline, b''):
        if line.decode('utf-8') != '':
            other.info(line.decode('utf-8'))
def run_worker():
    if sys.platform.startswith('win'):
        celery_app.start(argv=['tasks', 'worker', '-P', 'eventlet', '-E', '-l', 'INFO'])
    else:
        celery_app.start(argv=['tasks', 'worker', '-E', '-l', 'INFO'])
def run_scheduler():
    scheduler.run()
@click.command()
@click.argument('action', type=click.Choice([ActionType.APP,
                                             ActionType.FLOWER,
                                             ActionType.WORKER,
                                             ActionType.SCHEDULER,
                                             ActionType.RUN_ALL]))
def main(action):
    if action == ActionType.APP:
        run_app()
    elif action == ActionType.FLOWER:
        run_flower()
    elif action == ActionType.WORKER:
        run_worker()
    elif action == ActionType.SCHEDULER:
        run_scheduler()
    elif action == ActionType.RUN_ALL:
        p_flower = Process(target=run_flower)
        p_flower.start()
        p_app = Process(target=run_app)
        p_app.start()
        p_worker = Process(target=run_worker)
        p_worker.start()
        p_scheduler = Process(target=run_scheduler)
        p_scheduler.start()


if __name__ == '__main__':
    main()

View File

@@ -1,6 +0,0 @@
from mongoengine import *
import datetime
class BaseModel(Document):
    create_ts = DateTimeField(default=datetime.datetime.utcnow)

View File

@@ -1,10 +0,0 @@
from mongoengine import *
from model.base import BaseModel
class Deploy(BaseModel):
    _id = ObjectIdField()
    spider_id = ObjectIdField()
    version = IntField()
    node_id = ObjectIdField()

View File

@@ -1,12 +0,0 @@
from mongoengine import *
from model.base import BaseModel
class Node(BaseModel):
    _id = ObjectIdField()
    ip = StringField()
    port = IntField()
    name = StringField()
    description = StringField()
    status = IntField()

View File

@@ -1,12 +0,0 @@
from mongoengine import *
from model.base import BaseModel
class Spider(BaseModel):
    _id = ObjectIdField()
    name = StringField()
    cmd = StringField()
    src = StringField()
    type = IntField()
    lang = IntField()

View File

@@ -1,9 +0,0 @@
from mongoengine import *
from model.base import BaseModel
class Task(BaseModel):
    _id = ObjectIdField()
    deploy_id = ObjectIdField()
    file_path = StringField()

View File

@@ -57,10 +57,8 @@ python-dateutil==2.8.0
pytz==2018.9
queuelib==1.5.0
redis==3.2.1
redisbeat==1.1.4
requests==2.21.0
Scrapy==1.6.0
selenium==3.141.0
service-identity==18.1.0
six==1.12.0
soupsieve==1.9.1

View File

@@ -466,9 +466,8 @@ class SpiderApi(BaseApi):
        detail_fields = json.loads(args.detail_fields)
        db_manager.update_one(col_name='spiders', id=id, values={'detail_fields': detail_fields})

    def preview_crawl(self, id: str):
        spider = db_manager.get(col_name='spiders', id=id)

    @staticmethod
    def _get_html(spider) -> etree.Element:
        if spider['type'] != SpiderType.CONFIGURABLE:
            return {
                'status': 'ok',
@@ -509,6 +508,26 @@ class SpiderApi(BaseApi):
        # get html parse tree
        sel = etree.HTML(r.content)
        return sel

    @staticmethod
    def _get_text_child_tags(sel):
        tags = []
        for tag in sel.iter():
            if tag.text is not None:
                tags.append(tag)
        return tags

    def preview_crawl(self, id: str):
        spider = db_manager.get(col_name='spiders', id=id)

        # get html parse tree
        sel = self._get_html(spider)

        # when error happens, return
        if isinstance(sel, tuple):
            return sel

        # parse fields
        if spider['crawl_type'] == CrawlType.LIST:
            if spider.get('item_selector') is None:
@@ -525,6 +544,7 @@ class SpiderApi(BaseApi):
            }
        elif spider['crawl_type'] == CrawlType.DETAIL:
            # TODO: detail page preview
            pass
        elif spider['crawl_type'] == CrawlType.LIST_DETAIL:
@@ -546,6 +566,54 @@ class SpiderApi(BaseApi):
                'items': data
            }
    def extract_fields(self, id: str):
        """
        Extract list fields from a web page
        :param id:
        :return:
        """
        spider = db_manager.get(col_name='spiders', id=id)

        # get html parse tree
        sel = self._get_html(spider)

        # when error happens, return
        if isinstance(sel, tuple):
            return sel

        list_tag_list = []
        threshold = 10

        # iterate all child nodes in a top-down direction
        for tag in sel.iter():
            # get child tags
            child_tags = tag.getchildren()

            if len(child_tags) < threshold:
                # if number of child tags is below threshold, skip
                continue
            else:
                # have one or more child tags
                child_tags_set = set(map(lambda x: x.tag, child_tags))

                # if there are more than 1 tag names, skip
                if len(child_tags_set) > 1:
                    continue

                # add as list tag
                list_tag_list.append(tag)

        # find the list tag with the most child text tags
        _tag_list = []
        _max_tag = None
        _max_num = 0
        for tag in list_tag_list:
            _child_text_tags = self._get_text_child_tags(tag[0])
            if len(_child_text_tags) > _max_num:
                _max_tag = tag
                _max_num = len(_child_text_tags)

        # TODO: extract list fields
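The `# TODO: extract list fields` step above stops once `_max_tag` holds the list container whose first item carries the most text-bearing children. Below is a minimal, self-contained sketch of one way that step could continue, turning the children of a list item into rough field selectors. It reuses the lxml tooling already present in this file, but `build_field_selectors`, the `field_<n>` naming, and the class-based selector format are illustrative assumptions, not Crawlab's actual implementation.

```python
from lxml import etree


def build_field_selectors(item):
    # derive one rough CSS-like selector per text-bearing descendant of a list item
    fields = []
    for idx, child in enumerate(item.iter()):
        if child.text is None or not child.text.strip():
            continue
        # prefer a class-based selector when the tag carries a class attribute,
        # otherwise fall back to the bare tag name
        cls = child.get('class')
        selector = '{}.{}'.format(child.tag, cls.split()[0]) if cls else child.tag
        fields.append({'name': 'field_{}'.format(idx), 'selector': selector})
    return fields


if __name__ == '__main__':
    html = '<ul><li><h3 class="title">Item 1</h3><span class="price">9.9</span></li></ul>'
    sel = etree.HTML(html)
    first_item = sel.xpath('//ul')[0][0]  # first child of the detected list tag
    print(build_field_selectors(first_item))
    # [{'name': 'field_1', 'selector': 'h3.title'}, {'name': 'field_2', 'selector': 'span.price'}]
```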
class SpiderImportApi(Resource):
    __doc__ = """

View File

@@ -1,24 +0,0 @@
from setuptools import setup, find_packages

with open("README.md", "r") as fh:
    long_description = fh.read()

with open('requirements.txt') as f:
    requirements = [l for l in f.read().splitlines() if l]

setup(
    name='crawlab-server',
    version='0.0.1',
    url='https://github.com/tikazyq/crawlab',
    install_requires=requirements,
    license='BSD',
    author='Marvin Zhang',
    author_email='tikazyq@163.com',
    description='Celery-based web crawler admin platform for managing distributed web spiders regardless of languages and frameworks.',
    long_description=long_description,
    long_description_content_type="text/markdown",
    download_url="https://github.com/tikazyq/crawlab/archive/master.zip",
    packages=find_packages(),
    keywords=['celery', 'python', 'webcrawler', 'crawl', 'scrapy', 'admin'],
    zip_safe=True,
)

BIN
docs/.DS_Store vendored

Binary file not shown.

Binary file not shown. (Before: 47 KiB)

BIN
docs/img/payment.jpg Normal file

Binary file not shown. (After: 78 KiB)

Binary file not shown. (Before: 162 KiB)
Binary file not shown. (Before: 358 KiB)
Binary file not shown. (Before: 175 KiB)
Binary file not shown. (Before: 349 KiB)
Binary file not shown. (Before: 174 KiB)
Binary file not shown. (Before: 187 KiB)
Binary file not shown. (Before: 538 KiB)
Binary file not shown. (Before: 245 KiB)
Binary file not shown. (Before: 542 KiB)
Binary file not shown. (Before: 326 KiB)
View File

@@ -82,6 +82,7 @@
      <el-row class="button-group-container">
        <div class="button-group">
          <el-button type="danger" @click="onCrawl">{{$t('Run')}}</el-button>
          <el-button type="primary" @click="onExtractFields" v-loading="extractFieldsLoading">{{$t('Extract Fields')}}</el-button>
          <el-button type="warning" @click="onPreview" v-loading="previewLoading">{{$t('Preview')}}</el-button>
          <el-button type="success" @click="onSave" v-loading="saveLoading">{{$t('Save')}}</el-button>
        </div>
@@ -129,6 +130,7 @@ export default {
        { value: 'detail', label: 'Detail Only' },
        { value: 'list-detail', label: 'List + Detail' }
      ],
      extractFieldsLoading: false,
      previewLoading: false,
      saveLoading: false,
      dialogVisible: false
@@ -213,6 +215,8 @@ export default {
          this.$message.success(this.$t(`Spider task has been scheduled`))
        })
      })
    },
    onExtractFields () {
    }
  },
  created () {
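The diff above adds an "Extract Fields" button whose `onExtractFields` handler is still an empty stub, while the backend gains an `extract_fields` method on `SpiderApi`. Below is a minimal sketch of how the two could meet, exercising the spider action route (`/api/spiders/<id>/<action>`) that appears elsewhere in this diff; the base URL, the spider id, and the assumption that `extract_fields` is exposed as an action name are all illustrative, not confirmed by this commit.

```python
import requests

# Hypothetical call path for the new button: hit the spider action route with the
# 'extract_fields' action. Host, port, and spider id below are placeholders.
BASE_URL = 'http://localhost:8000'
SPIDER_ID = '5cee4a2f0000000000000000'  # placeholder ObjectId

resp = requests.get('{}/api/spiders/{}/extract_fields'.format(BASE_URL, SPIDER_ID))
print(resp.status_code)
print(resp.json())
```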

View File

@@ -48,13 +48,14 @@ export default {
  Submit: '提交',
  'Import Spiders': '导入爬虫',
  'Deploy All': '部署所有爬虫',
  Refresh: '刷新',
  View: '查看',
  Edit: '编辑',
  Remove: '删除',
  Confirm: '确认',
  Stop: '停止',
  Preview: '预览',
  'Refresh': '刷新',
  'View': '查看',
  'Edit': '编辑',
  'Remove': '删除',
  'Confirm': '确认',
  'Stop': '停止',
  'Preview': '预览',
  'Extract Fields': '提取字段',
  // Home page
  'Total Tasks': '总任务数',