diff --git a/README-zh.md b/README-zh.md
index 42f850ac..dbb84089 100644
--- a/README-zh.md
+++ b/README-zh.md
@@ -1,4 +1,7 @@
 # Crawlab
+
+![](https://img.shields.io/badge/版本-v0.2.1-blue.svg)
+
 基于Celery的爬虫分布式爬虫管理平台,支持多种编程语言以及多种爬虫框架.
 
 [查看演示 Demo](http://139.129.230.98:8080)
@@ -48,19 +51,20 @@ npm run serve
 ## 截图
 
 #### 首页
-![home](./docs/img/screenshot-home.png)
+
+![](https://user-gold-cdn.xitu.io/2019/3/6/169524d4c7f117f7?imageView2/0/w/1280/h/960/format/webp/ignore-error/1)
 
 #### 爬虫列表
 
-![spider-list](./docs/img/screenshot-spiders.png)
+![](https://user-gold-cdn.xitu.io/2019/3/6/169524daf9c8ccef?imageView2/0/w/1280/h/960/format/webp/ignore-error/1)
 
 #### 爬虫详情 - 概览
 
-![spider-list](./docs/img/screenshot-spider-detail-overview.png)
+![](https://user-gold-cdn.xitu.io/2019/3/6/169524e0794d6be1?imageView2/0/w/1280/h/960/format/webp/ignore-error/1)
 
 #### 任务详情 - 抓取结果
 
-![spider-list](./docs/img/screenshot-task-detail-results.png)
+![](https://user-gold-cdn.xitu.io/2019/3/6/169524e4064c7f0a?imageView2/0/w/1280/h/960/format/webp/ignore-error/1)
 
 ## 使用流程
@@ -170,6 +174,9 @@
 - [ ] 登录和用户管理
 - [ ] 全局搜索
 
-如果您喜欢Crawlab或者希望贡献开发它，请加作者微信 tikazyq1 并注明"Crawlab"，作者会将你拉入群。
+如果您觉得Crawlab对您的日常开发或公司有帮助，请加作者微信 tikazyq1 并注明"Crawlab"，作者会将你拉入群。或者，您可以扫下方支付宝二维码给作者打赏去升级团队协作软件或买一杯咖啡。
 
-![](https://user-gold-cdn.xitu.io/2019/3/15/169814cbd5e600e9?imageslim)
+
+<p align="center">
+    <img height="360" src="./docs/img/payment.jpg">
+</p>
diff --git a/README.md b/README.md
index eeea04bb..e3581571 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,7 @@
 # Crawlab
+![](https://img.shields.io/badge/version-v0.2.1-blue.svg)
+
 
 Celery-based web crawler admin platform for managing distributed web spiders regardless of languages and frameworks.
 
 [Demo](http://139.129.230.98:8080)
@@ -49,19 +51,20 @@ npm run serve
 ## Screenshot
 
 #### Home Page
-![home](./docs/img/screenshot-home.png)
+
+![](https://user-gold-cdn.xitu.io/2019/3/6/169524d4c7f117f7?imageView2/0/w/1280/h/960/format/webp/ignore-error/1)
 
 #### Spider List
 
-![spider-list](./docs/img/screenshot-spiders.png)
+![](https://user-gold-cdn.xitu.io/2019/3/6/169524daf9c8ccef?imageView2/0/w/1280/h/960/format/webp/ignore-error/1)
 
 #### Spider Detail - Overview
 
-![spider-list](./docs/img/screenshot-spider-detail-overview.png)
+![](https://user-gold-cdn.xitu.io/2019/3/6/169524e0794d6be1?imageView2/0/w/1280/h/960/format/webp/ignore-error/1)
 
 #### Task Detail - Results
 
-![spider-list](./docs/img/screenshot-task-detail-results.png)
+![](https://user-gold-cdn.xitu.io/2019/3/6/169524e4064c7f0a?imageView2/0/w/1280/h/960/format/webp/ignore-error/1)
 
 ## Architecture
@@ -166,6 +169,9 @@
 - [ ] Login & User Management
 - [ ] General Search
 
-If you like Crawlab or would like to contribute to it, please add the Author's Wechat noting "Crawlab" to enter the discussion group.
+If you feel Crawlab could benefit your daily work or your company, please add the author's Wechat account tikazyq1, noting "Crawlab", to enter the discussion group. Or you can scan the Alipay QR code below to send a reward that helps us upgrade our teamwork software or buy a coffee.
 
-![](https://user-gold-cdn.xitu.io/2019/3/15/169814cbd5e600e9?imageslim)
+
+<p align="center">
+    <img height="360" src="./docs/img/payment.jpg">
+</p>
diff --git a/crawlab/app.py b/crawlab/app.py
index 3a60c19e..eabb2bc3 100644
--- a/crawlab/app.py
+++ b/crawlab/app.py
@@ -1,10 +1,7 @@
 import os
-import subprocess
 import sys
 from multiprocessing import Process
 
-import click
-from celery import Celery
 from flask import Flask
 from flask_cors import CORS
 from flask_restful import Api
diff --git a/crawlab/manage.py b/crawlab/manage.py
deleted file mode 100644
index 5b9ae3d3..00000000
--- a/crawlab/manage.py
+++ /dev/null
@@ -1,125 +0,0 @@
-import os
-import subprocess
-import sys
-from multiprocessing import Process
-
-import click
-from flask import Flask
-from flask_cors import CORS
-from flask_restful import Api
-
-from routes.schedules import ScheduleApi
-from tasks.scheduler import scheduler
-
-file_dir = os.path.dirname(os.path.realpath(__file__))
-root_path = os.path.abspath(os.path.join(file_dir, '.'))
-sys.path.append(root_path)
-
-from config import FLASK_HOST, FLASK_PORT, PROJECT_LOGS_FOLDER, BROKER_URL
-from constants.manage import ActionType
-from routes.deploys import DeployApi
-from routes.files import FileApi
-from routes.nodes import NodeApi
-from routes.spiders import SpiderApi, SpiderImportApi, SpiderManageApi
-from routes.stats import StatsApi
-from routes.tasks import TaskApi
-from tasks.celery import celery_app
-from utils.log import other
-
-# flask app instance
-app = Flask(__name__)
-app.config.from_object('config')
-
-# init flask api instance
-api = Api(app)
-
-# cors support
-CORS(app, supports_credentials=True)
-
-# reference api routes
-api.add_resource(NodeApi,
-                 '/api/nodes',
-                 '/api/nodes/<string:id>',
-                 '/api/nodes/<string:id>/<string:action>')
-api.add_resource(SpiderImportApi,
-                 '/api/spiders/import/<string:platform>')
-api.add_resource(SpiderManageApi,
-                 '/api/spiders/manage/<string:action>')
-api.add_resource(SpiderApi,
-                 '/api/spiders',
-                 '/api/spiders/<string:id>',
-                 '/api/spiders/<string:id>/<string:action>')
-api.add_resource(DeployApi,
-                 '/api/deploys',
-                 '/api/deploys/<string:id>',
-                 '/api/deploys/<string:id>/<string:action>')
-api.add_resource(TaskApi,
-                 '/api/tasks',
-                 '/api/tasks/<string:id>',
-                 '/api/tasks/<string:id>/<string:action>'
-                 )
-api.add_resource(FileApi,
-                 '/api/files',
-                 '/api/files/<string:action>')
-api.add_resource(StatsApi,
-                 '/api/stats',
-                 '/api/stats/<string:action>')
-api.add_resource(ScheduleApi,
-                 '/api/schedules',
-                 '/api/schedules/<string:id>')
-
-
-def run_app():
-    # create folder if it does not exist
-    if not os.path.exists(PROJECT_LOGS_FOLDER):
-        os.makedirs(PROJECT_LOGS_FOLDER)
-
-    # run app instance
-    app.run(host=FLASK_HOST, port=FLASK_PORT)
-
-
-def run_flower():
-    p = subprocess.Popen(['celery', 'flower', '-b', BROKER_URL], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
-    for line in iter(p.stdout.readline, 'b'):
-        if line.decode('utf-8') != '':
-            other.info(line.decode('utf-8'))
-
-
-def run_worker():
-    if sys.platform == 'windows':
-        celery_app.start(argv=['tasks', 'worker', '-P', 'eventlet', '-E', '-l', 'INFO'])
-    else:
-        celery_app.start(argv=['tasks', 'worker', '-E', '-l', 'INFO'])
-
-
-def run_scheduler():
-    scheduler.run()
-
-
-@click.command()
-@click.argument('action', type=click.Choice([ActionType.APP,
-                                             ActionType.FLOWER,
-                                             ActionType.WORKER,
-                                             ActionType.SCHEDULER,
-                                             ActionType.RUN_ALL]))
-def main(action):
-    if action == ActionType.APP:
-        run_app()
-    elif action == ActionType.FLOWER:
-        run_flower()
-    elif action == ActionType.WORKER:
-        run_worker()
-    elif action == ActionType.SCHEDULER:
-        run_scheduler()
-    elif action == ActionType.RUN_ALL:
-        p_flower = Process(target=run_flower)
-        p_flower.start()
-        p_app = Process(target=run_app)
-        p_app.start()
-        p_worker = Process(target=run_worker)
-        p_worker.start()
-        p_scheduler = Process(target=run_scheduler)
-        p_scheduler.start()
-
-
-if __name__ == '__main__':
-    main()
diff --git a/crawlab/model/__init__.py b/crawlab/model/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/crawlab/model/base.py b/crawlab/model/base.py
deleted file mode 100644
index 253cc063..00000000
--- a/crawlab/model/base.py
+++ /dev/null
@@ -1,6 +0,0 @@
-from mongoengine import *
-import datetime
-
-
-class BaseModel(Document):
-    create_ts = DateTimeField(default=datetime.datetime.utcnow)
diff --git a/crawlab/model/deploy.py b/crawlab/model/deploy.py
deleted file mode 100644
index 2d21c6fc..00000000
--- a/crawlab/model/deploy.py
+++ /dev/null
@@ -1,10 +0,0 @@
-from mongoengine import *
-
-from model.base import BaseModel
-
-
-class Deploy(BaseModel):
-    _id = ObjectIdField()
-    spider_id = ObjectIdField()
-    version = IntField()
-    node_id = ObjectIdField()
diff --git a/crawlab/model/node.py b/crawlab/model/node.py
deleted file mode 100644
index 4b9b4910..00000000
--- a/crawlab/model/node.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from mongoengine import *
-
-from model.base import BaseModel
-
-
-class Node(BaseModel):
-    _id = ObjectIdField()
-    ip = StringField()
-    port = IntField()
-    name = StringField()
-    description = StringField()
-    status = IntField()
diff --git a/crawlab/model/spider.py b/crawlab/model/spider.py
deleted file mode 100644
index eee86711..00000000
--- a/crawlab/model/spider.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from mongoengine import *
-
-from model.base import BaseModel
-
-
-class Spider(BaseModel):
-    _id = ObjectIdField()
-    name = StringField()
-    cmd = StringField()
-    src = StringField()
-    type = IntField()
-    lang = IntField()
diff --git a/crawlab/model/task.py b/crawlab/model/task.py
deleted file mode 100644
index 9f934eb1..00000000
--- a/crawlab/model/task.py
+++ /dev/null
@@ -1,9 +0,0 @@
-from mongoengine import *
-
-from model.base import BaseModel
-
-
-class Task(BaseModel):
-    _id = ObjectIdField()
-    deploy_id = ObjectIdField()
-    file_path = StringField()
diff --git a/crawlab/requirements.txt b/crawlab/requirements.txt
index 8b13446c..0a1b7f13 100644
--- a/crawlab/requirements.txt
+++ b/crawlab/requirements.txt
@@ -57,10 +57,8 @@ python-dateutil==2.8.0
 pytz==2018.9
 queuelib==1.5.0
 redis==3.2.1
-redisbeat==1.1.4
 requests==2.21.0
 Scrapy==1.6.0
-selenium==3.141.0
 service-identity==18.1.0
 six==1.12.0
 soupsieve==1.9.1
diff --git a/crawlab/routes/spiders.py b/crawlab/routes/spiders.py
index a9240dae..ef8e83e9 100644
--- a/crawlab/routes/spiders.py
+++ b/crawlab/routes/spiders.py
@@ -466,9 +466,8 @@ class SpiderApi(BaseApi):
         detail_fields = json.loads(args.detail_fields)
         db_manager.update_one(col_name='spiders', id=id, values={'detail_fields': detail_fields})
 
-    def preview_crawl(self, id: str):
-        spider = db_manager.get(col_name='spiders', id=id)
-
+    @staticmethod
+    def _get_html(spider) -> etree.Element:
         if spider['type'] != SpiderType.CONFIGURABLE:
             return {
                 'status': 'ok',
@@ -509,6 +508,26 @@ class SpiderApi(BaseApi):
 
         # get html parse tree
         sel = etree.HTML(r.content)
+        return sel
+
+    @staticmethod
+    def _get_text_child_tags(sel):
+        tags = []
+        for tag in sel.iter():
+            if tag.text is not None:
+                tags.append(tag)
+        return tags
+
+    def preview_crawl(self, id: str):
+        spider = db_manager.get(col_name='spiders', id=id)
+
+        # get html parse tree
+        sel = self._get_html(spider)
+
+        # when an error happens, _get_html returns an error response tuple
+        if isinstance(sel, tuple):
+            return sel
 
         # parse fields
         if spider['crawl_type'] == CrawlType.LIST:
             if spider.get('item_selector') is None:
@@ -525,6 +544,7 @@ class SpiderApi(BaseApi):
             }
 
         elif spider['crawl_type'] == CrawlType.DETAIL:
+            # TODO: preview detail page
             pass
 
         elif spider['crawl_type'] == CrawlType.LIST_DETAIL:
@@ -546,6 +566,54 @@ class SpiderApi(BaseApi):
                 'items': data
             }
 
+    def extract_fields(self, id: str):
+        """
+        Extract list fields from a web page
+        :param id:
+        :return:
+        """
+        spider = db_manager.get(col_name='spiders', id=id)
+
+        # get html parse tree
+        sel = self._get_html(spider)
+
+        # when an error happens, _get_html returns an error response tuple
+        if isinstance(sel, tuple):
+            return sel
+
+        list_tag_list = []
+        threshold = 10
+        # iterate all child nodes in a top-down direction
+        for tag in sel.iter():
+            # get child tags
+            child_tags = tag.getchildren()
+
+            if len(child_tags) < threshold:
+                # if the number of child tags is below the threshold, skip
+                continue
+            else:
+                # at least `threshold` child tags
+                child_tags_set = set(map(lambda x: x.tag, child_tags))
+
+                # if there is more than one tag name, skip
+                if len(child_tags_set) > 1:
+                    continue
+
+                # add as list tag
+                list_tag_list.append(tag)
+
+        # find the list tag with the most child text tags
+        _max_tag = None
+        _max_num = 0
+        for tag in list_tag_list:
+            _child_text_tags = self._get_text_child_tags(tag[0])
+            if len(_child_text_tags) > _max_num:
+                _max_tag = tag
+                _max_num = len(_child_text_tags)
+
+        # TODO: extract list fields
+
 
 class SpiderImportApi(Resource):
     __doc__ = """
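The heuristic `extract_fields` introduces above is worth spelling out: walk the parse tree top-down, keep any element with at least `threshold` children that all share a single tag name (a homogeneous block such as a `ul` of items), then rank the survivors by how many text-carrying descendants their first child has. A minimal standalone sketch of the same idea follows; the sample HTML and the helper names `find_list_candidates` / `get_text_child_tags` are illustrative, not part of the patch:

```python
from lxml import etree

# Illustrative page: one homogeneous 12-item list among other markup.
HTML = '<div><p>intro</p><ul id="items">' + ''.join(
    '<li><h3>Item %d</h3><span>desc %d</span></li>' % (i, i) for i in range(12)
) + '</ul></div>'


def get_text_child_tags(sel):
    """Mirror of the patch's _get_text_child_tags: descendants that carry text."""
    return [tag for tag in sel.iter() if tag.text is not None]


def find_list_candidates(sel, threshold=10):
    """Apply the patch's two rules: enough children, and a single child tag name."""
    candidates = []
    for tag in sel.iter():
        children = list(tag)
        if len(children) < threshold:
            continue  # too few children to look like a data list
        if len({child.tag for child in children}) > 1:
            continue  # heterogeneous children suggest layout, not a list
        candidates.append(tag)
    # rank candidates by how text-rich the first item is, as the patch does
    return sorted(candidates,
                  key=lambda t: len(get_text_child_tags(t[0])),
                  reverse=True)


root = etree.HTML(HTML)
best = find_list_candidates(root)[0]
print(best.tag, best.get('id'))  # -> ul items
```

Running this prints `ul items`: the homogeneous 12-item list wins while the small navigation-sized blocks are filtered out, which is exactly the signal the remaining `# TODO: extract list fields` step would build on.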
diff --git a/crawlab/setup.py b/crawlab/setup.py
deleted file mode 100644
index 22cda1de..00000000
--- a/crawlab/setup.py
+++ /dev/null
@@ -1,24 +0,0 @@
-from setuptools import setup, find_packages
-
-with open("README.md", "r") as fh:
-    long_description = fh.read()
-
-with open('requirements.txt') as f:
-    requirements = [l for l in f.read().splitlines() if l]
-
-setup(
-    name='crawlab-server',
-    version='0.0.1',
-    url='https://github.com/tikazyq/crawlab',
-    install_requires=requirements,
-    license='BSD',
-    author='Marvin Zhang',
-    author_email='tikazyq@163.com',
-    description='Celery-based web crawler admin platform for managing distributed web spiders regardless of languages and frameworks.',
-    long_description=long_description,
-    long_description_content_type="text/markdown",
-    download_url="https://github.com/tikazyq/crawlab/archive/master.zip",
-    packages=find_packages(),
-    keywords=['celery', 'python', 'webcrawler', 'crawl', 'scrapy', 'admin'],
-    zip_safe=True,
-)
diff --git a/docs/.DS_Store b/docs/.DS_Store
index 328c974d..9cbf3ccd 100644
Binary files a/docs/.DS_Store and b/docs/.DS_Store differ
diff --git a/docs/img/crawlab-architecture 2.png b/docs/img/crawlab-architecture 2.png
deleted file mode 100644
index fcac460f..00000000
Binary files a/docs/img/crawlab-architecture 2.png and /dev/null differ
diff --git a/docs/img/payment.jpg b/docs/img/payment.jpg
new file mode 100644
index 00000000..2b20f2d8
Binary files /dev/null and b/docs/img/payment.jpg differ
diff --git a/docs/img/screenshot-home.png b/docs/img/screenshot-home.png
deleted file mode 100644
index 650dca47..00000000
Binary files a/docs/img/screenshot-home.png and /dev/null differ
diff --git a/docs/img/screenshot-node-detail.png b/docs/img/screenshot-node-detail.png
deleted file mode 100644
index 3d323172..00000000
Binary files a/docs/img/screenshot-node-detail.png and /dev/null differ
diff --git a/docs/img/screenshot-nodes.png b/docs/img/screenshot-nodes.png
deleted file mode 100644
index 88fc7489..00000000
Binary files a/docs/img/screenshot-nodes.png and /dev/null differ
diff --git a/docs/img/screenshot-spider-detail-overview.png b/docs/img/screenshot-spider-detail-overview.png
deleted file mode 100644
index 8745c451..00000000
Binary files a/docs/img/screenshot-spider-detail-overview.png and /dev/null differ
diff --git a/docs/img/screenshot-spider-import.png b/docs/img/screenshot-spider-import.png
deleted file mode 100644
index d2ca7c17..00000000
Binary files a/docs/img/screenshot-spider-import.png and /dev/null differ
diff --git a/docs/img/screenshot-spiders.png b/docs/img/screenshot-spiders.png
deleted file mode 100644
index b23310d7..00000000
Binary files a/docs/img/screenshot-spiders.png and /dev/null differ
diff --git a/docs/img/screenshot-task-detail-log.png b/docs/img/screenshot-task-detail-log.png
deleted file mode 100644
index 7e3ee387..00000000
Binary files a/docs/img/screenshot-task-detail-log.png and /dev/null differ
diff --git a/docs/img/screenshot-task-detail-overview.png b/docs/img/screenshot-task-detail-overview.png
deleted file mode 100644
index fbb339e8..00000000
Binary files a/docs/img/screenshot-task-detail-overview.png and /dev/null differ
diff --git a/docs/img/screenshot-task-detail-results.png b/docs/img/screenshot-task-detail-results.png
deleted file mode 100644
index 8623fb33..00000000
Binary files a/docs/img/screenshot-task-detail-results.png and /dev/null differ
diff --git a/docs/img/screenshot-tasks.png b/docs/img/screenshot-tasks.png
deleted file mode 100644
index ab5585da..00000000
Binary files a/docs/img/screenshot-tasks.png and /dev/null differ
diff --git a/frontend/src/components/Config/ConfigList.vue b/frontend/src/components/Config/ConfigList.vue
index b5dcda0b..5b7e3d3d 100644
--- a/frontend/src/components/Config/ConfigList.vue
+++ b/frontend/src/components/Config/ConfigList.vue
@@ -82,6 +82,7 @@
         <el-button @click="onRun">{{$t('Run')}}</el-button>
+        <el-button :loading="extractFieldsLoading" @click="onExtractFields">{{$t('Extract Fields')}}</el-button>
         <el-button :loading="previewLoading" @click="onPreview">{{$t('Preview')}}</el-button>
         <el-button :loading="saveLoading" @click="onSave">{{$t('Save')}}</el-button>
@@ -129,6 +130,7 @@ export default {
         { value: 'detail', label: 'Detail Only' },
         { value: 'list-detail', label: 'List + Detail' }
       ],
+      extractFieldsLoading: false,
       previewLoading: false,
       saveLoading: false,
       dialogVisible: false
@@ -213,6 +215,8 @@ export default {
           this.$message.success(this.$t(`Spider task has been scheduled`))
         })
       })
+    },
+    onExtractFields () {
     }
   },
   created () {
diff --git a/frontend/src/i18n/zh.js b/frontend/src/i18n/zh.js
index e6db4795..9af096c9 100644
--- a/frontend/src/i18n/zh.js
+++ b/frontend/src/i18n/zh.js
@@ -48,13 +48,14 @@ export default {
   Submit: '提交',
   'Import Spiders': '导入爬虫',
   'Deploy All': '部署所有爬虫',
-  Refresh: '刷新',
-  View: '查看',
-  Edit: '编辑',
-  Remove: '删除',
-  Confirm: '确认',
-  Stop: '停止',
-  Preview: '预览',
+  'Refresh': '刷新',
+  'View': '查看',
+  'Edit': '编辑',
+  'Remove': '删除',
+  'Confirm': '确认',
+  'Stop': '停止',
+  'Preview': '预览',
+  'Extract Fields': '提取字段',
 
   // 主页
   'Total Tasks': '总任务数',
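At this point both ends of the feature are stubs: `onExtractFields` in ConfigList.vue is an empty handler and the backend `extract_fields` ends at a TODO. Assuming the method is exposed the same way `preview_crawl` is (through the generic `/api/spiders/<string:id>/<string:action>` route, which dispatches to the method of the same name), a quick smoke test once the TODO is filled in might look like the sketch below; the host, port, and spider id are placeholders, not values from this patch:

```python
import requests

# Hypothetical smoke test for the new action. Assumes the Flask API listens on
# localhost:8000 and that extract_fields is reachable via the generic
# /api/spiders/<string:id>/<string:action> route, like preview_crawl.
BASE_URL = 'http://localhost:8000'
SPIDER_ID = '5c9a...'  # placeholder spider ObjectId

resp = requests.get('%s/api/spiders/%s/extract_fields' % (BASE_URL, SPIDER_ID))
resp.raise_for_status()
print(resp.json())  # expected: detected list fields for the configurable spider
```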