diff --git a/README-zh.md b/README-zh.md
index 0c3e8e2f..dbb84089 100644
--- a/README-zh.md
+++ b/README-zh.md
@@ -174,6 +174,9 @@ Crawlab使用起来很方便，也很通用，可以适用于几乎任何主流
 - [ ] 登录和用户管理
 - [ ] 全局搜索
 
-如果您喜欢Crawlab或者希望贡献开发它，请加作者微信 tikazyq1 并注明"Crawlab"，作者会将你拉入群。
+如果您觉得Crawlab对您的日常开发或公司有帮助，请加作者微信 tikazyq1 并注明"Crawlab"，作者会将你拉入群。或者，您可以扫下方支付宝二维码给作者打赏去升级团队协作软件或买一杯咖啡。
 
-![](https://user-gold-cdn.xitu.io/2019/3/15/169814cbd5e600e9?imageslim)
+[支付宝打赏二维码图片]
diff --git a/README.md b/README.md
index aff91cba..e3581571 100644
--- a/README.md
+++ b/README.md
@@ -169,6 +169,9 @@ Crawlab is easy to use, general enough to adapt spiders in any language and any
 - [ ] Login & User Management
 - [ ] General Search
 
-If you like Crawlab or would like to contribute to it, please add the Author's Wechat noting "Crawlab" to enter the discussion group.
+If you feel Crawlab could benefit your daily work or your company, please add the author's Wechat account noting "Crawlab" to enter the discussion group. Or you can scan the Alipay QR code below to give us a reward to help us upgrade our teamwork software or buy a coffee.
 
-![](https://user-gold-cdn.xitu.io/2019/3/15/169814cbd5e600e9?imageslim)
+[Alipay QR code image]
diff --git a/crawlab/manage.py b/crawlab/manage.py
deleted file mode 100644
index 5b9ae3d3..00000000
--- a/crawlab/manage.py
+++ /dev/null
@@ -1,125 +0,0 @@
-import os
-import subprocess
-import sys
-from multiprocessing import Process
-
-import click
-from flask import Flask
-from flask_cors import CORS
-from flask_restful import Api
-
-from routes.schedules import ScheduleApi
-from tasks.scheduler import scheduler
-
-file_dir = os.path.dirname(os.path.realpath(__file__))
-root_path = os.path.abspath(os.path.join(file_dir, '.'))
-sys.path.append(root_path)
-
-from config import FLASK_HOST, FLASK_PORT, PROJECT_LOGS_FOLDER, BROKER_URL
-from constants.manage import ActionType
-from routes.deploys import DeployApi
-from routes.files import FileApi
-from routes.nodes import NodeApi
-from routes.spiders import SpiderApi, SpiderImportApi, SpiderManageApi
-from routes.stats import StatsApi
-from routes.tasks import TaskApi
-from tasks.celery import celery_app
-from utils.log import other
-# flask app instance
-app = Flask(__name__)
-app.config.from_object('config')
-
-# init flask api instance
-api = Api(app)
-
-# cors support
-CORS(app, supports_credentials=True)
-
-# reference api routes
-api.add_resource(NodeApi,
-                 '/api/nodes',
-                 '/api/nodes/<string:id>',
-                 '/api/nodes/<string:id>/<string:action>')
-api.add_resource(SpiderImportApi,
-                 '/api/spiders/import/<string:platform>')
-api.add_resource(SpiderManageApi,
-                 '/api/spiders/manage/<string:action>')
-api.add_resource(SpiderApi,
-                 '/api/spiders',
-                 '/api/spiders/<string:id>',
-                 '/api/spiders/<string:id>/<string:action>')
-api.add_resource(DeployApi,
-                 '/api/deploys',
-                 '/api/deploys/<string:id>',
-                 '/api/deploys/<string:id>/<string:action>')
-api.add_resource(TaskApi,
-                 '/api/tasks',
-                 '/api/tasks/<string:id>',
-                 '/api/tasks/<string:id>/<string:action>'
-                 )
-api.add_resource(FileApi,
-                 '/api/files',
-                 '/api/files/<string:action>')
-api.add_resource(StatsApi,
-                 '/api/stats',
-                 '/api/stats/<string:action>')
-api.add_resource(ScheduleApi,
-                 '/api/schedules',
-                 '/api/schedules/<string:id>')
-
-
-def run_app():
-    # create folder if it does not exist
-    if not os.path.exists(PROJECT_LOGS_FOLDER):
-        os.makedirs(PROJECT_LOGS_FOLDER)
-
-    # run app instance
-    app.run(host=FLASK_HOST, port=FLASK_PORT)
-
-
-def run_flower():
-    p = subprocess.Popen(['celery', 'flower', '-b', BROKER_URL], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
-    for line in iter(p.stdout.readline, 'b'):
-        if line.decode('utf-8') != '':
-            other.info(line.decode('utf-8'))
-
-
-def run_worker():
-    if sys.platform == 'windows':
-        celery_app.start(argv=['tasks', 'worker', '-P', 'eventlet', '-E', '-l', 'INFO'])
-    else:
-        celery_app.start(argv=['tasks', 'worker', '-E', '-l', 'INFO'])
-
-
-def run_scheduler():
-    scheduler.run()
-
-
-@click.command()
-@click.argument('action', type=click.Choice([ActionType.APP,
-                                             ActionType.FLOWER,
-                                             ActionType.WORKER,
-                                             ActionType.SCHEDULER,
-                                             ActionType.RUN_ALL]))
-def main(action):
-    if action == ActionType.APP:
-        run_app()
-    elif action == ActionType.FLOWER:
-        run_flower()
-    elif action == ActionType.WORKER:
-        run_worker()
-    elif action == ActionType.SCHEDULER:
-        run_scheduler()
-    elif action == ActionType.RUN_ALL:
-        p_flower = Process(target=run_flower)
-        p_flower.start()
-        p_app = Process(target=run_app)
-        p_app.start()
-        p_worker = Process(target=run_worker)
-        p_worker.start()
-        p_scheduler = Process(target=run_scheduler)
-        p_scheduler.start()
-
-
-if __name__ == '__main__':
-    main()
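The deleted manage.py registers each flask-restful resource against one or more URL rules, using Flask converter segments such as `<string:id>` and `<string:action>` to feed path parameters into the handler methods. A minimal, runnable sketch of that registration pattern is below; the `PingApi` resource, its URLs and the port are invented for illustration and are not part of Crawlab:

```python
from flask import Flask
from flask_restful import Api, Resource

app = Flask(__name__)
api = Api(app)


class PingApi(Resource):
    # hypothetical resource, used only to show the routing pattern
    def get(self, id=None, action=None):
        # flask-restful passes each matched converter (<string:id>, <string:action>)
        # as a keyword argument of the same name; unmatched ones keep their defaults
        return {'id': id, 'action': action}


# one resource class can be mounted on several rules at once;
# which arguments get filled in depends on the rule that matched
api.add_resource(PingApi,
                 '/api/ping',
                 '/api/ping/<string:id>',
                 '/api/ping/<string:id>/<string:action>')

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=8000)
```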
diff --git a/crawlab/model/__init__.py b/crawlab/model/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/crawlab/model/base.py b/crawlab/model/base.py
deleted file mode 100644
index 253cc063..00000000
--- a/crawlab/model/base.py
+++ /dev/null
@@ -1,6 +0,0 @@
-from mongoengine import *
-import datetime
-
-
-class BaseModel(Document):
-    create_ts = DateTimeField(default=datetime.datetime.utcnow)
diff --git a/crawlab/model/deploy.py b/crawlab/model/deploy.py
deleted file mode 100644
index 2d21c6fc..00000000
--- a/crawlab/model/deploy.py
+++ /dev/null
@@ -1,10 +0,0 @@
-from mongoengine import *
-
-from model.base import BaseModel
-
-
-class Deploy(BaseModel):
-    _id = ObjectIdField()
-    spider_id = ObjectIdField()
-    version = IntField()
-    node_id = ObjectIdField()
diff --git a/crawlab/model/node.py b/crawlab/model/node.py
deleted file mode 100644
index 4b9b4910..00000000
--- a/crawlab/model/node.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from mongoengine import *
-
-from model.base import BaseModel
-
-
-class Node(BaseModel):
-    _id = ObjectIdField()
-    ip = StringField()
-    port = IntField()
-    name = StringField()
-    description = StringField()
-    status = IntField()
diff --git a/crawlab/model/spider.py b/crawlab/model/spider.py
deleted file mode 100644
index eee86711..00000000
--- a/crawlab/model/spider.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from mongoengine import *
-
-from model.base import BaseModel
-
-
-class Spider(BaseModel):
-    _id = ObjectIdField()
-    name = StringField()
-    cmd = StringField()
-    src = StringField()
-    type = IntField()
-    lang = IntField()
diff --git a/crawlab/model/task.py b/crawlab/model/task.py
deleted file mode 100644
index 9f934eb1..00000000
--- a/crawlab/model/task.py
+++ /dev/null
@@ -1,9 +0,0 @@
-from mongoengine import *
-
-from model.base import BaseModel
-
-
-class Task(BaseModel):
-    _id = ObjectIdField()
-    deploy_id = ObjectIdField()
-    file_path = StringField()
diff --git a/crawlab/requirements.txt b/crawlab/requirements.txt
index 8b13446c..0a1b7f13 100644
--- a/crawlab/requirements.txt
+++ b/crawlab/requirements.txt
@@ -57,10 +57,8 @@ python-dateutil==2.8.0
 pytz==2018.9
 queuelib==1.5.0
 redis==3.2.1
-redisbeat==1.1.4
 requests==2.21.0
 Scrapy==1.6.0
-selenium==3.141.0
 service-identity==18.1.0
 six==1.12.0
 soupsieve==1.9.1
diff --git a/crawlab/routes/spiders.py b/crawlab/routes/spiders.py
index 5fabde73..758cf29c 100644
--- a/crawlab/routes/spiders.py
+++ b/crawlab/routes/spiders.py
@@ -463,9 +463,8 @@ class SpiderApi(BaseApi):
         detail_fields = json.loads(args.detail_fields)
         db_manager.update_one(col_name='spiders', id=id, values={'detail_fields': detail_fields})
 
-    def preview_crawl(self, id: str):
-        spider = db_manager.get(col_name='spiders', id=id)
-
+    @staticmethod
+    def _get_html(spider) -> etree.Element:
         if spider['type'] != SpiderType.CONFIGURABLE:
             return {
                 'status': 'ok',
@@ -497,6 +496,26 @@
 
         # get html parse tree
         sel = etree.HTML(r.content)
+        return sel
+
+    @staticmethod
+    def _get_text_child_tags(sel):
+        tags = []
+        for tag in sel.iter():
+            if tag.text is not None:
+                tags.append(tag)
+        return tags
+
+    def preview_crawl(self, id: str):
+        spider = db_manager.get(col_name='spiders', id=id)
+
+        # get html parse tree
+        sel = self._get_html(spider)
+
+        # when error happens, return
+        if type(sel) == type(tuple):
+            return sel
+
         # parse fields
         if spider['crawl_type'] == CrawlType.LIST:
             if spider.get('item_selector') is None:
@@ -513,6 +532,7 @@
             }
 
         elif spider['crawl_type'] == CrawlType.DETAIL:
+            # TODO: 详情页预览
             pass
 
         elif spider['crawl_type'] == CrawlType.LIST_DETAIL:
@@ -534,6 +554,54 @@
             'items': data
         }
 
+    def extract_fields(self, id: str):
+        """
+        Extract list fields from a web page
+        :param id:
+        :return:
+        """
+        spider = db_manager.get(col_name='spiders', id=id)
+
+        # get html parse tree
+        sel = self._get_html(spider)
+
+        # when error happens, return
+        if type(sel) == type(tuple):
+            return sel
+
+        list_tag_list = []
+        threshold = 10
+        # iterate all child nodes in a top-down direction
+        for tag in sel.iter():
+            # get child tags
+            child_tags = tag.getchildren()
+
+            if len(child_tags) < threshold:
+                # if number of child tags is below threshold, skip
+                continue
+            else:
+                # have one or more child tags
+                child_tags_set = set(map(lambda x: x.tag, child_tags))
+
+                # if there are more than 1 tag names, skip
+                if len(child_tags_set) > 1:
+                    continue
+
+                # add as list tag
+                list_tag_list.append(tag)
+
+        # find the list tag with the most child text tags
+        _tag_list = []
+        _max_tag = None
+        _max_num = 0
+        for tag in list_tag_list:
+            _child_text_tags = self._get_text_child_tags(tag[0])
+            if len(_child_text_tags) > _max_num:
+                _max_tag = tag
+                _max_num = len(_child_text_tags)
+
+        # TODO: extract list fields
+
 
 class SpiderImportApi(Resource):
     __doc__ = """
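The new `extract_fields` method above is a two-pass heuristic: it first collects every element that has at least `threshold` children all sharing a single tag name (a likely list container such as a `<ul>` or a repeated `<div>` block), then keeps the candidate whose first item contains the most text-bearing descendants, as counted by `_get_text_child_tags`. Below is a self-contained sketch of the same idea, detached from Crawlab's `SpiderApi`/`db_manager` plumbing; the sample page, the function names and the lowered threshold of 2 are made up for illustration:

```python
from lxml import etree

# toy page used only for this sketch
HTML = """
<html><body>
  <div id="nav"><a>Home</a><a>About</a></div>
  <ul id="articles">
    <li><h2>Title 1</h2><span>Author 1</span></li>
    <li><h2>Title 2</h2><span>Author 2</span></li>
    <li><h2>Title 3</h2><span>Author 3</span></li>
  </ul>
</body></html>
"""


def get_text_child_tags(el):
    # descendants (including el itself) that directly hold non-blank text
    return [tag for tag in el.iter() if tag.text is not None and tag.text.strip()]


def find_list_container(sel, threshold=2):
    # pass 1: elements whose children all share one tag name and number >= threshold
    candidates = []
    for tag in sel.iter():
        children = list(tag)
        if len(children) < threshold:
            continue
        if len({c.tag for c in children}) > 1:
            continue
        candidates.append(tag)

    # pass 2: keep the candidate whose first item has the most text-bearing tags
    best, best_num = None, 0
    for tag in candidates:
        num = len(get_text_child_tags(tag[0]))
        if num > best_num:
            best, best_num = tag, num
    return best


if __name__ == '__main__':
    sel = etree.HTML(HTML)
    container = find_list_container(sel)
    print(container.tag, container.get('id'))  # -> ul articles
```

Running it prints `ul articles`: the navigation bar also passes the same-tag check, but its first item carries less text, so the second pass discards it in favour of the repeated article list.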
diff --git a/crawlab/setup.py b/crawlab/setup.py
deleted file mode 100644
index 22cda1de..00000000
--- a/crawlab/setup.py
+++ /dev/null
@@ -1,24 +0,0 @@
-from setuptools import setup, find_packages
-
-with open("README.md", "r") as fh:
-    long_description = fh.read()
-
-with open('requirements.txt') as f:
-    requirements = [l for l in f.read().splitlines() if l]
-
-setup(
-    name='crawlab-server',
-    version='0.0.1',
-    url='https://github.com/tikazyq/crawlab',
-    install_requires=requirements,
-    license='BSD',
-    author='Marvin Zhang',
-    author_email='tikazyq@163.com',
-    description='Celery-based web crawler admin platform for managing distributed web spiders regardless of languages and frameworks.',
-    long_description=long_description,
-    long_description_content_type="text/markdown",
-    download_url="https://github.com/tikazyq/crawlab/archive/master.zip",
-    packages=find_packages(),
-    keywords=['celery', 'python', 'webcrawler', 'crawl', 'scrapy', 'admin'],
-    zip_safe=True,
-)
diff --git a/frontend/src/components/Config/ConfigList.vue b/frontend/src/components/Config/ConfigList.vue
index 6c47570a..908f2c9a 100644
--- a/frontend/src/components/Config/ConfigList.vue
+++ b/frontend/src/components/Config/ConfigList.vue
@@ -79,6 +79,7 @@
         {{$t('Run')}}
+        {{$t('Extract Fields')}}
         {{$t('Preview')}}
         {{$t('Save')}}
@@ -126,6 +127,7 @@ export default {
         { value: 'detail', label: 'Detail Only' },
         { value: 'list-detail', label: 'List + Detail' }
       ],
+      extractFieldsLoading: false,
       previewLoading: false,
       saveLoading: false,
       dialogVisible: false
@@ -210,6 +212,8 @@ export default {
          this.$message.success(this.$t(`Spider task has been scheduled`))
        })
      })
+    },
+    onExtractFields () {
     }
   },
   created () {
diff --git a/frontend/src/i18n/zh.js b/frontend/src/i18n/zh.js
index e6db4795..9af096c9 100644
--- a/frontend/src/i18n/zh.js
+++ b/frontend/src/i18n/zh.js
@@ -48,13 +48,14 @@
   Submit: '提交',
   'Import Spiders': '导入爬虫',
   'Deploy All': '部署所有爬虫',
-  Refresh: '刷新',
-  View: '查看',
-  Edit: '编辑',
-  Remove: '删除',
-  Confirm: '确认',
-  Stop: '停止',
-  Preview: '预览',
+  'Refresh': '刷新',
+  'View': '查看',
+  'Edit': '编辑',
+  'Remove': '删除',
+  'Confirm': '确认',
+  'Stop': '停止',
+  'Preview': '预览',
+  'Extract Fields': '提取字段',
 
   // 主页
   'Total Tasks': '总任务数',
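On the frontend this change only wires up the button: `onExtractFields ()` is still an empty stub, and the backend stops at `# TODO: extract list fields`. One plausible way to finish that step — purely a sketch building on the `find_list_container` example above, not Crawlab's actual implementation — is to derive a positional path for every text-bearing tag inside the first item of the detected container and replay those paths over every sibling item:

```python
def relative_path(item, tag):
    # positional path from item down to tag, e.g. 'h2[1]' or 'div[1]/span[2]'
    steps = []
    node = tag
    while node is not item:
        parent = node.getparent()
        siblings = [c for c in parent if c.tag == node.tag]
        steps.append(f'{node.tag}[{siblings.index(node) + 1}]')
        node = parent
    return '/'.join(reversed(steps))


def extract_list_fields(container):
    # container is an lxml element such as the one returned by find_list_container
    first = container[0]
    # one candidate field per text-bearing descendant of the first item
    paths = [relative_path(first, t) for t in first.iter()
             if t is not first and t.text is not None and t.text.strip()]

    rows = []
    for item in container:
        row = {}
        for path in paths:
            found = item.xpath(path)
            row[path] = found[0].text.strip() if found and found[0].text else None
        rows.append(row)
    return rows
```

Applied to the toy page in the earlier sketch, `extract_list_fields(find_list_container(sel))` yields one dict per `<li>`, e.g. `{'h2[1]': 'Title 1', 'span[1]': 'Author 1'}`, which is roughly the shape a field-extraction preview would need to return.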