From af21ab4841c6c7c16c57232255117e3d5acc8c1c Mon Sep 17 00:00:00 2001 From: Marvin Zhang Date: Wed, 6 Mar 2019 13:14:26 +0800 Subject: [PATCH] added Results --- .gitignore | 3 + README.md | 21 +++++ crawlab/manage.py | 7 +- crawlab/routes/nodes.py | 8 +- crawlab/routes/spiders.py | 15 +++- crawlab/routes/tasks.py | 15 ++++ crawlab/tasks/spider.py | 2 +- crawlab/utils/spider.py | 10 +++ .../components/InfoView/SpiderInfoView.vue | 10 ++- .../src/components/Overview/NodeOverview.vue | 2 +- .../components/Overview/SpiderOverview.vue | 2 +- .../components/TableView/GeneralTableView.vue | 77 ++++++++++++++++ .../components/TableView/TaskTableView.vue | 8 +- frontend/src/store/modules/spider.js | 3 +- frontend/src/store/modules/task.js | 17 +++- frontend/src/views/deploy/DeployList.vue | 9 +- frontend/src/views/node/NodeDetail.vue | 2 +- frontend/src/views/spider/SpiderList.vue | 2 +- frontend/src/views/task/TaskDetail.vue | 10 ++- frontend/src/views/task/TaskList.vue | 2 +- spiders/baidu/baidu/settings.py | 4 +- spiders/example_juejin/juejin/__init__.py | 0 spiders/example_juejin/juejin/items.py | 17 ++++ spiders/example_juejin/juejin/middlewares.py | 56 ++++++++++++ spiders/example_juejin/juejin/pipelines.py | 27 ++++++ spiders/example_juejin/juejin/settings.py | 89 +++++++++++++++++++ .../example_juejin/juejin/spiders/__init__.py | 4 + .../juejin/spiders/juejin_spider.py | 17 ++++ spiders/example_juejin/scrapy.cfg | 11 +++ 29 files changed, 425 insertions(+), 25 deletions(-) create mode 100644 frontend/src/components/TableView/GeneralTableView.vue create mode 100644 spiders/example_juejin/juejin/__init__.py create mode 100644 spiders/example_juejin/juejin/items.py create mode 100644 spiders/example_juejin/juejin/middlewares.py create mode 100644 spiders/example_juejin/juejin/pipelines.py create mode 100644 spiders/example_juejin/juejin/settings.py create mode 100644 spiders/example_juejin/juejin/spiders/__init__.py create mode 100644 spiders/example_juejin/juejin/spiders/juejin_spider.py create mode 100644 spiders/example_juejin/scrapy.cfg diff --git a/.gitignore b/.gitignore index e043754d..c366ea24 100644 --- a/.gitignore +++ b/.gitignore @@ -110,3 +110,6 @@ node_modules/ # egg-info *.egg-info + +# .DS_Store +.DS_Store diff --git a/README.md b/README.md index 8da8e297..b82c9bd4 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,23 @@ cd frontend npm run dev ``` +## Screenshot + +#### Home Page +![home](./docs/img/screenshot-home.png) + +#### Spider List + +![spider-list](./docs/img/screenshot-spiders.png) + +#### Spider Detail - Overview + +![spider-list](./docs/img/screenshot-spider-detail-overview.png) + +#### Task Detail - Results + +![spider-list](./docs/img/screenshot-task-detail-results.png) + ## Architecture The architecture of Crawlab is as below. It's very similar to Celery architecture, but a few more modules including Frontend, Spiders and Flower are added to feature the crawling management functionality. @@ -60,3 +77,7 @@ After deploying the spider, you can click "Run" button on spider detail page and ### Tasks Tasks are triggered and run by the workers. Users can check the task status info and logs in the task detail page. 
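The dispatch behind this flow appears further down in this patch: `SpiderApi.on_crawl` hands the spider id to the `execute_spider` Celery task, and the worker exports the Celery task id as `CRAWLAB_TASK_ID` before spawning the crawl process. A minimal sketch of that hand-off, assuming the task is importable as `tasks.spider.execute_spider` and using a placeholder spider id:

```python
# Hypothetical illustration of the on_crawl dispatch path; the spider id below
# is a placeholder, not a real document _id.
from tasks.spider import execute_spider

job = execute_spider.delay('5c7f0a0e8060c5a9f2e0b0aa')  # spider _id as a string

# The Celery task id doubles as the Crawlab task id; the worker writes it into
# CRAWLAB_TASK_ID so the spider's pipeline can tag every scraped item with it.
print(job.id)
```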
+ +### App + +### Broker diff --git a/crawlab/manage.py b/crawlab/manage.py index 98e7226a..45f07f17 100644 --- a/crawlab/manage.py +++ b/crawlab/manage.py @@ -86,12 +86,17 @@ def run_worker(): @click.command() -@click.argument('action', type=click.Choice([ActionType.APP, ActionType.FLOWER, ActionType.RUN_ALL])) +@click.argument('action', type=click.Choice([ActionType.APP, + ActionType.FLOWER, + ActionType.WORKER, + ActionType.RUN_ALL])) def main(action): if action == ActionType.APP: run_app() elif action == ActionType.FLOWER: run_flower() + elif action == ActionType.WORKER: + run_worker() elif action == ActionType.RUN_ALL: p_flower = Process(target=run_flower) p_flower.start() diff --git a/crawlab/routes/nodes.py b/crawlab/routes/nodes.py index 36a37114..91bb9686 100644 --- a/crawlab/routes/nodes.py +++ b/crawlab/routes/nodes.py @@ -1,3 +1,4 @@ +from constants.task import TaskStatus from db.manager import db_manager from routes.base import BaseApi from utils import jsonify @@ -64,8 +65,11 @@ class NodeApi(BaseApi): spider_id = item['spider_id'] spider = db_manager.get('spiders', id=str(spider_id)) item['spider_name'] = spider['name'] - task = db_manager.get('tasks_celery', id=item['_id']) - item['status'] = task['status'] + _task = db_manager.get('tasks_celery', id=item['_id']) + if _task: + item['status'] = _task['status'] + else: + item['status'] = TaskStatus.UNAVAILABLE return jsonify({ 'status': 'ok', 'items': items diff --git a/crawlab/routes/spiders.py b/crawlab/routes/spiders.py index cd4f242c..a2964400 100644 --- a/crawlab/routes/spiders.py +++ b/crawlab/routes/spiders.py @@ -30,14 +30,23 @@ class SpiderApi(BaseApi): col_name = 'spiders' arguments = ( + # name of spider ('name', str), + + # execute shell command ('cmd', str), + + # spider source folder ('src', str), + + # spider type ('type', str), + + # spider language ('lang', str), - # for deploy only - ('node_id', str), + # spider results collection + ('col', str), ) def get(self, id=None, action=None): @@ -130,8 +139,6 @@ class SpiderApi(BaseApi): } def on_crawl(self, id): - args = self.parser.parse_args() - job = execute_spider.delay(id) return { diff --git a/crawlab/routes/tasks.py b/crawlab/routes/tasks.py index 62854737..2f496c17 100644 --- a/crawlab/routes/tasks.py +++ b/crawlab/routes/tasks.py @@ -2,6 +2,7 @@ from constants.task import TaskStatus from db.manager import db_manager from routes.base import BaseApi from utils import jsonify +from utils.spider import get_spider_col_fields class TaskApi(BaseApi): @@ -71,3 +72,17 @@ class TaskApi(BaseApi): 'status': 'ok', 'error': str(err) }, 500 + + def get_results(self, id): + task = db_manager.get('tasks', id=id) + spider = db_manager.get('spiders', id=task['spider_id']) + col_name = spider.get('col') + if not col_name: + return [] + fields = get_spider_col_fields(col_name) + items = db_manager.list(col_name, {'task_id': id}) + return jsonify({ + 'status': 'ok', + 'fields': fields, + 'items': items + }) diff --git a/crawlab/tasks/spider.py b/crawlab/tasks/spider.py index 6098061b..26e0faf5 100644 --- a/crawlab/tasks/spider.py +++ b/crawlab/tasks/spider.py @@ -17,7 +17,6 @@ logger = get_logger(__name__) @celery_app.task(bind=True) def execute_spider(self, id: str): - print(self.state) task_id = self.request.id hostname = self.request.hostname spider = db_manager.get('spiders', id=id) @@ -53,6 +52,7 @@ def execute_spider(self, id: str): # execute the command env = os.environ.copy() env['CRAWLAB_TASK_ID'] = task_id + env['CRAWLAB_COLLECTION'] = spider.get('col') p = 
subprocess.Popen(command.split(' '), stdout=stdout.fileno(), stderr=stderr.fileno(), diff --git a/crawlab/utils/spider.py b/crawlab/utils/spider.py index 10b89720..2ee3dd77 100644 --- a/crawlab/utils/spider.py +++ b/crawlab/utils/spider.py @@ -1,6 +1,7 @@ import os from constants.spider import FILE_SUFFIX_LANG_MAPPING, LangType, SUFFIX_IGNORE, SpiderType +from db.manager import db_manager def get_lang_by_stats(stats: dict) -> LangType: @@ -21,3 +22,12 @@ def get_spider_type(path: str) -> SpiderType: for file_name in os.listdir(path): if file_name == 'scrapy.cfg': return SpiderType.SCRAPY + + +def get_spider_col_fields(col_name): + items = db_manager.list(col_name, {}, limit=100, sort_key='_id') + fields = set() + for item in items: + for k in item.keys(): + fields.add(k) + return list(fields) diff --git a/frontend/src/components/InfoView/SpiderInfoView.vue b/frontend/src/components/InfoView/SpiderInfoView.vue index 1ed15fdc..417db03f 100644 --- a/frontend/src/components/InfoView/SpiderInfoView.vue +++ b/frontend/src/components/InfoView/SpiderInfoView.vue @@ -15,10 +15,14 @@ - + + + + @@ -37,8 +41,8 @@ - Run - Deploy + Run + Save diff --git a/frontend/src/components/Overview/NodeOverview.vue b/frontend/src/components/Overview/NodeOverview.vue index 653d2d54..7f9baf0b 100644 --- a/frontend/src/components/Overview/NodeOverview.vue +++ b/frontend/src/components/Overview/NodeOverview.vue @@ -7,7 +7,7 @@ - + diff --git a/frontend/src/components/Overview/SpiderOverview.vue b/frontend/src/components/Overview/SpiderOverview.vue index 61fc1c02..166bd3ae 100644 --- a/frontend/src/components/Overview/SpiderOverview.vue +++ b/frontend/src/components/Overview/SpiderOverview.vue @@ -7,7 +7,7 @@ - + diff --git a/frontend/src/components/TableView/GeneralTableView.vue b/frontend/src/components/TableView/GeneralTableView.vue new file mode 100644 index 00000000..5a91b104 --- /dev/null +++ b/frontend/src/components/TableView/GeneralTableView.vue @@ -0,0 +1,77 @@ + + + + + diff --git a/frontend/src/components/TableView/TaskTableView.vue b/frontend/src/components/TableView/TaskTableView.vue index 9e2a328a..afa2c053 100644 --- a/frontend/src/components/TableView/TaskTableView.vue +++ b/frontend/src/components/TableView/TaskTableView.vue @@ -4,7 +4,7 @@
{{title}}
- + @@ -21,10 +24,12 @@ import { mapState } from 'vuex' import TaskOverview from '../../components/Overview/TaskOverview' +import GeneralTableView from '../../components/TableView/GeneralTableView' export default { name: 'TaskDetail', components: { + GeneralTableView, TaskOverview }, data () { @@ -34,7 +39,9 @@ export default { }, computed: { ...mapState('task', [ - 'taskLog' + 'taskLog', + 'taskResultsData', + 'taskResultsColumns' ]), ...mapState('file', [ 'currentPath' @@ -53,6 +60,7 @@ export default { created () { this.$store.dispatch('task/getTaskData', this.$route.params.id) this.$store.dispatch('task/getTaskLog', this.$route.params.id) + this.$store.dispatch('task/getTaskResults', this.$route.params.id) } } diff --git a/frontend/src/views/task/TaskList.vue b/frontend/src/views/task/TaskList.vue index 5796494d..60498977 100644 --- a/frontend/src/views/task/TaskList.vue +++ b/frontend/src/views/task/TaskList.vue @@ -13,7 +13,7 @@ icon="el-icon-refresh" class="refresh" @click="onRefresh"> - Search + Refresh diff --git a/spiders/baidu/baidu/settings.py b/spiders/baidu/baidu/settings.py index 34218961..667b09ca 100644 --- a/spiders/baidu/baidu/settings.py +++ b/spiders/baidu/baidu/settings.py @@ -19,8 +19,8 @@ NEWSPIDER_MODULE = 'baidu.spiders' #USER_AGENT = 'baidu (+http://www.yourdomain.com)' # Obey robots.txt rules -# ROBOTSTXT_OBEY = True -ROBOTSTXT_OBEY = False +ROBOTSTXT_OBEY = True +# ROBOTSTXT_OBEY = False # Configure maximum concurrent requests performed by Scrapy (default: 16) #CONCURRENT_REQUESTS = 32 diff --git a/spiders/example_juejin/juejin/__init__.py b/spiders/example_juejin/juejin/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/spiders/example_juejin/juejin/items.py b/spiders/example_juejin/juejin/items.py new file mode 100644 index 00000000..2c4717dd --- /dev/null +++ b/spiders/example_juejin/juejin/items.py @@ -0,0 +1,17 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# http://doc.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class JuejinItem(scrapy.Item): + # define the fields for your item here like: + _id = scrapy.Field() + title = scrapy.Field() + link = scrapy.Field() + like = scrapy.Field() + task_id = scrapy.Field() diff --git a/spiders/example_juejin/juejin/middlewares.py b/spiders/example_juejin/juejin/middlewares.py new file mode 100644 index 00000000..9d5225a2 --- /dev/null +++ b/spiders/example_juejin/juejin/middlewares.py @@ -0,0 +1,56 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your spider middleware +# +# See documentation in: +# http://doc.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + + +class JuejinSpiderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. 
+ for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Response, dict + # or Item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/spiders/example_juejin/juejin/pipelines.py b/spiders/example_juejin/juejin/pipelines.py new file mode 100644 index 00000000..a6876dfc --- /dev/null +++ b/spiders/example_juejin/juejin/pipelines.py @@ -0,0 +1,27 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html +import os + +from pymongo import MongoClient + +MONGO_HOST = '192.168.99.100' +MONGO_PORT = 27017 +MONGO_DB = 'crawlab_test' + + +class JuejinPipeline(object): + mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT) + db = mongo[MONGO_DB] + col_name = os.environ.get('CRAWLAB_COLLECTION') + if not col_name: + col_name = 'test' + col = db[col_name] + + def process_item(self, item, spider): + item['task_id'] = os.environ.get('CRAWLAB_TASK_ID') + self.col.save(item) + return item diff --git a/spiders/example_juejin/juejin/settings.py b/spiders/example_juejin/juejin/settings.py new file mode 100644 index 00000000..44f8866c --- /dev/null +++ b/spiders/example_juejin/juejin/settings.py @@ -0,0 +1,89 @@ +# -*- coding: utf-8 -*- + +# Scrapy settings for juejin project +# +# For simplicity, this file contains only settings considered important or +# commonly used. 
You can find more settings consulting the documentation: +# +# http://doc.scrapy.org/en/latest/topics/settings.html +# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html +# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'juejin' + +SPIDER_MODULES = ['juejin.spiders'] +NEWSPIDER_MODULE = 'juejin.spiders' + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = True + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +# CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +# DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +# CONCURRENT_REQUESTS_PER_DOMAIN = 16 +# CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +# COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +# TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +# DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +# } + +# Enable or disable spider middlewares +# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html +# SPIDER_MIDDLEWARES = { +# 'juejin.middlewares.JuejinSpiderMiddleware': 543, +# } + +# Enable or disable downloader middlewares +# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html +# DOWNLOADER_MIDDLEWARES = { +# 'juejin.middlewares.MyCustomDownloaderMiddleware': 543, +# } + +# Enable or disable extensions +# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html +# EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +# } + +# Configure item pipelines +# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'juejin.pipelines.JuejinPipeline': 300, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See http://doc.scrapy.org/en/latest/topics/autothrottle.html +# AUTOTHROTTLE_ENABLED = True +# The initial download delay +# AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +# AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +# AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +# HTTPCACHE_ENABLED = True +# HTTPCACHE_EXPIRATION_SECS = 0 +# HTTPCACHE_DIR = 'httpcache' +# HTTPCACHE_IGNORE_HTTP_CODES = [] +# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' diff --git a/spiders/example_juejin/juejin/spiders/__init__.py b/spiders/example_juejin/juejin/spiders/__init__.py new file mode 100644 index 00000000..ebd689ac --- /dev/null +++ b/spiders/example_juejin/juejin/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. 
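With the pipeline and settings in place, the example project can be exercised outside Crawlab before deploying it. The sketch below assumes a MongoDB instance reachable at the host hard-coded in `pipelines.py` and a local `scrapy` install; the two environment variables imitate what `crawlab/tasks/spider.py` exports before spawning the spider process:

```python
# Minimal local check of the juejin example, run from spiders/example_juejin/.
# The collection name and task id are placeholders chosen for this test.
import os
import subprocess

env = os.environ.copy()
env['CRAWLAB_TASK_ID'] = 'local-test'         # normally the Celery task id
env['CRAWLAB_COLLECTION'] = 'results_juejin'  # normally the spider's 'col' field

subprocess.run(['scrapy', 'crawl', 'juejin_spider'], env=env, check=True)
# Scraped items should now be in the 'results_juejin' collection, each tagged
# with task_id == 'local-test'.
```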
diff --git a/spiders/example_juejin/juejin/spiders/juejin_spider.py b/spiders/example_juejin/juejin/spiders/juejin_spider.py
new file mode 100644
index 00000000..28df5be7
--- /dev/null
+++ b/spiders/example_juejin/juejin/spiders/juejin_spider.py
@@ -0,0 +1,17 @@
+# -*- coding: utf-8 -*-
+import scrapy
+from juejin.items import JuejinItem
+
+
+class JuejinSpiderSpider(scrapy.Spider):
+    name = 'juejin_spider'
+    allowed_domains = ['juejin.im']
+    start_urls = ['https://juejin.im/search?query=celery']
+
+    def parse(self, response):
+        for item in response.css('ul.main-list > li.item'):
+            yield JuejinItem(
+                title=item.css('.title span').extract_first(),
+                link=item.css('a::attr("href")').extract_first(),
+                like=item.css('.like .count::text').extract_first(),
+            )
diff --git a/spiders/example_juejin/scrapy.cfg b/spiders/example_juejin/scrapy.cfg
new file mode 100644
index 00000000..38ba44f1
--- /dev/null
+++ b/spiders/example_juejin/scrapy.cfg
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.org/en/latest/deploy.html
+
+[settings]
+default = juejin.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = juejin
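For reference, the lookup behind the new results view can be reproduced outside the web app. The sketch below mirrors what `TaskApi.get_results` does — resolve the task's spider, read its `col` field, then filter that collection by `task_id` — using `pymongo` directly; the connection details and the assumption that task ids are stored as strings are placeholders, not guaranteed to match `db.manager`:

```python
# Illustrative reproduction of TaskApi.get_results; host, port, and database
# name are assumptions for a local setup.
from bson import ObjectId
from pymongo import MongoClient

db = MongoClient(host='localhost', port=27017)['crawlab_test']

def get_task_results(task_id):
    task = db['tasks'].find_one({'_id': task_id})  # task id assumed to be a string
    spider = db['spiders'].find_one({'_id': ObjectId(task['spider_id'])})
    col_name = spider.get('col')
    if not col_name:
        return [], []
    items = list(db[col_name].find({'task_id': task_id}))
    # Column headers for the frontend table are the union of keys across items,
    # the same idea as utils.spider.get_spider_col_fields.
    fields = sorted({key for item in items for key in item.keys()})
    return fields, items
```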