From 04789febc88e820b3c94b09fe0e40dc35ccf1c80 Mon Sep 17 00:00:00 2001
From: Marvin Zhang
Date: Fri, 24 May 2019 17:28:25 +0800
Subject: [PATCH] prepared for configurable spiders

---
 crawlab/constants/spider.py                    |   5 +-
 crawlab/routes/base.py                         |   4 +-
 crawlab/routes/spiders.py                      |  53 ++++--
 crawlab/spiders/scrapy.cfg                     |  11 ++
 crawlab/spiders/spiders/__init__.py            |   0
 crawlab/spiders/spiders/items.py               |  14 ++
 crawlab/spiders/spiders/middlewares.py         | 103 ++++++++++++
 crawlab/spiders/spiders/pipelines.py           |  11 ++
 crawlab/spiders/spiders/settings.py            |  90 ++++++++++
 crawlab/spiders/spiders/spiders/__init__.py    |   4 +
 frontend/src/components/Config/ConfigList.vue  |  53 ++++++
 .../components/InfoView/SpiderInfoView.vue     |  30 ++--
 frontend/src/i18n/zh.js                        |  16 +-
 frontend/src/store/modules/spider.js           |   9 +-
 frontend/src/views/spider/SpiderDetail.vue     |  15 +-
 frontend/src/views/spider/SpiderList.vue       | 154 ++++++++++++++++--
 16 files changed, 513 insertions(+), 59 deletions(-)
 create mode 100644 crawlab/spiders/scrapy.cfg
 create mode 100644 crawlab/spiders/spiders/__init__.py
 create mode 100644 crawlab/spiders/spiders/items.py
 create mode 100644 crawlab/spiders/spiders/middlewares.py
 create mode 100644 crawlab/spiders/spiders/pipelines.py
 create mode 100644 crawlab/spiders/spiders/settings.py
 create mode 100644 crawlab/spiders/spiders/spiders/__init__.py
 create mode 100644 frontend/src/components/Config/ConfigList.vue

diff --git a/crawlab/constants/spider.py b/crawlab/constants/spider.py
index 685e2b07..8f1421be 100644
--- a/crawlab/constants/spider.py
+++ b/crawlab/constants/spider.py
@@ -1,7 +1,6 @@
 class SpiderType:
-    SCRAPY = 'scrapy'
-    PYSPIDER = 'pyspider'
-    WEBMAGIC = 'webmagic'
+    CONFIGURABLE = 'configurable'
+    CUSTOMIZED = 'customized'
 
 
 class LangType:
diff --git a/crawlab/routes/base.py b/crawlab/routes/base.py
index 1578b3f8..e068e4a2 100644
--- a/crawlab/routes/base.py
+++ b/crawlab/routes/base.py
@@ -111,7 +111,7 @@ class BaseApi(Resource):
 
             self.after_update()
 
-            return item
+            return jsonify(item)
 
     def update(self, id: str = None) -> (dict, tuple):
         """
@@ -137,7 +137,7 @@
         # execute after_update hook
         self.after_update(id)
 
-        return item
+        return jsonify(item)
 
     def post(self, id: str = None, action: str = None):
         """
diff --git a/crawlab/routes/spiders.py b/crawlab/routes/spiders.py
index 51bae78c..6f94f259 100644
--- a/crawlab/routes/spiders.py
+++ b/crawlab/routes/spiders.py
@@ -13,6 +13,7 @@ from werkzeug.datastructures import FileStorage
 
 from config import PROJECT_DEPLOY_FILE_FOLDER, PROJECT_SOURCE_FILE_FOLDER, PROJECT_TMP_FOLDER
 from constants.node import NodeStatus
+from constants.spider import SpiderType
 from constants.task import TaskStatus
 from db.manager import db_manager
 from routes.base import BaseApi
@@ -96,6 +97,8 @@ class SpiderApi(BaseApi):
         # get a list of items
         else:
             items = []
+
+            # get customized spiders
             dirs = os.listdir(PROJECT_SOURCE_FILE_FOLDER)
             for _dir in dirs:
                 if _dir in IGNORE_DIRS:
@@ -114,6 +117,7 @@ class SpiderApi(BaseApi):
                         'src': dir_path,
                         'lang': lang,
                         'suffix_stats': stats,
+                        'type': SpiderType.CUSTOMIZED
                     })
 
                 # existing spider
@@ -123,39 +127,52 @@ class SpiderApi(BaseApi):
                 else:
                     # get last deploy
                     last_deploy = db_manager.get_last_deploy(spider_id=spider['_id'])
                     if last_deploy is not None:
                         spider['deploy_ts'] = last_deploy['finish_ts']
 
-                    # get last task
-                    last_task = db_manager.get_last_task(spider_id=spider['_id'])
-                    if last_task is not None:
-                        spider['task_ts'] = last_task['create_ts']
-
-                    # get site
-                    if spider.get('site') is not None:
-                        site = db_manager.get('sites', spider['site'])
-                        if site is not None:
-                            spider['site_name'] = site['name']
 
                     # file stats
                     stats = get_file_suffix_stats(dir_path)
 
                     # language
                     lang = get_lang_by_stats(stats)
 
+                    # spider type
+                    type_ = SpiderType.CUSTOMIZED
+
                     # update spider data
                     db_manager.update_one('spiders', id=str(spider['_id']), values={
                         'lang': lang,
+                        'type': type_,
                         'suffix_stats': stats,
                     })
 
-                # ---------
-                # stats
-                # ---------
-                # last 5-run errors
-                spider['last_5_errors'] = get_last_n_run_errors_count(spider_id=spider['_id'], n=5)
-                spider['last_7d_tasks'] = get_last_n_day_tasks_count(spider_id=spider['_id'], n=5)
 
                 # append spider
                 items.append(spider)
 
+            # get configurable spiders
+            for spider in db_manager.list('spiders', {'type': SpiderType.CONFIGURABLE}):
+                # append spider
+                items.append(spider)
+
+            # get other info
+            for i in range(len(items)):
+                spider = items[i]
+
+                # get site
+                if spider.get('site') is not None:
+                    site = db_manager.get('sites', spider['site'])
+                    if site is not None:
+                        items[i]['site_name'] = site['name']
+
+                # get last task
+                last_task = db_manager.get_last_task(spider_id=spider['_id'])
+                if last_task is not None:
+                    items[i]['task_ts'] = last_task['create_ts']
+
+                # ---------
+                # stats
+                # ---------
+                # last 5-run errors
+                items[i]['last_5_errors'] = get_last_n_run_errors_count(spider_id=spider['_id'], n=5)
+                items[i]['last_7d_tasks'] = get_last_n_day_tasks_count(spider_id=spider['_id'], n=5)
+
             return {
                 'status': 'ok',
                 'items': jsonify(items)
diff --git a/crawlab/spiders/scrapy.cfg b/crawlab/spiders/scrapy.cfg
new file mode 100644
index 00000000..bf9391f1
--- /dev/null
+++ b/crawlab/spiders/scrapy.cfg
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+
+[settings]
+default = spiders.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = spiders
diff --git a/crawlab/spiders/spiders/__init__.py b/crawlab/spiders/spiders/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/crawlab/spiders/spiders/items.py b/crawlab/spiders/spiders/items.py
new file mode 100644
index 00000000..18cf9cfe
--- /dev/null
+++ b/crawlab/spiders/spiders/items.py
@@ -0,0 +1,14 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://doc.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class SpidersItem(scrapy.Item):
+    # define the fields for your item here like:
+    # name = scrapy.Field()
+    pass
diff --git a/crawlab/spiders/spiders/middlewares.py b/crawlab/spiders/spiders/middlewares.py
new file mode 100644
index 00000000..1760fe41
--- /dev/null
+++ b/crawlab/spiders/spiders/middlewares.py
@@ -0,0 +1,103 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+
+from scrapy import signals
+
+
+class SpidersSpiderMiddleware(object):
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the spider middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_spider_input(self, response, spider):
+        # Called for each response that goes through the spider
+        # middleware and into the spider.
+
+        # Should return None or raise an exception.
+        return None
+
+    def process_spider_output(self, response, result, spider):
+        # Called with the results returned from the Spider, after
+        # it has processed the response.
+
+        # Must return an iterable of Request, dict or Item objects.
+        for i in result:
+            yield i
+
+    def process_spider_exception(self, response, exception, spider):
+        # Called when a spider or process_spider_input() method
+        # (from other spider middleware) raises an exception.
+
+        # Should return either None or an iterable of Response, dict
+        # or Item objects.
+        pass
+
+    def process_start_requests(self, start_requests, spider):
+        # Called with the start requests of the spider, and works
+        # similarly to the process_spider_output() method, except
+        # that it doesn’t have a response associated.
+
+        # Must return only requests (not items).
+        for r in start_requests:
+            yield r
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
+
+
+class SpidersDownloaderMiddleware(object):
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the downloader middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_request(self, request, spider):
+        # Called for each request that goes through the downloader
+        # middleware.
+
+        # Must either:
+        # - return None: continue processing this request
+        # - or return a Response object
+        # - or return a Request object
+        # - or raise IgnoreRequest: process_exception() methods of
+        #   installed downloader middleware will be called
+        return None
+
+    def process_response(self, request, response, spider):
+        # Called with the response returned from the downloader.
+
+        # Must either;
+        # - return a Response object
+        # - return a Request object
+        # - or raise IgnoreRequest
+        return response
+
+    def process_exception(self, request, exception, spider):
+        # Called when a download handler or a process_request()
+        # (from other downloader middleware) raises an exception.
+
+        # Must either:
+        # - return None: continue processing this exception
+        # - return a Response object: stops process_exception() chain
+        # - return a Request object: stops process_exception() chain
+        pass
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
diff --git a/crawlab/spiders/spiders/pipelines.py b/crawlab/spiders/spiders/pipelines.py
new file mode 100644
index 00000000..9e0dd2e9
--- /dev/null
+++ b/crawlab/spiders/spiders/pipelines.py
@@ -0,0 +1,11 @@
+# -*- coding: utf-8 -*-
+
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
+
+
+class SpidersPipeline(object):
+    def process_item(self, item, spider):
+        return item
diff --git a/crawlab/spiders/spiders/settings.py b/crawlab/spiders/spiders/settings.py
new file mode 100644
index 00000000..c7b40f78
--- /dev/null
+++ b/crawlab/spiders/spiders/settings.py
@@ -0,0 +1,90 @@
+# -*- coding: utf-8 -*-
+
+# Scrapy settings for spiders project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     https://doc.scrapy.org/en/latest/topics/settings.html
+#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+
+BOT_NAME = 'spiders'
+
+SPIDER_MODULES = ['spiders.spiders']
+NEWSPIDER_MODULE = 'spiders.spiders'
+
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = 'spiders (+http://www.yourdomain.com)'
+
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = True
+
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+
+# Configure a delay for requests for the same website (default: 0)
+# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+#DOWNLOAD_DELAY = 3
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+
+# Disable cookies (enabled by default)
+#COOKIES_ENABLED = False
+
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#   'Accept-Language': 'en',
+#}
+
+# Enable or disable spider middlewares
+# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    'spiders.middlewares.SpidersSpiderMiddleware': 543,
+#}
+
+# Enable or disable downloader middlewares
+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+#DOWNLOADER_MIDDLEWARES = {
+#    'spiders.middlewares.SpidersDownloaderMiddleware': 543,
+#}
+
+# Enable or disable extensions
+# See https://doc.scrapy.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    'scrapy.extensions.telnet.TelnetConsole': None,
+#}
+
+# Configure item pipelines
+# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
+#ITEM_PIPELINES = {
+#    'spiders.pipelines.SpidersPipeline': 300,
+#}
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+
+# Enable and configure HTTP caching (disabled by default)
+# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = 'httpcache'
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
diff --git a/crawlab/spiders/spiders/spiders/__init__.py b/crawlab/spiders/spiders/spiders/__init__.py
new file mode 100644
index 00000000..ebd689ac
--- /dev/null
+++ b/crawlab/spiders/spiders/spiders/__init__.py
@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
diff --git a/frontend/src/components/Config/ConfigList.vue b/frontend/src/components/Config/ConfigList.vue
new file mode 100644
index 00000000..e538831d
--- /dev/null
+++ b/frontend/src/components/Config/ConfigList.vue
@@ -0,0 +1,53 @@
+
+
+
+
diff --git a/frontend/src/components/InfoView/SpiderInfoView.vue b/frontend/src/components/InfoView/SpiderInfoView.vue
index fe384f14..41d26375 100644
--- a/frontend/src/components/InfoView/SpiderInfoView.vue
+++ b/frontend/src/components/InfoView/SpiderInfoView.vue
@@ -12,10 +12,10 @@
-
+
-
+
@@ -32,13 +32,12 @@
-
-
-
+
+
+
-
+
@@ -50,7 +49,7 @@
       {{$t('Run')}}
-      {{$t('Deploy')}}
+      {{$t('Deploy')}}
       {{$t('Save')}}
@@ -99,13 +98,18 @@ export default {
     'spiderForm'
   ]),
   isShowRun () {
-    if (!this.spiderForm.deploy_ts) {
+    if (this.isCustomized) {
+      if (!this.spiderForm.deploy_ts) {
+        return false
+      }
+      return !!this.spiderForm.cmd
+    } else {
+      // TODO: has to add rules
       return false
     }
-    if (!this.spiderForm.cmd) {
-      return false
-    }
-    return true
+  },
+  isCustomized () {
+    return this.spiderForm.type === 'customized'
   }
 },
 methods: {
diff --git a/frontend/src/i18n/zh.js b/frontend/src/i18n/zh.js
index 8036d818..d3cb926e 100644
--- a/frontend/src/i18n/zh.js
+++ b/frontend/src/i18n/zh.js
@@ -13,16 +13,18 @@
   'Sites': '网站',
 
   // 标签
-  Overview: '概览',
-  Files: '文件',
+  'Overview': '概览',
+  'Files': '文件',
   'Deployed Spiders': '已部署爬虫',
   'Log': '日志',
   'Results': '结果',
   'Environment': '环境',
   'Analytics': '分析',
+  'Rules': '规则',
+  'Config': '配置',
 
   // 选择
-  Spider: '爬虫',
+  'Spider': '爬虫',
 
   // 块标题
   'Latest Tasks': '最近任务',
@@ -37,6 +39,7 @@
   REVOKED: '已取消',
 
   // 操作
+  Add: '添加',
   Run: '运行',
   Deploy: '部署',
   Save: '保存',
@@ -88,6 +91,9 @@
   'Variable': '变量',
   'Value': '值',
   'Add Environment Variables': '添加环境变量',
+  'Add Spider': '添加爬虫',
+  'Add Configurable Spider': '添加可配置爬虫',
+  'Add Customized Spider': '添加自定义爬虫',
   'Last 7-Day Tasks': '最近7天任务数',
   'Last 5-Run Errors': '最近5次运行错误数',
   '30-Day Tasks': '最近30天任务数',
@@ -98,6 +104,10 @@
   'Tasks by Node': '分节点任务数',
   'Daily Tasks': '每日任务数',
   'Daily Avg Duration (sec)': '每日平均运行时长(秒)',
+  'Configurable Spider': '可配置爬虫',
+  'Customized Spider': '自定义爬虫',
+  'Configurable': '可配置',
+  'Customized': '自定义',
 
   // 爬虫列表
   'Name': '名称',
diff --git a/frontend/src/store/modules/spider.js b/frontend/src/store/modules/spider.js
index a55f90f7..53005837 100644
--- a/frontend/src/store/modules/spider.js
+++ b/frontend/src/store/modules/spider.js
@@ -79,13 +79,8 @@ const actions = {
   addSpider ({ state, dispatch }) {
     return request.put('/spiders', {
       name: state.spiderForm.name,
-      src: state.spiderForm.src,
-      cmd: state.spiderForm.cmd,
-      type: state.spiderForm.type,
-      lang: state.spiderForm.lang,
       col: state.spiderForm.col,
-      cron: state.spiderForm.cron,
-      cron_enabled: state.spiderForm.cron_enabled ? 1 : 0,
+      type: 'configurable',
       site: state.spiderForm.site
     })
       .then(() => {
@@ -100,8 +95,6 @@
       type: state.spiderForm.type,
       lang: state.spiderForm.lang,
       col: state.spiderForm.col,
-      cron: state.spiderForm.cron,
-      cron_enabled: state.spiderForm.cron_enabled ? 1 : 0,
       site: state.spiderForm.site
     })
       .then(() => {
diff --git a/frontend/src/views/spider/SpiderDetail.vue b/frontend/src/views/spider/SpiderDetail.vue
index dac7931e..4080b04a 100644
--- a/frontend/src/views/spider/SpiderDetail.vue
+++ b/frontend/src/views/spider/SpiderDetail.vue
@@ -13,7 +13,10 @@
-
+
+
+
+
@@ -34,10 +37,12 @@ import FileList from '../../components/FileList/FileList'
 import SpiderOverview from '../../components/Overview/SpiderOverview'
 import EnvironmentList from '../../components/Environment/EnvironmentList'
 import SpiderStats from '../../components/Stats/SpiderStats'
+import ConfigList from '../../components/Config/ConfigList'
 
 export default {
   name: 'NodeDetail',
   components: {
+    ConfigList,
     SpiderStats,
     EnvironmentList,
     FileList,
@@ -58,7 +63,13 @@
     ]),
     ...mapState('deploy', [
       'deployList'
-    ])
+    ]),
+    isCustomized () {
+      return this.spiderForm.type === 'customized'
+    },
+    isConfigurable () {
+      return this.spiderForm.type === 'configurable'
+    }
   },
   methods: {
     onTabClick () {
diff --git a/frontend/src/views/spider/SpiderList.vue b/frontend/src/views/spider/SpiderList.vue
index 9628e28d..54baa762 100644
--- a/frontend/src/views/spider/SpiderList.vue
+++ b/frontend/src/views/spider/SpiderList.vue
@@ -1,6 +1,6 @@
 