From 96a9c220771351710cedfc149d9d817137ee70df Mon Sep 17 00:00:00 2001
From: Marvin Zhang
Date: Sat, 25 May 2019 20:18:27 +0800
Subject: [PATCH] added configurable spider: add/edit fields, preview results

---
 crawlab/constants/spider.py                   |  16 ++
 crawlab/routes/spiders.py                     | 105 +++++++-
 frontend/src/components/Config/ConfigList.vue | 231 +++++++++++++++++-
 frontend/src/i18n/zh.js                       |   4 +
 frontend/src/store/modules/spider.js          |  26 +-
 5 files changed, 366 insertions(+), 16 deletions(-)

diff --git a/crawlab/constants/spider.py b/crawlab/constants/spider.py
index 8f1421be..97cbbdf2 100644
--- a/crawlab/constants/spider.py
+++ b/crawlab/constants/spider.py
@@ -16,6 +16,22 @@ class CronEnabled:
     OFF = 0
 
 
+class CrawlType:
+    LIST = 'list'
+    DETAIL = 'detail'
+    LIST_DETAIL = 'list-detail'
+
+
+class QueryType:
+    CSS = 'css'
+    XPATH = 'xpath'
+
+
+class ExtractType:
+    TEXT = 'text'
+    ATTRIBUTE = 'attribute'
+
+
 SUFFIX_IGNORE = [
     'pyc'
 ]
diff --git a/crawlab/routes/spiders.py b/crawlab/routes/spiders.py
index 6f94f259..ae36a69b 100644
--- a/crawlab/routes/spiders.py
+++ b/crawlab/routes/spiders.py
@@ -9,11 +9,12 @@ import requests
 from bson import ObjectId
 from flask import current_app, request
 from flask_restful import reqparse, Resource
+from lxml import etree
 from werkzeug.datastructures import FileStorage
 
 from config import PROJECT_DEPLOY_FILE_FOLDER, PROJECT_SOURCE_FILE_FOLDER, PROJECT_TMP_FOLDER
 from constants.node import NodeStatus
-from constants.spider import SpiderType
+from constants.spider import SpiderType, CrawlType, QueryType, ExtractType
 from constants.task import TaskStatus
 from db.manager import db_manager
 from routes.base import BaseApi
@@ -65,6 +66,25 @@ class SpiderApi(BaseApi):
 
         # spider site
         ('site', str),
+
+        ########################
+        # Configurable Spider
+        ########################
+
+        # spider crawl fields
+        ('fields', str),
+
+        # spider crawl type
+        ('crawl_type', str),
+
+        # spider start url
+        ('start_url', str),
+
+        # spider item selector
+        ('item_selector', str),
+
+        # spider pagination selector
+        ('pagination_selector', str),
     )
 
     def get(self, id=None, action=None):
@@ -394,10 +414,93 @@ class SpiderApi(BaseApi):
         scheduler.update()
 
     def update_envs(self, id: str):
+        """
+        Update environment variables
+        :param id: spider_id
+        """
         args = self.parser.parse_args()
         envs = json.loads(args.envs)
         db_manager.update_one(col_name='spiders', id=id, values={'envs': envs})
 
+    def update_fields(self, id: str):
+        """
+        Update fields for configurable spiders
+        :param id: spider_id
+        """
+        args = self.parser.parse_args()
+        fields = json.loads(args.fields)
+        db_manager.update_one(col_name='spiders', id=id, values={'fields': fields})
+
+    def preview_crawl(self, id: str):
+        spider = db_manager.get(col_name='spiders', id=id)
+
+        if spider['type'] != SpiderType.CONFIGURABLE:
+            return {
+                'status': 'error',
+                'error': 'type %s is invalid' % spider['type']
+            }, 400
+
+        if spider.get('start_url') is None:
+            return {
+                'status': 'error',
+                'error': 'start_url should not be empty'
+            }, 400
+
+        try:
+            r = requests.get(spider['start_url'])
+        except Exception:
+            return {
+                'status': 'error',
+                'error': 'connection error'
+            }, 500
+
+        if r.status_code != 200:
+            return {
+                'status': 'error',
+                'error': 'status code is not 200, but %s' % r.status_code
+            }, 500
+
+        # get html parse tree
+        sel = etree.HTML(r.content)
+
+        # parse fields
+        if spider['crawl_type'] == CrawlType.LIST:
+            if spider.get('item_selector') is None:
+                return {
+                    'status': 'error',
+                    'error': 'item_selector should not be empty'
+                }, 400
+
+            # TODO: enable xpath for the item selector (currently CSS only)
+            data = []
+            items = sel.cssselect(spider['item_selector'])
+            for item in items:
+                row = {}
+                for f in spider['fields']:
+                    if f['type'] == QueryType.CSS:
+                        # css selector
+                        res = item.cssselect(f['query'])
+                    else:
+                        # xpath
+                        res = item.xpath(f['query'])
+
+                    if len(res) > 0:
+                        if f['extract_type'] == ExtractType.TEXT:
+                            row[f['name']] = res[0].text
+                        else:
+                            row[f['name']] = res[0].get(f['attribute'])
+                data.append(row)
+            return {
+                'status': 'ok',
+                'items': data
+            }
+
+        elif spider['crawl_type'] == CrawlType.DETAIL:
+            # detail-page crawling is not implemented in this patch
+            pass
+
+        elif spider['crawl_type'] == CrawlType.LIST_DETAIL:
+            # list + detail crawling is not implemented in this patch
+            pass
+
 
 class SpiderImportApi(Resource):
     __doc__ = """
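Note: the following is a minimal, self-contained sketch of the per-item extraction logic in preview_crawl above, runnable outside Crawlab. The HTML, item selector, and field definitions are hypothetical examples rather than anything shipped in this patch, and element.cssselect() requires the cssselect package to be installed alongside lxml.

    from lxml import etree

    HTML = '''
    <ul>
      <li class="item"><a class="title" href="/post/1">First</a></li>
      <li class="item"><a class="title" href="/post/2">Second</a></li>
    </ul>
    '''

    # hypothetical field config, same shape as the `fields` documents above
    fields = [
        {'name': 'title', 'type': 'css', 'query': 'a.title', 'extract_type': 'text'},
        {'name': 'url', 'type': 'css', 'query': 'a.title',
         'extract_type': 'attribute', 'attribute': 'href'},
    ]

    sel = etree.HTML(HTML)
    data = []
    for item in sel.cssselect('li.item'):          # item_selector
        row = {}
        for f in fields:
            if f['type'] == 'css':                 # QueryType.CSS
                res = item.cssselect(f['query'])
            else:                                  # QueryType.XPATH
                res = item.xpath(f['query'])
            if len(res) > 0:
                if f['extract_type'] == 'text':    # ExtractType.TEXT
                    row[f['name']] = res[0].text
                else:                              # ExtractType.ATTRIBUTE
                    row[f['name']] = res[0].get(f['attribute'])
        data.append(row)

    print(data)
    # [{'title': 'First', 'url': '/post/1'}, {'title': 'Second', 'url': '/post/2'}]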
diff --git a/frontend/src/components/Config/ConfigList.vue b/frontend/src/components/Config/ConfigList.vue
index e538831d..f1f26fbe 100644
--- a/frontend/src/components/Config/ConfigList.vue
+++ b/frontend/src/components/Config/ConfigList.vue
@@ -1,35 +1,121 @@
@@ -40,14 +126,133 @@ import {
 
 export default {
   name: 'ConfigList',
+  data () {
+    return {
+      crawlTypeList: [
+        { value: 'list', label: 'List Only' },
+        { value: 'detail', label: 'Detail Only' },
+        { value: 'list-detail', label: 'List + Detail' }
+      ],
+      previewLoading: false,
+      saveLoading: false,
+      dialogVisible: false
+    }
+  },
   computed: {
     ...mapState('spider', [
-      'spiderForm'
+      'spiderForm',
+      'previewCrawlData'
     ])
+  },
+  methods: {
+    addField () {
+      this.spiderForm.fields.push({
+        type: 'css',
+        extract_type: 'text'
+      })
+    },
+    deleteField (index) {
+      this.spiderForm.fields.splice(index, 1)
+    },
+    onSelectCrawlType (value) {
+      this.spiderForm.crawl_type = value
+    },
+    onSave () {
+      return new Promise((resolve, reject) => {
+        this.saveLoading = true
+        this.$store.dispatch('spider/updateSpiderFields')
+          .then(() => {
+            this.$store.dispatch('spider/editSpider')
+              .then(() => {
+                this.$message.success(this.$t('Spider info has been saved successfully'))
+                resolve()
+              })
+              .catch(() => {
+                this.$message.error(this.$t('Something wrong happened'))
+                reject(new Error())
+              })
+              .finally(() => {
+                this.saveLoading = false
+              })
+          })
+          .catch(() => {
+            this.$message.error(this.$t('Something wrong happened'))
+            this.saveLoading = false
+            reject(new Error())
+          })
+      })
+    },
+    onDialogClose () {
+      this.dialogVisible = false
+    },
+    onPreview () {
+      this.onSave()
+        .then(() => {
+          this.previewLoading = true
+          this.$store.dispatch('spider/getPreviewCrawlData')
+            .then(() => {
+              this.dialogVisible = true
+            })
+            .catch(() => {
+              this.$message.error(this.$t('Something wrong happened'))
+            })
+            .finally(() => {
+              this.previewLoading = false
+            })
+        })
+    }
+  },
+  created () {
+    if (!this.spiderForm.fields) {
+      const fields = []
+      for (let i = 0; i < 3; i++) {
+        fields.push({
+          name: `field_${i + 1}`,
+          type: 'css',
+          extract_type: 'text'
+        })
+      }
+      // assign via $set so the new property is reactive (Vue 2)
+      this.$set(this.spiderForm, 'fields', fields)
+    }
+    if (!this.spiderForm.crawl_type) this.$set(this.spiderForm, 'crawl_type', 'list')
+    if (!this.spiderForm.start_url) this.$set(this.spiderForm, 'start_url', 'http://example.com')
   }
 }
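For reference, each entry the field editor above builds carries the keys name, type, query and extract_type (plus attribute when extract_type is 'attribute'). The store's updateSpiderFields action JSON-stringifies the array and the backend's update_fields decodes it with json.loads before writing it to the spider document. A small sketch of that round trip, with hypothetical values:

    import json

    # hypothetical payload, matching the shape the component builds
    fields_param = json.dumps([
        {'name': 'title', 'type': 'css', 'query': 'a.title', 'extract_type': 'text'},
        {'name': 'link', 'type': 'xpath', 'query': './/a',
         'extract_type': 'attribute', 'attribute': 'href'},
    ])

    # what update_fields recovers before writing to MongoDB
    fields = json.loads(fields_param)
    assert all({'name', 'type', 'query', 'extract_type'} <= set(f) for f in fields)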
diff --git a/frontend/src/i18n/zh.js b/frontend/src/i18n/zh.js
index d3cb926e..ffdebad7 100644
--- a/frontend/src/i18n/zh.js
+++ b/frontend/src/i18n/zh.js
@@ -54,6 +54,7 @@ export default {
   Remove: '删除',
   Confirm: '确认',
   Stop: '停止',
+  Preview: '预览',
 
   // 主页
   'Total Tasks': '总任务数',
@@ -94,6 +95,7 @@ export default {
   'Add Spider': '添加爬虫',
   'Add Configurable Spider': '添加可配置爬虫',
   'Add Customized Spider': '添加自定义爬虫',
+  'Add Field': '添加字段',
   'Last 7-Day Tasks': '最近7天任务数',
   'Last 5-Run Errors': '最近5次运行错误数',
   '30-Day Tasks': '最近30天任务数',
@@ -108,6 +110,8 @@ export default {
   'Customized Spider': '自定义爬虫',
   'Configurable': '可配置',
   'Customized': '自定义',
+  'Text': '文本',
+  'Attribute': '属性',
 
   // 爬虫列表
   'Name': '名称',
diff --git a/frontend/src/store/modules/spider.js b/frontend/src/store/modules/spider.js
index 53005837..bff750e3 100644
--- a/frontend/src/store/modules/spider.js
+++ b/frontend/src/store/modules/spider.js
@@ -29,7 +29,10 @@ const state = {
   nodeStats: [],
 
   // filters
-  filterSite: ''
+  filterSite: '',
+
+  // preview crawl data
+  previewCrawlData: []
 }
 
 const getters = {}
@@ -61,6 +64,9 @@ const mutations = {
   },
   SET_FILTER_SITE (state, value) {
     state.filterSite = value
+  },
+  SET_PREVIEW_CRAWL_DATA (state, value) {
+    state.previewCrawlData = value
   }
 }
 
@@ -95,7 +101,12 @@ const actions = {
       type: state.spiderForm.type,
       lang: state.spiderForm.lang,
       col: state.spiderForm.col,
-      site: state.spiderForm.site
+      site: state.spiderForm.site,
+      // configurable spider
+      crawl_type: state.spiderForm.crawl_type,
+      start_url: state.spiderForm.start_url,
+      item_selector: state.spiderForm.item_selector,
+      pagination_selector: state.spiderForm.pagination_selector
     })
       .then(() => {
         dispatch('getSpiderList')
@@ -112,6 +123,11 @@ const actions = {
       envs: JSON.stringify(state.spiderForm.envs)
     })
   },
+  updateSpiderFields ({ state }) {
+    return request.post(`/spiders/${state.spiderForm._id}/update_fields`, {
+      fields: JSON.stringify(state.spiderForm.fields)
+    })
+  },
   getSpiderData ({ state, commit }, id) {
     return request.get(`/spiders/${id}`)
       .then(response => {
@@ -177,6 +193,12 @@ const actions = {
       commit('SET_DAILY_STATS', response.data.daily_stats)
       commit('SET_NODE_STATS', response.data.task_count_by_node)
     })
+  },
+  getPreviewCrawlData ({ state, commit }) {
+    return request.post(`/spiders/${state.spiderForm._id}/preview_crawl`)
+      .then(response => {
+        commit('SET_PREVIEW_CRAWL_DATA', response.data.items)
+      })
   }
 }
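Taken together, the component's onSave/onPreview handlers drive the two new endpoints through the store actions above. A rough sketch of the equivalent raw HTTP calls, assuming a form-encoded body (which flask_restful's reqparse accepts); the base URL and spider id below are hypothetical:

    import json
    import requests

    BASE = 'http://localhost:8000/api'        # hypothetical API root
    SPIDER_ID = '5cea000000000000000000aa'    # hypothetical spider _id

    # persist the field config (what updateSpiderFields does)
    requests.post(BASE + '/spiders/' + SPIDER_ID + '/update_fields', data={
        'fields': json.dumps([
            {'name': 'title', 'type': 'css', 'query': 'a.title', 'extract_type': 'text'}
        ])
    })

    # fetch a preview of the first page (what getPreviewCrawlData does)
    r = requests.post(BASE + '/spiders/' + SPIDER_ID + '/preview_crawl')
    print(r.json().get('items'))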