From 2b9cd566ebd7e2ec577f98eb3f0447888c53cc4d Mon Sep 17 00:00:00 2001 From: Marvin Zhang Date: Tue, 4 Jun 2019 23:43:47 +0800 Subject: [PATCH] updated auto field extraction --- crawlab/routes/spiders.py | 21 +++- frontend/src/components/Config/ConfigList.vue | 109 +++++++++++------- frontend/src/i18n/zh.js | 1 + frontend/src/store/modules/spider.js | 3 +- 4 files changed, 83 insertions(+), 51 deletions(-) diff --git a/crawlab/routes/spiders.py b/crawlab/routes/spiders.py index de5feb94..91ab8608 100644 --- a/crawlab/routes/spiders.py +++ b/crawlab/routes/spiders.py @@ -103,6 +103,9 @@ class SpiderApi(BaseApi): # whether to obey robots.txt ('obey_robots_txt', bool), + + # item threshold to filter out non-relevant list items + ('item_threshold', int), ) def get(self, id=None, action=None): @@ -508,6 +511,13 @@ class SpiderApi(BaseApi): # get html parse tree sel = etree.HTML(r.content) + # remove unnecessary tags + unnecessary_tags = [ + 'script' + ] + for t in unnecessary_tags: + etree.strip_tags(sel, t) + return sel @staticmethod @@ -613,11 +623,11 @@ class SpiderApi(BaseApi): return sel list_tag_list = [] - threshold = 10 + threshold = spider.get('item_threshold') or 10 # iterate all child nodes in a top-down direction for tag in sel.iter(): # get child tags - child_tags = [t for t in tag.getchildren() if type(t) != etree._Comment] + child_tags = self._get_children(tag) if len(child_tags) < threshold: # if number of child tags is below threshold, skip @@ -634,7 +644,6 @@ class SpiderApi(BaseApi): list_tag_list.append(tag) # find the list tag with the most child text tags - _tag_list = [] max_tag = None max_num = 0 for tag in list_tag_list: @@ -648,8 +657,9 @@ class SpiderApi(BaseApi): if max_tag.get('id') is not None: item_selector = f'#{max_tag.get("id")} > {self._get_children(max_tag)[0].tag}' elif max_tag.get('class') is not None: - if len(sel.cssselect(f'.{max_tag.get("class")}')) == 1: - item_selector = f'.{max_tag.get("class")} > {self._get_children(max_tag)[0].tag}' + cls_str = '.'.join([x for x in max_tag.get("class").split(' ') if x != '']) + if len(sel.cssselect(f'.{cls_str}')) == 1: + item_selector = f'.{cls_str} > {self._get_children(max_tag)[0].tag}' # get list fields fields = [] @@ -665,7 +675,6 @@ class SpiderApi(BaseApi): }) elif tag.get('class') is not None: cls_str = '.'.join([x for x in tag.get("class").split(' ') if x != '']) - # print(tag.tag + '.' + cls_str) if len(tag.cssselect(f'{tag.tag}.{cls_str}')) == 1: fields.append({ 'name': f'field{i + 1}', diff --git a/frontend/src/components/Config/ConfigList.vue b/frontend/src/components/Config/ConfigList.vue index af833edc..0362b72f 100644 --- a/frontend/src/components/Config/ConfigList.vue +++ b/frontend/src/components/Config/ConfigList.vue @@ -25,7 +25,7 @@ min-width="100px"> @@ -34,8 +34,8 @@ - - + + - + @@ -55,10 +55,8 @@ - - - - + + - - + + + + + @@ -207,24 +209,28 @@ export default { }) }, onPreview () { - this.onSave() - .then(() => { - this.previewLoading = true - this.$store.dispatch('spider/getPreviewCrawlData') + this.$refs['form'].validate(res => { + if (res) { + this.onSave() .then(() => { - this.fields.forEach(f => { - this.columnsDict[f.name] = f.name - }) - this.dialogVisible = true + this.previewLoading = true + this.$store.dispatch('spider/getPreviewCrawlData') + .then(() => { + this.fields.forEach(f => { + this.columnsDict[f.name] = f.name + }) + this.dialogVisible = true + }) + .catch(() => { + this.$message.error(this.$t('Something wrong happened')) + }) + .finally(() => { + this.previewLoading = false + }) + this.$st.sendEv('爬虫详情-配置', '预览') }) - .catch(() => { - this.$message.error(this.$t('Something wrong happened')) - }) - .finally(() => { - this.previewLoading = false - }) - this.$st.sendEv('爬虫详情-配置', '预览') - }) + } + }) }, onCrawl () { this.$confirm(this.$t('Are you sure to run this spider?'), this.$t('Notification'), { @@ -240,28 +246,42 @@ export default { }) }, onExtractFields () { - this.onSave() - .then(() => { - this.extractFieldsLoading = true - this.$store.dispatch('spider/extractFields') - .then(response => { - if (response.data.item_selector) { - this.$set(this.spiderForm, 'item_selector', response.data.item_selector) - this.$set(this.spiderForm, 'item_selector_type', 'css') - } + this.$refs['form'].validate(res => { + if (res) { + this.onSave() + .then(() => { + this.extractFieldsLoading = true + this.$store.dispatch('spider/extractFields') + .then(response => { + if (response.data.item_selector) { + this.$set(this.spiderForm, 'item_selector', response.data.item_selector) + this.$set(this.spiderForm, 'item_selector_type', 'css') + } - if (response.data.fields && response.data.fields.length) { - this.spiderForm.fields = response.data.fields - } + if (response.data.fields && response.data.fields.length) { + this.spiderForm.fields = response.data.fields + } + + if (response.data.pagination_selector) { + this.spiderForm.pagination_selector = response.data.pagination_selector + } + }) + .finally(() => { + this.extractFieldsLoading = false + }) + this.$st.sendEv('爬虫详情-配置', '提取字段') }) - .finally(() => { - this.extractFieldsLoading = false - }) - this.$st.sendEv('爬虫详情-配置', '提取字段') - }) + } + }) }, onDeleteField (index) { this.fields.splice(index, 1) + }, + getDisplayStr (value) { + if (!value) return value + value = value.trim() + if (value.length > 20) return value.substr(0, 20) + '...' + return value } }, created () { @@ -293,7 +313,8 @@ export default { // if (!this.spiderForm.start_url) this.$set(this.spiderForm, 'start_url', 'http://example.com') if (!this.spiderForm.item_selector_type) this.$set(this.spiderForm, 'item_selector_type', 'css') if (!this.spiderForm.pagination_selector_type) this.$set(this.spiderForm, 'pagination_selector_type', 'css') - if (this.spiderForm.obey_robots_txt === undefined) this.$set(this.spiderForm, 'obey_robots_txt', true) + if (this.spiderForm.obey_robots_txt == null) this.$set(this.spiderForm, 'obey_robots_txt', true) + if (this.spiderForm.item_threshold == null) this.$set(this.spiderForm, 'item_threshold', 10) } } diff --git a/frontend/src/i18n/zh.js b/frontend/src/i18n/zh.js index c4b95f78..fb49de32 100644 --- a/frontend/src/i18n/zh.js +++ b/frontend/src/i18n/zh.js @@ -60,6 +60,7 @@ export default { 'Download CSV': '下载CSV', 'Upload Zip File': '上传Zip文件', 'Upload': '上传', + 'Item Threshold': '子项阈值', // 主页 'Total Tasks': '总任务数', diff --git a/frontend/src/store/modules/spider.js b/frontend/src/store/modules/spider.js index 831c0d0e..25f1260f 100644 --- a/frontend/src/store/modules/spider.js +++ b/frontend/src/store/modules/spider.js @@ -110,7 +110,8 @@ const actions = { item_selector_type: state.spiderForm.item_selector_type, pagination_selector: state.spiderForm.pagination_selector, pagination_selector_type: state.spiderForm.pagination_selector_type, - obey_robots_txt: state.spiderForm.obey_robots_txt + obey_robots_txt: state.spiderForm.obey_robots_txt, + item_threshold: state.spiderForm.item_threshold }) .then(() => { dispatch('getSpiderList')