diff --git a/crawlab/routes/spiders.py b/crawlab/routes/spiders.py
index de5feb94..91ab8608 100644
--- a/crawlab/routes/spiders.py
+++ b/crawlab/routes/spiders.py
@@ -103,6 +103,9 @@ class SpiderApi(BaseApi):
# whether to obey robots.txt
('obey_robots_txt', bool),
+
+ # minimum number of child tags an element must have to be considered a list container
+ ('item_threshold', int),
)
def get(self, id=None, action=None):
@@ -508,6 +511,13 @@ class SpiderApi(BaseApi):
# get html parse tree
sel = etree.HTML(r.content)
+ # Remove elements whose content is never list data.
+ # strip_elements() deletes each matching element together with its text and
+ # children; strip_tags() would only drop the wrapper and merge the inline
+ # JavaScript source into the parent's text. with_tail=False keeps the text
+ # that follows a removed element, which belongs to the parent.
+ unnecessary_tags = [
+     'script'
+ ]
+ etree.strip_elements(sel, *unnecessary_tags, with_tail=False)
+
return sel
@staticmethod
@@ -613,11 +623,11 @@ class SpiderApi(BaseApi):
return sel
list_tag_list = []
- threshold = 10
+ threshold = spider.get('item_threshold') or 10
# iterate all child nodes in a top-down direction
for tag in sel.iter():
# get child tags
- child_tags = [t for t in tag.getchildren() if type(t) != etree._Comment]
+ child_tags = self._get_children(tag)
if len(child_tags) < threshold:
# if number of child tags is below threshold, skip
@@ -634,7 +644,6 @@ class SpiderApi(BaseApi):
list_tag_list.append(tag)
# find the list tag with the most child text tags
- _tag_list = []
max_tag = None
max_num = 0
for tag in list_tag_list:
@@ -648,8 +657,9 @@ class SpiderApi(BaseApi):
if max_tag.get('id') is not None:
item_selector = f'#{max_tag.get("id")} > {self._get_children(max_tag)[0].tag}'
elif max_tag.get('class') is not None:
- if len(sel.cssselect(f'.{max_tag.get("class")}')) == 1:
- item_selector = f'.{max_tag.get("class")} > {self._get_children(max_tag)[0].tag}'
+ # Build a compound class selector ("a b" -> "a.b"). split() without an
+ # argument splits on any run of whitespace, so class names separated by
+ # tabs or newlines are handled as well as single spaces.
+ cls_str = '.'.join(max_tag.get("class").split())
+ # An empty class attribute would yield the invalid selector '.', so only
+ # query when at least one class name is present. The selector is used only
+ # if it identifies exactly one element in the document.
+ if cls_str and len(sel.cssselect(f'.{cls_str}')) == 1:
+     item_selector = f'.{cls_str} > {self._get_children(max_tag)[0].tag}'
# get list fields
fields = []
@@ -665,7 +675,6 @@ class SpiderApi(BaseApi):
})
elif tag.get('class') is not None:
cls_str = '.'.join([x for x in tag.get("class").split(' ') if x != ''])
- # print(tag.tag + '.' + cls_str)
if len(tag.cssselect(f'{tag.tag}.{cls_str}')) == 1:
fields.append({
'name': f'field{i + 1}',
diff --git a/frontend/src/components/Config/ConfigList.vue b/frontend/src/components/Config/ConfigList.vue
index af833edc..0362b72f 100644
--- a/frontend/src/components/Config/ConfigList.vue
+++ b/frontend/src/components/Config/ConfigList.vue
@@ -25,7 +25,7 @@
min-width="100px">
- {{scope.row[f.name] ? scope.row[f.name].trim() : ''}}
+ {{getDisplayStr(scope.row[f.name])}}
@@ -34,8 +34,8 @@
-
-
+
+
-
+
@@ -55,10 +55,8 @@
-
-
-
-
+
+
-
-
+
+
+
+
+
@@ -207,24 +209,28 @@ export default {
})
},
onPreview () {
- this.onSave()
- .then(() => {
- this.previewLoading = true
- this.$store.dispatch('spider/getPreviewCrawlData')
+ this.$refs['form'].validate(res => {
+ if (res) {
+ this.onSave()
.then(() => {
- this.fields.forEach(f => {
- this.columnsDict[f.name] = f.name
- })
- this.dialogVisible = true
+ this.previewLoading = true
+ this.$store.dispatch('spider/getPreviewCrawlData')
+ .then(() => {
+ this.fields.forEach(f => {
+ this.columnsDict[f.name] = f.name
+ })
+ this.dialogVisible = true
+ })
+ .catch(() => {
+ this.$message.error(this.$t('Something wrong happened'))
+ })
+ .finally(() => {
+ this.previewLoading = false
+ })
+ this.$st.sendEv('爬虫详情-配置', '预览')
})
- .catch(() => {
- this.$message.error(this.$t('Something wrong happened'))
- })
- .finally(() => {
- this.previewLoading = false
- })
- this.$st.sendEv('爬虫详情-配置', '预览')
- })
+ }
+ })
},
onCrawl () {
this.$confirm(this.$t('Are you sure to run this spider?'), this.$t('Notification'), {
@@ -240,28 +246,42 @@ export default {
})
},
onExtractFields () {
- this.onSave()
- .then(() => {
- this.extractFieldsLoading = true
- this.$store.dispatch('spider/extractFields')
- .then(response => {
- if (response.data.item_selector) {
- this.$set(this.spiderForm, 'item_selector', response.data.item_selector)
- this.$set(this.spiderForm, 'item_selector_type', 'css')
- }
+ this.$refs['form'].validate(res => {
+ if (res) {
+ this.onSave()
+ .then(() => {
+ this.extractFieldsLoading = true
+ this.$store.dispatch('spider/extractFields')
+ .then(response => {
+ if (response.data.item_selector) {
+ this.$set(this.spiderForm, 'item_selector', response.data.item_selector)
+ this.$set(this.spiderForm, 'item_selector_type', 'css')
+ }
- if (response.data.fields && response.data.fields.length) {
- this.spiderForm.fields = response.data.fields
- }
+ if (response.data.fields && response.data.fields.length) {
+ this.spiderForm.fields = response.data.fields
+ }
+
+ if (response.data.pagination_selector) {
+ this.spiderForm.pagination_selector = response.data.pagination_selector
+ }
+ })
+ .finally(() => {
+ this.extractFieldsLoading = false
+ })
+ this.$st.sendEv('爬虫详情-配置', '提取字段')
})
- .finally(() => {
- this.extractFieldsLoading = false
- })
- this.$st.sendEv('爬虫详情-配置', '提取字段')
- })
+ }
+ })
},
onDeleteField (index) {
this.fields.splice(index, 1)
+ },
+ /**
+  * Build the text shown for a cell value.
+  * Falsy values are returned unchanged; any other value is converted to a
+  * string, trimmed, and cut to `maxLength` characters followed by '...'.
+  * @param {*} value raw cell value
+  * @param {number} [maxLength=20] longest text shown before truncating
+  * @returns {*} the display text, or the original falsy value
+  */
+ getDisplayStr (value, maxLength = 20) {
+   if (!value) return value
+   // Coerce first so a truthy non-string value (e.g. a number) does not throw on trim().
+   const text = String(value).trim()
+   // slice() is used in place of the deprecated substr().
+   return text.length > maxLength ? text.slice(0, maxLength) + '...' : text
+ }
},
created () {
@@ -293,7 +313,8 @@ export default {
// if (!this.spiderForm.start_url) this.$set(this.spiderForm, 'start_url', 'http://example.com')
if (!this.spiderForm.item_selector_type) this.$set(this.spiderForm, 'item_selector_type', 'css')
if (!this.spiderForm.pagination_selector_type) this.$set(this.spiderForm, 'pagination_selector_type', 'css')
- if (this.spiderForm.obey_robots_txt === undefined) this.$set(this.spiderForm, 'obey_robots_txt', true)
+ if (this.spiderForm.obey_robots_txt == null) this.$set(this.spiderForm, 'obey_robots_txt', true)
+ if (this.spiderForm.item_threshold == null) this.$set(this.spiderForm, 'item_threshold', 10)
}
}
diff --git a/frontend/src/i18n/zh.js b/frontend/src/i18n/zh.js
index c4b95f78..fb49de32 100644
--- a/frontend/src/i18n/zh.js
+++ b/frontend/src/i18n/zh.js
@@ -60,6 +60,7 @@ export default {
'Download CSV': '下载CSV',
'Upload Zip File': '上传Zip文件',
'Upload': '上传',
+ 'Item Threshold': '子项阈值',
// 主页
'Total Tasks': '总任务数',
diff --git a/frontend/src/store/modules/spider.js b/frontend/src/store/modules/spider.js
index 831c0d0e..25f1260f 100644
--- a/frontend/src/store/modules/spider.js
+++ b/frontend/src/store/modules/spider.js
@@ -110,7 +110,8 @@ const actions = {
item_selector_type: state.spiderForm.item_selector_type,
pagination_selector: state.spiderForm.pagination_selector,
pagination_selector_type: state.spiderForm.pagination_selector_type,
- obey_robots_txt: state.spiderForm.obey_robots_txt
+ obey_robots_txt: state.spiderForm.obey_robots_txt,
+ item_threshold: state.spiderForm.item_threshold
})
.then(() => {
dispatch('getSpiderList')