updated auto field extraction

This commit is contained in:
Marvin Zhang
2019-06-04 23:43:47 +08:00
parent d2a1bf8430
commit 2b9cd566eb
4 changed files with 83 additions and 51 deletions

View File

@@ -103,6 +103,9 @@ class SpiderApi(BaseApi):
# whether to obey robots.txt
('obey_robots_txt', bool),
# item threshold to filter out non-relevant list items
('item_threshold', int),
)
def get(self, id=None, action=None):
@@ -508,6 +511,13 @@ class SpiderApi(BaseApi):
# get html parse tree
sel = etree.HTML(r.content)
# remove unnecessary tags
unnecessary_tags = [
'script'
]
for t in unnecessary_tags:
etree.strip_tags(sel, t)
return sel
@staticmethod
@@ -613,11 +623,11 @@ class SpiderApi(BaseApi):
return sel
list_tag_list = []
threshold = 10
threshold = spider.get('item_threshold') or 10
# iterate all child nodes in a top-down direction
for tag in sel.iter():
# get child tags
child_tags = [t for t in tag.getchildren() if type(t) != etree._Comment]
child_tags = self._get_children(tag)
if len(child_tags) < threshold:
# if number of child tags is below threshold, skip
@@ -634,7 +644,6 @@ class SpiderApi(BaseApi):
list_tag_list.append(tag)
# find the list tag with the most child text tags
_tag_list = []
max_tag = None
max_num = 0
for tag in list_tag_list:
@@ -648,8 +657,9 @@ class SpiderApi(BaseApi):
if max_tag.get('id') is not None:
item_selector = f'#{max_tag.get("id")} > {self._get_children(max_tag)[0].tag}'
elif max_tag.get('class') is not None:
if len(sel.cssselect(f'.{max_tag.get("class")}')) == 1:
item_selector = f'.{max_tag.get("class")} > {self._get_children(max_tag)[0].tag}'
cls_str = '.'.join([x for x in max_tag.get("class").split(' ') if x != ''])
if len(sel.cssselect(f'.{cls_str}')) == 1:
item_selector = f'.{cls_str} > {self._get_children(max_tag)[0].tag}'
# get list fields
fields = []
@@ -665,7 +675,6 @@ class SpiderApi(BaseApi):
})
elif tag.get('class') is not None:
cls_str = '.'.join([x for x in tag.get("class").split(' ') if x != ''])
# print(tag.tag + '.' + cls_str)
if len(tag.cssselect(f'{tag.tag}.{cls_str}')) == 1:
fields.append({
'name': f'field{i + 1}',

View File

@@ -25,7 +25,7 @@
min-width="100px">
<template slot-scope="scope">
{{scope.row[f.name] ? scope.row[f.name].trim() : ''}}
{{getDisplayStr(scope.row[f.name])}}
</template>
</el-table-column>
</el-table>
@@ -34,8 +34,8 @@
<!--config detail-->
<el-row>
<el-col :span="11" :offset="1">
<el-form label-width="150px">
<el-form label-width="150px" ref="form" :model="spiderForm">
<el-col :span="11" :offset="1">
<el-form-item :label="$t('Crawl Type')">
<el-button-group>
<el-button v-for="type in crawlTypeList"
@@ -46,7 +46,7 @@
</el-button>
</el-button-group>
</el-form-item>
<el-form-item :label="$t('Start URL')" required>
<el-form-item :label="$t('Start URL')" prop="start_url" required>
<el-input v-model="spiderForm.start_url" :placeholder="$t('Start URL')"></el-input>
</el-form-item>
<el-form-item :label="$t('Obey robots.txt')">
@@ -55,10 +55,8 @@
<!--<el-form-item :label="$t('URL Pattern')">-->
<!--<el-input v-model="spiderForm.url_pattern" :placeholder="$t('URL Pattern')"></el-input>-->
<!--</el-form-item>-->
</el-form>
</el-col>
<el-col :span="11" :offset="1">
<el-form label-width="150px">
</el-col>
<el-col :span="11" :offset="1">
<el-form-item :label="$t('Item Selector')"
v-if="['list','list-detail'].includes(spiderForm.crawl_type)">
<el-select style="width: 35%;margin-right: 10px;"
@@ -85,8 +83,12 @@
:placeholder="$t('Pagination Selector')">
</el-input>
</el-form-item>
</el-form>
</el-col>
<el-form-item :label="$t('Item Threshold')"
v-if="['list','list-detail'].includes(spiderForm.crawl_type)">
<el-input-number v-model="spiderForm.item_threshold"/>
</el-form-item>
</el-col>
</el-form>
</el-row>
<!--./config detail-->
@@ -207,24 +209,28 @@ export default {
})
},
onPreview () {
this.onSave()
.then(() => {
this.previewLoading = true
this.$store.dispatch('spider/getPreviewCrawlData')
this.$refs['form'].validate(res => {
if (res) {
this.onSave()
.then(() => {
this.fields.forEach(f => {
this.columnsDict[f.name] = f.name
})
this.dialogVisible = true
this.previewLoading = true
this.$store.dispatch('spider/getPreviewCrawlData')
.then(() => {
this.fields.forEach(f => {
this.columnsDict[f.name] = f.name
})
this.dialogVisible = true
})
.catch(() => {
this.$message.error(this.$t('Something wrong happened'))
})
.finally(() => {
this.previewLoading = false
})
this.$st.sendEv('爬虫详情-配置', '预览')
})
.catch(() => {
this.$message.error(this.$t('Something wrong happened'))
})
.finally(() => {
this.previewLoading = false
})
this.$st.sendEv('爬虫详情-配置', '预览')
})
}
})
},
onCrawl () {
this.$confirm(this.$t('Are you sure to run this spider?'), this.$t('Notification'), {
@@ -240,28 +246,42 @@ export default {
})
},
onExtractFields () {
this.onSave()
.then(() => {
this.extractFieldsLoading = true
this.$store.dispatch('spider/extractFields')
.then(response => {
if (response.data.item_selector) {
this.$set(this.spiderForm, 'item_selector', response.data.item_selector)
this.$set(this.spiderForm, 'item_selector_type', 'css')
}
this.$refs['form'].validate(res => {
if (res) {
this.onSave()
.then(() => {
this.extractFieldsLoading = true
this.$store.dispatch('spider/extractFields')
.then(response => {
if (response.data.item_selector) {
this.$set(this.spiderForm, 'item_selector', response.data.item_selector)
this.$set(this.spiderForm, 'item_selector_type', 'css')
}
if (response.data.fields && response.data.fields.length) {
this.spiderForm.fields = response.data.fields
}
if (response.data.fields && response.data.fields.length) {
this.spiderForm.fields = response.data.fields
}
if (response.data.pagination_selector) {
this.spiderForm.pagination_selector = response.data.pagination_selector
}
})
.finally(() => {
this.extractFieldsLoading = false
})
this.$st.sendEv('爬虫详情-配置', '提取字段')
})
.finally(() => {
this.extractFieldsLoading = false
})
this.$st.sendEv('爬虫详情-配置', '提取字段')
})
}
})
},
onDeleteField (index) {
this.fields.splice(index, 1)
},
getDisplayStr (value) {
if (!value) return value
value = value.trim()
if (value.length > 20) return value.substr(0, 20) + '...'
return value
}
},
created () {
@@ -293,7 +313,8 @@ export default {
// if (!this.spiderForm.start_url) this.$set(this.spiderForm, 'start_url', 'http://example.com')
if (!this.spiderForm.item_selector_type) this.$set(this.spiderForm, 'item_selector_type', 'css')
if (!this.spiderForm.pagination_selector_type) this.$set(this.spiderForm, 'pagination_selector_type', 'css')
if (this.spiderForm.obey_robots_txt === undefined) this.$set(this.spiderForm, 'obey_robots_txt', true)
if (this.spiderForm.obey_robots_txt == null) this.$set(this.spiderForm, 'obey_robots_txt', true)
if (this.spiderForm.item_threshold == null) this.$set(this.spiderForm, 'item_threshold', 10)
}
}
</script>

View File

@@ -60,6 +60,7 @@ export default {
'Download CSV': '下载CSV',
'Upload Zip File': '上传Zip文件',
'Upload': '上传',
'Item Threshold': '子项阈值',
// 主页
'Total Tasks': '总任务数',

View File

@@ -110,7 +110,8 @@ const actions = {
item_selector_type: state.spiderForm.item_selector_type,
pagination_selector: state.spiderForm.pagination_selector,
pagination_selector_type: state.spiderForm.pagination_selector_type,
obey_robots_txt: state.spiderForm.obey_robots_txt
obey_robots_txt: state.spiderForm.obey_robots_txt,
item_threshold: state.spiderForm.item_threshold
})
.then(() => {
dispatch('getSpiderList')