mirror of
https://github.com/crawlab-team/crawlab.git
synced 2026-01-21 17:21:09 +01:00
updated auto field extraction
This commit is contained in:
@@ -103,6 +103,9 @@ class SpiderApi(BaseApi):
|
||||
|
||||
# whether to obey robots.txt
|
||||
('obey_robots_txt', bool),
|
||||
|
||||
# item threshold to filter out non-relevant list items
|
||||
('item_threshold', int),
|
||||
)
|
||||
|
||||
def get(self, id=None, action=None):
|
||||
@@ -508,6 +511,13 @@ class SpiderApi(BaseApi):
|
||||
# get html parse tree
|
||||
sel = etree.HTML(r.content)
|
||||
|
||||
# remove unnecessary tags
|
||||
unnecessary_tags = [
|
||||
'script'
|
||||
]
|
||||
for t in unnecessary_tags:
|
||||
etree.strip_tags(sel, t)
|
||||
|
||||
return sel
|
||||
|
||||
@staticmethod
|
||||
@@ -613,11 +623,11 @@ class SpiderApi(BaseApi):
|
||||
return sel
|
||||
|
||||
list_tag_list = []
|
||||
threshold = 10
|
||||
threshold = spider.get('item_threshold') or 10
|
||||
# iterate all child nodes in a top-down direction
|
||||
for tag in sel.iter():
|
||||
# get child tags
|
||||
child_tags = [t for t in tag.getchildren() if type(t) != etree._Comment]
|
||||
child_tags = self._get_children(tag)
|
||||
|
||||
if len(child_tags) < threshold:
|
||||
# if number of child tags is below threshold, skip
|
||||
@@ -634,7 +644,6 @@ class SpiderApi(BaseApi):
|
||||
list_tag_list.append(tag)
|
||||
|
||||
# find the list tag with the most child text tags
|
||||
_tag_list = []
|
||||
max_tag = None
|
||||
max_num = 0
|
||||
for tag in list_tag_list:
|
||||
@@ -648,8 +657,9 @@ class SpiderApi(BaseApi):
|
||||
if max_tag.get('id') is not None:
|
||||
item_selector = f'#{max_tag.get("id")} > {self._get_children(max_tag)[0].tag}'
|
||||
elif max_tag.get('class') is not None:
|
||||
if len(sel.cssselect(f'.{max_tag.get("class")}')) == 1:
|
||||
item_selector = f'.{max_tag.get("class")} > {self._get_children(max_tag)[0].tag}'
|
||||
cls_str = '.'.join([x for x in max_tag.get("class").split(' ') if x != ''])
|
||||
if len(sel.cssselect(f'.{cls_str}')) == 1:
|
||||
item_selector = f'.{cls_str} > {self._get_children(max_tag)[0].tag}'
|
||||
|
||||
# get list fields
|
||||
fields = []
|
||||
@@ -665,7 +675,6 @@ class SpiderApi(BaseApi):
|
||||
})
|
||||
elif tag.get('class') is not None:
|
||||
cls_str = '.'.join([x for x in tag.get("class").split(' ') if x != ''])
|
||||
# print(tag.tag + '.' + cls_str)
|
||||
if len(tag.cssselect(f'{tag.tag}.{cls_str}')) == 1:
|
||||
fields.append({
|
||||
'name': f'field{i + 1}',
|
||||
|
||||
@@ -25,7 +25,7 @@
|
||||
min-width="100px">
|
||||
|
||||
<template slot-scope="scope">
|
||||
{{scope.row[f.name] ? scope.row[f.name].trim() : ''}}
|
||||
{{getDisplayStr(scope.row[f.name])}}
|
||||
</template>
|
||||
</el-table-column>
|
||||
</el-table>
|
||||
@@ -34,8 +34,8 @@
|
||||
|
||||
<!--config detail-->
|
||||
<el-row>
|
||||
<el-col :span="11" :offset="1">
|
||||
<el-form label-width="150px">
|
||||
<el-form label-width="150px" ref="form" :model="spiderForm">
|
||||
<el-col :span="11" :offset="1">
|
||||
<el-form-item :label="$t('Crawl Type')">
|
||||
<el-button-group>
|
||||
<el-button v-for="type in crawlTypeList"
|
||||
@@ -46,7 +46,7 @@
|
||||
</el-button>
|
||||
</el-button-group>
|
||||
</el-form-item>
|
||||
<el-form-item :label="$t('Start URL')" required>
|
||||
<el-form-item :label="$t('Start URL')" prop="start_url" required>
|
||||
<el-input v-model="spiderForm.start_url" :placeholder="$t('Start URL')"></el-input>
|
||||
</el-form-item>
|
||||
<el-form-item :label="$t('Obey robots.txt')">
|
||||
@@ -55,10 +55,8 @@
|
||||
<!--<el-form-item :label="$t('URL Pattern')">-->
|
||||
<!--<el-input v-model="spiderForm.url_pattern" :placeholder="$t('URL Pattern')"></el-input>-->
|
||||
<!--</el-form-item>-->
|
||||
</el-form>
|
||||
</el-col>
|
||||
<el-col :span="11" :offset="1">
|
||||
<el-form label-width="150px">
|
||||
</el-col>
|
||||
<el-col :span="11" :offset="1">
|
||||
<el-form-item :label="$t('Item Selector')"
|
||||
v-if="['list','list-detail'].includes(spiderForm.crawl_type)">
|
||||
<el-select style="width: 35%;margin-right: 10px;"
|
||||
@@ -85,8 +83,12 @@
|
||||
:placeholder="$t('Pagination Selector')">
|
||||
</el-input>
|
||||
</el-form-item>
|
||||
</el-form>
|
||||
</el-col>
|
||||
<el-form-item :label="$t('Item Threshold')"
|
||||
v-if="['list','list-detail'].includes(spiderForm.crawl_type)">
|
||||
<el-input-number v-model="spiderForm.item_threshold"/>
|
||||
</el-form-item>
|
||||
</el-col>
|
||||
</el-form>
|
||||
</el-row>
|
||||
<!--./config detail-->
|
||||
|
||||
@@ -207,24 +209,28 @@ export default {
|
||||
})
|
||||
},
|
||||
onPreview () {
|
||||
this.onSave()
|
||||
.then(() => {
|
||||
this.previewLoading = true
|
||||
this.$store.dispatch('spider/getPreviewCrawlData')
|
||||
this.$refs['form'].validate(res => {
|
||||
if (res) {
|
||||
this.onSave()
|
||||
.then(() => {
|
||||
this.fields.forEach(f => {
|
||||
this.columnsDict[f.name] = f.name
|
||||
})
|
||||
this.dialogVisible = true
|
||||
this.previewLoading = true
|
||||
this.$store.dispatch('spider/getPreviewCrawlData')
|
||||
.then(() => {
|
||||
this.fields.forEach(f => {
|
||||
this.columnsDict[f.name] = f.name
|
||||
})
|
||||
this.dialogVisible = true
|
||||
})
|
||||
.catch(() => {
|
||||
this.$message.error(this.$t('Something wrong happened'))
|
||||
})
|
||||
.finally(() => {
|
||||
this.previewLoading = false
|
||||
})
|
||||
this.$st.sendEv('爬虫详情-配置', '预览')
|
||||
})
|
||||
.catch(() => {
|
||||
this.$message.error(this.$t('Something wrong happened'))
|
||||
})
|
||||
.finally(() => {
|
||||
this.previewLoading = false
|
||||
})
|
||||
this.$st.sendEv('爬虫详情-配置', '预览')
|
||||
})
|
||||
}
|
||||
})
|
||||
},
|
||||
onCrawl () {
|
||||
this.$confirm(this.$t('Are you sure to run this spider?'), this.$t('Notification'), {
|
||||
@@ -240,28 +246,42 @@ export default {
|
||||
})
|
||||
},
|
||||
onExtractFields () {
|
||||
this.onSave()
|
||||
.then(() => {
|
||||
this.extractFieldsLoading = true
|
||||
this.$store.dispatch('spider/extractFields')
|
||||
.then(response => {
|
||||
if (response.data.item_selector) {
|
||||
this.$set(this.spiderForm, 'item_selector', response.data.item_selector)
|
||||
this.$set(this.spiderForm, 'item_selector_type', 'css')
|
||||
}
|
||||
this.$refs['form'].validate(res => {
|
||||
if (res) {
|
||||
this.onSave()
|
||||
.then(() => {
|
||||
this.extractFieldsLoading = true
|
||||
this.$store.dispatch('spider/extractFields')
|
||||
.then(response => {
|
||||
if (response.data.item_selector) {
|
||||
this.$set(this.spiderForm, 'item_selector', response.data.item_selector)
|
||||
this.$set(this.spiderForm, 'item_selector_type', 'css')
|
||||
}
|
||||
|
||||
if (response.data.fields && response.data.fields.length) {
|
||||
this.spiderForm.fields = response.data.fields
|
||||
}
|
||||
if (response.data.fields && response.data.fields.length) {
|
||||
this.spiderForm.fields = response.data.fields
|
||||
}
|
||||
|
||||
if (response.data.pagination_selector) {
|
||||
this.spiderForm.pagination_selector = response.data.pagination_selector
|
||||
}
|
||||
})
|
||||
.finally(() => {
|
||||
this.extractFieldsLoading = false
|
||||
})
|
||||
this.$st.sendEv('爬虫详情-配置', '提取字段')
|
||||
})
|
||||
.finally(() => {
|
||||
this.extractFieldsLoading = false
|
||||
})
|
||||
this.$st.sendEv('爬虫详情-配置', '提取字段')
|
||||
})
|
||||
}
|
||||
})
|
||||
},
|
||||
onDeleteField (index) {
|
||||
this.fields.splice(index, 1)
|
||||
},
|
||||
getDisplayStr (value) {
|
||||
if (!value) return value
|
||||
value = value.trim()
|
||||
if (value.length > 20) return value.substr(0, 20) + '...'
|
||||
return value
|
||||
}
|
||||
},
|
||||
created () {
|
||||
@@ -293,7 +313,8 @@ export default {
|
||||
// if (!this.spiderForm.start_url) this.$set(this.spiderForm, 'start_url', 'http://example.com')
|
||||
if (!this.spiderForm.item_selector_type) this.$set(this.spiderForm, 'item_selector_type', 'css')
|
||||
if (!this.spiderForm.pagination_selector_type) this.$set(this.spiderForm, 'pagination_selector_type', 'css')
|
||||
if (this.spiderForm.obey_robots_txt === undefined) this.$set(this.spiderForm, 'obey_robots_txt', true)
|
||||
if (this.spiderForm.obey_robots_txt == null) this.$set(this.spiderForm, 'obey_robots_txt', true)
|
||||
if (this.spiderForm.item_threshold == null) this.$set(this.spiderForm, 'item_threshold', 10)
|
||||
}
|
||||
}
|
||||
</script>
|
||||
|
||||
@@ -60,6 +60,7 @@ export default {
|
||||
'Download CSV': '下载CSV',
|
||||
'Upload Zip File': '上传Zip文件',
|
||||
'Upload': '上传',
|
||||
'Item Threshold': '子项阈值',
|
||||
|
||||
// 主页
|
||||
'Total Tasks': '总任务数',
|
||||
|
||||
@@ -110,7 +110,8 @@ const actions = {
|
||||
item_selector_type: state.spiderForm.item_selector_type,
|
||||
pagination_selector: state.spiderForm.pagination_selector,
|
||||
pagination_selector_type: state.spiderForm.pagination_selector_type,
|
||||
obey_robots_txt: state.spiderForm.obey_robots_txt
|
||||
obey_robots_txt: state.spiderForm.obey_robots_txt,
|
||||
item_threshold: state.spiderForm.item_threshold
|
||||
})
|
||||
.then(() => {
|
||||
dispatch('getSpiderList')
|
||||
|
||||
Reference in New Issue
Block a user