added configurable spider: add/edit fields, preview results

This commit is contained in:
Marvin Zhang
2019-05-25 20:18:27 +08:00
parent 5477099f15
commit 96a9c22077
5 changed files with 366 additions and 16 deletions

View File

@@ -16,6 +16,22 @@ class CronEnabled:
OFF = 0
class CrawlType:
    """Crawl modes a configurable spider can be set to."""

    # list pages only
    LIST = 'list'
    # detail pages only
    DETAIL = 'detail'
    # list pages plus detail pages
    LIST_DETAIL = 'list-detail'
class QueryType:
    """Selector languages a field query can be written in."""

    # CSS selector
    CSS = 'css'
    # XPath expression
    XPATH = 'xpath'
class ExtractType:
    """What to read from a node matched by a field query."""

    # the node's text
    TEXT = 'text'
    # one of the node's attributes
    ATTRIBUTE = 'attribute'
# NOTE(review): presumably file suffixes to skip when handling spider files - confirm against callers
SUFFIX_IGNORE = ['pyc']

View File

@@ -9,11 +9,12 @@ import requests
from bson import ObjectId
from flask import current_app, request
from flask_restful import reqparse, Resource
from lxml import etree
from werkzeug.datastructures import FileStorage
from config import PROJECT_DEPLOY_FILE_FOLDER, PROJECT_SOURCE_FILE_FOLDER, PROJECT_TMP_FOLDER
from constants.node import NodeStatus
from constants.spider import SpiderType
from constants.spider import SpiderType, CrawlType, QueryType, ExtractType
from constants.task import TaskStatus
from db.manager import db_manager
from routes.base import BaseApi
@@ -65,6 +66,25 @@ class SpiderApi(BaseApi):
# spider site
('site', str),
########################
# Configurable Spider
########################
# spider crawl fields
('fields', str),
# spider crawl type
('crawl_type', str),
# spider start url
('start_url', str),
# spider item selector
('item_selector', str),
# spider pagination selector
('pagination_selector', str),
)
def get(self, id=None, action=None):
@@ -394,10 +414,93 @@ class SpiderApi(BaseApi):
scheduler.update()
def update_envs(self, id: str):
    """
    Store the environment variables submitted for a spider
    :param id: spider_id
    """
    payload = self.parser.parse_args()
    # `envs` arrives as a JSON string and is saved in decoded form
    decoded_envs = json.loads(payload.envs)
    db_manager.update_one(
        col_name='spiders',
        id=id,
        values={'envs': decoded_envs},
    )
def update_fields(self, id: str):
    """
    Store the field definitions submitted for a configurable spider
    :param id: spider_id
    """
    payload = self.parser.parse_args()
    # `fields` arrives as a JSON string and is saved in decoded form
    decoded_fields = json.loads(payload.fields)
    db_manager.update_one(
        col_name='spiders',
        id=id,
        values={'fields': decoded_fields},
    )
def preview_crawl(self, id: str):
    """
    Fetch the start URL of a configurable spider and extract a preview of its items
    :param id: spider_id
    :return: on success a dict with `status` and `items` (one dict per matched item,
             keyed by field name); on failure a `(dict, http_status)` tuple whose dict
             carries an `error` message
    """
    spider = db_manager.get(col_name='spiders', id=id)

    # NOTE(review): assumes db_manager.get returns None for an unknown id - confirm
    if spider is None:
        return {
            'status': 'ok',
            'error': 'spider %s not found' % id
        }, 404

    if spider.get('type') != SpiderType.CONFIGURABLE:
        return {
            'status': 'ok',
            'error': 'type %s is invalid' % spider.get('type')
        }, 400

    # an empty string is as unusable as a missing value
    if not spider.get('start_url'):
        return {
            'status': 'ok',
            'error': 'start_url should not be empty'
        }, 400

    try:
        # the timeout keeps an unresponsive site from blocking this request indefinitely
        r = requests.get(spider['start_url'], timeout=30)
    except requests.exceptions.RequestException:
        return {
            'status': 'ok',
            'error': 'connection error'
        }, 500

    if r.status_code != 200:
        # an explicit status is required, otherwise this error body is served as HTTP 200
        return {
            'status': 'ok',
            'error': 'status code is not 200, but %s' % r.status_code
        }, 500

    crawl_type = spider.get('crawl_type')
    if crawl_type != CrawlType.LIST:
        # previews for detail / list-detail are not implemented; answer explicitly
        # instead of falling through and returning nothing
        return {
            'status': 'ok',
            'error': 'crawl_type %s is not supported for preview' % crawl_type
        }, 400

    if not spider.get('item_selector'):
        return {
            'status': 'ok',
            'error': 'item_selector should not be empty'
        }, 400

    # get html parse tree
    sel = etree.HTML(r.content)

    # TODO: enable xpath
    # NOTE(review): etree.HTML can yield None for a body without markup - treated as no items
    items = sel.cssselect(spider['item_selector']) if sel is not None else []

    fields = spider.get('fields') or []
    data = []
    for item in items:
        row = {}
        for field in fields:
            name = field.get('name')
            # a field may have been saved before its name or query was filled in
            if not name or not field.get('query'):
                continue

            matches = self._select_field(item, field)
            if not matches:
                continue

            first = matches[0]
            if not etree.iselement(first):
                # the xpath evaluated to a text / attribute / scalar value, not a node
                row[name] = str(first) if isinstance(first, str) else first
            elif field.get('extract_type') == ExtractType.TEXT:
                row[name] = first.text
            else:
                attribute = field.get('attribute')
                row[name] = first.get(attribute) if attribute else None
        data.append(row)

    return {
        'status': 'ok',
        'items': data
    }

def _select_field(self, item, field):
    """
    Run a field's query against one item node
    :param item: lxml element the query is evaluated on
    :param field: field definition dict; `type` picks css or xpath, `query` is the selector
    :return: list of matches (elements, or plain values for value-returning xpath)
    """
    if field.get('type') == QueryType.CSS:
        # css selector
        return item.cssselect(field['query'])

    # xpath
    result = item.xpath(field['query'])
    # xpath returns a bare scalar for expressions such as count(...) or string(...)
    return result if isinstance(result, list) else [result]
class SpiderImportApi(Resource):
__doc__ = """

View File

@@ -1,35 +1,121 @@
<template>
<div class="config-list">
<!--preview results-->
<el-dialog :visible.sync="dialogVisible"
:title="$t('Preview Results')"
width="90%"
:before-close="onDialogClose">
<el-table :data="previewCrawlData"
:header-cell-style="{background:'rgb(48, 65, 86)',color:'white'}"
border>
<el-table-column v-for="(f, index) in spiderForm.fields"
:label="f.name"
:key="index"
min-width="100px">
<template slot-scope="scope">
{{scope.row[f.name]}}
</template>
</el-table-column>
</el-table>
</el-dialog>
<!--./preview results-->
<!--crawl settings: crawl type and start URL on the left, list selectors on the right-->
<el-row style="margin-top: 10px;">
  <!--offset is bound with ":" so el-col receives the number 1 rather than the string "1"-->
  <el-col :span="11" :offset="1">
    <el-form label-width="100px">
      <el-form-item :label="$t('Crawl Type')">
        <el-button-group>
          <el-button v-for="type in crawlTypeList"
                     :key="type.value"
                     :type="type.value === spiderForm.crawl_type ? 'primary' : ''"
                     @click="onSelectCrawlType(type.value)">
            {{$t(type.label)}}
          </el-button>
        </el-button-group>
      </el-form-item>
      <el-form-item :label="$t('Start URL')">
        <el-input v-model="spiderForm.start_url" :placeholder="$t('Start URL')"></el-input>
      </el-form-item>
    </el-form>
  </el-col>
  <el-col :span="11" :offset="1">
    <el-form label-width="150px">
      <!--selectors below are only shown for crawl types that include a list page-->
      <el-form-item :label="$t('Item Selector')"
                    v-if="['list','list-detail'].includes(spiderForm.crawl_type)">
        <el-input v-model="spiderForm.item_selector" :placeholder="$t('Item Selector')"></el-input>
      </el-form-item>
      <el-form-item :label="$t('Pagination Selector')"
                    v-if="['list','list-detail'].includes(spiderForm.crawl_type)">
        <el-input v-model="spiderForm.pagination_selector" :placeholder="$t('Pagination Selector')"></el-input>
      </el-form-item>
    </el-form>
  </el-col>
</el-row>
<!--button group-->
<el-row>
<div class="button-group">
<el-button type="primary" @click="addEnv" icon="el-icon-plus">{{$t('Add Environment Variables')}}</el-button>
<el-button type="success" @click="save">{{$t('Save')}}</el-button>
<el-button type="primary" @click="addField" icon="el-icon-plus">{{$t('Add Field')}}</el-button>
<el-button type="warning" @click="onPreview" v-loading="previewLoading">{{$t('Preview')}}</el-button>
<el-button type="success" @click="onSave" v-loading="saveLoading">{{$t('Save')}}</el-button>
</div>
</el-row>
<el-row>
<el-table :data="spiderForm.fields">
<el-table-column :label="$t('Field Name')">
<!--./button group-->
<!--field list-->
<el-row style="margin-top: 10px;">
<el-table :data="spiderForm.fields"
class="table edit"
:header-cell-style="{background:'rgb(48, 65, 86)',color:'white'}"
border>
<el-table-column :label="$t('Field Name')" width="200px">
<template slot-scope="scope">
<el-input v-model="scope.row.name" :placeholder="$t('Variable')"></el-input>
<el-input v-model="scope.row.name" :placeholder="$t('Field Name')"></el-input>
</template>
</el-table-column>
<el-table-column :label="$t('Extract Type')">
<el-table-column :label="$t('Query Type')" width="200px">
<template slot-scope="scope">
<el-input v-model="scope.row.type" :placeholder="$t('Value')"></el-input>
<el-select v-model="scope.row.type" :placeholder="$t('Query Type')">
<el-option value="css" :label="$t('CSS Selector')"></el-option>
<el-option value="xpath" :label="$t('XPath')"></el-option>
</el-select>
</template>
</el-table-column>
<el-table-column :label="$t('Query')">
<el-table-column :label="$t('Query')" width="250px">
<template slot-scope="scope">
<el-input v-model="scope.row.query" :placeholder="$t('Value')"></el-input>
<el-input v-model="scope.row.query" :placeholder="$t('Query')"></el-input>
</template>
</el-table-column>
<el-table-column :label="$t('Action')">
<el-table-column :label="$t('Extract Type')" width="120px">
<template slot-scope="scope">
<el-button size="mini" icon="el-icon-delete" type="danger" @click="deleteEnv(scope.$index)"></el-button>
<el-select v-model="scope.row.extract_type" :placeholder="$t('Extract Type')">
<el-option value="text" :label="$t('Text')"></el-option>
<el-option value="attribute" :label="$t('Attribute')"></el-option>
</el-select>
</template>
</el-table-column>
<el-table-column :label="$t('Attribute')" width="250px">
<template slot-scope="scope">
<template v-if="scope.row.extract_type === 'attribute'">
<el-input v-model="scope.row.attribute"
:placeholder="$t('Attribute')">
</el-input>
</template>
<template v-else>
</template>
</template>
</el-table-column>
<el-table-column :label="$t('Action')" fixed="right">
<template slot-scope="scope">
<div class="action-button-group">
<el-button size="mini" icon="el-icon-delete" type="danger"
@click="deleteField(scope.$index)"></el-button>
</div>
</template>
</el-table-column>
</el-table>
</el-row>
<!--./field list-->
</div>
</template>
@@ -40,14 +126,133 @@ import {
export default {
name: 'ConfigList',
data () {
return {
crawlTypeList: [
{ value: 'list', label: 'List Only' },
{ value: 'detail', label: 'Detail Only' },
{ value: 'list-detail', label: 'List + Detail' }
],
previewLoading: false,
saveLoading: false,
dialogVisible: false
}
},
computed: {
...mapState('spider', [
'spiderForm'
'spiderForm',
'previewCrawlData'
])
},
methods: {
addField () {
this.spiderForm.fields.push({
type: 'css',
extract_type: 'text'
})
},
deleteField (index) {
this.spiderForm.fields.splice(index, 1)
},
// Click handler of the crawl-type buttons: stores the chosen type value on the spider form.
onSelectCrawlType (value) {
this.spiderForm.crawl_type = value
},
onSave () {
return new Promise((resolve, reject) => {
this.saveLoading = true
this.$store.dispatch('spider/updateSpiderFields')
.then(() => {
this.$store.dispatch('spider/editSpider')
.then(() => {
this.$message.success(this.$t('Spider info has been saved successfully'))
resolve()
})
.catch(() => {
this.$message.error(this.$t('Something wrong happened'))
reject(new Error())
})
.finally(() => {
this.saveLoading = false
})
})
.catch(() => {
this.$message.error(this.$t('Something wrong happened'))
this.saveLoading = false
reject(new Error())
})
})
},
// before-close handler of the preview dialog: hides the dialog.
onDialogClose () {
this.dialogVisible = false
},
onPreview () {
this.onSave()
.then(() => {
this.previewLoading = true
this.$store.dispatch('spider/getPreviewCrawlData')
.then(() => {
this.dialogVisible = true
})
.catch(() => {
this.$message.error(this.$t('Something wrong happened'))
})
.finally(() => {
this.previewLoading = false
})
})
}
},
created () {
if (!this.spiderForm.fields) {
this.spiderForm.fields = []
for (let i = 0; i < 3; i++) {
this.spiderForm.fields.push({
name: `field_${i + 1}`,
type: 'css',
extract_type: 'text'
})
}
}
if (!this.spiderForm.crawl_type) this.$set(this.spiderForm, 'crawl_type', 'list')
if (!this.spiderForm.start_url) this.$set(this.spiderForm, 'start_url', 'http://example.com')
}
}
</script>
<style scoped>
/* spacing above every table in this component */
.el-table {
margin-top: 10px;
}
/* Rules below target the editable table (class "edit"). ">>>" is the deep combinator,
   needed because this style block is scoped and the targets are rendered inside
   child components. */
/* remove cell padding so the inputs fill the whole cell */
.el-table.edit >>> .el-table__body td {
padding: 0;
}
.el-table.edit >>> .el-table__body td .cell {
padding: 0;
font-size: 12px;
}
/* underline the input text on hover */
.el-table.edit >>> .el-input__inner:hover {
text-decoration: underline;
}
/* borderless, square inputs with a fixed 36px height */
.el-table.edit >>> .el-input__inner {
height: 36px;
border: none;
border-radius: 0;
font-size: 12px;
}
/* same 36px as the input height above */
.el-table.edit >>> .el-select .el-input .el-select__caret {
line-height: 36px;
}
/* right-align the action buttons above the table */
.button-group {
text-align: right;
}
.action-button-group {
margin-left: 10px;
}
</style>

View File

@@ -54,6 +54,7 @@ export default {
Remove: '删除',
Confirm: '确认',
Stop: '停止',
Preview: '预览',
// 主页
'Total Tasks': '总任务数',
@@ -94,6 +95,7 @@ export default {
'Add Spider': '添加爬虫',
'Add Configurable Spider': '添加可配置爬虫',
'Add Customized Spider': '添加自定义爬虫',
'Add Field': '添加字段',
'Last 7-Day Tasks': '最近7天任务数',
'Last 5-Run Errors': '最近5次运行错误数',
'30-Day Tasks': '最近30天任务数',
@@ -108,6 +110,8 @@ export default {
'Customized Spider': '自定义爬虫',
'Configurable': '可配置',
'Customized': '自定义',
'Text': '文本',
'Attribute': '属性',
// 爬虫列表
'Name': '名称',

View File

@@ -29,7 +29,10 @@ const state = {
nodeStats: [],
// filters
filterSite: ''
filterSite: '',
// preview crawl data
previewCrawlData: []
}
const getters = {}
@@ -61,6 +64,9 @@ const mutations = {
},
SET_FILTER_SITE (state, value) {
state.filterSite = value
},
// Replaces the stored preview crawl data with the given value.
SET_PREVIEW_CRAWL_DATA (state, value) {
state.previewCrawlData = value
}
}
@@ -95,7 +101,12 @@ const actions = {
type: state.spiderForm.type,
lang: state.spiderForm.lang,
col: state.spiderForm.col,
site: state.spiderForm.site
site: state.spiderForm.site,
// configurable spider
crawl_type: state.spiderForm.crawl_type,
start_url: state.spiderForm.start_url,
item_selector: state.spiderForm.item_selector,
pagination_selector: state.spiderForm.pagination_selector
})
.then(() => {
dispatch('getSpiderList')
@@ -112,6 +123,11 @@ const actions = {
envs: JSON.stringify(state.spiderForm.envs)
})
},
updateSpiderFields ({ state }) {
return request.post(`/spiders/${state.spiderForm._id}/update_fields`, {
fields: JSON.stringify(state.spiderForm.fields)
})
},
getSpiderData ({ state, commit }, id) {
return request.get(`/spiders/${id}`)
.then(response => {
@@ -177,6 +193,12 @@ const actions = {
commit('SET_DAILY_STATS', response.data.daily_stats)
commit('SET_NODE_STATS', response.data.task_count_by_node)
})
},
getPreviewCrawlData ({ state, commit }) {
return request.post(`/spiders/${state.spiderForm._id}/preview_crawl`)
.then(response => {
commit('SET_PREVIEW_CRAWL_DATA', response.data.items)
})
}
}