mirror of
https://github.com/crawlab-team/crawlab.git
synced 2026-01-22 17:31:03 +01:00
added configurable spider: add/edit fields, preview results
This commit is contained in:
@@ -16,6 +16,22 @@ class CronEnabled:
|
||||
OFF = 0
|
||||
|
||||
|
||||
class CrawlType:
    """Crawl strategies available to a configurable spider."""

    # crawl a list page only
    LIST = 'list'
    # crawl a detail page only
    DETAIL = 'detail'
    # crawl a list page together with its detail pages
    LIST_DETAIL = 'list-detail'
|
||||
|
||||
|
||||
class QueryType:
    """Selector languages a field query can be written in."""

    # CSS selector
    CSS = 'css'
    # XPath expression
    XPATH = 'xpath'
|
||||
|
||||
|
||||
class ExtractType:
    """What to read from an element matched by a field query."""

    # the text of the element
    TEXT = 'text'
    # the value of one of the element's attributes
    ATTRIBUTE = 'attribute'
|
||||
|
||||
|
||||
# file suffixes listed as ignored
SUFFIX_IGNORE = ['pyc']
|
||||
|
||||
@@ -9,11 +9,12 @@ import requests
|
||||
from bson import ObjectId
|
||||
from flask import current_app, request
|
||||
from flask_restful import reqparse, Resource
|
||||
from lxml import etree
|
||||
from werkzeug.datastructures import FileStorage
|
||||
|
||||
from config import PROJECT_DEPLOY_FILE_FOLDER, PROJECT_SOURCE_FILE_FOLDER, PROJECT_TMP_FOLDER
|
||||
from constants.node import NodeStatus
|
||||
from constants.spider import SpiderType
|
||||
from constants.spider import SpiderType, CrawlType, QueryType, ExtractType
|
||||
from constants.task import TaskStatus
|
||||
from db.manager import db_manager
|
||||
from routes.base import BaseApi
|
||||
@@ -65,6 +66,25 @@ class SpiderApi(BaseApi):
|
||||
|
||||
# spider site
|
||||
('site', str),
|
||||
|
||||
########################
|
||||
# Configurable Spider
|
||||
########################
|
||||
|
||||
# spider crawl fields
|
||||
('fields', str),
|
||||
|
||||
# spider crawl type
|
||||
('crawl_type', str),
|
||||
|
||||
# spider start url
|
||||
('start_url', str),
|
||||
|
||||
# spider item selector
|
||||
('item_selector', str),
|
||||
|
||||
# spider pagination selector
|
||||
('pagination_selector', str),
|
||||
)
|
||||
|
||||
def get(self, id=None, action=None):
|
||||
@@ -394,10 +414,93 @@ class SpiderApi(BaseApi):
|
||||
scheduler.update()
|
||||
|
||||
def update_envs(self, id: str):
    """
    Store the environment variables sent with the request on the spider
    :param id: spider_id
    """
    parsed = self.parser.parse_args()
    # `envs` arrives as a JSON-encoded string and is decoded before it is stored
    db_manager.update_one(
        col_name='spiders',
        id=id,
        values={'envs': json.loads(parsed.envs)}
    )
|
||||
|
||||
def update_fields(self, id: str):
    """
    Store the crawl fields of a configurable spider sent with the request
    :param id: spider_id
    """
    parsed = self.parser.parse_args()
    # `fields` arrives as a JSON-encoded string and is decoded before it is stored
    db_manager.update_one(
        col_name='spiders',
        id=id,
        values={'fields': json.loads(parsed.fields)}
    )
|
||||
|
||||
def preview_crawl(self, id: str):
    """
    Preview the crawl result of a configurable spider
    Fetches the spider's start_url and, for the list crawl type, extracts every
    configured field from each element matched by item_selector.
    :param id: spider_id
    :return: {'status': 'ok', 'items': [...]} on success, otherwise a tuple of
        ({'status': 'ok', 'error': <message>}, <http status code>)
    """
    spider = db_manager.get(col_name='spiders', id=id)

    # NOTE(review): presumably db_manager.get yields None for an unknown id - confirm
    if spider is None:
        return {
            'status': 'ok',
            'error': 'spider %s not found' % id
        }, 404

    if spider.get('type') != SpiderType.CONFIGURABLE:
        return {
            'status': 'ok',
            'error': 'type %s is invalid' % spider.get('type')
        }, 400

    # an empty string is as unusable as a missing value
    if not spider.get('start_url'):
        return {
            'status': 'ok',
            'error': 'start_url should not be empty'
        }, 400

    # NOTE(review): start_url is user-supplied and fetched from the server side
    # (possible SSRF) - consider restricting which hosts may be requested
    try:
        # bounded wait so that an unresponsive site cannot block the request forever
        r = requests.get(spider['start_url'], timeout=30)
    except requests.exceptions.RequestException:
        return {
            'status': 'ok',
            'error': 'connection error'
        }, 500

    if r.status_code != 200:
        # an explicit error code is required here: without it the payload is sent
        # with HTTP 200 and the client handles it as a successful preview
        return {
            'status': 'ok',
            'error': 'status code is not 200, but %s' % r.status_code
        }, 500

    # get html parse tree
    try:
        sel = etree.HTML(r.content)
    except etree.LxmlError:
        sel = None
    if sel is None:
        return {
            'status': 'ok',
            'error': 'response body could not be parsed as html'
        }, 500

    crawl_type = spider.get('crawl_type')
    if crawl_type != CrawlType.LIST:
        # detail and list-detail previews are not implemented; report it explicitly
        # instead of returning an empty response
        return {
            'status': 'ok',
            'error': 'crawl_type %s is not supported' % crawl_type
        }, 400

    if not spider.get('item_selector'):
        return {
            'status': 'ok',
            'error': 'item_selector should not be empty'
        }, 400

    # parse fields
    # TODO: enable xpath
    data = []
    for item in sel.cssselect(spider['item_selector']):
        row = {}
        for f in spider.get('fields') or []:
            if f['type'] == QueryType.CSS:
                # css selector
                res = item.cssselect(f['query'])
            else:
                # xpath
                res = item.xpath(f['query'])

            # xpath may evaluate to a number, boolean or string instead of a node list
            if not isinstance(res, list):
                row[f['name']] = res
                continue
            if not res:
                continue

            first = res[0]
            if isinstance(first, str):
                # xpath queries such as `@href` or `text()` already yield strings
                row[f['name']] = str(first)
            elif f['extract_type'] == ExtractType.TEXT:
                row[f['name']] = first.text
            elif f.get('attribute'):
                row[f['name']] = first.get(f['attribute'])
        data.append(row)

    return {
        'status': 'ok',
        'items': data
    }
|
||||
|
||||
|
||||
class SpiderImportApi(Resource):
|
||||
__doc__ = """
|
||||
|
||||
@@ -1,35 +1,121 @@
|
||||
<template>
|
||||
<div class="config-list">
|
||||
<!--preview results-->
|
||||
<el-dialog :visible.sync="dialogVisible"
|
||||
:title="$t('Preview Results')"
|
||||
width="90%"
|
||||
:before-close="onDialogClose">
|
||||
<el-table :data="previewCrawlData"
|
||||
:header-cell-style="{background:'rgb(48, 65, 86)',color:'white'}"
|
||||
border>
|
||||
<el-table-column v-for="(f, index) in spiderForm.fields"
|
||||
:label="f.name"
|
||||
:key="index"
|
||||
min-width="100px">
|
||||
<template slot-scope="scope">
|
||||
{{scope.row[f.name]}}
|
||||
</template>
|
||||
</el-table-column>
|
||||
</el-table>
|
||||
</el-dialog>
|
||||
<!--./preview results-->
|
||||
|
||||
<el-row style="margin-top: 10px;">
|
||||
<el-col :span="11" offset="1">
|
||||
<el-form label-width="100px">
|
||||
<el-form-item :label="$t('Crawl Type')">
|
||||
<el-button-group>
|
||||
<el-button v-for="type in crawlTypeList"
|
||||
:key="type.value"
|
||||
:type="type.value === spiderForm.crawl_type ? 'primary' : ''"
|
||||
@click="onSelectCrawlType(type.value)">
|
||||
{{$t(type.label)}}
|
||||
</el-button>
|
||||
</el-button-group>
|
||||
</el-form-item>
|
||||
<el-form-item :label="$t('Start URL')">
|
||||
<el-input v-model="spiderForm.start_url" :placeholder="$t('Start URL')"></el-input>
|
||||
</el-form-item>
|
||||
</el-form>
|
||||
</el-col>
|
||||
<el-col :span="11" :offset="1">
|
||||
<el-form label-width="150px">
|
||||
<el-form-item :label="$t('Item Selector')"
|
||||
v-if="['list','list-detail'].includes(spiderForm.crawl_type)">
|
||||
<el-input v-model="spiderForm.item_selector" :placeholder="$t('Item Selector')"></el-input>
|
||||
</el-form-item>
|
||||
<el-form-item :label="$t('Pagination Selector')"
|
||||
v-if="['list','list-detail'].includes(spiderForm.crawl_type)">
|
||||
<el-input v-model="spiderForm.pagination_selector" :placeholder="$t('Pagination Selector')"></el-input>
|
||||
</el-form-item>
|
||||
</el-form>
|
||||
</el-col>
|
||||
</el-row>
|
||||
|
||||
<!--button group-->
|
||||
<el-row>
|
||||
<div class="button-group">
|
||||
<el-button type="primary" @click="addEnv" icon="el-icon-plus">{{$t('Add Environment Variables')}}</el-button>
|
||||
<el-button type="success" @click="save">{{$t('Save')}}</el-button>
|
||||
<el-button type="primary" @click="addField" icon="el-icon-plus">{{$t('Add Field')}}</el-button>
|
||||
<el-button type="warning" @click="onPreview" v-loading="previewLoading">{{$t('Preview')}}</el-button>
|
||||
<el-button type="success" @click="onSave" v-loading="saveLoading">{{$t('Save')}}</el-button>
|
||||
</div>
|
||||
</el-row>
|
||||
<el-row>
|
||||
<el-table :data="spiderForm.fields">
|
||||
<el-table-column :label="$t('Field Name')">
|
||||
<!--./button group-->
|
||||
|
||||
<!--field list-->
|
||||
<el-row style="margin-top: 10px;">
|
||||
<el-table :data="spiderForm.fields"
|
||||
class="table edit"
|
||||
:header-cell-style="{background:'rgb(48, 65, 86)',color:'white'}"
|
||||
border>
|
||||
<el-table-column :label="$t('Field Name')" width="200px">
|
||||
<template slot-scope="scope">
|
||||
<el-input v-model="scope.row.name" :placeholder="$t('Variable')"></el-input>
|
||||
<el-input v-model="scope.row.name" :placeholder="$t('Field Name')"></el-input>
|
||||
</template>
|
||||
</el-table-column>
|
||||
<el-table-column :label="$t('Extract Type')">
|
||||
<el-table-column :label="$t('Query Type')" width="200px">
|
||||
<template slot-scope="scope">
|
||||
<el-input v-model="scope.row.type" :placeholder="$t('Value')"></el-input>
|
||||
<el-select v-model="scope.row.type" :placeholder="$t('Query Type')">
|
||||
<el-option value="css" :label="$t('CSS Selector')"></el-option>
|
||||
<el-option value="xpath" :label="$t('XPath')"></el-option>
|
||||
</el-select>
|
||||
</template>
|
||||
</el-table-column>
|
||||
<el-table-column :label="$t('Query')">
|
||||
<el-table-column :label="$t('Query')" width="250px">
|
||||
<template slot-scope="scope">
|
||||
<el-input v-model="scope.row.query" :placeholder="$t('Value')"></el-input>
|
||||
<el-input v-model="scope.row.query" :placeholder="$t('Query')"></el-input>
|
||||
</template>
|
||||
</el-table-column>
|
||||
<el-table-column :label="$t('Action')">
|
||||
<el-table-column :label="$t('Extract Type')" width="120px">
|
||||
<template slot-scope="scope">
|
||||
<el-button size="mini" icon="el-icon-delete" type="danger" @click="deleteEnv(scope.$index)"></el-button>
|
||||
<el-select v-model="scope.row.extract_type" :placeholder="$t('Extract Type')">
|
||||
<el-option value="text" :label="$t('Text')"></el-option>
|
||||
<el-option value="attribute" :label="$t('Attribute')"></el-option>
|
||||
</el-select>
|
||||
</template>
|
||||
</el-table-column>
|
||||
<el-table-column :label="$t('Attribute')" width="250px">
|
||||
<template slot-scope="scope">
|
||||
<template v-if="scope.row.extract_type === 'attribute'">
|
||||
<el-input v-model="scope.row.attribute"
|
||||
:placeholder="$t('Attribute')">
|
||||
</el-input>
|
||||
</template>
|
||||
<template v-else>
|
||||
</template>
|
||||
</template>
|
||||
</el-table-column>
|
||||
<el-table-column :label="$t('Action')" fixed="right">
|
||||
<template slot-scope="scope">
|
||||
<div class="action-button-group">
|
||||
<el-button size="mini" icon="el-icon-delete" type="danger"
|
||||
@click="deleteField(scope.$index)"></el-button>
|
||||
</div>
|
||||
</template>
|
||||
</el-table-column>
|
||||
</el-table>
|
||||
</el-row>
|
||||
<!--./field list-->
|
||||
</div>
|
||||
</template>
|
||||
|
||||
@@ -40,14 +126,133 @@ import {
|
||||
|
||||
export default {
|
||||
name: 'ConfigList',
|
||||
data () {
|
||||
return {
|
||||
crawlTypeList: [
|
||||
{ value: 'list', label: 'List Only' },
|
||||
{ value: 'detail', label: 'Detail Only' },
|
||||
{ value: 'list-detail', label: 'List + Detail' }
|
||||
],
|
||||
previewLoading: false,
|
||||
saveLoading: false,
|
||||
dialogVisible: false
|
||||
}
|
||||
},
|
||||
computed: {
|
||||
...mapState('spider', [
|
||||
'spiderForm'
|
||||
'spiderForm',
|
||||
'previewCrawlData'
|
||||
])
|
||||
},
|
||||
methods: {
|
||||
addField () {
|
||||
this.spiderForm.fields.push({
|
||||
type: 'css',
|
||||
extract_type: 'text'
|
||||
})
|
||||
},
|
||||
deleteField (index) {
|
||||
this.spiderForm.fields.splice(index, 1)
|
||||
},
|
||||
// Store the crawl type chosen in the button group on the spider form.
onSelectCrawlType (value) {
  this.spiderForm.crawl_type = value
},
|
||||
onSave () {
|
||||
return new Promise((resolve, reject) => {
|
||||
this.saveLoading = true
|
||||
this.$store.dispatch('spider/updateSpiderFields')
|
||||
.then(() => {
|
||||
this.$store.dispatch('spider/editSpider')
|
||||
.then(() => {
|
||||
this.$message.success(this.$t('Spider info has been saved successfully'))
|
||||
resolve()
|
||||
})
|
||||
.catch(() => {
|
||||
this.$message.error(this.$t('Something wrong happened'))
|
||||
reject(new Error())
|
||||
})
|
||||
.finally(() => {
|
||||
this.saveLoading = false
|
||||
})
|
||||
})
|
||||
.catch(() => {
|
||||
this.$message.error(this.$t('Something wrong happened'))
|
||||
this.saveLoading = false
|
||||
reject(new Error())
|
||||
})
|
||||
})
|
||||
},
|
||||
// Hide the preview results dialog.
onDialogClose () {
  this.dialogVisible = false
},
|
||||
onPreview () {
|
||||
this.onSave()
|
||||
.then(() => {
|
||||
this.previewLoading = true
|
||||
this.$store.dispatch('spider/getPreviewCrawlData')
|
||||
.then(() => {
|
||||
this.dialogVisible = true
|
||||
})
|
||||
.catch(() => {
|
||||
this.$message.error(this.$t('Something wrong happened'))
|
||||
})
|
||||
.finally(() => {
|
||||
this.previewLoading = false
|
||||
})
|
||||
})
|
||||
}
|
||||
},
|
||||
created () {
|
||||
if (!this.spiderForm.fields) {
|
||||
this.spiderForm.fields = []
|
||||
for (let i = 0; i < 3; i++) {
|
||||
this.spiderForm.fields.push({
|
||||
name: `field_${i + 1}`,
|
||||
type: 'css',
|
||||
extract_type: 'text'
|
||||
})
|
||||
}
|
||||
}
|
||||
if (!this.spiderForm.crawl_type) this.$set(this.spiderForm, 'crawl_type', 'list')
|
||||
if (!this.spiderForm.start_url) this.$set(this.spiderForm, 'start_url', 'http://example.com')
|
||||
}
|
||||
}
|
||||
</script>
|
||||
|
||||
<style scoped>
|
||||
.el-table {
|
||||
margin-top: 10px;
|
||||
}
|
||||
|
||||
.el-table.edit >>> .el-table__body td {
|
||||
padding: 0;
|
||||
}
|
||||
|
||||
.el-table.edit >>> .el-table__body td .cell {
|
||||
padding: 0;
|
||||
font-size: 12px;
|
||||
}
|
||||
|
||||
.el-table.edit >>> .el-input__inner:hover {
|
||||
text-decoration: underline;
|
||||
}
|
||||
|
||||
.el-table.edit >>> .el-input__inner {
|
||||
height: 36px;
|
||||
border: none;
|
||||
border-radius: 0;
|
||||
font-size: 12px;
|
||||
}
|
||||
|
||||
.el-table.edit >>> .el-select .el-input .el-select__caret {
|
||||
line-height: 36px;
|
||||
}
|
||||
|
||||
.button-group {
|
||||
text-align: right;
|
||||
}
|
||||
|
||||
.action-button-group {
|
||||
margin-left: 10px;
|
||||
}
|
||||
</style>
|
||||
|
||||
@@ -54,6 +54,7 @@ export default {
|
||||
Remove: '删除',
|
||||
Confirm: '确认',
|
||||
Stop: '停止',
|
||||
Preview: '预览',
|
||||
|
||||
// 主页
|
||||
'Total Tasks': '总任务数',
|
||||
@@ -94,6 +95,7 @@ export default {
|
||||
'Add Spider': '添加爬虫',
|
||||
'Add Configurable Spider': '添加可配置爬虫',
|
||||
'Add Customized Spider': '添加自定义爬虫',
|
||||
'Add Field': '添加字段',
|
||||
'Last 7-Day Tasks': '最近7天任务数',
|
||||
'Last 5-Run Errors': '最近5次运行错误数',
|
||||
'30-Day Tasks': '最近30天任务数',
|
||||
@@ -108,6 +110,8 @@ export default {
|
||||
'Customized Spider': '自定义爬虫',
|
||||
'Configurable': '可配置',
|
||||
'Customized': '自定义',
|
||||
'Text': '文本',
|
||||
'Attribute': '属性',
|
||||
|
||||
// 爬虫列表
|
||||
'Name': '名称',
|
||||
|
||||
@@ -29,7 +29,10 @@ const state = {
|
||||
nodeStats: [],
|
||||
|
||||
// filters
|
||||
filterSite: ''
|
||||
filterSite: '',
|
||||
|
||||
// preview crawl data
|
||||
previewCrawlData: []
|
||||
}
|
||||
|
||||
const getters = {}
|
||||
@@ -61,6 +64,9 @@ const mutations = {
|
||||
},
|
||||
// Replace the site filter value kept in the store.
SET_FILTER_SITE (state, value) {
  state.filterSite = value
},
|
||||
// Replace the preview crawl rows kept in the store.
SET_PREVIEW_CRAWL_DATA (state, value) {
  state.previewCrawlData = value
}
|
||||
}
|
||||
|
||||
@@ -95,7 +101,12 @@ const actions = {
|
||||
type: state.spiderForm.type,
|
||||
lang: state.spiderForm.lang,
|
||||
col: state.spiderForm.col,
|
||||
site: state.spiderForm.site
|
||||
site: state.spiderForm.site,
|
||||
// configurable spider
|
||||
crawl_type: state.spiderForm.crawl_type,
|
||||
start_url: state.spiderForm.start_url,
|
||||
item_selector: state.spiderForm.item_selector,
|
||||
pagination_selector: state.spiderForm.pagination_selector
|
||||
})
|
||||
.then(() => {
|
||||
dispatch('getSpiderList')
|
||||
@@ -112,6 +123,11 @@ const actions = {
|
||||
envs: JSON.stringify(state.spiderForm.envs)
|
||||
})
|
||||
},
|
||||
updateSpiderFields ({ state }) {
|
||||
return request.post(`/spiders/${state.spiderForm._id}/update_fields`, {
|
||||
fields: JSON.stringify(state.spiderForm.fields)
|
||||
})
|
||||
},
|
||||
getSpiderData ({ state, commit }, id) {
|
||||
return request.get(`/spiders/${id}`)
|
||||
.then(response => {
|
||||
@@ -177,6 +193,12 @@ const actions = {
|
||||
commit('SET_DAILY_STATS', response.data.daily_stats)
|
||||
commit('SET_NODE_STATS', response.data.task_count_by_node)
|
||||
})
|
||||
},
|
||||
getPreviewCrawlData ({ state, commit }) {
|
||||
return request.post(`/spiders/${state.spiderForm._id}/preview_crawl`)
|
||||
.then(response => {
|
||||
commit('SET_PREVIEW_CRAWL_DATA', response.data.items)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user