准备可配置爬虫自定义设置变量

This commit is contained in:
marvzhang
2019-11-29 13:42:50 +08:00
parent d7cbaac94f
commit 87546f0c88
7 changed files with 402 additions and 16 deletions

View File

@@ -1,12 +1,12 @@
package entity
type Field struct {
Name string `yaml:"name" json:"name"`
Css string `yaml:"css" json:"css"`
Xpath string `yaml:"xpath" json:"xpath"`
Attr string `yaml:"attr" json:"attr"`
NextStage string `yaml:"next_stage" json:"next_stage"`
Remark string `yaml:"remark" json:"remark"`
type ConfigSpiderData struct {
Version string `yaml:"version" json:"version"`
Engine string `yaml:"engine" json:"engine"`
StartUrl string `yaml:"start_url" json:"start_url"`
StartStage string `yaml:"start_stage" json:"start_stage"`
Stages map[string]Stage `yaml:"stages" json:"stages"`
Settings map[string]string `yaml:"settings" json:"settings"`
}
type Stage struct {
@@ -20,10 +20,11 @@ type Stage struct {
Fields []Field `yaml:"fields" json:"fields"`
}
type ConfigSpiderData struct {
Version string `yaml:"version" json:"version"`
Engine string `yaml:"engine" json:"engine"`
StartUrl string `yaml:"start_url" json:"start_url"`
StartStage string `yaml:"start_stage" json:"start_stage"`
Stages map[string]Stage `yaml:"stages" json:"stages"`
type Field struct {
Name string `yaml:"name" json:"name"`
Css string `yaml:"css" json:"css"`
Xpath string `yaml:"xpath" json:"xpath"`
Attr string `yaml:"attr" json:"attr"`
NextStage string `yaml:"next_stage" json:"next_stage"`
Remark string `yaml:"remark" json:"remark"`
}

View File

@@ -205,8 +205,8 @@ func PostConfigSpiderSpiderfile(c *gin.Context) {
return
}
// 根据序列化后的数据处理爬虫文件
if err := services.ProcessSpiderFilesFromConfigData(spider, configData); err != nil {
// 校验configData
if err := services.ValidateSpiderfile(configData); err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
@@ -217,6 +217,12 @@ func PostConfigSpiderSpiderfile(c *gin.Context) {
return
}
// 根据序列化后的数据处理爬虫文件
if err := services.ProcessSpiderFilesFromConfigData(spider, configData); err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
c.JSON(http.StatusOK, Response{
Status: "ok",
Message: "success",

View File

@@ -219,12 +219,18 @@ func ExecuteShellCmd(cmdStr string, cwd string, t model.Task, s model.Spider) (e
// 环境变量配置
envs := s.Envs
if s.Type == constants.Configurable {
// 数据库配置
envs = append(envs, model.Env{Name: "CRAWLAB_MONGO_HOST", Value: viper.GetString("mongo.host")})
envs = append(envs, model.Env{Name: "CRAWLAB_MONGO_PORT", Value: viper.GetString("mongo.port")})
envs = append(envs, model.Env{Name: "CRAWLAB_MONGO_DB", Value: viper.GetString("mongo.db")})
envs = append(envs, model.Env{Name: "CRAWLAB_MONGO_USERNAME", Value: viper.GetString("mongo.username")})
envs = append(envs, model.Env{Name: "CRAWLAB_MONGO_PASSWORD", Value: viper.GetString("mongo.password")})
envs = append(envs, model.Env{Name: "CRAWLAB_MONGO_AUTHSOURCE", Value: viper.GetString("mongo.authSource")})
// 设置配置
for envName, envValue := range s.Config.Settings {
envs = append(envs, model.Env{Name: "CRAWLAB_SETTING_" + envName, Value: envValue})
}
}
cmd = SetEnv(cmd, envs, t.Id, s.Col)

View File

@@ -23,3 +23,5 @@ stages:
fields:
- name: "description"
css: "#product_description + p"
settings:
ROBOTSTXT_OBEY: true

View File

@@ -9,7 +9,7 @@
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'config_spider'
BOT_NAME = 'Crawlab Configurable Spider'
SPIDER_MODULES = ['config_spider.spiders']
NEWSPIDER_MODULE = 'config_spider.spiders'
@@ -88,3 +88,9 @@ ITEM_PIPELINES = {
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# Apply custom settings handed over through the environment: every variable
# named CRAWLAB_SETTING_<NAME> becomes the module-level setting <NAME>.
# Values are kept as the raw strings found in the environment.
setting_env_prefix = 'CRAWLAB_SETTING_'
for setting_env_name, setting_value in os.environ.items():
    if not setting_env_name.startswith(setting_env_prefix):
        continue
    # Strip only the leading prefix; str.replace would also remove any later
    # occurrence of the prefix inside the setting name.
    setting_name = setting_env_name[len(setting_env_prefix):]
    if not setting_name:
        # A bare "CRAWLAB_SETTING_" variable carries no setting name.
        continue
    # globals() is the module namespace and, unlike locals(), is documented
    # as safe to modify.
    globals()[setting_name] = setting_value

View File

@@ -182,6 +182,20 @@
</el-tab-pane>
<!--./Graph-->
<!--Setting-->
<el-tab-pane name="setting" :label="$t('Setting')">
<setting-fields-table-view
type="list"
:fields="spiderForm.settings"
/>
<fields-table-view
type="list"
title="List Page Fields"
:fields="stage.fields"
/>
</el-tab-pane>
<!--./Setting-->
<!--Spiderfile-->
<el-tab-pane name="spiderfile" label="Spiderfile">
<div class="spiderfile-actions">
@@ -209,10 +223,12 @@ import CrawlConfirmDialog from '../Common/CrawlConfirmDialog'
import 'codemirror/lib/codemirror.js'
import 'codemirror/mode/yaml/yaml.js'
import FileDetail from '../File/FileDetail'
import SettingFieldsTableView from '../TableView/SettingFieldsTableView'
export default {
name: 'ConfigList',
components: {
SettingFieldsTableView,
FileDetail,
CrawlConfirmDialog,
FieldsTableView

View File

@@ -0,0 +1,349 @@
<template>
<div class="setting-fields-table-view">
<!-- <el-row class="button-group-container">-->
<!-- <label class="title">{{$t(this.title)}}</label>-->
<!-- <div class="button-group">-->
<!-- <el-button type="primary" size="small" @click="addField" icon="el-icon-plus">{{$t('Add Field')}}</el-button>-->
<!-- </div>-->
<!-- </el-row>-->
<el-row>
<el-table :data="fields"
class="table edit"
:header-cell-style="{background:'rgb(48, 65, 86)',color:'white'}"
:cell-style="getCellClassStyle"
>
<el-table-column class-name="action" width="80px" align="right">
<template slot-scope="scope">
<i class="action-item el-icon-copy-document" @click="onCopyField(scope.row)"></i>
<i class="action-item el-icon-remove-outline" @click="onRemoveField(scope.row)"></i>
<i class="action-item el-icon-circle-plus-outline" @click="onAddField(scope.row)"></i>
</template>
</el-table-column>
<el-table-column :label="$t('Field Name')" width="150px">
<template slot-scope="scope">
<el-input v-model="scope.row.name"
:placeholder="$t('Field Name')"
@change="onNameChange(scope.row)"
/>
</template>
</el-table-column>
<el-table-column :label="$t('Selector Type')" width="150px" align="center" class-name="selector-type">
<template slot-scope="scope">
<span class="button-selector-item" @click="onClickSelectorType(scope.row, 'css')">
<el-tag
:class="scope.row.css ? 'active' : 'inactive'"
type="success"
>
CSS
</el-tag>
</span>
<span class="button-selector-item" @click="onClickSelectorType(scope.row, 'xpath')">
<el-tag
:class="scope.row.xpath ? 'active' : 'inactive'"
type="primary"
>
XPath
</el-tag>
</span>
</template>
</el-table-column>
<el-table-column :label="$t('Selector')" width="200px">
<template slot-scope="scope">
<template v-if="scope.row.css">
<el-input v-model="scope.row.css" :placeholder="$t('CSS / XPath')"></el-input>
</template>
<template v-else>
<el-input v-model="scope.row.xpath" :placeholder="$t('CSS / XPath')"></el-input>
</template>
</template>
</el-table-column>
<el-table-column :label="$t('Is Attribute')" width="150px" align="center">
<template slot-scope="scope">
<span class="button-selector-item" @click="onClickIsAttribute(scope.row, false)">
<el-tag
:class="!scope.row.attr ? 'active' : 'inactive'"
type="success"
>
{{$t('Text')}}
</el-tag>
</span>
<span class="button-selector-item" @click="onClickIsAttribute(scope.row, true)">
<el-tag
:class="scope.row.attr ? 'active' : 'inactive'"
type="primary"
>
{{$t('Attribute')}}
</el-tag>
</span>
</template>
</el-table-column>
<el-table-column :label="$t('Attribute')" width="200px">
<template slot-scope="scope">
<template v-if="scope.row.attr">
<el-input v-model="scope.row.attr" :placeholder="$t('Attribute')"/>
</template>
<template v-else>
<span style="margin-left: 15px; color: lightgrey">
N/A
</span>
</template>
</template>
</el-table-column>
<el-table-column :label="$t('Next Stage')" width="250px">
<template slot-scope="scope">
<el-select
v-model="scope.row.next_stage"
:class="!scope.row.next_stage ? 'disabled' : ''"
@change="onChangeNextStage(scope.row)"
>
<el-option :label="$t('No Next Stage')" value=""/>
<el-option v-for="n in stageNames" :key="n" :label="n" :value="n"/>
</el-select>
</template>
</el-table-column>
<el-table-column :label="$t('Remark')" width="auto" min-width="120px">
<template slot-scope="scope">
<el-input v-model="scope.row.remark" :placeholder="$t('Remark')"/>
</template>
</el-table-column>
</el-table>
</el-row>
</div>
</template>
<script>
import {
mapState
} from 'vuex'
export default {
name: 'SettingFieldsTableView',
props: {
type: {
type: String,
default: 'list'
},
title: {
type: String,
default: ''
},
stageNames: {
type: Array,
default () {
return []
}
},
fields: {
type: Array,
default () {
return []
}
}
},
computed: {
...mapState('spider', [
'spiderForm'
])
},
methods: {
addField () {
this.fields.push({
type: 'css',
extract_type: 'text'
})
this.$st.sendEv('爬虫详情-配置', '添加字段')
},
deleteField (index) {
this.fields.splice(index, 1)
this.$st.sendEv('爬虫详情-配置', '删除字段')
},
onNameChange (row) {
if (this.fields.filter(d => d.name === row.name).length > 1) {
this.$message.error(this.$t(`Duplicated field names for ${row.name}`))
}
this.$st.sendEv('爬虫详情-配置', '更改字段')
},
onCheck (row) {
this.fields.forEach(d => {
if (row.name !== d.name) {
this.$set(d, 'is_detail', false)
}
})
this.$st.sendEv('爬虫详情-配置', '设置详情页URL')
},
onClickSelectorType (row, selectorType) {
if (selectorType === 'css') {
if (row.xpath) this.$set(row, 'xpath', '')
if (!row.css) this.$set(row, 'css', 'body')
} else {
if (row.css) this.$set(row, 'css', '')
if (!row.xpath) this.$set(row, 'xpath', '//body')
}
},
onClickIsAttribute (row, isAttribute) {
if (!isAttribute) {
// 文本
if (row.attr) this.$set(row, 'attr', '')
} else {
// 属性
if (!row.attr) this.$set(row, 'attr', 'href')
}
},
onCopyField (row) {
for (let i = 0; i < this.fields.length; i++) {
if (row.name === this.fields[i].name) {
this.fields.splice(i, 0, JSON.parse(JSON.stringify(row)))
break
}
}
},
onRemoveField (row) {
for (let i = 0; i < this.fields.length; i++) {
if (row.name === this.fields[i].name) {
this.fields.splice(i, 1)
break
}
}
if (this.fields.length === 0) {
this.fields.push({
css: 'body',
next_stage: ''
})
}
},
onAddField (row) {
for (let i = 0; i < this.fields.length; i++) {
if (row.name === this.fields[i].name) {
this.fields.splice(i + 1, 0, {
name: `field_${Math.floor(new Date().getTime()).toString()}`,
css: 'body',
next_stage: ''
})
break
}
}
},
getCellClassStyle ({ row, columnIndex }) {
if (columnIndex === 1) {
// 字段名称
if (!row.name) {
return {
'border': '1px solid red'
}
}
} else if (columnIndex === 3) {
// 选择器
if (!row.css && !row.xpath) {
return {
'border': '1px solid red'
}
}
}
},
onChangeNextStage (row) {
this.fields.forEach(f => {
if (f.name !== row.name) {
this.$set(f, 'next_stage', '')
}
})
}
}
}
</script>
<style scoped>
.el-table.edit >>> .el-table__body td {
padding: 0;
}
.el-table.edit >>> .el-table__body td .cell {
padding: 0;
font-size: 12px;
}
.el-table.edit >>> .el-input__inner:hover {
text-decoration: underline;
}
.el-table.edit >>> .el-input__inner {
height: 36px;
border: none;
border-radius: 0;
font-size: 12px;
}
.el-table.edit >>> .el-select .el-input .el-select__caret {
line-height: 36px;
}
.el-table.edit >>> .button-selector-item {
cursor: pointer;
margin: 0 5px;
}
.el-table.edit >>> .el-tag.inactive {
opacity: 0.5;
}
.el-table.edit >>> .action {
background: none !important;
border: none;
}
.el-table.edit >>> tr {
border: none;
}
.el-table.edit >>> tr th {
border-right: 1px solid rgb(220, 223, 230);
}
.el-table.edit >>> tr td:nth-child(2) {
border-left: 1px solid rgb(220, 223, 230);
}
.el-table.edit >>> tr td {
border-right: 1px solid rgb(220, 223, 230);
}
.el-table.edit::before {
background: none;
}
.el-table.edit >>> .action-item {
font-size: 14px;
margin-right: 5px;
cursor: pointer;
}
.el-table.edit >>> .action-item:last-child {
margin-right: 10px;
}
.button-group-container {
/*display: inline-block;*/
/*width: 100%;*/
}
.button-group-container .title {
float: left;
line-height: 32px;
}
.button-group-container .button-group {
float: right;
}
.action-button-group {
display: flex;
margin-left: 10px;
}
.action-button-group >>> .el-checkbox__label {
font-size: 12px;
}
.el-table.edit >>> .el-select.disabled .el-input__inner {
color: lightgrey;
}
</style>