加入Spiderfile模版

This commit is contained in:
marvzhang
2019-12-03 13:37:41 +08:00
parent 9e849695d1
commit 98cbccb81e
13 changed files with 177 additions and 52 deletions

View File

@@ -146,6 +146,7 @@ func main() {
authGroup.POST("/config_spiders/:id", routes.PostConfigSpider) // 修改可配置爬虫
authGroup.POST("/config_spiders/:id/upload", routes.UploadConfigSpider) // 上传可配置爬虫
authGroup.POST("/config_spiders/:id/spiderfile", routes.PostConfigSpiderSpiderfile) // 上传可配置爬虫
authGroup.GET("/config_spiders_templates", routes.GetConfigSpiderTemplateList) // 获取可配置爬虫模版列表
// 任务
authGroup.GET("/tasks", routes.GetTaskList) // 任务列表
authGroup.GET("/tasks/:id", routes.GetTask) // 任务详情

View File

@@ -36,6 +36,9 @@ type Spider struct {
// 自定义爬虫
Cmd string `json:"cmd" bson:"cmd"` // 执行命令
// 可配置爬虫
Template string `json:"template" bson:"template"` // Spiderfile模版
// 前端展示
LastRunTs time.Time `json:"last_run_ts"` // 最后一次执行时间
LastStatus string `json:"last_status"` // 最后执行状态

View File

@@ -16,6 +16,7 @@ import (
"net/http"
"os"
"path/filepath"
"strings"
)
// 添加可配置爬虫
@@ -32,6 +33,12 @@ func PutConfigSpider(c *gin.Context) {
return
}
// 模版名不能为空
if spider.Template == "" {
HandleErrorF(http.StatusBadRequest, c, "spider template should not be empty")
return
}
// 判断爬虫是否存在
if spider := model.GetSpiderByName(spider.Name); spider != nil {
HandleErrorF(http.StatusBadRequest, c, fmt.Sprintf("spider for '%s' already exists", spider.Name))
@@ -59,7 +66,7 @@ func PutConfigSpider(c *gin.Context) {
spider.Src = spiderDir
// 复制Spiderfile模版
contentByte, err := ioutil.ReadFile("./template/Spiderfile")
contentByte, err := ioutil.ReadFile("./template/spiderfile/Spiderfile." + spider.Template)
if err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
@@ -113,8 +120,8 @@ func UploadConfigSpider(c *gin.Context) {
// 文件名称必须为Spiderfile
filename := header.Filename
if filename != "Spiderfile" {
HandleErrorF(http.StatusBadRequest, c, "filename must be 'Spiderfile'")
if filename != "Spiderfile" && filename != "Spiderfile.yaml" && filename != "Spiderfile.yml" {
HandleErrorF(http.StatusBadRequest, c, "filename must be 'Spiderfile(.yaml|.yml)'")
return
}
@@ -286,3 +293,18 @@ func GetConfigSpiderConfig(c *gin.Context) {
Data: spider.Config,
})
}
// GetConfigSpiderTemplateList responds with the names of the available
// Spiderfile templates.
//
// Every entry returned by utils.ListDir("./template/spiderfile") is reported
// with its leading "Spiderfile." prefix removed, so that a returned name can
// be appended to "./template/spiderfile/Spiderfile." to locate the file again.
// The response is always HTTP 200 with Status "ok".
func GetConfigSpiderTemplateList(c *gin.Context) {
	const templatePrefix = "Spiderfile."

	// Use an empty, non-nil slice: a nil slice is serialized as JSON null,
	// which clients that index into the list cannot handle.
	data := make([]string, 0)
	for _, fInfo := range utils.ListDir("./template/spiderfile") {
		// Strip only the leading prefix; removing every occurrence would
		// yield a name that no longer maps back to the template file.
		data = append(data, strings.TrimPrefix(fInfo.Name(), templatePrefix))
	}

	c.JSON(http.StatusOK, Response{
		Status:  "ok",
		Message: "success",
		Data:    data,
	})
}

View File

@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
import os
import re
import json
# Scrapy settings for config_spider project
#
@@ -100,6 +101,10 @@ for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_S
setting_value = False
elif re.search(r'^\d+$', setting_value) is not None:
setting_value = int(setting_value)
elif re.search(r'^\{.*\}$', setting_value.strip()) is not None:
setting_value = json.loads(setting_value)
elif re.search(r'^\[.*\]$', setting_value.strip()) is not None:
setting_value = json.loads(setting_value)
else:
pass
locals()[setting_name] = setting_value

View File

@@ -0,0 +1,20 @@
# Spiderfile template: single "list" stage crawling the news.163.com ranking page
# with the scrapy engine.
version: "0.4.0"
# NOTE(review): the name is "toscrapy_books" although start_url points at
# news.163.com — looks copied from another template; confirm.
name: "toscrapy_books"
start_url: "http://news.163.com/special/0001386F/rank_news.html"
start_stage: "list"
engine: "scrapy"
stages:
list:
is_list: true
# Every table row except the first one.
list_css: "table tr:not(:first-child)"
fields:
# "title" and "url" use the same anchor; "url" reads its href attribute.
- name: "title"
css: "td:nth-child(1) > a"
- name: "url"
css: "td:nth-child(1) > a"
attr: "href"
- name: "clicks"
css: "td.cBlue"
settings:
# robots.txt is not obeyed and a desktop Chrome user agent string is sent.
ROBOTSTXT_OBEY: false
USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36

View File

@@ -0,0 +1,22 @@
# Spiderfile template: single "list" stage crawling a Baidu search result page
# (query "crawlab") with the scrapy engine.
version: "0.4.0"
# NOTE(review): the name is "toscrapy_books" although start_url points at
# baidu.com — looks copied from another template; confirm.
name: "toscrapy_books"
start_url: "http://www.baidu.com/s?wd=crawlab"
start_stage: "list"
engine: "scrapy"
stages:
list:
is_list: true
list_css: ".result.c-container"
# presumably the "next page" link and the attribute holding its URL — confirm
page_css: "#page a.n:last-child"
page_attr: "href"
fields:
# "title" and "url" use the same anchor; "url" reads its href attribute.
- name: "title"
css: "h3 > a"
- name: "url"
css: "h3 > a"
attr: "href"
- name: "abstract"
css: ".c-abstract"
settings:
# robots.txt is not obeyed and a desktop Chrome user agent string is sent.
ROBOTSTXT_OBEY: false
USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36

View File

@@ -5,10 +5,10 @@ start_stage: "list"
engine: "scrapy"
stages:
list:
is_list: true # default: false
is_list: true
list_css: "section article.product_pod"
page_css: "ul.pager li.next a"
page_attr: "href" # default: href
page_attr: "href"
fields:
- name: "title"
css: "h3 > a"
@@ -25,3 +25,4 @@ stages:
css: "#product_description + p"
settings:
ROBOTSTXT_OBEY: true
AUTOTHROTTLE_ENABLED: true

View File

@@ -171,25 +171,27 @@
<!--list-->
<li class="stage-item" style="min-width: 240px">
<label>{{$t('Is List')}}: </label>
<label>{{$t('List')}}: </label>
<el-checkbox
style="text-align: left; flex-basis: 20px; margin-right: 5px"
:value="isList(stage)"
@change="onCheckIsList($event, stage)"
/>
<el-popover v-model="stage.isListOpen" v-if="isList(stage)" placement="top">
<div>
<el-tag :class="stage.list_css ? 'active' : 'inactive'" type="success"
@click="onSelectStageListType(stage, 'css')">CSS
</el-tag>
<el-tag :class="!stage.list_css ? 'active' : 'inactive'" type="primary"
@click="onSelectStageListType(stage, 'xpath')">XPath
</el-tag>
</div>
<div class="list-selector" style="margin-top: 5px; width: 240px">
<el-input v-if="stage.list_css" v-model="stage.list_css"/>
<el-input v-else v-model="stage.list_xpath"/>
</div>
<el-popover v-model="stage.isListOpen" v-if="isList(stage)" placement="top" width="360">
<el-form label-width="120px">
<el-form-item :label="$t('Selector Type')">
<el-tag :class="stage.list_css ? 'active' : 'inactive'" type="success"
@click="onSelectStageListType(stage, 'css')">CSS
</el-tag>
<el-tag :class="!stage.list_css ? 'active' : 'inactive'" type="primary"
@click="onSelectStageListType(stage, 'xpath')">XPath
</el-tag>
</el-form-item>
<el-form-item :label="$t('Selector')" class="list-selector">
<el-input v-if="stage.list_css" v-model="stage.list_css"/>
<el-input v-else v-model="stage.list_xpath"/>
</el-form-item>
</el-form>
<el-tag
v-if="stage.list_css"
type="success"
@@ -223,19 +225,21 @@
@change="onCheckIsPage($event, stage)"
:disabled="!isList(stage)"
/>
<el-popover v-model="stage.isPageOpen" v-if="isPage(stage)" placement="top">
<div>
<el-tag :class="stage.page_css ? 'active' : 'inactive'" type="success"
@click="onSelectStagePageType(stage, 'css')">CSS
</el-tag>
<el-tag :class="!stage.page_css ? 'active' : 'inactive'" type="primary"
@click="onSelectStagePageType(stage, 'xpath')">XPath
</el-tag>
</div>
<div class="page-selector" style="margin-top: 5px; width: 240px">
<el-input v-if="stage.page_css" v-model="stage.page_css"/>
<el-input v-else v-model="stage.page_xpath"/>
</div>
<el-popover v-model="stage.isPageOpen" v-if="isPage(stage)" placement="top" width="360">
<el-form label-width="120px">
<el-form-item :label="$t('Selector Type')">
<el-tag :class="stage.page_css ? 'active' : 'inactive'" type="success"
@click="onSelectStagePageType(stage, 'css')">CSS
</el-tag>
<el-tag :class="!stage.page_css ? 'active' : 'inactive'" type="primary"
@click="onSelectStagePageType(stage, 'xpath')">XPath
</el-tag>
</el-form-item>
<el-form-item :label="$t('Selector')" class="page-selector">
<el-input v-if="stage.page_css" v-model="stage.page_css"/>
<el-input v-else v-model="stage.page_xpath"/>
</el-form-item>
</el-form>
<el-tag
v-if="stage.page_css"
type="success"

View File

@@ -125,9 +125,15 @@ export default {
const list = JSON.parse(JSON.stringify(this.list))
for (let i = 0; i < list.length; i++) {
if (row.name === list[i].name) {
list.splice(i, 0, 1)
list.splice(i, 1)
}
}
if (list.length === 0) {
list.push({
name: `VARIABLE_NAME_${Math.floor(new Date().getTime())}`,
value: `VARIABLE_VALUE_${Math.floor(new Date().getTime())}`
})
}
this.$store.commit('spider/SET_SPIDER_FORM_CONFIG_SETTINGS', list)
},
onAddField (row) {

View File

@@ -165,8 +165,11 @@ export default {
'Fields': '字段',
'Stage': '阶段',
'Is List': '是否为列表',
'List': '列表',
'Pagination': '分页',
'Settings': '设置',
'Display Name': '显示名称',
'Template': '模版',
// 爬虫列表
'Name': '名称',

View File

@@ -1,5 +1,6 @@
import Vue from 'vue'
import request from '../../api/request'
import axisModelCommonMixin from 'echarts/src/coord/axisModelCommonMixin'
const state = {
// list of spiders
@@ -35,7 +36,10 @@ const state = {
filterSite: '',
// preview crawl data
previewCrawlData: []
previewCrawlData: [],
// template list
templateList: []
}
const getters = {}
@@ -80,6 +84,9 @@ const mutations = {
settings[row.name] = row.value
})
Vue.set(state.spiderForm.config, 'settings', settings)
},
// Replace the stored template list (state.templateList) with the given value.
SET_TEMPLATE_LIST (state, value) {
state.templateList = value
}
}
@@ -166,6 +173,10 @@ const actions = {
},
// Create a configurable spider by sending the current spiderForm in a PUT
// request to /config_spiders; returns whatever request.put returns.
addConfigSpider ({ state }) {
return request.put(`/config_spiders`, state.spiderForm)
},
async getTemplateList ({ state, commit }) {
const res = await request.get(`/config_spiders_templates`)
commit('SET_TEMPLATE_LIST', res.data.data)
}
}

View File

@@ -96,22 +96,26 @@ export default {
this.$st.sendEv('爬虫详情', '切换爬虫')
}
},
created () {
async created () {
// get the list of the spiders
// this.$store.dispatch('spider/getSpiderList')
// get spider basic info
this.$store.dispatch('spider/getSpiderData', this.$route.params.id)
.then(() => {
// get spider file info
this.$store.dispatch('file/getFileList', this.spiderForm.src)
})
await this.$store.dispatch('spider/getSpiderData', this.$route.params.id)
// get spider file info
await this.$store.dispatch('file/getFileList', this.spiderForm.src)
// get spider tasks
this.$store.dispatch('spider/getTaskList', this.$route.params.id)
await this.$store.dispatch('spider/getTaskList', this.$route.params.id)
// get spider list
this.$store.dispatch('spider/getSpiderList')
await this.$store.dispatch('spider/getSpiderList')
// if spider is configurable spider, set to config tab by default
if (this.spiderForm.type === 'configurable') {
this.activeTabName = 'config'
}
}
}
</script>

View File

@@ -42,6 +42,16 @@
<el-form-item :label="$t('Display Name')" prop="display_name" required>
<el-input v-model="spiderForm.display_name" :placeholder="$t('Display Name')"/>
</el-form-item>
<el-form-item :label="$t('Template')" prop="template" required>
<el-select v-model="spiderForm.template" :value="spiderForm.template" :placeholder="$t('Template')">
<el-option
v-for="template in templateList"
:key="template"
:label="template"
:value="template"
/>
</el-select>
</el-form-item>
<el-form-item :label="$t('Results')" prop="col" required>
<el-input v-model="spiderForm.col" :placeholder="$t('Results')"/>
</el-form-item>
@@ -319,7 +329,8 @@ export default {
'importForm',
'spiderList',
'spiderForm',
'spiderTotal'
'spiderTotal',
'templateList'
]),
...mapGetters('user', [
'token'
@@ -342,7 +353,9 @@ export default {
this.getList()
},
onAdd () {
this.$store.commit('spider/SET_SPIDER_FORM', {})
this.$store.commit('spider/SET_SPIDER_FORM', {
template: this.templateList[0]
})
this.addDialogVisible = true
},
onAddConfigurable () {
@@ -535,19 +548,29 @@ export default {
type: this.filter.type
}
this.$store.dispatch('spider/getSpiderList', params)
},
getTypes () {
request.get(`/spider/types`).then(resp => {
this.types = resp.data.data
})
}
// getTypes () {
// request.get(`/spider/types`).then(resp => {
// this.types = resp.data.data
// })
// }
},
created () {
this.getTypes()
async created () {
// fetch spider types
// await this.getTypes()
// fetch spider list
this.getList()
await this.getList()
// fetch template list
await this.$store.dispatch('spider/getTemplateList')
},
mounted () {
console.log(this.spiderForm)
const vm = this
this.$nextTick(() => {
vm.$store.commit('spider/SET_SPIDER_FORM', this.spiderForm)
})
}
}
</script>