mirror of
https://github.com/crawlab-team/crawlab.git
synced 2026-01-22 17:31:03 +01:00
加入Spiderfile模版
This commit is contained in:
@@ -146,6 +146,7 @@ func main() {
|
||||
authGroup.POST("/config_spiders/:id", routes.PostConfigSpider) // 修改可配置爬虫
|
||||
authGroup.POST("/config_spiders/:id/upload", routes.UploadConfigSpider) // 上传可配置爬虫
|
||||
authGroup.POST("/config_spiders/:id/spiderfile", routes.PostConfigSpiderSpiderfile) // 上传可配置爬虫
|
||||
authGroup.GET("/config_spiders_templates", routes.GetConfigSpiderTemplateList) // 获取可配置爬虫模版列表
|
||||
// 任务
|
||||
authGroup.GET("/tasks", routes.GetTaskList) // 任务列表
|
||||
authGroup.GET("/tasks/:id", routes.GetTask) // 任务详情
|
||||
|
||||
@@ -36,6 +36,9 @@ type Spider struct {
|
||||
// 自定义爬虫
|
||||
Cmd string `json:"cmd" bson:"cmd"` // 执行命令
|
||||
|
||||
// 可配置爬虫
|
||||
Template string `json:"template" bson:"template"` // Spiderfile模版
|
||||
|
||||
// 前端展示
|
||||
LastRunTs time.Time `json:"last_run_ts"` // 最后一次执行时间
|
||||
LastStatus string `json:"last_status"` // 最后执行状态
|
||||
|
||||
@@ -16,6 +16,7 @@ import (
|
||||
"net/http"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// 添加可配置爬虫
|
||||
@@ -32,6 +33,12 @@ func PutConfigSpider(c *gin.Context) {
|
||||
return
|
||||
}
|
||||
|
||||
// 模版名不能为空
|
||||
if spider.Template == "" {
|
||||
HandleErrorF(http.StatusBadRequest, c, "spider template should not be empty")
|
||||
return
|
||||
}
|
||||
|
||||
// 判断爬虫是否存在
|
||||
if spider := model.GetSpiderByName(spider.Name); spider != nil {
|
||||
HandleErrorF(http.StatusBadRequest, c, fmt.Sprintf("spider for '%s' already exists", spider.Name))
|
||||
@@ -59,7 +66,7 @@ func PutConfigSpider(c *gin.Context) {
|
||||
spider.Src = spiderDir
|
||||
|
||||
// 复制Spiderfile模版
|
||||
contentByte, err := ioutil.ReadFile("./template/Spiderfile")
|
||||
contentByte, err := ioutil.ReadFile("./template/spiderfile/Spiderfile." + spider.Template)
|
||||
if err != nil {
|
||||
HandleError(http.StatusInternalServerError, c, err)
|
||||
return
|
||||
@@ -113,8 +120,8 @@ func UploadConfigSpider(c *gin.Context) {
|
||||
|
||||
// 文件名称必须为Spiderfile
|
||||
filename := header.Filename
|
||||
if filename != "Spiderfile" {
|
||||
HandleErrorF(http.StatusBadRequest, c, "filename must be 'Spiderfile'")
|
||||
if filename != "Spiderfile" && filename != "Spiderfile.yaml" && filename != "Spiderfile.yml" {
|
||||
HandleErrorF(http.StatusBadRequest, c, "filename must be 'Spiderfile(.yaml|.yml)'")
|
||||
return
|
||||
}
|
||||
|
||||
@@ -286,3 +293,18 @@ func GetConfigSpiderConfig(c *gin.Context) {
|
||||
Data: spider.Config,
|
||||
})
|
||||
}
|
||||
|
||||
// 获取模版名称列表
|
||||
func GetConfigSpiderTemplateList(c *gin.Context) {
|
||||
var data []string
|
||||
for _, fInfo := range utils.ListDir("./template/spiderfile") {
|
||||
templateName := strings.Replace(fInfo.Name(), "Spiderfile.", "", -1)
|
||||
data = append(data, templateName)
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, Response{
|
||||
Status: "ok",
|
||||
Message: "success",
|
||||
Data: data,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
|
||||
# Scrapy settings for config_spider project
|
||||
#
|
||||
@@ -100,6 +101,10 @@ for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_S
|
||||
setting_value = False
|
||||
elif re.search(r'^\d+$', setting_value) is not None:
|
||||
setting_value = int(setting_value)
|
||||
elif re.search(r'^\{.*\}$', setting_value.strip()) is not None:
|
||||
setting_value = json.loads(setting_value)
|
||||
elif re.search(r'^\[.*\]$', setting_value.strip()) is not None:
|
||||
setting_value = json.loads(setting_value)
|
||||
else:
|
||||
pass
|
||||
locals()[setting_name] = setting_value
|
||||
|
||||
20
backend/template/spiderfile/Spiderfile.163_news
Normal file
20
backend/template/spiderfile/Spiderfile.163_news
Normal file
@@ -0,0 +1,20 @@
|
||||
# Spiderfile template: news ranking table on news.163.com.
# A single list stage extracts the title, link and click count of every table
# row except the header row.
version: "0.4.0"
# NOTE(review): was "toscrapy_books", apparently copied from the books
# template; renamed to match this template — confirm nothing keys on the old name.
name: "163_news"
start_url: "http://news.163.com/special/0001386F/rank_news.html"
start_stage: "list"
engine: "scrapy"
stages:
  list:
    is_list: true
    list_css: "table tr:not(:first-child)"
    fields:
      - name: "title"
        css: "td:nth-child(1) > a"
      - name: "url"
        css: "td:nth-child(1) > a"
        attr: "href"
      - name: "clicks"
        css: "td.cBlue"
settings:
  ROBOTSTXT_OBEY: false
  USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36
|
||||
22
backend/template/spiderfile/Spiderfile.baidu
Normal file
22
backend/template/spiderfile/Spiderfile.baidu
Normal file
@@ -0,0 +1,22 @@
|
||||
# Spiderfile template: Baidu search results for the keyword "crawlab".
# A single list stage extracts title, link and abstract of every result and
# follows the href of the pagination element selected by page_css.
version: "0.4.0"
# NOTE(review): was "toscrapy_books", apparently copied from the books
# template; renamed to match this template — confirm nothing keys on the old name.
name: "baidu"
start_url: "http://www.baidu.com/s?wd=crawlab"
start_stage: "list"
engine: "scrapy"
stages:
  list:
    is_list: true
    list_css: ".result.c-container"
    page_css: "#page a.n:last-child"
    page_attr: "href"
    fields:
      - name: "title"
        css: "h3 > a"
      - name: "url"
        css: "h3 > a"
        attr: "href"
      - name: "abstract"
        css: ".c-abstract"
settings:
  ROBOTSTXT_OBEY: false
  USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36
|
||||
@@ -5,10 +5,10 @@ start_stage: "list"
|
||||
engine: "scrapy"
|
||||
stages:
|
||||
list:
|
||||
is_list: true # default: false
|
||||
is_list: true
|
||||
list_css: "section article.product_pod"
|
||||
page_css: "ul.pager li.next a"
|
||||
page_attr: "href" # default: href
|
||||
page_attr: "href"
|
||||
fields:
|
||||
- name: "title"
|
||||
css: "h3 > a"
|
||||
@@ -25,3 +25,4 @@ stages:
|
||||
css: "#product_description + p"
|
||||
settings:
|
||||
ROBOTSTXT_OBEY: true
|
||||
AUTOTHROTTLE_ENABLED: true
|
||||
@@ -171,25 +171,27 @@
|
||||
|
||||
<!--list-->
|
||||
<li class="stage-item" style="min-width: 240px">
|
||||
<label>{{$t('Is List')}}: </label>
|
||||
<label>{{$t('List')}}: </label>
|
||||
<el-checkbox
|
||||
style="text-align: left; flex-basis: 20px; margin-right: 5px"
|
||||
:value="isList(stage)"
|
||||
@change="onCheckIsList($event, stage)"
|
||||
/>
|
||||
<el-popover v-model="stage.isListOpen" v-if="isList(stage)" placement="top">
|
||||
<div>
|
||||
<el-tag :class="stage.list_css ? 'active' : 'inactive'" type="success"
|
||||
@click="onSelectStageListType(stage, 'css')">CSS
|
||||
</el-tag>
|
||||
<el-tag :class="!stage.list_css ? 'active' : 'inactive'" type="primary"
|
||||
@click="onSelectStageListType(stage, 'xpath')">XPath
|
||||
</el-tag>
|
||||
</div>
|
||||
<div class="list-selector" style="margin-top: 5px; width: 240px">
|
||||
<el-input v-if="stage.list_css" v-model="stage.list_css"/>
|
||||
<el-input v-else v-model="stage.list_xpath"/>
|
||||
</div>
|
||||
<el-popover v-model="stage.isListOpen" v-if="isList(stage)" placement="top" width="360">
|
||||
<el-form label-width="120px">
|
||||
<el-form-item :label="$t('Selector Type')">
|
||||
<el-tag :class="stage.list_css ? 'active' : 'inactive'" type="success"
|
||||
@click="onSelectStageListType(stage, 'css')">CSS
|
||||
</el-tag>
|
||||
<el-tag :class="!stage.list_css ? 'active' : 'inactive'" type="primary"
|
||||
@click="onSelectStageListType(stage, 'xpath')">XPath
|
||||
</el-tag>
|
||||
</el-form-item>
|
||||
<el-form-item :label="$t('Selector')" class="list-selector">
|
||||
<el-input v-if="stage.list_css" v-model="stage.list_css"/>
|
||||
<el-input v-else v-model="stage.list_xpath"/>
|
||||
</el-form-item>
|
||||
</el-form>
|
||||
<el-tag
|
||||
v-if="stage.list_css"
|
||||
type="success"
|
||||
@@ -223,19 +225,21 @@
|
||||
@change="onCheckIsPage($event, stage)"
|
||||
:disabled="!isList(stage)"
|
||||
/>
|
||||
<el-popover v-model="stage.isPageOpen" v-if="isPage(stage)" placement="top">
|
||||
<div>
|
||||
<el-tag :class="stage.page_css ? 'active' : 'inactive'" type="success"
|
||||
@click="onSelectStagePageType(stage, 'css')">CSS
|
||||
</el-tag>
|
||||
<el-tag :class="!stage.page_css ? 'active' : 'inactive'" type="primary"
|
||||
@click="onSelectStagePageType(stage, 'xpath')">XPath
|
||||
</el-tag>
|
||||
</div>
|
||||
<div class="page-selector" style="margin-top: 5px; width: 240px">
|
||||
<el-input v-if="stage.page_css" v-model="stage.page_css"/>
|
||||
<el-input v-else v-model="stage.page_xpath"/>
|
||||
</div>
|
||||
<el-popover v-model="stage.isPageOpen" v-if="isPage(stage)" placement="top" width="360">
|
||||
<el-form label-width="120px">
|
||||
<el-form-item :label="$t('Selector Type')">
|
||||
<el-tag :class="stage.page_css ? 'active' : 'inactive'" type="success"
|
||||
@click="onSelectStagePageType(stage, 'css')">CSS
|
||||
</el-tag>
|
||||
<el-tag :class="!stage.page_css ? 'active' : 'inactive'" type="primary"
|
||||
@click="onSelectStagePageType(stage, 'xpath')">XPath
|
||||
</el-tag>
|
||||
</el-form-item>
|
||||
<el-form-item :label="$t('Selector')" class="page-selector">
|
||||
<el-input v-if="stage.page_css" v-model="stage.page_css"/>
|
||||
<el-input v-else v-model="stage.page_xpath"/>
|
||||
</el-form-item>
|
||||
</el-form>
|
||||
<el-tag
|
||||
v-if="stage.page_css"
|
||||
type="success"
|
||||
|
||||
@@ -125,9 +125,15 @@ export default {
|
||||
const list = JSON.parse(JSON.stringify(this.list))
|
||||
for (let i = 0; i < list.length; i++) {
|
||||
if (row.name === list[i].name) {
|
||||
list.splice(i, 0, 1)
|
||||
list.splice(i, 1)
|
||||
}
|
||||
}
|
||||
if (list.length === 0) {
|
||||
list.push({
|
||||
name: `VARIABLE_NAME_${Math.floor(new Date().getTime())}`,
|
||||
value: `VARIABLE_VALUE_${Math.floor(new Date().getTime())}`
|
||||
})
|
||||
}
|
||||
this.$store.commit('spider/SET_SPIDER_FORM_CONFIG_SETTINGS', list)
|
||||
},
|
||||
onAddField (row) {
|
||||
|
||||
@@ -165,8 +165,11 @@ export default {
|
||||
'Fields': '字段',
|
||||
'Stage': '阶段',
|
||||
'Is List': '是否为列表',
|
||||
'List': '列表',
|
||||
'Pagination': '分页',
|
||||
'Settings': '设置',
|
||||
'Display Name': '显示名称',
|
||||
'Template': '模版',
|
||||
|
||||
// 爬虫列表
|
||||
'Name': '名称',
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import Vue from 'vue'
|
||||
import request from '../../api/request'
|
||||
import axisModelCommonMixin from 'echarts/src/coord/axisModelCommonMixin'
|
||||
|
||||
const state = {
|
||||
// list of spiders
|
||||
@@ -35,7 +36,10 @@ const state = {
|
||||
filterSite: '',
|
||||
|
||||
// preview crawl data
|
||||
previewCrawlData: []
|
||||
previewCrawlData: [],
|
||||
|
||||
// template list
|
||||
templateList: []
|
||||
}
|
||||
|
||||
const getters = {}
|
||||
@@ -80,6 +84,9 @@ const mutations = {
|
||||
settings[row.name] = row.value
|
||||
})
|
||||
Vue.set(state.spiderForm.config, 'settings', settings)
|
||||
},
|
||||
SET_TEMPLATE_LIST (state, value) {
|
||||
state.templateList = value
|
||||
}
|
||||
}
|
||||
|
||||
@@ -166,6 +173,10 @@ const actions = {
|
||||
},
|
||||
addConfigSpider ({ state }) {
|
||||
return request.put(`/config_spiders`, state.spiderForm)
|
||||
},
|
||||
async getTemplateList ({ state, commit }) {
|
||||
const res = await request.get(`/config_spiders_templates`)
|
||||
commit('SET_TEMPLATE_LIST', res.data.data)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -96,22 +96,26 @@ export default {
|
||||
this.$st.sendEv('爬虫详情', '切换爬虫')
|
||||
}
|
||||
},
|
||||
created () {
|
||||
async created () {
|
||||
// get the list of the spiders
|
||||
// this.$store.dispatch('spider/getSpiderList')
|
||||
|
||||
// get spider basic info
|
||||
this.$store.dispatch('spider/getSpiderData', this.$route.params.id)
|
||||
.then(() => {
|
||||
// get spider file info
|
||||
this.$store.dispatch('file/getFileList', this.spiderForm.src)
|
||||
})
|
||||
await this.$store.dispatch('spider/getSpiderData', this.$route.params.id)
|
||||
|
||||
// get spider file info
|
||||
await this.$store.dispatch('file/getFileList', this.spiderForm.src)
|
||||
|
||||
// get spider tasks
|
||||
this.$store.dispatch('spider/getTaskList', this.$route.params.id)
|
||||
await this.$store.dispatch('spider/getTaskList', this.$route.params.id)
|
||||
|
||||
// get spider list
|
||||
this.$store.dispatch('spider/getSpiderList')
|
||||
await this.$store.dispatch('spider/getSpiderList')
|
||||
|
||||
// if spider is configurable spider, set to config tab by default
|
||||
if (this.spiderForm.type === 'configurable') {
|
||||
this.activeTabName = 'config'
|
||||
}
|
||||
}
|
||||
}
|
||||
</script>
|
||||
|
||||
@@ -42,6 +42,16 @@
|
||||
<el-form-item :label="$t('Display Name')" prop="display_name" required>
|
||||
<el-input v-model="spiderForm.display_name" :placeholder="$t('Display Name')"/>
|
||||
</el-form-item>
|
||||
<el-form-item :label="$t('Template')" prop="template" required>
|
||||
<el-select v-model="spiderForm.template" :value="spiderForm.template" :placeholder="$t('Template')">
|
||||
<el-option
|
||||
v-for="template in templateList"
|
||||
:key="template"
|
||||
:label="template"
|
||||
:value="template"
|
||||
/>
|
||||
</el-select>
|
||||
</el-form-item>
|
||||
<el-form-item :label="$t('Results')" prop="col" required>
|
||||
<el-input v-model="spiderForm.col" :placeholder="$t('Results')"/>
|
||||
</el-form-item>
|
||||
@@ -319,7 +329,8 @@ export default {
|
||||
'importForm',
|
||||
'spiderList',
|
||||
'spiderForm',
|
||||
'spiderTotal'
|
||||
'spiderTotal',
|
||||
'templateList'
|
||||
]),
|
||||
...mapGetters('user', [
|
||||
'token'
|
||||
@@ -342,7 +353,9 @@ export default {
|
||||
this.getList()
|
||||
},
|
||||
onAdd () {
|
||||
this.$store.commit('spider/SET_SPIDER_FORM', {})
|
||||
this.$store.commit('spider/SET_SPIDER_FORM', {
|
||||
template: this.templateList[0]
|
||||
})
|
||||
this.addDialogVisible = true
|
||||
},
|
||||
onAddConfigurable () {
|
||||
@@ -535,19 +548,29 @@ export default {
|
||||
type: this.filter.type
|
||||
}
|
||||
this.$store.dispatch('spider/getSpiderList', params)
|
||||
},
|
||||
getTypes () {
|
||||
request.get(`/spider/types`).then(resp => {
|
||||
this.types = resp.data.data
|
||||
})
|
||||
}
|
||||
// getTypes () {
|
||||
// request.get(`/spider/types`).then(resp => {
|
||||
// this.types = resp.data.data
|
||||
// })
|
||||
// }
|
||||
},
|
||||
created () {
|
||||
this.getTypes()
|
||||
async created () {
|
||||
// fetch spider types
|
||||
// await this.getTypes()
|
||||
|
||||
// fetch spider list
|
||||
this.getList()
|
||||
await this.getList()
|
||||
|
||||
// fetch template list
|
||||
await this.$store.dispatch('spider/getTemplateList')
|
||||
},
|
||||
mounted () {
|
||||
console.log(this.spiderForm)
|
||||
const vm = this
|
||||
this.$nextTick(() => {
|
||||
vm.$store.commit('spider/SET_SPIDER_FORM', this.spiderForm)
|
||||
})
|
||||
}
|
||||
}
|
||||
</script>
|
||||
|
||||
Reference in New Issue
Block a user