From 98cbccb81e96efe5106807b34345c7f2985da5e0 Mon Sep 17 00:00:00 2001 From: marvzhang Date: Tue, 3 Dec 2019 13:37:41 +0800 Subject: [PATCH] =?UTF-8?q?=E5=8A=A0=E5=85=A5Spiderfile=E6=A8=A1=E7=89=88?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/main.go | 1 + backend/model/spider.go | 3 + backend/routes/config_spider.go | 28 ++++++++- .../template/scrapy/config_spider/settings.py | 5 ++ .../template/spiderfile/Spiderfile.163_news | 20 +++++++ backend/template/spiderfile/Spiderfile.baidu | 22 +++++++ .../Spiderfile.toscrapy_books} | 5 +- frontend/src/components/Config/ConfigList.vue | 58 ++++++++++--------- .../TableView/SettingFieldsTableView.vue | 8 ++- frontend/src/i18n/zh.js | 3 + frontend/src/store/modules/spider.js | 13 ++++- frontend/src/views/spider/SpiderDetail.vue | 20 ++++--- frontend/src/views/spider/SpiderList.vue | 43 ++++++++++---- 13 files changed, 177 insertions(+), 52 deletions(-) create mode 100644 backend/template/spiderfile/Spiderfile.163_news create mode 100644 backend/template/spiderfile/Spiderfile.baidu rename backend/template/{Spiderfile => spiderfile/Spiderfile.toscrapy_books} (88%) diff --git a/backend/main.go b/backend/main.go index 226b3bd5..b49efae7 100644 --- a/backend/main.go +++ b/backend/main.go @@ -146,6 +146,7 @@ func main() { authGroup.POST("/config_spiders/:id", routes.PostConfigSpider) // 修改可配置爬虫 authGroup.POST("/config_spiders/:id/upload", routes.UploadConfigSpider) // 上传可配置爬虫 authGroup.POST("/config_spiders/:id/spiderfile", routes.PostConfigSpiderSpiderfile) // 上传可配置爬虫 + authGroup.GET("/config_spiders_templates", routes.GetConfigSpiderTemplateList) // 获取可配置爬虫模版列表 // 任务 authGroup.GET("/tasks", routes.GetTaskList) // 任务列表 authGroup.GET("/tasks/:id", routes.GetTask) // 任务详情 diff --git a/backend/model/spider.go b/backend/model/spider.go index a06e682b..a0d72c1c 100644 --- a/backend/model/spider.go +++ b/backend/model/spider.go @@ -36,6 +36,9 @@ type Spider struct { // 自定义爬虫 Cmd string `json:"cmd" bson:"cmd"` // 执行命令 + // 可配置爬虫 + Template string `json:"template" bson:"template"` // Spiderfile模版 + // 前端展示 LastRunTs time.Time `json:"last_run_ts"` // 最后一次执行时间 LastStatus string `json:"last_status"` // 最后执行状态 diff --git a/backend/routes/config_spider.go b/backend/routes/config_spider.go index 61067f98..3e0f0e56 100644 --- a/backend/routes/config_spider.go +++ b/backend/routes/config_spider.go @@ -16,6 +16,7 @@ import ( "net/http" "os" "path/filepath" + "strings" ) // 添加可配置爬虫 @@ -32,6 +33,12 @@ func PutConfigSpider(c *gin.Context) { return } + // 模版名不能为空 + if spider.Template == "" { + HandleErrorF(http.StatusBadRequest, c, "spider template should not be empty") + return + } + // 判断爬虫是否存在 if spider := model.GetSpiderByName(spider.Name); spider != nil { HandleErrorF(http.StatusBadRequest, c, fmt.Sprintf("spider for '%s' already exists", spider.Name)) @@ -59,7 +66,7 @@ func PutConfigSpider(c *gin.Context) { spider.Src = spiderDir // 复制Spiderfile模版 - contentByte, err := ioutil.ReadFile("./template/Spiderfile") + contentByte, err := ioutil.ReadFile("./template/spiderfile/Spiderfile." + spider.Template) if err != nil { HandleError(http.StatusInternalServerError, c, err) return @@ -113,8 +120,8 @@ func UploadConfigSpider(c *gin.Context) { // 文件名称必须为Spiderfile filename := header.Filename - if filename != "Spiderfile" { - HandleErrorF(http.StatusBadRequest, c, "filename must be 'Spiderfile'") + if filename != "Spiderfile" && filename != "Spiderfile.yaml" && filename != "Spiderfile.yml" { + HandleErrorF(http.StatusBadRequest, c, "filename must be 'Spiderfile(.yaml|.yml)'") return } @@ -286,3 +293,18 @@ func GetConfigSpiderConfig(c *gin.Context) { Data: spider.Config, }) } + +// 获取模版名称列表 +func GetConfigSpiderTemplateList(c *gin.Context) { + var data []string + for _, fInfo := range utils.ListDir("./template/spiderfile") { + templateName := strings.Replace(fInfo.Name(), "Spiderfile.", "", -1) + data = append(data, templateName) + } + + c.JSON(http.StatusOK, Response{ + Status: "ok", + Message: "success", + Data: data, + }) +} diff --git a/backend/template/scrapy/config_spider/settings.py b/backend/template/scrapy/config_spider/settings.py index 195f95df..4b0965f2 100644 --- a/backend/template/scrapy/config_spider/settings.py +++ b/backend/template/scrapy/config_spider/settings.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- import os import re +import json # Scrapy settings for config_spider project # @@ -100,6 +101,10 @@ for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_S setting_value = False elif re.search(r'^\d+$', setting_value) is not None: setting_value = int(setting_value) + elif re.search(r'^\{.*\}$', setting_value.strip()) is not None: + setting_value = json.loads(setting_value) + elif re.search(r'^\[.*\]$', setting_value.strip()) is not None: + setting_value = json.loads(setting_value) else: pass locals()[setting_name] = setting_value diff --git a/backend/template/spiderfile/Spiderfile.163_news b/backend/template/spiderfile/Spiderfile.163_news new file mode 100644 index 00000000..29d58279 --- /dev/null +++ b/backend/template/spiderfile/Spiderfile.163_news @@ -0,0 +1,20 @@ +version: "0.4.0" +name: "toscrapy_books" +start_url: "http://news.163.com/special/0001386F/rank_news.html" +start_stage: "list" +engine: "scrapy" +stages: + list: + is_list: true + list_css: "table tr:not(:first-child)" + fields: + - name: "title" + css: "td:nth-child(1) > a" + - name: "url" + css: "td:nth-child(1) > a" + attr: "href" + - name: "clicks" + css: "td.cBlue" +settings: + ROBOTSTXT_OBEY: false + USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/backend/template/spiderfile/Spiderfile.baidu b/backend/template/spiderfile/Spiderfile.baidu new file mode 100644 index 00000000..86388621 --- /dev/null +++ b/backend/template/spiderfile/Spiderfile.baidu @@ -0,0 +1,22 @@ +version: "0.4.0" +name: "toscrapy_books" +start_url: "http://www.baidu.com/s?wd=crawlab" +start_stage: "list" +engine: "scrapy" +stages: + list: + is_list: true + list_css: ".result.c-container" + page_css: "#page a.n:last-child" + page_attr: "href" + fields: + - name: "title" + css: "h3 > a" + - name: "url" + css: "h3 > a" + attr: "href" + - name: "abstract" + css: ".c-abstract" +settings: + ROBOTSTXT_OBEY: false + USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/backend/template/Spiderfile b/backend/template/spiderfile/Spiderfile.toscrapy_books similarity index 88% rename from backend/template/Spiderfile rename to backend/template/spiderfile/Spiderfile.toscrapy_books index d748d5f8..4bf18f61 100644 --- a/backend/template/Spiderfile +++ b/backend/template/spiderfile/Spiderfile.toscrapy_books @@ -5,10 +5,10 @@ start_stage: "list" engine: "scrapy" stages: list: - is_list: true # default: false + is_list: true list_css: "section article.product_pod" page_css: "ul.pager li.next a" - page_attr: "href" # default: href + page_attr: "href" fields: - name: "title" css: "h3 > a" @@ -25,3 +25,4 @@ stages: css: "#product_description + p" settings: ROBOTSTXT_OBEY: true + AUTOTHROTTLE_ENABLED: true diff --git a/frontend/src/components/Config/ConfigList.vue b/frontend/src/components/Config/ConfigList.vue index 160de988..1fc62d18 100644 --- a/frontend/src/components/Config/ConfigList.vue +++ b/frontend/src/components/Config/ConfigList.vue @@ -171,25 +171,27 @@
  • - + - -
    - CSS - - XPath - -
    -
    - - -
    + + + + CSS + + XPath + + + + + + + - -
    - CSS - - XPath - -
    -
    - - -
    + + + + CSS + + XPath + + + + + + + { - // get spider file info - this.$store.dispatch('file/getFileList', this.spiderForm.src) - }) + await this.$store.dispatch('spider/getSpiderData', this.$route.params.id) + + // get spider file info + await this.$store.dispatch('file/getFileList', this.spiderForm.src) // get spider tasks - this.$store.dispatch('spider/getTaskList', this.$route.params.id) + await this.$store.dispatch('spider/getTaskList', this.$route.params.id) // get spider list - this.$store.dispatch('spider/getSpiderList') + await this.$store.dispatch('spider/getSpiderList') + + // if spider is configurable spider, set to config tab by default + if (this.spiderForm.type === 'configurable') { + this.activeTabName = 'config' + } } } diff --git a/frontend/src/views/spider/SpiderList.vue b/frontend/src/views/spider/SpiderList.vue index c756ce52..cc3d6acb 100644 --- a/frontend/src/views/spider/SpiderList.vue +++ b/frontend/src/views/spider/SpiderList.vue @@ -42,6 +42,16 @@ + + + + + @@ -319,7 +329,8 @@ export default { 'importForm', 'spiderList', 'spiderForm', - 'spiderTotal' + 'spiderTotal', + 'templateList' ]), ...mapGetters('user', [ 'token' @@ -342,7 +353,9 @@ export default { this.getList() }, onAdd () { - this.$store.commit('spider/SET_SPIDER_FORM', {}) + this.$store.commit('spider/SET_SPIDER_FORM', { + template: this.templateList[0] + }) this.addDialogVisible = true }, onAddConfigurable () { @@ -535,19 +548,29 @@ export default { type: this.filter.type } this.$store.dispatch('spider/getSpiderList', params) - }, - getTypes () { - request.get(`/spider/types`).then(resp => { - this.types = resp.data.data - }) } + // getTypes () { + // request.get(`/spider/types`).then(resp => { + // this.types = resp.data.data + // }) + // } }, - created () { - this.getTypes() + async created () { + // fetch spider types + // await this.getTypes() + // fetch spider list - this.getList() + await this.getList() + + // fetch template list + await this.$store.dispatch('spider/getTemplateList') }, mounted () { + console.log(this.spiderForm) + const vm = this + this.$nextTick(() => { + vm.$store.commit('spider/SET_SPIDER_FORM', this.spiderForm) + }) } }