diff --git a/backend/main.go b/backend/main.go
index 226b3bd5..b49efae7 100644
--- a/backend/main.go
+++ b/backend/main.go
@@ -146,6 +146,7 @@ func main() {
 			authGroup.POST("/config_spiders/:id", routes.PostConfigSpider) // 修改可配置爬虫
 			authGroup.POST("/config_spiders/:id/upload", routes.UploadConfigSpider) // 上传可配置爬虫
 			authGroup.POST("/config_spiders/:id/spiderfile", routes.PostConfigSpiderSpiderfile) // 上传可配置爬虫
+			authGroup.GET("/config_spiders_templates", routes.GetConfigSpiderTemplateList) // list the configurable-spider templates
 			// 任务
 			authGroup.GET("/tasks", routes.GetTaskList) // 任务列表
 			authGroup.GET("/tasks/:id", routes.GetTask) // 任务详情
diff --git a/backend/model/spider.go b/backend/model/spider.go
index a06e682b..a0d72c1c 100644
--- a/backend/model/spider.go
+++ b/backend/model/spider.go
@@ -36,6 +36,9 @@ type Spider struct {
 	// 自定义爬虫
 	Cmd string `json:"cmd" bson:"cmd"` // 执行命令
 
+	// Configurable spider
+	Template string `json:"template" bson:"template"` // name of the Spiderfile template
+
 	// 前端展示
 	LastRunTs time.Time `json:"last_run_ts"` // 最后一次执行时间
 	LastStatus string `json:"last_status"` // 最后执行状态
diff --git a/backend/routes/config_spider.go b/backend/routes/config_spider.go
index 61067f98..3e0f0e56 100644
--- a/backend/routes/config_spider.go
+++ b/backend/routes/config_spider.go
@@ -16,6 +16,7 @@ import (
 	"net/http"
 	"os"
 	"path/filepath"
+	"strings"
 )
 
 // 添加可配置爬虫
@@ -32,6 +33,12 @@ func PutConfigSpider(c *gin.Context) {
 		return
 	}
 
+	// The template name is required and is spliced into a file path below, so it must not contain path separators
+	if spider.Template == "" || strings.ContainsAny(spider.Template, `/\`) {
+		HandleErrorF(http.StatusBadRequest, c, "spider template should not be empty or contain path separators")
+		return
+	}
+
 	// 判断爬虫是否存在
 	if spider := model.GetSpiderByName(spider.Name); spider != nil {
 		HandleErrorF(http.StatusBadRequest, c, fmt.Sprintf("spider for '%s' already exists", spider.Name))
@@ -59,7 +66,7 @@ func PutConfigSpider(c *gin.Context) {
 	spider.Src = spiderDir
 
 	// 复制Spiderfile模版
-	contentByte, err := ioutil.ReadFile("./template/Spiderfile")
+	contentByte, err := ioutil.ReadFile("./template/spiderfile/Spiderfile." + spider.Template)
 	if err != nil {
 		HandleError(http.StatusInternalServerError, c, err)
 		return
@@ -113,8 +120,8 @@ func UploadConfigSpider(c *gin.Context) {
 
 	// 文件名称必须为Spiderfile
 	filename := header.Filename
-	if filename != "Spiderfile" {
-		HandleErrorF(http.StatusBadRequest, c, "filename must be 'Spiderfile'")
+	if filename != "Spiderfile" && filename != "Spiderfile.yaml" && filename != "Spiderfile.yml" {
+		HandleErrorF(http.StatusBadRequest, c, "filename must be 'Spiderfile(.yaml|.yml)'")
 		return
 	}
 
@@ -286,3 +293,24 @@ func GetConfigSpiderConfig(c *gin.Context) {
 		Data:    spider.Config,
 	})
 }
+
+// GetConfigSpiderTemplateList responds with the names of the Spiderfile templates
+// listed in ./template/spiderfile, i.e. each file name with its "Spiderfile."
+// prefix removed.
+func GetConfigSpiderTemplateList(c *gin.Context) {
+	// Start from an empty, non-nil slice so that the JSON payload is [] rather
+	// than null when no template is found.
+	data := make([]string, 0)
+	for _, fInfo := range utils.ListDir("./template/spiderfile") {
+		// Strip only the leading prefix: PutConfigSpider rebuilds the file name as
+		// "Spiderfile." + name, so the rest of the name must stay intact.
+		templateName := strings.TrimPrefix(fInfo.Name(), "Spiderfile.")
+		data = append(data, templateName)
+	}
+
+	c.JSON(http.StatusOK, Response{
+		Status:  "ok",
+		Message: "success",
+		Data:    data,
+	})
+}
diff --git a/backend/template/scrapy/config_spider/settings.py b/backend/template/scrapy/config_spider/settings.py
index 195f95df..4b0965f2 100644
--- a/backend/template/scrapy/config_spider/settings.py
+++ b/backend/template/scrapy/config_spider/settings.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 import os
 import re
+import json
 
 # Scrapy settings for config_spider project
 #
@@ -100,6 +101,13 @@ for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_S
         setting_value = False
     elif re.search(r'^\d+$', setting_value) is not None:
         setting_value = int(setting_value)
+    elif re.search(r'^(\{.*\}|\[.*\])$', setting_value.strip(), re.S) is not None:
+        # Looks like a JSON object or array. Keep the raw string when it does not
+        # parse (e.g. a bracketed plain value such as "[draft]") instead of raising.
+        try:
+            setting_value = json.loads(setting_value)
+        except ValueError:
+            pass
     else:
         pass
     locals()[setting_name] = setting_value
diff --git a/backend/template/spiderfile/Spiderfile.163_news b/backend/template/spiderfile/Spiderfile.163_news
new file mode 100644
index 00000000..29d58279
--- /dev/null
+++ b/backend/template/spiderfile/Spiderfile.163_news
@@ -0,0 +1,20 @@
+version: "0.4.0"
+name: "163_news"
+start_url: "http://news.163.com/special/0001386F/rank_news.html"
+start_stage: "list"
+engine: "scrapy"
+stages:
+  list:
+    is_list: true
+    list_css: "table tr:not(:first-child)"
+    fields:
+      - name: "title"
+        css: "td:nth-child(1) > a"
+      - name: "url"
+        css: "td:nth-child(1) > a"
+        attr: "href"
+      - name: "clicks"
+        css: "td.cBlue"
+settings:
+  ROBOTSTXT_OBEY: false
+  USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36
diff --git a/backend/template/spiderfile/Spiderfile.baidu b/backend/template/spiderfile/Spiderfile.baidu
new file mode 100644
index 00000000..86388621
--- /dev/null
+++ b/backend/template/spiderfile/Spiderfile.baidu
@@ -0,0 +1,22 @@
+version: "0.4.0"
+name: "baidu"
+start_url: "http://www.baidu.com/s?wd=crawlab"
+start_stage: "list"
+engine: "scrapy"
+stages:
+  list:
+    is_list: true
+    list_css: ".result.c-container"
+    page_css: "#page a.n:last-child"
+    page_attr: "href"
+    fields:
+      - name: "title"
+        css: "h3 > a"
+      - name: "url"
+        css: "h3 > a"
+        attr: "href"
+      - name: "abstract"
+        css: ".c-abstract"
+settings:
+  ROBOTSTXT_OBEY: false
+  USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36
diff --git a/backend/template/Spiderfile b/backend/template/spiderfile/Spiderfile.toscrapy_books
similarity index 88%
rename from backend/template/Spiderfile
rename to backend/template/spiderfile/Spiderfile.toscrapy_books
index d748d5f8..4bf18f61 100644
--- a/backend/template/Spiderfile
+++ b/backend/template/spiderfile/Spiderfile.toscrapy_books
@@ -5,10 +5,10 @@ start_stage: "list"
 engine: "scrapy"
 stages:
   list:
-    is_list: true # default: false
+    is_list: true
     list_css: "section article.product_pod"
     page_css: "ul.pager li.next a"
-    page_attr: "href" # default: href
+    page_attr: "href"
     fields:
       - name: "title"
         css: "h3 > a"
@@ -25,3 +25,4 @@ stages:
         css: "#product_description + p"
 settings:
   ROBOTSTXT_OBEY: true
+  AUTOTHROTTLE_ENABLED: true
diff --git a/frontend/src/components/Config/ConfigList.vue b/frontend/src/components/Config/ConfigList.vue
index 160de988..1fc62d18 100644
--- a/frontend/src/components/Config/ConfigList.vue
+++ b/frontend/src/components/Config/ConfigList.vue
@@ -171,25 +171,27 @@