From 3ef794f7a2c4ad6e75697d7124a28222bbc66e8a Mon Sep 17 00:00:00 2001 From: marvzhang Date: Fri, 13 Dec 2019 12:55:53 +0800 Subject: [PATCH] =?UTF-8?q?=E5=B0=86=E5=8F=AF=E9=85=8D=E7=BD=AE=E7=88=AC?= =?UTF-8?q?=E8=99=ABstages=E8=B0=83=E6=95=B4=E4=B8=BA=E5=88=97=E8=A1=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/entity/config_spider.go | 2 +- backend/model/config_spider/common.go | 10 ++--- backend/model/config_spider/scrapy.go | 3 +- backend/model/spider.go | 6 --- backend/services/config_spider.go | 10 ++--- .../template/spiderfile/Spiderfile.163_news | 22 +++++------ backend/template/spiderfile/Spiderfile.baidu | 26 ++++++------- .../spiderfile/Spiderfile.toscrapy_books | 38 +++++++++---------- 8 files changed, 52 insertions(+), 65 deletions(-) diff --git a/backend/entity/config_spider.go b/backend/entity/config_spider.go index 3fe28bc9..d9e085d2 100644 --- a/backend/entity/config_spider.go +++ b/backend/entity/config_spider.go @@ -5,7 +5,7 @@ type ConfigSpiderData struct { Engine string `yaml:"engine" json:"engine"` StartUrl string `yaml:"start_url" json:"start_url"` StartStage string `yaml:"start_stage" json:"start_stage"` - Stages map[string]Stage `yaml:"stages" json:"stages"` + Stages []Stage `yaml:"stages" json:"stages"` Settings map[string]string `yaml:"settings" json:"settings"` } diff --git a/backend/model/config_spider/common.go b/backend/model/config_spider/common.go index c803755a..4d244fe1 100644 --- a/backend/model/config_spider/common.go +++ b/backend/model/config_spider/common.go @@ -15,16 +15,12 @@ func GetAllFields(data entity.ConfigSpiderData) []entity.Field { func GetStartStageName(data entity.ConfigSpiderData) string { // 如果 start_stage 设置了且在 stages 里,则返回 if data.StartStage != "" { - for stageName := range data.Stages { - if stageName == data.StartStage { - return data.StartStage - } - } + return data.StartStage } // 否则返回第一个 stage - for stageName := range data.Stages { - return stageName + for _, stage := range data.Stages { + return stage.Name } return "" } diff --git a/backend/model/config_spider/scrapy.go b/backend/model/config_spider/scrapy.go index 6fcb77f0..ee24a3e7 100644 --- a/backend/model/config_spider/scrapy.go +++ b/backend/model/config_spider/scrapy.go @@ -83,7 +83,8 @@ func (g ScrapyGenerator) ProcessSpider() error { // 替换 parsers strParser := "" - for stageName, stage := range g.ConfigData.Stages { + for _, stage := range g.ConfigData.Stages { + stageName := stage.Name stageStr := g.GetParserString(stageName, stage) strParser += stageStr } diff --git a/backend/model/spider.go b/backend/model/spider.go index a0d72c1c..78adc4d0 100644 --- a/backend/model/spider.go +++ b/backend/model/spider.go @@ -319,11 +319,5 @@ func GetConfigSpiderData(spider Spider) (entity.ConfigSpiderData, error) { return configData, err } - // 赋值 stage_name - for stageName, stage := range configData.Stages { - stage.Name = stageName - configData.Stages[stageName] = stage - } - return configData, nil } diff --git a/backend/services/config_spider.go b/backend/services/config_spider.go index 7c736cc7..fe0a3da1 100644 --- a/backend/services/config_spider.go +++ b/backend/services/config_spider.go @@ -61,7 +61,9 @@ func ValidateSpiderfile(configData entity.ConfigSpiderData) error { // 校验stages dict := map[string]int{} - for stageName, stage := range configData.Stages { + for _, stage := range configData.Stages { + stageName := stage.Name + // stage 名称不能为空 if stageName == "" { return errors.New("spiderfile invalid: stage name is empty") @@ -152,12 +154,6 @@ func IsUniqueConfigSpiderFields(fields []entity.Field) bool { func ProcessSpiderFilesFromConfigData(spider model.Spider, configData entity.ConfigSpiderData) error { spiderDir := spider.Src - // 赋值 stage_name - for stageName, stage := range configData.Stages { - stage.Name = stageName - configData.Stages[stageName] = stage - } - // 删除已有的爬虫文件 for _, fInfo := range utils.ListDir(spiderDir) { // 不删除Spiderfile diff --git a/backend/template/spiderfile/Spiderfile.163_news b/backend/template/spiderfile/Spiderfile.163_news index 29d58279..c2a73be7 100644 --- a/backend/template/spiderfile/Spiderfile.163_news +++ b/backend/template/spiderfile/Spiderfile.163_news @@ -4,17 +4,17 @@ start_url: "http://news.163.com/special/0001386F/rank_news.html" start_stage: "list" engine: "scrapy" stages: - list: - is_list: true - list_css: "table tr:not(:first-child)" - fields: - - name: "title" - css: "td:nth-child(1) > a" - - name: "url" - css: "td:nth-child(1) > a" - attr: "href" - - name: "clicks" - css: "td.cBlue" +- name: list + is_list: true + list_css: "table tr:not(:first-child)" + fields: + - name: "title" + css: "td:nth-child(1) > a" + - name: "url" + css: "td:nth-child(1) > a" + attr: "href" + - name: "clicks" + css: "td.cBlue" settings: ROBOTSTXT_OBEY: false USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/backend/template/spiderfile/Spiderfile.baidu b/backend/template/spiderfile/Spiderfile.baidu index fbf720e4..5643c980 100644 --- a/backend/template/spiderfile/Spiderfile.baidu +++ b/backend/template/spiderfile/Spiderfile.baidu @@ -4,19 +4,19 @@ start_url: http://www.baidu.com/s?wd=crawlab start_stage: list engine: scrapy stages: - list: - is_list: true - list_xpath: //*[contains(@class, "c-container")] - page_xpath: //*[@id="page"]//a[@class="n"][last()] - page_attr: href - fields: - - name: title - xpath: .//h3/a - - name: url - xpath: .//h3/a - attr: href - - name: abstract - xpath: .//*[@class="c-abstract"] +- name: list + is_list: true + list_xpath: //*[contains(@class, "c-container")] + page_xpath: //*[@id="page"]//a[@class="n"][last()] + page_attr: href + fields: + - name: title + xpath: .//h3/a + - name: url + xpath: .//h3/a + attr: href + - name: abstract + xpath: .//*[@class="c-abstract"] settings: ROBOTSTXT_OBEY: false USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/backend/template/spiderfile/Spiderfile.toscrapy_books b/backend/template/spiderfile/Spiderfile.toscrapy_books index 4bf18f61..247b4f40 100644 --- a/backend/template/spiderfile/Spiderfile.toscrapy_books +++ b/backend/template/spiderfile/Spiderfile.toscrapy_books @@ -4,25 +4,25 @@ start_url: "http://books.toscrape.com" start_stage: "list" engine: "scrapy" stages: - list: - is_list: true - list_css: "section article.product_pod" - page_css: "ul.pager li.next a" - page_attr: "href" - fields: - - name: "title" - css: "h3 > a" - - name: "url" - css: "h3 > a" - attr: "href" - next_stage: "detail" - - name: "price" - css: ".product_price > .price_color" - detail: - is_list: false - fields: - - name: "description" - css: "#product_description + p" +- name: list + is_list: true + list_css: "section article.product_pod" + page_css: "ul.pager li.next a" + page_attr: "href" + fields: + - name: "title" + css: "h3 > a" + - name: "url" + css: "h3 > a" + attr: "href" + next_stage: "detail" + - name: "price" + css: ".product_price > .price_color" +- name: detail + is_list: false + fields: + - name: "description" + css: "#product_description + p" settings: ROBOTSTXT_OBEY: true AUTOTHROTTLE_ENABLED: true