From 5a286f98af02f9b6c8d600fd9f547c6f3595caae Mon Sep 17 00:00:00 2001
From: marvzhang
Date: Sun, 24 Nov 2019 19:45:21 +0800
Subject: [PATCH] Update configurable spider, fix some issues
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 backend/constants/scrapy.go                     |  2 +-
 backend/entity/config_spider.go                 |  9 +++--
 backend/model/config_spider/common.go           |  6 +--
 backend/model/config_spider/scrapy.go           | 38 +++++++++++--------
 backend/template/Spiderfile                     | 28 ++++++++------
 .../scrapy/config_spider/spiders/spider.py      |  8 +++-
 6 files changed, 53 insertions(+), 38 deletions(-)

diff --git a/backend/constants/scrapy.go b/backend/constants/scrapy.go
index 8ad6739c..bc82508f 100644
--- a/backend/constants/scrapy.go
+++ b/backend/constants/scrapy.go
@@ -1,5 +1,5 @@
 package constants
 
-const ScrapyProtectedStageNames = "start_requests"
+const ScrapyProtectedStageNames = ""
 
 const ScrapyProtectedFieldNames = "_id,task_id,ts"
diff --git a/backend/entity/config_spider.go b/backend/entity/config_spider.go
index b519163b..5e0fe1e1 100644
--- a/backend/entity/config_spider.go
+++ b/backend/entity/config_spider.go
@@ -9,10 +9,11 @@ type Field struct {
 }
 
 type Stage struct {
-	IsList  bool    `yaml:"is_list" json:"is_list"`
-	ListCss string  `yaml:"list_css" json:"list_css"`
-	PageCss string  `yaml:"page_css" json:"page_css"`
-	Fields  []Field `yaml:"fields" json:"fields"`
+	IsList   bool    `yaml:"is_list" json:"is_list"`
+	ListCss  string  `yaml:"list_css" json:"list_css"`
+	PageCss  string  `yaml:"page_css" json:"page_css"`
+	PageAttr string  `yaml:"page_attr" json:"page_attr"`
+	Fields   []Field `yaml:"fields" json:"fields"`
 }
 
 type ConfigSpiderData struct {
diff --git a/backend/model/config_spider/common.go b/backend/model/config_spider/common.go
index e8440350..c803755a 100644
--- a/backend/model/config_spider/common.go
+++ b/backend/model/config_spider/common.go
@@ -5,10 +5,8 @@ import "crawlab/entity"
 func GetAllFields(data entity.ConfigSpiderData) []entity.Field {
 	var fields []entity.Field
 	for _, stage := range data.Stages {
-		if stage.IsList {
-			for _, field := range stage.Fields {
-				fields = append(fields, field)
-			}
+		for _, field := range stage.Fields {
+			fields = append(fields, field)
 		}
 	}
 	return fields
diff --git a/backend/model/config_spider/scrapy.go b/backend/model/config_spider/scrapy.go
index 5d6eab22..7503b9bf 100644
--- a/backend/model/config_spider/scrapy.go
+++ b/backend/model/config_spider/scrapy.go
@@ -72,7 +72,7 @@ func (g ScrapyGenerator) ProcessSpider() error {
 	filePath := filepath.Join(src, "config_spider", "spiders", "spider.py")
 
 	// replace start_stage
-	if err := utils.SetFileVariable(filePath, constants.AnchorStartStage, GetStartStageName(g.ConfigData)); err != nil {
+	if err := utils.SetFileVariable(filePath, constants.AnchorStartStage, "parse_"+GetStartStageName(g.ConfigData)); err != nil {
 		return err
 	}
 
@@ -96,15 +96,15 @@ func (g ScrapyGenerator) ProcessSpider() error {
 
 func (g ScrapyGenerator) GetParserString(stageName string, stage entity.Stage) string {
 	// build the function definition line
-	strDef := g.PadCode(fmt.Sprintf("def %s(self, response):", stageName), 1)
+	strDef := g.PadCode(fmt.Sprintf("def parse_%s(self, response):", stageName), 1)
 
 	strParse := ""
 	if stage.IsList {
 		// list logic
-		strParse = g.GetListParserString(stage)
+		strParse = g.GetListParserString(stageName, stage)
 	} else {
 		// non-list logic
-		strParse = g.GetNonListParserString(stage)
+		strParse = g.GetNonListParserString(stageName, stage)
 	}
 
 	// construct
@@ -116,14 +116,14 @@ func (g ScrapyGenerator) GetParserString(stageName string, stage entity.Stage) s
 func (g ScrapyGenerator) PadCode(str string, num int) string {
 	res := ""
 	for i := 0; i < num; i++ {
-		res += "\t"
+		res += "    "
 	}
 	res += str
 	res += "\n"
 	return res
 }
 
-func (g ScrapyGenerator) GetNonListParserString(stage entity.Stage) string {
+func (g ScrapyGenerator) GetNonListParserString(stageName string, stage entity.Stage) string {
 	str := ""
 
 	// get or construct the item
@@ -133,9 +133,9 @@ func (g ScrapyGenerator) GetNonListParserString(stage entity.Stage) string {
 	for _, f := range stage.Fields {
 		line := ""
 		if f.Attr == "" {
-			line += fmt.Sprintf(`item['%s'] = response.css('%s::text).extract_first()')`, f.Name, f.Css)
+			line += fmt.Sprintf(`item['%s'] = response.css('%s::text').extract_first()`, f.Name, f.Css)
 		} else {
-			line += fmt.Sprintf(`item['%s'] = response.css('%s::attr("%s")).extract_first()')`, f.Name, f.Css, f.Attr)
+			line += fmt.Sprintf(`item['%s'] = response.css('%s::attr("%s")').extract_first()`, f.Name, f.Css, f.Attr)
 		}
 		line = g.PadCode(line, 2)
 		str += line
@@ -144,7 +144,7 @@ func (g ScrapyGenerator) GetNonListParserString(stage entity.Stage) string {
 	// next stage field
 	if f, err := g.GetNextStageField(stage); err == nil {
 		// if a next stage field is found, chain to the next callback
-		str += g.PadCode(fmt.Sprintf(`yield scrapy.Request(url="item['%s']", callback='%s', meta={'item': item})`, f.Name, f.NextStage), 2)
+		str += g.PadCode(fmt.Sprintf(`yield scrapy.Request(url=get_real_url(response, item['%s']), callback=self.parse_%s, meta={'item': item})`, f.Name, f.NextStage), 2)
 	} else {
 		// if no next stage field is found, yield the item
 		str += g.PadCode(fmt.Sprintf(`yield item`), 2)
@@ -156,14 +156,14 @@ func (g ScrapyGenerator) GetNonListParserString(stage entity.Stage) string {
 	return str
 }
 
-func (g ScrapyGenerator) GetListParserString(stage entity.Stage) string {
+func (g ScrapyGenerator) GetListParserString(stageName string, stage entity.Stage) string {
 	str := ""
 
 	// get the item from the previous stage
 	str += g.PadCode(`prev_item = response.meta.get('item')`, 2)
 
 	// loop over the list elements
-	str += g.PadCode(fmt.Sprintf(`for elem in response.css('%s')`, stage.ListCss), 2)
+	str += g.PadCode(fmt.Sprintf(`for elem in response.css('%s'):`, stage.ListCss), 2)
 
 	// construct the item
 	str += g.PadCode(`item = Item()`, 3)
@@ -172,9 +172,9 @@ func (g ScrapyGenerator) GetListParserString(stage entity.Stage) string {
 	for _, f := range stage.Fields {
 		line := ""
 		if f.Attr == "" {
-			line += fmt.Sprintf(`item['%s'] = elem.css('%s::text).extract_first()')`, f.Name, f.Css)
+			line += fmt.Sprintf(`item['%s'] = elem.css('%s::text').extract_first()`, f.Name, f.Css)
 		} else {
-			line += fmt.Sprintf(`item['%s'] = elem.css('%s::attr("%s")).extract_first()')`, f.Name, f.Css, f.Attr)
+			line += fmt.Sprintf(`item['%s'] = elem.css('%s::attr("%s")').extract_first()`, f.Name, f.Css, f.Attr)
 		}
 		line = g.PadCode(line, 3)
 		str += line
@@ -188,7 +188,7 @@ func (g ScrapyGenerator) GetListParserString(stage entity.Stage) string {
 	// next stage field
 	if f, err := g.GetNextStageField(stage); err == nil {
 		// if a next stage field is found, chain to the next callback
-		str += g.PadCode(fmt.Sprintf(`yield scrapy.Request(url="item['%s']", callback='%s', meta={'item': item})`, f.Name, f.NextStage), 3)
+		str += g.PadCode(fmt.Sprintf(`yield scrapy.Request(url=get_real_url(response, item['%s']), callback=self.parse_%s, meta={'item': item})`, f.Name, f.NextStage), 3)
 	} else {
 		// if no next stage field is found, yield the item
 		str += g.PadCode(fmt.Sprintf(`yield item`), 3)
@@ -196,8 +196,14 @@ func (g ScrapyGenerator) GetListParserString(stage entity.Stage) string {
 
 	// pagination
 	if stage.PageCss != "" {
-		str += g.PadCode(fmt.Sprintf(`next_url = response.css('%s').extract_first()`, stage.PageCss), 2)
-		str += g.PadCode(`yield scrapy.Request(url=next_url, meta={'item': item})`, 2)
+		// pagination element attribute, defaults to href
+		pageAttr := "href"
+		if stage.PageAttr != "" {
+			pageAttr = stage.PageAttr
+		}
+
+		str += g.PadCode(fmt.Sprintf(`next_url = response.css('%s::attr("%s")').extract_first()`, stage.PageCss, pageAttr), 2)
+		str += g.PadCode(fmt.Sprintf(`yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_%s, meta={'item': item})`, stageName), 2)
 	}
 
 	// append the trailing newline
diff --git a/backend/template/Spiderfile b/backend/template/Spiderfile
index 7c9c524c..8d0e05cf 100644
--- a/backend/template/Spiderfile
+++ b/backend/template/Spiderfile
@@ -1,21 +1,25 @@
-version: 0.4.0
-start_url: "https://baidu.com/s?wd=crawlab"
-start_stage: "stage_4"
+version: "0.4.0"
+name: "toscrapy_books"
+start_url: "http://books.toscrape.com"
+start_stage: "list"
 engine: "scrapy"
 stages:
-  stage_1:
+  list:
     is_list: true # default: false
-    list_css: "#content_left > .result"
-    page_css: "#page > a.n:last-child"
+    list_css: "section article.product_pod"
+    page_css: "ul.pager li.next a"
+    page_attr: "href" # default: href
     fields:
     - name: "title"
-      css: "a"
+      css: "h3 > a"
     - name: "url"
-      css: "a"
+      css: "h3 > a"
       attr: "href"
-      next_stage: "stage_2"
-  stage_2:
+      next_stage: "detail"
+    - name: "price"
+      css: ".product_price > .price_color"
+  detail:
     is_list: false
     fields:
-    - name: "stage_2_field_1"
-      css: "a"
+    - name: "description"
+      css: "#product_description + p"
diff --git a/backend/template/scrapy/config_spider/spiders/spider.py b/backend/template/scrapy/config_spider/spiders/spider.py
index f40b0e57..0e3c661d 100644
--- a/backend/template/scrapy/config_spider/spiders/spider.py
+++ b/backend/template/scrapy/config_spider/spiders/spider.py
@@ -1,12 +1,18 @@
 # -*- coding: utf-8 -*-
 import scrapy
+import re
 from config_spider.items import Item
+from urllib.parse import urljoin
 
+def get_real_url(response, url):
+    if re.search(r'^https?|^\/\/', url):
+        return url
+    return urljoin(response.url, url)
 
 class ConfigSpider(scrapy.Spider):
     name = 'config_spider'
 
     def start_requests(self):
-        return scrapy.Request(url='###START_URL###', callback='###START_STAGE###')
+        yield scrapy.Request(url='###START_URL###', callback=self.###START_STAGE###)
 
 ###PARSERS###
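
For illustration, below is a rough sketch of the spider module this generator would emit for the sample Spiderfile above (books.toscrape.com), with the ###START_URL### and ###START_STAGE### placeholders filled in and the parse_<stage> methods produced by GetListParserString / GetNonListParserString. It is a sketch under assumptions, not verbatim generator output: the Item class comes from the generated project's config_spider/items.py, and generator behaviour not shown in this patch (for example, how fields from the previous stage's item are merged in) is elided.

# -*- coding: utf-8 -*-
import scrapy
import re
from config_spider.items import Item  # provided by the generated project scaffold
from urllib.parse import urljoin

def get_real_url(response, url):
    # Absolute and protocol-relative URLs pass through; everything else is
    # resolved against the current response URL.
    if re.search(r'^https?|^\/\/', url):
        return url
    return urljoin(response.url, url)

class ConfigSpider(scrapy.Spider):
    name = 'config_spider'

    def start_requests(self):
        # start_stage is "list", so the start callback becomes parse_list
        yield scrapy.Request(url='http://books.toscrape.com', callback=self.parse_list)

    def parse_list(self, response):
        prev_item = response.meta.get('item')  # merging of previous-stage fields is elided here
        for elem in response.css('section article.product_pod'):
            item = Item()
            item['title'] = elem.css('h3 > a::text').extract_first()
            item['url'] = elem.css('h3 > a::attr("href")').extract_first()
            item['price'] = elem.css('.product_price > .price_color::text').extract_first()
            # the "url" field declares next_stage: "detail", so follow it
            yield scrapy.Request(url=get_real_url(response, item['url']), callback=self.parse_detail, meta={'item': item})
        # pagination: page_css combined with page_attr (default "href");
        # mirrors the generated code, which assumes at least one list element and a next-page link
        next_url = response.css('ul.pager li.next a::attr("href")').extract_first()
        yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_list, meta={'item': item})

    def parse_detail(self, response):
        item = response.meta.get('item')
        item['description'] = response.css('#product_description + p::text').extract_first()
        # no field in this stage declares next_stage, so emit the item
        yield item

The get_real_url helper is what lets the Spiderfile use relative href values for next_stage and pagination links, since scrapy.Request requires absolute URLs.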