diff --git a/backend/entity/config_spider.go b/backend/entity/config_spider.go
index 5e0fe1e1..d46e092e 100644
--- a/backend/entity/config_spider.go
+++ b/backend/entity/config_spider.go
@@ -9,11 +9,13 @@ type Field struct {
 }
 
 type Stage struct {
-	IsList   bool    `yaml:"is_list" json:"is_list"`
-	ListCss  string  `yaml:"list_css" json:"list_css"`
-	PageCss  string  `yaml:"page_css" json:"page_css"`
-	PageAttr string  `yaml:"page_attr" json:"page_attr"`
-	Fields   []Field `yaml:"fields" json:"fields"`
+	IsList    bool    `yaml:"is_list" json:"is_list"`
+	ListCss   string  `yaml:"list_css" json:"list_css"`
+	ListXpath string  `yaml:"list_xpath" json:"list_xpath"`
+	PageCss   string  `yaml:"page_css" json:"page_css"`
+	PageXpath string  `yaml:"page_xpath" json:"page_xpath"`
+	PageAttr  string  `yaml:"page_attr" json:"page_attr"`
+	Fields    []Field `yaml:"fields" json:"fields"`
 }
 
 type ConfigSpiderData struct {
diff --git a/backend/model/config_spider/scrapy.go b/backend/model/config_spider/scrapy.go
index 7503b9bf..9da6fb89 100644
--- a/backend/model/config_spider/scrapy.go
+++ b/backend/model/config_spider/scrapy.go
@@ -131,12 +131,7 @@ func (g ScrapyGenerator) GetNonListParserString(stageName string, stage entity.S
 
 	// 遍历字段列表
 	for _, f := range stage.Fields {
-		line := ""
-		if f.Attr == "" {
-			line += fmt.Sprintf(`item['%s'] = response.css('%s::text').extract_first()`, f.Name, f.Css)
-		} else {
-			line += fmt.Sprintf(`item['%s'] = response.css('%s::attr("%s")').extract_first()`, f.Name, f.Css, f.Attr)
-		}
+		line := fmt.Sprintf(`item['%s'] = response.%s.extract_first()`, f.Name, g.GetExtractStringFromField(f))
 		line = g.PadCode(line, 2)
 		str += line
 	}
@@ -170,12 +165,7 @@ func (g ScrapyGenerator) GetListParserString(stageName string, stage entity.Stag
 
 	// 遍历字段列表
 	for _, f := range stage.Fields {
-		line := ""
-		if f.Attr == "" {
-			line += fmt.Sprintf(`item['%s'] = elem.css('%s::text').extract_first()`, f.Name, f.Css)
-		} else {
-			line += fmt.Sprintf(`item['%s'] = elem.css('%s::attr("%s")').extract_first()`, f.Name, f.Css, f.Attr)
-		}
+		line := fmt.Sprintf(`item['%s'] = elem.%s.extract_first()`, f.Name, g.GetExtractStringFromField(f))
 		line = g.PadCode(line, 3)
 		str += line
 	}
@@ -195,14 +185,8 @@ func (g ScrapyGenerator) GetListParserString(stageName string, stage entity.Stag
 	}
 
 	// 分页
-	if stage.PageCss != "" {
-		// 分页元素属性,默认为 href
-		pageAttr := "href"
-		if stage.PageAttr != "" {
-			pageAttr = stage.PageAttr
-		}
-
-		str += g.PadCode(fmt.Sprintf(`next_url = response.css('%s::attr("%s")').extract_first()`, stage.PageCss, pageAttr), 2)
+	if stage.PageCss != "" || stage.PageXpath != "" {
+		str += g.PadCode(fmt.Sprintf(`next_url = response.%s.extract_first()`, g.GetExtractStringFromStage(stage)), 2)
 		str += g.PadCode(fmt.Sprintf(`yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_%s, meta={'item': item})`, stageName), 2)
 	}
 
@@ -226,3 +210,42 @@ func (g ScrapyGenerator) GetNextStageField(stage entity.Stage) (entity.Field, er
 	}
 	return entity.Field{}, errors.New("cannot find next stage field")
 }
+
+// GetExtractStringFromField returns the Scrapy selector call, without its
+// "response."/"elem." receiver, that extracts field f: css(...) when f.Css is
+// set, xpath(...) otherwise. An empty f.Attr selects the node text; a non-empty
+// one selects that attribute. The selector is emitted inside single quotes so
+// the generated Python line is a valid call.
+func (g ScrapyGenerator) GetExtractStringFromField(f entity.Field) string {
+	if f.Css != "" {
+		if f.Attr == "" {
+			// CSS, text content
+			return fmt.Sprintf(`css('%s::text')`, f.Css)
+		}
+		// CSS, attribute value
+		return fmt.Sprintf(`css('%s::attr("%s")')`, f.Css, f.Attr)
+	}
+
+	if f.Attr == "" {
+		// XPath, text content
+		return fmt.Sprintf(`xpath('%s/text()')`, f.Xpath)
+	}
+	// XPath, attribute value
+	return fmt.Sprintf(`xpath('%s/@%s')`, f.Xpath, f.Attr)
+}
+
+// GetExtractStringFromStage returns the Scrapy selector call that extracts the
+// pagination URL of stage: css(...) when stage.PageCss is set, xpath(...) on
+// stage.PageXpath otherwise. The attribute read is stage.PageAttr, or href
+// when that is empty.
+func (g ScrapyGenerator) GetExtractStringFromStage(stage entity.Stage) string {
+	pageAttr := "href"
+	if stage.PageAttr != "" {
+		pageAttr = stage.PageAttr
+	}
+
+	if stage.PageCss != "" {
+		return fmt.Sprintf(`css('%s::attr("%s")')`, stage.PageCss, pageAttr)
+	}
+	return fmt.Sprintf(`xpath('%s/@%s')`, stage.PageXpath, pageAttr)
+}
diff --git a/backend/services/config_spider.go b/backend/services/config_spider.go
index 91b474f7..35bb2790 100644
--- a/backend/services/config_spider.go
+++ b/backend/services/config_spider.go
@@ -92,12 +92,22 @@ func ValidateSpiderfile(configData entity.ConfigSpiderData) error {
 				hasNextStage = true
 			}
 
-			// 字段里 CSS 和 XPath 只能包含一个
+			// A field may set only one of css and xpath.
 			if field.Css != "" && field.Xpath != "" {
-				return errors.New(fmt.Sprintf("spiderfile invalid: field '%s' in stage '%s' has both CSS and XPath set which is prohibited", field.Name, stageName))
+				return fmt.Errorf("spiderfile invalid: field '%s' in stage '%s' has both css and xpath set which is prohibited", field.Name, stageName)
 			}
 		}
 
+		// A stage may set only one of page_css and page_xpath.
+		if stage.PageCss != "" && stage.PageXpath != "" {
+			return fmt.Errorf("spiderfile invalid: stage '%s' has both page_css and page_xpath set which is prohibited", stageName)
+		}
+
+		// A stage may set only one of list_css and list_xpath.
+		if stage.ListCss != "" && stage.ListXpath != "" {
+			return fmt.Errorf("spiderfile invalid: stage '%s' has both list_css and list_xpath set which is prohibited", stageName)
+		}
+
 		// 如果 stage 的 is_list 为 true 但 list_css 为空,报错
 		if stage.IsList && stage.ListCss == "" {
 			return errors.New("spiderfile invalid: stage with is_list = true should have list_css being set")