mirror of
https://github.com/crawlab-team/crawlab.git
synced 2026-01-22 17:31:03 +01:00
加入xpath
This commit is contained in:
@@ -9,11 +9,13 @@ type Field struct {
|
||||
}
|
||||
|
||||
type Stage struct {
|
||||
IsList bool `yaml:"is_list" json:"is_list"`
|
||||
ListCss string `yaml:"list_css" json:"list_css"`
|
||||
PageCss string `yaml:"page_css" json:"page_css"`
|
||||
PageAttr string `yaml:"page_attr" json:"page_attr"`
|
||||
Fields []Field `yaml:"fields" json:"fields"`
|
||||
IsList bool `yaml:"is_list" json:"is_list"`
|
||||
ListCss string `yaml:"list_css" json:"list_css"`
|
||||
ListXpath string `yaml:"list_xpath" json:"list_xpath"`
|
||||
PageCss string `yaml:"page_css" json:"page_css"`
|
||||
PageXpath string `yaml:"page_xpath" json:"page_xpath"`
|
||||
PageAttr string `yaml:"page_attr" json:"page_attr"`
|
||||
Fields []Field `yaml:"fields" json:"fields"`
|
||||
}
|
||||
|
||||
type ConfigSpiderData struct {
|
||||
|
||||
@@ -131,12 +131,7 @@ func (g ScrapyGenerator) GetNonListParserString(stageName string, stage entity.S
|
||||
|
||||
// 遍历字段列表
|
||||
for _, f := range stage.Fields {
|
||||
line := ""
|
||||
if f.Attr == "" {
|
||||
line += fmt.Sprintf(`item['%s'] = response.css('%s::text').extract_first()`, f.Name, f.Css)
|
||||
} else {
|
||||
line += fmt.Sprintf(`item['%s'] = response.css('%s::attr("%s")').extract_first()`, f.Name, f.Css, f.Attr)
|
||||
}
|
||||
line := fmt.Sprintf(`item['%s'] = response.%s.extract_first()`, f.Name, g.GetExtractStringFromField(f))
|
||||
line = g.PadCode(line, 2)
|
||||
str += line
|
||||
}
|
||||
@@ -170,12 +165,7 @@ func (g ScrapyGenerator) GetListParserString(stageName string, stage entity.Stag
|
||||
|
||||
// 遍历字段列表
|
||||
for _, f := range stage.Fields {
|
||||
line := ""
|
||||
if f.Attr == "" {
|
||||
line += fmt.Sprintf(`item['%s'] = elem.css('%s::text').extract_first()`, f.Name, f.Css)
|
||||
} else {
|
||||
line += fmt.Sprintf(`item['%s'] = elem.css('%s::attr("%s")').extract_first()`, f.Name, f.Css, f.Attr)
|
||||
}
|
||||
line := fmt.Sprintf(`item['%s'] = elem.%s.extract_first()`, f.Name, g.GetExtractStringFromField(f))
|
||||
line = g.PadCode(line, 3)
|
||||
str += line
|
||||
}
|
||||
@@ -195,14 +185,8 @@ func (g ScrapyGenerator) GetListParserString(stageName string, stage entity.Stag
|
||||
}
|
||||
|
||||
// 分页
|
||||
if stage.PageCss != "" {
|
||||
// 分页元素属性,默认为 href
|
||||
pageAttr := "href"
|
||||
if stage.PageAttr != "" {
|
||||
pageAttr = stage.PageAttr
|
||||
}
|
||||
|
||||
str += g.PadCode(fmt.Sprintf(`next_url = response.css('%s::attr("%s")').extract_first()`, stage.PageCss, pageAttr), 2)
|
||||
if stage.PageCss != "" || stage.PageXpath != "" {
|
||||
str += g.PadCode(fmt.Sprintf(`next_url = response.%s.extract_first()`, g.GetExtractStringFromStage(stage)), 2)
|
||||
str += g.PadCode(fmt.Sprintf(`yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_%s, meta={'item': item})`, stageName), 2)
|
||||
}
|
||||
|
||||
@@ -226,3 +210,41 @@ func (g ScrapyGenerator) GetNextStageField(stage entity.Stage) (entity.Field, er
|
||||
}
|
||||
return entity.Field{}, errors.New("cannot find next stage field")
|
||||
}
|
||||
|
||||
func (g ScrapyGenerator) GetExtractStringFromField(f entity.Field) string {
|
||||
if f.Css != "" {
|
||||
// 如果为CSS
|
||||
if f.Attr == "" {
|
||||
// 文本
|
||||
return fmt.Sprintf(`css(%s::text())`, f.Css)
|
||||
} else {
|
||||
// 属性
|
||||
return fmt.Sprintf(`css(%s::attr("%s"))`, f.Css, f.Attr)
|
||||
}
|
||||
} else {
|
||||
// 如果为XPath
|
||||
if f.Attr == "" {
|
||||
// 文本
|
||||
return fmt.Sprintf(`xpath(%s/text())`, f.Xpath)
|
||||
} else {
|
||||
// 属性
|
||||
return fmt.Sprintf(`xpath(%s/@%s)`, f.Xpath, f.Attr)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (g ScrapyGenerator) GetExtractStringFromStage(stage entity.Stage) string {
|
||||
// 分页元素属性,默认为 href
|
||||
pageAttr := "href"
|
||||
if stage.PageAttr != "" {
|
||||
pageAttr = stage.PageAttr
|
||||
}
|
||||
|
||||
if stage.PageCss != "" {
|
||||
// 如果为CSS
|
||||
return fmt.Sprintf(`css(%s::attr("%s"))`, stage.PageCss, pageAttr)
|
||||
} else {
|
||||
// 如果为XPath
|
||||
return fmt.Sprintf(`xpath(%s/@%s)`, stage.PageXpath, pageAttr)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -92,12 +92,22 @@ func ValidateSpiderfile(configData entity.ConfigSpiderData) error {
|
||||
hasNextStage = true
|
||||
}
|
||||
|
||||
// 字段里 CSS 和 XPath 只能包含一个
|
||||
// 字段里 css 和 xpath 只能包含一个
|
||||
if field.Css != "" && field.Xpath != "" {
|
||||
return errors.New(fmt.Sprintf("spiderfile invalid: field '%s' in stage '%s' has both CSS and XPath set which is prohibited", field.Name, stageName))
|
||||
return errors.New(fmt.Sprintf("spiderfile invalid: field '%s' in stage '%s' has both css and xpath set which is prohibited", field.Name, stageName))
|
||||
}
|
||||
}
|
||||
|
||||
// stage 里 page_css 和 page_xpath 只能包含一个
|
||||
if stage.PageCss != "" && stage.PageXpath != "" {
|
||||
return errors.New(fmt.Sprintf("spiderfile invalid: stage '%s' has both page_css and page_xpath set which is prohibited", stageName))
|
||||
}
|
||||
|
||||
// stage 里 list_css 和 list_xpath 只能包含一个
|
||||
if stage.ListCss != "" && stage.ListXpath != "" {
|
||||
return errors.New(fmt.Sprintf("spiderfile invalid: stage '%s' has both list_css and list_xpath set which is prohibited", stageName))
|
||||
}
|
||||
|
||||
// 如果 stage 的 is_list 为 true 但 list_css 为空,报错
|
||||
if stage.IsList && stage.ListCss == "" {
|
||||
return errors.New("spiderfile invalid: stage with is_list = true should have list_css being set")
|
||||
|
||||
Reference in New Issue
Block a user