将可配置爬虫stages调整为列表

This commit is contained in:
marvzhang
2019-12-13 12:55:53 +08:00
parent be9598abbc
commit a067c1c1ad
8 changed files with 52 additions and 65 deletions

View File

@@ -5,7 +5,7 @@ type ConfigSpiderData struct {
Engine string `yaml:"engine" json:"engine"`
StartUrl string `yaml:"start_url" json:"start_url"`
StartStage string `yaml:"start_stage" json:"start_stage"`
Stages map[string]Stage `yaml:"stages" json:"stages"`
Stages []Stage `yaml:"stages" json:"stages"`
Settings map[string]string `yaml:"settings" json:"settings"`
}

View File

@@ -15,16 +15,12 @@ func GetAllFields(data entity.ConfigSpiderData) []entity.Field {
func GetStartStageName(data entity.ConfigSpiderData) string {
// 如果设置了 start_stage,则直接返回(不再校验其是否存在于 stages 中)
if data.StartStage != "" {
for stageName := range data.Stages {
if stageName == data.StartStage {
return data.StartStage
}
}
return data.StartStage
}
// 否则返回第一个 stage
for stageName := range data.Stages {
return stageName
for _, stage := range data.Stages {
return stage.Name
}
return ""
}

View File

@@ -83,7 +83,8 @@ func (g ScrapyGenerator) ProcessSpider() error {
// 替换 parsers
strParser := ""
for stageName, stage := range g.ConfigData.Stages {
for _, stage := range g.ConfigData.Stages {
stageName := stage.Name
stageStr := g.GetParserString(stageName, stage)
strParser += stageStr
}

View File

@@ -319,11 +319,5 @@ func GetConfigSpiderData(spider Spider) (entity.ConfigSpiderData, error) {
return configData, err
}
// 赋值 stage_name
for stageName, stage := range configData.Stages {
stage.Name = stageName
configData.Stages[stageName] = stage
}
return configData, nil
}

View File

@@ -61,7 +61,9 @@ func ValidateSpiderfile(configData entity.ConfigSpiderData) error {
// 校验stages
dict := map[string]int{}
for stageName, stage := range configData.Stages {
for _, stage := range configData.Stages {
stageName := stage.Name
// stage 名称不能为空
if stageName == "" {
return errors.New("spiderfile invalid: stage name is empty")
@@ -152,12 +154,6 @@ func IsUniqueConfigSpiderFields(fields []entity.Field) bool {
func ProcessSpiderFilesFromConfigData(spider model.Spider, configData entity.ConfigSpiderData) error {
spiderDir := spider.Src
// 赋值 stage_name
for stageName, stage := range configData.Stages {
stage.Name = stageName
configData.Stages[stageName] = stage
}
// 删除已有的爬虫文件
for _, fInfo := range utils.ListDir(spiderDir) {
// 不删除Spiderfile

View File

@@ -4,17 +4,17 @@ start_url: "http://news.163.com/special/0001386F/rank_news.html"
start_stage: "list"
engine: "scrapy"
stages:
list:
is_list: true
list_css: "table tr:not(:first-child)"
fields:
- name: "title"
css: "td:nth-child(1) > a"
- name: "url"
css: "td:nth-child(1) > a"
attr: "href"
- name: "clicks"
css: "td.cBlue"
- name: list
is_list: true
list_css: "table tr:not(:first-child)"
fields:
- name: "title"
css: "td:nth-child(1) > a"
- name: "url"
css: "td:nth-child(1) > a"
attr: "href"
- name: "clicks"
css: "td.cBlue"
settings:
ROBOTSTXT_OBEY: false
USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36

View File

@@ -4,19 +4,19 @@ start_url: http://www.baidu.com/s?wd=crawlab
start_stage: list
engine: scrapy
stages:
list:
is_list: true
list_xpath: //*[contains(@class, "c-container")]
page_xpath: //*[@id="page"]//a[@class="n"][last()]
page_attr: href
fields:
- name: title
xpath: .//h3/a
- name: url
xpath: .//h3/a
attr: href
- name: abstract
xpath: .//*[@class="c-abstract"]
- name: list
is_list: true
list_xpath: //*[contains(@class, "c-container")]
page_xpath: //*[@id="page"]//a[@class="n"][last()]
page_attr: href
fields:
- name: title
xpath: .//h3/a
- name: url
xpath: .//h3/a
attr: href
- name: abstract
xpath: .//*[@class="c-abstract"]
settings:
ROBOTSTXT_OBEY: false
USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36

View File

@@ -4,25 +4,25 @@ start_url: "http://books.toscrape.com"
start_stage: "list"
engine: "scrapy"
stages:
list:
is_list: true
list_css: "section article.product_pod"
page_css: "ul.pager li.next a"
page_attr: "href"
fields:
- name: "title"
css: "h3 > a"
- name: "url"
css: "h3 > a"
attr: "href"
next_stage: "detail"
- name: "price"
css: ".product_price > .price_color"
detail:
is_list: false
fields:
- name: "description"
css: "#product_description + p"
- name: list
is_list: true
list_css: "section article.product_pod"
page_css: "ul.pager li.next a"
page_attr: "href"
fields:
- name: "title"
css: "h3 > a"
- name: "url"
css: "h3 > a"
attr: "href"
next_stage: "detail"
- name: "price"
css: ".product_price > .price_color"
- name: detail
is_list: false
fields:
- name: "description"
css: "#product_description + p"
settings:
ROBOTSTXT_OBEY: true
AUTOTHROTTLE_ENABLED: true