mirror of
https://github.com/crawlab-team/crawlab.git
synced 2026-01-22 17:31:03 +01:00
将可配置爬虫stages调整为列表
This commit is contained in:
@@ -5,7 +5,7 @@ type ConfigSpiderData struct {
|
||||
Engine string `yaml:"engine" json:"engine"`
|
||||
StartUrl string `yaml:"start_url" json:"start_url"`
|
||||
StartStage string `yaml:"start_stage" json:"start_stage"`
|
||||
Stages map[string]Stage `yaml:"stages" json:"stages"`
|
||||
Stages []Stage `yaml:"stages" json:"stages"`
|
||||
Settings map[string]string `yaml:"settings" json:"settings"`
|
||||
}
|
||||
|
||||
|
||||
@@ -15,16 +15,12 @@ func GetAllFields(data entity.ConfigSpiderData) []entity.Field {
|
||||
func GetStartStageName(data entity.ConfigSpiderData) string {
|
||||
// 如果 start_stage 设置了且在 stages 里,则返回
|
||||
if data.StartStage != "" {
|
||||
for stageName := range data.Stages {
|
||||
if stageName == data.StartStage {
|
||||
return data.StartStage
|
||||
}
|
||||
}
|
||||
return data.StartStage
|
||||
}
|
||||
|
||||
// 否则返回第一个 stage
|
||||
for stageName := range data.Stages {
|
||||
return stageName
|
||||
for _, stage := range data.Stages {
|
||||
return stage.Name
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
@@ -83,7 +83,8 @@ func (g ScrapyGenerator) ProcessSpider() error {
|
||||
|
||||
// 替换 parsers
|
||||
strParser := ""
|
||||
for stageName, stage := range g.ConfigData.Stages {
|
||||
for _, stage := range g.ConfigData.Stages {
|
||||
stageName := stage.Name
|
||||
stageStr := g.GetParserString(stageName, stage)
|
||||
strParser += stageStr
|
||||
}
|
||||
|
||||
@@ -319,11 +319,5 @@ func GetConfigSpiderData(spider Spider) (entity.ConfigSpiderData, error) {
|
||||
return configData, err
|
||||
}
|
||||
|
||||
// 赋值 stage_name
|
||||
for stageName, stage := range configData.Stages {
|
||||
stage.Name = stageName
|
||||
configData.Stages[stageName] = stage
|
||||
}
|
||||
|
||||
return configData, nil
|
||||
}
|
||||
|
||||
@@ -61,7 +61,9 @@ func ValidateSpiderfile(configData entity.ConfigSpiderData) error {
|
||||
|
||||
// 校验stages
|
||||
dict := map[string]int{}
|
||||
for stageName, stage := range configData.Stages {
|
||||
for _, stage := range configData.Stages {
|
||||
stageName := stage.Name
|
||||
|
||||
// stage 名称不能为空
|
||||
if stageName == "" {
|
||||
return errors.New("spiderfile invalid: stage name is empty")
|
||||
@@ -152,12 +154,6 @@ func IsUniqueConfigSpiderFields(fields []entity.Field) bool {
|
||||
func ProcessSpiderFilesFromConfigData(spider model.Spider, configData entity.ConfigSpiderData) error {
|
||||
spiderDir := spider.Src
|
||||
|
||||
// 赋值 stage_name
|
||||
for stageName, stage := range configData.Stages {
|
||||
stage.Name = stageName
|
||||
configData.Stages[stageName] = stage
|
||||
}
|
||||
|
||||
// 删除已有的爬虫文件
|
||||
for _, fInfo := range utils.ListDir(spiderDir) {
|
||||
// 不删除Spiderfile
|
||||
|
||||
@@ -4,17 +4,17 @@ start_url: "http://news.163.com/special/0001386F/rank_news.html"
|
||||
start_stage: "list"
|
||||
engine: "scrapy"
|
||||
stages:
|
||||
list:
|
||||
is_list: true
|
||||
list_css: "table tr:not(:first-child)"
|
||||
fields:
|
||||
- name: "title"
|
||||
css: "td:nth-child(1) > a"
|
||||
- name: "url"
|
||||
css: "td:nth-child(1) > a"
|
||||
attr: "href"
|
||||
- name: "clicks"
|
||||
css: "td.cBlue"
|
||||
- name: list
|
||||
is_list: true
|
||||
list_css: "table tr:not(:first-child)"
|
||||
fields:
|
||||
- name: "title"
|
||||
css: "td:nth-child(1) > a"
|
||||
- name: "url"
|
||||
css: "td:nth-child(1) > a"
|
||||
attr: "href"
|
||||
- name: "clicks"
|
||||
css: "td.cBlue"
|
||||
settings:
|
||||
ROBOTSTXT_OBEY: false
|
||||
USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36
|
||||
|
||||
@@ -4,19 +4,19 @@ start_url: http://www.baidu.com/s?wd=crawlab
|
||||
start_stage: list
|
||||
engine: scrapy
|
||||
stages:
|
||||
list:
|
||||
is_list: true
|
||||
list_xpath: //*[contains(@class, "c-container")]
|
||||
page_xpath: //*[@id="page"]//a[@class="n"][last()]
|
||||
page_attr: href
|
||||
fields:
|
||||
- name: title
|
||||
xpath: .//h3/a
|
||||
- name: url
|
||||
xpath: .//h3/a
|
||||
attr: href
|
||||
- name: abstract
|
||||
xpath: .//*[@class="c-abstract"]
|
||||
- name: list
|
||||
is_list: true
|
||||
list_xpath: //*[contains(@class, "c-container")]
|
||||
page_xpath: //*[@id="page"]//a[@class="n"][last()]
|
||||
page_attr: href
|
||||
fields:
|
||||
- name: title
|
||||
xpath: .//h3/a
|
||||
- name: url
|
||||
xpath: .//h3/a
|
||||
attr: href
|
||||
- name: abstract
|
||||
xpath: .//*[@class="c-abstract"]
|
||||
settings:
|
||||
ROBOTSTXT_OBEY: false
|
||||
USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36
|
||||
|
||||
@@ -4,25 +4,25 @@ start_url: "http://books.toscrape.com"
|
||||
start_stage: "list"
|
||||
engine: "scrapy"
|
||||
stages:
|
||||
list:
|
||||
is_list: true
|
||||
list_css: "section article.product_pod"
|
||||
page_css: "ul.pager li.next a"
|
||||
page_attr: "href"
|
||||
fields:
|
||||
- name: "title"
|
||||
css: "h3 > a"
|
||||
- name: "url"
|
||||
css: "h3 > a"
|
||||
attr: "href"
|
||||
next_stage: "detail"
|
||||
- name: "price"
|
||||
css: ".product_price > .price_color"
|
||||
detail:
|
||||
is_list: false
|
||||
fields:
|
||||
- name: "description"
|
||||
css: "#product_description + p"
|
||||
- name: list
|
||||
is_list: true
|
||||
list_css: "section article.product_pod"
|
||||
page_css: "ul.pager li.next a"
|
||||
page_attr: "href"
|
||||
fields:
|
||||
- name: "title"
|
||||
css: "h3 > a"
|
||||
- name: "url"
|
||||
css: "h3 > a"
|
||||
attr: "href"
|
||||
next_stage: "detail"
|
||||
- name: "price"
|
||||
css: ".product_price > .price_color"
|
||||
- name: detail
|
||||
is_list: false
|
||||
fields:
|
||||
- name: "description"
|
||||
css: "#product_description + p"
|
||||
settings:
|
||||
ROBOTSTXT_OBEY: true
|
||||
AUTOTHROTTLE_ENABLED: true
|
||||
|
||||
Reference in New Issue
Block a user