Update configurable spider and fix several issues

marvzhang
2019-11-24 19:45:21 +08:00
parent 8b451c8b2b
commit 5a286f98af
6 changed files with 53 additions and 38 deletions

View File

@@ -1,5 +1,5 @@
 package constants

-const ScrapyProtectedStageNames = "start_requests"
+const ScrapyProtectedStageNames = ""
 const ScrapyProtectedFieldNames = "_id,task_id,ts"
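Emptying this constant is presumably safe because the generator now prefixes every stage parser with parse_ (see the generator changes below). A minimal sketch of the collision the old protection likely guarded against, assuming a user had named a stage start_requests:

import scrapy

# Hypothetical illustration, not code from this commit: without the parse_
# prefix, a stage named "start_requests" would generate a parser that shadows
# Scrapy's own entry point, whose real signature takes no response argument.
class BrokenSpider(scrapy.Spider):
    name = 'broken'

    def start_requests(self, response):  # clashes with scrapy.Spider.start_requests(self)
        pass

With the prefix, the same stage becomes parse_start_requests, so no stage name can collide with a framework method.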

View File

@@ -9,10 +9,11 @@ type Field struct {
 }

 type Stage struct {
-    IsList  bool    `yaml:"is_list" json:"is_list"`
-    ListCss string  `yaml:"list_css" json:"list_css"`
-    PageCss string  `yaml:"page_css" json:"page_css"`
-    Fields  []Field `yaml:"fields" json:"fields"`
+    IsList   bool    `yaml:"is_list" json:"is_list"`
+    ListCss  string  `yaml:"list_css" json:"list_css"`
+    PageCss  string  `yaml:"page_css" json:"page_css"`
+    PageAttr string  `yaml:"page_attr" json:"page_attr"`
+    Fields   []Field `yaml:"fields" json:"fields"`
 }

 type ConfigSpiderData struct {

View File

@@ -5,10 +5,8 @@ import "crawlab/entity"
 func GetAllFields(data entity.ConfigSpiderData) []entity.Field {
     var fields []entity.Field
     for _, stage := range data.Stages {
-        if stage.IsList {
-            for _, field := range stage.Fields {
-                fields = append(fields, field)
-            }
+        for _, field := range stage.Fields {
+            fields = append(fields, field)
         }
     }
     return fields

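With the is_list guard removed, GetAllFields now collects fields from every stage, non-list ones included. Assuming the items.py template declares one scrapy.Field per collected name plus the protected fields (that template is not part of this diff), the Item generated for the sample Spiderfile below would look roughly like:

import scrapy

# A sketch, not the actual generated file: field names are taken from the
# sample Spiderfile in this commit; _id, task_id and ts come from
# ScrapyProtectedFieldNames above.
class Item(scrapy.Item):
    _id = scrapy.Field()
    task_id = scrapy.Field()
    ts = scrapy.Field()
    title = scrapy.Field()
    url = scrapy.Field()
    price = scrapy.Field()
    description = scrapy.Field()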
View File

@@ -72,7 +72,7 @@ func (g ScrapyGenerator) ProcessSpider() error {
     filePath := filepath.Join(src, "config_spider", "spiders", "spider.py")

     // replace start_stage
-    if err := utils.SetFileVariable(filePath, constants.AnchorStartStage, GetStartStageName(g.ConfigData)); err != nil {
+    if err := utils.SetFileVariable(filePath, constants.AnchorStartStage, "parse_"+GetStartStageName(g.ConfigData)); err != nil {
         return err
     }
@@ -96,15 +96,15 @@ func (g ScrapyGenerator) ProcessSpider() error {
 func (g ScrapyGenerator) GetParserString(stageName string, stage entity.Stage) string {
     // construct the function definition line
-    strDef := g.PadCode(fmt.Sprintf("def %s(self, response):", stageName), 1)
+    strDef := g.PadCode(fmt.Sprintf("def parse_%s(self, response):", stageName), 1)

     strParse := ""
     if stage.IsList {
         // list logic
-        strParse = g.GetListParserString(stage)
+        strParse = g.GetListParserString(stageName, stage)
     } else {
         // non-list logic
-        strParse = g.GetNonListParserString(stage)
+        strParse = g.GetNonListParserString(stageName, stage)
     }

     // construct
@@ -116,14 +116,14 @@ func (g ScrapyGenerator) GetParserString(stageName string, stage entity.Stage) s
 func (g ScrapyGenerator) PadCode(str string, num int) string {
     res := ""
     for i := 0; i < num; i++ {
-        res += "\t"
+        res += "    "
     }
     res += str
     res += "\n"
     return res
 }

-func (g ScrapyGenerator) GetNonListParserString(stage entity.Stage) string {
+func (g ScrapyGenerator) GetNonListParserString(stageName string, stage entity.Stage) string {
     str := ""

     // get or construct the item
@@ -133,9 +133,9 @@ func (g ScrapyGenerator) GetNonListParserString(stage entity.Stage) string {
     for _, f := range stage.Fields {
         line := ""
         if f.Attr == "" {
-            line += fmt.Sprintf(`item['%s'] = response.css('%s::text).extract_first()')`, f.Name, f.Css)
+            line += fmt.Sprintf(`item['%s'] = response.css('%s::text').extract_first()`, f.Name, f.Css)
         } else {
-            line += fmt.Sprintf(`item['%s'] = response.css('%s::attr("%s")).extract_first()')`, f.Name, f.Css, f.Attr)
+            line += fmt.Sprintf(`item['%s'] = response.css('%s::attr("%s")').extract_first()`, f.Name, f.Css, f.Attr)
         }
         line = g.PadCode(line, 2)
         str += line
@@ -144,7 +144,7 @@ func (g ScrapyGenerator) GetNonListParserString(stage entity.Stage) string {
     // next stage field
     if f, err := g.GetNextStageField(stage); err == nil {
         // if a next stage field is found, chain to the next callback
-        str += g.PadCode(fmt.Sprintf(`yield scrapy.Request(url="item['%s']", callback='%s', meta={'item': item})`, f.Name, f.NextStage), 2)
+        str += g.PadCode(fmt.Sprintf(`yield scrapy.Request(url=get_real_url(response, item['%s']), callback=self.parse_%s, meta={'item': item})`, f.Name, f.NextStage), 2)
     } else {
         // if no next stage field is found, yield the item
         str += g.PadCode(fmt.Sprintf(`yield item`), 2)
@@ -156,14 +156,14 @@ func (g ScrapyGenerator) GetNonListParserString(stage entity.Stage) string {
     return str
 }

-func (g ScrapyGenerator) GetListParserString(stage entity.Stage) string {
+func (g ScrapyGenerator) GetListParserString(stageName string, stage entity.Stage) string {
     str := ""

     // get the previous stage's item
     str += g.PadCode(`prev_item = response.meta.get('item')`, 2)

     // loop over the list elements
-    str += g.PadCode(fmt.Sprintf(`for elem in response.css('%s')`, stage.ListCss), 2)
+    str += g.PadCode(fmt.Sprintf(`for elem in response.css('%s'):`, stage.ListCss), 2)

     // construct the item
     str += g.PadCode(`item = Item()`, 3)
@@ -172,9 +172,9 @@ func (g ScrapyGenerator) GetListParserString(stage entity.Stage) string {
     for _, f := range stage.Fields {
         line := ""
         if f.Attr == "" {
-            line += fmt.Sprintf(`item['%s'] = elem.css('%s::text).extract_first()')`, f.Name, f.Css)
+            line += fmt.Sprintf(`item['%s'] = elem.css('%s::text').extract_first()`, f.Name, f.Css)
         } else {
-            line += fmt.Sprintf(`item['%s'] = elem.css('%s::attr("%s")).extract_first()')`, f.Name, f.Css, f.Attr)
+            line += fmt.Sprintf(`item['%s'] = elem.css('%s::attr("%s")').extract_first()`, f.Name, f.Css, f.Attr)
         }
         line = g.PadCode(line, 3)
         str += line
@@ -188,7 +188,7 @@ func (g ScrapyGenerator) GetListParserString(stage entity.Stage) string {
     // next stage field
     if f, err := g.GetNextStageField(stage); err == nil {
         // if a next stage field is found, chain to the next callback
-        str += g.PadCode(fmt.Sprintf(`yield scrapy.Request(url="item['%s']", callback='%s', meta={'item': item})`, f.Name, f.NextStage), 3)
+        str += g.PadCode(fmt.Sprintf(`yield scrapy.Request(url=get_real_url(response, item['%s']), callback=self.parse_%s, meta={'item': item})`, f.Name, f.NextStage), 3)
     } else {
         // if no next stage field is found, yield the item
         str += g.PadCode(fmt.Sprintf(`yield item`), 3)
@@ -196,8 +196,14 @@ func (g ScrapyGenerator) GetListParserString(stage entity.Stage) string {
     // pagination
     if stage.PageCss != "" {
-        str += g.PadCode(fmt.Sprintf(`next_url = response.css('%s').extract_first()`, stage.PageCss), 2)
-        str += g.PadCode(`yield scrapy.Request(url=next_url, meta={'item': item})`, 2)
+        // pagination element attribute, defaulting to href
+        pageAttr := "href"
+        if stage.PageAttr != "" {
+            pageAttr = stage.PageAttr
+        }
+        str += g.PadCode(fmt.Sprintf(`next_url = response.css('%s::attr("%s")').extract_first()`, stage.PageCss, pageAttr), 2)
+        str += g.PadCode(fmt.Sprintf(`yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_%s, meta={'item': item})`, stageName), 2)
     }

     // append a trailing newline

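Putting the generator changes together: for the sample Spiderfile below, the templates above would now emit roughly the following spider. This is a hand-assembled sketch, not output from the tool; code that falls between the hunks (how the non-list parser obtains its item, and any merging of prev_item into item) is not shown in this diff, so those lines are assumptions.

# -*- coding: utf-8 -*-
import scrapy
import re
from config_spider.items import Item
from urllib.parse import urljoin

def get_real_url(response, url):
    if re.search(r'^https?|^\/\/', url):
        return url
    return urljoin(response.url, url)

class ConfigSpider(scrapy.Spider):
    name = 'config_spider'

    def start_requests(self):
        yield scrapy.Request(url='http://books.toscrape.com', callback=self.parse_list)

    def parse_list(self, response):
        prev_item = response.meta.get('item')
        for elem in response.css('section article.product_pod'):
            item = Item()
            item['title'] = elem.css('h3 > a::text').extract_first()
            item['url'] = elem.css('h3 > a::attr("href")').extract_first()
            item['price'] = elem.css('.product_price > .price_color::text').extract_first()
            yield scrapy.Request(url=get_real_url(response, item['url']), callback=self.parse_detail, meta={'item': item})
        # pagination; mirrors the template, which reuses the loop variable here
        next_url = response.css('ul.pager li.next a::attr("href")').extract_first()
        yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_list, meta={'item': item})

    def parse_detail(self, response):
        item = response.meta.get('item') or Item()  # assumption: the elided "get or construct item" logic
        item['description'] = response.css('#product_description + p::text').extract_first()
        yield item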
View File

@@ -1,21 +1,25 @@
-version: 0.4.0
-start_url: "https://baidu.com/s?wd=crawlab"
-start_stage: "stage_4"
+version: "0.4.0"
+name: "toscrapy_books"
+start_url: "http://books.toscrape.com"
+start_stage: "list"
 engine: "scrapy"
 stages:
-  stage_1:
+  list:
     is_list: true # default: false
-    list_css: "#content_left > .result"
-    page_css: "#page > a.n:last-child"
+    list_css: "section article.product_pod"
+    page_css: "ul.pager li.next a"
+    page_attr: "href" # default: href
     fields:
       - name: "title"
-        css: "a"
+        css: "h3 > a"
       - name: "url"
-        css: "a"
+        css: "h3 > a"
         attr: "href"
-        next_stage: "stage_2"
-  stage_2:
+        next_stage: "detail"
+      - name: "price"
+        css: ".product_price > .price_color"
+  detail:
     is_list: false
     fields:
-      - name: "stage_2_field_1"
-        css: "a"
+      - name: "description"
+        css: "#product_description + p"

View File

@@ -1,12 +1,18 @@
 # -*- coding: utf-8 -*-
 import scrapy
+import re
 from config_spider.items import Item
+from urllib.parse import urljoin

+def get_real_url(response, url):
+    if re.search(r'^https?|^\/\/', url):
+        return url
+    return urljoin(response.url, url)

 class ConfigSpider(scrapy.Spider):
     name = 'config_spider'

     def start_requests(self):
-        return scrapy.Request(url='###START_URL###', callback='###START_STAGE###')
+        yield scrapy.Request(url='###START_URL###', callback=self.###START_STAGE###)

###PARSERS###
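As a quick sanity check of the new get_real_url helper, a standalone snippet (a stub object stands in for the Scrapy response; the helper body is copied verbatim from the template above):

import re
from urllib.parse import urljoin

def get_real_url(response, url):
    if re.search(r'^https?|^\/\/', url):
        return url
    return urljoin(response.url, url)

class FakeResponse:
    url = 'http://books.toscrape.com/catalogue/page-1.html'

# relative links are resolved against the current page URL
print(get_real_url(FakeResponse(), 'page-2.html'))
# -> http://books.toscrape.com/catalogue/page-2.html

# absolute and protocol-relative URLs pass through unchanged
print(get_real_url(FakeResponse(), 'https://example.com/x'))
# -> https://example.com/x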