Mirror of https://github.com/crawlab-team/crawlab.git
Update the configurable spider; fix several issues
@@ -1,5 +1,5 @@
 package constants
 
-const ScrapyProtectedStageNames = "start_requests"
+const ScrapyProtectedStageNames = ""
 
 const ScrapyProtectedFieldNames = "_id,task_id,ts"
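Note: emptying ScrapyProtectedStageNames goes hand in hand with the "parse_" prefix introduced further down: generated parser methods no longer share a namespace with scrapy.Spider's entry points, so no stage name needs protecting. A minimal Python sketch of the collision the old constant guarded against (the stage name "start_requests" is hypothetical):

    import scrapy

    class GeneratedSpider(scrapy.Spider):
        name = 'generated'

        # Before this commit, a stage named "start_requests" would have emitted
        # "def start_requests(self, response)", clobbering the framework entry
        # point. With the prefix, the generated method is parse_start_requests:
        def parse_start_requests(self, response):
            pass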
@@ -9,10 +9,11 @@ type Field struct {
 }
 
 type Stage struct {
-    IsList  bool    `yaml:"is_list" json:"is_list"`
-    ListCss string  `yaml:"list_css" json:"list_css"`
-    PageCss string  `yaml:"page_css" json:"page_css"`
-    Fields  []Field `yaml:"fields" json:"fields"`
+    IsList   bool    `yaml:"is_list" json:"is_list"`
+    ListCss  string  `yaml:"list_css" json:"list_css"`
+    PageCss  string  `yaml:"page_css" json:"page_css"`
+    PageAttr string  `yaml:"page_attr" json:"page_attr"`
+    Fields   []Field `yaml:"fields" json:"fields"`
 }
 
 type ConfigSpiderData struct {
@@ -5,10 +5,8 @@ import "crawlab/entity"
 func GetAllFields(data entity.ConfigSpiderData) []entity.Field {
     var fields []entity.Field
     for _, stage := range data.Stages {
-        if stage.IsList {
-            for _, field := range stage.Fields {
-                fields = append(fields, field)
-            }
+        for _, field := range stage.Fields {
+            fields = append(fields, field)
         }
     }
     return fields
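Note: with the IsList guard removed, fields of non-list (detail) stages are collected too. Assuming GetAllFields feeds the generated config_spider/items.py (that file is not shown in this diff), the effect for the Spiderfile below would be, roughly:

    import scrapy

    class Item(scrapy.Item):
        title = scrapy.Field()        # "list" stage
        url = scrapy.Field()          # "list" stage
        price = scrapy.Field()        # "list" stage
        description = scrapy.Field()  # "detail" stage: previously skipped by the IsList guard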
@@ -72,7 +72,7 @@ func (g ScrapyGenerator) ProcessSpider() error {
     filePath := filepath.Join(src, "config_spider", "spiders", "spider.py")
 
     // replace start_stage
-    if err := utils.SetFileVariable(filePath, constants.AnchorStartStage, GetStartStageName(g.ConfigData)); err != nil {
+    if err := utils.SetFileVariable(filePath, constants.AnchorStartStage, "parse_"+GetStartStageName(g.ConfigData)); err != nil {
         return err
     }
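Note: a sketch of the template's start_requests after ###START_STAGE### is substituted with the prefixed stage name, using the Spiderfile values below ("list" stage, books.toscrape.com):

    def start_requests(self):
        yield scrapy.Request(url='http://books.toscrape.com', callback=self.parse_list)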
@@ -96,15 +96,15 @@ func (g ScrapyGenerator) ProcessSpider() error {
 
 func (g ScrapyGenerator) GetParserString(stageName string, stage entity.Stage) string {
     // build the function definition line
-    strDef := g.PadCode(fmt.Sprintf("def %s(self, response):", stageName), 1)
+    strDef := g.PadCode(fmt.Sprintf("def parse_%s(self, response):", stageName), 1)
 
     strParse := ""
     if stage.IsList {
         // list logic
-        strParse = g.GetListParserString(stage)
+        strParse = g.GetListParserString(stageName, stage)
     } else {
         // non-list logic
-        strParse = g.GetNonListParserString(stage)
+        strParse = g.GetNonListParserString(stageName, stage)
     }
 
     // assemble
@@ -116,14 +116,14 @@ func (g ScrapyGenerator) GetParserString(stageName string, stage entity.Stage) s
 func (g ScrapyGenerator) PadCode(str string, num int) string {
     res := ""
     for i := 0; i < num; i++ {
-        res += "\t"
+        res += "    "
     }
     res += str
     res += "\n"
     return res
 }
 
-func (g ScrapyGenerator) GetNonListParserString(stage entity.Stage) string {
+func (g ScrapyGenerator) GetNonListParserString(stageName string, stage entity.Stage) string {
     str := ""
 
     // get or construct the item
@@ -133,9 +133,9 @@ func (g ScrapyGenerator) GetNonListParserString(stage entity.Stage) string {
     for _, f := range stage.Fields {
         line := ""
         if f.Attr == "" {
-            line += fmt.Sprintf(`item['%s'] = response.css('%s::text).extract_first()')`, f.Name, f.Css)
+            line += fmt.Sprintf(`item['%s'] = response.css('%s::text').extract_first()`, f.Name, f.Css)
         } else {
-            line += fmt.Sprintf(`item['%s'] = response.css('%s::attr("%s")).extract_first()')`, f.Name, f.Css, f.Attr)
+            line += fmt.Sprintf(`item['%s'] = response.css('%s::attr("%s")').extract_first()`, f.Name, f.Css, f.Attr)
         }
         line = g.PadCode(line, 2)
         str += line
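Note: this hunk fixes misplaced quotes and parentheses in the format strings. For a field named "title" with css "h3 > a" (values from the Spiderfile below), the emitted Python changes from invalid to valid:

    item['title'] = response.css('h3 > a::text).extract_first()')  # before: unbalanced quotes, SyntaxError
    item['title'] = response.css('h3 > a::text').extract_first()   # after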
@@ -144,7 +144,7 @@ func (g ScrapyGenerator) GetNonListParserString(stage entity.Stage) string {
     // next stage field
     if f, err := g.GetNextStageField(stage); err == nil {
         // if a next stage field is found, chain to the next callback
-        str += g.PadCode(fmt.Sprintf(`yield scrapy.Request(url="item['%s']", callback='%s', meta={'item': item})`, f.Name, f.NextStage), 2)
+        str += g.PadCode(fmt.Sprintf(`yield scrapy.Request(url=get_real_url(response, item['%s']), callback=self.parse_%s, meta={'item': item})`, f.Name, f.NextStage), 2)
     } else {
         // if no next stage field is found, yield the item
         str += g.PadCode(fmt.Sprintf(`yield item`), 2)
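Note: a hedged sketch of the non-list parser generated for the "detail" stage of the Spiderfile below; the item get-or-construct code is elided from this diff, so it appears here as a placeholder comment:

    def parse_detail(self, response):
        # ... get or construct item from response.meta (code not shown in this diff) ...
        item['description'] = response.css('#product_description + p::text').extract_first()
        yield item  # "detail" has no next_stage field, so the item is yielded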
@@ -156,14 +156,14 @@ func (g ScrapyGenerator) GetNonListParserString(stage entity.Stage) string {
     return str
 }
 
-func (g ScrapyGenerator) GetListParserString(stage entity.Stage) string {
+func (g ScrapyGenerator) GetListParserString(stageName string, stage entity.Stage) string {
     str := ""
 
     // get the previous stage's item
     str += g.PadCode(`prev_item = response.meta.get('item')`, 2)
 
     // loop over the list elements
-    str += g.PadCode(fmt.Sprintf(`for elem in response.css('%s')`, stage.ListCss), 2)
+    str += g.PadCode(fmt.Sprintf(`for elem in response.css('%s'):`, stage.ListCss), 2)
 
     // construct the item
     str += g.PadCode(`item = Item()`, 3)
@@ -172,9 +172,9 @@ func (g ScrapyGenerator) GetListParserString(stage entity.Stage) string {
     for _, f := range stage.Fields {
         line := ""
         if f.Attr == "" {
-            line += fmt.Sprintf(`item['%s'] = elem.css('%s::text).extract_first()')`, f.Name, f.Css)
+            line += fmt.Sprintf(`item['%s'] = elem.css('%s::text').extract_first()`, f.Name, f.Css)
         } else {
-            line += fmt.Sprintf(`item['%s'] = elem.css('%s::attr("%s")).extract_first()')`, f.Name, f.Css, f.Attr)
+            line += fmt.Sprintf(`item['%s'] = elem.css('%s::attr("%s")').extract_first()`, f.Name, f.Css, f.Attr)
         }
         line = g.PadCode(line, 3)
         str += line
@@ -188,7 +188,7 @@ func (g ScrapyGenerator) GetListParserString(stage entity.Stage) string {
     // next stage field
     if f, err := g.GetNextStageField(stage); err == nil {
         // if a next stage field is found, chain to the next callback
-        str += g.PadCode(fmt.Sprintf(`yield scrapy.Request(url="item['%s']", callback='%s', meta={'item': item})`, f.Name, f.NextStage), 3)
+        str += g.PadCode(fmt.Sprintf(`yield scrapy.Request(url=get_real_url(response, item['%s']), callback=self.parse_%s, meta={'item': item})`, f.Name, f.NextStage), 3)
     } else {
         // if no next stage field is found, yield the item
         str += g.PadCode(fmt.Sprintf(`yield item`), 3)
@@ -196,8 +196,14 @@ func (g ScrapyGenerator) GetListParserString(stage entity.Stage) string {
 
     // pagination
     if stage.PageCss != "" {
-        str += g.PadCode(fmt.Sprintf(`next_url = response.css('%s').extract_first()`, stage.PageCss), 2)
-        str += g.PadCode(`yield scrapy.Request(url=next_url, meta={'item': item})`, 2)
+        // pagination element attribute, defaults to href
+        pageAttr := "href"
+        if stage.PageAttr != "" {
+            pageAttr = stage.PageAttr
+        }
+
+        str += g.PadCode(fmt.Sprintf(`next_url = response.css('%s::attr("%s")').extract_first()`, stage.PageCss, pageAttr), 2)
+        str += g.PadCode(fmt.Sprintf(`yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_%s, meta={'item': item})`, stageName), 2)
     }
 
     // append trailing newline
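Note: putting the pieces of GetListParserString together, a hedged sketch of the parser generated for the "list" stage of the Spiderfile below (how prev_item is merged into item is not shown in this diff):

    def parse_list(self, response):
        prev_item = response.meta.get('item')
        for elem in response.css('section article.product_pod'):
            item = Item()
            item['title'] = elem.css('h3 > a::text').extract_first()
            item['url'] = elem.css('h3 > a::attr("href")').extract_first()
            item['price'] = elem.css('.product_price > .price_color::text').extract_first()
            # "url" carries next_stage: "detail", so a chained request is emitted
            yield scrapy.Request(url=get_real_url(response, item['url']), callback=self.parse_detail, meta={'item': item})
        # pagination: page_css + page_attr, re-entering the same parser
        next_url = response.css('ul.pager li.next a::attr("href")').extract_first()
        yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_list, meta={'item': item})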
@@ -1,21 +1,25 @@
-version: 0.4.0
-start_url: "https://baidu.com/s?wd=crawlab"
-start_stage: "stage_4"
+version: "0.4.0"
+name: "toscrapy_books"
+start_url: "http://books.toscrape.com"
+start_stage: "list"
 engine: "scrapy"
 stages:
-  stage_1:
+  list:
     is_list: true # default: false
-    list_css: "#content_left > .result"
-    page_css: "#page > a.n:last-child"
+    list_css: "section article.product_pod"
+    page_css: "ul.pager li.next a"
+    page_attr: "href" # default: href
     fields:
       - name: "title"
-        css: "a"
+        css: "h3 > a"
       - name: "url"
-        css: "a"
+        css: "h3 > a"
         attr: "href"
-        next_stage: "stage_2"
-  stage_2:
+        next_stage: "detail"
+      - name: "price"
+        css: ".product_price > .price_color"
+  detail:
     is_list: false
     fields:
-      - name: "stage_2_field_1"
-        css: "a"
+      - name: "description"
+        css: "#product_description + p"
@@ -1,12 +1,18 @@
 # -*- coding: utf-8 -*-
 import scrapy
+import re
 from config_spider.items import Item
+from urllib.parse import urljoin
+
+def get_real_url(response, url):
+    if re.search(r'^https?|^\/\/', url):
+        return url
+    return urljoin(response.url, url)
 
 class ConfigSpider(scrapy.Spider):
     name = 'config_spider'
 
     def start_requests(self):
-        return scrapy.Request(url='###START_URL###', callback='###START_STAGE###')
+        yield scrapy.Request(url='###START_URL###', callback=self.###START_STAGE###)
 
 ###PARSERS###
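Note: a self-contained illustration of the new get_real_url helper; FakeResponse is a stand-in for scrapy's Response, which exposes a .url attribute. Observe that the pattern ^https? matches any string beginning with "http", not just a full scheme with "://":

    import re
    from urllib.parse import urljoin

    def get_real_url(response, url):
        if re.search(r'^https?|^\/\/', url):
            return url
        return urljoin(response.url, url)

    class FakeResponse:
        url = 'http://books.toscrape.com/catalogue/page-1.html'

    print(get_real_url(FakeResponse, 'page-2.html'))
    # -> http://books.toscrape.com/catalogue/page-2.html (relative URL resolved)
    print(get_real_url(FakeResponse, '//cdn.example.com/img.png'))
    # -> //cdn.example.com/img.png (protocol-relative URL passed through)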