Mirror of https://github.com/crawlab-team/crawlab.git, synced 2026-02-01 18:20:17 +01:00
Add configurable spider
backend/model/config_spider/common.go (new file, 22 lines)
@@ -0,0 +1,22 @@
package config_spider

import "crawlab/entity"

// GetAllFields returns the fields of every stage in the spider config.
// All stages must be included, because ProcessItems uses the result to
// declare each scraped field in items.py.
func GetAllFields(data entity.ConfigSpiderData) []entity.Field {
	var fields []entity.Field
	for _, stage := range data.Stages {
		fields = append(fields, stage.Fields...)
	}
	return fields
}

// GetStartStageName returns the name of the start stage. Note that Go
// randomizes map iteration order, so this picks an arbitrary stage unless
// Stages has exactly one entry.
func GetStartStageName(data entity.ConfigSpiderData) string {
	for stageName := range data.Stages {
		return stageName
	}
	return ""
}
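A minimal usage sketch for these helpers, assuming Stages is a map[string]entity.Stage and that entity.Field carries Name/Css/Attr/NextStage fields (struct shapes inferred from this diff, not confirmed elsewhere):

	data := entity.ConfigSpiderData{
		StartUrl: "https://example.com/list",
		Stages: map[string]entity.Stage{
			"list": {
				IsList:  true,
				ListCss: ".item",
				Fields: []entity.Field{
					{Name: "title", Css: ".title"},
					{Name: "url", Css: "a", Attr: "href", NextStage: "detail"},
				},
			},
		},
	}
	fields := GetAllFields(data)     // title, url
	start := GetStartStageName(data) // "list" (the only stage)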
backend/model/config_spider/scrapy.go (new file, 214 lines)
@@ -0,0 +1,214 @@
package config_spider

import (
	"crawlab/constants"
	"crawlab/entity"
	"crawlab/model"
	"crawlab/utils"
	"errors"
	"fmt"
	"path/filepath"
)

type ScrapyGenerator struct {
	Spider     model.Spider
	ConfigData entity.ConfigSpiderData
}

// Generate renders the spider files from the config.
func (g ScrapyGenerator) Generate() error {
	// generate items.py
	if err := g.ProcessItems(); err != nil {
		return err
	}

	// generate spiders/spider.py
	if err := g.ProcessSpider(); err != nil {
		return err
	}
	return nil
}
// ProcessItems generates items.py.
func (g ScrapyGenerator) ProcessItems() error {
	// file to process
	src := g.Spider.Src
	filePath := filepath.Join(src, "items.py")

	// fetch all configured fields
	fields := g.GetAllFields()

	// field names, including the default ones
	fieldNames := []string{
		"_id",
		"task_id",
		"ts",
	}

	// append the configured field names
	for _, field := range fields {
		fieldNames = append(fieldNames, field.Name)
	}

	// render the field names as Python code, one indented
	// "name = scrapy.Field()" line per field (assuming the anchor sits
	// inside the item class body)
	str := ""
	for _, fieldName := range fieldNames {
		line := fmt.Sprintf("%s = scrapy.Field()", fieldName)
		str += g.PadCode(line, 1)
	}

	// replace the placeholder anchor with the generated code
	if err := utils.SetFileVariable(filePath, constants.AnchorItems, str); err != nil {
		return err
	}

	return nil
}
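// Sketch of the output: for the example config above (fields "title" and
// "url"), the code substituted at constants.AnchorItems would be roughly:
//
//	_id = scrapy.Field()
//	task_id = scrapy.Field()
//	ts = scrapy.Field()
//	title = scrapy.Field()
//	url = scrapy.Field()
//
// (the surrounding scrapy.Item class comes from the template, which is not
// part of this diff).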
// ProcessSpider generates spiders/spider.py.
func (g ScrapyGenerator) ProcessSpider() error {
	// file to process
	src := g.Spider.Src
	filePath := filepath.Join(src, "spiders", "spider.py")

	// replace the start_stage anchor
	if err := utils.SetFileVariable(filePath, constants.AnchorStartStage, GetStartStageName(g.ConfigData)); err != nil {
		return err
	}

	// replace the start_url anchor
	if err := utils.SetFileVariable(filePath, constants.AnchorStartUrl, g.ConfigData.StartUrl); err != nil {
		return err
	}

	// replace the parsers anchor with one parse method per stage
	strParser := ""
	for stageName, stage := range g.ConfigData.Stages {
		strParser += g.GetParserString(stageName, stage)
	}
	if err := utils.SetFileVariable(filePath, constants.AnchorParsers, strParser); err != nil {
		return err
	}

	return nil
}
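// utils.SetFileVariable is assumed to substitute a named placeholder inside
// the template file; neither the template nor the anchor syntax is part of
// this diff. Hypothetically, a template line such as
//
//	start_urls = ['###start_url###']
//
// would become, after the AnchorStartUrl substitution above,
//
//	start_urls = ['https://example.com/list']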
// GetParserString renders the parse method for one stage.
func (g ScrapyGenerator) GetParserString(stageName string, stage entity.Stage) string {
	// method definition line, e.g. "def <stageName>(self, response):"
	strDef := g.PadCode(fmt.Sprintf("def %s(self, response):", stageName), 1)

	// method body
	strParse := ""
	if stage.IsList {
		// list-page logic
		strParse = g.GetListParserString(stage)
	} else {
		// detail-page logic
		strParse = g.GetNonListParserString(stage)
	}

	return strDef + strParse
}
// PadCode indents a line of generated Python code with num tabs and
// appends a newline.
func (g ScrapyGenerator) PadCode(str string, num int) string {
	res := ""
	for i := 0; i < num; i++ {
		res += "\t"
	}
	return res + str + "\n"
}
// GetNonListParserString renders the body of a detail-page parser.
func (g ScrapyGenerator) GetNonListParserString(stage entity.Stage) string {
	str := ""

	// fetch the item passed from the previous stage, or construct a new one
	str += g.PadCode("item = Item() if response.meta.get('item') is None else response.meta.get('item')", 2)

	// assign each configured field from its CSS selector
	for _, f := range stage.Fields {
		line := ""
		if f.Attr == "" {
			line = fmt.Sprintf(`item['%s'] = response.css('%s::text').extract_first()`, f.Name, f.Css)
		} else {
			line = fmt.Sprintf(`item['%s'] = response.css('%s::attr("%s")').extract_first()`, f.Name, f.Css, f.Attr)
		}
		str += g.PadCode(line, 2)
	}

	// next-stage field
	if f, err := g.GetNextStageField(stage); err == nil {
		// a next-stage field exists: follow its URL into the next callback
		str += g.PadCode(fmt.Sprintf(`yield scrapy.Request(url=item['%s'], callback=self.%s, meta={'item': item})`, f.Name, f.NextStage), 2)
	} else {
		// no next-stage field: emit the item
		str += g.PadCode("yield item", 2)
	}

	return str
}
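// Sketch of the output: for a detail stage with a "title" field
// (Css ".title") and a "cover" field (Css "img", Attr "src") and no
// next-stage field, GetParserString would render roughly:
//
//	def detail(self, response):
//		item = Item() if response.meta.get('item') is None else response.meta.get('item')
//		item['title'] = response.css('.title::text').extract_first()
//		item['cover'] = response.css('img::attr("src")').extract_first()
//		yield item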
// GetListParserString renders the body of a list-page parser.
func (g ScrapyGenerator) GetListParserString(stage entity.Stage) string {
	str := ""

	// fetch the item passed from the previous stage, if any
	str += g.PadCode(`prev_item = response.meta.get('item')`, 2)

	// loop over the list elements
	str += g.PadCode(fmt.Sprintf(`for elem in response.css('%s'):`, stage.ListCss), 2)

	// construct a fresh item per element
	str += g.PadCode(`item = Item()`, 3)

	// assign each configured field from its CSS selector, relative to elem
	for _, f := range stage.Fields {
		line := ""
		if f.Attr == "" {
			line = fmt.Sprintf(`item['%s'] = elem.css('%s::text').extract_first()`, f.Name, f.Css)
		} else {
			line = fmt.Sprintf(`item['%s'] = elem.css('%s::attr("%s")').extract_first()`, f.Name, f.Css, f.Attr)
		}
		str += g.PadCode(line, 3)
	}

	// copy the previous stage's values onto the current item
	str += g.PadCode(`if prev_item is not None:`, 3)
	str += g.PadCode(`for key, value in prev_item.items():`, 4)
	str += g.PadCode(`item[key] = value`, 5)

	// next-stage field
	if f, err := g.GetNextStageField(stage); err == nil {
		// a next-stage field exists: follow its URL into the next callback
		str += g.PadCode(fmt.Sprintf(`yield scrapy.Request(url=item['%s'], callback=self.%s, meta={'item': item})`, f.Name, f.NextStage), 3)
	} else {
		// no next-stage field: emit the item
		str += g.PadCode("yield item", 3)
	}

	// pagination: follow the next-page link if a pagination CSS selector is
	// configured (no explicit callback, so Scrapy falls back to the
	// spider's default parse method)
	if stage.PageCss != "" {
		str += g.PadCode(fmt.Sprintf(`next_url = response.css('%s').extract_first()`, stage.PageCss), 2)
		str += g.PadCode(`yield scrapy.Request(url=next_url, meta={'item': item})`, 2)
	}

	return str
}
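// Sketch of the output: for the example "list" stage above (ListCss ".item",
// fields "title" and "url" with "url" pointing at next stage "detail",
// PageCss ".next::attr(href)"), GetParserString would render roughly:
//
//	def list(self, response):
//		prev_item = response.meta.get('item')
//		for elem in response.css('.item'):
//			item = Item()
//			item['title'] = elem.css('.title::text').extract_first()
//			item['url'] = elem.css('a::attr("href")').extract_first()
//			if prev_item is not None:
//				for key, value in prev_item.items():
//					item[key] = value
//			yield scrapy.Request(url=item['url'], callback=self.detail, meta={'item': item})
//		next_url = response.css('.next::attr(href)').extract_first()
//		yield scrapy.Request(url=next_url, meta={'item': item})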
// GetAllFields returns all configured fields of the spider.
func (g ScrapyGenerator) GetAllFields() []entity.Field {
	return GetAllFields(g.ConfigData)
}

// GetNextStageField returns the first field of the stage that points to a
// next stage.
func (g ScrapyGenerator) GetNextStageField(stage entity.Stage) (entity.Field, error) {
	for _, field := range stage.Fields {
		if field.NextStage != "" {
			return field, nil
		}
	}
	return entity.Field{}, errors.New("cannot find next stage field")
}
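A minimal end-to-end sketch of invoking the generator, assuming model.Spider exposes a Src directory holding a Scrapy project template that contains the anchor placeholders (both inferred from this diff):

	spider := model.Spider{Src: "/path/to/scrapy/template"} // hypothetical path
	g := ScrapyGenerator{Spider: spider, ConfigData: data}  // data as in the common.go sketch above
	if err := g.Generate(); err != nil {
		log.Fatal(err) // requires the "log" import
	}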