Mirror of https://github.com/crawlab-team/crawlab.git, synced 2026-01-30 18:00:56 +01:00
Merge remote-tracking branch 'upstream/develop' into upstream-develop

# Conflicts:
#	backend/services/spider.go
backend/model/config_spider/common.go (new file, 30 lines)
@@ -0,0 +1,30 @@
package config_spider

import "crawlab/entity"

func GetAllFields(data entity.ConfigSpiderData) []entity.Field {
    var fields []entity.Field
    for _, stage := range data.Stages {
        for _, field := range stage.Fields {
            fields = append(fields, field)
        }
    }
    return fields
}

func GetStartStageName(data entity.ConfigSpiderData) string {
    // if start_stage is set and present in stages, return it
    if data.StartStage != "" {
        for stageName := range data.Stages {
            if stageName == data.StartStage {
                return data.StartStage
            }
        }
    }

    // otherwise return the first stage encountered
    // (note: Go map iteration order is not deterministic)
    for stageName := range data.Stages {
        return stageName
    }
    return ""
}
backend/model/config_spider/scrapy.go (new file, 228 lines)
@@ -0,0 +1,228 @@
package config_spider

import (
    "crawlab/constants"
    "crawlab/entity"
    "crawlab/model"
    "crawlab/utils"
    "errors"
    "fmt"
    "path/filepath"
)

type ScrapyGenerator struct {
    Spider     model.Spider
    ConfigData entity.ConfigSpiderData
}

// generate spider files
func (g ScrapyGenerator) Generate() error {
    // generate items.py
    if err := g.ProcessItems(); err != nil {
        return err
    }

    // generate spider.py
    if err := g.ProcessSpider(); err != nil {
        return err
    }
    return nil
}

// generate items.py
func (g ScrapyGenerator) ProcessItems() error {
    // path of the file to process
    src := g.Spider.Src
    filePath := filepath.Join(src, "config_spider", "items.py")

    // get all fields
    fields := g.GetAllFields()

    // field names (including the default ones)
    fieldNames := []string{
        "_id",
        "task_id",
        "ts",
    }

    // append the configured fields
    for _, field := range fields {
        fieldNames = append(fieldNames, field.Name)
    }

    // convert the field names into Python code
    str := ""
    for _, fieldName := range fieldNames {
        line := g.PadCode(fmt.Sprintf("%s = scrapy.Field()", fieldName), 1)
        str += line
    }

    // replace the placeholder with the generated code
    if err := utils.SetFileVariable(filePath, constants.AnchorItems, str); err != nil {
        return err
    }

    return nil
}
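For illustration: given a hypothetical config whose stages define the fields `title` and `url` (names invented for this example), and assuming the `constants.AnchorItems` placeholder sits inside the item class body of the items.py template (the template itself is not shown in this commit), ProcessItems would substitute the following Python snippet:

    _id = scrapy.Field()
    task_id = scrapy.Field()
    ts = scrapy.Field()
    title = scrapy.Field()
    url = scrapy.Field()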
// generate spider.py
func (g ScrapyGenerator) ProcessSpider() error {
    // path of the file to process
    src := g.Spider.Src
    filePath := filepath.Join(src, "config_spider", "spiders", "spider.py")

    // replace start_stage
    if err := utils.SetFileVariable(filePath, constants.AnchorStartStage, "parse_"+GetStartStageName(g.ConfigData)); err != nil {
        return err
    }

    // replace start_url
    if err := utils.SetFileVariable(filePath, constants.AnchorStartUrl, g.ConfigData.StartUrl); err != nil {
        return err
    }

    // replace parsers
    strParser := ""
    for stageName, stage := range g.ConfigData.Stages {
        stageStr := g.GetParserString(stageName, stage)
        strParser += stageStr
    }
    if err := utils.SetFileVariable(filePath, constants.AnchorParsers, strParser); err != nil {
        return err
    }

    return nil
}

func (g ScrapyGenerator) GetParserString(stageName string, stage entity.Stage) string {
    // construct the method definition line
    strDef := g.PadCode(fmt.Sprintf("def parse_%s(self, response):", stageName), 1)

    strParse := ""
    if stage.IsList {
        // list-page logic
        strParse = g.GetListParserString(stageName, stage)
    } else {
        // non-list (detail-page) logic
        strParse = g.GetNonListParserString(stageName, stage)
    }

    // concatenate definition line and body
    str := fmt.Sprintf(`%s%s`, strDef, strParse)

    return str
}

// indent a line of code by num levels (4 spaces each) and append a newline
func (g ScrapyGenerator) PadCode(str string, num int) string {
    res := ""
    for i := 0; i < num; i++ {
        res += "    "
    }
    res += str
    res += "\n"
    return res
}

func (g ScrapyGenerator) GetNonListParserString(stageName string, stage entity.Stage) string {
    str := ""

    // get the item from the previous stage, or construct a new one
    str += g.PadCode("item = Item() if response.meta.get('item') is None else response.meta.get('item')", 2)

    // iterate over the field list
    for _, f := range stage.Fields {
        line := ""
        if f.Attr == "" {
            line += fmt.Sprintf(`item['%s'] = response.css('%s::text').extract_first()`, f.Name, f.Css)
        } else {
            line += fmt.Sprintf(`item['%s'] = response.css('%s::attr("%s")').extract_first()`, f.Name, f.Css, f.Attr)
        }
        line = g.PadCode(line, 2)
        str += line
    }

    // next-stage field
    if f, err := g.GetNextStageField(stage); err == nil {
        // if a next-stage field is found, request the next callback
        str += g.PadCode(fmt.Sprintf(`yield scrapy.Request(url=get_real_url(response, item['%s']), callback=self.parse_%s, meta={'item': item})`, f.Name, f.NextStage), 2)
    } else {
        // if no next-stage field is found, yield the item
        str += g.PadCode("yield item", 2)
    }

    // append a trailing newline
    str += g.PadCode("", 0)

    return str
}
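For illustration: for a hypothetical non-list stage named `detail` with a single text field `content` (CSS selector `.content`) and no next-stage field (all names invented for this example), GetParserString would emit:

    def parse_detail(self, response):
        item = Item() if response.meta.get('item') is None else response.meta.get('item')
        item['content'] = response.css('.content::text').extract_first()
        yield item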
func (g ScrapyGenerator) GetListParserString(stageName string, stage entity.Stage) string {
    str := ""

    // get the previous stage's item
    str += g.PadCode(`prev_item = response.meta.get('item')`, 2)

    // for loop iterating over the list elements
    str += g.PadCode(fmt.Sprintf(`for elem in response.css('%s'):`, stage.ListCss), 2)

    // construct the item
    str += g.PadCode(`item = Item()`, 3)

    // iterate over the field list
    for _, f := range stage.Fields {
        line := ""
        if f.Attr == "" {
            line += fmt.Sprintf(`item['%s'] = elem.css('%s::text').extract_first()`, f.Name, f.Css)
        } else {
            line += fmt.Sprintf(`item['%s'] = elem.css('%s::attr("%s")').extract_first()`, f.Name, f.Css, f.Attr)
        }
        line = g.PadCode(line, 3)
        str += line
    }

    // copy the previous stage's item values into the current item
    str += g.PadCode(`if prev_item is not None:`, 3)
    str += g.PadCode(`for key, value in prev_item.items():`, 4)
    str += g.PadCode(`item[key] = value`, 5)

    // next-stage field
    if f, err := g.GetNextStageField(stage); err == nil {
        // if a next-stage field is found, request the next callback
        str += g.PadCode(fmt.Sprintf(`yield scrapy.Request(url=get_real_url(response, item['%s']), callback=self.parse_%s, meta={'item': item})`, f.Name, f.NextStage), 3)
    } else {
        // if no next-stage field is found, yield the item
        str += g.PadCode("yield item", 3)
    }

    // pagination
    if stage.PageCss != "" {
        // attribute of the pagination element, defaulting to href
        pageAttr := "href"
        if stage.PageAttr != "" {
            pageAttr = stage.PageAttr
        }

        str += g.PadCode(fmt.Sprintf(`next_url = response.css('%s::attr("%s")').extract_first()`, stage.PageCss, pageAttr), 2)
        str += g.PadCode(fmt.Sprintf(`yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_%s, meta={'item': item})`, stageName), 2)
    }

    // append a trailing newline
    str += g.PadCode("", 0)

    return str
}
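For illustration: for a hypothetical list stage named `list` (list CSS `.item`, pagination CSS `.next`) with a text field `title` (CSS `h2`) and a link field `url` (CSS `a`, attribute `href`, next stage `detail`), all names invented for this example, GetParserString would emit:

    def parse_list(self, response):
        prev_item = response.meta.get('item')
        for elem in response.css('.item'):
            item = Item()
            item['title'] = elem.css('h2::text').extract_first()
            item['url'] = elem.css('a::attr("href")').extract_first()
            if prev_item is not None:
                for key, value in prev_item.items():
                    item[key] = value
            yield scrapy.Request(url=get_real_url(response, item['url']), callback=self.parse_detail, meta={'item': item})
        next_url = response.css('.next::attr("href")').extract_first()
        yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_list, meta={'item': item})

Note that the pagination request is emitted at loop level, so it reuses the last `item` from the for loop as its meta payload.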
// get all fields
func (g ScrapyGenerator) GetAllFields() []entity.Field {
    return GetAllFields(g.ConfigData)
}

// get the field that contains a next stage
func (g ScrapyGenerator) GetNextStageField(stage entity.Stage) (entity.Field, error) {
    for _, field := range stage.Fields {
        if field.NextStage != "" {
            return field, nil
        }
    }
    return entity.Field{}, errors.New("cannot find next stage field")
}
backend/model/spider.go

@@ -25,6 +25,7 @@ type Spider struct {
    Site   string `json:"site" bson:"site"`     // spider's website
    Envs   []Env  `json:"envs" bson:"envs"`     // environment variables
    Remark string `json:"remark" bson:"remark"` // remark

    // customized spider
    Src string `json:"src" bson:"src"` // source path
    Cmd string `json:"cmd" bson:"cmd"` // execute command

@@ -33,17 +34,7 @@ type Spider struct {
    LastRunTs  time.Time `json:"last_run_ts"` // last run time
    LastStatus string    `json:"last_status"` // last run status

-   // TODO: configurable spider
-   //Fields []interface{} `json:"fields"`
-   //DetailFields []interface{} `json:"detail_fields"`
-   //CrawlType string `json:"crawl_type"`
-   //StartUrl string `json:"start_url"`
-   //UrlPattern string `json:"url_pattern"`
-   //ItemSelector string `json:"item_selector"`
-   //ItemSelectorType string `json:"item_selector_type"`
-   //PaginationSelector string `json:"pagination_selector"`
-   //PaginationSelectorType string `json:"pagination_selector_type"`

    // timestamps
    CreateTs time.Time `json:"create_ts" bson:"create_ts"`
    UpdateTs time.Time `json:"update_ts" bson:"update_ts"`
}

@@ -98,13 +89,14 @@ func (spider *Spider) GetLastTask() (Task, error) {
    return tasks[0], nil
}

// delete spider
func (spider *Spider) Delete() error {
    s, c := database.GetCol("spiders")
    defer s.Close()
    return c.RemoveId(spider.Id)
}

-// spider list
+// get spider list
func GetSpiderList(filter interface{}, skip int, limit int) ([]Spider, int, error) {
    s, c := database.GetCol("spiders")
    defer s.Close()

@@ -136,7 +128,7 @@ func GetSpiderList(filter interface{}, skip int, limit int) ([]Spider, int, error) {
    return spiders, count, nil
}

-// get spider
+// get spider (by FileId)
func GetSpiderByFileId(fileId bson.ObjectId) *Spider {
    s, c := database.GetCol("spiders")
    defer s.Close()

@@ -150,7 +142,7 @@ func GetSpiderByFileId(fileId bson.ObjectId) *Spider {
    return result
}

-// get spider
+// get spider (by name)
func GetSpiderByName(name string) *Spider {
    s, c := database.GetCol("spiders")
    defer s.Close()

@@ -158,13 +150,13 @@ func GetSpiderByName(name string) *Spider {
    var result *Spider
    if err := c.Find(bson.M{"name": name}).One(&result); err != nil {
        log.Errorf("get spider error: %s, spider_name: %s", err.Error(), name)
-       debug.PrintStack()
+       //debug.PrintStack()
        return nil
    }
    return result
}

-// get spider
+// get spider (by ID)
func GetSpider(id bson.ObjectId) (Spider, error) {
    s, c := database.GetCol("spiders")
    defer s.Close()

@@ -245,7 +237,7 @@ func RemoveAllSpider() error {
    return nil
}

-// spider count
+// get spider count
func GetSpiderCount() (int, error) {
    s, c := database.GetCol("spiders")
    defer s.Close()

@@ -257,7 +249,7 @@ func GetSpiderCount() (int, error) {
    return count, nil
}

-// spider types
+// get spider types
func GetSpiderTypes() ([]*entity.SpiderType, error) {
    s, c := database.GetCol("spiders")
    defer s.Close()