Merge remote-tracking branch 'upstream/develop' into develop

陈景阳 committed on 2019-12-05 07:15:07 +08:00
41 changed files with 2260 additions and 519 deletions

View File

@@ -15,7 +15,7 @@ redis:
log:
level: info
path: "/var/logs/crawlab"
isDeletePeriodically: "Y"
isDeletePeriodically: "N"
deleteFrequency: "@hourly"
server:
host: 0.0.0.0

View File

@@ -3,15 +3,15 @@ package entity
import "strconv"
type Page struct {
Skip int
Limit int
PageNum int
Skip int
Limit int
PageNum int
PageSize int
}
func (p *Page)GetPage(pageNum string, pageSize string) {
func (p *Page) GetPage(pageNum string, pageSize string) {
p.PageNum, _ = strconv.Atoi(pageNum)
p.PageSize, _ = strconv.Atoi(pageSize)
p.Skip = p.PageSize * (p.PageNum - 1)
p.Limit = p.PageSize
}
}

View File

@@ -1,25 +1,30 @@
package entity
type ConfigSpiderData struct {
Version string `yaml:"version" json:"version"`
Engine string `yaml:"engine" json:"engine"`
StartUrl string `yaml:"start_url" json:"start_url"`
StartStage string `yaml:"start_stage" json:"start_stage"`
Stages map[string]Stage `yaml:"stages" json:"stages"`
Settings map[string]string `yaml:"settings" json:"settings"`
}
type Stage struct {
Name string `yaml:"name" json:"name"`
IsList bool `yaml:"is_list" json:"is_list"`
ListCss string `yaml:"list_css" json:"list_css"`
ListXpath string `yaml:"list_xpath" json:"list_xpath"`
PageCss string `yaml:"page_css" json:"page_css"`
PageXpath string `yaml:"page_xpath" json:"page_xpath"`
PageAttr string `yaml:"page_attr" json:"page_attr"`
Fields []Field `yaml:"fields" json:"fields"`
}
type Field struct {
Name string `yaml:"name" json:"name"`
Css string `yaml:"css" json:"css"`
Xpath string `yaml:"xpath" json:"xpath"`
Attr string `yaml:"attr" json:"attr"`
NextStage string `yaml:"next_stage" json:"next_stage"`
}
type Stage struct {
IsList bool `yaml:"is_list" json:"is_list"`
ListCss string `yaml:"list_css" json:"list_css"`
PageCss string `yaml:"page_css" json:"page_css"`
PageAttr string `yaml:"page_attr" json:"page_attr"`
Fields []Field `yaml:"fields" json:"fields"`
}
type ConfigSpiderData struct {
Version string `yaml:"version" json:"version"`
Engine string `yaml:"engine" json:"engine"`
StartUrl string `yaml:"start_url" json:"start_url"`
StartStage string `yaml:"start_stage" json:"start_stage"`
Stages map[string]Stage `yaml:"stages" json:"stages"`
Remark string `yaml:"remark" json:"remark"`
}

View File

@@ -47,6 +47,8 @@ func main() {
panic(err)
}
log.Info("初始化定期清理日志配置成功")
}else {
log.Info("默认未开启定期清理日志配置")
}
// 初始化Mongodb数据库
@@ -140,9 +142,13 @@ func main() {
authGroup.GET("/spiders/:id/stats", routes.GetSpiderStats) // 爬虫统计数据
authGroup.GET("/spider/types", routes.GetSpiderTypes) // 爬虫类型
// 可配置爬虫
authGroup.PUT("/config_spiders", routes.PutConfigSpider) // 添加可配置爬虫
authGroup.POST("/config_spiders/:id", routes.PostConfigSpider) // 改可配置爬虫
authGroup.POST("/config_spiders/:id/upload", routes.UploadConfigSpider) // 上传可配置爬虫
authGroup.GET("/config_spiders/:id/config", routes.GetConfigSpiderConfig) // 获取可配置爬虫配置
authGroup.POST("/config_spiders/:id/config", routes.PostConfigSpiderConfig) // 改可配置爬虫配置
authGroup.PUT("/config_spiders", routes.PutConfigSpider) // 添加可配置爬虫
authGroup.POST("/config_spiders/:id", routes.PostConfigSpider) // 修改可配置爬虫
authGroup.POST("/config_spiders/:id/upload", routes.UploadConfigSpider) // 上传可配置爬虫
authGroup.POST("/config_spiders/:id/spiderfile", routes.PostConfigSpiderSpiderfile) // 上传可配置爬虫
authGroup.GET("/config_spiders_templates", routes.GetConfigSpiderTemplateList) // 获取可配置爬虫模版列表
// 任务
authGroup.GET("/tasks", routes.GetTaskList) // 任务列表
authGroup.GET("/tasks/:id", routes.GetTask) // 任务详情

View File

@@ -42,12 +42,12 @@ func init() {
app.DELETE("/tasks/:id", DeleteTask) // 删除任务
app.GET("/tasks/:id/results", GetTaskResults) // 任务结果
app.GET("/tasks/:id/results/download", DownloadTaskResultsCsv) // 下载任务结果
app.GET("/spiders", GetSpiderList) // 爬虫列表
app.GET("/spiders/:id", GetSpider) // 爬虫详情
app.POST("/spiders/:id", PostSpider) // 修改爬虫
app.DELETE("/spiders/:id",DeleteSpider) // 删除爬虫
app.GET("/spiders/:id/tasks",GetSpiderTasks) // 爬虫任务列表
app.GET("/spiders/:id/dir",GetSpiderDir) // 爬虫目录
app.GET("/spiders", GetSpiderList) // 爬虫列表
app.GET("/spiders/:id", GetSpider) // 爬虫详情
app.POST("/spiders/:id", PostSpider) // 修改爬虫
app.DELETE("/spiders/:id", DeleteSpider) // 删除爬虫
app.GET("/spiders/:id/tasks", GetSpiderTasks) // 爬虫任务列表
app.GET("/spiders/:id/dir", GetSpiderDir) // 爬虫目录
}
//mock test, test data in ./mock

View File

@@ -6,8 +6,6 @@ import (
"net/http"
)
var taskDailyItems = []model.TaskDailyItem{
{
Date: "2019/08/19",

View File

@@ -1 +1 @@
package mock
package mock

View File

@@ -1 +1 @@
package mock
package mock

View File

@@ -131,12 +131,7 @@ func (g ScrapyGenerator) GetNonListParserString(stageName string, stage entity.S
// iterate over the field list
for _, f := range stage.Fields {
line := ""
if f.Attr == "" {
line += fmt.Sprintf(`item['%s'] = response.css('%s::text').extract_first()`, f.Name, f.Css)
} else {
line += fmt.Sprintf(`item['%s'] = response.css('%s::attr("%s")').extract_first()`, f.Name, f.Css, f.Attr)
}
line := fmt.Sprintf(`item['%s'] = response.%s.extract_first()`, f.Name, g.GetExtractStringFromField(f))
line = g.PadCode(line, 2)
str += line
}
@@ -163,19 +158,14 @@ func (g ScrapyGenerator) GetListParserString(stageName string, stage entity.Stag
str += g.PadCode(`prev_item = response.meta.get('item')`, 2)
// for-loop over the list elements
str += g.PadCode(fmt.Sprintf(`for elem in response.css('%s'):`, stage.ListCss), 2)
str += g.PadCode(fmt.Sprintf(`for elem in response.%s:`, g.GetListString(stage)), 2)
// construct the item
str += g.PadCode(`item = Item()`, 3)
// iterate over the field list
for _, f := range stage.Fields {
line := ""
if f.Attr == "" {
line += fmt.Sprintf(`item['%s'] = elem.css('%s::text').extract_first()`, f.Name, f.Css)
} else {
line += fmt.Sprintf(`item['%s'] = elem.css('%s::attr("%s")').extract_first()`, f.Name, f.Css, f.Attr)
}
line := fmt.Sprintf(`item['%s'] = elem.%s.extract_first()`, f.Name, g.GetExtractStringFromField(f))
line = g.PadCode(line, 3)
str += line
}
@@ -195,15 +185,9 @@ func (g ScrapyGenerator) GetListParserString(stageName string, stage entity.Stag
}
// pagination
if stage.PageCss != "" {
// pagination element attribute, defaults to href
pageAttr := "href"
if stage.PageAttr != "" {
pageAttr = stage.PageAttr
}
str += g.PadCode(fmt.Sprintf(`next_url = response.css('%s::attr("%s")').extract_first()`, stage.PageCss, pageAttr), 2)
str += g.PadCode(fmt.Sprintf(`yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_%s, meta={'item': item})`, stageName), 2)
if stage.PageCss != "" || stage.PageXpath != "" {
str += g.PadCode(fmt.Sprintf(`next_url = response.%s.extract_first()`, g.GetExtractStringFromStage(stage)), 2)
str += g.PadCode(fmt.Sprintf(`yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_%s, meta={'item': prev_item})`, stageName), 2)
}
// append a trailing newline
@@ -226,3 +210,49 @@ func (g ScrapyGenerator) GetNextStageField(stage entity.Stage) (entity.Field, er
}
return entity.Field{}, errors.New("cannot find next stage field")
}
func (g ScrapyGenerator) GetExtractStringFromField(f entity.Field) string {
if f.Css != "" {
// CSS selector
if f.Attr == "" {
// text
return fmt.Sprintf(`css('%s::text')`, f.Css)
} else {
// attribute
return fmt.Sprintf(`css('%s::attr("%s")')`, f.Css, f.Attr)
}
} else {
// XPath selector
if f.Attr == "" {
// text
return fmt.Sprintf(`xpath('string(%s)')`, f.Xpath)
} else {
// attribute
return fmt.Sprintf(`xpath('%s/@%s')`, f.Xpath, f.Attr)
}
}
}
func (g ScrapyGenerator) GetExtractStringFromStage(stage entity.Stage) string {
// pagination element attribute, defaults to href
pageAttr := "href"
if stage.PageAttr != "" {
pageAttr = stage.PageAttr
}
if stage.PageCss != "" {
// CSS selector
return fmt.Sprintf(`css('%s::attr("%s")')`, stage.PageCss, pageAttr)
} else {
// XPath selector
return fmt.Sprintf(`xpath('%s/@%s')`, stage.PageXpath, pageAttr)
}
}
func (g ScrapyGenerator) GetListString(stage entity.Stage) string {
if stage.ListCss != "" {
return fmt.Sprintf(`css('%s')`, stage.ListCss)
} else {
return fmt.Sprintf(`xpath('%s')`, stage.ListXpath)
}
}
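
For reference, the selector mapping introduced above can be illustrated with a small standalone sketch. It uses a simplified local Field type rather than the actual crawlab entity package, and reproduces the css/xpath/attr branching of GetExtractStringFromField:

package main

import "fmt"

// Field mirrors the css/xpath/attr options of entity.Field (simplified for illustration).
type Field struct {
	Name  string
	Css   string
	Xpath string
	Attr  string
}

// extractString follows the same branching as GetExtractStringFromField above.
func extractString(f Field) string {
	if f.Css != "" {
		if f.Attr == "" {
			return fmt.Sprintf(`css('%s::text')`, f.Css)
		}
		return fmt.Sprintf(`css('%s::attr("%s")')`, f.Css, f.Attr)
	}
	if f.Attr == "" {
		return fmt.Sprintf(`xpath('string(%s)')`, f.Xpath)
	}
	return fmt.Sprintf(`xpath('%s/@%s')`, f.Xpath, f.Attr)
}

func main() {
	fmt.Println(extractString(Field{Name: "title", Css: "h3 > a"}))             // css('h3 > a::text')
	fmt.Println(extractString(Field{Name: "url", Css: "h3 > a", Attr: "href"})) // css('h3 > a::attr("href")')
	fmt.Println(extractString(Field{Name: "title", Xpath: ".//h3/a"}))          // xpath('string(.//h3/a)')
}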

View File

@@ -1,11 +1,17 @@
package model
import (
"crawlab/constants"
"crawlab/database"
"crawlab/entity"
"crawlab/utils"
"errors"
"github.com/apex/log"
"github.com/globalsign/mgo"
"github.com/globalsign/mgo/bson"
"gopkg.in/yaml.v2"
"io/ioutil"
"path/filepath"
"runtime/debug"
"time"
)
@@ -25,14 +31,18 @@ type Spider struct {
Site string `json:"site" bson:"site"` // spider website
Envs []Env `json:"envs" bson:"envs"` // environment variables
Remark string `json:"remark" bson:"remark"` // remark
Src string `json:"src" bson:"src"` // source code path
// customized spider
Src string `json:"src" bson:"src"` // source code path
Cmd string `json:"cmd" bson:"cmd"` // execution command
// configurable spider
Template string `json:"template" bson:"template"` // Spiderfile template
// frontend display
LastRunTs time.Time `json:"last_run_ts"` // last run time
LastStatus string `json:"last_status"` // last run status
LastRunTs time.Time `json:"last_run_ts"` // last run time
LastStatus string `json:"last_status"` // last run status
Config entity.ConfigSpiderData `json:"config"` // configurable spider config
// timestamps
CreateTs time.Time `json:"create_ts" bson:"create_ts"`
@@ -108,6 +118,10 @@ func GetSpiderList(filter interface{}, skip int, limit int) ([]Spider, int, erro
return spiders, 0, err
}
if spiders == nil {
spiders = []Spider{}
}
// iterate over the spider list
for i, spider := range spiders {
// get the last task
@@ -161,15 +175,25 @@ func GetSpider(id bson.ObjectId) (Spider, error) {
s, c := database.GetCol("spiders")
defer s.Close()
var result Spider
if err := c.FindId(id).One(&result); err != nil {
// get the spider
var spider Spider
if err := c.FindId(id).One(&spider); err != nil {
if err != mgo.ErrNotFound {
log.Errorf("get spider error: %s, id: %id", err.Error(), id.Hex())
debug.PrintStack()
}
return result, err
return spider, err
}
return result, nil
// if it is a configurable spider, load its config
if spider.Type == constants.Configurable && utils.Exists(filepath.Join(spider.Src, "Spiderfile")) {
config, err := GetConfigSpiderData(spider)
if err != nil {
return spider, err
}
spider.Config = config
}
return spider, nil
}
// update spider
@@ -209,10 +233,12 @@ func RemoveSpider(id bson.ObjectId) error {
s, gf := database.GetGridFs("files")
defer s.Close()
if err := gf.RemoveId(result.FileId); err != nil {
log.Error("remove file error, id:" + result.FileId.Hex())
debug.PrintStack()
return err
if result.FileId.Hex() != constants.ObjectIdNull {
if err := gf.RemoveId(result.FileId); err != nil {
log.Error("remove file error, id:" + result.FileId.Hex())
debug.PrintStack()
return err
}
}
return nil
@@ -269,3 +295,35 @@ func GetSpiderTypes() ([]*entity.SpiderType, error) {
return types, nil
}
func GetConfigSpiderData(spider Spider) (entity.ConfigSpiderData, error) {
// construct the config data
configData := entity.ConfigSpiderData{}
// validate the spider type
if spider.Type != constants.Configurable {
return configData, errors.New("not a configurable spider")
}
// Spiderfile path
sfPath := filepath.Join(spider.Src, "Spiderfile")
// read the YAML file
yamlFile, err := ioutil.ReadFile(sfPath)
if err != nil {
return configData, err
}
// deserialize
if err := yaml.Unmarshal(yamlFile, &configData); err != nil {
return configData, err
}
// assign stage_name
for stageName, stage := range configData.Stages {
stage.Name = stageName
configData.Stages[stageName] = stage
}
return configData, nil
}

View File

@@ -2,16 +2,13 @@ package routes
import (
"crawlab/constants"
"crawlab/database"
"crawlab/entity"
"crawlab/model"
"crawlab/services"
"crawlab/utils"
"fmt"
"github.com/apex/log"
"github.com/gin-gonic/gin"
"github.com/globalsign/mgo/bson"
uuid "github.com/satori/go.uuid"
"github.com/spf13/viper"
"gopkg.in/yaml.v2"
"io"
@@ -19,7 +16,7 @@ import (
"net/http"
"os"
"path/filepath"
"runtime/debug"
"strings"
)
// add a configurable spider
@@ -36,6 +33,12 @@ func PutConfigSpider(c *gin.Context) {
return
}
// template name must not be empty
if spider.Template == "" {
HandleErrorF(http.StatusBadRequest, c, "spider template should not be empty")
return
}
// check whether the spider already exists
if spider := model.GetSpiderByName(spider.Name); spider != nil {
HandleErrorF(http.StatusBadRequest, c, fmt.Sprintf("spider for '%s' already exists", spider.Name))
@@ -62,6 +65,23 @@ func PutConfigSpider(c *gin.Context) {
}
spider.Src = spiderDir
// copy the Spiderfile template
contentByte, err := ioutil.ReadFile("./template/spiderfile/Spiderfile." + spider.Template)
if err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
f, err := os.Create(filepath.Join(spider.Src, "Spiderfile"))
if err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
defer f.Close()
if _, err := f.Write(contentByte); err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
// add the spider to the database
if err := spider.Add(); err != nil {
HandleError(http.StatusInternalServerError, c, err)
@@ -100,8 +120,8 @@ func UploadConfigSpider(c *gin.Context) {
// file name must be Spiderfile
filename := header.Filename
if filename != "Spiderfile" {
HandleErrorF(http.StatusBadRequest, c, "filename must be 'Spiderfile'")
if filename != "Spiderfile" && filename != "Spiderfile.yaml" && filename != "Spiderfile.yml" {
HandleErrorF(http.StatusBadRequest, c, "filename must be 'Spiderfile(.yaml|.yml)'")
return
}
@@ -151,88 +171,146 @@ func UploadConfigSpider(c *gin.Context) {
return
}
// delete existing spider files
for _, fInfo := range utils.ListDir(spiderDir) {
// do not delete the Spiderfile
if fInfo.Name() == filename {
continue
}
// delete other files
if err := os.RemoveAll(filepath.Join(spiderDir, fInfo.Name())); err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
}
// copy spider files
tplDir := "./template/scrapy"
for _, fInfo := range utils.ListDir(tplDir) {
// skip the Spiderfile
if fInfo.Name() == "Spiderfile" {
continue
}
srcPath := filepath.Join(tplDir, fInfo.Name())
if fInfo.IsDir() {
dirPath := filepath.Join(spiderDir, fInfo.Name())
if err := utils.CopyDir(srcPath, dirPath); err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
} else {
if err := utils.CopyFile(srcPath, filepath.Join(spiderDir, fInfo.Name())); err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
}
}
// modify spider files
if err := services.GenerateConfigSpiderFiles(spider, configData); err != nil {
// process spider files based on the deserialized config data
if err := services.ProcessSpiderFilesFromConfigData(spider, configData); err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
// package into a zip file
files, err := utils.GetFilesFromDir(spiderDir)
if err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
randomId := uuid.NewV4()
tmpFilePath := filepath.Join(viper.GetString("other.tmppath"), spider.Name+"."+randomId.String()+".zip")
spiderZipFileName := spider.Name + ".zip"
if err := utils.Compress(files, tmpFilePath); err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
// get the GridFS instance
s, gf := database.GetGridFs("files")
defer s.Close()
// check whether the file already exists
var gfFile model.GridFs
if err := gf.Find(bson.M{"filename": spiderZipFileName}).One(&gfFile); err == nil {
// if the file already exists, delete it
_ = gf.RemoveId(gfFile.Id)
}
// upload to GridFS
fid, err := services.UploadToGridFs(spiderZipFileName, tmpFilePath)
if err != nil {
log.Errorf("upload to grid fs error: %s", err.Error())
debug.PrintStack()
return
}
// save the spider FileId
spider.FileId = fid
_ = spider.Save()
c.JSON(http.StatusOK, Response{
Status: "ok",
Message: "success",
})
}
func PostConfigSpiderSpiderfile(c *gin.Context) {
type Body struct {
Content string `json:"content"`
}
id := c.Param("id")
// file content
var reqBody Body
if err := c.ShouldBindJSON(&reqBody); err != nil {
HandleError(http.StatusBadRequest, c, err)
return
}
content := reqBody.Content
// get the spider
var spider model.Spider
spider, err := model.GetSpider(bson.ObjectIdHex(id))
if err != nil {
HandleErrorF(http.StatusBadRequest, c, fmt.Sprintf("cannot find spider (id: %s)", id))
return
}
// deserialize
var configData entity.ConfigSpiderData
if err := yaml.Unmarshal([]byte(content), &configData); err != nil {
HandleError(http.StatusBadRequest, c, err)
return
}
// validate configData
if err := services.ValidateSpiderfile(configData); err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
// write the file
if err := ioutil.WriteFile(filepath.Join(spider.Src, "Spiderfile"), []byte(content), os.ModePerm); err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
// process spider files based on the deserialized config data
if err := services.ProcessSpiderFilesFromConfigData(spider, configData); err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
c.JSON(http.StatusOK, Response{
Status: "ok",
Message: "success",
})
}
func PostConfigSpiderConfig(c *gin.Context) {
id := c.Param("id")
// get the spider
var spider model.Spider
spider, err := model.GetSpider(bson.ObjectIdHex(id))
if err != nil {
HandleErrorF(http.StatusBadRequest, c, fmt.Sprintf("cannot find spider (id: %s)", id))
return
}
// deserialize the config data
var configData entity.ConfigSpiderData
if err := c.ShouldBindJSON(&configData); err != nil {
HandleError(http.StatusBadRequest, c, err)
return
}
// validate configData
if err := services.ValidateSpiderfile(configData); err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
// replace the Spiderfile
if err := services.GenerateSpiderfileFromConfigData(spider, configData); err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
// process spider files based on the deserialized config data
if err := services.ProcessSpiderFilesFromConfigData(spider, configData); err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
c.JSON(http.StatusOK, Response{
Status: "ok",
Message: "success",
})
}
func GetConfigSpiderConfig(c *gin.Context) {
id := c.Param("id")
// validate the ID
if !bson.IsObjectIdHex(id) {
HandleErrorF(http.StatusBadRequest, c, "invalid id")
}
// get the spider
spider, err := model.GetSpider(bson.ObjectIdHex(id))
if err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
c.JSON(http.StatusOK, Response{
Status: "ok",
Message: "success",
Data: spider.Config,
})
}
// get the template name list
func GetConfigSpiderTemplateList(c *gin.Context) {
var data []string
for _, fInfo := range utils.ListDir("./template/spiderfile") {
templateName := strings.Replace(fInfo.Name(), "Spiderfile.", "", -1)
data = append(data, templateName)
}
c.JSON(http.StatusOK, Response{
Status: "ok",
Message: "success",
Data: data,
})
}
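
As a rough illustration of how the new Spiderfile endpoint might be called, here is a minimal Go client sketch; the base URL, spider id, token and YAML content are placeholders and depend on the deployment (the route sits behind the authenticated group registered in main.go):

package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

func main() {
	// Placeholder values: adjust host/prefix, spider id and token to your deployment.
	url := "http://localhost:8000/config_spiders/5de7c0f7e0c0a10010e0b001/spiderfile"
	payload, err := json.Marshal(map[string]string{
		"content": "version: \"0.4.0\"\nengine: scrapy\nstart_url: http://books.toscrape.com\nstart_stage: list\n",
	})
	if err != nil {
		panic(err)
	}
	req, err := http.NewRequest(http.MethodPost, url, bytes.NewReader(payload))
	if err != nil {
		panic(err)
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Authorization", "<token>") // placeholder auth token
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	fmt.Println(resp.Status)
}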

View File

@@ -34,7 +34,7 @@ func GetSpiderList(c *gin.Context) {
"name": bson.M{"$regex": bson.RegEx{Pattern: keyword, Options: "im"}},
}
if t != "" {
if t != "" && t != "all" {
filter["type"] = t
}

View File

@@ -2,11 +2,20 @@ package services
import (
"crawlab/constants"
"crawlab/database"
"crawlab/entity"
"crawlab/model"
"crawlab/model/config_spider"
"crawlab/utils"
"errors"
"fmt"
"github.com/apex/log"
"github.com/globalsign/mgo/bson"
uuid "github.com/satori/go.uuid"
"github.com/spf13/viper"
"gopkg.in/yaml.v2"
"os"
"path/filepath"
"strings"
)
@@ -37,12 +46,17 @@ func ValidateSpiderfile(configData entity.ConfigSpiderData) error {
// validate that start_url exists
if configData.StartUrl == "" {
return errors.New("spiderfile start_url is empty")
return errors.New("spiderfile invalid: start_url is empty")
}
// validate that start_stage exists
if configData.StartStage == "" {
return errors.New("spiderfile invalid: start_stage is empty")
}
// validate that stages exist
if len(configData.Stages) == 0 {
return errors.New("spiderfile stages is empty")
return errors.New("spiderfile invalid: stages is empty")
}
// validate stages
@@ -50,56 +64,74 @@ func ValidateSpiderfile(configData entity.ConfigSpiderData) error {
for stageName, stage := range configData.Stages {
// stage name must not be empty
if stageName == "" {
return errors.New("spiderfile stage name is empty")
return errors.New("spiderfile invalid: stage name is empty")
}
// stage name must not be a reserved string
// NOTE: other engines can be added later; the default is Scrapy
if configData.Engine == "" || configData.Engine == constants.EngineScrapy {
if strings.Contains(constants.ScrapyProtectedStageNames, stageName) {
return errors.New(fmt.Sprintf("spiderfile stage name '%s' is protected", stageName))
return errors.New(fmt.Sprintf("spiderfile invalid: stage name '%s' is protected", stageName))
}
} else if configData.Engine == constants.EngineColly {
return errors.New(fmt.Sprintf("engine '%s' is not implemented", stageName))
} else {
return errors.New(fmt.Sprintf("spiderfile invalid: engine '%s' is not implemented", configData.Engine))
}
// stage names must not be duplicated
if dict[stageName] == 1 {
return errors.New("spiderfile stage name should be unique")
return errors.New(fmt.Sprintf("spiderfile invalid: stage name '%s' is duplicated", stageName))
}
dict[stageName] = 1
// stage fields must not be empty
if len(stage.Fields) == 0 {
return errors.New(fmt.Sprintf("spiderfile stage '%s' has no fields", stageName))
return errors.New(fmt.Sprintf("spiderfile invalid: stage '%s' has no fields", stageName))
}
// a stage may have only one next stage
// whether a next_stage has been seen
hasNextStage := false
// iterate over the field list
for _, field := range stage.Fields {
// a stage may have only one next_stage
if field.NextStage != "" {
if hasNextStage {
return errors.New("spiderfile stage fields should have only 1 next_stage")
return errors.New(fmt.Sprintf("spiderfile invalid: stage '%s' has more than 1 next_stage", stageName))
}
hasNextStage = true
}
// a field may set only one of css and xpath
if field.Css != "" && field.Xpath != "" {
return errors.New(fmt.Sprintf("spiderfile invalid: field '%s' in stage '%s' has both css and xpath set which is prohibited", field.Name, stageName))
}
}
// a stage may set only one of page_css and page_xpath
if stage.PageCss != "" && stage.PageXpath != "" {
return errors.New(fmt.Sprintf("spiderfile invalid: stage '%s' has both page_css and page_xpath set which is prohibited", stageName))
}
// a stage may set only one of list_css and list_xpath
if stage.ListCss != "" && stage.ListXpath != "" {
return errors.New(fmt.Sprintf("spiderfile invalid: stage '%s' has both list_css and list_xpath set which is prohibited", stageName))
}
// error if is_list is true but no list selector is set
if stage.IsList && stage.ListCss == "" {
return errors.New("spiderfile stage with is_list = true should have list_css being set")
if stage.IsList && (stage.ListCss == "" && stage.ListXpath == "") {
return errors.New("spiderfile invalid: stage with is_list = true should have either list_css or list_xpath being set")
}
}
// validate field uniqueness
if !IsUniqueConfigSpiderFields(fields) {
return errors.New("spiderfile fields not unique")
return errors.New("spiderfile invalid: fields not unique")
}
// field names must not be reserved strings
for _, field := range fields {
if strings.Contains(constants.ScrapyProtectedFieldNames, field.Name) {
return errors.New(fmt.Sprintf("spiderfile field name '%s' is protected", field.Name))
return errors.New(fmt.Sprintf("spiderfile invalid: field name '%s' is protected", field.Name))
}
}
@@ -116,3 +148,118 @@ func IsUniqueConfigSpiderFields(fields []entity.Field) bool {
}
return true
}
func ProcessSpiderFilesFromConfigData(spider model.Spider, configData entity.ConfigSpiderData) error {
spiderDir := spider.Src
// assign stage_name
for stageName, stage := range configData.Stages {
stage.Name = stageName
configData.Stages[stageName] = stage
}
// delete existing spider files
for _, fInfo := range utils.ListDir(spiderDir) {
// do not delete the Spiderfile
if fInfo.Name() == "Spiderfile" {
continue
}
// delete other files
if err := os.RemoveAll(filepath.Join(spiderDir, fInfo.Name())); err != nil {
return err
}
}
// copy spider files
tplDir := "./template/scrapy"
for _, fInfo := range utils.ListDir(tplDir) {
// skip the Spiderfile
if fInfo.Name() == "Spiderfile" {
continue
}
srcPath := filepath.Join(tplDir, fInfo.Name())
if fInfo.IsDir() {
dirPath := filepath.Join(spiderDir, fInfo.Name())
if err := utils.CopyDir(srcPath, dirPath); err != nil {
return err
}
} else {
if err := utils.CopyFile(srcPath, filepath.Join(spiderDir, fInfo.Name())); err != nil {
return err
}
}
}
// modify spider files
if err := GenerateConfigSpiderFiles(spider, configData); err != nil {
return err
}
// package into a zip file
files, err := utils.GetFilesFromDir(spiderDir)
if err != nil {
return err
}
randomId := uuid.NewV4()
tmpFilePath := filepath.Join(viper.GetString("other.tmppath"), spider.Name+"."+randomId.String()+".zip")
spiderZipFileName := spider.Name + ".zip"
if err := utils.Compress(files, tmpFilePath); err != nil {
return err
}
// get the GridFS instance
s, gf := database.GetGridFs("files")
defer s.Close()
// check whether the file already exists
var gfFile model.GridFs
if err := gf.Find(bson.M{"filename": spiderZipFileName}).One(&gfFile); err == nil {
// if the file already exists, delete it
_ = gf.RemoveId(gfFile.Id)
}
// upload to GridFS
fid, err := UploadToGridFs(spiderZipFileName, tmpFilePath)
if err != nil {
log.Errorf("upload to grid fs error: %s", err.Error())
return err
}
// save the spider FileId
spider.FileId = fid
_ = spider.Save()
return nil
}
func GenerateSpiderfileFromConfigData(spider model.Spider, configData entity.ConfigSpiderData) error {
// Spiderfile path
sfPath := filepath.Join(spider.Src, "Spiderfile")
// generate the YAML content
sfContentByte, err := yaml.Marshal(configData)
if err != nil {
return err
}
// open the file
var f *os.File
if utils.Exists(sfPath) {
f, err = os.OpenFile(sfPath, os.O_WRONLY|os.O_TRUNC, 0777)
} else {
f, err = os.OpenFile(sfPath, os.O_CREATE, 0777)
}
if err != nil {
return err
}
defer f.Close()
// write the content
if _, err := f.Write(sfContentByte); err != nil {
return err
}
return nil
}
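
A minimal sketch of the YAML round trip that GenerateSpiderfileFromConfigData relies on, using simplified stand-in types with the same yaml tags rather than the project's entity package:

package main

import (
	"fmt"

	"gopkg.in/yaml.v2"
)

// Simplified stand-ins for entity.ConfigSpiderData / entity.Stage / entity.Field.
type Field struct {
	Name string `yaml:"name"`
	Css  string `yaml:"css,omitempty"`
	Attr string `yaml:"attr,omitempty"`
}

type Stage struct {
	IsList  bool    `yaml:"is_list"`
	ListCss string  `yaml:"list_css"`
	Fields  []Field `yaml:"fields"`
}

type ConfigSpiderData struct {
	Version    string           `yaml:"version"`
	Engine     string           `yaml:"engine"`
	StartUrl   string           `yaml:"start_url"`
	StartStage string           `yaml:"start_stage"`
	Stages     map[string]Stage `yaml:"stages"`
}

func main() {
	data := ConfigSpiderData{
		Version:    "0.4.0",
		Engine:     "scrapy",
		StartUrl:   "http://books.toscrape.com",
		StartStage: "list",
		Stages: map[string]Stage{
			"list": {
				IsList:  true,
				ListCss: "section article.product_pod",
				Fields:  []Field{{Name: "title", Css: "h3 > a"}},
			},
		},
	}
	// yaml.Marshal is the same call used when writing the Spiderfile content.
	out, err := yaml.Marshal(data)
	if err != nil {
		panic(err)
	}
	fmt.Print(string(out))
}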

View File

@@ -116,12 +116,15 @@ func PublishAllSpiders() {
// publish a spider
func PublishSpider(spider model.Spider) {
// if the GridFS file cannot be found, mark the spider file as missing
gfFile := model.GetGridFs(spider.FileId)
if gfFile == nil {
spider.FileId = constants.ObjectIdNull
_ = spider.Save()
return
var gfFile *model.GridFs
if spider.FileId.Hex() != constants.ObjectIdNull {
// if the GridFS file cannot be found, mark the spider file as missing
gfFile = model.GetGridFs(spider.FileId)
if gfFile == nil {
spider.FileId = constants.ObjectIdNull
_ = spider.Save()
return
}
}
// if FileId is empty, the spider has not been uploaded to GridFS yet, so skip it

View File

@@ -10,6 +10,7 @@ import (
"github.com/spf13/viper"
"io"
"os"
"os/exec"
"path/filepath"
"runtime/debug"
)
@@ -99,7 +100,6 @@ func (s *SpiderSync) Download() {
// create a temporary file
tmpFilePath := filepath.Join(tmpPath, randomId.String()+".zip")
tmpFile := utils.OpenFile(tmpFilePath)
defer utils.Close(tmpFile)
// write the file into the temporary file
if _, err := io.Copy(tmpFile, f); err != nil {
@@ -119,6 +119,15 @@ func (s *SpiderSync) Download() {
return
}
// recursively change the permissions of the target directory
// fixes the issue where the log file cannot be created when LOG_ENABLED and LOG_FILE are enabled in scrapy settings
cmd := exec.Command("chmod", "-R", "777", dstPath)
if err := cmd.Run(); err != nil {
log.Errorf(err.Error())
debug.PrintStack()
return
}
// close the temporary file
if err := tmpFile.Close(); err != nil {
log.Errorf(err.Error())

View File

@@ -226,12 +226,18 @@ func ExecuteShellCmd(cmdStr string, cwd string, t model.Task, s model.Spider) (e
// environment variable configuration
envs := s.Envs
if s.Type == constants.Configurable {
// database configuration
envs = append(envs, model.Env{Name: "CRAWLAB_MONGO_HOST", Value: viper.GetString("mongo.host")})
envs = append(envs, model.Env{Name: "CRAWLAB_MONGO_PORT", Value: viper.GetString("mongo.port")})
envs = append(envs, model.Env{Name: "CRAWLAB_MONGO_DB", Value: viper.GetString("mongo.db")})
envs = append(envs, model.Env{Name: "CRAWLAB_MONGO_USERNAME", Value: viper.GetString("mongo.username")})
envs = append(envs, model.Env{Name: "CRAWLAB_MONGO_PASSWORD", Value: viper.GetString("mongo.password")})
envs = append(envs, model.Env{Name: "CRAWLAB_MONGO_AUTHSOURCE", Value: viper.GetString("mongo.authSource")})
// settings configuration
for envName, envValue := range s.Config.Settings {
envs = append(envs, model.Env{Name: "CRAWLAB_SETTING_" + envName, Value: envValue})
}
}
cmd = SetEnv(cmd, envs, t.Id, s.Col)
@@ -311,9 +317,12 @@ func SaveTaskResultCount(id string) func() {
// execute a task
func ExecuteTask(id int) {
if flag, _ := LockList.Load(id); flag.(bool) {
log.Debugf(GetWorkerPrefix(id) + "task is already executing...")
return
if flag, ok := LockList.Load(id); ok {
if flag.(bool) {
log.Debugf(GetWorkerPrefix(id) + "task is already executing...")
return
}
}
// acquire the lock
@@ -477,6 +486,29 @@ func GetTaskLog(id string) (logStr string, err error) {
}
if IsMasterNode(task.NodeId.Hex()) {
if !utils.Exists(task.LogPath) {
fileDir, err := MakeLogDir(task)
if err != nil {
log.Errorf(err.Error())
}
fileP := GetLogFilePaths(fileDir)
// get the log file path
fLog, err := os.Create(fileP)
defer fLog.Close()
if err != nil {
log.Errorf("create task log file error: %s", fileP)
debug.PrintStack()
}
task.LogPath = fileP
if err := task.Save(); err != nil {
log.Errorf(err.Error())
debug.PrintStack()
}
}
// if this is the master node, get the local log
logBytes, err := model.GetLocalLog(task.LogPath)
if err != nil {

View File

@@ -1,4 +1,7 @@
# -*- coding: utf-8 -*-
import os
import re
import json
# Scrapy settings for config_spider project
#
@@ -9,14 +12,14 @@
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'config_spider'
BOT_NAME = 'Crawlab Configurable Spider'
SPIDER_MODULES = ['config_spider.spiders']
NEWSPIDER_MODULE = 'config_spider.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'config_spider (+http://www.yourdomain.com)'
USER_AGENT = 'Crawlab Spider'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
@@ -88,3 +91,21 @@ ITEM_PIPELINES = {
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]:
setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '')
setting_value = os.environ.get(setting_env_name)
if setting_value.lower() == 'true':
setting_value = True
elif setting_value.lower() == 'false':
setting_value = False
elif re.search(r'^\d+$', setting_value) is not None:
setting_value = int(setting_value)
elif re.search(r'^\{.*\}$', setting_value.strip()) is not None:
setting_value = json.loads(setting_value)
elif re.search(r'^\[.*\]$', setting_value.strip()) is not None:
setting_value = json.loads(setting_value)
else:
pass
locals()[setting_name] = setting_value

View File

@@ -0,0 +1,20 @@
version: "0.4.0"
name: "toscrapy_books"
start_url: "http://news.163.com/special/0001386F/rank_news.html"
start_stage: "list"
engine: "scrapy"
stages:
list:
is_list: true
list_css: "table tr:not(:first-child)"
fields:
- name: "title"
css: "td:nth-child(1) > a"
- name: "url"
css: "td:nth-child(1) > a"
attr: "href"
- name: "clicks"
css: "td.cBlue"
settings:
ROBOTSTXT_OBEY: false
USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36

View File

@@ -0,0 +1,22 @@
version: 0.4.0
name: toscrapy_books
start_url: http://www.baidu.com/s?wd=crawlab
start_stage: list
engine: scrapy
stages:
list:
is_list: true
list_xpath: //*[contains(@class, "c-container")]
page_xpath: //*[@id="page"]//a[@class="n"][last()]
page_attr: href
fields:
- name: title
xpath: .//h3/a
- name: url
xpath: .//h3/a
attr: href
- name: abstract
xpath: .//*[@class="c-abstract"]
settings:
ROBOTSTXT_OBEY: false
USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36

View File

@@ -5,10 +5,10 @@ start_stage: "list"
engine: "scrapy"
stages:
list:
is_list: true # default: false
is_list: true
list_css: "section article.product_pod"
page_css: "ul.pager li.next a"
page_attr: "href" # default: href
page_attr: "href"
fields:
- name: "title"
css: "h3 > a"
@@ -23,3 +23,6 @@ stages:
fields:
- name: "description"
css: "#product_description + p"
settings:
ROBOTSTXT_OBEY: true
AUTOTHROTTLE_ENABLED: true

View File

@@ -167,7 +167,6 @@ func DeCompress(srcFile *os.File, dstPath string) error {
debug.PrintStack()
continue
}
defer Close(newFile)
// copy the file into the new file
if _, err := io.Copy(newFile, srcFile); err != nil {