From 7003951561cca316619c7f6ee3f7b1fb85f1d32a Mon Sep 17 00:00:00 2001 From: marvzhang Date: Sun, 24 Nov 2019 12:20:44 +0800 Subject: [PATCH] fixed https://github.com/crawlab-team/crawlab/issues/315 --- backend/constants/spider.go | 1 + backend/entity/config_spider.go | 22 ++++++ backend/main.go | 6 +- backend/model/spider.go | 28 +++----- backend/routes/config_spider.go | 119 ++++++++++++++++++++++++++------ backend/services/spider.go | 19 +++-- 6 files changed, 148 insertions(+), 47 deletions(-) create mode 100644 backend/entity/config_spider.go diff --git a/backend/constants/spider.go b/backend/constants/spider.go index b4b7f65e..5119aa67 100644 --- a/backend/constants/spider.go +++ b/backend/constants/spider.go @@ -3,4 +3,5 @@ package constants const ( Customized = "customized" Configurable = "configurable" + Plugin = "plugin" ) diff --git a/backend/entity/config_spider.go b/backend/entity/config_spider.go new file mode 100644 index 00000000..d9b4932f --- /dev/null +++ b/backend/entity/config_spider.go @@ -0,0 +1,22 @@ +package entity + +type Field struct { + Name string `yaml:"name" json:"name"` + Css string `yaml:"css" json:"css"` + Xpath string `yaml:"xpath" json:"xpath"` + Attr string `yaml:"attr" json:"attr"` + Stage string `yaml:"stage" json:"stage"` +} + +type Stage struct { + List bool `yaml:"list" json:"list"` + Css string `yaml:"css" json:"css"` + Xpath string `yaml:"xpath" json:"xpath"` + Fields []Field `yaml:"fields" json:"fields"` +} + +type ConfigSpiderData struct { + Version string `yaml:"version" json:"version"` + StartUrl string `yaml:"startUrl" json:"start_url"` + Stages map[string]Stage `yaml:"stages" json:"stages"` +} diff --git a/backend/main.go b/backend/main.go index f4ec47a1..565c7892 100644 --- a/backend/main.go +++ b/backend/main.go @@ -129,7 +129,7 @@ func main() { // 爬虫 authGroup.GET("/spiders", routes.GetSpiderList) // 爬虫列表 authGroup.GET("/spiders/:id", routes.GetSpider) // 爬虫详情 - authGroup.POST("/spiders", routes.PutSpider) // 上传爬虫 + authGroup.POST("/spiders", routes.PutSpider) // 上传爬虫 TODO: 名称不对 authGroup.POST("/spiders/:id", routes.PostSpider) // 修改爬虫 authGroup.POST("/spiders/:id/publish", routes.PublishSpider) // 发布爬虫 authGroup.DELETE("/spiders/:id", routes.DeleteSpider) // 删除爬虫 @@ -140,7 +140,9 @@ func main() { authGroup.GET("/spiders/:id/stats", routes.GetSpiderStats) // 爬虫统计数据 authGroup.GET("/spider/types", routes.GetSpiderTypes) // 爬虫类型 // 可配置爬虫 - authGroup.PUT("/config_spiders", routes.PutConfigSpider) // 添加可配置爬虫 + authGroup.PUT("/config_spiders", routes.PutConfigSpider) // 添加可配置爬虫 + authGroup.POST("/config_spiders/:id", routes.PostConfigSpider) // 修改可配置爬虫 + authGroup.POST("/config_spiders/:id/upload", routes.UploadConfigSpider) // 上传可配置爬虫 // 任务 authGroup.GET("/tasks", routes.GetTaskList) // 任务列表 authGroup.GET("/tasks/:id", routes.GetTask) // 任务详情 diff --git a/backend/model/spider.go b/backend/model/spider.go index 5c2c92e8..53c5ab1f 100644 --- a/backend/model/spider.go +++ b/backend/model/spider.go @@ -25,6 +25,7 @@ type Spider struct { Site string `json:"site" bson:"site"` // 爬虫网站 Envs []Env `json:"envs" bson:"envs"` // 环境变量 Remark string `json:"remark" bson:"remark"` // 备注 + // 自定义爬虫 Src string `json:"src" bson:"src"` // 源码位置 Cmd string `json:"cmd" bson:"cmd"` // 执行命令 @@ -33,17 +34,7 @@ type Spider struct { LastRunTs time.Time `json:"last_run_ts"` // 最后一次执行时间 LastStatus string `json:"last_status"` // 最后执行状态 - // TODO: 可配置爬虫 - //Fields []interface{} `json:"fields"` - //DetailFields []interface{} `json:"detail_fields"` - //CrawlType string `json:"crawl_type"` - //StartUrl string `json:"start_url"` - //UrlPattern string `json:"url_pattern"` - //ItemSelector string `json:"item_selector"` - //ItemSelectorType string `json:"item_selector_type"` - //PaginationSelector string `json:"pagination_selector"` - //PaginationSelectorType string `json:"pagination_selector_type"` - + // 时间 CreateTs time.Time `json:"create_ts" bson:"create_ts"` UpdateTs time.Time `json:"update_ts" bson:"update_ts"` } @@ -98,13 +89,14 @@ func (spider *Spider) GetLastTask() (Task, error) { return tasks[0], nil } +// 删除爬虫 func (spider *Spider) Delete() error { s, c := database.GetCol("spiders") defer s.Close() return c.RemoveId(spider.Id) } -// 爬虫列表 +// 获取爬虫列表 func GetSpiderList(filter interface{}, skip int, limit int) ([]Spider, int, error) { s, c := database.GetCol("spiders") defer s.Close() @@ -136,7 +128,7 @@ func GetSpiderList(filter interface{}, skip int, limit int) ([]Spider, int, erro return spiders, count, nil } -// 获取爬虫 +// 获取爬虫(根据FileId) func GetSpiderByFileId(fileId bson.ObjectId) *Spider { s, c := database.GetCol("spiders") defer s.Close() @@ -150,7 +142,7 @@ func GetSpiderByFileId(fileId bson.ObjectId) *Spider { return result } -// 获取爬虫 +// 获取爬虫(根据名称) func GetSpiderByName(name string) *Spider { s, c := database.GetCol("spiders") defer s.Close() @@ -158,13 +150,13 @@ func GetSpiderByName(name string) *Spider { var result *Spider if err := c.Find(bson.M{"name": name}).One(&result); err != nil { log.Errorf("get spider error: %s, spider_name: %s", err.Error(), name) - debug.PrintStack() + //debug.PrintStack() return nil } return result } -// 获取爬虫 +// 获取爬虫(根据ID) func GetSpider(id bson.ObjectId) (Spider, error) { s, c := database.GetCol("spiders") defer s.Close() @@ -245,7 +237,7 @@ func RemoveAllSpider() error { return nil } -// 爬虫总数 +// 获取爬虫总数 func GetSpiderCount() (int, error) { s, c := database.GetCol("spiders") defer s.Close() @@ -257,7 +249,7 @@ func GetSpiderCount() (int, error) { return count, nil } -// 爬虫类型 +// 获取爬虫类型 func GetSpiderTypes() ([]*entity.SpiderType, error) { s, c := database.GetCol("spiders") defer s.Close() diff --git a/backend/routes/config_spider.go b/backend/routes/config_spider.go index 6fa2927b..55d164b9 100644 --- a/backend/routes/config_spider.go +++ b/backend/routes/config_spider.go @@ -1,39 +1,112 @@ package routes import ( + "crawlab/constants" + "crawlab/entity" + "crawlab/model" + "crawlab/utils" + "fmt" + "github.com/apex/log" "github.com/gin-gonic/gin" + "github.com/globalsign/mgo/bson" + uuid "github.com/satori/go.uuid" + "github.com/spf13/viper" "gopkg.in/yaml.v2" + "io" "io/ioutil" "net/http" + "os" + "path/filepath" + "runtime/debug" ) -type Field struct { - Name string `yaml:"name" json:"name"` - Css string `yaml:"css" json:"css"` - Xpath string `yaml:"xpath" json:"xpath"` - Attr string `yaml:"attr" json:"attr"` - Stage string `yaml:"stage" json:"stage"` -} - -type Stage struct { - List bool `yaml:"list" json:"list"` - Css string `yaml:"css" json:"css"` - Xpath string `yaml:"xpath" json:"xpath"` - Fields []Field `yaml:"fields" json:"fields"` -} - -type ConfigSpiderData struct { - Version string `yaml:"version" json:"version"` - StartUrl string `yaml:"startUrl" json:"start_url"` - Stages map[string]Stage `yaml:"stages" json:"stages"` -} - +// 添加可配置爬虫 func PutConfigSpider(c *gin.Context) { + var spider model.Spider + if err := c.ShouldBindJSON(&spider); err != nil { + HandleError(http.StatusBadRequest, c, err) + return + } + + // 爬虫名称不能为空 + if spider.Name == "" { + HandleErrorF(http.StatusBadRequest, c, "spider name should not be empty") + return + } + + // 判断爬虫是否存在 + if spider := model.GetSpiderByName(spider.Name); spider != nil { + HandleErrorF(http.StatusBadRequest, c, fmt.Sprintf("spider for '%s' already exists", spider.Name)) + return + } + + // 设置爬虫类别 + spider.Type = constants.Configurable + + // 将FileId置空 + spider.FileId = bson.ObjectIdHex(constants.ObjectIdNull) + + // 添加爬虫到数据库 + if err := spider.Add(); err != nil { + HandleError(http.StatusInternalServerError, c, err) + return + } + + c.JSON(http.StatusOK, Response{ + Status: "ok", + Message: "success", + Data: spider, + }) +} + +// 更改可配置爬虫 +func PostConfigSpider(c *gin.Context) { + PostSpider(c) +} + +func UploadConfigSpider(c *gin.Context) { + // 获取上传文件 + file, header, err := c.Request.FormFile("file") + if err != nil { + HandleError(http.StatusBadRequest, c, err) + return + } + + // 文件名称必须为Spiderfile + filename := header.Filename + if filename != "Spiderfile" { + HandleErrorF(http.StatusBadRequest, c, "filename must be 'Spiderfile'") + return + } + + // 以防tmp目录不存在 + tmpPath := viper.GetString("other.tmppath") + if !utils.Exists(tmpPath) { + if err := os.MkdirAll(tmpPath, os.ModePerm); err != nil { + log.Error("mkdir other.tmppath dir error:" + err.Error()) + debug.PrintStack() + HandleError(http.StatusBadRequest, c, err) + return + } + } + + //创建文件 + randomId := uuid.NewV4() + tmpFilePath := filepath.Join(tmpPath, "Spiderfile."+randomId.String()) + out, err := os.Create(tmpFilePath) + if err != nil { + } + _, err = io.Copy(out, file) + if err != nil { + HandleError(http.StatusInternalServerError, c, err) + } + _ = out.Close() + // 构造配置数据 - data := ConfigSpiderData{} + data := entity.ConfigSpiderData{} // 读取YAML文件 - yamlFile, err := ioutil.ReadFile("./template/Spiderfile") + yamlFile, err := ioutil.ReadFile(tmpFilePath) if err != nil { HandleError(http.StatusInternalServerError, c, err) return diff --git a/backend/services/spider.go b/backend/services/spider.go index 84d218bb..e6e1c9c4 100644 --- a/backend/services/spider.go +++ b/backend/services/spider.go @@ -116,12 +116,23 @@ func PublishAllSpiders() { // 发布爬虫 func PublishSpider(spider model.Spider) { - // 查询gf file,不存在则删除 - gfFile := model.GetGridFs(spider.FileId) - if gfFile == nil { - _ = model.RemoveSpider(spider.Id) + // 查询gf file,不存在则标记为爬虫文件不存在 + var gfFile *model.GridFs + if spider.Type == constants.Customized { + gfFile = model.GetGridFs(spider.FileId) + if gfFile == nil { + spider.FileId = constants.ObjectIdNull + _ = spider.Save() + return + } + } + + // 如果FileId为空,表示还没有上传爬虫到GridFS,则跳过 + if spider.FileId == bson.ObjectIdHex(constants.ObjectIdNull) { return } + + // 获取爬虫同步实例 spiderSync := spider_handler.SpiderSync{ Spider: spider, }