diff --git a/backend/main.go b/backend/main.go
index 436c537e..72ab2c25 100644
--- a/backend/main.go
+++ b/backend/main.go
@@ -140,10 +140,11 @@ func main() {
 		authGroup.GET("/spiders/:id/stats", routes.GetSpiderStats) // spider stats
 		authGroup.GET("/spider/types", routes.GetSpiderTypes)      // spider types
 		// configurable spiders
-		authGroup.GET("/config_spiders/:id/config", routes.GetConfigSpiderConfig) // configurable spider config
-		authGroup.PUT("/config_spiders", routes.PutConfigSpider)                  // add configurable spider
-		authGroup.POST("/config_spiders/:id", routes.PostConfigSpider)            // modify configurable spider
-		authGroup.POST("/config_spiders/:id/upload", routes.UploadConfigSpider)   // upload configurable spider
+		authGroup.GET("/config_spiders/:id/config", routes.GetConfigSpiderConfig)   // get configurable spider config
+		authGroup.POST("/config_spiders/:id/config", routes.PostConfigSpiderConfig) // update configurable spider config
+		authGroup.PUT("/config_spiders", routes.PutConfigSpider)                    // add configurable spider
+		authGroup.POST("/config_spiders/:id", routes.PostConfigSpider)              // modify configurable spider
+		authGroup.POST("/config_spiders/:id/upload", routes.UploadConfigSpider)     // upload configurable spider
 		// tasks
 		authGroup.GET("/tasks", routes.GetTaskList) // task list
 		authGroup.GET("/tasks/:id", routes.GetTask) // task detail
diff --git a/backend/model/config_spider/scrapy.go b/backend/model/config_spider/scrapy.go
index 9da6fb89..62c5cd78 100644
--- a/backend/model/config_spider/scrapy.go
+++ b/backend/model/config_spider/scrapy.go
@@ -216,19 +216,19 @@ func (g ScrapyGenerator) GetExtractStringFromField(f entity.Field) string {
 		// If CSS
 		if f.Attr == "" {
 			// text
-			return fmt.Sprintf(`css(%s::text())`, f.Css)
+			return fmt.Sprintf(`css('%s::text()')`, f.Css)
 		} else {
 			// attribute
-			return fmt.Sprintf(`css(%s::attr("%s"))`, f.Css, f.Attr)
+			return fmt.Sprintf(`css('%s::attr("%s")')`, f.Css, f.Attr)
 		}
 	} else {
 		// If XPath
 		if f.Attr == "" {
 			// text
-			return fmt.Sprintf(`xpath(%s/text())`, f.Xpath)
+			return fmt.Sprintf(`xpath('%s/text()')`, f.Xpath)
 		} else {
 			// attribute
-			return fmt.Sprintf(`xpath(%s/@%s)`, f.Xpath, f.Attr)
+			return fmt.Sprintf(`xpath('%s/@%s')`, f.Xpath, f.Attr)
 		}
 	}
 }
diff --git a/backend/routes/config_spider.go b/backend/routes/config_spider.go
index cdd40607..1d10335e 100644
--- a/backend/routes/config_spider.go
+++ b/backend/routes/config_spider.go
@@ -2,16 +2,13 @@ package routes
 
 import (
 	"crawlab/constants"
-	"crawlab/database"
 	"crawlab/entity"
 	"crawlab/model"
 	"crawlab/services"
 	"crawlab/utils"
 	"fmt"
-	"github.com/apex/log"
 	"github.com/gin-gonic/gin"
 	"github.com/globalsign/mgo/bson"
-	uuid "github.com/satori/go.uuid"
 	"github.com/spf13/viper"
 	"gopkg.in/yaml.v2"
 	"io"
@@ -19,7 +16,6 @@ import (
 	"net/http"
 	"os"
 	"path/filepath"
-	"runtime/debug"
 )
 
 // Add configurable spider
@@ -151,91 +147,39 @@ func UploadConfigSpider(c *gin.Context) {
 		return
 	}
 
-	// Assign stage_name
-	for stageName, stage := range configData.Stages {
-		stage.Name = stageName
-		configData.Stages[stageName] = stage
-	}
-
-	// Delete existing spider files
-	for _, fInfo := range utils.ListDir(spiderDir) {
-		// Don't delete the Spiderfile
-		if fInfo.Name() == filename {
-			continue
-		}
-
-		// Delete other files
-		if err := os.RemoveAll(filepath.Join(spiderDir, fInfo.Name())); err != nil {
-			HandleError(http.StatusInternalServerError, c, err)
-			return
-		}
-	}
-
-	// Copy spider template files
-	tplDir := "./template/scrapy"
-	for _, fInfo := range utils.ListDir(tplDir) {
-		// Skip the Spiderfile
-		if fInfo.Name() == "Spiderfile" {
-			continue
-		}
-
-		srcPath := filepath.Join(tplDir, fInfo.Name())
-		if fInfo.IsDir() {
-			dirPath := filepath.Join(spiderDir, fInfo.Name())
-			if err := utils.CopyDir(srcPath, dirPath); err != nil {
-				HandleError(http.StatusInternalServerError, c, err)
-				return
-			}
-		} else {
-			if err := utils.CopyFile(srcPath, filepath.Join(spiderDir, fInfo.Name())); err != nil {
-				HandleError(http.StatusInternalServerError, c, err)
-				return
-			}
-		}
-	}
-
-	// Modify spider files
-	if err := services.GenerateConfigSpiderFiles(spider, configData); err != nil {
+	// Process spider files based on the config data
+	if err := services.ProcessSpiderFilesFromConfigData(spider, configData); err != nil {
 		HandleError(http.StatusInternalServerError, c, err)
-		return
 	}
 
-	// Package into a zip file
-	files, err := utils.GetFilesFromDir(spiderDir)
+	c.JSON(http.StatusOK, Response{
+		Status:  "ok",
+		Message: "success",
+	})
+}
+
+func PostConfigSpiderConfig(c *gin.Context) {
+	id := c.Param("id")
+
+	// Get the spider
+	var spider model.Spider
+	spider, err := model.GetSpider(bson.ObjectIdHex(id))
 	if err != nil {
+		HandleErrorF(http.StatusBadRequest, c, fmt.Sprintf("cannot find spider (id: %s)", id))
+	}
+
+	// Deserialize the config data
+	var configData entity.ConfigSpiderData
+	if err := c.ShouldBindJSON(&configData); err != nil {
+		HandleError(http.StatusBadRequest, c, err)
+	}
+
+	// Process spider files based on the config data
+	if err := services.ProcessSpiderFilesFromConfigData(spider, configData); err != nil {
 		HandleError(http.StatusInternalServerError, c, err)
-		return
-	}
-	randomId := uuid.NewV4()
-	tmpFilePath := filepath.Join(viper.GetString("other.tmppath"), spider.Name+"."+randomId.String()+".zip")
-	spiderZipFileName := spider.Name + ".zip"
-	if err := utils.Compress(files, tmpFilePath); err != nil {
-		HandleError(http.StatusInternalServerError, c, err)
-		return
 	}
 
-	// Get GridFS instance
-	s, gf := database.GetGridFs("files")
-	defer s.Close()
-
-	// Check whether the file already exists
-	var gfFile model.GridFs
-	if err := gf.Find(bson.M{"filename": spiderZipFileName}).One(&gfFile); err == nil {
-		// File already exists, delete it
-		_ = gf.RemoveId(gfFile.Id)
-	}
-
-	// Upload to GridFS
-	fid, err := services.UploadToGridFs(spiderZipFileName, tmpFilePath)
-	if err != nil {
-		log.Errorf("upload to grid fs error: %s", err.Error())
-		debug.PrintStack()
-		return
-	}
-
-	// Save spider FileId
-	spider.FileId = fid
-	_ = spider.Save()
+	// TODO: replace the Spiderfile
 
 	c.JSON(http.StatusOK, Response{
 		Status:  "ok",
diff --git a/backend/services/config_spider.go b/backend/services/config_spider.go
index 35bb2790..2d9acaba 100644
--- a/backend/services/config_spider.go
+++ b/backend/services/config_spider.go
@@ -2,11 +2,19 @@ package services
 
 import (
 	"crawlab/constants"
+	"crawlab/database"
 	"crawlab/entity"
 	"crawlab/model"
 	"crawlab/model/config_spider"
+	"crawlab/utils"
 	"errors"
 	"fmt"
+	"github.com/apex/log"
+	"github.com/globalsign/mgo/bson"
+	uuid "github.com/satori/go.uuid"
+	"github.com/spf13/viper"
+	"os"
+	"path/filepath"
 	"strings"
 )
 
@@ -139,3 +147,88 @@ func IsUniqueConfigSpiderFields(fields []entity.Field) bool {
 	}
 	return true
 }
+
+func ProcessSpiderFilesFromConfigData(spider model.Spider, configData entity.ConfigSpiderData) error {
+	spiderDir := spider.Src
+
+	// Assign stage_name
+	for stageName, stage := range configData.Stages {
+		stage.Name = stageName
+		configData.Stages[stageName] = stage
+	}
+
+	// Delete existing spider files
+	for _, fInfo := range utils.ListDir(spiderDir) {
+		// Don't delete the Spiderfile
+		if fInfo.Name() == "Spiderfile" {
+			continue
+		}
+
+		// Delete other files
+		if err := os.RemoveAll(filepath.Join(spiderDir, fInfo.Name())); err != nil {
+			return err
+		}
+	}
+
+	// Copy spider template files
+	tplDir := "./template/scrapy"
+	for _, fInfo := range utils.ListDir(tplDir) {
+		// Skip the Spiderfile
+		if fInfo.Name() == "Spiderfile" {
+			continue
+		}
+
+		srcPath := filepath.Join(tplDir, fInfo.Name())
+		if fInfo.IsDir() {
+			dirPath := filepath.Join(spiderDir, fInfo.Name())
+			if err := utils.CopyDir(srcPath, dirPath); err != nil {
+				return err
+			}
+		} else {
+			if err := utils.CopyFile(srcPath, filepath.Join(spiderDir, fInfo.Name())); err != nil {
+				return err
+			}
+		}
+	}
+
+	// Modify spider files
+	if err := GenerateConfigSpiderFiles(spider, configData); err != nil {
+		return err
+	}
+
+	// Package into a zip file
+	files, err := utils.GetFilesFromDir(spiderDir)
+	if err != nil {
+		return err
+	}
+	randomId := uuid.NewV4()
+	tmpFilePath := filepath.Join(viper.GetString("other.tmppath"), spider.Name+"."+randomId.String()+".zip")
+	spiderZipFileName := spider.Name + ".zip"
+	if err := utils.Compress(files, tmpFilePath); err != nil {
+		return err
+	}
+
+	// Get GridFS instance
+	s, gf := database.GetGridFs("files")
+	defer s.Close()
+
+	// Check whether the file already exists
+	var gfFile model.GridFs
+	if err := gf.Find(bson.M{"filename": spiderZipFileName}).One(&gfFile); err == nil {
+		// File already exists, delete it
+		_ = gf.RemoveId(gfFile.Id)
+	}
+
+	// Upload to GridFS
+	fid, err := UploadToGridFs(spiderZipFileName, tmpFilePath)
+	if err != nil {
+		log.Errorf("upload to grid fs error: %s", err.Error())
+		return err
+	}
+
+	// Save spider FileId
+	spider.FileId = fid
+	_ = spider.Save()
+
+	return nil
+}
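
For reference, the quoting change in GetExtractStringFromField means the generated Scrapy extractor snippets now carry the selector inside single quotes, e.g. css('.title::attr("href")') rather than css(.title::attr("href")). The standalone sketch below mirrors the new format strings; the outer CSS-vs-XPath branch and the sample field values are assumptions for illustration, not part of this patch.

package main

import "fmt"

// field mirrors only the parts of entity.Field used here (illustration only).
type field struct {
	Css, Xpath, Attr string
}

// extractString reproduces the quoted format strings introduced above.
func extractString(f field) string {
	if f.Css != "" { // assumed outer branch: CSS is used when set
		if f.Attr == "" {
			return fmt.Sprintf(`css('%s::text()')`, f.Css)
		}
		return fmt.Sprintf(`css('%s::attr("%s")')`, f.Css, f.Attr)
	}
	if f.Attr == "" {
		return fmt.Sprintf(`xpath('%s/text()')`, f.Xpath)
	}
	return fmt.Sprintf(`xpath('%s/@%s')`, f.Xpath, f.Attr)
}

func main() {
	fmt.Println(extractString(field{Css: ".title", Attr: "href"})) // css('.title::attr("href")')
	fmt.Println(extractString(field{Xpath: "//h1"}))               // xpath('//h1/text()')
}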
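The new POST /config_spiders/:id/config route binds a JSON body into entity.ConfigSpiderData and rebuilds the spider files via ProcessSpiderFilesFromConfigData. A rough client-side sketch of calling it follows; the server address, token header value, spider id, and payload fields are assumptions rather than part of this patch, and the authoritative schema is entity.ConfigSpiderData.

package main

import (
	"bytes"
	"fmt"
	"net/http"
)

func main() {
	// Hypothetical minimal config payload; check entity.ConfigSpiderData for the real schema.
	payload := []byte(`{
		"start_url": "https://example.com",
		"start_stage": "list",
		"stages": {
			"list": {
				"is_list": true,
				"list_css": ".item",
				"fields": [{"name": "title", "css": ".title"}]
			}
		}
	}`)

	// Hypothetical server address and spider id.
	url := "http://localhost:8000/config_spiders/5d5e0d60f0f0f0f0f0f0f0f0/config"

	req, err := http.NewRequest(http.MethodPost, url, bytes.NewReader(payload))
	if err != nil {
		panic(err)
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Authorization", "<token>") // authGroup routes expect an auth token

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	fmt.Println(resp.Status) // on success the handler responds with status "ok"
}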