diff --git a/backend/conf/config.yml b/backend/conf/config.yml
index a5e0b23b..60d2bd41 100644
--- a/backend/conf/config.yml
+++ b/backend/conf/config.yml
@@ -15,7 +15,7 @@ redis:
 log:
   level: info
   path: "/var/logs/crawlab"
-  isDeletePeriodically: "Y"
+  isDeletePeriodically: "N"
   deleteFrequency: "@hourly"
 server:
   host: 0.0.0.0
diff --git a/backend/entity/common.go b/backend/entity/common.go
index 332cc494..c46ae4f9 100644
--- a/backend/entity/common.go
+++ b/backend/entity/common.go
@@ -3,15 +3,15 @@ package entity
 import "strconv"
 
 type Page struct {
-    Skip    int
-    Limit   int
-    PageNum int
+    Skip     int
+    Limit    int
+    PageNum  int
     PageSize int
 }
 
-func (p *Page)GetPage(pageNum string, pageSize string) {
+func (p *Page) GetPage(pageNum string, pageSize string) {
     p.PageNum, _ = strconv.Atoi(pageNum)
     p.PageSize, _ = strconv.Atoi(pageSize)
     p.Skip = p.PageSize * (p.PageNum - 1)
     p.Limit = p.PageSize
-}
\ No newline at end of file
+}
diff --git a/backend/entity/config_spider.go b/backend/entity/config_spider.go
index 5e0fe1e1..3fe28bc9 100644
--- a/backend/entity/config_spider.go
+++ b/backend/entity/config_spider.go
@@ -1,25 +1,30 @@
 package entity
 
+type ConfigSpiderData struct {
+    Version    string            `yaml:"version" json:"version"`
+    Engine     string            `yaml:"engine" json:"engine"`
+    StartUrl   string            `yaml:"start_url" json:"start_url"`
+    StartStage string            `yaml:"start_stage" json:"start_stage"`
+    Stages     map[string]Stage  `yaml:"stages" json:"stages"`
+    Settings   map[string]string `yaml:"settings" json:"settings"`
+}
+
+type Stage struct {
+    Name      string  `yaml:"name" json:"name"`
+    IsList    bool    `yaml:"is_list" json:"is_list"`
+    ListCss   string  `yaml:"list_css" json:"list_css"`
+    ListXpath string  `yaml:"list_xpath" json:"list_xpath"`
+    PageCss   string  `yaml:"page_css" json:"page_css"`
+    PageXpath string  `yaml:"page_xpath" json:"page_xpath"`
+    PageAttr  string  `yaml:"page_attr" json:"page_attr"`
+    Fields    []Field `yaml:"fields" json:"fields"`
+}
+
 type Field struct {
     Name      string `yaml:"name" json:"name"`
     Css       string `yaml:"css" json:"css"`
     Xpath     string `yaml:"xpath" json:"xpath"`
     Attr      string `yaml:"attr" json:"attr"`
     NextStage string `yaml:"next_stage" json:"next_stage"`
-}
-
-type Stage struct {
-    IsList   bool    `yaml:"is_list" json:"is_list"`
-    ListCss  string  `yaml:"list_css" json:"list_css"`
-    PageCss  string  `yaml:"page_css" json:"page_css"`
-    PageAttr string  `yaml:"page_attr" json:"page_attr"`
-    Fields   []Field `yaml:"fields" json:"fields"`
-}
-
-type ConfigSpiderData struct {
-    Version    string           `yaml:"version" json:"version"`
-    Engine     string           `yaml:"engine" json:"engine"`
-    StartUrl   string           `yaml:"start_url" json:"start_url"`
-    StartStage string           `yaml:"start_stage" json:"start_stage"`
-    Stages     map[string]Stage `yaml:"stages" json:"stages"`
+    Remark    string `yaml:"remark" json:"remark"`
 }
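Aside (not part of the patch): a minimal sketch of how a Spiderfile round-trips into the expanded ConfigSpiderData above via gopkg.in/yaml.v2. The structs are trimmed copies of the entity definitions and all YAML values are made up; note that Settings is map[string]string, so values such as "false" must be YAML strings.

package main

import (
    "fmt"

    "gopkg.in/yaml.v2"
)

// Trimmed copies of the entity structs above, keeping only the fields
// exercised by the sample Spiderfile.
type Field struct {
    Name  string `yaml:"name"`
    Css   string `yaml:"css"`
    Xpath string `yaml:"xpath"`
    Attr  string `yaml:"attr"`
}

type Stage struct {
    IsList    bool    `yaml:"is_list"`
    ListXpath string  `yaml:"list_xpath"`
    PageXpath string  `yaml:"page_xpath"`
    Fields    []Field `yaml:"fields"`
}

type ConfigSpiderData struct {
    StartUrl   string            `yaml:"start_url"`
    StartStage string            `yaml:"start_stage"`
    Stages     map[string]Stage  `yaml:"stages"`
    Settings   map[string]string `yaml:"settings"`
}

// Hypothetical Spiderfile content, using the new XPath-based selectors.
const spiderfile = `
start_url: http://example.com
start_stage: list
stages:
  list:
    is_list: true
    list_xpath: //div[@class="item"]
    fields:
      - name: title
        xpath: .//h3/a
settings:
  ROBOTSTXT_OBEY: "false"
`

func main() {
    var data ConfigSpiderData
    if err := yaml.Unmarshal([]byte(spiderfile), &data); err != nil {
        panic(err)
    }
    fmt.Println(data.Stages["list"].Fields[0].Name) // title
}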
diff --git a/backend/main.go b/backend/main.go
index 565c7892..92863a20 100644
--- a/backend/main.go
+++ b/backend/main.go
@@ -47,6 +47,8 @@ func main() {
             panic(err)
         }
         log.Info("初始化定期清理日志配置成功")
+    } else {
+        log.Info("默认未开启定期清理日志配置")
     }
 
     // 初始化Mongodb数据库
@@ -140,9 +142,13 @@ func main() {
         authGroup.GET("/spiders/:id/stats", routes.GetSpiderStats) // 爬虫统计数据
         authGroup.GET("/spider/types", routes.GetSpiderTypes)      // 爬虫类型
         // 可配置爬虫
-        authGroup.PUT("/config_spiders", routes.PutConfigSpider)                // 添加可配置爬虫
-        authGroup.POST("/config_spiders/:id", routes.PostConfigSpider)          // 修改可配置爬虫
-        authGroup.POST("/config_spiders/:id/upload", routes.UploadConfigSpider) // 上传可配置爬虫
+        authGroup.GET("/config_spiders/:id/config", routes.GetConfigSpiderConfig)           // 获取可配置爬虫配置
+        authGroup.POST("/config_spiders/:id/config", routes.PostConfigSpiderConfig)         // 更改可配置爬虫配置
+        authGroup.PUT("/config_spiders", routes.PutConfigSpider)                            // 添加可配置爬虫
+        authGroup.POST("/config_spiders/:id", routes.PostConfigSpider)                      // 修改可配置爬虫
+        authGroup.POST("/config_spiders/:id/upload", routes.UploadConfigSpider)             // 上传可配置爬虫
+        authGroup.POST("/config_spiders/:id/spiderfile", routes.PostConfigSpiderSpiderfile) // 更改可配置爬虫Spiderfile
+        authGroup.GET("/config_spiders_templates", routes.GetConfigSpiderTemplateList)      // 获取可配置爬虫模版列表
         // 任务
         authGroup.GET("/tasks", routes.GetTaskList)    // 任务列表
         authGroup.GET("/tasks/:id", routes.GetTask)    // 任务详情
diff --git a/backend/mock/node_test.go b/backend/mock/node_test.go
index 669cafc5..abd568c2 100644
--- a/backend/mock/node_test.go
+++ b/backend/mock/node_test.go
@@ -42,12 +42,12 @@ func init() {
     app.DELETE("/tasks/:id", DeleteTask)                           // 删除任务
     app.GET("/tasks/:id/results", GetTaskResults)                  // 任务结果
     app.GET("/tasks/:id/results/download", DownloadTaskResultsCsv) // 下载任务结果
-    app.GET("/spiders", GetSpiderList)        // 爬虫列表
-    app.GET("/spiders/:id", GetSpider)        // 爬虫详情
-    app.POST("/spiders/:id", PostSpider)      // 修改爬虫
-    app.DELETE("/spiders/:id",DeleteSpider)   // 删除爬虫
-    app.GET("/spiders/:id/tasks",GetSpiderTasks) // 爬虫任务列表
-    app.GET("/spiders/:id/dir",GetSpiderDir)  // 爬虫目录
+    app.GET("/spiders", GetSpiderList)            // 爬虫列表
+    app.GET("/spiders/:id", GetSpider)            // 爬虫详情
+    app.POST("/spiders/:id", PostSpider)          // 修改爬虫
+    app.DELETE("/spiders/:id", DeleteSpider)      // 删除爬虫
+    app.GET("/spiders/:id/tasks", GetSpiderTasks) // 爬虫任务列表
+    app.GET("/spiders/:id/dir", GetSpiderDir)     // 爬虫目录
 }
 
 //mock test, test data in ./mock
diff --git a/backend/mock/stats.go b/backend/mock/stats.go
index db2348c6..f0227da9 100644
--- a/backend/mock/stats.go
+++ b/backend/mock/stats.go
@@ -6,8 +6,6 @@ import (
     "net/http"
 )
 
-
-
 var taskDailyItems = []model.TaskDailyItem{
     {
         Date: "2019/08/19",
diff --git a/backend/mock/system.go b/backend/mock/system.go
index c4807247..f33e02ba 100644
--- a/backend/mock/system.go
+++ b/backend/mock/system.go
@@ -1 +1 @@
-package mock
\ No newline at end of file
+package mock
diff --git a/backend/mock/user.go b/backend/mock/user.go
index c4807247..f33e02ba 100644
--- a/backend/mock/user.go
+++ b/backend/mock/user.go
@@ -1 +1 @@
-package mock
\ No newline at end of file
+package mock
diff --git a/backend/model/config_spider/scrapy.go b/backend/model/config_spider/scrapy.go
index 7503b9bf..6fcb77f0 100644
--- a/backend/model/config_spider/scrapy.go
+++ b/backend/model/config_spider/scrapy.go
@@ -131,12 +131,7 @@ func (g ScrapyGenerator) GetNonListParserString(stageName string, stage entity.Stage) string {
 
     // 遍历字段列表
     for _, f := range stage.Fields {
-        line := ""
-        if f.Attr == "" {
-            line += fmt.Sprintf(`item['%s'] = response.css('%s::text').extract_first()`, f.Name, f.Css)
-        } else {
-            line += fmt.Sprintf(`item['%s'] = response.css('%s::attr("%s")').extract_first()`, f.Name, f.Css, f.Attr)
-        }
+        line := fmt.Sprintf(`item['%s'] = response.%s.extract_first()`, f.Name, g.GetExtractStringFromField(f))
         line = g.PadCode(line, 2)
         str += line
     }
@@ -163,19 +158,14 @@ func (g ScrapyGenerator) GetListParserString(stageName string, stage entity.Stage) string {
     str += g.PadCode(`prev_item = response.meta.get('item')`, 2)
 
     // for 循环遍历列表
-    str += g.PadCode(fmt.Sprintf(`for elem in response.css('%s'):`, stage.ListCss), 2)
+    str += g.PadCode(fmt.Sprintf(`for elem in response.%s:`, g.GetListString(stage)), 2)
 
     // 构造item
     str += g.PadCode(`item = Item()`, 3)
 
     // 遍历字段列表
     for _, f := range stage.Fields {
-        line := ""
-        if f.Attr == "" {
-            line += fmt.Sprintf(`item['%s'] = elem.css('%s::text').extract_first()`, f.Name, f.Css)
-        } else {
-            line += fmt.Sprintf(`item['%s'] = elem.css('%s::attr("%s")').extract_first()`, f.Name, f.Css, f.Attr)
-        }
+        line := fmt.Sprintf(`item['%s'] = elem.%s.extract_first()`, f.Name, g.GetExtractStringFromField(f))
         line = g.PadCode(line, 3)
         str += line
     }
@@ -195,15 +185,9 @@ func (g ScrapyGenerator) GetListParserString(stageName string, stage entity.Stage) string {
     }
 
     // 分页
-    if stage.PageCss != "" {
-        // 分页元素属性,默认为 href
-        pageAttr := "href"
-        if stage.PageAttr != "" {
-            pageAttr = stage.PageAttr
-        }
-
-        str += g.PadCode(fmt.Sprintf(`next_url = response.css('%s::attr("%s")').extract_first()`, stage.PageCss, pageAttr), 2)
-        str += g.PadCode(fmt.Sprintf(`yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_%s, meta={'item': item})`, stageName), 2)
+    if stage.PageCss != "" || stage.PageXpath != "" {
+        str += g.PadCode(fmt.Sprintf(`next_url = response.%s.extract_first()`, g.GetExtractStringFromStage(stage)), 2)
+        str += g.PadCode(fmt.Sprintf(`yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_%s, meta={'item': prev_item})`, stageName), 2)
     }
 
     // 加入末尾换行
@@ -226,3 +210,49 @@ func (g ScrapyGenerator) GetNextStageField(stage entity.Stage) (entity.Field, error) {
     }
     return entity.Field{}, errors.New("cannot find next stage field")
 }
+
+func (g ScrapyGenerator) GetExtractStringFromField(f entity.Field) string {
+    if f.Css != "" {
+        // 如果为CSS
+        if f.Attr == "" {
+            // 文本
+            return fmt.Sprintf(`css('%s::text')`, f.Css)
+        } else {
+            // 属性
+            return fmt.Sprintf(`css('%s::attr("%s")')`, f.Css, f.Attr)
+        }
+    } else {
+        // 如果为XPath
+        if f.Attr == "" {
+            // 文本
+            return fmt.Sprintf(`xpath('string(%s)')`, f.Xpath)
+        } else {
+            // 属性
+            return fmt.Sprintf(`xpath('%s/@%s')`, f.Xpath, f.Attr)
+        }
+    }
+}
+
+func (g ScrapyGenerator) GetExtractStringFromStage(stage entity.Stage) string {
+    // 分页元素属性,默认为 href
+    pageAttr := "href"
+    if stage.PageAttr != "" {
+        pageAttr = stage.PageAttr
+    }
+
+    if stage.PageCss != "" {
+        // 如果为CSS
+        return fmt.Sprintf(`css('%s::attr("%s")')`, stage.PageCss, pageAttr)
+    } else {
+        // 如果为XPath
+        return fmt.Sprintf(`xpath('%s/@%s')`, stage.PageXpath, pageAttr)
+    }
+}
+
+func (g ScrapyGenerator) GetListString(stage entity.Stage) string {
+    if stage.ListCss != "" {
+        return fmt.Sprintf(`css('%s')`, stage.ListCss)
+    } else {
+        return fmt.Sprintf(`xpath('%s')`, stage.ListXpath)
+    }
+}
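Aside (not part of the patch): a standalone sketch of the selector-string rules that GetExtractStringFromField implements — CSS fields render to css('sel::text') or css('sel::attr("x")'), XPath fields to xpath('string(sel)') or xpath('sel/@x'). The field values below are illustrative only.

package main

import "fmt"

// Simplified stand-in for entity.Field, not the real struct.
type field struct{ Name, Css, Xpath, Attr string }

// extractString mirrors the branching in GetExtractStringFromField above.
func extractString(f field) string {
    if f.Css != "" {
        if f.Attr == "" {
            return fmt.Sprintf(`css('%s::text')`, f.Css)
        }
        return fmt.Sprintf(`css('%s::attr("%s")')`, f.Css, f.Attr)
    }
    if f.Attr == "" {
        return fmt.Sprintf(`xpath('string(%s)')`, f.Xpath)
    }
    return fmt.Sprintf(`xpath('%s/@%s')`, f.Xpath, f.Attr)
}

func main() {
    title := field{Name: "title", Css: "h3 > a"}
    url := field{Name: "url", Xpath: ".//h3/a", Attr: "href"}

    // These two lines match the shape of the generated Scrapy parser code.
    fmt.Printf("item['%s'] = response.%s.extract_first()\n", title.Name, extractString(title))
    fmt.Printf("item['%s'] = response.%s.extract_first()\n", url.Name, extractString(url))
}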
diff --git a/backend/model/spider.go b/backend/model/spider.go
index 53c5ab1f..a0d72c1c 100644
--- a/backend/model/spider.go
+++ b/backend/model/spider.go
@@ -1,11 +1,17 @@
 package model
 
 import (
+    "crawlab/constants"
     "crawlab/database"
     "crawlab/entity"
+    "crawlab/utils"
+    "errors"
     "github.com/apex/log"
     "github.com/globalsign/mgo"
     "github.com/globalsign/mgo/bson"
+    "gopkg.in/yaml.v2"
+    "io/ioutil"
+    "path/filepath"
     "runtime/debug"
     "time"
 )
@@ -25,14 +31,18 @@ type Spider struct {
     Site   string `json:"site" bson:"site"`     // 爬虫网站
     Envs   []Env  `json:"envs" bson:"envs"`     // 环境变量
     Remark string `json:"remark" bson:"remark"` // 备注
+    Src    string `json:"src" bson:"src"`       // 源码位置
 
     // 自定义爬虫
-    Src string `json:"src" bson:"src"` // 源码位置
     Cmd string `json:"cmd" bson:"cmd"` // 执行命令
 
+    // 可配置爬虫
+    Template string `json:"template" bson:"template"` // Spiderfile模版
+
     // 前端展示
-    LastRunTs  time.Time `json:"last_run_ts"` // 最后一次执行时间
-    LastStatus string    `json:"last_status"` // 最后执行状态
+    LastRunTs  time.Time               `json:"last_run_ts"` // 最后一次执行时间
+    LastStatus string                  `json:"last_status"` // 最后执行状态
+    Config     entity.ConfigSpiderData `json:"config"`      // 可配置爬虫配置
 
     // 时间
     CreateTs time.Time `json:"create_ts" bson:"create_ts"`
@@ -108,6 +118,10 @@ func GetSpiderList(filter interface{}, skip int, limit int) ([]Spider, int, error) {
         return spiders, 0, err
     }
 
+    if spiders == nil {
+        spiders = []Spider{}
+    }
+
     // 遍历爬虫列表
     for i, spider := range spiders {
         // 获取最后一次任务
@@ -161,15 +175,25 @@ func GetSpider(id bson.ObjectId) (Spider, error) {
     s, c := database.GetCol("spiders")
     defer s.Close()
 
-    var result Spider
-    if err := c.FindId(id).One(&result); err != nil {
+    // 获取爬虫
+    var spider Spider
+    if err := c.FindId(id).One(&spider); err != nil {
         if err != mgo.ErrNotFound {
             log.Errorf("get spider error: %s, id: %id", err.Error(), id.Hex())
             debug.PrintStack()
         }
-        return result, err
+        return spider, err
     }
-    return result, nil
+
+    // 如果为可配置爬虫,获取爬虫配置
+    if spider.Type == constants.Configurable && utils.Exists(filepath.Join(spider.Src, "Spiderfile")) {
+        config, err := GetConfigSpiderData(spider)
+        if err != nil {
+            return spider, err
+        }
+        spider.Config = config
+    }
+    return spider, nil
 }
 
 // 更新爬虫
@@ -209,10 +233,12 @@ func RemoveSpider(id bson.ObjectId) error {
     s, gf := database.GetGridFs("files")
     defer s.Close()
 
-    if err := gf.RemoveId(result.FileId); err != nil {
-        log.Error("remove file error, id:" + result.FileId.Hex())
-        debug.PrintStack()
-        return err
+    if result.FileId.Hex() != constants.ObjectIdNull {
+        if err := gf.RemoveId(result.FileId); err != nil {
+            log.Error("remove file error, id:" + result.FileId.Hex())
+            debug.PrintStack()
+            return err
+        }
     }
 
     return nil
@@ -269,3 +295,35 @@ func GetSpiderTypes() ([]*entity.SpiderType, error) {
 
     return types, nil
 }
+
+func GetConfigSpiderData(spider Spider) (entity.ConfigSpiderData, error) {
+    // 构造配置数据
+    configData := entity.ConfigSpiderData{}
+
+    // 校验爬虫类别
+    if spider.Type != constants.Configurable {
+        return configData, errors.New("not a configurable spider")
+    }
+
+    // Spiderfile 路径
+    sfPath := filepath.Join(spider.Src, "Spiderfile")
+
+    // 读取YAML文件
+    yamlFile, err := ioutil.ReadFile(sfPath)
+    if err != nil {
+        return configData, err
+    }
+
+    // 反序列化
+    if err := yaml.Unmarshal(yamlFile, &configData); err != nil {
+        return configData, err
+    }
+
+    // 赋值 stage_name
+    for stageName, stage := range configData.Stages {
+        stage.Name = stageName
+        configData.Stages[stageName] = stage
+    }
+
+    return configData, nil
+}
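Aside (not part of the patch): the copy-and-write-back loop at the end of GetConfigSpiderData looks redundant at first glance, but Go map values are not addressable, so a struct stored in a map must be copied out, mutated, and reassigned. A minimal demonstration:

package main

import "fmt"

type stage struct{ Name string }

func main() {
    stages := map[string]stage{"list": {}}

    // stages["list"].Name = "list" would not compile — map values are not
    // addressable. Copy out, mutate, write back, exactly as the loop in
    // GetConfigSpiderData does.
    s := stages["list"]
    s.Name = "list"
    stages["list"] = s

    fmt.Println(stages["list"].Name) // list
}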
diff --git a/backend/routes/config_spider.go b/backend/routes/config_spider.go
index 6f4a2893..e387935a 100644
--- a/backend/routes/config_spider.go
+++ b/backend/routes/config_spider.go
@@ -2,16 +2,13 @@ package routes
 
 import (
     "crawlab/constants"
-    "crawlab/database"
     "crawlab/entity"
     "crawlab/model"
     "crawlab/services"
     "crawlab/utils"
     "fmt"
-    "github.com/apex/log"
     "github.com/gin-gonic/gin"
     "github.com/globalsign/mgo/bson"
-    uuid "github.com/satori/go.uuid"
     "github.com/spf13/viper"
     "gopkg.in/yaml.v2"
     "io"
@@ -19,7 +16,7 @@ import (
     "net/http"
     "os"
     "path/filepath"
-    "runtime/debug"
+    "strings"
 )
 
 // 添加可配置爬虫
@@ -36,6 +33,12 @@ func PutConfigSpider(c *gin.Context) {
         return
     }
 
+    // 模版名不能为空
+    if spider.Template == "" {
+        HandleErrorF(http.StatusBadRequest, c, "spider template should not be empty")
+        return
+    }
+
     // 判断爬虫是否存在
     if spider := model.GetSpiderByName(spider.Name); spider != nil {
         HandleErrorF(http.StatusBadRequest, c, fmt.Sprintf("spider for '%s' already exists", spider.Name))
@@ -62,6 +65,23 @@ func PutConfigSpider(c *gin.Context) {
     }
     spider.Src = spiderDir
 
+    // 复制Spiderfile模版
+    contentByte, err := ioutil.ReadFile("./template/spiderfile/Spiderfile." + spider.Template)
+    if err != nil {
+        HandleError(http.StatusInternalServerError, c, err)
+        return
+    }
+    f, err := os.Create(filepath.Join(spider.Src, "Spiderfile"))
+    if err != nil {
+        HandleError(http.StatusInternalServerError, c, err)
+        return
+    }
+    defer f.Close()
+    if _, err := f.Write(contentByte); err != nil {
+        HandleError(http.StatusInternalServerError, c, err)
+        return
+    }
+
     // 添加爬虫到数据库
     if err := spider.Add(); err != nil {
         HandleError(http.StatusInternalServerError, c, err)
@@ -100,8 +120,8 @@ func UploadConfigSpider(c *gin.Context) {
 
     // 文件名称必须为Spiderfile
     filename := header.Filename
-    if filename != "Spiderfile" {
-        HandleErrorF(http.StatusBadRequest, c, "filename must be 'Spiderfile'")
+    if filename != "Spiderfile" && filename != "Spiderfile.yaml" && filename != "Spiderfile.yml" {
+        HandleErrorF(http.StatusBadRequest, c, "filename must be 'Spiderfile(.yaml|.yml)'")
         return
     }
 
@@ -151,88 +171,146 @@ func UploadConfigSpider(c *gin.Context) {
         return
     }
 
-    // 删除已有的爬虫文件
-    for _, fInfo := range utils.ListDir(spiderDir) {
-        // 不删除Spiderfile
-        if fInfo.Name() == filename {
-            continue
-        }
-
-        // 删除其他文件
-        if err := os.RemoveAll(filepath.Join(spiderDir, fInfo.Name())); err != nil {
-            HandleError(http.StatusInternalServerError, c, err)
-            return
-        }
-    }
-
-    // 拷贝爬虫文件
-    tplDir := "./template/scrapy"
-    for _, fInfo := range utils.ListDir(tplDir) {
-        // 跳过Spiderfile
-        if fInfo.Name() == "Spiderfile" {
-            continue
-        }
-
-        srcPath := filepath.Join(tplDir, fInfo.Name())
-        if fInfo.IsDir() {
-            dirPath := filepath.Join(spiderDir, fInfo.Name())
-            if err := utils.CopyDir(srcPath, dirPath); err != nil {
-                HandleError(http.StatusInternalServerError, c, err)
-                return
-            }
-        } else {
-            if err := utils.CopyFile(srcPath, filepath.Join(spiderDir, fInfo.Name())); err != nil {
-                HandleError(http.StatusInternalServerError, c, err)
-                return
-            }
-        }
-    }
-
-    // 更改爬虫文件
-    if err := services.GenerateConfigSpiderFiles(spider, configData); err != nil {
+    // 根据序列化后的数据处理爬虫文件
+    if err := services.ProcessSpiderFilesFromConfigData(spider, configData); err != nil {
         HandleError(http.StatusInternalServerError, c, err)
         return
     }
 
-    // 打包为 zip 文件
-    files, err := utils.GetFilesFromDir(spiderDir)
-    if err != nil {
-        HandleError(http.StatusInternalServerError, c, err)
-        return
-    }
-    randomId := uuid.NewV4()
-    tmpFilePath := filepath.Join(viper.GetString("other.tmppath"), spider.Name+"."+randomId.String()+".zip")
-    spiderZipFileName := spider.Name + ".zip"
-    if err := utils.Compress(files, tmpFilePath); err != nil {
-        HandleError(http.StatusInternalServerError, c, err)
-        return
-    }
-
-    // 获取 GridFS 实例
-    s, gf := database.GetGridFs("files")
-    defer s.Close()
-
-    // 判断文件是否已经存在
-    var gfFile model.GridFs
-    if err := gf.Find(bson.M{"filename": spiderZipFileName}).One(&gfFile); err == nil {
-        // 已经存在文件,则删除
-        _ = gf.RemoveId(gfFile.Id)
-    }
-
-    // 上传到GridFs
-    fid, err := services.UploadToGridFs(spiderZipFileName, tmpFilePath)
-    if err != nil {
-        log.Errorf("upload to grid fs error: %s", err.Error())
-        debug.PrintStack()
-        return
-    }
-
-    // 保存爬虫 FileId
-    spider.FileId = fid
-    _ = spider.Save()
-
     c.JSON(http.StatusOK, Response{
         Status:  "ok",
         Message: "success",
     })
 }
+
+func PostConfigSpiderSpiderfile(c *gin.Context) {
+    type Body struct {
+        Content string `json:"content"`
+    }
+
+    id := c.Param("id")
+
+    // 文件内容
+    var reqBody Body
+    if err := c.ShouldBindJSON(&reqBody); err != nil {
+        HandleError(http.StatusBadRequest, c, err)
+        return
+    }
+    content := reqBody.Content
+
+    // 获取爬虫
+    var spider model.Spider
+    spider, err := model.GetSpider(bson.ObjectIdHex(id))
+    if err != nil {
+        HandleErrorF(http.StatusBadRequest, c, fmt.Sprintf("cannot find spider (id: %s)", id))
+        return
+    }
+
+    // 反序列化
+    var configData entity.ConfigSpiderData
+    if err := yaml.Unmarshal([]byte(content), &configData); err != nil {
+        HandleError(http.StatusBadRequest, c, err)
+        return
+    }
+
+    // 校验configData
+    if err := services.ValidateSpiderfile(configData); err != nil {
+        HandleError(http.StatusInternalServerError, c, err)
+        return
+    }
+
+    // 写文件
+    if err := ioutil.WriteFile(filepath.Join(spider.Src, "Spiderfile"), []byte(content), os.ModePerm); err != nil {
+        HandleError(http.StatusInternalServerError, c, err)
+        return
+    }
+
+    // 根据序列化后的数据处理爬虫文件
+    if err := services.ProcessSpiderFilesFromConfigData(spider, configData); err != nil {
+        HandleError(http.StatusInternalServerError, c, err)
+        return
+    }
+
+    c.JSON(http.StatusOK, Response{
+        Status:  "ok",
+        Message: "success",
+    })
+}
+
+func PostConfigSpiderConfig(c *gin.Context) {
+    id := c.Param("id")
+
+    // 获取爬虫
+    var spider model.Spider
+    spider, err := model.GetSpider(bson.ObjectIdHex(id))
+    if err != nil {
+        HandleErrorF(http.StatusBadRequest, c, fmt.Sprintf("cannot find spider (id: %s)", id))
+        return
+    }
+
+    // 反序列化配置数据
+    var configData entity.ConfigSpiderData
+    if err := c.ShouldBindJSON(&configData); err != nil {
+        HandleError(http.StatusBadRequest, c, err)
+        return
+    }
+
+    // 校验configData
+    if err := services.ValidateSpiderfile(configData); err != nil {
+        HandleError(http.StatusInternalServerError, c, err)
+        return
+    }
+
+    // 替换Spiderfile文件
+    if err := services.GenerateSpiderfileFromConfigData(spider, configData); err != nil {
+        HandleError(http.StatusInternalServerError, c, err)
+        return
+    }
+
+    // 根据序列化后的数据处理爬虫文件
+    if err := services.ProcessSpiderFilesFromConfigData(spider, configData); err != nil {
+        HandleError(http.StatusInternalServerError, c, err)
+        return
+    }
+
+    c.JSON(http.StatusOK, Response{
+        Status:  "ok",
+        Message: "success",
+    })
+}
+
+func GetConfigSpiderConfig(c *gin.Context) {
+    id := c.Param("id")
+
+    // 校验ID
+    if !bson.IsObjectIdHex(id) {
+        HandleErrorF(http.StatusBadRequest, c, "invalid id")
+        return
+    }
+
+    // 获取爬虫
+    spider, err := model.GetSpider(bson.ObjectIdHex(id))
+    if err != nil {
+        HandleError(http.StatusInternalServerError, c, err)
+        return
+    }
+
+    c.JSON(http.StatusOK, Response{
+        Status:  "ok",
+        Message: "success",
+        Data:    spider.Config,
+    })
+}
+
+// 获取模版名称列表
+func GetConfigSpiderTemplateList(c *gin.Context) {
+    var data []string
+    for _, fInfo := range utils.ListDir("./template/spiderfile") {
+        templateName := strings.Replace(fInfo.Name(), "Spiderfile.", "", -1)
+        data = append(data, templateName)
+    }
+
+    c.JSON(http.StatusOK, Response{
+        Status:  "ok",
+        Message: "success",
+        Data:    data,
+    })
+}
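Aside (not part of the patch): a hedged sketch of driving the new Spiderfile endpoint from a Go client. The host, API path prefix, spider id, and auth token are all hypothetical — adjust them to your deployment and how the router group is mounted.

package main

import (
    "bytes"
    "encoding/json"
    "fmt"
    "net/http"
)

func main() {
    // Hypothetical server address and spider id.
    url := "http://localhost:8000/config_spiders/5d429e6c19f7abede924fee2/spiderfile"

    // The handler binds a JSON body of the form {"content": "<yaml>"}.
    body, err := json.Marshal(map[string]string{
        "content": "version: \"0.4.0\"\nstart_url: http://example.com\nstart_stage: list\n",
    })
    if err != nil {
        panic(err)
    }

    req, err := http.NewRequest(http.MethodPost, url, bytes.NewReader(body))
    if err != nil {
        panic(err)
    }
    req.Header.Set("Content-Type", "application/json")
    req.Header.Set("Authorization", "<jwt token>") // the route sits in an authenticated group

    resp, err := http.DefaultClient.Do(req)
    if err != nil {
        panic(err)
    }
    defer resp.Body.Close()
    fmt.Println(resp.Status)
}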
"github.com/globalsign/mgo/bson" + uuid "github.com/satori/go.uuid" + "github.com/spf13/viper" + "gopkg.in/yaml.v2" + "os" + "path/filepath" "strings" ) @@ -37,12 +46,17 @@ func ValidateSpiderfile(configData entity.ConfigSpiderData) error { // 校验是否存在 start_url if configData.StartUrl == "" { - return errors.New("spiderfile start_url is empty") + return errors.New("spiderfile invalid: start_url is empty") + } + + // 校验是否存在 start_stage + if configData.StartStage == "" { + return errors.New("spiderfile invalid: start_stage is empty") } // 校验是否存在 stages if len(configData.Stages) == 0 { - return errors.New("spiderfile stages is empty") + return errors.New("spiderfile invalid: stages is empty") } // 校验stages @@ -50,56 +64,74 @@ func ValidateSpiderfile(configData entity.ConfigSpiderData) error { for stageName, stage := range configData.Stages { // stage 名称不能为空 if stageName == "" { - return errors.New("spiderfile stage name is empty") + return errors.New("spiderfile invalid: stage name is empty") } // stage 名称不能为保留字符串 // NOTE: 如果有其他Engine,可以扩展,默认为Scrapy if configData.Engine == "" || configData.Engine == constants.EngineScrapy { if strings.Contains(constants.ScrapyProtectedStageNames, stageName) { - return errors.New(fmt.Sprintf("spiderfile stage name '%s' is protected", stageName)) + return errors.New(fmt.Sprintf("spiderfile invalid: stage name '%s' is protected", stageName)) } - } else if configData.Engine == constants.EngineColly { - return errors.New(fmt.Sprintf("engine '%s' is not implemented", stageName)) + } else { + return errors.New(fmt.Sprintf("spiderfile invalid: engine '%s' is not implemented", configData.Engine)) } // stage 名称不能重复 if dict[stageName] == 1 { - return errors.New("spiderfile stage name should be unique") + return errors.New(fmt.Sprintf("spiderfile invalid: stage name '%s' is duplicated", stageName)) } dict[stageName] = 1 // stage 字段不能为空 if len(stage.Fields) == 0 { - return errors.New(fmt.Sprintf("spiderfile stage '%s' has no fields", stageName)) + return errors.New(fmt.Sprintf("spiderfile invalid: stage '%s' has no fields", stageName)) } - // stage 的下一个 stage 只能有一个 + // 是否包含 next_stage hasNextStage := false + + // 遍历字段列表 for _, field := range stage.Fields { + // stage 的 next stage 只能有一个 if field.NextStage != "" { if hasNextStage { - return errors.New("spiderfile stage fields should have only 1 next_stage") + return errors.New(fmt.Sprintf("spiderfile invalid: stage '%s' has more than 1 next_stage", stageName)) } hasNextStage = true } + + // 字段里 css 和 xpath 只能包含一个 + if field.Css != "" && field.Xpath != "" { + return errors.New(fmt.Sprintf("spiderfile invalid: field '%s' in stage '%s' has both css and xpath set which is prohibited", field.Name, stageName)) + } + } + + // stage 里 page_css 和 page_xpath 只能包含一个 + if stage.PageCss != "" && stage.PageXpath != "" { + return errors.New(fmt.Sprintf("spiderfile invalid: stage '%s' has both page_css and page_xpath set which is prohibited", stageName)) + } + + // stage 里 list_css 和 list_xpath 只能包含一个 + if stage.ListCss != "" && stage.ListXpath != "" { + return errors.New(fmt.Sprintf("spiderfile invalid: stage '%s' has both list_css and list_xpath set which is prohibited", stageName)) } // 如果 stage 的 is_list 为 true 但 list_css 为空,报错 - if stage.IsList && stage.ListCss == "" { - return errors.New("spiderfile stage with is_list = true should have list_css being set") + if stage.IsList && (stage.ListCss == "" && stage.ListXpath == "") { + return errors.New("spiderfile invalid: stage with is_list = true should have either list_css or list_xpath 
being set") } } // 校验字段唯一性 if !IsUniqueConfigSpiderFields(fields) { - return errors.New("spiderfile fields not unique") + return errors.New("spiderfile invalid: fields not unique") } // 字段名称不能为保留字符串 for _, field := range fields { if strings.Contains(constants.ScrapyProtectedFieldNames, field.Name) { - return errors.New(fmt.Sprintf("spiderfile field name '%s' is protected", field.Name)) + return errors.New(fmt.Sprintf("spiderfile invalid: field name '%s' is protected", field.Name)) } } @@ -116,3 +148,118 @@ func IsUniqueConfigSpiderFields(fields []entity.Field) bool { } return true } + +func ProcessSpiderFilesFromConfigData(spider model.Spider, configData entity.ConfigSpiderData) error { + spiderDir := spider.Src + + // 赋值 stage_name + for stageName, stage := range configData.Stages { + stage.Name = stageName + configData.Stages[stageName] = stage + } + + // 删除已有的爬虫文件 + for _, fInfo := range utils.ListDir(spiderDir) { + // 不删除Spiderfile + if fInfo.Name() == "Spiderfile" { + continue + } + + // 删除其他文件 + if err := os.RemoveAll(filepath.Join(spiderDir, fInfo.Name())); err != nil { + return err + } + } + + // 拷贝爬虫文件 + tplDir := "./template/scrapy" + for _, fInfo := range utils.ListDir(tplDir) { + // 跳过Spiderfile + if fInfo.Name() == "Spiderfile" { + continue + } + + srcPath := filepath.Join(tplDir, fInfo.Name()) + if fInfo.IsDir() { + dirPath := filepath.Join(spiderDir, fInfo.Name()) + if err := utils.CopyDir(srcPath, dirPath); err != nil { + return err + } + } else { + if err := utils.CopyFile(srcPath, filepath.Join(spiderDir, fInfo.Name())); err != nil { + return err + } + } + } + + // 更改爬虫文件 + if err := GenerateConfigSpiderFiles(spider, configData); err != nil { + return err + } + + // 打包为 zip 文件 + files, err := utils.GetFilesFromDir(spiderDir) + if err != nil { + return err + } + randomId := uuid.NewV4() + tmpFilePath := filepath.Join(viper.GetString("other.tmppath"), spider.Name+"."+randomId.String()+".zip") + spiderZipFileName := spider.Name + ".zip" + if err := utils.Compress(files, tmpFilePath); err != nil { + return err + } + + // 获取 GridFS 实例 + s, gf := database.GetGridFs("files") + defer s.Close() + + // 判断文件是否已经存在 + var gfFile model.GridFs + if err := gf.Find(bson.M{"filename": spiderZipFileName}).One(&gfFile); err == nil { + // 已经存在文件,则删除 + _ = gf.RemoveId(gfFile.Id) + } + + // 上传到GridFs + fid, err := UploadToGridFs(spiderZipFileName, tmpFilePath) + if err != nil { + log.Errorf("upload to grid fs error: %s", err.Error()) + return err + } + + // 保存爬虫 FileId + spider.FileId = fid + _ = spider.Save() + + return nil +} + +func GenerateSpiderfileFromConfigData(spider model.Spider, configData entity.ConfigSpiderData) error { + // Spiderfile 路径 + sfPath := filepath.Join(spider.Src, "Spiderfile") + + // 生成Yaml内容 + sfContentByte, err := yaml.Marshal(configData) + if err != nil { + return err + } + + // 打开文件 + var f *os.File + if utils.Exists(sfPath) { + f, err = os.OpenFile(sfPath, os.O_WRONLY|os.O_TRUNC, 0777) + } else { + f, err = os.OpenFile(sfPath, os.O_CREATE, 0777) + } + if err != nil { + return err + } + defer f.Close() + + // 写入内容 + if _, err := f.Write(sfContentByte); err != nil { + return err + } + + return nil +} diff --git a/backend/services/spider.go b/backend/services/spider.go index aa97b4ad..3922d822 100644 --- a/backend/services/spider.go +++ b/backend/services/spider.go @@ -116,12 +116,15 @@ func PublishAllSpiders() { // 发布爬虫 func PublishSpider(spider model.Spider) { - // 查询gf file,不存在则标记为爬虫文件不存在 - gfFile := model.GetGridFs(spider.FileId) - if gfFile == nil { - 
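Aside (not part of the patch): a self-contained sketch of the css/xpath mutual-exclusion rules that ValidateSpiderfile now enforces per stage — trimmed to just the new checks, not the real function.

package main

import (
    "errors"
    "fmt"
)

// Simplified stand-in for entity.Stage, keeping only the selector fields.
type stage struct {
    IsList             bool
    ListCss, ListXpath string
    PageCss, PageXpath string
}

func checkStage(name string, s stage) error {
    // A stage may select its list elements by CSS or by XPath, never both.
    if s.ListCss != "" && s.ListXpath != "" {
        return fmt.Errorf("stage '%s': list_css and list_xpath are mutually exclusive", name)
    }
    // Same rule for the pagination selector.
    if s.PageCss != "" && s.PageXpath != "" {
        return fmt.Errorf("stage '%s': page_css and page_xpath are mutually exclusive", name)
    }
    // A list stage needs at least one list selector.
    if s.IsList && s.ListCss == "" && s.ListXpath == "" {
        return errors.New("is_list stages need list_css or list_xpath")
    }
    return nil
}

func main() {
    err := checkStage("list", stage{IsList: true})
    fmt.Println(err) // is_list stages need list_css or list_xpath
}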
diff --git a/backend/services/spider.go b/backend/services/spider.go
index aa97b4ad..3922d822 100644
--- a/backend/services/spider.go
+++ b/backend/services/spider.go
@@ -116,12 +116,15 @@ func PublishAllSpiders() {
 
 // 发布爬虫
 func PublishSpider(spider model.Spider) {
-    // 查询gf file,不存在则标记为爬虫文件不存在
-    gfFile := model.GetGridFs(spider.FileId)
-    if gfFile == nil {
-        spider.FileId = constants.ObjectIdNull
-        _ = spider.Save()
-        return
+    var gfFile *model.GridFs
+    if spider.FileId.Hex() != constants.ObjectIdNull {
+        // 查询gf file,不存在则标记为爬虫文件不存在
+        gfFile = model.GetGridFs(spider.FileId)
+        if gfFile == nil {
+            spider.FileId = constants.ObjectIdNull
+            _ = spider.Save()
+            return
+        }
     }
 
     // 如果FileId为空,表示还没有上传爬虫到GridFS,则跳过
diff --git a/backend/services/spider_handler/spider.go b/backend/services/spider_handler/spider.go
index cce025dc..c3a2500d 100644
--- a/backend/services/spider_handler/spider.go
+++ b/backend/services/spider_handler/spider.go
@@ -10,6 +10,7 @@ import (
     "github.com/spf13/viper"
     "io"
     "os"
+    "os/exec"
     "path/filepath"
     "runtime/debug"
 )
@@ -99,7 +100,6 @@ func (s *SpiderSync) Download() {
     // 创建临时文件
     tmpFilePath := filepath.Join(tmpPath, randomId.String()+".zip")
     tmpFile := utils.OpenFile(tmpFilePath)
-    defer utils.Close(tmpFile)
 
     // 将该文件写入临时文件
     if _, err := io.Copy(tmpFile, f); err != nil {
@@ -119,6 +119,15 @@ func (s *SpiderSync) Download() {
         return
     }
 
+    // 递归修改目标文件夹权限
+    // 解决scrapy.setting中开启LOG_ENABLED 和 LOG_FILE时不能创建log文件的问题
+    cmd := exec.Command("chmod", "-R", "777", dstPath)
+    if err := cmd.Run(); err != nil {
+        log.Errorf(err.Error())
+        debug.PrintStack()
+        return
+    }
+
     // 关闭临时文件
     if err := tmpFile.Close(); err != nil {
         log.Errorf(err.Error())
diff --git a/backend/services/task.go b/backend/services/task.go
index 02fa53e7..5886f8f1 100644
--- a/backend/services/task.go
+++ b/backend/services/task.go
@@ -226,12 +226,18 @@ func ExecuteShellCmd(cmdStr string, cwd string, t model.Task, s model.Spider) (err error) {
     // 环境变量配置
     envs := s.Envs
     if s.Type == constants.Configurable {
+        // 数据库配置
         envs = append(envs, model.Env{Name: "CRAWLAB_MONGO_HOST", Value: viper.GetString("mongo.host")})
         envs = append(envs, model.Env{Name: "CRAWLAB_MONGO_PORT", Value: viper.GetString("mongo.port")})
         envs = append(envs, model.Env{Name: "CRAWLAB_MONGO_DB", Value: viper.GetString("mongo.db")})
        envs = append(envs, model.Env{Name: "CRAWLAB_MONGO_USERNAME", Value: viper.GetString("mongo.username")})
         envs = append(envs, model.Env{Name: "CRAWLAB_MONGO_PASSWORD", Value: viper.GetString("mongo.password")})
         envs = append(envs, model.Env{Name: "CRAWLAB_MONGO_AUTHSOURCE", Value: viper.GetString("mongo.authSource")})
+
+        // 设置配置
+        for envName, envValue := range s.Config.Settings {
+            envs = append(envs, model.Env{Name: "CRAWLAB_SETTING_" + envName, Value: envValue})
+        }
     }
     cmd = SetEnv(cmd, envs, t.Id, s.Col)
 
@@ -311,9 +317,12 @@ func SaveTaskResultCount(id string) func() {
 
 // 执行任务
 func ExecuteTask(id int) {
-    if flag, _ := LockList.Load(id); flag.(bool) {
-        log.Debugf(GetWorkerPrefix(id) + "正在执行任务...")
-        return
+    if flag, ok := LockList.Load(id); ok {
+        if flag.(bool) {
+            log.Debugf(GetWorkerPrefix(id) + "正在执行任务...")
+            return
+        }
     }
 
     // 上锁
@@ -485,6 +494,29 @@ func GetTaskLog(id string) (logStr string, err error) {
     }
 
     if IsMasterNode(task.NodeId.Hex()) {
+        if !utils.Exists(task.LogPath) {
+            fileDir, err := MakeLogDir(task)
+            if err != nil {
+                log.Errorf(err.Error())
+            }
+
+            fileP := GetLogFilePaths(fileDir)
+
+            // 创建日志文件
+            fLog, err := os.Create(fileP)
+            if err != nil {
+                log.Errorf("create task log file error: %s", fileP)
+                debug.PrintStack()
+            } else {
+                _ = fLog.Close()
+            }
+
+            task.LogPath = fileP
+            if err := task.Save(); err != nil {
+                log.Errorf(err.Error())
+                debug.PrintStack()
+            }
+        }
+
         // 若为主节点,获取本机日志
         logBytes, err := model.GetLocalLog(task.LogPath)
         if err != nil {
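Aside (not part of the patch): how the CRAWLAB_SETTING_* bridge fits together, sketched with hypothetical values. The Go side (as in ExecuteShellCmd above) prefixes each Spiderfile setting into the task's environment; the generated settings.py in the next file strips the prefix and coerces "true"/"false", digit strings, and JSON literals back into Python values.

package main

import (
    "fmt"
    "os"
    "os/exec"
)

func main() {
    // Hypothetical Spiderfile settings, as they would appear in
    // spider.Config.Settings after parsing.
    settings := map[string]string{
        "ROBOTSTXT_OBEY":      "false",
        "CONCURRENT_REQUESTS": "32",
    }

    cmd := exec.Command("scrapy", "crawl", "config_spider")
    // Start from the parent environment; appending to a nil cmd.Env would
    // give the child *only* the CRAWLAB_SETTING_* variables.
    cmd.Env = os.Environ()
    for name, value := range settings {
        // Same CRAWLAB_SETTING_ naming scheme used by ExecuteShellCmd.
        cmd.Env = append(cmd.Env, fmt.Sprintf("CRAWLAB_SETTING_%s=%s", name, value))
    }
    fmt.Println(cmd.Env[len(cmd.Env)-2:])
}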
diff --git a/backend/template/scrapy/config_spider/settings.py b/backend/template/scrapy/config_spider/settings.py
index a0112373..4b0965f2 100644
--- a/backend/template/scrapy/config_spider/settings.py
+++ b/backend/template/scrapy/config_spider/settings.py
@@ -1,4 +1,7 @@
 # -*- coding: utf-8 -*-
+import os
+import re
+import json
 
 # Scrapy settings for config_spider project
 #
@@ -9,14 +12,14 @@
 # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
 # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
 
-BOT_NAME = 'config_spider'
+BOT_NAME = 'Crawlab Configurable Spider'
 
 SPIDER_MODULES = ['config_spider.spiders']
 NEWSPIDER_MODULE = 'config_spider.spiders'
 
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
-#USER_AGENT = 'config_spider (+http://www.yourdomain.com)'
+USER_AGENT = 'Crawlab Spider'
 
 # Obey robots.txt rules
 ROBOTSTXT_OBEY = True
@@ -88,3 +91,21 @@ ITEM_PIPELINES = {
 #HTTPCACHE_DIR = 'httpcache'
 #HTTPCACHE_IGNORE_HTTP_CODES = []
 #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
+
+for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]:
+    setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '')
+    setting_value = os.environ.get(setting_env_name)
+    if setting_value.lower() == 'true':
+        setting_value = True
+    elif setting_value.lower() == 'false':
+        setting_value = False
+    elif re.search(r'^\d+$', setting_value) is not None:
+        setting_value = int(setting_value)
+    elif re.search(r'^\{.*\}$', setting_value.strip()) is not None:
+        setting_value = json.loads(setting_value)
+    elif re.search(r'^\[.*\]$', setting_value.strip()) is not None:
+        setting_value = json.loads(setting_value)
+    else:
+        pass
+    locals()[setting_name] = setting_value
diff --git a/backend/template/spiderfile/Spiderfile.163_news b/backend/template/spiderfile/Spiderfile.163_news
new file mode 100644
index 00000000..29d58279
--- /dev/null
+++ b/backend/template/spiderfile/Spiderfile.163_news
@@ -0,0 +1,20 @@
+version: "0.4.0"
+name: "163_news"
+start_url: "http://news.163.com/special/0001386F/rank_news.html"
+start_stage: "list"
+engine: "scrapy"
+stages:
+  list:
+    is_list: true
+    list_css: "table tr:not(:first-child)"
+    fields:
+      - name: "title"
+        css: "td:nth-child(1) > a"
+      - name: "url"
+        css: "td:nth-child(1) > a"
+        attr: "href"
+      - name: "clicks"
+        css: "td.cBlue"
+settings:
+  ROBOTSTXT_OBEY: false
+  USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36
diff --git a/backend/template/spiderfile/Spiderfile.baidu b/backend/template/spiderfile/Spiderfile.baidu
new file mode 100644
index 00000000..fbf720e4
--- /dev/null
+++ b/backend/template/spiderfile/Spiderfile.baidu
@@ -0,0 +1,22 @@
+version: 0.4.0
+name: baidu
+start_url: http://www.baidu.com/s?wd=crawlab
+start_stage: list
+engine: scrapy
+stages:
+  list:
+    is_list: true
+    list_xpath: //*[contains(@class, "c-container")]
+    page_xpath: //*[@id="page"]//a[@class="n"][last()]
+    page_attr: href
+    fields:
+      - name: title
+        xpath: .//h3/a
+      - name: url
+        xpath: .//h3/a
+        attr: href
+      - name: abstract
+        xpath: .//*[@class="c-abstract"]
+settings:
+  ROBOTSTXT_OBEY: false
+  USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36
diff --git a/backend/template/Spiderfile b/backend/template/spiderfile/Spiderfile.toscrapy_books
similarity index 83%
rename from backend/template/Spiderfile
rename to backend/template/spiderfile/Spiderfile.toscrapy_books
index 8d0e05cf..4bf18f61 100644
--- a/backend/template/Spiderfile
+++ b/backend/template/spiderfile/Spiderfile.toscrapy_books
@@ -5,10 +5,10 @@ start_stage: "list"
 engine: "scrapy"
 stages:
   list:
-    is_list: true # default: false
+    is_list: true
     list_css: "section article.product_pod"
     page_css: "ul.pager li.next a"
-    page_attr: "href" # default: href
+    page_attr: "href"
     fields:
       - name: "title"
         css: "h3 > a"
@@ -23,3 +23,6 @@ stages:
     fields:
       - name: "description"
        css: "#product_description + p"
+settings:
+  ROBOTSTXT_OBEY: true
+  AUTOTHROTTLE_ENABLED: true
diff --git a/backend/utils/file.go b/backend/utils/file.go
index 2dacc9ed..c71b2cb0 100644
--- a/backend/utils/file.go
+++ b/backend/utils/file.go
@@ -167,7 +167,6 @@ func DeCompress(srcFile *os.File, dstPath string) error {
             debug.PrintStack()
             continue
         }
-        defer Close(newFile)
 
         // 拷贝该文件到新文件中
         if _, err := io.Copy(newFile, srcFile); err != nil {
diff --git a/frontend/package.json b/frontend/package.json
index 5f19fd7b..724b5e36 100644
--- a/frontend/package.json
+++ b/frontend/package.json
@@ -23,7 +23,7 @@
     "cross-env": "^5.2.0",
     "dayjs": "^1.8.6",
     "echarts": "^4.1.0",
-    "element-ui": "2.4.6",
+    "element-ui": "2.13.0",
     "font-awesome": "^4.7.0",
     "js-cookie": "2.2.0",
     "normalize.css": "7.0.0",
diff --git a/frontend/src/components/Common/CrawlConfirmDialog.vue b/frontend/src/components/Common/CrawlConfirmDialog.vue
index 2286beb2..f2ad70c2 100644
--- a/frontend/src/components/Common/CrawlConfirmDialog.vue
+++ b/frontend/src/components/Common/CrawlConfirmDialog.vue
@@ -2,13 +2,21 @@
 [template hunk lost in extraction; surviving text node: "{{$t('Are you sure to run this spider?')}}"]
[diff header and template/script hunks of the following configurable-spider component were lost in extraction; only its style hunks survive:]
@@ -335,7 +860,7 @@ export default {
 .button-group-container {
   margin-top: 10px;
-  border-bottom: 1px dashed #dcdfe6;
+  /*border-bottom: 1px dashed #dcdfe6;*/
   padding-bottom: 20px;
 }
@@ -345,7 +870,7 @@
 .list-fields-container {
   margin-top: 20px;
-  border-bottom: 1px dashed #dcdfe6;
+  /*border-bottom: 1px dashed #dcdfe6;*/
   padding-bottom: 20px;
 }
@@ -369,4 +894,142 @@
 .el-table.table-header >>> .el-input .el-input__inner {
   border-radius: 0;
 }
+
+.selector-type-item {
+  margin: 0 5px;
+  cursor: pointer;
+  font-weight: bolder;
+}
+
+.el-tag {
+  margin-right: 5px;
+  font-weight: bolder;
+  cursor: pointer;
+}
+
+.el-tag.inactive {
+  opacity: 0.5;
+}
+
+.stage-list {
+  width: 100%;
+  /*width: calc(80px + 320px);*/
+  display: flex;
+  flex-wrap: wrap;
+  list-style: none;
+  margin: 0;
+  padding: 0;
+}
+
+.stage-list .stage-item {
+  /*flex-basis: 320px;*/
+  min-width: 120px;
+  display: flex;
+  align-items: center;
+}
+
+.stage-list .stage-item label {
+  flex-basis: 90px;
+  margin-right: 10px;
+  justify-self: flex-end;
+  text-align: right;
+}
+
+.stage-list .stage-item .el-input {
+  flex-basis: calc(100% - 90px);
+  height: 32px;
+}
+
+.stage-list .stage-item .el-input .el-input__inner {
+  height: 32px;
+  inline-size: 32px;
+}
+
+.stage-list .stage-item .action-item {
+  cursor: pointer;
+  width: 13px;
+  margin-right: 5px;
+}
+
+.stage-list .stage-item .action-item:last-child {
+  margin-right: 10px;
+}
+
+.stage-list .stage-item .text-wrapper {
+  display: flex;
+  align-items: center;
+  max-width: calc(100% - 90px - 10px);
+}
+
+.stage-list .stage-item .text-wrapper .text {
+  text-overflow: ellipsis;
+  overflow: hidden;
+}
+
+.stage-list .stage-item .text-wrapper .text:hover {
+  text-decoration: underline;
+}
+
+.stage-list .stage-item .text-wrapper i {
+  margin-left: 5px;
+}
+
+.stage-list .stage-item >>> .edit-text {
+  height: 32px;
+  line-height: 32px;
+}
+
+.stage-list .stage-item >>> .edit-text .el-input__inner {
+  height: 32px;
+  line-height: 32px;
+}
+
+.top-wrapper {
+  display: flex;
+  justify-content: space-between;
+  align-items: center;
+}
+
+.top-wrapper .list {
+  list-style: none;
+  display: flex;
+  flex-wrap: wrap;
+  align-items: center;
+  padding: 0;
+}
+
+.top-wrapper .list .item {
+  margin-bottom: 10px;
+  display: flex;
+  align-items: center;
+}
+
+.top-wrapper .list .item label {
+  width: 100px;
+  text-align: right;
+  margin-right: 10px;
+  font-size: 12px;
+}
+
+.top-wrapper .list .item label + * {
+  width: 240px;
+}
+
+.invalid >>> .el-input__inner {
+  border: 1px solid red !important;
+}
+
+#process-chart {
+  width: 100%;
+  height: 480px;
+}
+
+.config-list >>> .file-content {
+  height: calc(100vh - 280px);
+}
+
+.spiderfile-actions {
+  margin-bottom: 5px;
+  text-align: right;
+}
diff --git a/frontend/src/components/File/FileDetail.vue b/frontend/src/components/File/FileDetail.vue
index d74f73b3..f5f8a4cc 100644
--- a/frontend/src/components/File/FileDetail.vue
+++ b/frontend/src/components/File/FileDetail.vue
@@ -18,6 +18,7 @@ import 'codemirror/mode/go/go.js'
 import 'codemirror/mode/shell/shell.js'
 import 'codemirror/mode/markdown/markdown.js'
 import 'codemirror/mode/php/php.js'
+import 'codemirror/mode/yaml/yaml.js'
 
 export default {
   name: 'FileDetail',
@@ -38,7 +39,7 @@ export default {
     },
     options () {
       return {
-        mode: this.lanaguage,
+        mode: this.language,
         theme: 'darcula',
         styleActiveLine: true,
         lineNumbers: true,
         matchBrackets: true
       }
     },
-    lanaguage () {
+    language () {
       const fileName = this.$store.state.file.currentPath
+      if (!fileName) return ''
       if (fileName.match(/\.js$/)) {
         return 'text/javascript'
       } else if (fileName.match(/\.py$/)) {
@@ -60,6 +62,8 @@ export default {
         return 'text/x-php'
       } else if (fileName.match(/\.md$/)) {
         return 'text/x-markdown'
+      } else if (fileName === 'Spiderfile') {
+        return 'text/x-yaml'
       } else {
         return 'text'
       }
@@ -74,7 +78,7 @@
 [hunk content lost in extraction]
diff --git a/frontend/src/components/TableView/SettingFieldsTableView.vue b/frontend/src/components/TableView/SettingFieldsTableView.vue
new file mode 100644
index 00000000..7ce8a46c
--- /dev/null
+++ b/frontend/src/components/TableView/SettingFieldsTableView.vue
@@ -0,0 +1,283 @@
[283-line new component lost in extraction]
diff --git a/frontend/src/i18n/zh.js b/frontend/src/i18n/zh.js
index c56959c9..65170117 100644
--- a/frontend/src/i18n/zh.js
+++ b/frontend/src/i18n/zh.js
@@ -125,6 +125,8 @@ export default {
   'Customized Spider': '自定义爬虫',
   'Configurable': '可配置',
   'Customized': '自定义',
+  'configurable': '可配置',
+  'customized': '自定义',
   'Text': '文本',
   'Attribute': '属性',
   'Field Name': '字段名称',
@@ -148,6 +150,26 @@ export default {
   'List Page Fields': '列表页字段',
   'Detail Page Fields': '详情页字段',
   'Detail Page URL': '详情页URL',
+  'All': '全部',
+  'Stages': '阶段',
+  'Process': '流程',
+  'Stage Process': '流程图',
+  'Stage Name': '阶段名称',
+  'Start Stage': '开始阶段',
+  'Engine': '引擎',
+  'Selector Type': '选择器类别',
+  'Selector': '选择器',
+  'Is Attribute': '是否为属性',
+  'Next Stage': '下一阶段',
+  'No Next Stage': '没有下一阶段',
+  'Fields': '字段',
+  'Stage': '阶段',
+  'Is List': '是否为列表',
+  'List': '列表',
+  'Pagination': '分页',
+  'Settings': '设置',
+  'Display Name': '显示名称',
+  'Template': '模版',
 
   // 爬虫列表
   'Name': '名称',
@@ -171,6 +193,9 @@ export default {
   'Wait Duration (sec)': '等待时长(秒)',
   'Runtime Duration (sec)': '运行时长(秒)',
   'Total Duration (sec)': '总时长(秒)',
+  'Run Type': '运行类型',
+  'Random': '随机',
+  'Selected Nodes': '指定节点',
 
   // 任务列表
   'Node': '节点',
diff --git a/frontend/src/store/modules/file.js b/frontend/src/store/modules/file.js
index 5cc50acb..66b84651 100644
--- a/frontend/src/store/modules/file.js
+++ b/frontend/src/store/modules/file.js
@@ -42,12 +42,6 @@ const actions = {
       .then(response => {
         commit('SET_FILE_CONTENT', response.data.data)
       })
-  },
-  saveFileContent ({ state, rootState }, payload) {
-    const { path } = payload
-    const spiderId = rootState.spider.spiderForm._id
-    const content = state.fileContent
-    return request.post(`/spiders/${spiderId}/file`, { content, path })
   }
 }
diff --git a/frontend/src/store/modules/lang.js b/frontend/src/store/modules/lang.js
index b1e57a04..dc6b8d18 100644
--- a/frontend/src/store/modules/lang.js
+++ b/frontend/src/store/modules/lang.js
@@ -1,5 +1,5 @@
 const state = {
-  lang: window.localStorage.getItem('lang') || 'en'
+  lang: window.localStorage.getItem('lang') || 'zh'
 }
 
 const getters = {
diff --git a/frontend/src/store/modules/spider.js b/frontend/src/store/modules/spider.js
index 07a0bac3..f4d7b134 100644
--- a/frontend/src/store/modules/spider.js
+++ b/frontend/src/store/modules/spider.js
@@ -1,4 +1,5 @@
+import Vue from 'vue'
 import request from '../../api/request'
 
 const state = {
   // list of spiders
@@ -34,7 +36,10 @@ const state = {
   filterSite: '',
 
   // preview crawl data
-  previewCrawlData: []
+  previewCrawlData: [],
+
+  // template list
+  templateList: []
 }
 
 const getters = {}
@@ -72,6 +77,16 @@ const mutations = {
   },
   SET_PREVIEW_CRAWL_DATA (state, value) {
     state.previewCrawlData = value
+  },
+  SET_SPIDER_FORM_CONFIG_SETTINGS (state, payload) {
+    const settings = {}
+    payload.forEach(row => {
+      settings[row.name] = row.value
+    })
+    Vue.set(state.spiderForm.config, 'settings', settings)
+  },
+  SET_TEMPLATE_LIST (state, value) {
+    state.templateList = value
   }
 }
@@ -103,10 +118,11 @@ const actions = {
     })
   },
   crawlSpider ({ state, dispatch }, payload) {
-    const { id, nodeId, param } = payload
+    const { spiderId, runType, nodeIds, param } = payload
     return request.put(`/tasks`, {
-      spider_id: id,
-      node_id: nodeId,
+      spider_id: spiderId,
+      run_type: runType,
+      node_ids: nodeIds,
       param: param
     })
   },
@@ -148,6 +164,20 @@ const actions = {
   },
   extractFields ({ state, commit }) {
     return request.post(`/spiders/${state.spiderForm._id}/extract_fields`)
+  },
+  postConfigSpiderConfig ({ state }) {
+    return request.post(`/config_spiders/${state.spiderForm._id}/config`, state.spiderForm.config)
+  },
+  saveConfigSpiderSpiderfile ({ state, rootState }) {
+    const content = rootState.file.fileContent
+    return request.post(`/config_spiders/${state.spiderForm._id}/spiderfile`, { content })
+  },
+  addConfigSpider ({ state }) {
+    return request.put(`/config_spiders`, state.spiderForm)
+  },
+  async getTemplateList ({ state, commit }) {
+    const res = await request.get(`/config_spiders_templates`)
+    commit('SET_TEMPLATE_LIST', res.data.data)
   }
 }
diff --git a/frontend/src/views/schedule/ScheduleList.vue b/frontend/src/views/schedule/ScheduleList.vue
index b170c9ed..3a032b23 100644
--- a/frontend/src/views/schedule/ScheduleList.vue
+++ b/frontend/src/views/schedule/ScheduleList.vue
@@ -274,7 +274,7 @@ export default {
       // 爬虫列表
       request.get('/spiders', {})
         .then(response => {
-          this.spiderList = response.data.data.list
+          this.spiderList = response.data.data.list || []
         })
     }
   }
diff --git a/frontend/src/views/spider/SpiderDetail.vue b/frontend/src/views/spider/SpiderDetail.vue
index b42e750d..a743e47d 100644
--- a/frontend/src/views/spider/SpiderDetail.vue
+++ b/frontend/src/views/spider/SpiderDetail.vue
@@ -13,8 +13,8 @@
 [template hunk lost in extraction]
@@ -48,6 +48,13 @@ export default {
     FileList,
     SpiderOverview
   },
+  watch: {
+    activeTabName () {
+      // 初始化文件
+      this.$store.commit('file/SET_FILE_CONTENT', '')
+      this.$store.commit('file/SET_CURRENT_PATH', '')
+    }
+  },
   data () {
     return {
       activeTabName: 'overview'
@@ -77,6 +84,10 @@ export default {
         setTimeout(() => {
           this.$refs['spider-stats'].update()
         }, 0)
+      } else if (this.activeTabName === 'config') {
+        setTimeout(() => {
+          this.$refs['config'].update()
+        }, 0)
       }
       this.$st.sendEv('爬虫详情', '切换标签', tab.name)
     },
@@ -85,19 +96,26 @@ export default {
       this.$st.sendEv('爬虫详情', '切换爬虫')
     }
   },
-  created () {
+  async created () {
     // get the list of the spiders
     // this.$store.dispatch('spider/getSpiderList')
 
     // get spider basic info
-    this.$store.dispatch('spider/getSpiderData', this.$route.params.id)
-      .then(() => {
-        // get spider file info
-        this.$store.dispatch('file/getFileList', this.spiderForm.src)
-      })
+    await this.$store.dispatch('spider/getSpiderData', this.$route.params.id)
+
+    // get spider file info
+    await this.$store.dispatch('file/getFileList', this.spiderForm.src)
 
     // get spider tasks
-    this.$store.dispatch('spider/getTaskList', this.$route.params.id)
+    await this.$store.dispatch('spider/getTaskList', this.$route.params.id)
+
+    // get spider list
+    await this.$store.dispatch('spider/getSpiderList')
+
+    // if spider is configurable spider, set to config tab by default
+    if (this.spiderForm.type === 'configurable') {
+      this.activeTabName = 'config'
+    }
   }
 }
diff --git a/frontend/src/views/spider/SpiderList.vue b/frontend/src/views/spider/SpiderList.vue
index eb1e548f..78c87a36 100644
--- a/frontend/src/views/spider/SpiderList.vue
+++ b/frontend/src/views/spider/SpiderList.vue
@@ -33,18 +33,50 @@
 [template hunk mostly lost in extraction; surviving text nodes show the add dialog's "{{$t('Configurable Spider')}}" / "{{$t('Customized Spider')}}" entries replaced by a form with "{{$t('Add')}}" and "{{$t('Upload')}}" actions]
@@ -81,19 +113,7 @@
 [template hunk lost in extraction; surviving text node: "{{$t('Upload')}}"]
@@ -110,17 +130,24 @@
 [template hunk lost in extraction; surviving text node: "{{$t('Search')}}"]
@@ -133,16 +160,19 @@
 [template hunk lost in extraction; surviving text nodes: "{{$t('Add Spider')}}", "{{$t('Refresh')}}"]
@@ -248,7 +281,7 @@ import {
 ...
 import dayjs from 'dayjs'
 import CrawlConfirmDialog from '../../components/Common/CrawlConfirmDialog'
 import StatusTag from '../../components/Status/StatusTag'
-import request from '../../api/request'
+
 export default {
   name: 'SpiderList',
   components: {
@@ -272,10 +305,9 @@ export default {
       activeSpiderId: undefined,
       filter: {
         keyword: '',
-        type: ''
+        type: 'all'
       },
       types: [],
-      // tableData,
       columns: [
         { name: 'display_name', label: 'Name', width: '160', align: 'left' },
         { name: 'type', label: 'Spider Type', width: '120' },
@@ -287,7 +319,8 @@ export default {
       spiderFormRules: {
         name: [{ required: true, message: 'Required Field', trigger: 'change' }]
       },
-      fileList: []
+      fileList: [],
+      spiderType: 'configurable'
     }
   },
   computed: {
     ...mapState('spider', [
       'importForm',
       'spiderList',
       'spiderForm',
-      'spiderTotal'
+      'spiderTotal',
+      'templateList'
     ]),
     ...mapGetters('user', [
       'token'
@@ -318,14 +352,26 @@ export default {
       this.getList()
     },
     onAdd () {
-      // this.addDialogVisible = true
-      this.onAddCustomized()
+      this.$store.commit('spider/SET_SPIDER_FORM', {
+        template: this.templateList[0]
+      })
+      this.addDialogVisible = true
     },
     onAddConfigurable () {
-      this.$store.commit('spider/SET_SPIDER_FORM', {})
-      this.addDialogVisible = false
-      this.addConfigurableDialogVisible = true
-      this.$st.sendEv('爬虫', '添加爬虫-可配置爬虫')
+      this.$refs['addConfigurableForm'].validate(async res => {
+        if (!res) return
+
+        let res2
+        try {
+          res2 = await this.$store.dispatch('spider/addConfigSpider')
+        } catch (e) {
+          this.$message.error(this.$t('Something wrong happened'))
+          return
+        }
+        await this.$store.dispatch('spider/getSpiderList')
+        this.$router.push(`/spiders/${res2.data.data._id}`)
+        this.$st.sendEv('爬虫', '添加爬虫-可配置爬虫')
+      })
     },
     onAddCustomized () {
       this.addDialogVisible = false
@@ -374,7 +420,8 @@ export default {
       this.$store.commit('spider/SET_SPIDER_FORM', row)
       this.dialogVisible = true
     },
-    onRemove (row) {
+    onRemove (row, ev) {
+      ev.stopPropagation()
       this.$confirm(this.$t('Are you sure to delete this spider?'), this.$t('Notification'), {
         confirmButtonText: this.$t('Confirm'),
         cancelButtonText: this.$t('Cancel'),
@@ -390,12 +437,14 @@ export default {
         this.$st.sendEv('爬虫', '删除')
       })
     },
-    onCrawl (row) {
+    onCrawl (row, ev) {
+      ev.stopPropagation()
       this.crawlConfirmDialogVisible = true
       this.activeSpiderId = row._id
       this.$st.sendEv('爬虫', '点击运行')
     },
-    onView (row) {
+    onView (row, ev) {
+      ev.stopPropagation()
       this.$router.push('/spiders/' + row._id)
       this.$st.sendEv('爬虫', '查看')
     },
@@ -483,10 +532,12 @@ export default {
       if (!str || str.match('^0001')) return 'NA'
       return dayjs(str).format('YYYY-MM-DD HH:mm:ss')
     },
-    onRowClick (row, event, column) {
-      if (column.label !== this.$t('Action')) {
-        this.onView(row)
-      }
+    onRowClick (row, column, event) {
+      this.onView(row, event)
+    },
+    onClickTab (tab) {
+      this.filter.type = tab.name
+      this.getList()
     },
     getList () {
       let params = {
@@ -496,19 +547,29 @@ export default {
         type: this.filter.type
       }
       this.$store.dispatch('spider/getSpiderList', params)
-    },
-    getTypes () {
-      request.get(`/spider/types`).then(resp => {
-        this.types = resp.data.data
-      })
     }
+    // getTypes () {
+    //   request.get(`/spider/types`).then(resp => {
+    //     this.types = resp.data.data
+    //   })
+    // }
   },
-  created () {
-    this.getTypes()
+  async created () {
+    // fetch spider types
+    // await this.getTypes()
+
     // fetch spider list
-    this.getList()
+    await this.getList()
+
+    // fetch template list
+    await this.$store.dispatch('spider/getTemplateList')
   },
  mounted () {
+    const vm = this
+    this.$nextTick(() => {
+      vm.$store.commit('spider/SET_SPIDER_FORM', this.spiderForm)
+    })
   }
 }
@@ -594,4 +655,8 @@
   .el-table >>> tr {
     cursor: pointer;
   }
+
+  .actions {
+    text-align: right;
+  }
diff --git a/frontend/src/views/task/TaskList.vue b/frontend/src/views/task/TaskList.vue
index 9db3623d..3ab19fee 100644
--- a/frontend/src/views/task/TaskList.vue
+++ b/frontend/src/views/task/TaskList.vue
@@ -125,7 +125,7 @@
 [template hunk lost in extraction]
@@ -250,7 +250,8 @@ export default {
     onSelectSpider () {
       this.$st.sendEv('任务', '选择爬虫')
     },
-    onRemove (row) {
+    onRemove (row, ev) {
+      ev.stopPropagation()
       this.$confirm(this.$t('Are you sure to delete this task?'), this.$t('Notification'), {
         confirmButtonText: this.$t('Confirm'),
         cancelButtonText: this.$t('Cancel'),
diff --git a/frontend/vue.config.js b/frontend/vue.config.js
index c867140e..f5de4ac5 100644
--- a/frontend/vue.config.js
+++ b/frontend/vue.config.js
@@ -1,3 +1,4 @@
 module.exports = {
   publicPath: process.env.BASE_URL || '/'
+  // TODO: need to configure output static files with hash
 }
diff --git a/frontend/yarn.lock b/frontend/yarn.lock
index 8697e8e2..ef361b2b 100644
--- a/frontend/yarn.lock
+++ b/frontend/yarn.lock
@@ -2954,9 +2954,10 @@ electron-to-chromium@^1.3.103:
   version "1.3.113"
   resolved "http://registry.npm.taobao.org/electron-to-chromium/download/electron-to-chromium-1.3.113.tgz#b1ccf619df7295aea17bc6951dc689632629e4a9"
 
-element-ui@2.4.6:
-  version "2.4.6"
-  resolved "https://registry.yarnpkg.com/element-ui/-/element-ui-2.4.6.tgz#524d3d4cac0b68745dda87311ef0d8fe541b5fc4"
+element-ui@2.13.0:
+  version "2.13.0"
+  resolved "https://registry.npm.taobao.org/element-ui/download/element-ui-2.13.0.tgz?cache=0&other_urls=https%3A%2F%2Fregistry.npm.taobao.org%2Felement-ui%2Fdownload%2Felement-ui-2.13.0.tgz#f6bb04e5b0a76ea5f62466044b774407ba4ebd2d"
+  integrity sha1-9rsE5bCnbqX2JGYES3dEB7pOvS0=
   dependencies:
     async-validator "~1.8.1"
     babel-helper-vue-jsx-merge-props "^2.0.0"