From ea3a17f4bbc89322e4b1e8f5595a917f9f56270e Mon Sep 17 00:00:00 2001 From: marvzhang Date: Sun, 29 Mar 2020 11:17:38 +0800 Subject: [PATCH] =?UTF-8?q?=E6=94=AF=E6=8C=81=E5=8E=BB=E9=87=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/model/spider.go | 7 ++- backend/routes/spider.go | 13 +++++ backend/services/spider.go | 21 +++++++++ backend/services/spider_handler/spider.go | 4 +- backend/services/task.go | 15 ++++-- .../components/InfoView/SpiderInfoView.vue | 47 ++++++++++++++++++- frontend/src/i18n/zh.js | 4 ++ 7 files changed, 104 insertions(+), 7 deletions(-) diff --git a/backend/model/spider.go b/backend/model/spider.go index 9a709b41..666ed7d1 100644 --- a/backend/model/spider.go +++ b/backend/model/spider.go @@ -59,12 +59,17 @@ type Spider struct { // 长任务 IsLongTask bool `json:"is_long_task" bson:"is_long_task"` // 是否为长任务 + // 去重 + IsDedup bool `json:"is_dedup" bson:"is_dedup"` // 是否去重 + DedupField string `json:"dedup_field" bson:"dedup_field"` // 去重字段 + DedupMethod string `json:"dedup_method" bson:"dedup_method"` // 去重方式 + // 前端展示 LastRunTs time.Time `json:"last_run_ts"` // 最后一次执行时间 LastStatus string `json:"last_status"` // 最后执行状态 Config entity.ConfigSpiderData `json:"config"` // 可配置爬虫配置 LatestTasks []Task `json:"latest_tasks"` // 最近任务列表 - Username string `json:"username""` + Username string `json:"username"` // 用户名称 // 时间 UserId bson.ObjectId `json:"user_id" bson:"user_id"` diff --git a/backend/routes/spider.go b/backend/routes/spider.go index f4892dd2..c3dd4623 100644 --- a/backend/routes/spider.go +++ b/backend/routes/spider.go @@ -158,6 +158,19 @@ func PostSpider(c *gin.Context) { return } + // 获取爬虫 + spider, err := model.GetSpider(bson.ObjectIdHex(id)) + if err != nil { + HandleError(http.StatusInternalServerError, c, err) + return + } + + // 去重处理 + if err := services.UpdateSpiderDedup(spider); err != nil { + HandleError(http.StatusInternalServerError, c, err) + return + } + c.JSON(http.StatusOK, Response{ Status: "ok", Message: "success", diff --git a/backend/services/spider.go b/backend/services/spider.go index 4c3f6d45..a81bc94d 100644 --- a/backend/services/spider.go +++ b/backend/services/spider.go @@ -433,6 +433,27 @@ func CopySpider(spider model.Spider, newName string) error { return nil } +func UpdateSpiderDedup(spider model.Spider) error { + s, c := database.GetCol(spider.Col) + defer s.Close() + + if !spider.IsDedup { + if err := c.DropIndex(spider.DedupField); err != nil { + return err + } + return nil + } + + if err := c.EnsureIndex(mgo.Index{ + Key: []string{spider.DedupField}, + Unique: true, + }); err != nil { + return err + } + + return nil +} + func InitDemoSpiders() { // 添加Demo爬虫 templateSpidersDir := "./template/spiders" diff --git a/backend/services/spider_handler/spider.go b/backend/services/spider_handler/spider.go index eff5a758..189fed60 100644 --- a/backend/services/spider_handler/spider.go +++ b/backend/services/spider_handler/spider.go @@ -49,7 +49,9 @@ func (s *SpiderSync) CheckIsScrapy() { return } s.Spider.IsScrapy = utils.Exists(path.Join(s.Spider.Src, "scrapy.cfg")) - s.Spider.Cmd = "scrapy crawl" + if s.Spider.IsScrapy { + s.Spider.Cmd = "scrapy crawl" + } if err := s.Spider.Save(); err != nil { log.Errorf(err.Error()) debug.PrintStack() diff --git a/backend/services/task.go b/backend/services/task.go index c1e1fa33..6bbce5e3 100644 --- a/backend/services/task.go +++ b/backend/services/task.go @@ -107,7 +107,7 @@ func AssignTask(task model.Task) error { } // 设置环境变量 -func SetEnv(cmd *exec.Cmd, envs []model.Env, taskId string, dataCol string) *exec.Cmd { +func SetEnv(cmd *exec.Cmd, envs []model.Env, task model.Task, spider model.Spider) *exec.Cmd { // 默认把Node.js的全局node_modules加入环境变量 envPath := os.Getenv("PATH") homePath := os.Getenv("HOME") @@ -117,8 +117,8 @@ func SetEnv(cmd *exec.Cmd, envs []model.Env, taskId string, dataCol string) *exe _ = os.Setenv("NODE_PATH", nodePath) // 默认环境变量 - cmd.Env = append(os.Environ(), "CRAWLAB_TASK_ID="+taskId) - cmd.Env = append(cmd.Env, "CRAWLAB_COLLECTION="+dataCol) + cmd.Env = append(os.Environ(), "CRAWLAB_TASK_ID="+task.Id) + cmd.Env = append(cmd.Env, "CRAWLAB_COLLECTION="+spider.Col) cmd.Env = append(cmd.Env, "CRAWLAB_MONGO_HOST="+viper.GetString("mongo.host")) cmd.Env = append(cmd.Env, "CRAWLAB_MONGO_PORT="+viper.GetString("mongo.port")) if viper.GetString("mongo.db") != "" { @@ -136,6 +136,13 @@ func SetEnv(cmd *exec.Cmd, envs []model.Env, taskId string, dataCol string) *exe cmd.Env = append(cmd.Env, "PYTHONUNBUFFERED=0") cmd.Env = append(cmd.Env, "PYTHONIOENCODING=utf-8") cmd.Env = append(cmd.Env, "TZ=Asia/Shanghai") + cmd.Env = append(cmd.Env, "CRAWLAB_DEDUP_FIELD="+spider.DedupField) + cmd.Env = append(cmd.Env, "CRAWLAB_DEDUP_METHOD="+spider.DedupMethod) + if spider.IsDedup { + cmd.Env = append(cmd.Env, "CRAWLAB_IS_DEDUP=1") + } else { + cmd.Env = append(cmd.Env, "CRAWLAB_IS_DEDUP=0") + } //任务环境变量 for _, env := range envs { @@ -270,7 +277,7 @@ func ExecuteShellCmd(cmdStr string, cwd string, t model.Task, s model.Spider) (e envs = append(envs, model.Env{Name: "CRAWLAB_SETTING_" + envName, Value: envValue}) } } - cmd = SetEnv(cmd, envs, t.Id, s.Col) + cmd = SetEnv(cmd, envs, t, s) // 起一个goroutine来监控进程 ch := utils.TaskExecChanMap.ChanBlocked(t.Id) diff --git a/frontend/src/components/InfoView/SpiderInfoView.vue b/frontend/src/components/InfoView/SpiderInfoView.vue index 3e318bba..efb86db2 100644 --- a/frontend/src/components/InfoView/SpiderInfoView.vue +++ b/frontend/src/components/InfoView/SpiderInfoView.vue @@ -45,7 +45,7 @@ /> - + + +
+ + + + + + +
+
@@ -167,6 +193,17 @@ export default { } callback() } + const dedupValidator = (rule, value, callback) => { + if (!this.spiderForm.is_dedup) { + return callback() + } else { + if (value) { + return callback() + } else { + return callback(new Error('dedup field cannot be empty')) + } + } + } return { uploadLoading: false, fileList: [], @@ -176,6 +213,9 @@ export default { ], cronRules: [ { validator: cronValidator, trigger: 'blur' } + ], + dedupRules: [ + { validator: dedupValidator, trigger: 'blur' } ] } }, @@ -250,6 +290,11 @@ export default { if (value) { this.spiderForm.cmd = 'scrapy crawl' } + }, + onIsDedupChange (value) { + if (value && !this.spiderForm.dedup_method) { + this.spiderForm.dedup_method = 'overwrite' + } } }, async created () { diff --git a/frontend/src/i18n/zh.js b/frontend/src/i18n/zh.js index 8c3f41bd..c7fca4b4 100644 --- a/frontend/src/i18n/zh.js +++ b/frontend/src/i18n/zh.js @@ -228,6 +228,10 @@ export default { 'Is Public': '是否公共', 'Owner': '所有者', 'Convert to Customized': '转化为自定义', + 'Is De-Duplicated': '是否去重', + 'Please enter de-duplicated field': '请输入去重字段', + 'Overwrite': '覆盖', + 'Ignore': '忽略', // 爬虫列表 'Name': '名称',