diff --git a/backend/services/spider.go b/backend/services/spider.go
index e97c7992..fe162f12 100644
--- a/backend/services/spider.go
+++ b/backend/services/spider.go
@@ -12,12 +12,11 @@ import (
 	"github.com/apex/log"
 	"github.com/globalsign/mgo"
 	"github.com/globalsign/mgo/bson"
-	uuid "github.com/satori/go.uuid"
+	"github.com/satori/go.uuid"
 	"github.com/spf13/viper"
 	"os"
 	"path/filepath"
 	"runtime/debug"
-	"strings"
 )
 
 type SpiderFileData struct {
@@ -192,21 +191,14 @@ func PublishSpider(spider model.Spider) {
 	md5 := filepath.Join(path, spider_handler.Md5File)
 	if !utils.Exists(md5) {
 		log.Infof("md5 file not found: %s", md5)
-		spiderSync.RemoveSpiderFile()
-		spiderSync.Download()
-		spiderSync.CreateMd5File(gfFile.Md5)
+		spiderSync.RemoveDownCreate(gfFile.Md5)
 		return
 	}
 	// md5值不一样,则下载
-	md5Str := utils.ReadFileOneLine(md5)
-	// 去掉空格以及换行符
-	md5Str = strings.Replace(md5Str, " ", "", -1)
-	md5Str = strings.Replace(md5Str, "\n", "", -1)
+	md5Str := utils.GetSpiderMd5Str(md5)
 	if gfFile.Md5 != md5Str {
 		log.Infof("md5 is different, gf-md5:%s, file-md5:%s", gfFile.Md5, md5Str)
-		spiderSync.RemoveSpiderFile()
-		spiderSync.Download()
-		spiderSync.CreateMd5File(gfFile.Md5)
+		spiderSync.RemoveDownCreate(gfFile.Md5)
 		return
 	}
 }
diff --git a/backend/services/spider_handler/spider.go b/backend/services/spider_handler/spider.go
index c3a2500d..cd8a1dbe 100644
--- a/backend/services/spider_handler/spider.go
+++ b/backend/services/spider_handler/spider.go
@@ -38,6 +38,15 @@ func (s *SpiderSync) CreateMd5File(md5 string) {
 	}
 }
 
+// RemoveDownCreate calls RemoveSpiderFile, Download and CreateMd5File(md5),
+// in that order. Callers use it when the local md5 marker file is missing or
+// its content differs from the md5 of the stored spider file.
+func (s *SpiderSync) RemoveDownCreate(md5 string) {
+	s.RemoveSpiderFile()
+	s.Download()
+	s.CreateMd5File(md5)
+}
+
 // 获得下载锁的key
 func (s *SpiderSync) GetLockDownloadKey(spiderId string) string {
 	node, _ := model.GetCurrentNode()
diff --git a/backend/services/task.go b/backend/services/task.go
index b41f0d81..15513977 100644
--- a/backend/services/task.go
+++ b/backend/services/task.go
@@ -7,6 +7,7 @@ import (
 	"crawlab/lib/cron"
 	"crawlab/model"
 	"crawlab/services/notification"
+	"crawlab/services/spider_handler"
 	"crawlab/utils"
 	"encoding/json"
 	"errors"
@@ -450,15 +451,9 @@ func ExecuteTask(id int) {
 	t.Status = constants.StatusRunning                   // 任务状态
 	t.WaitDuration = t.StartTs.Sub(t.CreateTs).Seconds() // 等待时长
 
-	// 判断爬虫文件是否存在
-	gfFile := model.GetGridFs(spider.FileId)
-	if gfFile == nil {
-		t.Error = "找不到爬虫文件,请重新上传"
-		t.Status = constants.StatusError
-		t.FinishTs = time.Now()                                 // 结束时间
-		t.RuntimeDuration = t.FinishTs.Sub(t.StartTs).Seconds() // 运行时长
-		t.TotalDuration = t.FinishTs.Sub(t.CreateTs).Seconds()  // 总时长
-		_ = t.Save()
+	// Check the spider file before running the task
+	if err := SpiderFileCheck(t, spider); err != nil {
+		log.Errorf("spider file check error: %s", err.Error())
 		return
 	}
 
@@ -538,6 +533,40 @@ func ExecuteTask(id int) {
 	log.Infof(GetWorkerPrefix(id) + "任务(ID:" + t.Id + ")" + "执行完毕. 消耗时间:" + durationStr + "秒")
 }
 
+// SpiderFileCheck verifies the spider's files before a task is executed.
+//
+// If model.GetGridFs returns nil for spider.FileId, the task is marked as
+// errored, saved, and an error carrying the same message is returned.
+// Otherwise the md5 read from the local spider directory is compared with
+// gfFile.Md5; on a mismatch the local files are replaced via
+// RemoveDownCreate. nil is returned whether or not a re-download was needed.
+//
+// t is received by value: the error fields set here are passed to t.Save(),
+// but the caller's copy of the task is left untouched.
+func SpiderFileCheck(t model.Task, spider model.Spider) error {
+	// Check whether the spider file exists
+	gfFile := model.GetGridFs(spider.FileId)
+	if gfFile == nil {
+		t.Error = "找不到爬虫文件,请重新上传"
+		t.Status = constants.StatusError
+		t.FinishTs = time.Now()                                 // finish time
+		t.RuntimeDuration = t.FinishTs.Sub(t.StartTs).Seconds() // runtime duration
+		t.TotalDuration = t.FinishTs.Sub(t.CreateTs).Seconds()  // total duration
+		_ = t.Save()
+		return errors.New(t.Error)
+	}
+
+	// Check whether the local md5 matches the stored one
+	path := filepath.Join(viper.GetString("spider.path"), spider.Name)
+	md5File := filepath.Join(path, spider_handler.Md5File)
+	md5 := utils.GetSpiderMd5Str(md5File)
+	if gfFile.Md5 != md5 {
+		spiderSync := spider_handler.SpiderSync{Spider: spider}
+		spiderSync.RemoveDownCreate(gfFile.Md5)
+	}
+	return nil
+}
+
 func GetTaskLog(id string) (logStr string, err error) {
 	task, err := model.GetTask(id)
 
@@ -680,19 +709,6 @@ func AddTask(t model.Task) error {
 	return nil
 }
 
-func HandleTaskError(t model.Task, err error) {
-	log.Error("handle task error:" + err.Error())
-	t.Status = constants.StatusError
-	t.Error = err.Error()
-	t.FinishTs = time.Now()
-	if err := t.Save(); err != nil {
-		log.Errorf(err.Error())
-		debug.PrintStack()
-		return
-	}
-	debug.PrintStack()
-}
-
 func GetTaskEmailMarkdownContent(t model.Task, s model.Spider) string {
 	n, _ := model.GetNode(t.NodeId)
 	errMsg := ""
diff --git a/backend/utils/file.go b/backend/utils/file.go
index c71b2cb0..bfe92bd3 100644
--- a/backend/utils/file.go
+++ b/backend/utils/file.go
@@ -33,7 +33,16 @@ func ReadFileOneLine(fileName string) string {
 		return ""
 	}
 	return line
+}
 
+// GetSpiderMd5Str returns the line that ReadFileOneLine reads from file with
+// every whitespace character removed, so the result can be compared directly
+// with a stored md5 string.
+func GetSpiderMd5Str(file string) string {
+	md5Str := ReadFileOneLine(file)
+	// Drop all whitespace, not just " " and "\n": a "\r" or "\t" left in the
+	// value would make every md5 comparison fail.
+	return strings.Join(strings.Fields(md5Str), "")
 }
 
 // 创建文件