fix 无法及时同步爬虫的问题

This commit is contained in:
陈景阳
2020-01-28 15:43:57 +08:00
parent 3e792a24d2
commit 4825653ae0
4 changed files with 45 additions and 34 deletions

View File

@@ -12,12 +12,11 @@ import (
"github.com/apex/log"
"github.com/globalsign/mgo"
"github.com/globalsign/mgo/bson"
uuid "github.com/satori/go.uuid"
"github.com/satori/go.uuid"
"github.com/spf13/viper"
"os"
"path/filepath"
"runtime/debug"
"strings"
)
type SpiderFileData struct {
@@ -192,21 +191,14 @@ func PublishSpider(spider model.Spider) {
md5 := filepath.Join(path, spider_handler.Md5File)
if !utils.Exists(md5) {
log.Infof("md5 file not found: %s", md5)
spiderSync.RemoveSpiderFile()
spiderSync.Download()
spiderSync.CreateMd5File(gfFile.Md5)
spiderSync.RemoveDownCreate(gfFile.Md5)
return
}
// md5值不一样则下载
md5Str := utils.ReadFileOneLine(md5)
// 去掉空格以及换行符
md5Str = strings.Replace(md5Str, " ", "", -1)
md5Str = strings.Replace(md5Str, "\n", "", -1)
md5Str := utils.GetSpiderMd5Str(md5)
if gfFile.Md5 != md5Str {
log.Infof("md5 is different, gf-md5:%s, file-md5:%s", gfFile.Md5, md5Str)
spiderSync.RemoveSpiderFile()
spiderSync.Download()
spiderSync.CreateMd5File(gfFile.Md5)
spiderSync.RemoveDownCreate(gfFile.Md5)
return
}
}

View File

@@ -38,6 +38,12 @@ func (s *SpiderSync) CreateMd5File(md5 string) {
}
}
// RemoveDownCreate runs the full re-sync sequence for this spider by calling,
// in order, RemoveSpiderFile, Download and CreateMd5File(md5).
//
// md5 is forwarded unchanged to CreateMd5File; the callers visible in this
// change pass the Md5 of the spider's GridFS file.
// NOTE(review): presumably this clears the local copy, fetches a fresh one and
// then records its checksum, so the order of the three calls matters — confirm
// against the individual methods. None of the calls reports an error here.
func (s *SpiderSync) RemoveDownCreate(md5 string) {
s.RemoveSpiderFile()
s.Download()
s.CreateMd5File(md5)
}
// 获得下载锁的key
func (s *SpiderSync) GetLockDownloadKey(spiderId string) string {
node, _ := model.GetCurrentNode()

View File

@@ -7,6 +7,7 @@ import (
"crawlab/lib/cron"
"crawlab/model"
"crawlab/services/notification"
"crawlab/services/spider_handler"
"crawlab/utils"
"encoding/json"
"errors"
@@ -450,15 +451,9 @@ func ExecuteTask(id int) {
t.Status = constants.StatusRunning // 任务状态
t.WaitDuration = t.StartTs.Sub(t.CreateTs).Seconds() // 等待时长
// 判断爬虫文件是否存在
gfFile := model.GetGridFs(spider.FileId)
if gfFile == nil {
t.Error = "找不到爬虫文件,请重新上传"
t.Status = constants.StatusError
t.FinishTs = time.Now() // 结束时间
t.RuntimeDuration = t.FinishTs.Sub(t.StartTs).Seconds() // 运行时长
t.TotalDuration = t.FinishTs.Sub(t.CreateTs).Seconds() // 总时长
_ = t.Save()
// 文件检查
if err := SpiderFileCheck(t, spider); err != nil {
log.Errorf("spider file check error: %s", err.Error())
return
}
@@ -538,6 +533,30 @@ func ExecuteTask(id int) {
log.Infof(GetWorkerPrefix(id) + "任务(ID:" + t.Id + ")" + "执行完毕. 消耗时间:" + durationStr + "秒")
}
// SpiderFileCheck verifies that the spider's file is available before a task
// runs and refreshes the local copy when it is out of date.
//
// It looks up the spider's GridFS file by spider.FileId. When the lookup
// returns nil, the task is marked as failed (status, error text, finish time
// and durations are filled in), saved, and an error carrying the same text is
// returned.
//
// Otherwise the md5 recorded in the spider's local md5 file (under the
// configured "spider.path" directory, in a sub-directory named after the
// spider) is compared with the GridFS md5. On a mismatch the local files are
// re-synchronised through SpiderSync.RemoveDownCreate. In that case, and when
// the md5 values already match, nil is returned.
func SpiderFileCheck(t model.Task, spider model.Spider) error {
	// Check that the spider file exists in GridFS.
	gfFile := model.GetGridFs(spider.FileId)
	if gfFile == nil {
		t.Error = "找不到爬虫文件,请重新上传"
		t.Status = constants.StatusError
		t.FinishTs = time.Now()                                  // finish time
		t.RuntimeDuration = t.FinishTs.Sub(t.StartTs).Seconds()  // run duration
		t.TotalDuration = t.FinishTs.Sub(t.CreateTs).Seconds()   // total duration
		if err := t.Save(); err != nil {
			// Do not drop the failure silently: if the error status cannot be
			// persisted, the log is the only place the problem shows up.
			log.Errorf("save task error: %s", err.Error())
		}
		return errors.New(t.Error)
	}

	// Compare the locally recorded md5 with the one stored in GridFS.
	spiderDir := filepath.Join(viper.GetString("spider.path"), spider.Name)
	md5File := filepath.Join(spiderDir, spider_handler.Md5File)
	localMd5 := utils.GetSpiderMd5Str(md5File)
	if gfFile.Md5 != localMd5 {
		// The local copy differs from GridFS: re-sync it.
		spiderSync := spider_handler.SpiderSync{Spider: spider}
		spiderSync.RemoveDownCreate(gfFile.Md5)
	}
	return nil
}
func GetTaskLog(id string) (logStr string, err error) {
task, err := model.GetTask(id)
@@ -680,19 +699,6 @@ func AddTask(t model.Task) error {
return nil
}
// HandleTaskError records err on the task and persists the task as failed.
//
// It logs err, sets t.Status to constants.StatusError, copies the error text
// into t.Error, stamps t.FinishTs with the current time and saves the task.
// A failure to save is logged too. The current stack trace is printed in every
// case.
func HandleTaskError(t model.Task, err error) {
	log.Error("handle task error:" + err.Error())
	t.Status = constants.StatusError
	t.Error = err.Error()
	t.FinishTs = time.Now()
	if saveErr := t.Save(); saveErr != nil {
		// Pass the message as an argument rather than as the format string, so
		// a '%' inside the error text is not interpreted as a formatting verb.
		log.Errorf("%s", saveErr.Error())
	}
	debug.PrintStack()
}
func GetTaskEmailMarkdownContent(t model.Task, s model.Spider) string {
n, _ := model.GetNode(t.NodeId)
errMsg := ""

View File

@@ -33,7 +33,14 @@ func ReadFileOneLine(fileName string) string {
return ""
}
return line
}
// md5Cleaner deletes every whitespace character that may surround the md5
// value stored in a spider's md5 file.
var md5Cleaner = strings.NewReplacer(" ", "", "\n", "", "\r", "", "\t", "")

// GetSpiderMd5Str returns the md5 string stored in file.
//
// The value is whatever ReadFileOneLine yields for file, with all spaces,
// tabs, carriage returns and line feeds removed. Carriage returns and tabs are
// stripped so that a CRLF-terminated or tab-padded file still compares equal
// to the bare md5 hex string. An md5 hex digest contains no whitespace, so
// this cannot alter a valid value.
func GetSpiderMd5Str(file string) string {
	return md5Cleaner.Replace(ReadFileOneLine(file))
}
// 创建文件