From 98e5ead2854dc8563e73b4ef6053aed45876086f Mon Sep 17 00:00:00 2001
From: marvzhang
Date: Sun, 2 Feb 2020 22:56:11 +0800
Subject: [PATCH] Add demo spiders
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 backend/entity/config_spider.go            | 10 +++
 backend/main.go                            | 35 +++++-----
 backend/model/spider.go                    |  2 +-
 backend/routes/projects.go                 | 18 +++++
 backend/services/spider.go                 | 78 ++++++++++++++++++++++
 spiders/chinaz/Spiderfile                  |  4 ++
 spiders/chinaz/chinaz/pipelines.py         | 21 ------
 spiders/chinaz/chinaz/settings.py          |  2 +-
 spiders/realestate/realestate/pipelines.py | 19 ------
 spiders/realestate/realestate/settings.py  |  2 +-
 10 files changed, 130 insertions(+), 61 deletions(-)
 create mode 100644 spiders/chinaz/Spiderfile

diff --git a/backend/entity/config_spider.go b/backend/entity/config_spider.go
index d9e085d2..bb1295e7 100644
--- a/backend/entity/config_spider.go
+++ b/backend/entity/config_spider.go
@@ -1,12 +1,22 @@
 package entity
 
 type ConfigSpiderData struct {
+	// general
+	Name        string `yaml:"name" json:"name"`
+	DisplayName string `yaml:"display_name" json:"display_name"`
+	Col         string `yaml:"col" json:"col"`
+	Remark      string `yaml:"remark" json:"remark"`
+
+	// configurable spiders
 	Version    string            `yaml:"version" json:"version"`
 	Engine     string            `yaml:"engine" json:"engine"`
 	StartUrl   string            `yaml:"start_url" json:"start_url"`
 	StartStage string            `yaml:"start_stage" json:"start_stage"`
 	Stages     []Stage           `yaml:"stages" json:"stages"`
 	Settings   map[string]string `yaml:"settings" json:"settings"`
+
+	// customized spiders
+	Cmd string `yaml:"cmd" json:"cmd"`
 }
 
 type Stage struct {
diff --git a/backend/main.go b/backend/main.go
index d9d44a12..ab0d0e7b 100644
--- a/backend/main.go
+++ b/backend/main.go
@@ -39,7 +39,6 @@ func main() {
 		log.SetLevelFromString(logLevel)
 	}
 	log.Info("initialized log config successfully")
-
 	if viper.GetString("log.isDeletePeriodically") == "Y" {
 		err := services.InitDeleteLogPeriodically()
 		if err != nil {
@@ -74,8 +73,24 @@ func main() {
 			debug.PrintStack()
 			panic(err)
 		}
+		log.Info("initialized schedule successfully")
+
+		// initialize user service
+		if err := services.InitUserService(); err != nil {
+			log.Error("init user service error:" + err.Error())
+			debug.PrintStack()
+			panic(err)
+		}
+		log.Info("initialized user service successfully")
+
+		// initialize dependency fetcher
+		if err := services.InitDepsFetcher(); err != nil {
+			log.Error("init dependency fetcher error:" + err.Error())
+			debug.PrintStack()
+			panic(err)
+		}
+		log.Info("initialized dependency fetcher successfully")
 	}
-	log.Info("initialized schedule successfully")
 
 	// initialize task executor
 	if err := services.InitTaskExecutor(); err != nil {
@@ -100,22 +115,6 @@ func main() {
 	}
 	log.Info("initialized spider service successfully")
 
-	// initialize user service
-	if err := services.InitUserService(); err != nil {
-		log.Error("init user service error:" + err.Error())
-		debug.PrintStack()
-		panic(err)
-	}
-	log.Info("initialized user service successfully")
-
-	// initialize dependency fetcher
-	if err := services.InitDepsFetcher(); err != nil {
-		log.Error("init dependency fetcher error:" + err.Error())
-		debug.PrintStack()
-		panic(err)
-	}
-	log.Info("initialized dependency fetcher successfully")
-
 	// initialize RPC service
 	if err := services.InitRpcService(); err != nil {
 		log.Error("init rpc service error:" + err.Error())
diff --git a/backend/model/spider.go b/backend/model/spider.go
index 475b12e2..2baeb6ed 100644
--- a/backend/model/spider.go
+++ b/backend/model/spider.go
@@ -168,7 +168,7 @@ func GetSpiderByName(name string) Spider {
 	defer s.Close()
 
 	var result Spider
-	if err := c.Find(bson.M{"name": name}).One(&result); err != nil {
+	if err := c.Find(bson.M{"name": name}).One(&result); err != nil && err != mgo.ErrNotFound {
 		log.Errorf("get spider error: %s, spider_name: %s", err.Error(), name)
 		//debug.PrintStack()
 		return result
diff --git a/backend/routes/projects.go b/backend/routes/projects.go
index b1f99ab3..34b2d7f4 100644
--- a/backend/routes/projects.go
+++ b/backend/routes/projects.go
@@ -123,6 +123,24 @@ func DeleteProject(c *gin.Context) {
 		return
 	}
 
+	// fetch the spiders associated with this project
+	var spiders []model.Spider
+	s, col := database.GetCol("spiders")
+	defer s.Close()
+	if err := col.Find(bson.M{"project_id": bson.ObjectIdHex(id)}).All(&spiders); err != nil {
+		HandleError(http.StatusInternalServerError, c, err)
+		return
+	}
+
+	// reset the spiders' project id to null
+	for _, spider := range spiders {
+		spider.ProjectId = bson.ObjectIdHex(constants.ObjectIdNull)
+		if err := spider.Save(); err != nil {
+			HandleError(http.StatusInternalServerError, c, err)
+			return
+		}
+	}
+
 	c.JSON(http.StatusOK, Response{
 		Status:  "ok",
 		Message: "success",
diff --git a/backend/services/spider.go b/backend/services/spider.go
index a03d4dc8..b395a956 100644
--- a/backend/services/spider.go
+++ b/backend/services/spider.go
@@ -14,7 +14,10 @@ import (
 	"github.com/globalsign/mgo/bson"
 	"github.com/satori/go.uuid"
 	"github.com/spf13/viper"
+	"gopkg.in/yaml.v2"
+	"io/ioutil"
 	"os"
+	"path"
 	"path/filepath"
 	"runtime/debug"
 )
@@ -264,5 +267,80 @@ func InitSpiderService() error {
 	// start the cron scheduler
 	c.Start()
 
+	if model.IsMaster() {
+		// add demo spiders
+		templateSpidersDir := "../spiders"
+		for _, info := range utils.ListDir(templateSpidersDir) {
+			if !info.IsDir() {
+				continue
+			}
+			spiderName := info.Name()
+
+			// add the spider only if it does not exist in the database yet
+			spider := model.GetSpiderByName(spiderName)
+			if spider.Name != "" {
+				// a spider with the same name exists, skip
+				continue
+			}
+
+			// copy the spider files
+			templateSpiderPath := path.Join(templateSpidersDir, spiderName)
+			spiderPath := path.Join(viper.GetString("spider.path"), spiderName)
+			if utils.Exists(spiderPath) {
+				utils.RemoveFiles(spiderPath)
+			}
+			if err := utils.CopyDir(templateSpiderPath, spiderPath); err != nil {
+				log.Errorf("copy error: " + err.Error())
+				debug.PrintStack()
+				continue
+			}
+
+			// construct config data
+			configData := entity.ConfigSpiderData{}
+
+			// read the YAML file
+			yamlFile, err := ioutil.ReadFile(path.Join(spiderPath, "Spiderfile"))
+			if err != nil {
+				log.Errorf("read yaml error: " + err.Error())
+				//debug.PrintStack()
+				continue
+			}
+
+			// deserialize
+			if err := yaml.Unmarshal(yamlFile, &configData); err != nil {
+				log.Errorf("unmarshal error: " + err.Error())
+				debug.PrintStack()
+				continue
+			}
+
+			// add the spider to the database
+			spider = model.Spider{
+				Id:          bson.NewObjectId(),
+				Name:        configData.Name,
+				DisplayName: configData.DisplayName,
+				Type:        constants.Customized,
+				Col:         configData.Col,
+				Cmd:         configData.Cmd,
+				Src:         spiderPath,
+				Remark:      configData.Remark,
+				ProjectId:   bson.ObjectIdHex(constants.ObjectIdNull),
+				FileId:      bson.ObjectIdHex(constants.ObjectIdNull),
+			}
+			if err := spider.Add(); err != nil {
+				log.Errorf("add spider error: " + err.Error())
+				debug.PrintStack()
+				continue
+			}
+
+			// upload the spider to GridFS
+			if err := UploadSpiderToGridFsFromMaster(spider); err != nil {
+				log.Errorf("upload spider error: " + err.Error())
+				debug.PrintStack()
+				continue
+			}
+		}
+
+	}
+
 	return nil
 }
diff --git a/spiders/chinaz/Spiderfile b/spiders/chinaz/Spiderfile
new file mode 100644
index 00000000..d36c7cf2
--- /dev/null
+++ b/spiders/chinaz/Spiderfile
@@ -0,0 +1,4 @@
+name: "chinaz"
+display_name: "站长之家 (Scrapy)"
+col: "results_chinaz"
+cmd: "scrapy crawl chinaz_spider"
\ No newline at end of file
diff --git a/spiders/chinaz/chinaz/pipelines.py b/spiders/chinaz/chinaz/pipelines.py
index 747de355..b29f9eb7 100644
--- a/spiders/chinaz/chinaz/pipelines.py
+++ b/spiders/chinaz/chinaz/pipelines.py
@@ -5,24 +5,3 @@
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 
-import os
-
-from pymongo import MongoClient
-
-MONGO_HOST = os.environ.get('MONGO_HOST') or 'localhost'
-MONGO_PORT = int(os.environ.get('MONGO_PORT') or '27017')
-MONGO_DB = os.environ.get('MONGO_DB') or 'crawlab_test'
-
-
-class MongoPipeline(object):
-    mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT)
-    db = mongo[MONGO_DB]
-    col_name = os.environ.get('CRAWLAB_COLLECTION') or 'sites'
-    col = db[col_name]
-
-    def process_item(self, item, spider):
-        item['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
-        item['_id'] = item['domain']
-        if self.col.find_one({'_id': item['_id']}) is None:
-            self.col.save(item)
-        return item
diff --git a/spiders/chinaz/chinaz/settings.py b/spiders/chinaz/chinaz/settings.py
index 41fb31bf..932ec9ed 100644
--- a/spiders/chinaz/chinaz/settings.py
+++ b/spiders/chinaz/chinaz/settings.py
@@ -65,7 +65,7 @@ ROBOTSTXT_OBEY = True
 # Configure item pipelines
 # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 ITEM_PIPELINES = {
-    'chinaz.pipelines.MongoPipeline': 300,
+    'crawlab.pipelines.CrawlabMongoPipeline': 300,
 }
 
 # Enable and configure the AutoThrottle extension (disabled by default)
diff --git a/spiders/realestate/realestate/pipelines.py b/spiders/realestate/realestate/pipelines.py
index 7b9eb9f2..3371792b 100644
--- a/spiders/realestate/realestate/pipelines.py
+++ b/spiders/realestate/realestate/pipelines.py
@@ -4,22 +4,3 @@
 #
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
-import os
-
-from pymongo import MongoClient
-
-MONGO_HOST = os.environ.get('MONGO_HOST') or 'localhost'
-MONGO_PORT = int(os.environ.get('MONGO_PORT') or '27017')
-MONGO_DB = os.environ.get('MONGO_DB') or 'crawlab_test'
-
-
-class MongoPipeline(object):
-    mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT)
-    db = mongo[MONGO_DB]
-    col_name = os.environ.get('CRAWLAB_COLLECTION')
-    col = db[col_name]
-
-    def process_item(self, item, spider):
-        item['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
-        self.col.save(item)
-        return item
diff --git a/spiders/realestate/realestate/settings.py b/spiders/realestate/realestate/settings.py
index da1ada29..758f8ed0 100644
--- a/spiders/realestate/realestate/settings.py
+++ b/spiders/realestate/realestate/settings.py
@@ -64,7 +64,7 @@ ROBOTSTXT_OBEY = True
 # Configure item pipelines
 # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 ITEM_PIPELINES = {
-    'realestate.pipelines.MongoPipeline': 300,
+    'crawlab.pipelines.CrawlabMongoPipeline': 300,
 }
 
 # Enable and configure the AutoThrottle extension (disabled by default)