Add demo spiders

marvzhang
2020-02-02 22:56:11 +08:00
parent 19b2dc1086
commit 98e5ead285
10 changed files with 130 additions and 61 deletions

View File

@@ -1,12 +1,22 @@
package entity
type ConfigSpiderData struct {
	// General
	Name        string `yaml:"name" json:"name"`
	DisplayName string `yaml:"display_name" json:"display_name"`
	Col         string `yaml:"col" json:"col"`
	Remark      string `yaml:"remark" json:"remark"`
	// Configurable spider
	Version    string            `yaml:"version" json:"version"`
	Engine     string            `yaml:"engine" json:"engine"`
	StartUrl   string            `yaml:"start_url" json:"start_url"`
	StartStage string            `yaml:"start_stage" json:"start_stage"`
	Stages     []Stage           `yaml:"stages" json:"stages"`
	Settings   map[string]string `yaml:"settings" json:"settings"`
	// Customized spider
	Cmd string `yaml:"cmd" json:"cmd"`
}
type Stage struct {

View File

@@ -39,7 +39,6 @@ func main() {
		log.SetLevelFromString(logLevel)
	}
	log.Info("initialized log config successfully")
	if viper.GetString("log.isDeletePeriodically") == "Y" {
		err := services.InitDeleteLogPeriodically()
		if err != nil {
@@ -74,8 +73,24 @@ func main() {
			debug.PrintStack()
			panic(err)
		}
		log.Info("initialized schedule successfully")
		// Initialize user service
		if err := services.InitUserService(); err != nil {
			log.Error("init user service error:" + err.Error())
			debug.PrintStack()
			panic(err)
		}
		log.Info("initialized user service successfully")
		// Initialize dependency fetcher service
		if err := services.InitDepsFetcher(); err != nil {
			log.Error("init dependency fetcher error:" + err.Error())
			debug.PrintStack()
			panic(err)
		}
		log.Info("initialized dependency fetcher successfully")
	}
	log.Info("initialized schedule successfully")
	// Initialize task executor
	if err := services.InitTaskExecutor(); err != nil {
@@ -100,22 +115,6 @@ func main() {
	}
	log.Info("initialized spider service successfully")
	// Initialize user service
	if err := services.InitUserService(); err != nil {
		log.Error("init user service error:" + err.Error())
		debug.PrintStack()
		panic(err)
	}
	log.Info("initialized user service successfully")
	// Initialize dependency fetcher service
	if err := services.InitDepsFetcher(); err != nil {
		log.Error("init dependency fetcher error:" + err.Error())
		debug.PrintStack()
		panic(err)
	}
	log.Info("initialized dependency fetcher successfully")
	// Initialize RPC service
	if err := services.InitRpcService(); err != nil {
		log.Error("init rpc service error:" + err.Error())
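Every service above is wired up with the same boilerplate: call the initializer, and on failure log the error, dump the stack and panic. A hypothetical helper, not part of this commit and shown only to make the repeated pattern explicit (it assumes the log and debug imports already present in main.go):

// mustInit is a hypothetical helper illustrating the startup pattern used in
// main(): run an initializer and abort the process if it fails.
func mustInit(name string, fn func() error) {
	if err := fn(); err != nil {
		log.Error("init " + name + " error:" + err.Error())
		debug.PrintStack()
		panic(err)
	}
	log.Info("initialized " + name + " successfully")
}

// Usage sketch: mustInit("user service", services.InitUserService)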

View File

@@ -168,7 +168,7 @@ func GetSpiderByName(name string) Spider {
	defer s.Close()
	var result Spider
	if err := c.Find(bson.M{"name": name}).One(&result); err != nil {
	if err := c.Find(bson.M{"name": name}).One(&result); err != nil && err != mgo.ErrNotFound {
		log.Errorf("get spider error: %s, spider_name: %s", err.Error(), name)
		//debug.PrintStack()
		return result
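With err != mgo.ErrNotFound added, a lookup miss is no longer reported as an error; the caller simply receives the zero-value Spider. A caller-side sketch (assumption: an existing spider always has a non-empty Name), mirroring the check used by the demo-spider seeding further down:

spider := model.GetSpiderByName("chinaz")
if spider.Name == "" {
	// Not found: the demo spider can be created.
} else {
	// Already present: skip it.
}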

View File

@@ -123,6 +123,24 @@ func DeleteProject(c *gin.Context) {
		return
	}
	// Get the spiders associated with this project
	var spiders []model.Spider
	s, col := database.GetCol("spiders")
	defer s.Close()
	if err := col.Find(bson.M{"project_id": bson.ObjectIdHex(id)}).All(&spiders); err != nil {
		HandleError(http.StatusInternalServerError, c, err)
		return
	}
	// Clear the spiders' project ID
	for _, spider := range spiders {
		spider.ProjectId = bson.ObjectIdHex(constants.ObjectIdNull)
		if err := spider.Save(); err != nil {
			HandleError(http.StatusInternalServerError, c, err)
			return
		}
	}
	c.JSON(http.StatusOK, Response{
		Status:  "ok",
		Message: "success",
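Deleting a project now detaches its spiders instead of leaving them pointing at a deleted document: each spider's ProjectId is reset to constants.ObjectIdNull. A small sketch of that sentinel (assumption: ObjectIdNull is the all-zero 24-character hex string, which is what bson.ObjectIdHex needs in order to parse without panicking):

// Assumed definition of the "no project" sentinel used above.
const ObjectIdNull = "000000000000000000000000"

// Spiders detached from a deleted project carry this placeholder ID.
var noProjectId = bson.ObjectIdHex(ObjectIdNull)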

View File

@@ -14,7 +14,10 @@ import (
	"github.com/globalsign/mgo/bson"
	"github.com/satori/go.uuid"
	"github.com/spf13/viper"
	"gopkg.in/yaml.v2"
	"io/ioutil"
	"os"
	"path"
	"path/filepath"
	"runtime/debug"
)
@@ -264,5 +267,80 @@ func InitSpiderService() error {
	// Start the cron scheduler
	c.Start()
	if model.IsMaster() {
		// Add demo spiders
		templateSpidersDir := "../spiders"
		for _, info := range utils.ListDir(templateSpidersDir) {
			if !info.IsDir() {
				continue
			}
			spiderName := info.Name()
			// Only add the spider if it does not exist in the database yet
			spider := model.GetSpiderByName(spiderName)
			if spider.Name != "" {
				// A spider with the same name already exists, skip
				continue
			}
			// Copy the spider
			templateSpiderPath := path.Join(templateSpidersDir, spiderName)
			spiderPath := path.Join(viper.GetString("spider.path"), spiderName)
			if utils.Exists(spiderPath) {
				utils.RemoveFiles(spiderPath)
			}
			if err := utils.CopyDir(templateSpiderPath, spiderPath); err != nil {
				log.Errorf("copy error: " + err.Error())
				debug.PrintStack()
				continue
			}
			// Build the config data
			configData := entity.ConfigSpiderData{}
			// Read the YAML file (Spiderfile)
			yamlFile, err := ioutil.ReadFile(path.Join(spiderPath, "Spiderfile"))
			if err != nil {
				log.Errorf("read yaml error: " + err.Error())
				//debug.PrintStack()
				continue
			}
			// Deserialize
			if err := yaml.Unmarshal(yamlFile, &configData); err != nil {
				log.Errorf("unmarshal error: " + err.Error())
				debug.PrintStack()
				continue
			}
			// Add the spider to the database
			spider = model.Spider{
				Id:          bson.NewObjectId(),
				Name:        configData.Name,
				DisplayName: configData.DisplayName,
				Type:        constants.Customized,
				Col:         configData.Col,
				Cmd:         configData.Cmd,
				Src:         spiderPath,
				Remark:      configData.Remark,
				ProjectId:   bson.ObjectIdHex(constants.ObjectIdNull),
				FileId:      bson.ObjectIdHex(constants.ObjectIdNull),
			}
			if err := spider.Add(); err != nil {
				log.Errorf("add spider error: " + err.Error())
				debug.PrintStack()
				continue
			}
			// Upload the spider to GridFS
			if err := UploadSpiderToGridFsFromMaster(spider); err != nil {
				log.Errorf("upload spider error: " + err.Error())
				debug.PrintStack()
				continue
			}
		}
	}
	return nil
}
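The seeding loop leans on a few helpers from the utils package. Their real implementations live elsewhere in the repo; the sketch below only illustrates the behaviour the loop assumes (list a directory, test for existence, wipe and recursively copy a directory) and should not be read as the actual code:

package utils

import (
	"io/ioutil"
	"os"
	"os/exec"
)

// ListDir returns the entries of a directory, or an empty slice on error.
func ListDir(path string) []os.FileInfo {
	infos, err := ioutil.ReadDir(path)
	if err != nil {
		return []os.FileInfo{}
	}
	return infos
}

// Exists reports whether the path exists on disk.
func Exists(path string) bool {
	_, err := os.Stat(path)
	return err == nil
}

// RemoveFiles deletes a file or a directory tree.
func RemoveFiles(path string) {
	_ = os.RemoveAll(path)
}

// CopyDir copies src to dst; shelling out to cp keeps the sketch short.
func CopyDir(src string, dst string) error {
	return exec.Command("cp", "-r", src, dst).Run()
}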

View File

@@ -0,0 +1,4 @@
name: "chinaz"
display_name: "站长之家 (Scrapy)"
col: "results_chinaz"
cmd: "scrapy crawl chinaz_spider"
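This Spiderfile is what InitSpiderService above reads and unmarshals into entity.ConfigSpiderData. A minimal standalone sketch of that round trip, using the gopkg.in/yaml.v2 package this commit adds as an import (the trimmed-down struct and the main wrapper are illustrative only):

package main

import (
	"fmt"

	"gopkg.in/yaml.v2"
)

// Subset of entity.ConfigSpiderData covering the fields a customized spider uses.
type ConfigSpiderData struct {
	Name        string `yaml:"name" json:"name"`
	DisplayName string `yaml:"display_name" json:"display_name"`
	Col         string `yaml:"col" json:"col"`
	Cmd         string `yaml:"cmd" json:"cmd"`
}

func main() {
	// Content of the demo Spiderfile above.
	doc := []byte("name: \"chinaz\"\ndisplay_name: \"站长之家 (Scrapy)\"\ncol: \"results_chinaz\"\ncmd: \"scrapy crawl chinaz_spider\"\n")
	var data ConfigSpiderData
	if err := yaml.Unmarshal(doc, &data); err != nil {
		panic(err)
	}
	fmt.Printf("%+v\n", data)
	// Prints: {Name:chinaz DisplayName:站长之家 (Scrapy) Col:results_chinaz Cmd:scrapy crawl chinaz_spider}
}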

View File

@@ -5,24 +5,3 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import os
from pymongo import MongoClient
MONGO_HOST = os.environ.get('MONGO_HOST') or 'localhost'
MONGO_PORT = int(os.environ.get('MONGO_PORT') or '27017')
MONGO_DB = os.environ.get('MONGO_DB') or 'crawlab_test'
class MongoPipeline(object):
    mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT)
    db = mongo[MONGO_DB]
    col_name = os.environ.get('CRAWLAB_COLLECTION') or 'sites'
    col = db[col_name]
    def process_item(self, item, spider):
        item['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
        item['_id'] = item['domain']
        if self.col.find_one({'_id': item['_id']}) is None:
            self.col.save(item)
        return item

View File

@@ -65,7 +65,7 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'chinaz.pipelines.MongoPipeline': 300,
    'crawlab.pipelines.CrawlabMongoPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)

View File

@@ -4,22 +4,3 @@
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import os
from pymongo import MongoClient
MONGO_HOST = os.environ.get('MONGO_HOST') or 'localhost'
MONGO_PORT = int(os.environ.get('MONGO_PORT') or '27017')
MONGO_DB = os.environ.get('MONGO_DB') or 'crawlab_test'
class MongoPipeline(object):
    mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT)
    db = mongo[MONGO_DB]
    col_name = os.environ.get('CRAWLAB_COLLECTION')
    col = db[col_name]
    def process_item(self, item, spider):
        item['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
        self.col.save(item)
        return item

View File

@@ -64,7 +64,7 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'realestate.pipelines.MongoPipeline': 300,
    'crawlab.pipelines.CrawlabMongoPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)