diff --git a/CHANGELOG-zh.md b/CHANGELOG-zh.md index 34d53b06..83440bbb 100644 --- a/CHANGELOG-zh.md +++ b/CHANGELOG-zh.md @@ -1,15 +1,23 @@ # 0.4.5 (unkown) ### 功能 / 优化 - **交互式教程**. 引导用户了解 Crawlab 的主要功能. +- **加入全局环境变量**. 可以设置全局环境变量,然后传入到所有爬虫程序中. [#177](https://github.com/crawlab-team/crawlab/issues/177) +- **项目**. 允许用户将爬虫关联到项目上. [#316](https://github.com/crawlab-team/crawlab/issues/316) +- **示例爬虫**. 当初始化时,自动加入示例爬虫. [#379](https://github.com/crawlab-team/crawlab/issues/379) +- **用户管理优化**. 限制管理用户的权限. [#456](https://github.com/crawlab-team/crawlab/issues/456) +- **设置页面优化**. +- **任务结果页面优化**. ### Bug 修复 +- **无法找到爬虫文件错误**. [#485](https://github.com/crawlab-team/crawlab/issues/485) - **点击删除按钮导致跳转**. [#480](https://github.com/crawlab-team/crawlab/issues/480) - **无法在空爬虫里创建文件**. [#479](https://github.com/crawlab-team/crawlab/issues/479) - **下载结果错误**. [#465](https://github.com/crawlab-team/crawlab/issues/465) - **crawlab-sdk CLI 错误**. [#458](https://github.com/crawlab-team/crawlab/issues/458) - **页面刷新问题**. [#441](https://github.com/crawlab-team/crawlab/issues/441) +- **结果不支持 JSON**. [#202](https://github.com/crawlab-team/crawlab/issues/202) - **修复“删除爬虫后获取所有爬虫”错误**. -- **修复 i18n warning 警告**. +- **修复 i18n 警告**. # 0.4.4 (2020-01-17) diff --git a/CHANGELOG.md b/CHANGELOG.md index 182df69f..a4bf4fa4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,13 +1,21 @@ -# 0.4.5 (unkown) +# 0.4.5 (2020-02-03) ### Features / Enhancement - **Interactive Tutorial**. Guide users through the main functionalities of Crawlab. +- **Global Environment Variables**. Allow users to set global environment variables, which will be passed into all spider programs. [#177](https://github.com/crawlab-team/crawlab/issues/177) +- **Project**. Allow users to link spiders to projects. [#316](https://github.com/crawlab-team/crawlab/issues/316) +- **Demo Spiders**. Added demo spiders when Crawlab is initialized. [#379](https://github.com/crawlab-team/crawlab/issues/379) +- **User Admin Optimization**. 
Restrict privileges of admin users. [#456](https://github.com/crawlab-team/crawlab/issues/456) +- **Setting Page Optimization**. +- **Task Results Optimization**. ### Bug Fixes +- **Unable to find spider file error**. [#485](https://github.com/crawlab-team/crawlab/issues/485) - **Click delete button results in redirect**. [#480](https://github.com/crawlab-team/crawlab/issues/480) - **Unable to create files in an empty spider**. [#479](https://github.com/crawlab-team/crawlab/issues/479) - **Download results error**. [#465](https://github.com/crawlab-team/crawlab/issues/465) - **crawlab-sdk CLI error**. [#458](https://github.com/crawlab-team/crawlab/issues/458) - **Page refresh issue**. [#441](https://github.com/crawlab-team/crawlab/issues/441) +- **Results do not support JSON**. [#202](https://github.com/crawlab-team/crawlab/issues/202) - **Getting all spider after deleting a spider**. - **i18n warning**. diff --git a/backend/conf/config.yml b/backend/conf/config.yml index e5dcb1f9..385834bd 100644 --- a/backend/conf/config.yml +++ b/backend/conf/config.yml @@ -35,7 +35,7 @@ task: workers: 4 other: tmppath: "/tmp" -version: 0.4.4 +version: 0.4.5 setting: allowRegister: "N" notification: diff --git a/backend/database/redis.go b/backend/database/redis.go index 57cef9a2..bc6b4982 100644 --- a/backend/database/redis.go +++ b/backend/database/redis.go @@ -83,7 +83,7 @@ func (r *Redis) HGet(collection string, key string) (string, error) { defer utils.Close(c) value, err2 := redis.String(c.Do("HGET", collection, key)) - if err2 != nil { + if err2 != nil && err2 != redis.ErrNil { log.Error(err2.Error()) debug.PrintStack() return value, err2 diff --git a/backend/entity/config_spider.go b/backend/entity/config_spider.go index d9e085d2..054ee2fe 100644 --- a/backend/entity/config_spider.go +++ b/backend/entity/config_spider.go @@ -1,12 +1,22 @@ package entity type ConfigSpiderData struct { - Version string `yaml:"version" json:"version"` + // 通用 + Name string `yaml:"name" 
json:"name"` + DisplayName string `yaml:"display_name" json:"display_name"` + Col string `yaml:"col" json:"col"` + Remark string `yaml:"remark" json:"remark"` + Type string `yaml:"type" bson:"type"` + + // 可配置爬虫 Engine string `yaml:"engine" json:"engine"` StartUrl string `yaml:"start_url" json:"start_url"` StartStage string `yaml:"start_stage" json:"start_stage"` Stages []Stage `yaml:"stages" json:"stages"` Settings map[string]string `yaml:"settings" json:"settings"` + + // 自定义爬虫 + Cmd string `yaml:"cmd" json:"cmd"` } type Stage struct { diff --git a/backend/main.go b/backend/main.go index 4a84462d..ab0d0e7b 100644 --- a/backend/main.go +++ b/backend/main.go @@ -39,7 +39,6 @@ func main() { log.SetLevelFromString(logLevel) } log.Info("initialized log config successfully") - if viper.GetString("log.isDeletePeriodically") == "Y" { err := services.InitDeleteLogPeriodically() if err != nil { @@ -74,8 +73,24 @@ func main() { debug.PrintStack() panic(err) } + log.Info("initialized schedule successfully") + + // 初始化用户服务 + if err := services.InitUserService(); err != nil { + log.Error("init user service error:" + err.Error()) + debug.PrintStack() + panic(err) + } + log.Info("initialized user service successfully") + + // 初始化依赖服务 + if err := services.InitDepsFetcher(); err != nil { + log.Error("init dependency fetcher error:" + err.Error()) + debug.PrintStack() + panic(err) + } + log.Info("initialized dependency fetcher successfully") } - log.Info("initialized schedule successfully") // 初始化任务执行器 if err := services.InitTaskExecutor(); err != nil { @@ -100,22 +115,6 @@ func main() { } log.Info("initialized spider service successfully") - // 初始化用户服务 - if err := services.InitUserService(); err != nil { - log.Error("init user service error:" + err.Error()) - debug.PrintStack() - panic(err) - } - log.Info("initialized user service successfully") - - // 初始化依赖服务 - if err := services.InitDepsFetcher(); err != nil { - log.Error("init dependency fetcher error:" + err.Error()) - 
debug.PrintStack() - panic(err) - } - log.Info("initialized dependency fetcher successfully") - // 初始化RPC服务 if err := services.InitRpcService(); err != nil { log.Error("init rpc service error:" + err.Error()) @@ -224,10 +223,18 @@ func main() { } // 全局变量 { - authGroup.POST("/variable", routes.PostVariable) // 新增 - authGroup.PUT("/variable/:id", routes.PutVariable) //修改 - authGroup.DELETE("/variable/:id", routes.DeleteVariable) //删除 authGroup.GET("/variables", routes.GetVariableList) // 列表 + authGroup.PUT("/variable", routes.PutVariable) // 新增 + authGroup.POST("/variable/:id", routes.PostVariable) //修改 + authGroup.DELETE("/variable/:id", routes.DeleteVariable) //删除 + } + // 项目 + { + authGroup.GET("/projects", routes.GetProjectList) // 列表 + authGroup.GET("/projects/tags", routes.GetProjectTags) // 项目标签 + authGroup.PUT("/projects", routes.PutProject) // 新增 + authGroup.POST("/projects/:id", routes.PostProject) // 修改 + authGroup.DELETE("/projects/:id", routes.DeleteProject) //删除 } // 统计数据 authGroup.GET("/stats/home", routes.GetHomeStats) // 首页统计数据 diff --git a/backend/model/project.go b/backend/model/project.go new file mode 100644 index 00000000..92c72655 --- /dev/null +++ b/backend/model/project.go @@ -0,0 +1,146 @@ +package model + +import ( + "crawlab/constants" + "crawlab/database" + "github.com/apex/log" + "github.com/globalsign/mgo/bson" + "runtime/debug" + "time" +) + +type Project struct { + Id bson.ObjectId `json:"_id" bson:"_id"` + Name string `json:"name" bson:"name"` + Description string `json:"description" bson:"description"` + Tags []string `json:"tags" bson:"tags"` + + CreateTs time.Time `json:"create_ts" bson:"create_ts"` + UpdateTs time.Time `json:"update_ts" bson:"update_ts"` + + // 前端展示 + Spiders []Spider `json:"spiders" bson:"spiders"` +} + +func (p *Project) Save() error { + s, c := database.GetCol("projects") + defer s.Close() + + p.UpdateTs = time.Now() + + if err := c.UpdateId(p.Id, p); err != nil { + debug.PrintStack() + return err + } + return 
nil +} + +func (p *Project) Add() error { + s, c := database.GetCol("projects") + defer s.Close() + + p.Id = bson.NewObjectId() + p.UpdateTs = time.Now() + p.CreateTs = time.Now() + if err := c.Insert(p); err != nil { + log.Errorf(err.Error()) + debug.PrintStack() + return err + } + + return nil +} + +func (p *Project) GetSpiders() ([]Spider, error) { + s, c := database.GetCol("spiders") + defer s.Close() + + var query interface{} + if p.Id.Hex() == constants.ObjectIdNull { + query = bson.M{ + "$or": []bson.M{ + {"project_id": p.Id}, + {"project_id": bson.M{"$exists": false}}, + }, + } + } else { + query = bson.M{"project_id": p.Id} + } + + var spiders []Spider + if err := c.Find(query).All(&spiders); err != nil { + log.Errorf(err.Error()) + debug.PrintStack() + return spiders, err + } + + return spiders, nil +} + +func GetProject(id bson.ObjectId) (Project, error) { + s, c := database.GetCol("projects") + defer s.Close() + var p Project + if err := c.Find(bson.M{"_id": id}).One(&p); err != nil { + log.Errorf(err.Error()) + debug.PrintStack() + return p, err + } + return p, nil +} + +func GetProjectList(filter interface{}, skip int, sortKey string) ([]Project, error) { + s, c := database.GetCol("projects") + defer s.Close() + + var projects []Project + if err := c.Find(filter).Skip(skip).Limit(constants.Infinite).Sort(sortKey).All(&projects); err != nil { + debug.PrintStack() + return projects, err + } + return projects, nil +} + +func GetProjectListTotal(filter interface{}) (int, error) { + s, c := database.GetCol("projects") + defer s.Close() + + var result int + result, err := c.Find(filter).Count() + if err != nil { + return result, err + } + return result, nil +} + +func UpdateProject(id bson.ObjectId, item Project) error { + s, c := database.GetCol("projects") + defer s.Close() + + var result Project + if err := c.FindId(id).One(&result); err != nil { + debug.PrintStack() + return err + } + + if err := item.Save(); err != nil { + return err + } + return nil 
+} + +func RemoveProject(id bson.ObjectId) error { + s, c := database.GetCol("projects") + defer s.Close() + + var result Project // existence check only; the decoded value is discarded + if err := c.FindId(id).One(&result); err != nil { + return err + } + + if err := c.RemoveId(id); err != nil { + return err + } + + return nil +} diff --git a/backend/model/spider.go b/backend/model/spider.go index 3026a66b..2baeb6ed 100644 --- a/backend/model/spider.go +++ b/backend/model/spider.go @@ -32,6 +32,7 @@ type Spider struct { Envs []Env `json:"envs" bson:"envs"` // 环境变量 Remark string `json:"remark" bson:"remark"` // 备注 Src string `json:"src" bson:"src"` // 源码位置 + ProjectId bson.ObjectId `json:"project_id" bson:"project_id"` // 项目ID // 自定义爬虫 Cmd string `json:"cmd" bson:"cmd"` // 执行命令 @@ -56,6 +57,11 @@ func (spider *Spider) Save() error { spider.UpdateTs = time.Now() + // 兼容没有项目ID的爬虫 + if spider.ProjectId.Hex() == "" { + spider.ProjectId = bson.ObjectIdHex(constants.ObjectIdNull) + } + if err := c.UpdateId(spider.Id, spider); err != nil { debug.PrintStack() return err @@ -162,7 +168,7 @@ func GetSpiderByName(name string) Spider { defer s.Close() var result Spider - if err := c.Find(bson.M{"name": name}).One(&result); err != nil { + if err := c.Find(bson.M{"name": name}).One(&result); err != nil && err != mgo.ErrNotFound { log.Errorf("get spider error: %s, spider_name: %s", err.Error(), name) //debug.PrintStack() return result diff --git a/backend/routes/projects.go b/backend/routes/projects.go new file mode 100644 index 00000000..34b2d7f4 --- /dev/null +++ b/backend/routes/projects.go @@ -0,0 +1,191 @@ +package routes + +import ( + "crawlab/constants" + "crawlab/database" + "crawlab/model" + "github.com/gin-gonic/gin" + "github.com/globalsign/mgo/bson" + "net/http" +) + +func GetProjectList(c *gin.Context) { + tag := c.Query("tag") + + // 筛选条件 + query := bson.M{} + if tag != "" { + query["tags"] = tag + } + + // 获取列表 + projects, err := model.GetProjectList(query, 0, "+_id") + if err != nil { + 
HandleError(http.StatusInternalServerError, c, err) + return + } + + // 获取总数 + total, err := model.GetProjectListTotal(query) + if err != nil { + HandleError(http.StatusInternalServerError, c, err) + return + } + + // 获取每个项目的爬虫列表 + for i, p := range projects { + spiders, err := p.GetSpiders() + if err != nil { + HandleError(http.StatusInternalServerError, c, err) + return + } + projects[i].Spiders = spiders + } + + // 获取未被分配的爬虫数量 + if tag == "" { + noProject := model.Project{ + Id: bson.ObjectIdHex(constants.ObjectIdNull), + Name: "No Project", + Description: "Not assigned to any project", + } + spiders, err := noProject.GetSpiders() + if err != nil { + HandleError(http.StatusInternalServerError, c, err) + return + } + noProject.Spiders = spiders + projects = append(projects, noProject) + } + + c.JSON(http.StatusOK, ListResponse{ + Status: "ok", + Message: "success", + Data: projects, + Total: total, + }) +} + +func PutProject(c *gin.Context) { + // 绑定请求数据 + var p model.Project + if err := c.ShouldBindJSON(&p); err != nil { + HandleError(http.StatusBadRequest, c, err) + return + } + + if err := p.Add(); err != nil { + HandleError(http.StatusInternalServerError, c, err) + return + } + + c.JSON(http.StatusOK, Response{ + Status: "ok", + Message: "success", + }) +} + +func PostProject(c *gin.Context) { + id := c.Param("id") + + if !bson.IsObjectIdHex(id) { + HandleErrorF(http.StatusBadRequest, c, "invalid id") + return // bson.ObjectIdHex below panics on a non-hex id + } + + var item model.Project + if err := c.ShouldBindJSON(&item); err != nil { + HandleError(http.StatusBadRequest, c, err) + return + } + + if err := model.UpdateProject(bson.ObjectIdHex(id), item); err != nil { + HandleError(http.StatusInternalServerError, c, err) + return + } + + c.JSON(http.StatusOK, Response{ + Status: "ok", + Message: "success", + }) +} + +func DeleteProject(c *gin.Context) { + id := c.Param("id") + + if !bson.IsObjectIdHex(id) { + HandleErrorF(http.StatusBadRequest, c, "invalid id") + return + } + + // remove the project from the database + if err := 
model.RemoveProject(bson.ObjectIdHex(id)); err != nil { + HandleError(http.StatusInternalServerError, c, err) + return + } + + // 获取相关的爬虫 + var spiders []model.Spider + s, col := database.GetCol("spiders") + defer s.Close() + if err := col.Find(bson.M{"project_id": bson.ObjectIdHex(id)}).All(&spiders); err != nil { + HandleError(http.StatusInternalServerError, c, err) + return + } + + // 将爬虫的项目ID置空 + for _, spider := range spiders { + spider.ProjectId = bson.ObjectIdHex(constants.ObjectIdNull) + if err := spider.Save(); err != nil { + HandleError(http.StatusInternalServerError, c, err) + return + } + } + + c.JSON(http.StatusOK, Response{ + Status: "ok", + Message: "success", + }) +} + +func GetProjectTags(c *gin.Context) { + type Result struct { + Tag string `json:"tag" bson:"tag"` + } + + s, col := database.GetCol("projects") + defer s.Close() + + pipeline := []bson.M{ + { + "$unwind": "$tags", + }, + { + "$group": bson.M{ + "_id": "$tags", + }, + }, + { + "$sort": bson.M{ + "_id": 1, + }, + }, + { + "$addFields": bson.M{ + "tag": "$_id", + }, + }, + } + + var items []Result + if err := col.Pipe(pipeline).All(&items); err != nil { + HandleError(http.StatusInternalServerError, c, err) + return + } + + c.JSON(http.StatusOK, Response{ + Status: "ok", + Message: "success", + Data: items, + }) +} diff --git a/backend/routes/spider.go b/backend/routes/spider.go index 4adfb707..2b6dfd63 100644 --- a/backend/routes/spider.go +++ b/backend/routes/spider.go @@ -30,6 +30,7 @@ func GetSpiderList(c *gin.Context) { pageNum, _ := c.GetQuery("page_num") pageSize, _ := c.GetQuery("page_size") keyword, _ := c.GetQuery("keyword") + pid, _ := c.GetQuery("project_id") t, _ := c.GetQuery("type") sortKey, _ := c.GetQuery("sort_key") sortDirection, _ := c.GetQuery("sort_direction") @@ -41,6 +42,16 @@ func GetSpiderList(c *gin.Context) { if t != "" && t != "all" { filter["type"] = t } + if pid == "" { + // do nothing + } else if pid == constants.ObjectIdNull { + filter["$or"] = []bson.M{ 
+ {"project_id": bson.ObjectIdHex(pid)}, + {"project_id": bson.M{"$exists": false}}, + } + } else { + filter["project_id"] = bson.ObjectIdHex(pid) + } // 排序 sortStr := "-_id" diff --git a/backend/routes/variable.go b/backend/routes/variable.go index 56f51ed7..c35c16ab 100644 --- a/backend/routes/variable.go +++ b/backend/routes/variable.go @@ -8,7 +8,7 @@ import ( ) // 新增 -func PostVariable(c *gin.Context) { +func PutVariable(c *gin.Context) { var variable model.Variable if err := c.ShouldBindJSON(&variable); err != nil { HandleError(http.StatusBadRequest, c, err) @@ -22,7 +22,7 @@ func PostVariable(c *gin.Context) { } // 修改 -func PutVariable(c *gin.Context) { +func PostVariable(c *gin.Context) { var id = c.Param("id") var variable model.Variable if err := c.ShouldBindJSON(&variable); err != nil { diff --git a/backend/services/config_spider.go b/backend/services/config_spider.go index fe0a3da1..29e1c2ca 100644 --- a/backend/services/config_spider.go +++ b/backend/services/config_spider.go @@ -6,6 +6,7 @@ import ( "crawlab/entity" "crawlab/model" "crawlab/model/config_spider" + "crawlab/services/spider_handler" "crawlab/utils" "errors" "fmt" @@ -227,6 +228,17 @@ func ProcessSpiderFilesFromConfigData(spider model.Spider, configData entity.Con spider.FileId = fid _ = spider.Save() + // 获取爬虫同步实例 + spiderSync := spider_handler.SpiderSync{ + Spider: spider, + } + + // 获取gfFile + gfFile2 := model.GetGridFs(spider.FileId) + + // 生成MD5 + spiderSync.CreateMd5File(gfFile2.Md5) + return nil } diff --git a/backend/services/spider.go b/backend/services/spider.go index fe162f12..fb785d85 100644 --- a/backend/services/spider.go +++ b/backend/services/spider.go @@ -14,7 +14,10 @@ import ( "github.com/globalsign/mgo/bson" "github.com/satori/go.uuid" "github.com/spf13/viper" + "gopkg.in/yaml.v2" + "io/ioutil" "os" + "path" "path/filepath" "runtime/debug" ) @@ -69,6 +72,17 @@ func UploadSpiderToGridFsFromMaster(spider model.Spider) error { spider.FileId = fid _ = spider.Save() + // 
获取爬虫同步实例 + spiderSync := spider_handler.SpiderSync{ + Spider: spider, + } + + // 获取gfFile + gfFile2 := model.GetGridFs(spider.FileId) + + // 生成MD5 + spiderSync.CreateMd5File(gfFile2.Md5) + return nil } @@ -101,6 +115,7 @@ func UploadToGridFs(fileName string, filePath string) (fid bson.ObjectId, err er } // 关闭文件,提交写入 if err = f.Close(); err != nil { + debug.PrintStack() return "", err } // 文件ID @@ -252,5 +267,110 @@ func InitSpiderService() error { // 启动定时任务 c.Start() + if model.IsMaster() { + // 添加Demo爬虫 + templateSpidersDir := "./template/spiders" + for _, info := range utils.ListDir(templateSpidersDir) { + if !info.IsDir() { + continue + } + spiderName := info.Name() + + // 如果爬虫在数据库中不存在,则添加 + spider := model.GetSpiderByName(spiderName) + if spider.Name != "" { + // 存在同名爬虫,跳过 + continue + } + + // 拷贝爬虫 + templateSpiderPath := path.Join(templateSpidersDir, spiderName) + spiderPath := path.Join(viper.GetString("spider.path"), spiderName) + if utils.Exists(spiderPath) { + utils.RemoveFiles(spiderPath) + } + if err := utils.CopyDir(templateSpiderPath, spiderPath); err != nil { + log.Errorf("copy error: " + err.Error()) + debug.PrintStack() + continue + } + + // 构造配置数据 + configData := entity.ConfigSpiderData{} + + // 读取YAML文件 + yamlFile, err := ioutil.ReadFile(path.Join(spiderPath, "Spiderfile")) + if err != nil { + log.Errorf("read yaml error: " + err.Error()) + //debug.PrintStack() + continue + } + + // 反序列化 + if err := yaml.Unmarshal(yamlFile, &configData); err != nil { + log.Errorf("unmarshal error: " + err.Error()) + debug.PrintStack() + continue + } + + if configData.Type == constants.Customized { + // 添加该爬虫到数据库 + spider = model.Spider{ + Id: bson.NewObjectId(), + Name: spiderName, + DisplayName: configData.DisplayName, + Type: constants.Customized, + Col: configData.Col, + Src: spiderPath, + Remark: configData.Remark, + ProjectId: bson.ObjectIdHex(constants.ObjectIdNull), + FileId: bson.ObjectIdHex(constants.ObjectIdNull), + Cmd: configData.Cmd, + } + if err := 
spider.Add(); err != nil { + log.Errorf("add spider error: " + err.Error()) + debug.PrintStack() + continue + } + + // 上传爬虫到GridFS + if err := UploadSpiderToGridFsFromMaster(spider); err != nil { + log.Errorf("upload spider error: " + err.Error()) + debug.PrintStack() + continue + } + } else if configData.Type == constants.Configurable || configData.Type == "config" { + // 添加该爬虫到数据库 + spider = model.Spider{ + Id: bson.NewObjectId(), + Name: configData.Name, + DisplayName: configData.DisplayName, + Type: constants.Configurable, + Col: configData.Col, + Src: spiderPath, + Remark: configData.Remark, + ProjectId: bson.ObjectIdHex(constants.ObjectIdNull), + FileId: bson.ObjectIdHex(constants.ObjectIdNull), + Config: configData, + } + if err := spider.Add(); err != nil { + log.Errorf("add spider error: " + err.Error()) + debug.PrintStack() + continue + } + + // 根据序列化后的数据处理爬虫文件 + if err := ProcessSpiderFilesFromConfigData(spider, configData); err != nil { + log.Errorf("add spider error: " + err.Error()) + debug.PrintStack() + continue + } + } + } + + // 发布所有爬虫 + PublishAllSpiders() + } + return nil } diff --git a/backend/services/spider_handler/spider.go b/backend/services/spider_handler/spider.go index cd8a1dbe..ddc94b57 100644 --- a/backend/services/spider_handler/spider.go +++ b/backend/services/spider_handler/spider.go @@ -4,6 +4,7 @@ import ( "crawlab/database" "crawlab/model" "crawlab/utils" + "fmt" "github.com/apex/log" "github.com/globalsign/mgo/bson" "github.com/satori/go.uuid" @@ -25,7 +26,7 @@ type SpiderSync struct { func (s *SpiderSync) CreateMd5File(md5 string) { path := filepath.Join(viper.GetString("spider.path"), s.Spider.Name) - utils.CreateFilePath(path) + utils.CreateDirPath(path) fileName := filepath.Join(path, Md5File) file := utils.OpenFile(fileName) @@ -66,10 +67,14 @@ func (s *SpiderSync) RemoveSpiderFile() { // 检测是否已经下载中 func (s *SpiderSync) CheckDownLoading(spiderId string, fileId string) (bool, string) { key := s.GetLockDownloadKey(spiderId) - 
if _, err := database.RedisClient.HGet("spider", key); err == nil { - return true, key + key2, err := database.RedisClient.HGet("spider", key) + if err != nil { + return false, key // always hand back the lock key: Download() uses it as the HSet field + } - return false, key + if key2 == "" { + return false, key + } + return true, key } // 下载爬虫 @@ -78,6 +83,7 @@ func (s *SpiderSync) Download() { fileId := s.Spider.FileId.Hex() isDownloading, key := s.CheckDownLoading(spiderId, fileId) if isDownloading { + log.Infof(fmt.Sprintf("spider is already being downloaded, spider id: %s", s.Spider.Id.Hex())) return } else { _ = database.RedisClient.HSet("spider", key, key) diff --git a/backend/template/spiderfile/Spiderfile.163_news b/backend/template/spiderfile/Spiderfile.163_news index bc224395..b87b8888 100644 --- a/backend/template/spiderfile/Spiderfile.163_news +++ b/backend/template/spiderfile/Spiderfile.163_news @@ -1,4 +1,3 @@ -version: "0.4.4" name: "toscrapy_books" start_url: "http://news.163.com/special/0001386F/rank_news.html" start_stage: "list" diff --git a/backend/template/spiderfile/Spiderfile.baidu b/backend/template/spiderfile/Spiderfile.baidu index 4c8db4a3..0259c64f 100644 --- a/backend/template/spiderfile/Spiderfile.baidu +++ b/backend/template/spiderfile/Spiderfile.baidu @@ -1,4 +1,3 @@ -version: 0.4.4 name: toscrapy_books start_url: http://www.baidu.com/s?wd=crawlab start_stage: list diff --git a/backend/template/spiderfile/Spiderfile.toscrapy_books b/backend/template/spiderfile/Spiderfile.toscrapy_books index 6d1542f7..d9100e21 100644 --- a/backend/template/spiderfile/Spiderfile.toscrapy_books +++ b/backend/template/spiderfile/Spiderfile.toscrapy_books @@ -1,4 +1,3 @@ -version: "0.4.4" name: "toscrapy_books" start_url: "http://books.toscrape.com" start_stage: "list" diff --git a/backend/template/spiders/amazon_config/Spiderfile b/backend/template/spiders/amazon_config/Spiderfile new file mode 100644 index 00000000..eea8a538 --- /dev/null +++ b/backend/template/spiders/amazon_config/Spiderfile @@ -0,0 +1,51 @@ +name: 
"amazon_config" +display_name: "亚马逊中国(可配置)" +remark: "亚马逊中国搜索手机,列表+分页" +type: "configurable" +col: "results_amazon_config" +engine: scrapy +start_url: https://www.amazon.cn/s?k=%E6%89%8B%E6%9C%BA&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&ref=nb_sb_noss_2 +start_stage: list +stages: +- name: list + is_list: true + list_css: .s-result-item + list_xpath: "" + page_css: .a-last > a + page_xpath: "" + page_attr: href + fields: + - name: title + css: span.a-text-normal + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: url + css: .a-link-normal + xpath: "" + attr: href + next_stage: "" + remark: "" + - name: price + css: "" + xpath: .//*[@class="a-price-whole"] + attr: "" + next_stage: "" + remark: "" + - name: price_fraction + css: "" + xpath: .//*[@class="a-price-fraction"] + attr: "" + next_stage: "" + remark: "" + - name: img + css: .s-image-square-aspect > img + xpath: "" + attr: src + next_stage: "" + remark: "" +settings: + ROBOTSTXT_OBEY: "false" + USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, + like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/backend/template/spiders/autohome_config/Spiderfile b/backend/template/spiders/autohome_config/Spiderfile new file mode 100644 index 00000000..e69880cb --- /dev/null +++ b/backend/template/spiders/autohome_config/Spiderfile @@ -0,0 +1,57 @@ +name: "autohome_config" +display_name: "汽车之家(可配置)" +remark: "汽车之家文章,列表+详情+分页" +type: "configurable" +col: "results_autohome_config" +engine: scrapy +start_url: https://www.autohome.com.cn/all/ +start_stage: list +stages: +- name: list + is_list: true + list_css: ul.article > li + list_xpath: "" + page_css: a.page-item-next + page_xpath: "" + page_attr: href + fields: + - name: title + css: li > a > h3 + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: url + css: li > a + xpath: "" + attr: href + next_stage: "" + remark: "" + - name: abstract + css: li > a > p + xpath: "" + attr: "" + next_stage: 
"" + remark: "" + - name: time + css: li > a .fn-left + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: views + css: li > a .fn-right > em:first-child + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: comments + css: li > a .fn-right > em:last-child + xpath: "" + attr: "" + next_stage: "" + remark: "" +settings: + ROBOTSTXT_OBEY: "false" + USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, + like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/backend/template/spiders/baidu_config/Spiderfile b/backend/template/spiders/baidu_config/Spiderfile new file mode 100644 index 00000000..a29d4acb --- /dev/null +++ b/backend/template/spiders/baidu_config/Spiderfile @@ -0,0 +1,39 @@ +name: "baidu_config" +display_name: "百度搜索(可配置)" +remark: "百度搜索Crawlab,列表+分页" +type: "configurable" +col: "results_baidu_config" +engine: scrapy +start_url: http://www.baidu.com/s?wd=crawlab +start_stage: list +stages: +- name: list + is_list: true + list_css: ".result.c-container" + list_xpath: "" + page_css: "a.n" + page_xpath: "" + page_attr: href + fields: + - name: title + css: "" + xpath: .//h3/a + attr: "" + next_stage: "" + remark: "" + - name: url + css: "" + xpath: .//h3/a + attr: href + next_stage: "" + remark: "" + - name: abstract + css: "" + xpath: .//*[@class="c-abstract"] + attr: "" + next_stage: "" + remark: "" +settings: + ROBOTSTXT_OBEY: "false" + USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, + like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/backend/template/spiders/bing_general/Spiderfile b/backend/template/spiders/bing_general/Spiderfile new file mode 100644 index 00000000..614c135e --- /dev/null +++ b/backend/template/spiders/bing_general/Spiderfile @@ -0,0 +1,6 @@ +name: "bing_general" +display_name: "必应搜索 (通用)" +remark: "必应搜索 Crawlab,列表+分页" +col: "results_bing_general" +type: "customized" +cmd: "python bing_spider.py" \ No newline at end of file diff 
--git a/backend/template/spiders/bing_general/bing_spider.py b/backend/template/spiders/bing_general/bing_spider.py new file mode 100644 index 00000000..e982e4ee --- /dev/null +++ b/backend/template/spiders/bing_general/bing_spider.py @@ -0,0 +1,41 @@ +import requests +from bs4 import BeautifulSoup as bs +from urllib.parse import urljoin, urlparse +import re +from crawlab import save_item + +s = requests.Session() + +def get_real_url(response, url): + if re.search(r'^https?', url): + return url + elif re.search(r'^\/\/', url): + u = urlparse(response.url) + return u.scheme + ':' + url  # protocol-relative URL: urlparse's scheme has no trailing ':' + return urljoin(response.url, url) + +def start_requests(): + for i in range(0, 9): + fr = 'PERE' if not i else 'MORE' + url = f'https://cn.bing.com/search?q=crawlab&first={10 * i + 1}&FROM={fr}' + request_page(url) + +def request_page(url): + print(f'requesting {url}') + r = s.get(url, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}) + parse_list(r) + +def parse_list(response): + soup = bs(response.content.decode('utf-8')) + for el in list(soup.select('#b_results > li')): + try: + save_item({ + 'title': el.select_one('h2').text, + 'url': el.select_one('h2 a').attrs.get('href'), + 'abstract': el.select_one('.b_caption p').text, + }) + except: + pass + +if __name__ == '__main__': + start_requests() \ No newline at end of file diff --git a/backend/template/spiders/chinaz/Spiderfile b/backend/template/spiders/chinaz/Spiderfile new file mode 100644 index 00000000..2fb940bb --- /dev/null +++ b/backend/template/spiders/chinaz/Spiderfile @@ -0,0 +1,5 @@ +name: "chinaz" +display_name: "站长之家 (Scrapy)" +col: "results_chinaz" +type: "customized" +cmd: "scrapy crawl chinaz_spider" \ No newline at end of file diff --git a/spiders/chinaz/chinaz/__init__.py b/backend/template/spiders/chinaz/chinaz/__init__.py similarity index 100% rename from spiders/chinaz/chinaz/__init__.py rename to 
backend/template/spiders/chinaz/chinaz/__init__.py diff --git a/spiders/chinaz/chinaz/items.py b/backend/template/spiders/chinaz/chinaz/items.py similarity index 100% rename from spiders/chinaz/chinaz/items.py rename to backend/template/spiders/chinaz/chinaz/items.py diff --git a/spiders/chinaz/chinaz/middlewares.py b/backend/template/spiders/chinaz/chinaz/middlewares.py similarity index 100% rename from spiders/chinaz/chinaz/middlewares.py rename to backend/template/spiders/chinaz/chinaz/middlewares.py diff --git a/backend/template/spiders/chinaz/chinaz/pipelines.py b/backend/template/spiders/chinaz/chinaz/pipelines.py new file mode 100644 index 00000000..b29f9eb7 --- /dev/null +++ b/backend/template/spiders/chinaz/chinaz/pipelines.py @@ -0,0 +1,7 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html + diff --git a/spiders/chinaz/chinaz/settings.py b/backend/template/spiders/chinaz/chinaz/settings.py similarity index 98% rename from spiders/chinaz/chinaz/settings.py rename to backend/template/spiders/chinaz/chinaz/settings.py index 41fb31bf..932ec9ed 100644 --- a/spiders/chinaz/chinaz/settings.py +++ b/backend/template/spiders/chinaz/chinaz/settings.py @@ -65,7 +65,7 @@ ROBOTSTXT_OBEY = True # Configure item pipelines # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html ITEM_PIPELINES = { - 'chinaz.pipelines.MongoPipeline': 300, + 'crawlab.pipelines.CrawlabMongoPipeline': 300, } # Enable and configure the AutoThrottle extension (disabled by default) diff --git a/spiders/chinaz/chinaz/spiders/__init__.py b/backend/template/spiders/chinaz/chinaz/spiders/__init__.py similarity index 100% rename from spiders/chinaz/chinaz/spiders/__init__.py rename to backend/template/spiders/chinaz/chinaz/spiders/__init__.py diff --git a/spiders/chinaz/chinaz/spiders/chinaz_spider.py 
b/backend/template/spiders/chinaz/chinaz/spiders/chinaz_spider.py similarity index 100% rename from spiders/chinaz/chinaz/spiders/chinaz_spider.py rename to backend/template/spiders/chinaz/chinaz/spiders/chinaz_spider.py diff --git a/spiders/chinaz/scrapy.cfg b/backend/template/spiders/chinaz/scrapy.cfg similarity index 100% rename from spiders/chinaz/scrapy.cfg rename to backend/template/spiders/chinaz/scrapy.cfg diff --git a/spiders/csdn/csdn_spider.js b/backend/template/spiders/csdn/csdn_spider.js similarity index 100% rename from spiders/csdn/csdn_spider.js rename to backend/template/spiders/csdn/csdn_spider.js diff --git a/backend/template/spiders/csdn_config/Spiderfile b/backend/template/spiders/csdn_config/Spiderfile new file mode 100644 index 00000000..67f4f8c5 --- /dev/null +++ b/backend/template/spiders/csdn_config/Spiderfile @@ -0,0 +1,60 @@ +name: "csdn_config" +display_name: "CSDN(可配置)" +remark: "CSDN Crawlab 文章,列表+详情+分页" +type: "configurable" +col: "results_csdn_config" +engine: scrapy +start_url: https://so.csdn.net/so/search/s.do?q=crawlab +start_stage: list +stages: +- name: list + is_list: true + list_css: .search-list-con > .search-list + list_xpath: "" + page_css: a.btn-next + page_xpath: "" + page_attr: href + fields: + - name: url + css: "" + xpath: .//*[@class="limit_width"]/a + attr: href + next_stage: detail + remark: "" +- name: detail + is_list: false + list_css: "" + list_xpath: "" + page_css: "" + page_xpath: "" + page_attr: "" + fields: + - name: content + css: "" + xpath: .//div[@id="content_views"] + attr: "" + next_stage: "" + remark: "" + - name: views + css: .read-count + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: title + css: .title-article + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: author + css: .follow-nickName + xpath: "" + attr: "" + next_stage: "" + remark: "" +settings: + AUTOTHROTTLE_ENABLED: "false" + ROBOTSTXT_OBEY: "false" + USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 
10_14_6) AppleWebKit/537.36 (KHTML, + like Gecko) Chrome/79.0.3945.117 Safari/537.36 diff --git a/backend/template/spiders/douban_config/Spiderfile b/backend/template/spiders/douban_config/Spiderfile new file mode 100644 index 00000000..84f0647a --- /dev/null +++ b/backend/template/spiders/douban_config/Spiderfile @@ -0,0 +1,57 @@ +name: "douban_config" +display_name: "豆瓣读书(可配置)" +remark: "豆瓣读书新书推荐,列表" +type: "configurable" +col: "results_douban_config" +engine: scrapy +start_url: https://book.douban.com/latest +start_stage: list +stages: +- name: list + is_list: true + list_css: ul.cover-col-4 > li + list_xpath: "" + page_css: "" + page_xpath: "" + page_attr: "" + fields: + - name: title + css: h2 > a + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: url + css: h2 > a + xpath: "" + attr: href + next_stage: "" + remark: "" + - name: img + css: a.cover img + xpath: "" + attr: src + next_stage: "" + remark: "" + - name: rating + css: p.rating > .color-lightgray + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: abstract + css: p:last-child + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: info + css: .color-gray + xpath: "" + attr: "" + next_stage: "" + remark: "" +settings: + ROBOTSTXT_OBEY: "false" + USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, + like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/backend/template/spiders/jd/Spiderfile b/backend/template/spiders/jd/Spiderfile new file mode 100644 index 00000000..d090472b --- /dev/null +++ b/backend/template/spiders/jd/Spiderfile @@ -0,0 +1,5 @@ +name: "jd" +display_name: "京东 (Scrapy)" +col: "results_jd" +type: "customized" +cmd: "scrapy crawl jd_spider" \ No newline at end of file diff --git a/spiders/jd/jd/__init__.py b/backend/template/spiders/jd/jd/__init__.py similarity index 100% rename from spiders/jd/jd/__init__.py rename to backend/template/spiders/jd/jd/__init__.py diff --git a/spiders/jd/jd/items.py 
b/backend/template/spiders/jd/jd/items.py similarity index 92% rename from spiders/jd/jd/items.py rename to backend/template/spiders/jd/jd/items.py index 9a7ba1cb..b2c5e647 100644 --- a/spiders/jd/jd/items.py +++ b/backend/template/spiders/jd/jd/items.py @@ -12,3 +12,4 @@ class JdItem(scrapy.Item): # define the fields for your item here like: name = scrapy.Field() price = scrapy.Field() + url = scrapy.Field() diff --git a/spiders/jd/jd/middlewares.py b/backend/template/spiders/jd/jd/middlewares.py similarity index 100% rename from spiders/jd/jd/middlewares.py rename to backend/template/spiders/jd/jd/middlewares.py diff --git a/backend/template/spiders/jd/jd/pipelines.py b/backend/template/spiders/jd/jd/pipelines.py new file mode 100644 index 00000000..5a7d7cbf --- /dev/null +++ b/backend/template/spiders/jd/jd/pipelines.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html diff --git a/spiders/jd/jd/settings.py b/backend/template/spiders/jd/jd/settings.py similarity index 97% rename from spiders/jd/jd/settings.py rename to backend/template/spiders/jd/jd/settings.py index d83206b2..ef89ed0c 100644 --- a/spiders/jd/jd/settings.py +++ b/backend/template/spiders/jd/jd/settings.py @@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'jd.spiders' #USER_AGENT = 'jd (+http://www.yourdomain.com)' # Obey robots.txt rules -ROBOTSTXT_OBEY = True +ROBOTSTXT_OBEY = False # Configure maximum concurrent requests performed by Scrapy (default: 16) #CONCURRENT_REQUESTS = 32 @@ -65,7 +65,7 @@ ROBOTSTXT_OBEY = True # Configure item pipelines # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html ITEM_PIPELINES = { - 'jd.pipelines.JdPipeline': 300, + 'crawlab.pipelines.CrawlabMongoPipeline': 300, } # Enable and configure the AutoThrottle extension (disabled by default) diff --git a/spiders/jd/jd/spiders/__init__.py 
b/backend/template/spiders/jd/jd/spiders/__init__.py similarity index 100% rename from spiders/jd/jd/spiders/__init__.py rename to backend/template/spiders/jd/jd/spiders/__init__.py diff --git a/backend/template/spiders/jd/jd/spiders/jd_spider.py b/backend/template/spiders/jd/jd/spiders/jd_spider.py new file mode 100644 index 00000000..4ec94fa9 --- /dev/null +++ b/backend/template/spiders/jd/jd/spiders/jd_spider.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- +import scrapy + +from jd.items import JdItem + + +class JdSpiderSpider(scrapy.Spider): + name = 'jd_spider' + allowed_domains = ['jd.com'] + + def start_requests(self): + for i in range(1, 50): + yield scrapy.Request(url=f'https://search.jd.com/Search?keyword=手机&enc=utf-8&page={i}') + + def parse(self, response): + for el in response.css('.gl-item'): + yield JdItem( + url=el.css('.p-name > a::attr("href")').extract_first(), + name=el.css('.p-name > a::attr("title")').extract_first(), + price=float(el.css('.p-price i::text').extract_first()), + ) diff --git a/spiders/jd/scrapy.cfg b/backend/template/spiders/jd/scrapy.cfg similarity index 100% rename from spiders/jd/scrapy.cfg rename to backend/template/spiders/jd/scrapy.cfg diff --git a/spiders/juejin_node/juejin_spider.js b/backend/template/spiders/juejin_node/juejin_spider.js similarity index 100% rename from spiders/juejin_node/juejin_spider.js rename to backend/template/spiders/juejin_node/juejin_spider.js diff --git a/backend/template/spiders/realestate/Spiderfile b/backend/template/spiders/realestate/Spiderfile new file mode 100644 index 00000000..772e8312 --- /dev/null +++ b/backend/template/spiders/realestate/Spiderfile @@ -0,0 +1,4 @@ +name: "realestate" +display_name: "链家网 (Scrapy)" +col: "results_realestate" +cmd: "scrapy crawl lianjia" \ No newline at end of file diff --git a/spiders/realestate/realestate/__init__.py b/backend/template/spiders/realestate/realestate/__init__.py similarity index 100% rename from spiders/realestate/realestate/__init__.py 
rename to backend/template/spiders/realestate/realestate/__init__.py diff --git a/spiders/realestate/realestate/items.py b/backend/template/spiders/realestate/realestate/items.py similarity index 100% rename from spiders/realestate/realestate/items.py rename to backend/template/spiders/realestate/realestate/items.py diff --git a/spiders/realestate/realestate/middlewares.py b/backend/template/spiders/realestate/realestate/middlewares.py similarity index 100% rename from spiders/realestate/realestate/middlewares.py rename to backend/template/spiders/realestate/realestate/middlewares.py diff --git a/backend/template/spiders/realestate/realestate/pipelines.py b/backend/template/spiders/realestate/realestate/pipelines.py new file mode 100644 index 00000000..3371792b --- /dev/null +++ b/backend/template/spiders/realestate/realestate/pipelines.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html diff --git a/spiders/realestate/realestate/settings.py b/backend/template/spiders/realestate/realestate/settings.py similarity index 98% rename from spiders/realestate/realestate/settings.py rename to backend/template/spiders/realestate/realestate/settings.py index da1ada29..758f8ed0 100644 --- a/spiders/realestate/realestate/settings.py +++ b/backend/template/spiders/realestate/realestate/settings.py @@ -64,7 +64,7 @@ ROBOTSTXT_OBEY = True # Configure item pipelines # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html ITEM_PIPELINES = { - 'realestate.pipelines.MongoPipeline': 300, + 'crawlab.pipelines.CrawlabMongoPipeline': 300, } # Enable and configure the AutoThrottle extension (disabled by default) diff --git a/spiders/realestate/realestate/spiders/__init__.py b/backend/template/spiders/realestate/realestate/spiders/__init__.py similarity index 100% rename from 
spiders/realestate/realestate/spiders/__init__.py rename to backend/template/spiders/realestate/realestate/spiders/__init__.py diff --git a/spiders/realestate/realestate/spiders/lianjia.py b/backend/template/spiders/realestate/realestate/spiders/lianjia.py similarity index 100% rename from spiders/realestate/realestate/spiders/lianjia.py rename to backend/template/spiders/realestate/realestate/spiders/lianjia.py diff --git a/spiders/realestate/scrapy.cfg b/backend/template/spiders/realestate/scrapy.cfg similarity index 100% rename from spiders/realestate/scrapy.cfg rename to backend/template/spiders/realestate/scrapy.cfg diff --git a/spiders/segmentfault/segmentfault_spider.js b/backend/template/spiders/segmentfault/segmentfault_spider.js similarity index 100% rename from spiders/segmentfault/segmentfault_spider.js rename to backend/template/spiders/segmentfault/segmentfault_spider.js diff --git a/backend/template/spiders/sinastock/Spiderfile b/backend/template/spiders/sinastock/Spiderfile new file mode 100644 index 00000000..b110cb48 --- /dev/null +++ b/backend/template/spiders/sinastock/Spiderfile @@ -0,0 +1,5 @@ +name: "sinastock" +display_name: "新浪股票 (Scrapy)" +type: "customized" +col: "results_sinastock" +cmd: "scrapy crawl sinastock_spider" \ No newline at end of file diff --git a/spiders/sinastock/scrapy.cfg b/backend/template/spiders/sinastock/scrapy.cfg similarity index 100% rename from spiders/sinastock/scrapy.cfg rename to backend/template/spiders/sinastock/scrapy.cfg diff --git a/spiders/sinastock/sinastock/__init__.py b/backend/template/spiders/sinastock/sinastock/__init__.py similarity index 100% rename from spiders/sinastock/sinastock/__init__.py rename to backend/template/spiders/sinastock/sinastock/__init__.py diff --git a/spiders/sinastock/sinastock/items.py b/backend/template/spiders/sinastock/sinastock/items.py similarity index 100% rename from spiders/sinastock/sinastock/items.py rename to backend/template/spiders/sinastock/sinastock/items.py 
diff --git a/spiders/sinastock/sinastock/middlewares.py b/backend/template/spiders/sinastock/sinastock/middlewares.py similarity index 100% rename from spiders/sinastock/sinastock/middlewares.py rename to backend/template/spiders/sinastock/sinastock/middlewares.py diff --git a/backend/template/spiders/sinastock/sinastock/pipelines.py b/backend/template/spiders/sinastock/sinastock/pipelines.py new file mode 100644 index 00000000..5a7d7cbf --- /dev/null +++ b/backend/template/spiders/sinastock/sinastock/pipelines.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html diff --git a/spiders/sinastock/sinastock/settings.py b/backend/template/spiders/sinastock/sinastock/settings.py similarity index 98% rename from spiders/sinastock/sinastock/settings.py rename to backend/template/spiders/sinastock/sinastock/settings.py index c63c2eb5..3e01d3ca 100644 --- a/spiders/sinastock/sinastock/settings.py +++ b/backend/template/spiders/sinastock/sinastock/settings.py @@ -64,7 +64,7 @@ ROBOTSTXT_OBEY = True # Configure item pipelines # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html ITEM_PIPELINES = { - 'sinastock.pipelines.SinastockPipeline': 300, + 'crawlab.pipelines.CrawlabMongoPipeline': 300, } # Enable and configure the AutoThrottle extension (disabled by default) diff --git a/spiders/sinastock/sinastock/spiders/__init__.py b/backend/template/spiders/sinastock/sinastock/spiders/__init__.py similarity index 100% rename from spiders/sinastock/sinastock/spiders/__init__.py rename to backend/template/spiders/sinastock/sinastock/spiders/__init__.py diff --git a/spiders/sinastock/sinastock/spiders/sinastock_spider.py b/backend/template/spiders/sinastock/sinastock/spiders/sinastock_spider.py similarity index 100% rename from spiders/sinastock/sinastock/spiders/sinastock_spider.py rename to 
backend/template/spiders/sinastock/sinastock/spiders/sinastock_spider.py diff --git a/spiders/sites_inspector/sites_inspector.py b/backend/template/spiders/sites_inspector/sites_inspector.py similarity index 100% rename from spiders/sites_inspector/sites_inspector.py rename to backend/template/spiders/sites_inspector/sites_inspector.py diff --git a/backend/template/spiders/v2ex_config/Spiderfile b/backend/template/spiders/v2ex_config/Spiderfile new file mode 100644 index 00000000..bb18d40a --- /dev/null +++ b/backend/template/spiders/v2ex_config/Spiderfile @@ -0,0 +1,54 @@ +name: "v2ex_config" +display_name: "V2ex(可配置)" +remark: "V2ex,列表+详情" +type: "configurable" +col: "results_v2ex_config" +engine: scrapy +start_url: https://v2ex.com/ +start_stage: list +stages: +- name: list + is_list: true + list_css: .cell.item + list_xpath: "" + page_css: "" + page_xpath: "" + page_attr: href + fields: + - name: title + css: a.topic-link + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: url + css: a.topic-link + xpath: "" + attr: href + next_stage: detail + remark: "" + - name: replies + css: .count_livid + xpath: "" + attr: "" + next_stage: "" + remark: "" +- name: detail + is_list: false + list_css: "" + list_xpath: "" + page_css: "" + page_xpath: "" + page_attr: "" + fields: + - name: content + css: "" + xpath: .//*[@class="markdown_body"] + attr: "" + next_stage: "" + remark: "" +settings: + AUTOTHROTTLE_ENABLED: "true" + ROBOTSTXT_OBEY: "false" + USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, + like Gecko) Chrome/79.0.3945.117 Safari/537.36 diff --git a/backend/template/spiders/xueqiu/Spiderfile b/backend/template/spiders/xueqiu/Spiderfile new file mode 100644 index 00000000..38aa5dbe --- /dev/null +++ b/backend/template/spiders/xueqiu/Spiderfile @@ -0,0 +1,5 @@ +name: "xueqiu" +display_name: "雪球网 (Scrapy)" +type: "customized" +col: "results_xueqiu" +cmd: "scrapy crawl xueqiu_spider" \ No newline at end of file diff 
--git a/spiders/xueqiu/scrapy.cfg b/backend/template/spiders/xueqiu/scrapy.cfg similarity index 100% rename from spiders/xueqiu/scrapy.cfg rename to backend/template/spiders/xueqiu/scrapy.cfg diff --git a/spiders/xueqiu/xueqiu/__init__.py b/backend/template/spiders/xueqiu/xueqiu/__init__.py similarity index 100% rename from spiders/xueqiu/xueqiu/__init__.py rename to backend/template/spiders/xueqiu/xueqiu/__init__.py diff --git a/spiders/xueqiu/xueqiu/items.py b/backend/template/spiders/xueqiu/xueqiu/items.py similarity index 100% rename from spiders/xueqiu/xueqiu/items.py rename to backend/template/spiders/xueqiu/xueqiu/items.py diff --git a/spiders/xueqiu/xueqiu/middlewares.py b/backend/template/spiders/xueqiu/xueqiu/middlewares.py similarity index 100% rename from spiders/xueqiu/xueqiu/middlewares.py rename to backend/template/spiders/xueqiu/xueqiu/middlewares.py diff --git a/backend/template/spiders/xueqiu/xueqiu/pipelines.py b/backend/template/spiders/xueqiu/xueqiu/pipelines.py new file mode 100644 index 00000000..5a7d7cbf --- /dev/null +++ b/backend/template/spiders/xueqiu/xueqiu/pipelines.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html diff --git a/spiders/xueqiu/xueqiu/settings.py b/backend/template/spiders/xueqiu/xueqiu/settings.py similarity index 97% rename from spiders/xueqiu/xueqiu/settings.py rename to backend/template/spiders/xueqiu/xueqiu/settings.py index b44a74e1..1d898e2f 100644 --- a/spiders/xueqiu/xueqiu/settings.py +++ b/backend/template/spiders/xueqiu/xueqiu/settings.py @@ -18,7 +18,7 @@ NEWSPIDER_MODULE = 'xueqiu.spiders' USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36' # Obey robots.txt rules -ROBOTSTXT_OBEY = True +ROBOTSTXT_OBEY = False # Configure maximum concurrent requests performed 
by Scrapy (default: 16) # CONCURRENT_REQUESTS = 32 @@ -64,7 +64,7 @@ ROBOTSTXT_OBEY = True # Configure item pipelines # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html ITEM_PIPELINES = { - 'xueqiu.pipelines.XueqiuPipeline': 300, + 'crawlab.pipelines.CrawlabMongoPipeline': 300, } # Enable and configure the AutoThrottle extension (disabled by default) diff --git a/spiders/xueqiu/xueqiu/spiders/__init__.py b/backend/template/spiders/xueqiu/xueqiu/spiders/__init__.py similarity index 100% rename from spiders/xueqiu/xueqiu/spiders/__init__.py rename to backend/template/spiders/xueqiu/xueqiu/spiders/__init__.py diff --git a/spiders/xueqiu/xueqiu/spiders/xueqiu_spider.py b/backend/template/spiders/xueqiu/xueqiu/spiders/xueqiu_spider.py similarity index 100% rename from spiders/xueqiu/xueqiu/spiders/xueqiu_spider.py rename to backend/template/spiders/xueqiu/xueqiu/spiders/xueqiu_spider.py diff --git a/backend/template/spiders/xueqiu_config/Spiderfile b/backend/template/spiders/xueqiu_config/Spiderfile new file mode 100644 index 00000000..0de50e9e --- /dev/null +++ b/backend/template/spiders/xueqiu_config/Spiderfile @@ -0,0 +1,39 @@ +name: "xueqiu_config" +display_name: "雪球网(可配置)" +remark: "雪球网新闻,列表" +type: "configurable" +col: "results_xueqiu_config" +engine: scrapy +start_url: https://xueqiu.com/ +start_stage: list +stages: +- name: list + is_list: true + list_css: "" + list_xpath: .//*[contains(@class, "AnonymousHome_home__timeline__item")] + page_css: "" + page_xpath: "" + page_attr: "" + fields: + - name: title + css: h3 > a + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: url + css: h3 > a + xpath: "" + attr: href + next_stage: "" + remark: "" + - name: abstract + css: p + xpath: "" + attr: "" + next_stage: "" + remark: "" +settings: + ROBOTSTXT_OBEY: "false" + USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, + like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git 
a/backend/template/spiders/zongheng_config/Spiderfile b/backend/template/spiders/zongheng_config/Spiderfile new file mode 100644 index 00000000..0163fac7 --- /dev/null +++ b/backend/template/spiders/zongheng_config/Spiderfile @@ -0,0 +1,45 @@ +name: "zongheng_config" +display_name: "纵横(可配置)" +remark: "纵横小说网,列表" +type: "configurable" +col: "results_zongheng_config" +engine: scrapy +start_url: http://www.zongheng.com/rank/details.html?rt=1&d=1 +start_stage: list +stages: +- name: list + is_list: true + list_css: .rank_d_list + list_xpath: "" + page_css: "" + page_xpath: "" + page_attr: href + fields: + - name: title + css: .rank_d_b_name > a + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: url + css: .rank_d_b_name > a + xpath: "" + attr: href + next_stage: "" + remark: "" + - name: abstract + css: body + xpath: "" + attr: "" + next_stage: "" + remark: "" + - name: votes + css: .rank_d_b_ticket + xpath: "" + attr: "" + next_stage: "" + remark: "" +settings: + ROBOTSTXT_OBEY: "false" + USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, + like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/backend/utils/file.go b/backend/utils/file.go index bfe92bd3..072930cf 100644 --- a/backend/utils/file.go +++ b/backend/utils/file.go @@ -55,7 +55,7 @@ func OpenFile(fileName string) *os.File { } // 创建文件夹 -func CreateFilePath(filePath string) { +func CreateDirPath(filePath string) { if !Exists(filePath) { if err := os.MkdirAll(filePath, os.ModePerm); err != nil { log.Errorf("create file error: %s, file_path: %s", err.Error(), filePath) diff --git a/backend/utils/model.go b/backend/utils/model.go index 21a295d6..048b0001 100644 --- a/backend/utils/model.go +++ b/backend/utils/model.go @@ -2,9 +2,9 @@ package utils import ( "crawlab/constants" + "encoding/json" "github.com/globalsign/mgo/bson" - "strconv" - "time" + "strings" ) func IsObjectIdNull(id bson.ObjectId) bool { @@ -12,16 +12,13 @@ func IsObjectIdNull(id bson.ObjectId) 
bool { } func InterfaceToString(value interface{}) string { - switch realValue := value.(type) { - case bson.ObjectId: - return realValue.Hex() - case string: - return realValue - case int: - return strconv.Itoa(realValue) - case time.Time: - return realValue.String() - default: + bytes, err := json.Marshal(value) + if err != nil { return "" } + str := string(bytes) + if strings.HasPrefix(str, "\"") && strings.HasSuffix(str, "\"") { + str = str[1 : len(str)-1] + } + return str } diff --git a/frontend/package.json b/frontend/package.json index 5f80d096..638189e8 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -1,6 +1,6 @@ { "name": "crawlab", - "version": "0.4.4", + "version": "0.4.5", "private": true, "scripts": { "serve": "vue-cli-service serve --ip=0.0.0.0 --mode=development", diff --git a/frontend/src/components/InfoView/SpiderInfoView.vue b/frontend/src/components/InfoView/SpiderInfoView.vue index 93fa9c12..48b70a12 100644 --- a/frontend/src/components/InfoView/SpiderInfoView.vue +++ b/frontend/src/components/InfoView/SpiderInfoView.vue @@ -18,6 +18,20 @@ + + + + + @@ -127,6 +141,9 @@ export default { ...mapGetters('user', [ 'token' ]), + ...mapState('project', [ + 'projectList' + ]), isShowRun () { if (this.spiderForm.type === 'customized') { return !!this.spiderForm.cmd @@ -180,6 +197,15 @@ export default { onUploadError () { this.uploadLoading = false } + }, + async created () { + // fetch project list + await this.$store.dispatch('project/getProjectList') + + // 兼容项目ID + if (!this.spiderForm.project_id) { + this.$set(this.spiderForm, 'project_id', '000000000000000000000000') + } } } diff --git a/frontend/src/components/TableView/GeneralTableView.vue b/frontend/src/components/TableView/GeneralTableView.vue index 819a0572..3f7693f3 100644 --- a/frontend/src/components/TableView/GeneralTableView.vue +++ b/frontend/src/components/TableView/GeneralTableView.vue @@ -6,6 +6,13 @@ border> @@ -58,23 +65,39 @@ export default { computed: { 
filteredData () { return this.data - // .map(d => d) - // .filter((d, index) => { - // // pagination - // const pageNum = this.pageNum - // const pageSize = this.pageSize - // return (pageSize * (pageNum - 1) <= index) && (index < pageSize * pageNum) - // }) } }, methods: { onPageChange () { this.$emit('page-change', { pageNum: this.pageNum, pageSize: this.pageSize }) + }, + getString (value) { + if (value === undefined) return '' + const str = JSON.stringify(value) + if (str.match(/^"(.*)"$/)) return str.match(/^"(.*)"$/)[1] + return str } } } + + diff --git a/frontend/src/i18n/zh.js b/frontend/src/i18n/zh.js index c997248e..8a3fb800 100644 --- a/frontend/src/i18n/zh.js +++ b/frontend/src/i18n/zh.js @@ -12,6 +12,7 @@ export default { 'Deploys': '部署', 'Sites': '网站', 'Setting': '设置', + 'Project': '项目', // 标签 'Overview': '概览', @@ -71,6 +72,7 @@ export default { 'Create Directory': '新建目录', 'Create File': '新建文件', 'Add Node': '添加节点', + 'Add Project': '添加项目', // 主页 'Total Tasks': '总任务数', @@ -217,6 +219,14 @@ export default { // 部署 'Time': '时间', + // 项目 + 'All Tags': '全部标签', + 'Project Name': '项目名称', + 'Project Description': '项目描述', + 'Tags': '标签', + 'Enter Tags': '输入标签', + 'No Project': '无项目', + // 定时任务 'Schedule Name': '定时任务名称', 'Schedule Description': '定时任务描述', @@ -245,6 +255,9 @@ export default { 'Home Page Response Time (sec)': '首页响应时间(秒)', 'Home Page Response Status Code': '首页响应状态码', + // 用户 + 'Super Admin': '超级管理员', + // 文件 'Choose Folder': '选择文件', 'File': '文件', @@ -350,7 +363,7 @@ export default { 'Username': '用户名', 'Password': '密码', 'Confirm Password': '确认密码', - 'normal': '正常用户', + 'normal': '普通用户', 'admin': '管理用户', 'Role': '角色', 'Edit User': '更改用户', diff --git a/frontend/src/router/index.js b/frontend/src/router/index.js index a4ba50e1..69b7f35b 100644 --- a/frontend/src/router/index.js +++ b/frontend/src/router/index.js @@ -47,6 +47,25 @@ export const constantRouterMap = [ } ] }, + { + path: '/projects', + component: Layout, + meta: { + title: 'Project', + 
icon: 'fa fa-gear' + }, + children: [ + { + path: '', + name: 'Project', + component: () => import('../views/project/ProjectList'), + meta: { + title: 'Project', + icon: 'fa fa-code-fork' + } + } + ] + }, { path: '/spiders', component: Layout, diff --git a/frontend/src/store/index.js b/frontend/src/store/index.js index 34c98b4a..4fcb86db 100644 --- a/frontend/src/store/index.js +++ b/frontend/src/store/index.js @@ -16,6 +16,7 @@ import stats from './modules/stats' import setting from './modules/setting' import version from './modules/version' import tour from './modules/tour' +import project from './modules/project' import getters from './getters' Vue.use(Vuex) @@ -37,6 +38,7 @@ const store = new Vuex.Store({ setting, version, tour, + project, // 统计 stats }, diff --git a/frontend/src/store/modules/project.js b/frontend/src/store/modules/project.js new file mode 100644 index 00000000..0c6b504f --- /dev/null +++ b/frontend/src/store/modules/project.js @@ -0,0 +1,60 @@ +import request from '../../api/request' + +const state = { + projectForm: {}, + projectList: [], + projectTags: [] +} + +const getters = {} + +const mutations = { + SET_PROJECT_FORM: (state, value) => { + state.projectForm = value + }, + SET_PROJECT_LIST: (state, value) => { + state.projectList = value + }, + SET_PROJECT_TAGS: (state, value) => { + state.projectTags = value + } +} + +const actions = { + getProjectList ({ state, commit }, payload) { + return request.get('/projects', payload) + .then(response => { + if (response.data.data) { + commit('SET_PROJECT_LIST', response.data.data.map(d => { + if (!d.spiders) d.spiders = [] + return d + })) + } + }) + }, + getProjectTags ({ state, commit }) { + return request.get('/projects/tags') + .then(response => { + if (response.data.data) { + commit('SET_PROJECT_TAGS', response.data.data.map(d => d.tag)) + } + }) + }, + addProject ({ state }) { + return request.put('/projects', state.projectForm) + }, + editProject ({ state }, id) { + return 
request.post(`/projects/${id}`, state.projectForm) + }, + removeProject ({ state }, id) { + return request.delete(`/projects/${id}`) + } +} + +export default { + namespaced: true, + state, + getters, + mutations, + actions +} diff --git a/frontend/src/store/modules/user.js b/frontend/src/store/modules/user.js index 3324ba15..4bb6e918 100644 --- a/frontend/src/store/modules/user.js +++ b/frontend/src/store/modules/user.js @@ -156,7 +156,7 @@ const user = { }, // 新增全局变量 addGlobalVariable ({ commit, state }) { - return request.post(`/variable`, state.globalVariableForm) + return request.put(`/variable`, state.globalVariableForm) .then(() => { state.globalVariableForm = {} }) diff --git a/frontend/src/views/layout/components/Sidebar/SidebarItem.vue b/frontend/src/views/layout/components/Sidebar/SidebarItem.vue index 9c525c24..983134ad 100644 --- a/frontend/src/views/layout/components/Sidebar/SidebarItem.vue +++ b/frontend/src/views/layout/components/Sidebar/SidebarItem.vue @@ -101,3 +101,10 @@ export default { } } + + diff --git a/frontend/src/views/project/ProjectList.vue b/frontend/src/views/project/ProjectList.vue new file mode 100644 index 00000000..b282c82d --- /dev/null +++ b/frontend/src/views/project/ProjectList.vue @@ -0,0 +1,330 @@ + + + + + diff --git a/frontend/src/views/setting/Setting.vue b/frontend/src/views/setting/Setting.vue index 27e79b9c..fb74d41c 100644 --- a/frontend/src/views/setting/Setting.vue +++ b/frontend/src/views/setting/Setting.vue @@ -41,6 +41,13 @@ + +
+ + {{$t('Save')}} + +
+
@@ -77,6 +84,13 @@ + +
+ + {{$t('Save')}} + +
+
@@ -199,6 +213,11 @@ export default { 'globalVariableForm' ]) }, + watch: { + userInfoStr () { + this.saveUserInfo() + } + }, methods: { deleteGlobalVariableHandle (id) { this.$confirm(this.$t('Are you sure to delete this global variable'), this.$t('Notification'), { diff --git a/frontend/src/views/spider/SpiderList.vue b/frontend/src/views/spider/SpiderList.vue index c7107f11..bbfbee75 100644 --- a/frontend/src/views/spider/SpiderList.vue +++ b/frontend/src/views/spider/SpiderList.vue @@ -58,6 +58,20 @@ + + + + + @@ -104,6 +118,20 @@ + + + + + @@ -147,7 +175,29 @@ - + + + + + + + @@ -335,6 +385,7 @@ export default { crawlConfirmDialogVisible: false, activeSpiderId: undefined, filter: { + project_id: '', keyword: '', type: 'all' }, @@ -491,6 +542,9 @@ export default { ...mapGetters('user', [ 'token' ]), + ...mapState('project', [ + 'projectList' + ]), uploadForm () { return { name: this.spiderForm.name, @@ -517,7 +571,12 @@ export default { this.getList() }, onAdd () { + let projectId = '000000000000000000000000' + if (this.filter.project_id) { + projectId = this.filter.project_id + } this.$store.commit('spider/SET_SPIDER_FORM', { + project_id: projectId, template: this.templateList[0] }) this.addDialogVisible = true @@ -737,14 +796,20 @@ export default { sort_key: this.sort.sortKey, sort_direction: this.sort.sortDirection, keyword: this.filter.keyword, - type: this.filter.type + type: this.filter.type, + project_id: this.filter.project_id } await this.$store.dispatch('spider/getSpiderList', params) } }, async created () { - // fetch spider types - // await this.getTypes() + // fetch project list + await this.$store.dispatch('project/getProjectList') + + // project id + if (this.$route.params.project_id) { + this.filter.project_id = this.$route.params.project_id + } // fetch spider list await this.getList() diff --git a/frontend/src/views/task/TaskDetail.vue b/frontend/src/views/task/TaskDetail.vue index f52597ca..39aca864 100644 --- 
a/frontend/src/views/task/TaskDetail.vue +++ b/frontend/src/views/task/TaskDetail.vue @@ -137,6 +137,7 @@ export default { }, computed: { ...mapState('task', [ + 'taskForm', 'taskResultsData', 'taskResultsTotalCount' ]), @@ -164,6 +165,9 @@ export default { set (value) { this.$store.commit('task/SET_RESULTS_PAGE_SIZE', value) } + }, + isRunning () { + return ['pending', 'running'].includes(this.taskForm.status) } }, methods: { @@ -197,6 +201,9 @@ export default { this.getTaskLog() this.handle = setInterval(() => { + if (!this.isRunning) return + this.$store.dispatch('task/getTaskData', this.$route.params.id) + this.$store.dispatch('task/getTaskResults', this.$route.params.id) this.getTaskLog() }, 5000) }, diff --git a/frontend/src/views/user/UserList.vue b/frontend/src/views/user/UserList.vue index 9389f08c..6e03e0b1 100644 --- a/frontend/src/views/user/UserList.vue +++ b/frontend/src/views/user/UserList.vue @@ -1,7 +1,7 @@