Mirror of https://github.com/crawlab-team/crawlab.git
@@ -3,4 +3,5 @@ package constants
const (
    Customized   = "customized"
    Configurable = "configurable"
    Plugin       = "plugin"
)
backend/entity/config_spider.go (new file, 22 lines)
@@ -0,0 +1,22 @@
package entity

type Field struct {
    Name  string `yaml:"name" json:"name"`
    Css   string `yaml:"css" json:"css"`
    Xpath string `yaml:"xpath" json:"xpath"`
    Attr  string `yaml:"attr" json:"attr"`
    Stage string `yaml:"stage" json:"stage"`
}

type Stage struct {
    List   bool    `yaml:"list" json:"list"`
    Css    string  `yaml:"css" json:"css"`
    Xpath  string  `yaml:"xpath" json:"xpath"`
    Fields []Field `yaml:"fields" json:"fields"`
}

type ConfigSpiderData struct {
    Version  string           `yaml:"version" json:"version"`
    StartUrl string           `yaml:"startUrl" json:"start_url"`
    Stages   map[string]Stage `yaml:"stages" json:"stages"`
}
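The new entity types map directly onto the Spiderfile YAML format. Below is a minimal sketch of parsing a Spiderfile into entity.ConfigSpiderData with gopkg.in/yaml.v2 (the package this commit imports in the routes file); the embedded Spiderfile content and the package main wrapper are illustrative assumptions, and the snippet assumes it is built inside the crawlab backend module so that the crawlab/entity import resolves.

package main

import (
    "fmt"

    "crawlab/entity"
    "gopkg.in/yaml.v2"
)

// A hypothetical Spiderfile; keys follow the yaml tags of entity.ConfigSpiderData.
const spiderfile = `
version: 0.0.1
startUrl: http://example.com/list
stages:
  list:
    list: true
    css: .item
    fields:
      - name: title
        css: .title
      - name: url
        css: a
        attr: href
`

func main() {
    var data entity.ConfigSpiderData
    // yaml.Unmarshal fills Version, StartUrl and the Stages map from the document.
    if err := yaml.Unmarshal([]byte(spiderfile), &data); err != nil {
        panic(err)
    }
    fmt.Printf("%+v\n", data)
}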
@@ -129,7 +129,7 @@ func main() {
    // Spiders
    authGroup.GET("/spiders", routes.GetSpiderList)              // spider list
    authGroup.GET("/spiders/:id", routes.GetSpider)              // spider detail
-   authGroup.POST("/spiders", routes.PutSpider)                 // upload spider
+   authGroup.POST("/spiders", routes.PutSpider)                 // upload spider TODO: name is incorrect
    authGroup.POST("/spiders/:id", routes.PostSpider)            // update spider
    authGroup.POST("/spiders/:id/publish", routes.PublishSpider) // publish spider
    authGroup.DELETE("/spiders/:id", routes.DeleteSpider)        // delete spider
@@ -140,7 +140,9 @@ func main() {
    authGroup.GET("/spiders/:id/stats", routes.GetSpiderStats)              // spider stats
    authGroup.GET("/spider/types", routes.GetSpiderTypes)                   // spider types
    // Configurable spiders
    authGroup.PUT("/config_spiders", routes.PutConfigSpider)                // add configurable spider
    authGroup.POST("/config_spiders/:id", routes.PostConfigSpider)          // update configurable spider
    authGroup.POST("/config_spiders/:id/upload", routes.UploadConfigSpider) // upload configurable spider
    // Tasks
    authGroup.GET("/tasks", routes.GetTaskList)                             // task list
    authGroup.GET("/tasks/:id", routes.GetTask)                             // task detail
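For reference, a hedged sketch of calling the new PUT /config_spiders route from a client. The base URL http://localhost:8000, the absence of a route prefix, the Authorization header format, and the "name" JSON field of model.Spider are assumptions not shown in this diff.

package main

import (
    "bytes"
    "fmt"
    "net/http"
)

func main() {
    // Assumed JSON body; PutConfigSpider only requires the name to be non-empty.
    body := bytes.NewBufferString(`{"name": "example_config_spider"}`)

    // Assumed base URL and route prefix; adjust to the actual deployment.
    req, err := http.NewRequest(http.MethodPut, "http://localhost:8000/config_spiders", body)
    if err != nil {
        panic(err)
    }
    req.Header.Set("Content-Type", "application/json")
    req.Header.Set("Authorization", "PLACEHOLDER_JWT") // assumption: authGroup expects a token here

    resp, err := http.DefaultClient.Do(req)
    if err != nil {
        panic(err)
    }
    defer resp.Body.Close()
    fmt.Println(resp.Status)
}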
@@ -25,6 +25,7 @@ type Spider struct {
    Site   string `json:"site" bson:"site"`     // spider website
    Envs   []Env  `json:"envs" bson:"envs"`     // environment variables
    Remark string `json:"remark" bson:"remark"` // remark

    // Customized spiders
    Src string `json:"src" bson:"src"` // source code location
    Cmd string `json:"cmd" bson:"cmd"` // execution command
@@ -33,17 +34,7 @@ type Spider struct {
    LastRunTs  time.Time `json:"last_run_ts"` // time of last run
    LastStatus string    `json:"last_status"` // status of last run

-   // TODO: configurable spiders
-   //Fields []interface{} `json:"fields"`
-   //DetailFields []interface{} `json:"detail_fields"`
-   //CrawlType string `json:"crawl_type"`
-   //StartUrl string `json:"start_url"`
-   //UrlPattern string `json:"url_pattern"`
-   //ItemSelector string `json:"item_selector"`
-   //ItemSelectorType string `json:"item_selector_type"`
-   //PaginationSelector string `json:"pagination_selector"`
-   //PaginationSelectorType string `json:"pagination_selector_type"`

    // Timestamps
    CreateTs time.Time `json:"create_ts" bson:"create_ts"`
    UpdateTs time.Time `json:"update_ts" bson:"update_ts"`
}
@@ -98,13 +89,14 @@ func (spider *Spider) GetLastTask() (Task, error) {
    return tasks[0], nil
}

// Delete spider
func (spider *Spider) Delete() error {
    s, c := database.GetCol("spiders")
    defer s.Close()
    return c.RemoveId(spider.Id)
}

-// Spider list
+// Get spider list
func GetSpiderList(filter interface{}, skip int, limit int) ([]Spider, int, error) {
    s, c := database.GetCol("spiders")
    defer s.Close()
@@ -136,7 +128,7 @@ func GetSpiderList(filter interface{}, skip int, limit int) ([]Spider, int, error) {
    return spiders, count, nil
}

-// Get spider
+// Get spider (by FileId)
func GetSpiderByFileId(fileId bson.ObjectId) *Spider {
    s, c := database.GetCol("spiders")
    defer s.Close()
@@ -150,7 +142,7 @@ func GetSpiderByFileId(fileId bson.ObjectId) *Spider {
    return result
}

-// Get spider
+// Get spider (by name)
func GetSpiderByName(name string) *Spider {
    s, c := database.GetCol("spiders")
    defer s.Close()
@@ -158,13 +150,13 @@ func GetSpiderByName(name string) *Spider {
    var result *Spider
    if err := c.Find(bson.M{"name": name}).One(&result); err != nil {
        log.Errorf("get spider error: %s, spider_name: %s", err.Error(), name)
-       debug.PrintStack()
+       //debug.PrintStack()
        return nil
    }
    return result
}

-// Get spider
+// Get spider (by ID)
func GetSpider(id bson.ObjectId) (Spider, error) {
    s, c := database.GetCol("spiders")
    defer s.Close()
@@ -245,7 +237,7 @@ func RemoveAllSpider() error {
    return nil
}

-// Spider count
+// Get spider count
func GetSpiderCount() (int, error) {
    s, c := database.GetCol("spiders")
    defer s.Close()
@@ -257,7 +249,7 @@ func GetSpiderCount() (int, error) {
    return count, nil
}

-// Spider types
+// Get spider types
func GetSpiderTypes() ([]*entity.SpiderType, error) {
    s, c := database.GetCol("spiders")
    defer s.Close()
@@ -1,39 +1,112 @@
package routes

import (
    "crawlab/constants"
    "crawlab/entity"
    "crawlab/model"
    "crawlab/utils"
    "fmt"
    "github.com/apex/log"
    "github.com/gin-gonic/gin"
    "github.com/globalsign/mgo/bson"
    uuid "github.com/satori/go.uuid"
    "github.com/spf13/viper"
    "gopkg.in/yaml.v2"
    "io"
    "io/ioutil"
    "net/http"
    "os"
    "path/filepath"
    "runtime/debug"
)

type Field struct {
    Name  string `yaml:"name" json:"name"`
    Css   string `yaml:"css" json:"css"`
    Xpath string `yaml:"xpath" json:"xpath"`
    Attr  string `yaml:"attr" json:"attr"`
    Stage string `yaml:"stage" json:"stage"`
}

type Stage struct {
    List   bool    `yaml:"list" json:"list"`
    Css    string  `yaml:"css" json:"css"`
    Xpath  string  `yaml:"xpath" json:"xpath"`
    Fields []Field `yaml:"fields" json:"fields"`
}

type ConfigSpiderData struct {
    Version  string           `yaml:"version" json:"version"`
    StartUrl string           `yaml:"startUrl" json:"start_url"`
    Stages   map[string]Stage `yaml:"stages" json:"stages"`
}
// Add configurable spider
func PutConfigSpider(c *gin.Context) {
    var spider model.Spider
    if err := c.ShouldBindJSON(&spider); err != nil {
        HandleError(http.StatusBadRequest, c, err)
        return
    }

    // Spider name must not be empty
    if spider.Name == "" {
        HandleErrorF(http.StatusBadRequest, c, "spider name should not be empty")
        return
    }

    // Check whether the spider already exists
    if spider := model.GetSpiderByName(spider.Name); spider != nil {
        HandleErrorF(http.StatusBadRequest, c, fmt.Sprintf("spider for '%s' already exists", spider.Name))
        return
    }

    // Set the spider type
    spider.Type = constants.Configurable

    // Set FileId to the null ObjectId
    spider.FileId = bson.ObjectIdHex(constants.ObjectIdNull)

    // Add the spider to the database
    if err := spider.Add(); err != nil {
        HandleError(http.StatusInternalServerError, c, err)
        return
    }

    c.JSON(http.StatusOK, Response{
        Status:  "ok",
        Message: "success",
        Data:    spider,
    })
}

// Update configurable spider
func PostConfigSpider(c *gin.Context) {
    PostSpider(c)
}

func UploadConfigSpider(c *gin.Context) {
    // Get the uploaded file
    file, header, err := c.Request.FormFile("file")
    if err != nil {
        HandleError(http.StatusBadRequest, c, err)
        return
    }

    // The filename must be "Spiderfile"
    filename := header.Filename
    if filename != "Spiderfile" {
        HandleErrorF(http.StatusBadRequest, c, "filename must be 'Spiderfile'")
        return
    }

    // Create the tmp directory in case it does not exist
    tmpPath := viper.GetString("other.tmppath")
    if !utils.Exists(tmpPath) {
        if err := os.MkdirAll(tmpPath, os.ModePerm); err != nil {
            log.Error("mkdir other.tmppath dir error:" + err.Error())
            debug.PrintStack()
            HandleError(http.StatusBadRequest, c, err)
            return
        }
    }

    // Create the temp file
    randomId := uuid.NewV4()
    tmpFilePath := filepath.Join(tmpPath, "Spiderfile."+randomId.String())
    out, err := os.Create(tmpFilePath)
    if err != nil {
        HandleError(http.StatusInternalServerError, c, err)
        return
    }
    _, err = io.Copy(out, file)
    if err != nil {
        HandleError(http.StatusInternalServerError, c, err)
        return
    }
    _ = out.Close()

    // Build the config data
-   data := ConfigSpiderData{}
+   data := entity.ConfigSpiderData{}

    // Read the YAML file
-   yamlFile, err := ioutil.ReadFile("./template/Spiderfile")
+   yamlFile, err := ioutil.ReadFile(tmpFilePath)
    if err != nil {
        HandleError(http.StatusInternalServerError, c, err)
        return
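To exercise UploadConfigSpider, a client has to send a multipart form whose field is named "file" and whose filename is exactly "Spiderfile", as enforced above. A minimal sketch follows; the base URL, spider id placeholder and Authorization header are assumptions, as before.

package main

import (
    "bytes"
    "fmt"
    "io"
    "mime/multipart"
    "net/http"
    "os"
)

func main() {
    // Build a multipart body: form field "file", filename exactly "Spiderfile".
    var buf bytes.Buffer
    w := multipart.NewWriter(&buf)
    part, err := w.CreateFormFile("file", "Spiderfile")
    if err != nil {
        panic(err)
    }
    f, err := os.Open("Spiderfile") // local Spiderfile to upload
    if err != nil {
        panic(err)
    }
    defer f.Close()
    if _, err := io.Copy(part, f); err != nil {
        panic(err)
    }
    _ = w.Close()

    // Assumed base URL; spiderId is a placeholder for the spider's ObjectId.
    spiderId := "PLACEHOLDER_SPIDER_ID"
    req, err := http.NewRequest(http.MethodPost, "http://localhost:8000/config_spiders/"+spiderId+"/upload", &buf)
    if err != nil {
        panic(err)
    }
    req.Header.Set("Content-Type", w.FormDataContentType())
    req.Header.Set("Authorization", "PLACEHOLDER_JWT") // assumption: token format depends on the auth middleware

    resp, err := http.DefaultClient.Do(req)
    if err != nil {
        panic(err)
    }
    defer resp.Body.Close()
    fmt.Println(resp.Status)
}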
@@ -116,12 +116,23 @@ func PublishAllSpiders() {

// Publish spider
func PublishSpider(spider model.Spider) {
-   // Look up the GridFS file; if it does not exist, delete the spider
-   gfFile := model.GetGridFs(spider.FileId)
-   if gfFile == nil {
-       _ = model.RemoveSpider(spider.Id)
+   // Look up the GridFS file; if it does not exist, mark the spider file as missing
+   var gfFile *model.GridFs
+   if spider.Type == constants.Customized {
+       gfFile = model.GetGridFs(spider.FileId)
+       if gfFile == nil {
+           spider.FileId = constants.ObjectIdNull
+           _ = spider.Save()
+           return
+       }
+   }

    // If FileId is null, the spider has not been uploaded to GridFS yet, so skip it
    if spider.FileId == bson.ObjectIdHex(constants.ObjectIdNull) {
        return
    }

    // Get the spider sync instance
    spiderSync := spider_handler.SpiderSync{
        Spider: spider,
    }