Mirror of https://github.com/crawlab-team/crawlab.git (synced 2026-01-22 17:31:03 +01:00)
Commit: 加入修改可配置POST接口 (add a POST endpoint for updating configurable spider config)
@@ -140,10 +140,11 @@ func main() {
 	authGroup.GET("/spiders/:id/stats", routes.GetSpiderStats) // spider stats
 	authGroup.GET("/spider/types", routes.GetSpiderTypes)      // spider types
 	// configurable spiders
-	authGroup.GET("/config_spiders/:id/config", routes.GetConfigSpiderConfig)   // configurable spider config
+	authGroup.GET("/config_spiders/:id/config", routes.GetConfigSpiderConfig)   // get configurable spider config
+	authGroup.POST("/config_spiders/:id/config", routes.PostConfigSpiderConfig) // update configurable spider config
 	authGroup.PUT("/config_spiders", routes.PutConfigSpider)                    // add a configurable spider
 	authGroup.POST("/config_spiders/:id", routes.PostConfigSpider)              // update a configurable spider
 	authGroup.POST("/config_spiders/:id/upload", routes.UploadConfigSpider)     // upload a configurable spider
 	// tasks
 	authGroup.GET("/tasks", routes.GetTaskList) // task list
 	authGroup.GET("/tasks/:id", routes.GetTask) // task detail
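The new POST route pairs with the existing GET route, so a client can read a spider's config and write it back. A minimal client sketch in Go follows; the base URL, port, auth header, spider id, and payload shape are all illustrative assumptions, not confirmed by this diff:

package main

import (
	"bytes"
	"fmt"
	"net/http"
)

func main() {
	// assumed JSON shape for entity.ConfigSpiderData; field names are hypothetical
	payload := []byte(`{"stages": {"list": {"fields": [{"name": "title", "css": "h1"}]}}}`)

	// assumed server address and route prefix; "<spider_id>" is a placeholder
	url := "http://localhost:8000/api/config_spiders/<spider_id>/config"

	req, err := http.NewRequest(http.MethodPost, url, bytes.NewReader(payload))
	if err != nil {
		panic(err)
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Authorization", "<token>") // assumed auth scheme for authGroup

	res, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer res.Body.Close()
	fmt.Println(res.Status) // expect "200 OK" with {"status": "ok", ...} on success
}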
@@ -216,19 +216,19 @@ func (g ScrapyGenerator) GetExtractStringFromField(f entity.Field) string {
 	// if CSS
 	if f.Attr == "" {
 		// text
-		return fmt.Sprintf(`css(%s::text())`, f.Css)
+		return fmt.Sprintf(`css('%s::text()')`, f.Css)
 	} else {
 		// attribute
-		return fmt.Sprintf(`css(%s::attr("%s"))`, f.Css, f.Attr)
+		return fmt.Sprintf(`css('%s::attr("%s")')`, f.Css, f.Attr)
 	}
 } else {
 	// if XPath
 	if f.Attr == "" {
 		// text
-		return fmt.Sprintf(`xpath(%s/text())`, f.Xpath)
+		return fmt.Sprintf(`xpath('%s/text()')`, f.Xpath)
 	} else {
 		// attribute
-		return fmt.Sprintf(`xpath(%s/@%s)`, f.Xpath, f.Attr)
+		return fmt.Sprintf(`xpath('%s/@%s')`, f.Xpath, f.Attr)
 	}
 }
}
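All four replacements above make the same fix: the generated selector is now wrapped in single quotes, so the emitted extractor call is a valid Python expression instead of bare, unquoted text. A self-contained sketch showing the before/after output of these Sprintf patterns:

package main

import "fmt"

func main() {
	css, attr := "div.title a", "href"

	// before the fix: css(div.title a::attr("href")) — the selector is not a string literal
	fmt.Println("before:", fmt.Sprintf(`css(%s::attr("%s"))`, css, attr))

	// after the fix: css('div.title a::attr("href")') — the selector is quoted
	fmt.Println("after: ", fmt.Sprintf(`css('%s::attr("%s")')`, css, attr))
}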
@@ -2,16 +2,13 @@ package routes

 import (
 	"crawlab/constants"
 	"crawlab/database"
 	"crawlab/entity"
 	"crawlab/model"
 	"crawlab/services"
 	"crawlab/utils"
 	"fmt"
 	"github.com/apex/log"
 	"github.com/gin-gonic/gin"
 	"github.com/globalsign/mgo/bson"
 	uuid "github.com/satori/go.uuid"
 	"github.com/spf13/viper"
 	"gopkg.in/yaml.v2"
 	"io"
@@ -19,7 +16,6 @@ import (
 	"net/http"
 	"os"
 	"path/filepath"
 	"runtime/debug"
 )

// add a configurable spider
@@ -151,91 +147,39 @@ func UploadConfigSpider(c *gin.Context) {
 		return
 	}

-	// assign stage_name
-	for stageName, stage := range configData.Stages {
-		stage.Name = stageName
-		configData.Stages[stageName] = stage
-	}
-
-	// delete existing spider files
-	for _, fInfo := range utils.ListDir(spiderDir) {
-		// don't delete the Spiderfile
-		if fInfo.Name() == filename {
-			continue
-		}
-
-		// delete everything else
-		if err := os.RemoveAll(filepath.Join(spiderDir, fInfo.Name())); err != nil {
-			HandleError(http.StatusInternalServerError, c, err)
-			return
-		}
-	}
-
-	// copy spider template files
-	tplDir := "./template/scrapy"
-	for _, fInfo := range utils.ListDir(tplDir) {
-		// skip the Spiderfile
-		if fInfo.Name() == "Spiderfile" {
-			continue
-		}
-
-		srcPath := filepath.Join(tplDir, fInfo.Name())
-		if fInfo.IsDir() {
-			dirPath := filepath.Join(spiderDir, fInfo.Name())
-			if err := utils.CopyDir(srcPath, dirPath); err != nil {
-				HandleError(http.StatusInternalServerError, c, err)
-				return
-			}
-		} else {
-			if err := utils.CopyFile(srcPath, filepath.Join(spiderDir, fInfo.Name())); err != nil {
-				HandleError(http.StatusInternalServerError, c, err)
-				return
-			}
-		}
-	}
-
-	// generate the spider files
-	if err := services.GenerateConfigSpiderFiles(spider, configData); err != nil {
+	// process the spider files from the deserialized config data
+	if err := services.ProcessSpiderFilesFromConfigData(spider, configData); err != nil {
 		HandleError(http.StatusInternalServerError, c, err)
 		return
 	}

-	// package as a zip file
-	files, err := utils.GetFilesFromDir(spiderDir)
+	c.JSON(http.StatusOK, Response{
+		Status:  "ok",
+		Message: "success",
+	})
+}
+
+// update a configurable spider's config
+func PostConfigSpiderConfig(c *gin.Context) {
+	id := c.Param("id")
+
+	// get the spider
+	var spider model.Spider
+	spider, err := model.GetSpider(bson.ObjectIdHex(id))
+	if err != nil {
+		HandleErrorF(http.StatusBadRequest, c, fmt.Sprintf("cannot find spider (id: %s)", id))
+	}
+
+	// deserialize the config data
+	var configData entity.ConfigSpiderData
+	if err := c.ShouldBindJSON(&configData); err != nil {
+		HandleError(http.StatusBadRequest, c, err)
+	}
+
+	// process the spider files from the deserialized config data
+	if err := services.ProcessSpiderFilesFromConfigData(spider, configData); err != nil {
+		HandleError(http.StatusInternalServerError, c, err)
+		return
+	}
-	randomId := uuid.NewV4()
-	tmpFilePath := filepath.Join(viper.GetString("other.tmppath"), spider.Name+"."+randomId.String()+".zip")
-	spiderZipFileName := spider.Name + ".zip"
-	if err := utils.Compress(files, tmpFilePath); err != nil {
-		HandleError(http.StatusInternalServerError, c, err)
-		return
-	}
-
-	// get a GridFS instance
-	s, gf := database.GetGridFs("files")
-	defer s.Close()
-
-	// check whether the file already exists
-	var gfFile model.GridFs
-	if err := gf.Find(bson.M{"filename": spiderZipFileName}).One(&gfFile); err == nil {
-		// the file already exists, so remove it
-		_ = gf.RemoveId(gfFile.Id)
-	}
-
-	// upload to GridFS
-	fid, err := services.UploadToGridFs(spiderZipFileName, tmpFilePath)
-	if err != nil {
-		log.Errorf("upload to grid fs error: %s", err.Error())
-		debug.PrintStack()
-		return
-	}
-
-	// save the spider's FileId
-	spider.FileId = fid
-	_ = spider.Save()
-	// TODO: replace the Spiderfile
-
 	c.JSON(http.StatusOK, Response{
 		Status: "ok",
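One caveat in the new PostConfigSpiderConfig above: the first two error branches report the error but do not return, so the handler keeps executing with a zero-value spider or empty configData. A revised excerpt with explicit returns added (same calls as in the diff; not a full file):

func PostConfigSpiderConfig(c *gin.Context) {
	id := c.Param("id")

	// get the spider
	spider, err := model.GetSpider(bson.ObjectIdHex(id))
	if err != nil {
		HandleErrorF(http.StatusBadRequest, c, fmt.Sprintf("cannot find spider (id: %s)", id))
		return // without this, execution continues with a zero-value spider
	}

	// deserialize the config data
	var configData entity.ConfigSpiderData
	if err := c.ShouldBindJSON(&configData); err != nil {
		HandleError(http.StatusBadRequest, c, err)
		return // likewise, stop on malformed input
	}

	// process the spider files from the deserialized config data
	if err := services.ProcessSpiderFilesFromConfigData(spider, configData); err != nil {
		HandleError(http.StatusInternalServerError, c, err)
		return
	}

	c.JSON(http.StatusOK, Response{Status: "ok", Message: "success"})
}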
@@ -2,11 +2,19 @@ package services

 import (
 	"crawlab/constants"
 	"crawlab/database"
 	"crawlab/entity"
 	"crawlab/model"
 	"crawlab/model/config_spider"
 	"crawlab/utils"
 	"errors"
 	"fmt"
 	"github.com/apex/log"
 	"github.com/globalsign/mgo/bson"
 	uuid "github.com/satori/go.uuid"
 	"github.com/spf13/viper"
 	"os"
 	"path/filepath"
 	"strings"
 )

@@ -139,3 +147,88 @@ func IsUniqueConfigSpiderFields(fields []entity.Field) bool {
 	}
 	return true
 }
+
+func ProcessSpiderFilesFromConfigData(spider model.Spider, configData entity.ConfigSpiderData) error {
+	spiderDir := spider.Src
+
+	// assign stage_name
+	for stageName, stage := range configData.Stages {
+		stage.Name = stageName
+		configData.Stages[stageName] = stage
+	}
+
+	// delete existing spider files
+	for _, fInfo := range utils.ListDir(spiderDir) {
+		// don't delete the Spiderfile
+		if fInfo.Name() == "Spiderfile" {
+			continue
+		}
+
+		// delete everything else
+		if err := os.RemoveAll(filepath.Join(spiderDir, fInfo.Name())); err != nil {
+			return err
+		}
+	}
+
+	// copy spider template files
+	tplDir := "./template/scrapy"
+	for _, fInfo := range utils.ListDir(tplDir) {
+		// skip the Spiderfile
+		if fInfo.Name() == "Spiderfile" {
+			continue
+		}
+
+		srcPath := filepath.Join(tplDir, fInfo.Name())
+		if fInfo.IsDir() {
+			dirPath := filepath.Join(spiderDir, fInfo.Name())
+			if err := utils.CopyDir(srcPath, dirPath); err != nil {
+				return err
+			}
+		} else {
+			if err := utils.CopyFile(srcPath, filepath.Join(spiderDir, fInfo.Name())); err != nil {
+				return err
+			}
+		}
+	}
+
+	// generate the spider files
+	if err := GenerateConfigSpiderFiles(spider, configData); err != nil {
+		return err
+	}
+
+	// package as a zip file
+	files, err := utils.GetFilesFromDir(spiderDir)
+	if err != nil {
+		return err
+	}
+	randomId := uuid.NewV4()
+	tmpFilePath := filepath.Join(viper.GetString("other.tmppath"), spider.Name+"."+randomId.String()+".zip")
+	spiderZipFileName := spider.Name + ".zip"
+	if err := utils.Compress(files, tmpFilePath); err != nil {
+		return err
+	}
+
+	// get a GridFS instance
+	s, gf := database.GetGridFs("files")
+	defer s.Close()
+
+	// check whether the file already exists
+	var gfFile model.GridFs
+	if err := gf.Find(bson.M{"filename": spiderZipFileName}).One(&gfFile); err == nil {
+		// the file already exists, so remove it
+		_ = gf.RemoveId(gfFile.Id)
+	}
+
+	// upload to GridFS
+	fid, err := UploadToGridFs(spiderZipFileName, tmpFilePath)
+	if err != nil {
+		log.Errorf("upload to grid fs error: %s", err.Error())
+		return err
+	}
+
+	// save the spider's FileId
+	spider.FileId = fid
+	_ = spider.Save()
+
+	return nil
+}
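A Go detail worth noting in the stage_name loop above: ranging over a map yields a copy of each value, so setting stage.Name on its own never reaches the map; the write-back configData.Stages[stageName] = stage is what persists the change. A self-contained sketch of the same idiom:

package main

import "fmt"

type Stage struct {
	Name string
}

func main() {
	stages := map[string]Stage{"list": {}, "detail": {}}
	for name, stage := range stages {
		stage.Name = name    // mutates the loop-local copy only
		stages[name] = stage // write back so the map entry is updated
	}
	fmt.Println(stages) // map[detail:{detail} list:{list}]
}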