加入修改可配置POST接口

This commit is contained in:
marvzhang
2019-11-28 16:08:13 +08:00
parent d71812147a
commit 8210fa6c20
4 changed files with 128 additions and 90 deletions

View File

@@ -140,10 +140,11 @@ func main() {
authGroup.GET("/spiders/:id/stats", routes.GetSpiderStats) // 爬虫统计数据
authGroup.GET("/spider/types", routes.GetSpiderTypes) // 爬虫类型
// 可配置爬虫
authGroup.GET("/config_spiders/:id/config", routes.GetConfigSpiderConfig) // 可配置爬虫配置
authGroup.PUT("/config_spiders", routes.PutConfigSpider) // 添加可配置爬虫
authGroup.POST("/config_spiders/:id", routes.PostConfigSpider) // 修改可配置爬虫
authGroup.POST("/config_spiders/:id/upload", routes.UploadConfigSpider) // 上传可配置爬虫
authGroup.GET("/config_spiders/:id/config", routes.GetConfigSpiderConfig) // 获取可配置爬虫配置
authGroup.POST("/config_spiders/:id/config", routes.PostConfigSpiderConfig) // 更改可配置爬虫配置
authGroup.PUT("/config_spiders", routes.PutConfigSpider) // 添加可配置爬虫
authGroup.POST("/config_spiders/:id", routes.PostConfigSpider) // 修改可配置爬虫
authGroup.POST("/config_spiders/:id/upload", routes.UploadConfigSpider) // 上传可配置爬虫
// 任务
authGroup.GET("/tasks", routes.GetTaskList) // 任务列表
authGroup.GET("/tasks/:id", routes.GetTask) // 任务详情

View File

@@ -216,19 +216,19 @@ func (g ScrapyGenerator) GetExtractStringFromField(f entity.Field) string {
// 如果为CSS
if f.Attr == "" {
// 文本
return fmt.Sprintf(`css(%s::text())`, f.Css)
return fmt.Sprintf(`css('%s::text()')`, f.Css)
} else {
// 属性
return fmt.Sprintf(`css(%s::attr("%s"))`, f.Css, f.Attr)
return fmt.Sprintf(`css('%s::attr("%s")')`, f.Css, f.Attr)
}
} else {
// 如果为XPath
if f.Attr == "" {
// 文本
return fmt.Sprintf(`xpath(%s/text())`, f.Xpath)
return fmt.Sprintf(`xpath('%s/text()')`, f.Xpath)
} else {
// 属性
return fmt.Sprintf(`xpath(%s/@%s)`, f.Xpath, f.Attr)
return fmt.Sprintf(`xpath('%s/@%s')`, f.Xpath, f.Attr)
}
}
}

View File

@@ -2,16 +2,13 @@ package routes
import (
"crawlab/constants"
"crawlab/database"
"crawlab/entity"
"crawlab/model"
"crawlab/services"
"crawlab/utils"
"fmt"
"github.com/apex/log"
"github.com/gin-gonic/gin"
"github.com/globalsign/mgo/bson"
uuid "github.com/satori/go.uuid"
"github.com/spf13/viper"
"gopkg.in/yaml.v2"
"io"
@@ -19,7 +16,6 @@ import (
"net/http"
"os"
"path/filepath"
"runtime/debug"
)
// 添加可配置爬虫
@@ -151,91 +147,39 @@ func UploadConfigSpider(c *gin.Context) {
return
}
// 赋值 stage_name
for stageName, stage := range configData.Stages {
stage.Name = stageName
configData.Stages[stageName] = stage
}
// 删除已有的爬虫文件
for _, fInfo := range utils.ListDir(spiderDir) {
// 不删除Spiderfile
if fInfo.Name() == filename {
continue
}
// 删除其他文件
if err := os.RemoveAll(filepath.Join(spiderDir, fInfo.Name())); err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
}
// 拷贝爬虫文件
tplDir := "./template/scrapy"
for _, fInfo := range utils.ListDir(tplDir) {
// 跳过Spiderfile
if fInfo.Name() == "Spiderfile" {
continue
}
srcPath := filepath.Join(tplDir, fInfo.Name())
if fInfo.IsDir() {
dirPath := filepath.Join(spiderDir, fInfo.Name())
if err := utils.CopyDir(srcPath, dirPath); err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
} else {
if err := utils.CopyFile(srcPath, filepath.Join(spiderDir, fInfo.Name())); err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
}
}
// 更改爬虫文件
if err := services.GenerateConfigSpiderFiles(spider, configData); err != nil {
// 根据序列化后的数据处理爬虫文件
if err := services.ProcessSpiderFilesFromConfigData(spider, configData); err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
// 打包为 zip 文件
files, err := utils.GetFilesFromDir(spiderDir)
c.JSON(http.StatusOK, Response{
Status: "ok",
Message: "success",
})
}
func PostConfigSpiderConfig(c *gin.Context) {
id := c.Param("id")
// 获取爬虫
var spider model.Spider
spider, err := model.GetSpider(bson.ObjectIdHex(id))
if err != nil {
HandleErrorF(http.StatusBadRequest, c, fmt.Sprintf("cannot find spider (id: %s)", id))
}
// 反序列化配置数据
var configData entity.ConfigSpiderData
if err := c.ShouldBindJSON(&configData); err != nil {
HandleError(http.StatusBadRequest, c, err)
}
// 根据序列化后的数据处理爬虫文件
if err := services.ProcessSpiderFilesFromConfigData(spider, configData); err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
randomId := uuid.NewV4()
tmpFilePath := filepath.Join(viper.GetString("other.tmppath"), spider.Name+"."+randomId.String()+".zip")
spiderZipFileName := spider.Name + ".zip"
if err := utils.Compress(files, tmpFilePath); err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
// 获取 GridFS 实例
s, gf := database.GetGridFs("files")
defer s.Close()
// 判断文件是否已经存在
var gfFile model.GridFs
if err := gf.Find(bson.M{"filename": spiderZipFileName}).One(&gfFile); err == nil {
// 已经存在文件,则删除
_ = gf.RemoveId(gfFile.Id)
}
// 上传到GridFs
fid, err := services.UploadToGridFs(spiderZipFileName, tmpFilePath)
if err != nil {
log.Errorf("upload to grid fs error: %s", err.Error())
debug.PrintStack()
return
}
// 保存爬虫 FileId
spider.FileId = fid
_ = spider.Save()
// TODO: 替换Spiderfile文件
c.JSON(http.StatusOK, Response{
Status: "ok",

View File

@@ -2,11 +2,19 @@ package services
import (
"crawlab/constants"
"crawlab/database"
"crawlab/entity"
"crawlab/model"
"crawlab/model/config_spider"
"crawlab/utils"
"errors"
"fmt"
"github.com/apex/log"
"github.com/globalsign/mgo/bson"
uuid "github.com/satori/go.uuid"
"github.com/spf13/viper"
"os"
"path/filepath"
"strings"
)
@@ -139,3 +147,88 @@ func IsUniqueConfigSpiderFields(fields []entity.Field) bool {
}
return true
}
func ProcessSpiderFilesFromConfigData(spider model.Spider, configData entity.ConfigSpiderData) error {
spiderDir := spider.Src
// 赋值 stage_name
for stageName, stage := range configData.Stages {
stage.Name = stageName
configData.Stages[stageName] = stage
}
// 删除已有的爬虫文件
for _, fInfo := range utils.ListDir(spiderDir) {
// 不删除Spiderfile
if fInfo.Name() == "Spiderfile" {
continue
}
// 删除其他文件
if err := os.RemoveAll(filepath.Join(spiderDir, fInfo.Name())); err != nil {
return err
}
}
// 拷贝爬虫文件
tplDir := "./template/scrapy"
for _, fInfo := range utils.ListDir(tplDir) {
// 跳过Spiderfile
if fInfo.Name() == "Spiderfile" {
continue
}
srcPath := filepath.Join(tplDir, fInfo.Name())
if fInfo.IsDir() {
dirPath := filepath.Join(spiderDir, fInfo.Name())
if err := utils.CopyDir(srcPath, dirPath); err != nil {
return err
}
} else {
if err := utils.CopyFile(srcPath, filepath.Join(spiderDir, fInfo.Name())); err != nil {
return err
}
}
}
// 更改爬虫文件
if err := GenerateConfigSpiderFiles(spider, configData); err != nil {
return err
}
// 打包为 zip 文件
files, err := utils.GetFilesFromDir(spiderDir)
if err != nil {
return err
}
randomId := uuid.NewV4()
tmpFilePath := filepath.Join(viper.GetString("other.tmppath"), spider.Name+"."+randomId.String()+".zip")
spiderZipFileName := spider.Name + ".zip"
if err := utils.Compress(files, tmpFilePath); err != nil {
return err
}
// 获取 GridFS 实例
s, gf := database.GetGridFs("files")
defer s.Close()
// 判断文件是否已经存在
var gfFile model.GridFs
if err := gf.Find(bson.M{"filename": spiderZipFileName}).One(&gfFile); err == nil {
// 已经存在文件,则删除
_ = gf.RemoveId(gfFile.Id)
}
// 上传到GridFs
fid, err := UploadToGridFs(spiderZipFileName, tmpFilePath)
if err != nil {
log.Errorf("upload to grid fs error: %s", err.Error())
return err
}
// 保存爬虫 FileId
spider.FileId = fid
_ = spider.Save()
return nil
}