diff --git a/backend/conf/config.yml b/backend/conf/config.yml
index a5e0b23b..60d2bd41 100644
--- a/backend/conf/config.yml
+++ b/backend/conf/config.yml
@@ -15,7 +15,7 @@ redis:
log:
level: info
path: "/var/logs/crawlab"
- isDeletePeriodically: "Y"
+ isDeletePeriodically: "N"
deleteFrequency: "@hourly"
server:
host: 0.0.0.0
diff --git a/backend/entity/common.go b/backend/entity/common.go
index 332cc494..c46ae4f9 100644
--- a/backend/entity/common.go
+++ b/backend/entity/common.go
@@ -3,15 +3,15 @@ package entity
import "strconv"
type Page struct {
- Skip int
- Limit int
- PageNum int
+ Skip int
+ Limit int
+ PageNum int
PageSize int
}
-func (p *Page)GetPage(pageNum string, pageSize string) {
+func (p *Page) GetPage(pageNum string, pageSize string) {
p.PageNum, _ = strconv.Atoi(pageNum)
p.PageSize, _ = strconv.Atoi(pageSize)
p.Skip = p.PageSize * (p.PageNum - 1)
p.Limit = p.PageSize
-}
\ No newline at end of file
+}
diff --git a/backend/entity/config_spider.go b/backend/entity/config_spider.go
index 5e0fe1e1..3fe28bc9 100644
--- a/backend/entity/config_spider.go
+++ b/backend/entity/config_spider.go
@@ -1,25 +1,30 @@
package entity
+type ConfigSpiderData struct {
+ Version string `yaml:"version" json:"version"`
+ Engine string `yaml:"engine" json:"engine"`
+ StartUrl string `yaml:"start_url" json:"start_url"`
+ StartStage string `yaml:"start_stage" json:"start_stage"`
+ Stages map[string]Stage `yaml:"stages" json:"stages"`
+ Settings map[string]string `yaml:"settings" json:"settings"`
+}
+
+type Stage struct {
+ Name string `yaml:"name" json:"name"`
+ IsList bool `yaml:"is_list" json:"is_list"`
+ ListCss string `yaml:"list_css" json:"list_css"`
+ ListXpath string `yaml:"list_xpath" json:"list_xpath"`
+ PageCss string `yaml:"page_css" json:"page_css"`
+ PageXpath string `yaml:"page_xpath" json:"page_xpath"`
+ PageAttr string `yaml:"page_attr" json:"page_attr"`
+ Fields []Field `yaml:"fields" json:"fields"`
+}
+
type Field struct {
Name string `yaml:"name" json:"name"`
Css string `yaml:"css" json:"css"`
Xpath string `yaml:"xpath" json:"xpath"`
Attr string `yaml:"attr" json:"attr"`
NextStage string `yaml:"next_stage" json:"next_stage"`
-}
-
-type Stage struct {
- IsList bool `yaml:"is_list" json:"is_list"`
- ListCss string `yaml:"list_css" json:"list_css"`
- PageCss string `yaml:"page_css" json:"page_css"`
- PageAttr string `yaml:"page_attr" json:"page_attr"`
- Fields []Field `yaml:"fields" json:"fields"`
-}
-
-type ConfigSpiderData struct {
- Version string `yaml:"version" json:"version"`
- Engine string `yaml:"engine" json:"engine"`
- StartUrl string `yaml:"start_url" json:"start_url"`
- StartStage string `yaml:"start_stage" json:"start_stage"`
- Stages map[string]Stage `yaml:"stages" json:"stages"`
+ Remark string `yaml:"remark" json:"remark"`
}
diff --git a/backend/main.go b/backend/main.go
index 565c7892..92863a20 100644
--- a/backend/main.go
+++ b/backend/main.go
@@ -47,6 +47,8 @@ func main() {
panic(err)
}
log.Info("初始化定期清理日志配置成功")
+ }else {
+ log.Info("默认未开启定期清理日志配置")
}
// 初始化Mongodb数据库
@@ -140,9 +142,13 @@ func main() {
authGroup.GET("/spiders/:id/stats", routes.GetSpiderStats) // 爬虫统计数据
authGroup.GET("/spider/types", routes.GetSpiderTypes) // 爬虫类型
// 可配置爬虫
- authGroup.PUT("/config_spiders", routes.PutConfigSpider) // 添加可配置爬虫
- authGroup.POST("/config_spiders/:id", routes.PostConfigSpider) // 修改可配置爬虫
- authGroup.POST("/config_spiders/:id/upload", routes.UploadConfigSpider) // 上传可配置爬虫
+ authGroup.GET("/config_spiders/:id/config", routes.GetConfigSpiderConfig) // 获取可配置爬虫配置
+ authGroup.POST("/config_spiders/:id/config", routes.PostConfigSpiderConfig) // 更改可配置爬虫配置
+ authGroup.PUT("/config_spiders", routes.PutConfigSpider) // 添加可配置爬虫
+ authGroup.POST("/config_spiders/:id", routes.PostConfigSpider) // 修改可配置爬虫
+ authGroup.POST("/config_spiders/:id/upload", routes.UploadConfigSpider) // 上传可配置爬虫
+ authGroup.POST("/config_spiders/:id/spiderfile", routes.PostConfigSpiderSpiderfile) // 上传可配置爬虫
+ authGroup.GET("/config_spiders_templates", routes.GetConfigSpiderTemplateList) // 获取可配置爬虫模版列表
// 任务
authGroup.GET("/tasks", routes.GetTaskList) // 任务列表
authGroup.GET("/tasks/:id", routes.GetTask) // 任务详情
diff --git a/backend/mock/node_test.go b/backend/mock/node_test.go
index 669cafc5..abd568c2 100644
--- a/backend/mock/node_test.go
+++ b/backend/mock/node_test.go
@@ -42,12 +42,12 @@ func init() {
app.DELETE("/tasks/:id", DeleteTask) // 删除任务
app.GET("/tasks/:id/results", GetTaskResults) // 任务结果
app.GET("/tasks/:id/results/download", DownloadTaskResultsCsv) // 下载任务结果
- app.GET("/spiders", GetSpiderList) // 爬虫列表
- app.GET("/spiders/:id", GetSpider) // 爬虫详情
- app.POST("/spiders/:id", PostSpider) // 修改爬虫
- app.DELETE("/spiders/:id",DeleteSpider) // 删除爬虫
- app.GET("/spiders/:id/tasks",GetSpiderTasks) // 爬虫任务列表
- app.GET("/spiders/:id/dir",GetSpiderDir) // 爬虫目录
+ app.GET("/spiders", GetSpiderList) // 爬虫列表
+ app.GET("/spiders/:id", GetSpider) // 爬虫详情
+ app.POST("/spiders/:id", PostSpider) // 修改爬虫
+ app.DELETE("/spiders/:id", DeleteSpider) // 删除爬虫
+ app.GET("/spiders/:id/tasks", GetSpiderTasks) // 爬虫任务列表
+ app.GET("/spiders/:id/dir", GetSpiderDir) // 爬虫目录
}
// mock test, test data in ./mock
diff --git a/backend/mock/stats.go b/backend/mock/stats.go
index db2348c6..f0227da9 100644
--- a/backend/mock/stats.go
+++ b/backend/mock/stats.go
@@ -6,8 +6,6 @@ import (
"net/http"
)
-
-
var taskDailyItems = []model.TaskDailyItem{
{
Date: "2019/08/19",
diff --git a/backend/mock/system.go b/backend/mock/system.go
index c4807247..f33e02ba 100644
--- a/backend/mock/system.go
+++ b/backend/mock/system.go
@@ -1 +1 @@
-package mock
\ No newline at end of file
+package mock
diff --git a/backend/mock/user.go b/backend/mock/user.go
index c4807247..f33e02ba 100644
--- a/backend/mock/user.go
+++ b/backend/mock/user.go
@@ -1 +1 @@
-package mock
\ No newline at end of file
+package mock
diff --git a/backend/model/config_spider/scrapy.go b/backend/model/config_spider/scrapy.go
index 7503b9bf..6fcb77f0 100644
--- a/backend/model/config_spider/scrapy.go
+++ b/backend/model/config_spider/scrapy.go
@@ -131,12 +131,7 @@ func (g ScrapyGenerator) GetNonListParserString(stageName string, stage entity.S
// iterate over the field list
for _, f := range stage.Fields {
- line := ""
- if f.Attr == "" {
- line += fmt.Sprintf(`item['%s'] = response.css('%s::text').extract_first()`, f.Name, f.Css)
- } else {
- line += fmt.Sprintf(`item['%s'] = response.css('%s::attr("%s")').extract_first()`, f.Name, f.Css, f.Attr)
- }
+ line := fmt.Sprintf(`item['%s'] = response.%s.extract_first()`, f.Name, g.GetExtractStringFromField(f))
line = g.PadCode(line, 2)
str += line
}
@@ -163,19 +158,14 @@ func (g ScrapyGenerator) GetListParserString(stageName string, stage entity.Stag
str += g.PadCode(`prev_item = response.meta.get('item')`, 2)
// for loop over the list elements
- str += g.PadCode(fmt.Sprintf(`for elem in response.css('%s'):`, stage.ListCss), 2)
+ str += g.PadCode(fmt.Sprintf(`for elem in response.%s:`, g.GetListString(stage)), 2)
// construct the item
str += g.PadCode(`item = Item()`, 3)
// iterate over the field list
for _, f := range stage.Fields {
- line := ""
- if f.Attr == "" {
- line += fmt.Sprintf(`item['%s'] = elem.css('%s::text').extract_first()`, f.Name, f.Css)
- } else {
- line += fmt.Sprintf(`item['%s'] = elem.css('%s::attr("%s")').extract_first()`, f.Name, f.Css, f.Attr)
- }
+ line := fmt.Sprintf(`item['%s'] = elem.%s.extract_first()`, f.Name, g.GetExtractStringFromField(f))
line = g.PadCode(line, 3)
str += line
}
@@ -195,15 +185,9 @@ func (g ScrapyGenerator) GetListParserString(stageName string, stage entity.Stag
}
// pagination
- if stage.PageCss != "" {
- // pagination element attribute, defaults to href
- pageAttr := "href"
- if stage.PageAttr != "" {
- pageAttr = stage.PageAttr
- }
-
- str += g.PadCode(fmt.Sprintf(`next_url = response.css('%s::attr("%s")').extract_first()`, stage.PageCss, pageAttr), 2)
- str += g.PadCode(fmt.Sprintf(`yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_%s, meta={'item': item})`, stageName), 2)
+ if stage.PageCss != "" || stage.PageXpath != "" {
+ str += g.PadCode(fmt.Sprintf(`next_url = response.%s.extract_first()`, g.GetExtractStringFromStage(stage)), 2)
+ str += g.PadCode(fmt.Sprintf(`yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_%s, meta={'item': prev_item})`, stageName), 2)
}
// append trailing newline
@@ -226,3 +210,49 @@ func (g ScrapyGenerator) GetNextStageField(stage entity.Stage) (entity.Field, er
}
return entity.Field{}, errors.New("cannot find next stage field")
}
+
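+// GetExtractStringFromField builds the Scrapy selector expression for a field,
+// e.g. css('h3 > a::text'), css('a::attr("href")'), xpath('string(.//h3/a)') or
+// xpath('.//h3/a/@href'); validation guarantees only one of css/xpath is set.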
+func (g ScrapyGenerator) GetExtractStringFromField(f entity.Field) string {
+ if f.Css != "" {
+ // 如果为CSS
+ if f.Attr == "" {
+ // 文本
+ return fmt.Sprintf(`css('%s::text')`, f.Css)
+ } else {
+ // 属性
+ return fmt.Sprintf(`css('%s::attr("%s")')`, f.Css, f.Attr)
+ }
+ } else {
+ // 如果为XPath
+ if f.Attr == "" {
+ // 文本
+ return fmt.Sprintf(`xpath('string(%s)')`, f.Xpath)
+ } else {
+ // 属性
+ return fmt.Sprintf(`xpath('%s/@%s')`, f.Xpath, f.Attr)
+ }
+ }
+}
+
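+// GetExtractStringFromStage builds the selector expression for a stage's
+// pagination link, extracting the attribute named by page_attr (default "href").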
+func (g ScrapyGenerator) GetExtractStringFromStage(stage entity.Stage) string {
+ // pagination element attribute, defaults to href
+ pageAttr := "href"
+ if stage.PageAttr != "" {
+ pageAttr = stage.PageAttr
+ }
+
+ if stage.PageCss != "" {
+ // 如果为CSS
+ return fmt.Sprintf(`css('%s::attr("%s")')`, stage.PageCss, pageAttr)
+ } else {
+ // 如果为XPath
+ return fmt.Sprintf(`xpath('%s/@%s')`, stage.PageXpath, pageAttr)
+ }
+}
+
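+// GetListString builds the selector expression for the stage's list elements.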
+func (g ScrapyGenerator) GetListString(stage entity.Stage) string {
+ if stage.ListCss != "" {
+ return fmt.Sprintf(`css('%s')`, stage.ListCss)
+ } else {
+ return fmt.Sprintf(`xpath('%s')`, stage.ListXpath)
+ }
+}
diff --git a/backend/model/spider.go b/backend/model/spider.go
index 53c5ab1f..a0d72c1c 100644
--- a/backend/model/spider.go
+++ b/backend/model/spider.go
@@ -1,11 +1,17 @@
package model
import (
+ "crawlab/constants"
"crawlab/database"
"crawlab/entity"
+ "crawlab/utils"
+ "errors"
"github.com/apex/log"
"github.com/globalsign/mgo"
"github.com/globalsign/mgo/bson"
+ "gopkg.in/yaml.v2"
+ "io/ioutil"
+ "path/filepath"
"runtime/debug"
"time"
)
@@ -25,14 +31,18 @@ type Spider struct {
Site string `json:"site" bson:"site"` // spider website
Envs []Env `json:"envs" bson:"envs"` // environment variables
Remark string `json:"remark" bson:"remark"` // remark
+ Src string `json:"src" bson:"src"` // source code location
// custom spider
- Src string `json:"src" bson:"src"` // source code location
Cmd string `json:"cmd" bson:"cmd"` // execute command
+ // configurable spider
+ Template string `json:"template" bson:"template"` // Spiderfile template
+
// frontend display
- LastRunTs time.Time `json:"last_run_ts"` // last run timestamp
- LastStatus string `json:"last_status"` // last run status
+ LastRunTs time.Time `json:"last_run_ts"` // last run timestamp
+ LastStatus string `json:"last_status"` // last run status
+ Config entity.ConfigSpiderData `json:"config"` // configurable spider config
// timestamps
CreateTs time.Time `json:"create_ts" bson:"create_ts"`
@@ -108,6 +118,10 @@ func GetSpiderList(filter interface{}, skip int, limit int) ([]Spider, int, erro
return spiders, 0, err
}
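+ // normalize nil to an empty slice so the API returns [] instead of null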
+ if spiders == nil {
+ spiders = []Spider{}
+ }
+
// iterate over the spider list
for i, spider := range spiders {
// get the last task
@@ -161,15 +175,25 @@ func GetSpider(id bson.ObjectId) (Spider, error) {
s, c := database.GetCol("spiders")
defer s.Close()
- var result Spider
- if err := c.FindId(id).One(&result); err != nil {
+ // get the spider
+ var spider Spider
+ if err := c.FindId(id).One(&spider); err != nil {
if err != mgo.ErrNotFound {
log.Errorf("get spider error: %s, id: %id", err.Error(), id.Hex())
debug.PrintStack()
}
- return result, err
+ return spider, err
}
- return result, nil
+
+ // if it is a configurable spider, load its config
+ if spider.Type == constants.Configurable && utils.Exists(filepath.Join(spider.Src, "Spiderfile")) {
+ config, err := GetConfigSpiderData(spider)
+ if err != nil {
+ return spider, err
+ }
+ spider.Config = config
+ }
+ return spider, nil
}
// update spider
@@ -209,10 +233,12 @@ func RemoveSpider(id bson.ObjectId) error {
s, gf := database.GetGridFs("files")
defer s.Close()
- if err := gf.RemoveId(result.FileId); err != nil {
- log.Error("remove file error, id:" + result.FileId.Hex())
- debug.PrintStack()
- return err
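+ // only remove the GridFS file if one was actually uploaded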
+ if result.FileId.Hex() != constants.ObjectIdNull {
+ if err := gf.RemoveId(result.FileId); err != nil {
+ log.Error("remove file error, id:" + result.FileId.Hex())
+ debug.PrintStack()
+ return err
+ }
}
return nil
@@ -269,3 +295,35 @@ func GetSpiderTypes() ([]*entity.SpiderType, error) {
return types, nil
}
+
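+// GetConfigSpiderData reads the spider's Spiderfile and parses it into a
+// ConfigSpiderData, filling each stage's Name from its map key.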
+func GetConfigSpiderData(spider Spider) (entity.ConfigSpiderData, error) {
+ // build the config data
+ configData := entity.ConfigSpiderData{}
+
+ // validate spider type
+ if spider.Type != constants.Configurable {
+ return configData, errors.New("not a configurable spider")
+ }
+
+ // Spiderfile path
+ sfPath := filepath.Join(spider.Src, "Spiderfile")
+
+ // read the YAML file
+ yamlFile, err := ioutil.ReadFile(sfPath)
+ if err != nil {
+ return configData, err
+ }
+
+ // unmarshal
+ if err := yaml.Unmarshal(yamlFile, &configData); err != nil {
+ return configData, err
+ }
+
+ // assign stage names from map keys
+ for stageName, stage := range configData.Stages {
+ stage.Name = stageName
+ configData.Stages[stageName] = stage
+ }
+
+ return configData, nil
+}
diff --git a/backend/routes/config_spider.go b/backend/routes/config_spider.go
index 6f4a2893..e387935a 100644
--- a/backend/routes/config_spider.go
+++ b/backend/routes/config_spider.go
@@ -2,16 +2,13 @@ package routes
import (
"crawlab/constants"
- "crawlab/database"
"crawlab/entity"
"crawlab/model"
"crawlab/services"
"crawlab/utils"
"fmt"
- "github.com/apex/log"
"github.com/gin-gonic/gin"
"github.com/globalsign/mgo/bson"
- uuid "github.com/satori/go.uuid"
"github.com/spf13/viper"
"gopkg.in/yaml.v2"
"io"
@@ -19,7 +16,7 @@ import (
"net/http"
"os"
"path/filepath"
- "runtime/debug"
+ "strings"
)
// add configurable spider
@@ -36,6 +33,12 @@ func PutConfigSpider(c *gin.Context) {
return
}
+ // template name must not be empty
+ if spider.Template == "" {
+ HandleErrorF(http.StatusBadRequest, c, "spider template should not be empty")
+ return
+ }
+
// check whether the spider already exists
if spider := model.GetSpiderByName(spider.Name); spider != nil {
HandleErrorF(http.StatusBadRequest, c, fmt.Sprintf("spider for '%s' already exists", spider.Name))
@@ -62,6 +65,23 @@ func PutConfigSpider(c *gin.Context) {
}
spider.Src = spiderDir
+ // copy the Spiderfile template
+ contentByte, err := ioutil.ReadFile("./template/spiderfile/Spiderfile." + spider.Template)
+ if err != nil {
+ HandleError(http.StatusInternalServerError, c, err)
+ return
+ }
+ f, err := os.Create(filepath.Join(spider.Src, "Spiderfile"))
+ if err != nil {
+ HandleError(http.StatusInternalServerError, c, err)
+ return
+ }
+ defer f.Close()
+ if _, err := f.Write(contentByte); err != nil {
+ HandleError(http.StatusInternalServerError, c, err)
+ return
+ }
+
// add the spider to the database
if err := spider.Add(); err != nil {
HandleError(http.StatusInternalServerError, c, err)
@@ -100,8 +120,8 @@ func UploadConfigSpider(c *gin.Context) {
// filename must be Spiderfile (or Spiderfile.yaml/.yml)
filename := header.Filename
- if filename != "Spiderfile" {
- HandleErrorF(http.StatusBadRequest, c, "filename must be 'Spiderfile'")
+ if filename != "Spiderfile" && filename != "Spiderfile.yaml" && filename != "Spiderfile.yml" {
+ HandleErrorF(http.StatusBadRequest, c, "filename must be 'Spiderfile(.yaml|.yml)'")
return
}
@@ -151,88 +171,146 @@ func UploadConfigSpider(c *gin.Context) {
return
}
- // delete existing spider files
- for _, fInfo := range utils.ListDir(spiderDir) {
- // keep the Spiderfile
- if fInfo.Name() == filename {
- continue
- }
-
- // delete other files
- if err := os.RemoveAll(filepath.Join(spiderDir, fInfo.Name())); err != nil {
- HandleError(http.StatusInternalServerError, c, err)
- return
- }
- }
-
- // copy spider template files
- tplDir := "./template/scrapy"
- for _, fInfo := range utils.ListDir(tplDir) {
- // skip the Spiderfile
- if fInfo.Name() == "Spiderfile" {
- continue
- }
-
- srcPath := filepath.Join(tplDir, fInfo.Name())
- if fInfo.IsDir() {
- dirPath := filepath.Join(spiderDir, fInfo.Name())
- if err := utils.CopyDir(srcPath, dirPath); err != nil {
- HandleError(http.StatusInternalServerError, c, err)
- return
- }
- } else {
- if err := utils.CopyFile(srcPath, filepath.Join(spiderDir, fInfo.Name())); err != nil {
- HandleError(http.StatusInternalServerError, c, err)
- return
- }
- }
- }
-
- // generate spider files
- if err := services.GenerateConfigSpiderFiles(spider, configData); err != nil {
+ // process spider files from the deserialized config data
+ if err := services.ProcessSpiderFilesFromConfigData(spider, configData); err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
- // package into a zip file
- files, err := utils.GetFilesFromDir(spiderDir)
- if err != nil {
- HandleError(http.StatusInternalServerError, c, err)
- return
- }
- randomId := uuid.NewV4()
- tmpFilePath := filepath.Join(viper.GetString("other.tmppath"), spider.Name+"."+randomId.String()+".zip")
- spiderZipFileName := spider.Name + ".zip"
- if err := utils.Compress(files, tmpFilePath); err != nil {
- HandleError(http.StatusInternalServerError, c, err)
- return
- }
-
- // get the GridFS instance
- s, gf := database.GetGridFs("files")
- defer s.Close()
-
- // check whether the file already exists
- var gfFile model.GridFs
- if err := gf.Find(bson.M{"filename": spiderZipFileName}).One(&gfFile); err == nil {
- // remove the existing file
- _ = gf.RemoveId(gfFile.Id)
- }
-
- // upload to GridFS
- fid, err := services.UploadToGridFs(spiderZipFileName, tmpFilePath)
- if err != nil {
- log.Errorf("upload to grid fs error: %s", err.Error())
- debug.PrintStack()
- return
- }
-
- // save the spider FileId
- spider.FileId = fid
- _ = spider.Save()
-
c.JSON(http.StatusOK, Response{
Status: "ok",
Message: "success",
})
}
+
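+// PostConfigSpiderSpiderfile overwrites a configurable spider's Spiderfile with
+// the posted content and regenerates the spider code from it.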
+func PostConfigSpiderSpiderfile(c *gin.Context) {
+ type Body struct {
+ Content string `json:"content"`
+ }
+
+ id := c.Param("id")
+
+ // file content from the request body
+ var reqBody Body
+ if err := c.ShouldBindJSON(&reqBody); err != nil {
+ HandleError(http.StatusBadRequest, c, err)
+ return
+ }
+ content := reqBody.Content
+
+ // get the spider
+ var spider model.Spider
+ spider, err := model.GetSpider(bson.ObjectIdHex(id))
+ if err != nil {
+ HandleErrorF(http.StatusBadRequest, c, fmt.Sprintf("cannot find spider (id: %s)", id))
+ return
+ }
+
+ // unmarshal the YAML content
+ var configData entity.ConfigSpiderData
+ if err := yaml.Unmarshal([]byte(content), &configData); err != nil {
+ HandleError(http.StatusBadRequest, c, err)
+ return
+ }
+
+ // validate configData
+ if err := services.ValidateSpiderfile(configData); err != nil {
+ HandleError(http.StatusInternalServerError, c, err)
+ return
+ }
+
+ // write the Spiderfile
+ if err := ioutil.WriteFile(filepath.Join(spider.Src, "Spiderfile"), []byte(content), os.ModePerm); err != nil {
+ HandleError(http.StatusInternalServerError, c, err)
+ return
+ }
+
+ // process spider files from the deserialized config data
+ if err := services.ProcessSpiderFilesFromConfigData(spider, configData); err != nil {
+ HandleError(http.StatusInternalServerError, c, err)
+ return
+ }
+
+ c.JSON(http.StatusOK, Response{
+ Status: "ok",
+ Message: "success",
+ })
+}
+
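+// PostConfigSpiderConfig updates a configurable spider's config data and
+// regenerates both its Spiderfile and its spider code.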
+func PostConfigSpiderConfig(c *gin.Context) {
+ id := c.Param("id")
+
+ // get the spider
+ var spider model.Spider
+ spider, err := model.GetSpider(bson.ObjectIdHex(id))
+ if err != nil {
+ HandleErrorF(http.StatusBadRequest, c, fmt.Sprintf("cannot find spider (id: %s)", id))
+ return
+ }
+
+ // deserialize the config data
+ var configData entity.ConfigSpiderData
+ if err := c.ShouldBindJSON(&configData); err != nil {
+ HandleError(http.StatusBadRequest, c, err)
+ return
+ }
+
+ // validate configData
+ if err := services.ValidateSpiderfile(configData); err != nil {
+ HandleError(http.StatusInternalServerError, c, err)
+ return
+ }
+
+ // regenerate the Spiderfile
+ if err := services.GenerateSpiderfileFromConfigData(spider, configData); err != nil {
+ HandleError(http.StatusInternalServerError, c, err)
+ return
+ }
+
+ // process spider files from the deserialized config data
+ if err := services.ProcessSpiderFilesFromConfigData(spider, configData); err != nil {
+ HandleError(http.StatusInternalServerError, c, err)
+ return
+ }
+
+ c.JSON(http.StatusOK, Response{
+ Status: "ok",
+ Message: "success",
+ })
+}
+
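+// GetConfigSpiderConfig returns a configurable spider's config data, which
+// GetSpider parses from its Spiderfile.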
+func GetConfigSpiderConfig(c *gin.Context) {
+ id := c.Param("id")
+
+ // validate the id
+ if !bson.IsObjectIdHex(id) {
+ HandleErrorF(http.StatusBadRequest, c, "invalid id")
+ return
+ }
+
+ // get the spider
+ spider, err := model.GetSpider(bson.ObjectIdHex(id))
+ if err != nil {
+ HandleError(http.StatusInternalServerError, c, err)
+ return
+ }
+
+ c.JSON(http.StatusOK, Response{
+ Status: "ok",
+ Message: "success",
+ Data: spider.Config,
+ })
+}
+
+// get the template name list
+func GetConfigSpiderTemplateList(c *gin.Context) {
+ var data []string
+ for _, fInfo := range utils.ListDir("./template/spiderfile") {
+ templateName := strings.Replace(fInfo.Name(), "Spiderfile.", "", -1)
+ data = append(data, templateName)
+ }
+
+ c.JSON(http.StatusOK, Response{
+ Status: "ok",
+ Message: "success",
+ Data: data,
+ })
+}
diff --git a/backend/routes/spider.go b/backend/routes/spider.go
index d351f1bb..588811e3 100644
--- a/backend/routes/spider.go
+++ b/backend/routes/spider.go
@@ -34,7 +34,7 @@ func GetSpiderList(c *gin.Context) {
"name": bson.M{"$regex": bson.RegEx{Pattern: keyword, Options: "im"}},
}
- if t != "" {
+ if t != "" && t != "all" {
filter["type"] = t
}
diff --git a/backend/services/config_spider.go b/backend/services/config_spider.go
index 4e8005a1..7c736cc7 100644
--- a/backend/services/config_spider.go
+++ b/backend/services/config_spider.go
@@ -2,11 +2,20 @@ package services
import (
"crawlab/constants"
+ "crawlab/database"
"crawlab/entity"
"crawlab/model"
"crawlab/model/config_spider"
+ "crawlab/utils"
"errors"
"fmt"
+ "github.com/apex/log"
+ "github.com/globalsign/mgo/bson"
+ uuid "github.com/satori/go.uuid"
+ "github.com/spf13/viper"
+ "gopkg.in/yaml.v2"
+ "os"
+ "path/filepath"
"strings"
)
@@ -37,12 +46,17 @@ func ValidateSpiderfile(configData entity.ConfigSpiderData) error {
// validate start_url exists
if configData.StartUrl == "" {
- return errors.New("spiderfile start_url is empty")
+ return errors.New("spiderfile invalid: start_url is empty")
+ }
+
+ // validate start_stage exists
+ if configData.StartStage == "" {
+ return errors.New("spiderfile invalid: start_stage is empty")
}
// validate stages exist
if len(configData.Stages) == 0 {
- return errors.New("spiderfile stages is empty")
+ return errors.New("spiderfile invalid: stages is empty")
}
// validate stages
@@ -50,56 +64,74 @@ func ValidateSpiderfile(configData entity.ConfigSpiderData) error {
for stageName, stage := range configData.Stages {
// stage name must not be empty
if stageName == "" {
- return errors.New("spiderfile stage name is empty")
+ return errors.New("spiderfile invalid: stage name is empty")
}
// stage name must not be a reserved string
// NOTE: extensible for other engines; defaults to Scrapy
if configData.Engine == "" || configData.Engine == constants.EngineScrapy {
if strings.Contains(constants.ScrapyProtectedStageNames, stageName) {
- return errors.New(fmt.Sprintf("spiderfile stage name '%s' is protected", stageName))
+ return errors.New(fmt.Sprintf("spiderfile invalid: stage name '%s' is protected", stageName))
}
- } else if configData.Engine == constants.EngineColly {
- return errors.New(fmt.Sprintf("engine '%s' is not implemented", stageName))
+ } else {
+ return errors.New(fmt.Sprintf("spiderfile invalid: engine '%s' is not implemented", configData.Engine))
}
// stage names must be unique
if dict[stageName] == 1 {
- return errors.New("spiderfile stage name should be unique")
+ return errors.New(fmt.Sprintf("spiderfile invalid: stage name '%s' is duplicated", stageName))
}
dict[stageName] = 1
// stage fields must not be empty
if len(stage.Fields) == 0 {
- return errors.New(fmt.Sprintf("spiderfile stage '%s' has no fields", stageName))
+ return errors.New(fmt.Sprintf("spiderfile invalid: stage '%s' has no fields", stageName))
}
- // a stage can have only one next stage
+ // whether next_stage is present
hasNextStage := false
+
+ // iterate over the field list
for _, field := range stage.Fields {
+ // a stage can have only one next_stage
if field.NextStage != "" {
if hasNextStage {
- return errors.New("spiderfile stage fields should have only 1 next_stage")
+ return errors.New(fmt.Sprintf("spiderfile invalid: stage '%s' has more than 1 next_stage", stageName))
}
hasNextStage = true
}
+
+ // a field may set only one of css and xpath
+ if field.Css != "" && field.Xpath != "" {
+ return errors.New(fmt.Sprintf("spiderfile invalid: field '%s' in stage '%s' has both css and xpath set which is prohibited", field.Name, stageName))
+ }
+ }
+
+ // a stage may set only one of page_css and page_xpath
+ if stage.PageCss != "" && stage.PageXpath != "" {
+ return errors.New(fmt.Sprintf("spiderfile invalid: stage '%s' has both page_css and page_xpath set which is prohibited", stageName))
+ }
+
+ // a stage may set only one of list_css and list_xpath
+ if stage.ListCss != "" && stage.ListXpath != "" {
+ return errors.New(fmt.Sprintf("spiderfile invalid: stage '%s' has both list_css and list_xpath set which is prohibited", stageName))
}
// error if is_list is true but neither list_css nor list_xpath is set
- if stage.IsList && stage.ListCss == "" {
- return errors.New("spiderfile stage with is_list = true should have list_css being set")
+ if stage.IsList && (stage.ListCss == "" && stage.ListXpath == "") {
+ return errors.New("spiderfile invalid: stage with is_list = true should have either list_css or list_xpath being set")
}
}
// validate field uniqueness
if !IsUniqueConfigSpiderFields(fields) {
- return errors.New("spiderfile fields not unique")
+ return errors.New("spiderfile invalid: fields not unique")
}
// field names must not be reserved strings
for _, field := range fields {
if strings.Contains(constants.ScrapyProtectedFieldNames, field.Name) {
- return errors.New(fmt.Sprintf("spiderfile field name '%s' is protected", field.Name))
+ return errors.New(fmt.Sprintf("spiderfile invalid: field name '%s' is protected", field.Name))
}
}
@@ -116,3 +148,118 @@ func IsUniqueConfigSpiderFields(fields []entity.Field) bool {
}
return true
}
+
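+// ProcessSpiderFilesFromConfigData regenerates a configurable spider from its
+// config data: it clears old files (keeping the Spiderfile), copies the Scrapy
+// template, generates code, zips the result and uploads it to GridFS so worker
+// nodes can sync it.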
+func ProcessSpiderFilesFromConfigData(spider model.Spider, configData entity.ConfigSpiderData) error {
+ spiderDir := spider.Src
+
+ // assign stage names from map keys
+ for stageName, stage := range configData.Stages {
+ stage.Name = stageName
+ configData.Stages[stageName] = stage
+ }
+
+ // delete existing spider files
+ for _, fInfo := range utils.ListDir(spiderDir) {
+ // keep the Spiderfile
+ if fInfo.Name() == "Spiderfile" {
+ continue
+ }
+
+ // delete other files
+ if err := os.RemoveAll(filepath.Join(spiderDir, fInfo.Name())); err != nil {
+ return err
+ }
+ }
+
+ // copy spider template files
+ tplDir := "./template/scrapy"
+ for _, fInfo := range utils.ListDir(tplDir) {
+ // skip the Spiderfile
+ if fInfo.Name() == "Spiderfile" {
+ continue
+ }
+
+ srcPath := filepath.Join(tplDir, fInfo.Name())
+ if fInfo.IsDir() {
+ dirPath := filepath.Join(spiderDir, fInfo.Name())
+ if err := utils.CopyDir(srcPath, dirPath); err != nil {
+ return err
+ }
+ } else {
+ if err := utils.CopyFile(srcPath, filepath.Join(spiderDir, fInfo.Name())); err != nil {
+ return err
+ }
+ }
+ }
+
+ // generate spider code files
+ if err := GenerateConfigSpiderFiles(spider, configData); err != nil {
+ return err
+ }
+
+ // package into a zip file
+ files, err := utils.GetFilesFromDir(spiderDir)
+ if err != nil {
+ return err
+ }
+ randomId := uuid.NewV4()
+ tmpFilePath := filepath.Join(viper.GetString("other.tmppath"), spider.Name+"."+randomId.String()+".zip")
+ spiderZipFileName := spider.Name + ".zip"
+ if err := utils.Compress(files, tmpFilePath); err != nil {
+ return err
+ }
+
+ // get the GridFS instance
+ s, gf := database.GetGridFs("files")
+ defer s.Close()
+
+ // check whether the file already exists
+ var gfFile model.GridFs
+ if err := gf.Find(bson.M{"filename": spiderZipFileName}).One(&gfFile); err == nil {
+ // remove the existing file
+ _ = gf.RemoveId(gfFile.Id)
+ }
+
+ // upload to GridFS
+ fid, err := UploadToGridFs(spiderZipFileName, tmpFilePath)
+ if err != nil {
+ log.Errorf("upload to grid fs error: %s", err.Error())
+ return err
+ }
+
+ // save the spider FileId
+ spider.FileId = fid
+ _ = spider.Save()
+
+ return nil
+}
+
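+// GenerateSpiderfileFromConfigData serializes the config data back to YAML and
+// (over)writes the spider's Spiderfile.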
+func GenerateSpiderfileFromConfigData(spider model.Spider, configData entity.ConfigSpiderData) error {
+ // Spiderfile path
+ sfPath := filepath.Join(spider.Src, "Spiderfile")
+
+ // generate the YAML content
+ sfContentByte, err := yaml.Marshal(configData)
+ if err != nil {
+ return err
+ }
+
+ // open the file
+ var f *os.File
+ if utils.Exists(sfPath) {
+ f, err = os.OpenFile(sfPath, os.O_WRONLY|os.O_TRUNC, 0777)
+ } else {
+ f, err = os.OpenFile(sfPath, os.O_CREATE, 0777)
+ }
+ if err != nil {
+ return err
+ }
+ defer f.Close()
+
+ // write the content
+ if _, err := f.Write(sfContentByte); err != nil {
+ return err
+ }
+
+ return nil
+}
diff --git a/backend/services/spider.go b/backend/services/spider.go
index aa97b4ad..3922d822 100644
--- a/backend/services/spider.go
+++ b/backend/services/spider.go
@@ -116,12 +116,15 @@ func PublishAllSpiders() {
// publish spider
func PublishSpider(spider model.Spider) {
- // query the gf file; if missing, mark the spider file as not existing
- gfFile := model.GetGridFs(spider.FileId)
- if gfFile == nil {
- spider.FileId = constants.ObjectIdNull
- _ = spider.Save()
- return
+ var gfFile *model.GridFs
+ if spider.FileId.Hex() != constants.ObjectIdNull {
+ // query the gf file; if missing, mark the spider file as not existing
+ gfFile = model.GetGridFs(spider.FileId)
+ if gfFile == nil {
+ spider.FileId = constants.ObjectIdNull
+ _ = spider.Save()
+ return
+ }
}
// if FileId is null, the spider has not been uploaded to GridFS yet, so skip
diff --git a/backend/services/spider_handler/spider.go b/backend/services/spider_handler/spider.go
index cce025dc..c3a2500d 100644
--- a/backend/services/spider_handler/spider.go
+++ b/backend/services/spider_handler/spider.go
@@ -10,6 +10,7 @@ import (
"github.com/spf13/viper"
"io"
"os"
+ "os/exec"
"path/filepath"
"runtime/debug"
)
@@ -99,7 +100,6 @@ func (s *SpiderSync) Download() {
// create a temp file
tmpFilePath := filepath.Join(tmpPath, randomId.String()+".zip")
tmpFile := utils.OpenFile(tmpFilePath)
- defer utils.Close(tmpFile)
// write the file into the temp file
if _, err := io.Copy(tmpFile, f); err != nil {
@@ -119,6 +119,15 @@ func (s *SpiderSync) Download() {
return
}
+ // recursively change permissions on the target directory
+ // fixes log file creation failing when LOG_ENABLED and LOG_FILE are enabled in Scrapy settings
+ cmd := exec.Command("chmod", "-R", "777", dstPath)
+ if err := cmd.Run(); err != nil {
+ log.Errorf(err.Error())
+ debug.PrintStack()
+ return
+ }
+
// close the temp file
if err := tmpFile.Close(); err != nil {
log.Errorf(err.Error())
diff --git a/backend/services/task.go b/backend/services/task.go
index 02fa53e7..5886f8f1 100644
--- a/backend/services/task.go
+++ b/backend/services/task.go
@@ -226,12 +226,18 @@ func ExecuteShellCmd(cmdStr string, cwd string, t model.Task, s model.Spider) (e
// environment variable config
envs := s.Envs
if s.Type == constants.Configurable {
+ // database config
envs = append(envs, model.Env{Name: "CRAWLAB_MONGO_HOST", Value: viper.GetString("mongo.host")})
envs = append(envs, model.Env{Name: "CRAWLAB_MONGO_PORT", Value: viper.GetString("mongo.port")})
envs = append(envs, model.Env{Name: "CRAWLAB_MONGO_DB", Value: viper.GetString("mongo.db")})
envs = append(envs, model.Env{Name: "CRAWLAB_MONGO_USERNAME", Value: viper.GetString("mongo.username")})
envs = append(envs, model.Env{Name: "CRAWLAB_MONGO_PASSWORD", Value: viper.GetString("mongo.password")})
envs = append(envs, model.Env{Name: "CRAWLAB_MONGO_AUTHSOURCE", Value: viper.GetString("mongo.authSource")})
+
+ // pass Spiderfile settings to the spider as CRAWLAB_SETTING_* environment variables
+ for envName, envValue := range s.Config.Settings {
+ envs = append(envs, model.Env{Name: "CRAWLAB_SETTING_" + envName, Value: envValue})
+ }
}
cmd = SetEnv(cmd, envs, t.Id, s.Col)
@@ -311,9 +317,12 @@ func SaveTaskResultCount(id string) func() {
// execute the task
func ExecuteTask(id int) {
- if flag, _ := LockList.Load(id); flag.(bool) {
- log.Debugf(GetWorkerPrefix(id) + "task is already running...")
- return
+ if flag, ok := LockList.Load(id); ok {
+ if flag.(bool) {
+ log.Debugf(GetWorkerPrefix(id) + "task is already running...")
+ return
+ }
}
// acquire the lock
@@ -485,6 +494,29 @@ func GetTaskLog(id string) (logStr string, err error) {
}
if IsMasterNode(task.NodeId.Hex()) {
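+ // if the log file does not exist yet, create an empty one and save its path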
+ if !utils.Exists(task.LogPath) {
+ fileDir, err := MakeLogDir(task)
+
+ if err != nil {
+ log.Errorf(err.Error())
+ }
+
+ fileP := GetLogFilePaths(fileDir)
+
+ // create an empty task log file
+ fLog, err := os.Create(fileP)
+ if err != nil {
+ log.Errorf("create task log file error: %s", fileP)
+ debug.PrintStack()
+ }
+ defer fLog.Close()
+ task.LogPath = fileP
+ if err := task.Save(); err != nil {
+ log.Errorf(err.Error())
+ debug.PrintStack()
+ }
+
+ }
// if master node, read the local log
logBytes, err := model.GetLocalLog(task.LogPath)
if err != nil {
diff --git a/backend/template/scrapy/config_spider/settings.py b/backend/template/scrapy/config_spider/settings.py
index a0112373..4b0965f2 100644
--- a/backend/template/scrapy/config_spider/settings.py
+++ b/backend/template/scrapy/config_spider/settings.py
@@ -1,4 +1,7 @@
# -*- coding: utf-8 -*-
+import os
+import re
+import json
# Scrapy settings for config_spider project
#
@@ -9,14 +12,14 @@
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
-BOT_NAME = 'config_spider'
+BOT_NAME = 'Crawlab Configurable Spider'
SPIDER_MODULES = ['config_spider.spiders']
NEWSPIDER_MODULE = 'config_spider.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
-#USER_AGENT = 'config_spider (+http://www.yourdomain.com)'
+USER_AGENT = 'Crawlab Spider'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
@@ -88,3 +91,21 @@ ITEM_PIPELINES = {
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
+
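+# Apply Scrapy setting overrides from CRAWLAB_SETTING_* environment variables
+# injected by the Crawlab task runner. Values are coerced by shape: 'true'/'false'
+# to bool, digit strings to int, '{...}'/'[...]' to JSON; anything else stays a string.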
+for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]:
+ setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '')
+ setting_value = os.environ.get(setting_env_name)
+ if setting_value.lower() == 'true':
+ setting_value = True
+ elif setting_value.lower() == 'false':
+ setting_value = False
+ elif re.search(r'^\d+$', setting_value) is not None:
+ setting_value = int(setting_value)
+ elif re.search(r'^\{.*\}$', setting_value.strip()) is not None:
+ setting_value = json.loads(setting_value)
+ elif re.search(r'^\[.*\]$', setting_value.strip()) is not None:
+ setting_value = json.loads(setting_value)
+ else:
+ pass
+ locals()[setting_name] = setting_value
+
diff --git a/backend/template/spiderfile/Spiderfile.163_news b/backend/template/spiderfile/Spiderfile.163_news
new file mode 100644
index 00000000..29d58279
--- /dev/null
+++ b/backend/template/spiderfile/Spiderfile.163_news
@@ -0,0 +1,20 @@
+version: "0.4.0"
+name: "toscrapy_books"
+start_url: "http://news.163.com/special/0001386F/rank_news.html"
+start_stage: "list"
+engine: "scrapy"
+stages:
+ list:
+ is_list: true
+ list_css: "table tr:not(:first-child)"
+ fields:
+ - name: "title"
+ css: "td:nth-child(1) > a"
+ - name: "url"
+ css: "td:nth-child(1) > a"
+ attr: "href"
+ - name: "clicks"
+ css: "td.cBlue"
+settings:
+ ROBOTSTXT_OBEY: false
+ USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36
diff --git a/backend/template/spiderfile/Spiderfile.baidu b/backend/template/spiderfile/Spiderfile.baidu
new file mode 100644
index 00000000..fbf720e4
--- /dev/null
+++ b/backend/template/spiderfile/Spiderfile.baidu
@@ -0,0 +1,22 @@
+version: 0.4.0
+name: baidu
+start_url: http://www.baidu.com/s?wd=crawlab
+start_stage: list
+engine: scrapy
+stages:
+ list:
+ is_list: true
+ list_xpath: //*[contains(@class, "c-container")]
+ page_xpath: //*[@id="page"]//a[@class="n"][last()]
+ page_attr: href
+ fields:
+ - name: title
+ xpath: .//h3/a
+ - name: url
+ xpath: .//h3/a
+ attr: href
+ - name: abstract
+ xpath: .//*[@class="c-abstract"]
+settings:
+ ROBOTSTXT_OBEY: false
+ USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36
diff --git a/backend/template/Spiderfile b/backend/template/spiderfile/Spiderfile.toscrapy_books
similarity index 83%
rename from backend/template/Spiderfile
rename to backend/template/spiderfile/Spiderfile.toscrapy_books
index 8d0e05cf..4bf18f61 100644
--- a/backend/template/Spiderfile
+++ b/backend/template/spiderfile/Spiderfile.toscrapy_books
@@ -5,10 +5,10 @@ start_stage: "list"
engine: "scrapy"
stages:
list:
- is_list: true # default: false
+ is_list: true
list_css: "section article.product_pod"
page_css: "ul.pager li.next a"
- page_attr: "href" # default: href
+ page_attr: "href"
fields:
- name: "title"
css: "h3 > a"
@@ -23,3 +23,6 @@ stages:
fields:
- name: "description"
css: "#product_description + p"
+settings:
+ ROBOTSTXT_OBEY: true
+ AUTOTHROTTLE_ENABLED: true
diff --git a/backend/utils/file.go b/backend/utils/file.go
index 2dacc9ed..c71b2cb0 100644
--- a/backend/utils/file.go
+++ b/backend/utils/file.go
@@ -167,7 +167,6 @@ func DeCompress(srcFile *os.File, dstPath string) error {
debug.PrintStack()
continue
}
- defer Close(newFile)
// copy the file into the new file
if _, err := io.Copy(newFile, srcFile); err != nil {
diff --git a/frontend/package.json b/frontend/package.json
index 5f19fd7b..724b5e36 100644
--- a/frontend/package.json
+++ b/frontend/package.json
@@ -23,7 +23,7 @@
"cross-env": "^5.2.0",
"dayjs": "^1.8.6",
"echarts": "^4.1.0",
- "element-ui": "2.4.6",
+ "element-ui": "2.13.0",
"font-awesome": "^4.7.0",
"js-cookie": "2.2.0",
"normalize.css": "7.0.0",
diff --git a/frontend/src/components/Common/CrawlConfirmDialog.vue b/frontend/src/components/Common/CrawlConfirmDialog.vue
index 2286beb2..f2ad70c2 100644
--- a/frontend/src/components/Common/CrawlConfirmDialog.vue
+++ b/frontend/src/components/Common/CrawlConfirmDialog.vue
@@ -2,13 +2,21 @@
+
+
+
+
+
+