Merge remote-tracking branch 'upstream/develop' into develop

陈景阳 committed on 2019-12-05 07:15:07 +08:00
41 changed files with 2260 additions and 519 deletions

View File

@@ -15,7 +15,7 @@ redis:
log:
level: info
path: "/var/logs/crawlab"
isDeletePeriodically: "Y"
isDeletePeriodically: "N"
deleteFrequency: "@hourly"
server:
host: 0.0.0.0

View File

@@ -3,15 +3,15 @@ package entity
import "strconv"
type Page struct {
Skip int
Limit int
PageNum int
Skip int
Limit int
PageNum int
PageSize int
}
func (p *Page)GetPage(pageNum string, pageSize string) {
func (p *Page) GetPage(pageNum string, pageSize string) {
p.PageNum, _ = strconv.Atoi(pageNum)
p.PageSize, _ = strconv.Atoi(pageSize)
p.Skip = p.PageSize * (p.PageNum - 1)
p.Limit = p.PageSize
}
}

View File

@@ -1,25 +1,30 @@
package entity
type ConfigSpiderData struct {
Version string `yaml:"version" json:"version"`
Engine string `yaml:"engine" json:"engine"`
StartUrl string `yaml:"start_url" json:"start_url"`
StartStage string `yaml:"start_stage" json:"start_stage"`
Stages map[string]Stage `yaml:"stages" json:"stages"`
Settings map[string]string `yaml:"settings" json:"settings"`
}
type Stage struct {
Name string `yaml:"name" json:"name"`
IsList bool `yaml:"is_list" json:"is_list"`
ListCss string `yaml:"list_css" json:"list_css"`
ListXpath string `yaml:"list_xpath" json:"list_xpath"`
PageCss string `yaml:"page_css" json:"page_css"`
PageXpath string `yaml:"page_xpath" json:"page_xpath"`
PageAttr string `yaml:"page_attr" json:"page_attr"`
Fields []Field `yaml:"fields" json:"fields"`
}
type Field struct {
Name string `yaml:"name" json:"name"`
Css string `yaml:"css" json:"css"`
Xpath string `yaml:"xpath" json:"xpath"`
Attr string `yaml:"attr" json:"attr"`
NextStage string `yaml:"next_stage" json:"next_stage"`
}
type Stage struct {
IsList bool `yaml:"is_list" json:"is_list"`
ListCss string `yaml:"list_css" json:"list_css"`
PageCss string `yaml:"page_css" json:"page_css"`
PageAttr string `yaml:"page_attr" json:"page_attr"`
Fields []Field `yaml:"fields" json:"fields"`
}
type ConfigSpiderData struct {
Version string `yaml:"version" json:"version"`
Engine string `yaml:"engine" json:"engine"`
StartUrl string `yaml:"start_url" json:"start_url"`
StartStage string `yaml:"start_stage" json:"start_stage"`
Stages map[string]Stage `yaml:"stages" json:"stages"`
Remark string `yaml:"remark" json:"remark"`
}

View File

@@ -47,6 +47,8 @@ func main() {
panic(err)
}
log.Info("初始化定期清理日志配置成功")
}else {
log.Info("默认未开启定期清理日志配置")
}
// 初始化Mongodb数据库
@@ -140,9 +142,13 @@ func main() {
authGroup.GET("/spiders/:id/stats", routes.GetSpiderStats) // 爬虫统计数据
authGroup.GET("/spider/types", routes.GetSpiderTypes) // 爬虫类型
// 可配置爬虫
authGroup.PUT("/config_spiders", routes.PutConfigSpider) // 添加可配置爬虫
authGroup.POST("/config_spiders/:id", routes.PostConfigSpider) // 改可配置爬虫
authGroup.POST("/config_spiders/:id/upload", routes.UploadConfigSpider) // 上传可配置爬虫
authGroup.GET("/config_spiders/:id/config", routes.GetConfigSpiderConfig) // 获取可配置爬虫配置
authGroup.POST("/config_spiders/:id/config", routes.PostConfigSpiderConfig) // 改可配置爬虫配置
authGroup.PUT("/config_spiders", routes.PutConfigSpider) // 添加可配置爬虫
authGroup.POST("/config_spiders/:id", routes.PostConfigSpider) // 修改可配置爬虫
authGroup.POST("/config_spiders/:id/upload", routes.UploadConfigSpider) // 上传可配置爬虫
authGroup.POST("/config_spiders/:id/spiderfile", routes.PostConfigSpiderSpiderfile) // 上传可配置爬虫
authGroup.GET("/config_spiders_templates", routes.GetConfigSpiderTemplateList) // 获取可配置爬虫模版列表
// 任务
authGroup.GET("/tasks", routes.GetTaskList) // 任务列表
authGroup.GET("/tasks/:id", routes.GetTask) // 任务详情

View File

@@ -42,12 +42,12 @@ func init() {
app.DELETE("/tasks/:id", DeleteTask) // 删除任务
app.GET("/tasks/:id/results", GetTaskResults) // 任务结果
app.GET("/tasks/:id/results/download", DownloadTaskResultsCsv) // 下载任务结果
app.GET("/spiders", GetSpiderList) // 爬虫列表
app.GET("/spiders/:id", GetSpider) // 爬虫详情
app.POST("/spiders/:id", PostSpider) // 修改爬虫
app.DELETE("/spiders/:id",DeleteSpider) // 删除爬虫
app.GET("/spiders/:id/tasks",GetSpiderTasks) // 爬虫任务列表
app.GET("/spiders/:id/dir",GetSpiderDir) // 爬虫目录
app.GET("/spiders", GetSpiderList) // 爬虫列表
app.GET("/spiders/:id", GetSpider) // 爬虫详情
app.POST("/spiders/:id", PostSpider) // 修改爬虫
app.DELETE("/spiders/:id", DeleteSpider) // 删除爬虫
app.GET("/spiders/:id/tasks", GetSpiderTasks) // 爬虫任务列表
app.GET("/spiders/:id/dir", GetSpiderDir) // 爬虫目录
}
//mock test, test data in ./mock

View File

@@ -6,8 +6,6 @@ import (
"net/http"
)
var taskDailyItems = []model.TaskDailyItem{
{
Date: "2019/08/19",

View File

@@ -1 +1 @@
package mock
package mock

View File

@@ -1 +1 @@
package mock
package mock

View File

@@ -131,12 +131,7 @@ func (g ScrapyGenerator) GetNonListParserString(stageName string, stage entity.S
// iterate over the field list
for _, f := range stage.Fields {
line := ""
if f.Attr == "" {
line += fmt.Sprintf(`item['%s'] = response.css('%s::text').extract_first()`, f.Name, f.Css)
} else {
line += fmt.Sprintf(`item['%s'] = response.css('%s::attr("%s")').extract_first()`, f.Name, f.Css, f.Attr)
}
line := fmt.Sprintf(`item['%s'] = response.%s.extract_first()`, f.Name, g.GetExtractStringFromField(f))
line = g.PadCode(line, 2)
str += line
}
@@ -163,19 +158,14 @@ func (g ScrapyGenerator) GetListParserString(stageName string, stage entity.Stag
str += g.PadCode(`prev_item = response.meta.get('item')`, 2)
// for-loop over the list elements
str += g.PadCode(fmt.Sprintf(`for elem in response.css('%s'):`, stage.ListCss), 2)
str += g.PadCode(fmt.Sprintf(`for elem in response.%s:`, g.GetListString(stage)), 2)
// construct the item
str += g.PadCode(`item = Item()`, 3)
// iterate over the field list
for _, f := range stage.Fields {
line := ""
if f.Attr == "" {
line += fmt.Sprintf(`item['%s'] = elem.css('%s::text').extract_first()`, f.Name, f.Css)
} else {
line += fmt.Sprintf(`item['%s'] = elem.css('%s::attr("%s")').extract_first()`, f.Name, f.Css, f.Attr)
}
line := fmt.Sprintf(`item['%s'] = elem.%s.extract_first()`, f.Name, g.GetExtractStringFromField(f))
line = g.PadCode(line, 3)
str += line
}
@@ -195,15 +185,9 @@ func (g ScrapyGenerator) GetListParserString(stageName string, stage entity.Stag
}
// pagination
if stage.PageCss != "" {
// pagination element attribute, defaults to href
pageAttr := "href"
if stage.PageAttr != "" {
pageAttr = stage.PageAttr
}
str += g.PadCode(fmt.Sprintf(`next_url = response.css('%s::attr("%s")').extract_first()`, stage.PageCss, pageAttr), 2)
str += g.PadCode(fmt.Sprintf(`yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_%s, meta={'item': item})`, stageName), 2)
if stage.PageCss != "" || stage.PageXpath != "" {
str += g.PadCode(fmt.Sprintf(`next_url = response.%s.extract_first()`, g.GetExtractStringFromStage(stage)), 2)
str += g.PadCode(fmt.Sprintf(`yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_%s, meta={'item': prev_item})`, stageName), 2)
}
// append a trailing newline
@@ -226,3 +210,49 @@ func (g ScrapyGenerator) GetNextStageField(stage entity.Stage) (entity.Field, er
}
return entity.Field{}, errors.New("cannot find next stage field")
}
func (g ScrapyGenerator) GetExtractStringFromField(f entity.Field) string {
if f.Css != "" {
// CSS selector
if f.Attr == "" {
// text
return fmt.Sprintf(`css('%s::text')`, f.Css)
} else {
// attribute
return fmt.Sprintf(`css('%s::attr("%s")')`, f.Css, f.Attr)
}
} else {
// XPath selector
if f.Attr == "" {
// text
return fmt.Sprintf(`xpath('string(%s)')`, f.Xpath)
} else {
// attribute
return fmt.Sprintf(`xpath('%s/@%s')`, f.Xpath, f.Attr)
}
}
}
func (g ScrapyGenerator) GetExtractStringFromStage(stage entity.Stage) string {
// pagination element attribute, defaults to href
pageAttr := "href"
if stage.PageAttr != "" {
pageAttr = stage.PageAttr
}
if stage.PageCss != "" {
// CSS selector
return fmt.Sprintf(`css('%s::attr("%s")')`, stage.PageCss, pageAttr)
} else {
// XPath selector
return fmt.Sprintf(`xpath('%s/@%s')`, stage.PageXpath, pageAttr)
}
}
func (g ScrapyGenerator) GetListString(stage entity.Stage) string {
if stage.ListCss != "" {
return fmt.Sprintf(`css('%s')`, stage.ListCss)
} else {
return fmt.Sprintf(`xpath('%s')`, stage.ListXpath)
}
}
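
For reference, the selector mapping introduced above can be illustrated with a small standalone sketch. It uses a simplified local Field type rather than the actual crawlab entity package, and reproduces the css/xpath/attr branching of GetExtractStringFromField:

package main

import "fmt"

// Field mirrors the css/xpath/attr options of entity.Field (simplified for illustration).
type Field struct {
	Name  string
	Css   string
	Xpath string
	Attr  string
}

// extractString follows the same branching as GetExtractStringFromField above.
func extractString(f Field) string {
	if f.Css != "" {
		if f.Attr == "" {
			return fmt.Sprintf(`css('%s::text')`, f.Css)
		}
		return fmt.Sprintf(`css('%s::attr("%s")')`, f.Css, f.Attr)
	}
	if f.Attr == "" {
		return fmt.Sprintf(`xpath('string(%s)')`, f.Xpath)
	}
	return fmt.Sprintf(`xpath('%s/@%s')`, f.Xpath, f.Attr)
}

func main() {
	fmt.Println(extractString(Field{Name: "title", Css: "h3 > a"}))             // css('h3 > a::text')
	fmt.Println(extractString(Field{Name: "url", Css: "h3 > a", Attr: "href"})) // css('h3 > a::attr("href")')
	fmt.Println(extractString(Field{Name: "title", Xpath: ".//h3/a"}))          // xpath('string(.//h3/a)')
}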

View File

@@ -1,11 +1,17 @@
package model
import (
"crawlab/constants"
"crawlab/database"
"crawlab/entity"
"crawlab/utils"
"errors"
"github.com/apex/log"
"github.com/globalsign/mgo"
"github.com/globalsign/mgo/bson"
"gopkg.in/yaml.v2"
"io/ioutil"
"path/filepath"
"runtime/debug"
"time"
)
@@ -25,14 +31,18 @@ type Spider struct {
Site string `json:"site" bson:"site"` // spider website
Envs []Env `json:"envs" bson:"envs"` // environment variables
Remark string `json:"remark" bson:"remark"` // remark
Src string `json:"src" bson:"src"` // source code path
// customized spider
Src string `json:"src" bson:"src"` // source code path
Cmd string `json:"cmd" bson:"cmd"` // execution command
// configurable spider
Template string `json:"template" bson:"template"` // Spiderfile template
// frontend display
LastRunTs time.Time `json:"last_run_ts"` // last run time
LastStatus string `json:"last_status"` // last run status
LastRunTs time.Time `json:"last_run_ts"` // last run time
LastStatus string `json:"last_status"` // last run status
Config entity.ConfigSpiderData `json:"config"` // configurable spider config
// timestamps
CreateTs time.Time `json:"create_ts" bson:"create_ts"`
@@ -108,6 +118,10 @@ func GetSpiderList(filter interface{}, skip int, limit int) ([]Spider, int, erro
return spiders, 0, err
}
if spiders == nil {
spiders = []Spider{}
}
// iterate over the spider list
for i, spider := range spiders {
// get the last task
@@ -161,15 +175,25 @@ func GetSpider(id bson.ObjectId) (Spider, error) {
s, c := database.GetCol("spiders")
defer s.Close()
var result Spider
if err := c.FindId(id).One(&result); err != nil {
// get the spider
var spider Spider
if err := c.FindId(id).One(&spider); err != nil {
if err != mgo.ErrNotFound {
log.Errorf("get spider error: %s, id: %id", err.Error(), id.Hex())
debug.PrintStack()
}
return result, err
return spider, err
}
return result, nil
// if it is a configurable spider, load its config
if spider.Type == constants.Configurable && utils.Exists(filepath.Join(spider.Src, "Spiderfile")) {
config, err := GetConfigSpiderData(spider)
if err != nil {
return spider, err
}
spider.Config = config
}
return spider, nil
}
// update spider
@@ -209,10 +233,12 @@ func RemoveSpider(id bson.ObjectId) error {
s, gf := database.GetGridFs("files")
defer s.Close()
if err := gf.RemoveId(result.FileId); err != nil {
log.Error("remove file error, id:" + result.FileId.Hex())
debug.PrintStack()
return err
if result.FileId.Hex() != constants.ObjectIdNull {
if err := gf.RemoveId(result.FileId); err != nil {
log.Error("remove file error, id:" + result.FileId.Hex())
debug.PrintStack()
return err
}
}
return nil
@@ -269,3 +295,35 @@ func GetSpiderTypes() ([]*entity.SpiderType, error) {
return types, nil
}
func GetConfigSpiderData(spider Spider) (entity.ConfigSpiderData, error) {
// construct the config data
configData := entity.ConfigSpiderData{}
// validate the spider type
if spider.Type != constants.Configurable {
return configData, errors.New("not a configurable spider")
}
// Spiderfile path
sfPath := filepath.Join(spider.Src, "Spiderfile")
// read the YAML file
yamlFile, err := ioutil.ReadFile(sfPath)
if err != nil {
return configData, err
}
// deserialize
if err := yaml.Unmarshal(yamlFile, &configData); err != nil {
return configData, err
}
// assign stage_name
for stageName, stage := range configData.Stages {
stage.Name = stageName
configData.Stages[stageName] = stage
}
return configData, nil
}

View File

@@ -2,16 +2,13 @@ package routes
import (
"crawlab/constants"
"crawlab/database"
"crawlab/entity"
"crawlab/model"
"crawlab/services"
"crawlab/utils"
"fmt"
"github.com/apex/log"
"github.com/gin-gonic/gin"
"github.com/globalsign/mgo/bson"
uuid "github.com/satori/go.uuid"
"github.com/spf13/viper"
"gopkg.in/yaml.v2"
"io"
@@ -19,7 +16,7 @@ import (
"net/http"
"os"
"path/filepath"
"runtime/debug"
"strings"
)
// add a configurable spider
@@ -36,6 +33,12 @@ func PutConfigSpider(c *gin.Context) {
return
}
// template name must not be empty
if spider.Template == "" {
HandleErrorF(http.StatusBadRequest, c, "spider template should not be empty")
return
}
// check whether the spider already exists
if spider := model.GetSpiderByName(spider.Name); spider != nil {
HandleErrorF(http.StatusBadRequest, c, fmt.Sprintf("spider for '%s' already exists", spider.Name))
@@ -62,6 +65,23 @@ func PutConfigSpider(c *gin.Context) {
}
spider.Src = spiderDir
// copy the Spiderfile template
contentByte, err := ioutil.ReadFile("./template/spiderfile/Spiderfile." + spider.Template)
if err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
f, err := os.Create(filepath.Join(spider.Src, "Spiderfile"))
if err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
defer f.Close()
if _, err := f.Write(contentByte); err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
// add the spider to the database
if err := spider.Add(); err != nil {
HandleError(http.StatusInternalServerError, c, err)
@@ -100,8 +120,8 @@ func UploadConfigSpider(c *gin.Context) {
// file name must be Spiderfile
filename := header.Filename
if filename != "Spiderfile" {
HandleErrorF(http.StatusBadRequest, c, "filename must be 'Spiderfile'")
if filename != "Spiderfile" && filename != "Spiderfile.yaml" && filename != "Spiderfile.yml" {
HandleErrorF(http.StatusBadRequest, c, "filename must be 'Spiderfile(.yaml|.yml)'")
return
}
@@ -151,88 +171,146 @@ func UploadConfigSpider(c *gin.Context) {
return
}
// delete existing spider files
for _, fInfo := range utils.ListDir(spiderDir) {
// do not delete the Spiderfile
if fInfo.Name() == filename {
continue
}
// delete other files
if err := os.RemoveAll(filepath.Join(spiderDir, fInfo.Name())); err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
}
// copy spider files
tplDir := "./template/scrapy"
for _, fInfo := range utils.ListDir(tplDir) {
// skip the Spiderfile
if fInfo.Name() == "Spiderfile" {
continue
}
srcPath := filepath.Join(tplDir, fInfo.Name())
if fInfo.IsDir() {
dirPath := filepath.Join(spiderDir, fInfo.Name())
if err := utils.CopyDir(srcPath, dirPath); err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
} else {
if err := utils.CopyFile(srcPath, filepath.Join(spiderDir, fInfo.Name())); err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
}
}
// modify spider files
if err := services.GenerateConfigSpiderFiles(spider, configData); err != nil {
// process spider files based on the deserialized config data
if err := services.ProcessSpiderFilesFromConfigData(spider, configData); err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
// package into a zip file
files, err := utils.GetFilesFromDir(spiderDir)
if err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
randomId := uuid.NewV4()
tmpFilePath := filepath.Join(viper.GetString("other.tmppath"), spider.Name+"."+randomId.String()+".zip")
spiderZipFileName := spider.Name + ".zip"
if err := utils.Compress(files, tmpFilePath); err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
// get the GridFS instance
s, gf := database.GetGridFs("files")
defer s.Close()
// check whether the file already exists
var gfFile model.GridFs
if err := gf.Find(bson.M{"filename": spiderZipFileName}).One(&gfFile); err == nil {
// if the file already exists, delete it
_ = gf.RemoveId(gfFile.Id)
}
// upload to GridFS
fid, err := services.UploadToGridFs(spiderZipFileName, tmpFilePath)
if err != nil {
log.Errorf("upload to grid fs error: %s", err.Error())
debug.PrintStack()
return
}
// save the spider FileId
spider.FileId = fid
_ = spider.Save()
c.JSON(http.StatusOK, Response{
Status: "ok",
Message: "success",
})
}
func PostConfigSpiderSpiderfile(c *gin.Context) {
type Body struct {
Content string `json:"content"`
}
id := c.Param("id")
// file content
var reqBody Body
if err := c.ShouldBindJSON(&reqBody); err != nil {
HandleError(http.StatusBadRequest, c, err)
return
}
content := reqBody.Content
// get the spider
var spider model.Spider
spider, err := model.GetSpider(bson.ObjectIdHex(id))
if err != nil {
HandleErrorF(http.StatusBadRequest, c, fmt.Sprintf("cannot find spider (id: %s)", id))
return
}
// deserialize
var configData entity.ConfigSpiderData
if err := yaml.Unmarshal([]byte(content), &configData); err != nil {
HandleError(http.StatusBadRequest, c, err)
return
}
// validate configData
if err := services.ValidateSpiderfile(configData); err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
// write the file
if err := ioutil.WriteFile(filepath.Join(spider.Src, "Spiderfile"), []byte(content), os.ModePerm); err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
// process spider files based on the deserialized config data
if err := services.ProcessSpiderFilesFromConfigData(spider, configData); err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
c.JSON(http.StatusOK, Response{
Status: "ok",
Message: "success",
})
}
func PostConfigSpiderConfig(c *gin.Context) {
id := c.Param("id")
// get the spider
var spider model.Spider
spider, err := model.GetSpider(bson.ObjectIdHex(id))
if err != nil {
HandleErrorF(http.StatusBadRequest, c, fmt.Sprintf("cannot find spider (id: %s)", id))
return
}
// deserialize the config data
var configData entity.ConfigSpiderData
if err := c.ShouldBindJSON(&configData); err != nil {
HandleError(http.StatusBadRequest, c, err)
return
}
// validate configData
if err := services.ValidateSpiderfile(configData); err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
// replace the Spiderfile
if err := services.GenerateSpiderfileFromConfigData(spider, configData); err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
// process spider files based on the deserialized config data
if err := services.ProcessSpiderFilesFromConfigData(spider, configData); err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
c.JSON(http.StatusOK, Response{
Status: "ok",
Message: "success",
})
}
func GetConfigSpiderConfig(c *gin.Context) {
id := c.Param("id")
// validate the ID
if !bson.IsObjectIdHex(id) {
HandleErrorF(http.StatusBadRequest, c, "invalid id")
}
// get the spider
spider, err := model.GetSpider(bson.ObjectIdHex(id))
if err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
c.JSON(http.StatusOK, Response{
Status: "ok",
Message: "success",
Data: spider.Config,
})
}
// get the template name list
func GetConfigSpiderTemplateList(c *gin.Context) {
var data []string
for _, fInfo := range utils.ListDir("./template/spiderfile") {
templateName := strings.Replace(fInfo.Name(), "Spiderfile.", "", -1)
data = append(data, templateName)
}
c.JSON(http.StatusOK, Response{
Status: "ok",
Message: "success",
Data: data,
})
}
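
As a rough illustration of how the new Spiderfile endpoint might be called, here is a minimal Go client sketch; the base URL, spider id, token and YAML content are placeholders and depend on the deployment (the route sits behind the authenticated group registered in main.go):

package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

func main() {
	// Placeholder values: adjust host/prefix, spider id and token to your deployment.
	url := "http://localhost:8000/config_spiders/5de7c0f7e0c0a10010e0b001/spiderfile"
	payload, err := json.Marshal(map[string]string{
		"content": "version: \"0.4.0\"\nengine: scrapy\nstart_url: http://books.toscrape.com\nstart_stage: list\n",
	})
	if err != nil {
		panic(err)
	}
	req, err := http.NewRequest(http.MethodPost, url, bytes.NewReader(payload))
	if err != nil {
		panic(err)
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Authorization", "<token>") // placeholder auth token
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	fmt.Println(resp.Status)
}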

View File

@@ -34,7 +34,7 @@ func GetSpiderList(c *gin.Context) {
"name": bson.M{"$regex": bson.RegEx{Pattern: keyword, Options: "im"}},
}
if t != "" {
if t != "" && t != "all" {
filter["type"] = t
}

View File

@@ -2,11 +2,20 @@ package services
import (
"crawlab/constants"
"crawlab/database"
"crawlab/entity"
"crawlab/model"
"crawlab/model/config_spider"
"crawlab/utils"
"errors"
"fmt"
"github.com/apex/log"
"github.com/globalsign/mgo/bson"
uuid "github.com/satori/go.uuid"
"github.com/spf13/viper"
"gopkg.in/yaml.v2"
"os"
"path/filepath"
"strings"
)
@@ -37,12 +46,17 @@ func ValidateSpiderfile(configData entity.ConfigSpiderData) error {
// validate that start_url exists
if configData.StartUrl == "" {
return errors.New("spiderfile start_url is empty")
return errors.New("spiderfile invalid: start_url is empty")
}
// validate that start_stage exists
if configData.StartStage == "" {
return errors.New("spiderfile invalid: start_stage is empty")
}
// validate that stages exist
if len(configData.Stages) == 0 {
return errors.New("spiderfile stages is empty")
return errors.New("spiderfile invalid: stages is empty")
}
// validate stages
@@ -50,56 +64,74 @@ func ValidateSpiderfile(configData entity.ConfigSpiderData) error {
for stageName, stage := range configData.Stages {
// stage name must not be empty
if stageName == "" {
return errors.New("spiderfile stage name is empty")
return errors.New("spiderfile invalid: stage name is empty")
}
// stage name must not be a reserved string
// NOTE: other engines can be added later; the default is Scrapy
if configData.Engine == "" || configData.Engine == constants.EngineScrapy {
if strings.Contains(constants.ScrapyProtectedStageNames, stageName) {
return errors.New(fmt.Sprintf("spiderfile stage name '%s' is protected", stageName))
return errors.New(fmt.Sprintf("spiderfile invalid: stage name '%s' is protected", stageName))
}
} else if configData.Engine == constants.EngineColly {
return errors.New(fmt.Sprintf("engine '%s' is not implemented", stageName))
} else {
return errors.New(fmt.Sprintf("spiderfile invalid: engine '%s' is not implemented", configData.Engine))
}
// stage names must not be duplicated
if dict[stageName] == 1 {
return errors.New("spiderfile stage name should be unique")
return errors.New(fmt.Sprintf("spiderfile invalid: stage name '%s' is duplicated", stageName))
}
dict[stageName] = 1
// stage fields must not be empty
if len(stage.Fields) == 0 {
return errors.New(fmt.Sprintf("spiderfile stage '%s' has no fields", stageName))
return errors.New(fmt.Sprintf("spiderfile invalid: stage '%s' has no fields", stageName))
}
// a stage may have only one next stage
// whether a next_stage has been seen
hasNextStage := false
// iterate over the field list
for _, field := range stage.Fields {
// a stage may have only one next_stage
if field.NextStage != "" {
if hasNextStage {
return errors.New("spiderfile stage fields should have only 1 next_stage")
return errors.New(fmt.Sprintf("spiderfile invalid: stage '%s' has more than 1 next_stage", stageName))
}
hasNextStage = true
}
// a field may set only one of css and xpath
if field.Css != "" && field.Xpath != "" {
return errors.New(fmt.Sprintf("spiderfile invalid: field '%s' in stage '%s' has both css and xpath set which is prohibited", field.Name, stageName))
}
}
// a stage may set only one of page_css and page_xpath
if stage.PageCss != "" && stage.PageXpath != "" {
return errors.New(fmt.Sprintf("spiderfile invalid: stage '%s' has both page_css and page_xpath set which is prohibited", stageName))
}
// a stage may set only one of list_css and list_xpath
if stage.ListCss != "" && stage.ListXpath != "" {
return errors.New(fmt.Sprintf("spiderfile invalid: stage '%s' has both list_css and list_xpath set which is prohibited", stageName))
}
// error if is_list is true but no list selector is set
if stage.IsList && stage.ListCss == "" {
return errors.New("spiderfile stage with is_list = true should have list_css being set")
if stage.IsList && (stage.ListCss == "" && stage.ListXpath == "") {
return errors.New("spiderfile invalid: stage with is_list = true should have either list_css or list_xpath being set")
}
}
// validate field uniqueness
if !IsUniqueConfigSpiderFields(fields) {
return errors.New("spiderfile fields not unique")
return errors.New("spiderfile invalid: fields not unique")
}
// field names must not be reserved strings
for _, field := range fields {
if strings.Contains(constants.ScrapyProtectedFieldNames, field.Name) {
return errors.New(fmt.Sprintf("spiderfile field name '%s' is protected", field.Name))
return errors.New(fmt.Sprintf("spiderfile invalid: field name '%s' is protected", field.Name))
}
}
@@ -116,3 +148,118 @@ func IsUniqueConfigSpiderFields(fields []entity.Field) bool {
}
return true
}
func ProcessSpiderFilesFromConfigData(spider model.Spider, configData entity.ConfigSpiderData) error {
spiderDir := spider.Src
// assign stage_name
for stageName, stage := range configData.Stages {
stage.Name = stageName
configData.Stages[stageName] = stage
}
// delete existing spider files
for _, fInfo := range utils.ListDir(spiderDir) {
// do not delete the Spiderfile
if fInfo.Name() == "Spiderfile" {
continue
}
// delete other files
if err := os.RemoveAll(filepath.Join(spiderDir, fInfo.Name())); err != nil {
return err
}
}
// copy spider files
tplDir := "./template/scrapy"
for _, fInfo := range utils.ListDir(tplDir) {
// skip the Spiderfile
if fInfo.Name() == "Spiderfile" {
continue
}
srcPath := filepath.Join(tplDir, fInfo.Name())
if fInfo.IsDir() {
dirPath := filepath.Join(spiderDir, fInfo.Name())
if err := utils.CopyDir(srcPath, dirPath); err != nil {
return err
}
} else {
if err := utils.CopyFile(srcPath, filepath.Join(spiderDir, fInfo.Name())); err != nil {
return err
}
}
}
// modify spider files
if err := GenerateConfigSpiderFiles(spider, configData); err != nil {
return err
}
// package into a zip file
files, err := utils.GetFilesFromDir(spiderDir)
if err != nil {
return err
}
randomId := uuid.NewV4()
tmpFilePath := filepath.Join(viper.GetString("other.tmppath"), spider.Name+"."+randomId.String()+".zip")
spiderZipFileName := spider.Name + ".zip"
if err := utils.Compress(files, tmpFilePath); err != nil {
return err
}
// get the GridFS instance
s, gf := database.GetGridFs("files")
defer s.Close()
// check whether the file already exists
var gfFile model.GridFs
if err := gf.Find(bson.M{"filename": spiderZipFileName}).One(&gfFile); err == nil {
// if the file already exists, delete it
_ = gf.RemoveId(gfFile.Id)
}
// upload to GridFS
fid, err := UploadToGridFs(spiderZipFileName, tmpFilePath)
if err != nil {
log.Errorf("upload to grid fs error: %s", err.Error())
return err
}
// save the spider FileId
spider.FileId = fid
_ = spider.Save()
return nil
}
func GenerateSpiderfileFromConfigData(spider model.Spider, configData entity.ConfigSpiderData) error {
// Spiderfile path
sfPath := filepath.Join(spider.Src, "Spiderfile")
// generate the YAML content
sfContentByte, err := yaml.Marshal(configData)
if err != nil {
return err
}
// open the file
var f *os.File
if utils.Exists(sfPath) {
f, err = os.OpenFile(sfPath, os.O_WRONLY|os.O_TRUNC, 0777)
} else {
f, err = os.OpenFile(sfPath, os.O_CREATE, 0777)
}
if err != nil {
return err
}
defer f.Close()
// write the content
if _, err := f.Write(sfContentByte); err != nil {
return err
}
return nil
}
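
A minimal sketch of the YAML round trip that GenerateSpiderfileFromConfigData relies on, using simplified stand-in types with the same yaml tags rather than the project's entity package:

package main

import (
	"fmt"

	"gopkg.in/yaml.v2"
)

// Simplified stand-ins for entity.ConfigSpiderData / entity.Stage / entity.Field.
type Field struct {
	Name string `yaml:"name"`
	Css  string `yaml:"css,omitempty"`
	Attr string `yaml:"attr,omitempty"`
}

type Stage struct {
	IsList  bool    `yaml:"is_list"`
	ListCss string  `yaml:"list_css"`
	Fields  []Field `yaml:"fields"`
}

type ConfigSpiderData struct {
	Version    string           `yaml:"version"`
	Engine     string           `yaml:"engine"`
	StartUrl   string           `yaml:"start_url"`
	StartStage string           `yaml:"start_stage"`
	Stages     map[string]Stage `yaml:"stages"`
}

func main() {
	data := ConfigSpiderData{
		Version:    "0.4.0",
		Engine:     "scrapy",
		StartUrl:   "http://books.toscrape.com",
		StartStage: "list",
		Stages: map[string]Stage{
			"list": {
				IsList:  true,
				ListCss: "section article.product_pod",
				Fields:  []Field{{Name: "title", Css: "h3 > a"}},
			},
		},
	}
	// yaml.Marshal is the same call used when writing the Spiderfile content.
	out, err := yaml.Marshal(data)
	if err != nil {
		panic(err)
	}
	fmt.Print(string(out))
}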

View File

@@ -116,12 +116,15 @@ func PublishAllSpiders() {
// publish a spider
func PublishSpider(spider model.Spider) {
// if the GridFS file cannot be found, mark the spider file as missing
gfFile := model.GetGridFs(spider.FileId)
if gfFile == nil {
spider.FileId = constants.ObjectIdNull
_ = spider.Save()
return
var gfFile *model.GridFs
if spider.FileId.Hex() != constants.ObjectIdNull {
// if the GridFS file cannot be found, mark the spider file as missing
gfFile = model.GetGridFs(spider.FileId)
if gfFile == nil {
spider.FileId = constants.ObjectIdNull
_ = spider.Save()
return
}
}
// if FileId is empty, the spider has not been uploaded to GridFS yet, so skip it

View File

@@ -10,6 +10,7 @@ import (
"github.com/spf13/viper"
"io"
"os"
"os/exec"
"path/filepath"
"runtime/debug"
)
@@ -99,7 +100,6 @@ func (s *SpiderSync) Download() {
// create a temporary file
tmpFilePath := filepath.Join(tmpPath, randomId.String()+".zip")
tmpFile := utils.OpenFile(tmpFilePath)
defer utils.Close(tmpFile)
// write the file into the temporary file
if _, err := io.Copy(tmpFile, f); err != nil {
@@ -119,6 +119,15 @@ func (s *SpiderSync) Download() {
return
}
// recursively change the permissions of the target directory
// fixes the issue where the log file cannot be created when LOG_ENABLED and LOG_FILE are enabled in scrapy settings
cmd := exec.Command("chmod", "-R", "777", dstPath)
if err := cmd.Run(); err != nil {
log.Errorf(err.Error())
debug.PrintStack()
return
}
// close the temporary file
if err := tmpFile.Close(); err != nil {
log.Errorf(err.Error())

View File

@@ -226,12 +226,18 @@ func ExecuteShellCmd(cmdStr string, cwd string, t model.Task, s model.Spider) (e
// environment variable configuration
envs := s.Envs
if s.Type == constants.Configurable {
// database configuration
envs = append(envs, model.Env{Name: "CRAWLAB_MONGO_HOST", Value: viper.GetString("mongo.host")})
envs = append(envs, model.Env{Name: "CRAWLAB_MONGO_PORT", Value: viper.GetString("mongo.port")})
envs = append(envs, model.Env{Name: "CRAWLAB_MONGO_DB", Value: viper.GetString("mongo.db")})
envs = append(envs, model.Env{Name: "CRAWLAB_MONGO_USERNAME", Value: viper.GetString("mongo.username")})
envs = append(envs, model.Env{Name: "CRAWLAB_MONGO_PASSWORD", Value: viper.GetString("mongo.password")})
envs = append(envs, model.Env{Name: "CRAWLAB_MONGO_AUTHSOURCE", Value: viper.GetString("mongo.authSource")})
// settings configuration
for envName, envValue := range s.Config.Settings {
envs = append(envs, model.Env{Name: "CRAWLAB_SETTING_" + envName, Value: envValue})
}
}
cmd = SetEnv(cmd, envs, t.Id, s.Col)
@@ -311,9 +317,12 @@ func SaveTaskResultCount(id string) func() {
// execute a task
func ExecuteTask(id int) {
if flag, _ := LockList.Load(id); flag.(bool) {
log.Debugf(GetWorkerPrefix(id) + "task is already executing...")
return
if flag, ok := LockList.Load(id); ok {
if flag.(bool) {
log.Debugf(GetWorkerPrefix(id) + "task is already executing...")
return
}
}
// acquire the lock
@@ -477,6 +486,29 @@ func GetTaskLog(id string) (logStr string, err error) {
}
if IsMasterNode(task.NodeId.Hex()) {
if !utils.Exists(task.LogPath) {
fileDir, err := MakeLogDir(task)
if err != nil {
log.Errorf(err.Error())
}
fileP := GetLogFilePaths(fileDir)
// get the log file path
fLog, err := os.Create(fileP)
defer fLog.Close()
if err != nil {
log.Errorf("create task log file error: %s", fileP)
debug.PrintStack()
}
task.LogPath = fileP
if err := task.Save(); err != nil {
log.Errorf(err.Error())
debug.PrintStack()
}
}
// if this is the master node, get the local log
logBytes, err := model.GetLocalLog(task.LogPath)
if err != nil {

View File

@@ -1,4 +1,7 @@
# -*- coding: utf-8 -*-
import os
import re
import json
# Scrapy settings for config_spider project
#
@@ -9,14 +12,14 @@
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'config_spider'
BOT_NAME = 'Crawlab Configurable Spider'
SPIDER_MODULES = ['config_spider.spiders']
NEWSPIDER_MODULE = 'config_spider.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'config_spider (+http://www.yourdomain.com)'
USER_AGENT = 'Crawlab Spider'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
@@ -88,3 +91,21 @@ ITEM_PIPELINES = {
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]:
setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '')
setting_value = os.environ.get(setting_env_name)
if setting_value.lower() == 'true':
setting_value = True
elif setting_value.lower() == 'false':
setting_value = False
elif re.search(r'^\d+$', setting_value) is not None:
setting_value = int(setting_value)
elif re.search(r'^\{.*\}$', setting_value.strip()) is not None:
setting_value = json.loads(setting_value)
elif re.search(r'^\[.*\]$', setting_value.strip()) is not None:
setting_value = json.loads(setting_value)
else:
pass
locals()[setting_name] = setting_value

View File

@@ -0,0 +1,20 @@
version: "0.4.0"
name: "toscrapy_books"
start_url: "http://news.163.com/special/0001386F/rank_news.html"
start_stage: "list"
engine: "scrapy"
stages:
list:
is_list: true
list_css: "table tr:not(:first-child)"
fields:
- name: "title"
css: "td:nth-child(1) > a"
- name: "url"
css: "td:nth-child(1) > a"
attr: "href"
- name: "clicks"
css: "td.cBlue"
settings:
ROBOTSTXT_OBEY: false
USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36

View File

@@ -0,0 +1,22 @@
version: 0.4.0
name: toscrapy_books
start_url: http://www.baidu.com/s?wd=crawlab
start_stage: list
engine: scrapy
stages:
list:
is_list: true
list_xpath: //*[contains(@class, "c-container")]
page_xpath: //*[@id="page"]//a[@class="n"][last()]
page_attr: href
fields:
- name: title
xpath: .//h3/a
- name: url
xpath: .//h3/a
attr: href
- name: abstract
xpath: .//*[@class="c-abstract"]
settings:
ROBOTSTXT_OBEY: false
USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36

View File

@@ -5,10 +5,10 @@ start_stage: "list"
engine: "scrapy"
stages:
list:
is_list: true # default: false
is_list: true
list_css: "section article.product_pod"
page_css: "ul.pager li.next a"
page_attr: "href" # default: href
page_attr: "href"
fields:
- name: "title"
css: "h3 > a"
@@ -23,3 +23,6 @@ stages:
fields:
- name: "description"
css: "#product_description + p"
settings:
ROBOTSTXT_OBEY: true
AUTOTHROTTLE_ENABLED: true

View File

@@ -167,7 +167,6 @@ func DeCompress(srcFile *os.File, dstPath string) error {
debug.PrintStack()
continue
}
defer Close(newFile)
// copy the file into the new file
if _, err := io.Copy(newFile, srcFile); err != nil {