Mirror of https://github.com/crawlab-team/crawlab.git (synced 2026-01-26 17:49:15 +01:00)

Commit: Merge remote-tracking branch 'upstream/develop' into develop
@@ -15,7 +15,7 @@ redis:
 log:
   level: info
   path: "/var/logs/crawlab"
-  isDeletePeriodically: "Y"
+  isDeletePeriodically: "N"
   deleteFrequency: "@hourly"
 server:
   host: 0.0.0.0
@@ -3,15 +3,15 @@ package entity

 import "strconv"

 type Page struct {
-	Skip int
-	Limit int
-	PageNum int
+	Skip     int
+	Limit    int
+	PageNum  int
 	PageSize int
 }

-func (p *Page)GetPage(pageNum string, pageSize string) {
+func (p *Page) GetPage(pageNum string, pageSize string) {
 	p.PageNum, _ = strconv.Atoi(pageNum)
 	p.PageSize, _ = strconv.Atoi(pageSize)
 	p.Skip = p.PageSize * (p.PageNum - 1)
 	p.Limit = p.PageSize
 }
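As a quick sanity check on the pagination math above (GetPage turns 1-based page parameters into a skip/limit pair), a minimal standalone sketch; the struct and method are copied from the hunk, main is illustrative only:

package main

import (
	"fmt"
	"strconv"
)

// Page is copied from the hunk above.
type Page struct {
	Skip     int
	Limit    int
	PageNum  int
	PageSize int
}

// GetPage parses 1-based page parameters and derives Skip/Limit.
func (p *Page) GetPage(pageNum string, pageSize string) {
	p.PageNum, _ = strconv.Atoi(pageNum)
	p.PageSize, _ = strconv.Atoi(pageSize)
	p.Skip = p.PageSize * (p.PageNum - 1)
	p.Limit = p.PageSize
}

func main() {
	var p Page
	p.GetPage("3", "10")
	fmt.Printf("skip=%d limit=%d\n", p.Skip, p.Limit) // skip=20 limit=10
}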
@@ -1,25 +1,30 @@
 package entity

-type ConfigSpiderData struct {
-	Version    string           `yaml:"version" json:"version"`
-	Engine     string           `yaml:"engine" json:"engine"`
-	StartUrl   string           `yaml:"start_url" json:"start_url"`
-	StartStage string           `yaml:"start_stage" json:"start_stage"`
-	Stages     map[string]Stage `yaml:"stages" json:"stages"`
-	Remark     string           `yaml:"remark" json:"remark"`
-}
-
-type Stage struct {
-	IsList   bool    `yaml:"is_list" json:"is_list"`
-	ListCss  string  `yaml:"list_css" json:"list_css"`
-	PageCss  string  `yaml:"page_css" json:"page_css"`
-	PageAttr string  `yaml:"page_attr" json:"page_attr"`
-	Fields   []Field `yaml:"fields" json:"fields"`
-}
+type ConfigSpiderData struct {
+	Version    string            `yaml:"version" json:"version"`
+	Engine     string            `yaml:"engine" json:"engine"`
+	StartUrl   string            `yaml:"start_url" json:"start_url"`
+	StartStage string            `yaml:"start_stage" json:"start_stage"`
+	Stages     map[string]Stage  `yaml:"stages" json:"stages"`
+	Settings   map[string]string `yaml:"settings" json:"settings"`
+}
+
+type Stage struct {
+	Name      string  `yaml:"name" json:"name"`
+	IsList    bool    `yaml:"is_list" json:"is_list"`
+	ListCss   string  `yaml:"list_css" json:"list_css"`
+	ListXpath string  `yaml:"list_xpath" json:"list_xpath"`
+	PageCss   string  `yaml:"page_css" json:"page_css"`
+	PageXpath string  `yaml:"page_xpath" json:"page_xpath"`
+	PageAttr  string  `yaml:"page_attr" json:"page_attr"`
+	Fields    []Field `yaml:"fields" json:"fields"`
+}

 type Field struct {
 	Name      string `yaml:"name" json:"name"`
 	Css       string `yaml:"css" json:"css"`
 	Xpath     string `yaml:"xpath" json:"xpath"`
 	Attr      string `yaml:"attr" json:"attr"`
 	NextStage string `yaml:"next_stage" json:"next_stage"`
 }
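The reworked entity types are populated straight from a Spiderfile via their yaml tags. A minimal sketch of that deserialization with gopkg.in/yaml.v2 (the same library imported elsewhere in this commit); the structs are trimmed copies and the inline YAML is illustrative:

package main

import (
	"fmt"

	"gopkg.in/yaml.v2"
)

// Trimmed copies of the structs above, enough for this sketch.
type Field struct {
	Name  string `yaml:"name"`
	Css   string `yaml:"css"`
	Xpath string `yaml:"xpath"`
	Attr  string `yaml:"attr"`
}

type Stage struct {
	IsList    bool    `yaml:"is_list"`
	ListXpath string  `yaml:"list_xpath"`
	Fields    []Field `yaml:"fields"`
}

type ConfigSpiderData struct {
	StartUrl string            `yaml:"start_url"`
	Stages   map[string]Stage  `yaml:"stages"`
	Settings map[string]string `yaml:"settings"`
}

func main() {
	src := `
start_url: http://example.com
stages:
  list:
    is_list: true
    list_xpath: //article
    fields:
    - name: title
      xpath: .//h3/a
settings:
  ROBOTSTXT_OBEY: "false"
`
	var data ConfigSpiderData
	if err := yaml.Unmarshal([]byte(src), &data); err != nil {
		panic(err)
	}
	fmt.Println(data.Stages["list"].Fields[0].Xpath) // .//h3/a
	fmt.Println(data.Settings["ROBOTSTXT_OBEY"])     // false
}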
@@ -47,6 +47,8 @@ func main() {
 			panic(err)
 		}
 		log.Info("初始化定期清理日志配置成功")
-	}
+	}else {
+		log.Info("默认未开启定期清理日志配置")
+	}

 	// initialize the MongoDB database
@@ -140,9 +142,13 @@ func main() {
 		authGroup.GET("/spiders/:id/stats", routes.GetSpiderStats) // spider stats
 		authGroup.GET("/spider/types", routes.GetSpiderTypes)      // spider types
 		// configurable spiders
-		authGroup.PUT("/config_spiders", routes.PutConfigSpider)                // add a configurable spider
-		authGroup.POST("/config_spiders/:id", routes.PostConfigSpider)          // update a configurable spider
-		authGroup.POST("/config_spiders/:id/upload", routes.UploadConfigSpider) // upload a configurable spider
+		authGroup.PUT("/config_spiders", routes.PutConfigSpider)                            // add a configurable spider
+		authGroup.POST("/config_spiders/:id", routes.PostConfigSpider)                      // update a configurable spider
+		authGroup.POST("/config_spiders/:id/upload", routes.UploadConfigSpider)             // upload a configurable spider
+		authGroup.POST("/config_spiders/:id/spiderfile", routes.PostConfigSpiderSpiderfile) // upload a configurable spider's Spiderfile
+		authGroup.GET("/config_spiders/:id/config", routes.GetConfigSpiderConfig)           // get a configurable spider's config
+		authGroup.POST("/config_spiders/:id/config", routes.PostConfigSpiderConfig)         // update a configurable spider's config
+		authGroup.GET("/config_spiders_templates", routes.GetConfigSpiderTemplateList)      // list configurable spider templates
 		// tasks
 		authGroup.GET("/tasks", routes.GetTaskList) // task list
 		authGroup.GET("/tasks/:id", routes.GetTask) // task detail
@@ -42,12 +42,12 @@ func init() {
 	app.DELETE("/tasks/:id", DeleteTask)                           // delete a task
 	app.GET("/tasks/:id/results", GetTaskResults)                  // task results
 	app.GET("/tasks/:id/results/download", DownloadTaskResultsCsv) // download task results
-	app.GET("/spiders", GetSpiderList)            // spider list
-	app.GET("/spiders/:id", GetSpider)            // spider detail
-	app.POST("/spiders/:id", PostSpider)          // update a spider
-	app.DELETE("/spiders/:id",DeleteSpider)       // delete a spider
-	app.GET("/spiders/:id/tasks",GetSpiderTasks)  // spider task list
-	app.GET("/spiders/:id/dir",GetSpiderDir)      // spider directory
+	app.GET("/spiders", GetSpiderList)            // spider list
+	app.GET("/spiders/:id", GetSpider)            // spider detail
+	app.POST("/spiders/:id", PostSpider)          // update a spider
+	app.DELETE("/spiders/:id", DeleteSpider)      // delete a spider
+	app.GET("/spiders/:id/tasks", GetSpiderTasks) // spider task list
+	app.GET("/spiders/:id/dir", GetSpiderDir)     // spider directory
 }

 // mock test, test data in ./mock
@@ -6,8 +6,6 @@ import (
 	"net/http"
 )

-
-
 var taskDailyItems = []model.TaskDailyItem{
 	{
 		Date: "2019/08/19",
@@ -1 +1 @@
-package mock
+package mock

@@ -1 +1 @@
-package mock
+package mock
@@ -131,12 +131,7 @@ func (g ScrapyGenerator) GetNonListParserString(stageName string, stage entity.S

 	// iterate over the fields
 	for _, f := range stage.Fields {
-		line := ""
-		if f.Attr == "" {
-			line += fmt.Sprintf(`item['%s'] = response.css('%s::text').extract_first()`, f.Name, f.Css)
-		} else {
-			line += fmt.Sprintf(`item['%s'] = response.css('%s::attr("%s")').extract_first()`, f.Name, f.Css, f.Attr)
-		}
+		line := fmt.Sprintf(`item['%s'] = response.%s.extract_first()`, f.Name, g.GetExtractStringFromField(f))
 		line = g.PadCode(line, 2)
 		str += line
 	}

@@ -163,19 +158,14 @@ func (g ScrapyGenerator) GetListParserString(stageName string, stage entity.Stag
 	str += g.PadCode(`prev_item = response.meta.get('item')`, 2)

 	// for loop over the list elements
-	str += g.PadCode(fmt.Sprintf(`for elem in response.css('%s'):`, stage.ListCss), 2)
+	str += g.PadCode(fmt.Sprintf(`for elem in response.%s:`, g.GetListString(stage)), 2)

 	// construct the item
 	str += g.PadCode(`item = Item()`, 3)

 	// iterate over the fields
 	for _, f := range stage.Fields {
-		line := ""
-		if f.Attr == "" {
-			line += fmt.Sprintf(`item['%s'] = elem.css('%s::text').extract_first()`, f.Name, f.Css)
-		} else {
-			line += fmt.Sprintf(`item['%s'] = elem.css('%s::attr("%s")').extract_first()`, f.Name, f.Css, f.Attr)
-		}
+		line := fmt.Sprintf(`item['%s'] = elem.%s.extract_first()`, f.Name, g.GetExtractStringFromField(f))
 		line = g.PadCode(line, 3)
 		str += line
 	}
@@ -195,15 +185,9 @@ func (g ScrapyGenerator) GetListParserString(stageName string, stage entity.Stag
 	}

 	// pagination
-	if stage.PageCss != "" {
-		// pagination element attribute, defaults to href
-		pageAttr := "href"
-		if stage.PageAttr != "" {
-			pageAttr = stage.PageAttr
-		}
-
-		str += g.PadCode(fmt.Sprintf(`next_url = response.css('%s::attr("%s")').extract_first()`, stage.PageCss, pageAttr), 2)
-		str += g.PadCode(fmt.Sprintf(`yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_%s, meta={'item': item})`, stageName), 2)
+	if stage.PageCss != "" || stage.PageXpath != "" {
+		str += g.PadCode(fmt.Sprintf(`next_url = response.%s.extract_first()`, g.GetExtractStringFromStage(stage)), 2)
+		str += g.PadCode(fmt.Sprintf(`yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_%s, meta={'item': prev_item})`, stageName), 2)
 	}

 	// append trailing newline
@@ -226,3 +210,49 @@ func (g ScrapyGenerator) GetNextStageField(stage entity.Stage) (entity.Field, er
 	}
 	return entity.Field{}, errors.New("cannot find next stage field")
 }
+
+func (g ScrapyGenerator) GetExtractStringFromField(f entity.Field) string {
+	if f.Css != "" {
+		// CSS selector
+		if f.Attr == "" {
+			// text
+			return fmt.Sprintf(`css('%s::text')`, f.Css)
+		} else {
+			// attribute
+			return fmt.Sprintf(`css('%s::attr("%s")')`, f.Css, f.Attr)
+		}
+	} else {
+		// XPath selector
+		if f.Attr == "" {
+			// text
+			return fmt.Sprintf(`xpath('string(%s)')`, f.Xpath)
+		} else {
+			// attribute
+			return fmt.Sprintf(`xpath('%s/@%s')`, f.Xpath, f.Attr)
+		}
+	}
+}
+
+func (g ScrapyGenerator) GetExtractStringFromStage(stage entity.Stage) string {
+	// pagination element attribute, defaults to href
+	pageAttr := "href"
+	if stage.PageAttr != "" {
+		pageAttr = stage.PageAttr
+	}
+
+	if stage.PageCss != "" {
+		// CSS selector
+		return fmt.Sprintf(`css('%s::attr("%s")')`, stage.PageCss, pageAttr)
+	} else {
+		// XPath selector
+		return fmt.Sprintf(`xpath('%s/@%s')`, stage.PageXpath, pageAttr)
+	}
+}
+
+func (g ScrapyGenerator) GetListString(stage entity.Stage) string {
+	if stage.ListCss != "" {
+		return fmt.Sprintf(`css('%s')`, stage.ListCss)
+	} else {
+		return fmt.Sprintf(`xpath('%s')`, stage.ListXpath)
+	}
+}
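To make the refactor concrete, here is a standalone sketch of the selector dispatch that GetExtractStringFromField performs, with sample selectors borrowed from the Spiderfile templates later in this commit (the Field type is trimmed; extractString mirrors the method above):

package main

import "fmt"

// Trimmed copy of entity.Field for this sketch.
type Field struct {
	Name, Css, Xpath, Attr string
}

// extractString mirrors GetExtractStringFromField above.
func extractString(f Field) string {
	if f.Css != "" {
		if f.Attr == "" {
			return fmt.Sprintf(`css('%s::text')`, f.Css)
		}
		return fmt.Sprintf(`css('%s::attr("%s")')`, f.Css, f.Attr)
	}
	if f.Attr == "" {
		return fmt.Sprintf(`xpath('string(%s)')`, f.Xpath)
	}
	return fmt.Sprintf(`xpath('%s/@%s')`, f.Xpath, f.Attr)
}

func main() {
	title := Field{Name: "title", Css: "h3 > a"}
	url := Field{Name: "url", Xpath: ".//h3/a", Attr: "href"}
	// Printed lines match what the generator writes into the Scrapy parser:
	fmt.Printf("item['%s'] = response.%s.extract_first()\n", title.Name, extractString(title))
	// item['title'] = response.css('h3 > a::text').extract_first()
	fmt.Printf("item['%s'] = response.%s.extract_first()\n", url.Name, extractString(url))
	// item['url'] = response.xpath('.//h3/a/@href').extract_first()
}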
@@ -1,11 +1,17 @@
 package model

 import (
+	"crawlab/constants"
 	"crawlab/database"
+	"crawlab/entity"
+	"crawlab/utils"
 	"errors"
 	"github.com/apex/log"
 	"github.com/globalsign/mgo"
 	"github.com/globalsign/mgo/bson"
+	"gopkg.in/yaml.v2"
+	"io/ioutil"
+	"path/filepath"
 	"runtime/debug"
 	"time"
 )
@@ -25,14 +31,18 @@ type Spider struct {
 	Site   string `json:"site" bson:"site"`     // spider website
 	Envs   []Env  `json:"envs" bson:"envs"`     // environment variables
 	Remark string `json:"remark" bson:"remark"` // remark
-	Src    string `json:"src" bson:"src"`       // source location
+
+	// customized spiders
+	Src string `json:"src" bson:"src"` // source location
 	Cmd string `json:"cmd" bson:"cmd"` // execution command
+
+	// configurable spiders
+	Template string `json:"template" bson:"template"` // Spiderfile template

 	// frontend display
-	LastRunTs time.Time `json:"last_run_ts"` // last run time
-	LastStatus string `json:"last_status"` // last run status
+	LastRunTs  time.Time               `json:"last_run_ts"` // last run time
+	LastStatus string                  `json:"last_status"` // last run status
+	Config     entity.ConfigSpiderData `json:"config"`      // configurable spider config

 	// time
 	CreateTs time.Time `json:"create_ts" bson:"create_ts"`
@@ -108,6 +118,10 @@ func GetSpiderList(filter interface{}, skip int, limit int) ([]Spider, int, erro
 		return spiders, 0, err
 	}

+	if spiders == nil {
+		spiders = []Spider{}
+	}
+
 	// iterate over the spiders
 	for i, spider := range spiders {
 		// get the latest task
@@ -161,15 +175,25 @@ func GetSpider(id bson.ObjectId) (Spider, error) {
 	s, c := database.GetCol("spiders")
 	defer s.Close()

-	var result Spider
-	if err := c.FindId(id).One(&result); err != nil {
+	// get the spider
+	var spider Spider
+	if err := c.FindId(id).One(&spider); err != nil {
 		if err != mgo.ErrNotFound {
 			log.Errorf("get spider error: %s, id: %id", err.Error(), id.Hex())
 			debug.PrintStack()
 		}
-		return result, err
+		return spider, err
 	}
-	return result, nil
+
+	// if this is a configurable spider, load its config
+	if spider.Type == constants.Configurable && utils.Exists(filepath.Join(spider.Src, "Spiderfile")) {
+		config, err := GetConfigSpiderData(spider)
+		if err != nil {
+			return spider, err
+		}
+		spider.Config = config
+	}
+	return spider, nil
 }

 // update the spider
@@ -209,10 +233,12 @@ func RemoveSpider(id bson.ObjectId) error {
 	s, gf := database.GetGridFs("files")
 	defer s.Close()

-	if err := gf.RemoveId(result.FileId); err != nil {
-		log.Error("remove file error, id:" + result.FileId.Hex())
-		debug.PrintStack()
-		return err
+	if result.FileId.Hex() != constants.ObjectIdNull {
+		if err := gf.RemoveId(result.FileId); err != nil {
+			log.Error("remove file error, id:" + result.FileId.Hex())
+			debug.PrintStack()
+			return err
+		}
 	}

 	return nil
@@ -269,3 +295,35 @@ func GetSpiderTypes() ([]*entity.SpiderType, error) {

 	return types, nil
 }
+
+func GetConfigSpiderData(spider Spider) (entity.ConfigSpiderData, error) {
+	// construct the config data
+	configData := entity.ConfigSpiderData{}
+
+	// validate the spider type
+	if spider.Type != constants.Configurable {
+		return configData, errors.New("not a configurable spider")
+	}
+
+	// Spiderfile path
+	sfPath := filepath.Join(spider.Src, "Spiderfile")
+
+	// read the YAML file
+	yamlFile, err := ioutil.ReadFile(sfPath)
+	if err != nil {
+		return configData, err
+	}
+
+	// deserialize
+	if err := yaml.Unmarshal(yamlFile, &configData); err != nil {
+		return configData, err
+	}
+
+	// assign stage_name
+	for stageName, stage := range configData.Stages {
+		stage.Name = stageName
+		configData.Stages[stageName] = stage
+	}
+
+	return configData, nil
+}
@@ -2,16 +2,13 @@ package routes

 import (
 	"crawlab/constants"
 	"crawlab/database"
 	"crawlab/entity"
 	"crawlab/model"
 	"crawlab/services"
 	"crawlab/utils"
 	"fmt"
 	"github.com/apex/log"
 	"github.com/gin-gonic/gin"
 	"github.com/globalsign/mgo/bson"
 	uuid "github.com/satori/go.uuid"
 	"github.com/spf13/viper"
 	"gopkg.in/yaml.v2"
 	"io"
@@ -19,7 +16,7 @@ import (
 	"net/http"
 	"os"
 	"path/filepath"
 	"runtime/debug"
 	"strings"
 )

 // add a configurable spider
@@ -36,6 +33,12 @@ func PutConfigSpider(c *gin.Context) {
 		return
 	}

+	// the template name must not be empty
+	if spider.Template == "" {
+		HandleErrorF(http.StatusBadRequest, c, "spider template should not be empty")
+		return
+	}
+
 	// check whether the spider already exists
 	if spider := model.GetSpiderByName(spider.Name); spider != nil {
 		HandleErrorF(http.StatusBadRequest, c, fmt.Sprintf("spider for '%s' already exists", spider.Name))
@@ -62,6 +65,23 @@ func PutConfigSpider(c *gin.Context) {
 	}
 	spider.Src = spiderDir

+	// copy the Spiderfile template
+	contentByte, err := ioutil.ReadFile("./template/spiderfile/Spiderfile." + spider.Template)
+	if err != nil {
+		HandleError(http.StatusInternalServerError, c, err)
+		return
+	}
+	f, err := os.Create(filepath.Join(spider.Src, "Spiderfile"))
+	if err != nil {
+		HandleError(http.StatusInternalServerError, c, err)
+		return
+	}
+	defer f.Close()
+	if _, err := f.Write(contentByte); err != nil {
+		HandleError(http.StatusInternalServerError, c, err)
+		return
+	}
+
 	// add the spider to the database
 	if err := spider.Add(); err != nil {
 		HandleError(http.StatusInternalServerError, c, err)
@@ -100,8 +120,8 @@ func UploadConfigSpider(c *gin.Context) {

 	// the filename must be Spiderfile
 	filename := header.Filename
-	if filename != "Spiderfile" {
-		HandleErrorF(http.StatusBadRequest, c, "filename must be 'Spiderfile'")
+	if filename != "Spiderfile" && filename != "Spiderfile.yaml" && filename != "Spiderfile.yml" {
+		HandleErrorF(http.StatusBadRequest, c, "filename must be 'Spiderfile(.yaml|.yml)'")
 		return
 	}
@@ -151,88 +171,146 @@ func UploadConfigSpider(c *gin.Context) {
 		return
 	}

-	// delete the existing spider files
-	for _, fInfo := range utils.ListDir(spiderDir) {
-		// keep the Spiderfile
-		if fInfo.Name() == filename {
-			continue
-		}
-
-		// delete other files
-		if err := os.RemoveAll(filepath.Join(spiderDir, fInfo.Name())); err != nil {
-			HandleError(http.StatusInternalServerError, c, err)
-			return
-		}
-	}
-
-	// copy the spider template files
-	tplDir := "./template/scrapy"
-	for _, fInfo := range utils.ListDir(tplDir) {
-		// skip the Spiderfile
-		if fInfo.Name() == "Spiderfile" {
-			continue
-		}
-
-		srcPath := filepath.Join(tplDir, fInfo.Name())
-		if fInfo.IsDir() {
-			dirPath := filepath.Join(spiderDir, fInfo.Name())
-			if err := utils.CopyDir(srcPath, dirPath); err != nil {
-				HandleError(http.StatusInternalServerError, c, err)
-				return
-			}
-		} else {
-			if err := utils.CopyFile(srcPath, filepath.Join(spiderDir, fInfo.Name())); err != nil {
-				HandleError(http.StatusInternalServerError, c, err)
-				return
-			}
-		}
-	}
-
-	// update the spider files
-	if err := services.GenerateConfigSpiderFiles(spider, configData); err != nil {
+	// process the spider files from the deserialized config data
+	if err := services.ProcessSpiderFilesFromConfigData(spider, configData); err != nil {
 		HandleError(http.StatusInternalServerError, c, err)
 		return
 	}

-	// package as a zip file
-	files, err := utils.GetFilesFromDir(spiderDir)
-	if err != nil {
-		HandleError(http.StatusInternalServerError, c, err)
-		return
-	}
-	randomId := uuid.NewV4()
-	tmpFilePath := filepath.Join(viper.GetString("other.tmppath"), spider.Name+"."+randomId.String()+".zip")
-	spiderZipFileName := spider.Name + ".zip"
-	if err := utils.Compress(files, tmpFilePath); err != nil {
-		HandleError(http.StatusInternalServerError, c, err)
-		return
-	}
-
-	// get the GridFS instance
-	s, gf := database.GetGridFs("files")
-	defer s.Close()
-
-	// check whether the file already exists
-	var gfFile model.GridFs
-	if err := gf.Find(bson.M{"filename": spiderZipFileName}).One(&gfFile); err == nil {
-		// the file already exists, delete it
-		_ = gf.RemoveId(gfFile.Id)
-	}
-
-	// upload to GridFS
-	fid, err := services.UploadToGridFs(spiderZipFileName, tmpFilePath)
-	if err != nil {
-		log.Errorf("upload to grid fs error: %s", err.Error())
-		debug.PrintStack()
-		return
-	}
-
-	// save the spider FileId
-	spider.FileId = fid
-	_ = spider.Save()
-
 	c.JSON(http.StatusOK, Response{
 		Status:  "ok",
 		Message: "success",
 	})
 }
+
+func PostConfigSpiderSpiderfile(c *gin.Context) {
+	type Body struct {
+		Content string `json:"content"`
+	}
+
+	id := c.Param("id")
+
+	// file content
+	var reqBody Body
+	if err := c.ShouldBindJSON(&reqBody); err != nil {
+		HandleError(http.StatusBadRequest, c, err)
+		return
+	}
+	content := reqBody.Content
+
+	// get the spider
+	var spider model.Spider
+	spider, err := model.GetSpider(bson.ObjectIdHex(id))
+	if err != nil {
+		HandleErrorF(http.StatusBadRequest, c, fmt.Sprintf("cannot find spider (id: %s)", id))
+		return
+	}
+
+	// deserialize
+	var configData entity.ConfigSpiderData
+	if err := yaml.Unmarshal([]byte(content), &configData); err != nil {
+		HandleError(http.StatusBadRequest, c, err)
+		return
+	}
+
+	// validate configData
+	if err := services.ValidateSpiderfile(configData); err != nil {
+		HandleError(http.StatusInternalServerError, c, err)
+		return
+	}
+
+	// write the file
+	if err := ioutil.WriteFile(filepath.Join(spider.Src, "Spiderfile"), []byte(content), os.ModePerm); err != nil {
+		HandleError(http.StatusInternalServerError, c, err)
+		return
+	}
+
+	// process the spider files from the deserialized config data
+	if err := services.ProcessSpiderFilesFromConfigData(spider, configData); err != nil {
+		HandleError(http.StatusInternalServerError, c, err)
+		return
+	}
+
+	c.JSON(http.StatusOK, Response{
+		Status:  "ok",
+		Message: "success",
+	})
+}
+
+func PostConfigSpiderConfig(c *gin.Context) {
+	id := c.Param("id")
+
+	// get the spider
+	var spider model.Spider
+	spider, err := model.GetSpider(bson.ObjectIdHex(id))
+	if err != nil {
+		HandleErrorF(http.StatusBadRequest, c, fmt.Sprintf("cannot find spider (id: %s)", id))
+		return
+	}
+
+	// deserialize the config data
+	var configData entity.ConfigSpiderData
+	if err := c.ShouldBindJSON(&configData); err != nil {
+		HandleError(http.StatusBadRequest, c, err)
+		return
+	}
+
+	// validate configData
+	if err := services.ValidateSpiderfile(configData); err != nil {
+		HandleError(http.StatusInternalServerError, c, err)
+		return
+	}
+
+	// regenerate the Spiderfile
+	if err := services.GenerateSpiderfileFromConfigData(spider, configData); err != nil {
+		HandleError(http.StatusInternalServerError, c, err)
+		return
+	}
+
+	// process the spider files from the deserialized config data
+	if err := services.ProcessSpiderFilesFromConfigData(spider, configData); err != nil {
+		HandleError(http.StatusInternalServerError, c, err)
+		return
+	}
+
+	c.JSON(http.StatusOK, Response{
+		Status:  "ok",
+		Message: "success",
+	})
+}
+
+func GetConfigSpiderConfig(c *gin.Context) {
+	id := c.Param("id")
+
+	// validate the id
+	if !bson.IsObjectIdHex(id) {
+		HandleErrorF(http.StatusBadRequest, c, "invalid id")
+	}
+
+	// get the spider
+	spider, err := model.GetSpider(bson.ObjectIdHex(id))
+	if err != nil {
+		HandleError(http.StatusInternalServerError, c, err)
+		return
+	}
+
+	c.JSON(http.StatusOK, Response{
+		Status:  "ok",
+		Message: "success",
+		Data:    spider.Config,
+	})
+}
+
+// get the list of template names
+func GetConfigSpiderTemplateList(c *gin.Context) {
+	var data []string
+	for _, fInfo := range utils.ListDir("./template/spiderfile") {
+		templateName := strings.Replace(fInfo.Name(), "Spiderfile.", "", -1)
+		data = append(data, templateName)
+	}
+
+	c.JSON(http.StatusOK, Response{
+		Status:  "ok",
+		Message: "success",
+		Data:    data,
+	})
+}
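The new PostConfigSpiderSpiderfile handler expects the raw Spiderfile text wrapped in a JSON body ({"content": ...}). A hedged client-side sketch; the base URL, spider id, and auth header are assumptions, while the route and body shape come from the handler above:

package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

func main() {
	body, _ := json.Marshal(map[string]string{
		"content": "version: \"0.4.0\"\nstart_url: http://example.com\n", // truncated Spiderfile
	})
	// Base URL and path prefix are assumptions; adjust for your deployment.
	req, err := http.NewRequest(http.MethodPost,
		"http://localhost:8000/api/config_spiders/5d0b0e.../spiderfile", // hypothetical spider id
		bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Authorization", "<token>") // assumption: authGroup requires a token
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	fmt.Println(resp.Status)
}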
@@ -34,7 +34,7 @@ func GetSpiderList(c *gin.Context) {
 		"name": bson.M{"$regex": bson.RegEx{Pattern: keyword, Options: "im"}},
 	}

-	if t != "" {
+	if t != "" && t != "all" {
 		filter["type"] = t
 	}
@@ -2,11 +2,20 @@ package services

 import (
 	"crawlab/constants"
+	"crawlab/database"
 	"crawlab/entity"
 	"crawlab/model"
 	"crawlab/model/config_spider"
+	"crawlab/utils"
 	"errors"
 	"fmt"
+	"github.com/apex/log"
+	"github.com/globalsign/mgo/bson"
+	uuid "github.com/satori/go.uuid"
+	"github.com/spf13/viper"
+	"gopkg.in/yaml.v2"
+	"os"
+	"path/filepath"
 	"strings"
 )
@@ -37,12 +46,17 @@ func ValidateSpiderfile(configData entity.ConfigSpiderData) error {

 	// validate that start_url exists
 	if configData.StartUrl == "" {
-		return errors.New("spiderfile start_url is empty")
+		return errors.New("spiderfile invalid: start_url is empty")
 	}

+	// validate that start_stage exists
+	if configData.StartStage == "" {
+		return errors.New("spiderfile invalid: start_stage is empty")
+	}
+
 	// validate that stages exist
 	if len(configData.Stages) == 0 {
-		return errors.New("spiderfile stages is empty")
+		return errors.New("spiderfile invalid: stages is empty")
 	}

 	// validate the stages
@@ -50,56 +64,74 @@ func ValidateSpiderfile(configData entity.ConfigSpiderData) error {
 	for stageName, stage := range configData.Stages {
 		// the stage name must not be empty
 		if stageName == "" {
-			return errors.New("spiderfile stage name is empty")
+			return errors.New("spiderfile invalid: stage name is empty")
 		}

 		// the stage name must not be a protected string
 		// NOTE: extend here for other engines; Scrapy is the default
 		if configData.Engine == "" || configData.Engine == constants.EngineScrapy {
 			if strings.Contains(constants.ScrapyProtectedStageNames, stageName) {
-				return errors.New(fmt.Sprintf("spiderfile stage name '%s' is protected", stageName))
+				return errors.New(fmt.Sprintf("spiderfile invalid: stage name '%s' is protected", stageName))
 			}
-		} else if configData.Engine == constants.EngineColly {
-			return errors.New(fmt.Sprintf("engine '%s' is not implemented", stageName))
+		} else {
+			return errors.New(fmt.Sprintf("spiderfile invalid: engine '%s' is not implemented", configData.Engine))
 		}

 		// the stage name must be unique
 		if dict[stageName] == 1 {
-			return errors.New("spiderfile stage name should be unique")
+			return errors.New(fmt.Sprintf("spiderfile invalid: stage name '%s' is duplicated", stageName))
 		}
 		dict[stageName] = 1

 		// the stage must have fields
 		if len(stage.Fields) == 0 {
-			return errors.New(fmt.Sprintf("spiderfile stage '%s' has no fields", stageName))
+			return errors.New(fmt.Sprintf("spiderfile invalid: stage '%s' has no fields", stageName))
 		}

-		// a stage can only have one next stage
+		// whether the stage contains a next_stage
 		hasNextStage := false

 		// iterate over the fields
 		for _, field := range stage.Fields {
 			// a stage may only have one next_stage
 			if field.NextStage != "" {
 				if hasNextStage {
-					return errors.New("spiderfile stage fields should have only 1 next_stage")
+					return errors.New(fmt.Sprintf("spiderfile invalid: stage '%s' has more than 1 next_stage", stageName))
 				}
 				hasNextStage = true
 			}
+
+			// a field may set only one of css and xpath
+			if field.Css != "" && field.Xpath != "" {
+				return errors.New(fmt.Sprintf("spiderfile invalid: field '%s' in stage '%s' has both css and xpath set which is prohibited", field.Name, stageName))
+			}
 		}

+		// a stage may set only one of page_css and page_xpath
+		if stage.PageCss != "" && stage.PageXpath != "" {
+			return errors.New(fmt.Sprintf("spiderfile invalid: stage '%s' has both page_css and page_xpath set which is prohibited", stageName))
+		}
+
+		// a stage may set only one of list_css and list_xpath
+		if stage.ListCss != "" && stage.ListXpath != "" {
+			return errors.New(fmt.Sprintf("spiderfile invalid: stage '%s' has both list_css and list_xpath set which is prohibited", stageName))
+		}
+
 		// error if is_list is true but no list selector is set
-		if stage.IsList && stage.ListCss == "" {
-			return errors.New("spiderfile stage with is_list = true should have list_css being set")
+		if stage.IsList && (stage.ListCss == "" && stage.ListXpath == "") {
+			return errors.New("spiderfile invalid: stage with is_list = true should have either list_css or list_xpath being set")
 		}
 	}

 	// validate field uniqueness
 	if !IsUniqueConfigSpiderFields(fields) {
-		return errors.New("spiderfile fields not unique")
+		return errors.New("spiderfile invalid: fields not unique")
 	}

 	// field names must not be protected strings
 	for _, field := range fields {
 		if strings.Contains(constants.ScrapyProtectedFieldNames, field.Name) {
-			return errors.New(fmt.Sprintf("spiderfile field name '%s' is protected", field.Name))
+			return errors.New(fmt.Sprintf("spiderfile invalid: field name '%s' is protected", field.Name))
 		}
 	}
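The strengthened validation makes css and xpath mutually exclusive at the field, page, and list levels. A minimal sketch of the field-level rule in isolation (trimmed type; not the full ValidateSpiderfile):

package main

import (
	"errors"
	"fmt"
)

// Trimmed field type for this sketch.
type Field struct {
	Name, Css, Xpath string
}

// checkSelector mirrors the field-level rule above: at most one of css/xpath.
func checkSelector(f Field, stageName string) error {
	if f.Css != "" && f.Xpath != "" {
		return errors.New(fmt.Sprintf(
			"spiderfile invalid: field '%s' in stage '%s' has both css and xpath set which is prohibited",
			f.Name, stageName))
	}
	return nil
}

func main() {
	bad := Field{Name: "title", Css: "h3 > a", Xpath: ".//h3/a"}
	fmt.Println(checkSelector(bad, "list")) // reports the both-selectors error
}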
@@ -116,3 +148,118 @@ func IsUniqueConfigSpiderFields(fields []entity.Field) bool {
 	}
 	return true
 }
+
+func ProcessSpiderFilesFromConfigData(spider model.Spider, configData entity.ConfigSpiderData) error {
+	spiderDir := spider.Src
+
+	// assign stage_name
+	for stageName, stage := range configData.Stages {
+		stage.Name = stageName
+		configData.Stages[stageName] = stage
+	}
+
+	// delete the existing spider files
+	for _, fInfo := range utils.ListDir(spiderDir) {
+		// keep the Spiderfile
+		if fInfo.Name() == "Spiderfile" {
+			continue
+		}
+
+		// delete other files
+		if err := os.RemoveAll(filepath.Join(spiderDir, fInfo.Name())); err != nil {
+			return err
+		}
+	}
+
+	// copy the spider template files
+	tplDir := "./template/scrapy"
+	for _, fInfo := range utils.ListDir(tplDir) {
+		// skip the Spiderfile
+		if fInfo.Name() == "Spiderfile" {
+			continue
+		}
+
+		srcPath := filepath.Join(tplDir, fInfo.Name())
+		if fInfo.IsDir() {
+			dirPath := filepath.Join(spiderDir, fInfo.Name())
+			if err := utils.CopyDir(srcPath, dirPath); err != nil {
+				return err
+			}
+		} else {
+			if err := utils.CopyFile(srcPath, filepath.Join(spiderDir, fInfo.Name())); err != nil {
+				return err
+			}
+		}
+	}
+
+	// update the spider files
+	if err := GenerateConfigSpiderFiles(spider, configData); err != nil {
+		return err
+	}
+
+	// package as a zip file
+	files, err := utils.GetFilesFromDir(spiderDir)
+	if err != nil {
+		return err
+	}
+	randomId := uuid.NewV4()
+	tmpFilePath := filepath.Join(viper.GetString("other.tmppath"), spider.Name+"."+randomId.String()+".zip")
+	spiderZipFileName := spider.Name + ".zip"
+	if err := utils.Compress(files, tmpFilePath); err != nil {
+		return err
+	}
+
+	// get the GridFS instance
+	s, gf := database.GetGridFs("files")
+	defer s.Close()
+
+	// check whether the file already exists
+	var gfFile model.GridFs
+	if err := gf.Find(bson.M{"filename": spiderZipFileName}).One(&gfFile); err == nil {
+		// the file already exists, delete it
+		_ = gf.RemoveId(gfFile.Id)
+	}
+
+	// upload to GridFS
+	fid, err := UploadToGridFs(spiderZipFileName, tmpFilePath)
+	if err != nil {
+		log.Errorf("upload to grid fs error: %s", err.Error())
+		return err
+	}
+
+	// save the spider FileId
+	spider.FileId = fid
+	_ = spider.Save()
+
+	return nil
+}
+
+func GenerateSpiderfileFromConfigData(spider model.Spider, configData entity.ConfigSpiderData) error {
+	// Spiderfile path
+	sfPath := filepath.Join(spider.Src, "Spiderfile")
+
+	// generate the YAML content
+	sfContentByte, err := yaml.Marshal(configData)
+	if err != nil {
+		return err
+	}
+
+	// open the file
+	var f *os.File
+	if utils.Exists(sfPath) {
+		f, err = os.OpenFile(sfPath, os.O_WRONLY|os.O_TRUNC, 0777)
+	} else {
+		f, err = os.OpenFile(sfPath, os.O_CREATE, 0777)
+	}
+	if err != nil {
+		return err
+	}
+	defer f.Close()
+
+	// write the content
+	if _, err := f.Write(sfContentByte); err != nil {
+		return err
+	}
+
+	return nil
+}
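GenerateSpiderfileFromConfigData is the write-side counterpart of GetConfigSpiderData: it re-serializes the edited config back into the Spiderfile. A small sketch of the marshal step in isolation (trimmed struct, illustrative values):

package main

import (
	"fmt"

	"gopkg.in/yaml.v2"
)

// Trimmed config type for this sketch.
type ConfigSpiderData struct {
	Version  string `yaml:"version"`
	StartUrl string `yaml:"start_url"`
}

func main() {
	out, err := yaml.Marshal(ConfigSpiderData{Version: "0.4.0", StartUrl: "http://example.com"})
	if err != nil {
		panic(err)
	}
	fmt.Print(string(out))
	// version: 0.4.0
	// start_url: http://example.com
}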
@@ -116,12 +116,15 @@ func PublishAllSpiders() {

 // publish a spider
 func PublishSpider(spider model.Spider) {
-	// look up the gf file; if it is missing, mark the spider file as absent
-	gfFile := model.GetGridFs(spider.FileId)
-	if gfFile == nil {
-		spider.FileId = constants.ObjectIdNull
-		_ = spider.Save()
-		return
+	var gfFile *model.GridFs
+	if spider.FileId.Hex() != constants.ObjectIdNull {
+		// look up the gf file; if it is missing, mark the spider file as absent
+		gfFile = model.GetGridFs(spider.FileId)
+		if gfFile == nil {
+			spider.FileId = constants.ObjectIdNull
+			_ = spider.Save()
+			return
+		}
 	}

 	// if FileId is null, the spider has not been uploaded to GridFS yet, so skip
@@ -10,6 +10,7 @@ import (
 	"github.com/spf13/viper"
 	"io"
 	"os"
+	"os/exec"
 	"path/filepath"
 	"runtime/debug"
 )
@@ -99,7 +100,6 @@ func (s *SpiderSync) Download() {
 	// create a temp file
 	tmpFilePath := filepath.Join(tmpPath, randomId.String()+".zip")
 	tmpFile := utils.OpenFile(tmpFilePath)
-	defer utils.Close(tmpFile)

 	// write the downloaded file into the temp file
 	if _, err := io.Copy(tmpFile, f); err != nil {
@@ -119,6 +119,15 @@ func (s *SpiderSync) Download() {
 		return
 	}

+	// recursively change permissions on the target directory
+	// fixes log file creation when LOG_ENABLED and LOG_FILE are set in scrapy settings
+	cmd := exec.Command("chmod", "-R", "777", dstPath)
+	if err := cmd.Run(); err != nil {
+		log.Errorf(err.Error())
+		debug.PrintStack()
+		return
+	}
+
 	// close the temp file
 	if err := tmpFile.Close(); err != nil {
 		log.Errorf(err.Error())
@@ -226,12 +226,18 @@ func ExecuteShellCmd(cmdStr string, cwd string, t model.Task, s model.Spider) (e
 	// environment variables
 	envs := s.Envs
 	if s.Type == constants.Configurable {
 		// database config
 		envs = append(envs, model.Env{Name: "CRAWLAB_MONGO_HOST", Value: viper.GetString("mongo.host")})
 		envs = append(envs, model.Env{Name: "CRAWLAB_MONGO_PORT", Value: viper.GetString("mongo.port")})
 		envs = append(envs, model.Env{Name: "CRAWLAB_MONGO_DB", Value: viper.GetString("mongo.db")})
 		envs = append(envs, model.Env{Name: "CRAWLAB_MONGO_USERNAME", Value: viper.GetString("mongo.username")})
 		envs = append(envs, model.Env{Name: "CRAWLAB_MONGO_PASSWORD", Value: viper.GetString("mongo.password")})
 		envs = append(envs, model.Env{Name: "CRAWLAB_MONGO_AUTHSOURCE", Value: viper.GetString("mongo.authSource")})
+
+		// settings from the spider config
+		for envName, envValue := range s.Config.Settings {
+			envs = append(envs, model.Env{Name: "CRAWLAB_SETTING_" + envName, Value: envValue})
+		}
 	}
 	cmd = SetEnv(cmd, envs, t.Id, s.Col)
@@ -311,9 +317,12 @@ func SaveTaskResultCount(id string) func() {

 // execute a task
 func ExecuteTask(id int) {
-	if flag, _ := LockList.Load(id); flag.(bool) {
-		log.Debugf(GetWorkerPrefix(id) + "正在执行任务...")
-		return
+	if flag, ok := LockList.Load(id); ok {
+		if flag.(bool) {
+			log.Debugf(GetWorkerPrefix(id) + "正在执行任务...")
+			return
+		}
+
 	}

 	// acquire the lock
@@ -477,6 +486,29 @@ func GetTaskLog(id string) (logStr string, err error) {
 	}

 	if IsMasterNode(task.NodeId.Hex()) {
+		if !utils.Exists(task.LogPath) {
+			fileDir, err := MakeLogDir(task)
+			if err != nil {
+				log.Errorf(err.Error())
+			}
+
+			// get the log file path
+			fileP := GetLogFilePaths(fileDir)
+
+			fLog, err := os.Create(fileP)
+			defer fLog.Close()
+			if err != nil {
+				log.Errorf("create task log file error: %s", fileP)
+				debug.PrintStack()
+			}
+			task.LogPath = fileP
+			if err := task.Save(); err != nil {
+				log.Errorf(err.Error())
+				debug.PrintStack()
+			}
+		}
+
 		// master node: read the local log
 		logBytes, err := model.GetLocalLog(task.LogPath)
 		if err != nil {
@@ -1,4 +1,7 @@
 # -*- coding: utf-8 -*-
+import os
+import re
+import json

 # Scrapy settings for config_spider project
 #

@@ -9,14 +12,14 @@
 # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
 # https://docs.scrapy.org/en/latest/topics/spider-middleware.html

-BOT_NAME = 'config_spider'
+BOT_NAME = 'Crawlab Configurable Spider'

 SPIDER_MODULES = ['config_spider.spiders']
 NEWSPIDER_MODULE = 'config_spider.spiders'


 # Crawl responsibly by identifying yourself (and your website) on the user-agent
-#USER_AGENT = 'config_spider (+http://www.yourdomain.com)'
+USER_AGENT = 'Crawlab Spider'

 # Obey robots.txt rules
 ROBOTSTXT_OBEY = True

@@ -88,3 +91,21 @@ ITEM_PIPELINES = {
 #HTTPCACHE_DIR = 'httpcache'
 #HTTPCACHE_IGNORE_HTTP_CODES = []
 #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
+
+for setting_env_name in [x for x in os.environ.keys() if x.startswith('CRAWLAB_SETTING_')]:
+    setting_name = setting_env_name.replace('CRAWLAB_SETTING_', '')
+    setting_value = os.environ.get(setting_env_name)
+    if setting_value.lower() == 'true':
+        setting_value = True
+    elif setting_value.lower() == 'false':
+        setting_value = False
+    elif re.search(r'^\d+$', setting_value) is not None:
+        setting_value = int(setting_value)
+    elif re.search(r'^\{.*\}$', setting_value.strip()) is not None:
+        setting_value = json.loads(setting_value)
+    elif re.search(r'^\[.*\]$', setting_value.strip()) is not None:
+        setting_value = json.loads(setting_value)
+    else:
+        pass
+    locals()[setting_name] = setting_value
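The loop above lets any CRAWLAB_SETTING_* environment variable (produced by ExecuteShellCmd earlier in this commit) override a Scrapy setting, coercing booleans, integers, and JSON. For readers following along on the Go side, a sketch of the same coercion rules; this is an illustration only, the canonical logic is the Python above:

package main

import (
	"encoding/json"
	"fmt"
	"strconv"
	"strings"
)

// coerce mirrors the Python coercion above: true/false, integers,
// JSON objects/arrays, otherwise the raw string.
func coerce(raw string) interface{} {
	switch strings.ToLower(raw) {
	case "true":
		return true
	case "false":
		return false
	}
	if n, err := strconv.Atoi(raw); err == nil {
		return n
	}
	trimmed := strings.TrimSpace(raw)
	if (strings.HasPrefix(trimmed, "{") && strings.HasSuffix(trimmed, "}")) ||
		(strings.HasPrefix(trimmed, "[") && strings.HasSuffix(trimmed, "]")) {
		var v interface{}
		if err := json.Unmarshal([]byte(trimmed), &v); err == nil {
			return v
		}
	}
	return raw
}

func main() {
	fmt.Println(coerce("false"))          // false
	fmt.Println(coerce("16"))             // 16
	fmt.Println(coerce(`{"retries": 3}`)) // map[retries:3]
}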
backend/template/spiderfile/Spiderfile.163_news (new file, 20 lines)
@@ -0,0 +1,20 @@
+version: "0.4.0"
+name: "toscrapy_books"
+start_url: "http://news.163.com/special/0001386F/rank_news.html"
+start_stage: "list"
+engine: "scrapy"
+stages:
+  list:
+    is_list: true
+    list_css: "table tr:not(:first-child)"
+    fields:
+    - name: "title"
+      css: "td:nth-child(1) > a"
+    - name: "url"
+      css: "td:nth-child(1) > a"
+      attr: "href"
+    - name: "clicks"
+      css: "td.cBlue"
+settings:
+  ROBOTSTXT_OBEY: false
+  USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36
backend/template/spiderfile/Spiderfile.baidu (new file, 22 lines)
@@ -0,0 +1,22 @@
+version: 0.4.0
+name: toscrapy_books
+start_url: http://www.baidu.com/s?wd=crawlab
+start_stage: list
+engine: scrapy
+stages:
+  list:
+    is_list: true
+    list_xpath: //*[contains(@class, "c-container")]
+    page_xpath: //*[@id="page"]//a[@class="n"][last()]
+    page_attr: href
+    fields:
+    - name: title
+      xpath: .//h3/a
+    - name: url
+      xpath: .//h3/a
+      attr: href
+    - name: abstract
+      xpath: .//*[@class="c-abstract"]
+settings:
+  ROBOTSTXT_OBEY: false
+  USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36
@@ -5,10 +5,10 @@ start_stage: "list"
 engine: "scrapy"
 stages:
   list:
-    is_list: true # default: false
+    is_list: true
     list_css: "section article.product_pod"
     page_css: "ul.pager li.next a"
-    page_attr: "href" # default: href
+    page_attr: "href"
     fields:
     - name: "title"
       css: "h3 > a"

@@ -23,3 +23,6 @@ stages:
     fields:
     - name: "description"
       css: "#product_description + p"
+settings:
+  ROBOTSTXT_OBEY: true
+  AUTOTHROTTLE_ENABLED: true
@@ -167,7 +167,6 @@ func DeCompress(srcFile *os.File, dstPath string) error {
 			debug.PrintStack()
 			continue
 		}
-		defer Close(newFile)

 		// copy the file into the new file
 		if _, err := io.Copy(newFile, srcFile); err != nil {