diff --git a/backend/entity/config_spider.go b/backend/entity/config_spider.go index a1b49553..b519163b 100644 --- a/backend/entity/config_spider.go +++ b/backend/entity/config_spider.go @@ -16,8 +16,9 @@ type Stage struct { } type ConfigSpiderData struct { - Version string `yaml:"version" json:"version"` - Engine string `yaml:"engine" json:"engine"` - StartUrl string `yaml:"start_url" json:"start_url"` - Stages map[string]Stage `yaml:"stages" json:"stages"` + Version string `yaml:"version" json:"version"` + Engine string `yaml:"engine" json:"engine"` + StartUrl string `yaml:"start_url" json:"start_url"` + StartStage string `yaml:"start_stage" json:"start_stage"` + Stages map[string]Stage `yaml:"stages" json:"stages"` } diff --git a/backend/model/config_spider/common.go b/backend/model/config_spider/common.go index 29b33428..e8440350 100644 --- a/backend/model/config_spider/common.go +++ b/backend/model/config_spider/common.go @@ -15,6 +15,16 @@ func GetAllFields(data entity.ConfigSpiderData) []entity.Field { } func GetStartStageName(data entity.ConfigSpiderData) string { + // 如果 start_stage 设置了且在 stages 里,则返回 + if data.StartStage != "" { + for stageName := range data.Stages { + if stageName == data.StartStage { + return data.StartStage + } + } + } + + // 否则返回第一个 stage for stageName := range data.Stages { return stageName } diff --git a/backend/model/config_spider/scrapy.go b/backend/model/config_spider/scrapy.go index b232168c..5d6eab22 100644 --- a/backend/model/config_spider/scrapy.go +++ b/backend/model/config_spider/scrapy.go @@ -33,7 +33,7 @@ func (g ScrapyGenerator) Generate() error { func (g ScrapyGenerator) ProcessItems() error { // 待处理文件名 src := g.Spider.Src - filePath := filepath.Join(src, "items.py") + filePath := filepath.Join(src, "config_spider", "items.py") // 获取所有字段 fields := g.GetAllFields() @@ -53,7 +53,7 @@ func (g ScrapyGenerator) ProcessItems() error { // 将字段名转化为python代码 str := "" for _, fieldName := range fieldNames { - line := fmt.Sprintf("%s = scrapy.Field()", fieldName) + line := g.PadCode(fmt.Sprintf("%s = scrapy.Field()", fieldName), 1) str += line } @@ -69,7 +69,7 @@ func (g ScrapyGenerator) ProcessItems() error { func (g ScrapyGenerator) ProcessSpider() error { // 待处理文件名 src := g.Spider.Src - filePath := filepath.Join(src, "spiders", "spider.py") + filePath := filepath.Join(src, "config_spider", "spiders", "spider.py") // 替换 start_stage if err := utils.SetFileVariable(filePath, constants.AnchorStartStage, GetStartStageName(g.ConfigData)); err != nil { @@ -133,22 +133,26 @@ func (g ScrapyGenerator) GetNonListParserString(stage entity.Stage) string { for _, f := range stage.Fields { line := "" if f.Attr == "" { - line += fmt.Sprintf(`item['%s'] = response.css('%s::text).extract_first()'`, f.Name, f.Css) + line += fmt.Sprintf(`item['%s'] = response.css('%s::text).extract_first()')`, f.Name, f.Css) } else { - line += fmt.Sprintf(`item['%s'] = response.css('%s::attr("%s")).extract_first()'`, f.Name, f.Css, f.Attr) + line += fmt.Sprintf(`item['%s'] = response.css('%s::attr("%s")).extract_first()')`, f.Name, f.Css, f.Attr) } line = g.PadCode(line, 2) + str += line } // next stage 字段 if f, err := g.GetNextStageField(stage); err == nil { // 如果找到 next stage 字段,进行下一个回调 - str += g.PadCode(fmt.Sprintf(`yield scrapy.Request(url="item['%s']", callback='%s', meta={'item': item})`, f.Name, f.NextStage), 3) + str += g.PadCode(fmt.Sprintf(`yield scrapy.Request(url="item['%s']", callback='%s', meta={'item': item})`, f.Name, f.NextStage), 2) } else { // 如果没找到 next stage 字段,返回 item - str += g.PadCode(fmt.Sprintf(`yield item`), 3) + str += g.PadCode(fmt.Sprintf(`yield item`), 2) } + // 加入末尾换行 + str += g.PadCode("", 0) + return str } @@ -168,11 +172,12 @@ func (g ScrapyGenerator) GetListParserString(stage entity.Stage) string { for _, f := range stage.Fields { line := "" if f.Attr == "" { - line += fmt.Sprintf(`item['%s'] = elem.css('%s::text).extract_first()'`, f.Name, f.Css) + line += fmt.Sprintf(`item['%s'] = elem.css('%s::text).extract_first()')`, f.Name, f.Css) } else { - line += fmt.Sprintf(`item['%s'] = elem.css('%s::attr("%s")).extract_first()'`, f.Name, f.Css, f.Attr) + line += fmt.Sprintf(`item['%s'] = elem.css('%s::attr("%s")).extract_first()')`, f.Name, f.Css, f.Attr) } line = g.PadCode(line, 3) + str += line } // 把前一个 stage 的 item 值赋给当前 item @@ -195,6 +200,9 @@ func (g ScrapyGenerator) GetListParserString(stage entity.Stage) string { str += g.PadCode(`yield scrapy.Request(url=next_url, meta={'item': item})`, 2) } + // 加入末尾换行 + str += g.PadCode("", 0) + return str } diff --git a/backend/routes/config_spider.go b/backend/routes/config_spider.go index d5d8c4a3..bdae068c 100644 --- a/backend/routes/config_spider.go +++ b/backend/routes/config_spider.go @@ -171,12 +171,13 @@ func UploadConfigSpider(c *gin.Context) { srcPath := filepath.Join(tplDir, fInfo.Name()) if fInfo.IsDir() { - if err := utils.CopyDir(srcPath, spiderDir); err != nil { + dirPath := filepath.Join(spiderDir, fInfo.Name()) + if err := utils.CopyDir(srcPath, dirPath); err != nil { HandleError(http.StatusInternalServerError, c, err) return } } else { - if _, err := utils.CopyFile(srcPath, filepath.Join(spiderDir, fInfo.Name())); err != nil { + if err := utils.CopyFile(srcPath, filepath.Join(spiderDir, fInfo.Name())); err != nil { HandleError(http.StatusInternalServerError, c, err) return } diff --git a/backend/template/Spiderfile b/backend/template/Spiderfile index faa3af17..7c9c524c 100644 --- a/backend/template/Spiderfile +++ b/backend/template/Spiderfile @@ -1,5 +1,6 @@ version: 0.4.0 start_url: "https://baidu.com/s?wd=crawlab" +start_stage: "stage_4" engine: "scrapy" stages: stage_1: @@ -14,6 +15,7 @@ stages: attr: "href" next_stage: "stage_2" stage_2: - list: false + is_list: false fields: - - name: "" + - name: "stage_2_field_1" + css: "a" diff --git a/backend/template/scrapy/config_spider/items.py b/backend/template/scrapy/config_spider/items.py index 19e9efa6..16681a52 100644 --- a/backend/template/scrapy/config_spider/items.py +++ b/backend/template/scrapy/config_spider/items.py @@ -9,5 +9,4 @@ import scrapy class Item(scrapy.Item): - ###ITEMS### - pass +###ITEMS### diff --git a/backend/utils/file.go b/backend/utils/file.go index a419404a..82ed01fd 100644 --- a/backend/utils/file.go +++ b/backend/utils/file.go @@ -3,12 +3,12 @@ package utils import ( "archive/zip" "bufio" - "errors" "fmt" "github.com/apex/log" "io" "io/ioutil" "os" + "path" "path/filepath" "runtime/debug" "strings" @@ -254,84 +254,64 @@ func _Compress(file *os.File, prefix string, zw *zip.Writer) error { return nil } -/** - * 拷贝文件夹,同时拷贝文件夹中的文件 - * @param srcPath 需要拷贝的文件夹路径: D:/test - * @param destPath 拷贝到的位置: D:/backup/ - */ -func CopyDir(srcPath string, destPath string) error { - // 检测目录正确性 - if srcInfo, err := os.Stat(srcPath); err != nil { - fmt.Println(err.Error()) - return err - } else { - if !srcInfo.IsDir() { - e := errors.New("srcPath不是一个正确的目录!") - fmt.Println(e.Error()) - return e - } - } - if destInfo, err := os.Stat(destPath); err != nil { - fmt.Println(err.Error()) - return err - } else { - if !destInfo.IsDir() { - e := errors.New("destInfo不是一个正确的目录!") - fmt.Println(e.Error()) - return e - } - } +// File copies a single file from src to dst +func CopyFile(src, dst string) error { + var err error + var srcfd *os.File + var dstfd *os.File + var srcinfo os.FileInfo - err := filepath.Walk(srcPath, func(path string, f os.FileInfo, err error) error { - if f == nil { - return err - } - if !f.IsDir() { - path := strings.Replace(path, "\\", "/", -1) - destNewPath := strings.Replace(path, srcPath, destPath, -1) - _, _ = CopyFile(path, destNewPath) - } - return nil - }) - if err != nil { - fmt.Printf(err.Error()) + if srcfd, err = os.Open(src); err != nil { + return err } - return err + defer srcfd.Close() + + if dstfd, err = os.Create(dst); err != nil { + return err + } + defer dstfd.Close() + + if _, err = io.Copy(dstfd, srcfd); err != nil { + return err + } + if srcinfo, err = os.Stat(src); err != nil { + return err + } + return os.Chmod(dst, srcinfo.Mode()) } -// 生成目录并拷贝文件 -func CopyFile(src, dest string) (w int64, err error) { - srcFile, err := os.Open(src) - if err != nil { - fmt.Println(err.Error()) - return - } - defer srcFile.Close() - // 分割path目录 - destSplitPathDirs := strings.Split(dest, "/") +// Dir copies a whole directory recursively +func CopyDir(src string, dst string) error { + var err error + var fds []os.FileInfo + var srcinfo os.FileInfo - // 检测时候存在目录 - destSplitPath := "" - for index, dir := range destSplitPathDirs { - if index < len(destSplitPathDirs)-1 { - destSplitPath = destSplitPath + dir + "/" - if !Exists(destSplitPath) { - //创建目录 - err := os.Mkdir(destSplitPath, os.ModePerm) - if err != nil { - fmt.Println(err) - } + if srcinfo, err = os.Stat(src); err != nil { + return err + } + + if err = os.MkdirAll(dst, srcinfo.Mode()); err != nil { + return err + } + + if fds, err = ioutil.ReadDir(src); err != nil { + return err + } + for _, fd := range fds { + srcfp := path.Join(src, fd.Name()) + dstfp := path.Join(dst, fd.Name()) + + if fd.IsDir() { + if err = CopyDir(srcfp, dstfp); err != nil { + fmt.Println(err) + } + } else { + if err = CopyFile(srcfp, dstfp); err != nil { + fmt.Println(err) } } } - dstFile, err := os.Create(dest) - if err != nil { - fmt.Println(err.Error()) - return - } - defer dstFile.Close() - - return io.Copy(dstFile, srcFile) + return nil } // 设置文件变量值 @@ -350,10 +330,10 @@ func SetFileVariable(filePath string, key string, value string) error { content := string(contentBytes) // 替换文本 - content = strings.ReplaceAll(content, fmt.Sprintf("%s%s%s", sep, key, sep), value) + content = strings.Replace(content, fmt.Sprintf("%s%s%s", sep, key, sep), value, -1) // 打开文件 - f, err := os.OpenFile(filePath, os.O_WRONLY, 0777) + f, err := os.OpenFile(filePath, os.O_WRONLY|os.O_TRUNC, 0777) if err != nil { return err }