更新可配置爬虫,修复一些问题

This commit is contained in:
marvzhang
2019-11-24 18:51:32 +08:00
parent 38d103da39
commit 6a07afa279
7 changed files with 93 additions and 92 deletions

View File

@@ -16,8 +16,9 @@ type Stage struct {
}
// ConfigSpiderData is the configurable-spider definition as it is read from
// YAML or JSON (see the struct tags for the serialized key names).
type ConfigSpiderData struct {
	// Version is the value of the "version" key.
	Version string `yaml:"version" json:"version"`
	// Engine is the value of the "engine" key.
	Engine string `yaml:"engine" json:"engine"`
	// StartUrl is the value of the "start_url" key.
	StartUrl string `yaml:"start_url" json:"start_url"`
	// StartStage is the value of the "start_stage" key. It is meant to name
	// one of the keys of Stages; it may be empty.
	StartStage string `yaml:"start_stage" json:"start_stage"`
	// Stages maps a stage name to its definition.
	Stages map[string]Stage `yaml:"stages" json:"stages"`
}

View File

@@ -15,6 +15,16 @@ func GetAllFields(data entity.ConfigSpiderData) []entity.Field {
}
func GetStartStageName(data entity.ConfigSpiderData) string {
// 如果 start_stage 设置了且在 stages 里,则返回
if data.StartStage != "" {
for stageName := range data.Stages {
if stageName == data.StartStage {
return data.StartStage
}
}
}
// 否则返回第一个 stage
for stageName := range data.Stages {
return stageName
}

View File

@@ -33,7 +33,7 @@ func (g ScrapyGenerator) Generate() error {
func (g ScrapyGenerator) ProcessItems() error {
// 待处理文件名
src := g.Spider.Src
filePath := filepath.Join(src, "items.py")
filePath := filepath.Join(src, "config_spider", "items.py")
// 获取所有字段
fields := g.GetAllFields()
@@ -53,7 +53,7 @@ func (g ScrapyGenerator) ProcessItems() error {
// 将字段名转化为python代码
str := ""
for _, fieldName := range fieldNames {
line := fmt.Sprintf("%s = scrapy.Field()", fieldName)
line := g.PadCode(fmt.Sprintf("%s = scrapy.Field()", fieldName), 1)
str += line
}
@@ -69,7 +69,7 @@ func (g ScrapyGenerator) ProcessItems() error {
func (g ScrapyGenerator) ProcessSpider() error {
// 待处理文件名
src := g.Spider.Src
filePath := filepath.Join(src, "spiders", "spider.py")
filePath := filepath.Join(src, "config_spider", "spiders", "spider.py")
// 替换 start_stage
if err := utils.SetFileVariable(filePath, constants.AnchorStartStage, GetStartStageName(g.ConfigData)); err != nil {
@@ -133,22 +133,26 @@ func (g ScrapyGenerator) GetNonListParserString(stage entity.Stage) string {
for _, f := range stage.Fields {
line := ""
if f.Attr == "" {
line += fmt.Sprintf(`item['%s'] = response.css('%s::text).extract_first()'`, f.Name, f.Css)
line += fmt.Sprintf(`item['%s'] = response.css('%s::text).extract_first()')`, f.Name, f.Css)
} else {
line += fmt.Sprintf(`item['%s'] = response.css('%s::attr("%s")).extract_first()'`, f.Name, f.Css, f.Attr)
line += fmt.Sprintf(`item['%s'] = response.css('%s::attr("%s")).extract_first()')`, f.Name, f.Css, f.Attr)
}
line = g.PadCode(line, 2)
str += line
}
// next stage 字段
if f, err := g.GetNextStageField(stage); err == nil {
// 如果找到 next stage 字段,进行下一个回调
str += g.PadCode(fmt.Sprintf(`yield scrapy.Request(url="item['%s']", callback='%s', meta={'item': item})`, f.Name, f.NextStage), 3)
str += g.PadCode(fmt.Sprintf(`yield scrapy.Request(url="item['%s']", callback='%s', meta={'item': item})`, f.Name, f.NextStage), 2)
} else {
// 如果没找到 next stage 字段,返回 item
str += g.PadCode(fmt.Sprintf(`yield item`), 3)
str += g.PadCode(fmt.Sprintf(`yield item`), 2)
}
// 加入末尾换行
str += g.PadCode("", 0)
return str
}
@@ -168,11 +172,12 @@ func (g ScrapyGenerator) GetListParserString(stage entity.Stage) string {
for _, f := range stage.Fields {
line := ""
if f.Attr == "" {
line += fmt.Sprintf(`item['%s'] = elem.css('%s::text).extract_first()'`, f.Name, f.Css)
line += fmt.Sprintf(`item['%s'] = elem.css('%s::text).extract_first()')`, f.Name, f.Css)
} else {
line += fmt.Sprintf(`item['%s'] = elem.css('%s::attr("%s")).extract_first()'`, f.Name, f.Css, f.Attr)
line += fmt.Sprintf(`item['%s'] = elem.css('%s::attr("%s")).extract_first()')`, f.Name, f.Css, f.Attr)
}
line = g.PadCode(line, 3)
str += line
}
// 把前一个 stage 的 item 值赋给当前 item
@@ -195,6 +200,9 @@ func (g ScrapyGenerator) GetListParserString(stage entity.Stage) string {
str += g.PadCode(`yield scrapy.Request(url=next_url, meta={'item': item})`, 2)
}
// 加入末尾换行
str += g.PadCode("", 0)
return str
}

View File

@@ -171,12 +171,13 @@ func UploadConfigSpider(c *gin.Context) {
srcPath := filepath.Join(tplDir, fInfo.Name())
if fInfo.IsDir() {
if err := utils.CopyDir(srcPath, spiderDir); err != nil {
dirPath := filepath.Join(spiderDir, fInfo.Name())
if err := utils.CopyDir(srcPath, dirPath); err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}
} else {
if _, err := utils.CopyFile(srcPath, filepath.Join(spiderDir, fInfo.Name())); err != nil {
if err := utils.CopyFile(srcPath, filepath.Join(spiderDir, fInfo.Name())); err != nil {
HandleError(http.StatusInternalServerError, c, err)
return
}

View File

@@ -1,5 +1,6 @@
version: 0.4.0
start_url: "https://baidu.com/s?wd=crawlab"
start_stage: "stage_4"
engine: "scrapy"
stages:
stage_1:
@@ -14,6 +15,7 @@ stages:
attr: "href"
next_stage: "stage_2"
stage_2:
list: false
is_list: false
fields:
- name: ""
- name: "stage_2_field_1"
css: "a"

View File

@@ -9,5 +9,4 @@ import scrapy
class Item(scrapy.Item):
###ITEMS###
pass
###ITEMS###

View File

@@ -3,12 +3,12 @@ package utils
import (
"archive/zip"
"bufio"
"errors"
"fmt"
"github.com/apex/log"
"io"
"io/ioutil"
"os"
"path"
"path/filepath"
"runtime/debug"
"strings"
@@ -254,84 +254,64 @@ func _Compress(file *os.File, prefix string, zw *zip.Writer) error {
return nil
}
// CopyFile copies the single file at src to dst.
//
// dst is created if it does not exist and truncated if it does. After the
// contents are copied, dst receives the mode of src. The parent
// directory of dst must already exist.
//
// It returns the first error encountered while opening, copying, closing or
// changing the mode of the files.
func CopyFile(src, dst string) error {
	srcfd, err := os.Open(src)
	if err != nil {
		return err
	}
	defer srcfd.Close()

	// Stat through the open handle so the mode belongs to the file that is
	// actually being read, and so a failure here happens before dst is touched.
	srcinfo, err := srcfd.Stat()
	if err != nil {
		return err
	}

	dstfd, err := os.Create(dst)
	if err != nil {
		return err
	}
	if _, err = io.Copy(dstfd, srcfd); err != nil {
		// The copy error is the one worth reporting; the close error is secondary.
		dstfd.Close()
		return err
	}
	// Close explicitly (not deferred) so a failure reported at close time is
	// returned to the caller instead of being dropped.
	if err = dstfd.Close(); err != nil {
		return err
	}

	return os.Chmod(dst, srcinfo.Mode())
}

// CopyDir recursively copies the directory src to dst.
//
// dst (and any missing parents) is created with the mode of src. Every entry
// of src is then copied: sub-directories through CopyDir, everything else
// through CopyFile.
//
// A failure on one entry does not stop the remaining entries from being
// copied, but the first such failure is returned so the caller can tell that
// the copy is incomplete. An error is also returned, before dst is created,
// when src cannot be inspected or is not a directory.
func CopyDir(src string, dst string) error {
	srcinfo, err := os.Stat(src)
	if err != nil {
		return err
	}
	if !srcinfo.IsDir() {
		return fmt.Errorf("%s is not a directory", src)
	}

	if err = os.MkdirAll(dst, srcinfo.Mode()); err != nil {
		return err
	}

	fds, err := ioutil.ReadDir(src)
	if err != nil {
		return err
	}

	var firstErr error
	for _, fd := range fds {
		// NOTE(review): path.Join always joins with forward slashes;
		// filepath.Join would use the OS separator. Kept as-is because the
		// "path" import may have no other user in this file — confirm before
		// switching.
		srcfp := path.Join(src, fd.Name())
		dstfp := path.Join(dst, fd.Name())

		if fd.IsDir() {
			err = CopyDir(srcfp, dstfp)
		} else {
			err = CopyFile(srcfp, dstfp)
		}
		// Keep going so one bad entry does not block the rest, but remember
		// the first failure instead of discarding it.
		if err != nil && firstErr == nil {
			firstErr = err
		}
	}
	return firstErr
}
// 设置文件变量值
@@ -350,10 +330,10 @@ func SetFileVariable(filePath string, key string, value string) error {
content := string(contentBytes)
// 替换文本
content = strings.ReplaceAll(content, fmt.Sprintf("%s%s%s", sep, key, sep), value)
content = strings.Replace(content, fmt.Sprintf("%s%s%s", sep, key, sep), value, -1)
// 打开文件
f, err := os.OpenFile(filePath, os.O_WRONLY, 0777)
f, err := os.OpenFile(filePath, os.O_WRONLY|os.O_TRUNC, 0777)
if err != nil {
return err
}