mirror of
https://github.com/crawlab-team/crawlab.git
synced 2026-01-22 17:31:03 +01:00
更新可配置爬虫,修复一些问题
This commit is contained in:
@@ -16,8 +16,9 @@ type Stage struct {
|
||||
}
|
||||
|
||||
type ConfigSpiderData struct {
|
||||
Version string `yaml:"version" json:"version"`
|
||||
Engine string `yaml:"engine" json:"engine"`
|
||||
StartUrl string `yaml:"start_url" json:"start_url"`
|
||||
Stages map[string]Stage `yaml:"stages" json:"stages"`
|
||||
Version string `yaml:"version" json:"version"`
|
||||
Engine string `yaml:"engine" json:"engine"`
|
||||
StartUrl string `yaml:"start_url" json:"start_url"`
|
||||
StartStage string `yaml:"start_stage" json:"start_stage"`
|
||||
Stages map[string]Stage `yaml:"stages" json:"stages"`
|
||||
}
|
||||
|
||||
@@ -15,6 +15,16 @@ func GetAllFields(data entity.ConfigSpiderData) []entity.Field {
|
||||
}
|
||||
|
||||
func GetStartStageName(data entity.ConfigSpiderData) string {
|
||||
// 如果 start_stage 设置了且在 stages 里,则返回
|
||||
if data.StartStage != "" {
|
||||
for stageName := range data.Stages {
|
||||
if stageName == data.StartStage {
|
||||
return data.StartStage
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
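// Otherwise fall back to a stage taken from the map. NOTE(review): Go map iteration order is unspecified, so this is an arbitrary stage, not necessarily the first one declared in the config.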
|
||||
for stageName := range data.Stages {
|
||||
return stageName
|
||||
}
|
||||
|
||||
@@ -33,7 +33,7 @@ func (g ScrapyGenerator) Generate() error {
|
||||
func (g ScrapyGenerator) ProcessItems() error {
|
||||
// 待处理文件名
|
||||
src := g.Spider.Src
|
||||
filePath := filepath.Join(src, "items.py")
|
||||
filePath := filepath.Join(src, "config_spider", "items.py")
|
||||
|
||||
// 获取所有字段
|
||||
fields := g.GetAllFields()
|
||||
@@ -53,7 +53,7 @@ func (g ScrapyGenerator) ProcessItems() error {
|
||||
// 将字段名转化为python代码
|
||||
str := ""
|
||||
for _, fieldName := range fieldNames {
|
||||
line := fmt.Sprintf("%s = scrapy.Field()", fieldName)
|
||||
line := g.PadCode(fmt.Sprintf("%s = scrapy.Field()", fieldName), 1)
|
||||
str += line
|
||||
}
|
||||
|
||||
@@ -69,7 +69,7 @@ func (g ScrapyGenerator) ProcessItems() error {
|
||||
func (g ScrapyGenerator) ProcessSpider() error {
|
||||
// 待处理文件名
|
||||
src := g.Spider.Src
|
||||
filePath := filepath.Join(src, "spiders", "spider.py")
|
||||
filePath := filepath.Join(src, "config_spider", "spiders", "spider.py")
|
||||
|
||||
// 替换 start_stage
|
||||
if err := utils.SetFileVariable(filePath, constants.AnchorStartStage, GetStartStageName(g.ConfigData)); err != nil {
|
||||
@@ -133,22 +133,26 @@ func (g ScrapyGenerator) GetNonListParserString(stage entity.Stage) string {
|
||||
for _, f := range stage.Fields {
|
||||
line := ""
|
||||
if f.Attr == "" {
|
||||
line += fmt.Sprintf(`item['%s'] = response.css('%s::text).extract_first()'`, f.Name, f.Css)
|
||||
line += fmt.Sprintf(`item['%s'] = response.css('%s::text).extract_first()')`, f.Name, f.Css)
|
||||
} else {
|
||||
line += fmt.Sprintf(`item['%s'] = response.css('%s::attr("%s")).extract_first()'`, f.Name, f.Css, f.Attr)
|
||||
line += fmt.Sprintf(`item['%s'] = response.css('%s::attr("%s")).extract_first()')`, f.Name, f.Css, f.Attr)
|
||||
}
|
||||
line = g.PadCode(line, 2)
|
||||
str += line
|
||||
}
|
||||
|
||||
// next stage 字段
|
||||
if f, err := g.GetNextStageField(stage); err == nil {
|
||||
// 如果找到 next stage 字段,进行下一个回调
|
||||
str += g.PadCode(fmt.Sprintf(`yield scrapy.Request(url="item['%s']", callback='%s', meta={'item': item})`, f.Name, f.NextStage), 3)
|
||||
str += g.PadCode(fmt.Sprintf(`yield scrapy.Request(url="item['%s']", callback='%s', meta={'item': item})`, f.Name, f.NextStage), 2)
|
||||
} else {
|
||||
// 如果没找到 next stage 字段,返回 item
|
||||
str += g.PadCode(fmt.Sprintf(`yield item`), 3)
|
||||
str += g.PadCode(fmt.Sprintf(`yield item`), 2)
|
||||
}
|
||||
|
||||
// 加入末尾换行
|
||||
str += g.PadCode("", 0)
|
||||
|
||||
return str
|
||||
}
|
||||
|
||||
@@ -168,11 +172,12 @@ func (g ScrapyGenerator) GetListParserString(stage entity.Stage) string {
|
||||
for _, f := range stage.Fields {
|
||||
line := ""
|
||||
if f.Attr == "" {
|
||||
line += fmt.Sprintf(`item['%s'] = elem.css('%s::text).extract_first()'`, f.Name, f.Css)
|
||||
line += fmt.Sprintf(`item['%s'] = elem.css('%s::text).extract_first()')`, f.Name, f.Css)
|
||||
} else {
|
||||
line += fmt.Sprintf(`item['%s'] = elem.css('%s::attr("%s")).extract_first()'`, f.Name, f.Css, f.Attr)
|
||||
line += fmt.Sprintf(`item['%s'] = elem.css('%s::attr("%s")).extract_first()')`, f.Name, f.Css, f.Attr)
|
||||
}
|
||||
line = g.PadCode(line, 3)
|
||||
str += line
|
||||
}
|
||||
|
||||
// 把前一个 stage 的 item 值赋给当前 item
|
||||
@@ -195,6 +200,9 @@ func (g ScrapyGenerator) GetListParserString(stage entity.Stage) string {
|
||||
str += g.PadCode(`yield scrapy.Request(url=next_url, meta={'item': item})`, 2)
|
||||
}
|
||||
|
||||
// 加入末尾换行
|
||||
str += g.PadCode("", 0)
|
||||
|
||||
return str
|
||||
}
|
||||
|
||||
|
||||
@@ -171,12 +171,13 @@ func UploadConfigSpider(c *gin.Context) {
|
||||
|
||||
srcPath := filepath.Join(tplDir, fInfo.Name())
|
||||
if fInfo.IsDir() {
|
||||
if err := utils.CopyDir(srcPath, spiderDir); err != nil {
|
||||
dirPath := filepath.Join(spiderDir, fInfo.Name())
|
||||
if err := utils.CopyDir(srcPath, dirPath); err != nil {
|
||||
HandleError(http.StatusInternalServerError, c, err)
|
||||
return
|
||||
}
|
||||
} else {
|
||||
if _, err := utils.CopyFile(srcPath, filepath.Join(spiderDir, fInfo.Name())); err != nil {
|
||||
if err := utils.CopyFile(srcPath, filepath.Join(spiderDir, fInfo.Name())); err != nil {
|
||||
HandleError(http.StatusInternalServerError, c, err)
|
||||
return
|
||||
}
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
version: 0.4.0
|
||||
start_url: "https://baidu.com/s?wd=crawlab"
|
||||
start_stage: "stage_4"
|
||||
engine: "scrapy"
|
||||
stages:
|
||||
stage_1:
|
||||
@@ -14,6 +15,7 @@ stages:
|
||||
attr: "href"
|
||||
next_stage: "stage_2"
|
||||
stage_2:
|
||||
list: false
|
||||
is_list: false
|
||||
fields:
|
||||
- name: ""
|
||||
- name: "stage_2_field_1"
|
||||
css: "a"
|
||||
|
||||
@@ -9,5 +9,4 @@ import scrapy
|
||||
|
||||
|
||||
class Item(scrapy.Item):
|
||||
###ITEMS###
|
||||
pass
|
||||
###ITEMS###
|
||||
|
||||
@@ -3,12 +3,12 @@ package utils
|
||||
import (
|
||||
"archive/zip"
|
||||
"bufio"
|
||||
"errors"
|
||||
"fmt"
|
||||
"github.com/apex/log"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"path"
|
||||
"path/filepath"
|
||||
"runtime/debug"
|
||||
"strings"
|
||||
@@ -254,84 +254,64 @@ func _Compress(file *os.File, prefix string, zw *zip.Writer) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// CopyFile copies the file at src to dst, creating dst if it does not exist
// and truncating it if it does. After the contents are copied, dst is given
// the same mode bits as src.
//
// It returns the first error encountered. That includes an error from closing
// dst, because a failed close can mean the data was not fully written.
func CopyFile(src, dst string) (err error) {
	srcfd, err := os.Open(src)
	if err != nil {
		return err
	}
	defer srcfd.Close()

	// Stat through the open handle so the mode describes the file that is
	// actually being read, and so a stat failure is detected before dst is
	// created.
	srcinfo, err := srcfd.Stat()
	if err != nil {
		return err
	}

	dstfd, err := os.Create(dst)
	if err != nil {
		return err
	}
	defer func() {
		// Surface a close failure unless an earlier error is already being
		// returned.
		if cerr := dstfd.Close(); err == nil {
			err = cerr
		}
	}()

	if _, err = io.Copy(dstfd, srcfd); err != nil {
		return err
	}
	return os.Chmod(dst, srcinfo.Mode())
}

// CopyDir recursively copies the directory tree rooted at src into dst.
//
// dst (and any missing parent directories) is created with src's mode. Every
// entry of src is then copied: subdirectories through CopyDir, everything else
// through CopyFile.
//
// Copying the entries is best-effort: a failure on one entry does not stop the
// remaining entries from being copied, but the first such failure is returned
// so the caller can tell that the copy is incomplete. An error is also
// returned, before anything is created, if src cannot be stat'ed or is not a
// directory.
func CopyDir(src string, dst string) error {
	srcinfo, err := os.Stat(src)
	if err != nil {
		return err
	}
	if !srcinfo.IsDir() {
		return fmt.Errorf("copy dir: %s is not a directory", src)
	}

	if err = os.MkdirAll(dst, srcinfo.Mode()); err != nil {
		return err
	}

	fds, err := ioutil.ReadDir(src)
	if err != nil {
		return err
	}

	var firstErr error
	for _, fd := range fds {
		// NOTE(review): path.Join always joins with "/". filepath.Join would
		// be the OS-aware choice, but switching requires checking the file's
		// import list so that "path" is not left unused.
		srcfp := path.Join(src, fd.Name())
		dstfp := path.Join(dst, fd.Name())

		if fd.IsDir() {
			err = CopyDir(srcfp, dstfp)
		} else {
			err = CopyFile(srcfp, dstfp)
		}
		// Keep going so that one bad entry does not block the rest, but
		// remember the first failure to report it to the caller.
		if err != nil && firstErr == nil {
			firstErr = err
		}
	}
	return firstErr
}
|
||||
|
||||
// 设置文件变量值
|
||||
@@ -350,10 +330,10 @@ func SetFileVariable(filePath string, key string, value string) error {
|
||||
content := string(contentBytes)
|
||||
|
||||
// 替换文本
|
||||
content = strings.ReplaceAll(content, fmt.Sprintf("%s%s%s", sep, key, sep), value)
|
||||
content = strings.Replace(content, fmt.Sprintf("%s%s%s", sep, key, sep), value, -1)
|
||||
|
||||
// 打开文件
|
||||
f, err := os.OpenFile(filePath, os.O_WRONLY, 0777)
|
||||
f, err := os.OpenFile(filePath, os.O_WRONLY|os.O_TRUNC, 0777)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user