Add scrapy items.py support

marvzhang
2020-02-21 15:07:54 +08:00
parent 7d5dc27264
commit 38af9e9173
8 changed files with 525 additions and 180 deletions

View File

@@ -6,7 +6,12 @@ type SpiderType struct {
 }
 
 type ScrapySettingParam struct {
-    Key   string
-    Value interface{}
-    Type  string
+    Key   string      `json:"key"`
+    Value interface{} `json:"value"`
+    Type  string      `json:"type"`
 }
+
+type ScrapyItem struct {
+    Name   string   `json:"name"`
+    Fields []string `json:"fields"`
+}
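Note: the new json tags define the wire format used by the items endpoints added below. A minimal sketch of how a ScrapyItem list serializes; the item name and field names are invented for illustration:

// Sketch: the JSON shape implied by the tags on entity.ScrapyItem.
// "ArticleItem", "title", and "url" are hypothetical values.
package main

import (
    "encoding/json"
    "fmt"
)

type ScrapyItem struct {
    Name   string   `json:"name"`
    Fields []string `json:"fields"`
}

func main() {
    items := []ScrapyItem{{Name: "ArticleItem", Fields: []string{"title", "url"}}}
    b, _ := json.Marshal(items)
    fmt.Println(string(b)) // [{"name":"ArticleItem","fields":["title","url"]}]
}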

View File

@@ -178,6 +178,8 @@ func main() {
     authGroup.PUT("/spiders/:id/scrapy/spiders", routes.PutSpiderScrapySpiders)     // Scrapy spider: create spider
     authGroup.GET("/spiders/:id/scrapy/settings", routes.GetSpiderScrapySettings)   // Scrapy spider: get settings
     authGroup.POST("/spiders/:id/scrapy/settings", routes.PostSpiderScrapySettings) // Scrapy spider: update settings
+    authGroup.GET("/spiders/:id/scrapy/items", routes.GetSpiderScrapyItems)         // Scrapy spider: get items
+    authGroup.POST("/spiders/:id/scrapy/items", routes.PostSpiderScrapyItems)       // Scrapy spider: update items
     authGroup.POST("/spiders/:id/git/sync", routes.PostSpiderSyncGit)               // spider: Git sync
     authGroup.POST("/spiders/:id/git/reset", routes.PostSpiderResetGit)             // spider: Git reset
 }
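A sketch of how a client could exercise the two new routes. The base URL and the spider id are assumptions for illustration, and the authentication that authGroup enforces is omitted for brevity:

// Sketch: calling the new items endpoints registered above.
package main

import (
    "bytes"
    "fmt"
    "net/http"
)

func main() {
    base := "http://localhost:8000/api"  // assumed base URL, not from this commit
    id := "5e4fd1a2b3c4d5e6f7a8b9c0"     // hypothetical spider ObjectId hex

    // Fetch the items currently defined in the project's items.py.
    res, err := http.Get(base + "/spiders/" + id + "/scrapy/items")
    if err != nil {
        panic(err)
    }
    res.Body.Close()

    // Replace the item list; the body shape follows entity.ScrapyItem.
    body := []byte(`[{"name":"ArticleItem","fields":["title","url"]}]`)
    res2, err := http.Post(base+"/spiders/"+id+"/scrapy/items", "application/json", bytes.NewReader(body))
    if err != nil {
        panic(err)
    }
    res2.Body.Close()
    fmt.Println(res.Status, res2.Status)
}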

View File

@@ -974,8 +974,9 @@ func GetSpiderScrapySpiders(c *gin.Context) {
 func PutSpiderScrapySpiders(c *gin.Context) {
     type ReqBody struct {
-        Name   string `json:"name"`
-        Domain string `json:"domain"`
+        Name     string `json:"name"`
+        Domain   string `json:"domain"`
+        Template string `json:"template"`
     }
     id := c.Param("id")
@@ -997,7 +998,7 @@ func PutSpiderScrapySpiders(c *gin.Context) {
         return
     }
-    if err := services.CreateScrapySpider(spider, reqBody.Name, reqBody.Domain); err != nil {
+    if err := services.CreateScrapySpider(spider, reqBody.Name, reqBody.Domain, reqBody.Template); err != nil {
         HandleError(http.StatusInternalServerError, c, err)
         return
     }
@@ -1066,6 +1067,64 @@ func PostSpiderScrapySettings(c *gin.Context) {
     })
 }
+
+func GetSpiderScrapyItems(c *gin.Context) {
+    id := c.Param("id")
+    if !bson.IsObjectIdHex(id) {
+        HandleErrorF(http.StatusBadRequest, c, "spider_id is invalid")
+        return
+    }
+    spider, err := model.GetSpider(bson.ObjectIdHex(id))
+    if err != nil {
+        HandleError(http.StatusInternalServerError, c, err)
+        return
+    }
+    data, err := services.GetScrapyItems(spider)
+    if err != nil {
+        HandleError(http.StatusInternalServerError, c, err)
+        return
+    }
+    c.JSON(http.StatusOK, Response{
+        Status:  "ok",
+        Message: "success",
+        Data:    data,
+    })
+}
+
+func PostSpiderScrapyItems(c *gin.Context) {
+    id := c.Param("id")
+    if !bson.IsObjectIdHex(id) {
+        HandleErrorF(http.StatusBadRequest, c, "spider_id is invalid")
+        return
+    }
+    var reqData []entity.ScrapyItem
+    if err := c.ShouldBindJSON(&reqData); err != nil {
+        HandleErrorF(http.StatusBadRequest, c, "invalid request")
+        return
+    }
+    spider, err := model.GetSpider(bson.ObjectIdHex(id))
+    if err != nil {
+        HandleError(http.StatusInternalServerError, c, err)
+        return
+    }
+    if err := services.SaveScrapyItems(spider, reqData); err != nil {
+        HandleError(http.StatusInternalServerError, c, err)
+        return
+    }
+    c.JSON(http.StatusOK, Response{
+        Status:  "ok",
+        Message: "success",
+    })
+}
+
 func PostSpiderSyncGit(c *gin.Context) {
     id := c.Param("id")
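Both handlers reply with the same Response envelope used by the other spider routes. A sketch of decoding a successful GET response, assuming the Response struct's json tags are status/message/data and a data payload shaped like the []map[string]interface{} returned by services.GetScrapyItems:

// Sketch: decoding the success envelope of GET /spiders/:id/scrapy/items.
package main

import (
    "encoding/json"
    "fmt"
)

type itemsResponse struct {
    Status  string                   `json:"status"`  // assumed tag on Response
    Message string                   `json:"message"` // assumed tag on Response
    Data    []map[string]interface{} `json:"data"`    // assumed tag on Response
}

func main() {
    raw := []byte(`{"status":"ok","message":"success","data":[{"name":"ArticleItem","fields":["title","url"]}]}`)
    var resp itemsResponse
    if err := json.Unmarshal(raw, &resp); err != nil {
        panic(err)
    }
    fmt.Println(resp.Status, resp.Data[0]["name"]) // ok ArticleItem
}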

View File

@@ -135,11 +135,77 @@ func SaveScrapySettings(s model.Spider, settingsData []entity.ScrapySettingParam
     return
 }
 
-func CreateScrapySpider(s model.Spider, name string, domain string) (err error) {
+func GetScrapyItems(s model.Spider) (res []map[string]interface{}, err error) {
     var stdout bytes.Buffer
     var stderr bytes.Buffer
-    cmd := exec.Command("scrapy", "genspider", name, domain)
+    cmd := exec.Command("crawlab", "items")
     cmd.Dir = s.Src
     cmd.Stdout = &stdout
     cmd.Stderr = &stderr
     if err := cmd.Run(); err != nil {
         log.Errorf(err.Error())
         log.Errorf(stderr.String())
         debug.PrintStack()
-        return err
+        return res, err
     }
+    if err := json.Unmarshal([]byte(stdout.String()), &res); err != nil {
+        log.Errorf(err.Error())
+        debug.PrintStack()
+        return res, err
+    }
+    return res, nil
+}
+
+func SaveScrapyItems(s model.Spider, itemsData []entity.ScrapyItem) (err error) {
+    // read scrapy.cfg
+    cfg, err := goconfig.LoadConfigFile(path.Join(s.Src, "scrapy.cfg"))
+    if err != nil {
+        return
+    }
+    modName, err := cfg.GetValue("settings", "default")
+    if err != nil {
+        return
+    }
+    // locate the items.py file
+    arr := strings.Split(modName, ".")
+    dirName := arr[0]
+    fileName := "items"
+    filePath := fmt.Sprintf("%s/%s/%s.py", s.Src, dirName, fileName)
+    // generate the file content
+    content := ""
+    content += "import scrapy\n"
+    content += "\n\n"
+    for _, item := range itemsData {
+        content += fmt.Sprintf("class %s(scrapy.Item):\n", item.Name)
+        for _, field := range item.Fields {
+            content += fmt.Sprintf("    %s = scrapy.Field()\n", field)
+        }
+        content += "\n\n"
+    }
+    // write to items.py
+    if err := ioutil.WriteFile(filePath, []byte(content), os.ModePerm); err != nil {
+        return err
+    }
+    // sync to GridFS
+    if err := UploadSpiderToGridFsFromMaster(s); err != nil {
+        return err
+    }
+    return
+}
+
+func CreateScrapySpider(s model.Spider, name string, domain string, template string) (err error) {
+    var stdout bytes.Buffer
+    var stderr bytes.Buffer
+    cmd := exec.Command("scrapy", "genspider", name, domain, "-t", template)
+    cmd.Dir = s.Src
+    cmd.Stdout = &stdout
+    cmd.Stderr = &stderr
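GetScrapyItems shells out to the crawlab items CLI in the spider's source directory and unmarshals its stdout, so that command is expected to print a JSON array. To make the SaveScrapyItems template concrete, this sketch reproduces its generation loop for one hypothetical item (ArticleItem with fields title and url) and prints the items.py text it would write:

// Sketch: the items.py content produced by SaveScrapyItems, built with the
// same fmt.Sprintf template as the function above. Item and field names
// are hypothetical.
package main

import "fmt"

func main() {
    item := struct {
        Name   string
        Fields []string
    }{Name: "ArticleItem", Fields: []string{"title", "url"}}

    content := "import scrapy\n\n\n"
    content += fmt.Sprintf("class %s(scrapy.Item):\n", item.Name)
    for _, field := range item.Fields {
        content += fmt.Sprintf("    %s = scrapy.Field()\n", field)
    }
    content += "\n\n"
    fmt.Print(content)
    // Output:
    // import scrapy
    //
    //
    // class ArticleItem(scrapy.Item):
    //     title = scrapy.Field()
    //     url = scrapy.Field()
}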