From 8046d94cc31f702aef2d2b66e84e51b127e826bc Mon Sep 17 00:00:00 2001 From: marvzhang Date: Tue, 26 Nov 2019 12:44:24 +0800 Subject: [PATCH] =?UTF-8?q?=E5=8A=A0=E5=85=A5Spiderfile=E9=99=90=E5=88=B6?= =?UTF-8?q?=E6=9D=A1=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/services/config_spider.go | 39 ++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/backend/services/config_spider.go b/backend/services/config_spider.go index 4e8005a1..91b474f7 100644 --- a/backend/services/config_spider.go +++ b/backend/services/config_spider.go @@ -37,12 +37,17 @@ func ValidateSpiderfile(configData entity.ConfigSpiderData) error { // 校验是否存在 start_url if configData.StartUrl == "" { - return errors.New("spiderfile start_url is empty") + return errors.New("spiderfile invalid: start_url is empty") + } + + // 校验是否存在 start_stage + if configData.StartStage == "" { + return errors.New("spiderfile invalid: start_stage is empty") } // 校验是否存在 stages if len(configData.Stages) == 0 { - return errors.New("spiderfile stages is empty") + return errors.New("spiderfile invalid: stages is empty") } // 校验stages @@ -50,56 +55,64 @@ func ValidateSpiderfile(configData entity.ConfigSpiderData) error { for stageName, stage := range configData.Stages { // stage 名称不能为空 if stageName == "" { - return errors.New("spiderfile stage name is empty") + return errors.New("spiderfile invalid: stage name is empty") } // stage 名称不能为保留字符串 // NOTE: 如果有其他Engine,可以扩展,默认为Scrapy if configData.Engine == "" || configData.Engine == constants.EngineScrapy { if strings.Contains(constants.ScrapyProtectedStageNames, stageName) { - return errors.New(fmt.Sprintf("spiderfile stage name '%s' is protected", stageName)) + return errors.New(fmt.Sprintf("spiderfile invalid: stage name '%s' is protected", stageName)) } - } else if configData.Engine == constants.EngineColly { - return errors.New(fmt.Sprintf("engine '%s' is not implemented", stageName)) + } else { + return errors.New(fmt.Sprintf("spiderfile invalid: engine '%s' is not implemented", configData.Engine)) } // stage 名称不能重复 if dict[stageName] == 1 { - return errors.New("spiderfile stage name should be unique") + return errors.New(fmt.Sprintf("spiderfile invalid: stage name '%s' is duplicated", stageName)) } dict[stageName] = 1 // stage 字段不能为空 if len(stage.Fields) == 0 { - return errors.New(fmt.Sprintf("spiderfile stage '%s' has no fields", stageName)) + return errors.New(fmt.Sprintf("spiderfile invalid: stage '%s' has no fields", stageName)) } - // stage 的下一个 stage 只能有一个 + // 是否包含 next_stage hasNextStage := false + + // 遍历字段列表 for _, field := range stage.Fields { + // stage 的 next stage 只能有一个 if field.NextStage != "" { if hasNextStage { - return errors.New("spiderfile stage fields should have only 1 next_stage") + return errors.New(fmt.Sprintf("spiderfile invalid: stage '%s' has more than 1 next_stage", stageName)) } hasNextStage = true } + + // 字段里 CSS 和 XPath 只能包含一个 + if field.Css != "" && field.Xpath != "" { + return errors.New(fmt.Sprintf("spiderfile invalid: field '%s' in stage '%s' has both CSS and XPath set which is prohibited", field.Name, stageName)) + } } // 如果 stage 的 is_list 为 true 但 list_css 为空,报错 if stage.IsList && stage.ListCss == "" { - return errors.New("spiderfile stage with is_list = true should have list_css being set") + return errors.New("spiderfile invalid: stage with is_list = true should have list_css being set") } } // 校验字段唯一性 if !IsUniqueConfigSpiderFields(fields) { - return errors.New("spiderfile fields not unique") + return errors.New("spiderfile invalid: fields not unique") } // 字段名称不能为保留字符串 for _, field := range fields { if strings.Contains(constants.ScrapyProtectedFieldNames, field.Name) { - return errors.New(fmt.Sprintf("spiderfile field name '%s' is protected", field.Name)) + return errors.New(fmt.Sprintf("spiderfile invalid: field name '%s' is protected", field.Name)) } }