From 6d9c162e9e372d8753e676919f5ab090139bd2b2 Mon Sep 17 00:00:00 2001 From: marvzhang Date: Wed, 4 Dec 2019 13:57:27 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BC=98=E5=85=88=E8=B0=83=E6=95=B4xpath?= =?UTF-8?q?=E9=A1=BA=E5=BA=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/model/config_spider/scrapy.go | 10 ++++++- backend/services/config_spider.go | 4 +-- backend/template/spiderfile/Spiderfile.baidu | 30 +++++++++---------- frontend/src/components/Config/ConfigList.vue | 26 +++++++++------- .../components/TableView/FieldsTableView.vue | 4 +-- 5 files changed, 43 insertions(+), 31 deletions(-) diff --git a/backend/model/config_spider/scrapy.go b/backend/model/config_spider/scrapy.go index bf52d42a..6fcb77f0 100644 --- a/backend/model/config_spider/scrapy.go +++ b/backend/model/config_spider/scrapy.go @@ -158,7 +158,7 @@ func (g ScrapyGenerator) GetListParserString(stageName string, stage entity.Stag str += g.PadCode(`prev_item = response.meta.get('item')`, 2) // for 循环遍历列表 - str += g.PadCode(fmt.Sprintf(`for elem in response.css('%s'):`, stage.ListCss), 2) + str += g.PadCode(fmt.Sprintf(`for elem in response.%s:`, g.GetListString(stage)), 2) // 构造item str += g.PadCode(`item = Item()`, 3) @@ -248,3 +248,11 @@ func (g ScrapyGenerator) GetExtractStringFromStage(stage entity.Stage) string { return fmt.Sprintf(`xpath('%s/@%s')`, stage.PageXpath, pageAttr) } } + +func (g ScrapyGenerator) GetListString(stage entity.Stage) string { + if stage.ListCss != "" { + return fmt.Sprintf(`css('%s')`, stage.ListCss) + } else { + return fmt.Sprintf(`xpath('%s')`, stage.ListXpath) + } +} diff --git a/backend/services/config_spider.go b/backend/services/config_spider.go index d96146f5..adce0531 100644 --- a/backend/services/config_spider.go +++ b/backend/services/config_spider.go @@ -118,8 +118,8 @@ func ValidateSpiderfile(configData entity.ConfigSpiderData) error { } // 如果 stage 的 is_list 为 true 但 list_css 为空,报错 - if stage.IsList && stage.ListCss == "" { - return errors.New("spiderfile invalid: stage with is_list = true should have list_css being set") + if stage.IsList && (stage.ListCss == "" && stage.ListXpath == "") { + return errors.New("spiderfile invalid: stage with is_list = true should have either list_css or list_xpath being set") } } diff --git a/backend/template/spiderfile/Spiderfile.baidu b/backend/template/spiderfile/Spiderfile.baidu index c97e87af..fbf720e4 100644 --- a/backend/template/spiderfile/Spiderfile.baidu +++ b/backend/template/spiderfile/Spiderfile.baidu @@ -1,22 +1,22 @@ -version: "0.4.0" -name: "toscrapy_books" -start_url: "http://www.baidu.com/s?wd=crawlab" -start_stage: "list" -engine: "scrapy" +version: 0.4.0 +name: toscrapy_books +start_url: http://www.baidu.com/s?wd=crawlab +start_stage: list +engine: scrapy stages: list: is_list: true - list_css: ".result.c-container" - page_css: "#page a.n:last-child" - page_attr: "href" + list_xpath: //*[contains(@class, "c-container")] + page_xpath: //*[@id="page"]//a[@class="n"][last()] + page_attr: href fields: - - name: "title" - xpath: ".//h3/a" - - name: "url" - xpath: ".//h3/a" - attr: "href" - - name: "abstract" - css: ".c-abstract" + - name: title + xpath: .//h3/a + - name: url + xpath: .//h3/a + attr: href + - name: abstract + xpath: .//*[@class="c-abstract"] settings: ROBOTSTXT_OBEY: false USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/frontend/src/components/Config/ConfigList.vue b/frontend/src/components/Config/ConfigList.vue index 1fc62d18..bd31441e 100644 --- a/frontend/src/components/Config/ConfigList.vue +++ b/frontend/src/components/Config/ConfigList.vue @@ -180,20 +180,20 @@ - CSS - XPath - + - CSS - XPath - +