优先调整xpath顺序

This commit is contained in:
marvzhang
2019-12-04 13:57:27 +08:00
parent 40f6675ce8
commit 6d9c162e9e
5 changed files with 43 additions and 31 deletions

View File

@@ -158,7 +158,7 @@ func (g ScrapyGenerator) GetListParserString(stageName string, stage entity.Stag
str += g.PadCode(`prev_item = response.meta.get('item')`, 2)
// for 循环遍历列表
str += g.PadCode(fmt.Sprintf(`for elem in response.css('%s'):`, stage.ListCss), 2)
str += g.PadCode(fmt.Sprintf(`for elem in response.%s:`, g.GetListString(stage)), 2)
// 构造item
str += g.PadCode(`item = Item()`, 3)
@@ -248,3 +248,11 @@ func (g ScrapyGenerator) GetExtractStringFromStage(stage entity.Stage) string {
return fmt.Sprintf(`xpath('%s/@%s')`, stage.PageXpath, pageAttr)
}
}
// GetListString builds the selector call used to iterate over the elements of
// a list stage; the caller appends it after `response.` in the generated loop.
//
// It returns css('<ListCss>') when stage.ListCss is non-empty, and
// xpath('<ListXpath>') otherwise. When both selectors are set, CSS wins.
//
// NOTE(review): the selector is interpolated verbatim between single quotes;
// a selector containing a single quote would presumably break the generated
// code — confirm whether selectors are sanitized upstream.
func (g ScrapyGenerator) GetListString(stage entity.Stage) string {
	// Guard clause: a configured CSS selector short-circuits the XPath fallback.
	if cssSelector := stage.ListCss; cssSelector != "" {
		return fmt.Sprintf(`css('%s')`, cssSelector)
	}
	return fmt.Sprintf(`xpath('%s')`, stage.ListXpath)
}

View File

@@ -118,8 +118,8 @@ func ValidateSpiderfile(configData entity.ConfigSpiderData) error {
}
// If a stage has is_list = true but neither list_css nor list_xpath is set, report an error
if stage.IsList && stage.ListCss == "" {
return errors.New("spiderfile invalid: stage with is_list = true should have list_css being set")
if stage.IsList && (stage.ListCss == "" && stage.ListXpath == "") {
return errors.New("spiderfile invalid: stage with is_list = true should have either list_css or list_xpath being set")
}
}

View File

@@ -1,22 +1,22 @@
version: "0.4.0"
name: "toscrapy_books"
start_url: "http://www.baidu.com/s?wd=crawlab"
start_stage: "list"
engine: "scrapy"
version: 0.4.0
name: toscrapy_books
start_url: http://www.baidu.com/s?wd=crawlab
start_stage: list
engine: scrapy
stages:
list:
is_list: true
list_css: ".result.c-container"
page_css: "#page a.n:last-child"
page_attr: "href"
list_xpath: //*[contains(@class, "c-container")]
page_xpath: //*[@id="page"]//a[@class="n"][last()]
page_attr: href
fields:
- name: "title"
xpath: ".//h3/a"
- name: "url"
xpath: ".//h3/a"
attr: "href"
- name: "abstract"
css: ".c-abstract"
- name: title
xpath: .//h3/a
- name: url
xpath: .//h3/a
attr: href
- name: abstract
xpath: .//*[@class="c-abstract"]
settings:
ROBOTSTXT_OBEY: false
USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36

View File

@@ -180,20 +180,20 @@
<el-popover v-model="stage.isListOpen" v-if="isList(stage)" placement="top" width="360">
<el-form label-width="120px">
<el-form-item :label="$t('Selector Type')">
<el-tag :class="stage.list_css ? 'active' : 'inactive'" type="success"
<el-tag :class="!stage.list_xpath ? 'active' : 'inactive'" type="success"
@click="onSelectStageListType(stage, 'css')">CSS
</el-tag>
<el-tag :class="!stage.list_css ? 'active' : 'inactive'" type="primary"
<el-tag :class="stage.list_xpath ? 'active' : 'inactive'" type="primary"
@click="onSelectStageListType(stage, 'xpath')">XPath
</el-tag>
</el-form-item>
<el-form-item :label="$t('Selector')" class="list-selector">
<el-input v-if="stage.list_css" v-model="stage.list_css"/>
<el-input v-if="!stage.list_xpath" v-model="stage.list_css"/>
<el-input v-else v-model="stage.list_xpath"/>
</el-form-item>
</el-form>
<el-tag
v-if="stage.list_css"
v-if="!stage.list_xpath"
type="success"
slot="reference"
@click="onClickStageList($event, stage, 'css')"
@@ -228,20 +228,20 @@
<el-popover v-model="stage.isPageOpen" v-if="isPage(stage)" placement="top" width="360">
<el-form label-width="120px">
<el-form-item :label="$t('Selector Type')">
<el-tag :class="stage.page_css ? 'active' : 'inactive'" type="success"
<el-tag :class="!stage.page_xpath ? 'active' : 'inactive'" type="success"
@click="onSelectStagePageType(stage, 'css')">CSS
</el-tag>
<el-tag :class="!stage.page_css ? 'active' : 'inactive'" type="primary"
<el-tag :class="stage.page_xpath ? 'active' : 'inactive'" type="primary"
@click="onSelectStagePageType(stage, 'xpath')">XPath
</el-tag>
</el-form-item>
<el-form-item :label="$t('Selector')" class="page-selector">
<el-input v-if="stage.page_css" v-model="stage.page_css"/>
<el-input v-if="!stage.page_xpath" v-model="stage.page_css"/>
<el-input v-else v-model="stage.page_xpath"/>
</el-form-item>
</el-form>
<el-tag
v-if="stage.page_css"
v-if="!stage.page_xpath"
type="success"
slot="reference"
@click="onClickStagePage($event, stage, 'css')"
@@ -618,10 +618,14 @@ export default {
} else if (this.isXpath) {
newField['xpath'] = '//body'
} else {
newField['css'] = 'body'
newField['xpath'] = '//body'
}
stages[newStageName] = {
name: newStageName,
list_css: this.isCss ? 'body' : '',
list_xpath: this.isXpath ? '//body' : '',
page_css: '',
page_xpath: '',
fields: [newField]
}
this.$set(this.spiderForm.config, 'stages', stages)
@@ -731,7 +735,7 @@ ${f.css || f.xpath} ${f.attr ? ('(' + f.attr + ')') : ''} ${f.next_stage ? (' --
onCheckIsList (value, stage) {
if (value) {
if (!stage.list_css && !stage.list_xpath) {
stage.list_css = 'body'
stage.list_xpath = '//body'
}
} else {
stage.list_css = ''
@@ -756,7 +760,7 @@ ${f.css || f.xpath} ${f.attr ? ('(' + f.attr + ')') : ''} ${f.next_stage ? (' --
onCheckIsPage (value, stage) {
if (value) {
if (!stage.page_css && !stage.page_xpath) {
stage.page_css = 'body'
stage.page_xpath = '//body'
}
} else {
stage.page_css = ''

View File

@@ -223,7 +223,7 @@ export default {
}
if (this.fields.length === 0) {
this.fields.push({
css: 'body',
xpath: '//body',
next_stage: ''
})
}
@@ -233,7 +233,7 @@ export default {
if (row.name === this.fields[i].name) {
this.fields.splice(i + 1, 0, {
name: `field_${Math.floor(new Date().getTime()).toString()}`,
css: 'body',
xpath: '//body',
next_stage: ''
})
break