mirror of
https://github.com/crawlab-team/crawlab.git
synced 2026-01-22 17:31:03 +01:00
优先调整xpath顺序
This commit is contained in:
@@ -158,7 +158,7 @@ func (g ScrapyGenerator) GetListParserString(stageName string, stage entity.Stag
|
||||
str += g.PadCode(`prev_item = response.meta.get('item')`, 2)
|
||||
|
||||
// for 循环遍历列表
|
||||
str += g.PadCode(fmt.Sprintf(`for elem in response.css('%s'):`, stage.ListCss), 2)
|
||||
str += g.PadCode(fmt.Sprintf(`for elem in response.%s:`, g.GetListString(stage)), 2)
|
||||
|
||||
// 构造item
|
||||
str += g.PadCode(`item = Item()`, 3)
|
||||
@@ -248,3 +248,11 @@ func (g ScrapyGenerator) GetExtractStringFromStage(stage entity.Stage) string {
|
||||
return fmt.Sprintf(`xpath('%s/@%s')`, stage.PageXpath, pageAttr)
|
||||
}
|
||||
}
|
||||
|
||||
func (g ScrapyGenerator) GetListString(stage entity.Stage) string {
|
||||
if stage.ListCss != "" {
|
||||
return fmt.Sprintf(`css('%s')`, stage.ListCss)
|
||||
} else {
|
||||
return fmt.Sprintf(`xpath('%s')`, stage.ListXpath)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -118,8 +118,8 @@ func ValidateSpiderfile(configData entity.ConfigSpiderData) error {
|
||||
}
|
||||
|
||||
// Report an error when a stage has is_list = true but neither list_css nor list_xpath is set
|
||||
if stage.IsList && stage.ListCss == "" {
|
||||
return errors.New("spiderfile invalid: stage with is_list = true should have list_css being set")
|
||||
if stage.IsList && (stage.ListCss == "" && stage.ListXpath == "") {
|
||||
return errors.New("spiderfile invalid: stage with is_list = true should have either list_css or list_xpath being set")
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,22 +1,22 @@
|
||||
version: "0.4.0"
|
||||
name: "toscrapy_books"
|
||||
start_url: "http://www.baidu.com/s?wd=crawlab"
|
||||
start_stage: "list"
|
||||
engine: "scrapy"
|
||||
version: 0.4.0
|
||||
name: toscrapy_books
|
||||
start_url: http://www.baidu.com/s?wd=crawlab
|
||||
start_stage: list
|
||||
engine: scrapy
|
||||
stages:
|
||||
list:
|
||||
is_list: true
|
||||
list_css: ".result.c-container"
|
||||
page_css: "#page a.n:last-child"
|
||||
page_attr: "href"
|
||||
list_xpath: //*[contains(@class, "c-container")]
|
||||
page_xpath: //*[@id="page"]//a[@class="n"][last()]
|
||||
page_attr: href
|
||||
fields:
|
||||
- name: "title"
|
||||
xpath: ".//h3/a"
|
||||
- name: "url"
|
||||
xpath: ".//h3/a"
|
||||
attr: "href"
|
||||
- name: "abstract"
|
||||
css: ".c-abstract"
|
||||
- name: title
|
||||
xpath: .//h3/a
|
||||
- name: url
|
||||
xpath: .//h3/a
|
||||
attr: href
|
||||
- name: abstract
|
||||
xpath: .//*[@class="c-abstract"]
|
||||
settings:
|
||||
ROBOTSTXT_OBEY: false
|
||||
USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36
|
||||
|
||||
@@ -180,20 +180,20 @@
|
||||
<el-popover v-model="stage.isListOpen" v-if="isList(stage)" placement="top" width="360">
|
||||
<el-form label-width="120px">
|
||||
<el-form-item :label="$t('Selector Type')">
|
||||
<el-tag :class="stage.list_css ? 'active' : 'inactive'" type="success"
|
||||
<el-tag :class="!stage.list_xpath ? 'active' : 'inactive'" type="success"
|
||||
@click="onSelectStageListType(stage, 'css')">CSS
|
||||
</el-tag>
|
||||
<el-tag :class="!stage.list_css ? 'active' : 'inactive'" type="primary"
|
||||
<el-tag :class="stage.list_xpath ? 'active' : 'inactive'" type="primary"
|
||||
@click="onSelectStageListType(stage, 'xpath')">XPath
|
||||
</el-tag>
|
||||
</el-form-item>
|
||||
<el-form-item :label="$t('Selector')" class="list-selector">
|
||||
<el-input v-if="stage.list_css" v-model="stage.list_css"/>
|
||||
<el-input v-if="!stage.list_xpath" v-model="stage.list_css"/>
|
||||
<el-input v-else v-model="stage.list_xpath"/>
|
||||
</el-form-item>
|
||||
</el-form>
|
||||
<el-tag
|
||||
v-if="stage.list_css"
|
||||
v-if="!stage.list_xpath"
|
||||
type="success"
|
||||
slot="reference"
|
||||
@click="onClickStageList($event, stage, 'css')"
|
||||
@@ -228,20 +228,20 @@
|
||||
<el-popover v-model="stage.isPageOpen" v-if="isPage(stage)" placement="top" width="360">
|
||||
<el-form label-width="120px">
|
||||
<el-form-item :label="$t('Selector Type')">
|
||||
<el-tag :class="stage.page_css ? 'active' : 'inactive'" type="success"
|
||||
<el-tag :class="!stage.page_xpath ? 'active' : 'inactive'" type="success"
|
||||
@click="onSelectStagePageType(stage, 'css')">CSS
|
||||
</el-tag>
|
||||
<el-tag :class="!stage.page_css ? 'active' : 'inactive'" type="primary"
|
||||
<el-tag :class="stage.page_xpath ? 'active' : 'inactive'" type="primary"
|
||||
@click="onSelectStagePageType(stage, 'xpath')">XPath
|
||||
</el-tag>
|
||||
</el-form-item>
|
||||
<el-form-item :label="$t('Selector')" class="page-selector">
|
||||
<el-input v-if="stage.page_css" v-model="stage.page_css"/>
|
||||
<el-input v-if="!stage.page_xpath" v-model="stage.page_css"/>
|
||||
<el-input v-else v-model="stage.page_xpath"/>
|
||||
</el-form-item>
|
||||
</el-form>
|
||||
<el-tag
|
||||
v-if="stage.page_css"
|
||||
v-if="!stage.page_xpath"
|
||||
type="success"
|
||||
slot="reference"
|
||||
@click="onClickStagePage($event, stage, 'css')"
|
||||
@@ -618,10 +618,14 @@ export default {
|
||||
} else if (this.isXpath) {
|
||||
newField['xpath'] = '//body'
|
||||
} else {
|
||||
newField['css'] = 'body'
|
||||
newField['xpath'] = '//body'
|
||||
}
|
||||
stages[newStageName] = {
|
||||
name: newStageName,
|
||||
list_css: this.isCss ? 'body' : '',
|
||||
list_xpath: this.isXpath ? '//body' : '',
|
||||
page_css: '',
|
||||
page_xpath: '',
|
||||
fields: [newField]
|
||||
}
|
||||
this.$set(this.spiderForm.config, 'stages', stages)
|
||||
@@ -731,7 +735,7 @@ ${f.css || f.xpath} ${f.attr ? ('(' + f.attr + ')') : ''} ${f.next_stage ? (' --
|
||||
onCheckIsList (value, stage) {
|
||||
if (value) {
|
||||
if (!stage.list_css && !stage.list_xpath) {
|
||||
stage.list_css = 'body'
|
||||
stage.list_xpath = '//body'
|
||||
}
|
||||
} else {
|
||||
stage.list_css = ''
|
||||
@@ -756,7 +760,7 @@ ${f.css || f.xpath} ${f.attr ? ('(' + f.attr + ')') : ''} ${f.next_stage ? (' --
|
||||
onCheckIsPage (value, stage) {
|
||||
if (value) {
|
||||
if (!stage.page_css && !stage.page_xpath) {
|
||||
stage.page_css = 'body'
|
||||
stage.page_xpath = '//body'
|
||||
}
|
||||
} else {
|
||||
stage.page_css = ''
|
||||
|
||||
@@ -223,7 +223,7 @@ export default {
|
||||
}
|
||||
if (this.fields.length === 0) {
|
||||
this.fields.push({
|
||||
css: 'body',
|
||||
xpath: '//body',
|
||||
next_stage: ''
|
||||
})
|
||||
}
|
||||
@@ -233,7 +233,7 @@ export default {
|
||||
if (row.name === this.fields[i].name) {
|
||||
this.fields.splice(i + 1, 0, {
|
||||
name: `field_${Math.floor(new Date().getTime()).toString()}`,
|
||||
css: 'body',
|
||||
xpath: '//body',
|
||||
next_stage: ''
|
||||
})
|
||||
break
|
||||
|
||||
Reference in New Issue
Block a user