From bfce15d54ee1b44c498e0369b0bd507187fc6063 Mon Sep 17 00:00:00 2001 From: marvzhang Date: Tue, 26 Nov 2019 12:44:24 +0800 Subject: [PATCH 01/52] =?UTF-8?q?=E5=8A=A0=E5=85=A5Spiderfile=E9=99=90?= =?UTF-8?q?=E5=88=B6=E6=9D=A1=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/services/config_spider.go | 39 ++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/backend/services/config_spider.go b/backend/services/config_spider.go index 4e8005a1..91b474f7 100644 --- a/backend/services/config_spider.go +++ b/backend/services/config_spider.go @@ -37,12 +37,17 @@ func ValidateSpiderfile(configData entity.ConfigSpiderData) error { // 校验是否存在 start_url if configData.StartUrl == "" { - return errors.New("spiderfile start_url is empty") + return errors.New("spiderfile invalid: start_url is empty") + } + + // 校验是否存在 start_stage + if configData.StartStage == "" { + return errors.New("spiderfile invalid: start_stage is empty") } // 校验是否存在 stages if len(configData.Stages) == 0 { - return errors.New("spiderfile stages is empty") + return errors.New("spiderfile invalid: stages is empty") } // 校验stages @@ -50,56 +55,64 @@ func ValidateSpiderfile(configData entity.ConfigSpiderData) error { for stageName, stage := range configData.Stages { // stage 名称不能为空 if stageName == "" { - return errors.New("spiderfile stage name is empty") + return errors.New("spiderfile invalid: stage name is empty") } // stage 名称不能为保留字符串 // NOTE: 如果有其他Engine,可以扩展,默认为Scrapy if configData.Engine == "" || configData.Engine == constants.EngineScrapy { if strings.Contains(constants.ScrapyProtectedStageNames, stageName) { - return errors.New(fmt.Sprintf("spiderfile stage name '%s' is protected", stageName)) + return errors.New(fmt.Sprintf("spiderfile invalid: stage name '%s' is protected", stageName)) } - } else if configData.Engine == constants.EngineColly { - return errors.New(fmt.Sprintf("engine '%s' is not implemented", stageName)) + } else { + return errors.New(fmt.Sprintf("spiderfile invalid: engine '%s' is not implemented", configData.Engine)) } // stage 名称不能重复 if dict[stageName] == 1 { - return errors.New("spiderfile stage name should be unique") + return errors.New(fmt.Sprintf("spiderfile invalid: stage name '%s' is duplicated", stageName)) } dict[stageName] = 1 // stage 字段不能为空 if len(stage.Fields) == 0 { - return errors.New(fmt.Sprintf("spiderfile stage '%s' has no fields", stageName)) + return errors.New(fmt.Sprintf("spiderfile invalid: stage '%s' has no fields", stageName)) } - // stage 的下一个 stage 只能有一个 + // 是否包含 next_stage hasNextStage := false + + // 遍历字段列表 for _, field := range stage.Fields { + // stage 的 next stage 只能有一个 if field.NextStage != "" { if hasNextStage { - return errors.New("spiderfile stage fields should have only 1 next_stage") + return errors.New(fmt.Sprintf("spiderfile invalid: stage '%s' has more than 1 next_stage", stageName)) } hasNextStage = true } + + // 字段里 CSS 和 XPath 只能包含一个 + if field.Css != "" && field.Xpath != "" { + return errors.New(fmt.Sprintf("spiderfile invalid: field '%s' in stage '%s' has both CSS and XPath set which is prohibited", field.Name, stageName)) + } } // 如果 stage 的 is_list 为 true 但 list_css 为空,报错 if stage.IsList && stage.ListCss == "" { - return errors.New("spiderfile stage with is_list = true should have list_css being set") + return errors.New("spiderfile invalid: stage with is_list = true should have list_css being set") } } // 校验字段唯一性 if !IsUniqueConfigSpiderFields(fields) { - return errors.New("spiderfile fields not unique") + return errors.New("spiderfile invalid: fields not unique") } // 字段名称不能为保留字符串 for _, field := range fields { if strings.Contains(constants.ScrapyProtectedFieldNames, field.Name) { - return errors.New(fmt.Sprintf("spiderfile field name '%s' is protected", field.Name)) + return errors.New(fmt.Sprintf("spiderfile invalid: field name '%s' is protected", field.Name)) } } From b676a84e497c184c1d82b84e4220fed38305e436 Mon Sep 17 00:00:00 2001 From: marvzhang Date: Tue, 26 Nov 2019 13:02:49 +0800 Subject: [PATCH 02/52] =?UTF-8?q?=E5=8A=A0=E5=85=A5xpath?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/entity/config_spider.go | 12 +++--- backend/model/config_spider/scrapy.go | 62 ++++++++++++++++++--------- backend/services/config_spider.go | 14 +++++- 3 files changed, 61 insertions(+), 27 deletions(-) diff --git a/backend/entity/config_spider.go b/backend/entity/config_spider.go index 5e0fe1e1..d46e092e 100644 --- a/backend/entity/config_spider.go +++ b/backend/entity/config_spider.go @@ -9,11 +9,13 @@ type Field struct { } type Stage struct { - IsList bool `yaml:"is_list" json:"is_list"` - ListCss string `yaml:"list_css" json:"list_css"` - PageCss string `yaml:"page_css" json:"page_css"` - PageAttr string `yaml:"page_attr" json:"page_attr"` - Fields []Field `yaml:"fields" json:"fields"` + IsList bool `yaml:"is_list" json:"is_list"` + ListCss string `yaml:"list_css" json:"list_css"` + ListXpath string `yaml:"list_xpath" json:"list_xpath"` + PageCss string `yaml:"page_css" json:"page_css"` + PageXpath string `yaml:"page_xpath" json:"page_xpath"` + PageAttr string `yaml:"page_attr" json:"page_attr"` + Fields []Field `yaml:"fields" json:"fields"` } type ConfigSpiderData struct { diff --git a/backend/model/config_spider/scrapy.go b/backend/model/config_spider/scrapy.go index 7503b9bf..9da6fb89 100644 --- a/backend/model/config_spider/scrapy.go +++ b/backend/model/config_spider/scrapy.go @@ -131,12 +131,7 @@ func (g ScrapyGenerator) GetNonListParserString(stageName string, stage entity.S // 遍历字段列表 for _, f := range stage.Fields { - line := "" - if f.Attr == "" { - line += fmt.Sprintf(`item['%s'] = response.css('%s::text').extract_first()`, f.Name, f.Css) - } else { - line += fmt.Sprintf(`item['%s'] = response.css('%s::attr("%s")').extract_first()`, f.Name, f.Css, f.Attr) - } + line := fmt.Sprintf(`item['%s'] = response.%s.extract_first()`, f.Name, g.GetExtractStringFromField(f)) line = g.PadCode(line, 2) str += line } @@ -170,12 +165,7 @@ func (g ScrapyGenerator) GetListParserString(stageName string, stage entity.Stag // 遍历字段列表 for _, f := range stage.Fields { - line := "" - if f.Attr == "" { - line += fmt.Sprintf(`item['%s'] = elem.css('%s::text').extract_first()`, f.Name, f.Css) - } else { - line += fmt.Sprintf(`item['%s'] = elem.css('%s::attr("%s")').extract_first()`, f.Name, f.Css, f.Attr) - } + line := fmt.Sprintf(`item['%s'] = elem.%s.extract_first()`, f.Name, g.GetExtractStringFromField(f)) line = g.PadCode(line, 3) str += line } @@ -195,14 +185,8 @@ func (g ScrapyGenerator) GetListParserString(stageName string, stage entity.Stag } // 分页 - if stage.PageCss != "" { - // 分页元素属性,默认为 href - pageAttr := "href" - if stage.PageAttr != "" { - pageAttr = stage.PageAttr - } - - str += g.PadCode(fmt.Sprintf(`next_url = response.css('%s::attr("%s")').extract_first()`, stage.PageCss, pageAttr), 2) + if stage.PageCss != "" || stage.PageXpath != "" { + str += g.PadCode(fmt.Sprintf(`next_url = response.%s.extract_first()`, g.GetExtractStringFromStage(stage)), 2) str += g.PadCode(fmt.Sprintf(`yield scrapy.Request(url=get_real_url(response, next_url), callback=self.parse_%s, meta={'item': item})`, stageName), 2) } @@ -226,3 +210,41 @@ func (g ScrapyGenerator) GetNextStageField(stage entity.Stage) (entity.Field, er } return entity.Field{}, errors.New("cannot find next stage field") } + +func (g ScrapyGenerator) GetExtractStringFromField(f entity.Field) string { + if f.Css != "" { + // 如果为CSS + if f.Attr == "" { + // 文本 + return fmt.Sprintf(`css(%s::text())`, f.Css) + } else { + // 属性 + return fmt.Sprintf(`css(%s::attr("%s"))`, f.Css, f.Attr) + } + } else { + // 如果为XPath + if f.Attr == "" { + // 文本 + return fmt.Sprintf(`xpath(%s/text())`, f.Xpath) + } else { + // 属性 + return fmt.Sprintf(`xpath(%s/@%s)`, f.Xpath, f.Attr) + } + } +} + +func (g ScrapyGenerator) GetExtractStringFromStage(stage entity.Stage) string { + // 分页元素属性,默认为 href + pageAttr := "href" + if stage.PageAttr != "" { + pageAttr = stage.PageAttr + } + + if stage.PageCss != "" { + // 如果为CSS + return fmt.Sprintf(`css(%s::attr("%s"))`, stage.PageCss, pageAttr) + } else { + // 如果为XPath + return fmt.Sprintf(`xpath(%s/@%s)`, stage.PageXpath, pageAttr) + } +} diff --git a/backend/services/config_spider.go b/backend/services/config_spider.go index 91b474f7..35bb2790 100644 --- a/backend/services/config_spider.go +++ b/backend/services/config_spider.go @@ -92,12 +92,22 @@ func ValidateSpiderfile(configData entity.ConfigSpiderData) error { hasNextStage = true } - // 字段里 CSS 和 XPath 只能包含一个 + // 字段里 css 和 xpath 只能包含一个 if field.Css != "" && field.Xpath != "" { - return errors.New(fmt.Sprintf("spiderfile invalid: field '%s' in stage '%s' has both CSS and XPath set which is prohibited", field.Name, stageName)) + return errors.New(fmt.Sprintf("spiderfile invalid: field '%s' in stage '%s' has both css and xpath set which is prohibited", field.Name, stageName)) } } + // stage 里 page_css 和 page_xpath 只能包含一个 + if stage.PageCss != "" && stage.PageXpath != "" { + return errors.New(fmt.Sprintf("spiderfile invalid: stage '%s' has both page_css and page_xpath set which is prohibited", stageName)) + } + + // stage 里 list_css 和 list_xpath 只能包含一个 + if stage.ListCss != "" && stage.ListXpath != "" { + return errors.New(fmt.Sprintf("spiderfile invalid: stage '%s' has both list_css and list_xpath set which is prohibited", stageName)) + } + // 如果 stage 的 is_list 为 true 但 list_css 为空,报错 if stage.IsList && stage.ListCss == "" { return errors.New("spiderfile invalid: stage with is_list = true should have list_css being set") From b34ad260d95f709bfd8b7aa0bf34ac2896113536 Mon Sep 17 00:00:00 2001 From: marvzhang Date: Tue, 26 Nov 2019 13:13:31 +0800 Subject: [PATCH 03/52] fixed eslint errors --- frontend/src/components/ScrollView/LogItem.vue | 1 - frontend/src/components/ScrollView/LogView.vue | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/frontend/src/components/ScrollView/LogItem.vue b/frontend/src/components/ScrollView/LogItem.vue index c54d7c37..19e3e082 100644 --- a/frontend/src/components/ScrollView/LogItem.vue +++ b/frontend/src/components/ScrollView/LogItem.vue @@ -25,7 +25,6 @@ export default { } - diff --git a/frontend/src/components/TableView/FieldsTableView.vue b/frontend/src/components/TableView/FieldsTableView.vue index 836a8f49..7b1ef9f5 100644 --- a/frontend/src/components/TableView/FieldsTableView.vue +++ b/frontend/src/components/TableView/FieldsTableView.vue @@ -1,73 +1,104 @@ @@ -250,7 +250,8 @@ export default { onSelectSpider () { this.$st.sendEv('任务', '选择爬虫') }, - onRemove (row) { + onRemove (row, ev) { + ev.stopPropagation() this.$confirm(this.$t('Are you sure to delete this task?'), this.$t('Notification'), { confirmButtonText: this.$t('Confirm'), cancelButtonText: this.$t('Cancel'), From b9c7920b766cf0da935dfbf49b15646bdfaaf8b0 Mon Sep 17 00:00:00 2001 From: marvzhang Date: Thu, 28 Nov 2019 12:54:56 +0800 Subject: [PATCH 19/52] =?UTF-8?q?fixed=20=E9=85=8D=E7=BD=AE=E4=B8=8B?= =?UTF-8?q?=E5=88=92=E7=BA=BF=E4=B8=8D=E6=98=BE=E7=A4=BA=E7=9A=84=E9=97=AE?= =?UTF-8?q?=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- frontend/src/components/Config/ConfigList.vue | 10 ++++++++++ frontend/src/views/spider/SpiderDetail.vue | 8 ++++++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/frontend/src/components/Config/ConfigList.vue b/frontend/src/components/Config/ConfigList.vue index 7b7c7ffd..1c127ca3 100644 --- a/frontend/src/components/Config/ConfigList.vue +++ b/frontend/src/components/Config/ConfigList.vue @@ -538,6 +538,16 @@ export default { }, onTabClick (tab) { this.activeTab = tab.name + }, + update () { + if (this.activeTab !== 'stages') return + + // 手动显示tab下划线 + const elBar = document.querySelector('.el-tabs__active-bar') + const elStages = document.querySelector('#tab-stages') + const totalWidth = Number(getComputedStyle(elStages).width.replace('px', '')) + const paddingRight = Number(getComputedStyle(elStages).paddingRight.replace('px', '')) + elBar.setAttribute('style', 'width:' + (totalWidth - paddingRight) + 'px') } }, created () { diff --git a/frontend/src/views/spider/SpiderDetail.vue b/frontend/src/views/spider/SpiderDetail.vue index b42e750d..29b531ef 100644 --- a/frontend/src/views/spider/SpiderDetail.vue +++ b/frontend/src/views/spider/SpiderDetail.vue @@ -13,8 +13,8 @@ - - + + @@ -77,6 +77,10 @@ export default { setTimeout(() => { this.$refs['spider-stats'].update() }, 0) + } else if (this.activeTabName === 'config') { + setTimeout(() => { + this.$refs['config'].update() + }, 0) } this.$st.sendEv('爬虫详情', '切换标签', tab.name) }, From d71812147a0bb0ff06eaf4ed7e2124995d93da57 Mon Sep 17 00:00:00 2001 From: marvzhang Date: Thu, 28 Nov 2019 13:32:33 +0800 Subject: [PATCH 20/52] =?UTF-8?q?=E4=BC=98=E5=8C=96=E5=8F=AF=E9=85=8D?= =?UTF-8?q?=E7=BD=AE=E7=88=AC=E8=99=AB=E5=89=8D=E7=AB=AF=E7=95=8C=E9=9D=A2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- frontend/src/components/Config/ConfigList.vue | 27 +++++++++++++++++-- .../components/TableView/FieldsTableView.vue | 15 ++++++++++- 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/frontend/src/components/Config/ConfigList.vue b/frontend/src/components/Config/ConfigList.vue index 1c127ca3..e7c7212a 100644 --- a/frontend/src/components/Config/ConfigList.vue +++ b/frontend/src/components/Config/ConfigList.vue @@ -493,6 +493,7 @@ export default { }, series: [ { + animation: false, type: 'graph', layout: 'force', symbolSize: 50, @@ -515,7 +516,7 @@ export default { force: { initLayout: 'force', repulsion: 100, - gravity: 0.01, + gravity: 0.00001, edgeLength: 200 }, // draggable: true, @@ -529,7 +530,29 @@ export default { } } } - ] + ], + tooltip: { + // formatter: '{b0}: {c0}
{b1}: {c1}', + formatter: (params) => { + if (!params.data.fields) return + + let str = '' + str += `

` + str += `
` + str += '
    ' + for (let i = 0; i < params.data.fields.length; i++) { + const f = params.data.fields[i] + str += ` +
  • +${f.name}: +${f.css || f.xpath} ${f.attr ? ('(' + f.attr + ')') : ''} ${f.next_stage ? (' --> ' + '' + f.next_stage + '') : ''} +
  • +` + } + str += '
' + return str + } + } } const el = document.querySelector('#process-chart') this.processChart = echarts.init(el) diff --git a/frontend/src/components/TableView/FieldsTableView.vue b/frontend/src/components/TableView/FieldsTableView.vue index 220deda8..c9f25c89 100644 --- a/frontend/src/components/TableView/FieldsTableView.vue +++ b/frontend/src/components/TableView/FieldsTableView.vue @@ -83,7 +83,7 @@ @@ -93,6 +93,8 @@