From efa26ab0aa9632e24e84e1420f98e47eebe368fe Mon Sep 17 00:00:00 2001 From: marvzhang Date: Mon, 9 Dec 2019 15:24:16 +0800 Subject: [PATCH 01/39] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E5=AE=9A=E6=97=B6?= =?UTF-8?q?=E4=BB=BB=E5=8A=A1=E6=97=A0=E6=B3=95=E8=B7=91=E5=8F=AF=E9=85=8D?= =?UTF-8?q?=E7=BD=AE=E7=88=AC=E8=99=AB=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- frontend/src/i18n/zh.js | 3 +- frontend/src/views/schedule/ScheduleList.vue | 63 +++++++++++--------- 2 files changed, 37 insertions(+), 29 deletions(-) diff --git a/frontend/src/i18n/zh.js b/frontend/src/i18n/zh.js index d8f1dbe2..1c46b88b 100644 --- a/frontend/src/i18n/zh.js +++ b/frontend/src/i18n/zh.js @@ -259,7 +259,7 @@ export default { 'Executables': '执行文件', // 弹出框 - Notification: '提示', + 'Notification': '提示', 'Are you sure to delete this node?': '你确定要删除该节点?', 'Are you sure to run this spider?': '你确定要运行该爬虫?', 'Node info has been saved successfully': '节点信息已成功保存', @@ -279,6 +279,7 @@ export default { 'Saved successfully': '成功保存', 'Please zip your spider files from the root directory': '爬虫文件请从根目录下开始压缩。', 'English': 'English', + 'Are you sure to delete the schedule task?': '确定删除定时任务?', // 登录 'Sign in': '登录', 'Sign-in': '登录', diff --git a/frontend/src/views/schedule/ScheduleList.vue b/frontend/src/views/schedule/ScheduleList.vue index 0259247e..7a49f808 100644 --- a/frontend/src/views/schedule/ScheduleList.vue +++ b/frontend/src/views/schedule/ScheduleList.vue @@ -14,9 +14,15 @@ - + + + + + + + + - - - - - - - - - - - - + @@ -79,7 +75,7 @@ - + @@ -165,7 +161,7 @@ export default { return { columns: [ { name: 'name', label: 'Name', width: '180' }, - { name: 'cron', label: 'schedules.cron', width: '120' }, + { name: 'cron', label: 'Cron', width: '120' }, { name: 'node_name', label: 'Node', width: '150' }, { name: 'spider_name', label: 'Spider', width: '150' }, { name: 'param', label: 'Parameters', width: '150' }, @@ -246,9 +242,9 @@ export default { 
this.$st.sendEv('定时任务', '修改', 'id', row._id) }, onRemove (row) { - this.$confirm('确定删除定时任务?', '提示', { - confirmButtonText: '确定', - cancelButtonText: '取消', + this.$confirm(this.$t('Are you sure to delete the schedule task?'), this.$t('Notification'), { + confirmButtonText: this.$t('Confirm'), + cancelButtonText: this.$t('Cancel'), type: 'warning' }).then(() => { this.$store.dispatch('schedule/removeSchedule', row._id) @@ -258,15 +254,16 @@ export default { this.$message.success(`Schedule "${row.name}" has been removed`) }, 100) }) - }).catch(() => {}) + }).catch(() => { + }) this.$st.sendEv('定时任务', '删除', 'id', row._id) }, onCrawl (row) { // 停止定时任务 if (!row.status || row.status === 'running') { - this.$confirm('确定停止定时任务?', '提示', { - confirmButtonText: '确定', - cancelButtonText: '取消', + this.$confirm(this.$t('Are you sure to delete the schedule task?'), this.$t('Notification'), { + confirmButtonText: this.$t('Confirm'), + cancelButtonText: this.$t('Cancel'), type: 'warning' }).then(() => { this.$store.dispatch('schedule/stopSchedule', row._id) @@ -280,13 +277,14 @@ export default { message: resp.data.error }) }) - }).catch(() => {}) + }).catch(() => { + }) } // 运行定时任务 if (row.status === 'stop') { - this.$confirm('确定运行定时任务?', '提示', { - confirmButtonText: '确定', - cancelButtonText: '取消', + this.$confirm(this.$t('Are you sure to delete the schedule task?'), this.$t('Notification'), { + confirmButtonText: this.$t('Confirm'), + cancelButtonText: this.$t('Cancel'), type: 'warning' }).then(() => { this.$store.dispatch('schedule/runSchedule', row._id) @@ -300,7 +298,15 @@ export default { message: resp.data.error }) }) - }).catch(() => {}) + }).catch(() => { + }) + } + }, + isDisabledSpider (spider) { + if (spider.type === 'customized') { + return !spider.cmd + } else { + return false } } }, @@ -338,6 +344,7 @@ export default { min-height: 360px; margin-top: 10px; } + .status-tag { cursor: pointer; } From 0cf745767cd8aad1c9b2c53ced3b6ee8f43f8a30 Mon Sep 17 00:00:00 2001 From: 
marvzhang Date: Thu, 12 Dec 2019 13:13:42 +0800 Subject: [PATCH 02/39] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E5=AE=9A=E6=97=B6?= =?UTF-8?q?=E4=BB=BB=E5=8A=A1=E4=B8=8D=E4=B8=80=E8=87=B4=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/model/schedule.go | 106 +++++++++--------- backend/routes/task.go | 3 - backend/services/schedule.go | 111 +++++++++++++------ frontend/src/store/modules/schedule.js | 4 +- frontend/src/views/schedule/ScheduleList.vue | 36 ++++-- 5 files changed, 159 insertions(+), 101 deletions(-) diff --git a/backend/model/schedule.go b/backend/model/schedule.go index 39e1244f..c1923885 100644 --- a/backend/model/schedule.go +++ b/backend/model/schedule.go @@ -12,15 +12,18 @@ import ( ) type Schedule struct { - Id bson.ObjectId `json:"_id" bson:"_id"` - Name string `json:"name" bson:"name"` - Description string `json:"description" bson:"description"` - SpiderId bson.ObjectId `json:"spider_id" bson:"spider_id"` - NodeId bson.ObjectId `json:"node_id" bson:"node_id"` - NodeKey string `json:"node_key" bson:"node_key"` - Cron string `json:"cron" bson:"cron"` - EntryId cron.EntryID `json:"entry_id" bson:"entry_id"` - Param string `json:"param" bson:"param"` + Id bson.ObjectId `json:"_id" bson:"_id"` + Name string `json:"name" bson:"name"` + Description string `json:"description" bson:"description"` + SpiderId bson.ObjectId `json:"spider_id" bson:"spider_id"` + //NodeId bson.ObjectId `json:"node_id" bson:"node_id"` + //NodeKey string `json:"node_key" bson:"node_key"` + Cron string `json:"cron" bson:"cron"` + EntryId cron.EntryID `json:"entry_id" bson:"entry_id"` + Param string `json:"param" bson:"param"` + RunType string `json:"run_type" bson:"run_type"` + NodeIds []bson.ObjectId `json:"node_ids" bson:"node_ids"` + // 状态 Status string `json:"status" bson:"status"` @@ -49,26 +52,26 @@ func (sch *Schedule) Delete() error { return c.RemoveId(sch.Id) } -func (sch *Schedule) 
SyncNodeIdAndSpiderId(node Node, spider Spider) { - sch.syncNodeId(node) - sch.syncSpiderId(spider) -} +//func (sch *Schedule) SyncNodeIdAndSpiderId(node Node, spider Spider) { +// sch.syncNodeId(node) +// sch.syncSpiderId(spider) +//} -func (sch *Schedule) syncNodeId(node Node) { - if node.Id.Hex() == sch.NodeId.Hex() { - return - } - sch.NodeId = node.Id - _ = sch.Save() -} +//func (sch *Schedule) syncNodeId(node Node) { +// if node.Id.Hex() == sch.NodeId.Hex() { +// return +// } +// sch.NodeId = node.Id +// _ = sch.Save() +//} -func (sch *Schedule) syncSpiderId(spider Spider) { - if spider.Id.Hex() == sch.SpiderId.Hex() { - return - } - sch.SpiderId = spider.Id - _ = sch.Save() -} +//func (sch *Schedule) syncSpiderId(spider Spider) { +// if spider.Id.Hex() == sch.SpiderId.Hex() { +// return +// } +// sch.SpiderId = spider.Id +// _ = sch.Save() +//} func GetScheduleList(filter interface{}) ([]Schedule, error) { s, c := database.GetCol("schedules") @@ -81,20 +84,20 @@ func GetScheduleList(filter interface{}) ([]Schedule, error) { var schs []Schedule for _, schedule := range schedules { - // 获取节点名称 - if schedule.NodeId == bson.ObjectIdHex(constants.ObjectIdNull) { - // 选择所有节点 - schedule.NodeName = "All Nodes" - } else { - // 选择单一节点 - node, err := GetNode(schedule.NodeId) - if err != nil { - schedule.Status = constants.ScheduleStatusError - schedule.Message = constants.ScheduleStatusErrorNotFoundNode - } else { - schedule.NodeName = node.Name - } - } + // TODO: 获取节点名称 + //if schedule.NodeId == bson.ObjectIdHex(constants.ObjectIdNull) { + // // 选择所有节点 + // schedule.NodeName = "All Nodes" + //} else { + // // 选择单一节点 + // node, err := GetNode(schedule.NodeId) + // if err != nil { + // schedule.Status = constants.ScheduleStatusError + // schedule.Message = constants.ScheduleStatusErrorNotFoundNode + // } else { + // schedule.NodeName = node.Name + // } + //} // 获取爬虫名称 spider, err := GetSpider(schedule.SpiderId) @@ -130,12 +133,13 @@ func UpdateSchedule(id bson.ObjectId, 
item Schedule) error { if err := c.FindId(id).One(&result); err != nil { return err } - node, err := GetNode(item.NodeId) - if err != nil { - return err - } + //node, err := GetNode(item.NodeId) + //if err != nil { + // return err + //} - item.NodeKey = node.Key + item.UpdateTs = time.Now() + //item.NodeKey = node.Key if err := item.Save(); err != nil { return err } @@ -146,15 +150,15 @@ func AddSchedule(item Schedule) error { s, c := database.GetCol("schedules") defer s.Close() - node, err := GetNode(item.NodeId) - if err != nil { - return err - } + //node, err := GetNode(item.NodeId) + //if err != nil { + // return err + //} item.Id = bson.NewObjectId() item.CreateTs = time.Now() item.UpdateTs = time.Now() - item.NodeKey = node.Key + //item.NodeKey = node.Key if err := c.Insert(&item); err != nil { debug.PrintStack() diff --git a/backend/routes/task.go b/backend/routes/task.go index 6b91ed66..d5e3cacc 100644 --- a/backend/routes/task.go +++ b/backend/routes/task.go @@ -119,7 +119,6 @@ func PutTask(c *gin.Context) { return } } - } else if reqBody.RunType == constants.RunTypeRandom { // 随机 t := model.Task{ @@ -130,7 +129,6 @@ func PutTask(c *gin.Context) { HandleError(http.StatusInternalServerError, c, err) return } - } else if reqBody.RunType == constants.RunTypeSelectedNodes { // 指定节点 for _, nodeId := range reqBody.NodeIds { @@ -145,7 +143,6 @@ func PutTask(c *gin.Context) { return } } - } else { HandleErrorF(http.StatusInternalServerError, c, "invalid run_type") return diff --git a/backend/services/schedule.go b/backend/services/schedule.go index 7a8defde..53938aea 100644 --- a/backend/services/schedule.go +++ b/backend/services/schedule.go @@ -7,7 +7,7 @@ import ( "errors" "github.com/apex/log" "github.com/globalsign/mgo/bson" - "github.com/satori/go.uuid" + uuid "github.com/satori/go.uuid" "runtime/debug" ) @@ -19,48 +19,87 @@ type Scheduler struct { func AddScheduleTask(s model.Schedule) func() { return func() { - node, err := model.GetNodeByKey(s.NodeKey) - 
if err != nil || node.Id.Hex() == "" { - log.Errorf("get node by key error: %s", err.Error()) - debug.PrintStack() - return - } - - spider := model.GetSpiderByName(s.SpiderName) - if spider == nil || spider.Id.Hex() == "" { - log.Errorf("get spider by name error: %s", err.Error()) - debug.PrintStack() - return - } - - // 同步ID到定时任务 - s.SyncNodeIdAndSpiderId(node, *spider) - // 生成任务ID id := uuid.NewV4() - // 生成任务模型 - t := model.Task{ - Id: id.String(), - SpiderId: spider.Id, - NodeId: node.Id, - Status: constants.StatusPending, - Param: s.Param, - } + if s.RunType == constants.RunTypeAllNodes { + // 所有节点 + nodes, err := model.GetNodeList(nil) + if err != nil { + return + } + for _, node := range nodes { + t := model.Task{ + Id: id.String(), + SpiderId: s.SpiderId, + NodeId: node.Id, + Param: s.Param, + } - // 将任务存入数据库 - if err := model.AddTask(t); err != nil { - log.Errorf(err.Error()) - debug.PrintStack() + if err := AddTask(t); err != nil { + return + } + if err := AssignTask(t); err != nil { + log.Errorf(err.Error()) + debug.PrintStack() + return + } + } + } else if s.RunType == constants.RunTypeRandom { + // 随机 + t := model.Task{ + Id: id.String(), + SpiderId: s.SpiderId, + Param: s.Param, + } + if err := AddTask(t); err != nil { + return + } + if err := AssignTask(t); err != nil { + log.Errorf(err.Error()) + debug.PrintStack() + return + } + } else if s.RunType == constants.RunTypeSelectedNodes { + // 指定节点 + for _, nodeId := range s.NodeIds { + t := model.Task{ + Id: id.String(), + SpiderId: s.SpiderId, + NodeId: nodeId, + Param: s.Param, + } + + if err := AddTask(t); err != nil { + return + } + + if err := AssignTask(t); err != nil { + log.Errorf(err.Error()) + debug.PrintStack() + return + } + } + } else { return } - // 加入任务队列 - if err := AssignTask(t); err != nil { - log.Errorf(err.Error()) - debug.PrintStack() - return - } + //node, err := model.GetNodeByKey(s.NodeKey) + //if err != nil || node.Id.Hex() == "" { + // log.Errorf("get node by key error: %s", 
err.Error()) + // debug.PrintStack() + // return + //} + // + //spider := model.GetSpiderByName(s.SpiderName) + //if spider == nil || spider.Id.Hex() == "" { + // log.Errorf("get spider by name error: %s", err.Error()) + // debug.PrintStack() + // return + //} + // + //// 同步ID到定时任务 + //s.SyncNodeIdAndSpiderId(node, *spider) } } diff --git a/frontend/src/store/modules/schedule.js b/frontend/src/store/modules/schedule.js index 7c705ac3..e71d5f03 100644 --- a/frontend/src/store/modules/schedule.js +++ b/frontend/src/store/modules/schedule.js @@ -1,7 +1,9 @@ import request from '../../api/request' const state = { scheduleList: [], - scheduleForm: {} + scheduleForm: { + node_ids: [] + } } const getters = {} diff --git a/frontend/src/views/schedule/ScheduleList.vue b/frontend/src/views/schedule/ScheduleList.vue index 7a49f808..c54b292b 100644 --- a/frontend/src/views/schedule/ScheduleList.vue +++ b/frontend/src/views/schedule/ScheduleList.vue @@ -14,15 +14,15 @@ - - + + - - + + - + - @@ -116,6 +115,13 @@ + + + - + @@ -162,6 +168,7 @@ export default { columns: [ { name: 'name', label: 'Name', width: '180' }, { name: 'cron', label: 'Cron', width: '120' }, + { name: 'run_type', label: 'Run Type', width: '150' }, { name: 'node_name', label: 'Node', width: '150' }, { name: 'spider_name', label: 'Spider', width: '150' }, { name: 'param', label: 'Parameters', width: '150' }, @@ -204,7 +211,7 @@ export default { onAdd () { this.isEdit = false this.dialogVisible = true - this.$store.commit('schedule/SET_SCHEDULE_FORM', {}) + this.$store.commit('schedule/SET_SCHEDULE_FORM', { node_ids: [] }) this.$st.sendEv('定时任务', '添加') }, onAddSubmit () { @@ -308,6 +315,15 @@ export default { } else { return false } + }, + getStatusTooltip (row) { + if (row.status === 'stop') { + return 'Start' + } else if (row.status === 'running') { + return 'Stop' + } else if (row.status === 'error') { + return 'Start' + } } }, created () { From 4a77c7db0d42be37cdba2a6c8667a26203dba587 Mon Sep 17 00:00:00 
2001 From: marvzhang Date: Thu, 12 Dec 2019 13:47:43 +0800 Subject: [PATCH 03/39] updated CHANGELOG.md --- CHANGELOG.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 95ef9cd7..93d315a2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,14 @@ +# 0.4.0 (2019-12-06) +### Features / Enhancement +- **Configurable Spider**. Allow users to add spiders using *Spiderfile* to configure crawling rules. +- **Execution Mode**. Allow users to select 3 modes for task execution: *All Nodes*, *Selected Nodes* and *Random*. + +### Bug Fixes +- **Task accidentally killed**. [#306](https://github.com/crawlab-team/crawlab/issues/306) +- **Documentation fix**. [#301](https://github.com/crawlab-team/crawlab/issues/301) [#258](https://github.com/crawlab-team/crawlab/issues/258) +- **Direct deploy incompatible with Windows**. [#288](https://github.com/crawlab-team/crawlab/issues/288) +- **Log files lost**. [#269](https://github.com/crawlab-team/crawlab/issues/269) + # 0.3.5 (2019-10-28) ### Features / Enhancement - **Graceful Showdown**. 
[detail](https://github.com/crawlab-team/crawlab/commit/63fab3917b5a29fd9770f9f51f1572b9f0420385) From 3ef794f7a2c4ad6e75697d7124a28222bbc66e8a Mon Sep 17 00:00:00 2001 From: marvzhang Date: Fri, 13 Dec 2019 12:55:53 +0800 Subject: [PATCH 04/39] =?UTF-8?q?=E5=B0=86=E5=8F=AF=E9=85=8D=E7=BD=AE?= =?UTF-8?q?=E7=88=AC=E8=99=ABstages=E8=B0=83=E6=95=B4=E4=B8=BA=E5=88=97?= =?UTF-8?q?=E8=A1=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/entity/config_spider.go | 2 +- backend/model/config_spider/common.go | 10 ++--- backend/model/config_spider/scrapy.go | 3 +- backend/model/spider.go | 6 --- backend/services/config_spider.go | 10 ++--- .../template/spiderfile/Spiderfile.163_news | 22 +++++------ backend/template/spiderfile/Spiderfile.baidu | 26 ++++++------- .../spiderfile/Spiderfile.toscrapy_books | 38 +++++++++---------- 8 files changed, 52 insertions(+), 65 deletions(-) diff --git a/backend/entity/config_spider.go b/backend/entity/config_spider.go index 3fe28bc9..d9e085d2 100644 --- a/backend/entity/config_spider.go +++ b/backend/entity/config_spider.go @@ -5,7 +5,7 @@ type ConfigSpiderData struct { Engine string `yaml:"engine" json:"engine"` StartUrl string `yaml:"start_url" json:"start_url"` StartStage string `yaml:"start_stage" json:"start_stage"` - Stages map[string]Stage `yaml:"stages" json:"stages"` + Stages []Stage `yaml:"stages" json:"stages"` Settings map[string]string `yaml:"settings" json:"settings"` } diff --git a/backend/model/config_spider/common.go b/backend/model/config_spider/common.go index c803755a..4d244fe1 100644 --- a/backend/model/config_spider/common.go +++ b/backend/model/config_spider/common.go @@ -15,16 +15,12 @@ func GetAllFields(data entity.ConfigSpiderData) []entity.Field { func GetStartStageName(data entity.ConfigSpiderData) string { // 如果 start_stage 设置了且在 stages 里,则返回 if data.StartStage != "" { - for stageName := range data.Stages { - if stageName == data.StartStage { - return 
data.StartStage - } - } + return data.StartStage } // 否则返回第一个 stage - for stageName := range data.Stages { - return stageName + for _, stage := range data.Stages { + return stage.Name } return "" } diff --git a/backend/model/config_spider/scrapy.go b/backend/model/config_spider/scrapy.go index 6fcb77f0..ee24a3e7 100644 --- a/backend/model/config_spider/scrapy.go +++ b/backend/model/config_spider/scrapy.go @@ -83,7 +83,8 @@ func (g ScrapyGenerator) ProcessSpider() error { // 替换 parsers strParser := "" - for stageName, stage := range g.ConfigData.Stages { + for _, stage := range g.ConfigData.Stages { + stageName := stage.Name stageStr := g.GetParserString(stageName, stage) strParser += stageStr } diff --git a/backend/model/spider.go b/backend/model/spider.go index a0d72c1c..78adc4d0 100644 --- a/backend/model/spider.go +++ b/backend/model/spider.go @@ -319,11 +319,5 @@ func GetConfigSpiderData(spider Spider) (entity.ConfigSpiderData, error) { return configData, err } - // 赋值 stage_name - for stageName, stage := range configData.Stages { - stage.Name = stageName - configData.Stages[stageName] = stage - } - return configData, nil } diff --git a/backend/services/config_spider.go b/backend/services/config_spider.go index 7c736cc7..fe0a3da1 100644 --- a/backend/services/config_spider.go +++ b/backend/services/config_spider.go @@ -61,7 +61,9 @@ func ValidateSpiderfile(configData entity.ConfigSpiderData) error { // 校验stages dict := map[string]int{} - for stageName, stage := range configData.Stages { + for _, stage := range configData.Stages { + stageName := stage.Name + // stage 名称不能为空 if stageName == "" { return errors.New("spiderfile invalid: stage name is empty") @@ -152,12 +154,6 @@ func IsUniqueConfigSpiderFields(fields []entity.Field) bool { func ProcessSpiderFilesFromConfigData(spider model.Spider, configData entity.ConfigSpiderData) error { spiderDir := spider.Src - // 赋值 stage_name - for stageName, stage := range configData.Stages { - stage.Name = stageName - 
configData.Stages[stageName] = stage - } - // 删除已有的爬虫文件 for _, fInfo := range utils.ListDir(spiderDir) { // 不删除Spiderfile diff --git a/backend/template/spiderfile/Spiderfile.163_news b/backend/template/spiderfile/Spiderfile.163_news index 29d58279..c2a73be7 100644 --- a/backend/template/spiderfile/Spiderfile.163_news +++ b/backend/template/spiderfile/Spiderfile.163_news @@ -4,17 +4,17 @@ start_url: "http://news.163.com/special/0001386F/rank_news.html" start_stage: "list" engine: "scrapy" stages: - list: - is_list: true - list_css: "table tr:not(:first-child)" - fields: - - name: "title" - css: "td:nth-child(1) > a" - - name: "url" - css: "td:nth-child(1) > a" - attr: "href" - - name: "clicks" - css: "td.cBlue" +- name: list + is_list: true + list_css: "table tr:not(:first-child)" + fields: + - name: "title" + css: "td:nth-child(1) > a" + - name: "url" + css: "td:nth-child(1) > a" + attr: "href" + - name: "clicks" + css: "td.cBlue" settings: ROBOTSTXT_OBEY: false USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/backend/template/spiderfile/Spiderfile.baidu b/backend/template/spiderfile/Spiderfile.baidu index fbf720e4..5643c980 100644 --- a/backend/template/spiderfile/Spiderfile.baidu +++ b/backend/template/spiderfile/Spiderfile.baidu @@ -4,19 +4,19 @@ start_url: http://www.baidu.com/s?wd=crawlab start_stage: list engine: scrapy stages: - list: - is_list: true - list_xpath: //*[contains(@class, "c-container")] - page_xpath: //*[@id="page"]//a[@class="n"][last()] - page_attr: href - fields: - - name: title - xpath: .//h3/a - - name: url - xpath: .//h3/a - attr: href - - name: abstract - xpath: .//*[@class="c-abstract"] +- name: list + is_list: true + list_xpath: //*[contains(@class, "c-container")] + page_xpath: //*[@id="page"]//a[@class="n"][last()] + page_attr: href + fields: + - name: title + xpath: .//h3/a + - name: url + xpath: .//h3/a + attr: href + - name: abstract 
+ xpath: .//*[@class="c-abstract"] settings: ROBOTSTXT_OBEY: false USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36 diff --git a/backend/template/spiderfile/Spiderfile.toscrapy_books b/backend/template/spiderfile/Spiderfile.toscrapy_books index 4bf18f61..247b4f40 100644 --- a/backend/template/spiderfile/Spiderfile.toscrapy_books +++ b/backend/template/spiderfile/Spiderfile.toscrapy_books @@ -4,25 +4,25 @@ start_url: "http://books.toscrape.com" start_stage: "list" engine: "scrapy" stages: - list: - is_list: true - list_css: "section article.product_pod" - page_css: "ul.pager li.next a" - page_attr: "href" - fields: - - name: "title" - css: "h3 > a" - - name: "url" - css: "h3 > a" - attr: "href" - next_stage: "detail" - - name: "price" - css: ".product_price > .price_color" - detail: - is_list: false - fields: - - name: "description" - css: "#product_description + p" +- name: list + is_list: true + list_css: "section article.product_pod" + page_css: "ul.pager li.next a" + page_attr: "href" + fields: + - name: "title" + css: "h3 > a" + - name: "url" + css: "h3 > a" + attr: "href" + next_stage: "detail" + - name: "price" + css: ".product_price > .price_color" +- name: detail + is_list: false + fields: + - name: "description" + css: "#product_description + p" settings: ROBOTSTXT_OBEY: true AUTOTHROTTLE_ENABLED: true From 127c01f26474d0ff4411c67d86a55d7bdfb06dfc Mon Sep 17 00:00:00 2001 From: marvzhang Date: Fri, 13 Dec 2019 13:01:00 +0800 Subject: [PATCH 05/39] =?UTF-8?q?=E5=B0=86=E5=8F=AF=E9=85=8D=E7=BD=AE?= =?UTF-8?q?=E7=88=AC=E8=99=ABstages=E8=B0=83=E6=95=B4=E4=B8=BA=E5=88=97?= =?UTF-8?q?=E8=A1=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- frontend/src/components/Config/ConfigList.vue | 96 +++++++++++-------- 1 file changed, 56 insertions(+), 40 deletions(-) diff --git a/frontend/src/components/Config/ConfigList.vue 
b/frontend/src/components/Config/ConfigList.vue index 5c7a9dc2..e7c79970 100644 --- a/frontend/src/components/Config/ConfigList.vue +++ b/frontend/src/components/Config/ConfigList.vue @@ -133,9 +133,9 @@ :value="activeNames" >