added demo spiders

marvzhang
2020-02-03 09:21:41 +08:00
parent 5740774ccc
commit cb6a8f79d8
24 changed files with 504 additions and 87 deletions

View File

@@ -6,9 +6,9 @@ type ConfigSpiderData struct {
    DisplayName string `yaml:"display_name" json:"display_name"`
    Col         string `yaml:"col" json:"col"`
    Remark      string `yaml:"remark" json:"remark"`
    Type        string `yaml:"type" bson:"type"`
    // configurable spider fields
    Version     string `yaml:"version" json:"version"`
    Engine      string `yaml:"engine" json:"engine"`
    StartUrl    string `yaml:"start_url" json:"start_url"`
    StartStage  string `yaml:"start_stage" json:"start_stage"`
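The yaml tags above mirror the top-level keys of the demo Spiderfiles added below. A quick sanity check of a Spiderfile against those keys, sketched in Python — which keys are truly mandatory is an assumption (the realestate demo, for one, omits `type`):

```python
import yaml  # PyYAML

# assumed-required keys; Crawlab itself may be more lenient
REQUIRED = {'name', 'type'}
CONFIGURABLE = {'engine', 'start_url', 'start_stage', 'stages'}

with open('Spiderfile') as f:
    data = yaml.safe_load(f)

missing = REQUIRED - data.keys()
if data.get('type') == 'configurable':
    missing |= CONFIGURABLE - data.keys()
print('missing keys:', ', '.join(sorted(missing)) or 'none')
```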

View File

@@ -6,6 +6,7 @@ import (
    "crawlab/entity"
    "crawlab/model"
    "crawlab/model/config_spider"
+   "crawlab/services/spider_handler"
    "crawlab/utils"
    "errors"
    "fmt"
@@ -227,6 +228,17 @@ func ProcessSpiderFilesFromConfigData(spider model.Spider, configData entity.Con
    spider.FileId = fid
    _ = spider.Save()
+   // get a spider sync instance
+   spiderSync := spider_handler.SpiderSync{
+       Spider: spider,
+   }
+   // get the GridFS file
+   gfFile2 := model.GetGridFs(spider.FileId)
+   // generate the MD5 file
+   spiderSync.CreateMd5File(gfFile2.Md5)
    return nil
}
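For context, the MD5 file written here is what lets worker nodes detect a stale local copy of a spider. A rough sketch of that check, in Python for brevity (the real logic is Go in `spider_handler`; the `md5.txt` filename is an assumption):

```python
import os

# Hypothetical staleness check mirroring what CreateMd5File enables:
# the master records the GridFS MD5, workers compare it against their copy.
def needs_sync(spider_dir: str, gridfs_md5: str) -> bool:
    md5_path = os.path.join(spider_dir, 'md5.txt')  # assumed filename
    if not os.path.exists(md5_path):
        return True  # never synced
    with open(md5_path) as f:
        return f.read().strip() != gridfs_md5  # re-sync on mismatch
```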

View File

@@ -313,30 +313,58 @@ func InitSpiderService() error {
            continue
        }
-       // add the spider to the database
-       spider = model.Spider{
-           Id:          bson.NewObjectId(),
-           Name:        configData.Name,
-           DisplayName: configData.DisplayName,
-           Type:        constants.Customized,
-           Col:         configData.Col,
-           Cmd:         configData.Cmd,
-           Src:         spiderPath,
-           Remark:      configData.Remark,
-           ProjectId:   bson.ObjectIdHex(constants.ObjectIdNull),
-           FileId:      bson.ObjectIdHex(constants.ObjectIdNull),
-       }
-       if err := spider.Add(); err != nil {
-           log.Errorf("add spider error: " + err.Error())
-           debug.PrintStack()
-           continue
-       }
-       // upload the spider to GridFS
-       if err := UploadSpiderToGridFsFromMaster(spider); err != nil {
-           log.Errorf("upload spider error: " + err.Error())
-           debug.PrintStack()
-           continue
-       }
+       if configData.Type == constants.Customized {
+           // add the spider to the database
+           spider = model.Spider{
+               Id:          bson.NewObjectId(),
+               Name:        configData.Name,
+               DisplayName: configData.DisplayName,
+               Type:        constants.Customized,
+               Col:         configData.Col,
+               Src:         spiderPath,
+               Remark:      configData.Remark,
+               ProjectId:   bson.ObjectIdHex(constants.ObjectIdNull),
+               FileId:      bson.ObjectIdHex(constants.ObjectIdNull),
+               Cmd:         configData.Cmd,
+           }
+           if err := spider.Add(); err != nil {
+               log.Errorf("add spider error: " + err.Error())
+               debug.PrintStack()
+               continue
+           }
+           // upload the spider to GridFS
+           if err := UploadSpiderToGridFsFromMaster(spider); err != nil {
+               log.Errorf("upload spider error: " + err.Error())
+               debug.PrintStack()
+               continue
+           }
+       } else if configData.Type == constants.Configurable || configData.Type == "config" {
+           // add the spider to the database
+           spider = model.Spider{
+               Id:          bson.NewObjectId(),
+               Name:        configData.Name,
+               DisplayName: configData.DisplayName,
+               Type:        constants.Configurable,
+               Col:         configData.Col,
+               Src:         spiderPath,
+               Remark:      configData.Remark,
+               ProjectId:   bson.ObjectIdHex(constants.ObjectIdNull),
+               FileId:      bson.ObjectIdHex(constants.ObjectIdNull),
+               Config:      configData,
+           }
+           if err := spider.Add(); err != nil {
+               log.Errorf("add spider error: " + err.Error())
+               debug.PrintStack()
+               continue
+           }
+           // process the spider files from the parsed config data
+           if err := ProcessSpiderFilesFromConfigData(spider, configData); err != nil {
+               log.Errorf("add spider error: " + err.Error())
+               debug.PrintStack()
+               continue
+           }
+       }
    }

View File

@@ -0,0 +1,51 @@
name: "amazon_config"
display_name: "亚马逊中国(可配置)"
remark: "亚马逊中国搜索手机,列表+分页"
type: "configurable"
col: "results_amazon_config"
engine: scrapy
start_url: https://www.amazon.cn/s?k=%E6%89%8B%E6%9C%BA&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&ref=nb_sb_noss_2
start_stage: list
stages:
- name: list
is_list: true
list_css: .s-result-item
list_xpath: ""
page_css: .a-last > a
page_xpath: ""
page_attr: href
fields:
- name: title
css: span.a-text-normal
xpath: ""
attr: ""
next_stage: ""
remark: ""
- name: url
css: .a-link-normal
xpath: ""
attr: href
next_stage: ""
remark: ""
- name: price
css: ""
xpath: .//*[@class="a-price-whole"]
attr: ""
next_stage: ""
remark: ""
- name: price_fraction
css: ""
xpath: .//*[@class="a-price-fraction"]
attr: ""
next_stage: ""
remark: ""
- name: img
css: .s-image-square-aspect > img
xpath: ""
attr: src
next_stage: ""
remark: ""
settings:
ROBOTSTXT_OBEY: "false"
USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML,
like Gecko) Chrome/78.0.3904.108 Safari/537.36
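A Spiderfile like the one above drives a generated Scrapy spider: each stage selects list elements, each field is extracted via `css` or `xpath` (plus an optional `attr`), and `page_css`/`page_attr` define the pagination link. A minimal sketch of that interpretation — `ConfigSpider` and the hard-coded stage dict are illustrative, not Crawlab's actual generator:

```python
import scrapy

# Stage config lifted from the amazon_config Spiderfile above (trimmed).
STAGE = {
    'list_css': '.s-result-item',
    'page_css': '.a-last > a',
    'page_attr': 'href',
    'fields': [
        {'name': 'title', 'css': 'span.a-text-normal', 'attr': ''},
        {'name': 'url', 'css': '.a-link-normal', 'attr': 'href'},
        {'name': 'price', 'xpath': './/*[@class="a-price-whole"]', 'attr': ''},
    ],
}

class ConfigSpider(scrapy.Spider):
    name = 'amazon_config_sketch'
    start_urls = ['https://www.amazon.cn/s?k=%E6%89%8B%E6%9C%BA']

    def parse(self, response):
        for el in response.css(STAGE['list_css']):
            item = {}
            for f in STAGE['fields']:
                if f.get('css'):
                    # attr set -> extract the attribute, else the text content
                    q = f['css'] + ('::attr(%s)' % f['attr'] if f['attr'] else ' ::text')
                    item[f['name']] = el.css(q).get()
                else:
                    q = f['xpath'] + ('/@' + f['attr'] if f['attr'] else '//text()')
                    item[f['name']] = el.xpath(q).get()
            yield item
        # follow the pagination link declared by page_css/page_attr
        next_url = response.css('%s::attr(%s)' % (STAGE['page_css'], STAGE['page_attr'])).get()
        if next_url:
            yield response.follow(next_url, callback=self.parse)
```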

View File

@@ -0,0 +1,57 @@
name: "autohome_config"
display_name: "汽车之家(可配置)"
remark: "汽车之家文章,列表+详情+分页"
type: "configurable"
col: "results_autohome_config"
engine: scrapy
start_url: https://www.autohome.com.cn/all/
start_stage: list
stages:
- name: list
is_list: true
list_css: ul.article > li
list_xpath: ""
page_css: a.page-item-next
page_xpath: ""
page_attr: href
fields:
- name: title
css: li > a > h3
xpath: ""
attr: ""
next_stage: ""
remark: ""
- name: url
css: li > a
xpath: ""
attr: href
next_stage: ""
remark: ""
- name: abstract
css: li > a > p
xpath: ""
attr: ""
next_stage: ""
remark: ""
- name: time
css: li > a .fn-left
xpath: ""
attr: ""
next_stage: ""
remark: ""
- name: views
css: li > a .fn-right > em:first-child
xpath: ""
attr: ""
next_stage: ""
remark: ""
- name: comments
css: li > a .fn-right > em:last-child
xpath: ""
attr: ""
next_stage: ""
remark: ""
settings:
ROBOTSTXT_OBEY: "false"
USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML,
like Gecko) Chrome/78.0.3904.108 Safari/537.36

View File

@@ -0,0 +1,39 @@
name: "baidu_config"
display_name: "百度搜索(可配置)"
remark: "百度搜索Crawlab列表+分页"
type: "configurable"
col: "results_baidu_config"
engine: scrapy
start_url: http://www.baidu.com/s?wd=crawlab
start_stage: list
stages:
- name: list
is_list: true
list_css: ""
list_xpath: //body
page_css: ""
page_xpath: //body
page_attr: href
fields:
- name: title
css: ""
xpath: .//h3/a
attr: href
next_stage: ""
remark: ""
- name: url
css: ""
xpath: .//h3/a
attr: href
next_stage: ""
remark: ""
- name: abstract
css: ""
xpath: .//*[@class="c-abstract"]
attr: href
next_stage: ""
remark: ""
settings:
ROBOTSTXT_OBEY: "false"
USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML,
like Gecko) Chrome/78.0.3904.108 Safari/537.36

View File

@@ -1,4 +1,5 @@
name: "chinaz"
display_name: "站长之家 (Scrapy)"
col: "results_chinaz"
type: "customized"
cmd: "scrapy crawl chinaz_spider"

View File

@@ -0,0 +1,60 @@
name: "csdn_config"
display_name: "CSDN可配置"
remark: "CSDN Crawlab 文章,列表+详情+分页"
type: "configurable"
col: "results_csdn_config"
engine: scrapy
start_url: https://so.csdn.net/so/search/s.do?q=crawlab
start_stage: list
stages:
- name: list
is_list: true
list_css: .search-list-con > .search-list
list_xpath: ""
page_css: a.btn-next
page_xpath: ""
page_attr: href
fields:
- name: url
css: ""
xpath: .//*[@class="limit_width"]/a
attr: href
next_stage: detail
remark: ""
- name: detail
is_list: false
list_css: ""
list_xpath: ""
page_css: ""
page_xpath: ""
page_attr: ""
fields:
- name: content
css: ""
xpath: .//div[@id="content_views"]
attr: ""
next_stage: ""
remark: ""
- name: views
css: .read-count
xpath: ""
attr: ""
next_stage: ""
remark: ""
- name: title
css: .title-article
xpath: ""
attr: ""
next_stage: ""
remark: ""
- name: author
css: .follow-nickName
xpath: ""
attr: ""
next_stage: ""
remark: ""
settings:
AUTOTHROTTLE_ENABLED: "false"
ROBOTSTXT_OBEY: "false"
USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML,
like Gecko) Chrome/79.0.3945.117 Safari/537.36
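Unlike the single-stage configs, this one chains stages: the `next_stage: detail` on the `url` field means each extracted link is followed and parsed by the `detail` stage. Roughly, in the same hypothetical generator terms as the sketch above:

```python
import scrapy

class TwoStageSketch(scrapy.Spider):
    name = 'csdn_config_sketch'
    start_urls = ['https://so.csdn.net/so/search/s.do?q=crawlab']

    def parse(self, response):
        # "list" stage: extract the url field, hand each link to the detail stage
        for row in response.css('.search-list-con > .search-list'):
            url = row.xpath('.//*[@class="limit_width"]/a/@href').get()
            if url:
                yield response.follow(url, callback=self.parse_detail)
        # pagination via page_css/page_attr
        next_page = response.css('a.btn-next::attr(href)').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)

    def parse_detail(self, response):
        # "detail" stage: non-list, fields extracted from the article page
        yield {
            'title': response.css('.title-article::text').get(),
            'author': response.css('.follow-nickName::text').get(),
            'views': response.css('.read-count::text').get(),
            'content': response.xpath('.//div[@id="content_views"]').xpath('string(.)').get(),
        }
```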

View File

@@ -0,0 +1,57 @@
name: "douban_config"
display_name: "豆瓣读书(可配置)"
remark: "豆瓣读书新书推荐,列表"
type: "configurable"
col: "results_douban_config"
engine: scrapy
start_url: https://book.douban.com/latest
start_stage: list
stages:
- name: list
is_list: true
list_css: ul.cover-col-4 > li
list_xpath: ""
page_css: ""
page_xpath: ""
page_attr: ""
fields:
- name: title
css: h2 > a
xpath: ""
attr: ""
next_stage: ""
remark: ""
- name: url
css: h2 > a
xpath: ""
attr: href
next_stage: ""
remark: ""
- name: img
css: a.cover img
xpath: ""
attr: src
next_stage: ""
remark: ""
- name: rating
css: p.rating > .color-lightgray
xpath: ""
attr: ""
next_stage: ""
remark: ""
- name: abstract
css: p:last-child
xpath: ""
attr: ""
next_stage: ""
remark: ""
- name: info
css: .color-gray
xpath: ""
attr: ""
next_stage: ""
remark: ""
settings:
ROBOTSTXT_OBEY: "false"
USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML,
like Gecko) Chrome/78.0.3904.108 Safari/537.36

spiders/jd/Spiderfile Normal file
View File

@@ -0,0 +1,5 @@
name: "jd"
display_name: "京东 (Scrapy)"
col: "results_jd"
type: "customized"
cmd: "scrapy crawl jd_spider"

View File

@@ -12,3 +12,4 @@ class JdItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    price = scrapy.Field()
+   url = scrapy.Field()

View File

@@ -4,14 +4,3 @@
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
-from pymongo import MongoClient
-
-
-class JdPipeline(object):
-    mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT)
-    db = mongo[MONGO_DB]
-    col_name = os.environ.get('CRAWLAB_COLLECTION') or 'jd_products'
-    col = db[col_name]
-
-    def process_item(self, item, spider):
-        return item

View File

@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'jd.spiders'
#USER_AGENT = 'jd (+http://www.yourdomain.com)'

# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
@@ -65,7 +65,7 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
-    'jd.pipelines.JdPipeline': 300,
+    'crawlab.pipelines.CrawlabMongoPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
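With the per-project pipelines gone, all demo projects point at `crawlab.pipelines.CrawlabMongoPipeline` from the Crawlab SDK. A sketch of what such a pipeline plausibly does, modeled on the pipelines removed in this commit (the SDK's actual internals may differ):

```python
import os
from pymongo import MongoClient

class MongoPipelineSketch(object):
    # connection parameters come from the environment Crawlab injects
    mongo = MongoClient(
        host=os.environ.get('MONGO_HOST') or 'localhost',
        port=int(os.environ.get('MONGO_PORT') or 27017),
    )
    db = mongo[os.environ.get('MONGO_DB') or 'crawlab_test']
    col = db[os.environ.get('CRAWLAB_COLLECTION') or 'results']

    def process_item(self, item, spider):
        # tag each item with the task id so results map back to a Crawlab task
        item['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
        self.col.insert_one(dict(item))  # insert_one instead of deprecated save()
        return item
```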

View File

@@ -1,11 +1,21 @@
# -*- coding: utf-8 -*-
import scrapy

+from jd.items import JdItem
+

class JdSpiderSpider(scrapy.Spider):
    name = 'jd_spider'
    allowed_domains = ['jd.com']
    start_urls = ['http://jd.com/']

+    def start_requests(self):
+        for i in range(1, 50):
+            yield scrapy.Request(url=f'https://search.jd.com/Search?keyword=手机&enc=utf-8&page={i}')
+
    def parse(self, response):
-        pass
+        for el in response.css('.gl-item'):
+            yield JdItem(
+                url=el.css('.p-name > a::attr("href")').extract_first(),
+                name=el.css('.p-name > a::attr("title")').extract_first(),
+                price=float(el.css('.p-price i::text').extract_first()),
+            )
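One caveat in the parse body above: `float(...)` raises `TypeError` whenever `.p-price i::text` matches nothing. A slightly safer variant of the method, reusing the imports above:

```python
    def parse(self, response):
        for el in response.css('.gl-item'):
            price_text = el.css('.p-price i::text').extract_first()
            yield JdItem(
                url=el.css('.p-name > a::attr("href")').extract_first(),
                name=el.css('.p-name > a::attr("title")').extract_first(),
                price=float(price_text) if price_text else None,  # skip cast when price is missing
            )
```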

View File

@@ -0,0 +1,4 @@
name: "realestate"
display_name: "链家网 (Scrapy)"
col: "results_realestate"
cmd: "scrapy crawl lianjia"

View File

@@ -0,0 +1,5 @@
name: "sinastock"
display_name: "新浪股票 (Scrapy)"
type: "customized"
col: "results_sinastock"
cmd: "scrapy crawl sinastock_spider"

View File

@@ -4,25 +4,3 @@
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
-import os
-
-from pymongo import MongoClient
-
-
-class SinastockPipeline(object):
-    mongo = MongoClient(
-        host=os.environ.get('MONGO_HOST') or 'localhost',
-        port=int(os.environ.get('MONGO_PORT') or 27017)
-    )
-    db = mongo[os.environ.get('MONGO_DB') or 'crawlab_test']
-    col = db.get_collection(os.environ.get('CRAWLAB_COLLECTION') or 'stock_news')
-
-    # create indexes
-    col.create_index('stocks')
-    col.create_index('url')
-
-    def process_item(self, item, spider):
-        item['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
-        if self.col.find_one({'url': item['url']}) is None:
-            self.col.save(item)
-        return item

View File

@@ -64,7 +64,7 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
-    'sinastock.pipelines.SinastockPipeline': 300,
+    'crawlab.pipelines.CrawlabMongoPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)

View File

@@ -0,0 +1,54 @@
name: "v2ex_config"
display_name: "V2ex可配置"
remark: "V2ex列表+详情"
type: "configurable"
col: "results_v2ex_config"
engine: scrapy
start_url: https://v2ex.com/
start_stage: list
stages:
- name: list
is_list: true
list_css: .cell.item
list_xpath: ""
page_css: ""
page_xpath: ""
page_attr: href
fields:
- name: title
css: a.topic-link
xpath: ""
attr: ""
next_stage: ""
remark: ""
- name: url
css: a.topic-link
xpath: ""
attr: href
next_stage: detail
remark: ""
- name: replies
css: .count_livid
xpath: ""
attr: ""
next_stage: ""
remark: ""
- name: detail
is_list: false
list_css: ""
list_xpath: ""
page_css: ""
page_xpath: ""
page_attr: ""
fields:
- name: content
css: ""
xpath: .//*[@class="markdown_body"]
attr: ""
next_stage: ""
remark: ""
settings:
AUTOTHROTTLE_ENABLED: "true"
ROBOTSTXT_OBEY: "false"
USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML,
like Gecko) Chrome/79.0.3945.117 Safari/537.36

View File

@@ -0,0 +1,5 @@
name: "xueqiu"
display_name: "雪球网 (Scrapy)"
type: "customized"
col: "results_xueqiu"
cmd: "scrapy crawl xueqiu_spider"

View File

@@ -4,26 +4,3 @@
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
-import os
-
-from pymongo import MongoClient
-
-
-class XueqiuPipeline(object):
-    mongo = MongoClient(
-        host=os.environ.get('MONGO_HOST') or 'localhost',
-        port=int(os.environ.get('MONGO_PORT') or 27017)
-    )
-    db = mongo[os.environ.get('MONGO_DB') or 'crawlab_test']
-    col = db.get_collection(os.environ.get('CRAWLAB_COLLECTION') or 'results_xueqiu')
-
-    # create indexes
-    col.create_index('stocks')
-    col.create_index('id')
-    col.create_index('url')
-
-    def process_item(self, item, spider):
-        item['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
-        if self.col.find_one({'id': item['id']}) is None:
-            self.col.save(item)
-        return item

View File

@@ -18,7 +18,7 @@ NEWSPIDER_MODULE = 'xueqiu.spiders'
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'

# Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32
@@ -64,7 +64,7 @@ ROBOTSTXT_OBEY = True
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
-    'xueqiu.pipelines.XueqiuPipeline': 300,
+    'crawlab.pipelines.CrawlabMongoPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)

View File

@@ -0,0 +1,39 @@
name: "xueqiu_config"
display_name: "雪球网(可配置)"
remark: "雪球网新闻,列表"
type: "configurable"
col: "results_xueqiu_config"
engine: scrapy
start_url: https://xueqiu.com/
start_stage: list
stages:
- name: list
is_list: true
list_css: ""
list_xpath: .//*[contains(@class, "AnonymousHome_home__timeline__item")]
page_css: ""
page_xpath: ""
page_attr: ""
fields:
- name: title
css: h3 > a
xpath: ""
attr: ""
next_stage: ""
remark: ""
- name: url
css: h3 > a
xpath: ""
attr: href
next_stage: ""
remark: ""
- name: abstract
css: p
xpath: ""
attr: ""
next_stage: ""
remark: ""
settings:
ROBOTSTXT_OBEY: "false"
USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML,
like Gecko) Chrome/78.0.3904.108 Safari/537.36

View File

@@ -0,0 +1,45 @@
name: "zongheng_config"
display_name: "纵横(可配置)"
remark: "纵横小说网,列表"
type: "configurable"
col: "results_zongheng_config"
engine: scrapy
start_url: http://www.zongheng.com/rank/details.html?rt=1&d=1
start_stage: list
stages:
- name: list
is_list: true
list_css: .rank_d_list
list_xpath: ""
page_css: ""
page_xpath: ""
page_attr: href
fields:
- name: title
css: .rank_d_b_name > a
xpath: ""
attr: ""
next_stage: ""
remark: ""
- name: url
css: .rank_d_b_name > a
xpath: ""
attr: href
next_stage: ""
remark: ""
- name: abstract
css: body
xpath: ""
attr: ""
next_stage: ""
remark: ""
- name: votes
css: .rank_d_b_ticket
xpath: ""
attr: ""
next_stage: ""
remark: ""
settings:
ROBOTSTXT_OBEY: "false"
USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML,
like Gecko) Chrome/78.0.3904.108 Safari/537.36