From 6b708683269a2600008d6626a32bb3d18fcd6add Mon Sep 17 00:00:00 2001
From: marvzhang
Date: Mon, 3 Feb 2020 09:21:41 +0800
Subject: [PATCH] added demo spiders

---
 backend/entity/config_spider.go          |  2 +-
 backend/services/config_spider.go        | 12 ++++
 backend/services/spider.go               | 74 ++++++++++++++++--------
 spiders/amazon_config/Spiderfile         | 51 ++++++++++++++++
 spiders/autohome_config/Spiderfile       | 57 ++++++++++++++++++
 spiders/baidu_config/Spiderfile          | 39 +++++++++++++
 spiders/chinaz/Spiderfile                |  1 +
 spiders/csdn_config/Spiderfile           | 60 +++++++++++++++++++
 spiders/douban_config/Spiderfile         | 57 ++++++++++++++++++
 spiders/jd/Spiderfile                    |  5 ++
 spiders/jd/jd/items.py                   |  1 +
 spiders/jd/jd/pipelines.py               | 11 ----
 spiders/jd/jd/settings.py                |  4 +-
 spiders/jd/jd/spiders/jd_spider.py       | 14 ++++-
 spiders/realestate/Spiderfile            |  4 ++
 spiders/sinastock/Spiderfile             |  5 ++
 spiders/sinastock/sinastock/pipelines.py | 22 -------
 spiders/sinastock/sinastock/settings.py  |  2 +-
 spiders/v2ex_config/Spiderfile           | 54 +++++++++++++++++
 spiders/xueqiu/Spiderfile                |  5 ++
 spiders/xueqiu/xueqiu/pipelines.py       | 23 --------
 spiders/xueqiu/xueqiu/settings.py        |  4 +-
 spiders/xueqiu_config/Spiderfile         | 39 +++++++++++++
 spiders/zongheng_config/Spiderfile       | 45 ++++++++++++++
 24 files changed, 504 insertions(+), 87 deletions(-)
 create mode 100644 spiders/amazon_config/Spiderfile
 create mode 100644 spiders/autohome_config/Spiderfile
 create mode 100644 spiders/baidu_config/Spiderfile
 create mode 100644 spiders/csdn_config/Spiderfile
 create mode 100644 spiders/douban_config/Spiderfile
 create mode 100644 spiders/jd/Spiderfile
 create mode 100644 spiders/realestate/Spiderfile
 create mode 100644 spiders/sinastock/Spiderfile
 create mode 100644 spiders/v2ex_config/Spiderfile
 create mode 100644 spiders/xueqiu/Spiderfile
 create mode 100644 spiders/xueqiu_config/Spiderfile
 create mode 100644 spiders/zongheng_config/Spiderfile

diff --git a/backend/entity/config_spider.go b/backend/entity/config_spider.go
index bb1295e7..054ee2fe 100644
--- a/backend/entity/config_spider.go
+++ b/backend/entity/config_spider.go
@@ -6,9 +6,9 @@ type ConfigSpiderData struct {
 	DisplayName string `yaml:"display_name" json:"display_name"`
 	Col         string `yaml:"col" json:"col"`
 	Remark      string `yaml:"remark" json:"remark"`
+	Type        string `yaml:"type" bson:"type"`
 
 	// 可配置爬虫
-	Version    string `yaml:"version" json:"version"`
 	Engine     string `yaml:"engine" json:"engine"`
 	StartUrl   string `yaml:"start_url" json:"start_url"`
 	StartStage string `yaml:"start_stage" json:"start_stage"`
diff --git a/backend/services/config_spider.go b/backend/services/config_spider.go
index fe0a3da1..29e1c2ca 100644
--- a/backend/services/config_spider.go
+++ b/backend/services/config_spider.go
@@ -6,6 +6,7 @@ import (
 	"crawlab/entity"
 	"crawlab/model"
 	"crawlab/model/config_spider"
+	"crawlab/services/spider_handler"
 	"crawlab/utils"
 	"errors"
 	"fmt"
@@ -227,6 +228,17 @@ func ProcessSpiderFilesFromConfigData(spider model.Spider, configData entity.Con
 	spider.FileId = fid
 	_ = spider.Save()
 
+	// 获取爬虫同步实例
+	spiderSync := spider_handler.SpiderSync{
+		Spider: spider,
+	}
+
+	// 获取gfFile
+	gfFile2 := model.GetGridFs(spider.FileId)
+
+	// 生成MD5
+	spiderSync.CreateMd5File(gfFile2.Md5)
+
 	return nil
 }
diff --git a/backend/services/spider.go b/backend/services/spider.go
index b395a956..48777042 100644
--- a/backend/services/spider.go
+++ b/backend/services/spider.go
@@ -313,30 +313,58 @@ func InitSpiderService() error {
 			continue
 		}
 
-		// 添加该爬虫到数据库
-		spider = model.Spider{
-			Id:          bson.NewObjectId(),
-			Name:        configData.Name,
-			DisplayName: configData.DisplayName,
-			Type:        constants.Customized,
-			Col:         configData.Col,
-			Cmd:         configData.Cmd,
-			Src:         spiderPath,
-			Remark:      configData.Remark,
-			ProjectId:   bson.ObjectIdHex(constants.ObjectIdNull),
-			FileId:      bson.ObjectIdHex(constants.ObjectIdNull),
-		}
-		if err := spider.Add(); err != nil {
-			log.Errorf("add spider error: " + err.Error())
-			debug.PrintStack()
-			continue
-		}
+		if configData.Type == constants.Customized {
+			// 添加该爬虫到数据库
+			spider = model.Spider{
+				Id:          bson.NewObjectId(),
+				Name:        configData.Name,
+				DisplayName: configData.DisplayName,
+				Type:        constants.Customized,
+				Col:         configData.Col,
+				Src:         spiderPath,
+				Remark:      configData.Remark,
+				ProjectId:   bson.ObjectIdHex(constants.ObjectIdNull),
+				FileId:      bson.ObjectIdHex(constants.ObjectIdNull),
+				Cmd:         configData.Cmd,
+			}
+			if err := spider.Add(); err != nil {
+				log.Errorf("add spider error: " + err.Error())
+				debug.PrintStack()
+				continue
+			}
 
-		// 上传爬虫到GridFS
-		if err := UploadSpiderToGridFsFromMaster(spider); err != nil {
-			log.Errorf("upload spider error: " + err.Error())
-			debug.PrintStack()
-			continue
+			// 上传爬虫到GridFS
+			if err := UploadSpiderToGridFsFromMaster(spider); err != nil {
+				log.Errorf("upload spider error: " + err.Error())
+				debug.PrintStack()
+				continue
+			}
+		} else if configData.Type == constants.Configurable || configData.Type == "config" {
+			// 添加该爬虫到数据库
+			spider = model.Spider{
+				Id:          bson.NewObjectId(),
+				Name:        configData.Name,
+				DisplayName: configData.DisplayName,
+				Type:        constants.Configurable,
+				Col:         configData.Col,
+				Src:         spiderPath,
+				Remark:      configData.Remark,
+				ProjectId:   bson.ObjectIdHex(constants.ObjectIdNull),
+				FileId:      bson.ObjectIdHex(constants.ObjectIdNull),
+				Config:      configData,
+			}
+			if err := spider.Add(); err != nil {
+				log.Errorf("add spider error: " + err.Error())
+				debug.PrintStack()
+				continue
+			}
+
+			// 根据序列化后的数据处理爬虫文件
+			if err := ProcessSpiderFilesFromConfigData(spider, configData); err != nil {
+				log.Errorf("add spider error: " + err.Error())
+				debug.PrintStack()
+				continue
+			}
 		}
 	}
 
diff --git a/spiders/amazon_config/Spiderfile b/spiders/amazon_config/Spiderfile
new file mode 100644
index 00000000..eea8a538
--- /dev/null
+++ b/spiders/amazon_config/Spiderfile
@@ -0,0 +1,51 @@
+name: "amazon_config"
+display_name: "亚马逊中国(可配置)"
+remark: "亚马逊中国搜索手机,列表+分页"
+type: "configurable"
+col: "results_amazon_config"
+engine: scrapy
+start_url: https://www.amazon.cn/s?k=%E6%89%8B%E6%9C%BA&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&ref=nb_sb_noss_2
+start_stage: list
+stages:
+- name: list
+  is_list: true
+  list_css: .s-result-item
+  list_xpath: ""
+  page_css: .a-last > a
+  page_xpath: ""
+  page_attr: href
+  fields:
+  - name: title
+    css: span.a-text-normal
+    xpath: ""
+    attr: ""
+    next_stage: ""
+    remark: ""
+  - name: url
+    css: .a-link-normal
+    xpath: ""
+    attr: href
+    next_stage: ""
+    remark: ""
+  - name: price
+    css: ""
+    xpath: .//*[@class="a-price-whole"]
+    attr: ""
+    next_stage: ""
+    remark: ""
+  - name: price_fraction
+    css: ""
+    xpath: .//*[@class="a-price-fraction"]
+    attr: ""
+    next_stage: ""
+    remark: ""
+  - name: img
+    css: .s-image-square-aspect > img
+    xpath: ""
+    attr: src
+    next_stage: ""
+    remark: ""
+settings:
+  ROBOTSTXT_OBEY: "false"
+  USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML,
+    like Gecko) Chrome/78.0.3904.108 Safari/537.36
diff --git a/spiders/autohome_config/Spiderfile b/spiders/autohome_config/Spiderfile
new file mode 100644
index 00000000..e69880cb
--- /dev/null
+++ b/spiders/autohome_config/Spiderfile
@@ -0,0 +1,57 @@
+name: "autohome_config"
+display_name: "汽车之家(可配置)"
+remark: "汽车之家文章,列表+详情+分页"
+type: "configurable"
+col: "results_autohome_config"
+engine: scrapy
+start_url: https://www.autohome.com.cn/all/
+start_stage: list
+stages:
+- name: list
+  is_list: true
+  list_css: ul.article > li
+  list_xpath: ""
+  page_css: a.page-item-next
+  page_xpath: ""
+  page_attr: href
+  fields:
+  - name: title
+    css: li > a > h3
+    xpath: ""
+    attr: ""
+    next_stage: ""
+    remark: ""
+  - name: url
+    css: li > a
+    xpath: ""
+    attr: href
+    next_stage: ""
+    remark: ""
+  - name: abstract
+    css: li > a > p
+    xpath: ""
+    attr: ""
+    next_stage: ""
+    remark: ""
+  - name: time
+    css: li > a .fn-left
+    xpath: ""
+    attr: ""
+    next_stage: ""
+    remark: ""
+  - name: views
+    css: li > a .fn-right > em:first-child
+    xpath: ""
+    attr: ""
+    next_stage: ""
+    remark: ""
+  - name: comments
+    css: li > a .fn-right > em:last-child
+    xpath: ""
+    attr: ""
+    next_stage: ""
+    remark: ""
+settings:
+  ROBOTSTXT_OBEY: "false"
+  USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML,
+    like Gecko) Chrome/78.0.3904.108 Safari/537.36
diff --git a/spiders/baidu_config/Spiderfile b/spiders/baidu_config/Spiderfile
new file mode 100644
index 00000000..5266b85b
--- /dev/null
+++ b/spiders/baidu_config/Spiderfile
@@ -0,0 +1,39 @@
+name: "baidu_config"
+display_name: "百度搜索(可配置)"
+remark: "百度搜索Crawlab,列表+分页"
+type: "configurable"
+col: "results_baidu_config"
+engine: scrapy
+start_url: http://www.baidu.com/s?wd=crawlab
+start_stage: list
+stages:
+- name: list
+  is_list: true
+  list_css: ""
+  list_xpath: //body
+  page_css: ""
+  page_xpath: //body
+  page_attr: href
+  fields:
+  - name: title
+    css: ""
+    xpath: .//h3/a
+    attr: href
+    next_stage: ""
+    remark: ""
+  - name: url
+    css: ""
+    xpath: .//h3/a
+    attr: href
+    next_stage: ""
+    remark: ""
+  - name: abstract
+    css: ""
+    xpath: .//*[@class="c-abstract"]
+    attr: href
+    next_stage: ""
+    remark: ""
+settings:
+  ROBOTSTXT_OBEY: "false"
+  USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML,
+    like Gecko) Chrome/78.0.3904.108 Safari/537.36
diff --git a/spiders/chinaz/Spiderfile b/spiders/chinaz/Spiderfile
index d36c7cf2..2fb940bb 100644
--- a/spiders/chinaz/Spiderfile
+++ b/spiders/chinaz/Spiderfile
@@ -1,4 +1,5 @@
 name: "chinaz"
 display_name: "站长之家 (Scrapy)"
 col: "results_chinaz"
+type: "customized"
 cmd: "scrapy crawl chinaz_spider"
\ No newline at end of file
diff --git a/spiders/csdn_config/Spiderfile b/spiders/csdn_config/Spiderfile
new file mode 100644
index 00000000..67f4f8c5
--- /dev/null
+++ b/spiders/csdn_config/Spiderfile
@@ -0,0 +1,60 @@
+name: "csdn_config"
+display_name: "CSDN(可配置)"
+remark: "CSDN Crawlab 文章,列表+详情+分页"
+type: "configurable"
+col: "results_csdn_config"
+engine: scrapy
+start_url: https://so.csdn.net/so/search/s.do?q=crawlab
+start_stage: list
+stages:
+- name: list
+  is_list: true
+  list_css: .search-list-con > .search-list
+  list_xpath: ""
+  page_css: a.btn-next
+  page_xpath: ""
+  page_attr: href
+  fields:
+  - name: url
+    css: ""
+    xpath: .//*[@class="limit_width"]/a
+    attr: href
+    next_stage: detail
+    remark: ""
+- name: detail
+  is_list: false
+  list_css: ""
+  list_xpath: ""
+  page_css: ""
+  page_xpath: ""
+  page_attr: ""
+  fields:
+  - name: content
+    css: ""
+    xpath: .//div[@id="content_views"]
+    attr: ""
+    next_stage: ""
+    remark: ""
+  - name: views
+    css: .read-count
+    xpath: ""
+    attr: ""
+    next_stage: ""
+    remark: ""
+  - name: title
+    css: .title-article
+    xpath: ""
+    attr: ""
+    next_stage: ""
+    remark: ""
+  - name: author
+    css: .follow-nickName
+    xpath: ""
+    attr: ""
+    next_stage: ""
+    remark: ""
+settings:
+  AUTOTHROTTLE_ENABLED: "false"
+  ROBOTSTXT_OBEY: "false"
+  USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML,
+    like Gecko) Chrome/79.0.3945.117 Safari/537.36
diff --git a/spiders/douban_config/Spiderfile b/spiders/douban_config/Spiderfile
new file mode 100644
index 00000000..84f0647a
--- /dev/null
+++ b/spiders/douban_config/Spiderfile
@@ -0,0 +1,57 @@
+name: "douban_config"
+display_name: "豆瓣读书(可配置)"
+remark: "豆瓣读书新书推荐,列表"
+type: "configurable"
+col: "results_douban_config"
+engine: scrapy
+start_url: https://book.douban.com/latest
+start_stage: list
+stages:
+- name: list
+  is_list: true
+  list_css: ul.cover-col-4 > li
+  list_xpath: ""
+  page_css: ""
+  page_xpath: ""
+  page_attr: ""
+  fields:
+  - name: title
+    css: h2 > a
+    xpath: ""
+    attr: ""
+    next_stage: ""
+    remark: ""
+  - name: url
+    css: h2 > a
+    xpath: ""
+    attr: href
+    next_stage: ""
+    remark: ""
+  - name: img
+    css: a.cover img
+    xpath: ""
+    attr: src
+    next_stage: ""
+    remark: ""
+  - name: rating
+    css: p.rating > .color-lightgray
+    xpath: ""
+    attr: ""
+    next_stage: ""
+    remark: ""
+  - name: abstract
+    css: p:last-child
+    xpath: ""
+    attr: ""
+    next_stage: ""
+    remark: ""
+  - name: info
+    css: .color-gray
+    xpath: ""
+    attr: ""
+    next_stage: ""
+    remark: ""
+settings:
+  ROBOTSTXT_OBEY: "false"
+  USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML,
+    like Gecko) Chrome/78.0.3904.108 Safari/537.36
diff --git a/spiders/jd/Spiderfile b/spiders/jd/Spiderfile
new file mode 100644
index 00000000..d090472b
--- /dev/null
+++ b/spiders/jd/Spiderfile
@@ -0,0 +1,5 @@
+name: "jd"
+display_name: "京东 (Scrapy)"
+col: "results_jd"
+type: "customized"
+cmd: "scrapy crawl jd_spider"
\ No newline at end of file
diff --git a/spiders/jd/jd/items.py b/spiders/jd/jd/items.py
index 9a7ba1cb..b2c5e647 100644
--- a/spiders/jd/jd/items.py
+++ b/spiders/jd/jd/items.py
@@ -12,3 +12,4 @@ class JdItem(scrapy.Item):
     # define the fields for your item here like:
     name = scrapy.Field()
     price = scrapy.Field()
+    url = scrapy.Field()
diff --git a/spiders/jd/jd/pipelines.py b/spiders/jd/jd/pipelines.py
index b862b7e7..5a7d7cbf 100644
--- a/spiders/jd/jd/pipelines.py
+++ b/spiders/jd/jd/pipelines.py
@@ -4,14 +4,3 @@
 #
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
-from pymongo import MongoClient
-
-
-class JdPipeline(object):
-    mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT)
-    db = mongo[MONGO_DB]
-    col_name = os.environ.get('CRAWLAB_COLLECTION') or 'jd_products'
-    col = db[col_name]
-
-    def process_item(self, item, spider):
-        return item
diff --git a/spiders/jd/jd/settings.py b/spiders/jd/jd/settings.py
index d83206b2..ef89ed0c 100644
--- a/spiders/jd/jd/settings.py
+++ b/spiders/jd/jd/settings.py
@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'jd.spiders'
 #USER_AGENT = 'jd (+http://www.yourdomain.com)'
 
 # Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+ROBOTSTXT_OBEY = False
 
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 #CONCURRENT_REQUESTS = 32
@@ -65,7 +65,7 @@ ROBOTSTXT_OBEY = True
 # Configure item pipelines
 # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 ITEM_PIPELINES = {
-    'jd.pipelines.JdPipeline': 300,
+    'crawlab.pipelines.CrawlabMongoPipeline': 300,
 }
 
 # Enable and configure the AutoThrottle extension (disabled by default)
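The ITEM_PIPELINES change above (and the matching changes for sinastock and xueqiu further down) swaps the hand-written per-project Mongo pipeline for crawlab.pipelines.CrawlabMongoPipeline from the Crawlab Python SDK. As a rough mental model only — the sketch below simply mirrors the per-project pipelines removed in this patch and is not the SDK's actual source; the class name is hypothetical — such a pipeline boils down to:

    import os
    from pymongo import MongoClient

    class MongoResultPipeline(object):
        # connection settings come from the same environment variables the
        # removed pipelines used: MONGO_HOST, MONGO_PORT, MONGO_DB
        mongo = MongoClient(
            host=os.environ.get('MONGO_HOST') or 'localhost',
            port=int(os.environ.get('MONGO_PORT') or 27017)
        )
        db = mongo[os.environ.get('MONGO_DB') or 'crawlab_test']
        col = db[os.environ.get('CRAWLAB_COLLECTION') or 'results']

        def process_item(self, item, spider):
            # tag each result with the task that produced it, then store it
            item['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
            self.col.insert_one(dict(item))
            return item

Centralising this in the SDK is what lets jd/jd/pipelines.py above, and the sinastock and xueqiu pipelines below, shrink back to the empty Scrapy template.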
diff --git a/spiders/jd/jd/spiders/jd_spider.py b/spiders/jd/jd/spiders/jd_spider.py
index 01113a7e..4ec94fa9 100644
--- a/spiders/jd/jd/spiders/jd_spider.py
+++ b/spiders/jd/jd/spiders/jd_spider.py
@@ -1,11 +1,21 @@
 # -*- coding: utf-8 -*-
 import scrapy
 
+from jd.items import JdItem
+
 
 class JdSpiderSpider(scrapy.Spider):
     name = 'jd_spider'
     allowed_domains = ['jd.com']
-    start_urls = ['http://jd.com/']
+
+    def start_requests(self):
+        for i in range(1, 50):
+            yield scrapy.Request(url=f'https://search.jd.com/Search?keyword=手机&enc=utf-8&page={i}')
 
     def parse(self, response):
-        pass
+        for el in response.css('.gl-item'):
+            yield JdItem(
+                url=el.css('.p-name > a::attr("href")').extract_first(),
+                name=el.css('.p-name > a::attr("title")').extract_first(),
+                price=float(el.css('.p-price i::text').extract_first()),
+            )
diff --git a/spiders/realestate/Spiderfile b/spiders/realestate/Spiderfile
new file mode 100644
index 00000000..772e8312
--- /dev/null
+++ b/spiders/realestate/Spiderfile
@@ -0,0 +1,4 @@
+name: "realestate"
+display_name: "链家网 (Scrapy)"
+col: "results_realestate"
+cmd: "scrapy crawl lianjia"
\ No newline at end of file
diff --git a/spiders/sinastock/Spiderfile b/spiders/sinastock/Spiderfile
new file mode 100644
index 00000000..b110cb48
--- /dev/null
+++ b/spiders/sinastock/Spiderfile
@@ -0,0 +1,5 @@
+name: "sinastock"
+display_name: "新浪股票 (Scrapy)"
+type: "customized"
+col: "results_sinastock"
+cmd: "scrapy crawl sinastock_spider"
\ No newline at end of file
diff --git a/spiders/sinastock/sinastock/pipelines.py b/spiders/sinastock/sinastock/pipelines.py
index e666c50d..5a7d7cbf 100644
--- a/spiders/sinastock/sinastock/pipelines.py
+++ b/spiders/sinastock/sinastock/pipelines.py
@@ -4,25 +4,3 @@
 #
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
-import os
-
-from pymongo import MongoClient
-
-
-class SinastockPipeline(object):
-    mongo = MongoClient(
-        host=os.environ.get('MONGO_HOST') or 'localhost',
-        port=int(os.environ.get('MONGO_PORT') or 27017)
-    )
-    db = mongo[os.environ.get('MONGO_DB') or 'crawlab_test']
-    col = db.get_collection(os.environ.get('CRAWLAB_COLLECTION') or 'stock_news')
-
-    # create indexes
-    col.create_index('stocks')
-    col.create_index('url')
-
-    def process_item(self, item, spider):
-        item['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
-        if self.col.find_one({'url': item['url']}) is None:
-            self.col.save(item)
-        return item
diff --git a/spiders/sinastock/sinastock/settings.py b/spiders/sinastock/sinastock/settings.py
index c63c2eb5..3e01d3ca 100644
--- a/spiders/sinastock/sinastock/settings.py
+++ b/spiders/sinastock/sinastock/settings.py
@@ -64,7 +64,7 @@ ROBOTSTXT_OBEY = True
 # Configure item pipelines
 # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 ITEM_PIPELINES = {
-    'sinastock.pipelines.SinastockPipeline': 300,
+    'crawlab.pipelines.CrawlabMongoPipeline': 300,
 }
 
 # Enable and configure the AutoThrottle extension (disabled by default)
diff --git a/spiders/v2ex_config/Spiderfile b/spiders/v2ex_config/Spiderfile
new file mode 100644
index 00000000..bb18d40a
--- /dev/null
+++ b/spiders/v2ex_config/Spiderfile
@@ -0,0 +1,54 @@
+name: "v2ex_config"
+display_name: "V2ex(可配置)"
+remark: "V2ex,列表+详情"
+type: "configurable"
+col: "results_v2ex_config"
+engine: scrapy
+start_url: https://v2ex.com/
+start_stage: list
+stages:
+- name: list
+  is_list: true
+  list_css: .cell.item
+  list_xpath: ""
+  page_css: ""
+  page_xpath: ""
+  page_attr: href
+  fields:
+  - name: title
+    css: a.topic-link
+    xpath: ""
+    attr: ""
+    next_stage: ""
+    remark: ""
+  - name: url
+    css: a.topic-link
+    xpath: ""
+    attr: href
+    next_stage: detail
+    remark: ""
+  - name: replies
+    css: .count_livid
+    xpath: ""
+    attr: ""
+    next_stage: ""
+    remark: ""
+- name: detail
+  is_list: false
+  list_css: ""
+  list_xpath: ""
+  page_css: ""
+  page_xpath: ""
+  page_attr: ""
+  fields:
+  - name: content
+    css: ""
+    xpath: .//*[@class="markdown_body"]
+    attr: ""
+    next_stage: ""
+    remark: ""
+settings:
+  AUTOTHROTTLE_ENABLED: "true"
+  ROBOTSTXT_OBEY: "false"
+  USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML,
+    like Gecko) Chrome/79.0.3945.117 Safari/537.36
diff --git a/spiders/xueqiu/Spiderfile b/spiders/xueqiu/Spiderfile
new file mode 100644
index 00000000..38aa5dbe
--- /dev/null
+++ b/spiders/xueqiu/Spiderfile
@@ -0,0 +1,5 @@
+name: "xueqiu"
+display_name: "雪球网 (Scrapy)"
+type: "customized"
+col: "results_xueqiu"
+cmd: "scrapy crawl xueqiu_spider"
\ No newline at end of file
diff --git a/spiders/xueqiu/xueqiu/pipelines.py b/spiders/xueqiu/xueqiu/pipelines.py
index 210ce7ac..5a7d7cbf 100644
--- a/spiders/xueqiu/xueqiu/pipelines.py
+++ b/spiders/xueqiu/xueqiu/pipelines.py
@@ -4,26 +4,3 @@
 #
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
-import os
-
-from pymongo import MongoClient
-
-
-class XueqiuPipeline(object):
-    mongo = MongoClient(
-        host=os.environ.get('MONGO_HOST') or 'localhost',
-        port=int(os.environ.get('MONGO_PORT') or 27017)
-    )
-    db = mongo[os.environ.get('MONGO_DB') or 'crawlab_test']
-    col = db.get_collection(os.environ.get('CRAWLAB_COLLECTION') or 'results_xueqiu')
-
-    # create indexes
-    col.create_index('stocks')
-    col.create_index('id')
-    col.create_index('url')
-
-    def process_item(self, item, spider):
-        item['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
-        if self.col.find_one({'id': item['id']}) is None:
-            self.col.save(item)
-        return item
diff --git a/spiders/xueqiu/xueqiu/settings.py b/spiders/xueqiu/xueqiu/settings.py
index b44a74e1..1d898e2f 100644
--- a/spiders/xueqiu/xueqiu/settings.py
+++ b/spiders/xueqiu/xueqiu/settings.py
@@ -18,7 +18,7 @@ NEWSPIDER_MODULE = 'xueqiu.spiders'
 USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
 
 # Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+ROBOTSTXT_OBEY = False
 
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 # CONCURRENT_REQUESTS = 32
@@ -64,7 +64,7 @@ ROBOTSTXT_OBEY = True
 # Configure item pipelines
 # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 ITEM_PIPELINES = {
-    'xueqiu.pipelines.XueqiuPipeline': 300,
+    'crawlab.pipelines.CrawlabMongoPipeline': 300,
 }
 
 # Enable and configure the AutoThrottle extension (disabled by default)
diff --git a/spiders/xueqiu_config/Spiderfile b/spiders/xueqiu_config/Spiderfile
new file mode 100644
index 00000000..0de50e9e
--- /dev/null
+++ b/spiders/xueqiu_config/Spiderfile
@@ -0,0 +1,39 @@
+name: "xueqiu_config"
+display_name: "雪球网(可配置)"
+remark: "雪球网新闻,列表"
+type: "configurable"
+col: "results_xueqiu_config"
+engine: scrapy
+start_url: https://xueqiu.com/
+start_stage: list
+stages:
+- name: list
+  is_list: true
+  list_css: ""
+  list_xpath: .//*[contains(@class, "AnonymousHome_home__timeline__item")]
+  page_css: ""
+  page_xpath: ""
+  page_attr: ""
+  fields:
+  - name: title
+    css: h3 > a
+    xpath: ""
+    attr: ""
+    next_stage: ""
+    remark: ""
+  - name: url
+    css: h3 > a
+    xpath: ""
+    attr: href
+    next_stage: ""
+    remark: ""
+  - name: abstract
+    css: p
+    xpath: ""
+    attr: ""
+    next_stage: ""
+    remark: ""
+settings:
+  ROBOTSTXT_OBEY: "false"
+  USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML,
+    like Gecko) Chrome/78.0.3904.108 Safari/537.36
diff --git a/spiders/zongheng_config/Spiderfile b/spiders/zongheng_config/Spiderfile
new file mode 100644
index 00000000..0163fac7
--- /dev/null
+++ b/spiders/zongheng_config/Spiderfile
@@ -0,0 +1,45 @@
+name: "zongheng_config"
+display_name: "纵横(可配置)"
+remark: "纵横小说网,列表"
+type: "configurable"
+col: "results_zongheng_config"
+engine: scrapy
+start_url: http://www.zongheng.com/rank/details.html?rt=1&d=1
+start_stage: list
+stages:
+- name: list
+  is_list: true
+  list_css: .rank_d_list
+  list_xpath: ""
+  page_css: ""
+  page_xpath: ""
+  page_attr: href
+  fields:
+  - name: title
+    css: .rank_d_b_name > a
+    xpath: ""
+    attr: ""
+    next_stage: ""
+    remark: ""
+  - name: url
+    css: .rank_d_b_name > a
+    xpath: ""
+    attr: href
+    next_stage: ""
+    remark: ""
+  - name: abstract
+    css: body
+    xpath: ""
+    attr: ""
+    next_stage: ""
+    remark: ""
+  - name: votes
+    css: .rank_d_b_ticket
+    xpath: ""
+    attr: ""
+    next_stage: ""
+    remark: ""
+settings:
+  ROBOTSTXT_OBEY: "false"
+  USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML,
+    like Gecko) Chrome/78.0.3904.108 Safari/537.36
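All of the *_config Spiderfiles added in this patch share one schema: a start_url, a start_stage, and a list of stages whose fields carry a css or xpath selector, an optional attr, and an optional next_stage. The actual code generation is handled by Crawlab's config_spider engine; purely as an illustration — the class below is hand-written, reusing selector values from spiders/douban_config/Spiderfile — a "list" stage corresponds to Scrapy extraction along these lines:

    import scrapy

    class DoubanConfigSketch(scrapy.Spider):
        name = 'douban_config_sketch'
        start_urls = ['https://book.douban.com/latest']  # start_url

        def parse(self, response):
            # list_css selects one node per result row
            for row in response.css('ul.cover-col-4 > li'):
                yield {
                    'title': row.css('h2 > a::text').get(),          # css, no attr -> text
                    'url': row.css('h2 > a::attr(href)').get(),      # css + attr: href
                    'img': row.css('a.cover img::attr(src)').get(),  # css + attr: src
                }
            # a stage with page_css/page_attr would also follow the pagination
            # link here, and a field with next_stage (e.g. url -> detail in
            # csdn_config or v2ex_config) would yield a Request whose callback
            # parses that next stage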