From 6b708683269a2600008d6626a32bb3d18fcd6add Mon Sep 17 00:00:00 2001
From: marvzhang
Date: Mon, 3 Feb 2020 09:21:41 +0800
Subject: [PATCH] added demo spiders

---
 backend/entity/config_spider.go          |  2 +-
 backend/services/config_spider.go        | 12 ++++
 backend/services/spider.go               | 74 ++++++++++++++++--------
 spiders/amazon_config/Spiderfile         | 51 ++++++++++++++++
 spiders/autohome_config/Spiderfile       | 57 ++++++++++++++++++
 spiders/baidu_config/Spiderfile          | 39 +++++++++++++
 spiders/chinaz/Spiderfile                |  1 +
 spiders/csdn_config/Spiderfile           | 60 +++++++++++++++++++
 spiders/douban_config/Spiderfile         | 57 ++++++++++++++++++
 spiders/jd/Spiderfile                    |  5 ++
 spiders/jd/jd/items.py                   |  1 +
 spiders/jd/jd/pipelines.py               | 11 ----
 spiders/jd/jd/settings.py                |  4 +-
 spiders/jd/jd/spiders/jd_spider.py       | 14 ++++-
 spiders/realestate/Spiderfile            |  4 ++
 spiders/sinastock/Spiderfile             |  5 ++
 spiders/sinastock/sinastock/pipelines.py | 22 -------
 spiders/sinastock/sinastock/settings.py  |  2 +-
 spiders/v2ex_config/Spiderfile           | 54 +++++++++++++++++
 spiders/xueqiu/Spiderfile                |  5 ++
 spiders/xueqiu/xueqiu/pipelines.py       | 23 --------
 spiders/xueqiu/xueqiu/settings.py        |  4 +-
 spiders/xueqiu_config/Spiderfile         | 39 +++++++++++++
 spiders/zongheng_config/Spiderfile       | 45 ++++++++++++++
 24 files changed, 504 insertions(+), 87 deletions(-)
 create mode 100644 spiders/amazon_config/Spiderfile
 create mode 100644 spiders/autohome_config/Spiderfile
 create mode 100644 spiders/baidu_config/Spiderfile
 create mode 100644 spiders/csdn_config/Spiderfile
 create mode 100644 spiders/douban_config/Spiderfile
 create mode 100644 spiders/jd/Spiderfile
 create mode 100644 spiders/realestate/Spiderfile
 create mode 100644 spiders/sinastock/Spiderfile
 create mode 100644 spiders/v2ex_config/Spiderfile
 create mode 100644 spiders/xueqiu/Spiderfile
 create mode 100644 spiders/xueqiu_config/Spiderfile
 create mode 100644 spiders/zongheng_config/Spiderfile

diff --git a/backend/entity/config_spider.go b/backend/entity/config_spider.go
index bb1295e7..054ee2fe 100644
--- a/backend/entity/config_spider.go
+++ b/backend/entity/config_spider.go
@@ -6,9 +6,9 @@ type ConfigSpiderData struct {
 	DisplayName string `yaml:"display_name" json:"display_name"`
 	Col         string `yaml:"col" json:"col"`
 	Remark      string `yaml:"remark" json:"remark"`
+	Type        string `yaml:"type" bson:"type"`
 
 	// 可配置爬虫
-	Version    string `yaml:"version" json:"version"`
 	Engine     string `yaml:"engine" json:"engine"`
 	StartUrl   string `yaml:"start_url" json:"start_url"`
 	StartStage string `yaml:"start_stage" json:"start_stage"`
diff --git a/backend/services/config_spider.go b/backend/services/config_spider.go
index fe0a3da1..29e1c2ca 100644
--- a/backend/services/config_spider.go
+++ b/backend/services/config_spider.go
@@ -6,6 +6,7 @@ import (
 	"crawlab/entity"
 	"crawlab/model"
 	"crawlab/model/config_spider"
+	"crawlab/services/spider_handler"
 	"crawlab/utils"
 	"errors"
 	"fmt"
@@ -227,6 +228,17 @@ func ProcessSpiderFilesFromConfigData(spider model.Spider, configData entity.Con
 	spider.FileId = fid
 	_ = spider.Save()
 
+	// 获取爬虫同步实例
+	spiderSync := spider_handler.SpiderSync{
+		Spider: spider,
+	}
+
+	// 获取gfFile
+	gfFile2 := model.GetGridFs(spider.FileId)
+
+	// 生成MD5
+	spiderSync.CreateMd5File(gfFile2.Md5)
+
 	return nil
 }
diff --git a/backend/services/spider.go b/backend/services/spider.go
index b395a956..48777042 100644
--- a/backend/services/spider.go
+++ b/backend/services/spider.go
@@ -313,30 +313,58 @@ func InitSpiderService() error {
 			continue
 		}
 
-		// 添加该爬虫到数据库
-		spider = model.Spider{
-			Id:          bson.NewObjectId(),
-			Name:        configData.Name,
-			DisplayName: configData.DisplayName,
-			Type:        constants.Customized,
-			Col:         configData.Col,
-			Cmd:         configData.Cmd,
-			Src:         spiderPath,
-			Remark:      configData.Remark,
-			ProjectId:   bson.ObjectIdHex(constants.ObjectIdNull),
-			FileId:      bson.ObjectIdHex(constants.ObjectIdNull),
-		}
-		if err := spider.Add(); err != nil {
-			log.Errorf("add spider error: " + err.Error())
-			debug.PrintStack()
-			continue
-		}
+		if configData.Type == constants.Customized {
+			// 添加该爬虫到数据库
+			spider = model.Spider{
+				Id:          bson.NewObjectId(),
+				Name:        configData.Name,
+				DisplayName: configData.DisplayName,
+				Type:        constants.Customized,
+				Col:         configData.Col,
+				Src:         spiderPath,
+				Remark:      configData.Remark,
+				ProjectId:   bson.ObjectIdHex(constants.ObjectIdNull),
+				FileId:      bson.ObjectIdHex(constants.ObjectIdNull),
+				Cmd:         configData.Cmd,
+			}
+			if err := spider.Add(); err != nil {
+				log.Errorf("add spider error: " + err.Error())
+				debug.PrintStack()
+				continue
+			}
 
-		// 上传爬虫到GridFS
-		if err := UploadSpiderToGridFsFromMaster(spider); err != nil {
-			log.Errorf("upload spider error: " + err.Error())
-			debug.PrintStack()
-			continue
+			// 上传爬虫到GridFS
+			if err := UploadSpiderToGridFsFromMaster(spider); err != nil {
+				log.Errorf("upload spider error: " + err.Error())
+				debug.PrintStack()
+				continue
+			}
+		} else if configData.Type == constants.Configurable || configData.Type == "config" {
+			// 添加该爬虫到数据库
+			spider = model.Spider{
+				Id:          bson.NewObjectId(),
+				Name:        configData.Name,
+				DisplayName: configData.DisplayName,
+				Type:        constants.Configurable,
+				Col:         configData.Col,
+				Src:         spiderPath,
+				Remark:      configData.Remark,
+				ProjectId:   bson.ObjectIdHex(constants.ObjectIdNull),
+				FileId:      bson.ObjectIdHex(constants.ObjectIdNull),
+				Config:      configData,
+			}
+			if err := spider.Add(); err != nil {
+				log.Errorf("add spider error: " + err.Error())
+				debug.PrintStack()
+				continue
+			}
+
+			// 根据序列化后的数据处理爬虫文件
+			if err := ProcessSpiderFilesFromConfigData(spider, configData); err != nil {
+				log.Errorf("add spider error: " + err.Error())
+				debug.PrintStack()
+				continue
+			}
 		}
 	}
 
diff --git a/spiders/amazon_config/Spiderfile b/spiders/amazon_config/Spiderfile
new file mode 100644
index 00000000..eea8a538
--- /dev/null
+++ b/spiders/amazon_config/Spiderfile
@@ -0,0 +1,51 @@
+name: "amazon_config"
+display_name: "亚马逊中国(可配置)"
+remark: "亚马逊中国搜索手机,列表+分页"
+type: "configurable"
+col: "results_amazon_config"
+engine: scrapy
+start_url: https://www.amazon.cn/s?k=%E6%89%8B%E6%9C%BA&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&ref=nb_sb_noss_2
+start_stage: list
+stages:
+- name: list
+  is_list: true
+  list_css: .s-result-item
+  list_xpath: ""
+  page_css: .a-last > a
+  page_xpath: ""
+  page_attr: href
+  fields:
+  - name: title
+    css: span.a-text-normal
+    xpath: ""
+    attr: ""
+    next_stage: ""
+    remark: ""
+  - name: url
+    css: .a-link-normal
+    xpath: ""
+    attr: href
+    next_stage: ""
+    remark: ""
+  - name: price
+    css: ""
+    xpath: .//*[@class="a-price-whole"]
+    attr: ""
+    next_stage: ""
+    remark: ""
+  - name: price_fraction
+    css: ""
+    xpath: .//*[@class="a-price-fraction"]
+    attr: ""
+    next_stage: ""
+    remark: ""
+  - name: img
+    css: .s-image-square-aspect > img
+    xpath: ""
+    attr: src
+    next_stage: ""
+    remark: ""
+settings:
+  ROBOTSTXT_OBEY: "false"
+  USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML,
+    like Gecko) Chrome/78.0.3904.108 Safari/537.36
diff --git a/spiders/autohome_config/Spiderfile b/spiders/autohome_config/Spiderfile
new file mode 100644
index 00000000..e69880cb
--- /dev/null
+++ b/spiders/autohome_config/Spiderfile
@@ -0,0 +1,57 @@
+name: "autohome_config"
+display_name: "汽车之家(可配置)"
+remark: "汽车之家文章,列表+详情+分页"
+type: "configurable"
+col: "results_autohome_config"
+engine: scrapy
+start_url: https://www.autohome.com.cn/all/
+start_stage: list
+stages:
+- name: list
+  is_list: true
+  list_css: ul.article > li
+  list_xpath: ""
+  page_css: a.page-item-next
+  page_xpath: ""
+  page_attr: href
+  fields:
+  - name: title
+    css: li > a > h3
+    xpath: ""
+    attr: ""
+    next_stage: ""
+    remark: ""
+  - name: url
+    css: li > a
+    xpath: ""
+    attr: href
+    next_stage: ""
+    remark: ""
+  - name: abstract
+    css: li > a > p
+    xpath: ""
+    attr: ""
+    next_stage: ""
+    remark: ""
+  - name: time
+    css: li > a .fn-left
+    xpath: ""
+    attr: ""
+    next_stage: ""
+    remark: ""
+  - name: views
+    css: li > a .fn-right > em:first-child
+    xpath: ""
+    attr: ""
+    next_stage: ""
+    remark: ""
+  - name: comments
+    css: li > a .fn-right > em:last-child
+    xpath: ""
+    attr: ""
+    next_stage: ""
+    remark: ""
+settings:
+  ROBOTSTXT_OBEY: "false"
+  USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML,
+    like Gecko) Chrome/78.0.3904.108 Safari/537.36
diff --git a/spiders/baidu_config/Spiderfile b/spiders/baidu_config/Spiderfile
new file mode 100644
index 00000000..5266b85b
--- /dev/null
+++ b/spiders/baidu_config/Spiderfile
@@ -0,0 +1,39 @@
+name: "baidu_config"
+display_name: "百度搜索(可配置)"
+remark: "百度搜索Crawlab,列表+分页"
+type: "configurable"
+col: "results_baidu_config"
+engine: scrapy
+start_url: http://www.baidu.com/s?wd=crawlab
+start_stage: list
+stages:
+- name: list
+  is_list: true
+  list_css: ""
+  list_xpath: //body
+  page_css: ""
+  page_xpath: //body
+  page_attr: href
+  fields:
+  - name: title
+    css: ""
+    xpath: .//h3/a
+    attr: href
+    next_stage: ""
+    remark: ""
+  - name: url
+    css: ""
+    xpath: .//h3/a
+    attr: href
+    next_stage: ""
+    remark: ""
+  - name: abstract
+    css: ""
+    xpath: .//*[@class="c-abstract"]
+    attr: href
+    next_stage: ""
+    remark: ""
+settings:
+  ROBOTSTXT_OBEY: "false"
+  USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML,
+    like Gecko) Chrome/78.0.3904.108 Safari/537.36
diff --git a/spiders/chinaz/Spiderfile b/spiders/chinaz/Spiderfile
index d36c7cf2..2fb940bb 100644
--- a/spiders/chinaz/Spiderfile
+++ b/spiders/chinaz/Spiderfile
@@ -1,4 +1,5 @@
 name: "chinaz"
 display_name: "站长之家 (Scrapy)"
 col: "results_chinaz"
+type: "customized"
 cmd: "scrapy crawl chinaz_spider"
\ No newline at end of file
diff --git a/spiders/csdn_config/Spiderfile b/spiders/csdn_config/Spiderfile
new file mode 100644
index 00000000..67f4f8c5
--- /dev/null
+++ b/spiders/csdn_config/Spiderfile
@@ -0,0 +1,60 @@
+name: "csdn_config"
+display_name: "CSDN(可配置)"
+remark: "CSDN Crawlab 文章,列表+详情+分页"
+type: "configurable"
+col: "results_csdn_config"
+engine: scrapy
+start_url: https://so.csdn.net/so/search/s.do?q=crawlab
+start_stage: list
+stages:
+- name: list
+  is_list: true
+  list_css: .search-list-con > .search-list
+  list_xpath: ""
+  page_css: a.btn-next
+  page_xpath: ""
+  page_attr: href
+  fields:
+  - name: url
+    css: ""
+    xpath: .//*[@class="limit_width"]/a
+    attr: href
+    next_stage: detail
+    remark: ""
+- name: detail
+  is_list: false
+  list_css: ""
+  list_xpath: ""
+  page_css: ""
+  page_xpath: ""
+  page_attr: ""
+  fields:
+  - name: content
+    css: ""
+    xpath: .//div[@id="content_views"]
+    attr: ""
+    next_stage: ""
+    remark: ""
+  - name: views
+    css: .read-count
+    xpath: ""
+    attr: ""
+    next_stage: ""
+    remark: ""
+  - name: title
+    css: .title-article
+    xpath: ""
+    attr: ""
+    next_stage: ""
+    remark: ""
+  - name: author
+    css: .follow-nickName
+    xpath: ""
+    attr: ""
+    next_stage: ""
+    remark: ""
+settings:
+  AUTOTHROTTLE_ENABLED: "false"
+  ROBOTSTXT_OBEY: "false"
+  USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML,
+    like Gecko) Chrome/79.0.3945.117 Safari/537.36
diff --git a/spiders/douban_config/Spiderfile b/spiders/douban_config/Spiderfile
new file mode 100644
index 00000000..84f0647a
--- /dev/null
+++ b/spiders/douban_config/Spiderfile
@@ -0,0 +1,57 @@
+name: "douban_config"
+display_name: "豆瓣读书(可配置)"
+remark: "豆瓣读书新书推荐,列表"
+type: "configurable"
+col: "results_douban_config"
+engine: scrapy
+start_url: https://book.douban.com/latest
+start_stage: list
+stages:
+- name: list
+  is_list: true
+  list_css: ul.cover-col-4 > li
+  list_xpath: ""
+  page_css: ""
+  page_xpath: ""
+  page_attr: ""
+  fields:
+  - name: title
+    css: h2 > a
+    xpath: ""
+    attr: ""
+    next_stage: ""
+    remark: ""
+  - name: url
+    css: h2 > a
+    xpath: ""
+    attr: href
+    next_stage: ""
+    remark: ""
+  - name: img
+    css: a.cover img
+    xpath: ""
+    attr: src
+    next_stage: ""
+    remark: ""
+  - name: rating
+    css: p.rating > .color-lightgray
+    xpath: ""
+    attr: ""
+    next_stage: ""
+    remark: ""
+  - name: abstract
+    css: p:last-child
+    xpath: ""
+    attr: ""
+    next_stage: ""
+    remark: ""
+  - name: info
+    css: .color-gray
+    xpath: ""
+    attr: ""
+    next_stage: ""
+    remark: ""
+settings:
+  ROBOTSTXT_OBEY: "false"
+  USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML,
+    like Gecko) Chrome/78.0.3904.108 Safari/537.36
diff --git a/spiders/jd/Spiderfile b/spiders/jd/Spiderfile
new file mode 100644
index 00000000..d090472b
--- /dev/null
+++ b/spiders/jd/Spiderfile
@@ -0,0 +1,5 @@
+name: "jd"
+display_name: "京东 (Scrapy)"
+col: "results_jd"
+type: "customized"
+cmd: "scrapy crawl jd_spider"
\ No newline at end of file
diff --git a/spiders/jd/jd/items.py b/spiders/jd/jd/items.py
index 9a7ba1cb..b2c5e647 100644
--- a/spiders/jd/jd/items.py
+++ b/spiders/jd/jd/items.py
@@ -12,3 +12,4 @@ class JdItem(scrapy.Item):
     # define the fields for your item here like:
     name = scrapy.Field()
     price = scrapy.Field()
+    url = scrapy.Field()
diff --git a/spiders/jd/jd/pipelines.py b/spiders/jd/jd/pipelines.py
index b862b7e7..5a7d7cbf 100644
--- a/spiders/jd/jd/pipelines.py
+++ b/spiders/jd/jd/pipelines.py
@@ -4,14 +4,3 @@
 #
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
-from pymongo import MongoClient
-
-
-class JdPipeline(object):
-    mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT)
-    db = mongo[MONGO_DB]
-    col_name = os.environ.get('CRAWLAB_COLLECTION') or 'jd_products'
-    col = db[col_name]
-
-    def process_item(self, item, spider):
-        return item
diff --git a/spiders/jd/jd/settings.py b/spiders/jd/jd/settings.py
index d83206b2..ef89ed0c 100644
--- a/spiders/jd/jd/settings.py
+++ b/spiders/jd/jd/settings.py
@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'jd.spiders'
 #USER_AGENT = 'jd (+http://www.yourdomain.com)'
 
 # Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+ROBOTSTXT_OBEY = False
 
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 #CONCURRENT_REQUESTS = 32
@@ -65,7 +65,7 @@ ROBOTSTXT_OBEY = True
 # Configure item pipelines
 # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 ITEM_PIPELINES = {
-    'jd.pipelines.JdPipeline': 300,
+    'crawlab.pipelines.CrawlabMongoPipeline': 300,
 }
 
 # Enable and configure the AutoThrottle extension (disabled by default)
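The ITEM_PIPELINES change above (and the matching changes for sinastock and xueqiu further down) swaps the hand-written per-project Mongo pipeline for crawlab.pipelines.CrawlabMongoPipeline from the Crawlab Python SDK. As a rough mental model only — the sketch below simply mirrors the per-project pipelines removed in this patch and is not the SDK's actual source; the class name is hypothetical — such a pipeline boils down to:

    import os
    from pymongo import MongoClient

    class MongoResultPipeline(object):
        # connection settings come from the same environment variables the
        # removed pipelines used: MONGO_HOST, MONGO_PORT, MONGO_DB
        mongo = MongoClient(
            host=os.environ.get('MONGO_HOST') or 'localhost',
            port=int(os.environ.get('MONGO_PORT') or 27017)
        )
        db = mongo[os.environ.get('MONGO_DB') or 'crawlab_test']
        col = db[os.environ.get('CRAWLAB_COLLECTION') or 'results']

        def process_item(self, item, spider):
            # tag each result with the task that produced it, then store it
            item['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
            self.col.insert_one(dict(item))
            return item

Centralising this in the SDK is what lets jd/jd/pipelines.py above, and the sinastock and xueqiu pipelines below, shrink back to the empty Scrapy template.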
diff --git a/spiders/jd/jd/spiders/jd_spider.py b/spiders/jd/jd/spiders/jd_spider.py
index 01113a7e..4ec94fa9 100644
--- a/spiders/jd/jd/spiders/jd_spider.py
+++ b/spiders/jd/jd/spiders/jd_spider.py
@@ -1,11 +1,21 @@
 # -*- coding: utf-8 -*-
 import scrapy
 
+from jd.items import JdItem
+
 
 class JdSpiderSpider(scrapy.Spider):
     name = 'jd_spider'
     allowed_domains = ['jd.com']
-    start_urls = ['http://jd.com/']
+
+    def start_requests(self):
+        for i in range(1, 50):
+            yield scrapy.Request(url=f'https://search.jd.com/Search?keyword=手机&enc=utf-8&page={i}')
 
     def parse(self, response):
-        pass
+        for el in response.css('.gl-item'):
+            yield JdItem(
+                url=el.css('.p-name > a::attr("href")').extract_first(),
+                name=el.css('.p-name > a::attr("title")').extract_first(),
+                price=float(el.css('.p-price i::text').extract_first()),
+            )
diff --git a/spiders/realestate/Spiderfile b/spiders/realestate/Spiderfile
new file mode 100644
index 00000000..772e8312
--- /dev/null
+++ b/spiders/realestate/Spiderfile
@@ -0,0 +1,4 @@
+name: "realestate"
+display_name: "链家网 (Scrapy)"
+col: "results_realestate"
+cmd: "scrapy crawl lianjia"
\ No newline at end of file
diff --git a/spiders/sinastock/Spiderfile b/spiders/sinastock/Spiderfile
new file mode 100644
index 00000000..b110cb48
--- /dev/null
+++ b/spiders/sinastock/Spiderfile
@@ -0,0 +1,5 @@
+name: "sinastock"
+display_name: "新浪股票 (Scrapy)"
+type: "customized"
+col: "results_sinastock"
+cmd: "scrapy crawl sinastock_spider"
\ No newline at end of file
diff --git a/spiders/sinastock/sinastock/pipelines.py b/spiders/sinastock/sinastock/pipelines.py
index e666c50d..5a7d7cbf 100644
--- a/spiders/sinastock/sinastock/pipelines.py
+++ b/spiders/sinastock/sinastock/pipelines.py
@@ -4,25 +4,3 @@
 #
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
-import os
-
-from pymongo import MongoClient
-
-
-class SinastockPipeline(object):
-    mongo = MongoClient(
-        host=os.environ.get('MONGO_HOST') or 'localhost',
-        port=int(os.environ.get('MONGO_PORT') or 27017)
-    )
-    db = mongo[os.environ.get('MONGO_DB') or 'crawlab_test']
-    col = db.get_collection(os.environ.get('CRAWLAB_COLLECTION') or 'stock_news')
-
-    # create indexes
-    col.create_index('stocks')
-    col.create_index('url')
-
-    def process_item(self, item, spider):
-        item['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
-        if self.col.find_one({'url': item['url']}) is None:
-            self.col.save(item)
-        return item
diff --git a/spiders/sinastock/sinastock/settings.py b/spiders/sinastock/sinastock/settings.py
index c63c2eb5..3e01d3ca 100644
--- a/spiders/sinastock/sinastock/settings.py
+++ b/spiders/sinastock/sinastock/settings.py
@@ -64,7 +64,7 @@ ROBOTSTXT_OBEY = True
 # Configure item pipelines
 # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 ITEM_PIPELINES = {
-    'sinastock.pipelines.SinastockPipeline': 300,
+    'crawlab.pipelines.CrawlabMongoPipeline': 300,
 }
 
 # Enable and configure the AutoThrottle extension (disabled by default)
diff --git a/spiders/v2ex_config/Spiderfile b/spiders/v2ex_config/Spiderfile
new file mode 100644
index 00000000..bb18d40a
--- /dev/null
+++ b/spiders/v2ex_config/Spiderfile
@@ -0,0 +1,54 @@
+name: "v2ex_config"
+display_name: "V2ex(可配置)"
+remark: "V2ex,列表+详情"
+type: "configurable"
+col: "results_v2ex_config"
+engine: scrapy
+start_url: https://v2ex.com/
+start_stage: list
+stages:
+- name: list
+  is_list: true
+  list_css: .cell.item
+  list_xpath: ""
+  page_css: ""
+  page_xpath: ""
+  page_attr: href
+  fields:
+  - name: title
+    css: a.topic-link
+    xpath: ""
+    attr: ""
+    next_stage: ""
+    remark: ""
+  - name: url
+    css: a.topic-link
+    xpath: ""
+    attr: href
+    next_stage: detail
+    remark: ""
+  - name: replies
+    css: .count_livid
+    xpath: ""
+    attr: ""
+    next_stage: ""
+    remark: ""
+- name: detail
+  is_list: false
+  list_css: ""
+  list_xpath: ""
+  page_css: ""
+  page_xpath: ""
+  page_attr: ""
+  fields:
+  - name: content
+    css: ""
+    xpath: .//*[@class="markdown_body"]
+    attr: ""
+    next_stage: ""
+    remark: ""
+settings:
+  AUTOTHROTTLE_ENABLED: "true"
+  ROBOTSTXT_OBEY: "false"
+  USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML,
+    like Gecko) Chrome/79.0.3945.117 Safari/537.36
diff --git a/spiders/xueqiu/Spiderfile b/spiders/xueqiu/Spiderfile
new file mode 100644
index 00000000..38aa5dbe
--- /dev/null
+++ b/spiders/xueqiu/Spiderfile
@@ -0,0 +1,5 @@
+name: "xueqiu"
+display_name: "雪球网 (Scrapy)"
+type: "customized"
+col: "results_xueqiu"
+cmd: "scrapy crawl xueqiu_spider"
\ No newline at end of file
diff --git a/spiders/xueqiu/xueqiu/pipelines.py b/spiders/xueqiu/xueqiu/pipelines.py
index 210ce7ac..5a7d7cbf 100644
--- a/spiders/xueqiu/xueqiu/pipelines.py
+++ b/spiders/xueqiu/xueqiu/pipelines.py
@@ -4,26 +4,3 @@
 #
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
-import os
-
-from pymongo import MongoClient
-
-
-class XueqiuPipeline(object):
-    mongo = MongoClient(
-        host=os.environ.get('MONGO_HOST') or 'localhost',
-        port=int(os.environ.get('MONGO_PORT') or 27017)
-    )
-    db = mongo[os.environ.get('MONGO_DB') or 'crawlab_test']
-    col = db.get_collection(os.environ.get('CRAWLAB_COLLECTION') or 'results_xueqiu')
-
-    # create indexes
-    col.create_index('stocks')
-    col.create_index('id')
-    col.create_index('url')
-
-    def process_item(self, item, spider):
-        item['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
-        if self.col.find_one({'id': item['id']}) is None:
-            self.col.save(item)
-        return item
diff --git a/spiders/xueqiu/xueqiu/settings.py b/spiders/xueqiu/xueqiu/settings.py
index b44a74e1..1d898e2f 100644
--- a/spiders/xueqiu/xueqiu/settings.py
+++ b/spiders/xueqiu/xueqiu/settings.py
@@ -18,7 +18,7 @@ NEWSPIDER_MODULE = 'xueqiu.spiders'
 USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
 
 # Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+ROBOTSTXT_OBEY = False
 
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 # CONCURRENT_REQUESTS = 32
@@ -64,7 +64,7 @@ ROBOTSTXT_OBEY = True
 # Configure item pipelines
 # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
 ITEM_PIPELINES = {
-    'xueqiu.pipelines.XueqiuPipeline': 300,
+    'crawlab.pipelines.CrawlabMongoPipeline': 300,
 }
 
 # Enable and configure the AutoThrottle extension (disabled by default)
diff --git a/spiders/xueqiu_config/Spiderfile b/spiders/xueqiu_config/Spiderfile
new file mode 100644
index 00000000..0de50e9e
--- /dev/null
+++ b/spiders/xueqiu_config/Spiderfile
@@ -0,0 +1,39 @@
+name: "xueqiu_config"
+display_name: "雪球网(可配置)"
+remark: "雪球网新闻,列表"
+type: "configurable"
+col: "results_xueqiu_config"
+engine: scrapy
+start_url: https://xueqiu.com/
+start_stage: list
+stages:
+- name: list
+  is_list: true
+  list_css: ""
+  list_xpath: .//*[contains(@class, "AnonymousHome_home__timeline__item")]
+  page_css: ""
+  page_xpath: ""
+  page_attr: ""
+  fields:
+  - name: title
+    css: h3 > a
+    xpath: ""
+    attr: ""
+    next_stage: ""
+    remark: ""
+  - name: url
+    css: h3 > a
+    xpath: ""
+    attr: href
+    next_stage: ""
+    remark: ""
+  - name: abstract
+    css: p
+    xpath: ""
+    attr: ""
+    next_stage: ""
+    remark: ""
+settings:
+  ROBOTSTXT_OBEY: "false"
+  USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML,
+    like Gecko) Chrome/78.0.3904.108 Safari/537.36
diff --git a/spiders/zongheng_config/Spiderfile b/spiders/zongheng_config/Spiderfile
new file mode 100644
index 00000000..0163fac7
--- /dev/null
+++ b/spiders/zongheng_config/Spiderfile
@@ -0,0 +1,45 @@
+name: "zongheng_config"
+display_name: "纵横(可配置)"
+remark: "纵横小说网,列表"
+type: "configurable"
+col: "results_zongheng_config"
+engine: scrapy
+start_url: http://www.zongheng.com/rank/details.html?rt=1&d=1
+start_stage: list
+stages:
+- name: list
+  is_list: true
+  list_css: .rank_d_list
+  list_xpath: ""
+  page_css: ""
+  page_xpath: ""
+  page_attr: href
+  fields:
+  - name: title
+    css: .rank_d_b_name > a
+    xpath: ""
+    attr: ""
+    next_stage: ""
+    remark: ""
+  - name: url
+    css: .rank_d_b_name > a
+    xpath: ""
+    attr: href
+    next_stage: ""
+    remark: ""
+  - name: abstract
+    css: body
+    xpath: ""
+    attr: ""
+    next_stage: ""
+    remark: ""
+  - name: votes
+    css: .rank_d_b_ticket
+    xpath: ""
+    attr: ""
+    next_stage: ""
+    remark: ""
+settings:
+  ROBOTSTXT_OBEY: "false"
+  USER_AGENT: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML,
+    like Gecko) Chrome/78.0.3904.108 Safari/537.36
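All of the *_config Spiderfiles added in this patch share one schema: a start_url, a start_stage, and a list of stages whose fields carry a css or xpath selector, an optional attr, and an optional next_stage. The actual code generation is handled by Crawlab's config_spider engine; purely as an illustration — the class below is hand-written, reusing selector values from spiders/douban_config/Spiderfile — a "list" stage corresponds to Scrapy extraction along these lines:

    import scrapy

    class DoubanConfigSketch(scrapy.Spider):
        name = 'douban_config_sketch'
        start_urls = ['https://book.douban.com/latest']  # start_url

        def parse(self, response):
            # list_css selects one node per result row
            for row in response.css('ul.cover-col-4 > li'):
                yield {
                    'title': row.css('h2 > a::text').get(),          # css, no attr -> text
                    'url': row.css('h2 > a::attr(href)').get(),      # css + attr: href
                    'img': row.css('a.cover img::attr(src)').get(),  # css + attr: src
                }
            # a stage with page_css/page_attr would also follow the pagination
            # link here, and a field with next_stage (e.g. url -> detail in
            # csdn_config or v2ex_config) would yield a Request whose callback
            # parses that next stage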