From 541f17aa6128c532622ef47b15c6390f3ddb36c8 Mon Sep 17 00:00:00 2001
From: Marvin Zhang
Date: Thu, 4 Jul 2019 18:21:08 +0800
Subject: [PATCH] updated sinastock_spider

---
 spiders/sinastock/sinastock/items.py                    | 2 ++
 spiders/sinastock/sinastock/pipelines.py                | 7 +++++--
 spiders/sinastock/sinastock/spiders/sinastock_spider.py | 5 ++++-
 spiders/xueqiu/xueqiu/items.py                          | 3 +++
 spiders/xueqiu/xueqiu/pipelines.py                      | 8 ++++++--
 spiders/xueqiu/xueqiu/spiders/xueqiu_spider.py          | 5 ++++-
 6 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/spiders/sinastock/sinastock/items.py b/spiders/sinastock/sinastock/items.py
index 5c0e6570..6e3e5d8e 100644
--- a/spiders/sinastock/sinastock/items.py
+++ b/spiders/sinastock/sinastock/items.py
@@ -17,3 +17,5 @@ class NewsItem(scrapy.Item):
     url = scrapy.Field()
     text = scrapy.Field()
     task_id = scrapy.Field()
+    source = scrapy.Field()
+    stocks = scrapy.Field()
diff --git a/spiders/sinastock/sinastock/pipelines.py b/spiders/sinastock/sinastock/pipelines.py
index ba1996fd..e666c50d 100644
--- a/spiders/sinastock/sinastock/pipelines.py
+++ b/spiders/sinastock/sinastock/pipelines.py
@@ -17,9 +17,12 @@ class SinastockPipeline(object):
     db = mongo[os.environ.get('MONGO_DB') or 'crawlab_test']
     col = db.get_collection(os.environ.get('CRAWLAB_COLLECTION') or 'stock_news')
 
+    # create indexes
+    col.create_index('stocks')
+    col.create_index('url')
+
     def process_item(self, item, spider):
         item['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
-        item['_id'] = item['url']
-        if self.col.find_one({'_id': item['_id']}) is None:
+        if self.col.find_one({'url': item['url']}) is None:
             self.col.save(item)
         return item
diff --git a/spiders/sinastock/sinastock/spiders/sinastock_spider.py b/spiders/sinastock/sinastock/spiders/sinastock_spider.py
index 6b3051b6..9d258e6c 100644
--- a/spiders/sinastock/sinastock/spiders/sinastock_spider.py
+++ b/spiders/sinastock/sinastock/spiders/sinastock_spider.py
@@ -27,7 +27,8 @@ class SinastockSpiderSpider(scrapy.Spider):
                 url = f'http://vip.stock.finance.sina.com.cn/corp/view/vCB_AllNewsStock.php?symbol={ex.lower()}{code}&Page={i + 1}'
                 yield scrapy.Request(
                     url=url,
-                    callback=self.parse
+                    callback=self.parse,
+                    meta={'ts_code': s['ts_code']}
                 )
 
     def parse(self, response):
@@ -36,6 +37,8 @@
             item = NewsItem(
                 title=a.css('a::text').extract_first(),
                 url=url,
+                source='sina',
+                stocks=[response.meta['ts_code']]
             )
             yield scrapy.Request(
                 url=url,
diff --git a/spiders/xueqiu/xueqiu/items.py b/spiders/xueqiu/xueqiu/items.py
index e50e4823..5471594d 100644
--- a/spiders/xueqiu/xueqiu/items.py
+++ b/spiders/xueqiu/xueqiu/items.py
@@ -14,7 +14,10 @@ class XueqiuItem(scrapy.Item):
     task_id = scrapy.Field()
     id = scrapy.Field()
     text = scrapy.Field()
+    url = scrapy.Field()
     target = scrapy.Field()
     view_count = scrapy.Field()
     mark = scrapy.Field()
     created_at = scrapy.Field()
+    ts = scrapy.Field()
+    source = scrapy.Field()
diff --git a/spiders/xueqiu/xueqiu/pipelines.py b/spiders/xueqiu/xueqiu/pipelines.py
index 67173772..210ce7ac 100644
--- a/spiders/xueqiu/xueqiu/pipelines.py
+++ b/spiders/xueqiu/xueqiu/pipelines.py
@@ -17,9 +17,13 @@ class XueqiuPipeline(object):
     db = mongo[os.environ.get('MONGO_DB') or 'crawlab_test']
     col = db.get_collection(os.environ.get('CRAWLAB_COLLECTION') or 'results_xueqiu')
 
+    # create indexes
+    col.create_index('stocks')
+    col.create_index('id')
+    col.create_index('url')
+
     def process_item(self, item, spider):
         item['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
-        item['_id'] = item['id']
-        if self.col.find_one({'_id': item['_id']}) is None:
+        if self.col.find_one({'id': item['id']}) is None:
             self.col.save(item)
         return item
diff --git a/spiders/xueqiu/xueqiu/spiders/xueqiu_spider.py b/spiders/xueqiu/xueqiu/spiders/xueqiu_spider.py
index 6ccb13c0..a746e156 100644
--- a/spiders/xueqiu/xueqiu/spiders/xueqiu_spider.py
+++ b/spiders/xueqiu/xueqiu/spiders/xueqiu_spider.py
@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
 import json
+from datetime import datetime
 from time import sleep
 
 import scrapy
@@ -32,9 +33,11 @@ class XueqiuSpiderSpider(scrapy.Spider):
                 id=d['id'],
                 text=d['text'],
                 mark=d['mark'],
-                target=d['target'],
+                url=d['target'],
                 created_at=d['created_at'],
+                ts=datetime.fromtimestamp(d['created_at'] / 1e3),
                 view_count=d['view_count'],
+                source='xueqiu'
             )
             yield item
 
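A note on the dedup change in both pipelines: Collection.save() is deprecated as of PyMongo 3.0 (and removed in 4.0), and the find_one()-then-save() pair leaves a race window between the read and the write, plus an extra round trip per item. Below is a minimal sketch, not part of the patch, of an atomic alternative under PyMongo 3.x; the class name UpsertPipeline and the MONGO_URI variable stand in for client-construction code the patch does not show, while the CRAWLAB_* variables and collection default mirror the pipelines above.

import os

from pymongo import MongoClient


class UpsertPipeline(object):
    # MONGO_URI is illustrative; the real pipelines build their client elsewhere
    mongo = MongoClient(os.environ.get('MONGO_URI') or 'mongodb://localhost:27017')
    db = mongo[os.environ.get('MONGO_DB') or 'crawlab_test']
    col = db.get_collection(os.environ.get('CRAWLAB_COLLECTION') or 'stock_news')

    # a unique index turns 'url' into a de-facto primary key, so duplicates
    # are rejected by the database instead of depending on a prior read
    col.create_index('url', unique=True)

    def process_item(self, item, spider):
        item['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
        # $setOnInsert keeps the first-seen document, matching the
        # "skip if already present" semantics of the original check,
        # but in a single atomic write instead of find_one() + save()
        self.col.update_one(
            {'url': item['url']},
            {'$setOnInsert': dict(item)},
            upsert=True,
        )
        return item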
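The meta={'ts_code': ...} / response.meta pair added to sinastock_spider is Scrapy's standard mechanism for carrying state from the request that scheduled a page into the callback that parses it, and ts=datetime.fromtimestamp(d['created_at'] / 1e3) in xueqiu_spider converts a millisecond epoch timestamp into a datetime. A self-contained sketch of both patterns, with the URL and stock codes purely illustrative:

# -*- coding: utf-8 -*-
from datetime import datetime

import scrapy


class MetaDemoSpider(scrapy.Spider):
    """Sketch: pass per-request state via meta; convert ms timestamps."""
    name = 'meta_demo'

    def start_requests(self):
        # the stock code travels with the request instead of being
        # re-derived from the page in the callback
        for ts_code in ['600000.SH', '000001.SZ']:  # illustrative codes
            yield scrapy.Request(
                url=f'http://example.com/news?code={ts_code}',  # placeholder URL
                callback=self.parse,
                meta={'ts_code': ts_code},
            )

    def parse(self, response):
        # whatever was attached in start_requests is available here
        ts_code = response.meta['ts_code']
        created_at = 1562235668000  # epoch in milliseconds, as Xueqiu returns
        # dividing by 1e3 converts ms -> s before building a datetime
        ts = datetime.fromtimestamp(created_at / 1e3)
        yield {'stocks': [ts_code], 'ts': ts}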