updated sinastock_spider

This commit is contained in:
Marvin Zhang
2019-07-04 18:21:08 +08:00
parent f06e95537e
commit 541f17aa61
6 changed files with 24 additions and 6 deletions

View File

@@ -17,3 +17,5 @@ class NewsItem(scrapy.Item):
url = scrapy.Field()
text = scrapy.Field()
task_id = scrapy.Field()
source = scrapy.Field()
stocks = scrapy.Field()

View File

@@ -17,9 +17,12 @@ class SinastockPipeline(object):
# Database handle; MONGO_DB env var selects the database, defaulting to the
# Crawlab test database. NOTE(review): the `mongo` client object is created
# above this excerpt — presumably a pymongo MongoClient; confirm in full file.
db = mongo[os.environ.get('MONGO_DB') or 'crawlab_test']
# Target collection, overridable per-task via CRAWLAB_COLLECTION.
col = db.get_collection(os.environ.get('CRAWLAB_COLLECTION') or 'stock_news')
# create indexes
# Secondary indexes on the fields used for stock lookups and URL-based
# de-duplication in process_item. Runs at class-definition (import) time.
col.create_index('stocks')
col.create_index('url')
def process_item(self, item, spider):
    """Stamp the item with the Crawlab task id and persist it to MongoDB.

    The article URL doubles as the document ``_id`` so each news item is
    stored at most once; the item is written only when no document with
    the same URL already exists.

    :param item: scraped NewsItem (dict-like)
    :param spider: the spider that produced the item (unused)
    :return: the (possibly mutated) item, for downstream pipelines
    """
    item['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
    # URL as primary key makes repeated crawls idempotent per article.
    item['_id'] = item['url']
    if self.col.find_one({'url': item['url']}) is None:
        # insert_one replaces the deprecated Collection.save (removed in
        # PyMongo 4.x); the guard above guarantees this is a new document.
        self.col.insert_one(dict(item))
    return item

View File

@@ -27,7 +27,8 @@ class SinastockSpiderSpider(scrapy.Spider):
url = f'http://vip.stock.finance.sina.com.cn/corp/view/vCB_AllNewsStock.php?symbol={ex.lower()}{code}&Page={i + 1}'
yield scrapy.Request(
url=url,
callback=self.parse
callback=self.parse,
meta={'ts_code': s['ts_code']}
)
def parse(self, response):
@@ -36,6 +37,8 @@ class SinastockSpiderSpider(scrapy.Spider):
item = NewsItem(
title=a.css('a::text').extract_first(),
url=url,
source='sina',
stocks=[response.meta['ts_code']]
)
yield scrapy.Request(
url=url,

View File

@@ -14,7 +14,10 @@ class XueqiuItem(scrapy.Item):
task_id = scrapy.Field()
id = scrapy.Field()
text = scrapy.Field()
url = scrapy.Field()
target = scrapy.Field()
view_count = scrapy.Field()
mark = scrapy.Field()
created_at = scrapy.Field()
ts = scrapy.Field()
source = scrapy.Field()

View File

@@ -17,9 +17,13 @@ class XueqiuPipeline(object):
# Database handle; MONGO_DB env var selects the database, defaulting to the
# Crawlab test database. NOTE(review): the `mongo` client object is created
# above this excerpt — presumably a pymongo MongoClient; confirm in full file.
db = mongo[os.environ.get('MONGO_DB') or 'crawlab_test']
# Target collection, overridable per-task via CRAWLAB_COLLECTION.
col = db.get_collection(os.environ.get('CRAWLAB_COLLECTION') or 'results_xueqiu')
# create indexes
# Secondary indexes for stock lookups and the two identity fields (`id`,
# `url`) used for de-duplication. Runs at class-definition (import) time.
col.create_index('stocks')
col.create_index('id')
col.create_index('url')
def process_item(self, item, spider):
    """Stamp the item with the Crawlab task id and persist it to MongoDB.

    The Xueqiu post id doubles as the document ``_id`` so each post is
    stored at most once; the item is written only when no document with
    the same id already exists.

    :param item: scraped XueqiuItem (dict-like)
    :param spider: the spider that produced the item (unused)
    :return: the (possibly mutated) item, for downstream pipelines
    """
    item['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
    # Post id as primary key makes repeated crawls idempotent per post.
    item['_id'] = item['id']
    if self.col.find_one({'id': item['id']}) is None:
        # insert_one replaces the deprecated Collection.save (removed in
        # PyMongo 4.x); the guard above guarantees this is a new document.
        self.col.insert_one(dict(item))
    return item

View File

@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
import json
from datetime import datetime
from time import sleep
import scrapy
@@ -32,9 +33,11 @@ class XueqiuSpiderSpider(scrapy.Spider):
id=d['id'],
text=d['text'],
mark=d['mark'],
target=d['target'],
url=d['target'],
created_at=d['created_at'],
ts=datetime.fromtimestamp(d['created_at'] / 1e3),
view_count=d['view_count'],
source='xueqiu'
)
yield item