updated sinastock_spider
@@ -17,3 +17,5 @@ class NewsItem(scrapy.Item):
     url = scrapy.Field()
     text = scrapy.Field()
     task_id = scrapy.Field()
+    source = scrapy.Field()
+    stocks = scrapy.Field()
@@ -17,9 +17,12 @@ class SinastockPipeline(object):
         db = mongo[os.environ.get('MONGO_DB') or 'crawlab_test']
         col = db.get_collection(os.environ.get('CRAWLAB_COLLECTION') or 'stock_news')
 
+        # create indexes
+        col.create_index('stocks')
+        col.create_index('url')
+
     def process_item(self, item, spider):
         item['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
-        item['_id'] = item['url']
-        if self.col.find_one({'_id': item['_id']}) is None:
+        if self.col.find_one({'url': item['url']}) is None:
             self.col.save(item)
         return item
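The rewritten check dedupes on the natural key url instead of overloading _id. Note that find_one() followed by save() is a check-then-act pair that can race when several workers share the collection, and Collection.save() is deprecated in PyMongo 3.x and removed in 4.x. A minimal sketch of an atomic equivalent, assuming PyMongo and the same environment variables as the diff (the MONGO_HOST fallback is my assumption):

    # Sketch only, not part of the commit: dedupe with a unique index plus upsert.
    import os
    from pymongo import MongoClient

    mongo = MongoClient(os.environ.get('MONGO_HOST') or 'localhost')  # host var is assumed
    db = mongo[os.environ.get('MONGO_DB') or 'crawlab_test']
    col = db.get_collection(os.environ.get('CRAWLAB_COLLECTION') or 'stock_news')

    col.create_index('url', unique=True)  # create_index is idempotent

    def save_item(item):
        # Insert-if-absent in one server-side step, replacing find_one() + save().
        col.update_one({'url': item['url']}, {'$setOnInsert': dict(item)}, upsert=True)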
@@ -27,7 +27,8 @@ class SinastockSpiderSpider(scrapy.Spider):
                 url = f'http://vip.stock.finance.sina.com.cn/corp/view/vCB_AllNewsStock.php?symbol={ex.lower()}{code}&Page={i + 1}'
                 yield scrapy.Request(
                     url=url,
-                    callback=self.parse
+                    callback=self.parse,
+                    meta={'ts_code': s['ts_code']}
                 )
 
     def parse(self, response):
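meta is Scrapy's per-request state channel: whatever dict is attached here reappears as response.meta in the callback, which is how parse() can tag each news item with the stock it was crawled for (next hunk). A self-contained sketch of the round trip, with a hypothetical spider name and URL:

    import scrapy

    class MetaDemoSpider(scrapy.Spider):
        # Hypothetical spider, only to show Request.meta round-tripping.
        name = 'meta_demo'

        def start_requests(self):
            for ts_code in ['600000.SH', '000001.SZ']:  # example ts_codes
                yield scrapy.Request(
                    url=f'http://example.com/news?code={ts_code}',
                    callback=self.parse,
                    meta={'ts_code': ts_code},  # travels with the request
                )

        def parse(self, response):
            # The same dict comes back attached to the response.
            self.logger.info('page for %s', response.meta['ts_code'])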
@@ -36,6 +37,8 @@ class SinastockSpiderSpider(scrapy.Spider):
             item = NewsItem(
                 title=a.css('a::text').extract_first(),
                 url=url,
+                source='sina',
+                stocks=[response.meta['ts_code']]
             )
             yield scrapy.Request(
                 url=url,
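This hunk is truncated inside the follow-up request; given the text field on NewsItem, it presumably re-requests the article URL to fill in the body. A hypothetical continuation of the pattern (the callback name, selector, and item hand-off are assumptions, not from the commit):

    # Hypothetical continuation; none of these names appear in the diff.
    def parse(self, response):
        ...
        yield scrapy.Request(
            url=url,
            callback=self.parse_detail,  # assumed detail callback
            meta={'item': item},         # pass the half-built item along
        )

    def parse_detail(self, response):
        item = response.meta['item']
        item['text'] = ' '.join(response.css('p::text').extract())  # assumed selector
        yield item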
@@ -14,7 +14,10 @@ class XueqiuItem(scrapy.Item):
     task_id = scrapy.Field()
     id = scrapy.Field()
     text = scrapy.Field()
+    url = scrapy.Field()
     target = scrapy.Field()
     view_count = scrapy.Field()
     mark = scrapy.Field()
     created_at = scrapy.Field()
+    ts = scrapy.Field()
+    source = scrapy.Field()
@@ -17,9 +17,13 @@ class XueqiuPipeline(object):
         db = mongo[os.environ.get('MONGO_DB') or 'crawlab_test']
         col = db.get_collection(os.environ.get('CRAWLAB_COLLECTION') or 'results_xueqiu')
 
+        # create indexes
+        col.create_index('stocks')
+        col.create_index('id')
+        col.create_index('url')
+
     def process_item(self, item, spider):
         item['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
-        item['_id'] = item['id']
-        if self.col.find_one({'_id': item['_id']}) is None:
+        if self.col.find_one({'id': item['id']}) is None:
             self.col.save(item)
         return item
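Same pattern as the sina pipeline: the indexes are built once at import time (create_index is idempotent, so re-runs are harmless) and deduplication keys on the API's own id rather than _id. A quick way to verify the indexes landed, assuming a local mongod and the default names from the diff:

    from pymongo import MongoClient

    col = MongoClient()['crawlab_test']['results_xueqiu']
    for name, spec in col.index_information().items():
        print(name, spec['key'])  # expect _id_, stocks_1, id_1, url_1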
@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
 import json
+from datetime import datetime
 from time import sleep
 
 import scrapy
@@ -32,9 +33,11 @@ class XueqiuSpiderSpider(scrapy.Spider):
                 id=d['id'],
                 text=d['text'],
                 mark=d['mark'],
                 target=d['target'],
+                url=d['target'],
                 created_at=d['created_at'],
+                ts=datetime.fromtimestamp(d['created_at'] / 1e3),
                 view_count=d['view_count'],
+                source='xueqiu'
             )
             yield item
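Xueqiu's API reports created_at in milliseconds since the epoch, hence the division by 1e3 before datetime.fromtimestamp(); the raw value is kept alongside the derived ts. A quick check of the conversion (the sample value is illustrative):

    from datetime import datetime, timezone

    created_at = 1571443200000                     # ms since epoch, as the API returns it
    ts = datetime.fromtimestamp(created_at / 1e3)  # local time, as in the commit
    utc = datetime.fromtimestamp(created_at / 1e3, tz=timezone.utc)
    print(ts, utc)  # utc prints 2019-10-19 00:00:00+00:00; local depends on TZ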