From 318e5876a6c548fc259e399c449b920b1924b35c Mon Sep 17 00:00:00 2001
From: Marvin Zhang
Date: Mon, 25 Feb 2019 19:32:43 +0800
Subject: [PATCH] fixed sorting issue

---
 db/manager.py                                  |  7 +++++--
 routes/spiders.py                              |  4 ++--
 routes/stats.py                                | 16 ++++++++++++++--
 routes/tasks.py                                |  2 +-
 spiders/taobao/taobao/items.py                 |  3 +--
 spiders/taobao/taobao/pipelines.py             |  1 +
 spiders/taobao/taobao/settings.py              |  3 ++-
 spiders/taobao/taobao/spiders/taobao_spider.py |  6 +++++-
 tasks/spider.py                                |  1 +
 9 files changed, 32 insertions(+), 11 deletions(-)

diff --git a/db/manager.py b/db/manager.py
index 3c3ca875..7d2035c8 100644
--- a/db/manager.py
+++ b/db/manager.py
@@ -40,10 +40,13 @@ class DbManager(object):
         col = self.db[col_name]
         col.remove({'_id': ObjectId(id)})
 
-    def list(self, col_name: str, cond: dict, skip: int = 0, limit: int = 100, **kwargs):
+    def list(self, col_name: str, cond: dict, sort_key=None, sort_direction=DESCENDING, skip: int = 0, limit: int = 100,
+             **kwargs):
+        if sort_key is None:
+            sort_key = '_id'
         col = self.db[col_name]
         data = []
-        for item in col.find(cond).skip(skip).limit(limit):
+        for item in col.find(cond).sort(sort_key, sort_direction).skip(skip).limit(limit):
             data.append(item)
         return data
 
diff --git a/routes/spiders.py b/routes/spiders.py
index e3e62ba1..441e4209 100644
--- a/routes/spiders.py
+++ b/routes/spiders.py
@@ -142,7 +142,7 @@
         })
 
     def get_deploys(self, id):
-        items = db_manager.list('deploys', {'spider_id': ObjectId(id)}, limit=10)
+        items = db_manager.list('deploys', {'spider_id': ObjectId(id)}, limit=10, sort_key='create_ts')
         deploys = []
         for item in items:
             spider_id = item['spider_id']
@@ -155,7 +155,7 @@
         })
 
     def get_tasks(self, id):
-        items = db_manager.list('tasks', {'spider_id': ObjectId(id)}, limit=10)
+        items = db_manager.list('tasks', {'spider_id': ObjectId(id)}, limit=10, sort_key='finish_ts')
         for item in items:
             spider_id = item['spider_id']
             spider = db_manager.get('spiders', id=str(spider_id))
diff --git a/routes/stats.py b/routes/stats.py
index e425737b..f637039f 100644
--- a/routes/stats.py
+++ b/routes/stats.py
@@ -1,4 +1,5 @@
 import os
+from datetime import datetime, timedelta
 
 from flask_restful import reqparse, Resource
 
@@ -54,9 +55,20 @@
                 }
             }
         ])
-        daily_tasks = []
+        date_cache = {}
         for item in cur:
-            daily_tasks.append(item)
+            date_cache[item['_id']] = item['count']
+        start_date = datetime.now() - timedelta(31)
+        end_date = datetime.now() - timedelta(1)
+        date = start_date
+        daily_tasks = []
+        while date < end_date:
+            date = date + timedelta(1)
+            date_str = date.strftime('%Y-%m-%d')
+            daily_tasks.append({
+                'date': date_str,
+                'count': date_cache.get(date_str) or 0,
+            })
 
         return {
             'status': 'ok',
diff --git a/routes/tasks.py b/routes/tasks.py
index 8a0d686b..14e384d3 100644
--- a/routes/tasks.py
+++ b/routes/tasks.py
@@ -33,7 +33,7 @@
                 task['log'] = f.read()
             return jsonify(task)
 
-        tasks = db_manager.list('tasks', {}, limit=1000)
+        tasks = db_manager.list('tasks', {}, limit=1000, sort_key='finish_ts')
         items = []
         for task in tasks:
             _task = db_manager.get('tasks_celery', id=task['_id'])
diff --git a/spiders/taobao/taobao/items.py b/spiders/taobao/taobao/items.py
index 322bd0fa..199c1f82 100644
--- a/spiders/taobao/taobao/items.py
+++ b/spiders/taobao/taobao/items.py
@@ -10,5 +10,4 @@ import scrapy
 
 class TaobaoItem(scrapy.Item):
     # define the fields for your item here like:
-    # name = scrapy.Field()
-    pass
+    name = scrapy.Field()
diff --git a/spiders/taobao/taobao/pipelines.py b/spiders/taobao/taobao/pipelines.py
index 21db0b84..7ddf8da5 100644
--- a/spiders/taobao/taobao/pipelines.py
+++ b/spiders/taobao/taobao/pipelines.py
@@ -8,4 +8,5 @@
 
 class TaobaoPipeline(object):
     def process_item(self, item, spider):
+        print('task_id: %s' % spider.task_id)
         return item
diff --git a/spiders/taobao/taobao/settings.py b/spiders/taobao/taobao/settings.py
index b68be261..0e237049 100644
--- a/spiders/taobao/taobao/settings.py
+++ b/spiders/taobao/taobao/settings.py
@@ -19,7 +19,8 @@ NEWSPIDER_MODULE = 'taobao.spiders'
 #USER_AGENT = 'taobao (+http://www.yourdomain.com)'
 
 # Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True
+ROBOTSTXT_OBEY = False
 
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 #CONCURRENT_REQUESTS = 32
diff --git a/spiders/taobao/taobao/spiders/taobao_spider.py b/spiders/taobao/taobao/spiders/taobao_spider.py
index 113efcf7..2a939a06 100644
--- a/spiders/taobao/taobao/spiders/taobao_spider.py
+++ b/spiders/taobao/taobao/spiders/taobao_spider.py
@@ -1,6 +1,10 @@
 # -*- coding: utf-8 -*-
+import os
+
 import scrapy
 
+from ..items import TaobaoItem
+
 
 class TaobaoSpiderSpider(scrapy.Spider):
     name = 'taobao_spider'
@@ -8,4 +12,4 @@ class TaobaoSpiderSpider(scrapy.Spider):
     start_urls = ['http://taobao.com/']
 
     def parse(self, response):
-        pass
+        yield TaobaoItem()
diff --git a/tasks/spider.py b/tasks/spider.py
index edcf7ab0..02019ec8 100644
--- a/tasks/spider.py
+++ b/tasks/spider.py
@@ -46,6 +46,7 @@
         'node_id': node_id,
         'hostname': hostname,
         'log_file_path': log_file_path,
+        'spider_version': latest_version
     })
 
     # execute the command
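
A minimal usage sketch for the reworked DbManager.list() signature follows.
It is not part of the commit: it assumes db/manager.py exposes a module-level
db_manager instance (as the routes above appear to import it) and a reachable
MongoDB containing the collections named in the diff; the spider_id below is
a placeholder.

# Illustrative sketch only. Collection and field names mirror the diff above;
# the db_manager import path and the placeholder spider_id are assumptions.
from bson import ObjectId
from pymongo import ASCENDING

from db.manager import db_manager

# Newest-first tasks, mirroring routes/tasks.py; sort_direction defaults to
# DESCENDING, so passing sort_key alone is enough.
recent_tasks = db_manager.list('tasks', {}, limit=1000, sort_key='finish_ts')

# Oldest-first deploys for one spider, with an explicit ascending direction.
spider_id = ObjectId()  # placeholder; routes take this from the URL
oldest_deploys = db_manager.list(
    'deploys',
    {'spider_id': spider_id},
    sort_key='create_ts',
    sort_direction=ASCENDING,
    limit=10,
)

# With sort_key omitted, list() falls back to sorting by '_id' descending,
# so even unqualified listings come back newest-first.
all_spiders = db_manager.list('spiders', {})

Defaulting sort_direction to DESCENDING makes every list endpoint return the
most recent records first, which is presumably the sorting issue the commit
title refers to.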