fixed sorting issue

Marvin Zhang
2019-02-25 19:32:43 +08:00
parent dc9f27e720
commit 318e5876a6
9 changed files with 32 additions and 11 deletions

View File

@@ -40,10 +40,13 @@ class DbManager(object):
         col = self.db[col_name]
         col.remove({'_id': ObjectId(id)})
 
-    def list(self, col_name: str, cond: dict, skip: int = 0, limit: int = 100, **kwargs):
+    def list(self, col_name: str, cond: dict, sort_key=None, sort_direction=DESCENDING, skip: int = 0, limit: int = 100,
+             **kwargs):
+        if sort_key is None:
+            sort_key = '_id'
         col = self.db[col_name]
         data = []
-        for item in col.find(cond).skip(skip).limit(limit):
+        for item in col.find(cond).sort(sort_key, sort_direction).skip(skip).limit(limit):
             data.append(item)
         return data
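For context, the updated method in isolation: a minimal runnable sketch, assuming DESCENDING is imported from pymongo at the top of the module (that import sits outside this hunk) and that self.db is a pymongo Database handle; the connection details below are illustrative, not from this commit:

    from pymongo import MongoClient, DESCENDING

    class DbManager(object):
        def __init__(self, mongo_uri='mongodb://localhost:27017', db_name='test'):
            # illustrative connection setup, not taken from this commit
            self.db = MongoClient(mongo_uri)[db_name]

        def list(self, col_name, cond, sort_key=None, sort_direction=DESCENDING,
                 skip=0, limit=100, **kwargs):
            # default to _id so results are still deterministically ordered
            if sort_key is None:
                sort_key = '_id'
            col = self.db[col_name]
            # MongoDB applies sort, then skip, then limit, regardless of call order
            return list(col.find(cond).sort(sort_key, sort_direction).skip(skip).limit(limit))

Because the new parameters are keyword arguments with defaults, existing callers that pass only skip/limit keep working unchanged.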

View File

@@ -142,7 +142,7 @@ class SpiderApi(BaseApi):
         })
 
     def get_deploys(self, id):
-        items = db_manager.list('deploys', {'spider_id': ObjectId(id)}, limit=10)
+        items = db_manager.list('deploys', {'spider_id': ObjectId(id)}, limit=10, sort_key='create_ts')
         deploys = []
         for item in items:
             spider_id = item['spider_id']
@@ -155,7 +155,7 @@ class SpiderApi(BaseApi):
         })
 
     def get_tasks(self, id):
-        items = db_manager.list('tasks', {'spider_id': ObjectId(id)}, limit=10)
+        items = db_manager.list('tasks', {'spider_id': ObjectId(id)}, limit=10, sort_key='finish_ts')
         for item in items:
             spider_id = item['spider_id']
             spider = db_manager.get('spiders', id=str(spider_id))
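Since sort_direction defaults to DESCENDING, both endpoints now return the 10 most recent records rather than the first 10 in natural (insertion) order. Any other caller can use the same pattern; a hypothetical example, not from this commit:

    # newest deploys first; pass sort_direction=ASCENDING to reverse the order
    recent_deploys = db_manager.list('deploys', {}, limit=10, sort_key='create_ts')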

View File

@@ -1,4 +1,5 @@
 import os
+from datetime import datetime, timedelta
 from flask_restful import reqparse, Resource
@@ -54,9 +55,20 @@ class StatsApi(Resource):
                 }
             }
         ])
-        daily_tasks = []
+        date_cache = {}
         for item in cur:
-            daily_tasks.append(item)
+            date_cache[item['_id']] = item['count']
+        start_date = datetime.now() - timedelta(31)
+        end_date = datetime.now() - timedelta(1)
+        date = start_date
+        daily_tasks = []
+        while date < end_date:
+            date = date + timedelta(1)
+            date_str = date.strftime('%Y-%m-%d')
+            daily_tasks.append({
+                'date': date_str,
+                'count': date_cache.get(date_str) or 0,
+            })
 
         return {
             'status': 'ok',
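This rewrite fixes a gap problem: the aggregation only yields dates that had at least one task, so a chart built directly from it would skip empty days. Caching counts by date string and then walking the full 30-day window zero-fills the gaps. The same technique as a standalone sketch, with a hypothetical fill_daily_counts helper name:

    from datetime import datetime, timedelta

    def fill_daily_counts(date_cache, days=30):
        # date_cache maps 'YYYY-MM-DD' -> task count, present only for days that had tasks
        start_date = datetime.now() - timedelta(days + 1)
        result = []
        for offset in range(1, days + 1):
            date_str = (start_date + timedelta(offset)).strftime('%Y-%m-%d')
            # missing dates get an explicit zero so the series is continuous
            result.append({'date': date_str, 'count': date_cache.get(date_str, 0)})
        return result

    print(fill_daily_counts({'2019-02-24': 5}))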

View File

@@ -33,7 +33,7 @@ class TaskApi(BaseApi):
                 task['log'] = f.read()
             return jsonify(task)
 
-        tasks = db_manager.list('tasks', {}, limit=1000)
+        tasks = db_manager.list('tasks', {}, limit=1000, sort_key='finish_ts')
         items = []
         for task in tasks:
             _task = db_manager.get('tasks_celery', id=task['_id'])

View File

@@ -10,5 +10,4 @@ import scrapy
 class TaobaoItem(scrapy.Item):
     # define the fields for your item here like:
     # name = scrapy.Field()
-    pass
+    name = scrapy.Field()

View File

@@ -8,4 +8,5 @@
 class TaobaoPipeline(object):
     def process_item(self, item, spider):
+        print('task_id: %s' % spider.task_id)
         return item

View File

@@ -19,7 +19,8 @@ NEWSPIDER_MODULE = 'taobao.spiders'
 #USER_AGENT = 'taobao (+http://www.yourdomain.com)'
 
 # Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True
+ROBOTSTXT_OBEY = False
 
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 #CONCURRENT_REQUESTS = 32

View File

@@ -1,6 +1,10 @@
 # -*- coding: utf-8 -*-
+import os
+
 import scrapy
+
+from ..items import TaobaoItem
 
 
 class TaobaoSpiderSpider(scrapy.Spider):
     name = 'taobao_spider'
@@ -8,4 +12,4 @@ class TaobaoSpiderSpider(scrapy.Spider):
     start_urls = ['http://taobao.com/']
 
     def parse(self, response):
-        pass
+        yield TaobaoItem()
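Taken together, the taobao changes wire the demo spider end to end: the item declares a name field, the spider yields an item so the pipeline's process_item actually fires, the pipeline echoes spider.task_id (presumably attached to the spider by the task runner; that code is not in this diff), and ROBOTSTXT_OBEY is switched off so the demo is not blocked by taobao.com's robots rules. A condensed single-file sketch of the moving parts, with an illustrative name value the commit does not set:

    import scrapy

    class TaobaoItem(scrapy.Item):
        name = scrapy.Field()

    class TaobaoSpiderSpider(scrapy.Spider):
        name = 'taobao_spider'
        start_urls = ['http://taobao.com/']

        def parse(self, response):
            # yield an item so downstream pipelines receive something to process
            yield TaobaoItem(name=response.css('title::text').extract_first())

    class TaobaoPipeline(object):
        def process_item(self, item, spider):
            # task_id is assumed to be set on the spider by the scheduler at launch
            print('task_id: %s' % getattr(spider, 'task_id', None))
            return item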

View File

@@ -46,6 +46,7 @@ def execute_spider(self, id: str, node_id: str):
         'node_id': node_id,
         'hostname': hostname,
         'log_file_path': log_file_path,
+        'spider_version': latest_version
     })
 
     # execute the command
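Recording latest_version on the task document lets a finished task be traced back to the exact spider build that produced it. A hedged reconstruction of the enclosing call, assuming the dict is being persisted through the same db_manager used elsewhere in this commit (the call site itself is outside this hunk):

    # hypothetical reconstruction, not verbatim from the source file
    db_manager.save('tasks', {
        'node_id': node_id,
        'hostname': hostname,
        'log_file_path': log_file_path,
        'spider_version': latest_version,
    })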