Added result stats for tasks

This commit is contained in:
Marvin Zhang
2019-04-22 21:06:08 +08:00
parent 6317c9ca1a
commit 017116bb29
3 changed files with 55 additions and 13 deletions

View File

@@ -1,5 +1,9 @@
import os
# Project root directory: three levels above this settings file.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Spider source code path
# NOTE(review): the relative-path assignment below is dead code — it is
# immediately overwritten by the BASE_DIR-based assignment that follows
# (this looks like old/new diff lines flattened together); confirm which
# value is intended and remove the other.
PROJECT_SOURCE_FILE_FOLDER = '../spiders'
PROJECT_SOURCE_FILE_FOLDER = os.path.join(BASE_DIR, "spiders")
# Path to the Python interpreter of a virtual environment
PYTHON_ENV_PATH = '/Users/chennan/Desktop/2019/env/bin/python'

View File

@@ -1,38 +1,55 @@
# encoding: utf-8
# NOTE(review): this chunk contains several variables assigned twice
# (old value followed by new value, flattened from a diff). At runtime the
# LAST assignment wins; confirm which line of each pair is intended and
# delete the other.
import os
# Project root directory: three levels above this settings file.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Spider source code path
PROJECT_SOURCE_FILE_FOLDER = os.path.join(BASE_DIR, "spiders")
# Path to the Python interpreter of a virtual environment
PYTHON_ENV_PATH = '/Users/chennan/Desktop/2019/env/bin/python'
# Spider deployment path
# NOTE(review): duplicate assignment — the '/var/crawlab' value two lines
# below overrides this one.
PROJECT_DEPLOY_FILE_FOLDER = os.path.join(BASE_DIR, 'deployfile')
# PROJECT_DEPLOY_FILE_FOLDER = '../deployfile'
PROJECT_DEPLOY_FILE_FOLDER = '/var/crawlab'
# NOTE(review): duplicate assignment — the relative path below overrides
# the BASE_DIR-based one.
PROJECT_LOGS_FOLDER = os.path.join(BASE_DIR, 'deployfile/logs')
# Spider log path
PROJECT_LOGS_FOLDER = '../deployfile/logs'
# Temporary folder used for packaging
PROJECT_TMP_FOLDER = '/tmp'
# celery variables
# NOTE(review): duplicate broker/backend URLs — the standard-port versions
# (6379 / 27017) below override the 56379 / 57017 ones here.
BROKER_URL = 'redis://127.0.0.1:56379/0'
CELERY_RESULT_BACKEND = 'mongodb://127.0.0.1:57017/'
# Celery broker URL
BROKER_URL = 'redis://127.0.0.1:6379/0'
# Celery result backend URL
CELERY_RESULT_BACKEND = 'mongodb://127.0.0.1:27017/'
# Celery MongoDB backend settings
CELERY_MONGODB_BACKEND_SETTINGS = {
'database': 'crawlab_test',
'taskmeta_collection': 'tasks_celery',
}
# Celery timezone
CELERY_TIMEZONE = 'Asia/Shanghai'
# Whether to enable UTC
CELERY_ENABLE_UTC = True
# Celery Scheduler Redis URL
CELERY_BEAT_SCHEDULER = 'utils.redisbeat.RedisScheduler'
CELERY_REDIS_SCHEDULER_URL = 'redis://localhost:6379'
CELERY_REDIS_SCHEDULER_KEY = 'celery:beat:order_tasks'
# flower variables
FLOWER_API_ENDPOINT = 'http://localhost:5555/api'
# database variables
# MongoDB variables
MONGO_HOST = '127.0.0.1'
# NOTE(review): duplicate assignment — 27017 below overrides 57017.
MONGO_PORT = 57017
MONGO_PORT = 27017
MONGO_DB = 'crawlab_test'
# flask variables
# Flask variables
DEBUG = True
FLASK_HOST = '127.0.0.1'
FLASK_PORT = 8000

View File

@@ -42,9 +42,21 @@ class TaskApi(BaseApi):
elif id is not None:
task = db_manager.get(col_name=self.col_name, id=id)
spider = db_manager.get(col_name='spiders', id=str(task['spider_id']))
task['spider_name'] = spider['name']
# spider
task['num_results'] = 0
if spider:
task['spider_name'] = spider['name']
if spider.get('col'):
col = spider.get('col')
num_results = db_manager.count(col, {'task_id': task['_id']})
task['num_results'] = num_results
# duration
if task.get('finish_ts') is not None:
task['duration'] = (task['finish_ts'] - task['create_ts']).total_seconds()
task['avg_num_results'] = round(task['num_results'] / task['duration'], 1)
try:
with open(task['log_file_path']) as f:
task['log'] = f.read()
@@ -76,13 +88,22 @@ class TaskApi(BaseApi):
if task.get('status') is None:
task['status'] = TaskStatus.UNAVAILABLE
# spider name
# spider
task['num_results'] = 0
if _spider:
# spider name
task['spider_name'] = _spider['name']
# number of results
if _spider.get('col'):
col = _spider.get('col')
num_results = db_manager.count(col, {'task_id': task['_id']})
task['num_results'] = num_results
# duration
if task.get('finish_ts') is not None:
task['duration'] = (task['finish_ts'] - task['create_ts']).total_seconds()
task['avg_num_results'] = round(task['num_results'] / task['duration'], 1)
items.append(task)