diff --git a/crawlab/config/__init__.py b/crawlab/config/__init__.py
index 609b69de..4d2d8d10 100644
--- a/crawlab/config/__init__.py
+++ b/crawlab/config/__init__.py
@@ -1,10 +1,3 @@
 # encoding: utf-8
-import os
-
-run_env = os.environ.get("RUNENV", "local")
-
-if run_env == "local":  # 加载本地配置
-    from config.config_local import *
-else:
-    from config.config import *
+from config.config import *
diff --git a/crawlab/config/config.py b/crawlab/config/config.py
index afbcb9bf..08ab113c 100644
--- a/crawlab/config/config.py
+++ b/crawlab/config/config.py
@@ -6,7 +6,7 @@ BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 PROJECT_SOURCE_FILE_FOLDER = os.path.join(BASE_DIR, "spiders")
 
 # 配置python虚拟环境的路径
-PYTHON_ENV_PATH = '/Users/chennan/Desktop/2019/env/bin/python'
+PYTHON_ENV_PATH = '/Users/yeqing/.pyenv/shims/python'
 
 # 爬虫部署路径
 # PROJECT_DEPLOY_FILE_FOLDER = '../deployfile'
diff --git a/crawlab/config/config_local.py b/crawlab/config/config_local.py
deleted file mode 100644
index afbcb9bf..00000000
--- a/crawlab/config/config_local.py
+++ /dev/null
@@ -1,55 +0,0 @@
-import os
-
-BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-
-# 爬虫源码路径
-PROJECT_SOURCE_FILE_FOLDER = os.path.join(BASE_DIR, "spiders")
-
-# 配置python虚拟环境的路径
-PYTHON_ENV_PATH = '/Users/chennan/Desktop/2019/env/bin/python'
-
-# 爬虫部署路径
-# PROJECT_DEPLOY_FILE_FOLDER = '../deployfile'
-PROJECT_DEPLOY_FILE_FOLDER = '/var/crawlab'
-
-# 爬虫日志路径
-PROJECT_LOGS_FOLDER = '../deployfile/logs'
-
-# 打包临时文件夹
-PROJECT_TMP_FOLDER = '/tmp'
-
-# Celery中间者URL
-BROKER_URL = 'redis://127.0.0.1:6379/0'
-
-# Celery后台URL
-CELERY_RESULT_BACKEND = 'mongodb://127.0.0.1:27017/'
-
-# Celery MongoDB设置
-CELERY_MONGODB_BACKEND_SETTINGS = {
-    'database': 'crawlab_test',
-    'taskmeta_collection': 'tasks_celery',
-}
-
-# Celery时区
-CELERY_TIMEZONE = 'Asia/Shanghai'
-
-# 是否启用UTC
-CELERY_ENABLE_UTC = True
-
-# Celery Scheduler Redis URL
-CELERY_BEAT_SCHEDULER = 'utils.redisbeat.RedisScheduler'
-CELERY_REDIS_SCHEDULER_URL = 'redis://localhost:6379'
-CELERY_REDIS_SCHEDULER_KEY = 'celery:beat:order_tasks'
-
-# flower variables
-FLOWER_API_ENDPOINT = 'http://localhost:5555/api'
-
-# MongoDB 变量
-MONGO_HOST = '127.0.0.1'
-MONGO_PORT = 27017
-MONGO_DB = 'crawlab_test'
-
-# Flask 变量
-DEBUG = True
-FLASK_HOST = '127.0.0.1'
-FLASK_PORT = 8000
diff --git a/crawlab/routes/stats.py b/crawlab/routes/stats.py
index aa7432c6..fe43e7a9 100644
--- a/crawlab/routes/stats.py
+++ b/crawlab/routes/stats.py
@@ -1,13 +1,20 @@
 import os
+from collections import defaultdict
 from datetime import datetime, timedelta
 
 from flask_restful import reqparse, Resource
 
+from constants.task import TaskStatus
 from db.manager import db_manager
+from routes.base import BaseApi
 from utils import jsonify
 
 
-class StatsApi(Resource):
+class StatsApi(BaseApi):
+    arguments = [
+        ['spider_id', str],
+    ]
+
     def get(self, action: str = None) -> (dict, tuple):
         """
         GET method of StatsApi.
@@ -87,3 +94,142 @@ class StatsApi(Resource):
             },
             'daily_tasks': daily_tasks
         }
+
+    def get_spider_stats(self):
+        args = self.parser.parse_args()
+        spider_id = args.get('spider_id')
+        spider = db_manager.get('spiders', id=spider_id)
+        tasks = db_manager.list(
+            col_name='tasks',
+            cond={
+                'spider_id': spider['_id'],
+                'create_ts': {
+                    '$gte': datetime.now() - timedelta(30)
+                }
+            },
+            limit=9999999
+        )
+
+        # task count
+        task_count = len(tasks)
+
+        # calculate task count stats
+        task_count_by_status = defaultdict(int)
+        task_count_by_node = defaultdict(int)
+        total_seconds = 0
+        for task in tasks:
+            task_count_by_status[task['status']] += 1
+            task_count_by_node[task.get('node_id')] += 1
+            if task['status'] == TaskStatus.SUCCESS and task.get('finish_ts'):
+                duration = (task['finish_ts'] - task['create_ts']).total_seconds()
+                total_seconds += duration
+
+        # task count by node
+        task_count_by_node_ = []
+        for status, value in task_count_by_node.items():
+            task_count_by_node_.append({
+                'name': status,
+                'value': value
+            })
+
+        # task count by status
+        task_count_by_status_ = []
+        for status, value in task_count_by_status.items():
+            task_count_by_status_.append({
+                'name': status,
+                'value': value
+            })
+
+        # success rate
+        success_rate = task_count_by_status[TaskStatus.SUCCESS] / task_count
+
+        # average duration
+        avg_duration = total_seconds / task_count
+
+        # calculate task count by date
+        cur = db_manager.aggregate('tasks', [
+            {
+                '$match': {
+                    'spider_id': spider['_id']
+                }
+            },
+            {
+                '$project': {
+                    'date': {
+                        '$dateToString': {
+                            'format': '%Y-%m-%d',
+                            'date': '$create_ts'
+                        }
+                    },
+                    'duration': {
+                        '$subtract': [
+                            '$finish_ts',
+                            '$create_ts'
+                        ]
+                    }
+                }
+            },
+            {
+                '$group': {
+                    '_id': '$date',
+                    'count': {
+                        '$sum': 1
+                    },
+                    'duration': {
+                        '$avg': '$duration'
+                    }
+                }
+            },
+            {
+                '$sort': {
+                    '_id': 1
+                }
+            }
+        ])
+        date_cache = {}
+        for item in cur:
+            date_cache[item['_id']] = {
+                'duration': (item['duration'] or 0) / 1000,
+                'count': item['count']
+            }
+        start_date = datetime.now() - timedelta(31)
+        end_date = datetime.now() - timedelta(1)
+        date = start_date
+        daily_tasks = []
+        while date < end_date:
+            date = date + timedelta(1)
+            date_str = date.strftime('%Y-%m-%d')
+            d = date_cache.get(date_str)
+            row = {
+                'date': date_str,
+            }
+            if d is None:
+                row['count'] = 0
+                row['duration'] = 0
+            else:
+                row['count'] = d['count']
+                row['duration'] = d['duration']
+            daily_tasks.append(row)
+
+        # calculate total results
+        result_count = 0
+        col_name = spider.get('col')
+        if col_name is not None:
+            for task in tasks:
+                result_count += db_manager.count(col_name, {'task_id': task['_id']})
+
+        # top tasks
+        # top_10_tasks = db_manager.list('tasks', {'spider_id': spider['_id']})
+
+        return {
+            'status': 'ok',
+            'overview': {
+                'task_count': task_count,
+                'result_count': result_count,
+                'success_rate': success_rate,
+                'avg_duration': avg_duration
+            },
+            'task_count_by_status': task_count_by_status_,
+            'task_count_by_node': task_count_by_node_,
+            'daily_stats': daily_tasks,
+        }
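For reference, a minimal sketch of how the new get_spider_stats action can be exercised over HTTP once the backend is running; this snippet is not part of the patch. The base URL is the development default from frontend/src/api/request.js, the spider id is a hypothetical placeholder, and the keys read from the response follow the return value of get_spider_stats above.

import requests

BASE_URL = 'http://localhost:8000/api'        # dev default from frontend/src/api/request.js
SPIDER_ID = '5c0000000000000000000000'        # hypothetical spider _id

resp = requests.get(BASE_URL + '/stats/get_spider_stats', params={'spider_id': SPIDER_ID})
data = resp.json()

# overview metrics computed in get_spider_stats
overview = data['overview']
print(overview['task_count'], overview['result_count'], overview['success_rate'], overview['avg_duration'])

# chart-ready series of {'name': ..., 'value': ...} dicts
print(data['task_count_by_status'])
print(data['task_count_by_node'])

# one row per day over the last 30 days
for row in data['daily_stats']:
    print(row['date'], row['count'], row['duration'])
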
diff --git a/crawlab/tasks/spider.py b/crawlab/tasks/spider.py
index 0d843e22..48cafc27 100644
--- a/crawlab/tasks/spider.py
+++ b/crawlab/tasks/spider.py
@@ -1,4 +1,5 @@
 import os
+import sys
 from datetime import datetime
 from time import sleep
 
@@ -35,8 +36,16 @@ def execute_spider(self, id: str, params: str = None):
     hostname = self.request.hostname
     spider = db_manager.get('spiders', id=id)
     command = spider.get('cmd')
-    if command.startswith("env"):
-        command = PYTHON_ENV_PATH + command.replace("env", "")
+
+    # if start with python, then use sys.executable to execute in the virtualenv
+    if command.startswith('python '):
+        command = command.replace('python ', sys.executable + ' ')
+
+    # if start with scrapy, then use sys.executable to execute scrapy as module in the virtualenv
+    elif command.startswith('scrapy '):
+        command = command.replace('scrapy ', sys.executable + ' -m scrapy ')
+
+    # pass params to the command
     if params is not None:
         command += ' ' + params
 
@@ -95,26 +104,33 @@ def execute_spider(self, id: str, params: str = None):
     # start process
     cmd_arr = command.split(' ')
     cmd_arr = list(filter(lambda x: x != '', cmd_arr))
-    p = subprocess.Popen(cmd_arr,
-                         stdout=stdout.fileno(),
-                         stderr=stderr.fileno(),
-                         cwd=current_working_directory,
-                         env=env,
-                         bufsize=1)
+    try:
+        p = subprocess.Popen(cmd_arr,
+                             stdout=stdout.fileno(),
+                             stderr=stderr.fileno(),
+                             cwd=current_working_directory,
+                             env=env,
+                             bufsize=1)
 
-    # get output from the process
-    _stdout, _stderr = p.communicate()
+        # get output from the process
+        _stdout, _stderr = p.communicate()
 
-    # get return code
-    code = p.poll()
-    if code == 0:
-        status = TaskStatus.SUCCESS
-    else:
+        # get return code
+        code = p.poll()
+        if code == 0:
+            status = TaskStatus.SUCCESS
+        else:
+            status = TaskStatus.FAILURE
+    except Exception as err:
+        logger.error(err)
+        stderr.write(str(err))
         status = TaskStatus.FAILURE
 
     # save task when the task is finished
+    finish_ts = datetime.utcnow()
     db_manager.update_one('tasks', id=task_id, values={
-        'finish_ts': datetime.utcnow(),
+        'finish_ts': finish_ts,
+        'duration': (finish_ts - task['create_ts']).total_seconds(),
         'status': status
     })
     task = db_manager.get('tasks', id=id)
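The prefix rewrite above can be shown in isolation; this is only a rough sketch of the same substitution logic, not part of the patch. rewrite_command is a hypothetical helper, and the printed interpreter path depends on where the worker's sys.executable points.

import sys

def rewrite_command(command: str) -> str:
    # "python ..." is re-run with the worker's own interpreter (the virtualenv one)
    if command.startswith('python '):
        return command.replace('python ', sys.executable + ' ')
    # "scrapy ..." is run as a module of that same interpreter
    elif command.startswith('scrapy '):
        return command.replace('scrapy ', sys.executable + ' -m scrapy ')
    return command

print(rewrite_command('python spider.py'))       # e.g. /usr/local/bin/python3 spider.py
print(rewrite_command('scrapy crawl quotes'))    # e.g. /usr/local/bin/python3 -m scrapy crawl quotes
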
diff --git a/crawlab/utils/file.py b/crawlab/utils/file.py
index d549c62d..06163d49 100644
--- a/crawlab/utils/file.py
+++ b/crawlab/utils/file.py
@@ -11,6 +11,7 @@ SUFFIX_LANG_MAPPING = {
     'sh': 'shell',
     'java': 'java',
     'c': 'c',
+    'go': 'go',
 }
 
 
@@ -48,11 +49,18 @@ def get_file_suffix_stats(path) -> dict:
     Get suffix stats of given file
     :param path: file path
     """
-    stats = defaultdict(int)
+    _stats = defaultdict(int)
     for file_path in get_file_list(path):
         suffix = get_file_suffix(file_path)
         if suffix is not None:
-            stats[suffix] += 1
+            _stats[suffix] += 1
+
+    # only return suffixes with languages
+    stats = {}
+    for suffix, count in _stats.items():
+        if SUFFIX_LANG_MAPPING.get(suffix) is not None:
+            stats[suffix] = count
+
     return stats
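A small sketch of the effect of the new filtering in get_file_suffix_stats: raw counts are still accumulated per suffix, but only suffixes present in SUFFIX_LANG_MAPPING are returned. The mapping below lists just the entries visible in the hunk above (the real one has more), and the file list is hypothetical.

from collections import defaultdict

# entries visible in the hunk above; the real mapping in crawlab/utils/file.py is longer
SUFFIX_LANG_MAPPING = {'sh': 'shell', 'java': 'java', 'c': 'c', 'go': 'go'}

# hypothetical spider directory contents
files = ['run.sh', 'Main.java', 'crawler.go', 'README.md', 'settings.json']

_stats = defaultdict(int)
for file_path in files:
    _stats[file_path.rsplit('.', 1)[-1]] += 1

# only return suffixes with languages
stats = {s: c for s, c in _stats.items() if SUFFIX_LANG_MAPPING.get(s) is not None}
print(stats)  # {'sh': 1, 'java': 1, 'go': 1} -- 'md' and 'json' are dropped
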
diff --git a/frontend/src/api/login 2.js b/frontend/src/api/login 2.js
deleted file mode 100644
index 4699f07e..00000000
--- a/frontend/src/api/login 2.js
+++ /dev/null
@@ -1,27 +0,0 @@
-import request from '@/utils/request'
-
-export function login (username, password) {
-  return request({
-    url: '/user/login',
-    method: 'post',
-    data: {
-      username,
-      password
-    }
-  })
-}
-
-export function getInfo (token) {
-  return request({
-    url: '/user/info',
-    method: 'get',
-    params: { token }
-  })
-}
-
-export function logout () {
-  return request({
-    url: '/user/logout',
-    method: 'post'
-  })
-}
diff --git a/frontend/src/api/request 2.js b/frontend/src/api/request 2.js
deleted file mode 100644
index 6ec95917..00000000
--- a/frontend/src/api/request 2.js
+++ /dev/null
@@ -1,46 +0,0 @@
-import axios from 'axios'
-
-let baseUrl = 'http://localhost:8000/api'
-if (process.env.NODE_ENV === 'production') {
-  baseUrl = 'http://139.129.230.98:8000/api'
-}
-// const baseUrl = process.env.API_BASE_URL || 'http://localhost:8000/api'
-
-const request = (method, path, params, data) => {
-  return new Promise((resolve, reject) => {
-    const url = `${baseUrl}${path}`
-    axios({
-      method,
-      url,
-      params,
-      data
-    })
-      .then(resolve)
-      .catch(reject)
-  })
-}
-
-const get = (path, params) => {
-  return request('GET', path, params)
-}
-
-const post = (path, data) => {
-  return request('POST', path, {}, data)
-}
-
-const put = (path, data) => {
-  return request('PUT', path, {}, data)
-}
-
-const del = (path, data) => {
-  return request('DELETE', path)
-}
-
-export default {
-  baseUrl,
-  request,
-  get,
-  post,
-  put,
-  delete: del
-}
diff --git a/frontend/src/api/request.js b/frontend/src/api/request.js
index 6ec95917..53603af8 100644
--- a/frontend/src/api/request.js
+++ b/frontend/src/api/request.js
@@ -4,6 +4,7 @@ let baseUrl = 'http://localhost:8000/api'
 if (process.env.NODE_ENV === 'production') {
   baseUrl = 'http://139.129.230.98:8000/api'
 }
+// console.log(process.env)
 // const baseUrl = process.env.API_BASE_URL || 'http://localhost:8000/api'
 
 const request = (method, path, params, data) => {
diff --git a/frontend/src/components/Stats/MetricCard.vue b/frontend/src/components/Stats/MetricCard.vue
new file mode 100644
index 00000000..14658dbd
--- /dev/null
+++ b/frontend/src/components/Stats/MetricCard.vue
@@ -0,0 +1,89 @@
+
+
+
+
diff --git a/frontend/src/components/Stats/SpiderStats.vue b/frontend/src/components/Stats/SpiderStats.vue
new file mode 100644
index 00000000..516c6b0e
--- /dev/null
+++ b/frontend/src/components/Stats/SpiderStats.vue
@@ -0,0 +1,258 @@
+
+
+
+
diff --git a/frontend/src/i18n/zh.js b/frontend/src/i18n/zh.js
index 13baddc7..c3d3dc6d 100644
--- a/frontend/src/i18n/zh.js
+++ b/frontend/src/i18n/zh.js
@@ -18,6 +18,7 @@ export default {
   'Log': '日志',
   'Results': '结果',
   'Environment': '环境',
+  'Analytics': '分析',
 
   // 选择
   Spider: '爬虫',
@@ -88,6 +89,14 @@ export default {
   'Add Environment Variables': '添加环境变量',
   'Last 7-Day Tasks': '最近7天任务数',
   'Last 5-Run Errors': '最近5次运行错误数',
+  '30-Day Tasks': '最近30天任务数',
+  '30-Day Results': '最近30天结果数',
+  'Success Rate': '运行成功率',
+  'Avg Duration (sec)': '平均运行时长(秒)',
+  'Tasks by Status': '分状态任务数',
+  'Tasks by Node': '分节点任务数',
+  'Daily Tasks': '每日任务数',
+  'Daily Avg Duration (sec)': '每日平均运行时长(秒)',
 
   // 爬虫列表
   'Name': '名称',
diff --git a/frontend/src/store/modules/spider.js b/frontend/src/store/modules/spider.js
index 1e18ecde..b8345082 100644
--- a/frontend/src/store/modules/spider.js
+++ b/frontend/src/store/modules/spider.js
@@ -14,7 +14,19 @@ const state = {
   importForm: {
     url: '',
     type: 'github'
-  }
+  },
+
+  // spider overview stats
+  overviewStats: {},
+
+  // spider status stats
+  statusStats: [],
+
+  // spider daily stats
+  dailyStats: [],
+
+  // spider node stats
+  nodeStats: []
 }
 
 const getters = {}
@@ -31,7 +43,19 @@ const mutations = {
   },
   SET_IMPORT_FORM (state, value) {
     state.importForm = value
-  }
+  },
+  SET_OVERVIEW_STATS (state, value) {
+    state.overviewStats = value
+  },
+  SET_STATUS_STATS (state, value) {
+    state.statusStats = value
+  },
+  SET_DAILY_STATS (state, value) {
+    state.dailyStats = value
+  },
+  SET_NODE_STATS (state, value) {
+    state.nodeStats = value
+  },
 }
 
 const actions = {
@@ -138,6 +162,15 @@ const actions = {
       .then(response => {
         console.log(response)
       })
+  },
+  getSpiderStats ({ state, commit }) {
+    return request.get('/stats/get_spider_stats?spider_id=' + state.spiderForm._id)
+      .then(response => {
+        commit('SET_OVERVIEW_STATS', response.data.overview)
+        commit('SET_STATUS_STATS', response.data.task_count_by_status)
+        commit('SET_DAILY_STATS', response.data.daily_stats)
+        commit('SET_NODE_STATS', response.data.task_count_by_node)
+      })
+  }
 }
diff --git a/frontend/src/views/spider/SpiderDetail.vue b/frontend/src/views/spider/SpiderDetail.vue
index 56cf7982..dac7931e 100644
--- a/frontend/src/views/spider/SpiderDetail.vue
+++ b/frontend/src/views/spider/SpiderDetail.vue
@@ -19,6 +19,9 @@
+
+
+
@@ -30,10 +33,12 @@ import {
 import FileList from '../../components/FileList/FileList'
 import SpiderOverview from '../../components/Overview/SpiderOverview'
 import EnvironmentList from '../../components/Environment/EnvironmentList'
+import SpiderStats from '../../components/Stats/SpiderStats'
 
 export default {
   name: 'NodeDetail',
   components: {
+    SpiderStats,
     EnvironmentList,
     FileList,
     SpiderOverview
@@ -57,6 +62,11 @@ export default {
   },
   methods: {
     onTabClick () {
+      if (this.activeTabName === 'analytics') {
+        setTimeout(() => {
+          this.$refs['spider-stats'].update()
+        }, 0)
+      }
     },
     onSpiderChange (id) {
       this.$router.push(`/spiders/${id}`)