From db1cd7ec9b57ec36e412de4ce1717b35cfe57a2a Mon Sep 17 00:00:00 2001 From: Marvin Zhang Date: Tue, 23 Apr 2019 19:33:50 +0800 Subject: [PATCH] fix issue #12 --- crawlab/db/manager.py | 4 ++++ crawlab/tasks/spider.py | 21 +++++++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/crawlab/db/manager.py b/crawlab/db/manager.py index d210b81c..c1c6d88a 100644 --- a/crawlab/db/manager.py +++ b/crawlab/db/manager.py @@ -175,5 +175,9 @@ class DbManager(object): col = self.db[col_name] return col.aggregate(pipelines, **kwargs) + def create_index(self, col_name: str, keys: dict, **kwargs): + col = self.db[col_name] + col.create_index(keys=keys, **kwargs) + db_manager = DbManager() diff --git a/crawlab/tasks/spider.py b/crawlab/tasks/spider.py index c71c3f34..8a844968 100644 --- a/crawlab/tasks/spider.py +++ b/crawlab/tasks/spider.py @@ -1,5 +1,6 @@ import os from datetime import datetime +from time import sleep from bson import ObjectId from config import PROJECT_DEPLOY_FILE_FOLDER, PROJECT_LOGS_FOLDER, PYTHON_ENV_PATH @@ -10,6 +11,17 @@ import subprocess from utils.log import other as logger +def get_task(id: str): + i = 0 + while i < 5: + task = db_manager.get('tasks', id=id) + if task is not None: + return task + i += 1 + sleep(1) + return None + + @celery_app.task(bind=True) def execute_spider(self, id: str, params: str = None): """ @@ -26,6 +38,12 @@ def execute_spider(self, id: str, params: str = None): if params is not None: command += ' ' + params + # get task object and return if not found + task = get_task(task_id) + if task is None: + return + + # current working directory current_working_directory = os.path.join(PROJECT_DEPLOY_FILE_FOLDER, str(spider.get('_id'))) # log info @@ -69,6 +87,9 @@ def execute_spider(self, id: str, params: str = None): if spider.get('col'): env['CRAWLAB_COLLECTION'] = spider.get('col') + # create index to speed results data retrieval + db_manager.create_index(spider.get('col'), {'task_id': 1}) + # start process cmd_arr = command.split(' ') cmd_arr = list(filter(lambda x: x != '', cmd_arr))