From 9a00be811e057a473d80e8867558d4f9a944d459 Mon Sep 17 00:00:00 2001 From: Marvin Zhang Date: Sat, 20 Apr 2019 20:04:09 +0800 Subject: [PATCH] added params for spider schedules --- crawlab/config.py | 3 ++- crawlab/routes/schedules.py | 3 ++- crawlab/routes/spiders.py | 9 ++++++++- crawlab/tasks/scheduler.py | 17 +++++++++++++---- crawlab/tasks/spider.py | 13 +++++++------ frontend/src/i18n/zh.js | 5 +++++ frontend/src/views/schedule/ScheduleList.vue | 17 +++++++++++++++++ 7 files changed, 54 insertions(+), 13 deletions(-) diff --git a/crawlab/config.py b/crawlab/config.py index d2d69f81..bad08ee2 100644 --- a/crawlab/config.py +++ b/crawlab/config.py @@ -5,7 +5,8 @@ PROJECT_SOURCE_FILE_FOLDER = '../spiders' PYTHON_ENV_PATH = '/Users/chennan/Desktop/2019/env/bin/python' # 爬虫部署路径 -PROJECT_DEPLOY_FILE_FOLDER = '../deployfile' +# PROJECT_DEPLOY_FILE_FOLDER = '../deployfile' +PROJECT_DEPLOY_FILE_FOLDER = '/var/crawlab' # 爬虫日志路径 PROJECT_LOGS_FOLDER = '../deployfile/logs' diff --git a/crawlab/routes/schedules.py b/crawlab/routes/schedules.py index 532a4ec5..01db8be1 100644 --- a/crawlab/routes/schedules.py +++ b/crawlab/routes/schedules.py @@ -17,7 +17,8 @@ class ScheduleApi(BaseApi): ('name', str), ('description', str), ('cron', str), - ('spider_id', str) + ('spider_id', str), + ('params', str) ) def after_update(self, id: str = None): diff --git a/crawlab/routes/spiders.py b/crawlab/routes/spiders.py index f36903e3..ba315ce9 100644 --- a/crawlab/routes/spiders.py +++ b/crawlab/routes/spiders.py @@ -193,12 +193,19 @@ class SpiderApi(BaseApi): :param id: spider_id :return: """ - job = execute_spider.delay(id) + args = self.parser.parse_args() + params = args.get('params') + + spider = db_manager.get('spiders', id=ObjectId(id)) + + job = execute_spider.delay(id, params) # create a new task db_manager.save('tasks', { '_id': job.id, 'spider_id': ObjectId(id), + 'cmd': spider.get('cmd'), + 'params': params, 'create_ts': datetime.utcnow(), 'status': TaskStatus.PENDING }) diff --git a/crawlab/tasks/scheduler.py b/crawlab/tasks/scheduler.py index bf29607f..55e8fc36 100644 --- a/crawlab/tasks/scheduler.py +++ b/crawlab/tasks/scheduler.py @@ -22,12 +22,15 @@ class Scheduler(object): # scheduler instance scheduler = BackgroundScheduler(jobstores=jobstores) - def execute_spider(self, id: str): + def execute_spider(self, id: str, params: str = None): + query = {} + if params is not None: + query['params'] = params r = requests.get('http://%s:%s/api/spiders/%s/on_crawl' % ( FLASK_HOST, FLASK_PORT, id - )) + ), query) def update(self): # remove all existing periodic jobs @@ -44,9 +47,15 @@ class Scheduler(object): day = cron_arr[3] month = cron_arr[4] day_of_week = cron_arr[5] - self.scheduler.add_job(func=self.execute_spider, trigger='cron', args=(str(task['spider_id']),), + self.scheduler.add_job(func=self.execute_spider, + args=(str(task['spider_id']), task.get('params'),), + trigger='cron', jobstore='mongo', - day_of_week=day_of_week, month=month, day=day, hour=hour, minute=minute, + day_of_week=day_of_week, + month=month, + day=day, + hour=hour, + minute=minute, second=second) def run(self): diff --git a/crawlab/tasks/spider.py b/crawlab/tasks/spider.py index 3413a021..c71c3f34 100644 --- a/crawlab/tasks/spider.py +++ b/crawlab/tasks/spider.py @@ -11,7 +11,7 @@ from utils.log import other as logger @celery_app.task(bind=True) -def execute_spider(self, id: str): +def execute_spider(self, id: str, params: str = None): """ Execute spider task. :param self: @@ -23,6 +23,8 @@ def execute_spider(self, id: str): command = spider.get('cmd') if command.startswith("env"): command = PYTHON_ENV_PATH + command.replace("env", "") + if params is not None: + command += ' ' + params current_working_directory = os.path.join(PROJECT_DEPLOY_FILE_FOLDER, str(spider.get('_id'))) @@ -43,7 +45,7 @@ def execute_spider(self, id: str): stdout = open(log_file_path, 'a') stderr = open(log_file_path, 'a') - # create a new task + # update task status as started db_manager.update_one('tasks', id=task_id, values={ 'start_ts': datetime.utcnow(), 'node_id': hostname, @@ -68,7 +70,9 @@ def execute_spider(self, id: str): env['CRAWLAB_COLLECTION'] = spider.get('col') # start process - p = subprocess.Popen(command.split(' '), + cmd_arr = command.split(' ') + cmd_arr = list(filter(lambda x: x != '', cmd_arr)) + p = subprocess.Popen(cmd_arr, stdout=stdout.fileno(), stderr=stderr.fileno(), cwd=current_working_directory, @@ -87,9 +91,6 @@ def execute_spider(self, id: str): # save task when the task is finished db_manager.update_one('tasks', id=task_id, values={ - 'node_id': hostname, - 'hostname': hostname, - 'log_file_path': log_file_path, 'finish_ts': datetime.utcnow(), 'status': status }) diff --git a/frontend/src/i18n/zh.js b/frontend/src/i18n/zh.js index 7e127114..a437dcf1 100644 --- a/frontend/src/i18n/zh.js +++ b/frontend/src/i18n/zh.js @@ -111,6 +111,11 @@ export default { // 部署 'Time': '时间', + // 定时任务 + 'Schedule Name': '定时任务名称', + 'Schedule Description': '定时任务描述', + 'Parameters': '参数', + // 文件 'Choose Folder': '选择文件', diff --git a/frontend/src/views/schedule/ScheduleList.vue b/frontend/src/views/schedule/ScheduleList.vue index 3b1e3307..7bcdcdf0 100644 --- a/frontend/src/views/schedule/ScheduleList.vue +++ b/frontend/src/views/schedule/ScheduleList.vue @@ -31,6 +31,15 @@ + + + + + + @@ -130,6 +139,14 @@ export default { ]), filteredTableData () { return this.scheduleList + }, + spider () { + for (let i = 0; i < this.spiderList.length; i++) { + if (this.spiderList[i]._id === this.scheduleForm.spider_id) { + return this.spiderList[i] + } + } + return {} } }, methods: {