From a4a0fdd67b85571d4354d516c83b6e802b90544e Mon Sep 17 00:00:00 2001
From: Marvin Zhang
Date: Mon, 18 Feb 2019 21:16:40 +0800
Subject: [PATCH] added tasks

---
 config/__init__.py     |  1 +
 routes/spiders.py      |  2 +-
 spiders/baidu/baidu.py |  5 ++---
 tasks/__init__.py      | 10 +++++++++-
 tasks/deploy.py        | 18 ++++++++++++++++++
 tasks/spider.py        | 25 +++++++++++++++++++++----
 6 files changed, 52 insertions(+), 9 deletions(-)
 create mode 100644 tasks/deploy.py

diff --git a/config/__init__.py b/config/__init__.py
index ca07cdb2..4a968adb 100644
--- a/config/__init__.py
+++ b/config/__init__.py
@@ -1 +1,2 @@
 PROJECT_FILE_FOLDER = '/var/crawlab'
+PROJECT_LOGS_FOLDER = '/Users/yeqing/projects/crawlab/logs/crawlab'
diff --git a/routes/spiders.py b/routes/spiders.py
index 6fb2096d..678072a5 100644
--- a/routes/spiders.py
+++ b/routes/spiders.py
@@ -27,7 +27,7 @@ class SpiderApi(BaseApi):
 
     def crawl(self, id):
         job = execute_spider.delay(id)
-        print('crawl: %s' % id)
+        # print('crawl: %s' % id)
         return {
             'code': 200,
             'status': 'ok',
diff --git a/spiders/baidu/baidu.py b/spiders/baidu/baidu.py
index f0d7bba3..349afc37 100644
--- a/spiders/baidu/baidu.py
+++ b/spiders/baidu/baidu.py
@@ -1,6 +1,5 @@
 from time import sleep
 import requests
 
-for i in range(10):
-    r = requests.get('http://www.baidu.com')
-    sleep(0.1)
+r = requests.get('http://www.baidu.com')
+print(r.content)
diff --git a/tasks/__init__.py b/tasks/__init__.py
index 635bd48e..d7e86a00 100644
--- a/tasks/__init__.py
+++ b/tasks/__init__.py
@@ -1,9 +1,17 @@
+import os
+import sys
+import threading
+
 from celery import Celery
 
 app = Celery(__name__)
 app.config_from_object('config.celery')
 
 import tasks.spider
+import tasks.deploy
 
 if __name__ == '__main__':
-    app.start(argv=['tasks.spider', 'worker', '-P', 'eventlet', '-E', '-l', 'INFO'])
+    if sys.platform == 'windows':
+        app.start(argv=['tasks', 'worker', '-P', 'eventlet', '-E', '-l', 'INFO'])
+    else:
+        app.start(argv=['tasks', 'worker', '-E', '-l', 'INFO'])
diff --git a/tasks/deploy.py b/tasks/deploy.py
new file mode 100644
index 00000000..875be949
--- /dev/null
+++ b/tasks/deploy.py
@@ -0,0 +1,18 @@
+import os
+import sys
+from datetime import datetime
+
+import requests
+from celery.utils.log import get_logger
+
+from config import PROJECT_FILE_FOLDER, PROJECT_LOGS_FOLDER
+from db.manager import db_manager
+from tasks import app
+import subprocess
+
+logger = get_logger(__name__)
+
+
+@app.task
+def deploy_spider(id):
+    pass
diff --git a/tasks/spider.py b/tasks/spider.py
index d7e2d801..f0e67841 100644
--- a/tasks/spider.py
+++ b/tasks/spider.py
@@ -1,10 +1,11 @@
 import os
 import sys
+from datetime import datetime
 
 import requests
 from celery.utils.log import get_logger
 
-from config import PROJECT_FILE_FOLDER
+from config import PROJECT_FILE_FOLDER, PROJECT_LOGS_FOLDER
 from db.manager import db_manager
 from tasks import app
 import subprocess
@@ -18,14 +19,30 @@ def execute_spider(id: str):
     latest_version = db_manager.get_latest_version(spider_id=id)
     command = spider.get('cmd')
     current_working_directory = os.path.join(PROJECT_FILE_FOLDER, str(spider.get('_id')), str(latest_version))
+
+    # log info
+    logger.info('spider_id: %s' % id)
+    logger.info('version: %s' % latest_version)
+    logger.info(command)
+
+    # make sure the log folder exists
+    log_path = os.path.join(PROJECT_LOGS_FOLDER, id, str(latest_version))
+    if not os.path.exists(log_path):
+        os.makedirs(log_path)
+
+    # execute the command
     p = subprocess.Popen(command,
                          shell=True,
                          stdout=subprocess.PIPE,
-                         stderr=subprocess.STDOUT,
+                         stderr=subprocess.PIPE,
                          cwd=current_working_directory,
                          bufsize=1)
-    for i in iter(p.stdout.readline, 'b'):
-        yield i
+
+    # output the log file
+    log_file_path = os.path.join(log_path, '%s.txt' % datetime.now().strftime('%Y%m%d%H%M%S'))
+    with open(log_file_path, 'a') as f:
+        for line in p.stdout.readlines():
+            f.write(line.decode('utf-8') + '\n')
 
 
 @app.task