From f3ff0a464af7d0899b71dad00c70281063a6d4c2 Mon Sep 17 00:00:00 2001
From: Marvin Zhang
Date: Fri, 1 Mar 2019 12:07:28 +0800
Subject: [PATCH] updated execute spider logic

---
 Dockerfile         |  2 --
 app.py             |  5 +++--
 config.py          |  4 +++-
 logger/__init__.py |  0
 routes/nodes.py    | 51 ++++++++++++++++++++++++++--------------------
 routes/spiders.py  | 37 +++++++++++++++++++++++++++++++++++--
 6 files changed, 70 insertions(+), 29 deletions(-)
 delete mode 100644 logger/__init__.py

diff --git a/Dockerfile b/Dockerfile
index 6ee63c70..275f5b0c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -6,8 +6,6 @@ FROM ubuntu:latest
 ADD . /opt/crawlab

 # add dns
-#RUN echo -e "nameserver 180.76.76.76" >> /etc/resolv.conf
-#ADD ./resolv.conf /etc
 RUN cat /etc/resolv.conf

 # install python
diff --git a/app.py b/app.py
index a1d790fb..3147580a 100644
--- a/app.py
+++ b/app.py
@@ -1,7 +1,8 @@
-from flask import Flask
+from flask import Flask, logging
 from flask_cors import CORS
 from flask_restful import Api

+from config import FLASK_HOST, FLASK_PORT
 from routes.deploys import DeployApi
 from routes.files import FileApi
 from routes.nodes import NodeApi
@@ -45,4 +46,4 @@ api.add_resource(StatsApi, '/api/stats/')


 if __name__ == '__main__':
-    app.run()
+    app.run(host=FLASK_HOST, port=FLASK_PORT)
diff --git a/config.py b/config.py
index 3cd974da..8fea5922 100644
--- a/config.py
+++ b/config.py
@@ -24,4 +24,6 @@ MONGO_DB = 'crawlab_test'

 # flask variables
 DEBUG = True
-SERVER_NAME = '0.0.0.0:5000'
+FLASK_HOST = '0.0.0.0'
+FLASK_PORT = 5000
+# SERVER_NAME = '0.0.0.0:5000'
diff --git a/logger/__init__.py b/logger/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/routes/nodes.py b/routes/nodes.py
index 094d5e7b..2d3e692c 100644
--- a/routes/nodes.py
+++ b/routes/nodes.py
@@ -1,7 +1,7 @@
 import json

 import requests
-from bson import ObjectId
+from flask import current_app

 from config import FLOWER_API_ENDPOINT
 from constants.node import NodeType
@@ -39,33 +39,40 @@ class NodeApi(BaseApi):
         # TODO: use query "?status=1" to get status of nodes

         # get status for each node
-        status_data = check_nodes_status()
+        status_data = {}
+        try:
+            status_data = check_nodes_status()
+        except Exception as err:
+            current_app.logger.error(err)

         # get a list of items
-        res = requests.get('%s/workers' % FLOWER_API_ENDPOINT)
         online_node_ids = []
-        for k, v in json.loads(res.content.decode('utf-8')).items():
-            node_name = k
-            node_celery = v
-            node = db_manager.get('nodes', id=node_name)
+        try:
+            res = requests.get('%s/workers' % FLOWER_API_ENDPOINT)
+            for k, v in json.loads(res.content.decode('utf-8')).items():
+                node_name = k
+                node_celery = v
+                node = db_manager.get('nodes', id=node_name)

-            # new node
-            if node is None:
-                node = {}
-                for _k, _v in node_celery.items():
-                    node[_k] = _v
-                node['_id'] = node_name
-                node['name'] = node_name
-                db_manager.save('nodes', node)
+                # new node
+                if node is None:
+                    node = {}
+                    for _k, _v in node_celery.items():
+                        node[_k] = _v
+                    node['_id'] = node_name
+                    node['name'] = node_name
+                    db_manager.save('nodes', node)

-            # existing node
-            else:
-                for _k, _v in v.items():
-                    node[_k] = _v
-                node['name'] = node_name
-                db_manager.save('nodes', node)
+                # existing node
+                else:
+                    for _k, _v in v.items():
+                        node[_k] = _v
+                    node['name'] = node_name
+                    db_manager.save('nodes', node)

-            online_node_ids.append(node_name)
+                online_node_ids.append(node_name)
+        except Exception as err:
+            current_app.logger.error(err)

         # iterate db nodes to update status
         nodes = []
diff --git a/routes/spiders.py b/routes/spiders.py
index f3251da1..dc719613 100644
--- a/routes/spiders.py
+++ b/routes/spiders.py
@@ -93,10 +93,43 @@ class SpiderApi(BaseApi):
         node_id = args.get('node_id')

         if node_id is None:
-            return {}, 400
+            return {
+                'code': 400,
+                'status': 400,
+                'error': 'node_id cannot be empty'
+            }, 400
+
+        # get node from db
+        node = db_manager.get('nodes', id=node_id)
+
+        # validate ip and port
+        if node.get('ip') is None or node.get('port') is None:
+            return {
+                'code': 400,
+                'status': 'ok',
+                'error': 'node ip and port should not be empty'
+            }, 400
+
+        # dispatch crawl task
+        res = requests.get('http://%s:%s/api/spiders/%s/on_crawl' % (
+            node.get('ip'),
+            node.get('port'),
+            id
+        ), {'node_id': node_id})
+        data = json.loads(res.content.decode('utf-8'))
+        return {
+            'code': res.status_code,
+            'status': 'ok',
+            'error': data.get('error'),
+            'task': data.get('task')
+        }
+
+    def on_crawl(self, id):
+        args = self.parser.parse_args()
+        node_id = args.get('node_id')

         job = execute_spider.delay(id, node_id)
-        # print('crawl: %s' % id)
+
         return {
             'code': 200,
             'status': 'ok',
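
For context, the dispatch step this patch adds to SpiderApi can be read as the standalone sketch below. It is an illustration only, not code from the repository: dispatch_crawl and the example node document are hypothetical, while the /api/spiders/<id>/on_crawl URL, the node_id query parameter, and the relayed response fields mirror the lines added above. The master looks up the target node in MongoDB, forwards the crawl request to that node's Flask API, and the worker's on_crawl action then enqueues the execute_spider Celery task locally.

import json

import requests


def dispatch_crawl(node, spider_id, node_id):
    # node is the MongoDB document of the target worker,
    # e.g. {'_id': 'worker01', 'ip': '192.168.0.2', 'port': 5000} (illustrative values)
    url = 'http://%s:%s/api/spiders/%s/on_crawl' % (node.get('ip'), node.get('port'), spider_id)

    # pass node_id as a query parameter so the worker knows which node the task is bound to
    res = requests.get(url, params={'node_id': node_id})

    # relay the worker's JSON response (error/task info) back to the caller
    data = json.loads(res.content.decode('utf-8'))
    return {
        'code': res.status_code,
        'status': 'ok',
        'error': data.get('error'),
        'task': data.get('task')
    }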