From 6f07e1cd69b374a94803001b2a93f463d4c2595a Mon Sep 17 00:00:00 2001
From: Marvin Zhang
Date: Fri, 21 Jun 2019 12:33:35 +0800
Subject: [PATCH 1/8] fixed issue https://github.com/tikazyq/crawlab/issues/68

---
 crawlab/config/config.py | 1 +
 crawlab/db/manager.py    | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/crawlab/config/config.py b/crawlab/config/config.py
index 14380b73..749ecdba 100644
--- a/crawlab/config/config.py
+++ b/crawlab/config/config.py
@@ -20,6 +20,7 @@ MONGO_PORT = 27017
 MONGO_USERNAME = None
 MONGO_PASSWORD = None
 MONGO_DB = 'crawlab_test'
+MONGO_AUTH_DB = 'crawlab_test'
 
 # Celery broker URL
 BROKER_URL = 'redis://127.0.0.1:6379/0'

diff --git a/crawlab/db/manager.py b/crawlab/db/manager.py
index 17d6b1ae..c7eb16e1 100644
--- a/crawlab/db/manager.py
+++ b/crawlab/db/manager.py
@@ -1,7 +1,7 @@
 from bson import ObjectId
 from mongoengine import connect
 from pymongo import MongoClient, DESCENDING
-from config import MONGO_HOST, MONGO_PORT, MONGO_DB, MONGO_USERNAME, MONGO_PASSWORD
+from config import MONGO_HOST, MONGO_PORT, MONGO_DB, MONGO_USERNAME, MONGO_PASSWORD, MONGO_AUTH_DB
 from utils import is_object_id
 
 connect(db=MONGO_DB, host=MONGO_HOST, port=MONGO_PORT)
@@ -17,6 +17,7 @@ class DbManager(object):
             port=MONGO_PORT,
             username=MONGO_USERNAME,
             password=MONGO_PASSWORD,
+            authSource=MONGO_AUTH_DB or MONGO_DB,
             connect=False)
         self.db = self.mongo[MONGO_DB]

From b57efe4d63b4a58b4cc35e163bc5699a02eee446 Mon Sep 17 00:00:00 2001
From: Marvin Zhang
Date: Fri, 21 Jun 2019 12:34:48 +0800
Subject: [PATCH 2/8] code cleanup

---
 crawlab/db/manager.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/crawlab/db/manager.py b/crawlab/db/manager.py
index c7eb16e1..ac157dfb 100644
--- a/crawlab/db/manager.py
+++ b/crawlab/db/manager.py
@@ -1,11 +1,8 @@
 from bson import ObjectId
-from mongoengine import connect
 from pymongo import MongoClient, DESCENDING
 from config import MONGO_HOST, MONGO_PORT, MONGO_DB, MONGO_USERNAME, MONGO_PASSWORD, MONGO_AUTH_DB
 from utils import is_object_id
 
-connect(db=MONGO_DB, host=MONGO_HOST, port=MONGO_PORT)
-
 
 class DbManager(object):
     __doc__ = """

From d59107a3038cffc80412c71b550131a30a7fd5e4 Mon Sep 17 00:00:00 2001
From: Marvin Zhang
Date: Fri, 21 Jun 2019 12:37:21 +0800
Subject: [PATCH 3/8] requirements.txt cleanup

---
 crawlab/requirements.txt | 84 ++++------------------------------------
 1 file changed, 8 insertions(+), 76 deletions(-)

diff --git a/crawlab/requirements.txt b/crawlab/requirements.txt
index dc42e26e..d5ac763c 100644
--- a/crawlab/requirements.txt
+++ b/crawlab/requirements.txt
@@ -1,80 +1,12 @@
-aiohttp==3.5.4
-amqp==2.4.2
-aniso8601==6.0.0
-APScheduler==3.6.0
-asn1crypto==0.24.0
-async-timeout==3.0.1
-attrs==19.1.0
-Automat==0.7.0
-Babel==2.6.0
-beautifulsoup4==4.7.1
-billiard==3.6.0.0
-bs4==0.0.1
-bson==0.5.8
-cachetools==3.1.0
-celery==4.3.0
-certifi==2019.3.9
-cffi==1.12.3
-chardet==3.0.4
-Click==7.0
-coloredlogs==10.0
-constantly==15.1.0
-cryptography==2.6.1
-cssselect==1.0.3
-csvalidate==1.1.1
-eventlet==0.25.0
-Flask==1.0.2
-Flask-APScheduler==1.11.0
-Flask-Cors==3.0.7
-Flask-CSV==1.2.0
-Flask-RESTful==0.3.7
-flask-restplus==0.12.1
-flower==0.9.3
+Flask_CSV==1.2.0
 gevent==1.4.0
-greenlet==0.4.15
-gunicorn==19.9.0
-html5lib==1.0.1
-humanfriendly==4.18
-hyperlink==19.0.0
-idna==2.8
-idna-ssl==1.1.0
-incremental==17.5.0
-itsdangerous==1.1.0
-Jinja2==2.10
-jsonpickle==1.1
-jsonschema==3.0.1
-kombu==4.5.0
-lxml==4.3.3
-MarkupSafe==1.1.1
-marshmallow==2.19.2
-mongoengine==0.17.0
-multidict==4.5.2
-parsel==1.5.1
-pyasn1==0.4.5
-pyasn1-modules==0.2.5
-pycparser==2.19
-PyDispatcher==2.0.5
-PyHamcrest==1.9.0
-pymongo==3.7.2
-pyOpenSSL==19.0.0
-pyrsistent==0.14.11
-python-dateutil==2.8.0
-pytz==2018.9
-queuelib==1.5.0
-redis==3.2.1
 requests==2.21.0
 Scrapy==1.6.0
-service-identity==18.1.0
-six==1.12.0
-soupsieve==1.9.1
-tornado==5.1.1
-Twisted==19.2.0
-typing-extensions==3.7.2
-tzlocal==1.5.1
-urllib3==1.24.1
-vine==1.3.0
-w3lib==1.20.0
-webencodings==0.5.1
+pymongo==3.7.2
+APScheduler==3.6.0
+coloredlogs==10.0
+Flask_RESTful==0.3.7
+Flask==1.0.2
+lxml==4.3.3
+Flask_Cors==3.0.7
 Werkzeug==0.15.2
-yarl==1.3.0
-zope.interface==4.6.0

From 78c66f4930807e6eee53792c135c44863add02c3 Mon Sep 17 00:00:00 2001
From: Marvin Zhang
Date: Fri, 21 Jun 2019 12:53:14 +0800
Subject: [PATCH 4/8] fixed issue https://github.com/tikazyq/crawlab/issues/70

---
 crawlab/app.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/crawlab/app.py b/crawlab/app.py
index e3741dd1..995ced14 100644
--- a/crawlab/app.py
+++ b/crawlab/app.py
@@ -6,19 +6,20 @@ from flask import Flask
 from flask_cors import CORS
 from flask_restful import Api
 # from flask_restplus import Api
-from routes.sites import SiteApi
+from gevent import monkey, pywsgi
+
+file_dir = os.path.dirname(os.path.realpath(__file__))
+root_path = os.path.abspath(os.path.join(file_dir, '.'))
+sys.path.append(root_path)
+
 from utils.log import other
 from constants.node import NodeStatus
 from db.manager import db_manager
 from routes.schedules import ScheduleApi
 from tasks.celery import celery_app
 from tasks.scheduler import scheduler
-
-file_dir = os.path.dirname(os.path.realpath(__file__))
-root_path = os.path.abspath(os.path.join(file_dir, '.'))
-sys.path.append(root_path)
-
 from config import FLASK_HOST, FLASK_PORT, PROJECT_LOGS_FOLDER
+from routes.sites import SiteApi
 from routes.deploys import DeployApi
 from routes.files import FileApi
 from routes.nodes import NodeApi
@@ -26,6 +27,9 @@ from routes.spiders import SpiderApi, SpiderImportApi, SpiderManageApi
 from routes.stats import StatsApi
 from routes.tasks import TaskApi
 
+# apply monkey patch
+monkey.patch_all()
+
 # flask app instance
 app = Flask(__name__)
 app.config.from_object('config')
@@ -103,4 +107,6 @@ if not os.path.exists(PROJECT_LOGS_FOLDER):
 
 if __name__ == '__main__':
     # run app instance
-    app.run(host=FLASK_HOST, port=FLASK_PORT, threaded=False, processes=4)
+    # app.run(host=FLASK_HOST, port=FLASK_PORT)
+    server = pywsgi.WSGIServer((FLASK_HOST, FLASK_PORT), app)
+    server.serve_forever()

From d1476822602d51e64061fd2bb6c0612c3d323abe Mon Sep 17 00:00:00 2001
From: Marvin Zhang
Date: Fri, 21 Jun 2019 12:54:13 +0800
Subject: [PATCH 5/8] code cleanup

---
 crawlab/app.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/crawlab/app.py b/crawlab/app.py
index 995ced14..175dd130 100644
--- a/crawlab/app.py
+++ b/crawlab/app.py
@@ -107,6 +107,5 @@ if not os.path.exists(PROJECT_LOGS_FOLDER):
 
 if __name__ == '__main__':
     # run app instance
-    # app.run(host=FLASK_HOST, port=FLASK_PORT)
     server = pywsgi.WSGIServer((FLASK_HOST, FLASK_PORT), app)
     server.serve_forever()

From 5d441510b92e896ef9e9a5b8322ff34d3205aa70 Mon Sep 17 00:00:00 2001
From: Marvin Zhang
Date: Fri, 21 Jun 2019 12:57:23 +0800
Subject: [PATCH 6/8] added eventlet to requirements.txt

---
 crawlab/requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/crawlab/requirements.txt b/crawlab/requirements.txt
index d5ac763c..39912ed9 100644
--- a/crawlab/requirements.txt
+++ b/crawlab/requirements.txt
@@ -10,3 +10,4 @@ Flask==1.0.2
 lxml==4.3.3
 Flask_Cors==3.0.7
 Werkzeug==0.15.2
+eventlet

From 5f4e2b16da9e9d23f62b788acc72b35608862330 Mon Sep 17 00:00:00 2001
From: Marvin Zhang
Date: Fri, 21 Jun 2019 13:00:52 +0800
Subject: [PATCH 7/8] updated CHANGELOG.md

---
 CHANGELOG.md | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index dc2c3eb6..7d7994ad 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,12 @@
 # 0.2.4 (unreleased)
+### Features / Enhancement
 - **Documentation**: Better and much more detailed documentation.
 - **Better Crontab**: Make crontab expression through crontab UI.
+- **High Concurrency**: `gevent` + `flask` to support high concurrency. [#70](https://github.com/tikazyq/crawlab/issues/70)
+
+### Bug Fixes
+- **MongoDB Auth**: Allow user to specify `authenticationDatabase` to connect to `mongodb`. [#68](https://github.com/tikazyq/crawlab/issues/68)
+- **Windows Compatibility**: Added `eventlet` to `requirements.txt`. [#59](https://github.com/tikazyq/crawlab/issues/59)
 
 # 0.2.3 (2019-06-12)
 
@@ -10,7 +16,7 @@
 - **Upload Spider**: Allow user to upload Customized Spider to Crawlab.
 - **Edit Fields on Preview**: Allow user to edit fields when previewing data in Configurable Spider.
 
-### Bugs ###
+### Bug Fixes
 - **Spiders Pagination**. Fixed pagination problem in spider page.
 
 # 0.2.2 (2019-05-30)

From 89f3a87a96d90b8390d73f8b3740d3f740b1039f Mon Sep 17 00:00:00 2001
From: Marvin Zhang
Date: Fri, 21 Jun 2019 13:30:24 +0800
Subject: [PATCH 8/8] fixed issue https://github.com/tikazyq/crawlab/issues/69

---
 crawlab/app.py            |  3 ---
 crawlab/routes/spiders.py | 42 ++++++++++++++++++++++++++++++++++++++-
 2 files changed, 41 insertions(+), 4 deletions(-)

diff --git a/crawlab/app.py b/crawlab/app.py
index 175dd130..db6fa95e 100644
--- a/crawlab/app.py
+++ b/crawlab/app.py
@@ -27,9 +27,6 @@ from routes.spiders import SpiderApi, SpiderImportApi, SpiderManageApi
 from routes.stats import StatsApi
 from routes.tasks import TaskApi
 
-# apply monkey patch
-monkey.patch_all()
-
 # flask app instance
 app = Flask(__name__)
 app.config.from_object('config')

diff --git a/crawlab/routes/spiders.py b/crawlab/routes/spiders.py
index 2cc39fea..e3d897cb 100644
--- a/crawlab/routes/spiders.py
+++ b/crawlab/routes/spiders.py
@@ -153,13 +153,14 @@ class SpiderApi(BaseApi):
         if spider is None:
             stats = get_file_suffix_stats(dir_path)
             lang = get_lang_by_stats(stats)
-            spider = db_manager.save('spiders', {
+            spider_id = db_manager.save('spiders', {
                 'name': dir_name,
                 'src': dir_path,
                 'lang': lang,
                 'suffix_stats': stats,
                 'type': SpiderType.CUSTOMIZED
             })
+            spider = db_manager.get('spiders', id=spider_id)
 
         # existing spider
         else:
@@ -214,11 +215,50 @@ class SpiderApi(BaseApi):
             items[i]['last_5_errors'] = get_last_n_run_errors_count(spider_id=spider['_id'], n=5)
             items[i]['last_7d_tasks'] = get_last_n_day_tasks_count(spider_id=spider['_id'], n=5)
 
+        # sort spiders by _id descending
+        items = sorted(items, key=lambda x: x['_id'], reverse=True)
+
         return {
             'status': 'ok',
             'items': jsonify(items)
         }
 
+    def delete(self, id: str = None) -> (dict, tuple):
+        """
+        DELETE method of given id for deleting a spider.
+        :param id:
+        :return:
+        """
+        # get spider from db
+        spider = db_manager.get(col_name=self.col_name, id=id)
+
+        # delete spider folder
+        if spider.get('type') == SpiderType.CUSTOMIZED:
+            try:
+                shutil.rmtree(os.path.abspath(os.path.join(PROJECT_SOURCE_FILE_FOLDER, spider['src'])))
+            except Exception as err:
+                return {
+                    'status': 'ok',
+                    'error': str(err)
+                }, 500
+
+        # perform delete action
+        db_manager.remove_one(col_name=self.col_name, id=id)
+
+        # remove related tasks
+        db_manager.remove(col_name='tasks', cond={'spider_id': spider['_id']})
+
+        # remove related schedules
+        db_manager.remove(col_name='schedules', cond={'spider_id': spider['_id']})
+
+        # execute after_update hook
+        self.after_update(id)
+
+        return {
+            'status': 'ok',
+            'message': 'deleted successfully',
+        }
+
     def crawl(self, id: str) -> (dict, tuple):
         """
        Submit an HTTP request to start a crawl task in the node of given spider_id.
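
Note on PATCH 1/8: `authSource` tells MongoDB which database the credentials are validated against, and `MONGO_AUTH_DB or MONGO_DB` keeps older configs (which have no `MONGO_AUTH_DB` setting) working by falling back to the main database. Below is a minimal standalone sketch of the same connection pattern; the host and credential values are placeholders for illustration, not Crawlab's actual configuration:

    from pymongo import MongoClient

    # Placeholder values standing in for crawlab/config/config.py.
    MONGO_HOST = '127.0.0.1'
    MONGO_PORT = 27017
    MONGO_USERNAME = 'crawlab'
    MONGO_PASSWORD = 'secret'
    MONGO_DB = 'crawlab_test'
    MONGO_AUTH_DB = None  # left unset: authentication falls back to MONGO_DB

    # authSource is the database the user is defined in (often 'admin'),
    # which may differ from the database the data lives in.
    client = MongoClient(host=MONGO_HOST,
                         port=MONGO_PORT,
                         username=MONGO_USERNAME,
                         password=MONGO_PASSWORD,
                         authSource=MONGO_AUTH_DB or MONGO_DB,
                         connect=False)  # defer connecting until first operation
    db = client[MONGO_DB]
    print(db.list_collection_names())

`connect=False` mirrors the patch: the client postpones opening sockets until the first operation, which plays more safely with pre-fork servers and gevent-style patching.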