@@ -1,6 +1,12 @@
# 0.2.4 (unreleased)

### Features / Enhancement

- **Documentation**: Better and much more detailed documentation.
- **Better Crontab**: Build crontab expressions through the crontab UI (see the example right after this list).
- **High Concurrency**: `gevent` + `flask` to support high concurrency (see the sketch after the `app.py` hunk below). [#70](https://github.com/tikazyq/crawlab/issues/70)

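A crontab expression is five space-separated fields: minute, hour, day of month, month, day of week. Below is a minimal sketch of parsing such an expression with APScheduler's `CronTrigger` (APScheduler is pinned in `requirements.txt`); whether Crawlab's scheduler consumes the UI-built expression exactly this way is an assumption.

```python
# Minimal sketch: parse a crontab expression with APScheduler (assumed usage,
# not necessarily how Crawlab's scheduler handles the UI-generated value).
from apscheduler.triggers.cron import CronTrigger

expr = '0 3 * * *'  # minute hour day-of-month month day-of-week: daily at 03:00
trigger = CronTrigger.from_crontab(expr)
print(trigger)      # prints the parsed cron fields
```
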
### Bug Fixes

- **MongoDB Auth**: Allow users to specify `authenticationDatabase` to connect to `mongodb` (a minimal connection sketch follows this section). [#68](https://github.com/tikazyq/crawlab/issues/68)
- **Windows Compatibility**: Added `eventlet` to `requirements.txt`. [#59](https://github.com/tikazyq/crawlab/issues/59)

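The MongoDB auth fix maps `authenticationDatabase` to pymongo's `authSource` argument, as the `db/manager.py` hunk further down shows. A minimal connection sketch with placeholder host, credentials, and database names (not Crawlab defaults):

```python
# Minimal sketch: connect with an explicit authentication database (pymongo).
# All values below are placeholders.
from pymongo import MongoClient

client = MongoClient(
    host='127.0.0.1',
    port=27017,
    username='crawlab',
    password='secret',
    authSource='admin',  # the "authenticationDatabase" from issue #68
    connect=False,       # defer the actual connection until first use
)
db = client['crawlab_test']
```
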
# 0.2.3 (2019-06-12)

@@ -10,7 +16,7 @@

- **Upload Spider**: Allow users to upload a Customized Spider to Crawlab.
- **Edit Fields on Preview**: Allow users to edit fields when previewing data in a Configurable Spider.

### Bugs ###
### Bug Fixes

- **Spiders Pagination**: Fixed pagination problem in the spider page.

# 0.2.2 (2019-05-30)

@@ -6,19 +6,20 @@ from flask import Flask
from flask_cors import CORS
from flask_restful import Api
# from flask_restplus import Api
from routes.sites import SiteApi
from gevent import monkey, pywsgi

file_dir = os.path.dirname(os.path.realpath(__file__))
root_path = os.path.abspath(os.path.join(file_dir, '.'))
sys.path.append(root_path)

from utils.log import other
from constants.node import NodeStatus
from db.manager import db_manager
from routes.schedules import ScheduleApi
from tasks.celery import celery_app
from tasks.scheduler import scheduler

file_dir = os.path.dirname(os.path.realpath(__file__))
root_path = os.path.abspath(os.path.join(file_dir, '.'))
sys.path.append(root_path)

from config import FLASK_HOST, FLASK_PORT, PROJECT_LOGS_FOLDER
from routes.sites import SiteApi
from routes.deploys import DeployApi
from routes.files import FileApi
from routes.nodes import NodeApi

@@ -103,4 +104,5 @@ if not os.path.exists(PROJECT_LOGS_FOLDER):

if __name__ == '__main__':
    # run app instance
    app.run(host=FLASK_HOST, port=FLASK_PORT, threaded=False, processes=4)
    server = pywsgi.WSGIServer((FLASK_HOST, FLASK_PORT), app)
    server.serve_forever()

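The `app.py` hunk above swaps Flask's built-in multiprocess server for gevent's `pywsgi` server, which is what the "High Concurrency" entry (#70) in 0.2.4 refers to. A minimal self-contained sketch of the same pattern; the `monkey.patch_all()` call is a typical companion step and is assumed here, since it is not visible in this hunk:

```python
# Minimal sketch: serve a Flask app with gevent's pywsgi server.
# monkey.patch_all() makes blocking stdlib I/O cooperative and should run
# before importing modules that open sockets. Host/port are placeholders.
from gevent import monkey
monkey.patch_all()

from flask import Flask
from gevent import pywsgi

app = Flask(__name__)


@app.route('/ping')
def ping():
    return 'pong'


if __name__ == '__main__':
    server = pywsgi.WSGIServer(('0.0.0.0', 8000), app)
    server.serve_forever()
```
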
@@ -20,6 +20,7 @@ MONGO_PORT = 27017
MONGO_USERNAME = None
MONGO_PASSWORD = None
MONGO_DB = 'crawlab_test'
MONGO_AUTH_DB = 'crawlab_test'

# Celery broker URL
BROKER_URL = 'redis://127.0.0.1:6379/0'

@@ -1,11 +1,8 @@
from bson import ObjectId
from mongoengine import connect
from pymongo import MongoClient, DESCENDING
from config import MONGO_HOST, MONGO_PORT, MONGO_DB, MONGO_USERNAME, MONGO_PASSWORD
from config import MONGO_HOST, MONGO_PORT, MONGO_DB, MONGO_USERNAME, MONGO_PASSWORD, MONGO_AUTH_DB
from utils import is_object_id

connect(db=MONGO_DB, host=MONGO_HOST, port=MONGO_PORT)


class DbManager(object):
    __doc__ = """
@@ -17,6 +14,7 @@ class DbManager(object):
            port=MONGO_PORT,
            username=MONGO_USERNAME,
            password=MONGO_PASSWORD,
            authSource=MONGO_AUTH_DB or MONGO_DB,
            connect=False)
        self.db = self.mongo[MONGO_DB]

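For reference, the `authSource` keyword above corresponds to the `authSource` query parameter of a MongoDB connection URI; a hypothetical URI-based version of the same connection, with placeholder values:

```python
# Same connection expressed as a URI (placeholder credentials and host).
from pymongo import MongoClient

client = MongoClient(
    'mongodb://crawlab:secret@127.0.0.1:27017/crawlab_test?authSource=admin',
    connect=False,
)
```
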
@@ -1,80 +1,13 @@
aiohttp==3.5.4
amqp==2.4.2
aniso8601==6.0.0
APScheduler==3.6.0
asn1crypto==0.24.0
async-timeout==3.0.1
attrs==19.1.0
Automat==0.7.0
Babel==2.6.0
beautifulsoup4==4.7.1
billiard==3.6.0.0
bs4==0.0.1
bson==0.5.8
cachetools==3.1.0
celery==4.3.0
certifi==2019.3.9
cffi==1.12.3
chardet==3.0.4
Click==7.0
coloredlogs==10.0
constantly==15.1.0
cryptography==2.6.1
cssselect==1.0.3
csvalidate==1.1.1
eventlet==0.25.0
Flask==1.0.2
Flask-APScheduler==1.11.0
Flask-Cors==3.0.7
Flask-CSV==1.2.0
Flask-RESTful==0.3.7
flask-restplus==0.12.1
flower==0.9.3
Flask_CSV==1.2.0
gevent==1.4.0
greenlet==0.4.15
gunicorn==19.9.0
html5lib==1.0.1
humanfriendly==4.18
hyperlink==19.0.0
idna==2.8
idna-ssl==1.1.0
incremental==17.5.0
itsdangerous==1.1.0
Jinja2==2.10
jsonpickle==1.1
jsonschema==3.0.1
kombu==4.5.0
lxml==4.3.3
MarkupSafe==1.1.1
marshmallow==2.19.2
mongoengine==0.17.0
multidict==4.5.2
parsel==1.5.1
pyasn1==0.4.5
pyasn1-modules==0.2.5
pycparser==2.19
PyDispatcher==2.0.5
PyHamcrest==1.9.0
pymongo==3.7.2
pyOpenSSL==19.0.0
pyrsistent==0.14.11
python-dateutil==2.8.0
pytz==2018.9
queuelib==1.5.0
redis==3.2.1
requests==2.21.0
Scrapy==1.6.0
service-identity==18.1.0
six==1.12.0
soupsieve==1.9.1
tornado==5.1.1
Twisted==19.2.0
typing-extensions==3.7.2
tzlocal==1.5.1
urllib3==1.24.1
vine==1.3.0
w3lib==1.20.0
webencodings==0.5.1
pymongo==3.7.2
APScheduler==3.6.0
coloredlogs==10.0
Flask_RESTful==0.3.7
Flask==1.0.2
lxml==4.3.3
Flask_Cors==3.0.7
Werkzeug==0.15.2
yarl==1.3.0
zope.interface==4.6.0
eventlet

@@ -153,13 +153,14 @@ class SpiderApi(BaseApi):
        if spider is None:
            stats = get_file_suffix_stats(dir_path)
            lang = get_lang_by_stats(stats)
            spider = db_manager.save('spiders', {
            spider_id = db_manager.save('spiders', {
                'name': dir_name,
                'src': dir_path,
                'lang': lang,
                'suffix_stats': stats,
                'type': SpiderType.CUSTOMIZED
            })
            spider = db_manager.get('spiders', id=spider_id)

        # existing spider
        else:
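The hunk above accounts for `db_manager.save()` returning the new document's id rather than the document itself, so the spider is fetched back before later code reads its fields. A toy sketch of that pattern with stand-in `save`/`get` functions (not Crawlab's actual `db_manager` API):

```python
# Toy illustration of save-then-fetch: save() hands back only an id,
# so the full document has to be read again before use.
_store = {}


def save(doc):
    doc_id = len(_store) + 1          # pretend the database assigns the id
    _store[doc_id] = {**doc, '_id': doc_id}
    return doc_id


def get(doc_id):
    return _store[doc_id]


spider_id = save({'name': 'example_spider'})
spider = get(spider_id)               # fetch the stored document by id
print(spider['_id'], spider['name'])
```
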
@@ -214,11 +215,50 @@ class SpiderApi(BaseApi):
            items[i]['last_5_errors'] = get_last_n_run_errors_count(spider_id=spider['_id'], n=5)
            items[i]['last_7d_tasks'] = get_last_n_day_tasks_count(spider_id=spider['_id'], n=7)

        # sort spiders by _id descending
        items = reversed(sorted(items, key=lambda x: x['_id']))

        return {
            'status': 'ok',
            'items': jsonify(items)
        }

    def delete(self, id: str = None) -> (dict, tuple):
        """
        DELETE method of given id for deleting a spider.
        :param id:
        :return:
        """
        # get spider from db
        spider = db_manager.get(col_name=self.col_name, id=id)

        # delete spider folder
        if spider.get('type') == SpiderType.CUSTOMIZED:
            try:
                shutil.rmtree(os.path.abspath(os.path.join(PROJECT_SOURCE_FILE_FOLDER, spider['src'])))
            except Exception as err:
                return {
                    'status': 'error',
                    'error': str(err)
                }, 500

        # perform delete action
        db_manager.remove_one(col_name=self.col_name, id=id)

        # remove related tasks
        db_manager.remove(col_name='tasks', cond={'spider_id': spider['_id']})

        # remove related schedules
        db_manager.remove(col_name='schedules', cond={'spider_id': spider['_id']})

        # execute after_update hook
        self.after_update(id)

        return {
            'status': 'ok',
            'message': 'deleted successfully',
        }

    def crawl(self, id: str) -> (dict, tuple):
        """
        Submit an HTTP request to start a crawl task on the node of the given spider_id.