Merge pull request #18 from tikazyq/develop

Develop
Authored by Marvin Zhang; committed via GitHub on 2019-04-15 13:21:32 +08:00
86 changed files with 19530 additions and 786 deletions

27
LICENSE

@@ -1,27 +0,0 @@
Copyright (c) 2019, Marvin Zhang
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. All advertising materials mentioning features or use of this software
must display the following acknowledgement:
This product includes software developed by the Marvin Zhang.
4. Neither the name of the Marvin Zhang nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY MARVIN ZHANG ''AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL MARVIN ZHANG BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


@@ -62,6 +62,10 @@ npm run serve
![spider-list](./docs/img/screenshot-task-detail-results.png)
## Usage Flow
![user-process](./docs/img/用户使用流程图.png)
## Architecture
Crawlab's architecture is very similar to Celery's, but it adds extra modules, including the frontend, the spiders, and Flower, to support spider management.
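To make the Celery-style dispatch concrete, here is a minimal, hedged sketch (not Crawlab's actual module): a Celery app pointed at the Redis broker from the config introduced in this commit, with a task that the web app can enqueue and any worker node can execute.
```python
# Illustrative sketch of the Celery pattern Crawlab's architecture builds on; not the project's real code.
from celery import Celery

# Broker URL mirrors config.BROKER_URL in this commit.
app = Celery('crawlab_sketch', broker='redis://127.0.0.1:6379/0')

@app.task
def execute_spider(spider_id: str) -> str:
    # In Crawlab, the worker looks up the spider in MongoDB and runs its command here.
    return spider_id

# The web node enqueues the task; any worker node picks it up:
# execute_spider.delay('<spider_id>')
```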
@@ -137,7 +141,7 @@ class JuejinPipeline(object):
## Comparison with Other Frameworks
There are already a number of spider management frameworks out there, so why use Crawlab?
Because most existing platforms depend on Scrapyd, which limits the choice of programming language and framework: spider engineers can only use Scrapy and Python. Scrapy is of course an excellent crawling framework, but it cannot do everything.


@@ -8,7 +8,8 @@ from celery import Celery
from flask import Flask
from flask_cors import CORS
from flask_restful import Api
# from flask_restplus import Api
from utils.log import other
from constants.node import NodeStatus
from db.manager import db_manager
from routes.schedules import ScheduleApi
@@ -42,23 +43,22 @@ api.add_resource(NodeApi,
'/api/nodes',
'/api/nodes/<string:id>',
'/api/nodes/<string:id>/<string:action>')
api.add_resource(SpiderImportApi,
'/api/spiders/import/<string:platform>')
api.add_resource(SpiderManageApi,
'/api/spiders/manage/<string:action>')
api.add_resource(SpiderApi,
'/api/spiders',
'/api/spiders/<string:id>',
'/api/spiders/<string:id>/<string:action>')
api.add_resource(SpiderImportApi,
'/api/spiders/import/<string:platform>')
api.add_resource(SpiderManageApi,
'/api/spiders/manage/<string:action>')
api.add_resource(TaskApi,
'/api/tasks',
'/api/tasks/<string:id>',
'/api/tasks/<string:id>/<string:action>')
api.add_resource(DeployApi,
'/api/deploys',
'/api/deploys/<string:id>',
'/api/deploys/<string:id>/<string:action>')
api.add_resource(TaskApi,
'/api/tasks',
'/api/tasks/<string:id>',
'/api/tasks/<string:id>/<string:action>'
)
api.add_resource(FileApi,
'/api/files',
'/api/files/<string:action>')
@@ -78,7 +78,7 @@ def monitor_nodes_status(celery_app):
})
def update_nodes_status_online(event):
print(event)
other.info(f"{event}")
with celery_app.connection() as connection:
recv = celery_app.events.Receiver(connection, handlers={


@@ -6,11 +6,11 @@ import subprocess
file_dir = os.path.dirname(os.path.realpath(__file__))
root_path = os.path.abspath(os.path.join(file_dir, '..'))
sys.path.append(root_path)
from utils.log import other
from config import BROKER_URL
if __name__ == '__main__':
p = subprocess.Popen(['celery', 'flower', '-b', BROKER_URL], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
for line in iter(p.stdout.readline, b''):
if line.decode('utf-8') != '':
print(line.decode('utf-8'))
other.info(line.decode('utf-8'))


@@ -1,12 +1,17 @@
# project variables
PROJECT_SOURCE_FILE_FOLDER = '/Users/yeqing/projects/crawlab/spiders'
PROJECT_DEPLOY_FILE_FOLDER = '/var/crawlab'
PROJECT_LOGS_FOLDER = '/var/logs/crawlab'
# path to the spider source code
PROJECT_SOURCE_FILE_FOLDER = '../spiders'
# path to the python virtual environment
PYTHON_ENV_PATH = '/Users/chennan/Desktop/2019/env/bin/python'
# spider deployment path
PROJECT_DEPLOY_FILE_FOLDER = '../deployfile'
PROJECT_LOGS_FOLDER = '../deployfile/logs'
PROJECT_TMP_FOLDER = '/tmp'
# celery variables
BROKER_URL = 'redis://192.168.99.100:6379/0'
CELERY_RESULT_BACKEND = 'mongodb://192.168.99.100:27017/'
BROKER_URL = 'redis://127.0.0.1:6379/0'
CELERY_RESULT_BACKEND = 'mongodb://127.0.0.1:27017/'
CELERY_MONGODB_BACKEND_SETTINGS = {
'database': 'crawlab_test',
'taskmeta_collection': 'tasks_celery',
@@ -18,7 +23,7 @@ CELERY_ENABLE_UTC = True
FLOWER_API_ENDPOINT = 'http://localhost:5555/api'
# database variables
MONGO_HOST = '192.168.99.100'
MONGO_HOST = '127.0.0.1'
MONGO_PORT = 27017
MONGO_DB = 'crawlab_test'


@@ -2,17 +2,26 @@ from bson import ObjectId
from mongoengine import connect
from pymongo import MongoClient, DESCENDING
from config import MONGO_HOST, MONGO_PORT, MONGO_DB
from utils import is_object_id, jsonify
from utils import is_object_id
connect(db=MONGO_DB, host=MONGO_HOST, port=MONGO_PORT)
class DbManager(object):
__doc__ = """
Database Manager class for handling database CRUD actions.
"""
def __init__(self):
self.mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT)
self.db = self.mongo[MONGO_DB]
def save(self, col_name: str, item, **kwargs):
def save(self, col_name: str, item: dict, **kwargs) -> None:
"""
Save the item in the specified collection
:param col_name: collection name
:param item: item object
"""
col = self.db[col_name]
# in case some fields cannot be saved in MongoDB
@@ -21,15 +30,32 @@ class DbManager(object):
col.save(item, **kwargs)
def remove(self, col_name: str, cond: dict, **kwargs):
def remove(self, col_name: str, cond: dict, **kwargs) -> None:
"""
Remove items given specified condition.
:param col_name: collection name
:param cond: condition or filter
"""
col = self.db[col_name]
col.remove(cond, **kwargs)
def update(self, col_name: str, cond: dict, values: dict, **kwargs):
"""
Update items given specified condition.
:param col_name: collection name
:param cond: condition or filter
:param values: values to update
"""
col = self.db[col_name]
col.update(cond, {'$set': values}, **kwargs)
def update_one(self, col_name: str, id: str, values: dict, **kwargs):
"""
Update an item given specified _id
:param col_name: collection name
:param id: _id
:param values: values to update
"""
col = self.db[col_name]
_id = id
if is_object_id(id):
@@ -38,6 +64,11 @@ class DbManager(object):
col.find_one_and_update({'_id': _id}, {'$set': values})
def remove_one(self, col_name: str, id: str, **kwargs):
"""
Remove an item given specified _id
:param col_name: collection name
:param id: _id
"""
col = self.db[col_name]
_id = id
if is_object_id(id):
@@ -45,7 +76,16 @@ class DbManager(object):
col.remove({'_id': _id})
def list(self, col_name: str, cond: dict, sort_key=None, sort_direction=DESCENDING, skip: int = 0, limit: int = 100,
**kwargs):
**kwargs) -> list:
"""
Return a list of items given specified condition, sort_key, sort_direction, skip, and limit.
:param col_name: collection name
:param cond: condition or filter
:param sort_key: key to sort
:param sort_direction: sort direction
:param skip: skip number
:param limit: limit number
"""
if sort_key is None:
sort_key = '_id'
col = self.db[col_name]
@@ -54,11 +94,21 @@ class DbManager(object):
data.append(item)
return data
def _get(self, col_name: str, cond: dict):
def _get(self, col_name: str, cond: dict) -> dict:
"""
Get an item given specified condition.
:param col_name: collection name
:param cond: condition or filter
"""
col = self.db[col_name]
return col.find_one(cond)
def get(self, col_name: str, id):
def get(self, col_name: str, id: (ObjectId, str)) -> dict:
"""
Get an item given specified _id.
:param col_name: collection name
:param id: _id
"""
if type(id) == ObjectId:
_id = id
elif is_object_id(id):
@@ -67,14 +117,28 @@ class DbManager(object):
_id = id
return self._get(col_name=col_name, cond={'_id': _id})
def get_one_by_key(self, col_name: str, key, value):
def get_one_by_key(self, col_name: str, key, value) -> dict:
"""
Get an item given key/value condition.
:param col_name: collection name
:param key: key
:param value: value
"""
return self._get(col_name=col_name, cond={key: value})
def count(self, col_name: str, cond):
def count(self, col_name: str, cond) -> int:
"""
Get total count of a collection given specified condition
:param col_name: collection name
:param cond: condition or filter
"""
col = self.db[col_name]
return col.count(cond)
def get_latest_version(self, spider_id, node_id):
"""
@deprecated
"""
col = self.db['deploys']
for item in col.find({'spider_id': ObjectId(spider_id), 'node_id': node_id}) \
.sort('version', DESCENDING):
@@ -82,6 +146,9 @@ class DbManager(object):
return None
def get_last_deploy(self, spider_id):
"""
@deprecated
"""
col = self.db['deploys']
for item in col.find({'spider_id': ObjectId(spider_id)}) \
.sort('finish_ts', DESCENDING):
@@ -89,6 +156,12 @@ class DbManager(object):
return None
def aggregate(self, col_name: str, pipelines, **kwargs):
"""
Perform MongoDB col.aggregate action to aggregate stats given collection name and pipelines.
Reference: https://docs.mongodb.com/manual/reference/command/aggregate/
:param col_name: collection name
:param pipelines: pipelines
"""
col = self.db[col_name]
return col.aggregate(pipelines, **kwargs)
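For orientation, a short, hedged usage sketch of the CRUD helpers above; the collection name and field values are illustrative, not taken from the docs.
```python
# Illustrative only: exercising the DbManager helpers defined above.
from db.manager import db_manager

# save a document into a collection
db_manager.save(col_name='spiders', item={'name': 'demo_spider', 'cmd': 'python run.py'})

# list matching documents and update one of them by its _id
for spider in db_manager.list(col_name='spiders', cond={'name': 'demo_spider'}, limit=10):
    db_manager.update_one(col_name='spiders', id=str(spider['_id']), values={'cron_enabled': 0})
```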


@@ -24,7 +24,7 @@ from routes.spiders import SpiderApi, SpiderImportApi, SpiderManageApi
from routes.stats import StatsApi
from routes.tasks import TaskApi
from tasks.celery import celery_app
from utils.log import other
# flask app instance
app = Flask(__name__)
app.config.from_object('config')
@@ -81,7 +81,7 @@ def run_flower():
p = subprocess.Popen(['celery', 'flower', '-b', BROKER_URL], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
for line in iter(p.stdout.readline, b''):
if line.decode('utf-8') != '':
print(line.decode('utf-8'))
other.info(line.decode('utf-8'))
def run_worker():

35
crawlab/requirements.txt Normal file

@@ -0,0 +1,35 @@
amqp==2.4.2
aniso8601==6.0.0
APScheduler==3.6.0
attrs==19.1.0
Babel==2.6.0
billiard==3.6.0.0
celery==4.3.0
certifi==2019.3.9
chardet==3.0.4
Click==7.0
coloredlogs==10.0
Flask==1.0.2
Flask-Cors==3.0.7
Flask-RESTful==0.3.7
flask-restplus==0.12.1
flower==0.9.3
humanfriendly==4.18
idna==2.8
itsdangerous==1.1.0
Jinja2==2.10
jsonschema==3.0.1
kombu==4.5.0
MarkupSafe==1.1.1
mongoengine==0.17.0
pymongo==3.7.2
pyrsistent==0.14.11
pytz==2018.9
redis==3.2.1
requests==2.21.0
six==1.12.0
tornado==5.1.1
tzlocal==1.5.1
urllib3==1.24.1
vine==1.3.0
Werkzeug==0.15.2


@@ -1,4 +1,5 @@
from flask_restful import reqparse, Resource
# from flask_restplus import reqparse, Resource
from db.manager import db_manager
from utils import jsonify
@@ -11,6 +12,9 @@ DEFAULT_ARGS = [
class BaseApi(Resource):
"""
Base class for API. All API classes should inherit this class.
"""
col_name = 'tmp'
parser = reqparse.RequestParser()
arguments = []
@@ -24,7 +28,18 @@ class BaseApi(Resource):
for arg, type in self.arguments:
self.parser.add_argument(arg, type=type)
def get(self, id=None, action=None):
def get(self, id: str = None, action: str = None) -> (dict, tuple):
"""
GET method for retrieving item information.
If id is specified and action is not, return the object of the given id;
If id and action are both specified, execute the given action on the item of the given id;
If neither id nor action is specified, return the list of items given the page_size, page_num and filter
:param id:
:param action:
:return:
"""
args = self.parser.parse_args()
# action by id
@@ -82,7 +97,11 @@ class BaseApi(Resource):
else:
return jsonify(db_manager.get(col_name=self.col_name, id=id))
def put(self):
def put(self) -> (dict, tuple):
"""
PUT method for creating a new item.
:return:
"""
args = self.parser.parse_args()
item = {}
for k in args.keys():
@@ -91,7 +110,12 @@ class BaseApi(Resource):
item = db_manager.save(col_name=self.col_name, item=item)
return item
def update(self, id=None):
def update(self, id: str = None) -> (dict, tuple):
"""
Helper function for update action given the id.
:param id:
:return:
"""
args = self.parser.parse_args()
item = db_manager.get(col_name=self.col_name, id=id)
if item is None:
@@ -103,7 +127,8 @@ class BaseApi(Resource):
values = {}
for k in args.keys():
if k not in DEFAULT_ARGS:
values[k] = args.get(k)
if args.get(k) is not None:
values[k] = args.get(k)
item = db_manager.update_one(col_name=self.col_name, id=id, values=values)
# execute after_update hook
@@ -111,10 +136,18 @@ class BaseApi(Resource):
return item
def post(self, id=None, action=None):
def post(self, id: str = None, action: str = None):
"""
POST method of the given id for performing an action.
:param id:
:param action:
:return:
"""
# perform update action if action is not specified
if action is None:
return self.update(id)
# if action is not defined in the attributes, return 400 error
if not hasattr(self, action):
return {
'status': 'ok',
@@ -122,10 +155,27 @@ class BaseApi(Resource):
'error': 'action "%s" invalid' % action
}, 400
# perform specified action of given id
return getattr(self, action)(id)
def delete(self, id=None):
def delete(self, id: str = None) -> (dict, tuple):
"""
DELETE method of given id for deleting an item.
:param id:
:return:
"""
# perform delete action
db_manager.remove_one(col_name=self.col_name, id=id)
return {
'status': 'ok',
'message': 'deleted successfully',
}
def after_update(self, id=None):
def after_update(self, id: str = None):
"""
This is the after update hook once the update method is performed.
To be overridden.
:param id:
:return:
"""
pass
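To show how this base class is meant to be used, here is a hedged sketch of a resource built on BaseApi, mirroring the real resources in this commit (DeployApi, NodeApi, SpiderApi); the resource name and fields are hypothetical.
```python
# Hypothetical resource for illustration; real subclasses are DeployApi, NodeApi, SpiderApi.
from routes.base import BaseApi


class WidgetApi(BaseApi):
    # MongoDB collection backing this resource
    col_name = 'widgets'
    # accepted request arguments as (name, type) pairs
    arguments = (
        ('name', str),
        ('enabled', int),
    )

# Registered like the other resources:
# api.add_resource(WidgetApi, '/api/widgets', '/api/widgets/<string:id>')
```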


@@ -11,7 +11,12 @@ class DeployApi(BaseApi):
('node_id', str),
)
def get(self, id=None, action=None):
def get(self, id: str = None, action: str = None) -> (dict, tuple):
"""
GET method of DeployAPI.
:param id: deploy_id
:param action: action
"""
# action by id
if action is not None:
if not hasattr(self, action):


@@ -15,6 +15,10 @@ class FileApi(Resource):
self.parser.add_argument('path', type=str)
def get(self, action=None):
"""
GET method of FileAPI.
:param action: action
"""
args = self.parser.parse_args()
path = args.get('path')


@@ -15,7 +15,12 @@ class NodeApi(BaseApi):
('port', str),
)
def get(self, id=None, action=None):
def get(self, id: str = None, action: str = None) -> (dict, tuple):
"""
GET method of NodeAPI.
:param id: item id
:param action: action
"""
# action by id
if action is not None:
if not hasattr(self, action):
@@ -43,10 +48,11 @@ class NodeApi(BaseApi):
'items': jsonify(nodes)
}
def get_spiders(self, id=None):
items = db_manager.list('spiders')
def get_deploys(self, id):
def get_deploys(self, id: str) -> (dict, tuple):
"""
Get a list of latest deploys of given node_id
:param id: node_id
"""
items = db_manager.list('deploys', {'node_id': id}, limit=10, sort_key='finish_ts')
deploys = []
for item in items:
@@ -60,6 +66,10 @@ class NodeApi(BaseApi):
}
def get_tasks(self, id):
"""
Get a list of latest tasks of given node_id
:param id: node_id
"""
items = db_manager.list('tasks', {'node_id': id}, limit=10, sort_key='create_ts')
for item in items:
spider_id = item['spider_id']


@@ -58,9 +58,17 @@ class SpiderApi(BaseApi):
# spider schedule cron enabled
('cron_enabled', int),
# spider environment variables
('envs', str),
)
def get(self, id=None, action=None):
"""
GET method of SpiderAPI.
:param id: spider_id
:param action: action
"""
# action by id
if action is not None:
if not hasattr(self, action):
@@ -115,7 +123,12 @@ class SpiderApi(BaseApi):
'items': jsonify(items)
}
def crawl(self, id):
def crawl(self, id: str) -> (dict, tuple):
"""
Submit an HTTP request to start a crawl task in the node of given spider_id.
@deprecated
:param id: spider_id
"""
args = self.parser.parse_args()
node_id = args.get('node_id')
@@ -152,7 +165,12 @@ class SpiderApi(BaseApi):
'task': data.get('task')
}
def on_crawl(self, id):
def on_crawl(self, id: str) -> (dict, tuple):
"""
Start a crawl task.
:param id: spider_id
:return:
"""
job = execute_spider.delay(id)
# create a new task
@@ -172,7 +190,12 @@ class SpiderApi(BaseApi):
}
}
def deploy(self, id):
def deploy(self, id: str) -> (dict, tuple):
"""
Submit HTTP requests to deploy the given spider to all nodes.
:param id:
:return:
"""
spider = db_manager.get('spiders', id=id)
nodes = db_manager.list('nodes', {'status': NodeStatus.ONLINE})
@@ -198,13 +221,19 @@ class SpiderApi(BaseApi):
node_id,
), files=files)
# TODO: checkpoint for errors
return {
'code': 200,
'status': 'ok',
'message': 'deploy success'
}
def deploy_file(self, id=None):
def deploy_file(self, id: str = None) -> (dict, tuple):
"""
Receive a deploy HTTP request, unzip the uploaded file, and copy it to the destination directories.
:param id: spider_id
"""
args = parser.parse_args()
node_id = request.args.get('node_id')
f = args.file
@@ -261,7 +290,11 @@ class SpiderApi(BaseApi):
'message': 'deploy success'
}
def get_deploys(self, id):
def get_deploys(self, id: str) -> (dict, tuple):
"""
Get a list of latest deploys of given spider_id
:param id: spider_id
"""
items = db_manager.list('deploys', cond={'spider_id': ObjectId(id)}, limit=10, sort_key='finish_ts')
deploys = []
for item in items:
@@ -274,7 +307,11 @@ class SpiderApi(BaseApi):
'items': jsonify(deploys)
}
def get_tasks(self, id):
def get_tasks(self, id: str) -> (dict, tuple):
"""
Get a list of latest tasks of given spider_id
:param id:
"""
items = db_manager.list('tasks', cond={'spider_id': ObjectId(id)}, limit=10, sort_key='create_ts')
for item in items:
spider_id = item['spider_id']
@@ -287,11 +324,23 @@ class SpiderApi(BaseApi):
'items': jsonify(items)
}
def after_update(self, id=None):
def after_update(self, id: str = None) -> None:
"""
After each spider is updated, update the cron scheduler correspondingly.
:param id: spider_id
"""
scheduler.update()
def update_envs(self, id: str):
args = self.parser.parse_args()
envs = json.loads(args.envs)
db_manager.update_one(col_name='spiders', id=id, values={'envs': envs})
class SpiderImportApi(Resource):
__doc__ = """
API for importing spiders from external resources including Github, Gitlab, and subversion (WIP)
"""
parser = reqparse.RequestParser()
arguments = [
('url', str)
@@ -302,7 +351,7 @@ class SpiderImportApi(Resource):
for arg, type in self.arguments:
self.parser.add_argument(arg, type=type)
def post(self, platform=None):
def post(self, platform: str = None) -> (dict, tuple):
if platform is None:
return {
'status': 'ok',
@@ -319,13 +368,22 @@ class SpiderImportApi(Resource):
return getattr(self, platform)()
def github(self):
def github(self) -> None:
"""
Import a spider from a GitHub repository.
"""
self._git()
def gitlab(self):
def gitlab(self) -> None:
"""
Import a spider from a GitLab repository.
"""
self._git()
def _git(self):
"""
Helper method to perform the git import (essentially a "git clone").
"""
args = self.parser.parse_args()
url = args.get('url')
if url is None:
@@ -357,7 +415,11 @@ class SpiderManageApi(Resource):
('url', str)
]
def post(self, action):
def post(self, action: str) -> (dict, tuple):
"""
POST method for SpiderManageAPI.
:param action:
"""
if not hasattr(self, action):
return {
'status': 'ok',
@@ -367,7 +429,10 @@ class SpiderManageApi(Resource):
return getattr(self, action)()
def deploy_all(self):
def deploy_all(self) -> (dict, tuple):
"""
Deploy all spiders to all nodes.
"""
# active nodes
nodes = db_manager.list('nodes', {'status': NodeStatus.ONLINE})


@@ -8,7 +8,11 @@ from utils import jsonify
class StatsApi(Resource):
def get(self, action=None):
def get(self, action: str = None) -> (dict, tuple):
"""
GET method of StatsApi.
:param action: action
"""
# action
if action is not None:
if not hasattr(self, action):
@@ -23,6 +27,9 @@ class StatsApi(Resource):
return {}
def get_home_stats(self):
"""
Get stats for home page
"""
# overview stats
task_count = db_manager.count('tasks', {})
spider_count = db_manager.count('spiders', {})


@@ -1,6 +1,7 @@
import json
import requests
from bson import ObjectId
from celery.worker.control import revoke
from constants.task import TaskStatus
@@ -8,9 +9,11 @@ from db.manager import db_manager
from routes.base import BaseApi
from utils import jsonify
from utils.spider import get_spider_col_fields
from utils.log import other
class TaskApi(BaseApi):
# collection name
col_name = 'tasks'
arguments = (
@@ -18,7 +21,12 @@ class TaskApi(BaseApi):
('file_path', str)
)
def get(self, id=None, action=None):
def get(self, id: str = None, action: str = None):
"""
GET method of TaskAPI.
:param id: item id
:param action: action
"""
# action by id
if action is not None:
if not hasattr(self, action):
@@ -27,11 +35,12 @@ class TaskApi(BaseApi):
'code': 400,
'error': 'action "%s" invalid' % action
}, 400
# other.info(f"got here: {action}, {id}")
return getattr(self, action)(id)
elif id is not None:
task = db_manager.get('tasks', id=id)
spider = db_manager.get('spiders', id=str(task['spider_id']))
task = db_manager.get(col_name=self.col_name, id=id)
spider = db_manager.get(col_name='spiders', id=str(task['spider_id']))
task['spider_name'] = spider['name']
try:
with open(task['log_file_path']) as f:
@@ -44,11 +53,12 @@ class TaskApi(BaseApi):
args = self.parser.parse_args()
page_size = args.get('page_size') or 10
page_num = args.get('page_num') or 1
tasks = db_manager.list('tasks', {}, limit=page_size, skip=page_size * (page_num - 1), sort_key='create_ts')
tasks = db_manager.list(col_name=self.col_name, cond={}, limit=page_size, skip=page_size * (page_num - 1),
sort_key='create_ts')
items = []
for task in tasks:
# _task = db_manager.get('tasks_celery', id=task['_id'])
_spider = db_manager.get('spiders', id=str(task['spider_id']))
_spider = db_manager.get(col_name='spiders', id=str(task['spider_id']))
if task.get('status') is None:
task['status'] = TaskStatus.UNAVAILABLE
task['spider_name'] = _spider['name']
@@ -61,9 +71,13 @@ class TaskApi(BaseApi):
'items': jsonify(items)
}
def on_get_log(self, id):
def on_get_log(self, id: (str, ObjectId)) -> (dict, tuple):
"""
Get the log of given task_id
:param id: task_id
"""
try:
task = db_manager.get('tasks', id=id)
task = db_manager.get(col_name=self.col_name, id=id)
with open(task['log_file_path']) as f:
log = f.read()
return {
@@ -77,9 +91,14 @@ class TaskApi(BaseApi):
'error': str(err)
}, 500
def get_log(self, id):
task = db_manager.get('tasks', id=id)
node = db_manager.get('nodes', id=task['node_id'])
def get_log(self, id: (str, ObjectId)) -> (dict, tuple):
"""
Submit an HTTP request to fetch log from the node of a given task.
:param id: task_id
:return:
"""
task = db_manager.get(col_name=self.col_name, id=id)
node = db_manager.get(col_name='nodes', id=task['node_id'])
r = requests.get('http://%s:%s/api/tasks/%s/on_get_log' % (
node['ip'],
node['port'],
@@ -99,7 +118,11 @@ class TaskApi(BaseApi):
'error': data['error']
}, 500
def get_results(self, id):
def get_results(self, id: str) -> (dict, tuple):
"""
Get a list of results crawled in a given task.
:param id: task_id
"""
args = self.parser.parse_args()
page_size = args.get('page_size') or 10
page_num = args.get('page_num') or 1
@@ -121,6 +144,12 @@ class TaskApi(BaseApi):
}
def stop(self, id):
"""
Stop the task in progress.
TODO: work in progress
:param id:
:return:
"""
revoke(id, terminate=True)
return {
'id': id,

353
crawlab/swagger.yaml Normal file

@@ -0,0 +1,353 @@
---
swagger: '2.0'
basePath: "/api"
paths:
"/deploys":
get:
responses:
'200':
description: Success
summary: GET method of DeployAPI
operationId: get_deploy_api
tags:
- deploy
put:
responses:
'200':
description: Success
summary: PUT method for creating a new item
operationId: put_deploy_api
tags:
- deploy
"/deploys/{id}":
parameters:
- name: id
in: path
required: true
type: string
get:
responses:
'200':
description: Success
summary: GET method of DeployAPI
operationId: get_deploy_api_by_id
tags:
- deploy
post:
responses:
'200':
description: Success
summary: POST method of the given id for performing an action
operationId: post_deploy_api
tags:
- deploy
delete:
responses:
'200':
description: Success
summary: DELETE method of given id for deleting an item
operationId: delete_deploy_api
tags:
- deploy
"/files":
get:
responses:
'200':
description: Success
summary: GET method of FileAPI
operationId: get_file_api
tags:
- file
"/nodes":
get:
responses:
'200':
description: Success
summary: GET method of NodeAPI
operationId: get_node_api
tags:
- node
put:
responses:
'200':
description: Success
summary: PUT method for creating a new item
operationId: put_node_api
tags:
- node
"/nodes/{id}":
parameters:
- name: id
in: path
required: true
type: string
get:
responses:
'200':
description: Success
summary: GET method of NodeAPI
operationId: get_node_api_by_id
tags:
- node
post:
responses:
'200':
description: Success
summary: POST method of the given id for performing an action
operationId: post_node_api
tags:
- node
delete:
responses:
'200':
description: Success
summary: DELETE method of the given id
operationId: delete_node_api
tags:
- node
"/nodes/{id}/get_deploys":
parameters:
- name: id
in: path
required: true
type: string
get:
responses:
'200':
description: Success
summary: Get a list of latest deploys of given node_id
tags:
- node
"/nodes/{id}/get_tasks":
parameters:
- name: id
in: path
required: true
type: string
get:
responses:
'200':
description: Success
summary: Get a list of latest tasks of given node_id
tags:
- node
"/spiders":
get:
responses:
'200':
description: Success
summary: GET method of SpiderAPI
operationId: get_spider_api
tags:
- spider
put:
responses:
'200':
description: Success
summary: PUT method for creating a new item
operationId: put_spider_api
tags:
- spider
"/spiders/import/{platform}":
parameters:
- name: platform
in: path
required: true
type: string
post:
responses:
'200':
description: Success
operationId: post_spider_import_api
tags:
- spider
"/spiders/manage/deploy_all":
post:
responses:
'200':
description: Success
summary: Deploy all spiders to all nodes.
tags:
- spider
"/spiders/{id}":
parameters:
- name: id
in: path
required: true
type: string
get:
responses:
'200':
description: Success
summary: GET method of SpiderAPI
operationId: get_spider_api_by_id
tags:
- spider
post:
responses:
'200':
description: Success
summary: POST method of the given id for performing an action
operationId: post_spider_api
tags:
- spider
delete:
responses:
'200':
description: Success
summary: DELETE method of given id for deleting an item
operationId: delete_spider_api
tags:
- spider
"/spiders/{id}/get_tasks":
parameters:
- name: id
in: path
required: true
type: string
description: spider_id
get:
responses:
'200':
description: Success
summary: Get a list of latest tasks of given spider_id
tags:
- spider
"/spiders/{id}/get_deploys":
parameters:
- name: id
in: path
required: true
type: string
description: spider_id
get:
responses:
'200':
description: Success
summary: Get a list of latest deploys of given spider_id
tags:
- spider
"/spiders/{id}/on_crawl":
parameters:
- name: id
in: path
required: true
type: string
description: spider_id
post:
responses:
'200':
description: Success
summary: Start a crawl task.
tags:
- spider
"/spiders/{id}/deploy":
parameters:
- name: id
in: path
required: true
type: string
description: spider_id
post:
responses:
'200':
description: Success
summary: Deploy the given spider to all nodes.
tags:
- spider
"/stats/get_home_stats":
get:
responses:
'200':
description: Success
summary: Get stats for home page
operationId: get_stats_api
tags:
- stats
"/tasks":
get:
responses:
'200':
description: Success
summary: GET method of TaskAPI
operationId: get_task_api
tags:
- task
put:
responses:
'200':
description: Success
summary: PUT method for creating a new item
operationId: put_task_api
tags:
- task
"/tasks/{id}":
parameters:
- name: id
in: path
required: true
type: string
get:
responses:
'200':
description: Success
summary: GET method of TaskAPI
operationId: get_task_api_by_id
tags:
- task
post:
responses:
'200':
description: Success
summary: POST method of the given id for performing an action
operationId: post_task_api
tags:
- task
delete:
responses:
'200':
description: Success
summary: DELETE method of given id for deleting an item
operationId: delete_task_api
tags:
- task
"/tasks/{id}/get_log":
parameters:
- name: id
in: path
required: true
type: string
get:
responses:
'200':
description: Success
summary: Submit an HTTP request to fetch log from the node of a given task.
operationId: get_task_api_get_log
tags:
- task
"/tasks/{id}/on_get_log":
parameters:
- name: id
in: path
required: true
type: string
get:
responses:
'200':
description: Success
summary: Get the log of given task_id
operationId: get_task_api_on_get_log
tags:
- task
info:
title: Crawlab API
version: '1.0'
produces:
- application/json
consumes:
- application/json
responses:
ParseError:
description: When a mask can't be parsed
MaskError:
description: When any error occurs on mask


@@ -2,23 +2,27 @@ import os
from datetime import datetime
from bson import ObjectId
from celery.utils.log import get_logger
from config import PROJECT_DEPLOY_FILE_FOLDER, PROJECT_LOGS_FOLDER
from config import PROJECT_DEPLOY_FILE_FOLDER, PROJECT_LOGS_FOLDER, PYTHON_ENV_PATH
from constants.task import TaskStatus
from db.manager import db_manager
from .celery import celery_app
import subprocess
logger = get_logger(__name__)
from utils.log import other as logger
@celery_app.task(bind=True)
def execute_spider(self, id: str):
"""
Execute spider task.
:param self:
:param id: task_id
"""
task_id = self.request.id
hostname = self.request.hostname
spider = db_manager.get('spiders', id=id)
command = spider.get('cmd')
if command.startswith("env"):
command = PYTHON_ENV_PATH + command.replace("env", "")
current_working_directory = os.path.join(PROJECT_DEPLOY_FILE_FOLDER, str(spider.get('_id')))
@@ -48,11 +52,22 @@ def execute_spider(self, id: str):
'status': TaskStatus.STARTED
})
# start the process and pass params as env variables
# pass params as env variables
env = os.environ.copy()
# custom environment variables
if spider.get('envs'):
for _env in spider.get('envs'):
env[_env['name']] = _env['value']
# task id environment variable
env['CRAWLAB_TASK_ID'] = task_id
# collection environment variable
if spider.get('col'):
env['CRAWLAB_COLLECTION'] = spider.get('col')
# start process
p = subprocess.Popen(command.split(' '),
stdout=stdout.fileno(),
stderr=stderr.fileno(),


@@ -5,11 +5,20 @@ from datetime import datetime
from bson import json_util
def is_object_id(id):
def is_object_id(id: str) -> bool:
"""
Determine if the id is a valid ObjectId string
:param id: ObjectId string
"""
return re.search('^[a-zA-Z0-9]{24}$', id) is not None
def jsonify(obj):
def jsonify(obj: (dict, list)) -> (dict, list):
"""
Convert dict/list to a valid json object.
:param obj: object to be converted
:return: dict/list
"""
dump_str = json_util.dumps(obj)
converted_obj = json.loads(dump_str)
if type(converted_obj) == dict:


@@ -1,8 +1,13 @@
import os, zipfile
from utils.log import other
# zip a directory into a zip file (stored, without compression)
def zip_file(source_dir, output_filename):
"""
Zip a directory into a zip file (stored, without compression)
:param source_dir: source directory
:param output_filename: output file name
"""
zipf = zipfile.ZipFile(output_filename, 'w')
pre_len = len(os.path.dirname(source_dir))
for parent, dirnames, filenames in os.walk(source_dir):
@@ -14,10 +19,15 @@ def zip_file(source_dir, output_filename):
def unzip_file(zip_src, dst_dir):
"""
Unzip file
:param zip_src: source zip file
:param dst_dir: destination directory
"""
r = zipfile.is_zipfile(zip_src)
if r:
fz = zipfile.ZipFile(zip_src, 'r')
for file in fz.namelist():
fz.extract(file, dst_dir)
else:
print('This is not zip')
other.info('This is not zip')


@@ -14,7 +14,12 @@ SUFFIX_LANG_MAPPING = {
}
def get_file_suffix(file_name: str):
def get_file_suffix(file_name: str) -> (str, None):
"""
Get suffix of a file
:param file_name:
:return:
"""
file_name = file_name.lower()
m = suffix_regex.search(file_name)
if m is not None:
@@ -23,7 +28,11 @@ def get_file_suffix(file_name: str):
return None
def get_file_list(path):
def get_file_list(path: str) -> list:
"""
Get a list of files of given directory path
:param path: directory path
"""
for root, dirs, file_names in os.walk(path):
# print(root) # current directory path
# print(dirs) # all subdirectories under the current path
@@ -35,6 +44,10 @@ def get_file_list(path):
def get_file_suffix_stats(path) -> dict:
"""
Get suffix stats of the files under the given path
:param path: directory path
"""
stats = defaultdict(int)
for file_path in get_file_list(path):
suffix = get_file_suffix(file_path)
@@ -44,6 +57,10 @@ def get_file_suffix_stats(path) -> dict:
def get_file_content(path) -> dict:
"""
Get file content
:param path: file path
"""
with open(path) as f:
suffix = get_file_suffix(path)
lang = SUFFIX_LANG_MAPPING.get(suffix)

75
crawlab/utils/log.py Normal file

@@ -0,0 +1,75 @@
# -*- coding: utf-8 -*-
# @Time : 2019-01-28 15:37
# @Author : cxa
# @File : log.py
# @Software: PyCharm
import os
import logging
import logging.config as log_conf
import datetime
import coloredlogs
log_dir = os.path.dirname(os.path.dirname(__file__)) + '/logs'
if not os.path.exists(log_dir):
os.mkdir(log_dir)
today = datetime.datetime.now().strftime("%Y%m%d")
log_path = os.path.join(log_dir, f'app_{today}.log')
log_config = {
'version': 1.0,
'formatters': {
'colored_console': {'()': 'coloredlogs.ColoredFormatter',
'format': "%(asctime)s - %(name)s - %(levelname)s - %(message)s", 'datefmt': '%H:%M:%S'},
'detail': {
'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
'datefmt': "%Y-%m-%d %H:%M:%S"  # without this, timestamps are shown down to the millisecond
},
'simple': {
'format': '%(name)s - %(levelname)s - %(message)s',
},
},
'handlers': {
'console': {
'class': 'logging.StreamHandler',  # handler class that prints logs to the console
'level': 'INFO',
'formatter': 'colored_console'
},
'file': {
'class': 'logging.handlers.RotatingFileHandler',  # handler class that writes logs to a file
'maxBytes': 1024 * 1024 * 1024,  # maximum size of a single log file
'backupCount': 1,  # number of backup files to keep
'filename': log_path,  # log file name
'level': 'INFO',  # log level
'formatter': 'detail',  # which formatter defined above to use
'encoding': 'utf-8',  # file encoding
},
},
'loggers': {
'crawler': {
'handlers': ['console', 'file'],  # log to both console and file
'level': 'DEBUG',  # minimum level for this logger
},
'parser': {
'handlers': ['file'],
'level': 'INFO',
},
'other': {
'handlers': ['console', 'file'],
'level': 'INFO',
},
'storage': {
'handlers': ['console', 'file'],
'level': 'INFO',
}
}
}
log_conf.dictConfig(log_config)
crawler = logging.getLogger('crawler')
storage = logging.getLogger('storage')
other = logging.getLogger('other')
coloredlogs.install(level='DEBUG', logger=crawler)
coloredlogs.install(level='DEBUG', logger=storage)
coloredlogs.install(level='DEBUG', logger=other)


@@ -8,11 +8,18 @@ from db.manager import db_manager
def check_nodes_status():
"""
Update node status from Flower.
"""
res = requests.get('%s/workers?status=1' % FLOWER_API_ENDPOINT)
return json.loads(res.content.decode('utf-8'))
def update_nodes_status(refresh=False):
"""
Update all nodes status
:param refresh:
"""
online_node_ids = []
url = '%s/workers?status=1' % FLOWER_API_ENDPOINT
if refresh:


@@ -6,25 +6,37 @@ from db.manager import db_manager
def get_lang_by_stats(stats: dict) -> LangType:
"""
Get the programming language from the provided suffix stats
:param stats: stats generated by utils.file.get_file_suffix_stats
:return:
"""
data = stats.items()
data = sorted(data, key=lambda item: item[1])
data = list(filter(lambda item: item[0] not in SUFFIX_IGNORE, data))
top_suffix = data[-1][0]
if FILE_SUFFIX_LANG_MAPPING.get(top_suffix) is not None:
return FILE_SUFFIX_LANG_MAPPING.get(top_suffix)
return LangType.OTHER
try:
data = stats.items()
data = sorted(data, key=lambda item: item[1])
data = list(filter(lambda item: item[0] not in SUFFIX_IGNORE, data))
top_suffix = data[-1][0]
if FILE_SUFFIX_LANG_MAPPING.get(top_suffix) is not None:
return FILE_SUFFIX_LANG_MAPPING.get(top_suffix)
return LangType.OTHER
except IndexError as e:
pass
def get_spider_type(path: str) -> SpiderType:
"""
Get spider type
:param path: spider directory path
"""
for file_name in os.listdir(path):
if file_name == 'scrapy.cfg':
return SpiderType.SCRAPY
def get_spider_col_fields(col_name):
def get_spider_col_fields(col_name: str) -> list:
"""
Get spider collection fields
:param col_name: collection name
"""
items = db_manager.list(col_name, {}, limit=100, sort_key='_id')
fields = set()
for item in items:


@@ -0,0 +1,82 @@
const puppeteer = require('puppeteer');
const MongoClient = require('mongodb').MongoClient;
(async () => {
// browser
const browser = await (puppeteer.launch({
headless: true
}));
// define start url
const url = 'https://juejin.im';
// start a new page
const page = await browser.newPage();
// navigate to url
try {
await page.goto(url, {waitUntil: 'domcontentloaded'});
await page.waitFor(2000);
} catch (e) {
console.error(e);
// close browser
browser.close();
// exit code 1 indicating an error happened
code = 1;
process.emit("exit ");
process.reallyExit(code);
return
}
// scroll down to fetch more data
for (let i = 0; i < 100; i++) {
console.log('Pressing PageDown...');
await page.keyboard.press('PageDown', 200);
await page.waitFor(100);
}
// scrape data
const results = await page.evaluate(() => {
let results = [];
document.querySelectorAll('.entry-list > .item').forEach(el => {
if (!el.querySelector('.title')) return;
results.push({
url: 'https://juejin.com' + el.querySelector('.title').getAttribute('href'),
title: el.querySelector('.title').innerText
});
});
return results;
});
// open database connection
const client = await MongoClient.connect('mongodb://192.168.99.100:27017');
let db = await client.db('crawlab_test');
const colName = process.env.CRAWLAB_COLLECTION || 'results_juejin';
const taskId = process.env.CRAWLAB_TASK_ID;
const col = db.collection(colName);
// save to database
for (let i = 0; i < results.length; i++) {
// de-duplication
const r = await col.findOne({url: results[i].url});
if (r) continue;
// assign taskID
results[i].task_id = taskId;
results[i].source = 'juejin';
// insert row
await col.insertOne(results[i]);
}
console.log(`results.length: ${results.length}`);
// close database connection
client.close();
// shutdown browser
browser.close();
})();


@@ -0,0 +1,82 @@
const puppeteer = require('puppeteer');
const MongoClient = require('mongodb').MongoClient;
(async () => {
// browser
const browser = await (puppeteer.launch({
headless: true
}));
// define start url
const url = 'https://juejin.im';
// start a new page
const page = await browser.newPage();
// navigate to url
try {
await page.goto(url, {waitUntil: 'domcontentloaded'});
await page.waitFor(2000);
} catch (e) {
console.error(e);
// close browser
browser.close();
// exit code 1 indicating an error happened
code = 1;
process.emit("exit ");
process.reallyExit(code);
return
}
// scroll down to fetch more data
for (let i = 0; i < 100; i++) {
console.log('Pressing PageDown...');
await page.keyboard.press('PageDown', 200);
await page.waitFor(100);
}
// scrape data
const results = await page.evaluate(() => {
let results = [];
document.querySelectorAll('.entry-list > .item').forEach(el => {
if (!el.querySelector('.title')) return;
results.push({
url: 'https://juejin.com' + el.querySelector('.title').getAttribute('href'),
title: el.querySelector('.title').innerText
});
});
return results;
});
// open database connection
const client = await MongoClient.connect('mongodb://127.0.0.1:27017');
let db = await client.db('crawlab_test');
const colName = process.env.CRAWLAB_COLLECTION || 'results_juejin';
const taskId = process.env.CRAWLAB_TASK_ID;
const col = db.collection(colName);
// save to database
for (let i = 0; i < results.length; i++) {
// de-duplication
const r = await col.findOne({url: results[i].url});
if (r) continue;
// assign taskID
results[i].task_id = taskId;
results[i].source = 'juejin';
// insert row
await col.insertOne(results[i]);
}
console.log(`results.length: ${results.length}`);
// close database connection
client.close();
// shutdown browser
browser.close();
})();


@@ -0,0 +1,17 @@
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class JuejinItem(scrapy.Item):
# define the fields for your item here like:
_id = scrapy.Field()
title = scrapy.Field()
link = scrapy.Field()
like = scrapy.Field()
task_id = scrapy.Field()


@@ -0,0 +1,56 @@
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class JuejinSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict
# or Item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn't have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)


@@ -0,0 +1,25 @@
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import os
from pymongo import MongoClient
MONGO_HOST = '127.0.0.1'
MONGO_PORT = 27017
MONGO_DB = 'crawlab_test'
class JuejinPipeline(object):
mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT)
db = mongo[MONGO_DB]
col_name = os.environ.get('CRAWLAB_COLLECTION','test')
col = db[col_name]
def process_item(self, item, spider):
item['task_id'] = os.environ.get('CRAWLAB_TASK_ID')
self.col.save(item)
return item


@@ -0,0 +1,89 @@
# -*- coding: utf-8 -*-
# Scrapy settings for juejin project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'juejin'
SPIDER_MODULES = ['juejin.spiders']
NEWSPIDER_MODULE = 'juejin.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
# COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False
# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
# }
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
# 'juejin.middlewares.JuejinSpiderMiddleware': 543,
# }
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
# 'juejin.middlewares.MyCustomDownloaderMiddleware': 543,
# }
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
# EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
# }
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'juejin.pipelines.JuejinPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'


@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.


@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.


@@ -0,0 +1,17 @@
# -*- coding: utf-8 -*-
import scrapy
from juejin.items import JuejinItem
class JuejinSpiderSpider(scrapy.Spider):
name = 'juejin_spider'
allowed_domains = ['juejin.com']
start_urls = ['https://juejin.im/search?query=celery']
def parse(self, response):
for item in response.css('ul.main-list > li.item'):
yield JuejinItem(
title=item.css('.title span').extract_first(),
link=item.css('a::attr("href")').extract_first(),
like=item.css('.like .count::text').extract_first(),
)


@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = juejin.settings
[deploy]
#url = http://localhost:6800/
project = juejin


@@ -0,0 +1,2 @@
from scrapy import cmdline
cmdline.execute(["scrapy","crawl","juejin_spider"])


@@ -0,0 +1,82 @@
const puppeteer = require('puppeteer');
const MongoClient = require('mongodb').MongoClient;
(async () => {
// browser
const browser = await (puppeteer.launch({
headless: true
}));
// define start url
const url = 'https://juejin.im';
// start a new page
const page = await browser.newPage();
// navigate to url
try {
await page.goto(url, {waitUntil: 'domcontentloaded'});
await page.waitFor(2000);
} catch (e) {
console.error(e);
// close browser
browser.close();
// exit code 1 indicating an error happened
code = 1;
process.emit("exit ");
process.reallyExit(code);
return
}
// scroll down to fetch more data
for (let i = 0; i < 100; i++) {
console.log('Pressing PageDown...');
await page.keyboard.press('PageDown', 200);
await page.waitFor(100);
}
// scrape data
const results = await page.evaluate(() => {
let results = [];
document.querySelectorAll('.entry-list > .item').forEach(el => {
if (!el.querySelector('.title')) return;
results.push({
url: 'https://juejin.com' + el.querySelector('.title').getAttribute('href'),
title: el.querySelector('.title').innerText
});
});
return results;
});
// open database connection
const client = await MongoClient.connect('mongodb://127.0.0.1:27017');
let db = await client.db('crawlab_test');
const colName = process.env.CRAWLAB_COLLECTION || 'results_juejin';
const taskId = process.env.CRAWLAB_TASK_ID;
const col = db.collection(colName);
// save to database
for (let i = 0; i < results.length; i++) {
// de-duplication
const r = await col.findOne({url: results[i].url});
if (r) continue;
// assign taskID
results[i].task_id = taskId;
results[i].source = 'juejin';
// insert row
await col.insertOne(results[i]);
}
console.log(`results.length: ${results.length}`);
// close database connection
client.close();
// shutdown browser
browser.close();
})();


@@ -0,0 +1,82 @@
const puppeteer = require('puppeteer');
const MongoClient = require('mongodb').MongoClient;
(async () => {
// browser
const browser = await (puppeteer.launch({
headless: true
}));
// define start url
const url = 'https://juejin.im';
// start a new page
const page = await browser.newPage();
// navigate to url
try {
await page.goto(url, {waitUntil: 'domcontentloaded'});
await page.waitFor(2000);
} catch (e) {
console.error(e);
// close browser
browser.close();
// exit code 1 indicating an error happened
code = 1;
process.emit("exit ");
process.reallyExit(code);
return
}
// scroll down to fetch more data
for (let i = 0; i < 100; i++) {
console.log('Pressing PageDown...');
await page.keyboard.press('PageDown', 200);
await page.waitFor(100);
}
// scrape data
const results = await page.evaluate(() => {
let results = [];
document.querySelectorAll('.entry-list > .item').forEach(el => {
if (!el.querySelector('.title')) return;
results.push({
url: 'https://juejin.com' + el.querySelector('.title').getAttribute('href'),
title: el.querySelector('.title').innerText
});
});
return results;
});
// open database connection
const client = await MongoClient.connect('mongodb://127.0.0.1:27017');
let db = await client.db('crawlab_test');
const colName = process.env.CRAWLAB_COLLECTION || 'results_juejin';
const taskId = process.env.CRAWLAB_TASK_ID;
const col = db.collection(colName);
// save to database
for (let i = 0; i < results.length; i++) {
// de-duplication
const r = await col.findOne({url: results[i].url});
if (r) continue;
// assign taskID
results[i].task_id = taskId;
results[i].source = 'juejin';
// insert row
await col.insertOne(results[i]);
}
console.log(`results.length: ${results.length}`);
// close database connection
client.close();
// shutdown browser
browser.close();
})();

BIN
docs/.DS_Store vendored

Binary file not shown.


@@ -0,0 +1,2 @@
# App

6
docs/Concept/Deploy 2.md Normal file

@@ -0,0 +1,6 @@
# Deploy
Before a spider can run, it must be deployed to the corresponding nodes.
During deployment the spider is packaged into its own directory, which keeps environments isolated: spiders in the development environment and spiders in the production environment are separated by being packaged and deployed independently.
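As a concrete, hedged illustration, a deploy can be triggered through the HTTP API documented in swagger.yaml (POST /api/spiders/{id}/deploy); the host, port, and spider id below are placeholders, not values from the docs.
```python
# Illustrative only: trigger a deploy via the API described in swagger.yaml.
import requests

CRAWLAB_API = 'http://localhost:8000/api'  # assumed address of the Crawlab Flask app
spider_id = '<spider_id>'                  # hypothetical spider _id

resp = requests.post(f'{CRAWLAB_API}/spiders/{spider_id}/deploy')
print(resp.status_code, resp.json())
```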


@@ -0,0 +1,2 @@
# Examples


@@ -0,0 +1,22 @@
# Installation
The quickest way to install Crawlab is to clone the repository locally:
```bash
git clone https://github.com/tikazyq/crawlab
```
Install the dependencies:
```bash
# install backend dependencies
pip install -r requirements.txt
```
```bash
# install frontend dependencies
cd frontend
npm install
```
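As an optional, illustrative check (not part of the official steps), you can confirm that the core backend packages from requirements.txt import cleanly:
```python
# Hedged sanity check: verify the key backend dependencies installed correctly.
import celery
import flask
import pymongo
import mongoengine

print('celery', celery.__version__)
print('flask', flask.__version__)
print('pymongo', pymongo.version)
```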


@@ -0,0 +1,436 @@
<!DOCTYPE HTML>
<html lang="" >
<head>
<meta charset="UTF-8">
<meta content="text/html; charset=utf-8" http-equiv="Content-Type">
<title>App · GitBook</title>
<meta http-equiv="X-UA-Compatible" content="IE=edge" />
<meta name="description" content="">
<meta name="generator" content="GitBook 3.2.3">
<link rel="stylesheet" href="../gitbook/style.css">
<link rel="stylesheet" href="../gitbook/gitbook-plugin-highlight/website.css">
<link rel="stylesheet" href="../gitbook/gitbook-plugin-search/search.css">
<link rel="stylesheet" href="../gitbook/gitbook-plugin-fontsettings/website.css">
<meta name="HandheldFriendly" content="true"/>
<meta name="viewport" content="width=device-width, initial-scale=1, user-scalable=no">
<meta name="apple-mobile-web-app-capable" content="yes">
<meta name="apple-mobile-web-app-status-bar-style" content="black">
<link rel="apple-touch-icon-precomposed" sizes="152x152" href="../gitbook/images/apple-touch-icon-precomposed-152.png">
<link rel="shortcut icon" href="../gitbook/images/favicon.ico" type="image/x-icon">
<link rel="next" href="../Examples/" />
<link rel="prev" href="Celery.html" />
</head>
<body>
<div class="book">
<div class="book-summary">
<div id="book-search-input" role="search">
<input type="text" placeholder="Type to search" />
</div>
<nav role="navigation">
<ul class="summary">
<li class="chapter " data-level="1.1" data-path="../">
<a href="../">
简介
</a>
</li>
<li class="chapter " data-level="1.2" data-path="../QuickStart/">
<a href="../QuickStart/">
快速开始
</a>
<ul class="articles">
<li class="chapter " data-level="1.2.1" data-path="../QuickStart/Installation.html">
<a href="../QuickStart/Installation.html">
安装
</a>
</li>
<li class="chapter " data-level="1.2.2" data-path="../QuickStart/Run.html">
<a href="../QuickStart/Run.html">
运行
</a>
</li>
</ul>
</li>
<li class="chapter " data-level="1.3" data-path="../Concept/">
<a href="../Concept/">
概念
</a>
<ul class="articles">
<li class="chapter " data-level="1.3.1" data-path="../Concept/Node.html">
<a href="../Concept/Node.html">
节点
</a>
</li>
<li class="chapter " data-level="1.3.2" data-path="../Concept/Spider.html">
<a href="../Concept/Spider.html">
爬虫
</a>
</li>
<li class="chapter " data-level="1.3.3" data-path="../Concept/Task.html">
<a href="../Concept/Task.html">
任务
</a>
</li>
<li class="chapter " data-level="1.3.4" data-path="../Concept/Deploy.html">
<a href="../Concept/Deploy.html">
部署
</a>
</li>
</ul>
</li>
<li class="chapter " data-level="1.4" data-path="./">
<a href="./">
架构
</a>
<ul class="articles">
<li class="chapter " data-level="1.4.1" data-path="Celery.html">
<a href="Celery.html">
Celery
</a>
</li>
<li class="chapter active" data-level="1.4.2" data-path="App.html">
<a href="App.html">
App
</a>
</li>
</ul>
</li>
<li class="chapter " data-level="1.5" data-path="../Examples/">
<a href="../Examples/">
Examples
</a>
<ul class="articles">
<li class="chapter " data-level="1.5.1" data-path="../Examples/">
<a href="../Examples/">
与Scrapy集成
</a>
</li>
<li class="chapter " data-level="1.5.2" data-path="../Examples/">
<a href="../Examples/">
与Puppeteer集成
</a>
</li>
</ul>
</li>
<li class="divider"></li>
<li>
<a href="https://www.gitbook.com" target="blank" class="gitbook-link">
Published with GitBook
</a>
</li>
</ul>
</nav>
</div>
<div class="book-body">
<div class="body-inner">
<div class="book-header" role="navigation">
<!-- Title -->
<h1>
<i class="fa fa-circle-o-notch fa-spin"></i>
<a href=".." >App</a>
</h1>
</div>
<div class="page-wrapper" tabindex="-1" role="main">
<div class="page-inner">
<div id="book-search-results">
<div class="search-noresults">
<section class="normal markdown-section">
<h1 id="app">App</h1>
</section>
</div>
<div class="search-results">
<div class="has-results">
<h1 class="search-results-title"><span class='search-results-count'></span> results matching "<span class='search-query'></span>"</h1>
<ul class="search-results-list"></ul>
</div>
<div class="no-results">
<h1 class="search-results-title">No results matching "<span class='search-query'></span>"</h1>
</div>
</div>
</div>
</div>
</div>
</div>
<a href="Celery.html" class="navigation navigation-prev " aria-label="Previous page: Celery">
<i class="fa fa-angle-left"></i>
</a>
<a href="../Examples/" class="navigation navigation-next " aria-label="Next page: Examples">
<i class="fa fa-angle-right"></i>
</a>
</div>
<script>
var gitbook = gitbook || [];
gitbook.push(function() {
gitbook.page.hasChanged({"page":{"title":"App","level":"1.4.2","depth":2,"next":{"title":"Examples","level":"1.5","depth":1,"path":"Examples/README.md","ref":"Examples/README.md","articles":[{"title":"与Scrapy集成","level":"1.5.1","depth":2,"path":"Examples/README.md","ref":"Examples/README.md","articles":[]},{"title":"与Puppeteer集成","level":"1.5.2","depth":2,"path":"Examples/README.md","ref":"Examples/README.md","articles":[]}]},"previous":{"title":"Celery","level":"1.4.1","depth":2,"path":"Architecture/Celery.md","ref":"Architecture/Celery.md","articles":[]},"dir":"ltr"},"config":{"gitbook":"*","theme":"default","variables":{},"plugins":[],"pluginsConfig":{"highlight":{},"search":{},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"fontsettings":{"theme":"white","family":"sans","size":2},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":false}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"}},"file":{"path":"Architecture/App.md","mtime":"2019-03-28T11:49:43.000Z","type":"markdown"},"gitbook":{"version":"3.2.3","time":"2019-03-28T12:07:05.349Z"},"basePath":"..","book":{"language":""}});
});
</script>
</div>
<script src="../gitbook/gitbook.js"></script>
<script src="../gitbook/theme.js"></script>
<script src="../gitbook/gitbook-plugin-search/search-engine.js"></script>
<script src="../gitbook/gitbook-plugin-search/search.js"></script>
<script src="../gitbook/gitbook-plugin-lunr/lunr.min.js"></script>
<script src="../gitbook/gitbook-plugin-lunr/search-lunr.js"></script>
<script src="../gitbook/gitbook-plugin-sharing/buttons.js"></script>
<script src="../gitbook/gitbook-plugin-fontsettings/fontsettings.js"></script>
</body>
</html>

View File

@@ -0,0 +1,438 @@
<!DOCTYPE HTML>
<html lang="" >
<head>
<meta charset="UTF-8">
<meta content="text/html; charset=utf-8" http-equiv="Content-Type">
<title>部署 · GitBook</title>
<meta http-equiv="X-UA-Compatible" content="IE=edge" />
<meta name="description" content="">
<meta name="generator" content="GitBook 3.2.3">
<link rel="stylesheet" href="../gitbook/style.css">
<link rel="stylesheet" href="../gitbook/gitbook-plugin-highlight/website.css">
<link rel="stylesheet" href="../gitbook/gitbook-plugin-search/search.css">
<link rel="stylesheet" href="../gitbook/gitbook-plugin-fontsettings/website.css">
<meta name="HandheldFriendly" content="true"/>
<meta name="viewport" content="width=device-width, initial-scale=1, user-scalable=no">
<meta name="apple-mobile-web-app-capable" content="yes">
<meta name="apple-mobile-web-app-status-bar-style" content="black">
<link rel="apple-touch-icon-precomposed" sizes="152x152" href="../gitbook/images/apple-touch-icon-precomposed-152.png">
<link rel="shortcut icon" href="../gitbook/images/favicon.ico" type="image/x-icon">
<link rel="next" href="../Architecture/" />
<link rel="prev" href="Task.html" />
</head>
<body>
<div class="book">
<div class="book-summary">
<div id="book-search-input" role="search">
<input type="text" placeholder="Type to search" />
</div>
<nav role="navigation">
<ul class="summary">
<li class="chapter " data-level="1.1" data-path="../">
<a href="../">
简介
</a>
</li>
<li class="chapter " data-level="1.2" data-path="../QuickStart/">
<a href="../QuickStart/">
快速开始
</a>
<ul class="articles">
<li class="chapter " data-level="1.2.1" data-path="../QuickStart/Installation.html">
<a href="../QuickStart/Installation.html">
安装
</a>
</li>
<li class="chapter " data-level="1.2.2" data-path="../QuickStart/Run.html">
<a href="../QuickStart/Run.html">
运行
</a>
</li>
</ul>
</li>
<li class="chapter " data-level="1.3" data-path="./">
<a href="./">
概念
</a>
<ul class="articles">
<li class="chapter " data-level="1.3.1" data-path="Node.html">
<a href="Node.html">
节点
</a>
</li>
<li class="chapter " data-level="1.3.2" data-path="Spider.html">
<a href="Spider.html">
爬虫
</a>
</li>
<li class="chapter " data-level="1.3.3" data-path="Task.html">
<a href="Task.html">
任务
</a>
</li>
<li class="chapter active" data-level="1.3.4" data-path="Deploy.html">
<a href="Deploy.html">
部署
</a>
</li>
</ul>
</li>
<li class="chapter " data-level="1.4" data-path="../Architecture/">
<a href="../Architecture/">
架构
</a>
<ul class="articles">
<li class="chapter " data-level="1.4.1" data-path="../Architecture/Celery.html">
<a href="../Architecture/Celery.html">
Celery
</a>
</li>
<li class="chapter " data-level="1.4.2" data-path="../Architecture/App.html">
<a href="../Architecture/App.html">
App
</a>
</li>
</ul>
</li>
<li class="chapter " data-level="1.5" data-path="../Examples/">
<a href="../Examples/">
Examples
</a>
<ul class="articles">
<li class="chapter " data-level="1.5.1" data-path="../Examples/">
<a href="../Examples/">
与Scrapy集成
</a>
</li>
<li class="chapter " data-level="1.5.2" data-path="../Examples/">
<a href="../Examples/">
与Puppeteer集成
</a>
</li>
</ul>
</li>
<li class="divider"></li>
<li>
<a href="https://www.gitbook.com" target="blank" class="gitbook-link">
Published with GitBook
</a>
</li>
</ul>
</nav>
</div>
<div class="book-body">
<div class="body-inner">
<div class="book-header" role="navigation">
<!-- Title -->
<h1>
<i class="fa fa-circle-o-notch fa-spin"></i>
<a href=".." >部署</a>
</h1>
</div>
<div class="page-wrapper" tabindex="-1" role="main">
<div class="page-inner">
<div id="book-search-results">
<div class="search-noresults">
<section class="normal markdown-section">
<h1 id="&#x90E8;&#x7F72;">&#x90E8;&#x7F72;</h1>
<p>&#x6240;&#x6709;&#x722C;&#x866B;&#x5728;&#x8FD0;&#x884C;&#x524D;&#x9700;&#x8981;&#x88AB;&#x90E8;&#x7F72;&#x5230;&#x76F8;&#x5E94;&#x7684;&#x8282;&#x70B9;&#x4E2D;&#x3002;</p>
<p>&#x90E8;&#x7F72;&#x65F6;&#xFF0C;&#x722C;&#x866B;&#x4F1A;&#x88AB;&#x6253;&#x5305;&#x5230;&#x76F8;&#x5E94;&#x7684;&#x76EE;&#x5F55;&#x4E2D;&#xFF0C;&#x65B9;&#x4FBF;&#x73AF;&#x5883;&#x9694;&#x79BB;&#xFF0C;&#x5F00;&#x53D1;&#x73AF;&#x5883;&#x7684;&#x722C;&#x866B;&#x548C;&#x751F;&#x4EA7;&#x73AF;&#x5883;&#x7684;&#x722C;&#x866B;&#x9700;&#x8981;&#x6253;&#x5305;&#x90E8;&#x7F72;&#x6765;&#x5B9E;&#x73B0;&#x9694;&#x79BB;&#x3002;</p>
</section>
</div>
<div class="search-results">
<div class="has-results">
<h1 class="search-results-title"><span class='search-results-count'></span> results matching "<span class='search-query'></span>"</h1>
<ul class="search-results-list"></ul>
</div>
<div class="no-results">
<h1 class="search-results-title">No results matching "<span class='search-query'></span>"</h1>
</div>
</div>
</div>
</div>
</div>
</div>
<a href="Task.html" class="navigation navigation-prev " aria-label="Previous page: 任务">
<i class="fa fa-angle-left"></i>
</a>
<a href="../Architecture/" class="navigation navigation-next " aria-label="Next page: 架构">
<i class="fa fa-angle-right"></i>
</a>
</div>
<script>
var gitbook = gitbook || [];
gitbook.push(function() {
gitbook.page.hasChanged({"page":{"title":"部署","level":"1.3.4","depth":2,"next":{"title":"架构","level":"1.4","depth":1,"path":"Architecture/README.md","ref":"Architecture/README.md","articles":[{"title":"Celery","level":"1.4.1","depth":2,"path":"Architecture/Celery.md","ref":"Architecture/Celery.md","articles":[]},{"title":"App","level":"1.4.2","depth":2,"path":"Architecture/App.md","ref":"Architecture/App.md","articles":[]}]},"previous":{"title":"任务","level":"1.3.3","depth":2,"path":"Concept/Task.md","ref":"Concept/Task.md","articles":[]},"dir":"ltr"},"config":{"gitbook":"*","theme":"default","variables":{},"plugins":[],"pluginsConfig":{"highlight":{},"search":{},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"fontsettings":{"theme":"white","family":"sans","size":2},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":false}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"}},"file":{"path":"Concept/Deploy.md","mtime":"2019-03-28T12:06:24.000Z","type":"markdown"},"gitbook":{"version":"3.2.3","time":"2019-03-28T12:07:05.349Z"},"basePath":"..","book":{"language":""}});
});
</script>
</div>
<script src="../gitbook/gitbook.js"></script>
<script src="../gitbook/theme.js"></script>
<script src="../gitbook/gitbook-plugin-search/search-engine.js"></script>
<script src="../gitbook/gitbook-plugin-search/search.js"></script>
<script src="../gitbook/gitbook-plugin-lunr/lunr.min.js"></script>
<script src="../gitbook/gitbook-plugin-lunr/search-lunr.js"></script>
<script src="../gitbook/gitbook-plugin-sharing/buttons.js"></script>
<script src="../gitbook/gitbook-plugin-fontsettings/fontsettings.js"></script>
</body>
</html>

View File

@@ -0,0 +1,436 @@
<!DOCTYPE HTML>
<html lang="" >
<head>
<meta charset="UTF-8">
<meta content="text/html; charset=utf-8" http-equiv="Content-Type">
<title>Examples · GitBook</title>
<meta http-equiv="X-UA-Compatible" content="IE=edge" />
<meta name="description" content="">
<meta name="generator" content="GitBook 3.2.3">
<link rel="stylesheet" href="../gitbook/style.css">
<link rel="stylesheet" href="../gitbook/gitbook-plugin-highlight/website.css">
<link rel="stylesheet" href="../gitbook/gitbook-plugin-search/search.css">
<link rel="stylesheet" href="../gitbook/gitbook-plugin-fontsettings/website.css">
<meta name="HandheldFriendly" content="true"/>
<meta name="viewport" content="width=device-width, initial-scale=1, user-scalable=no">
<meta name="apple-mobile-web-app-capable" content="yes">
<meta name="apple-mobile-web-app-status-bar-style" content="black">
<link rel="apple-touch-icon-precomposed" sizes="152x152" href="../gitbook/images/apple-touch-icon-precomposed-152.png">
<link rel="shortcut icon" href="../gitbook/images/favicon.ico" type="image/x-icon">
<link rel="next" href="./" />
<link rel="prev" href="../Architecture/App.html" />
</head>
<body>
<div class="book">
<div class="book-summary">
<div id="book-search-input" role="search">
<input type="text" placeholder="Type to search" />
</div>
<nav role="navigation">
<ul class="summary">
<li class="chapter " data-level="1.1" data-path="../">
<a href="../">
简介
</a>
</li>
<li class="chapter " data-level="1.2" data-path="../QuickStart/">
<a href="../QuickStart/">
快速开始
</a>
<ul class="articles">
<li class="chapter " data-level="1.2.1" data-path="../QuickStart/Installation.html">
<a href="../QuickStart/Installation.html">
安装
</a>
</li>
<li class="chapter " data-level="1.2.2" data-path="../QuickStart/Run.html">
<a href="../QuickStart/Run.html">
运行
</a>
</li>
</ul>
</li>
<li class="chapter " data-level="1.3" data-path="../Concept/">
<a href="../Concept/">
概念
</a>
<ul class="articles">
<li class="chapter " data-level="1.3.1" data-path="../Concept/Node.html">
<a href="../Concept/Node.html">
节点
</a>
</li>
<li class="chapter " data-level="1.3.2" data-path="../Concept/Spider.html">
<a href="../Concept/Spider.html">
爬虫
</a>
</li>
<li class="chapter " data-level="1.3.3" data-path="../Concept/Task.html">
<a href="../Concept/Task.html">
任务
</a>
</li>
<li class="chapter " data-level="1.3.4" data-path="../Concept/Deploy.html">
<a href="../Concept/Deploy.html">
部署
</a>
</li>
</ul>
</li>
<li class="chapter " data-level="1.4" data-path="../Architecture/">
<a href="../Architecture/">
架构
</a>
<ul class="articles">
<li class="chapter " data-level="1.4.1" data-path="../Architecture/Celery.html">
<a href="../Architecture/Celery.html">
Celery
</a>
</li>
<li class="chapter " data-level="1.4.2" data-path="../Architecture/App.html">
<a href="../Architecture/App.html">
App
</a>
</li>
</ul>
</li>
<li class="chapter active" data-level="1.5" data-path="./">
<a href="./">
Examples
</a>
<ul class="articles">
<li class="chapter active" data-level="1.5.1" data-path="./">
<a href="./">
与Scrapy集成
</a>
</li>
<li class="chapter active" data-level="1.5.2" data-path="./">
<a href="./">
与Puppeteer集成
</a>
</li>
</ul>
</li>
<li class="divider"></li>
<li>
<a href="https://www.gitbook.com" target="blank" class="gitbook-link">
Published with GitBook
</a>
</li>
</ul>
</nav>
</div>
<div class="book-body">
<div class="body-inner">
<div class="book-header" role="navigation">
<!-- Title -->
<h1>
<i class="fa fa-circle-o-notch fa-spin"></i>
<a href=".." >Examples</a>
</h1>
</div>
<div class="page-wrapper" tabindex="-1" role="main">
<div class="page-inner">
<div id="book-search-results">
<div class="search-noresults">
<section class="normal markdown-section">
<h1 id="examples">Examples</h1>
</section>
</div>
<div class="search-results">
<div class="has-results">
<h1 class="search-results-title"><span class='search-results-count'></span> results matching "<span class='search-query'></span>"</h1>
<ul class="search-results-list"></ul>
</div>
<div class="no-results">
<h1 class="search-results-title">No results matching "<span class='search-query'></span>"</h1>
</div>
</div>
</div>
</div>
</div>
</div>
<a href="../Architecture/App.html" class="navigation navigation-prev " aria-label="Previous page: App">
<i class="fa fa-angle-left"></i>
</a>
<a href="./" class="navigation navigation-next " aria-label="Next page: 与Scrapy集成">
<i class="fa fa-angle-right"></i>
</a>
</div>
<script>
var gitbook = gitbook || [];
gitbook.push(function() {
gitbook.page.hasChanged({"page":{"title":"Examples","level":"1.5","depth":1,"next":{"title":"与Scrapy集成","level":"1.5.1","depth":2,"path":"Examples/README.md","ref":"Examples/README.md","articles":[]},"previous":{"title":"App","level":"1.4.2","depth":2,"path":"Architecture/App.md","ref":"Architecture/App.md","articles":[]},"dir":"ltr"},"config":{"gitbook":"*","theme":"default","variables":{},"plugins":[],"pluginsConfig":{"highlight":{},"search":{},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"fontsettings":{"theme":"white","family":"sans","size":2},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":false}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"}},"file":{"path":"Examples/README.md","mtime":"2019-03-28T11:41:28.000Z","type":"markdown"},"gitbook":{"version":"3.2.3","time":"2019-03-28T12:07:05.349Z"},"basePath":"..","book":{"language":""}});
});
</script>
</div>
<script src="../gitbook/gitbook.js"></script>
<script src="../gitbook/theme.js"></script>
<script src="../gitbook/gitbook-plugin-search/search-engine.js"></script>
<script src="../gitbook/gitbook-plugin-search/search.js"></script>
<script src="../gitbook/gitbook-plugin-lunr/lunr.min.js"></script>
<script src="../gitbook/gitbook-plugin-lunr/search-lunr.js"></script>
<script src="../gitbook/gitbook-plugin-sharing/buttons.js"></script>
<script src="../gitbook/gitbook-plugin-fontsettings/fontsettings.js"></script>
</body>
</html>

View File

@@ -0,0 +1,447 @@
<!DOCTYPE HTML>
<html lang="" >
<head>
<meta charset="UTF-8">
<meta content="text/html; charset=utf-8" http-equiv="Content-Type">
<title>安装 · GitBook</title>
<meta http-equiv="X-UA-Compatible" content="IE=edge" />
<meta name="description" content="">
<meta name="generator" content="GitBook 3.2.3">
<link rel="stylesheet" href="../gitbook/style.css">
<link rel="stylesheet" href="../gitbook/gitbook-plugin-highlight/website.css">
<link rel="stylesheet" href="../gitbook/gitbook-plugin-search/search.css">
<link rel="stylesheet" href="../gitbook/gitbook-plugin-fontsettings/website.css">
<meta name="HandheldFriendly" content="true"/>
<meta name="viewport" content="width=device-width, initial-scale=1, user-scalable=no">
<meta name="apple-mobile-web-app-capable" content="yes">
<meta name="apple-mobile-web-app-status-bar-style" content="black">
<link rel="apple-touch-icon-precomposed" sizes="152x152" href="../gitbook/images/apple-touch-icon-precomposed-152.png">
<link rel="shortcut icon" href="../gitbook/images/favicon.ico" type="image/x-icon">
<link rel="next" href="Run.html" />
<link rel="prev" href="./" />
</head>
<body>
<div class="book">
<div class="book-summary">
<div id="book-search-input" role="search">
<input type="text" placeholder="Type to search" />
</div>
<nav role="navigation">
<ul class="summary">
<li class="chapter " data-level="1.1" data-path="../">
<a href="../">
简介
</a>
</li>
<li class="chapter " data-level="1.2" data-path="./">
<a href="./">
快速开始
</a>
<ul class="articles">
<li class="chapter active" data-level="1.2.1" data-path="Installation.html">
<a href="Installation.html">
安装
</a>
</li>
<li class="chapter " data-level="1.2.2" data-path="Run.html">
<a href="Run.html">
运行
</a>
</li>
</ul>
</li>
<li class="chapter " data-level="1.3" data-path="../Concept/">
<a href="../Concept/">
概念
</a>
<ul class="articles">
<li class="chapter " data-level="1.3.1" data-path="../Concept/Node.html">
<a href="../Concept/Node.html">
节点
</a>
</li>
<li class="chapter " data-level="1.3.2" data-path="../Concept/Spider.html">
<a href="../Concept/Spider.html">
爬虫
</a>
</li>
<li class="chapter " data-level="1.3.3" data-path="../Concept/Task.html">
<a href="../Concept/Task.html">
任务
</a>
</li>
<li class="chapter " data-level="1.3.4" data-path="../Concept/Deploy.html">
<a href="../Concept/Deploy.html">
部署
</a>
</li>
</ul>
</li>
<li class="chapter " data-level="1.4" data-path="../Architecture/">
<a href="../Architecture/">
架构
</a>
<ul class="articles">
<li class="chapter " data-level="1.4.1" data-path="../Architecture/Celery.html">
<a href="../Architecture/Celery.html">
Celery
</a>
</li>
<li class="chapter " data-level="1.4.2" data-path="../Architecture/App.html">
<a href="../Architecture/App.html">
App
</a>
</li>
</ul>
</li>
<li class="chapter " data-level="1.5" data-path="../Examples/">
<a href="../Examples/">
Examples
</a>
<ul class="articles">
<li class="chapter " data-level="1.5.1" data-path="../Examples/">
<a href="../Examples/">
与Scrapy集成
</a>
</li>
<li class="chapter " data-level="1.5.2" data-path="../Examples/">
<a href="../Examples/">
与Puppeteer集成
</a>
</li>
</ul>
</li>
<li class="divider"></li>
<li>
<a href="https://www.gitbook.com" target="blank" class="gitbook-link">
Published with GitBook
</a>
</li>
</ul>
</nav>
</div>
<div class="book-body">
<div class="body-inner">
<div class="book-header" role="navigation">
<!-- Title -->
<h1>
<i class="fa fa-circle-o-notch fa-spin"></i>
<a href=".." >安装</a>
</h1>
</div>
<div class="page-wrapper" tabindex="-1" role="main">
<div class="page-inner">
<div id="book-search-results">
<div class="search-noresults">
<section class="normal markdown-section">
<h1 id="&#x5B89;&#x88C5;">&#x5B89;&#x88C5;</h1>
<p>&#x6700;&#x5FEB;&#x5B89;&#x88C5;Crawlab&#x7684;&#x65B9;&#x5F0F;&#x662F;&#x514B;&#x9686;&#x4E00;&#x4EFD;&#x4EE3;&#x7801;&#x5230;&#x672C;&#x5730;</p>
<pre><code class="lang-bash">git <span class="hljs-built_in">clone</span> https://github.com/tikazyq/crawlab
</code></pre>
<p>&#x5B89;&#x88C5;&#x7C7B;&#x5E93;</p>
<pre><code class="lang-bash"><span class="hljs-comment"># &#x5B89;&#x88C5;&#x540E;&#x53F0;&#x7C7B;&#x5E93;</span>
pip install -r requirements.txt
</code></pre>
<pre><code class="lang-bash"><span class="hljs-comment"># &#x5B89;&#x88C5;&#x524D;&#x53F0;&#x7C7B;&#x5E93;</span>
<span class="hljs-built_in">cd</span> frontend
npm install
</code></pre>
</section>
</div>
<div class="search-results">
<div class="has-results">
<h1 class="search-results-title"><span class='search-results-count'></span> results matching "<span class='search-query'></span>"</h1>
<ul class="search-results-list"></ul>
</div>
<div class="no-results">
<h1 class="search-results-title">No results matching "<span class='search-query'></span>"</h1>
</div>
</div>
</div>
</div>
</div>
</div>
<a href="./" class="navigation navigation-prev " aria-label="Previous page: 快速开始">
<i class="fa fa-angle-left"></i>
</a>
<a href="Run.html" class="navigation navigation-next " aria-label="Next page: 运行">
<i class="fa fa-angle-right"></i>
</a>
</div>
<script>
var gitbook = gitbook || [];
gitbook.push(function() {
gitbook.page.hasChanged({"page":{"title":"安装","level":"1.2.1","depth":2,"next":{"title":"运行","level":"1.2.2","depth":2,"path":"QuickStart/Run.md","ref":"QuickStart/Run.md","articles":[]},"previous":{"title":"快速开始","level":"1.2","depth":1,"path":"QuickStart/README.md","ref":"QuickStart/README.md","articles":[{"title":"安装","level":"1.2.1","depth":2,"path":"QuickStart/Installation.md","ref":"QuickStart/Installation.md","articles":[]},{"title":"运行","level":"1.2.2","depth":2,"path":"QuickStart/Run.md","ref":"QuickStart/Run.md","articles":[]}]},"dir":"ltr"},"config":{"gitbook":"*","theme":"default","variables":{},"plugins":[],"pluginsConfig":{"highlight":{},"search":{},"lunr":{"maxIndexSize":1000000,"ignoreSpecialCharacters":false},"sharing":{"facebook":true,"twitter":true,"google":false,"weibo":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"fontsettings":{"theme":"white","family":"sans","size":2},"theme-default":{"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"},"showLevel":false}},"structure":{"langs":"LANGS.md","readme":"README.md","glossary":"GLOSSARY.md","summary":"SUMMARY.md"},"pdf":{"pageNumbers":true,"fontSize":12,"fontFamily":"Arial","paperSize":"a4","chapterMark":"pagebreak","pageBreaksBefore":"/","margin":{"right":62,"left":62,"top":56,"bottom":56}},"styles":{"website":"styles/website.css","pdf":"styles/pdf.css","epub":"styles/epub.css","mobi":"styles/mobi.css","ebook":"styles/ebook.css","print":"styles/print.css"}},"file":{"path":"QuickStart/Installation.md","mtime":"2019-03-28T11:55:48.000Z","type":"markdown"},"gitbook":{"version":"3.2.3","time":"2019-03-28T12:07:05.349Z"},"basePath":"..","book":{"language":""}});
});
</script>
</div>
<script src="../gitbook/gitbook.js"></script>
<script src="../gitbook/theme.js"></script>
<script src="../gitbook/gitbook-plugin-search/search-engine.js"></script>
<script src="../gitbook/gitbook-plugin-search/search.js"></script>
<script src="../gitbook/gitbook-plugin-lunr/lunr.min.js"></script>
<script src="../gitbook/gitbook-plugin-lunr/search-lunr.js"></script>
<script src="../gitbook/gitbook-plugin-sharing/buttons.js"></script>
<script src="../gitbook/gitbook-plugin-fontsettings/fontsettings.js"></script>
</body>
</html>

Binary file not shown.

View File

@@ -0,0 +1,240 @@
require(['gitbook', 'jquery'], function(gitbook, $) {
// Configuration
var MAX_SIZE = 4,
MIN_SIZE = 0,
BUTTON_ID;
// Current fontsettings state
var fontState;
// Default themes
var THEMES = [
{
config: 'white',
text: 'White',
id: 0
},
{
config: 'sepia',
text: 'Sepia',
id: 1
},
{
config: 'night',
text: 'Night',
id: 2
}
];
// Default font families
var FAMILIES = [
{
config: 'serif',
text: 'Serif',
id: 0
},
{
config: 'sans',
text: 'Sans',
id: 1
}
];
// Return configured themes
function getThemes() {
return THEMES;
}
// Modify configured themes
function setThemes(themes) {
THEMES = themes;
updateButtons();
}
// Return configured font families
function getFamilies() {
return FAMILIES;
}
// Modify configured font families
function setFamilies(families) {
FAMILIES = families;
updateButtons();
}
// Save current font settings
function saveFontSettings() {
gitbook.storage.set('fontState', fontState);
update();
}
// Increase font size
function enlargeFontSize(e) {
e.preventDefault();
if (fontState.size >= MAX_SIZE) return;
fontState.size++;
saveFontSettings();
}
// Decrease font size
function reduceFontSize(e) {
e.preventDefault();
if (fontState.size <= MIN_SIZE) return;
fontState.size--;
saveFontSettings();
}
// Change font family
function changeFontFamily(configName, e) {
if (e && e instanceof Event) {
e.preventDefault();
}
var familyId = getFontFamilyId(configName);
fontState.family = familyId;
saveFontSettings();
}
// Change type of color theme
function changeColorTheme(configName, e) {
if (e && e instanceof Event) {
e.preventDefault();
}
var $book = gitbook.state.$book;
// Remove currently applied color theme
if (fontState.theme !== 0)
$book.removeClass('color-theme-'+fontState.theme);
// Set new color theme
var themeId = getThemeId(configName);
fontState.theme = themeId;
if (fontState.theme !== 0)
$book.addClass('color-theme-'+fontState.theme);
saveFontSettings();
}
// Return the correct id for a font-family config key
// Default to first font-family
function getFontFamilyId(configName) {
// Search for plugin configured font family
var configFamily = $.grep(FAMILIES, function(family) {
return family.config == configName;
})[0];
// Fallback to default font family
return (!!configFamily)? configFamily.id : 0;
}
// Return the correct id for a theme config key
// Default to first theme
function getThemeId(configName) {
// Search for plugin configured theme
var configTheme = $.grep(THEMES, function(theme) {
return theme.config == configName;
})[0];
// Fallback to default theme
return (!!configTheme)? configTheme.id : 0;
}
function update() {
var $book = gitbook.state.$book;
$('.font-settings .font-family-list li').removeClass('active');
$('.font-settings .font-family-list li:nth-child('+(fontState.family+1)+')').addClass('active');
$book[0].className = $book[0].className.replace(/\bfont-\S+/g, '');
$book.addClass('font-size-'+fontState.size);
$book.addClass('font-family-'+fontState.family);
if(fontState.theme !== 0) {
$book[0].className = $book[0].className.replace(/\bcolor-theme-\S+/g, '');
$book.addClass('color-theme-'+fontState.theme);
}
}
function init(config) {
// Search for plugin configured font family
var configFamily = getFontFamilyId(config.family),
configTheme = getThemeId(config.theme);
// Instantiate font state object
fontState = gitbook.storage.get('fontState', {
size: config.size || 2,
family: configFamily,
theme: configTheme
});
update();
}
function updateButtons() {
// Remove existing fontsettings buttons
if (!!BUTTON_ID) {
gitbook.toolbar.removeButton(BUTTON_ID);
}
// Create buttons in toolbar
BUTTON_ID = gitbook.toolbar.createButton({
icon: 'fa fa-font',
label: 'Font Settings',
className: 'font-settings',
dropdown: [
[
{
text: 'A',
className: 'font-reduce',
onClick: reduceFontSize
},
{
text: 'A',
className: 'font-enlarge',
onClick: enlargeFontSize
}
],
$.map(FAMILIES, function(family) {
family.onClick = function(e) {
return changeFontFamily(family.config, e);
};
return family;
}),
$.map(THEMES, function(theme) {
theme.onClick = function(e) {
return changeColorTheme(theme.config, e);
};
return theme;
})
]
});
}
// Init configuration at start
gitbook.events.bind('start', function(e, config) {
var opts = config.fontsettings;
// Generate buttons at start
updateButtons();
// Init current settings
init(opts);
});
// Expose API
gitbook.fontsettings = {
enlargeFontSize: enlargeFontSize,
reduceFontSize: reduceFontSize,
setTheme: changeColorTheme,
setFamily: changeFontFamily,
getThemes: getThemes,
setThemes: setThemes,
getFamilies: getFamilies,
setFamilies: setFamilies
};
});

View File

@@ -0,0 +1,135 @@
pre,
code {
/* http://jmblog.github.io/color-themes-for-highlightjs */
/* Tomorrow Comment */
/* Tomorrow Red */
/* Tomorrow Orange */
/* Tomorrow Yellow */
/* Tomorrow Green */
/* Tomorrow Aqua */
/* Tomorrow Blue */
/* Tomorrow Purple */
}
pre .hljs-comment,
code .hljs-comment,
pre .hljs-title,
code .hljs-title {
color: #8e908c;
}
pre .hljs-variable,
code .hljs-variable,
pre .hljs-attribute,
code .hljs-attribute,
pre .hljs-tag,
code .hljs-tag,
pre .hljs-regexp,
code .hljs-regexp,
pre .hljs-deletion,
code .hljs-deletion,
pre .ruby .hljs-constant,
code .ruby .hljs-constant,
pre .xml .hljs-tag .hljs-title,
code .xml .hljs-tag .hljs-title,
pre .xml .hljs-pi,
code .xml .hljs-pi,
pre .xml .hljs-doctype,
code .xml .hljs-doctype,
pre .html .hljs-doctype,
code .html .hljs-doctype,
pre .css .hljs-id,
code .css .hljs-id,
pre .css .hljs-class,
code .css .hljs-class,
pre .css .hljs-pseudo,
code .css .hljs-pseudo {
color: #c82829;
}
pre .hljs-number,
code .hljs-number,
pre .hljs-preprocessor,
code .hljs-preprocessor,
pre .hljs-pragma,
code .hljs-pragma,
pre .hljs-built_in,
code .hljs-built_in,
pre .hljs-literal,
code .hljs-literal,
pre .hljs-params,
code .hljs-params,
pre .hljs-constant,
code .hljs-constant {
color: #f5871f;
}
pre .ruby .hljs-class .hljs-title,
code .ruby .hljs-class .hljs-title,
pre .css .hljs-rules .hljs-attribute,
code .css .hljs-rules .hljs-attribute {
color: #eab700;
}
pre .hljs-string,
code .hljs-string,
pre .hljs-value,
code .hljs-value,
pre .hljs-inheritance,
code .hljs-inheritance,
pre .hljs-header,
code .hljs-header,
pre .hljs-addition,
code .hljs-addition,
pre .ruby .hljs-symbol,
code .ruby .hljs-symbol,
pre .xml .hljs-cdata,
code .xml .hljs-cdata {
color: #718c00;
}
pre .css .hljs-hexcolor,
code .css .hljs-hexcolor {
color: #3e999f;
}
pre .hljs-function,
code .hljs-function,
pre .python .hljs-decorator,
code .python .hljs-decorator,
pre .python .hljs-title,
code .python .hljs-title,
pre .ruby .hljs-function .hljs-title,
code .ruby .hljs-function .hljs-title,
pre .ruby .hljs-title .hljs-keyword,
code .ruby .hljs-title .hljs-keyword,
pre .perl .hljs-sub,
code .perl .hljs-sub,
pre .javascript .hljs-title,
code .javascript .hljs-title,
pre .coffeescript .hljs-title,
code .coffeescript .hljs-title {
color: #4271ae;
}
pre .hljs-keyword,
code .hljs-keyword,
pre .javascript .hljs-function,
code .javascript .hljs-function {
color: #8959a8;
}
pre .hljs,
code .hljs {
display: block;
background: white;
color: #4d4d4c;
padding: 0.5em;
}
pre .coffeescript .javascript,
code .coffeescript .javascript,
pre .javascript .xml,
code .javascript .xml,
pre .tex .hljs-formula,
code .tex .hljs-formula,
pre .xml .javascript,
code .xml .javascript,
pre .xml .vbscript,
code .xml .vbscript,
pre .xml .css,
code .xml .css,
pre .xml .hljs-cdata,
code .xml .hljs-cdata {
opacity: 0.5;
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,90 @@
require(['gitbook', 'jquery'], function(gitbook, $) {
var SITES = {
'facebook': {
'label': 'Facebook',
'icon': 'fa fa-facebook',
'onClick': function(e) {
e.preventDefault();
window.open('http://www.facebook.com/sharer/sharer.php?s=100&p[url]='+encodeURIComponent(location.href));
}
},
'twitter': {
'label': 'Twitter',
'icon': 'fa fa-twitter',
'onClick': function(e) {
e.preventDefault();
window.open('http://twitter.com/home?status='+encodeURIComponent(document.title+' '+location.href));
}
},
'google': {
'label': 'Google+',
'icon': 'fa fa-google-plus',
'onClick': function(e) {
e.preventDefault();
window.open('https://plus.google.com/share?url='+encodeURIComponent(location.href));
}
},
'weibo': {
'label': 'Weibo',
'icon': 'fa fa-weibo',
'onClick': function(e) {
e.preventDefault();
window.open('http://service.weibo.com/share/share.php?content=utf-8&url='+encodeURIComponent(location.href)+'&title='+encodeURIComponent(document.title));
}
},
'instapaper': {
'label': 'Instapaper',
'icon': 'fa fa-instapaper',
'onClick': function(e) {
e.preventDefault();
window.open('http://www.instapaper.com/text?u='+encodeURIComponent(location.href));
}
},
'vk': {
'label': 'VK',
'icon': 'fa fa-vk',
'onClick': function(e) {
e.preventDefault();
window.open('http://vkontakte.ru/share.php?url='+encodeURIComponent(location.href));
}
}
};
gitbook.events.bind('start', function(e, config) {
var opts = config.sharing;
// Create dropdown menu
var menu = $.map(opts.all, function(id) {
var site = SITES[id];
return {
text: site.label,
onClick: site.onClick
};
});
// Create main button with dropdown
if (menu.length > 0) {
gitbook.toolbar.createButton({
icon: 'fa fa-share-alt',
label: 'Share',
position: 'right',
dropdown: [menu]
});
}
// Direct actions to share
$.each(SITES, function(sideId, site) {
if (!opts[sideId]) return;
gitbook.toolbar.createButton({
icon: site.icon,
label: site.text,
position: 'right',
onClick: site.onClick
});
});
});
});

Binary file not shown.


Binary file not shown.


Binary file not shown.


Binary file not shown.


15537
frontend/package-lock.json generated Normal file

File diff suppressed because it is too large

View File

@@ -0,0 +1,27 @@
import request from '@/utils/request'
export function login (username, password) {
return request({
url: '/user/login',
method: 'post',
data: {
username,
password
}
})
}
export function getInfo (token) {
return request({
url: '/user/info',
method: 'get',
params: { token }
})
}
export function logout () {
return request({
url: '/user/logout',
method: 'post'
})
}

View File

@@ -0,0 +1,46 @@
import axios from 'axios'
let baseUrl = 'http://localhost:8000/api'
if (process.env.NODE_ENV === 'production') {
baseUrl = 'http://139.129.230.98:8000/api'
}
// const baseUrl = process.env.API_BASE_URL || 'http://localhost:8000/api'
const request = (method, path, params, data) => {
return new Promise((resolve, reject) => {
const url = `${baseUrl}${path}`
axios({
method,
url,
params,
data
})
.then(resolve)
.catch(reject)
})
}
const get = (path, params) => {
return request('GET', path, params)
}
const post = (path, data) => {
return request('POST', path, {}, data)
}
const put = (path, data) => {
return request('PUT', path, {}, data)
}
const del = (path, data) => {
  // pass the optional payload through so a DELETE body is not silently dropped
  return request('DELETE', path, {}, data)
}
export default {
baseUrl,
request,
get,
post,
put,
delete: del
}
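
A short usage sketch of the exported helpers; the endpoint paths and payload below are placeholders chosen to match other files in this diff, and the import path is assumed rather than taken from this commit.

```js
import request from '@/utils/request' // path assumed; adjust to where this module lives

// GET with query params → resolves to the axios response object
request.get('/spiders', { page: 1 })
  .then(response => console.log(response.data))

// POST with a JSON body (spiderId is a placeholder, not a real ObjectId)
const spiderId = 'some-spider-id'
request.post(`/spiders/${spiderId}/update_envs`, {
  envs: JSON.stringify([{ name: 'MONGO_HOST', value: '127.0.0.1' }])
})
```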

Binary file not shown.


View File

@@ -0,0 +1,75 @@
<template>
<div class="environment-list">
<el-row>
<div class="button-group">
<el-button type="primary" @click="addEnv" icon="el-icon-plus">{{$t('Add Environment Variables')}}</el-button>
<el-button type="success" @click="save">{{$t('Save')}}</el-button>
</div>
</el-row>
<el-row>
<el-table :data="spiderForm.envs">
<el-table-column :label="$t('Variable')">
<template slot-scope="scope">
<el-input v-model="scope.row.name" :placeholder="$t('Variable')"></el-input>
</template>
</el-table-column>
<el-table-column :label="$t('Value')">
<template slot-scope="scope">
<el-input v-model="scope.row.value" :placeholder="$t('Value')"></el-input>
</template>
</el-table-column>
<el-table-column :label="$t('Action')">
<template slot-scope="scope">
<el-button size="mini" icon="el-icon-delete" type="danger" @click="deleteEnv(scope.$index)"></el-button>
</template>
</el-table-column>
</el-table>
</el-row>
</div>
</template>
<script>
import {
mapState
} from 'vuex'
export default {
name: 'EnvironmentList',
computed: {
...mapState('spider', [
'spiderForm'
])
},
methods: {
addEnv () {
if (!this.spiderForm.envs) {
this.$set(this.spiderForm, 'envs', [])
}
this.spiderForm.envs.push({
name: '',
value: ''
})
console.log(this.spiderForm)
},
deleteEnv (index) {
this.spiderForm.envs.splice(index, 1)
},
save () {
this.$store.dispatch('spider/updateSpiderEnvs')
.then(() => {
this.$message.success(this.$t('Spider info has been saved successfully'))
})
.catch(error => {
this.$message.error(error)
})
}
}
}
</script>
<style scoped>
.button-group {
width: 100%;
text-align: right;
}
</style>
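
The table above edits a plain list of `{ name, value }` pairs stored on `spiderForm.envs`. At run time a Node.js spider would read those variables from `process.env`, as the example spiders later in this diff do; a tiny sketch with placeholder values:

```js
// Shape produced by the table above (values are placeholders):
const envs = [
  { name: 'MONGO_HOST', value: '127.0.0.1' },
  { name: 'CRAWLAB_COLLECTION', value: 'results_juejin' }
]

// Inside a spider process launched with these variables injected:
const host = process.env.MONGO_HOST              // '127.0.0.1'
const colName = process.env.CRAWLAB_COLLECTION   // 'results_juejin'
console.log(`writing results to ${colName} on ${host}`)
```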

View File

@@ -0,0 +1,19 @@
/**
 * Convert a base64 data URL into a binary Blob.
 *
 * @param {String} data dataURL in the form "data:image/png;base64,****"; everything before the comma is descriptive metadata, only the part after the comma is needed
 * @param {String} mime MIME type of the resulting Blob
 * @return {Blob}
 */
export default function(data, mime) {
data = data.split(',')[1]
data = window.atob(data)
var ia = new Uint8Array(data.length)
for (var i = 0; i < data.length; i++) {
ia[i] = data.charCodeAt(i)
}
  // canvas.toDataURL returns image/png by default
return new Blob([ia], {
type: mime
})
}
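
A brief usage sketch: convert a canvas snapshot to a Blob and attach it to a multipart upload. The import path and canvas id are placeholders, not names taken from this commit.

```js
import dataURLtoBlob from '@/utils/dataURLtoBlob' // path assumed

const canvas = document.getElementById('my-canvas')  // hypothetical canvas element
const dataURL = canvas.toDataURL('image/png')        // "data:image/png;base64,...."
const blob = dataURLtoBlob(dataURL, 'image/png')

// The Blob can then be sent as form data
const form = new FormData()
form.append('file', blob, 'snapshot.png')
```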

View File

@@ -0,0 +1,103 @@
<template>
<div class="upload-container">
<el-button :style="{background:color,borderColor:color}" icon="el-icon-upload" size="mini" type="primary" @click=" dialogVisible=true">上传图片
</el-button>
<el-dialog :visible.sync="dialogVisible">
<el-upload
:multiple="true"
:file-list="fileList"
:show-file-list="true"
:on-remove="handleRemove"
:on-success="handleSuccess"
:before-upload="beforeUpload"
class="editor-slide-upload"
action="https://httpbin.org/post"
list-type="picture-card">
<el-button size="small" type="primary">点击上传</el-button>
</el-upload>
<el-button @click="dialogVisible = false"> </el-button>
<el-button type="primary" @click="handleSubmit"> </el-button>
</el-dialog>
</div>
</template>
<script>
// import { getToken } from 'api/qiniu'
export default {
name: 'EditorSlideUpload',
props: {
color: {
type: String,
default: '#1890ff'
}
},
data() {
return {
dialogVisible: false,
listObj: {},
fileList: []
}
},
methods: {
checkAllSuccess() {
return Object.keys(this.listObj).every(item => this.listObj[item].hasSuccess)
},
handleSubmit() {
const arr = Object.keys(this.listObj).map(v => this.listObj[v])
if (!this.checkAllSuccess()) {
this.$message('请等待所有图片上传成功 或 出现了网络问题,请刷新页面重新上传!')
return
}
this.$emit('successCBK', arr)
this.listObj = {}
this.fileList = []
this.dialogVisible = false
},
handleSuccess(response, file) {
const uid = file.uid
const objKeyArr = Object.keys(this.listObj)
for (let i = 0, len = objKeyArr.length; i < len; i++) {
if (this.listObj[objKeyArr[i]].uid === uid) {
this.listObj[objKeyArr[i]].url = response.files.file
this.listObj[objKeyArr[i]].hasSuccess = true
return
}
}
},
handleRemove(file) {
const uid = file.uid
const objKeyArr = Object.keys(this.listObj)
for (let i = 0, len = objKeyArr.length; i < len; i++) {
if (this.listObj[objKeyArr[i]].uid === uid) {
delete this.listObj[objKeyArr[i]]
return
}
}
},
beforeUpload(file) {
const _self = this
const _URL = window.URL || window.webkitURL
const fileName = file.uid
this.listObj[fileName] = {}
return new Promise((resolve, reject) => {
const img = new Image()
img.src = _URL.createObjectURL(file)
img.onload = function() {
_self.listObj[fileName] = { hasSuccess: false, uid: file.uid, width: this.width, height: this.height }
}
resolve(true)
})
}
}
}
</script>
<style rel="stylesheet/scss" lang="scss" scoped>
.editor-slide-upload {
margin-bottom: 20px;
/deep/ .el-upload--picture-card {
width: 100%;
}
}
</style>

View File

@@ -1,127 +0,0 @@
<template>
<div class="singleImageUpload2 upload-container">
<el-upload
:data="dataObj"
:multiple="false"
:show-file-list="false"
:on-success="handleImageSuccess"
class="image-uploader"
drag
action="https://httpbin.org/post">
<i class="el-icon-upload"/>
<div class="el-upload__text">Drag或<em>点击上传</em></div>
</el-upload>
<div v-show="imageUrl.length>0" class="image-preview">
<div v-show="imageUrl.length>1" class="image-preview-wrapper">
<img :src="imageUrl">
<div class="image-preview-action">
<i class="el-icon-delete" @click="rmImage"/>
</div>
</div>
</div>
</div>
</template>
<script>
import { getToken } from '@/api/qiniu'
export default {
name: 'SingleImageUpload2',
props: {
value: {
type: String,
default: ''
}
},
data() {
return {
tempUrl: '',
dataObj: { token: '', key: '' }
}
},
computed: {
imageUrl() {
return this.value
}
},
methods: {
rmImage() {
this.emitInput('')
},
emitInput(val) {
this.$emit('input', val)
},
handleImageSuccess() {
this.emitInput(this.tempUrl)
},
beforeUpload() {
const _self = this
return new Promise((resolve, reject) => {
getToken().then(response => {
const key = response.data.qiniu_key
const token = response.data.qiniu_token
_self._data.dataObj.token = token
_self._data.dataObj.key = key
this.tempUrl = response.data.qiniu_url
resolve(true)
}).catch(() => {
reject(false)
})
})
}
}
}
</script>
<style rel="stylesheet/scss" lang="scss" scoped>
.upload-container {
width: 100%;
height: 100%;
position: relative;
.image-uploader {
height: 100%;
}
.image-preview {
width: 100%;
height: 100%;
position: absolute;
left: 0px;
top: 0px;
border: 1px dashed #d9d9d9;
.image-preview-wrapper {
position: relative;
width: 100%;
height: 100%;
img {
width: 100%;
height: 100%;
}
}
.image-preview-action {
position: absolute;
width: 100%;
height: 100%;
left: 0;
top: 0;
cursor: default;
text-align: center;
color: #fff;
opacity: 0;
font-size: 20px;
background-color: rgba(0, 0, 0, .5);
transition: opacity .3s;
cursor: pointer;
text-align: center;
line-height: 200px;
.el-icon-delete {
font-size: 36px;
}
}
&:hover {
.image-preview-action {
opacity: 1;
}
}
}
}
</style>

View File

@@ -1,154 +0,0 @@
<template>
<div class="upload-container">
<el-upload
:data="dataObj"
:multiple="false"
:show-file-list="false"
:on-success="handleImageSuccess"
class="image-uploader"
drag
action="https://httpbin.org/post">
<i class="el-icon-upload"/>
<div class="el-upload__text">将文件拖到此处<em>点击上传</em></div>
</el-upload>
<div class="image-preview image-app-preview">
<div v-show="imageUrl.length>1" class="image-preview-wrapper">
<img :src="imageUrl">
<div class="image-preview-action">
<i class="el-icon-delete" @click="rmImage"/>
</div>
</div>
</div>
<div class="image-preview">
<div v-show="imageUrl.length>1" class="image-preview-wrapper">
<img :src="imageUrl">
<div class="image-preview-action">
<i class="el-icon-delete" @click="rmImage"/>
</div>
</div>
</div>
</div>
</template>
<script>
import { getToken } from '@/api/qiniu'
export default {
name: 'SingleImageUpload3',
props: {
value: {
type: String,
default: ''
}
},
data() {
return {
tempUrl: '',
dataObj: { token: '', key: '' }
}
},
computed: {
imageUrl() {
return this.value
}
},
methods: {
rmImage() {
this.emitInput('')
},
emitInput(val) {
this.$emit('input', val)
},
handleImageSuccess(file) {
this.emitInput(file.files.file)
},
beforeUpload() {
const _self = this
return new Promise((resolve, reject) => {
getToken().then(response => {
const key = response.data.qiniu_key
const token = response.data.qiniu_token
_self._data.dataObj.token = token
_self._data.dataObj.key = key
this.tempUrl = response.data.qiniu_url
resolve(true)
}).catch(err => {
console.log(err)
reject(false)
})
})
}
}
}
</script>
<style rel="stylesheet/scss" lang="scss" scoped>
@import "~@/styles/mixin.scss";
.upload-container {
width: 100%;
position: relative;
@include clearfix;
.image-uploader {
width: 35%;
float: left;
}
.image-preview {
width: 200px;
height: 200px;
position: relative;
border: 1px dashed #d9d9d9;
float: left;
margin-left: 50px;
.image-preview-wrapper {
position: relative;
width: 100%;
height: 100%;
img {
width: 100%;
height: 100%;
}
}
.image-preview-action {
position: absolute;
width: 100%;
height: 100%;
left: 0;
top: 0;
cursor: default;
text-align: center;
color: #fff;
opacity: 0;
font-size: 20px;
background-color: rgba(0, 0, 0, .5);
transition: opacity .3s;
cursor: pointer;
text-align: center;
line-height: 200px;
.el-icon-delete {
font-size: 36px;
}
}
&:hover {
.image-preview-action {
opacity: 1;
}
}
}
.image-app-preview {
width: 320px;
height: 180px;
position: relative;
border: 1px dashed #d9d9d9;
float: left;
margin-left: 50px;
.app-fake-conver {
height: 44px;
position: absolute;
width: 100%; // background: rgba(0, 0, 0, .1);
text-align: center;
line-height: 64px;
color: #fff;
}
}
}
</style>

View File

@@ -16,6 +16,7 @@ export default {
'Deployed Spiders': '已部署爬虫',
'Log': '日志',
'Results': '结果',
'Environment': '环境',
// 选择
Spider: '爬虫',
@@ -79,6 +80,9 @@ export default {
'Language': '语言',
'Schedule Enabled': '是否开启定时任务',
'Schedule Cron': '定时任务',
'Variable': '变量',
'Value': '值',
'Add Environment Variables': '添加环境变量',
// 爬虫列表
'Name': '名称',
@@ -126,5 +130,5 @@ export default {
'Node info has been saved successfully': '节点信息已成功保存',
'Are you sure to deploy this spider?': '你确定要部署该爬虫?',
'Are you sure to delete this spider?': '你确定要删除该爬虫?',
'Spider info has been saved successfully': '爬虫信息已成功保存',
'Spider info has been saved successfully': '爬虫信息已成功保存'
}

View File

@@ -5,7 +5,7 @@ const state = {
spiderList: [],
// active spider data
spiderForm: { _id: {} },
spiderForm: {},
// node to deploy/run
activeNode: {},
@@ -77,6 +77,11 @@ const actions = {
dispatch('getSpiderList')
})
},
updateSpiderEnvs ({ state }) {
return request.post(`/spiders/${state.spiderForm._id}/update_envs`, {
envs: JSON.stringify(state.spiderForm.envs)
})
},
getSpiderData ({ state, commit }, id) {
return request.get(`/spiders/${id}`)
.then(response => {
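
Pieced together with the request helper shown earlier, the `updateSpiderEnvs` action added above boils down to a single POST. The sketch below shows the equivalent plain-axios call with a placeholder spider id and variable; it is illustrative only, not code from this commit.

```js
import axios from 'axios'

// Equivalent of store.dispatch('spider/updateSpiderEnvs') for one env variable;
// `envs` is sent as a JSON string, mirroring JSON.stringify(state.spiderForm.envs)
axios.post('http://localhost:8000/api/spiders/some-spider-id/update_envs', {
  envs: JSON.stringify([{ name: 'MONGO_HOST', value: '127.0.0.1' }])
})
```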

View File

@@ -1,32 +0,0 @@
<template>
<div class="dashboard-container">
<div class="dashboard-text">name:{{ name }}</div>
<div class="dashboard-text">roles:<span v-for="role in roles" :key="role">{{ role }}</span></div>
</div>
</template>
<script>
import { mapGetters } from 'vuex'
export default {
name: 'Dashboard',
computed: {
...mapGetters([
'name',
'roles'
])
}
}
</script>
<style rel="stylesheet/scss" lang="scss" scoped>
.dashboard {
&-container {
margin: 30px;
}
&-text {
font-size: 30px;
line-height: 46px;
}
}
</style>

View File

@@ -1,84 +0,0 @@
<template>
<div class="app-container">
<el-form ref="form" :model="form" label-width="120px">
<el-form-item label="Activity name">
<el-input v-model="form.name"/>
</el-form-item>
<el-form-item label="Activity zone">
<el-select v-model="form.region" placeholder="please select your zone">
<el-option label="Zone one" value="shanghai"/>
<el-option label="Zone two" value="beijing"/>
</el-select>
</el-form-item>
<el-form-item label="Activity time">
<el-col :span="11">
<el-date-picker v-model="form.date1" type="date" placeholder="Pick a date" style="width: 100%;"/>
</el-col>
<el-col :span="2" class="line">-</el-col>
<el-col :span="11">
<el-time-picker v-model="form.date2" type="fixed-time" placeholder="Pick a time" style="width: 100%;"/>
</el-col>
</el-form-item>
<el-form-item label="Instant delivery">
<el-switch v-model="form.delivery"/>
</el-form-item>
<el-form-item label="Activity type">
<el-checkbox-group v-model="form.type">
<el-checkbox label="Online activities" name="type"/>
<el-checkbox label="Promotion activities" name="type"/>
<el-checkbox label="Offline activities" name="type"/>
<el-checkbox label="Simple brand exposure" name="type"/>
</el-checkbox-group>
</el-form-item>
<el-form-item label="Resources">
<el-radio-group v-model="form.resource">
<el-radio label="Sponsor"/>
<el-radio label="Venue"/>
</el-radio-group>
</el-form-item>
<el-form-item label="Activity form">
<el-input v-model="form.desc" type="textarea"/>
</el-form-item>
<el-form-item>
<el-button type="primary" @click="onSubmit">Create</el-button>
<el-button @click="onCancel">Cancel</el-button>
</el-form-item>
</el-form>
</div>
</template>
<script>
export default {
data () {
return {
form: {
name: '',
region: '',
date1: '',
date2: '',
delivery: false,
type: [],
resource: '',
desc: ''
}
}
},
methods: {
onSubmit () {
this.$message('submit!')
},
onCancel () {
this.$message({
message: 'cancel!',
type: 'warning'
})
}
}
}
</script>
<style scoped>
.line{
text-align: center;
}
</style>

View File

@@ -1,7 +0,0 @@
<template >
<div style="padding:30px;">
<el-alert :closable="false" title="menu 1">
<router-view />
</el-alert>
</div>
</template>

View File

@@ -1,7 +0,0 @@
<template >
<div style="padding:30px;">
<el-alert :closable="false" title="menu 1-1" type="success">
<router-view />
</el-alert>
</div>
</template>

View File

@@ -1,7 +0,0 @@
<template>
<div style="padding:30px;">
<el-alert :closable="false" title="menu 1-2" type="success">
<router-view />
</el-alert>
</div>
</template>

View File

@@ -1,5 +0,0 @@
<template functional>
<div style="padding:30px;">
<el-alert :closable="false" title="menu 1-2-1" type="warning" />
</div>
</template>

View File

@@ -1,5 +0,0 @@
<template functional>
<div style="padding:30px;">
<el-alert :closable="false" title="menu 1-2-2" type="warning" />
</div>
</template>

View File

@@ -1,5 +0,0 @@
<template functional>
<div style="padding:30px;">
<el-alert :closable="false" title="menu 1-3" type="success" />
</div>
</template>

View File

@@ -1,5 +0,0 @@
<template>
<div style="padding:30px;">
<el-alert :closable="false" title="menu 2" />
</div>
</template>

View File

@@ -16,6 +16,9 @@
<el-tab-pane :label="$t('Files')" name="files">
<file-list/>
</el-tab-pane>
<el-tab-pane :label="$t('Environment')" name="environment">
<environment-list/>
</el-tab-pane>
</el-tabs>
</div>
</template>
@@ -26,10 +29,12 @@ import {
} from 'vuex'
import FileList from '../../components/FileList/FileList'
import SpiderOverview from '../../components/Overview/SpiderOverview'
import EnvironmentList from '../../components/Environment/EnvironmentList'
export default {
name: 'NodeDetail',
components: {
EnvironmentList,
FileList,
SpiderOverview
},

View File

@@ -1,78 +0,0 @@
<template>
<div class="app-container">
<el-table
v-loading="listLoading"
:data="list"
element-loading-text="Loading"
border
fit
highlight-current-row>
<el-table-column align="center" label="ID" width="95">
<template slot-scope="scope">
{{ scope.$index }}
</template>
</el-table-column>
<el-table-column label="Title">
<template slot-scope="scope">
{{ scope.row.title }}
</template>
</el-table-column>
<el-table-column label="Author" width="110" align="center">
<template slot-scope="scope">
<span>{{ scope.row.author }}</span>
</template>
</el-table-column>
<el-table-column label="Pageviews" width="110" align="center">
<template slot-scope="scope">
{{ scope.row.pageviews }}
</template>
</el-table-column>
<el-table-column class-name="status-col" label="Status" width="110" align="center">
<template slot-scope="scope">
<el-tag :type="scope.row.status | statusFilter">{{ scope.row.status }}</el-tag>
</template>
</el-table-column>
<el-table-column align="center" prop="created_at" label="Display_time" width="200">
<template slot-scope="scope">
<i class="el-icon-time"/>
<span>{{ scope.row.display_time }}</span>
</template>
</el-table-column>
</el-table>
</div>
</template>
<script>
import { getList } from '@/api/table'
export default {
filters: {
statusFilter (status) {
const statusMap = {
published: 'success',
draft: 'gray',
deleted: 'danger'
}
return statusMap[status]
}
},
data () {
return {
list: null,
listLoading: true
}
},
created () {
this.fetchData()
},
methods: {
fetchData () {
this.listLoading = true
getList(this.listQuery).then(response => {
this.list = response.data.items
this.listLoading = false
})
}
}
}
</script>

View File

@@ -1,77 +0,0 @@
<template>
<div class="app-container">
<el-input v-model="filterText" placeholder="Filter keyword" style="margin-bottom:30px;"/>
<el-tree
ref="tree2"
:data="data2"
:props="defaultProps"
:filter-node-method="filterNode"
class="filter-tree"
default-expand-all
/>
</div>
</template>
<script>
export default {
data () {
return {
filterText: '',
data2: [{
id: 1,
label: 'Level one 1',
children: [{
id: 4,
label: 'Level two 1-1',
children: [{
id: 9,
label: 'Level three 1-1-1'
}, {
id: 10,
label: 'Level three 1-1-2'
}]
}]
}, {
id: 2,
label: 'Level one 2',
children: [{
id: 5,
label: 'Level two 2-1'
}, {
id: 6,
label: 'Level two 2-2'
}]
}, {
id: 3,
label: 'Level one 3',
children: [{
id: 7,
label: 'Level two 3-1'
}, {
id: 8,
label: 'Level two 3-2'
}]
}],
defaultProps: {
children: 'children',
label: 'label'
}
}
},
watch: {
filterText (val) {
this.$refs.tree2.filter(val)
}
},
methods: {
filterNode (value, data) {
if (!value) return true
return data.label.indexOf(value) !== -1
}
}
}
</script>

View File

@@ -1,66 +0,0 @@
amqp==2.4.1
aniso8601==4.1.0
APScheduler==3.5.3
asn1crypto==0.24.0
attrs==18.2.0
Automat==0.7.0
Babel==2.6.0
billiard==3.5.0.5
celery==4.2.1
certifi==2018.11.29
cffi==1.11.5
chardet==3.0.4
Click==7.0
constantly==15.1.0
cryptography==2.5
cssselect==1.0.3
Django==2.1.7
django-cors-headers==2.4.0
dnspython==1.16.0
docopt==0.6.2
eventlet==0.24.1
Flask==1.0.2
Flask-Cors==3.0.7
Flask-RESTful==0.3.7
Flask-Uploads==0.2.1
flower==0.9.2
gerapy==0.8.5
greenlet==0.4.15
gunicorn==19.9.0
hyperlink==18.0.0
idna==2.8
incremental==17.5.0
itsdangerous==1.1.0
Jinja2==2.10
kombu==4.3.0
lxml==4.3.1
MarkupSafe==1.1.0
mongoengine==0.16.3
monotonic==1.5
parsel==1.5.1
pyasn1==0.4.5
pyasn1-modules==0.2.4
pycparser==2.19
PyDispatcher==2.0.5
PyHamcrest==1.9.0
pymongo==3.7.2
PyMySQL==0.9.3
pyOpenSSL==19.0.0
python-scrapyd-api==2.1.2
pytz==2018.9
queuelib==1.5.0
redis==3.1.0
requests==2.21.0
Scrapy==1.6.0
scrapy-redis==0.6.8
scrapy-splash==0.7.2
service-identity==18.1.0
six==1.12.0
tornado==5.1.1
Twisted==18.9.0
tzlocal==1.5.1
urllib3==1.24.1
vine==1.2.0
w3lib==1.20.0
Werkzeug==0.14.1
zope.interface==4.6.0

View File

@@ -11,7 +11,7 @@ const MongoClient = require('mongodb').MongoClient;
const page = await browser.newPage();
// open database connection
const client = await MongoClient.connect('mongodb://192.168.99.100:27017');
const client = await MongoClient.connect('mongodb://127.0.0.1:27017');
let db = await client.db('crawlab_test');
const colName = process.env.CRAWLAB_COLLECTION || 'results';
const col = db.collection(colName);

View File

@@ -53,7 +53,7 @@ const MongoClient = require('mongodb').MongoClient;
});
// open database connection
const client = await MongoClient.connect('mongodb://192.168.99.100:27017');
const client = await MongoClient.connect('mongodb://127.0.0.1:27017');
let db = await client.db('crawlab_test');
const colName = process.env.CRAWLAB_COLLECTION || 'results_juejin';
const taskId = process.env.CRAWLAB_TASK_ID;

View File

@@ -8,17 +8,16 @@ import os
from pymongo import MongoClient
MONGO_HOST = '192.168.99.100'
MONGO_PORT = 27017
MONGO_DB = 'crawlab_test'
MONGO_HOST = os.environ['MONGO_HOST']
MONGO_PORT = os.environ['MONGO_PORT']
MONGO_DB = os.environ['MONGO_DB']
print(MONGO_HOST)
class JuejinPipeline(object):
mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT)
db = mongo[MONGO_DB]
col_name = os.environ.get('CRAWLAB_COLLECTION')
if not col_name:
col_name = 'test'
col_name = os.environ.get('CRAWLAB_COLLECTION','test')
col = db[col_name]
def process_item(self, item, spider):

View File

@@ -0,0 +1,2 @@
from scrapy import cmdline
cmdline.execute(["scrapy","crawl","juejin_spider"])

View File

@@ -52,8 +52,10 @@ const MongoClient = require('mongodb').MongoClient;
});
// open database connection
const client = await MongoClient.connect('mongodb://192.168.99.100:27017');
let db = await client.db('crawlab_test');
console.log(process.env.MONGO_HOST);
console.log(process.env.MONGO_PORT);
const client = await MongoClient.connect(`mongodb://${process.env.MONGO_HOST}:${process.env.MONGO_PORT}`);
let db = await client.db(process.env.MONGO_DB);
const colName = process.env.CRAWLAB_COLLECTION || 'results_juejin';
const taskId = process.env.CRAWLAB_TASK_ID;
const col = db.collection(colName);
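
For local runs where the platform does not inject the MONGO_* variables, a fallback-friendly variant of the same connection block could look like this; the default values are assumptions that mirror the 127.0.0.1 host used elsewhere in this diff, not behaviour defined by this commit.

```js
const MongoClient = require('mongodb').MongoClient;

(async () => {
  // Fall back to a local MongoDB when MONGO_* variables are not set
  const host = process.env.MONGO_HOST || '127.0.0.1';
  const port = process.env.MONGO_PORT || '27017';
  const dbName = process.env.MONGO_DB || 'crawlab_test';

  const client = await MongoClient.connect(`mongodb://${host}:${port}`);
  const db = await client.db(dbName);
  const col = db.collection(process.env.CRAWLAB_COLLECTION || 'results_juejin');
  // ... insert scraped items into `col` here, then close the connection
  await client.close();
})();
```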

View File

@@ -51,7 +51,7 @@ const MongoClient = require('mongodb').MongoClient;
});
// open database connection
const client = await MongoClient.connect('mongodb://192.168.99.100:27017');
const client = await MongoClient.connect('mongodb://127.0.0.1:27017');
let db = await client.db('crawlab_test');
const colName = process.env.CRAWLAB_COLLECTION || 'results_segmentfault';
const taskId = process.env.CRAWLAB_TASK_ID;