import json
import os
import shutil
import subprocess
from datetime import datetime
from random import random
from urllib.parse import urlparse

import gevent
import requests
from bson import ObjectId
from flask import current_app, request
from flask_restful import reqparse, Resource
from lxml import etree
from werkzeug.datastructures import FileStorage

from config import PROJECT_DEPLOY_FILE_FOLDER, PROJECT_SOURCE_FILE_FOLDER, PROJECT_TMP_FOLDER
from constants.node import NodeStatus
from constants.spider import SpiderType, CrawlType, QueryType, ExtractType
from constants.task import TaskStatus
from db.manager import db_manager
from routes.base import BaseApi
from tasks.scheduler import scheduler
from tasks.spider import execute_spider, execute_config_spider
from utils import jsonify
from utils.deploy import zip_file, unzip_file
from utils.file import get_file_suffix_stats, get_file_suffix
from utils.spider import get_lang_by_stats, get_last_n_run_errors_count, get_last_n_day_tasks_count, get_list_page_data, \
    get_detail_page_data, generate_urls

# parser for file uploads (used by deploy_file)
parser = reqparse.RequestParser()
parser.add_argument('file', type=FileStorage, location='files')

# directories to skip when scanning the spider source folder
IGNORE_DIRS = [
    '.idea'
]


class SpiderApi(BaseApi):
    col_name = 'spiders'

    arguments = (
        # name of spider
        ('name', str),

        # execute shell command
        ('cmd', str),

        # spider source folder
        ('src', str),

        # spider type
        ('type', str),

        # spider language
        ('lang', str),

        # spider results collection
        ('col', str),

        # spider schedule cron
        ('cron', str),

        # spider schedule cron enabled
        ('cron_enabled', int),

        # spider environment variables (JSON string)
        ('envs', str),

        # spider site
        ('site', str),

        ########################
        # Configurable Spider
        ########################

        # spider crawl fields for list page
        ('fields', str),

        # spider crawl fields for detail page
        ('detail_fields', str),

        # spider crawl type
        ('crawl_type', str),

        # spider start url
        ('start_url', str),

        # url pattern: support generation of urls with patterns
        ('url_pattern', str),

        # spider item selector
        ('item_selector', str),

        # spider item selector type
        ('item_selector_type', str),

        # spider pagination selector
        ('pagination_selector', str),

        # spider pagination selector type
        ('pagination_selector_type', str),

        # whether to obey robots.txt
        ('obey_robots_txt', bool),

        # item threshold to filter out non-relevant list items
        ('item_threshold', int),
    )

    def get(self, id=None, action=None):
        """
        GET method of SpiderAPI.
        :param id: spider_id
        :param action: action
        """
        # action by id
        if action is not None:
            if not hasattr(self, action):
                return {
                    'status': 'ok',
                    'code': 400,
                    'error': 'action "%s" invalid' % action
                }, 400
            return getattr(self, action)(id)

        # get one spider
        elif id is not None:
            spider = db_manager.get('spiders', id=id)

            # get last deploy
            last_deploy = db_manager.get_last_deploy(spider_id=spider['_id'])
            if last_deploy is not None:
                spider['deploy_ts'] = last_deploy['finish_ts']

            return jsonify(spider)

        # get a list of items
        else:
            items = []

            # get customized spiders
            dirs = os.listdir(PROJECT_SOURCE_FILE_FOLDER)
            for _dir in dirs:
                if _dir in IGNORE_DIRS:
                    continue

                dir_path = os.path.join(PROJECT_SOURCE_FILE_FOLDER, _dir)
                dir_name = _dir
                spider = db_manager.get_one_by_key('spiders', key='src', value=dir_path)

                # new spider
                if spider is None:
                    stats = get_file_suffix_stats(dir_path)
                    lang = get_lang_by_stats(stats)
                    spider = db_manager.save('spiders', {
                        'name': dir_name,
                        'src': dir_path,
                        'lang': lang,
                        'suffix_stats': stats,
                        'type': SpiderType.CUSTOMIZED
                    })

                # existing spider
                else:
                    # get last deploy
                    last_deploy = db_manager.get_last_deploy(spider_id=spider['_id'])
                    if last_deploy is not None:
                        spider['deploy_ts'] = last_deploy['finish_ts']

                    # file stats
                    stats = get_file_suffix_stats(dir_path)

                    # language
                    lang = get_lang_by_stats(stats)

                    # spider type
                    type_ = SpiderType.CUSTOMIZED

                    # update spider data
                    db_manager.update_one('spiders', id=str(spider['_id']), values={
                        'lang': lang,
                        'type': type_,
                        'suffix_stats': stats,
                    })

                # append spider
                items.append(spider)

            # get configurable spiders
            for spider in db_manager.list('spiders', {'type': SpiderType.CONFIGURABLE}):
                # append spider
                items.append(spider)

            # get other info
            for i in range(len(items)):
                spider = items[i]

                # get site
                if spider.get('site') is not None:
                    site = db_manager.get('sites', spider['site'])
                    if site is not None:
                        items[i]['site_name'] = site['name']

                # get last task
                last_task = db_manager.get_last_task(spider_id=spider['_id'])
                if last_task is not None:
                    items[i]['task_ts'] = last_task['create_ts']

                # ---------
                # stats
                # ---------
                # errors in the last 5 runs
                items[i]['last_5_errors'] = get_last_n_run_errors_count(spider_id=spider['_id'], n=5)
                # tasks in the last 7 days
                items[i]['last_7d_tasks'] = get_last_n_day_tasks_count(spider_id=spider['_id'], n=7)

            return {
                'status': 'ok',
                'items': jsonify(items)
            }
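
    # A rough sketch of the list response assembled above (field values are
    # illustrative, not taken from a real deployment):
    #
    #   {
    #       "status": "ok",
    #       "items": [
    #           {
    #               "name": "my_spider",
    #               "src": "/path/to/spiders/my_spider",
    #               "lang": "python",
    #               "type": SpiderType.CUSTOMIZED,
    #               "deploy_ts": "...",     # from the latest deploy, if any
    #               "task_ts": "...",       # from the latest task, if any
    #               "site_name": "...",     # resolved from the 'sites' collection
    #               "last_5_errors": 0,
    #               "last_7d_tasks": 0
    #           }
    #       ]
    #   }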

    def crawl(self, id: str) -> (dict, tuple):
        """
        Submit an HTTP request to start a crawl task on the node of the given spider_id.
        @deprecated
        :param id: spider_id
        """
        args = self.parser.parse_args()
        node_id = args.get('node_id')

        if node_id is None:
            return {
                'code': 400,
                'status': 'ok',
                'error': 'node_id cannot be empty'
            }, 400

        # get node from db
        node = db_manager.get('nodes', id=node_id)

        # validate ip and port
        if node.get('ip') is None or node.get('port') is None:
            return {
                'code': 400,
                'status': 'ok',
                'error': 'node ip and port should not be empty'
            }, 400

        # dispatch crawl task
        res = requests.get('http://%s:%s/api/spiders/%s/on_crawl?node_id=%s' % (
            node.get('ip'),
            node.get('port'),
            id,
            node_id
        ))
        data = json.loads(res.content.decode('utf-8'))
        return {
            'code': res.status_code,
            'status': 'ok',
            'error': data.get('error'),
            'task': data.get('task')
        }
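
    # The dispatch above simply forwards the request to the target node, e.g.
    # (host and port are illustrative):
    #
    #   GET http://192.168.0.2:5000/api/spiders/<spider_id>/on_crawl?node_id=<node_id>
    #
    # which is handled by on_crawl() below on that node.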

    def on_crawl(self, id: str) -> (dict, tuple):
        """
        Start a crawl task.
        :param id: spider_id
        :return:
        """
        args = self.parser.parse_args()
        params = args.get('params')

        spider = db_manager.get('spiders', id=ObjectId(id))

        # determine execute function
        if spider['type'] == SpiderType.CONFIGURABLE:
            # configurable spider
            exec_func = execute_config_spider
        else:
            # customized spider
            exec_func = execute_spider

        # trigger an asynchronous job
        job = exec_func.delay(id, params)

        # create a new task
        db_manager.save('tasks', {
            '_id': job.id,
            'spider_id': ObjectId(id),
            'cmd': spider.get('cmd'),
            'params': params,
            'create_ts': datetime.utcnow(),
            'status': TaskStatus.PENDING
        })

        return {
            'code': 200,
            'status': 'ok',
            'task': {
                'id': job.id,
                'status': job.status
            }
        }
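
    # on_crawl() only enqueues the job (Celery-style exec_func.delay) and records
    # a task document keyed by the job id; the worker side is expected to update
    # the task status later. A sketch of the saved document (values illustrative):
    #
    #   {
    #       "_id": "<async job id>",
    #       "spider_id": ObjectId("..."),
    #       "cmd": "scrapy crawl something",   # illustrative command
    #       "params": None,
    #       "create_ts": datetime.utcnow(),
    #       "status": TaskStatus.PENDING
    #   }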

    def deploy(self, id: str) -> (dict, tuple):
        """
        Submit HTTP requests to deploy the given spider to all online nodes.
        :param id:
        :return:
        """
        spider = db_manager.get('spiders', id=id)
        nodes = db_manager.list('nodes', {'status': NodeStatus.ONLINE})

        for node in nodes:
            node_id = node['_id']

            output_file_name = '%s_%s.zip' % (
                datetime.now().strftime('%Y%m%d%H%M%S'),
                str(random())[2:12]
            )
            output_file_path = os.path.join(PROJECT_TMP_FOLDER, output_file_name)

            # zip source folder to zip file
            zip_file(source_dir=spider['src'],
                     output_filename=output_file_path)

            # upload to api
            with open(output_file_path, 'rb') as f:
                files = {'file': f}
                r = requests.post('http://%s:%s/api/spiders/%s/deploy_file?node_id=%s' % (
                    node.get('ip'),
                    node.get('port'),
                    id,
                    node_id,
                ), files=files)

            # TODO: checkpoint for errors

        return {
            'code': 200,
            'status': 'ok',
            'message': 'deploy success'
        }
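
    # The per-node upload above is equivalent to posting the zipped source as a
    # multipart field named 'file' (host, port and paths are illustrative):
    #
    #   curl -X POST \
    #        -F "file=@/tmp/20190101000000_1234567890.zip" \
    #        "http://192.168.0.2:5000/api/spiders/<spider_id>/deploy_file?node_id=<node_id>"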

    def deploy_file(self, id: str = None) -> (dict, tuple):
        """
        Receive an uploaded deploy package (zip), unzip it and copy it to the destination directory.
        :param id: spider_id
        """
        args = parser.parse_args()
        node_id = request.args.get('node_id')
        f = args.file

        if get_file_suffix(f.filename) != 'zip':
            return {
                'status': 'ok',
                'error': 'file type mismatch'
            }, 400

        # save zip file on temp folder
        file_path = '%s/%s' % (PROJECT_TMP_FOLDER, f.filename)
        with open(file_path, 'wb') as fw:
            fw.write(f.stream.read())

        # unzip zip file
        dir_path = file_path.replace('.zip', '')
        if os.path.exists(dir_path):
            shutil.rmtree(dir_path)
        unzip_file(file_path, dir_path)

        # get spider and version
        spider = db_manager.get(col_name=self.col_name, id=id)
        if spider is None:
            return None, 400

        # make source / destination
        src = os.path.join(dir_path, os.listdir(dir_path)[0])
        dst = os.path.join(PROJECT_DEPLOY_FILE_FOLDER, str(spider.get('_id')))

        # logging info
        current_app.logger.info('src: %s' % src)
        current_app.logger.info('dst: %s' % dst)

        # remove if the target folder exists
        if os.path.exists(dst):
            shutil.rmtree(dst)

        # copy from source to destination
        shutil.copytree(src=src, dst=dst)

        # save to db
        # TODO: task management for deployment
        db_manager.save('deploys', {
            'spider_id': ObjectId(id),
            'node_id': node_id,
            'finish_ts': datetime.utcnow()
        })

        return {
            'code': 200,
            'status': 'ok',
            'message': 'deploy success'
        }
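
    # deploy_file() unpacks the upload into PROJECT_TMP_FOLDER and then replaces
    # PROJECT_DEPLOY_FILE_FOLDER/<spider_id> with the single top-level directory
    # found inside the archive; the zip is therefore expected to contain exactly
    # one root folder (implied by the os.listdir(dir_path)[0] lookup above).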

    def get_deploys(self, id: str) -> (dict, tuple):
        """
        Get a list of the latest deploys of the given spider_id.
        :param id: spider_id
        """
        items = db_manager.list('deploys', cond={'spider_id': ObjectId(id)}, limit=10, sort_key='finish_ts')
        deploys = []
        for item in items:
            spider_id = item['spider_id']
            spider = db_manager.get('spiders', id=str(spider_id))
            item['spider_name'] = spider['name']
            deploys.append(item)
        return {
            'status': 'ok',
            'items': jsonify(deploys)
        }

    def get_tasks(self, id: str) -> (dict, tuple):
        """
        Get a list of the latest tasks of the given spider_id.
        :param id:
        """
        items = db_manager.list('tasks', cond={'spider_id': ObjectId(id)}, limit=10, sort_key='create_ts')
        for item in items:
            spider_id = item['spider_id']
            spider = db_manager.get('spiders', id=str(spider_id))
            item['spider_name'] = spider['name']
            if item.get('status') is None:
                item['status'] = TaskStatus.UNAVAILABLE
        return {
            'status': 'ok',
            'items': jsonify(items)
        }

    def after_update(self, id: str = None) -> None:
        """
        After a spider is updated, update the cron scheduler accordingly.
        :param id: spider_id
        """
        scheduler.update()

    def update_envs(self, id: str):
        """
        Update environment variables.
        :param id: spider_id
        """
        args = self.parser.parse_args()
        envs = json.loads(args.envs)
        db_manager.update_one(col_name='spiders', id=id, values={'envs': envs})
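
    # The 'envs' argument arrives as a JSON-encoded string and is stored as the
    # decoded value; the exact entry schema is whatever the caller (front end)
    # sends. For instance, a payload like '[{"name": "FOO", "value": "bar"}]'
    # would be stored as a list of dicts. That shape is illustrative only and is
    # not defined in this module.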

    def update_fields(self, id: str):
        """
        Update list page fields for configurable spiders.
        :param id: spider_id
        """
        args = self.parser.parse_args()
        fields = json.loads(args.fields)
        db_manager.update_one(col_name='spiders', id=id, values={'fields': fields})

    def update_detail_fields(self, id: str):
        """
        Update detail page fields for configurable spiders.
        :param id: spider_id
        """
        args = self.parser.parse_args()
        detail_fields = json.loads(args.detail_fields)
        db_manager.update_one(col_name='spiders', id=id, values={'detail_fields': detail_fields})

    @staticmethod
    def _get_html(spider) -> etree.Element:
        """
        Fetch the spider's start_url and return the parsed HTML tree.
        On error, an (error dict, status code) tuple is returned instead, which
        callers detect by checking for a tuple.
        """
        if spider['type'] != SpiderType.CONFIGURABLE:
            return {
                'status': 'ok',
                'error': 'type %s is invalid' % spider['type']
            }, 400

        if spider.get('start_url') is None:
            return {
                'status': 'ok',
                'error': 'start_url should not be empty'
            }, 400

        try:
            r = None
            # only the first generated url is fetched for preview purposes
            for url in generate_urls(spider['start_url']):
                r = requests.get(url, headers={
                    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
                })
                break
        except Exception:
            return {
                'status': 'ok',
                'error': 'connection error'
            }, 500

        if not r:
            return {
                'status': 'ok',
                'error': 'response is not returned'
            }, 500

        if r.status_code != 200:
            return {
                'status': 'ok',
                'error': 'status code is not 200, but %s' % r.status_code
            }, r.status_code

        # get html parse tree
        try:
            sel = etree.HTML(r.content.decode('utf-8'))
        except Exception:
            sel = etree.HTML(r.content)

        # remove unnecessary tags
        unnecessary_tags = [
            'script'
        ]
        for t in unnecessary_tags:
            etree.strip_tags(sel, t)

        return sel

    @staticmethod
    def _get_children(sel):
        # element children only, skipping comment nodes
        return [tag for tag in sel.getchildren() if type(tag) != etree._Comment]

    @staticmethod
    def _get_text_child_tags(sel):
        # descendant tags that carry non-empty text (comments excluded)
        tags = []
        for tag in sel.iter():
            if type(tag) != etree._Comment and tag.text is not None and tag.text.strip() != '':
                tags.append(tag)
        return tags

    @staticmethod
    def _get_a_child_tags(sel):
        # descendant <a> tags with a real href (no anchors or javascript: links)
        tags = []
        for tag in sel.iter():
            if tag.tag == 'a':
                href = tag.get('href')
                if href is not None and not href.startswith('#') and not href.startswith('javascript'):
                    tags.append(tag)
        return tags

    @staticmethod
    def _get_next_page_tag(sel):
        # texts commonly used for "next page" links (Chinese and English)
        next_page_text_list = [
            '下一页',
            '下页',
            'next page',
            'next',
            '>'
        ]
        for tag in sel.iter():
            if tag.text is not None and tag.text.lower().strip() in next_page_text_list:
                return tag
        return None
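
    # Example: for markup like <a href="/list?page=2">下一页</a> ("next page"),
    # _get_next_page_tag() returns that <a> element, which extract_fields() then
    # turns into a pagination selector.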

    def preview_crawl(self, id: str):
        spider = db_manager.get(col_name='spiders', id=id)

        # get html parse tree
        sel = self._get_html(spider)

        # when error happens, return
        if type(sel) == tuple:
            return sel

        # parse fields
        if spider['crawl_type'] == CrawlType.LIST:
            if spider.get('item_selector') is None:
                return {
                    'status': 'ok',
                    'error': 'item_selector should not be empty'
                }, 400

            data = get_list_page_data(spider, sel)[:10]

            return {
                'status': 'ok',
                'items': data
            }

        elif spider['crawl_type'] == CrawlType.DETAIL:
            # TODO: detail page preview
            pass

        elif spider['crawl_type'] == CrawlType.LIST_DETAIL:
            data = get_list_page_data(spider, sel)[:10]

            ev_list = []
            for idx, d in enumerate(data):
                for f in spider['fields']:
                    if f.get('is_detail'):
                        url = d.get(f['name'])
                        if url is not None:
                            # make relative urls absolute against start_url
                            if not url.startswith('http') and not url.startswith('//'):
                                u = urlparse(spider['start_url'])
                                url = f'{u.scheme}://{u.netloc}{url}'
                            ev_list.append(gevent.spawn(get_detail_page_data, url, spider, idx, data))
                        break

            gevent.joinall(ev_list)

            return {
                'status': 'ok',
                'items': data
            }
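
    # For LIST_DETAIL previews, detail pages are fetched concurrently with
    # gevent: one greenlet per list row that has a detail-link field
    # (is_detail). get_detail_page_data() receives idx and data so it can merge
    # the detail fields back into the corresponding row before the combined
    # rows are returned.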

    def extract_fields(self, id: str):
        """
        Extract list fields from a web page.
        :param id:
        :return:
        """
        spider = db_manager.get(col_name='spiders', id=id)

        # get html parse tree
        sel = self._get_html(spider)

        # when error happens, return
        if type(sel) == tuple:
            return sel

        list_tag_list = []
        threshold = spider.get('item_threshold') or 10
        # iterate all child nodes in a top-down direction
        for tag in sel.iter():
            # get child tags
            child_tags = self._get_children(tag)

            if len(child_tags) < threshold:
                # if number of child tags is below threshold, skip
                continue
            else:
                # enough child tags (>= threshold)
                child_tags_set = set(map(lambda x: x.tag, child_tags))

                # if there are more than 1 tag names, skip
                if len(child_tags_set) > 1:
                    continue

                # add as list tag
                list_tag_list.append(tag)

        # find the list tag with the most child text tags
        max_tag = None
        max_num = 0
        for tag in list_tag_list:
            _child_text_tags = self._get_text_child_tags(self._get_children(tag)[0])
            if len(_child_text_tags) > max_num:
                max_tag = tag
                max_num = len(_child_text_tags)

        # get list item selector
        item_selector = None
        item_selector_type = 'css'
        if max_tag is not None:
            if max_tag.get('id') is not None:
                item_selector = f'#{max_tag.get("id")} > {self._get_children(max_tag)[0].tag}'
            elif max_tag.get('class') is not None:
                cls_str = '.'.join([x for x in max_tag.get("class").split(' ') if x != ''])
                if len(sel.cssselect(f'.{cls_str}')) == 1:
                    item_selector = f'.{cls_str} > {self._get_children(max_tag)[0].tag}'
            else:
                item_selector = max_tag.getroottree().getpath(max_tag)
                item_selector_type = 'xpath'

        # get list fields
        fields = []
        if item_selector is not None:
            first_tag = self._get_children(max_tag)[0]
            for i, tag in enumerate(self._get_text_child_tags(first_tag)):
                el_list = first_tag.cssselect(f'{tag.tag}')
                if len(el_list) == 1:
                    fields.append({
                        'name': f'field{i + 1}',
                        'type': 'css',
                        'extract_type': 'text',
                        'query': f'{tag.tag}',
                    })
                elif tag.get('class') is not None:
                    cls_str = '.'.join([x for x in tag.get("class").split(' ') if x != ''])
                    if len(tag.cssselect(f'{tag.tag}.{cls_str}')) == 1:
                        fields.append({
                            'name': f'field{i + 1}',
                            'type': 'css',
                            'extract_type': 'text',
                            'query': f'{tag.tag}.{cls_str}',
                        })
                else:
                    for j, el in enumerate(el_list):
                        if tag == el:
                            fields.append({
                                'name': f'field{i + 1}',
                                'type': 'css',
                                'extract_type': 'text',
                                'query': f'{tag.tag}:nth-of-type({j + 1})',
                            })

            for i, tag in enumerate(self._get_a_child_tags(self._get_children(max_tag)[0])):
                # if the tag is <a...></a>, extract its href
                if tag.get('class') is not None:
                    cls_str = '.'.join([x for x in tag.get("class").split(' ') if x != ''])
                    fields.append({
                        'name': f'field{i + 1}_url',
                        'type': 'css',
                        'extract_type': 'attribute',
                        'attribute': 'href',
                        'query': f'{tag.tag}.{cls_str}',
                    })

        # get pagination tag
        pagination_selector = None
        pagination_tag = self._get_next_page_tag(sel)
        if pagination_tag is not None:
            if pagination_tag.get('id') is not None:
                pagination_selector = f'#{pagination_tag.get("id")}'
            elif pagination_tag.get('class') is not None and len(sel.cssselect(f'.{pagination_tag.get("class")}')) == 1:
                pagination_selector = f'.{pagination_tag.get("class")}'

        return {
            'status': 'ok',
            'item_selector': item_selector,
            'item_selector_type': item_selector_type,
            'pagination_selector': pagination_selector,
            'fields': fields
        }
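
    # A sketch of what extract_fields() returns for a typical list page
    # (selectors and field names are illustrative):
    #
    #   {
    #       "status": "ok",
    #       "item_selector": ".article-list > li",
    #       "item_selector_type": "css",
    #       "pagination_selector": ".next-page",
    #       "fields": [
    #           {"name": "field1", "type": "css", "extract_type": "text", "query": "h2"},
    #           {"name": "field1_url", "type": "css", "extract_type": "attribute",
    #            "attribute": "href", "query": "a.title"}
    #       ]
    #   }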


class SpiderImportApi(Resource):
    __doc__ = """
    API for importing spiders from external resources including GitHub, GitLab, and Subversion (WIP)
    """
    parser = reqparse.RequestParser()
    arguments = [
        ('url', str)
    ]

    def __init__(self):
        super(SpiderImportApi, self).__init__()
        for arg, type_ in self.arguments:
            self.parser.add_argument(arg, type=type_)

    def post(self, platform: str = None) -> (dict, tuple):
        if platform is None:
            return {
                'status': 'ok',
                'code': 404,
                'error': 'platform invalid'
            }, 404

        if not hasattr(self, platform):
            return {
                'status': 'ok',
                'code': 400,
                'error': 'platform "%s" invalid' % platform
            }, 400

        return getattr(self, platform)()

    def github(self) -> (dict, tuple):
        """
        Import spiders from GitHub.
        """
        return self._git()

    def gitlab(self) -> (dict, tuple):
        """
        Import spiders from GitLab.
        """
        return self._git()

    def _git(self):
        """
        Helper method to perform a git import (essentially a "git clone" into the source folder).
        """
        args = self.parser.parse_args()
        url = args.get('url')
        if url is None:
            return {
                'status': 'ok',
                'code': 400,
                'error': 'url should not be empty'
            }, 400

        try:
            p = subprocess.Popen(['git', 'clone', url], cwd=PROJECT_SOURCE_FILE_FOLDER)
            _stdout, _stderr = p.communicate()
        except Exception as err:
            return {
                'status': 'ok',
                'code': 500,
                'error': str(err)
            }, 500

        return {
            'status': 'ok',
            'message': 'success'
        }
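
# Importing a spider from GitHub boils down to "git clone <url>" into
# PROJECT_SOURCE_FILE_FOLDER. Assuming the resource is registered under a route
# like /api/spiders/import/<platform> (registration is not part of this module),
# a request would look roughly like:
#
#   curl -X POST -d "url=https://github.com/user/repo.git" \
#        "http://localhost:8000/api/spiders/import/github"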


class SpiderManageApi(Resource):
    parser = reqparse.RequestParser()
    arguments = [
        ('url', str)
    ]

    def post(self, action: str) -> (dict, tuple):
        """
        POST method for SpiderManageAPI.
        :param action:
        """
        if not hasattr(self, action):
            return {
                'status': 'ok',
                'code': 400,
                'error': 'action "%s" invalid' % action
            }, 400

        return getattr(self, action)()

    def deploy_all(self) -> (dict, tuple):
        """
        Deploy all spiders to all online nodes.
        """
        # active nodes
        nodes = db_manager.list('nodes', {'status': NodeStatus.ONLINE})

        # all spiders with a command defined
        spiders = db_manager.list('spiders', {'cmd': {'$exists': True}})

        # iterate all nodes
        for node in nodes:
            node_id = node['_id']
            for spider in spiders:
                spider_id = spider['_id']
                spider_src = spider['src']

                output_file_name = '%s_%s.zip' % (
                    datetime.now().strftime('%Y%m%d%H%M%S'),
                    str(random())[2:12]
                )
                output_file_path = os.path.join(PROJECT_TMP_FOLDER, output_file_name)

                # zip source folder to zip file
                zip_file(source_dir=spider_src,
                         output_filename=output_file_path)

                # upload to api
                with open(output_file_path, 'rb') as f:
                    files = {'file': f}
                    r = requests.post('http://%s:%s/api/spiders/%s/deploy_file?node_id=%s' % (
                        node.get('ip'),
                        node.get('port'),
                        spider_id,
                        node_id,
                    ), files=files)

        return {
            'status': 'ok',
            'message': 'success'
        }

    def upload(self):
        f = request.files['file']

        if get_file_suffix(f.filename) != 'zip':
            return {
                'status': 'ok',
                'error': 'file type mismatch'
            }, 400

        # save zip file on temp folder
        file_path = '%s/%s' % (PROJECT_TMP_FOLDER, f.filename)
        with open(file_path, 'wb') as fw:
            fw.write(f.stream.read())

        # unzip zip file
        dir_path = file_path.replace('.zip', '')
        if os.path.exists(dir_path):
            shutil.rmtree(dir_path)
        unzip_file(file_path, dir_path)

        # copy to source folder
        output_path = os.path.join(PROJECT_SOURCE_FILE_FOLDER, f.filename.replace('.zip', ''))
        current_app.logger.info('output_path: %s' % output_path)
        if os.path.exists(output_path):
            shutil.rmtree(output_path)
        shutil.copytree(dir_path, output_path)

        return {
            'status': 'ok',
            'message': 'success'
        }