Merge remote-tracking branch 'origin/develop' into develop

Marvin Zhang
2019-05-28 14:20:53 +08:00
5 changed files with 75 additions and 3 deletions

View File

@@ -13,7 +13,7 @@ import tasks.spider
 import tasks.deploy
 
 if __name__ == '__main__':
-    if 'win' in sys.platform:
+    if 'win32' in sys.platform:
         celery_app.start(argv=['tasks', 'worker', '-P', 'eventlet', '-E', '-l', 'INFO'])
     else:
         celery_app.start(argv=['tasks', 'worker', '-E', '-l', 'INFO'])
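
The tightened check matters more than it looks: macOS reports sys.platform == 'darwin', and 'win' is a substring of 'darwin', so the old test sent Macs down the Windows/eventlet branch. A minimal illustration (not part of the commit):

import sys

# sys.platform is 'win32' on Windows, 'darwin' on macOS, 'linux' on Linux.
# 'win' in 'darwin' is True, which is why the old check misfired on macOS;
# 'win32' (or an exact comparison) only matches Windows.
if sys.platform == 'win32':
    print('Windows: use the eventlet worker pool')
else:
    print('POSIX: the default prefork pool is fine')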

View File

@@ -479,7 +479,9 @@ class SpiderApi(BaseApi):
         }, 400
 
         try:
-            r = requests.get(spider['start_url'])
+            r = requests.get(spider['start_url'], headers={
+                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
+            })
         except Exception as err:
             return {
                 'status': 'ok',
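
Some servers reject requests' default python-requests/x.y user agent, which is presumably what this browser UA works around. If the same header is needed at several call sites, one option is a shared requests.Session so the string lives in one place; a sketch (the names below are illustrative, not from the repo):

import requests

DEFAULT_HEADERS = {
    'user-agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/74.0.3729.169 Safari/537.36')
}

session = requests.Session()
session.headers.update(DEFAULT_HEADERS)  # every session.get() now sends the UA

r = session.get('https://example.com', timeout=10)  # hypothetical call site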

View File

@@ -1,4 +1,7 @@
 import json
+import os
+import signal
+import sys
 from datetime import datetime
 
 import requests
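
Importing the module rather than the name matters on Windows: signal.SIGKILL is only defined on POSIX, so a top-level `from signal import SIGKILL` raises ImportError there before any runtime platform check can run. A quick illustration:

import signal
import sys

# signal.SIGKILL exists only on POSIX; resolving the attribute lazily keeps
# the module importable on Windows, where importing the name directly fails.
if sys.platform != 'win32':
    print(signal.SIGKILL)  # 9 on Linux and macOS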
@@ -189,10 +192,21 @@ class TaskApi(BaseApi):
         :param id:
         :return:
         """
+        task = db_manager.get('tasks', id=id)
         celery_app.control.revoke(id, terminate=True)
         db_manager.update_one('tasks', id=id, values={
             'status': TaskStatus.REVOKED
         })
+
+        # kill process
+        if task.get('pid'):
+            pid = task.get('pid')
+            if 'win32' in sys.platform:
+                os.popen('taskkill /f /pid ' + str(pid))
+            else:
+                # unix system
+                os.kill(pid, signal.SIGKILL)
+
         return {
             'id': id,
             'status': 'ok',
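
One edge the hunk leaves open: if the spider process has already exited, os.kill raises ProcessLookupError and the request fails even though the task was already marked REVOKED. A best-effort helper in the same pattern, sketched with that case handled (the helper name is hypothetical):

import os
import signal
import subprocess
import sys

def kill_task_process(pid: int) -> None:
    # best-effort, cross-platform termination of a spider process
    if sys.platform == 'win32':
        # /f forces termination; check=False tolerates 'no such process'
        subprocess.run(['taskkill', '/f', '/pid', str(pid)], check=False)
    else:
        try:
            os.kill(pid, signal.SIGKILL)  # SIGKILL cannot be caught or ignored
        except ProcessLookupError:
            pass  # process already gone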

View File

@@ -0,0 +1,55 @@
+import itertools
+import re
+from typing import Iterator
+
+
+def generate_urls(base_url: str) -> Iterator[str]:
+    url = base_url
+
+    # number ranges, e.g. {1,10}
+    list_arr = []
+    for i, res in enumerate(re.findall(r'{(\d+),(\d+)}', base_url)):
+        try:
+            _min = int(res[0])
+            _max = int(res[1])
+        except ValueError:
+            raise ValueError(f'{base_url} is not a valid URL pattern')
+
+        # list of values for this placeholder
+        _list = range(_min, _max + 1)
+
+        # key
+        _key = f'n{i}'
+
+        # append list and key
+        list_arr.append((_list, _key))
+
+        # replace url placeholder with key
+        url = url.replace('{' + res[0] + ',' + res[1] + '}', '{' + _key + '}', 1)
+
+    # string lists, e.g. [a,b,c]
+    for i, res in enumerate(re.findall(r'\[([\w\-,]+)\]', base_url)):
+        # list
+        _list = res.split(',')
+
+        # key
+        _key = f's{i}'
+
+        # append list and key
+        list_arr.append((_list, _key))
+
+        # replace url placeholder with key
+        url = url.replace('[' + ','.join(_list) + ']', '{' + _key + '}', 1)
+
+    # emit the cartesian product of all placeholder values
+    for res in itertools.product(*map(lambda x: x[0], list_arr)):
+        _url = url
+        for _arr, _rep in zip(list_arr, res):
+            _list, _key = _arr
+            _url = _url.replace('{' + _key + '}', str(_rep), 1)
+        yield _url
+
+
+# base_url = 'http://[baidu,ali].com/page-{1,10}-[1,2,3]'
+# for url in generate_urls(base_url):
+#     print(url)
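
As a worked check of the commented example: {1,10} becomes key n0 with values 1..10, [baidu,ali] becomes s0, and [1,2,3] becomes s1, so the pattern expands to 10 × 2 × 3 = 60 URLs, with the last placeholder varying fastest:

urls = list(generate_urls('http://[baidu,ali].com/page-{1,10}-[1,2,3]'))
print(len(urls))   # 60
print(urls[0])     # http://baidu.com/page-1-1
print(urls[1])     # http://baidu.com/page-1-2
print(urls[-1])    # http://ali.com/page-10-3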

View File

@@ -63,7 +63,8 @@ async def request_site_home_page(url: str, semophore):
 
 async def run():
     semaphore = asyncio.Semaphore(50)  # cap concurrency at 50
-    sites = [site for site in col.find({'rank': {'$lte': 5000}})]
+    # sites = [site for site in col.find({'rank': {'$lte': 5000}})]
+    sites = [site for site in col.find({'rank': {'$lte': 100}})]
    urls = [site['_id'] for site in sites]
     to_get = [request_site(url, semaphore) for url in urls]
     to_get += [request_site_home_page(url, semaphore) for url in urls]
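
The bodies of request_site and request_site_home_page sit outside this hunk; the usual shape of a semaphore-bounded fetch, sketched here with aiohttp (an assumption, as are the session handling and names, which are not from this file):

import asyncio
import aiohttp

async def fetch(session: aiohttp.ClientSession, url: str,
                semaphore: asyncio.Semaphore) -> str:
    # at most 50 coroutines pass this point at once, per Semaphore(50) above
    async with semaphore:
        async with session.get(url) as resp:
            return await resp.text()

async def main(urls):
    semaphore = asyncio.Semaphore(50)
    # one shared session amortizes connection setup across all requests
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(fetch(session, u, semaphore) for u in urls))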