Mirror of https://github.com/crawlab-team/crawlab.git (synced 2026-01-21 17:21:09 +01:00)
added generate_urls function
@@ -13,7 +13,7 @@ import tasks.spider
 import tasks.deploy

 if __name__ == '__main__':
-    if 'win' in sys.platform:
+    if 'win32' in sys.platform:
         celery_app.start(argv=['tasks', 'worker', '-P', 'eventlet', '-E', '-l', 'INFO'])
     else:
         celery_app.start(argv=['tasks', 'worker', '-E', '-l', 'INFO'])
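The platform check is tightened because `sys.platform` is 'darwin' on macOS, which contains the substring 'win'; the old test therefore selected the Windows-only eventlet pool on Macs as well, presumably kept for Windows because Celery's default prefork pool is unreliable there. A minimal sketch illustrating the difference (standard library only, nothing repo-specific):

import sys

# sys.platform is 'win32' on Windows, 'darwin' on macOS, 'linux' on Linux.
# 'win' is a substring of 'darwin', so the old check misfired on macOS:
assert 'win' in 'darwin'            # True -- macOS would have been treated as Windows
assert 'win32' not in 'darwin'      # the new check matches only real Windows

is_windows = sys.platform == 'win32'   # equivalent, slightly stricter spelling
print('windows' if is_windows else 'posix-like')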
@@ -479,7 +479,9 @@ class SpiderApi(BaseApi):
             }, 400

         try:
-            r = requests.get(spider['start_url'])
+            r = requests.get(spider['start_url'], headers={
+                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
+            })
         except Exception as err:
             return {
                 'status': 'ok',
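The default User-Agent that requests sends (python-requests/x.y.z) is blocked or served differently by some sites, so the commit now fetches start_url with a desktop Chrome user agent. A small sketch of the same idea with the header pulled out into a constant; the constant and helper names are illustrative, not from the repo:

import requests

# Illustrative constant -- the diff inlines this string instead.
DEFAULT_HEADERS = {
    'user-agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
    )
}

def fetch_start_url(start_url: str, timeout: float = 10.0) -> requests.Response:
    """Fetch a spider's start URL with a browser-like User-Agent (illustrative helper)."""
    return requests.get(start_url, headers=DEFAULT_HEADERS, timeout=timeout)

# Example:
# r = fetch_start_url('https://example.com')
# print(r.status_code)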
@@ -1,4 +1,7 @@
 import json
+import os
+import sys
+from _signal import SIGINT, SIGKILL
 from datetime import datetime

 import requests
@@ -189,10 +192,21 @@ class TaskApi(BaseApi):
         :param id:
         :return:
         """
+        task = db_manager.get('tasks', id=id)
         celery_app.control.revoke(id, terminate=True)
         db_manager.update_one('tasks', id=id, values={
             'status': TaskStatus.REVOKED
         })
+
+        # kill process
+        if task.get('pid'):
+            pid = task.get('pid')
+            if 'win32' in sys.platform:
+                os.popen('taskkill /pid:' + str(pid))
+            else:
+                # unix system
+                os.kill(pid, SIGKILL)
+
         return {
             'id': id,
             'status': 'ok',
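Revoking the Celery task does not stop a spider process that is already running, so the handler now also kills the recorded pid: taskkill via os.popen on Windows, SIGKILL via os.kill elsewhere (the commit imports SIGKILL from the private _signal module; the public signal module exposes the same constant on POSIX). A self-contained sketch of the same cross-platform kill step, using subprocess.run with explicit taskkill flags instead of os.popen; the helper name is illustrative:

import os
import signal
import subprocess
import sys

def kill_process(pid: int) -> None:
    """Force-kill a process by PID (illustrative helper, not from the repo)."""
    if sys.platform == 'win32':
        # /F forces termination, /T also terminates child processes.
        subprocess.run(['taskkill', '/F', '/T', '/PID', str(pid)], check=False)
    else:
        os.kill(pid, signal.SIGKILL)  # SIGKILL only exists on POSIX platforms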
crawlab/spiders/spiders/utils.py (new file, +50 lines)
@@ -0,0 +1,50 @@
+import itertools
+import re
+
+
+def generate_urls(base_url: str) -> str:
+    url = base_url
+
+    # number range list
+    list_arr = []
+    for i, res in enumerate(re.findall(r'{(\d+),(\d+)}', base_url)):
+        try:
+            _min = int(res[0])
+            _max = int(res[1])
+        except ValueError as err:
+            raise ValueError(f'{base_url} is not a valid URL pattern')
+
+        # list
+        _list = range(_min, _max + 1)
+
+        # key
+        _key = f'n{i}'
+
+        # append list and key
+        list_arr.append((_list, _key))
+
+        # replace url placeholder with key
+        url = url.replace('{' + res[0] + ',' + res[1] + '}', '{' + _key + '}', 1)
+
+    # string list
+    for i, res in enumerate(re.findall(r'\[(.+)\]', base_url)):
+        # list
+        _list = res.split(',')
+
+        # key
+        _key = f's{i}'
+
+        # append list and key
+        list_arr.append((_list, _key))
+
+        # replace url placeholder with key
+        url = url.replace('[' + ','.join(_list) + ']', '{' + _key + '}', 1)
+
+    # combine together
+    _list_arr = []
+    for res in itertools.product(*map(lambda x: x[0], list_arr)):
+        _url = url
+        for _arr, _rep in zip(list_arr, res):
+            _list, _key = _arr
+            _url = _url.replace('{' + _key + '}', str(_rep), 1)
+        yield _url
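generate_urls expands a URL pattern into concrete URLs: a {a,b} placeholder becomes the inclusive numeric range a..b, an [x,y,z] placeholder becomes the listed strings, and the function yields every combination of all placeholders (it is a generator, despite the -> str annotation). A usage sketch with a made-up pattern and an assumed import path for the new module:

from spiders.utils import generate_urls   # assumed import path for crawlab/spiders/spiders/utils.py

# Hypothetical pattern: one string-list placeholder and one inclusive numeric range.
pattern = 'https://example.com/[news,blog]/page/{1,3}'

for url in generate_urls(pattern):
    print(url)

# Expected output -- itertools.product order, where the later-added (string) placeholder varies fastest:
# https://example.com/news/page/1
# https://example.com/blog/page/1
# https://example.com/news/page/2
# https://example.com/blog/page/2
# https://example.com/news/page/3
# https://example.com/blog/page/3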
@@ -63,7 +63,8 @@ async def request_site_home_page(url: str, semophore):

 async def run():
     semaphore = asyncio.Semaphore(50)  # limit concurrency to 50
-    sites = [site for site in col.find({'rank': {'$lte': 5000}})]
+    # sites = [site for site in col.find({'rank': {'$lte': 5000}})]
+    sites = [site for site in col.find({'rank': {'$lte': 100}})]
     urls = [site['_id'] for site in sites]
     to_get = [request_site(url, semaphore) for url in urls]
     to_get += [request_site_home_page(url, semaphore) for url in urls]
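The benchmark is narrowed from the 5,000 highest-ranked sites to the top 100, while asyncio.Semaphore(50) keeps at most 50 requests in flight at a time. A minimal, self-contained sketch of that throttling pattern; the real script builds its URL list from a MongoDB collection (col.find) and issues real HTTP requests, both replaced with stand-ins here:

import asyncio

async def request_site(url: str, semaphore: asyncio.Semaphore) -> None:
    """Stand-in for the real fetch coroutine; only the throttling pattern matters."""
    async with semaphore:           # at most 50 coroutines get past this point at once
        await asyncio.sleep(0.1)    # placeholder for the actual HTTP request
        print('done', url)

async def run() -> None:
    semaphore = asyncio.Semaphore(50)                          # limit concurrency to 50
    urls = [f'https://example.com/{i}' for i in range(100)]    # stand-in for the MongoDB query
    await asyncio.gather(*(request_site(url, semaphore) for url in urls))

if __name__ == '__main__':
    asyncio.run(run())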