# Mirror of https://github.com/crawlab-team/crawlab.git
# Synced 2026-01-24 17:41:03 +01:00
# 78 lines, 2.5 KiB, Python
import asyncio
import os
import time
from datetime import datetime

import aiohttp
import requests
from pymongo import MongoClient
|
|
|
|
# MongoDB connection settings. Each value may be overridden through the
# environment variable of the same name; when a variable is unset the
# local test defaults below are used.
MONGO_HOST = os.environ.get('MONGO_HOST', 'localhost')
MONGO_PORT = int(os.environ.get('MONGO_PORT', 27017))
MONGO_DB = os.environ.get('MONGO_DB', 'crawlab_test')

# Module-level client, database and collection handles shared by every
# coroutine in this file.
mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT)
db = mongo[MONGO_DB]
col = db['sites']  # documents are addressed by `_id` throughout this file
|
|
|
|
|
|
async def process_response(resp, **kwargs):
    """Record whether a site served a robots.txt file.

    Sets ``has_robots`` on the ``sites`` document whose ``_id`` equals the
    ``url`` keyword argument. The flag is ``True`` only when the response
    status is 200 and the response URL contains ``robots.txt``; every other
    outcome is stored as ``False``.

    :param resp: response object exposing ``status`` and ``url``.
    :param kwargs: expected to carry ``url``, the site's ``_id``.
    """
    url = kwargs.get('url')
    status = resp.status  # read the HTTP status
    # NOTE(review): the URL check presumably rejects requests that were
    # redirected away from robots.txt -- confirm.
    has_robots = status == 200 and 'robots.txt' in str(resp.url)
    # Collection.update() was deprecated in PyMongo 3.0 and removed in 4.0;
    # update_one() is the equivalent single-document call.
    col.update_one({'_id': url}, {'$set': {'has_robots': has_robots}})
|
|
|
|
|
|
async def process_home_page_response(resp, **kwargs):
    """Store the home page's HTTP status and response time for a site.

    Writes ``home_http_status`` and ``home_response_time`` onto the ``sites``
    document whose ``_id`` equals the ``url`` keyword argument.

    :param resp: response object exposing ``status``.
    :param kwargs: expected to carry ``url`` (the site's ``_id``) and
        ``duration`` (the measured response time supplied by the caller).
    """
    url = kwargs.get('url')
    duration = kwargs.get('duration')
    status = resp.status  # read the HTTP status
    # Collection.update() was deprecated in PyMongo 3.0 and removed in 4.0;
    # update_one() is the equivalent single-document call.
    col.update_one(
        {'_id': url},
        {'$set': {'home_http_status': status, 'home_response_time': duration}},
    )
|
|
|
|
|
|
async def request_site(url: str, semaphore):
    """Request ``http://<url>/robots.txt`` and pass the response on.

    The request is made while holding ``semaphore``. The response is handed
    to ``process_response`` together with the bare ``url``, and a
    ``crawled ...`` line is printed afterwards.

    :param url: site host without scheme; also used as the document ``_id``.
    :param semaphore: async context manager limiting concurrent requests.
    :return: the response object, returned after its context has exited.
    """
    robots_url = ''.join(('http://', url, '/robots.txt'))
    async with semaphore:
        # Open a session and send the request in one combined context.
        async with aiohttp.ClientSession() as client, client.get(robots_url) as resp:
            await process_response(resp=resp, url=url)
            print('crawled ' + robots_url)
    return resp
|
|
|
|
|
|
async def request_site_home_page(url: str, semophore):
    """Request ``http://<url>`` and record its status and response time.

    The elapsed time, in seconds, runs from just after the semaphore is
    acquired until ``session.get()`` yields the response, so it includes
    session creation. The response, ``url`` and the duration are handed to
    ``process_home_page_response``.

    :param url: site host without scheme; also used as the document ``_id``.
    :param semophore: async context manager limiting concurrent requests
        (the misspelled name is kept for caller compatibility).
    """
    _url = 'http://' + url
    async with semophore:
        # perf_counter() is monotonic, so the measured duration cannot be
        # skewed by wall-clock adjustments the way datetime.now() can.
        tic = time.perf_counter()
        async with aiohttp.ClientSession() as session:  # open a session
            async with session.get(_url) as resp:  # send the request
                duration = time.perf_counter() - tic
                await process_home_page_response(resp=resp, url=url, duration=duration)
                print('crawled ' + _url)
|
|
|
|
|
|
async def run(max_rank=100, concurrency=50):
    """Check robots.txt and the home page of every sufficiently ranked site.

    Loads the ``_id`` of each ``sites`` document whose ``rank`` is at most
    ``max_rank``, then runs ``request_site`` and ``request_site_home_page``
    for every one of them, sharing a single semaphore.

    :param max_rank: inclusive upper bound on the ``rank`` field.
    :param concurrency: maximum number of requests in flight at once.
    """
    semaphore = asyncio.Semaphore(concurrency)  # cap the number of concurrent requests
    urls = [site['_id'] for site in col.find({'rank': {'$lte': max_rank}})]

    to_get = [request_site(url, semaphore) for url in urls]
    to_get += [request_site_home_page(url, semaphore) for url in urls]

    # asyncio.wait() rejects bare coroutines on Python 3.11+ and raises
    # ValueError for an empty collection; gather() handles both cases.
    # return_exceptions=True keeps one failing site from aborting the rest.
    results = await asyncio.gather(*to_get, return_exceptions=True)

    # Results come back in submission order: all robots.txt requests first,
    # then all home-page requests, each in `urls` order.
    for target, result in zip(urls + urls, results):
        if isinstance(result, BaseException):
            print('failed {}: {!r}'.format(target, result))
|
|
|
|
|
|
if __name__ == '__main__':
    # Create the loop explicitly: asyncio.get_event_loop() no longer creates
    # one implicitly on recent Python versions when none is running.
    loop = asyncio.new_event_loop()
    try:
        loop.run_until_complete(run())
    finally:
        # Close the loop even when run() raises.
        loop.close()
|