crawlab/backend/template/spiders/sites_inspector/sites_inspector.py

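"""Sites inspector spider (Crawlab spider template).

For every site stored in the MongoDB ``sites`` collection, this script
concurrently checks whether the site serves a robots.txt file and measures
the HTTP status and response time of its home page, then writes the results
back to the same collection.
"""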
import asyncio
import os
from datetime import datetime
import aiohttp
import requests
from pymongo import MongoClient
# MONGO_HOST = os.environ['MONGO_HOST']
# MONGO_PORT = int(os.environ['MONGO_PORT'])
# MONGO_DB = os.environ['MONGO_DB']
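# Hard-coded local settings (the environment-based configuration above is commented out)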
MONGO_HOST = 'localhost'
MONGO_PORT = 27017
MONGO_DB = 'crawlab_test'
mongo = MongoClient(host=MONGO_HOST, port=MONGO_PORT)
db = mongo[MONGO_DB]
col = db['sites']
async def process_response(resp, **kwargs):
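    """Record whether the site at ``url`` exposes a robots.txt file."""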
    url = kwargs.get('url')
    status = resp.status  # read the response status
    if status == 200 and ('robots.txt' in str(resp.url)):
        # update_one() replaces the deprecated Collection.update()
        col.update_one({'_id': url}, {'$set': {'has_robots': True}})
    else:
        # non-200 status, or the request was redirected away from robots.txt
        col.update_one({'_id': url}, {'$set': {'has_robots': False}})
async def process_home_page_response(resp, **kwargs):
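    """Record the HTTP status and response time of the site's home page."""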
    url = kwargs.get('url')
    duration = kwargs.get('duration')
    status = resp.status  # read the response status
    col.update_one({'_id': url}, {'$set': {'home_http_status': status, 'home_response_time': duration}})
async def request_site(url: str, semaphore):
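    """Fetch http://<url>/robots.txt and store the result via process_response."""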
    _url = 'http://' + url + '/robots.txt'
    # print('crawling ' + _url)
    async with semaphore:
        async with aiohttp.ClientSession() as session:  # open a session
            async with session.get(_url) as resp:  # send the request
                await process_response(resp=resp, url=url)
                print('crawled ' + _url)
    # resp = requests.get(_url)
    return resp
async def request_site_home_page(url: str, semaphore):
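    """Fetch the site's home page, time the request, and store the result."""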
    _url = 'http://' + url
    # print('crawling ' + _url)
    async with semaphore:
        tic = datetime.now()
        async with aiohttp.ClientSession() as session:  # open a session
            async with session.get(_url) as resp:  # send the request
                toc = datetime.now()
                duration = (toc - tic).total_seconds()
                await process_home_page_response(resp=resp, url=url, duration=duration)
                print('crawled ' + _url)
async def run():
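    """Inspect robots.txt and home pages of the top-ranked sites concurrently."""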
    semaphore = asyncio.Semaphore(50)  # limit concurrency to 50
    # sites = [site for site in col.find({'rank': {'$lte': 5000}})]
    sites = [site for site in col.find({'rank': {'$lte': 100}})]
    urls = [site['_id'] for site in sites]
    to_get = [request_site(url, semaphore) for url in urls]
    to_get += [request_site_home_page(url, semaphore) for url in urls]
    await asyncio.gather(*to_get)  # asyncio.wait() no longer accepts bare coroutines on newer Python versions
if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    loop.run_until_complete(run())
    loop.close()