From 2a5b5c038c40e53fe5102571f579fb38c32e9a2f Mon Sep 17 00:00:00 2001
From: Marvin Zhang
Date: Wed, 22 May 2019 13:16:45 +0800
Subject: [PATCH] cleanup requirements.txt

---
 crawlab/requirements.txt                   | 1 -
 spiders/sites_inspector/sites_inspector.py | 8 ++++----
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/crawlab/requirements.txt b/crawlab/requirements.txt
index 901a56ca..68bf4ac3 100644
--- a/crawlab/requirements.txt
+++ b/crawlab/requirements.txt
@@ -57,7 +57,6 @@ pytz==2018.9
 queuelib==1.5.0
 redis==3.2.1
 redisbeat==1.1.4
-reppy==0.4.12
 requests==2.21.0
 Scrapy==1.6.0
 selenium==3.141.0
diff --git a/spiders/sites_inspector/sites_inspector.py b/spiders/sites_inspector/sites_inspector.py
index 80c9c60a..ae425abc 100644
--- a/spiders/sites_inspector/sites_inspector.py
+++ b/spiders/sites_inspector/sites_inspector.py
@@ -22,7 +22,7 @@ col = db['sites']
 async def process_response(resp, **kwargs):
     url = kwargs.get('url')
     status = resp.status  # read the response status
-    if status == 200:
+    if status == 200 and ('robots.txt' in str(resp.url)):
         col.update({'_id': url}, {'$set': {'has_robots': True}})
     else:
         # error status
@@ -38,7 +38,7 @@ async def process_home_page_response(resp, **kwargs):
 
 async def request_site(url: str, semaphore):
     _url = 'http://' + url + '/robots.txt'
-    print('crawling ' + _url)
+    # print('crawling ' + _url)
     async with semaphore:
         async with aiohttp.ClientSession() as session:  # <1> open a session
             async with session.get(_url) as resp:  # send the request
@@ -50,7 +50,7 @@ async def request_site(url: str, semaphore):
 
 async def request_site_home_page(url: str, semophore):
     _url = 'http://' + url
-    print('crawling ' + _url)
+    # print('crawling ' + _url)
     async with semophore:
         tic = datetime.now()
         async with aiohttp.ClientSession() as session:  # <1> open a session
@@ -63,7 +63,7 @@ async def request_site_home_page(url: str, semophore):
 
 async def run():
     semaphore = asyncio.Semaphore(50)  # limit concurrency to 50
-    sites = [site for site in col.find({'rank': {'$lte': 5000}})]
+    sites = [site for site in col.find({'rank': {'$lte': 100}})]
     urls = [site['_id'] for site in sites]
     to_get = [request_site(url, semaphore) for url in urls]
     to_get += [request_site_home_page(url, semaphore) for url in urls]
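
Note on the spider hunks above: both request helpers fan many aiohttp requests out concurrently and rely on a shared asyncio.Semaphore to bound the number in flight (50 in run()). Below is a minimal, self-contained sketch of that pattern, not the project's actual module; the target URLs and the smaller cap of 5 are hypothetical, chosen only for illustration:

    import asyncio
    import aiohttp

    async def fetch(session: aiohttp.ClientSession, url: str,
                    semaphore: asyncio.Semaphore) -> int:
        # The semaphore caps how many requests are in flight at once.
        async with semaphore:
            async with session.get(url) as resp:  # send the request
                return resp.status

    async def main():
        semaphore = asyncio.Semaphore(5)  # cap concurrency at 5
        # Hypothetical targets; the real spider builds its URL list from MongoDB.
        urls = ['http://example.com/robots.txt',
                'http://example.org/robots.txt']
        async with aiohttp.ClientSession() as session:  # one shared session
            statuses = await asyncio.gather(
                *(fetch(session, url, semaphore) for url in urls))
        print(statuses)

    asyncio.run(main())

Unlike the patched functions, which open a new ClientSession per request, the sketch shares one session across all requests; that reuses aiohttp's connection pool and is the generally recommended usage.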