From 2a5b5c038c40e53fe5102571f579fb38c32e9a2f Mon Sep 17 00:00:00 2001
From: Marvin Zhang
Date: Wed, 22 May 2019 13:16:45 +0800
Subject: [PATCH] cleanup requirements.txt

---
 crawlab/requirements.txt                   | 1 -
 spiders/sites_inspector/sites_inspector.py | 8 ++++----
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/crawlab/requirements.txt b/crawlab/requirements.txt
index 901a56ca..68bf4ac3 100644
--- a/crawlab/requirements.txt
+++ b/crawlab/requirements.txt
@@ -57,7 +57,6 @@ pytz==2018.9
 queuelib==1.5.0
 redis==3.2.1
 redisbeat==1.1.4
-reppy==0.4.12
 requests==2.21.0
 Scrapy==1.6.0
 selenium==3.141.0
diff --git a/spiders/sites_inspector/sites_inspector.py b/spiders/sites_inspector/sites_inspector.py
index 80c9c60a..ae425abc 100644
--- a/spiders/sites_inspector/sites_inspector.py
+++ b/spiders/sites_inspector/sites_inspector.py
@@ -22,7 +22,7 @@ col = db['sites']
 async def process_response(resp, **kwargs):
     url = kwargs.get('url')
     status = resp.status  # read the response status
-    if status == 200:
+    if status == 200 and ('robots.txt' in str(resp.url)):
         col.update({'_id': url}, {'$set': {'has_robots': True}})
     else:
         # error status
@@ -38,7 +38,7 @@ async def process_home_page_response(resp, **kwargs):
 
 async def request_site(url: str, semaphore):
     _url = 'http://' + url + '/robots.txt'
-    print('crawling ' + _url)
+    # print('crawling ' + _url)
     async with semaphore:
         async with aiohttp.ClientSession() as session:  # <1> open a session
             async with session.get(_url) as resp:  # send the request
@@ -50,7 +50,7 @@ async def request_site(url: str, semaphore):
 
 async def request_site_home_page(url: str, semophore):
     _url = 'http://' + url
-    print('crawling ' + _url)
+    # print('crawling ' + _url)
     async with semophore:
         tic = datetime.now()
         async with aiohttp.ClientSession() as session:  # <1> open a session
@@ -63,7 +63,7 @@ async def request_site_home_page(url: str, semophore):
 
 async def run():
     semaphore = asyncio.Semaphore(50)  # limit concurrency to 50
-    sites = [site for site in col.find({'rank': {'$lte': 5000}})]
+    sites = [site for site in col.find({'rank': {'$lte': 100}})]
     urls = [site['_id'] for site in sites]
     to_get = [request_site(url, semaphore) for url in urls]
     to_get += [request_site_home_page(url, semaphore) for url in urls]
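
Note on the spider hunks above: both request helpers fan many aiohttp requests out concurrently and rely on a shared asyncio.Semaphore to bound the number in flight (50 in run()). Below is a minimal, self-contained sketch of that pattern, not the project's actual module; the target URLs and the smaller cap of 5 are hypothetical, chosen only for illustration:

    import asyncio
    import aiohttp

    async def fetch(session: aiohttp.ClientSession, url: str,
                    semaphore: asyncio.Semaphore) -> int:
        # The semaphore caps how many requests are in flight at once.
        async with semaphore:
            async with session.get(url) as resp:  # send the request
                return resp.status

    async def main():
        semaphore = asyncio.Semaphore(5)  # cap concurrency at 5
        # Hypothetical targets; the real spider builds its URL list from MongoDB.
        urls = ['http://example.com/robots.txt',
                'http://example.org/robots.txt']
        async with aiohttp.ClientSession() as session:  # one shared session
            statuses = await asyncio.gather(
                *(fetch(session, url, semaphore) for url in urls))
        print(statuses)

    asyncio.run(main())

Unlike the patched functions, which open a new ClientSession per request, the sketch shares one session across all requests; that reuses aiohttp's connection pool and is the generally recommended usage.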