Clean up requirements.txt

2026-01-22 17:31:03 +01:00 · 2019-05-22 13:16:45 +08:00
parent 366ec7d857
commit d545dc2697
2 changed files with 4 additions and 5 deletions
--- a/crawlab/requirements.txt
+++ b/crawlab/requirements.txt
@@ -57,7 +57,6 @@ pytz==2018.9
 queuelib==1.5.0
 redis==3.2.1
 redisbeat==1.1.4
-reppy==0.4.12
 requests==2.21.0
 Scrapy==1.6.0
 selenium==3.141.0
--- a/spiders/sites_inspector/sites_inspector.py
+++ b/spiders/sites_inspector/sites_inspector.py
@@ -22,7 +22,7 @@ col = db['sites']
 async def process_response(resp, **kwargs):
    url = kwargs.get('url')
    status = resp.status  # 读取状态
-    if status == 200:
+    if status == 200 and ('robots.txt' in str(resp.url)):
        col.update({'_id': url}, {'$set': {'has_robots': True}})
    else:
        # 错误状态
@@ -38,7 +38,7 @@ async def process_home_page_response(resp, **kwargs):

 async def request_site(url: str, semaphore):
    _url = 'http://' + url + '/robots.txt'
-    print('crawling ' + _url)
+    # print('crawling ' + _url)
    async with semaphore:
        async with aiohttp.ClientSession() as session:  # <1> 开启一个会话
            async with session.get(_url) as resp:  # 发送请求
@@ -50,7 +50,7 @@ async def request_site(url: str, semaphore):

 async def request_site_home_page(url: str, semophore):
    _url = 'http://' + url
-    print('crawling ' + _url)
+    # print('crawling ' + _url)
    async with semophore:
        tic = datetime.now()
        async with aiohttp.ClientSession() as session:  # <1> 开启一个会话
@@ -63,7 +63,7 @@ async def request_site_home_page(url: str, semophore):

 async def run():
    semaphore = asyncio.Semaphore(50)  # 限制并发量为50
-    sites = [site for site in col.find({'rank': {'$lte': 5000}})]
+    sites = [site for site in col.find({'rank': {'$lte': 100}})]
    urls = [site['_id'] for site in sites]
    to_get = [request_site(url, semaphore) for url in urls]
    to_get += [request_site_home_page(url, semaphore) for url in urls]