From 558b7472f8c645131d60f6b83ae479e4159dd2bf Mon Sep 17 00:00:00 2001 From: Marvin Zhang Date: Wed, 29 May 2019 19:34:51 +0800 Subject: [PATCH] fixed bugs --- crawlab/routes/spiders.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/crawlab/routes/spiders.py b/crawlab/routes/spiders.py index d4f99a3a..0bfe8e4c 100644 --- a/crawlab/routes/spiders.py +++ b/crawlab/routes/spiders.py @@ -4,6 +4,7 @@ import shutil import subprocess from datetime import datetime from random import random +from urllib.parse import urlparse import gevent import requests @@ -101,7 +102,7 @@ class SpiderApi(BaseApi): ('pagination_selector_type', str), # whether to obey robots.txt - ('obey_robots_txt', str), + ('obey_robots_txt', bool), ) def get(self, id=None, action=None): @@ -523,7 +524,8 @@ class SpiderApi(BaseApi): tags = [] for tag in sel.iter(): if tag.tag == 'a': - if tag.get('href') is not None and not tag.get('href').startswith('#') and not tag.get('href').startswith('javascript'): + if tag.get('href') is not None and not tag.get('href').startswith('#') and not tag.get( + 'href').startswith('javascript'): tags.append(tag) return tags @@ -566,6 +568,9 @@ class SpiderApi(BaseApi): if f.get('is_detail'): url = d.get(f['name']) if url is not None: + if not url.startswith('http') and not url.startswith('//'): + u = urlparse(spider['start_url']) + url = f'{u.scheme}://{u.netloc}{url}' ev_list.append(gevent.spawn(get_detail_page_data, url, spider, idx, data)) break @@ -657,7 +662,6 @@ class SpiderApi(BaseApi): 'query': f'{tag.tag}.{cls_str}', }) - return { 'status': 'ok', 'item_selector': item_selector,