Fix bugs: parse obey_robots_txt as bool; resolve relative detail-page URLs against the spider's start_url

2026-01-22 17:31:03 +01:00 · 2019-05-29 19:34:51 +08:00
parent a37965338c
commit dc7e6eda70
1 changed file with 7 additions and 3 deletions
--- a/crawlab/routes/spiders.py
+++ b/crawlab/routes/spiders.py
@@ -4,6 +4,7 @@ import shutil
 import subprocess
 from datetime import datetime
 from random import random
+from urllib.parse import urlparse

 import gevent
 import requests
@@ -101,7 +102,7 @@ class SpiderApi(BaseApi):
        ('pagination_selector_type', str),

        # whether to obey robots.txt
-        ('obey_robots_txt', str),
+        ('obey_robots_txt', bool),
    )

    def get(self, id=None, action=None):
@@ -523,7 +524,8 @@ class SpiderApi(BaseApi):
        tags = []
        for tag in sel.iter():
            if tag.tag == 'a':
-                if tag.get('href') is not None and not tag.get('href').startswith('#') and not tag.get('href').startswith('javascript'):
+                if tag.get('href') is not None and not tag.get('href').startswith('#') and not tag.get(
+                        'href').startswith('javascript'):
                    tags.append(tag)

        return tags
@@ -566,6 +568,9 @@ class SpiderApi(BaseApi):
                    if f.get('is_detail'):
                        url = d.get(f['name'])
                        if url is not None:
+                            if not url.startswith('http') and not url.startswith('//'):
+                                u = urlparse(spider['start_url'])
+                                url = f'{u.scheme}://{u.netloc}{url}'
                            ev_list.append(gevent.spawn(get_detail_page_data, url, spider, idx, data))
                        break

@@ -657,7 +662,6 @@ class SpiderApi(BaseApi):
                        'query': f'{tag.tag}.{cls_str}',
                    })

-
        return {
            'status': 'ok',
            'item_selector': item_selector,