Fix configurable spider failing to resolve URLs starting with "//"

marvzhang committed 2020-01-30 17:36:38 +08:00
parent a685ef55ef
commit c86a6dda9c


@@ -2,11 +2,14 @@
 import scrapy
 import re
 from config_spider.items import Item
-from urllib.parse import urljoin
+from urllib.parse import urljoin, urlparse
 
 def get_real_url(response, url):
-    if re.search(r'^https?|^\/\/', url):
+    if re.search(r'^https?', url):
         return url
+    elif re.search(r'^\/\/', url):
+        u = urlparse(response.url)
+        return u.scheme + url
     return urljoin(response.url, url)
 
 class ConfigSpider(scrapy.Spider):
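For reference, the sketch below shows how the helper reads after this change, written out as a standalone, commented snippet. One caveat worth noting: urlparse(response.url).scheme returns the scheme without a trailing colon ("https", not "https:"), so concatenating u.scheme + url as in the diff above yields "https//example.com/...". If the intent is a fully qualified URL, the colon has to be added explicitly; the sketch does so, which is an adjustment not present in the commit itself.

# Sketch (not the repository's exact code): get_real_url after this commit,
# with ':' inserted explicitly when joining the scheme to a protocol-relative URL.
import re
from urllib.parse import urljoin, urlparse

def get_real_url(response, url):
    # Fully qualified URLs (http:// or https://) are returned unchanged.
    if re.search(r'^https?', url):
        return url
    # Protocol-relative URLs ("//example.com/path") inherit the scheme
    # of the page that linked to them.
    elif re.search(r'^\/\/', url):
        u = urlparse(response.url)
        return u.scheme + ':' + url  # the diff concatenates u.scheme + url directly
    # Everything else is resolved relative to the current page URL.
    return urljoin(response.url, url)

As an aside, urljoin already handles network-path references: urljoin('https://a.example/x', '//b.example/y') returns 'https://b.example/y', so the final urljoin fallback alone would also cover the "//" case.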