From c86a6dda9cbe75bce02ebbcf8528efc302fcc9bf Mon Sep 17 00:00:00 2001
From: marvzhang
Date: Thu, 30 Jan 2020 17:36:38 +0800
Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E5=8F=AF=E9=85=8D=E7=BD=AE?=
 =?UTF-8?q?=E7=88=AC=E8=99=AB=E6=97=A0=E6=B3=95=E8=A7=A3=E6=9E=90=20"//"?=
 =?UTF-8?q?=20=E6=89=93=E5=A4=B4=E7=9A=84URL?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Review note: the "//" branch now returns scheme + ':' + url. As submitted it
concatenated the scheme and the URL without the colon, producing invalid URLs
such as "https//host/path". The post-image blob id on the index line below
was not recomputed after this amendment.

 backend/template/scrapy/config_spider/spiders/spider.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/backend/template/scrapy/config_spider/spiders/spider.py b/backend/template/scrapy/config_spider/spiders/spider.py
index 0e3c661d..d87f4297 100644
--- a/backend/template/scrapy/config_spider/spiders/spider.py
+++ b/backend/template/scrapy/config_spider/spiders/spider.py
@@ -2,11 +2,15 @@
 import scrapy
 import re
 from config_spider.items import Item
-from urllib.parse import urljoin
+from urllib.parse import urljoin, urlparse
 
 def get_real_url(response, url):
-    if re.search(r'^https?|^\/\/', url):
+    if re.search(r'^https?', url):
         return url
+    elif re.search(r'^\/\/', url):
+        # Scheme-relative URL: borrow the scheme of the page being parsed.
+        u = urlparse(response.url)
+        return u.scheme + ':' + url
     return urljoin(response.url, url)
 
 class ConfigSpider(scrapy.Spider):