def get_real_url(response, url):
    """Resolve a link found on a page into an absolute URL.

    Args:
        response: Object exposing the page address as ``response.url``
            (presumably a Scrapy response; only ``.url`` is read here).
        url: The raw link value, which may be absolute, protocol-relative
            (``//host/path``), root-relative or document-relative.

    Returns:
        The absolute URL as a string. Absolute ``http://`` / ``https://``
        links are returned unchanged.
    """
    # Require the full "scheme://" prefix: matching only on "http" would
    # misclassify relative links such as "http-status.html" as absolute.
    if url.startswith(('http://', 'https://')):
        return url

    # Protocol-relative link: reuse the scheme of the page it was found on.
    # The ':' separator is required, otherwise the result is "https//host".
    if url.startswith('//'):
        page_scheme = urlparse(response.url).scheme
        return f'{page_scheme}:{url}'

    # Everything else is resolved against the page URL.
    return urljoin(response.url, url)