Mirror of https://github.com/crawlab-team/crawlab.git
Fix configurable spider failing to parse URLs that start with "//"
@@ -2,11 +2,14 @@
 import scrapy
 import re
 from config_spider.items import Item
-from urllib.parse import urljoin
+from urllib.parse import urljoin, urlparse

 def get_real_url(response, url):
-    if re.search(r'^https?|^\/\/', url):
+    if re.search(r'^https?', url):
         return url
+    elif re.search(r'^\/\/', url):
+        u = urlparse(response.url)
+        return u.scheme + ':' + url
     return urljoin(response.url, url)

 class ConfigSpider(scrapy.Spider):
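Below is a minimal standalone sketch (not part of the commit) of how the patched get_real_url behaves for the three URL shapes it distinguishes: absolute, protocol-relative ("//"-prefixed), and relative. The FakeResponse class and the example URLs are hypothetical stand-ins; in Crawlab the first argument is a scrapy Response object, of which only the .url attribute is used here.

import re
from urllib.parse import urljoin, urlparse

def get_real_url(response, url):
    # Absolute URL: return it unchanged.
    if re.search(r'^https?', url):
        return url
    # Protocol-relative URL ("//host/path"): reuse the scheme of the page URL.
    elif re.search(r'^\/\/', url):
        u = urlparse(response.url)
        return u.scheme + ':' + url
    # Relative path: resolve against the page URL.
    return urljoin(response.url, url)

class FakeResponse:
    # Hypothetical stand-in for scrapy.http.Response; only .url is needed.
    def __init__(self, url):
        self.url = url

resp = FakeResponse('https://example.com/list?page=1')
print(get_real_url(resp, 'https://other.example.com/a'))  # -> https://other.example.com/a
print(get_real_url(resp, '//cdn.example.com/img.png'))    # -> https://cdn.example.com/img.png
print(get_real_url(resp, '/detail/42'))                   # -> https://example.com/detail/42

Because the scheme is taken from the page being parsed, protocol-relative links scraped from an https page resolve to https and those from an http page resolve to http.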