allow generating URLs by pattern

Marvin Zhang
2019-05-28 14:59:21 +08:00
parent dec76bbca9
commit f710d327b1
5 changed files with 88 additions and 14 deletions

View File

@@ -25,7 +25,7 @@ from utils import jsonify
 from utils.deploy import zip_file, unzip_file
 from utils.file import get_file_suffix_stats, get_file_suffix
 from utils.spider import get_lang_by_stats, get_last_n_run_errors_count, get_last_n_day_tasks_count, get_list_page_data, \
-    get_detail_page_data
+    get_detail_page_data, generate_urls

 parser = reqparse.RequestParser()
 parser.add_argument('file', type=FileStorage, location='files')
@@ -85,6 +85,9 @@ class SpiderApi(BaseApi):
         # spider start url
         ('start_url', str),

+        # url pattern: supports generating URLs from a pattern
+        ('url_pattern', str),
+
         # spider item selector
         ('item_selector', str),
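The accepted placeholders are those implemented by the new generate_urls helper (see the utils diff below): {min,max} expands to an inclusive numeric range and [a,b,...] to a comma-separated string list. A hypothetical example value:

# Hypothetical pattern combining both placeholder forms; it expands to
# .../news/page/1 through .../blog/page/10 (20 URLs in total).
url_pattern = 'http://example.com/[news,blog]/page/{1,10}'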
@@ -479,20 +482,29 @@ class SpiderApi(BaseApi):
             }, 400
         try:
-            r = requests.get(spider['start_url'], headers={
-                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
-            })
+            r = None
+            for url in generate_urls(spider['start_url']):
+                r = requests.get(url, headers={
+                    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
+                })
+                break
         except Exception as err:
             return {
                 'status': 'ok',
                 'error': 'connection error'
             }, 500
-        if r.status_code != 200:
+        if not r:
             return {
-                'status': 'ok',
-                'error': 'status code is not 200, but %s' % r.status_code
-            }
+                'status': 'ok',
+                'error': 'response is not returned'
+            }, 500
+        if r and r.status_code != 200:
+            return {
+                'status': 'ok',
+                'error': 'status code is not 200, but %s' % r.status_code
+            }, r.status_code

         # get html parse tree
         sel = etree.HTML(r.content)
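Note that the loop above breaks after the first request, so the preview check only ever fetches the first expanded URL. A minimal equivalent sketch, assuming generate_urls always yields at least one URL (user-agent header omitted for brevity):

# Fetch only the first expanded URL to test connectivity.
first_url = next(generate_urls(spider['start_url']), None)
r = requests.get(first_url) if first_url else None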

View File

@@ -1,10 +1,15 @@
 # -*- coding: utf-8 -*-
 import os
 import sys
 from urllib.parse import urlparse

 import scrapy

 from spiders.db import spider
 from spiders.items import SpidersItem
+from spiders.utils import generate_urls

 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..')))


 def get_detail_url(item):
@@ -75,8 +80,10 @@ def get_next_url(response):
 class ConfigSpiderSpider(scrapy.Spider):
     name = 'config_spider'
     # allowed_domains = []
-    start_urls = [spider['start_url']]
+
+    def start_requests(self):
+        for url in generate_urls(spider['start_url']):
+            yield scrapy.Request(url=url)

     def parse(self, response):
@@ -91,7 +98,7 @@ class ConfigSpiderSpider(scrapy.Spider):
             yield scrapy.Request(url=next_url)
         elif spider['crawl_type'] == 'detail':
-            # TODO: detail page onlny
+            # TODO: detail page only
             # detail page only
             pass
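Replacing the static start_urls with start_requests means a patterned start_url now fans out into one scrapy.Request per expanded URL. A hypothetical illustration of the fan-out:

# Hypothetical: a numeric-range pattern in start_url produces three requests.
urls = list(generate_urls('http://example.com/page/{1,3}'))
assert urls == ['http://example.com/page/1',
                'http://example.com/page/2',
                'http://example.com/page/3']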

View File

@@ -1,4 +1,7 @@
+import itertools
 import os
+import re
+from typing import Iterator

 import requests
 from datetime import datetime, timedelta
@@ -121,3 +124,51 @@ def get_detail_page_data(url, spider, idx, data):
     # assign values
     for k, v in row.items():
         data[idx][k] = v
+
+
+def generate_urls(base_url: str) -> Iterator[str]:
+    url = base_url
+    list_arr = []
+
+    # numeric range placeholders, e.g. {1,10}
+    for i, res in enumerate(re.findall(r'{(\d+),(\d+)}', base_url)):
+        try:
+            _min = int(res[0])
+            _max = int(res[1])
+        except ValueError as err:
+            raise ValueError(f'{base_url} is not a valid URL pattern') from err
+
+        # inclusive range of numbers
+        _list = range(_min, _max + 1)
+
+        # key
+        _key = f'n{i}'
+
+        # append list and key
+        list_arr.append((_list, _key))
+
+        # replace url placeholder with key
+        url = url.replace('{' + res[0] + ',' + res[1] + '}', '{' + _key + '}', 1)
+
+    # string list placeholders, e.g. [a,b,c]
+    for i, res in enumerate(re.findall(r'\[([\w\-,]+)\]', base_url)):
+        # list of strings
+        _list = res.split(',')
+
+        # key
+        _key = f's{i}'
+
+        # append list and key
+        list_arr.append((_list, _key))
+
+        # replace url placeholder with key
+        url = url.replace('[' + ','.join(_list) + ']', '{' + _key + '}', 1)
+
+    # yield one URL per element of the cartesian product of all lists
+    for res in itertools.product(*map(lambda x: x[0], list_arr)):
+        _url = url
+        for (_, _key), _rep in zip(list_arr, res):
+            _url = _url.replace('{' + _key + '}', str(_rep), 1)
+        yield _url
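A quick sketch of how a pattern using both placeholder types expands; the URL is hypothetical. Numeric ranges are collected before string lists, and itertools.product varies the last list fastest:

# Hypothetical pattern: one numeric range and one string list.
for u in generate_urls('http://example.com/[news,blog]/page/{1,2}'):
    print(u)
# http://example.com/news/page/1
# http://example.com/blog/page/1
# http://example.com/news/page/2
# http://example.com/blog/page/2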

View File

@@ -37,9 +37,9 @@
         <el-form-item :label="$t('Start URL')">
           <el-input v-model="spiderForm.start_url" :placeholder="$t('Start URL')"></el-input>
         </el-form-item>
-        <el-form-item :label="$t('Obey robots.txt')">
-          <el-switch v-model="spiderForm.obey_robots_txt" :placeholder="$t('Obey robots.txt')"></el-switch>
-        </el-form-item>
+        <!--<el-form-item :label="$t('URL Pattern')">-->
+          <!--<el-input v-model="spiderForm.url_pattern" :placeholder="$t('URL Pattern')"></el-input>-->
+        <!--</el-form-item>-->
       </el-form>
     </el-col>
     <el-col :span="11" :offset="1">
@@ -70,6 +70,9 @@
             :placeholder="$t('Pagination Selector')">
           </el-input>
         </el-form-item>
+        <el-form-item :label="$t('Obey robots.txt')">
+          <el-switch v-model="spiderForm.obey_robots_txt" :placeholder="$t('Obey robots.txt')"></el-switch>
+        </el-form-item>
       </el-form>
     </el-col>
   </el-row>

View File

@@ -105,6 +105,7 @@ const actions = {
       // configurable spider
       crawl_type: state.spiderForm.crawl_type,
       start_url: state.spiderForm.start_url,
+      url_pattern: state.spiderForm.url_pattern,
       item_selector: state.spiderForm.item_selector,
       item_selector_type: state.spiderForm.item_selector_type,
       pagination_selector: state.spiderForm.pagination_selector,