diff --git a/crawlab/routes/spiders.py b/crawlab/routes/spiders.py
index 5fabde73..a9240dae 100644
--- a/crawlab/routes/spiders.py
+++ b/crawlab/routes/spiders.py
@@ -25,7 +25,7 @@ from utils import jsonify
from utils.deploy import zip_file, unzip_file
from utils.file import get_file_suffix_stats, get_file_suffix
from utils.spider import get_lang_by_stats, get_last_n_run_errors_count, get_last_n_day_tasks_count, get_list_page_data, \
- get_detail_page_data
+ get_detail_page_data, generate_urls
parser = reqparse.RequestParser()
parser.add_argument('file', type=FileStorage, location='files')
@@ -85,6 +85,9 @@ class SpiderApi(BaseApi):
# spider start url
('start_url', str),
+        # url pattern: supports generating multiple start urls from a single pattern
+ ('url_pattern', str),
+
# spider item selector
('item_selector', str),
@@ -479,20 +482,29 @@ class SpiderApi(BaseApi):
}, 400
try:
- r = requests.get(spider['start_url'], headers={
- 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
- })
+ r = None
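+            # the start_url may be a pattern; the preview only fetches the first generated URL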
+ for url in generate_urls(spider['start_url']):
+ r = requests.get(url, headers={
+ 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
+ })
+ break
except Exception as err:
return {
'status': 'ok',
'error': 'connection error'
}, 500
- if r.status_code != 200:
+        if r is None:
return {
- 'status': 'ok',
- 'error': 'status code is not 200, but %s' % r.status_code
- }
+                'status': 'ok',
+                'error': 'no response returned'
+            }, 500
+
+        if r.status_code != 200:
+ return {
+ 'status': 'ok',
+ 'error': 'status code is not 200, but %s' % r.status_code
+ }, r.status_code
# get html parse tree
sel = etree.HTML(r.content)
diff --git a/crawlab/spiders/spiders/spiders/config_spider.py b/crawlab/spiders/spiders/spiders/config_spider.py
index fe801f8e..13fa82bf 100644
--- a/crawlab/spiders/spiders/spiders/config_spider.py
+++ b/crawlab/spiders/spiders/spiders/config_spider.py
@@ -1,10 +1,15 @@
# -*- coding: utf-8 -*-
+import os
+import sys
from urllib.parse import urlparse
import scrapy
from spiders.db import spider
from spiders.items import SpidersItem
+from spiders.utils import generate_urls
+
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..')))
def get_detail_url(item):
@@ -75,8 +80,10 @@ def get_next_url(response):
class ConfigSpiderSpider(scrapy.Spider):
name = 'config_spider'
- # allowed_domains = []
- start_urls = [spider['start_url']]
+
+ def start_requests(self):
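+        # expand the configured start_url (which may contain a URL pattern) into concrete request URLs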
+ for url in generate_urls(spider['start_url']):
+ yield scrapy.Request(url=url)
def parse(self, response):
@@ -91,7 +98,7 @@ class ConfigSpiderSpider(scrapy.Spider):
yield scrapy.Request(url=next_url)
elif spider['crawl_type'] == 'detail':
- # TODO: detail page onlny
+ # TODO: detail page only
# detail page only
pass
diff --git a/crawlab/utils/spider.py b/crawlab/utils/spider.py
index 8720c50f..9a2b48df 100644
--- a/crawlab/utils/spider.py
+++ b/crawlab/utils/spider.py
@@ -1,4 +1,7 @@
+import itertools
import os
+import re
+from typing import Iterator
+
import requests
from datetime import datetime, timedelta
@@ -121,3 +124,51 @@ def get_detail_page_data(url, spider, idx, data):
# assign values
for k, v in row.items():
data[idx][k] = v
+
+
+def generate_urls(base_url: str) -> Iterator[str]:
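+    """Expand a URL pattern into concrete URLs.
+
+    Supported placeholders (illustrative examples):
+      - numeric range ``{1,3}``: ``http://example.com/page/{1,3}`` yields
+        ``.../page/1``, ``.../page/2`` and ``.../page/3``
+      - string list ``[a,b]``: ``http://example.com/[news,blog]/list`` yields
+        ``.../news/list`` and ``.../blog/list``
+
+    Multiple placeholders are combined with a cartesian product.
+    """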
+ url = base_url
+
+ # number range list
+ list_arr = []
+ for i, res in enumerate(re.findall(r'{(\d+),(\d+)}', base_url)):
+ try:
+ _min = int(res[0])
+ _max = int(res[1])
+        except ValueError as err:
+            raise ValueError(f'{base_url} is not a valid URL pattern') from err
+
+ # list
+ _list = range(_min, _max + 1)
+
+ # key
+ _key = f'n{i}'
+
+ # append list and key
+ list_arr.append((_list, _key))
+
+ # replace url placeholder with key
+ url = url.replace('{' + res[0] + ',' + res[1] + '}', '{' + _key + '}', 1)
+
+ # string list
+ for i, res in enumerate(re.findall(r'\[([\w\-,]+)\]', base_url)):
+ # list
+ _list = res.split(',')
+
+ # key
+ _key = f's{i}'
+
+ # append list and key
+ list_arr.append((_list, _key))
+
+ # replace url placeholder with key
+ url = url.replace('[' + ','.join(_list) + ']', '{' + _key + '}', 1)
+
+    # combine all placeholder value lists with a cartesian product and yield one URL per combination
+ for res in itertools.product(*map(lambda x: x[0], list_arr)):
+ _url = url
+ for _arr, _rep in zip(list_arr, res):
+ _list, _key = _arr
+ _url = _url.replace('{' + _key + '}', str(_rep), 1)
+ yield _url
diff --git a/frontend/src/components/Config/ConfigList.vue b/frontend/src/components/Config/ConfigList.vue
index 6c47570a..76f42e34 100644
--- a/frontend/src/components/Config/ConfigList.vue
+++ b/frontend/src/components/Config/ConfigList.vue
@@ -37,9 +37,9 @@
-
-
-
+
+
+
@@ -70,6 +70,9 @@
:placeholder="$t('Pagination Selector')">
+
+
+
diff --git a/frontend/src/store/modules/spider.js b/frontend/src/store/modules/spider.js
index 673ce33f..18fcce2a 100644
--- a/frontend/src/store/modules/spider.js
+++ b/frontend/src/store/modules/spider.js
@@ -105,6 +105,7 @@ const actions = {
// configurable spider
crawl_type: state.spiderForm.crawl_type,
start_url: state.spiderForm.start_url,
+ url_pattern: state.spiderForm.url_pattern,
item_selector: state.spiderForm.item_selector,
item_selector_type: state.spiderForm.item_selector_type,
pagination_selector: state.spiderForm.pagination_selector,