diff --git a/crawlab/routes/spiders.py b/crawlab/routes/spiders.py
index 5fabde73..a9240dae 100644
--- a/crawlab/routes/spiders.py
+++ b/crawlab/routes/spiders.py
@@ -25,7 +25,7 @@ from utils import jsonify
 from utils.deploy import zip_file, unzip_file
 from utils.file import get_file_suffix_stats, get_file_suffix
 from utils.spider import get_lang_by_stats, get_last_n_run_errors_count, get_last_n_day_tasks_count, get_list_page_data, \
-    get_detail_page_data
+    get_detail_page_data, generate_urls
 
 parser = reqparse.RequestParser()
 parser.add_argument('file', type=FileStorage, location='files')
@@ -85,6 +85,9 @@ class SpiderApi(BaseApi):
         # spider start url
         ('start_url', str),
 
+        # url pattern: support generation of urls with patterns
+        ('url_pattern', str),
+
         # spider item selector
         ('item_selector', str),
 
@@ -479,20 +482,29 @@ class SpiderApi(BaseApi):
             }, 400
 
         try:
-            r = requests.get(spider['start_url'], headers={
-                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
-            })
+            r = None
+            for url in generate_urls(spider['start_url']):
+                r = requests.get(url, headers={
+                    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
+                })
+                break
         except Exception as err:
             return {
                 'status': 'ok',
                 'error': 'connection error'
             }, 500
 
-        if r.status_code != 200:
+        if not r:
             return {
-                'status': 'ok',
-                'error': 'status code is not 200, but %s' % r.status_code
-            }
+                'status': 'ok',
+                'error': 'response is not returned'
+            }, 500
+
+        if r and r.status_code != 200:
+            return {
+                'status': 'ok',
+                'error': 'status code is not 200, but %s' % r.status_code
+            }, r.status_code
 
         # get html parse tree
         sel = etree.HTML(r.content)
diff --git a/crawlab/spiders/spiders/spiders/config_spider.py b/crawlab/spiders/spiders/spiders/config_spider.py
index fe801f8e..13fa82bf 100644
--- a/crawlab/spiders/spiders/spiders/config_spider.py
+++ b/crawlab/spiders/spiders/spiders/config_spider.py
@@ -1,10 +1,15 @@
 # -*- coding: utf-8 -*-
+import os
+import sys
 from urllib.parse import urlparse
 
 import scrapy
 
 from spiders.db import spider
 from spiders.items import SpidersItem
+from spiders.utils import generate_urls
+
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..')))
 
 
 def get_detail_url(item):
@@ -75,8 +80,10 @@ def get_next_url(response):
 
 class ConfigSpiderSpider(scrapy.Spider):
     name = 'config_spider'
-    # allowed_domains = []
-    start_urls = [spider['start_url']]
+
+    def start_requests(self):
+        for url in generate_urls(spider['start_url']):
+            yield scrapy.Request(url=url)
 
     def parse(self, response):
@@ -91,7 +98,7 @@ class ConfigSpiderSpider(scrapy.Spider):
                 yield scrapy.Request(url=next_url)
 
         elif spider['crawl_type'] == 'detail':
-            # TODO: detail page onlny
+            # TODO: detail page only
             # detail page only
             pass
diff --git a/crawlab/utils/spider.py b/crawlab/utils/spider.py
index 8720c50f..9a2b48df 100644
--- a/crawlab/utils/spider.py
+++ b/crawlab/utils/spider.py
@@ -1,4 +1,7 @@
+import itertools
 import os
+import re
+
 import requests
 
 from datetime import datetime, timedelta
@@ -121,3 +124,51 @@
     # assign values
     for k, v in row.items():
         data[idx][k] = v
+
+
+def generate_urls(base_url: str) -> str:
+    url = base_url
+
+    # number range list
+    list_arr = []
+    for i, res in enumerate(re.findall(r'{(\d+),(\d+)}', base_url)):
+        try:
+            _min = int(res[0])
+            _max = int(res[1])
+        except ValueError as err:
+            raise ValueError(f'{base_url} is not a valid URL pattern')
+
+        # list
+        _list = range(_min, _max + 1)
+
+        # key
+        _key = f'n{i}'
+
+        # append list and key
+        list_arr.append((_list, _key))
+
+        # replace url placeholder with key
+        url = url.replace('{' + res[0] + ',' + res[1] + '}', '{' + _key + '}', 1)
+
+    # string list
+    for i, res in enumerate(re.findall(r'\[([\w\-,]+)\]', base_url)):
+        # list
+        _list = res.split(',')
+
+        # key
+        _key = f's{i}'
+
+        # append list and key
+        list_arr.append((_list, _key))
+
+        # replace url placeholder with key
+        url = url.replace('[' + ','.join(_list) + ']', '{' + _key + '}', 1)
+
+    # combine together
+    _list_arr = []
+    for res in itertools.product(*map(lambda x: x[0], list_arr)):
+        _url = url
+        for _arr, _rep in zip(list_arr, res):
+            _list, _key = _arr
+            _url = _url.replace('{' + _key + '}', str(_rep), 1)
+        yield _url
diff --git a/frontend/src/components/Config/ConfigList.vue b/frontend/src/components/Config/ConfigList.vue
index 6c47570a..76f42e34 100644
--- a/frontend/src/components/Config/ConfigList.vue
+++ b/frontend/src/components/Config/ConfigList.vue
@@ -37,9 +37,9 @@
-
-
-
+
+
+
@@ -70,6 +70,9 @@
           :placeholder="$t('Pagination Selector')">
+
+
+
diff --git a/frontend/src/store/modules/spider.js b/frontend/src/store/modules/spider.js
index 673ce33f..18fcce2a 100644
--- a/frontend/src/store/modules/spider.js
+++ b/frontend/src/store/modules/spider.js
@@ -105,6 +105,7 @@ const actions = {
       // configurable spider
       crawl_type: state.spiderForm.crawl_type,
      start_url: state.spiderForm.start_url,
+      url_pattern: state.spiderForm.url_pattern,
       item_selector: state.spiderForm.item_selector,
       item_selector_type: state.spiderForm.item_selector_type,
       pagination_selector: state.spiderForm.pagination_selector,
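
Note (not part of the patch): a quick, hedged sketch of how the new generate_urls() pattern expansion is expected to behave, based on the code added to crawlab/utils/spider.py above. It assumes the patched module is importable as utils.spider (e.g. when run from the crawlab package root); the sample URL below is made up for illustration.

# Illustrative usage only -- assumes utils.spider from this patch is on the import path.
from utils.spider import generate_urls

# '{1,3}' expands to the inclusive numeric range 1..3 and '[a,b]' to the literal
# alternatives 'a' and 'b'; the generator yields every combination of the two.
pattern = 'https://example.com/list-[a,b]/page/{1,3}'

for url in generate_urls(pattern):
    print(url)

# Expected output, following the itertools.product ordering used in the patch:
#   https://example.com/list-a/page/1
#   https://example.com/list-b/page/1
#   https://example.com/list-a/page/2
#   https://example.com/list-b/page/2
#   https://example.com/list-a/page/3
#   https://example.com/list-b/page/3

For context, the preview handler in crawlab/routes/spiders.py only fetches the first generated URL (it breaks after the first requests.get), while the Scrapy spider's start_requests yields a request for every expansion.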