mirror of https://github.com/crawlab-team/crawlab.git, synced 2026-01-22 17:31:03 +01:00
allow generating URLs by pattern
@@ -25,7 +25,7 @@ from utils import jsonify
 from utils.deploy import zip_file, unzip_file
 from utils.file import get_file_suffix_stats, get_file_suffix
 from utils.spider import get_lang_by_stats, get_last_n_run_errors_count, get_last_n_day_tasks_count, get_list_page_data, \
-    get_detail_page_data
+    get_detail_page_data, generate_urls

 parser = reqparse.RequestParser()
 parser.add_argument('file', type=FileStorage, location='files')
@@ -85,6 +85,9 @@ class SpiderApi(BaseApi):
         # spider start url
         ('start_url', str),

+        # url pattern: support generation of urls with patterns
+        ('url_pattern', str),
+
         # spider item selector
         ('item_selector', str),
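For orientation (not part of the diff): the placeholders are written inside start_url itself and expanded by the generate_urls helper shown further down, while the new url_pattern field is accepted by the parser and sent by the frontend store but its form input is still commented out in the Vue hunk below. A hypothetical request payload could look like this; the URL is invented for illustration.

# Hypothetical payload, for illustration only; the pattern lives in start_url
payload = {
    'start_url': 'https://example.com/list/{1,10}',  # expands to .../list/1 through .../list/10
    'url_pattern': '',  # accepted by the API, not yet exposed in the UI form
}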
@@ -479,20 +482,29 @@ class SpiderApi(BaseApi):
             }, 400

         try:
-            r = requests.get(spider['start_url'], headers={
-                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
-            })
+            r = None
+            for url in generate_urls(spider['start_url']):
+                r = requests.get(url, headers={
+                    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
+                })
+                break
         except Exception as err:
             return {
                 'status': 'ok',
                 'error': 'connection error'
             }, 500

-        if r.status_code != 200:
+        if not r:
             return {
-                'status': 'ok',
-                'error': 'status code is not 200, but %s' % r.status_code
-            }
+                'status': 'ok',
+                'error': 'response is not returned'
+            }, 500
+
+        if r and r.status_code != 200:
+            return {
+                'status': 'ok',
+                'error': 'status code is not 200, but %s' % r.status_code
+            }, r.status_code

         # get html parse tree
         sel = etree.HTML(r.content)
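A minimal sketch of the preview behaviour above, written outside the class and with an invented URL: generate_urls yields every concrete URL for the patterned start_url, and the immediate break means only the first one is fetched to build the preview.

# Illustrative sketch only; the example URL is hypothetical
import requests

from utils.spider import generate_urls  # same import added at the top of this file

first_url = next(generate_urls('https://example.com/list/{1,20}'), None)  # -> 'https://example.com/list/1'
if first_url is not None:
    r = requests.get(first_url)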
@@ -1,10 +1,15 @@
 # -*- coding: utf-8 -*-
 import os
 import sys
+from urllib.parse import urlparse

 import scrapy

 from spiders.db import spider
 from spiders.items import SpidersItem
+from spiders.utils import generate_urls

 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..')))


 def get_detail_url(item):
@@ -75,8 +80,10 @@ def get_next_url(response):

 class ConfigSpiderSpider(scrapy.Spider):
     name = 'config_spider'
     # allowed_domains = []
-    start_urls = [spider['start_url']]

+    def start_requests(self):
+        for url in generate_urls(spider['start_url']):
+            yield scrapy.Request(url=url)

     def parse(self, response):
@@ -91,7 +98,7 @@ class ConfigSpiderSpider(scrapy.Spider):
                 yield scrapy.Request(url=next_url)

         elif spider['crawl_type'] == 'detail':
-            # TODO: detail page onlny
+            # TODO: detail page only
             # detail page only
             pass
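As a standalone illustration of the spider-side change: once start_requests() is overridden, Scrapy no longer reads start_urls, and each concrete URL produced by generate_urls becomes its own request. The spider name and pattern below are invented for the example.

import scrapy

from spiders.utils import generate_urls  # helper imported in the hunk above


class PatternExampleSpider(scrapy.Spider):
    # Hypothetical spider, not part of the commit
    name = 'pattern_example'

    def start_requests(self):
        # one request per expanded URL, e.g. .../category/books and .../category/music
        for url in generate_urls('https://example.com/category/[books,music]'):
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        self.logger.info('fetched %s', response.url)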
@@ -1,4 +1,7 @@
+import itertools
 import os
+import re

 import requests
 from datetime import datetime, timedelta
@@ -121,3 +124,51 @@ def get_detail_page_data(url, spider, idx, data):
     # assign values
     for k, v in row.items():
         data[idx][k] = v
+
+
+def generate_urls(base_url: str) -> str:
+    url = base_url
+
+    # number range list
+    list_arr = []
+    for i, res in enumerate(re.findall(r'{(\d+),(\d+)}', base_url)):
+        try:
+            _min = int(res[0])
+            _max = int(res[1])
+        except ValueError as err:
+            raise ValueError(f'{base_url} is not a valid URL pattern')
+
+        # list
+        _list = range(_min, _max + 1)
+
+        # key
+        _key = f'n{i}'
+
+        # append list and key
+        list_arr.append((_list, _key))
+
+        # replace url placeholder with key
+        url = url.replace('{' + res[0] + ',' + res[1] + '}', '{' + _key + '}', 1)
+
+    # string list
+    for i, res in enumerate(re.findall(r'\[([\w\-,]+)\]', base_url)):
+        # list
+        _list = res.split(',')
+
+        # key
+        _key = f's{i}'
+
+        # append list and key
+        list_arr.append((_list, _key))
+
+        # replace url placeholder with key
+        url = url.replace('[' + ','.join(_list) + ']', '{' + _key + '}', 1)
+
+    # combine together
+    _list_arr = []
+    for res in itertools.product(*map(lambda x: x[0], list_arr)):
+        _url = url
+        for _arr, _rep in zip(list_arr, res):
+            _list, _key = _arr
+            _url = _url.replace('{' + _key + '}', str(_rep), 1)
+        yield _url
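To make the pattern syntax concrete, a small usage sketch of generate_urls as defined above (the URLs are invented): {min,max} expands to an inclusive integer range, [a,b,...] expands to the listed literal values, and itertools.product emits every combination; a URL without placeholders is yielded once, unchanged.

# Usage sketch with a hypothetical URL
for u in generate_urls('https://example.com/page/{1,3}/[a,b]'):
    print(u)
# https://example.com/page/1/a
# https://example.com/page/1/b
# https://example.com/page/2/a
# https://example.com/page/2/b
# https://example.com/page/3/a
# https://example.com/page/3/b

# no placeholders: the original URL is yielded as the single result
assert list(generate_urls('https://example.com/start')) == ['https://example.com/start']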
@@ -37,9 +37,9 @@
         <el-form-item :label="$t('Start URL')">
           <el-input v-model="spiderForm.start_url" :placeholder="$t('Start URL')"></el-input>
         </el-form-item>
-        <el-form-item :label="$t('Obey robots.txt')">
-          <el-switch v-model="spiderForm.obey_robots_txt" :placeholder="$t('Obey robots.txt')"></el-switch>
-        </el-form-item>
+        <!--<el-form-item :label="$t('URL Pattern')">-->
+        <!--<el-input v-model="spiderForm.url_pattern" :placeholder="$t('URL Pattern')"></el-input>-->
+        <!--</el-form-item>-->
       </el-form>
     </el-col>
     <el-col :span="11" :offset="1">
@@ -70,6 +70,9 @@
             :placeholder="$t('Pagination Selector')">
           </el-input>
         </el-form-item>
+        <el-form-item :label="$t('Obey robots.txt')">
+          <el-switch v-model="spiderForm.obey_robots_txt" :placeholder="$t('Obey robots.txt')"></el-switch>
+        </el-form-item>
       </el-form>
     </el-col>
   </el-row>
@@ -105,6 +105,7 @@ const actions = {
         // configurable spider
         crawl_type: state.spiderForm.crawl_type,
         start_url: state.spiderForm.start_url,
+        url_pattern: state.spiderForm.url_pattern,
         item_selector: state.spiderForm.item_selector,
         item_selector_type: state.spiderForm.item_selector_type,
         pagination_selector: state.spiderForm.pagination_selector,