mirror of
https://github.com/crawlab-team/crawlab.git
synced 2026-01-22 17:31:03 +01:00
Modified the fields extraction algorithm
This commit is contained in:
@@ -657,19 +657,24 @@ class SpiderApi(BaseApi):
|
||||
|
||||
# get list item selector
|
||||
item_selector = None
|
||||
item_selector_type = 'css'
|
||||
if max_tag.get('id') is not None:
|
||||
item_selector = f'#{max_tag.get("id")} > {self._get_children(max_tag)[0].tag}'
|
||||
elif max_tag.get('class') is not None:
|
||||
cls_str = '.'.join([x for x in max_tag.get("class").split(' ') if x != ''])
|
||||
if len(sel.cssselect(f'.{cls_str}')) == 1:
|
||||
item_selector = f'.{cls_str} > {self._get_children(max_tag)[0].tag}'
|
||||
else:
|
||||
item_selector = max_tag.getroottree().getpath(max_tag)
|
||||
item_selector_type = 'xpath'
|
||||
|
||||
# get list fields
|
||||
fields = []
|
||||
if item_selector is not None:
|
||||
first_tag = self._get_children(max_tag)[0]
|
||||
for i, tag in enumerate(self._get_text_child_tags(first_tag)):
|
||||
if len(first_tag.cssselect(f'{tag.tag}')) == 1:
|
||||
el_list = first_tag.cssselect(f'{tag.tag}')
|
||||
if len(el_list) == 1:
|
||||
fields.append({
|
||||
'name': f'field{i + 1}',
|
||||
'type': 'css',
|
||||
@@ -685,6 +690,15 @@ class SpiderApi(BaseApi):
|
||||
'extract_type': 'text',
|
||||
'query': f'{tag.tag}.{cls_str}',
|
||||
})
|
||||
else:
|
||||
for j, el in enumerate(el_list):
|
||||
if tag == el:
|
||||
fields.append({
|
||||
'name': f'field{i + 1}',
|
||||
'type': 'css',
|
||||
'extract_type': 'text',
|
||||
'query': f'{tag.tag}:nth-of-type({j + 1})',
|
||||
})
|
||||
|
||||
for i, tag in enumerate(self._get_a_child_tags(self._get_children(max_tag)[0])):
|
||||
# if the tag is <a...></a>, extract its href
|
||||
@@ -710,6 +724,7 @@ class SpiderApi(BaseApi):
|
||||
return {
|
||||
'status': 'ok',
|
||||
'item_selector': item_selector,
|
||||
'item_selector_type': item_selector_type,
|
||||
'pagination_selector': pagination_selector,
|
||||
'fields': fields
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user