From 0cdf44842f2d9effe10b3f0c9c55fdaead00603e Mon Sep 17 00:00:00 2001 From: Marvin Zhang Date: Sun, 9 Jun 2019 16:44:40 +0800 Subject: [PATCH] modified fields extraction algorithm --- crawlab/routes/spiders.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/crawlab/routes/spiders.py b/crawlab/routes/spiders.py index 04e1eb96..14d028e6 100644 --- a/crawlab/routes/spiders.py +++ b/crawlab/routes/spiders.py @@ -657,19 +657,24 @@ class SpiderApi(BaseApi): # get list item selector item_selector = None + item_selector_type = 'css' if max_tag.get('id') is not None: item_selector = f'#{max_tag.get("id")} > {self._get_children(max_tag)[0].tag}' elif max_tag.get('class') is not None: cls_str = '.'.join([x for x in max_tag.get("class").split(' ') if x != '']) if len(sel.cssselect(f'.{cls_str}')) == 1: item_selector = f'.{cls_str} > {self._get_children(max_tag)[0].tag}' + else: + item_selector = max_tag.getroottree().getpath(max_tag) + item_selector_type = 'xpath' # get list fields fields = [] if item_selector is not None: first_tag = self._get_children(max_tag)[0] for i, tag in enumerate(self._get_text_child_tags(first_tag)): - if len(first_tag.cssselect(f'{tag.tag}')) == 1: + el_list = first_tag.cssselect(f'{tag.tag}') + if len(el_list) == 1: fields.append({ 'name': f'field{i + 1}', 'type': 'css', @@ -685,6 +690,15 @@ class SpiderApi(BaseApi): 'extract_type': 'text', 'query': f'{tag.tag}.{cls_str}', }) + else: + for j, el in enumerate(el_list): + if tag == el: + fields.append({ + 'name': f'field{i + 1}', + 'type': 'css', + 'extract_type': 'text', + 'query': f'{tag.tag}:nth-of-type({j + 1})', + }) for i, tag in enumerate(self._get_a_child_tags(self._get_children(max_tag)[0])): # if the tag is <a>, extract its href @@ -710,6 +724,7 @@ class SpiderApi(BaseApi): return { 'status': 'ok', 'item_selector': item_selector, + 'item_selector_type': item_selector_type, 'pagination_selector': pagination_selector, 'fields': fields }