From d7c6680ee4d283b2fbe73a16668d83ced305f933 Mon Sep 17 00:00:00 2001 From: Marvin Zhang Date: Wed, 29 May 2019 13:43:03 +0800 Subject: [PATCH] prepare for list fields extraction --- crawlab/routes/spiders.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/crawlab/routes/spiders.py b/crawlab/routes/spiders.py index a46bdad3..03d0659a 100644 --- a/crawlab/routes/spiders.py +++ b/crawlab/routes/spiders.py @@ -498,6 +498,14 @@ class SpiderApi(BaseApi): return sel + @staticmethod + def _get_text_child_tags(sel): + tags = [] + for tag in sel.iter(): + if tag.text is not None: + tags.append(tag) + return tags + def preview_crawl(self, id: str): spider = db_manager.get(col_name='spiders', id=id) @@ -582,6 +590,17 @@ class SpiderApi(BaseApi): # add as list tag list_tag_list.append(tag) + # find the list tag with the most child text tags + _tag_list = [] + _max_tag = None + _max_num = 0 + for tag in list_tag_list: + _child_text_tags = self._get_text_child_tags(tag[0]) + if len(_child_text_tags) > _max_num: + _max_tag = tag + _max_num = len(_child_text_tags) + return _max_tag + class SpiderImportApi(Resource): __doc__ = """