prepare for list fields extraction

This commit is contained in:
Marvin Zhang
2019-05-29 13:43:03 +08:00
parent e570bcfa13
commit d7c6680ee4

View File

@@ -498,6 +498,14 @@ class SpiderApi(BaseApi):
return sel
@staticmethod
def _get_text_child_tags(sel):
    """Collect the elements under ``sel`` that carry a text value.

    Walks everything yielded by ``sel.iter()`` and keeps each element whose
    ``text`` attribute is not ``None``. The check is an identity test
    against ``None``, so elements whose text is empty or whitespace-only
    are kept as well.

    :param sel: object exposing ``iter()`` that yields elements with a
        ``text`` attribute (presumably an lxml / ElementTree element --
        confirm against callers).
    :return: list of the matching elements, in iteration order.
    """
    # NOTE(review): for ElementTree-style elements ``iter()`` also yields
    # ``sel`` itself, so the root may be part of the result -- confirm
    # this is intended given the "child" in the method name.
    return [node for node in sel.iter() if node.text is not None]
def preview_crawl(self, id: str):
spider = db_manager.get(col_name='spiders', id=id)
@@ -582,6 +590,17 @@ class SpiderApi(BaseApi):
# add as list tag
list_tag_list.append(tag)
# find the list tag with the most child text tags
_tag_list = []
_max_tag = None
_max_num = 0
for tag in list_tag_list:
_child_text_tags = self._get_text_child_tags(tag[0])
if len(_child_text_tags) > _max_num:
_max_tag = tag
_max_num = len(_child_text_tags)
return _max_tag
class SpiderImportApi(Resource):
__doc__ = """