prepare for list fields extraction

2026-01-22 17:31:03 +01:00 · 2019-05-29 13:43:03 +08:00
parent e570bcfa13
commit d7c6680ee4
1 changed files with 19 additions and 0 deletions
--- a/crawlab/routes/spiders.py
+++ b/crawlab/routes/spiders.py
@@ -498,6 +498,14 @@ class SpiderApi(BaseApi):

        return sel

+    @staticmethod
+    def _get_text_child_tags(sel):  # collect every node yielded by sel.iter() whose .text is set
+        tags = []  # matches, kept in the order sel.iter() yields them
+        for tag in sel.iter():  # NOTE(review): presumably an lxml element, so iter() would yield sel itself plus all descendants - confirm
+            if tag.text is not None:  # only None is excluded; empty or whitespace-only text still counts
+                tags.append(tag)
+        return tags  # list of the matching nodes; empty when no node has text
+
    def preview_crawl(self, id: str):
        spider = db_manager.get(col_name='spiders', id=id)

@@ -582,6 +590,17 @@ class SpiderApi(BaseApi):
                # add as list tag
                list_tag_list.append(tag)

+        # find the list tag with the most child text tags
+        _tag_list = []
+        _max_tag = None
+        _max_num = 0
+        for tag in list_tag_list:
+            _child_text_tags = self._get_text_child_tags(tag[0])
+            if len(_child_text_tags) > _max_num:
+                _max_tag = tag
+                _max_num = len(_child_text_tags)
+        return _max_tag
+

 class SpiderImportApi(Resource):
    __doc__ = """