prepare for list fields extraction

Marvin Zhang
2019-05-29 13:33:05 +08:00
parent 87c81deefb
commit e570bcfa13


@@ -463,9 +463,8 @@ class SpiderApi(BaseApi):
        detail_fields = json.loads(args.detail_fields)
        db_manager.update_one(col_name='spiders', id=id, values={'detail_fields': detail_fields})

    def preview_crawl(self, id: str):
        spider = db_manager.get(col_name='spiders', id=id)

    @staticmethod
    def _get_html(spider) -> etree.Element:
        if spider['type'] != SpiderType.CONFIGURABLE:
            return {
                'status': 'ok',
@@ -497,6 +496,18 @@ class SpiderApi(BaseApi):
        # get html parse tree
        sel = etree.HTML(r.content)
        return sel

    def preview_crawl(self, id: str):
        spider = db_manager.get(col_name='spiders', id=id)

        # get html parse tree
        sel = self._get_html(spider)

        # when an error happens, return the error response
        if isinstance(sel, tuple):
            return sel

        # parse fields
        if spider['crawl_type'] == CrawlType.LIST:
            if spider.get('item_selector') is None:
@@ -513,6 +524,7 @@ class SpiderApi(BaseApi):
            }
        elif spider['crawl_type'] == CrawlType.DETAIL:
            # TODO: detail page preview
            pass
        elif spider['crawl_type'] == CrawlType.LIST_DETAIL:
@@ -534,6 +546,42 @@ class SpiderApi(BaseApi):
                'items': data
            }

    def extract_fields(self, id: str):
        """
        Extract list fields from a web page
        :param id: spider id
        :return:
        """
        spider = db_manager.get(col_name='spiders', id=id)

        # get html parse tree
        sel = self._get_html(spider)

        # when an error happens, return the error response
        if isinstance(sel, tuple):
            return sel

        list_tag_list = []
        threshold = 10

        # iterate over all nodes in a top-down direction
        for tag in sel.iter():
            # get child tags
            child_tags = tag.getchildren()
            if len(child_tags) < threshold:
                # if the number of child tags is below the threshold, skip
                continue
            else:
                # the number of child tags is at or above the threshold
                child_tags_set = set(map(lambda x: x.tag, child_tags))
                # if the children do not share a single tag name, skip
                if len(child_tags_set) > 1:
                    continue
                # add as a list tag
                list_tag_list.append(tag)
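
Below is a minimal, standalone sketch of the same list-detection heuristic, run against a hard-coded HTML snippet rather than a live spider. The sample markup, the lowered threshold of 3, and the find_list_tags name are illustrative assumptions, not part of the commit.

from lxml import etree

# standalone sketch; the sample HTML and the threshold of 3 are illustrative only
SAMPLE_HTML = """
<html>
  <body>
    <div id="header"><span>logo</span></div>
    <ul id="results">
      <li>item 1</li>
      <li>item 2</li>
      <li>item 3</li>
      <li>item 4</li>
    </ul>
  </body>
</html>
"""


def find_list_tags(html, threshold=3):
    # parse the raw markup into an lxml element tree
    sel = etree.HTML(html)
    list_tags = []
    # walk every element top-down, as extract_fields does
    for tag in sel.iter():
        child_tags = tag.getchildren()
        # skip nodes with fewer children than the threshold
        if len(child_tags) < threshold:
            continue
        # skip nodes whose children do not share a single tag name
        if len(set(child.tag for child in child_tags)) > 1:
            continue
        list_tags.append(tag)
    return list_tags


if __name__ == '__main__':
    for tag in find_list_tags(SAMPLE_HTML):
        # expected to print only the <ul id="results"> element
        print(tag.tag, tag.attrib)

On real spider pages, extract_fields keeps the higher threshold of 10 so that only substantial repeated structures, such as result lists, are picked up.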
class SpiderImportApi(Resource):
    __doc__ = """