diff --git a/crawlab/routes/spiders.py b/crawlab/routes/spiders.py index 5fabde73..a46bdad3 100644 --- a/crawlab/routes/spiders.py +++ b/crawlab/routes/spiders.py @@ -463,9 +463,8 @@ class SpiderApi(BaseApi): detail_fields = json.loads(args.detail_fields) db_manager.update_one(col_name='spiders', id=id, values={'detail_fields': detail_fields}) - def preview_crawl(self, id: str): - spider = db_manager.get(col_name='spiders', id=id) - + @staticmethod + def _get_html(spider) -> etree.Element: if spider['type'] != SpiderType.CONFIGURABLE: return { 'status': 'ok', @@ -497,6 +496,18 @@ class SpiderApi(BaseApi): # get html parse tree sel = etree.HTML(r.content) + return sel + + def preview_crawl(self, id: str): + spider = db_manager.get(col_name='spiders', id=id) + + # get html parse tree + sel = self._get_html(spider) + + # when error happens, return + if type(sel) == type(tuple): + return sel + # parse fields if spider['crawl_type'] == CrawlType.LIST: if spider.get('item_selector') is None: @@ -513,6 +524,7 @@ class SpiderApi(BaseApi): } elif spider['crawl_type'] == CrawlType.DETAIL: + # TODO: 详情页预览 pass elif spider['crawl_type'] == CrawlType.LIST_DETAIL: @@ -534,6 +546,42 @@ class SpiderApi(BaseApi): 'items': data } + def extract_fields(self, id: str): + """ + Extract list fields from a web page + :param id: + :return: + """ + spider = db_manager.get(col_name='spiders', id=id) + + # get html parse tree + sel = self._get_html(spider) + + # when error happens, return + if type(sel) == type(tuple): + return sel + + list_tag_list = [] + threshold = 10 + # iterate all child nodes in a top-down direction + for tag in sel.iter(): + # get child tags + child_tags = tag.getchildren() + + if len(child_tags) < threshold: + # if number of child tags is below threshold, skip + continue + else: + # have one or more child tags + child_tags_set = set(map(lambda x: x.tag, child_tags)) + + # if there are more than 1 tag names, skip + if len(child_tags_set) > 1: + continue + + # add as list tag + list_tag_list.append(tag) + class SpiderImportApi(Resource): __doc__ = """