prepare for list fields extraction

Marvin Zhang
2019-05-29 13:33:05 +08:00
parent 87c81deefb
commit e570bcfa13


@@ -463,9 +463,8 @@ class SpiderApi(BaseApi):
        detail_fields = json.loads(args.detail_fields)
        db_manager.update_one(col_name='spiders', id=id, values={'detail_fields': detail_fields})

    def preview_crawl(self, id: str):
        spider = db_manager.get(col_name='spiders', id=id)

    @staticmethod
    def _get_html(spider) -> etree.Element:
        if spider['type'] != SpiderType.CONFIGURABLE:
            return {
                'status': 'ok',
@@ -497,6 +496,18 @@ class SpiderApi(BaseApi):
        # get html parse tree
        sel = etree.HTML(r.content)
        return sel

    def preview_crawl(self, id: str):
        spider = db_manager.get(col_name='spiders', id=id)

        # get html parse tree
        sel = self._get_html(spider)

        # when an error happens, return the error response
        if isinstance(sel, tuple):
            return sel

        # parse fields
        if spider['crawl_type'] == CrawlType.LIST:
            if spider.get('item_selector') is None:
@@ -513,6 +524,7 @@ class SpiderApi(BaseApi):
            }
        elif spider['crawl_type'] == CrawlType.DETAIL:
            # TODO: detail page preview
            pass
        elif spider['crawl_type'] == CrawlType.LIST_DETAIL:
@@ -534,6 +546,42 @@ class SpiderApi(BaseApi):
                'items': data
            }

    def extract_fields(self, id: str):
        """
        Extract list fields from a web page
        :param id: spider id
        :return:
        """
        spider = db_manager.get(col_name='spiders', id=id)

        # get html parse tree
        sel = self._get_html(spider)

        # when an error happens, return the error response
        if isinstance(sel, tuple):
            return sel

        list_tag_list = []
        threshold = 10

        # iterate over all nodes in a top-down direction
        for tag in sel.iter():
            # get child tags
            child_tags = tag.getchildren()
            if len(child_tags) < threshold:
                # if the number of child tags is below the threshold, skip
                continue
            else:
                # the number of child tags is at or above the threshold
                child_tags_set = set(map(lambda x: x.tag, child_tags))
                # if the children do not share a single tag name, skip
                if len(child_tags_set) > 1:
                    continue
                # add as a list tag
                list_tag_list.append(tag)
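
Below is a minimal, standalone sketch of the same list-detection heuristic, run against a hard-coded HTML snippet rather than a live spider. The sample markup, the lowered threshold of 3, and the find_list_tags name are illustrative assumptions, not part of the commit.

from lxml import etree

# standalone sketch; the sample HTML and the threshold of 3 are illustrative only
SAMPLE_HTML = """
<html>
  <body>
    <div id="header"><span>logo</span></div>
    <ul id="results">
      <li>item 1</li>
      <li>item 2</li>
      <li>item 3</li>
      <li>item 4</li>
    </ul>
  </body>
</html>
"""


def find_list_tags(html, threshold=3):
    # parse the raw markup into an lxml element tree
    sel = etree.HTML(html)
    list_tags = []
    # walk every element top-down, as extract_fields does
    for tag in sel.iter():
        child_tags = tag.getchildren()
        # skip nodes with fewer children than the threshold
        if len(child_tags) < threshold:
            continue
        # skip nodes whose children do not share a single tag name
        if len(set(child.tag for child in child_tags)) > 1:
            continue
        list_tags.append(tag)
    return list_tags


if __name__ == '__main__':
    for tag in find_list_tags(SAMPLE_HTML):
        # expected to print only the <ul id="results"> element
        print(tag.tag, tag.attrib)

On real spider pages, extract_fields keeps the higher threshold of 10 so that only substantial repeated structures, such as result lists, are picked up.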
class SpiderImportApi(Resource):
    __doc__ = """