mirror of
https://github.com/crawlab-team/crawlab.git
synced 2026-01-22 17:31:03 +01:00
prepare for list fields extraction
This commit is contained in:
@@ -463,9 +463,8 @@ class SpiderApi(BaseApi):
|
||||
detail_fields = json.loads(args.detail_fields)
|
||||
db_manager.update_one(col_name='spiders', id=id, values={'detail_fields': detail_fields})
|
||||
|
||||
def preview_crawl(self, id: str):
|
||||
spider = db_manager.get(col_name='spiders', id=id)
|
||||
|
||||
@staticmethod
|
||||
def _get_html(spider) -> etree.Element:
|
||||
if spider['type'] != SpiderType.CONFIGURABLE:
|
||||
return {
|
||||
'status': 'ok',
|
||||
@@ -497,6 +496,18 @@ class SpiderApi(BaseApi):
|
||||
# get html parse tree
|
||||
sel = etree.HTML(r.content)
|
||||
|
||||
return sel
|
||||
|
||||
def preview_crawl(self, id: str):
|
||||
spider = db_manager.get(col_name='spiders', id=id)
|
||||
|
||||
# get html parse tree
|
||||
sel = self._get_html(spider)
|
||||
|
||||
# when error happens, return
|
||||
if type(sel) == type(tuple):
|
||||
return sel
|
||||
|
||||
# parse fields
|
||||
if spider['crawl_type'] == CrawlType.LIST:
|
||||
if spider.get('item_selector') is None:
|
||||
@@ -513,6 +524,7 @@ class SpiderApi(BaseApi):
|
||||
}
|
||||
|
||||
elif spider['crawl_type'] == CrawlType.DETAIL:
|
||||
# TODO: 详情页预览
|
||||
pass
|
||||
|
||||
elif spider['crawl_type'] == CrawlType.LIST_DETAIL:
|
||||
@@ -534,6 +546,42 @@ class SpiderApi(BaseApi):
|
||||
'items': data
|
||||
}
|
||||
|
||||
def extract_fields(self, id: str):
|
||||
"""
|
||||
Extract list fields from a web page
|
||||
:param id:
|
||||
:return:
|
||||
"""
|
||||
spider = db_manager.get(col_name='spiders', id=id)
|
||||
|
||||
# get html parse tree
|
||||
sel = self._get_html(spider)
|
||||
|
||||
# when error happens, return
|
||||
if type(sel) == type(tuple):
|
||||
return sel
|
||||
|
||||
list_tag_list = []
|
||||
threshold = 10
|
||||
# iterate all child nodes in a top-down direction
|
||||
for tag in sel.iter():
|
||||
# get child tags
|
||||
child_tags = tag.getchildren()
|
||||
|
||||
if len(child_tags) < threshold:
|
||||
# if number of child tags is below threshold, skip
|
||||
continue
|
||||
else:
|
||||
# have one or more child tags
|
||||
child_tags_set = set(map(lambda x: x.tag, child_tags))
|
||||
|
||||
# if there are more than 1 tag names, skip
|
||||
if len(child_tags_set) > 1:
|
||||
continue
|
||||
|
||||
# add as list tag
|
||||
list_tag_list.append(tag)
|
||||
|
||||
|
||||
class SpiderImportApi(Resource):
|
||||
__doc__ = """
|
||||
|
||||
Reference in New Issue
Block a user