mirror of
https://github.com/crawlab-team/crawlab.git
synced 2026-01-30 18:00:56 +01:00
updated node/spider overviews
This commit is contained in:
34
utils/file.py
Normal file
34
utils/file.py
Normal file
@@ -0,0 +1,34 @@
|
||||
import os
|
||||
import re
|
||||
from collections import defaultdict
|
||||
|
||||
# A dot followed by at most 10 word characters, anchored at the end of the string.
SUFFIX_PATTERN = r'\.(\w{,10})$'
suffix_regex = re.compile(SUFFIX_PATTERN, re.IGNORECASE)


def get_file_suffix(file_name: str) -> str:
    """
    Return the lower-cased suffix (extension) of a file name or path.

    The suffix is the text after the last dot, provided it consists of at
    most 10 word characters. When no such suffix exists (e.g. ``Makefile``),
    the lower-cased final path component is returned instead, so that
    extensionless files sharing a name are reported under one key no matter
    which directory they live in.

    :param file_name: a bare file name or a path to a file
    :return: the suffix without the leading dot, or the lower-cased base name
        when there is no suffix
    """
    file_name = file_name.lower()
    # Only the last path component is relevant for the fallback; without this
    # a path such as "proj/Makefile" would be returned in full. The `or`
    # keeps the original string when basename() yields "" (trailing separator).
    base_name = os.path.basename(file_name) or file_name
    m = suffix_regex.search(base_name)
    if m is None:
        return base_name
    return m.group(1)
|
||||
|
||||
|
||||
def get_file_list(path):
    """
    Lazily yield the path of every non-directory file found below ``path``.

    The tree is traversed with ``os.walk``; each yielded value is the walked
    directory joined with the file name.

    :param path: root directory to walk
    :return: generator of file paths
    """
    for current_dir, _sub_dirs, names in os.walk(path):
        yield from (os.path.join(current_dir, name) for name in names)
|
||||
|
||||
|
||||
def get_file_suffix_stats(path) -> dict:
    """
    Count the files below ``path`` grouped by their suffix.

    :param path: root directory whose files are enumerated via get_file_list
    :return: a ``defaultdict(int)`` mapping each value returned by
        get_file_suffix to the number of files that produced it
    """
    suffix_counts = defaultdict(int)
    for suffix in map(get_file_suffix, get_file_list(path)):
        suffix_counts[suffix] += 1
    return suffix_counts
|
||||
15
utils/spider.py
Normal file
15
utils/spider.py
Normal file
@@ -0,0 +1,15 @@
|
||||
from constants.spider import FILE_SUFFIX_LANG_MAPPING, LangType, SUFFIX_IGNORE
|
||||
|
||||
|
||||
def get_lang_by_stats(stats: dict) -> LangType:
    """
    Pick a language from file-suffix statistics.

    :param stats: mapping of suffix -> file count; stats is generated by
        utils.file.get_file_suffix_stats
    :return: the FILE_SUFFIX_LANG_MAPPING entry of the most frequent suffix
        that is not listed in SUFFIX_IGNORE, or LangType.OTHER when no suffix
        remains after filtering or the top suffix has no mapping
    """
    candidates = [item for item in stats.items() if item[0] not in SUFFIX_IGNORE]
    if not candidates:
        # Empty stats, or only ignored suffixes: indexing [-1] would raise
        # IndexError, so report the language as unknown instead.
        return LangType.OTHER

    # sorted() is stable, so among equal counts the suffix that appears last
    # in `stats` still wins.
    top_suffix = sorted(candidates, key=lambda item: item[1])[-1][0]
    lang = FILE_SUFFIX_LANG_MAPPING.get(top_suffix)
    return lang if lang is not None else LangType.OTHER
|
||||
Reference in New Issue
Block a user