Files
crawlab/utils/file.py
2019-02-21 14:08:48 +08:00

35 lines
914 B
Python

import os
import re
from collections import defaultdict
# Trailing-extension pattern: a literal dot followed by zero to ten word
# characters, anchored with `$`; group 1 captures the text after the dot.
SUFFIX_PATTERN = r'\.(\w{,10})$'

# Compiled once at import time so every lookup reuses the same pattern object.
suffix_regex = re.compile(SUFFIX_PATTERN, flags=re.IGNORECASE)
def get_file_suffix(file_name: str):
    """Return the lowercased suffix of *file_name*, without the leading dot.

    The name is lowercased first and then searched with ``suffix_regex``.
    When the pattern matches, the captured text after the final dot is
    returned (this can be an empty string for a name ending in a dot).
    When nothing matches, the whole lowercased name is returned instead.
    """
    lowered = file_name.lower()
    match = suffix_regex.search(lowered)
    if match is None:
        # No recognisable suffix: fall back to the lowercased name itself.
        return lowered
    return match.group(1)
def get_file_list(path):
    """Lazily yield the path of every file found under *path*.

    The tree is traversed with ``os.walk``; each yielded value is the
    directory being visited joined with a file name found in it.
    Sub-directories are descended into but are not yielded themselves.
    """
    # os.walk produces (current directory, sub-directory names, file names).
    for current_dir, _sub_dirs, names in os.walk(path):
        yield from (os.path.join(current_dir, name) for name in names)
def get_file_suffix_stats(path) -> dict:
    """Count the files under *path* grouped by their suffix.

    Every path produced by ``get_file_list`` is passed to
    ``get_file_suffix`` and the result is used as the counting key.

    Returns a ``defaultdict(int)`` mapping each key to its number of
    occurrences. Because the full file path is passed to
    ``get_file_suffix``, a file for which no suffix is found is keyed by
    its whole lowercased path.
    """
    counts = defaultdict(int)
    for suffix in map(get_file_suffix, get_file_list(path)):
        counts[suffix] += 1
    return counts