Files
crawlab/crawlab/utils/file.py
2019-04-26 12:36:31 +08:00

80 lines
1.8 KiB
Python

import os
import re
from collections import defaultdict
SUFFIX_PATTERN = r'\.([a-zA-Z]{,6})$'
suffix_regex = re.compile(SUFFIX_PATTERN, re.IGNORECASE)
SUFFIX_LANG_MAPPING = {
'py': 'python',
'js': 'javascript',
'sh': 'shell',
'java': 'java',
'c': 'c',
'go': 'go',
}
def get_file_suffix(file_name: str) -> (str, None):
"""
Get suffix of a file
:param file_name:
:return:
"""
file_name = file_name.lower()
m = suffix_regex.search(file_name)
if m is not None:
return m.groups()[0]
else:
return None
def get_file_list(path: str) -> list:
"""
Get a list of files of given directory path
:param path: directory path
"""
for root, dirs, file_names in os.walk(path):
# print(root) # 当前目录路径
# print(dirs) # 当前路径下所有子目录
# print(file_names) # 当前路径下所有非目录子文件
for file_name in file_names:
file_path = os.path.join(root, file_name)
yield file_path
def get_file_suffix_stats(path) -> dict:
"""
Get suffix stats of given file
:param path: file path
"""
_stats = defaultdict(int)
for file_path in get_file_list(path):
suffix = get_file_suffix(file_path)
if suffix is not None:
_stats[suffix] += 1
# only return suffixes with languages
stats = {}
for suffix, count in _stats.items():
if SUFFIX_LANG_MAPPING.get(suffix) is not None:
stats[suffix] = count
return stats
def get_file_content(path) -> dict:
"""
Get file content
:param path: file path
"""
with open(path) as f:
suffix = get_file_suffix(path)
lang = SUFFIX_LANG_MAPPING.get(suffix)
return {
'lang': lang,
'suffix': suffix,
'content': f.read()
}