mirror of
https://github.com/crawlab-team/crawlab.git
synced 2026-01-25 17:42:25 +01:00
80 lines
1.8 KiB
Python
80 lines
1.8 KiB
Python
import os
|
|
import re
|
|
from collections import defaultdict
|
|
|
|
SUFFIX_PATTERN = r'\.([a-zA-Z]{,6})$'
|
|
suffix_regex = re.compile(SUFFIX_PATTERN, re.IGNORECASE)
|
|
|
|
SUFFIX_LANG_MAPPING = {
|
|
'py': 'python',
|
|
'js': 'javascript',
|
|
'sh': 'shell',
|
|
'java': 'java',
|
|
'c': 'c',
|
|
'go': 'go',
|
|
}
|
|
|
|
|
|
def get_file_suffix(file_name: str) -> (str, None):
|
|
"""
|
|
Get suffix of a file
|
|
:param file_name:
|
|
:return:
|
|
"""
|
|
file_name = file_name.lower()
|
|
m = suffix_regex.search(file_name)
|
|
if m is not None:
|
|
return m.groups()[0]
|
|
else:
|
|
return None
|
|
|
|
|
|
def get_file_list(path: str) -> list:
|
|
"""
|
|
Get a list of files of given directory path
|
|
:param path: directory path
|
|
"""
|
|
for root, dirs, file_names in os.walk(path):
|
|
# print(root) # 当前目录路径
|
|
# print(dirs) # 当前路径下所有子目录
|
|
# print(file_names) # 当前路径下所有非目录子文件
|
|
|
|
for file_name in file_names:
|
|
file_path = os.path.join(root, file_name)
|
|
yield file_path
|
|
|
|
|
|
def get_file_suffix_stats(path) -> dict:
|
|
"""
|
|
Get suffix stats of given file
|
|
:param path: file path
|
|
"""
|
|
_stats = defaultdict(int)
|
|
for file_path in get_file_list(path):
|
|
suffix = get_file_suffix(file_path)
|
|
if suffix is not None:
|
|
_stats[suffix] += 1
|
|
|
|
# only return suffixes with languages
|
|
stats = {}
|
|
for suffix, count in _stats.items():
|
|
if SUFFIX_LANG_MAPPING.get(suffix) is not None:
|
|
stats[suffix] = count
|
|
|
|
return stats
|
|
|
|
|
|
def get_file_content(path) -> dict:
|
|
"""
|
|
Get file content
|
|
:param path: file path
|
|
"""
|
|
with open(path) as f:
|
|
suffix = get_file_suffix(path)
|
|
lang = SUFFIX_LANG_MAPPING.get(suffix)
|
|
return {
|
|
'lang': lang,
|
|
'suffix': suffix,
|
|
'content': f.read()
|
|
}
|