121 lines
2.3 KiB
Python
121 lines
2.3 KiB
Python
"""
|
|
文档解析工具函数
|
|
"""
|
|
import re
|
|
from typing import List, Optional, Dict, Any
|
|
|
|
|
|
def clean_text(text: str) -> str:
    """
    Clean text by dropping non-printable characters and normalizing whitespace.

    Non-printable characters that are not whitespace (such as control codes)
    are removed first. Whitespace is then collapsed and trimmed, so deleting
    such a character can never leave a doubled space or a stray
    leading/trailing space behind.

    Args:
        text: Raw text. Any falsy value (``None``, ``""``) yields ``""``.

    Returns:
        str: The cleaned text, with every whitespace run (newlines and tabs
        included) replaced by a single space and no surrounding whitespace.
    """
    if not text:
        return ""

    # Whitespace such as "\n" or "\t" is reported as non-printable, but it
    # still separates words, so keep it here and normalize it below.
    visible = ''.join(
        char for char in text if char.isprintable() or char.isspace()
    )

    # Collapse each whitespace run into one space, then trim both ends.
    return re.sub(r'\s+', ' ', visible).strip()
|
|
|
|
|
|
def chunk_text(
    text: str,
    chunk_size: int = 1000,
    overlap: int = 100
) -> List[str]:
    """
    Split text into fixed-size chunks that overlap their neighbours.

    Consecutive chunks start ``chunk_size - overlap`` characters apart.
    Chunking stops as soon as a chunk reaches the end of the text, so no
    trailing chunk consisting only of already-emitted overlap is produced.

    Args:
        text: Text to split.
        chunk_size: Maximum size of each chunk, in characters. Must be > 0.
        overlap: Number of characters shared by adjacent chunks. Must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        List[str]: The chunks in order; an empty list for empty text.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    # Reject arguments for which the window could never move forward; the
    # previous implementation looped forever on them.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap < 0 or overlap >= chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    if not text:
        return []

    text_length = len(text)
    step = chunk_size - overlap
    chunks = []
    for start in range(0, text_length, step):
        end = start + chunk_size
        chunks.append(text[start:end])
        # This chunk already covers the tail of the text; another window
        # would only repeat characters that have been emitted.
        if end >= text_length:
            break

    return chunks
|
|
|
|
|
|
def normalize_string(s: Any) -> str:
    """
    Convert an arbitrary value to a normalized string.

    Args:
        s: Value to normalize.

    Returns:
        str: ``""`` for ``None``; the result of ``clean_text`` for strings;
        ``str(s)`` for numbers and every other value.
    """
    if s is None:
        return ""

    # Only real strings go through text cleaning; numbers and all other
    # objects are rendered with str() as-is.
    return clean_text(s) if isinstance(s, str) else str(s)
|
|
|
|
|
|
def detect_encoding(file_path: str) -> Optional[str]:
    """
    Guess a file's encoding from its leading bytes (simplified).

    The ``chardet`` import happens before the guarded section, so a missing
    ``chardet`` package raises ``ImportError`` instead of returning ``None``.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        Optional[str]: The ``'encoding'`` entry of chardet's detection
        result, or ``None`` when reading the file or detection raises.
    """
    import chardet

    sample_size = 10000  # only the first 10000 bytes are examined

    try:
        with open(file_path, 'rb') as handle:
            sample = handle.read(sample_size)
        return chardet.detect(sample).get('encoding')
    except Exception:
        # Best effort: any failure is reported as "unknown encoding".
        return None
|
|
|
|
|
|
def safe_get(d: Dict[str, Any], key: str, default: Any = None) -> Any:
    """
    Look up a key in a dictionary without ever raising.

    Args:
        d: Dictionary to read from.
        key: Key to look up.
        default: Value returned when the key is absent or the lookup fails.

    Returns:
        Any: The stored value, or ``default`` if the key is missing or
        ``d.get`` raises (for example when ``d`` is not a dictionary).
    """
    try:
        value = d.get(key, default)
    except Exception:
        # Any lookup failure falls back to the caller's default.
        value = default
    return value
|