前后端基本架构,以及完成 Excel 表的解析、统计图表的生成和 Excel 表的导出

This commit is contained in:
2026-03-19 01:51:34 +08:00
parent c23b93bb70
commit 2f630695ff
194 changed files with 23354 additions and 174 deletions

View File

@@ -0,0 +1,7 @@
"""
Document parsing package - parsers for multiple file formats.
"""
from .base import BaseParser
from .xlsx_parser import XlsxParser
__all__ = ['BaseParser', 'XlsxParser']

View File

@@ -0,0 +1,87 @@
"""
Parser base classes - defines the interface shared by all parsers.
"""
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional
from pathlib import Path
class ParseResult:
    """Outcome of a single parse attempt.

    Holds four public fields: ``success`` (bool), ``data`` (dict),
    ``error`` (message string or None) and ``metadata`` (dict).
    """

    def __init__(
        self,
        success: bool,
        data: Optional[Dict[str, Any]] = None,
        error: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None
    ):
        """Store the result fields.

        Args:
            success: Whether parsing succeeded.
            data: Parsed payload; a falsy value is replaced by a new dict.
            error: Error message, if any.
            metadata: Extra information; a falsy value is replaced by a
                new dict.
        """
        self.success = success
        self.error = error
        # Falsy inputs (None or an empty dict) become a fresh empty dict.
        self.data = data if data else {}
        self.metadata = metadata if metadata else {}

    def to_dict(self) -> Dict[str, Any]:
        """Return the four fields as a plain dictionary."""
        field_names = ("success", "data", "error", "metadata")
        return {name: getattr(self, name) for name in field_names}
class BaseParser(ABC):
    """Abstract base class for document parsers.

    Subclasses set ``supported_extensions`` (lower-case suffixes including
    the leading dot) and ``parser_name``, and implement ``parse``.
    """

    def __init__(self):
        """Initialise with no supported extensions and a default name."""
        self.supported_extensions: List[str] = []
        self.parser_name: str = "base_parser"

    @abstractmethod
    def parse(self, file_path: str, **kwargs) -> ParseResult:
        """Parse a file.

        Args:
            file_path: Path of the file to parse.
            **kwargs: Additional parser-specific options.

        Returns:
            ParseResult: The outcome of the parse.
        """
        pass

    def can_parse(self, file_path: str) -> bool:
        """Tell whether the file's extension is supported by this parser.

        Args:
            file_path: Path of the file to check.

        Returns:
            bool: True when the lower-cased suffix is listed in
            ``supported_extensions``.
        """
        suffix = Path(file_path).suffix
        return suffix.lower() in self.supported_extensions

    def get_file_info(self, file_path: str) -> Dict[str, Any]:
        """Collect basic information about a file.

        Args:
            file_path: Path of the file to inspect.

        Returns:
            Dict[str, Any]: ``filename``, ``extension``, ``size`` and
            ``parser`` when the path exists; otherwise a dict holding only
            an ``error`` entry.
        """
        target = Path(file_path)
        if target.exists():
            return {
                "filename": target.name,
                "extension": target.suffix.lower(),
                "size": target.stat().st_size,
                "parser": self.parser_name
            }
        return {"error": "File not found"}

View File

@@ -0,0 +1,120 @@
"""
Utility functions for document parsing.
"""
import re
from typing import List, Optional, Dict, Any
def clean_text(text: str) -> str:
    """Normalize whitespace and drop non-printable characters.

    Non-printable, non-whitespace characters (control and format
    characters) are removed first; every run of whitespace is then
    collapsed to a single space and the result is stripped. Filtering
    before collapsing guarantees the output never contains doubled,
    leading or trailing spaces left behind by a removed character.

    Args:
        text: Raw text. A falsy value (``None`` or ``""``) yields ``""``.

    Returns:
        str: The cleaned, single-line text.
    """
    if not text:
        return ""
    # Keep whitespace at this stage so it still separates the words that
    # surrounded a removed character.
    kept = ''.join(ch for ch in text if ch.isprintable() or ch.isspace())
    return re.sub(r'\s+', ' ', kept).strip()
def chunk_text(
    text: str,
    chunk_size: int = 1000,
    overlap: int = 100
) -> List[str]:
    """Split text into fixed-size chunks that overlap their neighbours.

    Consecutive chunks start ``chunk_size - overlap`` characters apart.
    Chunking stops as soon as a chunk reaches the end of the text, so no
    trailing chunk is emitted that would lie entirely inside the previous
    chunk's overlap region.

    Args:
        text: Text to split. A falsy value yields an empty list.
        chunk_size: Maximum number of characters per chunk; must be > 0.
        overlap: Number of characters shared by adjacent chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        List[str]: The chunks in order.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. These
            values would otherwise keep the start position from advancing
            and loop forever.
    """
    if not text:
        return []
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    text_length = len(text)
    step = chunk_size - overlap
    chunks = []
    for start in range(0, text_length, step):
        end = start + chunk_size
        chunks.append(text[start:end])
        if end >= text_length:
            # This chunk already covers the tail of the text.
            break
    return chunks
def normalize_string(s: Any) -> str:
    """Convert an arbitrary value to a normalized string.

    Args:
        s: Input value of any type.

    Returns:
        str: ``""`` for ``None``; the result of ``clean_text`` for strings;
        ``str(s)`` for everything else, numbers included.
    """
    if s is None:
        return ""
    if isinstance(s, str):
        return clean_text(s)
    # Numbers and all remaining types share the plain str() conversion.
    return str(s)
def detect_encoding(file_path: str) -> Optional[str]:
    """Guess a file's text encoding (simplified detection).

    Only the first 10000 bytes of the file are passed to ``chardet``.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        Optional[str]: The ``encoding`` entry of chardet's result, or
        ``None`` when reading or detection raises.
    """
    # Imported outside the try block, so a missing chardet still raises.
    import chardet

    try:
        with open(file_path, 'rb') as handle:
            sample = handle.read(10000)
        return chardet.detect(sample).get('encoding')
    except Exception:
        return None
def safe_get(d: Dict[str, Any], key: str, default: Any = None) -> Any:
    """Look up a key without ever raising.

    Args:
        d: Object expected to provide a dict-style ``get`` method.
        key: Key to look up.
        default: Value returned when the key is absent or the lookup fails.

    Returns:
        Any: ``d.get(key, default)``, or ``default`` if that call raises
        (for example when ``d`` is ``None`` or ``key`` is unhashable).
    """
    try:
        value = d.get(key, default)
    except Exception:
        value = default
    return value

View File

@@ -0,0 +1,288 @@
"""
Excel file parser - parses .xlsx and .xls files.
"""
from typing import Any, Dict, List, Optional
from pathlib import Path
import pandas as pd
import logging
from .base import BaseParser, ParseResult
logger = logging.getLogger(__name__)
class XlsxParser(BaseParser):
    """Parser for Excel workbooks (``.xlsx`` / ``.xls``) built on pandas.

    Both public entry points report problems through a failed
    ``ParseResult`` rather than raising for missing, unsupported, empty or
    unreadable files.
    """

    def __init__(self):
        """Register the supported extensions and the parser name."""
        super().__init__()
        self.supported_extensions = ['.xlsx', '.xls']
        self.parser_name = "excel_parser"

    def parse(
        self,
        file_path: str,
        sheet_name: Optional[str | int] = 0,
        header_row: int = 0,
        **kwargs
    ) -> ParseResult:
        """Parse one worksheet of an Excel file.

        Args:
            file_path: Path of the workbook.
            sheet_name: Worksheet name or index (negative indexes count
                from the end). ``None`` or a name/index that does not
                exist falls back to the first worksheet.
            header_row: Row index used as the header; defaults to 0.
            **kwargs: Extra arguments forwarded to ``pandas.read_excel``.

        Returns:
            ParseResult: On success ``data`` holds the output of
            ``_df_to_dict`` and ``metadata`` describes the file and the
            sheet that was read. On failure ``error`` carries the reason.
        """
        path = Path(file_path)
        problem = self._preflight_error(path, file_path)
        if problem is not None:
            return ParseResult(success=False, error=problem)
        file_size = path.stat().st_size

        try:
            # The context manager releases the workbook handle as soon as
            # the sheet names are known, including on error paths.
            with pd.ExcelFile(file_path) as workbook:
                sheet_names = list(workbook.sheet_names)
            if not sheet_names:
                return ParseResult(
                    success=False,
                    error=f"Excel 文件没有找到任何工作表: {file_path}"
                )

            # Resolving the sheet up front means an out-of-range index can
            # never surface as an IndexError later on.
            target_sheet = self._resolve_sheet(sheet_name, sheet_names)

            df = pd.read_excel(
                file_path,
                sheet_name=target_sheet,
                header=header_row,
                **kwargs
            )
            if df.empty:
                return ParseResult(
                    success=False,
                    error=f"工作表 '{target_sheet}' 为空,请检查 Excel 文件内容"
                )

            metadata = {
                "filename": path.name,
                "extension": path.suffix.lower(),
                "sheet_count": len(sheet_names),
                "sheet_names": sheet_names,
                "current_sheet": target_sheet,
                "row_count": len(df),
                "column_count": len(df.columns),
                "columns": df.columns.tolist(),
                "file_size": file_size
            }
            return ParseResult(
                success=True,
                data=self._df_to_dict(df),
                metadata=metadata
            )
        except Exception as e:
            logger.exception("解析 Excel 文件时出错: %s", e)
            return ParseResult(
                success=False,
                error=f"Failed to parse Excel file: {str(e)}"
            )

    def parse_all_sheets(self, file_path: str, **kwargs) -> ParseResult:
        """Parse every worksheet of an Excel file.

        Args:
            file_path: Path of the workbook.
            **kwargs: Extra arguments forwarded to ``pandas.read_excel``.

        Returns:
            ParseResult: On success ``data["sheets"]`` maps each sheet name
            to the output of ``_df_to_dict`` and ``metadata`` summarises
            the workbook. On failure ``error`` carries the reason.
        """
        path = Path(file_path)
        problem = self._preflight_error(path, file_path)
        if problem is not None:
            return ParseResult(success=False, error=problem)
        file_size = path.stat().st_size

        try:
            # sheet_name=None makes pandas return a {sheet name: DataFrame}
            # mapping covering the whole workbook.
            all_data = pd.read_excel(file_path, sheet_name=None, **kwargs)
            if not all_data:
                return ParseResult(
                    success=False,
                    error=f"无法读取 Excel 文件或文件为空: {file_path}"
                )

            sheets_data = {
                name: self._df_to_dict(frame)
                for name, frame in all_data.items()
            }
            all_sheets = list(all_data)
            metadata = {
                "filename": path.name,
                "extension": path.suffix.lower(),
                "sheet_count": len(all_sheets),
                "sheet_names": all_sheets,
                "total_rows": sum(len(frame) for frame in all_data.values()),
                "file_size": file_size
            }
            return ParseResult(
                success=True,
                data={"sheets": sheets_data},
                metadata=metadata
            )
        except Exception as e:
            logger.exception("Failed to parse Excel file: %s", e)
            return ParseResult(
                success=False,
                error=f"Failed to parse Excel file: {str(e)}"
            )

    def _preflight_error(self, path: Path, file_path: str) -> Optional[str]:
        """Return why the file cannot be parsed, or None if it looks usable."""
        if not path.exists():
            return f"File not found: {file_path}"
        if path.suffix.lower() not in self.supported_extensions:
            return f"Unsupported file type: {path.suffix}"
        if path.stat().st_size == 0:
            return f"File is empty: {file_path}"
        return None

    @staticmethod
    def _resolve_sheet(
        sheet_name: Optional[str | int],
        sheet_names: List[str]
    ) -> str:
        """Map a requested sheet name or index onto an existing sheet name."""
        if isinstance(sheet_name, str) and sheet_name in sheet_names:
            return sheet_name
        if (
            isinstance(sheet_name, int)
            and -len(sheet_names) <= sheet_name < len(sheet_names)
        ):
            return sheet_names[sheet_name]
        if sheet_name is not None:
            logger.warning(
                "Requested sheet %r not found; falling back to %r",
                sheet_name,
                sheet_names[0],
            )
        return sheet_names[0]

    def _get_sheet_names(self, file_path: str) -> List[str]:
        """Return the workbook's sheet names, or an empty list on failure."""
        try:
            with pd.ExcelFile(file_path) as workbook:
                return list(workbook.sheet_names)
        except Exception as e:
            logger.error(f"获取工作表名称失败: {str(e)}")
            return []

    def _df_to_dict(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Convert a DataFrame into a dict of columns and row records.

        Every missing value (``NaN``, ``NaT`` and ``pd.NA``) becomes
        ``None``.

        Args:
            df: The DataFrame to convert.

        Returns:
            Dict[str, Any]: ``columns`` (list of column labels), ``rows``
            (one dict per row), ``row_count`` and ``column_count``.
        """
        # Casting to object first lets every column hold None; where()
        # then swaps each missing cell for None in a single pass.
        cleaned = df.astype(object).where(df.notna(), None)
        rows = cleaned.to_dict(orient='records')
        return {
            "columns": df.columns.tolist(),
            "rows": rows,
            "row_count": len(rows),
            # Always agrees with "columns", even for a header-only sheet.
            "column_count": len(df.columns)
        }