前后端基本架构和完全excel表的解析及统计图表的生成以及excel表的到出
This commit is contained in:
7
backend/app/core/document_parser/__init__.py
Normal file
7
backend/app/core/document_parser/__init__.py
Normal file
@@ -0,0 +1,7 @@
|
||||
"""
|
||||
文档解析模块 - 支持多种文件格式的解析
|
||||
"""
|
||||
from .base import BaseParser
|
||||
from .xlsx_parser import XlsxParser
|
||||
|
||||
__all__ = ['BaseParser', 'XlsxParser']
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
87
backend/app/core/document_parser/base.py
Normal file
87
backend/app/core/document_parser/base.py
Normal file
@@ -0,0 +1,87 @@
|
||||
"""
|
||||
解析器基类 - 定义所有解析器的通用接口
|
||||
"""
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any, Dict, List, Optional
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
class ParseResult:
    """Container for the outcome of a document-parsing operation."""

    def __init__(
        self,
        success: bool,
        data: Optional[Dict[str, Any]] = None,
        error: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None
    ):
        # `data` / `metadata` fall back to fresh empty dicts so callers
        # never have to guard against None.
        self.success = success
        self.data = data if data else {}
        self.error = error
        self.metadata = metadata if metadata else {}

    def to_dict(self) -> Dict[str, Any]:
        """Serialize this result into a plain dictionary."""
        return dict(
            success=self.success,
            data=self.data,
            error=self.error,
            metadata=self.metadata,
        )
|
||||
|
||||
|
||||
class BaseParser(ABC):
    """Abstract base class defining the interface shared by all parsers."""

    def __init__(self):
        # Subclasses override these with their own extension list and name.
        self.supported_extensions: List[str] = []
        self.parser_name: str = "base_parser"

    @abstractmethod
    def parse(self, file_path: str, **kwargs) -> ParseResult:
        """
        Parse the file at *file_path*.

        Args:
            file_path: Path of the file to parse.
            **kwargs: Parser-specific options.

        Returns:
            ParseResult: The outcome of the parse.
        """
        pass

    def can_parse(self, file_path: str) -> bool:
        """
        Report whether this parser handles the file's extension.

        Args:
            file_path: Path of the candidate file.

        Returns:
            bool: True when the (lower-cased) suffix is supported.
        """
        return Path(file_path).suffix.lower() in self.supported_extensions

    def get_file_info(self, file_path: str) -> Dict[str, Any]:
        """
        Collect basic information about a file on disk.

        Args:
            file_path: Path of the file.

        Returns:
            Dict[str, Any]: filename/extension/size/parser fields, or an
            ``{"error": ...}`` payload when the file does not exist.
        """
        path = Path(file_path)
        if not path.exists():
            return {"error": "File not found"}

        return {
            "filename": path.name,
            "extension": path.suffix.lower(),
            "size": path.stat().st_size,
            "parser": self.parser_name,
        }
|
||||
120
backend/app/core/document_parser/utils.py
Normal file
120
backend/app/core/document_parser/utils.py
Normal file
@@ -0,0 +1,120 @@
|
||||
"""
|
||||
文档解析工具函数
|
||||
"""
|
||||
import re
|
||||
from typing import List, Optional, Dict, Any
|
||||
|
||||
|
||||
def clean_text(text: str) -> str:
    """
    Clean a text string: trim it, collapse whitespace runs, and drop
    non-printable characters.

    Args:
        text: Raw input text (any falsy value yields "").

    Returns:
        str: The cleaned text.
    """
    if not text:
        return ""

    # Trim leading/trailing whitespace.
    text = text.strip()

    # Collapse every run of whitespace (including \n, \r, \t) into a
    # single space.
    text = re.sub(r'\s+', ' ', text)

    # Drop non-printable characters. The whitespace collapse above has
    # already removed all \n/\r/\t, so the old extra allowance for them
    # was dead code and is omitted here.
    return ''.join(char for char in text if char.isprintable())
|
||||
|
||||
|
||||
def chunk_text(
|
||||
text: str,
|
||||
chunk_size: int = 1000,
|
||||
overlap: int = 100
|
||||
) -> List[str]:
|
||||
"""
|
||||
将文本分块
|
||||
|
||||
Args:
|
||||
text: 原始文本
|
||||
chunk_size: 每块的大小(字符数)
|
||||
overlap: 重叠区域的大小
|
||||
|
||||
Returns:
|
||||
List[str]: 文本块列表
|
||||
"""
|
||||
if not text:
|
||||
return []
|
||||
|
||||
chunks = []
|
||||
start = 0
|
||||
text_length = len(text)
|
||||
|
||||
while start < text_length:
|
||||
end = start + chunk_size
|
||||
chunk = text[start:end]
|
||||
chunks.append(chunk)
|
||||
start = end - overlap
|
||||
|
||||
return chunks
|
||||
|
||||
|
||||
def normalize_string(s: Any) -> str:
    """
    Normalize an arbitrary value into a clean string.

    Args:
        s: Value to normalize; may be None, a number, a string, or any
            other object.

    Returns:
        str: "" for None, the cleaned text for strings, and the plain
        str() form for everything else (numbers included).
    """
    if s is None:
        return ""
    if isinstance(s, str):
        return clean_text(s)
    # Numbers and any other object fall back to their str() form.
    return str(s)
|
||||
|
||||
|
||||
def detect_encoding(file_path: str, sample_size: int = 10000) -> Optional[str]:
    """
    Detect a file's character encoding (simplified, via chardet).

    Args:
        file_path: Path of the file to inspect.
        sample_size: Number of leading bytes to sample for detection
            (previously a hard-coded 10000).

    Returns:
        Optional[str]: The detected encoding name, or None when the file
        cannot be read or detection yields nothing.
    """
    import chardet  # local import: keeps chardet an optional dependency

    try:
        with open(file_path, 'rb') as f:
            # Only a prefix of the file is needed for detection.
            raw_data = f.read(sample_size)
        result = chardet.detect(raw_data)
        return result.get('encoding')
    except Exception:
        # Best-effort helper: any I/O or detection error maps to None.
        return None
|
||||
|
||||
|
||||
def safe_get(d: Dict[str, Any], key: str, default: Any = None) -> Any:
    """
    Look up *key* in *d*, falling back to *default* on any failure —
    including *d* not being a mapping at all (e.g. None).

    Args:
        d: The (expected) dictionary.
        key: Key to look up.
        default: Value returned when the lookup fails.

    Returns:
        Any: The stored value, or *default*.
    """
    try:
        value = d.get(key, default)
    except Exception:
        # e.g. d is None or has no .get method
        return default
    return value
|
||||
288
backend/app/core/document_parser/xlsx_parser.py
Normal file
288
backend/app/core/document_parser/xlsx_parser.py
Normal file
@@ -0,0 +1,288 @@
|
||||
"""
|
||||
Excel 文件解析器 - 解析 .xlsx 和 .xls 文件
|
||||
"""
|
||||
from typing import Any, Dict, List, Optional
|
||||
from pathlib import Path
|
||||
import pandas as pd
|
||||
import logging
|
||||
|
||||
from .base import BaseParser, ParseResult
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class XlsxParser(BaseParser):
    """Excel file parser for .xlsx / .xls files."""

    def __init__(self):
        super().__init__()
        self.supported_extensions = ['.xlsx', '.xls']
        self.parser_name = "excel_parser"

    def _validate_file(self, path: Path, file_path: str) -> Optional[ParseResult]:
        """Pre-flight checks shared by parse() and parse_all_sheets().

        Returns a failed ParseResult describing the problem, or None when
        the file exists, has a supported extension, and is non-empty.
        """
        if not path.exists():
            return ParseResult(
                success=False,
                error=f"File not found: {file_path}"
            )
        if path.suffix.lower() not in self.supported_extensions:
            return ParseResult(
                success=False,
                error=f"Unsupported file type: {path.suffix}"
            )
        if path.stat().st_size == 0:
            return ParseResult(
                success=False,
                error=f"File is empty: {file_path}"
            )
        return None

    @staticmethod
    def _resolve_sheet(sheet_name: Optional[str | int], sheet_names: List[str]) -> str:
        """Map a requested sheet index/name onto an existing sheet name.

        Invalid or missing requests silently fall back to the first sheet,
        preserving the original lenient behavior. Note the original int
        check had no lower bound, so negative indices still select from
        the end of the list.
        """
        if isinstance(sheet_name, int) and sheet_name < len(sheet_names):
            return sheet_names[sheet_name]
        if isinstance(sheet_name, str) and sheet_name in sheet_names:
            return sheet_name
        return sheet_names[0]

    @staticmethod
    def _build_metadata(
        path: Path,
        sheet_names: List[str],
        current_sheet: str,
        df: pd.DataFrame
    ) -> Dict[str, Any]:
        """Assemble per-sheet metadata for a successful parse()."""
        return {
            "filename": path.name,
            "extension": path.suffix.lower(),
            "sheet_count": len(sheet_names),
            "sheet_names": sheet_names,
            "current_sheet": current_sheet,
            "row_count": len(df),
            "column_count": len(df.columns) if not df.empty else 0,
            "columns": df.columns.tolist() if not df.empty else [],
            "file_size": path.stat().st_size
        }

    def parse(
        self,
        file_path: str,
        sheet_name: Optional[str | int] = 0,
        header_row: int = 0,
        **kwargs
    ) -> ParseResult:
        """
        Parse a single sheet of an Excel file.

        Args:
            file_path: Path of the file.
            sheet_name: Sheet name or index; defaults to the first sheet.
                Invalid values fall back to the first sheet.
            header_row: Row index containing the header, default 0.
            **kwargs: Extra arguments forwarded to pandas.read_excel.

        Returns:
            ParseResult: Parsed data and metadata, or an error result.
        """
        path = Path(file_path)

        error = self._validate_file(path, file_path)
        if error is not None:
            return error

        try:
            # Context manager closes the underlying file handle — the
            # original leaked it, which can keep the file locked on Windows.
            with pd.ExcelFile(file_path) as xls_file:
                sheet_names = xls_file.sheet_names

            if not sheet_names:
                return ParseResult(
                    success=False,
                    error=f"Excel 文件没有找到任何工作表: {file_path}"
                )

            target_sheet = self._resolve_sheet(sheet_name, sheet_names)

            df = pd.read_excel(
                file_path,
                sheet_name=target_sheet,
                header=header_row,
                **kwargs
            )

            if df.empty:
                return ParseResult(
                    success=False,
                    error=f"工作表 '{target_sheet}' 为空,请检查 Excel 文件内容"
                )

            return ParseResult(
                success=True,
                data=self._df_to_dict(df),
                metadata=self._build_metadata(path, sheet_names, target_sheet, df)
            )

        except Exception as e:
            # NOTE: _resolve_sheet always yields a valid existing sheet
            # name, so the former ~40-line `except IndexError` retry path
            # was unreachable and has been removed.
            logger.error(f"解析 Excel 文件时出错: {str(e)}")
            return ParseResult(
                success=False,
                error=f"Failed to parse Excel file: {str(e)}"
            )

    def parse_all_sheets(self, file_path: str, **kwargs) -> ParseResult:
        """
        Parse every sheet of an Excel file.

        Args:
            file_path: Path of the file.
            **kwargs: Extra arguments forwarded to pandas.read_excel.

        Returns:
            ParseResult: ``{"sheets": {name: sheet_dict}}`` data plus
            workbook-level metadata, or an error result.
        """
        path = Path(file_path)

        error = self._validate_file(path, file_path)
        if error is not None:
            return error

        try:
            # sheet_name=None makes pandas return {sheet_name: DataFrame}.
            all_data = pd.read_excel(file_path, sheet_name=None, **kwargs)

            if not all_data:
                return ParseResult(
                    success=False,
                    error=f"无法读取 Excel 文件或文件为空: {file_path}"
                )

            sheets_data = {
                name: self._df_to_dict(df) for name, df in all_data.items()
            }
            all_sheets = list(all_data.keys())

            metadata = {
                "filename": path.name,
                "extension": path.suffix.lower(),
                "sheet_count": len(all_sheets),
                "sheet_names": all_sheets,
                "total_rows": sum(len(df) for df in all_data.values()),
                "file_size": path.stat().st_size
            }

            return ParseResult(
                success=True,
                data={"sheets": sheets_data},
                metadata=metadata
            )

        except Exception as e:
            logger.error(f"Failed to parse Excel file: {str(e)}")
            return ParseResult(
                success=False,
                error=f"Failed to parse Excel file: {str(e)}"
            )

    def _get_sheet_names(self, file_path: str) -> List[str]:
        """Return all sheet names in the workbook, or [] on any error."""
        try:
            # Context manager ensures the handle is closed (was leaked).
            with pd.ExcelFile(file_path) as xls:
                return list(xls.sheet_names)
        except Exception as e:
            logger.error(f"获取工作表名称失败: {str(e)}")
            return []

    def _df_to_dict(self, df: pd.DataFrame) -> Dict[str, Any]:
        """
        Convert a DataFrame into a JSON-serializable dict, mapping NaN/NA
        cells to None.

        Args:
            df: Source DataFrame.

        Returns:
            Dict[str, Any]: columns, rows (one dict per row), and counts.
        """
        # Replace missing values so serialization emits null, not NaN.
        df = df.replace({pd.NA: None, float('nan'): None})

        rows = df.to_dict(orient='records')

        return {
            "columns": df.columns.tolist(),
            "rows": rows,
            "row_count": len(rows),
            "column_count": len(df.columns) if not df.empty else 0
        }
|
||||
Reference in New Issue
Block a user