- 在analyze_markdown、analyze_markdown_stream和get_markdown_outline函数中添加了 try-catch块来处理临时文件清理过程中的异常 - 将/analyze/md/outline接口从GET方法改为POST方法以支持文件上传 - 确保在所有情况下都能正确清理临时文件,并记录清理失败的日志 refactor(health): 改进健康检查逻辑验证实际数据库连接 - 修改MySQL健康检查,实际执行SELECT 1查询来验证连接 - 修改MongoDB健康检查,执行ping命令来验证连接 - 修改Redis健康检查,执行ping命令来验证连接 - 添加异常捕获并记录具体的错误日志 refactor(upload): 使用os.path.basename优化文件名提取 - 替换手动字符串分割为os.path.basename来获取文件名 - 统一Excel上传和导出中文件名的处理方式 feat(instruction): 新增指令执行框架模块 - 创建instruction包包含意图解析和指令执行的基础架构 - 添加IntentParser和InstructionExecutor抽象基类 - 提供默认实现但标记为未完成,为未来功能扩展做准备 refactor(frontend): 调整AuthContext导入路径并移除重复文件 - 将AuthContext从src/context移动到src/contexts目录 - 更新App.tsx和RouteGuard.tsx中的导入路径 - 移除旧的AuthContext.tsx文件 fix(backend-api): 修复AI分析API的HTTP方法错误 - 将aiApi中的fetch请求方法从GET改为POST以支持文件上传
276 lines
10 KiB
Python
276 lines
10 KiB
Python
"""
|
||
文件上传 API 接口
|
||
"""
|
||
from typing import Optional
from urllib.parse import quote
import io
import logging
import os

import pandas as pd
from fastapi import APIRouter, UploadFile, File, HTTPException, Query
from fastapi.responses import StreamingResponse

from app.services.file_service import file_service
from app.core.document_parser import XlsxParser
from app.services.table_rag_service import table_rag_service
from app.core.database import mongodb
|
||
logger = logging.getLogger(__name__)

router = APIRouter(prefix="/upload", tags=["文件上传"])

# Shared Excel parser instance used by every endpoint in this router.
excel_parser = XlsxParser()
def _sheet_preview_text(sheet_label, columns, rows) -> str:
    """Render one worksheet as comma-separated preview text.

    Emits the sheet label, the header row, up to the first 100 data rows,
    and a trailing total-row-count marker. Rows may be dicts keyed by
    column name or plain lists of cell values; anything else is skipped.
    """
    text = f"Sheet: {sheet_label}\n"
    text += ", ".join(str(h) for h in columns) + "\n"
    for row in rows[:100]:
        if isinstance(row, dict):
            text += ", ".join(str(row.get(col, "")) for col in columns) + "\n"
        elif isinstance(row, list):
            text += ", ".join(str(cell) for cell in row) + "\n"
    text += f"... (共 {len(rows)} 行)\n\n"
    return text


@router.post("/excel")
async def upload_excel(
    file: UploadFile = File(...),
    parse_all_sheets: bool = Query(False, description="是否解析所有工作表"),
    sheet_name: Optional[str] = Query(None, description="指定解析的工作表名称"),
    header_row: int = Query(0, description="表头所在的行索引")
):
    """
    Upload and parse an Excel file, then persist it to MySQL and MongoDB.

    Args:
        file: the uploaded Excel file
        parse_all_sheets: whether to parse every worksheet
        sheet_name: name of a specific worksheet to parse (optional)
        header_row: index of the header row

    Returns:
        dict: the parse result

    Raises:
        HTTPException: 400 for a missing filename or unsupported extension,
            500 when parsing fails. Storage failures (MySQL/MongoDB) are
            best-effort and only logged.
    """
    # Validate the file type before reading anything.
    if not file.filename:
        raise HTTPException(status_code=400, detail="文件名为空")

    file_ext = file.filename.split('.')[-1].lower()
    if file_ext not in ['xlsx', 'xls']:
        raise HTTPException(
            status_code=400,
            detail=f"不支持的文件类型: {file_ext},仅支持 .xlsx 和 .xls"
        )

    try:
        # Read the raw upload once and keep it under its own name: the old
        # code rebound `content` to the preview text, so file_size below
        # recorded the preview-text length instead of the real byte size.
        raw_bytes = await file.read()

        # Persist the upload to disk.
        saved_path = file_service.save_uploaded_file(
            raw_bytes,
            file.filename,
            subfolder="excel"
        )

        logger.info(f"文件已保存: {saved_path}")

        # Parse the saved workbook (all sheets, a named sheet, or the default).
        if parse_all_sheets:
            result = excel_parser.parse_all_sheets(saved_path)
        elif sheet_name:
            result = excel_parser.parse(saved_path, sheet_name=sheet_name, header_row=header_row)
        else:
            result = excel_parser.parse(saved_path, header_row=header_row)

        # Record where the file ended up in the parse metadata.
        if result.metadata:
            result.metadata['saved_path'] = saved_path
            result.metadata['original_filename'] = file.filename

        # Best effort: mirror the table into MySQL for table-RAG queries.
        try:
            store_result = await table_rag_service.build_table_rag_index(
                file_path=saved_path,
                filename=file.filename,
                sheet_name=sheet_name if sheet_name else None,
                header_row=header_row
            )
            if store_result.get("success"):
                result.metadata['mysql_table'] = store_result.get('table_name')
                result.metadata['row_count'] = store_result.get('row_count')
                logger.info(f"Excel已存储到MySQL: {file.filename}, 表: {store_result.get('table_name')}")
            else:
                logger.warning(f"Excel存储到MySQL失败: {store_result.get('error')}")
        except Exception as e:
            logger.error(f"Excel存储到MySQL异常: {str(e)}", exc_info=True)

        # Best effort: store a text preview in MongoDB for document listing.
        try:
            preview_text = ""
            if result.data:
                if isinstance(result.data, dict):
                    # Single-sheet shape: {columns, rows, ...}
                    if 'columns' in result.data and 'rows' in result.data:
                        label = result.metadata.get('current_sheet', 'Sheet1') if result.metadata else 'Sheet1'
                        preview_text += _sheet_preview_text(
                            label, result.data['columns'], result.data['rows']
                        )
                    # Multi-sheet shape: {sheets: {sheet_name: {columns, rows}}}
                    elif 'sheets' in result.data:
                        for sheet_key, sheet_data in result.data['sheets'].items():
                            if isinstance(sheet_data, dict) and 'columns' in sheet_data and 'rows' in sheet_data:
                                preview_text += _sheet_preview_text(
                                    sheet_key, sheet_data['columns'], sheet_data['rows']
                                )

            doc_metadata = {
                "filename": os.path.basename(saved_path),
                "original_filename": file.filename,
                "saved_path": saved_path,
                # Fix: report the uploaded file's byte size, not the length
                # of the generated preview text.
                "file_size": len(raw_bytes),
                "row_count": result.metadata.get('row_count', 0) if result.metadata else 0,
                "column_count": result.metadata.get('column_count', 0) if result.metadata else 0,
                "columns": result.metadata.get('columns', []) if result.metadata else [],
                "mysql_table": result.metadata.get('mysql_table') if result.metadata else None,
                "sheet_count": result.metadata.get('sheet_count', 1) if result.metadata else 1,
            }
            await mongodb.insert_document(
                doc_type="xlsx",
                content=preview_text,
                metadata=doc_metadata,
                structured_data=result.data if result.data else None
            )
            logger.info(f"Excel文档已存储到MongoDB: {file.filename}, content长度: {len(preview_text)}")
        except Exception as e:
            logger.error(f"Excel存储到MongoDB异常: {str(e)}", exc_info=True)

        return result.to_dict()

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"解析 Excel 文件时出错: {str(e)}")
        raise HTTPException(status_code=500, detail=f"解析失败: {str(e)}")
@router.get("/excel/preview/{file_path:path}")
async def get_excel_preview(
    file_path: str,
    sheet_name: Optional[str] = Query(None, description="工作表名称"),
    max_rows: int = Query(10, description="最多返回的行数", ge=1, le=100)
):
    """
    Return preview data for an Excel file.

    Args:
        file_path: path of the file on disk
        sheet_name: worksheet name (defaults to the first sheet)
        max_rows: maximum number of rows to return (1-100)

    Returns:
        dict: the preview data

    Raises:
        HTTPException: 500 when the preview cannot be produced.
    """
    try:
        # Fall back to the first worksheet (index 0) when no name is given.
        preview = excel_parser.get_sheet_preview(
            file_path,
            sheet_name=sheet_name if sheet_name else 0,
            max_rows=max_rows
        )
        return preview.to_dict()

    except Exception as e:
        logger.error(f"获取预览数据时出错: {str(e)}")
        raise HTTPException(status_code=500, detail=f"获取预览失败: {str(e)}")
@router.delete("/file")
async def delete_uploaded_file(file_path: str = Query(..., description="要删除的文件路径")):
    """
    Delete a previously uploaded file.

    Args:
        file_path: path of the file to remove

    Returns:
        dict: success flag plus a human-readable message

    Raises:
        HTTPException: 500 when deletion raises unexpectedly.
    """
    try:
        if file_service.delete_file(file_path):
            return {"success": True, "message": "文件删除成功"}
        return {"success": False, "message": "文件不存在或删除失败"}

    except Exception as e:
        logger.error(f"删除文件时出错: {str(e)}")
        raise HTTPException(status_code=500, detail=f"删除失败: {str(e)}")
@router.get("/excel/export/{file_path:path}")
async def export_excel(
    file_path: str,
    sheet_name: Optional[str] = Query(None, description="工作表名称"),
    columns: Optional[str] = Query(None, description="要导出的列,逗号分隔")
):
    """
    Export an Excel file, optionally restricted to one sheet and a column subset.

    Args:
        file_path: path of the source file on disk
        sheet_name: worksheet name (optional; first sheet by default)
        columns: comma-separated column names to export (optional)

    Returns:
        StreamingResponse: the generated Excel file as an attachment

    Raises:
        HTTPException: 404 when the source file does not exist,
            500 for any other export failure.
    """
    try:
        # Read the requested worksheet (first sheet when none is given).
        if sheet_name:
            df = pd.read_excel(file_path, sheet_name=sheet_name)
        else:
            df = pd.read_excel(file_path)

        # Restrict to the requested columns, silently dropping names
        # that don't exist in the sheet.
        if columns:
            column_list = [col.strip() for col in columns.split(',')]
            available_columns = [col for col in column_list if col in df.columns]
            if available_columns:
                df = df[available_columns]

        # Serialize the (possibly filtered) frame into an in-memory workbook.
        output = io.BytesIO()
        with pd.ExcelWriter(output, engine='openpyxl') as writer:
            df.to_excel(writer, index=False, sheet_name=sheet_name or 'Sheet1')
        output.seek(0)

        # Build the download name. Fix: report the number of columns
        # actually exported (len(df.columns)), not the number requested —
        # unavailable columns may have been dropped above.
        if columns:
            export_name = f"export_{sheet_name or 'data'}_{len(df.columns)}_cols.xlsx"
        else:
            export_name = f"export_{os.path.basename(file_path)}"

        # Fix: percent-encode the filename (RFC 5987 filename*) so
        # non-ASCII names (e.g. Chinese originals) don't break the
        # latin-1-only HTTP header encoding.
        encoded_name = quote(export_name)

        # Stream the buffer directly — it is already rewound, so the
        # extra getvalue()/BytesIO copy the old code made is unnecessary.
        return StreamingResponse(
            output,
            media_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            headers={"Content-Disposition": f"attachment; filename*=UTF-8''{encoded_name}"}
        )

    except FileNotFoundError:
        logger.error(f"文件不存在: {file_path}")
        raise HTTPException(status_code=404, detail="文件不存在")
    except Exception as e:
        logger.error(f"导出 Excel 文件时出错: {str(e)}")
        raise HTTPException(status_code=500, detail=f"导出失败: {str(e)}")