Files
FilesReadSystem/backend/app/api/endpoints/ai_analyze.py
2026-04-16 23:08:21 +08:00

698 lines
24 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
AI 分析 API 接口
"""
from fastapi import APIRouter, UploadFile, File, HTTPException, Query, Body, Form
from fastapi.responses import StreamingResponse
from typing import Optional
import logging
import tempfile
import os
from app.services.excel_ai_service import excel_ai_service
from app.services.markdown_ai_service import markdown_ai_service
from app.services.template_fill_service import template_fill_service
from app.services.word_ai_service import word_ai_service
from app.services.txt_ai_service import txt_ai_service
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/ai", tags=["AI 分析"])
@router.post("/analyze/excel")
async def analyze_excel(
    file: Optional[UploadFile] = File(None),
    doc_id: Optional[str] = Form(None, description="文档ID从数据库读取"),
    user_prompt: str = Query("", description="用户自定义提示词"),
    analysis_type: str = Query("general", description="分析类型: general, summary, statistics, insights"),
    parse_all_sheets: bool = Query(False, description="是否分析所有工作表")
):
    """
    Run an AI analysis over an Excel workbook.

    The workbook comes either from a stored document (``doc_id``) or from an
    uploaded file (``file``). When both are given, ``doc_id`` takes precedence.

    Args:
        file: Uploaded Excel file (alternative to ``doc_id``).
        doc_id: ID of a document to load through ``mongodb.get_document``.
        user_prompt: Optional custom prompt forwarded to the analysis service.
        analysis_type: One of ``general``, ``summary``, ``statistics``, ``insights``.
        parse_all_sheets: Use the batch (all sheets) service call when True.

    Returns:
        The result object produced by ``excel_ai_service``, returned unchanged.

    Raises:
        HTTPException: 400 for an unsupported analysis type, a missing or
            non-Excel file, an empty upload, or a stored document without a
            file path; 404 when the document does not exist; 500 for any
            other failure.
    """
    # Validate the analysis type before branching so that the doc_id mode and
    # the upload mode reject unsupported values in exactly the same way.
    supported_types = ['general', 'summary', 'statistics', 'insights']
    if analysis_type not in supported_types:
        raise HTTPException(
            status_code=400,
            detail=f"不支持的分析类型: {analysis_type},支持的类型: {', '.join(supported_types)}"
        )

    # Mode 1: load the workbook referenced by a stored document.
    if doc_id:
        try:
            from app.core.database.mongodb import mongodb

            doc = await mongodb.get_document(doc_id)
            if not doc:
                raise HTTPException(status_code=404, detail=f"文档不存在: {doc_id}")

            metadata = doc.get("metadata", {})
            filename = metadata.get("original_filename", "unknown.xlsx")
            file_ext = filename.split('.')[-1].lower()
            if file_ext not in ['xlsx', 'xls']:
                raise HTTPException(status_code=400, detail=f"文档类型不是 Excel: {file_ext}")

            file_path = metadata.get("file_path")
            if not file_path:
                raise HTTPException(status_code=400, detail="文档没有存储文件路径,请重新上传")

            # The result is returned as-is whether or not it reports success.
            if parse_all_sheets:
                return await excel_ai_service.batch_analyze_sheets_from_path(
                    file_path=file_path,
                    filename=filename,
                    user_prompt=user_prompt,
                    analysis_type=analysis_type
                )
            return await excel_ai_service.analyze_excel_file_from_path(
                file_path=file_path,
                filename=filename,
                user_prompt=user_prompt,
                analysis_type=analysis_type
            )
        except HTTPException:
            raise
        except Exception as e:
            logger.error(f"从数据库读取 Excel 文档失败: {str(e)}")
            raise HTTPException(status_code=500, detail=f"读取文档失败: {str(e)}")

    # Mode 2: analyze an uploaded file.
    if not file:
        raise HTTPException(status_code=400, detail="请提供文件或文档ID")
    if not file.filename:
        raise HTTPException(status_code=400, detail="文件名为空")

    file_ext = file.filename.split('.')[-1].lower()
    if file_ext not in ['xlsx', 'xls']:
        raise HTTPException(
            status_code=400,
            detail=f"不支持的文件类型: {file_ext},仅支持 .xlsx 和 .xls"
        )

    try:
        content = await file.read()
        # Reject empty uploads before handing them to the analysis service.
        if not content:
            raise HTTPException(status_code=400, detail="文件内容为空,请确保文件已正确上传")

        logger.info(f"开始分析文件: {file.filename}, 分析类型: {analysis_type}, 文件大小: {len(content)} bytes")

        if parse_all_sheets:
            result = await excel_ai_service.batch_analyze_sheets(
                content,
                file.filename,
                user_prompt=user_prompt,
                analysis_type=analysis_type
            )
        else:
            parse_options = {"header_row": 0}
            result = await excel_ai_service.analyze_excel_file(
                content,
                file.filename,
                user_prompt=user_prompt,
                analysis_type=analysis_type,
                parse_options=parse_options
            )

        # .get() keeps a result without a "success" key from turning a
        # finished analysis into a 500 just because of the log line.
        logger.info(f"文件分析完成: {file.filename}, 成功: {result.get('success')}")
        return result
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"AI 分析过程中出错: {str(e)}")
        raise HTTPException(status_code=500, detail=f"分析失败: {str(e)}")
@router.get("/analysis/types")
async def get_analysis_types():
    """
    List the analysis types supported by the Excel and Markdown services.

    Returns:
        dict: ``excel_types`` and ``markdown_types``, each holding whatever the
        corresponding service's ``get_supported_analysis_types()`` returns.
    """
    excel_types = excel_ai_service.get_supported_analysis_types()
    markdown_types = markdown_ai_service.get_supported_analysis_types()
    return {"excel_types": excel_types, "markdown_types": markdown_types}
@router.post("/analyze/text")
async def analyze_text(
    excel_data: dict = Body(..., description="Excel 解析后的数据"),
    user_prompt: str = Body("", description="用户提示词"),
    analysis_type: str = Body("general", description="分析类型")
):
    """
    Run an AI analysis over Excel data that has already been parsed.

    Args:
        excel_data: Parsed Excel data.
        user_prompt: Optional user prompt; when non-blank the template-based
            LLM call is used instead of the typed analysis call.
        analysis_type: Analysis type passed to the typed analysis call.

    Returns:
        The result returned by the LLM service.

    Raises:
        HTTPException: 500 on any failure.
    """
    try:
        logger.info(f"开始文本分析, 分析类型: {analysis_type}")

        from app.services.llm_service import llm_service

        # A missing or whitespace-only prompt selects the typed analysis path.
        if not user_prompt or not user_prompt.strip():
            outcome = await llm_service.analyze_excel_data(
                excel_data,
                user_prompt,
                analysis_type
            )
        else:
            outcome = await llm_service.analyze_with_template(
                excel_data,
                user_prompt
            )

        logger.info(f"文本分析完成, 成功: {outcome['success']}")
        return outcome
    except Exception as exc:
        logger.error(f"文本分析失败: {str(exc)}")
        raise HTTPException(status_code=500, detail=f"分析失败: {str(exc)}")
@router.post("/analyze/md")
async def analyze_markdown(
    file: Optional[UploadFile] = File(None),
    doc_id: Optional[str] = Form(None, description="文档ID从数据库读取"),
    analysis_type: str = Query("summary", description="分析类型: summary, outline, key_points, questions, tags, qa, statistics, section, charts"),
    user_prompt: str = Query("", description="用户自定义提示词"),
    section_number: Optional[str] = Query(None, description="指定章节编号,如 '''(一)'")
):
    """
    Run an AI analysis over a Markdown document.

    The Markdown text is taken from a stored document (``doc_id``) or from an
    uploaded file, written to a temporary ``.md`` file, analyzed through
    ``markdown_ai_service.analyze_markdown`` and the temporary file is removed
    afterwards.

    Args:
        file: Uploaded Markdown file (alternative to ``doc_id``).
        doc_id: ID of a document to load through ``mongodb.get_document``.
        analysis_type: Must be one of the types reported by the Markdown service.
        user_prompt: Optional custom prompt.
        section_number: Optional section identifier to restrict the analysis to.

    Returns:
        The service result when it reports success.

    Raises:
        HTTPException: 400 for invalid input, 404 for an unknown document,
            500 when reading or analysis fails.
    """
    # Reject unsupported analysis types before doing any I/O.
    allowed_types = markdown_ai_service.get_supported_analysis_types()
    if analysis_type not in allowed_types:
        raise HTTPException(
            status_code=400,
            detail=f"不支持的分析类型: {analysis_type},支持的类型: {', '.join(allowed_types)}"
        )

    filename = None
    tmp_path = None
    markdown_exts = ('md', 'markdown')

    if not doc_id:
        # Upload mode.
        if not file:
            raise HTTPException(status_code=400, detail="请提供文件或文档ID")
        if not file.filename:
            raise HTTPException(status_code=400, detail="文件名为空")

        file_ext = file.filename.split('.')[-1].lower()
        if file_ext not in markdown_exts:
            raise HTTPException(
                status_code=400,
                detail=f"不支持的文件类型: {file_ext},仅支持 .md 和 .markdown"
            )

        try:
            raw_bytes = await file.read()
            # Persist the upload so the service can be given a file path.
            with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as handle:
                handle.write(raw_bytes)
                tmp_path = handle.name
            filename = file.filename
        except Exception as exc:
            logger.error(f"读取 Markdown 文件失败: {str(exc)}")
            raise HTTPException(status_code=500, detail=f"读取文件失败: {str(exc)}")
    else:
        # Stored-document mode.
        try:
            from app.core.database.mongodb import mongodb

            doc = await mongodb.get_document(doc_id)
            if not doc:
                raise HTTPException(status_code=404, detail=f"文档不存在: {doc_id}")

            filename = doc.get("metadata", {}).get("original_filename", "unknown.md")
            file_ext = filename.split('.')[-1].lower()
            if file_ext not in markdown_exts:
                raise HTTPException(status_code=400, detail=f"文档类型不是 Markdown: {file_ext}")

            content = doc.get("content") or ""
            if not content:
                raise HTTPException(status_code=400, detail="文档内容为空")

            # Persist the stored text so the service can be given a file path.
            with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as handle:
                handle.write(content.encode('utf-8'))
                tmp_path = handle.name

            logger.info(f"从数据库加载 Markdown 文档: {filename}, 长度: {len(content)}")
        except HTTPException:
            raise
        except Exception as exc:
            logger.error(f"从数据库读取 Markdown 文档失败: {str(exc)}")
            raise HTTPException(status_code=500, detail=f"读取文档失败: {str(exc)}")

    try:
        logger.info(f"开始分析 Markdown 文件: {filename}, 分析类型: {analysis_type}, 章节: {section_number}")

        outcome = await markdown_ai_service.analyze_markdown(
            file_path=tmp_path,
            analysis_type=analysis_type,
            user_prompt=user_prompt,
            section_number=section_number
        )

        logger.info(f"Markdown 分析完成: {filename}, 成功: {outcome['success']}")

        if outcome['success']:
            return outcome
        # A reported failure is surfaced to the client as a 500.
        raise HTTPException(status_code=500, detail=outcome.get('error', '分析失败'))
    except HTTPException:
        raise
    except Exception as exc:
        logger.error(f"Markdown AI 分析过程中出错: {str(exc)}")
        raise HTTPException(status_code=500, detail=f"分析失败: {str(exc)}")
    finally:
        # Best-effort removal of the temporary file; failures are only logged.
        if tmp_path and os.path.exists(tmp_path):
            try:
                os.unlink(tmp_path)
            except Exception as cleanup_error:
                logger.warning(f"临时文件清理失败: {tmp_path}, error: {cleanup_error}")
@router.post("/analyze/md/stream")
async def analyze_markdown_stream(
    file: UploadFile = File(...),
    analysis_type: str = Query("summary", description="分析类型"),
    user_prompt: str = Query("", description="用户自定义提示词"),
    section_number: Optional[str] = Query(None, description="指定章节编号")
):
    """
    Stream an AI analysis of an uploaded Markdown file as server-sent events.

    The upload is written to a temporary ``.md`` file whose path is handed to
    ``markdown_ai_service.analyze_markdown_stream``. The temporary file is
    removed once the stream has been consumed or closed, not when this handler
    returns, because the response body is produced lazily after the return.

    Args:
        file: Uploaded Markdown file (``.md`` or ``.markdown``).
        analysis_type: Analysis type forwarded to the service.
        user_prompt: Optional custom prompt.
        section_number: Optional section identifier.

    Returns:
        StreamingResponse: ``text/event-stream`` response yielding the service's chunks.

    Raises:
        HTTPException: 400 for a missing filename or unsupported extension,
            500 when preparing the stream fails.
    """
    if not file.filename:
        raise HTTPException(status_code=400, detail="文件名为空")

    file_ext = file.filename.split('.')[-1].lower()
    if file_ext not in ['md', 'markdown']:
        raise HTTPException(
            status_code=400,
            detail=f"不支持的文件类型: {file_ext},仅支持 .md 和 .markdown"
        )

    tmp_path = None

    def _remove_tmp_file() -> None:
        """Best-effort removal of the temporary file; failures are only logged."""
        try:
            if tmp_path and os.path.exists(tmp_path):
                os.unlink(tmp_path)
        except Exception as cleanup_error:
            logger.warning(f"临时文件清理失败: {tmp_path}, error: {cleanup_error}")

    try:
        content = await file.read()
        with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as tmp:
            # Record the path first so a failed write can still be cleaned up.
            tmp_path = tmp.name
            tmp.write(content)

        logger.info(f"开始流式分析 Markdown 文件: {file.filename}, 分析类型: {analysis_type}")

        async def stream_generator():
            # The generator body only runs while the response is being sent,
            # i.e. after this handler has returned. The temp file therefore
            # has to stay on disk until the generator finishes or is closed.
            try:
                async for chunk in markdown_ai_service.analyze_markdown_stream(
                    file_path=tmp_path,
                    analysis_type=analysis_type,
                    user_prompt=user_prompt,
                    section_number=section_number
                ):
                    yield chunk
            finally:
                _remove_tmp_file()

        return StreamingResponse(
            stream_generator(),
            media_type="text/event-stream",
            headers={
                "Cache-Control": "no-cache",
                "Connection": "keep-alive",
                "X-Accel-Buffering": "no"
            }
        )
    except HTTPException:
        # Setup failed before the stream took ownership of the temp file.
        _remove_tmp_file()
        raise
    except Exception as e:
        _remove_tmp_file()
        logger.error(f"Markdown AI 流式分析出错: {str(e)}")
        raise HTTPException(status_code=500, detail=f"流式分析失败: {str(e)}")
@router.post("/analyze/md/outline")
async def get_markdown_outline(
    file: UploadFile = File(...)
):
    """
    Extract the outline (section structure) of an uploaded Markdown file.

    Args:
        file: Uploaded Markdown file (``.md`` or ``.markdown``).

    Returns:
        The value returned by ``markdown_ai_service.extract_outline``.

    Raises:
        HTTPException: 400 for a missing filename or unsupported extension,
            500 when reading the upload or extracting the outline fails.
    """
    if not file.filename:
        raise HTTPException(status_code=400, detail="文件名为空")

    extension = file.filename.split('.')[-1].lower()
    if extension not in ('md', 'markdown'):
        raise HTTPException(
            status_code=400,
            detail=f"不支持的文件类型: {extension},仅支持 .md 和 .markdown"
        )

    try:
        payload = await file.read()

        # Persist the upload so the service can be given a file path.
        with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as handle:
            handle.write(payload)
            tmp_path = handle.name

        try:
            return await markdown_ai_service.extract_outline(tmp_path)
        finally:
            # Always attempt to remove the temporary file; failures are only logged.
            try:
                if tmp_path and os.path.exists(tmp_path):
                    os.unlink(tmp_path)
            except Exception as cleanup_error:
                logger.warning(f"临时文件清理失败: {tmp_path}, error: {cleanup_error}")
    except Exception as exc:
        logger.error(f"获取 Markdown 大纲失败: {str(exc)}")
        raise HTTPException(status_code=500, detail=f"获取大纲失败: {str(exc)}")
@router.post("/analyze/txt")
async def analyze_txt(
    file: Optional[UploadFile] = File(None),
    doc_id: Optional[str] = Form(None, description="文档ID从数据库读取"),
    analysis_type: str = Query("structured", description="分析类型: structured, charts")
):
    """
    Run an AI analysis over a plain-text document.

    The text is taken from a stored document (``doc_id``) or from an uploaded
    file and passed to ``txt_ai_service.analyze_txt_with_ai``.

    Args:
        file: Uploaded TXT file (alternative to ``doc_id``).
        doc_id: ID of a document to load through ``mongodb.get_document``.
        analysis_type: ``structured`` (default) or ``charts``; forwarded to the service.

    Returns:
        dict: On a truthy service result, ``success``, ``filename``,
        ``analysis_type`` and ``result``; otherwise ``success=False`` with an
        ``error`` message and ``result=None``.

    Raises:
        HTTPException: 400 for invalid input, 404 for an unknown document,
            500 when loading or analysis fails.
    """
    filename = None
    text_content = None
    text_exts = ('txt', 'text')

    if not doc_id:
        # Upload mode.
        if not file:
            raise HTTPException(status_code=400, detail="请提供文件或文档ID")
        if not file.filename:
            raise HTTPException(status_code=400, detail="文件名为空")

        file_ext = file.filename.split('.')[-1].lower()
        if file_ext not in text_exts:
            raise HTTPException(
                status_code=400,
                detail=f"不支持的文件类型: {file_ext},仅支持 .txt"
            )

        raw_bytes = await file.read()
        # Undecodable bytes are replaced rather than rejected.
        text_content = raw_bytes.decode('utf-8', errors='replace')
        filename = file.filename
    else:
        # Stored-document mode.
        try:
            from app.core.database.mongodb import mongodb

            doc = await mongodb.get_document(doc_id)
            if not doc:
                raise HTTPException(status_code=404, detail=f"文档不存在: {doc_id}")

            filename = doc.get("metadata", {}).get("original_filename", "unknown.txt")
            file_ext = filename.split('.')[-1].lower()
            if file_ext not in text_exts:
                raise HTTPException(status_code=400, detail=f"文档类型不是 TXT: {file_ext}")

            text_content = doc.get("content") or ""
            if not text_content:
                raise HTTPException(status_code=400, detail="文档内容为空")

            logger.info(f"从数据库加载 TXT 文档: {filename}, 长度: {len(text_content)}")
        except HTTPException:
            raise
        except Exception as exc:
            logger.error(f"从数据库读取 TXT 文档失败: {str(exc)}")
            raise HTTPException(status_code=500, detail=f"读取文档失败: {str(exc)}")

    try:
        logger.info(f"开始 AI 分析 TXT 文件: {filename}, analysis_type={analysis_type}")

        outcome = await txt_ai_service.analyze_txt_with_ai(
            content=text_content,
            filename=filename,
            analysis_type=analysis_type
        )

        if not outcome:
            # An empty result is reported as a soft failure, not an HTTP error.
            logger.warning(f"TXT AI 分析返回空结果: {filename}")
            return {
                "success": False,
                "filename": filename,
                "error": "AI 分析未能提取到结构化数据",
                "result": None
            }

        logger.info(f"TXT AI 分析成功: {filename}")
        return {
            "success": outcome.get("success", True),
            "filename": filename,
            "analysis_type": analysis_type,
            "result": outcome
        }
    except HTTPException:
        raise
    except Exception as exc:
        logger.error(f"TXT AI 分析过程中出错: {str(exc)}")
        raise HTTPException(status_code=500, detail=f"分析失败: {str(exc)}")
# ==================== Word 文档 AI 解析 ====================
@router.post("/analyze/word")
async def analyze_word(
    file: Optional[UploadFile] = File(None),
    doc_id: Optional[str] = Form(None, description="文档ID从数据库读取"),
    user_hint: str = Form("", description="用户提示词,如'请提取表格数据'"),
    analysis_type: str = Query("structured", description="分析类型: structured, charts")
):
    """
    Run an AI analysis over a Word (.docx) document.

    With ``doc_id`` the stored ``content`` and ``structured_data.tables`` are
    passed to the ``*_from_db`` service calls. With an upload, the bytes are
    written to a temporary file whose path is passed to the service. When
    ``analysis_type`` is ``charts`` the chart-generation call is used,
    otherwise the structured-data extraction call.

    Args:
        file: Uploaded Word file (alternative to ``doc_id``).
        doc_id: ID of a document to load through ``mongodb.get_document``.
        user_hint: Optional hint; a default extraction hint is substituted
            when empty on the structured-data path.
        analysis_type: ``structured`` (default) or ``charts``.

    Returns:
        dict: ``success``, ``filename`` and either ``analysis_type`` + ``result``
        on success, or ``error`` + ``result=None`` on a reported failure.

    Raises:
        HTTPException: 400 for invalid input, 404 for an unknown document,
            500 when loading or analysis fails.
    """
    filename = None
    file_ext = None
    want_charts = analysis_type == "charts"

    if doc_id:
        # Stored-document mode: analyze the content already held in the database.
        try:
            from app.core.database.mongodb import mongodb

            doc = await mongodb.get_document(doc_id)
            if not doc:
                raise HTTPException(status_code=404, detail=f"文档不存在: {doc_id}")

            filename = doc.get("metadata", {}).get("original_filename", "unknown.docx")
            file_ext = filename.split('.')[-1].lower()
            if file_ext not in ('docx',):
                raise HTTPException(status_code=400, detail=f"文档类型不是 Word: {file_ext}")

            content = doc.get("content", "") or ""
            structured_data = doc.get("structured_data") or {}
            tables = structured_data.get("tables", [])

            if want_charts:
                outcome = await word_ai_service.generate_charts_from_db(
                    content=content,
                    tables=tables,
                    filename=filename,
                    user_hint=user_hint
                )
            else:
                outcome = await word_ai_service.parse_word_with_ai_from_db(
                    content=content,
                    tables=tables,
                    filename=filename,
                    user_hint=user_hint or "请提取文档中的所有结构化数据,包括表格、键值对等"
                )

            if not outcome.get("success"):
                return {
                    "success": False,
                    "filename": filename,
                    "error": outcome.get("error", "AI 解析失败"),
                    "result": None
                }
            return {
                "success": True,
                "filename": filename,
                "analysis_type": analysis_type,
                "result": outcome
            }
        except HTTPException:
            raise
        except Exception as exc:
            logger.error(f"从数据库读取 Word 文档失败: {str(exc)}")
            raise HTTPException(status_code=500, detail=f"读取文档失败: {str(exc)}")

    # Upload mode.
    if not file:
        raise HTTPException(status_code=400, detail="请提供文件或文档ID")
    if not file.filename:
        raise HTTPException(status_code=400, detail="文件名为空")

    file_ext = file.filename.split('.')[-1].lower()
    if file_ext not in ('docx',):
        raise HTTPException(
            status_code=400,
            detail=f"不支持的文件类型: {file_ext},仅支持 .docx"
        )

    try:
        raw_bytes = await file.read()

        # Persist the upload so the service can be given a file path.
        suffix = f".{file_ext}"
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as handle:
            handle.write(raw_bytes)
            tmp_path = handle.name

        try:
            if want_charts:
                outcome = await word_ai_service.generate_charts(
                    file_path=tmp_path,
                    user_hint=user_hint
                )
            else:
                outcome = await word_ai_service.parse_word_with_ai(
                    file_path=tmp_path,
                    user_hint=user_hint or "请提取文档中的所有结构化数据,包括表格、键值对等"
                )

            if not outcome.get("success"):
                return {
                    "success": False,
                    "filename": file.filename,
                    "error": outcome.get("error", "AI 解析失败"),
                    "result": None
                }
            return {
                "success": True,
                "filename": file.filename,
                "analysis_type": analysis_type,
                "result": outcome
            }
        finally:
            # Remove the temporary file once the analysis has finished.
            if os.path.exists(tmp_path):
                os.unlink(tmp_path)
    except HTTPException:
        raise
    except Exception as exc:
        logger.error(f"Word AI 分析过程中出错: {str(exc)}")
        raise HTTPException(status_code=500, detail=f"分析失败: {str(exc)}")