支持从数据库读取文档进行AI分析

新增 doc_id 参数,支持从数据库读取文档内容;同时保留文件上传功能,
两种方式可灵活切换。修改了 Markdown、TXT 和 Word 文档的分析接口,
添加了从数据库获取文档的逻辑,并相应更新了前端 API 调用。

BREAKING CHANGE: 分析接口现在支持文件上传和数据库文档 ID 两种方式
This commit is contained in:
2026-04-16 19:43:43 +08:00
parent 2adf9aef60
commit c2f50d3bd8
5 changed files with 762 additions and 120 deletions

View File

@@ -154,8 +154,9 @@ async def analyze_text(
@router.post("/analyze/md")
async def analyze_markdown(
file: UploadFile = File(...),
analysis_type: str = Query("summary", description="分析类型: summary, outline, key_points, questions, tags, qa, statistics, section"),
file: Optional[UploadFile] = File(None),
doc_id: Optional[str] = Query(None, description="文档ID从数据库读取"),
analysis_type: str = Query("summary", description="分析类型: summary, outline, key_points, questions, tags, qa, statistics, section, charts"),
user_prompt: str = Query("", description="用户自定义提示词"),
section_number: Optional[str] = Query(None, description="指定章节编号,如 '''(一)'")
):
@@ -163,7 +164,8 @@ async def analyze_markdown(
上传并使用 AI 分析 Markdown 文件
Args:
file: 上传的 Markdown 文件
file: 上传的 Markdown 文件(与 doc_id 二选一)
doc_id: 文档ID从数据库读取
analysis_type: 分析类型
user_prompt: 用户自定义提示词
section_number: 指定分析的章节编号
@@ -171,16 +173,8 @@ async def analyze_markdown(
Returns:
dict: 分析结果
"""
# 检查文件类型
if not file.filename:
raise HTTPException(status_code=400, detail="文件名为空")
file_ext = file.filename.split('.')[-1].lower()
if file_ext not in ['md', 'markdown']:
raise HTTPException(
status_code=400,
detail=f"不支持的文件类型: {file_ext},仅支持 .md 和 .markdown"
)
filename = None
tmp_path = None
# 验证分析类型
supported_types = markdown_ai_service.get_supported_analysis_types()
@@ -190,46 +184,96 @@ async def analyze_markdown(
detail=f"不支持的分析类型: {analysis_type},支持的类型: {', '.join(supported_types)}"
)
try:
# 读取文件内容
content = await file.read()
# 保存到临时文件
with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as tmp:
tmp.write(content)
tmp_path = tmp.name
if doc_id:
# 从数据库读取文档
try:
logger.info(f"开始分析 Markdown 文件: {file.filename}, 分析类型: {analysis_type}, 章节: {section_number}")
from app.core.database.mongodb import mongodb
doc = await mongodb.get_document(doc_id)
if not doc:
raise HTTPException(status_code=404, detail=f"文档不存在: {doc_id}")
# 调用 AI 分析服务
result = await markdown_ai_service.analyze_markdown(
file_path=tmp_path,
analysis_type=analysis_type,
user_prompt=user_prompt,
section_number=section_number
filename = doc.get("metadata", {}).get("original_filename", "unknown.md")
file_ext = filename.split('.')[-1].lower()
if file_ext not in ['md', 'markdown']:
raise HTTPException(status_code=400, detail=f"文档类型不是 Markdown: {file_ext}")
content = doc.get("content", "")
if not content:
raise HTTPException(status_code=400, detail="文档内容为空")
# 保存到临时文件
with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as tmp:
tmp.write(content.encode('utf-8'))
tmp_path = tmp.name
logger.info(f"从数据库加载 Markdown 文档: {filename}, 长度: {len(content)}")
except HTTPException:
raise
except Exception as e:
logger.error(f"从数据库读取 Markdown 文档失败: {str(e)}")
raise HTTPException(status_code=500, detail=f"读取文档失败: {str(e)}")
else:
# 文件上传模式
if not file:
raise HTTPException(status_code=400, detail="请提供文件或文档ID")
if not file.filename:
raise HTTPException(status_code=400, detail="文件名为空")
file_ext = file.filename.split('.')[-1].lower()
if file_ext not in ['md', 'markdown']:
raise HTTPException(
status_code=400,
detail=f"不支持的文件类型: {file_ext},仅支持 .md 和 .markdown"
)
logger.info(f"Markdown 分析完成: {file.filename}, 成功: {result['success']}")
try:
# 读取文件内容
content = await file.read()
if not result['success']:
raise HTTPException(status_code=500, detail=result.get('error', '分析失败'))
# 保存到临时文件
with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as tmp:
tmp.write(content)
tmp_path = tmp.name
return result
filename = file.filename
finally:
# 清理临时文件,确保在所有情况下都能清理
try:
if tmp_path and os.path.exists(tmp_path):
os.unlink(tmp_path)
except Exception as cleanup_error:
logger.warning(f"临时文件清理失败: {tmp_path}, error: {cleanup_error}")
except Exception as e:
logger.error(f"读取 Markdown 文件失败: {str(e)}")
raise HTTPException(status_code=500, detail=f"读取文件失败: {str(e)}")
try:
logger.info(f"开始分析 Markdown 文件: {filename}, 分析类型: {analysis_type}, 章节: {section_number}")
# 调用 AI 分析服务
result = await markdown_ai_service.analyze_markdown(
file_path=tmp_path,
analysis_type=analysis_type,
user_prompt=user_prompt,
section_number=section_number
)
logger.info(f"Markdown 分析完成: {filename}, 成功: {result['success']}")
if not result['success']:
raise HTTPException(status_code=500, detail=result.get('error', '分析失败'))
return result
except HTTPException:
raise
except Exception as e:
logger.error(f"Markdown AI 分析过程中出错: {str(e)}")
raise HTTPException(status_code=500, detail=f"分析失败: {str(e)}")
finally:
# 清理临时文件
if tmp_path and os.path.exists(tmp_path):
try:
os.unlink(tmp_path)
except Exception as cleanup_error:
logger.warning(f"临时文件清理失败: {tmp_path}, error: {cleanup_error}")
@router.post("/analyze/md/stream")
@@ -347,7 +391,8 @@ async def get_markdown_outline(
@router.post("/analyze/txt")
async def analyze_txt(
file: UploadFile = File(...),
file: Optional[UploadFile] = File(None),
doc_id: Optional[str] = Query(None, description="文档ID从数据库读取"),
analysis_type: str = Query("structured", description="分析类型: structured, charts")
):
"""
@@ -357,63 +402,89 @@ async def analyze_txt(
当 analysis_type=charts 时,可生成可视化图表
Args:
file: 上传的 TXT 文件
file: 上传的 TXT 文件(与 doc_id 二选一)
doc_id: 文档ID从数据库读取
analysis_type: 分析类型 - "structured"(默认,提取结构化数据)或 "charts"(生成图表)
Returns:
dict: 分析结果,包含结构化表格数据或图表数据
"""
if not file.filename:
raise HTTPException(status_code=400, detail="文件名为空")
filename = None
text_content = None
file_ext = file.filename.split('.')[-1].lower()
if file_ext not in ['txt', 'text']:
raise HTTPException(
status_code=400,
detail=f"不支持的文件类型: {file_ext},仅支持 .txt"
)
if doc_id:
# 从数据库读取文档
try:
from app.core.database.mongodb import mongodb
doc = await mongodb.get_document(doc_id)
if not doc:
raise HTTPException(status_code=404, detail=f"文档不存在: {doc_id}")
filename = doc.get("metadata", {}).get("original_filename", "unknown.txt")
file_ext = filename.split('.')[-1].lower()
if file_ext not in ['txt', 'text']:
raise HTTPException(status_code=400, detail=f"文档类型不是 TXT: {file_ext}")
# 使用数据库中的 content
text_content = doc.get("content", "")
if not text_content:
raise HTTPException(status_code=400, detail="文档内容为空")
logger.info(f"从数据库加载 TXT 文档: {filename}, 长度: {len(text_content)}")
except HTTPException:
raise
except Exception as e:
logger.error(f"从数据库读取 TXT 文档失败: {str(e)}")
raise HTTPException(status_code=500, detail=f"读取文档失败: {str(e)}")
else:
# 文件上传模式
if not file:
raise HTTPException(status_code=400, detail="请提供文件或文档ID")
if not file.filename:
raise HTTPException(status_code=400, detail="文件名为空")
file_ext = file.filename.split('.')[-1].lower()
if file_ext not in ['txt', 'text']:
raise HTTPException(
status_code=400,
detail=f"不支持的文件类型: {file_ext},仅支持 .txt"
)
try:
# 读取文件内容
content = await file.read()
text_content = content.decode('utf-8', errors='replace')
filename = file.filename
# 保存到临时文件
with tempfile.NamedTemporaryFile(mode='wb', suffix='.txt', delete=False) as tmp:
tmp.write(content)
tmp_path = tmp.name
try:
logger.info(f"开始 AI 分析 TXT 文件: {filename}, analysis_type={analysis_type}")
try:
logger.info(f"开始 AI 分析 TXT 文件: {file.filename}, analysis_type={analysis_type}")
# 使用 txt_ai_service 的 AI 分析方法
result = await txt_ai_service.analyze_txt_with_ai(
content=text_content,
filename=filename,
analysis_type=analysis_type
)
# 使用 txt_ai_service 的 AI 分析方法
result = await txt_ai_service.analyze_txt_with_ai(
content=text_content,
filename=file.filename,
analysis_type=analysis_type
)
if result:
logger.info(f"TXT AI 分析成功: {file.filename}")
return {
"success": result.get("success", True),
"filename": file.filename,
"analysis_type": analysis_type,
"result": result
}
else:
logger.warning(f"TXT AI 分析返回空结果: {file.filename}")
return {
"success": False,
"filename": file.filename,
"error": "AI 分析未能提取到结构化数据",
"result": None
}
finally:
# 清理临时文件
if os.path.exists(tmp_path):
os.unlink(tmp_path)
if result:
logger.info(f"TXT AI 分析成功: {filename}")
return {
"success": result.get("success", True),
"filename": filename,
"analysis_type": analysis_type,
"result": result
}
else:
logger.warning(f"TXT AI 分析返回空结果: {filename}")
return {
"success": False,
"filename": filename,
"error": "AI 分析未能提取到结构化数据",
"result": None
}
except HTTPException:
raise
@@ -426,7 +497,8 @@ async def analyze_txt(
@router.post("/analyze/word")
async def analyze_word(
file: UploadFile = File(...),
file: Optional[UploadFile] = File(None),
doc_id: Optional[str] = Query(None, description="文档ID从数据库读取"),
user_hint: str = Query("", description="用户提示词,如'请提取表格数据'"),
analysis_type: str = Query("structured", description="分析类型: structured, charts")
):
@@ -437,13 +509,77 @@ async def analyze_word(
当 analysis_type=charts 时,可生成可视化图表
Args:
file: 上传的 Word 文件
file: 上传的 Word 文件(与 doc_id 二选一)
doc_id: 文档ID从数据库读取
user_hint: 用户提示词
analysis_type: 分析类型 - "structured"(默认,提取结构化数据)或 "charts"(生成图表)
Returns:
dict: 包含结构化数据的解析结果或图表数据
"""
# 获取文件名和扩展名
filename = None
file_ext = None
if doc_id:
# 从数据库读取文档
try:
from app.core.database.mongodb import mongodb
doc = await mongodb.get_document(doc_id)
if not doc:
raise HTTPException(status_code=404, detail=f"文档不存在: {doc_id}")
filename = doc.get("metadata", {}).get("original_filename", "unknown.docx")
file_ext = filename.split('.')[-1].lower()
if file_ext not in ['docx']:
raise HTTPException(status_code=400, detail=f"文档类型不是 Word: {file_ext}")
# 使用数据库中的 content 进行分析
content = doc.get("content", "")
tables = doc.get("structured_data", {}).get("tables", [])
# 调用 AI 分析服务,传入数据库内容
if analysis_type == "charts":
result = await word_ai_service.generate_charts_from_db(
content=content,
tables=tables,
filename=filename,
user_hint=user_hint
)
else:
result = await word_ai_service.parse_word_with_ai_from_db(
content=content,
tables=tables,
filename=filename,
user_hint=user_hint or "请提取文档中的所有结构化数据,包括表格、键值对等"
)
if result.get("success"):
return {
"success": True,
"filename": filename,
"analysis_type": analysis_type,
"result": result
}
else:
return {
"success": False,
"filename": filename,
"error": result.get("error", "AI 解析失败"),
"result": None
}
except HTTPException:
raise
except Exception as e:
logger.error(f"从数据库读取 Word 文档失败: {str(e)}")
raise HTTPException(status_code=500, detail=f"读取文档失败: {str(e)}")
# 文件上传模式
if not file:
raise HTTPException(status_code=400, detail="请提供文件或文档ID")
if not file.filename:
raise HTTPException(status_code=400, detail="文件名为空")

View File

@@ -405,7 +405,7 @@ async def process_documents_batch(task_id: str, files: List[dict]):
if content and len(content) > 50:
await index_document_to_rag(doc_id, filename, result, file_info["ext"])
return {"index": index, "filename": filename, "doc_id": doc_id, "success": True}
return {"index": index, "filename": filename, "doc_id": doc_id, "file_path": file_info["path"], "success": True}
except Exception as e:
logger.error(f"处理文件 {filename} 失败: {e}")

View File

@@ -757,5 +757,150 @@ class WordAIService:
}
# 全局单例
word_ai_service = WordAIService()
async def parse_word_with_ai_from_db(
    self,
    content: str,
    tables: List[Dict],
    filename: str = "",
    user_hint: str = ""
) -> Dict[str, Any]:
    """
    Extract structured data from Word document content loaded from the database.

    Table data takes priority: when ``tables`` is non-empty the table
    extractor is used; otherwise the non-blank lines of ``content`` are
    handed to the text extractor. When neither is available an "empty"
    result is returned without calling any extractor.

    Args:
        content: Document text; split on newlines into paragraphs.
            ``None`` is treated as an empty string.
        tables: Table records stored with the document.
            ``None`` is treated as an empty list.
        filename: File name, forwarded to the table extractor as
            ``{"filename": filename}``.
        user_hint: User prompt forwarded to the extractors.

    Returns:
        Dict: The extractor's result, or
        ``{"success": True, "type": "empty", ...}`` when there is nothing
        to analyze, or ``{"success": False, "error": ...}`` if anything
        raised along the way.
    """
    try:
        # Values read with dict.get(key, default) are None when the stored
        # field is an explicit null, so normalize before split()/len().
        content = content or ""
        tables = tables or []

        # Paragraphs are the non-blank lines, stripped of surrounding whitespace.
        paragraphs = [line.strip() for line in content.split('\n') if line.strip()]
        logger.info(f"从数据库解析 Word: {len(paragraphs)} 个段落, {len(tables)} 个表格")

        # Prefer table data. The remaining positional arguments are passed
        # through unchanged; their meaning is defined by the helpers.
        if tables:
            return await self._extract_tables_with_ai(
                tables, paragraphs, 0, user_hint, {"filename": filename}
            )
        if paragraphs:
            return await self._extract_from_text_with_ai(
                paragraphs, content, 0, [], user_hint
            )
        return {
            "success": True,
            "type": "empty",
            "message": "文档内容为空"
        }
    except Exception as e:
        # Best-effort contract: report the failure in the result dict
        # instead of propagating the exception to the caller.
        logger.error(f"从数据库解析 Word 文档失败: {str(e)}")
        return {
            "success": False,
            "error": str(e)
        }
async def generate_charts_from_db(
    self,
    content: str,
    tables: List[Dict],
    filename: str = "",
    user_hint: str = ""
) -> Dict[str, Any]:
    """
    Build visualization charts from Word document content loaded from the database.

    Structured data is extracted first (tables take priority over text),
    then the first usable headers/rows pair is handed to
    ``visualization_service.analyze_and_visualize``.

    Args:
        content: Document text; split on newlines into paragraphs.
            ``None`` is treated as an empty string.
        tables: Table records stored with the document.
            ``None`` is treated as an empty list.
        filename: File name, forwarded to the table extractor as
            ``{"filename": filename}``.
        user_hint: User prompt forwarded to the extractors.

    Returns:
        Dict: On success, charts/statistics/distributions plus the
        extracted ``structured_data`` and row/column counts. Otherwise
        ``{"success": False, "error": ...}``, with ``structured_data``
        attached when extraction itself succeeded.
    """
    try:
        # Values read with dict.get(key, default) are None when the stored
        # field is an explicit null, so normalize before split()/len().
        content = content or ""
        tables = tables or []

        # Paragraphs are the non-blank lines, stripped of surrounding whitespace.
        paragraphs = [line.strip() for line in content.split('\n') if line.strip()]
        logger.info(f"从数据库生成 Word 图表: {len(paragraphs)} 个段落, {len(tables)} 个表格")

        # Prefer table data. The remaining positional arguments are passed
        # through unchanged; their meaning is defined by the helpers.
        if tables:
            structured_data = await self._extract_tables_with_ai(
                tables, paragraphs, 0, user_hint, {"filename": filename}
            )
        elif paragraphs:
            structured_data = await self._extract_from_text_with_ai(
                paragraphs, content, 0, [], user_hint
            )
        else:
            return {
                "success": False,
                "error": "文档内容为空"
            }

        # Find the dict that carries "headers"/"rows": the result itself for
        # "table_data", or its first table for "structured_text".
        data_type = structured_data.get("type")
        table_source = None
        if data_type == "table_data":
            table_source = structured_data
        elif data_type == "structured_text":
            extracted_tables = structured_data.get("tables", [])
            if extracted_tables:
                table_source = extracted_tables[0]

        # Only a source with both headers and rows can be charted.
        chart_data = None
        if table_source is not None:
            headers = table_source.get("headers", [])
            rows = table_source.get("rows", [])
            if headers and rows:
                chart_data = {
                    "columns": headers,
                    "rows": rows
                }

        if not chart_data:
            return {
                "success": False,
                "error": "文档中没有可用于图表的表格数据",
                "structured_data": structured_data
            }

        logger.info(f"开始生成图表,列数: {len(chart_data['columns'])}, 行数: {len(chart_data['rows'])}")
        vis_result = visualization_service.analyze_and_visualize(chart_data)

        if not vis_result.get("success"):
            return {
                "success": False,
                "error": vis_result.get("error", "可视化生成失败"),
                "structured_data": structured_data
            }
        return {
            "success": True,
            "charts": vis_result.get("charts", {}),
            "statistics": vis_result.get("statistics", {}),
            "distributions": vis_result.get("distributions", {}),
            "structured_data": structured_data,
            "row_count": vis_result.get("row_count", 0),
            "column_count": vis_result.get("column_count", 0)
        }
    except Exception as e:
        # Best-effort contract: report the failure in the result dict
        # instead of propagating the exception to the caller.
        logger.error(f"从数据库生成 Word 图表失败: {str(e)}")
        return {
            "success": False,
            "error": str(e)
        }