Merge branch 'main' of https://gitea.kronecker.cc/OurCodesAreAllRight/FilesReadSystem
This commit is contained in:
@@ -12,6 +12,7 @@ from app.services.excel_ai_service import excel_ai_service
|
||||
from app.services.markdown_ai_service import markdown_ai_service
|
||||
from app.services.template_fill_service import template_fill_service
|
||||
from app.services.word_ai_service import word_ai_service
|
||||
from app.services.txt_ai_service import txt_ai_service
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -153,8 +154,9 @@ async def analyze_text(
|
||||
|
||||
@router.post("/analyze/md")
|
||||
async def analyze_markdown(
|
||||
file: UploadFile = File(...),
|
||||
analysis_type: str = Query("summary", description="分析类型: summary, outline, key_points, questions, tags, qa, statistics, section"),
|
||||
file: Optional[UploadFile] = File(None),
|
||||
doc_id: Optional[str] = Query(None, description="文档ID(从数据库读取)"),
|
||||
analysis_type: str = Query("summary", description="分析类型: summary, outline, key_points, questions, tags, qa, statistics, section, charts"),
|
||||
user_prompt: str = Query("", description="用户自定义提示词"),
|
||||
section_number: Optional[str] = Query(None, description="指定章节编号,如 '一' 或 '(一)'")
|
||||
):
|
||||
@@ -162,7 +164,8 @@ async def analyze_markdown(
|
||||
上传并使用 AI 分析 Markdown 文件
|
||||
|
||||
Args:
|
||||
file: 上传的 Markdown 文件
|
||||
file: 上传的 Markdown 文件(与 doc_id 二选一)
|
||||
doc_id: 文档ID(从数据库读取)
|
||||
analysis_type: 分析类型
|
||||
user_prompt: 用户自定义提示词
|
||||
section_number: 指定分析的章节编号
|
||||
@@ -170,16 +173,8 @@ async def analyze_markdown(
|
||||
Returns:
|
||||
dict: 分析结果
|
||||
"""
|
||||
# 检查文件类型
|
||||
if not file.filename:
|
||||
raise HTTPException(status_code=400, detail="文件名为空")
|
||||
|
||||
file_ext = file.filename.split('.')[-1].lower()
|
||||
if file_ext not in ['md', 'markdown']:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"不支持的文件类型: {file_ext},仅支持 .md 和 .markdown"
|
||||
)
|
||||
filename = None
|
||||
tmp_path = None
|
||||
|
||||
# 验证分析类型
|
||||
supported_types = markdown_ai_service.get_supported_analysis_types()
|
||||
@@ -189,46 +184,96 @@ async def analyze_markdown(
|
||||
detail=f"不支持的分析类型: {analysis_type},支持的类型: {', '.join(supported_types)}"
|
||||
)
|
||||
|
||||
try:
|
||||
# 读取文件内容
|
||||
content = await file.read()
|
||||
|
||||
# 保存到临时文件
|
||||
with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as tmp:
|
||||
tmp.write(content)
|
||||
tmp_path = tmp.name
|
||||
|
||||
if doc_id:
|
||||
# 从数据库读取文档
|
||||
try:
|
||||
logger.info(f"开始分析 Markdown 文件: {file.filename}, 分析类型: {analysis_type}, 章节: {section_number}")
|
||||
from app.core.database.mongodb import mongodb
|
||||
doc = await mongodb.get_document(doc_id)
|
||||
if not doc:
|
||||
raise HTTPException(status_code=404, detail=f"文档不存在: {doc_id}")
|
||||
|
||||
# 调用 AI 分析服务
|
||||
result = await markdown_ai_service.analyze_markdown(
|
||||
file_path=tmp_path,
|
||||
analysis_type=analysis_type,
|
||||
user_prompt=user_prompt,
|
||||
section_number=section_number
|
||||
filename = doc.get("metadata", {}).get("original_filename", "unknown.md")
|
||||
file_ext = filename.split('.')[-1].lower()
|
||||
|
||||
if file_ext not in ['md', 'markdown']:
|
||||
raise HTTPException(status_code=400, detail=f"文档类型不是 Markdown: {file_ext}")
|
||||
|
||||
content = doc.get("content", "")
|
||||
if not content:
|
||||
raise HTTPException(status_code=400, detail="文档内容为空")
|
||||
|
||||
# 保存到临时文件
|
||||
with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as tmp:
|
||||
tmp.write(content.encode('utf-8'))
|
||||
tmp_path = tmp.name
|
||||
|
||||
logger.info(f"从数据库加载 Markdown 文档: {filename}, 长度: {len(content)}")
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"从数据库读取 Markdown 文档失败: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail=f"读取文档失败: {str(e)}")
|
||||
else:
|
||||
# 文件上传模式
|
||||
if not file:
|
||||
raise HTTPException(status_code=400, detail="请提供文件或文档ID")
|
||||
|
||||
if not file.filename:
|
||||
raise HTTPException(status_code=400, detail="文件名为空")
|
||||
|
||||
file_ext = file.filename.split('.')[-1].lower()
|
||||
if file_ext not in ['md', 'markdown']:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"不支持的文件类型: {file_ext},仅支持 .md 和 .markdown"
|
||||
)
|
||||
|
||||
logger.info(f"Markdown 分析完成: {file.filename}, 成功: {result['success']}")
|
||||
try:
|
||||
# 读取文件内容
|
||||
content = await file.read()
|
||||
|
||||
if not result['success']:
|
||||
raise HTTPException(status_code=500, detail=result.get('error', '分析失败'))
|
||||
# 保存到临时文件
|
||||
with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as tmp:
|
||||
tmp.write(content)
|
||||
tmp_path = tmp.name
|
||||
|
||||
return result
|
||||
filename = file.filename
|
||||
|
||||
finally:
|
||||
# 清理临时文件,确保在所有情况下都能清理
|
||||
try:
|
||||
if tmp_path and os.path.exists(tmp_path):
|
||||
os.unlink(tmp_path)
|
||||
except Exception as cleanup_error:
|
||||
logger.warning(f"临时文件清理失败: {tmp_path}, error: {cleanup_error}")
|
||||
except Exception as e:
|
||||
logger.error(f"读取 Markdown 文件失败: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail=f"读取文件失败: {str(e)}")
|
||||
|
||||
try:
|
||||
logger.info(f"开始分析 Markdown 文件: {filename}, 分析类型: {analysis_type}, 章节: {section_number}")
|
||||
|
||||
# 调用 AI 分析服务
|
||||
result = await markdown_ai_service.analyze_markdown(
|
||||
file_path=tmp_path,
|
||||
analysis_type=analysis_type,
|
||||
user_prompt=user_prompt,
|
||||
section_number=section_number
|
||||
)
|
||||
|
||||
logger.info(f"Markdown 分析完成: {filename}, 成功: {result['success']}")
|
||||
|
||||
if not result['success']:
|
||||
raise HTTPException(status_code=500, detail=result.get('error', '分析失败'))
|
||||
|
||||
return result
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"Markdown AI 分析过程中出错: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail=f"分析失败: {str(e)}")
|
||||
finally:
|
||||
# 清理临时文件
|
||||
if tmp_path and os.path.exists(tmp_path):
|
||||
try:
|
||||
os.unlink(tmp_path)
|
||||
except Exception as cleanup_error:
|
||||
logger.warning(f"临时文件清理失败: {tmp_path}, error: {cleanup_error}")
|
||||
|
||||
|
||||
@router.post("/analyze/md/stream")
|
||||
@@ -346,67 +391,100 @@ async def get_markdown_outline(
|
||||
|
||||
@router.post("/analyze/txt")
|
||||
async def analyze_txt(
|
||||
file: UploadFile = File(...),
|
||||
file: Optional[UploadFile] = File(None),
|
||||
doc_id: Optional[str] = Query(None, description="文档ID(从数据库读取)"),
|
||||
analysis_type: str = Query("structured", description="分析类型: structured, charts")
|
||||
):
|
||||
"""
|
||||
上传并使用 AI 分析 TXT 文本文件,提取结构化数据
|
||||
上传并使用 AI 分析 TXT 文本文件,提取结构化数据或生成图表
|
||||
|
||||
将非结构化文本转换为结构化表格数据,便于后续填表使用
|
||||
当 analysis_type=charts 时,可生成可视化图表
|
||||
|
||||
Args:
|
||||
file: 上传的 TXT 文件
|
||||
file: 上传的 TXT 文件(与 doc_id 二选一)
|
||||
doc_id: 文档ID(从数据库读取)
|
||||
analysis_type: 分析类型 - "structured"(默认,提取结构化数据)或 "charts"(生成图表)
|
||||
|
||||
Returns:
|
||||
dict: 分析结果,包含结构化表格数据
|
||||
dict: 分析结果,包含结构化表格数据或图表数据
|
||||
"""
|
||||
if not file.filename:
|
||||
raise HTTPException(status_code=400, detail="文件名为空")
|
||||
|
||||
file_ext = file.filename.split('.')[-1].lower()
|
||||
if file_ext not in ['txt', 'text']:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"不支持的文件类型: {file_ext},仅支持 .txt"
|
||||
)
|
||||
|
||||
try:
|
||||
# 读取文件内容
|
||||
content = await file.read()
|
||||
|
||||
# 保存到临时文件
|
||||
with tempfile.NamedTemporaryFile(mode='wb', suffix='.txt', delete=False) as tmp:
|
||||
tmp.write(content)
|
||||
tmp_path = tmp.name
|
||||
filename = None
|
||||
text_content = None
|
||||
|
||||
if doc_id:
|
||||
# 从数据库读取文档
|
||||
try:
|
||||
logger.info(f"开始 AI 分析 TXT 文件: {file.filename}")
|
||||
from app.core.database.mongodb import mongodb
|
||||
doc = await mongodb.get_document(doc_id)
|
||||
if not doc:
|
||||
raise HTTPException(status_code=404, detail=f"文档不存在: {doc_id}")
|
||||
|
||||
# 使用 template_fill_service 的 AI 分析方法
|
||||
result = await template_fill_service.analyze_txt_with_ai(
|
||||
content=content.decode('utf-8', errors='replace'),
|
||||
filename=file.filename
|
||||
filename = doc.get("metadata", {}).get("original_filename", "unknown.txt")
|
||||
file_ext = filename.split('.')[-1].lower()
|
||||
|
||||
if file_ext not in ['txt', 'text']:
|
||||
raise HTTPException(status_code=400, detail=f"文档类型不是 TXT: {file_ext}")
|
||||
|
||||
# 使用数据库中的 content
|
||||
text_content = doc.get("content", "")
|
||||
|
||||
if not text_content:
|
||||
raise HTTPException(status_code=400, detail="文档内容为空")
|
||||
|
||||
logger.info(f"从数据库加载 TXT 文档: {filename}, 长度: {len(text_content)}")
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"从数据库读取 TXT 文档失败: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail=f"读取文档失败: {str(e)}")
|
||||
else:
|
||||
# 文件上传模式
|
||||
if not file:
|
||||
raise HTTPException(status_code=400, detail="请提供文件或文档ID")
|
||||
|
||||
if not file.filename:
|
||||
raise HTTPException(status_code=400, detail="文件名为空")
|
||||
|
||||
file_ext = file.filename.split('.')[-1].lower()
|
||||
if file_ext not in ['txt', 'text']:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"不支持的文件类型: {file_ext},仅支持 .txt"
|
||||
)
|
||||
|
||||
if result:
|
||||
logger.info(f"TXT AI 分析成功: {file.filename}")
|
||||
return {
|
||||
"success": True,
|
||||
"filename": file.filename,
|
||||
"structured_data": result
|
||||
}
|
||||
else:
|
||||
logger.warning(f"TXT AI 分析返回空结果: {file.filename}")
|
||||
return {
|
||||
"success": False,
|
||||
"filename": file.filename,
|
||||
"error": "AI 分析未能提取到结构化数据",
|
||||
"structured_data": None
|
||||
}
|
||||
# 读取文件内容
|
||||
content = await file.read()
|
||||
text_content = content.decode('utf-8', errors='replace')
|
||||
filename = file.filename
|
||||
|
||||
finally:
|
||||
# 清理临时文件
|
||||
if os.path.exists(tmp_path):
|
||||
os.unlink(tmp_path)
|
||||
try:
|
||||
logger.info(f"开始 AI 分析 TXT 文件: {filename}, analysis_type={analysis_type}")
|
||||
|
||||
# 使用 txt_ai_service 的 AI 分析方法
|
||||
result = await txt_ai_service.analyze_txt_with_ai(
|
||||
content=text_content,
|
||||
filename=filename,
|
||||
analysis_type=analysis_type
|
||||
)
|
||||
|
||||
if result:
|
||||
logger.info(f"TXT AI 分析成功: {filename}")
|
||||
return {
|
||||
"success": result.get("success", True),
|
||||
"filename": filename,
|
||||
"analysis_type": analysis_type,
|
||||
"result": result
|
||||
}
|
||||
else:
|
||||
logger.warning(f"TXT AI 分析返回空结果: {filename}")
|
||||
return {
|
||||
"success": False,
|
||||
"filename": filename,
|
||||
"error": "AI 分析未能提取到结构化数据",
|
||||
"result": None
|
||||
}
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
@@ -419,21 +497,89 @@ async def analyze_txt(
|
||||
|
||||
@router.post("/analyze/word")
|
||||
async def analyze_word(
|
||||
file: UploadFile = File(...),
|
||||
user_hint: str = Query("", description="用户提示词,如'请提取表格数据'")
|
||||
file: Optional[UploadFile] = File(None),
|
||||
doc_id: Optional[str] = Query(None, description="文档ID(从数据库读取)"),
|
||||
user_hint: str = Query("", description="用户提示词,如'请提取表格数据'"),
|
||||
analysis_type: str = Query("structured", description="分析类型: structured, charts")
|
||||
):
|
||||
"""
|
||||
使用 AI 解析 Word 文档,提取结构化数据
|
||||
使用 AI 解析 Word 文档,提取结构化数据或生成图表
|
||||
|
||||
适用于从非结构化的 Word 文档中提取表格数据、键值对等信息
|
||||
当 analysis_type=charts 时,可生成可视化图表
|
||||
|
||||
Args:
|
||||
file: 上传的 Word 文件
|
||||
file: 上传的 Word 文件(与 doc_id 二选一)
|
||||
doc_id: 文档ID(从数据库读取)
|
||||
user_hint: 用户提示词
|
||||
analysis_type: 分析类型 - "structured"(默认,提取结构化数据)或 "charts"(生成图表)
|
||||
|
||||
Returns:
|
||||
dict: 包含结构化数据的解析结果
|
||||
dict: 包含结构化数据的解析结果或图表数据
|
||||
"""
|
||||
# 获取文件名和扩展名
|
||||
filename = None
|
||||
file_ext = None
|
||||
|
||||
if doc_id:
|
||||
# 从数据库读取文档
|
||||
try:
|
||||
from app.core.database.mongodb import mongodb
|
||||
doc = await mongodb.get_document(doc_id)
|
||||
if not doc:
|
||||
raise HTTPException(status_code=404, detail=f"文档不存在: {doc_id}")
|
||||
|
||||
filename = doc.get("metadata", {}).get("original_filename", "unknown.docx")
|
||||
file_ext = filename.split('.')[-1].lower()
|
||||
|
||||
if file_ext not in ['docx']:
|
||||
raise HTTPException(status_code=400, detail=f"文档类型不是 Word: {file_ext}")
|
||||
|
||||
# 使用数据库中的 content 进行分析
|
||||
content = doc.get("content", "")
|
||||
tables = doc.get("structured_data", {}).get("tables", [])
|
||||
|
||||
# 调用 AI 分析服务,传入数据库内容
|
||||
if analysis_type == "charts":
|
||||
result = await word_ai_service.generate_charts_from_db(
|
||||
content=content,
|
||||
tables=tables,
|
||||
filename=filename,
|
||||
user_hint=user_hint
|
||||
)
|
||||
else:
|
||||
result = await word_ai_service.parse_word_with_ai_from_db(
|
||||
content=content,
|
||||
tables=tables,
|
||||
filename=filename,
|
||||
user_hint=user_hint or "请提取文档中的所有结构化数据,包括表格、键值对等"
|
||||
)
|
||||
|
||||
if result.get("success"):
|
||||
return {
|
||||
"success": True,
|
||||
"filename": filename,
|
||||
"analysis_type": analysis_type,
|
||||
"result": result
|
||||
}
|
||||
else:
|
||||
return {
|
||||
"success": False,
|
||||
"filename": filename,
|
||||
"error": result.get("error", "AI 解析失败"),
|
||||
"result": None
|
||||
}
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"从数据库读取 Word 文档失败: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail=f"读取文档失败: {str(e)}")
|
||||
|
||||
# 文件上传模式
|
||||
if not file:
|
||||
raise HTTPException(status_code=400, detail="请提供文件或文档ID")
|
||||
|
||||
if not file.filename:
|
||||
raise HTTPException(status_code=400, detail="文件名为空")
|
||||
|
||||
@@ -453,16 +599,25 @@ async def analyze_word(
|
||||
tmp_path = tmp.name
|
||||
|
||||
try:
|
||||
# 使用 AI 解析 Word 文档
|
||||
result = await word_ai_service.parse_word_with_ai(
|
||||
file_path=tmp_path,
|
||||
user_hint=user_hint or "请提取文档中的所有结构化数据,包括表格、键值对等"
|
||||
)
|
||||
# 根据 analysis_type 选择处理方式
|
||||
if analysis_type == "charts":
|
||||
# 生成图表
|
||||
result = await word_ai_service.generate_charts(
|
||||
file_path=tmp_path,
|
||||
user_hint=user_hint
|
||||
)
|
||||
else:
|
||||
# 提取结构化数据
|
||||
result = await word_ai_service.parse_word_with_ai(
|
||||
file_path=tmp_path,
|
||||
user_hint=user_hint or "请提取文档中的所有结构化数据,包括表格、键值对等"
|
||||
)
|
||||
|
||||
if result.get("success"):
|
||||
return {
|
||||
"success": True,
|
||||
"filename": file.filename,
|
||||
"analysis_type": analysis_type,
|
||||
"result": result
|
||||
}
|
||||
else:
|
||||
|
||||
@@ -405,7 +405,7 @@ async def process_documents_batch(task_id: str, files: List[dict]):
|
||||
if content and len(content) > 50:
|
||||
await index_document_to_rag(doc_id, filename, result, file_info["ext"])
|
||||
|
||||
return {"index": index, "filename": filename, "doc_id": doc_id, "success": True}
|
||||
return {"index": index, "filename": filename, "doc_id": doc_id, "file_path": file_info["path"], "success": True}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"处理文件 {filename} 失败: {e}")
|
||||
|
||||
352
backend/app/services/txt_ai_service.py
Normal file
352
backend/app/services/txt_ai_service.py
Normal file
@@ -0,0 +1,352 @@
|
||||
"""
|
||||
TXT 文档 AI 分析服务
|
||||
|
||||
使用 LLM 对 TXT 文本文件进行深度分析,提取结构化数据并生成可视化图表
|
||||
"""
|
||||
import logging
|
||||
import re
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from app.services.llm_service import llm_service
|
||||
from app.services.visualization_service import visualization_service
|
||||
from app.core.document_parser.txt_parser import TxtParser
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TxtAIService:
    """AI-backed analysis service for plain-text (TXT) documents.

    Two analysis modes are offered through :meth:`analyze_txt_with_ai`:

    * ``"structured"`` - ask the LLM to extract tables, key/value pairs,
      list items and a summary from the text.
    * ``"charts"`` - ask the LLM for tabular data and hand the first table
      to ``visualization_service`` to build charts.

    All public coroutines return a dict carrying a boolean ``"success"`` key
    and, on failure, an ``"error"`` message; they do not raise.
    """

    def __init__(self):
        self.parser = TxtParser()
        # The extraction methods below call ``self.llm.chat(...)`` and
        # ``self.llm.extract_message_content(...)``.  Without this binding
        # every LLM call failed with AttributeError (swallowed by the broad
        # ``except`` and reported as a generic failure).
        self.llm = llm_service

    async def analyze_txt_with_ai(
        self,
        content: str,
        filename: str = "",
        user_hint: str = "",
        analysis_type: str = "structured"
    ) -> Dict[str, Any]:
        """Analyze the text of a TXT document with the LLM.

        Args:
            content: Text of the document.
            filename: Optional file name (used as the document title in the
                charts prompt).
            user_hint: Optional user instruction forwarded to the extractor.
            analysis_type: ``"charts"`` to generate charts; any other value
                (default ``"structured"``) extracts structured data.

        Returns:
            Dict with ``"success"`` and either the analysis payload or an
            ``"error"`` message.
        """
        try:
            if not content or not content.strip():
                return {
                    "success": False,
                    "error": "文档内容为空"
                }

            # Dispatch on the requested analysis mode.
            if analysis_type == "charts":
                return await self.generate_charts(content, filename, user_hint)

            # Default: structured-data extraction.
            return await self._extract_structured_data(content, filename, user_hint)

        except Exception as e:
            logger.error(f"TXT AI 分析失败: {str(e)}")
            return {
                "success": False,
                "error": str(e)
            }

    async def _extract_structured_data(
        self,
        content: str,
        filename: str = "",
        user_hint: str = ""
    ) -> Dict[str, Any]:
        """Ask the LLM to extract structured data from the text.

        Args:
            content: Text of the document.
            filename: File name (currently not used by this method).
            user_hint: Optional user instruction; a generic extraction
                request is used when empty.

        Returns:
            On success a dict with ``type``, ``tables``, ``key_values``,
            ``list_items`` and ``summary``.  When the LLM reply cannot be
            parsed as JSON, a ``type == "text"`` dict holding the first 500
            characters of the text.  On error ``{"success": False, "error"}``.
        """
        try:
            # Only the leading part of the text is sent, to stay within the
            # model's token limit.
            max_content_len = 8000
            text_preview = content[:max_content_len]

            prompt = f"""你是一个专业的数据提取专家。请从以下文本中提取结构化数据。

【用户需求】
{user_hint if user_hint else "请提取文档中的所有结构化数据,包括表格数据、键值对、列表项等。"}

【文档内容】({"前" + str(max_content_len) + "字符,仅显示部分" if len(content) > max_content_len else "全文"})
{text_preview}

请按照以下 JSON 格式输出:
{{
"type": "structured_text",
"tables": [{{"headers": [...], "rows": [...]}}],
"key_values": {{"键1": "值1", "键2": "值2", ...}},
"list_items": ["项1", "项2", ...],
"summary": "文档内容摘要"
}}

重点:
- 如果文档包含表格数据(制表符、空格对齐等),提取到 tables 中
- 如果文档包含键值对(如 名称: 张三),提取到 key_values 中
- 如果文档包含列表项,提取到 list_items 中
- 如果无法提取到结构化数据,至少提供一个详细的摘要
"""

            messages = [
                {"role": "system", "content": "你是一个专业的数据提取助手。请严格按JSON格式输出。"},
                {"role": "user", "content": prompt}
            ]

            response = await self.llm.chat(
                messages=messages,
                temperature=0.1,
                max_tokens=50000
            )

            content_text = self.llm.extract_message_content(response)
            result = self._parse_json_response(content_text)

            if result:
                logger.info(f"TXT 结构化数据提取成功: type={result.get('type')}")
                return {
                    "success": True,
                    "type": result.get("type", "structured_text"),
                    "tables": result.get("tables", []),
                    "key_values": result.get("key_values", {}),
                    "list_items": result.get("list_items", []),
                    "summary": result.get("summary", "")
                }

            # Unparseable reply: fall back to a raw preview of the text.
            return {
                "success": True,
                "type": "text",
                "summary": text_preview[:500],
                "raw_text_preview": text_preview[:500]
            }

        except Exception as e:
            logger.error(f"TXT 结构化数据提取失败: {str(e)}")
            return {
                "success": False,
                "error": str(e)
            }

    async def generate_charts(
        self,
        content: str,
        filename: str = "",
        user_hint: str = ""
    ) -> Dict[str, Any]:
        """Extract tabular data from the text and build charts from it.

        Args:
            content: Text of the document.
            filename: File name, inserted into the prompt as document title.
            user_hint: User instruction.
                NOTE(review): accepted but not inserted into the prompt.

        Returns:
            On success a dict with ``charts``, ``statistics``,
            ``distributions``, ``row_count``, ``column_count``,
            ``key_statistics``, ``chart_suggestions`` and
            ``table_description``; otherwise ``{"success": False, "error"}``
            (plus any key statistics / suggestions the LLM did return).
        """
        try:
            # Only the leading part of the text is sent, to stay within the
            # model's token limit.
            max_content_len = 8000
            text_preview = content[:max_content_len]

            # Ask the LLM for chart-ready data.
            prompt = f"""你是一个专业的数据可视化助手。请从以下文本中提取可用于可视化的数据。

文档标题:{filename}

文档内容:
{text_preview}

请完成以下任务:
1. 识别文本中的表格数据(制表符分隔、空格对齐的表格等)
2. 识别文本中的关键统计数据(百分比、数量、趋势等)
3. 识别可用于比较的分类数据

请用 JSON 格式返回以下结构的数据(如果没有表格数据,返回空结构):
{{
"tables": [
{{
"description": "表格的描述",
"columns": ["列名1", "列名2", ...],
"rows": [
["值1", "值2", ...],
["值1", "值2", ...]
]
}}
],
"key_statistics": [
{{
"name": "指标名称",
"value": "数值",
"trend": "增长/下降/持平",
"description": "指标说明"
}}
],
"chart_suggestions": [
{{
"chart_type": "bar/line/pie",
"title": "图表标题",
"data_source": "数据来源说明"
}}
]
}}

如果没有表格数据,返回空结构:{{"tables": [], "key_statistics": [], "chart_suggestions": []}}
请确保返回的是合法的 JSON 格式。"""

            messages = [
                {"role": "system", "content": "你是一个专业的数据可视化助手,擅长从文本中提取数据并生成图表。"},
                {"role": "user", "content": prompt}
            ]

            response = await self.llm.chat(
                messages=messages,
                temperature=0.1,
                max_tokens=50000
            )

            content_text = self.llm.extract_message_content(response)
            chart_data = self._parse_json_response(content_text)

            if not chart_data:
                return {
                    "success": False,
                    "error": "无法从文本中提取有效的数据结构"
                }

            tables = chart_data.get("tables", [])
            key_statistics = chart_data.get("key_statistics", [])

            if not tables:
                return {
                    "success": False,
                    "error": "文档中没有可用于图表的表格数据",
                    "key_statistics": key_statistics,
                    "chart_suggestions": chart_data.get("chart_suggestions", [])
                }

            # Only the first table is visualized.
            first_table = tables[0]
            columns = first_table.get("columns", [])
            rows = first_table.get("rows", [])

            if not columns or not rows:
                return {
                    "success": False,
                    "error": "表格数据为空"
                }

            # Shape passed to visualization_service.analyze_and_visualize.
            viz_data = {
                "columns": columns,
                "rows": rows
            }

            logger.info(f"开始生成图表,列数: {len(columns)}, 行数: {len(rows)}")
            vis_result = visualization_service.analyze_and_visualize(viz_data)

            if vis_result.get("success"):
                return {
                    "success": True,
                    "charts": vis_result.get("charts", {}),
                    "statistics": vis_result.get("statistics", {}),
                    "distributions": vis_result.get("distributions", {}),
                    "row_count": vis_result.get("row_count", 0),
                    "column_count": vis_result.get("column_count", 0),
                    "key_statistics": key_statistics,
                    "chart_suggestions": chart_data.get("chart_suggestions", []),
                    "table_description": first_table.get("description", "")
                }

            return {
                "success": False,
                "error": vis_result.get("error", "可视化生成失败"),
                "key_statistics": key_statistics
            }

        except Exception as e:
            logger.error(f"TXT 图表生成失败: {str(e)}")
            return {
                "success": False,
                "error": str(e)
            }

    @staticmethod
    def _find_object_end(text: str) -> int:
        """Return the index just past the ``}`` closing the first ``{``.

        Braces that occur inside double-quoted JSON strings are ignored so a
        value such as ``"}"`` does not end the object early.  Returns ``-1``
        when no balanced object is found.
        """
        depth = 0
        in_string = False
        escaped = False
        for i, ch in enumerate(text):
            if in_string:
                if escaped:
                    escaped = False
                elif ch == '\\':
                    escaped = True
                elif ch == '"':
                    in_string = False
                continue
            if ch == '"':
                in_string = True
            elif ch == '{':
                depth += 1
            elif ch == '}':
                depth -= 1
                if depth == 0:
                    return i + 1
        return -1

    def _parse_json_response(self, content: str) -> Optional[Dict]:
        """Parse the JSON object contained in an LLM reply.

        Markdown code fences are stripped, text before the first ``{`` is
        dropped, and if strict parsing fails the first balanced object is
        cut out and trailing commas before ``}`` / ``]`` are removed before
        a second attempt.

        Args:
            content: Raw text returned by the LLM.

        Returns:
            The parsed object, or ``None`` when nothing parseable is found.
        """
        if not content:
            return None

        import json

        # Strip markdown code fences (opening ```json and bare ``` lines).
        cleaned = content.strip()
        cleaned = re.sub(r'^```json\s*', '', cleaned, flags=re.MULTILINE)
        cleaned = re.sub(r'^```\s*', '', cleaned, flags=re.MULTILINE)
        cleaned = cleaned.strip()

        json_start = cleaned.find('{')
        if json_start == -1:
            logger.warning("无法找到 JSON 开始位置")
            return None

        json_text = cleaned[json_start:]

        # First attempt: strict parse of everything from the first brace.
        try:
            return json.loads(json_text)
        except json.JSONDecodeError:
            pass

        # Second attempt: isolate the first balanced object and repair it.
        try:
            end_pos = self._find_object_end(json_text)
            if end_pos > 0:
                fixed = json_text[:end_pos]
                # Drop trailing commas before a closing brace or bracket.
                # (The previous pattern ``[}]]`` only matched ",}]".)
                # NOTE: this is a textual repair and would also touch a
                # ",}" / ",]" sequence that appears inside a string value.
                fixed = re.sub(r',\s*([}\]])', r'\1', fixed)
                return json.loads(fixed)
        except Exception as e:
            logger.warning(f"JSON 修复失败: {e}")

        return None
|
||||
|
||||
|
||||
# 全局单例
|
||||
txt_ai_service = TxtAIService()
|
||||
@@ -8,6 +8,7 @@ from typing import Dict, Any, List, Optional
|
||||
import json
|
||||
|
||||
from app.services.llm_service import llm_service
|
||||
from app.services.visualization_service import visualization_service
|
||||
from app.core.document_parser.docx_parser import DocxParser
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -634,6 +635,272 @@ class WordAIService:
|
||||
|
||||
return values
|
||||
|
||||
async def generate_charts(
    self,
    file_path: str,
    user_hint: str = ""
) -> Dict[str, Any]:
    """Parse a Word file with AI and build charts from its tabular data.

    The document is parsed with ``self.parser``, structured data is
    extracted through the AI helpers (tables take priority over plain
    paragraphs), and the resulting headers/rows are passed to
    ``visualization_service.analyze_and_visualize``.

    Args:
        file_path: Path of the Word file.
        user_hint: User instruction forwarded to the AI extraction helpers.

    Returns:
        Dict with ``"success"``; on success also ``charts``, ``statistics``,
        ``distributions``, ``structured_data``, ``row_count`` and
        ``column_count``; on failure ``error`` and ``structured_data``
        (``None`` when extraction itself failed).
    """
    try:
        # Step 1: raw extraction with the basic parser.
        parsed = self.parser.parse(file_path)
        if not parsed.success:
            return {
                "success": False,
                "error": parsed.error,
                "structured_data": None
            }

        # Step 2: pull the pieces out of the parser payload.
        payload = parsed.data
        paragraphs = payload.get("paragraphs", [])
        raw_tables = payload.get("tables", [])
        content = payload.get("content", "")

        logger.info(f"Word 基础解析完成: {len(paragraphs)} 个段落, {len(raw_tables)} 个表格")

        # Step 3: tables first, paragraphs as a fallback.
        if raw_tables:
            structured_data = await self._extract_tables_with_ai(
                raw_tables, paragraphs, 0, user_hint, parsed.metadata
            )
        elif paragraphs:
            structured_data = await self._extract_from_text_with_ai(
                paragraphs, content, 0, [], user_hint
            )
        else:
            return {
                "success": False,
                "error": "文档内容为空",
                "structured_data": None
            }

        # Step 4: stop when the AI extraction reported a failure.
        if not structured_data.get("success"):
            return {
                "success": False,
                "error": structured_data.get("error", "解析失败"),
                "structured_data": None
            }

        # Step 5: locate headers/rows depending on the result type.
        parse_type = structured_data.get("type", "")
        table_source = None
        if parse_type == "table_data":
            table_source = structured_data
        elif parse_type == "structured_text":
            extracted_tables = structured_data.get("tables", [])
            if extracted_tables and len(extracted_tables) > 0:
                table_source = extracted_tables[0]

        chart_data = None
        if table_source is not None:
            headers = table_source.get("headers", [])
            rows = table_source.get("rows", [])
            if headers and rows:
                chart_data = {
                    "columns": headers,
                    "rows": rows
                }

        # Step 6: visualize.
        if not chart_data:
            return {
                "success": False,
                "error": "文档中没有可用于图表的表格数据",
                "structured_data": structured_data
            }

        logger.info(f"开始生成图表,列数: {len(chart_data['columns'])}, 行数: {len(chart_data['rows'])}")
        vis_result = visualization_service.analyze_and_visualize(chart_data)

        if not vis_result.get("success"):
            return {
                "success": False,
                "error": vis_result.get("error", "可视化生成失败"),
                "structured_data": structured_data
            }

        return {
            "success": True,
            "charts": vis_result.get("charts", {}),
            "statistics": vis_result.get("statistics", {}),
            "distributions": vis_result.get("distributions", {}),
            "structured_data": structured_data,
            "row_count": vis_result.get("row_count", 0),
            "column_count": vis_result.get("column_count", 0)
        }

    except Exception as e:
        logger.error(f"Word 文档图表生成失败: {str(e)}")
        return {
            "success": False,
            "error": str(e),
            "structured_data": None
        }
|
||||
|
||||
|
||||
async def parse_word_with_ai_from_db(
    self,
    content: str,
    tables: List[Dict],
    filename: str = "",
    user_hint: str = ""
) -> Dict[str, Any]:
    """Extract structured data from Word content loaded from the database.

    Args:
        content: Text of the document; split on newlines into paragraphs.
        tables: Table data stored with the document.
        filename: File name, passed to the table extractor as metadata.
        user_hint: User instruction forwarded to the AI extraction helpers.

    Returns:
        The dict produced by the AI extraction helper, a
        ``type == "empty"`` dict when there is neither table nor text, or
        ``{"success": False, "error": ...}`` when an exception occurs.
    """
    try:
        # Non-blank lines of the stored text act as paragraphs.
        stripped_lines = (line.strip() for line in content.split('\n'))
        paragraphs = [line for line in stripped_lines if line]

        logger.info(f"从数据库解析 Word: {len(paragraphs)} 个段落, {len(tables)} 个表格")

        # Tables take priority over plain text.
        if tables:
            return await self._extract_tables_with_ai(
                tables, paragraphs, 0, user_hint, {"filename": filename}
            )

        if paragraphs:
            return await self._extract_from_text_with_ai(
                paragraphs, content, 0, [], user_hint
            )

        return {
            "success": True,
            "type": "empty",
            "message": "文档内容为空"
        }

    except Exception as e:
        logger.error(f"从数据库解析 Word 文档失败: {str(e)}")
        return {
            "success": False,
            "error": str(e)
        }
|
||||
|
||||
async def generate_charts_from_db(
    self,
    content: str,
    tables: List[Dict],
    filename: str = "",
    user_hint: str = ""
) -> Dict[str, Any]:
    """Build charts from Word content that was loaded from the database.

    Args:
        content: Text of the document; split on newlines into paragraphs.
        tables: Table data stored with the document.
        filename: File name, passed to the table extractor as metadata.
        user_hint: User instruction forwarded to the AI extraction helpers.

    Returns:
        Dict with ``"success"``; on success also ``charts``, ``statistics``,
        ``distributions``, ``structured_data``, ``row_count`` and
        ``column_count``; on failure an ``error`` message (and
        ``structured_data`` once extraction has run).
    """
    try:
        # Non-blank lines of the stored text act as paragraphs.
        stripped_lines = (line.strip() for line in content.split('\n'))
        paragraphs = [line for line in stripped_lines if line]

        logger.info(f"从数据库生成 Word 图表: {len(paragraphs)} 个段落, {len(tables)} 个表格")

        # Tables take priority over plain text.
        if tables:
            structured_data = await self._extract_tables_with_ai(
                tables, paragraphs, 0, user_hint, {"filename": filename}
            )
        elif paragraphs:
            structured_data = await self._extract_from_text_with_ai(
                paragraphs, content, 0, [], user_hint
            )
        else:
            return {
                "success": False,
                "error": "文档内容为空"
            }

        # Locate headers/rows depending on the extraction result type.
        result_type = structured_data.get("type")
        table_source = None
        if result_type == "table_data":
            table_source = structured_data
        elif result_type == "structured_text":
            tables_data = structured_data.get("tables", [])
            if tables_data and len(tables_data) > 0:
                table_source = tables_data[0]

        chart_data = None
        if table_source is not None:
            headers = table_source.get("headers", [])
            rows = table_source.get("rows", [])
            if headers and rows:
                chart_data = {
                    "columns": headers,
                    "rows": rows
                }

        if not chart_data:
            return {
                "success": False,
                "error": "文档中没有可用于图表的表格数据",
                "structured_data": structured_data
            }

        # Hand the table to the visualization service.
        logger.info(f"开始生成图表,列数: {len(chart_data['columns'])}, 行数: {len(chart_data['rows'])}")
        vis_result = visualization_service.analyze_and_visualize(chart_data)

        if not vis_result.get("success"):
            return {
                "success": False,
                "error": vis_result.get("error", "可视化生成失败"),
                "structured_data": structured_data
            }

        return {
            "success": True,
            "charts": vis_result.get("charts", {}),
            "statistics": vis_result.get("statistics", {}),
            "distributions": vis_result.get("distributions", {}),
            "structured_data": structured_data,
            "row_count": vis_result.get("row_count", 0),
            "column_count": vis_result.get("column_count", 0)
        }

    except Exception as e:
        logger.error(f"从数据库生成 Word 图表失败: {str(e)}")
        return {
            "success": False,
            "error": str(e)
        }
|
||||
|
||||
Reference in New Issue
Block a user