【智能助手增强】

- 新增对话历史管理：MongoDB新增conversations集合，存储用户与AI的对话上下文，支持多轮对话意图延续
- 新增对话历史API（conversation.py）：GET/DELETE conversation历史、列出所有会话
- 意图解析增强：支持基于对话历史的意图识别，上下文理解更准确
- 字段提取优化：支持"提取文档中的医院数量"等自然语言模式，智能去除"文档中的"前缀
- 文档对比优化：从指令中提取文件名并精确匹配source_docs，支持"对比A和B两个文档"
- 文档摘要优化：使用LLM生成真实AI摘要而非返回原始文档预览

【Word模板填表核心功能】

- Word模板字段生成：空白Word上传后，自动从源文档（Excel/Word/TXT/MD）内容AI生成字段名
- Word模板填表（_fill_docx）：将提取数据写入Word模板表格，支持精确匹配、模糊匹配、追加新行
- 数据润色（_polish_word_filled_data）：LLM对多行Excel数据进行统计归纳（合计/平均/极值），转化为专业自然语言描述
- 段落格式输出：使用📌字段名+值段落+分隔线（灰色横线）格式，提升可读性
- 导出链打通：fill_template返回filled_file_path，export直接返回已填好的Word文件

【其他修复】

- 修复Word导出Windows文件锁问题：NamedTemporaryFile改为mkstemp+close
- 修复Word方框非法字符：扩展clean_text移除\uFFFD、□等Unicode替代符和零宽字符
- 修复文档对比"需要至少2个文档"：从指令提取具体文件名优先匹配而非取前2个
- 修复导出format硬编码：自动识别docx/xlsx格式
- Docx解析器增加备用解析方法和更完整的段落/表格/标题提取
- RAG服务新增MySQL数据源支持
2026-04-15 23:32:55 +08:00
parent 9e7f9df384
commit e5d4724e82
19 changed files with 2185 additions and 407 deletions
--- a/backend/app/api/__init__.py
+++ b/backend/app/api/__init__.py
@@ -14,6 +14,7 @@ from app.api.endpoints import (
    analysis_charts,
    health,
    instruction,    # 智能指令
+    conversation,   # 对话历史
 )

 # 创建主路由
@@ -31,3 +32,4 @@ api_router.include_router(ai_analyze.router)       # AI分析
 api_router.include_router(visualization.router)    # 可视化
 api_router.include_router(analysis_charts.router)  # 分析图表
 api_router.include_router(instruction.router)      # 智能指令
+api_router.include_router(conversation.router)     # 对话历史
--- a/backend/app/api/endpoints/conversation.py
+++ b/backend/app/api/endpoints/conversation.py
@@ -0,0 +1,98 @@
+"""
+对话历史 API 接口
+
+提供对话历史的存储和查询功能
+"""
+import logging
+from typing import Optional
+
+from fastapi import APIRouter, HTTPException
+from pydantic import BaseModel
+
+from app.core.database import mongodb
+
+logger = logging.getLogger(__name__)
+
+router = APIRouter(prefix="/conversation", tags=["对话历史"])
+
+
+# ==================== 请求/响应模型 ====================
+
+class ConversationMessage(BaseModel):
+    role: str
+    content: str
+    intent: Optional[str] = None
+
+
+class ConversationHistoryResponse(BaseModel):
+    success: bool
+    messages: list
+
+
+class ConversationListResponse(BaseModel):
+    success: bool
+    conversations: list
+
+
+# ==================== 接口 ====================
+
+@router.get("/{conversation_id}/history", response_model=ConversationHistoryResponse)
+async def get_conversation_history(conversation_id: str, limit: int = 20):
+    """
+    获取对话历史
+
+    Args:
+        conversation_id: 对话会话ID
+        limit: 返回消息数量（默认20条）
+    """
+    try:
+        messages = await mongodb.get_conversation_history(conversation_id, limit=limit)
+        return ConversationHistoryResponse(
+            success=True,
+            messages=messages
+        )
+    except Exception as e:
+        logger.error(f"获取对话历史失败: {e}")
+        return ConversationHistoryResponse(
+            success=False,
+            messages=[]
+        )
+
+
+@router.delete("/{conversation_id}")
+async def delete_conversation(conversation_id: str):
+    """
+    删除对话会话
+
+    Args:
+        conversation_id: 对话会话ID
+    """
+    try:
+        success = await mongodb.delete_conversation(conversation_id)
+        return {"success": success}
+    except Exception as e:
+        logger.error(f"删除对话失败: {e}")
+        return {"success": False, "error": str(e)}
+
+
+@router.get("/all", response_model=ConversationListResponse)
+async def list_conversations(limit: int = 50, skip: int = 0):
+    """
+    获取会话列表
+
+    Args:
+        limit: 返回数量
+        skip: 跳过数量
+    """
+    try:
+        conversations = await mongodb.list_conversations(limit=limit, skip=skip)
+        return ConversationListResponse(
+            success=True,
+            conversations=conversations
+        )
+    except Exception as e:
+        logger.error(f"获取会话列表失败: {e}")
+        return ConversationListResponse(
+            success=False,
+            conversations=[]
+        )
--- a/backend/app/api/endpoints/documents.py
+++ b/backend/app/api/endpoints/documents.py
@@ -4,6 +4,7 @@
 支持多格式文档(docx/xlsx/md/txt)上传、解析、存储和RAG索引
 集成 Excel 存储和 AI 生成字段描述
 """
+import asyncio
 import logging
 import uuid
 from typing import List, Optional
@@ -258,6 +259,7 @@ async def process_document(
        )

        # 如果是 Excel，存储到 MySQL + AI生成描述 + RAG索引
+        mysql_table_name = None
        if doc_type in ["xlsx", "xls"]:
            await update_task_status(
                task_id, status="processing",
@@ -265,17 +267,29 @@ async def process_document(
            )

            try:
-                # 使用 TableRAG 服务完成建表和RAG索引
+                # 使用 TableRAG 服务存储到 MySQL（跳过 RAG 索引以提升速度）
                logger.info(f"开始存储Excel到MySQL: {original_filename}, file_path: {file_path}")
                rag_result = await table_rag_service.build_table_rag_index(
                    file_path=file_path,
                    filename=original_filename,
                    sheet_name=parse_options.get("sheet_name"),
-                    header_row=parse_options.get("header_row", 0)
+                    header_row=parse_options.get("header_row", 0),
+                    skip_rag_index=True  # 跳过 AI 字段描述生成和索引
                )

                if rag_result.get("success"):
-                    logger.info(f"Excel存储到MySQL成功: {original_filename}, table: {rag_result.get('table_name')}")
+                    mysql_table_name = rag_result.get('table_name')
+                    logger.info(f"Excel存储到MySQL成功: {original_filename}, table: {mysql_table_name}")
+                    # 更新 MongoDB 中的 metadata，记录 MySQL 表名
+                    try:
+                        doc = await mongodb.get_document(doc_id)
+                        if doc:
+                            metadata = doc.get("metadata", {})
+                            metadata["mysql_table_name"] = mysql_table_name
+                            await mongodb.update_document_metadata(doc_id, metadata)
+                            logger.info(f"已更新 MongoDB 文档的 mysql_table_name: {mysql_table_name}")
+                    except Exception as update_err:
+                        logger.warning(f"更新 MongoDB mysql_table_name 失败: {update_err}")
                else:
                    logger.error(f"RAG索引构建失败: {rag_result.get('error')}")
            except Exception as e:
@@ -283,17 +297,16 @@ async def process_document(

        else:
            # 非结构化文档
-            await update_task_status(
-                task_id, status="processing",
-                progress=60, message="正在建立索引"
-            )
-
-            # 如果文档中有表格数据，提取并存储到 MySQL + RAG
            structured_data = result.data.get("structured_data", {})
            tables = structured_data.get("tables", [])

+            # 如果文档中有表格数据，提取并存储到 MySQL（不需要 RAG 索引）
            if tables:
-                # 对每个表格建立 MySQL 表和 RAG 索引
+                await update_task_status(
+                    task_id, status="processing",
+                    progress=60, message="正在存储表格数据"
+                )
+                # 对每个表格建立 MySQL 表（跳过 RAG 索引，速度更快）
                for table_info in tables:
                    await table_rag_service.index_document_table(
                        doc_id=doc_id,
@@ -302,8 +315,14 @@ async def process_document(
                        source_doc_type=doc_type
                    )

-            # 同时对文档内容建立 RAG 索引
-            await index_document_to_rag(doc_id, original_filename, result, doc_type)
+            # 对文档内容建立 RAG 索引（非结构化文本需要语义搜索）
+            content = result.data.get("content", "")
+            if content and len(content) > 50:  # 只有内容足够长才建立索引
+                await update_task_status(
+                    task_id, status="processing",
+                    progress=80, message="正在建立语义索引"
+                )
+                await index_document_to_rag(doc_id, original_filename, result, doc_type)

        # 完成
        await update_task_status(
@@ -328,72 +347,95 @@ async def process_document(


 async def process_documents_batch(task_id: str, files: List[dict]):
-    """批量处理文档"""
+    """批量并行处理文档"""
    try:
        await update_task_status(
            task_id, status="processing",
-            progress=0, message="开始批量处理"
+            progress=0, message=f"开始批量处理 {len(files)} 个文档",
+            result={"total": len(files), "files": []}
        )

-        results = []
-        for i, file_info in enumerate(files):
+        async def process_single_file(file_info: dict, index: int) -> dict:
+            """处理单个文件"""
+            filename = file_info["filename"]
            try:
+                # 解析文档
                parser = ParserFactory.get_parser(file_info["path"])
                result = parser.parse(file_info["path"])

-                if result.success:
-                    doc_id = await mongodb.insert_document(
-                        doc_type=file_info["ext"],
-                        content=result.data.get("content", ""),
-                        metadata={
-                            **result.metadata,
-                            "original_filename": file_info["filename"],
-                            "file_path": file_info["path"]
-                        },
-                        structured_data=result.data.get("structured_data")
+                if not result.success:
+                    return {"index": index, "filename": filename, "success": False, "error": result.error or "解析失败"}
+
+                # 存储到 MongoDB
+                doc_id = await mongodb.insert_document(
+                    doc_type=file_info["ext"],
+                    content=result.data.get("content", ""),
+                    metadata={
+                        **result.metadata,
+                        "original_filename": filename,
+                        "file_path": file_info["path"]
+                    },
+                    structured_data=result.data.get("structured_data")
+                )
+
+                # Excel 处理
+                if file_info["ext"] in ["xlsx", "xls"]:
+                    await table_rag_service.build_table_rag_index(
+                        file_path=file_info["path"],
+                        filename=filename,
+                        skip_rag_index=True  # 跳过 AI 字段描述生成和索引
                    )
-
-                    # Excel 处理
-                    if file_info["ext"] in ["xlsx", "xls"]:
-                        await table_rag_service.build_table_rag_index(
-                            file_path=file_info["path"],
-                            filename=file_info["filename"]
-                        )
-                    else:
-                        # 非结构化文档：处理其中的表格 + 内容索引
-                        structured_data = result.data.get("structured_data", {})
-                        tables = structured_data.get("tables", [])
-
-                        if tables:
-                            for table_info in tables:
-                                await table_rag_service.index_document_table(
-                                    doc_id=doc_id,
-                                    filename=file_info["filename"],
-                                    table_data=table_info,
-                                    source_doc_type=file_info["ext"]
-                                )
-
-                        await index_document_to_rag(doc_id, file_info["filename"], result, file_info["ext"])
-
-                    results.append({"filename": file_info["filename"], "doc_id": doc_id, "success": True})
                else:
-                    results.append({"filename": file_info["filename"], "success": False, "error": result.error})
+                    # 非结构化文档
+                    structured_data = result.data.get("structured_data", {})
+                    tables = structured_data.get("tables", [])
+
+                    # 表格数据直接存 MySQL（跳过 RAG 索引）
+                    if tables:
+                        for table_info in tables:
+                            await table_rag_service.index_document_table(
+                                doc_id=doc_id,
+                                filename=filename,
+                                table_data=table_info,
+                                source_doc_type=file_info["ext"]
+                            )
+
+                    # 只有内容足够长才建立语义索引
+                    content = result.data.get("content", "")
+                    if content and len(content) > 50:
+                        await index_document_to_rag(doc_id, filename, result, file_info["ext"])
+
+                return {"index": index, "filename": filename, "doc_id": doc_id, "success": True}

            except Exception as e:
-                results.append({"filename": file_info["filename"], "success": False, "error": str(e)})
+                logger.error(f"处理文件 {filename} 失败: {e}")
+                return {"index": index, "filename": filename, "success": False, "error": str(e)}

-            progress = int((i + 1) / len(files) * 100)
-            await update_task_status(
-                task_id, status="processing",
-                progress=progress, message=f"已处理 {i+1}/{len(files)}"
-            )
+        # 并行处理所有文档
+        tasks = [process_single_file(f, i) for i, f in enumerate(files)]
+        results = await asyncio.gather(*tasks)

+        # 按原始顺序排序
+        results.sort(key=lambda x: x["index"])
+
+        # 统计成功/失败数量
+        success_count = sum(1 for r in results if r["success"])
+        fail_count = len(results) - success_count
+
+        # 更新最终状态
        await update_task_status(
            task_id, status="success",
-            progress=100, message="批量处理完成",
-            result={"results": results}
+            progress=100, message=f"批量处理完成: {success_count} 成功, {fail_count} 失败",
+            result={
+                "total": len(files),
+                "success": success_count,
+                "failure": fail_count,
+                "results": results
+            }
        )

+        logger.info(f"批量处理完成: {success_count}/{len(files)} 成功")
+
    except Exception as e:
        logger.error(f"批量处理失败: {str(e)}")
        await update_task_status(
@@ -404,20 +446,20 @@ async def process_documents_batch(task_id: str, files: List[dict]):


 async def index_document_to_rag(doc_id: str, filename: str, result: ParseResult, doc_type: str):
-    """将非结构化文档索引到 RAG（使用分块索引）"""
+    """将非结构化文档索引到 RAG（使用分块索引，异步执行）"""
    try:
        content = result.data.get("content", "")
        if content:
-            # 将完整内容传递给 RAG 服务自动分块索引
-            rag_service.index_document_content(
+            # 使用异步方法索引，避免阻塞事件循环
+            await rag_service.index_document_content_async(
                doc_id=doc_id,
-                content=content,  # 传递完整内容，由 RAG 服务自动分块
+                content=content,
                metadata={
                    "filename": filename,
                    "doc_type": doc_type
                },
-                chunk_size=500,  # 每块 500 字符
-                chunk_overlap=50  # 块之间 50 字符重叠
+                chunk_size=1000,  # 每块 1000 字符，提升速度
+                chunk_overlap=100  # 块之间 100 字符重叠
            )
            logger.info(f"RAG 索引完成: {filename}, doc_id={doc_id}")
    except Exception as e:
--- a/backend/app/api/endpoints/instruction.py
+++ b/backend/app/api/endpoints/instruction.py
@@ -25,6 +25,7 @@ class InstructionRequest(BaseModel):
    instruction: str
    doc_ids: Optional[List[str]] = None  # 关联的文档 ID 列表
    context: Optional[Dict[str, Any]] = None  # 额外上下文
+    conversation_id: Optional[str] = None  # 对话会话ID，用于关联历史记录


 class IntentRecognitionResponse(BaseModel):
@@ -240,7 +241,8 @@ async def instruction_chat(
            task_id=task_id,
            instruction=request.instruction,
            doc_ids=request.doc_ids,
-            context=request.context
+            context=request.context,
+            conversation_id=request.conversation_id
        )

        return {
@@ -251,14 +253,15 @@ async def instruction_chat(
        }

    # 同步模式：等待执行完成
-    return await _execute_chat_task(task_id, request.instruction, request.doc_ids, request.context)
+    return await _execute_chat_task(task_id, request.instruction, request.doc_ids, request.context, request.conversation_id)


 async def _execute_chat_task(
    task_id: str,
    instruction: str,
    doc_ids: Optional[List[str]],
-    context: Optional[Dict[str, Any]]
+    context: Optional[Dict[str, Any]],
+    conversation_id: Optional[str] = None
 ):
    """执行指令对话的后台任务"""
    from app.core.database import mongodb as mongo_client
@@ -278,6 +281,13 @@ async def _execute_chat_task(
        # 构建上下文
        ctx: Dict[str, Any] = context or {}

+        # 获取对话历史
+        if conversation_id:
+            history = await mongo_client.get_conversation_history(conversation_id, limit=20)
+            if history:
+                ctx["conversation_history"] = history
+                logger.info(f"加载对话历史: conversation_id={conversation_id}, 消息数={len(history)}")
+
        # 获取关联文档
        if doc_ids:
            docs = []
@@ -291,6 +301,29 @@ async def _execute_chat_task(
        # 执行指令
        result = await instruction_executor.execute(instruction, ctx)

+        # 存储对话历史
+        if conversation_id:
+            try:
+                # 存储用户消息
+                await mongo_client.insert_conversation(
+                    conversation_id=conversation_id,
+                    role="user",
+                    content=instruction,
+                    intent=result.get("intent", "unknown")
+                )
+                # 存储助手回复
+                response_content = result.get("message", "")
+                if response_content:
+                    await mongo_client.insert_conversation(
+                        conversation_id=conversation_id,
+                        role="assistant",
+                        content=response_content,
+                        intent=result.get("intent", "unknown")
+                    )
+                logger.info(f"已存储对话历史: conversation_id={conversation_id}")
+            except Exception as e:
+                logger.error(f"存储对话历史失败: {e}")
+
        # 根据意图类型添加友好的响应消息
        response_messages = {
            "extract": f"已提取 {len(result.get('extracted_data', {}))} 个字段的数据",
--- a/backend/app/api/endpoints/templates.py
+++ b/backend/app/api/endpoints/templates.py
@@ -87,6 +87,7 @@ class ExportRequest(BaseModel):
    template_id: str
    filled_data: dict
    format: str = "xlsx"  # xlsx 或 docx
+    filled_file_path: Optional[str] = None  # 已填写的 Word 文件路径（可选）


 # ==================== 接口实现 ====================
@@ -541,7 +542,7 @@ async def export_filled_template(
        if request.format == "xlsx":
            return await _export_to_excel(request.filled_data, request.template_id)
        elif request.format == "docx":
-            return await _export_to_word(request.filled_data, request.template_id)
+            return await _export_to_word(request.filled_data, request.template_id, request.filled_file_path)
        else:
            raise HTTPException(
                status_code=400,
@@ -608,11 +609,12 @@ async def _export_to_excel(filled_data: dict, template_id: str) -> StreamingResp
    )


-async def _export_to_word(filled_data: dict, template_id: str) -> StreamingResponse:
+async def _export_to_word(filled_data: dict, template_id: str, filled_file_path: Optional[str] = None) -> StreamingResponse:
    """导出为 Word 格式"""
    import re
    import tempfile
    import os
+    import urllib.parse
    from docx import Document
    from docx.shared import Pt, RGBColor
    from docx.enum.text import WD_ALIGN_PARAGRAPH
@@ -623,12 +625,32 @@ async def _export_to_word(filled_data: dict, template_id: str) -> StreamingRespo
            return ""
        # 移除控制字符
        text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)
+        # 转义 XML 特殊字符以防破坏文档结构
+        text = text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
        return text.strip()

+    tmp_path = None
    try:
-        # 先保存到临时文件，再读取到内存，确保文档完整性
-        with tempfile.NamedTemporaryFile(delete=False, suffix='.docx') as tmp_file:
-            tmp_path = tmp_file.name
+        # 如果有已填写的文件（通过 _fill_docx 填写了模板单元格），直接返回该文件
+        if filled_file_path and os.path.exists(filled_file_path):
+            filename = os.path.basename(filled_file_path)
+            with open(filled_file_path, 'rb') as f:
+                file_content = f.read()
+            output = io.BytesIO(file_content)
+            encoded_filename = urllib.parse.quote(filename)
+            return StreamingResponse(
+                output,
+                media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+                headers={
+                    "Content-Disposition": f"attachment; filename*=UTF-8''{encoded_filename}",
+                    "Content-Length": str(len(file_content))
+                }
+            )
+
+        # 没有已填写文件，创建新的 Word 文档（表格形式）
+        # 创建临时文件（立即关闭句柄，避免 Windows 文件锁问题）
+        tmp_fd, tmp_path = tempfile.mkstemp(suffix='.docx')
+        os.close(tmp_fd)  # 关闭立即得到的 fd，让 docx 可以写入

        doc = Document()
        doc.add_heading('填写结果', level=1)
@@ -670,19 +692,23 @@ async def _export_to_word(filled_data: dict, template_id: str) -> StreamingRespo

    finally:
        # 清理临时文件
-        if os.path.exists(tmp_path):
+        if tmp_path and os.path.exists(tmp_path):
            try:
                os.unlink(tmp_path)
-            except:
+            except Exception:
                pass

    output = io.BytesIO(file_content)
    filename = "filled_template.docx"
+    encoded_filename = urllib.parse.quote(filename)

    return StreamingResponse(
        output,
        media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-        headers={"Content-Disposition": f"attachment; filename*=UTF-8''{filename}"}
+        headers={
+            "Content-Disposition": f"attachment; filename*=UTF-8''{encoded_filename}",
+            "Content-Length": str(len(file_content))
+        }
    )