【智能助手增强】

- 新增对话历史管理：MongoDB新增conversations集合，存储用户与AI的对话上下文，支持多轮对话意图延续
- 新增对话历史API（conversation.py）：GET/DELETE conversation历史、列出所有会话
- 意图解析增强：支持基于对话历史的意图识别，上下文理解更准确
- 字段提取优化：支持"提取文档中的医院数量"等自然语言模式，智能去除"文档中的"前缀
- 文档对比优化：从指令中提取文件名并精确匹配source_docs，支持"对比A和B两个文档"
- 文档摘要优化：使用LLM生成真实AI摘要而非返回原始文档预览

【Word模板填表核心功能】

- Word模板字段生成：空白Word上传后，自动从源文档（Excel/Word/TXT/MD）内容AI生成字段名
- Word模板填表（_fill_docx）：将提取数据写入Word模板表格，支持精确匹配、模糊匹配、追加新行
- 数据润色（_polish_word_filled_data）：LLM对多行Excel数据进行统计归纳（合计/平均/极值），转化为专业自然语言描述
- 段落格式输出：使用📌字段名+值段落+分隔线（灰色横线）格式，提升可读性
- 导出链打通：fill_template返回filled_file_path，export直接返回已填好的Word文件

【其他修复】

- 修复Word导出Windows文件锁问题：NamedTemporaryFile改为mkstemp+close
- 修复Word方框非法字符：扩展clean_text移除\uFFFD、□等Unicode替代符和零宽字符
- 修复文档对比"需要至少2个文档"：从指令提取具体文件名优先匹配而非取前2个
- 修复导出format硬编码：自动识别docx/xlsx格式
- Docx解析器增加备用解析方法和更完整的段落/表格/标题提取
- RAG服务新增MySQL数据源支持
2026-04-15 23:32:55 +08:00
parent 9e7f9df384
commit e5d4724e82
19 changed files with 2185 additions and 407 deletions
--- a/backend/app/instruction/executor.py
+++ b/backend/app/instruction/executor.py
@@ -5,9 +5,10 @@
 """
 import logging
 import json
+import re
 from typing import Any, Dict, List, Optional

-from app.services.template_fill_service import template_fill_service
+from app.services.template_fill_service import template_fill_service, TemplateField
 from app.services.rag_service import rag_service
 from app.services.markdown_ai_service import markdown_ai_service
 from app.core.database import mongodb
@@ -15,6 +16,31 @@ from app.core.database import mongodb
 logger = logging.getLogger(__name__)


+def _extract_filenames_from_text(text: str) -> List[str]:
+    """
+    Extract file names (with extension) from an instruction string.
+
+    A leading compare verb and an optional "这两个文档(的差异)" lead-in are
+    stripped first; every remaining run of characters that ends in
+    .docx/.xlsx/.md/.txt is returned. Names are delimited by whitespace,
+    CJK punctuation and the conjunctions '和' / '与'.
+
+    Args:
+        text: Raw instruction text.
+
+    Returns:
+        File names in order of appearance; an empty list if none are found.
+    """
+    # "的差异" is an optional *group*: a character class here would consume a
+    # single character and leave "差异：" glued to the first file name.
+    text = re.sub(r'^(?:对比|比较)(?:这两个?文档(?:的差异)?[：:]?)?', '', text).strip()
+    text = re.sub(r'两个文档.*$', '', text).strip()
+    if not text:
+        return []
+    pattern = r'[^\s，。！？、和与]+\.(?:docx|xlsx|md|txt)'
+    return [m.group(0) for m in re.finditer(pattern, text)]
+
+
 class InstructionExecutor:
    """指令执行器"""

@@ -41,9 +67,10 @@ class InstructionExecutor:
            self.intent_parser = intent_parser

        context = context or {}
+        context["instruction"] = instruction  # 保存原始指令以便后续使用

-        # 解析意图
-        intent, params = await self.intent_parser.parse(instruction)
+        # 解析意图（传递对话历史上下文）
+        intent, params = await self.intent_parser.parse(instruction, context)

        # 根据意图类型执行相应操作
        if intent == "extract":
@@ -72,18 +99,48 @@ class InstructionExecutor:
    async def _execute_extract(self, params: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]:
        """执行信息提取"""
        try:
-            target_fields = params.get("field_refs", [])
+            # target_fields 来自意图解析，field_refs 来自引号/字段关键词匹配
+            target_fields = params.get("target_fields", []) or params.get("field_refs", [])
            doc_ids = params.get("document_refs", [])
+            instruction_text = context.get("instruction", "")
+
+            # 如果没有指定文档，尝试按文件名精确搜索
+            if not doc_ids or "all_docs" in doc_ids:
+                if instruction_text:
+                    import re
+                    # 提取引号内的内容或文件名
+                    filename_match = re.search(r'["""]([^"""]+)["""]', instruction_text)
+                    if filename_match:
+                        search_term = filename_match.group(1)
+                    else:
+                        match = re.search(r'([^\s]+\.(?:docx|xlsx|md|txt))', instruction_text)
+                        search_term = match.group(1) if match else None
+
+                    if search_term:
+                        logger.info(f"提取时搜索文档: {search_term}")
+                        searched_docs = await mongodb.search_documents(search_term, limit=5)
+                        if searched_docs:
+                            # 优先选择文件名完全匹配的文档
+                            best_docs = [
+                                d for d in searched_docs
+                                if search_term.lower() in d.get("metadata", {}).get("original_filename", "").lower()
+                            ]
+                            if not best_docs:
+                                best_docs = [searched_docs[0]]
+                            context["source_docs"] = best_docs
+                            doc_ids = [doc.get("_id", "") for doc in best_docs]
+                            logger.info(f"找到 {len(best_docs)} 个文档用于提取，最佳: {best_docs[0].get('metadata', {}).get('original_filename', '?')}")

            if not target_fields:
                return {
                    "success": False,
+                    "intent": "extract",
                    "error": "未指定要提取的字段",
                    "message": "请明确说明要提取哪些字段，如：'提取医院数量和床位数'"
                }

-            # 如果指定了文档，验证文档存在
-            if doc_ids and "all_docs" not in doc_ids:
+            # 如果指定了文档且还没有加载 source_docs，则验证并加载
+            if doc_ids and "all_docs" not in doc_ids and not context.get("source_docs"):
                valid_docs = []
                for doc_ref in doc_ids:
                    doc_id = doc_ref.replace("doc_", "")
@@ -93,20 +150,22 @@ class InstructionExecutor:
                if not valid_docs:
                    return {
                        "success": False,
+                        "intent": "extract",
                        "error": "指定的文档不存在",
                        "message": "请检查文档编号是否正确"
                    }
                context["source_docs"] = valid_docs

-            # 构建字段列表
-            fields = []
-            for i, field_name in enumerate(target_fields):
-                fields.append({
-                    "name": field_name,
-                    "cell": f"A{i+1}",
-                    "field_type": "text",
-                    "required": False
-                })
+            # 构建字段列表（使用 TemplateField dataclass）
+            fields = [
+                TemplateField(
+                    name=field_name,
+                    cell=f"A{i+1}",
+                    field_type="text",
+                    required=False
+                )
+                for i, field_name in enumerate(target_fields)
+            ]

            # 调用填表服务
            result = await template_fill_service.fill_template(
@@ -143,7 +202,7 @@ class InstructionExecutor:
                }

            # 获取源文档
-            source_docs = context.get("source_docs", [])
+            source_docs = context.get("source_docs", []) or []
            source_doc_ids = [doc.get("_id") for doc in source_docs if doc.get("_id")]

            # 获取字段
@@ -175,36 +234,103 @@ class InstructionExecutor:
            }

    async def _execute_summarize(self, params: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]:
-        """执行摘要总结"""
+        """执行摘要总结 - 使用 LLM 生成真实摘要"""
        try:
-            docs = context.get("source_docs", [])
+            import re
+            docs = context.get("source_docs", []) or []
+            instruction_text = context.get("instruction", "")
+
+            # 从指令中提取文件名/关键词，优先搜索精确文档
+            search_term = None
+            if instruction_text:
+                filename_match = re.search(r'["""]([^"""]+)["""]', instruction_text)
+                if filename_match:
+                    search_term = filename_match.group(1)
+                else:
+                    file_match = re.search(r'([^\s，。！？,]+\.(?:docx|xlsx|md|txt))', instruction_text)
+                    if file_match:
+                        search_term = file_match.group(1)
+
+            # 如果没有文档或有更精确的搜索词，尝试重新搜索
+            if not docs or search_term:
+                if search_term:
+                    logger.info(f"按关键词搜索文档: {search_term}")
+                    searched_docs = await mongodb.search_documents(search_term, limit=5)
+                    if searched_docs:
+                        # 优先使用文件名最匹配的文档
+                        docs = sorted(
+                            searched_docs,
+                            key=lambda d: 1 if search_term.lower() in d.get("metadata", {}).get("original_filename", "").lower() else 0,
+                            reverse=True
+                        )
+                        logger.info(f"找到 {len(docs)} 个文档，最佳匹配: {docs[0].get('metadata', {}).get('original_filename', '?')}")
+
            if not docs:
                return {
-                    "success": False,
-                    "error": "没有可用的文档",
-                    "message": "请先上传要总结的文档"
+                    "success": True,
+                    "intent": "summarize",
+                    "action_needed": "provide_document",
+                    "message": "我理解了，您想分析文档内容。",
+                    "suggestion": "请提供已上传文档的名称（可以是文件名或部分名称），或者上传您想要分析的文档。\n\n支持的格式：docx、xlsx、md、txt\n\n例如：'分析2021年民政事业发展统计公报' 或 '总结卫生健康数据'"
                }

-            summaries = []
-            for doc in docs[:5]:  # 最多处理5个文档
-                content = doc.get("content", "")[:5000]  # 限制内容长度
-                if content:
-                    summaries.append({
-                        "filename": doc.get("metadata", {}).get("original_filename", "未知"),
-                        "content_preview": content[:500] + "..." if len(content) > 500 else content
-                    })
+            # 对第一个（最佳匹配）文档生成 AI 摘要
+            primary_doc = docs[0]
+            content = primary_doc.get("content", "")
+            filename = primary_doc.get("metadata", {}).get("original_filename", "未知文档")
+
+            if not content:
+                return {
+                    "success": False,
+                    "intent": "summarize",
+                    "error": "文档内容为空",
+                    "message": f"文档 {filename} 没有可供分析的文本内容"
+                }
+
+            # 使用 LLM 生成摘要
+            content_for_summary = content[:12000]  # 最多取前 12000 字
+            user_request = instruction_text or "请总结这份文档"
+
+            prompt = f"""请对以下文档进行全面、有条理的摘要分析。
+
+文档名称：{filename}
+用户要求：{user_request}
+
+文档内容：
+{content_for_summary}
+
+请按以下格式输出摘要：
+1. **文档概述**：简述文档主题和背景（2-3句）
+2. **主要内容**：列出文档的核心数据和关键信息（用要点列出）
+3. **重要数据**：提取文档中的重要数字、统计数据
+4. **主要结论**：归纳文档的主要结论或趋势
+
+要求：条理清晰，数据准确，不要遗漏关键信息。"""
+
+            from app.services.llm_service import llm_service
+            messages = [
+                {"role": "system", "content": "你是一个专业的文档分析助手，擅长提取关键信息并生成结构化摘要。"},
+                {"role": "user", "content": prompt}
+            ]
+
+            response = await llm_service.chat(messages=messages, temperature=0.3, max_tokens=2000)
+            ai_summary = llm_service.extract_message_content(response)

            return {
                "success": True,
                "intent": "summarize",
-                "summaries": summaries,
-                "message": f"找到 {len(summaries)} 个文档可供参考"
+                "ai_summary": ai_summary,
+                "filename": filename,
+                "doc_id": primary_doc.get("_id", ""),
+                "total_docs_found": len(docs),
+                "message": f"已生成文档摘要"
            }

        except Exception as e:
            logger.error(f"摘要执行失败: {e}")
            return {
                "success": False,
+                "intent": "summarize",
                "error": str(e),
                "message": f"摘要生成失败: {str(e)}"
            }
@@ -213,17 +339,39 @@ class InstructionExecutor:
        """执行问答"""
        try:
            question = params.get("question", "")
+            instruction_text = context.get("instruction", "")
+
            if not question:
                return {
                    "success": False,
+                    "intent": "question",
                    "error": "未提供问题",
                    "message": "请输入要回答的问题"
                }

-            # 使用 RAG 检索相关文档
-            docs = context.get("source_docs", [])
-            rag_results = []
+            docs = context.get("source_docs", []) or []

+            # 如果没有文档，尝试从指令中提取文件名搜索
+            if not docs:
+                filename_match = re.search(r'["""]([^"""]+\.(?:docx|xlsx|md|txt))["""]', instruction_text)
+                if not filename_match:
+                    filename_match = re.search(r'([^\s，。！？]+\.(?:docx|xlsx|md|txt))', instruction_text)
+                if filename_match:
+                    found = await mongodb.search_documents(filename_match.group(1), limit=5)
+                    if found:
+                        docs = found
+
+            if not docs:
+                return {
+                    "success": True,
+                    "intent": "question",
+                    "question": question,
+                    "answer": None,
+                    "message": "请先上传文档，我才能回答您的问题"
+                }
+
+            # 使用 RAG 检索相关文档
+            rag_results = []
            for doc in docs:
                doc_id = doc.get("_id", "")
                if doc_id:
@@ -241,12 +389,42 @@ class InstructionExecutor:
                    doc.get("content", "")[:3000] for doc in docs[:3] if doc.get("content")
                ])

+            if not context_text:
+                return {
+                    "success": True,
+                    "intent": "question",
+                    "question": question,
+                    "answer": None,
+                    "message": "文档内容为空，无法回答问题"
+                }
+
+            # 使用 LLM 生成答案
+            filename = docs[0].get("metadata", {}).get("original_filename", "文档")
+            prompt = f"""基于以下文档内容，回答用户的问题。
+
+文档名称：{filename}
+用户问题：{question}
+
+文档内容：
+{context_text[:8000]}
+
+请根据文档内容准确回答问题。如果文档中没有相关信息，请明确说明。"""
+
+            from app.services.llm_service import llm_service
+            messages = [
+                {"role": "system", "content": "你是一个专业的文档问答助手，根据提供的内容准确回答用户问题。"},
+                {"role": "user", "content": prompt}
+            ]
+            response = await llm_service.chat(messages=messages, temperature=0.3, max_tokens=1500)
+            answer = llm_service.extract_message_content(response)
+
            return {
                "success": True,
                "intent": "question",
                "question": question,
-                "context_preview": context_text[:500] + "..." if len(context_text) > 500 else context_text,
-                "message": "已找到相关上下文，可进行问答"
+                "answer": answer,
+                "filename": filename,
+                "message": "已生成回答"
            }

        except Exception as e:
@@ -299,12 +477,53 @@ class InstructionExecutor:
    async def _execute_compare(self, params: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]:
        """执行对比分析"""
        try:
-            docs = context.get("source_docs", [])
+            docs = context.get("source_docs", []) or []
+            instruction_text = context.get("instruction", "")
+
+            # 优先从指令中提取具体的文件名
+            filenames = _extract_filenames_from_text(instruction_text)
+
+            if filenames:
+                # 只选择文件名匹配的那些文档
+                matched_docs = []
+                for doc in docs:
+                    fname = doc.get("metadata", {}).get("original_filename", "").lower()
+                    for fn in filenames:
+                        if fn.lower() in fname or fname in fn.lower():
+                            matched_docs.append(doc)
+                            break
+                # 如果匹配到足够文档，用匹配的
+                if len(matched_docs) >= 2:
+                    docs = matched_docs
+                else:
+                    # 匹配不够，尝试按文件名搜索 MongoDB
+                    all_found = []
+                    for fn in filenames:
+                        found = await mongodb.search_documents(fn, limit=5)
+                        all_found.extend(found)
+                    seen = set()
+                    unique_docs = []
+                    for d in all_found:
+                        did = d.get("_id", "")
+                        if did and did not in seen:
+                            seen.add(did)
+                            unique_docs.append(d)
+                    if len(unique_docs) >= 2:
+                        docs = unique_docs
+                    elif len(unique_docs) == 1 and len(docs) >= 1:
+                        # 找到一个指定的 + 用一个通用的
+                        docs = unique_docs + docs[:1]
+                    elif docs and len(filenames) == 1:
+                        # 找到一个指定文件名但只有一个匹配，尝试补充
+                        docs = unique_docs + [d for d in docs if d not in unique_docs]
+                        docs = docs[:2]
+
            if len(docs) < 2:
                return {
                    "success": False,
+                    "intent": "compare",
                    "error": "对比需要至少2个文档",
-                    "message": "请上传至少2个文档进行对比"
+                    "message": "请上传至少2个文档进行对比，或明确说出要对比的文档名称"
                }

            # 提取文档基本信息
@@ -329,6 +548,7 @@ class InstructionExecutor:
            logger.error(f"对比执行失败: {e}")
            return {
                "success": False,
+                "intent": "compare",
                "error": str(e),
                "message": f"对比分析失败: {str(e)}"
            }
@@ -336,10 +556,23 @@ class InstructionExecutor:
    async def _execute_edit(self, params: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]:
        """执行文档编辑操作"""
        try:
-            docs = context.get("source_docs", [])
+            docs = context.get("source_docs", []) or []
+            instruction_text = context.get("instruction", "")
+
+            # 如果没有文档，尝试从指令中提取文件名搜索
+            if not docs:
+                filename_match = re.search(r'["""]([^"""]+\.(?:docx|xlsx|md|txt))["""]', instruction_text)
+                if not filename_match:
+                    filename_match = re.search(r'([^\s，。！？]+\.(?:docx|xlsx|md|txt))', instruction_text)
+                if filename_match:
+                    found = await mongodb.search_documents(filename_match.group(1), limit=3)
+                    if found:
+                        docs = found
+
            if not docs:
                return {
                    "success": False,
+                    "intent": "edit",
                    "error": "没有可用的文档",
                    "message": "请先上传要编辑的文档"
                }
@@ -405,7 +638,7 @@ class InstructionExecutor:
        - Word -> Markdown
        """
        try:
-            docs = context.get("source_docs", [])
+            docs = context.get("source_docs", []) or []
            if not docs:
                return {
                    "success": False,
--- a/backend/app/instruction/intent_parser.py
+++ b/backend/app/instruction/intent_parser.py
@@ -28,7 +28,7 @@ class IntentParser:
    INTENT_KEYWORDS = {
        INTENT_EXTRACT: ["提取", "抽取", "获取", "找出", "查找", "识别", "找到"],
        INTENT_FILL_TABLE: ["填表", "填写", "填充", "录入", "导入到表格", "填写到"],
-        INTENT_SUMMARIZE: ["总结", "摘要", "概括", "概述", "归纳", "提炼"],
+        INTENT_SUMMARIZE: ["总结", "摘要", "概括", "概述", "归纳", "提炼", "分析", "聊聊"],
        INTENT_QUESTION: ["问答", "回答", "解释", "什么是", "为什么", "如何", "怎样", "多少", "几个"],
        INTENT_SEARCH: ["搜索", "查找", "检索", "查询", "找"],
        INTENT_COMPARE: ["对比", "比较", "差异", "区别", "不同"],
@@ -47,12 +47,13 @@ class IntentParser:
    def __init__(self):
        self.intent_history: List[Dict[str, Any]] = []

-    async def parse(self, text: str) -> Tuple[str, Dict[str, Any]]:
+    async def parse(self, text: str, context: Dict[str, Any] = None) -> Tuple[str, Dict[str, Any]]:
        """
        解析自然语言指令

        Args:
            text: 用户输入的自然语言
+            context: 执行上下文（包含对话历史等）

        Returns:
            (意图类型, 参数字典)
@@ -61,11 +62,17 @@ class IntentParser:
        if not text:
            return self.INTENT_UNKNOWN, {}

+        # 检查对话历史中的上下文
+        conversation_history = []
+        if context and context.get("conversation_history"):
+            conversation_history = context.get("conversation_history", [])
+            logger.info(f"解析时使用对话历史: {len(conversation_history)} 条消息")
+
        # 记录历史
        self.intent_history.append({"text": text, "intent": None})

-        # 识别意图
-        intent = self._recognize_intent(text)
+        # 识别意图（考虑对话上下文）
+        intent = self._recognize_intent_with_context(text, conversation_history)

        # 提取参数
        params = self._extract_params(text, intent)
@@ -78,6 +85,42 @@ class IntentParser:

        return intent, params

+    def _recognize_intent_with_context(self, text: str, conversation_history: List[Dict[str, Any]]) -> str:
+        """
+        Recognize the intent of ``text``, taking recent conversation into account.
+
+        Args:
+            text: Current user input.
+            conversation_history: Earlier messages (dicts that may carry an "intent" key).
+
+        Returns:
+            The intent type.
+        """
+        # Without history there is nothing to continue: use plain recognition.
+        if not conversation_history:
+            return self._recognize_intent(text)
+
+        # Most recent known intent among the last 5 messages.
+        last_topic = None
+        for msg in reversed(conversation_history[-5:]):
+            if msg.get("intent") and msg.get("intent") != "unknown":
+                last_topic = msg.get("intent")
+                break
+
+        # A very short message ("继续", "是的", ...) usually continues the previous
+        # intent -- unless it is itself an explicit command such as "总结" or
+        # "对比", which must not be overridden by the history.
+        stripped = text.strip()
+        short_confirmation = {"是", "是的", "好", "继续", "ok", "接着", "然后", "还有吗"}
+        is_short = stripped.lower() in short_confirmation or len(stripped) <= 3
+        has_keyword = any(kw in stripped for kws in self.INTENT_KEYWORDS.values() for kw in kws)
+        if is_short and last_topic and not has_keyword:
+            logger.info(f"简短确认，延续之前的意图: {last_topic}")
+            return last_topic
+
+        # Otherwise fall back to standard intent recognition.
+        return self._recognize_intent(text)
+
    def _recognize_intent(self, text: str) -> str:
        """识别意图类型"""
        intent_scores: Dict[str, float] = {}
@@ -214,18 +257,27 @@ class IntentParser:
        return template_info if template_info else None

    def _extract_target_fields(self, text: str) -> List[str]:
-        """提取目标字段"""
+        """提取目标字段 - 按分隔符切分再逐段清理"""
        fields = []

-        # 匹配 "提取XXX和YYY"、"抽取XXX、YYY"
-        patterns = [
-            r"提取([^(and|,|，)+]+?)(?:和|与|、|,|plus)",
-            r"抽取([^(and|,|，)+]+?)(?:和|与|、|,|plus)",
-        ]
+        # 去除提取/抽取前缀
+        cleaned_text = re.sub(r"^(?:提取|抽取)", "", text).strip()

-        for pattern in patterns:
-            matches = re.findall(pattern, text)
-            fields.extend([m.strip() for m in matches if m.strip()])
+        # 按'和'、'与'、'、'分割成多段
+        segments = re.split(r"[和与、]", cleaned_text)
+
+        # 常见前缀（这些不是字段名，需要去除）
+        prefixes = ["文档中的", "文档中", "文件中的", "文件中", "内容中的", "内容中"]
+
+        for seg in segments:
+            seg = seg.strip()
+            # 去除常见前缀
+            for p in prefixes:
+                if seg.startswith(p):
+                    seg = seg[len(p):]
+                    break
+            if seg and 2 <= len(seg) <= 20:
+                fields.append(seg)

        return list(set(fields))