【智能助手增强】

- 新增对话历史管理:MongoDB新增conversations集合,存储用户与AI的对话上下文,支持多轮对话意图延续
- 新增对话历史API(conversation.py):GET/DELETE conversation历史、列出所有会话
- 意图解析增强:支持基于对话历史的意图识别,上下文理解更准确
- 字段提取优化:支持"提取文档中的医院数量"等自然语言模式,智能去除"文档中的"前缀
- 文档对比优化:从指令中提取文件名并精确匹配source_docs,支持"对比A和B两个文档"
- 文档摘要优化:使用LLM生成真实AI摘要而非返回原始文档预览

【Word模板填表核心功能】
- Word模板字段生成:空白Word上传后,自动从源文档(Excel/Word/TXT/MD)内容AI生成字段名
- Word模板填表(_fill_docx):将提取数据写入Word模板表格,支持精确匹配、模糊匹配、追加新行
- 数据润色(_polish_word_filled_data):LLM对多行Excel数据进行统计归纳(合计/平均/极值),转化为专业自然语言描述
- 段落格式输出:使用📌字段名+值段落+分隔线(灰色横线)格式,提升可读性
- 导出链打通:fill_template返回filled_file_path,export直接返回已填好的Word文件

【其他修复】
- 修复Word导出Windows文件锁问题:NamedTemporaryFile改为mkstemp+close
- 修复Word方框非法字符:扩展clean_text移除\uFFFD、□等Unicode替代符和零宽字符
- 修复文档对比"需要至少2个文档":从指令提取具体文件名优先匹配而非取前2个
- 修复导出format硬编码:自动识别docx/xlsx格式
- Docx解析器增加备用解析方法和更完整的段落/表格/标题提取
- RAG服务新增MySQL数据源支持
This commit is contained in:
dj
2026-04-15 23:32:55 +08:00
parent 9e7f9df384
commit e5d4724e82
19 changed files with 2185 additions and 407 deletions

View File

@@ -5,9 +5,10 @@
"""
import logging
import json
import re
from typing import Any, Dict, List, Optional
from app.services.template_fill_service import template_fill_service
from app.services.template_fill_service import template_fill_service, TemplateField
from app.services.rag_service import rag_service
from app.services.markdown_ai_service import markdown_ai_service
from app.core.database import mongodb
@@ -15,6 +16,31 @@ from app.core.database import mongodb
logger = logging.getLogger(__name__)
def _extract_filenames_from_text(text: str) -> List[str]:
"""
从指令文本中提取文件名列表。
智能处理用''/''/'、分隔的多个文件名(尤其是带年号的统计公报)。
"""
# 先去掉"对比这两个文档"等引导语,只保留文件名部分
text = re.sub(r'^(?:对比|比较)这两个?文档[的差异]?[:]?', '', text).strip()
text = re.sub(r'两个文档.*$', '', text).strip()
if not text:
return []
# 直接查找所有带扩展名的文件名模式
results = []
for m in re.finditer(r'[^\s、和与]+(?=\.(?:docx|xlsx|md|txt))', text):
start = m.start()
ext_match = re.search(r'\.(?:docx|xlsx|md|txt)', text[m.end():])
if ext_match:
fn = text[start:m.end() + ext_match.end()]
if fn:
results.append(fn)
return results
class InstructionExecutor:
"""指令执行器"""
@@ -41,9 +67,10 @@ class InstructionExecutor:
self.intent_parser = intent_parser
context = context or {}
context["instruction"] = instruction # 保存原始指令以便后续使用
# 解析意图
intent, params = await self.intent_parser.parse(instruction)
# 解析意图(传递对话历史上下文)
intent, params = await self.intent_parser.parse(instruction, context)
# 根据意图类型执行相应操作
if intent == "extract":
@@ -72,18 +99,48 @@ class InstructionExecutor:
async def _execute_extract(self, params: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]:
"""执行信息提取"""
try:
target_fields = params.get("field_refs", [])
# target_fields 来自意图解析,field_refs 来自引号/字段关键词匹配
target_fields = params.get("target_fields", []) or params.get("field_refs", [])
doc_ids = params.get("document_refs", [])
instruction_text = context.get("instruction", "")
# 如果没有指定文档,尝试按文件名精确搜索
if not doc_ids or "all_docs" in doc_ids:
if instruction_text:
import re
# 提取引号内的内容或文件名
filename_match = re.search(r'["""]([^"""]+)["""]', instruction_text)
if filename_match:
search_term = filename_match.group(1)
else:
match = re.search(r'([^\s]+\.(?:docx|xlsx|md|txt))', instruction_text)
search_term = match.group(1) if match else None
if search_term:
logger.info(f"提取时搜索文档: {search_term}")
searched_docs = await mongodb.search_documents(search_term, limit=5)
if searched_docs:
# 优先选择文件名完全匹配的文档
best_docs = [
d for d in searched_docs
if search_term.lower() in d.get("metadata", {}).get("original_filename", "").lower()
]
if not best_docs:
best_docs = [searched_docs[0]]
context["source_docs"] = best_docs
doc_ids = [doc.get("_id", "") for doc in best_docs]
logger.info(f"找到 {len(best_docs)} 个文档用于提取,最佳: {best_docs[0].get('metadata', {}).get('original_filename', '?')}")
if not target_fields:
return {
"success": False,
"intent": "extract",
"error": "未指定要提取的字段",
"message": "请明确说明要提取哪些字段,如:'提取医院数量和床位数'"
}
# 如果指定了文档,验证文档存在
if doc_ids and "all_docs" not in doc_ids:
# 如果指定了文档且还没有加载 source_docs,则验证并加载
if doc_ids and "all_docs" not in doc_ids and not context.get("source_docs"):
valid_docs = []
for doc_ref in doc_ids:
doc_id = doc_ref.replace("doc_", "")
@@ -93,20 +150,22 @@ class InstructionExecutor:
if not valid_docs:
return {
"success": False,
"intent": "extract",
"error": "指定的文档不存在",
"message": "请检查文档编号是否正确"
}
context["source_docs"] = valid_docs
# 构建字段列表
fields = []
for i, field_name in enumerate(target_fields):
fields.append({
"name": field_name,
"cell": f"A{i+1}",
"field_type": "text",
"required": False
})
# 构建字段列表(使用 TemplateField dataclass)
fields = [
TemplateField(
name=field_name,
cell=f"A{i+1}",
field_type="text",
required=False
)
for i, field_name in enumerate(target_fields)
]
# 调用填表服务
result = await template_fill_service.fill_template(
@@ -143,7 +202,7 @@ class InstructionExecutor:
}
# 获取源文档
source_docs = context.get("source_docs", [])
source_docs = context.get("source_docs", []) or []
source_doc_ids = [doc.get("_id") for doc in source_docs if doc.get("_id")]
# 获取字段
@@ -175,36 +234,103 @@ class InstructionExecutor:
}
async def _execute_summarize(self, params: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]:
"""执行摘要总结"""
"""执行摘要总结 - 使用 LLM 生成真实摘要"""
try:
docs = context.get("source_docs", [])
import re
docs = context.get("source_docs", []) or []
instruction_text = context.get("instruction", "")
# 从指令中提取文件名/关键词,优先搜索精确文档
search_term = None
if instruction_text:
filename_match = re.search(r'["""]([^"""]+)["""]', instruction_text)
if filename_match:
search_term = filename_match.group(1)
else:
file_match = re.search(r'([^\s,]+\.(?:docx|xlsx|md|txt))', instruction_text)
if file_match:
search_term = file_match.group(1)
# 如果没有文档或有更精确的搜索词,尝试重新搜索
if not docs or search_term:
if search_term:
logger.info(f"按关键词搜索文档: {search_term}")
searched_docs = await mongodb.search_documents(search_term, limit=5)
if searched_docs:
# 优先使用文件名最匹配的文档
docs = sorted(
searched_docs,
key=lambda d: 1 if search_term.lower() in d.get("metadata", {}).get("original_filename", "").lower() else 0,
reverse=True
)
logger.info(f"找到 {len(docs)} 个文档,最佳匹配: {docs[0].get('metadata', {}).get('original_filename', '?')}")
if not docs:
return {
"success": False,
"error": "没有可用的文档",
"message": "请先上传要总结的文档"
"success": True,
"intent": "summarize",
"action_needed": "provide_document",
"message": "我理解了,您想分析文档内容。",
"suggestion": "请提供已上传文档的名称(可以是文件名或部分名称),或者上传您想要分析的文档。\n\n支持的格式docx、xlsx、md、txt\n\n例如:'分析2021年民政事业发展统计公报''总结卫生健康数据'"
}
summaries = []
for doc in docs[:5]: # 最多处理5个文档
content = doc.get("content", "")[:5000] # 限制内容长度
if content:
summaries.append({
"filename": doc.get("metadata", {}).get("original_filename", "未知"),
"content_preview": content[:500] + "..." if len(content) > 500 else content
})
# 对第一个(最佳匹配)文档生成 AI 摘要
primary_doc = docs[0]
content = primary_doc.get("content", "")
filename = primary_doc.get("metadata", {}).get("original_filename", "未知文档")
if not content:
return {
"success": False,
"intent": "summarize",
"error": "文档内容为空",
"message": f"文档 {filename} 没有可供分析的文本内容"
}
# 使用 LLM 生成摘要
content_for_summary = content[:12000] # 最多取前 12000 字
user_request = instruction_text or "请总结这份文档"
prompt = f"""请对以下文档进行全面、有条理的摘要分析。
文档名称:{filename}
用户要求:{user_request}
文档内容:
{content_for_summary}
请按以下格式输出摘要:
1. **文档概述**简述文档主题和背景2-3句
2. **主要内容**:列出文档的核心数据和关键信息(用要点列出)
3. **重要数据**:提取文档中的重要数字、统计数据
4. **主要结论**:归纳文档的主要结论或趋势
要求:条理清晰,数据准确,不要遗漏关键信息。"""
from app.services.llm_service import llm_service
messages = [
{"role": "system", "content": "你是一个专业的文档分析助手,擅长提取关键信息并生成结构化摘要。"},
{"role": "user", "content": prompt}
]
response = await llm_service.chat(messages=messages, temperature=0.3, max_tokens=2000)
ai_summary = llm_service.extract_message_content(response)
return {
"success": True,
"intent": "summarize",
"summaries": summaries,
"message": f"找到 {len(summaries)} 个文档可供参考"
"ai_summary": ai_summary,
"filename": filename,
"doc_id": primary_doc.get("_id", ""),
"total_docs_found": len(docs),
"message": f"已生成文档摘要"
}
except Exception as e:
logger.error(f"摘要执行失败: {e}")
return {
"success": False,
"intent": "summarize",
"error": str(e),
"message": f"摘要生成失败: {str(e)}"
}
@@ -213,17 +339,39 @@ class InstructionExecutor:
"""执行问答"""
try:
question = params.get("question", "")
instruction_text = context.get("instruction", "")
if not question:
return {
"success": False,
"intent": "question",
"error": "未提供问题",
"message": "请输入要回答的问题"
}
# 使用 RAG 检索相关文档
docs = context.get("source_docs", [])
rag_results = []
docs = context.get("source_docs", []) or []
# 如果没有文档,尝试从指令中提取文件名搜索
if not docs:
filename_match = re.search(r'["""]([^"""]+\.(?:docx|xlsx|md|txt))["""]', instruction_text)
if not filename_match:
filename_match = re.search(r'([^\s]+\.(?:docx|xlsx|md|txt))', instruction_text)
if filename_match:
found = await mongodb.search_documents(filename_match.group(1), limit=5)
if found:
docs = found
if not docs:
return {
"success": True,
"intent": "question",
"question": question,
"answer": None,
"message": "请先上传文档,我才能回答您的问题"
}
# 使用 RAG 检索相关文档
rag_results = []
for doc in docs:
doc_id = doc.get("_id", "")
if doc_id:
@@ -241,12 +389,42 @@ class InstructionExecutor:
doc.get("content", "")[:3000] for doc in docs[:3] if doc.get("content")
])
if not context_text:
return {
"success": True,
"intent": "question",
"question": question,
"answer": None,
"message": "文档内容为空,无法回答问题"
}
# 使用 LLM 生成答案
filename = docs[0].get("metadata", {}).get("original_filename", "文档")
prompt = f"""基于以下文档内容,回答用户的问题。
文档名称:{filename}
用户问题:{question}
文档内容:
{context_text[:8000]}
请根据文档内容准确回答问题。如果文档中没有相关信息,请明确说明。"""
from app.services.llm_service import llm_service
messages = [
{"role": "system", "content": "你是一个专业的文档问答助手,根据提供的内容准确回答用户问题。"},
{"role": "user", "content": prompt}
]
response = await llm_service.chat(messages=messages, temperature=0.3, max_tokens=1500)
answer = llm_service.extract_message_content(response)
return {
"success": True,
"intent": "question",
"question": question,
"context_preview": context_text[:500] + "..." if len(context_text) > 500 else context_text,
"message": "已找到相关上下文,可进行问答"
"answer": answer,
"filename": filename,
"message": "已生成回答"
}
except Exception as e:
@@ -299,12 +477,53 @@ class InstructionExecutor:
async def _execute_compare(self, params: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]:
"""执行对比分析"""
try:
docs = context.get("source_docs", [])
docs = context.get("source_docs", []) or []
instruction_text = context.get("instruction", "")
# 优先从指令中提取具体的文件名
filenames = _extract_filenames_from_text(instruction_text)
if filenames:
# 只选择文件名匹配的那些文档
matched_docs = []
for doc in docs:
fname = doc.get("metadata", {}).get("original_filename", "").lower()
for fn in filenames:
if fn.lower() in fname or fname in fn.lower():
matched_docs.append(doc)
break
# 如果匹配到足够文档,用匹配的
if len(matched_docs) >= 2:
docs = matched_docs
else:
# 匹配不够,尝试按文件名搜索 MongoDB
all_found = []
for fn in filenames:
found = await mongodb.search_documents(fn, limit=5)
all_found.extend(found)
seen = set()
unique_docs = []
for d in all_found:
did = d.get("_id", "")
if did and did not in seen:
seen.add(did)
unique_docs.append(d)
if len(unique_docs) >= 2:
docs = unique_docs
elif len(unique_docs) == 1 and len(docs) >= 1:
# 找到一个指定的 + 用一个通用的
docs = unique_docs + docs[:1]
elif docs and len(filenames) == 1:
# 找到一个指定文件名但只有一个匹配,尝试补充
docs = unique_docs + [d for d in docs if d not in unique_docs]
docs = docs[:2]
if len(docs) < 2:
return {
"success": False,
"intent": "compare",
"error": "对比需要至少2个文档",
"message": "请上传至少2个文档进行对比"
"message": "请上传至少2个文档进行对比,或明确说出要对比的文档名称"
}
# 提取文档基本信息
@@ -329,6 +548,7 @@ class InstructionExecutor:
logger.error(f"对比执行失败: {e}")
return {
"success": False,
"intent": "compare",
"error": str(e),
"message": f"对比分析失败: {str(e)}"
}
@@ -336,10 +556,23 @@ class InstructionExecutor:
async def _execute_edit(self, params: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]:
"""执行文档编辑操作"""
try:
docs = context.get("source_docs", [])
docs = context.get("source_docs", []) or []
instruction_text = context.get("instruction", "")
# 如果没有文档,尝试从指令中提取文件名搜索
if not docs:
filename_match = re.search(r'["""]([^"""]+\.(?:docx|xlsx|md|txt))["""]', instruction_text)
if not filename_match:
filename_match = re.search(r'([^\s]+\.(?:docx|xlsx|md|txt))', instruction_text)
if filename_match:
found = await mongodb.search_documents(filename_match.group(1), limit=3)
if found:
docs = found
if not docs:
return {
"success": False,
"intent": "edit",
"error": "没有可用的文档",
"message": "请先上传要编辑的文档"
}
@@ -405,7 +638,7 @@ class InstructionExecutor:
- Word -> Markdown
"""
try:
docs = context.get("source_docs", [])
docs = context.get("source_docs", []) or []
if not docs:
return {
"success": False,

View File

@@ -28,7 +28,7 @@ class IntentParser:
INTENT_KEYWORDS = {
INTENT_EXTRACT: ["提取", "抽取", "获取", "找出", "查找", "识别", "找到"],
INTENT_FILL_TABLE: ["填表", "填写", "填充", "录入", "导入到表格", "填写到"],
INTENT_SUMMARIZE: ["总结", "摘要", "概括", "概述", "归纳", "提炼"],
INTENT_SUMMARIZE: ["总结", "摘要", "概括", "概述", "归纳", "提炼", "分析", "聊聊"],
INTENT_QUESTION: ["问答", "回答", "解释", "什么是", "为什么", "如何", "怎样", "多少", "几个"],
INTENT_SEARCH: ["搜索", "查找", "检索", "查询", ""],
INTENT_COMPARE: ["对比", "比较", "差异", "区别", "不同"],
@@ -47,12 +47,13 @@ class IntentParser:
def __init__(self):
    """Create a parser whose intent history starts out empty."""
    # One record per parsed instruction; parse() appends a dict carrying
    # the raw "text" and an "intent" slot.
    self.intent_history: List[Dict[str, Any]] = []
async def parse(self, text: str) -> Tuple[str, Dict[str, Any]]:
async def parse(self, text: str, context: Dict[str, Any] = None) -> Tuple[str, Dict[str, Any]]:
"""
解析自然语言指令
Args:
text: 用户输入的自然语言
context: 执行上下文(包含对话历史等)
Returns:
(意图类型, 参数字典)
@@ -61,11 +62,17 @@ class IntentParser:
if not text:
return self.INTENT_UNKNOWN, {}
# 检查对话历史中的上下文
conversation_history = []
if context and context.get("conversation_history"):
conversation_history = context.get("conversation_history", [])
logger.info(f"解析时使用对话历史: {len(conversation_history)} 条消息")
# 记录历史
self.intent_history.append({"text": text, "intent": None})
# 识别意图
intent = self._recognize_intent(text)
# 识别意图(考虑对话上下文)
intent = self._recognize_intent_with_context(text, conversation_history)
# 提取参数
params = self._extract_params(text, intent)
@@ -78,6 +85,42 @@ class IntentParser:
return intent, params
def _recognize_intent_with_context(self, text: str, conversation_history: List[Dict[str, Any]]) -> str:
"""
基于对话历史识别意图
Args:
text: 当前用户输入
conversation_history: 对话历史
Returns:
意图类型
"""
# 如果对话历史为空,使用基础意图识别
if not conversation_history:
return self._recognize_intent(text)
# 基于历史上下文进行意图识别
# 分析最近的对话了解用户意图的延续性
last_intent = None
last_topic = None
for msg in conversation_history[-5:]: # 最多看最近5条消息
if msg.get("role") == "assistant":
last_intent = msg.get("intent")
if msg.get("intent") and msg.get("intent") != "unknown":
last_topic = msg.get("intent")
# 如果当前消息很短(如"继续"、"是的"),可能延续之前的意图
short_confirmation = ["", "是的", "", "继续", "ok", "", "接着", "然后", "还有吗"]
if text.strip() in short_confirmation or len(text.strip()) <= 3:
if last_topic:
logger.info(f"简短确认,延续之前的意图: {last_topic}")
return last_topic
# 否则使用标准意图识别
return self._recognize_intent(text)
def _recognize_intent(self, text: str) -> str:
"""识别意图类型"""
intent_scores: Dict[str, float] = {}
@@ -214,18 +257,27 @@ class IntentParser:
return template_info if template_info else None
def _extract_target_fields(self, text: str) -> List[str]:
"""提取目标字段"""
"""提取目标字段 - 按分隔符切分再逐段清理"""
fields = []
# 匹配 "提取XXX和YYY"、"抽取XXX、YYY"
patterns = [
r"提取([^(and|,|)+]+?)(?:和|与|、|,|plus)",
r"抽取([^(and|,|)+]+?)(?:和|与|、|,|plus)",
]
# 去除提取/抽取前缀
cleaned_text = re.sub(r"^(?:提取|抽取)", "", text).strip()
for pattern in patterns:
matches = re.findall(pattern, text)
fields.extend([m.strip() for m in matches if m.strip()])
# 按'和'、'与'、'、'分割成多段
segments = re.split(r"[和与、]", cleaned_text)
# 常见前缀(这些不是字段名,需要去除)
prefixes = ["文档中的", "文档中", "文件中的", "文件中", "内容中的", "内容中"]
for seg in segments:
seg = seg.strip()
# 去除常见前缀
for p in prefixes:
if seg.startswith(p):
seg = seg[len(p):]
break
if seg and 2 <= len(seg) <= 20:
fields.append(seg)
return list(set(fields))