【智能助手增强】
- 新增对话历史管理:MongoDB新增conversations集合,存储用户与AI的对话上下文,支持多轮对话意图延续
- 新增对话历史API(conversation.py):GET/DELETE conversation历史、列出所有会话
- 意图解析增强:支持基于对话历史的意图识别,上下文理解更准确
- 字段提取优化:支持"提取文档中的医院数量"等自然语言模式,智能去除"文档中的"前缀
- 文档对比优化:从指令中提取文件名并精确匹配source_docs,支持"对比A和B两个文档"
- 文档摘要优化:使用LLM生成真实AI摘要而非返回原始文档预览
【Word模板填表核心功能】
- Word模板字段生成:空白Word上传后,自动从源文档(Excel/Word/TXT/MD)内容AI生成字段名
- Word模板填表(_fill_docx):将提取数据写入Word模板表格,支持精确匹配、模糊匹配、追加新行
- 数据润色(_polish_word_filled_data):LLM对多行Excel数据进行统计归纳(合计/平均/极值),转化为专业自然语言描述
- 段落格式输出:使用📌字段名+值段落+分隔线(灰色横线)格式,提升可读性
- 导出链打通:fill_template返回filled_file_path,export直接返回已填好的Word文件
【其他修复】
- 修复Word导出Windows文件锁问题:NamedTemporaryFile改为mkstemp+close
- 修复Word方框非法字符:扩展clean_text移除\uFFFD、□等Unicode替代符和零宽字符
- 修复文档对比"需要至少2个文档":从指令提取具体文件名优先匹配而非取前2个
- 修复导出format硬编码:自动识别docx/xlsx格式
- Docx解析器增加备用解析方法和更完整的段落/表格/标题提取
- RAG服务新增MySQL数据源支持
This commit is contained in:
@@ -5,9 +5,10 @@
|
||||
"""
|
||||
import logging
|
||||
import json
|
||||
import re
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from app.services.template_fill_service import template_fill_service
|
||||
from app.services.template_fill_service import template_fill_service, TemplateField
|
||||
from app.services.rag_service import rag_service
|
||||
from app.services.markdown_ai_service import markdown_ai_service
|
||||
from app.core.database import mongodb
|
||||
@@ -15,6 +16,31 @@ from app.core.database import mongodb
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _extract_filenames_from_text(text: str) -> List[str]:
|
||||
"""
|
||||
从指令文本中提取文件名列表。
|
||||
|
||||
智能处理用'和'/'与'/'、分隔的多个文件名(尤其是带年号的统计公报)。
|
||||
"""
|
||||
# 先去掉"对比这两个文档"等引导语,只保留文件名部分
|
||||
text = re.sub(r'^(?:对比|比较)这两个?文档[的差异]?[::]?', '', text).strip()
|
||||
text = re.sub(r'两个文档.*$', '', text).strip()
|
||||
if not text:
|
||||
return []
|
||||
|
||||
# 直接查找所有带扩展名的文件名模式
|
||||
results = []
|
||||
for m in re.finditer(r'[^\s,。!?、和与]+(?=\.(?:docx|xlsx|md|txt))', text):
|
||||
start = m.start()
|
||||
ext_match = re.search(r'\.(?:docx|xlsx|md|txt)', text[m.end():])
|
||||
if ext_match:
|
||||
fn = text[start:m.end() + ext_match.end()]
|
||||
if fn:
|
||||
results.append(fn)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
class InstructionExecutor:
|
||||
"""指令执行器"""
|
||||
|
||||
@@ -41,9 +67,10 @@ class InstructionExecutor:
|
||||
self.intent_parser = intent_parser
|
||||
|
||||
context = context or {}
|
||||
context["instruction"] = instruction # 保存原始指令以便后续使用
|
||||
|
||||
# 解析意图
|
||||
intent, params = await self.intent_parser.parse(instruction)
|
||||
# 解析意图(传递对话历史上下文)
|
||||
intent, params = await self.intent_parser.parse(instruction, context)
|
||||
|
||||
# 根据意图类型执行相应操作
|
||||
if intent == "extract":
|
||||
@@ -72,18 +99,48 @@ class InstructionExecutor:
|
||||
async def _execute_extract(self, params: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""执行信息提取"""
|
||||
try:
|
||||
target_fields = params.get("field_refs", [])
|
||||
# target_fields 来自意图解析,field_refs 来自引号/字段关键词匹配
|
||||
target_fields = params.get("target_fields", []) or params.get("field_refs", [])
|
||||
doc_ids = params.get("document_refs", [])
|
||||
instruction_text = context.get("instruction", "")
|
||||
|
||||
# 如果没有指定文档,尝试按文件名精确搜索
|
||||
if not doc_ids or "all_docs" in doc_ids:
|
||||
if instruction_text:
|
||||
import re
|
||||
# 提取引号内的内容或文件名
|
||||
filename_match = re.search(r'["""]([^"""]+)["""]', instruction_text)
|
||||
if filename_match:
|
||||
search_term = filename_match.group(1)
|
||||
else:
|
||||
match = re.search(r'([^\s]+\.(?:docx|xlsx|md|txt))', instruction_text)
|
||||
search_term = match.group(1) if match else None
|
||||
|
||||
if search_term:
|
||||
logger.info(f"提取时搜索文档: {search_term}")
|
||||
searched_docs = await mongodb.search_documents(search_term, limit=5)
|
||||
if searched_docs:
|
||||
# 优先选择文件名完全匹配的文档
|
||||
best_docs = [
|
||||
d for d in searched_docs
|
||||
if search_term.lower() in d.get("metadata", {}).get("original_filename", "").lower()
|
||||
]
|
||||
if not best_docs:
|
||||
best_docs = [searched_docs[0]]
|
||||
context["source_docs"] = best_docs
|
||||
doc_ids = [doc.get("_id", "") for doc in best_docs]
|
||||
logger.info(f"找到 {len(best_docs)} 个文档用于提取,最佳: {best_docs[0].get('metadata', {}).get('original_filename', '?')}")
|
||||
|
||||
if not target_fields:
|
||||
return {
|
||||
"success": False,
|
||||
"intent": "extract",
|
||||
"error": "未指定要提取的字段",
|
||||
"message": "请明确说明要提取哪些字段,如:'提取医院数量和床位数'"
|
||||
}
|
||||
|
||||
# 如果指定了文档,验证文档存在
|
||||
if doc_ids and "all_docs" not in doc_ids:
|
||||
# 如果指定了文档且还没有加载 source_docs,则验证并加载
|
||||
if doc_ids and "all_docs" not in doc_ids and not context.get("source_docs"):
|
||||
valid_docs = []
|
||||
for doc_ref in doc_ids:
|
||||
doc_id = doc_ref.replace("doc_", "")
|
||||
@@ -93,20 +150,22 @@ class InstructionExecutor:
|
||||
if not valid_docs:
|
||||
return {
|
||||
"success": False,
|
||||
"intent": "extract",
|
||||
"error": "指定的文档不存在",
|
||||
"message": "请检查文档编号是否正确"
|
||||
}
|
||||
context["source_docs"] = valid_docs
|
||||
|
||||
# 构建字段列表
|
||||
fields = []
|
||||
for i, field_name in enumerate(target_fields):
|
||||
fields.append({
|
||||
"name": field_name,
|
||||
"cell": f"A{i+1}",
|
||||
"field_type": "text",
|
||||
"required": False
|
||||
})
|
||||
# 构建字段列表(使用 TemplateField dataclass)
|
||||
fields = [
|
||||
TemplateField(
|
||||
name=field_name,
|
||||
cell=f"A{i+1}",
|
||||
field_type="text",
|
||||
required=False
|
||||
)
|
||||
for i, field_name in enumerate(target_fields)
|
||||
]
|
||||
|
||||
# 调用填表服务
|
||||
result = await template_fill_service.fill_template(
|
||||
@@ -143,7 +202,7 @@ class InstructionExecutor:
|
||||
}
|
||||
|
||||
# 获取源文档
|
||||
source_docs = context.get("source_docs", [])
|
||||
source_docs = context.get("source_docs", []) or []
|
||||
source_doc_ids = [doc.get("_id") for doc in source_docs if doc.get("_id")]
|
||||
|
||||
# 获取字段
|
||||
@@ -175,36 +234,103 @@ class InstructionExecutor:
|
||||
}
|
||||
|
||||
async def _execute_summarize(self, params: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""执行摘要总结"""
|
||||
"""执行摘要总结 - 使用 LLM 生成真实摘要"""
|
||||
try:
|
||||
docs = context.get("source_docs", [])
|
||||
import re
|
||||
docs = context.get("source_docs", []) or []
|
||||
instruction_text = context.get("instruction", "")
|
||||
|
||||
# 从指令中提取文件名/关键词,优先搜索精确文档
|
||||
search_term = None
|
||||
if instruction_text:
|
||||
filename_match = re.search(r'["""]([^"""]+)["""]', instruction_text)
|
||||
if filename_match:
|
||||
search_term = filename_match.group(1)
|
||||
else:
|
||||
file_match = re.search(r'([^\s,。!?,]+\.(?:docx|xlsx|md|txt))', instruction_text)
|
||||
if file_match:
|
||||
search_term = file_match.group(1)
|
||||
|
||||
# 如果没有文档或有更精确的搜索词,尝试重新搜索
|
||||
if not docs or search_term:
|
||||
if search_term:
|
||||
logger.info(f"按关键词搜索文档: {search_term}")
|
||||
searched_docs = await mongodb.search_documents(search_term, limit=5)
|
||||
if searched_docs:
|
||||
# 优先使用文件名最匹配的文档
|
||||
docs = sorted(
|
||||
searched_docs,
|
||||
key=lambda d: 1 if search_term.lower() in d.get("metadata", {}).get("original_filename", "").lower() else 0,
|
||||
reverse=True
|
||||
)
|
||||
logger.info(f"找到 {len(docs)} 个文档,最佳匹配: {docs[0].get('metadata', {}).get('original_filename', '?')}")
|
||||
|
||||
if not docs:
|
||||
return {
|
||||
"success": False,
|
||||
"error": "没有可用的文档",
|
||||
"message": "请先上传要总结的文档"
|
||||
"success": True,
|
||||
"intent": "summarize",
|
||||
"action_needed": "provide_document",
|
||||
"message": "我理解了,您想分析文档内容。",
|
||||
"suggestion": "请提供已上传文档的名称(可以是文件名或部分名称),或者上传您想要分析的文档。\n\n支持的格式:docx、xlsx、md、txt\n\n例如:'分析2021年民政事业发展统计公报' 或 '总结卫生健康数据'"
|
||||
}
|
||||
|
||||
summaries = []
|
||||
for doc in docs[:5]: # 最多处理5个文档
|
||||
content = doc.get("content", "")[:5000] # 限制内容长度
|
||||
if content:
|
||||
summaries.append({
|
||||
"filename": doc.get("metadata", {}).get("original_filename", "未知"),
|
||||
"content_preview": content[:500] + "..." if len(content) > 500 else content
|
||||
})
|
||||
# 对第一个(最佳匹配)文档生成 AI 摘要
|
||||
primary_doc = docs[0]
|
||||
content = primary_doc.get("content", "")
|
||||
filename = primary_doc.get("metadata", {}).get("original_filename", "未知文档")
|
||||
|
||||
if not content:
|
||||
return {
|
||||
"success": False,
|
||||
"intent": "summarize",
|
||||
"error": "文档内容为空",
|
||||
"message": f"文档 {filename} 没有可供分析的文本内容"
|
||||
}
|
||||
|
||||
# 使用 LLM 生成摘要
|
||||
content_for_summary = content[:12000] # 最多取前 12000 字
|
||||
user_request = instruction_text or "请总结这份文档"
|
||||
|
||||
prompt = f"""请对以下文档进行全面、有条理的摘要分析。
|
||||
|
||||
文档名称:{filename}
|
||||
用户要求:{user_request}
|
||||
|
||||
文档内容:
|
||||
{content_for_summary}
|
||||
|
||||
请按以下格式输出摘要:
|
||||
1. **文档概述**:简述文档主题和背景(2-3句)
|
||||
2. **主要内容**:列出文档的核心数据和关键信息(用要点列出)
|
||||
3. **重要数据**:提取文档中的重要数字、统计数据
|
||||
4. **主要结论**:归纳文档的主要结论或趋势
|
||||
|
||||
要求:条理清晰,数据准确,不要遗漏关键信息。"""
|
||||
|
||||
from app.services.llm_service import llm_service
|
||||
messages = [
|
||||
{"role": "system", "content": "你是一个专业的文档分析助手,擅长提取关键信息并生成结构化摘要。"},
|
||||
{"role": "user", "content": prompt}
|
||||
]
|
||||
|
||||
response = await llm_service.chat(messages=messages, temperature=0.3, max_tokens=2000)
|
||||
ai_summary = llm_service.extract_message_content(response)
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"intent": "summarize",
|
||||
"summaries": summaries,
|
||||
"message": f"找到 {len(summaries)} 个文档可供参考"
|
||||
"ai_summary": ai_summary,
|
||||
"filename": filename,
|
||||
"doc_id": primary_doc.get("_id", ""),
|
||||
"total_docs_found": len(docs),
|
||||
"message": f"已生成文档摘要"
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"摘要执行失败: {e}")
|
||||
return {
|
||||
"success": False,
|
||||
"intent": "summarize",
|
||||
"error": str(e),
|
||||
"message": f"摘要生成失败: {str(e)}"
|
||||
}
|
||||
@@ -213,17 +339,39 @@ class InstructionExecutor:
|
||||
"""执行问答"""
|
||||
try:
|
||||
question = params.get("question", "")
|
||||
instruction_text = context.get("instruction", "")
|
||||
|
||||
if not question:
|
||||
return {
|
||||
"success": False,
|
||||
"intent": "question",
|
||||
"error": "未提供问题",
|
||||
"message": "请输入要回答的问题"
|
||||
}
|
||||
|
||||
# 使用 RAG 检索相关文档
|
||||
docs = context.get("source_docs", [])
|
||||
rag_results = []
|
||||
docs = context.get("source_docs", []) or []
|
||||
|
||||
# 如果没有文档,尝试从指令中提取文件名搜索
|
||||
if not docs:
|
||||
filename_match = re.search(r'["""]([^"""]+\.(?:docx|xlsx|md|txt))["""]', instruction_text)
|
||||
if not filename_match:
|
||||
filename_match = re.search(r'([^\s,。!?]+\.(?:docx|xlsx|md|txt))', instruction_text)
|
||||
if filename_match:
|
||||
found = await mongodb.search_documents(filename_match.group(1), limit=5)
|
||||
if found:
|
||||
docs = found
|
||||
|
||||
if not docs:
|
||||
return {
|
||||
"success": True,
|
||||
"intent": "question",
|
||||
"question": question,
|
||||
"answer": None,
|
||||
"message": "请先上传文档,我才能回答您的问题"
|
||||
}
|
||||
|
||||
# 使用 RAG 检索相关文档
|
||||
rag_results = []
|
||||
for doc in docs:
|
||||
doc_id = doc.get("_id", "")
|
||||
if doc_id:
|
||||
@@ -241,12 +389,42 @@ class InstructionExecutor:
|
||||
doc.get("content", "")[:3000] for doc in docs[:3] if doc.get("content")
|
||||
])
|
||||
|
||||
if not context_text:
|
||||
return {
|
||||
"success": True,
|
||||
"intent": "question",
|
||||
"question": question,
|
||||
"answer": None,
|
||||
"message": "文档内容为空,无法回答问题"
|
||||
}
|
||||
|
||||
# 使用 LLM 生成答案
|
||||
filename = docs[0].get("metadata", {}).get("original_filename", "文档")
|
||||
prompt = f"""基于以下文档内容,回答用户的问题。
|
||||
|
||||
文档名称:{filename}
|
||||
用户问题:{question}
|
||||
|
||||
文档内容:
|
||||
{context_text[:8000]}
|
||||
|
||||
请根据文档内容准确回答问题。如果文档中没有相关信息,请明确说明。"""
|
||||
|
||||
from app.services.llm_service import llm_service
|
||||
messages = [
|
||||
{"role": "system", "content": "你是一个专业的文档问答助手,根据提供的内容准确回答用户问题。"},
|
||||
{"role": "user", "content": prompt}
|
||||
]
|
||||
response = await llm_service.chat(messages=messages, temperature=0.3, max_tokens=1500)
|
||||
answer = llm_service.extract_message_content(response)
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"intent": "question",
|
||||
"question": question,
|
||||
"context_preview": context_text[:500] + "..." if len(context_text) > 500 else context_text,
|
||||
"message": "已找到相关上下文,可进行问答"
|
||||
"answer": answer,
|
||||
"filename": filename,
|
||||
"message": "已生成回答"
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
@@ -299,12 +477,53 @@ class InstructionExecutor:
|
||||
async def _execute_compare(self, params: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""执行对比分析"""
|
||||
try:
|
||||
docs = context.get("source_docs", [])
|
||||
docs = context.get("source_docs", []) or []
|
||||
instruction_text = context.get("instruction", "")
|
||||
|
||||
# 优先从指令中提取具体的文件名
|
||||
filenames = _extract_filenames_from_text(instruction_text)
|
||||
|
||||
if filenames:
|
||||
# 只选择文件名匹配的那些文档
|
||||
matched_docs = []
|
||||
for doc in docs:
|
||||
fname = doc.get("metadata", {}).get("original_filename", "").lower()
|
||||
for fn in filenames:
|
||||
if fn.lower() in fname or fname in fn.lower():
|
||||
matched_docs.append(doc)
|
||||
break
|
||||
# 如果匹配到足够文档,用匹配的
|
||||
if len(matched_docs) >= 2:
|
||||
docs = matched_docs
|
||||
else:
|
||||
# 匹配不够,尝试按文件名搜索 MongoDB
|
||||
all_found = []
|
||||
for fn in filenames:
|
||||
found = await mongodb.search_documents(fn, limit=5)
|
||||
all_found.extend(found)
|
||||
seen = set()
|
||||
unique_docs = []
|
||||
for d in all_found:
|
||||
did = d.get("_id", "")
|
||||
if did and did not in seen:
|
||||
seen.add(did)
|
||||
unique_docs.append(d)
|
||||
if len(unique_docs) >= 2:
|
||||
docs = unique_docs
|
||||
elif len(unique_docs) == 1 and len(docs) >= 1:
|
||||
# 找到一个指定的 + 用一个通用的
|
||||
docs = unique_docs + docs[:1]
|
||||
elif docs and len(filenames) == 1:
|
||||
# 找到一个指定文件名但只有一个匹配,尝试补充
|
||||
docs = unique_docs + [d for d in docs if d not in unique_docs]
|
||||
docs = docs[:2]
|
||||
|
||||
if len(docs) < 2:
|
||||
return {
|
||||
"success": False,
|
||||
"intent": "compare",
|
||||
"error": "对比需要至少2个文档",
|
||||
"message": "请上传至少2个文档进行对比"
|
||||
"message": "请上传至少2个文档进行对比,或明确说出要对比的文档名称"
|
||||
}
|
||||
|
||||
# 提取文档基本信息
|
||||
@@ -329,6 +548,7 @@ class InstructionExecutor:
|
||||
logger.error(f"对比执行失败: {e}")
|
||||
return {
|
||||
"success": False,
|
||||
"intent": "compare",
|
||||
"error": str(e),
|
||||
"message": f"对比分析失败: {str(e)}"
|
||||
}
|
||||
@@ -336,10 +556,23 @@ class InstructionExecutor:
|
||||
async def _execute_edit(self, params: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""执行文档编辑操作"""
|
||||
try:
|
||||
docs = context.get("source_docs", [])
|
||||
docs = context.get("source_docs", []) or []
|
||||
instruction_text = context.get("instruction", "")
|
||||
|
||||
# 如果没有文档,尝试从指令中提取文件名搜索
|
||||
if not docs:
|
||||
filename_match = re.search(r'["""]([^"""]+\.(?:docx|xlsx|md|txt))["""]', instruction_text)
|
||||
if not filename_match:
|
||||
filename_match = re.search(r'([^\s,。!?]+\.(?:docx|xlsx|md|txt))', instruction_text)
|
||||
if filename_match:
|
||||
found = await mongodb.search_documents(filename_match.group(1), limit=3)
|
||||
if found:
|
||||
docs = found
|
||||
|
||||
if not docs:
|
||||
return {
|
||||
"success": False,
|
||||
"intent": "edit",
|
||||
"error": "没有可用的文档",
|
||||
"message": "请先上传要编辑的文档"
|
||||
}
|
||||
@@ -405,7 +638,7 @@ class InstructionExecutor:
|
||||
- Word -> Markdown
|
||||
"""
|
||||
try:
|
||||
docs = context.get("source_docs", [])
|
||||
docs = context.get("source_docs", []) or []
|
||||
if not docs:
|
||||
return {
|
||||
"success": False,
|
||||
|
||||
@@ -28,7 +28,7 @@ class IntentParser:
|
||||
INTENT_KEYWORDS = {
|
||||
INTENT_EXTRACT: ["提取", "抽取", "获取", "找出", "查找", "识别", "找到"],
|
||||
INTENT_FILL_TABLE: ["填表", "填写", "填充", "录入", "导入到表格", "填写到"],
|
||||
INTENT_SUMMARIZE: ["总结", "摘要", "概括", "概述", "归纳", "提炼"],
|
||||
INTENT_SUMMARIZE: ["总结", "摘要", "概括", "概述", "归纳", "提炼", "分析", "聊聊"],
|
||||
INTENT_QUESTION: ["问答", "回答", "解释", "什么是", "为什么", "如何", "怎样", "多少", "几个"],
|
||||
INTENT_SEARCH: ["搜索", "查找", "检索", "查询", "找"],
|
||||
INTENT_COMPARE: ["对比", "比较", "差异", "区别", "不同"],
|
||||
@@ -47,12 +47,13 @@ class IntentParser:
|
||||
def __init__(self):
|
||||
self.intent_history: List[Dict[str, Any]] = []
|
||||
|
||||
async def parse(self, text: str) -> Tuple[str, Dict[str, Any]]:
|
||||
async def parse(self, text: str, context: Dict[str, Any] = None) -> Tuple[str, Dict[str, Any]]:
|
||||
"""
|
||||
解析自然语言指令
|
||||
|
||||
Args:
|
||||
text: 用户输入的自然语言
|
||||
context: 执行上下文(包含对话历史等)
|
||||
|
||||
Returns:
|
||||
(意图类型, 参数字典)
|
||||
@@ -61,11 +62,17 @@ class IntentParser:
|
||||
if not text:
|
||||
return self.INTENT_UNKNOWN, {}
|
||||
|
||||
# 检查对话历史中的上下文
|
||||
conversation_history = []
|
||||
if context and context.get("conversation_history"):
|
||||
conversation_history = context.get("conversation_history", [])
|
||||
logger.info(f"解析时使用对话历史: {len(conversation_history)} 条消息")
|
||||
|
||||
# 记录历史
|
||||
self.intent_history.append({"text": text, "intent": None})
|
||||
|
||||
# 识别意图
|
||||
intent = self._recognize_intent(text)
|
||||
# 识别意图(考虑对话上下文)
|
||||
intent = self._recognize_intent_with_context(text, conversation_history)
|
||||
|
||||
# 提取参数
|
||||
params = self._extract_params(text, intent)
|
||||
@@ -78,6 +85,42 @@ class IntentParser:
|
||||
|
||||
return intent, params
|
||||
|
||||
def _recognize_intent_with_context(self, text: str, conversation_history: List[Dict[str, Any]]) -> str:
    """Recognise the intent of ``text``, taking recent conversation into account.

    A short follow-up (an explicit confirmation such as "继续", or any
    message of at most three characters) continues the most recent known
    intent found in the history. A short message that itself contains one
    of the ``INTENT_KEYWORDS`` (for example "总结") is treated as a new
    command and goes through normal recognition instead of being overridden
    by the previous intent.

    Args:
        text: The current user input.
        conversation_history: Prior messages; each is a dict that may carry
            ``role`` and ``intent`` keys.

    Returns:
        The intent type.
    """
    # Without history there is nothing to continue.
    if not conversation_history:
        return self._recognize_intent(text)

    # Only the five most recent messages are consulted; the latest assistant
    # message carrying a known intent wins.
    last_topic = None
    for msg in conversation_history[-5:]:
        if msg.get("role") != "assistant":
            continue
        intent = msg.get("intent")
        if intent and intent != "unknown":
            last_topic = intent

    stripped = text.strip()
    confirmations = {"是", "是的", "好", "继续", "ok", "接着", "然后", "还有吗"}

    if last_topic:
        is_confirmation = stripped.lower() in confirmations
        # A short message is only a continuation when it does not name an
        # intent of its own.
        is_short_follow_up = len(stripped) <= 3 and not any(
            keyword in stripped
            for keywords in self.INTENT_KEYWORDS.values()
            for keyword in keywords
        )
        if is_confirmation or is_short_follow_up:
            logger.info(f"简短确认,延续之前的意图: {last_topic}")
            return last_topic

    # Otherwise fall back to standard recognition.
    return self._recognize_intent(text)
|
||||
|
||||
def _recognize_intent(self, text: str) -> str:
|
||||
"""识别意图类型"""
|
||||
intent_scores: Dict[str, float] = {}
|
||||
@@ -214,18 +257,27 @@ class IntentParser:
|
||||
return template_info if template_info else None
|
||||
|
||||
def _extract_target_fields(self, text: str) -> List[str]:
|
||||
"""提取目标字段"""
|
||||
"""提取目标字段 - 按分隔符切分再逐段清理"""
|
||||
fields = []
|
||||
|
||||
# 匹配 "提取XXX和YYY"、"抽取XXX、YYY"
|
||||
patterns = [
|
||||
r"提取([^(and|,|,)+]+?)(?:和|与|、|,|plus)",
|
||||
r"抽取([^(and|,|,)+]+?)(?:和|与|、|,|plus)",
|
||||
]
|
||||
# 去除提取/抽取前缀
|
||||
cleaned_text = re.sub(r"^(?:提取|抽取)", "", text).strip()
|
||||
|
||||
for pattern in patterns:
|
||||
matches = re.findall(pattern, text)
|
||||
fields.extend([m.strip() for m in matches if m.strip()])
|
||||
# 按'和'、'与'、'、'分割成多段
|
||||
segments = re.split(r"[和与、]", cleaned_text)
|
||||
|
||||
# 常见前缀(这些不是字段名,需要去除)
|
||||
prefixes = ["文档中的", "文档中", "文件中的", "文件中", "内容中的", "内容中"]
|
||||
|
||||
for seg in segments:
|
||||
seg = seg.strip()
|
||||
# 去除常见前缀
|
||||
for p in prefixes:
|
||||
if seg.startswith(p):
|
||||
seg = seg[len(p):]
|
||||
break
|
||||
if seg and 2 <= len(seg) <= 20:
|
||||
fields.append(seg)
|
||||
|
||||
return list(set(fields))
|
||||
|
||||
|
||||
Reference in New Issue
Block a user