feat: 实现智能指令的格式转换和文档编辑功能
主要更新:
- 新增 transform 意图:支持 Word/Excel/Markdown 格式互转
- 新增 edit 意图:使用 LLM 润色编辑文档内容
- 智能指令接口增加异步执行模式(async_execute 参数)
- 修复 Word 模板导出文档损坏问题(改用临时文件方式)
- 优化 intent_parser,增加 transform/edit 关键词识别

新增文件:
- app/api/endpoints/instruction.py: 智能指令 API 端点
- app/services/multi_doc_reasoning_service.py: 多文档推理服务

其他优化:
- RAG 服务混合搜索(BM25 + 向量)融合
- 模板填充服务表头匹配增强
- Word AI 解析服务返回结构完善
- 前端 InstructionChat 组件对接真实 API
This commit is contained in:
@@ -2,17 +2,51 @@
|
||||
意图解析器模块
|
||||
|
||||
解析用户自然语言指令,识别意图和参数
|
||||
|
||||
注意: 此模块为可选功能,当前尚未实现。
|
||||
"""
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any, Dict, Tuple
|
||||
import re
|
||||
import logging
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class IntentParser(ABC):
|
||||
"""意图解析器抽象基类"""
|
||||
class IntentParser:
|
||||
"""意图解析器"""
|
||||
|
||||
# 意图类型定义
|
||||
INTENT_EXTRACT = "extract" # 信息提取
|
||||
INTENT_FILL_TABLE = "fill_table" # 填表
|
||||
INTENT_SUMMARIZE = "summarize" # 摘要总结
|
||||
INTENT_QUESTION = "question" # 问答
|
||||
INTENT_SEARCH = "search" # 搜索
|
||||
INTENT_COMPARE = "compare" # 对比分析
|
||||
INTENT_TRANSFORM = "transform" # 格式转换
|
||||
INTENT_EDIT = "edit" # 编辑文档
|
||||
INTENT_UNKNOWN = "unknown" # 未知
|
||||
|
||||
# 意图关键词映射
|
||||
INTENT_KEYWORDS = {
|
||||
INTENT_EXTRACT: ["提取", "抽取", "获取", "找出", "查找", "识别", "找到"],
|
||||
INTENT_FILL_TABLE: ["填表", "填写", "填充", "录入", "导入到表格", "填写到"],
|
||||
INTENT_SUMMARIZE: ["总结", "摘要", "概括", "概述", "归纳", "提炼"],
|
||||
INTENT_QUESTION: ["问答", "回答", "解释", "什么是", "为什么", "如何", "怎样", "多少", "几个"],
|
||||
INTENT_SEARCH: ["搜索", "查找", "检索", "查询", "找"],
|
||||
INTENT_COMPARE: ["对比", "比较", "差异", "区别", "不同"],
|
||||
INTENT_TRANSFORM: ["转换", "转化", "变成", "转为", "导出"],
|
||||
INTENT_EDIT: ["修改", "编辑", "调整", "改写", "润色", "优化"],
|
||||
}
|
||||
|
||||
# 实体模式定义
|
||||
ENTITY_PATTERNS = {
|
||||
"number": [r"\d+", r"[一二三四五六七八九十百千万]+"],
|
||||
"date": [r"\d{4}年", r"\d{1,2}月", r"\d{1,2}日"],
|
||||
"percentage": [r"\d+(\.\d+)?%", r"\d+(\.\d+)?‰"],
|
||||
"currency": [r"\d+(\.\d+)?万元", r"\d+(\.\d+)?亿元", r"\d+(\.\d+)?元"],
|
||||
}
|
||||
|
||||
def __init__(self):
|
||||
self.intent_history: List[Dict[str, Any]] = []
|
||||
|
||||
@abstractmethod
|
||||
async def parse(self, text: str) -> Tuple[str, Dict[str, Any]]:
|
||||
"""
|
||||
解析自然语言指令
|
||||
@@ -23,12 +57,186 @@ class IntentParser(ABC):
|
||||
Returns:
|
||||
(意图类型, 参数字典)
|
||||
"""
|
||||
pass
|
||||
text = text.strip()
|
||||
if not text:
|
||||
return self.INTENT_UNKNOWN, {}
|
||||
|
||||
# 记录历史
|
||||
self.intent_history.append({"text": text, "intent": None})
|
||||
|
||||
# 识别意图
|
||||
intent = self._recognize_intent(text)
|
||||
|
||||
# 提取参数
|
||||
params = self._extract_params(text, intent)
|
||||
|
||||
# 更新历史
|
||||
if self.intent_history:
|
||||
self.intent_history[-1]["intent"] = intent
|
||||
|
||||
logger.info(f"意图解析: text={text[:50]}..., intent={intent}, params={params}")
|
||||
|
||||
return intent, params
|
||||
|
||||
def _recognize_intent(self, text: str) -> str:
|
||||
"""识别意图类型"""
|
||||
intent_scores: Dict[str, float] = {}
|
||||
|
||||
for intent, keywords in self.INTENT_KEYWORDS.items():
|
||||
score = 0
|
||||
for keyword in keywords:
|
||||
if keyword in text:
|
||||
score += 1
|
||||
if score > 0:
|
||||
intent_scores[intent] = score
|
||||
|
||||
if not intent_scores:
|
||||
return self.INTENT_UNKNOWN
|
||||
|
||||
# 返回得分最高的意图
|
||||
return max(intent_scores, key=intent_scores.get)
|
||||
|
||||
def _extract_params(self, text: str, intent: str) -> Dict[str, Any]:
|
||||
"""提取参数"""
|
||||
params: Dict[str, Any] = {
|
||||
"entities": self._extract_entities(text),
|
||||
"document_refs": self._extract_document_refs(text),
|
||||
"field_refs": self._extract_field_refs(text),
|
||||
"template_refs": self._extract_template_refs(text),
|
||||
}
|
||||
|
||||
# 根据意图类型提取特定参数
|
||||
if intent == self.INTENT_QUESTION:
|
||||
params["question"] = text
|
||||
params["focus"] = self._extract_question_focus(text)
|
||||
elif intent == self.INTENT_FILL_TABLE:
|
||||
params["template"] = self._extract_template_info(text)
|
||||
elif intent == self.INTENT_EXTRACT:
|
||||
params["target_fields"] = self._extract_target_fields(text)
|
||||
|
||||
return params
|
||||
|
||||
def _extract_entities(self, text: str) -> Dict[str, List[str]]:
|
||||
"""提取实体"""
|
||||
entities: Dict[str, List[str]] = {}
|
||||
|
||||
for entity_type, patterns in self.ENTITY_PATTERNS.items():
|
||||
matches = []
|
||||
for pattern in patterns:
|
||||
found = re.findall(pattern, text)
|
||||
matches.extend(found)
|
||||
if matches:
|
||||
entities[entity_type] = list(set(matches))
|
||||
|
||||
return entities
|
||||
|
||||
def _extract_document_refs(self, text: str) -> List[str]:
|
||||
"""提取文档引用"""
|
||||
# 匹配 "文档1"、"doc1"、"第一个文档" 等
|
||||
refs = []
|
||||
|
||||
# 数字索引: 文档1, doc1, 第1个文档
|
||||
num_patterns = [
|
||||
r"[文档doc]+(\d+)",
|
||||
r"第(\d+)个文档",
|
||||
r"第(\d+)份",
|
||||
]
|
||||
for pattern in num_patterns:
|
||||
matches = re.findall(pattern, text.lower())
|
||||
refs.extend([f"doc_{m}" for m in matches])
|
||||
|
||||
# "所有文档"、"全部文档"
|
||||
if any(kw in text for kw in ["所有", "全部", "整个"]):
|
||||
refs.append("all_docs")
|
||||
|
||||
return refs
|
||||
|
||||
def _extract_field_refs(self, text: str) -> List[str]:
|
||||
"""提取字段引用"""
|
||||
fields = []
|
||||
|
||||
# 匹配引号内的字段名
|
||||
quoted = re.findall(r"['\"『「]([^'\"』」]+)['\"』」]", text)
|
||||
fields.extend(quoted)
|
||||
|
||||
# 匹配 "xxx字段"、"xxx列" 等
|
||||
field_patterns = [
|
||||
r"([^\s]+)字段",
|
||||
r"([^\s]+)列",
|
||||
r"([^\s]+)数据",
|
||||
]
|
||||
for pattern in field_patterns:
|
||||
matches = re.findall(pattern, text)
|
||||
fields.extend(matches)
|
||||
|
||||
return list(set(fields))
|
||||
|
||||
def _extract_template_refs(self, text: str) -> List[str]:
|
||||
"""提取模板引用"""
|
||||
templates = []
|
||||
|
||||
# 匹配 "表格模板"、"Excel模板"、"表1" 等
|
||||
template_patterns = [
|
||||
r"([^\s]+模板)",
|
||||
r"表(\d+)",
|
||||
r"([^\s]+表格)",
|
||||
]
|
||||
for pattern in template_patterns:
|
||||
matches = re.findall(pattern, text)
|
||||
templates.extend(matches)
|
||||
|
||||
return list(set(templates))
|
||||
|
||||
def _extract_question_focus(self, text: str) -> Optional[str]:
|
||||
"""提取问题焦点"""
|
||||
# "什么是XXX"、"XXX是什么"
|
||||
match = re.search(r"[什么是]([^?]+)", text)
|
||||
if match:
|
||||
return match.group(1).strip()
|
||||
|
||||
# "XXX有多少"
|
||||
match = re.search(r"([^?]+)有多少", text)
|
||||
if match:
|
||||
return match.group(1).strip()
|
||||
|
||||
return None
|
||||
|
||||
def _extract_template_info(self, text: str) -> Optional[Dict[str, str]]:
|
||||
"""提取模板信息"""
|
||||
template_info: Dict[str, str] = {}
|
||||
|
||||
# 提取模板类型
|
||||
if "excel" in text.lower() or "xlsx" in text.lower() or "电子表格" in text:
|
||||
template_info["type"] = "xlsx"
|
||||
elif "word" in text.lower() or "docx" in text.lower() or "文档" in text:
|
||||
template_info["type"] = "docx"
|
||||
|
||||
return template_info if template_info else None
|
||||
|
||||
def _extract_target_fields(self, text: str) -> List[str]:
|
||||
"""提取目标字段"""
|
||||
fields = []
|
||||
|
||||
# 匹配 "提取XXX和YYY"、"抽取XXX、YYY"
|
||||
patterns = [
|
||||
r"提取([^(and|,|,)+]+?)(?:和|与|、|,|plus)",
|
||||
r"抽取([^(and|,|,)+]+?)(?:和|与|、|,|plus)",
|
||||
]
|
||||
|
||||
for pattern in patterns:
|
||||
matches = re.findall(pattern, text)
|
||||
fields.extend([m.strip() for m in matches if m.strip()])
|
||||
|
||||
return list(set(fields))
|
||||
|
||||
def get_intent_history(self) -> List[Dict[str, Any]]:
    """Return the recorded intent history.

    The parser's own list object is returned, not a copy.
    """
    history = self.intent_history
    return history
|
||||
|
||||
def clear_history(self):
    """Discard the recorded intent history.

    The attribute is rebound to a new empty list; a list previously
    obtained from the parser is left untouched.
    """
    self.intent_history = list()
|
||||
|
||||
|
||||
class DefaultIntentParser(IntentParser):
|
||||
"""默认意图解析器"""
|
||||
|
||||
async def parse(self, text: str) -> Tuple[str, Dict[str, Any]]:
|
||||
"""暂未实现"""
|
||||
raise NotImplementedError("意图解析功能暂未实现")
|
||||
# 全局单例
|
||||
intent_parser = IntentParser()
|
||||
|
||||
Reference in New Issue
Block a user