feat: 实现智能指令的格式转换和文档编辑功能
主要更新: - 新增 transform 意图:支持 Word/Excel/Markdown 格式互转 - 新增 edit 意图:使用 LLM 润色编辑文档内容 - 智能指令接口增加异步执行模式(async_execute 参数) - 修复 Word 模板导出文档损坏问题(改用临时文件方式) - 优化 intent_parser 增加 transform/edit 关键词识别 新增文件: - app/api/endpoints/instruction.py: 智能指令 API 端点 - app/services/multi_doc_reasoning_service.py: 多文档推理服务 其他优化: - RAG 服务混合搜索(BM25 + 向量)融合 - 模板填充服务表头匹配增强 - Word AI 解析服务返回结构完善 - 前端 InstructionChat 组件对接真实 API
This commit is contained in:
@@ -13,6 +13,7 @@ from app.api.endpoints import (
|
||||
visualization,
|
||||
analysis_charts,
|
||||
health,
|
||||
instruction, # 智能指令
|
||||
)
|
||||
|
||||
# 创建主路由
|
||||
@@ -29,3 +30,4 @@ api_router.include_router(templates.router) # 表格模板
|
||||
api_router.include_router(ai_analyze.router) # AI分析
|
||||
api_router.include_router(visualization.router) # 可视化
|
||||
api_router.include_router(analysis_charts.router) # 分析图表
|
||||
api_router.include_router(instruction.router) # 智能指令
|
||||
|
||||
439
backend/app/api/endpoints/instruction.py
Normal file
439
backend/app/api/endpoints/instruction.py
Normal file
@@ -0,0 +1,439 @@
|
||||
"""
|
||||
智能指令 API 接口
|
||||
|
||||
支持自然语言指令解析和执行
|
||||
"""
|
||||
import logging
|
||||
import uuid
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from fastapi import APIRouter, HTTPException, Query, BackgroundTasks
|
||||
from pydantic import BaseModel
|
||||
|
||||
from app.instruction.intent_parser import intent_parser
|
||||
from app.instruction.executor import instruction_executor
|
||||
from app.core.database import mongodb
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
router = APIRouter(prefix="/instruction", tags=["智能指令"])
|
||||
|
||||
|
||||
# ==================== 请求/响应模型 ====================
|
||||
|
||||
class InstructionRequest(BaseModel):
    # Request payload accepted by the instruction endpoints.

    # Natural-language instruction text to parse or execute.
    instruction: str

    # IDs of the documents the instruction refers to (optional).
    doc_ids: Optional[List[str]] = None

    # Extra caller-supplied context (optional).
    context: Optional[Dict[str, Any]] = None
|
||||
|
||||
|
||||
class IntentRecognitionResponse(BaseModel):
    # Response payload of the intent recognition endpoint.

    # False when parsing raised an error.
    success: bool

    # Recognized intent identifier; "error" is used when parsing failed.
    intent: str

    # Parameters produced by the intent parser.
    params: Dict[str, Any]

    # Human-readable status message.
    message: str
|
||||
|
||||
|
||||
class InstructionExecutionResponse(BaseModel):
    # Response payload returned after an instruction has been executed.

    # Whether the execution succeeded.
    success: bool

    # Intent the instruction resolved to; "error" is used on failure.
    intent: str

    # Raw result returned by the executor, or {"error": ...} on failure.
    result: Dict[str, Any]

    # Human-readable status message.
    message: str
|
||||
|
||||
|
||||
# ==================== 接口 ====================
|
||||
|
||||
@router.post("/recognize", response_model=IntentRecognitionResponse)
async def recognize_intent(request: InstructionRequest):
    """
    Intent recognition endpoint.

    Parses a natural-language instruction into a structured intent plus
    its parameters. Any failure is reported in the response body
    (success=False, intent="error") rather than as an HTTP error.

    Example instructions:
    - "提取文档中的医院数量和床位数" (extract fields from a document)
    - "根据这些数据填表" (fill a table from the data)
    - "总结一下这份文档" (summarize a document)
    - "对比这两个文档的差异" (compare two documents)
    """
    try:
        intent, params = await intent_parser.parse(request.instruction)

        # Attach the referenced documents so callers can see what the
        # instruction applies to.
        if request.doc_ids:
            params["document_refs"] = [f"doc_{doc_id}" for doc_id in request.doc_ids]

        # Display names for the known intents; unknown identifiers fall
        # back to the raw intent value in the message below.
        intent_names = {
            "extract": "信息提取",
            "fill_table": "表格填写",
            "summarize": "摘要总结",
            "question": "智能问答",
            "search": "文档搜索",
            "compare": "对比分析",
            "transform": "格式转换",
            "edit": "文档编辑",
            "unknown": "未知"
        }

        return IntentRecognitionResponse(
            success=True,
            intent=intent,
            params=params,
            message=f"识别到意图: {intent_names.get(intent, intent)}"
        )

    except Exception as e:
        # logger.exception keeps the traceback, which a plain error log
        # of the message alone would drop.
        logger.exception("意图识别失败: %s", e)
        return IntentRecognitionResponse(
            success=False,
            intent="error",
            params={},
            message=f"意图识别失败: {str(e)}"
        )
|
||||
|
||||
|
||||
@router.post("/execute")
async def execute_instruction(
    background_tasks: BackgroundTasks,
    request: InstructionRequest,
    async_execute: bool = Query(False, description="是否异步执行(仅返回任务ID)")
):
    """
    Instruction execution endpoint.

    Parses and executes a natural-language instruction.

    Examples:
    - instruction: "提取文档1中的医院数量"
      returns: {"extracted_data": {"医院数量": ["38710个"]}}

    - instruction: "填表"
      returns: {"filled_data": {...}}

    With async_execute=true the instruction runs as a background task and
    only a task ID (plus a status URL) is returned.
    """
    task_id = str(uuid.uuid4())

    # Synchronous mode: run the task inline and return its result.
    if not async_execute:
        return await _execute_instruction_task(task_id, request.instruction, request.doc_ids, request.context)

    # Asynchronous mode: schedule the task and answer immediately.
    task_kwargs = dict(
        task_id=task_id,
        instruction=request.instruction,
        doc_ids=request.doc_ids,
        context=request.context,
    )
    background_tasks.add_task(_execute_instruction_task, **task_kwargs)

    accepted = {
        "success": True,
        "task_id": task_id,
        "message": "指令已提交执行",
        "status_url": f"/api/v1/tasks/{task_id}"
    }
    return accepted
|
||||
|
||||
|
||||
async def _execute_instruction_task(
    task_id: str,
    instruction: str,
    doc_ids: Optional[List[str]],
    context: Optional[Dict[str, Any]]
) -> InstructionExecutionResponse:
    """Execute one instruction and track it as a task record.

    Used both inline (synchronous mode) and as a background task.

    Args:
        task_id: Identifier under which the task record is stored.
        instruction: Natural-language instruction to execute.
        doc_ids: Optional document IDs; documents that can be loaded are
            passed to the executor as ``source_docs``.
        context: Optional caller-supplied context. It is copied, never
            mutated.

    Returns:
        An InstructionExecutionResponse. Failures are reported through
        ``success=False`` / ``intent="error"`` instead of being raised.
    """
    from app.core.database import mongodb as mongo_client

    try:
        # Task bookkeeping is best-effort: a failing task store must not
        # prevent the instruction from running, but it should be visible.
        try:
            await mongo_client.insert_task(
                task_id=task_id,
                task_type="instruction_execute",
                status="processing",
                message="正在执行指令"
            )
        except Exception as record_err:
            logger.warning("任务记录写入失败 (task_id=%s): %s", task_id, record_err)

        # Copy the context so adding source_docs does not leak into the
        # caller's dict (the request model's own field).
        ctx: Dict[str, Any] = dict(context) if context else {}

        # Load the referenced documents; IDs that resolve to nothing are
        # skipped.
        if doc_ids:
            docs = []
            for doc_id in doc_ids:
                doc = await mongo_client.get_document(doc_id)
                if doc:
                    docs.append(doc)

            if docs:
                ctx["source_docs"] = docs
                logger.info(f"指令执行上下文: 关联了 {len(docs)} 个文档")

        result = await instruction_executor.execute(instruction, ctx)

        try:
            await mongo_client.update_task(
                task_id=task_id,
                status="success",
                message="执行完成",
                result=result
            )
        except Exception as record_err:
            logger.warning("任务状态更新失败 (task_id=%s): %s", task_id, record_err)

        return InstructionExecutionResponse(
            success=result.get("success", False),
            intent=result.get("intent", "unknown"),
            result=result,
            message=result.get("message", "执行完成")
        )

    except Exception as e:
        # Keep the traceback in the log; the client only gets the message.
        logger.exception("指令执行失败: %s", e)
        try:
            await mongo_client.update_task(
                task_id=task_id,
                status="failure",
                message="执行失败",
                error=str(e)
            )
        except Exception as record_err:
            logger.warning("任务状态更新失败 (task_id=%s): %s", task_id, record_err)

        return InstructionExecutionResponse(
            success=False,
            intent="error",
            result={"error": str(e)},
            message=f"指令执行失败: {str(e)}"
        )
|
||||
|
||||
|
||||
@router.post("/chat")
async def instruction_chat(
    background_tasks: BackgroundTasks,
    request: InstructionRequest,
    async_execute: bool = Query(False, description="是否异步执行(仅返回任务ID)")
):
    """
    Instruction chat endpoint.

    Executes instructions as part of a multi-turn conversation.

    Example conversation:
    1. User: "上传一些文档"
    2. System: "请上传文档"
    3. User: "提取其中的医院数量"
    4. System: returns the extraction result

    With async_execute=true the instruction runs as a background task and
    only a task ID (plus a status URL) is returned.
    """
    task_id = str(uuid.uuid4())

    # Synchronous mode: run the task inline and return its result.
    if not async_execute:
        return await _execute_chat_task(task_id, request.instruction, request.doc_ids, request.context)

    # Asynchronous mode: schedule the task and answer immediately.
    task_kwargs = dict(
        task_id=task_id,
        instruction=request.instruction,
        doc_ids=request.doc_ids,
        context=request.context,
    )
    background_tasks.add_task(_execute_chat_task, **task_kwargs)

    accepted = {
        "success": True,
        "task_id": task_id,
        "message": "指令已提交执行",
        "status_url": f"/api/v1/tasks/{task_id}"
    }
    return accepted
|
||||
|
||||
|
||||
def _chat_response_message(result: Dict[str, Any]) -> str:
    """Build the user-facing summary message for an executor result."""

    def count(value: Any) -> int:
        # A missing or null payload counts as zero items instead of raising.
        return len(value) if hasattr(value, "__len__") else 0

    intent = result.get("intent", "")

    if intent == "extract":
        return f"已提取 {count(result.get('extracted_data'))} 个字段的数据"
    if intent == "fill_table":
        nested = result.get("result")
        filled = nested.get("filled_data") if isinstance(nested, dict) else None
        return f"填表完成,填写了 {count(filled)} 个字段"
    if intent == "search":
        return f"找到 {count(result.get('results'))} 条相关内容"
    if intent == "compare":
        return f"对比了 {count(result.get('comparison'))} 个文档"

    fixed_messages = {
        "summarize": "已生成文档摘要",
        "question": "已找到相关答案",
        "edit": "编辑操作已完成",
        "transform": "格式转换已完成",
        "unknown": "无法理解该指令,请尝试更明确的描述"
    }
    if intent in fixed_messages:
        return fixed_messages[intent]

    # Unrecognized intent: fall back to whatever the executor reported.
    return result.get("message", "")


async def _execute_chat_task(
    task_id: str,
    instruction: str,
    doc_ids: Optional[List[str]],
    context: Optional[Dict[str, Any]]
):
    """Execute one chat instruction and track it as a task record.

    Used both inline (synchronous mode) and as a background task.

    Args:
        task_id: Identifier under which the task record is stored.
        instruction: Natural-language instruction to execute.
        doc_ids: Optional document IDs; documents that can be loaded are
            passed to the executor as ``source_docs``.
        context: Optional caller-supplied context. It is copied, never
            mutated.

    Returns:
        A dict with ``success``, ``intent``, ``result``, ``message`` and
        ``hint`` on success, or ``success``, ``error`` and ``message``
        when processing failed. Failures are never raised.
    """
    from app.core.database import mongodb as mongo_client

    try:
        # Task bookkeeping is best-effort: a failing task store must not
        # prevent the instruction from running, but it should be visible.
        try:
            await mongo_client.insert_task(
                task_id=task_id,
                task_type="instruction_chat",
                status="processing",
                message="正在处理对话"
            )
        except Exception as record_err:
            logger.warning("任务记录写入失败 (task_id=%s): %s", task_id, record_err)

        # Copy the context so adding source_docs does not leak into the
        # caller's dict.
        ctx: Dict[str, Any] = dict(context) if context else {}

        # Load the referenced documents; IDs that resolve to nothing are
        # skipped.
        if doc_ids:
            docs = []
            for doc_id in doc_ids:
                doc = await mongo_client.get_document(doc_id)
                if doc:
                    docs.append(doc)
            if docs:
                ctx["source_docs"] = docs

        result = await instruction_executor.execute(instruction, ctx)

        # Only the message for the actual intent is built, so a null
        # payload for some other intent can no longer fail the request.
        response = {
            "success": result.get("success", False),
            "intent": result.get("intent", "unknown"),
            "result": result,
            "message": _chat_response_message(result),
            "hint": _get_intent_hint(result.get("intent", ""))
        }

        try:
            await mongo_client.update_task(
                task_id=task_id,
                status="success",
                message="处理完成",
                result=response
            )
        except Exception as record_err:
            logger.warning("任务状态更新失败 (task_id=%s): %s", task_id, record_err)

        return response

    except Exception as e:
        # Keep the traceback in the log; the client only gets the message.
        logger.exception("指令对话失败: %s", e)
        try:
            await mongo_client.update_task(
                task_id=task_id,
                status="failure",
                message="处理失败",
                error=str(e)
            )
        except Exception as record_err:
            logger.warning("任务状态更新失败 (task_id=%s): %s", task_id, record_err)

        return {
            "success": False,
            "error": str(e),
            "message": f"处理失败: {str(e)}"
        }
|
||||
|
||||
|
||||
def _get_intent_hint(intent: str) -> Optional[str]:
    """Return the follow-up suggestion for an intent, or None if it has none."""
    follow_ups = dict(
        extract="您可以继续说 '提取更多字段' 或 '将数据填入表格'",
        fill_table="您可以提供表格模板或说 '帮我创建一个表格'",
        question="您可以继续提问或说 '总结一下这些内容'",
        search="您可以查看搜索结果或说 '对比这些内容'",
        unknown="您可以尝试: '提取数据'、'填表'、'总结'、'问答' 等指令",
    )
    try:
        return follow_ups[intent]
    except KeyError:
        # Intents without a registered suggestion yield no hint.
        return None
|
||||
|
||||
|
||||
@router.get("/intents")
async def list_supported_intents():
    """
    List the supported intent types.

    Returns every natural-language instruction type that is available,
    each with its identifier, display name, example instructions and
    parameter names.
    """
    return {
        "intents": [
            {
                "intent": "extract",
                "name": "信息提取",
                "examples": [
                    "提取文档中的医院数量",
                    "抽取所有机构的名称",
                    "找出表格中的数据"
                ],
                "params": ["field_refs", "document_refs"]
            },
            {
                "intent": "fill_table",
                "name": "表格填写",
                "examples": [
                    "填表",
                    "根据这些数据填写表格",
                    "帮我填到Excel里"
                ],
                "params": ["template", "document_refs"]
            },
            {
                "intent": "summarize",
                "name": "摘要总结",
                "examples": [
                    "总结一下这份文档",
                    "生成摘要",
                    "概括主要内容"
                ],
                "params": ["document_refs"]
            },
            {
                "intent": "question",
                "name": "智能问答",
                "examples": [
                    "这段话说的是什么?",
                    "有多少家医院?",
                    "解释一下这个概念"
                ],
                "params": ["question", "focus"]
            },
            {
                "intent": "search",
                "name": "文档搜索",
                "examples": [
                    "搜索相关内容",
                    "找找看有哪些机构",
                    "查询医院相关的数据"
                ],
                "params": ["field_refs", "question"]
            },
            {
                "intent": "compare",
                "name": "对比分析",
                "examples": [
                    "对比这两个文档",
                    "比较一下差异",
                    "找出不同点"
                ],
                "params": ["document_refs"]
            },
            {
                "intent": "edit",
                "name": "文档编辑",
                "examples": [
                    "润色这段文字",
                    "修改格式",
                    "添加注释"
                ],
                "params": []
            },
            # The "transform" intent is recognized and executed elsewhere
            # in this module, so it is advertised here as well.
            # NOTE(review): the example phrases and the empty params list
            # are assumptions - confirm against intent_parser.
            {
                "intent": "transform",
                "name": "格式转换",
                "examples": [
                    "把这份Word文档转换成Excel",
                    "转换为Markdown格式",
                    "将Excel表格转成Word"
                ],
                "params": []
            }
        ]
    }
|
||||
@@ -610,51 +610,79 @@ async def _export_to_excel(filled_data: dict, template_id: str) -> StreamingResp
|
||||
|
||||
async def _export_to_word(filled_data: dict, template_id: str) -> StreamingResponse:
    """Export the filled-in field values as a Word (.docx) download.

    The document contains a heading, the template file name, the export
    time and a three-column table (field name, value, status). It is
    written to a temporary file first and then read back into memory.

    Args:
        filled_data: Mapping of field name to value. List values are
            joined with ", ".
        template_id: Template identifier or path; only its last path
            component is shown in the document.

    Returns:
        A StreamingResponse carrying the .docx bytes as an attachment.
    """
    import os
    import re
    import tempfile
    from datetime import datetime

    from docx import Document

    def clean_text(text: str) -> str:
        """Strip control characters that can corrupt the Word document."""
        if not text:
            return ""
        text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)
        return text.strip()

    def is_blank(value) -> bool:
        # Only None and empty strings/containers count as "not filled";
        # legitimate values such as 0 or False must still be exported.
        if value is None:
            return True
        try:
            return len(value) == 0
        except TypeError:
            return False

    doc = Document()
    doc.add_heading('填写结果', level=1)

    # Template and export-time information.
    info_para = doc.add_paragraph()
    template_filename = template_id.split('/')[-1].split('\\')[-1] if template_id else '未知'
    info_para.add_run(f"模板文件: {clean_text(template_filename)}\n").bold = True
    info_para.add_run(f"导出时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    doc.add_paragraph()

    # Field table with a header row.
    table = doc.add_table(rows=1, cols=3)
    table.style = 'Table Grid'

    header_cells = table.rows[0].cells
    header_cells[0].text = '字段名'
    header_cells[1].text = '填写值'
    header_cells[2].text = '状态'

    for field_name, field_value in filled_data.items():
        row_cells = table.add_row().cells
        row_cells[0].text = clean_text(str(field_name))

        if isinstance(field_value, list):
            clean_values = [clean_text(str(v)) for v in field_value if not is_blank(v)]
            display_value = ', '.join(clean_values)
        else:
            display_value = '' if is_blank(field_value) else clean_text(str(field_value))

        row_cells[1].text = display_value
        row_cells[2].text = '已填写' if display_value else '为空'

    # Save to a temporary file, then read it back into memory.
    # tmp_path starts as None so the cleanup below cannot hit an unbound
    # name when creating the temporary file itself fails.
    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix='.docx') as tmp_file:
            tmp_path = tmp_file.name

        doc.save(tmp_path)

        with open(tmp_path, 'rb') as f:
            file_content = f.read()
    finally:
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                # Best-effort cleanup: a leftover temp file must not fail
                # the export.
                pass

    filename = "filled_template.docx"

    return StreamingResponse(
        io.BytesIO(file_content),
        media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        headers={"Content-Disposition": f"attachment; filename*=UTF-8''{filename}"}
    )
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user