【智能助手增强】
- 新增对话历史管理:MongoDB新增conversations集合,存储用户与AI的对话上下文,支持多轮对话意图延续
- 新增对话历史API(conversation.py):GET/DELETE conversation历史、列出所有会话
- 意图解析增强:支持基于对话历史的意图识别,上下文理解更准确
- 字段提取优化:支持"提取文档中的医院数量"等自然语言模式,智能去除"文档中的"前缀
- 文档对比优化:从指令中提取文件名并精确匹配source_docs,支持"对比A和B两个文档"
- 文档摘要优化:使用LLM生成真实AI摘要而非返回原始文档预览
【Word模板填表核心功能】
- Word模板字段生成:空白Word上传后,自动从源文档(Excel/Word/TXT/MD)内容AI生成字段名
- Word模板填表(_fill_docx):将提取数据写入Word模板表格,支持精确匹配、模糊匹配、追加新行
- 数据润色(_polish_word_filled_data):LLM对多行Excel数据进行统计归纳(合计/平均/极值),转化为专业自然语言描述
- 段落格式输出:使用📌字段名+值段落+分隔线(灰色横线)格式,提升可读性
- 导出链打通:fill_template返回filled_file_path,export直接返回已填好的Word文件
【其他修复】
- 修复Word导出Windows文件锁问题:NamedTemporaryFile改为mkstemp+close
- 修复Word方框非法字符:扩展clean_text移除\uFFFD、□等Unicode替换字符和零宽字符
- 修复文档对比"需要至少2个文档":从指令提取具体文件名优先匹配而非取前2个
- 修复导出format硬编码:自动识别docx/xlsx格式
- Docx解析器增加备用解析方法和更完整的段落/表格/标题提取
- RAG服务新增MySQL数据源支持
This commit is contained in:
@@ -14,6 +14,7 @@ from app.api.endpoints import (
|
||||
analysis_charts,
|
||||
health,
|
||||
instruction, # 智能指令
|
||||
conversation, # 对话历史
|
||||
)
|
||||
|
||||
# 创建主路由
|
||||
@@ -31,3 +32,4 @@ api_router.include_router(ai_analyze.router) # AI分析
|
||||
api_router.include_router(visualization.router) # 可视化
|
||||
api_router.include_router(analysis_charts.router) # 分析图表
|
||||
api_router.include_router(instruction.router) # 智能指令
|
||||
api_router.include_router(conversation.router) # 对话历史
|
||||
|
||||
98
backend/app/api/endpoints/conversation.py
Normal file
98
backend/app/api/endpoints/conversation.py
Normal file
@@ -0,0 +1,98 @@
|
||||
"""
|
||||
对话历史 API 接口
|
||||
|
||||
提供对话历史的存储和查询功能
|
||||
"""
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import APIRouter, HTTPException
|
||||
from pydantic import BaseModel
|
||||
|
||||
from app.core.database import mongodb
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
router = APIRouter(prefix="/conversation", tags=["对话历史"])
|
||||
|
||||
|
||||
# ==================== 请求/响应模型 ====================
|
||||
|
||||
class ConversationMessage(BaseModel):
    # Schema of a single message inside a conversation session.
    # (Documented with comments rather than a docstring so the generated
    # API schema description stays unchanged.)
    role: str  # who produced the message
    content: str  # message text
    intent: Optional[str] = None  # intent label attached to the message, if any
|
||||
|
||||
|
||||
class ConversationHistoryResponse(BaseModel):
    # Response body of the conversation-history endpoint.
    success: bool  # False when the history lookup raised
    messages: list  # stored messages of the conversation (empty on failure)
|
||||
|
||||
|
||||
class ConversationListResponse(BaseModel):
    # Response body of the conversation-list endpoint.
    success: bool  # False when the listing query raised
    conversations: list  # one entry per conversation (empty on failure)
|
||||
|
||||
|
||||
# ==================== 接口 ====================
|
||||
|
||||
@router.get("/{conversation_id}/history", response_model=ConversationHistoryResponse)
async def get_conversation_history(conversation_id: str, limit: int = 20):
    """
    Return the stored messages of one conversation.

    Args:
        conversation_id: Conversation session ID.
        limit: Maximum number of messages to return (default 20); must be
            at least 1.

    Returns:
        ConversationHistoryResponse with ``success=True`` and the messages,
        or ``success=False`` with an empty list when the lookup raises.

    Raises:
        HTTPException: 400 when ``limit`` is smaller than 1.
    """
    # Validate before the try block so the 400 is not swallowed by the
    # generic handler below. The limit is forwarded to a MongoDB cursor,
    # where 0 means "no limit" per the PyMongo cursor documentation, so an
    # unchecked value could pull an entire conversation in one request.
    if limit < 1:
        raise HTTPException(status_code=400, detail="limit must be >= 1")

    try:
        messages = await mongodb.get_conversation_history(conversation_id, limit=limit)
        return ConversationHistoryResponse(
            success=True,
            messages=messages,
        )
    except Exception as e:
        # Best-effort endpoint: report the failure in the payload, but keep
        # the traceback in the log so the root cause stays diagnosable.
        logger.exception(f"获取对话历史失败: {e}")
        return ConversationHistoryResponse(
            success=False,
            messages=[],
        )
|
||||
|
||||
|
||||
@router.delete("/{conversation_id}")
async def delete_conversation(conversation_id: str):
    """
    Delete a conversation session.

    Args:
        conversation_id: Conversation session ID.

    Returns:
        ``{"success": <bool>}`` with the storage layer's result, or
        ``{"success": False, "error": <message>}`` when the deletion raises.
    """
    try:
        deleted = await mongodb.delete_conversation(conversation_id)
    except Exception as exc:
        logger.error(f"删除对话失败: {exc}")
        return {"success": False, "error": str(exc)}
    return {"success": deleted}
|
||||
|
||||
|
||||
@router.get("/all", response_model=ConversationListResponse)
async def list_conversations(limit: int = 50, skip: int = 0):
    """
    List conversation sessions.

    Args:
        limit: Number of conversations to return.
        skip: Number of conversations to skip.

    Returns:
        ConversationListResponse with ``success=True`` and the conversations,
        or ``success=False`` with an empty list when the query raises.
    """
    try:
        found = await mongodb.list_conversations(limit=limit, skip=skip)
        response = ConversationListResponse(success=True, conversations=found)
    except Exception as exc:
        logger.error(f"获取会话列表失败: {exc}")
        response = ConversationListResponse(success=False, conversations=[])
    return response
|
||||
@@ -4,6 +4,7 @@
|
||||
支持多格式文档(docx/xlsx/md/txt)上传、解析、存储和RAG索引
|
||||
集成 Excel 存储和 AI 生成字段描述
|
||||
"""
|
||||
import asyncio
|
||||
import logging
|
||||
import uuid
|
||||
from typing import List, Optional
|
||||
@@ -258,6 +259,7 @@ async def process_document(
|
||||
)
|
||||
|
||||
# 如果是 Excel,存储到 MySQL + AI生成描述 + RAG索引
|
||||
mysql_table_name = None
|
||||
if doc_type in ["xlsx", "xls"]:
|
||||
await update_task_status(
|
||||
task_id, status="processing",
|
||||
@@ -265,17 +267,29 @@ async def process_document(
|
||||
)
|
||||
|
||||
try:
|
||||
# 使用 TableRAG 服务完成建表和RAG索引
|
||||
# 使用 TableRAG 服务存储到 MySQL(跳过 RAG 索引以提升速度)
|
||||
logger.info(f"开始存储Excel到MySQL: {original_filename}, file_path: {file_path}")
|
||||
rag_result = await table_rag_service.build_table_rag_index(
|
||||
file_path=file_path,
|
||||
filename=original_filename,
|
||||
sheet_name=parse_options.get("sheet_name"),
|
||||
header_row=parse_options.get("header_row", 0)
|
||||
header_row=parse_options.get("header_row", 0),
|
||||
skip_rag_index=True # 跳过 AI 字段描述生成和索引
|
||||
)
|
||||
|
||||
if rag_result.get("success"):
|
||||
logger.info(f"Excel存储到MySQL成功: {original_filename}, table: {rag_result.get('table_name')}")
|
||||
mysql_table_name = rag_result.get('table_name')
|
||||
logger.info(f"Excel存储到MySQL成功: {original_filename}, table: {mysql_table_name}")
|
||||
# 更新 MongoDB 中的 metadata,记录 MySQL 表名
|
||||
try:
|
||||
doc = await mongodb.get_document(doc_id)
|
||||
if doc:
|
||||
metadata = doc.get("metadata", {})
|
||||
metadata["mysql_table_name"] = mysql_table_name
|
||||
await mongodb.update_document_metadata(doc_id, metadata)
|
||||
logger.info(f"已更新 MongoDB 文档的 mysql_table_name: {mysql_table_name}")
|
||||
except Exception as update_err:
|
||||
logger.warning(f"更新 MongoDB mysql_table_name 失败: {update_err}")
|
||||
else:
|
||||
logger.error(f"RAG索引构建失败: {rag_result.get('error')}")
|
||||
except Exception as e:
|
||||
@@ -283,17 +297,16 @@ async def process_document(
|
||||
|
||||
else:
|
||||
# 非结构化文档
|
||||
await update_task_status(
|
||||
task_id, status="processing",
|
||||
progress=60, message="正在建立索引"
|
||||
)
|
||||
|
||||
# 如果文档中有表格数据,提取并存储到 MySQL + RAG
|
||||
structured_data = result.data.get("structured_data", {})
|
||||
tables = structured_data.get("tables", [])
|
||||
|
||||
# 如果文档中有表格数据,提取并存储到 MySQL(不需要 RAG 索引)
|
||||
if tables:
|
||||
# 对每个表格建立 MySQL 表和 RAG 索引
|
||||
await update_task_status(
|
||||
task_id, status="processing",
|
||||
progress=60, message="正在存储表格数据"
|
||||
)
|
||||
# 对每个表格建立 MySQL 表(跳过 RAG 索引,速度更快)
|
||||
for table_info in tables:
|
||||
await table_rag_service.index_document_table(
|
||||
doc_id=doc_id,
|
||||
@@ -302,8 +315,14 @@ async def process_document(
|
||||
source_doc_type=doc_type
|
||||
)
|
||||
|
||||
# 同时对文档内容建立 RAG 索引
|
||||
await index_document_to_rag(doc_id, original_filename, result, doc_type)
|
||||
# 对文档内容建立 RAG 索引(非结构化文本需要语义搜索)
|
||||
content = result.data.get("content", "")
|
||||
if content and len(content) > 50: # 只有内容足够长才建立索引
|
||||
await update_task_status(
|
||||
task_id, status="processing",
|
||||
progress=80, message="正在建立语义索引"
|
||||
)
|
||||
await index_document_to_rag(doc_id, original_filename, result, doc_type)
|
||||
|
||||
# 完成
|
||||
await update_task_status(
|
||||
@@ -328,72 +347,95 @@ async def process_document(
|
||||
|
||||
|
||||
async def process_documents_batch(task_id: str, files: List[dict]):
|
||||
"""批量处理文档"""
|
||||
"""批量并行处理文档"""
|
||||
try:
|
||||
await update_task_status(
|
||||
task_id, status="processing",
|
||||
progress=0, message="开始批量处理"
|
||||
progress=0, message=f"开始批量处理 {len(files)} 个文档",
|
||||
result={"total": len(files), "files": []}
|
||||
)
|
||||
|
||||
results = []
|
||||
for i, file_info in enumerate(files):
|
||||
async def process_single_file(file_info: dict, index: int) -> dict:
|
||||
"""处理单个文件"""
|
||||
filename = file_info["filename"]
|
||||
try:
|
||||
# 解析文档
|
||||
parser = ParserFactory.get_parser(file_info["path"])
|
||||
result = parser.parse(file_info["path"])
|
||||
|
||||
if result.success:
|
||||
doc_id = await mongodb.insert_document(
|
||||
doc_type=file_info["ext"],
|
||||
content=result.data.get("content", ""),
|
||||
metadata={
|
||||
**result.metadata,
|
||||
"original_filename": file_info["filename"],
|
||||
"file_path": file_info["path"]
|
||||
},
|
||||
structured_data=result.data.get("structured_data")
|
||||
if not result.success:
|
||||
return {"index": index, "filename": filename, "success": False, "error": result.error or "解析失败"}
|
||||
|
||||
# 存储到 MongoDB
|
||||
doc_id = await mongodb.insert_document(
|
||||
doc_type=file_info["ext"],
|
||||
content=result.data.get("content", ""),
|
||||
metadata={
|
||||
**result.metadata,
|
||||
"original_filename": filename,
|
||||
"file_path": file_info["path"]
|
||||
},
|
||||
structured_data=result.data.get("structured_data")
|
||||
)
|
||||
|
||||
# Excel 处理
|
||||
if file_info["ext"] in ["xlsx", "xls"]:
|
||||
await table_rag_service.build_table_rag_index(
|
||||
file_path=file_info["path"],
|
||||
filename=filename,
|
||||
skip_rag_index=True # 跳过 AI 字段描述生成和索引
|
||||
)
|
||||
|
||||
# Excel 处理
|
||||
if file_info["ext"] in ["xlsx", "xls"]:
|
||||
await table_rag_service.build_table_rag_index(
|
||||
file_path=file_info["path"],
|
||||
filename=file_info["filename"]
|
||||
)
|
||||
else:
|
||||
# 非结构化文档:处理其中的表格 + 内容索引
|
||||
structured_data = result.data.get("structured_data", {})
|
||||
tables = structured_data.get("tables", [])
|
||||
|
||||
if tables:
|
||||
for table_info in tables:
|
||||
await table_rag_service.index_document_table(
|
||||
doc_id=doc_id,
|
||||
filename=file_info["filename"],
|
||||
table_data=table_info,
|
||||
source_doc_type=file_info["ext"]
|
||||
)
|
||||
|
||||
await index_document_to_rag(doc_id, file_info["filename"], result, file_info["ext"])
|
||||
|
||||
results.append({"filename": file_info["filename"], "doc_id": doc_id, "success": True})
|
||||
else:
|
||||
results.append({"filename": file_info["filename"], "success": False, "error": result.error})
|
||||
# 非结构化文档
|
||||
structured_data = result.data.get("structured_data", {})
|
||||
tables = structured_data.get("tables", [])
|
||||
|
||||
# 表格数据直接存 MySQL(跳过 RAG 索引)
|
||||
if tables:
|
||||
for table_info in tables:
|
||||
await table_rag_service.index_document_table(
|
||||
doc_id=doc_id,
|
||||
filename=filename,
|
||||
table_data=table_info,
|
||||
source_doc_type=file_info["ext"]
|
||||
)
|
||||
|
||||
# 只有内容足够长才建立语义索引
|
||||
content = result.data.get("content", "")
|
||||
if content and len(content) > 50:
|
||||
await index_document_to_rag(doc_id, filename, result, file_info["ext"])
|
||||
|
||||
return {"index": index, "filename": filename, "doc_id": doc_id, "success": True}
|
||||
|
||||
except Exception as e:
|
||||
results.append({"filename": file_info["filename"], "success": False, "error": str(e)})
|
||||
logger.error(f"处理文件 {filename} 失败: {e}")
|
||||
return {"index": index, "filename": filename, "success": False, "error": str(e)}
|
||||
|
||||
progress = int((i + 1) / len(files) * 100)
|
||||
await update_task_status(
|
||||
task_id, status="processing",
|
||||
progress=progress, message=f"已处理 {i+1}/{len(files)}"
|
||||
)
|
||||
# 并行处理所有文档
|
||||
tasks = [process_single_file(f, i) for i, f in enumerate(files)]
|
||||
results = await asyncio.gather(*tasks)
|
||||
|
||||
# 按原始顺序排序
|
||||
results.sort(key=lambda x: x["index"])
|
||||
|
||||
# 统计成功/失败数量
|
||||
success_count = sum(1 for r in results if r["success"])
|
||||
fail_count = len(results) - success_count
|
||||
|
||||
# 更新最终状态
|
||||
await update_task_status(
|
||||
task_id, status="success",
|
||||
progress=100, message="批量处理完成",
|
||||
result={"results": results}
|
||||
progress=100, message=f"批量处理完成: {success_count} 成功, {fail_count} 失败",
|
||||
result={
|
||||
"total": len(files),
|
||||
"success": success_count,
|
||||
"failure": fail_count,
|
||||
"results": results
|
||||
}
|
||||
)
|
||||
|
||||
logger.info(f"批量处理完成: {success_count}/{len(files)} 成功")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"批量处理失败: {str(e)}")
|
||||
await update_task_status(
|
||||
@@ -404,20 +446,20 @@ async def process_documents_batch(task_id: str, files: List[dict]):
|
||||
|
||||
|
||||
async def index_document_to_rag(doc_id: str, filename: str, result: ParseResult, doc_type: str):
|
||||
"""将非结构化文档索引到 RAG(使用分块索引)"""
|
||||
"""将非结构化文档索引到 RAG(使用分块索引,异步执行)"""
|
||||
try:
|
||||
content = result.data.get("content", "")
|
||||
if content:
|
||||
# 将完整内容传递给 RAG 服务自动分块索引
|
||||
rag_service.index_document_content(
|
||||
# 使用异步方法索引,避免阻塞事件循环
|
||||
await rag_service.index_document_content_async(
|
||||
doc_id=doc_id,
|
||||
content=content, # 传递完整内容,由 RAG 服务自动分块
|
||||
content=content,
|
||||
metadata={
|
||||
"filename": filename,
|
||||
"doc_type": doc_type
|
||||
},
|
||||
chunk_size=500, # 每块 500 字符
|
||||
chunk_overlap=50 # 块之间 50 字符重叠
|
||||
chunk_size=1000, # 每块 1000 字符,提升速度
|
||||
chunk_overlap=100 # 块之间 100 字符重叠
|
||||
)
|
||||
logger.info(f"RAG 索引完成: {filename}, doc_id={doc_id}")
|
||||
except Exception as e:
|
||||
|
||||
@@ -25,6 +25,7 @@ class InstructionRequest(BaseModel):
|
||||
instruction: str
|
||||
doc_ids: Optional[List[str]] = None # 关联的文档 ID 列表
|
||||
context: Optional[Dict[str, Any]] = None # 额外上下文
|
||||
conversation_id: Optional[str] = None # 对话会话ID,用于关联历史记录
|
||||
|
||||
|
||||
class IntentRecognitionResponse(BaseModel):
|
||||
@@ -240,7 +241,8 @@ async def instruction_chat(
|
||||
task_id=task_id,
|
||||
instruction=request.instruction,
|
||||
doc_ids=request.doc_ids,
|
||||
context=request.context
|
||||
context=request.context,
|
||||
conversation_id=request.conversation_id
|
||||
)
|
||||
|
||||
return {
|
||||
@@ -251,14 +253,15 @@ async def instruction_chat(
|
||||
}
|
||||
|
||||
# 同步模式:等待执行完成
|
||||
return await _execute_chat_task(task_id, request.instruction, request.doc_ids, request.context)
|
||||
return await _execute_chat_task(task_id, request.instruction, request.doc_ids, request.context, request.conversation_id)
|
||||
|
||||
|
||||
async def _execute_chat_task(
|
||||
task_id: str,
|
||||
instruction: str,
|
||||
doc_ids: Optional[List[str]],
|
||||
context: Optional[Dict[str, Any]]
|
||||
context: Optional[Dict[str, Any]],
|
||||
conversation_id: Optional[str] = None
|
||||
):
|
||||
"""执行指令对话的后台任务"""
|
||||
from app.core.database import mongodb as mongo_client
|
||||
@@ -278,6 +281,13 @@ async def _execute_chat_task(
|
||||
# 构建上下文
|
||||
ctx: Dict[str, Any] = context or {}
|
||||
|
||||
# 获取对话历史
|
||||
if conversation_id:
|
||||
history = await mongo_client.get_conversation_history(conversation_id, limit=20)
|
||||
if history:
|
||||
ctx["conversation_history"] = history
|
||||
logger.info(f"加载对话历史: conversation_id={conversation_id}, 消息数={len(history)}")
|
||||
|
||||
# 获取关联文档
|
||||
if doc_ids:
|
||||
docs = []
|
||||
@@ -291,6 +301,29 @@ async def _execute_chat_task(
|
||||
# 执行指令
|
||||
result = await instruction_executor.execute(instruction, ctx)
|
||||
|
||||
# 存储对话历史
|
||||
if conversation_id:
|
||||
try:
|
||||
# 存储用户消息
|
||||
await mongo_client.insert_conversation(
|
||||
conversation_id=conversation_id,
|
||||
role="user",
|
||||
content=instruction,
|
||||
intent=result.get("intent", "unknown")
|
||||
)
|
||||
# 存储助手回复
|
||||
response_content = result.get("message", "")
|
||||
if response_content:
|
||||
await mongo_client.insert_conversation(
|
||||
conversation_id=conversation_id,
|
||||
role="assistant",
|
||||
content=response_content,
|
||||
intent=result.get("intent", "unknown")
|
||||
)
|
||||
logger.info(f"已存储对话历史: conversation_id={conversation_id}")
|
||||
except Exception as e:
|
||||
logger.error(f"存储对话历史失败: {e}")
|
||||
|
||||
# 根据意图类型添加友好的响应消息
|
||||
response_messages = {
|
||||
"extract": f"已提取 {len(result.get('extracted_data', {}))} 个字段的数据",
|
||||
|
||||
@@ -87,6 +87,7 @@ class ExportRequest(BaseModel):
|
||||
template_id: str
|
||||
filled_data: dict
|
||||
format: str = "xlsx" # xlsx 或 docx
|
||||
filled_file_path: Optional[str] = None # 已填写的 Word 文件路径(可选)
|
||||
|
||||
|
||||
# ==================== 接口实现 ====================
|
||||
@@ -541,7 +542,7 @@ async def export_filled_template(
|
||||
if request.format == "xlsx":
|
||||
return await _export_to_excel(request.filled_data, request.template_id)
|
||||
elif request.format == "docx":
|
||||
return await _export_to_word(request.filled_data, request.template_id)
|
||||
return await _export_to_word(request.filled_data, request.template_id, request.filled_file_path)
|
||||
else:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
@@ -608,11 +609,12 @@ async def _export_to_excel(filled_data: dict, template_id: str) -> StreamingResp
|
||||
)
|
||||
|
||||
|
||||
async def _export_to_word(filled_data: dict, template_id: str) -> StreamingResponse:
|
||||
async def _export_to_word(filled_data: dict, template_id: str, filled_file_path: Optional[str] = None) -> StreamingResponse:
|
||||
"""导出为 Word 格式"""
|
||||
import re
|
||||
import tempfile
|
||||
import os
|
||||
import urllib.parse
|
||||
from docx import Document
|
||||
from docx.shared import Pt, RGBColor
|
||||
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
||||
@@ -623,12 +625,32 @@ async def _export_to_word(filled_data: dict, template_id: str) -> StreamingRespo
|
||||
return ""
|
||||
# 移除控制字符
|
||||
text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)
|
||||
# 转义 XML 特殊字符以防破坏文档结构
|
||||
text = text.replace('&', '&').replace('<', '<').replace('>', '>')
|
||||
return text.strip()
|
||||
|
||||
tmp_path = None
|
||||
try:
|
||||
# 先保存到临时文件,再读取到内存,确保文档完整性
|
||||
with tempfile.NamedTemporaryFile(delete=False, suffix='.docx') as tmp_file:
|
||||
tmp_path = tmp_file.name
|
||||
# 如果有已填写的文件(通过 _fill_docx 填写了模板单元格),直接返回该文件
|
||||
if filled_file_path and os.path.exists(filled_file_path):
|
||||
filename = os.path.basename(filled_file_path)
|
||||
with open(filled_file_path, 'rb') as f:
|
||||
file_content = f.read()
|
||||
output = io.BytesIO(file_content)
|
||||
encoded_filename = urllib.parse.quote(filename)
|
||||
return StreamingResponse(
|
||||
output,
|
||||
media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
headers={
|
||||
"Content-Disposition": f"attachment; filename*=UTF-8''{encoded_filename}",
|
||||
"Content-Length": str(len(file_content))
|
||||
}
|
||||
)
|
||||
|
||||
# 没有已填写文件,创建新的 Word 文档(表格形式)
|
||||
# 创建临时文件(立即关闭句柄,避免 Windows 文件锁问题)
|
||||
tmp_fd, tmp_path = tempfile.mkstemp(suffix='.docx')
|
||||
os.close(tmp_fd) # 关闭立即得到的 fd,让 docx 可以写入
|
||||
|
||||
doc = Document()
|
||||
doc.add_heading('填写结果', level=1)
|
||||
@@ -670,19 +692,23 @@ async def _export_to_word(filled_data: dict, template_id: str) -> StreamingRespo
|
||||
|
||||
finally:
|
||||
# 清理临时文件
|
||||
if os.path.exists(tmp_path):
|
||||
if tmp_path and os.path.exists(tmp_path):
|
||||
try:
|
||||
os.unlink(tmp_path)
|
||||
except:
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
output = io.BytesIO(file_content)
|
||||
filename = "filled_template.docx"
|
||||
encoded_filename = urllib.parse.quote(filename)
|
||||
|
||||
return StreamingResponse(
|
||||
output,
|
||||
media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
headers={"Content-Disposition": f"attachment; filename*=UTF-8''{filename}"}
|
||||
headers={
|
||||
"Content-Disposition": f"attachment; filename*=UTF-8''{encoded_filename}",
|
||||
"Content-Length": str(len(file_content))
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -64,6 +64,11 @@ class MongoDB:
|
||||
"""任务集合 - 存储任务历史记录"""
|
||||
return self.db["tasks"]
|
||||
|
||||
@property
def conversations(self):
    """Conversation collection - holds the conversation history records."""
    collection = self.db["conversations"]
    return collection
|
||||
|
||||
# ==================== 文档操作 ====================
|
||||
|
||||
async def insert_document(
|
||||
@@ -117,14 +122,20 @@ class MongoDB:
|
||||
搜索文档
|
||||
|
||||
Args:
|
||||
query: 搜索关键词
|
||||
query: 搜索关键词(支持文件名和内容搜索)
|
||||
doc_type: 文档类型过滤
|
||||
limit: 返回数量
|
||||
|
||||
Returns:
|
||||
文档列表
|
||||
"""
|
||||
filter_query = {"content": {"$regex": query}}
|
||||
filter_query = {
|
||||
"$or": [
|
||||
{"content": {"$regex": query, "$options": "i"}},
|
||||
{"metadata.original_filename": {"$regex": query, "$options": "i"}},
|
||||
{"metadata.filename": {"$regex": query, "$options": "i"}},
|
||||
]
|
||||
}
|
||||
if doc_type:
|
||||
filter_query["doc_type"] = doc_type
|
||||
|
||||
@@ -141,6 +152,15 @@ class MongoDB:
|
||||
result = await self.documents.delete_one({"_id": ObjectId(doc_id)})
|
||||
return result.deleted_count > 0
|
||||
|
||||
async def update_document_metadata(self, doc_id: str, metadata: Dict[str, Any]) -> bool:
    """
    Overwrite the ``metadata`` field of one document.

    Args:
        doc_id: String form of the document's ObjectId.
        metadata: New metadata mapping; it replaces the stored ``metadata``
            value as a whole via ``$set``.

    Returns:
        True when a document with that ID was found (including the case
        where the stored metadata was already identical), False otherwise.
    """
    from bson import ObjectId

    result = await self.documents.update_one(
        {"_id": ObjectId(doc_id)},
        {"$set": {"metadata": metadata}},
    )
    # matched_count rather than modified_count: writing metadata that equals
    # the stored value modifies nothing, yet the update did succeed and must
    # not be reported to the caller as a failure.
    return result.matched_count > 0
|
||||
|
||||
# ==================== RAG 索引操作 ====================
|
||||
|
||||
async def insert_rag_entry(
|
||||
@@ -251,6 +271,10 @@ class MongoDB:
|
||||
await self.tasks.create_index("task_id", unique=True)
|
||||
await self.tasks.create_index("created_at")
|
||||
|
||||
# 对话集合索引
|
||||
await self.conversations.create_index("conversation_id")
|
||||
await self.conversations.create_index("created_at")
|
||||
|
||||
logger.info("MongoDB 索引创建完成")
|
||||
|
||||
# ==================== 任务历史操作 ====================
|
||||
@@ -369,6 +393,108 @@ class MongoDB:
|
||||
result = await self.tasks.delete_one({"task_id": task_id})
|
||||
return result.deleted_count > 0
|
||||
|
||||
# ==================== 对话历史操作 ====================
|
||||
|
||||
async def insert_conversation(
    self,
    conversation_id: str,
    role: str,
    content: str,
    intent: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> str:
    """
    Insert one conversation message.

    Args:
        conversation_id: Conversation session ID.
        role: Message role (user/assistant).
        content: Message text.
        intent: Intent type, if any.
        metadata: Extra metadata; stored as an empty dict when omitted.

    Returns:
        The inserted document's ID as a string.
    """
    from datetime import timezone

    message = {
        "conversation_id": conversation_id,
        "role": role,
        "content": content,
        "intent": intent,
        "metadata": metadata or {},
        # Timezone-aware UTC timestamp: datetime.utcnow() is deprecated and
        # returns a naive value that is easy to misread as local time.
        "created_at": datetime.now(timezone.utc),
    }
    result = await self.conversations.insert_one(message)
    return str(result.inserted_id)
|
||||
|
||||
async def get_conversation_history(
    self,
    conversation_id: str,
    limit: int = 20,
) -> List[Dict[str, Any]]:
    """
    Fetch the most recent messages of a conversation in chronological order.

    Args:
        conversation_id: Conversation session ID.
        limit: Maximum number of messages to return.

    Returns:
        Message dicts, oldest first, with ``_id`` converted to ``str`` and
        ``created_at`` converted to an ISO-8601 string.
    """
    # Pick the NEWEST `limit` messages (descending sort + limit) and flip
    # them back to chronological order afterwards. Sorting ascending before
    # limiting pins the result to the first `limit` messages ever stored,
    # so a long conversation would never surface its latest turns.
    # `_id` is a tie-break for messages that share a `created_at` value.
    cursor = (
        self.conversations.find({"conversation_id": conversation_id})
        .sort([("created_at", -1), ("_id", -1)])
        .limit(limit)
    )

    messages = []
    async for msg in cursor:
        # Make the document JSON-friendly for API responses.
        msg["_id"] = str(msg["_id"])
        if msg.get("created_at"):
            msg["created_at"] = msg["created_at"].isoformat()
        messages.append(msg)

    messages.reverse()
    return messages
|
||||
|
||||
async def delete_conversation(self, conversation_id: str) -> bool:
    """Delete every stored message of a conversation; True if any was removed."""
    selector = {"conversation_id": conversation_id}
    outcome = await self.conversations.delete_many(selector)
    removed_any = outcome.deleted_count > 0
    return removed_any
|
||||
|
||||
async def list_conversations(
    self,
    limit: int = 50,
    skip: int = 0,
) -> List[Dict[str, Any]]:
    """
    List conversations, ordered by their most recent message.

    Args:
        limit: Number of conversations to return.
        skip: Number of conversations to skip.

    Returns:
        One dict per conversation: the latest message document of that
        conversation, with ``_id`` as ``str`` and ``created_at`` as an
        ISO-8601 string.
    """
    # Newest messages first, so that $first inside the group stage picks
    # the latest message of each conversation.
    latest_per_conversation = {
        "$group": {
            "_id": "$conversation_id",
            "last_message": {"$first": "$$ROOT"},
        }
    }
    stages = [
        {"$sort": {"created_at": -1}},
        latest_per_conversation,
        {"$replaceRoot": {"newRoot": "$last_message"}},
        {"$sort": {"created_at": -1}},
        {"$skip": skip},
        {"$limit": limit},
    ]

    def _serialize(entry):
        # Convert the fields that are not JSON-friendly.
        entry["_id"] = str(entry["_id"])
        if entry.get("created_at"):
            entry["created_at"] = entry["created_at"].isoformat()
        return entry

    return [_serialize(entry) async for entry in self.conversations.aggregate(stages)]
|
||||
|
||||
|
||||
# ==================== 全局单例 ====================
|
||||
|
||||
|
||||
@@ -44,6 +44,22 @@ class DocxParser(BaseParser):
|
||||
error=f"文件不存在: {file_path}"
|
||||
)
|
||||
|
||||
# 尝试使用 python-docx 解析,失败则使用备用方法
|
||||
try:
|
||||
return self._parse_with_docx(path)
|
||||
except Exception as e:
|
||||
logger.warning(f"python-docx 解析失败,使用备用方法: {e}")
|
||||
try:
|
||||
return self._parse_fallback(path)
|
||||
except Exception as fallback_error:
|
||||
logger.error(f"备用解析方法也失败: {fallback_error}")
|
||||
return ParseResult(
|
||||
success=False,
|
||||
error=f"解析 Word 文档失败: {str(e)}"
|
||||
)
|
||||
|
||||
def _parse_with_docx(self, path: Path) -> ParseResult:
|
||||
"""使用 python-docx 解析文档"""
|
||||
# 检查文件扩展名
|
||||
if path.suffix.lower() not in self.supported_extensions:
|
||||
return ParseResult(
|
||||
@@ -51,98 +67,177 @@ class DocxParser(BaseParser):
|
||||
error=f"不支持的文件类型: {path.suffix}"
|
||||
)
|
||||
|
||||
# 读取 Word 文档
|
||||
doc = Document(path)
|
||||
|
||||
# 提取文本内容
|
||||
paragraphs = []
|
||||
for para in doc.paragraphs:
|
||||
if para.text.strip():
|
||||
paragraphs.append({
|
||||
"text": para.text,
|
||||
"style": str(para.style.name) if para.style else "Normal"
|
||||
})
|
||||
|
||||
# 提取段落纯文本(用于 AI 解析)
|
||||
paragraphs_text = [p["text"] for p in paragraphs if p["text"].strip()]
|
||||
|
||||
# 提取表格内容
|
||||
tables_data = []
|
||||
for i, table in enumerate(doc.tables):
|
||||
table_rows = []
|
||||
for row in table.rows:
|
||||
row_data = [cell.text.strip() for cell in row.cells]
|
||||
table_rows.append(row_data)
|
||||
|
||||
if table_rows:
|
||||
tables_data.append({
|
||||
"table_index": i,
|
||||
"rows": table_rows,
|
||||
"row_count": len(table_rows),
|
||||
"column_count": len(table_rows[0]) if table_rows else 0
|
||||
})
|
||||
|
||||
# 提取图片/嵌入式对象信息
|
||||
images_info = self._extract_images_info(doc, path)
|
||||
|
||||
# 合并所有文本(包括图片描述)
|
||||
full_text_parts = []
|
||||
full_text_parts.append("【文档正文】")
|
||||
full_text_parts.extend(paragraphs_text)
|
||||
|
||||
if tables_data:
|
||||
full_text_parts.append("\n【文档表格】")
|
||||
for idx, table in enumerate(tables_data):
|
||||
full_text_parts.append(f"--- 表格 {idx + 1} ---")
|
||||
for row in table["rows"]:
|
||||
full_text_parts.append(" | ".join(str(cell) for cell in row))
|
||||
|
||||
if images_info.get("image_count", 0) > 0:
|
||||
full_text_parts.append(f"\n【文档图片】文档包含 {images_info['image_count']} 张图片/图表")
|
||||
|
||||
full_text = "\n".join(full_text_parts)
|
||||
|
||||
# 构建元数据
|
||||
metadata = {
|
||||
"filename": path.name,
|
||||
"extension": path.suffix.lower(),
|
||||
"paragraph_count": len(paragraphs),
|
||||
"table_count": len(tables_data),
|
||||
"image_count": images_info.get("image_count", 0)
|
||||
}
|
||||
|
||||
return ParseResult(
|
||||
success=True,
|
||||
data={
|
||||
"content": full_text,
|
||||
"paragraphs": paragraphs,
|
||||
"paragraphs_with_style": paragraphs,
|
||||
"tables": tables_data,
|
||||
"images": images_info
|
||||
},
|
||||
metadata=metadata
|
||||
)
|
||||
|
||||
def _parse_fallback(self, path: Path) -> ParseResult:
|
||||
"""备用解析方法:直接解析 docx 的 XML 结构"""
|
||||
import zipfile
|
||||
from xml.etree import ElementTree as ET
|
||||
|
||||
try:
|
||||
# 读取 Word 文档
|
||||
doc = Document(file_path)
|
||||
with zipfile.ZipFile(path, 'r') as zf:
|
||||
# 读取 document.xml
|
||||
if 'word/document.xml' not in zf.namelist():
|
||||
return ParseResult(success=False, error="无效的 docx 文件格式")
|
||||
|
||||
# 提取文本内容
|
||||
paragraphs = []
|
||||
for para in doc.paragraphs:
|
||||
if para.text.strip():
|
||||
paragraphs.append({
|
||||
"text": para.text,
|
||||
"style": str(para.style.name) if para.style else "Normal"
|
||||
xml_content = zf.read('word/document.xml')
|
||||
root = ET.fromstring(xml_content)
|
||||
|
||||
# 命名空间
|
||||
namespaces = {
|
||||
'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
|
||||
}
|
||||
|
||||
paragraphs = []
|
||||
tables = []
|
||||
current_table = []
|
||||
|
||||
for elem in root.iter():
|
||||
if elem.tag.endswith('}p'): # 段落
|
||||
text_parts = []
|
||||
for t in elem.iter():
|
||||
if t.tag.endswith('}t') and t.text:
|
||||
text_parts.append(t.text)
|
||||
text = ''.join(text_parts).strip()
|
||||
if text:
|
||||
paragraphs.append({'text': text, 'style': 'Normal'})
|
||||
elif elem.tag.endswith('}tr'): # 表格行
|
||||
row_data = []
|
||||
for tc in elem.iter():
|
||||
if tc.tag.endswith('}tc'): # 单元格
|
||||
cell_text = []
|
||||
for t in tc.iter():
|
||||
if t.tag.endswith('}t') and t.text:
|
||||
cell_text.append(t.text)
|
||||
row_data.append(''.join(cell_text).strip())
|
||||
if row_data:
|
||||
current_table.append(row_data)
|
||||
else:
|
||||
# 表格结束,保存
|
||||
if current_table:
|
||||
tables.append({
|
||||
'table_index': len(tables),
|
||||
'rows': current_table,
|
||||
'row_count': len(current_table),
|
||||
'column_count': len(current_table[0]) if current_table else 0
|
||||
})
|
||||
current_table = []
|
||||
|
||||
# 保存最后一张表格
|
||||
if current_table:
|
||||
tables.append({
|
||||
'table_index': len(tables),
|
||||
'rows': current_table,
|
||||
'row_count': len(current_table),
|
||||
'column_count': len(current_table[0]) if current_table else 0
|
||||
})
|
||||
|
||||
# 提取段落纯文本(用于 AI 解析)
|
||||
paragraphs_text = [p["text"] for p in paragraphs if p["text"].strip()]
|
||||
# 构建文本
|
||||
paragraphs_text = [p["text"] for p in paragraphs]
|
||||
full_text_parts = ["【文档正文】"] + paragraphs_text
|
||||
|
||||
# 提取表格内容
|
||||
tables_data = []
|
||||
for i, table in enumerate(doc.tables):
|
||||
table_rows = []
|
||||
for row in table.rows:
|
||||
row_data = [cell.text.strip() for cell in row.cells]
|
||||
table_rows.append(row_data)
|
||||
if tables:
|
||||
full_text_parts.append("\n【文档表格】")
|
||||
for idx, table in enumerate(tables):
|
||||
full_text_parts.append(f"--- 表格 {idx + 1} ---")
|
||||
for row in table["rows"]:
|
||||
full_text_parts.append(" | ".join(str(cell) for cell in row))
|
||||
|
||||
if table_rows:
|
||||
tables_data.append({
|
||||
"table_index": i,
|
||||
"rows": table_rows,
|
||||
"row_count": len(table_rows),
|
||||
"column_count": len(table_rows[0]) if table_rows else 0
|
||||
})
|
||||
full_text = "\n".join(full_text_parts)
|
||||
|
||||
# 提取图片/嵌入式对象信息
|
||||
images_info = self._extract_images_info(doc, path)
|
||||
|
||||
# 合并所有文本(包括图片描述)
|
||||
full_text_parts = []
|
||||
full_text_parts.append("【文档正文】")
|
||||
full_text_parts.extend(paragraphs_text)
|
||||
|
||||
if tables_data:
|
||||
full_text_parts.append("\n【文档表格】")
|
||||
for idx, table in enumerate(tables_data):
|
||||
full_text_parts.append(f"--- 表格 {idx + 1} ---")
|
||||
for row in table["rows"]:
|
||||
full_text_parts.append(" | ".join(str(cell) for cell in row))
|
||||
|
||||
if images_info.get("image_count", 0) > 0:
|
||||
full_text_parts.append(f"\n【文档图片】文档包含 {images_info['image_count']} 张图片/图表")
|
||||
|
||||
full_text = "\n".join(full_text_parts)
|
||||
|
||||
# 构建元数据
|
||||
metadata = {
|
||||
"filename": path.name,
|
||||
"extension": path.suffix.lower(),
|
||||
"file_size": path.stat().st_size,
|
||||
"paragraph_count": len(paragraphs),
|
||||
"table_count": len(tables_data),
|
||||
"word_count": len(full_text),
|
||||
"char_count": len(full_text.replace("\n", "")),
|
||||
"has_tables": len(tables_data) > 0,
|
||||
"has_images": images_info.get("image_count", 0) > 0,
|
||||
"image_count": images_info.get("image_count", 0)
|
||||
}
|
||||
|
||||
# 返回结果
|
||||
return ParseResult(
|
||||
success=True,
|
||||
data={
|
||||
"content": full_text,
|
||||
"paragraphs": paragraphs_text,
|
||||
"paragraphs_with_style": paragraphs,
|
||||
"tables": tables_data,
|
||||
"images": images_info,
|
||||
"word_count": len(full_text),
|
||||
"structured_data": {
|
||||
return ParseResult(
|
||||
success=True,
|
||||
data={
|
||||
"content": full_text,
|
||||
"paragraphs": paragraphs,
|
||||
"paragraphs_text": paragraphs_text,
|
||||
"tables": tables_data,
|
||||
"images": images_info
|
||||
"paragraphs_with_style": paragraphs,
|
||||
"tables": tables,
|
||||
"images": {"image_count": 0, "descriptions": []}
|
||||
},
|
||||
metadata={
|
||||
"filename": path.name,
|
||||
"extension": path.suffix.lower(),
|
||||
"paragraph_count": len(paragraphs),
|
||||
"table_count": len(tables),
|
||||
"image_count": 0,
|
||||
"parse_method": "fallback_xml"
|
||||
}
|
||||
},
|
||||
metadata=metadata
|
||||
)
|
||||
)
|
||||
|
||||
except zipfile.BadZipFile:
|
||||
return ParseResult(success=False, error="无效的 ZIP/文档文件")
|
||||
except Exception as e:
|
||||
logger.error(f"解析 Word 文档失败: {str(e)}")
|
||||
return ParseResult(
|
||||
success=False,
|
||||
error=f"解析 Word 文档失败: {str(e)}"
|
||||
)
|
||||
return ParseResult(success=False, error=f"备用解析失败: {str(e)}")
|
||||
|
||||
def extract_images_as_base64(self, file_path: str) -> List[Dict[str, str]]:
|
||||
"""
|
||||
@@ -197,6 +292,83 @@ class DocxParser(BaseParser):
|
||||
logger.info(f"共提取 {len(images)} 张图片")
|
||||
return images
|
||||
|
||||
def extract_text_from_images(self, file_path: str, lang: str = 'chi_sim+eng') -> Dict[str, Any]:
    """
    Run OCR over the images embedded in a Word document.

    The .docx file is opened as a ZIP archive; every entry under
    ``word/media/`` whose extension is png/jpg/jpeg/gif/bmp is decoded with
    Pillow and passed to ``pytesseract.image_to_string``.

    Args:
        file_path: Path to the Word file.
        lang: Tesseract language code(s). Defaults to Simplified Chinese
            plus English (``chi_sim+eng``).

    Returns:
        A dict with these keys:
        - ``success``: False when a dependency is missing or the archive
          cannot be processed, True otherwise.
        - ``error``: human-readable reason, present only on failure.
        - ``image_count``: number of images that produced non-empty text
          (images with no recognised text are not counted).
        - ``extracted_text``: one dict per such image with ``image_index``
          (position in the ``word/media/`` listing), ``filename``, ``text``
          and ``char_count``.
        - ``total_chars``: sum of ``char_count`` (absent when a dependency
          is missing).
    """
    import zipfile
    from io import BytesIO

    try:
        import pytesseract
    except ImportError:
        logger.warning("pytesseract 未安装,OCR 功能不可用")
        return {
            "success": False,
            "error": "pytesseract 未安装,请运行: pip install pytesseract",
            "image_count": 0,
            "extracted_text": []
        }

    # Pillow is an optional dependency too: report it the same way as a
    # missing pytesseract instead of letting ImportError escape to the caller.
    try:
        from PIL import Image
    except ImportError:
        logger.warning("Pillow 未安装,OCR 功能不可用")
        return {
            "success": False,
            "error": "Pillow 未安装,请运行: pip install Pillow",
            "image_count": 0,
            "extracted_text": []
        }

    supported_exts = ('png', 'jpg', 'jpeg', 'gif', 'bmp')

    results = {
        "success": True,
        "image_count": 0,
        "extracted_text": [],
        "total_chars": 0
    }

    try:
        with zipfile.ZipFile(file_path, 'r') as zf:
            # Embedded pictures live under word/media inside the archive.
            media_files = [f for f in zf.namelist() if f.startswith('word/media/')]

            for idx, filename in enumerate(media_files):
                ext = filename.split('.')[-1].lower()
                if ext not in supported_exts:
                    continue

                # Best effort per image: one unreadable picture must not
                # abort OCR for the rest of the document.
                try:
                    image_data = zf.read(filename)
                    # Context manager releases the decoder/buffer promptly.
                    with Image.open(BytesIO(image_data)) as image:
                        text = pytesseract.image_to_string(image, lang=lang)
                    text = text.strip()

                    if text:
                        results["extracted_text"].append({
                            "image_index": idx,
                            "filename": filename,
                            "text": text,
                            "char_count": len(text)
                        })
                        results["total_chars"] += len(text)

                    logger.info(f"图片 {filename} OCR 识别完成,提取 {len(text)} 字符")

                except Exception as e:
                    logger.warning(f"图片 {filename} OCR 识别失败: {str(e)}")

            results["image_count"] = len(results["extracted_text"])

    except zipfile.BadZipFile:
        results["success"] = False
        results["error"] = "无效的 Word 文档文件"
    except Exception as e:
        results["success"] = False
        results["error"] = f"OCR 处理失败: {str(e)}"

    return results
|
||||
|
||||
def extract_key_sentences(self, text: str, max_sentences: int = 10) -> List[str]:
|
||||
"""
|
||||
从文本中提取关键句子
|
||||
|
||||
@@ -5,9 +5,10 @@
|
||||
"""
|
||||
import logging
|
||||
import json
|
||||
import re
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from app.services.template_fill_service import template_fill_service
|
||||
from app.services.template_fill_service import template_fill_service, TemplateField
|
||||
from app.services.rag_service import rag_service
|
||||
from app.services.markdown_ai_service import markdown_ai_service
|
||||
from app.core.database import mongodb
|
||||
@@ -15,6 +16,31 @@ from app.core.database import mongodb
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _extract_filenames_from_text(text: str) -> List[str]:
|
||||
"""
|
||||
从指令文本中提取文件名列表。
|
||||
|
||||
智能处理用'和'/'与'/'、分隔的多个文件名(尤其是带年号的统计公报)。
|
||||
"""
|
||||
# 先去掉"对比这两个文档"等引导语,只保留文件名部分
|
||||
text = re.sub(r'^(?:对比|比较)这两个?文档[的差异]?[::]?', '', text).strip()
|
||||
text = re.sub(r'两个文档.*$', '', text).strip()
|
||||
if not text:
|
||||
return []
|
||||
|
||||
# 直接查找所有带扩展名的文件名模式
|
||||
results = []
|
||||
for m in re.finditer(r'[^\s,。!?、和与]+(?=\.(?:docx|xlsx|md|txt))', text):
|
||||
start = m.start()
|
||||
ext_match = re.search(r'\.(?:docx|xlsx|md|txt)', text[m.end():])
|
||||
if ext_match:
|
||||
fn = text[start:m.end() + ext_match.end()]
|
||||
if fn:
|
||||
results.append(fn)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
class InstructionExecutor:
|
||||
"""指令执行器"""
|
||||
|
||||
@@ -41,9 +67,10 @@ class InstructionExecutor:
|
||||
self.intent_parser = intent_parser
|
||||
|
||||
context = context or {}
|
||||
context["instruction"] = instruction # 保存原始指令以便后续使用
|
||||
|
||||
# 解析意图
|
||||
intent, params = await self.intent_parser.parse(instruction)
|
||||
# 解析意图(传递对话历史上下文)
|
||||
intent, params = await self.intent_parser.parse(instruction, context)
|
||||
|
||||
# 根据意图类型执行相应操作
|
||||
if intent == "extract":
|
||||
@@ -72,18 +99,48 @@ class InstructionExecutor:
|
||||
async def _execute_extract(self, params: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""执行信息提取"""
|
||||
try:
|
||||
target_fields = params.get("field_refs", [])
|
||||
# target_fields 来自意图解析,field_refs 来自引号/字段关键词匹配
|
||||
target_fields = params.get("target_fields", []) or params.get("field_refs", [])
|
||||
doc_ids = params.get("document_refs", [])
|
||||
instruction_text = context.get("instruction", "")
|
||||
|
||||
# 如果没有指定文档,尝试按文件名精确搜索
|
||||
if not doc_ids or "all_docs" in doc_ids:
|
||||
if instruction_text:
|
||||
import re
|
||||
# 提取引号内的内容或文件名
|
||||
filename_match = re.search(r'["""]([^"""]+)["""]', instruction_text)
|
||||
if filename_match:
|
||||
search_term = filename_match.group(1)
|
||||
else:
|
||||
match = re.search(r'([^\s]+\.(?:docx|xlsx|md|txt))', instruction_text)
|
||||
search_term = match.group(1) if match else None
|
||||
|
||||
if search_term:
|
||||
logger.info(f"提取时搜索文档: {search_term}")
|
||||
searched_docs = await mongodb.search_documents(search_term, limit=5)
|
||||
if searched_docs:
|
||||
# 优先选择文件名完全匹配的文档
|
||||
best_docs = [
|
||||
d for d in searched_docs
|
||||
if search_term.lower() in d.get("metadata", {}).get("original_filename", "").lower()
|
||||
]
|
||||
if not best_docs:
|
||||
best_docs = [searched_docs[0]]
|
||||
context["source_docs"] = best_docs
|
||||
doc_ids = [doc.get("_id", "") for doc in best_docs]
|
||||
logger.info(f"找到 {len(best_docs)} 个文档用于提取,最佳: {best_docs[0].get('metadata', {}).get('original_filename', '?')}")
|
||||
|
||||
if not target_fields:
|
||||
return {
|
||||
"success": False,
|
||||
"intent": "extract",
|
||||
"error": "未指定要提取的字段",
|
||||
"message": "请明确说明要提取哪些字段,如:'提取医院数量和床位数'"
|
||||
}
|
||||
|
||||
# 如果指定了文档,验证文档存在
|
||||
if doc_ids and "all_docs" not in doc_ids:
|
||||
# 如果指定了文档且还没有加载 source_docs,则验证并加载
|
||||
if doc_ids and "all_docs" not in doc_ids and not context.get("source_docs"):
|
||||
valid_docs = []
|
||||
for doc_ref in doc_ids:
|
||||
doc_id = doc_ref.replace("doc_", "")
|
||||
@@ -93,20 +150,22 @@ class InstructionExecutor:
|
||||
if not valid_docs:
|
||||
return {
|
||||
"success": False,
|
||||
"intent": "extract",
|
||||
"error": "指定的文档不存在",
|
||||
"message": "请检查文档编号是否正确"
|
||||
}
|
||||
context["source_docs"] = valid_docs
|
||||
|
||||
# 构建字段列表
|
||||
fields = []
|
||||
for i, field_name in enumerate(target_fields):
|
||||
fields.append({
|
||||
"name": field_name,
|
||||
"cell": f"A{i+1}",
|
||||
"field_type": "text",
|
||||
"required": False
|
||||
})
|
||||
# 构建字段列表(使用 TemplateField dataclass)
|
||||
fields = [
|
||||
TemplateField(
|
||||
name=field_name,
|
||||
cell=f"A{i+1}",
|
||||
field_type="text",
|
||||
required=False
|
||||
)
|
||||
for i, field_name in enumerate(target_fields)
|
||||
]
|
||||
|
||||
# 调用填表服务
|
||||
result = await template_fill_service.fill_template(
|
||||
@@ -143,7 +202,7 @@ class InstructionExecutor:
|
||||
}
|
||||
|
||||
# 获取源文档
|
||||
source_docs = context.get("source_docs", [])
|
||||
source_docs = context.get("source_docs", []) or []
|
||||
source_doc_ids = [doc.get("_id") for doc in source_docs if doc.get("_id")]
|
||||
|
||||
# 获取字段
|
||||
@@ -175,36 +234,103 @@ class InstructionExecutor:
|
||||
}
|
||||
|
||||
async def _execute_summarize(self, params: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""执行摘要总结"""
|
||||
"""执行摘要总结 - 使用 LLM 生成真实摘要"""
|
||||
try:
|
||||
docs = context.get("source_docs", [])
|
||||
import re
|
||||
docs = context.get("source_docs", []) or []
|
||||
instruction_text = context.get("instruction", "")
|
||||
|
||||
# 从指令中提取文件名/关键词,优先搜索精确文档
|
||||
search_term = None
|
||||
if instruction_text:
|
||||
filename_match = re.search(r'["""]([^"""]+)["""]', instruction_text)
|
||||
if filename_match:
|
||||
search_term = filename_match.group(1)
|
||||
else:
|
||||
file_match = re.search(r'([^\s,。!?,]+\.(?:docx|xlsx|md|txt))', instruction_text)
|
||||
if file_match:
|
||||
search_term = file_match.group(1)
|
||||
|
||||
# 如果没有文档或有更精确的搜索词,尝试重新搜索
|
||||
if not docs or search_term:
|
||||
if search_term:
|
||||
logger.info(f"按关键词搜索文档: {search_term}")
|
||||
searched_docs = await mongodb.search_documents(search_term, limit=5)
|
||||
if searched_docs:
|
||||
# 优先使用文件名最匹配的文档
|
||||
docs = sorted(
|
||||
searched_docs,
|
||||
key=lambda d: 1 if search_term.lower() in d.get("metadata", {}).get("original_filename", "").lower() else 0,
|
||||
reverse=True
|
||||
)
|
||||
logger.info(f"找到 {len(docs)} 个文档,最佳匹配: {docs[0].get('metadata', {}).get('original_filename', '?')}")
|
||||
|
||||
if not docs:
|
||||
return {
|
||||
"success": False,
|
||||
"error": "没有可用的文档",
|
||||
"message": "请先上传要总结的文档"
|
||||
"success": True,
|
||||
"intent": "summarize",
|
||||
"action_needed": "provide_document",
|
||||
"message": "我理解了,您想分析文档内容。",
|
||||
"suggestion": "请提供已上传文档的名称(可以是文件名或部分名称),或者上传您想要分析的文档。\n\n支持的格式:docx、xlsx、md、txt\n\n例如:'分析2021年民政事业发展统计公报' 或 '总结卫生健康数据'"
|
||||
}
|
||||
|
||||
summaries = []
|
||||
for doc in docs[:5]: # 最多处理5个文档
|
||||
content = doc.get("content", "")[:5000] # 限制内容长度
|
||||
if content:
|
||||
summaries.append({
|
||||
"filename": doc.get("metadata", {}).get("original_filename", "未知"),
|
||||
"content_preview": content[:500] + "..." if len(content) > 500 else content
|
||||
})
|
||||
# 对第一个(最佳匹配)文档生成 AI 摘要
|
||||
primary_doc = docs[0]
|
||||
content = primary_doc.get("content", "")
|
||||
filename = primary_doc.get("metadata", {}).get("original_filename", "未知文档")
|
||||
|
||||
if not content:
|
||||
return {
|
||||
"success": False,
|
||||
"intent": "summarize",
|
||||
"error": "文档内容为空",
|
||||
"message": f"文档 {filename} 没有可供分析的文本内容"
|
||||
}
|
||||
|
||||
# 使用 LLM 生成摘要
|
||||
content_for_summary = content[:12000] # 最多取前 12000 字
|
||||
user_request = instruction_text or "请总结这份文档"
|
||||
|
||||
prompt = f"""请对以下文档进行全面、有条理的摘要分析。
|
||||
|
||||
文档名称:{filename}
|
||||
用户要求:{user_request}
|
||||
|
||||
文档内容:
|
||||
{content_for_summary}
|
||||
|
||||
请按以下格式输出摘要:
|
||||
1. **文档概述**:简述文档主题和背景(2-3句)
|
||||
2. **主要内容**:列出文档的核心数据和关键信息(用要点列出)
|
||||
3. **重要数据**:提取文档中的重要数字、统计数据
|
||||
4. **主要结论**:归纳文档的主要结论或趋势
|
||||
|
||||
要求:条理清晰,数据准确,不要遗漏关键信息。"""
|
||||
|
||||
from app.services.llm_service import llm_service
|
||||
messages = [
|
||||
{"role": "system", "content": "你是一个专业的文档分析助手,擅长提取关键信息并生成结构化摘要。"},
|
||||
{"role": "user", "content": prompt}
|
||||
]
|
||||
|
||||
response = await llm_service.chat(messages=messages, temperature=0.3, max_tokens=2000)
|
||||
ai_summary = llm_service.extract_message_content(response)
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"intent": "summarize",
|
||||
"summaries": summaries,
|
||||
"message": f"找到 {len(summaries)} 个文档可供参考"
|
||||
"ai_summary": ai_summary,
|
||||
"filename": filename,
|
||||
"doc_id": primary_doc.get("_id", ""),
|
||||
"total_docs_found": len(docs),
|
||||
"message": f"已生成文档摘要"
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"摘要执行失败: {e}")
|
||||
return {
|
||||
"success": False,
|
||||
"intent": "summarize",
|
||||
"error": str(e),
|
||||
"message": f"摘要生成失败: {str(e)}"
|
||||
}
|
||||
@@ -213,17 +339,39 @@ class InstructionExecutor:
|
||||
"""执行问答"""
|
||||
try:
|
||||
question = params.get("question", "")
|
||||
instruction_text = context.get("instruction", "")
|
||||
|
||||
if not question:
|
||||
return {
|
||||
"success": False,
|
||||
"intent": "question",
|
||||
"error": "未提供问题",
|
||||
"message": "请输入要回答的问题"
|
||||
}
|
||||
|
||||
# 使用 RAG 检索相关文档
|
||||
docs = context.get("source_docs", [])
|
||||
rag_results = []
|
||||
docs = context.get("source_docs", []) or []
|
||||
|
||||
# 如果没有文档,尝试从指令中提取文件名搜索
|
||||
if not docs:
|
||||
filename_match = re.search(r'["""]([^"""]+\.(?:docx|xlsx|md|txt))["""]', instruction_text)
|
||||
if not filename_match:
|
||||
filename_match = re.search(r'([^\s,。!?]+\.(?:docx|xlsx|md|txt))', instruction_text)
|
||||
if filename_match:
|
||||
found = await mongodb.search_documents(filename_match.group(1), limit=5)
|
||||
if found:
|
||||
docs = found
|
||||
|
||||
if not docs:
|
||||
return {
|
||||
"success": True,
|
||||
"intent": "question",
|
||||
"question": question,
|
||||
"answer": None,
|
||||
"message": "请先上传文档,我才能回答您的问题"
|
||||
}
|
||||
|
||||
# 使用 RAG 检索相关文档
|
||||
rag_results = []
|
||||
for doc in docs:
|
||||
doc_id = doc.get("_id", "")
|
||||
if doc_id:
|
||||
@@ -241,12 +389,42 @@ class InstructionExecutor:
|
||||
doc.get("content", "")[:3000] for doc in docs[:3] if doc.get("content")
|
||||
])
|
||||
|
||||
if not context_text:
|
||||
return {
|
||||
"success": True,
|
||||
"intent": "question",
|
||||
"question": question,
|
||||
"answer": None,
|
||||
"message": "文档内容为空,无法回答问题"
|
||||
}
|
||||
|
||||
# 使用 LLM 生成答案
|
||||
filename = docs[0].get("metadata", {}).get("original_filename", "文档")
|
||||
prompt = f"""基于以下文档内容,回答用户的问题。
|
||||
|
||||
文档名称:{filename}
|
||||
用户问题:{question}
|
||||
|
||||
文档内容:
|
||||
{context_text[:8000]}
|
||||
|
||||
请根据文档内容准确回答问题。如果文档中没有相关信息,请明确说明。"""
|
||||
|
||||
from app.services.llm_service import llm_service
|
||||
messages = [
|
||||
{"role": "system", "content": "你是一个专业的文档问答助手,根据提供的内容准确回答用户问题。"},
|
||||
{"role": "user", "content": prompt}
|
||||
]
|
||||
response = await llm_service.chat(messages=messages, temperature=0.3, max_tokens=1500)
|
||||
answer = llm_service.extract_message_content(response)
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"intent": "question",
|
||||
"question": question,
|
||||
"context_preview": context_text[:500] + "..." if len(context_text) > 500 else context_text,
|
||||
"message": "已找到相关上下文,可进行问答"
|
||||
"answer": answer,
|
||||
"filename": filename,
|
||||
"message": "已生成回答"
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
@@ -299,12 +477,53 @@ class InstructionExecutor:
|
||||
async def _execute_compare(self, params: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""执行对比分析"""
|
||||
try:
|
||||
docs = context.get("source_docs", [])
|
||||
docs = context.get("source_docs", []) or []
|
||||
instruction_text = context.get("instruction", "")
|
||||
|
||||
# 优先从指令中提取具体的文件名
|
||||
filenames = _extract_filenames_from_text(instruction_text)
|
||||
|
||||
if filenames:
|
||||
# 只选择文件名匹配的那些文档
|
||||
matched_docs = []
|
||||
for doc in docs:
|
||||
fname = doc.get("metadata", {}).get("original_filename", "").lower()
|
||||
for fn in filenames:
|
||||
if fn.lower() in fname or fname in fn.lower():
|
||||
matched_docs.append(doc)
|
||||
break
|
||||
# 如果匹配到足够文档,用匹配的
|
||||
if len(matched_docs) >= 2:
|
||||
docs = matched_docs
|
||||
else:
|
||||
# 匹配不够,尝试按文件名搜索 MongoDB
|
||||
all_found = []
|
||||
for fn in filenames:
|
||||
found = await mongodb.search_documents(fn, limit=5)
|
||||
all_found.extend(found)
|
||||
seen = set()
|
||||
unique_docs = []
|
||||
for d in all_found:
|
||||
did = d.get("_id", "")
|
||||
if did and did not in seen:
|
||||
seen.add(did)
|
||||
unique_docs.append(d)
|
||||
if len(unique_docs) >= 2:
|
||||
docs = unique_docs
|
||||
elif len(unique_docs) == 1 and len(docs) >= 1:
|
||||
# 找到一个指定的 + 用一个通用的
|
||||
docs = unique_docs + docs[:1]
|
||||
elif docs and len(filenames) == 1:
|
||||
# 找到一个指定文件名但只有一个匹配,尝试补充
|
||||
docs = unique_docs + [d for d in docs if d not in unique_docs]
|
||||
docs = docs[:2]
|
||||
|
||||
if len(docs) < 2:
|
||||
return {
|
||||
"success": False,
|
||||
"intent": "compare",
|
||||
"error": "对比需要至少2个文档",
|
||||
"message": "请上传至少2个文档进行对比"
|
||||
"message": "请上传至少2个文档进行对比,或明确说出要对比的文档名称"
|
||||
}
|
||||
|
||||
# 提取文档基本信息
|
||||
@@ -329,6 +548,7 @@ class InstructionExecutor:
|
||||
logger.error(f"对比执行失败: {e}")
|
||||
return {
|
||||
"success": False,
|
||||
"intent": "compare",
|
||||
"error": str(e),
|
||||
"message": f"对比分析失败: {str(e)}"
|
||||
}
|
||||
@@ -336,10 +556,23 @@ class InstructionExecutor:
|
||||
async def _execute_edit(self, params: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""执行文档编辑操作"""
|
||||
try:
|
||||
docs = context.get("source_docs", [])
|
||||
docs = context.get("source_docs", []) or []
|
||||
instruction_text = context.get("instruction", "")
|
||||
|
||||
# 如果没有文档,尝试从指令中提取文件名搜索
|
||||
if not docs:
|
||||
filename_match = re.search(r'["""]([^"""]+\.(?:docx|xlsx|md|txt))["""]', instruction_text)
|
||||
if not filename_match:
|
||||
filename_match = re.search(r'([^\s,。!?]+\.(?:docx|xlsx|md|txt))', instruction_text)
|
||||
if filename_match:
|
||||
found = await mongodb.search_documents(filename_match.group(1), limit=3)
|
||||
if found:
|
||||
docs = found
|
||||
|
||||
if not docs:
|
||||
return {
|
||||
"success": False,
|
||||
"intent": "edit",
|
||||
"error": "没有可用的文档",
|
||||
"message": "请先上传要编辑的文档"
|
||||
}
|
||||
@@ -405,7 +638,7 @@ class InstructionExecutor:
|
||||
- Word -> Markdown
|
||||
"""
|
||||
try:
|
||||
docs = context.get("source_docs", [])
|
||||
docs = context.get("source_docs", []) or []
|
||||
if not docs:
|
||||
return {
|
||||
"success": False,
|
||||
|
||||
@@ -28,7 +28,7 @@ class IntentParser:
|
||||
INTENT_KEYWORDS = {
|
||||
INTENT_EXTRACT: ["提取", "抽取", "获取", "找出", "查找", "识别", "找到"],
|
||||
INTENT_FILL_TABLE: ["填表", "填写", "填充", "录入", "导入到表格", "填写到"],
|
||||
INTENT_SUMMARIZE: ["总结", "摘要", "概括", "概述", "归纳", "提炼"],
|
||||
INTENT_SUMMARIZE: ["总结", "摘要", "概括", "概述", "归纳", "提炼", "分析", "聊聊"],
|
||||
INTENT_QUESTION: ["问答", "回答", "解释", "什么是", "为什么", "如何", "怎样", "多少", "几个"],
|
||||
INTENT_SEARCH: ["搜索", "查找", "检索", "查询", "找"],
|
||||
INTENT_COMPARE: ["对比", "比较", "差异", "区别", "不同"],
|
||||
@@ -47,12 +47,13 @@ class IntentParser:
|
||||
def __init__(self):
|
||||
self.intent_history: List[Dict[str, Any]] = []
|
||||
|
||||
async def parse(self, text: str) -> Tuple[str, Dict[str, Any]]:
|
||||
async def parse(self, text: str, context: Dict[str, Any] = None) -> Tuple[str, Dict[str, Any]]:
|
||||
"""
|
||||
解析自然语言指令
|
||||
|
||||
Args:
|
||||
text: 用户输入的自然语言
|
||||
context: 执行上下文(包含对话历史等)
|
||||
|
||||
Returns:
|
||||
(意图类型, 参数字典)
|
||||
@@ -61,11 +62,17 @@ class IntentParser:
|
||||
if not text:
|
||||
return self.INTENT_UNKNOWN, {}
|
||||
|
||||
# 检查对话历史中的上下文
|
||||
conversation_history = []
|
||||
if context and context.get("conversation_history"):
|
||||
conversation_history = context.get("conversation_history", [])
|
||||
logger.info(f"解析时使用对话历史: {len(conversation_history)} 条消息")
|
||||
|
||||
# 记录历史
|
||||
self.intent_history.append({"text": text, "intent": None})
|
||||
|
||||
# 识别意图
|
||||
intent = self._recognize_intent(text)
|
||||
# 识别意图(考虑对话上下文)
|
||||
intent = self._recognize_intent_with_context(text, conversation_history)
|
||||
|
||||
# 提取参数
|
||||
params = self._extract_params(text, intent)
|
||||
@@ -78,6 +85,42 @@ class IntentParser:
|
||||
|
||||
return intent, params
|
||||
|
||||
def _recognize_intent_with_context(self, text: str, conversation_history: List[Dict[str, Any]]) -> str:
|
||||
"""
|
||||
基于对话历史识别意图
|
||||
|
||||
Args:
|
||||
text: 当前用户输入
|
||||
conversation_history: 对话历史
|
||||
|
||||
Returns:
|
||||
意图类型
|
||||
"""
|
||||
# 如果对话历史为空,使用基础意图识别
|
||||
if not conversation_history:
|
||||
return self._recognize_intent(text)
|
||||
|
||||
# 基于历史上下文进行意图识别
|
||||
# 分析最近的对话了解用户意图的延续性
|
||||
last_intent = None
|
||||
last_topic = None
|
||||
|
||||
for msg in conversation_history[-5:]: # 最多看最近5条消息
|
||||
if msg.get("role") == "assistant":
|
||||
last_intent = msg.get("intent")
|
||||
if msg.get("intent") and msg.get("intent") != "unknown":
|
||||
last_topic = msg.get("intent")
|
||||
|
||||
# 如果当前消息很短(如"继续"、"是的"),可能延续之前的意图
|
||||
short_confirmation = ["是", "是的", "好", "继续", "ok", "好", "接着", "然后", "还有吗"]
|
||||
if text.strip() in short_confirmation or len(text.strip()) <= 3:
|
||||
if last_topic:
|
||||
logger.info(f"简短确认,延续之前的意图: {last_topic}")
|
||||
return last_topic
|
||||
|
||||
# 否则使用标准意图识别
|
||||
return self._recognize_intent(text)
|
||||
|
||||
def _recognize_intent(self, text: str) -> str:
|
||||
"""识别意图类型"""
|
||||
intent_scores: Dict[str, float] = {}
|
||||
@@ -214,18 +257,27 @@ class IntentParser:
|
||||
return template_info if template_info else None
|
||||
|
||||
def _extract_target_fields(self, text: str) -> List[str]:
|
||||
"""提取目标字段"""
|
||||
"""提取目标字段 - 按分隔符切分再逐段清理"""
|
||||
fields = []
|
||||
|
||||
# 匹配 "提取XXX和YYY"、"抽取XXX、YYY"
|
||||
patterns = [
|
||||
r"提取([^(and|,|,)+]+?)(?:和|与|、|,|plus)",
|
||||
r"抽取([^(and|,|,)+]+?)(?:和|与|、|,|plus)",
|
||||
]
|
||||
# 去除提取/抽取前缀
|
||||
cleaned_text = re.sub(r"^(?:提取|抽取)", "", text).strip()
|
||||
|
||||
for pattern in patterns:
|
||||
matches = re.findall(pattern, text)
|
||||
fields.extend([m.strip() for m in matches if m.strip()])
|
||||
# 按'和'、'与'、'、'分割成多段
|
||||
segments = re.split(r"[和与、]", cleaned_text)
|
||||
|
||||
# 常见前缀(这些不是字段名,需要去除)
|
||||
prefixes = ["文档中的", "文档中", "文件中的", "文件中", "内容中的", "内容中"]
|
||||
|
||||
for seg in segments:
|
||||
seg = seg.strip()
|
||||
# 去除常见前缀
|
||||
for p in prefixes:
|
||||
if seg.startswith(p):
|
||||
seg = seg[len(p):]
|
||||
break
|
||||
if seg and 2 <= len(seg) <= 20:
|
||||
fields.append(seg)
|
||||
|
||||
return list(set(fields))
|
||||
|
||||
|
||||
@@ -526,9 +526,10 @@ class ExcelStorageService:
|
||||
# 创建表
|
||||
model_class = self._create_table_model(table_name, columns, column_types)
|
||||
|
||||
# 创建表结构
|
||||
# 创建表结构 (使用异步方式)
|
||||
async with self.mysql_db.get_session() as session:
|
||||
model_class.__table__.create(session.bind, checkfirst=True)
|
||||
async with session.bind.begin() as conn:
|
||||
await conn.run_sync(lambda: model_class.__table__.create(checkfirst=True))
|
||||
|
||||
# 插入数据
|
||||
records = []
|
||||
|
||||
@@ -165,9 +165,9 @@ class BM25:
|
||||
class RAGService:
|
||||
"""RAG 检索增强服务"""
|
||||
|
||||
# 默认分块参数
|
||||
DEFAULT_CHUNK_SIZE = 500 # 每个文本块的大小(字符数)
|
||||
DEFAULT_CHUNK_OVERLAP = 50 # 块之间的重叠(字符数)
|
||||
# 默认分块参数 - 增大块大小减少embedding次数
|
||||
DEFAULT_CHUNK_SIZE = 1000 # 每个文本块的大小(字符数),增大以提升速度
|
||||
DEFAULT_CHUNK_OVERLAP = 100 # 块之间的重叠(字符数)
|
||||
|
||||
def __init__(self):
|
||||
self.embedding_model = None
|
||||
@@ -389,6 +389,70 @@ class RAGService:
|
||||
self._add_documents(documents, chunk_ids)
|
||||
logger.info(f"已索引文档 {doc_id},共 {len(chunks)} 个块")
|
||||
|
||||
async def index_document_content_async(
    self,
    doc_id: str,
    content: str,
    metadata: Optional[Dict[str, Any]] = None,
    chunk_size: int = None,
    chunk_overlap: int = None
):
    """
    Asynchronously index a document's content into the vector store,
    splitting it into chunks first.

    The embedding/add step runs through ``asyncio.to_thread`` so it does not
    block the event loop.

    Args:
        doc_id: Identifier of the document; chunk ids are derived from it
            as ``{doc_id}_chunk_{i}``.
        content: Text to split and index.
        metadata: Optional base metadata copied onto every chunk.
        chunk_size: Chunk length; falls back to ``DEFAULT_CHUNK_SIZE``.
        chunk_overlap: Overlap between chunks; falls back to
            ``DEFAULT_CHUNK_OVERLAP``.

    Returns:
        None. Indexing is skipped when the service is disabled, no
        embedding model is available, or the content yields no chunks.
    """
    import asyncio

    if self._disabled:
        logger.info(f"[RAG DISABLED] 文档索引操作已跳过: {doc_id}")
        return

    # Lazily set up the vector store on first use.
    if not self._initialized:
        self._init_vector_store()

    if self.embedding_model is None:
        logger.debug(f"文档跳过索引 (无嵌入模型): {doc_id}")
        return

    size = self.DEFAULT_CHUNK_SIZE if chunk_size is None else chunk_size
    overlap = self.DEFAULT_CHUNK_OVERLAP if chunk_overlap is None else chunk_overlap

    chunks = self._split_into_chunks(content, size, overlap)
    if not chunks:
        logger.warning(f"文档内容为空,跳过索引: {doc_id}")
        return

    total = len(chunks)
    documents = []
    chunk_ids = []
    for position, piece in enumerate(chunks):
        # Each chunk carries its own copy of the base metadata plus its
        # position within the document.
        piece_meta = metadata.copy() if metadata else {}
        piece_meta.update({
            "chunk_index": position,
            "total_chunks": total,
            "doc_id": doc_id
        })
        documents.append(SimpleDocument(page_content=piece, metadata=piece_meta))
        chunk_ids.append(f"{doc_id}_chunk_{position}")

    # Embedding is CPU-bound; hand it to a worker thread.
    await asyncio.to_thread(self._add_documents, documents, chunk_ids)
    logger.info(f"已异步索引文档 {doc_id},共 {len(chunks)} 个块")
|
||||
|
||||
def _add_documents(self, documents: List[SimpleDocument], doc_ids: List[str]):
|
||||
"""批量添加文档到向量索引"""
|
||||
if not documents:
|
||||
|
||||
@@ -300,13 +300,15 @@ class TableRAGService:
|
||||
filename: str,
|
||||
sheet_name: Optional[str] = None,
|
||||
header_row: int = 0,
|
||||
sample_size: int = 10
|
||||
sample_size: int = 10,
|
||||
skip_rag_index: bool = False
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
为 Excel 表构建完整的 RAG 索引
|
||||
|
||||
流程:
|
||||
1. 读取 Excel 获取字段信息
|
||||
2. 如果 skip_rag_index=True,跳过 RAG 索引,直接存 MySQL
|
||||
2. AI 生成每个字段的语义描述
|
||||
3. 将字段描述存入向量数据库
|
||||
|
||||
@@ -367,6 +369,20 @@ class TableRAGService:
|
||||
results["field_count"] = len(df.columns)
|
||||
logger.info(f"表名: {table_name}, 字段数: {len(df.columns)}")
|
||||
|
||||
# 跳过 RAG 索引时直接存 MySQL
|
||||
if skip_rag_index:
|
||||
logger.info(f"跳过 RAG 索引,直接存储到 MySQL")
|
||||
store_result = await self.excel_storage.store_excel(
|
||||
file_path=file_path,
|
||||
filename=filename,
|
||||
sheet_name=sheet_name,
|
||||
header_row=header_row
|
||||
)
|
||||
results["mysql_table"] = store_result.get("table_name") if store_result.get("success") else None
|
||||
results["row_count"] = store_result.get("row_count", len(df))
|
||||
results["indexed_count"] = 0
|
||||
return results
|
||||
|
||||
# 3. 初始化 RAG (如果需要)
|
||||
if not self.rag._initialized:
|
||||
self.rag._init_vector_store()
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
"""
|
||||
import asyncio
|
||||
import logging
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
@@ -13,6 +14,7 @@ from app.services.llm_service import llm_service
|
||||
from app.core.document_parser import ParserFactory
|
||||
from app.services.markdown_ai_service import markdown_ai_service
|
||||
from app.services.rag_service import rag_service
|
||||
from app.services.excel_storage_service import excel_storage_service
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -105,12 +107,60 @@ class TemplateFillService:
|
||||
|
||||
# 3. 检查是否需要使用源文档重新生成表头
|
||||
# 条件:源文档已加载 AND 现有字段看起来是自动生成的(如"字段1"、"字段2")
|
||||
# 注意:Word 模板(docx)不自动重新生成表头,因为 Word 模板的表结构由用户定义,必须保留
|
||||
needs_regenerate_headers = (
|
||||
template_file_type != "docx" and
|
||||
len(source_docs) > 0 and
|
||||
len(template_fields) > 0 and
|
||||
all(self._is_auto_generated_field(f.name) for f in template_fields)
|
||||
)
|
||||
|
||||
# 4. Word 模板特殊处理:表头为空时,从源文档生成字段
|
||||
# 仅当有源文档、模板字段为空、模板文件类型为 docx 时触发
|
||||
if not needs_regenerate_headers and template_file_type == "docx" and len(source_docs) > 0 and len(template_fields) == 0:
|
||||
logger.info(f"Word 模板表头为空,从源文档生成字段... (source_docs={len(source_docs)})")
|
||||
source_contents = []
|
||||
for doc in source_docs:
|
||||
structured = doc.structured_data if doc.structured_data else {}
|
||||
titles = structured.get("titles", [])
|
||||
tables = structured.get("tables", [])
|
||||
tables_count = len(tables) if tables else 0
|
||||
tables_summary = ""
|
||||
if tables:
|
||||
tables_summary = "\n【文档中的表格】:\n"
|
||||
for idx, table in enumerate(tables[:5]):
|
||||
if isinstance(table, dict):
|
||||
headers = table.get("headers", [])
|
||||
rows = table.get("rows", [])
|
||||
if headers:
|
||||
tables_summary += f"表格{idx+1}表头: {', '.join(str(h) for h in headers)}\n"
|
||||
if rows:
|
||||
tables_summary += f"表格{idx+1}前3行: "
|
||||
for row_idx, row in enumerate(rows[:3]):
|
||||
if isinstance(row, list):
|
||||
tables_summary += " | ".join(str(c) for c in row) + "; "
|
||||
elif isinstance(row, dict):
|
||||
tables_summary += " | ".join(str(row.get(h, "")) for h in headers if headers) + "; "
|
||||
tables_summary += "\n"
|
||||
source_contents.append({
|
||||
"filename": doc.filename,
|
||||
"doc_type": doc.doc_type,
|
||||
"content": doc.content[:5000] if doc.content else "",
|
||||
"titles": titles[:10] if titles else [],
|
||||
"tables_count": tables_count,
|
||||
"tables_summary": tables_summary
|
||||
})
|
||||
if template_id:
|
||||
generated_fields = await self.get_template_fields_from_file(
|
||||
template_id,
|
||||
template_file_type,
|
||||
source_contents=source_contents,
|
||||
source_docs=source_docs
|
||||
)
|
||||
if generated_fields:
|
||||
template_fields = generated_fields
|
||||
logger.info(f"Word 模板字段生成成功: {[f.name for f in template_fields]}")
|
||||
|
||||
if needs_regenerate_headers:
|
||||
logger.info(f"检测到自动生成表头,尝试使用源文档重新生成... (当前字段: {[f.name for f in template_fields]})")
|
||||
|
||||
@@ -162,7 +212,8 @@ class TemplateFillService:
|
||||
new_fields = await self.get_template_fields_from_file(
|
||||
template_id,
|
||||
template_file_type,
|
||||
source_contents=source_contents
|
||||
source_contents=source_contents,
|
||||
source_docs=source_docs
|
||||
)
|
||||
if new_fields and len(new_fields) > 0:
|
||||
logger.info(f"成功重新生成表头: {[f.name for f in new_fields]}")
|
||||
@@ -224,14 +275,357 @@ class TemplateFillService:
|
||||
max_rows = max(len(v) for v in filled_data.values()) if filled_data else 1
|
||||
logger.info(f"填表完成: {len(filled_data)} 个字段, 最大行数: {max_rows}")
|
||||
|
||||
# 如果是 Word 模板,将数据填入模板文件
|
||||
filled_file_path = None
|
||||
if template_file_type == "docx" and template_id and filled_data:
|
||||
filled_file_path = await self._fill_docx(template_id, filled_data)
|
||||
if filled_file_path:
|
||||
logger.info(f"Word 模板已填写,输出文件: {filled_file_path}")
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"filled_data": filled_data,
|
||||
"fill_details": fill_details,
|
||||
"source_doc_count": len(source_docs),
|
||||
"max_rows": max_rows
|
||||
"max_rows": max_rows,
|
||||
"filled_file_path": filled_file_path
|
||||
}
|
||||
|
||||
async def _polish_word_filled_data(
    self,
    filled_data: Dict[str, Any]
) -> Dict[str, str]:
    """
    Summarise extracted field values and ask the LLM to polish them into prose.

    Fields whose values are all numeric (two or more rows) are reduced to
    total / average / max / min; single ratio, amount and count fields get a
    one-line summary; everything else becomes a de-duplicated listing. The
    combined summary is sent to ``self.llm``, which is asked to reply with a
    JSON object mapping field name to polished text.

    Args:
        filled_data: Mapping of field name to the list of raw extracted values.

    Returns:
        Mapping of field name to display text. If nothing can be summarised,
        the LLM reply contains no JSON object, or any step raises, every
        field falls back to its first five raw values joined with ", ".
    """
    # Local imports, matching the sibling methods that import `re` locally.
    import json
    import re

    def raw_fallback() -> Dict[str, str]:
        # Un-polished text: the first five raw values of each field.
        return {
            name: (', '.join(str(v) for v in vals[:5]) if isinstance(vals, list) else str(vals))
            for name, vals in filled_data.items()
        }

    if not filled_data:
        return {}

    count_keywords = ('数量', '总数', '次数', '条数', '订单数', '记录数', '条目')
    amount_keywords = ('金额', '总额', '合计', '总计', '销售额', '收入', '支出', '成本')
    ratio_keywords = ('比率', '比例', '占比', '率', '使用率', '增长', '增幅')
    name_keywords = ('名称', '机构', '医院', '公司', '单位', '部门', '区域', '类别')

    try:
        # Step 1: build a per-field statistical summary.
        data_summary = []
        for field_name, values in filled_data.items():
            if not isinstance(values, list) or not values:
                continue

            # `is not None` instead of truthiness so a legitimate 0 is kept.
            raw_values = [
                str(v).strip()
                for v in values
                if v is not None and str(v).strip() and not str(v).startswith('[提取失败')
            ]
            if not raw_values:
                continue

            # Strip units and separators ("123个", "78.5%", "1,234") before parsing.
            numeric_values = []
            for v in raw_values:
                num_str = re.sub(r'[^\d.\-]', '', v)
                if not num_str or num_str in ('-', '.'):
                    continue
                try:
                    numeric_values.append(float(num_str))
                except ValueError:
                    # e.g. dates or version strings; treated as non-numeric.
                    pass

            # Classify the field by keywords in its name.
            field_lower = field_name.lower()
            is_count_field = any(kw in field_lower for kw in count_keywords)
            is_amount_field = any(kw in field_lower for kw in amount_keywords)
            is_ratio_field = any(kw in field_lower for kw in ratio_keywords)
            is_name_field = any(kw in field_lower for kw in name_keywords)

            if len(numeric_values) >= 2 and len(numeric_values) == len(raw_values):
                # Every row is numeric: aggregate.
                total = sum(numeric_values)
                avg = total / len(numeric_values)
                max_val = max(numeric_values)
                min_val = min(numeric_values)

                stats_lines = [
                    f"【{field_name}】(共 {len(raw_values)} 条数据):",
                    f" - 合计: {self._format_number(total)}" if is_amount_field else f" - 合计: {total:.2f}",
                    f" - 平均: {avg:.2f}",
                    f" - 最大: {max_val:.2f}",
                    f" - 最小: {min_val:.2f}",
                ]

                if is_name_field:
                    # dict.fromkeys de-duplicates while keeping first-seen
                    # order, so the prompt is identical from run to run.
                    unique_values = list(dict.fromkeys(raw_values))
                    if len(unique_values) <= 10:
                        stats_lines.append(f" - 涉及类别(共 {len(unique_values)} 种): {'、'.join(unique_values[:8])}")
                    else:
                        stats_lines.append(f" - 涉及 {len(unique_values)} 个不同类别")

                stats_lines.append(f" - 示例值: {'、'.join(raw_values[:5])}")
                data_summary.append('\n'.join(stats_lines))

            elif is_ratio_field and len(numeric_values) == 1:
                pct = numeric_values[0]
                data_summary.append(f"【{field_name}】: {pct:.1f}%,表示相关指标的相对水平")

            elif is_amount_field and len(numeric_values) >= 1:
                total = sum(numeric_values)
                unit = ""
                if total >= 10000:
                    unit = f"(约 {total/10000:.2f} 万元)"
                elif total >= 1:
                    unit = f"(约 {total:.2f} 元)"
                data_summary.append(f"【{field_name}】: 合计 {self._format_number(total)}{unit},基于 {len(raw_values)} 条记录汇总")

            elif is_count_field and len(numeric_values) >= 1:
                total = sum(numeric_values)
                data_summary.append(f"【{field_name}】: 共 {self._format_number(total)},基于 {len(raw_values)} 条记录汇总")

            else:
                unique_values = list(dict.fromkeys(raw_values))
                if len(unique_values) <= 8:
                    data_summary.append(f"【{field_name}】(共 {len(raw_values)} 条,去重后 {len(unique_values)} 项): {'、'.join(unique_values[:8])}")
                else:
                    # More than 8 distinct values implies more than 8 rows,
                    # so no third case exists here.
                    data_summary.append(f"【{field_name}】(共 {len(raw_values)} 条记录): {'、'.join(raw_values[:5])} 等")

        if not data_summary:
            return raw_fallback()

        # Step 2: have the LLM turn the summary into report-style sentences.
        prompt = f"""你是一个专业的数据分析报告助手。请根据以下从文档中提取并统计的数据,生成专业、简洁的自然语言描述。

【数据统计结果】:
{chr(10).join(data_summary)}

【润色要求】:
1. 每个字段生成一段专业的描述性文本(20-60字)
2. 数值类字段要明确标注单位和含义,如"销售总额达1,234.5万元,共涵盖56个订单"
3. 分类/名称类字段要归纳总结类别,如"涉及医疗器械、药品采购、设备维修等5个业务类别"
4. 多值数据不要简单罗列,要做总结,如"覆盖华东地区(上海、江苏、浙江)、华南地区(广东)等6个省市的销售网络"
5. 百分比/比率类要加背景说明,如"综合毛利率为23.5%,处于行业正常水平"
6. 保持文本通顺、专业,符合正式报告风格
7. 每段控制在60字以内

【输出格式】(严格按JSON格式,只返回JSON,不要任何其他内容):
{{
"字段名1": "润色后的描述文本1",
"字段名2": "润色后的描述文本2"
}}
"""
        messages = [
            {"role": "system", "content": "你是一个专业的数据分析报告助手。请严格按JSON格式输出,只返回纯JSON,不要任何其他内容。"},
            {"role": "user", "content": prompt}
        ]

        response = await self.llm.chat(
            messages=messages,
            temperature=0.3,
            max_tokens=3000
        )
        content = self.llm.extract_message_content(response)
        logger.info(f"LLM 润色 Word 数据返回: {content[:500]}")

        # The model may wrap the JSON in prose or code fences; take the
        # outermost {...} span.
        json_match = re.search(r'\{[\s\S]*\}', content)
        if not json_match:
            logger.warning(f"LLM 返回无法解析为 JSON: {content[:200]}")
            return raw_fallback()

        parsed = json.loads(json_match.group())
        # Callers slice and write these values as text, so coerce anything
        # the model returned as a number, list or null.
        polished = {
            str(name): (
                text if isinstance(text, str)
                else "" if text is None
                else json.dumps(text, ensure_ascii=False)
            )
            for name, text in parsed.items()
        }
        logger.info(f"LLM 润色成功: {len(polished)} 个字段")
        return polished

    except Exception as e:
        # Best effort: polishing must never block the fill itself.
        logger.error(f"LLM 润色失败: {str(e)}")
        return raw_fallback()
|
||||
|
||||
def _format_number(self, num: float) -> str:
|
||||
"""格式化数字,添加千分位"""
|
||||
if abs(num) >= 10000:
|
||||
return f"{num:,.2f}"
|
||||
elif abs(num) >= 1:
|
||||
return f"{num:,.2f}"
|
||||
else:
|
||||
return f"{num:.4f}"
|
||||
|
||||
async def _fill_docx(
    self,
    template_path: str,
    filled_data: Dict[str, Any]
) -> Optional[str]:
    """
    Write extracted data into a copy of a Word template.

    The data is first polished by ``_polish_word_filled_data``. Each table
    row whose first cell equals a field name (or is a prefix of one, or vice
    versa) gets the field's text written into its second cell. Fields that
    were not written into any table are appended to the end of the document
    as "name paragraph + value paragraph + grey separator line".

    Args:
        template_path: Path of the Word template file.
        filled_data: Mapping of field name to the list of extracted values.

    Returns:
        Path of the filled copy inside a fresh temporary directory, or
        ``None`` if anything fails (the temporary directory is removed then).
    """
    import os
    import re
    import shutil
    import tempfile
    from docx import Document
    from docx.oxml import OxmlElement
    from docx.oxml.ns import qn
    from docx.shared import Pt, RGBColor

    def as_text(value) -> str:
        # Polished values are expected to be strings; tolerate anything else.
        if value is None:
            return ""
        return value if isinstance(value, str) else str(value)

    def clean_text(text: str) -> str:
        """Strip characters that Word cannot store or renders as boxes."""
        if not text:
            return ""
        # Control characters.
        text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)
        # Replacement character and box/circle glyphs.
        text = re.sub(r'[\ufffd\u25a1\u25a9\u2610\u2611\u25cb\u25c9]', '', text)
        # BOM, zero-width and directional/line-separator marks.
        text = re.sub(r'[\ufeff\u200b-\u200f\u2028-\u202e]', '', text)
        return text.strip()

    def set_cell_text(cell, text: str):
        """Replace the cell's text and force the resulting runs to black."""
        cell.text = text
        for para in cell.paragraphs:
            for run in para.runs:
                run.font.color.rgb = RGBColor(0, 0, 0)

    def add_separator_after(para):
        """Insert an empty paragraph with a grey bottom border right after ``para``."""
        sep_para = OxmlElement('w:p')
        pPr = OxmlElement('w:pPr')
        pBdr = OxmlElement('w:pBdr')
        bottom = OxmlElement('w:bottom')
        bottom.set(qn('w:val'), 'single')
        bottom.set(qn('w:sz'), '6')
        bottom.set(qn('w:space'), '1')
        bottom.set(qn('w:color'), 'CCCCCC')
        pBdr.append(bottom)
        pPr.append(pBdr)
        sep_para.append(pPr)
        # addnext (not addprevious): the line belongs below the value, not
        # between the field name and its value.
        para._element.addnext(sep_para)

    def add_field_section(doc, field_name: str, display_text: str):
        """Append one field as bold name paragraph + value paragraph + separator."""
        name_para = doc.add_paragraph()
        name_run = name_para.add_run(f"📌 {field_name}")
        name_run.bold = True
        name_run.font.size = Pt(11)
        name_run.font.color.rgb = RGBColor(0, 51, 102)
        name_para.paragraph_format.space_before = Pt(12)
        name_para.paragraph_format.space_after = Pt(3)

        value_para = doc.add_paragraph()
        value_run = value_para.add_run(display_text)
        value_run.font.size = Pt(10.5)
        value_run.font.color.rgb = RGBColor(51, 51, 51)
        value_para.paragraph_format.space_before = Pt(0)
        value_para.paragraph_format.space_after = Pt(6)

        add_separator_after(value_para)

    temp_dir = None
    try:
        # Polish first so the document receives prose, not raw value lists.
        logger.info(f"Word 填写前开始 LLM 润色 {len(filled_data)} 个字段...")
        polished_data = await self._polish_word_filled_data(filled_data)
        logger.info("LLM 润色完成,使用润色后文本写入 Word")

        # Work on a copy so the uploaded template is never modified.
        temp_dir = tempfile.mkdtemp()
        output_path = os.path.join(temp_dir, "filled_template.docx")
        shutil.copy2(template_path, output_path)
        doc = Document(output_path)

        matched_fields = set()

        # Pass 1: rows laid out as "field name | value".
        for table in doc.tables:
            for row in table.rows:
                cells = row.cells
                if len(cells) < 2:
                    # No value cell to write into.
                    continue

                first_cell_text = cells[0].text.strip()
                if not first_cell_text:
                    continue

                # An exact match wins; otherwise take the first field that
                # shares a prefix relation with the row label.
                if first_cell_text in polished_data:
                    field_name = first_cell_text
                else:
                    field_name = next(
                        (
                            name for name in polished_data
                            if name and (
                                name.startswith(first_cell_text)
                                or first_cell_text.startswith(name)
                            )
                        ),
                        None,
                    )
                if field_name is None:
                    continue

                display_text = as_text(polished_data[field_name])
                if not display_text:
                    continue

                set_cell_text(cells[1], clean_text(display_text))
                # Mark as matched only after a real write, so unwritten
                # fields still reach the paragraph pass below.
                matched_fields.add(field_name)
                if field_name == first_cell_text:
                    logger.info(f"Word 填写(精确): {first_cell_text} = {display_text[:50]}")
                else:
                    logger.info(f"Word 填写(模糊): {first_cell_text} ≈ {field_name} = {display_text[:50]}")

        # Pass 2: everything not written into a table goes to the end of
        # the document as separated paragraphs.
        unmatched_fields = [f for f in polished_data if f not in matched_fields]
        if unmatched_fields:
            logger.info(f"使用段落格式写入 {len(unmatched_fields)} 个字段(带分隔线)")
            for field_name in unmatched_fields:
                display_text = as_text(polished_data[field_name])
                if display_text:
                    add_field_section(doc, field_name, clean_text(display_text))
                    logger.info(f"Word 段落写入: {field_name} = {display_text[:60]}")

        doc.save(output_path)
        logger.info(f"Word 模板填写完成: {output_path}, 匹配字段: {len(matched_fields)}, 追加字段: {len(unmatched_fields)}")
        return output_path

    except Exception as e:
        logger.error(f"Word 模板填写失败: {str(e)}")
        # Do not leave a half-written temp directory behind on failure.
        if temp_dir:
            shutil.rmtree(temp_dir, ignore_errors=True)
        return None
|
||||
|
||||
async def _load_source_documents(
|
||||
self,
|
||||
source_doc_ids: Optional[List[str]] = None,
|
||||
@@ -257,10 +651,38 @@ class TemplateFillService:
|
||||
if doc:
|
||||
sd = doc.get("structured_data", {})
|
||||
sd_keys = list(sd.keys()) if sd else []
|
||||
logger.info(f"从MongoDB加载文档: {doc_id}, doc_type={doc.get('doc_type')}, structured_data keys={sd_keys}")
|
||||
doc_type = doc.get("doc_type", "")
|
||||
mysql_table_name = doc.get("metadata", {}).get("mysql_table_name")
|
||||
logger.info(f"从MongoDB加载文档: {doc_id}, doc_type={doc_type}, structured_data keys={sd_keys}, mysql_table={mysql_table_name}")
|
||||
|
||||
# 如果 structured_data 为空,但有 file_path,尝试重新解析文件
|
||||
doc_content = doc.get("content", "")
|
||||
|
||||
# 如果是 Excel 类型且有 MySQL 表名,直接从 MySQL 加载数据
|
||||
if doc_type in ["xlsx", "xls"] and mysql_table_name:
|
||||
try:
|
||||
logger.info(f" 从 MySQL 表 {mysql_table_name} 加载 Excel 数据")
|
||||
mysql_data = await excel_storage_service.query_table(mysql_table_name, limit=1000)
|
||||
if mysql_data:
|
||||
# 转换为 SourceDocument 格式
|
||||
if mysql_data and len(mysql_data) > 0:
|
||||
columns = list(mysql_data[0].keys()) if mysql_data else []
|
||||
rows = [[row.get(col) for col in columns] for row in mysql_data]
|
||||
sd = {
|
||||
"headers": columns,
|
||||
"rows": rows,
|
||||
"row_count": len(mysql_data),
|
||||
"column_count": len(columns),
|
||||
"source": "mysql"
|
||||
}
|
||||
logger.info(f" MySQL 数据加载成功: {len(mysql_data)} 行, {len(columns)} 列")
|
||||
else:
|
||||
logger.warning(f" MySQL 表 {mysql_table_name} 无数据")
|
||||
else:
|
||||
logger.warning(f" MySQL 表 {mysql_table_name} 查询无结果")
|
||||
except Exception as mysql_err:
|
||||
logger.error(f" MySQL 加载失败: {str(mysql_err)}")
|
||||
|
||||
# 如果 structured_data 仍然为空,尝试重新解析文件
|
||||
if not sd or (not sd.get("tables") and not sd.get("headers") and not sd.get("rows")):
|
||||
file_path = doc.get("metadata", {}).get("file_path")
|
||||
if file_path:
|
||||
@@ -294,7 +716,7 @@ class TemplateFillService:
|
||||
source_docs.append(SourceDocument(
|
||||
doc_id=doc_id,
|
||||
filename=doc.get("metadata", {}).get("original_filename", "unknown"),
|
||||
doc_type=doc.get("doc_type", "unknown"),
|
||||
doc_type=doc_type,
|
||||
content=doc_content,
|
||||
structured_data=sd
|
||||
))
|
||||
@@ -1047,7 +1469,8 @@ class TemplateFillService:
|
||||
self,
|
||||
file_path: str,
|
||||
file_type: str = "xlsx",
|
||||
source_contents: List[dict] = None
|
||||
source_contents: List[dict] = None,
|
||||
source_docs: List["SourceDocument"] = None
|
||||
) -> List[TemplateField]:
|
||||
"""
|
||||
从模板文件提取字段定义
|
||||
@@ -1071,15 +1494,18 @@ class TemplateFillService:
|
||||
fields = await self._get_template_fields_from_docx(file_path)
|
||||
|
||||
# 检查是否需要 AI 生成表头
|
||||
# 条件:没有字段 OR 所有字段都是自动命名的(如"字段1"、"列1"、"Unnamed"开头)
|
||||
# 条件:没有字段 OR 所有字段都是自动命名的
|
||||
# 对于 docx:仅当有源文档时才允许 AI 生成(避免覆盖用户定义的表头)
|
||||
needs_ai_generation = (
|
||||
len(fields) == 0 or
|
||||
all(self._is_auto_generated_field(f.name) for f in fields)
|
||||
(len(fields) == 0 or
|
||||
all(self._is_auto_generated_field(f.name) for f in fields))
|
||||
) and (
|
||||
file_type != "docx" or len(source_contents) > 0
|
||||
)
|
||||
|
||||
if needs_ai_generation:
|
||||
logger.info(f"模板表头为空或自动生成,尝试 AI 生成表头... (fields={len(fields)}, source_docs={len(source_contents)})")
|
||||
ai_fields = await self._generate_fields_with_ai(file_path, file_type, source_contents)
|
||||
ai_fields = await self._generate_fields_with_ai(file_path, file_type, source_contents, source_docs)
|
||||
if ai_fields:
|
||||
fields = ai_fields
|
||||
logger.info(f"AI 生成表头成功: {len(fields)} 个字段")
|
||||
@@ -2134,7 +2560,8 @@ class TemplateFillService:
|
||||
self,
|
||||
file_path: str,
|
||||
file_type: str,
|
||||
source_contents: List[dict] = None
|
||||
source_contents: List[dict] = None,
|
||||
source_docs: List["SourceDocument"] = None
|
||||
) -> Optional[List[TemplateField]]:
|
||||
"""
|
||||
使用 AI 为空表生成表头字段
|
||||
@@ -2148,6 +2575,8 @@ class TemplateFillService:
|
||||
Returns:
|
||||
生成的字段列表,如果失败返回 None
|
||||
"""
|
||||
import random
|
||||
|
||||
try:
|
||||
import pandas as pd
|
||||
|
||||
@@ -2182,24 +2611,21 @@ class TemplateFillService:
|
||||
else:
|
||||
content_sample = ""
|
||||
|
||||
# 调用 AI 生成表头
|
||||
# 根据源文档内容生成表头
|
||||
source_info = ""
|
||||
logger.info(f"[DEBUG] _generate_fields_with_ai received source_contents: {len(source_contents) if source_contents else 0} items")
|
||||
# 优先从源文档的表格表头中随机选取
|
||||
if source_contents:
|
||||
for sc in source_contents:
|
||||
logger.info(f"[DEBUG] source doc: filename={sc.get('filename')}, content_len={len(sc.get('content', ''))}, titles={len(sc.get('titles', []))}, tables_count={sc.get('tables_count', 0)}, has_tables_summary={bool(sc.get('tables_summary'))}")
|
||||
source_info = "\n\n【源文档内容摘要】(根据以下文档内容生成表头):\n"
|
||||
import re
|
||||
all_headers = []
|
||||
source_info = ""
|
||||
|
||||
for idx, src in enumerate(source_contents[:5]): # 最多5个源文档
|
||||
filename = src.get("filename", f"文档{idx+1}")
|
||||
doc_type = src.get("doc_type", "unknown")
|
||||
content = src.get("content", "")[:3000] # 限制内容长度
|
||||
titles = src.get("titles", [])[:10] # 最多10个标题
|
||||
content = src.get("content", "")[:3000]
|
||||
titles = src.get("titles", [])[:10]
|
||||
tables_count = src.get("tables_count", 0)
|
||||
tables_summary = src.get("tables_summary", "")
|
||||
|
||||
source_info += f"\n--- 文档 {idx+1}: {filename} ({doc_type}) ---\n"
|
||||
# 处理 titles(可能是字符串列表或字典列表)
|
||||
if titles:
|
||||
title_texts = []
|
||||
for t in titles[:5]:
|
||||
@@ -2216,6 +2642,72 @@ class TemplateFillService:
|
||||
if content:
|
||||
source_info += f"【文档内容】(前3000字符):{content[:3000]}\n"
|
||||
|
||||
# 从 tables_summary 中提取表头
|
||||
# 表格摘要格式如: "表格1表头: 姓名, 年龄, 性别"
|
||||
if tables_summary:
|
||||
header_matches = re.findall(r'表头:\s*([^\n]+)', tables_summary)
|
||||
for match in header_matches:
|
||||
# 分割表头字符串
|
||||
headers = [h.strip() for h in match.split(',') if h.strip()]
|
||||
all_headers.extend(headers)
|
||||
logger.info(f"从表格摘要提取到表头: {headers}")
|
||||
|
||||
# 从源文档的 structured_data 中直接提取表头(Excel 等数据源)
|
||||
for doc in source_docs:
|
||||
if doc.structured_data:
|
||||
sd = doc.structured_data
|
||||
# Excel 格式: {columns: [...], rows: [...]}
|
||||
if sd.get("columns"):
|
||||
cols = sd.get("columns", [])
|
||||
if isinstance(cols, list) and cols:
|
||||
all_headers.extend([str(c) for c in cols if str(c).strip()])
|
||||
logger.info(f"从 structured_data.columns 提取到表头: {cols}")
|
||||
# 多 sheet 格式: {sheets: {sheet_name: {columns, rows}}}
|
||||
if sd.get("sheets"):
|
||||
for sheet_name, sheet_data in sd.get("sheets", {}).items():
|
||||
if isinstance(sheet_data, dict) and sheet_data.get("columns"):
|
||||
cols = sheet_data.get("columns", [])
|
||||
if isinstance(cols, list) and cols:
|
||||
all_headers.extend([str(c) for c in cols if str(c).strip()])
|
||||
logger.info(f"从 sheets.{sheet_name} 提取到表头: {cols}")
|
||||
# Markdown/表格格式: {tables: [{headers, rows}]}
|
||||
if sd.get("tables") and isinstance(sd.get("tables"), list):
|
||||
for table in sd.get("tables", []):
|
||||
if isinstance(table, dict) and table.get("headers"):
|
||||
headers = table.get("headers", [])
|
||||
if isinstance(headers, list) and headers:
|
||||
all_headers.extend([str(h) for h in headers if str(h).strip()])
|
||||
logger.info(f"从 tables 提取到表头: {headers}")
|
||||
# 另一种格式: {headers, rows}
|
||||
if sd.get("headers") and sd.get("rows"):
|
||||
headers = sd.get("headers", [])
|
||||
if isinstance(headers, list) and headers:
|
||||
all_headers.extend([str(h) for h in headers if str(h).strip()])
|
||||
logger.info(f"从 headers/rows 提取到表头: {headers}")
|
||||
|
||||
# 如果从表格摘要中获取到了表头,随机选取一部分
|
||||
if all_headers:
|
||||
logger.info(f"共有 {len(all_headers)} 个表头可用")
|
||||
# 随机选取 5-7 个表头
|
||||
num_fields = min(random.randint(5, 7), len(all_headers))
|
||||
selected_headers = random.sample(all_headers, num_fields)
|
||||
logger.info(f"随机选取的表头: {selected_headers}")
|
||||
|
||||
fields = []
|
||||
for idx, header in enumerate(selected_headers):
|
||||
fields.append(TemplateField(
|
||||
cell=self._column_to_cell(idx),
|
||||
name=header,
|
||||
field_type="text",
|
||||
required=False,
|
||||
hint=""
|
||||
))
|
||||
return fields
|
||||
else:
|
||||
source_info = ""
|
||||
|
||||
# 如果无法从表格表头获取,才调用 AI 生成
|
||||
|
||||
prompt = f"""你是一个专业的数据分析助手。请分析源文档中的所有数据,生成表格表头字段。
|
||||
|
||||
任务:分析源文档,找出所有具体的数据指标及其分类。
|
||||
|
||||
@@ -39,6 +39,8 @@ openpyxl==3.1.2
|
||||
python-docx==0.8.11
|
||||
markdown-it-py==3.0.0
|
||||
chardet==5.2.0
|
||||
Pillow>=10.0.0
|
||||
pytesseract>=0.3.10
|
||||
|
||||
# ==================== AI / LLM ====================
|
||||
httpx==0.25.2
|
||||
|
||||
@@ -781,7 +781,8 @@ export const backendApi = {
|
||||
async exportFilledTemplate(
|
||||
templateId: string,
|
||||
filledData: Record<string, any>,
|
||||
format: 'xlsx' | 'docx' = 'xlsx'
|
||||
format: 'xlsx' | 'docx' = 'xlsx',
|
||||
filledFilePath?: string
|
||||
): Promise<Blob> {
|
||||
const url = `${BACKEND_BASE_URL}/templates/export`;
|
||||
|
||||
@@ -793,6 +794,7 @@ export const backendApi = {
|
||||
template_id: templateId,
|
||||
filled_data: filledData,
|
||||
format,
|
||||
...(filledFilePath && { filled_file_path: filledFilePath }),
|
||||
}),
|
||||
});
|
||||
|
||||
@@ -964,6 +966,101 @@ export const backendApi = {
|
||||
throw error;
|
||||
}
|
||||
},
|
||||
|
||||
// ==================== 智能指令 API ====================
|
||||
|
||||
/**
 * Send an instruction to the multi-turn chat endpoint.
 *
 * POSTs `{ instruction, doc_ids, context }` to `/instruction/chat`.
 * Logs and rethrows on any network or HTTP failure.
 */
async instructionChat(
  instruction: string,
  docIds?: string[],
  context?: Record<string, any>
): Promise<{
  success: boolean;
  intent: string;
  result: Record<string, any>;
  message: string;
  hint?: string;
}> {
  const url = `${BACKEND_BASE_URL}/instruction/chat`;

  try {
    const response = await fetch(url, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ instruction, doc_ids: docIds, context }),
    });

    if (!response.ok) {
      // The error body is not guaranteed to be JSON (e.g. a gateway's HTML
      // error page); a parse failure must not hide the HTTP failure.
      const error = await response.json().catch(() => null);
      // Use `detail` only when it is a string; otherwise Error() would
      // stringify an object as "[object Object]".
      const detail = typeof error?.detail === 'string' ? error.detail : '';
      throw new Error(detail || '对话处理失败');
    }

    return await response.json();
  } catch (error) {
    console.error('对话处理失败:', error);
    throw error;
  }
},
|
||||
|
||||
/**
 * Fetch the list of instruction types the backend supports.
 *
 * GETs `/instruction/intents`; logs and rethrows on failure.
 */
async getSupportedIntents(): Promise<{
  intents: Array<{
    intent: string;
    name: string;
    examples: string[];
    params: string[];
  }>;
}> {
  try {
    const res = await fetch(`${BACKEND_BASE_URL}/instruction/intents`);
    if (res.ok) {
      return await res.json();
    }
    throw new Error('获取指令列表失败');
  } catch (err) {
    console.error('获取指令列表失败:', err);
    throw err;
  }
},
|
||||
|
||||
/**
 * Execute an instruction synchronously.
 *
 * POSTs `{ instruction, doc_ids, context }` to `/instruction/execute`.
 * Logs and rethrows on any network or HTTP failure.
 */
async executeInstruction(
  instruction: string,
  docIds?: string[],
  context?: Record<string, any>
): Promise<{
  success: boolean;
  intent: string;
  result: Record<string, any>;
  message: string;
}> {
  const url = `${BACKEND_BASE_URL}/instruction/execute`;

  try {
    const response = await fetch(url, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ instruction, doc_ids: docIds, context }),
    });

    if (!response.ok) {
      // The error body is not guaranteed to be JSON (e.g. a gateway's HTML
      // error page); a parse failure must not hide the HTTP failure.
      const error = await response.json().catch(() => null);
      // Use `detail` only when it is a string; otherwise Error() would
      // stringify an object as "[object Object]".
      const detail = typeof error?.detail === 'string' ? error.detail : '';
      throw new Error(detail || '指令执行失败');
    }

    return await response.json();
  } catch (error) {
    console.error('指令执行失败:', error);
    throw error;
  }
},
|
||||
|
||||
};
|
||||
|
||||
// ==================== AI 分析 API ====================
|
||||
@@ -1529,61 +1626,66 @@ export const aiApi = {
|
||||
}
|
||||
},
|
||||
|
||||
// ==================== 对话历史 API ====================
|
||||
|
||||
/**
|
||||
* 智能对话(支持多轮对话的指令执行)
|
||||
* 获取对话历史
|
||||
*/
|
||||
async instructionChat(
|
||||
instruction: string,
|
||||
docIds?: string[],
|
||||
context?: Record<string, any>
|
||||
): Promise<{
|
||||
async getConversationHistory(conversationId: string, limit: number = 20): Promise<{
|
||||
success: boolean;
|
||||
intent: string;
|
||||
result: Record<string, any>;
|
||||
message: string;
|
||||
hint?: string;
|
||||
}> {
|
||||
const url = `${BACKEND_BASE_URL}/instruction/chat`;
|
||||
|
||||
try {
|
||||
const response = await fetch(url, {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({ instruction, doc_ids: docIds, context }),
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
const error = await response.json();
|
||||
throw new Error(error.detail || '对话处理失败');
|
||||
}
|
||||
|
||||
return await response.json();
|
||||
} catch (error) {
|
||||
console.error('对话处理失败:', error);
|
||||
throw error;
|
||||
}
|
||||
},
|
||||
|
||||
/**
|
||||
* 获取支持的指令类型列表
|
||||
*/
|
||||
async getSupportedIntents(): Promise<{
|
||||
intents: Array<{
|
||||
intent: string;
|
||||
name: string;
|
||||
examples: string[];
|
||||
params: string[];
|
||||
messages: Array<{
|
||||
role: string;
|
||||
content: string;
|
||||
intent?: string;
|
||||
created_at: string;
|
||||
}>;
|
||||
}> {
|
||||
const url = `${BACKEND_BASE_URL}/instruction/intents`;
|
||||
const url = `${BACKEND_BASE_URL}/conversation/${conversationId}/history?limit=${limit}`;
|
||||
|
||||
try {
|
||||
const response = await fetch(url);
|
||||
if (!response.ok) throw new Error('获取指令列表失败');
|
||||
if (!response.ok) throw new Error('获取对话历史失败');
|
||||
return await response.json();
|
||||
} catch (error) {
|
||||
console.error('获取指令列表失败:', error);
|
||||
throw error;
|
||||
console.error('获取对话历史失败:', error);
|
||||
return { success: false, messages: [] };
|
||||
}
|
||||
},
|
||||
|
||||
/**
 * Delete the stored history of one conversation.
 *
 * Sends DELETE to `/conversation/{conversationId}`. Never throws: any
 * failure is logged and reported as `{ success: false }`.
 */
async deleteConversation(conversationId: string): Promise<{
  success: boolean;
}> {
  // Encode the id so characters such as "/", "?" or "#" cannot change
  // which route is addressed.
  const url = `${BACKEND_BASE_URL}/conversation/${encodeURIComponent(conversationId)}`;

  try {
    const response = await fetch(url, { method: 'DELETE' });
    if (!response.ok) throw new Error('删除对话历史失败');
    return await response.json();
  } catch (error) {
    console.error('删除对话历史失败:', error);
    return { success: false };
  }
},
|
||||
|
||||
/**
 * List stored conversations.
 *
 * GETs `/conversation/all?limit=N`. Never throws: any failure is logged
 * and reported as `{ success: false, conversations: [] }`.
 */
async listConversations(limit: number = 50): Promise<{
  success: boolean;
  conversations: Array<any>;
}> {
  try {
    const res = await fetch(`${BACKEND_BASE_URL}/conversation/all?limit=${limit}`);
    if (res.ok) {
      return await res.json();
    }
    throw new Error('获取会话列表失败');
  } catch (err) {
    console.error('获取会话列表失败:', err);
    return { success: false, conversations: [] };
  }
}
|
||||
};
|
||||
|
||||
@@ -15,12 +15,14 @@ import {
|
||||
Sparkles,
|
||||
Database,
|
||||
FileSpreadsheet,
|
||||
RefreshCcw
|
||||
RefreshCcw,
|
||||
Trash2
|
||||
} from 'lucide-react';
|
||||
import { backendApi } from '@/db/backend-api';
|
||||
import { formatDistanceToNow } from 'date-fns';
|
||||
import { zhCN } from 'date-fns/locale';
|
||||
import { cn } from '@/lib/utils';
|
||||
import { toast } from 'sonner';
|
||||
|
||||
type DocumentItem = {
|
||||
doc_id: string;
|
||||
@@ -108,7 +110,7 @@ const Dashboard: React.FC = () => {
|
||||
<div className="grid grid-cols-1 md:grid-cols-3 gap-6">
|
||||
{[
|
||||
{ label: '已上传文档', value: stats.docs, icon: FileText, color: 'bg-blue-500', trend: '非结构化文档', link: '/documents' },
|
||||
{ label: 'Excel 文件', value: stats.excelFiles, icon: FileSpreadsheet, color: 'bg-emerald-500', trend: '结构化数据', link: '/excel-parse' },
|
||||
{ label: 'Excel 文件', value: stats.excelFiles, icon: FileSpreadsheet, color: 'bg-emerald-500', trend: '结构化数据', link: '/documents' },
|
||||
{ label: '填表任务', value: stats.tasks, icon: TableProperties, color: 'bg-indigo-500', trend: '待实现', link: '/form-fill' }
|
||||
].map((stat, i) => (
|
||||
<Card key={i} className="border-none shadow-md overflow-hidden group hover:shadow-xl transition-all duration-300">
|
||||
@@ -164,8 +166,30 @@ const Dashboard: React.FC = () => {
|
||||
{doc.doc_type.toUpperCase()} • {formatDistanceToNow(new Date(doc.created_at), { addSuffix: true, locale: zhCN })}
|
||||
</p>
|
||||
</div>
|
||||
<div className="px-2 py-1 rounded-full text-[10px] font-bold uppercase tracking-wider bg-muted">
|
||||
{doc.doc_type}
|
||||
<div className="flex items-center gap-2">
|
||||
<div className="px-2 py-1 rounded-full text-[10px] font-bold uppercase tracking-wider bg-muted">
|
||||
{doc.doc_type}
|
||||
</div>
|
||||
<Button
|
||||
variant="ghost"
|
||||
size="icon"
|
||||
className="opacity-0 group-hover:opacity-100 text-destructive hover:bg-destructive/10 transition-opacity"
|
||||
onClick={async (e) => {
|
||||
e.stopPropagation();
|
||||
if (!confirm(`确定要删除 "${doc.original_filename}" 吗?`)) return;
|
||||
try {
|
||||
const result = await backendApi.deleteDocument(doc.doc_id);
|
||||
if (result.success) {
|
||||
setRecentDocs(prev => prev.filter(d => d.doc_id !== doc.doc_id));
|
||||
toast.success('文档已删除');
|
||||
}
|
||||
} catch (err: any) {
|
||||
toast.error(err.message || '删除失败');
|
||||
}
|
||||
}}
|
||||
>
|
||||
<Trash2 size={16} />
|
||||
</Button>
|
||||
</div>
|
||||
</div>
|
||||
))}
|
||||
@@ -197,7 +221,7 @@ const Dashboard: React.FC = () => {
|
||||
<div className="grid grid-cols-1 sm:grid-cols-2 gap-4">
|
||||
{[
|
||||
{ title: '上传文档', desc: '支持 docx/md/txt', icon: FileText, link: '/documents', color: 'bg-blue-500' },
|
||||
{ title: '解析 Excel', desc: '上传并分析数据', icon: FileSpreadsheet, link: '/excel-parse', color: 'bg-emerald-500' },
|
||||
{ title: '解析 Excel', desc: '上传并分析数据', icon: FileSpreadsheet, link: '/documents', color: 'bg-emerald-500' },
|
||||
{ title: '智能填表', desc: '自动填写表格模板', icon: TableProperties, link: '/form-fill', color: 'bg-indigo-500' },
|
||||
{ title: 'AI 助手', desc: '自然语言交互', icon: MessageSquareCode, link: '/assistant', color: 'bg-amber-500' }
|
||||
].map((item, i) => (
|
||||
|
||||
@@ -78,6 +78,19 @@ const Documents: React.FC = () => {
|
||||
const [expandedSheet, setExpandedSheet] = useState<string | null>(null);
|
||||
const [uploadExpanded, setUploadExpanded] = useState(false);
|
||||
|
||||
// 批量上传状态跟踪
|
||||
type FileUploadStatus = 'pending' | 'uploading' | 'processing' | 'success' | 'failed';
|
||||
interface UploadFileState {
|
||||
file: File;
|
||||
status: FileUploadStatus;
|
||||
progress: number;
|
||||
taskId?: string;
|
||||
error?: string;
|
||||
docId?: string;
|
||||
}
|
||||
const [uploadStates, setUploadStates] = useState<UploadFileState[]>([]);
|
||||
const [batchTaskId, setBatchTaskId] = useState<string | null>(null);
|
||||
|
||||
// AI 分析相关状态
|
||||
const [analyzing, setAnalyzing] = useState(false);
|
||||
const [analyzingForCharts, setAnalyzingForCharts] = useState(false);
|
||||
@@ -211,21 +224,119 @@ const Documents: React.FC = () => {
|
||||
}
|
||||
};
|
||||
|
||||
// 文件上传处理
|
||||
// 文件上传处理 - 批量上传
|
||||
// Dropzone handler: uploads the dropped files through the batch endpoint
// and polls the returned task until it succeeds, fails or times out.
// If the backend returns no task id, falls back to per-file upload.
const onDrop = async (acceptedFiles: File[]) => {
  if (acceptedFiles.length === 0) return;

  // One status row per dropped file, in drop order.
  const initialStates: UploadFileState[] = acceptedFiles.map(file => ({
    file,
    status: 'pending',
    progress: 0
  }));
  setUploadStates(initialStates);
  setUploadExpanded(true);
  setUploading(true);

  try {
    const result = await backendApi.uploadDocuments(acceptedFiles);

    if (result.task_id) {
      setBatchTaskId(result.task_id);
      setUploadStates(prev => prev.map(s => ({ ...s, status: 'uploading', progress: 30 })));

      const maxAttempts = 150; // 150 polls x 2 s sleep = about 5 minutes
      const pollIntervalMs = 2000;

      const checkBatchStatus = async () => {
        for (let attempts = 0; attempts < maxAttempts; attempts++) {
          try {
            const status = await backendApi.getTaskStatus(result.task_id);

            if (status.status === 'success' && status.result) {
              // Per-file results are matched to rows by index.
              const fileResults = status.result.results || [];
              setUploadStates(prev => prev.map((s, idx) => {
                const fileResult = fileResults[idx];
                if (fileResult?.success) {
                  return { ...s, status: 'success', progress: 100, docId: fileResult.doc_id };
                }
                return { ...s, status: 'failed', progress: 0, error: fileResult?.error || '处理失败' };
              }));
              loadDocuments();
              return;
            }

            if (status.status === 'failure') {
              setUploadStates(prev => prev.map(s => ({
                ...s,
                status: 'failed',
                error: status.error || '批量处理失败'
              })));
              return;
            }

            // Still running: use the reported progress, else an estimate
            // that creeps towards 90%.
            const progress = status.progress || Math.min(30 + attempts * 2, 90);
            setUploadStates(prev => prev.map(s => ({
              ...s,
              status: s.status === 'uploading' ? 'processing' : s.status,
              progress
            })));
          } catch (e) {
            // A failed poll is not fatal; try again on the next tick.
            console.error('检查批量状态失败', e);
          }
          await new Promise(resolve => setTimeout(resolve, pollIntervalMs));
        }

        // Timed out: anything not yet successful is marked failed.
        setUploadStates(prev => prev.map(s => {
          if (s.status !== 'success') {
            return { ...s, status: 'failed', error: '处理超时' };
          }
          return s;
        }));
      };

      // Await the poll so `uploading` stays true until the batch settles
      // and any rejection reaches the catch block below.
      await checkBatchStatus();
    } else {
      // No task id returned: fall back to uploading file by file.
      await handleSingleFileUploads(acceptedFiles);
    }
  } catch (error: any) {
    toast.error(error.message || '上传失败');
    setUploadStates(prev => prev.map(s => ({
      ...s,
      status: 'failed',
      error: error.message || '上传失败'
    })));
  } finally {
    setUploading(false);
  }
};
|
||||
|
||||
// 单文件上传后备逻辑
|
||||
const handleSingleFileUploads = async (files: File[]) => {
|
||||
let successCount = 0;
|
||||
let failCount = 0;
|
||||
const successfulFiles: File[] = [];
|
||||
|
||||
// 逐个上传文件
|
||||
for (const file of acceptedFiles) {
|
||||
for (let i = 0; i < files.length; i++) {
|
||||
const file = files[i];
|
||||
const ext = file.name.split('.').pop()?.toLowerCase();
|
||||
|
||||
setUploadStates(prev => prev.map((s, idx) =>
|
||||
idx === i ? { ...s, status: 'uploading' } : s
|
||||
));
|
||||
|
||||
try {
|
||||
if (ext === 'xlsx' || ext === 'xls') {
|
||||
setUploadStates(prev => prev.map((s, idx) =>
|
||||
idx === i ? { ...s, status: 'processing', progress: 50 } : s
|
||||
));
|
||||
const result = await backendApi.uploadExcel(file, {
|
||||
parseAllSheets: parseOptions.parseAllSheets,
|
||||
headerRow: parseOptions.headerRow
|
||||
@@ -233,99 +344,60 @@ const Documents: React.FC = () => {
|
||||
if (result.success) {
|
||||
successCount++;
|
||||
successfulFiles.push(file);
|
||||
// 第一个Excel文件设置解析结果供预览
|
||||
setUploadStates(prev => prev.map((s, idx) =>
|
||||
idx === i ? { ...s, status: 'success', progress: 100 } : s
|
||||
));
|
||||
if (successCount === 1) {
|
||||
setUploadedFile(file);
|
||||
setParseResult(result);
|
||||
if (result.metadata?.sheet_count === 1) {
|
||||
setExpandedSheet(Object.keys(result.data?.sheets || {})[0] || null);
|
||||
}
|
||||
}
|
||||
loadDocuments();
|
||||
} else {
|
||||
failCount++;
|
||||
toast.error(`${file.name}: ${result.error || '解析失败'}`);
|
||||
}
|
||||
} else if (ext === 'md' || ext === 'markdown') {
|
||||
const result = await backendApi.uploadDocument(file);
|
||||
if (result.task_id) {
|
||||
successCount++;
|
||||
successfulFiles.push(file);
|
||||
if (successCount === 1) {
|
||||
setUploadedFile(file);
|
||||
}
|
||||
// 轮询任务状态
|
||||
let attempts = 0;
|
||||
const checkStatus = async () => {
|
||||
while (attempts < 30) {
|
||||
try {
|
||||
const status = await backendApi.getTaskStatus(result.task_id);
|
||||
if (status.status === 'success') {
|
||||
loadDocuments();
|
||||
return;
|
||||
} else if (status.status === 'failure') {
|
||||
return;
|
||||
}
|
||||
} catch (e) {
|
||||
console.error('检查状态失败', e);
|
||||
}
|
||||
await new Promise(resolve => setTimeout(resolve, 2000));
|
||||
attempts++;
|
||||
}
|
||||
};
|
||||
checkStatus();
|
||||
} else {
|
||||
failCount++;
|
||||
setUploadStates(prev => prev.map((s, idx) =>
|
||||
idx === i ? { ...s, status: 'failed', error: result.error || '解析失败' } : s
|
||||
));
|
||||
}
|
||||
} else {
|
||||
// 其他文档使用通用上传接口
|
||||
setUploadStates(prev => prev.map((s, idx) =>
|
||||
idx === i ? { ...s, status: 'processing', progress: 50 } : s
|
||||
));
|
||||
const result = await backendApi.uploadDocument(file);
|
||||
if (result.task_id) {
|
||||
successCount++;
|
||||
successfulFiles.push(file);
|
||||
if (successCount === 1) {
|
||||
setUploadedFile(file);
|
||||
}
|
||||
// 轮询任务状态
|
||||
// 等待任务完成
|
||||
let attempts = 0;
|
||||
const checkStatus = async () => {
|
||||
while (attempts < 30) {
|
||||
try {
|
||||
const status = await backendApi.getTaskStatus(result.task_id);
|
||||
if (status.status === 'success') {
|
||||
loadDocuments();
|
||||
return;
|
||||
} else if (status.status === 'failure') {
|
||||
return;
|
||||
}
|
||||
} catch (e) {
|
||||
console.error('检查状态失败', e);
|
||||
while (attempts < 60) {
|
||||
const status = await backendApi.getTaskStatus(result.task_id);
|
||||
if (status.status === 'success') {
|
||||
successCount++;
|
||||
successfulFiles.push(file);
|
||||
setUploadStates(prev => prev.map((s, idx) =>
|
||||
idx === i ? { ...s, status: 'success', progress: 100, docId: status.result?.doc_id } : s
|
||||
));
|
||||
if (successCount === 1) {
|
||||
setUploadedFile(file);
|
||||
}
|
||||
await new Promise(resolve => setTimeout(resolve, 2000));
|
||||
attempts++;
|
||||
loadDocuments();
|
||||
break;
|
||||
} else if (status.status === 'failure') {
|
||||
setUploadStates(prev => prev.map((s, idx) =>
|
||||
idx === i ? { ...s, status: 'failed', error: status.error || '处理失败' } : s
|
||||
));
|
||||
break;
|
||||
}
|
||||
};
|
||||
checkStatus();
|
||||
} else {
|
||||
failCount++;
|
||||
await new Promise(resolve => setTimeout(resolve, 2000));
|
||||
attempts++;
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (error: any) {
|
||||
failCount++;
|
||||
toast.error(`${file.name}: ${error.message || '上传失败'}`);
|
||||
setUploadStates(prev => prev.map((s, idx) =>
|
||||
idx === i ? { ...s, status: 'failed', error: error.message || '上传失败' } : s
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
setUploading(false);
|
||||
loadDocuments();
|
||||
|
||||
if (successCount > 0) {
|
||||
toast.success(`成功上传 ${successCount} 个文件`);
|
||||
setUploadedFiles(prev => [...prev, ...successfulFiles]);
|
||||
setUploadExpanded(true);
|
||||
}
|
||||
if (failCount > 0) {
|
||||
toast.error(`${failCount} 个文件上传失败`);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -699,7 +771,110 @@ const Documents: React.FC = () => {
|
||||
</CardHeader>
|
||||
{uploadPanelOpen && (
|
||||
<CardContent className="space-y-4">
|
||||
{uploadedFiles.length > 0 || uploadedFile ? (
|
||||
{/* 优先显示正在上传的状态 */}
|
||||
{uploadStates.length > 0 && (
|
||||
<div className="space-y-3">
|
||||
{/* 上传状态头部 */}
|
||||
<div
|
||||
className="flex items-center justify-between p-3 bg-primary/5 rounded-xl cursor-pointer hover:bg-primary/10 transition-colors"
|
||||
onClick={() => setUploadExpanded(!uploadExpanded)}
|
||||
>
|
||||
<div className="flex items-center gap-3">
|
||||
<div className="w-10 h-10 rounded-lg bg-primary/10 text-primary flex items-center justify-center">
|
||||
{uploading ? <Loader2 size={20} className="animate-spin" /> : <Upload size={20} />}
|
||||
</div>
|
||||
<div>
|
||||
<p className="font-semibold text-sm">
|
||||
{uploading ? '正在上传' : '上传完成'} {uploadStates.length} 个文件
|
||||
</p>
|
||||
<p className="text-xs text-muted-foreground">
|
||||
{uploading ? '上传中,请稍候...' : uploadStates.filter(s => s.status === 'failed').length > 0 ? '部分失败' : '点击查看详情'}
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
<div className="flex items-center gap-2">
|
||||
{!uploading && (
|
||||
<Button
|
||||
variant="ghost"
|
||||
size="sm"
|
||||
onClick={(e) => {
|
||||
e.stopPropagation();
|
||||
setUploadStates([]);
|
||||
setUploadedFiles([]);
|
||||
setUploadedFile(null);
|
||||
}}
|
||||
className="text-destructive hover:text-destructive"
|
||||
>
|
||||
<Trash2 size={14} className="mr-1" />
|
||||
清空
|
||||
</Button>
|
||||
)}
|
||||
{uploadExpanded ? <ChevronUp size={16} /> : <ChevronDown size={16} />}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{/* 上传进度列表(总是展开显示) */}
|
||||
{uploadExpanded && (
|
||||
<div className="space-y-2 border rounded-xl p-3 bg-background">
|
||||
{uploadStates.map((state, index) => (
|
||||
<div key={index} className="flex items-center gap-3 p-2 rounded-lg hover:bg-muted/30 transition-colors">
|
||||
<div className={cn(
|
||||
"w-8 h-8 rounded flex items-center justify-center shrink-0",
|
||||
isExcelFile(state.file.name) ? "bg-emerald-500/10 text-emerald-500" : "bg-blue-500/10 text-blue-500"
|
||||
)}>
|
||||
{state.status === 'pending' && <Clock size={16} />}
|
||||
{state.status === 'uploading' && <Upload size={16} className="animate-pulse" />}
|
||||
{state.status === 'processing' && <Loader2 size={16} className="animate-spin" />}
|
||||
{state.status === 'success' && <CheckCircle size={16} className="text-green-500" />}
|
||||
{state.status === 'failed' && <AlertCircle size={16} className="text-red-500" />}
|
||||
</div>
|
||||
<div className="flex-1 min-w-0">
|
||||
<p className="text-sm truncate">{state.file.name}</p>
|
||||
<div className="flex items-center gap-2">
|
||||
{state.status === 'pending' && <p className="text-xs text-muted-foreground">等待上传...</p>}
|
||||
{state.status === 'uploading' && <p className="text-xs text-primary">上传中...</p>}
|
||||
{state.status === 'processing' && <p className="text-xs text-primary">处理中...</p>}
|
||||
{state.status === 'failed' && state.error && (
|
||||
<p className="text-xs text-red-500 truncate">{state.error}</p>
|
||||
)}
|
||||
{state.status === 'success' && (
|
||||
<p className="text-xs text-green-500">已完成</p>
|
||||
)}
|
||||
</div>
|
||||
{/* 进度条 */}
|
||||
{(state.status === 'uploading' || state.status === 'processing') && (
|
||||
<div className="mt-1 h-1 bg-muted rounded-full overflow-hidden">
|
||||
<div
|
||||
className="h-full bg-primary transition-all duration-300"
|
||||
style={{ width: `${state.progress}%` }}
|
||||
/>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
{state.status === 'success' && (
|
||||
<CheckCircle size={16} className="text-green-500 shrink-0" />
|
||||
)}
|
||||
{state.status === 'failed' && (
|
||||
<Button
|
||||
variant="ghost"
|
||||
size="icon"
|
||||
className="text-destructive hover:bg-destructive/10 shrink-0"
|
||||
onClick={() => {
|
||||
setUploadStates(prev => prev.filter((_, i) => i !== index));
|
||||
}}
|
||||
>
|
||||
<Trash2 size={14} />
|
||||
</Button>
|
||||
)}
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* 已上传文件列表(没有正在上传时显示) */}
|
||||
{uploadStates.length === 0 && (uploadedFiles.length > 0 || uploadedFile) ? (
|
||||
<div className="space-y-3">
|
||||
{/* 文件列表头部 */}
|
||||
<div
|
||||
@@ -739,6 +914,84 @@ const Documents: React.FC = () => {
|
||||
{/* 展开的文件列表 */}
|
||||
{uploadExpanded && (
|
||||
<div className="space-y-2 border rounded-xl p-3">
|
||||
{/* 显示已上传文件列表 */}
|
||||
{(uploadedFiles.length > 0 ? uploadedFiles : [uploadedFile]).filter(Boolean).map((file, index) => (
|
||||
<div key={index} className="flex items-center gap-3 p-2 bg-background rounded-lg">
|
||||
<div className={cn(
|
||||
"w-8 h-8 rounded flex items-center justify-center",
|
||||
isExcelFile(file?.name || '') ? "bg-emerald-500/10 text-emerald-500" : "bg-blue-500/10 text-blue-500"
|
||||
)}>
|
||||
{isExcelFile(file?.name || '') ? <FileSpreadsheet size={16} /> : <FileText size={16} />}
|
||||
</div>
|
||||
<div className="flex-1 min-w-0">
|
||||
<p className="text-sm truncate">{file?.name}</p>
|
||||
<p className="text-xs text-muted-foreground">{formatFileSize(file?.size || 0)}</p>
|
||||
</div>
|
||||
<Button
|
||||
variant="ghost"
|
||||
size="icon"
|
||||
className="text-destructive hover:bg-destructive/10"
|
||||
onClick={() => handleRemoveUploadedFile(index)}
|
||||
>
|
||||
<Trash2 size={14} />
|
||||
</Button>
|
||||
</div>
|
||||
))}
|
||||
|
||||
{/* 继续添加按钮 */}
|
||||
<div
|
||||
{...getRootProps()}
|
||||
className="flex items-center justify-center gap-2 p-3 border-2 border-dashed rounded-lg cursor-pointer hover:border-primary/50 hover:bg-primary/5 transition-colors"
|
||||
onClick={(e) => e.stopPropagation()}
|
||||
>
|
||||
<input {...getInputProps()} multiple={true} />
|
||||
<Plus size={16} className="text-muted-foreground" />
|
||||
<span className="text-sm text-muted-foreground">继续添加更多文件</span>
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
) : (uploadedFiles.length > 0 || uploadedFile) ? (
|
||||
<div className="space-y-3">
|
||||
{/* 文件列表头部 */}
|
||||
<div
|
||||
className="flex items-center justify-between p-3 bg-muted/50 rounded-xl cursor-pointer hover:bg-muted/70 transition-colors"
|
||||
onClick={() => setUploadExpanded(!uploadExpanded)}
|
||||
>
|
||||
<div className="flex items-center gap-3">
|
||||
<div className="w-10 h-10 rounded-lg bg-primary/10 text-primary flex items-center justify-center">
|
||||
<Upload size={20} />
|
||||
</div>
|
||||
<div>
|
||||
<p className="font-semibold text-sm">
|
||||
已上传 {(uploadedFiles.length > 0 ? uploadedFiles : [uploadedFile]).length} 个文件
|
||||
</p>
|
||||
<p className="text-xs text-muted-foreground">
|
||||
{uploadExpanded ? '点击收起' : '点击展开查看'}
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
<div className="flex items-center gap-2">
|
||||
<Button
|
||||
variant="ghost"
|
||||
size="sm"
|
||||
onClick={(e) => {
|
||||
e.stopPropagation();
|
||||
handleDeleteFile();
|
||||
}}
|
||||
className="text-destructive hover:text-destructive"
|
||||
>
|
||||
<Trash2 size={14} className="mr-1" />
|
||||
清空
|
||||
</Button>
|
||||
{uploadExpanded ? <ChevronUp size={16} /> : <ChevronDown size={16} />}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{/* 展开的文件列表 */}
|
||||
{uploadExpanded && (
|
||||
<div className="space-y-2 border rounded-xl p-3">
|
||||
{/* 显示已上传文件列表 */}
|
||||
{(uploadedFiles.length > 0 ? uploadedFiles : [uploadedFile]).filter(Boolean).map((file, index) => (
|
||||
<div key={index} className="flex items-center gap-3 p-2 bg-background rounded-lg">
|
||||
<div className={cn(
|
||||
|
||||
@@ -1,26 +1,10 @@
|
||||
import React, { useState, useRef, useEffect } from 'react';
|
||||
import {
|
||||
Send,
|
||||
Bot,
|
||||
User,
|
||||
Sparkles,
|
||||
Trash2,
|
||||
RefreshCcw,
|
||||
FileText,
|
||||
TableProperties,
|
||||
ChevronRight,
|
||||
ArrowRight,
|
||||
Loader2,
|
||||
Download,
|
||||
Search,
|
||||
MessageSquare,
|
||||
CheckCircle
|
||||
} from 'lucide-react';
|
||||
import { Send, Bot, User, Sparkles, Trash2, FileText, TableProperties, ArrowRight, Search, MessageSquare } from 'lucide-react';
|
||||
import { Button } from '@/components/ui/button';
|
||||
import { Input } from '@/components/ui/input';
|
||||
import { Card, CardContent, CardHeader, CardTitle } from '@/components/ui/card';
|
||||
import { ScrollArea } from '@/components/ui/scroll-area';
|
||||
import { Badge } from '@/components/ui/badge';
|
||||
import { Markdown } from '@/components/ui/markdown';
|
||||
import { backendApi } from '@/db/backend-api';
|
||||
import { toast } from 'sonner';
|
||||
import { cn } from '@/lib/utils';
|
||||
@@ -39,8 +23,21 @@ const InstructionChat: React.FC = () => {
|
||||
const [input, setInput] = useState('');
|
||||
const [loading, setLoading] = useState(false);
|
||||
const [currentDocIds, setCurrentDocIds] = useState<string[]>([]);
|
||||
const [conversationId, setConversationId] = useState<string>('');
|
||||
const scrollAreaRef = useRef<HTMLDivElement>(null);
|
||||
|
||||
// Initialise the conversation ID: reuse the one stored in localStorage,
// otherwise generate a new one, put it in state and persist it.
useEffect(() => {
  const persistedId = localStorage.getItem('chat_conversation_id');
  if (persistedId) {
    setConversationId(persistedId);
    return;
  }

  const timestamp = Date.now();
  const randomSuffix = Math.random().toString(36).substring(7);
  const freshId = `conv_${timestamp}_${randomSuffix}`;
  setConversationId(freshId);
  localStorage.setItem('chat_conversation_id', freshId);
}, []);
|
||||
|
||||
useEffect(() => {
|
||||
// Initial welcome message
|
||||
if (messages.length === 0) {
|
||||
@@ -119,7 +116,8 @@ const InstructionChat: React.FC = () => {
|
||||
// 使用真实的智能指令 API
|
||||
const response = await backendApi.instructionChat(
|
||||
input.trim(),
|
||||
currentDocIds.length > 0 ? currentDocIds : undefined
|
||||
currentDocIds.length > 0 ? currentDocIds : undefined,
|
||||
{ conversation_id: conversationId }
|
||||
);
|
||||
|
||||
// 根据意图类型生成友好响应
|
||||
@@ -135,11 +133,12 @@ const InstructionChat: React.FC = () => {
|
||||
responseContent = `✅ 已提取到 ${keys.length} 个字段的数据:\n\n`;
|
||||
for (const [key, value] of Object.entries(extracted)) {
|
||||
const values = Array.isArray(value) ? value : [value];
|
||||
responseContent += `**${key}**: ${values.slice(0, 3).join(', ')}${values.length > 3 ? '...' : ''}\n`;
|
||||
const displayValues = values.length > 10 ? values.slice(0, 10).join(', ') + ` ...(共${values.length}条)` : values.join(', ');
|
||||
responseContent += `**${key}**: ${displayValues}\n`;
|
||||
}
|
||||
responseContent += `\n💡 您可以将这些数据填入表格。`;
|
||||
responseContent += `\n💡 可直接使用以上数据,或说"填入表格"继续填表操作。`;
|
||||
} else {
|
||||
responseContent = '未能从文档中提取到相关数据。请尝试更明确的字段名称。';
|
||||
responseContent = resultData?.message || '未能从文档中提取到相关数据。请尝试更明确的字段名称。';
|
||||
}
|
||||
break;
|
||||
|
||||
@@ -151,24 +150,24 @@ const InstructionChat: React.FC = () => {
|
||||
responseContent = `✅ 填表完成!成功填写 ${filledKeys.length} 个字段:\n\n`;
|
||||
for (const [key, value] of Object.entries(filled)) {
|
||||
const values = Array.isArray(value) ? value : [value];
|
||||
responseContent += `**${key}**: ${values.slice(0, 3).join(', ')}\n`;
|
||||
const displayValues = values.length > 10 ? values.slice(0, 10).join(', ') + ` ...(共${values.length}条)` : values.join(', ');
|
||||
responseContent += `**${key}**: ${displayValues}\n`;
|
||||
}
|
||||
responseContent += `\n📋 请到【智能填表】页面查看或导出结果。`;
|
||||
} else {
|
||||
responseContent = '填表未能提取到数据。请检查模板表头和数据源内容。';
|
||||
responseContent = resultData?.message || '填表未能提取到数据。请检查模板表头和数据源内容。';
|
||||
}
|
||||
break;
|
||||
|
||||
case 'summarize':
|
||||
// 摘要结果
|
||||
const summaries = resultData?.summaries || [];
|
||||
if (summaries.length > 0) {
|
||||
responseContent = `📄 找到 ${summaries.length} 个文档的摘要:\n\n`;
|
||||
summaries.forEach((s: any, idx: number) => {
|
||||
responseContent += `**${idx + 1}. ${s.filename}**\n${s.content_preview}\n\n`;
|
||||
});
|
||||
if (resultData?.action_needed === 'provide_document' || resultData?.action_needed === 'upload_document') {
|
||||
responseContent = `📋 ${resultData.message}\n\n${resultData.suggestion || ''}`;
|
||||
} else if (resultData?.ai_summary) {
|
||||
// AI 生成的摘要
|
||||
responseContent = `📄 **${resultData.filename}** 摘要分析:\n\n${resultData.ai_summary}`;
|
||||
} else {
|
||||
responseContent = '未能生成摘要。请确保已上传文档。';
|
||||
responseContent = resultData?.message || '未能生成摘要。请确保已上传文档。';
|
||||
}
|
||||
break;
|
||||
|
||||
@@ -176,8 +175,10 @@ const InstructionChat: React.FC = () => {
|
||||
// 问答结果
|
||||
if (resultData?.answer) {
|
||||
responseContent = `**问题**: ${resultData.question}\n\n**答案**: ${resultData.answer}`;
|
||||
} else if (resultData?.context_preview) {
|
||||
responseContent = `**问题**: ${resultData.question}\n\n**相关上下文**:\n${resultData.context_preview}`;
|
||||
} else {
|
||||
responseContent = resultData?.message || '我找到了相关信息,请查看上文。';
|
||||
responseContent = resultData?.message || '请先上传文档,我才能回答您的问题。';
|
||||
}
|
||||
break;
|
||||
|
||||
@@ -207,8 +208,35 @@ const InstructionChat: React.FC = () => {
|
||||
}
|
||||
break;
|
||||
|
||||
case 'edit':
|
||||
// 文档编辑结果
|
||||
if (resultData?.edited_content) {
|
||||
responseContent = `✏️ **${resultData.original_filename}** 编辑完成:\n\n${resultData.edited_content.substring(0, 500)}${resultData.edited_content.length > 500 ? '\n\n...(内容已截断)' : ''}`;
|
||||
} else {
|
||||
responseContent = resultData?.message || '编辑完成。';
|
||||
}
|
||||
break;
|
||||
|
||||
case 'transform':
|
||||
// 格式转换结果
|
||||
if (resultData?.excel_data) {
|
||||
responseContent = `🔄 格式转换完成!\n\n已转换为 **Excel** 格式,共 **${resultData.excel_data.length}** 行数据。\n\n${resultData.message || ''}`;
|
||||
} else if (resultData?.content) {
|
||||
responseContent = `🔄 格式转换完成!\n\n目标格式: **${resultData.target_format?.toUpperCase()}**\n\n${resultData.message || ''}`;
|
||||
} else {
|
||||
responseContent = resultData?.message || '格式转换完成。';
|
||||
}
|
||||
break;
|
||||
|
||||
case 'unknown':
|
||||
responseContent = `我理解您想要: "${input.trim()}"\n\n但我目前无法完成此操作。您可以尝试:\n\n1. **提取数据**: "提取医院数量和床位数"\n2. **填表**: "根据这些数据填表"\n3. **总结**: "总结这份文档"\n4. **问答**: "文档里说了什么?"\n5. **搜索**: "搜索相关内容"`;
|
||||
// 检查是否需要用户上传文档
|
||||
if (resultData?.suggestion) {
|
||||
responseContent = resultData.suggestion;
|
||||
} else if (resultData?.message && resultData.message !== '无法理解该指令,请尝试更明确的描述') {
|
||||
responseContent = resultData.message;
|
||||
} else {
|
||||
responseContent = `我理解您想要: "${input.trim()}"\n\n请尝试以下操作:\n\n1. **提取数据**: "提取医院数量和床位数"\n2. **填表**: "根据这些数据填表"\n3. **总结**: "总结这份文档"\n4. **问答**: "文档里说了什么?"\n5. **搜索**: "搜索相关内容"`;
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
@@ -299,9 +327,11 @@ const InstructionChat: React.FC = () => {
|
||||
? "bg-primary text-primary-foreground shadow-xl shadow-primary/20 rounded-tr-none"
|
||||
: "bg-white border border-border/50 shadow-md rounded-tl-none"
|
||||
)}>
|
||||
<p className="text-sm leading-relaxed whitespace-pre-wrap font-medium">
|
||||
{m.content}
|
||||
</p>
|
||||
{m.role === 'assistant' ? (
|
||||
<Markdown content={m.content} className="text-sm leading-relaxed prose prose-sm max-w-none" />
|
||||
) : (
|
||||
<p className="text-sm leading-relaxed whitespace-pre-wrap font-medium">{m.content}</p>
|
||||
)}
|
||||
<span className={cn(
|
||||
"text-[10px] block opacity-50 font-bold tracking-widest",
|
||||
m.role === 'user' ? "text-right" : "text-left"
|
||||
|
||||
@@ -248,15 +248,25 @@ const TemplateFill: React.FC = () => {
|
||||
if (!templateFile || !filledResult) return;
|
||||
|
||||
try {
|
||||
const ext = templateFile.name.split('.').pop()?.toLowerCase();
|
||||
const exportFormat = (ext === 'docx') ? 'docx' : 'xlsx';
|
||||
// 对于 Word 模板,如果已有填写后的文件(已填入表格单元格),传递其路径以便直接下载
|
||||
const filledFilePath = (ext === 'docx' && filledResult.filled_file_path)
|
||||
? filledResult.filled_file_path
|
||||
: undefined;
|
||||
const blob = await backendApi.exportFilledTemplate(
|
||||
templateId || 'temp',
|
||||
filledResult.filled_data || {},
|
||||
'xlsx'
|
||||
exportFormat,
|
||||
filledFilePath
|
||||
);
|
||||
const ext_match = templateFile.name.match(/\.([^.])+$/);
|
||||
const baseName = ext_match ? templateFile.name.replace(ext_match[0], '') : templateFile.name;
|
||||
const downloadName = `filled_${baseName}.${exportFormat}`;
|
||||
const url = URL.createObjectURL(blob);
|
||||
const a = document.createElement('a');
|
||||
a.href = url;
|
||||
a.download = `filled_${templateFile.name}`;
|
||||
a.download = downloadName;
|
||||
a.click();
|
||||
URL.revokeObjectURL(url);
|
||||
toast.success('导出成功');
|
||||
@@ -546,7 +556,7 @@ const TemplateFill: React.FC = () => {
|
||||
</div>
|
||||
<h3 className="text-xl font-bold mb-2">AI 正在智能分析并填表</h3>
|
||||
<p className="text-muted-foreground text-center max-w-md">
|
||||
系统正在从 {sourceFiles.length || sourceFilePaths.length} 份文档中检索相关信息...
|
||||
系统正在从 {sourceFiles.length || sourceFilePaths.length || sourceDocIds.length || 0} 份文档中检索相关信息...
|
||||
</p>
|
||||
</CardContent>
|
||||
</Card>
|
||||
@@ -562,7 +572,7 @@ const TemplateFill: React.FC = () => {
|
||||
填表完成
|
||||
</CardTitle>
|
||||
<CardDescription>
|
||||
系统已根据 {sourceFiles.length || sourceFilePaths.length} 份文档自动完成表格填写
|
||||
系统已根据 {filledResult.source_doc_count || sourceFiles.length || sourceFilePaths.length || sourceDocIds.length} 份文档自动完成表格填写
|
||||
</CardDescription>
|
||||
</CardHeader>
|
||||
<CardContent>
|
||||
|
||||
Reference in New Issue
Block a user