【智能助手增强】

- 新增对话历史管理:MongoDB新增conversations集合,存储用户与AI的对话上下文,支持多轮对话意图延续
- 新增对话历史API(conversation.py):GET/DELETE conversation历史、列出所有会话
- 意图解析增强:支持基于对话历史的意图识别,上下文理解更准确
- 字段提取优化:支持"提取文档中的医院数量"等自然语言模式,智能去除"文档中的"前缀
- 文档对比优化:从指令中提取文件名并精确匹配source_docs,支持"对比A和B两个文档"
- 文档摘要优化:使用LLM生成真实AI摘要而非返回原始文档预览

【Word模板填表核心功能】
- Word模板字段生成:空白Word上传后,自动从源文档(Excel/Word/TXT/MD)内容AI生成字段名
- Word模板填表(_fill_docx):将提取数据写入Word模板表格,支持精确匹配、模糊匹配、追加新行
- 数据润色(_polish_word_filled_data):LLM对多行Excel数据进行统计归纳(合计/平均/极值),转化为专业自然语言描述
- 段落格式输出:使用📌字段名+值段落+分隔线(灰色横线)格式,提升可读性
- 导出链打通:fill_template返回filled_file_path,export直接返回已填好的Word文件

【其他修复】
- 修复Word导出Windows文件锁问题:NamedTemporaryFile改为mkstemp+close
- 修复Word方框非法字符:扩展clean_text移除\uFFFD、□等Unicode替代符和零宽字符
- 修复文档对比"需要至少2个文档":从指令提取具体文件名优先匹配而非取前2个
- 修复导出format硬编码:自动识别docx/xlsx格式
- Docx解析器增加备用解析方法和更完整的段落/表格/标题提取
- RAG服务新增MySQL数据源支持
This commit is contained in:
dj
2026-04-15 23:32:55 +08:00
parent 9e7f9df384
commit e5d4724e82
19 changed files with 2185 additions and 407 deletions

View File

@@ -14,6 +14,7 @@ from app.api.endpoints import (
analysis_charts, analysis_charts,
health, health,
instruction, # 智能指令 instruction, # 智能指令
conversation, # 对话历史
) )
# 创建主路由 # 创建主路由
@@ -31,3 +32,4 @@ api_router.include_router(ai_analyze.router) # AI分析
api_router.include_router(visualization.router) # 可视化 api_router.include_router(visualization.router) # 可视化
api_router.include_router(analysis_charts.router) # 分析图表 api_router.include_router(analysis_charts.router) # 分析图表
api_router.include_router(instruction.router) # 智能指令 api_router.include_router(instruction.router) # 智能指令
api_router.include_router(conversation.router) # 对话历史

View File

@@ -0,0 +1,98 @@
"""
对话历史 API 接口
提供对话历史的存储和查询功能
"""
import logging
from typing import Optional
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
from app.core.database import mongodb
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/conversation", tags=["对话历史"])
# ==================== 请求/响应模型 ====================
class ConversationMessage(BaseModel):
    """One message of a conversation."""
    # Speaker of the message.
    role: str
    # Text of the message.
    content: str
    # Intent label attached to the message; optional.
    intent: Optional[str] = None
class ConversationHistoryResponse(BaseModel):
    """Response body of the conversation-history endpoint."""
    # False when the lookup failed (the endpoint then returns an empty list).
    success: bool
    # Messages of the requested conversation.
    messages: list
class ConversationListResponse(BaseModel):
    """Response body of the conversation-list endpoint."""
    # False when the listing failed (the endpoint then returns an empty list).
    success: bool
    # One entry per conversation.
    conversations: list
# ==================== 接口 ====================
@router.get("/{conversation_id}/history", response_model=ConversationHistoryResponse)
async def get_conversation_history(conversation_id: str, limit: int = 20):
    """
    Fetch the stored message history of one conversation.

    Args:
        conversation_id: ID of the conversation session.
        limit: Maximum number of messages to return (defaults to 20).

    Returns:
        A ConversationHistoryResponse. Any error is logged and reported as
        ``success=False`` with an empty message list instead of being raised.
    """
    try:
        payload = {
            "success": True,
            "messages": await mongodb.get_conversation_history(conversation_id, limit=limit),
        }
        return ConversationHistoryResponse(**payload)
    except Exception as e:
        logger.error(f"获取对话历史失败: {e}")
    # Only reached when the lookup (or building the response) failed.
    return ConversationHistoryResponse(success=False, messages=[])
@router.delete("/{conversation_id}")
async def delete_conversation(conversation_id: str):
    """
    Delete a conversation session.

    Args:
        conversation_id: ID of the conversation session.

    Returns:
        ``{"success": <bool>}`` with the storage layer's result, or
        ``{"success": False, "error": <text>}`` if the deletion raised.
    """
    try:
        deleted = await mongodb.delete_conversation(conversation_id)
    except Exception as e:
        logger.error(f"删除对话失败: {e}")
        return {"success": False, "error": str(e)}
    return {"success": deleted}
@router.get("/all", response_model=ConversationListResponse)
async def list_conversations(limit: int = 50, skip: int = 0):
    """
    List conversation sessions.

    Args:
        limit: Number of conversations to return.
        skip: Number of conversations to skip.

    Returns:
        A ConversationListResponse. Any error is logged and reported as
        ``success=False`` with an empty list instead of being raised.
    """
    try:
        payload = {
            "success": True,
            "conversations": await mongodb.list_conversations(limit=limit, skip=skip),
        }
        return ConversationListResponse(**payload)
    except Exception as e:
        logger.error(f"获取会话列表失败: {e}")
    # Only reached when the listing (or building the response) failed.
    return ConversationListResponse(success=False, conversations=[])

View File

@@ -4,6 +4,7 @@
支持多格式文档(docx/xlsx/md/txt)上传、解析、存储和RAG索引 支持多格式文档(docx/xlsx/md/txt)上传、解析、存储和RAG索引
集成 Excel 存储和 AI 生成字段描述 集成 Excel 存储和 AI 生成字段描述
""" """
import asyncio
import logging import logging
import uuid import uuid
from typing import List, Optional from typing import List, Optional
@@ -258,6 +259,7 @@ async def process_document(
) )
# 如果是 Excel存储到 MySQL + AI生成描述 + RAG索引 # 如果是 Excel存储到 MySQL + AI生成描述 + RAG索引
mysql_table_name = None
if doc_type in ["xlsx", "xls"]: if doc_type in ["xlsx", "xls"]:
await update_task_status( await update_task_status(
task_id, status="processing", task_id, status="processing",
@@ -265,17 +267,29 @@ async def process_document(
) )
try: try:
# 使用 TableRAG 服务完成建表和RAG索引 # 使用 TableRAG 服务存储到 MySQL跳过 RAG 索引以提升速度)
logger.info(f"开始存储Excel到MySQL: {original_filename}, file_path: {file_path}") logger.info(f"开始存储Excel到MySQL: {original_filename}, file_path: {file_path}")
rag_result = await table_rag_service.build_table_rag_index( rag_result = await table_rag_service.build_table_rag_index(
file_path=file_path, file_path=file_path,
filename=original_filename, filename=original_filename,
sheet_name=parse_options.get("sheet_name"), sheet_name=parse_options.get("sheet_name"),
header_row=parse_options.get("header_row", 0) header_row=parse_options.get("header_row", 0),
skip_rag_index=True # 跳过 AI 字段描述生成和索引
) )
if rag_result.get("success"): if rag_result.get("success"):
logger.info(f"Excel存储到MySQL成功: {original_filename}, table: {rag_result.get('table_name')}") mysql_table_name = rag_result.get('table_name')
logger.info(f"Excel存储到MySQL成功: {original_filename}, table: {mysql_table_name}")
# 更新 MongoDB 中的 metadata记录 MySQL 表名
try:
doc = await mongodb.get_document(doc_id)
if doc:
metadata = doc.get("metadata", {})
metadata["mysql_table_name"] = mysql_table_name
await mongodb.update_document_metadata(doc_id, metadata)
logger.info(f"已更新 MongoDB 文档的 mysql_table_name: {mysql_table_name}")
except Exception as update_err:
logger.warning(f"更新 MongoDB mysql_table_name 失败: {update_err}")
else: else:
logger.error(f"RAG索引构建失败: {rag_result.get('error')}") logger.error(f"RAG索引构建失败: {rag_result.get('error')}")
except Exception as e: except Exception as e:
@@ -283,17 +297,16 @@ async def process_document(
else: else:
# 非结构化文档 # 非结构化文档
await update_task_status(
task_id, status="processing",
progress=60, message="正在建立索引"
)
# 如果文档中有表格数据,提取并存储到 MySQL + RAG
structured_data = result.data.get("structured_data", {}) structured_data = result.data.get("structured_data", {})
tables = structured_data.get("tables", []) tables = structured_data.get("tables", [])
# 如果文档中有表格数据,提取并存储到 MySQL不需要 RAG 索引)
if tables: if tables:
# 对每个表格建立 MySQL 表和 RAG 索引 await update_task_status(
task_id, status="processing",
progress=60, message="正在存储表格数据"
)
# 对每个表格建立 MySQL 表(跳过 RAG 索引,速度更快)
for table_info in tables: for table_info in tables:
await table_rag_service.index_document_table( await table_rag_service.index_document_table(
doc_id=doc_id, doc_id=doc_id,
@@ -302,7 +315,13 @@ async def process_document(
source_doc_type=doc_type source_doc_type=doc_type
) )
# 同时对文档内容建立 RAG 索引 # 对文档内容建立 RAG 索引(非结构化文本需要语义搜索)
content = result.data.get("content", "")
if content and len(content) > 50: # 只有内容足够长才建立索引
await update_task_status(
task_id, status="processing",
progress=80, message="正在建立语义索引"
)
await index_document_to_rag(doc_id, original_filename, result, doc_type) await index_document_to_rag(doc_id, original_filename, result, doc_type)
# 完成 # 完成
@@ -328,26 +347,32 @@ async def process_document(
async def process_documents_batch(task_id: str, files: List[dict]): async def process_documents_batch(task_id: str, files: List[dict]):
"""批量处理文档""" """批量并行处理文档"""
try: try:
await update_task_status( await update_task_status(
task_id, status="processing", task_id, status="processing",
progress=0, message="开始批量处理" progress=0, message=f"开始批量处理 {len(files)} 个文档",
result={"total": len(files), "files": []}
) )
results = [] async def process_single_file(file_info: dict, index: int) -> dict:
for i, file_info in enumerate(files): """处理单个文件"""
filename = file_info["filename"]
try: try:
# 解析文档
parser = ParserFactory.get_parser(file_info["path"]) parser = ParserFactory.get_parser(file_info["path"])
result = parser.parse(file_info["path"]) result = parser.parse(file_info["path"])
if result.success: if not result.success:
return {"index": index, "filename": filename, "success": False, "error": result.error or "解析失败"}
# 存储到 MongoDB
doc_id = await mongodb.insert_document( doc_id = await mongodb.insert_document(
doc_type=file_info["ext"], doc_type=file_info["ext"],
content=result.data.get("content", ""), content=result.data.get("content", ""),
metadata={ metadata={
**result.metadata, **result.metadata,
"original_filename": file_info["filename"], "original_filename": filename,
"file_path": file_info["path"] "file_path": file_info["path"]
}, },
structured_data=result.data.get("structured_data") structured_data=result.data.get("structured_data")
@@ -357,43 +382,60 @@ async def process_documents_batch(task_id: str, files: List[dict]):
if file_info["ext"] in ["xlsx", "xls"]: if file_info["ext"] in ["xlsx", "xls"]:
await table_rag_service.build_table_rag_index( await table_rag_service.build_table_rag_index(
file_path=file_info["path"], file_path=file_info["path"],
filename=file_info["filename"] filename=filename,
skip_rag_index=True # 跳过 AI 字段描述生成和索引
) )
else: else:
# 非结构化文档:处理其中的表格 + 内容索引 # 非结构化文档
structured_data = result.data.get("structured_data", {}) structured_data = result.data.get("structured_data", {})
tables = structured_data.get("tables", []) tables = structured_data.get("tables", [])
# 表格数据直接存 MySQL跳过 RAG 索引)
if tables: if tables:
for table_info in tables: for table_info in tables:
await table_rag_service.index_document_table( await table_rag_service.index_document_table(
doc_id=doc_id, doc_id=doc_id,
filename=file_info["filename"], filename=filename,
table_data=table_info, table_data=table_info,
source_doc_type=file_info["ext"] source_doc_type=file_info["ext"]
) )
await index_document_to_rag(doc_id, file_info["filename"], result, file_info["ext"]) # 只有内容足够长才建立语义索引
content = result.data.get("content", "")
if content and len(content) > 50:
await index_document_to_rag(doc_id, filename, result, file_info["ext"])
results.append({"filename": file_info["filename"], "doc_id": doc_id, "success": True}) return {"index": index, "filename": filename, "doc_id": doc_id, "success": True}
else:
results.append({"filename": file_info["filename"], "success": False, "error": result.error})
except Exception as e: except Exception as e:
results.append({"filename": file_info["filename"], "success": False, "error": str(e)}) logger.error(f"处理文件 {filename} 失败: {e}")
return {"index": index, "filename": filename, "success": False, "error": str(e)}
progress = int((i + 1) / len(files) * 100) # 并行处理所有文档
await update_task_status( tasks = [process_single_file(f, i) for i, f in enumerate(files)]
task_id, status="processing", results = await asyncio.gather(*tasks)
progress=progress, message=f"已处理 {i+1}/{len(files)}"
)
# 按原始顺序排序
results.sort(key=lambda x: x["index"])
# 统计成功/失败数量
success_count = sum(1 for r in results if r["success"])
fail_count = len(results) - success_count
# 更新最终状态
await update_task_status( await update_task_status(
task_id, status="success", task_id, status="success",
progress=100, message="批量处理完成", progress=100, message=f"批量处理完成: {success_count} 成功, {fail_count} 失败",
result={"results": results} result={
"total": len(files),
"success": success_count,
"failure": fail_count,
"results": results
}
) )
logger.info(f"批量处理完成: {success_count}/{len(files)} 成功")
except Exception as e: except Exception as e:
logger.error(f"批量处理失败: {str(e)}") logger.error(f"批量处理失败: {str(e)}")
await update_task_status( await update_task_status(
@@ -404,20 +446,20 @@ async def process_documents_batch(task_id: str, files: List[dict]):
async def index_document_to_rag(doc_id: str, filename: str, result: ParseResult, doc_type: str): async def index_document_to_rag(doc_id: str, filename: str, result: ParseResult, doc_type: str):
"""将非结构化文档索引到 RAG使用分块索引""" """将非结构化文档索引到 RAG使用分块索引,异步执行"""
try: try:
content = result.data.get("content", "") content = result.data.get("content", "")
if content: if content:
# 将完整内容传递给 RAG 服务自动分块索引 # 使用异步方法索引,避免阻塞事件循环
rag_service.index_document_content( await rag_service.index_document_content_async(
doc_id=doc_id, doc_id=doc_id,
content=content, # 传递完整内容,由 RAG 服务自动分块 content=content,
metadata={ metadata={
"filename": filename, "filename": filename,
"doc_type": doc_type "doc_type": doc_type
}, },
chunk_size=500, # 每块 500 字符 chunk_size=1000, # 每块 1000 字符,提升速度
chunk_overlap=50 # 块之间 50 字符重叠 chunk_overlap=100 # 块之间 100 字符重叠
) )
logger.info(f"RAG 索引完成: {filename}, doc_id={doc_id}") logger.info(f"RAG 索引完成: {filename}, doc_id={doc_id}")
except Exception as e: except Exception as e:

View File

@@ -25,6 +25,7 @@ class InstructionRequest(BaseModel):
instruction: str instruction: str
doc_ids: Optional[List[str]] = None # 关联的文档 ID 列表 doc_ids: Optional[List[str]] = None # 关联的文档 ID 列表
context: Optional[Dict[str, Any]] = None # 额外上下文 context: Optional[Dict[str, Any]] = None # 额外上下文
conversation_id: Optional[str] = None # 对话会话ID用于关联历史记录
class IntentRecognitionResponse(BaseModel): class IntentRecognitionResponse(BaseModel):
@@ -240,7 +241,8 @@ async def instruction_chat(
task_id=task_id, task_id=task_id,
instruction=request.instruction, instruction=request.instruction,
doc_ids=request.doc_ids, doc_ids=request.doc_ids,
context=request.context context=request.context,
conversation_id=request.conversation_id
) )
return { return {
@@ -251,14 +253,15 @@ async def instruction_chat(
} }
# 同步模式:等待执行完成 # 同步模式:等待执行完成
return await _execute_chat_task(task_id, request.instruction, request.doc_ids, request.context) return await _execute_chat_task(task_id, request.instruction, request.doc_ids, request.context, request.conversation_id)
async def _execute_chat_task( async def _execute_chat_task(
task_id: str, task_id: str,
instruction: str, instruction: str,
doc_ids: Optional[List[str]], doc_ids: Optional[List[str]],
context: Optional[Dict[str, Any]] context: Optional[Dict[str, Any]],
conversation_id: Optional[str] = None
): ):
"""执行指令对话的后台任务""" """执行指令对话的后台任务"""
from app.core.database import mongodb as mongo_client from app.core.database import mongodb as mongo_client
@@ -278,6 +281,13 @@ async def _execute_chat_task(
# 构建上下文 # 构建上下文
ctx: Dict[str, Any] = context or {} ctx: Dict[str, Any] = context or {}
# 获取对话历史
if conversation_id:
history = await mongo_client.get_conversation_history(conversation_id, limit=20)
if history:
ctx["conversation_history"] = history
logger.info(f"加载对话历史: conversation_id={conversation_id}, 消息数={len(history)}")
# 获取关联文档 # 获取关联文档
if doc_ids: if doc_ids:
docs = [] docs = []
@@ -291,6 +301,29 @@ async def _execute_chat_task(
# 执行指令 # 执行指令
result = await instruction_executor.execute(instruction, ctx) result = await instruction_executor.execute(instruction, ctx)
# 存储对话历史
if conversation_id:
try:
# 存储用户消息
await mongo_client.insert_conversation(
conversation_id=conversation_id,
role="user",
content=instruction,
intent=result.get("intent", "unknown")
)
# 存储助手回复
response_content = result.get("message", "")
if response_content:
await mongo_client.insert_conversation(
conversation_id=conversation_id,
role="assistant",
content=response_content,
intent=result.get("intent", "unknown")
)
logger.info(f"已存储对话历史: conversation_id={conversation_id}")
except Exception as e:
logger.error(f"存储对话历史失败: {e}")
# 根据意图类型添加友好的响应消息 # 根据意图类型添加友好的响应消息
response_messages = { response_messages = {
"extract": f"已提取 {len(result.get('extracted_data', {}))} 个字段的数据", "extract": f"已提取 {len(result.get('extracted_data', {}))} 个字段的数据",

View File

@@ -87,6 +87,7 @@ class ExportRequest(BaseModel):
template_id: str template_id: str
filled_data: dict filled_data: dict
format: str = "xlsx" # xlsx 或 docx format: str = "xlsx" # xlsx 或 docx
filled_file_path: Optional[str] = None # 已填写的 Word 文件路径(可选)
# ==================== 接口实现 ==================== # ==================== 接口实现 ====================
@@ -541,7 +542,7 @@ async def export_filled_template(
if request.format == "xlsx": if request.format == "xlsx":
return await _export_to_excel(request.filled_data, request.template_id) return await _export_to_excel(request.filled_data, request.template_id)
elif request.format == "docx": elif request.format == "docx":
return await _export_to_word(request.filled_data, request.template_id) return await _export_to_word(request.filled_data, request.template_id, request.filled_file_path)
else: else:
raise HTTPException( raise HTTPException(
status_code=400, status_code=400,
@@ -608,11 +609,12 @@ async def _export_to_excel(filled_data: dict, template_id: str) -> StreamingResp
) )
async def _export_to_word(filled_data: dict, template_id: str) -> StreamingResponse: async def _export_to_word(filled_data: dict, template_id: str, filled_file_path: Optional[str] = None) -> StreamingResponse:
"""导出为 Word 格式""" """导出为 Word 格式"""
import re import re
import tempfile import tempfile
import os import os
import urllib.parse
from docx import Document from docx import Document
from docx.shared import Pt, RGBColor from docx.shared import Pt, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH from docx.enum.text import WD_ALIGN_PARAGRAPH
@@ -623,12 +625,32 @@ async def _export_to_word(filled_data: dict, template_id: str) -> StreamingRespo
return "" return ""
# 移除控制字符 # 移除控制字符
text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text) text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)
# 转义 XML 特殊字符以防破坏文档结构
text = text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
return text.strip() return text.strip()
tmp_path = None
try: try:
# 先保存到临时文件,再读取到内存,确保文档完整性 # 如果有已填写的文件(通过 _fill_docx 填写了模板单元格),直接返回该文件
with tempfile.NamedTemporaryFile(delete=False, suffix='.docx') as tmp_file: if filled_file_path and os.path.exists(filled_file_path):
tmp_path = tmp_file.name filename = os.path.basename(filled_file_path)
with open(filled_file_path, 'rb') as f:
file_content = f.read()
output = io.BytesIO(file_content)
encoded_filename = urllib.parse.quote(filename)
return StreamingResponse(
output,
media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
headers={
"Content-Disposition": f"attachment; filename*=UTF-8''{encoded_filename}",
"Content-Length": str(len(file_content))
}
)
# 没有已填写文件,创建新的 Word 文档(表格形式)
# 创建临时文件(立即关闭句柄,避免 Windows 文件锁问题)
tmp_fd, tmp_path = tempfile.mkstemp(suffix='.docx')
os.close(tmp_fd) # 关闭立即得到的 fd让 docx 可以写入
doc = Document() doc = Document()
doc.add_heading('填写结果', level=1) doc.add_heading('填写结果', level=1)
@@ -670,19 +692,23 @@ async def _export_to_word(filled_data: dict, template_id: str) -> StreamingRespo
finally: finally:
# 清理临时文件 # 清理临时文件
if os.path.exists(tmp_path): if tmp_path and os.path.exists(tmp_path):
try: try:
os.unlink(tmp_path) os.unlink(tmp_path)
except: except Exception:
pass pass
output = io.BytesIO(file_content) output = io.BytesIO(file_content)
filename = "filled_template.docx" filename = "filled_template.docx"
encoded_filename = urllib.parse.quote(filename)
return StreamingResponse( return StreamingResponse(
output, output,
media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document", media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
headers={"Content-Disposition": f"attachment; filename*=UTF-8''{filename}"} headers={
"Content-Disposition": f"attachment; filename*=UTF-8''{encoded_filename}",
"Content-Length": str(len(file_content))
}
) )

View File

@@ -64,6 +64,11 @@ class MongoDB:
"""任务集合 - 存储任务历史记录""" """任务集合 - 存储任务历史记录"""
return self.db["tasks"] return self.db["tasks"]
@property
def conversations(self):
    """Conversation collection - holds the conversation history records."""
    collection_name = "conversations"
    return self.db[collection_name]
# ==================== 文档操作 ==================== # ==================== 文档操作 ====================
async def insert_document( async def insert_document(
@@ -117,14 +122,20 @@ class MongoDB:
搜索文档 搜索文档
Args: Args:
query: 搜索关键词 query: 搜索关键词(支持文件名和内容搜索)
doc_type: 文档类型过滤 doc_type: 文档类型过滤
limit: 返回数量 limit: 返回数量
Returns: Returns:
文档列表 文档列表
""" """
filter_query = {"content": {"$regex": query}} filter_query = {
"$or": [
{"content": {"$regex": query, "$options": "i"}},
{"metadata.original_filename": {"$regex": query, "$options": "i"}},
{"metadata.filename": {"$regex": query, "$options": "i"}},
]
}
if doc_type: if doc_type:
filter_query["doc_type"] = doc_type filter_query["doc_type"] = doc_type
@@ -141,6 +152,15 @@ class MongoDB:
result = await self.documents.delete_one({"_id": ObjectId(doc_id)}) result = await self.documents.delete_one({"_id": ObjectId(doc_id)})
return result.deleted_count > 0 return result.deleted_count > 0
async def update_document_metadata(self, doc_id: str, metadata: Dict[str, Any]) -> bool:
    """
    Overwrite the ``metadata`` field of one document.

    Args:
        doc_id: String form of the document's ObjectId.
        metadata: New value stored under ``metadata`` (replaces the old one).

    Returns:
        True if the update modified a document.
    """
    from bson import ObjectId

    selector = {"_id": ObjectId(doc_id)}
    change = {"$set": {"metadata": metadata}}
    outcome = await self.documents.update_one(selector, change)
    return outcome.modified_count > 0
# ==================== RAG 索引操作 ==================== # ==================== RAG 索引操作 ====================
async def insert_rag_entry( async def insert_rag_entry(
@@ -251,6 +271,10 @@ class MongoDB:
await self.tasks.create_index("task_id", unique=True) await self.tasks.create_index("task_id", unique=True)
await self.tasks.create_index("created_at") await self.tasks.create_index("created_at")
# 对话集合索引
await self.conversations.create_index("conversation_id")
await self.conversations.create_index("created_at")
logger.info("MongoDB 索引创建完成") logger.info("MongoDB 索引创建完成")
# ==================== 任务历史操作 ==================== # ==================== 任务历史操作 ====================
@@ -369,6 +393,108 @@ class MongoDB:
result = await self.tasks.delete_one({"task_id": task_id}) result = await self.tasks.delete_one({"task_id": task_id})
return result.deleted_count > 0 return result.deleted_count > 0
# ==================== 对话历史操作 ====================
async def insert_conversation(
    self,
    conversation_id: str,
    role: str,
    content: str,
    intent: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> str:
    """
    Store one conversation message.

    Args:
        conversation_id: ID of the conversation session.
        role: Speaker role (user/assistant).
        content: Message text.
        intent: Intent type.
        metadata: Extra metadata; stored as an empty dict when omitted.

    Returns:
        ID of the inserted record, as a string.
    """
    record = dict(
        conversation_id=conversation_id,
        role=role,
        content=content,
        intent=intent,
        metadata=metadata or {},
        created_at=datetime.utcnow(),
    )
    inserted = await self.conversations.insert_one(record)
    return str(inserted.inserted_id)
async def get_conversation_history(
    self,
    conversation_id: str,
    limit: int = 20,
) -> List[Dict[str, Any]]:
    """
    Return the most recent messages of a conversation, oldest first.

    Args:
        conversation_id: ID of the conversation session.
        limit: Maximum number of messages to return.

    Returns:
        Up to ``limit`` messages in chronological order, with ``_id``
        converted to ``str`` and ``created_at`` to an ISO-8601 string.
    """
    # Sort newest-first so that ``limit`` keeps the latest messages. Sorting
    # oldest-first and then limiting froze the result at the first ``limit``
    # messages once a conversation grew beyond that size.
    # ``_id`` is a tie-breaker for messages sharing a timestamp (assumes the
    # driver-generated ObjectIds increase with insertion order).
    cursor = self.conversations.find(
        {"conversation_id": conversation_id}
    ).sort([("created_at", -1), ("_id", -1)]).limit(limit)

    messages = []
    async for msg in cursor:
        msg["_id"] = str(msg["_id"])
        if msg.get("created_at"):
            msg["created_at"] = msg["created_at"].isoformat()
        messages.append(msg)

    # Hand the messages back in chronological order, as callers expect.
    messages.reverse()
    return messages
async def delete_conversation(self, conversation_id: str) -> bool:
    """Delete all messages of a conversation; True if at least one record was removed."""
    selector = {"conversation_id": conversation_id}
    outcome = await self.conversations.delete_many(selector)
    removed = outcome.deleted_count
    return removed > 0
async def list_conversations(
    self,
    limit: int = 50,
    skip: int = 0,
) -> List[Dict[str, Any]]:
    """
    List conversations, ordered by their most recent message.

    Args:
        limit: Number of conversations to return.
        skip: Number of conversations to skip.

    Returns:
        One entry per conversation: its latest message, with ``_id``
        converted to ``str`` and ``created_at`` to an ISO-8601 string.
    """
    # Stage 1-3: keep only the newest message of every conversation.
    latest_per_conversation = [
        {"$sort": {"created_at": -1}},
        {"$group": {
            "_id": "$conversation_id",
            "last_message": {"$first": "$$ROOT"},
        }},
        {"$replaceRoot": {"newRoot": "$last_message"}},
    ]
    # Stage 4-6: order the conversations and apply paging.
    paging = [
        {"$sort": {"created_at": -1}},
        {"$skip": skip},
        {"$limit": limit},
    ]
    pipeline = latest_per_conversation + paging

    conversations = []
    async for doc in self.conversations.aggregate(pipeline):
        doc["_id"] = str(doc["_id"])
        created = doc.get("created_at")
        if created:
            doc["created_at"] = created.isoformat()
        conversations.append(doc)
    return conversations
# ==================== 全局单例 ==================== # ==================== 全局单例 ====================

View File

@@ -44,6 +44,22 @@ class DocxParser(BaseParser):
error=f"文件不存在: {file_path}" error=f"文件不存在: {file_path}"
) )
# 尝试使用 python-docx 解析,失败则使用备用方法
try:
return self._parse_with_docx(path)
except Exception as e:
logger.warning(f"python-docx 解析失败,使用备用方法: {e}")
try:
return self._parse_fallback(path)
except Exception as fallback_error:
logger.error(f"备用解析方法也失败: {fallback_error}")
return ParseResult(
success=False,
error=f"解析 Word 文档失败: {str(e)}"
)
def _parse_with_docx(self, path: Path) -> ParseResult:
"""使用 python-docx 解析文档"""
# 检查文件扩展名 # 检查文件扩展名
if path.suffix.lower() not in self.supported_extensions: if path.suffix.lower() not in self.supported_extensions:
return ParseResult( return ParseResult(
@@ -51,9 +67,8 @@ class DocxParser(BaseParser):
error=f"不支持的文件类型: {path.suffix}" error=f"不支持的文件类型: {path.suffix}"
) )
try:
# 读取 Word 文档 # 读取 Word 文档
doc = Document(file_path) doc = Document(path)
# 提取文本内容 # 提取文本内容
paragraphs = [] paragraphs = []
@@ -107,43 +122,123 @@ class DocxParser(BaseParser):
metadata = { metadata = {
"filename": path.name, "filename": path.name,
"extension": path.suffix.lower(), "extension": path.suffix.lower(),
"file_size": path.stat().st_size,
"paragraph_count": len(paragraphs), "paragraph_count": len(paragraphs),
"table_count": len(tables_data), "table_count": len(tables_data),
"word_count": len(full_text),
"char_count": len(full_text.replace("\n", "")),
"has_tables": len(tables_data) > 0,
"has_images": images_info.get("image_count", 0) > 0,
"image_count": images_info.get("image_count", 0) "image_count": images_info.get("image_count", 0)
} }
# 返回结果
return ParseResult( return ParseResult(
success=True, success=True,
data={ data={
"content": full_text, "content": full_text,
"paragraphs": paragraphs_text, "paragraphs": paragraphs,
"paragraphs_with_style": paragraphs, "paragraphs_with_style": paragraphs,
"tables": tables_data, "tables": tables_data,
"images": images_info,
"word_count": len(full_text),
"structured_data": {
"paragraphs": paragraphs,
"paragraphs_text": paragraphs_text,
"tables": tables_data,
"images": images_info "images": images_info
}
}, },
metadata=metadata metadata=metadata
) )
except Exception as e: def _parse_fallback(self, path: Path) -> ParseResult:
logger.error(f"解析 Word 文档失败: {str(e)}") """备用解析方法:直接解析 docx 的 XML 结构"""
import zipfile
from xml.etree import ElementTree as ET
try:
with zipfile.ZipFile(path, 'r') as zf:
# 读取 document.xml
if 'word/document.xml' not in zf.namelist():
return ParseResult(success=False, error="无效的 docx 文件格式")
xml_content = zf.read('word/document.xml')
root = ET.fromstring(xml_content)
# 命名空间
namespaces = {
'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
}
paragraphs = []
tables = []
current_table = []
for elem in root.iter():
if elem.tag.endswith('}p'): # 段落
text_parts = []
for t in elem.iter():
if t.tag.endswith('}t') and t.text:
text_parts.append(t.text)
text = ''.join(text_parts).strip()
if text:
paragraphs.append({'text': text, 'style': 'Normal'})
elif elem.tag.endswith('}tr'): # 表格行
row_data = []
for tc in elem.iter():
if tc.tag.endswith('}tc'): # 单元格
cell_text = []
for t in tc.iter():
if t.tag.endswith('}t') and t.text:
cell_text.append(t.text)
row_data.append(''.join(cell_text).strip())
if row_data:
current_table.append(row_data)
else:
# 表格结束,保存
if current_table:
tables.append({
'table_index': len(tables),
'rows': current_table,
'row_count': len(current_table),
'column_count': len(current_table[0]) if current_table else 0
})
current_table = []
# 保存最后一张表格
if current_table:
tables.append({
'table_index': len(tables),
'rows': current_table,
'row_count': len(current_table),
'column_count': len(current_table[0]) if current_table else 0
})
# 构建文本
paragraphs_text = [p["text"] for p in paragraphs]
full_text_parts = ["【文档正文】"] + paragraphs_text
if tables:
full_text_parts.append("\n【文档表格】")
for idx, table in enumerate(tables):
full_text_parts.append(f"--- 表格 {idx + 1} ---")
for row in table["rows"]:
full_text_parts.append(" | ".join(str(cell) for cell in row))
full_text = "\n".join(full_text_parts)
return ParseResult( return ParseResult(
success=False, success=True,
error=f"解析 Word 文档失败: {str(e)}" data={
"content": full_text,
"paragraphs": paragraphs,
"paragraphs_with_style": paragraphs,
"tables": tables,
"images": {"image_count": 0, "descriptions": []}
},
metadata={
"filename": path.name,
"extension": path.suffix.lower(),
"paragraph_count": len(paragraphs),
"table_count": len(tables),
"image_count": 0,
"parse_method": "fallback_xml"
}
) )
except zipfile.BadZipFile:
return ParseResult(success=False, error="无效的 ZIP/文档文件")
except Exception as e:
return ParseResult(success=False, error=f"备用解析失败: {str(e)}")
def extract_images_as_base64(self, file_path: str) -> List[Dict[str, str]]: def extract_images_as_base64(self, file_path: str) -> List[Dict[str, str]]:
""" """
提取 Word 文档中的所有图片,返回 base64 编码列表 提取 Word 文档中的所有图片,返回 base64 编码列表
@@ -197,6 +292,83 @@ class DocxParser(BaseParser):
logger.info(f"共提取 {len(images)} 张图片") logger.info(f"共提取 {len(images)} 张图片")
return images return images
def extract_text_from_images(self, file_path: str, lang: str = 'chi_sim+eng') -> Dict[str, Any]:
    """
    Run OCR over the images embedded in a Word document.

    Args:
        file_path: Path of the Word file.
        lang: Tesseract language code; defaults to Simplified Chinese plus
            English (chi_sim+eng).

    Returns:
        A dict with ``success``, ``image_count`` (number of images that
        yielded text), ``extracted_text`` (one entry per such image) and
        ``total_chars``; ``error`` is added when processing failed.
    """
    import zipfile
    from io import BytesIO
    from PIL import Image

    try:
        import pytesseract
    except ImportError:
        logger.warning("pytesseract 未安装OCR 功能不可用")
        return {
            "success": False,
            "error": "pytesseract 未安装,请运行: pip install pytesseract",
            "image_count": 0,
            "extracted_text": []
        }

    ocr_extensions = ('png', 'jpg', 'jpeg', 'gif', 'bmp')
    results = {
        "success": True,
        "image_count": 0,
        "extracted_text": [],
        "total_chars": 0
    }

    try:
        with zipfile.ZipFile(file_path, 'r') as archive:
            # Embedded pictures live under word/media/ inside the docx archive.
            media_files = [name for name in archive.namelist() if name.startswith('word/media/')]

            for idx, filename in enumerate(media_files):
                if filename.rsplit('.', 1)[-1].lower() not in ocr_extensions:
                    continue

                try:
                    image = Image.open(BytesIO(archive.read(filename)))
                    text = pytesseract.image_to_string(image, lang=lang).strip()
                    if not text:
                        continue

                    char_count = len(text)
                    results["extracted_text"].append({
                        "image_index": idx,
                        "filename": filename,
                        "text": text,
                        "char_count": char_count
                    })
                    results["total_chars"] += char_count
                    logger.info(f"图片 {filename} OCR 识别完成,提取 {len(text)} 字符")
                except Exception as e:
                    # A single unreadable image must not abort the whole run.
                    logger.warning(f"图片 {filename} OCR 识别失败: {str(e)}")

            results["image_count"] = len(results["extracted_text"])

    except zipfile.BadZipFile:
        results["success"] = False
        results["error"] = "无效的 Word 文档文件"
    except Exception as e:
        results["success"] = False
        results["error"] = f"OCR 处理失败: {str(e)}"

    return results
def extract_key_sentences(self, text: str, max_sentences: int = 10) -> List[str]: def extract_key_sentences(self, text: str, max_sentences: int = 10) -> List[str]:
""" """
从文本中提取关键句子 从文本中提取关键句子

View File

@@ -5,9 +5,10 @@
""" """
import logging import logging
import json import json
import re
from typing import Any, Dict, List, Optional from typing import Any, Dict, List, Optional
from app.services.template_fill_service import template_fill_service from app.services.template_fill_service import template_fill_service, TemplateField
from app.services.rag_service import rag_service from app.services.rag_service import rag_service
from app.services.markdown_ai_service import markdown_ai_service from app.services.markdown_ai_service import markdown_ai_service
from app.core.database import mongodb from app.core.database import mongodb
@@ -15,6 +16,31 @@ from app.core.database import mongodb
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def _extract_filenames_from_text(text: str) -> List[str]:
"""
从指令文本中提取文件名列表。
智能处理用''/''/'、分隔的多个文件名(尤其是带年号的统计公报)。
"""
# 先去掉"对比这两个文档"等引导语,只保留文件名部分
text = re.sub(r'^(?:对比|比较)这两个?文档[的差异]?[:]?', '', text).strip()
text = re.sub(r'两个文档.*$', '', text).strip()
if not text:
return []
# 直接查找所有带扩展名的文件名模式
results = []
for m in re.finditer(r'[^\s、和与]+(?=\.(?:docx|xlsx|md|txt))', text):
start = m.start()
ext_match = re.search(r'\.(?:docx|xlsx|md|txt)', text[m.end():])
if ext_match:
fn = text[start:m.end() + ext_match.end()]
if fn:
results.append(fn)
return results
class InstructionExecutor: class InstructionExecutor:
"""指令执行器""" """指令执行器"""
@@ -41,9 +67,10 @@ class InstructionExecutor:
self.intent_parser = intent_parser self.intent_parser = intent_parser
context = context or {} context = context or {}
context["instruction"] = instruction # 保存原始指令以便后续使用
# 解析意图 # 解析意图(传递对话历史上下文)
intent, params = await self.intent_parser.parse(instruction) intent, params = await self.intent_parser.parse(instruction, context)
# 根据意图类型执行相应操作 # 根据意图类型执行相应操作
if intent == "extract": if intent == "extract":
@@ -72,18 +99,48 @@ class InstructionExecutor:
async def _execute_extract(self, params: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]: async def _execute_extract(self, params: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]:
"""执行信息提取""" """执行信息提取"""
try: try:
target_fields = params.get("field_refs", []) # target_fields 来自意图解析field_refs 来自引号/字段关键词匹配
target_fields = params.get("target_fields", []) or params.get("field_refs", [])
doc_ids = params.get("document_refs", []) doc_ids = params.get("document_refs", [])
instruction_text = context.get("instruction", "")
# 如果没有指定文档,尝试按文件名精确搜索
if not doc_ids or "all_docs" in doc_ids:
if instruction_text:
import re
# 提取引号内的内容或文件名
filename_match = re.search(r'["""]([^"""]+)["""]', instruction_text)
if filename_match:
search_term = filename_match.group(1)
else:
match = re.search(r'([^\s]+\.(?:docx|xlsx|md|txt))', instruction_text)
search_term = match.group(1) if match else None
if search_term:
logger.info(f"提取时搜索文档: {search_term}")
searched_docs = await mongodb.search_documents(search_term, limit=5)
if searched_docs:
# 优先选择文件名完全匹配的文档
best_docs = [
d for d in searched_docs
if search_term.lower() in d.get("metadata", {}).get("original_filename", "").lower()
]
if not best_docs:
best_docs = [searched_docs[0]]
context["source_docs"] = best_docs
doc_ids = [doc.get("_id", "") for doc in best_docs]
logger.info(f"找到 {len(best_docs)} 个文档用于提取,最佳: {best_docs[0].get('metadata', {}).get('original_filename', '?')}")
if not target_fields: if not target_fields:
return { return {
"success": False, "success": False,
"intent": "extract",
"error": "未指定要提取的字段", "error": "未指定要提取的字段",
"message": "请明确说明要提取哪些字段,如:'提取医院数量和床位数'" "message": "请明确说明要提取哪些字段,如:'提取医院数量和床位数'"
} }
# 如果指定了文档,验证文档存在 # 如果指定了文档且还没有加载 source_docs则验证并加载
if doc_ids and "all_docs" not in doc_ids: if doc_ids and "all_docs" not in doc_ids and not context.get("source_docs"):
valid_docs = [] valid_docs = []
for doc_ref in doc_ids: for doc_ref in doc_ids:
doc_id = doc_ref.replace("doc_", "") doc_id = doc_ref.replace("doc_", "")
@@ -93,20 +150,22 @@ class InstructionExecutor:
if not valid_docs: if not valid_docs:
return { return {
"success": False, "success": False,
"intent": "extract",
"error": "指定的文档不存在", "error": "指定的文档不存在",
"message": "请检查文档编号是否正确" "message": "请检查文档编号是否正确"
} }
context["source_docs"] = valid_docs context["source_docs"] = valid_docs
# 构建字段列表 # 构建字段列表(使用 TemplateField dataclass
fields = [] fields = [
for i, field_name in enumerate(target_fields): TemplateField(
fields.append({ name=field_name,
"name": field_name, cell=f"A{i+1}",
"cell": f"A{i+1}", field_type="text",
"field_type": "text", required=False
"required": False )
}) for i, field_name in enumerate(target_fields)
]
# 调用填表服务 # 调用填表服务
result = await template_fill_service.fill_template( result = await template_fill_service.fill_template(
@@ -143,7 +202,7 @@ class InstructionExecutor:
} }
# 获取源文档 # 获取源文档
source_docs = context.get("source_docs", []) source_docs = context.get("source_docs", []) or []
source_doc_ids = [doc.get("_id") for doc in source_docs if doc.get("_id")] source_doc_ids = [doc.get("_id") for doc in source_docs if doc.get("_id")]
# 获取字段 # 获取字段
@@ -175,36 +234,103 @@ class InstructionExecutor:
} }
async def _execute_summarize(self, params: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]: async def _execute_summarize(self, params: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]:
"""执行摘要总结""" """执行摘要总结 - 使用 LLM 生成真实摘要"""
try: try:
docs = context.get("source_docs", []) import re
docs = context.get("source_docs", []) or []
instruction_text = context.get("instruction", "")
# 从指令中提取文件名/关键词,优先搜索精确文档
search_term = None
if instruction_text:
filename_match = re.search(r'["""]([^"""]+)["""]', instruction_text)
if filename_match:
search_term = filename_match.group(1)
else:
file_match = re.search(r'([^\s,]+\.(?:docx|xlsx|md|txt))', instruction_text)
if file_match:
search_term = file_match.group(1)
# 如果没有文档或有更精确的搜索词,尝试重新搜索
if not docs or search_term:
if search_term:
logger.info(f"按关键词搜索文档: {search_term}")
searched_docs = await mongodb.search_documents(search_term, limit=5)
if searched_docs:
# 优先使用文件名最匹配的文档
docs = sorted(
searched_docs,
key=lambda d: 1 if search_term.lower() in d.get("metadata", {}).get("original_filename", "").lower() else 0,
reverse=True
)
logger.info(f"找到 {len(docs)} 个文档,最佳匹配: {docs[0].get('metadata', {}).get('original_filename', '?')}")
if not docs: if not docs:
return { return {
"success": False, "success": True,
"error": "没有可用的文档", "intent": "summarize",
"message": "请先上传要总结的文档" "action_needed": "provide_document",
"message": "我理解了,您想分析文档内容。",
"suggestion": "请提供已上传文档的名称(可以是文件名或部分名称),或者上传您想要分析的文档。\n\n支持的格式docx、xlsx、md、txt\n\n例如:'分析2021年民政事业发展统计公报''总结卫生健康数据'"
} }
summaries = [] # 对第一个(最佳匹配)文档生成 AI 摘要
for doc in docs[:5]: # 最多处理5个文档 primary_doc = docs[0]
content = doc.get("content", "")[:5000] # 限制内容长度 content = primary_doc.get("content", "")
if content: filename = primary_doc.get("metadata", {}).get("original_filename", "未知文档")
summaries.append({
"filename": doc.get("metadata", {}).get("original_filename", "未知"), if not content:
"content_preview": content[:500] + "..." if len(content) > 500 else content return {
}) "success": False,
"intent": "summarize",
"error": "文档内容为空",
"message": f"文档 {filename} 没有可供分析的文本内容"
}
# 使用 LLM 生成摘要
content_for_summary = content[:12000] # 最多取前 12000 字
user_request = instruction_text or "请总结这份文档"
prompt = f"""请对以下文档进行全面、有条理的摘要分析。
文档名称:{filename}
用户要求:{user_request}
文档内容:
{content_for_summary}
请按以下格式输出摘要:
1. **文档概述**简述文档主题和背景2-3句
2. **主要内容**:列出文档的核心数据和关键信息(用要点列出)
3. **重要数据**:提取文档中的重要数字、统计数据
4. **主要结论**:归纳文档的主要结论或趋势
要求:条理清晰,数据准确,不要遗漏关键信息。"""
from app.services.llm_service import llm_service
messages = [
{"role": "system", "content": "你是一个专业的文档分析助手,擅长提取关键信息并生成结构化摘要。"},
{"role": "user", "content": prompt}
]
response = await llm_service.chat(messages=messages, temperature=0.3, max_tokens=2000)
ai_summary = llm_service.extract_message_content(response)
return { return {
"success": True, "success": True,
"intent": "summarize", "intent": "summarize",
"summaries": summaries, "ai_summary": ai_summary,
"message": f"找到 {len(summaries)} 个文档可供参考" "filename": filename,
"doc_id": primary_doc.get("_id", ""),
"total_docs_found": len(docs),
"message": f"已生成文档摘要"
} }
except Exception as e: except Exception as e:
logger.error(f"摘要执行失败: {e}") logger.error(f"摘要执行失败: {e}")
return { return {
"success": False, "success": False,
"intent": "summarize",
"error": str(e), "error": str(e),
"message": f"摘要生成失败: {str(e)}" "message": f"摘要生成失败: {str(e)}"
} }
@@ -213,17 +339,39 @@ class InstructionExecutor:
"""执行问答""" """执行问答"""
try: try:
question = params.get("question", "") question = params.get("question", "")
instruction_text = context.get("instruction", "")
if not question: if not question:
return { return {
"success": False, "success": False,
"intent": "question",
"error": "未提供问题", "error": "未提供问题",
"message": "请输入要回答的问题" "message": "请输入要回答的问题"
} }
# 使用 RAG 检索相关文档 docs = context.get("source_docs", []) or []
docs = context.get("source_docs", [])
rag_results = []
# 如果没有文档,尝试从指令中提取文件名搜索
if not docs:
filename_match = re.search(r'["""]([^"""]+\.(?:docx|xlsx|md|txt))["""]', instruction_text)
if not filename_match:
filename_match = re.search(r'([^\s]+\.(?:docx|xlsx|md|txt))', instruction_text)
if filename_match:
found = await mongodb.search_documents(filename_match.group(1), limit=5)
if found:
docs = found
if not docs:
return {
"success": True,
"intent": "question",
"question": question,
"answer": None,
"message": "请先上传文档,我才能回答您的问题"
}
# 使用 RAG 检索相关文档
rag_results = []
for doc in docs: for doc in docs:
doc_id = doc.get("_id", "") doc_id = doc.get("_id", "")
if doc_id: if doc_id:
@@ -241,12 +389,42 @@ class InstructionExecutor:
doc.get("content", "")[:3000] for doc in docs[:3] if doc.get("content") doc.get("content", "")[:3000] for doc in docs[:3] if doc.get("content")
]) ])
if not context_text:
return { return {
"success": True, "success": True,
"intent": "question", "intent": "question",
"question": question, "question": question,
"context_preview": context_text[:500] + "..." if len(context_text) > 500 else context_text, "answer": None,
"message": "已找到相关上下文,可进行问答" "message": "文档内容为空,无法回答问题"
}
# 使用 LLM 生成答案
filename = docs[0].get("metadata", {}).get("original_filename", "文档")
prompt = f"""基于以下文档内容,回答用户的问题。
文档名称:{filename}
用户问题:{question}
文档内容:
{context_text[:8000]}
请根据文档内容准确回答问题。如果文档中没有相关信息,请明确说明。"""
from app.services.llm_service import llm_service
messages = [
{"role": "system", "content": "你是一个专业的文档问答助手,根据提供的内容准确回答用户问题。"},
{"role": "user", "content": prompt}
]
response = await llm_service.chat(messages=messages, temperature=0.3, max_tokens=1500)
answer = llm_service.extract_message_content(response)
return {
"success": True,
"intent": "question",
"question": question,
"answer": answer,
"filename": filename,
"message": "已生成回答"
} }
except Exception as e: except Exception as e:
@@ -299,12 +477,53 @@ class InstructionExecutor:
async def _execute_compare(self, params: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]: async def _execute_compare(self, params: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]:
"""执行对比分析""" """执行对比分析"""
try: try:
docs = context.get("source_docs", []) docs = context.get("source_docs", []) or []
instruction_text = context.get("instruction", "")
# 优先从指令中提取具体的文件名
filenames = _extract_filenames_from_text(instruction_text)
if filenames:
# 只选择文件名匹配的那些文档
matched_docs = []
for doc in docs:
fname = doc.get("metadata", {}).get("original_filename", "").lower()
for fn in filenames:
if fn.lower() in fname or fname in fn.lower():
matched_docs.append(doc)
break
# 如果匹配到足够文档,用匹配的
if len(matched_docs) >= 2:
docs = matched_docs
else:
# 匹配不够,尝试按文件名搜索 MongoDB
all_found = []
for fn in filenames:
found = await mongodb.search_documents(fn, limit=5)
all_found.extend(found)
seen = set()
unique_docs = []
for d in all_found:
did = d.get("_id", "")
if did and did not in seen:
seen.add(did)
unique_docs.append(d)
if len(unique_docs) >= 2:
docs = unique_docs
elif len(unique_docs) == 1 and len(docs) >= 1:
# 找到一个指定的 + 用一个通用的
docs = unique_docs + docs[:1]
elif docs and len(filenames) == 1:
# 找到一个指定文件名但只有一个匹配,尝试补充
docs = unique_docs + [d for d in docs if d not in unique_docs]
docs = docs[:2]
if len(docs) < 2: if len(docs) < 2:
return { return {
"success": False, "success": False,
"intent": "compare",
"error": "对比需要至少2个文档", "error": "对比需要至少2个文档",
"message": "请上传至少2个文档进行对比" "message": "请上传至少2个文档进行对比,或明确说出要对比的文档名称"
} }
# 提取文档基本信息 # 提取文档基本信息
@@ -329,6 +548,7 @@ class InstructionExecutor:
logger.error(f"对比执行失败: {e}") logger.error(f"对比执行失败: {e}")
return { return {
"success": False, "success": False,
"intent": "compare",
"error": str(e), "error": str(e),
"message": f"对比分析失败: {str(e)}" "message": f"对比分析失败: {str(e)}"
} }
@@ -336,10 +556,23 @@ class InstructionExecutor:
async def _execute_edit(self, params: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]: async def _execute_edit(self, params: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]:
"""执行文档编辑操作""" """执行文档编辑操作"""
try: try:
docs = context.get("source_docs", []) docs = context.get("source_docs", []) or []
instruction_text = context.get("instruction", "")
# 如果没有文档,尝试从指令中提取文件名搜索
if not docs:
filename_match = re.search(r'["""]([^"""]+\.(?:docx|xlsx|md|txt))["""]', instruction_text)
if not filename_match:
filename_match = re.search(r'([^\s]+\.(?:docx|xlsx|md|txt))', instruction_text)
if filename_match:
found = await mongodb.search_documents(filename_match.group(1), limit=3)
if found:
docs = found
if not docs: if not docs:
return { return {
"success": False, "success": False,
"intent": "edit",
"error": "没有可用的文档", "error": "没有可用的文档",
"message": "请先上传要编辑的文档" "message": "请先上传要编辑的文档"
} }
@@ -405,7 +638,7 @@ class InstructionExecutor:
- Word -> Markdown - Word -> Markdown
""" """
try: try:
docs = context.get("source_docs", []) docs = context.get("source_docs", []) or []
if not docs: if not docs:
return { return {
"success": False, "success": False,

View File

@@ -28,7 +28,7 @@ class IntentParser:
INTENT_KEYWORDS = { INTENT_KEYWORDS = {
INTENT_EXTRACT: ["提取", "抽取", "获取", "找出", "查找", "识别", "找到"], INTENT_EXTRACT: ["提取", "抽取", "获取", "找出", "查找", "识别", "找到"],
INTENT_FILL_TABLE: ["填表", "填写", "填充", "录入", "导入到表格", "填写到"], INTENT_FILL_TABLE: ["填表", "填写", "填充", "录入", "导入到表格", "填写到"],
INTENT_SUMMARIZE: ["总结", "摘要", "概括", "概述", "归纳", "提炼"], INTENT_SUMMARIZE: ["总结", "摘要", "概括", "概述", "归纳", "提炼", "分析", "聊聊"],
INTENT_QUESTION: ["问答", "回答", "解释", "什么是", "为什么", "如何", "怎样", "多少", "几个"], INTENT_QUESTION: ["问答", "回答", "解释", "什么是", "为什么", "如何", "怎样", "多少", "几个"],
INTENT_SEARCH: ["搜索", "查找", "检索", "查询", ""], INTENT_SEARCH: ["搜索", "查找", "检索", "查询", ""],
INTENT_COMPARE: ["对比", "比较", "差异", "区别", "不同"], INTENT_COMPARE: ["对比", "比较", "差异", "区别", "不同"],
@@ -47,12 +47,13 @@ class IntentParser:
def __init__(self): def __init__(self):
self.intent_history: List[Dict[str, Any]] = [] self.intent_history: List[Dict[str, Any]] = []
async def parse(self, text: str) -> Tuple[str, Dict[str, Any]]: async def parse(self, text: str, context: Dict[str, Any] = None) -> Tuple[str, Dict[str, Any]]:
""" """
解析自然语言指令 解析自然语言指令
Args: Args:
text: 用户输入的自然语言 text: 用户输入的自然语言
context: 执行上下文(包含对话历史等)
Returns: Returns:
(意图类型, 参数字典) (意图类型, 参数字典)
@@ -61,11 +62,17 @@ class IntentParser:
if not text: if not text:
return self.INTENT_UNKNOWN, {} return self.INTENT_UNKNOWN, {}
# 检查对话历史中的上下文
conversation_history = []
if context and context.get("conversation_history"):
conversation_history = context.get("conversation_history", [])
logger.info(f"解析时使用对话历史: {len(conversation_history)} 条消息")
# 记录历史 # 记录历史
self.intent_history.append({"text": text, "intent": None}) self.intent_history.append({"text": text, "intent": None})
# 识别意图 # 识别意图(考虑对话上下文)
intent = self._recognize_intent(text) intent = self._recognize_intent_with_context(text, conversation_history)
# 提取参数 # 提取参数
params = self._extract_params(text, intent) params = self._extract_params(text, intent)
@@ -78,6 +85,42 @@ class IntentParser:
return intent, params return intent, params
def _recognize_intent_with_context(self, text: str, conversation_history: List[Dict[str, Any]]) -> str:
"""
基于对话历史识别意图
Args:
text: 当前用户输入
conversation_history: 对话历史
Returns:
意图类型
"""
# 如果对话历史为空,使用基础意图识别
if not conversation_history:
return self._recognize_intent(text)
# 基于历史上下文进行意图识别
# 分析最近的对话了解用户意图的延续性
last_intent = None
last_topic = None
for msg in conversation_history[-5:]: # 最多看最近5条消息
if msg.get("role") == "assistant":
last_intent = msg.get("intent")
if msg.get("intent") and msg.get("intent") != "unknown":
last_topic = msg.get("intent")
# 如果当前消息很短(如"继续"、"是的"),可能延续之前的意图
short_confirmation = ["", "是的", "", "继续", "ok", "", "接着", "然后", "还有吗"]
if text.strip() in short_confirmation or len(text.strip()) <= 3:
if last_topic:
logger.info(f"简短确认,延续之前的意图: {last_topic}")
return last_topic
# 否则使用标准意图识别
return self._recognize_intent(text)
def _recognize_intent(self, text: str) -> str: def _recognize_intent(self, text: str) -> str:
"""识别意图类型""" """识别意图类型"""
intent_scores: Dict[str, float] = {} intent_scores: Dict[str, float] = {}
@@ -214,18 +257,27 @@ class IntentParser:
return template_info if template_info else None return template_info if template_info else None
def _extract_target_fields(self, text: str) -> List[str]: def _extract_target_fields(self, text: str) -> List[str]:
"""提取目标字段""" """提取目标字段 - 按分隔符切分再逐段清理"""
fields = [] fields = []
# 匹配 "提取XXX和YYY"、"抽取XXX、YYY" # 去除提取/抽取前缀
patterns = [ cleaned_text = re.sub(r"^(?:提取|抽取)", "", text).strip()
r"提取([^(and|,|)+]+?)(?:和|与|、|,|plus)",
r"抽取([^(and|,|)+]+?)(?:和|与|、|,|plus)",
]
for pattern in patterns: # 按'和'、'与'、'、'分割成多段
matches = re.findall(pattern, text) segments = re.split(r"[和与、]", cleaned_text)
fields.extend([m.strip() for m in matches if m.strip()])
# 常见前缀(这些不是字段名,需要去除)
prefixes = ["文档中的", "文档中", "文件中的", "文件中", "内容中的", "内容中"]
for seg in segments:
seg = seg.strip()
# 去除常见前缀
for p in prefixes:
if seg.startswith(p):
seg = seg[len(p):]
break
if seg and 2 <= len(seg) <= 20:
fields.append(seg)
return list(set(fields)) return list(set(fields))

View File

@@ -526,9 +526,10 @@ class ExcelStorageService:
# 创建表 # 创建表
model_class = self._create_table_model(table_name, columns, column_types) model_class = self._create_table_model(table_name, columns, column_types)
# 创建表结构 # 创建表结构 (使用异步方式)
async with self.mysql_db.get_session() as session: async with self.mysql_db.get_session() as session:
model_class.__table__.create(session.bind, checkfirst=True) async with session.bind.begin() as conn:
await conn.run_sync(lambda: model_class.__table__.create(checkfirst=True))
# 插入数据 # 插入数据
records = [] records = []

View File

@@ -165,9 +165,9 @@ class BM25:
class RAGService: class RAGService:
"""RAG 检索增强服务""" """RAG 检索增强服务"""
# 默认分块参数 # 默认分块参数 - 增大块大小减少embedding次数
DEFAULT_CHUNK_SIZE = 500 # 每个文本块的大小(字符数) DEFAULT_CHUNK_SIZE = 1000 # 每个文本块的大小(字符数),增大以提升速度
DEFAULT_CHUNK_OVERLAP = 50 # 块之间的重叠(字符数) DEFAULT_CHUNK_OVERLAP = 100 # 块之间的重叠(字符数)
def __init__(self): def __init__(self):
self.embedding_model = None self.embedding_model = None
@@ -389,6 +389,70 @@ class RAGService:
self._add_documents(documents, chunk_ids) self._add_documents(documents, chunk_ids)
logger.info(f"已索引文档 {doc_id},共 {len(chunks)} 个块") logger.info(f"已索引文档 {doc_id},共 {len(chunks)} 个块")
async def index_document_content_async(
    self,
    doc_id: str,
    content: str,
    metadata: Optional[Dict[str, Any]] = None,
    chunk_size: int = None,
    chunk_overlap: int = None
):
    """Asynchronously index a document's content into the vector store.

    The content is split into chunks; every chunk becomes a SimpleDocument
    whose metadata is a copy of ``metadata`` extended with "chunk_index",
    "total_chunks" and "doc_id". The chunks are then handed to
    ``self._add_documents`` through ``asyncio.to_thread``.

    Nothing is indexed when the service is disabled, when no embedding model
    is available, or when splitting yields no chunks.

    Args:
        doc_id: Document identifier; chunk ids are "<doc_id>_chunk_<i>".
        content: Text to index.
        metadata: Optional metadata copied onto every chunk.
        chunk_size: Chunk size; defaults to DEFAULT_CHUNK_SIZE when None.
        chunk_overlap: Chunk overlap; defaults to DEFAULT_CHUNK_OVERLAP when None.
    """
    import asyncio

    if self._disabled:
        logger.info(f"[RAG DISABLED] 文档索引操作已跳过: {doc_id}")
        return

    if not self._initialized:
        self._init_vector_store()

    if self.embedding_model is None:
        logger.debug(f"文档跳过索引 (无嵌入模型): {doc_id}")
        return

    # Resolve the chunking parameters, then split.
    size = self.DEFAULT_CHUNK_SIZE if chunk_size is None else chunk_size
    overlap = self.DEFAULT_CHUNK_OVERLAP if chunk_overlap is None else chunk_overlap
    chunks = self._split_into_chunks(content, size, overlap)
    if not chunks:
        logger.warning(f"文档内容为空,跳过索引: {doc_id}")
        return

    # One document object and one id per chunk.
    total = len(chunks)
    documents = [
        SimpleDocument(
            page_content=piece,
            metadata={
                **(metadata or {}),
                "chunk_index": idx,
                "total_chunks": total,
                "doc_id": doc_id,
            },
        )
        for idx, piece in enumerate(chunks)
    ]
    chunk_ids = [f"{doc_id}_chunk_{idx}" for idx in range(total)]

    # Run the add step in a worker thread.
    await asyncio.to_thread(self._add_documents, documents, chunk_ids)
    logger.info(f"已异步索引文档 {doc_id},共 {len(chunks)} 个块")
def _add_documents(self, documents: List[SimpleDocument], doc_ids: List[str]): def _add_documents(self, documents: List[SimpleDocument], doc_ids: List[str]):
"""批量添加文档到向量索引""" """批量添加文档到向量索引"""
if not documents: if not documents:

View File

@@ -300,13 +300,15 @@ class TableRAGService:
filename: str, filename: str,
sheet_name: Optional[str] = None, sheet_name: Optional[str] = None,
header_row: int = 0, header_row: int = 0,
sample_size: int = 10 sample_size: int = 10,
skip_rag_index: bool = False
) -> Dict[str, Any]: ) -> Dict[str, Any]:
""" """
为 Excel 表构建完整的 RAG 索引 为 Excel 表构建完整的 RAG 索引
流程: 流程:
1. 读取 Excel 获取字段信息 1. 读取 Excel 获取字段信息
2. 如果 skip_rag_index=True跳过 RAG 索引,直接存 MySQL
2. AI 生成每个字段的语义描述 2. AI 生成每个字段的语义描述
3. 将字段描述存入向量数据库 3. 将字段描述存入向量数据库
@@ -367,6 +369,20 @@ class TableRAGService:
results["field_count"] = len(df.columns) results["field_count"] = len(df.columns)
logger.info(f"表名: {table_name}, 字段数: {len(df.columns)}") logger.info(f"表名: {table_name}, 字段数: {len(df.columns)}")
# 跳过 RAG 索引时直接存 MySQL
if skip_rag_index:
logger.info(f"跳过 RAG 索引,直接存储到 MySQL")
store_result = await self.excel_storage.store_excel(
file_path=file_path,
filename=filename,
sheet_name=sheet_name,
header_row=header_row
)
results["mysql_table"] = store_result.get("table_name") if store_result.get("success") else None
results["row_count"] = store_result.get("row_count", len(df))
results["indexed_count"] = 0
return results
# 3. 初始化 RAG (如果需要) # 3. 初始化 RAG (如果需要)
if not self.rag._initialized: if not self.rag._initialized:
self.rag._init_vector_store() self.rag._init_vector_store()

View File

@@ -5,6 +5,7 @@
""" """
import asyncio import asyncio
import logging import logging
import re
from dataclasses import dataclass, field from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional from typing import Any, Dict, List, Optional
@@ -13,6 +14,7 @@ from app.services.llm_service import llm_service
from app.core.document_parser import ParserFactory from app.core.document_parser import ParserFactory
from app.services.markdown_ai_service import markdown_ai_service from app.services.markdown_ai_service import markdown_ai_service
from app.services.rag_service import rag_service from app.services.rag_service import rag_service
from app.services.excel_storage_service import excel_storage_service
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -105,12 +107,60 @@ class TemplateFillService:
# 3. 检查是否需要使用源文档重新生成表头 # 3. 检查是否需要使用源文档重新生成表头
# 条件:源文档已加载 AND 现有字段看起来是自动生成的(如"字段1"、"字段2" # 条件:源文档已加载 AND 现有字段看起来是自动生成的(如"字段1"、"字段2"
# 注意Word 模板docx不自动重新生成表头因为 Word 模板的表结构由用户定义,必须保留
needs_regenerate_headers = ( needs_regenerate_headers = (
template_file_type != "docx" and
len(source_docs) > 0 and len(source_docs) > 0 and
len(template_fields) > 0 and len(template_fields) > 0 and
all(self._is_auto_generated_field(f.name) for f in template_fields) all(self._is_auto_generated_field(f.name) for f in template_fields)
) )
# 4. Word 模板特殊处理:表头为空时,从源文档生成字段
# 仅当有源文档、模板字段为空、模板文件类型为 docx 时触发
if not needs_regenerate_headers and template_file_type == "docx" and len(source_docs) > 0 and len(template_fields) == 0:
logger.info(f"Word 模板表头为空,从源文档生成字段... (source_docs={len(source_docs)})")
source_contents = []
for doc in source_docs:
structured = doc.structured_data if doc.structured_data else {}
titles = structured.get("titles", [])
tables = structured.get("tables", [])
tables_count = len(tables) if tables else 0
tables_summary = ""
if tables:
tables_summary = "\n【文档中的表格】:\n"
for idx, table in enumerate(tables[:5]):
if isinstance(table, dict):
headers = table.get("headers", [])
rows = table.get("rows", [])
if headers:
tables_summary += f"表格{idx+1}表头: {', '.join(str(h) for h in headers)}\n"
if rows:
tables_summary += f"表格{idx+1}前3行: "
for row_idx, row in enumerate(rows[:3]):
if isinstance(row, list):
tables_summary += " | ".join(str(c) for c in row) + "; "
elif isinstance(row, dict):
tables_summary += " | ".join(str(row.get(h, "")) for h in headers if headers) + "; "
tables_summary += "\n"
source_contents.append({
"filename": doc.filename,
"doc_type": doc.doc_type,
"content": doc.content[:5000] if doc.content else "",
"titles": titles[:10] if titles else [],
"tables_count": tables_count,
"tables_summary": tables_summary
})
if template_id:
generated_fields = await self.get_template_fields_from_file(
template_id,
template_file_type,
source_contents=source_contents,
source_docs=source_docs
)
if generated_fields:
template_fields = generated_fields
logger.info(f"Word 模板字段生成成功: {[f.name for f in template_fields]}")
if needs_regenerate_headers: if needs_regenerate_headers:
logger.info(f"检测到自动生成表头,尝试使用源文档重新生成... (当前字段: {[f.name for f in template_fields]})") logger.info(f"检测到自动生成表头,尝试使用源文档重新生成... (当前字段: {[f.name for f in template_fields]})")
@@ -162,7 +212,8 @@ class TemplateFillService:
new_fields = await self.get_template_fields_from_file( new_fields = await self.get_template_fields_from_file(
template_id, template_id,
template_file_type, template_file_type,
source_contents=source_contents source_contents=source_contents,
source_docs=source_docs
) )
if new_fields and len(new_fields) > 0: if new_fields and len(new_fields) > 0:
logger.info(f"成功重新生成表头: {[f.name for f in new_fields]}") logger.info(f"成功重新生成表头: {[f.name for f in new_fields]}")
@@ -224,14 +275,357 @@ class TemplateFillService:
max_rows = max(len(v) for v in filled_data.values()) if filled_data else 1 max_rows = max(len(v) for v in filled_data.values()) if filled_data else 1
logger.info(f"填表完成: {len(filled_data)} 个字段, 最大行数: {max_rows}") logger.info(f"填表完成: {len(filled_data)} 个字段, 最大行数: {max_rows}")
# 如果是 Word 模板,将数据填入模板文件
filled_file_path = None
if template_file_type == "docx" and template_id and filled_data:
filled_file_path = await self._fill_docx(template_id, filled_data)
if filled_file_path:
logger.info(f"Word 模板已填写,输出文件: {filled_file_path}")
return { return {
"success": True, "success": True,
"filled_data": filled_data, "filled_data": filled_data,
"fill_details": fill_details, "fill_details": fill_details,
"source_doc_count": len(source_docs), "source_doc_count": len(source_docs),
"max_rows": max_rows "max_rows": max_rows,
"filled_file_path": filled_file_path
} }
async def _polish_word_filled_data(
    self,
    filled_data: Dict[str, Any]
) -> Dict[str, str]:
    """Summarize extracted values per field and have the LLM polish them.

    Step 1 builds a textual summary per field: multi-row numeric data is
    reduced to sum / average / max / min, single values are described
    according to the field-name category (ratio, amount, count), and
    everything else is de-duplicated. Step 2 asks the LLM to turn that
    summary into one short description per field, returned as JSON.

    Args:
        filled_data: {field name: [raw values]}.

    Returns:
        {field name: polished text}. If nothing can be summarized, the LLM
        output is not a JSON object, or any error occurs, the result falls
        back to the first five raw values of each field joined with ', '.
    """
    if not filled_data:
        return {}

    def raw_fallback() -> Dict[str, str]:
        """Plain join of the first raw values, used when polishing is not possible."""
        return {
            k: (', '.join(str(v) for v in vals[:5]) if isinstance(vals, list) else str(vals))
            for k, vals in filled_data.items()
        }

    def is_present(v: Any) -> bool:
        """True for usable values; numeric zero counts as a value, not as missing."""
        return bool(v) or (isinstance(v, (int, float)) and not isinstance(v, bool))

    try:
        import json

        # Step 1: statistics / summary per field.
        data_summary = []
        for field_name, values in filled_data.items():
            if not isinstance(values, list) or not values:
                continue

            # Discard empty values and extraction-failure markers.
            raw_values = [
                str(v).strip()
                for v in values
                if is_present(v) and str(v).strip() and not str(v).startswith('[提取失败')
            ]
            if not raw_values:
                continue

            # Parse numbers out of values such as "123个", "78.5%", "1,234".
            numeric_values = []
            for v in raw_values:
                num_str = re.sub(r'[^\d.\-]', '', v)
                try:
                    if num_str and num_str != '-' and num_str != '.':
                        numeric_values.append(float(num_str))
                except ValueError:
                    pass

            # Classify the field by keywords in its name. An empty keyword
            # must never appear in these lists: it would match every name.
            field_lower = field_name.lower()
            is_count_field = any(kw in field_lower for kw in ['数量', '总数', '次数', '条数', '订单数', '记录数', '条目'])
            is_amount_field = any(kw in field_lower for kw in ['金额', '总额', '合计', '总计', '销售额', '收入', '支出', '成本'])
            is_ratio_field = any(kw in field_lower for kw in ['比率', '比例', '占比', '率', '使用率', '增长', '增幅'])
            is_name_field = any(kw in field_lower for kw in ['名称', '机构', '医院', '公司', '单位', '部门', '区域', '类别'])

            tag = f"【{field_name}】"
            # De-duplicate while keeping first-seen order so the prompt is stable.
            unique_values = list(dict.fromkeys(raw_values))

            if len(numeric_values) >= 2 and len(numeric_values) == len(raw_values):
                # Multi-row, fully numeric data: aggregate.
                total = sum(numeric_values)
                avg = total / len(numeric_values)
                max_val = max(numeric_values)
                min_val = min(numeric_values)

                stats_lines = [
                    f"{tag}(共 {len(raw_values)} 条数据):",
                    f" - 合计: {self._format_number(total)}" if is_amount_field else f" - 合计: {total:.2f}",
                    f" - 平均: {avg:.2f}",
                    f" - 最大: {max_val:.2f}",
                    f" - 最小: {min_val:.2f}",
                ]

                # For name-like fields also report the distinct values.
                if is_name_field:
                    if len(unique_values) <= 10:
                        stats_lines.append(f" - 涉及类别(共 {len(unique_values)} 种): {'、'.join(unique_values[:8])}")
                    else:
                        stats_lines.append(f" - 涉及 {len(unique_values)} 个不同类别")

                # First five raw values as examples.
                stats_lines.append(f" - 示例值: {'、'.join(raw_values[:5])}")
                data_summary.append('\n'.join(stats_lines))

            elif is_ratio_field and len(numeric_values) == 1:
                # Single percentage value.
                pct = numeric_values[0]
                data_summary.append(f"{tag}: {pct:.1f}%,表示相关指标的相对水平")

            elif is_amount_field and len(numeric_values) >= 1:
                # Amounts.
                total = sum(numeric_values)
                unit = ""
                if total >= 10000:
                    unit = f"(约 {total/10000:.2f} 万元)"
                elif total >= 1:
                    unit = f"(约 {total:.2f} 元)"
                data_summary.append(f"{tag}: 合计 {self._format_number(total)}{unit},基于 {len(raw_values)} 条记录汇总")

            elif is_count_field and len(numeric_values) >= 1:
                # Counts.
                total = sum(numeric_values)
                data_summary.append(f"{tag}: 共 {self._format_number(total)},基于 {len(raw_values)} 条记录汇总")

            elif len(unique_values) <= 8:
                # Few distinct values: list them all.
                data_summary.append(f"{tag}(共 {len(raw_values)} 条,去重后 {len(unique_values)} 项): {'、'.join(unique_values[:8])}")

            else:
                # Many distinct values: show the first five only.
                data_summary.append(f"{tag}(共 {len(raw_values)} 条记录): {'、'.join(raw_values[:5])}")

        if not data_summary:
            return raw_fallback()

        # Step 2: let the LLM rewrite the summary as natural-language text.
        summary_text = "\n".join(data_summary)
        prompt = f"""你是一个专业的数据分析报告助手。请根据以下从文档中提取并统计的数据,生成专业、简洁的自然语言描述。
【数据统计结果】:
{summary_text}
【润色要求】:
1. 每个字段生成一段专业的描述性文本20-60字
2. 数值类字段要明确标注单位和含义,如"销售总额达1,234.5万元共涵盖56个订单"
3. 分类/名称类字段要归纳总结类别,如"涉及医疗器械、药品采购、设备维修等5个业务类别"
4. 多值数据不要简单罗列,要做总结,如"覆盖华东地区上海、江苏、浙江、华南地区广东等6个省市的销售网络"
5. 百分比/比率类要加背景说明,如"综合毛利率为23.5%,处于行业正常水平"
6. 保持文本通顺、专业,符合正式报告风格
7. 每段控制在60字以内
【输出格式】严格按JSON格式只返回JSON不要任何其他内容
{{
"字段名1": "润色后的描述文本1",
"字段名2": "润色后的描述文本2"
}}
"""
        messages = [
            {"role": "system", "content": "你是一个专业的数据分析报告助手。请严格按JSON格式输出只返回纯JSON不要任何其他内容。"},
            {"role": "user", "content": prompt}
        ]

        response = await self.llm.chat(
            messages=messages,
            temperature=0.3,
            max_tokens=3000
        )
        content = self.llm.extract_message_content(response) or ""
        logger.info(f"LLM 润色 Word 数据返回: {content[:500]}")

        # Pull the outermost {...} span out of the reply and parse it.
        json_match = re.search(r'\{[\s\S]*\}', content)
        polished = json.loads(json_match.group()) if json_match else None
        if isinstance(polished, dict):
            logger.info(f"LLM 润色成功: {len(polished)} 个字段")
            # Callers slice and regex-clean the values, so hand back strings only.
            return {
                str(k): ("" if v is None else v if isinstance(v, str) else str(v))
                for k, v in polished.items()
            }

        logger.warning(f"LLM 返回无法解析为 JSON: {content[:200]}")
        return raw_fallback()

    except Exception as e:
        # Polishing is best-effort: any failure degrades to the raw values.
        logger.error(f"LLM 润色失败: {str(e)}")
        return raw_fallback()
def _format_number(self, num: float) -> str:
"""格式化数字,添加千分位"""
if abs(num) >= 10000:
return f"{num:,.2f}"
elif abs(num) >= 1:
return f"{num:,.2f}"
else:
return f"{num:.4f}"
async def _fill_docx(
self,
template_path: str,
filled_data: Dict[str, Any]
) -> Optional[str]:
"""
将提取的数据填入 Word 模板
Args:
template_path: Word 模板文件路径
filled_data: 字段值字典 {field_name: [values]}
Returns:
填写后的文件路径,失败返回 None
"""
import re
import os
import tempfile
import shutil
from docx import Document
from docx.shared import RGBColor
def clean_text(text: str) -> str:
"""清理文本,移除非法字符"""
if not text:
return ""
# 移除控制字符
text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)
# 移除 Word 中常见的非法替代字符(显示为方框)
text = re.sub(r'[\ufffd\u25a1\u25a9\u2610\u2611\u25cb\u25c9]', '', text)
# 移除其他无效 Unicode 字符
text = re.sub(r'[\ufeff\u200b-\u200f\u2028-\u202e]', '', text)
return text.strip()
def set_cell_text(cell, text: str):
"""设置单元格文本(保留原有格式)"""
cell.text = text
# 确保文本颜色为黑色
for para in cell.paragraphs:
for run in para.runs:
run.font.color.rgb = RGBColor(0, 0, 0)
try:
# 先对数据进行 LLM 润色(非结构化文本补充和润色)
logger.info(f"Word 填写前开始 LLM 润色 {len(filled_data)} 个字段...")
polished_data = await self._polish_word_filled_data(filled_data)
logger.info(f"LLM 润色完成,使用润色后文本写入 Word")
# 创建临时目录存放修改后的文件
temp_dir = tempfile.mkdtemp()
output_path = os.path.join(temp_dir, "filled_template.docx")
# 复制模板到临时文件
shutil.copy2(template_path, output_path)
# 打开复制的模板
doc = Document(output_path)
matched_fields = set()
# 遍历表格,找到字段名所在的行,填写对应值
for table in doc.tables:
for row in table.rows:
cells = row.cells
if not cells:
continue
first_cell_text = cells[0].text.strip()
if not first_cell_text:
continue
# 精确匹配字段名
if first_cell_text in polished_data:
display_text = polished_data[first_cell_text]
if display_text:
if len(cells) > 1:
set_cell_text(cells[1], clean_text(display_text))
matched_fields.add(first_cell_text)
logger.info(f"Word 填写(精确): {first_cell_text} = {display_text[:50] if display_text else ''}")
continue
# 前缀/后缀匹配
for field_name, display_text in polished_data.items():
if field_name and first_cell_text and (
field_name.startswith(first_cell_text) or first_cell_text.startswith(field_name)
):
if display_text:
if len(cells) > 1:
set_cell_text(cells[1], clean_text(display_text))
matched_fields.add(field_name)
logger.info(f"Word 填写(模糊): {first_cell_text}{field_name} = {display_text[:50] if display_text else ''}")
break
# 如果有未匹配的字段(模板第一列为空),使用段落格式写入(带分隔线,更清晰)
unmatched_fields = [f for f in polished_data if f not in matched_fields]
if unmatched_fields:
logger.info(f"使用段落格式写入 {len(unmatched_fields)} 个字段(带分隔线)")
from docx.oxml.ns import qn
from docx.oxml import OxmlElement
from docx.shared import Pt, RGBColor
def add_horizontal_separator(doc, before_para=None):
    """Insert an empty paragraph whose bottom border renders as a horizontal rule.

    Args:
        doc: The python-docx document being edited.
        before_para: Paragraph in front of which the rule is inserted. When
            ``None`` the rule is placed at the end of the document body.
    """
    # Build <w:p><w:pPr><w:pBdr><w:bottom .../></w:pBdr></w:pPr></w:p> by hand.
    sep_para = OxmlElement('w:p')
    pPr = OxmlElement('w:pPr')
    pBdr = OxmlElement('w:pBdr')
    bottom = OxmlElement('w:bottom')
    bottom.set(qn('w:val'), 'single')
    bottom.set(qn('w:sz'), '6')
    bottom.set(qn('w:space'), '1')
    bottom.set(qn('w:color'), 'CCCCCC')
    pBdr.append(bottom)
    pPr.append(pBdr)
    sep_para.append(pPr)

    if before_para is not None:
        before_para._element.addprevious(sep_para)
        return

    # ``doc._body`` is a python-docx proxy object without an ``append`` method,
    # so work on the underlying <w:body> XML element instead.
    body = doc.element.body
    sect_pr = body.find(qn('w:sectPr'))
    if sect_pr is not None:
        # Keep the section properties as the last child of the body.
        sect_pr.addprevious(sep_para)
    else:
        body.append(sep_para)
def add_field_section(doc, field_name: str, display_text: str):
    """Append one field section: bold field name, value paragraph, then a separator rule.

    Args:
        doc: The python-docx document being edited.
        field_name: Field label, rendered in bold with a leading pin emoji.
        display_text: Field value, rendered in its own paragraph.
    """
    from docx.shared import Pt, RGBColor

    # Field-name paragraph (bold).
    name_para = doc.add_paragraph()
    name_run = name_para.add_run(f"📌 {field_name}")
    name_run.bold = True
    name_run.font.size = Pt(11)
    name_run.font.color.rgb = RGBColor(0, 51, 102)
    name_para.paragraph_format.space_before = Pt(12)
    name_para.paragraph_format.space_after = Pt(3)

    # Value paragraph.
    value_para = doc.add_paragraph()
    value_run = value_para.add_run(display_text)
    value_run.font.size = Pt(10.5)
    value_run.font.color.rgb = RGBColor(51, 51, 51)
    value_para.paragraph_format.space_before = Pt(0)
    value_para.paragraph_format.space_after = Pt(6)

    # Separator rule. add_horizontal_separator inserts the rule *before* the
    # paragraph it is given, which would leave it between the name and the
    # value; move the freshly inserted element so it follows the value.
    add_horizontal_separator(doc, value_para)
    separator = value_para._element.getprevious()
    if separator is not None and separator is not name_para._element:
        # lxml's addnext() relocates an element that is already in the tree.
        value_para._element.addnext(separator)
# 在文档末尾添加各字段段落
for field_name in unmatched_fields:
display_text = polished_data[field_name]
if display_text:
add_field_section(doc, field_name, clean_text(display_text))
logger.info(f"Word 段落写入: {field_name} = {display_text[:60]}")
# 保存修改后的文档
doc.save(output_path)
logger.info(f"Word 模板填写完成: {output_path}, 匹配字段: {len(matched_fields)}, 追加字段: {len(unmatched_fields)}")
return output_path
except Exception as e:
logger.error(f"Word 模板填写失败: {str(e)}")
return None
async def _load_source_documents( async def _load_source_documents(
self, self,
source_doc_ids: Optional[List[str]] = None, source_doc_ids: Optional[List[str]] = None,
@@ -257,10 +651,38 @@ class TemplateFillService:
if doc: if doc:
sd = doc.get("structured_data", {}) sd = doc.get("structured_data", {})
sd_keys = list(sd.keys()) if sd else [] sd_keys = list(sd.keys()) if sd else []
logger.info(f"从MongoDB加载文档: {doc_id}, doc_type={doc.get('doc_type')}, structured_data keys={sd_keys}") doc_type = doc.get("doc_type", "")
mysql_table_name = doc.get("metadata", {}).get("mysql_table_name")
logger.info(f"从MongoDB加载文档: {doc_id}, doc_type={doc_type}, structured_data keys={sd_keys}, mysql_table={mysql_table_name}")
# 如果 structured_data 为空,但有 file_path尝试重新解析文件
doc_content = doc.get("content", "") doc_content = doc.get("content", "")
# 如果是 Excel 类型且有 MySQL 表名,直接从 MySQL 加载数据
if doc_type in ["xlsx", "xls"] and mysql_table_name:
try:
logger.info(f" 从 MySQL 表 {mysql_table_name} 加载 Excel 数据")
mysql_data = await excel_storage_service.query_table(mysql_table_name, limit=1000)
if mysql_data:
# 转换为 SourceDocument 格式
if mysql_data and len(mysql_data) > 0:
columns = list(mysql_data[0].keys()) if mysql_data else []
rows = [[row.get(col) for col in columns] for row in mysql_data]
sd = {
"headers": columns,
"rows": rows,
"row_count": len(mysql_data),
"column_count": len(columns),
"source": "mysql"
}
logger.info(f" MySQL 数据加载成功: {len(mysql_data)} 行, {len(columns)}")
else:
logger.warning(f" MySQL 表 {mysql_table_name} 无数据")
else:
logger.warning(f" MySQL 表 {mysql_table_name} 查询无结果")
except Exception as mysql_err:
logger.error(f" MySQL 加载失败: {str(mysql_err)}")
# 如果 structured_data 仍然为空,尝试重新解析文件
if not sd or (not sd.get("tables") and not sd.get("headers") and not sd.get("rows")): if not sd or (not sd.get("tables") and not sd.get("headers") and not sd.get("rows")):
file_path = doc.get("metadata", {}).get("file_path") file_path = doc.get("metadata", {}).get("file_path")
if file_path: if file_path:
@@ -294,7 +716,7 @@ class TemplateFillService:
source_docs.append(SourceDocument( source_docs.append(SourceDocument(
doc_id=doc_id, doc_id=doc_id,
filename=doc.get("metadata", {}).get("original_filename", "unknown"), filename=doc.get("metadata", {}).get("original_filename", "unknown"),
doc_type=doc.get("doc_type", "unknown"), doc_type=doc_type,
content=doc_content, content=doc_content,
structured_data=sd structured_data=sd
)) ))
@@ -1047,7 +1469,8 @@ class TemplateFillService:
self, self,
file_path: str, file_path: str,
file_type: str = "xlsx", file_type: str = "xlsx",
source_contents: List[dict] = None source_contents: List[dict] = None,
source_docs: List["SourceDocument"] = None
) -> List[TemplateField]: ) -> List[TemplateField]:
""" """
从模板文件提取字段定义 从模板文件提取字段定义
@@ -1071,15 +1494,18 @@ class TemplateFillService:
fields = await self._get_template_fields_from_docx(file_path) fields = await self._get_template_fields_from_docx(file_path)
# 检查是否需要 AI 生成表头 # 检查是否需要 AI 生成表头
# 条件:没有字段 OR 所有字段都是自动命名的(如"字段1"、"列1"、"Unnamed"开头) # 条件:没有字段 OR 所有字段都是自动命名的
# 对于 docx仅当有源文档时才允许 AI 生成(避免覆盖用户定义的表头)
needs_ai_generation = ( needs_ai_generation = (
len(fields) == 0 or (len(fields) == 0 or
all(self._is_auto_generated_field(f.name) for f in fields) all(self._is_auto_generated_field(f.name) for f in fields))
) and (
file_type != "docx" or len(source_contents) > 0
) )
if needs_ai_generation: if needs_ai_generation:
logger.info(f"模板表头为空或自动生成,尝试 AI 生成表头... (fields={len(fields)}, source_docs={len(source_contents)})") logger.info(f"模板表头为空或自动生成,尝试 AI 生成表头... (fields={len(fields)}, source_docs={len(source_contents)})")
ai_fields = await self._generate_fields_with_ai(file_path, file_type, source_contents) ai_fields = await self._generate_fields_with_ai(file_path, file_type, source_contents, source_docs)
if ai_fields: if ai_fields:
fields = ai_fields fields = ai_fields
logger.info(f"AI 生成表头成功: {len(fields)} 个字段") logger.info(f"AI 生成表头成功: {len(fields)} 个字段")
@@ -2134,7 +2560,8 @@ class TemplateFillService:
self, self,
file_path: str, file_path: str,
file_type: str, file_type: str,
source_contents: List[dict] = None source_contents: List[dict] = None,
source_docs: List["SourceDocument"] = None
) -> Optional[List[TemplateField]]: ) -> Optional[List[TemplateField]]:
""" """
使用 AI 为空表生成表头字段 使用 AI 为空表生成表头字段
@@ -2148,6 +2575,8 @@ class TemplateFillService:
Returns: Returns:
生成的字段列表,如果失败返回 None 生成的字段列表,如果失败返回 None
""" """
import random
try: try:
import pandas as pd import pandas as pd
@@ -2182,24 +2611,21 @@ class TemplateFillService:
else: else:
content_sample = "" content_sample = ""
# 调用 AI 生成表头 # 优先从源文档的表格表头中随机选取
# 根据源文档内容生成表头
source_info = ""
logger.info(f"[DEBUG] _generate_fields_with_ai received source_contents: {len(source_contents) if source_contents else 0} items")
if source_contents: if source_contents:
for sc in source_contents: import re
logger.info(f"[DEBUG] source doc: filename={sc.get('filename')}, content_len={len(sc.get('content', ''))}, titles={len(sc.get('titles', []))}, tables_count={sc.get('tables_count', 0)}, has_tables_summary={bool(sc.get('tables_summary'))}") all_headers = []
source_info = "\n\n【源文档内容摘要】(根据以下文档内容生成表头):\n" source_info = ""
for idx, src in enumerate(source_contents[:5]): # 最多5个源文档 for idx, src in enumerate(source_contents[:5]): # 最多5个源文档
filename = src.get("filename", f"文档{idx+1}") filename = src.get("filename", f"文档{idx+1}")
doc_type = src.get("doc_type", "unknown") doc_type = src.get("doc_type", "unknown")
content = src.get("content", "")[:3000] # 限制内容长度 content = src.get("content", "")[:3000]
titles = src.get("titles", [])[:10] # 最多10个标题 titles = src.get("titles", [])[:10]
tables_count = src.get("tables_count", 0) tables_count = src.get("tables_count", 0)
tables_summary = src.get("tables_summary", "") tables_summary = src.get("tables_summary", "")
source_info += f"\n--- 文档 {idx+1}: {filename} ({doc_type}) ---\n" source_info += f"\n--- 文档 {idx+1}: {filename} ({doc_type}) ---\n"
# 处理 titles可能是字符串列表或字典列表
if titles: if titles:
title_texts = [] title_texts = []
for t in titles[:5]: for t in titles[:5]:
@@ -2216,6 +2642,72 @@ class TemplateFillService:
if content: if content:
source_info += f"【文档内容】前3000字符{content[:3000]}\n" source_info += f"【文档内容】前3000字符{content[:3000]}\n"
# 从 tables_summary 中提取表头
# 表格摘要格式如: "表格1表头: 姓名, 年龄, 性别"
if tables_summary:
header_matches = re.findall(r'表头:\s*([^\n]+)', tables_summary)
for match in header_matches:
# 分割表头字符串
headers = [h.strip() for h in match.split(',') if h.strip()]
all_headers.extend(headers)
logger.info(f"从表格摘要提取到表头: {headers}")
# 从源文档的 structured_data 中直接提取表头Excel 等数据源)
for doc in source_docs:
if doc.structured_data:
sd = doc.structured_data
# Excel 格式: {columns: [...], rows: [...]}
if sd.get("columns"):
cols = sd.get("columns", [])
if isinstance(cols, list) and cols:
all_headers.extend([str(c) for c in cols if str(c).strip()])
logger.info(f"从 structured_data.columns 提取到表头: {cols}")
# 多 sheet 格式: {sheets: {sheet_name: {columns, rows}}}
if sd.get("sheets"):
for sheet_name, sheet_data in sd.get("sheets", {}).items():
if isinstance(sheet_data, dict) and sheet_data.get("columns"):
cols = sheet_data.get("columns", [])
if isinstance(cols, list) and cols:
all_headers.extend([str(c) for c in cols if str(c).strip()])
logger.info(f"从 sheets.{sheet_name} 提取到表头: {cols}")
# Markdown/表格格式: {tables: [{headers, rows}]}
if sd.get("tables") and isinstance(sd.get("tables"), list):
for table in sd.get("tables", []):
if isinstance(table, dict) and table.get("headers"):
headers = table.get("headers", [])
if isinstance(headers, list) and headers:
all_headers.extend([str(h) for h in headers if str(h).strip()])
logger.info(f"从 tables 提取到表头: {headers}")
# 另一种格式: {headers, rows}
if sd.get("headers") and sd.get("rows"):
headers = sd.get("headers", [])
if isinstance(headers, list) and headers:
all_headers.extend([str(h) for h in headers if str(h).strip()])
logger.info(f"从 headers/rows 提取到表头: {headers}")
# 如果从表格摘要中获取到了表头,随机选取一部分
if all_headers:
logger.info(f"共有 {len(all_headers)} 个表头可用")
# 随机选取 5-7 个表头
num_fields = min(random.randint(5, 7), len(all_headers))
selected_headers = random.sample(all_headers, num_fields)
logger.info(f"随机选取的表头: {selected_headers}")
fields = []
for idx, header in enumerate(selected_headers):
fields.append(TemplateField(
cell=self._column_to_cell(idx),
name=header,
field_type="text",
required=False,
hint=""
))
return fields
else:
source_info = ""
# 如果无法从表格表头获取,才调用 AI 生成
prompt = f"""你是一个专业的数据分析助手。请分析源文档中的所有数据,生成表格表头字段。 prompt = f"""你是一个专业的数据分析助手。请分析源文档中的所有数据,生成表格表头字段。
任务:分析源文档,找出所有具体的数据指标及其分类。 任务:分析源文档,找出所有具体的数据指标及其分类。

View File

@@ -39,6 +39,8 @@ openpyxl==3.1.2
python-docx==0.8.11 python-docx==0.8.11
markdown-it-py==3.0.0 markdown-it-py==3.0.0
chardet==5.2.0 chardet==5.2.0
Pillow>=10.0.0
pytesseract>=0.3.10
# ==================== AI / LLM ==================== # ==================== AI / LLM ====================
httpx==0.25.2 httpx==0.25.2

View File

@@ -781,7 +781,8 @@ export const backendApi = {
async exportFilledTemplate( async exportFilledTemplate(
templateId: string, templateId: string,
filledData: Record<string, any>, filledData: Record<string, any>,
format: 'xlsx' | 'docx' = 'xlsx' format: 'xlsx' | 'docx' = 'xlsx',
filledFilePath?: string
): Promise<Blob> { ): Promise<Blob> {
const url = `${BACKEND_BASE_URL}/templates/export`; const url = `${BACKEND_BASE_URL}/templates/export`;
@@ -793,6 +794,7 @@ export const backendApi = {
template_id: templateId, template_id: templateId,
filled_data: filledData, filled_data: filledData,
format, format,
...(filledFilePath && { filled_file_path: filledFilePath }),
}), }),
}); });
@@ -964,6 +966,101 @@ export const backendApi = {
throw error; throw error;
} }
}, },
// ==================== 智能指令 API ====================
/**
 * Smart chat: execute an instruction with multi-turn conversation support.
 *
 * Sends `POST /instruction/chat` with `{ instruction, doc_ids, context }`.
 * Rejects with an Error when the response status is not OK; the failure is
 * logged before being re-thrown.
 */
async instructionChat(
  instruction: string,
  docIds?: string[],
  context?: Record<string, any>
): Promise<{
  success: boolean;
  intent: string;
  result: Record<string, any>;
  message: string;
  hint?: string;
}> {
  const url = `${BACKEND_BASE_URL}/instruction/chat`;

  try {
    const response = await fetch(url, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ instruction, doc_ids: docIds, context }),
    });

    if (!response.ok) {
      // The error body is not guaranteed to be JSON (e.g. a gateway error
      // page); fall back to the default message instead of a parse error.
      const error = await response.json().catch(() => null);
      throw new Error(error?.detail || '对话处理失败');
    }

    return await response.json();
  } catch (error) {
    console.error('对话处理失败:', error);
    throw error;
  }
},
/**
 * Fetch the list of supported instruction types (`GET /instruction/intents`).
 *
 * Rejects with an Error when the response status is not OK; the failure is
 * logged before being re-thrown.
 */
async getSupportedIntents(): Promise<{
  intents: Array<{
    intent: string;
    name: string;
    examples: string[];
    params: string[];
  }>;
}> {
  const endpoint = `${BACKEND_BASE_URL}/instruction/intents`;

  try {
    const res = await fetch(endpoint);
    if (res.ok) {
      return await res.json();
    }
    throw new Error('获取指令列表失败');
  } catch (err) {
    console.error('获取指令列表失败:', err);
    throw err;
  }
},
/**
 * Execute an instruction (synchronous mode).
 *
 * Sends `POST /instruction/execute` with `{ instruction, doc_ids, context }`.
 * Rejects with an Error when the response status is not OK; the failure is
 * logged before being re-thrown.
 */
async executeInstruction(
  instruction: string,
  docIds?: string[],
  context?: Record<string, any>
): Promise<{
  success: boolean;
  intent: string;
  result: Record<string, any>;
  message: string;
}> {
  const url = `${BACKEND_BASE_URL}/instruction/execute`;

  try {
    const response = await fetch(url, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ instruction, doc_ids: docIds, context }),
    });

    if (!response.ok) {
      // The error body is not guaranteed to be JSON (e.g. a gateway error
      // page); fall back to the default message instead of a parse error.
      const error = await response.json().catch(() => null);
      throw new Error(error?.detail || '指令执行失败');
    }

    return await response.json();
  } catch (error) {
    console.error('指令执行失败:', error);
    throw error;
  }
},
}; };
// ==================== AI 分析 API ==================== // ==================== AI 分析 API ====================
@@ -1529,61 +1626,66 @@ export const aiApi = {
} }
}, },
// ==================== 对话历史 API ====================
/** /**
* 智能对话(支持多轮对话的指令执行) * 获取对话历史
*/ */
async instructionChat( async getConversationHistory(conversationId: string, limit: number = 20): Promise<{
instruction: string,
docIds?: string[],
context?: Record<string, any>
): Promise<{
success: boolean; success: boolean;
intent: string; messages: Array<{
result: Record<string, any>; role: string;
message: string; content: string;
hint?: string; intent?: string;
}> { created_at: string;
const url = `${BACKEND_BASE_URL}/instruction/chat`;
try {
const response = await fetch(url, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ instruction, doc_ids: docIds, context }),
});
if (!response.ok) {
const error = await response.json();
throw new Error(error.detail || '对话处理失败');
}
return await response.json();
} catch (error) {
console.error('对话处理失败:', error);
throw error;
}
},
/**
* 获取支持的指令类型列表
*/
async getSupportedIntents(): Promise<{
intents: Array<{
intent: string;
name: string;
examples: string[];
params: string[];
}>; }>;
}> { }> {
const url = `${BACKEND_BASE_URL}/instruction/intents`; const url = `${BACKEND_BASE_URL}/conversation/${conversationId}/history?limit=${limit}`;
try { try {
const response = await fetch(url); const response = await fetch(url);
if (!response.ok) throw new Error('获取指令列表失败'); if (!response.ok) throw new Error('获取对话历史失败');
return await response.json(); return await response.json();
} catch (error) { } catch (error) {
console.error('获取指令列表失败:', error); console.error('获取对话历史失败:', error);
throw error; return { success: false, messages: [] };
} }
}, },
/**
 * Delete the history of one conversation (`DELETE /conversation/{id}`).
 *
 * Never rejects: on any failure the error is logged and
 * `{ success: false }` is returned.
 */
async deleteConversation(conversationId: string): Promise<{
  success: boolean;
}> {
  // Encode the id so characters such as '/', '?' or '#' cannot change the
  // request path; ids made only of unreserved characters are unaffected.
  const url = `${BACKEND_BASE_URL}/conversation/${encodeURIComponent(conversationId)}`;

  try {
    const response = await fetch(url, { method: 'DELETE' });
    if (!response.ok) throw new Error('删除对话历史失败');
    return await response.json();
  } catch (error) {
    console.error('删除对话历史失败:', error);
    return { success: false };
  }
},
/**
 * List stored conversations (`GET /conversation/all?limit=N`).
 *
 * Never rejects: on any failure the error is logged and
 * `{ success: false, conversations: [] }` is returned.
 */
async listConversations(limit: number = 50): Promise<{
  success: boolean;
  conversations: Array<any>;
}> {
  const endpoint = `${BACKEND_BASE_URL}/conversation/all?limit=${limit}`;

  try {
    const res = await fetch(endpoint);
    if (res.ok) {
      return await res.json();
    }
    throw new Error('获取会话列表失败');
  } catch (err) {
    console.error('获取会话列表失败:', err);
    return { success: false, conversations: [] };
  }
}
}; };

View File

@@ -15,12 +15,14 @@ import {
Sparkles, Sparkles,
Database, Database,
FileSpreadsheet, FileSpreadsheet,
RefreshCcw RefreshCcw,
Trash2
} from 'lucide-react'; } from 'lucide-react';
import { backendApi } from '@/db/backend-api'; import { backendApi } from '@/db/backend-api';
import { formatDistanceToNow } from 'date-fns'; import { formatDistanceToNow } from 'date-fns';
import { zhCN } from 'date-fns/locale'; import { zhCN } from 'date-fns/locale';
import { cn } from '@/lib/utils'; import { cn } from '@/lib/utils';
import { toast } from 'sonner';
type DocumentItem = { type DocumentItem = {
doc_id: string; doc_id: string;
@@ -108,7 +110,7 @@ const Dashboard: React.FC = () => {
<div className="grid grid-cols-1 md:grid-cols-3 gap-6"> <div className="grid grid-cols-1 md:grid-cols-3 gap-6">
{[ {[
{ label: '已上传文档', value: stats.docs, icon: FileText, color: 'bg-blue-500', trend: '非结构化文档', link: '/documents' }, { label: '已上传文档', value: stats.docs, icon: FileText, color: 'bg-blue-500', trend: '非结构化文档', link: '/documents' },
{ label: 'Excel 文件', value: stats.excelFiles, icon: FileSpreadsheet, color: 'bg-emerald-500', trend: '结构化数据', link: '/excel-parse' }, { label: 'Excel 文件', value: stats.excelFiles, icon: FileSpreadsheet, color: 'bg-emerald-500', trend: '结构化数据', link: '/documents' },
{ label: '填表任务', value: stats.tasks, icon: TableProperties, color: 'bg-indigo-500', trend: '待实现', link: '/form-fill' } { label: '填表任务', value: stats.tasks, icon: TableProperties, color: 'bg-indigo-500', trend: '待实现', link: '/form-fill' }
].map((stat, i) => ( ].map((stat, i) => (
<Card key={i} className="border-none shadow-md overflow-hidden group hover:shadow-xl transition-all duration-300"> <Card key={i} className="border-none shadow-md overflow-hidden group hover:shadow-xl transition-all duration-300">
@@ -164,9 +166,31 @@ const Dashboard: React.FC = () => {
{doc.doc_type.toUpperCase()} {formatDistanceToNow(new Date(doc.created_at), { addSuffix: true, locale: zhCN })} {doc.doc_type.toUpperCase()} {formatDistanceToNow(new Date(doc.created_at), { addSuffix: true, locale: zhCN })}
</p> </p>
</div> </div>
<div className="flex items-center gap-2">
<div className="px-2 py-1 rounded-full text-[10px] font-bold uppercase tracking-wider bg-muted"> <div className="px-2 py-1 rounded-full text-[10px] font-bold uppercase tracking-wider bg-muted">
{doc.doc_type} {doc.doc_type}
</div> </div>
<Button
variant="ghost"
size="icon"
className="opacity-0 group-hover:opacity-100 text-destructive hover:bg-destructive/10 transition-opacity"
onClick={async (e) => {
e.stopPropagation();
if (!confirm(`确定要删除 "${doc.original_filename}" 吗?`)) return;
try {
const result = await backendApi.deleteDocument(doc.doc_id);
if (result.success) {
setRecentDocs(prev => prev.filter(d => d.doc_id !== doc.doc_id));
toast.success('文档已删除');
}
} catch (err: any) {
toast.error(err.message || '删除失败');
}
}}
>
<Trash2 size={16} />
</Button>
</div>
</div> </div>
))} ))}
</div> </div>
@@ -197,7 +221,7 @@ const Dashboard: React.FC = () => {
<div className="grid grid-cols-1 sm:grid-cols-2 gap-4"> <div className="grid grid-cols-1 sm:grid-cols-2 gap-4">
{[ {[
{ title: '上传文档', desc: '支持 docx/md/txt', icon: FileText, link: '/documents', color: 'bg-blue-500' }, { title: '上传文档', desc: '支持 docx/md/txt', icon: FileText, link: '/documents', color: 'bg-blue-500' },
{ title: '解析 Excel', desc: '上传并分析数据', icon: FileSpreadsheet, link: '/excel-parse', color: 'bg-emerald-500' }, { title: '解析 Excel', desc: '上传并分析数据', icon: FileSpreadsheet, link: '/documents', color: 'bg-emerald-500' },
{ title: '智能填表', desc: '自动填写表格模板', icon: TableProperties, link: '/form-fill', color: 'bg-indigo-500' }, { title: '智能填表', desc: '自动填写表格模板', icon: TableProperties, link: '/form-fill', color: 'bg-indigo-500' },
{ title: 'AI 助手', desc: '自然语言交互', icon: MessageSquareCode, link: '/assistant', color: 'bg-amber-500' } { title: 'AI 助手', desc: '自然语言交互', icon: MessageSquareCode, link: '/assistant', color: 'bg-amber-500' }
].map((item, i) => ( ].map((item, i) => (

View File

@@ -78,6 +78,19 @@ const Documents: React.FC = () => {
const [expandedSheet, setExpandedSheet] = useState<string | null>(null); const [expandedSheet, setExpandedSheet] = useState<string | null>(null);
const [uploadExpanded, setUploadExpanded] = useState(false); const [uploadExpanded, setUploadExpanded] = useState(false);
// 批量上传状态跟踪
// Lifecycle of one file in the upload panel: 'pending' on drop, 'uploading'
// once the upload request is in flight/accepted, 'processing' while the
// backend task is polled, then 'success' or 'failed'.
type FileUploadStatus = 'pending' | 'uploading' | 'processing' | 'success' | 'failed';
// Per-file entry rendered in the upload progress list.
interface UploadFileState {
// The dropped file this entry tracks.
file: File;
// Current lifecycle stage (see FileUploadStatus).
status: FileUploadStatus;
// Progress value used as the progress-bar width percentage.
progress: number;
// NOTE(review): not assigned anywhere in the visible upload code — confirm it is still needed.
taskId?: string;
// Failure message shown when status is 'failed'.
error?: string;
// Document id reported by the backend for a successfully processed file.
docId?: string;
}
const [uploadStates, setUploadStates] = useState<UploadFileState[]>([]);
const [batchTaskId, setBatchTaskId] = useState<string | null>(null);
// AI 分析相关状态 // AI 分析相关状态
const [analyzing, setAnalyzing] = useState(false); const [analyzing, setAnalyzing] = useState(false);
const [analyzingForCharts, setAnalyzingForCharts] = useState(false); const [analyzingForCharts, setAnalyzingForCharts] = useState(false);
@@ -211,21 +224,119 @@ const Documents: React.FC = () => {
} }
}; };
// 文件上传处理 // 文件上传处理 - 批量上传
/**
 * Dropzone handler: uploads all accepted files through the batch endpoint and
 * polls the resulting task until it finishes, fails, or times out.
 *
 * Falls back to per-file uploads when the batch endpoint returns no task id.
 * The `uploading` flag stays set until polling (or the fallback) has finished.
 */
const onDrop = async (acceptedFiles: File[]) => {
  if (acceptedFiles.length === 0) return;

  // Seed one progress entry per dropped file.
  const initialStates: UploadFileState[] = acceptedFiles.map(file => ({
    file,
    status: 'pending',
    progress: 0
  }));
  setUploadStates(initialStates);
  setUploadExpanded(true);
  setUploading(true);

  try {
    // Use the batch upload endpoint.
    const result = await backendApi.uploadDocuments(acceptedFiles);

    if (result.task_id) {
      setBatchTaskId(result.task_id);

      // Mark every file as uploading.
      setUploadStates(prev => prev.map(s => ({ ...s, status: 'uploading', progress: 30 })));

      // Poll the task status.
      let attempts = 0;
      const maxAttempts = 150; // 150 polls, 2 s apart: about 5 minutes

      const checkBatchStatus = async () => {
        while (attempts < maxAttempts) {
          try {
            const status = await backendApi.getTaskStatus(result.task_id);

            if (status.status === 'success' && status.result) {
              // Map the per-file results back onto the entries by index.
              const fileResults = status.result.results || [];
              setUploadStates(prev => prev.map((s, idx) => {
                const fileResult = fileResults[idx];
                if (fileResult?.success) {
                  return { ...s, status: 'success', progress: 100, docId: fileResult.doc_id };
                } else {
                  return { ...s, status: 'failed', progress: 0, error: fileResult?.error || '处理失败' };
                }
              }));
              loadDocuments();
              return;
            } else if (status.status === 'failure') {
              setUploadStates(prev => prev.map(s => ({
                ...s,
                status: 'failed',
                error: status.error || '批量处理失败'
              })));
              return;
            } else {
              // Still running: advance the progress indicator.
              const progress = status.progress || Math.min(30 + attempts * 2, 90);
              setUploadStates(prev => prev.map(s => ({
                ...s,
                status: s.status === 'uploading' ? 'processing' : s.status,
                progress
              })));
            }
          } catch (e) {
            // A failed poll is logged and retried on the next tick.
            console.error('检查批量状态失败', e);
          }

          await new Promise(resolve => setTimeout(resolve, 2000));
          attempts++;
        }

        // Timed out: everything that did not succeed is marked as failed.
        setUploadStates(prev => prev.map(s => {
          if (s.status !== 'success') {
            return { ...s, status: 'failed', error: '处理超时' };
          }
          return s;
        }));
      };

      // Await the polling loop so the `finally` below does not clear the
      // `uploading` flag while the batch is still being processed.
      await checkBatchStatus();
    } else {
      // No batch task: fall back to the per-file upload path.
      await handleSingleFileUploads(acceptedFiles);
    }
  } catch (error: any) {
    toast.error(error.message || '上传失败');
    setUploadStates(prev => prev.map(s => ({
      ...s,
      status: 'failed',
      error: error.message || '上传失败'
    })));
  } finally {
    setUploading(false);
  }
};
// 单文件上传后备逻辑
const handleSingleFileUploads = async (files: File[]) => {
let successCount = 0; let successCount = 0;
let failCount = 0;
const successfulFiles: File[] = []; const successfulFiles: File[] = [];
// 逐个上传文件 for (let i = 0; i < files.length; i++) {
for (const file of acceptedFiles) { const file = files[i];
const ext = file.name.split('.').pop()?.toLowerCase(); const ext = file.name.split('.').pop()?.toLowerCase();
setUploadStates(prev => prev.map((s, idx) =>
idx === i ? { ...s, status: 'uploading' } : s
));
try { try {
if (ext === 'xlsx' || ext === 'xls') { if (ext === 'xlsx' || ext === 'xls') {
setUploadStates(prev => prev.map((s, idx) =>
idx === i ? { ...s, status: 'processing', progress: 50 } : s
));
const result = await backendApi.uploadExcel(file, { const result = await backendApi.uploadExcel(file, {
parseAllSheets: parseOptions.parseAllSheets, parseAllSheets: parseOptions.parseAllSheets,
headerRow: parseOptions.headerRow headerRow: parseOptions.headerRow
@@ -233,99 +344,60 @@ const Documents: React.FC = () => {
if (result.success) { if (result.success) {
successCount++; successCount++;
successfulFiles.push(file); successfulFiles.push(file);
// 第一个Excel文件设置解析结果供预览 setUploadStates(prev => prev.map((s, idx) =>
idx === i ? { ...s, status: 'success', progress: 100 } : s
));
if (successCount === 1) { if (successCount === 1) {
setUploadedFile(file); setUploadedFile(file);
setParseResult(result); setParseResult(result);
if (result.metadata?.sheet_count === 1) {
setExpandedSheet(Object.keys(result.data?.sheets || {})[0] || null);
}
} }
loadDocuments(); loadDocuments();
} else { } else {
failCount++; setUploadStates(prev => prev.map((s, idx) =>
toast.error(`${file.name}: ${result.error || '解析失败'}`); idx === i ? { ...s, status: 'failed', error: result.error || '解析失败' } : s
));
} }
} else if (ext === 'md' || ext === 'markdown') { } else {
setUploadStates(prev => prev.map((s, idx) =>
idx === i ? { ...s, status: 'processing', progress: 50 } : s
));
const result = await backendApi.uploadDocument(file); const result = await backendApi.uploadDocument(file);
if (result.task_id) { if (result.task_id) {
// 等待任务完成
let attempts = 0;
while (attempts < 60) {
const status = await backendApi.getTaskStatus(result.task_id);
if (status.status === 'success') {
successCount++; successCount++;
successfulFiles.push(file); successfulFiles.push(file);
setUploadStates(prev => prev.map((s, idx) =>
idx === i ? { ...s, status: 'success', progress: 100, docId: status.result?.doc_id } : s
));
if (successCount === 1) { if (successCount === 1) {
setUploadedFile(file); setUploadedFile(file);
} }
// 轮询任务状态
let attempts = 0;
const checkStatus = async () => {
while (attempts < 30) {
try {
const status = await backendApi.getTaskStatus(result.task_id);
if (status.status === 'success') {
loadDocuments(); loadDocuments();
return; break;
} else if (status.status === 'failure') { } else if (status.status === 'failure') {
return; setUploadStates(prev => prev.map((s, idx) =>
} idx === i ? { ...s, status: 'failed', error: status.error || '处理失败' } : s
} catch (e) { ));
console.error('检查状态失败', e); break;
} }
await new Promise(resolve => setTimeout(resolve, 2000)); await new Promise(resolve => setTimeout(resolve, 2000));
attempts++; attempts++;
} }
};
checkStatus();
} else {
failCount++;
}
} else {
// 其他文档使用通用上传接口
const result = await backendApi.uploadDocument(file);
if (result.task_id) {
successCount++;
successfulFiles.push(file);
if (successCount === 1) {
setUploadedFile(file);
}
// 轮询任务状态
let attempts = 0;
const checkStatus = async () => {
while (attempts < 30) {
try {
const status = await backendApi.getTaskStatus(result.task_id);
if (status.status === 'success') {
loadDocuments();
return;
} else if (status.status === 'failure') {
return;
}
} catch (e) {
console.error('检查状态失败', e);
}
await new Promise(resolve => setTimeout(resolve, 2000));
attempts++;
}
};
checkStatus();
} else {
failCount++;
} }
} }
} catch (error: any) { } catch (error: any) {
failCount++; setUploadStates(prev => prev.map((s, idx) =>
toast.error(`${file.name}: ${error.message || '上传失败'}`); idx === i ? { ...s, status: 'failed', error: error.message || '上传失败' } : s
));
} }
} }
setUploading(false);
loadDocuments();
if (successCount > 0) { if (successCount > 0) {
toast.success(`成功上传 ${successCount} 个文件`);
setUploadedFiles(prev => [...prev, ...successfulFiles]); setUploadedFiles(prev => [...prev, ...successfulFiles]);
setUploadExpanded(true);
}
if (failCount > 0) {
toast.error(`${failCount} 个文件上传失败`);
} }
}; };
@@ -699,7 +771,110 @@ const Documents: React.FC = () => {
</CardHeader> </CardHeader>
{uploadPanelOpen && ( {uploadPanelOpen && (
<CardContent className="space-y-4"> <CardContent className="space-y-4">
{uploadedFiles.length > 0 || uploadedFile ? ( {/* 优先显示正在上传的状态 */}
{uploadStates.length > 0 && (
<div className="space-y-3">
{/* 上传状态头部 */}
<div
className="flex items-center justify-between p-3 bg-primary/5 rounded-xl cursor-pointer hover:bg-primary/10 transition-colors"
onClick={() => setUploadExpanded(!uploadExpanded)}
>
<div className="flex items-center gap-3">
<div className="w-10 h-10 rounded-lg bg-primary/10 text-primary flex items-center justify-center">
{uploading ? <Loader2 size={20} className="animate-spin" /> : <Upload size={20} />}
</div>
<div>
<p className="font-semibold text-sm">
{uploading ? '正在上传' : '上传完成'} {uploadStates.length}
</p>
<p className="text-xs text-muted-foreground">
{uploading ? '上传中,请稍候...' : uploadStates.filter(s => s.status === 'failed').length > 0 ? '部分失败' : '点击查看详情'}
</p>
</div>
</div>
<div className="flex items-center gap-2">
{!uploading && (
<Button
variant="ghost"
size="sm"
onClick={(e) => {
e.stopPropagation();
setUploadStates([]);
setUploadedFiles([]);
setUploadedFile(null);
}}
className="text-destructive hover:text-destructive"
>
<Trash2 size={14} className="mr-1" />
</Button>
)}
{uploadExpanded ? <ChevronUp size={16} /> : <ChevronDown size={16} />}
</div>
</div>
{/* 上传进度列表(总是展开显示) */}
{uploadExpanded && (
<div className="space-y-2 border rounded-xl p-3 bg-background">
{uploadStates.map((state, index) => (
<div key={index} className="flex items-center gap-3 p-2 rounded-lg hover:bg-muted/30 transition-colors">
<div className={cn(
"w-8 h-8 rounded flex items-center justify-center shrink-0",
isExcelFile(state.file.name) ? "bg-emerald-500/10 text-emerald-500" : "bg-blue-500/10 text-blue-500"
)}>
{state.status === 'pending' && <Clock size={16} />}
{state.status === 'uploading' && <Upload size={16} className="animate-pulse" />}
{state.status === 'processing' && <Loader2 size={16} className="animate-spin" />}
{state.status === 'success' && <CheckCircle size={16} className="text-green-500" />}
{state.status === 'failed' && <AlertCircle size={16} className="text-red-500" />}
</div>
<div className="flex-1 min-w-0">
<p className="text-sm truncate">{state.file.name}</p>
<div className="flex items-center gap-2">
{state.status === 'pending' && <p className="text-xs text-muted-foreground">...</p>}
{state.status === 'uploading' && <p className="text-xs text-primary">...</p>}
{state.status === 'processing' && <p className="text-xs text-primary">...</p>}
{state.status === 'failed' && state.error && (
<p className="text-xs text-red-500 truncate">{state.error}</p>
)}
{state.status === 'success' && (
<p className="text-xs text-green-500"></p>
)}
</div>
{/* 进度条 */}
{(state.status === 'uploading' || state.status === 'processing') && (
<div className="mt-1 h-1 bg-muted rounded-full overflow-hidden">
<div
className="h-full bg-primary transition-all duration-300"
style={{ width: `${state.progress}%` }}
/>
</div>
)}
</div>
{state.status === 'success' && (
<CheckCircle size={16} className="text-green-500 shrink-0" />
)}
{state.status === 'failed' && (
<Button
variant="ghost"
size="icon"
className="text-destructive hover:bg-destructive/10 shrink-0"
onClick={() => {
setUploadStates(prev => prev.filter((_, i) => i !== index));
}}
>
<Trash2 size={14} />
</Button>
)}
</div>
))}
</div>
)}
</div>
)}
{/* 已上传文件列表(没有正在上传时显示) */}
{uploadStates.length === 0 && (uploadedFiles.length > 0 || uploadedFile) ? (
<div className="space-y-3"> <div className="space-y-3">
{/* 文件列表头部 */} {/* 文件列表头部 */}
<div <div
@@ -739,6 +914,84 @@ const Documents: React.FC = () => {
{/* 展开的文件列表 */} {/* 展开的文件列表 */}
{uploadExpanded && ( {uploadExpanded && (
<div className="space-y-2 border rounded-xl p-3"> <div className="space-y-2 border rounded-xl p-3">
{/* 显示已上传文件列表 */}
{(uploadedFiles.length > 0 ? uploadedFiles : [uploadedFile]).filter(Boolean).map((file, index) => (
<div key={index} className="flex items-center gap-3 p-2 bg-background rounded-lg">
<div className={cn(
"w-8 h-8 rounded flex items-center justify-center",
isExcelFile(file?.name || '') ? "bg-emerald-500/10 text-emerald-500" : "bg-blue-500/10 text-blue-500"
)}>
{isExcelFile(file?.name || '') ? <FileSpreadsheet size={16} /> : <FileText size={16} />}
</div>
<div className="flex-1 min-w-0">
<p className="text-sm truncate">{file?.name}</p>
<p className="text-xs text-muted-foreground">{formatFileSize(file?.size || 0)}</p>
</div>
<Button
variant="ghost"
size="icon"
className="text-destructive hover:bg-destructive/10"
onClick={() => handleRemoveUploadedFile(index)}
>
<Trash2 size={14} />
</Button>
</div>
))}
{/* 继续添加按钮 */}
<div
{...getRootProps()}
className="flex items-center justify-center gap-2 p-3 border-2 border-dashed rounded-lg cursor-pointer hover:border-primary/50 hover:bg-primary/5 transition-colors"
onClick={(e) => e.stopPropagation()}
>
<input {...getInputProps()} multiple={true} />
<Plus size={16} className="text-muted-foreground" />
<span className="text-sm text-muted-foreground"></span>
</div>
</div>
)}
</div>
) : (uploadedFiles.length > 0 || uploadedFile) ? (
<div className="space-y-3">
{/* 文件列表头部 */}
<div
className="flex items-center justify-between p-3 bg-muted/50 rounded-xl cursor-pointer hover:bg-muted/70 transition-colors"
onClick={() => setUploadExpanded(!uploadExpanded)}
>
<div className="flex items-center gap-3">
<div className="w-10 h-10 rounded-lg bg-primary/10 text-primary flex items-center justify-center">
<Upload size={20} />
</div>
<div>
<p className="font-semibold text-sm">
{(uploadedFiles.length > 0 ? uploadedFiles : [uploadedFile]).length}
</p>
<p className="text-xs text-muted-foreground">
{uploadExpanded ? '点击收起' : '点击展开查看'}
</p>
</div>
</div>
<div className="flex items-center gap-2">
<Button
variant="ghost"
size="sm"
onClick={(e) => {
e.stopPropagation();
handleDeleteFile();
}}
className="text-destructive hover:text-destructive"
>
<Trash2 size={14} className="mr-1" />
</Button>
{uploadExpanded ? <ChevronUp size={16} /> : <ChevronDown size={16} />}
</div>
</div>
{/* 展开的文件列表 */}
{uploadExpanded && (
<div className="space-y-2 border rounded-xl p-3">
{/* 显示已上传文件列表 */}
{(uploadedFiles.length > 0 ? uploadedFiles : [uploadedFile]).filter(Boolean).map((file, index) => ( {(uploadedFiles.length > 0 ? uploadedFiles : [uploadedFile]).filter(Boolean).map((file, index) => (
<div key={index} className="flex items-center gap-3 p-2 bg-background rounded-lg"> <div key={index} className="flex items-center gap-3 p-2 bg-background rounded-lg">
<div className={cn( <div className={cn(

View File

@@ -1,26 +1,10 @@
import React, { useState, useRef, useEffect } from 'react'; import React, { useState, useRef, useEffect } from 'react';
import { import { Send, Bot, User, Sparkles, Trash2, FileText, TableProperties, ArrowRight, Search, MessageSquare } from 'lucide-react';
Send,
Bot,
User,
Sparkles,
Trash2,
RefreshCcw,
FileText,
TableProperties,
ChevronRight,
ArrowRight,
Loader2,
Download,
Search,
MessageSquare,
CheckCircle
} from 'lucide-react';
import { Button } from '@/components/ui/button'; import { Button } from '@/components/ui/button';
import { Input } from '@/components/ui/input'; import { Input } from '@/components/ui/input';
import { Card, CardContent, CardHeader, CardTitle } from '@/components/ui/card'; import { Card, CardContent, CardHeader, CardTitle } from '@/components/ui/card';
import { ScrollArea } from '@/components/ui/scroll-area'; import { ScrollArea } from '@/components/ui/scroll-area';
import { Badge } from '@/components/ui/badge'; import { Markdown } from '@/components/ui/markdown';
import { backendApi } from '@/db/backend-api'; import { backendApi } from '@/db/backend-api';
import { toast } from 'sonner'; import { toast } from 'sonner';
import { cn } from '@/lib/utils'; import { cn } from '@/lib/utils';
@@ -39,8 +23,21 @@ const InstructionChat: React.FC = () => {
const [input, setInput] = useState(''); const [input, setInput] = useState('');
const [loading, setLoading] = useState(false); const [loading, setLoading] = useState(false);
const [currentDocIds, setCurrentDocIds] = useState<string[]>([]); const [currentDocIds, setCurrentDocIds] = useState<string[]>([]);
const [conversationId, setConversationId] = useState<string>('');
const scrollAreaRef = useRef<HTMLDivElement>(null); const scrollAreaRef = useRef<HTMLDivElement>(null);
// Initialise the conversation ID once on mount: reuse the ID persisted in
// localStorage when there is one, otherwise mint a new ID and persist it.
useEffect(() => {
  const existingId = localStorage.getItem('chat_conversation_id');
  if (existingId) {
    setConversationId(existingId);
    return;
  }

  // New ID = creation timestamp plus a short random base-36 suffix.
  const freshId = `conv_${Date.now()}_${Math.random().toString(36).substring(7)}`;
  setConversationId(freshId);
  localStorage.setItem('chat_conversation_id', freshId);
}, []);
useEffect(() => { useEffect(() => {
// Initial welcome message // Initial welcome message
if (messages.length === 0) { if (messages.length === 0) {
@@ -119,7 +116,8 @@ const InstructionChat: React.FC = () => {
// 使用真实的智能指令 API // 使用真实的智能指令 API
const response = await backendApi.instructionChat( const response = await backendApi.instructionChat(
input.trim(), input.trim(),
currentDocIds.length > 0 ? currentDocIds : undefined currentDocIds.length > 0 ? currentDocIds : undefined,
{ conversation_id: conversationId }
); );
// 根据意图类型生成友好响应 // 根据意图类型生成友好响应
@@ -135,11 +133,12 @@ const InstructionChat: React.FC = () => {
responseContent = `✅ 已提取到 ${keys.length} 个字段的数据:\n\n`; responseContent = `✅ 已提取到 ${keys.length} 个字段的数据:\n\n`;
for (const [key, value] of Object.entries(extracted)) { for (const [key, value] of Object.entries(extracted)) {
const values = Array.isArray(value) ? value : [value]; const values = Array.isArray(value) ? value : [value];
responseContent += `**${key}**: ${values.slice(0, 3).join(', ')}${values.length > 3 ? '...' : ''}\n`; const displayValues = values.length > 10 ? values.slice(0, 10).join(', ') + ` ...(共${values.length}条)` : values.join(', ');
responseContent += `**${key}**: ${displayValues}\n`;
} }
responseContent += `\n💡 您可以将这些数据填入表格`; responseContent += `\n💡 可直接使用以上数据,或说"填入表格"继续填表操作`;
} else { } else {
responseContent = '未能从文档中提取到相关数据。请尝试更明确的字段名称。'; responseContent = resultData?.message || '未能从文档中提取到相关数据。请尝试更明确的字段名称。';
} }
break; break;
@@ -151,24 +150,24 @@ const InstructionChat: React.FC = () => {
responseContent = `✅ 填表完成!成功填写 ${filledKeys.length} 个字段:\n\n`; responseContent = `✅ 填表完成!成功填写 ${filledKeys.length} 个字段:\n\n`;
for (const [key, value] of Object.entries(filled)) { for (const [key, value] of Object.entries(filled)) {
const values = Array.isArray(value) ? value : [value]; const values = Array.isArray(value) ? value : [value];
responseContent += `**${key}**: ${values.slice(0, 3).join(', ')}\n`; const displayValues = values.length > 10 ? values.slice(0, 10).join(', ') + ` ...(共${values.length}条)` : values.join(', ');
responseContent += `**${key}**: ${displayValues}\n`;
} }
responseContent += `\n📋 请到【智能填表】页面查看或导出结果。`; responseContent += `\n📋 请到【智能填表】页面查看或导出结果。`;
} else { } else {
responseContent = '填表未能提取到数据。请检查模板表头和数据源内容。'; responseContent = resultData?.message || '填表未能提取到数据。请检查模板表头和数据源内容。';
} }
break; break;
case 'summarize': case 'summarize':
// 摘要结果 // 摘要结果
const summaries = resultData?.summaries || []; if (resultData?.action_needed === 'provide_document' || resultData?.action_needed === 'upload_document') {
if (summaries.length > 0) { responseContent = `📋 ${resultData.message}\n\n${resultData.suggestion || ''}`;
responseContent = `📄 找到 ${summaries.length} 个文档的摘要:\n\n`; } else if (resultData?.ai_summary) {
summaries.forEach((s: any, idx: number) => { // AI 生成的摘要
responseContent += `**${idx + 1}. ${s.filename}**\n${s.content_preview}\n\n`; responseContent = `📄 **${resultData.filename}** 摘要分析:\n\n${resultData.ai_summary}`;
});
} else { } else {
responseContent = '未能生成摘要。请确保已上传文档。'; responseContent = resultData?.message || '未能生成摘要。请确保已上传文档。';
} }
break; break;
@@ -176,8 +175,10 @@ const InstructionChat: React.FC = () => {
// 问答结果 // 问答结果
if (resultData?.answer) { if (resultData?.answer) {
responseContent = `**问题**: ${resultData.question}\n\n**答案**: ${resultData.answer}`; responseContent = `**问题**: ${resultData.question}\n\n**答案**: ${resultData.answer}`;
} else if (resultData?.context_preview) {
responseContent = `**问题**: ${resultData.question}\n\n**相关上下文**\n${resultData.context_preview}`;
} else { } else {
responseContent = resultData?.message || '我找到了相关信息,请查看上文。'; responseContent = resultData?.message || '请先上传文档,我才能回答您的问题。';
} }
break; break;
@@ -207,8 +208,35 @@ const InstructionChat: React.FC = () => {
} }
break; break;
case 'edit':
// Document-edit result: show the edited text under the original filename.
// Only the first 500 characters are shown; a truncation notice is appended
// when the edited content is longer than that.
if (resultData?.edited_content) {
responseContent = `✏️ **${resultData.original_filename}** 编辑完成:\n\n${resultData.edited_content.substring(0, 500)}${resultData.edited_content.length > 500 ? '\n\n...(内容已截断)' : ''}`;
} else {
// No edited text in the result: use the result's own message when present,
// otherwise fall back to a generic confirmation.
responseContent = resultData?.message || '编辑完成。';
}
break;
case 'transform':
  // Format-conversion result. Three shapes are handled, in priority order:
  //   1. `excel_data` present -> report the Excel row count;
  //   2. `content` present    -> report the target format (when known);
  //   3. neither              -> use the result's own message or a generic one.
  if (resultData?.excel_data) {
    responseContent = `🔄 格式转换完成!\n\n已转换为 **Excel** 格式,共 **${resultData.excel_data.length}** 行数据。\n\n${resultData.message || ''}`;
  } else if (resultData?.content) {
    // Only print the target-format line when the result actually carries a
    // target_format; interpolating the optional-chained value directly would
    // render the literal text "undefined" in the chat bubble.
    responseContent = `🔄 格式转换完成!\n\n${resultData.target_format ? `目标格式: **${String(resultData.target_format).toUpperCase()}**\n\n` : ''}${resultData.message || ''}`;
  } else {
    responseContent = resultData?.message || '格式转换完成。';
  }
  break;
case 'unknown': case 'unknown':
responseContent = `我理解您想要: "${input.trim()}"\n\n但我目前无法完成此操作。您可以尝试\n\n1. **提取数据**: "提取医院数量和床位数"\n2. **填表**: "根据这些数据填表"\n3. **总结**: "总结这份文档"\n4. **问答**: "文档里说了什么?"\n5. **搜索**: "搜索相关内容"`; // 检查是否需要用户上传文档
if (resultData?.suggestion) {
responseContent = resultData.suggestion;
} else if (resultData?.message && resultData.message !== '无法理解该指令,请尝试更明确的描述') {
responseContent = resultData.message;
} else {
responseContent = `我理解您想要: "${input.trim()}"\n\n请尝试以下操作\n\n1. **提取数据**: "提取医院数量和床位数"\n2. **填表**: "根据这些数据填表"\n3. **总结**: "总结这份文档"\n4. **问答**: "文档里说了什么?"\n5. **搜索**: "搜索相关内容"`;
}
break; break;
default: default:
@@ -299,9 +327,11 @@ const InstructionChat: React.FC = () => {
? "bg-primary text-primary-foreground shadow-xl shadow-primary/20 rounded-tr-none" ? "bg-primary text-primary-foreground shadow-xl shadow-primary/20 rounded-tr-none"
: "bg-white border border-border/50 shadow-md rounded-tl-none" : "bg-white border border-border/50 shadow-md rounded-tl-none"
)}> )}>
<p className="text-sm leading-relaxed whitespace-pre-wrap font-medium"> {m.role === 'assistant' ? (
{m.content} <Markdown content={m.content} className="text-sm leading-relaxed prose prose-sm max-w-none" />
</p> ) : (
<p className="text-sm leading-relaxed whitespace-pre-wrap font-medium">{m.content}</p>
)}
<span className={cn( <span className={cn(
"text-[10px] block opacity-50 font-bold tracking-widest", "text-[10px] block opacity-50 font-bold tracking-widest",
m.role === 'user' ? "text-right" : "text-left" m.role === 'user' ? "text-right" : "text-left"

View File

@@ -248,15 +248,25 @@ const TemplateFill: React.FC = () => {
if (!templateFile || !filledResult) return; if (!templateFile || !filledResult) return;
try { try {
// Lower-cased text after the last '.' in the template file name.
const ext = templateFile.name.split('.').pop()?.toLowerCase();
// Export as docx only for .docx templates; every other template exports as xlsx.
const exportFormat = (ext === 'docx') ? 'docx' : 'xlsx';
// For Word templates, if a filled file already exists (values already written into
// the table cells), pass its path along so that file can be downloaded directly.
const filledFilePath = (ext === 'docx' && filledResult.filled_file_path)
? filledResult.filled_file_path
: undefined;
const blob = await backendApi.exportFilledTemplate( const blob = await backendApi.exportFilledTemplate(
templateId || 'temp', templateId || 'temp',
filledResult.filled_data || {}, filledResult.filled_data || {},
'xlsx' exportFormat,
filledFilePath
); );
// Build the download name as `filled_<base>.<exportFormat>`, where <base> is the
// template file name with only its trailing extension removed.
// The extension is cut off by length from the end of the name: String.replace with
// a string pattern removes the FIRST occurrence, which would strip the wrong part
// of names such as "a.xlsx.bak.xlsx".
const ext_match = templateFile.name.match(/\.[^.]+$/);
const baseName = ext_match ? templateFile.name.slice(0, -ext_match[0].length) : templateFile.name;
const downloadName = `filled_${baseName}.${exportFormat}`;
const url = URL.createObjectURL(blob); const url = URL.createObjectURL(blob);
const a = document.createElement('a'); const a = document.createElement('a');
a.href = url; a.href = url;
a.download = `filled_${templateFile.name}`; a.download = downloadName;
a.click(); a.click();
URL.revokeObjectURL(url); URL.revokeObjectURL(url);
toast.success('导出成功'); toast.success('导出成功');
@@ -546,7 +556,7 @@ const TemplateFill: React.FC = () => {
</div> </div>
<h3 className="text-xl font-bold mb-2">AI </h3> <h3 className="text-xl font-bold mb-2">AI </h3>
<p className="text-muted-foreground text-center max-w-md"> <p className="text-muted-foreground text-center max-w-md">
{sourceFiles.length || sourceFilePaths.length} ... {sourceFiles.length || sourceFilePaths.length || sourceDocIds.length || 0} ...
</p> </p>
</CardContent> </CardContent>
</Card> </Card>
@@ -562,7 +572,7 @@ const TemplateFill: React.FC = () => {
</CardTitle> </CardTitle>
<CardDescription> <CardDescription>
{sourceFiles.length || sourceFilePaths.length} {filledResult.source_doc_count || sourceFiles.length || sourceFilePaths.length || sourceDocIds.length}
</CardDescription> </CardDescription>
</CardHeader> </CardHeader>
<CardContent> <CardContent>