【智能助手增强】

- 新增对话历史管理:MongoDB新增conversations集合,存储用户与AI的对话上下文,支持多轮对话意图延续
- 新增对话历史API(conversation.py):GET/DELETE conversation历史、列出所有会话
- 意图解析增强:支持基于对话历史的意图识别,上下文理解更准确
- 字段提取优化:支持"提取文档中的医院数量"等自然语言模式,智能去除"文档中的"前缀
- 文档对比优化:从指令中提取文件名并精确匹配source_docs,支持"对比A和B两个文档"
- 文档摘要优化:使用LLM生成真实AI摘要而非返回原始文档预览

【Word模板填表核心功能】
- Word模板字段生成:空白Word上传后,自动从源文档(Excel/Word/TXT/MD)内容AI生成字段名
- Word模板填表(_fill_docx):将提取数据写入Word模板表格,支持精确匹配、模糊匹配、追加新行
- 数据润色(_polish_word_filled_data):LLM对多行Excel数据进行统计归纳(合计/平均/极值),转化为专业自然语言描述
- 段落格式输出:使用📌字段名+值段落+分隔线(灰色横线)格式,提升可读性
- 导出链打通:fill_template返回filled_file_path,export直接返回已填好的Word文件

【其他修复】
- 修复Word导出Windows文件锁问题:NamedTemporaryFile改为mkstemp+close
- 修复Word方框非法字符:扩展clean_text移除\uFFFD、□等Unicode替代符和零宽字符
- 修复文档对比"需要至少2个文档":从指令提取具体文件名优先匹配而非取前2个
- 修复导出format硬编码:自动识别docx/xlsx格式
- Docx解析器增加备用解析方法和更完整的段落/表格/标题提取
- RAG服务新增MySQL数据源支持
This commit is contained in:
dj
2026-04-15 23:32:55 +08:00
parent 9e7f9df384
commit e5d4724e82
19 changed files with 2185 additions and 407 deletions

View File

@@ -14,6 +14,7 @@ from app.api.endpoints import (
analysis_charts,
health,
instruction, # 智能指令
conversation, # 对话历史
)
# 创建主路由
@@ -31,3 +32,4 @@ api_router.include_router(ai_analyze.router) # AI分析
api_router.include_router(visualization.router) # 可视化
api_router.include_router(analysis_charts.router) # 分析图表
api_router.include_router(instruction.router) # 智能指令
api_router.include_router(conversation.router) # 对话历史

View File

@@ -0,0 +1,98 @@
"""
对话历史 API 接口
提供对话历史的存储和查询功能
"""
import logging
from typing import Optional
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
from app.core.database import mongodb
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/conversation", tags=["对话历史"])
# ==================== 请求/响应模型 ====================
class ConversationMessage(BaseModel):
    """A single message belonging to a conversation."""

    role: str  # who produced the message
    content: str  # message text
    intent: Optional[str] = None  # intent label attached to the message, if any
class ConversationHistoryResponse(BaseModel):
    """Response body of the conversation-history endpoint."""

    success: bool  # False when loading the history raised an error
    messages: list  # stored messages; empty on failure
class ConversationListResponse(BaseModel):
    """Response body of the conversation-list endpoint."""

    success: bool  # False when listing the conversations raised an error
    conversations: list  # one entry per conversation; empty on failure
# ==================== 接口 ====================
@router.get("/{conversation_id}/history", response_model=ConversationHistoryResponse)
async def get_conversation_history(conversation_id: str, limit: int = 20):
    """
    Return the stored messages of one conversation.

    Args:
        conversation_id: Conversation session ID.
        limit: Number of messages to return (20 by default).

    Any error is logged and reported as ``success=False`` with an empty
    message list instead of being raised to the client.
    """
    try:
        history = await mongodb.get_conversation_history(conversation_id, limit=limit)
        payload = ConversationHistoryResponse(success=True, messages=history)
    except Exception as exc:
        logger.error(f"获取对话历史失败: {exc}")
        payload = ConversationHistoryResponse(success=False, messages=[])
    return payload
@router.delete("/{conversation_id}")
async def delete_conversation(conversation_id: str):
    """
    Delete a conversation session.

    Args:
        conversation_id: Conversation session ID.

    Returns ``{"success": <bool>}``; on error the exception is logged and
    its text is returned under ``"error"``.
    """
    try:
        removed = await mongodb.delete_conversation(conversation_id)
    except Exception as exc:
        logger.error(f"删除对话失败: {exc}")
        return {"success": False, "error": str(exc)}
    return {"success": removed}
@router.get("/all", response_model=ConversationListResponse)
async def list_conversations(limit: int = 50, skip: int = 0):
    """
    Return a page of conversations.

    Args:
        limit: Number of conversations to return.
        skip: Number of conversations to skip.

    Any error is logged and reported as ``success=False`` with an empty
    conversation list instead of being raised to the client.
    """
    try:
        sessions = await mongodb.list_conversations(limit=limit, skip=skip)
        payload = ConversationListResponse(success=True, conversations=sessions)
    except Exception as exc:
        logger.error(f"获取会话列表失败: {exc}")
        payload = ConversationListResponse(success=False, conversations=[])
    return payload

View File

@@ -4,6 +4,7 @@
支持多格式文档(docx/xlsx/md/txt)上传、解析、存储和RAG索引
集成 Excel 存储和 AI 生成字段描述
"""
import asyncio
import logging
import uuid
from typing import List, Optional
@@ -258,6 +259,7 @@ async def process_document(
)
# 如果是 Excel存储到 MySQL + AI生成描述 + RAG索引
mysql_table_name = None
if doc_type in ["xlsx", "xls"]:
await update_task_status(
task_id, status="processing",
@@ -265,17 +267,29 @@ async def process_document(
)
try:
# 使用 TableRAG 服务完成建表和RAG索引
# 使用 TableRAG 服务存储到 MySQL跳过 RAG 索引以提升速度)
logger.info(f"开始存储Excel到MySQL: {original_filename}, file_path: {file_path}")
rag_result = await table_rag_service.build_table_rag_index(
file_path=file_path,
filename=original_filename,
sheet_name=parse_options.get("sheet_name"),
header_row=parse_options.get("header_row", 0)
header_row=parse_options.get("header_row", 0),
skip_rag_index=True # 跳过 AI 字段描述生成和索引
)
if rag_result.get("success"):
logger.info(f"Excel存储到MySQL成功: {original_filename}, table: {rag_result.get('table_name')}")
mysql_table_name = rag_result.get('table_name')
logger.info(f"Excel存储到MySQL成功: {original_filename}, table: {mysql_table_name}")
# 更新 MongoDB 中的 metadata记录 MySQL 表名
try:
doc = await mongodb.get_document(doc_id)
if doc:
metadata = doc.get("metadata", {})
metadata["mysql_table_name"] = mysql_table_name
await mongodb.update_document_metadata(doc_id, metadata)
logger.info(f"已更新 MongoDB 文档的 mysql_table_name: {mysql_table_name}")
except Exception as update_err:
logger.warning(f"更新 MongoDB mysql_table_name 失败: {update_err}")
else:
logger.error(f"RAG索引构建失败: {rag_result.get('error')}")
except Exception as e:
@@ -283,17 +297,16 @@ async def process_document(
else:
# 非结构化文档
await update_task_status(
task_id, status="processing",
progress=60, message="正在建立索引"
)
# 如果文档中有表格数据,提取并存储到 MySQL + RAG
structured_data = result.data.get("structured_data", {})
tables = structured_data.get("tables", [])
# 如果文档中有表格数据,提取并存储到 MySQL不需要 RAG 索引)
if tables:
# 对每个表格建立 MySQL 表和 RAG 索引
await update_task_status(
task_id, status="processing",
progress=60, message="正在存储表格数据"
)
# 对每个表格建立 MySQL 表(跳过 RAG 索引,速度更快)
for table_info in tables:
await table_rag_service.index_document_table(
doc_id=doc_id,
@@ -302,7 +315,13 @@ async def process_document(
source_doc_type=doc_type
)
# 同时对文档内容建立 RAG 索引
# 对文档内容建立 RAG 索引(非结构化文本需要语义搜索)
content = result.data.get("content", "")
if content and len(content) > 50: # 只有内容足够长才建立索引
await update_task_status(
task_id, status="processing",
progress=80, message="正在建立语义索引"
)
await index_document_to_rag(doc_id, original_filename, result, doc_type)
# 完成
@@ -328,26 +347,32 @@ async def process_document(
async def process_documents_batch(task_id: str, files: List[dict]):
"""批量处理文档"""
"""批量并行处理文档"""
try:
await update_task_status(
task_id, status="processing",
progress=0, message="开始批量处理"
progress=0, message=f"开始批量处理 {len(files)} 个文档",
result={"total": len(files), "files": []}
)
results = []
for i, file_info in enumerate(files):
async def process_single_file(file_info: dict, index: int) -> dict:
"""处理单个文件"""
filename = file_info["filename"]
try:
# 解析文档
parser = ParserFactory.get_parser(file_info["path"])
result = parser.parse(file_info["path"])
if result.success:
if not result.success:
return {"index": index, "filename": filename, "success": False, "error": result.error or "解析失败"}
# 存储到 MongoDB
doc_id = await mongodb.insert_document(
doc_type=file_info["ext"],
content=result.data.get("content", ""),
metadata={
**result.metadata,
"original_filename": file_info["filename"],
"original_filename": filename,
"file_path": file_info["path"]
},
structured_data=result.data.get("structured_data")
@@ -357,43 +382,60 @@ async def process_documents_batch(task_id: str, files: List[dict]):
if file_info["ext"] in ["xlsx", "xls"]:
await table_rag_service.build_table_rag_index(
file_path=file_info["path"],
filename=file_info["filename"]
filename=filename,
skip_rag_index=True # 跳过 AI 字段描述生成和索引
)
else:
# 非结构化文档:处理其中的表格 + 内容索引
# 非结构化文档
structured_data = result.data.get("structured_data", {})
tables = structured_data.get("tables", [])
# 表格数据直接存 MySQL跳过 RAG 索引)
if tables:
for table_info in tables:
await table_rag_service.index_document_table(
doc_id=doc_id,
filename=file_info["filename"],
filename=filename,
table_data=table_info,
source_doc_type=file_info["ext"]
)
await index_document_to_rag(doc_id, file_info["filename"], result, file_info["ext"])
# 只有内容足够长才建立语义索引
content = result.data.get("content", "")
if content and len(content) > 50:
await index_document_to_rag(doc_id, filename, result, file_info["ext"])
results.append({"filename": file_info["filename"], "doc_id": doc_id, "success": True})
else:
results.append({"filename": file_info["filename"], "success": False, "error": result.error})
return {"index": index, "filename": filename, "doc_id": doc_id, "success": True}
except Exception as e:
results.append({"filename": file_info["filename"], "success": False, "error": str(e)})
logger.error(f"处理文件 {filename} 失败: {e}")
return {"index": index, "filename": filename, "success": False, "error": str(e)}
progress = int((i + 1) / len(files) * 100)
await update_task_status(
task_id, status="processing",
progress=progress, message=f"已处理 {i+1}/{len(files)}"
)
# 并行处理所有文档
tasks = [process_single_file(f, i) for i, f in enumerate(files)]
results = await asyncio.gather(*tasks)
# 按原始顺序排序
results.sort(key=lambda x: x["index"])
# 统计成功/失败数量
success_count = sum(1 for r in results if r["success"])
fail_count = len(results) - success_count
# 更新最终状态
await update_task_status(
task_id, status="success",
progress=100, message="批量处理完成",
result={"results": results}
progress=100, message=f"批量处理完成: {success_count} 成功, {fail_count} 失败",
result={
"total": len(files),
"success": success_count,
"failure": fail_count,
"results": results
}
)
logger.info(f"批量处理完成: {success_count}/{len(files)} 成功")
except Exception as e:
logger.error(f"批量处理失败: {str(e)}")
await update_task_status(
@@ -404,20 +446,20 @@ async def process_documents_batch(task_id: str, files: List[dict]):
async def index_document_to_rag(doc_id: str, filename: str, result: ParseResult, doc_type: str):
"""将非结构化文档索引到 RAG使用分块索引"""
"""将非结构化文档索引到 RAG使用分块索引,异步执行"""
try:
content = result.data.get("content", "")
if content:
# 将完整内容传递给 RAG 服务自动分块索引
rag_service.index_document_content(
# 使用异步方法索引,避免阻塞事件循环
await rag_service.index_document_content_async(
doc_id=doc_id,
content=content, # 传递完整内容,由 RAG 服务自动分块
content=content,
metadata={
"filename": filename,
"doc_type": doc_type
},
chunk_size=500, # 每块 500 字符
chunk_overlap=50 # 块之间 50 字符重叠
chunk_size=1000, # 每块 1000 字符,提升速度
chunk_overlap=100 # 块之间 100 字符重叠
)
logger.info(f"RAG 索引完成: {filename}, doc_id={doc_id}")
except Exception as e:

View File

@@ -25,6 +25,7 @@ class InstructionRequest(BaseModel):
instruction: str
doc_ids: Optional[List[str]] = None # 关联的文档 ID 列表
context: Optional[Dict[str, Any]] = None # 额外上下文
conversation_id: Optional[str] = None # 对话会话ID用于关联历史记录
class IntentRecognitionResponse(BaseModel):
@@ -240,7 +241,8 @@ async def instruction_chat(
task_id=task_id,
instruction=request.instruction,
doc_ids=request.doc_ids,
context=request.context
context=request.context,
conversation_id=request.conversation_id
)
return {
@@ -251,14 +253,15 @@ async def instruction_chat(
}
# 同步模式:等待执行完成
return await _execute_chat_task(task_id, request.instruction, request.doc_ids, request.context)
return await _execute_chat_task(task_id, request.instruction, request.doc_ids, request.context, request.conversation_id)
async def _execute_chat_task(
task_id: str,
instruction: str,
doc_ids: Optional[List[str]],
context: Optional[Dict[str, Any]]
context: Optional[Dict[str, Any]],
conversation_id: Optional[str] = None
):
"""执行指令对话的后台任务"""
from app.core.database import mongodb as mongo_client
@@ -278,6 +281,13 @@ async def _execute_chat_task(
# 构建上下文
ctx: Dict[str, Any] = context or {}
# 获取对话历史
if conversation_id:
history = await mongo_client.get_conversation_history(conversation_id, limit=20)
if history:
ctx["conversation_history"] = history
logger.info(f"加载对话历史: conversation_id={conversation_id}, 消息数={len(history)}")
# 获取关联文档
if doc_ids:
docs = []
@@ -291,6 +301,29 @@ async def _execute_chat_task(
# 执行指令
result = await instruction_executor.execute(instruction, ctx)
# 存储对话历史
if conversation_id:
try:
# 存储用户消息
await mongo_client.insert_conversation(
conversation_id=conversation_id,
role="user",
content=instruction,
intent=result.get("intent", "unknown")
)
# 存储助手回复
response_content = result.get("message", "")
if response_content:
await mongo_client.insert_conversation(
conversation_id=conversation_id,
role="assistant",
content=response_content,
intent=result.get("intent", "unknown")
)
logger.info(f"已存储对话历史: conversation_id={conversation_id}")
except Exception as e:
logger.error(f"存储对话历史失败: {e}")
# 根据意图类型添加友好的响应消息
response_messages = {
"extract": f"已提取 {len(result.get('extracted_data', {}))} 个字段的数据",

View File

@@ -87,6 +87,7 @@ class ExportRequest(BaseModel):
template_id: str
filled_data: dict
format: str = "xlsx" # xlsx 或 docx
filled_file_path: Optional[str] = None # 已填写的 Word 文件路径(可选)
# ==================== 接口实现 ====================
@@ -541,7 +542,7 @@ async def export_filled_template(
if request.format == "xlsx":
return await _export_to_excel(request.filled_data, request.template_id)
elif request.format == "docx":
return await _export_to_word(request.filled_data, request.template_id)
return await _export_to_word(request.filled_data, request.template_id, request.filled_file_path)
else:
raise HTTPException(
status_code=400,
@@ -608,11 +609,12 @@ async def _export_to_excel(filled_data: dict, template_id: str) -> StreamingResp
)
async def _export_to_word(filled_data: dict, template_id: str) -> StreamingResponse:
async def _export_to_word(filled_data: dict, template_id: str, filled_file_path: Optional[str] = None) -> StreamingResponse:
"""导出为 Word 格式"""
import re
import tempfile
import os
import urllib.parse
from docx import Document
from docx.shared import Pt, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH
@@ -623,12 +625,32 @@ async def _export_to_word(filled_data: dict, template_id: str) -> StreamingRespo
return ""
# 移除控制字符
text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)
# 转义 XML 特殊字符以防破坏文档结构
text = text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
return text.strip()
tmp_path = None
try:
# 先保存到临时文件,再读取到内存,确保文档完整性
with tempfile.NamedTemporaryFile(delete=False, suffix='.docx') as tmp_file:
tmp_path = tmp_file.name
# 如果有已填写的文件(通过 _fill_docx 填写了模板单元格),直接返回该文件
if filled_file_path and os.path.exists(filled_file_path):
filename = os.path.basename(filled_file_path)
with open(filled_file_path, 'rb') as f:
file_content = f.read()
output = io.BytesIO(file_content)
encoded_filename = urllib.parse.quote(filename)
return StreamingResponse(
output,
media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
headers={
"Content-Disposition": f"attachment; filename*=UTF-8''{encoded_filename}",
"Content-Length": str(len(file_content))
}
)
# 没有已填写文件,创建新的 Word 文档(表格形式)
# 创建临时文件(立即关闭句柄,避免 Windows 文件锁问题)
tmp_fd, tmp_path = tempfile.mkstemp(suffix='.docx')
os.close(tmp_fd) # 关闭立即得到的 fd让 docx 可以写入
doc = Document()
doc.add_heading('填写结果', level=1)
@@ -670,19 +692,23 @@ async def _export_to_word(filled_data: dict, template_id: str) -> StreamingRespo
finally:
# 清理临时文件
if os.path.exists(tmp_path):
if tmp_path and os.path.exists(tmp_path):
try:
os.unlink(tmp_path)
except:
except Exception:
pass
output = io.BytesIO(file_content)
filename = "filled_template.docx"
encoded_filename = urllib.parse.quote(filename)
return StreamingResponse(
output,
media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
headers={"Content-Disposition": f"attachment; filename*=UTF-8''{filename}"}
headers={
"Content-Disposition": f"attachment; filename*=UTF-8''{encoded_filename}",
"Content-Length": str(len(file_content))
}
)

View File

@@ -64,6 +64,11 @@ class MongoDB:
"""任务集合 - 存储任务历史记录"""
return self.db["tasks"]
@property
def conversations(self):
    """Conversation collection - stores conversation history records."""
    return self.db["conversations"]
# ==================== 文档操作 ====================
async def insert_document(
@@ -117,14 +122,20 @@ class MongoDB:
搜索文档
Args:
query: 搜索关键词
query: 搜索关键词(支持文件名和内容搜索)
doc_type: 文档类型过滤
limit: 返回数量
Returns:
文档列表
"""
filter_query = {"content": {"$regex": query}}
filter_query = {
"$or": [
{"content": {"$regex": query, "$options": "i"}},
{"metadata.original_filename": {"$regex": query, "$options": "i"}},
{"metadata.filename": {"$regex": query, "$options": "i"}},
]
}
if doc_type:
filter_query["doc_type"] = doc_type
@@ -141,6 +152,15 @@ class MongoDB:
result = await self.documents.delete_one({"_id": ObjectId(doc_id)})
return result.deleted_count > 0
async def update_document_metadata(self, doc_id: str, metadata: Dict[str, Any]) -> bool:
    """
    Overwrite the ``metadata`` field of a stored document.

    Args:
        doc_id: String form of the document's ObjectId.
        metadata: Complete metadata dict that replaces the stored one.

    Returns:
        True when the update modified a document.
    """
    from bson import ObjectId

    selector = {"_id": ObjectId(doc_id)}
    change = {"$set": {"metadata": metadata}}
    outcome = await self.documents.update_one(selector, change)
    return outcome.modified_count > 0
# ==================== RAG 索引操作 ====================
async def insert_rag_entry(
@@ -251,6 +271,10 @@ class MongoDB:
await self.tasks.create_index("task_id", unique=True)
await self.tasks.create_index("created_at")
# 对话集合索引
await self.conversations.create_index("conversation_id")
await self.conversations.create_index("created_at")
logger.info("MongoDB 索引创建完成")
# ==================== 任务历史操作 ====================
@@ -369,6 +393,108 @@ class MongoDB:
result = await self.tasks.delete_one({"task_id": task_id})
return result.deleted_count > 0
# ==================== 对话历史操作 ====================
async def insert_conversation(
    self,
    conversation_id: str,
    role: str,
    content: str,
    intent: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> str:
    """
    Store one conversation message.

    Args:
        conversation_id: Conversation session ID.
        role: Role of the speaker (user/assistant).
        content: Message text.
        intent: Intent type.
        metadata: Extra metadata; stored as an empty dict when omitted.

    Returns:
        ID of the inserted record, as a string.
    """
    extra = metadata or {}
    record = dict(
        conversation_id=conversation_id,
        role=role,
        content=content,
        intent=intent,
        metadata=extra,
        created_at=datetime.utcnow(),
    )
    inserted = await self.conversations.insert_one(record)
    return str(inserted.inserted_id)
async def get_conversation_history(
    self,
    conversation_id: str,
    limit: int = 20,
) -> List[Dict[str, Any]]:
    """
    Return the most recent messages of a conversation, oldest first.

    Args:
        conversation_id: Conversation session ID.
        limit: Maximum number of messages to return.

    Returns:
        The latest ``limit`` messages in chronological order. ``_id`` is
        converted to ``str`` and ``created_at`` to an ISO-8601 string.
    """
    # Sort newest-first BEFORE limiting so the window always holds the latest
    # turns. Sorting ascending and then limiting would freeze the result on
    # the first `limit` messages once a conversation grows past the limit.
    # `_id` breaks ties between messages sharing a `created_at` value.
    cursor = (
        self.conversations.find({"conversation_id": conversation_id})
        .sort([("created_at", -1), ("_id", -1)])
        .limit(limit)
    )

    messages: List[Dict[str, Any]] = []
    async for msg in cursor:
        msg["_id"] = str(msg["_id"])
        if msg.get("created_at"):
            msg["created_at"] = msg["created_at"].isoformat()
        messages.append(msg)

    # Callers expect chronological order (oldest first).
    messages.reverse()
    return messages
async def delete_conversation(self, conversation_id: str) -> bool:
    """Delete every message of a conversation; True if anything was removed."""
    selector = {"conversation_id": conversation_id}
    outcome = await self.conversations.delete_many(selector)
    return outcome.deleted_count > 0
async def list_conversations(
    self,
    limit: int = 50,
    skip: int = 0,
) -> List[Dict[str, Any]]:
    """
    List conversations, ordered by their most recent message.

    Args:
        limit: Number of conversations to return.
        skip: Number of conversations to skip.

    Returns:
        One record per conversation: its latest message, with ``_id`` as a
        string and ``created_at`` as an ISO-8601 string.
    """
    # Newest messages first, so `$first` in the group stage picks each
    # conversation's latest message.
    newest_first = {"$sort": {"created_at": -1}}
    latest_per_conversation = {
        "$group": {
            "_id": "$conversation_id",
            "last_message": {"$first": "$$ROOT"},
        }
    }
    stages = [
        newest_first,
        latest_per_conversation,
        {"$replaceRoot": {"newRoot": "$last_message"}},
        {"$sort": {"created_at": -1}},
        {"$skip": skip},
        {"$limit": limit},
    ]

    sessions = []
    async for entry in self.conversations.aggregate(stages):
        entry["_id"] = str(entry["_id"])
        stamp = entry.get("created_at")
        if stamp:
            entry["created_at"] = stamp.isoformat()
        sessions.append(entry)
    return sessions
# ==================== 全局单例 ====================

View File

@@ -44,6 +44,22 @@ class DocxParser(BaseParser):
error=f"文件不存在: {file_path}"
)
# 尝试使用 python-docx 解析,失败则使用备用方法
try:
return self._parse_with_docx(path)
except Exception as e:
logger.warning(f"python-docx 解析失败,使用备用方法: {e}")
try:
return self._parse_fallback(path)
except Exception as fallback_error:
logger.error(f"备用解析方法也失败: {fallback_error}")
return ParseResult(
success=False,
error=f"解析 Word 文档失败: {str(e)}"
)
def _parse_with_docx(self, path: Path) -> ParseResult:
"""使用 python-docx 解析文档"""
# 检查文件扩展名
if path.suffix.lower() not in self.supported_extensions:
return ParseResult(
@@ -51,9 +67,8 @@ class DocxParser(BaseParser):
error=f"不支持的文件类型: {path.suffix}"
)
try:
# 读取 Word 文档
doc = Document(file_path)
doc = Document(path)
# 提取文本内容
paragraphs = []
@@ -107,43 +122,123 @@ class DocxParser(BaseParser):
metadata = {
"filename": path.name,
"extension": path.suffix.lower(),
"file_size": path.stat().st_size,
"paragraph_count": len(paragraphs),
"table_count": len(tables_data),
"word_count": len(full_text),
"char_count": len(full_text.replace("\n", "")),
"has_tables": len(tables_data) > 0,
"has_images": images_info.get("image_count", 0) > 0,
"image_count": images_info.get("image_count", 0)
}
# 返回结果
return ParseResult(
success=True,
data={
"content": full_text,
"paragraphs": paragraphs_text,
"paragraphs": paragraphs,
"paragraphs_with_style": paragraphs,
"tables": tables_data,
"images": images_info,
"word_count": len(full_text),
"structured_data": {
"paragraphs": paragraphs,
"paragraphs_text": paragraphs_text,
"tables": tables_data,
"images": images_info
}
},
metadata=metadata
)
except Exception as e:
logger.error(f"解析 Word 文档失败: {str(e)}")
def _parse_fallback(self, path: Path) -> ParseResult:
    """
    Fallback parser: read ``word/document.xml`` straight from the docx archive.

    The XML tree is walked structurally: a paragraph element becomes one
    body paragraph, and a table element becomes one table made of all of its
    rows. Text inside table cells is kept in the table only and is not
    repeated as body paragraphs.

    Args:
        path: Path of the .docx file.

    Returns:
        ParseResult with the same ``data`` keys the fallback has always
        produced, plus a ``structured_data`` entry holding paragraphs,
        paragraph texts, tables and image info. On failure, a ParseResult
        with ``success=False`` and an error message.
    """
    import zipfile
    from xml.etree import ElementTree as ET

    def local_name(node) -> str:
        # Compare local names only, so any namespace URI is accepted.
        tag = node.tag
        return tag.rsplit('}', 1)[-1] if isinstance(tag, str) else ''

    def joined_text(node) -> str:
        # Concatenate every text run found anywhere below the node.
        return ''.join(
            t.text for t in node.iter() if local_name(t) == 't' and t.text
        ).strip()

    def collect_cells(node, cells) -> None:
        # Cells may sit below wrapper elements, so descend until a cell is hit.
        for child in node:
            if local_name(child) == 'tc':
                cells.append(joined_text(child))
            else:
                collect_cells(child, cells)

    def collect_rows(node, rows) -> None:
        # Gather the rows of ONE table. Nested tables live inside cells and
        # are never entered here; their text is folded into the owning cell.
        for child in node:
            name = local_name(child)
            if name == 'tr':
                cells = []
                collect_cells(child, cells)
                if cells:
                    rows.append(cells)
            elif name != 'tbl':
                collect_rows(child, rows)

    try:
        with zipfile.ZipFile(path, 'r') as zf:
            if 'word/document.xml' not in zf.namelist():
                return ParseResult(success=False, error="无效的 docx 文件格式")

            xml_content = zf.read('word/document.xml')
            root = ET.fromstring(xml_content)

        paragraphs = []
        tables = []

        def walk(node) -> None:
            # Paragraphs and tables are consumed whole. A flat iteration over
            # all descendants would end the current table at the first
            # non-row element and split every row into its own table.
            for child in node:
                name = local_name(child)
                if name == 'p':
                    text = joined_text(child)
                    if text:
                        paragraphs.append({'text': text, 'style': 'Normal'})
                elif name == 'tbl':
                    rows = []
                    collect_rows(child, rows)
                    if rows:
                        tables.append({
                            'table_index': len(tables),
                            'rows': rows,
                            'row_count': len(rows),
                            'column_count': len(rows[0]),
                        })
                else:
                    walk(child)

        walk(root)

        # Build the flat text representation.
        paragraphs_text = [p["text"] for p in paragraphs]
        full_text_parts = ["【文档正文】"] + paragraphs_text

        if tables:
            full_text_parts.append("\n【文档表格】")
            for idx, table in enumerate(tables):
                full_text_parts.append(f"--- 表格 {idx + 1} ---")
                for row in table["rows"]:
                    full_text_parts.append(" | ".join(str(cell) for cell in row))

        full_text = "\n".join(full_text_parts)
        images_info = {"image_count": 0, "descriptions": []}

        return ParseResult(
            success=True,
            data={
                "content": full_text,
                "paragraphs": paragraphs,
                "paragraphs_with_style": paragraphs,
                "tables": tables,
                "images": images_info,
                # The document-processing code in this file reads tables from
                # data["structured_data"]; without this key, tables found by
                # the fallback would never be picked up.
                "structured_data": {
                    "paragraphs": paragraphs,
                    "paragraphs_text": paragraphs_text,
                    "tables": tables,
                    "images": images_info,
                },
            },
            metadata={
                "filename": path.name,
                "extension": path.suffix.lower(),
                "paragraph_count": len(paragraphs),
                "table_count": len(tables),
                "image_count": 0,
                "parse_method": "fallback_xml",
            },
        )

    except zipfile.BadZipFile:
        return ParseResult(success=False, error="无效的 ZIP/文档文件")
    except Exception as e:
        return ParseResult(success=False, error=f"备用解析失败: {str(e)}")
def extract_images_as_base64(self, file_path: str) -> List[Dict[str, str]]:
"""
提取 Word 文档中的所有图片,返回 base64 编码列表
@@ -197,6 +292,83 @@ class DocxParser(BaseParser):
logger.info(f"共提取 {len(images)} 张图片")
return images
def extract_text_from_images(self, file_path: str, lang: str = 'chi_sim+eng') -> Dict[str, Any]:
    """
    Run OCR over the images embedded in a Word document.

    Args:
        file_path: Path of the Word file.
        lang: Tesseract language code; Simplified Chinese + English
            (chi_sim+eng) by default.

    Returns:
        Dict with ``success``, ``image_count``, ``extracted_text`` and
        ``total_chars`` (plus ``error`` on failure). ``image_count`` is the
        number of images that produced non-empty text.
    """
    import zipfile
    from io import BytesIO
    from PIL import Image

    try:
        import pytesseract
    except ImportError:
        logger.warning("pytesseract 未安装OCR 功能不可用")
        return {
            "success": False,
            "error": "pytesseract 未安装,请运行: pip install pytesseract",
            "image_count": 0,
            "extracted_text": []
        }

    ocr_extensions = ('png', 'jpg', 'jpeg', 'gif', 'bmp')
    report = {
        "success": True,
        "image_count": 0,
        "extracted_text": [],
        "total_chars": 0
    }

    try:
        with zipfile.ZipFile(file_path, 'r') as archive:
            # Embedded images live under word/media/ inside the archive.
            media_members = [
                member for member in archive.namelist()
                if member.startswith('word/media/')
            ]

            # `position` counts every media member, including skipped ones.
            for position, member in enumerate(media_members):
                suffix = member.rsplit('.', 1)[-1].lower()
                if suffix not in ocr_extensions:
                    continue

                try:
                    raw_bytes = archive.read(member)
                    picture = Image.open(BytesIO(raw_bytes))
                    recognized = pytesseract.image_to_string(picture, lang=lang).strip()

                    if recognized:
                        report["extracted_text"].append({
                            "image_index": position,
                            "filename": member,
                            "text": recognized,
                            "char_count": len(recognized)
                        })
                        report["total_chars"] += len(recognized)
                        logger.info(f"图片 {member} OCR 识别完成,提取 {len(recognized)} 字符")

                except Exception as exc:
                    # One unreadable image must not abort the whole document.
                    logger.warning(f"图片 {member} OCR 识别失败: {str(exc)}")

            report["image_count"] = len(report["extracted_text"])

    except zipfile.BadZipFile:
        report["success"] = False
        report["error"] = "无效的 Word 文档文件"
    except Exception as exc:
        report["success"] = False
        report["error"] = f"OCR 处理失败: {str(exc)}"

    return report
def extract_key_sentences(self, text: str, max_sentences: int = 10) -> List[str]:
"""
从文本中提取关键句子

View File

@@ -5,9 +5,10 @@
"""
import logging
import json
import re
from typing import Any, Dict, List, Optional
from app.services.template_fill_service import template_fill_service
from app.services.template_fill_service import template_fill_service, TemplateField
from app.services.rag_service import rag_service
from app.services.markdown_ai_service import markdown_ai_service
from app.core.database import mongodb
@@ -15,6 +16,31 @@ from app.core.database import mongodb
logger = logging.getLogger(__name__)
def _extract_filenames_from_text(text: str) -> List[str]:
"""
从指令文本中提取文件名列表。
智能处理用''/''/'、分隔的多个文件名(尤其是带年号的统计公报)。
"""
# 先去掉"对比这两个文档"等引导语,只保留文件名部分
text = re.sub(r'^(?:对比|比较)这两个?文档[的差异]?[:]?', '', text).strip()
text = re.sub(r'两个文档.*$', '', text).strip()
if not text:
return []
# 直接查找所有带扩展名的文件名模式
results = []
for m in re.finditer(r'[^\s、和与]+(?=\.(?:docx|xlsx|md|txt))', text):
start = m.start()
ext_match = re.search(r'\.(?:docx|xlsx|md|txt)', text[m.end():])
if ext_match:
fn = text[start:m.end() + ext_match.end()]
if fn:
results.append(fn)
return results
class InstructionExecutor:
"""指令执行器"""
@@ -41,9 +67,10 @@ class InstructionExecutor:
self.intent_parser = intent_parser
context = context or {}
context["instruction"] = instruction # 保存原始指令以便后续使用
# 解析意图
intent, params = await self.intent_parser.parse(instruction)
# 解析意图(传递对话历史上下文)
intent, params = await self.intent_parser.parse(instruction, context)
# 根据意图类型执行相应操作
if intent == "extract":
@@ -72,18 +99,48 @@ class InstructionExecutor:
async def _execute_extract(self, params: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]:
"""执行信息提取"""
try:
target_fields = params.get("field_refs", [])
# target_fields 来自意图解析field_refs 来自引号/字段关键词匹配
target_fields = params.get("target_fields", []) or params.get("field_refs", [])
doc_ids = params.get("document_refs", [])
instruction_text = context.get("instruction", "")
# 如果没有指定文档,尝试按文件名精确搜索
if not doc_ids or "all_docs" in doc_ids:
if instruction_text:
import re
# 提取引号内的内容或文件名
filename_match = re.search(r'["""]([^"""]+)["""]', instruction_text)
if filename_match:
search_term = filename_match.group(1)
else:
match = re.search(r'([^\s]+\.(?:docx|xlsx|md|txt))', instruction_text)
search_term = match.group(1) if match else None
if search_term:
logger.info(f"提取时搜索文档: {search_term}")
searched_docs = await mongodb.search_documents(search_term, limit=5)
if searched_docs:
# 优先选择文件名完全匹配的文档
best_docs = [
d for d in searched_docs
if search_term.lower() in d.get("metadata", {}).get("original_filename", "").lower()
]
if not best_docs:
best_docs = [searched_docs[0]]
context["source_docs"] = best_docs
doc_ids = [doc.get("_id", "") for doc in best_docs]
logger.info(f"找到 {len(best_docs)} 个文档用于提取,最佳: {best_docs[0].get('metadata', {}).get('original_filename', '?')}")
if not target_fields:
return {
"success": False,
"intent": "extract",
"error": "未指定要提取的字段",
"message": "请明确说明要提取哪些字段,如:'提取医院数量和床位数'"
}
# 如果指定了文档,验证文档存在
if doc_ids and "all_docs" not in doc_ids:
# 如果指定了文档且还没有加载 source_docs则验证并加载
if doc_ids and "all_docs" not in doc_ids and not context.get("source_docs"):
valid_docs = []
for doc_ref in doc_ids:
doc_id = doc_ref.replace("doc_", "")
@@ -93,20 +150,22 @@ class InstructionExecutor:
if not valid_docs:
return {
"success": False,
"intent": "extract",
"error": "指定的文档不存在",
"message": "请检查文档编号是否正确"
}
context["source_docs"] = valid_docs
# 构建字段列表
fields = []
for i, field_name in enumerate(target_fields):
fields.append({
"name": field_name,
"cell": f"A{i+1}",
"field_type": "text",
"required": False
})
# 构建字段列表(使用 TemplateField dataclass
fields = [
TemplateField(
name=field_name,
cell=f"A{i+1}",
field_type="text",
required=False
)
for i, field_name in enumerate(target_fields)
]
# 调用填表服务
result = await template_fill_service.fill_template(
@@ -143,7 +202,7 @@ class InstructionExecutor:
}
# 获取源文档
source_docs = context.get("source_docs", [])
source_docs = context.get("source_docs", []) or []
source_doc_ids = [doc.get("_id") for doc in source_docs if doc.get("_id")]
# 获取字段
@@ -175,36 +234,103 @@ class InstructionExecutor:
}
async def _execute_summarize(self, params: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]:
"""执行摘要总结"""
"""执行摘要总结 - 使用 LLM 生成真实摘要"""
try:
docs = context.get("source_docs", [])
import re
docs = context.get("source_docs", []) or []
instruction_text = context.get("instruction", "")
# 从指令中提取文件名/关键词,优先搜索精确文档
search_term = None
if instruction_text:
filename_match = re.search(r'["""]([^"""]+)["""]', instruction_text)
if filename_match:
search_term = filename_match.group(1)
else:
file_match = re.search(r'([^\s,]+\.(?:docx|xlsx|md|txt))', instruction_text)
if file_match:
search_term = file_match.group(1)
# 如果没有文档或有更精确的搜索词,尝试重新搜索
if not docs or search_term:
if search_term:
logger.info(f"按关键词搜索文档: {search_term}")
searched_docs = await mongodb.search_documents(search_term, limit=5)
if searched_docs:
# 优先使用文件名最匹配的文档
docs = sorted(
searched_docs,
key=lambda d: 1 if search_term.lower() in d.get("metadata", {}).get("original_filename", "").lower() else 0,
reverse=True
)
logger.info(f"找到 {len(docs)} 个文档,最佳匹配: {docs[0].get('metadata', {}).get('original_filename', '?')}")
if not docs:
return {
"success": False,
"error": "没有可用的文档",
"message": "请先上传要总结的文档"
"success": True,
"intent": "summarize",
"action_needed": "provide_document",
"message": "我理解了,您想分析文档内容。",
"suggestion": "请提供已上传文档的名称(可以是文件名或部分名称),或者上传您想要分析的文档。\n\n支持的格式docx、xlsx、md、txt\n\n例如:'分析2021年民政事业发展统计公报''总结卫生健康数据'"
}
summaries = []
for doc in docs[:5]: # 最多处理5个文档
content = doc.get("content", "")[:5000] # 限制内容长度
if content:
summaries.append({
"filename": doc.get("metadata", {}).get("original_filename", "未知"),
"content_preview": content[:500] + "..." if len(content) > 500 else content
})
# 对第一个(最佳匹配)文档生成 AI 摘要
primary_doc = docs[0]
content = primary_doc.get("content", "")
filename = primary_doc.get("metadata", {}).get("original_filename", "未知文档")
if not content:
return {
"success": False,
"intent": "summarize",
"error": "文档内容为空",
"message": f"文档 {filename} 没有可供分析的文本内容"
}
# 使用 LLM 生成摘要
content_for_summary = content[:12000] # 最多取前 12000 字
user_request = instruction_text or "请总结这份文档"
prompt = f"""请对以下文档进行全面、有条理的摘要分析。
文档名称:{filename}
用户要求:{user_request}
文档内容:
{content_for_summary}
请按以下格式输出摘要:
1. **文档概述**简述文档主题和背景2-3句
2. **主要内容**:列出文档的核心数据和关键信息(用要点列出)
3. **重要数据**:提取文档中的重要数字、统计数据
4. **主要结论**:归纳文档的主要结论或趋势
要求:条理清晰,数据准确,不要遗漏关键信息。"""
from app.services.llm_service import llm_service
messages = [
{"role": "system", "content": "你是一个专业的文档分析助手,擅长提取关键信息并生成结构化摘要。"},
{"role": "user", "content": prompt}
]
response = await llm_service.chat(messages=messages, temperature=0.3, max_tokens=2000)
ai_summary = llm_service.extract_message_content(response)
return {
"success": True,
"intent": "summarize",
"summaries": summaries,
"message": f"找到 {len(summaries)} 个文档可供参考"
"ai_summary": ai_summary,
"filename": filename,
"doc_id": primary_doc.get("_id", ""),
"total_docs_found": len(docs),
"message": f"已生成文档摘要"
}
except Exception as e:
logger.error(f"摘要执行失败: {e}")
return {
"success": False,
"intent": "summarize",
"error": str(e),
"message": f"摘要生成失败: {str(e)}"
}
@@ -213,17 +339,39 @@ class InstructionExecutor:
"""执行问答"""
try:
question = params.get("question", "")
instruction_text = context.get("instruction", "")
if not question:
return {
"success": False,
"intent": "question",
"error": "未提供问题",
"message": "请输入要回答的问题"
}
# 使用 RAG 检索相关文档
docs = context.get("source_docs", [])
rag_results = []
docs = context.get("source_docs", []) or []
# 如果没有文档,尝试从指令中提取文件名搜索
if not docs:
filename_match = re.search(r'["""]([^"""]+\.(?:docx|xlsx|md|txt))["""]', instruction_text)
if not filename_match:
filename_match = re.search(r'([^\s]+\.(?:docx|xlsx|md|txt))', instruction_text)
if filename_match:
found = await mongodb.search_documents(filename_match.group(1), limit=5)
if found:
docs = found
if not docs:
return {
"success": True,
"intent": "question",
"question": question,
"answer": None,
"message": "请先上传文档,我才能回答您的问题"
}
# 使用 RAG 检索相关文档
rag_results = []
for doc in docs:
doc_id = doc.get("_id", "")
if doc_id:
@@ -241,12 +389,42 @@ class InstructionExecutor:
doc.get("content", "")[:3000] for doc in docs[:3] if doc.get("content")
])
if not context_text:
return {
"success": True,
"intent": "question",
"question": question,
"context_preview": context_text[:500] + "..." if len(context_text) > 500 else context_text,
"message": "已找到相关上下文,可进行问答"
"answer": None,
"message": "文档内容为空,无法回答问题"
}
# 使用 LLM 生成答案
filename = docs[0].get("metadata", {}).get("original_filename", "文档")
prompt = f"""基于以下文档内容,回答用户的问题。
文档名称:{filename}
用户问题:{question}
文档内容:
{context_text[:8000]}
请根据文档内容准确回答问题。如果文档中没有相关信息,请明确说明。"""
from app.services.llm_service import llm_service
messages = [
{"role": "system", "content": "你是一个专业的文档问答助手,根据提供的内容准确回答用户问题。"},
{"role": "user", "content": prompt}
]
response = await llm_service.chat(messages=messages, temperature=0.3, max_tokens=1500)
answer = llm_service.extract_message_content(response)
return {
"success": True,
"intent": "question",
"question": question,
"answer": answer,
"filename": filename,
"message": "已生成回答"
}
except Exception as e:
@@ -299,12 +477,53 @@ class InstructionExecutor:
async def _execute_compare(self, params: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]:
"""执行对比分析"""
try:
docs = context.get("source_docs", [])
docs = context.get("source_docs", []) or []
instruction_text = context.get("instruction", "")
# 优先从指令中提取具体的文件名
filenames = _extract_filenames_from_text(instruction_text)
if filenames:
# 只选择文件名匹配的那些文档
matched_docs = []
for doc in docs:
fname = doc.get("metadata", {}).get("original_filename", "").lower()
for fn in filenames:
if fn.lower() in fname or fname in fn.lower():
matched_docs.append(doc)
break
# 如果匹配到足够文档,用匹配的
if len(matched_docs) >= 2:
docs = matched_docs
else:
# 匹配不够,尝试按文件名搜索 MongoDB
all_found = []
for fn in filenames:
found = await mongodb.search_documents(fn, limit=5)
all_found.extend(found)
seen = set()
unique_docs = []
for d in all_found:
did = d.get("_id", "")
if did and did not in seen:
seen.add(did)
unique_docs.append(d)
if len(unique_docs) >= 2:
docs = unique_docs
elif len(unique_docs) == 1 and len(docs) >= 1:
# 找到一个指定的 + 用一个通用的
docs = unique_docs + docs[:1]
elif docs and len(filenames) == 1:
# 找到一个指定文件名但只有一个匹配,尝试补充
docs = unique_docs + [d for d in docs if d not in unique_docs]
docs = docs[:2]
if len(docs) < 2:
return {
"success": False,
"intent": "compare",
"error": "对比需要至少2个文档",
"message": "请上传至少2个文档进行对比"
"message": "请上传至少2个文档进行对比,或明确说出要对比的文档名称"
}
# 提取文档基本信息
@@ -329,6 +548,7 @@ class InstructionExecutor:
logger.error(f"对比执行失败: {e}")
return {
"success": False,
"intent": "compare",
"error": str(e),
"message": f"对比分析失败: {str(e)}"
}
@@ -336,10 +556,23 @@ class InstructionExecutor:
async def _execute_edit(self, params: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]:
"""执行文档编辑操作"""
try:
docs = context.get("source_docs", [])
docs = context.get("source_docs", []) or []
instruction_text = context.get("instruction", "")
# 如果没有文档,尝试从指令中提取文件名搜索
if not docs:
filename_match = re.search(r'["""]([^"""]+\.(?:docx|xlsx|md|txt))["""]', instruction_text)
if not filename_match:
filename_match = re.search(r'([^\s]+\.(?:docx|xlsx|md|txt))', instruction_text)
if filename_match:
found = await mongodb.search_documents(filename_match.group(1), limit=3)
if found:
docs = found
if not docs:
return {
"success": False,
"intent": "edit",
"error": "没有可用的文档",
"message": "请先上传要编辑的文档"
}
@@ -405,7 +638,7 @@ class InstructionExecutor:
- Word -> Markdown
"""
try:
docs = context.get("source_docs", [])
docs = context.get("source_docs", []) or []
if not docs:
return {
"success": False,

View File

@@ -28,7 +28,7 @@ class IntentParser:
INTENT_KEYWORDS = {
INTENT_EXTRACT: ["提取", "抽取", "获取", "找出", "查找", "识别", "找到"],
INTENT_FILL_TABLE: ["填表", "填写", "填充", "录入", "导入到表格", "填写到"],
INTENT_SUMMARIZE: ["总结", "摘要", "概括", "概述", "归纳", "提炼"],
INTENT_SUMMARIZE: ["总结", "摘要", "概括", "概述", "归纳", "提炼", "分析", "聊聊"],
INTENT_QUESTION: ["问答", "回答", "解释", "什么是", "为什么", "如何", "怎样", "多少", "几个"],
INTENT_SEARCH: ["搜索", "查找", "检索", "查询", ""],
INTENT_COMPARE: ["对比", "比较", "差异", "区别", "不同"],
@@ -47,12 +47,13 @@ class IntentParser:
def __init__(self):
self.intent_history: List[Dict[str, Any]] = []
async def parse(self, text: str) -> Tuple[str, Dict[str, Any]]:
async def parse(self, text: str, context: Dict[str, Any] = None) -> Tuple[str, Dict[str, Any]]:
"""
解析自然语言指令
Args:
text: 用户输入的自然语言
context: 执行上下文(包含对话历史等)
Returns:
(意图类型, 参数字典)
@@ -61,11 +62,17 @@ class IntentParser:
if not text:
return self.INTENT_UNKNOWN, {}
# 检查对话历史中的上下文
conversation_history = []
if context and context.get("conversation_history"):
conversation_history = context.get("conversation_history", [])
logger.info(f"解析时使用对话历史: {len(conversation_history)} 条消息")
# 记录历史
self.intent_history.append({"text": text, "intent": None})
# 识别意图
intent = self._recognize_intent(text)
# 识别意图(考虑对话上下文)
intent = self._recognize_intent_with_context(text, conversation_history)
# 提取参数
params = self._extract_params(text, intent)
@@ -78,6 +85,42 @@ class IntentParser:
return intent, params
def _recognize_intent_with_context(self, text: str, conversation_history: List[Dict[str, Any]]) -> str:
"""
基于对话历史识别意图
Args:
text: 当前用户输入
conversation_history: 对话历史
Returns:
意图类型
"""
# 如果对话历史为空,使用基础意图识别
if not conversation_history:
return self._recognize_intent(text)
# 基于历史上下文进行意图识别
# 分析最近的对话了解用户意图的延续性
last_intent = None
last_topic = None
for msg in conversation_history[-5:]: # 最多看最近5条消息
if msg.get("role") == "assistant":
last_intent = msg.get("intent")
if msg.get("intent") and msg.get("intent") != "unknown":
last_topic = msg.get("intent")
# 如果当前消息很短(如"继续"、"是的"),可能延续之前的意图
short_confirmation = ["", "是的", "", "继续", "ok", "", "接着", "然后", "还有吗"]
if text.strip() in short_confirmation or len(text.strip()) <= 3:
if last_topic:
logger.info(f"简短确认,延续之前的意图: {last_topic}")
return last_topic
# 否则使用标准意图识别
return self._recognize_intent(text)
def _recognize_intent(self, text: str) -> str:
"""识别意图类型"""
intent_scores: Dict[str, float] = {}
@@ -214,18 +257,27 @@ class IntentParser:
return template_info if template_info else None
def _extract_target_fields(self, text: str) -> List[str]:
"""提取目标字段"""
"""提取目标字段 - 按分隔符切分再逐段清理"""
fields = []
# 匹配 "提取XXX和YYY"、"抽取XXX、YYY"
patterns = [
r"提取([^(and|,|)+]+?)(?:和|与|、|,|plus)",
r"抽取([^(and|,|)+]+?)(?:和|与|、|,|plus)",
]
# 去除提取/抽取前缀
cleaned_text = re.sub(r"^(?:提取|抽取)", "", text).strip()
for pattern in patterns:
matches = re.findall(pattern, text)
fields.extend([m.strip() for m in matches if m.strip()])
# 按'和'、'与'、'、'分割成多段
segments = re.split(r"[和与、]", cleaned_text)
# 常见前缀(这些不是字段名,需要去除)
prefixes = ["文档中的", "文档中", "文件中的", "文件中", "内容中的", "内容中"]
for seg in segments:
seg = seg.strip()
# 去除常见前缀
for p in prefixes:
if seg.startswith(p):
seg = seg[len(p):]
break
if seg and 2 <= len(seg) <= 20:
fields.append(seg)
return list(set(fields))

View File

@@ -526,9 +526,10 @@ class ExcelStorageService:
# 创建表
model_class = self._create_table_model(table_name, columns, column_types)
# 创建表结构
# 创建表结构 (使用异步方式)
async with self.mysql_db.get_session() as session:
model_class.__table__.create(session.bind, checkfirst=True)
async with session.bind.begin() as conn:
await conn.run_sync(lambda: model_class.__table__.create(checkfirst=True))
# 插入数据
records = []

View File

@@ -165,9 +165,9 @@ class BM25:
class RAGService:
"""RAG 检索增强服务"""
# 默认分块参数
DEFAULT_CHUNK_SIZE = 500 # 每个文本块的大小(字符数)
DEFAULT_CHUNK_OVERLAP = 50 # 块之间的重叠(字符数)
# 默认分块参数 - 增大块大小减少embedding次数
DEFAULT_CHUNK_SIZE = 1000 # 每个文本块的大小(字符数),增大以提升速度
DEFAULT_CHUNK_OVERLAP = 100 # 块之间的重叠(字符数)
def __init__(self):
self.embedding_model = None
@@ -389,6 +389,70 @@ class RAGService:
self._add_documents(documents, chunk_ids)
logger.info(f"已索引文档 {doc_id},共 {len(chunks)} 个块")
async def index_document_content_async(
    self,
    doc_id: str,
    content: str,
    metadata: Optional[Dict[str, Any]] = None,
    chunk_size: int = None,
    chunk_overlap: int = None
):
    """
    Split ``content`` into chunks and add them to the vector index, running
    the add step through ``asyncio.to_thread``.

    Args:
        doc_id: Identifier of the document; chunk ids are ``{doc_id}_chunk_{i}``.
        content: Full document text to split.
        metadata: Optional metadata copied onto every chunk.
        chunk_size: Chunk size; ``DEFAULT_CHUNK_SIZE`` when ``None``.
        chunk_overlap: Chunk overlap; ``DEFAULT_CHUNK_OVERLAP`` when ``None``.

    Returns:
        ``None``. Nothing is indexed when the service is disabled, when no
        embedding model is set, or when splitting yields no chunks.
    """
    import asyncio

    if self._disabled:
        logger.info(f"[RAG DISABLED] 文档索引操作已跳过: {doc_id}")
        return

    if not self._initialized:
        self._init_vector_store()

    if self.embedding_model is None:
        logger.debug(f"文档跳过索引 (无嵌入模型): {doc_id}")
        return

    # Fall back to the class defaults only for arguments left as None.
    size = self.DEFAULT_CHUNK_SIZE if chunk_size is None else chunk_size
    overlap = self.DEFAULT_CHUNK_OVERLAP if chunk_overlap is None else chunk_overlap

    pieces = self._split_into_chunks(content, size, overlap)
    if not pieces:
        logger.warning(f"文档内容为空,跳过索引: {doc_id}")
        return

    # One SimpleDocument per chunk; each carries its own copy of the metadata
    # plus its position within the document.
    total = len(pieces)
    documents = []
    chunk_ids = []
    for position, piece in enumerate(pieces):
        piece_metadata = metadata.copy() if metadata else {}
        piece_metadata.update({
            "chunk_index": position,
            "total_chunks": total,
            "doc_id": doc_id
        })
        documents.append(SimpleDocument(page_content=piece, metadata=piece_metadata))
        chunk_ids.append(f"{doc_id}_chunk_{position}")

    # Hand the add step to a worker thread instead of running it inline.
    await asyncio.to_thread(self._add_documents, documents, chunk_ids)
    logger.info(f"已异步索引文档 {doc_id},共 {len(pieces)} 个块")
def _add_documents(self, documents: List[SimpleDocument], doc_ids: List[str]):
"""批量添加文档到向量索引"""
if not documents:

View File

@@ -300,13 +300,15 @@ class TableRAGService:
filename: str,
sheet_name: Optional[str] = None,
header_row: int = 0,
sample_size: int = 10
sample_size: int = 10,
skip_rag_index: bool = False
) -> Dict[str, Any]:
"""
为 Excel 表构建完整的 RAG 索引
流程:
1. 读取 Excel 获取字段信息
2. 如果 skip_rag_index=True跳过 RAG 索引,直接存 MySQL
2. AI 生成每个字段的语义描述
3. 将字段描述存入向量数据库
@@ -367,6 +369,20 @@ class TableRAGService:
results["field_count"] = len(df.columns)
logger.info(f"表名: {table_name}, 字段数: {len(df.columns)}")
# 跳过 RAG 索引时直接存 MySQL
if skip_rag_index:
logger.info(f"跳过 RAG 索引,直接存储到 MySQL")
store_result = await self.excel_storage.store_excel(
file_path=file_path,
filename=filename,
sheet_name=sheet_name,
header_row=header_row
)
results["mysql_table"] = store_result.get("table_name") if store_result.get("success") else None
results["row_count"] = store_result.get("row_count", len(df))
results["indexed_count"] = 0
return results
# 3. 初始化 RAG (如果需要)
if not self.rag._initialized:
self.rag._init_vector_store()

View File

@@ -5,6 +5,7 @@
"""
import asyncio
import logging
import re
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional
@@ -13,6 +14,7 @@ from app.services.llm_service import llm_service
from app.core.document_parser import ParserFactory
from app.services.markdown_ai_service import markdown_ai_service
from app.services.rag_service import rag_service
from app.services.excel_storage_service import excel_storage_service
logger = logging.getLogger(__name__)
@@ -105,12 +107,60 @@ class TemplateFillService:
# 3. 检查是否需要使用源文档重新生成表头
# 条件:源文档已加载 AND 现有字段看起来是自动生成的(如"字段1"、"字段2"
# 注意Word 模板docx不自动重新生成表头因为 Word 模板的表结构由用户定义,必须保留
needs_regenerate_headers = (
template_file_type != "docx" and
len(source_docs) > 0 and
len(template_fields) > 0 and
all(self._is_auto_generated_field(f.name) for f in template_fields)
)
# 4. Word 模板特殊处理:表头为空时,从源文档生成字段
# 仅当有源文档、模板字段为空、模板文件类型为 docx 时触发
if not needs_regenerate_headers and template_file_type == "docx" and len(source_docs) > 0 and len(template_fields) == 0:
logger.info(f"Word 模板表头为空,从源文档生成字段... (source_docs={len(source_docs)})")
source_contents = []
for doc in source_docs:
structured = doc.structured_data if doc.structured_data else {}
titles = structured.get("titles", [])
tables = structured.get("tables", [])
tables_count = len(tables) if tables else 0
tables_summary = ""
if tables:
tables_summary = "\n【文档中的表格】:\n"
for idx, table in enumerate(tables[:5]):
if isinstance(table, dict):
headers = table.get("headers", [])
rows = table.get("rows", [])
if headers:
tables_summary += f"表格{idx+1}表头: {', '.join(str(h) for h in headers)}\n"
if rows:
tables_summary += f"表格{idx+1}前3行: "
for row_idx, row in enumerate(rows[:3]):
if isinstance(row, list):
tables_summary += " | ".join(str(c) for c in row) + "; "
elif isinstance(row, dict):
tables_summary += " | ".join(str(row.get(h, "")) for h in headers if headers) + "; "
tables_summary += "\n"
source_contents.append({
"filename": doc.filename,
"doc_type": doc.doc_type,
"content": doc.content[:5000] if doc.content else "",
"titles": titles[:10] if titles else [],
"tables_count": tables_count,
"tables_summary": tables_summary
})
if template_id:
generated_fields = await self.get_template_fields_from_file(
template_id,
template_file_type,
source_contents=source_contents,
source_docs=source_docs
)
if generated_fields:
template_fields = generated_fields
logger.info(f"Word 模板字段生成成功: {[f.name for f in template_fields]}")
if needs_regenerate_headers:
logger.info(f"检测到自动生成表头,尝试使用源文档重新生成... (当前字段: {[f.name for f in template_fields]})")
@@ -162,7 +212,8 @@ class TemplateFillService:
new_fields = await self.get_template_fields_from_file(
template_id,
template_file_type,
source_contents=source_contents
source_contents=source_contents,
source_docs=source_docs
)
if new_fields and len(new_fields) > 0:
logger.info(f"成功重新生成表头: {[f.name for f in new_fields]}")
@@ -224,14 +275,357 @@ class TemplateFillService:
max_rows = max(len(v) for v in filled_data.values()) if filled_data else 1
logger.info(f"填表完成: {len(filled_data)} 个字段, 最大行数: {max_rows}")
# 如果是 Word 模板,将数据填入模板文件
filled_file_path = None
if template_file_type == "docx" and template_id and filled_data:
filled_file_path = await self._fill_docx(template_id, filled_data)
if filled_file_path:
logger.info(f"Word 模板已填写,输出文件: {filled_file_path}")
return {
"success": True,
"filled_data": filled_data,
"fill_details": fill_details,
"source_doc_count": len(source_docs),
"max_rows": max_rows
"max_rows": max_rows,
"filled_file_path": filled_file_path
}
async def _polish_word_filled_data(
self,
filled_data: Dict[str, Any]
) -> Dict[str, str]:
"""
将提取的结构化数据尤其是多行Excel数据进行统计归纳
然后润色为自然语言文本
Args:
filled_data: {字段名: [原始值列表]}
Returns:
{字段名: 润色后的文本}
"""
if not filled_data:
return {}
try:
import json
# 第一步:对数值型多行数据进行统计分析
data_summary = []
for field_name, values in filled_data.items():
if not isinstance(values, list) or not values:
continue
# 过滤掉无效值
raw_values = []
for v in values:
if v and str(v).strip() and not str(v).startswith('[提取失败'):
raw_values.append(str(v).strip())
if not raw_values:
continue
# 尝试解析为数值进行统计
numeric_values = []
for v in raw_values:
# 提取数值(处理 "123个"、"78.5%"、"1,234" 等格式)
num_str = re.sub(r'[^\d.\-]', '', str(v))
try:
if num_str and num_str != '-' and num_str != '.':
numeric_values.append(float(num_str))
except ValueError:
pass
# 根据字段名判断类型
field_lower = field_name.lower()
is_count_field = any(kw in field_lower for kw in ['数量', '总数', '次数', '条数', '订单数', '记录数', '条目'])
is_amount_field = any(kw in field_lower for kw in ['金额', '总额', '合计', '总计', '销售额', '收入', '支出', '成本'])
is_ratio_field = any(kw in field_lower for kw in ['比率', '比例', '占比', '', '使用率', '增长', '增幅'])
is_name_field = any(kw in field_lower for kw in ['名称', '机构', '医院', '公司', '单位', '部门', '区域', '类别'])
if len(numeric_values) >= 2 and len(numeric_values) == len(raw_values):
# 多行数值数据,进行统计归纳
total = sum(numeric_values)
avg = total / len(numeric_values)
max_val = max(numeric_values)
min_val = min(numeric_values)
stats_lines = [
f"{field_name}】(共 {len(raw_values)} 条数据):",
f" - 合计: {self._format_number(total)}" if is_amount_field else f" - 合计: {total:.2f}",
f" - 平均: {avg:.2f}",
f" - 最大: {max_val:.2f}",
f" - 最小: {min_val:.2f}",
]
# 对原始值去重计数(如果是名称类字段)
if is_name_field:
unique_values = list(set(raw_values))
if len(unique_values) <= 10:
stats_lines.append(f" - 涉及类别(共 {len(unique_values)} 种): {''.join(unique_values[:8])}")
else:
stats_lines.append(f" - 涉及 {len(unique_values)} 个不同类别")
# 取前5个原始示例
stats_lines.append(f" - 示例值: {''.join(raw_values[:5])}")
data_summary.append('\n'.join(stats_lines))
elif is_ratio_field and len(numeric_values) == 1:
# 单值百分比
pct = numeric_values[0]
data_summary.append(f"{field_name}】: {pct:.1f}%,表示相关指标的相对水平")
elif is_amount_field and len(numeric_values) >= 1:
# 金额类(单位通常是万元/亿元)
total = sum(numeric_values)
unit = ""
if total >= 10000:
unit = f"(约 {total/10000:.2f} 万元)"
elif total >= 1:
unit = f"(约 {total:.2f} 元)"
data_summary.append(f"{field_name}】: 合计 {self._format_number(total)}{unit},基于 {len(raw_values)} 条记录汇总")
elif is_count_field and len(numeric_values) >= 1:
# 数量类
total = sum(numeric_values)
data_summary.append(f"{field_name}】: 共 {self._format_number(total)},基于 {len(raw_values)} 条记录汇总")
else:
# 无法归类的多值数据,做去重归纳
unique_values = list(set(raw_values))
if len(unique_values) <= 8:
data_summary.append(f"{field_name}】(共 {len(raw_values)} 条,去重后 {len(unique_values)} 项): {''.join(unique_values[:8])}")
elif len(raw_values) > 8:
data_summary.append(f"{field_name}】(共 {len(raw_values)} 条记录): {''.join(raw_values[:5])}")
else:
data_summary.append(f"{field_name}】: {''.join(raw_values)}")
if not data_summary:
return {k: (', '.join(str(v) for v in vals[:5]) if isinstance(vals, list) else str(vals))
for k, vals in filled_data.items()}
# 第二步:调用 LLM 将统计分析结果转化为专业自然语言描述
prompt = f"""你是一个专业的数据分析报告助手。请根据以下从文档中提取并统计的数据,生成专业、简洁的自然语言描述。
【数据统计结果】:
{chr(10).join(data_summary)}
【润色要求】:
1. 每个字段生成一段专业的描述性文本20-60字
2. 数值类字段要明确标注单位和含义,如"销售总额达1,234.5万元共涵盖56个订单"
3. 分类/名称类字段要归纳总结类别,如"涉及医疗器械、药品采购、设备维修等5个业务类别"
4. 多值数据不要简单罗列,要做总结,如"覆盖华东地区上海、江苏、浙江、华南地区广东等6个省市的销售网络"
5. 百分比/比率类要加背景说明,如"综合毛利率为23.5%,处于行业正常水平"
6. 保持文本通顺、专业,符合正式报告风格
7. 每段控制在60字以内
【输出格式】严格按JSON格式只返回JSON不要任何其他内容
{{
"字段名1": "润色后的描述文本1",
"字段名2": "润色后的描述文本2"
}}
"""
messages = [
{"role": "system", "content": "你是一个专业的数据分析报告助手。请严格按JSON格式输出只返回纯JSON不要任何其他内容。"},
{"role": "user", "content": prompt}
]
response = await self.llm.chat(
messages=messages,
temperature=0.3,
max_tokens=3000
)
content = self.llm.extract_message_content(response)
logger.info(f"LLM 润色 Word 数据返回: {content[:500]}")
# 尝试解析 JSON
json_match = re.search(r'\{[\s\S]*\}', content)
if json_match:
polished = json.loads(json_match.group())
logger.info(f"LLM 润色成功: {len(polished)} 个字段")
return polished
else:
logger.warning(f"LLM 返回无法解析为 JSON: {content[:200]}")
# 回退到原始统计摘要
return {k: (', '.join(str(v) for v in vals[:5]) if isinstance(vals, list) else str(vals))
for k, vals in filled_data.items()}
except Exception as e:
logger.error(f"LLM 润色失败: {str(e)}")
# 润色失败时回退到原始值
return {k: (', '.join(str(v) for v in vals[:5]) if isinstance(vals, list) else str(vals))
for k, vals in filled_data.items()}
def _format_number(self, num: float) -> str:
"""格式化数字,添加千分位"""
if abs(num) >= 10000:
return f"{num:,.2f}"
elif abs(num) >= 1:
return f"{num:,.2f}"
else:
return f"{num:.4f}"
async def _fill_docx(
    self,
    template_path: str,
    filled_data: Dict[str, Any]
) -> Optional[str]:
    """
    Write the extracted data into a copy of a Word template.

    The values are first rewritten by ``self._polish_word_filled_data``. For
    every table row whose first cell equals a field name, or where one is a
    prefix of the other, the text is written into the row's second cell.
    Fields that were not written into any table are appended to the end of the
    document as "field name / value / separator line" paragraphs.

    Args:
        template_path: Path of the Word template file. The template itself is
            not modified; a copy is edited.
        filled_data: ``{field name: [values]}``.

    Returns:
        Path of the filled copy, located in a newly created temporary
        directory, or ``None`` when anything fails (the temporary directory is
        removed in that case).
    """
    import re
    import os
    import tempfile
    import shutil
    from docx import Document
    from docx.oxml import OxmlElement
    from docx.oxml.ns import qn
    from docx.shared import Pt, RGBColor

    def clean_text(text: str) -> str:
        """Remove characters that are illegal or render as boxes in Word."""
        if not text:
            return ""
        # Control characters
        text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)
        # Replacement / box-like characters
        text = re.sub(r'[\ufffd\u25a1\u25a9\u2610\u2611\u25cb\u25c9]', '', text)
        # BOM, zero-width and directional formatting characters
        text = re.sub(r'[\ufeff\u200b-\u200f\u2028-\u202e]', '', text)
        return text.strip()

    def set_cell_text(cell, text: str):
        """Replace the cell's text and force every run to black."""
        cell.text = text
        for para in cell.paragraphs:
            for run in para.runs:
                run.font.color.rgb = RGBColor(0, 0, 0)

    def add_separator_after(para):
        """Insert an empty paragraph with a grey bottom border right after ``para``."""
        sep_para = OxmlElement('w:p')
        pPr = OxmlElement('w:pPr')
        pBdr = OxmlElement('w:pBdr')
        bottom = OxmlElement('w:bottom')
        bottom.set(qn('w:val'), 'single')
        bottom.set(qn('w:sz'), '6')
        bottom.set(qn('w:space'), '1')
        bottom.set(qn('w:color'), 'CCCCCC')
        pBdr.append(bottom)
        pPr.append(pBdr)
        sep_para.append(pPr)
        # addnext keeps the separator below the value; inserting it *before*
        # the value paragraph would put the line between name and value.
        para._element.addnext(sep_para)

    def add_field_section(doc, field_name: str, display_text: str):
        """Append one field: bold name paragraph, value paragraph, separator."""
        name_para = doc.add_paragraph()
        name_run = name_para.add_run(f"📌 {field_name}")
        name_run.bold = True
        name_run.font.size = Pt(11)
        name_run.font.color.rgb = RGBColor(0, 51, 102)
        name_para.paragraph_format.space_before = Pt(12)
        name_para.paragraph_format.space_after = Pt(3)

        value_para = doc.add_paragraph()
        value_run = value_para.add_run(display_text)
        value_run.font.size = Pt(10.5)
        value_run.font.color.rgb = RGBColor(51, 51, 51)
        value_para.paragraph_format.space_before = Pt(0)
        value_para.paragraph_format.space_after = Pt(6)

        add_separator_after(value_para)

    temp_dir = None
    try:
        # Polish the data with the LLM before writing it.
        logger.info(f"Word 填写前开始 LLM 润色 {len(filled_data)} 个字段...")
        polished = await self._polish_word_filled_data(filled_data)
        logger.info(f"LLM 润色完成,使用润色后文本写入 Word")

        # Everything below slices and writes the values as text, so normalise
        # once here instead of failing on a non-string value later.
        polished_data = {
            str(name): ("" if value is None else str(value))
            for name, value in polished.items()
        }

        # Work on a copy inside a private temp directory.
        temp_dir = tempfile.mkdtemp()
        output_path = os.path.join(temp_dir, "filled_template.docx")
        shutil.copy2(template_path, output_path)

        doc = Document(output_path)
        matched_fields = set()

        # Walk the tables: the first cell of a row names the field, the second
        # cell receives the value.
        for table in doc.tables:
            for row in table.rows:
                cells = row.cells
                if not cells:
                    continue

                first_cell_text = cells[0].text.strip()
                if not first_cell_text:
                    continue

                # A field only counts as matched when there is a value cell to
                # write into; otherwise it is appended as a paragraph later
                # instead of being lost.
                has_value_cell = len(cells) > 1

                # Exact match on the field name
                if first_cell_text in polished_data:
                    display_text = polished_data[first_cell_text]
                    if display_text and has_value_cell:
                        set_cell_text(cells[1], clean_text(display_text))
                        matched_fields.add(first_cell_text)
                        logger.info(f"Word 填写(精确): {first_cell_text} = {display_text[:50]}")
                    continue

                # Prefix match in either direction; the first candidate decides.
                for field_name, display_text in polished_data.items():
                    if field_name and (
                        field_name.startswith(first_cell_text) or first_cell_text.startswith(field_name)
                    ):
                        if display_text and has_value_cell:
                            set_cell_text(cells[1], clean_text(display_text))
                            matched_fields.add(field_name)
                            logger.info(f"Word 填写(模糊): {first_cell_text} ≈ {field_name} = {display_text[:50]}")
                        break

        # Fields that found no table row are appended as formatted paragraphs.
        unmatched_fields = [f for f in polished_data if f not in matched_fields]
        if unmatched_fields:
            logger.info(f"使用段落格式写入 {len(unmatched_fields)} 个字段(带分隔线)")
            for field_name in unmatched_fields:
                display_text = polished_data[field_name]
                if display_text:
                    add_field_section(doc, field_name, clean_text(display_text))
                    logger.info(f"Word 段落写入: {field_name} = {display_text[:60]}")

        doc.save(output_path)
        logger.info(f"Word 模板填写完成: {output_path}, 匹配字段: {len(matched_fields)}, 追加字段: {len(unmatched_fields)}")
        return output_path

    except Exception as e:
        logger.error(f"Word 模板填写失败: {str(e)}")
        # Do not leave a half-written copy behind.
        if temp_dir:
            shutil.rmtree(temp_dir, ignore_errors=True)
        return None
async def _load_source_documents(
self,
source_doc_ids: Optional[List[str]] = None,
@@ -257,10 +651,38 @@ class TemplateFillService:
if doc:
sd = doc.get("structured_data", {})
sd_keys = list(sd.keys()) if sd else []
logger.info(f"从MongoDB加载文档: {doc_id}, doc_type={doc.get('doc_type')}, structured_data keys={sd_keys}")
doc_type = doc.get("doc_type", "")
mysql_table_name = doc.get("metadata", {}).get("mysql_table_name")
logger.info(f"从MongoDB加载文档: {doc_id}, doc_type={doc_type}, structured_data keys={sd_keys}, mysql_table={mysql_table_name}")
# 如果 structured_data 为空,但有 file_path尝试重新解析文件
doc_content = doc.get("content", "")
# 如果是 Excel 类型且有 MySQL 表名,直接从 MySQL 加载数据
if doc_type in ["xlsx", "xls"] and mysql_table_name:
try:
logger.info(f" 从 MySQL 表 {mysql_table_name} 加载 Excel 数据")
mysql_data = await excel_storage_service.query_table(mysql_table_name, limit=1000)
if mysql_data:
# 转换为 SourceDocument 格式
if mysql_data and len(mysql_data) > 0:
columns = list(mysql_data[0].keys()) if mysql_data else []
rows = [[row.get(col) for col in columns] for row in mysql_data]
sd = {
"headers": columns,
"rows": rows,
"row_count": len(mysql_data),
"column_count": len(columns),
"source": "mysql"
}
logger.info(f" MySQL 数据加载成功: {len(mysql_data)} 行, {len(columns)}")
else:
logger.warning(f" MySQL 表 {mysql_table_name} 无数据")
else:
logger.warning(f" MySQL 表 {mysql_table_name} 查询无结果")
except Exception as mysql_err:
logger.error(f" MySQL 加载失败: {str(mysql_err)}")
# 如果 structured_data 仍然为空,尝试重新解析文件
if not sd or (not sd.get("tables") and not sd.get("headers") and not sd.get("rows")):
file_path = doc.get("metadata", {}).get("file_path")
if file_path:
@@ -294,7 +716,7 @@ class TemplateFillService:
source_docs.append(SourceDocument(
doc_id=doc_id,
filename=doc.get("metadata", {}).get("original_filename", "unknown"),
doc_type=doc.get("doc_type", "unknown"),
doc_type=doc_type,
content=doc_content,
structured_data=sd
))
@@ -1047,7 +1469,8 @@ class TemplateFillService:
self,
file_path: str,
file_type: str = "xlsx",
source_contents: List[dict] = None
source_contents: List[dict] = None,
source_docs: List["SourceDocument"] = None
) -> List[TemplateField]:
"""
从模板文件提取字段定义
@@ -1071,15 +1494,18 @@ class TemplateFillService:
fields = await self._get_template_fields_from_docx(file_path)
# 检查是否需要 AI 生成表头
# 条件:没有字段 OR 所有字段都是自动命名的(如"字段1"、"列1"、"Unnamed"开头)
# 条件:没有字段 OR 所有字段都是自动命名的
# 对于 docx仅当有源文档时才允许 AI 生成(避免覆盖用户定义的表头)
needs_ai_generation = (
len(fields) == 0 or
all(self._is_auto_generated_field(f.name) for f in fields)
(len(fields) == 0 or
all(self._is_auto_generated_field(f.name) for f in fields))
) and (
file_type != "docx" or len(source_contents) > 0
)
if needs_ai_generation:
logger.info(f"模板表头为空或自动生成,尝试 AI 生成表头... (fields={len(fields)}, source_docs={len(source_contents)})")
ai_fields = await self._generate_fields_with_ai(file_path, file_type, source_contents)
ai_fields = await self._generate_fields_with_ai(file_path, file_type, source_contents, source_docs)
if ai_fields:
fields = ai_fields
logger.info(f"AI 生成表头成功: {len(fields)} 个字段")
@@ -2134,7 +2560,8 @@ class TemplateFillService:
self,
file_path: str,
file_type: str,
source_contents: List[dict] = None
source_contents: List[dict] = None,
source_docs: List["SourceDocument"] = None
) -> Optional[List[TemplateField]]:
"""
使用 AI 为空表生成表头字段
@@ -2148,6 +2575,8 @@ class TemplateFillService:
Returns:
生成的字段列表,如果失败返回 None
"""
import random
try:
import pandas as pd
@@ -2182,24 +2611,21 @@ class TemplateFillService:
else:
content_sample = ""
# 调用 AI 生成表头
# 根据源文档内容生成表头
source_info = ""
logger.info(f"[DEBUG] _generate_fields_with_ai received source_contents: {len(source_contents) if source_contents else 0} items")
# 优先从源文档的表格表头中随机选取
if source_contents:
for sc in source_contents:
logger.info(f"[DEBUG] source doc: filename={sc.get('filename')}, content_len={len(sc.get('content', ''))}, titles={len(sc.get('titles', []))}, tables_count={sc.get('tables_count', 0)}, has_tables_summary={bool(sc.get('tables_summary'))}")
source_info = "\n\n【源文档内容摘要】(根据以下文档内容生成表头):\n"
import re
all_headers = []
source_info = ""
for idx, src in enumerate(source_contents[:5]): # 最多5个源文档
filename = src.get("filename", f"文档{idx+1}")
doc_type = src.get("doc_type", "unknown")
content = src.get("content", "")[:3000] # 限制内容长度
titles = src.get("titles", [])[:10] # 最多10个标题
content = src.get("content", "")[:3000]
titles = src.get("titles", [])[:10]
tables_count = src.get("tables_count", 0)
tables_summary = src.get("tables_summary", "")
source_info += f"\n--- 文档 {idx+1}: {filename} ({doc_type}) ---\n"
# 处理 titles可能是字符串列表或字典列表
if titles:
title_texts = []
for t in titles[:5]:
@@ -2216,6 +2642,72 @@ class TemplateFillService:
if content:
source_info += f"【文档内容】前3000字符{content[:3000]}\n"
# 从 tables_summary 中提取表头
# 表格摘要格式如: "表格1表头: 姓名, 年龄, 性别"
if tables_summary:
header_matches = re.findall(r'表头:\s*([^\n]+)', tables_summary)
for match in header_matches:
# 分割表头字符串
headers = [h.strip() for h in match.split(',') if h.strip()]
all_headers.extend(headers)
logger.info(f"从表格摘要提取到表头: {headers}")
# 从源文档的 structured_data 中直接提取表头Excel 等数据源)
for doc in source_docs:
if doc.structured_data:
sd = doc.structured_data
# Excel 格式: {columns: [...], rows: [...]}
if sd.get("columns"):
cols = sd.get("columns", [])
if isinstance(cols, list) and cols:
all_headers.extend([str(c) for c in cols if str(c).strip()])
logger.info(f"从 structured_data.columns 提取到表头: {cols}")
# 多 sheet 格式: {sheets: {sheet_name: {columns, rows}}}
if sd.get("sheets"):
for sheet_name, sheet_data in sd.get("sheets", {}).items():
if isinstance(sheet_data, dict) and sheet_data.get("columns"):
cols = sheet_data.get("columns", [])
if isinstance(cols, list) and cols:
all_headers.extend([str(c) for c in cols if str(c).strip()])
logger.info(f"从 sheets.{sheet_name} 提取到表头: {cols}")
# Markdown/表格格式: {tables: [{headers, rows}]}
if sd.get("tables") and isinstance(sd.get("tables"), list):
for table in sd.get("tables", []):
if isinstance(table, dict) and table.get("headers"):
headers = table.get("headers", [])
if isinstance(headers, list) and headers:
all_headers.extend([str(h) for h in headers if str(h).strip()])
logger.info(f"从 tables 提取到表头: {headers}")
# 另一种格式: {headers, rows}
if sd.get("headers") and sd.get("rows"):
headers = sd.get("headers", [])
if isinstance(headers, list) and headers:
all_headers.extend([str(h) for h in headers if str(h).strip()])
logger.info(f"从 headers/rows 提取到表头: {headers}")
# 如果从表格摘要中获取到了表头,随机选取一部分
if all_headers:
logger.info(f"共有 {len(all_headers)} 个表头可用")
# 随机选取 5-7 个表头
num_fields = min(random.randint(5, 7), len(all_headers))
selected_headers = random.sample(all_headers, num_fields)
logger.info(f"随机选取的表头: {selected_headers}")
fields = []
for idx, header in enumerate(selected_headers):
fields.append(TemplateField(
cell=self._column_to_cell(idx),
name=header,
field_type="text",
required=False,
hint=""
))
return fields
else:
source_info = ""
# 如果无法从表格表头获取,才调用 AI 生成
prompt = f"""你是一个专业的数据分析助手。请分析源文档中的所有数据,生成表格表头字段。
任务:分析源文档,找出所有具体的数据指标及其分类。

View File

@@ -39,6 +39,8 @@ openpyxl==3.1.2
python-docx==0.8.11
markdown-it-py==3.0.0
chardet==5.2.0
Pillow>=10.0.0
pytesseract>=0.3.10
# ==================== AI / LLM ====================
httpx==0.25.2

View File

@@ -781,7 +781,8 @@ export const backendApi = {
async exportFilledTemplate(
templateId: string,
filledData: Record<string, any>,
format: 'xlsx' | 'docx' = 'xlsx'
format: 'xlsx' | 'docx' = 'xlsx',
filledFilePath?: string
): Promise<Blob> {
const url = `${BACKEND_BASE_URL}/templates/export`;
@@ -793,6 +794,7 @@ export const backendApi = {
template_id: templateId,
filled_data: filledData,
format,
...(filledFilePath && { filled_file_path: filledFilePath }),
}),
});
@@ -964,6 +966,101 @@ export const backendApi = {
throw error;
}
},
// ==================== 智能指令 API ====================
/**
 * Run one turn of the instruction chat.
 *
 * POSTs `{ instruction, doc_ids, context }` as JSON to `/instruction/chat`
 * and resolves with the parsed response body.
 *
 * @param instruction Natural-language instruction entered by the user.
 * @param docIds Optional document ids, sent as `doc_ids`.
 * @param context Optional extra context, forwarded unchanged.
 * @throws Error with the backend's `detail` text when the response is not OK,
 *         or with a generic message when the error body carries none.
 */
async instructionChat(
  instruction: string,
  docIds?: string[],
  context?: Record<string, any>
): Promise<{
  success: boolean;
  intent: string;
  result: Record<string, any>;
  message: string;
  hint?: string;
}> {
  const url = `${BACKEND_BASE_URL}/instruction/chat`;

  try {
    const response = await fetch(url, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ instruction, doc_ids: docIds, context }),
    });

    if (!response.ok) {
      // An error body is not guaranteed to be JSON (e.g. an HTML page from a
      // proxy); a parse failure must not replace the real failure.
      const error = await response.json().catch(() => null);
      throw new Error((error && error.detail) || '对话处理失败');
    }

    return await response.json();
  } catch (error) {
    console.error('对话处理失败:', error);
    throw error;
  }
},
/**
 * Fetch the list of supported instruction types from `/instruction/intents`.
 *
 * @throws Error when the response status is not OK.
 */
async getSupportedIntents(): Promise<{
  intents: Array<{
    intent: string;
    name: string;
    examples: string[];
    params: string[];
  }>;
}> {
  const endpoint = `${BACKEND_BASE_URL}/instruction/intents`;

  try {
    const res = await fetch(endpoint);
    if (res.ok) {
      return await res.json();
    }
    throw new Error('获取指令列表失败');
  } catch (err) {
    console.error('获取指令列表失败:', err);
    throw err;
  }
},
/**
 * Execute an instruction (synchronous mode).
 *
 * POSTs `{ instruction, doc_ids, context }` as JSON to `/instruction/execute`.
 *
 * @param instruction - The user's instruction text.
 * @param docIds - Optional document IDs, sent as `doc_ids`.
 * @param context - Optional context object, forwarded unchanged.
 * @returns The parsed JSON body of a successful response.
 * @throws Error with the server's string `detail` when one is provided,
 *   otherwise with a generic message. Every failure is logged and rethrown.
 */
async executeInstruction(
  instruction: string,
  docIds?: string[],
  context?: Record<string, any>
): Promise<{
  success: boolean;
  intent: string;
  result: Record<string, any>;
  message: string;
}> {
  const url = `${BACKEND_BASE_URL}/instruction/execute`;
  try {
    const response = await fetch(url, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ instruction, doc_ids: docIds, context }),
    });
    if (!response.ok) {
      // An error response is not guaranteed to carry JSON (e.g. an HTML
      // gateway error page); swallow the parse failure so the caller sees
      // the real "request failed" error instead of a SyntaxError.
      const errorBody = await response.json().catch(() => null);
      // `detail` may be a non-string (such as a list of validation errors);
      // only use it as the message when it is actually text.
      const detail = typeof errorBody?.detail === 'string' ? errorBody.detail : '';
      throw new Error(detail || '指令执行失败');
    }
    return await response.json();
  } catch (error) {
    console.error('指令执行失败:', error);
    throw error;
  }
},
};
// ==================== AI 分析 API ====================
@@ -1529,61 +1626,66 @@ export const aiApi = {
}
},
// ==================== 对话历史 API ====================
/**
* 智能对话(支持多轮对话的指令执行)
* 获取对话历史
*/
async instructionChat(
instruction: string,
docIds?: string[],
context?: Record<string, any>
): Promise<{
async getConversationHistory(conversationId: string, limit: number = 20): Promise<{
success: boolean;
intent: string;
result: Record<string, any>;
message: string;
hint?: string;
}> {
const url = `${BACKEND_BASE_URL}/instruction/chat`;
try {
const response = await fetch(url, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ instruction, doc_ids: docIds, context }),
});
if (!response.ok) {
const error = await response.json();
throw new Error(error.detail || '对话处理失败');
}
return await response.json();
} catch (error) {
console.error('对话处理失败:', error);
throw error;
}
},
/**
* 获取支持的指令类型列表
*/
async getSupportedIntents(): Promise<{
intents: Array<{
intent: string;
name: string;
examples: string[];
params: string[];
messages: Array<{
role: string;
content: string;
intent?: string;
created_at: string;
}>;
}> {
const url = `${BACKEND_BASE_URL}/instruction/intents`;
const url = `${BACKEND_BASE_URL}/conversation/${conversationId}/history?limit=${limit}`;
try {
const response = await fetch(url);
if (!response.ok) throw new Error('获取指令列表失败');
if (!response.ok) throw new Error('获取对话历史失败');
return await response.json();
} catch (error) {
console.error('获取指令列表失败:', error);
throw error;
console.error('获取对话历史失败:', error);
return { success: false, messages: [] };
}
},
/**
 * Delete the stored history of one conversation.
 *
 * Best-effort: any failure (network error or non-OK status) is logged and
 * reported as `{ success: false }` instead of being thrown.
 *
 * @param conversationId - Identifier of the conversation to delete.
 * @returns The parsed JSON response, or `{ success: false }` on failure.
 */
async deleteConversation(conversationId: string): Promise<{
  success: boolean;
}> {
  // Escape the id so characters such as `/`, `?` or `#` cannot change which
  // route the request hits; ids made only of unreserved characters are
  // unaffected.
  const url = `${BACKEND_BASE_URL}/conversation/${encodeURIComponent(conversationId)}`;
  try {
    const response = await fetch(url, { method: 'DELETE' });
    if (!response.ok) throw new Error('删除对话历史失败');
    return await response.json();
  } catch (error) {
    console.error('删除对话历史失败:', error);
    return { success: false };
  }
},
/**
 * List stored conversations.
 *
 * Best-effort: any failure (network error or non-OK status) is logged and
 * reported as `{ success: false, conversations: [] }` instead of thrown.
 *
 * @param limit - Sent as the `limit` query parameter (default 50).
 * @returns The parsed JSON response, or the empty fallback on failure.
 */
async listConversations(limit: number = 50): Promise<{
  success: boolean;
  conversations: Array<any>;
}> {
  const endpoint = `${BACKEND_BASE_URL}/conversation/all?limit=${limit}`;
  try {
    const res = await fetch(endpoint);
    if (res.ok) {
      return await res.json();
    }
    throw new Error('获取会话列表失败');
  } catch (err) {
    console.error('获取会话列表失败:', err);
    return { success: false, conversations: [] };
  }
}
};

View File

@@ -15,12 +15,14 @@ import {
Sparkles,
Database,
FileSpreadsheet,
RefreshCcw
RefreshCcw,
Trash2
} from 'lucide-react';
import { backendApi } from '@/db/backend-api';
import { formatDistanceToNow } from 'date-fns';
import { zhCN } from 'date-fns/locale';
import { cn } from '@/lib/utils';
import { toast } from 'sonner';
type DocumentItem = {
doc_id: string;
@@ -108,7 +110,7 @@ const Dashboard: React.FC = () => {
<div className="grid grid-cols-1 md:grid-cols-3 gap-6">
{[
{ label: '已上传文档', value: stats.docs, icon: FileText, color: 'bg-blue-500', trend: '非结构化文档', link: '/documents' },
{ label: 'Excel 文件', value: stats.excelFiles, icon: FileSpreadsheet, color: 'bg-emerald-500', trend: '结构化数据', link: '/excel-parse' },
{ label: 'Excel 文件', value: stats.excelFiles, icon: FileSpreadsheet, color: 'bg-emerald-500', trend: '结构化数据', link: '/documents' },
{ label: '填表任务', value: stats.tasks, icon: TableProperties, color: 'bg-indigo-500', trend: '待实现', link: '/form-fill' }
].map((stat, i) => (
<Card key={i} className="border-none shadow-md overflow-hidden group hover:shadow-xl transition-all duration-300">
@@ -164,9 +166,31 @@ const Dashboard: React.FC = () => {
{doc.doc_type.toUpperCase()} {formatDistanceToNow(new Date(doc.created_at), { addSuffix: true, locale: zhCN })}
</p>
</div>
<div className="flex items-center gap-2">
<div className="px-2 py-1 rounded-full text-[10px] font-bold uppercase tracking-wider bg-muted">
{doc.doc_type}
</div>
<Button
variant="ghost"
size="icon"
className="opacity-0 group-hover:opacity-100 text-destructive hover:bg-destructive/10 transition-opacity"
onClick={async (e) => {
  // Stop the click from reaching handlers on ancestor elements.
  e.stopPropagation();
  if (!confirm(`确定要删除 "${doc.original_filename}" 吗?`)) return;
  try {
    const result = await backendApi.deleteDocument(doc.doc_id);
    if (result.success) {
      // Drop the deleted document from the local list without a refetch.
      setRecentDocs(prev => prev.filter(d => d.doc_id !== doc.doc_id));
      toast.success('文档已删除');
    } else {
      // A `success: false` reply used to be ignored, leaving the user with
      // no feedback that the document is still there.
      toast.error('删除失败');
    }
  } catch (err: any) {
    toast.error(err.message || '删除失败');
  }
}}
>
<Trash2 size={16} />
</Button>
</div>
</div>
))}
</div>
@@ -197,7 +221,7 @@ const Dashboard: React.FC = () => {
<div className="grid grid-cols-1 sm:grid-cols-2 gap-4">
{[
{ title: '上传文档', desc: '支持 docx/md/txt', icon: FileText, link: '/documents', color: 'bg-blue-500' },
{ title: '解析 Excel', desc: '上传并分析数据', icon: FileSpreadsheet, link: '/excel-parse', color: 'bg-emerald-500' },
{ title: '解析 Excel', desc: '上传并分析数据', icon: FileSpreadsheet, link: '/documents', color: 'bg-emerald-500' },
{ title: '智能填表', desc: '自动填写表格模板', icon: TableProperties, link: '/form-fill', color: 'bg-indigo-500' },
{ title: 'AI 助手', desc: '自然语言交互', icon: MessageSquareCode, link: '/assistant', color: 'bg-amber-500' }
].map((item, i) => (

View File

@@ -78,6 +78,19 @@ const Documents: React.FC = () => {
const [expandedSheet, setExpandedSheet] = useState<string | null>(null);
const [uploadExpanded, setUploadExpanded] = useState(false);
// Batch-upload status tracking.
// Stage of a single file within the upload list.
type FileUploadStatus = 'pending' | 'uploading' | 'processing' | 'success' | 'failed';
// One row of the upload progress list.
interface UploadFileState {
// The file the user dropped/selected.
file: File;
// Current stage; selects the icon and status line rendered for the row.
status: FileUploadStatus;
// Rendered as the progress bar's width in percent.
progress: number;
// NOTE(review): not assigned anywhere in the visible code — confirm whether it is still needed.
taskId?: string;
// Failure message shown under the file name when status is 'failed'.
error?: string;
// Filled from the task result's doc_id once processing succeeds.
docId?: string;
}
const [uploadStates, setUploadStates] = useState<UploadFileState[]>([]);
const [batchTaskId, setBatchTaskId] = useState<string | null>(null);
// AI 分析相关状态
const [analyzing, setAnalyzing] = useState(false);
const [analyzingForCharts, setAnalyzingForCharts] = useState(false);
@@ -211,21 +224,119 @@ const Documents: React.FC = () => {
}
};
// 文件上传处理
// 文件上传处理 - 批量上传
/**
 * Handle files dropped/selected for upload using the batch endpoint.
 *
 * Seeds one status row per file, submits all files in one request and then
 * polls the returned task until it succeeds, fails, or the poll budget is
 * exhausted. When the backend returns no task id, falls back to uploading
 * the files one by one via `handleSingleFileUploads`.
 *
 * The `uploading` flag stays true for the whole polling phase and is only
 * cleared once the final per-file states have been written.
 */
const onDrop = async (acceptedFiles: File[]) => {
  if (acceptedFiles.length === 0) return;

  // Seed the per-file upload states so the progress list renders at once.
  const initialStates: UploadFileState[] = acceptedFiles.map(file => ({
    file,
    status: 'pending',
    progress: 0
  }));
  setUploadStates(initialStates);
  setUploadExpanded(true);
  setUploading(true);

  try {
    // Submit everything through the batch upload endpoint.
    const result = await backendApi.uploadDocuments(acceptedFiles);

    if (result.task_id) {
      setBatchTaskId(result.task_id);
      // Mark every file as uploading.
      setUploadStates(prev => prev.map(s => ({ ...s, status: 'uploading', progress: 30 })));

      // Poll the task status.
      let attempts = 0;
      const maxAttempts = 150; // 150 polls, 2 s apart: at most 5 minutes

      const checkBatchStatus = async () => {
        while (attempts < maxAttempts) {
          try {
            const status = await backendApi.getTaskStatus(result.task_id);
            if (status.status === 'success' && status.result) {
              // Map per-file results back onto the rows by position.
              const fileResults = status.result.results || [];
              setUploadStates(prev => prev.map((s, idx) => {
                const fileResult = fileResults[idx];
                if (fileResult?.success) {
                  return { ...s, status: 'success', progress: 100, docId: fileResult.doc_id };
                } else {
                  return { ...s, status: 'failed', progress: 0, error: fileResult?.error || '处理失败' };
                }
              }));
              loadDocuments();
              return;
            } else if (status.status === 'failure') {
              setUploadStates(prev => prev.map(s => ({
                ...s,
                status: 'failed',
                error: status.error || '批量处理失败'
              })));
              return;
            } else {
              // Still running: use the reported progress, or a synthetic
              // value that creeps up to 90% while waiting.
              const progress = status.progress || Math.min(30 + attempts * 2, 90);
              setUploadStates(prev => prev.map(s => ({
                ...s,
                status: s.status === 'uploading' ? 'processing' : s.status,
                progress
              })));
            }
          } catch (e) {
            // A failed poll is not fatal; retry on the next tick.
            console.error('检查批量状态失败', e);
          }
          await new Promise(resolve => setTimeout(resolve, 2000));
          attempts++;
        }
        // Poll budget exhausted: fail whatever has not succeeded.
        setUploadStates(prev => prev.map(s => {
          if (s.status !== 'success') {
            return { ...s, status: 'failed', error: '处理超时' };
          }
          return s;
        }));
      };

      // Await the poll loop. Without this the `finally` below cleared
      // `uploading` immediately, so the panel reported the upload as
      // finished while files were still being processed.
      await checkBatchStatus();
    } else {
      // No batch task id: fall back to the legacy one-by-one upload.
      await handleSingleFileUploads(acceptedFiles);
    }
  } catch (error: any) {
    toast.error(error.message || '上传失败');
    setUploadStates(prev => prev.map(s => ({
      ...s,
      status: 'failed',
      error: error.message || '上传失败'
    })));
  } finally {
    setUploading(false);
  }
};
// 单文件上传后备逻辑
const handleSingleFileUploads = async (files: File[]) => {
let successCount = 0;
let failCount = 0;
const successfulFiles: File[] = [];
// 逐个上传文件
for (const file of acceptedFiles) {
for (let i = 0; i < files.length; i++) {
const file = files[i];
const ext = file.name.split('.').pop()?.toLowerCase();
setUploadStates(prev => prev.map((s, idx) =>
idx === i ? { ...s, status: 'uploading' } : s
));
try {
if (ext === 'xlsx' || ext === 'xls') {
setUploadStates(prev => prev.map((s, idx) =>
idx === i ? { ...s, status: 'processing', progress: 50 } : s
));
const result = await backendApi.uploadExcel(file, {
parseAllSheets: parseOptions.parseAllSheets,
headerRow: parseOptions.headerRow
@@ -233,99 +344,60 @@ const Documents: React.FC = () => {
if (result.success) {
successCount++;
successfulFiles.push(file);
// 第一个Excel文件设置解析结果供预览
setUploadStates(prev => prev.map((s, idx) =>
idx === i ? { ...s, status: 'success', progress: 100 } : s
));
if (successCount === 1) {
setUploadedFile(file);
setParseResult(result);
if (result.metadata?.sheet_count === 1) {
setExpandedSheet(Object.keys(result.data?.sheets || {})[0] || null);
}
}
loadDocuments();
} else {
failCount++;
toast.error(`${file.name}: ${result.error || '解析失败'}`);
setUploadStates(prev => prev.map((s, idx) =>
idx === i ? { ...s, status: 'failed', error: result.error || '解析失败' } : s
));
}
} else if (ext === 'md' || ext === 'markdown') {
} else {
setUploadStates(prev => prev.map((s, idx) =>
idx === i ? { ...s, status: 'processing', progress: 50 } : s
));
const result = await backendApi.uploadDocument(file);
if (result.task_id) {
// 等待任务完成
let attempts = 0;
while (attempts < 60) {
const status = await backendApi.getTaskStatus(result.task_id);
if (status.status === 'success') {
successCount++;
successfulFiles.push(file);
setUploadStates(prev => prev.map((s, idx) =>
idx === i ? { ...s, status: 'success', progress: 100, docId: status.result?.doc_id } : s
));
if (successCount === 1) {
setUploadedFile(file);
}
// 轮询任务状态
let attempts = 0;
const checkStatus = async () => {
while (attempts < 30) {
try {
const status = await backendApi.getTaskStatus(result.task_id);
if (status.status === 'success') {
loadDocuments();
return;
break;
} else if (status.status === 'failure') {
return;
}
} catch (e) {
console.error('检查状态失败', e);
setUploadStates(prev => prev.map((s, idx) =>
idx === i ? { ...s, status: 'failed', error: status.error || '处理失败' } : s
));
break;
}
await new Promise(resolve => setTimeout(resolve, 2000));
attempts++;
}
};
checkStatus();
} else {
failCount++;
}
} else {
// 其他文档使用通用上传接口
const result = await backendApi.uploadDocument(file);
if (result.task_id) {
successCount++;
successfulFiles.push(file);
if (successCount === 1) {
setUploadedFile(file);
}
// 轮询任务状态
let attempts = 0;
const checkStatus = async () => {
while (attempts < 30) {
try {
const status = await backendApi.getTaskStatus(result.task_id);
if (status.status === 'success') {
loadDocuments();
return;
} else if (status.status === 'failure') {
return;
}
} catch (e) {
console.error('检查状态失败', e);
}
await new Promise(resolve => setTimeout(resolve, 2000));
attempts++;
}
};
checkStatus();
} else {
failCount++;
}
}
} catch (error: any) {
failCount++;
toast.error(`${file.name}: ${error.message || '上传失败'}`);
setUploadStates(prev => prev.map((s, idx) =>
idx === i ? { ...s, status: 'failed', error: error.message || '上传失败' } : s
));
}
}
setUploading(false);
loadDocuments();
if (successCount > 0) {
toast.success(`成功上传 ${successCount} 个文件`);
setUploadedFiles(prev => [...prev, ...successfulFiles]);
setUploadExpanded(true);
}
if (failCount > 0) {
toast.error(`${failCount} 个文件上传失败`);
}
};
@@ -699,7 +771,110 @@ const Documents: React.FC = () => {
</CardHeader>
{uploadPanelOpen && (
<CardContent className="space-y-4">
{uploadedFiles.length > 0 || uploadedFile ? (
{/* 优先显示正在上传的状态 */}
{uploadStates.length > 0 && (
<div className="space-y-3">
{/* 上传状态头部 */}
<div
className="flex items-center justify-between p-3 bg-primary/5 rounded-xl cursor-pointer hover:bg-primary/10 transition-colors"
onClick={() => setUploadExpanded(!uploadExpanded)}
>
<div className="flex items-center gap-3">
<div className="w-10 h-10 rounded-lg bg-primary/10 text-primary flex items-center justify-center">
{uploading ? <Loader2 size={20} className="animate-spin" /> : <Upload size={20} />}
</div>
<div>
<p className="font-semibold text-sm">
{uploading ? '正在上传' : '上传完成'} {uploadStates.length}
</p>
<p className="text-xs text-muted-foreground">
{uploading ? '上传中,请稍候...' : uploadStates.filter(s => s.status === 'failed').length > 0 ? '部分失败' : '点击查看详情'}
</p>
</div>
</div>
<div className="flex items-center gap-2">
{!uploading && (
<Button
variant="ghost"
size="sm"
onClick={(e) => {
e.stopPropagation();
setUploadStates([]);
setUploadedFiles([]);
setUploadedFile(null);
}}
className="text-destructive hover:text-destructive"
>
<Trash2 size={14} className="mr-1" />
</Button>
)}
{uploadExpanded ? <ChevronUp size={16} /> : <ChevronDown size={16} />}
</div>
</div>
{/* 上传进度列表(总是展开显示) */}
{uploadExpanded && (
<div className="space-y-2 border rounded-xl p-3 bg-background">
{uploadStates.map((state, index) => (
<div key={index} className="flex items-center gap-3 p-2 rounded-lg hover:bg-muted/30 transition-colors">
<div className={cn(
"w-8 h-8 rounded flex items-center justify-center shrink-0",
isExcelFile(state.file.name) ? "bg-emerald-500/10 text-emerald-500" : "bg-blue-500/10 text-blue-500"
)}>
{state.status === 'pending' && <Clock size={16} />}
{state.status === 'uploading' && <Upload size={16} className="animate-pulse" />}
{state.status === 'processing' && <Loader2 size={16} className="animate-spin" />}
{state.status === 'success' && <CheckCircle size={16} className="text-green-500" />}
{state.status === 'failed' && <AlertCircle size={16} className="text-red-500" />}
</div>
<div className="flex-1 min-w-0">
<p className="text-sm truncate">{state.file.name}</p>
<div className="flex items-center gap-2">
{state.status === 'pending' && <p className="text-xs text-muted-foreground">...</p>}
{state.status === 'uploading' && <p className="text-xs text-primary">...</p>}
{state.status === 'processing' && <p className="text-xs text-primary">...</p>}
{state.status === 'failed' && state.error && (
<p className="text-xs text-red-500 truncate">{state.error}</p>
)}
{state.status === 'success' && (
<p className="text-xs text-green-500"></p>
)}
</div>
{/* 进度条 */}
{(state.status === 'uploading' || state.status === 'processing') && (
<div className="mt-1 h-1 bg-muted rounded-full overflow-hidden">
<div
className="h-full bg-primary transition-all duration-300"
style={{ width: `${state.progress}%` }}
/>
</div>
)}
</div>
{state.status === 'success' && (
<CheckCircle size={16} className="text-green-500 shrink-0" />
)}
{state.status === 'failed' && (
<Button
variant="ghost"
size="icon"
className="text-destructive hover:bg-destructive/10 shrink-0"
onClick={() => {
setUploadStates(prev => prev.filter((_, i) => i !== index));
}}
>
<Trash2 size={14} />
</Button>
)}
</div>
))}
</div>
)}
</div>
)}
{/* 已上传文件列表(没有正在上传时显示) */}
{uploadStates.length === 0 && (uploadedFiles.length > 0 || uploadedFile) ? (
<div className="space-y-3">
{/* 文件列表头部 */}
<div
@@ -739,6 +914,84 @@ const Documents: React.FC = () => {
{/* 展开的文件列表 */}
{uploadExpanded && (
<div className="space-y-2 border rounded-xl p-3">
{/* 显示已上传文件列表 */}
{(uploadedFiles.length > 0 ? uploadedFiles : [uploadedFile]).filter(Boolean).map((file, index) => (
<div key={index} className="flex items-center gap-3 p-2 bg-background rounded-lg">
<div className={cn(
"w-8 h-8 rounded flex items-center justify-center",
isExcelFile(file?.name || '') ? "bg-emerald-500/10 text-emerald-500" : "bg-blue-500/10 text-blue-500"
)}>
{isExcelFile(file?.name || '') ? <FileSpreadsheet size={16} /> : <FileText size={16} />}
</div>
<div className="flex-1 min-w-0">
<p className="text-sm truncate">{file?.name}</p>
<p className="text-xs text-muted-foreground">{formatFileSize(file?.size || 0)}</p>
</div>
<Button
variant="ghost"
size="icon"
className="text-destructive hover:bg-destructive/10"
onClick={() => handleRemoveUploadedFile(index)}
>
<Trash2 size={14} />
</Button>
</div>
))}
{/* 继续添加按钮 */}
<div
{...getRootProps()}
className="flex items-center justify-center gap-2 p-3 border-2 border-dashed rounded-lg cursor-pointer hover:border-primary/50 hover:bg-primary/5 transition-colors"
onClick={(e) => e.stopPropagation()}
>
<input {...getInputProps()} multiple={true} />
<Plus size={16} className="text-muted-foreground" />
<span className="text-sm text-muted-foreground"></span>
</div>
</div>
)}
</div>
) : (uploadedFiles.length > 0 || uploadedFile) ? (
<div className="space-y-3">
{/* 文件列表头部 */}
<div
className="flex items-center justify-between p-3 bg-muted/50 rounded-xl cursor-pointer hover:bg-muted/70 transition-colors"
onClick={() => setUploadExpanded(!uploadExpanded)}
>
<div className="flex items-center gap-3">
<div className="w-10 h-10 rounded-lg bg-primary/10 text-primary flex items-center justify-center">
<Upload size={20} />
</div>
<div>
<p className="font-semibold text-sm">
{(uploadedFiles.length > 0 ? uploadedFiles : [uploadedFile]).length}
</p>
<p className="text-xs text-muted-foreground">
{uploadExpanded ? '点击收起' : '点击展开查看'}
</p>
</div>
</div>
<div className="flex items-center gap-2">
<Button
variant="ghost"
size="sm"
onClick={(e) => {
e.stopPropagation();
handleDeleteFile();
}}
className="text-destructive hover:text-destructive"
>
<Trash2 size={14} className="mr-1" />
</Button>
{uploadExpanded ? <ChevronUp size={16} /> : <ChevronDown size={16} />}
</div>
</div>
{/* 展开的文件列表 */}
{uploadExpanded && (
<div className="space-y-2 border rounded-xl p-3">
{/* 显示已上传文件列表 */}
{(uploadedFiles.length > 0 ? uploadedFiles : [uploadedFile]).filter(Boolean).map((file, index) => (
<div key={index} className="flex items-center gap-3 p-2 bg-background rounded-lg">
<div className={cn(

View File

@@ -1,26 +1,10 @@
import React, { useState, useRef, useEffect } from 'react';
import {
Send,
Bot,
User,
Sparkles,
Trash2,
RefreshCcw,
FileText,
TableProperties,
ChevronRight,
ArrowRight,
Loader2,
Download,
Search,
MessageSquare,
CheckCircle
} from 'lucide-react';
import { Send, Bot, User, Sparkles, Trash2, FileText, TableProperties, ArrowRight, Search, MessageSquare } from 'lucide-react';
import { Button } from '@/components/ui/button';
import { Input } from '@/components/ui/input';
import { Card, CardContent, CardHeader, CardTitle } from '@/components/ui/card';
import { ScrollArea } from '@/components/ui/scroll-area';
import { Badge } from '@/components/ui/badge';
import { Markdown } from '@/components/ui/markdown';
import { backendApi } from '@/db/backend-api';
import { toast } from 'sonner';
import { cn } from '@/lib/utils';
@@ -39,8 +23,21 @@ const InstructionChat: React.FC = () => {
const [input, setInput] = useState('');
const [loading, setLoading] = useState(false);
const [currentDocIds, setCurrentDocIds] = useState<string[]>([]);
const [conversationId, setConversationId] = useState<string>('');
const scrollAreaRef = useRef<HTMLDivElement>(null);
// Initialize the conversation ID once on mount: reuse the one persisted in
// localStorage, otherwise generate a new one and persist it.
useEffect(() => {
  const existingId = localStorage.getItem('chat_conversation_id');
  if (existingId) {
    setConversationId(existingId);
    return;
  }
  const randomSuffix = Math.random().toString(36).substring(7);
  const generatedId = `conv_${Date.now()}_${randomSuffix}`;
  setConversationId(generatedId);
  localStorage.setItem('chat_conversation_id', generatedId);
}, []);
useEffect(() => {
// Initial welcome message
if (messages.length === 0) {
@@ -119,7 +116,8 @@ const InstructionChat: React.FC = () => {
// 使用真实的智能指令 API
const response = await backendApi.instructionChat(
input.trim(),
currentDocIds.length > 0 ? currentDocIds : undefined
currentDocIds.length > 0 ? currentDocIds : undefined,
{ conversation_id: conversationId }
);
// 根据意图类型生成友好响应
@@ -135,11 +133,12 @@ const InstructionChat: React.FC = () => {
responseContent = `✅ 已提取到 ${keys.length} 个字段的数据:\n\n`;
for (const [key, value] of Object.entries(extracted)) {
const values = Array.isArray(value) ? value : [value];
responseContent += `**${key}**: ${values.slice(0, 3).join(', ')}${values.length > 3 ? '...' : ''}\n`;
const displayValues = values.length > 10 ? values.slice(0, 10).join(', ') + ` ...(共${values.length}条)` : values.join(', ');
responseContent += `**${key}**: ${displayValues}\n`;
}
responseContent += `\n💡 您可以将这些数据填入表格`;
responseContent += `\n💡 可直接使用以上数据,或说"填入表格"继续填表操作`;
} else {
responseContent = '未能从文档中提取到相关数据。请尝试更明确的字段名称。';
responseContent = resultData?.message || '未能从文档中提取到相关数据。请尝试更明确的字段名称。';
}
break;
@@ -151,24 +150,24 @@ const InstructionChat: React.FC = () => {
responseContent = `✅ 填表完成!成功填写 ${filledKeys.length} 个字段:\n\n`;
for (const [key, value] of Object.entries(filled)) {
const values = Array.isArray(value) ? value : [value];
responseContent += `**${key}**: ${values.slice(0, 3).join(', ')}\n`;
const displayValues = values.length > 10 ? values.slice(0, 10).join(', ') + ` ...(共${values.length}条)` : values.join(', ');
responseContent += `**${key}**: ${displayValues}\n`;
}
responseContent += `\n📋 请到【智能填表】页面查看或导出结果。`;
} else {
responseContent = '填表未能提取到数据。请检查模板表头和数据源内容。';
responseContent = resultData?.message || '填表未能提取到数据。请检查模板表头和数据源内容。';
}
break;
case 'summarize':
// 摘要结果
const summaries = resultData?.summaries || [];
if (summaries.length > 0) {
responseContent = `📄 找到 ${summaries.length} 个文档的摘要:\n\n`;
summaries.forEach((s: any, idx: number) => {
responseContent += `**${idx + 1}. ${s.filename}**\n${s.content_preview}\n\n`;
});
if (resultData?.action_needed === 'provide_document' || resultData?.action_needed === 'upload_document') {
responseContent = `📋 ${resultData.message}\n\n${resultData.suggestion || ''}`;
} else if (resultData?.ai_summary) {
// AI 生成的摘要
responseContent = `📄 **${resultData.filename}** 摘要分析:\n\n${resultData.ai_summary}`;
} else {
responseContent = '未能生成摘要。请确保已上传文档。';
responseContent = resultData?.message || '未能生成摘要。请确保已上传文档。';
}
break;
@@ -176,8 +175,10 @@ const InstructionChat: React.FC = () => {
// 问答结果
if (resultData?.answer) {
responseContent = `**问题**: ${resultData.question}\n\n**答案**: ${resultData.answer}`;
} else if (resultData?.context_preview) {
responseContent = `**问题**: ${resultData.question}\n\n**相关上下文**\n${resultData.context_preview}`;
} else {
responseContent = resultData?.message || '我找到了相关信息,请查看上文。';
responseContent = resultData?.message || '请先上传文档,我才能回答您的问题。';
}
break;
@@ -207,8 +208,35 @@ const InstructionChat: React.FC = () => {
}
break;
case 'edit':
// 文档编辑结果
if (resultData?.edited_content) {
responseContent = `✏️ **${resultData.original_filename}** 编辑完成:\n\n${resultData.edited_content.substring(0, 500)}${resultData.edited_content.length > 500 ? '\n\n...(内容已截断)' : ''}`;
} else {
responseContent = resultData?.message || '编辑完成。';
}
break;
case 'transform':
// 格式转换结果
if (resultData?.excel_data) {
responseContent = `🔄 格式转换完成!\n\n已转换为 **Excel** 格式,共 **${resultData.excel_data.length}** 行数据。\n\n${resultData.message || ''}`;
} else if (resultData?.content) {
responseContent = `🔄 格式转换完成!\n\n目标格式: **${resultData.target_format?.toUpperCase()}**\n\n${resultData.message || ''}`;
} else {
responseContent = resultData?.message || '格式转换完成。';
}
break;
case 'unknown':
responseContent = `我理解您想要: "${input.trim()}"\n\n但我目前无法完成此操作。您可以尝试\n\n1. **提取数据**: "提取医院数量和床位数"\n2. **填表**: "根据这些数据填表"\n3. **总结**: "总结这份文档"\n4. **问答**: "文档里说了什么?"\n5. **搜索**: "搜索相关内容"`;
// 检查是否需要用户上传文档
if (resultData?.suggestion) {
responseContent = resultData.suggestion;
} else if (resultData?.message && resultData.message !== '无法理解该指令,请尝试更明确的描述') {
responseContent = resultData.message;
} else {
responseContent = `我理解您想要: "${input.trim()}"\n\n请尝试以下操作\n\n1. **提取数据**: "提取医院数量和床位数"\n2. **填表**: "根据这些数据填表"\n3. **总结**: "总结这份文档"\n4. **问答**: "文档里说了什么?"\n5. **搜索**: "搜索相关内容"`;
}
break;
default:
@@ -299,9 +327,11 @@ const InstructionChat: React.FC = () => {
? "bg-primary text-primary-foreground shadow-xl shadow-primary/20 rounded-tr-none"
: "bg-white border border-border/50 shadow-md rounded-tl-none"
)}>
<p className="text-sm leading-relaxed whitespace-pre-wrap font-medium">
{m.content}
</p>
{m.role === 'assistant' ? (
<Markdown content={m.content} className="text-sm leading-relaxed prose prose-sm max-w-none" />
) : (
<p className="text-sm leading-relaxed whitespace-pre-wrap font-medium">{m.content}</p>
)}
<span className={cn(
"text-[10px] block opacity-50 font-bold tracking-widest",
m.role === 'user' ? "text-right" : "text-left"

View File

@@ -248,15 +248,25 @@ const TemplateFill: React.FC = () => {
if (!templateFile || !filledResult) return;
try {
const ext = templateFile.name.split('.').pop()?.toLowerCase();
const exportFormat = (ext === 'docx') ? 'docx' : 'xlsx';
// 对于 Word 模板,如果已有填写后的文件(已填入表格单元格),传递其路径以便直接下载
const filledFilePath = (ext === 'docx' && filledResult.filled_file_path)
? filledResult.filled_file_path
: undefined;
const blob = await backendApi.exportFilledTemplate(
templateId || 'temp',
filledResult.filled_data || {},
'xlsx'
exportFormat,
filledFilePath
);
const ext_match = templateFile.name.match(/\.([^.])+$/);
const baseName = ext_match ? templateFile.name.replace(ext_match[0], '') : templateFile.name;
const downloadName = `filled_${baseName}.${exportFormat}`;
const url = URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = `filled_${templateFile.name}`;
a.download = downloadName;
a.click();
URL.revokeObjectURL(url);
toast.success('导出成功');
@@ -546,7 +556,7 @@ const TemplateFill: React.FC = () => {
</div>
<h3 className="text-xl font-bold mb-2">AI </h3>
<p className="text-muted-foreground text-center max-w-md">
{sourceFiles.length || sourceFilePaths.length} ...
{sourceFiles.length || sourceFilePaths.length || sourceDocIds.length || 0} ...
</p>
</CardContent>
</Card>
@@ -562,7 +572,7 @@ const TemplateFill: React.FC = () => {
</CardTitle>
<CardDescription>
{sourceFiles.length || sourceFilePaths.length}
{filledResult.source_doc_count || sourceFiles.length || sourceFilePaths.length || sourceDocIds.length}
</CardDescription>
</CardHeader>
<CardContent>