From e5d4724e822004f6711e0aea69c870a72bd846b2 Mon Sep 17 00:00:00 2001
From: dj <431634905@qq.com>
Date: Wed, 15 Apr 2026 23:32:55 +0800
Subject: [PATCH] [Smart assistant enhancements]
 - Conversation history management: new MongoDB conversations collection
   storing user/AI dialogue context, supporting multi-turn intent continuation
 - Conversation history API (conversation.py): GET/DELETE a conversation's
   history, list all sessions
 - Intent parsing: history-aware intent recognition for more accurate
   context understanding
 - Field extraction: supports natural-language patterns such as
   "提取文档中的医院数量", intelligently stripping the "文档中的" prefix
 - Document comparison: extracts filenames from the instruction and matches
   them against source_docs exactly, supporting "对比A和B两个文档"
 - Document summarization: uses the LLM to generate a real AI summary instead
   of returning a raw document preview
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

[Word template form-filling core features]
- Word template field generation: after a blank Word template is uploaded, field names are AI-generated from the source documents' (Excel/Word/TXT/MD) content
- Word template filling (_fill_docx): writes extracted data into the Word template's tables, supporting exact matching, fuzzy matching, and appending new rows
- Data polishing (_polish_word_filled_data): the LLM aggregates multi-row Excel data (totals/averages/extremes) into professional natural-language descriptions
- Paragraph-style output: 📌 field name + value paragraph + separator line (grey rule), for better readability
- Export chain wired through: fill_template returns filled_file_path, and export returns the already-filled Word file directly

[Other fixes]
- Fix the Windows file-lock issue on Word export: NamedTemporaryFile replaced with mkstemp + close
- Fix illegal "box" characters in Word: clean_text extended to strip \uFFFD, □ and other Unicode replacement and zero-width characters
- Fix the document-comparison "at least 2 documents required" error: filenames named in the instruction are matched first instead of just taking the first 2 documents
- Fix the hard-coded export format: docx/xlsx is now auto-detected
- Docx parser gains a fallback parsing method and more complete paragraph/table/heading extraction
- RAG service adds MySQL data-source support
---
 backend/app/api/__init__.py                   |   2 +
 backend/app/api/endpoints/conversation.py     |  98 ++++
 backend/app/api/endpoints/documents.py        | 172 +++--
 backend/app/api/endpoints/instruction.py      |  39 +-
 backend/app/api/endpoints/templates.py        |  42 +-
 backend/app/core/database/mongodb.py          | 130 ++++-
 .../app/core/document_parser/docx_parser.py   | 334 ++++++++---
 backend/app/instruction/executor.py           | 313 ++++++++--
 backend/app/instruction/intent_parser.py      |  78 ++-
 backend/app/services/excel_storage_service.py |   5 +-
 backend/app/services/rag_service.py           |  70 ++-
backend/app/services/table_rag_service.py | 18 +- backend/app/services/template_fill_service.py | 534 +++++++++++++++++- backend/requirements.txt | 2 + frontend/src/db/backend-api.ts | 194 +++++-- frontend/src/pages/Dashboard.tsx | 34 +- frontend/src/pages/Documents.tsx | 407 ++++++++++--- frontend/src/pages/InstructionChat.tsx | 102 ++-- frontend/src/pages/TemplateFill.tsx | 18 +- 19 files changed, 2185 insertions(+), 407 deletions(-) create mode 100644 backend/app/api/endpoints/conversation.py diff --git a/backend/app/api/__init__.py b/backend/app/api/__init__.py index 1a7ced4..a0c7f7a 100644 --- a/backend/app/api/__init__.py +++ b/backend/app/api/__init__.py @@ -14,6 +14,7 @@ from app.api.endpoints import ( analysis_charts, health, instruction, # 智能指令 + conversation, # 对话历史 ) # 创建主路由 @@ -31,3 +32,4 @@ api_router.include_router(ai_analyze.router) # AI分析 api_router.include_router(visualization.router) # 可视化 api_router.include_router(analysis_charts.router) # 分析图表 api_router.include_router(instruction.router) # 智能指令 +api_router.include_router(conversation.router) # 对话历史 diff --git a/backend/app/api/endpoints/conversation.py b/backend/app/api/endpoints/conversation.py new file mode 100644 index 0000000..ca84d05 --- /dev/null +++ b/backend/app/api/endpoints/conversation.py @@ -0,0 +1,98 @@ +""" +对话历史 API 接口 + +提供对话历史的存储和查询功能 +""" +import logging +from typing import Optional + +from fastapi import APIRouter, HTTPException +from pydantic import BaseModel + +from app.core.database import mongodb + +logger = logging.getLogger(__name__) + +router = APIRouter(prefix="/conversation", tags=["对话历史"]) + + +# ==================== 请求/响应模型 ==================== + +class ConversationMessage(BaseModel): + role: str + content: str + intent: Optional[str] = None + + +class ConversationHistoryResponse(BaseModel): + success: bool + messages: list + + +class ConversationListResponse(BaseModel): + success: bool + conversations: list + + +# ==================== 接口 ==================== + +@router.get("/{conversation_id}/history", response_model=ConversationHistoryResponse) +async def get_conversation_history(conversation_id: str, limit: int = 20): + """ + 获取对话历史 + + Args: + conversation_id: 对话会话ID + limit: 返回消息数量(默认20条) + """ + try: + messages = await mongodb.get_conversation_history(conversation_id, limit=limit) + return ConversationHistoryResponse( + success=True, + messages=messages + ) + except Exception as e: + logger.error(f"获取对话历史失败: {e}") + return ConversationHistoryResponse( + success=False, + messages=[] + ) + + +@router.delete("/{conversation_id}") +async def delete_conversation(conversation_id: str): + """ + 删除对话会话 + + Args: + conversation_id: 对话会话ID + """ + try: + success = await mongodb.delete_conversation(conversation_id) + return {"success": success} + except Exception as e: + logger.error(f"删除对话失败: {e}") + return {"success": False, "error": str(e)} + + +@router.get("/all", response_model=ConversationListResponse) +async def list_conversations(limit: int = 50, skip: int = 0): + """ + 获取会话列表 + + Args: + limit: 返回数量 + skip: 跳过数量 + """ + try: + conversations = await mongodb.list_conversations(limit=limit, skip=skip) + return ConversationListResponse( + success=True, + conversations=conversations + ) + except Exception as e: + logger.error(f"获取会话列表失败: {e}") + return ConversationListResponse( + success=False, + conversations=[] + ) \ No newline at end of file diff --git a/backend/app/api/endpoints/documents.py b/backend/app/api/endpoints/documents.py index e8e206a..221e059 100644 --- 
a/backend/app/api/endpoints/documents.py +++ b/backend/app/api/endpoints/documents.py @@ -4,6 +4,7 @@ 支持多格式文档(docx/xlsx/md/txt)上传、解析、存储和RAG索引 集成 Excel 存储和 AI 生成字段描述 """ +import asyncio import logging import uuid from typing import List, Optional @@ -258,6 +259,7 @@ async def process_document( ) # 如果是 Excel,存储到 MySQL + AI生成描述 + RAG索引 + mysql_table_name = None if doc_type in ["xlsx", "xls"]: await update_task_status( task_id, status="processing", @@ -265,17 +267,29 @@ async def process_document( ) try: - # 使用 TableRAG 服务完成建表和RAG索引 + # 使用 TableRAG 服务存储到 MySQL(跳过 RAG 索引以提升速度) logger.info(f"开始存储Excel到MySQL: {original_filename}, file_path: {file_path}") rag_result = await table_rag_service.build_table_rag_index( file_path=file_path, filename=original_filename, sheet_name=parse_options.get("sheet_name"), - header_row=parse_options.get("header_row", 0) + header_row=parse_options.get("header_row", 0), + skip_rag_index=True # 跳过 AI 字段描述生成和索引 ) if rag_result.get("success"): - logger.info(f"Excel存储到MySQL成功: {original_filename}, table: {rag_result.get('table_name')}") + mysql_table_name = rag_result.get('table_name') + logger.info(f"Excel存储到MySQL成功: {original_filename}, table: {mysql_table_name}") + # 更新 MongoDB 中的 metadata,记录 MySQL 表名 + try: + doc = await mongodb.get_document(doc_id) + if doc: + metadata = doc.get("metadata", {}) + metadata["mysql_table_name"] = mysql_table_name + await mongodb.update_document_metadata(doc_id, metadata) + logger.info(f"已更新 MongoDB 文档的 mysql_table_name: {mysql_table_name}") + except Exception as update_err: + logger.warning(f"更新 MongoDB mysql_table_name 失败: {update_err}") else: logger.error(f"RAG索引构建失败: {rag_result.get('error')}") except Exception as e: @@ -283,17 +297,16 @@ async def process_document( else: # 非结构化文档 - await update_task_status( - task_id, status="processing", - progress=60, message="正在建立索引" - ) - - # 如果文档中有表格数据,提取并存储到 MySQL + RAG structured_data = result.data.get("structured_data", {}) tables = structured_data.get("tables", []) + # 如果文档中有表格数据,提取并存储到 MySQL(不需要 RAG 索引) if tables: - # 对每个表格建立 MySQL 表和 RAG 索引 + await update_task_status( + task_id, status="processing", + progress=60, message="正在存储表格数据" + ) + # 对每个表格建立 MySQL 表(跳过 RAG 索引,速度更快) for table_info in tables: await table_rag_service.index_document_table( doc_id=doc_id, @@ -302,8 +315,14 @@ async def process_document( source_doc_type=doc_type ) - # 同时对文档内容建立 RAG 索引 - await index_document_to_rag(doc_id, original_filename, result, doc_type) + # 对文档内容建立 RAG 索引(非结构化文本需要语义搜索) + content = result.data.get("content", "") + if content and len(content) > 50: # 只有内容足够长才建立索引 + await update_task_status( + task_id, status="processing", + progress=80, message="正在建立语义索引" + ) + await index_document_to_rag(doc_id, original_filename, result, doc_type) # 完成 await update_task_status( @@ -328,72 +347,95 @@ async def process_document( async def process_documents_batch(task_id: str, files: List[dict]): - """批量处理文档""" + """批量并行处理文档""" try: await update_task_status( task_id, status="processing", - progress=0, message="开始批量处理" + progress=0, message=f"开始批量处理 {len(files)} 个文档", + result={"total": len(files), "files": []} ) - results = [] - for i, file_info in enumerate(files): + async def process_single_file(file_info: dict, index: int) -> dict: + """处理单个文件""" + filename = file_info["filename"] try: + # 解析文档 parser = ParserFactory.get_parser(file_info["path"]) result = parser.parse(file_info["path"]) - if result.success: - doc_id = await mongodb.insert_document( - doc_type=file_info["ext"], - content=result.data.get("content", 
""), - metadata={ - **result.metadata, - "original_filename": file_info["filename"], - "file_path": file_info["path"] - }, - structured_data=result.data.get("structured_data") + if not result.success: + return {"index": index, "filename": filename, "success": False, "error": result.error or "解析失败"} + + # 存储到 MongoDB + doc_id = await mongodb.insert_document( + doc_type=file_info["ext"], + content=result.data.get("content", ""), + metadata={ + **result.metadata, + "original_filename": filename, + "file_path": file_info["path"] + }, + structured_data=result.data.get("structured_data") + ) + + # Excel 处理 + if file_info["ext"] in ["xlsx", "xls"]: + await table_rag_service.build_table_rag_index( + file_path=file_info["path"], + filename=filename, + skip_rag_index=True # 跳过 AI 字段描述生成和索引 ) - - # Excel 处理 - if file_info["ext"] in ["xlsx", "xls"]: - await table_rag_service.build_table_rag_index( - file_path=file_info["path"], - filename=file_info["filename"] - ) - else: - # 非结构化文档:处理其中的表格 + 内容索引 - structured_data = result.data.get("structured_data", {}) - tables = structured_data.get("tables", []) - - if tables: - for table_info in tables: - await table_rag_service.index_document_table( - doc_id=doc_id, - filename=file_info["filename"], - table_data=table_info, - source_doc_type=file_info["ext"] - ) - - await index_document_to_rag(doc_id, file_info["filename"], result, file_info["ext"]) - - results.append({"filename": file_info["filename"], "doc_id": doc_id, "success": True}) else: - results.append({"filename": file_info["filename"], "success": False, "error": result.error}) + # 非结构化文档 + structured_data = result.data.get("structured_data", {}) + tables = structured_data.get("tables", []) + + # 表格数据直接存 MySQL(跳过 RAG 索引) + if tables: + for table_info in tables: + await table_rag_service.index_document_table( + doc_id=doc_id, + filename=filename, + table_data=table_info, + source_doc_type=file_info["ext"] + ) + + # 只有内容足够长才建立语义索引 + content = result.data.get("content", "") + if content and len(content) > 50: + await index_document_to_rag(doc_id, filename, result, file_info["ext"]) + + return {"index": index, "filename": filename, "doc_id": doc_id, "success": True} except Exception as e: - results.append({"filename": file_info["filename"], "success": False, "error": str(e)}) + logger.error(f"处理文件 {filename} 失败: {e}") + return {"index": index, "filename": filename, "success": False, "error": str(e)} - progress = int((i + 1) / len(files) * 100) - await update_task_status( - task_id, status="processing", - progress=progress, message=f"已处理 {i+1}/{len(files)}" - ) + # 并行处理所有文档 + tasks = [process_single_file(f, i) for i, f in enumerate(files)] + results = await asyncio.gather(*tasks) + # 按原始顺序排序 + results.sort(key=lambda x: x["index"]) + + # 统计成功/失败数量 + success_count = sum(1 for r in results if r["success"]) + fail_count = len(results) - success_count + + # 更新最终状态 await update_task_status( task_id, status="success", - progress=100, message="批量处理完成", - result={"results": results} + progress=100, message=f"批量处理完成: {success_count} 成功, {fail_count} 失败", + result={ + "total": len(files), + "success": success_count, + "failure": fail_count, + "results": results + } ) + logger.info(f"批量处理完成: {success_count}/{len(files)} 成功") + except Exception as e: logger.error(f"批量处理失败: {str(e)}") await update_task_status( @@ -404,20 +446,20 @@ async def process_documents_batch(task_id: str, files: List[dict]): async def index_document_to_rag(doc_id: str, filename: str, result: ParseResult, doc_type: str): - """将非结构化文档索引到 
RAG(使用分块索引)""" + """将非结构化文档索引到 RAG(使用分块索引,异步执行)""" try: content = result.data.get("content", "") if content: - # 将完整内容传递给 RAG 服务自动分块索引 - rag_service.index_document_content( + # 使用异步方法索引,避免阻塞事件循环 + await rag_service.index_document_content_async( doc_id=doc_id, - content=content, # 传递完整内容,由 RAG 服务自动分块 + content=content, metadata={ "filename": filename, "doc_type": doc_type }, - chunk_size=500, # 每块 500 字符 - chunk_overlap=50 # 块之间 50 字符重叠 + chunk_size=1000, # 每块 1000 字符,提升速度 + chunk_overlap=100 # 块之间 100 字符重叠 ) logger.info(f"RAG 索引完成: {filename}, doc_id={doc_id}") except Exception as e: diff --git a/backend/app/api/endpoints/instruction.py b/backend/app/api/endpoints/instruction.py index 751e518..52b3c42 100644 --- a/backend/app/api/endpoints/instruction.py +++ b/backend/app/api/endpoints/instruction.py @@ -25,6 +25,7 @@ class InstructionRequest(BaseModel): instruction: str doc_ids: Optional[List[str]] = None # 关联的文档 ID 列表 context: Optional[Dict[str, Any]] = None # 额外上下文 + conversation_id: Optional[str] = None # 对话会话ID,用于关联历史记录 class IntentRecognitionResponse(BaseModel): @@ -240,7 +241,8 @@ async def instruction_chat( task_id=task_id, instruction=request.instruction, doc_ids=request.doc_ids, - context=request.context + context=request.context, + conversation_id=request.conversation_id ) return { @@ -251,14 +253,15 @@ async def instruction_chat( } # 同步模式:等待执行完成 - return await _execute_chat_task(task_id, request.instruction, request.doc_ids, request.context) + return await _execute_chat_task(task_id, request.instruction, request.doc_ids, request.context, request.conversation_id) async def _execute_chat_task( task_id: str, instruction: str, doc_ids: Optional[List[str]], - context: Optional[Dict[str, Any]] + context: Optional[Dict[str, Any]], + conversation_id: Optional[str] = None ): """执行指令对话的后台任务""" from app.core.database import mongodb as mongo_client @@ -278,6 +281,13 @@ async def _execute_chat_task( # 构建上下文 ctx: Dict[str, Any] = context or {} + # 获取对话历史 + if conversation_id: + history = await mongo_client.get_conversation_history(conversation_id, limit=20) + if history: + ctx["conversation_history"] = history + logger.info(f"加载对话历史: conversation_id={conversation_id}, 消息数={len(history)}") + # 获取关联文档 if doc_ids: docs = [] @@ -291,6 +301,29 @@ async def _execute_chat_task( # 执行指令 result = await instruction_executor.execute(instruction, ctx) + # 存储对话历史 + if conversation_id: + try: + # 存储用户消息 + await mongo_client.insert_conversation( + conversation_id=conversation_id, + role="user", + content=instruction, + intent=result.get("intent", "unknown") + ) + # 存储助手回复 + response_content = result.get("message", "") + if response_content: + await mongo_client.insert_conversation( + conversation_id=conversation_id, + role="assistant", + content=response_content, + intent=result.get("intent", "unknown") + ) + logger.info(f"已存储对话历史: conversation_id={conversation_id}") + except Exception as e: + logger.error(f"存储对话历史失败: {e}") + # 根据意图类型添加友好的响应消息 response_messages = { "extract": f"已提取 {len(result.get('extracted_data', {}))} 个字段的数据", diff --git a/backend/app/api/endpoints/templates.py b/backend/app/api/endpoints/templates.py index 54b3a73..50cae04 100644 --- a/backend/app/api/endpoints/templates.py +++ b/backend/app/api/endpoints/templates.py @@ -87,6 +87,7 @@ class ExportRequest(BaseModel): template_id: str filled_data: dict format: str = "xlsx" # xlsx 或 docx + filled_file_path: Optional[str] = None # 已填写的 Word 文件路径(可选) # ==================== 接口实现 ==================== @@ -541,7 +542,7 @@ async def 
export_filled_template(
     if request.format == "xlsx":
         return await _export_to_excel(request.filled_data, request.template_id)
     elif request.format == "docx":
-        return await _export_to_word(request.filled_data, request.template_id)
+        return await _export_to_word(request.filled_data, request.template_id, request.filled_file_path)
     else:
         raise HTTPException(
             status_code=400,
@@ -608,11 +609,12 @@ async def _export_to_excel(filled_data: dict, template_id: str) -> StreamingResp
     )
 
 
-async def _export_to_word(filled_data: dict, template_id: str) -> StreamingResponse:
+async def _export_to_word(filled_data: dict, template_id: str, filled_file_path: Optional[str] = None) -> StreamingResponse:
     """导出为 Word 格式"""
     import re
     import tempfile
     import os
+    import urllib.parse
     from docx import Document
     from docx.shared import Pt, RGBColor
     from docx.enum.text import WD_ALIGN_PARAGRAPH
@@ -623,12 +625,32 @@
             return ""
         # 移除控制字符
         text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)
+        # 转义 XML 特殊字符以防破坏文档结构
+        text = text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
         return text.strip()
 
+    tmp_path = None
     try:
-        # 先保存到临时文件,再读取到内存,确保文档完整性
-        with tempfile.NamedTemporaryFile(delete=False, suffix='.docx') as tmp_file:
-            tmp_path = tmp_file.name
+        # 如果有已填写的文件(通过 _fill_docx 填写了模板单元格),直接返回该文件
+        if filled_file_path and os.path.exists(filled_file_path):
+            filename = os.path.basename(filled_file_path)
+            with open(filled_file_path, 'rb') as f:
+                file_content = f.read()
+            output = io.BytesIO(file_content)
+            encoded_filename = urllib.parse.quote(filename)
+            return StreamingResponse(
+                output,
+                media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+                headers={
+                    "Content-Disposition": f"attachment; filename*=UTF-8''{encoded_filename}",
+                    "Content-Length": str(len(file_content))
+                }
+            )
+
+        # 没有已填写文件,创建新的 Word 文档(表格形式)
+        # 创建临时文件(立即关闭句柄,避免 Windows 文件锁问题)
+        tmp_fd, tmp_path = tempfile.mkstemp(suffix='.docx')
+        os.close(tmp_fd)  # 立即关闭得到的 fd,让 docx 可以写入
 
         doc = Document()
         doc.add_heading('填写结果', level=1)
@@ -670,19 +692,23 @@
     finally:
         # 清理临时文件
-        if os.path.exists(tmp_path):
+        if tmp_path and os.path.exists(tmp_path):
             try:
                 os.unlink(tmp_path)
-            except:
+            except Exception:
                 pass
 
     output = io.BytesIO(file_content)
     filename = "filled_template.docx"
+    encoded_filename = urllib.parse.quote(filename)
 
     return StreamingResponse(
         output,
         media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-        headers={"Content-Disposition": f"attachment; filename*=UTF-8''{filename}"}
+        headers={
+            "Content-Disposition": f"attachment; filename*=UTF-8''{encoded_filename}",
+            "Content-Length": str(len(file_content))
+        }
     )
 
diff --git a/backend/app/core/database/mongodb.py b/backend/app/core/database/mongodb.py
index 90ddb44..ebef9e4 100644
--- a/backend/app/core/database/mongodb.py
+++ b/backend/app/core/database/mongodb.py
@@ -64,6 +64,11 @@ class MongoDB:
         """任务集合 - 存储任务历史记录"""
         return self.db["tasks"]
 
+    @property
+    def conversations(self):
+        """对话集合 - 存储对话历史记录"""
+        return self.db["conversations"]
+
     # ==================== 文档操作 ====================
 
     async def insert_document(
@@ -117,14 +122,20 @@ class MongoDB:
         搜索文档
 
         Args:
-            query: 搜索关键词
+            query: 搜索关键词(支持文件名和内容搜索)
             doc_type: 文档类型过滤
             limit: 返回数量
 
         Returns:
             文档列表
         """
-        filter_query = {"content": {"$regex": query}}
+        filter_query = {
+            "$or": [
+                {"content": {"$regex": query, "$options": "i"}},
+
{"metadata.original_filename": {"$regex": query, "$options": "i"}}, + {"metadata.filename": {"$regex": query, "$options": "i"}}, + ] + } if doc_type: filter_query["doc_type"] = doc_type @@ -141,6 +152,15 @@ class MongoDB: result = await self.documents.delete_one({"_id": ObjectId(doc_id)}) return result.deleted_count > 0 + async def update_document_metadata(self, doc_id: str, metadata: Dict[str, Any]) -> bool: + """更新文档 metadata 字段""" + from bson import ObjectId + result = await self.documents.update_one( + {"_id": ObjectId(doc_id)}, + {"$set": {"metadata": metadata}} + ) + return result.modified_count > 0 + # ==================== RAG 索引操作 ==================== async def insert_rag_entry( @@ -251,6 +271,10 @@ class MongoDB: await self.tasks.create_index("task_id", unique=True) await self.tasks.create_index("created_at") + # 对话集合索引 + await self.conversations.create_index("conversation_id") + await self.conversations.create_index("created_at") + logger.info("MongoDB 索引创建完成") # ==================== 任务历史操作 ==================== @@ -369,6 +393,108 @@ class MongoDB: result = await self.tasks.delete_one({"task_id": task_id}) return result.deleted_count > 0 + # ==================== 对话历史操作 ==================== + + async def insert_conversation( + self, + conversation_id: str, + role: str, + content: str, + intent: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + ) -> str: + """ + 插入对话记录 + + Args: + conversation_id: 对话会话ID + role: 角色 (user/assistant) + content: 对话内容 + intent: 意图类型 + metadata: 额外元数据 + + Returns: + 插入文档的ID + """ + message = { + "conversation_id": conversation_id, + "role": role, + "content": content, + "intent": intent, + "metadata": metadata or {}, + "created_at": datetime.utcnow(), + } + result = await self.conversations.insert_one(message) + return str(result.inserted_id) + + async def get_conversation_history( + self, + conversation_id: str, + limit: int = 20, + ) -> List[Dict[str, Any]]: + """ + 获取对话历史 + + Args: + conversation_id: 对话会话ID + limit: 返回消息数量 + + Returns: + 对话消息列表 + """ + cursor = self.conversations.find( + {"conversation_id": conversation_id} + ).sort("created_at", 1).limit(limit) + + messages = [] + async for msg in cursor: + msg["_id"] = str(msg["_id"]) + if msg.get("created_at"): + msg["created_at"] = msg["created_at"].isoformat() + messages.append(msg) + return messages + + async def delete_conversation(self, conversation_id: str) -> bool: + """删除对话会话""" + result = await self.conversations.delete_many({"conversation_id": conversation_id}) + return result.deleted_count > 0 + + async def list_conversations( + self, + limit: int = 50, + skip: int = 0, + ) -> List[Dict[str, Any]]: + """ + 获取会话列表(按最近一条消息排序) + + Args: + limit: 返回数量 + skip: 跳过数量 + + Returns: + 会话列表 + """ + # 使用 aggregation 获取每个会话的最新一条消息 + pipeline = [ + {"$sort": {"created_at": -1}}, + {"$group": { + "_id": "$conversation_id", + "last_message": {"$first": "$$ROOT"}, + }}, + {"$replaceRoot": {"newRoot": "$last_message"}}, + {"$sort": {"created_at": -1}}, + {"$skip": skip}, + {"$limit": limit}, + ] + + conversations = [] + async for doc in self.conversations.aggregate(pipeline): + doc["_id"] = str(doc["_id"]) + if doc.get("created_at"): + doc["created_at"] = doc["created_at"].isoformat() + conversations.append(doc) + return conversations + # ==================== 全局单例 ==================== diff --git a/backend/app/core/document_parser/docx_parser.py b/backend/app/core/document_parser/docx_parser.py index db79512..e2bcb0e 100644 --- a/backend/app/core/document_parser/docx_parser.py +++ 
b/backend/app/core/document_parser/docx_parser.py @@ -44,6 +44,22 @@ class DocxParser(BaseParser): error=f"文件不存在: {file_path}" ) + # 尝试使用 python-docx 解析,失败则使用备用方法 + try: + return self._parse_with_docx(path) + except Exception as e: + logger.warning(f"python-docx 解析失败,使用备用方法: {e}") + try: + return self._parse_fallback(path) + except Exception as fallback_error: + logger.error(f"备用解析方法也失败: {fallback_error}") + return ParseResult( + success=False, + error=f"解析 Word 文档失败: {str(e)}" + ) + + def _parse_with_docx(self, path: Path) -> ParseResult: + """使用 python-docx 解析文档""" # 检查文件扩展名 if path.suffix.lower() not in self.supported_extensions: return ParseResult( @@ -51,98 +67,177 @@ class DocxParser(BaseParser): error=f"不支持的文件类型: {path.suffix}" ) + # 读取 Word 文档 + doc = Document(path) + + # 提取文本内容 + paragraphs = [] + for para in doc.paragraphs: + if para.text.strip(): + paragraphs.append({ + "text": para.text, + "style": str(para.style.name) if para.style else "Normal" + }) + + # 提取段落纯文本(用于 AI 解析) + paragraphs_text = [p["text"] for p in paragraphs if p["text"].strip()] + + # 提取表格内容 + tables_data = [] + for i, table in enumerate(doc.tables): + table_rows = [] + for row in table.rows: + row_data = [cell.text.strip() for cell in row.cells] + table_rows.append(row_data) + + if table_rows: + tables_data.append({ + "table_index": i, + "rows": table_rows, + "row_count": len(table_rows), + "column_count": len(table_rows[0]) if table_rows else 0 + }) + + # 提取图片/嵌入式对象信息 + images_info = self._extract_images_info(doc, path) + + # 合并所有文本(包括图片描述) + full_text_parts = [] + full_text_parts.append("【文档正文】") + full_text_parts.extend(paragraphs_text) + + if tables_data: + full_text_parts.append("\n【文档表格】") + for idx, table in enumerate(tables_data): + full_text_parts.append(f"--- 表格 {idx + 1} ---") + for row in table["rows"]: + full_text_parts.append(" | ".join(str(cell) for cell in row)) + + if images_info.get("image_count", 0) > 0: + full_text_parts.append(f"\n【文档图片】文档包含 {images_info['image_count']} 张图片/图表") + + full_text = "\n".join(full_text_parts) + + # 构建元数据 + metadata = { + "filename": path.name, + "extension": path.suffix.lower(), + "paragraph_count": len(paragraphs), + "table_count": len(tables_data), + "image_count": images_info.get("image_count", 0) + } + + return ParseResult( + success=True, + data={ + "content": full_text, + "paragraphs": paragraphs, + "paragraphs_with_style": paragraphs, + "tables": tables_data, + "images": images_info + }, + metadata=metadata + ) + + def _parse_fallback(self, path: Path) -> ParseResult: + """备用解析方法:直接解析 docx 的 XML 结构""" + import zipfile + from xml.etree import ElementTree as ET + try: - # 读取 Word 文档 - doc = Document(file_path) + with zipfile.ZipFile(path, 'r') as zf: + # 读取 document.xml + if 'word/document.xml' not in zf.namelist(): + return ParseResult(success=False, error="无效的 docx 文件格式") - # 提取文本内容 - paragraphs = [] - for para in doc.paragraphs: - if para.text.strip(): - paragraphs.append({ - "text": para.text, - "style": str(para.style.name) if para.style else "Normal" + xml_content = zf.read('word/document.xml') + root = ET.fromstring(xml_content) + + # 命名空间 + namespaces = { + 'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main' + } + + paragraphs = [] + tables = [] + current_table = [] + + for elem in root.iter(): + if elem.tag.endswith('}p'): # 段落 + text_parts = [] + for t in elem.iter(): + if t.tag.endswith('}t') and t.text: + text_parts.append(t.text) + text = ''.join(text_parts).strip() + if text: + paragraphs.append({'text': text, 'style': 
'Normal'}) + elif elem.tag.endswith('}tr'): # 表格行 + row_data = [] + for tc in elem.iter(): + if tc.tag.endswith('}tc'): # 单元格 + cell_text = [] + for t in tc.iter(): + if t.tag.endswith('}t') and t.text: + cell_text.append(t.text) + row_data.append(''.join(cell_text).strip()) + if row_data: + current_table.append(row_data) + else: + # 表格结束,保存 + if current_table: + tables.append({ + 'table_index': len(tables), + 'rows': current_table, + 'row_count': len(current_table), + 'column_count': len(current_table[0]) if current_table else 0 + }) + current_table = [] + + # 保存最后一张表格 + if current_table: + tables.append({ + 'table_index': len(tables), + 'rows': current_table, + 'row_count': len(current_table), + 'column_count': len(current_table[0]) if current_table else 0 }) - # 提取段落纯文本(用于 AI 解析) - paragraphs_text = [p["text"] for p in paragraphs if p["text"].strip()] + # 构建文本 + paragraphs_text = [p["text"] for p in paragraphs] + full_text_parts = ["【文档正文】"] + paragraphs_text - # 提取表格内容 - tables_data = [] - for i, table in enumerate(doc.tables): - table_rows = [] - for row in table.rows: - row_data = [cell.text.strip() for cell in row.cells] - table_rows.append(row_data) + if tables: + full_text_parts.append("\n【文档表格】") + for idx, table in enumerate(tables): + full_text_parts.append(f"--- 表格 {idx + 1} ---") + for row in table["rows"]: + full_text_parts.append(" | ".join(str(cell) for cell in row)) - if table_rows: - tables_data.append({ - "table_index": i, - "rows": table_rows, - "row_count": len(table_rows), - "column_count": len(table_rows[0]) if table_rows else 0 - }) + full_text = "\n".join(full_text_parts) - # 提取图片/嵌入式对象信息 - images_info = self._extract_images_info(doc, path) - - # 合并所有文本(包括图片描述) - full_text_parts = [] - full_text_parts.append("【文档正文】") - full_text_parts.extend(paragraphs_text) - - if tables_data: - full_text_parts.append("\n【文档表格】") - for idx, table in enumerate(tables_data): - full_text_parts.append(f"--- 表格 {idx + 1} ---") - for row in table["rows"]: - full_text_parts.append(" | ".join(str(cell) for cell in row)) - - if images_info.get("image_count", 0) > 0: - full_text_parts.append(f"\n【文档图片】文档包含 {images_info['image_count']} 张图片/图表") - - full_text = "\n".join(full_text_parts) - - # 构建元数据 - metadata = { - "filename": path.name, - "extension": path.suffix.lower(), - "file_size": path.stat().st_size, - "paragraph_count": len(paragraphs), - "table_count": len(tables_data), - "word_count": len(full_text), - "char_count": len(full_text.replace("\n", "")), - "has_tables": len(tables_data) > 0, - "has_images": images_info.get("image_count", 0) > 0, - "image_count": images_info.get("image_count", 0) - } - - # 返回结果 - return ParseResult( - success=True, - data={ - "content": full_text, - "paragraphs": paragraphs_text, - "paragraphs_with_style": paragraphs, - "tables": tables_data, - "images": images_info, - "word_count": len(full_text), - "structured_data": { + return ParseResult( + success=True, + data={ + "content": full_text, "paragraphs": paragraphs, - "paragraphs_text": paragraphs_text, - "tables": tables_data, - "images": images_info + "paragraphs_with_style": paragraphs, + "tables": tables, + "images": {"image_count": 0, "descriptions": []} + }, + metadata={ + "filename": path.name, + "extension": path.suffix.lower(), + "paragraph_count": len(paragraphs), + "table_count": len(tables), + "image_count": 0, + "parse_method": "fallback_xml" } - }, - metadata=metadata - ) + ) + except zipfile.BadZipFile: + return ParseResult(success=False, error="无效的 ZIP/文档文件") except Exception as e: - 
logger.error(f"解析 Word 文档失败: {str(e)}") - return ParseResult( - success=False, - error=f"解析 Word 文档失败: {str(e)}" - ) + return ParseResult(success=False, error=f"备用解析失败: {str(e)}") def extract_images_as_base64(self, file_path: str) -> List[Dict[str, str]]: """ @@ -197,6 +292,83 @@ class DocxParser(BaseParser): logger.info(f"共提取 {len(images)} 张图片") return images + def extract_text_from_images(self, file_path: str, lang: str = 'chi_sim+eng') -> Dict[str, Any]: + """ + 对 Word 文档中的图片进行 OCR 文字识别 + + Args: + file_path: Word 文件路径 + lang: Tesseract 语言代码,默认简体中文+英文 (chi_sim+eng) + + Returns: + 包含识别结果的字典 + """ + import zipfile + from io import BytesIO + from PIL import Image + + try: + import pytesseract + except ImportError: + logger.warning("pytesseract 未安装,OCR 功能不可用") + return { + "success": False, + "error": "pytesseract 未安装,请运行: pip install pytesseract", + "image_count": 0, + "extracted_text": [] + } + + results = { + "success": True, + "image_count": 0, + "extracted_text": [], + "total_chars": 0 + } + + try: + with zipfile.ZipFile(file_path, 'r') as zf: + # 查找 word/media 目录下的图片文件 + media_files = [f for f in zf.namelist() if f.startswith('word/media/')] + + for idx, filename in enumerate(media_files): + ext = filename.split('.')[-1].lower() + if ext not in ['png', 'jpg', 'jpeg', 'gif', 'bmp']: + continue + + try: + # 读取图片数据 + image_data = zf.read(filename) + image = Image.open(BytesIO(image_data)) + + # 使用 Tesseract OCR 提取文字 + text = pytesseract.image_to_string(image, lang=lang) + text = text.strip() + + if text: + results["extracted_text"].append({ + "image_index": idx, + "filename": filename, + "text": text, + "char_count": len(text) + }) + results["total_chars"] += len(text) + + logger.info(f"图片 {filename} OCR 识别完成,提取 {len(text)} 字符") + + except Exception as e: + logger.warning(f"图片 {filename} OCR 识别失败: {str(e)}") + + results["image_count"] = len(results["extracted_text"]) + + except zipfile.BadZipFile: + results["success"] = False + results["error"] = "无效的 Word 文档文件" + except Exception as e: + results["success"] = False + results["error"] = f"OCR 处理失败: {str(e)}" + + return results + def extract_key_sentences(self, text: str, max_sentences: int = 10) -> List[str]: """ 从文本中提取关键句子 diff --git a/backend/app/instruction/executor.py b/backend/app/instruction/executor.py index c7a05c7..1f3d84a 100644 --- a/backend/app/instruction/executor.py +++ b/backend/app/instruction/executor.py @@ -5,9 +5,10 @@ """ import logging import json +import re from typing import Any, Dict, List, Optional -from app.services.template_fill_service import template_fill_service +from app.services.template_fill_service import template_fill_service, TemplateField from app.services.rag_service import rag_service from app.services.markdown_ai_service import markdown_ai_service from app.core.database import mongodb @@ -15,6 +16,31 @@ from app.core.database import mongodb logger = logging.getLogger(__name__) +def _extract_filenames_from_text(text: str) -> List[str]: + """ + 从指令文本中提取文件名列表。 + + 智能处理用'和'/'与'/'、分隔的多个文件名(尤其是带年号的统计公报)。 + """ + # 先去掉"对比这两个文档"等引导语,只保留文件名部分 + text = re.sub(r'^(?:对比|比较)这两个?文档[的差异]?[::]?', '', text).strip() + text = re.sub(r'两个文档.*$', '', text).strip() + if not text: + return [] + + # 直接查找所有带扩展名的文件名模式 + results = [] + for m in re.finditer(r'[^\s,。!?、和与]+(?=\.(?:docx|xlsx|md|txt))', text): + start = m.start() + ext_match = re.search(r'\.(?:docx|xlsx|md|txt)', text[m.end():]) + if ext_match: + fn = text[start:m.end() + ext_match.end()] + if fn: + results.append(fn) + + return results + + class 
InstructionExecutor: """指令执行器""" @@ -41,9 +67,10 @@ class InstructionExecutor: self.intent_parser = intent_parser context = context or {} + context["instruction"] = instruction # 保存原始指令以便后续使用 - # 解析意图 - intent, params = await self.intent_parser.parse(instruction) + # 解析意图(传递对话历史上下文) + intent, params = await self.intent_parser.parse(instruction, context) # 根据意图类型执行相应操作 if intent == "extract": @@ -72,18 +99,48 @@ class InstructionExecutor: async def _execute_extract(self, params: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]: """执行信息提取""" try: - target_fields = params.get("field_refs", []) + # target_fields 来自意图解析,field_refs 来自引号/字段关键词匹配 + target_fields = params.get("target_fields", []) or params.get("field_refs", []) doc_ids = params.get("document_refs", []) + instruction_text = context.get("instruction", "") + + # 如果没有指定文档,尝试按文件名精确搜索 + if not doc_ids or "all_docs" in doc_ids: + if instruction_text: + import re + # 提取引号内的内容或文件名 + filename_match = re.search(r'["""]([^"""]+)["""]', instruction_text) + if filename_match: + search_term = filename_match.group(1) + else: + match = re.search(r'([^\s]+\.(?:docx|xlsx|md|txt))', instruction_text) + search_term = match.group(1) if match else None + + if search_term: + logger.info(f"提取时搜索文档: {search_term}") + searched_docs = await mongodb.search_documents(search_term, limit=5) + if searched_docs: + # 优先选择文件名完全匹配的文档 + best_docs = [ + d for d in searched_docs + if search_term.lower() in d.get("metadata", {}).get("original_filename", "").lower() + ] + if not best_docs: + best_docs = [searched_docs[0]] + context["source_docs"] = best_docs + doc_ids = [doc.get("_id", "") for doc in best_docs] + logger.info(f"找到 {len(best_docs)} 个文档用于提取,最佳: {best_docs[0].get('metadata', {}).get('original_filename', '?')}") if not target_fields: return { "success": False, + "intent": "extract", "error": "未指定要提取的字段", "message": "请明确说明要提取哪些字段,如:'提取医院数量和床位数'" } - # 如果指定了文档,验证文档存在 - if doc_ids and "all_docs" not in doc_ids: + # 如果指定了文档且还没有加载 source_docs,则验证并加载 + if doc_ids and "all_docs" not in doc_ids and not context.get("source_docs"): valid_docs = [] for doc_ref in doc_ids: doc_id = doc_ref.replace("doc_", "") @@ -93,20 +150,22 @@ class InstructionExecutor: if not valid_docs: return { "success": False, + "intent": "extract", "error": "指定的文档不存在", "message": "请检查文档编号是否正确" } context["source_docs"] = valid_docs - # 构建字段列表 - fields = [] - for i, field_name in enumerate(target_fields): - fields.append({ - "name": field_name, - "cell": f"A{i+1}", - "field_type": "text", - "required": False - }) + # 构建字段列表(使用 TemplateField dataclass) + fields = [ + TemplateField( + name=field_name, + cell=f"A{i+1}", + field_type="text", + required=False + ) + for i, field_name in enumerate(target_fields) + ] # 调用填表服务 result = await template_fill_service.fill_template( @@ -143,7 +202,7 @@ class InstructionExecutor: } # 获取源文档 - source_docs = context.get("source_docs", []) + source_docs = context.get("source_docs", []) or [] source_doc_ids = [doc.get("_id") for doc in source_docs if doc.get("_id")] # 获取字段 @@ -175,36 +234,103 @@ class InstructionExecutor: } async def _execute_summarize(self, params: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]: - """执行摘要总结""" + """执行摘要总结 - 使用 LLM 生成真实摘要""" try: - docs = context.get("source_docs", []) + import re + docs = context.get("source_docs", []) or [] + instruction_text = context.get("instruction", "") + + # 从指令中提取文件名/关键词,优先搜索精确文档 + search_term = None + if instruction_text: + filename_match = re.search(r'["""]([^"""]+)["""]', 
instruction_text) + if filename_match: + search_term = filename_match.group(1) + else: + file_match = re.search(r'([^\s,。!?,]+\.(?:docx|xlsx|md|txt))', instruction_text) + if file_match: + search_term = file_match.group(1) + + # 如果没有文档或有更精确的搜索词,尝试重新搜索 + if not docs or search_term: + if search_term: + logger.info(f"按关键词搜索文档: {search_term}") + searched_docs = await mongodb.search_documents(search_term, limit=5) + if searched_docs: + # 优先使用文件名最匹配的文档 + docs = sorted( + searched_docs, + key=lambda d: 1 if search_term.lower() in d.get("metadata", {}).get("original_filename", "").lower() else 0, + reverse=True + ) + logger.info(f"找到 {len(docs)} 个文档,最佳匹配: {docs[0].get('metadata', {}).get('original_filename', '?')}") + if not docs: return { - "success": False, - "error": "没有可用的文档", - "message": "请先上传要总结的文档" + "success": True, + "intent": "summarize", + "action_needed": "provide_document", + "message": "我理解了,您想分析文档内容。", + "suggestion": "请提供已上传文档的名称(可以是文件名或部分名称),或者上传您想要分析的文档。\n\n支持的格式:docx、xlsx、md、txt\n\n例如:'分析2021年民政事业发展统计公报' 或 '总结卫生健康数据'" } - summaries = [] - for doc in docs[:5]: # 最多处理5个文档 - content = doc.get("content", "")[:5000] # 限制内容长度 - if content: - summaries.append({ - "filename": doc.get("metadata", {}).get("original_filename", "未知"), - "content_preview": content[:500] + "..." if len(content) > 500 else content - }) + # 对第一个(最佳匹配)文档生成 AI 摘要 + primary_doc = docs[0] + content = primary_doc.get("content", "") + filename = primary_doc.get("metadata", {}).get("original_filename", "未知文档") + + if not content: + return { + "success": False, + "intent": "summarize", + "error": "文档内容为空", + "message": f"文档 {filename} 没有可供分析的文本内容" + } + + # 使用 LLM 生成摘要 + content_for_summary = content[:12000] # 最多取前 12000 字 + user_request = instruction_text or "请总结这份文档" + + prompt = f"""请对以下文档进行全面、有条理的摘要分析。 + +文档名称:{filename} +用户要求:{user_request} + +文档内容: +{content_for_summary} + +请按以下格式输出摘要: +1. **文档概述**:简述文档主题和背景(2-3句) +2. **主要内容**:列出文档的核心数据和关键信息(用要点列出) +3. **重要数据**:提取文档中的重要数字、统计数据 +4. 
**主要结论**:归纳文档的主要结论或趋势 + +要求:条理清晰,数据准确,不要遗漏关键信息。""" + + from app.services.llm_service import llm_service + messages = [ + {"role": "system", "content": "你是一个专业的文档分析助手,擅长提取关键信息并生成结构化摘要。"}, + {"role": "user", "content": prompt} + ] + + response = await llm_service.chat(messages=messages, temperature=0.3, max_tokens=2000) + ai_summary = llm_service.extract_message_content(response) return { "success": True, "intent": "summarize", - "summaries": summaries, - "message": f"找到 {len(summaries)} 个文档可供参考" + "ai_summary": ai_summary, + "filename": filename, + "doc_id": primary_doc.get("_id", ""), + "total_docs_found": len(docs), + "message": f"已生成文档摘要" } except Exception as e: logger.error(f"摘要执行失败: {e}") return { "success": False, + "intent": "summarize", "error": str(e), "message": f"摘要生成失败: {str(e)}" } @@ -213,17 +339,39 @@ class InstructionExecutor: """执行问答""" try: question = params.get("question", "") + instruction_text = context.get("instruction", "") + if not question: return { "success": False, + "intent": "question", "error": "未提供问题", "message": "请输入要回答的问题" } - # 使用 RAG 检索相关文档 - docs = context.get("source_docs", []) - rag_results = [] + docs = context.get("source_docs", []) or [] + # 如果没有文档,尝试从指令中提取文件名搜索 + if not docs: + filename_match = re.search(r'["""]([^"""]+\.(?:docx|xlsx|md|txt))["""]', instruction_text) + if not filename_match: + filename_match = re.search(r'([^\s,。!?]+\.(?:docx|xlsx|md|txt))', instruction_text) + if filename_match: + found = await mongodb.search_documents(filename_match.group(1), limit=5) + if found: + docs = found + + if not docs: + return { + "success": True, + "intent": "question", + "question": question, + "answer": None, + "message": "请先上传文档,我才能回答您的问题" + } + + # 使用 RAG 检索相关文档 + rag_results = [] for doc in docs: doc_id = doc.get("_id", "") if doc_id: @@ -241,12 +389,42 @@ class InstructionExecutor: doc.get("content", "")[:3000] for doc in docs[:3] if doc.get("content") ]) + if not context_text: + return { + "success": True, + "intent": "question", + "question": question, + "answer": None, + "message": "文档内容为空,无法回答问题" + } + + # 使用 LLM 生成答案 + filename = docs[0].get("metadata", {}).get("original_filename", "文档") + prompt = f"""基于以下文档内容,回答用户的问题。 + +文档名称:{filename} +用户问题:{question} + +文档内容: +{context_text[:8000]} + +请根据文档内容准确回答问题。如果文档中没有相关信息,请明确说明。""" + + from app.services.llm_service import llm_service + messages = [ + {"role": "system", "content": "你是一个专业的文档问答助手,根据提供的内容准确回答用户问题。"}, + {"role": "user", "content": prompt} + ] + response = await llm_service.chat(messages=messages, temperature=0.3, max_tokens=1500) + answer = llm_service.extract_message_content(response) + return { "success": True, "intent": "question", "question": question, - "context_preview": context_text[:500] + "..." 
if len(context_text) > 500 else context_text, - "message": "已找到相关上下文,可进行问答" + "answer": answer, + "filename": filename, + "message": "已生成回答" } except Exception as e: @@ -299,12 +477,53 @@ class InstructionExecutor: async def _execute_compare(self, params: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]: """执行对比分析""" try: - docs = context.get("source_docs", []) + docs = context.get("source_docs", []) or [] + instruction_text = context.get("instruction", "") + + # 优先从指令中提取具体的文件名 + filenames = _extract_filenames_from_text(instruction_text) + + if filenames: + # 只选择文件名匹配的那些文档 + matched_docs = [] + for doc in docs: + fname = doc.get("metadata", {}).get("original_filename", "").lower() + for fn in filenames: + if fn.lower() in fname or fname in fn.lower(): + matched_docs.append(doc) + break + # 如果匹配到足够文档,用匹配的 + if len(matched_docs) >= 2: + docs = matched_docs + else: + # 匹配不够,尝试按文件名搜索 MongoDB + all_found = [] + for fn in filenames: + found = await mongodb.search_documents(fn, limit=5) + all_found.extend(found) + seen = set() + unique_docs = [] + for d in all_found: + did = d.get("_id", "") + if did and did not in seen: + seen.add(did) + unique_docs.append(d) + if len(unique_docs) >= 2: + docs = unique_docs + elif len(unique_docs) == 1 and len(docs) >= 1: + # 找到一个指定的 + 用一个通用的 + docs = unique_docs + docs[:1] + elif docs and len(filenames) == 1: + # 找到一个指定文件名但只有一个匹配,尝试补充 + docs = unique_docs + [d for d in docs if d not in unique_docs] + docs = docs[:2] + if len(docs) < 2: return { "success": False, + "intent": "compare", "error": "对比需要至少2个文档", - "message": "请上传至少2个文档进行对比" + "message": "请上传至少2个文档进行对比,或明确说出要对比的文档名称" } # 提取文档基本信息 @@ -329,6 +548,7 @@ class InstructionExecutor: logger.error(f"对比执行失败: {e}") return { "success": False, + "intent": "compare", "error": str(e), "message": f"对比分析失败: {str(e)}" } @@ -336,10 +556,23 @@ class InstructionExecutor: async def _execute_edit(self, params: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]: """执行文档编辑操作""" try: - docs = context.get("source_docs", []) + docs = context.get("source_docs", []) or [] + instruction_text = context.get("instruction", "") + + # 如果没有文档,尝试从指令中提取文件名搜索 + if not docs: + filename_match = re.search(r'["""]([^"""]+\.(?:docx|xlsx|md|txt))["""]', instruction_text) + if not filename_match: + filename_match = re.search(r'([^\s,。!?]+\.(?:docx|xlsx|md|txt))', instruction_text) + if filename_match: + found = await mongodb.search_documents(filename_match.group(1), limit=3) + if found: + docs = found + if not docs: return { "success": False, + "intent": "edit", "error": "没有可用的文档", "message": "请先上传要编辑的文档" } @@ -405,7 +638,7 @@ class InstructionExecutor: - Word -> Markdown """ try: - docs = context.get("source_docs", []) + docs = context.get("source_docs", []) or [] if not docs: return { "success": False, diff --git a/backend/app/instruction/intent_parser.py b/backend/app/instruction/intent_parser.py index b53c034..71c6090 100644 --- a/backend/app/instruction/intent_parser.py +++ b/backend/app/instruction/intent_parser.py @@ -28,7 +28,7 @@ class IntentParser: INTENT_KEYWORDS = { INTENT_EXTRACT: ["提取", "抽取", "获取", "找出", "查找", "识别", "找到"], INTENT_FILL_TABLE: ["填表", "填写", "填充", "录入", "导入到表格", "填写到"], - INTENT_SUMMARIZE: ["总结", "摘要", "概括", "概述", "归纳", "提炼"], + INTENT_SUMMARIZE: ["总结", "摘要", "概括", "概述", "归纳", "提炼", "分析", "聊聊"], INTENT_QUESTION: ["问答", "回答", "解释", "什么是", "为什么", "如何", "怎样", "多少", "几个"], INTENT_SEARCH: ["搜索", "查找", "检索", "查询", "找"], INTENT_COMPARE: ["对比", "比较", "差异", "区别", "不同"], @@ -47,12 +47,13 @@ class IntentParser: 
def __init__(self):
         self.intent_history: List[Dict[str, Any]] = []
 
-    async def parse(self, text: str) -> Tuple[str, Dict[str, Any]]:
+    async def parse(self, text: str, context: Dict[str, Any] = None) -> Tuple[str, Dict[str, Any]]:
         """
         解析自然语言指令
 
         Args:
             text: 用户输入的自然语言
+            context: 执行上下文(包含对话历史等)
 
         Returns:
             (意图类型, 参数字典)
         """
@@ -61,11 +62,17 @@
         if not text:
             return self.INTENT_UNKNOWN, {}
 
+        # 检查对话历史中的上下文
+        conversation_history = []
+        if context and context.get("conversation_history"):
+            conversation_history = context.get("conversation_history", [])
+            logger.info(f"解析时使用对话历史: {len(conversation_history)} 条消息")
+
         # 记录历史
         self.intent_history.append({"text": text, "intent": None})
 
-        # 识别意图
-        intent = self._recognize_intent(text)
+        # 识别意图(考虑对话上下文)
+        intent = self._recognize_intent_with_context(text, conversation_history)
 
         # 提取参数
         params = self._extract_params(text, intent)
@@ -78,6 +85,42 @@
 
         return intent, params
 
+    def _recognize_intent_with_context(self, text: str, conversation_history: List[Dict[str, Any]]) -> str:
+        """
+        基于对话历史识别意图
+
+        Args:
+            text: 当前用户输入
+            conversation_history: 对话历史
+
+        Returns:
+            意图类型
+        """
+        # 如果对话历史为空,使用基础意图识别
+        if not conversation_history:
+            return self._recognize_intent(text)
+
+        # 基于历史上下文进行意图识别
+        # 分析最近的对话了解用户意图的延续性
+        last_intent = None
+        last_topic = None
+
+        for msg in conversation_history[-5:]:  # 最多看最近5条消息
+            if msg.get("role") == "assistant":
+                last_intent = msg.get("intent")
+                if msg.get("intent") and msg.get("intent") != "unknown":
+                    last_topic = msg.get("intent")
+
+        # 如果当前消息很短(如"继续"、"是的"),可能延续之前的意图
+        short_confirmation = ["是", "是的", "好", "继续", "ok", "接着", "然后", "还有吗"]
+        if text.strip() in short_confirmation or len(text.strip()) <= 3:
+            if last_topic:
+                logger.info(f"简短确认,延续之前的意图: {last_topic}")
+                return last_topic
+
+        # 否则使用标准意图识别
+        return self._recognize_intent(text)
+
     def _recognize_intent(self, text: str) -> str:
         """识别意图类型"""
         intent_scores: Dict[str, float] = {}
@@ -214,18 +257,27 @@
         return template_info if template_info else None
 
     def _extract_target_fields(self, text: str) -> List[str]:
-        """提取目标字段"""
+        """提取目标字段 - 按分隔符切分再逐段清理"""
         fields = []
 
-        # 匹配 "提取XXX和YYY"、"抽取XXX、YYY"
-        patterns = [
-            r"提取([^(and|,|,)+]+?)(?:和|与|、|,|plus)",
-            r"抽取([^(and|,|,)+]+?)(?:和|与|、|,|plus)",
-        ]
+        # 去除提取/抽取前缀
+        cleaned_text = re.sub(r"^(?:提取|抽取)", "", text).strip()
 
-        for pattern in patterns:
-            matches = re.findall(pattern, text)
-            fields.extend([m.strip() for m in matches if m.strip()])
+        # 按'和'、'与'、'、'分割成多段
+        segments = re.split(r"[和与、]", cleaned_text)
+
+        # 常见前缀(这些不是字段名,需要去除)
+        prefixes = ["文档中的", "文档中", "文件中的", "文件中", "内容中的", "内容中"]
+
+        for seg in segments:
+            seg = seg.strip()
+            # 去除常见前缀
+            for p in prefixes:
+                if seg.startswith(p):
+                    seg = seg[len(p):]
+                    break
+            if seg and 2 <= len(seg) <= 20:
+                fields.append(seg)
 
         return list(set(fields))
 
diff --git a/backend/app/services/excel_storage_service.py b/backend/app/services/excel_storage_service.py
index d324a5f..f608ab9 100644
--- a/backend/app/services/excel_storage_service.py
+++ b/backend/app/services/excel_storage_service.py
@@ -526,9 +526,10 @@ class ExcelStorageService:
         # 创建表
         model_class = self._create_table_model(table_name, columns, column_types)
 
-        # 创建表结构
+        # 创建表结构 (使用异步方式)
         async with self.mysql_db.get_session() as session:
-            model_class.__table__.create(session.bind, checkfirst=True)
+            async with session.bind.begin() as conn:
+                await conn.run_sync(lambda sync_conn: model_class.__table__.create(sync_conn, checkfirst=True))
 
         # 插入数据
         records = []
diff --git a/backend/app/services/rag_service.py
b/backend/app/services/rag_service.py index 50c2607..230800c 100644 --- a/backend/app/services/rag_service.py +++ b/backend/app/services/rag_service.py @@ -165,9 +165,9 @@ class BM25: class RAGService: """RAG 检索增强服务""" - # 默认分块参数 - DEFAULT_CHUNK_SIZE = 500 # 每个文本块的大小(字符数) - DEFAULT_CHUNK_OVERLAP = 50 # 块之间的重叠(字符数) + # 默认分块参数 - 增大块大小减少embedding次数 + DEFAULT_CHUNK_SIZE = 1000 # 每个文本块的大小(字符数),增大以提升速度 + DEFAULT_CHUNK_OVERLAP = 100 # 块之间的重叠(字符数) def __init__(self): self.embedding_model = None @@ -389,6 +389,70 @@ class RAGService: self._add_documents(documents, chunk_ids) logger.info(f"已索引文档 {doc_id},共 {len(chunks)} 个块") + async def index_document_content_async( + self, + doc_id: str, + content: str, + metadata: Optional[Dict[str, Any]] = None, + chunk_size: int = None, + chunk_overlap: int = None + ): + """ + 异步将文档内容索引到向量数据库(自动分块) + + 使用 asyncio.to_thread 避免阻塞事件循环 + """ + import asyncio + + if self._disabled: + logger.info(f"[RAG DISABLED] 文档索引操作已跳过: {doc_id}") + return + + if not self._initialized: + self._init_vector_store() + + if self.embedding_model is None: + logger.debug(f"文档跳过索引 (无嵌入模型): {doc_id}") + return + + # 分割文档为小块 + if chunk_size is None: + chunk_size = self.DEFAULT_CHUNK_SIZE + if chunk_overlap is None: + chunk_overlap = self.DEFAULT_CHUNK_OVERLAP + + chunks = self._split_into_chunks(content, chunk_size, chunk_overlap) + + if not chunks: + logger.warning(f"文档内容为空,跳过索引: {doc_id}") + return + + # 为每个块创建文档对象 + documents = [] + chunk_ids = [] + + for i, chunk in enumerate(chunks): + chunk_id = f"{doc_id}_chunk_{i}" + chunk_metadata = metadata.copy() if metadata else {} + chunk_metadata.update({ + "chunk_index": i, + "total_chunks": len(chunks), + "doc_id": doc_id + }) + + documents.append(SimpleDocument( + page_content=chunk, + metadata=chunk_metadata + )) + chunk_ids.append(chunk_id) + + # 使用线程池执行 CPU 密集型的 embedding 计算 + def _sync_add(): + self._add_documents(documents, chunk_ids) + + await asyncio.to_thread(_sync_add) + logger.info(f"已异步索引文档 {doc_id},共 {len(chunks)} 个块") + def _add_documents(self, documents: List[SimpleDocument], doc_ids: List[str]): """批量添加文档到向量索引""" if not documents: diff --git a/backend/app/services/table_rag_service.py b/backend/app/services/table_rag_service.py index d2a9dd0..4130e10 100644 --- a/backend/app/services/table_rag_service.py +++ b/backend/app/services/table_rag_service.py @@ -300,13 +300,15 @@ class TableRAGService: filename: str, sheet_name: Optional[str] = None, header_row: int = 0, - sample_size: int = 10 + sample_size: int = 10, + skip_rag_index: bool = False ) -> Dict[str, Any]: """ 为 Excel 表构建完整的 RAG 索引 流程: 1. 读取 Excel 获取字段信息 + 2. 如果 skip_rag_index=True,跳过 RAG 索引,直接存 MySQL 2. AI 生成每个字段的语义描述 3. 将字段描述存入向量数据库 @@ -367,6 +369,20 @@ class TableRAGService: results["field_count"] = len(df.columns) logger.info(f"表名: {table_name}, 字段数: {len(df.columns)}") + # 跳过 RAG 索引时直接存 MySQL + if skip_rag_index: + logger.info(f"跳过 RAG 索引,直接存储到 MySQL") + store_result = await self.excel_storage.store_excel( + file_path=file_path, + filename=filename, + sheet_name=sheet_name, + header_row=header_row + ) + results["mysql_table"] = store_result.get("table_name") if store_result.get("success") else None + results["row_count"] = store_result.get("row_count", len(df)) + results["indexed_count"] = 0 + return results + # 3. 
初始化 RAG (如果需要) if not self.rag._initialized: self.rag._init_vector_store() diff --git a/backend/app/services/template_fill_service.py b/backend/app/services/template_fill_service.py index 9465d35..aeadf2e 100644 --- a/backend/app/services/template_fill_service.py +++ b/backend/app/services/template_fill_service.py @@ -5,6 +5,7 @@ """ import asyncio import logging +import re from dataclasses import dataclass, field from typing import Any, Dict, List, Optional @@ -13,6 +14,7 @@ from app.services.llm_service import llm_service from app.core.document_parser import ParserFactory from app.services.markdown_ai_service import markdown_ai_service from app.services.rag_service import rag_service +from app.services.excel_storage_service import excel_storage_service logger = logging.getLogger(__name__) @@ -105,12 +107,60 @@ class TemplateFillService: # 3. 检查是否需要使用源文档重新生成表头 # 条件:源文档已加载 AND 现有字段看起来是自动生成的(如"字段1"、"字段2") + # 注意:Word 模板(docx)不自动重新生成表头,因为 Word 模板的表结构由用户定义,必须保留 needs_regenerate_headers = ( + template_file_type != "docx" and len(source_docs) > 0 and len(template_fields) > 0 and all(self._is_auto_generated_field(f.name) for f in template_fields) ) + # 4. Word 模板特殊处理:表头为空时,从源文档生成字段 + # 仅当有源文档、模板字段为空、模板文件类型为 docx 时触发 + if not needs_regenerate_headers and template_file_type == "docx" and len(source_docs) > 0 and len(template_fields) == 0: + logger.info(f"Word 模板表头为空,从源文档生成字段... (source_docs={len(source_docs)})") + source_contents = [] + for doc in source_docs: + structured = doc.structured_data if doc.structured_data else {} + titles = structured.get("titles", []) + tables = structured.get("tables", []) + tables_count = len(tables) if tables else 0 + tables_summary = "" + if tables: + tables_summary = "\n【文档中的表格】:\n" + for idx, table in enumerate(tables[:5]): + if isinstance(table, dict): + headers = table.get("headers", []) + rows = table.get("rows", []) + if headers: + tables_summary += f"表格{idx+1}表头: {', '.join(str(h) for h in headers)}\n" + if rows: + tables_summary += f"表格{idx+1}前3行: " + for row_idx, row in enumerate(rows[:3]): + if isinstance(row, list): + tables_summary += " | ".join(str(c) for c in row) + "; " + elif isinstance(row, dict): + tables_summary += " | ".join(str(row.get(h, "")) for h in headers if headers) + "; " + tables_summary += "\n" + source_contents.append({ + "filename": doc.filename, + "doc_type": doc.doc_type, + "content": doc.content[:5000] if doc.content else "", + "titles": titles[:10] if titles else [], + "tables_count": tables_count, + "tables_summary": tables_summary + }) + if template_id: + generated_fields = await self.get_template_fields_from_file( + template_id, + template_file_type, + source_contents=source_contents, + source_docs=source_docs + ) + if generated_fields: + template_fields = generated_fields + logger.info(f"Word 模板字段生成成功: {[f.name for f in template_fields]}") + if needs_regenerate_headers: logger.info(f"检测到自动生成表头,尝试使用源文档重新生成... 
(当前字段: {[f.name for f in template_fields]})") @@ -162,7 +212,8 @@ class TemplateFillService: new_fields = await self.get_template_fields_from_file( template_id, template_file_type, - source_contents=source_contents + source_contents=source_contents, + source_docs=source_docs ) if new_fields and len(new_fields) > 0: logger.info(f"成功重新生成表头: {[f.name for f in new_fields]}") @@ -224,14 +275,357 @@ class TemplateFillService: max_rows = max(len(v) for v in filled_data.values()) if filled_data else 1 logger.info(f"填表完成: {len(filled_data)} 个字段, 最大行数: {max_rows}") + # 如果是 Word 模板,将数据填入模板文件 + filled_file_path = None + if template_file_type == "docx" and template_id and filled_data: + filled_file_path = await self._fill_docx(template_id, filled_data) + if filled_file_path: + logger.info(f"Word 模板已填写,输出文件: {filled_file_path}") + return { "success": True, "filled_data": filled_data, "fill_details": fill_details, "source_doc_count": len(source_docs), - "max_rows": max_rows + "max_rows": max_rows, + "filled_file_path": filled_file_path } + async def _polish_word_filled_data( + self, + filled_data: Dict[str, Any] + ) -> Dict[str, str]: + """ + 将提取的结构化数据(尤其是多行Excel数据)进行统计归纳, + 然后润色为自然语言文本 + + Args: + filled_data: {字段名: [原始值列表]} + + Returns: + {字段名: 润色后的文本} + """ + if not filled_data: + return {} + + try: + import json + + # 第一步:对数值型多行数据进行统计分析 + data_summary = [] + for field_name, values in filled_data.items(): + if not isinstance(values, list) or not values: + continue + + # 过滤掉无效值 + raw_values = [] + for v in values: + if v and str(v).strip() and not str(v).startswith('[提取失败'): + raw_values.append(str(v).strip()) + + if not raw_values: + continue + + # 尝试解析为数值进行统计 + numeric_values = [] + for v in raw_values: + # 提取数值(处理 "123个"、"78.5%"、"1,234" 等格式) + num_str = re.sub(r'[^\d.\-]', '', str(v)) + try: + if num_str and num_str != '-' and num_str != '.': + numeric_values.append(float(num_str)) + except ValueError: + pass + + # 根据字段名判断类型 + field_lower = field_name.lower() + is_count_field = any(kw in field_lower for kw in ['数量', '总数', '次数', '条数', '订单数', '记录数', '条目']) + is_amount_field = any(kw in field_lower for kw in ['金额', '总额', '合计', '总计', '销售额', '收入', '支出', '成本']) + is_ratio_field = any(kw in field_lower for kw in ['比率', '比例', '占比', '率', '使用率', '增长', '增幅']) + is_name_field = any(kw in field_lower for kw in ['名称', '机构', '医院', '公司', '单位', '部门', '区域', '类别']) + + if len(numeric_values) >= 2 and len(numeric_values) == len(raw_values): + # 多行数值数据,进行统计归纳 + total = sum(numeric_values) + avg = total / len(numeric_values) + max_val = max(numeric_values) + min_val = min(numeric_values) + + stats_lines = [ + f"【{field_name}】(共 {len(raw_values)} 条数据):", + f" - 合计: {self._format_number(total)}" if is_amount_field else f" - 合计: {total:.2f}", + f" - 平均: {avg:.2f}", + f" - 最大: {max_val:.2f}", + f" - 最小: {min_val:.2f}", + ] + + # 对原始值去重计数(如果是名称类字段) + if is_name_field: + unique_values = list(set(raw_values)) + if len(unique_values) <= 10: + stats_lines.append(f" - 涉及类别(共 {len(unique_values)} 种): {'、'.join(unique_values[:8])}") + else: + stats_lines.append(f" - 涉及 {len(unique_values)} 个不同类别") + + # 取前5个原始示例 + stats_lines.append(f" - 示例值: {'、'.join(raw_values[:5])}") + data_summary.append('\n'.join(stats_lines)) + + elif is_ratio_field and len(numeric_values) == 1: + # 单值百分比 + pct = numeric_values[0] + data_summary.append(f"【{field_name}】: {pct:.1f}%,表示相关指标的相对水平") + + elif is_amount_field and len(numeric_values) >= 1: + # 金额类(单位通常是万元/亿元) + total = sum(numeric_values) + unit = "" + if total >= 10000: + unit = f"(约 
{total/10000:.2f} 万元)" + elif total >= 1: + unit = f"(约 {total:.2f} 元)" + data_summary.append(f"【{field_name}】: 合计 {self._format_number(total)}{unit},基于 {len(raw_values)} 条记录汇总") + + elif is_count_field and len(numeric_values) >= 1: + # 数量类 + total = sum(numeric_values) + data_summary.append(f"【{field_name}】: 共 {self._format_number(total)},基于 {len(raw_values)} 条记录汇总") + + else: + # 无法归类的多值数据,做去重归纳 + unique_values = list(set(raw_values)) + if len(unique_values) <= 8: + data_summary.append(f"【{field_name}】(共 {len(raw_values)} 条,去重后 {len(unique_values)} 项): {'、'.join(unique_values[:8])}") + elif len(raw_values) > 8: + data_summary.append(f"【{field_name}】(共 {len(raw_values)} 条记录): {'、'.join(raw_values[:5])} 等") + else: + data_summary.append(f"【{field_name}】: {'、'.join(raw_values)}") + + if not data_summary: + return {k: (', '.join(str(v) for v in vals[:5]) if isinstance(vals, list) else str(vals)) + for k, vals in filled_data.items()} + + # 第二步:调用 LLM 将统计分析结果转化为专业自然语言描述 + prompt = f"""你是一个专业的数据分析报告助手。请根据以下从文档中提取并统计的数据,生成专业、简洁的自然语言描述。 + +【数据统计结果】: +{chr(10).join(data_summary)} + +【润色要求】: +1. 每个字段生成一段专业的描述性文本(20-60字) +2. 数值类字段要明确标注单位和含义,如"销售总额达1,234.5万元,共涵盖56个订单" +3. 分类/名称类字段要归纳总结类别,如"涉及医疗器械、药品采购、设备维修等5个业务类别" +4. 多值数据不要简单罗列,要做总结,如"覆盖华东地区(上海、江苏、浙江)、华南地区(广东)等6个省市的销售网络" +5. 百分比/比率类要加背景说明,如"综合毛利率为23.5%,处于行业正常水平" +6. 保持文本通顺、专业,符合正式报告风格 +7. 每段控制在60字以内 + +【输出格式】(严格按JSON格式,只返回JSON,不要任何其他内容): +{{ + "字段名1": "润色后的描述文本1", + "字段名2": "润色后的描述文本2" +}} +""" + messages = [ + {"role": "system", "content": "你是一个专业的数据分析报告助手。请严格按JSON格式输出,只返回纯JSON,不要任何其他内容。"}, + {"role": "user", "content": prompt} + ] + + response = await self.llm.chat( + messages=messages, + temperature=0.3, + max_tokens=3000 + ) + content = self.llm.extract_message_content(response) + logger.info(f"LLM 润色 Word 数据返回: {content[:500]}") + + # 尝试解析 JSON + json_match = re.search(r'\{[\s\S]*\}', content) + if json_match: + polished = json.loads(json_match.group()) + logger.info(f"LLM 润色成功: {len(polished)} 个字段") + return polished + else: + logger.warning(f"LLM 返回无法解析为 JSON: {content[:200]}") + # 回退到原始统计摘要 + return {k: (', '.join(str(v) for v in vals[:5]) if isinstance(vals, list) else str(vals)) + for k, vals in filled_data.items()} + + except Exception as e: + logger.error(f"LLM 润色失败: {str(e)}") + # 润色失败时回退到原始值 + return {k: (', '.join(str(v) for v in vals[:5]) if isinstance(vals, list) else str(vals)) + for k, vals in filled_data.items()} + + def _format_number(self, num: float) -> str: + """格式化数字,添加千分位""" + if abs(num) >= 10000: + return f"{num:,.2f}" + elif abs(num) >= 1: + return f"{num:,.2f}" + else: + return f"{num:.4f}" + + async def _fill_docx( + self, + template_path: str, + filled_data: Dict[str, Any] + ) -> Optional[str]: + """ + 将提取的数据填入 Word 模板 + + Args: + template_path: Word 模板文件路径 + filled_data: 字段值字典 {field_name: [values]} + + Returns: + 填写后的文件路径,失败返回 None + """ + import re + import os + import tempfile + import shutil + from docx import Document + from docx.shared import RGBColor + + def clean_text(text: str) -> str: + """清理文本,移除非法字符""" + if not text: + return "" + # 移除控制字符 + text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text) + # 移除 Word 中常见的非法替代字符(显示为方框) + text = re.sub(r'[\ufffd\u25a1\u25a9\u2610\u2611\u25cb\u25c9]', '', text) + # 移除其他无效 Unicode 字符 + text = re.sub(r'[\ufeff\u200b-\u200f\u2028-\u202e]', '', text) + return text.strip() + + def set_cell_text(cell, text: str): + """设置单元格文本(保留原有格式)""" + cell.text = text + # 确保文本颜色为黑色 + for para in cell.paragraphs: + for run in para.runs: + run.font.color.rgb = RGBColor(0, 0, 0) + + 
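
Note: the three regex passes in clean_text above are easy to sanity-check in isolation. A minimal standalone sketch follows (it re-declares the same patterns so it runs on its own; the sample string is illustrative only):

    import re

    def clean_text(text: str) -> str:
        """Same three passes as the helper in _fill_docx above."""
        if not text:
            return ""
        text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)              # C0/DEL control chars
        text = re.sub(r'[\ufffd\u25a1\u25a9\u2610\u2611\u25cb\u25c9]', '', text)  # replacement char + box glyphs
        text = re.sub(r'[\ufeff\u200b-\u200f\u2028-\u202e]', '', text)            # BOM, zero-width, bidi marks
        return text.strip()

    # U+FFFD, a hollow box, a zero-width space and a NUL byte all disappear:
    assert clean_text("报\ufffd告□\u200b 2024\x00") == "报告 2024"

Worth noting: the box-glyph class also removes legitimate checkbox characters (U+2610/U+2611), so a template that uses them as form controls would lose those marks.
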
try: + # 先对数据进行 LLM 润色(非结构化文本补充和润色) + logger.info(f"Word 填写前开始 LLM 润色 {len(filled_data)} 个字段...") + polished_data = await self._polish_word_filled_data(filled_data) + logger.info(f"LLM 润色完成,使用润色后文本写入 Word") + + # 创建临时目录存放修改后的文件 + temp_dir = tempfile.mkdtemp() + output_path = os.path.join(temp_dir, "filled_template.docx") + + # 复制模板到临时文件 + shutil.copy2(template_path, output_path) + + # 打开复制的模板 + doc = Document(output_path) + + matched_fields = set() + + # 遍历表格,找到字段名所在的行,填写对应值 + for table in doc.tables: + for row in table.rows: + cells = row.cells + if not cells: + continue + + first_cell_text = cells[0].text.strip() + if not first_cell_text: + continue + + # 精确匹配字段名 + if first_cell_text in polished_data: + display_text = polished_data[first_cell_text] + if display_text: + if len(cells) > 1: + set_cell_text(cells[1], clean_text(display_text)) + matched_fields.add(first_cell_text) + logger.info(f"Word 填写(精确): {first_cell_text} = {display_text[:50] if display_text else ''}") + continue + + # 前缀/后缀匹配 + for field_name, display_text in polished_data.items(): + if field_name and first_cell_text and ( + field_name.startswith(first_cell_text) or first_cell_text.startswith(field_name) + ): + if display_text: + if len(cells) > 1: + set_cell_text(cells[1], clean_text(display_text)) + matched_fields.add(field_name) + logger.info(f"Word 填写(模糊): {first_cell_text} ≈ {field_name} = {display_text[:50] if display_text else ''}") + break + + # 如果有未匹配的字段(模板第一列为空),使用段落格式写入(带分隔线,更清晰) + unmatched_fields = [f for f in polished_data if f not in matched_fields] + if unmatched_fields: + logger.info(f"使用段落格式写入 {len(unmatched_fields)} 个字段(带分隔线)") + + from docx.oxml.ns import qn + from docx.oxml import OxmlElement + from docx.shared import Pt, RGBColor + + def add_horizontal_separator(doc, before_para=None): + """添加水平分隔线(通过段落下边框实现)""" + sep_para = OxmlElement('w:p') + pPr = OxmlElement('w:pPr') + pBdr = OxmlElement('w:pBdr') + bottom = OxmlElement('w:bottom') + bottom.set(qn('w:val'), 'single') + bottom.set(qn('w:sz'), '6') + bottom.set(qn('w:space'), '1') + bottom.set(qn('w:color'), 'CCCCCC') + pBdr.append(bottom) + pPr.append(pBdr) + sep_para.append(pPr) + if before_para is not None: + before_para._element.addprevious(sep_para) + else: + doc._body.append(sep_para) + + def add_field_section(doc, field_name: str, display_text: str): + """添加一个字段区域:字段名(加粗)+ 值段落 + 分隔线""" + from docx.shared import Pt + + # 字段名段落(加粗) + name_para = doc.add_paragraph() + name_run = name_para.add_run(f"📌 {field_name}") + name_run.bold = True + name_run.font.size = Pt(11) + name_run.font.color.rgb = RGBColor(0, 51, 102) + name_para.paragraph_format.space_before = Pt(12) + name_para.paragraph_format.space_after = Pt(3) + + # 值段落 + value_para = doc.add_paragraph() + value_run = value_para.add_run(display_text) + value_run.font.size = Pt(10.5) + value_run.font.color.rgb = RGBColor(51, 51, 51) + value_para.paragraph_format.space_before = Pt(0) + value_para.paragraph_format.space_after = Pt(6) + + # 分隔线 + add_horizontal_separator(doc, value_para) + + # 在文档末尾添加各字段段落 + for field_name in unmatched_fields: + display_text = polished_data[field_name] + if display_text: + add_field_section(doc, field_name, clean_text(display_text)) + logger.info(f"Word 段落写入: {field_name} = {display_text[:60]}") + + # 保存修改后的文档 + doc.save(output_path) + logger.info(f"Word 模板填写完成: {output_path}, 匹配字段: {len(matched_fields)}, 追加字段: {len(unmatched_fields)}") + return output_path + + except Exception as e: + logger.error(f"Word 模板填写失败: {str(e)}") + return None + async def 
_load_source_documents(
        self,
        source_doc_ids: Optional[List[str]] = None,
@@ -257,10 +651,38 @@ class TemplateFillService:
             if doc:
                 sd = doc.get("structured_data", {})
                 sd_keys = list(sd.keys()) if sd else []
-                logger.info(f"从MongoDB加载文档: {doc_id}, doc_type={doc.get('doc_type')}, structured_data keys={sd_keys}")
+                doc_type = doc.get("doc_type", "")
+                mysql_table_name = doc.get("metadata", {}).get("mysql_table_name")
+                logger.info(f"从MongoDB加载文档: {doc_id}, doc_type={doc_type}, structured_data keys={sd_keys}, mysql_table={mysql_table_name}")

-                # 如果 structured_data 为空,但有 file_path,尝试重新解析文件
                 doc_content = doc.get("content", "")
+
+                # 如果是 Excel 类型且有 MySQL 表名,直接从 MySQL 加载数据
+                if doc_type in ["xlsx", "xls"] and mysql_table_name:
+                    try:
+                        logger.info(f"  从 MySQL 表 {mysql_table_name} 加载 Excel 数据")
+                        mysql_data = await excel_storage_service.query_table(mysql_table_name, limit=1000)
+                        if mysql_data:
+                            # 转换为 SourceDocument 格式
+                            if mysql_data and len(mysql_data) > 0:
+                                columns = list(mysql_data[0].keys()) if mysql_data else []
+                                rows = [[row.get(col) for col in columns] for row in mysql_data]
+                                sd = {
+                                    "headers": columns,
+                                    "rows": rows,
+                                    "row_count": len(mysql_data),
+                                    "column_count": len(columns),
+                                    "source": "mysql"
+                                }
+                                logger.info(f"  MySQL 数据加载成功: {len(mysql_data)} 行, {len(columns)} 列")
+                            else:
+                                logger.warning(f"  MySQL 表 {mysql_table_name} 无数据")
+                        else:
+                            logger.warning(f"  MySQL 表 {mysql_table_name} 查询无结果")
+                    except Exception as mysql_err:
+                        logger.error(f"  MySQL 加载失败: {str(mysql_err)}")
+
+                # 如果 structured_data 仍然为空,尝试重新解析文件
                 if not sd or (not sd.get("tables") and not sd.get("headers") and not sd.get("rows")):
                     file_path = doc.get("metadata", {}).get("file_path")
                     if file_path:
@@ -294,7 +716,7 @@ class TemplateFillService:
                 source_docs.append(SourceDocument(
                     doc_id=doc_id,
                     filename=doc.get("metadata", {}).get("original_filename", "unknown"),
-                    doc_type=doc.get("doc_type", "unknown"),
+                    doc_type=doc_type,
                     content=doc_content,
                     structured_data=sd
                 ))
@@ -1047,7 +1469,8 @@ class TemplateFillService:
         self,
         file_path: str,
         file_type: str = "xlsx",
-        source_contents: List[dict] = None
+        source_contents: List[dict] = None,
+        source_docs: List["SourceDocument"] = None
     ) -> List[TemplateField]:
         """
         从模板文件提取字段定义
@@ -1071,15 +1494,18 @@
             fields = await self._get_template_fields_from_docx(file_path)

         # 检查是否需要 AI 生成表头
-        # 条件:没有字段 OR 所有字段都是自动命名的(如"字段1"、"列1"、"Unnamed"开头)
+        # 条件:没有字段 OR 所有字段都是自动命名的
+        # 对于 docx:仅当有源文档时才允许 AI 生成(避免覆盖用户定义的表头)
         needs_ai_generation = (
-            len(fields) == 0 or
-            all(self._is_auto_generated_field(f.name) for f in fields)
+            (len(fields) == 0 or
+             all(self._is_auto_generated_field(f.name) for f in fields))
+        ) and (
+            file_type != "docx" or bool(source_contents)
         )

         if needs_ai_generation:
             logger.info(f"模板表头为空或自动生成,尝试 AI 生成表头... (fields={len(fields)}, source_docs={len(source_contents) if source_contents else 0})")
-            ai_fields = await self._generate_fields_with_ai(file_path, file_type, source_contents)
+            ai_fields = await self._generate_fields_with_ai(file_path, file_type, source_contents, source_docs)
             if ai_fields:
                 fields = ai_fields
                 logger.info(f"AI 生成表头成功: {len(fields)} 个字段")
@@ -2134,7 +2560,8 @@ class TemplateFillService:
         self,
         file_path: str,
         file_type: str,
-        source_contents: List[dict] = None
+        source_contents: List[dict] = None,
+        source_docs: List["SourceDocument"] = None
     ) -> Optional[List[TemplateField]]:
         """
         使用 AI 为空表生成表头字段
@@ -2148,6 +2575,8 @@
         Returns:
             生成的字段列表,如果失败返回 None
         """
+        import random
+
         try:
             import pandas as pd
@@ -2182,24 +2611,21 @@
             else:
                 content_sample = ""

-            # 调用 AI 生成表头
-            # 根据源文档内容生成表头
-            source_info = ""
-            logger.info(f"[DEBUG] _generate_fields_with_ai received source_contents: {len(source_contents) if source_contents else 0} items")
+            # 优先从源文档的表格表头中随机选取
             if source_contents:
-                for sc in source_contents:
-                    logger.info(f"[DEBUG] source doc: filename={sc.get('filename')}, content_len={len(sc.get('content', ''))}, titles={len(sc.get('titles', []))}, tables_count={sc.get('tables_count', 0)}, has_tables_summary={bool(sc.get('tables_summary'))}")
-                source_info = "\n\n【源文档内容摘要】(根据以下文档内容生成表头):\n"
+                import re
+                all_headers = []
+                source_info = ""
+
                 for idx, src in enumerate(source_contents[:5]):  # 最多5个源文档
                     filename = src.get("filename", f"文档{idx+1}")
                     doc_type = src.get("doc_type", "unknown")
-                    content = src.get("content", "")[:3000]  # 限制内容长度
-                    titles = src.get("titles", [])[:10]  # 最多10个标题
+                    content = src.get("content", "")[:3000]
+                    titles = src.get("titles", [])[:10]
                     tables_count = src.get("tables_count", 0)
                     tables_summary = src.get("tables_summary", "")

                     source_info += f"\n--- 文档 {idx+1}: {filename} ({doc_type}) ---\n"
-                    # 处理 titles(可能是字符串列表或字典列表)
                     if titles:
                         title_texts = []
                         for t in titles[:5]:
@@ -2216,6 +2642,72 @@
                     if content:
                         source_info += f"【文档内容】(前3000字符):{content[:3000]}\n"

+                    # 从 tables_summary 中提取表头
+                    # 表格摘要格式如: "表格1表头: 姓名, 年龄, 性别"
+                    if tables_summary:
+                        header_matches = re.findall(r'表头:\s*([^\n]+)', tables_summary)
+                        for match in header_matches:
+                            # 分割表头字符串
+                            headers = [h.strip() for h in match.split(',') if h.strip()]
+                            all_headers.extend(headers)
+                            logger.info(f"从表格摘要提取到表头: {headers}")
+
+                # 从源文档的 structured_data 中直接提取表头(Excel 等数据源)
+                for doc in (source_docs or []):
+                    if doc.structured_data:
+                        sd = doc.structured_data
+                        # Excel 格式: {columns: [...], rows: [...]}
+                        if sd.get("columns"):
+                            cols = sd.get("columns", [])
+                            if isinstance(cols, list) and cols:
+                                all_headers.extend([str(c) for c in cols if str(c).strip()])
+                                logger.info(f"从 structured_data.columns 提取到表头: {cols}")
+                        # 多 sheet 格式: {sheets: {sheet_name: {columns, rows}}}
+                        if sd.get("sheets"):
+                            for sheet_name, sheet_data in sd.get("sheets", {}).items():
+                                if isinstance(sheet_data, dict) and sheet_data.get("columns"):
+                                    cols = sheet_data.get("columns", [])
+                                    if isinstance(cols, list) and cols:
+                                        all_headers.extend([str(c) for c in cols if str(c).strip()])
+                                        logger.info(f"从 sheets.{sheet_name} 提取到表头: {cols}")
+                        # Markdown/表格格式: {tables: [{headers, rows}]}
+                        if sd.get("tables") and isinstance(sd.get("tables"), list):
+                            for table in sd.get("tables", []):
+                                if isinstance(table, dict) and table.get("headers"):
+                                    headers = table.get("headers", [])
+                                    if isinstance(headers, list) and headers:
+                                        all_headers.extend([str(h) for h in headers if str(h).strip()])
+                                        logger.info(f"从 tables 提取到表头:
{headers}") + # 另一种格式: {headers, rows} + if sd.get("headers") and sd.get("rows"): + headers = sd.get("headers", []) + if isinstance(headers, list) and headers: + all_headers.extend([str(h) for h in headers if str(h).strip()]) + logger.info(f"从 headers/rows 提取到表头: {headers}") + + # 如果从表格摘要中获取到了表头,随机选取一部分 + if all_headers: + logger.info(f"共有 {len(all_headers)} 个表头可用") + # 随机选取 5-7 个表头 + num_fields = min(random.randint(5, 7), len(all_headers)) + selected_headers = random.sample(all_headers, num_fields) + logger.info(f"随机选取的表头: {selected_headers}") + + fields = [] + for idx, header in enumerate(selected_headers): + fields.append(TemplateField( + cell=self._column_to_cell(idx), + name=header, + field_type="text", + required=False, + hint="" + )) + return fields + else: + source_info = "" + + # 如果无法从表格表头获取,才调用 AI 生成 + prompt = f"""你是一个专业的数据分析助手。请分析源文档中的所有数据,生成表格表头字段。 任务:分析源文档,找出所有具体的数据指标及其分类。 diff --git a/backend/requirements.txt b/backend/requirements.txt index c1700bd..c586179 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -39,6 +39,8 @@ openpyxl==3.1.2 python-docx==0.8.11 markdown-it-py==3.0.0 chardet==5.2.0 +Pillow>=10.0.0 +pytesseract>=0.3.10 # ==================== AI / LLM ==================== httpx==0.25.2 diff --git a/frontend/src/db/backend-api.ts b/frontend/src/db/backend-api.ts index 24973be..7d43424 100644 --- a/frontend/src/db/backend-api.ts +++ b/frontend/src/db/backend-api.ts @@ -781,7 +781,8 @@ export const backendApi = { async exportFilledTemplate( templateId: string, filledData: Record, - format: 'xlsx' | 'docx' = 'xlsx' + format: 'xlsx' | 'docx' = 'xlsx', + filledFilePath?: string ): Promise { const url = `${BACKEND_BASE_URL}/templates/export`; @@ -793,6 +794,7 @@ export const backendApi = { template_id: templateId, filled_data: filledData, format, + ...(filledFilePath && { filled_file_path: filledFilePath }), }), }); @@ -964,6 +966,101 @@ export const backendApi = { throw error; } }, + + // ==================== 智能指令 API ==================== + + /** + * 智能对话(支持多轮对话的指令执行) + */ + async instructionChat( + instruction: string, + docIds?: string[], + context?: Record + ): Promise<{ + success: boolean; + intent: string; + result: Record; + message: string; + hint?: string; + }> { + const url = `${BACKEND_BASE_URL}/instruction/chat`; + + try { + const response = await fetch(url, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ instruction, doc_ids: docIds, context }), + }); + + if (!response.ok) { + const error = await response.json(); + throw new Error(error.detail || '对话处理失败'); + } + + return await response.json(); + } catch (error) { + console.error('对话处理失败:', error); + throw error; + } + }, + + /** + * 获取支持的指令类型列表 + */ + async getSupportedIntents(): Promise<{ + intents: Array<{ + intent: string; + name: string; + examples: string[]; + params: string[]; + }>; + }> { + const url = `${BACKEND_BASE_URL}/instruction/intents`; + + try { + const response = await fetch(url); + if (!response.ok) throw new Error('获取指令列表失败'); + return await response.json(); + } catch (error) { + console.error('获取指令列表失败:', error); + throw error; + } + }, + + /** + * 执行指令(同步模式) + */ + async executeInstruction( + instruction: string, + docIds?: string[], + context?: Record + ): Promise<{ + success: boolean; + intent: string; + result: Record; + message: string; + }> { + const url = `${BACKEND_BASE_URL}/instruction/execute`; + + try { + const response = await fetch(url, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, 
+ body: JSON.stringify({ instruction, doc_ids: docIds, context }), + }); + + if (!response.ok) { + const error = await response.json(); + throw new Error(error.detail || '指令执行失败'); + } + + return await response.json(); + } catch (error) { + console.error('指令执行失败:', error); + throw error; + } + }, + }; // ==================== AI 分析 API ==================== @@ -1529,61 +1626,66 @@ export const aiApi = { } }, + // ==================== 对话历史 API ==================== + /** - * 智能对话(支持多轮对话的指令执行) + * 获取对话历史 */ - async instructionChat( - instruction: string, - docIds?: string[], - context?: Record - ): Promise<{ + async getConversationHistory(conversationId: string, limit: number = 20): Promise<{ success: boolean; - intent: string; - result: Record; - message: string; - hint?: string; - }> { - const url = `${BACKEND_BASE_URL}/instruction/chat`; - - try { - const response = await fetch(url, { - method: 'POST', - headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ instruction, doc_ids: docIds, context }), - }); - - if (!response.ok) { - const error = await response.json(); - throw new Error(error.detail || '对话处理失败'); - } - - return await response.json(); - } catch (error) { - console.error('对话处理失败:', error); - throw error; - } - }, - - /** - * 获取支持的指令类型列表 - */ - async getSupportedIntents(): Promise<{ - intents: Array<{ - intent: string; - name: string; - examples: string[]; - params: string[]; + messages: Array<{ + role: string; + content: string; + intent?: string; + created_at: string; }>; }> { - const url = `${BACKEND_BASE_URL}/instruction/intents`; + const url = `${BACKEND_BASE_URL}/conversation/${conversationId}/history?limit=${limit}`; try { const response = await fetch(url); - if (!response.ok) throw new Error('获取指令列表失败'); + if (!response.ok) throw new Error('获取对话历史失败'); return await response.json(); } catch (error) { - console.error('获取指令列表失败:', error); - throw error; + console.error('获取对话历史失败:', error); + return { success: false, messages: [] }; } }, + + /** + * 删除对话历史 + */ + async deleteConversation(conversationId: string): Promise<{ + success: boolean; + }> { + const url = `${BACKEND_BASE_URL}/conversation/${conversationId}`; + + try { + const response = await fetch(url, { method: 'DELETE' }); + if (!response.ok) throw new Error('删除对话历史失败'); + return await response.json(); + } catch (error) { + console.error('删除对话历史失败:', error); + return { success: false }; + } + }, + + /** + * 获取会话列表 + */ + async listConversations(limit: number = 50): Promise<{ + success: boolean; + conversations: Array; + }> { + const url = `${BACKEND_BASE_URL}/conversation/all?limit=${limit}`; + + try { + const response = await fetch(url); + if (!response.ok) throw new Error('获取会话列表失败'); + return await response.json(); + } catch (error) { + console.error('获取会话列表失败:', error); + return { success: false, conversations: [] }; + } + } }; diff --git a/frontend/src/pages/Dashboard.tsx b/frontend/src/pages/Dashboard.tsx index 7563304..888d81f 100644 --- a/frontend/src/pages/Dashboard.tsx +++ b/frontend/src/pages/Dashboard.tsx @@ -15,12 +15,14 @@ import { Sparkles, Database, FileSpreadsheet, - RefreshCcw + RefreshCcw, + Trash2 } from 'lucide-react'; import { backendApi } from '@/db/backend-api'; import { formatDistanceToNow } from 'date-fns'; import { zhCN } from 'date-fns/locale'; import { cn } from '@/lib/utils'; +import { toast } from 'sonner'; type DocumentItem = { doc_id: string; @@ -108,7 +110,7 @@ const Dashboard: React.FC = () => {
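
The conversation helpers added to backend-api.ts just above (getConversationHistory / deleteConversation / listConversations) only pass through to the backend; the mongodb.py side of those calls is not part of this excerpt. A minimal motor-style sketch of the read they imply follows — the collection name, field names, and connection string are assumptions, not taken from the real module:

    from motor.motor_asyncio import AsyncIOMotorClient

    client = AsyncIOMotorClient("mongodb://localhost:27017")  # assumed URI
    db = client["app_db"]                                     # assumed database name

    async def get_conversation_history(conversation_id: str, limit: int = 20) -> list:
        # Fetch the newest `limit` messages, then reverse so callers
        # receive them in chronological order.
        cursor = (
            db["conversations"]                               # assumed collection
            .find({"conversation_id": conversation_id}, {"_id": 0})
            .sort("created_at", -1)
            .limit(limit)
        )
        messages = await cursor.to_list(length=limit)
        return list(reversed(messages))
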
{[ { label: '已上传文档', value: stats.docs, icon: FileText, color: 'bg-blue-500', trend: '非结构化文档', link: '/documents' }, - { label: 'Excel 文件', value: stats.excelFiles, icon: FileSpreadsheet, color: 'bg-emerald-500', trend: '结构化数据', link: '/excel-parse' }, + { label: 'Excel 文件', value: stats.excelFiles, icon: FileSpreadsheet, color: 'bg-emerald-500', trend: '结构化数据', link: '/documents' }, { label: '填表任务', value: stats.tasks, icon: TableProperties, color: 'bg-indigo-500', trend: '待实现', link: '/form-fill' } ].map((stat, i) => ( @@ -164,8 +166,30 @@ const Dashboard: React.FC = () => { {doc.doc_type.toUpperCase()} • {formatDistanceToNow(new Date(doc.created_at), { addSuffix: true, locale: zhCN })}

-
- {doc.doc_type} +
+
+ {doc.doc_type} +
+
))} @@ -197,7 +221,7 @@ const Dashboard: React.FC = () => {
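
The recent-documents list above now imports Trash2 and toast for a per-document delete action (the button markup is garbled in this hunk), but no delete route is visible anywhere in this patch. The handler below is therefore a hypothetical sketch of the shape such an endpoint usually takes in a FastAPI service like this one — the path, stub helper, and response keys are all assumptions:

    import logging
    from fastapi import APIRouter

    logger = logging.getLogger(__name__)
    router = APIRouter(prefix="/documents", tags=["文档"])

    async def remove_document_from_store(doc_id: str) -> bool:
        """Stub standing in for the real persistence delete, which this patch does not show."""
        return True

    @router.delete("/{doc_id}")  # hypothetical route mirrored by the Dashboard's Trash2 button
    async def delete_document(doc_id: str):
        try:
            deleted = await remove_document_from_store(doc_id)
            return {"success": bool(deleted)}
        except Exception as e:
            logger.error(f"删除文档失败: {e}")
            return {"success": False, "error": str(e)}
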
{[ { title: '上传文档', desc: '支持 docx/md/txt', icon: FileText, link: '/documents', color: 'bg-blue-500' }, - { title: '解析 Excel', desc: '上传并分析数据', icon: FileSpreadsheet, link: '/excel-parse', color: 'bg-emerald-500' }, + { title: '解析 Excel', desc: '上传并分析数据', icon: FileSpreadsheet, link: '/documents', color: 'bg-emerald-500' }, { title: '智能填表', desc: '自动填写表格模板', icon: TableProperties, link: '/form-fill', color: 'bg-indigo-500' }, { title: 'AI 助手', desc: '自然语言交互', icon: MessageSquareCode, link: '/assistant', color: 'bg-amber-500' } ].map((item, i) => ( diff --git a/frontend/src/pages/Documents.tsx b/frontend/src/pages/Documents.tsx index aa666d9..79af9f5 100644 --- a/frontend/src/pages/Documents.tsx +++ b/frontend/src/pages/Documents.tsx @@ -78,6 +78,19 @@ const Documents: React.FC = () => { const [expandedSheet, setExpandedSheet] = useState(null); const [uploadExpanded, setUploadExpanded] = useState(false); + // 批量上传状态跟踪 + type FileUploadStatus = 'pending' | 'uploading' | 'processing' | 'success' | 'failed'; + interface UploadFileState { + file: File; + status: FileUploadStatus; + progress: number; + taskId?: string; + error?: string; + docId?: string; + } + const [uploadStates, setUploadStates] = useState([]); + const [batchTaskId, setBatchTaskId] = useState(null); + // AI 分析相关状态 const [analyzing, setAnalyzing] = useState(false); const [analyzingForCharts, setAnalyzingForCharts] = useState(false); @@ -211,21 +224,119 @@ const Documents: React.FC = () => { } }; - // 文件上传处理 + // 文件上传处理 - 批量上传 const onDrop = async (acceptedFiles: File[]) => { if (acceptedFiles.length === 0) return; + // 初始化上传状态 + const initialStates: UploadFileState[] = acceptedFiles.map(file => ({ + file, + status: 'pending', + progress: 0 + })); + setUploadStates(initialStates); + setUploadExpanded(true); setUploading(true); + + try { + // 使用批量上传接口 + const result = await backendApi.uploadDocuments(acceptedFiles); + + if (result.task_id) { + setBatchTaskId(result.task_id); + + // 更新所有文件状态为上传中 + setUploadStates(prev => prev.map(s => ({ ...s, status: 'uploading', progress: 30 }))); + + // 轮询任务状态 + let attempts = 0; + const maxAttempts = 150; // 最多5分钟 + + const checkBatchStatus = async () => { + while (attempts < maxAttempts) { + try { + const status = await backendApi.getTaskStatus(result.task_id); + + if (status.status === 'success' && status.result) { + // 更新每个文件的状态 + const fileResults = status.result.results || []; + setUploadStates(prev => prev.map((s, idx) => { + const fileResult = fileResults[idx]; + if (fileResult?.success) { + return { ...s, status: 'success', progress: 100, docId: fileResult.doc_id }; + } else { + return { ...s, status: 'failed', progress: 0, error: fileResult?.error || '处理失败' }; + } + })); + loadDocuments(); + return; + } else if (status.status === 'failure') { + setUploadStates(prev => prev.map(s => ({ + ...s, + status: 'failed', + error: status.error || '批量处理失败' + }))); + return; + } else { + // 处理中 - 更新进度 + const progress = status.progress || Math.min(30 + attempts * 2, 90); + setUploadStates(prev => prev.map(s => ({ + ...s, + status: s.status === 'uploading' ? 
'processing' : s.status, + progress + }))); + } + } catch (e) { + console.error('检查批量状态失败', e); + } + await new Promise(resolve => setTimeout(resolve, 2000)); + attempts++; + } + + // 超时 + setUploadStates(prev => prev.map(s => { + if (s.status !== 'success') { + return { ...s, status: 'failed', error: '处理超时' }; + } + return s; + })); + }; + + checkBatchStatus(); + } else { + // 单文件直接上传(旧逻辑作为后备) + await handleSingleFileUploads(acceptedFiles); + } + } catch (error: any) { + toast.error(error.message || '上传失败'); + setUploadStates(prev => prev.map(s => ({ + ...s, + status: 'failed', + error: error.message || '上传失败' + }))); + } finally { + setUploading(false); + } + }; + + // 单文件上传后备逻辑 + const handleSingleFileUploads = async (files: File[]) => { let successCount = 0; - let failCount = 0; const successfulFiles: File[] = []; - // 逐个上传文件 - for (const file of acceptedFiles) { + for (let i = 0; i < files.length; i++) { + const file = files[i]; const ext = file.name.split('.').pop()?.toLowerCase(); + setUploadStates(prev => prev.map((s, idx) => + idx === i ? { ...s, status: 'uploading' } : s + )); + try { if (ext === 'xlsx' || ext === 'xls') { + setUploadStates(prev => prev.map((s, idx) => + idx === i ? { ...s, status: 'processing', progress: 50 } : s + )); const result = await backendApi.uploadExcel(file, { parseAllSheets: parseOptions.parseAllSheets, headerRow: parseOptions.headerRow @@ -233,99 +344,60 @@ const Documents: React.FC = () => { if (result.success) { successCount++; successfulFiles.push(file); - // 第一个Excel文件设置解析结果供预览 + setUploadStates(prev => prev.map((s, idx) => + idx === i ? { ...s, status: 'success', progress: 100 } : s + )); if (successCount === 1) { setUploadedFile(file); setParseResult(result); - if (result.metadata?.sheet_count === 1) { - setExpandedSheet(Object.keys(result.data?.sheets || {})[0] || null); - } } loadDocuments(); } else { - failCount++; - toast.error(`${file.name}: ${result.error || '解析失败'}`); - } - } else if (ext === 'md' || ext === 'markdown') { - const result = await backendApi.uploadDocument(file); - if (result.task_id) { - successCount++; - successfulFiles.push(file); - if (successCount === 1) { - setUploadedFile(file); - } - // 轮询任务状态 - let attempts = 0; - const checkStatus = async () => { - while (attempts < 30) { - try { - const status = await backendApi.getTaskStatus(result.task_id); - if (status.status === 'success') { - loadDocuments(); - return; - } else if (status.status === 'failure') { - return; - } - } catch (e) { - console.error('检查状态失败', e); - } - await new Promise(resolve => setTimeout(resolve, 2000)); - attempts++; - } - }; - checkStatus(); - } else { - failCount++; + setUploadStates(prev => prev.map((s, idx) => + idx === i ? { ...s, status: 'failed', error: result.error || '解析失败' } : s + )); } } else { - // 其他文档使用通用上传接口 + setUploadStates(prev => prev.map((s, idx) => + idx === i ? 
{ ...s, status: 'processing', progress: 50 } : s + )); const result = await backendApi.uploadDocument(file); if (result.task_id) { - successCount++; - successfulFiles.push(file); - if (successCount === 1) { - setUploadedFile(file); - } - // 轮询任务状态 + // 等待任务完成 let attempts = 0; - const checkStatus = async () => { - while (attempts < 30) { - try { - const status = await backendApi.getTaskStatus(result.task_id); - if (status.status === 'success') { - loadDocuments(); - return; - } else if (status.status === 'failure') { - return; - } - } catch (e) { - console.error('检查状态失败', e); + while (attempts < 60) { + const status = await backendApi.getTaskStatus(result.task_id); + if (status.status === 'success') { + successCount++; + successfulFiles.push(file); + setUploadStates(prev => prev.map((s, idx) => + idx === i ? { ...s, status: 'success', progress: 100, docId: status.result?.doc_id } : s + )); + if (successCount === 1) { + setUploadedFile(file); } - await new Promise(resolve => setTimeout(resolve, 2000)); - attempts++; + loadDocuments(); + break; + } else if (status.status === 'failure') { + setUploadStates(prev => prev.map((s, idx) => + idx === i ? { ...s, status: 'failed', error: status.error || '处理失败' } : s + )); + break; } - }; - checkStatus(); - } else { - failCount++; + await new Promise(resolve => setTimeout(resolve, 2000)); + attempts++; + } } } } catch (error: any) { - failCount++; - toast.error(`${file.name}: ${error.message || '上传失败'}`); + setUploadStates(prev => prev.map((s, idx) => + idx === i ? { ...s, status: 'failed', error: error.message || '上传失败' } : s + )); } } - setUploading(false); - loadDocuments(); - if (successCount > 0) { - toast.success(`成功上传 ${successCount} 个文件`); setUploadedFiles(prev => [...prev, ...successfulFiles]); - setUploadExpanded(true); - } - if (failCount > 0) { - toast.error(`${failCount} 个文件上传失败`); } }; @@ -699,7 +771,110 @@ const Documents: React.FC = () => { {uploadPanelOpen && ( - {uploadedFiles.length > 0 || uploadedFile ? ( + {/* 优先显示正在上传的状态 */} + {uploadStates.length > 0 && ( +
+ {/* 上传状态头部 */} +
setUploadExpanded(!uploadExpanded)} + > +
+
+ {uploading ? : } +
+
+

+ {uploading ? '正在上传' : '上传完成'} {uploadStates.length} 个文件 +

+

+ {uploading ? '上传中,请稍候...' : uploadStates.filter(s => s.status === 'failed').length > 0 ? '部分失败' : '点击查看详情'} +

+
+
+
+ {!uploading && ( + + )} + {uploadExpanded ? : } +
+
+ + {/* 上传进度列表(展开时显示) */} + {uploadExpanded && ( +
+ {uploadStates.map((state, index) => ( +
+
+ {state.status === 'pending' && } + {state.status === 'uploading' && } + {state.status === 'processing' && } + {state.status === 'success' && } + {state.status === 'failed' && } +
+
+

{state.file.name}

+
+ {state.status === 'pending' &&

等待上传...

} + {state.status === 'uploading' &&

上传中...

} + {state.status === 'processing' &&

处理中...

} + {state.status === 'failed' && state.error && ( +

{state.error}

+ )} + {state.status === 'success' && ( +

已完成

+ )} +
+ {/* 进度条 */} + {(state.status === 'uploading' || state.status === 'processing') && ( +
+
+
+ )} +
+ {state.status === 'success' && ( + + )} + {state.status === 'failed' && ( + + )} +
+ ))} +
+ )} +
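
The checkBatchStatus loop above settles on one polling contract for every task in this page: hit getTaskStatus every 2 seconds, give up after 150 attempts (about 5 minutes), and treat 'success'/'failure' as terminal. The same loop as a backend-agnostic Python sketch — get_task_status is an injected coroutine assumed to return the {status, result, progress, error} shape used here:

    import asyncio
    from typing import Any, Awaitable, Callable, Dict

    async def poll_task(
        task_id: str,
        get_task_status: Callable[[str], Awaitable[Dict[str, Any]]],
        interval: float = 2.0,
        max_attempts: int = 150,  # 150 x 2 s = ~5 minutes, matching the frontend
    ) -> Dict[str, Any]:
        for _ in range(max_attempts):
            status = await get_task_status(task_id)
            if status.get("status") == "success":
                return status.get("result", {})
            if status.get("status") == "failure":
                raise RuntimeError(status.get("error", "批量处理失败"))
            await asyncio.sleep(interval)  # still pending/processing
        raise TimeoutError(f"task {task_id} did not finish within {max_attempts * interval:.0f}s")
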
+ )} + + {/* 已上传文件列表(没有正在上传时显示) */} + {uploadStates.length === 0 && (uploadedFiles.length > 0 || uploadedFile) ? (
{/* 文件列表头部 */}
{ {/* 展开的文件列表 */} {uploadExpanded && (
+ {/* 显示已上传文件列表 */} + {(uploadedFiles.length > 0 ? uploadedFiles : [uploadedFile]).filter(Boolean).map((file, index) => ( +
+
+ {isExcelFile(file?.name || '') ? : } +
+
+

{file?.name}

+

{formatFileSize(file?.size || 0)}

+
+ +
+ ))} + + {/* 继续添加按钮 */} +
e.stopPropagation()} + > + + + 继续添加更多文件 +
+
+ )} +
+ ) : (uploadedFiles.length > 0 || uploadedFile) ? ( +
+ {/* 文件列表头部 */} +
setUploadExpanded(!uploadExpanded)} + > +
+
+ +
+
+

+ 已上传 {(uploadedFiles.length > 0 ? uploadedFiles : [uploadedFile]).length} 个文件 +

+

+ {uploadExpanded ? '点击收起' : '点击展开查看'} +

+
+
+
+ + {uploadExpanded ? : } +
+
+ + {/* 展开的文件列表 */} + {uploadExpanded && ( +
+ {/* 显示已上传文件列表 */} {(uploadedFiles.length > 0 ? uploadedFiles : [uploadedFile]).filter(Boolean).map((file, index) => (
{ const [input, setInput] = useState(''); const [loading, setLoading] = useState(false); const [currentDocIds, setCurrentDocIds] = useState([]); + const [conversationId, setConversationId] = useState(''); const scrollAreaRef = useRef(null); + // 初始化会话ID + useEffect(() => { + const storedId = localStorage.getItem('chat_conversation_id'); + if (storedId) { + setConversationId(storedId); + } else { + const newId = `conv_${Date.now()}_${Math.random().toString(36).substring(7)}`; + setConversationId(newId); + localStorage.setItem('chat_conversation_id', newId); + } + }, []); + useEffect(() => { // Initial welcome message if (messages.length === 0) { @@ -119,7 +116,8 @@ const InstructionChat: React.FC = () => { // 使用真实的智能指令 API const response = await backendApi.instructionChat( input.trim(), - currentDocIds.length > 0 ? currentDocIds : undefined + currentDocIds.length > 0 ? currentDocIds : undefined, + { conversation_id: conversationId } ); // 根据意图类型生成友好响应 @@ -135,11 +133,12 @@ const InstructionChat: React.FC = () => { responseContent = `✅ 已提取到 ${keys.length} 个字段的数据:\n\n`; for (const [key, value] of Object.entries(extracted)) { const values = Array.isArray(value) ? value : [value]; - responseContent += `**${key}**: ${values.slice(0, 3).join(', ')}${values.length > 3 ? '...' : ''}\n`; + const displayValues = values.length > 10 ? values.slice(0, 10).join(', ') + ` ...(共${values.length}条)` : values.join(', '); + responseContent += `**${key}**: ${displayValues}\n`; } - responseContent += `\n💡 您可以将这些数据填入表格。`; + responseContent += `\n💡 可直接使用以上数据,或说"填入表格"继续填表操作。`; } else { - responseContent = '未能从文档中提取到相关数据。请尝试更明确的字段名称。'; + responseContent = resultData?.message || '未能从文档中提取到相关数据。请尝试更明确的字段名称。'; } break; @@ -151,24 +150,24 @@ const InstructionChat: React.FC = () => { responseContent = `✅ 填表完成!成功填写 ${filledKeys.length} 个字段:\n\n`; for (const [key, value] of Object.entries(filled)) { const values = Array.isArray(value) ? value : [value]; - responseContent += `**${key}**: ${values.slice(0, 3).join(', ')}\n`; + const displayValues = values.length > 10 ? values.slice(0, 10).join(', ') + ` ...(共${values.length}条)` : values.join(', '); + responseContent += `**${key}**: ${displayValues}\n`; } responseContent += `\n📋 请到【智能填表】页面查看或导出结果。`; } else { - responseContent = '填表未能提取到数据。请检查模板表头和数据源内容。'; + responseContent = resultData?.message || '填表未能提取到数据。请检查模板表头和数据源内容。'; } break; case 'summarize': // 摘要结果 - const summaries = resultData?.summaries || []; - if (summaries.length > 0) { - responseContent = `📄 找到 ${summaries.length} 个文档的摘要:\n\n`; - summaries.forEach((s: any, idx: number) => { - responseContent += `**${idx + 1}. 
${s.filename}**\n${s.content_preview}\n\n`; - }); + if (resultData?.action_needed === 'provide_document' || resultData?.action_needed === 'upload_document') { + responseContent = `📋 ${resultData.message}\n\n${resultData.suggestion || ''}`; + } else if (resultData?.ai_summary) { + // AI 生成的摘要 + responseContent = `📄 **${resultData.filename}** 摘要分析:\n\n${resultData.ai_summary}`; } else { - responseContent = '未能生成摘要。请确保已上传文档。'; + responseContent = resultData?.message || '未能生成摘要。请确保已上传文档。'; } break; @@ -176,8 +175,10 @@ const InstructionChat: React.FC = () => { // 问答结果 if (resultData?.answer) { responseContent = `**问题**: ${resultData.question}\n\n**答案**: ${resultData.answer}`; + } else if (resultData?.context_preview) { + responseContent = `**问题**: ${resultData.question}\n\n**相关上下文**:\n${resultData.context_preview}`; } else { - responseContent = resultData?.message || '我找到了相关信息,请查看上文。'; + responseContent = resultData?.message || '请先上传文档,我才能回答您的问题。'; } break; @@ -207,8 +208,35 @@ const InstructionChat: React.FC = () => { } break; + case 'edit': + // 文档编辑结果 + if (resultData?.edited_content) { + responseContent = `✏️ **${resultData.original_filename}** 编辑完成:\n\n${resultData.edited_content.substring(0, 500)}${resultData.edited_content.length > 500 ? '\n\n...(内容已截断)' : ''}`; + } else { + responseContent = resultData?.message || '编辑完成。'; + } + break; + + case 'transform': + // 格式转换结果 + if (resultData?.excel_data) { + responseContent = `🔄 格式转换完成!\n\n已转换为 **Excel** 格式,共 **${resultData.excel_data.length}** 行数据。\n\n${resultData.message || ''}`; + } else if (resultData?.content) { + responseContent = `🔄 格式转换完成!\n\n目标格式: **${resultData.target_format?.toUpperCase()}**\n\n${resultData.message || ''}`; + } else { + responseContent = resultData?.message || '格式转换完成。'; + } + break; + case 'unknown': - responseContent = `我理解您想要: "${input.trim()}"\n\n但我目前无法完成此操作。您可以尝试:\n\n1. **提取数据**: "提取医院数量和床位数"\n2. **填表**: "根据这些数据填表"\n3. **总结**: "总结这份文档"\n4. **问答**: "文档里说了什么?"\n5. **搜索**: "搜索相关内容"`; + // 检查是否需要用户上传文档 + if (resultData?.suggestion) { + responseContent = resultData.suggestion; + } else if (resultData?.message && resultData.message !== '无法理解该指令,请尝试更明确的描述') { + responseContent = resultData.message; + } else { + responseContent = `我理解您想要: "${input.trim()}"\n\n请尝试以下操作:\n\n1. **提取数据**: "提取医院数量和床位数"\n2. **填表**: "根据这些数据填表"\n3. **总结**: "总结这份文档"\n4. **问答**: "文档里说了什么?"\n5. **搜索**: "搜索相关内容"`; + } break; default: @@ -299,9 +327,11 @@ const InstructionChat: React.FC = () => { ? "bg-primary text-primary-foreground shadow-xl shadow-primary/20 rounded-tr-none" : "bg-white border border-border/50 shadow-md rounded-tl-none" )}> -

- {m.content} -

+ {m.role === 'assistant' ? ( + + ) : ( +

{m.content}

+ )} { if (!templateFile || !filledResult) return; try { + const ext = templateFile.name.split('.').pop()?.toLowerCase(); + const exportFormat = (ext === 'docx') ? 'docx' : 'xlsx'; + // 对于 Word 模板,如果已有填写后的文件(已填入表格单元格),传递其路径以便直接下载 + const filledFilePath = (ext === 'docx' && filledResult.filled_file_path) + ? filledResult.filled_file_path + : undefined; const blob = await backendApi.exportFilledTemplate( templateId || 'temp', filledResult.filled_data || {}, - 'xlsx' + exportFormat, + filledFilePath ); + const ext_match = templateFile.name.match(/\.([^.])+$/); + const baseName = ext_match ? templateFile.name.replace(ext_match[0], '') : templateFile.name; + const downloadName = `filled_${baseName}.${exportFormat}`; const url = URL.createObjectURL(blob); const a = document.createElement('a'); a.href = url; - a.download = `filled_${templateFile.name}`; + a.download = downloadName; a.click(); URL.revokeObjectURL(url); toast.success('导出成功'); @@ -546,7 +556,7 @@ const TemplateFill: React.FC = () => {
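
handleExport above now infers the export format from the template's extension and forwards filled_file_path whenever _fill_docx already produced a finished Word file. For that to work, /templates/export has to short-circuit to the pre-filled file; the endpoint body is not in this hunk, so the FastAPI sketch below is an assumption about its shape rather than the actual implementation:

    import os
    from typing import Optional
    from fastapi import APIRouter, HTTPException
    from fastapi.responses import FileResponse
    from pydantic import BaseModel

    router = APIRouter(prefix="/templates", tags=["模板"])

    class ExportRequest(BaseModel):
        template_id: str
        filled_data: dict
        format: str = "xlsx"
        filled_file_path: Optional[str] = None  # mirrors the TS payload above

    @router.post("/export")
    async def export_filled_template(req: ExportRequest):
        # Pre-filled Word file from _fill_docx: stream it back untouched.
        if req.filled_file_path and os.path.exists(req.filled_file_path):
            return FileResponse(req.filled_file_path, filename=f"filled_{req.template_id}.{req.format}")
        # Otherwise an xlsx/docx would be built from filled_data (omitted in this sketch).
        raise HTTPException(status_code=400, detail="无可导出的内容(示意代码)")
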

AI 正在智能分析并填表

- 系统正在从 {sourceFiles.length || sourceFilePaths.length} 份文档中检索相关信息... + 系统正在从 {sourceFiles.length || sourceFilePaths.length || sourceDocIds.length || 0} 份文档中检索相关信息...

@@ -562,7 +572,7 @@ const TemplateFill: React.FC = () => { 填表完成 - 系统已根据 {sourceFiles.length || sourceFilePaths.length} 份文档自动完成表格填写 + 系统已根据 {filledResult.source_doc_count || sourceFiles.length || sourceFilePaths.length || sourceDocIds.length} 份文档自动完成表格填写
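
One last reference point: the numeric roll-up that _polish_word_filled_data performs before its LLM pass (per-field sum/mean/max/min plus keyword-based field typing) reduces to the standalone sketch below. It is simplified from the template_fill_service.py hunk above, with the keyword handling abbreviated, and is not a drop-in replacement:

    import re

    def summarize_numeric_field(field_name: str, values: list) -> str:
        """Condense a multi-row field the way _polish_word_filled_data does before the LLM step."""
        nums = []
        for v in values:
            s = re.sub(r'[^\d.\-]', '', str(v))  # strip units/separators: "800个" -> "800", "1,200" -> "1200"
            try:
                if s and s not in ('-', '.'):
                    nums.append(float(s))
            except ValueError:
                pass
        if len(nums) >= 2 and len(nums) == len(values):
            total, avg = sum(nums), sum(nums) / len(nums)
            return (f"【{field_name}】(共 {len(values)} 条): 合计 {total:.2f}, 平均 {avg:.2f}, "
                    f"最大 {max(nums):.2f}, 最小 {min(nums):.2f}")
        # Non-numeric or mixed fields fall back to a short sample, as in the original.
        return f"【{field_name}】: {'、'.join(str(v) for v in values[:5])}"

    print(summarize_numeric_field("销售金额", ["1,200", "800个", "2,000"]))
    # 【销售金额】(共 3 条): 合计 4000.00, 平均 1333.33, 最大 2000.00, 最小 800.00
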