From 975ebf536bd43d274ce857b890da0cfac56d612b Mon Sep 17 00:00:00 2001 From: dj <431634905@qq.com> Date: Thu, 16 Apr 2026 23:08:21 +0800 Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E7=B3=BB=E7=BB=9F=E6=9E=B6?= =?UTF-8?q?=E6=9E=84=E5=9B=BE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/app/api/endpoints/ai_analyze.py | 83 +++++++-- backend/app/services/excel_ai_service.py | 171 ++++++++++++++++++ backend/app/services/llm_service.py | 4 +- backend/app/services/txt_ai_service.py | 5 +- backend/app/services/visualization_service.py | 14 +- backend/app/services/word_ai_service.py | 13 +- frontend/src/db/backend-api.ts | 18 +- frontend/src/pages/Documents.tsx | 88 ++++++--- 8 files changed, 339 insertions(+), 57 deletions(-) diff --git a/backend/app/api/endpoints/ai_analyze.py b/backend/app/api/endpoints/ai_analyze.py index 36dedfe..d2d58ca 100644 --- a/backend/app/api/endpoints/ai_analyze.py +++ b/backend/app/api/endpoints/ai_analyze.py @@ -1,7 +1,7 @@ """ AI 分析 API 接口 """ -from fastapi import APIRouter, UploadFile, File, HTTPException, Query, Body +from fastapi import APIRouter, UploadFile, File, HTTPException, Query, Body, Form from fastapi.responses import StreamingResponse from typing import Optional import logging @@ -21,7 +21,8 @@ router = APIRouter(prefix="/ai", tags=["AI 分析"]) @router.post("/analyze/excel") async def analyze_excel( - file: UploadFile = File(...), + file: Optional[UploadFile] = File(None), + doc_id: Optional[str] = Form(None, description="文档ID(从数据库读取)"), user_prompt: str = Query("", description="用户自定义提示词"), analysis_type: str = Query("general", description="分析类型: general, summary, statistics, insights"), parse_all_sheets: bool = Query(False, description="是否分析所有工作表") @@ -30,7 +31,8 @@ async def analyze_excel( 上传并使用 AI 分析 Excel 文件 Args: - file: 上传的 Excel 文件 + file: 上传的 Excel 文件(与 doc_id 二选一) + doc_id: 文档ID(从数据库读取) user_prompt: 用户自定义提示词 analysis_type: 分析类型 parse_all_sheets: 是否分析所有工作表 @@ -38,7 +40,57 @@ async def analyze_excel( Returns: dict: 分析结果,包含 Excel 数据和 AI 分析结果 """ - # 检查文件类型 + filename = None + + # 从数据库读取模式 + if doc_id: + try: + from app.core.database.mongodb import mongodb + doc = await mongodb.get_document(doc_id) + if not doc: + raise HTTPException(status_code=404, detail=f"文档不存在: {doc_id}") + + filename = doc.get("metadata", {}).get("original_filename", "unknown.xlsx") + file_ext = filename.split('.')[-1].lower() + + if file_ext not in ['xlsx', 'xls']: + raise HTTPException(status_code=400, detail=f"文档类型不是 Excel: {file_ext}") + + file_path = doc.get("metadata", {}).get("file_path") + if not file_path: + raise HTTPException(status_code=400, detail="文档没有存储文件路径,请重新上传") + + # 使用文件路径进行 AI 分析 + if parse_all_sheets: + result = await excel_ai_service.batch_analyze_sheets_from_path( + file_path=file_path, + filename=filename, + user_prompt=user_prompt, + analysis_type=analysis_type + ) + else: + result = await excel_ai_service.analyze_excel_file_from_path( + file_path=file_path, + filename=filename, + user_prompt=user_prompt, + analysis_type=analysis_type + ) + + if result.get("success"): + return result + else: + return result + + except HTTPException: + raise + except Exception as e: + logger.error(f"从数据库读取 Excel 文档失败: {str(e)}") + raise HTTPException(status_code=500, detail=f"读取文档失败: {str(e)}") + + # 文件上传模式 + if not file: + raise HTTPException(status_code=400, detail="请提供文件或文档ID") + if not file.filename: raise HTTPException(status_code=400, detail="文件名为空") @@ -61,7 +113,11 @@ async def analyze_excel( # 读取文件内容 content = await file.read() - logger.info(f"开始分析文件: {file.filename}, 分析类型: {analysis_type}") + # 验证文件内容不为空 + if not content: + raise HTTPException(status_code=400, detail="文件内容为空,请确保文件已正确上传") + + logger.info(f"开始分析文件: {file.filename}, 分析类型: {analysis_type}, 文件大小: {len(content)} bytes") # 调用 AI 分析服务 if parse_all_sheets: @@ -155,7 +211,7 @@ async def analyze_text( @router.post("/analyze/md") async def analyze_markdown( file: Optional[UploadFile] = File(None), - doc_id: Optional[str] = Query(None, description="文档ID(从数据库读取)"), + doc_id: Optional[str] = Form(None, description="文档ID(从数据库读取)"), analysis_type: str = Query("summary", description="分析类型: summary, outline, key_points, questions, tags, qa, statistics, section, charts"), user_prompt: str = Query("", description="用户自定义提示词"), section_number: Optional[str] = Query(None, description="指定章节编号,如 '一' 或 '(一)'") @@ -198,7 +254,7 @@ async def analyze_markdown( if file_ext not in ['md', 'markdown']: raise HTTPException(status_code=400, detail=f"文档类型不是 Markdown: {file_ext}") - content = doc.get("content", "") + content = doc.get("content") or "" if not content: raise HTTPException(status_code=400, detail="文档内容为空") @@ -392,7 +448,7 @@ async def get_markdown_outline( @router.post("/analyze/txt") async def analyze_txt( file: Optional[UploadFile] = File(None), - doc_id: Optional[str] = Query(None, description="文档ID(从数据库读取)"), + doc_id: Optional[str] = Form(None, description="文档ID(从数据库读取)"), analysis_type: str = Query("structured", description="分析类型: structured, charts") ): """ @@ -427,7 +483,7 @@ async def analyze_txt( raise HTTPException(status_code=400, detail=f"文档类型不是 TXT: {file_ext}") # 使用数据库中的 content - text_content = doc.get("content", "") + text_content = doc.get("content") or "" if not text_content: raise HTTPException(status_code=400, detail="文档内容为空") @@ -498,8 +554,8 @@ async def analyze_txt( @router.post("/analyze/word") async def analyze_word( file: Optional[UploadFile] = File(None), - doc_id: Optional[str] = Query(None, description="文档ID(从数据库读取)"), - user_hint: str = Query("", description="用户提示词,如'请提取表格数据'"), + doc_id: Optional[str] = Form(None, description="文档ID(从数据库读取)"), + user_hint: str = Form("", description="用户提示词,如'请提取表格数据'"), analysis_type: str = Query("structured", description="分析类型: structured, charts") ): """ @@ -536,8 +592,9 @@ async def analyze_word( raise HTTPException(status_code=400, detail=f"文档类型不是 Word: {file_ext}") # 使用数据库中的 content 进行分析 - content = doc.get("content", "") - tables = doc.get("structured_data", {}).get("tables", []) + content = doc.get("content", "") or "" + structured_data = doc.get("structured_data") or {} + tables = structured_data.get("tables", []) # 调用 AI 分析服务,传入数据库内容 if analysis_type == "charts": diff --git a/backend/app/services/excel_ai_service.py b/backend/app/services/excel_ai_service.py index 8f8f36d..480041b 100644 --- a/backend/app/services/excel_ai_service.py +++ b/backend/app/services/excel_ai_service.py @@ -223,6 +223,177 @@ class ExcelAIService: } } + async def analyze_excel_file_from_path( + self, + file_path: str, + filename: str, + user_prompt: str = "", + analysis_type: str = "general", + parse_options: Optional[Dict[str, Any]] = None + ) -> Dict[str, Any]: + """ + 从文件路径分析 Excel 文件(用于从数据库加载的文档) + + Args: + file_path: Excel 文件路径 + filename: 文件名 + user_prompt: 用户自定义提示词 + analysis_type: 分析类型 + parse_options: 解析选项 + + Returns: + Dict[str, Any]: 分析结果 + """ + # 1. 解析 Excel 文件 + excel_data = None + parse_result_metadata = None + try: + parse_options = parse_options or {} + parse_result = self.parser.parse(file_path, **parse_options) + + if not parse_result.success: + return { + "success": False, + "error": parse_result.error, + "analysis": None + } + + excel_data = parse_result.data + parse_result_metadata = parse_result.metadata + logger.info(f"Excel 解析成功: {parse_result_metadata}") + + except Exception as e: + logger.error(f"Excel 解析失败: {str(e)}") + return { + "success": False, + "error": f"Excel 解析失败: {str(e)}", + "analysis": None + } + + # 2. 调用 LLM 进行分析 + try: + if user_prompt and user_prompt.strip(): + llm_result = await self.llm_service.analyze_with_template( + excel_data, + user_prompt + ) + else: + llm_result = await self.llm_service.analyze_excel_data( + excel_data, + user_prompt, + analysis_type + ) + + logger.info(f"AI 分析完成: {llm_result['success']}") + + return { + "success": True, + "excel": { + "data": excel_data, + "metadata": parse_result_metadata, + "saved_path": file_path + }, + "analysis": llm_result + } + + except Exception as e: + logger.error(f"AI 分析失败: {str(e)}") + return { + "success": False, + "error": f"AI 分析失败: {str(e)}", + "excel": { + "data": excel_data, + "metadata": parse_result_metadata + }, + "analysis": None + } + + async def batch_analyze_sheets_from_path( + self, + file_path: str, + filename: str, + user_prompt: str = "", + analysis_type: str = "general" + ) -> Dict[str, Any]: + """ + 从文件路径批量分析 Excel 文件的所有工作表(用于从数据库加载的文档) + + Args: + file_path: Excel 文件路径 + filename: 文件名 + user_prompt: 用户自定义提示词 + analysis_type: 分析类型 + + Returns: + Dict[str, Any]: 分析结果 + """ + # 1. 解析所有工作表 + try: + parse_result = self.parser.parse_all_sheets(file_path) + + if not parse_result.success: + return { + "success": False, + "error": parse_result.error, + "analysis": None + } + + sheets_data = parse_result.data.get("sheets", {}) + logger.info(f"Excel 解析成功,共 {len(sheets_data)} 个工作表") + + except Exception as e: + logger.error(f"Excel 解析失败: {str(e)}") + return { + "success": False, + "error": f"Excel 解析失败: {str(e)}", + "analysis": None + } + + # 2. 批量分析每个工作表 + sheet_analyses = {} + errors = {} + + for sheet_name, sheet_data in sheets_data.items(): + try: + if user_prompt and user_prompt.strip(): + llm_result = await self.llm_service.analyze_with_template( + sheet_data, + user_prompt + ) + else: + llm_result = await self.llm_service.analyze_excel_data( + sheet_data, + user_prompt, + analysis_type + ) + + sheet_analyses[sheet_name] = llm_result + + if not llm_result["success"]: + errors[sheet_name] = llm_result.get("error", "未知错误") + + logger.info(f"工作表 '{sheet_name}' 分析完成") + + except Exception as e: + logger.error(f"工作表 '{sheet_name}' 分析失败: {str(e)}") + errors[sheet_name] = str(e) + + # 3. 组合结果 + return { + "success": len(errors) == 0, + "excel": { + "sheets": sheets_data, + "metadata": parse_result.metadata, + "saved_path": file_path + }, + "analysis": { + "sheets": sheet_analyses, + "total_sheets": len(sheets_data), + "successful": len(sheet_analyses) - len(errors), + "errors": errors + } + } + def get_supported_analysis_types(self) -> List[str]: """获取支持的分析类型""" return [ diff --git a/backend/app/services/llm_service.py b/backend/app/services/llm_service.py index c0a5dd9..11cddf9 100644 --- a/backend/app/services/llm_service.py +++ b/backend/app/services/llm_service.py @@ -58,7 +58,7 @@ class LLMService: _start_time = time.time() logger.info(f"🤖 [LLM] 正在调用 DeepSeek API... 模型: {self.model_name}") try: - async with httpx.AsyncClient(timeout=60.0) as client: + async with httpx.AsyncClient(timeout=120.0) as client: response = await client.post( f"{self.base_url}/chat/completions", headers=headers, @@ -84,7 +84,7 @@ class LLMService: pass raise except Exception as e: - logger.error(f"LLM API 调用异常: {str(e)}") + logger.error(f"LLM API 调用异常: {repr(e)} - {str(e)}") raise def extract_message_content(self, response: Dict[str, Any]) -> str: diff --git a/backend/app/services/txt_ai_service.py b/backend/app/services/txt_ai_service.py index 740121d..397907d 100644 --- a/backend/app/services/txt_ai_service.py +++ b/backend/app/services/txt_ai_service.py @@ -19,6 +19,7 @@ class TxtAIService: def __init__(self): self.parser = TxtParser() + self.llm = llm_service async def analyze_txt_with_ai( self, @@ -114,7 +115,7 @@ class TxtAIService: response = await self.llm.chat( messages=messages, temperature=0.1, - max_tokens=50000 + max_tokens=8000 ) content_text = self.llm.extract_message_content(response) @@ -220,7 +221,7 @@ class TxtAIService: response = await self.llm.chat( messages=messages, temperature=0.1, - max_tokens=50000 + max_tokens=8000 ) content_text = self.llm.extract_message_content(response) diff --git a/backend/app/services/visualization_service.py b/backend/app/services/visualization_service.py index 6cacae7..5b43b73 100644 --- a/backend/app/services/visualization_service.py +++ b/backend/app/services/visualization_service.py @@ -53,7 +53,11 @@ class VisualizationService: } # 转换为 DataFrame - df = pd.DataFrame(rows, columns=columns) + # 过滤掉行数与列数不匹配的数据 + valid_rows = [row for row in rows if len(row) == len(columns)] + if len(valid_rows) < len(rows): + logger.warning(f"过滤了 {len(rows) - len(valid_rows)} 行无效数据(列数不匹配)") + df = pd.DataFrame(valid_rows, columns=columns) # 根据列类型分类 numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist() @@ -141,18 +145,18 @@ class VisualizationService: charts = {} # 1. 数值型列的直方图 - charts["histograms"] = [] + charts["numeric_charts"] = [] for col in numeric_columns[:5]: # 限制最多 5 个数值列 chart_data = self._create_histogram(df[col], col) if chart_data: - charts["histograms"].append(chart_data) + charts["numeric_charts"].append(chart_data) # 2. 分类型列的条形图 - charts["bar_charts"] = [] + charts["categorical_charts"] = [] for col in categorical_columns[:5]: # 限制最多 5 个分类型列 chart_data = self._create_bar_chart(df[col], col) if chart_data: - charts["bar_charts"].append(chart_data) + charts["categorical_charts"].append(chart_data) # 3. 数值型列的箱线图 charts["box_plots"] = [] diff --git a/backend/app/services/word_ai_service.py b/backend/app/services/word_ai_service.py index a38d70d..e3278db 100644 --- a/backend/app/services/word_ai_service.py +++ b/backend/app/services/word_ai_service.py @@ -184,7 +184,7 @@ class WordAIService: response = await self.llm.chat( messages=messages, temperature=0.1, - max_tokens=50000 + max_tokens=8000 ) content = self.llm.extract_message_content(response) @@ -276,7 +276,7 @@ class WordAIService: response = await self.llm.chat( messages=messages, temperature=0.1, - max_tokens=50000 + max_tokens=8000 ) content = self.llm.extract_message_content(response) @@ -849,10 +849,12 @@ class WordAIService: # 提取可用于图表的数据 chart_data = None + logger.info(f"准备提取图表数据,structured_data type: {structured_data.get('type')}, keys: {list(structured_data.keys())}") if structured_data.get("type") == "table_data": headers = structured_data.get("headers", []) rows = structured_data.get("rows", []) + logger.info(f"table_data类型: headers数量={len(headers)}, rows数量={len(rows)}") if headers and rows: chart_data = { "columns": headers, @@ -860,15 +862,19 @@ class WordAIService: } elif structured_data.get("type") == "structured_text": tables_data = structured_data.get("tables", []) + logger.info(f"structured_text类型: tables数量={len(tables_data)}") if tables_data and len(tables_data) > 0: first_table = tables_data[0] headers = first_table.get("headers", []) rows = first_table.get("rows", []) + logger.info(f"第一个表格: headers={headers[:5]}, rows数量={len(rows)}") if headers and rows: chart_data = { "columns": headers, "rows": rows } + else: + logger.warning(f"无法识别的structured_data类型: {structured_data.get('type')}") # 生成可视化图表 if chart_data: @@ -904,3 +910,6 @@ class WordAIService: "success": False, "error": str(e) } + + +word_ai_service = WordAIService() diff --git a/frontend/src/db/backend-api.ts b/frontend/src/db/backend-api.ts index bb6c162..b5ecdcb 100644 --- a/frontend/src/db/backend-api.ts +++ b/frontend/src/db/backend-api.ts @@ -1187,11 +1187,19 @@ export const aiApi = { * 上传并使用 AI 分析 Excel 文件 */ async analyzeExcel( - file: File, - options: AIAnalyzeOptions = {} + file: File | null, + options: AIAnalyzeOptions = {}, + docId: string | null = null ): Promise { const formData = new FormData(); - formData.append('file', file); + + if (docId) { + formData.append('doc_id', docId); + } else if (file) { + formData.append('file', file); + } else { + throw new Error('必须提供文件或文档ID'); + } const params = new URLSearchParams(); if (options.userPrompt) { @@ -1268,7 +1276,9 @@ export const aiApi = { try { const response = await fetch(url); if (!response.ok) throw new Error('获取分析类型失败'); - return await response.json(); + const data = await response.json(); + // 转换后端返回格式 {excel_types: [], markdown_types: []} 为前端期望的 {types: []} + return { types: data.excel_types || [] }; } catch (error) { console.error('获取分析类型失败:', error); throw error; diff --git a/frontend/src/pages/Documents.tsx b/frontend/src/pages/Documents.tsx index a71b113..003cd87 100644 --- a/frontend/src/pages/Documents.tsx +++ b/frontend/src/pages/Documents.tsx @@ -472,11 +472,17 @@ const Documents: React.FC = () => { setAnalysisCharts(null); try { - const result = await aiApi.analyzeExcel(uploadedFile, { - userPrompt: aiOptions.userPrompt, - analysisType: aiOptions.analysisType, - parseAllSheets: aiOptions.parseAllSheetsForAI - }); + // 判断是从历史文档还是本地上传 + const docId = selectedDocument?.doc_id && uploadedFile.size === 0 ? selectedDocument.doc_id : null; + const result = await aiApi.analyzeExcel( + uploadedFile.size > 0 ? uploadedFile : null, + { + userPrompt: aiOptions.userPrompt, + analysisType: aiOptions.analysisType, + parseAllSheets: aiOptions.parseAllSheetsForAI + }, + docId + ); if (result.success) { toast.success('AI 分析完成'); @@ -706,6 +712,12 @@ const Documents: React.FC = () => { const handleSelectDocument = async (docId: string) => { setLoadingDocument(true); + // 重置所有 AI 分析结果,避免显示上一个文档的分析 + setAiAnalysis(null); + setAnalysisCharts(null); + setMdAnalysis(null); + setWordAnalysis(null); + setTxtAnalysis(null); try { const result = await backendApi.getDocument(docId); if (result.success && result.document) { @@ -2264,39 +2276,57 @@ const Documents: React.FC = () => { ); }; -// 数据表格组件 +// 数据表格组件 - 滑动窗口样式 const DataTable: React.FC<{ columns: string[]; rows: Record[] }> = ({ columns, rows }) => { if (!columns.length || !rows.length) { return
暂无数据
; } + const displayRows = rows.slice(0, 500); // 限制最多显示500行 + return ( -
- - - - # - {columns.map((col, idx) => ( - {col || `<列${idx + 1}>`} - ))} - - - - {rows.slice(0, 100).map((row, rowIdx) => ( - - {rowIdx + 1} - {columns.map((col, colIdx) => ( - - {row[col] !== null && row[col] !== undefined ? String(row[col]) : '-'} - +
+ {/* 表头 - 固定 */} +
+ + + + # + {columns.map((col, idx) => ( + {col || `<列${idx + 1}>`} ))} - ))} - - - {rows.length > 100 && ( + + +
+ {/* 表体 - 可滚动 */} +
+ + + {displayRows.map((row, rowIdx) => ( + + {rowIdx + 1} + {columns.map((col, colIdx) => ( + + {row[col] !== null && row[col] !== undefined ? String(row[col]) : '-'} + + ))} + + ))} + + +
+ {rows.length > 500 && (
- 仅显示前 100 行数据 + 仅显示前 500 行数据(共 {rows.length} 行) +
+ )} + {rows.length > 100 && rows.length <= 500 && ( +
+ 共 {rows.length} 行数据,向下滚动查看更多
)}