From 8e46e635f1a90770a020a9c01a3fa47d05839f32 Mon Sep 17 00:00:00 2001 From: dj <431634905@qq.com> Date: Thu, 16 Apr 2026 19:59:56 +0800 Subject: [PATCH] =?UTF-8?q?rag=E6=97=A5=E5=BF=97=E6=94=B9=E4=B8=BAinfo?= =?UTF-8?q?=E7=BA=A7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/app/core/document_parser/docx_parser.py | 10 +++++++--- backend/app/services/llm_service.py | 14 +++++++++++++- backend/app/services/rag_service.py | 2 +- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/backend/app/core/document_parser/docx_parser.py b/backend/app/core/document_parser/docx_parser.py index e2bcb0e..24ff54a 100644 --- a/backend/app/core/document_parser/docx_parser.py +++ b/backend/app/core/document_parser/docx_parser.py @@ -91,11 +91,15 @@ class DocxParser(BaseParser): table_rows.append(row_data) if table_rows: + # 第一行作为表头,其余行作为数据 + headers = table_rows[0] if table_rows else [] + data_rows = table_rows[1:] if len(table_rows) > 1 else [] tables_data.append({ "table_index": i, - "rows": table_rows, - "row_count": len(table_rows), - "column_count": len(table_rows[0]) if table_rows else 0 + "headers": headers, # 添加 headers 字段 + "rows": data_rows, # 数据行(不含表头) + "row_count": len(data_rows), + "column_count": len(headers) if headers else 0 }) # 提取图片/嵌入式对象信息 diff --git a/backend/app/services/llm_service.py b/backend/app/services/llm_service.py index fac51e3..c0a5dd9 100644 --- a/backend/app/services/llm_service.py +++ b/backend/app/services/llm_service.py @@ -54,6 +54,9 @@ class LLMService: # 添加其他参数 payload.update(kwargs) + import time + _start_time = time.time() + logger.info(f"🤖 [LLM] 正在调用 DeepSeek API... | 模型: {self.model_name}") try: async with httpx.AsyncClient(timeout=60.0) as client: response = await client.post( @@ -62,7 +65,10 @@ json=payload ) response.raise_for_status() - return response.json() + result = response.json() + _elapsed = time.time() - _start_time + logger.info(f"✅ [LLM] DeepSeek API 响应成功 | 模型: {self.model_name} | 耗时: {_elapsed:.2f}s | Token: {result.get('usage', {}).get('total_tokens', 'N/A')}") + return result except httpx.HTTPStatusError as e: error_detail = e.response.text @@ -133,6 +139,9 @@ payload.update(kwargs) + import time + _start_time = time.time() + logger.info(f"🤖 [LLM] 正在调用 DeepSeek API (流式) | 模型: {self.model_name}") try: async with httpx.AsyncClient(timeout=120.0) as client: async with client.stream( @@ -141,10 +150,13 @@ headers=headers, json=payload ) as response: + _elapsed = time.time() - _start_time + logger.info(f"✅ [LLM] DeepSeek API 流式响应开始 | 模型: {self.model_name} | 耗时: {_elapsed:.2f}s") async for line in response.aiter_lines(): if line.startswith("data: "): data = line[6:] # Remove "data: " prefix if data == "[DONE]": + logger.info(f"✅ [LLM] DeepSeek API 流式响应完成") break try: import json as json_module diff --git a/backend/app/services/rag_service.py b/backend/app/services/rag_service.py index 230800c..866b7ff 100644 --- a/backend/app/services/rag_service.py +++ b/backend/app/services/rag_service.py @@ -669,7 +669,7 @@ class RAGService: # 按融合分数降序排序 fused_results.sort(key=lambda x: x["score"], reverse=True) - logger.debug(f"混合融合: {len(fused_results)} 个文档, 向量:{len(vector_results)}, BM25:{len(bm25_results)}") + logger.info(f"RRF 混合融合: {len(fused_results)} 个文档参与融合, 向量检索命中:{len(vector_results)}, BM25命中:{len(bm25_results)}") return fused_results[:top_k]