修复智能填表功能：支持直接从结构化数据提取列值并完善JSON解析

- 新增 _extract_values_from_structured_data 方法，直接从Excel rows提取列值
- 新增 _extract_values_by_regex 方法，使用正则从损坏的JSON中提取值
- 增大 max_tokens (500→50000) 和 max_length (8000→200000) 限制
- 改进JSON解析逻辑，处理markdown代码块包裹和不完整JSON
- 解决LLM返回被截断的JSON无法正确解析的问题
2026-04-09 17:40:10 +08:00
parent b2ebd3e12d
commit 2c2ab56d2d
9 changed files with 757 additions and 76 deletions
--- a/backend/app/api/endpoints/upload.py
+++ b/backend/app/api/endpoints/upload.py
@@ -11,6 +11,7 @@ import io
 from app.services.file_service import file_service
 from app.core.document_parser import XlsxParser
 from app.services.table_rag_service import table_rag_service
+from app.core.database import mongodb

 logger = logging.getLogger(__name__)

@@ -95,6 +96,56 @@ async def upload_excel(
        except Exception as e:
            logger.error(f"Excel存储到MySQL异常: {str(e)}", exc_info=True)

+        # 存储到 MongoDB（用于文档列表展示）
+        try:
+            content = ""
+            # 构建文本内容用于展示
+            if result.data:
+                if isinstance(result.data, dict):
+                    # 单 sheet 格式: {columns, rows, ...}
+                    if 'columns' in result.data and 'rows' in result.data:
+                        content += f"Sheet: {result.metadata.get('current_sheet', 'Sheet1') if result.metadata else 'Sheet1'}\n"
+                        content += ", ".join(str(h) for h in result.data['columns']) + "\n"
+                        for row in result.data['rows'][:100]:
+                            if isinstance(row, dict):
+                                content += ", ".join(str(row.get(col, "")) for col in result.data['columns']) + "\n"
+                            elif isinstance(row, list):
+                                content += ", ".join(str(cell) for cell in row) + "\n"
+                        content += f"... (共 {len(result.data['rows'])} 行)\n\n"
+                    # 多 sheet 格式: {sheets: {sheet_name: {columns, rows}}}
+                    elif 'sheets' in result.data:
+                        for sheet_name_key, sheet_data in result.data['sheets'].items():
+                            if isinstance(sheet_data, dict) and 'columns' in sheet_data and 'rows' in sheet_data:
+                                content += f"Sheet: {sheet_name_key}\n"
+                                content += ", ".join(str(h) for h in sheet_data['columns']) + "\n"
+                                for row in sheet_data['rows'][:100]:
+                                    if isinstance(row, dict):
+                                        content += ", ".join(str(row.get(col, "")) for col in sheet_data['columns']) + "\n"
+                                    elif isinstance(row, list):
+                                        content += ", ".join(str(cell) for cell in row) + "\n"
+                                content += f"... (共 {len(sheet_data['rows'])} 行)\n\n"
+
+            doc_metadata = {
+                "filename": saved_path.split("/")[-1] if "/" in saved_path else saved_path.split("\\")[-1],
+                "original_filename": file.filename,
+                "saved_path": saved_path,
+                "file_size": len(content),
+                "row_count": result.metadata.get('row_count', 0) if result.metadata else 0,
+                "column_count": result.metadata.get('column_count', 0) if result.metadata else 0,
+                "columns": result.metadata.get('columns', []) if result.metadata else [],
+                "mysql_table": result.metadata.get('mysql_table') if result.metadata else None,
+                "sheet_count": result.metadata.get('sheet_count', 1) if result.metadata else 1,
+            }
+            await mongodb.insert_document(
+                doc_type="xlsx",
+                content=content,
+                metadata=doc_metadata,
+                structured_data=result.data if result.data else None
+            )
+            logger.info(f"Excel文档已存储到MongoDB: {file.filename}, content长度: {len(content)}")
+        except Exception as e:
+            logger.error(f"Excel存储到MongoDB异常: {str(e)}", exc_info=True)
+
        return result.to_dict()

    except HTTPException: