修复智能填表功能:支持直接从结构化数据提取列值并完善JSON解析

- 新增 _extract_values_from_structured_data 方法,直接从Excel rows提取列值
- 新增 _extract_values_by_regex 方法,使用正则从损坏的JSON中提取值
- 增大 max_tokens (500→50000) 和 max_length (8000→200000) 限制
- 改进 JSON 解析逻辑,处理被 Markdown 代码块包裹的 JSON 以及不完整的 JSON
- 解决 LLM 返回的 JSON 被截断时无法正确解析的问题
This commit is contained in:
dj
2026-04-09 17:40:10 +08:00
parent b2ebd3e12d
commit 2c2ab56d2d
9 changed files with 757 additions and 76 deletions

View File

@@ -11,6 +11,7 @@ import io
from app.services.file_service import file_service
from app.core.document_parser import XlsxParser
from app.services.table_rag_service import table_rag_service
from app.core.database import mongodb
logger = logging.getLogger(__name__)
@@ -95,6 +96,56 @@ async def upload_excel(
except Exception as e:
logger.error(f"Excel存储到MySQL异常: {str(e)}", exc_info=True)
# 存储到 MongoDB用于文档列表展示
try:
content = ""
# 构建文本内容用于展示
if result.data:
if isinstance(result.data, dict):
# 单 sheet 格式: {columns, rows, ...}
if 'columns' in result.data and 'rows' in result.data:
content += f"Sheet: {result.metadata.get('current_sheet', 'Sheet1') if result.metadata else 'Sheet1'}\n"
content += ", ".join(str(h) for h in result.data['columns']) + "\n"
for row in result.data['rows'][:100]:
if isinstance(row, dict):
content += ", ".join(str(row.get(col, "")) for col in result.data['columns']) + "\n"
elif isinstance(row, list):
content += ", ".join(str(cell) for cell in row) + "\n"
content += f"... (共 {len(result.data['rows'])} 行)\n\n"
# 多 sheet 格式: {sheets: {sheet_name: {columns, rows}}}
elif 'sheets' in result.data:
for sheet_name_key, sheet_data in result.data['sheets'].items():
if isinstance(sheet_data, dict) and 'columns' in sheet_data and 'rows' in sheet_data:
content += f"Sheet: {sheet_name_key}\n"
content += ", ".join(str(h) for h in sheet_data['columns']) + "\n"
for row in sheet_data['rows'][:100]:
if isinstance(row, dict):
content += ", ".join(str(row.get(col, "")) for col in sheet_data['columns']) + "\n"
elif isinstance(row, list):
content += ", ".join(str(cell) for cell in row) + "\n"
content += f"... (共 {len(sheet_data['rows'])} 行)\n\n"
doc_metadata = {
"filename": saved_path.split("/")[-1] if "/" in saved_path else saved_path.split("\\")[-1],
"original_filename": file.filename,
"saved_path": saved_path,
"file_size": len(content),
"row_count": result.metadata.get('row_count', 0) if result.metadata else 0,
"column_count": result.metadata.get('column_count', 0) if result.metadata else 0,
"columns": result.metadata.get('columns', []) if result.metadata else [],
"mysql_table": result.metadata.get('mysql_table') if result.metadata else None,
"sheet_count": result.metadata.get('sheet_count', 1) if result.metadata else 1,
}
await mongodb.insert_document(
doc_type="xlsx",
content=content,
metadata=doc_metadata,
structured_data=result.data if result.data else None
)
logger.info(f"Excel文档已存储到MongoDB: {file.filename}, content长度: {len(content)}")
except Exception as e:
logger.error(f"Excel存储到MongoDB异常: {str(e)}", exc_info=True)
return result.to_dict()
except HTTPException: