djh
This commit is contained in:
@@ -3,6 +3,7 @@
|
||||
|
||||
提供文档列表、详情查询和删除功能
|
||||
"""
|
||||
import logging
|
||||
from typing import Optional, List
|
||||
|
||||
from fastapi import APIRouter, HTTPException, Query
|
||||
@@ -10,6 +11,8 @@ from pydantic import BaseModel
|
||||
|
||||
from app.core.database import mongodb
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
router = APIRouter(prefix="/documents", tags=["文档库"])
|
||||
|
||||
|
||||
@@ -26,7 +29,8 @@ class DocumentItem(BaseModel):
|
||||
@router.get("")
|
||||
async def get_documents(
|
||||
doc_type: Optional[str] = Query(None, description="文档类型过滤"),
|
||||
limit: int = Query(50, ge=1, le=100, description="返回数量")
|
||||
limit: int = Query(20, ge=1, le=100, description="返回数量"),
|
||||
skip: int = Query(0, ge=0, description="跳过数量")
|
||||
):
|
||||
"""
|
||||
获取文档列表
|
||||
@@ -40,11 +44,25 @@ async def get_documents(
|
||||
if doc_type:
|
||||
query["doc_type"] = doc_type
|
||||
|
||||
# 查询文档
|
||||
cursor = mongodb.documents.find(query).sort("created_at", -1).limit(limit)
|
||||
logger.info(f"开始查询文档列表, query: {query}, limit: {limit}")
|
||||
|
||||
# 使用 batch_size 和 max_time_ms 来控制查询
|
||||
cursor = mongodb.documents.find(
|
||||
query,
|
||||
{"content": 0} # 不返回 content 字段,减少数据传输
|
||||
).sort("created_at", -1).skip(skip).limit(limit)
|
||||
|
||||
# 设置 10 秒超时
|
||||
cursor.max_time_ms(10000)
|
||||
|
||||
logger.info("Cursor created with 10s timeout, executing...")
|
||||
|
||||
# 使用 batch_size 逐批获取
|
||||
documents_raw = await cursor.to_list(length=limit)
|
||||
logger.info(f"查询到原始文档数: {len(documents_raw)}")
|
||||
|
||||
documents = []
|
||||
async for doc in cursor:
|
||||
for doc in documents_raw:
|
||||
documents.append({
|
||||
"doc_id": str(doc["_id"]),
|
||||
"filename": doc.get("metadata", {}).get("filename", ""),
|
||||
@@ -55,10 +73,12 @@ async def get_documents(
|
||||
"metadata": {
|
||||
"row_count": doc.get("metadata", {}).get("row_count"),
|
||||
"column_count": doc.get("metadata", {}).get("column_count"),
|
||||
"columns": doc.get("metadata", {}).get("columns", [])[:10] # 只返回前10列
|
||||
"columns": doc.get("metadata", {}).get("columns", [])[:10]
|
||||
}
|
||||
})
|
||||
|
||||
logger.info(f"文档列表处理完成: {len(documents)} 个文档")
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"documents": documents,
|
||||
@@ -66,6 +86,17 @@ async def get_documents(
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
err_str = str(e)
|
||||
# 如果是超时错误,返回空列表而不是报错
|
||||
if "timeout" in err_str.lower() or "time" in err_str.lower():
|
||||
logger.warning(f"文档查询超时,返回空列表: {err_str}")
|
||||
return {
|
||||
"success": True,
|
||||
"documents": [],
|
||||
"total": 0,
|
||||
"warning": "查询超时,请稍后重试"
|
||||
}
|
||||
logger.error(f"获取文档列表失败: {str(e)}", exc_info=True)
|
||||
raise HTTPException(status_code=500, detail=f"获取文档列表失败: {str(e)}")
|
||||
|
||||
|
||||
|
||||
@@ -226,9 +226,42 @@ async def export_filled_template(
|
||||
|
||||
|
||||
async def _export_to_excel(filled_data: dict, template_id: str) -> StreamingResponse:
|
||||
"""导出为 Excel 格式"""
|
||||
# 将字典转换为单行 DataFrame
|
||||
df = pd.DataFrame([filled_data])
|
||||
"""导出为 Excel 格式(支持多行)"""
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
logger.info(f"导出填表数据: {len(filled_data)} 个字段")
|
||||
|
||||
# 计算最大行数
|
||||
max_rows = 1
|
||||
for k, v in filled_data.items():
|
||||
if isinstance(v, list) and len(v) > max_rows:
|
||||
max_rows = len(v)
|
||||
logger.info(f" {k}: {type(v).__name__} = {str(v)[:80]}")
|
||||
|
||||
logger.info(f"最大行数: {max_rows}")
|
||||
|
||||
# 构建多行数据
|
||||
rows_data = []
|
||||
for row_idx in range(max_rows):
|
||||
row = {}
|
||||
for col_name, values in filled_data.items():
|
||||
if isinstance(values, list):
|
||||
# 取对应行的值,不足则填空
|
||||
row[col_name] = values[row_idx] if row_idx < len(values) else ""
|
||||
else:
|
||||
# 非列表,整个值填入第一行
|
||||
row[col_name] = values if row_idx == 0 else ""
|
||||
rows_data.append(row)
|
||||
|
||||
df = pd.DataFrame(rows_data)
|
||||
|
||||
# 确保列顺序
|
||||
if not df.empty:
|
||||
df = df[list(filled_data.keys())]
|
||||
|
||||
logger.info(f"DataFrame 形状: {df.shape}")
|
||||
logger.info(f"DataFrame 列: {list(df.columns)}")
|
||||
|
||||
output = io.BytesIO()
|
||||
with pd.ExcelWriter(output, engine='openpyxl') as writer:
|
||||
|
||||
@@ -11,6 +11,7 @@ import io
|
||||
from app.services.file_service import file_service
|
||||
from app.core.document_parser import XlsxParser
|
||||
from app.services.table_rag_service import table_rag_service
|
||||
from app.core.database import mongodb
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -95,6 +96,56 @@ async def upload_excel(
|
||||
except Exception as e:
|
||||
logger.error(f"Excel存储到MySQL异常: {str(e)}", exc_info=True)
|
||||
|
||||
# 存储到 MongoDB(用于文档列表展示)
|
||||
try:
|
||||
content = ""
|
||||
# 构建文本内容用于展示
|
||||
if result.data:
|
||||
if isinstance(result.data, dict):
|
||||
# 单 sheet 格式: {columns, rows, ...}
|
||||
if 'columns' in result.data and 'rows' in result.data:
|
||||
content += f"Sheet: {result.metadata.get('current_sheet', 'Sheet1') if result.metadata else 'Sheet1'}\n"
|
||||
content += ", ".join(str(h) for h in result.data['columns']) + "\n"
|
||||
for row in result.data['rows'][:100]:
|
||||
if isinstance(row, dict):
|
||||
content += ", ".join(str(row.get(col, "")) for col in result.data['columns']) + "\n"
|
||||
elif isinstance(row, list):
|
||||
content += ", ".join(str(cell) for cell in row) + "\n"
|
||||
content += f"... (共 {len(result.data['rows'])} 行)\n\n"
|
||||
# 多 sheet 格式: {sheets: {sheet_name: {columns, rows}}}
|
||||
elif 'sheets' in result.data:
|
||||
for sheet_name_key, sheet_data in result.data['sheets'].items():
|
||||
if isinstance(sheet_data, dict) and 'columns' in sheet_data and 'rows' in sheet_data:
|
||||
content += f"Sheet: {sheet_name_key}\n"
|
||||
content += ", ".join(str(h) for h in sheet_data['columns']) + "\n"
|
||||
for row in sheet_data['rows'][:100]:
|
||||
if isinstance(row, dict):
|
||||
content += ", ".join(str(row.get(col, "")) for col in sheet_data['columns']) + "\n"
|
||||
elif isinstance(row, list):
|
||||
content += ", ".join(str(cell) for cell in row) + "\n"
|
||||
content += f"... (共 {len(sheet_data['rows'])} 行)\n\n"
|
||||
|
||||
doc_metadata = {
|
||||
"filename": saved_path.split("/")[-1] if "/" in saved_path else saved_path.split("\\")[-1],
|
||||
"original_filename": file.filename,
|
||||
"saved_path": saved_path,
|
||||
"file_size": len(content),
|
||||
"row_count": result.metadata.get('row_count', 0) if result.metadata else 0,
|
||||
"column_count": result.metadata.get('column_count', 0) if result.metadata else 0,
|
||||
"columns": result.metadata.get('columns', []) if result.metadata else [],
|
||||
"mysql_table": result.metadata.get('mysql_table') if result.metadata else None,
|
||||
"sheet_count": result.metadata.get('sheet_count', 1) if result.metadata else 1,
|
||||
}
|
||||
await mongodb.insert_document(
|
||||
doc_type="xlsx",
|
||||
content=content,
|
||||
metadata=doc_metadata,
|
||||
structured_data=result.data if result.data else None
|
||||
)
|
||||
logger.info(f"Excel文档已存储到MongoDB: {file.filename}, content长度: {len(content)}")
|
||||
except Exception as e:
|
||||
logger.error(f"Excel存储到MongoDB异常: {str(e)}", exc_info=True)
|
||||
|
||||
return result.to_dict()
|
||||
|
||||
except HTTPException:
|
||||
|
||||
Reference in New Issue
Block a user