- 新增 _extract_values_from_structured_data 方法,直接从Excel rows提取列值 - 新增 _extract_values_by_regex 方法,使用正则从损坏的JSON中提取值 - 增大 max_tokens (500→50000) 和 max_length (8000→200000) 限制 - 改进JSON解析逻辑,处理markdown代码块包裹和不完整JSON - 解决LLM返回被截断的JSON无法正确解析的问题
171 lines
5.1 KiB
Python
171 lines
5.1 KiB
Python
"""
|
|
文档库管理 API 接口
|
|
|
|
提供文档列表、详情查询和删除功能
|
|
"""
|
|
import logging
|
|
from typing import Optional, List
|
|
|
|
from fastapi import APIRouter, HTTPException, Query
|
|
from pydantic import BaseModel
|
|
|
|
from app.core.database import mongodb
|
|
|
|
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
|
|
|
|
# Router that groups every endpoint in this file under the "/documents" prefix.
router = APIRouter(prefix="/documents", tags=["文档库"])
|
|
|
|
|
|
class DocumentItem(BaseModel):
    """Summary record for a single stored document.

    The field names match the keys of the per-document dictionaries built by
    the list endpoint in this module. Field order and types are part of the
    model's schema and are kept exactly as declared.
    """

    # Document identifier, carried as a string.
    doc_id: str
    # File name under which the document is stored.
    filename: str
    # File name as originally supplied.
    original_filename: str
    # Document type label (the list endpoint filters on this value).
    doc_type: str
    # File size; presumably in bytes -- TODO confirm against the uploader.
    file_size: int
    # Creation timestamp, already rendered as a string.
    created_at: str
    # Optional free-form extra information about the document.
    metadata: Optional[dict] = None
|
|
|
|
|
|
# Time budget, in milliseconds, applied to the list query's cursor.
_LIST_QUERY_TIMEOUT_MS = 10000

# Exception class names (looked up anywhere in the exception's MRO) that
# denote a timeout. Matching by name avoids importing the driver here.
_TIMEOUT_ERROR_NAMES = frozenset(
    {"ExecutionTimeout", "NetworkTimeout", "ServerSelectionTimeoutError"}
)

# Lower-cased message fragments that identify a timeout. The bare word
# "time" is deliberately NOT in this list: it also matches unrelated errors
# such as ones mentioning "datetime" or "runtime".
_TIMEOUT_MESSAGE_MARKERS = ("timeout", "timed out", "exceeded time limit")

# Server error code MongoDB uses for an expired maxTimeMS (MaxTimeMSExpired).
_MAX_TIME_MS_EXPIRED_CODE = 50


def _is_query_timeout(exc: Exception) -> bool:
    """Return True only when *exc* really looks like a query timeout."""
    if isinstance(exc, TimeoutError):
        return True
    if getattr(exc, "code", None) == _MAX_TIME_MS_EXPIRED_CODE:
        return True
    if any(cls.__name__ in _TIMEOUT_ERROR_NAMES for cls in type(exc).__mro__):
        return True
    message = str(exc).lower()
    return any(marker in message for marker in _TIMEOUT_MESSAGE_MARKERS)


def _created_at_text(value: object) -> str:
    """Render a stored ``created_at`` value as a string.

    Falsy values become ``""``. Values exposing ``isoformat()`` (datetimes)
    are rendered through it; anything else falls back to ``str()`` instead of
    raising ``AttributeError``.
    """
    if not value:
        return ""
    to_iso = getattr(value, "isoformat", None)
    return to_iso() if callable(to_iso) else str(value)


def _to_list_item(doc: dict) -> dict:
    """Build the list-view summary dictionary for one raw document."""
    # ``metadata`` may be absent or stored as an explicit null; both are
    # treated as "no metadata" so the lookups below cannot fail.
    meta = doc.get("metadata") or {}
    return {
        "doc_id": str(doc["_id"]),
        "filename": meta.get("filename", ""),
        "original_filename": meta.get("original_filename", ""),
        "doc_type": doc.get("doc_type", ""),
        "file_size": meta.get("file_size", 0),
        "created_at": _created_at_text(doc.get("created_at")),
        "metadata": {
            "row_count": meta.get("row_count"),
            "column_count": meta.get("column_count"),
            # Only the first 10 column names are returned; a null value is
            # treated as an empty list.
            "columns": (meta.get("columns") or [])[:10],
        },
    }


@router.get("")
async def get_documents(
    doc_type: Optional[str] = Query(None, description="文档类型过滤"),
    limit: int = Query(20, ge=1, le=100, description="返回数量"),
    skip: int = Query(0, ge=0, description="跳过数量")
):
    """List stored documents, newest first.

    Args:
        doc_type: When given, only documents with this ``doc_type`` are
            returned.
        limit: Maximum number of documents to return (1-100).
        skip: Number of matching documents to skip before the first result.

    Returns:
        A dictionary with ``success``, ``documents`` (one summary per
        document) and ``total``. ``total`` is the number of documents in this
        response, not the number of matches in the collection. When the query
        times out, an empty list is returned together with a ``warning``.

    Raises:
        HTTPException: 500 for any failure that is not a timeout.
    """
    try:
        # Build the filter.
        query = {}
        if doc_type:
            query["doc_type"] = doc_type

        logger.info(f"开始查询文档列表, query: {query}, limit: {limit}")

        # The projection drops the ``content`` field so large document bodies
        # are not transferred for a list view.
        cursor = (
            mongodb.documents.find(query, {"content": 0})
            .sort("created_at", -1)
            .skip(skip)
            .limit(limit)
        )

        # Cap the query's execution time.
        cursor.max_time_ms(_LIST_QUERY_TIMEOUT_MS)
        logger.info("Cursor created with 10s timeout, executing...")

        documents_raw = await cursor.to_list(length=limit)
        logger.info(f"查询到原始文档数: {len(documents_raw)}")

        documents = [_to_list_item(doc) for doc in documents_raw]
        logger.info(f"文档列表处理完成: {len(documents)} 个文档")

        return {
            "success": True,
            "documents": documents,
            "total": len(documents)
        }

    except Exception as e:
        # A genuine timeout degrades to an empty list instead of an error.
        # Every other failure must surface as a 500 rather than be reported
        # as a successful, empty result.
        if _is_query_timeout(e):
            logger.warning(f"文档查询超时,返回空列表: {str(e)}")
            return {
                "success": True,
                "documents": [],
                "total": 0,
                "warning": "查询超时,请稍后重试"
            }
        logger.error(f"获取文档列表失败: {str(e)}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"获取文档列表失败: {str(e)}") from e
|
|
|
|
|
|
def _detail_created_at(value: object) -> str:
    """Render a stored ``created_at`` value as a string.

    Falsy values become ``""``. Values exposing ``isoformat()`` (datetimes)
    are rendered through it; anything else falls back to ``str()`` instead of
    raising ``AttributeError``.
    """
    if not value:
        return ""
    to_iso = getattr(value, "isoformat", None)
    return to_iso() if callable(to_iso) else str(value)


@router.get("/{doc_id}")
async def get_document(doc_id: str):
    """Return the full details of one document.

    Args:
        doc_id: Identifier of the document to fetch.

    Returns:
        A dictionary with ``success`` and ``document``. The document carries
        the summary fields plus the raw ``content``, the ``structured_data``
        (if any) and the complete ``metadata`` mapping.

    Raises:
        HTTPException: 404 when the lookup returns nothing; 500 when the
            lookup or the response assembly fails unexpectedly.
    """
    try:
        doc = await mongodb.get_document(doc_id)

        if not doc:
            raise HTTPException(status_code=404, detail="文档不存在")

        # ``metadata`` may be absent or stored as an explicit null; both are
        # normalized to an empty mapping so the lookups below cannot fail.
        meta = doc.get("metadata") or {}

        return {
            "success": True,
            "document": {
                "doc_id": str(doc["_id"]),
                "filename": meta.get("filename", ""),
                "original_filename": meta.get("original_filename", ""),
                "doc_type": doc.get("doc_type", ""),
                "file_size": meta.get("file_size", 0),
                "created_at": _detail_created_at(doc.get("created_at")),
                "content": doc.get("content", ""),  # raw text content
                "structured_data": doc.get("structured_data"),  # structured data, if any
                "metadata": meta
            }
        }

    except HTTPException:
        # Let the 404 above pass through untouched.
        raise
    except Exception as e:
        # Record the failure with its traceback before converting it to a 500.
        logger.error(f"获取文档详情失败: {str(e)}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"获取文档详情失败: {str(e)}") from e
|
|
|
|
|
|
@router.delete("/{doc_id}")
async def delete_document(doc_id: str):
    """Delete one document.

    Args:
        doc_id: Identifier of the document to delete.

    Returns:
        A dictionary with ``success`` and a confirmation ``message``.

    Raises:
        HTTPException: 404 when the delete call reports that nothing was
            deleted; 500 when the delete call fails unexpectedly.
    """
    try:
        # Remove the document from MongoDB.
        deleted = await mongodb.delete_document(doc_id)

        if not deleted:
            raise HTTPException(status_code=404, detail="文档不存在")

        # TODO: also delete the related MySQL data (for Excel documents).
        # TODO: also delete the related RAG index entries.

        return {
            "success": True,
            "message": "文档已删除"
        }

    except HTTPException:
        # Let the 404 above pass through untouched.
        raise
    except Exception as e:
        # Record the failure with its traceback before converting it to a 500.
        logger.error(f"删除失败: {str(e)}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"删除失败: {str(e)}") from e
|