# FilesReadSystem/backend/app/api/endpoints/library.py

"""
文档库管理 API 接口

提供文档列表、详情查询和删除功能
"""
import logging
from typing import Optional, List

from fastapi import APIRouter, HTTPException, Query
from pydantic import BaseModel

from app.core.database import mongodb

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)

# Router holding the endpoints defined below; every route path is prefixed
# with "/documents" and carries the tag configured here.
router = APIRouter(prefix="/documents", tags=["文档库"])


class DocumentItem(BaseModel):
    """Pydantic model describing one document-library entry.

    The field names match the keys of the per-document dicts that the list
    endpoint in this module builds.

    NOTE(review): nothing in the visible code references this model (no
    endpoint declares it as a response model) — confirm whether it is still
    needed or should be wired in.
    """
    # Document identifier as a string (the endpoints derive it from `_id`).
    doc_id: str
    # Stored file name — presumably the name on disk; verify against uploader.
    filename: str
    # File name as originally supplied — presumably by the uploading client.
    original_filename: str
    # Document type label; the list endpoint filters on this value.
    doc_type: str
    # File size — unit is not established by this file (presumably bytes).
    file_size: int
    # Creation timestamp rendered as a string (the endpoints use isoformat()).
    created_at: str
    # Optional free-form metadata; defaults to None when absent.
    metadata: Optional[dict] = None


# Lower-case substrings that identify a timeout in a driver error message.
_TIMEOUT_MARKERS = ("timeout", "timed out", "time limit")

# MongoDB server error code reported when a query exceeds its maxTimeMS budget.
_MAX_TIME_MS_EXPIRED_CODE = 50


def _is_timeout_error(exc: Exception) -> bool:
    """Return True when *exc* looks like a query or network timeout."""
    # Driver timeout exception classes carry "Timeout" in their class name.
    if "timeout" in type(exc).__name__.lower():
        return True
    # Server-side maxTimeMS expiry is reported through a numeric error code.
    if getattr(exc, "code", None) == _MAX_TIME_MS_EXPIRED_CODE:
        return True
    # Fall back to the message text, but only for timeout-specific phrases.
    # A bare "time" match would also swallow unrelated errors that merely
    # mention e.g. "datetime" or "runtime".
    message = str(exc).lower()
    return any(marker in message for marker in _TIMEOUT_MARKERS)


def _format_created_at(value) -> str:
    """Render a ``created_at`` value as a string; empty string when missing."""
    if not value:
        return ""
    isoformat = getattr(value, "isoformat", None)
    # Values without isoformat() (e.g. already-serialized strings) are passed
    # through str() instead of raising AttributeError.
    return isoformat() if callable(isoformat) else str(value)


def _summarize_document(doc: dict) -> dict:
    """Build the list-view summary dict for one raw document."""
    # `or {}` also covers a metadata field that is present but stored as null.
    meta = doc.get("metadata") or {}
    return {
        "doc_id": str(doc["_id"]),
        "filename": meta.get("filename", ""),
        "original_filename": meta.get("original_filename", ""),
        "doc_type": doc.get("doc_type", ""),
        "file_size": meta.get("file_size", 0),
        "created_at": _format_created_at(doc.get("created_at")),
        "metadata": {
            "row_count": meta.get("row_count"),
            "column_count": meta.get("column_count"),
            # Only the first 10 column names are included in the summary.
            "columns": (meta.get("columns") or [])[:10]
        }
    }


@router.get("")
async def get_documents(
    doc_type: Optional[str] = Query(None, description="文档类型过滤"),
    limit: int = Query(20, ge=1, le=100, description="返回数量"),
    skip: int = Query(0, ge=0, description="跳过数量")
):
    """
    List stored documents, newest first.

    Args:
        doc_type: Optional exact-match filter on the ``doc_type`` field.
        limit: Maximum number of documents to return (1-100).
        skip: Number of documents to skip before the first returned one.

    Returns:
        A dict with ``success``, ``documents`` (list of summaries) and
        ``total``. When the query times out, an empty list is returned
        together with a ``warning`` message instead of an error.

    Raises:
        HTTPException: 500 for any failure that is not a timeout.
    """
    try:
        # Build the filter; an empty filter matches every document.
        query = {}
        if doc_type:
            query["doc_type"] = doc_type

        logger.info(f"开始查询文档列表, query: {query}, limit: {limit}")

        # The projection excludes the `content` field to reduce data transfer.
        cursor = mongodb.documents.find(
            query,
            {"content": 0}
        ).sort("created_at", -1).skip(skip).limit(limit)

        # Abort the query after 10 seconds.
        cursor.max_time_ms(10000)

        logger.info("Cursor created with 10s timeout, executing...")

        documents_raw = await cursor.to_list(length=limit)
        logger.info(f"查询到原始文档数: {len(documents_raw)}")

        documents = [_summarize_document(doc) for doc in documents_raw]

        logger.info(f"文档列表处理完成: {len(documents)} 个文档")

        # NOTE(review): `total` is the number of documents in this page, not
        # the number of documents matching the filter in the collection.
        return {
            "success": True,
            "documents": documents,
            "total": len(documents)
        }

    except Exception as e:
        err_str = str(e)
        # Deliberate best-effort: a timeout yields an empty list, not an error.
        if _is_timeout_error(e):
            logger.warning(f"文档查询超时，返回空列表: {err_str}")
            return {
                "success": True,
                "documents": [],
                "total": 0,
                "warning": "查询超时，请稍后重试"
            }
        logger.error(f"获取文档列表失败: {str(e)}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"获取文档列表失败: {str(e)}") from e


@router.get("/{doc_id}")
async def get_document(doc_id: str):
    """
    Return the full record for a single document.

    Args:
        doc_id: Identifier of the document to fetch.

    Returns:
        A dict with ``success`` and ``document``; the document includes its
        raw ``content``, optional ``structured_data`` and full ``metadata``.

    Raises:
        HTTPException: 404 when the lookup returns nothing; 500 for any
            other failure.
    """
    try:
        doc = await mongodb.get_document(doc_id)

        if not doc:
            raise HTTPException(status_code=404, detail="文档不存在")

        # `or {}` also covers a metadata field that is present but stored as
        # null, which would otherwise raise AttributeError on `.get`.
        meta = doc.get("metadata") or {}

        # Values without isoformat() (e.g. already-serialized strings) are
        # passed through str() instead of raising AttributeError.
        created_raw = doc.get("created_at")
        if not created_raw:
            created_at = ""
        elif callable(getattr(created_raw, "isoformat", None)):
            created_at = created_raw.isoformat()
        else:
            created_at = str(created_raw)

        return {
            "success": True,
            "document": {
                "doc_id": str(doc["_id"]),
                "filename": meta.get("filename", ""),
                "original_filename": meta.get("original_filename", ""),
                "doc_type": doc.get("doc_type", ""),
                "file_size": meta.get("file_size", 0),
                "created_at": created_at,
                "content": doc.get("content", ""),  # raw text content
                "structured_data": doc.get("structured_data"),  # structured data, if any
                "metadata": meta
            }
        }

    except HTTPException:
        # Let the 404 raised above pass through untouched.
        raise
    except Exception as e:
        # Log with traceback so the failure is visible server-side, matching
        # the error handling of the list endpoint.
        logger.error(f"获取文档详情失败: {str(e)}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"获取文档详情失败: {str(e)}") from e


@router.delete("/{doc_id}")
async def delete_document(doc_id: str):
    """
    Delete a document from MongoDB.

    Args:
        doc_id: Identifier of the document to delete.

    Returns:
        A dict with ``success`` and a confirmation ``message``.

    Raises:
        HTTPException: 404 when the delete call reports nothing was deleted;
            500 for any other failure.
    """
    try:
        deleted = await mongodb.delete_document(doc_id)
    except HTTPException:
        raise
    except Exception as e:
        # Log with traceback so the failure is visible server-side, matching
        # the error handling of the list endpoint.
        logger.error(f"删除失败: {str(e)}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"删除失败: {str(e)}") from e

    if not deleted:
        raise HTTPException(status_code=404, detail="文档不存在")

    # TODO: delete related data from MySQL (if the document is an Excel file)
    # TODO: delete related indexes from RAG

    return {
        "success": True,
        "message": "文档已删除"
    }