# FilesReadSystem/backend/app/api/endpoints/ai_analyze.py

"""
AI 分析 API 接口
"""
from fastapi import APIRouter, UploadFile, File, HTTPException, Query, Body, Form
from fastapi.responses import StreamingResponse
from typing import Optional
import logging
import tempfile
import os

from app.services.excel_ai_service import excel_ai_service
from app.services.markdown_ai_service import markdown_ai_service
from app.services.template_fill_service import template_fill_service
from app.services.word_ai_service import word_ai_service
from app.services.txt_ai_service import txt_ai_service

logger = logging.getLogger(__name__)

router = APIRouter(prefix="/ai", tags=["AI 分析"])


@router.post("/analyze/excel")
async def analyze_excel(
    file: Optional[UploadFile] = File(None),
    doc_id: Optional[str] = Form(None, description="文档ID（从数据库读取）"),
    user_prompt: str = Query("", description="用户自定义提示词"),
    analysis_type: str = Query("general", description="分析类型: general, summary, statistics, insights"),
    parse_all_sheets: bool = Query(False, description="是否分析所有工作表")
):
    """
    Analyze an Excel workbook with AI, from an upload or a stored document.

    When ``doc_id`` is given it takes precedence: the document record is
    loaded from MongoDB and the file referenced by ``metadata.file_path`` is
    analyzed. Otherwise the uploaded ``file`` is read and analyzed in memory.

    Args:
        file: Uploaded Excel file (alternative to ``doc_id``).
        doc_id: ID of a stored document to analyze.
        user_prompt: Optional custom prompt forwarded to the AI service.
        analysis_type: One of ``general``, ``summary``, ``statistics``,
            ``insights``.
        parse_all_sheets: When true, the batch (all sheets) analysis of
            ``excel_ai_service`` is used instead of the single-file analysis.

    Returns:
        dict: The result produced by ``excel_ai_service``, returned unchanged.

    Raises:
        HTTPException: 400 for an unsupported analysis type, a missing or
            non-Excel file, an empty upload, or a stored document without a
            file path; 404 when ``doc_id`` is unknown; 500 on any unexpected
            failure.
    """
    # Validate the analysis type before branching so that the doc_id mode
    # rejects unsupported values exactly like the upload mode does.
    supported_types = ['general', 'summary', 'statistics', 'insights']
    if analysis_type not in supported_types:
        raise HTTPException(
            status_code=400,
            detail=f"不支持的分析类型: {analysis_type}，支持的类型: {', '.join(supported_types)}"
        )

    # Stored-document mode
    if doc_id:
        try:
            from app.core.database.mongodb import mongodb
            doc = await mongodb.get_document(doc_id)
            if not doc:
                raise HTTPException(status_code=404, detail=f"文档不存在: {doc_id}")

            filename = doc.get("metadata", {}).get("original_filename", "unknown.xlsx")
            file_ext = filename.split('.')[-1].lower()

            if file_ext not in ['xlsx', 'xls']:
                raise HTTPException(status_code=400, detail=f"文档类型不是 Excel: {file_ext}")

            file_path = doc.get("metadata", {}).get("file_path")
            if not file_path:
                raise HTTPException(status_code=400, detail="文档没有存储文件路径，请重新上传")

            # Analyze straight from the stored file path
            if parse_all_sheets:
                result = await excel_ai_service.batch_analyze_sheets_from_path(
                    file_path=file_path,
                    filename=filename,
                    user_prompt=user_prompt,
                    analysis_type=analysis_type
                )
            else:
                result = await excel_ai_service.analyze_excel_file_from_path(
                    file_path=file_path,
                    filename=filename,
                    user_prompt=user_prompt,
                    analysis_type=analysis_type
                )

            # The service result is passed through whether or not it succeeded
            return result

        except HTTPException:
            raise
        except Exception as e:
            logger.error(f"从数据库读取 Excel 文档失败: {str(e)}")
            raise HTTPException(status_code=500, detail=f"读取文档失败: {str(e)}") from e

    # Upload mode
    if not file:
        raise HTTPException(status_code=400, detail="请提供文件或文档ID")

    if not file.filename:
        raise HTTPException(status_code=400, detail="文件名为空")

    file_ext = file.filename.split('.')[-1].lower()
    if file_ext not in ['xlsx', 'xls']:
        raise HTTPException(
            status_code=400,
            detail=f"不支持的文件类型: {file_ext}，仅支持 .xlsx 和 .xls"
        )

    try:
        content = await file.read()

        # Reject empty uploads before calling the AI service
        if not content:
            raise HTTPException(status_code=400, detail="文件内容为空，请确保文件已正确上传")

        logger.info(f"开始分析文件: {file.filename}, 分析类型: {analysis_type}, 文件大小: {len(content)} bytes")

        if parse_all_sheets:
            result = await excel_ai_service.batch_analyze_sheets(
                content,
                file.filename,
                user_prompt=user_prompt,
                analysis_type=analysis_type
            )
        else:
            # Parse options passed to the single-file analysis
            parse_options = {"header_row": 0}

            result = await excel_ai_service.analyze_excel_file(
                content,
                file.filename,
                user_prompt=user_prompt,
                analysis_type=analysis_type,
                parse_options=parse_options
            )

        # .get() so a result without a "success" key cannot fail the request
        # merely because of this log line
        logger.info(f"文件分析完成: {file.filename}, 成功: {result.get('success')}")

        return result

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"AI 分析过程中出错: {str(e)}")
        raise HTTPException(status_code=500, detail=f"分析失败: {str(e)}") from e


@router.get("/analysis/types")
async def get_analysis_types():
    """
    List the analysis types supported by the Excel and Markdown AI services.

    Returns:
        dict: ``excel_types`` and ``markdown_types``, each holding whatever
        the corresponding service's ``get_supported_analysis_types()`` returns.
    """
    excel_types = excel_ai_service.get_supported_analysis_types()
    markdown_types = markdown_ai_service.get_supported_analysis_types()
    return {"excel_types": excel_types, "markdown_types": markdown_types}


@router.post("/analyze/text")
async def analyze_text(
    excel_data: dict = Body(..., description="Excel 解析后的数据"),
    user_prompt: str = Body("", description="用户提示词"),
    analysis_type: str = Body("general", description="分析类型")
):
    """
    Run an AI analysis over Excel data that has already been parsed.

    A non-blank ``user_prompt`` selects the template-based analysis of
    ``llm_service``; otherwise the standard Excel data analysis is used.

    Args:
        excel_data: Parsed Excel data.
        user_prompt: Optional user prompt.
        analysis_type: Analysis type forwarded to the standard analysis.

    Returns:
        dict: The result returned by ``llm_service``.

    Raises:
        HTTPException: 500 when anything in the analysis fails.
    """
    try:
        logger.info(f"开始文本分析, 分析类型: {analysis_type}")

        # Imported lazily, after the start-of-analysis log line
        from app.services.llm_service import llm_service

        has_custom_prompt = bool(user_prompt) and bool(user_prompt.strip())
        if not has_custom_prompt:
            outcome = await llm_service.analyze_excel_data(
                excel_data,
                user_prompt,
                analysis_type
            )
        else:
            outcome = await llm_service.analyze_with_template(
                excel_data,
                user_prompt
            )

        logger.info(f"文本分析完成, 成功: {outcome['success']}")
        return outcome

    except Exception as exc:
        logger.error(f"文本分析失败: {exc!s}")
        raise HTTPException(status_code=500, detail=f"分析失败: {exc!s}")


@router.post("/analyze/md")
async def analyze_markdown(
    file: Optional[UploadFile] = File(None),
    doc_id: Optional[str] = Form(None, description="文档ID（从数据库读取）"),
    analysis_type: str = Query("summary", description="分析类型: summary, outline, key_points, questions, tags, qa, statistics, section, charts"),
    user_prompt: str = Query("", description="用户自定义提示词"),
    section_number: Optional[str] = Query(None, description="指定章节编号，如 '一' 或 '（一）'")
):
    """
    Analyze a Markdown document with AI, from an upload or a stored document.

    The Markdown text is written to a temporary ``.md`` file, handed to
    ``markdown_ai_service.analyze_markdown`` and the temporary file is removed
    afterwards. ``doc_id`` takes precedence over ``file`` when both are given.

    Args:
        file: Uploaded Markdown file (alternative to ``doc_id``).
        doc_id: ID of a stored document to analyze.
        analysis_type: Must be one of the types reported by the service.
        user_prompt: Optional custom prompt.
        section_number: Optional section number to restrict the analysis to.

    Returns:
        dict: The successful service result.

    Raises:
        HTTPException: 400 for invalid input, 404 for an unknown ``doc_id``,
            500 when reading or analysis fails or the service reports failure.
    """
    markdown_exts = ('md', 'markdown')
    filename = None
    tmp_path = None

    # Reject unsupported analysis types before doing any I/O
    allowed_types = markdown_ai_service.get_supported_analysis_types()
    if analysis_type not in allowed_types:
        raise HTTPException(
            status_code=400,
            detail=f"不支持的分析类型: {analysis_type}，支持的类型: {', '.join(allowed_types)}"
        )

    if not doc_id:
        # Upload mode
        if not file:
            raise HTTPException(status_code=400, detail="请提供文件或文档ID")

        if not file.filename:
            raise HTTPException(status_code=400, detail="文件名为空")

        upload_ext = file.filename.split('.')[-1].lower()
        if upload_ext not in markdown_exts:
            raise HTTPException(
                status_code=400,
                detail=f"不支持的文件类型: {upload_ext}，仅支持 .md 和 .markdown"
            )

        try:
            raw_bytes = await file.read()

            # Spill the upload to a temporary file for the service to read
            with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as handle:
                handle.write(raw_bytes)
                tmp_path = handle.name

            filename = file.filename

        except Exception as exc:
            logger.error(f"读取 Markdown 文件失败: {exc!s}")
            raise HTTPException(status_code=500, detail=f"读取文件失败: {exc!s}")
    else:
        # Stored-document mode
        try:
            from app.core.database.mongodb import mongodb
            doc = await mongodb.get_document(doc_id)
            if not doc:
                raise HTTPException(status_code=404, detail=f"文档不存在: {doc_id}")

            metadata = doc.get("metadata", {})
            filename = metadata.get("original_filename", "unknown.md")
            stored_ext = filename.split('.')[-1].lower()

            if stored_ext not in markdown_exts:
                raise HTTPException(status_code=400, detail=f"文档类型不是 Markdown: {stored_ext}")

            markdown_text = doc.get("content") or ""
            if not markdown_text:
                raise HTTPException(status_code=400, detail="文档内容为空")

            # Spill the stored text to a temporary file for the service to read
            with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as handle:
                handle.write(markdown_text.encode('utf-8'))
                tmp_path = handle.name

            logger.info(f"从数据库加载 Markdown 文档: {filename}, 长度: {len(markdown_text)}")

        except HTTPException:
            raise
        except Exception as exc:
            logger.error(f"从数据库读取 Markdown 文档失败: {exc!s}")
            raise HTTPException(status_code=500, detail=f"读取文档失败: {exc!s}")

    try:
        logger.info(f"开始分析 Markdown 文件: {filename}, 分析类型: {analysis_type}, 章节: {section_number}")

        outcome = await markdown_ai_service.analyze_markdown(
            file_path=tmp_path,
            analysis_type=analysis_type,
            user_prompt=user_prompt,
            section_number=section_number
        )

        logger.info(f"Markdown 分析完成: {filename}, 成功: {outcome['success']}")

        if outcome['success']:
            return outcome

        # A failure reported by the service is surfaced as a 500
        raise HTTPException(status_code=500, detail=outcome.get('error', '分析失败'))

    except HTTPException:
        raise
    except Exception as exc:
        logger.error(f"Markdown AI 分析过程中出错: {exc!s}")
        raise HTTPException(status_code=500, detail=f"分析失败: {exc!s}")
    finally:
        # Best-effort removal of the temporary file
        if tmp_path and os.path.exists(tmp_path):
            try:
                os.unlink(tmp_path)
            except Exception as cleanup_error:
                logger.warning(f"临时文件清理失败: {tmp_path}, error: {cleanup_error}")


@router.post("/analyze/md/stream")
async def analyze_markdown_stream(
    file: UploadFile = File(...),
    analysis_type: str = Query("summary", description="分析类型"),
    user_prompt: str = Query("", description="用户自定义提示词"),
    section_number: Optional[str] = Query(None, description="指定章节编号")
):
    """
    Stream an AI analysis of an uploaded Markdown file as Server-Sent Events.

    The upload is written to a temporary ``.md`` file that
    ``markdown_ai_service.analyze_markdown_stream`` reads while the response
    is being streamed. The temporary file is therefore removed by the stream
    generator itself once streaming ends, not when this function returns.

    Args:
        file: Uploaded Markdown file.
        analysis_type: Analysis type forwarded to the service.
        user_prompt: Optional custom prompt.
        section_number: Optional section number to restrict the analysis to.

    Returns:
        StreamingResponse: ``text/event-stream`` response yielding the chunks
        produced by the service.

    Raises:
        HTTPException: 400 for a missing filename or a non-Markdown file,
            500 when the upload cannot be read or stored.
    """
    if not file.filename:
        raise HTTPException(status_code=400, detail="文件名为空")

    file_ext = file.filename.split('.')[-1].lower()
    if file_ext not in ['md', 'markdown']:
        raise HTTPException(
            status_code=400,
            detail=f"不支持的文件类型: {file_ext}，仅支持 .md 和 .markdown"
        )

    tmp_path = None

    def _remove_temp_file():
        # Best-effort cleanup; a failure here must not mask the real outcome
        try:
            if tmp_path and os.path.exists(tmp_path):
                os.unlink(tmp_path)
        except Exception as cleanup_error:
            logger.warning(f"临时文件清理失败: {tmp_path}, error: {cleanup_error}")

    try:
        content = await file.read()

        with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as tmp:
            # Record the path before writing so a failed write is still cleaned up
            tmp_path = tmp.name
            tmp.write(content)

        logger.info(f"开始流式分析 Markdown 文件: {file.filename}, 分析类型: {analysis_type}")

        async def stream_generator():
            # The generator body only runs while the response is being sent,
            # i.e. after this endpoint has returned. Deleting the file in the
            # endpoint would remove it before the service ever reads it, so
            # the cleanup lives here and runs when streaming finishes, fails
            # or is closed early.
            try:
                async for chunk in markdown_ai_service.analyze_markdown_stream(
                    file_path=tmp_path,
                    analysis_type=analysis_type,
                    user_prompt=user_prompt,
                    section_number=section_number
                ):
                    yield chunk
            finally:
                _remove_temp_file()

        return StreamingResponse(
            stream_generator(),
            media_type="text/event-stream",
            headers={
                "Cache-Control": "no-cache",
                "Connection": "keep-alive",
                "X-Accel-Buffering": "no"
            }
        )

    except HTTPException:
        _remove_temp_file()
        raise
    except Exception as e:
        # Setup failed before streaming started: nothing will consume the
        # generator, so remove the temporary file here.
        _remove_temp_file()
        logger.error(f"Markdown AI 流式分析出错: {str(e)}")
        raise HTTPException(status_code=500, detail=f"流式分析失败: {str(e)}") from e


@router.post("/analyze/md/outline")
async def get_markdown_outline(
    file: UploadFile = File(...)
):
    """
    Extract the outline (section structure) of an uploaded Markdown file.

    The upload is written to a temporary ``.md`` file, passed to
    ``markdown_ai_service.extract_outline`` and removed afterwards.

    Args:
        file: Uploaded Markdown file.

    Returns:
        dict: The outline returned by the service.

    Raises:
        HTTPException: 400 for a missing filename or a non-Markdown file,
            500 when reading the upload or extracting the outline fails.
    """
    if not file.filename:
        raise HTTPException(status_code=400, detail="文件名为空")

    extension = file.filename.split('.')[-1].lower()
    if extension not in ('md', 'markdown'):
        raise HTTPException(
            status_code=400,
            detail=f"不支持的文件类型: {extension}，仅支持 .md 和 .markdown"
        )

    try:
        raw_bytes = await file.read()

        with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as handle:
            handle.write(raw_bytes)
            tmp_path = handle.name

        try:
            return await markdown_ai_service.extract_outline(tmp_path)
        finally:
            # Best-effort removal of the temporary file on every exit path
            try:
                if tmp_path and os.path.exists(tmp_path):
                    os.unlink(tmp_path)
            except Exception as cleanup_error:
                logger.warning(f"临时文件清理失败: {tmp_path}, error: {cleanup_error}")

    except Exception as exc:
        logger.error(f"获取 Markdown 大纲失败: {exc!s}")
        raise HTTPException(status_code=500, detail=f"获取大纲失败: {exc!s}")


@router.post("/analyze/txt")
async def analyze_txt(
    file: Optional[UploadFile] = File(None),
    doc_id: Optional[str] = Form(None, description="文档ID（从数据库读取）"),
    analysis_type: str = Query("structured", description="分析类型: structured, charts")
):
    """
    Analyze a TXT document with AI, from an upload or a stored document.

    The text is passed to ``txt_ai_service.analyze_txt_with_ai`` together with
    the requested ``analysis_type``. ``doc_id`` takes precedence over ``file``
    when both are given.

    Args:
        file: Uploaded TXT file (alternative to ``doc_id``).
        doc_id: ID of a stored document to analyze.
        analysis_type: ``structured`` (default) or ``charts``; forwarded to
            the service as-is.

    Returns:
        dict: An envelope with ``success``, ``filename`` and ``result``; on
        success it also carries ``analysis_type``, on an empty service result
        it carries ``error`` instead.

    Raises:
        HTTPException: 400 for invalid input, 404 for an unknown ``doc_id``,
            500 when loading the document or the analysis fails.
    """
    txt_exts = ('txt', 'text')
    filename = None
    text_content = None

    if not doc_id:
        # Upload mode
        if not file:
            raise HTTPException(status_code=400, detail="请提供文件或文档ID")

        if not file.filename:
            raise HTTPException(status_code=400, detail="文件名为空")

        upload_ext = file.filename.split('.')[-1].lower()
        if upload_ext not in txt_exts:
            raise HTTPException(
                status_code=400,
                detail=f"不支持的文件类型: {upload_ext}，仅支持 .txt"
            )

        # Undecodable bytes are replaced rather than rejected
        raw_bytes = await file.read()
        text_content = raw_bytes.decode('utf-8', errors='replace')
        filename = file.filename
    else:
        # Stored-document mode
        try:
            from app.core.database.mongodb import mongodb
            doc = await mongodb.get_document(doc_id)
            if not doc:
                raise HTTPException(status_code=404, detail=f"文档不存在: {doc_id}")

            metadata = doc.get("metadata", {})
            filename = metadata.get("original_filename", "unknown.txt")
            stored_ext = filename.split('.')[-1].lower()

            if stored_ext not in txt_exts:
                raise HTTPException(status_code=400, detail=f"文档类型不是 TXT: {stored_ext}")

            # Use the content stored with the document record
            text_content = doc.get("content") or ""

            if not text_content:
                raise HTTPException(status_code=400, detail="文档内容为空")

            logger.info(f"从数据库加载 TXT 文档: {filename}, 长度: {len(text_content)}")

        except HTTPException:
            raise
        except Exception as exc:
            logger.error(f"从数据库读取 TXT 文档失败: {exc!s}")
            raise HTTPException(status_code=500, detail=f"读取文档失败: {exc!s}")

    try:
        logger.info(f"开始 AI 分析 TXT 文件: {filename}, analysis_type={analysis_type}")

        outcome = await txt_ai_service.analyze_txt_with_ai(
            content=text_content,
            filename=filename,
            analysis_type=analysis_type
        )

        if not outcome:
            # The service produced nothing usable
            logger.warning(f"TXT AI 分析返回空结果: {filename}")
            return {
                "success": False,
                "filename": filename,
                "error": "AI 分析未能提取到结构化数据",
                "result": None
            }

        logger.info(f"TXT AI 分析成功: {filename}")
        return {
            "success": outcome.get("success", True),
            "filename": filename,
            "analysis_type": analysis_type,
            "result": outcome
        }

    except HTTPException:
        raise
    except Exception as exc:
        logger.error(f"TXT AI 分析过程中出错: {exc!s}")
        raise HTTPException(status_code=500, detail=f"分析失败: {exc!s}")


# ==================== Word 文档 AI 解析 ====================

def _build_word_response(result: dict, filename: str, analysis_type: str) -> dict:
    """Wrap a ``word_ai_service`` result in the response envelope of /analyze/word."""
    if result.get("success"):
        return {
            "success": True,
            "filename": filename,
            "analysis_type": analysis_type,
            "result": result
        }
    return {
        "success": False,
        "filename": filename,
        "error": result.get("error", "AI 解析失败"),
        "result": None
    }


@router.post("/analyze/word")
async def analyze_word(
    file: Optional[UploadFile] = File(None),
    doc_id: Optional[str] = Form(None, description="文档ID（从数据库读取）"),
    user_hint: str = Form("", description="用户提示词，如'请提取表格数据'"),
    analysis_type: str = Query("structured", description="分析类型: structured, charts")
):
    """
    Analyze a Word document with AI, from an upload or a stored document.

    With ``analysis_type == "charts"`` the chart generation of
    ``word_ai_service`` is used; any other value runs the structured-data
    extraction. ``doc_id`` takes precedence over ``file`` when both are given:
    the stored ``content`` and ``structured_data.tables`` are analyzed.
    Otherwise the upload is written to a temporary file that is removed once
    the analysis is done.

    Args:
        file: Uploaded ``.docx`` file (alternative to ``doc_id``).
        doc_id: ID of a stored document to analyze.
        user_hint: Optional hint for the AI; a default extraction hint is
            substituted for structured analysis when it is empty.
        analysis_type: ``structured`` (default) or ``charts``.

    Returns:
        dict: An envelope with ``success``, ``filename`` and ``result``; on
        success it also carries ``analysis_type``, on failure ``error``.

    Raises:
        HTTPException: 400 for invalid input, 404 for an unknown ``doc_id``,
            500 when loading the document or the analysis fails.
    """
    default_hint = "请提取文档中的所有结构化数据，包括表格、键值对等"

    if doc_id:
        # Stored-document mode
        try:
            from app.core.database.mongodb import mongodb
            doc = await mongodb.get_document(doc_id)
            if not doc:
                raise HTTPException(status_code=404, detail=f"文档不存在: {doc_id}")

            filename = doc.get("metadata", {}).get("original_filename", "unknown.docx")
            file_ext = filename.split('.')[-1].lower()

            if file_ext not in ['docx']:
                raise HTTPException(status_code=400, detail=f"文档类型不是 Word: {file_ext}")

            # Analyze the content and tables stored with the document record
            content = doc.get("content", "") or ""
            structured_data = doc.get("structured_data") or {}
            tables = structured_data.get("tables", [])

            if analysis_type == "charts":
                result = await word_ai_service.generate_charts_from_db(
                    content=content,
                    tables=tables,
                    filename=filename,
                    user_hint=user_hint
                )
            else:
                result = await word_ai_service.parse_word_with_ai_from_db(
                    content=content,
                    tables=tables,
                    filename=filename,
                    user_hint=user_hint or default_hint
                )

            return _build_word_response(result, filename, analysis_type)

        except HTTPException:
            raise
        except Exception as e:
            logger.error(f"从数据库读取 Word 文档失败: {str(e)}")
            raise HTTPException(status_code=500, detail=f"读取文档失败: {str(e)}") from e

    # Upload mode
    if not file:
        raise HTTPException(status_code=400, detail="请提供文件或文档ID")

    if not file.filename:
        raise HTTPException(status_code=400, detail="文件名为空")

    file_ext = file.filename.split('.')[-1].lower()
    if file_ext not in ['docx']:
        raise HTTPException(
            status_code=400,
            detail=f"不支持的文件类型: {file_ext}，仅支持 .docx"
        )

    tmp_path = None
    try:
        content = await file.read()
        with tempfile.NamedTemporaryFile(delete=False, suffix=f".{file_ext}") as tmp:
            # Record the path before writing so a failed write is still cleaned up
            tmp_path = tmp.name
            tmp.write(content)

        if analysis_type == "charts":
            result = await word_ai_service.generate_charts(
                file_path=tmp_path,
                user_hint=user_hint
            )
        else:
            result = await word_ai_service.parse_word_with_ai(
                file_path=tmp_path,
                user_hint=user_hint or default_hint
            )

        return _build_word_response(result, file.filename, analysis_type)

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Word AI 分析过程中出错: {str(e)}")
        raise HTTPException(status_code=500, detail=f"分析失败: {str(e)}") from e
    finally:
        # Best-effort cleanup: a failed unlink is logged instead of turning a
        # finished analysis into a 500, matching the other endpoints.
        if tmp_path and os.path.exists(tmp_path):
            try:
                os.unlink(tmp_path)
            except Exception as cleanup_error:
                logger.warning(f"临时文件清理失败: {tmp_path}, error: {cleanup_error}")