feat(ai-analyze): 新增 Markdown 文件 AI 分析功能

- 添加 Markdown 文件上传和解析接口
- 实现流式分析和大纲提取功能
- 支持多种分析类型:摘要、大纲、关键点等
- 新增 markdown_ai_service 服务类
- 扩展 LLMService 支持流式调用
- 更新前端 API 接口定义和实现
This commit is contained in:
2026-04-02 11:53:12 +08:00
parent ddf30078f0
commit d189ea9620
6 changed files with 1286 additions and 118 deletions

View File

@@ -2,10 +2,14 @@
AI 分析 API 接口
"""
from fastapi import APIRouter, UploadFile, File, HTTPException, Query, Body
from fastapi.responses import StreamingResponse
from typing import Optional
import logging
import tempfile
import os
from app.services.excel_ai_service import excel_ai_service
from app.services.markdown_ai_service import markdown_ai_service
logger = logging.getLogger(__name__)
@@ -93,10 +97,11 @@ async def get_analysis_types():
获取支持的分析类型列表
Returns:
list: 支持的分析类型
dict: 支持的分析类型(包含 Excel 和 Markdown)
"""
return {
"types": excel_ai_service.get_supported_analysis_types()
"excel_types": excel_ai_service.get_supported_analysis_types(),
"markdown_types": markdown_ai_service.get_supported_analysis_types()
}
@@ -142,3 +147,185 @@ async def analyze_text(
except Exception as e:
logger.error(f"文本分析失败: {str(e)}")
raise HTTPException(status_code=500, detail=f"分析失败: {str(e)}")
@router.post("/analyze/md")
async def analyze_markdown(
    file: UploadFile = File(...),
    analysis_type: str = Query("summary", description="分析类型: summary, outline, key_points, questions, tags, qa, statistics, section"),
    user_prompt: str = Query("", description="用户自定义提示词"),
    section_number: Optional[str] = Query(None, description="指定章节编号,如 '''(一)'")
):
    """Upload a Markdown file and run an AI analysis over it.

    Args:
        file: uploaded Markdown document (.md / .markdown only).
        analysis_type: one of the types reported by markdown_ai_service.
        user_prompt: optional extra instructions appended to the prompt.
        section_number: when given, restrict analysis to that section.

    Returns:
        dict: the analysis result produced by markdown_ai_service.
    """
    # Reject uploads without a usable filename up front.
    if not file.filename:
        raise HTTPException(status_code=400, detail="文件名为空")

    extension = file.filename.split('.')[-1].lower()
    if extension not in {'md', 'markdown'}:
        raise HTTPException(
            status_code=400,
            detail=f"不支持的文件类型: {extension},仅支持 .md 和 .markdown"
        )

    # Make sure the requested analysis type is one the service knows about.
    known_types = markdown_ai_service.get_supported_analysis_types()
    if analysis_type not in known_types:
        raise HTTPException(
            status_code=400,
            detail=f"不支持的分析类型: {analysis_type},支持的类型: {', '.join(known_types)}"
        )

    try:
        raw_bytes = await file.read()

        # Persist the upload so the parser can work from a filesystem path.
        with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as handle:
            handle.write(raw_bytes)
            temp_path = handle.name

        try:
            logger.info(f"开始分析 Markdown 文件: {file.filename}, 分析类型: {analysis_type}, 章节: {section_number}")

            result = await markdown_ai_service.analyze_markdown(
                file_path=temp_path,
                analysis_type=analysis_type,
                user_prompt=user_prompt,
                section_number=section_number
            )

            logger.info(f"Markdown 分析完成: {file.filename}, 成功: {result['success']}")

            if not result['success']:
                raise HTTPException(status_code=500, detail=result.get('error', '分析失败'))
            return result
        finally:
            # Always drop the temp file, even when analysis raised.
            if os.path.exists(temp_path):
                os.unlink(temp_path)
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Markdown AI 分析过程中出错: {str(e)}")
        raise HTTPException(status_code=500, detail=f"分析失败: {str(e)}")
@router.post("/analyze/md/stream")
async def analyze_markdown_stream(
    file: UploadFile = File(...),
    analysis_type: str = Query("summary", description="分析类型"),
    user_prompt: str = Query("", description="用户自定义提示词"),
    section_number: Optional[str] = Query(None, description="指定章节编号")
):
    """Stream an AI analysis of a Markdown file as Server-Sent Events.

    Args:
        file: uploaded Markdown document (.md / .markdown only).
        analysis_type: analysis type understood by markdown_ai_service.
        user_prompt: optional extra instructions appended to the prompt.
        section_number: when given, restrict analysis to that section.

    Returns:
        StreamingResponse: text/event-stream of SSE chunks.
    """
    if not file.filename:
        raise HTTPException(status_code=400, detail="文件名为空")

    file_ext = file.filename.split('.')[-1].lower()
    if file_ext not in ['md', 'markdown']:
        raise HTTPException(
            status_code=400,
            detail=f"不支持的文件类型: {file_ext},仅支持 .md 和 .markdown"
        )

    tmp_path = None
    try:
        content = await file.read()

        with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as tmp:
            tmp.write(content)
            tmp_path = tmp.name

        logger.info(f"开始流式分析 Markdown 文件: {file.filename}, 分析类型: {analysis_type}")

        # BUG FIX: the temp file used to be unlinked in a `finally` that ran
        # as soon as the StreamingResponse object was returned — i.e. BEFORE
        # the generator ever read it, so every stream analyzed an
        # already-deleted file. Cleanup now runs inside the generator once
        # streaming actually finishes (or aborts).
        async def stream_generator():
            try:
                async for chunk in markdown_ai_service.analyze_markdown_stream(
                    file_path=tmp_path,
                    analysis_type=analysis_type,
                    user_prompt=user_prompt,
                    section_number=section_number
                ):
                    yield chunk
            finally:
                if os.path.exists(tmp_path):
                    os.unlink(tmp_path)

        return StreamingResponse(
            stream_generator(),
            media_type="text/event-stream",
            headers={
                "Cache-Control": "no-cache",
                "Connection": "keep-alive",
                "X-Accel-Buffering": "no"  # stop nginx from buffering the SSE stream
            }
        )
    except HTTPException:
        raise
    except Exception as e:
        # Setup failed before the generator took ownership of the temp file.
        if tmp_path and os.path.exists(tmp_path):
            os.unlink(tmp_path)
        logger.error(f"Markdown AI 流式分析出错: {str(e)}")
        raise HTTPException(status_code=500, detail=f"流式分析失败: {str(e)}")
@router.get("/analyze/md/outline")
async def get_markdown_outline(
    file: UploadFile = File(...)
):
    """Return the section outline of an uploaded Markdown document.

    Args:
        file: uploaded Markdown document (.md / .markdown only).

    Returns:
        dict: outline structure produced by markdown_ai_service.
    """
    # NOTE(review): this route is registered as GET but expects a multipart
    # file upload; most HTTP clients cannot send a body with GET. Consider
    # switching to POST (coordinate with the frontend before changing).
    if not file.filename:
        raise HTTPException(status_code=400, detail="文件名为空")

    suffix = file.filename.split('.')[-1].lower()
    if suffix not in ('md', 'markdown'):
        raise HTTPException(
            status_code=400,
            detail=f"不支持的文件类型: {suffix},仅支持 .md 和 .markdown"
        )

    try:
        payload = await file.read()

        # Write to a temp file so the parser can work from a path.
        with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as handle:
            handle.write(payload)
            temp_path = handle.name

        try:
            return await markdown_ai_service.extract_outline(temp_path)
        finally:
            if os.path.exists(temp_path):
                os.unlink(temp_path)
    except Exception as e:
        logger.error(f"获取 Markdown 大纲失败: {str(e)}")
        raise HTTPException(status_code=500, detail=f"获取大纲失败: {str(e)}")

View File

@@ -2,7 +2,7 @@
LLM 服务模块 - 封装大模型 API 调用
"""
import logging
from typing import Dict, Any, List, Optional
from typing import Dict, Any, List, Optional, AsyncGenerator
import httpx
from app.config import settings
@@ -87,6 +87,71 @@ class LLMService:
logger.error(f"解析 API 响应失败: {str(e)}")
raise
async def chat_stream(
    self,
    messages: List[Dict[str, str]],
    temperature: float = 0.7,
    max_tokens: Optional[int] = None,
    **kwargs
) -> AsyncGenerator[Dict[str, Any], None]:
    """Call the chat-completions API with stream=True and yield content deltas.

    Args:
        messages: chat messages in OpenAI format.
        temperature: sampling temperature.
        max_tokens: optional cap on generated tokens.
        **kwargs: extra payload fields merged into the request body.

    Yields:
        Dict[str, Any]: {"content": <delta text>} for each non-empty delta.

    Raises:
        httpx.HTTPStatusError: when the API answers with a 4xx/5xx status.
    """
    # Hoisted out of the per-line loop (the original re-imported json on
    # every received chunk).
    import json

    headers = {
        "Authorization": f"Bearer {self.api_key}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": self.model_name,
        "messages": messages,
        "temperature": temperature,
        "stream": True
    }
    if max_tokens:
        payload["max_tokens"] = max_tokens
    payload.update(kwargs)

    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            async with client.stream(
                "POST",
                f"{self.base_url}/chat/completions",
                headers=headers,
                json=payload
            ) as response:
                # BUG FIX: without this, a 4xx/5xx response was silently
                # iterated as if it were SSE data and the HTTPStatusError
                # handler below was dead code.
                response.raise_for_status()
                async for line in response.aiter_lines():
                    if not line.startswith("data: "):
                        continue
                    data = line[6:]  # strip the "data: " SSE prefix
                    if data == "[DONE]":
                        break
                    try:
                        chunk = json.loads(data)
                        delta = chunk.get("choices", [{}])[0].get("delta", {}).get("content", "")
                        if delta:
                            yield {"content": delta}
                    except json.JSONDecodeError:
                        # Skip keep-alive / malformed lines.
                        continue
    except httpx.HTTPStatusError as e:
        logger.error(f"LLM 流式 API 请求失败: {e.response.status_code}")
        raise
    except Exception as e:
        logger.error(f"LLM 流式 API 调用异常: {str(e)}")
        raise
async def analyze_excel_data(
self,
excel_data: Dict[str, Any],

View File

@@ -0,0 +1,591 @@
"""
Markdown 文档 AI 分析服务
支持:
- 分章节解析(中文章节编号:一、二、三, (一)(二)(三))
- 结构化数据提取
- 流式输出
- 多种分析类型
"""
import asyncio
import json
import logging
import re
from typing import Any, AsyncGenerator, Dict, List, Optional
from app.services.llm_service import llm_service
from app.core.document_parser import MarkdownParser
logger = logging.getLogger(__name__)
class MarkdownSection:
    """A single section of a Markdown document.

    Attributes mirror how the section was located in the source text:
    number (e.g. "一", "(一)", "1"), title, nesting level, the section body
    (excluding subsections) and the 1-based line span it covers.
    """

    def __init__(self, number: str, title: str, level: int, content: str, line_start: int, line_end: int):
        self.number = number
        self.title = title
        self.level = level
        self.content = content
        self.line_start = line_start
        self.line_end = line_end
        # Child sections are attached by the parser after construction.
        self.subsections: List[MarkdownSection] = []

    def to_dict(self) -> Dict[str, Any]:
        """Serialize recursively; long content is truncated to a 200-char preview."""
        preview = self.content
        if len(preview) > 200:
            preview = preview[:200] + "..."
        return {
            "number": self.number,
            "title": self.title,
            "level": self.level,
            "content_preview": preview,
            "line_start": self.line_start,
            "line_end": self.line_end,
            "subsections": [child.to_dict() for child in self.subsections],
        }
class MarkdownAIService:
    """AI analysis service for Markdown documents.

    Supports section-aware parsing (Chinese numbering styles), several
    prompt-template-driven analysis types, SSE streaming output, and
    LLM-free outline/table extraction.
    """

    # NOTE(review): these two constants look garbled in this view — the list
    # holds only empty strings and the suffix is empty. They are unused by
    # the visible code; confirm against the original file before relying on
    # them (presumably they once held 一..十 and "、").
    CHINESE_NUMBERS = ["", "", "", "", "", "", "", "", "", ""]
    CHINESE_SUFFIX = ""
    # NOTE(review): named "PARENTHESIS_PATTERN" but the pattern contains no
    # parenthesis literals — as written it matches a bare Chinese numeral
    # followed by text. The "(一)" delimiters were likely lost in
    # extraction; verify against the original file.
    PARENTHESIS_PATTERN = re.compile(r'^([一二三四五六七八九十]+)\s*(.+)$')
    # Level-1 headings: "一、Title"
    CHINESE_SECTION_PATTERN = re.compile(r'^([一二三四五六七八九十]+)、\s*(.+)$')
    # Level-3 headings: "1. Title"
    ARABIC_SECTION_PATTERN = re.compile(r'^(\d+)\.\s+(.+)$')

    def __init__(self):
        # Markdown parser shared by every analysis call.
        self.parser = MarkdownParser()

    def get_supported_analysis_types(self) -> list:
        """Return the list of supported analysis type identifiers."""
        return [
            "summary",      # document summary
            "outline",      # outline extraction
            "key_points",   # key point extraction
            "questions",    # comprehension questions
            "tags",         # topic tags
            "qa",           # question/answer pairs
            "statistics",   # statistical-data analysis (government bulletins)
            "section"       # per-section deep analysis
        ]

    def extract_sections(self, content: str, titles: List[Dict]) -> List[MarkdownSection]:
        """Extract the hierarchical section structure from document text.

        Recognized heading formats:
        - level 1: 一、二、三 ...
        - level 2: bare Chinese numeral + text (see PARENTHESIS_PATTERN note)
        - level 3: 1. 2. 3. ...

        Args:
            content: full Markdown text.
            titles: title metadata from the parser (dicts with a 'line' key).

        Returns:
            List[MarkdownSection]: top-level sections with nested subsections.
        """
        sections = []
        lines = content.split('\n')
        # Map title line numbers to their metadata.
        # NOTE(review): built but never read below — dead code? confirm.
        title_lines = {}
        for t in titles:
            title_lines[t.get('line', 0)] = t
        current_section = None
        section_stack = []
        for i, line in enumerate(lines, 1):
            stripped = line.strip()
            # Level-1 heading (Chinese numeral + 、)?
            match = self.CHINESE_SECTION_PATTERN.match(stripped)
            if match:
                # Close out the previous top-level section's content.
                if current_section:
                    current_section.content = self._get_section_content(
                        lines, current_section.line_start, i - 1
                    )
                current_section = MarkdownSection(
                    number=match.group(1),
                    title=match.group(2),
                    level=1,
                    content="",
                    line_start=i,
                    line_end=len(lines)
                )
                sections.append(current_section)
                section_stack = [current_section]
                continue
            # Level-2 heading?
            match = self.PARENTHESIS_PATTERN.match(stripped)
            if match and current_section:
                # Close out the previous subsection's content, if any.
                if section_stack and len(section_stack) > 1:
                    parent = section_stack[-1]
                    parent.content = self._get_section_content(
                        lines, parent.line_start, i - 1
                    )
                subsection = MarkdownSection(
                    number=match.group(1),
                    title=match.group(2),
                    level=2,
                    content="",
                    line_start=i,
                    line_end=len(lines)
                )
                current_section.subsections.append(subsection)
                section_stack = [current_section, subsection]
                continue
            # Level-3 heading (1. 2. 3.)?
            match = self.ARABIC_SECTION_PATTERN.match(stripped)
            if match and len(section_stack) > 1:
                # Close out the previous level-3 section's content.
                if len(section_stack) > 2:
                    parent = section_stack[-1]
                    parent.content = self._get_section_content(
                        lines, parent.line_start, i - 1
                    )
                sub_subsection = MarkdownSection(
                    number=match.group(1),
                    title=match.group(2),
                    level=3,
                    content="",
                    line_start=i,
                    line_end=len(lines)
                )
                # NOTE(review): when a level-3 section already exists,
                # section_stack[-1] is that SIBLING, so each new "N." becomes
                # a child of the previous one (chained nesting) instead of a
                # child of the level-2 parent — looks unintended; confirm.
                section_stack[-1].subsections.append(sub_subsection)
                section_stack = section_stack[:-1] + [sub_subsection]
                continue
        # Fill in the content of the last open top-level section.
        if current_section:
            current_section.content = self._get_section_content(
                lines, current_section.line_start, len(lines)
            )
        return sections

    def _get_section_content(self, lines: List[str], start: int, end: int) -> str:
        """Return the text of lines [start, end] (1-based, inclusive).

        Heading lines (any of the three section patterns) and blank lines
        are dropped; remaining lines are stripped and re-joined.
        """
        if start > end:
            return ""
        content_lines = lines[start-1:end]
        # Clean up: drop heading lines and blank lines.
        cleaned = []
        for line in content_lines:
            stripped = line.strip()
            if not stripped:
                continue
            # Skip section heading lines.
            if self.CHINESE_SECTION_PATTERN.match(stripped):
                continue
            if self.PARENTHESIS_PATTERN.match(stripped):
                continue
            if self.ARABIC_SECTION_PATTERN.match(stripped):
                continue
            cleaned.append(stripped)
        return '\n'.join(cleaned)

    async def analyze_markdown(
        self,
        file_path: str,
        analysis_type: str = "summary",
        user_prompt: str = "",
        section_number: Optional[str] = None
    ) -> Dict[str, Any]:
        """Run an LLM analysis over a Markdown file.

        Args:
            file_path: path to the Markdown file on disk.
            analysis_type: one of get_supported_analysis_types().
            user_prompt: extra instructions appended to the prompt.
            section_number: restrict analysis to the section whose number or
                title equals this value (e.g. "一") when given.

        Returns:
            Dict[str, Any]: result payload with "success": True plus metadata
            and "analysis"; on failure only "success": False and "error".
        """
        try:
            parse_result = self.parser.parse(file_path)
            if not parse_result.success:
                return {
                    "success": False,
                    "error": parse_result.error
                }
            data = parse_result.data
            # Extract the section structure.
            sections = self.extract_sections(data.get("content", ""), data.get("titles", []))
            # When a section is requested, analyze only that section.
            target_content = data.get("content", "")
            target_title = parse_result.metadata.get("filename", "")
            if section_number:
                section = self._find_section(sections, section_number)
                if section:
                    target_content = section.content
                    target_title = f"{section.number}{section.title}"
                else:
                    return {
                        "success": False,
                        "error": f"未找到章节: {section_number}"
                    }
            # Build the prompt for the chosen analysis type.
            prompt = self._build_prompt(
                content=target_content,
                analysis_type=analysis_type,
                user_prompt=user_prompt,
                title=target_title
            )
            # Call the LLM.
            messages = [
                {"role": "system", "content": self._get_system_prompt(analysis_type)},
                {"role": "user", "content": prompt}
            ]
            response = await llm_service.chat(
                messages=messages,
                temperature=0.3,
                max_tokens=4000
            )
            analysis = llm_service.extract_message_content(response)
            return {
                "success": True,
                "filename": parse_result.metadata.get("filename", ""),
                "analysis_type": analysis_type,
                "section": target_title if section_number else None,
                "word_count": len(target_content),
                "structure": {
                    "title_count": parse_result.metadata.get("title_count", 0),
                    "code_block_count": parse_result.metadata.get("code_block_count", 0),
                    "table_count": parse_result.metadata.get("table_count", 0),
                    "section_count": len(sections)
                },
                "sections": [s.to_dict() for s in sections[:10]],  # at most 10 top-level sections
                "analysis": analysis
            }
        except Exception as e:
            logger.error(f"Markdown AI 分析失败: {str(e)}")
            return {
                "success": False,
                "error": str(e)
            }

    async def analyze_markdown_stream(
        self,
        file_path: str,
        analysis_type: str = "summary",
        user_prompt: str = "",
        section_number: Optional[str] = None
    ) -> AsyncGenerator[str, None]:
        """Stream an analysis of the document as SSE-formatted strings.

        Parameters mirror analyze_markdown(). Errors are reported in-band as
        an SSE chunk carrying an 'error' key rather than raised.

        Yields:
            str: "data: {...}\\n\\n" SSE chunks of type start / content / done,
            or an error chunk.
        """
        try:
            parse_result = self.parser.parse(file_path)
            if not parse_result.success:
                yield f"data: {json.dumps({'error': parse_result.error}, ensure_ascii=False)}\n\n"
                return
            data = parse_result.data
            sections = self.extract_sections(data.get("content", ""), data.get("titles", []))
            target_content = data.get("content", "")
            target_title = parse_result.metadata.get("filename", "")
            if section_number:
                section = self._find_section(sections, section_number)
                if section:
                    target_content = section.content
                    target_title = f"{section.number}{section.title}"
                else:
                    yield f"data: {json.dumps({'error': f'未找到章节: {section_number}'}, ensure_ascii=False)}\n\n"
                    return
            prompt = self._build_prompt(
                content=target_content,
                analysis_type=analysis_type,
                user_prompt=user_prompt,
                title=target_title
            )
            messages = [
                {"role": "system", "content": self._get_system_prompt(analysis_type)},
                {"role": "user", "content": prompt}
            ]
            # Emit the initial metadata event.
            # NOTE(review): this multi-line f-string reuses the same quote
            # character inside a replacement field — valid only on Python
            # 3.12+ (PEP 701); earlier interpreters raise SyntaxError here.
            yield f"data: {json.dumps({
                'type': 'start',
                'filename': parse_result.metadata.get("filename", ""),
                'analysis_type': analysis_type,
                'section': target_title if section_number else None,
                'word_count': len(target_content)
            }, ensure_ascii=False)}\n\n"
            # Stream LLM deltas, accumulating the full text as we go.
            full_response = ""
            async for chunk in llm_service.chat_stream(messages, temperature=0.3, max_tokens=4000):
                content = chunk.get("content", "")
                if content:
                    full_response += content
                    yield f"data: {json.dumps({'type': 'content', 'delta': content}, ensure_ascii=False)}\n\n"
            # Final event carries the accumulated response.
            yield f"data: {json.dumps({'type': 'done', 'full_response': full_response}, ensure_ascii=False)}\n\n"
        except Exception as e:
            logger.error(f"Markdown AI 流式分析失败: {str(e)}")
            yield f"data: {json.dumps({'error': str(e)}, ensure_ascii=False)}\n\n"

    def _find_section(self, sections: List[MarkdownSection], number: str) -> Optional[MarkdownSection]:
        """Depth-first search for a section whose number or title equals *number*."""
        # Normalize the requested identifier (whitespace only).
        num = number.strip()
        for section in sections:
            if section.number == num or section.title == num:
                return section
            # Recurse into subsections.
            found = self._find_section(section.subsections, number)
            if found:
                return found
        return None

    def _get_system_prompt(self, analysis_type: str) -> str:
        """Return the system prompt for the analysis type (generic fallback otherwise)."""
        prompts = {
            "summary": "你是一个专业的文档摘要助手,擅长从长文档中提取核心信息。",
            "outline": "你是一个专业的文档结构分析助手,擅长提取文档大纲和层级结构。",
            "key_points": "你是一个专业的知识提取助手,擅长从文档中提取关键信息和要点。",
            "questions": "你是一个专业的教育助手,擅长生成帮助理解文档的问题。",
            "tags": "你是一个专业的标签生成助手,擅长提取文档的主题标签。",
            "qa": "你是一个专业的问答助手,擅长基于文档内容生成问答对。",
            "statistics": "你是一个专业的统计数据分析助手,擅长分析政府统计公报中的数据。",
            "section": "你是一个专业的章节分析助手,擅长对文档的特定章节进行深入分析。"
        }
        return prompts.get(analysis_type, "你是一个专业的文档分析助手。")

    def _build_prompt(
        self,
        content: str,
        analysis_type: str,
        user_prompt: str,
        title: str = ""
    ) -> str:
        """Build the user prompt for the analysis type.

        Content is truncated to ~6000 chars to stay within token limits
        (the "tags"/"qa" templates truncate further); any non-empty
        user_prompt is appended as an extra requirement.
        """
        # Truncate to avoid blowing the token budget.
        max_content_len = 6000
        if len(content) > max_content_len:
            content = content[:max_content_len] + "\n\n[内容已截断...]"
        base_prompts = {
            "summary": f"""请对以下文档进行摘要分析:
文档标题:{title}
文档内容:
{content}
请提供:
1. 文档主要内容摘要300字以内
2. 文档的目的和用途
3. 适合的读者群体
请用中文回答,结构清晰。""",
            "outline": f"""请提取以下文档的大纲结构:
文档标题:{title}
文档内容:
{content}
请按层级列出文档大纲,用缩进表示层级关系。
格式:
一、一级标题
(一)二级标题
1. 三级标题
请用中文回答。""",
            "key_points": f"""请从以下文档中提取关键要点:
文档标题:{title}
文档内容:
{content}
请列出文档的关键要点5-10条每条用简洁的语言描述并说明其在文档中的重要性。
请用中文回答,格式清晰。""",
            "questions": f"""请根据以下文档生成有助于理解内容的问题:
文档标题:{title}
文档内容:
{content}
请生成5-10个问题帮助读者更好地理解文档内容。每个问题应该
1. 涵盖文档的重要信息点
2. 易于理解和回答
3. 具有思考价值
请用中文回答。""",
            "tags": f"""请为以下文档生成标签:
文档标题:{title}
文档内容:
{content[:3000]}
请生成5-8个标签用逗号分隔。标签应该反映
- 文档的主题领域
- 文档的类型
- 文档的关键特征
请用中文回答,只需输出标签,不要其他内容。""",
            "qa": f"""请根据以下文档生成问答对:
文档标题:{title}
文档内容:
{content[:4000]}
请生成3-5个问答对帮助读者通过问答形式理解文档内容。
格式:
Q1: 问题
A1: 回答
Q2: 问题
A2: 回答
请用中文回答,内容准确。""",
            "statistics": f"""请分析以下政府统计公报中的数据和结论:
文档标题:{title}
文档内容:
{content}
请提供:
1. 文档中涉及的主要统计数据(列出关键数字和指标)
2. 数据的变化趋势(增长/下降)
3. 重要的百分比和对比
4. 数据来源和统计口径说明
请用中文回答,数据准确。""",
            "section": f"""请详细分析以下文档章节:
章节标题:{title}
章节内容:
{content}
请提供:
1. 章节主要内容概括
2. 关键信息和数据
3. 与其他部分的关联(如有)
4. 重要结论
请用中文回答,分析深入。"""
        }
        prompt = base_prompts.get(analysis_type, base_prompts["summary"])
        if user_prompt and user_prompt.strip():
            prompt += f"\n\n用户额外需求:{user_prompt}"
        return prompt

    async def extract_outline(self, file_path: str) -> Dict[str, Any]:
        """Parse the file and return its structured section outline (no LLM call)."""
        try:
            parse_result = self.parser.parse(file_path)
            if not parse_result.success:
                return {"success": False, "error": parse_result.error}
            data = parse_result.data
            sections = self.extract_sections(data.get("content", ""), data.get("titles", []))
            # Build the structured outline.
            outline = []
            for section in sections:
                outline.append({
                    "number": section.number,
                    "title": section.title,
                    "level": section.level,
                    "line": section.line_start,
                    "content_preview": section.content[:100] + "..." if len(section.content) > 100 else section.content,
                    "subsections": [{
                        "number": s.number,
                        "title": s.title,
                        "level": s.level,
                        "line": s.line_start
                    } for s in section.subsections]
                })
            return {
                "success": True,
                "outline": outline
            }
        except Exception as e:
            logger.error(f"大纲提取失败: {str(e)}")
            return {"success": False, "error": str(e)}

    async def extract_tables_summary(self, file_path: str) -> Dict[str, Any]:
        """Extract table metadata and small row previews from the document (no LLM call)."""
        try:
            parse_result = self.parser.parse(file_path)
            if not parse_result.success:
                return {"success": False, "error": parse_result.error}
            tables = parse_result.data.get("tables", [])
            if not tables:
                return {"success": True, "tables": [], "message": "文档中没有表格"}
            # Summarize each table's shape and leading rows.
            table_summaries = []
            for i, table in enumerate(tables):
                summary = {
                    "index": i + 1,
                    "headers": table.get("headers", []),
                    "row_count": table.get("row_count", 0),
                    "column_count": table.get("column_count", 0),
                    "preview_rows": table.get("rows", [])[:3],  # first 3 rows only
                    "first_column": [row[0] if row else "" for row in table.get("rows", [])[:5]]
                }
                table_summaries.append(summary)
            return {
                "success": True,
                "tables": table_summaries,
                "table_count": len(tables)
            }
        except Exception as e:
            logger.error(f"表格提取失败: {str(e)}")
            return {"success": False, "error": str(e)}
# Module-level singleton shared by the API layer.
markdown_ai_service = MarkdownAIService()