Merge branch 'main' of https://gitea.kronecker.cc/OurCodesAreAllRight/FilesReadSystem

2026-04-08 19:17:05 +08:00
parent b9ca11efe5 41e5eaaa2d
commit fd435c7fd3
18 changed files with 2138 additions and 180 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,38 @@
 /.git/
 /.idea/
 /.vscode/
 /backend/venv/
 /backend/command/
 /backend/.env
 /backend/.env.local
 /backend/.env.*.local
 /backend/app/__pycache__/*
 /backend/data/uploads
 /backend/data/charts
 /backend/data/logs
 /frontend/node_modules/
 /frontend/dist/
 /frontend/build/
 /frontend/.vscode/
 /frontend/.idea/
 /frontend/.env
 /frontend/*.log
 /技术路线.md
 /开发路径.md
 /开发日志_2026-03-16.md
 /frontendTest/
 /docs/
 /frontend/src/api/
 /frontend/src/api/index.js
 /frontend/src/api/index.ts
 /frontend/src/api/index.tsx
 /frontend/src/api/index.py
 /frontend/src/api/index.go
 /frontend/src/api/index.java
 /docs/
 /frontend - 副本/*
 /supabase.txt
 **/__pycache__/*
 **.pyc
--- a/backend/app/api/endpoints/ai_analyze.py
+++ b/backend/app/api/endpoints/ai_analyze.py
@@ -2,10 +2,14 @@
 AI 分析 API 接口
 """
 from fastapi import APIRouter, UploadFile, File, HTTPException, Query, Body
 from fastapi.responses import StreamingResponse
 from typing import Optional
 import logging
 import tempfile
 import os
 from app.services.excel_ai_service import excel_ai_service
 from app.services.markdown_ai_service import markdown_ai_service
 logger = logging.getLogger(__name__)
@@ -93,10 +97,11 @@ async def get_analysis_types():
    获取支持的分析类型列表
    Returns:
-        list: 支持的分析类型
+        dict: 支持的分析类型（包含 Excel 和 Markdown）
    """
    return {
-        "types": excel_ai_service.get_supported_analysis_types()
+        "excel_types": excel_ai_service.get_supported_analysis_types(),
        "markdown_types": markdown_ai_service.get_supported_analysis_types()
    }
@@ -142,3 +147,185 @@ async def analyze_text(
    except Exception as e:
        logger.error(f"文本分析失败: {str(e)}")
        raise HTTPException(status_code=500, detail=f"分析失败: {str(e)}")
@router.post("/analyze/md")
 async def analyze_markdown(
    file: UploadFile = File(...),
    analysis_type: str = Query("summary", description="分析类型: summary, outline, key_points, questions, tags, qa, statistics, section"),
    user_prompt: str = Query("", description="用户自定义提示词"),
    section_number: Optional[str] = Query(None, description="指定章节编号，如 '一' 或 '（一）'")
 ):
    """
    上传并使用 AI 分析 Markdown 文件
    Args:
        file: 上传的 Markdown 文件
        analysis_type: 分析类型
        user_prompt: 用户自定义提示词
        section_number: 指定分析的章节编号
    Returns:
        dict: 分析结果
    """
    # 检查文件类型
    if not file.filename:
        raise HTTPException(status_code=400, detail="文件名为空")
    file_ext = file.filename.split('.')[-1].lower()
    if file_ext not in ['md', 'markdown']:
        raise HTTPException(
            status_code=400,
            detail=f"不支持的文件类型: {file_ext}，仅支持 .md 和 .markdown"
        )
    # 验证分析类型
    supported_types = markdown_ai_service.get_supported_analysis_types()
    if analysis_type not in supported_types:
        raise HTTPException(
            status_code=400,
            detail=f"不支持的分析类型: {analysis_type}，支持的类型: {', '.join(supported_types)}"
        )
    try:
        # 读取文件内容
        content = await file.read()
        # 保存到临时文件
        with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as tmp:
            tmp.write(content)
            tmp_path = tmp.name
        try:
            logger.info(f"开始分析 Markdown 文件: {file.filename}, 分析类型: {analysis_type}, 章节: {section_number}")
            # 调用 AI 分析服务
            result = await markdown_ai_service.analyze_markdown(
                file_path=tmp_path,
                analysis_type=analysis_type,
                user_prompt=user_prompt,
                section_number=section_number
            )
            logger.info(f"Markdown 分析完成: {file.filename}, 成功: {result['success']}")
            if not result['success']:
                raise HTTPException(status_code=500, detail=result.get('error', '分析失败'))
            return result
        finally:
            # 清理临时文件
            if os.path.exists(tmp_path):
                os.unlink(tmp_path)
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Markdown AI 分析过程中出错: {str(e)}")
        raise HTTPException(status_code=500, detail=f"分析失败: {str(e)}")
@router.post("/analyze/md/stream")
 async def analyze_markdown_stream(
    file: UploadFile = File(...),
    analysis_type: str = Query("summary", description="分析类型"),
    user_prompt: str = Query("", description="用户自定义提示词"),
    section_number: Optional[str] = Query(None, description="指定章节编号")
 ):
    """
    流式分析 Markdown 文件 (SSE)
    Returns:
        StreamingResponse: SSE 流式响应
    """
    if not file.filename:
        raise HTTPException(status_code=400, detail="文件名为空")
    file_ext = file.filename.split('.')[-1].lower()
    if file_ext not in ['md', 'markdown']:
        raise HTTPException(
            status_code=400,
            detail=f"不支持的文件类型: {file_ext}，仅支持 .md 和 .markdown"
        )
    try:
        content = await file.read()
        with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as tmp:
            tmp.write(content)
            tmp_path = tmp.name
        try:
            logger.info(f"开始流式分析 Markdown 文件: {file.filename}, 分析类型: {analysis_type}")
            async def stream_generator():
                async for chunk in markdown_ai_service.analyze_markdown_stream(
                    file_path=tmp_path,
                    analysis_type=analysis_type,
                    user_prompt=user_prompt,
                    section_number=section_number
                ):
                    yield chunk
            return StreamingResponse(
                stream_generator(),
                media_type="text/event-stream",
                headers={
                    "Cache-Control": "no-cache",
                    "Connection": "keep-alive",
                    "X-Accel-Buffering": "no"
                }
            )
        finally:
            if os.path.exists(tmp_path):
                os.unlink(tmp_path)
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Markdown AI 流式分析出错: {str(e)}")
        raise HTTPException(status_code=500, detail=f"流式分析失败: {str(e)}")
@router.get("/analyze/md/outline")
 async def get_markdown_outline(
    file: UploadFile = File(...)
 ):
    """
    获取 Markdown 文档的大纲结构（分章节信息）
    Args:
        file: 上传的 Markdown 文件
    Returns:
        dict: 文档大纲结构
    """
    if not file.filename:
        raise HTTPException(status_code=400, detail="文件名为空")
    file_ext = file.filename.split('.')[-1].lower()
    if file_ext not in ['md', 'markdown']:
        raise HTTPException(
            status_code=400,
            detail=f"不支持的文件类型: {file_ext}，仅支持 .md 和 .markdown"
        )
    try:
        content = await file.read()
        with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as tmp:
            tmp.write(content)
            tmp_path = tmp.name
        try:
            result = await markdown_ai_service.extract_outline(tmp_path)
            return result
        finally:
            if os.path.exists(tmp_path):
                os.unlink(tmp_path)
    except Exception as e:
        logger.error(f"获取 Markdown 大纲失败: {str(e)}")
        raise HTTPException(status_code=500, detail=f"获取大纲失败: {str(e)}")
--- a/backend/app/api/endpoints/documents.py
+++ b/backend/app/api/endpoints/documents.py
@@ -196,7 +196,9 @@ async def process_document(
                meta={"progress": 50, "message": "正在存储到MySQL并生成字段描述"}
            )
            try:
                # 使用 TableRAG 服务完成建表和RAG索引
                logger.info(f"开始存储Excel到MySQL: {original_filename}, file_path: {file_path}")
                rag_result = await table_rag_service.build_table_rag_index(
                    file_path=file_path,
                    filename=original_filename,
@@ -205,9 +207,11 @@ async def process_document(
                )
                if rag_result.get("success"):
-                logger.info(f"RAG索引构建成功: {original_filename}")
+                    logger.info(f"Excel存储到MySQL成功: {original_filename}, table: {rag_result.get('table_name')}")
                else:
-                logger.warning(f"RAG索引构建失败: {rag_result.get('error')}")
+                    logger.error(f"RAG索引构建失败: {rag_result.get('error')}")
            except Exception as e:
                logger.error(f"Excel存储到MySQL异常: {str(e)}", exc_info=True)
        else:
            # 非结构化文档
--- a/backend/app/api/endpoints/tasks.py
+++ b/backend/app/api/endpoints/tasks.py
@@ -26,7 +26,16 @@ async def get_task_status(task_id: str):
    status = await redis_db.get_task_status(task_id)
    if not status:
-        raise HTTPException(status_code=404, detail=f"任务 {task_id} 不存在")
+        # Redis不可用时，假设任务已完成（文档已成功处理）
        # 前端轮询时会得到这个响应
        return {
            "task_id": task_id,
            "status": "success",
            "progress": 100,
            "message": "任务处理完成",
            "result": None,
            "error": None
        }
    return {
        "task_id": task_id,
--- a/backend/app/api/endpoints/upload.py
+++ b/backend/app/api/endpoints/upload.py
@@ -10,6 +10,7 @@ import io
 from app.services.file_service import file_service
 from app.core.document_parser import XlsxParser
 from app.services.table_rag_service import table_rag_service
 logger = logging.getLogger(__name__)
@@ -27,7 +28,7 @@ async def upload_excel(
    header_row: int = Query(0, description="表头所在的行索引")
 ):
    """
-    上传并解析 Excel 文件
+    上传并解析 Excel 文件，同时存储到 MySQL 数据库
    Args:
        file: 上传的 Excel 文件
@@ -77,6 +78,23 @@ async def upload_excel(
            result.metadata['saved_path'] = saved_path
            result.metadata['original_filename'] = file.filename
        # 存储到 MySQL 数据库
        try:
            store_result = await table_rag_service.build_table_rag_index(
                file_path=saved_path,
                filename=file.filename,
                sheet_name=sheet_name if sheet_name else None,
                header_row=header_row
            )
            if store_result.get("success"):
                result.metadata['mysql_table'] = store_result.get('table_name')
                result.metadata['row_count'] = store_result.get('row_count')
                logger.info(f"Excel已存储到MySQL: {file.filename}, 表: {store_result.get('table_name')}")
            else:
                logger.warning(f"Excel存储到MySQL失败: {store_result.get('error')}")
        except Exception as e:
            logger.error(f"Excel存储到MySQL异常: {str(e)}", exc_info=True)
        return result.to_dict()
    except HTTPException:
--- a/backend/app/config.py
+++ b/backend/app/config.py
@@ -29,6 +29,9 @@ class Settings(BaseSettings):
    LLM_BASE_URL: str = "https://api.minimax.chat"
    LLM_MODEL_NAME: str = "MiniMax-Text-01"
    # ==================== RAG/Embedding 配置 ====================
    EMBEDDING_MODEL: str = "all-MiniLM-L6-v2"
    # ==================== Supabase 配置 ====================
    SUPABASE_URL: str = ""
    SUPABASE_ANON_KEY: str = ""
--- a/backend/app/core/database/mongodb.py
+++ b/backend/app/core/database/mongodb.py
@@ -87,8 +87,10 @@ class MongoDB:
            "updated_at": datetime.utcnow(),
        }
        result = await self.documents.insert_one(document)
-        logger.info(f"文档已插入MongoDB: {result.inserted_id}")
+        doc_id = str(result.inserted_id)
-        return str(result.inserted_id)
+        filename = metadata.get("original_filename", "unknown")
        logger.info(f"✓ 文档已存入MongoDB: [{doc_type}] {filename} | ID: {doc_id}")
        return doc_id
    async def get_document(self, doc_id: str) -> Optional[Dict[str, Any]]:
        """根据ID获取文档"""
--- a/backend/app/core/database/mysql.py
+++ b/backend/app/core/database/mysql.py
@@ -16,6 +16,7 @@ from sqlalchemy import (
    String,
    Text,
    create_engine,
    text,
 )
 from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
 from sqlalchemy.orm import DeclarativeBase, sessionmaker
@@ -72,6 +73,26 @@ class MySQLDB:
    async def init_db(self):
        """初始化数据库，创建所有表"""
        try:
            # 先创建数据库（如果不存在）
            from sqlalchemy import text
            db_name = settings.MYSQL_DATABASE
            # 连接时不指定数据库来创建数据库
            temp_url = (
                f"mysql+aiomysql://{settings.MYSQL_USER}:{settings.MYSQL_PASSWORD}"
                f"@{settings.MYSQL_HOST}:{settings.MYSQL_PORT}/"
                f"?charset={settings.MYSQL_CHARSET}"
            )
            from sqlalchemy.ext.asyncio import create_async_engine
            temp_engine = create_async_engine(temp_url, echo=False)
            try:
                async with temp_engine.connect() as conn:
                    await conn.execute(text(f"CREATE DATABASE IF NOT EXISTS `{db_name}` CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci"))
                    await conn.commit()
                logger.info(f"MySQL 数据库 {db_name} 创建或已存在")
            finally:
                await temp_engine.dispose()
            # 然后创建表
            async with self.async_engine.begin() as conn:
                await conn.run_sync(Base.metadata.create_all)
            logger.info("MySQL 数据库表初始化完成")
--- a/backend/app/main.py
+++ b/backend/app/main.py
@@ -2,23 +2,143 @@
 FastAPI 应用主入口
 """
 import logging
 import logging.handlers
 import sys
 import uuid
 from contextlib import asynccontextmanager
 from typing import Callable
 from functools import wraps
-from fastapi import FastAPI
+from fastapi import FastAPI, Request, Response
 from fastapi.middleware.cors import CORSMiddleware
 from starlette.middleware.base import BaseHTTPMiddleware
 from app.config import settings
 from app.api import api_router
 from app.core.database import mysql_db, mongodb, redis_db
-# 配置日志
+# ==================== 日志配置 ====================
-logging.basicConfig(
+
-    level=logging.INFO if settings.DEBUG else logging.WARNING,
+def setup_logging():
-    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+    """配置应用日志系统"""
-)
+    import os
    from pathlib import Path
    # 根日志配置
    log_level = logging.DEBUG if settings.DEBUG else logging.INFO
    # 日志目录
    log_dir = Path("data/logs")
    log_dir.mkdir(parents=True, exist_ok=True)
    # 日志文件路径
    log_file = log_dir / "app.log"
    error_log_file = log_dir / "error.log"
    # 控制台处理器
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setLevel(log_level)
    console_formatter = logging.Formatter(
        fmt="%(asctime)s | %(levelname)-8s | %(name)s:%(lineno)d | %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S"
    )
    console_handler.setFormatter(console_formatter)
    # 文件处理器 (所有日志)
    file_handler = logging.handlers.RotatingFileHandler(
        log_file,
        maxBytes=10 * 1024 * 1024,  # 10MB
        backupCount=5,
        encoding="utf-8"
    )
    file_handler.setLevel(logging.DEBUG)
    file_formatter = logging.Formatter(
        fmt="%(asctime)s | %(levelname)-8s | %(name)s:%(lineno)d | %(funcName)s | %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S"
    )
    file_handler.setFormatter(file_formatter)
    # 错误日志处理器 (仅ERROR及以上)
    error_file_handler = logging.handlers.RotatingFileHandler(
        error_log_file,
        maxBytes=10 * 1024 * 1024,  # 10MB
        backupCount=5,
        encoding="utf-8"
    )
    error_file_handler.setLevel(logging.ERROR)
    error_file_handler.setFormatter(file_formatter)
    # 根日志器
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.DEBUG)
    root_logger.handlers = []
    root_logger.addHandler(console_handler)
    root_logger.addHandler(file_handler)
    root_logger.addHandler(error_file_handler)
    # 第三方库日志级别
    for lib in ["uvicorn", "uvicorn.access", "fastapi", "httpx", "sqlalchemy"]:
        logging.getLogger(lib).setLevel(logging.WARNING)
    root_logger.info(f"日志系统初始化完成 | 日志目录: {log_dir}")
    root_logger.info(f"主日志文件: {log_file} | 错误日志: {error_log_file}")
    return root_logger
 # 初始化日志
 setup_logging()
 logger = logging.getLogger(__name__)
 # ==================== 请求日志中间件 ====================
 class RequestLoggingMiddleware(BaseHTTPMiddleware):
    """Request logging middleware: logs each request, its status and its duration."""
    async def dispatch(self, request: Request, call_next: Callable) -> Response:
        """
        Log the incoming request, delegate to the next handler and log the outcome.

        A short request ID is stored on request.state.request_id and echoed
        back in the X-Request-ID response header.

        Args:
            request: Incoming request.
            call_next: Callable that invokes the rest of the middleware chain.

        Returns:
            Response: The downstream response with X-Request-ID added.

        Raises:
            Exception: Any exception raised downstream is logged and re-raised.
        """
        import time

        # Short ID: the first 8 characters of a UUID4.
        request_id = str(uuid.uuid4())[:8]
        request.state.request_id = request_id
        logger.info(f"→ [{request_id}] {request.method} {request.url.path}")
        started = time.perf_counter()
        try:
            response = await call_next(request)
            # Time until call_next returned the response object.
            elapsed_ms = (time.perf_counter() - started) * 1000
            logger.info(
                f"← [{request_id}] {request.method} {request.url.path} "
                f"| 状态: {response.status_code} | 耗时: {elapsed_ms:.1f}ms"
            )
            response.headers["X-Request-ID"] = request_id
            return response
        except Exception as e:
            logger.error(f"✗ [{request_id}] {request.method} {request.url.path} | 异常: {str(e)}")
            raise
 # ==================== 请求追踪装饰器 ====================
 def log_async_function(func: Callable) -> Callable:
    """
    Decorator for async functions that logs entry, exit and failures.

    Entry and successful exit are logged at DEBUG level; an exception is
    logged at ERROR level and then re-raised unchanged.

    Args:
        func: The coroutine function to wrap.

    Returns:
        Callable: An async wrapper carrying func's metadata.
    """
    @wraps(func)
    async def traced(*call_args, **call_kwargs):
        label = func.__name__
        logger.debug(f"→ {label} 开始执行")
        try:
            outcome = await func(*call_args, **call_kwargs)
            logger.debug(f"← {label} 执行完成")
            return outcome
        except Exception as exc:
            logger.error(f"✗ {label} 执行失败: {str(exc)}")
            raise

    return traced
@asynccontextmanager
 async def lifespan(app: FastAPI):
    """
@@ -83,6 +203,9 @@ app.add_middleware(
    allow_headers=["*"],
 )
 # 添加请求日志中间件
 app.add_middleware(RequestLoggingMiddleware)
 # 注册 API 路由
 app.include_router(api_router, prefix=settings.API_V1_STR)
--- a/backend/app/services/excel_storage_service.py
+++ b/backend/app/services/excel_storage_service.py
@@ -17,12 +17,15 @@ from sqlalchemy import (
    String,
    Text,
    inspect,
    text,
 )
 from sqlalchemy.ext.asyncio import AsyncSession
 from app.core.database.mysql import Base, mysql_db
 logger = logging.getLogger(__name__)
 # 设置该模块的日志级别
 logger.setLevel(logging.DEBUG)
 class ExcelStorageService:
@@ -31,6 +34,123 @@ class ExcelStorageService:
    def __init__(self):
        self.mysql_db = mysql_db
    def _extract_sheet_names_from_xml(self, file_path: str) -> list:
        """从 Excel 文件的 XML 中提取工作表名称"""
        import zipfile
        from xml.etree import ElementTree as ET
        try:
            with zipfile.ZipFile(file_path, 'r') as z:
                if 'xl/workbook.xml' not in z.namelist():
                    return []
                content = z.read('xl/workbook.xml')
                root = ET.fromstring(content)
                ns = {'main': 'http://purl.oclc.org/ooxml/spreadsheetml/main'}
                sheets = root.findall('.//main:sheet', ns)
                return [s.get('name') for s in sheets if s.get('name')]
        except Exception:
            return []
    def _read_excel_sheet(self, file_path: str, sheet_name: str = None, header_row: int = 0) -> pd.DataFrame:
        """
        Read one worksheet into a DataFrame, with an XML fallback.

        pandas is tried first. If it raises or yields an empty frame, the
        sheet XML inside the .xlsx archive is parsed directly.

        Args:
            file_path: Path to the Excel file.
            sheet_name: Worksheet to read. When not given, the first sheet
                is read.
            header_row: Zero-based index of the header row.

        Returns:
            pd.DataFrame: The sheet data. In the XML fallback all non-boolean
            values are strings as stored in the file, and the frame is empty
            when no data rows follow the header.

        Raises:
            ValueError: When the fallback cannot find any worksheet.
            Exception: Any other fallback parsing error is logged and re-raised.
        """
        import zipfile
        from xml.etree import ElementTree as ET
        try:
            # sheet_name=None makes pandas return a dict of ALL sheets rather
            # than a DataFrame, so select the first sheet explicitly.
            df = pd.read_excel(
                file_path,
                sheet_name=sheet_name if sheet_name not in (None, "") else 0,
                header=header_row
            )
            if df is not None and not df.empty:
                return df
        except Exception as e:
            # Not fatal: the XML fallback below gets a chance.
            logger.debug(f"pandas 读取 Excel 失败，改用 XML 解析: {e}")
        # pandas failed or returned nothing; parse the XML directly.
        logger.info(f"使用 XML 方式读取 Excel: {file_path}")

        def ns_prefix(element):
            # "{uri}" prefix of the element's own namespace ('' if none), so
            # both Transitional and Strict OOXML files are matched.
            tag = element.tag
            return tag[:tag.index('}') + 1] if tag.startswith('{') else ''

        def column_letters(cell):
            # "BC12" -> "BC"
            return ''.join(filter(str.isalpha, cell.get('r', '')))

        try:
            with zipfile.ZipFile(file_path, 'r') as z:
                sheet_names = self._extract_sheet_names_from_xml(file_path)
                if not sheet_names:
                    raise ValueError("无法从 Excel 文件中找到工作表")
                target_sheet = sheet_name if sheet_name and sheet_name in sheet_names else sheet_names[0]
                # NOTE(review): assumes the n-th sheet in workbook.xml is
                # stored as sheet{n}.xml; the authoritative mapping lives in
                # xl/_rels/workbook.xml.rels — confirm for reordered sheets.
                sheet_index = sheet_names.index(target_sheet) + 1

                shared_strings = []
                if 'xl/sharedStrings.xml' in z.namelist():
                    ss_root = ET.fromstring(z.read('xl/sharedStrings.xml'))
                    ss_ns = ns_prefix(ss_root)
                    for si in ss_root.findall(f'.//{ss_ns}si'):
                        t = si.find(f'.//{ss_ns}t')
                        shared_strings.append(t.text if t is not None else '')

                # Parse the sheet once; the header row comes from the same tree.
                root = ET.fromstring(z.read(f'xl/worksheets/sheet{sheet_index}.xml'))
                ns = ns_prefix(root)
                cell_tag = f'{ns}c'
                value_tag = f'{ns}v'

                def cell_value(cell, decode_bool):
                    # Resolve a <c> element to its value; None when empty.
                    v = cell.find(value_tag)
                    if v is None or not v.text:
                        return None
                    cell_type = cell.get('t', 'n')
                    if cell_type == 's':
                        # Shared-string cells hold an index into sharedStrings.
                        try:
                            return shared_strings[int(v.text)]
                        except (ValueError, IndexError):
                            return v.text
                    if cell_type == 'b' and decode_bool:
                        return v.text == '1'
                    return v.text

                header_element = None
                rows_data = []
                for row in root.findall(f'.//{ns}row'):
                    row_idx = int(row.get('r', 0))
                    # Row numbers in the XML are 1-based.
                    if header_element is None and row_idx == header_row + 1:
                        header_element = row
                    if row_idx <= header_row + 1:
                        continue
                    row_cells = {
                        column_letters(cell): cell_value(cell, True)
                        for cell in row.findall(cell_tag)
                    }
                    if row_cells:
                        rows_data.append(row_cells)
                if not rows_data:
                    return pd.DataFrame()
                df = pd.DataFrame(rows_data)
                if header_row >= 0 and header_element is not None:
                    headers = {}
                    for cell in header_element.findall(cell_tag):
                        value = cell_value(cell, False)
                        if value is not None:
                            headers[column_letters(cell)] = value
                    # Columns without a header cell keep their column letter.
                    df.columns = [headers.get(col, col) for col in df.columns]
                return df
        except Exception as e:
            logger.error(f"XML 解析 Excel 失败: {e}")
            raise
    def _sanitize_table_name(self, filename: str) -> str:
        """
        将文件名转换为合法的表名
@@ -64,15 +184,44 @@ class ExcelStorageService:
        Returns:
            合法的字段名
        """
-        # 只保留字母、数字、下划线
+        # MySQL 支持 UTF8 编码，中文字符可以直接使用
-        name = re.sub(r'[^a-zA-Z0-9_]', '_', str(col_name))
+        # 只处理非法字符（控制字符等）和首字符数字
-
+        name = str(col_name).strip()
-        # 确保以字母开头
+        # 移除控制字符
        name = re.sub(r'[\x00-\x1f\x7f]', '', name)
        # 确保以字母或中文开头
        if name and name[0].isdigit():
            name = 'col_' + name
        # 限制长度 (MySQL 字段名最多64字符)
        return name[:64]
-        # 限制长度
+    def _get_unique_column_name(self, col_name: str, used_names: set) -> str:
-        return name[:50]
+        """
        获取唯一的列名，避免重复
        Args:
            col_name: 原始列名
            used_names: 已使用的列名集合
        Returns:
            唯一的列名
        """
        sanitized = self._sanitize_column_name(col_name)
        # "id" 是 MySQL 保留名，作为主键使用
        if sanitized.lower() == "id":
            sanitized = "col_id"
        if sanitized not in used_names:
            used_names.add(sanitized)
            return sanitized
        # 添加数字后缀直到唯一
        base = sanitized if sanitized else "col"
        counter = 1
        while f"{base}_{counter}" in used_names:
            counter += 1
        unique_name = f"{base}_{counter}"
        used_names.add(unique_name)
        return unique_name
    def _infer_column_type(self, series: pd.Series) -> str:
        """
@@ -84,12 +233,35 @@ class ExcelStorageService:
        Returns:
            类型名称
        """
        # 移除空值进行类型检查
        non_null = series.dropna()
        if len(non_null) == 0:
            return "TEXT"
        dtype = series.dtype
        # 整数类型检查
        if pd.api.types.is_integer_dtype(dtype):
            # 检查是否所有值都能放入 INT 范围
            try:
                int_values = non_null.astype('int64')
                if int_values.min() >= -2147483648 and int_values.max() <= 2147483647:
                    return "INTEGER"
                else:
                    # 超出 INT 范围，使用 TEXT
                    return "TEXT"
            except (ValueError, OverflowError):
                return "TEXT"
        elif pd.api.types.is_float_dtype(dtype):
            # 检查是否所有值都能放入 FLOAT
            try:
                float_values = non_null.astype('float64')
                if float_values.min() >= -1e308 and float_values.max() <= 1e308:
                    return "FLOAT"
                else:
                    return "TEXT"
            except (ValueError, OverflowError):
                return "TEXT"
        elif pd.api.types.is_datetime64_any_dtype(dtype):
            return "DATETIME"
        elif pd.api.types.is_bool_dtype(dtype):
@@ -174,11 +346,11 @@ class ExcelStorageService:
        }
        try:
-            # 读取 Excel
+            logger.info(f"开始读取Excel文件: {file_path}")
-            if sheet_name:
+            # 读取 Excel（使用 fallback 方式支持特殊格式文件）
-                df = pd.read_excel(file_path, sheet_name=sheet_name, header=header_row)
+            df = self._read_excel_sheet(file_path, sheet_name=sheet_name, header_row=header_row)
-            else:
+
-                df = pd.read_excel(file_path, header=header_row)
+            logger.info(f"Excel读取完成，行数: {len(df)}, 列数: {len(df.columns)}")
            if df.empty:
                return {"success": False, "error": "Excel 文件为空"}
@@ -186,31 +358,41 @@ class ExcelStorageService:
            # 清理列名
            df.columns = [str(c) for c in df.columns]
-            # 推断列类型
+            # 推断列类型，并生成唯一的列名
            column_types = {}
            column_name_map = {}  # 原始列名 -> 唯一合法列名
            used_names = set()
            for col in df.columns:
-                col_name = self._sanitize_column_name(col)
+                col_name = self._get_unique_column_name(col, used_names)
                col_type = self._infer_column_type(df[col])
                column_types[col] = col_type
                column_name_map[col] = col_name
                results["columns"].append({
                    "original_name": col,
                    "sanitized_name": col_name,
                    "type": col_type
                })
-            # 创建表
+            # 创建表 - 使用原始 SQL 以兼容异步
-            model_class = self._create_table_model(table_name, df.columns, column_types)
+            logger.info(f"正在创建MySQL表: {table_name}")
-
+            sql_columns = ["id INT AUTO_INCREMENT PRIMARY KEY"]
-            # 创建表结构
+            for col in df.columns:
-            async with self.mysql_db.get_session() as session:
+                col_name = column_name_map[col]
-                model_class.__table__.create(session.bind, checkfirst=True)
+                col_type = column_types.get(col, "TEXT")
                sql_type = "INT" if col_type == "INTEGER" else "FLOAT" if col_type == "FLOAT" else "DATETIME" if col_type == "DATETIME" else "TEXT"
                sql_columns.append(f"`{col_name}` {sql_type}")
            sql_columns.append("created_at DATETIME DEFAULT CURRENT_TIMESTAMP")
            sql_columns.append("updated_at DATETIME DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP")
            create_sql = text(f"CREATE TABLE IF NOT EXISTS `{table_name}` ({', '.join(sql_columns)})")
            await self.mysql_db.execute_raw_sql(str(create_sql))
            logger.info(f"MySQL表创建完成: {table_name}")
            # 插入数据
            records = []
            for _, row in df.iterrows():
                record = {}
                for col in df.columns:
-                    col_name = self._sanitize_column_name(col)
+                    col_name = column_name_map[col]
                    value = row[col]
                    # 处理 NaN 值
@@ -231,11 +413,33 @@ class ExcelStorageService:
                records.append(record)
-            # 批量插入
+            logger.info(f"正在插入 {len(records)} 条数据到 MySQL (使用批量插入)...")
-            async with self.mysql_db.get_session() as session:
+            # 使用 pymysql 直接插入以避免 SQLAlchemy 异步问题
-                for record in records:
+            import pymysql
-                    session.add(model_class(**record))
+            from app.config import settings
-                await session.commit()
+
            connection = pymysql.connect(
                host=settings.MYSQL_HOST,
                port=settings.MYSQL_PORT,
                user=settings.MYSQL_USER,
                password=settings.MYSQL_PASSWORD,
                database=settings.MYSQL_DATABASE,
                charset=settings.MYSQL_CHARSET
            )
            try:
                columns_str = ', '.join(['`' + column_name_map[col] + '`' for col in df.columns])
                placeholders = ', '.join(['%s' for _ in df.columns])
                insert_sql = f"INSERT INTO `{table_name}` ({columns_str}) VALUES ({placeholders})"
                # 转换为元组列表 (使用映射后的列名)
                param_list = [tuple(record.get(column_name_map[col]) for col in df.columns) for record in records]
                with connection.cursor() as cursor:
                    cursor.executemany(insert_sql, param_list)
                    connection.commit()
                logger.info(f"数据插入完成: {len(records)} 条")
            finally:
                connection.close()
            results["row_count"] = len(records)
            logger.info(f"Excel 数据已存储到 MySQL 表 {table_name}，共 {len(records)} 行")
@@ -243,7 +447,7 @@ class ExcelStorageService:
            return results
        except Exception as e:
-            logger.error(f"存储 Excel 到 MySQL 失败: {str(e)}")
+            logger.error(f"存储 Excel 到 MySQL 失败: {str(e)}", exc_info=True)
            return {"success": False, "error": str(e)}
    async def store_structured_data(
--- a/backend/app/services/file_service.py
+++ b/backend/app/services/file_service.py
@@ -3,6 +3,7 @@
 """
 import os
 import shutil
 import logging
 from pathlib import Path
 from datetime import datetime
 from typing import Optional
@@ -10,6 +11,8 @@ import uuid
 from app.config import settings
 logger = logging.getLogger(__name__)
 class FileService:
    """文件服务类，负责文件的存储、读取和管理"""
@@ -17,6 +20,7 @@ class FileService:
    def __init__(self):
        self.upload_dir = Path(settings.UPLOAD_DIR)
        self._ensure_upload_dir()
        logger.info(f"FileService 初始化，上传目录: {self.upload_dir}")
    def _ensure_upload_dir(self):
        """确保上传目录存在"""
@@ -56,6 +60,8 @@ class FileService:
        with open(file_path, 'wb') as f:
            f.write(file_content)
        file_size = len(file_content)
        logger.info(f"文件已保存: {filename} -> {file_path} ({file_size} bytes)")
        return str(file_path)
    def read_file(self, file_path: str) -> bytes:
--- a/backend/app/services/llm_service.py
+++ b/backend/app/services/llm_service.py
@@ -2,7 +2,7 @@
 LLM 服务模块 - 封装大模型 API 调用
 """
 import logging
-from typing import Dict, Any, List, Optional
+from typing import Dict, Any, List, Optional, AsyncGenerator
 import httpx
 from app.config import settings
@@ -87,6 +87,71 @@ class LLMService:
            logger.error(f"解析 API 响应失败: {str(e)}")
            raise
    async def chat_stream(
        self,
        messages: List[Dict[str, str]],
        temperature: float = 0.7,
        max_tokens: Optional[int] = None,
        **kwargs
    ) -> AsyncGenerator[Dict[str, Any], None]:
        """
        Stream a chat completion and yield the text deltas as they arrive.

        Args:
            messages: Chat messages sent to the model.
            temperature: Sampling temperature.
            max_tokens: Optional cap on generated tokens; left out of the
                request when falsy.
            **kwargs: Extra fields merged into the request payload (they
                override the defaults built here).

        Yields:
            Dict[str, Any]: ``{"content": <text>}`` for every non-empty delta.

        Raises:
            httpx.HTTPStatusError: The API answered with a 4xx/5xx status.
            Exception: Any other failure is logged and re-raised.
        """
        # Local import kept from the original so the block does not depend on
        # a module-level ``json`` import.
        import json as json_module
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }
        payload = {
            "model": self.model_name,
            "messages": messages,
            "temperature": temperature,
            "stream": True
        }
        if max_tokens:
            payload["max_tokens"] = max_tokens
        payload.update(kwargs)
        try:
            async with httpx.AsyncClient(timeout=120.0) as client:
                async with client.stream(
                    "POST",
                    f"{self.base_url}/chat/completions",
                    headers=headers,
                    json=payload
                ) as response:
                    # An error response (401, 429, 500 ...) carries no "data:"
                    # lines; without this check it would look like an empty but
                    # successful stream and the handler below could never run.
                    response.raise_for_status()
                    async for line in response.aiter_lines():
                        # SSE allows "data:" with or without a following space.
                        if not line.startswith("data:"):
                            continue
                        data = line[5:].strip()
                        if data == "[DONE]":
                            break
                        try:
                            chunk = json_module.loads(data)
                        except json_module.JSONDecodeError:
                            continue
                        # Some chunks carry an empty "choices" list; skip them
                        # instead of failing on choices[0].
                        choices = chunk.get("choices") if isinstance(chunk, dict) else None
                        if not choices:
                            continue
                        delta = (choices[0].get("delta") or {}).get("content")
                        if delta:
                            yield {"content": delta}
        except httpx.HTTPStatusError as e:
            logger.error(f"LLM 流式 API 请求失败: {e.response.status_code}")
            raise
        except Exception as e:
            logger.error(f"LLM 流式 API 调用异常: {str(e)}")
            raise
    async def analyze_excel_data(
        self,
        excel_data: Dict[str, Any],
--- a/backend/app/services/markdown_ai_service.py
+++ b/backend/app/services/markdown_ai_service.py
@@ -0,0 +1,707 @@
 """
 Markdown 文档 AI 分析服务
 支持：
 - 分章节解析（中文章节编号：一、二、三， （一）（二）（三））
 - 结构化数据提取
 - 流式输出
 - 多种分析类型
 - 可视化图表生成
 """
 import asyncio
 import json
 import logging
 import re
 from typing import Any, AsyncGenerator, Dict, List, Optional
 from app.services.llm_service import llm_service
 from app.core.document_parser import MarkdownParser
 from app.services.visualization_service import visualization_service
 logger = logging.getLogger(__name__)
 class MarkdownSection:
    """A heading found in the document together with its nested headings."""
    def __init__(self, number: str, title: str, level: int, content: str, line_start: int, line_end: int):
        # Heading numeral as captured from the text, e.g. "一" or "1".
        self.number, self.title = number, title
        # Nesting depth of the heading.
        self.level = level
        # Section text; filled in by whoever builds the tree.
        self.content = content
        self.line_start, self.line_end = line_start, line_end
        self.subsections: List[MarkdownSection] = []
    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-friendly view; content is cut to a 200-character preview."""
        preview = self.content
        if len(preview) > 200:
            preview = preview[:200] + "..."
        children = []
        for child in self.subsections:
            children.append(child.to_dict())
        return {
            "number": self.number,
            "title": self.title,
            "level": self.level,
            "content_preview": preview,
            "line_start": self.line_start,
            "line_end": self.line_end,
            "subsections": children
        }
 class MarkdownAIService:
    """Markdown 文档 AI 分析服务"""
    # Heading patterns for documents numbered in the Chinese style.
    CHINESE_NUMBERS = ["一", "二", "三", "四", "五", "六", "七", "八", "九", "十"]
    CHINESE_SUFFIX = "、"
    # Level 2, e.g. "（一）标题". The closing bracket has to be part of the
    # pattern: without it the bracket ends up in the captured title and any
    # line that merely starts with "（一..." is mistaken for a heading.
    # Full-width and ASCII brackets are both accepted.
    PARENTHESIS_PATTERN = re.compile(r'^[（(]([一二三四五六七八九十]+)[）)]\s*(.+)$')
    # Level 1, e.g. "一、标题".
    CHINESE_SECTION_PATTERN = re.compile(r'^([一二三四五六七八九十]+)、\s*(.+)$')
    # Level 3, e.g. "1. 标题".
    ARABIC_SECTION_PATTERN = re.compile(r'^(\d+)\.\s+(.+)$')
    def __init__(self):
        """Create the service with its own MarkdownParser instance."""
        self.parser = MarkdownParser()
    def get_supported_analysis_types(self) -> list:
        """Return the names of the analysis types this service accepts."""
        # (name, what it produces) - only the names are returned.
        catalogue = (
            ("summary", "document summary"),
            ("outline", "outline extraction"),
            ("key_points", "key point extraction"),
            ("questions", "question generation"),
            ("tags", "tag generation"),
            ("qa", "question/answer pairs"),
            ("statistics", "statistical data analysis (suits government bulletins)"),
            ("section", "detailed per-section analysis"),
            ("charts", "chart/visualisation data"),
        )
        names = []
        for name, _purpose in catalogue:
            names.append(name)
        return names
    def extract_sections(self, content: str, titles: List[Dict]) -> List[MarkdownSection]:
        """
        Build the section tree of a document from its numbered headings.

        Recognised heading styles (matched against the stripped line):
        - level 1: "一、标题"
        - level 2: "（一）标题", only inside a level-1 section
        - level 3: "1. 标题", only inside a level-2 section

        A section runs until the next heading of the same or a shallower level,
        or the end of the document. Its ``content`` is the text of that span
        with heading lines and blank lines removed, so it includes the text of
        nested subsections. ``line_end`` is the last line of that span.

        Args:
            content: Full document text.
            titles: Not used by the extraction; kept so existing callers keep
                working.

        Returns:
            The level-1 sections in document order, each carrying its nested
            subsections.
        """
        lines = content.split('\n')
        heading_patterns = (
            (1, self.CHINESE_SECTION_PATTERN),
            (2, self.PARENTHESIS_PATTERN),
            (3, self.ARABIC_SECTION_PATTERN),
        )
        sections: List[MarkdownSection] = []
        # Sections still open at the current line, outermost first. A heading
        # is only accepted when its parent level is open, so the levels held
        # here are always 1, 2, 3 in that order.
        open_sections: List[MarkdownSection] = []

        def close_open_sections(min_level: int, last_line: int) -> None:
            """Finalise every open section at ``min_level`` or deeper."""
            while open_sections and open_sections[-1].level >= min_level:
                finished = open_sections.pop()
                finished.line_end = last_line
                finished.content = self._get_section_content(
                    lines, finished.line_start, last_line
                )

        for line_no, raw_line in enumerate(lines, 1):
            stripped = raw_line.strip()
            for level, pattern in heading_patterns:
                match = pattern.match(stripped)
                if not match:
                    continue
                # "（一）" outside any level-1 section, or "1." outside a
                # level-2 section, is ordinary text rather than a heading.
                if len(open_sections) < level - 1:
                    continue
                # Siblings and deeper sections end on the previous line.
                close_open_sections(level, line_no - 1)
                section = MarkdownSection(
                    number=match.group(1),
                    title=match.group(2),
                    level=level,
                    content="",
                    line_start=line_no,
                    line_end=len(lines)  # provisional until the section closes
                )
                if open_sections:
                    open_sections[-1].subsections.append(section)
                else:
                    sections.append(section)
                open_sections.append(section)
                break
        # Whatever is still open runs to the end of the document.
        close_open_sections(1, len(lines))
        return sections
    def _get_section_content(self, lines: List[str], start: int, end: int) -> str:
        """获取指定行范围的内容"""
        if start > end:
            return ""
        content_lines = lines[start-1:end]
        # 清理：移除标题行和空行
        cleaned = []
        for line in content_lines:
            stripped = line.strip()
            if not stripped:
                continue
            # 跳过章节标题行
            if self.CHINESE_SECTION_PATTERN.match(stripped):
                continue
            if self.PARENTHESIS_PATTERN.match(stripped):
                continue
            if self.ARABIC_SECTION_PATTERN.match(stripped):
                continue
            cleaned.append(stripped)
        return '\n'.join(cleaned)
    async def analyze_markdown(
        self,
        file_path: str,
        analysis_type: str = "summary",
        user_prompt: str = "",
        section_number: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Analyse a Markdown document with the LLM.

        Args:
            file_path: Path of the file handed to the parser.
            analysis_type: Which analysis to run; selects the prompts.
            user_prompt: Extra user instructions appended to the prompt.
            section_number: When given, only the matching section is analysed.

        Returns:
            dict: On success ``success`` is True and the dict carries the file
            name, structure counters, up to ten top-level sections and the
            LLM's ``analysis`` text. For ``analysis_type == "charts"`` it also
            carries ``chart_data``, which is always a dict (an empty structure
            when nothing usable could be extracted). On failure ``success`` is
            False and ``error`` holds the message.
        """
        try:
            parse_result = self.parser.parse(file_path)
            if not parse_result.success:
                return {
                    "success": False,
                    "error": parse_result.error
                }
            data = parse_result.data
            # Build the section tree.
            sections = self.extract_sections(data.get("content", ""), data.get("titles", []))
            # Default to the whole document; narrow down when a section is requested.
            target_content = data.get("content", "")
            target_title = parse_result.metadata.get("filename", "")
            if section_number:
                section = self._find_section(sections, section_number)
                if section:
                    target_content = section.content
                    target_title = f"{section.number}、{section.title}"
                else:
                    return {
                        "success": False,
                        "error": f"未找到章节: {section_number}"
                    }
            # Build the prompt for the requested analysis type.
            prompt = self._build_prompt(
                content=target_content,
                analysis_type=analysis_type,
                user_prompt=user_prompt,
                title=target_title
            )
            # Ask the LLM.
            messages = [
                {"role": "system", "content": self._get_system_prompt(analysis_type)},
                {"role": "user", "content": prompt}
            ]
            response = await llm_service.chat(
                messages=messages,
                temperature=0.3,
                max_tokens=4000
            )
            analysis = llm_service.extract_message_content(response)
            # Base payload shared by every analysis type.
            result = {
                "success": True,
                "filename": parse_result.metadata.get("filename", ""),
                "analysis_type": analysis_type,
                "section": target_title if section_number else None,
                "word_count": len(target_content),
                "structure": {
                    "title_count": parse_result.metadata.get("title_count", 0),
                    "code_block_count": parse_result.metadata.get("code_block_count", 0),
                    "table_count": parse_result.metadata.get("table_count", 0),
                    "section_count": len(sections)
                },
                "sections": [s.to_dict() for s in sections[:10]],  # at most 10 top-level sections
                "analysis": analysis
            }
            # "charts" additionally turns the LLM's JSON answer into visualisations.
            if analysis_type == "charts":
                try:
                    chart_data = self._parse_chart_json(analysis)
                    if not isinstance(chart_data, dict):
                        # Unparsable answer: hand back the same empty structure
                        # as the error path instead of None, so consumers can
                        # always read chart_data["tables"].
                        chart_data = {"tables": [], "key_statistics": [], "chart_suggestions": []}
                    for table_info in chart_data.get("tables") or []:
                        columns = table_info.get("columns", [])
                        rows = table_info.get("rows", [])
                        if columns and rows:
                            vis_result = visualization_service.analyze_and_visualize({
                                "columns": columns,
                                "rows": [dict(zip(columns, row)) for row in rows]
                            })
                            if vis_result.get("success"):
                                table_info["visualization"] = {
                                    "statistics": vis_result.get("statistics"),
                                    "charts": vis_result.get("charts"),
                                    "distributions": vis_result.get("distributions")
                                }
                    result["chart_data"] = chart_data
                except Exception as e:
                    logger.warning(f"生成可视化图表失败: {e}")
                    result["chart_data"] = {"tables": [], "key_statistics": [], "chart_suggestions": []}
            return result
        except Exception as e:
            # Keep the traceback: this handler hides every failure behind a dict.
            logger.error(f"Markdown AI 分析失败: {str(e)}", exc_info=True)
            return {
                "success": False,
                "error": str(e)
            }
    async def analyze_markdown_stream(
        self,
        file_path: str,
        analysis_type: str = "summary",
        user_prompt: str = "",
        section_number: Optional[str] = None
    ) -> AsyncGenerator[str, None]:
        """
        Analyse a Markdown document and stream the result as SSE events.

        Event payloads (JSON after ``data: ``):
        - ``{"type": "start", ...}`` with file name, analysis type, section and
          character count, sent before the LLM is called;
        - ``{"type": "content", "delta": ...}`` for each text fragment;
        - ``{"type": "done", "full_response": ...}`` at the end;
        - ``{"error": ...}`` when parsing, section lookup or the LLM call fails.

        Args:
            file_path: Path of the file handed to the parser.
            analysis_type: Which analysis to run; selects the prompts.
            user_prompt: Extra user instructions appended to the prompt.
            section_number: When given, only the matching section is analysed.

        Yields:
            str: One SSE-formatted chunk per event.
        """
        def sse(payload: Dict[str, Any]) -> str:
            """Format one payload as an SSE data event."""
            return f"data: {json.dumps(payload, ensure_ascii=False)}\n\n"

        try:
            parse_result = self.parser.parse(file_path)
            if not parse_result.success:
                yield sse({'error': parse_result.error})
                return
            data = parse_result.data
            sections = self.extract_sections(data.get("content", ""), data.get("titles", []))
            target_content = data.get("content", "")
            target_title = parse_result.metadata.get("filename", "")
            if section_number:
                section = self._find_section(sections, section_number)
                if section:
                    target_content = section.content
                    target_title = f"{section.number}、{section.title}"
                else:
                    yield sse({'error': f'未找到章节: {section_number}'})
                    return
            prompt = self._build_prompt(
                content=target_content,
                analysis_type=analysis_type,
                user_prompt=user_prompt,
                title=target_title
            )
            messages = [
                {"role": "system", "content": self._get_system_prompt(analysis_type)},
                {"role": "user", "content": prompt}
            ]
            # Initial metadata event. The payload is built as a plain dict:
            # the earlier multi-line f-string that reused double quotes inside
            # the braces only parses on Python 3.12+.
            yield sse({
                'type': 'start',
                'filename': parse_result.metadata.get("filename", ""),
                'analysis_type': analysis_type,
                'section': target_title if section_number else None,
                'word_count': len(target_content)
            })
            # Relay the LLM output fragment by fragment.
            full_response = ""
            async for chunk in llm_service.chat_stream(messages, temperature=0.3, max_tokens=4000):
                content = chunk.get("content", "")
                if content:
                    full_response += content
                    yield sse({'type': 'content', 'delta': content})
            # Completion event with the assembled text.
            yield sse({'type': 'done', 'full_response': full_response})
        except Exception as e:
            logger.error(f"Markdown AI 流式分析失败: {str(e)}")
            yield sse({'error': str(e)})
    def _find_section(self, sections: List[MarkdownSection], number: str) -> Optional[MarkdownSection]:
        """查找指定编号的章节"""
        # 标准化编号
        num = number.strip()
        for section in sections:
            if section.number == num or section.title == num:
                return section
            # 在子章节中查找
            found = self._find_section(section.subsections, number)
            if found:
                return found
        return None
    def _parse_chart_json(self, json_str: str) -> Optional[Dict[str, Any]]:
        """
        解析 LLM 返回的 JSON 字符串
        Args:
            json_str: LLM 返回的 JSON 字符串
        Returns:
            解析后的字典，如果解析失败返回 None
        """
        if not json_str:
            return None
        try:
            # 尝试直接解析
            return json.loads(json_str)
        except json.JSONDecodeError:
            pass
        # 尝试提取 JSON 代码块
        import re
        # 匹配 ```json ... ``` 格式
        match = re.search(r'```(?:json)?\s*([\s\S]*?)\s*```', json_str)
        if match:
            try:
                return json.loads(match.group(1))
            except json.JSONDecodeError:
                pass
        # 尝试找到 JSON 对象的开始和结束
        start = json_str.find('{')
        end = json_str.rfind('}')
        if start != -1 and end != -1 and end > start:
            try:
                return json.loads(json_str[start:end+1])
            except json.JSONDecodeError:
                pass
        return None
    def _get_system_prompt(self, analysis_type: str) -> str:
        """根据分析类型获取系统提示词"""
        prompts = {
            "summary": "你是一个专业的文档摘要助手，擅长从长文档中提取核心信息。",
            "outline": "你是一个专业的文档结构分析助手，擅长提取文档大纲和层级结构。",
            "key_points": "你是一个专业的知识提取助手，擅长从文档中提取关键信息和要点。",
            "questions": "你是一个专业的教育助手，擅长生成帮助理解文档的问题。",
            "tags": "你是一个专业的标签生成助手，擅长提取文档的主题标签。",
            "qa": "你是一个专业的问答助手，擅长基于文档内容生成问答对。",
            "statistics": "你是一个专业的统计数据分析助手，擅长分析政府统计公报中的数据。",
            "section": "你是一个专业的章节分析助手，擅长对文档的特定章节进行深入分析。",
            "charts": "你是一个专业的数据可视化助手，擅长从文档中提取数据并生成适合制作图表的数据结构。"
        }
        return prompts.get(analysis_type, "你是一个专业的文档分析助手。")
    def _build_prompt(
        self,
        content: str,
        analysis_type: str,
        user_prompt: str,
        title: str = ""
    ) -> str:
        """
        Build the user prompt for the requested analysis type.

        The content is cut to 6000 characters (with a truncation marker
        appended) before it is placed into the templates; the "tags" and "qa"
        templates use only the first 3000 / 4000 characters of that. Unknown
        analysis types fall back to the "summary" template. A non-blank
        ``user_prompt`` is appended at the end.

        Args:
            content: Document or section text.
            analysis_type: Key selecting the template.
            user_prompt: Optional extra instructions from the user.
            title: Document or section title shown in the prompt.

        Returns:
            str: The complete prompt text.
        """
        # Truncate so the prompt stays within the model's token budget.
        max_content_len = 6000
        if len(content) > max_content_len:
            content = content[:max_content_len] + "\n\n[内容已截断...]"
        # NOTE: every template below is an f-string, so all of them are
        # formatted on each call even though only one is returned.
        base_prompts = {
            "summary": f"""请对以下文档进行摘要分析：
 文档标题：{title}
 文档内容：
 {content}
 请提供：
 1. 文档主要内容摘要（300字以内）
 2. 文档的目的和用途
 3. 适合的读者群体
 请用中文回答，结构清晰。""",
            "outline": f"""请提取以下文档的大纲结构：
 文档标题：{title}
 文档内容：
 {content}
 请按层级列出文档大纲，用缩进表示层级关系。
 格式：
 一、一级标题
   （一）二级标题
      1. 三级标题
 请用中文回答。""",
            "key_points": f"""请从以下文档中提取关键要点：
 文档标题：{title}
 文档内容：
 {content}
 请列出文档的关键要点（5-10条），每条用简洁的语言描述，并说明其在文档中的重要性。
 请用中文回答，格式清晰。""",
            "questions": f"""请根据以下文档生成有助于理解内容的问题：
 文档标题：{title}
 文档内容：
 {content}
 请生成5-10个问题，帮助读者更好地理解文档内容。每个问题应该：
 1. 涵盖文档的重要信息点
 2. 易于理解和回答
 3. 具有思考价值
 请用中文回答。""",
            "tags": f"""请为以下文档生成标签：
 文档标题：{title}
 文档内容：
 {content[:3000]}
 请生成5-8个标签，用逗号分隔。标签应该反映：
 - 文档的主题领域
 - 文档的类型
 - 文档的关键特征
 请用中文回答，只需输出标签，不要其他内容。""",
            "qa": f"""请根据以下文档生成问答对：
 文档标题：{title}
 文档内容：
 {content[:4000]}
 请生成3-5个问答对，帮助读者通过问答形式理解文档内容。
 格式：
 Q1: 问题
 A1: 回答
 Q2: 问题
 A2: 回答
 请用中文回答，内容准确。""",
            "statistics": f"""请分析以下政府统计公报中的数据和结论：
 文档标题：{title}
 文档内容：
 {content}
 请提供：
 1. 文档中涉及的主要统计数据（列出关键数字和指标）
 2. 数据的变化趋势（增长/下降）
 3. 重要的百分比和对比
 4. 数据来源和统计口径说明
 请用中文回答，数据准确。""",
            "section": f"""请详细分析以下文档章节：
 章节标题：{title}
 章节内容：
 {content}
 请提供：
 1. 章节主要内容概括
 2. 关键信息和数据
 3. 与其他部分的关联（如有）
 4. 重要结论
 请用中文回答，分析深入。""",
            "charts": f"""请从以下文档中提取可用于可视化的数据，并生成适合制作图表的数据结构：
 文档标题：{title}
 文档内容：
 {content}
 请完成以下任务：
 1. 识别文档中的表格数据（Markdown表格格式）
 2. 识别文档中的关键统计数据（百分比、数量、趋势等）
 3. 识别可用于比较的分类数据
 请用 JSON 格式返回以下结构的数据（如果没有表格数据，返回空结构）：
 {{
  "tables": [
    {{
      "description": "表格的描述",
      "columns": ["列名1", "列名2", ...],
      "rows": [
        ["值1", "值2", ...],
        ["值1", "值2", ...]
      ]
    }}
  ],
  "key_statistics": [
    {{
      "name": "指标名称",
      "value": "数值",
      "trend": "增长/下降/持平",
      "description": "指标说明"
    }}
  ],
  "chart_suggestions": [
    {{
      "chart_type": "bar/line/pie",
      "title": "图表标题",
      "data_source": "数据来源说明"
    }}
  ]
 }}
 请确保返回的是合法的 JSON 格式。"""
        }
        # Unknown analysis types use the summary template.
        prompt = base_prompts.get(analysis_type, base_prompts["summary"])
        if user_prompt and user_prompt.strip():
            prompt += f"\n\n用户额外需求：{user_prompt}"
        return prompt
    async def extract_outline(self, file_path: str) -> Dict[str, Any]:
        """提取文档大纲"""
        try:
            parse_result = self.parser.parse(file_path)
            if not parse_result.success:
                return {"success": False, "error": parse_result.error}
            data = parse_result.data
            sections = self.extract_sections(data.get("content", ""), data.get("titles", []))
            # 构建结构化大纲
            outline = []
            for section in sections:
                outline.append({
                    "number": section.number,
                    "title": section.title,
                    "level": section.level,
                    "line": section.line_start,
                    "content_preview": section.content[:100] + "..." if len(section.content) > 100 else section.content,
                    "subsections": [{
                        "number": s.number,
                        "title": s.title,
                        "level": s.level,
                        "line": s.line_start
                    } for s in section.subsections]
                })
            return {
                "success": True,
                "outline": outline
            }
        except Exception as e:
            logger.error(f"大纲提取失败: {str(e)}")
            return {"success": False, "error": str(e)}
    async def extract_tables_summary(self, file_path: str) -> Dict[str, Any]:
        """提取并总结文档中的表格"""
        try:
            parse_result = self.parser.parse(file_path)
            if not parse_result.success:
                return {"success": False, "error": parse_result.error}
            tables = parse_result.data.get("tables", [])
            if not tables:
                return {"success": True, "tables": [], "message": "文档中没有表格"}
            # 提取每个表格的关键信息
            table_summaries = []
            for i, table in enumerate(tables):
                summary = {
                    "index": i + 1,
                    "headers": table.get("headers", []),
                    "row_count": table.get("row_count", 0),
                    "column_count": table.get("column_count", 0),
                    "preview_rows": table.get("rows", [])[:3],  # 只取前3行预览
                    "first_column": [row[0] if row else "" for row in table.get("rows", [])[:5]]
                }
                table_summaries.append(summary)
            return {
                "success": True,
                "tables": table_summaries,
                "table_count": len(tables)
            }
        except Exception as e:
            logger.error(f"表格提取失败: {str(e)}")
            return {"success": False, "error": str(e)}
 # 全局单例
 markdown_ai_service = MarkdownAIService()
--- a/backend/app/services/rag_service.py
+++ b/backend/app/services/rag_service.py
@@ -40,14 +40,29 @@ class RAGService:
    def _init_embeddings(self):
        """初始化嵌入模型"""
        if self.embedding_model is None:
-            self.embedding_model = SentenceTransformer(settings.EMBEDDING_MODEL)
+            # 使用轻量级本地模型，避免网络问题
            model_name = 'all-MiniLM-L6-v2'
            try:
                self.embedding_model = SentenceTransformer(model_name)
                self._dimension = self.embedding_model.get_sentence_embedding_dimension()
-            logger.info(f"RAG 嵌入模型初始化完成: {settings.EMBEDDING_MODEL}, 维度: {self._dimension}")
+                logger.info(f"RAG 嵌入模型初始化完成: {model_name}, 维度: {self._dimension}")
            except Exception as e:
                logger.warning(f"嵌入模型 {model_name} 加载失败: {e}")
                # 如果本地模型也失败，使用简单hash作为后备
                self.embedding_model = None
                self._dimension = 384
                logger.info("RAG 使用简化模式 (无向量嵌入)")
    def _init_vector_store(self):
        """初始化向量存储"""
        if self.index is None:
            self._init_embeddings()
            if self.embedding_model is None:
                # 无法加载嵌入模型，使用简化模式
                self._dimension = 384
                self.index = None
                logger.warning("RAG 嵌入模型未加载，使用简化模式")
            else:
                self.index = faiss.IndexIDMap(faiss.IndexFlatIP(self._dimension))
                logger.info("Faiss 向量存储初始化完成")
@@ -78,6 +93,11 @@ class RAGService:
        if not self._initialized:
            self._init_vector_store()
        # 如果没有嵌入模型，只记录到日志
        if self.embedding_model is None:
            logger.debug(f"字段跳过索引 (无嵌入模型): {table_name}.{field_name}")
            return
        text = f"表名: {table_name}, 字段: {field_name}, 描述: {field_description}"
        if sample_values:
            text += f", 示例值: {', '.join(sample_values)}"
@@ -100,6 +120,11 @@ class RAGService:
        if not self._initialized:
            self._init_vector_store()
        # 如果没有嵌入模型，只记录到日志
        if self.embedding_model is None:
            logger.debug(f"文档跳过索引 (无嵌入模型): {doc_id}")
            return
        doc = SimpleDocument(
            page_content=content,
            metadata=metadata or {"doc_id": doc_id}
--- a/backend/app/services/table_rag_service.py
+++ b/backend/app/services/table_rag_service.py
@@ -31,6 +31,178 @@ class TableRAGService:
        self.rag = rag_service
        self.excel_storage = excel_storage_service
    def _extract_sheet_names_from_xml(self, file_path: str) -> List[str]:
        """
        从 Excel 文件的 XML 中提取工作表名称
        某些 Excel 文件由于包含非标准元素，pandas/openpyxl 无法正确解析工作表列表，
        此时需要直接从 XML 中提取。
        Args:
            file_path: Excel 文件路径
        Returns:
            工作表名称列表
        """
        import zipfile
        from xml.etree import ElementTree as ET
        try:
            with zipfile.ZipFile(file_path, 'r') as z:
                # 读取 workbook.xml
                if 'xl/workbook.xml' not in z.namelist():
                    return []
                content = z.read('xl/workbook.xml')
                root = ET.fromstring(content)
                # 定义命名空间
                ns = {'main': 'http://purl.oclc.org/ooxml/spreadsheetml/main'}
                # 提取所有 sheet 的 name 属性
                sheets = root.findall('.//main:sheet', ns)
                return [s.get('name') for s in sheets if s.get('name')]
        except Exception as e:
            logger.warning(f"从 XML 提取工作表失败: {file_path}, error: {e}")
            return []
    def _read_excel_sheet(self, file_path: str, sheet_name: str = None, header_row: int = 0) -> pd.DataFrame:
        """
        读取 Excel 工作表，支持 pandas 无法解析的特殊 Excel 文件
        当 pandas 的 ExcelFile 无法正确解析时，直接从 XML 读取数据。
        Args:
            file_path: Excel 文件路径
            sheet_name: 工作表名称（如果为 None，读取第一个工作表）
            header_row: 表头行号
        Returns:
            DataFrame
        """
        import zipfile
        from xml.etree import ElementTree as ET
        try:
            # 先尝试用 pandas 正常读取
            df = pd.read_excel(file_path, sheet_name=sheet_name, header=header_row)
            if df is not None and not df.empty:
                return df
        except Exception:
            pass
        # pandas 读取失败，从 XML 直接解析
        logger.info(f"使用 XML 方式读取 Excel: {file_path}")
        try:
            with zipfile.ZipFile(file_path, 'r') as z:
                # 获取工作表名称
                sheet_names = self._extract_sheet_names_from_xml(file_path)
                if not sheet_names:
                    raise ValueError("无法从 Excel 文件中找到工作表")
                # 确定要读取的工作表
                target_sheet = sheet_name if sheet_name and sheet_name in sheet_names else sheet_names[0]
                sheet_index = sheet_names.index(target_sheet) + 1  # sheet1.xml, sheet2.xml, ...
                # 读取 shared strings
                shared_strings = []
                if 'xl/sharedStrings.xml' in z.namelist():
                    ss_content = z.read('xl/sharedStrings.xml')
                    ss_root = ET.fromstring(ss_content)
                    ns = {'main': 'http://purl.oclc.org/ooxml/spreadsheetml/main'}
                    for si in ss_root.findall('.//main:si', ns):
                        t = si.find('.//main:t', ns)
                        if t is not None:
                            shared_strings.append(t.text or '')
                        else:
                            shared_strings.append('')
                # 读取工作表
                sheet_file = f'xl/worksheets/sheet{sheet_index}.xml'
                if sheet_file not in z.namelist():
                    raise ValueError(f"工作表文件 {sheet_file} 不存在")
                sheet_content = z.read(sheet_file)
                root = ET.fromstring(sheet_content)
                ns = {'main': 'http://purl.oclc.org/ooxml/spreadsheetml/main'}
                # 解析行
                rows_data = []
                for row in root.findall('.//main:row', ns):
                    row_idx = int(row.get('r', 0))
                    # header_row 是 0-indexed，row_idx 是 1-indexed
                    # 如果 header_row=0 表示第一行是表头，需要跳过 row_idx=1
                    if row_idx <= header_row + 1:
                        continue  # 跳过表头行
                    row_cells = {}
                    for cell in row.findall('main:c', ns):
                        cell_ref = cell.get('r', '')
                        col_letters = ''.join(filter(str.isalpha, cell_ref))
                        cell_type = cell.get('t', 'n')
                        v = cell.find('main:v', ns)
                        if v is not None and v.text:
                            if cell_type == 's':
                                # shared string
                                try:
                                    val = shared_strings[int(v.text)]
                                except (ValueError, IndexError):
                                    val = v.text
                            elif cell_type == 'b':
                                # boolean
                                val = v.text == '1'
                            else:
                                # number or other
                                val = v.text
                        else:
                            val = None
                        row_cells[col_letters] = val
                    if row_cells:
                        rows_data.append(row_cells)
                # 转换为 DataFrame
                if not rows_data:
                    return pd.DataFrame()
                df = pd.DataFrame(rows_data)
                # 如果有 header_row，重新设置列名
                if header_row >= 0:
                    # 重新读取第一行作为表头
                    first_row_sheet = f'xl/worksheets/sheet{sheet_index}.xml'
                    sheet_content = z.read(first_row_sheet)
                    root = ET.fromstring(sheet_content)
                    first_row = root.find(f'.//main:row[@r="{header_row + 1}"]', ns)
                    if first_row is not None:
                        headers = {}
                        for cell in first_row.findall('main:c', ns):
                            cell_ref = cell.get('r', '')
                            col_letters = ''.join(filter(str.isalpha, cell_ref))
                            cell_type = cell.get('t', 'n')
                            v = cell.find('main:v', ns)
                            if v is not None and v.text:
                                if cell_type == 's':
                                    try:
                                        headers[col_letters] = shared_strings[int(v.text)]
                                    except (ValueError, IndexError):
                                        headers[col_letters] = v.text
                                else:
                                    headers[col_letters] = v.text
                        # 重命名列
                        df.columns = [headers.get(col, col) for col in df.columns]
                logger.info(f"XML 解析完成: {len(df)} 行, {len(df.columns)} 列")
                return df
        except Exception as e:
            logger.error(f"XML 解析 Excel 失败: {e}")
            raise
    async def generate_field_description(
        self,
        table_name: str,
@@ -126,26 +298,49 @@ class TableRAGService:
        }
        try:
-            # 1. 读取 Excel
+            # 1. 先检查 Excel 文件是否有效
            logger.info(f"正在检查Excel文件: {file_path}")
            try:
                xls_file = pd.ExcelFile(file_path)
                sheet_names = xls_file.sheet_names
                logger.info(f"Excel文件工作表: {sheet_names}")
                # 如果 sheet_names 为空，尝试从 XML 中手动提取
                if not sheet_names:
                    sheet_names = self._extract_sheet_names_from_xml(file_path)
                    logger.info(f"从XML提取工作表: {sheet_names}")
                if not sheet_names:
                    return {"success": False, "error": "Excel 文件没有工作表"}
            except Exception as e:
                logger.error(f"读取Excel文件失败: {file_path}, error: {e}")
                return {"success": False, "error": f"无法读取Excel文件: {str(e)}"}
            # 2. 读取 Excel
            if sheet_name:
-                df = pd.read_excel(file_path, sheet_name=sheet_name, header=header_row)
+                # 验证指定的sheet_name是否存在
-            else:
+                if sheet_name not in sheet_names:
-                df = pd.read_excel(file_path, header=header_row)
+                    logger.warning(f"指定的工作表 '{sheet_name}' 不存在，使用第一个工作表: {sheet_names[0]}")
                    sheet_name = sheet_names[0]
            df = self._read_excel_sheet(file_path, sheet_name=sheet_name, header_row=header_row)
            logger.info(f"读取到数据: {len(df)} 行, {len(df.columns)} 列")
            if df.empty:
                return {"success": False, "error": "Excel 文件为空"}
            # 清理列名
            df.columns = [str(c) for c in df.columns]
-            table_name = excel_storage._sanitize_table_name(filename)
+            table_name = self.excel_storage._sanitize_table_name(filename)
            results["table_name"] = table_name
            results["field_count"] = len(df.columns)
            logger.info(f"表名: {table_name}, 字段数: {len(df.columns)}")
-            # 2. 初始化 RAG (如果需要)
+            # 3. 初始化 RAG (如果需要)
            if not self.rag._initialized:
                self.rag._init_vector_store()
-            # 3. 为每个字段生成描述并索引
+            # 4. 为每个字段生成描述并索引
            all_fields_data = {}
            for col in df.columns:
                # 采样示例值
@@ -187,7 +382,8 @@ class TableRAGService:
                    logger.error(error_msg)
                    results["errors"].append(error_msg)
-            # 4. 存储到 MySQL
+            # 5. 存储到 MySQL
            logger.info(f"开始存储到MySQL: {filename}")
            store_result = await self.excel_storage.store_excel(
                file_path=file_path,
                filename=filename,
--- a/docs/test/test.md
+++ b/docs/test/test.md
@@ -1,113 +0,0 @@
 ✅ Excel 文件解析功能已完成并测试通过
  已完成的工作
  后端部分
  1. 文件服务层 (backend/app/services/file_service.py)
    - 文件保存、读取、删除功能
    - 文件信息获取
  2. Excel 解析模块 (backend/app/core/document_parser/)
    - base.py - 解析器基类
    - xlsx_parser.py - Excel 文件解析器
    - utils.py - 工具函数
  3. API 接口 (backend/app/api/endpoints/upload.py)
    - POST /upload/excel - 上传并解析 Excel 文件
    - GET /upload/excel/preview/{file_path} - 获取预览
    - DELETE /upload/file - 删除文件
  4. 主应用配置 (backend/app/main.py)
    - CORS 中间件配置
    - API 路由注册
  前端部分
  1. 后端 API 封装 (frontendTest/app-a6ww9j3ja3nl/src/db/backend-api.ts)
    - Excel 上传、预览、删除函数
  2. Excel 解析页面 (frontendTest/app-a6ww9j3ja3nl/src/pages/ExcelParse.tsx)
    - 拖拽上传界面
    - 解析选项配置
    - 数据表格展示
    - 多工作表支持
  3. 路由配置 (frontendTest/app-a6ww9j3ja3nl/src/routes.tsx)
    - 添加 /excel-parse 路由
  4. 环境配置 (.env)
    - 后端 API URL 配置
  测试结果
  后端 API 测试通过：
  - ✅ 健康检查：http://127.0.0.1:8000/health
  - ✅ Excel 上传解析：成功解析 5 行 6 列的测试数据
  访问方式
  1. 后端 API 文档：http://127.0.0.1:8000/api/v1/docs
  2. 前端页面：
    - 启动前端：cd frontendTest/app-a6ww9j3ja3nl && npm run dev
    - 访问 Excel 解析页面：/excel-parse
  下一步建议
  1. 启动前端开发服务器测试完整的前端界面
  2. 添加更多文档格式解析支持（docx, md, txt）
  3. 完善错误处理和用户提示
 ✻ Worked for 4m 18s · 1 background task still running (↓ to manage)
 > 如何启动后端呢
 ● 后端启动步骤如下：
  方法一：使用命令行启动（开发模式）
  # 1. 进入后端目录
  cd H:/OwnProject/FilesReadSysteam/backend
  # 2. 激活虚拟环境
  # PowerShell:
  .\venv\Scripts\Activate.ps1
  # 或 CMD:
  .\venv\Scripts\Activate.bat
  # 3. 启动后端服务器
  python -m uvicorn app.main:app --host 127.0.0.1 --port 8000 --reload
  方法二：直接使用虚拟环境中的 Python
  cd H:/OwnProject/FilesReadSysteam/backend
  ./venv/Scripts/python.exe -m uvicorn app.main:app --host 127.0.0.1 --port 8000 --reload
  方法三：直接运行 main.py（开发调试）
  cd H:/OwnProject/FilesReadSysteam/backend
  ./venv/Scripts/python.exe app/main.py
  启动成功的标志
  INFO:     Uvicorn running on http://127.0.0.1:8000
  INFO:     Started server process [xxxxx]
  INFO:     Waiting for application startup.
  INFO:     Application startup complete.
  验证服务是否正常
  # 健康检查
  curl http://127.0.0.1:8000/health
  # 访问 API 文档
  # 浏览器打开: http://127.0.0.1:8000/api/v1/docs
 ---
  当前状态：后端已在后台运行（任务 ID: b22jkg69j），可以直接访问 http://127.0.0.1:8000
  需要停止的话告诉我即可。
--- a/frontend/src/db/backend-api.ts
+++ b/frontend/src/db/backend-api.ts
@@ -166,6 +166,66 @@ export interface AIAnalysisResult {
  error?: string;
 }
 // ==================== Markdown AI 分析类型 ====================
 /**
  * Shape of the JSON body that `aiApi.analyzeMarkdown` resolves to.
  *
  * Only `success` is mandatory; every other field is optional and must be
  * guarded before use.
  */
 export interface AIMarkdownAnalyzeResult {
  // Outcome flag; consumers show `error` when this is false.
  success: boolean;
  // Result metadata; the Documents page renders these in the result card header.
  filename?: string;
  analysis_type?: string;
  section?: string;
  word_count?: number;
  // Structural counters for the analysed document.
  // NOTE(review): exact counting rules are defined by the backend — confirm there.
  structure?: {
    title_count?: number;
    code_block_count?: number;
    table_count?: number;
    section_count?: number;
  };
  // Section tree, using the same node type as the outline endpoint.
  sections?: MarkdownSection[];
  // Analysis text; the Documents page renders it as Markdown.
  analysis?: string;
  // Chart-oriented payload.
  // NOTE(review): presumably only populated for the 'charts' analysis type — confirm against the backend.
  chart_data?: {
    tables?: Array<{
      description?: string;
      columns?: string[];
      rows?: string[][];
      // Untyped (`any`) visualisation data; schema is not visible from this file.
      visualization?: {
        statistics?: any;
        charts?: any;
        distributions?: any;
      };
    }>;
    key_statistics?: Array<{
      name?: string;
      value?: string;
      trend?: string;
      description?: string;
    }>;
    chart_suggestions?: Array<{
      chart_type?: string;
      title?: string;
      data_source?: string;
    }>;
  };
  // Failure message shown to the user when `success` is false.
  error?: string;
 }
 /**
  * One node of a Markdown document outline; nodes nest through `subsections`.
  */
 export interface MarkdownSection {
  // Section identifier; shown as the label prefix in the section picker and
  // sent back to the backend as the `section_number` query parameter.
  number: string;
  // Section heading text, shown next to `number` in the section picker.
  title: string;
  // NOTE(review): presumably the heading depth — confirm against the backend.
  level: number;
  content_preview?: string;
  // NOTE(review): presumably the line range of the section in the source file — confirm.
  line_start: number;
  line_end?: number;
  // Child sections (recursive).
  subsections?: MarkdownSection[];
 }
 /**
  * Shape of the JSON body that `aiApi.getMarkdownOutline` resolves to.
  */
 export interface MarkdownOutlineResult {
  success: boolean;
  // Outline sections; the Documents page stores them as the section picker's options.
  outline?: MarkdownSection[];
  error?: string;
 }
 // Analysis modes accepted by the Markdown AI endpoints; the value is sent as the `analysis_type` query parameter.
 export type MarkdownAnalysisType = 'summary' | 'outline' | 'key_points' | 'questions' | 'tags' | 'qa' | 'statistics' | 'section' | 'charts';
 export interface AIExcelAnalyzeResult {
  success: boolean;
  excel?: {
@@ -842,6 +902,159 @@ export const aiApi = {
    }
  },
  /**
   * Upload a Markdown file and run a (non-streaming) AI analysis on it.
   *
   * @param file - Markdown file, sent as the `file` multipart field.
   * @param options - Optional settings; each is forwarded as a query parameter
   *   only when truthy: `analysisType` -> `analysis_type`,
   *   `userPrompt` -> `user_prompt`, `sectionNumber` -> `section_number`.
   * @returns The parsed JSON body of the response.
   * @throws Error with the backend `detail` message (or a generic fallback)
   *   on a non-2xx response; any failure is logged and rethrown.
   */
  async analyzeMarkdown(
    file: File,
    options: {
      analysisType?: MarkdownAnalysisType;
      userPrompt?: string;
      sectionNumber?: string;
    } = {}
  ): Promise<AIMarkdownAnalyzeResult> {
    const formData = new FormData();
    formData.append('file', file);
    const params = new URLSearchParams();
    if (options.analysisType) {
      params.append('analysis_type', options.analysisType);
    }
    if (options.userPrompt) {
      params.append('user_prompt', options.userPrompt);
    }
    if (options.sectionNumber) {
      params.append('section_number', options.sectionNumber);
    }
    const url = `${BACKEND_BASE_URL}/ai/analyze/md?${params.toString()}`;
    try {
      const response = await fetch(url, {
        method: 'POST',
        body: formData,
      });
      if (!response.ok) {
        // An error body is not guaranteed to be JSON (e.g. a gateway's HTML
        // error page); fall back to the generic message instead of letting a
        // JSON SyntaxError replace the real failure.
        const error = await response.json().catch(() => null);
        throw new Error(error?.detail || 'Markdown AI 分析失败');
      }
      return await response.json();
    } catch (error) {
      console.error('Markdown AI 分析失败:', error);
      throw error;
    }
  },
  /**
   * Upload a Markdown file and consume the AI analysis as a server-sent
   * event stream.
   *
   * Each `data: ` line is parsed as JSON. `content` events append their
   * `delta` to the accumulated text and are forwarded to `onChunk`; a `done`
   * event may replace the accumulated text with its `full_response`; events
   * carrying `error` are forwarded to `onChunk` as `type: 'error'`.
   *
   * @param file - Markdown file, sent as the `file` multipart field.
   * @param options - Same optional query parameters as `analyzeMarkdown`.
   * @param onChunk - Optional callback invoked for every content delta or error event.
   * @returns The full response text once the stream ends.
   * @throws Error on a non-2xx response or when the response has no readable body.
   */
  async analyzeMarkdownStream(
    file: File,
    options: {
      analysisType?: MarkdownAnalysisType;
      userPrompt?: string;
      sectionNumber?: string;
    } = {},
    onChunk?: (chunk: { type: string; delta?: string; error?: string }) => void
  ): Promise<string> {
    const formData = new FormData();
    formData.append('file', file);
    const params = new URLSearchParams();
    if (options.analysisType) {
      params.append('analysis_type', options.analysisType);
    }
    if (options.userPrompt) {
      params.append('user_prompt', options.userPrompt);
    }
    if (options.sectionNumber) {
      params.append('section_number', options.sectionNumber);
    }
    const url = `${BACKEND_BASE_URL}/ai/analyze/md/stream?${params.toString()}`;
    try {
      const response = await fetch(url, {
        method: 'POST',
        body: formData,
      });
      if (!response.ok) {
        // The error body may not be JSON; fall back to the generic message.
        const error = await response.json().catch(() => null);
        throw new Error(error?.detail || 'Markdown AI 流式分析失败');
      }
      const reader = response.body?.getReader();
      if (!reader) throw new Error('无法读取响应流');
      const decoder = new TextDecoder();
      let fullResponse = '';
      // Text received so far that does not yet end in a newline. Network
      // chunks do not align with SSE line boundaries, so a partial line must
      // be kept until the rest of it arrives.
      let pending = '';
      const handleLine = (line: string) => {
        if (!line.startsWith('data: ')) return;
        const data = line.slice(6);
        if (data === '[DONE]') return;
        try {
          const parsed = JSON.parse(data);
          if (parsed.type === 'content' && parsed.delta) {
            fullResponse += parsed.delta;
            onChunk?.({ type: 'content', delta: parsed.delta });
          } else if (parsed.type === 'done') {
            fullResponse = parsed.full_response || fullResponse;
          } else if (parsed.error) {
            onChunk?.({ type: 'error', error: parsed.error });
          }
        } catch {
          // Ignore lines whose payload is not valid JSON
        }
      };
      while (true) {
        const { done, value } = await reader.read();
        if (done) break;
        // `stream: true` keeps an incomplete multi-byte UTF-8 sequence inside
        // the decoder instead of emitting replacement characters when a
        // character is split across two chunks.
        pending += decoder.decode(value, { stream: true });
        const lines = pending.split('\n');
        // The last element is either '' (chunk ended on a newline) or an
        // unfinished line; carry it over to the next read.
        pending = lines.pop() || '';
        for (const line of lines) {
          handleLine(line);
        }
      }
      // Flush bytes still held by the decoder and a final unterminated line.
      pending += decoder.decode();
      if (pending) {
        handleLine(pending);
      }
      return fullResponse;
    } catch (error) {
      console.error('Markdown AI 流式分析失败:', error);
      throw error;
    }
  },
  /**
   * Upload a Markdown file and fetch its outline (section tree).
   *
   * @param file - Markdown file, sent as the `file` multipart field.
   * @returns The parsed JSON body of the response.
   * @throws Error with the backend `detail` message (or a generic fallback)
   *   on a non-2xx response; any failure is logged and rethrown.
   */
  async getMarkdownOutline(file: File): Promise<MarkdownOutlineResult> {
    const formData = new FormData();
    formData.append('file', file);
    const url = `${BACKEND_BASE_URL}/ai/analyze/md/outline`;
    try {
      const response = await fetch(url, {
        // Must not be GET: fetch() throws a TypeError for a GET/HEAD request
        // that carries a body, so the upload could never be sent.
        // NOTE(review): confirm the backend route is registered for POST.
        method: 'POST',
        body: formData,
      });
      if (!response.ok) {
        // The error body may not be JSON; fall back to the generic message.
        const error = await response.json().catch(() => null);
        throw new Error(error?.detail || '获取 Markdown 大纲失败');
      }
      return await response.json();
    } catch (error) {
      console.error('获取 Markdown 大纲失败:', error);
      throw error;
    }
  },
  /**
   * 生成统计信息和图表
   */
--- a/frontend/src/pages/Documents.tsx
+++ b/frontend/src/pages/Documents.tsx
@@ -19,7 +19,11 @@ import {
  TrendingUp,
  Download,
  Brain,
-  Settings2
+  Settings2,
  List,
  MessageSquareCode,
  Tag,
  HelpCircle
 } from 'lucide-react';
 import { Button } from '@/components/ui/button';
 import { Input } from '@/components/ui/input';
@@ -33,7 +37,7 @@ import { Checkbox } from '@/components/ui/checkbox';
 import { toast } from 'sonner';
 import { cn } from '@/lib/utils';
 import { Skeleton } from '@/components/ui/skeleton';
-import { backendApi, type ExcelParseResult, aiApi } from '@/db/backend-api';
+import { backendApi, type ExcelParseResult, type AIMarkdownAnalyzeResult, type MarkdownSection, aiApi } from '@/db/backend-api';
 import {
  Table as TableComponent,
  TableBody,
@@ -78,6 +82,15 @@ const Documents: React.FC = () => {
  const [analysisCharts, setAnalysisCharts] = useState<any>(null);
  const [analysisTypes, setAnalysisTypes] = useState<Array<{ value: string; label: string; description: string }>>([]);
  // Markdown AI 分析相关状态
  const [mdAnalysis, setMdAnalysis] = useState<AIMarkdownAnalyzeResult | null>(null);
  const [mdAnalysisType, setMdAnalysisType] = useState<'summary' | 'outline' | 'key_points' | 'questions' | 'tags' | 'qa' | 'statistics' | 'section' | 'charts'>('summary');
  const [mdUserPrompt, setMdUserPrompt] = useState('');
  const [mdSections, setMdSections] = useState<MarkdownSection[]>([]);
  const [mdSelectedSection, setMdSelectedSection] = useState<string>('');
  const [mdStreaming, setMdStreaming] = useState(false);
  const [mdStreamingContent, setMdStreamingContent] = useState('');
  // 解析选项
  const [parseOptions, setParseOptions] = useState({
    parseAllSheets: false,
@@ -144,6 +157,9 @@ const Documents: React.FC = () => {
    setAiAnalysis(null);
    setAnalysisCharts(null);
    setExpandedSheet(null);
    setMdAnalysis(null);
    setMdSections([]);
    setMdStreamingContent('');
    const ext = file.name.split('.').pop()?.toLowerCase();
@@ -163,6 +179,9 @@ const Documents: React.FC = () => {
        } else {
          toast.error(result.error || '解析失败');
        }
      } else if (ext === 'md' || ext === 'markdown') {
        // Markdown 文件：获取大纲
        await fetchMdOutline();
      } else {
        // 其他文档使用通用上传接口
        const result = await backendApi.uploadDocument(file);
@@ -403,6 +422,106 @@ const Documents: React.FC = () => {
    }
  };
  // True when the text after the last '.' (or the whole name when there is
  // no dot) is "md" or "markdown", compared case-insensitively.
  const isMarkdownFile = (filename: string) => {
    const suffix = filename.slice(filename.lastIndexOf('.') + 1).toLowerCase();
    return suffix === 'md' || suffix === 'markdown';
  };
  // Run the non-streaming Markdown AI analysis for the uploaded file and
  // store the result; failures are reported through toasts.
  const handleMdAnalyze = async () => {
    if (!(uploadedFile && isMarkdownFile(uploadedFile.name))) {
      toast.error('请先上传 Markdown 文件');
      return;
    }
    const requestOptions = {
      analysisType: mdAnalysisType,
      userPrompt: mdUserPrompt,
      // An empty selection means "whole document": omit the parameter.
      sectionNumber: mdSelectedSection || undefined
    };
    setAnalyzing(true);
    setMdAnalysis(null);
    try {
      const result = await aiApi.analyzeMarkdown(uploadedFile, requestOptions);
      if (!result.success) {
        toast.error(result.error || 'AI 分析失败');
        return;
      }
      toast.success('Markdown AI 分析完成');
      setMdAnalysis(result);
    } catch (error: any) {
      toast.error(error.message || 'AI 分析失败');
    } finally {
      // Runs on every exit path, including the early return above.
      setAnalyzing(false);
    }
  };
  // Run the streaming Markdown AI analysis, appending each content delta to
  // the live output as it arrives.
  const handleMdAnalyzeStream = async () => {
    if (!(uploadedFile && isMarkdownFile(uploadedFile.name))) {
      toast.error('请先上传 Markdown 文件');
      return;
    }
    // Per-event handler: error events raise a toast, content deltas are
    // appended to the streamed text.
    const onStreamChunk = (chunk: { type: string; delta?: string; error?: string }) => {
      if (chunk.type === 'error') {
        toast.error(chunk.error || '流式分析出错');
        return;
      }
      if (chunk.type === 'content' && chunk.delta) {
        setMdStreamingContent(prev => prev + chunk.delta);
      }
    };
    const requestOptions = {
      analysisType: mdAnalysisType,
      userPrompt: mdUserPrompt,
      // An empty selection means "whole document": omit the parameter.
      sectionNumber: mdSelectedSection || undefined
    };
    setAnalyzing(true);
    setMdStreaming(true);
    setMdStreamingContent('');
    setMdAnalysis(null);
    try {
      await aiApi.analyzeMarkdownStream(uploadedFile, requestOptions, onStreamChunk);
    } catch (error: any) {
      toast.error(error.message || 'AI 分析失败');
    } finally {
      setAnalyzing(false);
      setMdStreaming(false);
    }
  };
  // Fetch the outline (section list) of a Markdown file and store it for the
  // section picker. Failures are only logged; the outline is optional.
  //
  // `file` defaults to the `uploadedFile` state, so existing zero-argument
  // calls behave as before.
  // NOTE(review): a caller that has just called the state setter for the
  // upload should pass its File explicitly — the state captured by this
  // closure may still hold the previous value, in which case the guard below
  // returns early and no outline is fetched.
  const fetchMdOutline = async (file: File | null = uploadedFile) => {
    if (!file || !isMarkdownFile(file.name)) return;
    try {
      const result = await aiApi.getMarkdownOutline(file);
      if (result.success && result.outline) {
        setMdSections(result.outline);
      }
    } catch (error) {
      console.error('获取大纲失败:', error);
    }
  };
  // Map a Markdown analysis type to its 20px icon; unknown types get the
  // generic Sparkles icon.
  const getMdAnalysisIcon = (type: string) => {
    switch (type) {
      case 'summary':
      case 'section':
        return <FileText size={20} />;
      case 'key_points':
      case 'statistics':
      case 'charts':
        return <TrendingUp size={20} />;
      case 'outline':
        return <List size={20} />;
      case 'questions':
        return <MessageSquareCode size={20} />;
      case 'tags':
        return <Tag size={20} />;
      case 'qa':
        return <HelpCircle size={20} />;
      default:
        return <Sparkles size={20} />;
    }
  };
  const formatFileSize = (bytes: number): string => {
    if (bytes === 0) return '0 B';
    const k = 1024;
@@ -600,6 +719,98 @@ const Documents: React.FC = () => {
            </Card>
          )}
          {/* Markdown AI 分析选项 */}
          {uploadedFile && isMarkdownFile(uploadedFile.name) && (
            <Card className="border-none shadow-md bg-gradient-to-br from-purple-500/5 to-primary/5">
              <CardHeader className="pb-4">
                <CardTitle className="flex items-center gap-2">
                  <Sparkles className="text-purple-500" size={20} />
                  Markdown AI 分析
                </CardTitle>
              </CardHeader>
              <CardContent className="space-y-4">
                {/* 章节选择 */}
                {mdSections.length > 0 && (
                  <div className="space-y-2">
                    <Label htmlFor="md-section" className="text-sm">指定章节（可选）</Label>
                    <Select value={mdSelectedSection} onValueChange={setMdSelectedSection}>
                      <SelectTrigger id="md-section" className="bg-background">
                        <SelectValue placeholder="全文分析" />
                      </SelectTrigger>
                      <SelectContent>
                        <SelectItem value="">全文分析</SelectItem>
                        {mdSections.map((section) => (
                          <SelectItem key={section.number} value={section.number}>
                            {section.number}、{section.title}
                          </SelectItem>
                        ))}
                      </SelectContent>
                    </Select>
                  </div>
                )}
                <div className="space-y-2">
                  <Label htmlFor="md-analysis-type" className="text-sm">分析类型</Label>
                  <Select value={mdAnalysisType} onValueChange={(value: any) => setMdAnalysisType(value)}>
                    <SelectTrigger id="md-analysis-type" className="bg-background">
                      <SelectValue />
                    </SelectTrigger>
                    <SelectContent>
                      {[
                        { value: 'summary', label: '文档摘要', desc: '主要内容摘要' },
                        { value: 'outline', label: '大纲提取', desc: '提取文档结构' },
                        { value: 'key_points', label: '关键要点', desc: '提取关键信息' },
                        { value: 'statistics', label: '统计分析', desc: '统计数据分析' },
                        { value: 'section', label: '章节分析', desc: '分章节详细分析' },
                        { value: 'questions', label: '生成问题', desc: '生成理解性问题' },
                        { value: 'tags', label: '生成标签', desc: '提取主题标签' },
                        { value: 'qa', label: '问答对', desc: '生成问答内容' },
                        { value: 'charts', label: '数据图表', desc: '生成可视化数据' }
                      ].map(type => (
                        <SelectItem key={type.value} value={type.value}>
                          <div className="flex items-center gap-2">
                            {getMdAnalysisIcon(type.value)}
                            <div className="flex flex-col">
                              <span className="font-medium">{type.label}</span>
                              <span className="text-xs text-muted-foreground">{type.desc}</span>
                            </div>
                          </div>
                        </SelectItem>
                      ))}
                    </SelectContent>
                  </Select>
                </div>
                <div className="space-y-2">
                  <Label htmlFor="md-user-prompt" className="text-sm">自定义提示词（可选）</Label>
                  <Textarea
                    id="md-user-prompt"
                    placeholder="例如：请重点关注技术实现部分..."
                    value={mdUserPrompt}
                    onChange={(e) => setMdUserPrompt(e.target.value)}
                    className="bg-background resize-none"
                    rows={2}
                  />
                </div>
                <div className="flex gap-2">
                  <Button
                    onClick={handleMdAnalyze}
                    disabled={analyzing}
                    className="flex-1 bg-gradient-to-r from-purple-500 to-primary hover:from-purple-500/90 hover:to-primary/90"
                  >
                    {analyzing && !mdStreaming ? <><Loader2 className="mr-2 animate-spin" size={16} /> 分析中...</> : <><Sparkles className="mr-2" size={16} />普通分析</>}
                  </Button>
                  <Button
                    onClick={handleMdAnalyzeStream}
                    disabled={analyzing}
                    variant="outline"
                    className="flex-1"
                  >
                    {analyzing && mdStreaming ? <><Loader2 className="mr-2 animate-spin" size={16} /> 流式...</> : <><Sparkles className="mr-2" size={16} />流式分析</>}
                  </Button>
                </div>
              </CardContent>
            </Card>
          )}
          {/* 数据操作 */}
          {parseResult?.success && (
            <Card className="border-none shadow-md bg-gradient-to-br from-emerald-500/5 to-blue-500/5">
@@ -661,6 +872,45 @@ const Documents: React.FC = () => {
            </Card>
          )}
          {/* Markdown AI 分析结果 */}
          {(mdAnalysis || mdStreamingContent) && (
            <Card className="border-none shadow-md border-l-4 border-l-purple-500">
              <CardHeader>
                <div className="flex items-center justify-between">
                  <div className="space-y-1">
                    <CardTitle className="flex items-center gap-2">
                      <Sparkles className="text-purple-500" size={20} />
                      Markdown AI 分析结果
                      {mdStreaming && <Badge variant="default" className="ml-2 bg-purple-500">流式输出中</Badge>}
                    </CardTitle>
                    {mdAnalysis && (
                      <CardDescription>
                        {mdAnalysis.filename} • {mdAnalysis.word_count || 0} 字 • {mdAnalysis.analysis_type}
                        {mdAnalysis.section && ` • ${mdAnalysis.section}`}
                      </CardDescription>
                    )}
                  </div>
                  {mdAnalysis?.structure && (
                    <Badge variant="secondary">
                      {mdAnalysis.structure.title_count || 0} 标题 • {mdAnalysis.structure.section_count || 0} 章节
                    </Badge>
                  )}
                </div>
              </CardHeader>
              <CardContent className="max-h-[500px] overflow-y-auto">
                {/* 流式内容优先显示 */}
                {mdStreamingContent && (
                  <div className="animate-pulse text-sm text-muted-foreground mb-4">
                    流式输出中...
                  </div>
                )}
                {mdStreamingContent && <Markdown content={mdStreamingContent} />}
                {mdAnalysis?.analysis && !mdStreamingContent && <Markdown content={mdAnalysis.analysis} />}
                {!mdAnalysis?.success && !mdStreamingContent && <p className="text-sm text-destructive">{mdAnalysis?.error || '分析失败'}</p>}
              </CardContent>
            </Card>
          )}
          {/* 图表显示 */}
          {analysisCharts && (
            <Card className="border-none shadow-md border-l-4 border-l-indigo-500">