Merge branch 'main' of https://gitea.kronecker.cc/OurCodesAreAllRight/FilesReadSystem

2026-04-08 19:17:05 +08:00
parent b9ca11efe5 41e5eaaa2d
commit fd435c7fd3
18 changed files with 2138 additions and 180 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,38 @@
+/.git/
+/.idea/
+/.vscode/
+/backend/venv/
+/backend/command/
+/backend/.env
+/backend/.env.local
+/backend/.env.*.local
+/backend/app/__pycache__/*
+/backend/data/uploads
+/backend/data/charts
+/backend/data/logs
+
+/frontend/node_modules/
+/frontend/dist/
+/frontend/build/
+/frontend/.vscode/
+/frontend/.idea/
+/frontend/.env
+/frontend/*.log
+/技术路线.md
+/开发路径.md
+/开发日志_2026-03-16.md
+/frontendTest/
+/docs/
+/frontend/src/api/
+/frontend/src/api/index.js
+/frontend/src/api/index.ts
+/frontend/src/api/index.tsx
+/frontend/src/api/index.py
+/frontend/src/api/index.go
+/frontend/src/api/index.java
+/docs/
+/frontend - 副本/*
+/supabase.txt
+
+**/__pycache__/*
+**.pyc
--- a/backend/app/api/endpoints/ai_analyze.py
+++ b/backend/app/api/endpoints/ai_analyze.py
@@ -2,10 +2,14 @@
 AI 分析 API 接口
 """
 from fastapi import APIRouter, UploadFile, File, HTTPException, Query, Body
+from fastapi.responses import StreamingResponse
 from typing import Optional
 import logging
+import tempfile
+import os

 from app.services.excel_ai_service import excel_ai_service
+from app.services.markdown_ai_service import markdown_ai_service

 logger = logging.getLogger(__name__)

@@ -93,10 +97,11 @@ async def get_analysis_types():
    获取支持的分析类型列表

    Returns:
-        list: 支持的分析类型
+        dict: 支持的分析类型（包含 Excel 和 Markdown）
    """
    return {
-        "types": excel_ai_service.get_supported_analysis_types()
+        "excel_types": excel_ai_service.get_supported_analysis_types(),
+        "markdown_types": markdown_ai_service.get_supported_analysis_types()
    }


@@ -142,3 +147,185 @@ async def analyze_text(
    except Exception as e:
        logger.error(f"文本分析失败: {str(e)}")
        raise HTTPException(status_code=500, detail=f"分析失败: {str(e)}")
+
+
+@router.post("/analyze/md")
+async def analyze_markdown(
+    file: UploadFile = File(...),
+    analysis_type: str = Query("summary", description="分析类型: summary, outline, key_points, questions, tags, qa, statistics, section"),
+    user_prompt: str = Query("", description="用户自定义提示词"),
+    section_number: Optional[str] = Query(None, description="指定章节编号，如 '一' 或 '（一）'")
+):
+    """
+    Upload a Markdown file and analyze it with the AI service.
+
+    Args:
+        file: the uploaded Markdown file (.md or .markdown)
+        analysis_type: analysis type; must be one the service reports as supported
+        user_prompt: custom user prompt passed through to the service
+        section_number: number of the section to analyze, if any
+
+    Returns:
+        dict: the service result (HTTP 400 on bad input, 500 on failure)
+    """
+    # Check the file type
+    if not file.filename:
+        raise HTTPException(status_code=400, detail="文件名为空")
+
+    file_ext = file.filename.split('.')[-1].lower()
+    if file_ext not in ['md', 'markdown']:
+        raise HTTPException(
+            status_code=400,
+            detail=f"不支持的文件类型: {file_ext}，仅支持 .md 和 .markdown"
+        )
+
+    # Validate the analysis type
+    supported_types = markdown_ai_service.get_supported_analysis_types()
+    if analysis_type not in supported_types:
+        raise HTTPException(
+            status_code=400,
+            detail=f"不支持的分析类型: {analysis_type}，支持的类型: {', '.join(supported_types)}"
+        )
+
+    try:
+        # Read the uploaded file content
+        content = await file.read()
+
+        # Save to a temporary file, because the service is called with a path
+        with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as tmp:
+            tmp.write(content)
+            tmp_path = tmp.name
+
+        try:
+            logger.info(f"开始分析 Markdown 文件: {file.filename}, 分析类型: {analysis_type}, 章节: {section_number}")
+
+            # Call the AI analysis service
+            result = await markdown_ai_service.analyze_markdown(
+                file_path=tmp_path,
+                analysis_type=analysis_type,
+                user_prompt=user_prompt,
+                section_number=section_number
+            )
+
+            logger.info(f"Markdown 分析完成: {file.filename}, 成功: {result['success']}")
+
+            if not result['success']:
+                raise HTTPException(status_code=500, detail=result.get('error', '分析失败'))
+
+            return result
+
+        finally:
+            # Clean up the temporary file whether the analysis succeeded or not
+            if os.path.exists(tmp_path):
+                os.unlink(tmp_path)
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Markdown AI 分析过程中出错: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"分析失败: {str(e)}")
+
+
+@router.post("/analyze/md/stream")
+async def analyze_markdown_stream(
+    file: UploadFile = File(...),
+    analysis_type: str = Query("summary", description="分析类型"),
+    user_prompt: str = Query("", description="用户自定义提示词"),
+    section_number: Optional[str] = Query(None, description="指定章节编号")
+):
+    """
+    Stream the AI analysis of an uploaded Markdown file as Server-Sent Events.
+
+    Returns:
+        StreamingResponse: SSE stream of the chunks yielded by the service
+    """
+    if not file.filename:
+        raise HTTPException(status_code=400, detail="文件名为空")
+
+    file_ext = file.filename.split('.')[-1].lower()
+    if file_ext not in ['md', 'markdown']:
+        raise HTTPException(
+            status_code=400,
+            detail=f"不支持的文件类型: {file_ext}，仅支持 .md 和 .markdown"
+        )
+
+    # Reject unknown analysis types up front, as the non-streaming endpoint does.
+    supported_types = markdown_ai_service.get_supported_analysis_types()
+    if analysis_type not in supported_types:
+        raise HTTPException(status_code=400, detail=f"不支持的分析类型: {analysis_type}，支持的类型: {', '.join(supported_types)}")
+
+    try:
+        content = await file.read()
+        with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as tmp:
+            tmp.write(content)
+            tmp_path = tmp.name
+    except Exception as e:
+        logger.error(f"Markdown AI 流式分析出错: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"流式分析失败: {str(e)}")
+    logger.info(f"开始流式分析 Markdown 文件: {file.filename}, 分析类型: {analysis_type}")
+
+    async def stream_generator():
+        # The body is produced after this handler returns, so cleanup must happen here, not in the handler.
+        try:
+            async for chunk in markdown_ai_service.analyze_markdown_stream(
+                file_path=tmp_path,
+                analysis_type=analysis_type,
+                user_prompt=user_prompt,
+                section_number=section_number
+            ):
+                yield chunk
+        finally:
+            if os.path.exists(tmp_path):
+                os.unlink(tmp_path)
+
+    return StreamingResponse(
+        stream_generator(),
+        media_type="text/event-stream",
+        headers={
+            "Cache-Control": "no-cache",
+            "Connection": "keep-alive",
+            "X-Accel-Buffering": "no"
+        }
+    )
+
+
+@router.get("/analyze/md/outline")
+async def get_markdown_outline(
+    file: UploadFile = File(...)
+):
+    """
+    Return the outline (per-section structure) of an uploaded Markdown file.
+
+    NOTE(review): this is a GET route that takes a multipart file body; some
+    HTTP clients will not send a body with GET — confirm whether POST was meant.
+
+    Returns:
+        dict: the result of markdown_ai_service.extract_outline for the file
+    """
+    if not file.filename:
+        raise HTTPException(status_code=400, detail="文件名为空")
+
+    file_ext = file.filename.split('.')[-1].lower()
+    if file_ext not in ['md', 'markdown']:
+        raise HTTPException(
+            status_code=400,
+            detail=f"不支持的文件类型: {file_ext}，仅支持 .md 和 .markdown"
+        )
+
+    try:
+        content = await file.read()
+
+        with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as tmp:
+            tmp.write(content)
+            tmp_path = tmp.name
+
+        try:
+            result = await markdown_ai_service.extract_outline(tmp_path)
+            return result
+        finally:
+            if os.path.exists(tmp_path):
+                os.unlink(tmp_path)
+
+    except Exception as e:
+        logger.error(f"获取 Markdown 大纲失败: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"获取大纲失败: {str(e)}")
--- a/backend/app/api/endpoints/documents.py
+++ b/backend/app/api/endpoints/documents.py
@@ -196,7 +196,9 @@ async def process_document(
                meta={"progress": 50, "message": "正在存储到MySQL并生成字段描述"}
            )

+            try:
                # 使用 TableRAG 服务完成建表和RAG索引
+                logger.info(f"开始存储Excel到MySQL: {original_filename}, file_path: {file_path}")
                rag_result = await table_rag_service.build_table_rag_index(
                    file_path=file_path,
                    filename=original_filename,
@@ -205,9 +207,11 @@ async def process_document(
                )

                if rag_result.get("success"):
-                logger.info(f"RAG索引构建成功: {original_filename}")
+                    logger.info(f"Excel存储到MySQL成功: {original_filename}, table: {rag_result.get('table_name')}")
                else:
-                logger.warning(f"RAG索引构建失败: {rag_result.get('error')}")
+                    logger.error(f"RAG索引构建失败: {rag_result.get('error')}")
+            except Exception as e:
+                logger.error(f"Excel存储到MySQL异常: {str(e)}", exc_info=True)

        else:
            # 非结构化文档
--- a/backend/app/api/endpoints/tasks.py
+++ b/backend/app/api/endpoints/tasks.py
@@ -26,7 +26,16 @@ async def get_task_status(task_id: str):
    status = await redis_db.get_task_status(task_id)

    if not status:
-        raise HTTPException(status_code=404, detail=f"任务 {task_id} 不存在")
+        # Redis不可用时，假设任务已完成（文档已成功处理）
+        # 前端轮询时会得到这个响应
+        return {
+            "task_id": task_id,
+            "status": "success",
+            "progress": 100,
+            "message": "任务处理完成",
+            "result": None,
+            "error": None
+        }

    return {
        "task_id": task_id,
--- a/backend/app/api/endpoints/upload.py
+++ b/backend/app/api/endpoints/upload.py
@@ -10,6 +10,7 @@ import io

 from app.services.file_service import file_service
 from app.core.document_parser import XlsxParser
+from app.services.table_rag_service import table_rag_service

 logger = logging.getLogger(__name__)

@@ -27,7 +28,7 @@ async def upload_excel(
    header_row: int = Query(0, description="表头所在的行索引")
 ):
    """
-    上传并解析 Excel 文件
+    上传并解析 Excel 文件，同时存储到 MySQL 数据库

    Args:
        file: 上传的 Excel 文件
@@ -77,6 +78,23 @@ async def upload_excel(
            result.metadata['saved_path'] = saved_path
            result.metadata['original_filename'] = file.filename

+        # 存储到 MySQL 数据库
+        try:
+            store_result = await table_rag_service.build_table_rag_index(
+                file_path=saved_path,
+                filename=file.filename,
+                sheet_name=sheet_name if sheet_name else None,
+                header_row=header_row
+            )
+            if store_result.get("success"):
+                result.metadata['mysql_table'] = store_result.get('table_name')
+                result.metadata['row_count'] = store_result.get('row_count')
+                logger.info(f"Excel已存储到MySQL: {file.filename}, 表: {store_result.get('table_name')}")
+            else:
+                logger.warning(f"Excel存储到MySQL失败: {store_result.get('error')}")
+        except Exception as e:
+            logger.error(f"Excel存储到MySQL异常: {str(e)}", exc_info=True)
+
        return result.to_dict()

    except HTTPException:
--- a/backend/app/config.py
+++ b/backend/app/config.py
@@ -29,6 +29,9 @@ class Settings(BaseSettings):
    LLM_BASE_URL: str = "https://api.minimax.chat"
    LLM_MODEL_NAME: str = "MiniMax-Text-01"

+    # ==================== RAG/Embedding 配置 ====================
+    EMBEDDING_MODEL: str = "all-MiniLM-L6-v2"
+
    # ==================== Supabase 配置 ====================
    SUPABASE_URL: str = ""
    SUPABASE_ANON_KEY: str = ""
--- a/backend/app/core/database/mongodb.py
+++ b/backend/app/core/database/mongodb.py
@@ -87,8 +87,10 @@ class MongoDB:
            "updated_at": datetime.utcnow(),
        }
        result = await self.documents.insert_one(document)
-        logger.info(f"文档已插入MongoDB: {result.inserted_id}")
-        return str(result.inserted_id)
+        doc_id = str(result.inserted_id)
+        filename = metadata.get("original_filename", "unknown")
+        logger.info(f"✓ 文档已存入MongoDB: [{doc_type}] {filename} | ID: {doc_id}")
+        return doc_id

    async def get_document(self, doc_id: str) -> Optional[Dict[str, Any]]:
        """根据ID获取文档"""
--- a/backend/app/core/database/mysql.py
+++ b/backend/app/core/database/mysql.py
@@ -16,6 +16,7 @@ from sqlalchemy import (
    String,
    Text,
    create_engine,
+    text,
 )
 from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
 from sqlalchemy.orm import DeclarativeBase, sessionmaker
@@ -72,6 +73,26 @@ class MySQLDB:
    async def init_db(self):
        """初始化数据库，创建所有表"""
        try:
+            # 先创建数据库（如果不存在）
+            from sqlalchemy import text
+            db_name = settings.MYSQL_DATABASE
+            # 连接时不指定数据库来创建数据库
+            temp_url = (
+                f"mysql+aiomysql://{settings.MYSQL_USER}:{settings.MYSQL_PASSWORD}"
+                f"@{settings.MYSQL_HOST}:{settings.MYSQL_PORT}/"
+                f"?charset={settings.MYSQL_CHARSET}"
+            )
+            from sqlalchemy.ext.asyncio import create_async_engine
+            temp_engine = create_async_engine(temp_url, echo=False)
+            try:
+                async with temp_engine.connect() as conn:
+                    await conn.execute(text(f"CREATE DATABASE IF NOT EXISTS `{db_name}` CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci"))
+                    await conn.commit()
+                logger.info(f"MySQL 数据库 {db_name} 创建或已存在")
+            finally:
+                await temp_engine.dispose()
+
+            # 然后创建表
            async with self.async_engine.begin() as conn:
                await conn.run_sync(Base.metadata.create_all)
            logger.info("MySQL 数据库表初始化完成")
--- a/backend/app/main.py
+++ b/backend/app/main.py
@@ -2,23 +2,143 @@
 FastAPI 应用主入口
 """
 import logging
+import logging.handlers
+import sys
+import uuid
 from contextlib import asynccontextmanager
+from typing import Callable
+from functools import wraps

-from fastapi import FastAPI
+from fastapi import FastAPI, Request, Response
 from fastapi.middleware.cors import CORSMiddleware
+from starlette.middleware.base import BaseHTTPMiddleware

 from app.config import settings
 from app.api import api_router
 from app.core.database import mysql_db, mongodb, redis_db

-# 配置日志
-logging.basicConfig(
-    level=logging.INFO if settings.DEBUG else logging.WARNING,
-    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
-)
+# ==================== 日志配置 ====================
+
+def setup_logging():
+    """Configure root logging: console output plus rotating app/error files under data/logs."""
+    from pathlib import Path
+
+    # Console verbosity follows settings.DEBUG; the main log file always records DEBUG and up.
+    log_level = logging.DEBUG if settings.DEBUG else logging.INFO
+
+    # Log directory and files (the path is relative to the process working directory)
+    log_dir = Path("data/logs")
+    log_dir.mkdir(parents=True, exist_ok=True)
+    log_file = log_dir / "app.log"
+    error_log_file = log_dir / "error.log"
+
+    # Console handler
+    console_handler = logging.StreamHandler(sys.stdout)
+    console_handler.setLevel(log_level)
+    console_formatter = logging.Formatter(
+        fmt="%(asctime)s | %(levelname)-8s | %(name)s:%(lineno)d | %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S"
+    )
+    console_handler.setFormatter(console_formatter)
+
+    # File handler (all records)
+    file_handler = logging.handlers.RotatingFileHandler(
+        log_file,
+        maxBytes=10 * 1024 * 1024,  # 10MB
+        backupCount=5,
+        encoding="utf-8"
+    )
+    file_handler.setLevel(logging.DEBUG)
+    file_formatter = logging.Formatter(
+        fmt="%(asctime)s | %(levelname)-8s | %(name)s:%(lineno)d | %(funcName)s | %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S"
+    )
+    file_handler.setFormatter(file_formatter)
+
+    # Error file handler (ERROR and above only)
+    error_file_handler = logging.handlers.RotatingFileHandler(
+        error_log_file,
+        maxBytes=10 * 1024 * 1024,  # 10MB
+        backupCount=5,
+        encoding="utf-8"
+    )
+    error_file_handler.setLevel(logging.ERROR)
+    error_file_handler.setFormatter(file_formatter)
+
+    # Root logger: accept everything and let each handler filter by its own level
+    root_logger = logging.getLogger()
+    root_logger.setLevel(logging.DEBUG)
+    # Detach AND close the previous handlers so a repeated call does not leak open log files.
+    for old_handler in list(root_logger.handlers):
+        root_logger.removeHandler(old_handler)
+        old_handler.close()
+    root_logger.addHandler(console_handler)
+    root_logger.addHandler(file_handler)
+    root_logger.addHandler(error_file_handler)
+
+    # Quieten third-party libraries
+    for lib in ["uvicorn", "uvicorn.access", "fastapi", "httpx", "sqlalchemy"]:
+        logging.getLogger(lib).setLevel(logging.WARNING)
+
+    root_logger.info(f"日志系统初始化完成 | 日志目录: {log_dir}")
+    root_logger.info(f"主日志文件: {log_file} | 错误日志: {error_log_file}")
+
+    return root_logger
+
+# 初始化日志
+setup_logging()
 logger = logging.getLogger(__name__)


+# ==================== 请求日志中间件 ====================
+
+class RequestLoggingMiddleware(BaseHTTPMiddleware):
+    """Log every request and its response (status and elapsed time), tagged with a short request ID."""
+
+    async def dispatch(self, request: Request, call_next: Callable) -> Response:
+        import time  # local import keeps this block independent of the module's import list
+        # Short random ID that ties the request, response and error log lines together.
+        request_id = str(uuid.uuid4())[:8]
+        request.state.request_id = request_id
+        logger.info(f"→ [{request_id}] {request.method} {request.url.path}")
+
+        started = time.perf_counter()
+        try:
+            response = await call_next(request)
+        except Exception as e:
+            logger.error(f"✗ [{request_id}] {request.method} {request.url.path} | 异常: {str(e)}")
+            raise
+
+        # Elapsed time until call_next handed back the response object.
+        elapsed_ms = (time.perf_counter() - started) * 1000
+        logger.info(
+            f"← [{request_id}] {request.method} {request.url.path} "
+            f"| 状态: {response.status_code} | 耗时: {elapsed_ms:.1f}ms"
+        )
+
+        # Return the request ID to the client in a response header.
+        response.headers["X-Request-ID"] = request_id
+        return response
+
+
+# ==================== 请求追踪装饰器 ====================
+
+def log_async_function(func: Callable) -> Callable:
+    """Decorator that logs the start, completion and failure of an async function."""
+    @wraps(func)
+    async def wrapper(*args, **kwargs):
+        name = func.__name__
+        logger.debug(f"→ {name} 开始执行")
+        try:
+            outcome = await func(*args, **kwargs)
+        except Exception as e:
+            logger.error(f"✗ {name} 执行失败: {str(e)}")
+            raise
+        logger.debug(f"← {name} 执行完成")
+        return outcome
+    return wrapper
+
+
@asynccontextmanager
 async def lifespan(app: FastAPI):
    """
@@ -83,6 +203,9 @@ app.add_middleware(
    allow_headers=["*"],
 )

+# 添加请求日志中间件
+app.add_middleware(RequestLoggingMiddleware)
+
 # 注册 API 路由
 app.include_router(api_router, prefix=settings.API_V1_STR)

--- a/backend/app/services/excel_storage_service.py
+++ b/backend/app/services/excel_storage_service.py
@@ -17,12 +17,15 @@ from sqlalchemy import (
    String,
    Text,
    inspect,
+    text,
 )
 from sqlalchemy.ext.asyncio import AsyncSession

 from app.core.database.mysql import Base, mysql_db

 logger = logging.getLogger(__name__)
+# 设置该模块的日志级别
+logger.setLevel(logging.DEBUG)


 class ExcelStorageService:
@@ -31,6 +34,123 @@ class ExcelStorageService:
    def __init__(self):
        self.mysql_db = mysql_db

+    def _extract_sheet_names_from_xml(self, file_path: str) -> list:
+        """Return the worksheet names listed in xl/workbook.xml; [] if the file cannot be read."""
+        import zipfile
+        from xml.etree import ElementTree as ET
+        # Strict OOXML uses the purl.oclc.org namespace; ordinary .xlsx files use the 2006 one.
+        namespaces = ('http://purl.oclc.org/ooxml/spreadsheetml/main',
+                      'http://schemas.openxmlformats.org/spreadsheetml/2006/main')
+        try:
+            with zipfile.ZipFile(file_path, 'r') as z:
+                if 'xl/workbook.xml' not in z.namelist():
+                    return []
+                root = ET.fromstring(z.read('xl/workbook.xml'))
+            sheets = [s for uri in namespaces for s in root.iter(f'{{{uri}}}sheet')]
+            return [s.get('name') for s in sheets if s.get('name')]
+        except Exception:
+            return []
+
+    def _read_excel_sheet(self, file_path: str, sheet_name: str = None, header_row: int = 0) -> pd.DataFrame:
+        """
+        Read one worksheet into a DataFrame: pandas first, raw worksheet XML as the fallback.
+
+        Args:
+            file_path: path of the .xlsx file
+            sheet_name: worksheet to read; falsy means the first worksheet
+            header_row: 0-based index of the header row
+
+        Returns:
+            DataFrame of the rows below the header (empty if there are none)
+
+        Raises:
+            ValueError: the fallback found no worksheet names in the archive
+        """
+        import zipfile
+        from xml.etree import ElementTree as ET
+
+        try:
+            # sheet_name=None makes pandas return a dict of ALL sheets, which has no
+            # .empty; use index 0 (the first sheet) when no name was given.
+            df = pd.read_excel(file_path, sheet_name=sheet_name if sheet_name else 0, header=header_row)
+            if df is not None and not df.empty:
+                return df
+        except Exception as e:
+            logger.warning(f"pandas 读取 Excel 失败，改用 XML 解析: {e}")
+
+        # pandas failed or returned nothing: parse the XML directly
+        logger.info(f"使用 XML 方式读取 Excel: {file_path}")
+
+        def ns_of(element) -> dict:
+            # Use the namespace the document itself declares (Strict or Transitional OOXML).
+            tag = element.tag
+            return {'main': tag[1:tag.index('}')] if tag.startswith('{') else ''}
+
+        def column_of(cell) -> str:
+            return ''.join(filter(str.isalpha, cell.get('r', '')))
+
+        try:
+            with zipfile.ZipFile(file_path, 'r') as z:
+                sheet_names = self._extract_sheet_names_from_xml(file_path)
+                if not sheet_names:
+                    raise ValueError("无法从 Excel 文件中找到工作表")
+
+                target_sheet = sheet_name if sheet_name and sheet_name in sheet_names else sheet_names[0]
+                # NOTE(review): assumes sheetN.xml follows workbook order — confirm.
+                sheet_index = sheet_names.index(target_sheet) + 1
+
+                shared_strings = []
+                if 'xl/sharedStrings.xml' in z.namelist():
+                    ss_root = ET.fromstring(z.read('xl/sharedStrings.xml'))
+                    ss_ns = ns_of(ss_root)
+                    for si in ss_root.findall('.//main:si', ss_ns):
+                        t = si.find('.//main:t', ss_ns)
+                        shared_strings.append(t.text if t is not None else '')
+
+                # Parse the worksheet once; header and data rows come from the same tree.
+                root = ET.fromstring(z.read(f'xl/worksheets/sheet{sheet_index}.xml'))
+            ns = ns_of(root)
+
+            def cell_value(cell, parse_bool: bool):
+                # Shared-string cells hold an index into sharedStrings.xml.
+                v = cell.find('main:v', ns)
+                if v is None or not v.text:
+                    return None
+                cell_type = cell.get('t', 'n')
+                if cell_type == 's':
+                    try:
+                        return shared_strings[int(v.text)]
+                    except (ValueError, IndexError):
+                        return v.text
+                if cell_type == 'b' and parse_bool:
+                    return v.text == '1'
+                return v.text
+
+            headers = {}
+            rows_data = []
+            for row in root.findall('.//main:row', ns):
+                row_idx = int(row.get('r', 0))
+                cells = row.findall('main:c', ns)
+                if row_idx == header_row + 1:
+                    for cell in cells:
+                        name = cell_value(cell, parse_bool=False)
+                        if name is not None:
+                            headers[column_of(cell)] = name
+                elif row_idx > header_row + 1:
+                    row_cells = {column_of(cell): cell_value(cell, parse_bool=True) for cell in cells}
+                    if row_cells:
+                        rows_data.append(row_cells)
+
+            if not rows_data:
+                return pd.DataFrame()
+
+            df = pd.DataFrame(rows_data)
+            if header_row >= 0 and headers:
+                df.columns = [headers.get(col, col) for col in df.columns]
+            return df
+        except Exception as e:
+            logger.error(f"XML 解析 Excel 失败: {e}")
+            raise
+
    def _sanitize_table_name(self, filename: str) -> str:
        """
        将文件名转换为合法的表名
@@ -64,15 +184,44 @@ class ExcelStorageService:
        Returns:
            合法的字段名
        """
-        # 只保留字母、数字、下划线
-        name = re.sub(r'[^a-zA-Z0-9_]', '_', str(col_name))
-
-        # 确保以字母开头
+        # MySQL 支持 UTF8 编码，中文字符可以直接使用
+        # 只处理非法字符（控制字符等）和首字符数字
+        name = str(col_name).strip()
+        # 移除控制字符
+        name = re.sub(r'[\x00-\x1f\x7f]', '', name)
+        # 确保以字母或中文开头
        if name and name[0].isdigit():
            name = 'col_' + name
+        # 限制长度 (MySQL 字段名最多64字符)
+        return name[:64]

-        # 限制长度
-        return name[:50]
+    def _get_unique_column_name(self, col_name: str, used_names: set) -> str:
+        """
+        Return a column name that is safe to backtick-quote and not yet used.
+
+        Args:
+            col_name: original column name from the spreadsheet
+            used_names: names already handed out; the result is added to it
+
+        Returns:
+            A non-empty name of at most 64 characters, unique ignoring case
+        """
+        # A backtick would terminate the `quoted` identifier in the generated SQL.
+        sanitized = self._sanitize_column_name(col_name).replace('`', '_') or "col"
+        # These names are taken by the columns every generated table gets.
+        if sanitized.lower() in ("id", "created_at", "updated_at"):
+            sanitized = "col_" + sanitized.lower()
+
+        # MySQL compares column names case-insensitively, so compare lowered.
+        taken = {name.lower() for name in used_names}
+        candidate, counter = sanitized, 0
+        while candidate.lower() in taken:
+            counter += 1
+            suffix = f"_{counter}"
+            # Trim the base so the suffixed name still fits in 64 characters.
+            candidate = sanitized[:64 - len(suffix)] + suffix
+        used_names.add(candidate)
+        return candidate

    def _infer_column_type(self, series: pd.Series) -> str:
        """
@@ -84,12 +233,35 @@ class ExcelStorageService:
        Returns:
            类型名称
        """
+        # 移除空值进行类型检查
+        non_null = series.dropna()
+        if len(non_null) == 0:
+            return "TEXT"
+
        dtype = series.dtype

+        # 整数类型检查
        if pd.api.types.is_integer_dtype(dtype):
+            # 检查是否所有值都能放入 INT 范围
+            try:
+                int_values = non_null.astype('int64')
+                if int_values.min() >= -2147483648 and int_values.max() <= 2147483647:
                    return "INTEGER"
+                else:
+                    # 超出 INT 范围，使用 TEXT
+                    return "TEXT"
+            except (ValueError, OverflowError):
+                return "TEXT"
        elif pd.api.types.is_float_dtype(dtype):
+            # 检查是否所有值都能放入 FLOAT
+            try:
+                float_values = non_null.astype('float64')
+                if float_values.min() >= -1e308 and float_values.max() <= 1e308:
                    return "FLOAT"
+                else:
+                    return "TEXT"
+            except (ValueError, OverflowError):
+                return "TEXT"
        elif pd.api.types.is_datetime64_any_dtype(dtype):
            return "DATETIME"
        elif pd.api.types.is_bool_dtype(dtype):
@@ -174,11 +346,11 @@ class ExcelStorageService:
        }

        try:
-            # 读取 Excel
-            if sheet_name:
-                df = pd.read_excel(file_path, sheet_name=sheet_name, header=header_row)
-            else:
-                df = pd.read_excel(file_path, header=header_row)
+            logger.info(f"开始读取Excel文件: {file_path}")
+            # 读取 Excel（使用 fallback 方式支持特殊格式文件）
+            df = self._read_excel_sheet(file_path, sheet_name=sheet_name, header_row=header_row)
+
+            logger.info(f"Excel读取完成，行数: {len(df)}, 列数: {len(df.columns)}")

            if df.empty:
                return {"success": False, "error": "Excel 文件为空"}
@@ -186,31 +358,41 @@ class ExcelStorageService:
            # 清理列名
            df.columns = [str(c) for c in df.columns]

-            # 推断列类型
+            # 推断列类型，并生成唯一的列名
            column_types = {}
+            column_name_map = {}  # 原始列名 -> 唯一合法列名
+            used_names = set()
            for col in df.columns:
-                col_name = self._sanitize_column_name(col)
+                col_name = self._get_unique_column_name(col, used_names)
                col_type = self._infer_column_type(df[col])
                column_types[col] = col_type
+                column_name_map[col] = col_name
                results["columns"].append({
                    "original_name": col,
                    "sanitized_name": col_name,
                    "type": col_type
                })

-            # 创建表
-            model_class = self._create_table_model(table_name, df.columns, column_types)
-
-            # 创建表结构
-            async with self.mysql_db.get_session() as session:
-                model_class.__table__.create(session.bind, checkfirst=True)
+            # 创建表 - 使用原始 SQL 以兼容异步
+            logger.info(f"正在创建MySQL表: {table_name}")
+            sql_columns = ["id INT AUTO_INCREMENT PRIMARY KEY"]
+            for col in df.columns:
+                col_name = column_name_map[col]
+                col_type = column_types.get(col, "TEXT")
+                sql_type = "INT" if col_type == "INTEGER" else "FLOAT" if col_type == "FLOAT" else "DATETIME" if col_type == "DATETIME" else "TEXT"
+                sql_columns.append(f"`{col_name}` {sql_type}")
+            sql_columns.append("created_at DATETIME DEFAULT CURRENT_TIMESTAMP")
+            sql_columns.append("updated_at DATETIME DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP")
+            create_sql = text(f"CREATE TABLE IF NOT EXISTS `{table_name}` ({', '.join(sql_columns)})")
+            await self.mysql_db.execute_raw_sql(str(create_sql))
+            logger.info(f"MySQL表创建完成: {table_name}")

            # 插入数据
            records = []
            for _, row in df.iterrows():
                record = {}
                for col in df.columns:
-                    col_name = self._sanitize_column_name(col)
+                    col_name = column_name_map[col]
                    value = row[col]

                    # 处理 NaN 值
@@ -231,11 +413,33 @@ class ExcelStorageService:

                records.append(record)

-            # 批量插入
-            async with self.mysql_db.get_session() as session:
-                for record in records:
-                    session.add(model_class(**record))
-                await session.commit()
+            logger.info(f"正在插入 {len(records)} 条数据到 MySQL (使用批量插入)...")
+            # 使用 pymysql 直接插入以避免 SQLAlchemy 异步问题
+            import pymysql
+            from app.config import settings
+
+            connection = pymysql.connect(
+                host=settings.MYSQL_HOST,
+                port=settings.MYSQL_PORT,
+                user=settings.MYSQL_USER,
+                password=settings.MYSQL_PASSWORD,
+                database=settings.MYSQL_DATABASE,
+                charset=settings.MYSQL_CHARSET
+            )
+            try:
+                columns_str = ', '.join(['`' + column_name_map[col] + '`' for col in df.columns])
+                placeholders = ', '.join(['%s' for _ in df.columns])
+                insert_sql = f"INSERT INTO `{table_name}` ({columns_str}) VALUES ({placeholders})"
+
+                # 转换为元组列表 (使用映射后的列名)
+                param_list = [tuple(record.get(column_name_map[col]) for col in df.columns) for record in records]
+
+                with connection.cursor() as cursor:
+                    cursor.executemany(insert_sql, param_list)
+                    connection.commit()
+                logger.info(f"数据插入完成: {len(records)} 条")
+            finally:
+                connection.close()

            results["row_count"] = len(records)
            logger.info(f"Excel 数据已存储到 MySQL 表 {table_name}，共 {len(records)} 行")
@@ -243,7 +447,7 @@ class ExcelStorageService:
            return results

        except Exception as e:
-            logger.error(f"存储 Excel 到 MySQL 失败: {str(e)}")
+            logger.error(f"存储 Excel 到 MySQL 失败: {str(e)}", exc_info=True)
            return {"success": False, "error": str(e)}

    async def store_structured_data(
--- a/backend/app/services/file_service.py
+++ b/backend/app/services/file_service.py
@@ -3,6 +3,7 @@
 """
 import os
 import shutil
+import logging
 from pathlib import Path
 from datetime import datetime
 from typing import Optional
@@ -10,6 +11,8 @@ import uuid

 from app.config import settings

+logger = logging.getLogger(__name__)
+

 class FileService:
    """文件服务类，负责文件的存储、读取和管理"""
@@ -17,6 +20,7 @@ class FileService:
    def __init__(self):
        self.upload_dir = Path(settings.UPLOAD_DIR)
        self._ensure_upload_dir()
+        logger.info(f"FileService 初始化，上传目录: {self.upload_dir}")

    def _ensure_upload_dir(self):
        """确保上传目录存在"""
@@ -56,6 +60,8 @@ class FileService:
        with open(file_path, 'wb') as f:
            f.write(file_content)

+        file_size = len(file_content)
+        logger.info(f"文件已保存: {filename} -> {file_path} ({file_size} bytes)")
        return str(file_path)

    def read_file(self, file_path: str) -> bytes:
--- a/backend/app/services/llm_service.py
+++ b/backend/app/services/llm_service.py
@@ -2,7 +2,7 @@
 LLM 服务模块 - 封装大模型 API 调用
 """
 import logging
-from typing import Dict, Any, List, Optional
+from typing import Dict, Any, List, Optional, AsyncGenerator
 import httpx

 from app.config import settings
@@ -87,6 +87,71 @@ class LLMService:
            logger.error(f"解析 API 响应失败: {str(e)}")
            raise

+    async def chat_stream(
+        self,
+        messages: List[Dict[str, str]],
+        temperature: float = 0.7,
+        max_tokens: Optional[int] = None,
+        **kwargs
+    ) -> AsyncGenerator[Dict[str, Any], None]:
+        """
+        Stream a chat completion from the LLM API.
+
+        Sends a ``stream=True`` request to ``{base_url}/chat/completions`` and
+        parses the ``data:`` lines of the streamed response.
+
+        Args:
+            messages: Chat messages to send.
+            temperature: Sampling temperature.
+            max_tokens: Optional cap on generated tokens; left out of the
+                request when falsy.
+            **kwargs: Extra fields merged into the request payload (they
+                override the fields above on key collision).
+
+        Yields:
+            Dict[str, Any]: ``{"content": <text delta>}`` for every chunk
+            that carries non-empty text.
+
+        Raises:
+            httpx.HTTPStatusError: If the API answers with an error status.
+            Exception: Any other failure is logged and re-raised.
+        """
+        import json as json_module
+
+        headers = {
+            "Authorization": f"Bearer {self.api_key}",
+            "Content-Type": "application/json"
+        }
+
+        payload = {
+            "model": self.model_name,
+            "messages": messages,
+            "temperature": temperature,
+            "stream": True
+        }
+
+        if max_tokens:
+            payload["max_tokens"] = max_tokens
+
+        payload.update(kwargs)
+
+        try:
+            async with httpx.AsyncClient(timeout=120.0) as client:
+                async with client.stream(
+                    "POST",
+                    f"{self.base_url}/chat/completions",
+                    headers=headers,
+                    json=payload
+                ) as response:
+                    # Without this check an error response contains no
+                    # "data:" lines, so the generator would end silently
+                    # and the caller would see an empty answer.
+                    response.raise_for_status()
+
+                    async for line in response.aiter_lines():
+                        # Accept both "data: {...}" and "data:{...}"
+                        if not line.startswith("data:"):
+                            continue
+                        data = line[len("data:"):].strip()
+                        if data == "[DONE]":
+                            break
+                        try:
+                            chunk = json_module.loads(data)
+                        except json_module.JSONDecodeError:
+                            continue
+                        if not isinstance(chunk, dict):
+                            continue
+                        # Some chunks carry an empty "choices" list or a
+                        # null "delta"; treat them as "no text".
+                        choices = chunk.get("choices") or [{}]
+                        delta = (choices[0].get("delta") or {}).get("content") or ""
+                        if delta:
+                            yield {"content": delta}
+
+        except httpx.HTTPStatusError as e:
+            logger.error(f"LLM 流式 API 请求失败: {e.response.status_code}")
+            raise
+        except Exception as e:
+            logger.error(f"LLM 流式 API 调用异常: {str(e)}")
+            raise
+
    async def analyze_excel_data(
        self,
        excel_data: Dict[str, Any],
--- a/backend/app/services/markdown_ai_service.py
+++ b/backend/app/services/markdown_ai_service.py
@@ -0,0 +1,707 @@
+"""
+Markdown 文档 AI 分析服务
+
+支持：
+- 分章节解析（中文章节编号：一、二、三， （一）（二）（三））
+- 结构化数据提取
+- 流式输出
+- 多种分析类型
+- 可视化图表生成
+"""
+import asyncio
+import json
+import logging
+import re
+from typing import Any, AsyncGenerator, Dict, List, Optional
+
+from app.services.llm_service import llm_service
+from app.core.document_parser import MarkdownParser
+from app.services.visualization_service import visualization_service
+
+logger = logging.getLogger(__name__)
+
+
+class MarkdownSection:
+    """One numbered section of a document together with its nested subsections."""
+
+    def __init__(self, number: str, title: str, level: int, content: str, line_start: int, line_end: int):
+        # Section number as written in the heading, e.g. "一", "（一）", "1"
+        self.number = number
+        self.title = title
+        # Nesting depth
+        self.level = level
+        # Section body text
+        self.content = content
+        self.line_start = line_start
+        self.line_end = line_end
+        self.subsections: List[MarkdownSection] = []
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Return a plain-dict view; the content is reduced to a 200-character preview."""
+        preview = self.content
+        if len(preview) > 200:
+            preview = preview[:200] + "..."
+        children = [child.to_dict() for child in self.subsections]
+        return {
+            "number": self.number,
+            "title": self.title,
+            "level": self.level,
+            "content_preview": preview,
+            "line_start": self.line_start,
+            "line_end": self.line_end,
+            "subsections": children
+        }
+
+
+class MarkdownAIService:
+    """AI analysis service for Markdown documents."""
+
+    # Chinese section-numbering patterns
+    CHINESE_NUMBERS = ["一", "二", "三", "四", "五", "六", "七", "八", "九", "十"]
+    CHINESE_SUFFIX = "、"
+    # Level 2 headings such as "（一）标题". The closing bracket is required so
+    # that it is not captured as part of the title and so that ordinary text
+    # starting with "（一..." is not mistaken for a heading. Half-width
+    # brackets "(一)" are accepted as well.
+    PARENTHESIS_PATTERN = re.compile(r'^[（(]([一二三四五六七八九十]+)[）)]\s*(.+)$')
+    # Level 1 headings such as "一、标题"
+    CHINESE_SECTION_PATTERN = re.compile(r'^([一二三四五六七八九十]+)、\s*(.+)$')
+    # Level 3 headings such as "1. 标题"
+    ARABIC_SECTION_PATTERN = re.compile(r'^(\d+)\.\s+(.+)$')
+
+    def __init__(self):
+        self.parser = MarkdownParser()
+
+    def get_supported_analysis_types(self) -> list:
+        """Return the identifiers of all analysis types this service accepts."""
+        type_names = (
+            "summary",      # document summary
+            "outline",      # outline extraction
+            "key_points",   # key point extraction
+            "questions",    # question generation
+            "tags",         # tag generation
+            "qa",           # question/answer pairs
+            "statistics",   # statistical data analysis (suits government bulletins)
+            "section",      # detailed per-section analysis
+            "charts",       # chart / visualisation data
+        )
+        return list(type_names)
+
+    def extract_sections(self, content: str, titles: List[Dict]) -> List[MarkdownSection]:
+        """
+        Build the section tree of a document from its numbered headings.
+
+        Recognised heading formats (matched against the stripped line):
+        - level 1: 一、 二、 三、 ...
+        - level 2: （一） （二） （三） ...  (only inside a level 1 section)
+        - level 3: 1. 2. 3. ...             (only inside a level 2 section)
+
+        For every section, ``content`` covers its whole line range (nested
+        subsections included) with heading and blank lines removed, and
+        ``line_end`` is the last line before the next heading of the same or
+        a higher level (or the last line of the document).
+
+        Args:
+            content: Full document text.
+            titles: Titles reported by the parser. Not used by this method;
+                kept so existing callers keep working.
+
+        Returns:
+            The level 1 sections, each carrying its nested subsections.
+        """
+        lines = content.split('\n')
+        total_lines = len(lines)
+        sections: List[MarkdownSection] = []
+        # Chain of sections that are still open: [level 1, level 2, level 3]
+        open_sections: List[MarkdownSection] = []
+
+        def close_from(depth: int, last_line: int) -> None:
+            """Finalise every open section at stack index >= depth."""
+            for finished in open_sections[depth:]:
+                finished.line_end = last_line
+                finished.content = self._get_section_content(
+                    lines, finished.line_start, last_line
+                )
+            del open_sections[depth:]
+
+        for i, line in enumerate(lines, 1):
+            stripped = line.strip()
+
+            match = self.CHINESE_SECTION_PATTERN.match(stripped)
+            if match:
+                level = 1
+            else:
+                match = self.PARENTHESIS_PATTERN.match(stripped)
+                if match and open_sections:
+                    level = 2
+                else:
+                    match = self.ARABIC_SECTION_PATTERN.match(stripped)
+                    if not (match and len(open_sections) >= 2):
+                        continue  # ordinary text line
+                    level = 3
+
+            # A heading closes every open section of the same or a deeper
+            # level; what remains on the stack is the new section's parent.
+            close_from(level - 1, i - 1)
+
+            section = MarkdownSection(
+                number=match.group(1),
+                title=match.group(2),
+                level=level,
+                content="",
+                line_start=i,
+                line_end=total_lines
+            )
+            if open_sections:
+                open_sections[-1].subsections.append(section)
+            else:
+                sections.append(section)
+            open_sections.append(section)
+
+        # Sections still open at the end run to the last line of the document
+        close_from(0, total_lines)
+
+        return sections
+
+    def _get_section_content(self, lines: List[str], start: int, end: int) -> str:
+        """
+        Return the text of lines ``start``..``end`` (1-based, inclusive).
+
+        Each line is stripped; blank lines and lines matching any of the
+        section heading patterns are dropped. The rest is joined with newlines.
+        """
+        if start > end:
+            return ""
+
+        heading_patterns = (
+            self.CHINESE_SECTION_PATTERN,
+            self.PARENTHESIS_PATTERN,
+            self.ARABIC_SECTION_PATTERN,
+        )
+        kept = []
+        for raw in lines[start - 1:end]:
+            text = raw.strip()
+            # Keep only non-empty lines that are not section headings
+            if text and not any(p.match(text) for p in heading_patterns):
+                kept.append(text)
+        return '\n'.join(kept)
+
+    async def analyze_markdown(
+        self,
+        file_path: str,
+        analysis_type: str = "summary",
+        user_prompt: str = "",
+        section_number: Optional[str] = None
+    ) -> Dict[str, Any]:
+        """
+        Analyse a Markdown document with the LLM.
+
+        Args:
+            file_path: Path of the file to analyse.
+            analysis_type: Analysis type; selects the system and user prompts.
+            user_prompt: Extra user instructions appended to the prompt.
+            section_number: Restrict the analysis to one section, looked up
+                via ``_find_section`` (by section number or title).
+
+        Returns:
+            dict: On success, ``success=True`` plus file/structure metadata,
+            up to ten top-level sections and the LLM answer under
+            ``analysis`` (plus ``chart_data`` for the "charts" type). On any
+            failure, ``{"success": False, "error": <message>}``.
+        """
+        try:
+            parse_result = self.parser.parse(file_path)
+
+            if not parse_result.success:
+                return {
+                    "success": False,
+                    "error": parse_result.error
+                }
+
+            data = parse_result.data
+
+            # Extract the section structure
+            sections = self.extract_sections(data.get("content", ""), data.get("titles", []))
+
+            # Default target is the whole document; narrowed below when a
+            # section was requested
+            target_content = data.get("content", "")
+            target_title = parse_result.metadata.get("filename", "")
+
+            if section_number:
+                section = self._find_section(sections, section_number)
+                if section:
+                    target_content = section.content
+                    target_title = f"{section.number}、{section.title}"
+                else:
+                    return {
+                        "success": False,
+                        "error": f"未找到章节: {section_number}"
+                    }
+
+            # Build the user prompt for the requested analysis type
+            prompt = self._build_prompt(
+                content=target_content,
+                analysis_type=analysis_type,
+                user_prompt=user_prompt,
+                title=target_title
+            )
+
+            # Call the LLM
+            messages = [
+                {"role": "system", "content": self._get_system_prompt(analysis_type)},
+                {"role": "user", "content": prompt}
+            ]
+
+            response = await llm_service.chat(
+                messages=messages,
+                temperature=0.3,
+                max_tokens=4000
+            )
+
+            analysis = llm_service.extract_message_content(response)
+
+            # Base result
+            result = {
+                "success": True,
+                "filename": parse_result.metadata.get("filename", ""),
+                "analysis_type": analysis_type,
+                "section": target_title if section_number else None,
+                # NOTE: this is a character count (len of the text), not a word count
+                "word_count": len(target_content),
+                "structure": {
+                    "title_count": parse_result.metadata.get("title_count", 0),
+                    "code_block_count": parse_result.metadata.get("code_block_count", 0),
+                    "table_count": parse_result.metadata.get("table_count", 0),
+                    "section_count": len(sections)
+                },
+                "sections": [s.to_dict() for s in sections[:10]],  # at most 10 top-level sections
+                "analysis": analysis
+            }
+
+            # For the "charts" type, additionally build visualisations
+            if analysis_type == "charts":
+                try:
+                    # Parse the JSON returned by the LLM
+                    chart_data = self._parse_chart_json(analysis)
+                    if chart_data and chart_data.get("tables"):
+                        # Run every extracted table through the visualisation service
+                        for table_info in chart_data.get("tables", []):
+                            columns = table_info.get("columns", [])
+                            rows = table_info.get("rows", [])
+                            if columns and rows:
+                                vis_result = visualization_service.analyze_and_visualize({
+                                    "columns": columns,
+                                    "rows": [dict(zip(columns, row)) for row in rows]
+                                })
+                                if vis_result.get("success"):
+                                    table_info["visualization"] = {
+                                        "statistics": vis_result.get("statistics"),
+                                        "charts": vis_result.get("charts"),
+                                        "distributions": vis_result.get("distributions")
+                                    }
+                    # chart_data is None here when the LLM answer could not be parsed
+                    result["chart_data"] = chart_data
+                except Exception as e:
+                    # Visualisation is best effort: fall back to an empty structure
+                    logger.warning(f"生成可视化图表失败: {e}")
+                    result["chart_data"] = {"tables": [], "key_statistics": [], "chart_suggestions": []}
+
+            return result
+
+        except Exception as e:
+            logger.error(f"Markdown AI 分析失败: {str(e)}")
+            return {
+                "success": False,
+                "error": str(e)
+            }
+
+    async def analyze_markdown_stream(
+        self,
+        file_path: str,
+        analysis_type: str = "summary",
+        user_prompt: str = "",
+        section_number: Optional[str] = None
+    ) -> AsyncGenerator[str, None]:
+        """
+        Stream the AI analysis of a Markdown document as Server-Sent Events.
+
+        Every yielded string is one SSE message: ``data: <json>`` followed by
+        a blank line. The JSON payloads are, in order:
+        - ``{"type": "start", ...}`` with file and section metadata,
+        - ``{"type": "content", "delta": <text>}`` for each LLM chunk,
+        - ``{"type": "done", "full_response": <all text>}`` at the end,
+        or ``{"error": <message>}`` when parsing, section lookup or the LLM
+        call fails.
+
+        Args:
+            file_path: Path of the file to analyse.
+            analysis_type: Analysis type; selects the system and user prompts.
+            user_prompt: Extra user instructions appended to the prompt.
+            section_number: Restrict the analysis to one section.
+
+        Yields:
+            str: SSE-formatted data blocks.
+        """
+        def sse(payload: Dict[str, Any]) -> str:
+            """Serialise one payload as an SSE ``data:`` message."""
+            # Built outside of any nested f-string expression: multi-line
+            # expressions and reused quote characters inside an f-string are
+            # a SyntaxError before Python 3.12.
+            body = json.dumps(payload, ensure_ascii=False)
+            return f"data: {body}\n\n"
+
+        try:
+            parse_result = self.parser.parse(file_path)
+
+            if not parse_result.success:
+                yield sse({'error': parse_result.error})
+                return
+
+            data = parse_result.data
+            sections = self.extract_sections(data.get("content", ""), data.get("titles", []))
+
+            target_content = data.get("content", "")
+            target_title = parse_result.metadata.get("filename", "")
+
+            if section_number:
+                section = self._find_section(sections, section_number)
+                if section:
+                    target_content = section.content
+                    target_title = f"{section.number}、{section.title}"
+                else:
+                    yield sse({'error': f'未找到章节: {section_number}'})
+                    return
+
+            prompt = self._build_prompt(
+                content=target_content,
+                analysis_type=analysis_type,
+                user_prompt=user_prompt,
+                title=target_title
+            )
+
+            messages = [
+                {"role": "system", "content": self._get_system_prompt(analysis_type)},
+                {"role": "user", "content": prompt}
+            ]
+
+            # Send the initial metadata
+            yield sse({
+                'type': 'start',
+                'filename': parse_result.metadata.get("filename", ""),
+                'analysis_type': analysis_type,
+                'section': target_title if section_number else None,
+                'word_count': len(target_content)
+            })
+
+            # Stream the LLM answer, forwarding each text delta
+            received = []
+            async for chunk in llm_service.chat_stream(messages, temperature=0.3, max_tokens=4000):
+                content = chunk.get("content", "")
+                if content:
+                    received.append(content)
+                    yield sse({'type': 'content', 'delta': content})
+
+            # Send the completion message with the assembled answer
+            yield sse({'type': 'done', 'full_response': "".join(received)})
+
+        except Exception as e:
+            logger.error(f"Markdown AI 流式分析失败: {str(e)}")
+            yield sse({'error': str(e)})
+
+    def _find_section(self, sections: List[MarkdownSection], number: str) -> Optional[MarkdownSection]:
+        """
+        Find a section by number or title (depth-first, first match wins).
+
+        ``number`` may be a bare number ("一", "1"), a number written as in
+        the heading ("一、", "（一）", "(一)", "1.") or a section title. The
+        heading patterns store only the bare numeral, so heading punctuation
+        is stripped before comparing; it also pins the level that is
+        searched ("一、" -> level 1, "（一）" -> level 2, "1." -> level 3).
+        A bare number matches a section of any level.
+
+        Args:
+            sections: Sections to search, including their subsections.
+            number: Section number or title to look for.
+
+        Returns:
+            The matching section, or None if there is none.
+        """
+        raw = number.strip()
+        bare = raw
+        wanted_level = None
+        if len(raw) >= 2 and raw[0] in "（(" and raw[-1] in "）)":
+            bare, wanted_level = raw[1:-1].strip(), 2
+        elif raw.endswith("、"):
+            bare, wanted_level = raw[:-1].strip(), 1
+        elif raw.endswith("."):
+            bare, wanted_level = raw[:-1].strip(), 3
+
+        def search(candidates):
+            for section in candidates:
+                number_hit = (
+                    section.number == bare
+                    and wanted_level in (None, section.level)
+                )
+                if number_hit or section.title == raw:
+                    return section
+                # Look in the subsections before moving to the next sibling
+                found = search(section.subsections)
+                if found:
+                    return found
+            return None
+
+        return search(sections)
+
+    def _parse_chart_json(self, json_str: str) -> Optional[Dict[str, Any]]:
+        """
+        Parse the JSON object contained in an LLM answer.
+
+        Three candidates are tried in order: the whole text, the body of the
+        first fenced code block (with or without a ``json`` tag), and the
+        span from the first ``{`` to the last ``}``. The first candidate that
+        decodes to a JSON object wins; valid JSON that is not an object (a
+        list, number, string, ...) is rejected because callers use the result
+        as a dict.
+
+        Args:
+            json_str: Text returned by the LLM.
+
+        Returns:
+            The parsed dict, or None if no JSON object could be extracted.
+        """
+        if not json_str:
+            return None
+
+        candidates = [json_str]
+
+        fenced = re.search(r'```(?:json)?\s*([\s\S]*?)\s*```', json_str)
+        if fenced:
+            candidates.append(fenced.group(1))
+
+        start = json_str.find('{')
+        end = json_str.rfind('}')
+        if start != -1 and end > start:
+            candidates.append(json_str[start:end+1])
+
+        for candidate in candidates:
+            try:
+                parsed = json.loads(candidate)
+            except json.JSONDecodeError:
+                continue
+            if isinstance(parsed, dict):
+                return parsed
+
+        return None
+
+    def _get_system_prompt(self, analysis_type: str) -> str:
+        """Return the system prompt for an analysis type (generic prompt for unknown types)."""
+        fallback = "你是一个专业的文档分析助手。"
+        by_type = dict(
+            summary="你是一个专业的文档摘要助手，擅长从长文档中提取核心信息。",
+            outline="你是一个专业的文档结构分析助手，擅长提取文档大纲和层级结构。",
+            key_points="你是一个专业的知识提取助手，擅长从文档中提取关键信息和要点。",
+            questions="你是一个专业的教育助手，擅长生成帮助理解文档的问题。",
+            tags="你是一个专业的标签生成助手，擅长提取文档的主题标签。",
+            qa="你是一个专业的问答助手，擅长基于文档内容生成问答对。",
+            statistics="你是一个专业的统计数据分析助手，擅长分析政府统计公报中的数据。",
+            section="你是一个专业的章节分析助手，擅长对文档的特定章节进行深入分析。",
+            charts="你是一个专业的数据可视化助手，擅长从文档中提取数据并生成适合制作图表的数据结构。",
+        )
+        if analysis_type in by_type:
+            return by_type[analysis_type]
+        return fallback
+
+    def _build_prompt(
+        self,
+        content: str,
+        analysis_type: str,
+        user_prompt: str,
+        title: str = ""
+    ) -> str:
+        """
+        Build the user prompt for the given analysis type.
+
+        Args:
+            content: Document (or section) text; cut to 6000 characters plus
+                a truncation marker when longer.
+            analysis_type: Key of the prompt template; unknown keys fall back
+                to the "summary" template.
+            user_prompt: Extra user instructions, appended when not blank.
+            title: Document or section title inserted into the template.
+
+        Returns:
+            The complete prompt text.
+        """
+
+        # Truncate the content so the request stays within the token limit
+        max_content_len = 6000
+        if len(content) > max_content_len:
+            content = content[:max_content_len] + "\n\n[内容已截断...]"
+
+        # All templates are f-strings, so every one of them is rendered on
+        # each call; only the selected one is used.
+        base_prompts = {
+            "summary": f"""请对以下文档进行摘要分析：
+
+文档标题：{title}
+
+文档内容：
+{content}
+
+请提供：
+1. 文档主要内容摘要（300字以内）
+2. 文档的目的和用途
+3. 适合的读者群体
+
+请用中文回答，结构清晰。""",
+
+            "outline": f"""请提取以下文档的大纲结构：
+
+文档标题：{title}
+
+文档内容：
+{content}
+
+请按层级列出文档大纲，用缩进表示层级关系。
+格式：
+一、一级标题
+   （一）二级标题
+      1. 三级标题
+
+请用中文回答。""",
+
+            "key_points": f"""请从以下文档中提取关键要点：
+
+文档标题：{title}
+
+文档内容：
+{content}
+
+请列出文档的关键要点（5-10条），每条用简洁的语言描述，并说明其在文档中的重要性。
+
+请用中文回答，格式清晰。""",
+
+            "questions": f"""请根据以下文档生成有助于理解内容的问题：
+
+文档标题：{title}
+
+文档内容：
+{content}
+
+请生成5-10个问题，帮助读者更好地理解文档内容。每个问题应该：
+1. 涵盖文档的重要信息点
+2. 易于理解和回答
+3. 具有思考价值
+
+请用中文回答。""",
+
+            # "tags" only looks at the first 3000 characters of the content
+            "tags": f"""请为以下文档生成标签：
+
+文档标题：{title}
+
+文档内容：
+{content[:3000]}
+
+请生成5-8个标签，用逗号分隔。标签应该反映：
+- 文档的主题领域
+- 文档的类型
+- 文档的关键特征
+
+请用中文回答，只需输出标签，不要其他内容。""",
+
+            # "qa" only looks at the first 4000 characters of the content
+            "qa": f"""请根据以下文档生成问答对：
+
+文档标题：{title}
+
+文档内容：
+{content[:4000]}
+
+请生成3-5个问答对，帮助读者通过问答形式理解文档内容。
+格式：
+Q1: 问题
+A1: 回答
+Q2: 问题
+A2: 回答
+
+请用中文回答，内容准确。""",
+
+            "statistics": f"""请分析以下政府统计公报中的数据和结论：
+
+文档标题：{title}
+
+文档内容：
+{content}
+
+请提供：
+1. 文档中涉及的主要统计数据（列出关键数字和指标）
+2. 数据的变化趋势（增长/下降）
+3. 重要的百分比和对比
+4. 数据来源和统计口径说明
+
+请用中文回答，数据准确。""",
+
+            "section": f"""请详细分析以下文档章节：
+
+章节标题：{title}
+
+章节内容：
+{content}
+
+请提供：
+1. 章节主要内容概括
+2. 关键信息和数据
+3. 与其他部分的关联（如有）
+4. 重要结论
+
+请用中文回答，分析深入。""",
+
+            # "charts" asks for JSON; the doubled braces are literal braces
+            # in the rendered prompt.
+            "charts": f"""请从以下文档中提取可用于可视化的数据，并生成适合制作图表的数据结构：
+
+文档标题：{title}
+
+文档内容：
+{content}
+
+请完成以下任务：
+1. 识别文档中的表格数据（Markdown表格格式）
+2. 识别文档中的关键统计数据（百分比、数量、趋势等）
+3. 识别可用于比较的分类数据
+
+请用 JSON 格式返回以下结构的数据（如果没有表格数据，返回空结构）：
+{{
+  "tables": [
+    {{
+      "description": "表格的描述",
+      "columns": ["列名1", "列名2", ...],
+      "rows": [
+        ["值1", "值2", ...],
+        ["值1", "值2", ...]
+      ]
+    }}
+  ],
+  "key_statistics": [
+    {{
+      "name": "指标名称",
+      "value": "数值",
+      "trend": "增长/下降/持平",
+      "description": "指标说明"
+    }}
+  ],
+  "chart_suggestions": [
+    {{
+      "chart_type": "bar/line/pie",
+      "title": "图表标题",
+      "data_source": "数据来源说明"
+    }}
+  ]
+}}
+
+请确保返回的是合法的 JSON 格式。"""
+        }
+
+        # Unknown analysis types use the summary template
+        prompt = base_prompts.get(analysis_type, base_prompts["summary"])
+
+        if user_prompt and user_prompt.strip():
+            prompt += f"\n\n用户额外需求：{user_prompt}"
+
+        return prompt
+
+    async def extract_outline(self, file_path: str) -> Dict[str, Any]:
+        """
+        Parse a file and return its section outline.
+
+        Returns:
+            ``{"success": True, "outline": [...]}`` with one entry per
+            top-level section (including its direct subsections), or
+            ``{"success": False, "error": <message>}`` on failure.
+        """
+        def preview(text):
+            # First 100 characters, with an ellipsis when something was cut
+            return text[:100] + "..." if len(text) > 100 else text
+
+        try:
+            parsed = self.parser.parse(file_path)
+            if not parsed.success:
+                return {"success": False, "error": parsed.error}
+
+            doc = parsed.data
+            tree = self.extract_sections(doc.get("content", ""), doc.get("titles", []))
+
+            # Structured outline: top-level sections with their direct children
+            outline = [
+                {
+                    "number": top.number,
+                    "title": top.title,
+                    "level": top.level,
+                    "line": top.line_start,
+                    "content_preview": preview(top.content),
+                    "subsections": [
+                        {
+                            "number": child.number,
+                            "title": child.title,
+                            "level": child.level,
+                            "line": child.line_start
+                        }
+                        for child in top.subsections
+                    ]
+                }
+                for top in tree
+            ]
+
+            return {"success": True, "outline": outline}
+
+        except Exception as e:
+            logger.error(f"大纲提取失败: {str(e)}")
+            return {"success": False, "error": str(e)}
+
+    async def extract_tables_summary(self, file_path: str) -> Dict[str, Any]:
+        """
+        Parse a file and summarise the tables it contains.
+
+        Returns:
+            ``{"success": True, "tables": [...], "table_count": n}`` with one
+            summary per table; ``tables`` is empty and a ``message`` is set
+            when the document has no tables; ``{"success": False, "error":
+            <message>}`` on failure.
+        """
+        try:
+            parsed = self.parser.parse(file_path)
+            if not parsed.success:
+                return {"success": False, "error": parsed.error}
+
+            tables = parsed.data.get("tables", [])
+            if not tables:
+                return {"success": True, "tables": [], "message": "文档中没有表格"}
+
+            # Key facts per table: headers, size, a short row preview
+            summaries = []
+            for position, table in enumerate(tables, start=1):
+                rows = table.get("rows", [])
+                summaries.append({
+                    "index": position,
+                    "headers": table.get("headers", []),
+                    "row_count": table.get("row_count", 0),
+                    "column_count": table.get("column_count", 0),
+                    "preview_rows": rows[:3],  # first 3 rows only
+                    "first_column": [row[0] if row else "" for row in rows[:5]]
+                })
+
+            return {
+                "success": True,
+                "tables": summaries,
+                "table_count": len(tables)
+            }
+
+        except Exception as e:
+            logger.error(f"表格提取失败: {str(e)}")
+            return {"success": False, "error": str(e)}
+
+
+# Module-level singleton; other modules import this instance
+markdown_ai_service = MarkdownAIService()
--- a/backend/app/services/rag_service.py
+++ b/backend/app/services/rag_service.py
@@ -40,14 +40,29 @@ class RAGService:
    def _init_embeddings(self):
        """初始化嵌入模型"""
        if self.embedding_model is None:
-            self.embedding_model = SentenceTransformer(settings.EMBEDDING_MODEL)
+            # 使用轻量级本地模型，避免网络问题
+            model_name = 'all-MiniLM-L6-v2'
+            try:
+                self.embedding_model = SentenceTransformer(model_name)
                self._dimension = self.embedding_model.get_sentence_embedding_dimension()
-            logger.info(f"RAG 嵌入模型初始化完成: {settings.EMBEDDING_MODEL}, 维度: {self._dimension}")
+                logger.info(f"RAG 嵌入模型初始化完成: {model_name}, 维度: {self._dimension}")
+            except Exception as e:
+                logger.warning(f"嵌入模型 {model_name} 加载失败: {e}")
+                # 如果本地模型也失败，使用简单hash作为后备
+                self.embedding_model = None
+                self._dimension = 384
+                logger.info("RAG 使用简化模式 (无向量嵌入)")

    def _init_vector_store(self):
        """初始化向量存储"""
        if self.index is None:
            self._init_embeddings()
+            if self.embedding_model is None:
+                # 无法加载嵌入模型，使用简化模式
+                self._dimension = 384
+                self.index = None
+                logger.warning("RAG 嵌入模型未加载，使用简化模式")
+            else:
                self.index = faiss.IndexIDMap(faiss.IndexFlatIP(self._dimension))
                logger.info("Faiss 向量存储初始化完成")

@@ -78,6 +93,11 @@ class RAGService:
        if not self._initialized:
            self._init_vector_store()

+        # 如果没有嵌入模型，只记录到日志
+        if self.embedding_model is None:
+            logger.debug(f"字段跳过索引 (无嵌入模型): {table_name}.{field_name}")
+            return
+
        text = f"表名: {table_name}, 字段: {field_name}, 描述: {field_description}"
        if sample_values:
            text += f", 示例值: {', '.join(sample_values)}"
@@ -100,6 +120,11 @@ class RAGService:
        if not self._initialized:
            self._init_vector_store()

+        # 如果没有嵌入模型，只记录到日志
+        if self.embedding_model is None:
+            logger.debug(f"文档跳过索引 (无嵌入模型): {doc_id}")
+            return
+
        doc = SimpleDocument(
            page_content=content,
            metadata=metadata or {"doc_id": doc_id}
--- a/backend/app/services/table_rag_service.py
+++ b/backend/app/services/table_rag_service.py
@@ -31,6 +31,178 @@ class TableRAGService:
        self.rag = rag_service
        self.excel_storage = excel_storage_service

+    def _extract_sheet_names_from_xml(self, file_path: str) -> List[str]:
+        """
+        Read the worksheet names straight from ``xl/workbook.xml``.
+
+        Used as a fallback for workbooks whose sheet list pandas/openpyxl
+        cannot parse. The lookup is namespace-agnostic, so both namespace
+        variants of SpreadsheetML are handled: strict
+        (``http://purl.oclc.org/ooxml/spreadsheetml/main``) and transitional
+        (``http://schemas.openxmlformats.org/spreadsheetml/2006/main``).
+
+        Args:
+            file_path: Path of the Excel file.
+
+        Returns:
+            Worksheet names in workbook order; an empty list when the archive
+            has no workbook part or cannot be read (a warning is logged).
+        """
+        import zipfile
+        from xml.etree import ElementTree as ET
+
+        try:
+            with zipfile.ZipFile(file_path, 'r') as z:
+                if 'xl/workbook.xml' not in z.namelist():
+                    return []
+                root = ET.fromstring(z.read('xl/workbook.xml'))
+
+            # "{*}" matches the element in any namespace
+            sheets = root.findall('.//{*}sheets/{*}sheet')
+            return [s.get('name') for s in sheets if s.get('name')]
+
+        except Exception as e:
+            logger.warning(f"从 XML 提取工作表失败: {file_path}, error: {e}")
+            return []
+
+    def _read_excel_sheet(self, file_path: str, sheet_name: str = None, header_row: int = 0) -> pd.DataFrame:
+        """
+        Read one worksheet into a DataFrame, with a raw-XML fallback.
+
+        pandas is tried first. If it raises or returns an empty frame, the
+        worksheet XML inside the .xlsx archive is parsed directly.
+
+        Args:
+            file_path: Path of the Excel file.
+            sheet_name: Worksheet to read; None means the first worksheet.
+                In the XML fallback an unknown name also selects the first
+                worksheet.
+            header_row: 0-based index of the header row.
+
+        Returns:
+            The sheet as a DataFrame (empty when the XML fallback finds no
+            data rows). In the XML fallback numeric cells keep the text
+            stored in the file; they are not converted to numbers.
+
+        Raises:
+            ValueError: If the fallback cannot find a worksheet list or the
+                worksheet part.
+            Exception: Any other fallback error is logged and re-raised.
+        """
+        import zipfile
+        from xml.etree import ElementTree as ET
+
+        try:
+            # sheet_name=None would make pandas return a dict with ALL
+            # sheets instead of a DataFrame, so "not specified" is mapped to
+            # the first sheet (index 0).
+            df = pd.read_excel(
+                file_path,
+                sheet_name=sheet_name if sheet_name is not None else 0,
+                header=header_row
+            )
+            if df is not None and not df.empty:
+                return df
+        except Exception as e:
+            # Best effort: fall through to the XML parser, but leave a trace
+            logger.warning(f"pandas 读取 Excel 失败，改用 XML 方式: {file_path}, error: {e}")
+
+        # pandas failed or returned nothing: parse the XML directly
+        logger.info(f"使用 XML 方式读取 Excel: {file_path}")
+
+        def column_index(letters):
+            """Convert column letters (A, B, ..., Z, AA, ...) to a 1-based index."""
+            index = 0
+            for ch in letters.upper():
+                index = index * 26 + (ord(ch) - ord('A') + 1)
+            return index
+
+        def joined_text(node):
+            """Concatenate plain (<t>) and rich-text run (<r><t>) text of a string item."""
+            parts = node.findall('{*}t') + node.findall('{*}r/{*}t')
+            return ''.join(part.text or '' for part in parts)
+
+        def cell_value(cell, shared_strings):
+            """Decode one <c> element; None for empty cells."""
+            cell_type = cell.get('t', 'n')
+            if cell_type == 'inlineStr':
+                inline = cell.find('{*}is')
+                return (joined_text(inline) or None) if inline is not None else None
+            v = cell.find('{*}v')
+            if v is None or not v.text:
+                return None
+            if cell_type == 's':
+                # Index into the shared string table
+                try:
+                    return shared_strings[int(v.text)]
+                except (ValueError, IndexError):
+                    return v.text
+            if cell_type == 'b':
+                return v.text == '1'
+            # Number or any other type: keep the stored text
+            return v.text
+
+        try:
+            with zipfile.ZipFile(file_path, 'r') as z:
+                sheet_names = self._extract_sheet_names_from_xml(file_path)
+                if not sheet_names:
+                    raise ValueError("无法从 Excel 文件中找到工作表")
+
+                target_sheet = sheet_name if sheet_name and sheet_name in sheet_names else sheet_names[0]
+                # NOTE: assumes the n-th sheet entry is stored as
+                # xl/worksheets/sheet<n>.xml; the workbook relationships are
+                # not consulted.
+                sheet_index = sheet_names.index(target_sheet) + 1
+
+                members = set(z.namelist())
+
+                shared_strings = []
+                if 'xl/sharedStrings.xml' in members:
+                    ss_root = ET.fromstring(z.read('xl/sharedStrings.xml'))
+                    for si in ss_root.findall('.//{*}si'):
+                        shared_strings.append(joined_text(si))
+
+                sheet_file = f'xl/worksheets/sheet{sheet_index}.xml'
+                if sheet_file not in members:
+                    raise ValueError(f"工作表文件 {sheet_file} 不存在")
+
+                root = ET.fromstring(z.read(sheet_file))
+
+            # header_row is 0-based, the "r" attribute of a row is 1-based
+            header_number = header_row + 1
+            headers = {}
+            rows_data = []
+            for row in root.findall('.//{*}row'):
+                row_idx = int(row.get('r', 0))
+                if row_idx > header_number:
+                    is_header = False
+                elif header_row >= 0 and row_idx == header_number and not headers:
+                    is_header = True
+                else:
+                    continue  # rows above the header
+
+                row_cells = {}
+                for cell in row.findall('{*}c'):
+                    col_letters = ''.join(filter(str.isalpha, cell.get('r', '')))
+                    row_cells[col_letters] = cell_value(cell, shared_strings)
+
+                if is_header:
+                    headers = {col: val for col, val in row_cells.items() if val is not None}
+                elif row_cells:
+                    rows_data.append(row_cells)
+
+            if not rows_data:
+                return pd.DataFrame()
+
+            # Sort columns by spreadsheet position so that sparse rows cannot
+            # change the column order
+            ordered_columns = sorted(
+                {col for row_cells in rows_data for col in row_cells},
+                key=column_index
+            )
+            df = pd.DataFrame(rows_data, columns=ordered_columns)
+
+            # Columns without a header cell keep their column letters
+            df.columns = [headers.get(col, col) for col in df.columns]
+
+            logger.info(f"XML 解析完成: {len(df)} 行, {len(df.columns)} 列")
+            return df
+
+        except Exception as e:
+            logger.error(f"XML 解析 Excel 失败: {e}")
+            raise
+
    async def generate_field_description(
        self,
        table_name: str,
@@ -126,26 +298,49 @@ class TableRAGService:
        }

        try:
-            # 1. 读取 Excel
+            # 1. 先检查 Excel 文件是否有效
+            logger.info(f"正在检查Excel文件: {file_path}")
+            try:
+                xls_file = pd.ExcelFile(file_path)
+                sheet_names = xls_file.sheet_names
+                logger.info(f"Excel文件工作表: {sheet_names}")
+
+                # 如果 sheet_names 为空，尝试从 XML 中手动提取
+                if not sheet_names:
+                    sheet_names = self._extract_sheet_names_from_xml(file_path)
+                    logger.info(f"从XML提取工作表: {sheet_names}")
+
+                if not sheet_names:
+                    return {"success": False, "error": "Excel 文件没有工作表"}
+            except Exception as e:
+                logger.error(f"读取Excel文件失败: {file_path}, error: {e}")
+                return {"success": False, "error": f"无法读取Excel文件: {str(e)}"}
+
+            # 2. 读取 Excel
            if sheet_name:
-                df = pd.read_excel(file_path, sheet_name=sheet_name, header=header_row)
-            else:
-                df = pd.read_excel(file_path, header=header_row)
+                # 验证指定的sheet_name是否存在
+                if sheet_name not in sheet_names:
+                    logger.warning(f"指定的工作表 '{sheet_name}' 不存在，使用第一个工作表: {sheet_names[0]}")
+                    sheet_name = sheet_names[0]
+            df = self._read_excel_sheet(file_path, sheet_name=sheet_name, header_row=header_row)
+
+            logger.info(f"读取到数据: {len(df)} 行, {len(df.columns)} 列")

            if df.empty:
                return {"success": False, "error": "Excel 文件为空"}

            # 清理列名
            df.columns = [str(c) for c in df.columns]
-            table_name = excel_storage._sanitize_table_name(filename)
+            table_name = self.excel_storage._sanitize_table_name(filename)
            results["table_name"] = table_name
            results["field_count"] = len(df.columns)
+            logger.info(f"表名: {table_name}, 字段数: {len(df.columns)}")

-            # 2. 初始化 RAG (如果需要)
+            # 3. 初始化 RAG (如果需要)
            if not self.rag._initialized:
                self.rag._init_vector_store()

-            # 3. 为每个字段生成描述并索引
+            # 4. 为每个字段生成描述并索引
            all_fields_data = {}
            for col in df.columns:
                # 采样示例值
@@ -187,7 +382,8 @@ class TableRAGService:
                    logger.error(error_msg)
                    results["errors"].append(error_msg)

-            # 4. 存储到 MySQL
+            # 5. 存储到 MySQL
+            logger.info(f"开始存储到MySQL: {filename}")
            store_result = await self.excel_storage.store_excel(
                file_path=file_path,
                filename=filename,
--- a/docs/test/test.md
+++ b/docs/test/test.md
@@ -1,113 +0,0 @@
- ✅ Excel 文件解析功能已完成并测试通过
-
-  已完成的工作
-
-  后端部分
-
-  1. 文件服务层 (backend/app/services/file_service.py)
-
-    - 文件保存、读取、删除功能
-    - 文件信息获取
-  2. Excel 解析模块 (backend/app/core/document_parser/)
-
-    - base.py - 解析器基类
-    - xlsx_parser.py - Excel 文件解析器
-    - utils.py - 工具函数
-  3. API 接口 (backend/app/api/endpoints/upload.py)
-
-    - POST /upload/excel - 上传并解析 Excel 文件
-    - GET /upload/excel/preview/{file_path} - 获取预览
-    - DELETE /upload/file - 删除文件
-  4. 主应用配置 (backend/app/main.py)
-
-    - CORS 中间件配置
-    - API 路由注册
-
-  前端部分
-
-  1. 后端 API 封装 (frontendTest/app-a6ww9j3ja3nl/src/db/backend-api.ts)
-
-    - Excel 上传、预览、删除函数
-  2. Excel 解析页面 (frontendTest/app-a6ww9j3ja3nl/src/pages/ExcelParse.tsx)
-
-    - 拖拽上传界面
-    - 解析选项配置
-    - 数据表格展示
-    - 多工作表支持
-  3. 路由配置 (frontendTest/app-a6ww9j3ja3nl/src/routes.tsx)
-
-    - 添加 /excel-parse 路由
-  4. 环境配置 (.env)
-
-    - 后端 API URL 配置
-
-  测试结果
-
-  后端 API 测试通过：
-  - ✅ 健康检查：http://127.0.0.1:8000/health
-  - ✅ Excel 上传解析：成功解析 5 行 6 列的测试数据
-
-  访问方式
-
-  1. 后端 API 文档：http://127.0.0.1:8000/api/v1/docs
-  2. 前端页面：
-
-    - 启动前端：cd frontendTest/app-a6ww9j3ja3nl && npm run dev
-    - 访问 Excel 解析页面：/excel-parse
-
-  下一步建议
-
-  1. 启动前端开发服务器测试完整的前端界面
-  2. 添加更多文档格式解析支持（docx, md, txt）
-  3. 完善错误处理和用户提示
-
-✻ Worked for 4m 18s · 1 background task still running (↓ to manage)
-
-> 如何启动后端呢
-
-● 后端启动步骤如下：
-
-  方法一：使用命令行启动（开发模式）
-
-  # 1. 进入后端目录
-  cd H:/OwnProject/FilesReadSysteam/backend
-
-  # 2. 激活虚拟环境
-  # PowerShell:
-  .\venv\Scripts\Activate.ps1
-
-  # 或 CMD:
-  .\venv\Scripts\Activate.bat
-
-  # 3. 启动后端服务器
-  python -m uvicorn app.main:app --host 127.0.0.1 --port 8000 --reload
-
-  方法二：直接使用虚拟环境中的 Python
-
-  cd H:/OwnProject/FilesReadSysteam/backend
-  ./venv/Scripts/python.exe -m uvicorn app.main:app --host 127.0.0.1 --port 8000 --reload
-
-  方法三：直接运行 main.py（开发调试）
-
-  cd H:/OwnProject/FilesReadSysteam/backend
-  ./venv/Scripts/python.exe app/main.py
-
-  启动成功的标志
-
-  INFO:     Uvicorn running on http://127.0.0.1:8000
-  INFO:     Started server process [xxxxx]
-  INFO:     Waiting for application startup.
-  INFO:     Application startup complete.
-
-  验证服务是否正常
-
-  # 健康检查
-  curl http://127.0.0.1:8000/health
-
-  # 访问 API 文档
-  # 浏览器打开: http://127.0.0.1:8000/api/v1/docs
-
---
-  当前状态：后端已在后台运行（任务 ID: b22jkg69j），可以直接访问 http://127.0.0.1:8000
-
-  需要停止的话告诉我即可。
--- a/frontend/src/db/backend-api.ts
+++ b/frontend/src/db/backend-api.ts
@@ -166,6 +166,66 @@ export interface AIAnalysisResult {
  error?: string;
 }

+// ==================== Markdown AI 分析类型 ====================
+
+/**
+ * Parsed JSON result resolved by `aiApi.analyzeMarkdown`.
+ * Only `success` is required; every other field is optional and should be
+ * checked before use.
+ */
+export interface AIMarkdownAnalyzeResult {
+  success: boolean;
+  filename?: string;
+  analysis_type?: string;
+  // Presumably identifies the section the analysis was limited to — TODO confirm against backend.
+  section?: string;
+  word_count?: number;
+  // Structural counters of the analysed document.
+  structure?: {
+    title_count?: number;
+    code_block_count?: number;
+    table_count?: number;
+    section_count?: number;
+  };
+  sections?: MarkdownSection[];
+  // Analysis text; the Documents page renders it as Markdown.
+  analysis?: string;
+  // Chart-oriented payload; presumably populated for the 'charts' analysis type — TODO confirm.
+  chart_data?: {
+    tables?: Array<{
+      description?: string;
+      columns?: string[];
+      rows?: string[][];
+      visualization?: {
+        statistics?: any;
+        charts?: any;
+        distributions?: any;
+      };
+    }>;
+    key_statistics?: Array<{
+      name?: string;
+      value?: string;
+      trend?: string;
+      description?: string;
+    }>;
+    chart_suggestions?: Array<{
+      chart_type?: string;
+      title?: string;
+      data_source?: string;
+    }>;
+  };
+  // Error message; the UI shows it when `success` is false.
+  error?: string;
+}
+
+/**
+ * One heading/section entry of a Markdown document outline.
+ * The type is recursive: nested headings are carried in `subsections`.
+ */
+export interface MarkdownSection {
+  // Section identifier; the UI sends it back as the `section_number` query parameter.
+  number: string;
+  title: string;
+  // Presumably the heading depth — TODO confirm against backend.
+  level: number;
+  content_preview?: string;
+  // Presumably line positions of the section in the source document — TODO confirm indexing base.
+  line_start: number;
+  line_end?: number;
+  subsections?: MarkdownSection[];
+}
+
+/**
+ * Parsed JSON result resolved by `aiApi.getMarkdownOutline`.
+ */
+export interface MarkdownOutlineResult {
+  success: boolean;
+  // Outline sections; the Documents page only uses them when `success` is true.
+  outline?: MarkdownSection[];
+  error?: string;
+}
+
+/**
+ * Analysis modes selectable for a Markdown document; the chosen value is sent
+ * to the backend as the `analysis_type` query parameter.
+ */
+export type MarkdownAnalysisType = 'summary' | 'outline' | 'key_points' | 'questions' | 'tags' | 'qa' | 'statistics' | 'section' | 'charts';
+
 export interface AIExcelAnalyzeResult {
  success: boolean;
  excel?: {
@@ -842,6 +902,159 @@ export const aiApi = {
    }
  },

+  /**
+   * Upload a Markdown file and run a (non-streaming) AI analysis on it.
+   *
+   * The file travels as multipart form data; the analysis options travel as
+   * query parameters and are only included when they are non-empty.
+   *
+   * @param file    Markdown file to analyse.
+   * @param options Optional analysis type, custom prompt and section number.
+   * @returns The parsed JSON body of the response.
+   * @throws Error carrying the backend `detail` (or a generic message) when
+   *         the response status is not OK; network errors are re-thrown.
+   */
+  async analyzeMarkdown(
+    file: File,
+    options: {
+      analysisType?: MarkdownAnalysisType;
+      userPrompt?: string;
+      sectionNumber?: string;
+    } = {}
+  ): Promise<AIMarkdownAnalyzeResult> {
+    const { analysisType, userPrompt, sectionNumber } = options;
+
+    // Query-string entries in the order the backend parameters are declared here.
+    const queryEntries: Array<[string, string | undefined]> = [
+      ['analysis_type', analysisType],
+      ['user_prompt', userPrompt],
+      ['section_number', sectionNumber],
+    ];
+    const query = new URLSearchParams();
+    for (const [key, value] of queryEntries) {
+      if (value) {
+        query.append(key, value);
+      }
+    }
+
+    const payload = new FormData();
+    payload.append('file', file);
+
+    const endpoint = `${BACKEND_BASE_URL}/ai/analyze/md?${query.toString()}`;
+
+    try {
+      const res = await fetch(endpoint, { method: 'POST', body: payload });
+
+      if (res.ok) {
+        return await res.json();
+      }
+
+      const failure = await res.json();
+      throw new Error(failure.detail || 'Markdown AI 分析失败');
+    } catch (err) {
+      console.error('Markdown AI 分析失败:', err);
+      throw err;
+    }
+  },
+
+  /**
+   * Upload a Markdown file and stream its AI analysis as Server-Sent Events.
+   *
+   * The response body is read incrementally. Bytes are decoded in streaming
+   * mode and buffered until a full line is available, so multi-byte UTF-8
+   * characters and `data:` lines that straddle two network chunks are not
+   * corrupted or dropped.
+   *
+   * @param file    Markdown file to analyse.
+   * @param options Optional analysis type, custom prompt and section number,
+   *                sent as query parameters when non-empty.
+   * @param onChunk Optional callback invoked with `{ type: 'content', delta }`
+   *                for each content event and `{ type: 'error', error }` for
+   *                each event carrying an `error` field.
+   * @returns The accumulated text, or the `full_response` of the `done` event
+   *          when the server supplies one.
+   * @throws Error when the response status is not OK or the body cannot be
+   *         read as a stream; network errors are re-thrown.
+   */
+  async analyzeMarkdownStream(
+    file: File,
+    options: {
+      analysisType?: MarkdownAnalysisType;
+      userPrompt?: string;
+      sectionNumber?: string;
+    } = {},
+    onChunk?: (chunk: { type: string; delta?: string; error?: string }) => void
+  ): Promise<string> {
+    const formData = new FormData();
+    formData.append('file', file);
+
+    const params = new URLSearchParams();
+    if (options.analysisType) {
+      params.append('analysis_type', options.analysisType);
+    }
+    if (options.userPrompt) {
+      params.append('user_prompt', options.userPrompt);
+    }
+    if (options.sectionNumber) {
+      params.append('section_number', options.sectionNumber);
+    }
+
+    const url = `${BACKEND_BASE_URL}/ai/analyze/md/stream?${params.toString()}`;
+
+    try {
+      const response = await fetch(url, {
+        method: 'POST',
+        body: formData,
+      });
+
+      if (!response.ok) {
+        const error = await response.json();
+        throw new Error(error.detail || 'Markdown AI 流式分析失败');
+      }
+
+      const reader = response.body?.getReader();
+      if (!reader) throw new Error('无法读取响应流');
+
+      const decoder = new TextDecoder();
+      let fullResponse = '';
+      // Text received so far that does not yet end with a newline.
+      let pending = '';
+
+      // Interpret one complete SSE line; anything that is not a `data: ` line is ignored.
+      const handleLine = (line: string): void => {
+        if (!line.startsWith('data: ')) return;
+
+        const data = line.slice(6);
+        if (data === '[DONE]') return;
+
+        let parsed: any;
+        try {
+          parsed = JSON.parse(data);
+        } catch {
+          // Malformed event payload: skip it rather than abort the whole stream.
+          return;
+        }
+
+        if (parsed.type === 'content' && parsed.delta) {
+          fullResponse += parsed.delta;
+          onChunk?.({ type: 'content', delta: parsed.delta });
+        } else if (parsed.type === 'done') {
+          fullResponse = parsed.full_response || fullResponse;
+        } else if (parsed.error) {
+          onChunk?.({ type: 'error', error: parsed.error });
+        }
+      };
+
+      while (true) {
+        const { done, value } = await reader.read();
+        if (done) break;
+
+        // `stream: true` keeps an incomplete multi-byte sequence inside the
+        // decoder until the next chunk completes it.
+        pending += decoder.decode(value, { stream: true });
+
+        const lines = pending.split('\n');
+        // The last element is the (possibly empty) unfinished line; carry it over.
+        pending = lines.pop() ?? '';
+
+        for (const line of lines) {
+          handleLine(line);
+        }
+      }
+
+      // Flush whatever the decoder still holds and handle a final unterminated line.
+      pending += decoder.decode();
+      if (pending) {
+        handleLine(pending);
+      }
+
+      return fullResponse;
+    } catch (error) {
+      console.error('Markdown AI 流式分析失败:', error);
+      throw error;
+    }
+  },
+
+  /**
+   * Upload a Markdown file and fetch its outline (section hierarchy).
+   *
+   * The request uses POST because it carries the file as multipart form data:
+   * `fetch` rejects any GET/HEAD request that has a body with a TypeError
+   * before anything is sent.
+   * NOTE(review): confirm the backend route `/ai/analyze/md/outline` is
+   * registered for POST.
+   *
+   * @param file Markdown file whose outline is requested.
+   * @returns The parsed JSON body of the response.
+   * @throws Error carrying the backend `detail` (or a generic message) when
+   *         the response status is not OK; network errors are re-thrown.
+   */
+  async getMarkdownOutline(file: File): Promise<MarkdownOutlineResult> {
+    const formData = new FormData();
+    formData.append('file', file);
+
+    const url = `${BACKEND_BASE_URL}/ai/analyze/md/outline`;
+
+    try {
+      const response = await fetch(url, {
+        method: 'POST',
+        body: formData,
+      });
+
+      if (!response.ok) {
+        const error = await response.json();
+        throw new Error(error.detail || '获取 Markdown 大纲失败');
+      }
+
+      return await response.json();
+    } catch (error) {
+      console.error('获取 Markdown 大纲失败:', error);
+      throw error;
+    }
+  },
+
  /**
   * 生成统计信息和图表
   */
--- a/frontend/src/pages/Documents.tsx
+++ b/frontend/src/pages/Documents.tsx
@@ -19,7 +19,11 @@ import {
  TrendingUp,
  Download,
  Brain,
-  Settings2
+  Settings2,
+  List,
+  MessageSquareCode,
+  Tag,
+  HelpCircle
 } from 'lucide-react';
 import { Button } from '@/components/ui/button';
 import { Input } from '@/components/ui/input';
@@ -33,7 +37,7 @@ import { Checkbox } from '@/components/ui/checkbox';
 import { toast } from 'sonner';
 import { cn } from '@/lib/utils';
 import { Skeleton } from '@/components/ui/skeleton';
-import { backendApi, type ExcelParseResult, aiApi } from '@/db/backend-api';
+import { backendApi, type ExcelParseResult, type AIMarkdownAnalyzeResult, type MarkdownSection, aiApi } from '@/db/backend-api';
 import {
  Table as TableComponent,
  TableBody,
@@ -78,6 +82,15 @@ const Documents: React.FC = () => {
  const [analysisCharts, setAnalysisCharts] = useState<any>(null);
  const [analysisTypes, setAnalysisTypes] = useState<Array<{ value: string; label: string; description: string }>>([]);

+  // Markdown AI 分析相关状态
+  const [mdAnalysis, setMdAnalysis] = useState<AIMarkdownAnalyzeResult | null>(null);
+  const [mdAnalysisType, setMdAnalysisType] = useState<'summary' | 'outline' | 'key_points' | 'questions' | 'tags' | 'qa' | 'statistics' | 'section' | 'charts'>('summary');
+  const [mdUserPrompt, setMdUserPrompt] = useState('');
+  const [mdSections, setMdSections] = useState<MarkdownSection[]>([]);
+  const [mdSelectedSection, setMdSelectedSection] = useState<string>('');
+  const [mdStreaming, setMdStreaming] = useState(false);
+  const [mdStreamingContent, setMdStreamingContent] = useState('');
+
  // 解析选项
  const [parseOptions, setParseOptions] = useState({
    parseAllSheets: false,
@@ -144,6 +157,9 @@ const Documents: React.FC = () => {
    setAiAnalysis(null);
    setAnalysisCharts(null);
    setExpandedSheet(null);
+    setMdAnalysis(null);
+    setMdSections([]);
+    setMdStreamingContent('');

    const ext = file.name.split('.').pop()?.toLowerCase();

@@ -163,6 +179,9 @@ const Documents: React.FC = () => {
        } else {
          toast.error(result.error || '解析失败');
        }
+      } else if (ext === 'md' || ext === 'markdown') {
+        // Markdown 文件：获取大纲
+        await fetchMdOutline();
      } else {
        // 其他文档使用通用上传接口
        const result = await backendApi.uploadDocument(file);
@@ -403,6 +422,106 @@ const Documents: React.FC = () => {
    }
  };

+  // True when the file name's last dot-separated segment is "md" or "markdown" (case-insensitive).
+  const isMarkdownFile = (filename: string) => {
+    const suffix = filename.split('.').pop()?.toLowerCase();
+    return suffix !== undefined && ['md', 'markdown'].includes(suffix);
+  };
+
+  // Run the non-streaming Markdown AI analysis for the uploaded file and
+  // store the result; failures are reported through toasts.
+  const handleMdAnalyze = async () => {
+    if (!uploadedFile || !isMarkdownFile(uploadedFile.name)) {
+      toast.error('请先上传 Markdown 文件');
+      return;
+    }
+
+    setAnalyzing(true);
+    setMdAnalysis(null);
+
+    // An empty section selection means "whole document", so it is sent as undefined.
+    const requestOptions = {
+      analysisType: mdAnalysisType,
+      userPrompt: mdUserPrompt,
+      sectionNumber: mdSelectedSection || undefined
+    };
+
+    try {
+      const outcome = await aiApi.analyzeMarkdown(uploadedFile, requestOptions);
+
+      if (!outcome.success) {
+        toast.error(outcome.error || 'AI 分析失败');
+        return;
+      }
+
+      toast.success('Markdown AI 分析完成');
+      setMdAnalysis(outcome);
+    } catch (err: any) {
+      toast.error(err.message || 'AI 分析失败');
+    } finally {
+      // Runs on every exit path, including the early return above.
+      setAnalyzing(false);
+    }
+  };
+
+  // Run the streaming Markdown AI analysis for the uploaded file, appending
+  // each received delta to the streaming-content state as it arrives.
+  const handleMdAnalyzeStream = async () => {
+    if (!uploadedFile || !isMarkdownFile(uploadedFile.name)) {
+      toast.error('请先上传 Markdown 文件');
+      return;
+    }
+
+    setAnalyzing(true);
+    setMdStreaming(true);
+    setMdStreamingContent('');
+    setMdAnalysis(null);
+
+    // An empty section selection means "whole document", so it is sent as undefined.
+    const requestOptions = {
+      analysisType: mdAnalysisType,
+      userPrompt: mdUserPrompt,
+      sectionNumber: mdSelectedSection || undefined
+    };
+
+    // Per-event callback: accumulate content deltas, surface error events as toasts.
+    const onStreamEvent = (event: { type: string; delta?: string; error?: string }) => {
+      if (event.type === 'content' && event.delta) {
+        setMdStreamingContent(previous => previous + event.delta);
+        return;
+      }
+      if (event.type === 'error') {
+        toast.error(event.error || '流式分析出错');
+      }
+    };
+
+    try {
+      await aiApi.analyzeMarkdownStream(uploadedFile, requestOptions, onStreamEvent);
+    } catch (err: any) {
+      toast.error(err.message || 'AI 分析失败');
+    } finally {
+      setAnalyzing(false);
+      setMdStreaming(false);
+    }
+  };
+
+  // Fetch the outline (section list) of a Markdown file and store it in state.
+  //
+  // `file` is optional: when omitted, the `uploadedFile` state is used. A
+  // caller that has just called the state setter in the same event should pass
+  // the file explicitly, because this closure still sees the previous value.
+  // NOTE(review): the upload handler currently calls this without an argument;
+  // pass its local `file` there so the new upload is the one outlined.
+  // Errors are logged only; a missing outline is not treated as fatal.
+  const fetchMdOutline = async (file?: File | null) => {
+    const target = file ?? uploadedFile;
+    if (!target || !isMarkdownFile(target.name)) return;
+
+    try {
+      const result = await aiApi.getMarkdownOutline(target);
+      if (result.success && result.outline) {
+        setMdSections(result.outline);
+      }
+    } catch (error) {
+      console.error('获取大纲失败:', error);
+    }
+  };
+
+  // Map a Markdown analysis type to its 20px icon element; unknown types get the Sparkles icon.
+  const getMdAnalysisIcon = (type: string) => {
+    const iconByType: Record<string, typeof Sparkles> = {
+      summary: FileText,
+      outline: List,
+      key_points: TrendingUp,
+      statistics: TrendingUp,
+      section: FileText,
+      questions: MessageSquareCode,
+      tags: Tag,
+      qa: HelpCircle,
+      charts: TrendingUp,
+    };
+
+    // Own-property check so inherited keys such as "toString" fall through to the default.
+    const Icon = Object.prototype.hasOwnProperty.call(iconByType, type)
+      ? iconByType[type]
+      : Sparkles;
+
+    return <Icon size={20} />;
+  };
+
  const formatFileSize = (bytes: number): string => {
    if (bytes === 0) return '0 B';
    const k = 1024;
@@ -600,6 +719,98 @@ const Documents: React.FC = () => {
            </Card>
          )}

+          {/* Markdown AI 分析选项 */}
+          {uploadedFile && isMarkdownFile(uploadedFile.name) && (
+            <Card className="border-none shadow-md bg-gradient-to-br from-purple-500/5 to-primary/5">
+              <CardHeader className="pb-4">
+                <CardTitle className="flex items-center gap-2">
+                  <Sparkles className="text-purple-500" size={20} />
+                  Markdown AI 分析
+                </CardTitle>
+              </CardHeader>
+              <CardContent className="space-y-4">
+                {/* 章节选择 */}
+                {mdSections.length > 0 && (
+                  <div className="space-y-2">
+                    <Label htmlFor="md-section" className="text-sm">指定章节（可选）</Label>
+                    <Select value={mdSelectedSection} onValueChange={setMdSelectedSection}>
+                      <SelectTrigger id="md-section" className="bg-background">
+                        <SelectValue placeholder="全文分析" />
+                      </SelectTrigger>
+                      <SelectContent>
+                        <SelectItem value="">全文分析</SelectItem>
+                        {mdSections.map((section) => (
+                          <SelectItem key={section.number} value={section.number}>
+                            {section.number}、{section.title}
+                          </SelectItem>
+                        ))}
+                      </SelectContent>
+                    </Select>
+                  </div>
+                )}
+                <div className="space-y-2">
+                  <Label htmlFor="md-analysis-type" className="text-sm">分析类型</Label>
+                  <Select value={mdAnalysisType} onValueChange={(value: any) => setMdAnalysisType(value)}>
+                    <SelectTrigger id="md-analysis-type" className="bg-background">
+                      <SelectValue />
+                    </SelectTrigger>
+                    <SelectContent>
+                      {[
+                        { value: 'summary', label: '文档摘要', desc: '主要内容摘要' },
+                        { value: 'outline', label: '大纲提取', desc: '提取文档结构' },
+                        { value: 'key_points', label: '关键要点', desc: '提取关键信息' },
+                        { value: 'statistics', label: '统计分析', desc: '统计数据分析' },
+                        { value: 'section', label: '章节分析', desc: '分章节详细分析' },
+                        { value: 'questions', label: '生成问题', desc: '生成理解性问题' },
+                        { value: 'tags', label: '生成标签', desc: '提取主题标签' },
+                        { value: 'qa', label: '问答对', desc: '生成问答内容' },
+                        { value: 'charts', label: '数据图表', desc: '生成可视化数据' }
+                      ].map(type => (
+                        <SelectItem key={type.value} value={type.value}>
+                          <div className="flex items-center gap-2">
+                            {getMdAnalysisIcon(type.value)}
+                            <div className="flex flex-col">
+                              <span className="font-medium">{type.label}</span>
+                              <span className="text-xs text-muted-foreground">{type.desc}</span>
+                            </div>
+                          </div>
+                        </SelectItem>
+                      ))}
+                    </SelectContent>
+                  </Select>
+                </div>
+                <div className="space-y-2">
+                  <Label htmlFor="md-user-prompt" className="text-sm">自定义提示词（可选）</Label>
+                  <Textarea
+                    id="md-user-prompt"
+                    placeholder="例如：请重点关注技术实现部分..."
+                    value={mdUserPrompt}
+                    onChange={(e) => setMdUserPrompt(e.target.value)}
+                    className="bg-background resize-none"
+                    rows={2}
+                  />
+                </div>
+                <div className="flex gap-2">
+                  <Button
+                    onClick={handleMdAnalyze}
+                    disabled={analyzing}
+                    className="flex-1 bg-gradient-to-r from-purple-500 to-primary hover:from-purple-500/90 hover:to-primary/90"
+                  >
+                    {analyzing && !mdStreaming ? <><Loader2 className="mr-2 animate-spin" size={16} /> 分析中...</> : <><Sparkles className="mr-2" size={16} />普通分析</>}
+                  </Button>
+                  <Button
+                    onClick={handleMdAnalyzeStream}
+                    disabled={analyzing}
+                    variant="outline"
+                    className="flex-1"
+                  >
+                    {analyzing && mdStreaming ? <><Loader2 className="mr-2 animate-spin" size={16} /> 流式...</> : <><Sparkles className="mr-2" size={16} />流式分析</>}
+                  </Button>
+                </div>
+              </CardContent>
+            </Card>
+          )}
+
          {/* 数据操作 */}
          {parseResult?.success && (
            <Card className="border-none shadow-md bg-gradient-to-br from-emerald-500/5 to-blue-500/5">
@@ -661,6 +872,45 @@ const Documents: React.FC = () => {
            </Card>
          )}

+          {/* Markdown AI 分析结果 */}
+          {(mdAnalysis || mdStreamingContent) && (
+            <Card className="border-none shadow-md border-l-4 border-l-purple-500">
+              <CardHeader>
+                <div className="flex items-center justify-between">
+                  <div className="space-y-1">
+                    <CardTitle className="flex items-center gap-2">
+                      <Sparkles className="text-purple-500" size={20} />
+                      Markdown AI 分析结果
+                      {mdStreaming && <Badge variant="default" className="ml-2 bg-purple-500">流式输出中</Badge>}
+                    </CardTitle>
+                    {mdAnalysis && (
+                      <CardDescription>
+                        {mdAnalysis.filename} • {mdAnalysis.word_count || 0} 字 • {mdAnalysis.analysis_type}
+                        {mdAnalysis.section && ` • ${mdAnalysis.section}`}
+                      </CardDescription>
+                    )}
+                  </div>
+                  {mdAnalysis?.structure && (
+                    <Badge variant="secondary">
+                      {mdAnalysis.structure.title_count || 0} 标题 • {mdAnalysis.structure.section_count || 0} 章节
+                    </Badge>
+                  )}
+                </div>
+              </CardHeader>
+              <CardContent className="max-h-[500px] overflow-y-auto">
+                {/* 流式内容优先显示 */}
+                {mdStreamingContent && (
+                  <div className="animate-pulse text-sm text-muted-foreground mb-4">
+                    流式输出中...
+                  </div>
+                )}
+                {mdStreamingContent && <Markdown content={mdStreamingContent} />}
+                {mdAnalysis?.analysis && !mdStreamingContent && <Markdown content={mdAnalysis.analysis} />}
+                {!mdAnalysis?.success && !mdStreamingContent && <p className="text-sm text-destructive">{mdAnalysis?.error || '分析失败'}</p>}
+              </CardContent>
+            </Card>
+          )}
+
          {/* 图表显示 */}
          {analysisCharts && (
            <Card className="border-none shadow-md border-l-4 border-l-indigo-500">