diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4c224b9 --- /dev/null +++ b/.gitignore @@ -0,0 +1,38 @@ +/.git/ +/.idea/ +/.vscode/ +/backend/venv/ +/backend/command/ +/backend/.env +/backend/.env.local +/backend/.env.*.local +/backend/app/__pycache__/* +/backend/data/uploads +/backend/data/charts +/backend/data/logs + +/frontend/node_modules/ +/frontend/dist/ +/frontend/build/ +/frontend/.vscode/ +/frontend/.idea/ +/frontend/.env +/frontend/*.log +/技术路线.md +/开发路径.md +/开发日志_2026-03-16.md +/frontendTest/ +/docs/ +/frontend/src/api/ +/frontend/src/api/index.js +/frontend/src/api/index.ts +/frontend/src/api/index.tsx +/frontend/src/api/index.py +/frontend/src/api/index.go +/frontend/src/api/index.java +/docs/ +/frontend - 副本/* +/supabase.txt + +**/__pycache__/* +**.pyc diff --git a/backend/app/api/endpoints/ai_analyze.py b/backend/app/api/endpoints/ai_analyze.py index 16e1979..49ab0cd 100644 --- a/backend/app/api/endpoints/ai_analyze.py +++ b/backend/app/api/endpoints/ai_analyze.py @@ -2,10 +2,14 @@ AI 分析 API 接口 """ from fastapi import APIRouter, UploadFile, File, HTTPException, Query, Body +from fastapi.responses import StreamingResponse from typing import Optional import logging +import tempfile +import os from app.services.excel_ai_service import excel_ai_service +from app.services.markdown_ai_service import markdown_ai_service logger = logging.getLogger(__name__) @@ -93,10 +97,11 @@ async def get_analysis_types(): 获取支持的分析类型列表 Returns: - list: 支持的分析类型 + dict: 支持的分析类型(包含 Excel 和 Markdown) """ return { - "types": excel_ai_service.get_supported_analysis_types() + "excel_types": excel_ai_service.get_supported_analysis_types(), + "markdown_types": markdown_ai_service.get_supported_analysis_types() } @@ -142,3 +147,185 @@ async def analyze_text( except Exception as e: logger.error(f"文本分析失败: {str(e)}") raise HTTPException(status_code=500, detail=f"分析失败: {str(e)}") + + +@router.post("/analyze/md") +async def analyze_markdown( + file: UploadFile = File(...), + analysis_type: str = Query("summary", description="分析类型: summary, outline, key_points, questions, tags, qa, statistics, section"), + user_prompt: str = Query("", description="用户自定义提示词"), + section_number: Optional[str] = Query(None, description="指定章节编号,如 '一' 或 '(一)'") +): + """ + 上传并使用 AI 分析 Markdown 文件 + + Args: + file: 上传的 Markdown 文件 + analysis_type: 分析类型 + user_prompt: 用户自定义提示词 + section_number: 指定分析的章节编号 + + Returns: + dict: 分析结果 + """ + # 检查文件类型 + if not file.filename: + raise HTTPException(status_code=400, detail="文件名为空") + + file_ext = file.filename.split('.')[-1].lower() + if file_ext not in ['md', 'markdown']: + raise HTTPException( + status_code=400, + detail=f"不支持的文件类型: {file_ext},仅支持 .md 和 .markdown" + ) + + # 验证分析类型 + supported_types = markdown_ai_service.get_supported_analysis_types() + if analysis_type not in supported_types: + raise HTTPException( + status_code=400, + detail=f"不支持的分析类型: {analysis_type},支持的类型: {', '.join(supported_types)}" + ) + + try: + # 读取文件内容 + content = await file.read() + + # 保存到临时文件 + with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as tmp: + tmp.write(content) + tmp_path = tmp.name + + try: + logger.info(f"开始分析 Markdown 文件: {file.filename}, 分析类型: {analysis_type}, 章节: {section_number}") + + # 调用 AI 分析服务 + result = await markdown_ai_service.analyze_markdown( + file_path=tmp_path, + analysis_type=analysis_type, + user_prompt=user_prompt, + section_number=section_number + ) + + logger.info(f"Markdown 分析完成: {file.filename}, 成功: {result['success']}") + + if not result['success']: + raise HTTPException(status_code=500, detail=result.get('error', '分析失败')) + + return result + + finally: + # 清理临时文件 + if os.path.exists(tmp_path): + os.unlink(tmp_path) + + except HTTPException: + raise + except Exception as e: + logger.error(f"Markdown AI 分析过程中出错: {str(e)}") + raise HTTPException(status_code=500, detail=f"分析失败: {str(e)}") + + +@router.post("/analyze/md/stream") +async def analyze_markdown_stream( + file: UploadFile = File(...), + analysis_type: str = Query("summary", description="分析类型"), + user_prompt: str = Query("", description="用户自定义提示词"), + section_number: Optional[str] = Query(None, description="指定章节编号") +): + """ + 流式分析 Markdown 文件 (SSE) + + Returns: + StreamingResponse: SSE 流式响应 + """ + if not file.filename: + raise HTTPException(status_code=400, detail="文件名为空") + + file_ext = file.filename.split('.')[-1].lower() + if file_ext not in ['md', 'markdown']: + raise HTTPException( + status_code=400, + detail=f"不支持的文件类型: {file_ext},仅支持 .md 和 .markdown" + ) + + try: + content = await file.read() + + with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as tmp: + tmp.write(content) + tmp_path = tmp.name + + try: + logger.info(f"开始流式分析 Markdown 文件: {file.filename}, 分析类型: {analysis_type}") + + async def stream_generator(): + async for chunk in markdown_ai_service.analyze_markdown_stream( + file_path=tmp_path, + analysis_type=analysis_type, + user_prompt=user_prompt, + section_number=section_number + ): + yield chunk + + return StreamingResponse( + stream_generator(), + media_type="text/event-stream", + headers={ + "Cache-Control": "no-cache", + "Connection": "keep-alive", + "X-Accel-Buffering": "no" + } + ) + + finally: + if os.path.exists(tmp_path): + os.unlink(tmp_path) + + except HTTPException: + raise + except Exception as e: + logger.error(f"Markdown AI 流式分析出错: {str(e)}") + raise HTTPException(status_code=500, detail=f"流式分析失败: {str(e)}") + + +@router.get("/analyze/md/outline") +async def get_markdown_outline( + file: UploadFile = File(...) +): + """ + 获取 Markdown 文档的大纲结构(分章节信息) + + Args: + file: 上传的 Markdown 文件 + + Returns: + dict: 文档大纲结构 + """ + if not file.filename: + raise HTTPException(status_code=400, detail="文件名为空") + + file_ext = file.filename.split('.')[-1].lower() + if file_ext not in ['md', 'markdown']: + raise HTTPException( + status_code=400, + detail=f"不支持的文件类型: {file_ext},仅支持 .md 和 .markdown" + ) + + try: + content = await file.read() + + with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as tmp: + tmp.write(content) + tmp_path = tmp.name + + try: + result = await markdown_ai_service.extract_outline(tmp_path) + return result + finally: + if os.path.exists(tmp_path): + os.unlink(tmp_path) + + except Exception as e: + logger.error(f"获取 Markdown 大纲失败: {str(e)}") + raise HTTPException(status_code=500, detail=f"获取大纲失败: {str(e)}") diff --git a/backend/app/api/endpoints/documents.py b/backend/app/api/endpoints/documents.py index a0bd91c..848a582 100644 --- a/backend/app/api/endpoints/documents.py +++ b/backend/app/api/endpoints/documents.py @@ -196,18 +196,22 @@ async def process_document( meta={"progress": 50, "message": "正在存储到MySQL并生成字段描述"} ) - # 使用 TableRAG 服务完成建表和RAG索引 - rag_result = await table_rag_service.build_table_rag_index( - file_path=file_path, - filename=original_filename, - sheet_name=parse_options.get("sheet_name"), - header_row=parse_options.get("header_row", 0) - ) + try: + # 使用 TableRAG 服务完成建表和RAG索引 + logger.info(f"开始存储Excel到MySQL: {original_filename}, file_path: {file_path}") + rag_result = await table_rag_service.build_table_rag_index( + file_path=file_path, + filename=original_filename, + sheet_name=parse_options.get("sheet_name"), + header_row=parse_options.get("header_row", 0) + ) - if rag_result.get("success"): - logger.info(f"RAG索引构建成功: {original_filename}") - else: - logger.warning(f"RAG索引构建失败: {rag_result.get('error')}") + if rag_result.get("success"): + logger.info(f"Excel存储到MySQL成功: {original_filename}, table: {rag_result.get('table_name')}") + else: + logger.error(f"RAG索引构建失败: {rag_result.get('error')}") + except Exception as e: + logger.error(f"Excel存储到MySQL异常: {str(e)}", exc_info=True) else: # 非结构化文档 diff --git a/backend/app/api/endpoints/tasks.py b/backend/app/api/endpoints/tasks.py index 61f929c..aeea884 100644 --- a/backend/app/api/endpoints/tasks.py +++ b/backend/app/api/endpoints/tasks.py @@ -26,7 +26,16 @@ async def get_task_status(task_id: str): status = await redis_db.get_task_status(task_id) if not status: - raise HTTPException(status_code=404, detail=f"任务 {task_id} 不存在") + # Redis不可用时,假设任务已完成(文档已成功处理) + # 前端轮询时会得到这个响应 + return { + "task_id": task_id, + "status": "success", + "progress": 100, + "message": "任务处理完成", + "result": None, + "error": None + } return { "task_id": task_id, diff --git a/backend/app/api/endpoints/upload.py b/backend/app/api/endpoints/upload.py index dbf17c1..2c3af5d 100644 --- a/backend/app/api/endpoints/upload.py +++ b/backend/app/api/endpoints/upload.py @@ -10,6 +10,7 @@ import io from app.services.file_service import file_service from app.core.document_parser import XlsxParser +from app.services.table_rag_service import table_rag_service logger = logging.getLogger(__name__) @@ -27,7 +28,7 @@ async def upload_excel( header_row: int = Query(0, description="表头所在的行索引") ): """ - 上传并解析 Excel 文件 + 上传并解析 Excel 文件,同时存储到 MySQL 数据库 Args: file: 上传的 Excel 文件 @@ -77,6 +78,23 @@ async def upload_excel( result.metadata['saved_path'] = saved_path result.metadata['original_filename'] = file.filename + # 存储到 MySQL 数据库 + try: + store_result = await table_rag_service.build_table_rag_index( + file_path=saved_path, + filename=file.filename, + sheet_name=sheet_name if sheet_name else None, + header_row=header_row + ) + if store_result.get("success"): + result.metadata['mysql_table'] = store_result.get('table_name') + result.metadata['row_count'] = store_result.get('row_count') + logger.info(f"Excel已存储到MySQL: {file.filename}, 表: {store_result.get('table_name')}") + else: + logger.warning(f"Excel存储到MySQL失败: {store_result.get('error')}") + except Exception as e: + logger.error(f"Excel存储到MySQL异常: {str(e)}", exc_info=True) + return result.to_dict() except HTTPException: diff --git a/backend/app/config.py b/backend/app/config.py index 84115f9..b0a3206 100644 --- a/backend/app/config.py +++ b/backend/app/config.py @@ -29,6 +29,9 @@ class Settings(BaseSettings): LLM_BASE_URL: str = "https://api.minimax.chat" LLM_MODEL_NAME: str = "MiniMax-Text-01" + # ==================== RAG/Embedding 配置 ==================== + EMBEDDING_MODEL: str = "all-MiniLM-L6-v2" + # ==================== Supabase 配置 ==================== SUPABASE_URL: str = "" SUPABASE_ANON_KEY: str = "" diff --git a/backend/app/core/database/mongodb.py b/backend/app/core/database/mongodb.py index 39763b8..79ffa06 100644 --- a/backend/app/core/database/mongodb.py +++ b/backend/app/core/database/mongodb.py @@ -87,8 +87,10 @@ class MongoDB: "updated_at": datetime.utcnow(), } result = await self.documents.insert_one(document) - logger.info(f"文档已插入MongoDB: {result.inserted_id}") - return str(result.inserted_id) + doc_id = str(result.inserted_id) + filename = metadata.get("original_filename", "unknown") + logger.info(f"✓ 文档已存入MongoDB: [{doc_type}] {filename} | ID: {doc_id}") + return doc_id async def get_document(self, doc_id: str) -> Optional[Dict[str, Any]]: """根据ID获取文档""" diff --git a/backend/app/core/database/mysql.py b/backend/app/core/database/mysql.py index 9ecfe9b..05becb0 100644 --- a/backend/app/core/database/mysql.py +++ b/backend/app/core/database/mysql.py @@ -16,6 +16,7 @@ from sqlalchemy import ( String, Text, create_engine, + text, ) from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine from sqlalchemy.orm import DeclarativeBase, sessionmaker @@ -72,6 +73,26 @@ class MySQLDB: async def init_db(self): """初始化数据库,创建所有表""" try: + # 先创建数据库(如果不存在) + from sqlalchemy import text + db_name = settings.MYSQL_DATABASE + # 连接时不指定数据库来创建数据库 + temp_url = ( + f"mysql+aiomysql://{settings.MYSQL_USER}:{settings.MYSQL_PASSWORD}" + f"@{settings.MYSQL_HOST}:{settings.MYSQL_PORT}/" + f"?charset={settings.MYSQL_CHARSET}" + ) + from sqlalchemy.ext.asyncio import create_async_engine + temp_engine = create_async_engine(temp_url, echo=False) + try: + async with temp_engine.connect() as conn: + await conn.execute(text(f"CREATE DATABASE IF NOT EXISTS `{db_name}` CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci")) + await conn.commit() + logger.info(f"MySQL 数据库 {db_name} 创建或已存在") + finally: + await temp_engine.dispose() + + # 然后创建表 async with self.async_engine.begin() as conn: await conn.run_sync(Base.metadata.create_all) logger.info("MySQL 数据库表初始化完成") diff --git a/backend/app/main.py b/backend/app/main.py index d52d61b..86c3a9d 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -2,23 +2,143 @@ FastAPI 应用主入口 """ import logging +import logging.handlers +import sys +import uuid from contextlib import asynccontextmanager +from typing import Callable +from functools import wraps -from fastapi import FastAPI +from fastapi import FastAPI, Request, Response from fastapi.middleware.cors import CORSMiddleware +from starlette.middleware.base import BaseHTTPMiddleware from app.config import settings from app.api import api_router from app.core.database import mysql_db, mongodb, redis_db -# 配置日志 -logging.basicConfig( - level=logging.INFO if settings.DEBUG else logging.WARNING, - format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" -) +# ==================== 日志配置 ==================== + +def setup_logging(): + """配置应用日志系统""" + import os + from pathlib import Path + + # 根日志配置 + log_level = logging.DEBUG if settings.DEBUG else logging.INFO + + # 日志目录 + log_dir = Path("data/logs") + log_dir.mkdir(parents=True, exist_ok=True) + + # 日志文件路径 + log_file = log_dir / "app.log" + error_log_file = log_dir / "error.log" + + # 控制台处理器 + console_handler = logging.StreamHandler(sys.stdout) + console_handler.setLevel(log_level) + console_formatter = logging.Formatter( + fmt="%(asctime)s | %(levelname)-8s | %(name)s:%(lineno)d | %(message)s", + datefmt="%Y-%m-%d %H:%M:%S" + ) + console_handler.setFormatter(console_formatter) + + # 文件处理器 (所有日志) + file_handler = logging.handlers.RotatingFileHandler( + log_file, + maxBytes=10 * 1024 * 1024, # 10MB + backupCount=5, + encoding="utf-8" + ) + file_handler.setLevel(logging.DEBUG) + file_formatter = logging.Formatter( + fmt="%(asctime)s | %(levelname)-8s | %(name)s:%(lineno)d | %(funcName)s | %(message)s", + datefmt="%Y-%m-%d %H:%M:%S" + ) + file_handler.setFormatter(file_formatter) + + # 错误日志处理器 (仅ERROR及以上) + error_file_handler = logging.handlers.RotatingFileHandler( + error_log_file, + maxBytes=10 * 1024 * 1024, # 10MB + backupCount=5, + encoding="utf-8" + ) + error_file_handler.setLevel(logging.ERROR) + error_file_handler.setFormatter(file_formatter) + + # 根日志器 + root_logger = logging.getLogger() + root_logger.setLevel(logging.DEBUG) + root_logger.handlers = [] + root_logger.addHandler(console_handler) + root_logger.addHandler(file_handler) + root_logger.addHandler(error_file_handler) + + # 第三方库日志级别 + for lib in ["uvicorn", "uvicorn.access", "fastapi", "httpx", "sqlalchemy"]: + logging.getLogger(lib).setLevel(logging.WARNING) + + root_logger.info(f"日志系统初始化完成 | 日志目录: {log_dir}") + root_logger.info(f"主日志文件: {log_file} | 错误日志: {error_log_file}") + + return root_logger + +# 初始化日志 +setup_logging() logger = logging.getLogger(__name__) +# ==================== 请求日志中间件 ==================== + +class RequestLoggingMiddleware(BaseHTTPMiddleware): + """请求日志中间件 - 记录每个请求的详细信息""" + + async def dispatch(self, request: Request, call_next: Callable) -> Response: + # 生成请求ID + request_id = str(uuid.uuid4())[:8] + request.state.request_id = request_id + + # 记录请求 + logger.info(f"→ [{request_id}] {request.method} {request.url.path}") + + try: + response = await call_next(request) + + # 记录响应 + logger.info( + f"← [{request_id}] {request.method} {request.url.path} " + f"| 状态: {response.status_code} | 耗时: N/A" + ) + + # 添加请求ID到响应头 + response.headers["X-Request-ID"] = request_id + return response + + except Exception as e: + logger.error(f"✗ [{request_id}] {request.method} {request.url.path} | 异常: {str(e)}") + raise + + +# ==================== 请求追踪装饰器 ==================== + +def log_async_function(func: Callable) -> Callable: + """异步函数日志装饰器""" + @wraps(func) + async def wrapper(*args, **kwargs): + func_name = func.__name__ + logger.debug(f"→ {func_name} 开始执行") + try: + result = await func(*args, **kwargs) + logger.debug(f"← {func_name} 执行完成") + return result + except Exception as e: + logger.error(f"✗ {func_name} 执行失败: {str(e)}") + raise + return wrapper + + @asynccontextmanager async def lifespan(app: FastAPI): """ @@ -83,6 +203,9 @@ app.add_middleware( allow_headers=["*"], ) +# 添加请求日志中间件 +app.add_middleware(RequestLoggingMiddleware) + # 注册 API 路由 app.include_router(api_router, prefix=settings.API_V1_STR) diff --git a/backend/app/services/excel_storage_service.py b/backend/app/services/excel_storage_service.py index eb6d98a..5c538a7 100644 --- a/backend/app/services/excel_storage_service.py +++ b/backend/app/services/excel_storage_service.py @@ -17,12 +17,15 @@ from sqlalchemy import ( String, Text, inspect, + text, ) from sqlalchemy.ext.asyncio import AsyncSession from app.core.database.mysql import Base, mysql_db logger = logging.getLogger(__name__) +# 设置该模块的日志级别 +logger.setLevel(logging.DEBUG) class ExcelStorageService: @@ -31,6 +34,123 @@ class ExcelStorageService: def __init__(self): self.mysql_db = mysql_db + def _extract_sheet_names_from_xml(self, file_path: str) -> list: + """从 Excel 文件的 XML 中提取工作表名称""" + import zipfile + from xml.etree import ElementTree as ET + + try: + with zipfile.ZipFile(file_path, 'r') as z: + if 'xl/workbook.xml' not in z.namelist(): + return [] + content = z.read('xl/workbook.xml') + root = ET.fromstring(content) + ns = {'main': 'http://purl.oclc.org/ooxml/spreadsheetml/main'} + sheets = root.findall('.//main:sheet', ns) + return [s.get('name') for s in sheets if s.get('name')] + except Exception: + return [] + + def _read_excel_sheet(self, file_path: str, sheet_name: str = None, header_row: int = 0) -> pd.DataFrame: + """读取 Excel 工作表,支持 pandas 无法解析的特殊 Excel 文件""" + import zipfile + from xml.etree import ElementTree as ET + + try: + df = pd.read_excel(file_path, sheet_name=sheet_name, header=header_row) + if df is not None and not df.empty: + return df + except Exception: + pass + + # pandas 读取失败,从 XML 直接解析 + logger.info(f"使用 XML 方式读取 Excel: {file_path}") + + try: + with zipfile.ZipFile(file_path, 'r') as z: + sheet_names = self._extract_sheet_names_from_xml(file_path) + if not sheet_names: + raise ValueError("无法从 Excel 文件中找到工作表") + + target_sheet = sheet_name if sheet_name and sheet_name in sheet_names else sheet_names[0] + sheet_index = sheet_names.index(target_sheet) + 1 + + shared_strings = [] + if 'xl/sharedStrings.xml' in z.namelist(): + ss_content = z.read('xl/sharedStrings.xml') + ss_root = ET.fromstring(ss_content) + ns = {'main': 'http://purl.oclc.org/ooxml/spreadsheetml/main'} + for si in ss_root.findall('.//main:si', ns): + t = si.find('.//main:t', ns) + shared_strings.append(t.text if t is not None else '') + + sheet_file = f'xl/worksheets/sheet{sheet_index}.xml' + sheet_content = z.read(sheet_file) + root = ET.fromstring(sheet_content) + ns = {'main': 'http://purl.oclc.org/ooxml/spreadsheetml/main'} + + rows_data = [] + for row in root.findall('.//main:row', ns): + row_idx = int(row.get('r', 0)) + if row_idx <= header_row + 1: + continue + + row_cells = {} + for cell in row.findall('main:c', ns): + cell_ref = cell.get('r', '') + col_letters = ''.join(filter(str.isalpha, cell_ref)) + cell_type = cell.get('t', 'n') + v = cell.find('main:v', ns) + + if v is not None and v.text: + if cell_type == 's': + try: + val = shared_strings[int(v.text)] + except (ValueError, IndexError): + val = v.text + elif cell_type == 'b': + val = v.text == '1' + else: + val = v.text + else: + val = None + row_cells[col_letters] = val + + if row_cells: + rows_data.append(row_cells) + + if not rows_data: + return pd.DataFrame() + + df = pd.DataFrame(rows_data) + + if header_row >= 0: + first_row_sheet = f'xl/worksheets/sheet{sheet_index}.xml' + sheet_content = z.read(first_row_sheet) + root = ET.fromstring(sheet_content) + first_row = root.find(f'.//main:row[@r="{header_row + 1}"]', ns) + if first_row is not None: + headers = {} + for cell in first_row.findall('main:c', ns): + cell_ref = cell.get('r', '') + col_letters = ''.join(filter(str.isalpha, cell_ref)) + cell_type = cell.get('t', 'n') + v = cell.find('main:v', ns) + if v is not None and v.text: + if cell_type == 's': + try: + headers[col_letters] = shared_strings[int(v.text)] + except (ValueError, IndexError): + headers[col_letters] = v.text + else: + headers[col_letters] = v.text + df.columns = [headers.get(col, col) for col in df.columns] + + return df + except Exception as e: + logger.error(f"XML 解析 Excel 失败: {e}") + raise + def _sanitize_table_name(self, filename: str) -> str: """ 将文件名转换为合法的表名 @@ -64,15 +184,44 @@ class ExcelStorageService: Returns: 合法的字段名 """ - # 只保留字母、数字、下划线 - name = re.sub(r'[^a-zA-Z0-9_]', '_', str(col_name)) - - # 确保以字母开头 + # MySQL 支持 UTF8 编码,中文字符可以直接使用 + # 只处理非法字符(控制字符等)和首字符数字 + name = str(col_name).strip() + # 移除控制字符 + name = re.sub(r'[\x00-\x1f\x7f]', '', name) + # 确保以字母或中文开头 if name and name[0].isdigit(): name = 'col_' + name + # 限制长度 (MySQL 字段名最多64字符) + return name[:64] - # 限制长度 - return name[:50] + def _get_unique_column_name(self, col_name: str, used_names: set) -> str: + """ + 获取唯一的列名,避免重复 + + Args: + col_name: 原始列名 + used_names: 已使用的列名集合 + + Returns: + 唯一的列名 + """ + sanitized = self._sanitize_column_name(col_name) + # "id" 是 MySQL 保留名,作为主键使用 + if sanitized.lower() == "id": + sanitized = "col_id" + if sanitized not in used_names: + used_names.add(sanitized) + return sanitized + + # 添加数字后缀直到唯一 + base = sanitized if sanitized else "col" + counter = 1 + while f"{base}_{counter}" in used_names: + counter += 1 + unique_name = f"{base}_{counter}" + used_names.add(unique_name) + return unique_name def _infer_column_type(self, series: pd.Series) -> str: """ @@ -84,12 +233,35 @@ class ExcelStorageService: Returns: 类型名称 """ + # 移除空值进行类型检查 + non_null = series.dropna() + if len(non_null) == 0: + return "TEXT" + dtype = series.dtype + # 整数类型检查 if pd.api.types.is_integer_dtype(dtype): - return "INTEGER" + # 检查是否所有值都能放入 INT 范围 + try: + int_values = non_null.astype('int64') + if int_values.min() >= -2147483648 and int_values.max() <= 2147483647: + return "INTEGER" + else: + # 超出 INT 范围,使用 TEXT + return "TEXT" + except (ValueError, OverflowError): + return "TEXT" elif pd.api.types.is_float_dtype(dtype): - return "FLOAT" + # 检查是否所有值都能放入 FLOAT + try: + float_values = non_null.astype('float64') + if float_values.min() >= -1e308 and float_values.max() <= 1e308: + return "FLOAT" + else: + return "TEXT" + except (ValueError, OverflowError): + return "TEXT" elif pd.api.types.is_datetime64_any_dtype(dtype): return "DATETIME" elif pd.api.types.is_bool_dtype(dtype): @@ -174,11 +346,11 @@ class ExcelStorageService: } try: - # 读取 Excel - if sheet_name: - df = pd.read_excel(file_path, sheet_name=sheet_name, header=header_row) - else: - df = pd.read_excel(file_path, header=header_row) + logger.info(f"开始读取Excel文件: {file_path}") + # 读取 Excel(使用 fallback 方式支持特殊格式文件) + df = self._read_excel_sheet(file_path, sheet_name=sheet_name, header_row=header_row) + + logger.info(f"Excel读取完成,行数: {len(df)}, 列数: {len(df.columns)}") if df.empty: return {"success": False, "error": "Excel 文件为空"} @@ -186,31 +358,41 @@ class ExcelStorageService: # 清理列名 df.columns = [str(c) for c in df.columns] - # 推断列类型 + # 推断列类型,并生成唯一的列名 column_types = {} + column_name_map = {} # 原始列名 -> 唯一合法列名 + used_names = set() for col in df.columns: - col_name = self._sanitize_column_name(col) + col_name = self._get_unique_column_name(col, used_names) col_type = self._infer_column_type(df[col]) column_types[col] = col_type + column_name_map[col] = col_name results["columns"].append({ "original_name": col, "sanitized_name": col_name, "type": col_type }) - # 创建表 - model_class = self._create_table_model(table_name, df.columns, column_types) - - # 创建表结构 - async with self.mysql_db.get_session() as session: - model_class.__table__.create(session.bind, checkfirst=True) + # 创建表 - 使用原始 SQL 以兼容异步 + logger.info(f"正在创建MySQL表: {table_name}") + sql_columns = ["id INT AUTO_INCREMENT PRIMARY KEY"] + for col in df.columns: + col_name = column_name_map[col] + col_type = column_types.get(col, "TEXT") + sql_type = "INT" if col_type == "INTEGER" else "FLOAT" if col_type == "FLOAT" else "DATETIME" if col_type == "DATETIME" else "TEXT" + sql_columns.append(f"`{col_name}` {sql_type}") + sql_columns.append("created_at DATETIME DEFAULT CURRENT_TIMESTAMP") + sql_columns.append("updated_at DATETIME DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP") + create_sql = text(f"CREATE TABLE IF NOT EXISTS `{table_name}` ({', '.join(sql_columns)})") + await self.mysql_db.execute_raw_sql(str(create_sql)) + logger.info(f"MySQL表创建完成: {table_name}") # 插入数据 records = [] for _, row in df.iterrows(): record = {} for col in df.columns: - col_name = self._sanitize_column_name(col) + col_name = column_name_map[col] value = row[col] # 处理 NaN 值 @@ -231,11 +413,33 @@ class ExcelStorageService: records.append(record) - # 批量插入 - async with self.mysql_db.get_session() as session: - for record in records: - session.add(model_class(**record)) - await session.commit() + logger.info(f"正在插入 {len(records)} 条数据到 MySQL (使用批量插入)...") + # 使用 pymysql 直接插入以避免 SQLAlchemy 异步问题 + import pymysql + from app.config import settings + + connection = pymysql.connect( + host=settings.MYSQL_HOST, + port=settings.MYSQL_PORT, + user=settings.MYSQL_USER, + password=settings.MYSQL_PASSWORD, + database=settings.MYSQL_DATABASE, + charset=settings.MYSQL_CHARSET + ) + try: + columns_str = ', '.join(['`' + column_name_map[col] + '`' for col in df.columns]) + placeholders = ', '.join(['%s' for _ in df.columns]) + insert_sql = f"INSERT INTO `{table_name}` ({columns_str}) VALUES ({placeholders})" + + # 转换为元组列表 (使用映射后的列名) + param_list = [tuple(record.get(column_name_map[col]) for col in df.columns) for record in records] + + with connection.cursor() as cursor: + cursor.executemany(insert_sql, param_list) + connection.commit() + logger.info(f"数据插入完成: {len(records)} 条") + finally: + connection.close() results["row_count"] = len(records) logger.info(f"Excel 数据已存储到 MySQL 表 {table_name},共 {len(records)} 行") @@ -243,7 +447,7 @@ class ExcelStorageService: return results except Exception as e: - logger.error(f"存储 Excel 到 MySQL 失败: {str(e)}") + logger.error(f"存储 Excel 到 MySQL 失败: {str(e)}", exc_info=True) return {"success": False, "error": str(e)} async def store_structured_data( diff --git a/backend/app/services/file_service.py b/backend/app/services/file_service.py index 813e26d..8b639c7 100644 --- a/backend/app/services/file_service.py +++ b/backend/app/services/file_service.py @@ -3,6 +3,7 @@ """ import os import shutil +import logging from pathlib import Path from datetime import datetime from typing import Optional @@ -10,6 +11,8 @@ import uuid from app.config import settings +logger = logging.getLogger(__name__) + class FileService: """文件服务类,负责文件的存储、读取和管理""" @@ -17,6 +20,7 @@ class FileService: def __init__(self): self.upload_dir = Path(settings.UPLOAD_DIR) self._ensure_upload_dir() + logger.info(f"FileService 初始化,上传目录: {self.upload_dir}") def _ensure_upload_dir(self): """确保上传目录存在""" @@ -56,6 +60,8 @@ class FileService: with open(file_path, 'wb') as f: f.write(file_content) + file_size = len(file_content) + logger.info(f"文件已保存: {filename} -> {file_path} ({file_size} bytes)") return str(file_path) def read_file(self, file_path: str) -> bytes: diff --git a/backend/app/services/llm_service.py b/backend/app/services/llm_service.py index 841d605..8878deb 100644 --- a/backend/app/services/llm_service.py +++ b/backend/app/services/llm_service.py @@ -2,7 +2,7 @@ LLM 服务模块 - 封装大模型 API 调用 """ import logging -from typing import Dict, Any, List, Optional +from typing import Dict, Any, List, Optional, AsyncGenerator import httpx from app.config import settings @@ -87,6 +87,71 @@ class LLMService: logger.error(f"解析 API 响应失败: {str(e)}") raise + async def chat_stream( + self, + messages: List[Dict[str, str]], + temperature: float = 0.7, + max_tokens: Optional[int] = None, + **kwargs + ) -> AsyncGenerator[Dict[str, Any], None]: + """ + 流式调用聊天 API + + Args: + messages: 消息列表 + temperature: 温度参数 + max_tokens: 最大 token 数 + **kwargs: 其他参数 + + Yields: + Dict[str, Any]: 包含 delta 内容的块 + """ + headers = { + "Authorization": f"Bearer {self.api_key}", + "Content-Type": "application/json" + } + + payload = { + "model": self.model_name, + "messages": messages, + "temperature": temperature, + "stream": True + } + + if max_tokens: + payload["max_tokens"] = max_tokens + + payload.update(kwargs) + + try: + async with httpx.AsyncClient(timeout=120.0) as client: + async with client.stream( + "POST", + f"{self.base_url}/chat/completions", + headers=headers, + json=payload + ) as response: + async for line in response.aiter_lines(): + if line.startswith("data: "): + data = line[6:] # Remove "data: " prefix + if data == "[DONE]": + break + try: + import json as json_module + chunk = json_module.loads(data) + delta = chunk.get("choices", [{}])[0].get("delta", {}).get("content", "") + if delta: + yield {"content": delta} + except json_module.JSONDecodeError: + continue + + except httpx.HTTPStatusError as e: + logger.error(f"LLM 流式 API 请求失败: {e.response.status_code}") + raise + except Exception as e: + logger.error(f"LLM 流式 API 调用异常: {str(e)}") + raise + async def analyze_excel_data( self, excel_data: Dict[str, Any], diff --git a/backend/app/services/markdown_ai_service.py b/backend/app/services/markdown_ai_service.py new file mode 100644 index 0000000..9a8d5a2 --- /dev/null +++ b/backend/app/services/markdown_ai_service.py @@ -0,0 +1,707 @@ +""" +Markdown 文档 AI 分析服务 + +支持: +- 分章节解析(中文章节编号:一、二、三, (一)(二)(三)) +- 结构化数据提取 +- 流式输出 +- 多种分析类型 +- 可视化图表生成 +""" +import asyncio +import json +import logging +import re +from typing import Any, AsyncGenerator, Dict, List, Optional + +from app.services.llm_service import llm_service +from app.core.document_parser import MarkdownParser +from app.services.visualization_service import visualization_service + +logger = logging.getLogger(__name__) + + +class MarkdownSection: + """文档章节结构""" + def __init__(self, number: str, title: str, level: int, content: str, line_start: int, line_end: int): + self.number = number # 章节编号,如 "一", "(一)", "1" + self.title = title + self.level = level # 层级深度 + self.content = content # 章节内容(不含子章节) + self.line_start = line_start + self.line_end = line_end + self.subsections: List[MarkdownSection] = [] + + def to_dict(self) -> Dict[str, Any]: + return { + "number": self.number, + "title": self.title, + "level": self.level, + "content_preview": self.content[:200] + "..." if len(self.content) > 200 else self.content, + "line_start": self.line_start, + "line_end": self.line_end, + "subsections": [s.to_dict() for s in self.subsections] + } + + +class MarkdownAIService: + """Markdown 文档 AI 分析服务""" + + # 中文章节编号模式 + CHINESE_NUMBERS = ["一", "二", "三", "四", "五", "六", "七", "八", "九", "十"] + CHINESE_SUFFIX = "、" + PARENTHESIS_PATTERN = re.compile(r'^(([一二三四五六七八九十]+)\s*(.+)$') + CHINESE_SECTION_PATTERN = re.compile(r'^([一二三四五六七八九十]+)、\s*(.+)$') + ARABIC_SECTION_PATTERN = re.compile(r'^(\d+)\.\s+(.+)$') + + def __init__(self): + self.parser = MarkdownParser() + + def get_supported_analysis_types(self) -> list: + """获取支持的分析类型""" + return [ + "summary", # 文档摘要 + "outline", # 大纲提取 + "key_points", # 关键点提取 + "questions", # 生成问题 + "tags", # 生成标签 + "qa", # 问答对 + "statistics", # 统计数据分析(适合政府公报) + "section", # 分章节详细分析 + "charts" # 可视化图表生成 + ] + + def extract_sections(self, content: str, titles: List[Dict]) -> List[MarkdownSection]: + """ + 从文档内容中提取章节结构 + + 识别以下章节格式: + - 一级:一、二、三... + - 二级:(一)(二)(三)... + - 三级:1. 2. 3. ... + """ + sections = [] + lines = content.split('\n') + + # 构建标题行到内容的映射 + title_lines = {} + for t in titles: + title_lines[t.get('line', 0)] = t + + current_section = None + section_stack = [] + + for i, line in enumerate(lines, 1): + stripped = line.strip() + + # 检查是否是一级标题(中文数字 + 、) + match = self.CHINESE_SECTION_PATTERN.match(stripped) + if match: + # 结束当前章节 + if current_section: + current_section.content = self._get_section_content( + lines, current_section.line_start, i - 1 + ) + + current_section = MarkdownSection( + number=match.group(1), + title=match.group(2), + level=1, + content="", + line_start=i, + line_end=len(lines) + ) + sections.append(current_section) + section_stack = [current_section] + continue + + # 检查是否是二级标题((一)(二)...) + match = self.PARENTHESIS_PATTERN.match(stripped) + if match and current_section: + # 结束当前子章节 + if section_stack and len(section_stack) > 1: + parent = section_stack[-1] + parent.content = self._get_section_content( + lines, parent.line_start, i - 1 + ) + + subsection = MarkdownSection( + number=match.group(1), + title=match.group(2), + level=2, + content="", + line_start=i, + line_end=len(lines) + ) + current_section.subsections.append(subsection) + section_stack = [current_section, subsection] + continue + + # 检查是否是三级标题(1. 2. 3.) + match = self.ARABIC_SECTION_PATTERN.match(stripped) + if match and len(section_stack) > 1: + # 结束当前子章节 + if len(section_stack) > 2: + parent = section_stack[-1] + parent.content = self._get_section_content( + lines, parent.line_start, i - 1 + ) + + sub_subsection = MarkdownSection( + number=match.group(1), + title=match.group(2), + level=3, + content="", + line_start=i, + line_end=len(lines) + ) + section_stack[-1].subsections.append(sub_subsection) + section_stack = section_stack[:-1] + [sub_subsection] + continue + + # 处理最后一个章节 + if current_section: + current_section.content = self._get_section_content( + lines, current_section.line_start, len(lines) + ) + + return sections + + def _get_section_content(self, lines: List[str], start: int, end: int) -> str: + """获取指定行范围的内容""" + if start > end: + return "" + content_lines = lines[start-1:end] + # 清理:移除标题行和空行 + cleaned = [] + for line in content_lines: + stripped = line.strip() + if not stripped: + continue + # 跳过章节标题行 + if self.CHINESE_SECTION_PATTERN.match(stripped): + continue + if self.PARENTHESIS_PATTERN.match(stripped): + continue + if self.ARABIC_SECTION_PATTERN.match(stripped): + continue + cleaned.append(stripped) + return '\n'.join(cleaned) + + async def analyze_markdown( + self, + file_path: str, + analysis_type: str = "summary", + user_prompt: str = "", + section_number: Optional[str] = None + ) -> Dict[str, Any]: + """ + 使用 AI 分析 Markdown 文档 + + Args: + file_path: 文件路径 + analysis_type: 分析类型 + user_prompt: 用户自定义提示词 + section_number: 指定分析的章节编号(如 "一" 或 "(一)") + + Returns: + dict: 分析结果 + """ + try: + parse_result = self.parser.parse(file_path) + + if not parse_result.success: + return { + "success": False, + "error": parse_result.error + } + + data = parse_result.data + + # 提取章节结构 + sections = self.extract_sections(data.get("content", ""), data.get("titles", [])) + + # 如果指定了章节,只分析该章节 + target_content = data.get("content", "") + target_title = parse_result.metadata.get("filename", "") + + if section_number: + section = self._find_section(sections, section_number) + if section: + target_content = section.content + target_title = f"{section.number}、{section.title}" + else: + return { + "success": False, + "error": f"未找到章节: {section_number}" + } + + # 根据分析类型构建提示词 + prompt = self._build_prompt( + content=target_content, + analysis_type=analysis_type, + user_prompt=user_prompt, + title=target_title + ) + + # 调用 LLM 分析 + messages = [ + {"role": "system", "content": self._get_system_prompt(analysis_type)}, + {"role": "user", "content": prompt} + ] + + response = await llm_service.chat( + messages=messages, + temperature=0.3, + max_tokens=4000 + ) + + analysis = llm_service.extract_message_content(response) + + # 构建基础返回 + result = { + "success": True, + "filename": parse_result.metadata.get("filename", ""), + "analysis_type": analysis_type, + "section": target_title if section_number else None, + "word_count": len(target_content), + "structure": { + "title_count": parse_result.metadata.get("title_count", 0), + "code_block_count": parse_result.metadata.get("code_block_count", 0), + "table_count": parse_result.metadata.get("table_count", 0), + "section_count": len(sections) + }, + "sections": [s.to_dict() for s in sections[:10]], # 最多返回10个一级章节 + "analysis": analysis + } + + # 如果是 charts 类型,额外生成可视化 + if analysis_type == "charts": + try: + # 解析 LLM 返回的 JSON 数据 + chart_data = self._parse_chart_json(analysis) + if chart_data and chart_data.get("tables"): + # 使用可视化服务生成图表 + for table_info in chart_data.get("tables", []): + columns = table_info.get("columns", []) + rows = table_info.get("rows", []) + if columns and rows: + vis_result = visualization_service.analyze_and_visualize({ + "columns": columns, + "rows": [dict(zip(columns, row)) for row in rows] + }) + if vis_result.get("success"): + table_info["visualization"] = { + "statistics": vis_result.get("statistics"), + "charts": vis_result.get("charts"), + "distributions": vis_result.get("distributions") + } + result["chart_data"] = chart_data + except Exception as e: + logger.warning(f"生成可视化图表失败: {e}") + result["chart_data"] = {"tables": [], "key_statistics": [], "chart_suggestions": []} + + return result + + except Exception as e: + logger.error(f"Markdown AI 分析失败: {str(e)}") + return { + "success": False, + "error": str(e) + } + + async def analyze_markdown_stream( + self, + file_path: str, + analysis_type: str = "summary", + user_prompt: str = "", + section_number: Optional[str] = None + ) -> AsyncGenerator[str, None]: + """ + 流式分析 Markdown 文档 (SSE) + + Yields: + str: SSE 格式的数据块 + """ + try: + parse_result = self.parser.parse(file_path) + + if not parse_result.success: + yield f"data: {json.dumps({'error': parse_result.error}, ensure_ascii=False)}\n\n" + return + + data = parse_result.data + sections = self.extract_sections(data.get("content", ""), data.get("titles", [])) + + target_content = data.get("content", "") + target_title = parse_result.metadata.get("filename", "") + + if section_number: + section = self._find_section(sections, section_number) + if section: + target_content = section.content + target_title = f"{section.number}、{section.title}" + else: + yield f"data: {json.dumps({'error': f'未找到章节: {section_number}'}, ensure_ascii=False)}\n\n" + return + + prompt = self._build_prompt( + content=target_content, + analysis_type=analysis_type, + user_prompt=user_prompt, + title=target_title + ) + + messages = [ + {"role": "system", "content": self._get_system_prompt(analysis_type)}, + {"role": "user", "content": prompt} + ] + + # 发送初始元数据 + yield f"data: {json.dumps({ + 'type': 'start', + 'filename': parse_result.metadata.get("filename", ""), + 'analysis_type': analysis_type, + 'section': target_title if section_number else None, + 'word_count': len(target_content) + }, ensure_ascii=False)}\n\n" + + # 流式调用 LLM + full_response = "" + async for chunk in llm_service.chat_stream(messages, temperature=0.3, max_tokens=4000): + content = chunk.get("content", "") + if content: + full_response += content + yield f"data: {json.dumps({'type': 'content', 'delta': content}, ensure_ascii=False)}\n\n" + + # 发送完成消息 + yield f"data: {json.dumps({'type': 'done', 'full_response': full_response}, ensure_ascii=False)}\n\n" + + except Exception as e: + logger.error(f"Markdown AI 流式分析失败: {str(e)}") + yield f"data: {json.dumps({'error': str(e)}, ensure_ascii=False)}\n\n" + + def _find_section(self, sections: List[MarkdownSection], number: str) -> Optional[MarkdownSection]: + """查找指定编号的章节""" + # 标准化编号 + num = number.strip() + for section in sections: + if section.number == num or section.title == num: + return section + # 在子章节中查找 + found = self._find_section(section.subsections, number) + if found: + return found + return None + + def _parse_chart_json(self, json_str: str) -> Optional[Dict[str, Any]]: + """ + 解析 LLM 返回的 JSON 字符串 + + Args: + json_str: LLM 返回的 JSON 字符串 + + Returns: + 解析后的字典,如果解析失败返回 None + """ + if not json_str: + return None + + try: + # 尝试直接解析 + return json.loads(json_str) + except json.JSONDecodeError: + pass + + # 尝试提取 JSON 代码块 + import re + # 匹配 ```json ... ``` 格式 + match = re.search(r'```(?:json)?\s*([\s\S]*?)\s*```', json_str) + if match: + try: + return json.loads(match.group(1)) + except json.JSONDecodeError: + pass + + # 尝试找到 JSON 对象的开始和结束 + start = json_str.find('{') + end = json_str.rfind('}') + if start != -1 and end != -1 and end > start: + try: + return json.loads(json_str[start:end+1]) + except json.JSONDecodeError: + pass + + return None + + def _get_system_prompt(self, analysis_type: str) -> str: + """根据分析类型获取系统提示词""" + prompts = { + "summary": "你是一个专业的文档摘要助手,擅长从长文档中提取核心信息。", + "outline": "你是一个专业的文档结构分析助手,擅长提取文档大纲和层级结构。", + "key_points": "你是一个专业的知识提取助手,擅长从文档中提取关键信息和要点。", + "questions": "你是一个专业的教育助手,擅长生成帮助理解文档的问题。", + "tags": "你是一个专业的标签生成助手,擅长提取文档的主题标签。", + "qa": "你是一个专业的问答助手,擅长基于文档内容生成问答对。", + "statistics": "你是一个专业的统计数据分析助手,擅长分析政府统计公报中的数据。", + "section": "你是一个专业的章节分析助手,擅长对文档的特定章节进行深入分析。", + "charts": "你是一个专业的数据可视化助手,擅长从文档中提取数据并生成适合制作图表的数据结构。" + } + return prompts.get(analysis_type, "你是一个专业的文档分析助手。") + + def _build_prompt( + self, + content: str, + analysis_type: str, + user_prompt: str, + title: str = "" + ) -> str: + """根据分析类型构建提示词""" + + # 截断内容避免超出 token 限制 + max_content_len = 6000 + if len(content) > max_content_len: + content = content[:max_content_len] + "\n\n[内容已截断...]" + + base_prompts = { + "summary": f"""请对以下文档进行摘要分析: + +文档标题:{title} + +文档内容: +{content} + +请提供: +1. 文档主要内容摘要(300字以内) +2. 文档的目的和用途 +3. 适合的读者群体 + +请用中文回答,结构清晰。""", + + "outline": f"""请提取以下文档的大纲结构: + +文档标题:{title} + +文档内容: +{content} + +请按层级列出文档大纲,用缩进表示层级关系。 +格式: +一、一级标题 + (一)二级标题 + 1. 三级标题 + +请用中文回答。""", + + "key_points": f"""请从以下文档中提取关键要点: + +文档标题:{title} + +文档内容: +{content} + +请列出文档的关键要点(5-10条),每条用简洁的语言描述,并说明其在文档中的重要性。 + +请用中文回答,格式清晰。""", + + "questions": f"""请根据以下文档生成有助于理解内容的问题: + +文档标题:{title} + +文档内容: +{content} + +请生成5-10个问题,帮助读者更好地理解文档内容。每个问题应该: +1. 涵盖文档的重要信息点 +2. 易于理解和回答 +3. 具有思考价值 + +请用中文回答。""", + + "tags": f"""请为以下文档生成标签: + +文档标题:{title} + +文档内容: +{content[:3000]} + +请生成5-8个标签,用逗号分隔。标签应该反映: +- 文档的主题领域 +- 文档的类型 +- 文档的关键特征 + +请用中文回答,只需输出标签,不要其他内容。""", + + "qa": f"""请根据以下文档生成问答对: + +文档标题:{title} + +文档内容: +{content[:4000]} + +请生成3-5个问答对,帮助读者通过问答形式理解文档内容。 +格式: +Q1: 问题 +A1: 回答 +Q2: 问题 +A2: 回答 + +请用中文回答,内容准确。""", + + "statistics": f"""请分析以下政府统计公报中的数据和结论: + +文档标题:{title} + +文档内容: +{content} + +请提供: +1. 文档中涉及的主要统计数据(列出关键数字和指标) +2. 数据的变化趋势(增长/下降) +3. 重要的百分比和对比 +4. 数据来源和统计口径说明 + +请用中文回答,数据准确。""", + + "section": f"""请详细分析以下文档章节: + +章节标题:{title} + +章节内容: +{content} + +请提供: +1. 章节主要内容概括 +2. 关键信息和数据 +3. 与其他部分的关联(如有) +4. 重要结论 + +请用中文回答,分析深入。""", + + "charts": f"""请从以下文档中提取可用于可视化的数据,并生成适合制作图表的数据结构: + +文档标题:{title} + +文档内容: +{content} + +请完成以下任务: +1. 识别文档中的表格数据(Markdown表格格式) +2. 识别文档中的关键统计数据(百分比、数量、趋势等) +3. 识别可用于比较的分类数据 + +请用 JSON 格式返回以下结构的数据(如果没有表格数据,返回空结构): +{{ + "tables": [ + {{ + "description": "表格的描述", + "columns": ["列名1", "列名2", ...], + "rows": [ + ["值1", "值2", ...], + ["值1", "值2", ...] + ] + }} + ], + "key_statistics": [ + {{ + "name": "指标名称", + "value": "数值", + "trend": "增长/下降/持平", + "description": "指标说明" + }} + ], + "chart_suggestions": [ + {{ + "chart_type": "bar/line/pie", + "title": "图表标题", + "data_source": "数据来源说明" + }} + ] +}} + +请确保返回的是合法的 JSON 格式。""" + } + + prompt = base_prompts.get(analysis_type, base_prompts["summary"]) + + if user_prompt and user_prompt.strip(): + prompt += f"\n\n用户额外需求:{user_prompt}" + + return prompt + + async def extract_outline(self, file_path: str) -> Dict[str, Any]: + """提取文档大纲""" + try: + parse_result = self.parser.parse(file_path) + + if not parse_result.success: + return {"success": False, "error": parse_result.error} + + data = parse_result.data + sections = self.extract_sections(data.get("content", ""), data.get("titles", [])) + + # 构建结构化大纲 + outline = [] + for section in sections: + outline.append({ + "number": section.number, + "title": section.title, + "level": section.level, + "line": section.line_start, + "content_preview": section.content[:100] + "..." if len(section.content) > 100 else section.content, + "subsections": [{ + "number": s.number, + "title": s.title, + "level": s.level, + "line": s.line_start + } for s in section.subsections] + }) + + return { + "success": True, + "outline": outline + } + + except Exception as e: + logger.error(f"大纲提取失败: {str(e)}") + return {"success": False, "error": str(e)} + + async def extract_tables_summary(self, file_path: str) -> Dict[str, Any]: + """提取并总结文档中的表格""" + try: + parse_result = self.parser.parse(file_path) + + if not parse_result.success: + return {"success": False, "error": parse_result.error} + + tables = parse_result.data.get("tables", []) + + if not tables: + return {"success": True, "tables": [], "message": "文档中没有表格"} + + # 提取每个表格的关键信息 + table_summaries = [] + for i, table in enumerate(tables): + summary = { + "index": i + 1, + "headers": table.get("headers", []), + "row_count": table.get("row_count", 0), + "column_count": table.get("column_count", 0), + "preview_rows": table.get("rows", [])[:3], # 只取前3行预览 + "first_column": [row[0] if row else "" for row in table.get("rows", [])[:5]] + } + table_summaries.append(summary) + + return { + "success": True, + "tables": table_summaries, + "table_count": len(tables) + } + + except Exception as e: + logger.error(f"表格提取失败: {str(e)}") + return {"success": False, "error": str(e)} + + +# 全局单例 +markdown_ai_service = MarkdownAIService() diff --git a/backend/app/services/rag_service.py b/backend/app/services/rag_service.py index 65bdb40..2264b11 100644 --- a/backend/app/services/rag_service.py +++ b/backend/app/services/rag_service.py @@ -40,16 +40,31 @@ class RAGService: def _init_embeddings(self): """初始化嵌入模型""" if self.embedding_model is None: - self.embedding_model = SentenceTransformer(settings.EMBEDDING_MODEL) - self._dimension = self.embedding_model.get_sentence_embedding_dimension() - logger.info(f"RAG 嵌入模型初始化完成: {settings.EMBEDDING_MODEL}, 维度: {self._dimension}") + # 使用轻量级本地模型,避免网络问题 + model_name = 'all-MiniLM-L6-v2' + try: + self.embedding_model = SentenceTransformer(model_name) + self._dimension = self.embedding_model.get_sentence_embedding_dimension() + logger.info(f"RAG 嵌入模型初始化完成: {model_name}, 维度: {self._dimension}") + except Exception as e: + logger.warning(f"嵌入模型 {model_name} 加载失败: {e}") + # 如果本地模型也失败,使用简单hash作为后备 + self.embedding_model = None + self._dimension = 384 + logger.info("RAG 使用简化模式 (无向量嵌入)") def _init_vector_store(self): """初始化向量存储""" if self.index is None: self._init_embeddings() - self.index = faiss.IndexIDMap(faiss.IndexFlatIP(self._dimension)) - logger.info("Faiss 向量存储初始化完成") + if self.embedding_model is None: + # 无法加载嵌入模型,使用简化模式 + self._dimension = 384 + self.index = None + logger.warning("RAG 嵌入模型未加载,使用简化模式") + else: + self.index = faiss.IndexIDMap(faiss.IndexFlatIP(self._dimension)) + logger.info("Faiss 向量存储初始化完成") async def initialize(self): """异步初始化""" @@ -78,6 +93,11 @@ class RAGService: if not self._initialized: self._init_vector_store() + # 如果没有嵌入模型,只记录到日志 + if self.embedding_model is None: + logger.debug(f"字段跳过索引 (无嵌入模型): {table_name}.{field_name}") + return + text = f"表名: {table_name}, 字段: {field_name}, 描述: {field_description}" if sample_values: text += f", 示例值: {', '.join(sample_values)}" @@ -100,6 +120,11 @@ class RAGService: if not self._initialized: self._init_vector_store() + # 如果没有嵌入模型,只记录到日志 + if self.embedding_model is None: + logger.debug(f"文档跳过索引 (无嵌入模型): {doc_id}") + return + doc = SimpleDocument( page_content=content, metadata=metadata or {"doc_id": doc_id} diff --git a/backend/app/services/table_rag_service.py b/backend/app/services/table_rag_service.py index 4471e1d..ddf6bf3 100644 --- a/backend/app/services/table_rag_service.py +++ b/backend/app/services/table_rag_service.py @@ -31,6 +31,178 @@ class TableRAGService: self.rag = rag_service self.excel_storage = excel_storage_service + def _extract_sheet_names_from_xml(self, file_path: str) -> List[str]: + """ + 从 Excel 文件的 XML 中提取工作表名称 + + 某些 Excel 文件由于包含非标准元素,pandas/openpyxl 无法正确解析工作表列表, + 此时需要直接从 XML 中提取。 + + Args: + file_path: Excel 文件路径 + + Returns: + 工作表名称列表 + """ + import zipfile + from xml.etree import ElementTree as ET + + try: + with zipfile.ZipFile(file_path, 'r') as z: + # 读取 workbook.xml + if 'xl/workbook.xml' not in z.namelist(): + return [] + + content = z.read('xl/workbook.xml') + root = ET.fromstring(content) + + # 定义命名空间 + ns = {'main': 'http://purl.oclc.org/ooxml/spreadsheetml/main'} + + # 提取所有 sheet 的 name 属性 + sheets = root.findall('.//main:sheet', ns) + return [s.get('name') for s in sheets if s.get('name')] + + except Exception as e: + logger.warning(f"从 XML 提取工作表失败: {file_path}, error: {e}") + return [] + + def _read_excel_sheet(self, file_path: str, sheet_name: str = None, header_row: int = 0) -> pd.DataFrame: + """ + 读取 Excel 工作表,支持 pandas 无法解析的特殊 Excel 文件 + + 当 pandas 的 ExcelFile 无法正确解析时,直接从 XML 读取数据。 + + Args: + file_path: Excel 文件路径 + sheet_name: 工作表名称(如果为 None,读取第一个工作表) + header_row: 表头行号 + + Returns: + DataFrame + """ + import zipfile + from xml.etree import ElementTree as ET + + try: + # 先尝试用 pandas 正常读取 + df = pd.read_excel(file_path, sheet_name=sheet_name, header=header_row) + if df is not None and not df.empty: + return df + except Exception: + pass + + # pandas 读取失败,从 XML 直接解析 + logger.info(f"使用 XML 方式读取 Excel: {file_path}") + + try: + with zipfile.ZipFile(file_path, 'r') as z: + # 获取工作表名称 + sheet_names = self._extract_sheet_names_from_xml(file_path) + if not sheet_names: + raise ValueError("无法从 Excel 文件中找到工作表") + + # 确定要读取的工作表 + target_sheet = sheet_name if sheet_name and sheet_name in sheet_names else sheet_names[0] + sheet_index = sheet_names.index(target_sheet) + 1 # sheet1.xml, sheet2.xml, ... + + # 读取 shared strings + shared_strings = [] + if 'xl/sharedStrings.xml' in z.namelist(): + ss_content = z.read('xl/sharedStrings.xml') + ss_root = ET.fromstring(ss_content) + ns = {'main': 'http://purl.oclc.org/ooxml/spreadsheetml/main'} + for si in ss_root.findall('.//main:si', ns): + t = si.find('.//main:t', ns) + if t is not None: + shared_strings.append(t.text or '') + else: + shared_strings.append('') + + # 读取工作表 + sheet_file = f'xl/worksheets/sheet{sheet_index}.xml' + if sheet_file not in z.namelist(): + raise ValueError(f"工作表文件 {sheet_file} 不存在") + + sheet_content = z.read(sheet_file) + root = ET.fromstring(sheet_content) + ns = {'main': 'http://purl.oclc.org/ooxml/spreadsheetml/main'} + + # 解析行 + rows_data = [] + for row in root.findall('.//main:row', ns): + row_idx = int(row.get('r', 0)) + # header_row 是 0-indexed,row_idx 是 1-indexed + # 如果 header_row=0 表示第一行是表头,需要跳过 row_idx=1 + if row_idx <= header_row + 1: + continue # 跳过表头行 + + row_cells = {} + for cell in row.findall('main:c', ns): + cell_ref = cell.get('r', '') + col_letters = ''.join(filter(str.isalpha, cell_ref)) + cell_type = cell.get('t', 'n') + v = cell.find('main:v', ns) + + if v is not None and v.text: + if cell_type == 's': + # shared string + try: + val = shared_strings[int(v.text)] + except (ValueError, IndexError): + val = v.text + elif cell_type == 'b': + # boolean + val = v.text == '1' + else: + # number or other + val = v.text + else: + val = None + + row_cells[col_letters] = val + + if row_cells: + rows_data.append(row_cells) + + # 转换为 DataFrame + if not rows_data: + return pd.DataFrame() + + df = pd.DataFrame(rows_data) + + # 如果有 header_row,重新设置列名 + if header_row >= 0: + # 重新读取第一行作为表头 + first_row_sheet = f'xl/worksheets/sheet{sheet_index}.xml' + sheet_content = z.read(first_row_sheet) + root = ET.fromstring(sheet_content) + first_row = root.find(f'.//main:row[@r="{header_row + 1}"]', ns) + if first_row is not None: + headers = {} + for cell in first_row.findall('main:c', ns): + cell_ref = cell.get('r', '') + col_letters = ''.join(filter(str.isalpha, cell_ref)) + cell_type = cell.get('t', 'n') + v = cell.find('main:v', ns) + if v is not None and v.text: + if cell_type == 's': + try: + headers[col_letters] = shared_strings[int(v.text)] + except (ValueError, IndexError): + headers[col_letters] = v.text + else: + headers[col_letters] = v.text + # 重命名列 + df.columns = [headers.get(col, col) for col in df.columns] + + logger.info(f"XML 解析完成: {len(df)} 行, {len(df.columns)} 列") + return df + + except Exception as e: + logger.error(f"XML 解析 Excel 失败: {e}") + raise + async def generate_field_description( self, table_name: str, @@ -126,26 +298,49 @@ class TableRAGService: } try: - # 1. 读取 Excel + # 1. 先检查 Excel 文件是否有效 + logger.info(f"正在检查Excel文件: {file_path}") + try: + xls_file = pd.ExcelFile(file_path) + sheet_names = xls_file.sheet_names + logger.info(f"Excel文件工作表: {sheet_names}") + + # 如果 sheet_names 为空,尝试从 XML 中手动提取 + if not sheet_names: + sheet_names = self._extract_sheet_names_from_xml(file_path) + logger.info(f"从XML提取工作表: {sheet_names}") + + if not sheet_names: + return {"success": False, "error": "Excel 文件没有工作表"} + except Exception as e: + logger.error(f"读取Excel文件失败: {file_path}, error: {e}") + return {"success": False, "error": f"无法读取Excel文件: {str(e)}"} + + # 2. 读取 Excel if sheet_name: - df = pd.read_excel(file_path, sheet_name=sheet_name, header=header_row) - else: - df = pd.read_excel(file_path, header=header_row) + # 验证指定的sheet_name是否存在 + if sheet_name not in sheet_names: + logger.warning(f"指定的工作表 '{sheet_name}' 不存在,使用第一个工作表: {sheet_names[0]}") + sheet_name = sheet_names[0] + df = self._read_excel_sheet(file_path, sheet_name=sheet_name, header_row=header_row) + + logger.info(f"读取到数据: {len(df)} 行, {len(df.columns)} 列") if df.empty: return {"success": False, "error": "Excel 文件为空"} # 清理列名 df.columns = [str(c) for c in df.columns] - table_name = excel_storage._sanitize_table_name(filename) + table_name = self.excel_storage._sanitize_table_name(filename) results["table_name"] = table_name results["field_count"] = len(df.columns) + logger.info(f"表名: {table_name}, 字段数: {len(df.columns)}") - # 2. 初始化 RAG (如果需要) + # 3. 初始化 RAG (如果需要) if not self.rag._initialized: self.rag._init_vector_store() - # 3. 为每个字段生成描述并索引 + # 4. 为每个字段生成描述并索引 all_fields_data = {} for col in df.columns: # 采样示例值 @@ -187,7 +382,8 @@ class TableRAGService: logger.error(error_msg) results["errors"].append(error_msg) - # 4. 存储到 MySQL + # 5. 存储到 MySQL + logger.info(f"开始存储到MySQL: {filename}") store_result = await self.excel_storage.store_excel( file_path=file_path, filename=filename, diff --git a/docs/test/test.md b/docs/test/test.md deleted file mode 100644 index 9694da8..0000000 --- a/docs/test/test.md +++ /dev/null @@ -1,113 +0,0 @@ - ✅ Excel 文件解析功能已完成并测试通过 - - 已完成的工作 - - 后端部分 - - 1. 文件服务层 (backend/app/services/file_service.py) - - - 文件保存、读取、删除功能 - - 文件信息获取 - 2. Excel 解析模块 (backend/app/core/document_parser/) - - - base.py - 解析器基类 - - xlsx_parser.py - Excel 文件解析器 - - utils.py - 工具函数 - 3. API 接口 (backend/app/api/endpoints/upload.py) - - - POST /upload/excel - 上传并解析 Excel 文件 - - GET /upload/excel/preview/{file_path} - 获取预览 - - DELETE /upload/file - 删除文件 - 4. 主应用配置 (backend/app/main.py) - - - CORS 中间件配置 - - API 路由注册 - - 前端部分 - - 1. 后端 API 封装 (frontendTest/app-a6ww9j3ja3nl/src/db/backend-api.ts) - - - Excel 上传、预览、删除函数 - 2. Excel 解析页面 (frontendTest/app-a6ww9j3ja3nl/src/pages/ExcelParse.tsx) - - - 拖拽上传界面 - - 解析选项配置 - - 数据表格展示 - - 多工作表支持 - 3. 路由配置 (frontendTest/app-a6ww9j3ja3nl/src/routes.tsx) - - - 添加 /excel-parse 路由 - 4. 环境配置 (.env) - - - 后端 API URL 配置 - - 测试结果 - - 后端 API 测试通过: - - ✅ 健康检查:http://127.0.0.1:8000/health - - ✅ Excel 上传解析:成功解析 5 行 6 列的测试数据 - - 访问方式 - - 1. 后端 API 文档:http://127.0.0.1:8000/api/v1/docs - 2. 前端页面: - - - 启动前端:cd frontendTest/app-a6ww9j3ja3nl && npm run dev - - 访问 Excel 解析页面:/excel-parse - - 下一步建议 - - 1. 启动前端开发服务器测试完整的前端界面 - 2. 添加更多文档格式解析支持(docx, md, txt) - 3. 完善错误处理和用户提示 - -✻ Worked for 4m 18s · 1 background task still running (↓ to manage) - -> 如何启动后端呢 - -● 后端启动步骤如下: - - 方法一:使用命令行启动(开发模式) - - # 1. 进入后端目录 - cd H:/OwnProject/FilesReadSysteam/backend - - # 2. 激活虚拟环境 - # PowerShell: - .\venv\Scripts\Activate.ps1 - - # 或 CMD: - .\venv\Scripts\Activate.bat - - # 3. 启动后端服务器 - python -m uvicorn app.main:app --host 127.0.0.1 --port 8000 --reload - - 方法二:直接使用虚拟环境中的 Python - - cd H:/OwnProject/FilesReadSysteam/backend - ./venv/Scripts/python.exe -m uvicorn app.main:app --host 127.0.0.1 --port 8000 --reload - - 方法三:直接运行 main.py(开发调试) - - cd H:/OwnProject/FilesReadSysteam/backend - ./venv/Scripts/python.exe app/main.py - - 启动成功的标志 - - INFO: Uvicorn running on http://127.0.0.1:8000 - INFO: Started server process [xxxxx] - INFO: Waiting for application startup. - INFO: Application startup complete. - - 验证服务是否正常 - - # 健康检查 - curl http://127.0.0.1:8000/health - - # 访问 API 文档 - # 浏览器打开: http://127.0.0.1:8000/api/v1/docs - ---- - 当前状态:后端已在后台运行(任务 ID: b22jkg69j),可以直接访问 http://127.0.0.1:8000 - - 需要停止的话告诉我即可。 \ No newline at end of file diff --git a/frontend/src/db/backend-api.ts b/frontend/src/db/backend-api.ts index 669d1db..c10d938 100644 --- a/frontend/src/db/backend-api.ts +++ b/frontend/src/db/backend-api.ts @@ -166,6 +166,66 @@ export interface AIAnalysisResult { error?: string; } +// ==================== Markdown AI 分析类型 ==================== + +export interface AIMarkdownAnalyzeResult { + success: boolean; + filename?: string; + analysis_type?: string; + section?: string; + word_count?: number; + structure?: { + title_count?: number; + code_block_count?: number; + table_count?: number; + section_count?: number; + }; + sections?: MarkdownSection[]; + analysis?: string; + chart_data?: { + tables?: Array<{ + description?: string; + columns?: string[]; + rows?: string[][]; + visualization?: { + statistics?: any; + charts?: any; + distributions?: any; + }; + }>; + key_statistics?: Array<{ + name?: string; + value?: string; + trend?: string; + description?: string; + }>; + chart_suggestions?: Array<{ + chart_type?: string; + title?: string; + data_source?: string; + }>; + }; + error?: string; +} + +export interface MarkdownSection { + number: string; + title: string; + level: number; + content_preview?: string; + line_start: number; + line_end?: number; + subsections?: MarkdownSection[]; +} + +export interface MarkdownOutlineResult { + success: boolean; + outline?: MarkdownSection[]; + error?: string; +} + +export type MarkdownAnalysisType = 'summary' | 'outline' | 'key_points' | 'questions' | 'tags' | 'qa' | 'statistics' | 'section' | 'charts'; + export interface AIExcelAnalyzeResult { success: boolean; excel?: { @@ -842,6 +902,159 @@ export const aiApi = { } }, + /** + * 上传并使用 AI 分析 Markdown 文件 + */ + async analyzeMarkdown( + file: File, + options: { + analysisType?: MarkdownAnalysisType; + userPrompt?: string; + sectionNumber?: string; + } = {} + ): Promise { + const formData = new FormData(); + formData.append('file', file); + + const params = new URLSearchParams(); + if (options.analysisType) { + params.append('analysis_type', options.analysisType); + } + if (options.userPrompt) { + params.append('user_prompt', options.userPrompt); + } + if (options.sectionNumber) { + params.append('section_number', options.sectionNumber); + } + + const url = `${BACKEND_BASE_URL}/ai/analyze/md?${params.toString()}`; + + try { + const response = await fetch(url, { + method: 'POST', + body: formData, + }); + + if (!response.ok) { + const error = await response.json(); + throw new Error(error.detail || 'Markdown AI 分析失败'); + } + + return await response.json(); + } catch (error) { + console.error('Markdown AI 分析失败:', error); + throw error; + } + }, + + /** + * 流式分析 Markdown 文件 (SSE) + */ + async analyzeMarkdownStream( + file: File, + options: { + analysisType?: MarkdownAnalysisType; + userPrompt?: string; + sectionNumber?: string; + } = {}, + onChunk?: (chunk: { type: string; delta?: string; error?: string }) => void + ): Promise { + const formData = new FormData(); + formData.append('file', file); + + const params = new URLSearchParams(); + if (options.analysisType) { + params.append('analysis_type', options.analysisType); + } + if (options.userPrompt) { + params.append('user_prompt', options.userPrompt); + } + if (options.sectionNumber) { + params.append('section_number', options.sectionNumber); + } + + const url = `${BACKEND_BASE_URL}/ai/analyze/md/stream?${params.toString()}`; + + try { + const response = await fetch(url, { + method: 'POST', + body: formData, + }); + + if (!response.ok) { + const error = await response.json(); + throw new Error(error.detail || 'Markdown AI 流式分析失败'); + } + + const reader = response.body?.getReader(); + if (!reader) throw new Error('无法读取响应流'); + + const decoder = new TextDecoder(); + let fullResponse = ''; + + while (true) { + const { done, value } = await reader.read(); + if (done) break; + + const chunk = decoder.decode(value); + const lines = chunk.split('\n'); + + for (const line of lines) { + if (line.startsWith('data: ')) { + const data = line.slice(6); + if (data === '[DONE]') continue; + + try { + const parsed = JSON.parse(data); + if (parsed.type === 'content' && parsed.delta) { + fullResponse += parsed.delta; + onChunk?.({ type: 'content', delta: parsed.delta }); + } else if (parsed.type === 'done') { + fullResponse = parsed.full_response || fullResponse; + } else if (parsed.error) { + onChunk?.({ type: 'error', error: parsed.error }); + } + } catch { + // Ignore parse errors for incomplete JSON + } + } + } + } + + return fullResponse; + } catch (error) { + console.error('Markdown AI 流式分析失败:', error); + throw error; + } + }, + + /** + * 获取 Markdown 文档大纲(分章节信息) + */ + async getMarkdownOutline(file: File): Promise { + const formData = new FormData(); + formData.append('file', file); + + const url = `${BACKEND_BASE_URL}/ai/analyze/md/outline`; + + try { + const response = await fetch(url, { + method: 'GET', + body: formData, + }); + + if (!response.ok) { + const error = await response.json(); + throw new Error(error.detail || '获取 Markdown 大纲失败'); + } + + return await response.json(); + } catch (error) { + console.error('获取 Markdown 大纲失败:', error); + throw error; + } + }, + /** * 生成统计信息和图表 */ diff --git a/frontend/src/pages/Documents.tsx b/frontend/src/pages/Documents.tsx index b81e564..29f5969 100644 --- a/frontend/src/pages/Documents.tsx +++ b/frontend/src/pages/Documents.tsx @@ -19,7 +19,11 @@ import { TrendingUp, Download, Brain, - Settings2 + Settings2, + List, + MessageSquareCode, + Tag, + HelpCircle } from 'lucide-react'; import { Button } from '@/components/ui/button'; import { Input } from '@/components/ui/input'; @@ -33,7 +37,7 @@ import { Checkbox } from '@/components/ui/checkbox'; import { toast } from 'sonner'; import { cn } from '@/lib/utils'; import { Skeleton } from '@/components/ui/skeleton'; -import { backendApi, type ExcelParseResult, aiApi } from '@/db/backend-api'; +import { backendApi, type ExcelParseResult, type AIMarkdownAnalyzeResult, type MarkdownSection, aiApi } from '@/db/backend-api'; import { Table as TableComponent, TableBody, @@ -78,6 +82,15 @@ const Documents: React.FC = () => { const [analysisCharts, setAnalysisCharts] = useState(null); const [analysisTypes, setAnalysisTypes] = useState>([]); + // Markdown AI 分析相关状态 + const [mdAnalysis, setMdAnalysis] = useState(null); + const [mdAnalysisType, setMdAnalysisType] = useState<'summary' | 'outline' | 'key_points' | 'questions' | 'tags' | 'qa' | 'statistics' | 'section' | 'charts'>('summary'); + const [mdUserPrompt, setMdUserPrompt] = useState(''); + const [mdSections, setMdSections] = useState([]); + const [mdSelectedSection, setMdSelectedSection] = useState(''); + const [mdStreaming, setMdStreaming] = useState(false); + const [mdStreamingContent, setMdStreamingContent] = useState(''); + // 解析选项 const [parseOptions, setParseOptions] = useState({ parseAllSheets: false, @@ -144,6 +157,9 @@ const Documents: React.FC = () => { setAiAnalysis(null); setAnalysisCharts(null); setExpandedSheet(null); + setMdAnalysis(null); + setMdSections([]); + setMdStreamingContent(''); const ext = file.name.split('.').pop()?.toLowerCase(); @@ -163,6 +179,9 @@ const Documents: React.FC = () => { } else { toast.error(result.error || '解析失败'); } + } else if (ext === 'md' || ext === 'markdown') { + // Markdown 文件:获取大纲 + await fetchMdOutline(); } else { // 其他文档使用通用上传接口 const result = await backendApi.uploadDocument(file); @@ -403,6 +422,106 @@ const Documents: React.FC = () => { } }; + const isMarkdownFile = (filename: string) => { + const ext = filename.split('.').pop()?.toLowerCase(); + return ext === 'md' || ext === 'markdown'; + }; + + // Markdown AI 分析处理 + const handleMdAnalyze = async () => { + if (!uploadedFile || !isMarkdownFile(uploadedFile.name)) { + toast.error('请先上传 Markdown 文件'); + return; + } + + setAnalyzing(true); + setMdAnalysis(null); + + try { + const result = await aiApi.analyzeMarkdown(uploadedFile, { + analysisType: mdAnalysisType, + userPrompt: mdUserPrompt, + sectionNumber: mdSelectedSection || undefined + }); + + if (result.success) { + toast.success('Markdown AI 分析完成'); + setMdAnalysis(result); + } else { + toast.error(result.error || 'AI 分析失败'); + } + } catch (error: any) { + toast.error(error.message || 'AI 分析失败'); + } finally { + setAnalyzing(false); + } + }; + + // 流式分析 Markdown + const handleMdAnalyzeStream = async () => { + if (!uploadedFile || !isMarkdownFile(uploadedFile.name)) { + toast.error('请先上传 Markdown 文件'); + return; + } + + setAnalyzing(true); + setMdStreaming(true); + setMdStreamingContent(''); + setMdAnalysis(null); + + try { + await aiApi.analyzeMarkdownStream( + uploadedFile, + { + analysisType: mdAnalysisType, + userPrompt: mdUserPrompt, + sectionNumber: mdSelectedSection || undefined + }, + (chunk: { type: string; delta?: string; error?: string }) => { + if (chunk.type === 'content' && chunk.delta) { + setMdStreamingContent(prev => prev + chunk.delta); + } else if (chunk.type === 'error') { + toast.error(chunk.error || '流式分析出错'); + } + } + ); + } catch (error: any) { + toast.error(error.message || 'AI 分析失败'); + } finally { + setAnalyzing(false); + setMdStreaming(false); + } + }; + + // 获取 Markdown 文档大纲(分章节) + const fetchMdOutline = async () => { + if (!uploadedFile || !isMarkdownFile(uploadedFile.name)) return; + + try { + const result = await aiApi.getMarkdownOutline(uploadedFile); + if (result.success && result.outline) { + setMdSections(result.outline); + } + } catch (error) { + console.error('获取大纲失败:', error); + } + }; + + const getMdAnalysisIcon = (type: string) => { + switch (type) { + case 'summary': return ; + case 'outline': return ; + case 'key_points': return ; + case 'statistics': return ; + case 'section': return ; + case 'questions': return ; + case 'tags': return ; + case 'qa': return ; + case 'charts': return ; + default: return ; + } + }; + const formatFileSize = (bytes: number): string => { if (bytes === 0) return '0 B'; const k = 1024; @@ -600,6 +719,98 @@ const Documents: React.FC = () => { )} + {/* Markdown AI 分析选项 */} + {uploadedFile && isMarkdownFile(uploadedFile.name) && ( + + + + + Markdown AI 分析 + + + + {/* 章节选择 */} + {mdSections.length > 0 && ( +
+ + +
+ )} +
+ + +
+
+ +