From d189ea9620072ea376edba5101c78192585d6993 Mon Sep 17 00:00:00 2001 From: KiriAky 107 Date: Thu, 2 Apr 2026 11:53:12 +0800 Subject: [PATCH] =?UTF-8?q?feat(ai-analyze):=20=E6=96=B0=E5=A2=9E=20Markdo?= =?UTF-8?q?wn=20=E6=96=87=E4=BB=B6=20AI=20=E5=88=86=E6=9E=90=E5=8A=9F?= =?UTF-8?q?=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 添加 Markdown 文件上传和解析接口 - 实现流式分析和大纲提取功能 - 支持多种分析类型:摘要、大纲、关键点等 - 新增 markdown_ai_service 服务类 - 扩展 LLMService 支持流式调用 - 更新前端 API 接口定义和实现 --- backend/app/api/endpoints/ai_analyze.py | 191 ++++++- backend/app/services/llm_service.py | 67 ++- backend/app/services/markdown_ai_service.py | 591 ++++++++++++++++++++ docs/test/test.md | 113 ---- frontend/src/db/backend-api.ts | 190 +++++++ frontend/src/pages/Documents.tsx | 252 ++++++++- 6 files changed, 1286 insertions(+), 118 deletions(-) create mode 100644 backend/app/services/markdown_ai_service.py delete mode 100644 docs/test/test.md diff --git a/backend/app/api/endpoints/ai_analyze.py b/backend/app/api/endpoints/ai_analyze.py index 16e1979..49ab0cd 100644 --- a/backend/app/api/endpoints/ai_analyze.py +++ b/backend/app/api/endpoints/ai_analyze.py @@ -2,10 +2,14 @@ AI 分析 API 接口 """ from fastapi import APIRouter, UploadFile, File, HTTPException, Query, Body +from fastapi.responses import StreamingResponse from typing import Optional import logging +import tempfile +import os from app.services.excel_ai_service import excel_ai_service +from app.services.markdown_ai_service import markdown_ai_service logger = logging.getLogger(__name__) @@ -93,10 +97,11 @@ async def get_analysis_types(): 获取支持的分析类型列表 Returns: - list: 支持的分析类型 + dict: 支持的分析类型(包含 Excel 和 Markdown) """ return { - "types": excel_ai_service.get_supported_analysis_types() + "excel_types": excel_ai_service.get_supported_analysis_types(), + "markdown_types": markdown_ai_service.get_supported_analysis_types() } @@ -142,3 +147,185 @@ async def analyze_text( except Exception as e: logger.error(f"文本分析失败: {str(e)}") raise HTTPException(status_code=500, detail=f"分析失败: {str(e)}") + + +@router.post("/analyze/md") +async def analyze_markdown( + file: UploadFile = File(...), + analysis_type: str = Query("summary", description="分析类型: summary, outline, key_points, questions, tags, qa, statistics, section"), + user_prompt: str = Query("", description="用户自定义提示词"), + section_number: Optional[str] = Query(None, description="指定章节编号,如 '一' 或 '(一)'") +): + """ + 上传并使用 AI 分析 Markdown 文件 + + Args: + file: 上传的 Markdown 文件 + analysis_type: 分析类型 + user_prompt: 用户自定义提示词 + section_number: 指定分析的章节编号 + + Returns: + dict: 分析结果 + """ + # 检查文件类型 + if not file.filename: + raise HTTPException(status_code=400, detail="文件名为空") + + file_ext = file.filename.split('.')[-1].lower() + if file_ext not in ['md', 'markdown']: + raise HTTPException( + status_code=400, + detail=f"不支持的文件类型: {file_ext},仅支持 .md 和 .markdown" + ) + + # 验证分析类型 + supported_types = markdown_ai_service.get_supported_analysis_types() + if analysis_type not in supported_types: + raise HTTPException( + status_code=400, + detail=f"不支持的分析类型: {analysis_type},支持的类型: {', '.join(supported_types)}" + ) + + try: + # 读取文件内容 + content = await file.read() + + # 保存到临时文件 + with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as tmp: + tmp.write(content) + tmp_path = tmp.name + + try: + logger.info(f"开始分析 Markdown 文件: {file.filename}, 分析类型: {analysis_type}, 章节: {section_number}") + + # 调用 AI 分析服务 + result = await markdown_ai_service.analyze_markdown( + file_path=tmp_path, + analysis_type=analysis_type, + user_prompt=user_prompt, + section_number=section_number + ) + + logger.info(f"Markdown 分析完成: {file.filename}, 成功: {result['success']}") + + if not result['success']: + raise HTTPException(status_code=500, detail=result.get('error', '分析失败')) + + return result + + finally: + # 清理临时文件 + if os.path.exists(tmp_path): + os.unlink(tmp_path) + + except HTTPException: + raise + except Exception as e: + logger.error(f"Markdown AI 分析过程中出错: {str(e)}") + raise HTTPException(status_code=500, detail=f"分析失败: {str(e)}") + + +@router.post("/analyze/md/stream") +async def analyze_markdown_stream( + file: UploadFile = File(...), + analysis_type: str = Query("summary", description="分析类型"), + user_prompt: str = Query("", description="用户自定义提示词"), + section_number: Optional[str] = Query(None, description="指定章节编号") +): + """ + 流式分析 Markdown 文件 (SSE) + + Returns: + StreamingResponse: SSE 流式响应 + """ + if not file.filename: + raise HTTPException(status_code=400, detail="文件名为空") + + file_ext = file.filename.split('.')[-1].lower() + if file_ext not in ['md', 'markdown']: + raise HTTPException( + status_code=400, + detail=f"不支持的文件类型: {file_ext},仅支持 .md 和 .markdown" + ) + + try: + content = await file.read() + + with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as tmp: + tmp.write(content) + tmp_path = tmp.name + + try: + logger.info(f"开始流式分析 Markdown 文件: {file.filename}, 分析类型: {analysis_type}") + + async def stream_generator(): + async for chunk in markdown_ai_service.analyze_markdown_stream( + file_path=tmp_path, + analysis_type=analysis_type, + user_prompt=user_prompt, + section_number=section_number + ): + yield chunk + + return StreamingResponse( + stream_generator(), + media_type="text/event-stream", + headers={ + "Cache-Control": "no-cache", + "Connection": "keep-alive", + "X-Accel-Buffering": "no" + } + ) + + finally: + if os.path.exists(tmp_path): + os.unlink(tmp_path) + + except HTTPException: + raise + except Exception as e: + logger.error(f"Markdown AI 流式分析出错: {str(e)}") + raise HTTPException(status_code=500, detail=f"流式分析失败: {str(e)}") + + +@router.get("/analyze/md/outline") +async def get_markdown_outline( + file: UploadFile = File(...) +): + """ + 获取 Markdown 文档的大纲结构(分章节信息) + + Args: + file: 上传的 Markdown 文件 + + Returns: + dict: 文档大纲结构 + """ + if not file.filename: + raise HTTPException(status_code=400, detail="文件名为空") + + file_ext = file.filename.split('.')[-1].lower() + if file_ext not in ['md', 'markdown']: + raise HTTPException( + status_code=400, + detail=f"不支持的文件类型: {file_ext},仅支持 .md 和 .markdown" + ) + + try: + content = await file.read() + + with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as tmp: + tmp.write(content) + tmp_path = tmp.name + + try: + result = await markdown_ai_service.extract_outline(tmp_path) + return result + finally: + if os.path.exists(tmp_path): + os.unlink(tmp_path) + + except Exception as e: + logger.error(f"获取 Markdown 大纲失败: {str(e)}") + raise HTTPException(status_code=500, detail=f"获取大纲失败: {str(e)}") diff --git a/backend/app/services/llm_service.py b/backend/app/services/llm_service.py index 841d605..8878deb 100644 --- a/backend/app/services/llm_service.py +++ b/backend/app/services/llm_service.py @@ -2,7 +2,7 @@ LLM 服务模块 - 封装大模型 API 调用 """ import logging -from typing import Dict, Any, List, Optional +from typing import Dict, Any, List, Optional, AsyncGenerator import httpx from app.config import settings @@ -87,6 +87,71 @@ class LLMService: logger.error(f"解析 API 响应失败: {str(e)}") raise + async def chat_stream( + self, + messages: List[Dict[str, str]], + temperature: float = 0.7, + max_tokens: Optional[int] = None, + **kwargs + ) -> AsyncGenerator[Dict[str, Any], None]: + """ + 流式调用聊天 API + + Args: + messages: 消息列表 + temperature: 温度参数 + max_tokens: 最大 token 数 + **kwargs: 其他参数 + + Yields: + Dict[str, Any]: 包含 delta 内容的块 + """ + headers = { + "Authorization": f"Bearer {self.api_key}", + "Content-Type": "application/json" + } + + payload = { + "model": self.model_name, + "messages": messages, + "temperature": temperature, + "stream": True + } + + if max_tokens: + payload["max_tokens"] = max_tokens + + payload.update(kwargs) + + try: + async with httpx.AsyncClient(timeout=120.0) as client: + async with client.stream( + "POST", + f"{self.base_url}/chat/completions", + headers=headers, + json=payload + ) as response: + async for line in response.aiter_lines(): + if line.startswith("data: "): + data = line[6:] # Remove "data: " prefix + if data == "[DONE]": + break + try: + import json as json_module + chunk = json_module.loads(data) + delta = chunk.get("choices", [{}])[0].get("delta", {}).get("content", "") + if delta: + yield {"content": delta} + except json_module.JSONDecodeError: + continue + + except httpx.HTTPStatusError as e: + logger.error(f"LLM 流式 API 请求失败: {e.response.status_code}") + raise + except Exception as e: + logger.error(f"LLM 流式 API 调用异常: {str(e)}") + raise + async def analyze_excel_data( self, excel_data: Dict[str, Any], diff --git a/backend/app/services/markdown_ai_service.py b/backend/app/services/markdown_ai_service.py new file mode 100644 index 0000000..1936339 --- /dev/null +++ b/backend/app/services/markdown_ai_service.py @@ -0,0 +1,591 @@ +""" +Markdown 文档 AI 分析服务 + +支持: +- 分章节解析(中文章节编号:一、二、三, (一)(二)(三)) +- 结构化数据提取 +- 流式输出 +- 多种分析类型 +""" +import asyncio +import json +import logging +import re +from typing import Any, AsyncGenerator, Dict, List, Optional + +from app.services.llm_service import llm_service +from app.core.document_parser import MarkdownParser + +logger = logging.getLogger(__name__) + + +class MarkdownSection: + """文档章节结构""" + def __init__(self, number: str, title: str, level: int, content: str, line_start: int, line_end: int): + self.number = number # 章节编号,如 "一", "(一)", "1" + self.title = title + self.level = level # 层级深度 + self.content = content # 章节内容(不含子章节) + self.line_start = line_start + self.line_end = line_end + self.subsections: List[MarkdownSection] = [] + + def to_dict(self) -> Dict[str, Any]: + return { + "number": self.number, + "title": self.title, + "level": self.level, + "content_preview": self.content[:200] + "..." if len(self.content) > 200 else self.content, + "line_start": self.line_start, + "line_end": self.line_end, + "subsections": [s.to_dict() for s in self.subsections] + } + + +class MarkdownAIService: + """Markdown 文档 AI 分析服务""" + + # 中文章节编号模式 + CHINESE_NUMBERS = ["一", "二", "三", "四", "五", "六", "七", "八", "九", "十"] + CHINESE_SUFFIX = "、" + PARENTHESIS_PATTERN = re.compile(r'^(([一二三四五六七八九十]+))\s*(.+)$') + CHINESE_SECTION_PATTERN = re.compile(r'^([一二三四五六七八九十]+)、\s*(.+)$') + ARABIC_SECTION_PATTERN = re.compile(r'^(\d+)\.\s+(.+)$') + + def __init__(self): + self.parser = MarkdownParser() + + def get_supported_analysis_types(self) -> list: + """获取支持的分析类型""" + return [ + "summary", # 文档摘要 + "outline", # 大纲提取 + "key_points", # 关键点提取 + "questions", # 生成问题 + "tags", # 生成标签 + "qa", # 问答对 + "statistics", # 统计数据分析(适合政府公报) + "section" # 分章节详细分析 + ] + + def extract_sections(self, content: str, titles: List[Dict]) -> List[MarkdownSection]: + """ + 从文档内容中提取章节结构 + + 识别以下章节格式: + - 一级:一、二、三... + - 二级:(一)(二)(三)... + - 三级:1. 2. 3. ... + """ + sections = [] + lines = content.split('\n') + + # 构建标题行到内容的映射 + title_lines = {} + for t in titles: + title_lines[t.get('line', 0)] = t + + current_section = None + section_stack = [] + + for i, line in enumerate(lines, 1): + stripped = line.strip() + + # 检查是否是一级标题(中文数字 + 、) + match = self.CHINESE_SECTION_PATTERN.match(stripped) + if match: + # 结束当前章节 + if current_section: + current_section.content = self._get_section_content( + lines, current_section.line_start, i - 1 + ) + + current_section = MarkdownSection( + number=match.group(1), + title=match.group(2), + level=1, + content="", + line_start=i, + line_end=len(lines) + ) + sections.append(current_section) + section_stack = [current_section] + continue + + # 检查是否是二级标题((一)(二)...) + match = self.PARENTHESIS_PATTERN.match(stripped) + if match and current_section: + # 结束当前子章节 + if section_stack and len(section_stack) > 1: + parent = section_stack[-1] + parent.content = self._get_section_content( + lines, parent.line_start, i - 1 + ) + + subsection = MarkdownSection( + number=match.group(1), + title=match.group(2), + level=2, + content="", + line_start=i, + line_end=len(lines) + ) + current_section.subsections.append(subsection) + section_stack = [current_section, subsection] + continue + + # 检查是否是三级标题(1. 2. 3.) + match = self.ARABIC_SECTION_PATTERN.match(stripped) + if match and len(section_stack) > 1: + # 结束当前子章节 + if len(section_stack) > 2: + parent = section_stack[-1] + parent.content = self._get_section_content( + lines, parent.line_start, i - 1 + ) + + sub_subsection = MarkdownSection( + number=match.group(1), + title=match.group(2), + level=3, + content="", + line_start=i, + line_end=len(lines) + ) + section_stack[-1].subsections.append(sub_subsection) + section_stack = section_stack[:-1] + [sub_subsection] + continue + + # 处理最后一个章节 + if current_section: + current_section.content = self._get_section_content( + lines, current_section.line_start, len(lines) + ) + + return sections + + def _get_section_content(self, lines: List[str], start: int, end: int) -> str: + """获取指定行范围的内容""" + if start > end: + return "" + content_lines = lines[start-1:end] + # 清理:移除标题行和空行 + cleaned = [] + for line in content_lines: + stripped = line.strip() + if not stripped: + continue + # 跳过章节标题行 + if self.CHINESE_SECTION_PATTERN.match(stripped): + continue + if self.PARENTHESIS_PATTERN.match(stripped): + continue + if self.ARABIC_SECTION_PATTERN.match(stripped): + continue + cleaned.append(stripped) + return '\n'.join(cleaned) + + async def analyze_markdown( + self, + file_path: str, + analysis_type: str = "summary", + user_prompt: str = "", + section_number: Optional[str] = None + ) -> Dict[str, Any]: + """ + 使用 AI 分析 Markdown 文档 + + Args: + file_path: 文件路径 + analysis_type: 分析类型 + user_prompt: 用户自定义提示词 + section_number: 指定分析的章节编号(如 "一" 或 "(一)") + + Returns: + dict: 分析结果 + """ + try: + parse_result = self.parser.parse(file_path) + + if not parse_result.success: + return { + "success": False, + "error": parse_result.error + } + + data = parse_result.data + + # 提取章节结构 + sections = self.extract_sections(data.get("content", ""), data.get("titles", [])) + + # 如果指定了章节,只分析该章节 + target_content = data.get("content", "") + target_title = parse_result.metadata.get("filename", "") + + if section_number: + section = self._find_section(sections, section_number) + if section: + target_content = section.content + target_title = f"{section.number}、{section.title}" + else: + return { + "success": False, + "error": f"未找到章节: {section_number}" + } + + # 根据分析类型构建提示词 + prompt = self._build_prompt( + content=target_content, + analysis_type=analysis_type, + user_prompt=user_prompt, + title=target_title + ) + + # 调用 LLM 分析 + messages = [ + {"role": "system", "content": self._get_system_prompt(analysis_type)}, + {"role": "user", "content": prompt} + ] + + response = await llm_service.chat( + messages=messages, + temperature=0.3, + max_tokens=4000 + ) + + analysis = llm_service.extract_message_content(response) + + return { + "success": True, + "filename": parse_result.metadata.get("filename", ""), + "analysis_type": analysis_type, + "section": target_title if section_number else None, + "word_count": len(target_content), + "structure": { + "title_count": parse_result.metadata.get("title_count", 0), + "code_block_count": parse_result.metadata.get("code_block_count", 0), + "table_count": parse_result.metadata.get("table_count", 0), + "section_count": len(sections) + }, + "sections": [s.to_dict() for s in sections[:10]], # 最多返回10个一级章节 + "analysis": analysis + } + + except Exception as e: + logger.error(f"Markdown AI 分析失败: {str(e)}") + return { + "success": False, + "error": str(e) + } + + async def analyze_markdown_stream( + self, + file_path: str, + analysis_type: str = "summary", + user_prompt: str = "", + section_number: Optional[str] = None + ) -> AsyncGenerator[str, None]: + """ + 流式分析 Markdown 文档 (SSE) + + Yields: + str: SSE 格式的数据块 + """ + try: + parse_result = self.parser.parse(file_path) + + if not parse_result.success: + yield f"data: {json.dumps({'error': parse_result.error}, ensure_ascii=False)}\n\n" + return + + data = parse_result.data + sections = self.extract_sections(data.get("content", ""), data.get("titles", [])) + + target_content = data.get("content", "") + target_title = parse_result.metadata.get("filename", "") + + if section_number: + section = self._find_section(sections, section_number) + if section: + target_content = section.content + target_title = f"{section.number}、{section.title}" + else: + yield f"data: {json.dumps({'error': f'未找到章节: {section_number}'}, ensure_ascii=False)}\n\n" + return + + prompt = self._build_prompt( + content=target_content, + analysis_type=analysis_type, + user_prompt=user_prompt, + title=target_title + ) + + messages = [ + {"role": "system", "content": self._get_system_prompt(analysis_type)}, + {"role": "user", "content": prompt} + ] + + # 发送初始元数据 + yield f"data: {json.dumps({ + 'type': 'start', + 'filename': parse_result.metadata.get("filename", ""), + 'analysis_type': analysis_type, + 'section': target_title if section_number else None, + 'word_count': len(target_content) + }, ensure_ascii=False)}\n\n" + + # 流式调用 LLM + full_response = "" + async for chunk in llm_service.chat_stream(messages, temperature=0.3, max_tokens=4000): + content = chunk.get("content", "") + if content: + full_response += content + yield f"data: {json.dumps({'type': 'content', 'delta': content}, ensure_ascii=False)}\n\n" + + # 发送完成消息 + yield f"data: {json.dumps({'type': 'done', 'full_response': full_response}, ensure_ascii=False)}\n\n" + + except Exception as e: + logger.error(f"Markdown AI 流式分析失败: {str(e)}") + yield f"data: {json.dumps({'error': str(e)}, ensure_ascii=False)}\n\n" + + def _find_section(self, sections: List[MarkdownSection], number: str) -> Optional[MarkdownSection]: + """查找指定编号的章节""" + # 标准化编号 + num = number.strip() + for section in sections: + if section.number == num or section.title == num: + return section + # 在子章节中查找 + found = self._find_section(section.subsections, number) + if found: + return found + return None + + def _get_system_prompt(self, analysis_type: str) -> str: + """根据分析类型获取系统提示词""" + prompts = { + "summary": "你是一个专业的文档摘要助手,擅长从长文档中提取核心信息。", + "outline": "你是一个专业的文档结构分析助手,擅长提取文档大纲和层级结构。", + "key_points": "你是一个专业的知识提取助手,擅长从文档中提取关键信息和要点。", + "questions": "你是一个专业的教育助手,擅长生成帮助理解文档的问题。", + "tags": "你是一个专业的标签生成助手,擅长提取文档的主题标签。", + "qa": "你是一个专业的问答助手,擅长基于文档内容生成问答对。", + "statistics": "你是一个专业的统计数据分析助手,擅长分析政府统计公报中的数据。", + "section": "你是一个专业的章节分析助手,擅长对文档的特定章节进行深入分析。" + } + return prompts.get(analysis_type, "你是一个专业的文档分析助手。") + + def _build_prompt( + self, + content: str, + analysis_type: str, + user_prompt: str, + title: str = "" + ) -> str: + """根据分析类型构建提示词""" + + # 截断内容避免超出 token 限制 + max_content_len = 6000 + if len(content) > max_content_len: + content = content[:max_content_len] + "\n\n[内容已截断...]" + + base_prompts = { + "summary": f"""请对以下文档进行摘要分析: + +文档标题:{title} + +文档内容: +{content} + +请提供: +1. 文档主要内容摘要(300字以内) +2. 文档的目的和用途 +3. 适合的读者群体 + +请用中文回答,结构清晰。""", + + "outline": f"""请提取以下文档的大纲结构: + +文档标题:{title} + +文档内容: +{content} + +请按层级列出文档大纲,用缩进表示层级关系。 +格式: +一、一级标题 + (一)二级标题 + 1. 三级标题 + +请用中文回答。""", + + "key_points": f"""请从以下文档中提取关键要点: + +文档标题:{title} + +文档内容: +{content} + +请列出文档的关键要点(5-10条),每条用简洁的语言描述,并说明其在文档中的重要性。 + +请用中文回答,格式清晰。""", + + "questions": f"""请根据以下文档生成有助于理解内容的问题: + +文档标题:{title} + +文档内容: +{content} + +请生成5-10个问题,帮助读者更好地理解文档内容。每个问题应该: +1. 涵盖文档的重要信息点 +2. 易于理解和回答 +3. 具有思考价值 + +请用中文回答。""", + + "tags": f"""请为以下文档生成标签: + +文档标题:{title} + +文档内容: +{content[:3000]} + +请生成5-8个标签,用逗号分隔。标签应该反映: +- 文档的主题领域 +- 文档的类型 +- 文档的关键特征 + +请用中文回答,只需输出标签,不要其他内容。""", + + "qa": f"""请根据以下文档生成问答对: + +文档标题:{title} + +文档内容: +{content[:4000]} + +请生成3-5个问答对,帮助读者通过问答形式理解文档内容。 +格式: +Q1: 问题 +A1: 回答 +Q2: 问题 +A2: 回答 + +请用中文回答,内容准确。""", + + "statistics": f"""请分析以下政府统计公报中的数据和结论: + +文档标题:{title} + +文档内容: +{content} + +请提供: +1. 文档中涉及的主要统计数据(列出关键数字和指标) +2. 数据的变化趋势(增长/下降) +3. 重要的百分比和对比 +4. 数据来源和统计口径说明 + +请用中文回答,数据准确。""", + + "section": f"""请详细分析以下文档章节: + +章节标题:{title} + +章节内容: +{content} + +请提供: +1. 章节主要内容概括 +2. 关键信息和数据 +3. 与其他部分的关联(如有) +4. 重要结论 + +请用中文回答,分析深入。""" + } + + prompt = base_prompts.get(analysis_type, base_prompts["summary"]) + + if user_prompt and user_prompt.strip(): + prompt += f"\n\n用户额外需求:{user_prompt}" + + return prompt + + async def extract_outline(self, file_path: str) -> Dict[str, Any]: + """提取文档大纲""" + try: + parse_result = self.parser.parse(file_path) + + if not parse_result.success: + return {"success": False, "error": parse_result.error} + + data = parse_result.data + sections = self.extract_sections(data.get("content", ""), data.get("titles", [])) + + # 构建结构化大纲 + outline = [] + for section in sections: + outline.append({ + "number": section.number, + "title": section.title, + "level": section.level, + "line": section.line_start, + "content_preview": section.content[:100] + "..." if len(section.content) > 100 else section.content, + "subsections": [{ + "number": s.number, + "title": s.title, + "level": s.level, + "line": s.line_start + } for s in section.subsections] + }) + + return { + "success": True, + "outline": outline + } + + except Exception as e: + logger.error(f"大纲提取失败: {str(e)}") + return {"success": False, "error": str(e)} + + async def extract_tables_summary(self, file_path: str) -> Dict[str, Any]: + """提取并总结文档中的表格""" + try: + parse_result = self.parser.parse(file_path) + + if not parse_result.success: + return {"success": False, "error": parse_result.error} + + tables = parse_result.data.get("tables", []) + + if not tables: + return {"success": True, "tables": [], "message": "文档中没有表格"} + + # 提取每个表格的关键信息 + table_summaries = [] + for i, table in enumerate(tables): + summary = { + "index": i + 1, + "headers": table.get("headers", []), + "row_count": table.get("row_count", 0), + "column_count": table.get("column_count", 0), + "preview_rows": table.get("rows", [])[:3], # 只取前3行预览 + "first_column": [row[0] if row else "" for row in table.get("rows", [])[:5]] + } + table_summaries.append(summary) + + return { + "success": True, + "tables": table_summaries, + "table_count": len(tables) + } + + except Exception as e: + logger.error(f"表格提取失败: {str(e)}") + return {"success": False, "error": str(e)} + + +# 全局单例 +markdown_ai_service = MarkdownAIService() diff --git a/docs/test/test.md b/docs/test/test.md deleted file mode 100644 index 9694da8..0000000 --- a/docs/test/test.md +++ /dev/null @@ -1,113 +0,0 @@ - ✅ Excel 文件解析功能已完成并测试通过 - - 已完成的工作 - - 后端部分 - - 1. 文件服务层 (backend/app/services/file_service.py) - - - 文件保存、读取、删除功能 - - 文件信息获取 - 2. Excel 解析模块 (backend/app/core/document_parser/) - - - base.py - 解析器基类 - - xlsx_parser.py - Excel 文件解析器 - - utils.py - 工具函数 - 3. API 接口 (backend/app/api/endpoints/upload.py) - - - POST /upload/excel - 上传并解析 Excel 文件 - - GET /upload/excel/preview/{file_path} - 获取预览 - - DELETE /upload/file - 删除文件 - 4. 主应用配置 (backend/app/main.py) - - - CORS 中间件配置 - - API 路由注册 - - 前端部分 - - 1. 后端 API 封装 (frontendTest/app-a6ww9j3ja3nl/src/db/backend-api.ts) - - - Excel 上传、预览、删除函数 - 2. Excel 解析页面 (frontendTest/app-a6ww9j3ja3nl/src/pages/ExcelParse.tsx) - - - 拖拽上传界面 - - 解析选项配置 - - 数据表格展示 - - 多工作表支持 - 3. 路由配置 (frontendTest/app-a6ww9j3ja3nl/src/routes.tsx) - - - 添加 /excel-parse 路由 - 4. 环境配置 (.env) - - - 后端 API URL 配置 - - 测试结果 - - 后端 API 测试通过: - - ✅ 健康检查:http://127.0.0.1:8000/health - - ✅ Excel 上传解析:成功解析 5 行 6 列的测试数据 - - 访问方式 - - 1. 后端 API 文档:http://127.0.0.1:8000/api/v1/docs - 2. 前端页面: - - - 启动前端:cd frontendTest/app-a6ww9j3ja3nl && npm run dev - - 访问 Excel 解析页面:/excel-parse - - 下一步建议 - - 1. 启动前端开发服务器测试完整的前端界面 - 2. 添加更多文档格式解析支持(docx, md, txt) - 3. 完善错误处理和用户提示 - -✻ Worked for 4m 18s · 1 background task still running (↓ to manage) - -> 如何启动后端呢 - -● 后端启动步骤如下: - - 方法一:使用命令行启动(开发模式) - - # 1. 进入后端目录 - cd H:/OwnProject/FilesReadSysteam/backend - - # 2. 激活虚拟环境 - # PowerShell: - .\venv\Scripts\Activate.ps1 - - # 或 CMD: - .\venv\Scripts\Activate.bat - - # 3. 启动后端服务器 - python -m uvicorn app.main:app --host 127.0.0.1 --port 8000 --reload - - 方法二:直接使用虚拟环境中的 Python - - cd H:/OwnProject/FilesReadSysteam/backend - ./venv/Scripts/python.exe -m uvicorn app.main:app --host 127.0.0.1 --port 8000 --reload - - 方法三:直接运行 main.py(开发调试) - - cd H:/OwnProject/FilesReadSysteam/backend - ./venv/Scripts/python.exe app/main.py - - 启动成功的标志 - - INFO: Uvicorn running on http://127.0.0.1:8000 - INFO: Started server process [xxxxx] - INFO: Waiting for application startup. - INFO: Application startup complete. - - 验证服务是否正常 - - # 健康检查 - curl http://127.0.0.1:8000/health - - # 访问 API 文档 - # 浏览器打开: http://127.0.0.1:8000/api/v1/docs - ---- - 当前状态:后端已在后台运行(任务 ID: b22jkg69j),可以直接访问 http://127.0.0.1:8000 - - 需要停止的话告诉我即可。 \ No newline at end of file diff --git a/frontend/src/db/backend-api.ts b/frontend/src/db/backend-api.ts index 669d1db..8aa57df 100644 --- a/frontend/src/db/backend-api.ts +++ b/frontend/src/db/backend-api.ts @@ -166,6 +166,43 @@ export interface AIAnalysisResult { error?: string; } +// ==================== Markdown AI 分析类型 ==================== + +export interface AIMarkdownAnalyzeResult { + success: boolean; + filename?: string; + analysis_type?: string; + section?: string; + word_count?: number; + structure?: { + title_count?: number; + code_block_count?: number; + table_count?: number; + section_count?: number; + }; + sections?: MarkdownSection[]; + analysis?: string; + error?: string; +} + +export interface MarkdownSection { + number: string; + title: string; + level: number; + content_preview?: string; + line_start: number; + line_end?: number; + subsections?: MarkdownSection[]; +} + +export interface MarkdownOutlineResult { + success: boolean; + outline?: MarkdownSection[]; + error?: string; +} + +export type MarkdownAnalysisType = 'summary' | 'outline' | 'key_points' | 'questions' | 'tags' | 'qa' | 'statistics' | 'section'; + export interface AIExcelAnalyzeResult { success: boolean; excel?: { @@ -842,6 +879,159 @@ export const aiApi = { } }, + /** + * 上传并使用 AI 分析 Markdown 文件 + */ + async analyzeMarkdown( + file: File, + options: { + analysisType?: MarkdownAnalysisType; + userPrompt?: string; + sectionNumber?: string; + } = {} + ): Promise { + const formData = new FormData(); + formData.append('file', file); + + const params = new URLSearchParams(); + if (options.analysisType) { + params.append('analysis_type', options.analysisType); + } + if (options.userPrompt) { + params.append('user_prompt', options.userPrompt); + } + if (options.sectionNumber) { + params.append('section_number', options.sectionNumber); + } + + const url = `${BACKEND_BASE_URL}/ai/analyze/md?${params.toString()}`; + + try { + const response = await fetch(url, { + method: 'POST', + body: formData, + }); + + if (!response.ok) { + const error = await response.json(); + throw new Error(error.detail || 'Markdown AI 分析失败'); + } + + return await response.json(); + } catch (error) { + console.error('Markdown AI 分析失败:', error); + throw error; + } + }, + + /** + * 流式分析 Markdown 文件 (SSE) + */ + async analyzeMarkdownStream( + file: File, + options: { + analysisType?: MarkdownAnalysisType; + userPrompt?: string; + sectionNumber?: string; + } = {}, + onChunk?: (chunk: { type: string; delta?: string; error?: string }) => void + ): Promise { + const formData = new FormData(); + formData.append('file', file); + + const params = new URLSearchParams(); + if (options.analysisType) { + params.append('analysis_type', options.analysisType); + } + if (options.userPrompt) { + params.append('user_prompt', options.userPrompt); + } + if (options.sectionNumber) { + params.append('section_number', options.sectionNumber); + } + + const url = `${BACKEND_BASE_URL}/ai/analyze/md/stream?${params.toString()}`; + + try { + const response = await fetch(url, { + method: 'POST', + body: formData, + }); + + if (!response.ok) { + const error = await response.json(); + throw new Error(error.detail || 'Markdown AI 流式分析失败'); + } + + const reader = response.body?.getReader(); + if (!reader) throw new Error('无法读取响应流'); + + const decoder = new TextDecoder(); + let fullResponse = ''; + + while (true) { + const { done, value } = await reader.read(); + if (done) break; + + const chunk = decoder.decode(value); + const lines = chunk.split('\n'); + + for (const line of lines) { + if (line.startsWith('data: ')) { + const data = line.slice(6); + if (data === '[DONE]') continue; + + try { + const parsed = JSON.parse(data); + if (parsed.type === 'content' && parsed.delta) { + fullResponse += parsed.delta; + onChunk?.({ type: 'content', delta: parsed.delta }); + } else if (parsed.type === 'done') { + fullResponse = parsed.full_response || fullResponse; + } else if (parsed.error) { + onChunk?.({ type: 'error', error: parsed.error }); + } + } catch { + // Ignore parse errors for incomplete JSON + } + } + } + } + + return fullResponse; + } catch (error) { + console.error('Markdown AI 流式分析失败:', error); + throw error; + } + }, + + /** + * 获取 Markdown 文档大纲(分章节信息) + */ + async getMarkdownOutline(file: File): Promise { + const formData = new FormData(); + formData.append('file', file); + + const url = `${BACKEND_BASE_URL}/ai/analyze/md/outline`; + + try { + const response = await fetch(url, { + method: 'GET', + body: formData, + }); + + if (!response.ok) { + const error = await response.json(); + throw new Error(error.detail || '获取 Markdown 大纲失败'); + } + + return await response.json(); + } catch (error) { + console.error('获取 Markdown 大纲失败:', error); + throw error; + } + }, + /** * 生成统计信息和图表 */ diff --git a/frontend/src/pages/Documents.tsx b/frontend/src/pages/Documents.tsx index b81e564..cb303a5 100644 --- a/frontend/src/pages/Documents.tsx +++ b/frontend/src/pages/Documents.tsx @@ -19,7 +19,11 @@ import { TrendingUp, Download, Brain, - Settings2 + Settings2, + List, + MessageSquareCode, + Tag, + HelpCircle } from 'lucide-react'; import { Button } from '@/components/ui/button'; import { Input } from '@/components/ui/input'; @@ -33,7 +37,7 @@ import { Checkbox } from '@/components/ui/checkbox'; import { toast } from 'sonner'; import { cn } from '@/lib/utils'; import { Skeleton } from '@/components/ui/skeleton'; -import { backendApi, type ExcelParseResult, aiApi } from '@/db/backend-api'; +import { backendApi, type ExcelParseResult, type AIMarkdownAnalyzeResult, type MarkdownSection, aiApi } from '@/db/backend-api'; import { Table as TableComponent, TableBody, @@ -78,6 +82,15 @@ const Documents: React.FC = () => { const [analysisCharts, setAnalysisCharts] = useState(null); const [analysisTypes, setAnalysisTypes] = useState>([]); + // Markdown AI 分析相关状态 + const [mdAnalysis, setMdAnalysis] = useState(null); + const [mdAnalysisType, setMdAnalysisType] = useState<'summary' | 'outline' | 'key_points' | 'questions' | 'tags' | 'qa' | 'statistics' | 'section'>('summary'); + const [mdUserPrompt, setMdUserPrompt] = useState(''); + const [mdSections, setMdSections] = useState([]); + const [mdSelectedSection, setMdSelectedSection] = useState(''); + const [mdStreaming, setMdStreaming] = useState(false); + const [mdStreamingContent, setMdStreamingContent] = useState(''); + // 解析选项 const [parseOptions, setParseOptions] = useState({ parseAllSheets: false, @@ -144,6 +157,9 @@ const Documents: React.FC = () => { setAiAnalysis(null); setAnalysisCharts(null); setExpandedSheet(null); + setMdAnalysis(null); + setMdSections([]); + setMdStreamingContent(''); const ext = file.name.split('.').pop()?.toLowerCase(); @@ -163,6 +179,9 @@ const Documents: React.FC = () => { } else { toast.error(result.error || '解析失败'); } + } else if (ext === 'md' || ext === 'markdown') { + // Markdown 文件:获取大纲 + await fetchMdOutline(); } else { // 其他文档使用通用上传接口 const result = await backendApi.uploadDocument(file); @@ -403,6 +422,105 @@ const Documents: React.FC = () => { } }; + const isMarkdownFile = (filename: string) => { + const ext = filename.split('.').pop()?.toLowerCase(); + return ext === 'md' || ext === 'markdown'; + }; + + // Markdown AI 分析处理 + const handleMdAnalyze = async () => { + if (!uploadedFile || !isMarkdownFile(uploadedFile.name)) { + toast.error('请先上传 Markdown 文件'); + return; + } + + setAnalyzing(true); + setMdAnalysis(null); + + try { + const result = await aiApi.analyzeMarkdown(uploadedFile, { + analysisType: mdAnalysisType, + userPrompt: mdUserPrompt, + sectionNumber: mdSelectedSection || undefined + }); + + if (result.success) { + toast.success('Markdown AI 分析完成'); + setMdAnalysis(result); + } else { + toast.error(result.error || 'AI 分析失败'); + } + } catch (error: any) { + toast.error(error.message || 'AI 分析失败'); + } finally { + setAnalyzing(false); + } + }; + + // 流式分析 Markdown + const handleMdAnalyzeStream = async () => { + if (!uploadedFile || !isMarkdownFile(uploadedFile.name)) { + toast.error('请先上传 Markdown 文件'); + return; + } + + setAnalyzing(true); + setMdStreaming(true); + setMdStreamingContent(''); + setMdAnalysis(null); + + try { + await aiApi.analyzeMarkdownStream( + uploadedFile, + { + analysisType: mdAnalysisType, + userPrompt: mdUserPrompt, + sectionNumber: mdSelectedSection || undefined + }, + (chunk: { type: string; delta?: string; error?: string }) => { + if (chunk.type === 'content' && chunk.delta) { + setMdStreamingContent(prev => prev + chunk.delta); + } else if (chunk.type === 'error') { + toast.error(chunk.error || '流式分析出错'); + } + } + ); + } catch (error: any) { + toast.error(error.message || 'AI 分析失败'); + } finally { + setAnalyzing(false); + setMdStreaming(false); + } + }; + + // 获取 Markdown 文档大纲(分章节) + const fetchMdOutline = async () => { + if (!uploadedFile || !isMarkdownFile(uploadedFile.name)) return; + + try { + const result = await aiApi.getMarkdownOutline(uploadedFile); + if (result.success && result.outline) { + setMdSections(result.outline); + } + } catch (error) { + console.error('获取大纲失败:', error); + } + }; + + const getMdAnalysisIcon = (type: string) => { + switch (type) { + case 'summary': return ; + case 'outline': return ; + case 'key_points': return ; + case 'statistics': return ; + case 'section': return ; + case 'questions': return ; + case 'tags': return ; + case 'qa': return ; + default: return ; + } + }; + const formatFileSize = (bytes: number): string => { if (bytes === 0) return '0 B'; const k = 1024; @@ -600,6 +718,97 @@ const Documents: React.FC = () => { )} + {/* Markdown AI 分析选项 */} + {uploadedFile && isMarkdownFile(uploadedFile.name) && ( + + + + + Markdown AI 分析 + + + + {/* 章节选择 */} + {mdSections.length > 0 && ( +
+ + +
+ )} +
+ + +
+
+ +