feat(ai-analyze): 新增 Markdown 文件 AI 分析功能

- 添加 Markdown 文件上传和解析接口
- 实现流式分析和大纲提取功能
- 支持多种分析类型:摘要、大纲、关键点等
- 新增 markdown_ai_service 服务类
- 扩展 LLMService 支持流式调用
- 更新前端 API 接口定义和实现
This commit is contained in:
2026-04-02 11:53:12 +08:00
parent ddf30078f0
commit d189ea9620
6 changed files with 1286 additions and 118 deletions

View File

@@ -2,10 +2,14 @@
AI 分析 API 接口
"""
from fastapi import APIRouter, UploadFile, File, HTTPException, Query, Body
from fastapi.responses import StreamingResponse
from typing import Optional
import logging
import tempfile
import os
from app.services.excel_ai_service import excel_ai_service
from app.services.markdown_ai_service import markdown_ai_service
logger = logging.getLogger(__name__)
@@ -93,10 +97,11 @@ async def get_analysis_types():
获取支持的分析类型列表
Returns:
list: 支持的分析类型
dict: 支持的分析类型(包含 Excel 和 Markdown)
"""
return {
"types": excel_ai_service.get_supported_analysis_types()
"excel_types": excel_ai_service.get_supported_analysis_types(),
"markdown_types": markdown_ai_service.get_supported_analysis_types()
}
@@ -142,3 +147,185 @@ async def analyze_text(
except Exception as e:
logger.error(f"文本分析失败: {str(e)}")
raise HTTPException(status_code=500, detail=f"分析失败: {str(e)}")
@router.post("/analyze/md")
async def analyze_markdown(
    file: UploadFile = File(...),
    analysis_type: str = Query("summary", description="分析类型: summary, outline, key_points, questions, tags, qa, statistics, section"),
    user_prompt: str = Query("", description="用户自定义提示词"),
    section_number: Optional[str] = Query(None, description="指定章节编号,如 '一' 或 '(一)'")
):
    """Upload a Markdown file and analyze it with the AI service.

    Args:
        file: uploaded .md / .markdown file
        analysis_type: one of the types reported by the service
        user_prompt: optional extra instructions forwarded to the model
        section_number: restrict analysis to a single section, e.g. "一" or "(一)"
            (the original description string here was mojibake-garbled; rewritten)

    Returns:
        dict: analysis result produced by markdown_ai_service

    Raises:
        HTTPException: 400 for a bad extension or analysis type, 500 on failure
    """
    # Validate the file name and extension before reading anything.
    if not file.filename:
        raise HTTPException(status_code=400, detail="文件名为空")
    file_ext = file.filename.split('.')[-1].lower()
    if file_ext not in ['md', 'markdown']:
        raise HTTPException(
            status_code=400,
            detail=f"不支持的文件类型: {file_ext},仅支持 .md 和 .markdown"
        )
    # Reject unknown analysis types early, listing the valid ones.
    supported_types = markdown_ai_service.get_supported_analysis_types()
    if analysis_type not in supported_types:
        raise HTTPException(
            status_code=400,
            detail=f"不支持的分析类型: {analysis_type},支持的类型: {', '.join(supported_types)}"
        )
    try:
        content = await file.read()
        # The service API takes a path, so spill the upload to a temp file.
        with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as tmp:
            tmp.write(content)
            tmp_path = tmp.name
        try:
            logger.info(f"开始分析 Markdown 文件: {file.filename}, 分析类型: {analysis_type}, 章节: {section_number}")
            result = await markdown_ai_service.analyze_markdown(
                file_path=tmp_path,
                analysis_type=analysis_type,
                user_prompt=user_prompt,
                section_number=section_number
            )
            logger.info(f"Markdown 分析完成: {file.filename}, 成功: {result['success']}")
            if not result['success']:
                raise HTTPException(status_code=500, detail=result.get('error', '分析失败'))
            return result
        finally:
            # The analysis above is fully awaited, so the temp file can be
            # removed here (unlike the streaming endpoint below).
            if os.path.exists(tmp_path):
                os.unlink(tmp_path)
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Markdown AI 分析过程中出错: {str(e)}")
        raise HTTPException(status_code=500, detail=f"分析失败: {str(e)}")
@router.post("/analyze/md/stream")
async def analyze_markdown_stream(
    file: UploadFile = File(...),
    analysis_type: str = Query("summary", description="分析类型"),
    user_prompt: str = Query("", description="用户自定义提示词"),
    section_number: Optional[str] = Query(None, description="指定章节编号")
):
    """Stream AI analysis of an uploaded Markdown file as Server-Sent Events.

    Returns:
        StreamingResponse: SSE stream of analysis chunks

    Raises:
        HTTPException: 400 for a bad extension, 500 when setup fails
    """
    if not file.filename:
        raise HTTPException(status_code=400, detail="文件名为空")
    file_ext = file.filename.split('.')[-1].lower()
    if file_ext not in ['md', 'markdown']:
        raise HTTPException(
            status_code=400,
            detail=f"不支持的文件类型: {file_ext},仅支持 .md 和 .markdown"
        )
    tmp_path = None
    try:
        content = await file.read()
        with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as tmp:
            tmp.write(content)
            tmp_path = tmp.name
        logger.info(f"开始流式分析 Markdown 文件: {file.filename}, 分析类型: {analysis_type}")

        async def stream_generator():
            # BUGFIX: the temp file must be deleted HERE, not in the
            # endpoint's own finally block. The response body is consumed
            # after this function returns, so the previous code unlinked the
            # file before the service ever read it.
            try:
                async for chunk in markdown_ai_service.analyze_markdown_stream(
                    file_path=tmp_path,
                    analysis_type=analysis_type,
                    user_prompt=user_prompt,
                    section_number=section_number
                ):
                    yield chunk
            finally:
                if os.path.exists(tmp_path):
                    os.unlink(tmp_path)

        return StreamingResponse(
            stream_generator(),
            media_type="text/event-stream",
            headers={
                "Cache-Control": "no-cache",
                "Connection": "keep-alive",
                # disable proxy buffering so chunks reach the client promptly
                "X-Accel-Buffering": "no"
            }
        )
    except HTTPException:
        raise
    except Exception as e:
        # Setup failed before the generator took ownership of the temp file.
        if tmp_path and os.path.exists(tmp_path):
            os.unlink(tmp_path)
        logger.error(f"Markdown AI 流式分析出错: {str(e)}")
        raise HTTPException(status_code=500, detail=f"流式分析失败: {str(e)}")
# BUGFIX: was @router.get — a GET endpoint cannot carry a multipart file
# body (and the browser fetch() API refuses to send one), so this route was
# uncallable as declared. File uploads must use POST.
@router.post("/analyze/md/outline")
async def get_markdown_outline(
    file: UploadFile = File(...)
):
    """Return the section outline of an uploaded Markdown document.

    Args:
        file: uploaded .md / .markdown file

    Returns:
        dict: outline structure from markdown_ai_service.extract_outline

    Raises:
        HTTPException: 400 for a bad extension, 500 on failure
    """
    if not file.filename:
        raise HTTPException(status_code=400, detail="文件名为空")
    file_ext = file.filename.split('.')[-1].lower()
    if file_ext not in ['md', 'markdown']:
        raise HTTPException(
            status_code=400,
            detail=f"不支持的文件类型: {file_ext},仅支持 .md 和 .markdown"
        )
    try:
        content = await file.read()
        with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as tmp:
            tmp.write(content)
            tmp_path = tmp.name
        try:
            result = await markdown_ai_service.extract_outline(tmp_path)
            return result
        finally:
            # extract_outline is awaited above, so cleanup here is safe.
            if os.path.exists(tmp_path):
                os.unlink(tmp_path)
    except Exception as e:
        logger.error(f"获取 Markdown 大纲失败: {str(e)}")
        raise HTTPException(status_code=500, detail=f"获取大纲失败: {str(e)}")

View File

@@ -2,7 +2,7 @@
LLM 服务模块 - 封装大模型 API 调用
"""
import logging
from typing import Dict, Any, List, Optional
from typing import Dict, Any, List, Optional, AsyncGenerator
import httpx
from app.config import settings
@@ -87,6 +87,71 @@ class LLMService:
logger.error(f"解析 API 响应失败: {str(e)}")
raise
async def chat_stream(
    self,
    messages: List[Dict[str, str]],
    temperature: float = 0.7,
    max_tokens: Optional[int] = None,
    **kwargs
) -> AsyncGenerator[Dict[str, Any], None]:
    """Call the chat-completions API with stream=True and yield deltas.

    Args:
        messages: OpenAI-style message list
        temperature: sampling temperature
        max_tokens: optional completion-length cap
        **kwargs: extra fields merged into the request payload

    Yields:
        Dict[str, Any]: {"content": <delta text>} for each non-empty delta

    Raises:
        httpx.HTTPStatusError: when the API responds with an error status
        Exception: other transport/parse failures (logged, re-raised)
    """
    import json  # local import: the module may not import json at top level

    headers = {
        "Authorization": f"Bearer {self.api_key}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": self.model_name,
        "messages": messages,
        "temperature": temperature,
        "stream": True
    }
    if max_tokens:
        payload["max_tokens"] = max_tokens
    payload.update(kwargs)
    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            async with client.stream(
                "POST",
                f"{self.base_url}/chat/completions",
                headers=headers,
                json=payload
            ) as response:
                # BUGFIX: without this, HTTP errors were silently swallowed —
                # the error body contains no "data: " lines, so the stream
                # just ended and the HTTPStatusError handler below was dead.
                response.raise_for_status()
                async for line in response.aiter_lines():
                    if line.startswith("data: "):
                        data = line[6:]  # strip the "data: " prefix
                        if data == "[DONE]":
                            break
                        try:
                            chunk = json.loads(data)
                            delta = chunk.get("choices", [{}])[0].get("delta", {}).get("content", "")
                            if delta:
                                yield {"content": delta}
                        except json.JSONDecodeError:
                            # tolerate keep-alive / partial frames
                            continue
    except httpx.HTTPStatusError as e:
        logger.error(f"LLM 流式 API 请求失败: {e.response.status_code}")
        raise
    except Exception as e:
        logger.error(f"LLM 流式 API 调用异常: {str(e)}")
        raise
async def analyze_excel_data(
self,
excel_data: Dict[str, Any],

View File

@@ -0,0 +1,591 @@
"""
Markdown 文档 AI 分析服务
支持:
- 分章节解析(中文章节编号:一、二、三, (一)(二)(三))
- 结构化数据提取
- 流式输出
- 多种分析类型
"""
import asyncio
import json
import logging
import re
from typing import Any, AsyncGenerator, Dict, List, Optional
from app.services.llm_service import llm_service
from app.core.document_parser import MarkdownParser
logger = logging.getLogger(__name__)
class MarkdownSection:
    """A node in a parsed Markdown document's section tree."""

    def __init__(self, number: str, title: str, level: int, content: str, line_start: int, line_end: int):
        self.number = number          # section label, e.g. "一", "(一)", "1"
        self.title = title            # heading text without the label
        self.level = level            # nesting depth (1 = top level)
        self.content = content        # body text excluding nested headings
        self.line_start = line_start  # 1-based line of the heading
        self.line_end = line_end      # 1-based last line covered
        self.subsections: List["MarkdownSection"] = []

    def to_dict(self) -> Dict[str, Any]:
        """Serialize recursively; content is truncated to a 200-char preview."""
        return {
            "number": self.number,
            "title": self.title,
            "level": self.level,
            "content_preview": self.content[:200] + "..." if len(self.content) > 200 else self.content,
            "line_start": self.line_start,
            "line_end": self.line_end,
            "subsections": [s.to_dict() for s in self.subsections]
        }


class MarkdownAIService:
    """AI analysis service for Markdown documents.

    Supports:
    - section parsing with Chinese-style numbering ("一、", "(一)", "1. ")
    - structured outline extraction
    - streaming (SSE) analysis output
    - multiple analysis types (summary, outline, key points, ...)
    """

    # NOTE(review): these two constants arrived mojibake-emptied ("" values);
    # restored to the obvious Chinese numerals/suffix. They are not referenced
    # anywhere in this class — confirm intent before relying on them.
    CHINESE_NUMBERS = ["一", "二", "三", "四", "五", "六", "七", "八", "九", "十"]
    CHINESE_SUFFIX = "、"
    # Level-2 heading: "(一)标题". BUGFIX: the original pattern had lost its
    # parentheses and matched ANY line beginning with a Chinese numeral,
    # contradicting the documented "(一)(二)(三)" format. Both full-width
    # and ASCII parentheses are accepted.
    PARENTHESIS_PATTERN = re.compile(r'^[((]([一二三四五六七八九十]+)[))]\s*(.+)$')
    # Level-1 heading: "一、标题"
    CHINESE_SECTION_PATTERN = re.compile(r'^([一二三四五六七八九十]+)、\s*(.+)$')
    # Level-3 heading: "1. 标题"
    ARABIC_SECTION_PATTERN = re.compile(r'^(\d+)\.\s+(.+)$')

    def __init__(self):
        self.parser = MarkdownParser()

    def get_supported_analysis_types(self) -> list:
        """Return the list of analysis type identifiers this service accepts."""
        return [
            "summary",      # document summary
            "outline",      # outline extraction
            "key_points",   # key-point extraction
            "questions",    # comprehension questions
            "tags",         # topic tags
            "qa",           # question/answer pairs
            "statistics",   # statistical-report analysis (e.g. government bulletins)
            "section"       # in-depth single-section analysis
        ]

    def extract_sections(self, content: str, titles: List[Dict]) -> List[MarkdownSection]:
        """Build the section tree from raw document text.

        Recognized heading formats:
        - level 1: 一、二、三...
        - level 2: (一)(二)(三)...
        - level 3: 1. 2. 3. ...

        Args:
            content: full document text
            titles: parser-provided title metadata (currently informational)

        Returns:
            List[MarkdownSection]: top-level sections with nested subsections
        """
        sections = []
        lines = content.split('\n')
        # Map heading line numbers to title metadata from the parser.
        title_lines = {}
        for t in titles:
            title_lines[t.get('line', 0)] = t
        current_section = None
        section_stack = []
        for i, line in enumerate(lines, 1):
            stripped = line.strip()
            # Level-1 heading (Chinese numeral + 、)
            match = self.CHINESE_SECTION_PATTERN.match(stripped)
            if match:
                # Close the previous level-1 section.
                if current_section:
                    current_section.content = self._get_section_content(
                        lines, current_section.line_start, i - 1
                    )
                current_section = MarkdownSection(
                    number=match.group(1),
                    title=match.group(2),
                    level=1,
                    content="",
                    line_start=i,
                    line_end=len(lines)
                )
                sections.append(current_section)
                section_stack = [current_section]
                continue
            # Level-2 heading ((一)(二)...)
            match = self.PARENTHESIS_PATTERN.match(stripped)
            if match and current_section:
                # Close whatever subsection was open until here.
                if section_stack and len(section_stack) > 1:
                    parent = section_stack[-1]
                    parent.content = self._get_section_content(
                        lines, parent.line_start, i - 1
                    )
                subsection = MarkdownSection(
                    number=match.group(1),
                    title=match.group(2),
                    level=2,
                    content="",
                    line_start=i,
                    line_end=len(lines)
                )
                current_section.subsections.append(subsection)
                section_stack = [current_section, subsection]
                continue
            # Level-3 heading (1. 2. 3.)
            match = self.ARABIC_SECTION_PATTERN.match(stripped)
            if match and len(section_stack) > 1:
                # BUGFIX: pop the finished level-3 sibling BEFORE attaching the
                # new one. The original appended to the previous sibling, which
                # nested "2." inside "1." instead of making them siblings under
                # their level-2 parent.
                if len(section_stack) > 2:
                    prev = section_stack.pop()
                    prev.content = self._get_section_content(
                        lines, prev.line_start, i - 1
                    )
                sub_subsection = MarkdownSection(
                    number=match.group(1),
                    title=match.group(2),
                    level=3,
                    content="",
                    line_start=i,
                    line_end=len(lines)
                )
                section_stack[-1].subsections.append(sub_subsection)
                section_stack.append(sub_subsection)
                continue
        # Close the final open level-1 section.
        if current_section:
            current_section.content = self._get_section_content(
                lines, current_section.line_start, len(lines)
            )
        return sections

    def _get_section_content(self, lines: List[str], start: int, end: int) -> str:
        """Return the text of lines [start, end] (1-based, inclusive),
        dropping blank lines and any recognized heading lines."""
        if start > end:
            return ""
        content_lines = lines[start-1:end]
        cleaned = []
        for line in content_lines:
            stripped = line.strip()
            if not stripped:
                continue
            # Skip section heading lines at any level.
            if self.CHINESE_SECTION_PATTERN.match(stripped):
                continue
            if self.PARENTHESIS_PATTERN.match(stripped):
                continue
            if self.ARABIC_SECTION_PATTERN.match(stripped):
                continue
            cleaned.append(stripped)
        return '\n'.join(cleaned)

    async def analyze_markdown(
        self,
        file_path: str,
        analysis_type: str = "summary",
        user_prompt: str = "",
        section_number: Optional[str] = None
    ) -> Dict[str, Any]:
        """Analyze a Markdown document with the LLM.

        Args:
            file_path: path to the .md file on disk
            analysis_type: one of get_supported_analysis_types()
            user_prompt: extra user instructions appended to the prompt
            section_number: analyze only the section labeled e.g. "一" or "(一)"

        Returns:
            dict: {"success": True, ...payload} or {"success": False, "error": ...}
        """
        try:
            parse_result = self.parser.parse(file_path)
            if not parse_result.success:
                return {
                    "success": False,
                    "error": parse_result.error
                }
            data = parse_result.data
            # Build the section tree once; used for targeting and the summary.
            sections = self.extract_sections(data.get("content", ""), data.get("titles", []))
            # Default to whole-document analysis.
            target_content = data.get("content", "")
            target_title = parse_result.metadata.get("filename", "")
            if section_number:
                section = self._find_section(sections, section_number)
                if section:
                    target_content = section.content
                    target_title = f"{section.number}{section.title}"
                else:
                    return {
                        "success": False,
                        "error": f"未找到章节: {section_number}"
                    }
            prompt = self._build_prompt(
                content=target_content,
                analysis_type=analysis_type,
                user_prompt=user_prompt,
                title=target_title
            )
            messages = [
                {"role": "system", "content": self._get_system_prompt(analysis_type)},
                {"role": "user", "content": prompt}
            ]
            # Low temperature: analysis should be stable, not creative.
            response = await llm_service.chat(
                messages=messages,
                temperature=0.3,
                max_tokens=4000
            )
            analysis = llm_service.extract_message_content(response)
            return {
                "success": True,
                "filename": parse_result.metadata.get("filename", ""),
                "analysis_type": analysis_type,
                "section": target_title if section_number else None,
                "word_count": len(target_content),
                "structure": {
                    "title_count": parse_result.metadata.get("title_count", 0),
                    "code_block_count": parse_result.metadata.get("code_block_count", 0),
                    "table_count": parse_result.metadata.get("table_count", 0),
                    "section_count": len(sections)
                },
                "sections": [s.to_dict() for s in sections[:10]],  # cap at 10 top-level sections
                "analysis": analysis
            }
        except Exception as e:
            logger.error(f"Markdown AI 分析失败: {str(e)}")
            return {
                "success": False,
                "error": str(e)
            }

    async def analyze_markdown_stream(
        self,
        file_path: str,
        analysis_type: str = "summary",
        user_prompt: str = "",
        section_number: Optional[str] = None
    ) -> AsyncGenerator[str, None]:
        """Analyze a Markdown document and yield SSE-formatted chunks.

        Yields:
            str: "data: {...}\\n\\n" frames — a 'start' event, 'content'
            deltas, then a 'done' event (or an 'error' event).
        """
        try:
            parse_result = self.parser.parse(file_path)
            if not parse_result.success:
                yield f"data: {json.dumps({'error': parse_result.error}, ensure_ascii=False)}\n\n"
                return
            data = parse_result.data
            sections = self.extract_sections(data.get("content", ""), data.get("titles", []))
            target_content = data.get("content", "")
            target_title = parse_result.metadata.get("filename", "")
            if section_number:
                section = self._find_section(sections, section_number)
                if section:
                    target_content = section.content
                    target_title = f"{section.number}{section.title}"
                else:
                    yield f"data: {json.dumps({'error': f'未找到章节: {section_number}'}, ensure_ascii=False)}\n\n"
                    return
            prompt = self._build_prompt(
                content=target_content,
                analysis_type=analysis_type,
                user_prompt=user_prompt,
                title=target_title
            )
            messages = [
                {"role": "system", "content": self._get_system_prompt(analysis_type)},
                {"role": "user", "content": prompt}
            ]
            # Initial metadata event. BUGFIX: built as a plain dict first —
            # the original inlined a multi-line dict with nested double quotes
            # inside the f-string, which is a SyntaxError before Python 3.12.
            start_event = {
                "type": "start",
                "filename": parse_result.metadata.get("filename", ""),
                "analysis_type": analysis_type,
                "section": target_title if section_number else None,
                "word_count": len(target_content)
            }
            yield f"data: {json.dumps(start_event, ensure_ascii=False)}\n\n"
            # Relay streaming deltas from the LLM.
            full_response = ""
            async for chunk in llm_service.chat_stream(messages, temperature=0.3, max_tokens=4000):
                content = chunk.get("content", "")
                if content:
                    full_response += content
                    yield f"data: {json.dumps({'type': 'content', 'delta': content}, ensure_ascii=False)}\n\n"
            # Final event carries the accumulated text.
            yield f"data: {json.dumps({'type': 'done', 'full_response': full_response}, ensure_ascii=False)}\n\n"
        except Exception as e:
            logger.error(f"Markdown AI 流式分析失败: {str(e)}")
            yield f"data: {json.dumps({'error': str(e)}, ensure_ascii=False)}\n\n"

    def _find_section(self, sections: List[MarkdownSection], number: str) -> Optional[MarkdownSection]:
        """Depth-first lookup of a section by its label or exact title."""
        num = number.strip()
        for section in sections:
            if section.number == num or section.title == num:
                return section
            # Recurse into subsections.
            found = self._find_section(section.subsections, number)
            if found:
                return found
        return None

    def _get_system_prompt(self, analysis_type: str) -> str:
        """Return the system prompt matching the analysis type."""
        prompts = {
            "summary": "你是一个专业的文档摘要助手,擅长从长文档中提取核心信息。",
            "outline": "你是一个专业的文档结构分析助手,擅长提取文档大纲和层级结构。",
            "key_points": "你是一个专业的知识提取助手,擅长从文档中提取关键信息和要点。",
            "questions": "你是一个专业的教育助手,擅长生成帮助理解文档的问题。",
            "tags": "你是一个专业的标签生成助手,擅长提取文档的主题标签。",
            "qa": "你是一个专业的问答助手,擅长基于文档内容生成问答对。",
            "statistics": "你是一个专业的统计数据分析助手,擅长分析政府统计公报中的数据。",
            "section": "你是一个专业的章节分析助手,擅长对文档的特定章节进行深入分析。"
        }
        return prompts.get(analysis_type, "你是一个专业的文档分析助手。")

    def _build_prompt(
        self,
        content: str,
        analysis_type: str,
        user_prompt: str,
        title: str = ""
    ) -> str:
        """Compose the user prompt for the given analysis type."""
        # Truncate to keep the request within the model's context budget.
        max_content_len = 6000
        if len(content) > max_content_len:
            content = content[:max_content_len] + "\n\n[内容已截断...]"
        base_prompts = {
            "summary": f"""请对以下文档进行摘要分析:
文档标题:{title}
文档内容:
{content}
请提供:
1. 文档主要内容摘要300字以内
2. 文档的目的和用途
3. 适合的读者群体
请用中文回答,结构清晰。""",
            "outline": f"""请提取以下文档的大纲结构:
文档标题:{title}
文档内容:
{content}
请按层级列出文档大纲,用缩进表示层级关系。
格式:
一、一级标题
(一)二级标题
1. 三级标题
请用中文回答。""",
            "key_points": f"""请从以下文档中提取关键要点:
文档标题:{title}
文档内容:
{content}
请列出文档的关键要点5-10条每条用简洁的语言描述并说明其在文档中的重要性。
请用中文回答,格式清晰。""",
            "questions": f"""请根据以下文档生成有助于理解内容的问题:
文档标题:{title}
文档内容:
{content}
请生成5-10个问题帮助读者更好地理解文档内容。每个问题应该
1. 涵盖文档的重要信息点
2. 易于理解和回答
3. 具有思考价值
请用中文回答。""",
            "tags": f"""请为以下文档生成标签:
文档标题:{title}
文档内容:
{content[:3000]}
请生成5-8个标签用逗号分隔。标签应该反映
- 文档的主题领域
- 文档的类型
- 文档的关键特征
请用中文回答,只需输出标签,不要其他内容。""",
            "qa": f"""请根据以下文档生成问答对:
文档标题:{title}
文档内容:
{content[:4000]}
请生成3-5个问答对帮助读者通过问答形式理解文档内容。
格式:
Q1: 问题
A1: 回答
Q2: 问题
A2: 回答
请用中文回答,内容准确。""",
            "statistics": f"""请分析以下政府统计公报中的数据和结论:
文档标题:{title}
文档内容:
{content}
请提供:
1. 文档中涉及的主要统计数据(列出关键数字和指标)
2. 数据的变化趋势(增长/下降)
3. 重要的百分比和对比
4. 数据来源和统计口径说明
请用中文回答,数据准确。""",
            "section": f"""请详细分析以下文档章节:
章节标题:{title}
章节内容:
{content}
请提供:
1. 章节主要内容概括
2. 关键信息和数据
3. 与其他部分的关联(如有)
4. 重要结论
请用中文回答,分析深入。"""
        }
        prompt = base_prompts.get(analysis_type, base_prompts["summary"])
        if user_prompt and user_prompt.strip():
            prompt += f"\n\n用户额外需求:{user_prompt}"
        return prompt

    async def extract_outline(self, file_path: str) -> Dict[str, Any]:
        """Parse the file and return its section outline as plain dicts."""
        try:
            parse_result = self.parser.parse(file_path)
            if not parse_result.success:
                return {"success": False, "error": parse_result.error}
            data = parse_result.data
            sections = self.extract_sections(data.get("content", ""), data.get("titles", []))
            # Flatten to a JSON-friendly structure (subsections one level deep).
            outline = []
            for section in sections:
                outline.append({
                    "number": section.number,
                    "title": section.title,
                    "level": section.level,
                    "line": section.line_start,
                    "content_preview": section.content[:100] + "..." if len(section.content) > 100 else section.content,
                    "subsections": [{
                        "number": s.number,
                        "title": s.title,
                        "level": s.level,
                        "line": s.line_start
                    } for s in section.subsections]
                })
            return {
                "success": True,
                "outline": outline
            }
        except Exception as e:
            logger.error(f"大纲提取失败: {str(e)}")
            return {"success": False, "error": str(e)}

    async def extract_tables_summary(self, file_path: str) -> Dict[str, Any]:
        """Extract and summarize the tables found in the document."""
        try:
            parse_result = self.parser.parse(file_path)
            if not parse_result.success:
                return {"success": False, "error": parse_result.error}
            tables = parse_result.data.get("tables", [])
            if not tables:
                return {"success": True, "tables": [], "message": "文档中没有表格"}
            # Summarize each table: headers, shape, and a small preview.
            table_summaries = []
            for i, table in enumerate(tables):
                summary = {
                    "index": i + 1,
                    "headers": table.get("headers", []),
                    "row_count": table.get("row_count", 0),
                    "column_count": table.get("column_count", 0),
                    "preview_rows": table.get("rows", [])[:3],  # first 3 rows only
                    "first_column": [row[0] if row else "" for row in table.get("rows", [])[:5]]
                }
                table_summaries.append(summary)
            return {
                "success": True,
                "tables": table_summaries,
                "table_count": len(tables)
            }
        except Exception as e:
            logger.error(f"表格提取失败: {str(e)}")
            return {"success": False, "error": str(e)}
# Module-level singleton shared by the API layer.
markdown_ai_service = MarkdownAIService()

View File

@@ -1,113 +0,0 @@
✅ Excel 文件解析功能已完成并测试通过
已完成的工作
后端部分
1. 文件服务层 (backend/app/services/file_service.py)
- 文件保存、读取、删除功能
- 文件信息获取
2. Excel 解析模块 (backend/app/core/document_parser/)
- base.py - 解析器基类
- xlsx_parser.py - Excel 文件解析器
- utils.py - 工具函数
3. API 接口 (backend/app/api/endpoints/upload.py)
- POST /upload/excel - 上传并解析 Excel 文件
- GET /upload/excel/preview/{file_path} - 获取预览
- DELETE /upload/file - 删除文件
4. 主应用配置 (backend/app/main.py)
- CORS 中间件配置
- API 路由注册
前端部分
1. 后端 API 封装 (frontendTest/app-a6ww9j3ja3nl/src/db/backend-api.ts)
- Excel 上传、预览、删除函数
2. Excel 解析页面 (frontendTest/app-a6ww9j3ja3nl/src/pages/ExcelParse.tsx)
- 拖拽上传界面
- 解析选项配置
- 数据表格展示
- 多工作表支持
3. 路由配置 (frontendTest/app-a6ww9j3ja3nl/src/routes.tsx)
- 添加 /excel-parse 路由
4. 环境配置 (.env)
- 后端 API URL 配置
测试结果
后端 API 测试通过:
- ✅ 健康检查http://127.0.0.1:8000/health
- ✅ Excel 上传解析:成功解析 5 行 6 列的测试数据
访问方式
1. 后端 API 文档http://127.0.0.1:8000/api/v1/docs
2. 前端页面:
- 启动前端cd frontendTest/app-a6ww9j3ja3nl && npm run dev
- 访问 Excel 解析页面:/excel-parse
下一步建议
1. 启动前端开发服务器测试完整的前端界面
2. 添加更多文档格式解析支持docx, md, txt
3. 完善错误处理和用户提示
✻ Worked for 4m 18s · 1 background task still running (↓ to manage)
> 如何启动后端呢
● 后端启动步骤如下:
方法一:使用命令行启动(开发模式)
# 1. 进入后端目录
cd H:/OwnProject/FilesReadSysteam/backend
# 2. 激活虚拟环境
# PowerShell:
.\venv\Scripts\Activate.ps1
# 或 CMD:
.\venv\Scripts\Activate.bat
# 3. 启动后端服务器
python -m uvicorn app.main:app --host 127.0.0.1 --port 8000 --reload
方法二:直接使用虚拟环境中的 Python
cd H:/OwnProject/FilesReadSysteam/backend
./venv/Scripts/python.exe -m uvicorn app.main:app --host 127.0.0.1 --port 8000 --reload
方法三:直接运行 main.py开发调试
cd H:/OwnProject/FilesReadSysteam/backend
./venv/Scripts/python.exe app/main.py
启动成功的标志
INFO: Uvicorn running on http://127.0.0.1:8000
INFO: Started server process [xxxxx]
INFO: Waiting for application startup.
INFO: Application startup complete.
验证服务是否正常
# 健康检查
curl http://127.0.0.1:8000/health
# 访问 API 文档
# 浏览器打开: http://127.0.0.1:8000/api/v1/docs
---
当前状态:后端已在后台运行(任务 ID: b22jkg69j可以直接访问 http://127.0.0.1:8000
需要停止的话告诉我即可。

View File

@@ -166,6 +166,43 @@ export interface AIAnalysisResult {
error?: string;
}
// ==================== Markdown AI analysis types ====================

/** Response shape of POST /ai/analyze/md (mirrors the backend result dict). */
export interface AIMarkdownAnalyzeResult {
  success: boolean;
  filename?: string;
  analysis_type?: string;
  /** Section label when a single section was analyzed, e.g. "一、总体". */
  section?: string;
  /** Character count of the analyzed content. */
  word_count?: number;
  /** Structural statistics reported by the backend parser. */
  structure?: {
    title_count?: number;
    code_block_count?: number;
    table_count?: number;
    section_count?: number;
  };
  /** Up to 10 top-level sections with nested subsections. */
  sections?: MarkdownSection[];
  /** The model's analysis text (Markdown). */
  analysis?: string;
  error?: string;
}

/** One node of the backend's section tree. */
export interface MarkdownSection {
  /** Section label, e.g. "一", "(一)", "1". */
  number: string;
  title: string;
  /** Nesting depth: 1 = top level. */
  level: number;
  content_preview?: string;
  line_start: number;
  line_end?: number;
  subsections?: MarkdownSection[];
}

/** Response shape of the /ai/analyze/md/outline endpoint. */
export interface MarkdownOutlineResult {
  success: boolean;
  outline?: MarkdownSection[];
  error?: string;
}

/** Analysis types accepted by the backend Markdown AI service. */
export type MarkdownAnalysisType = 'summary' | 'outline' | 'key_points' | 'questions' | 'tags' | 'qa' | 'statistics' | 'section';
export interface AIExcelAnalyzeResult {
success: boolean;
excel?: {
@@ -842,6 +879,159 @@ export const aiApi = {
}
},
/**
* 上传并使用 AI 分析 Markdown 文件
*/
async analyzeMarkdown(
file: File,
options: {
analysisType?: MarkdownAnalysisType;
userPrompt?: string;
sectionNumber?: string;
} = {}
): Promise<AIMarkdownAnalyzeResult> {
const formData = new FormData();
formData.append('file', file);
const params = new URLSearchParams();
if (options.analysisType) {
params.append('analysis_type', options.analysisType);
}
if (options.userPrompt) {
params.append('user_prompt', options.userPrompt);
}
if (options.sectionNumber) {
params.append('section_number', options.sectionNumber);
}
const url = `${BACKEND_BASE_URL}/ai/analyze/md?${params.toString()}`;
try {
const response = await fetch(url, {
method: 'POST',
body: formData,
});
if (!response.ok) {
const error = await response.json();
throw new Error(error.detail || 'Markdown AI 分析失败');
}
return await response.json();
} catch (error) {
console.error('Markdown AI 分析失败:', error);
throw error;
}
},
/**
 * Stream a Markdown AI analysis over SSE.
 *
 * BUGFIX: the previous reader split each network chunk on '\n' in isolation,
 * so an SSE line straddling two reads was dropped by the JSON.parse catch,
 * silently losing deltas. A carry-over buffer now holds the trailing partial
 * line, and decode(..., { stream: true }) keeps multibyte characters intact
 * across chunk boundaries.
 *
 * @param file     the Markdown file to analyze
 * @param options  analysis type / extra prompt / target section
 * @param onChunk  callback invoked for each content delta or error event
 * @returns the full accumulated response text
 */
async analyzeMarkdownStream(
  file: File,
  options: {
    analysisType?: MarkdownAnalysisType;
    userPrompt?: string;
    sectionNumber?: string;
  } = {},
  onChunk?: (chunk: { type: string; delta?: string; error?: string }) => void
): Promise<string> {
  const formData = new FormData();
  formData.append('file', file);
  const params = new URLSearchParams();
  if (options.analysisType) {
    params.append('analysis_type', options.analysisType);
  }
  if (options.userPrompt) {
    params.append('user_prompt', options.userPrompt);
  }
  if (options.sectionNumber) {
    params.append('section_number', options.sectionNumber);
  }
  const url = `${BACKEND_BASE_URL}/ai/analyze/md/stream?${params.toString()}`;
  try {
    const response = await fetch(url, {
      method: 'POST',
      body: formData,
    });
    if (!response.ok) {
      const error = await response.json();
      throw new Error(error.detail || 'Markdown AI 流式分析失败');
    }
    const reader = response.body?.getReader();
    if (!reader) throw new Error('无法读取响应流');
    const decoder = new TextDecoder();
    let buffer = '';
    let fullResponse = '';
    while (true) {
      const { done, value } = await reader.read();
      if (done) break;
      // Append to the carry-over buffer and only consume complete lines.
      buffer += decoder.decode(value, { stream: true });
      const lines = buffer.split('\n');
      buffer = lines.pop() ?? '';
      for (const line of lines) {
        if (line.startsWith('data: ')) {
          const data = line.slice(6);
          if (data === '[DONE]') continue;
          try {
            const parsed = JSON.parse(data);
            if (parsed.type === 'content' && parsed.delta) {
              fullResponse += parsed.delta;
              onChunk?.({ type: 'content', delta: parsed.delta });
            } else if (parsed.type === 'done') {
              fullResponse = parsed.full_response || fullResponse;
            } else if (parsed.error) {
              onChunk?.({ type: 'error', error: parsed.error });
            }
          } catch {
            // Ignore frames that are not valid JSON (e.g. keep-alives)
          }
        }
      }
    }
    return fullResponse;
  } catch (error) {
    console.error('Markdown AI 流式分析失败:', error);
    throw error;
  }
},
/**
 * Fetch the section outline of a Markdown document.
 *
 * BUGFIX: this previously issued a GET with a FormData body — the Fetch
 * specification forbids a body on GET/HEAD, so fetch() threw a TypeError
 * before any request was sent. File uploads must use POST (the backend
 * route is updated to match).
 */
async getMarkdownOutline(file: File): Promise<MarkdownOutlineResult> {
  const formData = new FormData();
  formData.append('file', file);
  const url = `${BACKEND_BASE_URL}/ai/analyze/md/outline`;
  try {
    const response = await fetch(url, {
      method: 'POST',
      body: formData,
    });
    if (!response.ok) {
      const error = await response.json();
      throw new Error(error.detail || '获取 Markdown 大纲失败');
    }
    return await response.json();
  } catch (error) {
    console.error('获取 Markdown 大纲失败:', error);
    throw error;
  }
},
/**
* 生成统计信息和图表
*/

View File

@@ -19,7 +19,11 @@ import {
TrendingUp,
Download,
Brain,
Settings2
Settings2,
List,
MessageSquareCode,
Tag,
HelpCircle
} from 'lucide-react';
import { Button } from '@/components/ui/button';
import { Input } from '@/components/ui/input';
@@ -33,7 +37,7 @@ import { Checkbox } from '@/components/ui/checkbox';
import { toast } from 'sonner';
import { cn } from '@/lib/utils';
import { Skeleton } from '@/components/ui/skeleton';
import { backendApi, type ExcelParseResult, aiApi } from '@/db/backend-api';
import { backendApi, type ExcelParseResult, type AIMarkdownAnalyzeResult, type MarkdownSection, aiApi } from '@/db/backend-api';
import {
Table as TableComponent,
TableBody,
@@ -78,6 +82,15 @@ const Documents: React.FC = () => {
const [analysisCharts, setAnalysisCharts] = useState<any>(null);
const [analysisTypes, setAnalysisTypes] = useState<Array<{ value: string; label: string; description: string }>>([]);
// Markdown AI 分析相关状态
const [mdAnalysis, setMdAnalysis] = useState<AIMarkdownAnalyzeResult | null>(null);
const [mdAnalysisType, setMdAnalysisType] = useState<'summary' | 'outline' | 'key_points' | 'questions' | 'tags' | 'qa' | 'statistics' | 'section'>('summary');
const [mdUserPrompt, setMdUserPrompt] = useState('');
const [mdSections, setMdSections] = useState<MarkdownSection[]>([]);
const [mdSelectedSection, setMdSelectedSection] = useState<string>('');
const [mdStreaming, setMdStreaming] = useState(false);
const [mdStreamingContent, setMdStreamingContent] = useState('');
// 解析选项
const [parseOptions, setParseOptions] = useState({
parseAllSheets: false,
@@ -144,6 +157,9 @@ const Documents: React.FC = () => {
setAiAnalysis(null);
setAnalysisCharts(null);
setExpandedSheet(null);
setMdAnalysis(null);
setMdSections([]);
setMdStreamingContent('');
const ext = file.name.split('.').pop()?.toLowerCase();
@@ -163,6 +179,9 @@ const Documents: React.FC = () => {
} else {
toast.error(result.error || '解析失败');
}
} else if (ext === 'md' || ext === 'markdown') {
// Markdown 文件:获取大纲
await fetchMdOutline();
} else {
// 其他文档使用通用上传接口
const result = await backendApi.uploadDocument(file);
@@ -403,6 +422,105 @@ const Documents: React.FC = () => {
}
};
// True when the file extension marks a Markdown document.
const isMarkdownFile = (filename: string) => {
  return ['md', 'markdown'].includes(filename.split('.').pop()?.toLowerCase() ?? '');
};
// Run a one-shot (non-streaming) Markdown AI analysis of the uploaded file.
const handleMdAnalyze = async () => {
  if (!uploadedFile || !isMarkdownFile(uploadedFile.name)) {
    toast.error('请先上传 Markdown 文件');
    return;
  }
  setAnalyzing(true);
  setMdAnalysis(null);
  try {
    const result = await aiApi.analyzeMarkdown(uploadedFile, {
      analysisType: mdAnalysisType,
      userPrompt: mdUserPrompt,
      sectionNumber: mdSelectedSection || undefined
    });
    if (!result.success) {
      toast.error(result.error || 'AI 分析失败');
      return;
    }
    toast.success('Markdown AI 分析完成');
    setMdAnalysis(result);
  } catch (error: any) {
    toast.error(error.message || 'AI 分析失败');
  } finally {
    setAnalyzing(false);
  }
};
// Stream the Markdown analysis over SSE, appending deltas as they arrive.
const handleMdAnalyzeStream = async () => {
  if (!uploadedFile || !isMarkdownFile(uploadedFile.name)) {
    toast.error('请先上传 Markdown 文件');
    return;
  }
  setAnalyzing(true);
  setMdStreaming(true);
  setMdStreamingContent('');
  setMdAnalysis(null);
  // Per-event handler: append content deltas, surface error events.
  const onChunk = (chunk: { type: string; delta?: string; error?: string }) => {
    if (chunk.type === 'content' && chunk.delta) {
      setMdStreamingContent(prev => prev + chunk.delta);
    } else if (chunk.type === 'error') {
      toast.error(chunk.error || '流式分析出错');
    }
  };
  try {
    await aiApi.analyzeMarkdownStream(
      uploadedFile,
      {
        analysisType: mdAnalysisType,
        userPrompt: mdUserPrompt,
        sectionNumber: mdSelectedSection || undefined
      },
      onChunk
    );
  } catch (error: any) {
    toast.error(error.message || 'AI 分析失败');
  } finally {
    setAnalyzing(false);
    setMdStreaming(false);
  }
};
// Best-effort load of the uploaded Markdown file's section outline.
const fetchMdOutline = async () => {
  if (!uploadedFile || !isMarkdownFile(uploadedFile.name)) return;
  try {
    const { success, outline } = await aiApi.getMarkdownOutline(uploadedFile);
    if (success && outline) {
      setMdSections(outline);
    }
  } catch (error) {
    // Non-fatal: the outline selector simply stays empty.
    console.error('获取大纲失败:', error);
  }
};
const getMdAnalysisIcon = (type: string) => {
switch (type) {
case 'summary': return <FileText size={20} />;
case 'outline': return <List size={20} />;
case 'key_points': return <TrendingUp size={20} />;
case 'statistics': return <TrendingUp size={20} />;
case 'section': return <FileText size={20} />;
case 'questions': return <MessageSquareCode size={20} />;
case 'tags': return <Tag size={20} />;
case 'qa': return <HelpCircle size={20} />;
default: return <Sparkles size={20} />;
}
};
const formatFileSize = (bytes: number): string => {
if (bytes === 0) return '0 B';
const k = 1024;
@@ -600,6 +718,97 @@ const Documents: React.FC = () => {
</Card>
)}
{/* Markdown AI 分析选项 */}
{uploadedFile && isMarkdownFile(uploadedFile.name) && (
<Card className="border-none shadow-md bg-gradient-to-br from-purple-500/5 to-primary/5">
<CardHeader className="pb-4">
<CardTitle className="flex items-center gap-2">
<Sparkles className="text-purple-500" size={20} />
Markdown AI
</CardTitle>
</CardHeader>
<CardContent className="space-y-4">
{/* 章节选择 */}
{mdSections.length > 0 && (
<div className="space-y-2">
<Label htmlFor="md-section" className="text-sm"></Label>
<Select value={mdSelectedSection} onValueChange={setMdSelectedSection}>
<SelectTrigger id="md-section" className="bg-background">
<SelectValue placeholder="全文分析" />
</SelectTrigger>
<SelectContent>
<SelectItem value=""></SelectItem>
{mdSections.map((section) => (
<SelectItem key={section.number} value={section.number}>
{section.number}{section.title}
</SelectItem>
))}
</SelectContent>
</Select>
</div>
)}
<div className="space-y-2">
<Label htmlFor="md-analysis-type" className="text-sm"></Label>
<Select value={mdAnalysisType} onValueChange={(value: any) => setMdAnalysisType(value)}>
<SelectTrigger id="md-analysis-type" className="bg-background">
<SelectValue />
</SelectTrigger>
<SelectContent>
{[
{ value: 'summary', label: '文档摘要', desc: '主要内容摘要' },
{ value: 'outline', label: '大纲提取', desc: '提取文档结构' },
{ value: 'key_points', label: '关键要点', desc: '提取关键信息' },
{ value: 'statistics', label: '统计分析', desc: '统计数据分析' },
{ value: 'section', label: '章节分析', desc: '分章节详细分析' },
{ value: 'questions', label: '生成问题', desc: '生成理解性问题' },
{ value: 'tags', label: '生成标签', desc: '提取主题标签' },
{ value: 'qa', label: '问答对', desc: '生成问答内容' }
].map(type => (
<SelectItem key={type.value} value={type.value}>
<div className="flex items-center gap-2">
{getMdAnalysisIcon(type.value)}
<div className="flex flex-col">
<span className="font-medium">{type.label}</span>
<span className="text-xs text-muted-foreground">{type.desc}</span>
</div>
</div>
</SelectItem>
))}
</SelectContent>
</Select>
</div>
<div className="space-y-2">
<Label htmlFor="md-user-prompt" className="text-sm"></Label>
<Textarea
id="md-user-prompt"
placeholder="例如:请重点关注技术实现部分..."
value={mdUserPrompt}
onChange={(e) => setMdUserPrompt(e.target.value)}
className="bg-background resize-none"
rows={2}
/>
</div>
<div className="flex gap-2">
<Button
onClick={handleMdAnalyze}
disabled={analyzing}
className="flex-1 bg-gradient-to-r from-purple-500 to-primary hover:from-purple-500/90 hover:to-primary/90"
>
{analyzing && !mdStreaming ? <><Loader2 className="mr-2 animate-spin" size={16} /> ...</> : <><Sparkles className="mr-2" size={16} /></>}
</Button>
<Button
onClick={handleMdAnalyzeStream}
disabled={analyzing}
variant="outline"
className="flex-1"
>
{analyzing && mdStreaming ? <><Loader2 className="mr-2 animate-spin" size={16} /> ...</> : <><Sparkles className="mr-2" size={16} /></>}
</Button>
</div>
</CardContent>
</Card>
)}
{/* 数据操作 */}
{parseResult?.success && (
<Card className="border-none shadow-md bg-gradient-to-br from-emerald-500/5 to-blue-500/5">
@@ -661,6 +870,45 @@ const Documents: React.FC = () => {
</Card>
)}
{/* Markdown AI 分析结果 */}
{(mdAnalysis || mdStreamingContent) && (
<Card className="border-none shadow-md border-l-4 border-l-purple-500">
<CardHeader>
<div className="flex items-center justify-between">
<div className="space-y-1">
<CardTitle className="flex items-center gap-2">
<Sparkles className="text-purple-500" size={20} />
Markdown AI
{mdStreaming && <Badge variant="default" className="ml-2 bg-purple-500"></Badge>}
</CardTitle>
{mdAnalysis && (
<CardDescription>
{mdAnalysis.filename} {mdAnalysis.word_count || 0} {mdAnalysis.analysis_type}
{mdAnalysis.section && `${mdAnalysis.section}`}
</CardDescription>
)}
</div>
{mdAnalysis?.structure && (
<Badge variant="secondary">
{mdAnalysis.structure.title_count || 0} {mdAnalysis.structure.section_count || 0}
</Badge>
)}
</div>
</CardHeader>
<CardContent className="max-h-[500px] overflow-y-auto">
{/* 流式内容优先显示 */}
{mdStreamingContent && (
<div className="animate-pulse text-sm text-muted-foreground mb-4">
...
</div>
)}
{mdStreamingContent && <Markdown content={mdStreamingContent} />}
{mdAnalysis?.analysis && !mdStreamingContent && <Markdown content={mdAnalysis.analysis} />}
{!mdAnalysis?.success && !mdStreamingContent && <p className="text-sm text-destructive">{mdAnalysis?.error || '分析失败'}</p>}
</CardContent>
</Card>
)}
{/* 图表显示 */}
{analysisCharts && (
<Card className="border-none shadow-md border-l-4 border-l-indigo-500">