feat(ai-analyze): 新增 Markdown 文件 AI 分析功能

- 添加 Markdown 文件上传和解析接口
- 实现流式分析和大纲提取功能
- 支持多种分析类型:摘要、大纲、关键点等
- 新增 markdown_ai_service 服务类
- 扩展 LLMService 支持流式调用
- 更新前端 API 接口定义和实现
This commit is contained in:
2026-04-02 11:53:12 +08:00
parent ddf30078f0
commit d189ea9620
6 changed files with 1286 additions and 118 deletions

View File

@@ -2,10 +2,14 @@
AI 分析 API 接口 AI 分析 API 接口
""" """
from fastapi import APIRouter, UploadFile, File, HTTPException, Query, Body from fastapi import APIRouter, UploadFile, File, HTTPException, Query, Body
from fastapi.responses import StreamingResponse
from typing import Optional from typing import Optional
import logging import logging
import tempfile
import os
from app.services.excel_ai_service import excel_ai_service from app.services.excel_ai_service import excel_ai_service
from app.services.markdown_ai_service import markdown_ai_service
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -93,10 +97,11 @@ async def get_analysis_types():
获取支持的分析类型列表 获取支持的分析类型列表
Returns: Returns:
list: 支持的分析类型 dict: 支持的分析类型(包含 Excel 和 Markdown)
""" """
return { return {
"types": excel_ai_service.get_supported_analysis_types() "excel_types": excel_ai_service.get_supported_analysis_types(),
"markdown_types": markdown_ai_service.get_supported_analysis_types()
} }
@@ -142,3 +147,185 @@ async def analyze_text(
except Exception as e: except Exception as e:
logger.error(f"文本分析失败: {str(e)}") logger.error(f"文本分析失败: {str(e)}")
raise HTTPException(status_code=500, detail=f"分析失败: {str(e)}") raise HTTPException(status_code=500, detail=f"分析失败: {str(e)}")
@router.post("/analyze/md")
async def analyze_markdown(
    file: UploadFile = File(...),
    analysis_type: str = Query("summary", description="分析类型: summary, outline, key_points, questions, tags, qa, statistics, section"),
    user_prompt: str = Query("", description="用户自定义提示词"),
    section_number: Optional[str] = Query(None, description="指定章节编号,如 '''(一)'")
):
    """
    Upload a Markdown file and run an AI analysis on it.

    Args:
        file: Uploaded Markdown file (.md or .markdown only).
        analysis_type: One of the types reported by
            markdown_ai_service.get_supported_analysis_types().
        user_prompt: Optional extra instructions appended to the prompt.
        section_number: Optional section number (e.g. "一" / "(一)") that
            restricts the analysis to a single section.

    Returns:
        dict: Analysis result produced by markdown_ai_service.

    Raises:
        HTTPException: 400 for an empty filename, unsupported extension or
            unknown analysis type; 500 when the analysis itself fails.
    """
    # Validate the file extension before doing any work.
    if not file.filename:
        raise HTTPException(status_code=400, detail="文件名为空")
    file_ext = file.filename.split('.')[-1].lower()
    if file_ext not in ['md', 'markdown']:
        raise HTTPException(
            status_code=400,
            detail=f"不支持的文件类型: {file_ext},仅支持 .md 和 .markdown"
        )
    # Reject unknown analysis types early with an explicit message.
    supported_types = markdown_ai_service.get_supported_analysis_types()
    if analysis_type not in supported_types:
        raise HTTPException(
            status_code=400,
            detail=f"不支持的分析类型: {analysis_type},支持的类型: {', '.join(supported_types)}"
        )
    try:
        # Read the upload fully, then spill it to a temp file because the
        # analysis service works on file paths.
        content = await file.read()
        with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as tmp:
            tmp.write(content)
            tmp_path = tmp.name
        try:
            logger.info(f"开始分析 Markdown 文件: {file.filename}, 分析类型: {analysis_type}, 章节: {section_number}")
            # The service is awaited to completion here, so the temp file may
            # safely be removed in the finally block below.
            result = await markdown_ai_service.analyze_markdown(
                file_path=tmp_path,
                analysis_type=analysis_type,
                user_prompt=user_prompt,
                section_number=section_number
            )
            logger.info(f"Markdown 分析完成: {file.filename}, 成功: {result['success']}")
            if not result['success']:
                raise HTTPException(status_code=500, detail=result.get('error', '分析失败'))
            return result
        finally:
            # Always remove the temp file, even on failure.
            if os.path.exists(tmp_path):
                os.unlink(tmp_path)
    except HTTPException:
        # Re-raise FastAPI errors unchanged so the status codes survive.
        raise
    except Exception as e:
        logger.error(f"Markdown AI 分析过程中出错: {str(e)}")
        raise HTTPException(status_code=500, detail=f"分析失败: {str(e)}")
@router.post("/analyze/md/stream")
async def analyze_markdown_stream(
    file: UploadFile = File(...),
    analysis_type: str = Query("summary", description="分析类型"),
    user_prompt: str = Query("", description="用户自定义提示词"),
    section_number: Optional[str] = Query(None, description="指定章节编号")
):
    """
    Stream the AI analysis of an uploaded Markdown file as Server-Sent Events.

    Args:
        file: Uploaded Markdown file (.md or .markdown only).
        analysis_type: One of the service's supported analysis types.
        user_prompt: Optional extra instructions appended to the prompt.
        section_number: Optional section number restricting the analysis.

    Returns:
        StreamingResponse: text/event-stream relaying the service's SSE chunks.

    Raises:
        HTTPException: 400 for a bad filename/extension; 500 on setup failure.
    """
    if not file.filename:
        raise HTTPException(status_code=400, detail="文件名为空")
    file_ext = file.filename.split('.')[-1].lower()
    if file_ext not in ['md', 'markdown']:
        raise HTTPException(
            status_code=400,
            detail=f"不支持的文件类型: {file_ext},仅支持 .md 和 .markdown"
        )
    try:
        content = await file.read()
        # The analysis service reads from disk, so persist the upload first.
        with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as tmp:
            tmp.write(content)
            tmp_path = tmp.name

        logger.info(f"开始流式分析 Markdown 文件: {file.filename}, 分析类型: {analysis_type}")

        async def stream_generator():
            # BUGFIX: cleanup must happen here, not in a finally around the
            # `return StreamingResponse(...)` — the service generator opens
            # file_path lazily on first iteration, which only happens AFTER
            # this endpoint has returned. The old code deleted the temp file
            # before the stream ever read it.
            try:
                async for chunk in markdown_ai_service.analyze_markdown_stream(
                    file_path=tmp_path,
                    analysis_type=analysis_type,
                    user_prompt=user_prompt,
                    section_number=section_number
                ):
                    yield chunk
            finally:
                # Runs when streaming completes or the client disconnects.
                if os.path.exists(tmp_path):
                    os.unlink(tmp_path)

        return StreamingResponse(
            stream_generator(),
            media_type="text/event-stream",
            headers={
                "Cache-Control": "no-cache",   # never cache a live stream
                "Connection": "keep-alive",
                "X-Accel-Buffering": "no"      # disable nginx proxy buffering
            }
        )
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Markdown AI 流式分析出错: {str(e)}")
        raise HTTPException(status_code=500, detail=f"流式分析失败: {str(e)}")
@router.post("/analyze/md/outline")
async def get_markdown_outline(
    file: UploadFile = File(...)
):
    """
    Return the outline (section structure) of an uploaded Markdown file.

    BUGFIX: this route was registered as GET, but a GET request cannot carry
    a multipart file body (browsers' fetch() rejects GET+body outright), so
    the endpoint was unreachable from the frontend. Registered as POST.

    Args:
        file: Uploaded Markdown file (.md or .markdown only).

    Returns:
        dict: {"success": bool, "outline": [...]} from the service.

    Raises:
        HTTPException: 400 for a bad filename/extension; 500 on failure.
    """
    if not file.filename:
        raise HTTPException(status_code=400, detail="文件名为空")
    file_ext = file.filename.split('.')[-1].lower()
    if file_ext not in ['md', 'markdown']:
        raise HTTPException(
            status_code=400,
            detail=f"不支持的文件类型: {file_ext},仅支持 .md 和 .markdown"
        )
    try:
        content = await file.read()
        # The parser consumes a path, so persist the upload to a temp file.
        with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as tmp:
            tmp.write(content)
            tmp_path = tmp.name
        try:
            result = await markdown_ai_service.extract_outline(tmp_path)
            return result
        finally:
            # Always clean up the temp file.
            if os.path.exists(tmp_path):
                os.unlink(tmp_path)
    except Exception as e:
        logger.error(f"获取 Markdown 大纲失败: {str(e)}")
        raise HTTPException(status_code=500, detail=f"获取大纲失败: {str(e)}")

View File

@@ -2,7 +2,7 @@
LLM 服务模块 - 封装大模型 API 调用 LLM 服务模块 - 封装大模型 API 调用
""" """
import logging import logging
from typing import Dict, Any, List, Optional from typing import Dict, Any, List, Optional, AsyncGenerator
import httpx import httpx
from app.config import settings from app.config import settings
@@ -87,6 +87,71 @@ class LLMService:
logger.error(f"解析 API 响应失败: {str(e)}") logger.error(f"解析 API 响应失败: {str(e)}")
raise raise
async def chat_stream(
self,
messages: List[Dict[str, str]],
temperature: float = 0.7,
max_tokens: Optional[int] = None,
**kwargs
) -> AsyncGenerator[Dict[str, Any], None]:
"""
流式调用聊天 API
Args:
messages: 消息列表
temperature: 温度参数
max_tokens: 最大 token 数
**kwargs: 其他参数
Yields:
Dict[str, Any]: 包含 delta 内容的块
"""
headers = {
"Authorization": f"Bearer {self.api_key}",
"Content-Type": "application/json"
}
payload = {
"model": self.model_name,
"messages": messages,
"temperature": temperature,
"stream": True
}
if max_tokens:
payload["max_tokens"] = max_tokens
payload.update(kwargs)
try:
async with httpx.AsyncClient(timeout=120.0) as client:
async with client.stream(
"POST",
f"{self.base_url}/chat/completions",
headers=headers,
json=payload
) as response:
async for line in response.aiter_lines():
if line.startswith("data: "):
data = line[6:] # Remove "data: " prefix
if data == "[DONE]":
break
try:
import json as json_module
chunk = json_module.loads(data)
delta = chunk.get("choices", [{}])[0].get("delta", {}).get("content", "")
if delta:
yield {"content": delta}
except json_module.JSONDecodeError:
continue
except httpx.HTTPStatusError as e:
logger.error(f"LLM 流式 API 请求失败: {e.response.status_code}")
raise
except Exception as e:
logger.error(f"LLM 流式 API 调用异常: {str(e)}")
raise
async def analyze_excel_data( async def analyze_excel_data(
self, self,
excel_data: Dict[str, Any], excel_data: Dict[str, Any],

View File

@@ -0,0 +1,591 @@
"""
Markdown 文档 AI 分析服务
支持:
- 分章节解析(中文章节编号:一、二、三, (一)(二)(三))
- 结构化数据提取
- 流式输出
- 多种分析类型
"""
import asyncio
import json
import logging
import re
from typing import Any, AsyncGenerator, Dict, List, Optional
from app.services.llm_service import llm_service
from app.core.document_parser import MarkdownParser
logger = logging.getLogger(__name__)
class MarkdownSection:
    """A single section of a Markdown document; may hold nested subsections."""

    def __init__(self, number: str, title: str, level: int, content: str, line_start: int, line_end: int):
        # Section number, e.g. "一", "(一)", "1".
        self.number = number
        self.title = title
        # Nesting depth: 1 = top level.
        self.level = level
        # Body text of this section, excluding nested subsections.
        self.content = content
        self.line_start = line_start
        self.line_end = line_end
        self.subsections: List[MarkdownSection] = []

    def to_dict(self) -> Dict[str, Any]:
        """Serialize this section (recursively) into a plain dict."""
        # Cap the preview at 200 characters with an ellipsis marker.
        preview = self.content
        if len(preview) > 200:
            preview = preview[:200] + "..."
        return {
            "number": self.number,
            "title": self.title,
            "level": self.level,
            "content_preview": preview,
            "line_start": self.line_start,
            "line_end": self.line_end,
            "subsections": [child.to_dict() for child in self.subsections],
        }
class MarkdownAIService:
    """Markdown document AI analysis service."""

    # BUGFIX (best-effort restoration): these literals arrived as empty
    # strings — the Chinese characters were lost in a copy/encoding step.
    # Restored from the surviving character class [一二三四五六七八九十]
    # used by the patterns below. TODO confirm against the original file.
    CHINESE_NUMBERS = ["一", "二", "三", "四", "五", "六", "七", "八", "九", "十"]
    CHINESE_SUFFIX = "、"
    # Level-2 headings: "(一) 标题" / "（一）标题". The parentheses were also
    # lost in transit; without them the pattern could never match a
    # parenthesized heading. Accept ASCII and full-width parentheses.
    PARENTHESIS_PATTERN = re.compile(r'^[(（]([一二三四五六七八九十]+)[)）]\s*(.+)$')
    # Level-1 headings: "一、标题"
    CHINESE_SECTION_PATTERN = re.compile(r'^([一二三四五六七八九十]+)、\s*(.+)$')
    # Level-3 headings: "1. 标题"
    ARABIC_SECTION_PATTERN = re.compile(r'^(\d+)\.\s+(.+)$')

    def __init__(self):
        # Reusable project-local Markdown parser instance.
        self.parser = MarkdownParser()
    def get_supported_analysis_types(self) -> list:
        """Return the analysis-type identifiers accepted by this service."""
        return [
            "summary",     # document summary
            "outline",     # outline extraction
            "key_points",  # key point extraction
            "questions",   # generate comprehension questions
            "tags",        # generate topic tags
            "qa",          # question/answer pairs
            "statistics",  # statistical-data analysis (suits government bulletins)
            "section"      # detailed per-section analysis
        ]
    def extract_sections(self, content: str, titles: List[Dict]) -> List[MarkdownSection]:
        """
        Extract the hierarchical section structure from document text.

        Recognized heading formats:
            - level 1: 一、二、三...
            - level 2: (一)(二)(三)...
            - level 3: 1. 2. 3. ...

        Args:
            content: Full Markdown document text.
            titles: Title entries from the parser; each is expected to carry
                a 'line' key. NOTE(review): only used to build title_lines,
                which is never read afterwards — effectively unused.

        Returns:
            List[MarkdownSection]: top-level sections with nested subsections.
        """
        sections = []
        lines = content.split('\n')
        # Map of title line number -> title metadata.
        # NOTE(review): built but never consumed below — confirm before removing.
        title_lines = {}
        for t in titles:
            title_lines[t.get('line', 0)] = t
        current_section = None
        section_stack = []  # currently-open sections, outermost first
        for i, line in enumerate(lines, 1):  # i is a 1-based line number
            stripped = line.strip()
            # Level-1 heading: Chinese numeral followed by 、
            match = self.CHINESE_SECTION_PATTERN.match(stripped)
            if match:
                # Close the previous top-level section at the preceding line.
                if current_section:
                    current_section.content = self._get_section_content(
                        lines, current_section.line_start, i - 1
                    )
                current_section = MarkdownSection(
                    number=match.group(1),
                    title=match.group(2),
                    level=1,
                    content="",
                    line_start=i,
                    line_end=len(lines)  # provisional; never narrowed later
                )
                sections.append(current_section)
                section_stack = [current_section]
                continue
            # Level-2 heading: (一)(二)... — only recognized inside a level-1 section.
            match = self.PARENTHESIS_PATTERN.match(stripped)
            if match and current_section:
                # Close the currently open subsection, if any.
                # NOTE(review): section_stack[-1] may be a level-3 entry here,
                # so its content is finalized at this boundary — verify intended.
                if section_stack and len(section_stack) > 1:
                    parent = section_stack[-1]
                    parent.content = self._get_section_content(
                        lines, parent.line_start, i - 1
                    )
                subsection = MarkdownSection(
                    number=match.group(1),
                    title=match.group(2),
                    level=2,
                    content="",
                    line_start=i,
                    line_end=len(lines)
                )
                current_section.subsections.append(subsection)
                section_stack = [current_section, subsection]
                continue
            # Level-3 heading: "1. ..." — only when some subsection is open.
            match = self.ARABIC_SECTION_PATTERN.match(stripped)
            if match and len(section_stack) > 1:
                # NOTE(review): the stack never grows beyond two entries in
                # this method, so this close branch appears unreachable.
                if len(section_stack) > 2:
                    parent = section_stack[-1]
                    parent.content = self._get_section_content(
                        lines, parent.line_start, i - 1
                    )
                sub_subsection = MarkdownSection(
                    number=match.group(1),
                    title=match.group(2),
                    level=3,
                    content="",
                    line_start=i,
                    line_end=len(lines)
                )
                # NOTE(review): appending to section_stack[-1] and then
                # replacing the top means a second "N." item nests under the
                # previous level-3 item rather than as its sibling — verify.
                section_stack[-1].subsections.append(sub_subsection)
                section_stack = section_stack[:-1] + [sub_subsection]
                continue
        # Finalize the last open top-level section through end of document.
        if current_section:
            current_section.content = self._get_section_content(
                lines, current_section.line_start, len(lines)
            )
        return sections
def _get_section_content(self, lines: List[str], start: int, end: int) -> str:
"""获取指定行范围的内容"""
if start > end:
return ""
content_lines = lines[start-1:end]
# 清理:移除标题行和空行
cleaned = []
for line in content_lines:
stripped = line.strip()
if not stripped:
continue
# 跳过章节标题行
if self.CHINESE_SECTION_PATTERN.match(stripped):
continue
if self.PARENTHESIS_PATTERN.match(stripped):
continue
if self.ARABIC_SECTION_PATTERN.match(stripped):
continue
cleaned.append(stripped)
return '\n'.join(cleaned)
    async def analyze_markdown(
        self,
        file_path: str,
        analysis_type: str = "summary",
        user_prompt: str = "",
        section_number: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Analyze a Markdown document with the LLM (non-streaming).

        Args:
            file_path: Path of the Markdown file on disk.
            analysis_type: One of get_supported_analysis_types().
            user_prompt: Optional extra instructions appended to the prompt.
            section_number: Optional section number (e.g. "一" / "(一)") that
                restricts the analysis to a single section.

        Returns:
            dict: On success — filename, analysis type, structure counters,
            up to 10 top-level sections and the LLM's analysis text; on
            failure — {"success": False, "error": ...}.
        """
        try:
            parse_result = self.parser.parse(file_path)
            if not parse_result.success:
                return {
                    "success": False,
                    "error": parse_result.error
                }
            data = parse_result.data
            # Build the section tree from the raw text.
            sections = self.extract_sections(data.get("content", ""), data.get("titles", []))
            # Default target: the whole document.
            target_content = data.get("content", "")
            target_title = parse_result.metadata.get("filename", "")
            if section_number:
                # Narrow the analysis to the requested section, if found.
                section = self._find_section(sections, section_number)
                if section:
                    target_content = section.content
                    target_title = f"{section.number}{section.title}"
                else:
                    return {
                        "success": False,
                        "error": f"未找到章节: {section_number}"
                    }
            # Build the user prompt for the chosen analysis type.
            prompt = self._build_prompt(
                content=target_content,
                analysis_type=analysis_type,
                user_prompt=user_prompt,
                title=target_title
            )
            messages = [
                {"role": "system", "content": self._get_system_prompt(analysis_type)},
                {"role": "user", "content": prompt}
            ]
            # Low temperature for more deterministic, factual output.
            response = await llm_service.chat(
                messages=messages,
                temperature=0.3,
                max_tokens=4000
            )
            analysis = llm_service.extract_message_content(response)
            return {
                "success": True,
                "filename": parse_result.metadata.get("filename", ""),
                "analysis_type": analysis_type,
                "section": target_title if section_number else None,
                "word_count": len(target_content),
                "structure": {
                    "title_count": parse_result.metadata.get("title_count", 0),
                    "code_block_count": parse_result.metadata.get("code_block_count", 0),
                    "table_count": parse_result.metadata.get("table_count", 0),
                    "section_count": len(sections)
                },
                "sections": [s.to_dict() for s in sections[:10]],  # cap at 10 top-level sections
                "analysis": analysis
            }
        except Exception as e:
            logger.error(f"Markdown AI 分析失败: {str(e)}")
            return {
                "success": False,
                "error": str(e)
            }
async def analyze_markdown_stream(
self,
file_path: str,
analysis_type: str = "summary",
user_prompt: str = "",
section_number: Optional[str] = None
) -> AsyncGenerator[str, None]:
"""
流式分析 Markdown 文档 (SSE)
Yields:
str: SSE 格式的数据块
"""
try:
parse_result = self.parser.parse(file_path)
if not parse_result.success:
yield f"data: {json.dumps({'error': parse_result.error}, ensure_ascii=False)}\n\n"
return
data = parse_result.data
sections = self.extract_sections(data.get("content", ""), data.get("titles", []))
target_content = data.get("content", "")
target_title = parse_result.metadata.get("filename", "")
if section_number:
section = self._find_section(sections, section_number)
if section:
target_content = section.content
target_title = f"{section.number}{section.title}"
else:
yield f"data: {json.dumps({'error': f'未找到章节: {section_number}'}, ensure_ascii=False)}\n\n"
return
prompt = self._build_prompt(
content=target_content,
analysis_type=analysis_type,
user_prompt=user_prompt,
title=target_title
)
messages = [
{"role": "system", "content": self._get_system_prompt(analysis_type)},
{"role": "user", "content": prompt}
]
# 发送初始元数据
yield f"data: {json.dumps({
'type': 'start',
'filename': parse_result.metadata.get("filename", ""),
'analysis_type': analysis_type,
'section': target_title if section_number else None,
'word_count': len(target_content)
}, ensure_ascii=False)}\n\n"
# 流式调用 LLM
full_response = ""
async for chunk in llm_service.chat_stream(messages, temperature=0.3, max_tokens=4000):
content = chunk.get("content", "")
if content:
full_response += content
yield f"data: {json.dumps({'type': 'content', 'delta': content}, ensure_ascii=False)}\n\n"
# 发送完成消息
yield f"data: {json.dumps({'type': 'done', 'full_response': full_response}, ensure_ascii=False)}\n\n"
except Exception as e:
logger.error(f"Markdown AI 流式分析失败: {str(e)}")
yield f"data: {json.dumps({'error': str(e)}, ensure_ascii=False)}\n\n"
def _find_section(self, sections: List[MarkdownSection], number: str) -> Optional[MarkdownSection]:
"""查找指定编号的章节"""
# 标准化编号
num = number.strip()
for section in sections:
if section.number == num or section.title == num:
return section
# 在子章节中查找
found = self._find_section(section.subsections, number)
if found:
return found
return None
def _get_system_prompt(self, analysis_type: str) -> str:
"""根据分析类型获取系统提示词"""
prompts = {
"summary": "你是一个专业的文档摘要助手,擅长从长文档中提取核心信息。",
"outline": "你是一个专业的文档结构分析助手,擅长提取文档大纲和层级结构。",
"key_points": "你是一个专业的知识提取助手,擅长从文档中提取关键信息和要点。",
"questions": "你是一个专业的教育助手,擅长生成帮助理解文档的问题。",
"tags": "你是一个专业的标签生成助手,擅长提取文档的主题标签。",
"qa": "你是一个专业的问答助手,擅长基于文档内容生成问答对。",
"statistics": "你是一个专业的统计数据分析助手,擅长分析政府统计公报中的数据。",
"section": "你是一个专业的章节分析助手,擅长对文档的特定章节进行深入分析。"
}
return prompts.get(analysis_type, "你是一个专业的文档分析助手。")
    def _build_prompt(
        self,
        content: str,
        analysis_type: str,
        user_prompt: str,
        title: str = ""
    ) -> str:
        """
        Build the user prompt for the given analysis type.

        Args:
            content: Text to analyze; truncated to keep the request within
                the model's token budget (tags/qa templates truncate further).
            analysis_type: Analysis type key; unknown keys fall back to "summary".
            user_prompt: Extra user instructions appended at the end, if any.
            title: Document or section title interpolated into the template.

        Returns:
            str: Complete prompt text (Chinese, by design).
        """
        # Global truncation guard against oversized documents.
        max_content_len = 6000
        if len(content) > max_content_len:
            content = content[:max_content_len] + "\n\n[内容已截断...]"
        base_prompts = {
            "summary": f"""请对以下文档进行摘要分析:
文档标题:{title}
文档内容:
{content}
请提供:
1. 文档主要内容摘要300字以内
2. 文档的目的和用途
3. 适合的读者群体
请用中文回答,结构清晰。""",
            "outline": f"""请提取以下文档的大纲结构:
文档标题:{title}
文档内容:
{content}
请按层级列出文档大纲,用缩进表示层级关系。
格式:
一、一级标题
(一)二级标题
1. 三级标题
请用中文回答。""",
            "key_points": f"""请从以下文档中提取关键要点:
文档标题:{title}
文档内容:
{content}
请列出文档的关键要点5-10条每条用简洁的语言描述并说明其在文档中的重要性。
请用中文回答,格式清晰。""",
            "questions": f"""请根据以下文档生成有助于理解内容的问题:
文档标题:{title}
文档内容:
{content}
请生成5-10个问题帮助读者更好地理解文档内容。每个问题应该
1. 涵盖文档的重要信息点
2. 易于理解和回答
3. 具有思考价值
请用中文回答。""",
            "tags": f"""请为以下文档生成标签:
文档标题:{title}
文档内容:
{content[:3000]}
请生成5-8个标签用逗号分隔。标签应该反映
- 文档的主题领域
- 文档的类型
- 文档的关键特征
请用中文回答,只需输出标签,不要其他内容。""",
            "qa": f"""请根据以下文档生成问答对:
文档标题:{title}
文档内容:
{content[:4000]}
请生成3-5个问答对帮助读者通过问答形式理解文档内容。
格式:
Q1: 问题
A1: 回答
Q2: 问题
A2: 回答
请用中文回答,内容准确。""",
            "statistics": f"""请分析以下政府统计公报中的数据和结论:
文档标题:{title}
文档内容:
{content}
请提供:
1. 文档中涉及的主要统计数据(列出关键数字和指标)
2. 数据的变化趋势(增长/下降)
3. 重要的百分比和对比
4. 数据来源和统计口径说明
请用中文回答,数据准确。""",
            "section": f"""请详细分析以下文档章节:
章节标题:{title}
章节内容:
{content}
请提供:
1. 章节主要内容概括
2. 关键信息和数据
3. 与其他部分的关联(如有)
4. 重要结论
请用中文回答,分析深入。"""
        }
        # Unknown types degrade gracefully to the summary prompt.
        prompt = base_prompts.get(analysis_type, base_prompts["summary"])
        if user_prompt and user_prompt.strip():
            # Append caller-supplied instructions verbatim.
            prompt += f"\n\n用户额外需求:{user_prompt}"
        return prompt
async def extract_outline(self, file_path: str) -> Dict[str, Any]:
"""提取文档大纲"""
try:
parse_result = self.parser.parse(file_path)
if not parse_result.success:
return {"success": False, "error": parse_result.error}
data = parse_result.data
sections = self.extract_sections(data.get("content", ""), data.get("titles", []))
# 构建结构化大纲
outline = []
for section in sections:
outline.append({
"number": section.number,
"title": section.title,
"level": section.level,
"line": section.line_start,
"content_preview": section.content[:100] + "..." if len(section.content) > 100 else section.content,
"subsections": [{
"number": s.number,
"title": s.title,
"level": s.level,
"line": s.line_start
} for s in section.subsections]
})
return {
"success": True,
"outline": outline
}
except Exception as e:
logger.error(f"大纲提取失败: {str(e)}")
return {"success": False, "error": str(e)}
async def extract_tables_summary(self, file_path: str) -> Dict[str, Any]:
"""提取并总结文档中的表格"""
try:
parse_result = self.parser.parse(file_path)
if not parse_result.success:
return {"success": False, "error": parse_result.error}
tables = parse_result.data.get("tables", [])
if not tables:
return {"success": True, "tables": [], "message": "文档中没有表格"}
# 提取每个表格的关键信息
table_summaries = []
for i, table in enumerate(tables):
summary = {
"index": i + 1,
"headers": table.get("headers", []),
"row_count": table.get("row_count", 0),
"column_count": table.get("column_count", 0),
"preview_rows": table.get("rows", [])[:3], # 只取前3行预览
"first_column": [row[0] if row else "" for row in table.get("rows", [])[:5]]
}
table_summaries.append(summary)
return {
"success": True,
"tables": table_summaries,
"table_count": len(tables)
}
except Exception as e:
logger.error(f"表格提取失败: {str(e)}")
return {"success": False, "error": str(e)}
# Module-level singleton shared by the API endpoints.
markdown_ai_service = MarkdownAIService()

View File

@@ -1,113 +0,0 @@
✅ Excel 文件解析功能已完成并测试通过
已完成的工作
后端部分
1. 文件服务层 (backend/app/services/file_service.py)
- 文件保存、读取、删除功能
- 文件信息获取
2. Excel 解析模块 (backend/app/core/document_parser/)
- base.py - 解析器基类
- xlsx_parser.py - Excel 文件解析器
- utils.py - 工具函数
3. API 接口 (backend/app/api/endpoints/upload.py)
- POST /upload/excel - 上传并解析 Excel 文件
- GET /upload/excel/preview/{file_path} - 获取预览
- DELETE /upload/file - 删除文件
4. 主应用配置 (backend/app/main.py)
- CORS 中间件配置
- API 路由注册
前端部分
1. 后端 API 封装 (frontendTest/app-a6ww9j3ja3nl/src/db/backend-api.ts)
- Excel 上传、预览、删除函数
2. Excel 解析页面 (frontendTest/app-a6ww9j3ja3nl/src/pages/ExcelParse.tsx)
- 拖拽上传界面
- 解析选项配置
- 数据表格展示
- 多工作表支持
3. 路由配置 (frontendTest/app-a6ww9j3ja3nl/src/routes.tsx)
- 添加 /excel-parse 路由
4. 环境配置 (.env)
- 后端 API URL 配置
测试结果
后端 API 测试通过:
- ✅ 健康检查http://127.0.0.1:8000/health
- ✅ Excel 上传解析:成功解析 5 行 6 列的测试数据
访问方式
1. 后端 API 文档http://127.0.0.1:8000/api/v1/docs
2. 前端页面:
- 启动前端cd frontendTest/app-a6ww9j3ja3nl && npm run dev
- 访问 Excel 解析页面:/excel-parse
下一步建议
1. 启动前端开发服务器测试完整的前端界面
2. 添加更多文档格式解析支持docx, md, txt
3. 完善错误处理和用户提示
✻ Worked for 4m 18s · 1 background task still running (↓ to manage)
> 如何启动后端呢
● 后端启动步骤如下:
方法一:使用命令行启动(开发模式)
# 1. 进入后端目录
cd H:/OwnProject/FilesReadSysteam/backend
# 2. 激活虚拟环境
# PowerShell:
.\venv\Scripts\Activate.ps1
# 或 CMD:
.\venv\Scripts\Activate.bat
# 3. 启动后端服务器
python -m uvicorn app.main:app --host 127.0.0.1 --port 8000 --reload
方法二:直接使用虚拟环境中的 Python
cd H:/OwnProject/FilesReadSysteam/backend
./venv/Scripts/python.exe -m uvicorn app.main:app --host 127.0.0.1 --port 8000 --reload
方法三:直接运行 main.py开发调试
cd H:/OwnProject/FilesReadSysteam/backend
./venv/Scripts/python.exe app/main.py
启动成功的标志
INFO: Uvicorn running on http://127.0.0.1:8000
INFO: Started server process [xxxxx]
INFO: Waiting for application startup.
INFO: Application startup complete.
验证服务是否正常
# 健康检查
curl http://127.0.0.1:8000/health
# 访问 API 文档
# 浏览器打开: http://127.0.0.1:8000/api/v1/docs
---
当前状态:后端已在后台运行(任务 ID: b22jkg69j可以直接访问 http://127.0.0.1:8000
需要停止的话告诉我即可。

View File

@@ -166,6 +166,43 @@ export interface AIAnalysisResult {
error?: string; error?: string;
} }
// ==================== Markdown AI analysis types ====================

/** Result returned by POST /ai/analyze/md. */
export interface AIMarkdownAnalyzeResult {
  success: boolean;
  filename?: string;
  analysis_type?: string;
  /** Section title when a single section was analyzed. */
  section?: string;
  word_count?: number;
  /** Document structure counters reported by the backend parser. */
  structure?: {
    title_count?: number;
    code_block_count?: number;
    table_count?: number;
    section_count?: number;
  };
  sections?: MarkdownSection[];
  /** The LLM-generated analysis text. */
  analysis?: string;
  error?: string;
}

/** One section of a Markdown document (recursive via subsections). */
export interface MarkdownSection {
  number: string;
  title: string;
  level: number;
  content_preview?: string;
  line_start: number;
  line_end?: number;
  subsections?: MarkdownSection[];
}

/** Result returned by the Markdown outline endpoint. */
export interface MarkdownOutlineResult {
  success: boolean;
  outline?: MarkdownSection[];
  error?: string;
}

/** Analysis types supported by the backend Markdown AI service. */
export type MarkdownAnalysisType = 'summary' | 'outline' | 'key_points' | 'questions' | 'tags' | 'qa' | 'statistics' | 'section';
export interface AIExcelAnalyzeResult { export interface AIExcelAnalyzeResult {
success: boolean; success: boolean;
excel?: { excel?: {
@@ -842,6 +879,159 @@ export const aiApi = {
} }
}, },
  /**
   * Upload a Markdown file and analyze it with the backend AI service.
   *
   * Options travel as query parameters; the file travels as multipart
   * form data. Throws on HTTP errors, preferring the backend's `detail`.
   */
  async analyzeMarkdown(
    file: File,
    options: {
      analysisType?: MarkdownAnalysisType;
      userPrompt?: string;
      sectionNumber?: string;
    } = {}
  ): Promise<AIMarkdownAnalyzeResult> {
    const formData = new FormData();
    formData.append('file', file);
    // Only append options that were actually provided.
    const params = new URLSearchParams();
    if (options.analysisType) {
      params.append('analysis_type', options.analysisType);
    }
    if (options.userPrompt) {
      params.append('user_prompt', options.userPrompt);
    }
    if (options.sectionNumber) {
      params.append('section_number', options.sectionNumber);
    }
    const url = `${BACKEND_BASE_URL}/ai/analyze/md?${params.toString()}`;
    try {
      const response = await fetch(url, {
        method: 'POST',
        body: formData,
      });
      if (!response.ok) {
        const error = await response.json();
        throw new Error(error.detail || 'Markdown AI 分析失败');
      }
      return await response.json();
    } catch (error) {
      console.error('Markdown AI 分析失败:', error);
      throw error;
    }
  },
  /**
   * Analyze a Markdown file via the SSE streaming endpoint.
   *
   * Posts the file as multipart form data, reads the response body
   * incrementally, parses `data: {...}` lines and forwards each parsed
   * event to `onChunk`. Resolves with the full concatenated analysis text.
   *
   * NOTE(review): decoder.decode(value) is called without { stream: true },
   * and a `data:` line split across two network chunks fails JSON.parse and
   * is silently dropped — confirm this is acceptable for real traffic.
   */
  async analyzeMarkdownStream(
    file: File,
    options: {
      analysisType?: MarkdownAnalysisType;
      userPrompt?: string;
      sectionNumber?: string;
    } = {},
    onChunk?: (chunk: { type: string; delta?: string; error?: string }) => void
  ): Promise<string> {
    const formData = new FormData();
    formData.append('file', file);
    const params = new URLSearchParams();
    if (options.analysisType) {
      params.append('analysis_type', options.analysisType);
    }
    if (options.userPrompt) {
      params.append('user_prompt', options.userPrompt);
    }
    if (options.sectionNumber) {
      params.append('section_number', options.sectionNumber);
    }
    const url = `${BACKEND_BASE_URL}/ai/analyze/md/stream?${params.toString()}`;
    try {
      const response = await fetch(url, {
        method: 'POST',
        body: formData,
      });
      if (!response.ok) {
        const error = await response.json();
        throw new Error(error.detail || 'Markdown AI 流式分析失败');
      }
      const reader = response.body?.getReader();
      if (!reader) throw new Error('无法读取响应流');
      const decoder = new TextDecoder();
      let fullResponse = '';
      while (true) {
        const { done, value } = await reader.read();
        if (done) break;
        const chunk = decoder.decode(value);
        const lines = chunk.split('\n');
        for (const line of lines) {
          if (line.startsWith('data: ')) {
            const data = line.slice(6); // strip the "data: " SSE prefix
            if (data === '[DONE]') continue;
            try {
              const parsed = JSON.parse(data);
              if (parsed.type === 'content' && parsed.delta) {
                fullResponse += parsed.delta;
                onChunk?.({ type: 'content', delta: parsed.delta });
              } else if (parsed.type === 'done') {
                // Prefer the server's authoritative full text when provided.
                fullResponse = parsed.full_response || fullResponse;
              } else if (parsed.error) {
                onChunk?.({ type: 'error', error: parsed.error });
              }
            } catch {
              // Ignore parse errors for incomplete JSON
            }
          }
        }
      }
      return fullResponse;
    } catch (error) {
      console.error('Markdown AI 流式分析失败:', error);
      throw error;
    }
  },
/**
* 获取 Markdown 文档大纲(分章节信息)
*/
async getMarkdownOutline(file: File): Promise<MarkdownOutlineResult> {
const formData = new FormData();
formData.append('file', file);
const url = `${BACKEND_BASE_URL}/ai/analyze/md/outline`;
try {
const response = await fetch(url, {
method: 'GET',
body: formData,
});
if (!response.ok) {
const error = await response.json();
throw new Error(error.detail || '获取 Markdown 大纲失败');
}
return await response.json();
} catch (error) {
console.error('获取 Markdown 大纲失败:', error);
throw error;
}
},
/** /**
* 生成统计信息和图表 * 生成统计信息和图表
*/ */

View File

@@ -19,7 +19,11 @@ import {
TrendingUp, TrendingUp,
Download, Download,
Brain, Brain,
Settings2 Settings2,
List,
MessageSquareCode,
Tag,
HelpCircle
} from 'lucide-react'; } from 'lucide-react';
import { Button } from '@/components/ui/button'; import { Button } from '@/components/ui/button';
import { Input } from '@/components/ui/input'; import { Input } from '@/components/ui/input';
@@ -33,7 +37,7 @@ import { Checkbox } from '@/components/ui/checkbox';
import { toast } from 'sonner'; import { toast } from 'sonner';
import { cn } from '@/lib/utils'; import { cn } from '@/lib/utils';
import { Skeleton } from '@/components/ui/skeleton'; import { Skeleton } from '@/components/ui/skeleton';
import { backendApi, type ExcelParseResult, aiApi } from '@/db/backend-api'; import { backendApi, type ExcelParseResult, type AIMarkdownAnalyzeResult, type MarkdownSection, aiApi } from '@/db/backend-api';
import { import {
Table as TableComponent, Table as TableComponent,
TableBody, TableBody,
@@ -78,6 +82,15 @@ const Documents: React.FC = () => {
const [analysisCharts, setAnalysisCharts] = useState<any>(null); const [analysisCharts, setAnalysisCharts] = useState<any>(null);
const [analysisTypes, setAnalysisTypes] = useState<Array<{ value: string; label: string; description: string }>>([]); const [analysisTypes, setAnalysisTypes] = useState<Array<{ value: string; label: string; description: string }>>([]);
// Markdown AI 分析相关状态
const [mdAnalysis, setMdAnalysis] = useState<AIMarkdownAnalyzeResult | null>(null);
const [mdAnalysisType, setMdAnalysisType] = useState<'summary' | 'outline' | 'key_points' | 'questions' | 'tags' | 'qa' | 'statistics' | 'section'>('summary');
const [mdUserPrompt, setMdUserPrompt] = useState('');
const [mdSections, setMdSections] = useState<MarkdownSection[]>([]);
const [mdSelectedSection, setMdSelectedSection] = useState<string>('');
const [mdStreaming, setMdStreaming] = useState(false);
const [mdStreamingContent, setMdStreamingContent] = useState('');
// 解析选项 // 解析选项
const [parseOptions, setParseOptions] = useState({ const [parseOptions, setParseOptions] = useState({
parseAllSheets: false, parseAllSheets: false,
@@ -144,6 +157,9 @@ const Documents: React.FC = () => {
setAiAnalysis(null); setAiAnalysis(null);
setAnalysisCharts(null); setAnalysisCharts(null);
setExpandedSheet(null); setExpandedSheet(null);
setMdAnalysis(null);
setMdSections([]);
setMdStreamingContent('');
const ext = file.name.split('.').pop()?.toLowerCase(); const ext = file.name.split('.').pop()?.toLowerCase();
@@ -163,6 +179,9 @@ const Documents: React.FC = () => {
} else { } else {
toast.error(result.error || '解析失败'); toast.error(result.error || '解析失败');
} }
} else if (ext === 'md' || ext === 'markdown') {
// Markdown 文件:获取大纲
await fetchMdOutline();
} else { } else {
// 其他文档使用通用上传接口 // 其他文档使用通用上传接口
const result = await backendApi.uploadDocument(file); const result = await backendApi.uploadDocument(file);
@@ -403,6 +422,105 @@ const Documents: React.FC = () => {
} }
}; };
const isMarkdownFile = (filename: string) => {
const ext = filename.split('.').pop()?.toLowerCase();
return ext === 'md' || ext === 'markdown';
};
  // Run a one-shot (non-streaming) Markdown AI analysis of the uploaded file.
  const handleMdAnalyze = async () => {
    // Guard: requires a previously uploaded .md/.markdown file.
    if (!uploadedFile || !isMarkdownFile(uploadedFile.name)) {
      toast.error('请先上传 Markdown 文件');
      return;
    }
    setAnalyzing(true);
    setMdAnalysis(null); // clear any previous result before re-analyzing
    try {
      const result = await aiApi.analyzeMarkdown(uploadedFile, {
        analysisType: mdAnalysisType,
        userPrompt: mdUserPrompt,
        sectionNumber: mdSelectedSection || undefined // '' -> whole document
      });
      if (result.success) {
        toast.success('Markdown AI 分析完成');
        setMdAnalysis(result);
      } else {
        toast.error(result.error || 'AI 分析失败');
      }
    } catch (error: any) {
      toast.error(error.message || 'AI 分析失败');
    } finally {
      setAnalyzing(false);
    }
  };
  // Run a streaming (SSE) Markdown AI analysis, appending deltas as they arrive.
  const handleMdAnalyzeStream = async () => {
    if (!uploadedFile || !isMarkdownFile(uploadedFile.name)) {
      toast.error('请先上传 Markdown 文件');
      return;
    }
    setAnalyzing(true);
    setMdStreaming(true);
    setMdStreamingContent(''); // reset the incremental output buffer
    setMdAnalysis(null);
    try {
      await aiApi.analyzeMarkdownStream(
        uploadedFile,
        {
          analysisType: mdAnalysisType,
          userPrompt: mdUserPrompt,
          sectionNumber: mdSelectedSection || undefined
        },
        (chunk: { type: string; delta?: string; error?: string }) => {
          if (chunk.type === 'content' && chunk.delta) {
            // Functional update keeps appends correct across rapid chunks.
            setMdStreamingContent(prev => prev + chunk.delta);
          } else if (chunk.type === 'error') {
            toast.error(chunk.error || '流式分析出错');
          }
        }
      );
    } catch (error: any) {
      toast.error(error.message || 'AI 分析失败');
    } finally {
      setAnalyzing(false);
      setMdStreaming(false);
    }
  };
  // Fetch the document's section outline (populates the section picker).
  const fetchMdOutline = async () => {
    if (!uploadedFile || !isMarkdownFile(uploadedFile.name)) return;
    try {
      const result = await aiApi.getMarkdownOutline(uploadedFile);
      if (result.success && result.outline) {
        setMdSections(result.outline);
      }
    } catch (error) {
      // The outline is a nice-to-have; failures are logged, not surfaced.
      console.error('获取大纲失败:', error);
    }
  };
  // Map an analysis type to its display icon; unknown types fall back to Sparkles.
  const getMdAnalysisIcon = (type: string) => {
    switch (type) {
      case 'summary': return <FileText size={20} />;
      case 'outline': return <List size={20} />;
      case 'key_points': return <TrendingUp size={20} />;
      case 'statistics': return <TrendingUp size={20} />;
      case 'section': return <FileText size={20} />;
      case 'questions': return <MessageSquareCode size={20} />;
      case 'tags': return <Tag size={20} />;
      case 'qa': return <HelpCircle size={20} />;
      default: return <Sparkles size={20} />;
    }
  };
const formatFileSize = (bytes: number): string => { const formatFileSize = (bytes: number): string => {
if (bytes === 0) return '0 B'; if (bytes === 0) return '0 B';
const k = 1024; const k = 1024;
@@ -600,6 +718,97 @@ const Documents: React.FC = () => {
</Card> </Card>
)} )}
{/* Markdown AI analysis options — rendered only when the uploaded file is Markdown */}
{uploadedFile && isMarkdownFile(uploadedFile.name) && (
<Card className="border-none shadow-md bg-gradient-to-br from-purple-500/5 to-primary/5">
<CardHeader className="pb-4">
<CardTitle className="flex items-center gap-2">
<Sparkles className="text-purple-500" size={20} />
Markdown AI
</CardTitle>
</CardHeader>
<CardContent className="space-y-4">
{/* Section selector — shown only after an outline has been fetched */}
{mdSections.length > 0 && (
<div className="space-y-2">
{/* NOTE(review): Label text is empty — possibly lost in encoding; confirm intended label */}
<Label htmlFor="md-section" className="text-sm"></Label>
<Select value={mdSelectedSection} onValueChange={setMdSelectedSection}>
<SelectTrigger id="md-section" className="bg-background">
<SelectValue placeholder="全文分析" />
</SelectTrigger>
<SelectContent>
{/* NOTE(review): an empty-string SelectItem value is rejected by Radix-based
    Select components at runtime — verify this "whole document" option works */}
<SelectItem value=""></SelectItem>
{mdSections.map((section) => (
<SelectItem key={section.number} value={section.number}>
{section.number}{section.title}
</SelectItem>
))}
</SelectContent>
</Select>
</div>
)}
{/* Analysis-type selector: one entry per supported Markdown analysis */}
<div className="space-y-2">
{/* NOTE(review): Label text is empty — possibly lost in encoding; confirm intended label */}
<Label htmlFor="md-analysis-type" className="text-sm"></Label>
<Select value={mdAnalysisType} onValueChange={(value: any) => setMdAnalysisType(value)}>
<SelectTrigger id="md-analysis-type" className="bg-background">
<SelectValue />
</SelectTrigger>
<SelectContent>
{[
{ value: 'summary', label: '文档摘要', desc: '主要内容摘要' },
{ value: 'outline', label: '大纲提取', desc: '提取文档结构' },
{ value: 'key_points', label: '关键要点', desc: '提取关键信息' },
{ value: 'statistics', label: '统计分析', desc: '统计数据分析' },
{ value: 'section', label: '章节分析', desc: '分章节详细分析' },
{ value: 'questions', label: '生成问题', desc: '生成理解性问题' },
{ value: 'tags', label: '生成标签', desc: '提取主题标签' },
{ value: 'qa', label: '问答对', desc: '生成问答内容' }
].map(type => (
<SelectItem key={type.value} value={type.value}>
<div className="flex items-center gap-2">
{getMdAnalysisIcon(type.value)}
<div className="flex flex-col">
<span className="font-medium">{type.label}</span>
<span className="text-xs text-muted-foreground">{type.desc}</span>
</div>
</div>
</SelectItem>
))}
</SelectContent>
</Select>
</div>
{/* Optional free-form prompt appended to the analysis request */}
<div className="space-y-2">
{/* NOTE(review): Label text is empty — possibly lost in encoding; confirm intended label */}
<Label htmlFor="md-user-prompt" className="text-sm"></Label>
<Textarea
id="md-user-prompt"
placeholder="例如:请重点关注技术实现部分..."
value={mdUserPrompt}
onChange={(e) => setMdUserPrompt(e.target.value)}
className="bg-background resize-none"
rows={2}
/>
</div>
{/* Two triggers: one-shot analysis vs. streaming analysis; both disabled while a run is active */}
<div className="flex gap-2">
<Button
onClick={handleMdAnalyze}
disabled={analyzing}
className="flex-1 bg-gradient-to-r from-purple-500 to-primary hover:from-purple-500/90 hover:to-primary/90"
>
{analyzing && !mdStreaming ? <><Loader2 className="mr-2 animate-spin" size={16} /> ...</> : <><Sparkles className="mr-2" size={16} /></>}
</Button>
<Button
onClick={handleMdAnalyzeStream}
disabled={analyzing}
variant="outline"
className="flex-1"
>
{analyzing && mdStreaming ? <><Loader2 className="mr-2 animate-spin" size={16} /> ...</> : <><Sparkles className="mr-2" size={16} /></>}
</Button>
</div>
</CardContent>
</Card>
)}
{/* 数据操作 */} {/* 数据操作 */}
{parseResult?.success && ( {parseResult?.success && (
<Card className="border-none shadow-md bg-gradient-to-br from-emerald-500/5 to-blue-500/5"> <Card className="border-none shadow-md bg-gradient-to-br from-emerald-500/5 to-blue-500/5">
@@ -661,6 +870,45 @@ const Documents: React.FC = () => {
</Card> </Card>
)} )}
{/* Markdown AI analysis results — visible while streaming or once a result exists */}
{(mdAnalysis || mdStreamingContent) && (
<Card className="border-none shadow-md border-l-4 border-l-purple-500">
<CardHeader>
<div className="flex items-center justify-between">
<div className="space-y-1">
<CardTitle className="flex items-center gap-2">
<Sparkles className="text-purple-500" size={20} />
Markdown AI
{/* NOTE(review): Badge text is empty — likely a lost "streaming" label; confirm */}
{mdStreaming && <Badge variant="default" className="ml-2 bg-purple-500"></Badge>}
</CardTitle>
{/* Summary line of the completed analysis (filename, word count, type, optional section).
    NOTE(review): separators between these values appear missing — possibly lost in encoding */}
{mdAnalysis && (
<CardDescription>
{mdAnalysis.filename} {mdAnalysis.word_count || 0} {mdAnalysis.analysis_type}
{mdAnalysis.section && `${mdAnalysis.section}`}
</CardDescription>
)}
</div>
{/* Document structure stats (title/section counts) when the backend returned them */}
{mdAnalysis?.structure && (
<Badge variant="secondary">
{mdAnalysis.structure.title_count || 0} {mdAnalysis.structure.section_count || 0}
</Badge>
)}
</div>
</CardHeader>
<CardContent className="max-h-[500px] overflow-y-auto">
{/* Streaming output takes precedence over the final result while chunks arrive */}
{mdStreamingContent && (
<div className="animate-pulse text-sm text-muted-foreground mb-4">
...
</div>
)}
{mdStreamingContent && <Markdown content={mdStreamingContent} />}
{/* Final rendered analysis, only when no streaming buffer is shown */}
{mdAnalysis?.analysis && !mdStreamingContent && <Markdown content={mdAnalysis.analysis} />}
{/* Error fallback when the analysis failed and nothing is streaming */}
{!mdAnalysis?.success && !mdStreamingContent && <p className="text-sm text-destructive">{mdAnalysis?.error || '分析失败'}</p>}
</CardContent>
</Card>
)}
{/* 图表显示 */} {/* 图表显示 */}
{analysisCharts && ( {analysisCharts && (
<Card className="border-none shadow-md border-l-4 border-l-indigo-500"> <Card className="border-none shadow-md border-l-4 border-l-indigo-500">