feat(ai-analyze): 新增 Markdown 文件 AI 分析功能

- 添加 Markdown 文件上传和解析接口
- 实现流式分析和大纲提取功能
- 支持多种分析类型:摘要、大纲、关键点等
- 新增 markdown_ai_service 服务类
- 扩展 LLMService 支持流式调用
- 更新前端 API 接口定义和实现
This commit is contained in:
2026-04-02 11:53:12 +08:00
parent ddf30078f0
commit d189ea9620
6 changed files with 1286 additions and 118 deletions

View File

@@ -2,10 +2,14 @@
AI 分析 API 接口
"""
from fastapi import APIRouter, UploadFile, File, HTTPException, Query, Body
from fastapi.responses import StreamingResponse
from typing import Optional
import logging
import tempfile
import os
from app.services.excel_ai_service import excel_ai_service
from app.services.markdown_ai_service import markdown_ai_service
logger = logging.getLogger(__name__)
@@ -93,10 +97,11 @@ async def get_analysis_types():
获取支持的分析类型列表
Returns:
list: 支持的分析类型
dict: 支持的分析类型(包含 Excel 和 Markdown)
"""
return {
"types": excel_ai_service.get_supported_analysis_types()
"excel_types": excel_ai_service.get_supported_analysis_types(),
"markdown_types": markdown_ai_service.get_supported_analysis_types()
}
@@ -142,3 +147,185 @@ async def analyze_text(
except Exception as e:
logger.error(f"文本分析失败: {str(e)}")
raise HTTPException(status_code=500, detail=f"分析失败: {str(e)}")
@router.post("/analyze/md")
async def analyze_markdown(
    file: UploadFile = File(...),
    analysis_type: str = Query("summary", description="分析类型: summary, outline, key_points, questions, tags, qa, statistics, section"),
    user_prompt: str = Query("", description="用户自定义提示词"),
    section_number: Optional[str] = Query(None, description="指定章节编号,如 '''(一)'")
):
    """Upload a Markdown file and run an AI analysis over it.

    Args:
        file: uploaded Markdown document (.md / .markdown only).
        analysis_type: one of the types reported by markdown_ai_service.
        user_prompt: optional extra instructions appended to the prompt.
        section_number: when given, restrict analysis to that section.

    Returns:
        dict: the analysis result produced by markdown_ai_service.
    """
    # Reject uploads without a usable filename up front.
    if not file.filename:
        raise HTTPException(status_code=400, detail="文件名为空")

    extension = file.filename.split('.')[-1].lower()
    if extension not in {'md', 'markdown'}:
        raise HTTPException(
            status_code=400,
            detail=f"不支持的文件类型: {extension},仅支持 .md 和 .markdown"
        )

    # Make sure the requested analysis type is one the service knows about.
    known_types = markdown_ai_service.get_supported_analysis_types()
    if analysis_type not in known_types:
        raise HTTPException(
            status_code=400,
            detail=f"不支持的分析类型: {analysis_type},支持的类型: {', '.join(known_types)}"
        )

    try:
        raw_bytes = await file.read()

        # Persist the upload so the parser can work from a filesystem path.
        with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as handle:
            handle.write(raw_bytes)
            temp_path = handle.name

        try:
            logger.info(f"开始分析 Markdown 文件: {file.filename}, 分析类型: {analysis_type}, 章节: {section_number}")

            result = await markdown_ai_service.analyze_markdown(
                file_path=temp_path,
                analysis_type=analysis_type,
                user_prompt=user_prompt,
                section_number=section_number
            )

            logger.info(f"Markdown 分析完成: {file.filename}, 成功: {result['success']}")

            if not result['success']:
                raise HTTPException(status_code=500, detail=result.get('error', '分析失败'))
            return result
        finally:
            # Always drop the temp file, even when analysis raised.
            if os.path.exists(temp_path):
                os.unlink(temp_path)
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Markdown AI 分析过程中出错: {str(e)}")
        raise HTTPException(status_code=500, detail=f"分析失败: {str(e)}")
@router.post("/analyze/md/stream")
async def analyze_markdown_stream(
    file: UploadFile = File(...),
    analysis_type: str = Query("summary", description="分析类型"),
    user_prompt: str = Query("", description="用户自定义提示词"),
    section_number: Optional[str] = Query(None, description="指定章节编号")
):
    """Stream an AI analysis of a Markdown file as Server-Sent Events.

    Args:
        file: uploaded Markdown document (.md / .markdown only).
        analysis_type: analysis type understood by markdown_ai_service.
        user_prompt: optional extra instructions appended to the prompt.
        section_number: when given, restrict analysis to that section.

    Returns:
        StreamingResponse: text/event-stream of SSE chunks.
    """
    if not file.filename:
        raise HTTPException(status_code=400, detail="文件名为空")

    file_ext = file.filename.split('.')[-1].lower()
    if file_ext not in ['md', 'markdown']:
        raise HTTPException(
            status_code=400,
            detail=f"不支持的文件类型: {file_ext},仅支持 .md 和 .markdown"
        )

    tmp_path = None
    try:
        content = await file.read()

        with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as tmp:
            tmp.write(content)
            tmp_path = tmp.name

        logger.info(f"开始流式分析 Markdown 文件: {file.filename}, 分析类型: {analysis_type}")

        # BUG FIX: the temp file used to be unlinked in a `finally` that ran
        # as soon as the StreamingResponse object was returned — i.e. BEFORE
        # the generator ever read it, so every stream analyzed an
        # already-deleted file. Cleanup now runs inside the generator once
        # streaming actually finishes (or aborts).
        async def stream_generator():
            try:
                async for chunk in markdown_ai_service.analyze_markdown_stream(
                    file_path=tmp_path,
                    analysis_type=analysis_type,
                    user_prompt=user_prompt,
                    section_number=section_number
                ):
                    yield chunk
            finally:
                if os.path.exists(tmp_path):
                    os.unlink(tmp_path)

        return StreamingResponse(
            stream_generator(),
            media_type="text/event-stream",
            headers={
                "Cache-Control": "no-cache",
                "Connection": "keep-alive",
                "X-Accel-Buffering": "no"  # stop nginx from buffering the SSE stream
            }
        )
    except HTTPException:
        raise
    except Exception as e:
        # Setup failed before the generator took ownership of the temp file.
        if tmp_path and os.path.exists(tmp_path):
            os.unlink(tmp_path)
        logger.error(f"Markdown AI 流式分析出错: {str(e)}")
        raise HTTPException(status_code=500, detail=f"流式分析失败: {str(e)}")
@router.get("/analyze/md/outline")
async def get_markdown_outline(
    file: UploadFile = File(...)
):
    """Return the section outline of an uploaded Markdown document.

    Args:
        file: uploaded Markdown document (.md / .markdown only).

    Returns:
        dict: outline structure produced by markdown_ai_service.
    """
    # NOTE(review): this route is registered as GET but expects a multipart
    # file upload; most HTTP clients cannot send a body with GET. Consider
    # switching to POST (coordinate with the frontend before changing).
    if not file.filename:
        raise HTTPException(status_code=400, detail="文件名为空")

    suffix = file.filename.split('.')[-1].lower()
    if suffix not in ('md', 'markdown'):
        raise HTTPException(
            status_code=400,
            detail=f"不支持的文件类型: {suffix},仅支持 .md 和 .markdown"
        )

    try:
        payload = await file.read()

        # Write to a temp file so the parser can work from a path.
        with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as handle:
            handle.write(payload)
            temp_path = handle.name

        try:
            return await markdown_ai_service.extract_outline(temp_path)
        finally:
            if os.path.exists(temp_path):
                os.unlink(temp_path)
    except Exception as e:
        logger.error(f"获取 Markdown 大纲失败: {str(e)}")
        raise HTTPException(status_code=500, detail=f"获取大纲失败: {str(e)}")

View File

@@ -2,7 +2,7 @@
LLM 服务模块 - 封装大模型 API 调用
"""
import logging
from typing import Dict, Any, List, Optional
from typing import Dict, Any, List, Optional, AsyncGenerator
import httpx
from app.config import settings
@@ -87,6 +87,71 @@ class LLMService:
logger.error(f"解析 API 响应失败: {str(e)}")
raise
async def chat_stream(
    self,
    messages: List[Dict[str, str]],
    temperature: float = 0.7,
    max_tokens: Optional[int] = None,
    **kwargs
) -> AsyncGenerator[Dict[str, Any], None]:
    """Call the chat-completions API with stream=True and yield content deltas.

    Args:
        messages: chat messages in OpenAI format.
        temperature: sampling temperature.
        max_tokens: optional cap on generated tokens.
        **kwargs: extra payload fields merged into the request body.

    Yields:
        Dict[str, Any]: {"content": <delta text>} for each non-empty delta.

    Raises:
        httpx.HTTPStatusError: when the API answers with a 4xx/5xx status.
    """
    # Hoisted out of the per-line loop (the original re-imported json on
    # every received chunk).
    import json

    headers = {
        "Authorization": f"Bearer {self.api_key}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": self.model_name,
        "messages": messages,
        "temperature": temperature,
        "stream": True
    }
    if max_tokens:
        payload["max_tokens"] = max_tokens
    payload.update(kwargs)

    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            async with client.stream(
                "POST",
                f"{self.base_url}/chat/completions",
                headers=headers,
                json=payload
            ) as response:
                # BUG FIX: without this, a 4xx/5xx response was silently
                # iterated as if it were SSE data and the HTTPStatusError
                # handler below was dead code.
                response.raise_for_status()
                async for line in response.aiter_lines():
                    if not line.startswith("data: "):
                        continue
                    data = line[6:]  # strip the "data: " SSE prefix
                    if data == "[DONE]":
                        break
                    try:
                        chunk = json.loads(data)
                        delta = chunk.get("choices", [{}])[0].get("delta", {}).get("content", "")
                        if delta:
                            yield {"content": delta}
                    except json.JSONDecodeError:
                        # Skip keep-alive / malformed lines.
                        continue
    except httpx.HTTPStatusError as e:
        logger.error(f"LLM 流式 API 请求失败: {e.response.status_code}")
        raise
    except Exception as e:
        logger.error(f"LLM 流式 API 调用异常: {str(e)}")
        raise
async def analyze_excel_data(
self,
excel_data: Dict[str, Any],

View File

@@ -0,0 +1,591 @@
"""
Markdown 文档 AI 分析服务
支持:
- 分章节解析(中文章节编号:一、二、三, (一)(二)(三))
- 结构化数据提取
- 流式输出
- 多种分析类型
"""
import asyncio
import json
import logging
import re
from typing import Any, AsyncGenerator, Dict, List, Optional
from app.services.llm_service import llm_service
from app.core.document_parser import MarkdownParser
logger = logging.getLogger(__name__)
class MarkdownSection:
    """A single section of a Markdown document.

    Attributes mirror how the section was located in the source text:
    number (e.g. "一", "(一)", "1"), title, nesting level, the section body
    (excluding subsections) and the 1-based line span it covers.
    """

    def __init__(self, number: str, title: str, level: int, content: str, line_start: int, line_end: int):
        self.number = number
        self.title = title
        self.level = level
        self.content = content
        self.line_start = line_start
        self.line_end = line_end
        # Child sections are attached by the parser after construction.
        self.subsections: List[MarkdownSection] = []

    def to_dict(self) -> Dict[str, Any]:
        """Serialize recursively; long content is truncated to a 200-char preview."""
        preview = self.content
        if len(preview) > 200:
            preview = preview[:200] + "..."
        return {
            "number": self.number,
            "title": self.title,
            "level": self.level,
            "content_preview": preview,
            "line_start": self.line_start,
            "line_end": self.line_end,
            "subsections": [child.to_dict() for child in self.subsections],
        }
class MarkdownAIService:
    """AI analysis service for Markdown documents.

    Supports section-aware parsing (Chinese numbering styles), several
    prompt-template-driven analysis types, SSE streaming output, and
    LLM-free outline/table extraction.
    """

    # NOTE(review): these two constants look garbled in this view — the list
    # holds only empty strings and the suffix is empty. They are unused by
    # the visible code; confirm against the original file before relying on
    # them (presumably they once held 一..十 and "、").
    CHINESE_NUMBERS = ["", "", "", "", "", "", "", "", "", ""]
    CHINESE_SUFFIX = ""
    # NOTE(review): named "PARENTHESIS_PATTERN" but the pattern contains no
    # parenthesis literals — as written it matches a bare Chinese numeral
    # followed by text. The "(一)" delimiters were likely lost in
    # extraction; verify against the original file.
    PARENTHESIS_PATTERN = re.compile(r'^([一二三四五六七八九十]+)\s*(.+)$')
    # Level-1 headings: "一、Title"
    CHINESE_SECTION_PATTERN = re.compile(r'^([一二三四五六七八九十]+)、\s*(.+)$')
    # Level-3 headings: "1. Title"
    ARABIC_SECTION_PATTERN = re.compile(r'^(\d+)\.\s+(.+)$')

    def __init__(self):
        # Markdown parser shared by every analysis call.
        self.parser = MarkdownParser()

    def get_supported_analysis_types(self) -> list:
        """Return the list of supported analysis type identifiers."""
        return [
            "summary",      # document summary
            "outline",      # outline extraction
            "key_points",   # key point extraction
            "questions",    # comprehension questions
            "tags",         # topic tags
            "qa",           # question/answer pairs
            "statistics",   # statistical-data analysis (government bulletins)
            "section"       # per-section deep analysis
        ]

    def extract_sections(self, content: str, titles: List[Dict]) -> List[MarkdownSection]:
        """Extract the hierarchical section structure from document text.

        Recognized heading formats:
        - level 1: 一、二、三 ...
        - level 2: bare Chinese numeral + text (see PARENTHESIS_PATTERN note)
        - level 3: 1. 2. 3. ...

        Args:
            content: full Markdown text.
            titles: title metadata from the parser (dicts with a 'line' key).

        Returns:
            List[MarkdownSection]: top-level sections with nested subsections.
        """
        sections = []
        lines = content.split('\n')
        # Map title line numbers to their metadata.
        # NOTE(review): built but never read below — dead code? confirm.
        title_lines = {}
        for t in titles:
            title_lines[t.get('line', 0)] = t
        current_section = None
        section_stack = []
        for i, line in enumerate(lines, 1):
            stripped = line.strip()
            # Level-1 heading (Chinese numeral + 、)?
            match = self.CHINESE_SECTION_PATTERN.match(stripped)
            if match:
                # Close out the previous top-level section's content.
                if current_section:
                    current_section.content = self._get_section_content(
                        lines, current_section.line_start, i - 1
                    )
                current_section = MarkdownSection(
                    number=match.group(1),
                    title=match.group(2),
                    level=1,
                    content="",
                    line_start=i,
                    line_end=len(lines)
                )
                sections.append(current_section)
                section_stack = [current_section]
                continue
            # Level-2 heading?
            match = self.PARENTHESIS_PATTERN.match(stripped)
            if match and current_section:
                # Close out the previous subsection's content, if any.
                if section_stack and len(section_stack) > 1:
                    parent = section_stack[-1]
                    parent.content = self._get_section_content(
                        lines, parent.line_start, i - 1
                    )
                subsection = MarkdownSection(
                    number=match.group(1),
                    title=match.group(2),
                    level=2,
                    content="",
                    line_start=i,
                    line_end=len(lines)
                )
                current_section.subsections.append(subsection)
                section_stack = [current_section, subsection]
                continue
            # Level-3 heading (1. 2. 3.)?
            match = self.ARABIC_SECTION_PATTERN.match(stripped)
            if match and len(section_stack) > 1:
                # Close out the previous level-3 section's content.
                if len(section_stack) > 2:
                    parent = section_stack[-1]
                    parent.content = self._get_section_content(
                        lines, parent.line_start, i - 1
                    )
                sub_subsection = MarkdownSection(
                    number=match.group(1),
                    title=match.group(2),
                    level=3,
                    content="",
                    line_start=i,
                    line_end=len(lines)
                )
                # NOTE(review): when a level-3 section already exists,
                # section_stack[-1] is that SIBLING, so each new "N." becomes
                # a child of the previous one (chained nesting) instead of a
                # child of the level-2 parent — looks unintended; confirm.
                section_stack[-1].subsections.append(sub_subsection)
                section_stack = section_stack[:-1] + [sub_subsection]
                continue
        # Fill in the content of the last open top-level section.
        if current_section:
            current_section.content = self._get_section_content(
                lines, current_section.line_start, len(lines)
            )
        return sections

    def _get_section_content(self, lines: List[str], start: int, end: int) -> str:
        """Return the text of lines [start, end] (1-based, inclusive).

        Heading lines (any of the three section patterns) and blank lines
        are dropped; remaining lines are stripped and re-joined.
        """
        if start > end:
            return ""
        content_lines = lines[start-1:end]
        # Clean up: drop heading lines and blank lines.
        cleaned = []
        for line in content_lines:
            stripped = line.strip()
            if not stripped:
                continue
            # Skip section heading lines.
            if self.CHINESE_SECTION_PATTERN.match(stripped):
                continue
            if self.PARENTHESIS_PATTERN.match(stripped):
                continue
            if self.ARABIC_SECTION_PATTERN.match(stripped):
                continue
            cleaned.append(stripped)
        return '\n'.join(cleaned)

    async def analyze_markdown(
        self,
        file_path: str,
        analysis_type: str = "summary",
        user_prompt: str = "",
        section_number: Optional[str] = None
    ) -> Dict[str, Any]:
        """Run an LLM analysis over a Markdown file.

        Args:
            file_path: path to the Markdown file on disk.
            analysis_type: one of get_supported_analysis_types().
            user_prompt: extra instructions appended to the prompt.
            section_number: restrict analysis to the section whose number or
                title equals this value (e.g. "一") when given.

        Returns:
            Dict[str, Any]: result payload with "success": True plus metadata
            and "analysis"; on failure only "success": False and "error".
        """
        try:
            parse_result = self.parser.parse(file_path)
            if not parse_result.success:
                return {
                    "success": False,
                    "error": parse_result.error
                }
            data = parse_result.data
            # Extract the section structure.
            sections = self.extract_sections(data.get("content", ""), data.get("titles", []))
            # When a section is requested, analyze only that section.
            target_content = data.get("content", "")
            target_title = parse_result.metadata.get("filename", "")
            if section_number:
                section = self._find_section(sections, section_number)
                if section:
                    target_content = section.content
                    target_title = f"{section.number}{section.title}"
                else:
                    return {
                        "success": False,
                        "error": f"未找到章节: {section_number}"
                    }
            # Build the prompt for the chosen analysis type.
            prompt = self._build_prompt(
                content=target_content,
                analysis_type=analysis_type,
                user_prompt=user_prompt,
                title=target_title
            )
            # Call the LLM.
            messages = [
                {"role": "system", "content": self._get_system_prompt(analysis_type)},
                {"role": "user", "content": prompt}
            ]
            response = await llm_service.chat(
                messages=messages,
                temperature=0.3,
                max_tokens=4000
            )
            analysis = llm_service.extract_message_content(response)
            return {
                "success": True,
                "filename": parse_result.metadata.get("filename", ""),
                "analysis_type": analysis_type,
                "section": target_title if section_number else None,
                "word_count": len(target_content),
                "structure": {
                    "title_count": parse_result.metadata.get("title_count", 0),
                    "code_block_count": parse_result.metadata.get("code_block_count", 0),
                    "table_count": parse_result.metadata.get("table_count", 0),
                    "section_count": len(sections)
                },
                "sections": [s.to_dict() for s in sections[:10]],  # at most 10 top-level sections
                "analysis": analysis
            }
        except Exception as e:
            logger.error(f"Markdown AI 分析失败: {str(e)}")
            return {
                "success": False,
                "error": str(e)
            }

    async def analyze_markdown_stream(
        self,
        file_path: str,
        analysis_type: str = "summary",
        user_prompt: str = "",
        section_number: Optional[str] = None
    ) -> AsyncGenerator[str, None]:
        """Stream an analysis of the document as SSE-formatted strings.

        Parameters mirror analyze_markdown(). Errors are reported in-band as
        an SSE chunk carrying an 'error' key rather than raised.

        Yields:
            str: "data: {...}\\n\\n" SSE chunks of type start / content / done,
            or an error chunk.
        """
        try:
            parse_result = self.parser.parse(file_path)
            if not parse_result.success:
                yield f"data: {json.dumps({'error': parse_result.error}, ensure_ascii=False)}\n\n"
                return
            data = parse_result.data
            sections = self.extract_sections(data.get("content", ""), data.get("titles", []))
            target_content = data.get("content", "")
            target_title = parse_result.metadata.get("filename", "")
            if section_number:
                section = self._find_section(sections, section_number)
                if section:
                    target_content = section.content
                    target_title = f"{section.number}{section.title}"
                else:
                    yield f"data: {json.dumps({'error': f'未找到章节: {section_number}'}, ensure_ascii=False)}\n\n"
                    return
            prompt = self._build_prompt(
                content=target_content,
                analysis_type=analysis_type,
                user_prompt=user_prompt,
                title=target_title
            )
            messages = [
                {"role": "system", "content": self._get_system_prompt(analysis_type)},
                {"role": "user", "content": prompt}
            ]
            # Emit the initial metadata event.
            # NOTE(review): this multi-line f-string reuses the same quote
            # character inside a replacement field — valid only on Python
            # 3.12+ (PEP 701); earlier interpreters raise SyntaxError here.
            yield f"data: {json.dumps({
                'type': 'start',
                'filename': parse_result.metadata.get("filename", ""),
                'analysis_type': analysis_type,
                'section': target_title if section_number else None,
                'word_count': len(target_content)
            }, ensure_ascii=False)}\n\n"
            # Stream LLM deltas, accumulating the full text as we go.
            full_response = ""
            async for chunk in llm_service.chat_stream(messages, temperature=0.3, max_tokens=4000):
                content = chunk.get("content", "")
                if content:
                    full_response += content
                    yield f"data: {json.dumps({'type': 'content', 'delta': content}, ensure_ascii=False)}\n\n"
            # Final event carries the accumulated response.
            yield f"data: {json.dumps({'type': 'done', 'full_response': full_response}, ensure_ascii=False)}\n\n"
        except Exception as e:
            logger.error(f"Markdown AI 流式分析失败: {str(e)}")
            yield f"data: {json.dumps({'error': str(e)}, ensure_ascii=False)}\n\n"

    def _find_section(self, sections: List[MarkdownSection], number: str) -> Optional[MarkdownSection]:
        """Depth-first search for a section whose number or title equals *number*."""
        # Normalize the requested identifier (whitespace only).
        num = number.strip()
        for section in sections:
            if section.number == num or section.title == num:
                return section
            # Recurse into subsections.
            found = self._find_section(section.subsections, number)
            if found:
                return found
        return None

    def _get_system_prompt(self, analysis_type: str) -> str:
        """Return the system prompt for the analysis type (generic fallback otherwise)."""
        prompts = {
            "summary": "你是一个专业的文档摘要助手,擅长从长文档中提取核心信息。",
            "outline": "你是一个专业的文档结构分析助手,擅长提取文档大纲和层级结构。",
            "key_points": "你是一个专业的知识提取助手,擅长从文档中提取关键信息和要点。",
            "questions": "你是一个专业的教育助手,擅长生成帮助理解文档的问题。",
            "tags": "你是一个专业的标签生成助手,擅长提取文档的主题标签。",
            "qa": "你是一个专业的问答助手,擅长基于文档内容生成问答对。",
            "statistics": "你是一个专业的统计数据分析助手,擅长分析政府统计公报中的数据。",
            "section": "你是一个专业的章节分析助手,擅长对文档的特定章节进行深入分析。"
        }
        return prompts.get(analysis_type, "你是一个专业的文档分析助手。")

    def _build_prompt(
        self,
        content: str,
        analysis_type: str,
        user_prompt: str,
        title: str = ""
    ) -> str:
        """Build the user prompt for the analysis type.

        Content is truncated to ~6000 chars to stay within token limits
        (the "tags"/"qa" templates truncate further); any non-empty
        user_prompt is appended as an extra requirement.
        """
        # Truncate to avoid blowing the token budget.
        max_content_len = 6000
        if len(content) > max_content_len:
            content = content[:max_content_len] + "\n\n[内容已截断...]"
        base_prompts = {
            "summary": f"""请对以下文档进行摘要分析:
文档标题:{title}
文档内容:
{content}
请提供:
1. 文档主要内容摘要300字以内
2. 文档的目的和用途
3. 适合的读者群体
请用中文回答,结构清晰。""",
            "outline": f"""请提取以下文档的大纲结构:
文档标题:{title}
文档内容:
{content}
请按层级列出文档大纲,用缩进表示层级关系。
格式:
一、一级标题
(一)二级标题
1. 三级标题
请用中文回答。""",
            "key_points": f"""请从以下文档中提取关键要点:
文档标题:{title}
文档内容:
{content}
请列出文档的关键要点5-10条每条用简洁的语言描述并说明其在文档中的重要性。
请用中文回答,格式清晰。""",
            "questions": f"""请根据以下文档生成有助于理解内容的问题:
文档标题:{title}
文档内容:
{content}
请生成5-10个问题帮助读者更好地理解文档内容。每个问题应该
1. 涵盖文档的重要信息点
2. 易于理解和回答
3. 具有思考价值
请用中文回答。""",
            "tags": f"""请为以下文档生成标签:
文档标题:{title}
文档内容:
{content[:3000]}
请生成5-8个标签用逗号分隔。标签应该反映
- 文档的主题领域
- 文档的类型
- 文档的关键特征
请用中文回答,只需输出标签,不要其他内容。""",
            "qa": f"""请根据以下文档生成问答对:
文档标题:{title}
文档内容:
{content[:4000]}
请生成3-5个问答对帮助读者通过问答形式理解文档内容。
格式:
Q1: 问题
A1: 回答
Q2: 问题
A2: 回答
请用中文回答,内容准确。""",
            "statistics": f"""请分析以下政府统计公报中的数据和结论:
文档标题:{title}
文档内容:
{content}
请提供:
1. 文档中涉及的主要统计数据(列出关键数字和指标)
2. 数据的变化趋势(增长/下降)
3. 重要的百分比和对比
4. 数据来源和统计口径说明
请用中文回答,数据准确。""",
            "section": f"""请详细分析以下文档章节:
章节标题:{title}
章节内容:
{content}
请提供:
1. 章节主要内容概括
2. 关键信息和数据
3. 与其他部分的关联(如有)
4. 重要结论
请用中文回答,分析深入。"""
        }
        prompt = base_prompts.get(analysis_type, base_prompts["summary"])
        if user_prompt and user_prompt.strip():
            prompt += f"\n\n用户额外需求:{user_prompt}"
        return prompt

    async def extract_outline(self, file_path: str) -> Dict[str, Any]:
        """Parse the file and return its structured section outline (no LLM call)."""
        try:
            parse_result = self.parser.parse(file_path)
            if not parse_result.success:
                return {"success": False, "error": parse_result.error}
            data = parse_result.data
            sections = self.extract_sections(data.get("content", ""), data.get("titles", []))
            # Build the structured outline.
            outline = []
            for section in sections:
                outline.append({
                    "number": section.number,
                    "title": section.title,
                    "level": section.level,
                    "line": section.line_start,
                    "content_preview": section.content[:100] + "..." if len(section.content) > 100 else section.content,
                    "subsections": [{
                        "number": s.number,
                        "title": s.title,
                        "level": s.level,
                        "line": s.line_start
                    } for s in section.subsections]
                })
            return {
                "success": True,
                "outline": outline
            }
        except Exception as e:
            logger.error(f"大纲提取失败: {str(e)}")
            return {"success": False, "error": str(e)}

    async def extract_tables_summary(self, file_path: str) -> Dict[str, Any]:
        """Extract table metadata and small row previews from the document (no LLM call)."""
        try:
            parse_result = self.parser.parse(file_path)
            if not parse_result.success:
                return {"success": False, "error": parse_result.error}
            tables = parse_result.data.get("tables", [])
            if not tables:
                return {"success": True, "tables": [], "message": "文档中没有表格"}
            # Summarize each table's shape and leading rows.
            table_summaries = []
            for i, table in enumerate(tables):
                summary = {
                    "index": i + 1,
                    "headers": table.get("headers", []),
                    "row_count": table.get("row_count", 0),
                    "column_count": table.get("column_count", 0),
                    "preview_rows": table.get("rows", [])[:3],  # first 3 rows only
                    "first_column": [row[0] if row else "" for row in table.get("rows", [])[:5]]
                }
                table_summaries.append(summary)
            return {
                "success": True,
                "tables": table_summaries,
                "table_count": len(tables)
            }
        except Exception as e:
            logger.error(f"表格提取失败: {str(e)}")
            return {"success": False, "error": str(e)}
# Module-level singleton shared by the API layer.
markdown_ai_service = MarkdownAIService()