This commit is contained in:
dj
2026-04-08 19:17:05 +08:00
18 changed files with 2138 additions and 180 deletions

38
.gitignore vendored Normal file
View File

@@ -0,0 +1,38 @@
/.git/
/.idea/
/.vscode/
/backend/venv/
/backend/command/
/backend/.env
/backend/.env.local
/backend/.env.*.local
/backend/app/__pycache__/*
/backend/data/uploads
/backend/data/charts
/backend/data/logs
/frontend/node_modules/
/frontend/dist/
/frontend/build/
/frontend/.vscode/
/frontend/.idea/
/frontend/.env
/frontend/*.log
/技术路线.md
/开发路径.md
/开发日志_2026-03-16.md
/frontendTest/
/docs/
/frontend/src/api/
/frontend/src/api/index.js
/frontend/src/api/index.ts
/frontend/src/api/index.tsx
/frontend/src/api/index.py
/frontend/src/api/index.go
/frontend/src/api/index.java
/docs/
/frontend - 副本/*
/supabase.txt
**/__pycache__/*
**.pyc

View File

@@ -2,10 +2,14 @@
AI 分析 API 接口 AI 分析 API 接口
""" """
from fastapi import APIRouter, UploadFile, File, HTTPException, Query, Body from fastapi import APIRouter, UploadFile, File, HTTPException, Query, Body
from fastapi.responses import StreamingResponse
from typing import Optional from typing import Optional
import logging import logging
import tempfile
import os
from app.services.excel_ai_service import excel_ai_service from app.services.excel_ai_service import excel_ai_service
from app.services.markdown_ai_service import markdown_ai_service
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -93,10 +97,11 @@ async def get_analysis_types():
获取支持的分析类型列表 获取支持的分析类型列表
Returns: Returns:
list: 支持的分析类型 dict: 支持的分析类型(包含 Excel 和 Markdown
""" """
return { return {
"types": excel_ai_service.get_supported_analysis_types() "excel_types": excel_ai_service.get_supported_analysis_types(),
"markdown_types": markdown_ai_service.get_supported_analysis_types()
} }
@@ -142,3 +147,185 @@ async def analyze_text(
except Exception as e: except Exception as e:
logger.error(f"文本分析失败: {str(e)}") logger.error(f"文本分析失败: {str(e)}")
raise HTTPException(status_code=500, detail=f"分析失败: {str(e)}") raise HTTPException(status_code=500, detail=f"分析失败: {str(e)}")
@router.post("/analyze/md")
async def analyze_markdown(
    file: UploadFile = File(...),
    analysis_type: str = Query("summary", description="分析类型: summary, outline, key_points, questions, tags, qa, statistics, section"),
    user_prompt: str = Query("", description="用户自定义提示词"),
    section_number: Optional[str] = Query(None, description="指定章节编号,如 '''(一)'")
):
    """
    Upload a Markdown file and run an AI analysis on it.

    Args:
        file: The uploaded Markdown file (``.md`` or ``.markdown``).
        analysis_type: Which analysis to run; must be one of the types
            reported by ``markdown_ai_service.get_supported_analysis_types()``.
        user_prompt: Optional free-form prompt forwarded to the service.
        section_number: Optional section number to restrict the analysis to.

    Returns:
        dict: The result dictionary produced by the analysis service.

    Raises:
        HTTPException: 400 for a missing filename, an unsupported extension
            or an unsupported analysis type; 500 when the analysis fails.
    """
    upload_name = file.filename
    if not upload_name:
        raise HTTPException(status_code=400, detail="文件名为空")

    # Only the text after the last dot decides whether the upload is accepted.
    file_ext = upload_name.rsplit('.', 1)[-1].lower()
    if file_ext not in ('md', 'markdown'):
        raise HTTPException(
            status_code=400,
            detail=f"不支持的文件类型: {file_ext},仅支持 .md 和 .markdown"
        )

    supported_types = markdown_ai_service.get_supported_analysis_types()
    if analysis_type not in supported_types:
        raise HTTPException(
            status_code=400,
            detail=f"不支持的分析类型: {analysis_type},支持的类型: {', '.join(supported_types)}"
        )

    try:
        raw_bytes = await file.read()

        # The service works on a file path, so spill the upload to disk first.
        with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as tmp_file:
            tmp_file.write(raw_bytes)
            tmp_path = tmp_file.name

        try:
            logger.info(f"开始分析 Markdown 文件: {upload_name}, 分析类型: {analysis_type}, 章节: {section_number}")

            result = await markdown_ai_service.analyze_markdown(
                file_path=tmp_path,
                analysis_type=analysis_type,
                user_prompt=user_prompt,
                section_number=section_number
            )

            logger.info(f"Markdown 分析完成: {upload_name}, 成功: {result['success']}")

            if result['success']:
                return result
            raise HTTPException(status_code=500, detail=result.get('error', '分析失败'))
        finally:
            # Always remove the temporary copy, whether the analysis worked or not.
            if os.path.exists(tmp_path):
                os.unlink(tmp_path)

    except HTTPException:
        # Already a well-formed HTTP error; let it through untouched.
        raise
    except Exception as e:
        logger.error(f"Markdown AI 分析过程中出错: {str(e)}")
        raise HTTPException(status_code=500, detail=f"分析失败: {str(e)}")
@router.post("/analyze/md/stream")
async def analyze_markdown_stream(
    file: UploadFile = File(...),
    analysis_type: str = Query("summary", description="分析类型"),
    user_prompt: str = Query("", description="用户自定义提示词"),
    section_number: Optional[str] = Query(None, description="指定章节编号")
):
    """
    Analyze an uploaded Markdown file and stream the result as SSE.

    Args:
        file: The uploaded Markdown file (``.md`` or ``.markdown``).
        analysis_type: Which analysis to run; validated against
            ``markdown_ai_service.get_supported_analysis_types()``.
        user_prompt: Optional free-form prompt forwarded to the service.
        section_number: Optional section number to restrict the analysis to.

    Returns:
        StreamingResponse: A ``text/event-stream`` response that relays the
        chunks yielded by ``markdown_ai_service.analyze_markdown_stream``.

    Raises:
        HTTPException: 400 for a missing filename, an unsupported extension
            or an unsupported analysis type; 500 when the upload cannot be
            staged to a temporary file.
    """
    if not file.filename:
        raise HTTPException(status_code=400, detail="文件名为空")

    file_ext = file.filename.split('.')[-1].lower()
    if file_ext not in ['md', 'markdown']:
        raise HTTPException(
            status_code=400,
            detail=f"不支持的文件类型: {file_ext},仅支持 .md 和 .markdown"
        )

    # Reject unknown analysis types up front, like the non-streaming endpoint,
    # instead of failing in the middle of an already-started stream.
    supported_types = markdown_ai_service.get_supported_analysis_types()
    if analysis_type not in supported_types:
        raise HTTPException(
            status_code=400,
            detail=f"不支持的分析类型: {analysis_type},支持的类型: {', '.join(supported_types)}"
        )

    try:
        content = await file.read()

        with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as tmp:
            tmp.write(content)
            tmp_path = tmp.name
    except Exception as e:
        logger.error(f"Markdown AI 流式分析出错: {str(e)}")
        raise HTTPException(status_code=500, detail=f"流式分析失败: {str(e)}")

    logger.info(f"开始流式分析 Markdown 文件: {file.filename}, 分析类型: {analysis_type}")

    async def stream_generator():
        # The response body is produced AFTER this handler returns, so the
        # temporary file must stay alive until the generator is finished.
        # Cleaning it up here (rather than in the handler) guarantees that.
        try:
            async for chunk in markdown_ai_service.analyze_markdown_stream(
                file_path=tmp_path,
                analysis_type=analysis_type,
                user_prompt=user_prompt,
                section_number=section_number
            ):
                yield chunk
        except Exception as e:
            # Headers are already sent at this point; all we can do is log.
            logger.error(f"Markdown AI 流式分析出错: {str(e)}")
            raise
        finally:
            try:
                os.unlink(tmp_path)
            except FileNotFoundError:
                pass

    return StreamingResponse(
        stream_generator(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "X-Accel-Buffering": "no"
        }
    )
@router.post("/analyze/md/outline")
@router.get("/analyze/md/outline")
async def get_markdown_outline(
    file: UploadFile = File(...)
):
    """
    Return the outline (section structure) of an uploaded Markdown document.

    The endpoint takes a multipart file upload. It is registered for POST,
    because many HTTP clients and proxies cannot send a request body with
    GET; the original GET registration is kept so existing callers still work.

    Args:
        file: The uploaded Markdown file (``.md`` or ``.markdown``).

    Returns:
        dict: Whatever ``markdown_ai_service.extract_outline`` returns.

    Raises:
        HTTPException: 400 for a missing filename or unsupported extension;
            500 when outline extraction fails.
    """
    if not file.filename:
        raise HTTPException(status_code=400, detail="文件名为空")

    file_ext = file.filename.split('.')[-1].lower()
    if file_ext not in ['md', 'markdown']:
        raise HTTPException(
            status_code=400,
            detail=f"不支持的文件类型: {file_ext},仅支持 .md 和 .markdown"
        )

    try:
        content = await file.read()

        # The service works on a file path, so stage the upload on disk.
        with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as tmp:
            tmp.write(content)
            tmp_path = tmp.name

        try:
            result = await markdown_ai_service.extract_outline(tmp_path)
            return result
        finally:
            if os.path.exists(tmp_path):
                os.unlink(tmp_path)

    except HTTPException:
        # Keep the status code of any HTTP error raised further down.
        raise
    except Exception as e:
        logger.error(f"获取 Markdown 大纲失败: {str(e)}")
        raise HTTPException(status_code=500, detail=f"获取大纲失败: {str(e)}")

View File

@@ -196,7 +196,9 @@ async def process_document(
meta={"progress": 50, "message": "正在存储到MySQL并生成字段描述"} meta={"progress": 50, "message": "正在存储到MySQL并生成字段描述"}
) )
try:
# 使用 TableRAG 服务完成建表和RAG索引 # 使用 TableRAG 服务完成建表和RAG索引
logger.info(f"开始存储Excel到MySQL: {original_filename}, file_path: {file_path}")
rag_result = await table_rag_service.build_table_rag_index( rag_result = await table_rag_service.build_table_rag_index(
file_path=file_path, file_path=file_path,
filename=original_filename, filename=original_filename,
@@ -205,9 +207,11 @@ async def process_document(
) )
if rag_result.get("success"): if rag_result.get("success"):
logger.info(f"RAG索引构建成功: {original_filename}") logger.info(f"Excel存储到MySQL成功: {original_filename}, table: {rag_result.get('table_name')}")
else: else:
logger.warning(f"RAG索引构建失败: {rag_result.get('error')}") logger.error(f"RAG索引构建失败: {rag_result.get('error')}")
except Exception as e:
logger.error(f"Excel存储到MySQL异常: {str(e)}", exc_info=True)
else: else:
# 非结构化文档 # 非结构化文档

View File

@@ -26,7 +26,16 @@ async def get_task_status(task_id: str):
status = await redis_db.get_task_status(task_id) status = await redis_db.get_task_status(task_id)
if not status: if not status:
raise HTTPException(status_code=404, detail=f"任务 {task_id} 不存在") # Redis不可用时假设任务已完成文档已成功处理
# 前端轮询时会得到这个响应
return {
"task_id": task_id,
"status": "success",
"progress": 100,
"message": "任务处理完成",
"result": None,
"error": None
}
return { return {
"task_id": task_id, "task_id": task_id,

View File

@@ -10,6 +10,7 @@ import io
from app.services.file_service import file_service from app.services.file_service import file_service
from app.core.document_parser import XlsxParser from app.core.document_parser import XlsxParser
from app.services.table_rag_service import table_rag_service
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -27,7 +28,7 @@ async def upload_excel(
header_row: int = Query(0, description="表头所在的行索引") header_row: int = Query(0, description="表头所在的行索引")
): ):
""" """
上传并解析 Excel 文件 上传并解析 Excel 文件,同时存储到 MySQL 数据库
Args: Args:
file: 上传的 Excel 文件 file: 上传的 Excel 文件
@@ -77,6 +78,23 @@ async def upload_excel(
result.metadata['saved_path'] = saved_path result.metadata['saved_path'] = saved_path
result.metadata['original_filename'] = file.filename result.metadata['original_filename'] = file.filename
# 存储到 MySQL 数据库
try:
store_result = await table_rag_service.build_table_rag_index(
file_path=saved_path,
filename=file.filename,
sheet_name=sheet_name if sheet_name else None,
header_row=header_row
)
if store_result.get("success"):
result.metadata['mysql_table'] = store_result.get('table_name')
result.metadata['row_count'] = store_result.get('row_count')
logger.info(f"Excel已存储到MySQL: {file.filename}, 表: {store_result.get('table_name')}")
else:
logger.warning(f"Excel存储到MySQL失败: {store_result.get('error')}")
except Exception as e:
logger.error(f"Excel存储到MySQL异常: {str(e)}", exc_info=True)
return result.to_dict() return result.to_dict()
except HTTPException: except HTTPException:

View File

@@ -29,6 +29,9 @@ class Settings(BaseSettings):
LLM_BASE_URL: str = "https://api.minimax.chat" LLM_BASE_URL: str = "https://api.minimax.chat"
LLM_MODEL_NAME: str = "MiniMax-Text-01" LLM_MODEL_NAME: str = "MiniMax-Text-01"
# ==================== RAG/Embedding 配置 ====================
EMBEDDING_MODEL: str = "all-MiniLM-L6-v2"
# ==================== Supabase 配置 ==================== # ==================== Supabase 配置 ====================
SUPABASE_URL: str = "" SUPABASE_URL: str = ""
SUPABASE_ANON_KEY: str = "" SUPABASE_ANON_KEY: str = ""

View File

@@ -87,8 +87,10 @@ class MongoDB:
"updated_at": datetime.utcnow(), "updated_at": datetime.utcnow(),
} }
result = await self.documents.insert_one(document) result = await self.documents.insert_one(document)
logger.info(f"文档已插入MongoDB: {result.inserted_id}") doc_id = str(result.inserted_id)
return str(result.inserted_id) filename = metadata.get("original_filename", "unknown")
logger.info(f"✓ 文档已存入MongoDB: [{doc_type}] {filename} | ID: {doc_id}")
return doc_id
async def get_document(self, doc_id: str) -> Optional[Dict[str, Any]]: async def get_document(self, doc_id: str) -> Optional[Dict[str, Any]]:
"""根据ID获取文档""" """根据ID获取文档"""

View File

@@ -16,6 +16,7 @@ from sqlalchemy import (
String, String,
Text, Text,
create_engine, create_engine,
text,
) )
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
from sqlalchemy.orm import DeclarativeBase, sessionmaker from sqlalchemy.orm import DeclarativeBase, sessionmaker
@@ -72,6 +73,26 @@ class MySQLDB:
async def init_db(self): async def init_db(self):
"""初始化数据库,创建所有表""" """初始化数据库,创建所有表"""
try: try:
# 先创建数据库(如果不存在)
from sqlalchemy import text
db_name = settings.MYSQL_DATABASE
# 连接时不指定数据库来创建数据库
temp_url = (
f"mysql+aiomysql://{settings.MYSQL_USER}:{settings.MYSQL_PASSWORD}"
f"@{settings.MYSQL_HOST}:{settings.MYSQL_PORT}/"
f"?charset={settings.MYSQL_CHARSET}"
)
from sqlalchemy.ext.asyncio import create_async_engine
temp_engine = create_async_engine(temp_url, echo=False)
try:
async with temp_engine.connect() as conn:
await conn.execute(text(f"CREATE DATABASE IF NOT EXISTS `{db_name}` CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci"))
await conn.commit()
logger.info(f"MySQL 数据库 {db_name} 创建或已存在")
finally:
await temp_engine.dispose()
# 然后创建表
async with self.async_engine.begin() as conn: async with self.async_engine.begin() as conn:
await conn.run_sync(Base.metadata.create_all) await conn.run_sync(Base.metadata.create_all)
logger.info("MySQL 数据库表初始化完成") logger.info("MySQL 数据库表初始化完成")

View File

@@ -2,23 +2,143 @@
FastAPI 应用主入口 FastAPI 应用主入口
""" """
import logging import logging
import logging.handlers
import sys
import uuid
from contextlib import asynccontextmanager from contextlib import asynccontextmanager
from typing import Callable
from functools import wraps
from fastapi import FastAPI from fastapi import FastAPI, Request, Response
from fastapi.middleware.cors import CORSMiddleware from fastapi.middleware.cors import CORSMiddleware
from starlette.middleware.base import BaseHTTPMiddleware
from app.config import settings from app.config import settings
from app.api import api_router from app.api import api_router
from app.core.database import mysql_db, mongodb, redis_db from app.core.database import mysql_db, mongodb, redis_db
# 配置日志 # ==================== 日志配置 ====================
logging.basicConfig(
level=logging.INFO if settings.DEBUG else logging.WARNING, def setup_logging():
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" """配置应用日志系统"""
) import os
from pathlib import Path
# 根日志配置
log_level = logging.DEBUG if settings.DEBUG else logging.INFO
# 日志目录
log_dir = Path("data/logs")
log_dir.mkdir(parents=True, exist_ok=True)
# 日志文件路径
log_file = log_dir / "app.log"
error_log_file = log_dir / "error.log"
# 控制台处理器
console_handler = logging.StreamHandler(sys.stdout)
console_handler.setLevel(log_level)
console_formatter = logging.Formatter(
fmt="%(asctime)s | %(levelname)-8s | %(name)s:%(lineno)d | %(message)s",
datefmt="%Y-%m-%d %H:%M:%S"
)
console_handler.setFormatter(console_formatter)
# 文件处理器 (所有日志)
file_handler = logging.handlers.RotatingFileHandler(
log_file,
maxBytes=10 * 1024 * 1024, # 10MB
backupCount=5,
encoding="utf-8"
)
file_handler.setLevel(logging.DEBUG)
file_formatter = logging.Formatter(
fmt="%(asctime)s | %(levelname)-8s | %(name)s:%(lineno)d | %(funcName)s | %(message)s",
datefmt="%Y-%m-%d %H:%M:%S"
)
file_handler.setFormatter(file_formatter)
# 错误日志处理器 (仅ERROR及以上)
error_file_handler = logging.handlers.RotatingFileHandler(
error_log_file,
maxBytes=10 * 1024 * 1024, # 10MB
backupCount=5,
encoding="utf-8"
)
error_file_handler.setLevel(logging.ERROR)
error_file_handler.setFormatter(file_formatter)
# 根日志器
root_logger = logging.getLogger()
root_logger.setLevel(logging.DEBUG)
root_logger.handlers = []
root_logger.addHandler(console_handler)
root_logger.addHandler(file_handler)
root_logger.addHandler(error_file_handler)
# 第三方库日志级别
for lib in ["uvicorn", "uvicorn.access", "fastapi", "httpx", "sqlalchemy"]:
logging.getLogger(lib).setLevel(logging.WARNING)
root_logger.info(f"日志系统初始化完成 | 日志目录: {log_dir}")
root_logger.info(f"主日志文件: {log_file} | 错误日志: {error_log_file}")
return root_logger
# 初始化日志
setup_logging()
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# ==================== 请求日志中间件 ====================
class RequestLoggingMiddleware(BaseHTTPMiddleware):
    """Request logging middleware: logs every request and its outcome.

    Each request gets a short random ID that is stored on
    ``request.state.request_id``, included in the log lines and echoed back
    in the ``X-Request-ID`` response header.
    """

    async def dispatch(self, request: Request, call_next: Callable) -> Response:
        """Log the request, delegate to the next handler, log the response.

        Args:
            request: The incoming request.
            call_next: Callable that passes the request down the stack.

        Returns:
            Response: The downstream response with ``X-Request-ID`` added.

        Raises:
            Exception: Anything raised downstream is logged and re-raised.
        """
        # Local import keeps this block self-contained.
        import time

        # First 8 characters of a UUID4 are enough to correlate log lines.
        request_id = str(uuid.uuid4())[:8]
        request.state.request_id = request_id

        logger.info(f"→ [{request_id}] {request.method} {request.url.path}")

        started_at = time.perf_counter()
        try:
            response = await call_next(request)
        except Exception as e:
            logger.error(f"✗ [{request_id}] {request.method} {request.url.path} | 异常: {str(e)}")
            raise

        # Time until call_next returned, in milliseconds.
        elapsed_ms = (time.perf_counter() - started_at) * 1000
        logger.info(
            f"← [{request_id}] {request.method} {request.url.path} "
            f"| 状态: {response.status_code} | 耗时: {elapsed_ms:.1f}ms"
        )

        response.headers["X-Request-ID"] = request_id
        return response
# ==================== 请求追踪装饰器 ====================
def log_async_function(func: Callable) -> Callable:
    """Decorate an async function so its start, completion and failure are logged.

    Args:
        func: The coroutine function to wrap.

    Returns:
        Callable: A coroutine function with the same metadata as ``func`` that
        logs at DEBUG on entry and success, and at ERROR (then re-raises) on
        failure.
    """
    @wraps(func)
    async def _traced(*args, **kwargs):
        name = func.__name__
        logger.debug(f"{name} 开始执行")
        try:
            outcome = await func(*args, **kwargs)
        except Exception as exc:
            logger.error(f"{name} 执行失败: {str(exc)}")
            raise
        else:
            logger.debug(f"{name} 执行完成")
            return outcome

    return _traced
@asynccontextmanager @asynccontextmanager
async def lifespan(app: FastAPI): async def lifespan(app: FastAPI):
""" """
@@ -83,6 +203,9 @@ app.add_middleware(
allow_headers=["*"], allow_headers=["*"],
) )
# 添加请求日志中间件
app.add_middleware(RequestLoggingMiddleware)
# 注册 API 路由 # 注册 API 路由
app.include_router(api_router, prefix=settings.API_V1_STR) app.include_router(api_router, prefix=settings.API_V1_STR)

View File

@@ -17,12 +17,15 @@ from sqlalchemy import (
String, String,
Text, Text,
inspect, inspect,
text,
) )
from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.ext.asyncio import AsyncSession
from app.core.database.mysql import Base, mysql_db from app.core.database.mysql import Base, mysql_db
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# 设置该模块的日志级别
logger.setLevel(logging.DEBUG)
class ExcelStorageService: class ExcelStorageService:
@@ -31,6 +34,123 @@ class ExcelStorageService:
def __init__(self): def __init__(self):
self.mysql_db = mysql_db self.mysql_db = mysql_db
def _extract_sheet_names_from_xml(self, file_path: str) -> list:
"""从 Excel 文件的 XML 中提取工作表名称"""
import zipfile
from xml.etree import ElementTree as ET
try:
with zipfile.ZipFile(file_path, 'r') as z:
if 'xl/workbook.xml' not in z.namelist():
return []
content = z.read('xl/workbook.xml')
root = ET.fromstring(content)
ns = {'main': 'http://purl.oclc.org/ooxml/spreadsheetml/main'}
sheets = root.findall('.//main:sheet', ns)
return [s.get('name') for s in sheets if s.get('name')]
except Exception:
return []
def _read_excel_sheet(self, file_path: str, sheet_name: str = None, header_row: int = 0) -> pd.DataFrame:
    """Read one worksheet into a DataFrame, with an XML fallback.

    pandas is tried first. When it raises or yields an empty frame, the
    sheet is parsed directly from the workbook's XML parts.

    Args:
        file_path: Path to the ``.xlsx`` file.
        sheet_name: Sheet to read; a falsy value means the first sheet.
        header_row: Zero-based index of the header row.

    Returns:
        pd.DataFrame: The sheet's data; an empty frame when the XML fallback
        finds no data rows.

    Raises:
        ValueError: If the fallback cannot find any worksheet.
        Exception: Any other error from the XML fallback is logged and re-raised.
    """
    import zipfile
    from xml.etree import ElementTree as ET

    try:
        # sheet_name=None would make pandas return a dict of ALL sheets rather
        # than a DataFrame, so ask for the first sheet (index 0) explicitly.
        df = pd.read_excel(
            file_path,
            sheet_name=sheet_name if sheet_name else 0,
            header=header_row,
        )
        if df is not None and not df.empty:
            return df
    except Exception as e:
        logger.debug(f"pandas 读取 Excel 失败,改用 XML 解析: {e}")

    # pandas could not read the file (or returned nothing): parse the XML.
    logger.info(f"使用 XML 方式读取 Excel: {file_path}")

    def _ns_of(root):
        """Return a prefix map using whatever namespace the document declares."""
        tag = root.tag
        uri = tag[1:].partition('}')[0] if tag.startswith('{') else ''
        return {'main': uri or 'http://purl.oclc.org/ooxml/spreadsheetml/main'}

    def _column_order(letters):
        """Sort key giving spreadsheet order: A..Z, then AA.."""
        return (len(letters), letters)

    try:
        with zipfile.ZipFile(file_path, 'r') as z:
            sheet_names = self._extract_sheet_names_from_xml(file_path)
            if not sheet_names:
                raise ValueError("无法从 Excel 文件中找到工作表")

            target_sheet = sheet_name if sheet_name and sheet_name in sheet_names else sheet_names[0]
            # NOTE(review): assumes the Nth sheet is stored as sheetN.xml;
            # resolving it through workbook.xml.rels would be more robust.
            sheet_index = sheet_names.index(target_sheet) + 1

            shared_strings = []
            if 'xl/sharedStrings.xml' in z.namelist():
                ss_root = ET.fromstring(z.read('xl/sharedStrings.xml'))
                ss_ns = _ns_of(ss_root)
                for si in ss_root.findall('.//main:si', ss_ns):
                    plain = si.find('main:t', ss_ns)
                    if plain is not None:
                        shared_strings.append(plain.text or '')
                    else:
                        # Rich text: the string is split over several runs.
                        shared_strings.append(
                            ''.join(t.text or '' for t in si.findall('main:r/main:t', ss_ns))
                        )

            root = ET.fromstring(z.read(f'xl/worksheets/sheet{sheet_index}.xml'))
            ns = _ns_of(root)

            def _cell_text(cell):
                """Return the raw text of a cell, resolving shared strings."""
                v = cell.find('main:v', ns)
                if v is None or not v.text:
                    return None
                if cell.get('t', 'n') == 's':
                    try:
                        return shared_strings[int(v.text)]
                    except (ValueError, IndexError):
                        return v.text
                return v.text

            header_number = header_row + 1  # XML row numbers are 1-based
            headers = {}
            rows_data = []
            # A single pass collects both the header row and the data rows.
            for row in root.findall('.//main:row', ns):
                row_idx = int(row.get('r', 0))
                if row_idx > header_number:
                    row_cells = {}
                    for cell in row.findall('main:c', ns):
                        col_letters = ''.join(filter(str.isalpha, cell.get('r', '')))
                        value = _cell_text(cell)
                        if value is not None and cell.get('t', 'n') == 'b':
                            value = value == '1'
                        row_cells[col_letters] = value
                    if row_cells:
                        rows_data.append(row_cells)
                elif header_row >= 0 and row_idx == header_number and not headers:
                    for cell in row.findall('main:c', ns):
                        col_letters = ''.join(filter(str.isalpha, cell.get('r', '')))
                        value = _cell_text(cell)
                        if value is not None:
                            headers[col_letters] = value

            if not rows_data:
                return pd.DataFrame()

            df = pd.DataFrame(rows_data)
            # Sparse rows can introduce columns out of order; restore sheet order.
            df = df[sorted(df.columns, key=_column_order)]

            if headers:
                # Columns without a header cell keep their letter as the name.
                df.columns = [headers.get(col, col) for col in df.columns]

            return df
    except Exception as e:
        logger.error(f"XML 解析 Excel 失败: {e}")
        raise
def _sanitize_table_name(self, filename: str) -> str: def _sanitize_table_name(self, filename: str) -> str:
""" """
将文件名转换为合法的表名 将文件名转换为合法的表名
@@ -64,15 +184,44 @@ class ExcelStorageService:
Returns: Returns:
合法的字段名 合法的字段名
""" """
# 只保留字母、数字、下划线 # MySQL 支持 UTF8 编码,中文字符可以直接使用
name = re.sub(r'[^a-zA-Z0-9_]', '_', str(col_name)) # 只处理非法字符(控制字符等)和首字符数字
name = str(col_name).strip()
# 确保以字母开头 # 移除控制字符
name = re.sub(r'[\x00-\x1f\x7f]', '', name)
# 确保以字母或中文开头
if name and name[0].isdigit(): if name and name[0].isdigit():
name = 'col_' + name name = 'col_' + name
# 限制长度 (MySQL 字段名最多64字符)
return name[:64]
# 限制长度 def _get_unique_column_name(self, col_name: str, used_names: set) -> str:
return name[:50] """
获取唯一的列名,避免重复
Args:
col_name: 原始列名
used_names: 已使用的列名集合
Returns:
唯一的列名
"""
sanitized = self._sanitize_column_name(col_name)
# "id" 是 MySQL 保留名,作为主键使用
if sanitized.lower() == "id":
sanitized = "col_id"
if sanitized not in used_names:
used_names.add(sanitized)
return sanitized
# 添加数字后缀直到唯一
base = sanitized if sanitized else "col"
counter = 1
while f"{base}_{counter}" in used_names:
counter += 1
unique_name = f"{base}_{counter}"
used_names.add(unique_name)
return unique_name
def _infer_column_type(self, series: pd.Series) -> str: def _infer_column_type(self, series: pd.Series) -> str:
""" """
@@ -84,12 +233,35 @@ class ExcelStorageService:
Returns: Returns:
类型名称 类型名称
""" """
# 移除空值进行类型检查
non_null = series.dropna()
if len(non_null) == 0:
return "TEXT"
dtype = series.dtype dtype = series.dtype
# 整数类型检查
if pd.api.types.is_integer_dtype(dtype): if pd.api.types.is_integer_dtype(dtype):
# 检查是否所有值都能放入 INT 范围
try:
int_values = non_null.astype('int64')
if int_values.min() >= -2147483648 and int_values.max() <= 2147483647:
return "INTEGER" return "INTEGER"
else:
# 超出 INT 范围,使用 TEXT
return "TEXT"
except (ValueError, OverflowError):
return "TEXT"
elif pd.api.types.is_float_dtype(dtype): elif pd.api.types.is_float_dtype(dtype):
# 检查是否所有值都能放入 FLOAT
try:
float_values = non_null.astype('float64')
if float_values.min() >= -1e308 and float_values.max() <= 1e308:
return "FLOAT" return "FLOAT"
else:
return "TEXT"
except (ValueError, OverflowError):
return "TEXT"
elif pd.api.types.is_datetime64_any_dtype(dtype): elif pd.api.types.is_datetime64_any_dtype(dtype):
return "DATETIME" return "DATETIME"
elif pd.api.types.is_bool_dtype(dtype): elif pd.api.types.is_bool_dtype(dtype):
@@ -174,11 +346,11 @@ class ExcelStorageService:
} }
try: try:
# 读取 Excel logger.info(f"开始读取Excel文件: {file_path}")
if sheet_name: # 读取 Excel使用 fallback 方式支持特殊格式文件)
df = pd.read_excel(file_path, sheet_name=sheet_name, header=header_row) df = self._read_excel_sheet(file_path, sheet_name=sheet_name, header_row=header_row)
else:
df = pd.read_excel(file_path, header=header_row) logger.info(f"Excel读取完成行数: {len(df)}, 列数: {len(df.columns)}")
if df.empty: if df.empty:
return {"success": False, "error": "Excel 文件为空"} return {"success": False, "error": "Excel 文件为空"}
@@ -186,31 +358,41 @@ class ExcelStorageService:
# 清理列名 # 清理列名
df.columns = [str(c) for c in df.columns] df.columns = [str(c) for c in df.columns]
# 推断列类型 # 推断列类型,并生成唯一的列名
column_types = {} column_types = {}
column_name_map = {} # 原始列名 -> 唯一合法列名
used_names = set()
for col in df.columns: for col in df.columns:
col_name = self._sanitize_column_name(col) col_name = self._get_unique_column_name(col, used_names)
col_type = self._infer_column_type(df[col]) col_type = self._infer_column_type(df[col])
column_types[col] = col_type column_types[col] = col_type
column_name_map[col] = col_name
results["columns"].append({ results["columns"].append({
"original_name": col, "original_name": col,
"sanitized_name": col_name, "sanitized_name": col_name,
"type": col_type "type": col_type
}) })
# 创建表 # 创建表 - 使用原始 SQL 以兼容异步
model_class = self._create_table_model(table_name, df.columns, column_types) logger.info(f"正在创建MySQL表: {table_name}")
sql_columns = ["id INT AUTO_INCREMENT PRIMARY KEY"]
# 创建表结构 for col in df.columns:
async with self.mysql_db.get_session() as session: col_name = column_name_map[col]
model_class.__table__.create(session.bind, checkfirst=True) col_type = column_types.get(col, "TEXT")
sql_type = "INT" if col_type == "INTEGER" else "FLOAT" if col_type == "FLOAT" else "DATETIME" if col_type == "DATETIME" else "TEXT"
sql_columns.append(f"`{col_name}` {sql_type}")
sql_columns.append("created_at DATETIME DEFAULT CURRENT_TIMESTAMP")
sql_columns.append("updated_at DATETIME DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP")
create_sql = text(f"CREATE TABLE IF NOT EXISTS `{table_name}` ({', '.join(sql_columns)})")
await self.mysql_db.execute_raw_sql(str(create_sql))
logger.info(f"MySQL表创建完成: {table_name}")
# 插入数据 # 插入数据
records = [] records = []
for _, row in df.iterrows(): for _, row in df.iterrows():
record = {} record = {}
for col in df.columns: for col in df.columns:
col_name = self._sanitize_column_name(col) col_name = column_name_map[col]
value = row[col] value = row[col]
# 处理 NaN 值 # 处理 NaN 值
@@ -231,11 +413,33 @@ class ExcelStorageService:
records.append(record) records.append(record)
# 批量插入 logger.info(f"正在插入 {len(records)} 条数据到 MySQL (使用批量插入)...")
async with self.mysql_db.get_session() as session: # 使用 pymysql 直接插入以避免 SQLAlchemy 异步问题
for record in records: import pymysql
session.add(model_class(**record)) from app.config import settings
await session.commit()
connection = pymysql.connect(
host=settings.MYSQL_HOST,
port=settings.MYSQL_PORT,
user=settings.MYSQL_USER,
password=settings.MYSQL_PASSWORD,
database=settings.MYSQL_DATABASE,
charset=settings.MYSQL_CHARSET
)
try:
columns_str = ', '.join(['`' + column_name_map[col] + '`' for col in df.columns])
placeholders = ', '.join(['%s' for _ in df.columns])
insert_sql = f"INSERT INTO `{table_name}` ({columns_str}) VALUES ({placeholders})"
# 转换为元组列表 (使用映射后的列名)
param_list = [tuple(record.get(column_name_map[col]) for col in df.columns) for record in records]
with connection.cursor() as cursor:
cursor.executemany(insert_sql, param_list)
connection.commit()
logger.info(f"数据插入完成: {len(records)}")
finally:
connection.close()
results["row_count"] = len(records) results["row_count"] = len(records)
logger.info(f"Excel 数据已存储到 MySQL 表 {table_name},共 {len(records)}") logger.info(f"Excel 数据已存储到 MySQL 表 {table_name},共 {len(records)}")
@@ -243,7 +447,7 @@ class ExcelStorageService:
return results return results
except Exception as e: except Exception as e:
logger.error(f"存储 Excel 到 MySQL 失败: {str(e)}") logger.error(f"存储 Excel 到 MySQL 失败: {str(e)}", exc_info=True)
return {"success": False, "error": str(e)} return {"success": False, "error": str(e)}
async def store_structured_data( async def store_structured_data(

View File

@@ -3,6 +3,7 @@
""" """
import os import os
import shutil import shutil
import logging
from pathlib import Path from pathlib import Path
from datetime import datetime from datetime import datetime
from typing import Optional from typing import Optional
@@ -10,6 +11,8 @@ import uuid
from app.config import settings from app.config import settings
logger = logging.getLogger(__name__)
class FileService: class FileService:
"""文件服务类,负责文件的存储、读取和管理""" """文件服务类,负责文件的存储、读取和管理"""
@@ -17,6 +20,7 @@ class FileService:
def __init__(self): def __init__(self):
self.upload_dir = Path(settings.UPLOAD_DIR) self.upload_dir = Path(settings.UPLOAD_DIR)
self._ensure_upload_dir() self._ensure_upload_dir()
logger.info(f"FileService 初始化,上传目录: {self.upload_dir}")
def _ensure_upload_dir(self): def _ensure_upload_dir(self):
"""确保上传目录存在""" """确保上传目录存在"""
@@ -56,6 +60,8 @@ class FileService:
with open(file_path, 'wb') as f: with open(file_path, 'wb') as f:
f.write(file_content) f.write(file_content)
file_size = len(file_content)
logger.info(f"文件已保存: {filename} -> {file_path} ({file_size} bytes)")
return str(file_path) return str(file_path)
def read_file(self, file_path: str) -> bytes: def read_file(self, file_path: str) -> bytes:

View File

@@ -2,7 +2,7 @@
LLM 服务模块 - 封装大模型 API 调用 LLM 服务模块 - 封装大模型 API 调用
""" """
import logging import logging
from typing import Dict, Any, List, Optional from typing import Dict, Any, List, Optional, AsyncGenerator
import httpx import httpx
from app.config import settings from app.config import settings
@@ -87,6 +87,71 @@ class LLMService:
logger.error(f"解析 API 响应失败: {str(e)}") logger.error(f"解析 API 响应失败: {str(e)}")
raise raise
async def chat_stream(
    self,
    messages: List[Dict[str, str]],
    temperature: float = 0.7,
    max_tokens: Optional[int] = None,
    **kwargs
) -> AsyncGenerator[Dict[str, Any], None]:
    """
    Call the chat completion API in streaming mode.

    Args:
        messages: The message list to send.
        temperature: Sampling temperature.
        max_tokens: Maximum number of tokens; omitted from the payload when falsy.
        **kwargs: Extra fields merged into the request payload.

    Yields:
        Dict[str, Any]: ``{"content": <text>}`` for every non-empty delta.

    Raises:
        httpx.HTTPStatusError: If the API answers with a 4xx/5xx status.
        Exception: Any other transport error is logged and re-raised.
    """
    # Local import, as in the original, so the block does not depend on a
    # module-level ``json`` import.
    import json as json_module

    headers = {
        "Authorization": f"Bearer {self.api_key}",
        "Content-Type": "application/json"
    }

    payload = {
        "model": self.model_name,
        "messages": messages,
        "temperature": temperature,
        "stream": True
    }

    if max_tokens:
        payload["max_tokens"] = max_tokens

    payload.update(kwargs)

    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            async with client.stream(
                "POST",
                f"{self.base_url}/chat/completions",
                headers=headers,
                json=payload
            ) as response:
                # Without this an error response is scanned for "data:" lines,
                # finds none, and the caller gets a silently empty stream.
                response.raise_for_status()

                async for line in response.aiter_lines():
                    if not line.startswith("data:"):
                        continue
                    # The space after the "data:" field name is optional in SSE.
                    data = line[len("data:"):].strip()
                    if data == "[DONE]":
                        break
                    try:
                        chunk = json_module.loads(data)
                    except json_module.JSONDecodeError:
                        continue
                    if not isinstance(chunk, dict):
                        continue
                    # "choices" may be missing or an empty list.
                    choices = chunk.get("choices") or [{}]
                    delta = (choices[0].get("delta") or {}).get("content")
                    if delta:
                        yield {"content": delta}

    except httpx.HTTPStatusError as e:
        logger.error(f"LLM 流式 API 请求失败: {e.response.status_code}")
        raise
    except Exception as e:
        logger.error(f"LLM 流式 API 调用异常: {str(e)}")
        raise
async def analyze_excel_data( async def analyze_excel_data(
self, self,
excel_data: Dict[str, Any], excel_data: Dict[str, Any],

View File

@@ -0,0 +1,707 @@
"""
Markdown 文档 AI 分析服务
支持:
- 分章节解析(中文章节编号:一、二、三, (一)(二)(三))
- 结构化数据提取
- 流式输出
- 多种分析类型
- 可视化图表生成
"""
import asyncio
import json
import logging
import re
from typing import Any, AsyncGenerator, Dict, List, Optional
from app.services.llm_service import llm_service
from app.core.document_parser import MarkdownParser
from app.services.visualization_service import visualization_service
logger = logging.getLogger(__name__)
class MarkdownSection:
    """A single section of a document together with its nested subsections."""

    def __init__(self, number: str, title: str, level: int, content: str, line_start: int, line_end: int):
        """Store the heading data and start with no subsections.

        Args:
            number: Section number as written in the heading.
            title: Heading text.
            level: Nesting depth of the section.
            content: Text of the section.
            line_start: Line number where the section starts.
            line_end: Line number where the section ends.
        """
        # Heading identity
        self.number, self.title, self.level = number, title, level
        # Body text and its position in the document
        self.content = content
        self.line_start, self.line_end = line_start, line_end
        # Children are attached by whoever builds the section tree
        self.subsections: List[MarkdownSection] = []

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict view of the section, recursing into subsections.

        The content is exposed as ``content_preview``: the first 200
        characters followed by ``...`` when the content is longer than that.
        """
        preview = self.content
        if len(preview) > 200:
            preview = preview[:200] + "..."

        children = []
        for child in self.subsections:
            children.append(child.to_dict())

        summary: Dict[str, Any] = {}
        summary["number"] = self.number
        summary["title"] = self.title
        summary["level"] = self.level
        summary["content_preview"] = preview
        summary["line_start"] = self.line_start
        summary["line_end"] = self.line_end
        summary["subsections"] = children
        return summary
class MarkdownAIService:
"""Markdown 文档 AI 分析服务"""
# Chinese section-number patterns.
# NOTE(review): every entry below is an empty string in this view, and
# CHINESE_SUFFIX is empty as well. That looks like characters lost in
# extraction rather than the intended values; confirm against the repository.
CHINESE_NUMBERS = ["", "", "", "", "", "", "", "", "", ""]
CHINESE_SUFFIX = ""
# As written, matches a line that starts with one or more Chinese numerals,
# optional whitespace, then the rest of the line as the title.
# NOTE(review): no parentheses are required by this regex despite its name;
# presumably brackets around the numeral group were lost. Verify, because in
# this form it also matches ordinary sentences that begin with a numeral.
PARENTHESIS_PATTERN = re.compile(r'^([一二三四五六七八九十]+)\s*(.+)$')
# Chinese numerals followed by the enumeration comma "、", then the title.
CHINESE_SECTION_PATTERN = re.compile(r'^([一二三四五六七八九十]+)、\s*(.+)$')
# Arabic digits, a dot, at least one whitespace character, then the title.
ARABIC_SECTION_PATTERN = re.compile(r'^(\d+)\.\s+(.+)$')
def __init__(self):
    """Create the service with its own ``MarkdownParser`` instance."""
    self.parser = MarkdownParser()
def get_supported_analysis_types(self) -> list:
    """Return the identifiers of every analysis type this service accepts.

    Returns:
        list: A new list of type names on every call.
    """
    supported = (
        "summary",      # document summary
        "outline",      # outline extraction
        "key_points",   # key point extraction
        "questions",    # question generation
        "tags",         # tag generation
        "qa",           # question/answer pairs
        "statistics",   # statistical data analysis (suited to government bulletins)
        "section",      # detailed per-section analysis
        "charts",       # visualization chart generation
    )
    return list(supported)
def extract_sections(self, content: str, titles: List[Dict]) -> "List[MarkdownSection]":
    """
    Build the section tree of a document from its numbered heading lines.

    Three heading levels are recognised, each by one class-level pattern:
    level 1 via CHINESE_SECTION_PATTERN, level 2 via PARENTHESIS_PATTERN and
    level 3 via ARABIC_SECTION_PATTERN. A heading is only accepted when its
    parent level is currently open; otherwise the line is treated as body text.

    Every section's ``content`` and ``line_end`` cover the span from its
    heading line up to the line before the next heading of the same or a
    higher level (or the end of the document), so a parent's content also
    contains the body text of its children.

    Args:
        content: Full document text.
        titles: Parser-provided title list. Accepted for interface
            compatibility; the structure is derived from ``content`` alone.

    Returns:
        The level-1 sections, each carrying its nested sub-sections.
    """
    lines = content.split('\n')
    sections = []
    # Stack of currently open sections; index k holds the open section of level k + 1
    open_sections = []

    def close_from(level: int, last_line: int) -> None:
        """Finalize every open section at `level` or deeper so it ends at `last_line`."""
        while len(open_sections) >= level:
            finished = open_sections.pop()
            finished.line_end = last_line
            finished.content = self._get_section_content(
                lines, finished.line_start, last_line
            )

    heading_rules = (
        (1, self.CHINESE_SECTION_PATTERN),
        (2, self.PARENTHESIS_PATTERN),
        (3, self.ARABIC_SECTION_PATTERN),
    )

    for line_no, raw_line in enumerate(lines, 1):
        stripped = raw_line.strip()
        for level, pattern in heading_rules:
            match = pattern.match(stripped)
            # Skip rules that do not match, or whose parent level is not open yet
            if match is None or len(open_sections) < level - 1:
                continue

            # A new heading ends every open section of the same or a deeper level
            close_from(level, line_no - 1)
            section = MarkdownSection(
                number=match.group(1),
                title=match.group(2),
                level=level,
                content="",
                line_start=line_no,
                line_end=len(lines)
            )
            if level == 1:
                sections.append(section)
            else:
                # After close_from() the top of the stack is the direct parent
                open_sections[-1].subsections.append(section)
            open_sections.append(section)
            break

    # Whatever is still open runs to the end of the document
    close_from(1, len(lines))
    return sections
def _get_section_content(self, lines: List[str], start: int, end: int) -> str:
"""获取指定行范围的内容"""
if start > end:
return ""
content_lines = lines[start-1:end]
# 清理:移除标题行和空行
cleaned = []
for line in content_lines:
stripped = line.strip()
if not stripped:
continue
# 跳过章节标题行
if self.CHINESE_SECTION_PATTERN.match(stripped):
continue
if self.PARENTHESIS_PATTERN.match(stripped):
continue
if self.ARABIC_SECTION_PATTERN.match(stripped):
continue
cleaned.append(stripped)
return '\n'.join(cleaned)
async def analyze_markdown(
    self,
    file_path: str,
    analysis_type: str = "summary",
    user_prompt: str = "",
    section_number: Optional[str] = None
) -> Dict[str, Any]:
    """
    Analyse a Markdown document with the LLM and return the full result at once.

    Args:
        file_path: Path of the file handed to the parser.
        analysis_type: Analysis mode; selects the system prompt and the prompt template.
        user_prompt: Extra user requirement appended to the prompt.
        section_number: When given, only the section found by ``_find_section``
            is analysed instead of the whole document.

    Returns:
        dict: ``success`` plus either ``error`` or the analysis payload
        (filename, structure counters, up to 10 top-level sections, the LLM
        text and, for ``charts``, a ``chart_data`` entry).
    """
    try:
        parsed = self.parser.parse(file_path)
        if not parsed.success:
            return {
                "success": False,
                "error": parsed.error
            }

        doc = parsed.data
        sections = self.extract_sections(doc.get("content", ""), doc.get("titles", []))

        # Default scope: the whole document, titled by its file name
        target_content = doc.get("content", "")
        target_title = parsed.metadata.get("filename", "")

        if section_number:
            wanted = self._find_section(sections, section_number)
            if not wanted:
                return {
                    "success": False,
                    "error": f"未找到章节: {section_number}"
                }
            target_content = wanted.content
            target_title = f"{wanted.number}{wanted.title}"

        user_message = self._build_prompt(
            content=target_content,
            analysis_type=analysis_type,
            user_prompt=user_prompt,
            title=target_title
        )
        system_message = self._get_system_prompt(analysis_type)

        response = await llm_service.chat(
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": user_message}
            ],
            temperature=0.3,
            max_tokens=4000
        )
        analysis = llm_service.extract_message_content(response)

        metadata = parsed.metadata
        result = {
            "success": True,
            "filename": metadata.get("filename", ""),
            "analysis_type": analysis_type,
            "section": target_title if section_number else None,
            "word_count": len(target_content),
            "structure": {
                "title_count": metadata.get("title_count", 0),
                "code_block_count": metadata.get("code_block_count", 0),
                "table_count": metadata.get("table_count", 0),
                "section_count": len(sections)
            },
            # At most 10 top-level sections are returned
            "sections": [s.to_dict() for s in sections[:10]],
            "analysis": analysis
        }

        if analysis_type != "charts":
            return result

        # "charts": turn the LLM's JSON answer into visualisation data
        try:
            chart_data = self._parse_chart_json(analysis)
            if chart_data and chart_data.get("tables"):
                for table_info in chart_data.get("tables", []):
                    columns = table_info.get("columns", [])
                    rows = table_info.get("rows", [])
                    if not (columns and rows):
                        continue
                    vis_result = visualization_service.analyze_and_visualize({
                        "columns": columns,
                        "rows": [dict(zip(columns, row)) for row in rows]
                    })
                    if not vis_result.get("success"):
                        continue
                    table_info["visualization"] = {
                        "statistics": vis_result.get("statistics"),
                        "charts": vis_result.get("charts"),
                        "distributions": vis_result.get("distributions")
                    }
            result["chart_data"] = chart_data
        except Exception as e:
            # Chart generation is best-effort; the textual analysis is still returned
            logger.warning(f"生成可视化图表失败: {e}")
            result["chart_data"] = {"tables": [], "key_statistics": [], "chart_suggestions": []}
        return result
    except Exception as e:
        logger.error(f"Markdown AI 分析失败: {str(e)}")
        return {
            "success": False,
            "error": str(e)
        }
async def analyze_markdown_stream(
    self,
    file_path: str,
    analysis_type: str = "summary",
    user_prompt: str = "",
    section_number: Optional[str] = None
) -> AsyncGenerator[str, None]:
    """
    Analyse a Markdown document and stream the LLM answer as SSE events.

    Event payloads (JSON after the ``data: `` prefix):
    ``{"type": "start", ...}`` once, ``{"type": "content", "delta": ...}``
    per non-empty chunk, ``{"type": "done", "full_response": ...}`` at the
    end, or ``{"error": ...}`` when parsing, section lookup or the LLM call
    fails.

    Args:
        file_path: Path of the file handed to the parser.
        analysis_type: Analysis mode; selects the system prompt and the prompt template.
        user_prompt: Extra user requirement appended to the prompt.
        section_number: When given, only that section is analysed.

    Yields:
        str: One SSE-formatted data block per event.
    """
    def sse(payload: Dict[str, Any]) -> str:
        """Serialize one event payload as an SSE data block."""
        # Built outside the f-string braces so the code also parses before
        # Python 3.12 (no same-quote reuse or multi-line replacement fields).
        body = json.dumps(payload, ensure_ascii=False)
        return f"data: {body}\n\n"

    try:
        parse_result = self.parser.parse(file_path)
        if not parse_result.success:
            yield sse({'error': parse_result.error})
            return

        data = parse_result.data
        sections = self.extract_sections(data.get("content", ""), data.get("titles", []))

        target_content = data.get("content", "")
        target_title = parse_result.metadata.get("filename", "")
        if section_number:
            section = self._find_section(sections, section_number)
            if not section:
                yield sse({'error': f'未找到章节: {section_number}'})
                return
            target_content = section.content
            target_title = f"{section.number}{section.title}"

        prompt = self._build_prompt(
            content=target_content,
            analysis_type=analysis_type,
            user_prompt=user_prompt,
            title=target_title
        )
        messages = [
            {"role": "system", "content": self._get_system_prompt(analysis_type)},
            {"role": "user", "content": prompt}
        ]

        # Initial metadata so the client can render a header before text arrives
        yield sse({
            'type': 'start',
            'filename': parse_result.metadata.get("filename", ""),
            'analysis_type': analysis_type,
            'section': target_title if section_number else None,
            'word_count': len(target_content)
        })

        # Stream the LLM output, keeping the pieces for the final "done" event
        pieces = []
        async for chunk in llm_service.chat_stream(messages, temperature=0.3, max_tokens=4000):
            content = chunk.get("content", "")
            if content:
                pieces.append(content)
                yield sse({'type': 'content', 'delta': content})

        yield sse({'type': 'done', 'full_response': "".join(pieces)})
    except Exception as e:
        logger.error(f"Markdown AI 流式分析失败: {str(e)}")
        yield sse({'error': str(e)})
def _find_section(self, sections: List[MarkdownSection], number: str) -> Optional[MarkdownSection]:
"""查找指定编号的章节"""
# 标准化编号
num = number.strip()
for section in sections:
if section.number == num or section.title == num:
return section
# 在子章节中查找
found = self._find_section(section.subsections, number)
if found:
return found
return None
def _parse_chart_json(self, json_str: str) -> Optional[Dict[str, Any]]:
"""
解析 LLM 返回的 JSON 字符串
Args:
json_str: LLM 返回的 JSON 字符串
Returns:
解析后的字典,如果解析失败返回 None
"""
if not json_str:
return None
try:
# 尝试直接解析
return json.loads(json_str)
except json.JSONDecodeError:
pass
# 尝试提取 JSON 代码块
import re
# 匹配 ```json ... ``` 格式
match = re.search(r'```(?:json)?\s*([\s\S]*?)\s*```', json_str)
if match:
try:
return json.loads(match.group(1))
except json.JSONDecodeError:
pass
# 尝试找到 JSON 对象的开始和结束
start = json_str.find('{')
end = json_str.rfind('}')
if start != -1 and end != -1 and end > start:
try:
return json.loads(json_str[start:end+1])
except json.JSONDecodeError:
pass
return None
def _get_system_prompt(self, analysis_type: str) -> str:
"""根据分析类型获取系统提示词"""
prompts = {
"summary": "你是一个专业的文档摘要助手,擅长从长文档中提取核心信息。",
"outline": "你是一个专业的文档结构分析助手,擅长提取文档大纲和层级结构。",
"key_points": "你是一个专业的知识提取助手,擅长从文档中提取关键信息和要点。",
"questions": "你是一个专业的教育助手,擅长生成帮助理解文档的问题。",
"tags": "你是一个专业的标签生成助手,擅长提取文档的主题标签。",
"qa": "你是一个专业的问答助手,擅长基于文档内容生成问答对。",
"statistics": "你是一个专业的统计数据分析助手,擅长分析政府统计公报中的数据。",
"section": "你是一个专业的章节分析助手,擅长对文档的特定章节进行深入分析。",
"charts": "你是一个专业的数据可视化助手,擅长从文档中提取数据并生成适合制作图表的数据结构。"
}
return prompts.get(analysis_type, "你是一个专业的文档分析助手。")
def _build_prompt(
self,
content: str,
analysis_type: str,
user_prompt: str,
title: str = ""
) -> str:
"""
Build the user-role prompt for one analysis request.

Args:
    content: Document (or section) text; cut to 6000 characters before use.
    analysis_type: Key selecting the template; unknown keys use "summary".
    user_prompt: Optional extra requirement appended after the template.
    title: Document or section title interpolated into the template.

Returns:
    The finished prompt text.
"""
# Truncate the content so the prompt stays within the token limit
max_content_len = 6000
if len(content) > max_content_len:
content = content[:max_content_len] + "\n\n[内容已截断...]"
# Every template is rendered eagerly; only the selected one is returned
base_prompts = {
"summary": f"""请对以下文档进行摘要分析:
文档标题:{title}
文档内容:
{content}
请提供:
1. 文档主要内容摘要300字以内
2. 文档的目的和用途
3. 适合的读者群体
请用中文回答,结构清晰。""",
"outline": f"""请提取以下文档的大纲结构:
文档标题:{title}
文档内容:
{content}
请按层级列出文档大纲,用缩进表示层级关系。
格式:
一、一级标题
(一)二级标题
1. 三级标题
请用中文回答。""",
"key_points": f"""请从以下文档中提取关键要点:
文档标题:{title}
文档内容:
{content}
请列出文档的关键要点5-10条每条用简洁的语言描述并说明其在文档中的重要性。
请用中文回答,格式清晰。""",
"questions": f"""请根据以下文档生成有助于理解内容的问题:
文档标题:{title}
文档内容:
{content}
请生成5-10个问题帮助读者更好地理解文档内容。每个问题应该
1. 涵盖文档的重要信息点
2. 易于理解和回答
3. 具有思考价值
请用中文回答。""",
# "tags" only sends the first 3000 characters of the (already truncated) content
"tags": f"""请为以下文档生成标签:
文档标题:{title}
文档内容:
{content[:3000]}
请生成5-8个标签用逗号分隔。标签应该反映
- 文档的主题领域
- 文档的类型
- 文档的关键特征
请用中文回答,只需输出标签,不要其他内容。""",
# "qa" only sends the first 4000 characters of the (already truncated) content
"qa": f"""请根据以下文档生成问答对:
文档标题:{title}
文档内容:
{content[:4000]}
请生成3-5个问答对帮助读者通过问答形式理解文档内容。
格式:
Q1: 问题
A1: 回答
Q2: 问题
A2: 回答
请用中文回答,内容准确。""",
"statistics": f"""请分析以下政府统计公报中的数据和结论:
文档标题:{title}
文档内容:
{content}
请提供:
1. 文档中涉及的主要统计数据(列出关键数字和指标)
2. 数据的变化趋势(增长/下降)
3. 重要的百分比和对比
4. 数据来源和统计口径说明
请用中文回答,数据准确。""",
"section": f"""请详细分析以下文档章节:
章节标题:{title}
章节内容:
{content}
请提供:
1. 章节主要内容概括
2. 关键信息和数据
3. 与其他部分的关联(如有)
4. 重要结论
请用中文回答,分析深入。""",
# "charts" asks for JSON; the doubled braces are literal braces inside the f-string
"charts": f"""请从以下文档中提取可用于可视化的数据,并生成适合制作图表的数据结构:
文档标题:{title}
文档内容:
{content}
请完成以下任务:
1. 识别文档中的表格数据Markdown表格格式
2. 识别文档中的关键统计数据(百分比、数量、趋势等)
3. 识别可用于比较的分类数据
请用 JSON 格式返回以下结构的数据(如果没有表格数据,返回空结构):
{{
"tables": [
{{
"description": "表格的描述",
"columns": ["列名1", "列名2", ...],
"rows": [
["值1", "值2", ...],
["值1", "值2", ...]
]
}}
],
"key_statistics": [
{{
"name": "指标名称",
"value": "数值",
"trend": "增长/下降/持平",
"description": "指标说明"
}}
],
"chart_suggestions": [
{{
"chart_type": "bar/line/pie",
"title": "图表标题",
"data_source": "数据来源说明"
}}
]
}}
请确保返回的是合法的 JSON 格式。"""
}
# Unknown analysis types silently fall back to the "summary" template
prompt = base_prompts.get(analysis_type, base_prompts["summary"])
# Append the user's extra requirement only when it is not blank
if user_prompt and user_prompt.strip():
prompt += f"\n\n用户额外需求:{user_prompt}"
return prompt
async def extract_outline(self, file_path: str) -> Dict[str, Any]:
    """
    Parse a file and return its section outline.

    Each top-level entry carries number, title, level, starting line, a
    100-character content preview and a flat list of its direct
    sub-sections (number, title, level, line).

    Returns:
        ``{"success": True, "outline": [...]}`` or
        ``{"success": False, "error": ...}`` when parsing or extraction fails.
    """
    def describe_child(child) -> Dict[str, Any]:
        """Summarize a direct sub-section (no preview, no deeper nesting)."""
        return {
            "number": child.number,
            "title": child.title,
            "level": child.level,
            "line": child.line_start
        }

    try:
        parsed = self.parser.parse(file_path)
        if not parsed.success:
            return {"success": False, "error": parsed.error}

        doc = parsed.data
        sections = self.extract_sections(doc.get("content", ""), doc.get("titles", []))

        outline = []
        for top in sections:
            preview = top.content
            if len(preview) > 100:
                preview = preview[:100] + "..."
            outline.append({
                "number": top.number,
                "title": top.title,
                "level": top.level,
                "line": top.line_start,
                "content_preview": preview,
                "subsections": [describe_child(child) for child in top.subsections]
            })

        return {"success": True, "outline": outline}
    except Exception as e:
        logger.error(f"大纲提取失败: {str(e)}")
        return {"success": False, "error": str(e)}
async def extract_tables_summary(self, file_path: str) -> Dict[str, Any]:
    """
    Parse a file and summarize each table the parser reports.

    Per table the summary holds its 1-based index, headers, row/column
    counts, the first 3 rows as a preview and the first cell of up to 5 rows.

    Returns:
        ``{"success": True, "tables": [...], "table_count": n}``; a
        ``message`` entry replaces ``table_count`` when there are no tables;
        ``{"success": False, "error": ...}`` on failure.
    """
    try:
        parsed = self.parser.parse(file_path)
        if not parsed.success:
            return {"success": False, "error": parsed.error}

        tables = parsed.data.get("tables", [])
        if not tables:
            return {"success": True, "tables": [], "message": "文档中没有表格"}

        summaries = []
        for position, table in enumerate(tables, start=1):
            rows = table.get("rows", [])
            first_cells = []
            for row in rows[:5]:
                # Empty rows contribute an empty string instead of failing
                first_cells.append(row[0] if row else "")
            summaries.append({
                "index": position,
                "headers": table.get("headers", []),
                "row_count": table.get("row_count", 0),
                "column_count": table.get("column_count", 0),
                "preview_rows": rows[:3],
                "first_column": first_cells
            })

        return {
            "success": True,
            "tables": summaries,
            "table_count": len(tables)
        }
    except Exception as e:
        logger.error(f"表格提取失败: {str(e)}")
        return {"success": False, "error": str(e)}
# 全局单例
markdown_ai_service = MarkdownAIService()

View File

@@ -40,14 +40,29 @@ class RAGService:
def _init_embeddings(self): def _init_embeddings(self):
"""初始化嵌入模型""" """初始化嵌入模型"""
if self.embedding_model is None: if self.embedding_model is None:
self.embedding_model = SentenceTransformer(settings.EMBEDDING_MODEL) # 使用轻量级本地模型,避免网络问题
model_name = 'all-MiniLM-L6-v2'
try:
self.embedding_model = SentenceTransformer(model_name)
self._dimension = self.embedding_model.get_sentence_embedding_dimension() self._dimension = self.embedding_model.get_sentence_embedding_dimension()
logger.info(f"RAG 嵌入模型初始化完成: {settings.EMBEDDING_MODEL}, 维度: {self._dimension}") logger.info(f"RAG 嵌入模型初始化完成: {model_name}, 维度: {self._dimension}")
except Exception as e:
logger.warning(f"嵌入模型 {model_name} 加载失败: {e}")
# 如果本地模型也失败使用简单hash作为后备
self.embedding_model = None
self._dimension = 384
logger.info("RAG 使用简化模式 (无向量嵌入)")
def _init_vector_store(self): def _init_vector_store(self):
"""初始化向量存储""" """初始化向量存储"""
if self.index is None: if self.index is None:
self._init_embeddings() self._init_embeddings()
if self.embedding_model is None:
# 无法加载嵌入模型,使用简化模式
self._dimension = 384
self.index = None
logger.warning("RAG 嵌入模型未加载,使用简化模式")
else:
self.index = faiss.IndexIDMap(faiss.IndexFlatIP(self._dimension)) self.index = faiss.IndexIDMap(faiss.IndexFlatIP(self._dimension))
logger.info("Faiss 向量存储初始化完成") logger.info("Faiss 向量存储初始化完成")
@@ -78,6 +93,11 @@ class RAGService:
if not self._initialized: if not self._initialized:
self._init_vector_store() self._init_vector_store()
# 如果没有嵌入模型,只记录到日志
if self.embedding_model is None:
logger.debug(f"字段跳过索引 (无嵌入模型): {table_name}.{field_name}")
return
text = f"表名: {table_name}, 字段: {field_name}, 描述: {field_description}" text = f"表名: {table_name}, 字段: {field_name}, 描述: {field_description}"
if sample_values: if sample_values:
text += f", 示例值: {', '.join(sample_values)}" text += f", 示例值: {', '.join(sample_values)}"
@@ -100,6 +120,11 @@ class RAGService:
if not self._initialized: if not self._initialized:
self._init_vector_store() self._init_vector_store()
# 如果没有嵌入模型,只记录到日志
if self.embedding_model is None:
logger.debug(f"文档跳过索引 (无嵌入模型): {doc_id}")
return
doc = SimpleDocument( doc = SimpleDocument(
page_content=content, page_content=content,
metadata=metadata or {"doc_id": doc_id} metadata=metadata or {"doc_id": doc_id}

View File

@@ -31,6 +31,178 @@ class TableRAGService:
self.rag = rag_service self.rag = rag_service
self.excel_storage = excel_storage_service self.excel_storage = excel_storage_service
def _extract_sheet_names_from_xml(self, file_path: str) -> List[str]:
"""
从 Excel 文件的 XML 中提取工作表名称
某些 Excel 文件由于包含非标准元素pandas/openpyxl 无法正确解析工作表列表,
此时需要直接从 XML 中提取。
Args:
file_path: Excel 文件路径
Returns:
工作表名称列表
"""
import zipfile
from xml.etree import ElementTree as ET
try:
with zipfile.ZipFile(file_path, 'r') as z:
# 读取 workbook.xml
if 'xl/workbook.xml' not in z.namelist():
return []
content = z.read('xl/workbook.xml')
root = ET.fromstring(content)
# 定义命名空间
ns = {'main': 'http://purl.oclc.org/ooxml/spreadsheetml/main'}
# 提取所有 sheet 的 name 属性
sheets = root.findall('.//main:sheet', ns)
return [s.get('name') for s in sheets if s.get('name')]
except Exception as e:
logger.warning(f"从 XML 提取工作表失败: {file_path}, error: {e}")
return []
def _read_excel_sheet(self, file_path: str, sheet_name: str = None, header_row: int = 0) -> pd.DataFrame:
"""
读取 Excel 工作表,支持 pandas 无法解析的特殊 Excel 文件
当 pandas 的 ExcelFile 无法正确解析时,直接从 XML 读取数据。
Args:
file_path: Excel 文件路径
sheet_name: 工作表名称(如果为 None读取第一个工作表
header_row: 表头行号
Returns:
DataFrame
"""
import zipfile
from xml.etree import ElementTree as ET
try:
# 先尝试用 pandas 正常读取
df = pd.read_excel(file_path, sheet_name=sheet_name, header=header_row)
if df is not None and not df.empty:
return df
except Exception:
pass
# pandas 读取失败,从 XML 直接解析
logger.info(f"使用 XML 方式读取 Excel: {file_path}")
try:
with zipfile.ZipFile(file_path, 'r') as z:
# 获取工作表名称
sheet_names = self._extract_sheet_names_from_xml(file_path)
if not sheet_names:
raise ValueError("无法从 Excel 文件中找到工作表")
# 确定要读取的工作表
target_sheet = sheet_name if sheet_name and sheet_name in sheet_names else sheet_names[0]
sheet_index = sheet_names.index(target_sheet) + 1 # sheet1.xml, sheet2.xml, ...
# 读取 shared strings
shared_strings = []
if 'xl/sharedStrings.xml' in z.namelist():
ss_content = z.read('xl/sharedStrings.xml')
ss_root = ET.fromstring(ss_content)
ns = {'main': 'http://purl.oclc.org/ooxml/spreadsheetml/main'}
for si in ss_root.findall('.//main:si', ns):
t = si.find('.//main:t', ns)
if t is not None:
shared_strings.append(t.text or '')
else:
shared_strings.append('')
# 读取工作表
sheet_file = f'xl/worksheets/sheet{sheet_index}.xml'
if sheet_file not in z.namelist():
raise ValueError(f"工作表文件 {sheet_file} 不存在")
sheet_content = z.read(sheet_file)
root = ET.fromstring(sheet_content)
ns = {'main': 'http://purl.oclc.org/ooxml/spreadsheetml/main'}
# 解析行
rows_data = []
for row in root.findall('.//main:row', ns):
row_idx = int(row.get('r', 0))
# header_row 是 0-indexedrow_idx 是 1-indexed
# 如果 header_row=0 表示第一行是表头,需要跳过 row_idx=1
if row_idx <= header_row + 1:
continue # 跳过表头行
row_cells = {}
for cell in row.findall('main:c', ns):
cell_ref = cell.get('r', '')
col_letters = ''.join(filter(str.isalpha, cell_ref))
cell_type = cell.get('t', 'n')
v = cell.find('main:v', ns)
if v is not None and v.text:
if cell_type == 's':
# shared string
try:
val = shared_strings[int(v.text)]
except (ValueError, IndexError):
val = v.text
elif cell_type == 'b':
# boolean
val = v.text == '1'
else:
# number or other
val = v.text
else:
val = None
row_cells[col_letters] = val
if row_cells:
rows_data.append(row_cells)
# 转换为 DataFrame
if not rows_data:
return pd.DataFrame()
df = pd.DataFrame(rows_data)
# 如果有 header_row重新设置列名
if header_row >= 0:
# 重新读取第一行作为表头
first_row_sheet = f'xl/worksheets/sheet{sheet_index}.xml'
sheet_content = z.read(first_row_sheet)
root = ET.fromstring(sheet_content)
first_row = root.find(f'.//main:row[@r="{header_row + 1}"]', ns)
if first_row is not None:
headers = {}
for cell in first_row.findall('main:c', ns):
cell_ref = cell.get('r', '')
col_letters = ''.join(filter(str.isalpha, cell_ref))
cell_type = cell.get('t', 'n')
v = cell.find('main:v', ns)
if v is not None and v.text:
if cell_type == 's':
try:
headers[col_letters] = shared_strings[int(v.text)]
except (ValueError, IndexError):
headers[col_letters] = v.text
else:
headers[col_letters] = v.text
# 重命名列
df.columns = [headers.get(col, col) for col in df.columns]
logger.info(f"XML 解析完成: {len(df)} 行, {len(df.columns)}")
return df
except Exception as e:
logger.error(f"XML 解析 Excel 失败: {e}")
raise
async def generate_field_description( async def generate_field_description(
self, self,
table_name: str, table_name: str,
@@ -126,26 +298,49 @@ class TableRAGService:
} }
try: try:
# 1. 读取 Excel # 1. 先检查 Excel 文件是否有效
logger.info(f"正在检查Excel文件: {file_path}")
try:
xls_file = pd.ExcelFile(file_path)
sheet_names = xls_file.sheet_names
logger.info(f"Excel文件工作表: {sheet_names}")
# 如果 sheet_names 为空,尝试从 XML 中手动提取
if not sheet_names:
sheet_names = self._extract_sheet_names_from_xml(file_path)
logger.info(f"从XML提取工作表: {sheet_names}")
if not sheet_names:
return {"success": False, "error": "Excel 文件没有工作表"}
except Exception as e:
logger.error(f"读取Excel文件失败: {file_path}, error: {e}")
return {"success": False, "error": f"无法读取Excel文件: {str(e)}"}
# 2. 读取 Excel
if sheet_name: if sheet_name:
df = pd.read_excel(file_path, sheet_name=sheet_name, header=header_row) # 验证指定的sheet_name是否存在
else: if sheet_name not in sheet_names:
df = pd.read_excel(file_path, header=header_row) logger.warning(f"指定的工作表 '{sheet_name}' 不存在,使用第一个工作表: {sheet_names[0]}")
sheet_name = sheet_names[0]
df = self._read_excel_sheet(file_path, sheet_name=sheet_name, header_row=header_row)
logger.info(f"读取到数据: {len(df)} 行, {len(df.columns)}")
if df.empty: if df.empty:
return {"success": False, "error": "Excel 文件为空"} return {"success": False, "error": "Excel 文件为空"}
# 清理列名 # 清理列名
df.columns = [str(c) for c in df.columns] df.columns = [str(c) for c in df.columns]
table_name = excel_storage._sanitize_table_name(filename) table_name = self.excel_storage._sanitize_table_name(filename)
results["table_name"] = table_name results["table_name"] = table_name
results["field_count"] = len(df.columns) results["field_count"] = len(df.columns)
logger.info(f"表名: {table_name}, 字段数: {len(df.columns)}")
# 2. 初始化 RAG (如果需要) # 3. 初始化 RAG (如果需要)
if not self.rag._initialized: if not self.rag._initialized:
self.rag._init_vector_store() self.rag._init_vector_store()
# 3. 为每个字段生成描述并索引 # 4. 为每个字段生成描述并索引
all_fields_data = {} all_fields_data = {}
for col in df.columns: for col in df.columns:
# 采样示例值 # 采样示例值
@@ -187,7 +382,8 @@ class TableRAGService:
logger.error(error_msg) logger.error(error_msg)
results["errors"].append(error_msg) results["errors"].append(error_msg)
# 4. 存储到 MySQL # 5. 存储到 MySQL
logger.info(f"开始存储到MySQL: {filename}")
store_result = await self.excel_storage.store_excel( store_result = await self.excel_storage.store_excel(
file_path=file_path, file_path=file_path,
filename=filename, filename=filename,

View File

@@ -1,113 +0,0 @@
✅ Excel 文件解析功能已完成并测试通过
已完成的工作
后端部分
1. 文件服务层 (backend/app/services/file_service.py)
- 文件保存、读取、删除功能
- 文件信息获取
2. Excel 解析模块 (backend/app/core/document_parser/)
- base.py - 解析器基类
- xlsx_parser.py - Excel 文件解析器
- utils.py - 工具函数
3. API 接口 (backend/app/api/endpoints/upload.py)
- POST /upload/excel - 上传并解析 Excel 文件
- GET /upload/excel/preview/{file_path} - 获取预览
- DELETE /upload/file - 删除文件
4. 主应用配置 (backend/app/main.py)
- CORS 中间件配置
- API 路由注册
前端部分
1. 后端 API 封装 (frontendTest/app-a6ww9j3ja3nl/src/db/backend-api.ts)
- Excel 上传、预览、删除函数
2. Excel 解析页面 (frontendTest/app-a6ww9j3ja3nl/src/pages/ExcelParse.tsx)
- 拖拽上传界面
- 解析选项配置
- 数据表格展示
- 多工作表支持
3. 路由配置 (frontendTest/app-a6ww9j3ja3nl/src/routes.tsx)
- 添加 /excel-parse 路由
4. 环境配置 (.env)
- 后端 API URL 配置
测试结果
后端 API 测试通过:
- ✅ 健康检查:http://127.0.0.1:8000/health
- ✅ Excel 上传解析:成功解析 5 行 6 列的测试数据
访问方式
1. 后端 API 文档:http://127.0.0.1:8000/api/v1/docs
2. 前端页面:
- 启动前端:cd frontendTest/app-a6ww9j3ja3nl && npm run dev
- 访问 Excel 解析页面:/excel-parse
下一步建议
1. 启动前端开发服务器测试完整的前端界面
2. 添加更多文档格式解析支持docx, md, txt
3. 完善错误处理和用户提示
✻ Worked for 4m 18s · 1 background task still running (↓ to manage)
> 如何启动后端呢
● 后端启动步骤如下:
方法一:使用命令行启动(开发模式)
# 1. 进入后端目录
cd H:/OwnProject/FilesReadSysteam/backend
# 2. 激活虚拟环境
# PowerShell:
.\venv\Scripts\Activate.ps1
# 或 CMD:
.\venv\Scripts\Activate.bat
# 3. 启动后端服务器
python -m uvicorn app.main:app --host 127.0.0.1 --port 8000 --reload
方法二:直接使用虚拟环境中的 Python
cd H:/OwnProject/FilesReadSysteam/backend
./venv/Scripts/python.exe -m uvicorn app.main:app --host 127.0.0.1 --port 8000 --reload
方法三:直接运行 main.py(开发调试)
cd H:/OwnProject/FilesReadSysteam/backend
./venv/Scripts/python.exe app/main.py
启动成功的标志
INFO: Uvicorn running on http://127.0.0.1:8000
INFO: Started server process [xxxxx]
INFO: Waiting for application startup.
INFO: Application startup complete.
验证服务是否正常
# 健康检查
curl http://127.0.0.1:8000/health
# 访问 API 文档
# 浏览器打开: http://127.0.0.1:8000/api/v1/docs
---
当前状态:后端已在后台运行(任务 ID: b22jkg69j),可以直接访问 http://127.0.0.1:8000
需要停止的话告诉我即可。

View File

@@ -166,6 +166,66 @@ export interface AIAnalysisResult {
error?: string; error?: string;
} }
// ==================== Markdown AI analysis types ====================
// Result of a one-shot Markdown AI analysis request.
// On failure only `success` and `error` are expected to be set.
export interface AIMarkdownAnalyzeResult {
success: boolean;
filename?: string;
analysis_type?: string;
// Title of the analysed section when a single section was requested
section?: string;
word_count?: number;
// Structural counters of the parsed document
structure?: {
title_count?: number;
code_block_count?: number;
table_count?: number;
section_count?: number;
};
sections?: MarkdownSection[];
// Raw text answer produced by the model
analysis?: string;
// Only present for the 'charts' analysis type
chart_data?: {
tables?: Array<{
description?: string;
columns?: string[];
rows?: string[][];
// Attached per table when chart generation for it succeeded
visualization?: {
statistics?: any;
charts?: any;
distributions?: any;
};
}>;
key_statistics?: Array<{
name?: string;
value?: string;
trend?: string;
description?: string;
}>;
chart_suggestions?: Array<{
chart_type?: string;
title?: string;
data_source?: string;
}>;
};
error?: string;
}
// One node of the document's section tree; `subsections` nests recursively.
export interface MarkdownSection {
// Heading numeral as captured from the heading line
number: string;
title: string;
// Nesting depth of the heading
level: number;
content_preview?: string;
line_start: number;
line_end?: number;
subsections?: MarkdownSection[];
}
// Result of the Markdown outline request.
// NOTE(review): the backend's extract_outline emits `line` (not `line_start`) for
// each outline entry, so entries may not fully satisfy MarkdownSection - confirm.
export interface MarkdownOutlineResult {
success: boolean;
outline?: MarkdownSection[];
error?: string;
}
export type MarkdownAnalysisType = 'summary' | 'outline' | 'key_points' | 'questions' | 'tags' | 'qa' | 'statistics' | 'section' | 'charts';
export interface AIExcelAnalyzeResult { export interface AIExcelAnalyzeResult {
success: boolean; success: boolean;
excel?: { excel?: {
@@ -842,6 +902,159 @@ export const aiApi = {
} }
}, },
/**
 * Upload a Markdown file and run a one-shot AI analysis on it.
 *
 * The options are sent as query parameters (only the ones that are set),
 * the file as multipart form data. Rejects with the backend's `detail`
 * message when the response status is not OK.
 */
async analyzeMarkdown(
  file: File,
  options: {
    analysisType?: MarkdownAnalysisType;
    userPrompt?: string;
    sectionNumber?: string;
  } = {}
): Promise<AIMarkdownAnalyzeResult> {
  const { analysisType, userPrompt, sectionNumber } = options;

  const payload = new FormData();
  payload.append('file', file);

  const query = new URLSearchParams();
  if (analysisType) query.append('analysis_type', analysisType);
  if (userPrompt) query.append('user_prompt', userPrompt);
  if (sectionNumber) query.append('section_number', sectionNumber);

  const endpoint = `${BACKEND_BASE_URL}/ai/analyze/md?${query.toString()}`;

  try {
    const response = await fetch(endpoint, {
      method: 'POST',
      body: payload,
    });

    if (response.ok) {
      return await response.json();
    }

    const failure = await response.json();
    throw new Error(failure.detail || 'Markdown AI 分析失败');
  } catch (error) {
    console.error('Markdown AI 分析失败:', error);
    throw error;
  }
},
/**
 * Upload a Markdown file and stream its AI analysis (server-sent events).
 *
 * Each `data: {...}` line of the response is parsed as JSON:
 * `content` events are appended to the result and forwarded to `onChunk`,
 * a `done` event may replace the result with the server's full text, and
 * payloads carrying `error` are forwarded to `onChunk` as `type: 'error'`.
 *
 * Bytes are decoded in streaming mode and incomplete lines are buffered
 * between reads, so multi-byte characters and events that straddle two
 * network chunks are neither corrupted nor dropped.
 *
 * @returns The complete analysis text.
 */
async analyzeMarkdownStream(
  file: File,
  options: {
    analysisType?: MarkdownAnalysisType;
    userPrompt?: string;
    sectionNumber?: string;
  } = {},
  onChunk?: (chunk: { type: string; delta?: string; error?: string }) => void
): Promise<string> {
  const formData = new FormData();
  formData.append('file', file);

  const params = new URLSearchParams();
  if (options.analysisType) {
    params.append('analysis_type', options.analysisType);
  }
  if (options.userPrompt) {
    params.append('user_prompt', options.userPrompt);
  }
  if (options.sectionNumber) {
    params.append('section_number', options.sectionNumber);
  }

  const url = `${BACKEND_BASE_URL}/ai/analyze/md/stream?${params.toString()}`;

  try {
    const response = await fetch(url, {
      method: 'POST',
      body: formData,
    });

    if (!response.ok) {
      const error = await response.json();
      throw new Error(error.detail || 'Markdown AI 流式分析失败');
    }

    const reader = response.body?.getReader();
    if (!reader) throw new Error('无法读取响应流');

    const decoder = new TextDecoder();
    let fullResponse = '';
    // Text received so far that does not yet end with a newline
    let pending = '';

    const handleLine = (line: string) => {
      if (!line.startsWith('data: ')) return;
      const data = line.slice(6);
      if (data === '[DONE]') return;

      try {
        const parsed = JSON.parse(data);
        if (parsed.type === 'content' && parsed.delta) {
          fullResponse += parsed.delta;
          onChunk?.({ type: 'content', delta: parsed.delta });
        } else if (parsed.type === 'done') {
          fullResponse = parsed.full_response || fullResponse;
        } else if (parsed.error) {
          onChunk?.({ type: 'error', error: parsed.error });
        }
      } catch {
        // Ignore lines that are not valid JSON
      }
    };

    while (true) {
      const { done, value } = await reader.read();
      if (done) break;

      // stream: true keeps a partial multi-byte sequence for the next read
      pending += decoder.decode(value, { stream: true });
      const lines = pending.split('\n');
      // The last element is an incomplete line (or '' after a trailing newline)
      pending = lines.pop() ?? '';
      for (const line of lines) {
        handleLine(line);
      }
    }

    // Flush whatever the decoder and the line buffer still hold
    pending += decoder.decode();
    if (pending) {
      handleLine(pending);
    }

    return fullResponse;
  } catch (error) {
    console.error('Markdown AI 流式分析失败:', error);
    throw error;
  }
},
/**
 * Upload a Markdown file and fetch its outline (section structure).
 *
 * The file travels as multipart form data, so the request must be a POST:
 * `fetch` throws a TypeError for GET/HEAD requests that carry a body.
 * NOTE(review): the backend route for `/ai/analyze/md/outline` must accept
 * POST for this call to succeed - confirm against the API definition.
 */
async getMarkdownOutline(file: File): Promise<MarkdownOutlineResult> {
  const formData = new FormData();
  formData.append('file', file);

  const url = `${BACKEND_BASE_URL}/ai/analyze/md/outline`;

  try {
    const response = await fetch(url, {
      method: 'POST',
      body: formData,
    });

    if (!response.ok) {
      const error = await response.json();
      throw new Error(error.detail || '获取 Markdown 大纲失败');
    }

    return await response.json();
  } catch (error) {
    console.error('获取 Markdown 大纲失败:', error);
    throw error;
  }
},
/** /**
* 生成统计信息和图表 * 生成统计信息和图表
*/ */

View File

@@ -19,7 +19,11 @@ import {
TrendingUp, TrendingUp,
Download, Download,
Brain, Brain,
Settings2 Settings2,
List,
MessageSquareCode,
Tag,
HelpCircle
} from 'lucide-react'; } from 'lucide-react';
import { Button } from '@/components/ui/button'; import { Button } from '@/components/ui/button';
import { Input } from '@/components/ui/input'; import { Input } from '@/components/ui/input';
@@ -33,7 +37,7 @@ import { Checkbox } from '@/components/ui/checkbox';
import { toast } from 'sonner'; import { toast } from 'sonner';
import { cn } from '@/lib/utils'; import { cn } from '@/lib/utils';
import { Skeleton } from '@/components/ui/skeleton'; import { Skeleton } from '@/components/ui/skeleton';
import { backendApi, type ExcelParseResult, aiApi } from '@/db/backend-api'; import { backendApi, type ExcelParseResult, type AIMarkdownAnalyzeResult, type MarkdownSection, aiApi } from '@/db/backend-api';
import { import {
Table as TableComponent, Table as TableComponent,
TableBody, TableBody,
@@ -78,6 +82,15 @@ const Documents: React.FC = () => {
const [analysisCharts, setAnalysisCharts] = useState<any>(null); const [analysisCharts, setAnalysisCharts] = useState<any>(null);
const [analysisTypes, setAnalysisTypes] = useState<Array<{ value: string; label: string; description: string }>>([]); const [analysisTypes, setAnalysisTypes] = useState<Array<{ value: string; label: string; description: string }>>([]);
// Markdown AI 分析相关状态
const [mdAnalysis, setMdAnalysis] = useState<AIMarkdownAnalyzeResult | null>(null);
const [mdAnalysisType, setMdAnalysisType] = useState<'summary' | 'outline' | 'key_points' | 'questions' | 'tags' | 'qa' | 'statistics' | 'section' | 'charts'>('summary');
const [mdUserPrompt, setMdUserPrompt] = useState('');
const [mdSections, setMdSections] = useState<MarkdownSection[]>([]);
const [mdSelectedSection, setMdSelectedSection] = useState<string>('');
const [mdStreaming, setMdStreaming] = useState(false);
const [mdStreamingContent, setMdStreamingContent] = useState('');
// 解析选项 // 解析选项
const [parseOptions, setParseOptions] = useState({ const [parseOptions, setParseOptions] = useState({
parseAllSheets: false, parseAllSheets: false,
@@ -144,6 +157,9 @@ const Documents: React.FC = () => {
setAiAnalysis(null); setAiAnalysis(null);
setAnalysisCharts(null); setAnalysisCharts(null);
setExpandedSheet(null); setExpandedSheet(null);
setMdAnalysis(null);
setMdSections([]);
setMdStreamingContent('');
const ext = file.name.split('.').pop()?.toLowerCase(); const ext = file.name.split('.').pop()?.toLowerCase();
@@ -163,6 +179,9 @@ const Documents: React.FC = () => {
} else { } else {
toast.error(result.error || '解析失败'); toast.error(result.error || '解析失败');
} }
} else if (ext === 'md' || ext === 'markdown') {
// Markdown 文件:获取大纲
await fetchMdOutline();
} else { } else {
// 其他文档使用通用上传接口 // 其他文档使用通用上传接口
const result = await backendApi.uploadDocument(file); const result = await backendApi.uploadDocument(file);
@@ -403,6 +422,106 @@ const Documents: React.FC = () => {
} }
}; };
// True when the file name's extension (case-insensitive) is .md or .markdown
const isMarkdownFile = (filename: string) => {
  const parts = filename.split('.');
  const extension = parts[parts.length - 1]?.toLowerCase();
  switch (extension) {
    case 'md':
    case 'markdown':
      return true;
    default:
      return false;
  }
};
// Run a one-shot AI analysis on the uploaded Markdown file
const handleMdAnalyze = async () => {
  if (!uploadedFile || !isMarkdownFile(uploadedFile.name)) {
    toast.error('请先上传 Markdown 文件');
    return;
  }

  setAnalyzing(true);
  setMdAnalysis(null);

  const request = {
    analysisType: mdAnalysisType,
    userPrompt: mdUserPrompt,
    sectionNumber: mdSelectedSection || undefined
  };

  try {
    const outcome = await aiApi.analyzeMarkdown(uploadedFile, request);

    if (!outcome.success) {
      toast.error(outcome.error || 'AI 分析失败');
      return; // the finally block still resets the analyzing flag
    }

    toast.success('Markdown AI 分析完成');
    setMdAnalysis(outcome);
  } catch (error: any) {
    toast.error(error.message || 'AI 分析失败');
  } finally {
    setAnalyzing(false);
  }
};
// Run the Markdown AI analysis in streaming mode.
//
// Each 'content' chunk's delta is appended to `mdStreamingContent`; an
// 'error' chunk is reported as a toast. `analyzing` and `mdStreaming` are
// true for the duration of the call and reset when it settles.
const handleMdAnalyzeStream = async () => {
  if (!(uploadedFile && isMarkdownFile(uploadedFile.name))) {
    toast.error('请先上传 Markdown 文件');
    return;
  }

  // Reset UI state before the request starts.
  setAnalyzing(true);
  setMdStreaming(true);
  setMdStreamingContent('');
  setMdAnalysis(null);

  const streamOptions = {
    analysisType: mdAnalysisType,
    userPrompt: mdUserPrompt,
    // An empty selection means "whole document": omit the section number.
    sectionNumber: mdSelectedSection || undefined
  };

  // Dispatch a single streamed chunk by its type; unknown types are ignored.
  const onChunk = (chunk: { type: string; delta?: string; error?: string }) => {
    switch (chunk.type) {
      case 'content':
        if (chunk.delta) {
          setMdStreamingContent(prev => prev + chunk.delta);
        }
        break;
      case 'error':
        toast.error(chunk.error || '流式分析出错');
        break;
      default:
        break;
    }
  };

  try {
    await aiApi.analyzeMarkdownStream(uploadedFile, streamOptions, onChunk);
  } catch (error: any) {
    toast.error(error.message || 'AI 分析失败');
  } finally {
    setAnalyzing(false);
    setMdStreaming(false);
  }
};
// Fetch the outline (list of sections) of a Markdown document and store it
// in `mdSections`.
//
// `file` defaults to the `uploadedFile` state captured by the current render.
// NOTE(review): a caller that has just selected a new file in the same event
// handler should pass that file explicitly — the state value seen here is the
// one from the render that created this function, so it may still be the
// previous file (or empty) at that point. Confirm against the upload handler.
//
// Does nothing when there is no file or it is not a Markdown file. Request
// failures are logged to the console and otherwise ignored (best effort).
const fetchMdOutline = async (file: typeof uploadedFile = uploadedFile) => {
  if (!file || !isMarkdownFile(file.name)) return;

  try {
    const result = await aiApi.getMarkdownOutline(file);
    if (result.success && result.outline) {
      setMdSections(result.outline);
    }
  } catch (error) {
    console.error('获取大纲失败:', error);
  }
};
// Return the 20px icon element shown next to a Markdown analysis type.
// Types without a dedicated icon fall back to the Sparkles icon.
const getMdAnalysisIcon = (type: string) => {
  // A Map (rather than a plain object) so that only the keys listed here can
  // match; inherited property names such as 'toString' still hit the fallback.
  const iconByType = new Map<string, typeof Sparkles>([
    ['summary', FileText],
    ['outline', List],
    ['key_points', TrendingUp],
    ['statistics', TrendingUp],
    ['section', FileText],
    ['questions', MessageSquareCode],
    ['tags', Tag],
    ['qa', HelpCircle],
    ['charts', TrendingUp]
  ]);
  const Icon = iconByType.get(type) ?? Sparkles;
  return <Icon size={20} />;
};
const formatFileSize = (bytes: number): string => { const formatFileSize = (bytes: number): string => {
if (bytes === 0) return '0 B'; if (bytes === 0) return '0 B';
const k = 1024; const k = 1024;
@@ -600,6 +719,98 @@ const Documents: React.FC = () => {
</Card> </Card>
)} )}
{/* Markdown AI 分析选项 */}
{uploadedFile && isMarkdownFile(uploadedFile.name) && (
<Card className="border-none shadow-md bg-gradient-to-br from-purple-500/5 to-primary/5">
<CardHeader className="pb-4">
<CardTitle className="flex items-center gap-2">
<Sparkles className="text-purple-500" size={20} />
Markdown AI
</CardTitle>
</CardHeader>
<CardContent className="space-y-4">
{/* 章节选择 */}
{mdSections.length > 0 && (
<div className="space-y-2">
<Label htmlFor="md-section" className="text-sm"></Label>
<Select value={mdSelectedSection} onValueChange={setMdSelectedSection}>
<SelectTrigger id="md-section" className="bg-background">
<SelectValue placeholder="全文分析" />
</SelectTrigger>
<SelectContent>
<SelectItem value=""></SelectItem>
{mdSections.map((section) => (
<SelectItem key={section.number} value={section.number}>
{section.number}{section.title}
</SelectItem>
))}
</SelectContent>
</Select>
</div>
)}
<div className="space-y-2">
<Label htmlFor="md-analysis-type" className="text-sm"></Label>
<Select value={mdAnalysisType} onValueChange={(value: any) => setMdAnalysisType(value)}>
<SelectTrigger id="md-analysis-type" className="bg-background">
<SelectValue />
</SelectTrigger>
<SelectContent>
{[
{ value: 'summary', label: '文档摘要', desc: '主要内容摘要' },
{ value: 'outline', label: '大纲提取', desc: '提取文档结构' },
{ value: 'key_points', label: '关键要点', desc: '提取关键信息' },
{ value: 'statistics', label: '统计分析', desc: '统计数据分析' },
{ value: 'section', label: '章节分析', desc: '分章节详细分析' },
{ value: 'questions', label: '生成问题', desc: '生成理解性问题' },
{ value: 'tags', label: '生成标签', desc: '提取主题标签' },
{ value: 'qa', label: '问答对', desc: '生成问答内容' },
{ value: 'charts', label: '数据图表', desc: '生成可视化数据' }
].map(type => (
<SelectItem key={type.value} value={type.value}>
<div className="flex items-center gap-2">
{getMdAnalysisIcon(type.value)}
<div className="flex flex-col">
<span className="font-medium">{type.label}</span>
<span className="text-xs text-muted-foreground">{type.desc}</span>
</div>
</div>
</SelectItem>
))}
</SelectContent>
</Select>
</div>
<div className="space-y-2">
<Label htmlFor="md-user-prompt" className="text-sm"></Label>
<Textarea
id="md-user-prompt"
placeholder="例如:请重点关注技术实现部分..."
value={mdUserPrompt}
onChange={(e) => setMdUserPrompt(e.target.value)}
className="bg-background resize-none"
rows={2}
/>
</div>
<div className="flex gap-2">
<Button
onClick={handleMdAnalyze}
disabled={analyzing}
className="flex-1 bg-gradient-to-r from-purple-500 to-primary hover:from-purple-500/90 hover:to-primary/90"
>
{analyzing && !mdStreaming ? <><Loader2 className="mr-2 animate-spin" size={16} /> ...</> : <><Sparkles className="mr-2" size={16} /></>}
</Button>
<Button
onClick={handleMdAnalyzeStream}
disabled={analyzing}
variant="outline"
className="flex-1"
>
{analyzing && mdStreaming ? <><Loader2 className="mr-2 animate-spin" size={16} /> ...</> : <><Sparkles className="mr-2" size={16} /></>}
</Button>
</div>
</CardContent>
</Card>
)}
{/* 数据操作 */} {/* 数据操作 */}
{parseResult?.success && ( {parseResult?.success && (
<Card className="border-none shadow-md bg-gradient-to-br from-emerald-500/5 to-blue-500/5"> <Card className="border-none shadow-md bg-gradient-to-br from-emerald-500/5 to-blue-500/5">
@@ -661,6 +872,45 @@ const Documents: React.FC = () => {
</Card> </Card>
)} )}
{/* Markdown AI 分析结果 */}
{(mdAnalysis || mdStreamingContent) && (
<Card className="border-none shadow-md border-l-4 border-l-purple-500">
<CardHeader>
<div className="flex items-center justify-between">
<div className="space-y-1">
<CardTitle className="flex items-center gap-2">
<Sparkles className="text-purple-500" size={20} />
Markdown AI
{mdStreaming && <Badge variant="default" className="ml-2 bg-purple-500"></Badge>}
</CardTitle>
{mdAnalysis && (
<CardDescription>
{mdAnalysis.filename} {mdAnalysis.word_count || 0} {mdAnalysis.analysis_type}
{mdAnalysis.section && `${mdAnalysis.section}`}
</CardDescription>
)}
</div>
{mdAnalysis?.structure && (
<Badge variant="secondary">
{mdAnalysis.structure.title_count || 0} {mdAnalysis.structure.section_count || 0}
</Badge>
)}
</div>
</CardHeader>
<CardContent className="max-h-[500px] overflow-y-auto">
{/* 流式内容优先显示 */}
{mdStreamingContent && (
<div className="animate-pulse text-sm text-muted-foreground mb-4">
...
</div>
)}
{mdStreamingContent && <Markdown content={mdStreamingContent} />}
{mdAnalysis?.analysis && !mdStreamingContent && <Markdown content={mdAnalysis.analysis} />}
{!mdAnalysis?.success && !mdStreamingContent && <p className="text-sm text-destructive">{mdAnalysis?.error || '分析失败'}</p>}
</CardContent>
</Card>
)}
{/* 图表显示 */} {/* 图表显示 */}
{analysisCharts && ( {analysisCharts && (
<Card className="border-none shadow-md border-l-4 border-l-indigo-500"> <Card className="border-none shadow-md border-l-4 border-l-indigo-500">