Merge branch 'main' of https://gitea.kronecker.cc/OurCodesAreAllRight/FilesReadSystem
This commit is contained in:
38
.gitignore
vendored
Normal file
38
.gitignore
vendored
Normal file
@@ -0,0 +1,38 @@
|
||||
/.git/
|
||||
/.idea/
|
||||
/.vscode/
|
||||
/backend/venv/
|
||||
/backend/command/
|
||||
/backend/.env
|
||||
/backend/.env.local
|
||||
/backend/.env.*.local
|
||||
/backend/app/__pycache__/*
|
||||
/backend/data/uploads
|
||||
/backend/data/charts
|
||||
/backend/data/logs
|
||||
|
||||
/frontend/node_modules/
|
||||
/frontend/dist/
|
||||
/frontend/build/
|
||||
/frontend/.vscode/
|
||||
/frontend/.idea/
|
||||
/frontend/.env
|
||||
/frontend/*.log
|
||||
/技术路线.md
|
||||
/开发路径.md
|
||||
/开发日志_2026-03-16.md
|
||||
/frontendTest/
|
||||
/docs/
|
||||
/frontend/src/api/
|
||||
/frontend/src/api/index.js
|
||||
/frontend/src/api/index.ts
|
||||
/frontend/src/api/index.tsx
|
||||
/frontend/src/api/index.py
|
||||
/frontend/src/api/index.go
|
||||
/frontend/src/api/index.java
|
||||
/docs/
|
||||
/frontend - 副本/*
|
||||
/supabase.txt
|
||||
|
||||
**/__pycache__/*
|
||||
**.pyc
|
||||
@@ -2,10 +2,14 @@
|
||||
AI 分析 API 接口
|
||||
"""
|
||||
from fastapi import APIRouter, UploadFile, File, HTTPException, Query, Body
|
||||
from fastapi.responses import StreamingResponse
|
||||
from typing import Optional
|
||||
import logging
|
||||
import tempfile
|
||||
import os
|
||||
|
||||
from app.services.excel_ai_service import excel_ai_service
|
||||
from app.services.markdown_ai_service import markdown_ai_service
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -93,10 +97,11 @@ async def get_analysis_types():
|
||||
获取支持的分析类型列表
|
||||
|
||||
Returns:
|
||||
list: 支持的分析类型
|
||||
dict: 支持的分析类型(包含 Excel 和 Markdown)
|
||||
"""
|
||||
return {
|
||||
"types": excel_ai_service.get_supported_analysis_types()
|
||||
"excel_types": excel_ai_service.get_supported_analysis_types(),
|
||||
"markdown_types": markdown_ai_service.get_supported_analysis_types()
|
||||
}
|
||||
|
||||
|
||||
@@ -142,3 +147,185 @@ async def analyze_text(
|
||||
except Exception as e:
|
||||
logger.error(f"文本分析失败: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail=f"分析失败: {str(e)}")
|
||||
|
||||
|
||||
@router.post("/analyze/md")
async def analyze_markdown(
    file: UploadFile = File(...),
    analysis_type: str = Query("summary", description="分析类型: summary, outline, key_points, questions, tags, qa, statistics, section"),
    user_prompt: str = Query("", description="用户自定义提示词"),
    section_number: Optional[str] = Query(None, description="指定章节编号,如 '一' 或 '(一)'")
):
    """
    Upload a Markdown file and analyze it with the AI service.

    Args:
        file: Uploaded Markdown file (``.md`` or ``.markdown``).
        analysis_type: Kind of analysis to run; must be one of the types
            reported by ``markdown_ai_service.get_supported_analysis_types()``.
        user_prompt: Optional custom prompt supplied by the user.
        section_number: Optional section number restricting the analysis.

    Returns:
        dict: The result returned by ``markdown_ai_service.analyze_markdown``.

    Raises:
        HTTPException: 400 for a missing filename, unsupported extension or
            unsupported analysis type; 500 when the analysis fails.
    """
    if not file.filename:
        raise HTTPException(status_code=400, detail="文件名为空")

    # rpartition tells us whether a dot was present at all, so a bare
    # filename such as "md" is no longer mistaken for a Markdown file.
    _, dot, file_ext = file.filename.rpartition('.')
    file_ext = file_ext.lower()
    if not dot or file_ext not in ('md', 'markdown'):
        raise HTTPException(
            status_code=400,
            detail=f"不支持的文件类型: {file_ext},仅支持 .md 和 .markdown"
        )

    supported_types = markdown_ai_service.get_supported_analysis_types()
    if analysis_type not in supported_types:
        raise HTTPException(
            status_code=400,
            detail=f"不支持的分析类型: {analysis_type},支持的类型: {', '.join(supported_types)}"
        )

    tmp_path = None
    try:
        content = await file.read()

        # The service works on a path, so stage the upload in a temp file.
        # tmp_path is recorded before writing so a failed write is still cleaned up.
        with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as tmp:
            tmp_path = tmp.name
            tmp.write(content)

        logger.info(f"开始分析 Markdown 文件: {file.filename}, 分析类型: {analysis_type}, 章节: {section_number}")

        result = await markdown_ai_service.analyze_markdown(
            file_path=tmp_path,
            analysis_type=analysis_type,
            user_prompt=user_prompt,
            section_number=section_number
        )

        logger.info(f"Markdown 分析完成: {file.filename}, 成功: {result['success']}")

        if not result['success']:
            raise HTTPException(status_code=500, detail=result.get('error', '分析失败'))

        return result

    except HTTPException:
        # Already a well-formed HTTP error; pass it through untouched.
        raise
    except Exception as e:
        logger.error(f"Markdown AI 分析过程中出错: {str(e)}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"分析失败: {str(e)}") from e
    finally:
        # Always remove the staged upload, whatever the outcome.
        if tmp_path and os.path.exists(tmp_path):
            os.unlink(tmp_path)
|
||||
|
||||
|
||||
@router.post("/analyze/md/stream")
async def analyze_markdown_stream(
    file: UploadFile = File(...),
    analysis_type: str = Query("summary", description="分析类型"),
    user_prompt: str = Query("", description="用户自定义提示词"),
    section_number: Optional[str] = Query(None, description="指定章节编号")
):
    """
    Analyze an uploaded Markdown file and stream the result as SSE.

    The upload is staged in a temporary file that is handed to
    ``markdown_ai_service.analyze_markdown_stream``. The response body is
    produced lazily, after this function has returned, so the temporary file
    is removed by the generator itself once streaming ends rather than by
    this function.

    Args:
        file: Uploaded Markdown file (``.md`` or ``.markdown``).
        analysis_type: Kind of analysis to run.
        user_prompt: Optional custom prompt supplied by the user.
        section_number: Optional section number restricting the analysis.

    Returns:
        StreamingResponse: ``text/event-stream`` response.

    Raises:
        HTTPException: 400 for a missing filename or unsupported extension;
            500 when the upload cannot be staged.
    """
    if not file.filename:
        raise HTTPException(status_code=400, detail="文件名为空")

    file_ext = file.filename.split('.')[-1].lower()
    if file_ext not in ['md', 'markdown']:
        raise HTTPException(
            status_code=400,
            detail=f"不支持的文件类型: {file_ext},仅支持 .md 和 .markdown"
        )

    def remove_tmp(path):
        """Delete the staged upload if it still exists."""
        if path and os.path.exists(path):
            os.unlink(path)

    tmp_path = None
    try:
        content = await file.read()
        with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as tmp:
            tmp_path = tmp.name
            tmp.write(content)
    except Exception as e:
        remove_tmp(tmp_path)
        logger.error(f"Markdown AI 流式分析出错: {str(e)}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"流式分析失败: {str(e)}") from e

    logger.info(f"开始流式分析 Markdown 文件: {file.filename}, 分析类型: {analysis_type}")

    async def stream_generator():
        # Runs while the response is being sent, i.e. after the endpoint has
        # returned. Cleanup must live here: deleting the file in the endpoint
        # would remove it before the service ever reads it.
        try:
            async for chunk in markdown_ai_service.analyze_markdown_stream(
                file_path=tmp_path,
                analysis_type=analysis_type,
                user_prompt=user_prompt,
                section_number=section_number
            ):
                yield chunk
        except Exception as e:
            # Headers are already sent, so an HTTP error can no longer be
            # returned; log and let the server abort the stream.
            logger.error(f"Markdown AI 流式分析出错: {str(e)}", exc_info=True)
            raise
        finally:
            remove_tmp(tmp_path)

    return StreamingResponse(
        stream_generator(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "X-Accel-Buffering": "no"
        }
    )
|
||||
|
||||
|
||||
@router.post("/analyze/md/outline")
@router.get("/analyze/md/outline", include_in_schema=False)  # legacy alias, kept for existing callers
async def get_markdown_outline(
    file: UploadFile = File(...)
):
    """
    Return the outline (section structure) of an uploaded Markdown document.

    The endpoint takes a multipart file upload, which needs a request body.
    Browsers and many HTTP clients will not send a body with GET, so POST is
    the primary method; GET stays registered only for backward compatibility.

    Args:
        file: Uploaded Markdown file (``.md`` or ``.markdown``).

    Returns:
        dict: Whatever ``markdown_ai_service.extract_outline`` returns.

    Raises:
        HTTPException: 400 for a missing filename or unsupported extension;
            500 when outline extraction fails.
    """
    if not file.filename:
        raise HTTPException(status_code=400, detail="文件名为空")

    file_ext = file.filename.split('.')[-1].lower()
    if file_ext not in ['md', 'markdown']:
        raise HTTPException(
            status_code=400,
            detail=f"不支持的文件类型: {file_ext},仅支持 .md 和 .markdown"
        )

    tmp_path = None
    try:
        content = await file.read()

        # tmp_path is recorded before writing so a failed write is still cleaned up.
        with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as tmp:
            tmp_path = tmp.name
            tmp.write(content)

        return await markdown_ai_service.extract_outline(tmp_path)

    except HTTPException:
        # Keep the status code of errors that are already HTTP errors.
        raise
    except Exception as e:
        logger.error(f"获取 Markdown 大纲失败: {str(e)}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"获取大纲失败: {str(e)}") from e
    finally:
        if tmp_path and os.path.exists(tmp_path):
            os.unlink(tmp_path)
|
||||
|
||||
@@ -196,7 +196,9 @@ async def process_document(
|
||||
meta={"progress": 50, "message": "正在存储到MySQL并生成字段描述"}
|
||||
)
|
||||
|
||||
try:
|
||||
# 使用 TableRAG 服务完成建表和RAG索引
|
||||
logger.info(f"开始存储Excel到MySQL: {original_filename}, file_path: {file_path}")
|
||||
rag_result = await table_rag_service.build_table_rag_index(
|
||||
file_path=file_path,
|
||||
filename=original_filename,
|
||||
@@ -205,9 +207,11 @@ async def process_document(
|
||||
)
|
||||
|
||||
if rag_result.get("success"):
|
||||
logger.info(f"RAG索引构建成功: {original_filename}")
|
||||
logger.info(f"Excel存储到MySQL成功: {original_filename}, table: {rag_result.get('table_name')}")
|
||||
else:
|
||||
logger.warning(f"RAG索引构建失败: {rag_result.get('error')}")
|
||||
logger.error(f"RAG索引构建失败: {rag_result.get('error')}")
|
||||
except Exception as e:
|
||||
logger.error(f"Excel存储到MySQL异常: {str(e)}", exc_info=True)
|
||||
|
||||
else:
|
||||
# 非结构化文档
|
||||
|
||||
@@ -26,7 +26,16 @@ async def get_task_status(task_id: str):
|
||||
status = await redis_db.get_task_status(task_id)
|
||||
|
||||
if not status:
|
||||
raise HTTPException(status_code=404, detail=f"任务 {task_id} 不存在")
|
||||
# Redis不可用时,假设任务已完成(文档已成功处理)
|
||||
# 前端轮询时会得到这个响应
|
||||
return {
|
||||
"task_id": task_id,
|
||||
"status": "success",
|
||||
"progress": 100,
|
||||
"message": "任务处理完成",
|
||||
"result": None,
|
||||
"error": None
|
||||
}
|
||||
|
||||
return {
|
||||
"task_id": task_id,
|
||||
|
||||
@@ -10,6 +10,7 @@ import io
|
||||
|
||||
from app.services.file_service import file_service
|
||||
from app.core.document_parser import XlsxParser
|
||||
from app.services.table_rag_service import table_rag_service
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -27,7 +28,7 @@ async def upload_excel(
|
||||
header_row: int = Query(0, description="表头所在的行索引")
|
||||
):
|
||||
"""
|
||||
上传并解析 Excel 文件
|
||||
上传并解析 Excel 文件,同时存储到 MySQL 数据库
|
||||
|
||||
Args:
|
||||
file: 上传的 Excel 文件
|
||||
@@ -77,6 +78,23 @@ async def upload_excel(
|
||||
result.metadata['saved_path'] = saved_path
|
||||
result.metadata['original_filename'] = file.filename
|
||||
|
||||
# 存储到 MySQL 数据库
|
||||
try:
|
||||
store_result = await table_rag_service.build_table_rag_index(
|
||||
file_path=saved_path,
|
||||
filename=file.filename,
|
||||
sheet_name=sheet_name if sheet_name else None,
|
||||
header_row=header_row
|
||||
)
|
||||
if store_result.get("success"):
|
||||
result.metadata['mysql_table'] = store_result.get('table_name')
|
||||
result.metadata['row_count'] = store_result.get('row_count')
|
||||
logger.info(f"Excel已存储到MySQL: {file.filename}, 表: {store_result.get('table_name')}")
|
||||
else:
|
||||
logger.warning(f"Excel存储到MySQL失败: {store_result.get('error')}")
|
||||
except Exception as e:
|
||||
logger.error(f"Excel存储到MySQL异常: {str(e)}", exc_info=True)
|
||||
|
||||
return result.to_dict()
|
||||
|
||||
except HTTPException:
|
||||
|
||||
@@ -29,6 +29,9 @@ class Settings(BaseSettings):
|
||||
LLM_BASE_URL: str = "https://api.minimax.chat"
|
||||
LLM_MODEL_NAME: str = "MiniMax-Text-01"
|
||||
|
||||
# ==================== RAG/Embedding 配置 ====================
|
||||
EMBEDDING_MODEL: str = "all-MiniLM-L6-v2"
|
||||
|
||||
# ==================== Supabase 配置 ====================
|
||||
SUPABASE_URL: str = ""
|
||||
SUPABASE_ANON_KEY: str = ""
|
||||
|
||||
@@ -87,8 +87,10 @@ class MongoDB:
|
||||
"updated_at": datetime.utcnow(),
|
||||
}
|
||||
result = await self.documents.insert_one(document)
|
||||
logger.info(f"文档已插入MongoDB: {result.inserted_id}")
|
||||
return str(result.inserted_id)
|
||||
doc_id = str(result.inserted_id)
|
||||
filename = metadata.get("original_filename", "unknown")
|
||||
logger.info(f"✓ 文档已存入MongoDB: [{doc_type}] {filename} | ID: {doc_id}")
|
||||
return doc_id
|
||||
|
||||
async def get_document(self, doc_id: str) -> Optional[Dict[str, Any]]:
|
||||
"""根据ID获取文档"""
|
||||
|
||||
@@ -16,6 +16,7 @@ from sqlalchemy import (
|
||||
String,
|
||||
Text,
|
||||
create_engine,
|
||||
text,
|
||||
)
|
||||
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
|
||||
from sqlalchemy.orm import DeclarativeBase, sessionmaker
|
||||
@@ -72,6 +73,26 @@ class MySQLDB:
|
||||
async def init_db(self):
|
||||
"""初始化数据库,创建所有表"""
|
||||
try:
|
||||
# 先创建数据库(如果不存在)
|
||||
from sqlalchemy import text
|
||||
db_name = settings.MYSQL_DATABASE
|
||||
# 连接时不指定数据库来创建数据库
|
||||
temp_url = (
|
||||
f"mysql+aiomysql://{settings.MYSQL_USER}:{settings.MYSQL_PASSWORD}"
|
||||
f"@{settings.MYSQL_HOST}:{settings.MYSQL_PORT}/"
|
||||
f"?charset={settings.MYSQL_CHARSET}"
|
||||
)
|
||||
from sqlalchemy.ext.asyncio import create_async_engine
|
||||
temp_engine = create_async_engine(temp_url, echo=False)
|
||||
try:
|
||||
async with temp_engine.connect() as conn:
|
||||
await conn.execute(text(f"CREATE DATABASE IF NOT EXISTS `{db_name}` CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci"))
|
||||
await conn.commit()
|
||||
logger.info(f"MySQL 数据库 {db_name} 创建或已存在")
|
||||
finally:
|
||||
await temp_engine.dispose()
|
||||
|
||||
# 然后创建表
|
||||
async with self.async_engine.begin() as conn:
|
||||
await conn.run_sync(Base.metadata.create_all)
|
||||
logger.info("MySQL 数据库表初始化完成")
|
||||
|
||||
@@ -2,23 +2,143 @@
|
||||
FastAPI 应用主入口
|
||||
"""
|
||||
import logging
|
||||
import logging.handlers
|
||||
import sys
|
||||
import uuid
|
||||
from contextlib import asynccontextmanager
|
||||
from typing import Callable
|
||||
from functools import wraps
|
||||
|
||||
from fastapi import FastAPI
|
||||
from fastapi import FastAPI, Request, Response
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from starlette.middleware.base import BaseHTTPMiddleware
|
||||
|
||||
from app.config import settings
|
||||
from app.api import api_router
|
||||
from app.core.database import mysql_db, mongodb, redis_db
|
||||
|
||||
# 配置日志
|
||||
logging.basicConfig(
|
||||
level=logging.INFO if settings.DEBUG else logging.WARNING,
|
||||
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
||||
)
|
||||
# ==================== 日志配置 ====================
|
||||
|
||||
def setup_logging():
    """
    Configure the application's logging.

    Installs three handlers on the root logger:

    * a console handler on stdout (DEBUG when ``settings.DEBUG`` is true,
      otherwise INFO);
    * a rotating ``app.log`` file handler that records everything from DEBUG up;
    * a rotating ``error.log`` file handler that records ERROR and above.

    Handlers already attached to the root logger are removed and closed, so
    calling this function again neither duplicates output nor leaks open
    log files.

    Returns:
        logging.Logger: The configured root logger.
    """
    from pathlib import Path

    log_level = logging.DEBUG if settings.DEBUG else logging.INFO

    # NOTE(review): the path is relative, so it resolves against the process
    # working directory - confirm the service is always started from backend/.
    log_dir = Path("data/logs")
    log_dir.mkdir(parents=True, exist_ok=True)

    log_file = log_dir / "app.log"
    error_log_file = log_dir / "error.log"

    # Console handler
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setLevel(log_level)
    console_formatter = logging.Formatter(
        fmt="%(asctime)s | %(levelname)-8s | %(name)s:%(lineno)d | %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S"
    )
    console_handler.setFormatter(console_formatter)

    # File handler (all records)
    file_handler = logging.handlers.RotatingFileHandler(
        log_file,
        maxBytes=10 * 1024 * 1024,  # 10MB
        backupCount=5,
        encoding="utf-8"
    )
    file_handler.setLevel(logging.DEBUG)
    file_formatter = logging.Formatter(
        fmt="%(asctime)s | %(levelname)-8s | %(name)s:%(lineno)d | %(funcName)s | %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S"
    )
    file_handler.setFormatter(file_formatter)

    # Error file handler (ERROR and above only)
    error_file_handler = logging.handlers.RotatingFileHandler(
        error_log_file,
        maxBytes=10 * 1024 * 1024,  # 10MB
        backupCount=5,
        encoding="utf-8"
    )
    error_file_handler.setLevel(logging.ERROR)
    error_file_handler.setFormatter(file_formatter)

    # Root logger: the level is DEBUG so each handler applies its own threshold.
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.DEBUG)
    for stale_handler in list(root_logger.handlers):
        # Close replaced handlers so their file descriptors are released.
        root_logger.removeHandler(stale_handler)
        stale_handler.close()
    root_logger.addHandler(console_handler)
    root_logger.addHandler(file_handler)
    root_logger.addHandler(error_file_handler)

    # Quieten chatty third-party libraries.
    for lib in ["uvicorn", "uvicorn.access", "fastapi", "httpx", "sqlalchemy"]:
        logging.getLogger(lib).setLevel(logging.WARNING)

    root_logger.info(f"日志系统初始化完成 | 日志目录: {log_dir}")
    root_logger.info(f"主日志文件: {log_file} | 错误日志: {error_log_file}")

    return root_logger
|
||||
|
||||
# 初始化日志
|
||||
setup_logging()
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ==================== 请求日志中间件 ====================
|
||||
|
||||
class RequestLoggingMiddleware(BaseHTTPMiddleware):
    """Request logging middleware: logs every request and its outcome."""

    async def dispatch(self, request: Request, call_next: Callable) -> Response:
        """
        Log the request, delegate to the next handler and log the response.

        A short request id is stored on ``request.state.request_id`` and
        returned to the client in the ``X-Request-ID`` response header.

        Args:
            request: Incoming request.
            call_next: Callable that invokes the rest of the stack.

        Returns:
            Response: The downstream response with ``X-Request-ID`` added.

        Raises:
            Exception: Any exception from downstream is logged and re-raised.
        """
        import time  # local import: `time` is not among this module's top-level imports

        # First 8 characters of a UUID4 are enough to correlate log lines.
        request_id = str(uuid.uuid4())[:8]
        request.state.request_id = request_id

        logger.info(f"→ [{request_id}] {request.method} {request.url.path}")

        started_at = time.perf_counter()
        try:
            response = await call_next(request)
        except Exception as e:
            logger.error(
                f"✗ [{request_id}] {request.method} {request.url.path} | 异常: {str(e)}",
                exc_info=True
            )
            raise

        # Measured up to the point where call_next returns the response object.
        elapsed_ms = (time.perf_counter() - started_at) * 1000
        logger.info(
            f"← [{request_id}] {request.method} {request.url.path} "
            f"| 状态: {response.status_code} | 耗时: {elapsed_ms:.1f}ms"
        )

        response.headers["X-Request-ID"] = request_id
        return response
|
||||
|
||||
|
||||
# ==================== 请求追踪装饰器 ====================
|
||||
|
||||
def log_async_function(func: Callable) -> Callable:
    """
    Decorator that logs the start, completion and failure of a coroutine function.

    Args:
        func: Coroutine function to wrap.

    Returns:
        Callable: Wrapper that awaits ``func`` with the same arguments, returns
        its result and re-raises any exception after logging it.
    """
    @wraps(func)
    async def wrapper(*call_args, **call_kwargs):
        target_name = func.__name__
        logger.debug(f"→ {target_name} 开始执行")
        try:
            outcome = await func(*call_args, **call_kwargs)
            logger.debug(f"← {target_name} 执行完成")
        except Exception as exc:
            # Log and propagate; the caller decides how to handle the failure.
            logger.error(f"✗ {target_name} 执行失败: {str(exc)}")
            raise
        return outcome

    return wrapper
|
||||
|
||||
|
||||
@asynccontextmanager
|
||||
async def lifespan(app: FastAPI):
|
||||
"""
|
||||
@@ -83,6 +203,9 @@ app.add_middleware(
|
||||
allow_headers=["*"],
|
||||
)
|
||||
|
||||
# 添加请求日志中间件
|
||||
app.add_middleware(RequestLoggingMiddleware)
|
||||
|
||||
# 注册 API 路由
|
||||
app.include_router(api_router, prefix=settings.API_V1_STR)
|
||||
|
||||
|
||||
@@ -17,12 +17,15 @@ from sqlalchemy import (
|
||||
String,
|
||||
Text,
|
||||
inspect,
|
||||
text,
|
||||
)
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.core.database.mysql import Base, mysql_db
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
# 设置该模块的日志级别
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
|
||||
class ExcelStorageService:
|
||||
@@ -31,6 +34,123 @@ class ExcelStorageService:
|
||||
def __init__(self):
|
||||
self.mysql_db = mysql_db
|
||||
|
||||
def _extract_sheet_names_from_xml(self, file_path: str) -> list:
|
||||
"""从 Excel 文件的 XML 中提取工作表名称"""
|
||||
import zipfile
|
||||
from xml.etree import ElementTree as ET
|
||||
|
||||
try:
|
||||
with zipfile.ZipFile(file_path, 'r') as z:
|
||||
if 'xl/workbook.xml' not in z.namelist():
|
||||
return []
|
||||
content = z.read('xl/workbook.xml')
|
||||
root = ET.fromstring(content)
|
||||
ns = {'main': 'http://purl.oclc.org/ooxml/spreadsheetml/main'}
|
||||
sheets = root.findall('.//main:sheet', ns)
|
||||
return [s.get('name') for s in sheets if s.get('name')]
|
||||
except Exception:
|
||||
return []
|
||||
|
||||
def _read_excel_sheet(self, file_path: str, sheet_name: str = None, header_row: int = 0) -> pd.DataFrame:
    """
    Read one worksheet into a DataFrame, with an XML fallback.

    pandas is tried first. If it raises or yields an empty frame, the
    workbook is opened as a zip archive and the worksheet XML is parsed
    directly.

    Args:
        file_path: Path to the Excel (.xlsx) file.
        sheet_name: Worksheet to read; the first worksheet when omitted or
            (in the XML fallback) not found.
        header_row: Zero-based index of the header row. Rows up to and
            including it are excluded from the data.

    Returns:
        pd.DataFrame: The sheet contents. In the XML fallback all non-boolean
        cell values are strings, and the frame is empty when no data rows
        exist.

    Raises:
        ValueError: If the XML fallback cannot find any worksheet.
        Exception: Any other XML fallback error is logged and re-raised.
    """
    import zipfile
    from xml.etree import ElementTree as ET

    try:
        # sheet_name=None would make pandas return a dict of every sheet,
        # so default explicitly to the first sheet (index 0).
        df = pd.read_excel(
            file_path,
            sheet_name=sheet_name if sheet_name else 0,
            header=header_row
        )
        if df is not None and not df.empty:
            return df
    except Exception as e:
        # Not fatal: record why pandas failed, then try the XML fallback.
        logger.warning(f"pandas 读取 Excel 失败,改用 XML 方式: {e}")

    logger.info(f"使用 XML 方式读取 Excel: {file_path}")

    def qname(uri, tag):
        """Build an ElementTree-qualified tag name for the given namespace."""
        return f'{{{uri}}}{tag}' if uri else tag

    def parse_part(raw):
        """Parse an XML part; return its root and the root's namespace URI."""
        part_root = ET.fromstring(raw)
        uri = part_root.tag[1:].partition('}')[0] if part_root.tag.startswith('{') else ''
        return part_root, uri

    def shared_text(si, uri):
        """Return the text of one shared-string item, joining rich-text runs."""
        plain = si.find(qname(uri, 't'))
        if plain is not None:
            return plain.text
        runs = [
            t.text or ''
            for run in si.findall(qname(uri, 'r'))
            for t in run.findall(qname(uri, 't'))
        ]
        return ''.join(runs)

    def cell_value(cell, uri, shared_strings, coerce_bool=True):
        """Resolve a cell to its value, or None when the cell has no value."""
        v = cell.find(qname(uri, 'v'))
        if v is None or not v.text:
            return None
        cell_type = cell.get('t', 'n')
        if cell_type == 's':
            # Shared-string cells store an index into the shared string table.
            try:
                return shared_strings[int(v.text)]
            except (ValueError, IndexError):
                return v.text
        if cell_type == 'b' and coerce_bool:
            return v.text == '1'
        return v.text

    try:
        with zipfile.ZipFile(file_path, 'r') as z:
            sheet_names = self._extract_sheet_names_from_xml(file_path)
            if not sheet_names:
                raise ValueError("无法从 Excel 文件中找到工作表")

            target_sheet = sheet_name if sheet_name and sheet_name in sheet_names else sheet_names[0]
            # NOTE(review): assumes the n-th declared sheet is stored as
            # sheet{n}.xml; the authoritative mapping is in
            # xl/_rels/workbook.xml.rels - confirm for the files in use.
            sheet_index = sheet_names.index(target_sheet) + 1

            shared_strings = []
            if 'xl/sharedStrings.xml' in z.namelist():
                ss_root, ss_uri = parse_part(z.read('xl/sharedStrings.xml'))
                for si in ss_root.iter(qname(ss_uri, 'si')):
                    shared_strings.append(shared_text(si, ss_uri))

            root, uri = parse_part(z.read(f'xl/worksheets/sheet{sheet_index}.xml'))

            # The "r" attribute is 1-based; header_row is 0-based.
            header_ref = str(header_row + 1)
            header_elem = None
            rows_data = []
            for row in root.iter(qname(uri, 'row')):
                if header_elem is None and row.get('r') == header_ref:
                    header_elem = row

                row_idx = int(row.get('r', 0))
                if row_idx <= header_row + 1:
                    continue

                row_cells = {}
                for cell in row.findall(qname(uri, 'c')):
                    # The cell reference is e.g. "B7"; keep only the column letters.
                    col_letters = ''.join(filter(str.isalpha, cell.get('r', '')))
                    row_cells[col_letters] = cell_value(cell, uri, shared_strings)

                if row_cells:
                    rows_data.append(row_cells)

            if not rows_data:
                return pd.DataFrame()

            df = pd.DataFrame(rows_data)

            if header_row >= 0 and header_elem is not None:
                headers = {}
                for cell in header_elem.findall(qname(uri, 'c')):
                    col_letters = ''.join(filter(str.isalpha, cell.get('r', '')))
                    header_text = cell_value(cell, uri, shared_strings, coerce_bool=False)
                    if header_text is not None:
                        headers[col_letters] = header_text
                # Columns without a header cell keep their letter as the name.
                df.columns = [headers.get(col, col) for col in df.columns]

            return df
    except Exception as e:
        logger.error(f"XML 解析 Excel 失败: {e}")
        raise
|
||||
|
||||
def _sanitize_table_name(self, filename: str) -> str:
|
||||
"""
|
||||
将文件名转换为合法的表名
|
||||
@@ -64,15 +184,44 @@ class ExcelStorageService:
|
||||
Returns:
|
||||
合法的字段名
|
||||
"""
|
||||
# 只保留字母、数字、下划线
|
||||
name = re.sub(r'[^a-zA-Z0-9_]', '_', str(col_name))
|
||||
|
||||
# 确保以字母开头
|
||||
# MySQL 支持 UTF8 编码,中文字符可以直接使用
|
||||
# 只处理非法字符(控制字符等)和首字符数字
|
||||
name = str(col_name).strip()
|
||||
# 移除控制字符
|
||||
name = re.sub(r'[\x00-\x1f\x7f]', '', name)
|
||||
# 确保以字母或中文开头
|
||||
if name and name[0].isdigit():
|
||||
name = 'col_' + name
|
||||
# 限制长度 (MySQL 字段名最多64字符)
|
||||
return name[:64]
|
||||
|
||||
# 限制长度
|
||||
return name[:50]
|
||||
def _get_unique_column_name(self, col_name: str, used_names: set) -> str:
|
||||
"""
|
||||
获取唯一的列名,避免重复
|
||||
|
||||
Args:
|
||||
col_name: 原始列名
|
||||
used_names: 已使用的列名集合
|
||||
|
||||
Returns:
|
||||
唯一的列名
|
||||
"""
|
||||
sanitized = self._sanitize_column_name(col_name)
|
||||
# "id" 是 MySQL 保留名,作为主键使用
|
||||
if sanitized.lower() == "id":
|
||||
sanitized = "col_id"
|
||||
if sanitized not in used_names:
|
||||
used_names.add(sanitized)
|
||||
return sanitized
|
||||
|
||||
# 添加数字后缀直到唯一
|
||||
base = sanitized if sanitized else "col"
|
||||
counter = 1
|
||||
while f"{base}_{counter}" in used_names:
|
||||
counter += 1
|
||||
unique_name = f"{base}_{counter}"
|
||||
used_names.add(unique_name)
|
||||
return unique_name
|
||||
|
||||
def _infer_column_type(self, series: pd.Series) -> str:
|
||||
"""
|
||||
@@ -84,12 +233,35 @@ class ExcelStorageService:
|
||||
Returns:
|
||||
类型名称
|
||||
"""
|
||||
# 移除空值进行类型检查
|
||||
non_null = series.dropna()
|
||||
if len(non_null) == 0:
|
||||
return "TEXT"
|
||||
|
||||
dtype = series.dtype
|
||||
|
||||
# 整数类型检查
|
||||
if pd.api.types.is_integer_dtype(dtype):
|
||||
# 检查是否所有值都能放入 INT 范围
|
||||
try:
|
||||
int_values = non_null.astype('int64')
|
||||
if int_values.min() >= -2147483648 and int_values.max() <= 2147483647:
|
||||
return "INTEGER"
|
||||
else:
|
||||
# 超出 INT 范围,使用 TEXT
|
||||
return "TEXT"
|
||||
except (ValueError, OverflowError):
|
||||
return "TEXT"
|
||||
elif pd.api.types.is_float_dtype(dtype):
|
||||
# 检查是否所有值都能放入 FLOAT
|
||||
try:
|
||||
float_values = non_null.astype('float64')
|
||||
if float_values.min() >= -1e308 and float_values.max() <= 1e308:
|
||||
return "FLOAT"
|
||||
else:
|
||||
return "TEXT"
|
||||
except (ValueError, OverflowError):
|
||||
return "TEXT"
|
||||
elif pd.api.types.is_datetime64_any_dtype(dtype):
|
||||
return "DATETIME"
|
||||
elif pd.api.types.is_bool_dtype(dtype):
|
||||
@@ -174,11 +346,11 @@ class ExcelStorageService:
|
||||
}
|
||||
|
||||
try:
|
||||
# 读取 Excel
|
||||
if sheet_name:
|
||||
df = pd.read_excel(file_path, sheet_name=sheet_name, header=header_row)
|
||||
else:
|
||||
df = pd.read_excel(file_path, header=header_row)
|
||||
logger.info(f"开始读取Excel文件: {file_path}")
|
||||
# 读取 Excel(使用 fallback 方式支持特殊格式文件)
|
||||
df = self._read_excel_sheet(file_path, sheet_name=sheet_name, header_row=header_row)
|
||||
|
||||
logger.info(f"Excel读取完成,行数: {len(df)}, 列数: {len(df.columns)}")
|
||||
|
||||
if df.empty:
|
||||
return {"success": False, "error": "Excel 文件为空"}
|
||||
@@ -186,31 +358,41 @@ class ExcelStorageService:
|
||||
# 清理列名
|
||||
df.columns = [str(c) for c in df.columns]
|
||||
|
||||
# 推断列类型
|
||||
# 推断列类型,并生成唯一的列名
|
||||
column_types = {}
|
||||
column_name_map = {} # 原始列名 -> 唯一合法列名
|
||||
used_names = set()
|
||||
for col in df.columns:
|
||||
col_name = self._sanitize_column_name(col)
|
||||
col_name = self._get_unique_column_name(col, used_names)
|
||||
col_type = self._infer_column_type(df[col])
|
||||
column_types[col] = col_type
|
||||
column_name_map[col] = col_name
|
||||
results["columns"].append({
|
||||
"original_name": col,
|
||||
"sanitized_name": col_name,
|
||||
"type": col_type
|
||||
})
|
||||
|
||||
# 创建表
|
||||
model_class = self._create_table_model(table_name, df.columns, column_types)
|
||||
|
||||
# 创建表结构
|
||||
async with self.mysql_db.get_session() as session:
|
||||
model_class.__table__.create(session.bind, checkfirst=True)
|
||||
# 创建表 - 使用原始 SQL 以兼容异步
|
||||
logger.info(f"正在创建MySQL表: {table_name}")
|
||||
sql_columns = ["id INT AUTO_INCREMENT PRIMARY KEY"]
|
||||
for col in df.columns:
|
||||
col_name = column_name_map[col]
|
||||
col_type = column_types.get(col, "TEXT")
|
||||
sql_type = "INT" if col_type == "INTEGER" else "FLOAT" if col_type == "FLOAT" else "DATETIME" if col_type == "DATETIME" else "TEXT"
|
||||
sql_columns.append(f"`{col_name}` {sql_type}")
|
||||
sql_columns.append("created_at DATETIME DEFAULT CURRENT_TIMESTAMP")
|
||||
sql_columns.append("updated_at DATETIME DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP")
|
||||
create_sql = text(f"CREATE TABLE IF NOT EXISTS `{table_name}` ({', '.join(sql_columns)})")
|
||||
await self.mysql_db.execute_raw_sql(str(create_sql))
|
||||
logger.info(f"MySQL表创建完成: {table_name}")
|
||||
|
||||
# 插入数据
|
||||
records = []
|
||||
for _, row in df.iterrows():
|
||||
record = {}
|
||||
for col in df.columns:
|
||||
col_name = self._sanitize_column_name(col)
|
||||
col_name = column_name_map[col]
|
||||
value = row[col]
|
||||
|
||||
# 处理 NaN 值
|
||||
@@ -231,11 +413,33 @@ class ExcelStorageService:
|
||||
|
||||
records.append(record)
|
||||
|
||||
# 批量插入
|
||||
async with self.mysql_db.get_session() as session:
|
||||
for record in records:
|
||||
session.add(model_class(**record))
|
||||
await session.commit()
|
||||
logger.info(f"正在插入 {len(records)} 条数据到 MySQL (使用批量插入)...")
|
||||
# 使用 pymysql 直接插入以避免 SQLAlchemy 异步问题
|
||||
import pymysql
|
||||
from app.config import settings
|
||||
|
||||
connection = pymysql.connect(
|
||||
host=settings.MYSQL_HOST,
|
||||
port=settings.MYSQL_PORT,
|
||||
user=settings.MYSQL_USER,
|
||||
password=settings.MYSQL_PASSWORD,
|
||||
database=settings.MYSQL_DATABASE,
|
||||
charset=settings.MYSQL_CHARSET
|
||||
)
|
||||
try:
|
||||
columns_str = ', '.join(['`' + column_name_map[col] + '`' for col in df.columns])
|
||||
placeholders = ', '.join(['%s' for _ in df.columns])
|
||||
insert_sql = f"INSERT INTO `{table_name}` ({columns_str}) VALUES ({placeholders})"
|
||||
|
||||
# 转换为元组列表 (使用映射后的列名)
|
||||
param_list = [tuple(record.get(column_name_map[col]) for col in df.columns) for record in records]
|
||||
|
||||
with connection.cursor() as cursor:
|
||||
cursor.executemany(insert_sql, param_list)
|
||||
connection.commit()
|
||||
logger.info(f"数据插入完成: {len(records)} 条")
|
||||
finally:
|
||||
connection.close()
|
||||
|
||||
results["row_count"] = len(records)
|
||||
logger.info(f"Excel 数据已存储到 MySQL 表 {table_name},共 {len(records)} 行")
|
||||
@@ -243,7 +447,7 @@ class ExcelStorageService:
|
||||
return results
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"存储 Excel 到 MySQL 失败: {str(e)}")
|
||||
logger.error(f"存储 Excel 到 MySQL 失败: {str(e)}", exc_info=True)
|
||||
return {"success": False, "error": str(e)}
|
||||
|
||||
async def store_structured_data(
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
"""
|
||||
import os
|
||||
import shutil
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
@@ -10,6 +11,8 @@ import uuid
|
||||
|
||||
from app.config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class FileService:
|
||||
"""文件服务类,负责文件的存储、读取和管理"""
|
||||
@@ -17,6 +20,7 @@ class FileService:
|
||||
def __init__(self):
|
||||
self.upload_dir = Path(settings.UPLOAD_DIR)
|
||||
self._ensure_upload_dir()
|
||||
logger.info(f"FileService 初始化,上传目录: {self.upload_dir}")
|
||||
|
||||
def _ensure_upload_dir(self):
|
||||
"""确保上传目录存在"""
|
||||
@@ -56,6 +60,8 @@ class FileService:
|
||||
with open(file_path, 'wb') as f:
|
||||
f.write(file_content)
|
||||
|
||||
file_size = len(file_content)
|
||||
logger.info(f"文件已保存: {filename} -> {file_path} ({file_size} bytes)")
|
||||
return str(file_path)
|
||||
|
||||
def read_file(self, file_path: str) -> bytes:
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
LLM 服务模块 - 封装大模型 API 调用
|
||||
"""
|
||||
import logging
|
||||
from typing import Dict, Any, List, Optional
|
||||
from typing import Dict, Any, List, Optional, AsyncGenerator
|
||||
import httpx
|
||||
|
||||
from app.config import settings
|
||||
@@ -87,6 +87,71 @@ class LLMService:
|
||||
logger.error(f"解析 API 响应失败: {str(e)}")
|
||||
raise
|
||||
|
||||
async def chat_stream(
    self,
    messages: List[Dict[str, str]],
    temperature: float = 0.7,
    max_tokens: Optional[int] = None,
    **kwargs
) -> AsyncGenerator[Dict[str, Any], None]:
    """
    Call the chat completion API in streaming mode.

    Args:
        messages: Chat messages to send.
        temperature: Sampling temperature.
        max_tokens: Maximum number of tokens; omitted from the request when falsy.
        **kwargs: Extra fields merged into the request payload.

    Yields:
        Dict[str, Any]: ``{"content": <text>}`` for every non-empty delta.

    Raises:
        httpx.HTTPStatusError: If the API answers with a 4xx/5xx status.
        Exception: Any other transport error is logged and re-raised.
    """
    import json as json_module

    headers = {
        "Authorization": f"Bearer {self.api_key}",
        "Content-Type": "application/json"
    }

    payload = {
        "model": self.model_name,
        "messages": messages,
        "temperature": temperature,
        "stream": True
    }

    if max_tokens:
        payload["max_tokens"] = max_tokens

    payload.update(kwargs)

    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            async with client.stream(
                "POST",
                f"{self.base_url}/chat/completions",
                headers=headers,
                json=payload
            ) as response:
                # Without this check an error response (whose body has no
                # "data:" lines) would look like an empty, successful stream.
                response.raise_for_status()

                async for line in response.aiter_lines():
                    # SSE data lines are "data:<payload>"; the space after
                    # the colon is optional.
                    if not line.startswith("data:"):
                        continue
                    data = line[5:].strip()
                    if data == "[DONE]":
                        break
                    try:
                        chunk = json_module.loads(data)
                    except json_module.JSONDecodeError:
                        # Skip keep-alives and malformed fragments.
                        continue
                    if not isinstance(chunk, dict):
                        continue
                    # Guard against chunks with an empty "choices" list or a
                    # null "delta", so they are skipped rather than raising.
                    choices = chunk.get("choices") or [{}]
                    delta = (choices[0].get("delta") or {}).get("content", "")
                    if delta:
                        yield {"content": delta}

    except httpx.HTTPStatusError as e:
        logger.error(f"LLM 流式 API 请求失败: {e.response.status_code}")
        raise
    except Exception as e:
        logger.error(f"LLM 流式 API 调用异常: {str(e)}")
        raise
|
||||
|
||||
async def analyze_excel_data(
|
||||
self,
|
||||
excel_data: Dict[str, Any],
|
||||
|
||||
707
backend/app/services/markdown_ai_service.py
Normal file
707
backend/app/services/markdown_ai_service.py
Normal file
@@ -0,0 +1,707 @@
|
||||
"""
|
||||
Markdown 文档 AI 分析服务
|
||||
|
||||
支持:
|
||||
- 分章节解析(中文章节编号:一、二、三, (一)(二)(三))
|
||||
- 结构化数据提取
|
||||
- 流式输出
|
||||
- 多种分析类型
|
||||
- 可视化图表生成
|
||||
"""
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from typing import Any, AsyncGenerator, Dict, List, Optional
|
||||
|
||||
from app.services.llm_service import llm_service
|
||||
from app.core.document_parser import MarkdownParser
|
||||
from app.services.visualization_service import visualization_service
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MarkdownSection:
    """One node in a document's section tree."""

    def __init__(self, number: str, title: str, level: int, content: str, line_start: int, line_end: int):
        # Position of the section in the document.
        self.line_start = line_start
        self.line_end = line_end
        # Heading data: the number as written (e.g. "一", "1"), the heading
        # text, and the nesting depth.
        self.number = number
        self.title = title
        self.level = level
        # Body text of the section.
        self.content = content
        # Child sections, filled in by whoever builds the tree.
        self.subsections: List["MarkdownSection"] = []

    def to_dict(self) -> Dict[str, Any]:
        """Serialise the section (and, recursively, its children) to plain data.

        The body is exposed only as ``content_preview``: the first 200
        characters followed by "..." when the text is longer than that.
        """
        body = self.content
        if len(body) > 200:
            preview = body[:200] + "..."
        else:
            preview = body

        children = []
        for child in self.subsections:
            children.append(child.to_dict())

        return {
            "number": self.number,
            "title": self.title,
            "level": self.level,
            "content_preview": preview,
            "line_start": self.line_start,
            "line_end": self.line_end,
            "subsections": children
        }
|
||||
|
||||
|
||||
class MarkdownAIService:
|
||||
"""Markdown 文档 AI 分析服务"""
|
||||
|
||||
# Heading patterns for Chinese-style section numbering.
# In every pattern group(1) is the section number and group(2) the title.
CHINESE_NUMBERS = ["一", "二", "三", "四", "五", "六", "七", "八", "九", "十"]
CHINESE_SUFFIX = "、"
# Level 2: a Chinese numeral wrapped in parentheses, e.g. "(一)标题".
# Both ASCII "()" and full-width "()" are accepted; the parentheses are
# matched literally via character classes so the groups stay balanced.
PARENTHESIS_PATTERN = re.compile(r'^[((]([一二三四五六七八九十]+)[))]\s*(.+)$')
# Level 1: a Chinese numeral followed by "、", e.g. "一、标题".
CHINESE_SECTION_PATTERN = re.compile(r'^([一二三四五六七八九十]+)、\s*(.+)$')
# Level 3: an Arabic numeral followed by "." and whitespace, e.g. "1. 标题".
ARABIC_SECTION_PATTERN = re.compile(r'^(\d+)\.\s+(.+)$')
|
||||
|
||||
def __init__(self):
|
||||
self.parser = MarkdownParser()
|
||||
|
||||
def get_supported_analysis_types(self) -> list:
    """Return the identifiers of every analysis mode this service offers."""
    catalog = (
        ("summary", "document summary"),
        ("outline", "outline extraction"),
        ("key_points", "key-point extraction"),
        ("questions", "question generation"),
        ("tags", "tag generation"),
        ("qa", "question/answer pairs"),
        ("statistics", "statistical data analysis (suits government bulletins)"),
        ("section", "detailed per-section analysis"),
        ("charts", "chart / visualisation data generation"),
    )
    # Only the identifiers are part of the result; the descriptions above
    # are documentation for maintainers.
    return [identifier for identifier, _purpose in catalog]
|
||||
|
||||
def extract_sections(self, content: str, titles: List[Dict]) -> List[MarkdownSection]:
    """
    Build the section tree of a document from its heading lines.

    Each stripped line is tested against three heading styles:
    - level 1: Chinese numeral + "、" (CHINESE_SECTION_PATTERN)
    - level 2: parenthesised Chinese numeral (PARENTHESIS_PATTERN);
      only accepted while a level-1 section is open
    - level 3: Arabic numeral + "." (ARABIC_SECTION_PATTERN);
      only accepted while a level-2 section is open

    Args:
        content: Full document text.
        titles: Heading records from the parser. Not used by this
            method; kept so existing callers keep working.

    Returns:
        The level-1 sections in document order, each with its nested
        subsections. For every section ``line_start``/``line_end`` are
        1-based inclusive line numbers and ``content`` is the text of
        that range with heading lines and blank lines removed (text of
        nested sections is included).
    """
    sections: List[MarkdownSection] = []
    lines = content.split('\n')

    # Chain of currently open sections, outermost first. Levels strictly
    # increase along the chain: [L1], [L1, L2] or [L1, L2, L3].
    open_chain: List[MarkdownSection] = []

    def close_down_to(level: int, last_line: int) -> None:
        """Finalize every open section whose level is >= ``level``."""
        while open_chain and open_chain[-1].level >= level:
            finished = open_chain.pop()
            finished.line_end = last_line
            finished.content = self._get_section_content(
                lines, finished.line_start, last_line
            )

    for i, line in enumerate(lines, 1):
        stripped = line.strip()

        level = 0
        match = self.CHINESE_SECTION_PATTERN.match(stripped)
        if match:
            level = 1
        else:
            match = self.PARENTHESIS_PATTERN.match(stripped)
            if match:
                # A level-2 heading needs an enclosing level-1 section.
                if open_chain:
                    level = 2
            else:
                match = self.ARABIC_SECTION_PATTERN.match(stripped)
                # A level-3 heading needs an enclosing level-2 section.
                if match and len(open_chain) >= 2:
                    level = 3

        if not level:
            continue

        # A new heading ends every open section at the same or a deeper
        # level; shallower ancestors stay open and become its parent.
        close_down_to(level, i - 1)

        node = MarkdownSection(
            number=match.group(1),
            title=match.group(2),
            level=level,
            content="",
            line_start=i,
            line_end=len(lines)
        )
        if open_chain:
            open_chain[-1].subsections.append(node)
        else:
            sections.append(node)
        open_chain.append(node)

    # Everything still open runs to the end of the document.
    close_down_to(1, len(lines))

    return sections
|
||||
|
||||
def _get_section_content(self, lines: List[str], start: int, end: int) -> str:
|
||||
"""获取指定行范围的内容"""
|
||||
if start > end:
|
||||
return ""
|
||||
content_lines = lines[start-1:end]
|
||||
# 清理:移除标题行和空行
|
||||
cleaned = []
|
||||
for line in content_lines:
|
||||
stripped = line.strip()
|
||||
if not stripped:
|
||||
continue
|
||||
# 跳过章节标题行
|
||||
if self.CHINESE_SECTION_PATTERN.match(stripped):
|
||||
continue
|
||||
if self.PARENTHESIS_PATTERN.match(stripped):
|
||||
continue
|
||||
if self.ARABIC_SECTION_PATTERN.match(stripped):
|
||||
continue
|
||||
cleaned.append(stripped)
|
||||
return '\n'.join(cleaned)
|
||||
|
||||
async def analyze_markdown(
    self,
    file_path: str,
    analysis_type: str = "summary",
    user_prompt: str = "",
    section_number: Optional[str] = None
) -> Dict[str, Any]:
    """
    Analyse a Markdown document with the LLM.

    Args:
        file_path: Path of the document to parse.
        analysis_type: Which analysis to run (selects the prompts).
        user_prompt: Optional extra instructions from the user.
        section_number: When given, only the matching section is
            analysed (looked up via ``_find_section``).

    Returns:
        dict: On success ``{"success": True, ...}`` with file name,
        analysis type, analysed section title (``None`` for the whole
        document), text length, structure counters, up to ten top-level
        sections and the model's answer under ``"analysis"``. For the
        ``"charts"`` type a ``"chart_data"`` dict is added. On failure
        ``{"success": False, "error": <message>}``.
    """
    try:
        parse_result = self.parser.parse(file_path)

        if not parse_result.success:
            return {
                "success": False,
                "error": parse_result.error
            }

        data = parse_result.data

        # Section tree of the whole document.
        sections = self.extract_sections(data.get("content", ""), data.get("titles", []))

        # Default target: the entire document, titled by its file name.
        target_content = data.get("content", "")
        target_title = parse_result.metadata.get("filename", "")

        if section_number:
            section = self._find_section(sections, section_number)
            if not section:
                return {
                    "success": False,
                    "error": f"未找到章节: {section_number}"
                }
            target_content = section.content
            target_title = f"{section.number}、{section.title}"

        prompt = self._build_prompt(
            content=target_content,
            analysis_type=analysis_type,
            user_prompt=user_prompt,
            title=target_title
        )

        messages = [
            {"role": "system", "content": self._get_system_prompt(analysis_type)},
            {"role": "user", "content": prompt}
        ]

        response = await llm_service.chat(
            messages=messages,
            temperature=0.3,
            max_tokens=4000
        )

        analysis = llm_service.extract_message_content(response)

        result = {
            "success": True,
            "filename": parse_result.metadata.get("filename", ""),
            "analysis_type": analysis_type,
            "section": target_title if section_number else None,
            "word_count": len(target_content),
            "structure": {
                "title_count": parse_result.metadata.get("title_count", 0),
                "code_block_count": parse_result.metadata.get("code_block_count", 0),
                "table_count": parse_result.metadata.get("table_count", 0),
                "section_count": len(sections)
            },
            "sections": [s.to_dict() for s in sections[:10]],  # at most 10 top-level sections
            "analysis": analysis
        }

        # For the "charts" type, turn the model's JSON answer into chart data.
        if analysis_type == "charts":
            try:
                chart_data = self._parse_chart_json(analysis)
                if not isinstance(chart_data, dict):
                    # Unparseable (or non-object) answer: hand back the same
                    # empty structure as the failure path below, so clients
                    # always receive an object rather than null.
                    chart_data = {"tables": [], "key_statistics": [], "chart_suggestions": []}

                for table_info in chart_data.get("tables") or []:
                    columns = table_info.get("columns", [])
                    rows = table_info.get("rows", [])
                    if columns and rows:
                        vis_result = visualization_service.analyze_and_visualize({
                            "columns": columns,
                            "rows": [dict(zip(columns, row)) for row in rows]
                        })
                        if vis_result.get("success"):
                            table_info["visualization"] = {
                                "statistics": vis_result.get("statistics"),
                                "charts": vis_result.get("charts"),
                                "distributions": vis_result.get("distributions")
                            }
                result["chart_data"] = chart_data
            except Exception as e:
                # Chart generation is best-effort; the text analysis is
                # still returned.
                logger.warning(f"生成可视化图表失败: {e}")
                result["chart_data"] = {"tables": [], "key_statistics": [], "chart_suggestions": []}

        return result

    except Exception as e:
        logger.error(f"Markdown AI 分析失败: {str(e)}")
        return {
            "success": False,
            "error": str(e)
        }
|
||||
|
||||
async def analyze_markdown_stream(
    self,
    file_path: str,
    analysis_type: str = "summary",
    user_prompt: str = "",
    section_number: Optional[str] = None
) -> AsyncGenerator[str, None]:
    """
    Stream an AI analysis of a Markdown document as server-sent events.

    Event payloads (the JSON after the "data: " prefix):
    - ``{"type": "start", ...}``: metadata, sent once before the model
      is called
    - ``{"type": "content", "delta": ...}``: one per streamed text chunk
    - ``{"type": "done", "full_response": ...}``: the concatenated text
    - ``{"error": ...}``: parse failure, unknown section, or any
      exception raised while streaming

    Args:
        file_path: Path of the document to parse.
        analysis_type: Which analysis to run (selects the prompts).
        user_prompt: Optional extra instructions from the user.
        section_number: When given, only the matching section is analysed.

    Yields:
        str: One SSE frame per event: "data: <json>" followed by a
        blank line.
    """
    def sse(payload: Dict[str, Any]) -> str:
        # The payload is serialised here rather than inside a multi-line
        # f-string expression: reusing the enclosing quote character or
        # breaking lines inside a replacement field is a syntax error
        # before Python 3.12.
        return f"data: {json.dumps(payload, ensure_ascii=False)}\n\n"

    try:
        parse_result = self.parser.parse(file_path)

        if not parse_result.success:
            yield sse({'error': parse_result.error})
            return

        data = parse_result.data
        sections = self.extract_sections(data.get("content", ""), data.get("titles", []))

        # Default target: the entire document, titled by its file name.
        target_content = data.get("content", "")
        target_title = parse_result.metadata.get("filename", "")

        if section_number:
            section = self._find_section(sections, section_number)
            if not section:
                yield sse({'error': f'未找到章节: {section_number}'})
                return
            target_content = section.content
            target_title = f"{section.number}、{section.title}"

        prompt = self._build_prompt(
            content=target_content,
            analysis_type=analysis_type,
            user_prompt=user_prompt,
            title=target_title
        )

        messages = [
            {"role": "system", "content": self._get_system_prompt(analysis_type)},
            {"role": "user", "content": prompt}
        ]

        # Initial metadata event.
        yield sse({
            'type': 'start',
            'filename': parse_result.metadata.get("filename", ""),
            'analysis_type': analysis_type,
            'section': target_title if section_number else None,
            'word_count': len(target_content)
        })

        # Relay the model output chunk by chunk while collecting it for
        # the final event.
        pieces: List[str] = []
        async for chunk in llm_service.chat_stream(messages, temperature=0.3, max_tokens=4000):
            content = chunk.get("content", "")
            if content:
                pieces.append(content)
                yield sse({'type': 'content', 'delta': content})

        # Completion event with the full text.
        yield sse({'type': 'done', 'full_response': "".join(pieces)})

    except Exception as e:
        logger.error(f"Markdown AI 流式分析失败: {str(e)}")
        yield sse({'error': str(e)})
|
||||
|
||||
def _find_section(self, sections: List[MarkdownSection], number: str) -> Optional[MarkdownSection]:
|
||||
"""查找指定编号的章节"""
|
||||
# 标准化编号
|
||||
num = number.strip()
|
||||
for section in sections:
|
||||
if section.number == num or section.title == num:
|
||||
return section
|
||||
# 在子章节中查找
|
||||
found = self._find_section(section.subsections, number)
|
||||
if found:
|
||||
return found
|
||||
return None
|
||||
|
||||
def _parse_chart_json(self, json_str: str) -> Optional[Dict[str, Any]]:
|
||||
"""
|
||||
解析 LLM 返回的 JSON 字符串
|
||||
|
||||
Args:
|
||||
json_str: LLM 返回的 JSON 字符串
|
||||
|
||||
Returns:
|
||||
解析后的字典,如果解析失败返回 None
|
||||
"""
|
||||
if not json_str:
|
||||
return None
|
||||
|
||||
try:
|
||||
# 尝试直接解析
|
||||
return json.loads(json_str)
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
# 尝试提取 JSON 代码块
|
||||
import re
|
||||
# 匹配 ```json ... ``` 格式
|
||||
match = re.search(r'```(?:json)?\s*([\s\S]*?)\s*```', json_str)
|
||||
if match:
|
||||
try:
|
||||
return json.loads(match.group(1))
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
# 尝试找到 JSON 对象的开始和结束
|
||||
start = json_str.find('{')
|
||||
end = json_str.rfind('}')
|
||||
if start != -1 and end != -1 and end > start:
|
||||
try:
|
||||
return json.loads(json_str[start:end+1])
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
def _get_system_prompt(self, analysis_type: str) -> str:
|
||||
"""根据分析类型获取系统提示词"""
|
||||
prompts = {
|
||||
"summary": "你是一个专业的文档摘要助手,擅长从长文档中提取核心信息。",
|
||||
"outline": "你是一个专业的文档结构分析助手,擅长提取文档大纲和层级结构。",
|
||||
"key_points": "你是一个专业的知识提取助手,擅长从文档中提取关键信息和要点。",
|
||||
"questions": "你是一个专业的教育助手,擅长生成帮助理解文档的问题。",
|
||||
"tags": "你是一个专业的标签生成助手,擅长提取文档的主题标签。",
|
||||
"qa": "你是一个专业的问答助手,擅长基于文档内容生成问答对。",
|
||||
"statistics": "你是一个专业的统计数据分析助手,擅长分析政府统计公报中的数据。",
|
||||
"section": "你是一个专业的章节分析助手,擅长对文档的特定章节进行深入分析。",
|
||||
"charts": "你是一个专业的数据可视化助手,擅长从文档中提取数据并生成适合制作图表的数据结构。"
|
||||
}
|
||||
return prompts.get(analysis_type, "你是一个专业的文档分析助手。")
|
||||
|
||||
def _build_prompt(
|
||||
self,
|
||||
content: str,
|
||||
analysis_type: str,
|
||||
user_prompt: str,
|
||||
title: str = ""
|
||||
) -> str:
|
||||
"""根据分析类型构建提示词"""
|
||||
|
||||
# 截断内容避免超出 token 限制
|
||||
max_content_len = 6000
|
||||
if len(content) > max_content_len:
|
||||
content = content[:max_content_len] + "\n\n[内容已截断...]"
|
||||
|
||||
base_prompts = {
|
||||
"summary": f"""请对以下文档进行摘要分析:
|
||||
|
||||
文档标题:{title}
|
||||
|
||||
文档内容:
|
||||
{content}
|
||||
|
||||
请提供:
|
||||
1. 文档主要内容摘要(300字以内)
|
||||
2. 文档的目的和用途
|
||||
3. 适合的读者群体
|
||||
|
||||
请用中文回答,结构清晰。""",
|
||||
|
||||
"outline": f"""请提取以下文档的大纲结构:
|
||||
|
||||
文档标题:{title}
|
||||
|
||||
文档内容:
|
||||
{content}
|
||||
|
||||
请按层级列出文档大纲,用缩进表示层级关系。
|
||||
格式:
|
||||
一、一级标题
|
||||
(一)二级标题
|
||||
1. 三级标题
|
||||
|
||||
请用中文回答。""",
|
||||
|
||||
"key_points": f"""请从以下文档中提取关键要点:
|
||||
|
||||
文档标题:{title}
|
||||
|
||||
文档内容:
|
||||
{content}
|
||||
|
||||
请列出文档的关键要点(5-10条),每条用简洁的语言描述,并说明其在文档中的重要性。
|
||||
|
||||
请用中文回答,格式清晰。""",
|
||||
|
||||
"questions": f"""请根据以下文档生成有助于理解内容的问题:
|
||||
|
||||
文档标题:{title}
|
||||
|
||||
文档内容:
|
||||
{content}
|
||||
|
||||
请生成5-10个问题,帮助读者更好地理解文档内容。每个问题应该:
|
||||
1. 涵盖文档的重要信息点
|
||||
2. 易于理解和回答
|
||||
3. 具有思考价值
|
||||
|
||||
请用中文回答。""",
|
||||
|
||||
"tags": f"""请为以下文档生成标签:
|
||||
|
||||
文档标题:{title}
|
||||
|
||||
文档内容:
|
||||
{content[:3000]}
|
||||
|
||||
请生成5-8个标签,用逗号分隔。标签应该反映:
|
||||
- 文档的主题领域
|
||||
- 文档的类型
|
||||
- 文档的关键特征
|
||||
|
||||
请用中文回答,只需输出标签,不要其他内容。""",
|
||||
|
||||
"qa": f"""请根据以下文档生成问答对:
|
||||
|
||||
文档标题:{title}
|
||||
|
||||
文档内容:
|
||||
{content[:4000]}
|
||||
|
||||
请生成3-5个问答对,帮助读者通过问答形式理解文档内容。
|
||||
格式:
|
||||
Q1: 问题
|
||||
A1: 回答
|
||||
Q2: 问题
|
||||
A2: 回答
|
||||
|
||||
请用中文回答,内容准确。""",
|
||||
|
||||
"statistics": f"""请分析以下政府统计公报中的数据和结论:
|
||||
|
||||
文档标题:{title}
|
||||
|
||||
文档内容:
|
||||
{content}
|
||||
|
||||
请提供:
|
||||
1. 文档中涉及的主要统计数据(列出关键数字和指标)
|
||||
2. 数据的变化趋势(增长/下降)
|
||||
3. 重要的百分比和对比
|
||||
4. 数据来源和统计口径说明
|
||||
|
||||
请用中文回答,数据准确。""",
|
||||
|
||||
"section": f"""请详细分析以下文档章节:
|
||||
|
||||
章节标题:{title}
|
||||
|
||||
章节内容:
|
||||
{content}
|
||||
|
||||
请提供:
|
||||
1. 章节主要内容概括
|
||||
2. 关键信息和数据
|
||||
3. 与其他部分的关联(如有)
|
||||
4. 重要结论
|
||||
|
||||
请用中文回答,分析深入。""",
|
||||
|
||||
"charts": f"""请从以下文档中提取可用于可视化的数据,并生成适合制作图表的数据结构:
|
||||
|
||||
文档标题:{title}
|
||||
|
||||
文档内容:
|
||||
{content}
|
||||
|
||||
请完成以下任务:
|
||||
1. 识别文档中的表格数据(Markdown表格格式)
|
||||
2. 识别文档中的关键统计数据(百分比、数量、趋势等)
|
||||
3. 识别可用于比较的分类数据
|
||||
|
||||
请用 JSON 格式返回以下结构的数据(如果没有表格数据,返回空结构):
|
||||
{{
|
||||
"tables": [
|
||||
{{
|
||||
"description": "表格的描述",
|
||||
"columns": ["列名1", "列名2", ...],
|
||||
"rows": [
|
||||
["值1", "值2", ...],
|
||||
["值1", "值2", ...]
|
||||
]
|
||||
}}
|
||||
],
|
||||
"key_statistics": [
|
||||
{{
|
||||
"name": "指标名称",
|
||||
"value": "数值",
|
||||
"trend": "增长/下降/持平",
|
||||
"description": "指标说明"
|
||||
}}
|
||||
],
|
||||
"chart_suggestions": [
|
||||
{{
|
||||
"chart_type": "bar/line/pie",
|
||||
"title": "图表标题",
|
||||
"data_source": "数据来源说明"
|
||||
}}
|
||||
]
|
||||
}}
|
||||
|
||||
请确保返回的是合法的 JSON 格式。"""
|
||||
}
|
||||
|
||||
prompt = base_prompts.get(analysis_type, base_prompts["summary"])
|
||||
|
||||
if user_prompt and user_prompt.strip():
|
||||
prompt += f"\n\n用户额外需求:{user_prompt}"
|
||||
|
||||
return prompt
|
||||
|
||||
async def extract_outline(self, file_path: str) -> Dict[str, Any]:
    """
    Extract the structured outline of a document.

    Args:
        file_path: Path of the document to parse.

    Returns:
        dict: ``{"success": True, "outline": [...]}`` with one entry per
        top-level section, or ``{"success": False, "error": <message>}``.
        Every entry carries ``number``, ``title``, ``level``, the heading
        line under both ``line`` and ``line_start`` (the latter matches
        the key used by ``MarkdownSection.to_dict``), and ``subsections``
        holding entries of the same shape for nested sections. Top-level
        entries additionally carry ``content_preview`` (first 100
        characters, "..." appended when longer).
    """
    def entry(section, with_preview: bool) -> Dict[str, Any]:
        """Outline record for one section and, recursively, its children."""
        record: Dict[str, Any] = {
            "number": section.number,
            "title": section.title,
            "level": section.level,
            "line": section.line_start,
            "line_start": section.line_start,
        }
        if with_preview:
            text = section.content
            record["content_preview"] = text[:100] + "..." if len(text) > 100 else text
        record["subsections"] = [entry(child, False) for child in section.subsections]
        return record

    try:
        parse_result = self.parser.parse(file_path)

        if not parse_result.success:
            return {"success": False, "error": parse_result.error}

        data = parse_result.data
        sections = self.extract_sections(data.get("content", ""), data.get("titles", []))

        return {
            "success": True,
            "outline": [entry(section, True) for section in sections]
        }

    except Exception as e:
        logger.error(f"大纲提取失败: {str(e)}")
        return {"success": False, "error": str(e)}
|
||||
|
||||
async def extract_tables_summary(self, file_path: str) -> Dict[str, Any]:
    """Summarise every table the parser found in a document.

    Each summary holds the table's 1-based index, its headers, the row
    and column counts reported by the parser, the first three rows as a
    preview, and the first cell of each of the first five rows (an empty
    string for empty rows). A document without tables yields an empty
    list plus a message; any failure yields ``success: False``.
    """
    try:
        parsed = self.parser.parse(file_path)
        if not parsed.success:
            return {"success": False, "error": parsed.error}

        found = parsed.data.get("tables", [])
        if not found:
            return {"success": True, "tables": [], "message": "文档中没有表格"}

        digests = []
        for position, table in enumerate(found, start=1):
            rows = table.get("rows", [])
            leading_cells = []
            for row in rows[:5]:
                leading_cells.append(row[0] if row else "")
            digests.append({
                "index": position,
                "headers": table.get("headers", []),
                "row_count": table.get("row_count", 0),
                "column_count": table.get("column_count", 0),
                "preview_rows": rows[:3],
                "first_column": leading_cells
            })

        return {
            "success": True,
            "tables": digests,
            "table_count": len(found)
        }

    except Exception as e:
        logger.error(f"表格提取失败: {str(e)}")
        return {"success": False, "error": str(e)}
|
||||
|
||||
|
||||
# 全局单例
|
||||
markdown_ai_service = MarkdownAIService()
|
||||
@@ -40,14 +40,29 @@ class RAGService:
|
||||
def _init_embeddings(self):
|
||||
"""初始化嵌入模型"""
|
||||
if self.embedding_model is None:
|
||||
self.embedding_model = SentenceTransformer(settings.EMBEDDING_MODEL)
|
||||
# 使用轻量级本地模型,避免网络问题
|
||||
model_name = 'all-MiniLM-L6-v2'
|
||||
try:
|
||||
self.embedding_model = SentenceTransformer(model_name)
|
||||
self._dimension = self.embedding_model.get_sentence_embedding_dimension()
|
||||
logger.info(f"RAG 嵌入模型初始化完成: {settings.EMBEDDING_MODEL}, 维度: {self._dimension}")
|
||||
logger.info(f"RAG 嵌入模型初始化完成: {model_name}, 维度: {self._dimension}")
|
||||
except Exception as e:
|
||||
logger.warning(f"嵌入模型 {model_name} 加载失败: {e}")
|
||||
# 如果本地模型也失败,使用简单hash作为后备
|
||||
self.embedding_model = None
|
||||
self._dimension = 384
|
||||
logger.info("RAG 使用简化模式 (无向量嵌入)")
|
||||
|
||||
def _init_vector_store(self):
|
||||
"""初始化向量存储"""
|
||||
if self.index is None:
|
||||
self._init_embeddings()
|
||||
if self.embedding_model is None:
|
||||
# 无法加载嵌入模型,使用简化模式
|
||||
self._dimension = 384
|
||||
self.index = None
|
||||
logger.warning("RAG 嵌入模型未加载,使用简化模式")
|
||||
else:
|
||||
self.index = faiss.IndexIDMap(faiss.IndexFlatIP(self._dimension))
|
||||
logger.info("Faiss 向量存储初始化完成")
|
||||
|
||||
@@ -78,6 +93,11 @@ class RAGService:
|
||||
if not self._initialized:
|
||||
self._init_vector_store()
|
||||
|
||||
# 如果没有嵌入模型,只记录到日志
|
||||
if self.embedding_model is None:
|
||||
logger.debug(f"字段跳过索引 (无嵌入模型): {table_name}.{field_name}")
|
||||
return
|
||||
|
||||
text = f"表名: {table_name}, 字段: {field_name}, 描述: {field_description}"
|
||||
if sample_values:
|
||||
text += f", 示例值: {', '.join(sample_values)}"
|
||||
@@ -100,6 +120,11 @@ class RAGService:
|
||||
if not self._initialized:
|
||||
self._init_vector_store()
|
||||
|
||||
# 如果没有嵌入模型,只记录到日志
|
||||
if self.embedding_model is None:
|
||||
logger.debug(f"文档跳过索引 (无嵌入模型): {doc_id}")
|
||||
return
|
||||
|
||||
doc = SimpleDocument(
|
||||
page_content=content,
|
||||
metadata=metadata or {"doc_id": doc_id}
|
||||
|
||||
@@ -31,6 +31,178 @@ class TableRAGService:
|
||||
self.rag = rag_service
|
||||
self.excel_storage = excel_storage_service
|
||||
|
||||
def _extract_sheet_names_from_xml(self, file_path: str) -> List[str]:
|
||||
"""
|
||||
从 Excel 文件的 XML 中提取工作表名称
|
||||
|
||||
某些 Excel 文件由于包含非标准元素,pandas/openpyxl 无法正确解析工作表列表,
|
||||
此时需要直接从 XML 中提取。
|
||||
|
||||
Args:
|
||||
file_path: Excel 文件路径
|
||||
|
||||
Returns:
|
||||
工作表名称列表
|
||||
"""
|
||||
import zipfile
|
||||
from xml.etree import ElementTree as ET
|
||||
|
||||
try:
|
||||
with zipfile.ZipFile(file_path, 'r') as z:
|
||||
# 读取 workbook.xml
|
||||
if 'xl/workbook.xml' not in z.namelist():
|
||||
return []
|
||||
|
||||
content = z.read('xl/workbook.xml')
|
||||
root = ET.fromstring(content)
|
||||
|
||||
# 定义命名空间
|
||||
ns = {'main': 'http://purl.oclc.org/ooxml/spreadsheetml/main'}
|
||||
|
||||
# 提取所有 sheet 的 name 属性
|
||||
sheets = root.findall('.//main:sheet', ns)
|
||||
return [s.get('name') for s in sheets if s.get('name')]
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"从 XML 提取工作表失败: {file_path}, error: {e}")
|
||||
return []
|
||||
|
||||
def _read_excel_sheet(self, file_path: str, sheet_name: str = None, header_row: int = 0) -> pd.DataFrame:
    """
    Read one worksheet, falling back to raw XML parsing when pandas fails.

    pandas is tried first. If it raises or returns an empty frame, the
    sheet is decoded directly from the workbook's XML parts: shared
    strings are resolved, booleans become ``bool``, every other cell
    value is kept as the text stored in the file, and empty cells become
    ``None``.

    Args:
        file_path: Path of the Excel file.
        sheet_name: Worksheet to read; the first worksheet when None
            (in the XML fallback also when the name is not found).
        header_row: 0-based index of the header row. Rows up to and
            including it are not returned as data.

    Returns:
        DataFrame with the sheet's data. In the XML fallback columns are
        named after the header cells, or keep their column letters where
        the header cell is empty; a sheet without data rows yields an
        empty DataFrame.

    Raises:
        ValueError: The fallback found no worksheets or the expected
            worksheet part is missing.
        Exception: Any other error of the XML fallback, re-raised after
            being logged.
    """
    import zipfile
    from xml.etree import ElementTree as ET

    strict_ns = 'http://purl.oclc.org/ooxml/spreadsheetml/main'

    def namespace_of(root) -> dict:
        # Use whatever namespace the part declares, so Strict and
        # Transitional files both work; un-namespaced parts fall back to
        # the Strict URI.
        tag = root.tag
        uri = tag[1:tag.index('}')] if tag.startswith('{') else strict_ns
        return {'main': uri}

    try:
        # sheet_name=None would make pandas return a dict of ALL sheets
        # instead of a DataFrame; index 0 selects the first sheet.
        df = pd.read_excel(
            file_path,
            sheet_name=0 if sheet_name is None else sheet_name,
            header=header_row
        )
        if df is not None and not df.empty:
            return df
    except Exception as e:
        # Best-effort: any pandas failure just triggers the XML fallback.
        logger.debug(f"pandas 读取 Excel 失败,改用 XML 解析: {e}")

    logger.info(f"使用 XML 方式读取 Excel: {file_path}")

    try:
        with zipfile.ZipFile(file_path, 'r') as z:
            sheet_names = self._extract_sheet_names_from_xml(file_path)
            if not sheet_names:
                raise ValueError("无法从 Excel 文件中找到工作表")

            target_sheet = sheet_name if sheet_name and sheet_name in sheet_names else sheet_names[0]
            # NOTE(review): assumes the N-th sheet listed in workbook.xml
            # is stored as sheetN.xml; this is not checked against
            # xl/_rels/workbook.xml.rels — confirm for reordered sheets.
            sheet_index = sheet_names.index(target_sheet) + 1

            # Shared-string table, indexed by the values of t="s" cells.
            shared_strings = []
            if 'xl/sharedStrings.xml' in z.namelist():
                ss_root = ET.fromstring(z.read('xl/sharedStrings.xml'))
                ss_ns = namespace_of(ss_root)
                for si in ss_root.findall('.//main:si', ss_ns):
                    # Plain text sits in <t>; rich text is split over
                    # several <r><t> runs that must be concatenated.
                    runs = si.findall('main:t', ss_ns) + si.findall('main:r/main:t', ss_ns)
                    shared_strings.append(''.join(t.text or '' for t in runs))

            sheet_file = f'xl/worksheets/sheet{sheet_index}.xml'
            if sheet_file not in z.namelist():
                raise ValueError(f"工作表文件 {sheet_file} 不存在")

            root = ET.fromstring(z.read(sheet_file))
            ns = namespace_of(root)

            def column_of(cell) -> str:
                # "B12" -> "B"
                return ''.join(filter(str.isalpha, cell.get('r', '')))

            def value_of(cell, decode_bool: bool = True):
                v = cell.find('main:v', ns)
                if v is None or not v.text:
                    return None
                cell_type = cell.get('t', 'n')
                if cell_type == 's':
                    try:
                        return shared_strings[int(v.text)]
                    except (ValueError, IndexError):
                        return v.text
                if cell_type == 'b' and decode_bool:
                    return v.text == '1'
                # Numbers and everything else stay as stored text.
                return v.text

            rows_data = []
            for row in root.findall('.//main:row', ns):
                row_idx = int(row.get('r', 0))
                # header_row is 0-based, row_idx is 1-based: skip the
                # header row and everything above it.
                if row_idx <= header_row + 1:
                    continue

                row_cells = {column_of(cell): value_of(cell) for cell in row.findall('main:c', ns)}
                if row_cells:
                    rows_data.append(row_cells)

            if not rows_data:
                return pd.DataFrame()

            df = pd.DataFrame(rows_data)

            if header_row >= 0:
                # Rename letter columns from the header row of the tree
                # that is already parsed.
                first_row = root.find(f'.//main:row[@r="{header_row + 1}"]', ns)
                if first_row is not None:
                    headers = {}
                    for cell in first_row.findall('main:c', ns):
                        label = value_of(cell, decode_bool=False)
                        if label is not None:
                            headers[column_of(cell)] = label
                    df.columns = [headers.get(col, col) for col in df.columns]

            logger.info(f"XML 解析完成: {len(df)} 行, {len(df.columns)} 列")
            return df

    except Exception as e:
        logger.error(f"XML 解析 Excel 失败: {e}")
        raise
|
||||
|
||||
async def generate_field_description(
|
||||
self,
|
||||
table_name: str,
|
||||
@@ -126,26 +298,49 @@ class TableRAGService:
|
||||
}
|
||||
|
||||
try:
|
||||
# 1. 读取 Excel
|
||||
# 1. 先检查 Excel 文件是否有效
|
||||
logger.info(f"正在检查Excel文件: {file_path}")
|
||||
try:
|
||||
xls_file = pd.ExcelFile(file_path)
|
||||
sheet_names = xls_file.sheet_names
|
||||
logger.info(f"Excel文件工作表: {sheet_names}")
|
||||
|
||||
# 如果 sheet_names 为空,尝试从 XML 中手动提取
|
||||
if not sheet_names:
|
||||
sheet_names = self._extract_sheet_names_from_xml(file_path)
|
||||
logger.info(f"从XML提取工作表: {sheet_names}")
|
||||
|
||||
if not sheet_names:
|
||||
return {"success": False, "error": "Excel 文件没有工作表"}
|
||||
except Exception as e:
|
||||
logger.error(f"读取Excel文件失败: {file_path}, error: {e}")
|
||||
return {"success": False, "error": f"无法读取Excel文件: {str(e)}"}
|
||||
|
||||
# 2. 读取 Excel
|
||||
if sheet_name:
|
||||
df = pd.read_excel(file_path, sheet_name=sheet_name, header=header_row)
|
||||
else:
|
||||
df = pd.read_excel(file_path, header=header_row)
|
||||
# 验证指定的sheet_name是否存在
|
||||
if sheet_name not in sheet_names:
|
||||
logger.warning(f"指定的工作表 '{sheet_name}' 不存在,使用第一个工作表: {sheet_names[0]}")
|
||||
sheet_name = sheet_names[0]
|
||||
df = self._read_excel_sheet(file_path, sheet_name=sheet_name, header_row=header_row)
|
||||
|
||||
logger.info(f"读取到数据: {len(df)} 行, {len(df.columns)} 列")
|
||||
|
||||
if df.empty:
|
||||
return {"success": False, "error": "Excel 文件为空"}
|
||||
|
||||
# 清理列名
|
||||
df.columns = [str(c) for c in df.columns]
|
||||
table_name = excel_storage._sanitize_table_name(filename)
|
||||
table_name = self.excel_storage._sanitize_table_name(filename)
|
||||
results["table_name"] = table_name
|
||||
results["field_count"] = len(df.columns)
|
||||
logger.info(f"表名: {table_name}, 字段数: {len(df.columns)}")
|
||||
|
||||
# 2. 初始化 RAG (如果需要)
|
||||
# 3. 初始化 RAG (如果需要)
|
||||
if not self.rag._initialized:
|
||||
self.rag._init_vector_store()
|
||||
|
||||
# 3. 为每个字段生成描述并索引
|
||||
# 4. 为每个字段生成描述并索引
|
||||
all_fields_data = {}
|
||||
for col in df.columns:
|
||||
# 采样示例值
|
||||
@@ -187,7 +382,8 @@ class TableRAGService:
|
||||
logger.error(error_msg)
|
||||
results["errors"].append(error_msg)
|
||||
|
||||
# 4. 存储到 MySQL
|
||||
# 5. 存储到 MySQL
|
||||
logger.info(f"开始存储到MySQL: {filename}")
|
||||
store_result = await self.excel_storage.store_excel(
|
||||
file_path=file_path,
|
||||
filename=filename,
|
||||
|
||||
@@ -1,113 +0,0 @@
|
||||
✅ Excel 文件解析功能已完成并测试通过
|
||||
|
||||
已完成的工作
|
||||
|
||||
后端部分
|
||||
|
||||
1. 文件服务层 (backend/app/services/file_service.py)
|
||||
|
||||
- 文件保存、读取、删除功能
|
||||
- 文件信息获取
|
||||
2. Excel 解析模块 (backend/app/core/document_parser/)
|
||||
|
||||
- base.py - 解析器基类
|
||||
- xlsx_parser.py - Excel 文件解析器
|
||||
- utils.py - 工具函数
|
||||
3. API 接口 (backend/app/api/endpoints/upload.py)
|
||||
|
||||
- POST /upload/excel - 上传并解析 Excel 文件
|
||||
- GET /upload/excel/preview/{file_path} - 获取预览
|
||||
- DELETE /upload/file - 删除文件
|
||||
4. 主应用配置 (backend/app/main.py)
|
||||
|
||||
- CORS 中间件配置
|
||||
- API 路由注册
|
||||
|
||||
前端部分
|
||||
|
||||
1. 后端 API 封装 (frontendTest/app-a6ww9j3ja3nl/src/db/backend-api.ts)
|
||||
|
||||
- Excel 上传、预览、删除函数
|
||||
2. Excel 解析页面 (frontendTest/app-a6ww9j3ja3nl/src/pages/ExcelParse.tsx)
|
||||
|
||||
- 拖拽上传界面
|
||||
- 解析选项配置
|
||||
- 数据表格展示
|
||||
- 多工作表支持
|
||||
3. 路由配置 (frontendTest/app-a6ww9j3ja3nl/src/routes.tsx)
|
||||
|
||||
- 添加 /excel-parse 路由
|
||||
4. 环境配置 (.env)
|
||||
|
||||
- 后端 API URL 配置
|
||||
|
||||
测试结果
|
||||
|
||||
后端 API 测试通过:
|
||||
- ✅ 健康检查:http://127.0.0.1:8000/health
|
||||
- ✅ Excel 上传解析:成功解析 5 行 6 列的测试数据
|
||||
|
||||
访问方式
|
||||
|
||||
1. 后端 API 文档:http://127.0.0.1:8000/api/v1/docs
|
||||
2. 前端页面:
|
||||
|
||||
- 启动前端:cd frontendTest/app-a6ww9j3ja3nl && npm run dev
|
||||
- 访问 Excel 解析页面:/excel-parse
|
||||
|
||||
下一步建议
|
||||
|
||||
1. 启动前端开发服务器测试完整的前端界面
|
||||
2. 添加更多文档格式解析支持(docx, md, txt)
|
||||
3. 完善错误处理和用户提示
|
||||
|
||||
✻ Worked for 4m 18s · 1 background task still running (↓ to manage)
|
||||
|
||||
> 如何启动后端呢
|
||||
|
||||
● 后端启动步骤如下:
|
||||
|
||||
方法一:使用命令行启动(开发模式)
|
||||
|
||||
# 1. 进入后端目录
|
||||
cd H:/OwnProject/FilesReadSysteam/backend
|
||||
|
||||
# 2. 激活虚拟环境
|
||||
# PowerShell:
|
||||
.\venv\Scripts\Activate.ps1
|
||||
|
||||
# 或 CMD:
|
||||
.\venv\Scripts\Activate.bat
|
||||
|
||||
# 3. 启动后端服务器
|
||||
python -m uvicorn app.main:app --host 127.0.0.1 --port 8000 --reload
|
||||
|
||||
方法二:直接使用虚拟环境中的 Python
|
||||
|
||||
cd H:/OwnProject/FilesReadSysteam/backend
|
||||
./venv/Scripts/python.exe -m uvicorn app.main:app --host 127.0.0.1 --port 8000 --reload
|
||||
|
||||
方法三:直接运行 main.py(开发调试)
|
||||
|
||||
cd H:/OwnProject/FilesReadSysteam/backend
|
||||
./venv/Scripts/python.exe app/main.py
|
||||
|
||||
启动成功的标志
|
||||
|
||||
INFO: Uvicorn running on http://127.0.0.1:8000
|
||||
INFO: Started server process [xxxxx]
|
||||
INFO: Waiting for application startup.
|
||||
INFO: Application startup complete.
|
||||
|
||||
验证服务是否正常
|
||||
|
||||
# 健康检查
|
||||
curl http://127.0.0.1:8000/health
|
||||
|
||||
# 访问 API 文档
|
||||
# 浏览器打开: http://127.0.0.1:8000/api/v1/docs
|
||||
|
||||
---
|
||||
当前状态:后端已在后台运行(任务 ID: b22jkg69j),可以直接访问 http://127.0.0.1:8000
|
||||
|
||||
需要停止的话告诉我即可。
|
||||
@@ -166,6 +166,66 @@ export interface AIAnalysisResult {
|
||||
error?: string;
|
||||
}
|
||||
|
||||
// ==================== Markdown AI 分析类型 ====================
|
||||
|
||||
/**
 * Result object resolved by `aiApi.analyzeMarkdown`.
 *
 * Only `success` is required; every other member is optional.
 */
export interface AIMarkdownAnalyzeResult {
  success: boolean;
  filename?: string;
  analysis_type?: string;
  /** Shown next to the analysis type in the result header when present. */
  section?: string;
  word_count?: number;
  /** Counters describing the document (titles, code blocks, tables, sections). */
  structure?: {
    title_count?: number;
    code_block_count?: number;
    table_count?: number;
    section_count?: number;
  };
  sections?: MarkdownSection[];
  /** Analysis text; the Documents page renders it as Markdown. */
  analysis?: string;
  /** Chart-oriented data — presumably filled for the 'charts' analysis type; verify against the backend. */
  chart_data?: {
    tables?: {
      description?: string;
      columns?: string[];
      rows?: string[][];
      visualization?: {
        statistics?: any;
        charts?: any;
        distributions?: any;
      };
    }[];
    key_statistics?: {
      name?: string;
      value?: string;
      trend?: string;
      description?: string;
    }[];
    chart_suggestions?: {
      chart_type?: string;
      title?: string;
      data_source?: string;
    }[];
  };
  /** Error message shown to the user when `success` is false. */
  error?: string;
}
|
||||
|
||||
/**
 * One node of a Markdown document outline; nodes nest through `subsections`.
 */
export interface MarkdownSection {
  /** Section number; used as the option value/key in the section picker. */
  number: string;
  title: string;
  /** Nesting level — presumably the heading depth; confirm against the backend. */
  level: number;
  content_preview?: string;
  line_start: number;
  line_end?: number;
  subsections?: Array<MarkdownSection>;
}
|
||||
|
||||
/**
 * Result object resolved by `aiApi.getMarkdownOutline`.
 */
export interface MarkdownOutlineResult {
  success: boolean;
  /** Outline sections; callers only use it when `success` is true. */
  outline?: Array<MarkdownSection>;
  error?: string;
}
|
||||
|
||||
/** Analysis modes that can be sent as the `analysis_type` query parameter. */
export type MarkdownAnalysisType =
  | 'summary'
  | 'outline'
  | 'key_points'
  | 'questions'
  | 'tags'
  | 'qa'
  | 'statistics'
  | 'section'
  | 'charts';
|
||||
|
||||
export interface AIExcelAnalyzeResult {
|
||||
success: boolean;
|
||||
excel?: {
|
||||
@@ -842,6 +902,159 @@ export const aiApi = {
|
||||
}
|
||||
},
|
||||
|
||||
/**
|
||||
* 上传并使用 AI 分析 Markdown 文件
|
||||
*/
|
||||
async analyzeMarkdown(
  file: File,
  options: {
    analysisType?: MarkdownAnalysisType;
    userPrompt?: string;
    sectionNumber?: string;
  } = {}
): Promise<AIMarkdownAnalyzeResult> {
  // Uploads a Markdown file (multipart field "file") and resolves with the
  // parsed JSON result of POST /ai/analyze/md.
  //
  // Only the options that are set are forwarded, as the query parameters
  // analysis_type, user_prompt and section_number.
  //
  // Throws an Error carrying the server's `detail` string when the response is
  // not OK, or a generic message when no usable detail is available. Network
  // errors are logged and rethrown unchanged.
  const formData = new FormData();
  formData.append('file', file);

  const params = new URLSearchParams();
  if (options.analysisType) {
    params.append('analysis_type', options.analysisType);
  }
  if (options.userPrompt) {
    params.append('user_prompt', options.userPrompt);
  }
  if (options.sectionNumber) {
    params.append('section_number', options.sectionNumber);
  }

  const url = `${BACKEND_BASE_URL}/ai/analyze/md?${params.toString()}`;

  try {
    const response = await fetch(url, {
      method: 'POST',
      body: formData,
    });

    if (!response.ok) {
      // An error body is not guaranteed to be JSON (e.g. an HTML page from a
      // proxy). Parsing must not replace the real failure with a SyntaxError.
      const errorBody = await response.json().catch(() => null);
      // `detail` may be a structured value rather than a string; only a
      // non-empty string makes a readable message.
      const detail = errorBody?.detail;
      throw new Error(
        typeof detail === 'string' && detail ? detail : 'Markdown AI 分析失败'
      );
    }

    return await response.json();
  } catch (error) {
    console.error('Markdown AI 分析失败:', error);
    throw error;
  }
},
|
||||
|
||||
/**
|
||||
* 流式分析 Markdown 文件 (SSE)
|
||||
*/
|
||||
async analyzeMarkdownStream(
  file: File,
  options: {
    analysisType?: MarkdownAnalysisType;
    userPrompt?: string;
    sectionNumber?: string;
  } = {},
  onChunk?: (chunk: { type: string; delta?: string; error?: string }) => void
): Promise<string> {
  // Uploads a Markdown file to POST /ai/analyze/md/stream and consumes the
  // response body as a stream of "data: <json>" lines.
  //
  // For each `content` event the delta is appended to the accumulated text and
  // forwarded to `onChunk`; events carrying `error` are forwarded as
  // `{ type: 'error' }`. A `done` event may replace the accumulated text with
  // the server's `full_response`.
  //
  // Resolves with the full response text. Throws when the HTTP response is not
  // OK or the body cannot be read as a stream.
  const formData = new FormData();
  formData.append('file', file);

  const params = new URLSearchParams();
  if (options.analysisType) {
    params.append('analysis_type', options.analysisType);
  }
  if (options.userPrompt) {
    params.append('user_prompt', options.userPrompt);
  }
  if (options.sectionNumber) {
    params.append('section_number', options.sectionNumber);
  }

  const url = `${BACKEND_BASE_URL}/ai/analyze/md/stream?${params.toString()}`;

  try {
    const response = await fetch(url, {
      method: 'POST',
      body: formData,
    });

    if (!response.ok) {
      // The error body may not be JSON; keep the real failure visible.
      const errorBody = await response.json().catch(() => null);
      const detail = errorBody?.detail;
      throw new Error(
        typeof detail === 'string' && detail ? detail : 'Markdown AI 流式分析失败'
      );
    }

    const reader = response.body?.getReader();
    if (!reader) throw new Error('无法读取响应流');

    const decoder = new TextDecoder();
    let fullResponse = '';
    // Text received so far that does not yet end in a newline. Network chunks
    // can end in the middle of a line, so the tail is carried over.
    let pending = '';

    // Interpret one complete line of the stream.
    const handleLine = (rawLine: string) => {
      const line = rawLine.endsWith('\r') ? rawLine.slice(0, -1) : rawLine;
      if (!line.startsWith('data: ')) return;

      const data = line.slice(6);
      if (data === '[DONE]') return;

      let parsed: any;
      try {
        parsed = JSON.parse(data);
      } catch {
        // Skip payloads that are not valid JSON.
        return;
      }
      if (!parsed || typeof parsed !== 'object') return;

      if (parsed.type === 'content' && parsed.delta) {
        fullResponse += parsed.delta;
        onChunk?.({ type: 'content', delta: parsed.delta });
      } else if (parsed.type === 'done') {
        fullResponse = parsed.full_response || fullResponse;
      } else if (parsed.error) {
        onChunk?.({ type: 'error', error: parsed.error });
      }
    };

    while (true) {
      const { done, value } = await reader.read();
      if (done) break;

      // `stream: true` keeps a multi-byte character that is split across two
      // chunks intact instead of decoding it to replacement characters.
      pending += decoder.decode(value, { stream: true });

      const lines = pending.split('\n');
      // The last element is either '' or an unfinished line; keep it.
      pending = lines.pop() ?? '';

      for (const line of lines) {
        handleLine(line);
      }
    }

    // Flush bytes still held by the decoder and a final unterminated line.
    pending += decoder.decode();
    if (pending) {
      handleLine(pending);
    }

    return fullResponse;
  } catch (error) {
    console.error('Markdown AI 流式分析失败:', error);
    throw error;
  }
},
|
||||
|
||||
/**
|
||||
* 获取 Markdown 文档大纲(分章节信息)
|
||||
*/
|
||||
async getMarkdownOutline(file: File): Promise<MarkdownOutlineResult> {
  // Uploads a Markdown file (multipart field "file") to /ai/analyze/md/outline
  // and resolves with the parsed JSON outline result.
  //
  // Throws an Error carrying the server's `detail` string when the response is
  // not OK. Network errors are logged and rethrown unchanged.
  const formData = new FormData();
  formData.append('file', file);

  const url = `${BACKEND_BASE_URL}/ai/analyze/md/outline`;

  try {
    const response = await fetch(url, {
      // A GET request cannot carry a body: fetch() rejects with a TypeError
      // before anything is sent, so the upload has to be a POST.
      // NOTE(review): assumes the backend route accepts POST — confirm.
      method: 'POST',
      body: formData,
    });

    if (!response.ok) {
      // The error body may not be JSON; keep the real failure visible.
      const errorBody = await response.json().catch(() => null);
      const detail = errorBody?.detail;
      throw new Error(
        typeof detail === 'string' && detail ? detail : '获取 Markdown 大纲失败'
      );
    }

    return await response.json();
  } catch (error) {
    console.error('获取 Markdown 大纲失败:', error);
    throw error;
  }
},
|
||||
|
||||
/**
|
||||
* 生成统计信息和图表
|
||||
*/
|
||||
|
||||
@@ -19,7 +19,11 @@ import {
|
||||
TrendingUp,
|
||||
Download,
|
||||
Brain,
|
||||
Settings2
|
||||
Settings2,
|
||||
List,
|
||||
MessageSquareCode,
|
||||
Tag,
|
||||
HelpCircle
|
||||
} from 'lucide-react';
|
||||
import { Button } from '@/components/ui/button';
|
||||
import { Input } from '@/components/ui/input';
|
||||
@@ -33,7 +37,7 @@ import { Checkbox } from '@/components/ui/checkbox';
|
||||
import { toast } from 'sonner';
|
||||
import { cn } from '@/lib/utils';
|
||||
import { Skeleton } from '@/components/ui/skeleton';
|
||||
import { backendApi, type ExcelParseResult, aiApi } from '@/db/backend-api';
|
||||
import { backendApi, type ExcelParseResult, type AIMarkdownAnalyzeResult, type MarkdownSection, aiApi } from '@/db/backend-api';
|
||||
import {
|
||||
Table as TableComponent,
|
||||
TableBody,
|
||||
@@ -78,6 +82,15 @@ const Documents: React.FC = () => {
|
||||
const [analysisCharts, setAnalysisCharts] = useState<any>(null);
|
||||
const [analysisTypes, setAnalysisTypes] = useState<Array<{ value: string; label: string; description: string }>>([]);
|
||||
|
||||
// Markdown AI 分析相关状态
|
||||
const [mdAnalysis, setMdAnalysis] = useState<AIMarkdownAnalyzeResult | null>(null);
|
||||
const [mdAnalysisType, setMdAnalysisType] = useState<'summary' | 'outline' | 'key_points' | 'questions' | 'tags' | 'qa' | 'statistics' | 'section' | 'charts'>('summary');
|
||||
const [mdUserPrompt, setMdUserPrompt] = useState('');
|
||||
const [mdSections, setMdSections] = useState<MarkdownSection[]>([]);
|
||||
const [mdSelectedSection, setMdSelectedSection] = useState<string>('');
|
||||
const [mdStreaming, setMdStreaming] = useState(false);
|
||||
const [mdStreamingContent, setMdStreamingContent] = useState('');
|
||||
|
||||
// 解析选项
|
||||
const [parseOptions, setParseOptions] = useState({
|
||||
parseAllSheets: false,
|
||||
@@ -144,6 +157,9 @@ const Documents: React.FC = () => {
|
||||
setAiAnalysis(null);
|
||||
setAnalysisCharts(null);
|
||||
setExpandedSheet(null);
|
||||
setMdAnalysis(null);
|
||||
setMdSections([]);
|
||||
setMdStreamingContent('');
|
||||
|
||||
const ext = file.name.split('.').pop()?.toLowerCase();
|
||||
|
||||
@@ -163,6 +179,9 @@ const Documents: React.FC = () => {
|
||||
} else {
|
||||
toast.error(result.error || '解析失败');
|
||||
}
|
||||
} else if (ext === 'md' || ext === 'markdown') {
|
||||
// Markdown 文件:获取大纲
|
||||
await fetchMdOutline();
|
||||
} else {
|
||||
// 其他文档使用通用上传接口
|
||||
const result = await backendApi.uploadDocument(file);
|
||||
@@ -403,6 +422,106 @@ const Documents: React.FC = () => {
|
||||
}
|
||||
};
|
||||
|
||||
const isMarkdownFile = (filename: string) => {
  // Take the text after the last "." (the whole name when there is no dot)
  // and compare it, case-insensitively, with the Markdown extensions.
  const suffix = filename.slice(filename.lastIndexOf('.') + 1).toLowerCase();
  return suffix === 'md' || suffix === 'markdown';
};
|
||||
|
||||
// Markdown AI 分析处理
|
||||
const handleMdAnalyze = async () => {
  // Runs the non-streaming Markdown analysis for the uploaded file and stores
  // the result. Failures are reported through toasts.
  if (!uploadedFile || !isMarkdownFile(uploadedFile.name)) {
    toast.error('请先上传 Markdown 文件');
    return;
  }

  setAnalyzing(true);
  setMdAnalysis(null);

  try {
    const requestOptions = {
      analysisType: mdAnalysisType,
      userPrompt: mdUserPrompt,
      // An empty selection means "no specific section".
      sectionNumber: mdSelectedSection || undefined
    };
    const result = await aiApi.analyzeMarkdown(uploadedFile, requestOptions);

    if (!result.success) {
      toast.error(result.error || 'AI 分析失败');
      return;
    }

    toast.success('Markdown AI 分析完成');
    setMdAnalysis(result);
  } catch (error: any) {
    toast.error(error.message || 'AI 分析失败');
  } finally {
    // Runs on every path, including the early return above.
    setAnalyzing(false);
  }
};
|
||||
|
||||
// 流式分析 Markdown
|
||||
const handleMdAnalyzeStream = async () => {
  // Runs the streaming Markdown analysis for the uploaded file, appending each
  // received delta to the streaming-content state as it arrives.
  if (!uploadedFile || !isMarkdownFile(uploadedFile.name)) {
    toast.error('请先上传 Markdown 文件');
    return;
  }

  setAnalyzing(true);
  setMdStreaming(true);
  setMdStreamingContent('');
  setMdAnalysis(null);

  // Per-event callback: content deltas extend the displayed text, error
  // events surface as a toast, anything else is ignored.
  const onStreamChunk = (chunk: { type: string; delta?: string; error?: string }) => {
    if (chunk.type === 'content' && chunk.delta) {
      const delta = chunk.delta;
      setMdStreamingContent(prev => prev + delta);
      return;
    }
    if (chunk.type === 'error') {
      toast.error(chunk.error || '流式分析出错');
    }
  };

  try {
    await aiApi.analyzeMarkdownStream(
      uploadedFile,
      {
        analysisType: mdAnalysisType,
        userPrompt: mdUserPrompt,
        sectionNumber: mdSelectedSection || undefined
      },
      onStreamChunk
    );
  } catch (error: any) {
    toast.error(error.message || 'AI 分析失败');
  } finally {
    setAnalyzing(false);
    setMdStreaming(false);
  }
};
|
||||
|
||||
// 获取 Markdown 文档大纲(分章节)
|
||||
const fetchMdOutline = async () => {
  // Loads the outline of the uploaded Markdown file into the section list.
  // Does nothing when no Markdown file is uploaded; failures are only logged.
  const hasMarkdownUpload = !!uploadedFile && isMarkdownFile(uploadedFile.name);
  if (!uploadedFile || !hasMarkdownUpload) return;

  try {
    const { success, outline } = await aiApi.getMarkdownOutline(uploadedFile);
    if (success && outline) {
      setMdSections(outline);
    }
  } catch (error) {
    console.error('获取大纲失败:', error);
  }
};
|
||||
|
||||
const getMdAnalysisIcon = (type: string) => {
  // Maps an analysis type to its 20px icon; types sharing an icon are grouped.
  // Unknown types fall back to the Sparkles icon.
  switch (type) {
    case 'summary':
    case 'section':
      return <FileText size={20} />;
    case 'outline':
      return <List size={20} />;
    case 'key_points':
    case 'statistics':
    case 'charts':
      return <TrendingUp size={20} />;
    case 'questions':
      return <MessageSquareCode size={20} />;
    case 'tags':
      return <Tag size={20} />;
    case 'qa':
      return <HelpCircle size={20} />;
    default:
      return <Sparkles size={20} />;
  }
};
|
||||
|
||||
const formatFileSize = (bytes: number): string => {
|
||||
if (bytes === 0) return '0 B';
|
||||
const k = 1024;
|
||||
@@ -600,6 +719,98 @@ const Documents: React.FC = () => {
|
||||
</Card>
|
||||
)}
|
||||
|
||||
{/* Markdown AI 分析选项 */}
|
||||
{uploadedFile && isMarkdownFile(uploadedFile.name) && (
|
||||
<Card className="border-none shadow-md bg-gradient-to-br from-purple-500/5 to-primary/5">
|
||||
<CardHeader className="pb-4">
|
||||
<CardTitle className="flex items-center gap-2">
|
||||
<Sparkles className="text-purple-500" size={20} />
|
||||
Markdown AI 分析
|
||||
</CardTitle>
|
||||
</CardHeader>
|
||||
<CardContent className="space-y-4">
|
||||
{/* 章节选择 */}
|
||||
{mdSections.length > 0 && (
|
||||
<div className="space-y-2">
|
||||
<Label htmlFor="md-section" className="text-sm">指定章节(可选)</Label>
|
||||
<Select value={mdSelectedSection} onValueChange={setMdSelectedSection}>
|
||||
<SelectTrigger id="md-section" className="bg-background">
|
||||
<SelectValue placeholder="全文分析" />
|
||||
</SelectTrigger>
|
||||
<SelectContent>
|
||||
<SelectItem value="">全文分析</SelectItem>
|
||||
{mdSections.map((section) => (
|
||||
<SelectItem key={section.number} value={section.number}>
|
||||
{section.number}、{section.title}
|
||||
</SelectItem>
|
||||
))}
|
||||
</SelectContent>
|
||||
</Select>
|
||||
</div>
|
||||
)}
|
||||
<div className="space-y-2">
|
||||
<Label htmlFor="md-analysis-type" className="text-sm">分析类型</Label>
|
||||
<Select value={mdAnalysisType} onValueChange={(value: any) => setMdAnalysisType(value)}>
|
||||
<SelectTrigger id="md-analysis-type" className="bg-background">
|
||||
<SelectValue />
|
||||
</SelectTrigger>
|
||||
<SelectContent>
|
||||
{[
|
||||
{ value: 'summary', label: '文档摘要', desc: '主要内容摘要' },
|
||||
{ value: 'outline', label: '大纲提取', desc: '提取文档结构' },
|
||||
{ value: 'key_points', label: '关键要点', desc: '提取关键信息' },
|
||||
{ value: 'statistics', label: '统计分析', desc: '统计数据分析' },
|
||||
{ value: 'section', label: '章节分析', desc: '分章节详细分析' },
|
||||
{ value: 'questions', label: '生成问题', desc: '生成理解性问题' },
|
||||
{ value: 'tags', label: '生成标签', desc: '提取主题标签' },
|
||||
{ value: 'qa', label: '问答对', desc: '生成问答内容' },
|
||||
{ value: 'charts', label: '数据图表', desc: '生成可视化数据' }
|
||||
].map(type => (
|
||||
<SelectItem key={type.value} value={type.value}>
|
||||
<div className="flex items-center gap-2">
|
||||
{getMdAnalysisIcon(type.value)}
|
||||
<div className="flex flex-col">
|
||||
<span className="font-medium">{type.label}</span>
|
||||
<span className="text-xs text-muted-foreground">{type.desc}</span>
|
||||
</div>
|
||||
</div>
|
||||
</SelectItem>
|
||||
))}
|
||||
</SelectContent>
|
||||
</Select>
|
||||
</div>
|
||||
<div className="space-y-2">
|
||||
<Label htmlFor="md-user-prompt" className="text-sm">自定义提示词(可选)</Label>
|
||||
<Textarea
|
||||
id="md-user-prompt"
|
||||
placeholder="例如:请重点关注技术实现部分..."
|
||||
value={mdUserPrompt}
|
||||
onChange={(e) => setMdUserPrompt(e.target.value)}
|
||||
className="bg-background resize-none"
|
||||
rows={2}
|
||||
/>
|
||||
</div>
|
||||
<div className="flex gap-2">
|
||||
<Button
|
||||
onClick={handleMdAnalyze}
|
||||
disabled={analyzing}
|
||||
className="flex-1 bg-gradient-to-r from-purple-500 to-primary hover:from-purple-500/90 hover:to-primary/90"
|
||||
>
|
||||
{analyzing && !mdStreaming ? <><Loader2 className="mr-2 animate-spin" size={16} /> 分析中...</> : <><Sparkles className="mr-2" size={16} />普通分析</>}
|
||||
</Button>
|
||||
<Button
|
||||
onClick={handleMdAnalyzeStream}
|
||||
disabled={analyzing}
|
||||
variant="outline"
|
||||
className="flex-1"
|
||||
>
|
||||
{analyzing && mdStreaming ? <><Loader2 className="mr-2 animate-spin" size={16} /> 流式...</> : <><Sparkles className="mr-2" size={16} />流式分析</>}
|
||||
</Button>
|
||||
</div>
|
||||
</CardContent>
|
||||
</Card>
|
||||
)}
|
||||
|
||||
{/* 数据操作 */}
|
||||
{parseResult?.success && (
|
||||
<Card className="border-none shadow-md bg-gradient-to-br from-emerald-500/5 to-blue-500/5">
|
||||
@@ -661,6 +872,45 @@ const Documents: React.FC = () => {
|
||||
</Card>
|
||||
)}
|
||||
|
||||
{/* Markdown AI 分析结果 */}
|
||||
{(mdAnalysis || mdStreamingContent) && (
|
||||
<Card className="border-none shadow-md border-l-4 border-l-purple-500">
|
||||
<CardHeader>
|
||||
<div className="flex items-center justify-between">
|
||||
<div className="space-y-1">
|
||||
<CardTitle className="flex items-center gap-2">
|
||||
<Sparkles className="text-purple-500" size={20} />
|
||||
Markdown AI 分析结果
|
||||
{mdStreaming && <Badge variant="default" className="ml-2 bg-purple-500">流式输出中</Badge>}
|
||||
</CardTitle>
|
||||
{mdAnalysis && (
|
||||
<CardDescription>
|
||||
{mdAnalysis.filename} • {mdAnalysis.word_count || 0} 字 • {mdAnalysis.analysis_type}
|
||||
{mdAnalysis.section && ` • ${mdAnalysis.section}`}
|
||||
</CardDescription>
|
||||
)}
|
||||
</div>
|
||||
{mdAnalysis?.structure && (
|
||||
<Badge variant="secondary">
|
||||
{mdAnalysis.structure.title_count || 0} 标题 • {mdAnalysis.structure.section_count || 0} 章节
|
||||
</Badge>
|
||||
)}
|
||||
</div>
|
||||
</CardHeader>
|
||||
<CardContent className="max-h-[500px] overflow-y-auto">
|
||||
{/* 流式内容优先显示 */}
|
||||
{mdStreamingContent && (
|
||||
<div className="animate-pulse text-sm text-muted-foreground mb-4">
|
||||
流式输出中...
|
||||
</div>
|
||||
)}
|
||||
{mdStreamingContent && <Markdown content={mdStreamingContent} />}
|
||||
{mdAnalysis?.analysis && !mdStreamingContent && <Markdown content={mdAnalysis.analysis} />}
|
||||
{!mdAnalysis?.success && !mdStreamingContent && <p className="text-sm text-destructive">{mdAnalysis?.error || '分析失败'}</p>}
|
||||
</CardContent>
|
||||
</Card>
|
||||
)}
|
||||
|
||||
{/* 图表显示 */}
|
||||
{analysisCharts && (
|
||||
<Card className="border-none shadow-md border-l-4 border-l-indigo-500">
|
||||
|
||||
Reference in New Issue
Block a user