完成后端数据库连接配置

This commit is contained in:
2026-03-26 19:49:40 +08:00
parent d3bdb17e87
commit 4bdc3f9707
19 changed files with 2843 additions and 302 deletions

View File

@@ -0,0 +1,371 @@
"""
文档管理 API 接口
支持多格式文档(docx/xlsx/md/txt)上传、解析、存储和RAG索引
"""
import logging
import uuid
from datetime import datetime
from typing import List, Optional

from fastapi import APIRouter, UploadFile, File, HTTPException, Query, BackgroundTasks
from pydantic import BaseModel

from app.core.database import mongodb, mysql_db
from app.core.document_parser import ParserFactory, ParseResult
from app.services.file_service import file_service
from app.services.rag_service import rag_service
# Module-level logger and the router that groups all upload endpoints.
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/upload", tags=["文档上传"])
# ==================== Request / response models ====================
class UploadResponse(BaseModel):
    """Response returned once an upload has been accepted for processing."""
    task_id: str     # UUID identifying the background processing task
    file_count: int  # number of files accepted in this request
    message: str     # human-readable confirmation message
    status_url: str  # URL where the task status can be polled
class TaskStatusResponse(BaseModel):
    """Status payload describing an asynchronous processing task."""
    task_id: str                   # UUID of the task being queried
    status: str                    # pending, processing, success, failure
    progress: int = 0              # completion percentage, 0-100
    message: Optional[str] = None  # human-readable progress message
    result: Optional[dict] = None  # final result payload when successful
    error: Optional[str] = None    # error description when failed
# ==================== 文档上传接口 ====================
@router.post("/document", response_model=UploadResponse)
async def upload_document(
    background_tasks: BackgroundTasks,
    file: UploadFile = File(...),
    doc_type: Optional[str] = Query(None, description="文档类型: docx/xlsx/md/txt"),
    parse_all_sheets: bool = Query(False, description="是否解析所有工作表(仅Excel)"),
    sheet_name: Optional[str] = Query(None, description="指定工作表(仅Excel)"),
    header_row: int = Query(0, description="表头行号(仅Excel)")
):
    """Upload a single document and process it asynchronously.

    The background pipeline will:
      1. persist the file to local storage,
      2. parse its content,
      3. store the raw content in MongoDB,
      4. for Excel files, store structured data in MySQL,
      5. build the RAG index.

    NOTE(review): the ``doc_type`` query parameter is accepted but never
    used — the effective type is always derived from the file extension.
    """
    if not file.filename:
        raise HTTPException(status_code=400, detail="文件名为空")

    # Determine the document type from the filename extension.
    extension = file.filename.split('.')[-1].lower()
    if extension not in ['docx', 'xlsx', 'xls', 'md', 'txt']:
        raise HTTPException(
            status_code=400,
            detail=f"不支持的文件类型: {extension},仅支持 docx/xlsx/xls/md/txt"
        )

    # One task id per upload; used to poll progress later.
    task_id = str(uuid.uuid4())
    try:
        raw_bytes = await file.read()
        stored_path = file_service.save_uploaded_file(
            raw_bytes,
            file.filename,
            subfolder=extension
        )
        # Defer the heavy parsing / storage / indexing work to a background task.
        background_tasks.add_task(
            process_document,
            task_id=task_id,
            file_path=stored_path,
            original_filename=file.filename,
            doc_type=extension,
            parse_options={
                "parse_all_sheets": parse_all_sheets,
                "sheet_name": sheet_name,
                "header_row": header_row
            }
        )
        return UploadResponse(
            task_id=task_id,
            file_count=1,
            message=f"文档 {file.filename} 已提交处理",
            status_url=f"/api/v1/tasks/{task_id}"
        )
    except Exception as e:
        logger.error(f"上传文档失败: {str(e)}")
        raise HTTPException(status_code=500, detail=f"上传失败: {str(e)}")
@router.post("/documents", response_model=UploadResponse)
async def upload_documents(
    background_tasks: BackgroundTasks,
    files: List[UploadFile] = File(...),
    doc_type: Optional[str] = Query(None, description="文档类型")
):
    """Upload several documents in one request.

    Every file is persisted immediately; parsing, storage and indexing run
    in a single background task whose progress can be polled through the
    returned ``task_id``.
    """
    if not files:
        raise HTTPException(status_code=400, detail="没有上传文件")

    batch_id = str(uuid.uuid4())
    accepted = []
    try:
        for upload in files:
            if not upload.filename:
                # Entries without a filename cannot be typed or stored.
                continue
            payload = await upload.read()
            location = file_service.save_uploaded_file(
                payload,
                upload.filename,
                subfolder="batch"
            )
            accepted.append({
                "path": location,
                "filename": upload.filename,
                "ext": upload.filename.split('.')[-1].lower()
            })

        # One background task handles the whole batch sequentially.
        background_tasks.add_task(
            process_documents_batch,
            task_id=batch_id,
            files=accepted
        )
        return UploadResponse(
            task_id=batch_id,
            file_count=len(accepted),
            message=f"已提交 {len(accepted)} 个文档处理",
            status_url=f"/api/v1/tasks/{batch_id}"
        )
    except Exception as e:
        logger.error(f"批量上传失败: {str(e)}")
        raise HTTPException(status_code=500, detail=f"批量上传失败: {str(e)}")
# ==================== 任务处理函数 ====================
async def process_document(
    task_id: str,
    file_path: str,
    original_filename: str,
    doc_type: str,
    parse_options: dict
):
    """Background worker: parse one uploaded document, persist it, index it.

    Progress is published to Redis under ``task_id`` at each stage
    (10% parse, 40% store, 70% index, 100% done); on any failure the task
    is marked ``failure`` with the error text.

    NOTE(review): ``parse_options`` is accepted but never passed to the
    parser — sheet/header options for Excel are currently ignored.

    Args:
        task_id: id used as the Redis status key.
        file_path: path of the saved file on disk.
        original_filename: name the client uploaded the file under.
        doc_type: file extension (docx/xlsx/xls/md/txt).
        parse_options: Excel parse settings (currently unused).
    """
    # Imported here (not at module top) — presumably to avoid an import
    # cycle with app.core.database; TODO confirm.
    from app.core.database import redis_db
    try:
        # Stage 1: parsing.
        await redis_db.set_task_status(
            task_id,
            status="processing",
            meta={"progress": 10, "message": "正在解析文档"}
        )
        # Parse the document (synchronous call inside an async worker).
        parser = ParserFactory.get_parser(file_path)
        result = parser.parse(file_path)
        if not result.success:
            raise Exception(result.error or "解析失败")
        # Stage 2: persisting.
        await redis_db.set_task_status(
            task_id,
            status="processing",
            meta={"progress": 40, "message": "正在存储数据"}
        )
        # Raw content and metadata go to MongoDB.
        doc_id = await mongodb.insert_document(
            doc_type=doc_type,
            content=result.data.get("content", ""),
            metadata={
                **result.metadata,
                "original_filename": original_filename,
                "file_path": file_path
            },
            structured_data=result.data.get("structured_data")
        )
        # Excel additionally goes to MySQL (store_excel_to_mysql is a stub).
        if doc_type in ["xlsx", "xls"]:
            await store_excel_to_mysql(file_path, original_filename, result)
        # Stage 3: RAG indexing.
        await redis_db.set_task_status(
            task_id,
            status="processing",
            meta={"progress": 70, "message": "正在建立索引"}
        )
        await index_document_to_rag(doc_id, original_filename, result, doc_type)
        # Stage 4: done — final status carries the new document id.
        await redis_db.set_task_status(
            task_id,
            status="success",
            meta={
                "progress": 100,
                "message": "处理完成",
                "doc_id": doc_id,
                "result": {
                    "doc_id": doc_id,
                    "doc_type": doc_type,
                    "filename": original_filename
                }
            }
        )
        logger.info(f"文档处理完成: {original_filename}, doc_id: {doc_id}")
    except Exception as e:
        # Any stage failure marks the whole task failed; the saved file is
        # NOT cleaned up here.
        logger.error(f"文档处理失败: {str(e)}")
        await redis_db.set_task_status(
            task_id,
            status="failure",
            meta={"error": str(e)}
        )
async def process_documents_batch(task_id: str, files: List[dict]):
    """Background worker: parse and store a batch of uploaded documents.

    Each entry in ``files`` carries ``path``, ``filename`` and ``ext``.
    Per-file failures are recorded in the per-file result list; only an
    unexpected error outside the loop marks the whole task as failed.

    NOTE(review): unlike the single-document path, this batch path does
    not build a RAG index for the stored documents.
    """
    from app.core.database import redis_db
    try:
        await redis_db.set_task_status(
            task_id,
            status="processing",
            meta={"progress": 0, "message": "开始批量处理"}
        )

        total = len(files)
        outcomes = []
        for done, entry in enumerate(files, start=1):
            try:
                parser = ParserFactory.get_parser(entry["path"])
                parsed = parser.parse(entry["path"])
                if parsed.success:
                    doc_id = await mongodb.insert_document(
                        doc_type=entry["ext"],
                        content=parsed.data.get("content", ""),
                        metadata={
                            **parsed.metadata,
                            "original_filename": entry["filename"],
                            "file_path": entry["path"]
                        },
                        structured_data=parsed.data.get("structured_data")
                    )
                    outcomes.append({"filename": entry["filename"], "doc_id": doc_id, "success": True})
                else:
                    outcomes.append({"filename": entry["filename"], "success": False, "error": parsed.error})
            except Exception as e:
                outcomes.append({"filename": entry["filename"], "success": False, "error": str(e)})

            # Publish progress after every file, success or not.
            await redis_db.set_task_status(
                task_id,
                status="processing",
                meta={"progress": int(done / total * 100), "message": f"已处理 {done}/{total}"}
            )

        await redis_db.set_task_status(
            task_id,
            status="success",
            meta={"progress": 100, "message": "批量处理完成", "results": outcomes}
        )
    except Exception as e:
        logger.error(f"批量处理失败: {str(e)}")
        await redis_db.set_task_status(
            task_id,
            status="failure",
            meta={"error": str(e)}
        )
async def store_excel_to_mysql(file_path: str, filename: str, result: ParseResult):
    """Persist parsed Excel data into MySQL.

    Currently a stub: ``process_document`` already awaits this for
    xlsx/xls uploads, but no data is written yet.

    Args:
        file_path: path of the saved workbook on disk.
        filename: original upload filename.
        result: parser output with structured rows/columns.
    """
    # TODO: convert the parsed Excel data into MySQL rows; the table
    # schema must be created dynamically from the header row.
    pass
async def index_document_to_rag(doc_id: str, filename: str, result: ParseResult, doc_type: str):
    """Index a parsed document into the RAG store.

    Excel files are indexed per column (field-level retrieval); all other
    formats are indexed by their text content, truncated to 5000 chars.
    Indexing failures are logged and swallowed on purpose: the document is
    already persisted, so RAG indexing is best-effort.

    Args:
        doc_id: MongoDB id of the stored document.
        filename: original upload filename, used as the RAG table name.
        result: parser output containing content and metadata.
        doc_type: file extension (docx/xlsx/xls/md/txt).
    """
    try:
        if doc_type in ["xlsx", "xls"]:
            # Excel: index each column as a retrievable field.
            columns = result.metadata.get("columns", [])
            for col in columns:
                rag_service.index_field(
                    table_name=filename,
                    field_name=col,
                    # Fix: embed the actual filename instead of the
                    # "(unknown)" placeholder left in the description.
                    field_description=f"Excel表格 {filename} 的列 {col}",
                    sample_values=None
                )
        else:
            # Other formats: index the document text itself.
            content = result.data.get("content", "")
            if content:
                rag_service.index_document_content(
                    doc_id=doc_id,
                    content=content[:5000],  # cap indexed text size
                    metadata={
                        "filename": filename,
                        "doc_type": doc_type
                    }
                )
    except Exception as e:
        logger.warning(f"RAG 索引失败: {str(e)}")
# ==================== Document parsing endpoint ====================
@router.post("/document/parse")
async def parse_uploaded_document(
    file_path: str = Query(..., description="文件路径")
):
    """Parse an already-uploaded document and return the parse result.

    SECURITY(review): ``file_path`` comes straight from the query string
    and is handed to the parser unchecked — a caller could point it at any
    readable file on the server. It should be restricted to the upload
    directory before shipping.

    Args:
        file_path: path of the file on the server's filesystem.

    Raises:
        HTTPException: 400 on ValueError from the parser factory or on a
            parse failure; 500 on unexpected errors.
    """
    try:
        parser = ParserFactory.get_parser(file_path)
        result = parser.parse(file_path)
        if result.success:
            return result.to_dict()
        else:
            raise HTTPException(status_code=400, detail=result.error)
    except ValueError as e:
        # ValueError is mapped to a client error (bad path / unsupported type).
        raise HTTPException(status_code=400, detail=str(e))
    except Exception as e:
        logger.error(f"解析文档失败: {str(e)}")
        raise HTTPException(status_code=500, detail=f"解析失败: {str(e)}")
# 需要添加 import
import logging

View File

@@ -0,0 +1,76 @@
"""
健康检查接口
"""
from datetime import datetime
from typing import Any, Dict
from fastapi import APIRouter
from app.core.database import mysql_db, mongodb, redis_db
# No prefix: /health endpoints are served at the application root.
router = APIRouter(tags=["健康检查"])
@router.get("/health")
async def health_check() -> Dict[str, Any]:
    """Health-check endpoint.

    Reports the connection state of each backing store (MySQL, MongoDB,
    Redis) plus an overall status: "healthy" when everything is connected,
    "degraded" otherwise.

    Returns:
        Dict with ``status``, a UTC ``timestamp`` and a per-service map.
    """
    # Local import: the module header only imports `datetime` itself.
    from datetime import timezone

    mysql_status = "connected"
    mongodb_status = "connected"
    redis_status = "connected"

    # Each probe only inspects the client/engine object; no query is
    # issued, so a hung server can still report "connected".
    try:
        if mysql_db.async_engine is None:
            mysql_status = "disconnected"
    except Exception:
        mysql_status = "error"
    try:
        if mongodb.client is None:
            mongodb_status = "disconnected"
    except Exception:
        mongodb_status = "error"
    try:
        if not redis_db.is_connected:
            redis_status = "disconnected"
    except Exception:
        redis_status = "error"

    all_up = all(
        s == "connected"
        for s in (mysql_status, mongodb_status, redis_status)
    )
    return {
        "status": "healthy" if all_up else "degraded",
        # Timezone-aware replacement for the deprecated datetime.utcnow().
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "services": {
            "mysql": mysql_status,
            "mongodb": mongodb_status,
            "redis": redis_status,
        }
    }
@router.get("/health/ready")
async def readiness_check() -> Dict[str, str]:
    """Readiness probe for Kubernetes / load balancers.

    Always reports ready; no dependency checks are performed here.
    """
    payload = {"status": "ready"}
    return payload
@router.get("/health/live")
async def liveness_check() -> Dict[str, str]:
    """Liveness probe for Kubernetes / load balancers.

    Always reports alive; reaching this handler at all is the signal.
    """
    payload = {"status": "alive"}
    return payload

View File

@@ -0,0 +1,139 @@
"""
文档库管理 API 接口
提供文档列表、详情查询和删除功能
"""
from typing import Optional, List
from fastapi import APIRouter, HTTPException, Query
from pydantic import BaseModel
from app.core.database import mongodb
# Router for the document-library endpoints, mounted under /documents.
router = APIRouter(prefix="/documents", tags=["文档库"])
class DocumentItem(BaseModel):
    """Serialized shape of one document list entry.

    NOTE(review): declared but not used as a response_model by the
    endpoints below, which build plain dicts instead.
    """
    doc_id: str                      # MongoDB document id as a string
    filename: str                    # stored filename
    original_filename: str           # name the client uploaded under
    doc_type: str                    # file extension (docx/xlsx/md/txt)
    file_size: int                   # size in bytes
    created_at: str                  # ISO-formatted creation timestamp
    metadata: Optional[dict] = None  # extra per-document metadata
@router.get("")
async def get_documents(
    doc_type: Optional[str] = Query(None, description="文档类型过滤"),
    limit: int = Query(50, ge=1, le=100, description="返回数量")
):
    """List stored documents, newest first.

    Args:
        doc_type: optional filter on the document type.
        limit: maximum number of entries to return (1-100).

    Returns:
        Dict with ``success``, the serialized ``documents`` and ``total``.
    """
    try:
        criteria = {"doc_type": doc_type} if doc_type else {}
        cursor = mongodb.documents.find(criteria).sort("created_at", -1).limit(limit)

        items = []
        async for record in cursor:
            meta = record.get("metadata", {})
            created = record.get("created_at")
            items.append({
                "doc_id": str(record["_id"]),
                "filename": meta.get("filename", ""),
                "original_filename": meta.get("original_filename", ""),
                "doc_type": record.get("doc_type", ""),
                "file_size": meta.get("file_size", 0),
                "created_at": created.isoformat() if created else "",
                "metadata": {
                    "row_count": meta.get("row_count"),
                    "column_count": meta.get("column_count"),
                    # Only the first 10 columns to keep the payload small.
                    "columns": meta.get("columns", [])[:10]
                }
            })
        return {
            "success": True,
            "documents": items,
            "total": len(items)
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"获取文档列表失败: {str(e)}")
@router.get("/{doc_id}")
async def get_document(doc_id: str):
    """Fetch one document with its full content.

    Args:
        doc_id: MongoDB document id.

    Raises:
        HTTPException: 404 when the id is unknown, 500 on storage errors.
    """
    try:
        record = await mongodb.get_document(doc_id)
        if not record:
            raise HTTPException(status_code=404, detail="文档不存在")

        meta = record.get("metadata", {})
        created = record.get("created_at")
        return {
            "success": True,
            "document": {
                "doc_id": str(record["_id"]),
                "filename": meta.get("filename", ""),
                "original_filename": meta.get("original_filename", ""),
                "doc_type": record.get("doc_type", ""),
                "file_size": meta.get("file_size", 0),
                "created_at": created.isoformat() if created else "",
                "content": record.get("content", ""),            # raw text content
                "structured_data": record.get("structured_data"),  # tabular data, if any
                "metadata": meta
            }
        }
    except HTTPException:
        # Re-raise the 404 untouched instead of wrapping it as a 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"获取文档详情失败: {str(e)}")
@router.delete("/{doc_id}")
async def delete_document(doc_id: str):
    """Delete a document by its id.

    Args:
        doc_id: MongoDB document id.

    Raises:
        HTTPException: 404 when the id is unknown, 500 on storage errors.
    """
    try:
        removed = await mongodb.delete_document(doc_id)
        if not removed:
            raise HTTPException(status_code=404, detail="文档不存在")

        # TODO: also drop the MySQL rows for Excel documents
        # TODO: also drop the matching RAG index entries
        return {
            "success": True,
            "message": "文档已删除"
        }
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"删除失败: {str(e)}")

View File

@@ -0,0 +1,116 @@
"""
RAG 检索 API 接口
提供向量检索功能
"""
from typing import Optional
from fastapi import APIRouter, HTTPException, Query
from pydantic import BaseModel
from app.services.rag_service import rag_service
# Router for RAG retrieval endpoints, mounted under /rag.
router = APIRouter(prefix="/rag", tags=["RAG检索"])
class SearchRequest(BaseModel):
    """Body of a RAG search call."""
    query: str      # free-text query to match against the index
    top_k: int = 5  # number of hits to return

class SearchResult(BaseModel):
    """Shape of one retrieval hit.

    NOTE(review): declared but not used as a response_model below;
    /search passes the service results through unchanged.
    """
    content: str    # matched text fragment
    metadata: dict  # metadata stored alongside the vector
    score: float    # similarity score
    doc_id: str     # id of the source document
@router.post("/search")
async def search_rag(
    request: SearchRequest
):
    """Semantic retrieval over the RAG index.

    Args:
        request: query text plus the number of hits to return.

    Returns:
        Dict with ``success`` and the retrieved ``results``.
    """
    try:
        hits = rag_service.retrieve(query=request.query, top_k=request.top_k)
        return {"success": True, "results": hits}
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"检索失败: {str(e)}")
@router.get("/status")
async def get_rag_status():
    """Report RAG index statistics (currently just the vector count)."""
    try:
        total_vectors = rag_service.get_vector_count()
        return {
            "success": True,
            "vector_count": total_vectors,
            # Reserved collection names; not yet backed by separate stores.
            "collections": ["document_fields", "document_content"]
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"获取状态失败: {str(e)}")
@router.post("/rebuild")
async def rebuild_rag_index():
    """Rebuild the whole RAG index from the documents stored in MongoDB.

    NOTE(review): the existing index is cleared *before* re-indexing, so a
    failure mid-rebuild leaves the index empty.
    """
    from app.core.database import mongodb
    try:
        # Drop the current index, then re-index every document with content.
        rag_service.clear()

        indexed = 0
        async for record in mongodb.documents.find({}):
            text = record.get("content", "")
            if not text:
                continue
            rag_service.index_document_content(
                doc_id=str(record["_id"]),
                content=text[:5000],
                metadata={
                    "filename": record.get("metadata", {}).get("filename"),
                    "doc_type": record.get("doc_type")
                }
            )
            indexed += 1
        return {
            "success": True,
            "message": f"已重建索引,共处理 {indexed} 个文档"
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"重建索引失败: {str(e)}")

View File

@@ -0,0 +1,38 @@
"""
任务管理 API 接口
提供异步任务状态查询
"""
from typing import Optional
from fastapi import APIRouter, HTTPException
from app.core.database import redis_db
# Router for task-status queries, mounted under /tasks.
router = APIRouter(prefix="/tasks", tags=["任务管理"])
@router.get("/{task_id}")
async def get_task_status(task_id: str):
    """Return the current state of an asynchronous task.

    Args:
        task_id: id returned by one of the upload endpoints.

    Raises:
        HTTPException: 404 when Redis holds no record for the task.
    """
    record = await redis_db.get_task_status(task_id)
    if not record:
        raise HTTPException(status_code=404, detail=f"任务 {task_id} 不存在")

    meta = record.get("meta", {})
    return {
        "task_id": task_id,
        "status": record.get("status", "unknown"),
        "progress": meta.get("progress", 0),
        "message": meta.get("message"),
        "result": meta.get("result"),
        "error": meta.get("error")
    }