后端完成异步和 RAG 设置

This commit is contained in:
2026-03-26 23:41:03 +08:00
parent 5bcad4a5fa
commit 6b88e971e8
14 changed files with 741 additions and 108 deletions

View File

@@ -2,17 +2,20 @@
文档管理 API 接口
支持多格式文档(docx/xlsx/md/txt)上传、解析、存储和RAG索引
集成 Excel 存储和 AI 生成字段描述
"""
import logging
import uuid
from datetime import datetime
from typing import List, Optional
from fastapi import APIRouter, UploadFile, File, HTTPException, Query, BackgroundTasks
from pydantic import BaseModel
from app.services.file_service import file_service
from app.core.database import mongodb, mysql_db
from app.core.database import mongodb, redis_db
from app.services.rag_service import rag_service
from app.services.table_rag_service import table_rag_service
from app.services.excel_storage_service import excel_storage_service
from app.core.document_parser import ParserFactory, ParseResult
logger = logging.getLogger(__name__)
@@ -31,7 +34,7 @@ class UploadResponse(BaseModel):
class TaskStatusResponse(BaseModel):
task_id: str
status: str # pending, processing, success, failure
status: str
progress: int = 0
message: Optional[str] = None
result: Optional[dict] = None
@@ -44,7 +47,6 @@ class TaskStatusResponse(BaseModel):
async def upload_document(
background_tasks: BackgroundTasks,
file: UploadFile = File(...),
doc_type: Optional[str] = Query(None, description="文档类型: docx/xlsx/md/txt"),
parse_all_sheets: bool = Query(False, description="是否解析所有工作表(仅Excel)"),
sheet_name: Optional[str] = Query(None, description="指定工作表(仅Excel)"),
header_row: int = Query(0, description="表头行号(仅Excel)")
@@ -56,13 +58,15 @@ async def upload_document(
1. 保存到本地存储
2. 解析内容
3. 存入 MongoDB (原始内容)
4. 如果是 Excel,存入 MySQL (结构化数据)
5. 建立 RAG 索引
4. 如果是 Excel
- 存入 MySQL (结构化数据)
- AI 生成字段描述
- 建立 RAG 索引
5. 建立 RAG 索引 (非结构化文档)
"""
if not file.filename:
raise HTTPException(status_code=400, detail="文件名为空")
# 根据扩展名确定文档类型
file_ext = file.filename.split('.')[-1].lower()
if file_ext not in ['docx', 'xlsx', 'xls', 'md', 'txt']:
raise HTTPException(
@@ -70,21 +74,16 @@ async def upload_document(
detail=f"不支持的文件类型: {file_ext},仅支持 docx/xlsx/xls/md/txt"
)
# 生成任务ID
task_id = str(uuid.uuid4())
try:
# 读取文件内容
content = await file.read()
# 保存文件
saved_path = file_service.save_uploaded_file(
content,
file.filename,
subfolder=file_ext
)
# 后台处理文档
background_tasks.add_task(
process_document,
task_id=task_id,
@@ -114,13 +113,8 @@ async def upload_document(
async def upload_documents(
background_tasks: BackgroundTasks,
files: List[UploadFile] = File(...),
doc_type: Optional[str] = Query(None, description="文档类型")
):
"""
批量上传文档
所有文档会异步处理,处理完成后可通过 task_id 查询状态
"""
"""批量上传文档"""
if not files:
raise HTTPException(status_code=400, detail="没有上传文件")
@@ -131,25 +125,15 @@ async def upload_documents(
for file in files:
if not file.filename:
continue
content = await file.read()
saved_path = file_service.save_uploaded_file(
content,
file.filename,
subfolder="batch"
)
saved_path = file_service.save_uploaded_file(content, file.filename, subfolder="batch")
saved_paths.append({
"path": saved_path,
"filename": file.filename,
"ext": file.filename.split('.')[-1].lower()
})
# 后台处理所有文档
background_tasks.add_task(
process_documents_batch,
task_id=task_id,
files=saved_paths
)
background_tasks.add_task(process_documents_batch, task_id=task_id, files=saved_paths)
return UploadResponse(
task_id=task_id,
@@ -173,13 +157,10 @@ async def process_document(
parse_options: dict
):
"""处理单个文档"""
from app.core.database import redis_db
try:
# 更新状态: 处理
# 状态: 解析
await redis_db.set_task_status(
task_id,
status="processing",
task_id, status="processing",
meta={"progress": 10, "message": "正在解析文档"}
)
@@ -190,11 +171,10 @@ async def process_document(
if not result.success:
raise Exception(result.error or "解析失败")
# 更新状态: 存储数据
# 状态: 存储
await redis_db.set_task_status(
task_id,
status="processing",
meta={"progress": 40, "message": "正在存储数据"}
task_id, status="processing",
meta={"progress": 30, "message": "正在存储数据"}
)
# 存储到 MongoDB
@@ -209,24 +189,53 @@ async def process_document(
structured_data=result.data.get("structured_data")
)
# 如果是 Excel,存储到 MySQL
# 如果是 Excel,存储到 MySQL + AI生成描述 + RAG索引
if doc_type in ["xlsx", "xls"]:
await store_excel_to_mysql(file_path, original_filename, result)
await redis_db.set_task_status(
task_id, status="processing",
meta={"progress": 50, "message": "正在存储到MySQL并生成字段描述"}
)
# 更新状态: 建立 RAG 索引
# 使用 TableRAG 服务完成建表和RAG索引
rag_result = await table_rag_service.build_table_rag_index(
file_path=file_path,
filename=original_filename,
sheet_name=parse_options.get("sheet_name"),
header_row=parse_options.get("header_row", 0)
)
if rag_result.get("success"):
logger.info(f"RAG索引构建成功: {original_filename}")
else:
logger.warning(f"RAG索引构建失败: {rag_result.get('error')}")
else:
# 非结构化文档
await redis_db.set_task_status(
task_id, status="processing",
meta={"progress": 60, "message": "正在建立索引"}
)
# 如果文档中有表格数据,提取并存储到 MySQL + RAG
structured_data = result.data.get("structured_data", {})
tables = structured_data.get("tables", [])
if tables:
# 对每个表格建立 MySQL 表和 RAG 索引
for table_info in tables:
await table_rag_service.index_document_table(
doc_id=doc_id,
filename=original_filename,
table_data=table_info,
source_doc_type=doc_type
)
# 同时对文档内容建立 RAG 索引
await index_document_to_rag(doc_id, original_filename, result, doc_type)
# 完成
await redis_db.set_task_status(
task_id,
status="processing",
meta={"progress": 70, "message": "正在建立索引"}
)
# 建立 RAG 索引
await index_document_to_rag(doc_id, original_filename, result, doc_type)
# 更新状态: 完成
await redis_db.set_task_status(
task_id,
status="success",
task_id, status="success",
meta={
"progress": 100,
"message": "处理完成",
@@ -244,20 +253,16 @@ async def process_document(
except Exception as e:
logger.error(f"文档处理失败: {str(e)}")
await redis_db.set_task_status(
task_id,
status="failure",
task_id, status="failure",
meta={"error": str(e)}
)
async def process_documents_batch(task_id: str, files: List[dict]):
"""批量处理文档"""
from app.core.database import redis_db
try:
await redis_db.set_task_status(
task_id,
status="processing",
task_id, status="processing",
meta={"progress": 0, "message": "开始批量处理"}
)
@@ -278,6 +283,29 @@ async def process_documents_batch(task_id: str, files: List[dict]):
},
structured_data=result.data.get("structured_data")
)
# Excel 处理
if file_info["ext"] in ["xlsx", "xls"]:
await table_rag_service.build_table_rag_index(
file_path=file_info["path"],
filename=file_info["filename"]
)
else:
# 非结构化文档:处理其中的表格 + 内容索引
structured_data = result.data.get("structured_data", {})
tables = structured_data.get("tables", [])
if tables:
for table_info in tables:
await table_rag_service.index_document_table(
doc_id=doc_id,
filename=file_info["filename"],
table_data=table_info,
source_doc_type=file_info["ext"]
)
await index_document_to_rag(doc_id, file_info["filename"], result, file_info["ext"])
results.append({"filename": file_info["filename"], "doc_id": doc_id, "success": True})
else:
results.append({"filename": file_info["filename"], "success": False, "error": result.error})
@@ -285,61 +313,38 @@ async def process_documents_batch(task_id: str, files: List[dict]):
except Exception as e:
results.append({"filename": file_info["filename"], "success": False, "error": str(e)})
# 更新进度
progress = int((i + 1) / len(files) * 100)
await redis_db.set_task_status(
task_id,
status="processing",
task_id, status="processing",
meta={"progress": progress, "message": f"已处理 {i+1}/{len(files)}"}
)
await redis_db.set_task_status(
task_id,
status="success",
task_id, status="success",
meta={"progress": 100, "message": "批量处理完成", "results": results}
)
except Exception as e:
logger.error(f"批量处理失败: {str(e)}")
await redis_db.set_task_status(
task_id,
status="failure",
task_id, status="failure",
meta={"error": str(e)}
)
async def store_excel_to_mysql(file_path: str, filename: str, result: ParseResult):
"""将 Excel 数据存储到 MySQL"""
# TODO: 实现 Excel 数据到 MySQL 的转换和存储
# 需要根据表头动态创建表结构
pass
async def index_document_to_rag(doc_id: str, filename: str, result: ParseResult, doc_type: str):
"""将文档索引到 RAG"""
"""非结构化文档索引到 RAG"""
try:
if doc_type in ["xlsx", "xls"]:
# Excel 文件: 索引字段信息
columns = result.metadata.get("columns", [])
for col in columns:
rag_service.index_field(
table_name=filename,
field_name=col,
field_description=f"Excel表格 {filename} 的列 {col}",
sample_values=None
)
else:
# 其他文档: 索引文档内容
content = result.data.get("content", "")
if content:
rag_service.index_document_content(
doc_id=doc_id,
content=content[:5000], # 限制长度
metadata={
"filename": filename,
"doc_type": doc_type
}
)
content = result.data.get("content", "")
if content:
rag_service.index_document_content(
doc_id=doc_id,
content=content[:5000],
metadata={
"filename": filename,
"doc_type": doc_type
}
)
except Exception as e:
logger.warning(f"RAG 索引失败: {str(e)}")
@@ -365,7 +370,3 @@ async def parse_uploaded_document(
except Exception as e:
logger.error(f"解析文档失败: {str(e)}")
raise HTTPException(status_code=500, detail=f"解析失败: {str(e)}")
# 需要添加 import
import logging

View File

@@ -4,6 +4,7 @@
提供模板上传、解析和填写功能
"""
import io
import logging
from typing import List, Optional
from fastapi import APIRouter, File, HTTPException, Query, UploadFile
@@ -222,7 +223,3 @@ async def export_filled_template(
except Exception as e:
logger.error(f"导出失败: {str(e)}")
raise HTTPException(status_code=500, detail=f"导出失败: {str(e)}")
# ==================== 需要添加的 import ====================
import logging