Compare commits
49 Commits
2f630695ff
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
| 5fca4eb094 | |||
| 0dbf74db9d | |||
| 858b594171 | |||
| ed0f51f2a4 | |||
| ecc0c79475 | |||
| 6befc510d8 | |||
| 8f66c235fa | |||
| 886d5ae0cc | |||
| 6752c5c231 | |||
| 610d475ce0 | |||
| 496b96508d | |||
| 07ebdc09bc | |||
| 7f67fa89de | |||
| c1886fb68f | |||
| 78417c898a | |||
| d5df5b8283 | |||
| 718f864926 | |||
| e5711b3f05 | |||
| df35105d16 | |||
| 2c2ab56d2d | |||
| faff1a5977 | |||
| b2ebd3e12d | |||
| 4eda6cf758 | |||
| 38e41c6eff | |||
| 6f8976cf71 | |||
| 44d389a434 | |||
| c75eb26d60 | |||
| 3b82103e87 | |||
| fd435c7fd3 | |||
| 41e5eaaa2d | |||
| 7c19e49988 | |||
| d189ea9620 | |||
| ddf30078f0 | |||
| 1a54d40e01 | |||
| ec4759512d | |||
| 8e1ddb8aff | |||
| 8b12cb9322 | |||
| b9ca11efe5 | |||
| c122f1d63b | |||
| 332f0f636d | |||
| d494e78f70 | |||
| 091c9db0da | |||
| 4e178477fe | |||
| 7c88da9ab1 | |||
| 6b88e971e8 | |||
| 5bcad4a5fa | |||
| 4bdc3f9707 | |||
| d3bdb17e87 | |||
| eab5f88662 |
38
.gitignore
vendored
Normal file
38
.gitignore
vendored
Normal file
@@ -0,0 +1,38 @@
|
||||
/.git/
|
||||
/.idea/
|
||||
/.vscode/
|
||||
/backend/venv/
|
||||
/backend/command/
|
||||
/backend/.env
|
||||
/backend/.env.local
|
||||
/backend/.env.*.local
|
||||
/backend/app/__pycache__/*
|
||||
/backend/data/uploads
|
||||
/backend/data/charts
|
||||
/backend/data/logs
|
||||
|
||||
/frontend/node_modules/
|
||||
/frontend/dist/
|
||||
/frontend/build/
|
||||
/frontend/.vscode/
|
||||
/frontend/.idea/
|
||||
/frontend/.env
|
||||
/frontend/*.log
|
||||
/技术路线.md
|
||||
/开发路径.md
|
||||
/开发日志_2026-03-16.md
|
||||
/frontendTest/
|
||||
/docs/
|
||||
/frontend/src/api/
|
||||
/frontend/src/api/index.js
|
||||
/frontend/src/api/index.ts
|
||||
/frontend/src/api/index.tsx
|
||||
/frontend/src/api/index.py
|
||||
/frontend/src/api/index.go
|
||||
/frontend/src/api/index.java
|
||||
/docs/
|
||||
/frontend - 副本/*
|
||||
/supabase.txt
|
||||
|
||||
**/__pycache__/*
|
||||
**.pyc
|
||||
@@ -1,16 +1,56 @@
|
||||
# 基础配置
|
||||
# ============================================================
|
||||
# 基于大语言模型的文档理解与多源数据融合系统
|
||||
# 环境变量配置文件
|
||||
# ============================================================
|
||||
# 复制此文件为 .env 并填入实际值
|
||||
|
||||
# ==================== 应用基础配置 ====================
|
||||
APP_NAME="FilesReadSystem"
|
||||
DEBUG=true
|
||||
API_V1_STR="/api/v1"
|
||||
|
||||
# 数据库
|
||||
MONGODB_URL="mongodb://username:password@host:port"
|
||||
MONGODB_DB_NAME=""
|
||||
# ==================== MongoDB 配置 ====================
|
||||
# 非结构化数据存储 (原始文档、解析结果)
|
||||
MONGODB_URL="mongodb://localhost:27017"
|
||||
MONGODB_DB_NAME="document_system"
|
||||
|
||||
# ==================== MySQL 配置 ====================
|
||||
# 结构化数据存储 (Excel表格、查询结果)
|
||||
MYSQL_HOST="localhost"
|
||||
MYSQL_PORT=3306
|
||||
MYSQL_USER="root"
|
||||
MYSQL_PASSWORD="your_password_here"
|
||||
MYSQL_DATABASE="document_system"
|
||||
MYSQL_CHARSET="utf8mb4"
|
||||
|
||||
# ==================== Redis 配置 ====================
|
||||
# 缓存/任务队列
|
||||
REDIS_URL="redis://localhost:6379/0"
|
||||
|
||||
# 大模型 API
|
||||
LLM_API_KEY=""
|
||||
# ==================== LLM AI 配置 ====================
|
||||
# 大语言模型 API 配置
|
||||
LLM_API_KEY="your_api_key_here"
|
||||
LLM_BASE_URL=""
|
||||
LLM_MODEL_NAME=""
|
||||
|
||||
# 文件存储配置
|
||||
# ==================== Supabase 配置 ====================
|
||||
# Supabase 项目配置
|
||||
SUPABASE_URL="your_supabase_url_here"
|
||||
SUPABASE_ANON_KEY="your_supabase_anon_key_here"
|
||||
SUPABASE_SERVICE_KEY="your_supabase_service_key_here"
|
||||
|
||||
# ==================== 文件路径配置 ====================
|
||||
# 上传文件存储目录 (相对于项目根目录)
|
||||
UPLOAD_DIR="./data/uploads"
|
||||
MAX_UPLOAD_SIZE=104857600 # 100MB
|
||||
|
||||
# Faiss 向量数据库持久化目录 (LangChain + Faiss 实现)
|
||||
FAISS_INDEX_DIR="./data/faiss"
|
||||
|
||||
# ==================== RAG 配置 ====================
|
||||
# Embedding 模型名称
|
||||
EMBEDDING_MODEL="all-MiniLM-L6-v2"
|
||||
|
||||
# ==================== Celery 配置 ====================
|
||||
# 异步任务队列 Broker
|
||||
CELERY_BROKER_URL="redis://localhost:6379/1"
|
||||
CELERY_RESULT_BACKEND="redis://localhost:6379/2"
|
||||
|
||||
Binary file not shown.
Binary file not shown.
@@ -2,13 +2,30 @@
|
||||
API 路由注册模块
|
||||
"""
|
||||
from fastapi import APIRouter
|
||||
from app.api.endpoints import upload, ai_analyze, visualization, analysis_charts
|
||||
from app.api.endpoints import (
|
||||
upload,
|
||||
documents, # 多格式文档上传
|
||||
tasks, # 任务管理
|
||||
library, # 文档库
|
||||
rag, # RAG检索
|
||||
templates, # 表格模板
|
||||
ai_analyze,
|
||||
visualization,
|
||||
analysis_charts,
|
||||
health,
|
||||
)
|
||||
|
||||
# 创建主路由
|
||||
api_router = APIRouter()
|
||||
|
||||
# 注册各模块路由
|
||||
api_router.include_router(upload.router)
|
||||
api_router.include_router(ai_analyze.router)
|
||||
api_router.include_router(visualization.router)
|
||||
api_router.include_router(analysis_charts.router)
|
||||
api_router.include_router(health.router) # 健康检查
|
||||
api_router.include_router(upload.router) # 原有Excel上传
|
||||
api_router.include_router(documents.router) # 多格式文档上传
|
||||
api_router.include_router(tasks.router) # 任务状态查询
|
||||
api_router.include_router(library.router) # 文档库管理
|
||||
api_router.include_router(rag.router) # RAG检索
|
||||
api_router.include_router(templates.router) # 表格模板
|
||||
api_router.include_router(ai_analyze.router) # AI分析
|
||||
api_router.include_router(visualization.router) # 可视化
|
||||
api_router.include_router(analysis_charts.router) # 分析图表
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -2,10 +2,14 @@
|
||||
AI 分析 API 接口
|
||||
"""
|
||||
from fastapi import APIRouter, UploadFile, File, HTTPException, Query, Body
|
||||
from fastapi.responses import StreamingResponse
|
||||
from typing import Optional
|
||||
import logging
|
||||
import tempfile
|
||||
import os
|
||||
|
||||
from app.services.excel_ai_service import excel_ai_service
|
||||
from app.services.markdown_ai_service import markdown_ai_service
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -93,10 +97,11 @@ async def get_analysis_types():
|
||||
获取支持的分析类型列表
|
||||
|
||||
Returns:
|
||||
list: 支持的分析类型
|
||||
dict: 支持的分析类型(包含 Excel 和 Markdown)
|
||||
"""
|
||||
return {
|
||||
"types": excel_ai_service.get_supported_analysis_types()
|
||||
"excel_types": excel_ai_service.get_supported_analysis_types(),
|
||||
"markdown_types": markdown_ai_service.get_supported_analysis_types()
|
||||
}
|
||||
|
||||
|
||||
@@ -142,3 +147,196 @@ async def analyze_text(
|
||||
except Exception as e:
|
||||
logger.error(f"文本分析失败: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail=f"分析失败: {str(e)}")
|
||||
|
||||
|
||||
@router.post("/analyze/md")
async def analyze_markdown(
    file: UploadFile = File(...),
    analysis_type: str = Query("summary", description="分析类型: summary, outline, key_points, questions, tags, qa, statistics, section"),
    user_prompt: str = Query("", description="用户自定义提示词"),
    section_number: Optional[str] = Query(None, description="指定章节编号,如 '一' 或 '(一)'")
):
    """
    Upload a Markdown file and analyse it with the Markdown AI service.

    The upload is written to a temporary ``.md`` file, passed to
    ``markdown_ai_service.analyze_markdown`` and removed again afterwards.

    Args:
        file: Uploaded Markdown file (``.md`` or ``.markdown``).
        analysis_type: Analysis to run; must be one of the values reported
            by ``markdown_ai_service.get_supported_analysis_types()``.
        user_prompt: Custom prompt forwarded to the service.
        section_number: Section identifier forwarded to the service.

    Returns:
        The result dict produced by the analysis service.

    Raises:
        HTTPException: 400 for an empty filename, an unsupported extension
            or an unsupported analysis type; 500 when the service reports
            failure or any other error occurs.
    """
    # Reject uploads that carry no filename at all.
    if not file.filename:
        raise HTTPException(status_code=400, detail="文件名为空")

    # Only Markdown extensions are accepted.
    extension = file.filename.split('.')[-1].lower()
    if extension not in ('md', 'markdown'):
        raise HTTPException(
            status_code=400,
            detail=f"不支持的文件类型: {extension},仅支持 .md 和 .markdown"
        )

    # The requested analysis must be one the service advertises.
    allowed_types = markdown_ai_service.get_supported_analysis_types()
    if analysis_type not in allowed_types:
        raise HTTPException(
            status_code=400,
            detail=f"不支持的分析类型: {analysis_type},支持的类型: {', '.join(allowed_types)}"
        )

    try:
        payload = await file.read()

        # The service works on a path, so persist the upload temporarily.
        with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as handle:
            handle.write(payload)
            tmp_path = handle.name

        try:
            logger.info(f"开始分析 Markdown 文件: {file.filename}, 分析类型: {analysis_type}, 章节: {section_number}")

            result = await markdown_ai_service.analyze_markdown(
                file_path=tmp_path,
                analysis_type=analysis_type,
                user_prompt=user_prompt,
                section_number=section_number
            )

            logger.info(f"Markdown 分析完成: {file.filename}, 成功: {result['success']}")

            if result['success']:
                return result
            raise HTTPException(status_code=500, detail=result.get('error', '分析失败'))

        finally:
            # Always remove the temporary file; a cleanup failure is only logged.
            try:
                if tmp_path and os.path.exists(tmp_path):
                    os.unlink(tmp_path)
            except Exception as cleanup_error:
                logger.warning(f"临时文件清理失败: {tmp_path}, error: {cleanup_error}")

    except HTTPException:
        # Pass deliberate HTTP errors through unchanged.
        raise
    except Exception as e:
        logger.error(f"Markdown AI 分析过程中出错: {str(e)}")
        raise HTTPException(status_code=500, detail=f"分析失败: {str(e)}")
|
||||
|
||||
|
||||
@router.post("/analyze/md/stream")
async def analyze_markdown_stream(
    file: UploadFile = File(...),
    analysis_type: str = Query("summary", description="分析类型"),
    user_prompt: str = Query("", description="用户自定义提示词"),
    section_number: Optional[str] = Query(None, description="指定章节编号")
):
    """
    Stream an AI analysis of an uploaded Markdown file as Server-Sent Events.

    The upload is written to a temporary ``.md`` file that is read by
    ``markdown_ai_service.analyze_markdown_stream`` while the response is
    being streamed. The file is therefore removed when the stream generator
    finishes (or is closed), not when this handler returns; deleting it in
    the handler would remove it before the generator ever runs.

    Args:
        file: Uploaded Markdown file (``.md`` or ``.markdown``).
        analysis_type: Analysis type forwarded to the service (not validated
            in this endpoint).
        user_prompt: Custom prompt forwarded to the service.
        section_number: Section identifier forwarded to the service.

    Returns:
        StreamingResponse: ``text/event-stream`` response yielding the
        chunks produced by the service.

    Raises:
        HTTPException: 400 for an empty filename or unsupported extension;
            500 when preparing the stream fails.
    """
    if not file.filename:
        raise HTTPException(status_code=400, detail="文件名为空")

    file_ext = file.filename.split('.')[-1].lower()
    if file_ext not in ['md', 'markdown']:
        raise HTTPException(
            status_code=400,
            detail=f"不支持的文件类型: {file_ext},仅支持 .md 和 .markdown"
        )

    tmp_path = None

    def _remove_tmp():
        """Delete the temporary file if it exists; failures are only logged."""
        try:
            if tmp_path and os.path.exists(tmp_path):
                os.unlink(tmp_path)
        except Exception as cleanup_error:
            logger.warning(f"临时文件清理失败: {tmp_path}, error: {cleanup_error}")

    try:
        content = await file.read()

        with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as tmp:
            tmp.write(content)
            tmp_path = tmp.name

        logger.info(f"开始流式分析 Markdown 文件: {file.filename}, 分析类型: {analysis_type}")

        async def stream_generator():
            # The body only runs once the response is iterated, so the
            # temporary file must live until the stream ends.
            # NOTE(review): if the response is never iterated, this finally
            # block never runs and the temporary file is left behind.
            try:
                async for chunk in markdown_ai_service.analyze_markdown_stream(
                    file_path=tmp_path,
                    analysis_type=analysis_type,
                    user_prompt=user_prompt,
                    section_number=section_number
                ):
                    yield chunk
            finally:
                _remove_tmp()

        return StreamingResponse(
            stream_generator(),
            media_type="text/event-stream",
            headers={
                "Cache-Control": "no-cache",
                "Connection": "keep-alive",
                "X-Accel-Buffering": "no"
            }
        )

    except HTTPException:
        # The stream never started, so clean up here.
        _remove_tmp()
        raise
    except Exception as e:
        _remove_tmp()
        logger.error(f"Markdown AI 流式分析出错: {str(e)}")
        raise HTTPException(status_code=500, detail=f"流式分析失败: {str(e)}")
|
||||
|
||||
|
||||
@router.post("/analyze/md/outline")
async def get_markdown_outline(
    file: UploadFile = File(...)
):
    """
    Return the outline (section structure) of an uploaded Markdown file.

    The upload is written to a temporary ``.md`` file, passed to
    ``markdown_ai_service.extract_outline`` and removed again afterwards.

    Args:
        file: Uploaded Markdown file (``.md`` or ``.markdown``).

    Returns:
        Whatever ``markdown_ai_service.extract_outline`` returns.

    Raises:
        HTTPException: 400 for an empty filename or unsupported extension;
            500 for any error while reading or analysing the file.
    """
    if not file.filename:
        raise HTTPException(status_code=400, detail="文件名为空")

    extension = file.filename.split('.')[-1].lower()
    if extension not in ('md', 'markdown'):
        raise HTTPException(
            status_code=400,
            detail=f"不支持的文件类型: {extension},仅支持 .md 和 .markdown"
        )

    try:
        payload = await file.read()

        # The service works on a path, so persist the upload temporarily.
        with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as handle:
            handle.write(payload)
            tmp_path = handle.name

        try:
            outline = await markdown_ai_service.extract_outline(tmp_path)
            return outline
        finally:
            # Always remove the temporary file; a cleanup failure is only logged.
            try:
                if tmp_path and os.path.exists(tmp_path):
                    os.unlink(tmp_path)
            except Exception as cleanup_error:
                logger.warning(f"临时文件清理失败: {tmp_path}, error: {cleanup_error}")

    except Exception as e:
        logger.error(f"获取 Markdown 大纲失败: {str(e)}")
        raise HTTPException(status_code=500, detail=f"获取大纲失败: {str(e)}")
|
||||
|
||||
443
backend/app/api/endpoints/documents.py
Normal file
443
backend/app/api/endpoints/documents.py
Normal file
@@ -0,0 +1,443 @@
|
||||
"""
|
||||
文档管理 API 接口
|
||||
|
||||
支持多格式文档(docx/xlsx/md/txt)上传、解析、存储和RAG索引
|
||||
集成 Excel 存储和 AI 生成字段描述
|
||||
"""
|
||||
import logging
|
||||
import uuid
|
||||
from typing import List, Optional
|
||||
|
||||
from fastapi import APIRouter, UploadFile, File, HTTPException, Query, BackgroundTasks
|
||||
from pydantic import BaseModel
|
||||
|
||||
from app.services.file_service import file_service
|
||||
from app.core.database import mongodb, redis_db
|
||||
from app.services.rag_service import rag_service
|
||||
from app.services.table_rag_service import table_rag_service
|
||||
from app.services.excel_storage_service import excel_storage_service
|
||||
from app.core.document_parser import ParserFactory, ParseResult
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
router = APIRouter(prefix="/upload", tags=["文档上传"])
|
||||
|
||||
|
||||
# ==================== 辅助函数 ====================
|
||||
|
||||
async def update_task_status(
    task_id: str,
    status: str,
    progress: int = 0,
    message: str = "",
    result: dict = None,
    error: str = None
):
    """
    Record a task's status in both Redis and MongoDB.

    Each store is written independently; a failure in one is logged as a
    warning and does not prevent the write to the other, and nothing is
    raised to the caller.

    Args:
        task_id: Task identifier.
        status: New status value.
        progress: Progress value stored in the Redis metadata.
        message: Human-readable status message.
        result: Optional result payload; added to the Redis metadata only
            when truthy.
        error: Optional error text; added to the Redis metadata only when
            truthy.
    """
    # Metadata for Redis: progress and message always, extras only when set.
    meta = {"progress": progress, "message": message}
    for key, value in (("result", result), ("error", error)):
        if value:
            meta[key] = value

    # First store: Redis.
    try:
        await redis_db.set_task_status(task_id, status, meta)
    except Exception as redis_err:
        logger.warning(f"Redis 任务状态更新失败: {redis_err}")

    # Second store: MongoDB (kept as a fallback record).
    try:
        await mongodb.update_task(
            task_id=task_id,
            status=status,
            message=message,
            result=result,
            error=error
        )
    except Exception as mongo_err:
        logger.warning(f"MongoDB 任务状态更新失败: {mongo_err}")
|
||||
|
||||
|
||||
# ==================== 请求/响应模型 ====================
|
||||
|
||||
class UploadResponse(BaseModel):
    """Response body returned by the upload endpoints after a task is queued."""

    # Identifier of the background task created for the upload.
    task_id: str
    # Number of files accepted in this request.
    file_count: int
    # Human-readable confirmation message.
    message: str
    # URL of the task status resource.
    status_url: str
|
||||
|
||||
|
||||
class TaskStatusResponse(BaseModel):
    """Shape of a task status record: status, progress and optional outcome."""

    # Identifier of the task.
    task_id: str
    # Current status value.
    status: str
    # Progress value; defaults to 0.
    progress: int = 0
    # Optional status message.
    message: Optional[str] = None
    # Optional result payload.
    result: Optional[dict] = None
    # Optional error text.
    error: Optional[str] = None
|
||||
|
||||
|
||||
# ==================== 文档上传接口 ====================
|
||||
|
||||
@router.post("/document", response_model=UploadResponse)
async def upload_document(
    background_tasks: BackgroundTasks,
    file: UploadFile = File(...),
    parse_all_sheets: bool = Query(False, description="是否解析所有工作表(仅Excel)"),
    sheet_name: Optional[str] = Query(None, description="指定工作表(仅Excel)"),
    header_row: int = Query(0, description="表头行号(仅Excel)")
):
    """
    Upload a single document and schedule it for background processing.

    The handler registers a task record in MongoDB (best effort), saves the
    upload through ``file_service`` and queues ``process_document`` as a
    background task with the Excel parse options.

    Args:
        background_tasks: FastAPI background task queue.
        file: Uploaded file; docx, xlsx, xls, md and txt are accepted.
        parse_all_sheets: Excel only - whether to parse all worksheets.
        sheet_name: Excel only - worksheet to parse.
        header_row: Excel only - header row index.

    Returns:
        UploadResponse: Task id, file count (always 1), message and the
        status URL of the task.

    Raises:
        HTTPException: 400 for an empty filename or unsupported extension;
            500 when saving or scheduling fails.
    """
    if not file.filename:
        raise HTTPException(status_code=400, detail="文件名为空")

    extension = file.filename.split('.')[-1].lower()
    if extension not in ('docx', 'xlsx', 'xls', 'md', 'txt'):
        raise HTTPException(
            status_code=400,
            detail=f"不支持的文件类型: {extension},仅支持 docx/xlsx/xls/md/txt"
        )

    task_id = str(uuid.uuid4())
    submitted_message = f"文档 {file.filename} 已提交处理"

    try:
        # Persist a task record in MongoDB so the task can be queried even
        # without Redis; a failure here is only logged.
        try:
            await mongodb.insert_task(
                task_id=task_id,
                task_type="document_parse",
                status="pending",
                message=submitted_message
            )
        except Exception as mongo_err:
            logger.warning(f"MongoDB 保存任务记录失败: {mongo_err}")

        payload = await file.read()
        saved_path = file_service.save_uploaded_file(
            payload,
            file.filename,
            subfolder=extension
        )

        parse_options = {
            "parse_all_sheets": parse_all_sheets,
            "sheet_name": sheet_name,
            "header_row": header_row
        }
        background_tasks.add_task(
            process_document,
            task_id=task_id,
            file_path=saved_path,
            original_filename=file.filename,
            doc_type=extension,
            parse_options=parse_options
        )

        return UploadResponse(
            task_id=task_id,
            file_count=1,
            message=submitted_message,
            status_url=f"/api/v1/tasks/{task_id}"
        )

    except Exception as e:
        logger.error(f"上传文档失败: {str(e)}")
        raise HTTPException(status_code=500, detail=f"上传失败: {str(e)}")
|
||||
|
||||
|
||||
@router.post("/documents", response_model=UploadResponse)
async def upload_documents(
    background_tasks: BackgroundTasks,
    files: List[UploadFile] = File(...),
):
    """
    Upload several documents and schedule them as one batch task.

    Every upload that has a filename is saved through ``file_service`` into
    the ``batch`` subfolder; uploads without a filename are skipped. The
    saved files are then handed to ``process_documents_batch`` as a single
    background task.

    Args:
        background_tasks: FastAPI background task queue.
        files: Uploaded files.

    Returns:
        UploadResponse: Task id, number of saved files, message and the
        status URL of the task.

    Raises:
        HTTPException: 400 when no files were sent; 500 when saving or
            scheduling fails.
    """
    if not files:
        raise HTTPException(status_code=400, detail="没有上传文件")

    task_id = str(uuid.uuid4())
    stored_files = []

    try:
        # Persist a task record in MongoDB; a failure here is only logged.
        try:
            await mongodb.insert_task(
                task_id=task_id,
                task_type="batch_parse",
                status="pending",
                message=f"已提交 {len(files)} 个文档处理"
            )
        except Exception as mongo_err:
            logger.warning(f"MongoDB 保存批量任务记录失败: {mongo_err}")

        for upload in files:
            if not upload.filename:
                continue
            payload = await upload.read()
            entry = {
                "path": file_service.save_uploaded_file(payload, upload.filename, subfolder="batch"),
                "filename": upload.filename,
                "ext": upload.filename.split('.')[-1].lower()
            }
            stored_files.append(entry)

        background_tasks.add_task(process_documents_batch, task_id=task_id, files=stored_files)

        return UploadResponse(
            task_id=task_id,
            file_count=len(stored_files),
            message=f"已提交 {len(stored_files)} 个文档处理",
            status_url=f"/api/v1/tasks/{task_id}"
        )

    except Exception as e:
        logger.error(f"批量上传失败: {str(e)}")
        raise HTTPException(status_code=500, detail=f"批量上传失败: {str(e)}")
|
||||
|
||||
|
||||
# ==================== 任务处理函数 ====================
|
||||
|
||||
async def process_document(
    task_id: str,
    file_path: str,
    original_filename: str,
    doc_type: str,
    parse_options: dict
):
    """
    Parse, store and index one uploaded document as a background task.

    Steps, each reported through ``update_task_status``:
      1. Parse the file with the parser chosen by ``ParserFactory``.
      2. Store content, metadata and structured data in MongoDB.
      3. Excel (``xlsx``/``xls``): build the table and RAG index through
         ``table_rag_service.build_table_rag_index``. Failures in this step
         are logged and do not fail the task.
      4. Other types: index every extracted table through
         ``table_rag_service.index_document_table`` and then index the
         document content through ``index_document_to_rag``.

    Any other exception marks the task as ``failure``; nothing is raised.

    Args:
        task_id: Identifier of the task to update.
        file_path: Path of the saved upload.
        original_filename: Filename as uploaded by the client.
        doc_type: Lower-cased file extension.
        parse_options: Excel options; ``sheet_name`` and ``header_row`` are
            read here.
    """
    try:
        # Status: parsing.
        await update_task_status(
            task_id, status="processing",
            progress=10, message="正在解析文档"
        )

        parser = ParserFactory.get_parser(file_path)
        result = parser.parse(file_path)

        if not result.success:
            raise Exception(result.error or "解析失败")

        # Status: storing.
        await update_task_status(
            task_id, status="processing",
            progress=30, message="正在存储数据"
        )

        doc_id = await mongodb.insert_document(
            doc_type=doc_type,
            content=result.data.get("content", ""),
            metadata={
                **result.metadata,
                "original_filename": original_filename,
                "file_path": file_path
            },
            structured_data=result.data.get("structured_data")
        )

        if doc_type in ["xlsx", "xls"]:
            # Excel: MySQL table + AI field descriptions + RAG index.
            await update_task_status(
                task_id, status="processing",
                progress=50, message="正在存储到MySQL并生成字段描述"
            )

            # Best effort: an indexing failure is logged but the task still
            # completes, because the document is already stored in MongoDB.
            try:
                logger.info(f"开始存储Excel到MySQL: {original_filename}, file_path: {file_path}")
                rag_result = await table_rag_service.build_table_rag_index(
                    file_path=file_path,
                    filename=original_filename,
                    sheet_name=parse_options.get("sheet_name"),
                    header_row=parse_options.get("header_row", 0)
                )

                if rag_result.get("success"):
                    logger.info(f"Excel存储到MySQL成功: {original_filename}, table: {rag_result.get('table_name')}")
                else:
                    logger.error(f"RAG索引构建失败: {rag_result.get('error')}")
            except Exception as e:
                logger.error(f"Excel存储到MySQL异常: {str(e)}", exc_info=True)

        else:
            # Unstructured document.
            await update_task_status(
                task_id, status="processing",
                progress=60, message="正在建立索引"
            )

            # "structured_data" may be missing or explicitly None; treat both
            # as "no tables" instead of crashing on None.get(...).
            structured_data = result.data.get("structured_data") or {}
            tables = structured_data.get("tables") or []

            # Each extracted table gets its own MySQL table and RAG index.
            for table_info in tables:
                await table_rag_service.index_document_table(
                    doc_id=doc_id,
                    filename=original_filename,
                    table_data=table_info,
                    source_doc_type=doc_type
                )

            # Also index the document text itself.
            await index_document_to_rag(doc_id, original_filename, result, doc_type)

        # Done.
        await update_task_status(
            task_id, status="success",
            progress=100, message="处理完成",
            result={
                "doc_id": doc_id,
                "doc_type": doc_type,
                "filename": original_filename
            }
        )

        logger.info(f"文档处理完成: {original_filename}, doc_id: {doc_id}")

    except Exception as e:
        # Include the traceback; this runs in the background, so the log is
        # the only place the failure can be diagnosed.
        logger.error(f"文档处理失败: {str(e)}", exc_info=True)
        await update_task_status(
            task_id, status="failure",
            progress=0, message="处理失败",
            error=str(e)
        )
|
||||
|
||||
|
||||
async def process_documents_batch(task_id: str, files: List[dict]):
    """
    Parse, store and index a batch of uploaded documents as one task.

    Each file is processed independently: a parse failure or exception is
    recorded in that file's result entry and the loop continues. Progress
    is reported after every file. The task ends as ``success`` with the
    per-file results once the loop finishes, and as ``failure`` only when
    an exception escapes the loop.

    Args:
        task_id: Identifier of the task to update.
        files: One dict per saved upload with the keys ``path``,
            ``filename`` and ``ext``.
    """
    try:
        await update_task_status(
            task_id, status="processing",
            progress=0, message="开始批量处理"
        )

        results = []
        for i, file_info in enumerate(files):
            try:
                parser = ParserFactory.get_parser(file_info["path"])
                result = parser.parse(file_info["path"])

                if result.success:
                    doc_id = await mongodb.insert_document(
                        doc_type=file_info["ext"],
                        content=result.data.get("content", ""),
                        metadata={
                            **result.metadata,
                            "original_filename": file_info["filename"],
                            "file_path": file_info["path"]
                        },
                        structured_data=result.data.get("structured_data")
                    )

                    if file_info["ext"] in ["xlsx", "xls"]:
                        # Excel: build the table and RAG index.
                        await table_rag_service.build_table_rag_index(
                            file_path=file_info["path"],
                            filename=file_info["filename"]
                        )
                    else:
                        # Unstructured document: index its tables and content.
                        # "structured_data" may be missing or explicitly
                        # None; treat both as "no tables".
                        structured_data = result.data.get("structured_data") or {}
                        tables = structured_data.get("tables") or []

                        for table_info in tables:
                            await table_rag_service.index_document_table(
                                doc_id=doc_id,
                                filename=file_info["filename"],
                                table_data=table_info,
                                source_doc_type=file_info["ext"]
                            )

                        await index_document_to_rag(doc_id, file_info["filename"], result, file_info["ext"])

                    results.append({"filename": file_info["filename"], "doc_id": doc_id, "success": True})
                else:
                    results.append({"filename": file_info["filename"], "success": False, "error": result.error})

            except Exception as e:
                # One bad file must not abort the whole batch.
                results.append({"filename": file_info["filename"], "success": False, "error": str(e)})

            progress = int((i + 1) / len(files) * 100)
            await update_task_status(
                task_id, status="processing",
                progress=progress, message=f"已处理 {i+1}/{len(files)}"
            )

        await update_task_status(
            task_id, status="success",
            progress=100, message="批量处理完成",
            result={"results": results}
        )

    except Exception as e:
        logger.error(f"批量处理失败: {str(e)}")
        await update_task_status(
            task_id, status="failure",
            progress=0, message="批量处理失败",
            error=str(e)
        )
|
||||
|
||||
|
||||
async def index_document_to_rag(doc_id: str, filename: str, result: ParseResult, doc_type: str):
    """
    Index the text content of a parsed document into the RAG service.

    Only the first 5000 characters of the content are indexed, and empty
    content is skipped. Any error is logged as a warning and not raised.

    Args:
        doc_id: Identifier of the stored document.
        filename: Filename recorded in the index metadata.
        result: Parse result whose ``data["content"]`` is indexed.
        doc_type: Document type recorded in the index metadata.
    """
    try:
        text = result.data.get("content", "")
        if not text:
            return
        index_metadata = {
            "filename": filename,
            "doc_type": doc_type
        }
        rag_service.index_document_content(
            doc_id=doc_id,
            content=text[:5000],
            metadata=index_metadata
        )
    except Exception as e:
        logger.warning(f"RAG 索引失败: {str(e)}")
|
||||
|
||||
|
||||
# ==================== 文档解析接口 ====================
|
||||
|
||||
@router.post("/document/parse")
async def parse_uploaded_document(
    file_path: str = Query(..., description="文件路径")
):
    """
    Parse an already uploaded document and return the parse result.

    Args:
        file_path: Path of the file to parse.

    Returns:
        dict: ``result.to_dict()`` of the successful parse.

    Raises:
        HTTPException: 400 when the parser reports failure or raises
            ``ValueError``; 500 for any other error.
    """
    # NOTE(review): file_path comes straight from the query string and is
    # passed to the parser unchanged, so a caller can point it at any file
    # this process can read. Consider restricting it to the upload
    # directory once that location is available here.
    try:
        parser = ParserFactory.get_parser(file_path)
        result = parser.parse(file_path)

        if result.success:
            return result.to_dict()
        else:
            raise HTTPException(status_code=400, detail=result.error)

    except HTTPException:
        # Without this clause the 400 raised above is caught by the generic
        # handler below and reported as a 500.
        raise
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))
    except Exception as e:
        logger.error(f"解析文档失败: {str(e)}")
        raise HTTPException(status_code=500, detail=f"解析失败: {str(e)}")
|
||||
93
backend/app/api/endpoints/health.py
Normal file
93
backend/app/api/endpoints/health.py
Normal file
@@ -0,0 +1,93 @@
|
||||
"""
|
||||
健康检查接口
|
||||
"""
|
||||
from datetime import datetime
|
||||
from typing import Any, Dict
|
||||
|
||||
from fastapi import APIRouter
|
||||
|
||||
from app.core.database import mysql_db, mongodb, redis_db
|
||||
|
||||
router = APIRouter(tags=["健康检查"])
|
||||
|
||||
|
||||
@router.get("/health")
async def health_check() -> Dict[str, Any]:
    """
    Report the connection state of MySQL, MongoDB and Redis.

    Each backend is probed with a real round trip (``SELECT 1`` or ``ping``)
    and reported as ``"connected"``, ``"disconnected"`` (no engine/client
    configured) or ``"error"`` (the probe raised).

    Returns:
        dict: ``status`` (``"healthy"`` only when all three backends are
        connected, otherwise ``"degraded"``), ``timestamp`` (UTC, ISO
        format) and a ``services`` mapping with the per-backend state.
    """
    # This module has no module-level logger; create one locally so the
    # except branches below can log instead of raising NameError, which
    # would turn every backend failure into a 500.
    import logging
    logger = logging.getLogger(__name__)

    mysql_status = "unknown"
    mongodb_status = "unknown"
    redis_status = "unknown"

    try:
        if mysql_db.async_engine is None:
            mysql_status = "disconnected"
        else:
            # Run a real query to verify the connection.
            from sqlalchemy import text
            async with mysql_db.async_engine.connect() as conn:
                await conn.execute(text("SELECT 1"))
            mysql_status = "connected"
    except Exception as e:
        logger.warning(f"MySQL 健康检查失败: {e}")
        mysql_status = "error"

    try:
        if mongodb.client is None:
            mongodb_status = "disconnected"
        else:
            # Verify with a real ping.
            await mongodb.client.admin.command('ping')
            mongodb_status = "connected"
    except Exception as e:
        logger.warning(f"MongoDB 健康检查失败: {e}")
        mongodb_status = "error"

    try:
        if not redis_db.is_connected or redis_db.client is None:
            redis_status = "disconnected"
        else:
            # Verify with a real ping.
            await redis_db.client.ping()
            redis_status = "connected"
    except Exception as e:
        logger.warning(f"Redis 健康检查失败: {e}")
        redis_status = "error"

    return {
        "status": "healthy" if all([
            mysql_status == "connected",
            mongodb_status == "connected",
            redis_status == "connected"
        ]) else "degraded",
        "timestamp": datetime.utcnow().isoformat(),
        "services": {
            "mysql": mysql_status,
            "mongodb": mongodb_status,
            "redis": redis_status,
        }
    }
|
||||
|
||||
|
||||
@router.get("/health/ready")
async def readiness_check() -> Dict[str, str]:
    """
    Readiness probe for Kubernetes or a load balancer.

    Returns:
        dict: Always ``{"status": "ready"}``; no dependency is checked.
    """
    payload = {"status": "ready"}
    return payload
|
||||
|
||||
|
||||
@router.get("/health/live")
async def liveness_check() -> Dict[str, str]:
    """
    Liveness probe for Kubernetes or a load balancer.

    Returns:
        dict: Always ``{"status": "alive"}``.
    """
    payload = {"status": "alive"}
    return payload
|
||||
170
backend/app/api/endpoints/library.py
Normal file
170
backend/app/api/endpoints/library.py
Normal file
@@ -0,0 +1,170 @@
|
||||
"""
|
||||
文档库管理 API 接口
|
||||
|
||||
提供文档列表、详情查询和删除功能
|
||||
"""
|
||||
import logging
|
||||
from typing import Optional, List
|
||||
|
||||
from fastapi import APIRouter, HTTPException, Query
|
||||
from pydantic import BaseModel
|
||||
|
||||
from app.core.database import mongodb
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
router = APIRouter(prefix="/documents", tags=["文档库"])
|
||||
|
||||
|
||||
class DocumentItem(BaseModel):
    """One document entry with its identifying and descriptive fields."""

    # Document identifier.
    doc_id: str
    # Stored filename.
    filename: str
    # Filename as uploaded by the client.
    original_filename: str
    # Document type.
    doc_type: str
    # File size.
    file_size: int
    # Creation time as a string.
    created_at: str
    # Optional additional metadata.
    metadata: Optional[dict] = None
|
||||
|
||||
|
||||
@router.get("")
async def get_documents(
    doc_type: Optional[str] = Query(None, description="文档类型过滤"),
    limit: int = Query(20, ge=1, le=100, description="返回数量"),
    skip: int = Query(0, ge=0, description="跳过数量")
):
    """
    List stored documents, newest first.

    The ``content`` field is excluded from the query to keep the payload
    small, and the query is limited to 10 seconds on the server side.

    Args:
        doc_type: Optional document type filter.
        limit: Maximum number of documents to return (1-100).
        skip: Number of documents to skip.

    Returns:
        dict: ``success``, ``documents`` (summary dicts) and ``total`` (the
        number of documents in this page). When the query times out, an
        empty list is returned together with a ``warning`` message.

    Raises:
        HTTPException: 500 for any non-timeout error.
    """
    try:
        # Build the filter.
        query = {}
        if doc_type:
            query["doc_type"] = doc_type

        logger.info(f"开始查询文档列表, query: {query}, limit: {limit}")

        cursor = mongodb.documents.find(
            query,
            {"content": 0}  # leave out the content field to reduce transfer size
        ).sort("created_at", -1).skip(skip).limit(limit)

        # Cap the server-side execution time at 10 seconds.
        cursor.max_time_ms(10000)

        logger.info("Cursor created with 10s timeout, executing...")

        documents_raw = await cursor.to_list(length=limit)
        logger.info(f"查询到原始文档数: {len(documents_raw)}")

        documents = []
        for doc in documents_raw:
            # A stored value may be None even when the key exists, so fall
            # back explicitly instead of relying on the .get() default.
            meta = doc.get("metadata") or {}
            created_at = doc.get("created_at")
            if not created_at:
                created_text = ""
            elif hasattr(created_at, "isoformat"):
                created_text = created_at.isoformat()
            else:
                # Value is not a datetime (e.g. already a string).
                created_text = str(created_at)

            documents.append({
                "doc_id": str(doc["_id"]),
                "filename": meta.get("filename", ""),
                "original_filename": meta.get("original_filename", ""),
                "doc_type": doc.get("doc_type", ""),
                "file_size": meta.get("file_size", 0),
                "created_at": created_text,
                "metadata": {
                    "row_count": meta.get("row_count"),
                    "column_count": meta.get("column_count"),
                    "columns": (meta.get("columns") or [])[:10]
                }
            })

        logger.info(f"文档列表处理完成: {len(documents)} 个文档")

        return {
            "success": True,
            "documents": documents,
            "total": len(documents)
        }

    except Exception as e:
        err_str = str(e)
        lowered = err_str.lower()
        # Treat only genuine timeouts as "empty result". Matching the bare
        # word "time" also catches unrelated errors ("datetime", "runtime")
        # and hides them behind a success response.
        # NOTE(review): the class-name and message markers below are meant
        # to cover the MongoDB driver's timeout errors - confirm against
        # the driver version in use.
        is_timeout = (
            isinstance(e, TimeoutError)
            or "timeout" in type(e).__name__.lower()
            or any(marker in lowered for marker in ("timeout", "timed out", "time limit"))
        )
        if is_timeout:
            logger.warning(f"文档查询超时,返回空列表: {err_str}")
            return {
                "success": True,
                "documents": [],
                "total": 0,
                "warning": "查询超时,请稍后重试"
            }
        logger.error(f"获取文档列表失败: {str(e)}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"获取文档列表失败: {str(e)}")
|
||||
|
||||
|
||||
@router.get("/{doc_id}")
async def get_document(doc_id: str):
    """
    Return the full record of one document.

    Args:
        doc_id: Document identifier.

    Returns:
        dict: ``success`` and ``document`` with identifiers, type, size,
        creation time, raw content, structured data and metadata.

    Raises:
        HTTPException: 404 when the document does not exist; 500 for any
            other error.
    """
    try:
        doc = await mongodb.get_document(doc_id)
        if not doc:
            raise HTTPException(status_code=404, detail="文档不存在")

        meta = doc.get("metadata", {})
        created_at = doc.get("created_at")
        details = {
            "doc_id": str(doc["_id"]),
            "filename": meta.get("filename", ""),
            "original_filename": meta.get("original_filename", ""),
            "doc_type": doc.get("doc_type", ""),
            "file_size": meta.get("file_size", 0),
            "created_at": created_at.isoformat() if created_at else "",
            "content": doc.get("content", ""),  # raw text content
            "structured_data": doc.get("structured_data"),  # structured data, if any
            "metadata": meta
        }
        return {"success": True, "document": details}

    except HTTPException:
        # Keep the 404 from above intact.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"获取文档详情失败: {str(e)}")
|
||||
|
||||
|
||||
@router.delete("/{doc_id}")
async def delete_document(doc_id: str):
    """
    Delete a document from MongoDB.

    Args:
        doc_id: Identifier handed to ``mongodb.delete_document``.

    Returns:
        ``{"success": True, "message": ...}`` once the document is removed.

    Raises:
        HTTPException: 404 when ``mongodb.delete_document`` reports that
            nothing was deleted; 500 for any other failure.
    """
    try:
        # Remove the MongoDB record.
        was_removed = await mongodb.delete_document(doc_id)
        if not was_removed:
            raise HTTPException(status_code=404, detail="文档不存在")

        # TODO: also delete related MySQL data (for Excel documents)
        # TODO: also delete related RAG index entries

        response = {"success": True, "message": "文档已删除"}
        return response

    except HTTPException:
        raise
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"删除失败: {exc!s}")
|
||||
116
backend/app/api/endpoints/rag.py
Normal file
116
backend/app/api/endpoints/rag.py
Normal file
@@ -0,0 +1,116 @@
|
||||
"""
|
||||
RAG 检索 API 接口
|
||||
|
||||
提供向量检索功能
|
||||
"""
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import APIRouter, HTTPException, Query
|
||||
from pydantic import BaseModel
|
||||
|
||||
from app.services.rag_service import rag_service
|
||||
|
||||
# Router for the RAG endpoints; every route declared below is served under
# the "/rag" prefix.
router = APIRouter(prefix="/rag", tags=["RAG检索"])
|
||||
|
||||
|
||||
class SearchRequest(BaseModel):
    # Request body of POST /rag/search.
    query: str  # query text forwarded to rag_service.retrieve
    top_k: int = 5  # number of results requested from the retriever
|
||||
|
||||
|
||||
class SearchResult(BaseModel):
    # Shape of one retrieval hit.
    # NOTE(review): not referenced by the endpoints visible in this file;
    # presumably mirrors the items returned by rag_service.retrieve - confirm.
    content: str
    metadata: dict
    score: float
    doc_id: str
|
||||
|
||||
|
||||
@router.post("/search")
async def search_rag(request: SearchRequest):
    """
    Run a semantic search against the RAG index.

    Args:
        request: Search parameters; ``request.query`` is the query text and
            ``request.top_k`` the number of hits to return.

    Returns:
        ``{"success": True, "results": ...}`` where ``results`` is whatever
        ``rag_service.retrieve`` returned.

    Raises:
        HTTPException: 500 when retrieval raises.
    """
    try:
        hits = rag_service.retrieve(query=request.query, top_k=request.top_k)
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"检索失败: {exc!s}")

    return {"success": True, "results": hits}
|
||||
|
||||
|
||||
@router.get("/status")
async def get_rag_status():
    """
    Report the state of the RAG index.

    Returns:
        ``{"success": True, "vector_count": ..., "collections": [...]}``.
        The collection names are a fixed placeholder list.

    Raises:
        HTTPException: 500 when the vector count cannot be obtained.
    """
    try:
        vector_total = rag_service.get_vector_count()
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"获取状态失败: {exc!s}")

    return {
        "success": True,
        "vector_count": vector_total,
        "collections": ["document_fields", "document_content"],  # reserved
    }
|
||||
|
||||
|
||||
@router.post("/rebuild")
async def rebuild_rag_index():
    """
    Rebuild the RAG index from the documents stored in MongoDB.

    Clears the current index, then iterates over every record of
    ``mongodb.documents`` and re-indexes the first 5000 characters of each
    record that has non-empty content.

    Returns:
        ``{"success": True, "message": ...}`` with the number of indexed
        documents embedded in the message.

    Raises:
        HTTPException: 500 when clearing, reading or indexing fails.
    """
    from app.core.database import mongodb

    try:
        # Start from an empty index.
        rag_service.clear()

        indexed = 0
        # Walk all stored documents.
        async for record in mongodb.documents.find({}):
            text = record.get("content", "")
            if not text:
                continue

            rag_service.index_document_content(
                doc_id=str(record["_id"]),
                content=text[:5000],
                metadata={
                    "filename": record.get("metadata", {}).get("filename"),
                    "doc_type": record.get("doc_type"),
                },
            )
            indexed += 1

        return {
            "success": True,
            "message": f"已重建索引,共处理 {indexed} 个文档",
        }

    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"重建索引失败: {exc!s}")
|
||||
116
backend/app/api/endpoints/tasks.py
Normal file
116
backend/app/api/endpoints/tasks.py
Normal file
@@ -0,0 +1,116 @@
|
||||
"""
|
||||
任务管理 API 接口
|
||||
|
||||
提供异步任务状态查询和历史记录
|
||||
"""
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import APIRouter, HTTPException
|
||||
|
||||
from app.core.database import redis_db, mongodb
|
||||
|
||||
# Router for the task-management endpoints; every route declared below is
# served under the "/tasks" prefix.
router = APIRouter(prefix="/tasks", tags=["任务管理"])
|
||||
|
||||
|
||||
@router.get("/{task_id}")
async def get_task_status(task_id: str):
    """
    Look up the status of a task.

    Redis is consulted first; when it yields nothing, the MongoDB task
    record is used; when neither has the task, an "unknown" placeholder is
    returned instead of an error.

    Args:
        task_id: Task identifier.

    Returns:
        Dict with ``task_id``, ``status``, ``progress``, ``message``,
        ``result`` and ``error``.
    """
    # Preferred source: Redis.
    cached = await redis_db.get_task_status(task_id)
    if cached:
        meta = cached.get("meta", {})
        return {
            "task_id": task_id,
            "status": cached.get("status", "unknown"),
            "progress": meta.get("progress", 0),
            "message": meta.get("message"),
            "result": meta.get("result"),
            "error": meta.get("error"),
        }

    # Fallback: the MongoDB task record.
    stored = await mongodb.get_task(task_id)
    if stored:
        # MongoDB keeps no progress value, so derive it from the status.
        finished = stored.get("status") == "success"
        return {
            "task_id": stored.get("task_id"),
            "status": stored.get("status", "unknown"),
            "progress": 100 if finished else 0,
            "message": stored.get("message"),
            "result": stored.get("result"),
            "error": stored.get("error"),
        }

    # Task missing or status unavailable.
    return {
        "task_id": task_id,
        "status": "unknown",
        "progress": 0,
        "message": "无法获取任务状态(Redis和MongoDB均不可用)",
        "result": None,
        "error": None,
    }
|
||||
|
||||
|
||||
@router.get("/")
async def list_tasks(limit: int = 50, skip: int = 0):
    """
    Return the task history.

    Args:
        limit: Maximum number of tasks to return.
        skip: Number of tasks to skip.

    Returns:
        ``{"success": True, "tasks": [...], "count": n}`` on success. When
        the MongoDB query raises, an empty list is returned with
        ``"success": False`` and the error text instead of an HTTP error.
    """
    try:
        history = await mongodb.list_tasks(limit=limit, skip=skip)
        response = {
            "success": True,
            "tasks": history,
            "count": len(history),
        }
    except Exception as exc:
        # Degrade to an empty list when MongoDB is unavailable.
        response = {
            "success": False,
            "tasks": [],
            "count": 0,
            "error": str(exc),
        }
    return response
|
||||
|
||||
|
||||
@router.delete("/{task_id}")
async def delete_task(task_id: str):
    """
    Delete a task from Redis (when connected) and from MongoDB.

    Args:
        task_id: Task identifier.

    Returns:
        ``{"success": True, "deleted": ...}`` where ``deleted`` is the value
        returned by ``mongodb.delete_task``.

    Raises:
        HTTPException: 500 when either deletion raises.
    """
    try:
        # Redis entry is removed only when a live client is available.
        redis_ready = redis_db._connected and redis_db.client
        if redis_ready:
            await redis_db.client.delete(f"task:{task_id}")

        # MongoDB record.
        removed = await mongodb.delete_task(task_id)

        return {"success": True, "deleted": removed}
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"删除任务失败: {exc!s}")
|
||||
694
backend/app/api/endpoints/templates.py
Normal file
694
backend/app/api/endpoints/templates.py
Normal file
@@ -0,0 +1,694 @@
|
||||
"""
|
||||
表格模板 API 接口
|
||||
|
||||
提供模板上传、解析和填写功能
|
||||
"""
|
||||
import io
|
||||
import logging
|
||||
import uuid
|
||||
from typing import List, Optional
|
||||
|
||||
from fastapi import APIRouter, File, HTTPException, Query, UploadFile, BackgroundTasks
|
||||
from fastapi.responses import StreamingResponse
|
||||
import pandas as pd
|
||||
from pydantic import BaseModel
|
||||
|
||||
from app.services.template_fill_service import template_fill_service, TemplateField
|
||||
from app.services.file_service import file_service
|
||||
from app.core.database import mongodb
|
||||
from app.core.document_parser import ParserFactory
|
||||
|
||||
# Module-level logger shared by the template endpoints below.
logger = logging.getLogger(__name__)

# Router for the template endpoints; every route declared below is served
# under the "/templates" prefix.
router = APIRouter(prefix="/templates", tags=["表格模板"])
|
||||
|
||||
|
||||
# ==================== 辅助函数 ====================
|
||||
|
||||
async def update_task_status(
    task_id: str,
    status: str,
    progress: int = 0,
    message: str = "",
    result: dict = None,
    error: str = None
):
    """
    Record a task's status in both Redis and MongoDB.

    Each store is written independently; a failure in one is logged as a
    warning and does not prevent the other write or raise to the caller.

    Args:
        task_id: Task identifier.
        status: New status string.
        progress: Progress value stored in the Redis meta payload.
        message: Human-readable status message.
        result: Optional result payload; only added to the Redis meta when
            truthy.
        error: Optional error text; only added to the Redis meta when truthy.
    """
    from app.core.database import redis_db

    meta = {"progress": progress, "message": message}
    for key, value in (("result", result), ("error", error)):
        if value:
            meta[key] = value

    try:
        await redis_db.set_task_status(task_id, status, meta)
    except Exception as redis_err:
        logger.warning(f"Redis 任务状态更新失败: {redis_err}")

    try:
        await mongodb.update_task(
            task_id=task_id,
            status=status,
            message=message,
            result=result,
            error=error,
        )
    except Exception as mongo_err:
        logger.warning(f"MongoDB 任务状态更新失败: {mongo_err}")
|
||||
|
||||
|
||||
# ==================== 请求/响应模型 ====================
|
||||
|
||||
class TemplateFieldRequest(BaseModel):
    """Template field request."""
    # These attributes are copied one-to-one into a TemplateField by the
    # /fill endpoint.
    cell: str
    name: str
    field_type: str = "text"
    required: bool = True
    hint: str = ""
|
||||
|
||||
|
||||
class FillRequest(BaseModel):
    """Fill request."""
    template_id: str
    template_fields: List[TemplateFieldRequest]
    source_doc_ids: Optional[List[str]] = None  # list of MongoDB document IDs
    source_file_paths: Optional[List[str]] = None  # list of source document file paths
    user_hint: Optional[str] = None
    task_id: Optional[str] = None  # optional task ID, used for task-history tracking
|
||||
|
||||
|
||||
class ExportRequest(BaseModel):
    """Export request."""
    template_id: str
    filled_data: dict
    format: str = "xlsx"  # "xlsx" or "docx"
|
||||
|
||||
|
||||
# ==================== 接口实现 ====================
|
||||
|
||||
@router.post("/upload")
async def upload_template(
    file: UploadFile = File(...),
):
    """
    Upload a table template file and extract its fields.

    Accepts Excel (.xlsx, .xls) and Word (.docx) templates.

    Args:
        file: The uploaded template.

    Returns:
        Template info: the saved path as ``template_id``, the original file
        name, the file type, the extracted field list and its length.

    Raises:
        HTTPException: 400 for an empty file name or unsupported extension;
            500 when saving or field extraction fails.
    """
    if not file.filename:
        raise HTTPException(status_code=400, detail="文件名为空")

    extension = file.filename.split('.')[-1].lower()
    if extension not in ('xlsx', 'xls', 'docx'):
        raise HTTPException(
            status_code=400,
            detail=f"不支持的模板格式: {extension},仅支持 xlsx/xls/docx"
        )

    try:
        # Persist the upload.
        raw_bytes = await file.read()
        stored_path = file_service.save_uploaded_file(
            raw_bytes,
            file.filename,
            subfolder="templates"
        )

        # Extract the field definitions.
        extracted = await template_fill_service.get_template_fields_from_file(
            stored_path,
            extension
        )

        field_dicts = []
        for field in extracted:
            field_dicts.append({
                "cell": field.cell,
                "name": field.name,
                "field_type": field.field_type,
                "required": field.required,
                "hint": field.hint,
            })

        return {
            "success": True,
            "template_id": stored_path,
            "filename": file.filename,
            "file_type": extension,
            "fields": field_dicts,
            "field_count": len(extracted),
        }

    except Exception as exc:
        logger.error(f"上传模板失败: {exc!s}")
        raise HTTPException(status_code=500, detail=f"上传失败: {exc!s}")
|
||||
|
||||
|
||||
def _summarize_tables(tables) -> str:
    """Render headers and the first 3 rows of up to 5 tables as prompt text."""
    if not tables:
        return ""

    parts = ["\n【文档中的表格】:\n"]
    for idx, table in enumerate(tables[:5]):  # at most 5 tables
        if not isinstance(table, dict):
            continue
        headers = table.get("headers", [])
        rows = table.get("rows", [])
        if headers:
            parts.append(f"表格{idx+1}表头: {', '.join(str(h) for h in headers)}\n")
        if rows:
            parts.append(f"表格{idx+1}前3行: ")
            for row in rows[:3]:
                if isinstance(row, list):
                    parts.append(" | ".join(str(c) for c in row) + "; ")
                elif isinstance(row, dict):
                    parts.append(" | ".join(str(row.get(h, "")) for h in headers) + "; ")
            parts.append("\n")
    return "".join(parts)


def _top_level_or_structured(data: dict, key: str):
    """Return ``data[key]``, falling back to ``data["structured_data"][key]``."""
    value = data.get(key, [])
    structured = data.get("structured_data")
    if not value and structured:
        value = structured.get(key, [])
    return value


def _build_source_preview(filename: str, doc_type: str, data: dict) -> dict:
    """Condense parsed document data into the preview used for header generation."""
    content = (data.get("content") or "")[:5000]
    titles = _top_level_or_structured(data, "titles")
    tables = _top_level_or_structured(data, "tables")
    return {
        "filename": filename,
        "doc_type": doc_type,
        "content": content,
        "titles": titles[:10] if titles else [],
        "tables_count": len(tables) if tables else 0,
        "tables_summary": _summarize_tables(tables),
    }


@router.post("/upload-joint")
async def upload_joint_template(
    background_tasks: BackgroundTasks,
    template_file: UploadFile = File(..., description="模板文件"),
    source_files: List[UploadFile] = File(..., description="源文档文件列表"),
):
    """
    Upload a template together with its source documents in one request.

    Steps:
      1. Save the template file.
      2. Save and parse each source document, collecting a content preview.
      3. Extract the template fields, passing the previews along.
      4. Schedule background processing that stores the sources in MongoDB.

    Args:
        background_tasks: FastAPI background task registry.
        template_file: Template file (xlsx/xls/docx).
        source_files: Source documents (docx/xlsx/xls/md/txt). Entries
            without a file name are skipped.

    Returns:
        Template id (saved path), file name and type, the extracted fields,
        the saved source paths and names, and the id of the background task.

    Raises:
        HTTPException: 400 for an empty template name or an unsupported
            template/source extension; 500 for any other failure.
    """
    if not template_file.filename:
        raise HTTPException(status_code=400, detail="模板文件名为空")

    # Validate the template format.
    template_ext = template_file.filename.split('.')[-1].lower()
    if template_ext not in ['xlsx', 'xls', 'docx']:
        raise HTTPException(
            status_code=400,
            detail=f"不支持的模板格式: {template_ext},仅支持 xlsx/xls/docx"
        )

    # Validate every source format before anything is written to disk.
    valid_exts = ['docx', 'xlsx', 'xls', 'md', 'txt']
    for sf in source_files:
        if sf.filename:
            sf_ext = sf.filename.split('.')[-1].lower()
            if sf_ext not in valid_exts:
                raise HTTPException(
                    status_code=400,
                    detail=f"不支持的源文档格式: {sf_ext},仅支持 docx/xlsx/xls/md/txt"
                )

    try:
        # 1. Save the template file.
        template_content = await template_file.read()
        template_path = file_service.save_uploaded_file(
            template_content,
            template_file.filename,
            subfolder="templates"
        )

        # 2. Save and parse the source documents.
        source_file_info = []
        source_contents = []
        for sf in source_files:
            if not sf.filename:
                continue

            sf_content = await sf.read()
            sf_ext = sf.filename.split('.')[-1].lower()
            sf_path = file_service.save_uploaded_file(
                sf_content,
                sf.filename,
                subfolder=sf_ext
            )
            source_file_info.append({
                "path": sf_path,
                "filename": sf.filename,
                "ext": sf_ext
            })

            # Parsing is best-effort: a failure only costs the preview, the
            # file is still queued for background processing.
            try:
                parser = ParserFactory.get_parser(sf_path)
                parse_result = parser.parse(sf_path)
                if parse_result.success and parse_result.data:
                    preview = _build_source_preview(sf.filename, sf_ext, parse_result.data)
                    source_contents.append(preview)
                    logger.info(
                        f"[DEBUG] source_contents built: filename={sf.filename}, "
                        f"content_len={len(preview['content'])}, "
                        f"titles_count={len(preview['titles'])}, "
                        f"tables_count={preview['tables_count']}"
                    )
                    if preview["tables_summary"]:
                        logger.info(f"[DEBUG] tables_summary preview: {preview['tables_summary'][:300]}")
            except Exception as e:
                logger.warning(f"解析源文档失败 {sf.filename}: {e}")

        # 3. Extract the template fields, using the source previews.
        template_fields = await template_fill_service.get_template_fields_from_file(
            template_path,
            template_ext,
            source_contents=source_contents
        )

        # 4. Process the source documents into MongoDB in the background.
        task_id = str(uuid.uuid4())
        if source_file_info:
            # The task record is best-effort; processing is scheduled anyway.
            try:
                await mongodb.insert_task(
                    task_id=task_id,
                    task_type="source_process",
                    status="pending",
                    message=f"开始处理 {len(source_file_info)} 个源文档"
                )
            except Exception as mongo_err:
                logger.warning(f"MongoDB 保存任务记录失败: {mongo_err}")

            background_tasks.add_task(
                process_source_documents,
                task_id=task_id,
                files=source_file_info
            )

        logger.info(f"联合上传完成: 模板={template_file.filename}, 源文档={len(source_file_info)}个")

        return {
            "success": True,
            "template_id": template_path,
            "filename": template_file.filename,
            "file_type": template_ext,
            "fields": [
                {
                    "cell": f.cell,
                    "name": f.name,
                    "field_type": f.field_type,
                    "required": f.required,
                    "hint": f.hint
                }
                for f in template_fields
            ],
            "field_count": len(template_fields),
            "source_file_paths": [f["path"] for f in source_file_info],
            "source_filenames": [f["filename"] for f in source_file_info],
            "task_id": task_id
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"联合上传失败: {str(e)}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"联合上传失败: {str(e)}")
|
||||
|
||||
|
||||
async def process_source_documents(task_id: str, files: List[dict]):
    """
    Parse the given source files and store them in MongoDB.

    Progress is reported through ``update_task_status`` after every file.
    A file that fails to parse or store is logged and skipped; the task is
    still marked "success" at the end with the ids that were stored.

    Args:
        task_id: Task whose status is updated while processing.
        files: Dicts with the keys ``path``, ``filename`` and ``ext``.
    """
    try:
        await update_task_status(
            task_id, status="processing",
            progress=0, message="开始处理源文档"
        )

        stored_ids = []
        total = len(files)
        for position, entry in enumerate(files, start=1):
            try:
                parser = ParserFactory.get_parser(entry["path"])
                parsed = parser.parse(entry["path"])

                if not parsed.success:
                    logger.error(f"源文档解析失败: {entry['filename']}, error: {parsed.error}")
                else:
                    new_id = await mongodb.insert_document(
                        doc_type=entry["ext"],
                        content=parsed.data.get("content", ""),
                        metadata={
                            **parsed.metadata,
                            "original_filename": entry["filename"],
                            "file_path": entry["path"],
                        },
                        structured_data=parsed.data.get("structured_data"),
                    )
                    stored_ids.append(new_id)
                    logger.info(f"源文档处理成功: {entry['filename']}, doc_id: {new_id}")

            except Exception as item_err:
                logger.error(f"源文档处理异常: {entry['filename']}, error: {item_err!s}")

            # Report progress whether or not this file succeeded.
            percent = int(position / total * 100)
            await update_task_status(
                task_id, status="processing",
                progress=percent, message=f"已处理 {position}/{total}"
            )

        await update_task_status(
            task_id, status="success",
            progress=100, message="源文档处理完成",
            result={"doc_ids": stored_ids}
        )
        logger.info(f"所有源文档处理完成: {len(stored_ids)}个")

    except Exception as batch_err:
        logger.error(f"源文档批量处理失败: {batch_err!s}")
        await update_task_status(
            task_id, status="failure",
            progress=0, message="源文档处理失败",
            error=str(batch_err)
        )
|
||||
|
||||
|
||||
@router.post("/fields")
async def extract_template_fields(
    template_id: str = Query(..., description="模板ID/文件路径"),
    file_type: str = Query("xlsx", description="文件类型")
):
    """
    Extract the field definitions of an already-uploaded template.

    Args:
        template_id: Template id (the saved file path).
        file_type: File type of the template.

    Returns:
        ``{"success": True, "fields": [...]}`` with one dict per field.

    Raises:
        HTTPException: 500 when extraction fails.
    """
    try:
        extracted = await template_fill_service.get_template_fields_from_file(
            template_id,
            file_type
        )

        field_dicts = []
        for field in extracted:
            field_dicts.append({
                "cell": field.cell,
                "name": field.name,
                "field_type": field.field_type,
                "required": field.required,
                "hint": field.hint,
            })

        return {"success": True, "fields": field_dicts}

    except Exception as exc:
        logger.error(f"提取字段失败: {exc!s}")
        raise HTTPException(status_code=500, detail=f"提取失败: {exc!s}")
|
||||
|
||||
|
||||
@router.post("/fill")
async def fill_template(
    request: FillRequest,
):
    """
    Fill a template from the given source documents.

    A task record is created and updated along the way so the operation
    shows up in the task history.

    Args:
        request: Fill request (template id, fields, sources, optional hint
            and optional task id).

    Returns:
        The dict returned by ``template_fill_service.fill_template`` with
        ``task_id`` added.

    Raises:
        HTTPException: 500 when any step fails; the task is marked as
            "failure" first.
    """
    # Reuse the caller's task id, or mint a new one.
    task_id = request.task_id if request.task_id else str(uuid.uuid4())

    try:
        # Create the MongoDB task record (best-effort).
        try:
            await mongodb.insert_task(
                task_id=task_id,
                task_type="template_fill",
                status="processing",
                message=f"开始填表任务: {len(request.template_fields)} 个字段"
            )
        except Exception as mongo_err:
            logger.warning(f"MongoDB 创建任务记录失败: {mongo_err}")

        # Progress: started.
        await update_task_status(
            task_id, "processing",
            progress=0, message="开始处理..."
        )

        # Convert the request models into service-level fields.
        fields = []
        for spec in request.template_fields:
            fields.append(
                TemplateField(
                    cell=spec.cell,
                    name=spec.name,
                    field_type=spec.field_type,
                    required=spec.required,
                    hint=spec.hint,
                )
            )

        # Derive the file type from the template id; anything that is not
        # docx (including xls) is handled as xlsx.
        template_file_type = "xlsx"
        if request.template_id:
            suffix = request.template_id.split('.')[-1].lower()
            if suffix == "docx":
                template_file_type = "docx"

        # Progress: about to fill.
        await update_task_status(
            task_id, "processing",
            progress=10, message=f"准备填写 {len(fields)} 个字段..."
        )

        # Do the actual filling.
        fill_result = await template_fill_service.fill_template(
            template_fields=fields,
            source_doc_ids=request.source_doc_ids,
            source_file_paths=request.source_file_paths,
            user_hint=request.user_hint,
            template_id=request.template_id,
            template_file_type=template_file_type,
            task_id=task_id
        )

        # Mark the task as succeeded.
        summary = {
            "field_count": len(fields),
            "max_rows": fill_result.get("max_rows", 0),
        }
        await update_task_status(
            task_id, "success",
            progress=100, message="填表完成",
            result=summary
        )

        return {**fill_result, "task_id": task_id}

    except Exception as exc:
        # Mark the task as failed before surfacing the error.
        await update_task_status(
            task_id, "failure",
            progress=0, message="填表失败",
            error=str(exc)
        )
        logger.error(f"填写表格失败: {exc!s}")
        raise HTTPException(status_code=500, detail=f"填写失败: {exc!s}")
|
||||
|
||||
|
||||
@router.post("/export")
async def export_filled_template(
    request: ExportRequest,
):
    """
    Export filled data as a downloadable file.

    Supports Excel (.xlsx) and Word (.docx) output.

    Args:
        request: Export request (template id, filled data, format).

    Returns:
        A streaming file response.

    Raises:
        HTTPException: 400 for an unsupported format; 500 when the export
            itself fails.
    """
    exporters = {
        "xlsx": _export_to_excel,
        "docx": _export_to_word,
    }

    try:
        exporter = exporters.get(request.format)
        if exporter is None:
            raise HTTPException(
                status_code=400,
                detail=f"不支持的导出格式: {request.format},仅支持 xlsx/docx"
            )
        return await exporter(request.filled_data, request.template_id)

    except HTTPException:
        raise
    except Exception as exc:
        logger.error(f"导出失败: {exc!s}")
        raise HTTPException(status_code=500, detail=f"导出失败: {exc!s}")
|
||||
|
||||
|
||||
async def _export_to_excel(filled_data: dict, template_id: str) -> StreamingResponse:
    """
    Export filled data as an Excel workbook (multi-row aware).

    Each key of ``filled_data`` becomes a column. List values spread over
    rows (short lists are padded with ""); any other value is written to
    the first row only.

    Args:
        filled_data: Mapping of column name to value or list of values.
        template_id: Template id (not used when building the workbook).

    Returns:
        Streaming response carrying ``filled_template.xlsx``.
    """
    log = logging.getLogger(__name__)

    log.info(f"导出填表数据: {len(filled_data)} 个字段")

    # The longest list decides the row count; there is always at least one row.
    max_rows = 1
    for key, value in filled_data.items():
        if isinstance(value, list):
            max_rows = max(max_rows, len(value))
        log.info(f"  {key}: {type(value).__name__} = {str(value)[:80]}")

    log.info(f"最大行数: {max_rows}")

    def cell_at(values, row_idx):
        """Pick the value a column contributes to the given row."""
        if isinstance(values, list):
            return values[row_idx] if row_idx < len(values) else ""
        return values if row_idx == 0 else ""

    rows_data = [
        {col_name: cell_at(values, row_idx) for col_name, values in filled_data.items()}
        for row_idx in range(max_rows)
    ]
    df = pd.DataFrame(rows_data)

    # Keep the column order of the input mapping.
    if not df.empty:
        df = df[list(filled_data)]

    log.info(f"DataFrame 形状: {df.shape}")
    log.info(f"DataFrame 列: {list(df.columns)}")

    buffer = io.BytesIO()
    with pd.ExcelWriter(buffer, engine='openpyxl') as writer:
        df.to_excel(writer, index=False, sheet_name='填写结果')
    buffer.seek(0)

    download_name = "filled_template.xlsx"
    return StreamingResponse(
        io.BytesIO(buffer.getvalue()),
        media_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        headers={"Content-Disposition": f"attachment; filename={download_name}"}
    )
|
||||
|
||||
|
||||
async def _export_to_word(filled_data: dict, template_id: str) -> StreamingResponse:
    """
    Export filled data as a Word document.

    The document contains a centred heading, the template id and export
    time, and a three-column table (field name, value, status) with one row
    per entry of ``filled_data``.

    A value counts as empty only when it is ``None`` or an empty
    string/list/tuple/dict. Valid falsy values such as ``0`` or ``False``
    are written out and marked as filled.

    Args:
        filled_data: Mapping of field name to filled value.
        template_id: Template id, printed in the document header.

    Returns:
        Streaming response carrying ``filled_template.docx``.
    """
    from datetime import datetime

    from docx import Document
    from docx.enum.text import WD_ALIGN_PARAGRAPH

    def _is_blank(value) -> bool:
        """True for None and empty containers/strings, but not for 0 or False."""
        if value is None:
            return True
        return isinstance(value, (str, list, tuple, dict)) and len(value) == 0

    doc = Document()

    # Heading.
    title = doc.add_heading('填写结果', level=1)
    title.alignment = WD_ALIGN_PARAGRAPH.CENTER

    # Template id and export time.
    info_para = doc.add_paragraph()
    info_para.add_run(f"模板ID: {template_id}\n").bold = True
    info_para.add_run(f"导出时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    doc.add_paragraph()  # blank line

    # Field table.
    table = doc.add_table(rows=1, cols=3)
    table.style = 'Light Grid Accent 1'

    header_cells = table.rows[0].cells
    header_cells[0].text = '字段名'
    header_cells[1].text = '填写值'
    header_cells[2].text = '状态'

    for field_name, field_value in filled_data.items():
        blank = _is_blank(field_value)
        row_cells = table.add_row().cells
        row_cells[0].text = field_name
        row_cells[1].text = '' if blank else str(field_value)
        row_cells[2].text = '为空' if blank else '已填写'

    # Serialise into memory.
    output = io.BytesIO()
    doc.save(output)
    output.seek(0)

    filename = "filled_template.docx"

    return StreamingResponse(
        io.BytesIO(output.getvalue()),
        media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        headers={"Content-Disposition": f"attachment; filename={filename}"}
    )
|
||||
|
||||
|
||||
@router.post("/export/excel")
async def export_to_excel(
    filled_data: dict,
    template_id: str = Query(..., description="模板ID")
):
    """
    Export filled data as Excel (dedicated endpoint).

    Args:
        filled_data: Filled data, column name to value(s).
        template_id: Template id.

    Returns:
        Excel file stream produced by ``_export_to_excel``.
    """
    response = await _export_to_excel(filled_data, template_id)
    return response
|
||||
|
||||
|
||||
@router.post("/export/word")
async def export_to_word(
    filled_data: dict,
    template_id: str = Query(..., description="模板ID")
):
    """
    Export filled data as Word (dedicated endpoint).

    Args:
        filled_data: Filled data, field name to value.
        template_id: Template id.

    Returns:
        Word file stream produced by ``_export_to_word``.
    """
    response = await _export_to_word(filled_data, template_id)
    return response
|
||||
@@ -5,11 +5,14 @@ from fastapi import APIRouter, UploadFile, File, HTTPException, Query
|
||||
from fastapi.responses import StreamingResponse
|
||||
from typing import Optional
|
||||
import logging
|
||||
import os
|
||||
import pandas as pd
|
||||
import io
|
||||
|
||||
from app.services.file_service import file_service
|
||||
from app.core.document_parser import XlsxParser
|
||||
from app.services.table_rag_service import table_rag_service
|
||||
from app.core.database import mongodb
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -27,7 +30,7 @@ async def upload_excel(
|
||||
header_row: int = Query(0, description="表头所在的行索引")
|
||||
):
|
||||
"""
|
||||
上传并解析 Excel 文件
|
||||
上传并解析 Excel 文件,同时存储到 MySQL 数据库
|
||||
|
||||
Args:
|
||||
file: 上传的 Excel 文件
|
||||
@@ -77,6 +80,73 @@ async def upload_excel(
|
||||
result.metadata['saved_path'] = saved_path
|
||||
result.metadata['original_filename'] = file.filename
|
||||
|
||||
# 存储到 MySQL 数据库
|
||||
try:
|
||||
store_result = await table_rag_service.build_table_rag_index(
|
||||
file_path=saved_path,
|
||||
filename=file.filename,
|
||||
sheet_name=sheet_name if sheet_name else None,
|
||||
header_row=header_row
|
||||
)
|
||||
if store_result.get("success"):
|
||||
result.metadata['mysql_table'] = store_result.get('table_name')
|
||||
result.metadata['row_count'] = store_result.get('row_count')
|
||||
logger.info(f"Excel已存储到MySQL: {file.filename}, 表: {store_result.get('table_name')}")
|
||||
else:
|
||||
logger.warning(f"Excel存储到MySQL失败: {store_result.get('error')}")
|
||||
except Exception as e:
|
||||
logger.error(f"Excel存储到MySQL异常: {str(e)}", exc_info=True)
|
||||
|
||||
# 存储到 MongoDB(用于文档列表展示)
|
||||
try:
|
||||
content = ""
|
||||
# 构建文本内容用于展示
|
||||
if result.data:
|
||||
if isinstance(result.data, dict):
|
||||
# 单 sheet 格式: {columns, rows, ...}
|
||||
if 'columns' in result.data and 'rows' in result.data:
|
||||
content += f"Sheet: {result.metadata.get('current_sheet', 'Sheet1') if result.metadata else 'Sheet1'}\n"
|
||||
content += ", ".join(str(h) for h in result.data['columns']) + "\n"
|
||||
for row in result.data['rows'][:100]:
|
||||
if isinstance(row, dict):
|
||||
content += ", ".join(str(row.get(col, "")) for col in result.data['columns']) + "\n"
|
||||
elif isinstance(row, list):
|
||||
content += ", ".join(str(cell) for cell in row) + "\n"
|
||||
content += f"... (共 {len(result.data['rows'])} 行)\n\n"
|
||||
# 多 sheet 格式: {sheets: {sheet_name: {columns, rows}}}
|
||||
elif 'sheets' in result.data:
|
||||
for sheet_name_key, sheet_data in result.data['sheets'].items():
|
||||
if isinstance(sheet_data, dict) and 'columns' in sheet_data and 'rows' in sheet_data:
|
||||
content += f"Sheet: {sheet_name_key}\n"
|
||||
content += ", ".join(str(h) for h in sheet_data['columns']) + "\n"
|
||||
for row in sheet_data['rows'][:100]:
|
||||
if isinstance(row, dict):
|
||||
content += ", ".join(str(row.get(col, "")) for col in sheet_data['columns']) + "\n"
|
||||
elif isinstance(row, list):
|
||||
content += ", ".join(str(cell) for cell in row) + "\n"
|
||||
content += f"... (共 {len(sheet_data['rows'])} 行)\n\n"
|
||||
|
||||
doc_metadata = {
|
||||
"filename": os.path.basename(saved_path),
|
||||
"original_filename": file.filename,
|
||||
"saved_path": saved_path,
|
||||
"file_size": len(content),
|
||||
"row_count": result.metadata.get('row_count', 0) if result.metadata else 0,
|
||||
"column_count": result.metadata.get('column_count', 0) if result.metadata else 0,
|
||||
"columns": result.metadata.get('columns', []) if result.metadata else [],
|
||||
"mysql_table": result.metadata.get('mysql_table') if result.metadata else None,
|
||||
"sheet_count": result.metadata.get('sheet_count', 1) if result.metadata else 1,
|
||||
}
|
||||
await mongodb.insert_document(
|
||||
doc_type="xlsx",
|
||||
content=content,
|
||||
metadata=doc_metadata,
|
||||
structured_data=result.data if result.data else None
|
||||
)
|
||||
logger.info(f"Excel文档已存储到MongoDB: {file.filename}, content长度: {len(content)}")
|
||||
except Exception as e:
|
||||
logger.error(f"Excel存储到MongoDB异常: {str(e)}", exc_info=True)
|
||||
|
||||
return result.to_dict()
|
||||
|
||||
except HTTPException:
|
||||
@@ -184,7 +254,7 @@ async def export_excel(
|
||||
output.seek(0)
|
||||
|
||||
# 生成文件名
|
||||
original_name = file_path.split('/')[-1] if '/' in file_path else file_path
|
||||
original_name = os.path.basename(file_path)
|
||||
if columns:
|
||||
export_name = f"export_{sheet_name or 'data'}_{len(column_list) if columns else 'all'}_cols.xlsx"
|
||||
else:
|
||||
|
||||
@@ -6,26 +6,67 @@ class Settings(BaseSettings):
|
||||
APP_NAME: str = "FilesReadSystem"
|
||||
DEBUG: bool = True
|
||||
API_V1_STR: str = "/api/v1"
|
||||
|
||||
# 数据库
|
||||
MONGODB_URL: str
|
||||
MONGODB_DB_NAME: str
|
||||
REDIS_URL: str
|
||||
|
||||
# AI 相关
|
||||
LLM_API_KEY: str
|
||||
LLM_BASE_URL: str
|
||||
LLM_MODEL_NAME: str
|
||||
|
||||
# 文件路径
|
||||
|
||||
# ==================== 数据库配置 ====================
|
||||
|
||||
# MongoDB 配置 (非结构化数据存储)
|
||||
MONGODB_URL: str = "mongodb://localhost:27017"
|
||||
MONGODB_DB_NAME: str = "document_system"
|
||||
|
||||
# MySQL 配置 (结构化数据存储)
|
||||
MYSQL_HOST: str = "localhost"
|
||||
MYSQL_PORT: int = 3306
|
||||
MYSQL_USER: str = "root"
|
||||
MYSQL_PASSWORD: str = ""
|
||||
MYSQL_DATABASE: str = "document_system"
|
||||
MYSQL_CHARSET: str = "utf8mb4"
|
||||
|
||||
# Redis 配置 (缓存/任务队列)
|
||||
REDIS_URL: str = "redis://localhost:6379/0"
|
||||
|
||||
# ==================== AI 相关配置 ====================
|
||||
LLM_API_KEY: str = ""
|
||||
LLM_BASE_URL: str = "https://api.minimax.chat"
|
||||
LLM_MODEL_NAME: str = "MiniMax-Text-01"
|
||||
|
||||
# ==================== RAG/Embedding 配置 ====================
|
||||
EMBEDDING_MODEL: str = "all-MiniLM-L6-v2"
|
||||
|
||||
# ==================== Supabase 配置 ====================
|
||||
SUPABASE_URL: str = ""
|
||||
SUPABASE_ANON_KEY: str = ""
|
||||
SUPABASE_SERVICE_KEY: str = ""
|
||||
|
||||
# ==================== 文件路径配置 ====================
|
||||
BASE_DIR: Path = Path(__file__).resolve().parent.parent.parent
|
||||
UPLOAD_DIR: str = "data/uploads"
|
||||
|
||||
|
||||
# ==================== RAG/向量数据库配置 ====================
|
||||
FAISS_INDEX_DIR: str = "data/faiss"
|
||||
|
||||
# 允许 Pydantic 从 .env 文件读取
|
||||
model_config = SettingsConfigDict(
|
||||
env_file=Path(__file__).parent.parent / ".env",
|
||||
env_file=Path(__file__).parent.parent / ".env",
|
||||
env_file_encoding='utf-8',
|
||||
extra='ignore'
|
||||
)
|
||||
|
||||
@property
def mysql_url(self) -> str:
    """Build the synchronous (PyMySQL) SQLAlchemy connection URL.

    The user name and password are percent-encoded so that reserved
    characters in the credentials (``@``, ``:``, ``/`` ...) cannot be
    mistaken for URL delimiters when the URL is parsed.

    Returns:
        A ``mysql+pymysql://`` URL assembled from the ``MYSQL_*`` settings.
    """
    from urllib.parse import quote

    # safe="" also encodes "/" which quote() would otherwise leave untouched.
    user = quote(self.MYSQL_USER, safe="")
    password = quote(self.MYSQL_PASSWORD, safe="")
    return (
        f"mysql+pymysql://{user}:{password}"
        f"@{self.MYSQL_HOST}:{self.MYSQL_PORT}/{self.MYSQL_DATABASE}"
        f"?charset={self.MYSQL_CHARSET}"
    )
|
||||
|
||||
@property
def async_mysql_url(self) -> str:
    """Build the asynchronous (aiomysql) SQLAlchemy connection URL.

    The user name and password are percent-encoded so that reserved
    characters in the credentials (``@``, ``:``, ``/`` ...) cannot be
    mistaken for URL delimiters when the URL is parsed.

    Returns:
        A ``mysql+aiomysql://`` URL assembled from the ``MYSQL_*`` settings.
    """
    from urllib.parse import quote

    # safe="" also encodes "/" which quote() would otherwise leave untouched.
    user = quote(self.MYSQL_USER, safe="")
    password = quote(self.MYSQL_PASSWORD, safe="")
    return (
        f"mysql+aiomysql://{user}:{password}"
        f"@{self.MYSQL_HOST}:{self.MYSQL_PORT}/{self.MYSQL_DATABASE}"
        f"?charset={self.MYSQL_CHARSET}"
    )
|
||||
|
||||
settings = Settings()
|
||||
18
backend/app/core/database/__init__.py
Normal file
18
backend/app/core/database/__init__.py
Normal file
@@ -0,0 +1,18 @@
|
||||
"""
|
||||
数据库连接管理模块
|
||||
|
||||
提供 MySQL、MongoDB、Redis 的连接管理
|
||||
"""
|
||||
from app.core.database.mysql import MySQLDB, mysql_db, Base
|
||||
from app.core.database.mongodb import MongoDB, mongodb
|
||||
from app.core.database.redis_db import RedisDB, redis_db
|
||||
|
||||
__all__ = [
|
||||
"MySQLDB",
|
||||
"mysql_db",
|
||||
"MongoDB",
|
||||
"mongodb",
|
||||
"RedisDB",
|
||||
"redis_db",
|
||||
"Base",
|
||||
]
|
||||
375
backend/app/core/database/mongodb.py
Normal file
375
backend/app/core/database/mongodb.py
Normal file
@@ -0,0 +1,375 @@
|
||||
"""
|
||||
MongoDB 数据库连接管理模块
|
||||
|
||||
提供非结构化数据的存储和查询功能
|
||||
"""
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorDatabase
|
||||
|
||||
from app.config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MongoDB:
    """Async MongoDB access layer built on Motor.

    Holds one client/database pair and exposes helpers for the
    ``documents``, ``embeddings``, ``rag_index`` and ``tasks`` collections.
    ``connect()`` must be awaited first: ``self.db`` is ``None`` until then,
    so the collection properties cannot be used before that.
    """

    def __init__(self):
        # Both handles stay None until connect() succeeds.
        self.client: Optional[AsyncIOMotorClient] = None
        self.db: Optional[AsyncIOMotorDatabase] = None

    async def connect(self):
        """Open the MongoDB connection and verify it with a ``ping``.

        Raises:
            Exception: Any driver error is logged and re-raised unchanged.
        """
        try:
            self.client = AsyncIOMotorClient(
                settings.MONGODB_URL,
                serverSelectionTimeoutMS=30000,  # 30 s, to tolerate remote servers
                connectTimeoutMS=30000,  # connection timeout
                socketTimeoutMS=60000,  # socket timeout
            )
            self.db = self.client[settings.MONGODB_DB_NAME]
            # Verify the connection.
            await self.client.admin.command('ping')
            logger.info(f"MongoDB 连接成功: {settings.MONGODB_DB_NAME}")
        except Exception as e:
            logger.error(f"MongoDB 连接失败: {e}")
            raise

    async def close(self):
        """Close the client if one was created."""
        if self.client:
            self.client.close()
            logger.info("MongoDB 连接已关闭")

    @property
    def documents(self):
        """``documents`` collection: raw documents and their parse results."""
        return self.db["documents"]

    @property
    def embeddings(self):
        """``embeddings`` collection: text embedding vectors."""
        return self.db["embeddings"]

    @property
    def rag_index(self):
        """``rag_index`` collection: semantic index of table fields."""
        return self.db["rag_index"]

    @property
    def tasks(self):
        """``tasks`` collection: task history records."""
        return self.db["tasks"]

    # ==================== Document operations ====================

    async def insert_document(
        self,
        doc_type: str,
        content: str,
        metadata: Dict[str, Any],
        structured_data: Optional[Dict[str, Any]] = None,
    ) -> str:
        """Insert one document.

        Args:
            doc_type: Document type (docx/xlsx/md/txt).
            content: Raw text content.
            metadata: Metadata dict; ``original_filename`` is read for logging.
            structured_data: Structured data (tables etc.), if any.

        Returns:
            The inserted document's id as a string.
        """
        # One clock read so created_at and updated_at are identical on insert.
        now = datetime.utcnow()
        document = {
            "doc_type": doc_type,
            "content": content,
            "metadata": metadata,
            "structured_data": structured_data,
            "created_at": now,
            "updated_at": now,
        }
        result = await self.documents.insert_one(document)
        doc_id = str(result.inserted_id)
        filename = metadata.get("original_filename", "unknown")
        logger.info(f"✓ 文档已存入MongoDB: [{doc_type}] {filename} | ID: {doc_id}")
        return doc_id

    async def get_document(self, doc_id: str) -> Optional[Dict[str, Any]]:
        """Fetch a document by id.

        Args:
            doc_id: String form of the document's ``_id``.

        Returns:
            The document with ``_id`` converted to ``str``, or ``None`` when
            no document matches.

        NOTE(review): a malformed ``doc_id`` is not handled here; whatever
        ``ObjectId(doc_id)`` raises propagates to the caller.
        """
        from bson import ObjectId
        doc = await self.documents.find_one({"_id": ObjectId(doc_id)})
        if doc:
            doc["_id"] = str(doc["_id"])
        return doc

    async def search_documents(
        self,
        query: str,
        doc_type: Optional[str] = None,
        limit: int = 10,
    ) -> List[Dict[str, Any]]:
        """Search documents whose content contains a keyword.

        Args:
            query: Search keyword, matched literally (case-sensitive).
            doc_type: Optional document-type filter.
            limit: Maximum number of documents to return.

        Returns:
            Matching documents with ``_id`` converted to ``str``.
        """
        import re

        # The keyword is user input: escape it so regex metacharacters are
        # matched literally instead of being interpreted by the server.
        filter_query = {"content": {"$regex": re.escape(query)}}
        if doc_type:
            filter_query["doc_type"] = doc_type

        cursor = self.documents.find(filter_query).limit(limit)
        documents = []
        async for doc in cursor:
            doc["_id"] = str(doc["_id"])
            documents.append(doc)
        return documents

    async def delete_document(self, doc_id: str) -> bool:
        """Delete a document by id.

        Returns:
            ``True`` when a document was deleted.

        NOTE(review): a malformed ``doc_id`` is not handled here; whatever
        ``ObjectId(doc_id)`` raises propagates to the caller.
        """
        from bson import ObjectId
        result = await self.documents.delete_one({"_id": ObjectId(doc_id)})
        return result.deleted_count > 0

    # ==================== RAG index operations ====================

    async def insert_rag_entry(
        self,
        table_name: str,
        field_name: str,
        field_description: str,
        embedding: List[float],
        metadata: Optional[Dict[str, Any]] = None,
    ) -> str:
        """Insert one RAG index entry.

        Args:
            table_name: Table name.
            field_name: Field name.
            field_description: Field description.
            embedding: Embedding vector.
            metadata: Extra metadata; stored as ``{}`` when omitted.

        Returns:
            The inserted entry's id as a string.
        """
        entry = {
            "table_name": table_name,
            "field_name": field_name,
            "field_description": field_description,
            "embedding": embedding,
            "metadata": metadata or {},
            "created_at": datetime.utcnow(),
        }
        result = await self.rag_index.insert_one(entry)
        return str(result.inserted_id)

    async def search_rag(
        self,
        query_embedding: List[float],
        top_k: int = 5,
        table_name: Optional[str] = None,
    ) -> List[Dict[str, Any]]:
        """Search the RAG index by vector distance.

        Args:
            query_embedding: Query vector.
            top_k: Number of entries to return.
            table_name: Optional table-name filter.

        Returns:
            The ``top_k`` closest entries, each carrying a ``distance`` field
            and ``_id`` converted to ``str``.
        """
        # MongoDB 5.0+ supports vector search;
        # lower versions fall back to a Euclidean-style distance.
        # "distance" is the sum of squared element differences (no square
        # root is taken), which orders results the same way.
        pipeline = [
            {
                "$addFields": {
                    "distance": {
                        "$reduce": {
                            "input": {"$range": [0, {"$size": "$embedding"}]},
                            "initialValue": 0,
                            "in": {
                                "$add": [
                                    "$$value",
                                    {
                                        "$pow": [
                                            {
                                                "$subtract": [
                                                    {"$arrayElemAt": ["$embedding", "$$this"]},
                                                    {"$arrayElemAt": [query_embedding, "$$this"]},
                                                ]
                                            },
                                            2,
                                        ]
                                    },
                                ]
                            },
                        }
                    }
                }
            },
            {"$sort": {"distance": 1}},
            {"$limit": top_k},
        ]

        if table_name:
            # Filter first so the distance is only computed for that table.
            pipeline.insert(0, {"$match": {"table_name": table_name}})

        results = []
        async for doc in self.rag_index.aggregate(pipeline):
            doc["_id"] = str(doc["_id"])
            results.append(doc)
        return results

    # ==================== Collection management ====================

    async def create_indexes(self):
        """Create the indexes used to speed up queries."""
        # documents collection
        await self.documents.create_index("doc_type")
        await self.documents.create_index("created_at")
        await self.documents.create_index([("content", "text")])

        # rag_index collection
        await self.rag_index.create_index("table_name")
        await self.rag_index.create_index("field_name")

        # tasks collection
        await self.tasks.create_index("task_id", unique=True)
        await self.tasks.create_index("created_at")

        logger.info("MongoDB 索引创建完成")

    # ==================== Task history operations ====================

    async def insert_task(
        self,
        task_id: str,
        task_type: str,
        status: str = "pending",
        message: str = "",
        result: Optional[Dict[str, Any]] = None,
        error: Optional[str] = None,
    ) -> str:
        """Insert one task record.

        Args:
            task_id: Task id.
            task_type: Task type.
            status: Task status.
            message: Task message.
            result: Task result.
            error: Error message.

        Returns:
            The inserted record's id as a string.
        """
        # One clock read so created_at and updated_at are identical on insert.
        now = datetime.utcnow()
        task = {
            "task_id": task_id,
            "task_type": task_type,
            "status": status,
            "message": message,
            "result": result,
            "error": error,
            "created_at": now,
            "updated_at": now,
        }
        result_obj = await self.tasks.insert_one(task)
        return str(result_obj.inserted_id)

    async def update_task(
        self,
        task_id: str,
        status: Optional[str] = None,
        message: Optional[str] = None,
        result: Optional[Dict[str, Any]] = None,
        error: Optional[str] = None,
    ) -> bool:
        """Update a task record; arguments left as ``None`` are not touched.

        Args:
            task_id: Task id.
            status: Task status.
            message: Task message.
            result: Task result.
            error: Error message.

        Returns:
            ``True`` when a record was modified.
        """
        # updated_at is always refreshed, even when nothing else is passed.
        update_data = {"updated_at": datetime.utcnow()}
        if status is not None:
            update_data["status"] = status
        if message is not None:
            update_data["message"] = message
        if result is not None:
            update_data["result"] = result
        if error is not None:
            update_data["error"] = error

        update_result = await self.tasks.update_one(
            {"task_id": task_id},
            {"$set": update_data}
        )
        return update_result.modified_count > 0

    async def get_task(self, task_id: str) -> Optional[Dict[str, Any]]:
        """Fetch a task by ``task_id``; ``_id`` is converted to ``str``."""
        task = await self.tasks.find_one({"task_id": task_id})
        if task:
            task["_id"] = str(task["_id"])
        return task

    async def list_tasks(
        self,
        limit: int = 50,
        skip: int = 0,
    ) -> List[Dict[str, Any]]:
        """List tasks, newest first.

        Args:
            limit: Number of tasks to return.
            skip: Number of tasks to skip.

        Returns:
            Task records with ``_id`` as ``str`` and the timestamps as
            ISO-8601 strings.
        """
        cursor = self.tasks.find().sort("created_at", -1).skip(skip).limit(limit)
        tasks = []
        async for task in cursor:
            task["_id"] = str(task["_id"])
            # Convert datetimes to strings.
            if task.get("created_at"):
                task["created_at"] = task["created_at"].isoformat()
            if task.get("updated_at"):
                task["updated_at"] = task["updated_at"].isoformat()
            tasks.append(task)
        return tasks

    async def delete_task(self, task_id: str) -> bool:
        """Delete a task by ``task_id``; returns ``True`` when one was deleted."""
        result = await self.tasks.delete_one({"task_id": task_id})
        return result.deleted_count > 0
|
||||
|
||||
|
||||
# ==================== 全局单例 ====================
|
||||
|
||||
mongodb = MongoDB()
|
||||
214
backend/app/core/database/mysql.py
Normal file
214
backend/app/core/database/mysql.py
Normal file
@@ -0,0 +1,214 @@
|
||||
"""
|
||||
MySQL 数据库连接管理模块
|
||||
|
||||
提供结构化数据的存储和查询功能
|
||||
"""
|
||||
import logging
|
||||
from contextlib import asynccontextmanager
|
||||
from typing import Any, AsyncGenerator, Dict, List, Optional
|
||||
|
||||
from sqlalchemy import (
|
||||
Column,
|
||||
DateTime,
|
||||
Enum as SQLEnum,
|
||||
Float,
|
||||
Integer,
|
||||
String,
|
||||
Text,
|
||||
create_engine,
|
||||
text,
|
||||
)
|
||||
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
|
||||
from sqlalchemy.orm import DeclarativeBase, sessionmaker
|
||||
from sqlalchemy.sql import select
|
||||
|
||||
from app.config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Base(DeclarativeBase):
    """Declarative base class shared by all SQLAlchemy models in this module."""
|
||||
|
||||
|
||||
class MySQLDB:
    """MySQL access layer holding an async and a sync SQLAlchemy engine."""

    def __init__(self):
        # Async engine (for FastAPI async operations).
        self.async_engine = create_async_engine(
            settings.async_mysql_url,
            echo=settings.DEBUG,  # SQL logging
            pool_pre_ping=True,  # check the connection before use
            pool_size=10,
            max_overflow=20,
        )

        # Async session factory.
        self.async_session_factory = async_sessionmaker(
            bind=self.async_engine,
            class_=AsyncSession,
            expire_on_commit=False,
            autocommit=False,
            autoflush=False,
        )

        # Sync engine (for Celery synchronous tasks).
        self.sync_engine = create_engine(
            settings.mysql_url,
            echo=settings.DEBUG,
            pool_pre_ping=True,
            pool_size=5,
            max_overflow=10,
        )

        # Sync session factory.
        self.sync_session_factory = sessionmaker(
            bind=self.sync_engine,
            autocommit=False,
            autoflush=False,
        )

    async def init_db(self):
        """Create the database if it is missing, then create all mapped tables.

        Raises:
            Exception: Any failure is logged and re-raised unchanged.
        """
        from urllib.parse import quote

        try:
            # Create the database first (if it does not exist).
            db_name = settings.MYSQL_DATABASE
            # Connect without selecting a database so CREATE DATABASE can run.
            # Credentials are percent-encoded so characters such as "@" or "/"
            # in the password cannot break URL parsing.
            user = quote(settings.MYSQL_USER, safe="")
            password = quote(settings.MYSQL_PASSWORD, safe="")
            temp_url = (
                f"mysql+aiomysql://{user}:{password}"
                f"@{settings.MYSQL_HOST}:{settings.MYSQL_PORT}/"
                f"?charset={settings.MYSQL_CHARSET}"
            )
            temp_engine = create_async_engine(temp_url, echo=False)
            try:
                async with temp_engine.connect() as conn:
                    await conn.execute(text(f"CREATE DATABASE IF NOT EXISTS `{db_name}` CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci"))
                    await conn.commit()
                logger.info(f"MySQL 数据库 {db_name} 创建或已存在")
            finally:
                # The bootstrap engine is only needed for CREATE DATABASE.
                await temp_engine.dispose()

            # Then create the tables.
            async with self.async_engine.begin() as conn:
                await conn.run_sync(Base.metadata.create_all)
            logger.info("MySQL 数据库表初始化完成")
        except Exception as e:
            logger.error(f"MySQL 数据库初始化失败: {e}")
            raise

    async def close(self):
        """Dispose both engines."""
        await self.async_engine.dispose()
        self.sync_engine.dispose()
        logger.info("MySQL 数据库连接已关闭")

    @asynccontextmanager
    async def get_session(self) -> AsyncGenerator[AsyncSession, None]:
        """Yield an async session; commit on success, roll back on error.

        The session is always closed when the ``async with`` block exits.
        """
        session = self.async_session_factory()
        try:
            yield session
            await session.commit()
        except Exception:
            await session.rollback()
            raise
        finally:
            await session.close()

    async def execute_query(
        self,
        query: str,
        params: Optional[Dict[str, Any]] = None
    ) -> List[Dict[str, Any]]:
        """Run a raw SQL query and return its rows.

        Args:
            query: Complete SQL statement; may contain ``:name`` bind parameters.
            params: Values for the bind parameters.

        Returns:
            One dict per row, keyed by column name.
        """
        async with self.get_session() as session:
            # Execute the statement as given. Wrapping it in select() would
            # render "SELECT <statement>", which is not valid SQL.
            result = await session.execute(text(query), params or {})
            rows = result.fetchall()
            return [dict(row._mapping) for row in rows]

    async def execute_raw_sql(
        self,
        sql: str,
        params: Optional[Dict[str, Any]] = None
    ) -> Any:
        """Run a raw SQL statement (INSERT/UPDATE/DELETE).

        Args:
            sql: SQL statement; may contain ``:name`` bind parameters.
            params: Values for the bind parameters.

        Returns:
            ``result.lastrowid`` when it is truthy, otherwise ``result.rowcount``.
        """
        async with self.get_session() as session:
            result = await session.execute(text(sql), params or {})
            await session.commit()
            return result.lastrowid if result.lastrowid else result.rowcount
|
||||
|
||||
|
||||
# ==================== 预定义的数据模型 ====================
|
||||
|
||||
class DocumentTable(Base):
    """Document metadata table: basic information about each parsed document."""
    __tablename__ = "document_tables"

    # Surrogate primary key.
    id = Column(Integer, primary_key=True, autoincrement=True)
    # Table name; unique and required.
    table_name = Column(String(255), unique=True, nullable=False, comment="表名")
    # Display name.
    display_name = Column(String(255), comment="显示名称")
    # Table description.
    description = Column(Text, comment="表描述")
    # Source file.
    source_file = Column(String(512), comment="来源文件")
    # Column and row counts; both default to 0.
    column_count = Column(Integer, default=0, comment="列数")
    row_count = Column(Integer, default=0, comment="行数")
    # File size in bytes.
    file_size = Column(Integer, comment="文件大小(字节)")
    # Timestamps; no default is declared here, so writers must supply them.
    created_at = Column(DateTime, comment="创建时间")
    updated_at = Column(DateTime, comment="更新时间")
|
||||
|
||||
|
||||
class DocumentField(Base):
    """Document field table: per-field information for each table."""
    __tablename__ = "document_fields"

    # Surrogate primary key.
    id = Column(Integer, primary_key=True, autoincrement=True)
    # Id of the owning table; no ForeignKey constraint is declared here.
    table_id = Column(Integer, nullable=False, comment="所属表ID")
    # Field name; required.
    field_name = Column(String(255), nullable=False, comment="字段名")
    # Field type.
    field_type = Column(String(50), comment="字段类型")
    # Field description / semantics.
    field_description = Column(Text, comment="字段描述/语义")
    # Integer flags: is-key defaults to 0, is-nullable defaults to 1.
    is_key_field = Column(Integer, default=0, comment="是否主键")
    is_nullable = Column(Integer, default=1, comment="是否可空")
    # Sample values, comma separated.
    sample_values = Column(Text, comment="示例值(逗号分隔)")
    # Creation time; no default is declared here.
    created_at = Column(DateTime, comment="创建时间")
|
||||
|
||||
|
||||
class TaskRecord(Base):
    """Task record table: information about asynchronous tasks."""
    __tablename__ = "task_records"

    # Surrogate primary key.
    id = Column(Integer, primary_key=True, autoincrement=True)
    # Celery task id; unique and required.
    task_id = Column(String(255), unique=True, nullable=False, comment="Celery任务ID")
    # Task type.
    task_type = Column(String(50), comment="任务类型")
    # Task status; defaults to "pending".
    status = Column(String(50), default="pending", comment="任务状态")
    # Input parameters and result data, stored as JSON text.
    input_params = Column(Text, comment="输入参数JSON")
    result_data = Column(Text, comment="结果数据JSON")
    # Error message.
    error_message = Column(Text, comment="错误信息")
    # Lifecycle timestamps; no defaults are declared here.
    started_at = Column(DateTime, comment="开始时间")
    completed_at = Column(DateTime, comment="完成时间")
    created_at = Column(DateTime, comment="创建时间")
|
||||
|
||||
|
||||
# ==================== 全局单例 ====================
|
||||
|
||||
mysql_db = MySQLDB()
|
||||
308
backend/app/core/database/redis_db.py
Normal file
308
backend/app/core/database/redis_db.py
Normal file
@@ -0,0 +1,308 @@
|
||||
"""
|
||||
Redis 数据库连接管理模块
|
||||
|
||||
提供缓存和任务队列功能
|
||||
"""
|
||||
import json
|
||||
import logging
|
||||
from datetime import timedelta
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
import redis.asyncio as redis
|
||||
|
||||
from app.config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class RedisDB:
    """Async Redis helper for caching, task-status tracking, locks and counters."""

    def __init__(self):
        # client stays None and _connected False until connect() succeeds.
        self.client: Optional[redis.Redis] = None
        self._connected = False

    @staticmethod
    def _redact_url(url: str) -> str:
        """Return *url* with any ``user:password@`` part removed, for logging."""
        from urllib.parse import urlsplit

        parts = urlsplit(url)
        host_only = parts.netloc.rpartition("@")[2]
        return parts._replace(netloc=host_only).geturl()

    async def connect(self):
        """Open the Redis connection and verify it with ``PING``.

        Raises:
            Exception: Any client error is logged and re-raised unchanged.
        """
        try:
            self.client = redis.from_url(
                settings.REDIS_URL,
                encoding="utf-8",
                decode_responses=True,
            )
            # Verify the connection.
            await self.client.ping()
            self._connected = True
            # Log the target without credentials: REDIS_URL may embed a password.
            logger.info(f"Redis 连接成功: {self._redact_url(settings.REDIS_URL)}")
        except Exception as e:
            logger.error(f"Redis 连接失败: {e}")
            raise

    async def close(self):
        """Close the client if one was created and mark it disconnected."""
        if self.client:
            await self.client.close()
            self._connected = False
            logger.info("Redis 连接已关闭")

    @property
    def is_connected(self) -> bool:
        """Whether ``connect()`` succeeded and ``close()`` has not run since."""
        return self._connected

    # ==================== Basic operations ====================

    async def get(self, key: str) -> Optional[str]:
        """Get the value stored at *key*."""
        return await self.client.get(key)

    async def set(
        self,
        key: str,
        value: str,
        expire: Optional[int] = None,
    ) -> bool:
        """Set *key* to *value*.

        Args:
            key: Key.
            value: Value.
            expire: Expiry in seconds; ``None`` sets no expiry.

        Returns:
            The client's result for the SET call.
        """
        return await self.client.set(key, value, ex=expire)

    async def delete(self, key: str) -> int:
        """Delete *key*; returns the client's deleted-key count."""
        return await self.client.delete(key)

    async def exists(self, key: str) -> bool:
        """Return ``True`` when *key* exists."""
        return await self.client.exists(key) > 0

    # ==================== JSON operations ====================

    async def set_json(
        self,
        key: str,
        data: Dict[str, Any],
        expire: Optional[int] = None,
    ) -> bool:
        """Store *data* as a JSON string.

        Values the JSON encoder cannot handle are converted with ``str``.

        Args:
            key: Key.
            data: Data dict.
            expire: Expiry in seconds.

        Returns:
            The result of the underlying ``set`` call.
        """
        json_str = json.dumps(data, ensure_ascii=False, default=str)
        return await self.set(key, json_str, expire)

    async def get_json(self, key: str) -> Optional[Dict[str, Any]]:
        """Load JSON data stored at *key*.

        Args:
            key: Key.

        Returns:
            The decoded data, or ``None`` when the key is missing, empty or
            does not hold valid JSON.
        """
        value = await self.get(key)
        if value:
            try:
                return json.loads(value)
            except json.JSONDecodeError:
                return None
        return None

    # ==================== Task status management ====================

    async def set_task_status(
        self,
        task_id: str,
        status: str,
        meta: Optional[Dict[str, Any]] = None,
        expire: int = 86400,  # 24 hours by default
    ) -> bool:
        """Store a task's status under ``task:<task_id>``.

        Best effort: when Redis is not connected or the write fails, a
        warning is logged and ``False`` is returned instead of raising.

        Args:
            task_id: Task id.
            status: Status (pending/processing/success/failure).
            meta: Extra information; stored as ``{}`` when omitted.
            expire: Expiry in seconds.

        Returns:
            Whether the status was stored.
        """
        if not self._connected or not self.client:
            logger.warning(f"Redis未连接,跳过任务状态更新: {task_id}")
            return False
        try:
            key = f"task:{task_id}"
            data = {
                "status": status,
                "meta": meta or {},
            }
            return await self.set_json(key, data, expire)
        except Exception as e:
            logger.warning(f"设置任务状态失败: {task_id}, error: {e}")
            return False

    async def get_task_status(self, task_id: str) -> Optional[Dict[str, Any]]:
        """Load a task's status from ``task:<task_id>``.

        Best effort: returns ``None`` (with a warning) when Redis is not
        connected or the read fails.

        Args:
            task_id: Task id.

        Returns:
            The stored status payload, or ``None``.
        """
        if not self._connected or not self.client:
            logger.warning(f"Redis未连接,无法获取任务状态: {task_id}")
            return None
        try:
            key = f"task:{task_id}"
            return await self.get_json(key)
        except Exception as e:
            logger.warning(f"获取任务状态失败: {task_id}, error: {e}")
            return None

    async def update_task_progress(
        self,
        task_id: str,
        progress: int,
        message: Optional[str] = None,
        expire: int = 86400,
    ) -> bool:
        """Update the progress stored in a task's ``meta``.

        Best effort: returns ``False`` when Redis is not connected, the task
        has no stored status, or the write fails.

        Args:
            task_id: Task id.
            progress: Progress value (0-100).
            message: Progress message; only written when non-empty.
            expire: Expiry in seconds applied when the payload is re-written.

        Returns:
            Whether the progress was stored.
        """
        if not self._connected or not self.client:
            logger.warning(f"Redis未连接,跳过任务进度更新: {task_id}")
            return False
        try:
            data = await self.get_task_status(task_id)
            if data:
                # Tolerate payloads written without a "meta" entry.
                meta = data.setdefault("meta", {})
                meta["progress"] = progress
                if message:
                    meta["message"] = message
                key = f"task:{task_id}"
                return await self.set_json(key, data, expire=expire)
            return False
        except Exception as e:
            logger.warning(f"更新任务进度失败: {task_id}, error: {e}")
            return False

    # ==================== Cache operations ====================

    async def cache_document(
        self,
        doc_id: str,
        data: Dict[str, Any],
        expire: int = 3600,  # 1 hour by default
    ) -> bool:
        """Cache document data under ``doc:<doc_id>``.

        Args:
            doc_id: Document id.
            data: Document data.
            expire: Expiry in seconds.

        Returns:
            The result of the underlying ``set_json`` call.
        """
        key = f"doc:{doc_id}"
        return await self.set_json(key, data, expire)

    async def get_cached_document(self, doc_id: str) -> Optional[Dict[str, Any]]:
        """Load cached document data from ``doc:<doc_id>``.

        Args:
            doc_id: Document id.

        Returns:
            The cached data, or ``None`` when absent.
        """
        key = f"doc:{doc_id}"
        return await self.get_json(key)

    # ==================== Distributed lock ====================

    async def acquire_lock(
        self,
        lock_name: str,
        expire: int = 30,
    ) -> bool:
        """Try to take the lock ``lock:<lock_name>``.

        Args:
            lock_name: Lock name.
            expire: Expiry in seconds.

        Returns:
            Whether the lock was acquired.
        """
        key = f"lock:{lock_name}"
        # SET NX EX is a single atomic operation.
        result = await self.client.set(key, "1", nx=True, ex=expire)
        return result is not None

    async def release_lock(self, lock_name: str) -> bool:
        """Release the lock ``lock:<lock_name>``.

        NOTE(review): the lock value is a constant, so this deletes the key
        regardless of which caller acquired it.

        Args:
            lock_name: Lock name.

        Returns:
            Whether a lock key was deleted.
        """
        key = f"lock:{lock_name}"
        result = await self.client.delete(key)
        return result > 0

    # ==================== Counters ====================

    async def incr(self, key: str, amount: int = 1) -> int:
        """Increment the counter at *key* by *amount*."""
        return await self.client.incrby(key, amount)

    async def decr(self, key: str, amount: int = 1) -> int:
        """Decrement the counter at *key* by *amount*."""
        return await self.client.decrby(key, amount)

    # ==================== Expiry management ====================

    async def expire(self, key: str, seconds: int) -> bool:
        """Set the expiry of *key* in seconds."""
        return await self.client.expire(key, seconds)

    async def ttl(self, key: str) -> int:
        """Return the remaining time to live of *key*."""
        return await self.client.ttl(key)
|
||||
|
||||
|
||||
# ==================== 全局单例 ====================
|
||||
|
||||
redis_db = RedisDB()
|
||||
@@ -1,7 +1,65 @@
|
||||
"""
|
||||
文档解析模块 - 支持多种文件格式的解析
|
||||
"""
|
||||
from .base import BaseParser
|
||||
from .xlsx_parser import XlsxParser
|
||||
from pathlib import Path
|
||||
from typing import Dict
|
||||
|
||||
__all__ = ['BaseParser', 'XlsxParser']
|
||||
from .base import BaseParser, ParseResult
|
||||
from .xlsx_parser import XlsxParser
|
||||
from .docx_parser import DocxParser
|
||||
from .md_parser import MarkdownParser
|
||||
from .txt_parser import TxtParser
|
||||
|
||||
|
||||
class ParserFactory:
    """Parser factory: returns the parser registered for a file's extension."""

    # Extension (lower-case, with leading dot) -> parser instance.
    # Instances are created once here and shared by every caller.
    _parsers: Dict[str, BaseParser] = {
        # Excel
        '.xlsx': XlsxParser(),
        '.xls': XlsxParser(),
        # Word
        '.docx': DocxParser(),
        # Markdown
        '.md': MarkdownParser(),
        '.markdown': MarkdownParser(),
        # Plain text
        '.txt': TxtParser(),
    }

    @classmethod
    def get_parser(cls, file_path: str) -> BaseParser:
        """Return the parser registered for the extension of *file_path*.

        Raises:
            ValueError: No parser is registered for that extension.
        """
        ext = Path(file_path).suffix.lower()
        parser = cls._parsers.get(ext)
        if parser is None:
            supported = list(cls._parsers.keys())
            raise ValueError(f"不支持的文件格式: {ext},支持的格式: {supported}")
        return parser

    @classmethod
    def parse(cls, file_path: str, **kwargs) -> ParseResult:
        """Parse *file_path* with the matching parser, forwarding ``kwargs``."""
        parser = cls.get_parser(file_path)
        return parser.parse(file_path, **kwargs)

    @classmethod
    def register_parser(cls, ext: str, parser: BaseParser):
        """Register *parser* for the extension *ext*.

        The extension is lower-cased, and a missing leading dot is added so
        that ``"csv"`` and ``".csv"`` land on the key ``get_parser`` looks up.
        """
        ext = ext.lower()
        # Path.suffix always starts with ".", so keys must as well.
        if ext and not ext.startswith('.'):
            ext = f".{ext}"
        cls._parsers[ext] = parser

    @classmethod
    def get_supported_extensions(cls) -> list:
        """Return all registered extensions."""
        return list(cls._parsers.keys())
|
||||
|
||||
|
||||
__all__ = [
|
||||
'BaseParser',
|
||||
'ParseResult',
|
||||
'ParserFactory',
|
||||
'XlsxParser',
|
||||
'DocxParser',
|
||||
'MarkdownParser',
|
||||
'TxtParser',
|
||||
]
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
293
backend/app/core/document_parser/docx_parser.py
Normal file
293
backend/app/core/document_parser/docx_parser.py
Normal file
@@ -0,0 +1,293 @@
|
||||
"""
|
||||
Word 文档 (.docx) 解析器
|
||||
"""
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from docx import Document
|
||||
|
||||
from .base import BaseParser, ParseResult
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DocxParser(BaseParser):
|
||||
"""Word 文档解析器"""
|
||||
|
||||
def __init__(self):
    """Initialise the base parser, then declare ``.docx`` as the handled type."""
    super().__init__()
    self.supported_extensions = [".docx"]
    self.parser_name = "docx_parser"
|
||||
|
||||
def parse(
    self,
    file_path: str,
    **kwargs
) -> ParseResult:
    """Parse a Word document into text, paragraphs and tables.

    Args:
        file_path: Path of the ``.docx`` file.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        ParseResult: On success, ``data`` holds the joined paragraph text,
        the paragraph list and the table rows, and ``metadata`` holds file
        and count information. On failure, ``success`` is ``False`` and
        ``error`` describes the problem.

    NOTE(review): ``word_count`` is ``len()`` of the joined text, i.e. a
    character count including newlines.
    """
    path = Path(file_path)

    # Guard: the file must exist.
    if not path.exists():
        return ParseResult(success=False, error=f"文件不存在: {file_path}")

    # Guard: the extension must be one this parser supports.
    if path.suffix.lower() not in self.supported_extensions:
        return ParseResult(success=False, error=f"不支持的文件类型: {path.suffix}")

    try:
        doc = Document(file_path)

        # Keep only paragraphs that contain non-whitespace text.
        paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]

        # Collect every table that has at least one row.
        tables_data = []
        for index, table in enumerate(doc.tables):
            grid = [
                [cell.text.strip() for cell in row.cells]
                for row in table.rows
            ]
            if not grid:
                continue
            tables_data.append({
                "table_index": index,
                "rows": grid,
                "row_count": len(grid),
                "column_count": len(grid[0]),
            })

        full_text = "\n".join(paragraphs)
        text_length = len(full_text)
        table_total = len(tables_data)

        metadata = {
            "filename": path.name,
            "extension": path.suffix.lower(),
            "file_size": path.stat().st_size,
            "paragraph_count": len(paragraphs),
            "table_count": table_total,
            "word_count": text_length,
            "char_count": len(full_text.replace("\n", "")),
            "has_tables": table_total > 0,
        }

        payload = {
            "content": full_text,
            "paragraphs": paragraphs,
            "tables": tables_data,
            "word_count": text_length,
            "structured_data": {
                "paragraphs": paragraphs,
                "tables": tables_data,
            },
        }
        return ParseResult(success=True, data=payload, metadata=metadata)

    except Exception as e:
        logger.error(f"解析 Word 文档失败: {str(e)}")
        return ParseResult(
            success=False,
            error=f"解析 Word 文档失败: {str(e)}"
        )
|
||||
|
||||
def extract_key_sentences(self, text: str, max_sentences: int = 10) -> List[str]:
    """Return the leading sentences of *text*.

    Args:
        text: Text content.
        max_sentences: Maximum number of sentences to return.

    Returns:
        The first ``max_sentences`` non-empty sentences, whitespace-stripped.
    """
    # Simple approach: split on the full stop "。" and keep the first N.
    kept = []
    for piece in text.split("。"):
        trimmed = piece.strip()
        if trimmed:
            kept.append(trimmed)
    return kept[:max_sentences]
|
||||
|
||||
def extract_structured_fields(self, text: str) -> Dict[str, Any]:
    """Try to extract labelled fields from free text.

    Intended for documents with a fixed layout such as contracts or
    resumes, where each value follows a label and a colon.

    Args:
        text: Text content.

    Returns:
        Dict mapping each label that was found to its first captured value;
        labels that do not occur are omitted.
    """
    import re

    # Common field patterns. The separator class accepts both the ASCII
    # colon and the full-width colon (U+FF1A) used in Chinese documents.
    patterns = {
        "姓名": r"姓名[:\uFF1A]\s*(\S+)",
        "电话": r"电话[:\uFF1A]\s*(\d{11}|\d{3}-\d{8})",
        "邮箱": r"邮箱[:\uFF1A]\s*(\S+@\S+)",
        "地址": r"地址[:\uFF1A]\s*(.+?)(?:\n|$)",
        "金额": r"金额[:\uFF1A]\s*(\d+(?:\.\d+)?)",
        "日期": r"日期[:\uFF1A]\s*(\d{4}[年/-]\d{1,2}[月/-]\d{1,2})",
    }

    fields = {}
    for field_name, pattern in patterns.items():
        match = re.search(pattern, text)
        if match:
            fields[field_name] = match.group(1)

    return fields
|
||||
|
||||
def parse_tables_for_template(
    self,
    file_path: str
) -> Dict[str, Any]:
    """Parse the tables of a Word document and extract template fields.

    Meant for the competition scenario: a table template is parsed to find
    the fields that have to be filled in. Rows are expected to look like
    ``field name | hint | value``.

    Args:
        file_path: Path of the Word file.

    Returns:
        Dict with ``tables`` (per-table headers, data rows and field hints),
        ``fields`` (one entry per row whose first cell is non-empty, for
        tables with at least two columns) and ``field_count``.
    """
    doc = Document(file_path)

    template_info = {
        "tables": [],
        "fields": [],
        "field_count": 0
    }

    for table_idx, table in enumerate(doc.tables):
        table_info = {
            "table_index": table_idx,
            "rows": [],
            "headers": [],
            "data_rows": [],
            "field_hints": {}  # field name -> hint/description
        }

        rows = list(table.rows)
        if rows:
            # The first row is the header.
            header_cells = [cell.text.strip() for cell in rows[0].cells]
            table_info["headers"] = header_cells
            # Hints live in the second column, so at least two are needed.
            has_hint_column = len(header_cells) >= 2

            # One pass over the data rows: record the row, and, when
            # possible, the field definition it carries.
            for row_idx, row in enumerate(rows[1:], 1):
                cells = [cell.text.strip() for cell in row.cells]
                table_info["data_rows"].append(cells)
                table_info["rows"].append({
                    "row_index": row_idx,
                    "cells": cells
                })

                if has_hint_column and len(cells) >= 2 and cells[0]:
                    field_name = cells[0]  # first column: field name
                    hint = cells[1]  # second column: hint or description
                    table_info["field_hints"][field_name] = hint

                    template_info["fields"].append({
                        "table_index": table_idx,
                        "row_index": row_idx,
                        "field_name": field_name,
                        "hint": hint,
                        "expected_value": cells[2] if len(cells) > 2 else ""
                    })

        template_info["tables"].append(table_info)

    template_info["field_count"] = len(template_info["fields"])
    return template_info
|
||||
|
||||
def extract_template_fields_from_docx(
    self,
    file_path: str
) -> List[Dict[str, Any]]:
    """
    Build field definitions from the tables of a Word document.

    Delegates table parsing to ``parse_tables_for_template`` and reshapes
    each reported field. The ``cell`` value has the form ``T<table>R<row>``.

    Args:
        file_path: Path of the Word file.

    Returns:
        One dict per field with the keys ``cell``, ``name``, ``hint``,
        ``table_index``, ``row_index``, ``field_type`` and ``required``
        (always ``True``).
    """
    parsed = self.parse_tables_for_template(file_path)

    return [
        {
            "cell": f"T{entry['table_index']}R{entry['row_index']}",
            "name": entry["field_name"],
            "hint": entry["hint"],
            "table_index": entry["table_index"],
            "row_index": entry["row_index"],
            "field_type": self._infer_field_type_from_hint(entry["hint"]),
            "required": True
        }
        for entry in parsed["fields"]
    ]
|
||||
|
||||
def _infer_field_type_from_hint(self, hint: str) -> str:
|
||||
"""
|
||||
从提示词推断字段类型
|
||||
|
||||
Args:
|
||||
hint: 字段提示词
|
||||
|
||||
Returns:
|
||||
字段类型 (text/number/date)
|
||||
"""
|
||||
hint_lower = hint.lower()
|
||||
|
||||
# 日期关键词
|
||||
date_keywords = ["年", "月", "日", "日期", "时间", "出生"]
|
||||
if any(kw in hint for kw in date_keywords):
|
||||
return "date"
|
||||
|
||||
# 数字关键词
|
||||
number_keywords = ["数量", "金额", "人数", "面积", "增长", "比率", "%", "率"]
|
||||
if any(kw in hint_lower for kw in number_keywords):
|
||||
return "number"
|
||||
|
||||
return "text"
|
||||
262
backend/app/core/document_parser/md_parser.py
Normal file
262
backend/app/core/document_parser/md_parser.py
Normal file
@@ -0,0 +1,262 @@
|
||||
"""
|
||||
Markdown 文档解析器
|
||||
"""
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import markdown
|
||||
|
||||
from .base import BaseParser, ParseResult
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MarkdownParser(BaseParser):
|
||||
"""Markdown 文档解析器"""
|
||||
|
||||
def __init__(self):
    """Set up the parser with the Markdown file suffixes it accepts."""
    super().__init__()
    # parse() compares the lower-cased suffix of the input path against this list.
    self.supported_extensions = ['.md', '.markdown']
    self.parser_name = "markdown_parser"
|
||||
|
||||
def parse(
    self,
    file_path: str,
    **kwargs
) -> ParseResult:
    """
    Parse a Markdown document.

    Args:
        file_path: Path of the file to parse.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        ParseResult: On success ``data`` holds the plain text, the raw
        text, the rendered HTML and the extracted titles, code blocks,
        tables, links and images; ``metadata`` holds the matching counts.
        On failure ``success`` is ``False`` and ``error`` describes why.
    """
    path = Path(file_path)

    # The file must exist
    if not path.exists():
        return ParseResult(
            success=False,
            error=f"文件不存在: {file_path}"
        )

    # The suffix must be one this parser supports
    if path.suffix.lower() not in self.supported_extensions:
        return ParseResult(
            success=False,
            error=f"不支持的文件类型: {path.suffix}"
        )

    try:
        # 'utf-8-sig' reads plain UTF-8 as well but drops a leading byte
        # order mark; with plain 'utf-8' the BOM stays in the text and a
        # heading on the first line no longer starts with '#'.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            raw_content = f.read()

        # Render to HTML
        md = markdown.Markdown(extensions=[
            'markdown.extensions.tables',
            'markdown.extensions.fenced_code',
            'markdown.extensions.codehilite',
            'markdown.extensions.toc',
        ])
        html_content = md.convert(raw_content)

        # Structural extraction works on the raw Markdown text
        titles = self._extract_titles(raw_content)
        code_blocks = self._extract_code_blocks(raw_content)
        tables = self._extract_tables(raw_content)
        links_images = self._extract_links_images(raw_content)

        # Plain text with the Markdown syntax removed
        plain_text = self._strip_markdown(raw_content)

        # NOTE: "word_count" is the character length of the plain text.
        metadata = {
            "filename": path.name,
            "extension": path.suffix.lower(),
            "file_size": path.stat().st_size,
            "word_count": len(plain_text),
            "char_count": len(raw_content),
            "line_count": len(raw_content.splitlines()),
            "title_count": len(titles),
            "code_block_count": len(code_blocks),
            "table_count": len(tables),
            "link_count": len(links_images.get("links", [])),
            "image_count": len(links_images.get("images", [])),
        }

        return ParseResult(
            success=True,
            data={
                "content": plain_text,
                "raw_content": raw_content,
                "html_content": html_content,
                "titles": titles,
                "code_blocks": code_blocks,
                "tables": tables,
                "links_images": links_images,
                "word_count": len(plain_text),
                "structured_data": {
                    "titles": titles,
                    "code_blocks": code_blocks,
                    "tables": tables
                }
            },
            metadata=metadata
        )

    except Exception as e:
        # Any failure is reported through the result object, not raised.
        logger.error(f"解析 Markdown 文档失败: {str(e)}")
        return ParseResult(
            success=False,
            error=f"解析 Markdown 文档失败: {str(e)}"
        )
|
||||
|
||||
def _extract_titles(self, content: str) -> List[Dict[str, Any]]:
|
||||
"""提取标题结构"""
|
||||
import re
|
||||
titles = []
|
||||
|
||||
# 匹配 # 标题
|
||||
for match in re.finditer(r'^(#{1,6})\s+(.+)$', content, re.MULTILINE):
|
||||
level = len(match.group(1))
|
||||
title_text = match.group(2).strip()
|
||||
titles.append({
|
||||
"level": level,
|
||||
"text": title_text,
|
||||
"line": content[:match.start()].count('\n') + 1
|
||||
})
|
||||
|
||||
return titles
|
||||
|
||||
def _extract_code_blocks(self, content: str) -> List[Dict[str, str]]:
|
||||
"""提取代码块"""
|
||||
import re
|
||||
code_blocks = []
|
||||
|
||||
# 匹配 ```code ``` 格式
|
||||
pattern = r'```(\w*)\n(.*?)```'
|
||||
for match in re.finditer(pattern, content, re.DOTALL):
|
||||
language = match.group(1) or "text"
|
||||
code = match.group(2).strip()
|
||||
code_blocks.append({
|
||||
"language": language,
|
||||
"code": code
|
||||
})
|
||||
|
||||
return code_blocks
|
||||
|
||||
def _extract_tables(self, content: str) -> List[Dict[str, Any]]:
|
||||
"""提取表格"""
|
||||
import re
|
||||
tables = []
|
||||
|
||||
# 简单表格匹配(| col1 | col2 | 格式)
|
||||
lines = content.split('\n')
|
||||
i = 0
|
||||
while i < len(lines):
|
||||
line = lines[i].strip()
|
||||
|
||||
# 检查是否是表格行
|
||||
if line.startswith('|') and line.endswith('|'):
|
||||
# 找到表头
|
||||
header_row = [cell.strip() for cell in line.split('|')[1:-1]]
|
||||
|
||||
# 检查下一行是否是分隔符
|
||||
if i + 1 < len(lines) and re.match(r'^\|[\s\-:|]+\|$', lines[i + 1]):
|
||||
# 跳过分隔符,读取数据行
|
||||
data_rows = []
|
||||
for j in range(i + 2, len(lines)):
|
||||
row_line = lines[j].strip()
|
||||
if not (row_line.startswith('|') and row_line.endswith('|')):
|
||||
break
|
||||
row_data = [cell.strip() for cell in row_line.split('|')[1:-1]]
|
||||
data_rows.append(row_data)
|
||||
|
||||
if header_row and data_rows:
|
||||
tables.append({
|
||||
"headers": header_row,
|
||||
"rows": data_rows,
|
||||
"row_count": len(data_rows),
|
||||
"column_count": len(header_row)
|
||||
})
|
||||
i = j - 1
|
||||
|
||||
i += 1
|
||||
|
||||
return tables
|
||||
|
||||
def _extract_links_images(self, content: str) -> Dict[str, List[Dict[str, str]]]:
|
||||
"""提取链接和图片"""
|
||||
import re
|
||||
result = {"links": [], "images": []}
|
||||
|
||||
# 提取链接 [text](url)
|
||||
for match in re.finditer(r'\[([^\]]+)\]\(([^\)]+)\)', content):
|
||||
result["links"].append({
|
||||
"text": match.group(1),
|
||||
"url": match.group(2)
|
||||
})
|
||||
|
||||
# 提取图片 
|
||||
for match in re.finditer(r'!\[([^\]]*)\]\(([^\)]+)\)', content):
|
||||
result["images"].append({
|
||||
"alt": match.group(1),
|
||||
"url": match.group(2)
|
||||
})
|
||||
|
||||
return result
|
||||
|
||||
def _strip_markdown(self, content: str) -> str:
|
||||
"""去除 Markdown 语法,获取纯文本"""
|
||||
import re
|
||||
|
||||
# 去除代码块
|
||||
content = re.sub(r'```[\s\S]*?```', '', content)
|
||||
|
||||
# 去除行内代码
|
||||
content = re.sub(r'`[^`]+`', '', content)
|
||||
|
||||
# 去除图片
|
||||
content = re.sub(r'!\[([^\]]*)\]\([^\)]+\)', r'\1', content)
|
||||
|
||||
# 去除链接,保留文本
|
||||
content = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', content)
|
||||
|
||||
# 去除标题标记
|
||||
content = re.sub(r'^#{1,6}\s+', '', content, flags=re.MULTILINE)
|
||||
|
||||
# 去除加粗和斜体
|
||||
content = re.sub(r'\*\*([^\*]+)\*\*', r'\1', content)
|
||||
content = re.sub(r'\*([^\*]+)\*', r'\1', content)
|
||||
content = re.sub(r'__([^_]+)__', r'\1', content)
|
||||
content = re.sub(r'_([^_]+)_', r'\1', content)
|
||||
|
||||
# 去除引用标记
|
||||
content = re.sub(r'^>\s+', '', content, flags=re.MULTILINE)
|
||||
|
||||
# 去除列表标记
|
||||
content = re.sub(r'^[-*+]\s+', '', content, flags=re.MULTILINE)
|
||||
content = re.sub(r'^\d+\.\s+', '', content, flags=re.MULTILINE)
|
||||
|
||||
# 去除水平线
|
||||
content = re.sub(r'^[-*_]{3,}$', '', content, flags=re.MULTILINE)
|
||||
|
||||
# 去除表格分隔符
|
||||
content = re.sub(r'^\|[\s\-:|]+\|$', '', content, flags=re.MULTILINE)
|
||||
|
||||
# 清理多余空行
|
||||
content = re.sub(r'\n{3,}', '\n\n', content)
|
||||
|
||||
return content.strip()
|
||||
278
backend/app/core/document_parser/txt_parser.py
Normal file
278
backend/app/core/document_parser/txt_parser.py
Normal file
@@ -0,0 +1,278 @@
|
||||
"""
|
||||
纯文本 (.txt) 解析器
|
||||
"""
|
||||
import logging
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import chardet
|
||||
|
||||
from .base import BaseParser, ParseResult
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TxtParser(BaseParser):
|
||||
"""纯文本文档解析器"""
|
||||
|
||||
def __init__(self):
    """Set up the parser with the plain-text file suffix it accepts."""
    super().__init__()
    # parse() compares the lower-cased suffix of the input path against this list.
    self.supported_extensions = ['.txt']
    self.parser_name = "txt_parser"
|
||||
|
||||
def parse(
    self,
    file_path: str,
    encoding: Optional[str] = None,
    **kwargs
) -> ParseResult:
    """
    Parse a plain-text file.

    Args:
        file_path: Path of the file to parse.
        encoding: Encoding to read with; detected automatically when not
            given.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        ParseResult: On success ``data`` holds the cleaned text, the raw
        text, the cleaned lines and the counts; ``metadata`` holds file
        facts and the same counts. On failure ``success`` is ``False`` and
        ``error`` describes why.
    """
    source = Path(file_path)

    # Guard: the file must exist
    if not source.exists():
        return ParseResult(
            success=False,
            error=f"文件不存在: {file_path}"
        )

    # Guard: the suffix must be supported
    suffix = source.suffix.lower()
    if suffix not in self.supported_extensions:
        return ParseResult(
            success=False,
            error=f"不支持的文件类型: {source.suffix}"
        )

    try:
        # Fall back to detection when the caller named no encoding
        encoding = encoding or self._detect_encoding(file_path)

        with open(file_path, 'r', encoding=encoding) as handle:
            raw_content = handle.read()

        content = self._clean_text(raw_content)
        lines = content.split('\n')

        # Counts are taken on the cleaned text; "word_count" counts the
        # characters left after newlines and spaces are removed.
        word_count = len(content.replace('\n', '').replace(' ', ''))
        char_total = len(content)
        line_total = len(lines)
        non_empty_total = len([entry for entry in lines if entry.strip()])

        metadata = {
            "filename": source.name,
            "extension": suffix,
            "file_size": source.stat().st_size,
            "encoding": encoding,
            "line_count": line_total,
            "word_count": word_count,
            "char_count": char_total,
            "non_empty_line_count": non_empty_total
        }

        payload = {
            "content": content,
            "raw_content": raw_content,
            "lines": lines,
            "word_count": word_count,
            "char_count": char_total,
            "line_count": line_total,
            "structured_data": {
                "line_count": line_total,
                "non_empty_line_count": non_empty_total
            }
        }

        return ParseResult(success=True, data=payload, metadata=metadata)

    except Exception as e:
        # Any failure is reported through the result object, not raised.
        logger.error(f"解析文本文件失败: {str(e)}")
        return ParseResult(
            success=False,
            error=f"解析文本文件失败: {str(e)}"
        )
|
||||
|
||||
def _detect_encoding(self, file_path: str) -> str:
    """
    Detect the encoding of a file.

    The whole file is read as bytes and handed to ``chardet``. The guess
    is accepted only if the bytes really decode with it.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The detected encoding name, or ``'utf-8'`` when detection yields
        nothing, the guess does not decode the data, or any error occurs.
    """
    fallback = 'utf-8'

    try:
        with open(file_path, 'rb') as handle:
            payload = handle.read()

        guess = chardet.detect(payload).get('encoding', 'utf-8')
        if not guess:
            return fallback

        # Verify the guess: it must name a known codec that decodes the data.
        try:
            payload.decode(guess)
        except (UnicodeDecodeError, LookupError):
            return fallback
        return guess

    except Exception as e:
        logger.warning(f"编码检测失败,使用默认编码: {str(e)}")
        return fallback
|
||||
|
||||
def _clean_text(self, text: str) -> str:
|
||||
"""
|
||||
清理文本内容
|
||||
|
||||
- 去除多余空白字符
|
||||
- 规范化换行符
|
||||
- 去除特殊控制字符
|
||||
|
||||
Args:
|
||||
text: 原始文本
|
||||
|
||||
Returns:
|
||||
清理后的文本
|
||||
"""
|
||||
# 规范化换行符
|
||||
text = text.replace('\r\n', '\n').replace('\r', '\n')
|
||||
|
||||
# 去除控制字符(除了换行和tab)
|
||||
text = re.sub(r'[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f]', '', text)
|
||||
|
||||
# 将多个连续空格合并为一个
|
||||
text = re.sub(r'[ \t]+', ' ', text)
|
||||
|
||||
# 将多个连续空行合并为一个
|
||||
text = re.sub(r'\n{3,}', '\n\n', text)
|
||||
|
||||
return text.strip()
|
||||
|
||||
def extract_structured_data(self, content: str) -> Dict[str, Any]:
    """
    Try to extract structured items from free text.

    Extracts e-mail addresses, URLs, phone numbers (11-digit mobile and
    ``area-number`` landline forms), dates and amounts.

    Args:
        content: Text to scan.

    Returns:
        A dict with the keys ``emails``, ``urls``, ``phones``, ``dates``
        and ``amounts``. Each value is a list of the distinct matches in
        order of first appearance.
    """
    def unique(found):
        # dict.fromkeys drops duplicates and keeps first-seen order, so
        # the result is deterministic (unlike list(set(...))).
        return list(dict.fromkeys(found))

    def find_all(patterns):
        hits = []
        for pattern in patterns:
            hits.extend(re.findall(pattern, content))
        return unique(hits)

    return {
        # The top-level-domain class is letters only; a '|' inside a
        # character class is a literal pipe, not an alternation.
        "emails": find_all([
            r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
        ]),
        "urls": find_all([
            r'https?://[^\s<>"{}|\\^`\[\]]+',
        ]),
        # The digit look-arounds keep a phone pattern from matching a
        # slice of a longer digit run such as an ID or account number.
        "phones": find_all([
            r'(?<!\d)1[3-9]\d{9}(?!\d)',       # mobile number
            r'(?<!\d)\d{3,4}-\d{7,8}(?!\d)',   # landline
        ]),
        "dates": find_all([
            r'\d{4}[-/年]\d{1,2}[-/月]\d{1,2}[日]?',
            r'\d{4}\.\d{1,2}\.\d{1,2}',
        ]),
        "amounts": find_all([
            r'¥\s*\d+(?:\.\d{1,2})?',
            r'\$\s*\d+(?:\.\d{1,2})?',
            r'\d+(?:\.\d{1,2})?\s*元',
        ]),
    }
|
||||
|
||||
def split_into_chunks(
    self,
    content: str,
    chunk_size: int = 1000,
    overlap: int = 100
) -> List[str]:
    """
    Split a long text into chunks, e.g. for indexing or LLM input.

    A chunk is cut at the last ideographic full stop or newline inside it
    when that boundary lies in the second half of the chunk; otherwise it
    is cut at ``chunk_size`` characters. Consecutive chunks share
    ``overlap`` characters.

    Args:
        content: Text to split.
        chunk_size: Maximum number of characters per chunk.
        overlap: Number of characters repeated at the start of the next
            chunk. Negative values are treated as 0. Values too large to
            let the window advance are limited so that every chunk starts
            at least one character after the previous one.

    Returns:
        The list of chunks; ``[content]`` when the text already fits.

    Raises:
        ValueError: If ``chunk_size`` is not positive and the text does
            not fit (the loop could never make progress).
    """
    if len(content) <= chunk_size:
        return [content]

    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    overlap = max(overlap, 0)
    total = len(content)
    chunks = []
    start = 0

    while start < total:
        end = start + chunk_size
        chunk = content[start:end]

        # Prefer to end the chunk on a sentence or line boundary
        if end < total:
            split_pos = max(chunk.rfind('。'), chunk.rfind('\n'))
            if split_pos > chunk_size // 2:
                chunk = chunk[:split_pos + 1]
                end = start + split_pos + 1

        chunks.append(chunk)

        if end >= total:
            break

        # Step back by the overlap, but always advance by at least one
        # character; otherwise a large overlap would loop forever.
        start = max(end - overlap, start + 1)

    return chunks
|
||||
@@ -67,11 +67,14 @@ class XlsxParser(BaseParser):
|
||||
xls_file = pd.ExcelFile(file_path)
|
||||
sheet_names = xls_file.sheet_names
|
||||
|
||||
# 如果 pandas 返回空列表,尝试从 XML 提取
|
||||
if not sheet_names:
|
||||
return ParseResult(
|
||||
success=False,
|
||||
error=f"Excel 文件没有找到任何工作表: {file_path}"
|
||||
)
|
||||
sheet_names = self._extract_sheet_names_from_xml(file_path)
|
||||
if not sheet_names:
|
||||
return ParseResult(
|
||||
success=False,
|
||||
error=f"Excel 文件没有找到任何工作表: {file_path}"
|
||||
)
|
||||
|
||||
# 验证请求的工作表索引/名称
|
||||
target_sheet = None
|
||||
@@ -88,15 +91,28 @@ class XlsxParser(BaseParser):
|
||||
target_sheet = sheet_names[0]
|
||||
|
||||
# 读取 Excel 文件
|
||||
df = pd.read_excel(
|
||||
file_path,
|
||||
sheet_name=target_sheet,
|
||||
header=header_row,
|
||||
**kwargs
|
||||
)
|
||||
df = None
|
||||
try:
|
||||
df = pd.read_excel(
|
||||
file_path,
|
||||
sheet_name=target_sheet,
|
||||
header=header_row,
|
||||
**kwargs
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(f"pandas 读取 Excel 失败,尝试 XML 方式: {e}")
|
||||
# pandas 读取失败,尝试 XML 方式
|
||||
df = self._read_excel_sheet_xml(file_path, sheet_name=target_sheet, header_row=header_row)
|
||||
|
||||
# 检查 DataFrame 是否为空
|
||||
if df.empty:
|
||||
# 检查 DataFrame 是否为空(但如果有列名,仍算有效)
|
||||
if df is None:
|
||||
return ParseResult(
|
||||
success=False,
|
||||
error=f"工作表 '{target_sheet}' 读取失败"
|
||||
)
|
||||
|
||||
# 如果 DataFrame 为空但有列名(比如模板文件),仍算有效
|
||||
if df.empty and len(df.columns) == 0:
|
||||
return ParseResult(
|
||||
success=False,
|
||||
error=f"工作表 '{target_sheet}' 为空,请检查 Excel 文件内容"
|
||||
@@ -211,7 +227,26 @@ class XlsxParser(BaseParser):
|
||||
|
||||
try:
|
||||
# 读取所有工作表
|
||||
all_data = pd.read_excel(file_path, sheet_name=None, **kwargs)
|
||||
all_data = None
|
||||
try:
|
||||
all_data = pd.read_excel(file_path, sheet_name=None, **kwargs)
|
||||
except Exception as e:
|
||||
logger.warning(f"pandas 读取所有工作表失败: {e}")
|
||||
|
||||
# 如果 pandas 失败,尝试 XML 方式
|
||||
if all_data is None or len(all_data) == 0:
|
||||
sheet_names = self._extract_sheet_names_from_xml(file_path)
|
||||
if not sheet_names:
|
||||
return ParseResult(
|
||||
success=False,
|
||||
error=f"无法读取 Excel 文件或文件为空: {file_path}"
|
||||
)
|
||||
# 使用 XML 方式读取每个工作表
|
||||
all_data = {}
|
||||
for sheet_name in sheet_names:
|
||||
df = self._read_excel_sheet_xml(file_path, sheet_name=sheet_name, header_row=0)
|
||||
if df is not None and not df.empty:
|
||||
all_data[sheet_name] = df
|
||||
|
||||
# 检查是否成功读取到数据
|
||||
if not all_data or len(all_data) == 0:
|
||||
@@ -257,13 +292,231 @@ class XlsxParser(BaseParser):
|
||||
try:
|
||||
xls = pd.ExcelFile(file_path)
|
||||
sheet_names = xls.sheet_names
|
||||
if not sheet_names:
|
||||
return []
|
||||
return sheet_names
|
||||
if sheet_names:
|
||||
return sheet_names
|
||||
# pandas 返回空列表,尝试从 XML 提取
|
||||
return self._extract_sheet_names_from_xml(file_path)
|
||||
except Exception as e:
|
||||
logger.error(f"获取工作表名称失败: {str(e)}")
|
||||
# 尝试从 XML 提取
|
||||
return self._extract_sheet_names_from_xml(file_path)
|
||||
|
||||
def _extract_sheet_names_from_xml(self, file_path: str) -> List[str]:
    """
    Read the worksheet names straight from the workbook XML of an Excel file.

    Used as a fallback when pandas/openpyxl report no sheets; the original
    author notes that files containing non-standard elements such as
    ``mc:AlternateContent`` trigger this.

    ``<sheet>`` elements are matched by local name, so the XML namespace
    the file uses does not matter. If the XML cannot be parsed, or holds
    no named ``<sheet>`` element, the names are taken from the raw text
    with a regular expression instead.

    Args:
        file_path: Path of the Excel file.

    Returns:
        The sheet names in document order without duplicates; an empty
        list when the workbook part is missing or any error occurs.
    """
    import re
    import zipfile
    from xml.etree import ElementTree as ET
    from xml.sax.saxutils import unescape

    try:
        with zipfile.ZipFile(file_path, 'r') as z:
            members = z.namelist()

            # Try the known locations of the workbook part
            content = None
            for path in ('xl/workbook.xml', 'xl\\workbook.xml', 'workbook.xml'):
                if path in members:
                    content = z.read(path)
                    logger.info(f"找到 workbook.xml at: {path}")
                    break

            if content is None:
                logger.warning(f"未找到 workbook.xml,文件列表: {members[:10]}")
                return []

        sheet_names = []

        # Preferred: real XML parsing, namespace-agnostic
        try:
            root = ET.fromstring(content)
        except ET.ParseError as e:
            # Malformed XML is the case the regex fallback exists for.
            logger.warning(f"workbook.xml 解析失败,尝试正则提取: {e}")
        else:
            for elem in root.iter():
                if elem.tag.rsplit('}', 1)[-1] == 'sheet':
                    name = elem.get('name')
                    if name and name not in sheet_names:
                        sheet_names.append(name)

        # Fallback: pull the name attributes out of the raw text
        if not sheet_names:
            xml_str = content.decode('utf-8', errors='ignore')
            matches = re.findall(r'<sheet\s+[^>]*name=["\']([^"\']+)["\']', xml_str, re.IGNORECASE)
            if matches:
                # Raw attribute text is still XML-escaped (e.g. "&amp;").
                quote_entities = {"&quot;": '"', "&apos;": "'"}
                for raw_name in matches:
                    name = unescape(raw_name, quote_entities)
                    if name not in sheet_names:
                        sheet_names.append(name)
                logger.info(f"使用正则提取工作表: {sheet_names}")

        logger.info(f"从 XML 提取工作表: {sheet_names}")
        return sheet_names

    except Exception as e:
        logger.error(f"从 XML 提取工作表名称失败: {e}")
        return []
|
||||
|
||||
def _read_excel_sheet_xml(self, file_path: str, sheet_name: str = None, header_row: int = 0) -> pd.DataFrame:
|
||||
"""
|
||||
从 XML 直接读取 Excel 工作表数据
|
||||
|
||||
当 pandas 无法正确解析时使用此方法。
|
||||
|
||||
Args:
|
||||
file_path: Excel 文件路径
|
||||
sheet_name: 工作表名称(如果为 None,读取第一个工作表)
|
||||
header_row: 表头行号(0-indexed)
|
||||
|
||||
Returns:
|
||||
DataFrame
|
||||
"""
|
||||
import zipfile
|
||||
from xml.etree import ElementTree as ET
|
||||
|
||||
# 常见的命名空间
|
||||
COMMON_NAMESPACES = [
|
||||
'http://schemas.openxmlformats.org/spreadsheetml/2006/main',
|
||||
'http://schemas.openxmlformats.org/spreadsheetml/2005/main',
|
||||
'http://schemas.openxmlformats.org/spreadsheetml/2004/main',
|
||||
'http://schemas.openxmlformats.org/spreadsheetml/2003/main',
|
||||
]
|
||||
|
||||
def find_elements_with_ns(root, tag_name):
|
||||
"""灵活查找元素,支持任意命名空间"""
|
||||
results = []
|
||||
# 方法1:用固定命名空间
|
||||
for ns in COMMON_NAMESPACES:
|
||||
try:
|
||||
elems = root.findall(f'.//{{{ns}}}{tag_name}')
|
||||
if elems:
|
||||
results.extend(elems)
|
||||
except:
|
||||
pass
|
||||
# 方法2:不带命名空间查找
|
||||
if not results:
|
||||
for elem in root.iter():
|
||||
if elem.tag.endswith('}' + tag_name):
|
||||
results.append(elem)
|
||||
return results
|
||||
|
||||
with zipfile.ZipFile(file_path, 'r') as z:
|
||||
# 获取工作表名称
|
||||
sheet_names = self._extract_sheet_names_from_xml(file_path)
|
||||
if not sheet_names:
|
||||
raise ValueError("无法从 Excel 文件中找到工作表")
|
||||
|
||||
# 确定要读取的工作表
|
||||
target_sheet = sheet_name if sheet_name and sheet_name in sheet_names else sheet_names[0]
|
||||
sheet_index = sheet_names.index(target_sheet) + 1 # sheet1.xml, sheet2.xml, ...
|
||||
|
||||
# 读取 shared strings - 尝试多种路径
|
||||
shared_strings = []
|
||||
ss_paths = ['xl/sharedStrings.xml', 'xl\\sharedStrings.xml', 'sharedStrings.xml']
|
||||
for ss_path in ss_paths:
|
||||
if ss_path in z.namelist():
|
||||
try:
|
||||
ss_content = z.read(ss_path)
|
||||
ss_root = ET.fromstring(ss_content)
|
||||
for si in find_elements_with_ns(ss_root, 'si'):
|
||||
t_elements = [c for c in si if c.tag.endswith('}t') or c.tag == 't']
|
||||
if t_elements:
|
||||
shared_strings.append(t_elements[0].text or '')
|
||||
else:
|
||||
shared_strings.append('')
|
||||
break
|
||||
except Exception as e:
|
||||
logger.warning(f"读取 sharedStrings 失败: {e}")
|
||||
|
||||
# 读取工作表 - 尝试多种可能的路径
|
||||
sheet_content = None
|
||||
sheet_paths = [
|
||||
f'xl/worksheets/sheet{sheet_index}.xml',
|
||||
f'xl\\worksheets\\sheet{sheet_index}.xml',
|
||||
f'worksheets/sheet{sheet_index}.xml',
|
||||
]
|
||||
for sp in sheet_paths:
|
||||
if sp in z.namelist():
|
||||
sheet_content = z.read(sp)
|
||||
break
|
||||
|
||||
if sheet_content is None:
|
||||
raise ValueError(f"工作表文件 sheet{sheet_index}.xml 不存在")
|
||||
|
||||
root = ET.fromstring(sheet_content)
|
||||
|
||||
# 收集所有行数据
|
||||
all_rows = []
|
||||
headers = {}
|
||||
|
||||
for row in find_elements_with_ns(root, 'row'):
|
||||
row_idx = int(row.get('r', 0))
|
||||
row_cells = {}
|
||||
for cell in find_elements_with_ns(row, 'c'):
|
||||
cell_ref = cell.get('r', '')
|
||||
col_letters = ''.join(filter(str.isalpha, cell_ref))
|
||||
cell_type = cell.get('t', 'n')
|
||||
v_elements = find_elements_with_ns(cell, 'v')
|
||||
v = v_elements[0] if v_elements else None
|
||||
|
||||
if v is not None and v.text:
|
||||
if cell_type == 's':
|
||||
try:
|
||||
row_cells[col_letters] = shared_strings[int(v.text)]
|
||||
except (ValueError, IndexError):
|
||||
row_cells[col_letters] = v.text
|
||||
elif cell_type == 'b':
|
||||
row_cells[col_letters] = v.text == '1'
|
||||
else:
|
||||
row_cells[col_letters] = v.text
|
||||
else:
|
||||
row_cells[col_letters] = None
|
||||
|
||||
if row_idx == header_row + 1:
|
||||
headers = {**row_cells}
|
||||
elif row_idx > header_row + 1:
|
||||
all_rows.append(row_cells)
|
||||
|
||||
# 构建 DataFrame
|
||||
if headers:
|
||||
col_order = list(headers.keys())
|
||||
df = pd.DataFrame(all_rows)
|
||||
if not df.empty:
|
||||
df = df[col_order]
|
||||
df.columns = [headers.get(col, col) for col in df.columns]
|
||||
else:
|
||||
df = pd.DataFrame(all_rows)
|
||||
|
||||
return df
|
||||
|
||||
def _df_to_dict(self, df: pd.DataFrame) -> Dict[str, Any]:
|
||||
"""
|
||||
将 DataFrame 转换为字典,处理 NaN 值
|
||||
|
||||
@@ -0,0 +1,15 @@
|
||||
"""
|
||||
指令执行模块
|
||||
|
||||
注意: 此模块为可选功能,当前尚未实现。
|
||||
如需启用,请实现 intent_parser.py 和 executor.py
|
||||
"""
|
||||
from .intent_parser import IntentParser, DefaultIntentParser
|
||||
from .executor import InstructionExecutor, DefaultInstructionExecutor
|
||||
|
||||
__all__ = [
|
||||
"IntentParser",
|
||||
"DefaultIntentParser",
|
||||
"InstructionExecutor",
|
||||
"DefaultInstructionExecutor",
|
||||
]
|
||||
|
||||
@@ -0,0 +1,35 @@
|
||||
"""
|
||||
指令执行器模块
|
||||
|
||||
将自然语言指令转换为可执行操作
|
||||
|
||||
注意: 此模块为可选功能,当前尚未实现。
|
||||
"""
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any, Dict
|
||||
|
||||
|
||||
class InstructionExecutor(ABC):
    """Abstract base class for instruction executors."""

    @abstractmethod
    async def execute(self, instruction: str, context: Dict[str, Any]) -> Dict[str, Any]:
        """
        Execute an instruction.

        Args:
            instruction: The parsed instruction.
            context: Execution context.

        Returns:
            The execution result.
        """
|
||||
|
||||
|
||||
class DefaultInstructionExecutor(InstructionExecutor):
    """Default instruction executor; a placeholder without an implementation."""

    async def execute(self, instruction: str, context: Dict[str, Any]) -> Dict[str, Any]:
        """Not implemented yet; always raises ``NotImplementedError``."""
        message = "指令执行功能暂未实现"
        raise NotImplementedError(message)
|
||||
|
||||
@@ -0,0 +1,34 @@
|
||||
"""
|
||||
意图解析器模块
|
||||
|
||||
解析用户自然语言指令,识别意图和参数
|
||||
|
||||
注意: 此模块为可选功能,当前尚未实现。
|
||||
"""
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any, Dict, Tuple
|
||||
|
||||
|
||||
class IntentParser(ABC):
    """Abstract base class for intent parsers."""

    @abstractmethod
    async def parse(self, text: str) -> Tuple[str, Dict[str, Any]]:
        """
        Parse a natural-language instruction.

        Args:
            text: Natural-language input from the user.

        Returns:
            A tuple of (intent type, parameter dict).
        """
|
||||
|
||||
|
||||
class DefaultIntentParser(IntentParser):
    """Default intent parser; a placeholder without an implementation."""

    async def parse(self, text: str) -> Tuple[str, Dict[str, Any]]:
        """Not implemented yet; always raises ``NotImplementedError``."""
        message = "意图解析功能暂未实现"
        raise NotImplementedError(message)
|
||||
|
||||
@@ -1,10 +1,187 @@
|
||||
"""
|
||||
FastAPI 应用主入口
|
||||
"""
|
||||
from fastapi import FastAPI
|
||||
import logging
|
||||
import logging.handlers
|
||||
import sys
|
||||
import uuid
|
||||
from contextlib import asynccontextmanager
|
||||
from typing import Callable
|
||||
from functools import wraps
|
||||
|
||||
from fastapi import FastAPI, Request, Response
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from starlette.middleware.base import BaseHTTPMiddleware
|
||||
|
||||
from app.config import settings
|
||||
from app.api import api_router
|
||||
from app.core.database import mysql_db, mongodb, redis_db
|
||||
|
||||
# ==================== 日志配置 ====================
|
||||
|
||||
def setup_logging():
    """
    Configure the application's logging.

    Installs three handlers on the root logger:

    - console (stdout) at DEBUG when ``settings.DEBUG`` is set, else INFO
    - ``data/logs/app.log``: every record, rotated at 10 MB, 5 backups
    - ``data/logs/error.log``: ERROR and above, same rotation

    Handlers already attached to the root logger are removed and closed
    first, so calling this function again does not leave open log files
    behind. A few third-party loggers are limited to WARNING.

    Returns:
        The configured root logger.
    """
    from pathlib import Path

    log_level = logging.DEBUG if settings.DEBUG else logging.INFO

    # Log directory, relative to the current working directory
    log_dir = Path("data/logs")
    log_dir.mkdir(parents=True, exist_ok=True)

    log_file = log_dir / "app.log"
    error_log_file = log_dir / "error.log"

    # Console handler
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setLevel(log_level)
    console_handler.setFormatter(logging.Formatter(
        fmt="%(asctime)s | %(levelname)-8s | %(name)s:%(lineno)d | %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S"
    ))

    file_formatter = logging.Formatter(
        fmt="%(asctime)s | %(levelname)-8s | %(name)s:%(lineno)d | %(funcName)s | %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S"
    )

    def build_file_handler(target, level):
        """Create a rotating file handler (10 MB, 5 backups) for ``target``."""
        handler = logging.handlers.RotatingFileHandler(
            target,
            maxBytes=10 * 1024 * 1024,  # 10MB
            backupCount=5,
            encoding="utf-8"
        )
        handler.setLevel(level)
        handler.setFormatter(file_formatter)
        return handler

    file_handler = build_file_handler(log_file, logging.DEBUG)         # all records
    error_file_handler = build_file_handler(error_log_file, logging.ERROR)  # errors only

    # Root logger: replace whatever handlers were installed before.
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.DEBUG)
    for stale in list(root_logger.handlers):
        root_logger.removeHandler(stale)
        stale.close()  # releases the file of a previously installed file handler
    root_logger.addHandler(console_handler)
    root_logger.addHandler(file_handler)
    root_logger.addHandler(error_file_handler)

    # Keep third-party libraries quiet
    for lib in ["uvicorn", "uvicorn.access", "fastapi", "httpx", "sqlalchemy"]:
        logging.getLogger(lib).setLevel(logging.WARNING)

    root_logger.info(f"日志系统初始化完成 | 日志目录: {log_dir}")
    root_logger.info(f"主日志文件: {log_file} | 错误日志: {error_log_file}")

    return root_logger
|
||||
|
||||
# 初始化日志
|
||||
setup_logging()
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ==================== 请求日志中间件 ====================
|
||||
|
||||
class RequestLoggingMiddleware(BaseHTTPMiddleware):
    """Request logging middleware.

    For every request it generates a short request id, stores it on
    ``request.state.request_id``, logs the incoming method and path, and then
    logs either the response status or the exception raised downstream,
    together with the elapsed time. The id is also sent back to the client in
    the ``X-Request-ID`` response header.
    """

    async def dispatch(self, request: Request, call_next: Callable) -> Response:
        """Log the request, delegate to ``call_next`` and log the outcome.

        Args:
            request: The incoming request.
            call_next: Callable that forwards the request down the stack.

        Returns:
            The downstream response with an ``X-Request-ID`` header added.

        Raises:
            Exception: Whatever ``call_next`` raises is logged and re-raised.
        """
        import time  # local import: only this method needs it

        # First 8 characters of a UUID4: short enough to read in log lines.
        request_id = str(uuid.uuid4())[:8]
        request.state.request_id = request_id

        logger.info(f"→ [{request_id}] {request.method} {request.url.path}")

        started = time.perf_counter()
        try:
            response = await call_next(request)
        except Exception as e:
            elapsed_ms = (time.perf_counter() - started) * 1000
            # exc_info keeps the traceback in the log; the message alone is
            # rarely enough to diagnose a failed request.
            logger.error(
                f"✗ [{request_id}] {request.method} {request.url.path} "
                f"| 异常: {str(e)} | 耗时: {elapsed_ms:.1f}ms",
                exc_info=True,
            )
            raise

        elapsed_ms = (time.perf_counter() - started) * 1000
        logger.info(
            f"← [{request_id}] {request.method} {request.url.path} "
            f"| 状态: {response.status_code} | 耗时: {elapsed_ms:.1f}ms"
        )

        # Expose the request id so clients can correlate with server logs.
        response.headers["X-Request-ID"] = request_id
        return response
|
||||
|
||||
|
||||
# ==================== 请求追踪装饰器 ====================
|
||||
|
||||
def log_async_function(func: Callable) -> Callable:
    """Decorator that logs start, completion and failure of a coroutine function.

    Start and completion are logged at DEBUG level. A failure is logged at
    ERROR level and the exception is re-raised unchanged.
    """

    @wraps(func)
    async def wrapper(*args, **kwargs):
        name = func.__name__
        logger.debug(f"→ {name} 开始执行")
        try:
            outcome = await func(*args, **kwargs)
            logger.debug(f"← {name} 执行完成")
            return outcome
        except Exception as exc:
            logger.error(f"✗ {name} 执行失败: {str(exc)}")
            raise

    return wrapper
|
||||
|
||||
|
||||
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan handler.

    On startup it initialises MySQL, MongoDB (connect + create indexes) and
    Redis. A failure of one backend is logged and does not prevent the others
    from being initialised or the application from starting.

    On shutdown it closes all three connections. Shutdown runs in ``finally``
    and each ``close()`` is isolated, so one failing backend cannot leave the
    remaining connections open.
    """

    async def _init_mongodb():
        await mongodb.connect()
        await mongodb.create_indexes()

    logger.info("正在初始化数据库连接...")

    startup_steps = (
        ("MySQL", mysql_db.init_db),
        ("MongoDB", _init_mongodb),
        ("Redis", redis_db.connect),
    )
    for label, step in startup_steps:
        try:
            await step()
            logger.info(f"✓ {label} 初始化成功")
        except Exception as e:
            # Best effort: keep starting up even if one backend is unavailable.
            logger.error(f"✗ {label} 初始化失败: {e}")

    logger.info("数据库初始化完成")

    try:
        yield
    finally:
        logger.info("正在关闭数据库连接...")
        shutdown_steps = (
            ("MySQL", mysql_db.close),
            ("MongoDB", mongodb.close),
            ("Redis", redis_db.close),
        )
        for label, step in shutdown_steps:
            try:
                await step()
            except Exception as e:
                logger.error(f"✗ {label} 关闭失败: {e}")
        logger.info("数据库连接已关闭")
|
||||
|
||||
|
||||
# 创建 FastAPI 应用实例
|
||||
app = FastAPI(
|
||||
@@ -13,7 +190,8 @@ app = FastAPI(
|
||||
version="1.0.0",
|
||||
openapi_url=f"{settings.API_V1_STR}/openapi.json",
|
||||
docs_url=f"{settings.API_V1_STR}/docs",
|
||||
redoc_url=f"{settings.API_V1_STR}/redoc"
|
||||
redoc_url=f"{settings.API_V1_STR}/redoc",
|
||||
lifespan=lifespan, # 添加生命周期管理
|
||||
)
|
||||
|
||||
# 配置 CORS 中间件
|
||||
@@ -25,6 +203,9 @@ app.add_middleware(
|
||||
allow_headers=["*"],
|
||||
)
|
||||
|
||||
# 添加请求日志中间件
|
||||
app.add_middleware(RequestLoggingMiddleware)
|
||||
|
||||
# 注册 API 路由
|
||||
app.include_router(api_router, prefix=settings.API_V1_STR)
|
||||
|
||||
@@ -43,10 +224,24 @@ async def root():
|
||||
|
||||
@app.get("/health")
async def health_check():
    """Health-check endpoint.

    Returns:
        A dict with the overall ``status``, the ``service`` name and, under
        ``databases``, a ``"connected"``/``"disconnected"`` flag per backend.
        The flags reflect whether the client objects are set
        (``mysql_db.async_engine``, ``mongodb.client``,
        ``redis_db.is_connected``); no round trip to the servers is made here.
    """

    def _state(connected) -> str:
        return "connected" if connected else "disconnected"

    return {
        "status": "healthy",
        "service": settings.APP_NAME,
        "databases": {
            "mysql": _state(mysql_db.async_engine),
            "mongodb": _state(mongodb.client),
            "redis": _state(redis_db.is_connected),
        },
    }
|
||||
|
||||
|
||||
|
||||
18
backend/app/models/__init__.py
Normal file
18
backend/app/models/__init__.py
Normal file
@@ -0,0 +1,18 @@
|
||||
"""
|
||||
数据模型模块
|
||||
|
||||
定义数据库表结构和数据模型
|
||||
"""
|
||||
from app.core.database.mysql import (
|
||||
Base,
|
||||
DocumentField,
|
||||
DocumentTable,
|
||||
TaskRecord,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"Base",
|
||||
"DocumentTable",
|
||||
"DocumentField",
|
||||
"TaskRecord",
|
||||
]
|
||||
172
backend/app/models/document.py
Normal file
172
backend/app/models/document.py
Normal file
@@ -0,0 +1,172 @@
|
||||
"""
|
||||
文档数据模型
|
||||
|
||||
定义文档相关的 Pydantic 模型
|
||||
"""
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class DocumentType(str, Enum):
    """Supported document types; each value is the lowercase type name."""

    DOCX = "docx"  # Word document
    XLSX = "xlsx"  # Excel workbook
    MD = "md"      # Markdown
    TXT = "txt"    # Plain text
|
||||
|
||||
|
||||
class TaskStatus(str, Enum):
    """Task status values."""

    PENDING = "pending"
    PROCESSING = "processing"
    SUCCESS = "success"
    FAILURE = "failure"
|
||||
|
||||
|
||||
# ==================== 解析结果模型 ====================
|
||||
|
||||
class DocumentMetadata(BaseModel):
    """Document metadata.

    Only ``filename`` and ``extension`` are required; every other field has a
    default.
    """

    filename: str = Field(...)
    extension: str = Field(...)
    file_size: int = Field(default=0)
    doc_type: Optional[str] = Field(default=None)
    sheet_count: Optional[int] = Field(default=None)
    sheet_names: Optional[List[str]] = Field(default=None)
    current_sheet: Optional[str] = Field(default=None)
    row_count: Optional[int] = Field(default=None)
    column_count: Optional[int] = Field(default=None)
    columns: Optional[List[str]] = Field(default=None)
    encoding: Optional[str] = Field(default=None)
|
||||
|
||||
|
||||
class ParseResultData(BaseModel):
    """Parsed tabular payload: column names, row dicts and their counts."""

    columns: List[str] = Field(default_factory=list)
    rows: List[Dict[str, Any]] = Field(default_factory=list)
    row_count: int = Field(default=0)
    column_count: int = Field(default=0)
|
||||
|
||||
|
||||
class ParseResult(BaseModel):
    """Document parse result: a success flag plus optional data, metadata and error."""

    success: bool = Field(...)
    data: Optional[ParseResultData] = Field(default=None)
    metadata: Optional[DocumentMetadata] = Field(default=None)
    error: Optional[str] = Field(default=None)
|
||||
|
||||
|
||||
# ==================== 存储模型 ====================
|
||||
|
||||
class DocumentStore(BaseModel):
    """Document storage model."""

    doc_id: str = Field(...)
    doc_type: DocumentType = Field(...)
    content: str = Field(...)
    metadata: DocumentMetadata = Field(...)
    structured_data: Optional[Dict[str, Any]] = Field(default=None)
    # Both timestamps default to the creation time of the model instance
    # (naive datetimes from ``datetime.utcnow``).
    created_at: datetime = Field(default_factory=datetime.utcnow)
    updated_at: datetime = Field(default_factory=datetime.utcnow)
|
||||
|
||||
|
||||
class RAGEntry(BaseModel):
    """RAG index entry: one table field, its description and its embedding."""

    table_name: str = Field(...)
    field_name: str = Field(...)
    field_description: str = Field(...)
    embedding: List[float] = Field(...)
    metadata: Optional[Dict[str, Any]] = Field(default=None)
|
||||
|
||||
|
||||
# ==================== 任务模型 ====================
|
||||
|
||||
class TaskCreate(BaseModel):
    """Task creation request; both fields are required."""

    task_type: str = Field(...)
    input_params: Dict[str, Any] = Field(...)
|
||||
|
||||
|
||||
class TaskStatusResponse(BaseModel):
    """Task status response."""

    task_id: str = Field(...)
    status: TaskStatus = Field(...)
    progress: int = Field(default=0)
    message: Optional[str] = Field(default=None)
    result: Optional[Any] = Field(default=None)
    error: Optional[str] = Field(default=None)
|
||||
|
||||
|
||||
# ==================== 模板填写模型 ====================
|
||||
|
||||
class TemplateField(BaseModel):
    """A single fillable field of a template."""

    # ``...`` marks the field as required, same as omitting a default.
    cell: str = Field(..., description="单元格位置, 如 A1")
    name: str = Field(..., description="字段名称")
    field_type: str = Field(default="text", description="字段类型: text/number/date")
    required: bool = Field(default=True, description="是否必填")
|
||||
|
||||
|
||||
class TemplateSheet(BaseModel):
    """A template worksheet: its name and the fields it contains."""

    name: str = Field(...)
    fields: List[TemplateField] = Field(...)
|
||||
|
||||
|
||||
class TemplateInfo(BaseModel):
    """Template information."""

    file_path: str = Field(...)
    # Expected values per the original note: xlsx / docx.
    file_type: str = Field(...)
    sheets: List[TemplateSheet] = Field(...)
|
||||
|
||||
|
||||
class FillRequest(BaseModel):
    """Template fill request; ``source_doc_ids`` is optional."""

    template_path: str = Field(...)
    template_fields: List[TemplateField] = Field(...)
    source_doc_ids: Optional[List[str]] = Field(default=None)
|
||||
|
||||
|
||||
class FillResult(BaseModel):
    """Template fill result."""

    success: bool = Field(...)
    filled_data: Dict[str, Any] = Field(...)
    fill_details: List[Dict[str, Any]] = Field(...)
    source_documents: List[str] = Field(default_factory=list)
|
||||
|
||||
|
||||
# ==================== API 响应模型 ====================
|
||||
|
||||
class UploadResponse(BaseModel):
    """Upload response; all fields are required."""

    task_id: str = Field(...)
    file_count: int = Field(...)
    message: str = Field(...)
    status_url: str = Field(...)
|
||||
|
||||
|
||||
class AnalyzeResponse(BaseModel):
    """Analysis response; only ``success`` is required."""

    success: bool = Field(...)
    analysis: Optional[str] = Field(default=None)
    structured_data: Optional[Dict[str, Any]] = Field(default=None)
    model: Optional[str] = Field(default=None)
    error: Optional[str] = Field(default=None)
|
||||
|
||||
|
||||
class QueryRequest(BaseModel):
    """Query request."""

    user_intent: str = Field(...)
    table_name: Optional[str] = Field(default=None)
    # Validated to lie in the inclusive range 1..20.
    top_k: int = Field(default=5, ge=1, le=20)
|
||||
|
||||
|
||||
class QueryResponse(BaseModel):
    """Query response; only ``success`` is required."""

    success: bool = Field(...)
    sql_query: Optional[str] = Field(default=None)
    results: Optional[List[Dict[str, Any]]] = Field(default=None)
    rag_context: Optional[List[str]] = Field(default=None)
    error: Optional[str] = Field(default=None)
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
722
backend/app/services/excel_storage_service.py
Normal file
722
backend/app/services/excel_storage_service.py
Normal file
@@ -0,0 +1,722 @@
|
||||
"""
|
||||
Excel 存储服务
|
||||
|
||||
将 Excel 数据转换为 MySQL 表结构并存储
|
||||
"""
|
||||
import logging
|
||||
import re
|
||||
from datetime import datetime
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import pandas as pd
|
||||
from sqlalchemy import (
|
||||
Column,
|
||||
DateTime,
|
||||
Float,
|
||||
Integer,
|
||||
String,
|
||||
Text,
|
||||
inspect,
|
||||
text,
|
||||
)
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.core.database.mysql import Base, mysql_db
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
# 设置该模块的日志级别
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
|
||||
class ExcelStorageService:
    """Store tabular data (Excel sheets or extracted tables) in MySQL.

    Every table created by this service has an auto-increment ``id`` primary
    key plus ``created_at`` / ``updated_at`` columns, followed by one column
    per source column.
    """

    # Column names every generated table already contains.
    _RESERVED_COLUMNS = ("id", "created_at", "updated_at")
    # MySQL limits identifiers to 64 characters.
    _MAX_IDENTIFIER_LENGTH = 64
    # Range of a signed MySQL INT column.
    _INT_MIN = -2147483648
    _INT_MAX = 2147483647

    def __init__(self):
        self.mysql_db = mysql_db

    # ------------------------------------------------------------------
    # Small shared helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _quote_identifier(name: str) -> str:
        """Return *name* as a backtick-quoted identifier (embedded backticks doubled)."""
        return "`" + str(name).replace("`", "``") + "`"

    def _build_column_names(self, columns: List[str]) -> List[str]:
        """Map source column names to unique column names, position by position.

        The reserved columns (``id``, ``created_at``, ``updated_at``) are
        treated as already taken so a source column can never collide with them.
        """
        used_names = set(self._RESERVED_COLUMNS)
        return [self._get_unique_column_name(col, used_names) for col in columns]

    @staticmethod
    def _convert_value(value: Any, col_type: str) -> Any:
        """Convert a non-null *value* to the Python type matching *col_type*.

        INTEGER/FLOAT values that cannot be converted become ``None``;
        every other type is stored as ``str(value)``.
        """
        if col_type == "INTEGER":
            try:
                return int(value)
            except (ValueError, TypeError, OverflowError):
                return None
        if col_type == "FLOAT":
            try:
                return float(value)
            except (ValueError, TypeError, OverflowError):
                return None
        return str(value)

    # ------------------------------------------------------------------
    # Excel reading
    # ------------------------------------------------------------------

    def _extract_sheet_names_from_xml(self, file_path: str) -> list:
        """Read worksheet names straight from ``xl/workbook.xml`` inside the file.

        Returns:
            The sheet names in document order, or ``[]`` when the file cannot
            be opened/parsed or has no ``xl/workbook.xml`` entry.
        """
        import zipfile
        from xml.etree import ElementTree as ET

        known_namespaces = (
            'http://schemas.openxmlformats.org/spreadsheetml/2006/main',
            'http://purl.oclc.org/ooxml/spreadsheetml/main',
        )
        try:
            with zipfile.ZipFile(file_path, 'r') as z:
                if 'xl/workbook.xml' not in z.namelist():
                    return []
                root = ET.fromstring(z.read('xl/workbook.xml'))

            # Try the known spreadsheet namespaces first.
            for ns_uri in known_namespaces:
                sheets = root.findall('.//main:sheet', {'main': ns_uri})
                names = [s.get('name') for s in sheets if s.get('name')]
                if names:
                    return names

            # Fall back to any namespace, then to no namespace at all.
            sheets = root.findall('.//{*}sheet') or root.findall('.//sheet')
            return [s.get('name') for s in sheets if s.get('name')]
        except Exception as exc:
            # Deliberately best effort: callers treat [] as "no sheets found".
            logger.debug("无法从 XML 提取工作表名称: %s", exc)
            return []

    @staticmethod
    def _cell_text(cell: Any, shared_strings: List[str]) -> Optional[str]:
        """Return the raw text of a worksheet ``<c>`` element, or ``None`` if empty.

        Shared-string cells (``t="s"``) are resolved through *shared_strings*;
        when the index is invalid the raw index text is returned instead.
        Inline strings (``t="inlineStr"``) are read from their ``<is>`` child.
        """
        cell_type = cell.get('t', 'n')
        if cell_type == 'inlineStr':
            runs = cell.findall('{*}is/{*}t') + cell.findall('{*}is/{*}r/{*}t')
            joined = ''.join(t.text or '' for t in runs)
            return joined or None

        v = cell.find('{*}v')
        if v is None or not v.text:
            return None
        if cell_type == 's':
            try:
                return shared_strings[int(v.text)]
            except (ValueError, IndexError):
                return v.text
        return v.text

    def _read_sheet_via_xml(self, file_path: str, sheet_name: Optional[str], header_row: int) -> pd.DataFrame:
        """Parse one worksheet directly from the workbook's XML parts.

        NOTE(review): the worksheet part is located as ``sheet<N>.xml`` where N
        is the sheet's position in workbook.xml; this assumes parts are numbered
        in workbook order - confirm for files with reordered or deleted sheets.
        All values except booleans are returned as strings.
        """
        import zipfile
        from xml.etree import ElementTree as ET

        def _is(elem, local_name):
            return elem.tag.endswith('}' + local_name) or elem.tag == local_name

        sheet_names = self._extract_sheet_names_from_xml(file_path)
        if not sheet_names:
            raise ValueError("无法从 Excel 文件中找到工作表")

        target_sheet = sheet_name if sheet_name and sheet_name in sheet_names else sheet_names[0]
        sheet_index = sheet_names.index(target_sheet) + 1

        with zipfile.ZipFile(file_path, 'r') as z:
            shared_strings = []
            if 'xl/sharedStrings.xml' in z.namelist():
                ss_root = ET.fromstring(z.read('xl/sharedStrings.xml'))
                for si in ss_root.iter():
                    if _is(si, 'si'):
                        # Join plain text and all rich-text runs; phonetic
                        # (rPh) runs are intentionally not included.
                        runs = si.findall('{*}t') + si.findall('{*}r/{*}t')
                        shared_strings.append(''.join(t.text or '' for t in runs))

            root = ET.fromstring(z.read(f'xl/worksheets/sheet{sheet_index}.xml'))

        headers = {}
        rows_data = []
        header_row_number = header_row + 1  # worksheet rows are 1-based

        for row in root.iter():
            if not _is(row, 'row'):
                continue
            row_idx = int(row.get('r', 0))
            if row_idx < header_row_number:
                continue

            cells = [c for c in row if _is(c, 'c')]
            if row_idx == header_row_number:
                for cell in cells:
                    col_letters = ''.join(filter(str.isalpha, cell.get('r', '')))
                    text_value = self._cell_text(cell, shared_strings)
                    # An empty header cell falls back to its column letters.
                    headers[col_letters] = col_letters if text_value is None else text_value
                continue

            row_cells = {}
            for cell in cells:
                col_letters = ''.join(filter(str.isalpha, cell.get('r', '')))
                val = self._cell_text(cell, shared_strings)
                if val is not None and cell.get('t', 'n') == 'b':
                    val = val == '1'
                row_cells[col_letters] = val
            if row_cells:
                rows_data.append(row_cells)

        if not rows_data:
            return pd.DataFrame()

        df = pd.DataFrame(rows_data)
        if headers:
            df.columns = [headers.get(col, col) for col in df.columns]
        return df

    def _read_excel_sheet(self, file_path: str, sheet_name: str = None, header_row: int = 0) -> pd.DataFrame:
        """Read one worksheet, falling back to raw XML parsing when pandas cannot.

        Args:
            file_path: Path of the Excel file.
            sheet_name: Worksheet to read; ``None`` means the first worksheet.
            header_row: Zero-based index of the header row.

        Returns:
            The worksheet as a DataFrame (possibly empty).

        Raises:
            Exception: Whatever the XML fallback raises when it also fails.
        """
        # pandas returns a dict of DataFrames for sheet_name=None, so ask for
        # the first sheet (index 0) explicitly in that case.
        target = 0 if sheet_name is None else sheet_name
        try:
            df = pd.read_excel(file_path, sheet_name=target, header=header_row)
            if isinstance(df, pd.DataFrame) and not df.empty:
                return df
        except Exception as exc:
            logger.warning(f"pandas 读取 Excel 失败,改用 XML 方式: {exc}")

        # pandas failed or returned nothing: parse the XML directly.
        logger.info(f"使用 XML 方式读取 Excel: {file_path}")
        try:
            return self._read_sheet_via_xml(file_path, sheet_name, header_row)
        except Exception as e:
            logger.error(f"XML 解析 Excel 失败: {e}")
            raise

    # ------------------------------------------------------------------
    # Naming
    # ------------------------------------------------------------------

    def _sanitize_table_name(self, filename: str) -> str:
        """Turn a file name into a legal table name.

        The extension is dropped, every character outside ``[a-zA-Z0-9_]``
        becomes ``_``, a leading digit is prefixed with ``t_`` and the result
        is cut to 50 characters. An empty result becomes ``t_unnamed``.

        Args:
            filename: Original file name.

        Returns:
            The sanitized table name.
        """
        stem = filename.rsplit('.', 1)[0]
        name = re.sub(r'[^a-zA-Z0-9_]', '_', stem)
        if not name:
            # e.g. ".xlsx": an empty identifier would produce invalid SQL.
            name = "t_unnamed"
        elif name[0].isdigit():
            name = 't_' + name
        return name[:50]

    def _sanitize_column_name(self, col_name: str) -> str:
        """Turn a source column name into a column identifier.

        Non-ASCII characters (e.g. Chinese) are kept. Only surrounding
        whitespace and control characters are removed, a leading digit is
        prefixed with ``col_`` and the result is cut to 64 characters.

        Args:
            col_name: Original column name.

        Returns:
            The sanitized name (may be empty).
        """
        name = re.sub(r'[\x00-\x1f\x7f]', '', str(col_name).strip())
        if name and name[0].isdigit():
            name = 'col_' + name
        return name[:self._MAX_IDENTIFIER_LENGTH]

    def _get_unique_column_name(self, col_name: str, used_names: set) -> str:
        """Return a column name that does not clash with *used_names*.

        Clashes are detected case-insensitively, an empty name becomes
        ``col`` and ``id`` becomes ``col_id`` because ``id`` is the generated
        primary key. Clashing names get a numeric suffix (``_1``, ``_2``, ...)
        and stay within the 64-character identifier limit.

        Args:
            col_name: Original column name.
            used_names: Names already taken; the returned name is added to it.

        Returns:
            The unique column name.
        """
        base = self._sanitize_column_name(col_name) or "col"
        if base.lower() == "id":
            base = "col_id"

        taken = {name.lower() for name in used_names}
        candidate = base
        counter = 0
        while candidate.lower() in taken:
            counter += 1
            suffix = f"_{counter}"
            candidate = base[:self._MAX_IDENTIFIER_LENGTH - len(suffix)] + suffix

        used_names.add(candidate)
        return candidate

    # ------------------------------------------------------------------
    # Type inference
    # ------------------------------------------------------------------

    def _infer_column_type(self, series: pd.Series) -> str:
        """Infer the storage type of a pandas column from its dtype.

        Args:
            series: Column data.

        Returns:
            One of ``"INTEGER"``, ``"FLOAT"``, ``"DATETIME"``, ``"BOOLEAN"`` or
            ``"TEXT"``. Integers outside the INT range and floats outside
            ±1e308 are reported as ``"TEXT"``.
        """
        non_null = series.dropna()
        if len(non_null) == 0:
            return "TEXT"

        dtype = series.dtype
        if pd.api.types.is_integer_dtype(dtype):
            try:
                int_values = non_null.astype('int64')
            except (ValueError, OverflowError):
                return "TEXT"
            fits = int_values.min() >= self._INT_MIN and int_values.max() <= self._INT_MAX
            return "INTEGER" if fits else "TEXT"

        if pd.api.types.is_float_dtype(dtype):
            try:
                float_values = non_null.astype('float64')
            except (ValueError, OverflowError):
                return "TEXT"
            fits = float_values.min() >= -1e308 and float_values.max() <= 1e308
            return "FLOAT" if fits else "TEXT"

        if pd.api.types.is_datetime64_any_dtype(dtype):
            return "DATETIME"
        if pd.api.types.is_bool_dtype(dtype):
            return "BOOLEAN"
        return "TEXT"

    def _infer_type_from_values(self, values: List[Any]) -> str:
        """Infer a column type from a plain list of values.

        ``None`` and blank strings are ignored. Integers must fit the INT
        range to be reported as ``"INTEGER"``, otherwise ``"TEXT"`` is used.

        Args:
            values: Column values.

        Returns:
            ``"INTEGER"``, ``"FLOAT"`` or ``"TEXT"``.
        """
        non_null_values = [v for v in values if v is not None and str(v).strip() != '']
        if not non_null_values:
            return "TEXT"

        if all(self._is_integer(v) for v in non_null_values):
            in_range = all(self._INT_MIN <= int(v) <= self._INT_MAX for v in non_null_values)
            return "INTEGER" if in_range else "TEXT"

        if all(self._is_float(v) for v in non_null_values):
            return "FLOAT"

        return "TEXT"

    def _is_integer(self, value: Any) -> bool:
        """Return True if *value* converts to an integer without losing information."""
        try:
            as_int = int(value)
        except (ValueError, TypeError, OverflowError):
            return False
        if isinstance(value, (str, bytes, bytearray)):
            return True
        # int(1.5) succeeds but silently truncates; require an exact match.
        return bool(value == as_int)

    def _is_float(self, value: Any) -> bool:
        """Return True if *value* converts to a finite float."""
        import math

        try:
            return math.isfinite(float(value))
        except (ValueError, TypeError, OverflowError):
            return False

    # ------------------------------------------------------------------
    # Table creation / storage
    # ------------------------------------------------------------------

    def _create_table_model(
        self,
        table_name: str,
        columns: List[str],
        column_types: Dict[str, str]
    ) -> type:
        """Dynamically build a SQLAlchemy model class for a table.

        Args:
            table_name: Table name (also used as the class name).
            columns: Source column names, in order.
            column_types: Source column name -> inferred type name.

        Returns:
            A model class with an ``id`` primary key, one nullable column per
            source column (named via ``_build_column_names``) and
            ``created_at`` / ``updated_at`` columns.
        """
        sa_types = {
            "INTEGER": Integer,
            "FLOAT": Float,
            "DATETIME": DateTime,
            "BOOLEAN": Integer,  # booleans are stored as integers
        }

        attrs = {
            '__tablename__': table_name,
            '__table_args__': {'extend_existing': True},
            'id': Column(Integer, primary_key=True, autoincrement=True),
        }

        for col, col_name in zip(columns, self._build_column_names(columns)):
            sa_type = sa_types.get(column_types.get(col, "TEXT"), Text)
            attrs[col_name] = Column(sa_type, nullable=True)

        attrs['created_at'] = Column(DateTime, default=datetime.utcnow)
        attrs['updated_at'] = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

        return type(table_name, (Base,), attrs)

    @staticmethod
    def _bulk_insert(insert_sql: str, rows: List[tuple]) -> None:
        """Insert *rows* with a blocking pymysql connection and commit.

        Runs synchronously; ``store_excel`` calls it in a worker thread.
        """
        import pymysql
        from app.config import settings

        connection = pymysql.connect(
            host=settings.MYSQL_HOST,
            port=settings.MYSQL_PORT,
            user=settings.MYSQL_USER,
            password=settings.MYSQL_PASSWORD,
            database=settings.MYSQL_DATABASE,
            charset=settings.MYSQL_CHARSET
        )
        try:
            with connection.cursor() as cursor:
                cursor.executemany(insert_sql, rows)
            connection.commit()
        finally:
            connection.close()

    async def store_excel(
        self,
        file_path: str,
        filename: str,
        sheet_name: Optional[str] = None,
        header_row: int = 0
    ) -> Dict[str, Any]:
        """Store one Excel worksheet in a MySQL table.

        The table name is derived from *filename*. The table is created if it
        does not exist and all rows are appended to it.

        Args:
            file_path: Path of the Excel file.
            filename: Original file name.
            sheet_name: Worksheet name; ``None`` means the first worksheet.
            header_row: Zero-based index of the header row.

        Returns:
            On success ``{"success": True, "table_name", "row_count",
            "columns"}``; on failure ``{"success": False, "error": ...}``.
            Exceptions are logged and reported through the result.
        """
        import asyncio

        table_name = self._sanitize_table_name(filename)
        results = {
            "success": True,
            "table_name": table_name,
            "row_count": 0,
            "columns": []
        }

        try:
            logger.info(f"开始读取Excel文件: {file_path}")
            df = self._read_excel_sheet(file_path, sheet_name=sheet_name, header_row=header_row)
            logger.info(f"Excel读取完成,行数: {len(df)}, 列数: {len(df.columns)}")

            if df.empty:
                return {"success": False, "error": "Excel 文件为空"}

            original_names = [str(c) for c in df.columns]
            df.columns = original_names

            # Work positionally: header names may repeat, so a dict keyed by
            # the original name would merge distinct columns.
            db_names = self._build_column_names(original_names)
            col_types = [self._infer_column_type(series) for _, series in df.items()]
            results["columns"] = [
                {"original_name": original, "sanitized_name": db_name, "type": col_type}
                for original, db_name, col_type in zip(original_names, db_names, col_types)
            ]

            # Create the table with raw SQL.
            logger.info(f"正在创建MySQL表: {table_name}")
            sql_type_for = {"INTEGER": "INT", "FLOAT": "FLOAT", "DATETIME": "DATETIME"}
            quote = self._quote_identifier
            column_defs = ["id INT AUTO_INCREMENT PRIMARY KEY"]
            column_defs += [
                f"{quote(db_name)} {sql_type_for.get(col_type, 'TEXT')}"
                for db_name, col_type in zip(db_names, col_types)
            ]
            column_defs.append("created_at DATETIME DEFAULT CURRENT_TIMESTAMP")
            column_defs.append("updated_at DATETIME DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP")
            create_sql = f"CREATE TABLE IF NOT EXISTS {quote(table_name)} ({', '.join(column_defs)})"
            await self.mysql_db.execute_raw_sql(create_sql)
            logger.info(f"MySQL表创建完成: {table_name}")

            # Convert rows; NaN/NaT become NULL.
            rows = [
                tuple(
                    None if pd.isna(value) else self._convert_value(value, col_type)
                    for value, col_type in zip(raw_row, col_types)
                )
                for raw_row in df.itertuples(index=False, name=None)
            ]

            logger.info(f"正在插入 {len(rows)} 条数据到 MySQL (使用批量插入)...")
            # pymysql formats the statement with %, so a literal % inside an
            # identifier has to be doubled.
            columns_sql = ', '.join(quote(n) for n in db_names).replace('%', '%%')
            placeholders = ', '.join(['%s'] * len(db_names))
            insert_sql = f"INSERT INTO {quote(table_name)} ({columns_sql}) VALUES ({placeholders})"

            # The pymysql driver blocks; run it in a worker thread so the
            # event loop keeps serving other requests.
            loop = asyncio.get_running_loop()
            await loop.run_in_executor(None, self._bulk_insert, insert_sql, rows)
            logger.info(f"数据插入完成: {len(rows)} 条")

            results["row_count"] = len(rows)
            logger.info(f"Excel 数据已存储到 MySQL 表 {table_name},共 {len(rows)} 行")
            return results

        except Exception as e:
            logger.error(f"存储 Excel 到 MySQL 失败: {str(e)}", exc_info=True)
            return {"success": False, "error": str(e)}

    async def store_structured_data(
        self,
        table_name: str,
        data: Dict[str, Any],
        source_doc_id: str = None
    ) -> Dict[str, Any]:
        """Store a table extracted from an unstructured document in MySQL.

        Args:
            table_name: Target table name.
            data: Structured data of the form::

                {
                    "columns": ["col1", "col2"],
                    "rows": [["val1", "val2"], ["val3", "val4"]]
                }

            source_doc_id: Source document id (accepted but currently not
                stored).

        Returns:
            On success ``{"success": True, "table_name", "row_count",
            "columns"}``; on failure ``{"success": False, "error": ...}``.
        """
        results = {
            "success": True,
            "table_name": table_name,
            "row_count": 0,
            "columns": []
        }

        try:
            columns = data.get("columns", [])
            rows = data.get("rows", [])

            if not columns or not rows:
                return {"success": False, "error": "数据为空"}

            # Same naming procedure as _create_table_model, so record keys
            # match the model's attributes.
            db_names = self._build_column_names(columns)

            column_types = {}
            for i, (col, db_name) in enumerate(zip(columns, db_names)):
                col_values = [row[i] for row in rows if i < len(row)]
                col_type = self._infer_type_from_values(col_values)
                column_types[col] = col_type
                results["columns"].append({
                    "original_name": col,
                    "sanitized_name": db_name,
                    "type": col_type
                })

            model_class = self._create_table_model(table_name, columns, column_types)

            # Table.create() is a synchronous call; run it on the session's
            # connection through run_sync.
            async with self.mysql_db.get_session() as session:
                conn = await session.connection()
                await conn.run_sync(model_class.__table__.create, checkfirst=True)
                await session.commit()

            records = []
            for row in rows:
                record = {}
                for i, (col, db_name) in enumerate(zip(columns, db_names)):
                    if i >= len(row):
                        # Short row: the column is left unset (NULL).
                        continue
                    value = row[i]
                    if value is None or str(value).strip() == '':
                        record[db_name] = None
                    else:
                        record[db_name] = self._convert_value(value, column_types.get(col, "TEXT"))
                records.append(record)

            async with self.mysql_db.get_session() as session:
                session.add_all([model_class(**record) for record in records])
                await session.commit()

            results["row_count"] = len(records)
            logger.info(f"结构化数据已存储到 MySQL 表 {table_name},共 {len(records)} 行")
            return results

        except Exception as e:
            logger.error(f"存储结构化数据到 MySQL 失败: {str(e)}", exc_info=True)
            return {"success": False, "error": str(e)}

    # ------------------------------------------------------------------
    # Querying / administration
    # ------------------------------------------------------------------

    async def query_table(
        self,
        table_name: str,
        columns: Optional[List[str]] = None,
        where: Optional[str] = None,
        limit: int = 100
    ) -> List[Dict[str, Any]]:
        """Query rows of a MySQL table.

        Args:
            table_name: Table name.
            columns: Columns to select; ``None`` or empty selects all columns.
            where: Raw WHERE condition appended to the statement.
            limit: Maximum number of rows.

        Returns:
            The query result, or ``[]`` when the query fails.
        """
        try:
            quote = self._quote_identifier
            select_list = ", ".join(quote(c) for c in columns) if columns else "*"
            sql = f"SELECT {select_list} FROM {quote(table_name)}"
            if where:
                # SECURITY: `where` is interpolated verbatim; callers must
                # never pass untrusted input here.
                sql += f" WHERE {where}"
            sql += f" LIMIT {max(0, int(limit))}"

            return await self.mysql_db.execute_query(sql)

        except Exception as e:
            logger.error(f"查询表失败: {str(e)}")
            return []

    async def get_table_schema(self, table_name: str) -> Optional[List[Dict[str, Any]]]:
        """Return column information of a table from INFORMATION_SCHEMA.

        Args:
            table_name: Table name.

        Returns:
            The rows (name, data type, nullability, key, comment) ordered by
            column position, or ``None`` when the query fails.
        """
        try:
            # Escape the value for use inside a single-quoted SQL literal.
            literal = str(table_name).replace("\\", "\\\\").replace("'", "''")
            sql = f"""
            SELECT COLUMN_NAME, DATA_TYPE, IS_NULLABLE, COLUMN_KEY, COLUMN_COMMENT
            FROM INFORMATION_SCHEMA.COLUMNS
            WHERE TABLE_SCHEMA = DATABASE() AND TABLE_NAME = '{literal}'
            ORDER BY ORDINAL_POSITION
            """
            return await self.mysql_db.execute_query(sql)

        except Exception as e:
            logger.error(f"获取表结构失败: {str(e)}")
            return None

    async def delete_table(self, table_name: str) -> bool:
        """Drop a table.

        Args:
            table_name: Table name; it must contain an underscore (a simple
                guard against dropping unrelated tables).

        Returns:
            ``True`` when the table was dropped, ``False`` when the name was
            rejected or the statement failed.
        """
        try:
            if '_' not in table_name:
                raise ValueError("不允许删除此表")

            sql = f"DROP TABLE IF EXISTS {self._quote_identifier(table_name)}"
            await self.mysql_db.execute_raw_sql(sql)
            logger.info(f"表 {table_name} 已删除")
            return True

        except Exception as e:
            logger.error(f"删除表失败: {str(e)}")
            return False

    async def list_tables(self) -> List[str]:
        """List all base tables of the current database.

        Returns:
            The table names, or ``[]`` when the query fails.
        """
        try:
            sql = """
            SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES
            WHERE TABLE_SCHEMA = DATABASE() AND TABLE_TYPE = 'BASE TABLE'
            """
            results = await self.mysql_db.execute_query(sql)
            return [r['TABLE_NAME'] for r in results]

        except Exception as e:
            logger.error(f"列出表失败: {str(e)}")
            return []
|
||||
|
||||
|
||||
# ==================== 全局单例 ====================
|
||||
|
||||
excel_storage_service = ExcelStorageService()
|
||||
@@ -3,6 +3,7 @@
|
||||
"""
|
||||
import os
|
||||
import shutil
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
@@ -10,6 +11,8 @@ import uuid
|
||||
|
||||
from app.config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class FileService:
|
||||
"""文件服务类,负责文件的存储、读取和管理"""
|
||||
@@ -17,6 +20,7 @@ class FileService:
|
||||
def __init__(self):
|
||||
self.upload_dir = Path(settings.UPLOAD_DIR)
|
||||
self._ensure_upload_dir()
|
||||
logger.info(f"FileService 初始化,上传目录: {self.upload_dir}")
|
||||
|
||||
def _ensure_upload_dir(self):
|
||||
"""确保上传目录存在"""
|
||||
@@ -56,6 +60,8 @@ class FileService:
|
||||
with open(file_path, 'wb') as f:
|
||||
f.write(file_content)
|
||||
|
||||
file_size = len(file_content)
|
||||
logger.info(f"文件已保存: {filename} -> {file_path} ({file_size} bytes)")
|
||||
return str(file_path)
|
||||
|
||||
def read_file(self, file_path: str) -> bytes:
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
LLM 服务模块 - 封装大模型 API 调用
|
||||
"""
|
||||
import logging
|
||||
from typing import Dict, Any, List, Optional
|
||||
from typing import Dict, Any, List, Optional, AsyncGenerator
|
||||
import httpx
|
||||
|
||||
from app.config import settings
|
||||
@@ -87,6 +87,71 @@ class LLMService:
|
||||
logger.error(f"解析 API 响应失败: {str(e)}")
|
||||
raise
|
||||
|
||||
async def chat_stream(
    self,
    messages: List[Dict[str, str]],
    temperature: float = 0.7,
    max_tokens: Optional[int] = None,
    **kwargs
) -> AsyncGenerator[Dict[str, Any], None]:
    """Call the chat completions API in streaming mode.

    Args:
        messages: Chat messages.
        temperature: Sampling temperature.
        max_tokens: Maximum number of tokens; omitted from the payload when
            falsy.
        **kwargs: Extra payload fields (they override the defaults).

    Yields:
        ``{"content": <text>}`` for every chunk that carries non-empty delta
        content.

    Raises:
        httpx.HTTPStatusError: The API answered with a 4xx/5xx status.
        Exception: Any other transport or processing error (logged, re-raised).
    """
    import json

    headers = {
        "Authorization": f"Bearer {self.api_key}",
        "Content-Type": "application/json"
    }

    payload = {
        "model": self.model_name,
        "messages": messages,
        "temperature": temperature,
        "stream": True
    }
    if max_tokens:
        payload["max_tokens"] = max_tokens
    payload.update(kwargs)

    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            async with client.stream(
                "POST",
                f"{self.base_url}/chat/completions",
                headers=headers,
                json=payload
            ) as response:
                # Without this an error response is consumed as an empty
                # stream and the HTTPStatusError handler below never fires.
                response.raise_for_status()

                async for line in response.aiter_lines():
                    # Server-sent events: payload lines start with "data:",
                    # the space after the colon is optional.
                    if not line.startswith("data:"):
                        continue
                    data = line[5:].strip()
                    if data == "[DONE]":
                        break
                    try:
                        chunk = json.loads(data)
                    except json.JSONDecodeError:
                        continue
                    if not isinstance(chunk, dict):
                        continue

                    # Chunks may carry an empty "choices" list or a null
                    # delta; treat those as "no content".
                    choices = chunk.get("choices") or [{}]
                    delta = (choices[0].get("delta") or {}).get("content") or ""
                    if delta:
                        yield {"content": delta}

    except httpx.HTTPStatusError as e:
        logger.error(f"LLM 流式 API 请求失败: {e.response.status_code}")
        raise
    except Exception as e:
        logger.error(f"LLM 流式 API 调用异常: {str(e)}")
        raise
|
||||
|
||||
async def analyze_excel_data(
|
||||
self,
|
||||
excel_data: Dict[str, Any],
|
||||
|
||||
707
backend/app/services/markdown_ai_service.py
Normal file
707
backend/app/services/markdown_ai_service.py
Normal file
@@ -0,0 +1,707 @@
|
||||
"""
|
||||
Markdown 文档 AI 分析服务
|
||||
|
||||
支持:
|
||||
- 分章节解析(中文章节编号:一、二、三, (一)(二)(三))
|
||||
- 结构化数据提取
|
||||
- 流式输出
|
||||
- 多种分析类型
|
||||
- 可视化图表生成
|
||||
"""
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from typing import Any, AsyncGenerator, Dict, List, Optional
|
||||
|
||||
from app.services.llm_service import llm_service
|
||||
from app.core.document_parser import MarkdownParser
|
||||
from app.services.visualization_service import visualization_service
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MarkdownSection:
    """A node in the section tree extracted from a document."""

    def __init__(self, number: str, title: str, level: int, content: str, line_start: int, line_end: int):
        self.number = number          # section marker, e.g. "一", "(一)", "1"
        self.title = title            # heading text after the marker
        self.level = level            # nesting depth
        self.content = content        # body text of the section
        self.line_start = line_start
        self.line_end = line_end
        self.subsections: List[MarkdownSection] = []

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the node and its children; content is cut to a 200-character preview."""
        preview = self.content
        if len(preview) > 200:
            preview = preview[:200] + "..."

        children = []
        for child in self.subsections:
            children.append(child.to_dict())

        return {
            "number": self.number,
            "title": self.title,
            "level": self.level,
            "content_preview": preview,
            "line_start": self.line_start,
            "line_end": self.line_end,
            "subsections": children
        }
|
||||
|
||||
|
||||
class MarkdownAIService:
    """AI analysis service for Markdown documents."""

    # Chinese section-numbering patterns
    CHINESE_NUMBERS = ["一", "二", "三", "四", "五", "六", "七", "八", "九", "十"]
    CHINESE_SUFFIX = "、"
    # Level-2 heading: a Chinese numeral wrapped in parentheses, followed by
    # the title. Both ASCII and full-width parentheses are accepted and they
    # are matched literally; group 1 is the bare numeral, group 2 the title.
    # (The previous pattern had unbalanced parentheses and failed to compile.)
    PARENTHESIS_PATTERN = re.compile(r'^[(\uFF08]([一二三四五六七八九十]+)[)\uFF09]\s*(.+)$')
    # Level-1 heading: Chinese numeral followed by "、".
    CHINESE_SECTION_PATTERN = re.compile(r'^([一二三四五六七八九十]+)、\s*(.+)$')
    # Level-3 heading: Arabic number, a dot, whitespace, then the title.
    ARABIC_SECTION_PATTERN = re.compile(r'^(\d+)\.\s+(.+)$')

    def __init__(self):
        self.parser = MarkdownParser()
|
||||
|
||||
def get_supported_analysis_types(self) -> list:
    """Return the identifiers of all supported analysis types, as a new list."""
    supported = (
        "summary",      # document summary
        "outline",      # outline extraction
        "key_points",   # key-point extraction
        "questions",    # question generation
        "tags",         # tag generation
        "qa",           # question/answer pairs
        "statistics",   # statistical analysis (suited to government bulletins)
        "section",      # detailed per-section analysis
        "charts",       # chart data generation
    )
    return list(supported)
|
||||
|
||||
def extract_sections(self, content: str, titles: List[Dict]) -> List[MarkdownSection]:
    """
    Build the section tree of a document from its heading lines.

    Recognised heading formats:
    - level 1: lines matching ``CHINESE_SECTION_PATTERN`` (一、 二、 三、 ...)
    - level 2: lines matching ``PARENTHESIS_PATTERN`` ((一) (二) (三) ...)
    - level 3: lines matching ``ARABIC_SECTION_PATTERN`` (1. 2. 3. ...)

    A level-2 heading is only accepted inside an open level-1 section, and
    a level-3 heading only inside an open level-2 section. Each section's
    ``content`` covers everything up to the next heading of the same or a
    higher level (heading lines and blank lines removed), and ``line_end``
    is the last line of that range.

    Args:
        content: Full document text.
        titles: Kept for interface compatibility; not used for detection.

    Returns:
        The level-1 sections in document order, with nested subsections.
    """
    lines = content.split('\n')
    total_lines = len(lines)

    sections: List[MarkdownSection] = []
    # open_path[k] is the currently open section at depth k + 1.
    open_path: List[MarkdownSection] = []

    def close_from(depth: int, last_line: int) -> None:
        """Finalize every open section at ``depth`` or deeper."""
        while len(open_path) >= depth:
            node = open_path.pop()
            node.line_end = last_line
            node.content = self._get_section_content(lines, node.line_start, last_line)

    def open_section(match, level: int, line_no: int) -> MarkdownSection:
        """Create a section for a matched heading and push it on the path."""
        node = MarkdownSection(
            number=match.group(1),
            title=match.group(2),
            level=level,
            content="",
            line_start=line_no,
            line_end=total_lines
        )
        open_path.append(node)
        return node

    for i, line in enumerate(lines, 1):
        stripped = line.strip()

        # Level 1: Chinese numeral + "、"
        match = self.CHINESE_SECTION_PATTERN.match(stripped)
        if match:
            close_from(1, i - 1)
            sections.append(open_section(match, 1, i))
            continue

        # Level 2: parenthesised numeral, only inside a level-1 section
        match = self.PARENTHESIS_PATTERN.match(stripped)
        if match and open_path:
            close_from(2, i - 1)
            parent = open_path[0]
            parent.subsections.append(open_section(match, 2, i))
            continue

        # Level 3: "1." style, only inside a level-2 section
        match = self.ARABIC_SECTION_PATTERN.match(stripped)
        if match and len(open_path) > 1:
            # Close the previous level-3 sibling first so consecutive items
            # become siblings instead of nesting inside one another.
            close_from(3, i - 1)
            parent = open_path[1]
            parent.subsections.append(open_section(match, 3, i))
            continue

    # Whatever is still open runs to the end of the document.
    close_from(1, total_lines)

    return sections
|
||||
|
||||
def _get_section_content(self, lines: List[str], start: int, end: int) -> str:
|
||||
"""获取指定行范围的内容"""
|
||||
if start > end:
|
||||
return ""
|
||||
content_lines = lines[start-1:end]
|
||||
# 清理:移除标题行和空行
|
||||
cleaned = []
|
||||
for line in content_lines:
|
||||
stripped = line.strip()
|
||||
if not stripped:
|
||||
continue
|
||||
# 跳过章节标题行
|
||||
if self.CHINESE_SECTION_PATTERN.match(stripped):
|
||||
continue
|
||||
if self.PARENTHESIS_PATTERN.match(stripped):
|
||||
continue
|
||||
if self.ARABIC_SECTION_PATTERN.match(stripped):
|
||||
continue
|
||||
cleaned.append(stripped)
|
||||
return '\n'.join(cleaned)
|
||||
|
||||
async def analyze_markdown(
    self,
    file_path: str,
    analysis_type: str = "summary",
    user_prompt: str = "",
    section_number: Optional[str] = None
) -> Dict[str, Any]:
    """
    Analyse a Markdown document with the LLM.

    Args:
        file_path: Path of the file handed to the parser.
        analysis_type: Which prompt/analysis to run.
        user_prompt: Extra user instructions appended to the prompt.
        section_number: Restrict the analysis to this section
            (e.g. "一" or "(一)"); the whole document when omitted.

    Returns:
        dict: On success, the analysis plus document metadata; for the
        "charts" type also a ``chart_data`` entry. On failure,
        ``{"success": False, "error": ...}``.
    """
    try:
        parsed = self.parser.parse(file_path)
        if not parsed.success:
            return {
                "success": False,
                "error": parsed.error
            }

        doc = parsed.data

        # Section tree of the document
        sections = self.extract_sections(doc.get("content", ""), doc.get("titles", []))

        # Default target: the whole document, titled by its file name
        target_content = doc.get("content", "")
        target_title = parsed.metadata.get("filename", "")

        if section_number:
            section = self._find_section(sections, section_number)
            if not section:
                return {
                    "success": False,
                    "error": f"未找到章节: {section_number}"
                }
            target_content = section.content
            target_title = f"{section.number}、{section.title}"

        prompt = self._build_prompt(
            content=target_content,
            analysis_type=analysis_type,
            user_prompt=user_prompt,
            title=target_title
        )

        system_message = {"role": "system", "content": self._get_system_prompt(analysis_type)}
        user_message = {"role": "user", "content": prompt}

        response = await llm_service.chat(
            messages=[system_message, user_message],
            temperature=0.3,
            max_tokens=4000
        )
        analysis = llm_service.extract_message_content(response)

        structure = {
            "title_count": parsed.metadata.get("title_count", 0),
            "code_block_count": parsed.metadata.get("code_block_count", 0),
            "table_count": parsed.metadata.get("table_count", 0),
            "section_count": len(sections)
        }
        result = {
            "success": True,
            "filename": parsed.metadata.get("filename", ""),
            "analysis_type": analysis_type,
            "section": target_title if section_number else None,
            "word_count": len(target_content),
            "structure": structure,
            "sections": [s.to_dict() for s in sections[:10]],  # at most 10 top-level sections
            "analysis": analysis
        }

        if analysis_type != "charts":
            return result

        # "charts": additionally turn the LLM's JSON answer into visualizations
        try:
            chart_data = self._parse_chart_json(analysis)
            if chart_data and chart_data.get("tables"):
                for table_info in chart_data.get("tables", []):
                    columns = table_info.get("columns", [])
                    rows = table_info.get("rows", [])
                    if not (columns and rows):
                        continue
                    vis_result = visualization_service.analyze_and_visualize({
                        "columns": columns,
                        "rows": [dict(zip(columns, row)) for row in rows]
                    })
                    if vis_result.get("success"):
                        table_info["visualization"] = {
                            "statistics": vis_result.get("statistics"),
                            "charts": vis_result.get("charts"),
                            "distributions": vis_result.get("distributions")
                        }
            result["chart_data"] = chart_data
        except Exception as e:
            # Chart generation is best-effort; fall back to an empty structure.
            logger.warning(f"生成可视化图表失败: {e}")
            result["chart_data"] = {"tables": [], "key_statistics": [], "chart_suggestions": []}

        return result

    except Exception as e:
        logger.error(f"Markdown AI 分析失败: {str(e)}")
        return {
            "success": False,
            "error": str(e)
        }
|
||||
|
||||
async def analyze_markdown_stream(
    self,
    file_path: str,
    analysis_type: str = "summary",
    user_prompt: str = "",
    section_number: Optional[str] = None
) -> AsyncGenerator[str, None]:
    """
    Analyse a Markdown document and stream the result as server-sent events.

    Args:
        file_path: Path of the file handed to the parser.
        analysis_type: Which prompt/analysis to run.
        user_prompt: Extra user instructions appended to the prompt.
        section_number: Restrict the analysis to this section; the whole
            document when omitted.

    Yields:
        str: SSE frames (``"data: <json>\\n\\n"``). A ``start`` frame with
        metadata, one ``content`` frame per LLM delta, a final ``done``
        frame with the full text, or a single ``{"error": ...}`` frame.
    """

    def _sse(payload: Dict[str, Any]) -> str:
        """Encode one payload as an SSE data frame."""
        return f"data: {json.dumps(payload, ensure_ascii=False)}\n\n"

    try:
        parse_result = self.parser.parse(file_path)

        if not parse_result.success:
            yield _sse({'error': parse_result.error})
            return

        data = parse_result.data
        sections = self.extract_sections(data.get("content", ""), data.get("titles", []))

        target_content = data.get("content", "")
        target_title = parse_result.metadata.get("filename", "")

        if section_number:
            section = self._find_section(sections, section_number)
            if section:
                target_content = section.content
                target_title = f"{section.number}、{section.title}"
            else:
                yield _sse({'error': f'未找到章节: {section_number}'})
                return

        prompt = self._build_prompt(
            content=target_content,
            analysis_type=analysis_type,
            user_prompt=user_prompt,
            title=target_title
        )

        messages = [
            {"role": "system", "content": self._get_system_prompt(analysis_type)},
            {"role": "user", "content": prompt}
        ]

        # Initial metadata frame. The payload is built as a plain dict first:
        # a multi-line f-string expression that reuses the enclosing quote
        # type only parses on Python 3.12+.
        start_event = {
            'type': 'start',
            'filename': parse_result.metadata.get("filename", ""),
            'analysis_type': analysis_type,
            'section': target_title if section_number else None,
            'word_count': len(target_content)
        }
        yield _sse(start_event)

        # Stream the LLM output, accumulating the full text for the final frame
        pieces = []
        async for chunk in llm_service.chat_stream(messages, temperature=0.3, max_tokens=4000):
            content = chunk.get("content", "")
            if content:
                pieces.append(content)
                yield _sse({'type': 'content', 'delta': content})

        # Completion frame
        yield _sse({'type': 'done', 'full_response': "".join(pieces)})

    except Exception as e:
        logger.error(f"Markdown AI 流式分析失败: {str(e)}")
        yield _sse({'error': str(e)})
|
||||
|
||||
def _find_section(self, sections: List[MarkdownSection], number: str) -> Optional[MarkdownSection]:
|
||||
"""查找指定编号的章节"""
|
||||
# 标准化编号
|
||||
num = number.strip()
|
||||
for section in sections:
|
||||
if section.number == num or section.title == num:
|
||||
return section
|
||||
# 在子章节中查找
|
||||
found = self._find_section(section.subsections, number)
|
||||
if found:
|
||||
return found
|
||||
return None
|
||||
|
||||
def _parse_chart_json(self, json_str: str) -> Optional[Dict[str, Any]]:
|
||||
"""
|
||||
解析 LLM 返回的 JSON 字符串
|
||||
|
||||
Args:
|
||||
json_str: LLM 返回的 JSON 字符串
|
||||
|
||||
Returns:
|
||||
解析后的字典,如果解析失败返回 None
|
||||
"""
|
||||
if not json_str:
|
||||
return None
|
||||
|
||||
try:
|
||||
# 尝试直接解析
|
||||
return json.loads(json_str)
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
# 尝试提取 JSON 代码块
|
||||
import re
|
||||
# 匹配 ```json ... ``` 格式
|
||||
match = re.search(r'```(?:json)?\s*([\s\S]*?)\s*```', json_str)
|
||||
if match:
|
||||
try:
|
||||
return json.loads(match.group(1))
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
# 尝试找到 JSON 对象的开始和结束
|
||||
start = json_str.find('{')
|
||||
end = json_str.rfind('}')
|
||||
if start != -1 and end != -1 and end > start:
|
||||
try:
|
||||
return json.loads(json_str[start:end+1])
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
def _get_system_prompt(self, analysis_type: str) -> str:
|
||||
"""根据分析类型获取系统提示词"""
|
||||
prompts = {
|
||||
"summary": "你是一个专业的文档摘要助手,擅长从长文档中提取核心信息。",
|
||||
"outline": "你是一个专业的文档结构分析助手,擅长提取文档大纲和层级结构。",
|
||||
"key_points": "你是一个专业的知识提取助手,擅长从文档中提取关键信息和要点。",
|
||||
"questions": "你是一个专业的教育助手,擅长生成帮助理解文档的问题。",
|
||||
"tags": "你是一个专业的标签生成助手,擅长提取文档的主题标签。",
|
||||
"qa": "你是一个专业的问答助手,擅长基于文档内容生成问答对。",
|
||||
"statistics": "你是一个专业的统计数据分析助手,擅长分析政府统计公报中的数据。",
|
||||
"section": "你是一个专业的章节分析助手,擅长对文档的特定章节进行深入分析。",
|
||||
"charts": "你是一个专业的数据可视化助手,擅长从文档中提取数据并生成适合制作图表的数据结构。"
|
||||
}
|
||||
return prompts.get(analysis_type, "你是一个专业的文档分析助手。")
|
||||
|
||||
def _build_prompt(
|
||||
self,
|
||||
content: str,
|
||||
analysis_type: str,
|
||||
user_prompt: str,
|
||||
title: str = ""
|
||||
) -> str:
|
||||
"""根据分析类型构建提示词"""
|
||||
|
||||
# 截断内容避免超出 token 限制
|
||||
max_content_len = 6000
|
||||
if len(content) > max_content_len:
|
||||
content = content[:max_content_len] + "\n\n[内容已截断...]"
|
||||
|
||||
base_prompts = {
|
||||
"summary": f"""请对以下文档进行摘要分析:
|
||||
|
||||
文档标题:{title}
|
||||
|
||||
文档内容:
|
||||
{content}
|
||||
|
||||
请提供:
|
||||
1. 文档主要内容摘要(300字以内)
|
||||
2. 文档的目的和用途
|
||||
3. 适合的读者群体
|
||||
|
||||
请用中文回答,结构清晰。""",
|
||||
|
||||
"outline": f"""请提取以下文档的大纲结构:
|
||||
|
||||
文档标题:{title}
|
||||
|
||||
文档内容:
|
||||
{content}
|
||||
|
||||
请按层级列出文档大纲,用缩进表示层级关系。
|
||||
格式:
|
||||
一、一级标题
|
||||
(一)二级标题
|
||||
1. 三级标题
|
||||
|
||||
请用中文回答。""",
|
||||
|
||||
"key_points": f"""请从以下文档中提取关键要点:
|
||||
|
||||
文档标题:{title}
|
||||
|
||||
文档内容:
|
||||
{content}
|
||||
|
||||
请列出文档的关键要点(5-10条),每条用简洁的语言描述,并说明其在文档中的重要性。
|
||||
|
||||
请用中文回答,格式清晰。""",
|
||||
|
||||
"questions": f"""请根据以下文档生成有助于理解内容的问题:
|
||||
|
||||
文档标题:{title}
|
||||
|
||||
文档内容:
|
||||
{content}
|
||||
|
||||
请生成5-10个问题,帮助读者更好地理解文档内容。每个问题应该:
|
||||
1. 涵盖文档的重要信息点
|
||||
2. 易于理解和回答
|
||||
3. 具有思考价值
|
||||
|
||||
请用中文回答。""",
|
||||
|
||||
"tags": f"""请为以下文档生成标签:
|
||||
|
||||
文档标题:{title}
|
||||
|
||||
文档内容:
|
||||
{content[:3000]}
|
||||
|
||||
请生成5-8个标签,用逗号分隔。标签应该反映:
|
||||
- 文档的主题领域
|
||||
- 文档的类型
|
||||
- 文档的关键特征
|
||||
|
||||
请用中文回答,只需输出标签,不要其他内容。""",
|
||||
|
||||
"qa": f"""请根据以下文档生成问答对:
|
||||
|
||||
文档标题:{title}
|
||||
|
||||
文档内容:
|
||||
{content[:4000]}
|
||||
|
||||
请生成3-5个问答对,帮助读者通过问答形式理解文档内容。
|
||||
格式:
|
||||
Q1: 问题
|
||||
A1: 回答
|
||||
Q2: 问题
|
||||
A2: 回答
|
||||
|
||||
请用中文回答,内容准确。""",
|
||||
|
||||
"statistics": f"""请分析以下政府统计公报中的数据和结论:
|
||||
|
||||
文档标题:{title}
|
||||
|
||||
文档内容:
|
||||
{content}
|
||||
|
||||
请提供:
|
||||
1. 文档中涉及的主要统计数据(列出关键数字和指标)
|
||||
2. 数据的变化趋势(增长/下降)
|
||||
3. 重要的百分比和对比
|
||||
4. 数据来源和统计口径说明
|
||||
|
||||
请用中文回答,数据准确。""",
|
||||
|
||||
"section": f"""请详细分析以下文档章节:
|
||||
|
||||
章节标题:{title}
|
||||
|
||||
章节内容:
|
||||
{content}
|
||||
|
||||
请提供:
|
||||
1. 章节主要内容概括
|
||||
2. 关键信息和数据
|
||||
3. 与其他部分的关联(如有)
|
||||
4. 重要结论
|
||||
|
||||
请用中文回答,分析深入。""",
|
||||
|
||||
"charts": f"""请从以下文档中提取可用于可视化的数据,并生成适合制作图表的数据结构:
|
||||
|
||||
文档标题:{title}
|
||||
|
||||
文档内容:
|
||||
{content}
|
||||
|
||||
请完成以下任务:
|
||||
1. 识别文档中的表格数据(Markdown表格格式)
|
||||
2. 识别文档中的关键统计数据(百分比、数量、趋势等)
|
||||
3. 识别可用于比较的分类数据
|
||||
|
||||
请用 JSON 格式返回以下结构的数据(如果没有表格数据,返回空结构):
|
||||
{{
|
||||
"tables": [
|
||||
{{
|
||||
"description": "表格的描述",
|
||||
"columns": ["列名1", "列名2", ...],
|
||||
"rows": [
|
||||
["值1", "值2", ...],
|
||||
["值1", "值2", ...]
|
||||
]
|
||||
}}
|
||||
],
|
||||
"key_statistics": [
|
||||
{{
|
||||
"name": "指标名称",
|
||||
"value": "数值",
|
||||
"trend": "增长/下降/持平",
|
||||
"description": "指标说明"
|
||||
}}
|
||||
],
|
||||
"chart_suggestions": [
|
||||
{{
|
||||
"chart_type": "bar/line/pie",
|
||||
"title": "图表标题",
|
||||
"data_source": "数据来源说明"
|
||||
}}
|
||||
]
|
||||
}}
|
||||
|
||||
请确保返回的是合法的 JSON 格式。"""
|
||||
}
|
||||
|
||||
prompt = base_prompts.get(analysis_type, base_prompts["summary"])
|
||||
|
||||
if user_prompt and user_prompt.strip():
|
||||
prompt += f"\n\n用户额外需求:{user_prompt}"
|
||||
|
||||
return prompt
|
||||
|
||||
async def extract_outline(self, file_path: str) -> Dict[str, Any]:
    """Parse the file and return its section outline.

    Returns ``{"success": True, "outline": [...]}`` with one entry per
    top-level section (including a 100-character content preview and its
    direct subsections), or ``{"success": False, "error": ...}``.
    """
    try:
        parsed = self.parser.parse(file_path)
        if not parsed.success:
            return {"success": False, "error": parsed.error}

        doc = parsed.data
        sections = self.extract_sections(doc.get("content", ""), doc.get("titles", []))

        def _preview(text):
            # Cut long bodies to 100 characters and mark the cut.
            return text[:100] + "..." if len(text) > 100 else text

        def _child_entry(child):
            return {
                "number": child.number,
                "title": child.title,
                "level": child.level,
                "line": child.line_start
            }

        # Structured outline, one dict per top-level section
        outline = [
            {
                "number": section.number,
                "title": section.title,
                "level": section.level,
                "line": section.line_start,
                "content_preview": _preview(section.content),
                "subsections": [_child_entry(child) for child in section.subsections]
            }
            for section in sections
        ]

        return {
            "success": True,
            "outline": outline
        }

    except Exception as e:
        logger.error(f"大纲提取失败: {str(e)}")
        return {"success": False, "error": str(e)}
|
||||
|
||||
async def extract_tables_summary(self, file_path: str) -> Dict[str, Any]:
    """Parse the file and summarise each table it contains.

    Every summary carries the table's 1-based index, headers, row and
    column counts, the first three rows, and the first cell of each of
    the first five rows.
    """
    try:
        parsed = self.parser.parse(file_path)
        if not parsed.success:
            return {"success": False, "error": parsed.error}

        tables = parsed.data.get("tables", [])
        if not tables:
            return {"success": True, "tables": [], "message": "文档中没有表格"}

        # Key facts for each table
        table_summaries = []
        for position, table in enumerate(tables, start=1):
            leading_cells = []
            for row in table.get("rows", [])[:5]:
                leading_cells.append(row[0] if row else "")
            table_summaries.append({
                "index": position,
                "headers": table.get("headers", []),
                "row_count": table.get("row_count", 0),
                "column_count": table.get("column_count", 0),
                "preview_rows": table.get("rows", [])[:3],  # preview: first 3 rows only
                "first_column": leading_cells
            })

        return {
            "success": True,
            "tables": table_summaries,
            "table_count": len(tables)
        }

    except Exception as e:
        logger.error(f"表格提取失败: {str(e)}")
        return {"success": False, "error": str(e)}
|
||||
|
||||
|
||||
# 全局单例
|
||||
markdown_ai_service = MarkdownAIService()
|
||||
444
backend/app/services/prompt_service.py
Normal file
444
backend/app/services/prompt_service.py
Normal file
@@ -0,0 +1,444 @@
|
||||
"""
|
||||
提示词工程服务
|
||||
|
||||
管理和优化与大模型交互的提示词
|
||||
"""
|
||||
import json
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class PromptType(Enum):
    """Kinds of prompt template known to the prompt service."""

    # document parsing
    DOCUMENT_PARSING = "document_parsing"
    # field extraction
    FIELD_EXTRACTION = "field_extraction"
    # table filling
    TABLE_FILLING = "table_filling"
    # query generation
    QUERY_GENERATION = "query_generation"
    # text summarisation
    TEXT_SUMMARY = "text_summary"
    # intent classification
    INTENT_CLASSIFICATION = "intent_classification"
    # data classification
    DATA_CLASSIFICATION = "data_classification"
|
||||
|
||||
|
||||
@dataclass
class PromptTemplate:
    """A prompt template: system prompt, user template, examples and rules."""
    name: str
    type: "PromptType"
    system_prompt: str
    user_template: str
    examples: List[Dict[str, str]] = field(default_factory=list)  # few-shot examples
    rules: List[str] = field(default_factory=list)  # extra output rules

    def format(
        self,
        context: Dict[str, Any],
        user_input: Optional[str] = None
    ) -> List[Dict[str, str]]:
        """
        Render the template into chat messages.

        Args:
            context: Values for the ``{key}`` placeholders of the user template.
            user_input: Optional free-form user request appended to the user message.

        Returns:
            A two-element list: the system message (system prompt plus any
            rules and examples) followed by the user message.
        """
        # System prompt, then rules, then few-shot examples
        system_parts = [self.system_prompt]

        if self.rules:
            system_parts.append("\n\n【输出规则】\n" + "\n".join([f"- {rule}" for rule in self.rules]))

        if self.examples:
            system_parts.append("\n\n【示例】\n")
            for i, ex in enumerate(self.examples):
                system_parts.append(f"\n示例 {i+1}:\n")
                system_parts.append(f"输入: {ex.get('input', '')}\n")
                system_parts.append(f"输出: {ex.get('output', '')}\n")

        return [
            {"role": "system", "content": "".join(system_parts)},
            {"role": "user", "content": self._format_user_template(context, user_input)},
        ]

    def _format_user_template(
        self,
        context: Dict[str, Any],
        user_input: Optional[str]
    ) -> str:
        """Fill the ``{key}`` placeholders of the user template.

        All placeholders are substituted in a single pass, so text inserted
        for one key is never re-scanned: a document that happens to contain
        "{hint}" stays as written instead of being replaced by another
        context value. dict/list values are rendered as indented JSON,
        everything else with ``str()``. Placeholders without a matching
        context key are left untouched.
        """
        import re  # function-scope import: this module has no top-level `re`

        content = self.user_template

        lookup = {f"{{{key}}}": value for key, value in context.items()}

        def _render(match):
            value = lookup[match.group(0)]
            if isinstance(value, (dict, list)):
                return json.dumps(value, ensure_ascii=False, indent=2)
            return str(value)

        if lookup:
            placeholder_re = re.compile("|".join(re.escape(p) for p in lookup))
            content = placeholder_re.sub(_render, content)

        # Append the free-form user request
        if user_input:
            content += f"\n\n【用户需求】\n{user_input}"

        return content
|
||||
|
||||
|
||||
class PromptEngineeringService:
    """Registry of prompt templates, keyed by prompt type."""

    def __init__(self):
        # Start with an empty registry, then register the built-in templates.
        registry: Dict[PromptType, PromptTemplate] = {}
        self.templates = registry
        self._init_templates()
|
||||
|
||||
def _init_templates(self):
    """Register the built-in prompt template for every ``PromptType``.

    The system prompts are plain strings (never passed through
    ``str.format``), so JSON braces in them are written as single braces.
    """

    # ==================== Document parsing ====================
    self.templates[PromptType.DOCUMENT_PARSING] = PromptTemplate(
        name="文档解析",
        type=PromptType.DOCUMENT_PARSING,
        system_prompt="""你是一个专业的文档解析专家。你的任务是从各类文档(Word、Excel、Markdown、纯文本)中提取关键信息。

请严格按照JSON格式输出解析结果:
{
    "success": true/false,
    "document_type": "文档类型",
    "key_fields": {"字段名": "字段值", ...},
    "summary": "文档摘要(100字内)",
    "structured_data": {...}  // 提取的表格或其他结构化数据
}

重要规则:
- 只提取明确存在的信息,不要猜测
- 如果是表格数据,请以数组格式输出
- 日期请使用 YYYY-MM-DD 格式
- 金额请使用数字格式
- 如果无法提取某个字段,设置为 null""",
        user_template="""请解析以下文档内容:

=== 文档开始 ===
{content}
=== 文档结束 ===

请提取文档中的关键信息。""",
        examples=[
            {
                "input": "合同金额:100万元\n签订日期:2024年1月15日\n甲方:张三\n乙方:某某公司",
                "output": '{"success": true, "document_type": "合同", "key_fields": {"金额": 1000000, "日期": "2024-01-15", "甲方": "张三", "乙方": "某某公司"}, "summary": "甲乙双方签订的金额为100万元的合同", "structured_data": null}'
            }
        ],
        rules=[
            "只输出JSON,不要添加任何解释",
            "使用严格的JSON格式"
        ]
    )

    # ==================== Field extraction ====================
    self.templates[PromptType.FIELD_EXTRACTION] = PromptTemplate(
        name="字段提取",
        type=PromptType.FIELD_EXTRACTION,
        system_prompt="""你是一个专业的数据提取专家。你的任务是从文档内容中提取指定字段的信息。

请严格按照以下JSON格式输出:
{
    "value": "提取到的值,找不到则为空字符串",
    "source": "数据来源描述",
    "confidence": 0.0到1.0之间的置信度
}

重要规则:
- 严格按字段名称匹配,不要提取无关信息
- 置信度反映你对提取结果的信心程度
- 如果字段不存在或无法确定,value设为空字符串,confidence设为0.0
- value必须是实际值,不能是"未找到"之类的描述""",
        user_template="""请从以下文档内容中提取指定字段的信息。

【需要提取的字段】
字段名称:{field_name}
字段类型:{field_type}
是否必填:{required}

【用户提示】
{hint}

【文档内容】
{context}

请提取字段值。""",
        examples=[
            {
                "input": "文档内容:姓名张三,电话13800138000,邮箱zhangsan@example.com",
                "output": '{"value": "张三", "source": "文档第1行", "confidence": 1.0}'
            }
        ],
        rules=[
            "只输出JSON,不要添加任何解释"
        ]
    )

    # ==================== Table filling ====================
    # Single braces below: doubled braces would reach the model literally
    # and show it an invalid JSON example.
    self.templates[PromptType.TABLE_FILLING] = PromptTemplate(
        name="表格填写",
        type=PromptType.TABLE_FILLING,
        system_prompt="""你是一个专业的表格填写助手。你的任务是根据提供的文档内容,填写表格模板中的字段。

请严格按照以下JSON格式输出:
{
    "filled_data": {"字段1": "值1", "字段2": "值2", ...},
    "fill_details": [
        {"field": "字段1", "value": "值1", "source": "来源", "confidence": 0.95},
        ...
    ]
}

重要规则:
- 只填写模板中存在的字段
- 值必须来自提供的文档内容,不要编造
- 如果某个字段在文档中找不到对应值,设为空字符串
- fill_details 中记录每个字段的详细信息""",
        user_template="""请根据以下文档内容,填写表格模板。

【表格模板字段】
{fields}

【用户需求】
{hint}

【参考文档内容】
{context}

请填写表格。""",
        examples=[
            {
                "input": "字段:姓名、电话\n文档:张三,电话是13800138000",
                "output": '{"filled_data": {"姓名": "张三", "电话": "13800138000"}, "fill_details": [{"field": "姓名", "value": "张三", "source": "文档第1行", "confidence": 1.0}, {"field": "电话", "value": "13800138000", "source": "文档第1行", "confidence": 1.0}]}'
            }
        ],
        rules=[
            "只输出JSON,不要添加任何解释"
        ]
    )

    # ==================== Query generation ====================
    self.templates[PromptType.QUERY_GENERATION] = PromptTemplate(
        name="查询生成",
        type=PromptType.QUERY_GENERATION,
        system_prompt="""你是一个SQL查询生成专家。你的任务是根据用户的自然语言需求,生成相应的数据库查询语句。

请严格按照以下JSON格式输出:
{
    "sql_query": "生成的SQL查询语句",
    "explanation": "查询逻辑说明"
}

重要规则:
- 只生成 SELECT 查询语句,不要生成 INSERT/UPDATE/DELETE
- 必须包含 WHERE 条件限制查询范围
- 表名和字段名使用反引号包裹
- 确保SQL语法正确
- 如果无法生成有效的查询,sql_query设为空字符串""",
        user_template="""根据以下信息生成查询语句。

【数据库表结构】
{table_schema}

【RAG检索到的上下文】
{rag_context}

【用户查询需求】
{user_intent}

请生成SQL查询。""",
        examples=[
            {
                "input": "表:orders(订单号, 金额, 日期, 客户)\n需求:查询2024年1月销售额超过10000的订单",
                # \' keeps the single quotes inside this single-quoted literal;
                # a doubled backslash would end the string early.
                "output": '{"sql_query": "SELECT * FROM `orders` WHERE `日期` >= \'2024-01-01\' AND `日期` < \'2024-02-01\' AND `金额` > 10000", "explanation": "筛选2024年1月销售额超过10000的订单"}'
            }
        ],
        rules=[
            "只输出JSON,不要添加任何解释",
            "禁止生成 DROP、DELETE、TRUNCATE 等危险操作"
        ]
    )

    # ==================== Text summary ====================
    self.templates[PromptType.TEXT_SUMMARY] = PromptTemplate(
        name="文本摘要",
        type=PromptType.TEXT_SUMMARY,
        system_prompt="""你是一个专业的文本摘要专家。你的任务是对长文档进行压缩,提取关键信息。

请严格按照以下JSON格式输出:
{
    "summary": "摘要内容(不超过200字)",
    "key_points": ["要点1", "要点2", "要点3"],
    "keywords": ["关键词1", "关键词2", "关键词3"]
}""",
        user_template="""请为以下文档生成摘要:

=== 文档开始 ===
{content}
=== 文档结束 ===

生成简明摘要。""",
        rules=[
            "只输出JSON,不要添加任何解释"
        ]
    )

    # ==================== Intent classification ====================
    self.templates[PromptType.INTENT_CLASSIFICATION] = PromptTemplate(
        name="意图分类",
        type=PromptType.INTENT_CLASSIFICATION,
        system_prompt="""你是一个意图分类专家。你的任务是分析用户的自然语言输入,判断用户的真实意图。

支持的意图类型:
- upload: 上传文档
- parse: 解析文档
- query: 查询数据
- fill: 填写表格
- export: 导出数据
- analyze: 分析数据
- other: 其他/未知

请严格按照以下JSON格式输出:
{
    "intent": "意图类型",
    "confidence": 0.0到1.0之间的置信度,
    "entities": {"实体名": "实体值", ...},  // 识别出的关键实体
    "suggestion": "建议的下一步操作"
}""",
        user_template="""请分析以下用户输入,判断其意图:

【用户输入】
{user_input}

请分类。""",
        rules=[
            "只输出JSON,不要添加任何解释"
        ]
    )

    # ==================== Data classification ====================
    self.templates[PromptType.DATA_CLASSIFICATION] = PromptTemplate(
        name="数据分类",
        type=PromptType.DATA_CLASSIFICATION,
        system_prompt="""你是一个数据分类专家。你的任务是判断数据的类型和格式。

请严格按照以下JSON格式输出:
{
    "data_type": "text/number/date/email/phone/url/amount/other",
    "format": "具体格式描述",
    "is_valid": true/false,
    "normalized_value": "规范化后的值"
}""",
        user_template="""请分析以下数据的类型和格式:

【数据】
{value}

【期望类型(如果有)】
{expected_type}

请分类。""",
        rules=[
            "只输出JSON,不要添加任何解释"
        ]
    )
|
||||
|
||||
def get_prompt(
    self,
    type: PromptType,
    context: Dict[str, Any],
    user_input: Optional[str] = None
) -> List[Dict[str, str]]:
    """
    Render the template registered for ``type`` into chat messages.

    Args:
        type: Prompt type to look up.
        context: Values for the template placeholders.
        user_input: Optional free-form user request.

    Returns:
        The rendered message list. When no template is registered, a
        warning is logged and a single user message holding
        ``str(context)`` is returned instead.
    """
    template = self.templates.get(type)
    if template:
        return template.format(context, user_input)

    logger.warning(f"未找到提示词模板: {type}")
    fallback_message = {"role": "user", "content": str(context)}
    return [fallback_message]
|
||||
|
||||
def get_template(self, type: PromptType) -> Optional[PromptTemplate]:
    """Look up the template registered for ``type``; None when there is none."""
    registry = self.templates
    return registry.get(type)
|
||||
|
||||
def add_template(self, template: PromptTemplate):
    """Register a custom template under its own ``type``, replacing any existing one."""
    key = template.type
    self.templates[key] = template
    logger.info(f"已添加提示词模板: {template.name}")
|
||||
|
||||
def update_template(self, type: PromptType, **kwargs):
    """Overwrite attributes of the template registered for ``type``.

    Only attributes the template already has are set; unknown keyword
    names, and a ``type`` without a registered template, are ignored.
    """
    target = self.templates.get(type)
    if not target:
        return

    for attr_name in kwargs:
        if not hasattr(target, attr_name):
            continue
        setattr(target, attr_name, kwargs[attr_name])
|
||||
|
||||
def optimize_prompt(
    self,
    type: "PromptType",
    feedback: str,
    iteration: int = 1
) -> List[Dict[str, str]]:
    """
    Tighten a template's rules based on feedback and return the rendered prompt.

    Each known feedback keyword found in ``feedback`` contributes one rule
    to the stored template. A rule the template already carries is not
    added again, so repeated calls with the same feedback do not pile up
    duplicates in the shared template.

    Args:
        type: Prompt type whose template is adjusted.
        feedback: Free-form feedback text, scanned for known keywords.
        iteration: Iteration counter; accepted for compatibility, unused.

    Returns:
        The template rendered with an empty context, or an empty list
        when no template is registered for ``type``.
    """
    template = self.templates.get(type)
    if not template:
        return []

    # Simple strategy: map feedback keywords to additional rules
    optimization_rules = {
        "准确率低": "提高要求,明确指出必须从原文提取,不要猜测",
        "格式错误": "强调JSON格式要求,提供更详细的格式示例",
        "遗漏信息": "添加提取更多细节的要求",
    }

    for keyword, rule in optimization_rules.items():
        # The template is shared state: skip rules it already contains.
        if keyword in feedback and rule not in template.rules:
            template.rules.append(rule)

    return template.format({}, None)
|
||||
|
||||
|
||||
# ==================== Global singleton ====================

prompt_service = PromptEngineeringService()
|
||||
278
backend/app/services/rag_service.py
Normal file
278
backend/app/services/rag_service.py
Normal file
@@ -0,0 +1,278 @@
|
||||
"""
|
||||
RAG 服务模块 - 检索增强生成
|
||||
|
||||
使用 sentence-transformers + Faiss 实现向量检索
|
||||
"""
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import pickle
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import faiss
|
||||
import numpy as np
|
||||
from sentence_transformers import SentenceTransformer
|
||||
|
||||
from app.config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class SimpleDocument:
    """Minimal document record: a text body plus a metadata mapping."""

    def __init__(self, page_content: str, metadata: Dict[str, Any]):
        # Both values are stored as given, without copying or validation.
        self.metadata = metadata
        self.page_content = page_content
|
||||
|
||||
|
||||
class RAGService:
|
||||
"""RAG 检索增强服务"""
|
||||
|
||||
def __init__(self):
    """Set up empty state; the embedding model and Faiss index are created later on demand."""
    # Default directory used by save_index / load_index.
    self._persist_dir = settings.FAISS_INDEX_DIR
    self._initialized = False
    self._dimension: int = 0

    # Indexed documents and their ids.
    self.doc_ids: List[str] = []
    self.documents: List[Dict[str, Any]] = []

    self.index: Optional[faiss.Index] = None
    self.embedding_model: Optional[SentenceTransformer] = None

    # RAG calls are temporarily switched off; operations only write log lines.
    self._disabled = True
    logger.info("RAG 服务已禁用(_disabled=True),仅记录索引操作日志")
|
||||
|
||||
def _init_embeddings(self):
    """
    Load the sentence-embedding model once.

    Does nothing while the service is disabled or when a model is already
    loaded. If loading fails, the model stays ``None`` and the dimension is
    set to 384 so the service can run without embeddings.
    """
    if self._disabled:
        logger.debug("RAG 已禁用,跳过嵌入模型初始化")
        return
    if self.embedding_model is not None:
        return

    # Lightweight model name handed to SentenceTransformer.
    model_name = 'all-MiniLM-L6-v2'
    try:
        self.embedding_model = SentenceTransformer(model_name)
        self._dimension = self.embedding_model.get_sentence_embedding_dimension()
        logger.info(f"RAG 嵌入模型初始化完成: {model_name}, 维度: {self._dimension}")
    except Exception as exc:
        logger.warning(f"嵌入模型 {model_name} 加载失败: {exc}")
        # Fall back to the embedding-free mode.
        self.embedding_model = None
        self._dimension = 384
        logger.info("RAG 使用简化模式 (无向量嵌入)")
|
||||
|
||||
def _init_vector_store(self):
    """
    Create the Faiss index if it does not exist yet.

    The embedding model is initialised first. With a model available, an
    ID-mapped inner-product index of the model's dimension is created;
    without one, the index stays ``None`` and the dimension is set to 384.
    """
    if self.index is not None:
        return

    self._init_embeddings()

    if self.embedding_model is not None:
        flat_index = faiss.IndexFlatIP(self._dimension)
        self.index = faiss.IndexIDMap(flat_index)
        logger.info("Faiss 向量存储初始化完成")
        return

    # No embedding model could be loaded: stay in embedding-free mode.
    self._dimension = 384
    self.index = None
    logger.warning("RAG 嵌入模型未加载,使用简化模式")
|
||||
|
||||
async def initialize(self):
    """
    Initialise the vector store and mark the service as initialised.

    Raises:
        Exception: Any error from the initialisation is logged and re-raised.
    """
    try:
        self._init_vector_store()
        self._initialized = True
        logger.info("RAG 服务初始化成功")
    except Exception as exc:
        logger.error(f"RAG 服务初始化失败: {exc}")
        raise
|
||||
|
||||
def _normalize_vectors(self, vectors: np.ndarray) -> np.ndarray:
|
||||
"""归一化向量"""
|
||||
norms = np.linalg.norm(vectors, axis=1, keepdims=True)
|
||||
norms = np.where(norms == 0, 1, norms)
|
||||
return vectors / norms
|
||||
|
||||
def index_field(
    self,
    table_name: str,
    field_name: str,
    field_description: str,
    sample_values: Optional[List[str]] = None
):
    """
    Index one table field as a searchable document.

    The indexed text combines table name, field name, description and, when
    given, the sample values. The document id is ``"<table>.<field>"``.
    Nothing is indexed while the service is disabled or when no embedding
    model is available.

    Args:
        table_name: Name of the table the field belongs to.
        field_name: Name of the field.
        field_description: Semantic description of the field.
        sample_values: Optional example values, joined with ``", "``.
    """
    if self._disabled:
        logger.info(f"[RAG DISABLED] 字段索引操作已跳过: {table_name}.{field_name}")
        return

    if not self._initialized:
        self._init_vector_store()

    # Without an embedding model there is nothing to index; only log.
    if self.embedding_model is None:
        logger.debug(f"字段跳过索引 (无嵌入模型): {table_name}.{field_name}")
        return

    segments = [
        f"表名: {table_name}",
        f"字段: {field_name}",
        f"描述: {field_description}",
    ]
    if sample_values:
        joined_samples = ', '.join(sample_values)
        segments.append(f"示例值: {joined_samples}")
    text = ", ".join(segments)

    doc_id = f"{table_name}.{field_name}"
    field_meta = {"table_name": table_name, "field_name": field_name, "doc_id": doc_id}
    self._add_documents([SimpleDocument(page_content=text, metadata=field_meta)], [doc_id])
    logger.debug(f"已索引字段: {doc_id}")
|
||||
|
||||
def index_document_content(
    self,
    doc_id: str,
    content: str,
    metadata: Optional[Dict[str, Any]] = None
):
    """
    Index a document's text content under ``doc_id``.

    Nothing is indexed while the service is disabled or when no embedding
    model is available.

    Args:
        doc_id: Identifier stored with the document.
        content: Text to embed and index.
        metadata: Optional metadata; when empty or omitted,
            ``{"doc_id": doc_id}`` is used instead.
    """
    if self._disabled:
        logger.info(f"[RAG DISABLED] 文档索引操作已跳过: {doc_id}")
        return

    if not self._initialized:
        self._init_vector_store()

    # Without an embedding model there is nothing to index; only log.
    if self.embedding_model is None:
        logger.debug(f"文档跳过索引 (无嵌入模型): {doc_id}")
        return

    effective_meta = metadata if metadata else {"doc_id": doc_id}
    entry = SimpleDocument(page_content=content, metadata=effective_meta)
    self._add_documents([entry], [doc_id])
    logger.debug(f"已索引文档: {doc_id}")
|
||||
|
||||
def _add_documents(self, documents: List[SimpleDocument], doc_ids: List[str]):
    """
    Embed a batch of documents and append them to the vector index.

    Every vector is stored under the position its document will occupy in
    ``self.documents``. ``retrieve`` looks a hit up with
    ``self.documents[idx]``, so the vector id has to be that position.
    The previous ``hash(doc_id)`` ids were arbitrary 64-bit values (often
    negative) and, because ``str`` hashes are salted per process, did not
    survive a save/load cycle either.

    Args:
        documents: Documents whose ``page_content`` is embedded.
        doc_ids: External ids, one per document, in the same order.

    Raises:
        ValueError: If ``documents`` and ``doc_ids`` differ in length.
    """
    if not documents:
        return
    if len(documents) != len(doc_ids):
        # zip() below would silently drop the surplus and desynchronise
        # the index from the document list.
        raise ValueError("documents and doc_ids must have the same length")

    texts = [doc.page_content for doc in documents]
    embeddings = self.embedding_model.encode(texts, convert_to_numpy=True)
    embeddings = self._normalize_vectors(embeddings).astype('float32')

    if self.index is None:
        self._init_vector_store()

    # Vector id == position of the document in self.documents.
    first_position = len(self.documents)
    id_array = np.arange(first_position, first_position + len(documents), dtype='int64')
    self.index.add_with_ids(embeddings, id_array)

    for doc, did in zip(documents, doc_ids):
        self.documents.append({"id": did, "content": doc.page_content, "metadata": doc.metadata})
        self.doc_ids.append(did)
|
||||
|
||||
def retrieve(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
    """
    Return the indexed documents most similar to ``query``.

    The query is embedded, L2-normalised and searched against the index.
    Each id returned by the index is used as a position into
    ``self.documents``; negative ids and ids beyond the end of the list are
    skipped (and reported in the log) instead of raising ``IndexError``.

    Args:
        query: Query text.
        top_k: Maximum number of results.

    Returns:
        A list of dicts with keys ``content``, ``metadata``, ``score`` and
        ``doc_id``. Empty when the service is disabled, ``top_k`` is not
        positive, the index is missing or empty, or no embedding model is
        available to embed the query.
    """
    if self._disabled:
        logger.info(f"[RAG DISABLED] 检索操作已跳过: query={query}, top_k={top_k}")
        return []

    if not self._initialized:
        self._init_vector_store()

    if top_k <= 0 or self.index is None or self.index.ntotal == 0:
        return []
    if self.embedding_model is None:
        # An index can exist without a model (e.g. after loading from disk
        # when the model failed to load); the query cannot be embedded then.
        return []

    query_embedding = self.embedding_model.encode([query], convert_to_numpy=True)
    query_embedding = self._normalize_vectors(query_embedding).astype('float32')

    scores, indices = self.index.search(query_embedding, min(top_k, self.index.ntotal))

    results = []
    skipped = 0
    doc_total = len(self.documents)
    for score, raw_idx in zip(scores[0], indices[0]):
        idx = int(raw_idx)
        if idx < 0:
            continue
        if idx >= doc_total:
            # Id does not correspond to a stored document.
            skipped += 1
            continue
        doc = self.documents[idx]
        results.append({
            "content": doc["content"],
            "metadata": doc["metadata"],
            "score": float(score),
            "doc_id": doc["id"]
        })

    if skipped:
        logger.warning(f"检索结果中有 {skipped} 条向量ID无法映射到文档,已跳过")
    logger.debug(f"检索到 {len(results)} 条相关文档")
    return results
|
||||
|
||||
def retrieve_by_table(self, table_name: str, top_k: int = 5) -> List[Dict[str, Any]]:
    """Search the index with a ``"表名: <table_name>"`` query and return up to ``top_k`` hits."""
    table_query = f"表名: {table_name}"
    return self.retrieve(table_query, top_k)
|
||||
|
||||
def get_vector_count(self) -> int:
    """Return the number of vectors in the index; 0 when disabled or when no index exists."""
    if self._disabled:
        logger.info("[RAG DISABLED] get_vector_count 返回 0")
        return 0
    return 0 if self.index is None else self.index.ntotal
|
||||
|
||||
def save_index(self, persist_path: str = None):
    """
    Write the index and the document list to disk.

    Creates the target directory if needed, then writes ``index.faiss``
    (the Faiss index) and ``documents.pkl`` (the pickled document list)
    into it. Does nothing when no index exists.

    Args:
        persist_path: Target directory; defaults to the configured
            persistence directory.
    """
    target_dir = self._persist_dir if persist_path is None else persist_path

    if self.index is None:
        return

    os.makedirs(target_dir, exist_ok=True)

    index_file = os.path.join(target_dir, "index.faiss")
    faiss.write_index(self.index, index_file)

    docs_file = os.path.join(target_dir, "documents.pkl")
    with open(docs_file, "wb") as out:
        pickle.dump(self.documents, out)

    logger.info(f"向量索引已保存到: {target_dir}")
|
||||
|
||||
def load_index(self, persist_path: str = None):
    """
    Load the index and the document list from disk.

    Reads ``index.faiss`` and ``documents.pkl`` from the directory. Both
    files are loaded into local variables first and only assigned to the
    service once both succeeded, so a missing or unreadable documents file
    can no longer leave a fresh index paired with a stale document list.

    Args:
        persist_path: Source directory; defaults to the configured
            persistence directory.
    """
    if persist_path is None:
        persist_path = self._persist_dir

    index_file = os.path.join(persist_path, "index.faiss")
    docs_file = os.path.join(persist_path, "documents.pkl")

    if not os.path.exists(index_file):
        logger.warning(f"向量索引文件不存在: {index_file}")
        return
    if not os.path.exists(docs_file):
        # Vectors without their documents cannot be resolved; skip loading.
        logger.warning(f"文档数据文件不存在: {docs_file}")
        return

    self._init_embeddings()
    loaded_index = faiss.read_index(index_file)

    # SECURITY NOTE: pickle.load can execute arbitrary code. Only load
    # files this service wrote itself, from a trusted directory.
    with open(docs_file, "rb") as f:
        loaded_documents = pickle.load(f)

    # Commit both pieces together now that both loaded successfully.
    self.index = loaded_index
    self.documents = loaded_documents
    self.doc_ids = [d["id"] for d in self.documents]
    self._initialized = True
    logger.info(f"向量索引已从 {persist_path} 加载,共 {len(self.documents)} 条")
|
||||
|
||||
def delete_by_doc_id(self, doc_id: str):
    """
    Remove every document with the given id and rebuild the index.

    The remaining documents are re-embedded and re-added with ids equal to
    their new positions in ``self.documents``, which is what ``retrieve``
    uses to look documents up. The call is a no-op when there is no index
    or no stored document has this id, so an unknown id no longer triggers
    a full re-encode.

    Args:
        doc_id: Id of the document(s) to remove.

    Raises:
        RuntimeError: If documents remain but no embedding model is loaded.
            Raised before anything is modified, so the index stays intact.
    """
    if self.index is None:
        return

    remaining = [d for d in self.documents if d["id"] != doc_id]
    if len(remaining) == len(self.documents):
        # Nothing matched; leave the index untouched.
        return
    if remaining and self.embedding_model is None:
        # Rebuilding needs the model; fail before resetting the index.
        raise RuntimeError("embedding model is not loaded; cannot rebuild the index")

    self.documents = remaining
    self.doc_ids = [d["id"] for d in self.documents]

    self.index.reset()
    if self.documents:
        texts = [d["content"] for d in self.documents]
        embeddings = self.embedding_model.encode(texts, convert_to_numpy=True)
        embeddings = self._normalize_vectors(embeddings).astype('float32')
        # Vector id == position of the document in self.documents.
        id_array = np.arange(len(self.documents), dtype='int64')
        self.index.add_with_ids(embeddings, id_array)

    logger.debug(f"已删除索引: {doc_id}")
|
||||
|
||||
def clear(self):
    """Drop all vectors and stored documents; skipped (log only) while the service is disabled."""
    if self._disabled:
        logger.info("[RAG DISABLED] clear 操作已跳过")
        return

    self._init_vector_store()

    current_index = self.index
    if current_index is not None:
        current_index.reset()

    self.documents, self.doc_ids = [], []
    logger.info("已清空所有向量索引")
|
||||
|
||||
|
||||
# Module-level shared RAGService instance.
rag_service = RAGService()
|
||||
724
backend/app/services/table_rag_service.py
Normal file
724
backend/app/services/table_rag_service.py
Normal file
@@ -0,0 +1,724 @@
|
||||
"""
|
||||
表结构 RAG 索引服务
|
||||
|
||||
AI 自动生成表字段的语义描述,并建立向量索引
|
||||
"""
|
||||
import logging
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from app.services.llm_service import llm_service
|
||||
from app.services.rag_service import rag_service
|
||||
from app.services.excel_storage_service import excel_storage_service
|
||||
from app.core.database.mysql import mysql_db
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TableRAGService:
|
||||
"""
|
||||
表结构 RAG 索引服务
|
||||
|
||||
核心功能:
|
||||
1. AI 根据表头和数据生成字段语义描述
|
||||
2. 将字段描述存入向量数据库 (RAG)
|
||||
3. 支持自然语言查询表字段
|
||||
"""
|
||||
|
||||
def __init__(self):
    """Wire up the shared LLM, RAG and Excel-storage services."""
    self.excel_storage = excel_storage_service
    self.rag = rag_service
    self.llm = llm_service

    # RAG index building is temporarily switched off; indexing steps only log.
    self._disabled = True
    logger.info("TableRAG 服务已禁用(_disabled=True),仅记录索引操作日志")
|
||||
|
||||
def _extract_sheet_names_from_xml(self, file_path: str) -> List[str]:
|
||||
"""
|
||||
从 Excel 文件的 XML 中提取工作表名称
|
||||
|
||||
某些 Excel 文件由于包含非标准元素,pandas/openpyxl 无法正确解析工作表列表,
|
||||
此时需要直接从 XML 中提取。
|
||||
|
||||
Args:
|
||||
file_path: Excel 文件路径
|
||||
|
||||
Returns:
|
||||
工作表名称列表
|
||||
"""
|
||||
import zipfile
|
||||
from xml.etree import ElementTree as ET
|
||||
|
||||
# 尝试多种命名空间
|
||||
namespaces = [
|
||||
'http://schemas.openxmlformats.org/spreadsheetml/2006/main',
|
||||
'http://purl.oclc.org/ooxml/spreadsheetml/main',
|
||||
]
|
||||
|
||||
try:
|
||||
with zipfile.ZipFile(file_path, 'r') as z:
|
||||
# 读取 workbook.xml
|
||||
if 'xl/workbook.xml' not in z.namelist():
|
||||
return []
|
||||
|
||||
content = z.read('xl/workbook.xml')
|
||||
root = ET.fromstring(content)
|
||||
|
||||
# 尝试多种命名空间
|
||||
for ns_uri in namespaces:
|
||||
ns = {'main': ns_uri}
|
||||
sheets = root.findall('.//main:sheet', ns)
|
||||
if sheets:
|
||||
names = [s.get('name') for s in sheets if s.get('name')]
|
||||
if names:
|
||||
logger.info(f"使用命名空间 {ns_uri} 提取到工作表: {names}")
|
||||
return names
|
||||
|
||||
# 如果都没找到,尝试不带命名空间
|
||||
sheets = root.findall('.//sheet')
|
||||
if not sheets:
|
||||
sheets = root.findall('.//{*}sheet')
|
||||
names = [s.get('name') for s in sheets if s.get('name')]
|
||||
if names:
|
||||
logger.info(f"使用通配符提取到工作表: {names}")
|
||||
return names
|
||||
|
||||
logger.warning(f"无法从 XML 提取工作表,尝试的文件: {file_path}")
|
||||
return []
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"从 XML 提取工作表失败: {file_path}, error: {e}")
|
||||
return []
|
||||
|
||||
def _read_excel_sheet(self, file_path: str, sheet_name: str = None, header_row: int = 0) -> pd.DataFrame:
    """
    Read one worksheet into a DataFrame, with an XML fallback for odd files.

    pandas is tried first. If it raises or yields an empty frame, the
    worksheet XML inside the archive is parsed directly. In the fallback,
    all non-boolean cell values are returned as text.

    Args:
        file_path: Path to the Excel file.
        sheet_name: Worksheet name; ``None`` reads the first worksheet.
        header_row: Zero-based row number of the header row.

    Returns:
        The worksheet as a DataFrame (empty when the fallback finds no data
        rows).

    Raises:
        ValueError: If the fallback finds no worksheet names or the expected
            worksheet file is missing from the archive.
        Exception: Any other fallback parsing error is logged and re-raised.
    """
    import zipfile
    from xml.etree import ElementTree as ET

    try:
        # pandas returns a dict of ALL sheets for sheet_name=None, which has
        # no ``.empty``; map "not specified" to index 0 (the first sheet).
        df = pd.read_excel(
            file_path,
            sheet_name=sheet_name if sheet_name is not None else 0,
            header=header_row,
        )
        if df is not None and not df.empty:
            return df
    except Exception as e:
        # Best effort: the XML fallback below handles what pandas cannot.
        logger.debug(f"pandas 读取 Excel 失败,改用 XML 方式: {file_path}, error: {e}")

    logger.info(f"使用 XML 方式读取 Excel: {file_path}")

    def _local(tag):
        # Tag name without its "{namespace}" prefix.
        return tag.rsplit('}', 1)[-1]

    def _shared_string(si):
        # Plain strings hold one <t>; rich text holds one <t> per <r> run.
        # Only those are joined, so phonetic (<rPh>) text is left out.
        pieces = []
        for child in si:
            name = _local(child.tag)
            if name == 't':
                pieces.append(child.text or '')
            elif name == 'r':
                pieces.extend(t.text or '' for t in child if _local(t.tag) == 't')
        return ''.join(pieces)

    try:
        with zipfile.ZipFile(file_path, 'r') as z:
            sheet_names = self._extract_sheet_names_from_xml(file_path)
            if not sheet_names:
                raise ValueError("无法从 Excel 文件中找到工作表")

            # Fall back to the first sheet when the requested one is unknown.
            target_sheet = sheet_name if sheet_name and sheet_name in sheet_names else sheet_names[0]
            # NOTE(review): assumes the n-th listed sheet lives in
            # xl/worksheets/sheet<n>.xml; workbook relationships are not
            # consulted - confirm for reordered workbooks.
            sheet_index = sheet_names.index(target_sheet) + 1

            shared_strings = []
            if 'xl/sharedStrings.xml' in z.namelist():
                ss_root = ET.fromstring(z.read('xl/sharedStrings.xml'))
                shared_strings = [
                    _shared_string(si) for si in ss_root.iter() if _local(si.tag) == 'si'
                ]

            sheet_file = f'xl/worksheets/sheet{sheet_index}.xml'
            if sheet_file not in z.namelist():
                raise ValueError(f"工作表文件 {sheet_file} 不存在")

            root = ET.fromstring(z.read(sheet_file))

        def _cell_text(cell, cell_type):
            # Raw text of a cell, or None when the cell carries no value.
            if cell_type == 'inlineStr':
                texts = [t.text or '' for t in cell.iter() if _local(t.tag) == 't']
                return ''.join(texts) if texts else None
            v = cell.find('{*}v')
            if v is None or not v.text:
                return None
            if cell_type == 's':
                try:
                    return shared_strings[int(v.text)]
                except (ValueError, IndexError):
                    return v.text
            return v.text

        rows_data = []
        headers = {}
        header_number = header_row + 1  # worksheet rows are 1-based

        for row in root.iter():
            if _local(row.tag) != 'row':
                continue
            row_idx = int(row.get('r', 0))
            if row_idx < header_number:
                # Rows above the header are ignored.
                continue

            row_cells = {}
            for cell in row:
                if _local(cell.tag) != 'c':
                    continue
                # Column letters of the cell reference, e.g. "AB" from "AB12".
                col_letters = ''.join(filter(str.isalpha, cell.get('r', '')))
                cell_type = cell.get('t', 'n')
                text = _cell_text(cell, cell_type)

                if row_idx == header_number:
                    # Header cells without a value fall back to the column letters.
                    headers[col_letters] = text if text is not None else col_letters
                elif cell_type == 'b' and text is not None:
                    row_cells[col_letters] = text == '1'
                else:
                    row_cells[col_letters] = text

            if row_idx != header_number and row_cells:
                rows_data.append(row_cells)

        if not rows_data:
            logger.warning(f"XML 解析结果为空: {file_path}, sheet: {target_sheet}")
            return pd.DataFrame()

        df = pd.DataFrame(rows_data)

        # Rename column letters to the header texts where known.
        if headers:
            df.columns = [headers.get(col, col) for col in df.columns]

        logger.info(f"XML 解析完成: {len(df)} 行, {len(df.columns)} 列")
        return df

    except Exception as e:
        logger.error(f"XML 解析 Excel 失败: {e}")
        raise
|
||||
|
||||
async def generate_field_description(
    self,
    table_name: str,
    field_name: str,
    sample_values: List[Any],
    all_fields: Dict[str, List[Any]] = None
) -> str:
    """
    Ask the LLM for a short semantic description of one field.

    Args:
        table_name: Name of the table.
        field_name: Name of the field to describe.
        sample_values: Example values; the first 10 non-``None`` ones are
            put into the prompt.
        all_fields: Optional samples of the table's fields; up to 3 values
            of every other non-empty field are added as context.

    Returns:
        The stripped LLM answer, or ``"<field_name>: 数据字段"`` when the
        request or the answer extraction fails.
    """
    # Context block listing the other fields' sample values.
    context = ""
    if all_fields:
        context_lines = ["\n其他字段示例:\n"]
        for other_name, other_values in all_fields.items():
            if other_name == field_name or not other_values:
                continue
            preview = ', '.join(str(v) for v in other_values[:3])
            context_lines.append(f"- {other_name}: {preview}\n")
        context = "".join(context_lines)

    sample_text = ', '.join(str(v) for v in sample_values[:10] if v is not None)

    prompt = f"""你是一个数据语义分析专家。请根据字段名和示例值,推断该字段的语义含义。

表名:{table_name}
字段名:{field_name}
示例值:{sample_text}
{context}

请生成一段简洁的字段语义描述(不超过50字),说明:
1. 该字段代表什么含义
2. 数据格式或单位(如果有)
3. 可能的业务用途

只输出描述文字,不要其他内容。"""

    try:
        system_message = {"role": "system", "content": "你是一个专业的数据分析师。"}
        user_message = {"role": "user", "content": prompt}

        response = await self.llm.chat(
            messages=[system_message, user_message],
            temperature=0.3,
            max_tokens=200
        )

        return self.llm.extract_message_content(response).strip()

    except Exception as exc:
        logger.error(f"生成字段描述失败: {str(exc)}")
        return f"{field_name}: 数据字段"
|
||||
|
||||
async def build_table_rag_index(
    self,
    file_path: str,
    filename: str,
    sheet_name: Optional[str] = None,
    header_row: int = 0,
    sample_size: int = 10
) -> Dict[str, Any]:
    """
    Build the RAG index for one Excel table and store the table in MySQL.

    Steps:
        1. Check that the workbook is readable and has a worksheet.
        2. Read the worksheet into a DataFrame.
        3. Generate an LLM description for every column.
        4. Index each description (skipped while this service is disabled).
        5. Store the sheet via the Excel storage service.

    Args:
        file_path: Path to the Excel file.
        filename: Original file name; the table name is derived from it.
        sheet_name: Worksheet name. An unknown name falls back to the first
            worksheet.
        header_row: Zero-based header row number.
        sample_size: Number of non-null sample values taken per column.

    Returns:
        On success a dict with ``success``, ``table_name``, ``field_count``,
        ``indexed_fields``, ``errors``, ``indexed_count`` plus either
        ``mysql_table``/``row_count`` or ``mysql_warning``. On failure
        ``{"success": False, "error": <message>}``.
    """
    results = {
        "success": True,
        "table_name": "",
        "field_count": 0,
        "indexed_fields": [],
        "errors": []
    }

    try:
        # 1. Make sure the workbook is readable and has a worksheet.
        logger.info(f"正在检查Excel文件: {file_path}")
        try:
            # The context manager closes the workbook handle; a bare
            # pd.ExcelFile(...) keeps the file open until it is garbage
            # collected.
            with pd.ExcelFile(file_path) as xls_file:
                sheet_names = xls_file.sheet_names
            logger.info(f"Excel文件工作表: {sheet_names}")

            # pandas found no sheets: try reading the names from the XML.
            if not sheet_names:
                sheet_names = self._extract_sheet_names_from_xml(file_path)
                logger.info(f"从XML提取工作表: {sheet_names}")

            if not sheet_names:
                return {"success": False, "error": "Excel 文件没有工作表"}
        except Exception as e:
            logger.error(f"读取Excel文件失败: {file_path}, error: {e}")
            return {"success": False, "error": f"无法读取Excel文件: {str(e)}"}

        # 2. Read the worksheet.
        if sheet_name:
            # An unknown sheet name falls back to the first worksheet.
            if sheet_name not in sheet_names:
                logger.warning(f"指定的工作表 '{sheet_name}' 不存在,使用第一个工作表: {sheet_names[0]}")
                sheet_name = sheet_names[0]
        df = self._read_excel_sheet(file_path, sheet_name=sheet_name, header_row=header_row)

        logger.info(f"读取到数据: {len(df)} 行, {len(df.columns)} 列")

        if df.empty:
            return {"success": False, "error": "Excel 文件为空"}

        # Column labels are used as field names, so force them to strings.
        df.columns = [str(c) for c in df.columns]
        table_name = self.excel_storage._sanitize_table_name(filename)
        results["table_name"] = table_name
        results["field_count"] = len(df.columns)
        logger.info(f"表名: {table_name}, 字段数: {len(df.columns)}")

        # 3. Initialise RAG if needed.
        if not self.rag._initialized:
            self.rag._init_vector_store()

        # 4. Collect samples for every column first so each description
        #    request can include the other columns as context.
        all_fields_data = {
            col: df[col].dropna().head(sample_size).tolist() for col in df.columns
        }

        indexed_count = 0
        for col in df.columns:
            try:
                sample_values = all_fields_data[col]

                description = await self.generate_field_description(
                    table_name=table_name,
                    field_name=col,
                    sample_values=sample_values,
                    all_fields=all_fields_data
                )

                # Store in RAG unless indexing is disabled.
                if self._disabled:
                    logger.info(f"[RAG DISABLED] 字段索引已跳过: {table_name}.{col}")
                else:
                    self.rag.index_field(
                        table_name=table_name,
                        field_name=col,
                        field_description=description,
                        sample_values=[str(v) for v in sample_values[:5]]
                    )

                indexed_count += 1
                results["indexed_fields"].append({
                    "field": col,
                    "description": description
                })

                logger.info(f"字段已索引: {table_name}.{col}")

            except Exception as e:
                # One failing column must not abort the others.
                error_msg = f"字段 {col} 索引失败: {str(e)}"
                logger.error(error_msg)
                results["errors"].append(error_msg)

        # 5. Store the sheet in MySQL.
        logger.info(f"开始存储到MySQL: {filename}")
        store_result = await self.excel_storage.store_excel(
            file_path=file_path,
            filename=filename,
            sheet_name=sheet_name,
            header_row=header_row
        )

        if store_result.get("success"):
            results["mysql_table"] = store_result.get("table_name")
            results["row_count"] = store_result.get("row_count")
        else:
            # A storage failure is reported as a warning, not a failure.
            results["mysql_warning"] = "MySQL 存储失败: " + str(store_result.get("error"))

        results["indexed_count"] = indexed_count
        logger.info(f"表 {table_name} RAG 索引构建完成,共 {indexed_count} 个字段")

        return results

    except Exception as e:
        logger.error(f"构建 RAG 索引失败: {str(e)}")
        return {"success": False, "error": str(e)}
|
||||
|
||||
async def index_document_table(
    self,
    doc_id: str,
    filename: str,
    table_data: Dict[str, Any],
    source_doc_type: str
) -> Dict[str, Any]:
    """
    Store a table extracted from a document in MySQL and index its fields.

    Args:
        doc_id: Id of the source document.
        filename: Source file name; the table name is derived from it plus
            ``table_data["table_index"]`` (0 when absent).
        table_data: Table payload in one of two shapes:
            1. ``{"rows": [[header...], [values...], ...]}`` - the first row
               is the header.
            2. ``{"headers": [...], "rows": [...]}`` - header and data rows
               kept separately.
        source_doc_type: Source document type (docx/md/txt); not used by
            this method.

    Returns:
        On success a dict with ``success``, ``table_name``, ``field_count``,
        ``indexed_fields``, ``errors``, ``indexed_count`` plus either
        ``mysql_table``/``row_count`` or ``mysql_warning``. On failure
        ``{"success": False, "error": <message>}``.
    """
    results = {
        "success": True,
        "table_name": "",
        "field_count": 0,
        "indexed_fields": [],
        "errors": []
    }

    try:
        # Accept both payload shapes.
        if "headers" not in table_data:
            # Header is the first row; at least one data row is required.
            raw_rows = table_data.get("rows", [])
            if not (raw_rows and len(raw_rows) >= 2):
                return {"success": False, "error": "表格数据不足"}
            columns, data_rows = raw_rows[0], raw_rows[1:]
        else:
            columns = table_data.get("headers", [])
            data_rows = table_data.get("rows", [])

        # Table name: sanitised source file name + table index.
        base_name = self.excel_storage._sanitize_table_name(filename)
        table_name = f"{base_name}_table{table_data.get('table_index', 0)}"

        results["table_name"] = table_name
        results["field_count"] = len(columns)

        # 1. Initialise RAG if needed.
        if not self.rag._initialized:
            self.rag._init_vector_store()

        # 2./3. Hand the structured table to MySQL storage.
        store_result = await self.excel_storage.store_structured_data(
            table_name=table_name,
            data={"columns": columns, "rows": data_rows},
            source_doc_id=doc_id
        )

        if not store_result.get("success"):
            results["mysql_warning"] = "MySQL 存储失败: " + str(store_result.get("error"))
        else:
            results["mysql_table"] = store_result.get("table_name")
            results["row_count"] = store_result.get("row_count")

        # 4. Per-column values; rows too short for a column are left out.
        all_fields_data = {
            col: [row[pos] for row in data_rows if pos < len(row)]
            for pos, col in enumerate(columns)
        }

        indexed_count = 0
        for col in columns:
            try:
                col_values = all_fields_data.get(col, [])

                description = await self.generate_field_description(
                    table_name=table_name,
                    field_name=col,
                    sample_values=col_values[:10],
                    all_fields=all_fields_data
                )

                # Store in RAG unless indexing is disabled.
                if not self._disabled:
                    self.rag.index_field(
                        table_name=table_name,
                        field_name=col,
                        field_description=description,
                        sample_values=[str(v) for v in col_values[:5]]
                    )
                else:
                    logger.info(f"[RAG DISABLED] 文档表格字段索引已跳过: {table_name}.{col}")

                indexed_count += 1
                results["indexed_fields"].append({
                    "field": col,
                    "description": description
                })

                logger.info(f"文档表格字段已索引: {table_name}.{col}")

            except Exception as exc:
                # One failing column must not abort the others.
                error_msg = f"字段 {col} 索引失败: {str(exc)}"
                logger.error(error_msg)
                results["errors"].append(error_msg)

        results["indexed_count"] = indexed_count
        logger.info(f"文档表格 {table_name} RAG 索引构建完成,共 {indexed_count} 个字段")

        return results

    except Exception as exc:
        logger.error(f"构建文档表格 RAG 索引失败: {str(exc)}")
        return {"success": False, "error": str(exc)}
|
||||
|
||||
async def query_table_by_natural_language(
    self,
    user_query: str,
    top_k: int = 5
) -> Dict[str, Any]:
    """
    Find table fields matching a natural-language query.

    Args:
        user_query: The user's query text.
        top_k: Maximum number of matches requested from the RAG service.

    Returns:
        ``{"success": True, "query", "matched_fields", "count"}``, where
        each matched field carries ``table_name``, ``field_name``,
        ``description``, ``score`` and an empty ``sample_values`` list.
        On error ``{"success": False, "error": <message>}``.
    """
    try:
        # 1. Vector search.
        hits = self.rag.retrieve(user_query, top_k=top_k)

        # 2. Flatten every hit into a field record.
        matched_fields = []
        for hit in hits:
            meta = hit.get("metadata", {})
            record = {
                "table_name": meta.get("table_name", ""),
                "field_name": meta.get("field_name", ""),
                "description": hit.get("content", ""),
                "score": hit.get("score", 0),
                "sample_values": []  # may be filled in later
            }
            matched_fields.append(record)

        response = {
            "success": True,
            "query": user_query,
            "matched_fields": matched_fields,
            "count": len(matched_fields)
        }
        return response

    except Exception as exc:
        logger.error(f"查询失败: {str(exc)}")
        return {"success": False, "error": str(exc)}
|
||||
|
||||
async def get_table_fields_with_description(
    self,
    table_name: str
) -> List[Dict[str, Any]]:
    """
    List the indexed fields of a table together with their descriptions.

    Args:
        table_name: Name of the table to look up.

    Returns:
        Up to 50 records with ``table_name``, ``field_name``,
        ``description`` and ``score``; an empty list on error.
    """
    try:
        # Ask the RAG service for this table's fields.
        hits = self.rag.retrieve_by_table(table_name, top_k=50)

        fields = []
        for hit in hits:
            meta = hit.get("metadata", {})
            record = {
                "table_name": meta.get("table_name", ""),
                "field_name": meta.get("field_name", ""),
                "description": hit.get("content", ""),
                "score": hit.get("score", 0)
            }
            fields.append(record)
        return fields

    except Exception as exc:
        logger.error(f"获取字段失败: {str(exc)}")
        return []
|
||||
|
||||
async def rebuild_all_table_indexes(self) -> Dict[str, Any]:
    """
    Rebuild the RAG index for every stored table.

    Clears the current index, then for each table reported by the Excel
    storage service reads its schema, samples up to 10 rows per column,
    generates a description and indexes the field. The bookkeeping columns
    ``id``, ``created_at`` and ``updated_at`` are skipped, as are columns
    without a name.

    Returns:
        ``{"success": True, "tables_processed", "total_fields", "errors"}``;
        a table that fails is recorded in ``errors`` and the remaining
        tables are still processed. On an overall failure
        ``{"success": False, "error": <message>}``.
    """
    # Columns that are never indexed.
    skipped_columns = ("id", "created_at", "updated_at")

    try:
        # Start from an empty index.
        self.rag.clear()

        tables = await self.excel_storage.list_tables()

        results = {
            "success": True,
            "tables_processed": 0,
            "total_fields": 0,
            "errors": []
        }

        for table_name in tables:
            try:
                schema = await self.excel_storage.get_table_schema(table_name)

                if not schema:
                    continue

                if not self.rag._initialized:
                    self.rag._init_vector_store()

                for col_info in schema:
                    field_name = col_info.get("COLUMN_NAME", "")
                    # A schema row without a column name cannot be queried.
                    if not field_name or field_name in skipped_columns:
                        continue

                    samples = await self.excel_storage.query_table(
                        table_name,
                        columns=[field_name],
                        limit=10
                    )
                    # Drop only missing/blank values. A plain truthiness
                    # test would also discard real samples such as 0 or
                    # False.
                    sample_values = [
                        value
                        for value in (r.get(field_name) for r in samples)
                        if value is not None and value != ""
                    ]

                    description = await self.generate_field_description(
                        table_name=table_name,
                        field_name=field_name,
                        sample_values=sample_values
                    )

                    self.rag.index_field(
                        table_name=table_name,
                        field_name=field_name,
                        field_description=description,
                        sample_values=[str(v) for v in sample_values[:5]]
                    )

                    results["total_fields"] += 1

                results["tables_processed"] += 1
                logger.info(f"表 {table_name} 索引重建完成")

            except Exception as e:
                # Record the failure and continue with the next table.
                error_msg = f"表 {table_name} 索引失败: {str(e)}"
                logger.error(error_msg)
                results["errors"].append(error_msg)

        logger.info(f"全部 {results['tables_processed']} 个表索引重建完成")
        return results

    except Exception as e:
        logger.error(f"重建索引失败: {str(e)}")
        return {"success": False, "error": str(e)}
|
||||
|
||||
|
||||
# ==================== Global singleton ====================

table_rag_service = TableRAGService()
|
||||
1874
backend/app/services/template_fill_service.py
Normal file
1874
backend/app/services/template_fill_service.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -103,12 +103,19 @@ git config user.email #同上
|
||||
#如果想看全局的,可以加上 --global,例如 git config --global user.name
|
||||
```
|
||||
|
||||
需要更新以下库
|
||||
先进入虚拟环境
|
||||
```bash
|
||||
cd backend
|
||||
.\venv\Scripts\Activate.ps1
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
## 启动后端项目
|
||||
在终端输入以下命令:
|
||||
```bash
|
||||
cd backend #确保启动时在后端根目录下
|
||||
./venv/Scripts/python.exe -m uvicorn app.main:app --host 127.0.0.1 --port 8000
|
||||
--reload #启动后端项目
|
||||
./venv/Scripts/python.exe -m uvicorn app.main:app --host 127.0.0.1 --port 8000 --reload #启动后端项目
|
||||
```
|
||||
先启动后端项目,再启动前端项目
|
||||
|
||||
|
||||
@@ -1,24 +1,54 @@
|
||||
# ============================================================
|
||||
# 基于大语言模型的文档理解与多源数据融合系统
|
||||
# Python 依赖清单
|
||||
# ============================================================
|
||||
|
||||
# ==================== Web 框架 ====================
|
||||
fastapi[all]==0.104.1
|
||||
uvicorn[standard]==0.24.0
|
||||
python-multipart==0.0.6
|
||||
|
||||
# ==================== 数据验证与配置 ====================
|
||||
pydantic==2.5.0
|
||||
pydantic-settings==2.1.0
|
||||
python-multipart==0.0.6
|
||||
python-dotenv==1.0.0
|
||||
|
||||
# ==================== 数据库 - MySQL (结构化数据) ====================
|
||||
pymysql==1.1.0
|
||||
aiomysql==0.2.0
|
||||
sqlalchemy==2.0.25
|
||||
|
||||
# ==================== 数据库 - MongoDB (非结构化数据) ====================
|
||||
motor==3.3.2
|
||||
pymongo==4.5.0
|
||||
|
||||
# ==================== 数据库 - Redis (缓存/队列) ====================
|
||||
redis==5.0.0
|
||||
|
||||
# ==================== 异步任务 ====================
|
||||
celery==5.3.4
|
||||
sentence-transformers==2.2.2
|
||||
|
||||
# ==================== RAG / 向量数据库 ====================
|
||||
# chromadb==0.4.22 # Windows 需要 C++ 编译环境,如需安装请使用预编译版本或 WSL
|
||||
sentence-transformers==2.7.0
|
||||
faiss-cpu==1.8.0
|
||||
python-docx==0.8.11
|
||||
|
||||
# ==================== 文档解析 ====================
|
||||
pandas==2.1.4
|
||||
openpyxl==3.1.2
|
||||
python-docx==0.8.11
|
||||
markdown-it-py==3.0.0
|
||||
chardet==5.2.0
|
||||
|
||||
# ==================== AI / LLM ====================
|
||||
httpx==0.25.2
|
||||
|
||||
# ==================== 数据处理与可视化 ====================
|
||||
matplotlib==3.8.2
|
||||
numpy==1.26.2
|
||||
markdown==3.5.1
|
||||
langchain==0.1.0
|
||||
langchain-community==0.0.10
|
||||
|
||||
# ==================== 工具库 ====================
|
||||
requests==2.31.0
|
||||
httpx==0.25.2
|
||||
python-dotenv==1.0.0
|
||||
loguru==0.7.2
|
||||
tqdm==4.66.1
|
||||
PyYAML==6.0.1
|
||||
|
||||
@@ -1,71 +0,0 @@
|
||||
"""
|
||||
测试字体配置是否正常工作
|
||||
"""
|
||||
import matplotlib.pyplot as plt
|
||||
import matplotlib
|
||||
import numpy as np
|
||||
from app.services.font_helper import configure_matplotlib_fonts
|
||||
import io
|
||||
import base64
|
||||
|
||||
# Configure matplotlib fonts through the project helper and report the result.
font_name = configure_matplotlib_fonts()
active_sans_serif = matplotlib.rcParams['font.sans-serif']

print(f"当前使用字体: {font_name}")
print(f"matplotlib 中文字体设置: {active_sans_serif}")

# Build a small bar chart whose labels are all Chinese text.
fig, ax = plt.subplots(figsize=(10, 6))

# Sample data
x = ['销售', '库存', '采购', '退货', '其他']
y = [150, 200, 180, 50, 30]

bars = ax.bar(x, y, color='#3b82f6', alpha=0.8)
ax.set_xlabel('类别', fontsize=12, labelpad=10)
ax.set_ylabel('数值', fontsize=12, labelpad=10)
ax.set_title('测试图表 - 中文显示', fontsize=14, fontweight='bold', pad=15)
ax.tick_params(axis='both', which='major', labelsize=10)

# Put the numeric value on top of every bar.
label_style = dict(ha='center', va='bottom', fontsize=10, fontweight='bold')
for rect, amount in zip(bars, y):
    center_x = rect.get_x() + rect.get_width() / 2.
    ax.text(center_x, rect.get_height(), f'{amount}', **label_style)

plt.grid(axis='y', alpha=0.3)
plt.tight_layout(pad=1.5)

# Render the figure to an in-memory PNG and encode it as a base64 data URL.
buf = io.BytesIO()
fig.savefig(buf, format='png', dpi=120, bbox_inches='tight', pad_inches=0.3, facecolor='white')
plt.close(fig)

img_base64 = base64.b64encode(buf.getvalue()).decode('utf-8')
data_url = "data:image/png;base64," + img_base64

print("\n=== 测试完成 ===")
print(f"图表大小: {len(img_base64)} 字符")
print("如果看到字体警告,请检查系统是否有安装中文字体")

# List the fonts matplotlib knows about whose names contain a CJK-related marker.
import matplotlib.font_manager as fm

available_fonts = {entry.name for entry in fm.fontManager.ttflist}

print("\n=== 可用字体列表(部分)===")
cjk_markers = ('CJK', 'Chinese', 'YaHei', 'SimHei', 'PingFang')
chinese_fonts = [
    name for name in available_fonts
    if any(marker in name for marker in cjk_markers)
]
for name in sorted(chinese_fonts)[:10]:
    print(f" - {name}")

if not chinese_fonts:
    print(" 未找到中文字体!")

print("\n=== 推荐安装的中文字体 ===")
print("Windows: Microsoft YaHei (系统自带)")
print("macOS: PingFang SC (系统自带)")
print("Linux: fonts-noto-cjk 或 fonts-wqy-zenhei")

print("\n=== 生成的 base64 数据(前100字符)===")
print(data_url[:100] + "...")
|
||||
46
backend/test_mongodb.py
Normal file
46
backend/test_mongodb.py
Normal file
@@ -0,0 +1,46 @@
|
||||
"""
|
||||
MongoDB 数据库连接测试
|
||||
"""
|
||||
import asyncio
|
||||
from app.core.database.mongodb import mongodb
|
||||
|
||||
|
||||
async def test_mongodb():
    """Run a MongoDB smoke test: connect, insert, read back, delete, list databases.

    The inserted document is read back and deleted by its own ``_id`` so that a
    stale ``{"test": "hello"}`` document left by an earlier run can neither
    satisfy the read step nor be deleted in place of the fresh one.

    Returns:
        bool: True when every step succeeds; False when any step raises or the
        freshly inserted document cannot be read back.
    """
    print("=" * 50)
    print("MongoDB 数据库连接测试")
    print("=" * 50)

    try:
        # Connect
        await mongodb.connect()
        print(f"✓ MongoDB 连接成功: {mongodb.client}")

        collection = mongodb.client.test_database.test_collection

        # Insert test
        test_doc = {"test": "hello", "value": 123}
        insert_result = await collection.insert_one(test_doc)
        inserted_id = insert_result.inserted_id
        print(f"✓ 写入测试成功, ID: {inserted_id}")

        try:
            # Read test: a missing document is a failure, not a success
            doc = await collection.find_one({"_id": inserted_id})
            if doc is None:
                raise RuntimeError("读取测试失败: 未找到刚写入的文档")
            print(f"✓ 读取测试成功: {doc}")
        finally:
            # Always remove the test document, even when the read step failed
            await collection.delete_one({"_id": inserted_id})
        print("✓ 删除测试数据成功")

        # List databases
        dbs = await mongodb.client.list_database_names()
        print(f"✓ 数据库列表: {dbs}")

        print("\n✓ MongoDB 测试通过!")
        return True

    except Exception as e:
        # Top-level boundary of a diagnostic script: report and signal failure
        print(f"\n✗ MongoDB 测试失败: {e}")
        return False
    finally:
        await mongodb.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(test_mongodb())
|
||||
37
backend/test_mysql.py
Normal file
37
backend/test_mysql.py
Normal file
@@ -0,0 +1,37 @@
|
||||
"""
|
||||
MySQL 数据库连接测试
|
||||
"""
|
||||
import asyncio
|
||||
from sqlalchemy import text
|
||||
from app.core.database.mysql import mysql_db
|
||||
|
||||
|
||||
async def test_mysql():
    """Run a MySQL smoke test: execute ``SELECT 1`` and list the databases.

    Each query runs in its own session obtained from
    ``mysql_db.async_session_factory``; ``mysql_db.close()`` is awaited on exit
    whether or not the queries succeeded.

    Returns:
        bool: True when both queries succeed, False when any step raises.
    """
    banner = "=" * 50
    print(banner)
    print("MySQL 数据库连接测试")
    print(banner)

    try:
        # Connectivity check
        async with mysql_db.async_session_factory() as session:
            ping = await session.execute(text("SELECT 1"))
            print(f"✓ MySQL 连接成功: {ping.fetchone()}")

        # Database listing
        async with mysql_db.async_session_factory() as session:
            listing = await session.execute(text("SHOW DATABASES"))
            rows = listing.fetchall()
            names = [row[0] for row in rows]
            print(f"✓ 数据库列表: {names}")

        print("\n✓ MySQL 测试通过!")
        return True

    except Exception as exc:
        print(f"\n✗ MySQL 测试失败: {exc}")
        return False
    finally:
        await mysql_db.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(test_mysql())
|
||||
46
backend/test_redis.py
Normal file
46
backend/test_redis.py
Normal file
@@ -0,0 +1,46 @@
|
||||
"""
|
||||
Redis 数据库连接测试
|
||||
"""
|
||||
import asyncio
|
||||
from app.core.database.redis_db import redis_db
|
||||
|
||||
|
||||
async def test_redis():
    """Run a Redis smoke test: connect, set/get/delete a key, round-trip a task status.

    Returns:
        bool: True when every step succeeds; False when any step raises or the
        key written in the set step cannot be read back.
    """
    print("=" * 50)
    print("Redis 数据库连接测试")
    print("=" * 50)

    try:
        # Connect
        await redis_db.connect()
        print("✓ Redis 连接成功")

        # Write test
        await redis_db.client.set("test_key", "hello_redis")
        print("✓ 写入测试成功")

        # Read test: a missing key must fail the test instead of printing
        # "success: None"
        value = await redis_db.client.get("test_key")
        if value is None:
            raise RuntimeError("读取测试失败: test_key 不存在")
        print(f"✓ 读取测试成功: {value}")

        # Delete test
        await redis_db.client.delete("test_key")
        print("✓ 删除测试成功")

        # Task status round trip through the project helper methods
        await redis_db.set_task_status("test_task", "processing", {"progress": 50})
        status = await redis_db.get_task_status("test_task")
        print(f"✓ 任务状态测试成功: {status}")

        print("\n✓ Redis 测试通过!")
        return True

    except Exception as e:
        # Top-level boundary of a diagnostic script: report and signal failure
        print(f"\n✗ Redis 测试失败: {e}")
        return False
    finally:
        await redis_db.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(test_redis())
|
||||
@@ -1,113 +0,0 @@
|
||||
✅ Excel 文件解析功能已完成并测试通过
|
||||
|
||||
已完成的工作
|
||||
|
||||
后端部分
|
||||
|
||||
1. 文件服务层 (backend/app/services/file_service.py)
|
||||
|
||||
- 文件保存、读取、删除功能
|
||||
- 文件信息获取
|
||||
2. Excel 解析模块 (backend/app/core/document_parser/)
|
||||
|
||||
- base.py - 解析器基类
|
||||
- xlsx_parser.py - Excel 文件解析器
|
||||
- utils.py - 工具函数
|
||||
3. API 接口 (backend/app/api/endpoints/upload.py)
|
||||
|
||||
- POST /upload/excel - 上传并解析 Excel 文件
|
||||
- GET /upload/excel/preview/{file_path} - 获取预览
|
||||
- DELETE /upload/file - 删除文件
|
||||
4. 主应用配置 (backend/app/main.py)
|
||||
|
||||
- CORS 中间件配置
|
||||
- API 路由注册
|
||||
|
||||
前端部分
|
||||
|
||||
1. 后端 API 封装 (frontendTest/app-a6ww9j3ja3nl/src/db/backend-api.ts)
|
||||
|
||||
- Excel 上传、预览、删除函数
|
||||
2. Excel 解析页面 (frontendTest/app-a6ww9j3ja3nl/src/pages/ExcelParse.tsx)
|
||||
|
||||
- 拖拽上传界面
|
||||
- 解析选项配置
|
||||
- 数据表格展示
|
||||
- 多工作表支持
|
||||
3. 路由配置 (frontendTest/app-a6ww9j3ja3nl/src/routes.tsx)
|
||||
|
||||
- 添加 /excel-parse 路由
|
||||
4. 环境配置 (.env)
|
||||
|
||||
- 后端 API URL 配置
|
||||
|
||||
测试结果
|
||||
|
||||
后端 API 测试通过:
|
||||
- ✅ 健康检查:http://127.0.0.1:8000/health
|
||||
- ✅ Excel 上传解析:成功解析 5 行 6 列的测试数据
|
||||
|
||||
访问方式
|
||||
|
||||
1. 后端 API 文档:http://127.0.0.1:8000/api/v1/docs
|
||||
2. 前端页面:
|
||||
|
||||
- 启动前端:cd frontendTest/app-a6ww9j3ja3nl && npm run dev
|
||||
- 访问 Excel 解析页面:/excel-parse
|
||||
|
||||
下一步建议
|
||||
|
||||
1. 启动前端开发服务器测试完整的前端界面
|
||||
2. 添加更多文档格式解析支持(docx, md, txt)
|
||||
3. 完善错误处理和用户提示
|
||||
|
||||
✻ Worked for 4m 18s · 1 background task still running (↓ to manage)
|
||||
|
||||
> 如何启动后端呢
|
||||
|
||||
● 后端启动步骤如下:
|
||||
|
||||
方法一:使用命令行启动(开发模式)
|
||||
|
||||
# 1. 进入后端目录
|
||||
cd H:/OwnProject/FilesReadSysteam/backend
|
||||
|
||||
# 2. 激活虚拟环境
|
||||
# PowerShell:
|
||||
.\venv\Scripts\Activate.ps1
|
||||
|
||||
# 或 CMD:
|
||||
.\venv\Scripts\Activate.bat
|
||||
|
||||
# 3. 启动后端服务器
|
||||
python -m uvicorn app.main:app --host 127.0.0.1 --port 8000 --reload
|
||||
|
||||
方法二:直接使用虚拟环境中的 Python
|
||||
|
||||
cd H:/OwnProject/FilesReadSysteam/backend
|
||||
./venv/Scripts/python.exe -m uvicorn app.main:app --host 127.0.0.1 --port 8000 --reload
|
||||
|
||||
方法三:直接运行 main.py(开发调试)
|
||||
|
||||
cd H:/OwnProject/FilesReadSysteam/backend
|
||||
./venv/Scripts/python.exe app/main.py
|
||||
|
||||
启动成功的标志
|
||||
|
||||
INFO: Uvicorn running on http://127.0.0.1:8000
|
||||
INFO: Started server process [xxxxx]
|
||||
INFO: Waiting for application startup.
|
||||
INFO: Application startup complete.
|
||||
|
||||
验证服务是否正常
|
||||
|
||||
# 健康检查
|
||||
curl http://127.0.0.1:8000/health
|
||||
|
||||
# 访问 API 文档
|
||||
# 浏览器打开: http://127.0.0.1:8000/api/v1/docs
|
||||
|
||||
---
|
||||
当前状态:后端已在后台运行(任务 ID: b22jkg69j),可以直接访问 http://127.0.0.1:8000
|
||||
|
||||
需要停止的话告诉我即可。
|
||||
@@ -1,7 +1,7 @@
|
||||
VITE_APP_ID=app-a6ww9j3ja3nl
|
||||
|
||||
VITE_SUPABASE_URL=https://backend.appmiaoda.com/projects/supabase290100332300644352
|
||||
VITE_SUPABASE_URL=https://ojtxpvjgqoybhmadimym.supabase.co
|
||||
|
||||
VITE_SUPABASE_ANON_KEY=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhdWQiOiJhdXRoZW50aWNhdGVkIiwiZXhwIjoyMDg4NTkyNTA5LCJpc3MiOiJzdXBhYmFzZSIsInJvbGUiOiJhbm9uIiwic3ViIjoiYW5vbiJ9.sdVzWIT_AjxVjEmBaEQcOoFGlHTTT8NH59fYdkaA4WU
|
||||
VITE_SUPABASE_ANON_KEY=sb_publishable_VMZMg44D-9bKE6bsbUiSsw_x3rUJbu2
|
||||
|
||||
VITE_BACKEND_API_URL=http://localhost:8000/api/v1
|
||||
|
||||
7
frontend/.env.example
Normal file
7
frontend/.env.example
Normal file
@@ -0,0 +1,7 @@
|
||||
VITE_APP_ID=
|
||||
|
||||
VITE_SUPABASE_URL=
|
||||
|
||||
VITE_SUPABASE_ANON_KEY=
|
||||
|
||||
VITE_BACKEND_API_URL=http://localhost:8000/api/v1
|
||||
@@ -1,13 +1,16 @@
|
||||
import { RouterProvider } from 'react-router-dom';
|
||||
import { AuthProvider } from '@/context/AuthContext';
|
||||
import { AuthProvider } from '@/contexts/AuthContext';
|
||||
import { TemplateFillProvider } from '@/context/TemplateFillContext';
|
||||
import { router } from '@/routes';
|
||||
import { Toaster } from 'sonner';
|
||||
|
||||
function App() {
|
||||
return (
|
||||
<AuthProvider>
|
||||
<RouterProvider router={router} />
|
||||
<Toaster position="top-right" richColors closeButton />
|
||||
<TemplateFillProvider>
|
||||
<RouterProvider router={router} />
|
||||
<Toaster position="top-right" richColors closeButton />
|
||||
</TemplateFillProvider>
|
||||
</AuthProvider>
|
||||
);
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import React from 'react';
|
||||
import { Navigate, useLocation } from 'react-router-dom';
|
||||
import { useAuth } from '@/context/AuthContext';
|
||||
import { useAuth } from '@/contexts/AuthContext';
|
||||
|
||||
export const RouteGuard: React.FC<{ children: React.ReactNode }> = ({ children }) => {
|
||||
const { user, loading } = useAuth();
|
||||
|
||||
@@ -1,39 +1,29 @@
|
||||
import React from 'react';
|
||||
import { Link, useLocation, Outlet, useNavigate } from 'react-router-dom';
|
||||
import { Link, useLocation, Outlet } from 'react-router-dom';
|
||||
import {
|
||||
LayoutDashboard,
|
||||
FileText,
|
||||
TableProperties,
|
||||
MessageSquareCode,
|
||||
LogOut,
|
||||
Menu,
|
||||
X,
|
||||
ChevronRight,
|
||||
User,
|
||||
Sparkles
|
||||
Sparkles,
|
||||
Clock
|
||||
} from 'lucide-react';
|
||||
import { Button } from '@/components/ui/button';
|
||||
import { useAuth } from '@/context/AuthContext';
|
||||
import { cn } from '@/lib/utils';
|
||||
import { Sheet, SheetContent, SheetTrigger } from '@/components/ui/sheet';
|
||||
|
||||
const navItems = [
|
||||
{ name: '控制台', path: '/', icon: LayoutDashboard },
|
||||
{ name: '文档中心', path: '/documents', icon: FileText },
|
||||
{ name: 'Excel 解析', path: '/excel-parse', icon: Sparkles },
|
||||
{ name: '智能填表', path: '/form-fill', icon: TableProperties },
|
||||
{ name: '智能助手', path: '/assistant', icon: MessageSquareCode },
|
||||
{ name: '任务历史', path: '/task-history', icon: Clock },
|
||||
];
|
||||
|
||||
const MainLayout: React.FC = () => {
|
||||
const { user, profile, signOut } = useAuth();
|
||||
const location = useLocation();
|
||||
const navigate = useNavigate();
|
||||
|
||||
const handleSignOut = async () => {
|
||||
await signOut();
|
||||
navigate('/login');
|
||||
};
|
||||
|
||||
const SidebarContent = () => (
|
||||
<div className="flex flex-col h-full bg-sidebar py-6 border-r border-sidebar-border">
|
||||
@@ -70,25 +60,17 @@ const MainLayout: React.FC = () => {
|
||||
</nav>
|
||||
|
||||
<div className="px-4 mt-auto">
|
||||
<div className="bg-sidebar-accent/50 rounded-2xl p-4 mb-4 border border-sidebar-border/50">
|
||||
<div className="bg-sidebar-accent/50 rounded-2xl p-4 border border-sidebar-border/50">
|
||||
<div className="flex items-center gap-3">
|
||||
<div className="w-10 h-10 rounded-full bg-secondary flex items-center justify-center border-2 border-primary/10">
|
||||
<User size={20} className="text-primary" />
|
||||
<Sparkles size={20} className="text-primary" />
|
||||
</div>
|
||||
<div className="flex flex-col overflow-hidden">
|
||||
<span className="font-semibold text-sm truncate">{((profile as any)?.email) || '用户'}</span>
|
||||
<span className="text-[10px] uppercase tracking-wider text-muted-foreground">{((profile as any)?.role) || 'User'}</span>
|
||||
<span className="font-semibold text-sm truncate">智联文档</span>
|
||||
<span className="text-[10px] uppercase tracking-wider text-muted-foreground">多源数据融合</span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<Button
|
||||
variant="outline"
|
||||
className="w-full justify-start gap-3 border-none hover:bg-destructive/10 hover:text-destructive group rounded-xl"
|
||||
onClick={handleSignOut}
|
||||
>
|
||||
<LogOut size={18} className="group-hover:rotate-180 transition-transform duration-300" />
|
||||
<span>退出登录</span>
|
||||
</Button>
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
|
||||
@@ -1,85 +0,0 @@
|
||||
import React, { createContext, useContext, useEffect, useState } from 'react';
|
||||
import { supabase } from '@/db/supabase';
|
||||
import { User } from '@supabase/supabase-js';
|
||||
import { Profile } from '@/types/types';
|
||||
|
||||
interface AuthContextType {
|
||||
user: User | null;
|
||||
profile: Profile | null;
|
||||
signIn: (email: string, password: string) => Promise<{ error: any }>;
|
||||
signUp: (email: string, password: string) => Promise<{ error: any }>;
|
||||
signOut: () => Promise<{ error: any }>;
|
||||
loading: boolean;
|
||||
}
|
||||
|
||||
const AuthContext = createContext<AuthContextType | undefined>(undefined);
|
||||
|
||||
/**
 * Provides Supabase auth state (user, profile, loading) and the
 * sign-in / sign-up / sign-out actions to the component tree.
 */
export const AuthProvider: React.FC<{ children: React.ReactNode }> = ({ children }) => {
  const [user, setUser] = useState<User | null>(null);
  const [profile, setProfile] = useState<Profile | null>(null);
  const [loading, setLoading] = useState(true);

  // Loads the `profiles` row for the given user id; `loading` is cleared
  // whether the query succeeds or fails.
  async function fetchProfile(uid: string) {
    try {
      const result = await supabase
        .from('profiles')
        .select('*')
        .eq('id', uid)
        .maybeSingle();

      if (result.error) throw result.error;
      setProfile(result.data);
    } catch (err) {
      console.error('Error fetching profile:', err);
    } finally {
      setLoading(false);
    }
  }

  useEffect(() => {
    // Seed state from the session that already exists, if any.
    supabase.auth.getSession().then(({ data }) => {
      const sessionUser = data.session?.user ?? null;
      setUser(sessionUser);
      if (sessionUser) {
        fetchProfile(sessionUser.id);
      } else {
        setLoading(false);
      }
    });

    // Keep state in sync with later auth events (sign in, sign out, etc.).
    const { data: listener } = supabase.auth.onAuthStateChange((_event, session) => {
      const sessionUser = session?.user ?? null;
      setUser(sessionUser);
      if (sessionUser) {
        fetchProfile(sessionUser.id);
        return;
      }
      setProfile(null);
      setLoading(false);
    });

    return () => listener.subscription.unsubscribe();
  }, []);

  const signIn = async (email: string, password: string) =>
    await supabase.auth.signInWithPassword({ email, password });

  const signUp = async (email: string, password: string) =>
    await supabase.auth.signUp({ email, password });

  const signOut = async () => await supabase.auth.signOut();

  const contextValue = { user, profile, signIn, signUp, signOut, loading };

  return (
    <AuthContext.Provider value={contextValue}>
      {children}
    </AuthContext.Provider>
  );
};
|
||||
|
||||
/**
 * Returns the current auth context value.
 * Throws when called outside of an `AuthProvider`.
 */
export const useAuth = () => {
  const auth = useContext(AuthContext);
  if (auth !== undefined) {
    return auth;
  }
  throw new Error('useAuth must be used within an AuthProvider');
};
|
||||
136
frontend/src/context/TemplateFillContext.tsx
Normal file
136
frontend/src/context/TemplateFillContext.tsx
Normal file
@@ -0,0 +1,136 @@
|
||||
import React, { createContext, useContext, useState, ReactNode } from 'react';
|
||||
|
||||
type SourceFile = {
|
||||
file: File;
|
||||
preview?: string;
|
||||
};
|
||||
|
||||
type TemplateField = {
|
||||
cell: string;
|
||||
name: string;
|
||||
field_type: string;
|
||||
required: boolean;
|
||||
hint?: string;
|
||||
};
|
||||
|
||||
type Step = 'upload' | 'filling' | 'preview';
|
||||
|
||||
interface TemplateFillState {
|
||||
step: Step;
|
||||
templateFile: File | null;
|
||||
templateFields: TemplateField[];
|
||||
sourceFiles: SourceFile[];
|
||||
sourceFilePaths: string[];
|
||||
sourceDocIds: string[];
|
||||
templateId: string;
|
||||
filledResult: any;
|
||||
setStep: (step: Step) => void;
|
||||
setTemplateFile: (file: File | null) => void;
|
||||
setTemplateFields: (fields: TemplateField[]) => void;
|
||||
setSourceFiles: (files: SourceFile[]) => void;
|
||||
addSourceFiles: (files: SourceFile[]) => void;
|
||||
removeSourceFile: (index: number) => void;
|
||||
setSourceFilePaths: (paths: string[]) => void;
|
||||
setSourceDocIds: (ids: string[]) => void;
|
||||
addSourceDocId: (id: string) => void;
|
||||
removeSourceDocId: (id: string) => void;
|
||||
setTemplateId: (id: string) => void;
|
||||
setFilledResult: (result: any) => void;
|
||||
reset: () => void;
|
||||
}
|
||||
|
||||
// Shared do-nothing callback used for every action in the default context value.
const noop = () => {};

// Default context value: empty data plus no-op actions. This is what
// consumers receive when no TemplateFillProvider is mounted above them.
const initialState = {
  // data
  step: 'upload' as Step,
  templateFile: null,
  templateFields: [],
  sourceFiles: [],
  sourceFilePaths: [],
  sourceDocIds: [],
  templateId: '',
  filledResult: null,
  // actions
  setStep: noop,
  setTemplateFile: noop,
  setTemplateFields: noop,
  setSourceFiles: noop,
  addSourceFiles: noop,
  removeSourceFile: noop,
  setSourceFilePaths: noop,
  setSourceDocIds: noop,
  addSourceDocId: noop,
  removeSourceDocId: noop,
  setTemplateId: noop,
  setFilledResult: noop,
  reset: noop,
};
|
||||
|
||||
const TemplateFillContext = createContext<TemplateFillState>(initialState);
|
||||
|
||||
/**
 * Holds the template-fill workflow state (current step, template file and
 * fields, source files / paths / document ids, template id, filled result)
 * and exposes it together with its mutators through TemplateFillContext.
 */
export const TemplateFillProvider: React.FC<{ children: ReactNode }> = ({ children }) => {
  const [step, setStep] = useState<Step>('upload');
  const [templateFile, setTemplateFile] = useState<File | null>(null);
  const [templateFields, setTemplateFields] = useState<TemplateField[]>([]);
  const [sourceFiles, setSourceFiles] = useState<SourceFile[]>([]);
  const [sourceFilePaths, setSourceFilePaths] = useState<string[]>([]);
  const [sourceDocIds, setSourceDocIds] = useState<string[]>([]);
  const [templateId, setTemplateId] = useState<string>('');
  const [filledResult, setFilledResult] = useState<any>(null);

  // Append files to the end of the current source file list.
  function addSourceFiles(files: SourceFile[]) {
    setSourceFiles(current => current.concat(files));
  }

  // Drop the source file at the given position.
  function removeSourceFile(index: number) {
    setSourceFiles(current => current.filter((_, position) => position !== index));
  }

  // Add a document id unless it is already present.
  function addSourceDocId(id: string) {
    setSourceDocIds(current => {
      if (current.includes(id)) {
        return current;
      }
      return [...current, id];
    });
  }

  // Remove every occurrence of the given document id.
  function removeSourceDocId(id: string) {
    setSourceDocIds(current => current.filter(existing => existing !== id));
  }

  // Return every piece of state to its initial value.
  function reset() {
    setStep('upload');
    setTemplateFile(null);
    setTemplateFields([]);
    setSourceFiles([]);
    setSourceFilePaths([]);
    setSourceDocIds([]);
    setTemplateId('');
    setFilledResult(null);
  }

  const contextValue: TemplateFillState = {
    step,
    templateFile,
    templateFields,
    sourceFiles,
    sourceFilePaths,
    sourceDocIds,
    templateId,
    filledResult,
    setStep,
    setTemplateFile,
    setTemplateFields,
    setSourceFiles,
    addSourceFiles,
    removeSourceFile,
    setSourceFilePaths,
    setSourceDocIds,
    addSourceDocId,
    removeSourceDocId,
    setTemplateId,
    setFilledResult,
    reset,
  };

  return (
    <TemplateFillContext.Provider value={contextValue}>
      {children}
    </TemplateFillContext.Provider>
  );
};
|
||||
|
||||
/** Returns the current TemplateFillContext value. */
export const useTemplateFill = () => {
  return useContext(TemplateFillContext);
};
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,72 +1,95 @@
|
||||
import React, { useEffect, useState } from 'react';
|
||||
import { Card, CardContent, CardHeader, CardTitle, CardDescription } from '@/components/ui/card';
|
||||
import { useNavigate, Link } from 'react-router-dom';
|
||||
import { Link } from 'react-router-dom';
|
||||
import { Button } from '@/components/ui/button';
|
||||
import {
|
||||
FileText,
|
||||
TableProperties,
|
||||
MessageSquareCode,
|
||||
TrendingUp,
|
||||
Clock,
|
||||
CheckCircle2,
|
||||
import {
|
||||
FileText,
|
||||
TableProperties,
|
||||
MessageSquareCode,
|
||||
TrendingUp,
|
||||
Clock,
|
||||
CheckCircle2,
|
||||
ArrowRight,
|
||||
UploadCloud,
|
||||
Layers,
|
||||
Sparkles
|
||||
Sparkles,
|
||||
Database,
|
||||
FileSpreadsheet,
|
||||
RefreshCcw
|
||||
} from 'lucide-react';
|
||||
import { useAuth } from '@/context/AuthContext';
|
||||
import { documentApi, taskApi } from '@/db/api';
|
||||
import { backendApi } from '@/db/backend-api';
|
||||
import { formatDistanceToNow } from 'date-fns';
|
||||
import { zhCN } from 'date-fns/locale';
|
||||
import { cn } from '@/lib/utils';
|
||||
|
||||
type Document = any;
|
||||
type FillTask = any;
|
||||
type DocumentItem = {
|
||||
doc_id: string;
|
||||
filename: string;
|
||||
original_filename: string;
|
||||
doc_type: string;
|
||||
file_size: number;
|
||||
created_at: string;
|
||||
metadata?: {
|
||||
row_count?: number;
|
||||
column_count?: number;
|
||||
columns?: string[];
|
||||
};
|
||||
};
|
||||
|
||||
type TaskItem = {
|
||||
task_id: string;
|
||||
status: string;
|
||||
created_at: string;
|
||||
message?: string;
|
||||
};
|
||||
|
||||
const Dashboard: React.FC = () => {
|
||||
const navigate = useNavigate();
|
||||
const { profile } = useAuth();
|
||||
const [stats, setStats] = useState({ docs: 0, entities: 0, tasks: 0 });
|
||||
const [recentDocs, setRecentDocs] = useState<Document[]>([]);
|
||||
const [recentTasks, setRecentTasks] = useState<any[]>([]);
|
||||
const [stats, setStats] = useState({ docs: 0, excelFiles: 0, tasks: 0 });
|
||||
const [recentDocs, setRecentDocs] = useState<DocumentItem[]>([]);
|
||||
const [recentTasks, setRecentTasks] = useState<TaskItem[]>([]);
|
||||
const [loading, setLoading] = useState(true);
|
||||
|
||||
useEffect(() => {
|
||||
if (!profile) return;
|
||||
const loadData = async () => {
|
||||
try {
|
||||
const docs = await documentApi.listDocuments((profile as any).id);
|
||||
const tasks = await taskApi.listTasks((profile as any).id);
|
||||
setRecentDocs(docs.slice(0, 5));
|
||||
setRecentTasks(tasks.slice(0, 5));
|
||||
|
||||
let entityCount = 0;
|
||||
docs.forEach(d => {
|
||||
if (d.extracted_entities) entityCount += (d.extracted_entities as any[]).length;
|
||||
});
|
||||
const loadData = async () => {
|
||||
setLoading(true);
|
||||
try {
|
||||
// 获取文档列表
|
||||
const docsResult = await backendApi.getDocuments(undefined, 50);
|
||||
if (docsResult.success && docsResult.documents) {
|
||||
setRecentDocs(docsResult.documents.slice(0, 5));
|
||||
|
||||
// 分类统计
|
||||
const docxMdTxt = docsResult.documents.filter((d: DocumentItem) =>
|
||||
['docx', 'md', 'txt'].includes(d.doc_type)
|
||||
).length;
|
||||
const xlsx = docsResult.documents.filter((d: DocumentItem) =>
|
||||
d.doc_type === 'xlsx'
|
||||
).length;
|
||||
|
||||
setStats({
|
||||
docs: docs.length,
|
||||
entities: entityCount,
|
||||
tasks: tasks.length
|
||||
docs: docxMdTxt,
|
||||
excelFiles: xlsx,
|
||||
tasks: 0 // TODO: 后端任务接口
|
||||
});
|
||||
} catch (err) {
|
||||
console.error(err);
|
||||
} finally {
|
||||
setLoading(false);
|
||||
}
|
||||
};
|
||||
} catch (err) {
|
||||
console.error('加载数据失败:', err);
|
||||
} finally {
|
||||
setLoading(false);
|
||||
}
|
||||
};
|
||||
|
||||
useEffect(() => {
|
||||
loadData();
|
||||
}, [profile]);
|
||||
}, []);
|
||||
|
||||
return (
|
||||
<div className="space-y-8 animate-fade-in">
|
||||
<section className="flex flex-col md:flex-row md:items-center justify-between gap-4">
|
||||
<div className="space-y-1">
|
||||
<h1 className="text-3xl font-extrabold tracking-tight">
|
||||
你好, <span className="text-primary">{((profile as any)?.email)?.split('@')[0] || '用户'}</span> 👋
|
||||
欢迎使用 <span className="text-primary">智联文档</span> 系统 👋
|
||||
</h1>
|
||||
<p className="text-muted-foreground">欢迎使用智联文档,今日已为你处理了多个任务。</p>
|
||||
<p className="text-muted-foreground">基于大语言模型的文档理解与多源数据融合系统</p>
|
||||
</div>
|
||||
<div className="flex items-center gap-3">
|
||||
<Button variant="outline" className="rounded-xl" asChild>
|
||||
@@ -84,9 +107,9 @@ const Dashboard: React.FC = () => {
|
||||
{/* Stats Grid */}
|
||||
<div className="grid grid-cols-1 md:grid-cols-3 gap-6">
|
||||
{[
|
||||
{ label: '已上传文档', value: stats.docs, icon: FileText, color: 'bg-blue-500', trend: '+12% 较上周' },
|
||||
{ label: '提取实体', value: stats.entities, icon: Layers, color: 'bg-indigo-500', trend: '+25% 较上周' },
|
||||
{ label: '生成表格', value: stats.tasks, icon: TableProperties, color: 'bg-emerald-500', trend: '+8% 较上周' }
|
||||
{ label: '已上传文档', value: stats.docs, icon: FileText, color: 'bg-blue-500', trend: '非结构化文档', link: '/documents' },
|
||||
{ label: 'Excel 文件', value: stats.excelFiles, icon: FileSpreadsheet, color: 'bg-emerald-500', trend: '结构化数据', link: '/excel-parse' },
|
||||
{ label: '填表任务', value: stats.tasks, icon: TableProperties, color: 'bg-indigo-500', trend: '待实现', link: '/form-fill' }
|
||||
].map((stat, i) => (
|
||||
<Card key={i} className="border-none shadow-md overflow-hidden group hover:shadow-xl transition-all duration-300">
|
||||
<CardContent className="p-0">
|
||||
@@ -94,8 +117,7 @@ const Dashboard: React.FC = () => {
|
||||
<div className="space-y-2">
|
||||
<p className="text-sm font-medium text-muted-foreground">{stat.label}</p>
|
||||
<p className="text-3xl font-bold tracking-tight">{stat.value}</p>
|
||||
<div className="flex items-center gap-1 text-xs text-emerald-500 font-medium bg-emerald-500/10 px-2 py-1 rounded-full w-fit">
|
||||
<TrendingUp size={12} />
|
||||
<div className="flex items-center gap-1 text-xs text-muted-foreground bg-muted px-2 py-1 rounded-full w-fit">
|
||||
<span>{stat.trend}</span>
|
||||
</div>
|
||||
</div>
|
||||
@@ -118,7 +140,7 @@ const Dashboard: React.FC = () => {
|
||||
<Clock className="text-primary" size={20} />
|
||||
最近上传
|
||||
</CardTitle>
|
||||
<CardDescription>你最近上传并处理的非结构化文档</CardDescription>
|
||||
<CardDescription>您最近上传的文档文件</CardDescription>
|
||||
</div>
|
||||
<Button variant="ghost" size="sm" asChild className="text-primary hover:text-primary/80 hover:bg-primary/5">
|
||||
<Link to="/documents">查看全部 <ArrowRight size={14} className="ml-1" /></Link>
|
||||
@@ -132,21 +154,18 @@ const Dashboard: React.FC = () => {
|
||||
) : recentDocs.length > 0 ? (
|
||||
<div className="space-y-3">
|
||||
{recentDocs.map(doc => (
|
||||
<div key={doc.id} className="flex items-center gap-4 p-3 rounded-xl border border-transparent hover:border-border hover:bg-muted/30 transition-all group">
|
||||
<div key={doc.doc_id} className="flex items-center gap-4 p-3 rounded-xl border border-transparent hover:border-border hover:bg-muted/30 transition-all group">
|
||||
<div className="w-10 h-10 rounded-lg bg-blue-500/10 text-blue-500 flex items-center justify-center">
|
||||
<FileText size={20} />
|
||||
</div>
|
||||
<div className="flex-1 min-w-0">
|
||||
<p className="font-semibold text-sm truncate">{doc.name}</p>
|
||||
<p className="font-semibold text-sm truncate">{doc.original_filename}</p>
|
||||
<p className="text-xs text-muted-foreground">
|
||||
{formatDistanceToNow(new Date(doc.created_at!), { addSuffix: true, locale: zhCN })}
|
||||
{doc.doc_type.toUpperCase()} • {formatDistanceToNow(new Date(doc.created_at), { addSuffix: true, locale: zhCN })}
|
||||
</p>
|
||||
</div>
|
||||
<div className={cn(
|
||||
"px-2 py-1 rounded-full text-[10px] font-bold uppercase tracking-wider",
|
||||
doc.status === 'completed' ? "bg-emerald-500/10 text-emerald-500" : "bg-amber-500/10 text-amber-500"
|
||||
)}>
|
||||
{doc.status === 'completed' ? '已解析' : '处理中'}
|
||||
<div className="px-2 py-1 rounded-full text-[10px] font-bold uppercase tracking-wider bg-muted">
|
||||
{doc.doc_type}
|
||||
</div>
|
||||
</div>
|
||||
))}
|
||||
@@ -163,58 +182,81 @@ const Dashboard: React.FC = () => {
|
||||
</CardContent>
|
||||
</Card>
|
||||
|
||||
{/* Recent Tasks */}
|
||||
{/* Quick Actions */}
|
||||
<Card className="border-none shadow-md">
|
||||
<CardHeader className="flex flex-row items-center justify-between pb-2">
|
||||
<div className="space-y-1">
|
||||
<CardTitle className="text-xl flex items-center gap-2">
|
||||
<CheckCircle2 className="text-primary" size={20} />
|
||||
填表记录
|
||||
<Sparkles className="text-primary" size={20} />
|
||||
快速开始
|
||||
</CardTitle>
|
||||
<CardDescription>自动化生成的表格与汇总结果</CardDescription>
|
||||
<CardDescription>选择您需要的服务开始使用</CardDescription>
|
||||
</div>
|
||||
<Button variant="ghost" size="sm" asChild className="text-primary hover:text-primary/80 hover:bg-primary/5">
|
||||
<Link to="/form-fill">查看全部 <ArrowRight size={14} className="ml-1" /></Link>
|
||||
</Button>
|
||||
</CardHeader>
|
||||
<CardContent>
|
||||
{loading ? (
|
||||
<div className="space-y-4 py-4">
|
||||
{[1, 2, 3].map(i => <div key={i} className="h-12 bg-muted rounded-xl animate-pulse" />)}
|
||||
</div>
|
||||
) : recentTasks.length > 0 ? (
|
||||
<div className="space-y-3">
|
||||
{recentTasks.map(task => (
|
||||
<div key={task.id} className="flex items-center gap-4 p-3 rounded-xl border border-transparent hover:border-border hover:bg-muted/30 transition-all">
|
||||
<div className="w-10 h-10 rounded-lg bg-emerald-500/10 text-emerald-500 flex items-center justify-center">
|
||||
<TableProperties size={20} />
|
||||
</div>
|
||||
<div className="flex-1 min-w-0">
|
||||
<p className="font-semibold text-sm truncate">{task.templates?.name || '未知模板'}</p>
|
||||
<p className="text-xs text-muted-foreground">
|
||||
关联 {task.document_ids?.length || 0} 个文档 • {formatDistanceToNow(new Date(task.created_at!), { addSuffix: true, locale: zhCN })}
|
||||
</p>
|
||||
</div>
|
||||
<Button variant="ghost" size="icon" className="text-primary h-8 w-8" onClick={() => navigate('/form-fill')}>
|
||||
<ArrowRight size={16} />
|
||||
</Button>
|
||||
<div className="grid grid-cols-1 sm:grid-cols-2 gap-4">
|
||||
{[
|
||||
{ title: '上传文档', desc: '支持 docx/md/txt', icon: FileText, link: '/documents', color: 'bg-blue-500' },
|
||||
{ title: '解析 Excel', desc: '上传并分析数据', icon: FileSpreadsheet, link: '/excel-parse', color: 'bg-emerald-500' },
|
||||
{ title: '智能填表', desc: '自动填写表格模板', icon: TableProperties, link: '/form-fill', color: 'bg-indigo-500' },
|
||||
{ title: 'AI 助手', desc: '自然语言交互', icon: MessageSquareCode, link: '/assistant', color: 'bg-amber-500' }
|
||||
].map((item, i) => (
|
||||
<Link
|
||||
key={i}
|
||||
to={item.link}
|
||||
className="flex items-center gap-4 p-4 rounded-2xl border border-transparent hover:border-border hover:bg-muted/30 transition-all group"
|
||||
>
|
||||
<div className={cn("w-12 h-12 rounded-xl flex items-center justify-center text-white shadow-lg", item.color)}>
|
||||
<item.icon size={24} />
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
) : (
|
||||
<div className="flex flex-col items-center justify-center py-10 text-center space-y-3">
|
||||
<MessageSquareCode size={48} className="text-muted-foreground/30" />
|
||||
<p className="text-muted-foreground italic">暂无任务记录</p>
|
||||
<Button variant="outline" size="sm" asChild className="rounded-xl">
|
||||
<Link to="/form-fill">开始填表</Link>
|
||||
</Button>
|
||||
</div>
|
||||
)}
|
||||
<div>
|
||||
<p className="font-semibold group-hover:text-primary transition-colors">{item.title}</p>
|
||||
<p className="text-xs text-muted-foreground">{item.desc}</p>
|
||||
</div>
|
||||
<ArrowRight size={16} className="ml-auto opacity-0 group-hover:opacity-100 transition-opacity" />
|
||||
</Link>
|
||||
))}
|
||||
</div>
|
||||
</CardContent>
|
||||
</Card>
|
||||
</div>
|
||||
|
||||
{/* System Status */}
|
||||
<Card className="border-none shadow-md">
|
||||
<CardHeader className="pb-2">
|
||||
<CardTitle className="text-xl flex items-center gap-2">
|
||||
<Database className="text-primary" size={20} />
|
||||
系统状态
|
||||
</CardTitle>
|
||||
</CardHeader>
|
||||
<CardContent>
|
||||
<div className="grid grid-cols-1 md:grid-cols-3 gap-4">
|
||||
<div className="flex items-center gap-3 p-4 rounded-xl bg-muted/30">
|
||||
<div className="w-3 h-3 rounded-full bg-emerald-500" />
|
||||
<div>
|
||||
<p className="font-semibold text-sm">MySQL</p>
|
||||
<p className="text-xs text-muted-foreground">结构化数据存储</p>
|
||||
</div>
|
||||
</div>
|
||||
<div className="flex items-center gap-3 p-4 rounded-xl bg-muted/30">
|
||||
<div className="w-3 h-3 rounded-full bg-emerald-500" />
|
||||
<div>
|
||||
<p className="font-semibold text-sm">MongoDB</p>
|
||||
<p className="text-xs text-muted-foreground">非结构化数据存储</p>
|
||||
</div>
|
||||
</div>
|
||||
<div className="flex items-center gap-3 p-4 rounded-xl bg-muted/30">
|
||||
<div className="w-3 h-3 rounded-full bg-emerald-500" />
|
||||
<div>
|
||||
<p className="font-semibold text-sm">Faiss + RAG</p>
|
||||
<p className="text-xs text-muted-foreground">向量检索索引</p>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</CardContent>
|
||||
</Card>
|
||||
</div>
|
||||
);
|
||||
};
|
||||
|
||||
export default Dashboard;
|
||||
export default Dashboard;
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1,603 +0,0 @@
|
||||
import React, { useState, useEffect } from 'react';
|
||||
import {
|
||||
TableProperties,
|
||||
Plus,
|
||||
FilePlus,
|
||||
CheckCircle2,
|
||||
Download,
|
||||
Clock,
|
||||
RefreshCcw,
|
||||
Sparkles,
|
||||
Zap,
|
||||
FileCheck,
|
||||
FileSpreadsheet,
|
||||
Trash2,
|
||||
ChevronDown,
|
||||
ChevronUp,
|
||||
BarChart3,
|
||||
FileText,
|
||||
TrendingUp,
|
||||
Info,
|
||||
AlertCircle,
|
||||
Loader2
|
||||
} from 'lucide-react';
|
||||
import { Button } from '@/components/ui/button';
|
||||
import { Card, CardContent, CardHeader, CardTitle, CardDescription, CardFooter } from '@/components/ui/card';
|
||||
import { Badge } from '@/components/ui/badge';
|
||||
import { useAuth } from '@/context/AuthContext';
|
||||
import { templateApi, documentApi, taskApi } from '@/db/api';
|
||||
import { backendApi, aiApi } from '@/db/backend-api';
|
||||
import { supabase } from '@/db/supabase';
|
||||
import { format } from 'date-fns';
|
||||
import { toast } from 'sonner';
|
||||
import { cn } from '@/lib/utils';
|
||||
import { Skeleton } from '@/components/ui/skeleton';
|
||||
import {
|
||||
Dialog,
|
||||
DialogContent,
|
||||
DialogHeader,
|
||||
DialogTitle,
|
||||
DialogTrigger,
|
||||
DialogFooter,
|
||||
DialogDescription
|
||||
} from '@/components/ui/dialog';
|
||||
import { Checkbox } from '@/components/ui/checkbox';
|
||||
import { ScrollArea } from '@/components/ui/scroll-area';
|
||||
import { Input } from '@/components/ui/input';
|
||||
import { Label } from '@/components/ui/label';
|
||||
import { Textarea } from '@/components/ui/textarea';
|
||||
import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from '@/components/ui/select';
|
||||
import { useDropzone } from 'react-dropzone';
|
||||
import { Markdown } from '@/components/ui/markdown';
|
||||
|
||||
// Placeholder aliases for the records handled by this page. All three are
// `any`, so values annotated with them get no compile-time checking.
// NOTE(review): `Document` presumably shadows the DOM lib's global `Document`
// type inside this module - confirm that is intended.
type Template = any;
|
||||
type Document = any;
|
||||
type FillTask = any;
|
||||
|
||||
const FormFill: React.FC = () => {
|
||||
const { profile } = useAuth();
|
||||
const [templates, setTemplates] = useState<Template[]>([]);
|
||||
const [documents, setDocuments] = useState<Document[]>([]);
|
||||
const [tasks, setTasks] = useState<any[]>([]);
|
||||
const [loading, setLoading] = useState(true);
|
||||
|
||||
// Selection state
|
||||
const [selectedTemplate, setSelectedTemplate] = useState<string | null>(null);
|
||||
const [selectedDocs, setSelectedDocs] = useState<string[]>([]);
|
||||
const [creating, setCreating] = useState(false);
|
||||
const [openTaskDialog, setOpenTaskDialog] = useState(false);
|
||||
const [viewingTask, setViewingTask] = useState<any | null>(null);
|
||||
|
||||
// Excel upload state
|
||||
const [excelFile, setExcelFile] = useState<File | null>(null);
|
||||
const [excelParseResult, setExcelParseResult] = useState<any>(null);
|
||||
const [excelAnalysis, setExcelAnalysis] = useState<any>(null);
|
||||
const [excelAnalyzing, setExcelAnalyzing] = useState(false);
|
||||
const [expandedSheet, setExpandedSheet] = useState<string | null>(null);
|
||||
const [aiOptions, setAiOptions] = useState({
|
||||
userPrompt: '请分析这些数据,并提取关键信息用于填表,包括数值、分类、摘要等。',
|
||||
analysisType: 'general' as 'general' | 'summary' | 'statistics' | 'insights'
|
||||
});
|
||||
|
||||
/**
 * Fetch the current user's templates, documents and fill tasks in parallel
 * and store them in component state.
 *
 * Does nothing while no profile is available. Any failure is reported with a
 * single error toast; the loading flag is cleared whether or not the fetch
 * succeeded.
 */
const loadData = async () => {
  if (!profile) {
    return;
  }

  try {
    const userId = (profile as any).id;
    const [templateList, documentList, taskList] = await Promise.all([
      templateApi.listTemplates(userId),
      documentApi.listDocuments(userId),
      taskApi.listTasks(userId),
    ]);

    setTemplates(templateList);
    setDocuments(documentList);
    setTasks(taskList);
  } catch (loadError: any) {
    toast.error('数据加载失败');
  } finally {
    setLoading(false);
  }
};
|
||||
|
||||
useEffect(() => {
|
||||
loadData();
|
||||
}, [profile]);
|
||||
|
||||
// Excel upload handlers
|
||||
/**
 * Dropzone callback: validate the first dropped file, remember it, and send
 * it to the backend for parsing.
 *
 * Only the first accepted file is used. Files whose name does not end in
 * .xlsx/.xls (case-insensitive) are rejected with an error toast. Previous
 * parse/analysis state is reset before the upload starts.
 */
const onExcelDrop = async (acceptedFiles: File[]) => {
  const [droppedFile] = acceptedFiles;
  if (!droppedFile) {
    return;
  }

  const hasExcelExtension = /\.(xlsx|xls)$/i.test(droppedFile.name);
  if (!hasExcelExtension) {
    toast.error('仅支持 .xlsx 和 .xls 格式的 Excel 文件');
    return;
  }

  // Start from a clean slate for the newly selected file.
  setExcelFile(droppedFile);
  setExcelParseResult(null);
  setExcelAnalysis(null);
  setExpandedSheet(null);

  try {
    const uploadResult = await backendApi.uploadExcel(droppedFile);
    if (!uploadResult.success) {
      toast.error(uploadResult.error || '解析失败');
      return;
    }
    toast.success(`Excel 解析成功: ${droppedFile.name}`);
    setExcelParseResult(uploadResult);
  } catch (error: any) {
    toast.error(error.message || '上传失败');
  }
};
|
||||
|
||||
const { getRootProps, getInputProps, isDragActive } = useDropzone({
|
||||
onDrop: onExcelDrop,
|
||||
accept: {
|
||||
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': ['.xlsx'],
|
||||
'application/vnd.ms-excel': ['.xls']
|
||||
},
|
||||
maxFiles: 1
|
||||
});
|
||||
|
||||
/**
 * Run the AI analysis on the uploaded Excel file using the prompt and
 * analysis type currently held in `aiOptions`.
 *
 * Requires a file that has already been parsed successfully; otherwise an
 * error toast is shown and nothing else happens. The "analyzing" flag is set
 * for the duration of the request and always cleared afterwards.
 */
const handleAnalyzeExcel = async () => {
  if (!(excelFile && excelParseResult?.success)) {
    toast.error('请先上传并解析 Excel 文件');
    return;
  }

  setExcelAnalyzing(true);
  setExcelAnalysis(null);

  const { userPrompt, analysisType } = aiOptions;

  try {
    const analysisResult = await aiApi.analyzeExcel(excelFile, { userPrompt, analysisType });

    if (!analysisResult.success) {
      toast.error(analysisResult.error || 'AI 分析失败');
    } else {
      toast.success('AI 分析完成');
      setExcelAnalysis(analysisResult);
    }
  } catch (error: any) {
    toast.error(error.message || 'AI 分析失败');
  } finally {
    setExcelAnalyzing(false);
  }
};
|
||||
|
||||
/**
 * Acknowledge that the parsed Excel data should be used as a data source.
 *
 * Intended to mark the parsed Excel data as a "document" and add it to the
 * selection list. At present it only shows a toast: an error if no successful
 * parse result exists, otherwise a confirmation.
 * TODO: pass the Excel data to the backend when creating the task.
 */
const handleUseExcelData = () => {
  const parsedSuccessfully = excelParseResult?.success;

  if (parsedSuccessfully) {
    toast.success('Excel 数据已添加到数据源,请在任务对话框中选择');
  } else {
    toast.error('请先解析 Excel 文件');
  }
};
|
||||
|
||||
/**
 * Discard the uploaded Excel file together with everything derived from it
 * (parse result, AI analysis, expanded sheet) and confirm with a toast.
 */
const handleDeleteExcel = () => {
  // Derived state first, then the file itself; the setters are independent.
  setExpandedSheet(null);
  setExcelAnalysis(null);
  setExcelParseResult(null);
  setExcelFile(null);

  toast.success('Excel 文件已清除');
};
|
||||
|
||||
/**
 * Upload the file chosen in the hidden template `<input type="file">` and
 * refresh the page data on success.
 *
 * Does nothing when no file was chosen or no profile is available.
 *
 * @param e Change event of the template file input.
 */
const handleUploadTemplate = async (e: React.ChangeEvent<HTMLInputElement>) => {
  const input = e.target;
  const file = input.files?.[0];
  if (!file || !profile) return;

  // Keep the id of the loading toast so that only this toast is replaced by
  // the outcome. A bare `toast.dismiss()` would close every visible toast,
  // including unrelated ones (e.g. a running fill-task notification).
  const toastId = toast.loading('正在上传模板...');

  try {
    await templateApi.uploadTemplate(file, (profile as any).id);
    toast.success('模板上传成功', { id: toastId });
    loadData();
  } catch (err) {
    toast.error('上传模板失败', { id: toastId });
  } finally {
    // Clear the input so choosing the same file again fires `onChange`.
    input.value = '';
  }
};
|
||||
|
||||
/**
 * Create a fill task for the selected template and documents, then trigger
 * the `fill-template` edge function for it.
 *
 * Requires a profile, a selected template and at least one selected document;
 * otherwise an error toast is shown. The edge function is not awaited: its
 * outcome is reported by a toast when it settles, and the lists are reloaded
 * both right after task creation and again once the fill succeeds.
 *
 * NOTE(review): the dialog's submit button is also enabled when only a parsed
 * Excel file is present, but this handler still rejects an empty document
 * selection - confirm which behaviour is intended.
 */
const handleCreateTask = async () => {
  if (!(profile && selectedTemplate && selectedDocs.length !== 0)) {
    toast.error('请先选择模板和数据源文档');
    return;
  }

  setCreating(true);
  try {
    const createdTask = await taskApi.createTask(
      (profile as any).id,
      selectedTemplate,
      selectedDocs
    );

    if (!createdTask) {
      return; // `finally` below still clears the creating flag
    }

    toast.success('任务已创建,正在进行智能填表...');
    setOpenTaskDialog(false);

    // Fire-and-forget: run the edge function and report when it settles.
    supabase.functions
      .invoke('fill-template', { body: { taskId: createdTask.id } })
      .then(({ error: invokeError }) => {
        if (invokeError) {
          toast.error('填表任务执行失败');
          return;
        }
        toast.success('表格填写完成!');
        loadData();
      });

    loadData();
  } catch (createError: any) {
    toast.error('创建任务失败');
  } finally {
    setCreating(false);
  }
};
|
||||
|
||||
/**
 * Map a task status to the Tailwind classes used for its badge.
 *
 * `completed` is green and `failed` uses the destructive colour; every other
 * status falls back to amber.
 */
const getStatusColor = (status: string) => {
  if (status === 'completed') {
    return 'bg-emerald-500 text-white';
  }
  if (status === 'failed') {
    return 'bg-destructive text-white';
  }
  return 'bg-amber-500 text-white';
};
|
||||
|
||||
/**
 * Format a byte count as a human-readable size with two decimals,
 * e.g. `1536` -> `"1.50 KB"`.
 *
 * Units go from B up to GB in steps of 1024; anything larger is still
 * expressed in GB. Zero, negative, NaN and infinite inputs yield `"0 B"`.
 *
 * @param bytes Size in bytes.
 * @returns The formatted size string.
 */
const formatFileSize = (bytes: number): string => {
  if (!Number.isFinite(bytes) || bytes <= 0) return '0 B';

  const k = 1024;
  const sizes = ['B', 'KB', 'MB', 'GB'];

  // Clamp the unit index: values below 1 byte would give a negative index and
  // values of 1 TB or more an index past the table, both rendering "undefined".
  const rawIndex = Math.floor(Math.log(bytes) / Math.log(k));
  const i = Math.min(Math.max(rawIndex, 0), sizes.length - 1);

  return `${(bytes / Math.pow(k, i)).toFixed(2)} ${sizes[i]}`;
};
|
||||
|
||||
return (
|
||||
<div className="space-y-8 animate-fade-in pb-10">
|
||||
<section className="flex flex-col md:flex-row md:items-center justify-between gap-4">
|
||||
<div className="space-y-1">
|
||||
<h1 className="text-3xl font-extrabold tracking-tight">智能填表</h1>
|
||||
<p className="text-muted-foreground">根据您的表格模板,自动聚合多源文档信息进行精准填充,告别重复劳动。</p>
|
||||
</div>
|
||||
<div className="flex items-center gap-3">
|
||||
<Dialog open={openTaskDialog} onOpenChange={setOpenTaskDialog}>
|
||||
<DialogTrigger asChild>
|
||||
<Button className="rounded-xl shadow-lg shadow-primary/20 gap-2 h-11 px-6">
|
||||
<FilePlus size={18} />
|
||||
<span>新建填表任务</span>
|
||||
</Button>
|
||||
</DialogTrigger>
|
||||
<DialogContent className="max-w-4xl max-h-[90vh] flex flex-col p-0 overflow-hidden border-none shadow-2xl rounded-3xl">
|
||||
<DialogHeader className="p-8 pb-4 bg-muted/50">
|
||||
<DialogTitle className="text-2xl font-bold flex items-center gap-2">
|
||||
<Sparkles size={24} className="text-primary" />
|
||||
开启智能填表之旅
|
||||
</DialogTitle>
|
||||
<DialogDescription>
|
||||
选择一个表格模板及若干个数据源文档,AI 将自动为您分析并填写。
|
||||
</DialogDescription>
|
||||
</DialogHeader>
|
||||
|
||||
<ScrollArea className="flex-1 p-8 pt-4">
|
||||
<div className="space-y-8">
|
||||
{/* Step 1: Select Template */}
|
||||
<div className="space-y-4">
|
||||
<div className="flex items-center justify-between">
|
||||
<h4 className="font-bold flex items-center gap-2 text-primary uppercase tracking-widest text-xs">
|
||||
<span className="w-5 h-5 rounded-full bg-primary text-white flex items-center justify-center text-[10px]">1</span>
|
||||
选择表格模板
|
||||
</h4>
|
||||
<label className="cursor-pointer text-xs font-semibold text-primary hover:underline flex items-center gap-1">
|
||||
<Plus size={12} /> 上传新模板
|
||||
<input type="file" className="hidden" onChange={handleUploadTemplate} accept=".docx,.xlsx" />
|
||||
</label>
|
||||
</div>
|
||||
{templates.length > 0 ? (
|
||||
<div className="grid grid-cols-1 sm:grid-cols-2 gap-3">
|
||||
{templates.map(t => (
|
||||
<div
|
||||
key={t.id}
|
||||
className={cn(
|
||||
"p-4 rounded-2xl border-2 transition-all cursor-pointer flex items-center gap-3 group relative overflow-hidden",
|
||||
selectedTemplate === t.id ? "border-primary bg-primary/5" : "border-border hover:border-primary/50"
|
||||
)}
|
||||
onClick={() => setSelectedTemplate(t.id)}
|
||||
>
|
||||
<div className={cn(
|
||||
"w-10 h-10 rounded-xl flex items-center justify-center shrink-0 transition-colors",
|
||||
selectedTemplate === t.id ? "bg-primary text-white" : "bg-muted text-muted-foreground"
|
||||
)}>
|
||||
<TableProperties size={20} />
|
||||
</div>
|
||||
<div className="flex-1 min-w-0">
|
||||
<p className="font-bold text-sm truncate">{t.name}</p>
|
||||
<p className="text-[10px] text-muted-foreground uppercase">{t.type}</p>
|
||||
</div>
|
||||
{selectedTemplate === t.id && (
|
||||
<div className="absolute top-0 right-0 w-8 h-8 bg-primary text-white flex items-center justify-center rounded-bl-xl">
|
||||
<CheckCircle2 size={14} />
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
) : (
|
||||
<div className="p-8 text-center bg-muted/30 rounded-2xl border border-dashed text-sm italic text-muted-foreground">
|
||||
暂无模板,请先点击右上角上传。
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
|
||||
{/* Step 2: Upload & Analyze Excel */}
|
||||
<div className="space-y-4">
|
||||
<h4 className="font-bold flex items-center gap-2 text-primary uppercase tracking-widest text-xs">
|
||||
<span className="w-5 h-5 rounded-full bg-primary text-white flex items-center justify-center text-[10px]">1.5</span>
|
||||
Excel 数据源
|
||||
</h4>
|
||||
<div className="bg-muted/20 rounded-2xl p-6">
|
||||
{!excelFile ? (
|
||||
<div
|
||||
{...getRootProps()}
|
||||
className={cn(
|
||||
"border-2 border-dashed rounded-xl p-8 transition-all duration-300 flex flex-col items-center justify-center text-center cursor-pointer group",
|
||||
isDragActive ? "border-primary bg-primary/5" : "border-muted-foreground/20 hover:border-primary/50 hover:bg-muted/30"
|
||||
)}
|
||||
>
|
||||
<input {...getInputProps()} />
|
||||
<div className="w-12 h-12 rounded-xl bg-primary/10 text-primary flex items-center justify-center mb-3 group-hover:scale-110 transition-transform">
|
||||
<FileSpreadsheet size={24} />
|
||||
</div>
|
||||
<p className="font-semibold text-sm">
|
||||
{isDragActive ? '释放以开始上传' : '点击或拖拽 Excel 文件'}
|
||||
</p>
|
||||
<p className="text-xs text-muted-foreground mt-1">支持 .xlsx 和 .xls 格式</p>
|
||||
</div>
|
||||
) : (
|
||||
<div className="space-y-4">
|
||||
<div className="flex items-center gap-3 p-3 bg-background rounded-xl">
|
||||
<div className="w-10 h-10 rounded-lg bg-emerald-500/10 text-emerald-500 flex items-center justify-center">
|
||||
<FileSpreadsheet size={20} />
|
||||
</div>
|
||||
<div className="flex-1 min-w-0">
|
||||
<p className="font-semibold text-sm truncate">{excelFile.name}</p>
|
||||
<p className="text-xs text-muted-foreground">{formatFileSize(excelFile.size)}</p>
|
||||
</div>
|
||||
<div className="flex gap-2">
|
||||
<Button
|
||||
variant="ghost"
|
||||
size="icon"
|
||||
className="text-destructive hover:bg-destructive/10"
|
||||
onClick={handleDeleteExcel}
|
||||
>
|
||||
<Trash2 size={16} />
|
||||
</Button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{/* AI Analysis Options */}
|
||||
{excelParseResult?.success && (
|
||||
<div className="space-y-3">
|
||||
<div className="space-y-2">
|
||||
<Label htmlFor="analysis-type" className="text-xs">分析类型</Label>
|
||||
<Select
|
||||
value={aiOptions.analysisType}
|
||||
onValueChange={(value: any) => setAiOptions({ ...aiOptions, analysisType: value })}
|
||||
>
|
||||
<SelectTrigger id="analysis-type" className="bg-background h-9 text-sm">
|
||||
<SelectValue placeholder="选择分析类型" />
|
||||
</SelectTrigger>
|
||||
<SelectContent>
|
||||
<SelectItem value="general">综合分析</SelectItem>
|
||||
<SelectItem value="summary">数据摘要</SelectItem>
|
||||
<SelectItem value="statistics">统计分析</SelectItem>
|
||||
<SelectItem value="insights">深度洞察</SelectItem>
|
||||
</SelectContent>
|
||||
</Select>
|
||||
</div>
|
||||
<div className="space-y-2">
|
||||
<Label htmlFor="user-prompt" className="text-xs">自定义提示词</Label>
|
||||
<Textarea
|
||||
id="user-prompt"
|
||||
value={aiOptions.userPrompt}
|
||||
onChange={(e) => setAiOptions({ ...aiOptions, userPrompt: e.target.value })}
|
||||
className="bg-background resize-none text-sm"
|
||||
rows={2}
|
||||
/>
|
||||
</div>
|
||||
<Button
|
||||
onClick={handleAnalyzeExcel}
|
||||
disabled={excelAnalyzing}
|
||||
className="w-full gap-2 h-9"
|
||||
variant="outline"
|
||||
>
|
||||
{excelAnalyzing ? <Loader2 className="animate-spin" size={14} /> : <Sparkles size={14} />}
|
||||
{excelAnalyzing ? '分析中...' : 'AI 分析'}
|
||||
</Button>
|
||||
{excelParseResult?.success && (
|
||||
<Button
|
||||
onClick={handleUseExcelData}
|
||||
className="w-full gap-2 h-9"
|
||||
>
|
||||
<CheckCircle2 size={14} />
|
||||
使用此数据源
|
||||
</Button>
|
||||
)}
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* Excel Analysis Result */}
|
||||
{excelAnalysis && (
|
||||
<div className="mt-4 p-4 bg-background rounded-xl max-h-60 overflow-y-auto">
|
||||
<div className="flex items-center gap-2 mb-3">
|
||||
<Sparkles size={16} className="text-primary" />
|
||||
<span className="font-semibold text-sm">AI 分析结果</span>
|
||||
</div>
|
||||
<Markdown content={excelAnalysis.analysis?.analysis || ''} className="text-sm" />
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{/* Step 3: Select Documents */}
|
||||
<div className="space-y-4">
|
||||
<h4 className="font-bold flex items-center gap-2 text-primary uppercase tracking-widest text-xs">
|
||||
<span className="w-5 h-5 rounded-full bg-primary text-white flex items-center justify-center text-[10px]">2</span>
|
||||
选择其他数据源文档
|
||||
</h4>
|
||||
{documents.filter(d => d.status === 'completed').length > 0 ? (
|
||||
<div className="space-y-2 max-h-40 overflow-y-auto pr-2 custom-scrollbar">
|
||||
{documents.filter(d => d.status === 'completed').map(doc => (
|
||||
<div
|
||||
key={doc.id}
|
||||
className={cn(
|
||||
"flex items-center gap-3 p-3 rounded-xl border transition-all cursor-pointer",
|
||||
selectedDocs.includes(doc.id) ? "border-primary/50 bg-primary/5 shadow-sm" : "border-border hover:bg-muted/30"
|
||||
)}
|
||||
onClick={() => {
|
||||
setSelectedDocs(prev =>
|
||||
prev.includes(doc.id) ? prev.filter(id => id !== doc.id) : [...prev, doc.id]
|
||||
);
|
||||
}}
|
||||
>
|
||||
<Checkbox checked={selectedDocs.includes(doc.id)} onCheckedChange={() => {}} />
|
||||
<div className="w-8 h-8 rounded-lg bg-blue-500/10 text-blue-500 flex items-center justify-center">
|
||||
<Zap size={16} />
|
||||
</div>
|
||||
<span className="font-semibold text-sm truncate">{doc.name}</span>
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
) : (
|
||||
<div className="p-6 text-center bg-muted/30 rounded-xl border border-dashed text-xs italic text-muted-foreground">
|
||||
暂无其他已解析的文档
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
</ScrollArea>
|
||||
|
||||
<DialogFooter className="p-8 pt-4 bg-muted/20 border-t border-dashed">
|
||||
<Button variant="outline" className="rounded-xl h-12 px-6" onClick={() => setOpenTaskDialog(false)}>取消</Button>
|
||||
<Button
|
||||
className="rounded-xl h-12 px-8 shadow-lg shadow-primary/20 gap-2"
|
||||
onClick={handleCreateTask}
|
||||
disabled={creating || !selectedTemplate || (selectedDocs.length === 0 && !excelParseResult?.success)}
|
||||
>
|
||||
{creating ? <RefreshCcw className="animate-spin h-5 w-5" /> : <Zap className="h-5 w-5 fill-current" />}
|
||||
<span>启动智能填表引擎</span>
|
||||
</Button>
|
||||
</DialogFooter>
|
||||
</DialogContent>
|
||||
</Dialog>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
{/* Task List */}
|
||||
<div className="grid grid-cols-1 md:grid-cols-2 lg:grid-cols-3 gap-6">
|
||||
{loading ? (
|
||||
Array.from({ length: 3 }).map((_, i) => (
|
||||
<Skeleton key={i} className="h-48 w-full rounded-3xl bg-muted" />
|
||||
))
|
||||
) : tasks.length > 0 ? (
|
||||
tasks.map((task) => (
|
||||
<Card key={task.id} className="border-none shadow-md hover:shadow-xl transition-all group rounded-3xl overflow-hidden flex flex-col">
|
||||
<div className="h-1.5 w-full" style={{ backgroundColor: task.status === 'completed' ? '#10b981' : task.status === 'failed' ? '#ef4444' : '#f59e0b' }} />
|
||||
<CardHeader className="p-6 pb-2">
|
||||
<div className="flex justify-between items-start mb-2">
|
||||
<div className="w-12 h-12 rounded-2xl bg-emerald-500/10 text-emerald-500 flex items-center justify-center shadow-inner group-hover:scale-110 transition-transform">
|
||||
<TableProperties size={24} />
|
||||
</div>
|
||||
<Badge className={cn("text-[10px] uppercase font-bold tracking-widest", getStatusColor(task.status))}>
|
||||
{task.status === 'completed' ? '已完成' : task.status === 'failed' ? '失败' : '执行中'}
|
||||
</Badge>
|
||||
</div>
|
||||
<CardTitle className="text-lg font-bold truncate group-hover:text-primary transition-colors">{task.templates?.name || '未知模板'}</CardTitle>
|
||||
<CardDescription className="text-xs flex items-center gap-1 font-medium italic">
|
||||
<Clock size={12} /> {format(new Date(task.created_at!), 'yyyy/MM/dd HH:mm')}
|
||||
</CardDescription>
|
||||
</CardHeader>
|
||||
<CardContent className="p-6 pt-2 flex-1">
|
||||
<div className="space-y-4">
|
||||
<div className="flex flex-wrap gap-2">
|
||||
<Badge variant="outline" className="bg-muted/50 border-none text-[10px] font-bold">关联 {task.document_ids?.length} 份数据源</Badge>
|
||||
</div>
|
||||
{task.status === 'completed' && (
|
||||
<div className="p-3 bg-emerald-500/5 rounded-2xl border border-emerald-500/10 flex items-center gap-3">
|
||||
<CheckCircle2 className="text-emerald-500" size={18} />
|
||||
<span className="text-xs font-semibold text-emerald-700">内容已精准聚合,表格生成完毕</span>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
</CardContent>
|
||||
<CardFooter className="p-6 pt-0">
|
||||
<Button
|
||||
className="w-full rounded-2xl h-11 bg-primary group-hover:shadow-lg group-hover:shadow-primary/30 transition-all gap-2"
|
||||
disabled={task.status !== 'completed'}
|
||||
onClick={() => setViewingTask(task)}
|
||||
>
|
||||
<Download size={18} />
|
||||
<span>下载汇总表格</span>
|
||||
</Button>
|
||||
</CardFooter>
|
||||
</Card>
|
||||
))
|
||||
) : (
|
||||
<div className="col-span-full py-24 flex flex-col items-center justify-center text-center space-y-6">
|
||||
<div className="w-24 h-24 rounded-full bg-muted flex items-center justify-center text-muted-foreground/30 border-4 border-dashed">
|
||||
<TableProperties size={48} />
|
||||
</div>
|
||||
<div className="space-y-2 max-w-sm">
|
||||
<p className="text-2xl font-extrabold tracking-tight">暂无生成任务</p>
|
||||
<p className="text-muted-foreground text-sm">上传模板后,您可以将多个文档的数据自动填充到汇总表格中。</p>
|
||||
</div>
|
||||
<Button className="rounded-xl h-12 px-8" onClick={() => setOpenTaskDialog(true)}>立即创建首个任务</Button>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
|
||||
{/* Task Result View Modal */}
|
||||
<Dialog open={!!viewingTask} onOpenChange={(open) => !open && setViewingTask(null)}>
|
||||
<DialogContent className="max-w-4xl max-h-[90vh] flex flex-col p-0 overflow-hidden border-none shadow-2xl rounded-3xl">
|
||||
<DialogHeader className="p-8 pb-4 bg-primary text-primary-foreground">
|
||||
<div className="flex items-center gap-3 mb-2">
|
||||
<FileCheck size={28} />
|
||||
<DialogTitle className="text-2xl font-extrabold">表格生成结果预览</DialogTitle>
|
||||
</div>
|
||||
<DialogDescription className="text-primary-foreground/80 italic">
|
||||
系统已根据 {viewingTask?.document_ids?.length} 份文档信息自动填充完毕。
|
||||
</DialogDescription>
|
||||
</DialogHeader>
|
||||
<ScrollArea className="flex-1 p-8 bg-muted/10">
|
||||
<div className="prose dark:prose-invert max-w-none">
|
||||
<div className="bg-card p-8 rounded-2xl shadow-sm border min-h-[400px]">
|
||||
<Badge variant="outline" className="mb-4">数据已脱敏</Badge>
|
||||
<div className="whitespace-pre-wrap font-sans text-sm leading-relaxed">
|
||||
<h2 className="text-xl font-bold mb-4">汇总结果报告</h2>
|
||||
<p className="text-muted-foreground mb-6">以下是根据您上传的多个文档提取并生成的汇总信息:</p>
|
||||
|
||||
<div className="p-4 bg-muted/30 rounded-xl border border-dashed border-primary/20 italic">
|
||||
正在从云端安全下载解析结果并渲染渲染视图...
|
||||
</div>
|
||||
|
||||
<div className="mt-8 space-y-4">
|
||||
<p className="font-semibold text-primary">✓ 核心实体已对齐</p>
|
||||
<p className="font-semibold text-primary">✓ 逻辑勾稽关系校验通过</p>
|
||||
<p className="font-semibold text-primary">✓ 格式符合模板规范</p>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</ScrollArea>
|
||||
<DialogFooter className="p-8 pt-4 border-t border-dashed">
|
||||
<Button variant="outline" className="rounded-xl" onClick={() => setViewingTask(null)}>关闭</Button>
|
||||
<Button className="rounded-xl px-8 gap-2 shadow-lg shadow-primary/20" onClick={() => toast.success("正在导出文件...")}>
|
||||
<Download size={18} />
|
||||
导出为 {viewingTask?.templates?.type?.toUpperCase() || '文件'}
|
||||
</Button>
|
||||
</DialogFooter>
|
||||
</DialogContent>
|
||||
</Dialog>
|
||||
</div>
|
||||
);
|
||||
};
|
||||
|
||||
export default FormFill;
|
||||
351
frontend/src/pages/InstructionChat.tsx
Normal file
351
frontend/src/pages/InstructionChat.tsx
Normal file
@@ -0,0 +1,351 @@
|
||||
import React, { useState, useRef, useEffect } from 'react';
|
||||
import {
|
||||
Send,
|
||||
Bot,
|
||||
User,
|
||||
Sparkles,
|
||||
Trash2,
|
||||
RefreshCcw,
|
||||
FileText,
|
||||
TableProperties,
|
||||
ChevronRight,
|
||||
ArrowRight,
|
||||
Loader2
|
||||
} from 'lucide-react';
|
||||
import { Button } from '@/components/ui/button';
|
||||
import { Input } from '@/components/ui/input';
|
||||
import { Card, CardContent, CardHeader, CardTitle } from '@/components/ui/card';
|
||||
import { ScrollArea } from '@/components/ui/scroll-area';
|
||||
import { Badge } from '@/components/ui/badge';
|
||||
import { backendApi } from '@/db/backend-api';
|
||||
import { toast } from 'sonner';
|
||||
import { cn } from '@/lib/utils';
|
||||
|
||||
/** One entry in the assistant conversation shown on this page. */
type ChatMessage = {
  /** Identifier of the message within the conversation. */
  id: string;
  /** Author of the message. */
  role: 'assistant' | 'user';
  /** Message text. */
  content: string;
  /** Creation time as a string (the page stores `Date#toISOString()` output). */
  created_at: string;
};
|
||||
|
||||
const InstructionChat: React.FC = () => {
|
||||
const [messages, setMessages] = useState<ChatMessage[]>([]);
|
||||
const [input, setInput] = useState('');
|
||||
const [loading, setLoading] = useState(false);
|
||||
const scrollAreaRef = useRef<HTMLDivElement>(null);
|
||||
|
||||
useEffect(() => {
|
||||
// Initial welcome message
|
||||
if (messages.length === 0) {
|
||||
setMessages([
|
||||
{
|
||||
id: 'welcome',
|
||||
role: 'assistant',
|
||||
content: `您好!我是智联文档 AI 助手。
|
||||
|
||||
我可以帮您完成以下操作:
|
||||
|
||||
📄 **文档管理**
|
||||
- "帮我列出最近上传的所有文档"
|
||||
- "删除三天前的 docx 文档"
|
||||
|
||||
📊 **Excel 分析**
|
||||
- "分析一下最近上传的 Excel 文件"
|
||||
- "帮我统计销售报表中的数据"
|
||||
|
||||
📝 **智能填表**
|
||||
- "根据员工信息表创建一个考勤汇总表"
|
||||
- "用财务文档填充报销模板"
|
||||
|
||||
请告诉我您想做什么?`,
|
||||
created_at: new Date().toISOString()
|
||||
}
|
||||
]);
|
||||
}
|
||||
}, []);
|
||||
|
||||
useEffect(() => {
|
||||
// Scroll to bottom
|
||||
if (scrollAreaRef.current) {
|
||||
const scrollElement = scrollAreaRef.current.querySelector('[data-radix-scroll-area-viewport]');
|
||||
if (scrollElement) {
|
||||
scrollElement.scrollTop = scrollElement.scrollHeight;
|
||||
}
|
||||
}
|
||||
}, [messages]);
|
||||
|
||||
/**
 * Append the typed text as a user message, then append an assistant reply.
 *
 * The reply is built locally by keyword matching on the message (the backend
 * chat endpoint is still a TODO); only the "list documents" intent calls
 * `backendApi.getDocuments`. Any failure is logged and shown as an error toast.
 */
const handleSend = async () => {
  const text = input.trim();
  if (!text) return;

  // Timestamp plus random suffix. The previous
  // `Math.random().toString(36).substring(7)` could return a very short or
  // even empty string, which risks duplicate React keys in the message list.
  const makeId = () =>
    `${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 10)}`;

  const userMessage: ChatMessage = {
    id: makeId(),
    role: 'user',
    content: text,
    created_at: new Date().toISOString()
  };

  setMessages(prev => [...prev, userMessage]);
  setInput('');
  setLoading(true);

  try {
    // TODO: replace this simulated latency with the real backend chat endpoint.
    await new Promise(resolve => setTimeout(resolve, 1500));

    // Minimal keyword-based intent matching (demo only).
    const userInput = text.toLowerCase();
    let response = '';

    if (userInput.includes('列出') || userInput.includes('列表')) {
      const result = await backendApi.getDocuments(undefined, 10);
      if (result.success && result.documents && result.documents.length > 0) {
        response = `已为您找到 ${result.documents.length} 个文档:\n\n`;
        // Only the first five documents are spelled out; the rest are summarised.
        result.documents.slice(0, 5).forEach((doc: any, idx: number) => {
          // A record without `doc_type` must not abort the whole listing.
          const docType = String(doc.doc_type || '').toUpperCase();
          response += `${idx + 1}. **${doc.original_filename}** (${docType})\n`;
          response += ` - 大小: ${(doc.file_size / 1024).toFixed(1)} KB\n`;
          response += ` - 时间: ${new Date(doc.created_at).toLocaleDateString()}\n\n`;
        });
        if (result.documents.length > 5) {
          response += `...还有 ${result.documents.length - 5} 个文档`;
        }
      } else {
        response = '暂未找到已上传的文档,您可以先上传一些文档试试。';
      }
    } else if (userInput.includes('分析') || userInput.includes('excel') || userInput.includes('报表')) {
      response = `好的,我可以帮您分析 Excel 文件。

请告诉我:
1. 您想分析哪个 Excel 文件?
2. 需要什么样的分析?(数据摘要/统计分析/图表生成)

或者您可以直接告诉我您想从数据中了解什么,我来为您生成分析。`;
    } else if (userInput.includes('填表') || userInput.includes('模板')) {
      response = `好的,要进行智能填表,我需要:

1. **上传表格模板** - 您要填写的表格模板文件(Excel 或 Word 格式)
2. **选择数据源** - 包含要填写内容的源文档

您可以去【智能填表】页面完成这些操作,或者告诉我您具体想填什么类型的表格,我来指导您操作。`;
    } else if (userInput.includes('删除')) {
      response = `要删除文档,请告诉我:

- 要删除的文件名是什么?
- 或者您可以到【文档中心】页面手动选择并删除文档

⚠️ 删除操作不可恢复,请确认后再操作。`;
    } else if (userInput.includes('帮助') || userInput.includes('help')) {
      response = `**我可以帮您完成以下操作:**

📄 **文档管理**
- 列出/搜索已上传的文档
- 查看文档详情和元数据
- 删除不需要的文档

📊 **Excel 处理**
- 分析 Excel 文件内容
- 生成数据统计和图表
- 导出处理后的数据

📝 **智能填表**
- 上传表格模板
- 从文档中提取信息填入模板
- 导出填写完成的表格

📋 **任务历史**
- 查看历史处理任务
- 重新执行或导出结果

请直接告诉我您想做什么!`;
    } else {
      // Echo the message that was actually sent rather than re-reading `input`.
      response = `我理解您想要: "${userMessage.content}"

目前我还在学习如何更好地理解您的需求。您可以尝试:

1. **上传文档** - 去【文档中心】上传 docx/md/txt 文件
2. **分析 Excel** - 去【Excel解析】上传并分析 Excel 文件
3. **智能填表** - 去【智能填表】创建填表任务

或者您可以更具体地描述您想做的事情,我会尽力帮助您!`;
    }

    const assistantMessage: ChatMessage = {
      id: makeId(),
      role: 'assistant',
      content: response,
      created_at: new Date().toISOString()
    };

    setMessages(prev => [...prev, assistantMessage]);
  } catch (err: any) {
    // Keep the underlying cause visible in the console; the toast stays generic.
    console.error('对话请求失败:', err);
    toast.error('请求失败,请重试');
  } finally {
    setLoading(false);
  }
};
|
||||
|
||||
/**
 * Reset the conversation to just its first message (the welcome greeting)
 * and confirm with a toast.
 */
const clearChat = () => {
  // The functional update works on the latest state instead of a possibly
  // stale `messages` snapshot, and slice() yields [] rather than [undefined]
  // if the list is empty (an undefined entry would crash the render at m.id).
  setMessages(prev => prev.slice(0, 1));
  toast.success('对话已清空');
};
|
||||
|
||||
const quickActions = [
|
||||
{ label: '列出所有文档', icon: FileText, action: () => setInput('列出所有已上传的文档') },
|
||||
{ label: '分析 Excel 数据', icon: TableProperties, action: () => setInput('分析一下 Excel 文件') },
|
||||
{ label: '智能填表', icon: Sparkles, action: () => setInput('我想进行智能填表') },
|
||||
{ label: '帮助', icon: Sparkles, action: () => setInput('帮助') }
|
||||
];
|
||||
|
||||
return (
|
||||
<div className="h-[calc(100vh-8rem)] flex flex-col gap-6 animate-fade-in relative">
|
||||
<section className="flex flex-col md:flex-row md:items-center justify-between gap-4">
|
||||
<div className="space-y-1">
|
||||
<h1 className="text-3xl font-extrabold tracking-tight flex items-center gap-3">
|
||||
<Sparkles className="text-primary animate-pulse" />
|
||||
智能助手
|
||||
</h1>
|
||||
<p className="text-muted-foreground">通过自然语言指令,极速操控您的整个文档数据库。</p>
|
||||
</div>
|
||||
<Button
|
||||
variant="outline"
|
||||
size="sm"
|
||||
className="rounded-xl gap-2 h-10 border-none bg-card shadow-sm hover:bg-destructive/10 hover:text-destructive"
|
||||
onClick={clearChat}
|
||||
>
|
||||
<Trash2 size={16} />
|
||||
<span>清除历史</span>
|
||||
</Button>
|
||||
</section>
|
||||
|
||||
<div className="flex-1 flex flex-col lg:flex-row gap-6 min-h-0">
|
||||
{/* Chat Area */}
|
||||
<Card className="flex-1 flex flex-col border-none shadow-xl overflow-hidden rounded-3xl bg-card/50 backdrop-blur-sm">
|
||||
<ScrollArea className="flex-1 p-6" ref={scrollAreaRef}>
|
||||
<div className="space-y-8 pb-4">
|
||||
{messages.map((m) => (
|
||||
<div
|
||||
key={m.id}
|
||||
className={cn(
|
||||
"flex gap-4 max-w-[85%]",
|
||||
m.role === 'user' ? "ml-auto flex-row-reverse" : "mr-auto"
|
||||
)}
|
||||
>
|
||||
<div className={cn(
|
||||
"w-10 h-10 rounded-2xl flex items-center justify-center shrink-0 shadow-lg",
|
||||
m.role === 'user' ? "bg-primary text-primary-foreground" : "bg-white text-primary border border-primary/20"
|
||||
)}>
|
||||
{m.role === 'user' ? <User size={20} /> : <Bot size={22} />}
|
||||
</div>
|
||||
<div className={cn(
|
||||
"space-y-2 p-5 rounded-3xl",
|
||||
m.role === 'user'
|
||||
? "bg-primary text-primary-foreground shadow-xl shadow-primary/20 rounded-tr-none"
|
||||
: "bg-white border border-border/50 shadow-md rounded-tl-none"
|
||||
)}>
|
||||
<p className="text-sm leading-relaxed whitespace-pre-wrap font-medium">
|
||||
{m.content}
|
||||
</p>
|
||||
<span className={cn(
|
||||
"text-[10px] block opacity-50 font-bold tracking-widest",
|
||||
m.role === 'user' ? "text-right" : "text-left"
|
||||
)}>
|
||||
{new Date(m.created_at).toLocaleTimeString([], { hour: '2-digit', minute: '2-digit' })}
|
||||
</span>
|
||||
</div>
|
||||
</div>
|
||||
))}
|
||||
{loading && (
|
||||
<div className="flex gap-4 mr-auto max-w-[85%] animate-pulse">
|
||||
<div className="w-10 h-10 rounded-2xl bg-muted flex items-center justify-center shrink-0 border border-border/50">
|
||||
<Bot size={22} className="text-muted-foreground" />
|
||||
</div>
|
||||
<div className="p-5 rounded-3xl rounded-tl-none bg-muted/50 border border-border/50">
|
||||
<div className="flex gap-2">
|
||||
<div className="w-2 h-2 rounded-full bg-primary/40 animate-bounce [animation-delay:-0.3s]" />
|
||||
<div className="w-2 h-2 rounded-full bg-primary/40 animate-bounce [animation-delay:-0.15s]" />
|
||||
<div className="w-2 h-2 rounded-full bg-primary/40 animate-bounce" />
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
</ScrollArea>
|
||||
|
||||
<CardContent className="p-6 bg-white/50 backdrop-blur-xl border-t border-border/50">
|
||||
<form
|
||||
onSubmit={(e) => { e.preventDefault(); handleSend(); }}
|
||||
className="w-full flex gap-3 bg-muted/30 p-2 rounded-2xl border border-border/50 focus-within:border-primary/50 transition-all shadow-inner"
|
||||
>
|
||||
<Input
|
||||
placeholder="尝试输入:帮我分析最近上传的 Excel 文件..."
|
||||
className="flex-1 bg-transparent border-none focus-visible:ring-0 shadow-none h-12 text-base font-medium"
|
||||
value={input}
|
||||
onChange={(e) => setInput(e.target.value)}
|
||||
disabled={loading}
|
||||
/>
|
||||
<Button
|
||||
type="submit"
|
||||
size="icon"
|
||||
className="w-12 h-12 rounded-xl bg-primary hover:scale-105 transition-all shadow-lg shadow-primary/20"
|
||||
disabled={loading || !input.trim()}
|
||||
>
|
||||
<Send size={20} />
|
||||
</Button>
|
||||
</form>
|
||||
</CardContent>
|
||||
</Card>
|
||||
|
||||
{/* Quick Actions Panel */}
|
||||
<aside className="w-full lg:w-80 space-y-6">
|
||||
<Card className="border-none shadow-lg rounded-3xl bg-gradient-to-br from-primary/5 via-background to-background">
|
||||
<CardHeader className="p-6">
|
||||
<CardTitle className="text-sm font-bold uppercase tracking-widest text-primary flex items-center gap-2">
|
||||
<Sparkles size={16} />
|
||||
快捷操作
|
||||
</CardTitle>
|
||||
</CardHeader>
|
||||
<CardContent className="p-6 pt-0 space-y-3">
|
||||
{quickActions.map((action, i) => (
|
||||
<button
|
||||
key={i}
|
||||
className="w-full flex items-center gap-3 p-3 rounded-2xl hover:bg-white hover:shadow-md transition-all group text-left border border-transparent hover:border-primary/10"
|
||||
onClick={action.action}
|
||||
>
|
||||
<div className="w-8 h-8 rounded-lg bg-primary/10 text-primary flex items-center justify-center shrink-0">
|
||||
<action.icon size={16} />
|
||||
</div>
|
||||
<span className="text-sm font-semibold truncate group-hover:text-primary transition-colors">{action.label}</span>
|
||||
<ArrowRight size={14} className="ml-auto opacity-0 group-hover:opacity-100 -translate-x-2 group-hover:translate-x-0 transition-all" />
|
||||
</button>
|
||||
))}
|
||||
</CardContent>
|
||||
</Card>
|
||||
|
||||
<Card className="border-none shadow-lg rounded-3xl bg-gradient-to-br from-indigo-500/10 to-blue-500/10 overflow-hidden relative">
|
||||
<div className="absolute top-0 right-0 p-4 opacity-20">
|
||||
<Sparkles size={100} />
|
||||
</div>
|
||||
<CardHeader className="p-6 relative z-10">
|
||||
<CardTitle className="text-lg font-bold">功能说明</CardTitle>
|
||||
</CardHeader>
|
||||
<CardContent className="p-6 pt-0 relative z-10 space-y-4 text-sm text-muted-foreground">
|
||||
<div className="flex items-start gap-2">
|
||||
<FileText size={16} className="mt-0.5 text-blue-500 shrink-0" />
|
||||
<span>上传 docx/md/txt 文档到 MongoDB</span>
|
||||
</div>
|
||||
<div className="flex items-start gap-2">
|
||||
<TableProperties size={16} className="mt-0.5 text-emerald-500 shrink-0" />
|
||||
<span>上传 xlsx 文档到 MySQL</span>
|
||||
</div>
|
||||
<div className="flex items-start gap-2">
|
||||
<Sparkles size={16} className="mt-0.5 text-indigo-500 shrink-0" />
|
||||
<span>使用 RAG 智能检索和填表</span>
|
||||
</div>
|
||||
</CardContent>
|
||||
</Card>
|
||||
</aside>
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
};
|
||||
|
||||
export default InstructionChat;
|
||||
@@ -1,184 +0,0 @@
|
||||
import React, { useState } from 'react';
|
||||
import { useNavigate, useLocation } from 'react-router-dom';
|
||||
import { useAuth } from '@/context/AuthContext';
|
||||
import { Button } from '@/components/ui/button';
|
||||
import { Input } from '@/components/ui/input';
|
||||
import { Label } from '@/components/ui/label';
|
||||
import { Card, CardContent, CardDescription, CardFooter, CardHeader, CardTitle } from '@/components/ui/card';
|
||||
import { Tabs, TabsContent, TabsList, TabsTrigger } from '@/components/ui/tabs';
|
||||
import { FileText, Lock, User, CheckCircle2, AlertCircle } from 'lucide-react';
|
||||
import { toast } from 'sonner';
|
||||
|
||||
const Login: React.FC = () => {
|
||||
const [username, setUsername] = useState('');
|
||||
const [password, setPassword] = useState('');
|
||||
const [loading, setLoading] = useState(false);
|
||||
const { signIn, signUp } = useAuth();
|
||||
const navigate = useNavigate();
|
||||
const location = useLocation();
|
||||
|
||||
/**
 * Submit handler for the login form.
 *
 * Builds the account e-mail from the username, calls `signIn`, and navigates
 * to "/" on success. Errors are reported through a toast.
 */
const handleLogin = async (e: React.FormEvent) => {
  e.preventDefault();

  // Trim first: a whitespace-only username used to pass the empty check and
  // produced an invalid address such as "   @miaoda.com".
  const name = username.trim();
  if (!name || !password) return toast.error('请输入用户名和密码');

  setLoading(true);
  try {
    const email = `${name}@miaoda.com`;
    const { error } = await signIn(email, password);
    if (error) throw error;
    toast.success('登录成功');
    navigate('/');
  } catch (err: any) {
    // `err` may be a non-Error value; fall back to the generic message.
    toast.error(err?.message || '登录失败');
  } finally {
    setLoading(false);
  }
};
|
||||
|
||||
/**
 * Submit handler for the sign-up form.
 *
 * Validates the input against the rules the form advertises in its
 * placeholders (username: letters, digits and underscore only; password: at
 * least 6 characters), then calls `signUp` with the derived e-mail address.
 */
const handleSignUp = async (e: React.FormEvent) => {
  e.preventDefault();

  const name = username.trim();
  if (!name || !password) return toast.error('请输入用户名和密码');

  // The placeholders promise these constraints; enforce them before the
  // request so an invalid e-mail local part is never sent.
  if (!/^[A-Za-z0-9_]+$/.test(name)) {
    return toast.error('用户名只能包含字母、数字和下划线');
  }
  if (password.length < 6) {
    return toast.error('密码长度不能少于 6 位');
  }

  setLoading(true);
  try {
    const email = `${name}@miaoda.com`;
    const { error } = await signUp(email, password);
    if (error) throw error;
    toast.success('注册成功,请登录');
  } catch (err: any) {
    // `err` may be a non-Error value; fall back to the generic message.
    toast.error(err?.message || '注册失败');
  } finally {
    setLoading(false);
  }
};
|
||||
|
||||
return (
|
||||
<div className="min-h-screen flex items-center justify-center bg-[radial-gradient(ellipse_at_top_left,_var(--tw-gradient-stops))] from-primary/10 via-background to-background p-4 relative overflow-hidden">
|
||||
{/* Decorative elements */}
|
||||
<div className="absolute top-0 left-0 w-96 h-96 bg-primary/5 rounded-full blur-3xl -translate-x-1/2 -translate-y-1/2" />
|
||||
<div className="absolute bottom-0 right-0 w-64 h-64 bg-primary/5 rounded-full blur-3xl translate-x-1/3 translate-y-1/3" />
|
||||
|
||||
<div className="w-full max-w-md space-y-8 relative animate-fade-in">
|
||||
<div className="text-center space-y-2">
|
||||
<div className="inline-flex items-center justify-center w-16 h-16 rounded-2xl bg-primary text-primary-foreground shadow-2xl shadow-primary/30 mb-4 animate-slide-in">
|
||||
<FileText size={32} />
|
||||
</div>
|
||||
<h1 className="text-4xl font-extrabold tracking-tight gradient-text">智联文档</h1>
|
||||
<p className="text-muted-foreground">多源数据融合与智能文档处理系统</p>
|
||||
</div>
|
||||
|
||||
<Card className="border-border/50 shadow-2xl backdrop-blur-sm bg-card/95">
|
||||
<Tabs defaultValue="login" className="w-full">
|
||||
<TabsList className="grid w-full grid-cols-2 rounded-t-xl h-12 bg-muted/50 p-1">
|
||||
<TabsTrigger value="login" className="rounded-lg data-[state=active]:bg-background data-[state=active]:shadow-sm">登录</TabsTrigger>
|
||||
<TabsTrigger value="signup" className="rounded-lg data-[state=active]:bg-background data-[state=active]:shadow-sm">注册</TabsTrigger>
|
||||
</TabsList>
|
||||
|
||||
<TabsContent value="login">
|
||||
<form onSubmit={handleLogin}>
|
||||
<CardHeader>
|
||||
<CardTitle>欢迎回来</CardTitle>
|
||||
<CardDescription>使用您的账号登录智联文档系统</CardDescription>
|
||||
</CardHeader>
|
||||
<CardContent className="space-y-4">
|
||||
<div className="space-y-2">
|
||||
<Label htmlFor="username">用户名</Label>
|
||||
<div className="relative">
|
||||
<User className="absolute left-3 top-2.5 h-4 w-4 text-muted-foreground" />
|
||||
<Input
|
||||
id="username"
|
||||
placeholder="请输入用户名"
|
||||
className="pl-9 bg-muted/30 border-none focus-visible:ring-primary"
|
||||
value={username}
|
||||
onChange={(e) => setUsername(e.target.value)}
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
<div className="space-y-2">
|
||||
<Label htmlFor="password">密码</Label>
|
||||
<div className="relative">
|
||||
<Lock className="absolute left-3 top-2.5 h-4 w-4 text-muted-foreground" />
|
||||
<Input
|
||||
id="password"
|
||||
type="password"
|
||||
placeholder="请输入密码"
|
||||
className="pl-9 bg-muted/30 border-none focus-visible:ring-primary"
|
||||
value={password}
|
||||
onChange={(e) => setPassword(e.target.value)}
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
</CardContent>
|
||||
<CardFooter>
|
||||
<Button className="w-full h-11 text-lg font-semibold rounded-xl" type="submit" disabled={loading}>
|
||||
{loading ? '登录中...' : '立即登录'}
|
||||
</Button>
|
||||
</CardFooter>
|
||||
</form>
|
||||
</TabsContent>
|
||||
|
||||
<TabsContent value="signup">
|
||||
<form onSubmit={handleSignUp}>
|
||||
<CardHeader>
|
||||
<CardTitle>创建账号</CardTitle>
|
||||
<CardDescription>开启智能文档处理的新体验</CardDescription>
|
||||
</CardHeader>
|
||||
<CardContent className="space-y-4">
|
||||
<div className="space-y-2">
|
||||
<Label htmlFor="signup-username">用户名</Label>
|
||||
<div className="relative">
|
||||
<User className="absolute left-3 top-2.5 h-4 w-4 text-muted-foreground" />
|
||||
<Input
|
||||
id="signup-username"
|
||||
placeholder="仅字母、数字和下划线"
|
||||
className="pl-9 bg-muted/30 border-none focus-visible:ring-primary"
|
||||
value={username}
|
||||
onChange={(e) => setUsername(e.target.value)}
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
<div className="space-y-2">
|
||||
<Label htmlFor="signup-password">密码</Label>
|
||||
<div className="relative">
|
||||
<Lock className="absolute left-3 top-2.5 h-4 w-4 text-muted-foreground" />
|
||||
<Input
|
||||
id="signup-password"
|
||||
type="password"
|
||||
placeholder="不少于 6 位"
|
||||
className="pl-9 bg-muted/30 border-none focus-visible:ring-primary"
|
||||
value={password}
|
||||
onChange={(e) => setPassword(e.target.value)}
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
</CardContent>
|
||||
<CardFooter>
|
||||
<Button className="w-full h-11 text-lg font-semibold rounded-xl" type="submit" disabled={loading}>
|
||||
{loading ? '注册中...' : '注册账号'}
|
||||
</Button>
|
||||
</CardFooter>
|
||||
</form>
|
||||
</TabsContent>
|
||||
</Tabs>
|
||||
</Card>
|
||||
|
||||
<div className="grid grid-cols-2 gap-4 text-center text-xs text-muted-foreground">
|
||||
<div className="flex flex-col items-center gap-1">
|
||||
<CheckCircle2 size={16} className="text-primary" />
|
||||
<span>智能解析</span>
|
||||
</div>
|
||||
<div className="flex flex-col items-center gap-1">
|
||||
<CheckCircle2 size={16} className="text-primary" />
|
||||
<span>极速填表</span>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div className="text-center text-sm text-muted-foreground">
|
||||
© 2026 智联文档 | 多源数据融合系统
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
};
|
||||
|
||||
export default Login;
|
||||
@@ -1,16 +0,0 @@
|
||||
/**
|
||||
* Sample Page
|
||||
*/
|
||||
|
||||
import PageMeta from "../components/common/PageMeta";
|
||||
|
||||
/**
 * Placeholder page: renders `PageMeta` with the "Home" title/description
 * followed by a single heading.
 */
export default function SamplePage() {
  const heading = <h3>This is a sample page</h3>;

  return (
    <>
      <PageMeta title="Home" description="Home Page Introduction" />
      <div>{heading}</div>
    </>
  );
}
|
||||
302
frontend/src/pages/TaskHistory.tsx
Normal file
302
frontend/src/pages/TaskHistory.tsx
Normal file
@@ -0,0 +1,302 @@
|
||||
import React, { useState, useEffect } from 'react';
|
||||
import {
|
||||
Clock,
|
||||
CheckCircle2,
|
||||
XCircle,
|
||||
RefreshCcw,
|
||||
Download,
|
||||
FileText,
|
||||
FileSpreadsheet,
|
||||
Loader2,
|
||||
ChevronDown,
|
||||
ChevronUp,
|
||||
Trash2,
|
||||
AlertCircle,
|
||||
HelpCircle
|
||||
} from 'lucide-react';
|
||||
import { Card, CardContent, CardHeader, CardTitle, CardDescription } from '@/components/ui/card';
|
||||
import { Button } from '@/components/ui/button';
|
||||
import { Badge } from '@/components/ui/badge';
|
||||
import { backendApi } from '@/db/backend-api';
|
||||
import { format } from 'date-fns';
|
||||
import { toast } from 'sonner';
|
||||
import { cn } from '@/lib/utils';
|
||||
import { Skeleton } from '@/components/ui/skeleton';
|
||||
|
||||
/** Status values a task can carry on this page. */
type TaskStatus = 'pending' | 'processing' | 'success' | 'failure' | 'unknown';

/** One task record as held in the task-history page state. */
type Task = {
  task_id: string;
  task_type?: string;
  status: TaskStatus;
  message?: string;
  result?: any;
  error?: string;
  created_at: string;
  updated_at?: string;
};
|
||||
|
||||
const TaskHistory: React.FC = () => {
|
||||
const [tasks, setTasks] = useState<Task[]>([]);
|
||||
const [loading, setLoading] = useState(true);
|
||||
const [expandedTask, setExpandedTask] = useState<string | null>(null);
|
||||
|
||||
/**
 * Load the task history from the backend into component state.
 *
 * Requests `backendApi.getTasks(50, 0)`; on an unsuccessful or empty response,
 * or on any thrown error, the list is reset to empty. `loading` is raised for
 * the duration of the request.
 */
const fetchTasks = async () => {
  // Convert one backend record into the front-end `Task` shape, filling defaults.
  const toTask = (raw: any): Task => ({
    task_id: raw.task_id,
    status: raw.status || 'unknown',
    created_at: raw.created_at || new Date().toISOString(),
    updated_at: raw.updated_at,
    message: raw.message || '',
    result: raw.result,
    error: raw.error,
    task_type: raw.task_type || 'document_parse'
  });

  try {
    setLoading(true);
    const response = await backendApi.getTasks(50, 0);

    if (!(response.success && response.tasks)) {
      setTasks([]);
      return;
    }

    const convertedTasks: Task[] = response.tasks.map((item: any) => toTask(item));
    setTasks(convertedTasks);
  } catch (error) {
    console.error('获取任务列表失败:', error);
    toast.error('获取任务列表失败');
    setTasks([]);
  } finally {
    setLoading(false);
  }
};
|
||||
|
||||
useEffect(() => {
|
||||
fetchTasks();
|
||||
}, []);
|
||||
|
||||
/**
 * Return the coloured status badge for a task status string.
 * Any status other than success/failure/processing/unknown gets the grey
 * "waiting" badge.
 */
const getStatusBadge = (status: string) => {
  if (status === 'success') {
    return <Badge className="bg-emerald-500 text-white text-[10px]"><CheckCircle2 size={12} className="mr-1" />成功</Badge>;
  }
  if (status === 'failure') {
    return <Badge className="bg-destructive text-white text-[10px]"><XCircle size={12} className="mr-1" />失败</Badge>;
  }
  if (status === 'processing') {
    return <Badge className="bg-amber-500 text-white text-[10px]"><Loader2 size={12} className="mr-1 animate-spin" />处理中</Badge>;
  }
  if (status === 'unknown') {
    return <Badge className="bg-gray-500 text-white text-[10px]"><HelpCircle size={12} className="mr-1" />未知</Badge>;
  }
  // Fallback, e.g. 'pending'.
  return <Badge className="bg-gray-500 text-white text-[10px]"><Clock size={12} className="mr-1" />等待</Badge>;
};
|
||||
|
||||
/**
 * Map a task type identifier to its display label.
 * Unrecognised or missing types yield the "unknown task" label.
 */
const getTaskTypeLabel = (type?: string) => {
  // A Map (not a plain object) so inherited keys such as "constructor"
  // can never match.
  const labels = new Map<string, string>([
    ['document_parse', '文档解析'],
    ['excel_analysis', 'Excel 分析'],
    ['template_fill', '智能填表'],
    ['rag_index', 'RAG 索引']
  ]);

  const label = type === undefined ? undefined : labels.get(type);
  return label || '未知任务';
};
|
||||
|
||||
/**
 * Pick the 20px icon for a task type: a document icon for parse/index tasks,
 * a spreadsheet icon for Excel analysis, and a clock for everything else.
 */
const getTaskIcon = (type?: string) => {
  const isDocumentTask = type === 'document_parse' || type === 'rag_index';
  if (isDocumentTask) {
    return <FileText size={20} />;
  }
  if (type === 'excel_analysis') {
    return <FileSpreadsheet size={20} />;
  }
  return <Clock size={20} />;
};
|
||||
|
||||
/**
 * Retry handler for failed tasks. Retrying is not implemented yet: the
 * handler only shows an informational toast and does not use `taskId`.
 */
const handleRetry = async (taskId: string) => {
  const notice = '任务重试功能开发中...';
  toast.info(notice);
};
|
||||
|
||||
/**
 * Delete a task on the backend and, once that succeeds, drop it from the list.
 *
 * A thrown error or an explicit `{ success: false }` reply is reported as a
 * failure and leaves the list unchanged.
 */
const handleDelete = async (taskId: string) => {
  try {
    const result: any = await backendApi.deleteTask(taskId);

    // Only an explicit `success: false` counts as a failure, so an API that
    // resolves with no body keeps working as before.
    // NOTE(review): assumes deleteTask uses the same { success, message }
    // envelope as the other backendApi calls on this page; confirm.
    if (result && result.success === false) {
      toast.error(result.message || '删除任务失败');
      return;
    }

    setTasks(prev => prev.filter(t => t.task_id !== taskId));
    toast.success('任务已删除');
  } catch (error) {
    console.error('删除任务失败:', error);
    toast.error('删除任务失败');
  }
};
|
||||
|
||||
const stats = {
|
||||
total: tasks.length,
|
||||
success: tasks.filter(t => t.status === 'success').length,
|
||||
processing: tasks.filter(t => t.status === 'processing').length,
|
||||
failure: tasks.filter(t => t.status === 'failure').length,
|
||||
unknown: tasks.filter(t => t.status === 'unknown').length
|
||||
};
|
||||
|
||||
return (
|
||||
<div className="space-y-8 pb-10">
|
||||
<section className="flex flex-col md:flex-row md:items-center justify-between gap-4">
|
||||
<div className="space-y-1">
|
||||
<h1 className="text-3xl font-extrabold tracking-tight">任务历史</h1>
|
||||
<p className="text-muted-foreground">查看和管理您所有的文档处理任务记录</p>
|
||||
</div>
|
||||
<Button variant="outline" className="rounded-xl gap-2" onClick={() => fetchTasks()}>
|
||||
<RefreshCcw size={18} />
|
||||
<span>刷新</span>
|
||||
</Button>
|
||||
</section>
|
||||
|
||||
{/* Stats Cards */}
|
||||
<div className="grid grid-cols-2 md:grid-cols-4 gap-4">
|
||||
{[
|
||||
{ label: '总任务数', value: stats.total, icon: Clock, color: 'text-blue-500', bg: 'bg-blue-500/10' },
|
||||
{ label: '成功', value: stats.success, icon: CheckCircle2, color: 'text-emerald-500', bg: 'bg-emerald-500/10' },
|
||||
{ label: '处理中', value: stats.processing, icon: Loader2, color: 'text-amber-500', bg: 'bg-amber-500/10' },
|
||||
{ label: '失败', value: stats.failure, icon: XCircle, color: 'text-destructive', bg: 'bg-destructive/10' }
|
||||
].map((stat, i) => (
|
||||
<Card key={i} className="border-none shadow-sm">
|
||||
<CardContent className="p-4 flex items-center gap-4">
|
||||
<div className={cn("w-12 h-12 rounded-xl flex items-center justify-center", stat.bg)}>
|
||||
<stat.icon size={24} className={stat.color} />
|
||||
</div>
|
||||
<div>
|
||||
<p className="text-2xl font-bold">{stat.value}</p>
|
||||
<p className="text-xs text-muted-foreground">{stat.label}</p>
|
||||
</div>
|
||||
</CardContent>
|
||||
</Card>
|
||||
))}
|
||||
</div>
|
||||
|
||||
{/* Task List */}
|
||||
<div className="space-y-4">
|
||||
{loading ? (
|
||||
Array.from({ length: 3 }).map((_, i) => (
|
||||
<Skeleton key={i} className="h-24 w-full rounded-2xl" />
|
||||
))
|
||||
) : tasks.length > 0 ? (
|
||||
tasks.map((task) => (
|
||||
<Card key={task.task_id} className="border-none shadow-sm overflow-hidden">
|
||||
<div className="flex flex-col">
|
||||
<div className="p-6 flex flex-col md:flex-row md:items-center gap-4">
|
||||
<div className={cn(
|
||||
"w-12 h-12 rounded-xl flex items-center justify-center shrink-0",
|
||||
task.status === 'success' ? "bg-emerald-500/10 text-emerald-500" :
|
||||
task.status === 'failure' ? "bg-destructive/10 text-destructive" :
|
||||
task.status === 'processing' ? "bg-amber-500/10 text-amber-500" :
|
||||
"bg-gray-500/10 text-gray-500"
|
||||
)}>
|
||||
{task.status === 'processing' ? (
|
||||
<Loader2 size={24} className="animate-spin" />
|
||||
) : (
|
||||
getTaskIcon(task.task_type)
|
||||
)}
|
||||
</div>
|
||||
|
||||
<div className="flex-1 min-w-0 space-y-1">
|
||||
<div className="flex items-center gap-3 flex-wrap">
|
||||
<h3 className="font-bold">{getTaskTypeLabel(task.task_type)}</h3>
|
||||
{getStatusBadge(task.status)}
|
||||
<Badge variant="outline" className="text-[10px]">
|
||||
{task.task_id}
|
||||
</Badge>
|
||||
</div>
|
||||
<p className="text-sm text-muted-foreground">
|
||||
{task.message || (task.status === 'unknown' ? '无法获取状态' : '任务执行中...')}
|
||||
</p>
|
||||
<div className="flex items-center gap-4 text-xs text-muted-foreground">
|
||||
<span className="flex items-center gap-1">
|
||||
<Clock size={12} />
|
||||
{task.created_at ? format(new Date(task.created_at), 'yyyy-MM-dd HH:mm:ss') : '时间未知'}
|
||||
</span>
|
||||
{task.updated_at && task.status !== 'processing' && (
|
||||
<span>
|
||||
更新: {format(new Date(task.updated_at), 'HH:mm:ss')}
|
||||
</span>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div className="flex items-center gap-2">
|
||||
{task.status === 'failure' && (
|
||||
<Button
|
||||
variant="outline"
|
||||
size="sm"
|
||||
className="rounded-xl gap-1 h-9"
|
||||
onClick={() => handleRetry(task.task_id)}
|
||||
>
|
||||
<RefreshCcw size={14} />
|
||||
<span>重试</span>
|
||||
</Button>
|
||||
)}
|
||||
<Button
|
||||
variant="ghost"
|
||||
size="icon"
|
||||
className="rounded-lg text-destructive hover:bg-destructive/10"
|
||||
onClick={() => handleDelete(task.task_id)}
|
||||
>
|
||||
<Trash2 size={18} />
|
||||
</Button>
|
||||
{task.result && (
|
||||
<Button
|
||||
variant="ghost"
|
||||
size="sm"
|
||||
className="rounded-xl gap-1 h-9"
|
||||
onClick={() => setExpandedTask(expandedTask === task.task_id ? null : task.task_id)}
|
||||
>
|
||||
{expandedTask === task.task_id ? <ChevronUp size={14} /> : <ChevronDown size={14} />}
|
||||
<span>详情</span>
|
||||
</Button>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{/* Expanded Details */}
|
||||
{expandedTask === task.task_id && task.result && (
|
||||
<div className="px-6 pb-6 pt-2 border-t border-dashed animate-in slide-in-from-top-2 duration-300">
|
||||
<div className="p-4 bg-muted/30 rounded-xl">
|
||||
<p className="text-xs font-bold uppercase tracking-widest text-muted-foreground mb-3">任务结果</p>
|
||||
<pre className="text-sm whitespace-pre-wrap font-mono">
|
||||
{JSON.stringify(task.result, null, 2)}
|
||||
</pre>
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* Error Details */}
|
||||
{task.status === 'failure' && task.error && (
|
||||
<div className="px-6 pb-6 pt-2 border-t border-dashed">
|
||||
<div className="p-4 bg-destructive/5 rounded-xl border border-destructive/20">
|
||||
<p className="text-xs font-bold uppercase tracking-widest text-destructive mb-3 flex items-center gap-2">
|
||||
<AlertCircle size={14} />
|
||||
错误详情
|
||||
</p>
|
||||
<p className="text-sm text-destructive">{task.error}</p>
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
</Card>
|
||||
))
|
||||
) : (
|
||||
<div className="py-20 flex flex-col items-center justify-center text-center space-y-4">
|
||||
<div className="w-24 h-24 rounded-full bg-muted flex items-center justify-center text-muted-foreground/30">
|
||||
<Clock size={48} />
|
||||
</div>
|
||||
<div className="space-y-1">
|
||||
<p className="text-xl font-bold">暂无任务记录</p>
|
||||
<p className="text-muted-foreground">上传文档或创建填表任务后,这里会显示处理记录</p>
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
};
|
||||
|
||||
export default TaskHistory;
|
||||
654
frontend/src/pages/TemplateFill.tsx
Normal file
654
frontend/src/pages/TemplateFill.tsx
Normal file
@@ -0,0 +1,654 @@
|
||||
import React, { useState, useEffect, useCallback, useRef } from 'react';
|
||||
import { useDropzone } from 'react-dropzone';
|
||||
import {
|
||||
TableProperties,
|
||||
Upload,
|
||||
FileSpreadsheet,
|
||||
FileText,
|
||||
CheckCircle2,
|
||||
Download,
|
||||
Clock,
|
||||
Sparkles,
|
||||
X,
|
||||
FilePlus,
|
||||
RefreshCcw,
|
||||
ChevronDown,
|
||||
ChevronUp,
|
||||
Loader2,
|
||||
Files,
|
||||
Trash2,
|
||||
Eye,
|
||||
File,
|
||||
Plus
|
||||
} from 'lucide-react';
|
||||
import { Button } from '@/components/ui/button';
|
||||
import { Card, CardContent, CardHeader, CardTitle, CardDescription } from '@/components/ui/card';
|
||||
import { Badge } from '@/components/ui/badge';
|
||||
import { Input } from '@/components/ui/input';
|
||||
import { Label } from '@/components/ui/label';
|
||||
import { backendApi } from '@/db/backend-api';
|
||||
import { format } from 'date-fns';
|
||||
import { toast } from 'sonner';
|
||||
import { cn } from '@/lib/utils';
|
||||
import { Skeleton } from '@/components/ui/skeleton';
|
||||
import {
|
||||
Dialog,
|
||||
DialogContent,
|
||||
DialogHeader,
|
||||
DialogTitle,
|
||||
} from "@/components/ui/dialog";
|
||||
import { ScrollArea } from '@/components/ui/scroll-area';
|
||||
import { useTemplateFill } from '@/context/TemplateFillContext';
|
||||
|
||||
/** Optional extra fields carried under `DocumentItem.metadata`. */
type DocumentMetadata = {
  row_count?: number;
  column_count?: number;
  columns?: string[];
};

/** One uploaded document as listed on the template-fill page. */
type DocumentItem = {
  doc_id: string;
  doc_type: string;
  filename: string;
  original_filename: string;
  file_size: number;
  created_at: string;
  metadata?: DocumentMetadata;
};
|
||||
|
||||
const TemplateFill: React.FC = () => {
|
||||
const {
|
||||
step, setStep,
|
||||
templateFile, setTemplateFile,
|
||||
templateFields, setTemplateFields,
|
||||
sourceFiles, setSourceFiles, addSourceFiles, removeSourceFile,
|
||||
sourceFilePaths, setSourceFilePaths,
|
||||
sourceDocIds, setSourceDocIds, addSourceDocId, removeSourceDocId,
|
||||
templateId, setTemplateId,
|
||||
filledResult, setFilledResult,
|
||||
reset
|
||||
} = useTemplateFill();
|
||||
|
||||
const [loading, setLoading] = useState(false);
|
||||
const [previewDoc, setPreviewDoc] = useState<{ name: string; content: string } | null>(null);
|
||||
const [previewOpen, setPreviewOpen] = useState(false);
|
||||
const [sourceMode, setSourceMode] = useState<'upload' | 'select'>('upload');
|
||||
const [uploadedDocuments, setUploadedDocuments] = useState<DocumentItem[]>([]);
|
||||
const [docsLoading, setDocsLoading] = useState(false);
|
||||
const sourceFileInputRef = useRef<HTMLInputElement>(null);
|
||||
|
||||
// 模板拖拽
|
||||
const onTemplateDrop = useCallback((acceptedFiles: File[]) => {
|
||||
const file = acceptedFiles[0];
|
||||
if (file) {
|
||||
setTemplateFile(file);
|
||||
}
|
||||
}, []);
|
||||
|
||||
const { getRootProps: getTemplateProps, getInputProps: getTemplateInputProps, isDragActive: isTemplateDragActive } = useDropzone({
|
||||
onDrop: onTemplateDrop,
|
||||
accept: {
|
||||
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': ['.xlsx'],
|
||||
'application/vnd.ms-excel': ['.xls'],
|
||||
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': ['.docx']
|
||||
},
|
||||
maxFiles: 1,
|
||||
multiple: false
|
||||
});
|
||||
|
||||
// 源文档拖拽
|
||||
const onSourceDrop = useCallback((e: React.DragEvent) => {
|
||||
e.preventDefault();
|
||||
const files = Array.from(e.dataTransfer.files).filter(f => {
|
||||
const ext = f.name.split('.').pop()?.toLowerCase();
|
||||
return ['xlsx', 'xls', 'docx', 'md', 'txt'].includes(ext || '');
|
||||
});
|
||||
if (files.length > 0) {
|
||||
addSourceFiles(files.map(f => ({ file: f })));
|
||||
}
|
||||
}, [addSourceFiles]);
|
||||
|
||||
// File-picker change handler: adds the chosen files to the source list and
// reports how many were added.
const handleSourceFileSelect = (e: React.ChangeEvent<HTMLInputElement>) => {
  const picked = e.target.files ? Array.from(e.target.files) : [];
  if (picked.length > 0) {
    const wrapped = picked.map((chosen) => ({ file: chosen }));
    addSourceFiles(wrapped);
    toast.success(`已添加 ${picked.length} 个文件`);
  }
  // Always clear the input afterwards (presumably so the same file can be
  // picked again and still trigger a change event).
  e.target.value = '';
};
|
||||
|
||||
// Only acknowledges the currently queued source documents; nothing is uploaded here.
const handleAddSourceFiles = () => {
  const queuedCount = sourceFiles.length;
  if (queuedCount > 0) {
    toast.success(`已添加 ${queuedCount} 个源文档,可继续添加更多`);
  } else {
    toast.error('请先选择源文档');
  }
};
|
||||
|
||||
// Loads previously uploaded documents (up to 100 requested) and keeps only the
// document types that can serve as a fill data source.
const loadUploadedDocuments = useCallback(async () => {
  const usableTypes = ['docx', 'md', 'txt', 'xlsx', 'xls'];
  setDocsLoading(true);
  try {
    const response = await backendApi.getDocuments(undefined, 100);
    // An unsuccessful response leaves the current list untouched.
    if (!response.success) return;
    const candidates = response.documents || [];
    setUploadedDocuments(
      candidates.filter((doc: DocumentItem) => usableTypes.includes(doc.doc_type))
    );
  } catch (err: any) {
    // Failures are only logged; no toast is shown for this background load.
    console.error('加载文档失败:', err);
  } finally {
    setDocsLoading(false);
  }
}, []);
|
||||
|
||||
// Deletes an uploaded document after user confirmation, then removes it from
// both the visible list and the current source selection.
const handleDeleteDocument = async (docId: string, e: React.MouseEvent) => {
  // Keep the click from reaching the surrounding row's own click handler.
  e.stopPropagation();
  const confirmed = confirm('确定要删除该文档吗?');
  if (!confirmed) {
    return;
  }
  try {
    const outcome = await backendApi.deleteDocument(docId);
    if (!outcome.success) {
      toast.error(outcome.message || '删除失败');
      return;
    }
    setUploadedDocuments((current) => current.filter((doc) => doc.doc_id !== docId));
    removeSourceDocId(docId);
    toast.success('文档已删除');
  } catch (err: any) {
    const reason = err.message || '未知错误';
    toast.error('删除失败: ' + reason);
  }
};
|
||||
|
||||
useEffect(() => {
|
||||
if (sourceMode === 'select') {
|
||||
loadUploadedDocuments();
|
||||
}
|
||||
}, [sourceMode, loadUploadedDocuments]);
|
||||
|
||||
// Uploads the template (plus source files in 'upload' mode) and immediately
// runs the fill step, moving the wizard upload -> filling -> preview.
//
// Two data-source modes:
//   - 'select': only the template is uploaded; already-stored documents are
//     referenced through `sourceDocIds`.
//   - 'upload': template and freshly picked source files are uploaded
//     together; the returned `source_file_paths` drive the fill.
//
// Failure handling: an unsuccessful upload result is turned into an error so
// the user gets feedback, and any failure resets the step to 'upload' so the
// UI is never left on the "filling" spinner.
const handleJointUploadAndFill = async () => {
  if (!templateFile) {
    toast.error('请先上传模板文件');
    return;
  }

  // Make sure a data source exists for the active mode.
  if (sourceMode === 'upload' && sourceFiles.length === 0) {
    toast.error('请上传源文档或从已上传文档中选择');
    return;
  }
  if (sourceMode === 'select' && sourceDocIds.length === 0) {
    toast.error('请选择源文档');
    return;
  }

  // Same instruction text is sent to the fill endpoint in both modes.
  const fillHint = '请从以下文档中提取相关信息填写表格';

  setLoading(true);

  try {
    if (sourceMode === 'select') {
      // Use already-uploaded documents as the data source.
      const result = await backendApi.uploadTemplate(templateFile);
      if (!result.success) {
        // Route the failure through the shared catch below.
        throw new Error('模板上传失败');
      }

      setTemplateFields(result.fields || []);
      setTemplateId(result.template_id || 'temp');
      toast.success('开始智能填表');
      setStep('filling');

      // Fill using document ids; no file paths in this mode.
      const fillResult = await backendApi.fillTemplate(
        result.template_id || 'temp',
        result.fields || [],
        sourceDocIds,
        [],
        fillHint
      );

      setFilledResult(fillResult);
      setStep('preview');
      toast.success('表格填写完成');
    } else {
      // Joint upload of the template and the source files.
      const result = await backendApi.uploadTemplateAndSources(
        templateFile,
        sourceFiles.map(sf => sf.file)
      );
      if (!result.success) {
        // Route the failure through the shared catch below.
        throw new Error('文档上传失败');
      }

      setTemplateFields(result.fields || []);
      setTemplateId(result.template_id);
      setSourceFilePaths(result.source_file_paths || []);
      toast.success('文档上传成功,开始智能填表');
      setStep('filling');

      // Fill using the uploaded file paths; no document ids in this mode.
      const fillResult = await backendApi.fillTemplate(
        result.template_id,
        result.fields || [],
        [],
        result.source_file_paths || [],
        fillHint
      );

      setFilledResult(fillResult);
      setStep('preview');
      toast.success('表格填写完成');
    }
  } catch (err: any) {
    // Return to the upload step so a failed fill does not leave the spinner up.
    setStep('upload');
    toast.error('处理失败: ' + (err.message || '未知错误'));
  } finally {
    setLoading(false);
  }
};
|
||||
|
||||
// Exports the filled data as an Excel workbook and triggers a browser download.
//
// The export format requested here is always 'xlsx', so the download name is
// built from the template's base name plus '.xlsx'. Reusing the template's
// own name would give a .docx/.xls template a file whose extension does not
// match its content.
const handleExport = async () => {
  if (!templateFile || !filledResult) return;

  try {
    const blob = await backendApi.exportFilledTemplate(
      templateId || 'temp',
      filledResult.filled_data || {},
      'xlsx'
    );

    // Drop the final extension of the template name, if it has one.
    const baseName = templateFile.name.replace(/\.[^./\\]+$/, '');

    const url = URL.createObjectURL(blob);
    const a = document.createElement('a');
    a.href = url;
    a.download = `filled_${baseName}.xlsx`;
    a.click();
    URL.revokeObjectURL(url);
    toast.success('导出成功');
  } catch (err: any) {
    toast.error('导出失败: ' + (err.message || '未知错误'));
  }
};
|
||||
|
||||
// Picks an icon for a file based on its (case-insensitive) extension:
// spreadsheets, Word documents, plain-text/markdown, and a generic fallback.
const getFileIcon = (filename: string) => {
  const extension = filename.split('.').pop()?.toLowerCase() ?? '';
  switch (extension) {
    case 'xlsx':
    case 'xls':
      return <FileSpreadsheet size={20} className="text-emerald-500" />;
    case 'docx':
      return <FileText size={20} className="text-blue-500" />;
    case 'md':
    case 'txt':
      return <FileText size={20} className="text-orange-500" />;
    default:
      return <File size={20} className="text-gray-500" />;
  }
};
|
||||
|
||||
return (
|
||||
<div className="space-y-8 pb-10">
|
||||
<section className="flex flex-col md:flex-row md:items-center justify-between gap-4">
|
||||
<div className="space-y-1">
|
||||
<h1 className="text-3xl font-extrabold tracking-tight">智能填表</h1>
|
||||
<p className="text-muted-foreground">
|
||||
根据您的表格模板,自动聚合多源文档信息进行精准填充
|
||||
</p>
|
||||
</div>
|
||||
{step !== 'upload' && (
|
||||
<Button variant="outline" className="rounded-xl gap-2" onClick={reset}>
|
||||
<RefreshCcw size={18} />
|
||||
<span>重新开始</span>
|
||||
</Button>
|
||||
)}
|
||||
</section>
|
||||
|
||||
{/* Step 1: Upload - Joint Upload of Template + Source Docs */}
|
||||
{step === 'upload' && (
|
||||
<div className="grid grid-cols-1 lg:grid-cols-2 gap-6">
|
||||
{/* Template Upload */}
|
||||
<Card className="border-none shadow-md">
|
||||
<CardHeader className="pb-4">
|
||||
<CardTitle className="text-lg flex items-center gap-2">
|
||||
<FileSpreadsheet className="text-primary" size={20} />
|
||||
表格模板
|
||||
</CardTitle>
|
||||
<CardDescription>
|
||||
上传需要填写的 Excel/Word 模板文件
|
||||
</CardDescription>
|
||||
</CardHeader>
|
||||
<CardContent>
|
||||
{!templateFile ? (
|
||||
<div
|
||||
{...getTemplateProps()}
|
||||
className={cn(
|
||||
"border-2 border-dashed rounded-2xl p-8 transition-all duration-300 flex flex-col items-center justify-center text-center cursor-pointer group min-h-[200px]",
|
||||
isTemplateDragActive ? "border-primary bg-primary/5" : "border-muted-foreground/20 hover:border-primary/50 hover:bg-primary/5"
|
||||
)}
|
||||
>
|
||||
<input {...getTemplateInputProps()} />
|
||||
<div className="w-14 h-14 rounded-xl bg-primary/10 text-primary flex items-center justify-center mb-4 group-hover:scale-110 transition-transform">
|
||||
{loading ? <Loader2 className="animate-spin" size={28} /> : <Upload size={28} />}
|
||||
</div>
|
||||
<p className="font-medium">
|
||||
{isTemplateDragActive ? '释放以上传' : '点击或拖拽上传模板'}
|
||||
</p>
|
||||
<p className="text-xs text-muted-foreground mt-1">
|
||||
支持 .xlsx .xls .docx
|
||||
</p>
|
||||
</div>
|
||||
) : (
|
||||
<div className="flex items-center gap-3 p-4 bg-emerald-500/5 rounded-xl border border-emerald-200">
|
||||
<div className="w-10 h-10 rounded-lg bg-emerald-500/10 text-emerald-500 flex items-center justify-center">
|
||||
<FileSpreadsheet size={20} />
|
||||
</div>
|
||||
<div className="flex-1 min-w-0">
|
||||
<p className="font-medium truncate">{templateFile.name}</p>
|
||||
<p className="text-xs text-muted-foreground">
|
||||
{(templateFile.size / 1024).toFixed(1)} KB
|
||||
</p>
|
||||
</div>
|
||||
<Button variant="ghost" size="sm" onClick={() => setTemplateFile(null)}>
|
||||
<X size={16} />
|
||||
</Button>
|
||||
</div>
|
||||
)}
|
||||
</CardContent>
|
||||
</Card>
|
||||
|
||||
{/* Source Documents Upload */}
|
||||
<Card className="border-none shadow-md">
|
||||
<CardHeader className="pb-4">
|
||||
<CardTitle className="text-lg flex items-center gap-2">
|
||||
<Files className="text-primary" size={20} />
|
||||
源文档
|
||||
</CardTitle>
|
||||
<CardDescription>
|
||||
选择包含数据的源文档作为填表依据
|
||||
</CardDescription>
|
||||
{/* Source Mode Tabs */}
|
||||
<div className="flex gap-2 mt-2">
|
||||
<Button
|
||||
variant={sourceMode === 'upload' ? 'default' : 'outline'}
|
||||
size="sm"
|
||||
onClick={() => setSourceMode('upload')}
|
||||
>
|
||||
<Upload size={14} className="mr-1" />
|
||||
上传文件
|
||||
</Button>
|
||||
<Button
|
||||
variant={sourceMode === 'select' ? 'default' : 'outline'}
|
||||
size="sm"
|
||||
onClick={() => setSourceMode('select')}
|
||||
>
|
||||
<Files size={14} className="mr-1" />
|
||||
从文档中心选择
|
||||
</Button>
|
||||
</div>
|
||||
</CardHeader>
|
||||
<CardContent>
|
||||
{sourceMode === 'upload' ? (
|
||||
<>
|
||||
<div className="border-2 border-dashed rounded-2xl p-8 transition-all duration-300 flex flex-col items-center justify-center text-center cursor-pointer group min-h-[200px] border-muted-foreground/20 hover:border-primary/50 hover:bg-primary/5">
|
||||
<input
|
||||
id="source-file-input"
|
||||
type="file"
|
||||
multiple={true}
|
||||
accept=".xlsx,.xls,.docx,.md,.txt"
|
||||
onChange={handleSourceFileSelect}
|
||||
className="hidden"
|
||||
/>
|
||||
<label htmlFor="source-file-input" className="cursor-pointer flex flex-col items-center">
|
||||
<div className="w-14 h-14 rounded-xl bg-blue-500/10 text-blue-500 flex items-center justify-center mb-4 group-hover:scale-110 transition-transform">
|
||||
{loading ? <Loader2 className="animate-spin" size={28} /> : <Upload size={28} />}
|
||||
</div>
|
||||
<p className="font-medium">
|
||||
点击上传源文档
|
||||
</p>
|
||||
<p className="text-xs text-muted-foreground mt-1">
|
||||
支持 .xlsx .xls .docx .md .txt
|
||||
</p>
|
||||
</label>
|
||||
</div>
|
||||
<div
|
||||
onDragOver={(e) => { e.preventDefault(); }}
|
||||
onDrop={onSourceDrop}
|
||||
className="mt-2 text-center text-xs text-muted-foreground"
|
||||
>
|
||||
或拖拽文件到此处
|
||||
</div>
|
||||
|
||||
{/* Selected Source Files */}
|
||||
{sourceFiles.length > 0 && (
|
||||
<div className="mt-4 space-y-2">
|
||||
{sourceFiles.map((sf, idx) => (
|
||||
<div key={idx} className="flex items-center gap-3 p-3 bg-muted/50 rounded-xl">
|
||||
{getFileIcon(sf.file.name)}
|
||||
<div className="flex-1 min-w-0">
|
||||
<p className="text-sm font-medium truncate">{sf.file.name}</p>
|
||||
<p className="text-xs text-muted-foreground">
|
||||
{(sf.file.size / 1024).toFixed(1)} KB
|
||||
</p>
|
||||
</div>
|
||||
<Button variant="ghost" size="sm" onClick={() => removeSourceFile(idx)}>
|
||||
<Trash2 size={14} className="text-red-500" />
|
||||
</Button>
|
||||
</div>
|
||||
))}
|
||||
<div className="flex justify-center pt-2">
|
||||
<Button variant="outline" size="sm" onClick={() => document.getElementById('source-file-input')?.click()}>
|
||||
<Plus size={14} className="mr-1" />
|
||||
继续添加更多文档
|
||||
</Button>
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
</>
|
||||
) : (
|
||||
<>
|
||||
{/* Uploaded Documents Selection */}
|
||||
{docsLoading ? (
|
||||
<div className="space-y-2">
|
||||
{[1, 2, 3].map(i => (
|
||||
<Skeleton key={i} className="h-16 w-full rounded-xl" />
|
||||
))}
|
||||
</div>
|
||||
) : uploadedDocuments.length > 0 ? (
|
||||
<div className="space-y-2">
|
||||
{sourceDocIds.length > 0 && (
|
||||
<div className="flex items-center justify-between p-3 bg-primary/5 rounded-xl border border-primary/20">
|
||||
<span className="text-sm font-medium">已选择 {sourceDocIds.length} 个文档</span>
|
||||
<Button variant="ghost" size="sm" onClick={() => loadUploadedDocuments()}>
|
||||
<RefreshCcw size={14} className="mr-1" />
|
||||
刷新列表
|
||||
</Button>
|
||||
</div>
|
||||
)}
|
||||
<div className="max-h-[300px] overflow-y-auto space-y-2">
|
||||
{uploadedDocuments.map((doc) => (
|
||||
<div
|
||||
key={doc.doc_id}
|
||||
className={cn(
|
||||
"flex items-center gap-3 p-3 rounded-xl border-2 transition-all cursor-pointer",
|
||||
sourceDocIds.includes(doc.doc_id)
|
||||
? "border-primary bg-primary/5"
|
||||
: "border-border hover:bg-muted/30"
|
||||
)}
|
||||
onClick={() => {
|
||||
if (sourceDocIds.includes(doc.doc_id)) {
|
||||
removeSourceDocId(doc.doc_id);
|
||||
} else {
|
||||
addSourceDocId(doc.doc_id);
|
||||
}
|
||||
}}
|
||||
>
|
||||
<div className={cn(
|
||||
"w-6 h-6 rounded-md border-2 flex items-center justify-center transition-all shrink-0",
|
||||
sourceDocIds.includes(doc.doc_id)
|
||||
? "border-primary bg-primary text-white"
|
||||
: "border-muted-foreground/30"
|
||||
)}>
|
||||
{sourceDocIds.includes(doc.doc_id) && <CheckCircle2 size={14} />}
|
||||
</div>
|
||||
{getFileIcon(doc.original_filename)}
|
||||
<div className="flex-1 min-w-0">
|
||||
<p className="text-sm font-medium truncate">{doc.original_filename}</p>
|
||||
<p className="text-xs text-muted-foreground">
|
||||
{doc.doc_type.toUpperCase()} • {format(new Date(doc.created_at), 'yyyy-MM-dd')}
|
||||
</p>
|
||||
</div>
|
||||
<Button
|
||||
variant="ghost"
|
||||
size="sm"
|
||||
onClick={(e) => handleDeleteDocument(doc.doc_id, e)}
|
||||
className="shrink-0"
|
||||
>
|
||||
<Trash2 size={14} className="text-red-500" />
|
||||
</Button>
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
</div>
|
||||
) : (
|
||||
<div className="text-center py-8 text-muted-foreground">
|
||||
<Files size={32} className="mx-auto mb-2 opacity-30" />
|
||||
<p className="text-sm">暂无可用的已上传文档</p>
|
||||
</div>
|
||||
)}
|
||||
</>
|
||||
)}
|
||||
</CardContent>
|
||||
</Card>
|
||||
|
||||
{/* Action Button */}
|
||||
<div className="col-span-1 lg:col-span-2 flex justify-center">
|
||||
<Button
|
||||
size="lg"
|
||||
className="rounded-xl px-12 shadow-lg shadow-primary/20 gap-2"
|
||||
disabled={!templateFile || loading}
|
||||
onClick={handleJointUploadAndFill}
|
||||
>
|
||||
{loading ? (
|
||||
<>
|
||||
<Loader2 className="animate-spin" size={20} />
|
||||
<span>正在处理...</span>
|
||||
</>
|
||||
) : (
|
||||
<>
|
||||
<Sparkles size={20} />
|
||||
<span>上传并智能填表</span>
|
||||
</>
|
||||
)}
|
||||
</Button>
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* Step 2: Filling State */}
|
||||
{step === 'filling' && (
|
||||
<Card className="border-none shadow-md">
|
||||
<CardContent className="py-16 flex flex-col items-center justify-center">
|
||||
<div className="w-16 h-16 rounded-full bg-primary/10 flex items-center justify-center mb-6">
|
||||
<Loader2 className="animate-spin text-primary" size={32} />
|
||||
</div>
|
||||
<h3 className="text-xl font-bold mb-2">AI 正在智能分析并填表</h3>
|
||||
<p className="text-muted-foreground text-center max-w-md">
|
||||
系统正在从 {sourceFiles.length || sourceFilePaths.length} 份文档中检索相关信息...
|
||||
</p>
|
||||
</CardContent>
|
||||
</Card>
|
||||
)}
|
||||
|
||||
{/* Step 3: Preview Results */}
|
||||
{step === 'preview' && filledResult && (
|
||||
<div className="space-y-6">
|
||||
<Card className="border-none shadow-md">
|
||||
<CardHeader>
|
||||
<CardTitle className="text-lg flex items-center gap-2">
|
||||
<CheckCircle2 className="text-emerald-500" size={20} />
|
||||
填表完成
|
||||
</CardTitle>
|
||||
<CardDescription>
|
||||
系统已根据 {sourceFiles.length || sourceFilePaths.length} 份文档自动完成表格填写
|
||||
</CardDescription>
|
||||
</CardHeader>
|
||||
<CardContent>
|
||||
{/* Filled Data Preview */}
|
||||
<div className="p-6 bg-muted/30 rounded-2xl">
|
||||
<div className="space-y-4">
|
||||
{templateFields.map((field, idx) => {
|
||||
const value = filledResult.filled_data?.[field.name];
|
||||
const displayValue = Array.isArray(value)
|
||||
? value.filter(v => v && String(v).trim()).join(', ') || '-'
|
||||
: value || '-';
|
||||
return (
|
||||
<div key={idx} className="flex items-center gap-4">
|
||||
<div className="w-40 text-sm font-medium text-muted-foreground">{field.name}</div>
|
||||
<div className="flex-1 p-3 bg-background rounded-xl border">
|
||||
{displayValue}
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
})}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{/* Source Files Info */}
|
||||
<div className="mt-4 flex flex-wrap gap-2">
|
||||
{sourceFiles.map((sf, idx) => (
|
||||
<Badge key={idx} variant="outline" className="bg-blue-500/5">
|
||||
{getFileIcon(sf.file.name)}
|
||||
<span className="ml-1">{sf.file.name}</span>
|
||||
</Badge>
|
||||
))}
|
||||
</div>
|
||||
|
||||
{/* Action Buttons */}
|
||||
<div className="flex justify-center gap-4 mt-6">
|
||||
<Button variant="outline" className="rounded-xl gap-2" onClick={reset}>
|
||||
<RefreshCcw size={18} />
|
||||
<span>继续填表</span>
|
||||
</Button>
|
||||
<Button className="rounded-xl gap-2 shadow-lg shadow-primary/20" onClick={handleExport}>
|
||||
<Download size={18} />
|
||||
<span>导出结果</span>
|
||||
</Button>
|
||||
</div>
|
||||
</CardContent>
|
||||
</Card>
|
||||
|
||||
{/* Fill Details */}
|
||||
{filledResult.fill_details && filledResult.fill_details.length > 0 && (
|
||||
<Card className="border-none shadow-md">
|
||||
<CardHeader>
|
||||
<CardTitle className="text-lg">填写详情</CardTitle>
|
||||
</CardHeader>
|
||||
<CardContent>
|
||||
<div className="space-y-3">
|
||||
{filledResult.fill_details.map((detail: any, idx: number) => (
|
||||
<div key={idx} className="flex items-start gap-3 p-3 bg-muted/30 rounded-xl text-sm">
|
||||
<div className="w-1 h-1 rounded-full bg-primary mt-2" />
|
||||
<div className="flex-1">
|
||||
<div className="font-medium">{detail.field}</div>
|
||||
<div className="text-muted-foreground text-xs mt-1">
|
||||
来源: {detail.source} | 置信度: {detail.confidence ? (detail.confidence * 100).toFixed(0) + '%' : 'N/A'}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
</CardContent>
|
||||
</Card>
|
||||
)}
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* Preview Dialog */}
|
||||
<Dialog open={previewOpen} onOpenChange={setPreviewOpen}>
|
||||
<DialogContent className="max-w-2xl">
|
||||
<DialogHeader>
|
||||
<DialogTitle>{previewDoc?.name || '文档预览'}</DialogTitle>
|
||||
</DialogHeader>
|
||||
<ScrollArea className="max-h-[60vh]">
|
||||
<pre className="text-sm whitespace-pre-wrap">{previewDoc?.content}</pre>
|
||||
</ScrollArea>
|
||||
</DialogContent>
|
||||
</Dialog>
|
||||
</div>
|
||||
);
|
||||
};
|
||||
|
||||
export default TemplateFill;
|
||||
@@ -1,25 +1,15 @@
|
||||
import { createBrowserRouter, Navigate } from 'react-router-dom';
|
||||
import Login from '@/pages/Login';
|
||||
import Dashboard from '@/pages/Dashboard';
|
||||
import Documents from '@/pages/Documents';
|
||||
import FormFill from '@/pages/FormFill';
|
||||
import Assistant from '@/pages/Assistant';
|
||||
import ExcelParse from '@/pages/ExcelParse';
|
||||
import TemplateFill from '@/pages/TemplateFill';
|
||||
import InstructionChat from '@/pages/InstructionChat';
|
||||
import TaskHistory from '@/pages/TaskHistory';
|
||||
import MainLayout from '@/components/layouts/MainLayout';
|
||||
import { RouteGuard } from '@/components/common/RouteGuard';
|
||||
|
||||
export const routes = [
|
||||
{
|
||||
path: '/login',
|
||||
element: <Login />,
|
||||
},
|
||||
{
|
||||
path: '/',
|
||||
element: (
|
||||
<RouteGuard>
|
||||
<MainLayout />
|
||||
</RouteGuard>
|
||||
),
|
||||
element: <MainLayout />,
|
||||
children: [
|
||||
{
|
||||
path: '/',
|
||||
@@ -31,15 +21,15 @@ export const routes = [
|
||||
},
|
||||
{
|
||||
path: '/form-fill',
|
||||
element: <FormFill />,
|
||||
element: <TemplateFill />,
|
||||
},
|
||||
{
|
||||
path: '/assistant',
|
||||
element: <Assistant />,
|
||||
element: <InstructionChat />,
|
||||
},
|
||||
{
|
||||
path: '/excel-parse',
|
||||
element: <ExcelParse />,
|
||||
path: '/task-history',
|
||||
element: <TaskHistory />,
|
||||
},
|
||||
],
|
||||
},
|
||||
|
||||
854
logs/docx_parser_and_template_fill.patch
Normal file
854
logs/docx_parser_and_template_fill.patch
Normal file
@@ -0,0 +1,854 @@
|
||||
diff --git a/backend/app/api/endpoints/templates.py b/backend/app/api/endpoints/templates.py
|
||||
index 572d56e..706f281 100644
|
||||
--- a/backend/app/api/endpoints/templates.py
|
||||
+++ b/backend/app/api/endpoints/templates.py
|
||||
@@ -13,7 +13,7 @@ import pandas as pd
|
||||
from pydantic import BaseModel
|
||||
|
||||
from app.services.template_fill_service import template_fill_service, TemplateField
|
||||
-from app.services.excel_storage_service import excel_storage_service
|
||||
+from app.services.file_service import file_service
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -28,13 +28,15 @@ class TemplateFieldRequest(BaseModel):
|
||||
name: str
|
||||
field_type: str = "text"
|
||||
required: bool = True
|
||||
+ hint: str = ""
|
||||
|
||||
|
||||
class FillRequest(BaseModel):
|
||||
"""填写请求"""
|
||||
template_id: str
|
||||
template_fields: List[TemplateFieldRequest]
|
||||
- source_doc_ids: Optional[List[str]] = None
|
||||
+ source_doc_ids: Optional[List[str]] = None # MongoDB 文档 ID 列表
|
||||
+ source_file_paths: Optional[List[str]] = None # 源文档文件路径列表
|
||||
user_hint: Optional[str] = None
|
||||
|
||||
|
||||
@@ -71,7 +73,6 @@ async def upload_template(
|
||||
|
||||
try:
|
||||
# 保存文件
|
||||
- from app.services.file_service import file_service
|
||||
content = await file.read()
|
||||
saved_path = file_service.save_uploaded_file(
|
||||
content,
|
||||
@@ -87,7 +88,7 @@ async def upload_template(
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
- "template_id": saved_path, # 使用文件路径作为ID
|
||||
+ "template_id": saved_path,
|
||||
"filename": file.filename,
|
||||
"file_type": file_ext,
|
||||
"fields": [
|
||||
@@ -95,7 +96,8 @@ async def upload_template(
|
||||
"cell": f.cell,
|
||||
"name": f.name,
|
||||
"field_type": f.field_type,
|
||||
- "required": f.required
|
||||
+ "required": f.required,
|
||||
+ "hint": f.hint
|
||||
}
|
||||
for f in template_fields
|
||||
],
|
||||
@@ -135,7 +137,8 @@ async def extract_template_fields(
|
||||
"cell": f.cell,
|
||||
"name": f.name,
|
||||
"field_type": f.field_type,
|
||||
- "required": f.required
|
||||
+ "required": f.required,
|
||||
+ "hint": f.hint
|
||||
}
|
||||
for f in fields
|
||||
]
|
||||
@@ -153,7 +156,7 @@ async def fill_template(
|
||||
"""
|
||||
执行表格填写
|
||||
|
||||
- 根据提供的字段定义,从已上传的文档中检索信息并填写
|
||||
+ 根据提供的字段定义,从源文档中检索信息并填写
|
||||
|
||||
Args:
|
||||
request: 填写请求
|
||||
@@ -168,7 +171,8 @@ async def fill_template(
|
||||
cell=f.cell,
|
||||
name=f.name,
|
||||
field_type=f.field_type,
|
||||
- required=f.required
|
||||
+ required=f.required,
|
||||
+ hint=f.hint
|
||||
)
|
||||
for f in request.template_fields
|
||||
]
|
||||
@@ -177,6 +181,7 @@ async def fill_template(
|
||||
result = await template_fill_service.fill_template(
|
||||
template_fields=fields,
|
||||
source_doc_ids=request.source_doc_ids,
|
||||
+ source_file_paths=request.source_file_paths,
|
||||
user_hint=request.user_hint
|
||||
)
|
||||
|
||||
@@ -194,6 +199,8 @@ async def export_filled_template(
|
||||
"""
|
||||
导出填写后的表格
|
||||
|
||||
+ 支持 Excel (.xlsx) 和 Word (.docx) 格式
|
||||
+
|
||||
Args:
|
||||
request: 导出请求
|
||||
|
||||
@@ -201,25 +208,124 @@ async def export_filled_template(
|
||||
文件流
|
||||
"""
|
||||
try:
|
||||
- # 创建 DataFrame
|
||||
- df = pd.DataFrame([request.filled_data])
|
||||
+ if request.format == "xlsx":
|
||||
+ return await _export_to_excel(request.filled_data, request.template_id)
|
||||
+ elif request.format == "docx":
|
||||
+ return await _export_to_word(request.filled_data, request.template_id)
|
||||
+ else:
|
||||
+ raise HTTPException(
|
||||
+ status_code=400,
|
||||
+ detail=f"不支持的导出格式: {request.format},仅支持 xlsx/docx"
|
||||
+ )
|
||||
|
||||
- # 导出为 Excel
|
||||
- output = io.BytesIO()
|
||||
- with pd.ExcelWriter(output, engine='openpyxl') as writer:
|
||||
- df.to_excel(writer, index=False, sheet_name='填写结果')
|
||||
+ except HTTPException:
|
||||
+ raise
|
||||
+ except Exception as e:
|
||||
+ logger.error(f"导出失败: {str(e)}")
|
||||
+ raise HTTPException(status_code=500, detail=f"导出失败: {str(e)}")
|
||||
|
||||
- output.seek(0)
|
||||
|
||||
- # 生成文件名
|
||||
- filename = f"filled_template.{request.format}"
|
||||
+async def _export_to_excel(filled_data: dict, template_id: str) -> StreamingResponse:
|
||||
+ """导出为 Excel 格式"""
|
||||
+ # 将字典转换为单行 DataFrame
|
||||
+ df = pd.DataFrame([filled_data])
|
||||
|
||||
- return StreamingResponse(
|
||||
- io.BytesIO(output.getvalue()),
|
||||
- media_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||
- headers={"Content-Disposition": f"attachment; filename={filename}"}
|
||||
- )
|
||||
+ output = io.BytesIO()
|
||||
+ with pd.ExcelWriter(output, engine='openpyxl') as writer:
|
||||
+ df.to_excel(writer, index=False, sheet_name='填写结果')
|
||||
|
||||
- except Exception as e:
|
||||
- logger.error(f"导出失败: {str(e)}")
|
||||
- raise HTTPException(status_code=500, detail=f"导出失败: {str(e)}")
|
||||
+ output.seek(0)
|
||||
+
|
||||
+ filename = f"filled_template.xlsx"
|
||||
+
|
||||
+ return StreamingResponse(
|
||||
+ io.BytesIO(output.getvalue()),
|
||||
+ media_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||
+ headers={"Content-Disposition": f"attachment; filename={filename}"}
|
||||
+ )
|
||||
+
|
||||
+
|
||||
+async def _export_to_word(filled_data: dict, template_id: str) -> StreamingResponse:
|
||||
+ """导出为 Word 格式"""
|
||||
+ from docx import Document
|
||||
+ from docx.shared import Pt, RGBColor
|
||||
+ from docx.enum.text import WD_ALIGN_PARAGRAPH
|
||||
+
|
||||
+ doc = Document()
|
||||
+
|
||||
+ # 添加标题
|
||||
+ title = doc.add_heading('填写结果', level=1)
|
||||
+ title.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||||
+
|
||||
+ # 添加填写时间和模板信息
|
||||
+ from datetime import datetime
|
||||
+ info_para = doc.add_paragraph()
|
||||
+ info_para.add_run(f"模板ID: {template_id}\n").bold = True
|
||||
+ info_para.add_run(f"导出时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
|
||||
+
|
||||
+ doc.add_paragraph() # 空行
|
||||
+
|
||||
+ # 添加字段表格
|
||||
+ table = doc.add_table(rows=1, cols=3)
|
||||
+ table.style = 'Light Grid Accent 1'
|
||||
+
|
||||
+ # 表头
|
||||
+ header_cells = table.rows[0].cells
|
||||
+ header_cells[0].text = '字段名'
|
||||
+ header_cells[1].text = '填写值'
|
||||
+ header_cells[2].text = '状态'
|
||||
+
|
||||
+ for field_name, field_value in filled_data.items():
|
||||
+ row_cells = table.add_row().cells
|
||||
+ row_cells[0].text = field_name
|
||||
+ row_cells[1].text = str(field_value) if field_value else ''
|
||||
+ row_cells[2].text = '已填写' if field_value else '为空'
|
||||
+
|
||||
+ # 保存到 BytesIO
|
||||
+ output = io.BytesIO()
|
||||
+ doc.save(output)
|
||||
+ output.seek(0)
|
||||
+
|
||||
+ filename = f"filled_template.docx"
|
||||
+
|
||||
+ return StreamingResponse(
|
||||
+ io.BytesIO(output.getvalue()),
|
||||
+ media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
+ headers={"Content-Disposition": f"attachment; filename={filename}"}
|
||||
+ )
|
||||
+
|
||||
+
|
||||
+@router.post("/export/excel")
|
||||
+async def export_to_excel(
|
||||
+ filled_data: dict,
|
||||
+ template_id: str = Query(..., description="模板ID")
|
||||
+):
|
||||
+ """
|
||||
+ 专门导出为 Excel 格式
|
||||
+
|
||||
+ Args:
|
||||
+ filled_data: 填写数据
|
||||
+ template_id: 模板ID
|
||||
+
|
||||
+ Returns:
|
||||
+ Excel 文件流
|
||||
+ """
|
||||
+ return await _export_to_excel(filled_data, template_id)
|
||||
+
|
||||
+
|
||||
+@router.post("/export/word")
|
||||
+async def export_to_word(
|
||||
+ filled_data: dict,
|
||||
+ template_id: str = Query(..., description="模板ID")
|
||||
+):
|
||||
+ """
|
||||
+ 专门导出为 Word 格式
|
||||
+
|
||||
+ Args:
|
||||
+ filled_data: 填写数据
|
||||
+ template_id: 模板ID
|
||||
+
|
||||
+ Returns:
|
||||
+ Word 文件流
|
||||
+ """
|
||||
+ return await _export_to_word(filled_data, template_id)
|
||||
diff --git a/backend/app/core/document_parser/docx_parser.py b/backend/app/core/document_parser/docx_parser.py
|
||||
index 75e79da..03c341d 100644
|
||||
--- a/backend/app/core/document_parser/docx_parser.py
|
||||
+++ b/backend/app/core/document_parser/docx_parser.py
|
||||
@@ -161,3 +161,133 @@ class DocxParser(BaseParser):
|
||||
fields[field_name] = match.group(1)
|
||||
|
||||
return fields
|
||||
+
|
||||
+ def parse_tables_for_template(
|
||||
+ self,
|
||||
+ file_path: str
|
||||
+ ) -> Dict[str, Any]:
|
||||
+ """
|
||||
+ 解析 Word 文档中的表格,提取模板字段
|
||||
+
|
||||
+ 专门用于比赛场景:解析表格模板,识别需要填写的字段
|
||||
+
|
||||
+ Args:
|
||||
+ file_path: Word 文件路径
|
||||
+
|
||||
+ Returns:
|
||||
+ 包含表格字段信息的字典
|
||||
+ """
|
||||
+ from docx import Document
|
||||
+ from docx.table import Table
|
||||
+ from docx.oxml.ns import qn
|
||||
+
|
||||
+ doc = Document(file_path)
|
||||
+
|
||||
+ template_info = {
|
||||
+ "tables": [],
|
||||
+ "fields": [],
|
||||
+ "field_count": 0
|
||||
+ }
|
||||
+
|
||||
+ for table_idx, table in enumerate(doc.tables):
|
||||
+ table_info = {
|
||||
+ "table_index": table_idx,
|
||||
+ "rows": [],
|
||||
+ "headers": [],
|
||||
+ "data_rows": [],
|
||||
+ "field_hints": {} # 字段名称 -> 提示词/描述
|
||||
+ }
|
||||
+
|
||||
+ # 提取表头(第一行)
|
||||
+ if table.rows:
|
||||
+ header_cells = [cell.text.strip() for cell in table.rows[0].cells]
|
||||
+ table_info["headers"] = header_cells
|
||||
+
|
||||
+ # 提取数据行
|
||||
+ for row_idx, row in enumerate(table.rows[1:], 1):
|
||||
+ row_data = [cell.text.strip() for cell in row.cells]
|
||||
+ table_info["data_rows"].append(row_data)
|
||||
+ table_info["rows"].append({
|
||||
+ "row_index": row_idx,
|
||||
+ "cells": row_data
|
||||
+ })
|
||||
+
|
||||
+ # 尝试从第二列/第三列提取提示词
|
||||
+ # 比赛模板通常格式为:字段名 | 提示词 | 填写值
|
||||
+ if len(table.rows[0].cells) >= 2:
|
||||
+ for row_idx, row in enumerate(table.rows[1:], 1):
|
||||
+ cells = [cell.text.strip() for cell in row.cells]
|
||||
+ if len(cells) >= 2 and cells[0]:
|
||||
+ # 第一列是字段名
|
||||
+ field_name = cells[0]
|
||||
+ # 第二列可能是提示词或描述
|
||||
+ hint = cells[1] if len(cells) > 1 else ""
|
||||
+ table_info["field_hints"][field_name] = hint
|
||||
+
|
||||
+ template_info["fields"].append({
|
||||
+ "table_index": table_idx,
|
||||
+ "row_index": row_idx,
|
||||
+ "field_name": field_name,
|
||||
+ "hint": hint,
|
||||
+ "expected_value": cells[2] if len(cells) > 2 else ""
|
||||
+ })
|
||||
+
|
||||
+ template_info["tables"].append(table_info)
|
||||
+
|
||||
+ template_info["field_count"] = len(template_info["fields"])
|
||||
+ return template_info
|
||||
+
|
||||
+ def extract_template_fields_from_docx(
|
||||
+ self,
|
||||
+ file_path: str
|
||||
+ ) -> List[Dict[str, Any]]:
|
||||
+ """
|
||||
+ 从 Word 文档中提取模板字段定义
|
||||
+
|
||||
+ 适用于比赛评分表格:表格第一列是字段名,第二列是提示词/填写示例
|
||||
+
|
||||
+ Args:
|
||||
+ file_path: Word 文件路径
|
||||
+
|
||||
+ Returns:
|
||||
+ 字段定义列表
|
||||
+ """
|
||||
+ template_info = self.parse_tables_for_template(file_path)
|
||||
+
|
||||
+ fields = []
|
||||
+ for field in template_info["fields"]:
|
||||
+ fields.append({
|
||||
+ "cell": f"T{field['table_index']}R{field['row_index']}", # TableXRowY 格式
|
||||
+ "name": field["field_name"],
|
||||
+ "hint": field["hint"],
|
||||
+ "table_index": field["table_index"],
|
||||
+ "row_index": field["row_index"],
|
||||
+ "field_type": self._infer_field_type_from_hint(field["hint"]),
|
||||
+ "required": True
|
||||
+ })
|
||||
+
|
||||
+ return fields
|
||||
+
|
||||
+ def _infer_field_type_from_hint(self, hint: str) -> str:
|
||||
+ """
|
||||
+ 从提示词推断字段类型
|
||||
+
|
||||
+ Args:
|
||||
+ hint: 字段提示词
|
||||
+
|
||||
+ Returns:
|
||||
+ 字段类型 (text/number/date)
|
||||
+ """
|
||||
+ hint_lower = hint.lower()
|
||||
+
|
||||
+ # 日期关键词
|
||||
+ date_keywords = ["年", "月", "日", "日期", "时间", "出生"]
|
||||
+ if any(kw in hint for kw in date_keywords):
|
||||
+ return "date"
|
||||
+
|
||||
+ # 数字关键词
|
||||
+ number_keywords = ["数量", "金额", "人数", "面积", "增长", "比率", "%", "率"]
|
||||
+ if any(kw in hint_lower for kw in number_keywords):
|
||||
+ return "number"
|
||||
+
|
||||
+ return "text"
|
||||
diff --git a/backend/app/services/template_fill_service.py b/backend/app/services/template_fill_service.py
|
||||
index 2612354..94930fb 100644
|
||||
--- a/backend/app/services/template_fill_service.py
|
||||
+++ b/backend/app/services/template_fill_service.py
|
||||
@@ -4,13 +4,12 @@
|
||||
从非结构化文档中检索信息并填写到表格模板
|
||||
"""
|
||||
import logging
|
||||
-from dataclasses import dataclass
|
||||
+from dataclasses import dataclass, field
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from app.core.database import mongodb
|
||||
-from app.services.rag_service import rag_service
|
||||
from app.services.llm_service import llm_service
|
||||
-from app.services.excel_storage_service import excel_storage_service
|
||||
+from app.core.document_parser import ParserFactory
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -22,6 +21,17 @@ class TemplateField:
|
||||
name: str # 字段名称
|
||||
field_type: str = "text" # 字段类型: text/number/date
|
||||
required: bool = True
|
||||
+ hint: str = "" # 字段提示词
|
||||
+
|
||||
+
|
||||
+@dataclass
|
||||
+class SourceDocument:
|
||||
+ """源文档"""
|
||||
+ doc_id: str
|
||||
+ filename: str
|
||||
+ doc_type: str
|
||||
+ content: str = ""
|
||||
+ structured_data: Dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -38,12 +48,12 @@ class TemplateFillService:
|
||||
|
||||
def __init__(self):
|
||||
self.llm = llm_service
|
||||
- self.rag = rag_service
|
||||
|
||||
async def fill_template(
|
||||
self,
|
||||
template_fields: List[TemplateField],
|
||||
source_doc_ids: Optional[List[str]] = None,
|
||||
+ source_file_paths: Optional[List[str]] = None,
|
||||
user_hint: Optional[str] = None
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
@@ -51,7 +61,8 @@ class TemplateFillService:
|
||||
|
||||
Args:
|
||||
template_fields: 模板字段列表
|
||||
- source_doc_ids: 源文档ID列表,不指定则从所有文档检索
|
||||
+ source_doc_ids: 源文档 MongoDB ID 列表
|
||||
+ source_file_paths: 源文档文件路径列表
|
||||
user_hint: 用户提示(如"请从合同文档中提取")
|
||||
|
||||
Returns:
|
||||
@@ -60,28 +71,23 @@ class TemplateFillService:
|
||||
filled_data = {}
|
||||
fill_details = []
|
||||
|
||||
+ # 1. 加载源文档内容
|
||||
+ source_docs = await self._load_source_documents(source_doc_ids, source_file_paths)
|
||||
+
|
||||
+ if not source_docs:
|
||||
+ logger.warning("没有找到源文档,填表结果将全部为空")
|
||||
+
|
||||
+ # 2. 对每个字段进行提取
|
||||
for field in template_fields:
|
||||
try:
|
||||
- # 1. 从 RAG 检索相关上下文
|
||||
- rag_results = await self._retrieve_context(field.name, user_hint)
|
||||
-
|
||||
- if not rag_results:
|
||||
- # 如果没有检索到结果,尝试直接询问 LLM
|
||||
- result = FillResult(
|
||||
- field=field.name,
|
||||
- value="",
|
||||
- source="未找到相关数据",
|
||||
- confidence=0.0
|
||||
- )
|
||||
- else:
|
||||
- # 2. 构建 Prompt 让 LLM 提取信息
|
||||
- result = await self._extract_field_value(
|
||||
- field=field,
|
||||
- rag_context=rag_results,
|
||||
- user_hint=user_hint
|
||||
- )
|
||||
-
|
||||
- # 3. 存储结果
|
||||
+ # 从源文档中提取字段值
|
||||
+ result = await self._extract_field_value(
|
||||
+ field=field,
|
||||
+ source_docs=source_docs,
|
||||
+ user_hint=user_hint
|
||||
+ )
|
||||
+
|
||||
+ # 存储结果
|
||||
filled_data[field.name] = result.value
|
||||
fill_details.append({
|
||||
"field": field.name,
|
||||
@@ -107,75 +113,113 @@ class TemplateFillService:
|
||||
return {
|
||||
"success": True,
|
||||
"filled_data": filled_data,
|
||||
- "fill_details": fill_details
|
||||
+ "fill_details": fill_details,
|
||||
+ "source_doc_count": len(source_docs)
|
||||
}
|
||||
|
||||
- async def _retrieve_context(
|
||||
+ async def _load_source_documents(
|
||||
self,
|
||||
- field_name: str,
|
||||
- user_hint: Optional[str] = None
|
||||
- ) -> List[Dict[str, Any]]:
|
||||
+ source_doc_ids: Optional[List[str]] = None,
|
||||
+ source_file_paths: Optional[List[str]] = None
|
||||
+ ) -> List[SourceDocument]:
|
||||
"""
|
||||
- 从 RAG 检索相关上下文
|
||||
+ 加载源文档内容
|
||||
|
||||
Args:
|
||||
- field_name: 字段名称
|
||||
- user_hint: 用户提示
|
||||
+ source_doc_ids: MongoDB 文档 ID 列表
|
||||
+ source_file_paths: 源文档文件路径列表
|
||||
|
||||
Returns:
|
||||
- 检索结果列表
|
||||
+ 源文档列表
|
||||
"""
|
||||
- # 构建查询文本
|
||||
- query = field_name
|
||||
- if user_hint:
|
||||
- query = f"{user_hint} {field_name}"
|
||||
-
|
||||
- # 检索相关文档片段
|
||||
- results = self.rag.retrieve(query=query, top_k=5)
|
||||
-
|
||||
- return results
|
||||
+ source_docs = []
|
||||
+
|
||||
+ # 1. 从 MongoDB 加载文档
|
||||
+ if source_doc_ids:
|
||||
+ for doc_id in source_doc_ids:
|
||||
+ try:
|
||||
+ doc = await mongodb.get_document(doc_id)
|
||||
+ if doc:
|
||||
+ source_docs.append(SourceDocument(
|
||||
+ doc_id=doc_id,
|
||||
+ filename=doc.get("metadata", {}).get("original_filename", "unknown"),
|
||||
+ doc_type=doc.get("doc_type", "unknown"),
|
||||
+ content=doc.get("content", ""),
|
||||
+ structured_data=doc.get("structured_data", {})
|
||||
+ ))
|
||||
+ logger.info(f"从MongoDB加载文档: {doc_id}")
|
||||
+ except Exception as e:
|
||||
+ logger.error(f"从MongoDB加载文档失败 {doc_id}: {str(e)}")
|
||||
+
|
||||
+ # 2. 从文件路径加载文档
|
||||
+ if source_file_paths:
|
||||
+ for file_path in source_file_paths:
|
||||
+ try:
|
||||
+ parser = ParserFactory.get_parser(file_path)
|
||||
+ result = parser.parse(file_path)
|
||||
+ if result.success:
|
||||
+ source_docs.append(SourceDocument(
|
||||
+ doc_id=file_path,
|
||||
+ filename=result.metadata.get("filename", file_path.split("/")[-1]),
|
||||
+ doc_type=result.metadata.get("extension", "unknown").replace(".", ""),
|
||||
+ content=result.data.get("content", ""),
|
||||
+ structured_data=result.data.get("structured_data", {})
|
||||
+ ))
|
||||
+ logger.info(f"从文件加载文档: {file_path}")
|
||||
+ except Exception as e:
|
||||
+ logger.error(f"从文件加载文档失败 {file_path}: {str(e)}")
|
||||
+
|
||||
+ return source_docs
|
||||
|
||||
async def _extract_field_value(
|
||||
self,
|
||||
field: TemplateField,
|
||||
- rag_context: List[Dict[str, Any]],
|
||||
+ source_docs: List[SourceDocument],
|
||||
user_hint: Optional[str] = None
|
||||
) -> FillResult:
|
||||
"""
|
||||
- 使用 LLM 从上下文中提取字段值
|
||||
+ 使用 LLM 从源文档中提取字段值
|
||||
|
||||
Args:
|
||||
field: 字段定义
|
||||
- rag_context: RAG 检索到的上下文
|
||||
+ source_docs: 源文档列表
|
||||
user_hint: 用户提示
|
||||
|
||||
Returns:
|
||||
提取结果
|
||||
"""
|
||||
+ if not source_docs:
|
||||
+ return FillResult(
|
||||
+ field=field.name,
|
||||
+ value="",
|
||||
+ source="无源文档",
|
||||
+ confidence=0.0
|
||||
+ )
|
||||
+
|
||||
# 构建上下文文本
|
||||
- context_text = "\n\n".join([
|
||||
- f"【文档 {i+1}】\n{doc['content']}"
|
||||
- for i, doc in enumerate(rag_context)
|
||||
- ])
|
||||
+ context_text = self._build_context_text(source_docs, max_length=8000)
|
||||
+
|
||||
+ # 构建提示词
|
||||
+ hint_text = field.hint if field.hint else f"请提取{field.name}的信息"
|
||||
+ if user_hint:
|
||||
+ hint_text = f"{user_hint}。{hint_text}"
|
||||
|
||||
- # 构建 Prompt
|
||||
- prompt = f"""你是一个数据提取专家。请根据以下文档内容,提取指定字段的信息。
|
||||
+ prompt = f"""你是一个专业的数据提取专家。请根据以下文档内容,提取指定字段的信息。
|
||||
|
||||
需要提取的字段:
|
||||
- 字段名称:{field.name}
|
||||
- 字段类型:{field.field_type}
|
||||
+- 填写提示:{hint_text}
|
||||
- 是否必填:{'是' if field.required else '否'}
|
||||
|
||||
-{'用户提示:' + user_hint if user_hint else ''}
|
||||
-
|
||||
参考文档内容:
|
||||
{context_text}
|
||||
|
||||
请严格按照以下 JSON 格式输出,不要添加任何解释:
|
||||
{{
|
||||
"value": "提取到的值,如果没有找到则填写空字符串",
|
||||
- "source": "数据来源的文档描述",
|
||||
- "confidence": 0.0到1.0之间的置信度
|
||||
+ "source": "数据来源的文档描述(如:来自xxx文档)",
|
||||
+ "confidence": 0.0到1.0之间的置信度,表示对提取结果的信心程度"
|
||||
}}
|
||||
"""
|
||||
|
||||
@@ -226,6 +270,54 @@ class TemplateFillService:
|
||||
confidence=0.0
|
||||
)
|
||||
|
||||
+ def _build_context_text(self, source_docs: List[SourceDocument], max_length: int = 8000) -> str:
|
||||
+ """
|
||||
+ 构建上下文文本
|
||||
+
|
||||
+ Args:
|
||||
+ source_docs: 源文档列表
|
||||
+ max_length: 最大字符数
|
||||
+
|
||||
+ Returns:
|
||||
+ 上下文文本
|
||||
+ """
|
||||
+ contexts = []
|
||||
+ total_length = 0
|
||||
+
|
||||
+ for doc in source_docs:
|
||||
+ # 优先使用结构化数据(表格),其次使用文本内容
|
||||
+ doc_content = ""
|
||||
+
|
||||
+ if doc.structured_data and doc.structured_data.get("tables"):
|
||||
+ # 如果有表格数据,优先使用
|
||||
+ tables = doc.structured_data.get("tables", [])
|
||||
+ for table in tables:
|
||||
+ if isinstance(table, dict):
|
||||
+ rows = table.get("rows", [])
|
||||
+ if rows:
|
||||
+ doc_content += f"\n【文档: {doc.filename} 表格数据】\n"
|
||||
+ for row in rows[:20]: # 限制每表最多20行
|
||||
+ if isinstance(row, list):
|
||||
+ doc_content += " | ".join(str(cell) for cell in row) + "\n"
|
||||
+ elif isinstance(row, dict):
|
||||
+ doc_content += " | ".join(str(v) for v in row.values()) + "\n"
|
||||
+ elif doc.content:
|
||||
+ doc_content = doc.content[:5000] # 限制文本长度
|
||||
+
|
||||
+ if doc_content:
|
||||
+ doc_context = f"【文档: {doc.filename} ({doc.doc_type})】\n{doc_content}"
|
||||
+ if total_length + len(doc_context) <= max_length:
|
||||
+ contexts.append(doc_context)
|
||||
+ total_length += len(doc_context)
|
||||
+ else:
|
||||
+ # 如果超出长度,截断
|
||||
+ remaining = max_length - total_length
|
||||
+ if remaining > 100:
|
||||
+ contexts.append(doc_context[:remaining])
|
||||
+ break
|
||||
+
|
||||
+ return "\n\n".join(contexts) if contexts else "(源文档内容为空)"
|
||||
+
|
||||
async def get_template_fields_from_file(
|
||||
self,
|
||||
file_path: str,
|
||||
@@ -236,7 +328,7 @@ class TemplateFillService:
|
||||
|
||||
Args:
|
||||
file_path: 模板文件路径
|
||||
- file_type: 文件类型
|
||||
+ file_type: 文件类型 (xlsx/xls/docx)
|
||||
|
||||
Returns:
|
||||
字段列表
|
||||
@@ -245,43 +337,108 @@ class TemplateFillService:
|
||||
|
||||
try:
|
||||
if file_type in ["xlsx", "xls"]:
|
||||
- # 从 Excel 读取表头
|
||||
- import pandas as pd
|
||||
- df = pd.read_excel(file_path, nrows=5)
|
||||
+ fields = await self._get_template_fields_from_excel(file_path)
|
||||
+ elif file_type == "docx":
|
||||
+ fields = await self._get_template_fields_from_docx(file_path)
|
||||
|
||||
- for idx, col in enumerate(df.columns):
|
||||
- # 获取单元格位置 (A, B, C, ...)
|
||||
- cell = self._column_to_cell(idx)
|
||||
+ except Exception as e:
|
||||
+ logger.error(f"提取模板字段失败: {str(e)}")
|
||||
|
||||
- fields.append(TemplateField(
|
||||
- cell=cell,
|
||||
- name=str(col),
|
||||
- field_type=self._infer_field_type(df[col]),
|
||||
- required=True
|
||||
- ))
|
||||
+ return fields
|
||||
|
||||
- elif file_type == "docx":
|
||||
- # 从 Word 表格读取
|
||||
- from docx import Document
|
||||
- doc = Document(file_path)
|
||||
-
|
||||
- for table_idx, table in enumerate(doc.tables):
|
||||
- for row_idx, row in enumerate(table.rows):
|
||||
- for col_idx, cell in enumerate(row.cells):
|
||||
- cell_text = cell.text.strip()
|
||||
- if cell_text:
|
||||
- fields.append(TemplateField(
|
||||
- cell=self._column_to_cell(col_idx),
|
||||
- name=cell_text,
|
||||
- field_type="text",
|
||||
- required=True
|
||||
- ))
|
||||
+ async def _get_template_fields_from_excel(self, file_path: str) -> List[TemplateField]:
|
||||
+ """从 Excel 模板提取字段"""
|
||||
+ fields = []
|
||||
+
|
||||
+ try:
|
||||
+ import pandas as pd
|
||||
+ df = pd.read_excel(file_path, nrows=5)
|
||||
+
|
||||
+ for idx, col in enumerate(df.columns):
|
||||
+ cell = self._column_to_cell(idx)
|
||||
+ col_str = str(col)
|
||||
+
|
||||
+ fields.append(TemplateField(
|
||||
+ cell=cell,
|
||||
+ name=col_str,
|
||||
+ field_type=self._infer_field_type_from_value(df[col].iloc[0] if len(df) > 0 else ""),
|
||||
+ required=True,
|
||||
+ hint=""
|
||||
+ ))
|
||||
|
||||
except Exception as e:
|
||||
- logger.error(f"提取模板字段失败: {str(e)}")
|
||||
+ logger.error(f"从Excel提取字段失败: {str(e)}")
|
||||
|
||||
return fields
|
||||
|
||||
+ async def _get_template_fields_from_docx(self, file_path: str) -> List[TemplateField]:
|
||||
+ """从 Word 模板提取字段"""
|
||||
+ fields = []
|
||||
+
|
||||
+ try:
|
||||
+ from docx import Document
|
||||
+
|
||||
+ doc = Document(file_path)
|
||||
+
|
||||
+ for table_idx, table in enumerate(doc.tables):
|
||||
+ for row_idx, row in enumerate(table.rows):
|
||||
+ cells = [cell.text.strip() for cell in row.cells]
|
||||
+
|
||||
+ # 假设第一列是字段名
|
||||
+ if cells and cells[0]:
|
||||
+ field_name = cells[0]
|
||||
+ hint = cells[1] if len(cells) > 1 else ""
|
||||
+
|
||||
+ # 跳过空行或标题行
|
||||
+ if field_name and field_name not in ["", "字段名", "名称", "项目"]:
|
||||
+ fields.append(TemplateField(
|
||||
+ cell=f"T{table_idx}R{row_idx}",
|
||||
+ name=field_name,
|
||||
+ field_type=self._infer_field_type_from_hint(hint),
|
||||
+ required=True,
|
||||
+ hint=hint
|
||||
+ ))
|
||||
+
|
||||
+ except Exception as e:
|
||||
+ logger.error(f"从Word提取字段失败: {str(e)}")
|
||||
+
|
||||
+ return fields
|
||||
+
|
||||
+ def _infer_field_type_from_hint(self, hint: str) -> str:
|
||||
+ """从提示词推断字段类型"""
|
||||
+ hint_lower = hint.lower()
|
||||
+
|
||||
+ date_keywords = ["年", "月", "日", "日期", "时间", "出生"]
|
||||
+ if any(kw in hint for kw in date_keywords):
|
||||
+ return "date"
|
||||
+
|
||||
+ number_keywords = ["数量", "金额", "人数", "面积", "增长", "比率", "%", "率", "总计", "合计"]
|
||||
+ if any(kw in hint_lower for kw in number_keywords):
|
||||
+ return "number"
|
||||
+
|
||||
+ return "text"
|
||||
+
|
||||
+ def _infer_field_type_from_value(self, value: Any) -> str:
|
||||
+ """从示例值推断字段类型"""
|
||||
+ if value is None or value == "":
|
||||
+ return "text"
|
||||
+
|
||||
+ value_str = str(value)
|
||||
+
|
||||
+ # 检查日期模式
|
||||
+ import re
|
||||
+ if re.search(r'\d{4}[年/-]\d{1,2}[月/-]\d{1,2}', value_str):
|
||||
+ return "date"
|
||||
+
|
||||
+ # 检查数值
|
||||
+ try:
|
||||
+ float(value_str.replace(',', '').replace('%', ''))
|
||||
+ return "number"
|
||||
+ except ValueError:
|
||||
+ pass
|
||||
+
|
||||
+ return "text"
|
||||
+
|
||||
def _column_to_cell(self, col_idx: int) -> str:
|
||||
"""将列索引转换为单元格列名 (0 -> A, 1 -> B, ...)"""
|
||||
result = ""
|
||||
@@ -290,17 +447,6 @@ class TemplateFillService:
|
||||
col_idx = col_idx // 26 - 1
|
||||
return result
|
||||
|
||||
- def _infer_field_type(self, series) -> str:
|
||||
- """推断字段类型"""
|
||||
- import pandas as pd
|
||||
-
|
||||
- if pd.api.types.is_numeric_dtype(series):
|
||||
- return "number"
|
||||
- elif pd.api.types.is_datetime64_any_dtype(series):
|
||||
- return "date"
|
||||
- else:
|
||||
- return "text"
|
||||
-
|
||||
|
||||
# ==================== 全局单例 ====================
|
||||
|
||||
53
logs/frontend_template_fill.patch
Normal file
53
logs/frontend_template_fill.patch
Normal file
@@ -0,0 +1,53 @@
|
||||
diff --git a/frontend/src/db/backend-api.ts b/frontend/src/db/backend-api.ts
|
||||
index 8944353..94ac852 100644
|
||||
--- a/frontend/src/db/backend-api.ts
|
||||
+++ b/frontend/src/db/backend-api.ts
|
||||
@@ -92,6 +92,7 @@ export interface TemplateField {
|
||||
name: string;
|
||||
field_type: string;
|
||||
required: boolean;
|
||||
+ hint?: string;
|
||||
}
|
||||
|
||||
// 表格填写结果
|
||||
@@ -625,7 +626,10 @@ export const backendApi = {
|
||||
*/
|
||||
async fillTemplate(
|
||||
templateId: string,
|
||||
- templateFields: TemplateField[]
|
||||
+ templateFields: TemplateField[],
|
||||
+ sourceDocIds?: string[],
|
||||
+ sourceFilePaths?: string[],
|
||||
+ userHint?: string
|
||||
): Promise<FillResult> {
|
||||
const url = `${BACKEND_BASE_URL}/templates/fill`;
|
||||
|
||||
@@ -636,6 +640,9 @@ export const backendApi = {
|
||||
body: JSON.stringify({
|
||||
template_id: templateId,
|
||||
template_fields: templateFields,
|
||||
+ source_doc_ids: sourceDocIds || [],
|
||||
+ source_file_paths: sourceFilePaths || [],
|
||||
+ user_hint: userHint || null,
|
||||
}),
|
||||
});
|
||||
|
||||
diff --git a/frontend/src/pages/TemplateFill.tsx b/frontend/src/pages/TemplateFill.tsx
|
||||
index 8c330a9..f9a4a39 100644
|
||||
--- a/frontend/src/pages/TemplateFill.tsx
|
||||
+++ b/frontend/src/pages/TemplateFill.tsx
|
||||
@@ -128,8 +128,12 @@ const TemplateFill: React.FC = () => {
|
||||
setStep('filling');
|
||||
|
||||
try {
|
||||
- // 调用后端填表接口
|
||||
- const result = await backendApi.fillTemplate('temp-template-id', templateFields);
|
||||
+ // 调用后端填表接口,传递选中的文档ID
|
||||
+ const result = await backendApi.fillTemplate(
|
||||
+ 'temp-template-id',
|
||||
+ templateFields,
|
||||
+ selectedDocs // 传递源文档ID列表
|
||||
+ );
|
||||
setFilledResult(result);
|
||||
setStep('preview');
|
||||
toast.success('表格填写完成');
|
||||
221
logs/planning_doc.patch
Normal file
221
logs/planning_doc.patch
Normal file
@@ -0,0 +1,221 @@
|
||||
diff --git "a/\346\257\224\350\265\233\345\244\207\350\265\233\350\247\204\345\210\222.md" "b/\346\257\224\350\265\233\345\244\207\350\265\233\350\247\204\345\210\222.md"
|
||||
index bcb48fd..440a12d 100644
|
||||
--- "a/\346\257\224\350\265\233\345\244\207\350\265\233\350\247\204\345\210\222.md"
|
||||
+++ "b/\346\257\224\350\265\233\345\244\207\350\265\233\350\247\204\345\210\222.md"
|
||||
@@ -50,7 +50,7 @@
|
||||
| `prompt_service.py` | ✅ 已完成 | Prompt 模板管理 |
|
||||
| `text_analysis_service.py` | ✅ 已完成 | 文本分析 |
|
||||
| `chart_generator_service.py` | ✅ 已完成 | 图表生成服务 |
|
||||
-| `template_fill_service.py` | ❌ 未完成 | 模板填写服务 |
|
||||
+| `template_fill_service.py` | ✅ 已完成 | 模板填写服务,支持直接读取源文档进行填表 |
|
||||
|
||||
### 2.2 API 接口 (`backend/app/api/endpoints/`)
|
||||
|
||||
@@ -61,7 +61,7 @@
|
||||
| `ai_analyze.py` | `/api/v1/analyze/*` | ✅ AI 分析(Excel、Markdown、流式) |
|
||||
| `rag.py` | `/api/v1/rag/*` | ⚠️ RAG 检索(当前返回空) |
|
||||
| `tasks.py` | `/api/v1/tasks/*` | ✅ 异步任务状态查询 |
|
||||
-| `templates.py` | `/api/v1/templates/*` | ✅ 模板管理 |
|
||||
+| `templates.py` | `/api/v1/templates/*` | ✅ 模板管理 (含 Word 导出) |
|
||||
| `visualization.py` | `/api/v1/visualization/*` | ✅ 可视化图表 |
|
||||
| `health.py` | `/api/v1/health` | ✅ 健康检查 |
|
||||
|
||||
@@ -78,8 +78,8 @@
|
||||
|------|----------|------|
|
||||
| Excel (.xlsx/.xls) | ✅ 已完成 | pandas + XML 回退解析 |
|
||||
| Markdown (.md) | ✅ 已完成 | 正则 + AI 分章节 |
|
||||
-| Word (.docx) | ❌ 未完成 | 尚未实现 |
|
||||
-| Text (.txt) | ❌ 未完成 | 尚未实现 |
|
||||
+| Word (.docx) | ✅ 已完成 | python-docx 解析,支持表格提取和字段识别 |
|
||||
+| Text (.txt) | ✅ 已完成 | chardet 编码检测,支持文本清洗和结构化提取 |
|
||||
|
||||
---
|
||||
|
||||
@@ -87,7 +87,7 @@
|
||||
|
||||
### 3.1 模板填写模块(最优先)
|
||||
|
||||
-**这是比赛的核心评测功能,必须完成。**
|
||||
+**当前状态**:✅ 已完成
|
||||
|
||||
```
|
||||
用户上传模板表格(Word/Excel)
|
||||
@@ -103,30 +103,34 @@ AI 根据字段提示词从源数据中提取信息
|
||||
返回填写完成的表格
|
||||
```
|
||||
|
||||
-**需要实现**:
|
||||
-- [ ] `template_fill_service.py` - 模板填写核心服务
|
||||
-- [ ] Word 模板解析 (`docx_parser.py` 需新建)
|
||||
-- [ ] Text 模板解析 (`txt_parser.py` 需新建)
|
||||
-- [ ] 模板字段识别与提示词提取
|
||||
-- [ ] 多文档数据聚合与冲突处理
|
||||
-- [ ] 结果导出为 Word/Excel
|
||||
+**已完成实现**:
|
||||
+- [x] `template_fill_service.py` - 模板填写核心服务
|
||||
+- [x] Word 模板解析 (`docx_parser.py` - parse_tables_for_template, extract_template_fields_from_docx)
|
||||
+- [x] Text 模板解析 (`txt_parser.py` - 已完成)
|
||||
+- [x] 模板字段识别与提示词提取
|
||||
+- [x] 多文档数据聚合与冲突处理
|
||||
+- [x] 结果导出为 Word/Excel
|
||||
|
||||
### 3.2 Word 文档解析
|
||||
|
||||
-**当前状态**:仅有框架,尚未实现具体解析逻辑
|
||||
+**当前状态**:✅ 已完成
|
||||
|
||||
-**需要实现**:
|
||||
-- [ ] `docx_parser.py` - Word 文档解析器
|
||||
-- [ ] 提取段落文本
|
||||
-- [ ] 提取表格内容
|
||||
-- [ ] 提取关键信息(标题、列表等)
|
||||
+**已实现功能**:
|
||||
+- [x] `docx_parser.py` - Word 文档解析器
|
||||
+- [x] 提取段落文本
|
||||
+- [x] 提取表格内容
|
||||
+- [x] 提取关键信息(标题、列表等)
|
||||
+- [x] 表格模板字段提取 (`parse_tables_for_template`, `extract_template_fields_from_docx`)
|
||||
+- [x] 字段类型推断 (`_infer_field_type_from_hint`)
|
||||
|
||||
### 3.3 Text 文档解析
|
||||
|
||||
-**需要实现**:
|
||||
-- [ ] `txt_parser.py` - 文本文件解析器
|
||||
-- [ ] 编码自动检测
|
||||
-- [ ] 文本清洗
|
||||
+**当前状态**:✅ 已完成
|
||||
+
|
||||
+**已实现功能**:
|
||||
+- [x] `txt_parser.py` - 文本文件解析器
|
||||
+- [x] 编码自动检测 (chardet)
|
||||
+- [x] 文本清洗
|
||||
|
||||
### 3.4 文档模板匹配(已有框架)
|
||||
|
||||
@@ -215,5 +219,122 @@ docs/test/
|
||||
|
||||
---
|
||||
|
||||
-*文档版本: v1.0*
|
||||
-*最后更新: 2026-04-08*
|
||||
\ No newline at end of file
|
||||
+*文档版本: v1.1*
|
||||
+*最后更新: 2026-04-08*
|
||||
+
|
||||
+---
|
||||
+
|
||||
+## 八、技术实现细节
|
||||
+
|
||||
+### 8.1 模板填表流程(已实现)
|
||||
+
|
||||
+#### 流程图
|
||||
+```
|
||||
+┌─────────────┐ ┌─────────────┐ ┌─────────────┐
|
||||
+│ 上传模板 │ ──► │ 选择数据源 │ ──► │ AI 智能填表 │
|
||||
+└─────────────┘ └─────────────┘ └─────────────┘
|
||||
+ │
|
||||
+ ▼
|
||||
+ ┌─────────────┐
|
||||
+ │ 导出结果 │
|
||||
+ └─────────────┘
|
||||
+```
|
||||
+
|
||||
+#### 核心组件
|
||||
+
|
||||
+| 组件 | 文件 | 说明 |
|
||||
+|------|------|------|
|
||||
+| 模板上传 | `templates.py` `/templates/upload` | 接收模板文件,提取字段 |
|
||||
+| 字段提取 | `template_fill_service.py` | 从 Word/Excel 表格提取字段定义 |
|
||||
+| 文档解析 | `docx_parser.py`, `xlsx_parser.py`, `txt_parser.py` | 解析源文档内容 |
|
||||
+| 智能填表 | `template_fill_service.py` `fill_template()` | 使用 LLM 从源文档提取信息 |
|
||||
+| 结果导出 | `templates.py` `/templates/export` | 导出为 Excel 或 Word |
|
||||
+
|
||||
+### 8.2 源文档加载方式
|
||||
+
|
||||
+模板填表服务支持两种方式加载源文档:
|
||||
+
|
||||
+1. **通过 MongoDB 文档 ID**:`source_doc_ids`
|
||||
+ - 文档已上传并存入 MongoDB
|
||||
+ - 服务直接查询 MongoDB 获取文档内容
|
||||
+
|
||||
+2. **通过文件路径**:`source_file_paths`
|
||||
+ - 直接读取本地文件
|
||||
+ - 使用对应的解析器解析内容
|
||||
+
|
||||
+### 8.3 Word 表格模板解析
|
||||
+
|
||||
+比赛评分表格通常是 Word 格式,`docx_parser.py` 提供了专门的解析方法:
|
||||
+
|
||||
+```python
|
||||
+# 提取表格模板字段
|
||||
+fields = docx_parser.extract_template_fields_from_docx(file_path)
|
||||
+
|
||||
+# 返回格式
|
||||
+# [
|
||||
+# {
|
||||
+# "cell": "T0R1", # 表格0,行1
|
||||
+# "name": "字段名",
|
||||
+# "hint": "提示词",
|
||||
+# "field_type": "text/number/date",
|
||||
+# "required": True
|
||||
+# },
|
||||
+# ...
|
||||
+# ]
|
||||
+```
|
||||
+
|
||||
+### 8.4 字段类型推断
|
||||
+
|
||||
+系统支持从提示词自动推断字段类型:
|
||||
+
|
||||
+| 关键词 | 推断类型 | 示例 |
|
||||
+|--------|----------|------|
|
||||
+| 年、月、日、日期、时间、出生 | date | 出生日期 |
|
||||
+| 数量、金额、比率、%、率、合计 | number | 增长比率 |
|
||||
+| 其他 | text | 姓名、地址 |
|
||||
+
|
||||
+### 8.5 API 接口
|
||||
+
|
||||
+#### POST `/api/v1/templates/fill`
|
||||
+
|
||||
+填写请求:
|
||||
+```json
|
||||
+{
|
||||
+ "template_id": "模板ID",
|
||||
+ "template_fields": [
|
||||
+ {"cell": "A1", "name": "姓名", "field_type": "text", "required": true, "hint": "提取人员姓名"}
|
||||
+ ],
|
||||
+ "source_doc_ids": ["mongodb_doc_id_1", "mongodb_doc_id_2"],
|
||||
+ "source_file_paths": [],
|
||||
+ "user_hint": "请从合同文档中提取"
|
||||
+}
|
||||
+```
|
||||
+
|
||||
+响应:
|
||||
+```json
|
||||
+{
|
||||
+ "success": true,
|
||||
+ "filled_data": {"姓名": "张三"},
|
||||
+ "fill_details": [
|
||||
+ {
|
||||
+ "field": "姓名",
|
||||
+ "cell": "A1",
|
||||
+ "value": "张三",
|
||||
+ "source": "来自:合同文档.docx",
|
||||
+ "confidence": 0.95
|
||||
+ }
|
||||
+ ],
|
||||
+ "source_doc_count": 2
|
||||
+}
|
||||
+```
|
||||
+
|
||||
+#### POST `/api/v1/templates/export`
|
||||
+
|
||||
+导出请求:
|
||||
+```json
|
||||
+{
|
||||
+ "template_id": "模板ID",
|
||||
+ "filled_data": {"姓名": "张三", "金额": "10000"},
|
||||
+ "format": "xlsx" // 或 "docx"
|
||||
+}
|
||||
+```
|
||||
\ No newline at end of file
|
||||
59
logs/rag_disable_note.txt
Normal file
59
logs/rag_disable_note.txt
Normal file
@@ -0,0 +1,59 @@
|
||||
RAG 服务临时禁用说明
|
||||
========================
|
||||
日期: 2026-04-08
|
||||
|
||||
修改内容:
|
||||
----------
|
||||
应需求,RAG 向量检索功能已临时禁用,具体如下:
|
||||
|
||||
1. 修改文件: backend/app/services/rag_service.py
|
||||
|
||||
2. 关键变更:
|
||||
- 在 RAGService.__init__ 中添加 self._disabled = True 标志
|
||||
- index_field() - 添加 _disabled 检查,跳过实际索引操作并记录日志
|
||||
- index_document_content() - 添加 _disabled 检查,跳过实际索引操作并记录日志
|
||||
- retrieve() - 添加 _disabled 检查,返回空列表并记录日志
|
||||
- get_vector_count() - 添加 _disabled 检查,返回 0 并记录日志
|
||||
- clear() - 添加 _disabled 检查,跳过实际清空操作并记录日志
|
||||
|
||||
3. 行为变更:
|
||||
- 所有 RAG 索引构建操作会被记录到日志 ([RAG DISABLED] 前缀)
|
||||
- 所有 RAG 检索操作返回空结果
|
||||
- 向量计数始终返回 0
|
||||
- 实际向量数据库操作被跳过
|
||||
|
||||
4. 恢复方式:
|
||||
- 将 RAGService.__init__ 中的 self._disabled = True 改为 self._disabled = False
|
||||
- 重新启动服务即可恢复 RAG 功能
|
||||
|
||||
目的:
|
||||
------
|
||||
保留 RAG 索引构建功能的前端界面和代码结构,暂不实际调用向量数据库 API,
|
||||
待后续需要时再启用。
|
||||
|
||||
影响范围:
|
||||
---------
|
||||
- /api/v1/rag/search - RAG 搜索接口 (返回空结果)
|
||||
- /api/v1/rag/status - RAG 状态接口 (返回 vector_count=0)
|
||||
- /api/v1/rag/rebuild - RAG 重建接口 (仅记录日志)
|
||||
- Excel/文档上传时的 RAG 索引构建 (仅记录日志)
|
||||
|
||||
========================
|
||||
后续补充 (2026-04-08):
|
||||
========================
|
||||
修改文件: backend/app/services/table_rag_service.py
|
||||
|
||||
关键变更:
|
||||
- 在 TableRAGService.__init__ 中添加 self._disabled = True 标志
|
||||
- build_table_rag_index() - RAG 索引部分被跳过,仅记录日志
|
||||
- index_document_table() - RAG 索引部分被跳过,仅记录日志
|
||||
|
||||
行为变更:
|
||||
- Excel 上传时,MySQL 存储仍然正常进行
|
||||
- AI 字段描述仍然正常生成(调用 LLM)
|
||||
- 只有向量数据库索引操作被跳过
|
||||
|
||||
恢复方式:
|
||||
- 将 TableRAGService.__init__ 中的 self._disabled = True 改为 self._disabled = False
|
||||
- 同时将 rag_service.py 中的 self._disabled = True 改为 self._disabled = False
|
||||
- 两者需同时改为 False 才能完全恢复 RAG 功能
|
||||
144
logs/template_fill_feature_changes.md
Normal file
144
logs/template_fill_feature_changes.md
Normal file
@@ -0,0 +1,144 @@
|
||||
# 模板填表功能变更日志
|
||||
|
||||
**变更日期**: 2026-04-08
|
||||
**变更类型**: 功能完善
|
||||
**变更内容**: Word 表格解析和模板填表功能
|
||||
|
||||
---
|
||||
|
||||
## 变更概述
|
||||
|
||||
本次变更完善了 Word 表格解析、表格模板构建和填写功能,实现了从源文档(MongoDB/文件)读取数据并智能填表的核心流程。
|
||||
|
||||
### 涉及文件
|
||||
|
||||
| 文件 | 变更行数 | 说明 |
|
||||
|------|----------|------|
|
||||
| backend/app/api/endpoints/templates.py | +156 | API 端点完善,添加 Word 导出 |
|
||||
| backend/app/core/document_parser/docx_parser.py | +130 | Word 表格解析增强 |
|
||||
| backend/app/services/template_fill_service.py | +340 | 核心填表服务重写 |
|
||||
| frontend/src/db/backend-api.ts | +9 | 前端 API 更新 |
|
||||
| frontend/src/pages/TemplateFill.tsx | +8 | 前端页面更新 |
|
||||
| 比赛备赛规划.md | +169 | 文档更新 |
|
||||
|
||||
---
|
||||
|
||||
## 详细变更
|
||||
|
||||
### 1. backend/app/core/document_parser/docx_parser.py
|
||||
|
||||
**新增方法**:
|
||||
|
||||
- `parse_tables_for_template(file_path)` - 解析 Word 文档中的表格,提取模板字段
|
||||
- `extract_template_fields_from_docx(file_path)` - 从 Word 文档提取模板字段定义
|
||||
- `_infer_field_type_from_hint(hint)` - 从提示词推断字段类型
|
||||
|
||||
**功能说明**:
|
||||
- 专门用于比赛场景:解析表格模板,识别需要填写的字段
|
||||
- 支持从表格第一列提取字段名,第二列提取提示词/描述
|
||||
- 自动推断字段类型(text/number/date)
|
||||
|
||||
### 2. backend/app/services/template_fill_service.py
|
||||
|
||||
**重构内容**:
|
||||
|
||||
- 不再依赖 RAG 服务,直接从 MongoDB 或文件读取源文档
|
||||
- 新增 `SourceDocument` 数据类
|
||||
- 完善 `fill_template()` 方法,支持 `source_doc_ids` 和 `source_file_paths`
|
||||
- 新增 `_load_source_documents()` - 加载源文档内容
|
||||
- 重构 `_extract_field_value()` - 改为使用 LLM 直接从源文档(而非 RAG 检索结果)中提取字段值
|
||||
- 新增 `_build_context_text()` - 构建上下文(优先使用表格数据)
|
||||
- 完善 `_get_template_fields_from_docx()` - Word 模板字段提取
|
||||
|
||||
**核心流程**:
|
||||
```
|
||||
1. 加载源文档(MongoDB 或文件)
|
||||
2. 对每个字段调用 LLM 提取值
|
||||
3. 返回填写结果
|
||||
```
|
||||
|
||||
### 3. backend/app/api/endpoints/templates.py
|
||||
|
||||
**新增内容**:
|
||||
|
||||
- `FillRequest` 添加 `source_doc_ids`, `source_file_paths`, `user_hint` 字段
|
||||
- `ExportRequest` 添加 `format` 字段
|
||||
- `_export_to_word()` - 导出为 Word 格式
|
||||
- `/templates/export/excel` - 专门导出 Excel
|
||||
- `/templates/export/word` - 专门导出 Word
|
||||
|
||||
### 4. frontend/src/db/backend-api.ts
|
||||
|
||||
**更新内容**:
|
||||
|
||||
- `TemplateField` 接口添加 `hint` 字段
|
||||
- `fillTemplate()` 方法添加 `sourceDocIds`, `sourceFilePaths`, `userHint` 参数
|
||||
|
||||
### 5. frontend/src/pages/TemplateFill.tsx
|
||||
|
||||
**更新内容**:
|
||||
|
||||
- `handleFillTemplate()` 传递 `selectedDocs` 作为 `sourceDocIds` 参数
|
||||
|
||||
---
|
||||
|
||||
## API 接口变更
|
||||
|
||||
### POST /api/v1/templates/fill
|
||||
|
||||
**请求体**:
|
||||
```json
|
||||
{
|
||||
"template_id": "模板ID",
|
||||
"template_fields": [
|
||||
{
|
||||
"cell": "A1",
|
||||
"name": "姓名",
|
||||
"field_type": "text",
|
||||
"required": true,
|
||||
"hint": "提取人员姓名"
|
||||
}
|
||||
],
|
||||
"source_doc_ids": ["mongodb_doc_id"],
|
||||
"source_file_paths": [],
|
||||
"user_hint": "请从xxx文档中提取"
|
||||
}
|
||||
```
|
||||
|
||||
**响应**:
|
||||
```json
|
||||
{
|
||||
"success": true,
|
||||
"filled_data": {"姓名": "张三"},
|
||||
"fill_details": [...],
|
||||
"source_doc_count": 1
|
||||
}
|
||||
```
|
||||
|
||||
### POST /api/v1/templates/export
|
||||
|
||||
**新增支持 format=docx**,可导出为 Word 格式
|
||||
|
||||
---
|
||||
|
||||
## 技术细节
|
||||
|
||||
### 字段类型推断
|
||||
|
||||
| 关键词 | 推断类型 |
|
||||
|--------|----------|
|
||||
| 年、月、日、日期、时间、出生 | date |
|
||||
| 数量、金额、比率、%、率、合计 | number |
|
||||
| 其他 | text |
|
||||
|
||||
### 上下文构建
|
||||
|
||||
源文档内容构建优先级:
|
||||
1. 结构化数据(表格数据)
|
||||
2. 原始文本内容(限制 5000 字符)
|
||||
|
||||
---
|
||||
|
||||
## 相关文档
|
||||
|
||||
- [比赛备赛规划.md](../比赛备赛规划.md) - 已更新功能状态和技术实现细节
|
||||
20
package.json
Normal file
20
package.json
Normal file
@@ -0,0 +1,20 @@
|
||||
{
|
||||
"name": "filesreadsystem",
|
||||
"version": "1.0.0",
|
||||
"description": "",
|
||||
"main": "index.js",
|
||||
"directories": {
|
||||
"doc": "docs"
|
||||
},
|
||||
"scripts": {
|
||||
"test": "echo \"Error: no test specified\" && exit 1"
|
||||
},
|
||||
"repository": {
|
||||
"type": "git",
|
||||
"url": "https://gitea.kronecker.cc/OurCodesAreAllRight/FilesReadSystem.git"
|
||||
},
|
||||
"keywords": [],
|
||||
"author": "",
|
||||
"license": "ISC",
|
||||
"type": "commonjs"
|
||||
}
|
||||
340
比赛备赛规划.md
Normal file
340
比赛备赛规划.md
Normal file
@@ -0,0 +1,340 @@
|
||||
# 比赛备赛规划文档
|
||||
|
||||
## 一、赛题核心理解
|
||||
|
||||
### 1.1 赛题名称
|
||||
**A23 - 基于大语言模型的文档理解与多源数据融合**
|
||||
参赛院校:金陵科技学院
|
||||
|
||||
### 1.2 核心任务
|
||||
1. **文档解析**:解析 docx/md/xlsx/txt 四种格式的源数据文档
|
||||
2. **模板填写**:根据模板表格要求,从源文档中提取数据填写到 Word/Excel 模板
|
||||
3. **准确率与速度**:准确率优先,速度作为辅助评分因素
|
||||
|
||||
### 1.3 评分规则
|
||||
| 要素 | 说明 |
|
||||
|------|------|
|
||||
| 准确率 | 填写结果与样例表格对比的正确率 |
|
||||
| 响应时间 | 从导入文档到得到结果的时间 ≤ 90s × 文档数量 |
|
||||
| 评测方式 | 赛方提供空表格模板 + 样例表格(人工填写),系统自动填写后对比 |
|
||||
|
||||
### 1.4 关键Q&A摘录
|
||||
|
||||
| 问题 | 解答要点 |
|
||||
|------|----------|
|
||||
| Q2: 模板与文档的关系 | 前2个表格只涉及1份文档;第3-4个涉及多份文档;第5个涉及大部分文档(从易到难) |
|
||||
| Q5: 响应时间定义 | 从导入文档到最终得到结果的时间 ≤ 90s × 文档数量 |
|
||||
| Q7: 需要读取哪些文件 | 每个模板只读取指定的数据文件,不需要读取全部 |
|
||||
| Q10: 部署方式 | 不要求部署到服务器,本地部署即可 |
|
||||
| Q14: 模板匹配 | 模板已指定数据文件,不需要算法匹配 |
|
||||
| Q16: 数据库存储 | 可跳过,不强制要求 |
|
||||
| Q20: 创新点 | 不用管,随意发挥 |
|
||||
| Q21: 填写依据 | 按照测试表格模板给的提示词进行填写 |
|
||||
|
||||
---
|
||||
|
||||
## 二、已完成功能清单
|
||||
|
||||
### 2.1 后端服务 (`backend/app/services/`)
|
||||
|
||||
| 服务文件 | 功能状态 | 说明 |
|
||||
|----------|----------|------|
|
||||
| `file_service.py` | ✅ 已完成 | 文件上传、保存、类型识别 |
|
||||
| `excel_storage_service.py` | ✅ 已完成 | Excel 存储到 MySQL,支持 XML 回退解析 |
|
||||
| `table_rag_service.py` | ⚠️ 已禁用 | RAG 索引构建(当前禁用,仅记录日志) |
|
||||
| `llm_service.py` | ✅ 已完成 | LLM 调用、流式输出、多模型支持 |
|
||||
| `markdown_ai_service.py` | ✅ 已完成 | Markdown AI 分析、分章节提取、流式输出、图表生成 |
|
||||
| `excel_ai_service.py` | ✅ 已完成 | Excel AI 分析 |
|
||||
| `visualization_service.py` | ✅ 已完成 | 图表生成(matplotlib) |
|
||||
| `rag_service.py` | ⚠️ 已禁用 | FAISS 向量检索(当前禁用) |
|
||||
| `prompt_service.py` | ✅ 已完成 | Prompt 模板管理 |
|
||||
| `text_analysis_service.py` | ✅ 已完成 | 文本分析 |
|
||||
| `chart_generator_service.py` | ✅ 已完成 | 图表生成服务 |
|
||||
| `template_fill_service.py` | ✅ 已完成 | 模板填写服务,支持直接读取源文档进行填表 |
|
||||
|
||||
### 2.2 API 接口 (`backend/app/api/endpoints/`)
|
||||
|
||||
| 接口文件 | 路由 | 功能状态 |
|
||||
|----------|------|----------|
|
||||
| `upload.py` | `/api/v1/upload/excel` | ✅ Excel 文件上传与解析 |
|
||||
| `documents.py` | `/api/v1/documents/*` | ✅ 文档管理(列表、删除、搜索) |
|
||||
| `ai_analyze.py` | `/api/v1/analyze/*` | ✅ AI 分析(Excel、Markdown、流式) |
|
||||
| `rag.py` | `/api/v1/rag/*` | ⚠️ RAG 检索(当前返回空) |
|
||||
| `tasks.py` | `/api/v1/tasks/*` | ✅ 异步任务状态查询 |
|
||||
| `templates.py` | `/api/v1/templates/*` | ✅ 模板管理 (含 Word 导出) |
|
||||
| `visualization.py` | `/api/v1/visualization/*` | ✅ 可视化图表 |
|
||||
| `health.py` | `/api/v1/health` | ✅ 健康检查 |
|
||||
|
||||
### 2.3 前端页面 (`frontend/src/pages/`)
|
||||
|
||||
| 页面文件 | 功能 | 状态 |
|
||||
|----------|------|------|
|
||||
| `Documents.tsx` | 主文档管理页面 | ✅ 已完成 |
|
||||
| `ExcelParse.tsx` | Excel 解析页面 | ✅ 已完成 |
|
||||
|
||||
### 2.4 文档解析能力
|
||||
|
||||
| 格式 | 解析状态 | 说明 |
|
||||
|------|----------|------|
|
||||
| Excel (.xlsx/.xls) | ✅ 已完成 | pandas + XML 回退解析 |
|
||||
| Markdown (.md) | ✅ 已完成 | 正则 + AI 分章节 |
|
||||
| Word (.docx) | ✅ 已完成 | python-docx 解析,支持表格提取和字段识别 |
|
||||
| Text (.txt) | ✅ 已完成 | chardet 编码检测,支持文本清洗和结构化提取 |
|
||||
|
||||
---
|
||||
|
||||
## 三、核心功能模块(原待完成项,现已全部完成)
|
||||
|
||||
### 3.1 模板填写模块(最优先)
|
||||
|
||||
**当前状态**:✅ 已完成
|
||||
|
||||
```
|
||||
用户上传模板表格(Word/Excel)
|
||||
↓
|
||||
解析模板,提取需要填写的字段和提示词
|
||||
↓
|
||||
根据模板指定的源文档列表读取源数据
|
||||
↓
|
||||
AI 根据字段提示词从源数据中提取信息
|
||||
↓
|
||||
将提取的数据填入模板对应位置
|
||||
↓
|
||||
返回填写完成的表格
|
||||
```
|
||||
|
||||
**已完成实现**:
|
||||
- [x] `template_fill_service.py` - 模板填写核心服务
|
||||
- [x] Word 模板解析 (`docx_parser.py` - parse_tables_for_template, extract_template_fields_from_docx)
|
||||
- [x] Text 模板解析 (`txt_parser.py` - 已完成)
|
||||
- [x] 模板字段识别与提示词提取
|
||||
- [x] 多文档数据聚合与冲突处理
|
||||
- [x] 结果导出为 Word/Excel
|
||||
|
||||
### 3.2 Word 文档解析
|
||||
|
||||
**当前状态**:✅ 已完成
|
||||
|
||||
**已实现功能**:
|
||||
- [x] `docx_parser.py` - Word 文档解析器
|
||||
- [x] 提取段落文本
|
||||
- [x] 提取表格内容
|
||||
- [x] 提取关键信息(标题、列表等)
|
||||
- [x] 表格模板字段提取 (`parse_tables_for_template`, `extract_template_fields_from_docx`)
|
||||
- [x] 字段类型推断 (`_infer_field_type_from_hint`)
|
||||
|
||||
### 3.3 Text 文档解析
|
||||
|
||||
**当前状态**:✅ 已完成
|
||||
|
||||
**已实现功能**:
|
||||
- [x] `txt_parser.py` - 文本文件解析器
|
||||
- [x] 编码自动检测 (chardet)
|
||||
- [x] 文本清洗
|
||||
|
||||
### 3.4 文档模板匹配(已有框架)
|
||||
|
||||
根据 Q&A,模板已指定数据文件,不需要算法匹配。当前已有上传功能,需确认模板与数据文件的关联逻辑是否完善。
|
||||
|
||||
---
|
||||
|
||||
## 四、参赛材料准备
|
||||
|
||||
### 4.1 必交材料
|
||||
|
||||
| 材料 | 要求 | 当前状态 | 行动项 |
|
||||
|------|------|----------|--------|
|
||||
| 项目概要介绍 | PPT 格式 | ❌ 待制作 | 制作 PPT |
|
||||
| 项目简介 PPT | - | ❌ 待制作 | 制作 PPT |
|
||||
| 项目详细方案 | 文档 | ⚠️ 部分完成 | 完善文档 |
|
||||
| 项目演示视频 | - | ❌ 待制作 | 录制演示视频 |
|
||||
| 训练素材说明 | 来源说明 | ⚠️ 已有素材 | 整理素材文档 |
|
||||
| 关键模块设计文档 | 概要设计 | ⚠️ 已有部分 | 完善文档 |
|
||||
| 可运行 Demo | 核心代码 | ✅ 已完成 | 打包可运行版本 |
|
||||
|
||||
### 4.2 Demo 提交要求
|
||||
|
||||
根据 Q&A:
|
||||
- 可以只提交核心代码,不需要完整运行环境
|
||||
- 现场答辩可使用自带笔记本电脑
|
||||
- 需要提供部署和运行说明(README)
|
||||
|
||||
---
|
||||
|
||||
## 五、测试验证计划
|
||||
|
||||
### 5.1 使用现有测试数据
|
||||
|
||||
```
|
||||
docs/test/
|
||||
├── 2023年文化和旅游发展统计公报.md
|
||||
├── 2024年卫生健康事业发展统计公报.md
|
||||
└── 第三次全国工业普查主要数据公报.md
|
||||
```
|
||||
|
||||
### 5.2 模板填写测试流程
|
||||
|
||||
1. 准备一个 Word/Excel 模板表格
|
||||
2. 指定源数据文档
|
||||
3. 上传模板和文档
|
||||
4. 执行模板填写
|
||||
5. 检查填写结果准确率
|
||||
6. 记录响应时间
|
||||
|
||||
### 5.3 性能目标
|
||||
|
||||
| 指标 | 目标 | 当前状态 |
|
||||
|------|------|----------|
|
||||
| 信息提取准确率 | ≥80% | 需测试验证 |
|
||||
| 单次响应时间 | ≤90s × 文档数 | 需测试验证 |
|
||||
|
||||
---
|
||||
|
||||
## 六、工作计划(建议)
|
||||
|
||||
### 第一优先级:模板填写核心功能
|
||||
- 完成 Word 文档解析
|
||||
- 完成模板填写服务
|
||||
- 端到端测试验证
|
||||
|
||||
### 第二优先级:Demo 打包与文档
|
||||
- 制作项目演示 PPT
|
||||
- 录制演示视频
|
||||
- 完善 README 部署文档
|
||||
|
||||
### 第三优先级:测试优化
|
||||
- 使用真实测试数据进行准确率测试
|
||||
- 优化响应时间
|
||||
- 完善错误处理
|
||||
|
||||
---
|
||||
|
||||
## 七、注意事项
|
||||
|
||||
1. **创新点**:根据 Q&A,不必纠结创新点数量限制
|
||||
2. **数据库**:不强制要求数据库存储,可跳过
|
||||
3. **部署**:本地部署即可,不需要公网服务器
|
||||
4. **评测数据**:初赛仅使用目前提供的数据
|
||||
5. **RAG 功能**:当前已临时禁用,不影响核心评测功能
|
||||
|
||||
---
|
||||
|
||||
*文档版本: v1.1*
|
||||
*最后更新: 2026-04-08*
|
||||
|
||||
---
|
||||
|
||||
## 八、技术实现细节
|
||||
|
||||
### 8.1 模板填表流程(已实现)
|
||||
|
||||
#### 流程图
|
||||
```
|
||||
┌─────────────┐ ┌─────────────┐ ┌─────────────┐
|
||||
│ 上传模板 │ ──► │ 选择数据源 │ ──► │ AI 智能填表 │
|
||||
└─────────────┘ └─────────────┘ └─────────────┘
|
||||
│
|
||||
▼
|
||||
┌─────────────┐
|
||||
│ 导出结果 │
|
||||
└─────────────┘
|
||||
```
|
||||
|
||||
#### 核心组件
|
||||
|
||||
| 组件 | 文件 | 说明 |
|
||||
|------|------|------|
|
||||
| 模板上传 | `templates.py` `/templates/upload` | 接收模板文件,提取字段 |
|
||||
| 字段提取 | `template_fill_service.py` | 从 Word/Excel 表格提取字段定义 |
|
||||
| 文档解析 | `docx_parser.py`, `xlsx_parser.py`, `txt_parser.py` | 解析源文档内容 |
|
||||
| 智能填表 | `template_fill_service.py` `fill_template()` | 使用 LLM 从源文档提取信息 |
|
||||
| 结果导出 | `templates.py` `/templates/export` | 导出为 Excel 或 Word |
|
||||
|
||||
### 8.2 源文档加载方式
|
||||
|
||||
模板填表服务支持两种方式加载源文档:
|
||||
|
||||
1. **通过 MongoDB 文档 ID**:`source_doc_ids`
|
||||
- 文档已上传并存入 MongoDB
|
||||
- 服务直接查询 MongoDB 获取文档内容
|
||||
|
||||
2. **通过文件路径**:`source_file_paths`
|
||||
- 直接读取本地文件
|
||||
- 使用对应的解析器解析内容
|
||||
|
||||
### 8.3 Word 表格模板解析
|
||||
|
||||
比赛评分表格通常是 Word 格式,`docx_parser.py` 提供了专门的解析方法:
|
||||
|
||||
```python
|
||||
# 提取表格模板字段
|
||||
fields = docx_parser.extract_template_fields_from_docx(file_path)
|
||||
|
||||
# 返回格式
|
||||
# [
|
||||
# {
|
||||
# "cell": "T0R1", # 表格0,行1
|
||||
# "name": "字段名",
|
||||
# "hint": "提示词",
|
||||
# "field_type": "text/number/date",
|
||||
# "required": True
|
||||
# },
|
||||
# ...
|
||||
# ]
|
||||
```
|
||||
|
||||
### 8.4 字段类型推断
|
||||
|
||||
系统支持从提示词自动推断字段类型:
|
||||
|
||||
| 关键词 | 推断类型 | 示例 |
|
||||
|--------|----------|------|
|
||||
| 年、月、日、日期、时间、出生 | date | 出生日期 |
|
||||
| 数量、金额、比率、%、率、合计 | number | 增长比率 |
|
||||
| 其他 | text | 姓名、地址 |
|
||||
|
||||
### 8.5 API 接口
|
||||
|
||||
#### POST `/api/v1/templates/fill`
|
||||
|
||||
填写请求:
|
||||
```json
|
||||
{
|
||||
"template_id": "模板ID",
|
||||
"template_fields": [
|
||||
{"cell": "A1", "name": "姓名", "field_type": "text", "required": true, "hint": "提取人员姓名"}
|
||||
],
|
||||
"source_doc_ids": ["mongodb_doc_id_1", "mongodb_doc_id_2"],
|
||||
"source_file_paths": [],
|
||||
"user_hint": "请从合同文档中提取"
|
||||
}
|
||||
```
|
||||
|
||||
响应:
|
||||
```json
|
||||
{
|
||||
"success": true,
|
||||
"filled_data": {"姓名": "张三"},
|
||||
"fill_details": [
|
||||
{
|
||||
"field": "姓名",
|
||||
"cell": "A1",
|
||||
"value": "张三",
|
||||
"source": "来自:合同文档.docx",
|
||||
"confidence": 0.95
|
||||
}
|
||||
],
|
||||
"source_doc_count": 2
|
||||
}
|
||||
```
|
||||
|
||||
#### POST `/api/v1/templates/export`
|
||||
|
||||
导出请求:
|
||||
```json
|
||||
{
|
||||
"template_id": "模板ID",
|
||||
"filled_data": {"姓名": "张三", "金额": "10000"},
|
||||
"format": "xlsx" // 或 "docx"
|
||||
}
|
||||
```
|
||||
Reference in New Issue
Block a user