Compare commits
15 Commits
47c89d888f
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
| | 8f6d8a43d3 | | |
| | 6ec45b73ad | | |
| | 73f1c2804f | | |
| | 74d40f91c5 | | |
| | d2e3c2db3e | | |
| | be302839ee | | |
| | 581e2b0ae0 | | |
| | 975ebf536b | | |
| | 38b0c7e62e | | |
| | 8e46e635f1 | | |
| | c2f50d3bd8 | | |
| | 2adf9aef60 | | |
| | 827371cb90 | | |
| | e5d4724e82 | | |
| | 9e7f9df384 | | |
7
.claude/settings.local.json
Normal file
7
.claude/settings.local.json
Normal file
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"permissions": {
|
||||
"allow": [
|
||||
"WebSearch"
|
||||
]
|
||||
}
|
||||
}
|
||||
35
.env.example
Normal file
35
.env.example
Normal file
@@ -0,0 +1,35 @@
|
||||
# ============================================================
|
||||
# FilesReadSystem 环境变量配置模板
|
||||
# 复制此文件为 .env 并填入实际值
|
||||
# ============================================================
|
||||
|
||||
# ==================== 应用配置 ====================
|
||||
DEBUG=false
|
||||
|
||||
# ==================== MongoDB ====================
|
||||
MONGO_ROOT_USER=admin
|
||||
MONGO_ROOT_PASSWORD=your_mongo_password
|
||||
MONGODB_DB_NAME=document_system
|
||||
|
||||
# ==================== MySQL ====================
|
||||
MYSQL_PASSWORD=your_mysql_password
|
||||
MYSQL_DATABASE=document
|
||||
|
||||
# ==================== Redis ====================
|
||||
REDIS_PASSWORD=your_redis_password
|
||||
|
||||
# ==================== LLM AI ====================
|
||||
LLM_API_KEY=your_llm_api_key
|
||||
LLM_BASE_URL=https://api.deepseek.com
|
||||
LLM_MODEL_NAME=deepseek-chat
|
||||
|
||||
# ==================== Supabase ====================
|
||||
SUPABASE_URL=https://your-project.supabase.co
|
||||
SUPABASE_ANON_KEY=your_anon_key
|
||||
SUPABASE_SERVICE_KEY=your_service_key
|
||||
|
||||
# ==================== Embedding / RAG ====================
|
||||
EMBEDDING_MODEL=all-MiniLM-L6-v2
|
||||
|
||||
# ==================== 前端配置 ====================
|
||||
VITE_APP_ID=your_app_id
|
||||
175
README.md
175
README.md
@@ -1,4 +1,4 @@
|
||||
# FilesReadSystem
|
||||
# 智联文档
|
||||
|
||||
## 项目介绍 / Project Introduction
|
||||
|
||||
@@ -26,37 +26,79 @@ A document understanding and multi-source data fusion system based on Large Lang
|
||||
|
||||
## 项目架构 / Project Architecture
|
||||
|
||||
```mermaid
|
||||
flowchart TB
|
||||
subgraph UI["用户界面 / User Interface"]
|
||||
Frontend["React + TypeScript + shadcn/ui"]
|
||||
end
|
||||
|
||||
subgraph Backend["FastAPI 后端 / Backend"]
|
||||
Upload["上传 API<br/>/upload"]
|
||||
Documents["文档管理<br/>/documents"]
|
||||
RAG["RAG 检索<br/>/rag/search"]
|
||||
AI["AI 分析<br/>/ai/analyze"]
|
||||
Template["模板填充<br/>/templates/fill"]
|
||||
Instruction["自然语言指令<br/>/instruction/execute"]
|
||||
Visual["可视化<br/>/visualization"]
|
||||
end
|
||||
|
||||
subgraph Data["数据层 / Data Layer"]
|
||||
MongoDB["MongoDB<br/>文档存储"]
|
||||
MySQL["MySQL<br/>结构化数据"]
|
||||
Redis["Redis<br/>缓存/队列"]
|
||||
FAISS["FAISS<br/>向量索引"]
|
||||
end
|
||||
|
||||
UI --> Backend
|
||||
Backend --> MongoDB
|
||||
Backend --> MySQL
|
||||
Backend --> Redis
|
||||
MongoDB --> FAISS
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────┐
|
||||
│ User Interface │
|
||||
│ (React + TypeScript + shadcn/ui) │
|
||||
└─────────────────────────────────────────────────────────────────┘
|
||||
│
|
||||
▼
|
||||
┌─────────────────────────────────────────────────────────────────┐
|
||||
│ FastAPI Backend │
|
||||
│ ┌─────────────┐ ┌──────────────┐ ┌─────────────────────────┐ │
|
||||
│ │ Upload API │ │ RAG Search │ │ Natural Language │ │
|
||||
│ │ /documents │ │ /rag/search │ │ /instruction/execute │ │
|
||||
│ └─────────────┘ └──────────────┘ └─────────────────────────┘ │
|
||||
│ ┌─────────────┐ ┌──────────────┐ ┌─────────────────────────┐ │
|
||||
│ │ AI Analyze │ │ Template Fill│ │ Visualization │ │
|
||||
│ │ /ai/analyze │ │ /templates │ │ /visualization │ │
|
||||
│ └─────────────┘ └──────────────┘ └─────────────────────────┘ │
|
||||
└─────────────────────────────────────────────────────────────────┘
|
||||
│
|
||||
┌─────────────────────┼─────────────────────┐
|
||||
▼ ▼ ▼
|
||||
┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐
|
||||
│ MongoDB │ │ MySQL │ │ Redis │
|
||||
│ (Documents) │ │ (Structured) │ │ (Cache/Queue) │
|
||||
└─────────────────┘ └─────────────────┘ └─────────────────┘
|
||||
│
|
||||
▼
|
||||
┌─────────────────┐
|
||||
│ FAISS │
|
||||
│ (Vector Index) │
|
||||
└─────────────────┘
|
||||
|
||||
---
|
||||
|
||||
## 程序流程 / Program Flow
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
Start([用户上传文档<br/>User Uploads Document]) --> Parse{解析文档格式<br/>Parse Document Format}
|
||||
|
||||
Parse -->|Excel| ParseXlsx["解析 Excel<br/>Parse XLSX"]
|
||||
Parse -->|Word| ParseDocx["解析 Word<br/>Parse DOCX"]
|
||||
Parse -->|Markdown| ParseMd["解析 Markdown<br/>Parse Markdown"]
|
||||
Parse -->|Text| ParseTxt["解析文本<br/>Parse Text"]
|
||||
|
||||
ParseXlsx --> Store1[(存储到<br/>MongoDB)]
|
||||
ParseDocx --> Store1
|
||||
ParseMd --> Store1
|
||||
ParseTxt --> Store1
|
||||
|
||||
Store1 --> Embed["Embedding 向量化<br/>Create Embeddings"]
|
||||
Embed --> Index[(索引到<br/>FAISS)]
|
||||
|
||||
Index --> TaskCreated{创建任务<br/>Create Task}
|
||||
|
||||
TaskCreated -->|同步| ProcessSync["同步处理<br/>Sync Process"]
|
||||
TaskCreated -->|异步| QueueTask["加入任务队列<br/>Queue to Celery"]
|
||||
|
||||
ProcessSync --> ReturnResult["返回结果<br/>Return Result"]
|
||||
|
||||
QueueTask --> CeleryWorker["Celery Worker<br/>异步处理"]
|
||||
CeleryWorker --> LLM["调用 LLM<br/>Call LLM API"]
|
||||
LLM --> StoreResult["存储结果<br/>Store Result"]
|
||||
StoreResult --> ReturnAsync["返回任务ID<br/>Return Task ID"]
|
||||
|
||||
ReturnResult --> End([完成<br/>Complete])
|
||||
ReturnAsync --> Poll{轮询任务状态<br/>Poll Task Status}
|
||||
Poll -->|进行中| Poll
|
||||
Poll -->|完成| GetResult["获取结果<br/>Get Result"]
|
||||
GetResult --> End
|
||||
|
||||
style Start fill:#e1f5fe
|
||||
style End fill:#c8e6c9
|
||||
style LLM fill:#fff3e0
|
||||
style CeleryWorker fill:#fff3e0
|
||||
```
|
||||
|
||||
---
|
||||
@@ -233,6 +275,77 @@ pnpm dev
|
||||
|
||||
---
|
||||
|
||||
## Docker 部署 / Docker Deployment
|
||||
|
||||
### 快速启动 / Quick Start
|
||||
|
||||
```bash
|
||||
# 1. 复制环境变量模板并编辑
|
||||
cp .env.example .env
|
||||
# 编辑 .env 填入实际配置
|
||||
|
||||
# 2. 启动所有服务
|
||||
docker compose up -d
|
||||
|
||||
# 3. 查看日志
|
||||
docker compose logs -f
|
||||
|
||||
# 4. 检查服务状态
|
||||
docker compose ps
|
||||
|
||||
# 5. 更新部署
|
||||
docker compose up -d --build
|
||||
```
|
||||
|
||||
### 服务说明 / Services
|
||||
|
||||
| 服务 | 端口 | 说明 |
|
||||
|:---|:---|:---|
|
||||
| frontend | 80 | React 前端 (Nginx) |
|
||||
| backend | 8000 | FastAPI 后端 |
|
||||
| mongodb | 27017 | MongoDB 数据库 |
|
||||
| mysql | 3306 | MySQL 数据库 |
|
||||
| redis | 6379 | Redis 缓存/队列 |
|
||||
|
||||
### 环境变量 / Environment Variables
|
||||
|
||||
创建 `.env` 文件,参考 `.env.example`:
|
||||
|
||||
```bash
|
||||
# 数据库配置
|
||||
MONGO_ROOT_USER=admin
|
||||
MONGO_ROOT_PASSWORD=your_password
|
||||
MONGODB_DB_NAME=document_system
|
||||
MYSQL_PASSWORD=your_password
|
||||
MYSQL_DATABASE=document
|
||||
REDIS_PASSWORD=your_password
|
||||
|
||||
# LLM 配置
|
||||
LLM_API_KEY=your_api_key
|
||||
LLM_BASE_URL=https://api.deepseek.com
|
||||
LLM_MODEL_NAME=deepseek-chat
|
||||
|
||||
# Supabase 配置
|
||||
SUPABASE_URL=https://your-project.supabase.co
|
||||
SUPABASE_ANON_KEY=your_anon_key
|
||||
SUPABASE_SERVICE_KEY=your_service_key
|
||||
```
|
||||
|
||||
### 验证部署 / Verify Deployment
|
||||
|
||||
```bash
|
||||
# 检查所有服务状态
|
||||
docker compose ps
|
||||
|
||||
# 访问前端
|
||||
curl http://localhost
|
||||
|
||||
# 检查后端健康
|
||||
curl http://localhost:8000/health
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 许可证 / License
|
||||
|
||||
ISC
|
||||
|
||||
@@ -34,9 +34,9 @@ REDIS_URL="redis://localhost:6379/0"
|
||||
# - 模型: glm-4-flash (快速文本模型), glm-4 (标准), glm-4-plus (高性能)
|
||||
# - API: https://open.bigmodel.cn
|
||||
# - API Key: https://open.bigmodel.cn/usercenter/apikeys
|
||||
LLM_API_KEY="ca79ad9f96524cd5afc3e43ca97f347d.cpiLLx2oyitGvTeU"
|
||||
LLM_BASE_URL="https://open.bigmodel.cn/api/paas/v4"
|
||||
LLM_MODEL_NAME="glm-4v-plus"
|
||||
LLM_API_KEY="your_llm_api_key_here"
|
||||
LLM_BASE_URL="https://api.deepseek.com"
|
||||
LLM_MODEL_NAME="deepseek-chat"
|
||||
|
||||
# ==================== Supabase 配置 ====================
|
||||
# Supabase 项目配置
|
||||
@@ -45,10 +45,14 @@ SUPABASE_ANON_KEY="your_supabase_anon_key_here"
|
||||
SUPABASE_SERVICE_KEY="your_supabase_service_key_here"
|
||||
|
||||
# ==================== 文件路径配置 ====================
|
||||
# 上传文件存储目录 (相对于项目根目录)
|
||||
# 上传文件存储目录
|
||||
# 本地开发: ./data/uploads
|
||||
# Docker部署: /app/data/uploads
|
||||
UPLOAD_DIR="./data/uploads"
|
||||
|
||||
# Faiss 向量数据库持久化目录 (LangChain + Faiss 实现)
|
||||
# Faiss 向量数据库持久化目录
|
||||
# 本地开发: ./data/faiss
|
||||
# Docker部署: /app/data/faiss
|
||||
FAISS_INDEX_DIR="./data/faiss"
|
||||
|
||||
# ==================== RAG 配置 ====================
|
||||
|
||||
7
backend/=4.0.0
Normal file
7
backend/=4.0.0
Normal file
@@ -0,0 +1,7 @@
|
||||
Collecting reportlab
|
||||
Using cached reportlab-4.4.10-py3-none-any.whl.metadata (1.7 kB)
|
||||
Requirement already satisfied: pillow>=9.0.0 in d:\code\filesreadsystem\backend\venv\lib\site-packages (from reportlab) (12.1.1)
|
||||
Requirement already satisfied: charset-normalizer in d:\code\filesreadsystem\backend\venv\lib\site-packages (from reportlab) (3.4.6)
|
||||
Using cached reportlab-4.4.10-py3-none-any.whl (2.0 MB)
|
||||
Installing collected packages: reportlab
|
||||
Successfully installed reportlab-4.4.10
|
||||
40
backend/Dockerfile
Normal file
40
backend/Dockerfile
Normal file
@@ -0,0 +1,40 @@
|
||||
# ============================================================
# FilesReadSystem Backend Docker Image
# ============================================================
FROM python:3.12-slim

# Do not write .pyc files; keep stdout/stderr unbuffered.
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1

# Install system dependencies (FAISS, Pillow, tesseract, etc.).
# `libgl1` is used instead of the transitional `libgl1-mesa-glx` package, which
# recent Debian releases no longer ship and which makes `apt-get install` fail.
RUN apt-get update && apt-get install -y --no-install-recommends \
        gcc \
        g++ \
        libgl1 \
        libglib2.0-0 \
        tesseract-ocr \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Copy the dependency manifest first, then install (makes use of the Docker layer cache).
COPY requirements.txt .

# Install Python dependencies.
RUN pip install --no-cache-dir -r requirements.txt

# Copy the application code.
COPY app/ ./app/

# Create the data directories.
RUN mkdir -p /app/data/uploads /app/data/faiss /app/data/logs

# Expose the service port.
EXPOSE 8000

# Health check. The stdlib urllib call raises on connection errors AND on HTTP
# status >= 400, so a failing /health endpoint marks the container unhealthy
# (a bare `httpx.get(...)` returns normally on 4xx/5xx and would report healthy).
HEALTHCHECK --interval=30s --timeout=10s --start-period=10s --retries=3 \
    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health', timeout=5)" || exit 1

# Start command.
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||
@@ -14,6 +14,8 @@ from app.api.endpoints import (
|
||||
analysis_charts,
|
||||
health,
|
||||
instruction, # 智能指令
|
||||
conversation, # 对话历史
|
||||
pdf_converter, # PDF转换
|
||||
)
|
||||
|
||||
# 创建主路由
|
||||
@@ -31,3 +33,5 @@ api_router.include_router(ai_analyze.router) # AI分析
|
||||
api_router.include_router(visualization.router) # 可视化
|
||||
api_router.include_router(analysis_charts.router) # 分析图表
|
||||
api_router.include_router(instruction.router) # 智能指令
|
||||
api_router.include_router(conversation.router) # 对话历史
|
||||
api_router.include_router(pdf_converter.router) # PDF转换
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
"""
|
||||
AI 分析 API 接口
|
||||
"""
|
||||
from fastapi import APIRouter, UploadFile, File, HTTPException, Query, Body
|
||||
from fastapi import APIRouter, UploadFile, File, HTTPException, Query, Body, Form
|
||||
from fastapi.responses import StreamingResponse
|
||||
from typing import Optional
|
||||
import logging
|
||||
@@ -12,6 +12,7 @@ from app.services.excel_ai_service import excel_ai_service
|
||||
from app.services.markdown_ai_service import markdown_ai_service
|
||||
from app.services.template_fill_service import template_fill_service
|
||||
from app.services.word_ai_service import word_ai_service
|
||||
from app.services.txt_ai_service import txt_ai_service
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -20,7 +21,8 @@ router = APIRouter(prefix="/ai", tags=["AI 分析"])
|
||||
|
||||
@router.post("/analyze/excel")
|
||||
async def analyze_excel(
|
||||
file: UploadFile = File(...),
|
||||
file: Optional[UploadFile] = File(None),
|
||||
doc_id: Optional[str] = Form(None, description="文档ID(从数据库读取)"),
|
||||
user_prompt: str = Query("", description="用户自定义提示词"),
|
||||
analysis_type: str = Query("general", description="分析类型: general, summary, statistics, insights"),
|
||||
parse_all_sheets: bool = Query(False, description="是否分析所有工作表")
|
||||
@@ -29,7 +31,8 @@ async def analyze_excel(
|
||||
上传并使用 AI 分析 Excel 文件
|
||||
|
||||
Args:
|
||||
file: 上传的 Excel 文件
|
||||
file: 上传的 Excel 文件(与 doc_id 二选一)
|
||||
doc_id: 文档ID(从数据库读取)
|
||||
user_prompt: 用户自定义提示词
|
||||
analysis_type: 分析类型
|
||||
parse_all_sheets: 是否分析所有工作表
|
||||
@@ -37,7 +40,57 @@ async def analyze_excel(
|
||||
Returns:
|
||||
dict: 分析结果,包含 Excel 数据和 AI 分析结果
|
||||
"""
|
||||
# 检查文件类型
|
||||
filename = None
|
||||
|
||||
# 从数据库读取模式
|
||||
if doc_id:
|
||||
try:
|
||||
from app.core.database.mongodb import mongodb
|
||||
doc = await mongodb.get_document(doc_id)
|
||||
if not doc:
|
||||
raise HTTPException(status_code=404, detail=f"文档不存在: {doc_id}")
|
||||
|
||||
filename = doc.get("metadata", {}).get("original_filename", "unknown.xlsx")
|
||||
file_ext = filename.split('.')[-1].lower()
|
||||
|
||||
if file_ext not in ['xlsx', 'xls']:
|
||||
raise HTTPException(status_code=400, detail=f"文档类型不是 Excel: {file_ext}")
|
||||
|
||||
file_path = doc.get("metadata", {}).get("file_path")
|
||||
if not file_path:
|
||||
raise HTTPException(status_code=400, detail="文档没有存储文件路径,请重新上传")
|
||||
|
||||
# 使用文件路径进行 AI 分析
|
||||
if parse_all_sheets:
|
||||
result = await excel_ai_service.batch_analyze_sheets_from_path(
|
||||
file_path=file_path,
|
||||
filename=filename,
|
||||
user_prompt=user_prompt,
|
||||
analysis_type=analysis_type
|
||||
)
|
||||
else:
|
||||
result = await excel_ai_service.analyze_excel_file_from_path(
|
||||
file_path=file_path,
|
||||
filename=filename,
|
||||
user_prompt=user_prompt,
|
||||
analysis_type=analysis_type
|
||||
)
|
||||
|
||||
if result.get("success"):
|
||||
return result
|
||||
else:
|
||||
return result
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"从数据库读取 Excel 文档失败: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail=f"读取文档失败: {str(e)}")
|
||||
|
||||
# 文件上传模式
|
||||
if not file:
|
||||
raise HTTPException(status_code=400, detail="请提供文件或文档ID")
|
||||
|
||||
if not file.filename:
|
||||
raise HTTPException(status_code=400, detail="文件名为空")
|
||||
|
||||
@@ -60,7 +113,11 @@ async def analyze_excel(
|
||||
# 读取文件内容
|
||||
content = await file.read()
|
||||
|
||||
logger.info(f"开始分析文件: {file.filename}, 分析类型: {analysis_type}")
|
||||
# 验证文件内容不为空
|
||||
if not content:
|
||||
raise HTTPException(status_code=400, detail="文件内容为空,请确保文件已正确上传")
|
||||
|
||||
logger.info(f"开始分析文件: {file.filename}, 分析类型: {analysis_type}, 文件大小: {len(content)} bytes")
|
||||
|
||||
# 调用 AI 分析服务
|
||||
if parse_all_sheets:
|
||||
@@ -153,8 +210,9 @@ async def analyze_text(
|
||||
|
||||
@router.post("/analyze/md")
|
||||
async def analyze_markdown(
|
||||
file: UploadFile = File(...),
|
||||
analysis_type: str = Query("summary", description="分析类型: summary, outline, key_points, questions, tags, qa, statistics, section"),
|
||||
file: Optional[UploadFile] = File(None),
|
||||
doc_id: Optional[str] = Form(None, description="文档ID(从数据库读取)"),
|
||||
analysis_type: str = Query("summary", description="分析类型: summary, outline, key_points, questions, tags, qa, statistics, section, charts"),
|
||||
user_prompt: str = Query("", description="用户自定义提示词"),
|
||||
section_number: Optional[str] = Query(None, description="指定章节编号,如 '一' 或 '(一)'")
|
||||
):
|
||||
@@ -162,7 +220,8 @@ async def analyze_markdown(
|
||||
上传并使用 AI 分析 Markdown 文件
|
||||
|
||||
Args:
|
||||
file: 上传的 Markdown 文件
|
||||
file: 上传的 Markdown 文件(与 doc_id 二选一)
|
||||
doc_id: 文档ID(从数据库读取)
|
||||
analysis_type: 分析类型
|
||||
user_prompt: 用户自定义提示词
|
||||
section_number: 指定分析的章节编号
|
||||
@@ -170,16 +229,8 @@ async def analyze_markdown(
|
||||
Returns:
|
||||
dict: 分析结果
|
||||
"""
|
||||
# 检查文件类型
|
||||
if not file.filename:
|
||||
raise HTTPException(status_code=400, detail="文件名为空")
|
||||
|
||||
file_ext = file.filename.split('.')[-1].lower()
|
||||
if file_ext not in ['md', 'markdown']:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"不支持的文件类型: {file_ext},仅支持 .md 和 .markdown"
|
||||
)
|
||||
filename = None
|
||||
tmp_path = None
|
||||
|
||||
# 验证分析类型
|
||||
supported_types = markdown_ai_service.get_supported_analysis_types()
|
||||
@@ -189,46 +240,96 @@ async def analyze_markdown(
|
||||
detail=f"不支持的分析类型: {analysis_type},支持的类型: {', '.join(supported_types)}"
|
||||
)
|
||||
|
||||
try:
|
||||
# 读取文件内容
|
||||
content = await file.read()
|
||||
|
||||
# 保存到临时文件
|
||||
with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as tmp:
|
||||
tmp.write(content)
|
||||
tmp_path = tmp.name
|
||||
|
||||
if doc_id:
|
||||
# 从数据库读取文档
|
||||
try:
|
||||
logger.info(f"开始分析 Markdown 文件: {file.filename}, 分析类型: {analysis_type}, 章节: {section_number}")
|
||||
from app.core.database.mongodb import mongodb
|
||||
doc = await mongodb.get_document(doc_id)
|
||||
if not doc:
|
||||
raise HTTPException(status_code=404, detail=f"文档不存在: {doc_id}")
|
||||
|
||||
# 调用 AI 分析服务
|
||||
result = await markdown_ai_service.analyze_markdown(
|
||||
file_path=tmp_path,
|
||||
analysis_type=analysis_type,
|
||||
user_prompt=user_prompt,
|
||||
section_number=section_number
|
||||
filename = doc.get("metadata", {}).get("original_filename", "unknown.md")
|
||||
file_ext = filename.split('.')[-1].lower()
|
||||
|
||||
if file_ext not in ['md', 'markdown']:
|
||||
raise HTTPException(status_code=400, detail=f"文档类型不是 Markdown: {file_ext}")
|
||||
|
||||
content = doc.get("content") or ""
|
||||
if not content:
|
||||
raise HTTPException(status_code=400, detail="文档内容为空")
|
||||
|
||||
# 保存到临时文件
|
||||
with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as tmp:
|
||||
tmp.write(content.encode('utf-8'))
|
||||
tmp_path = tmp.name
|
||||
|
||||
logger.info(f"从数据库加载 Markdown 文档: {filename}, 长度: {len(content)}")
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"从数据库读取 Markdown 文档失败: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail=f"读取文档失败: {str(e)}")
|
||||
else:
|
||||
# 文件上传模式
|
||||
if not file:
|
||||
raise HTTPException(status_code=400, detail="请提供文件或文档ID")
|
||||
|
||||
if not file.filename:
|
||||
raise HTTPException(status_code=400, detail="文件名为空")
|
||||
|
||||
file_ext = file.filename.split('.')[-1].lower()
|
||||
if file_ext not in ['md', 'markdown']:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"不支持的文件类型: {file_ext},仅支持 .md 和 .markdown"
|
||||
)
|
||||
|
||||
logger.info(f"Markdown 分析完成: {file.filename}, 成功: {result['success']}")
|
||||
try:
|
||||
# 读取文件内容
|
||||
content = await file.read()
|
||||
|
||||
if not result['success']:
|
||||
raise HTTPException(status_code=500, detail=result.get('error', '分析失败'))
|
||||
# 保存到临时文件
|
||||
with tempfile.NamedTemporaryFile(mode='wb', suffix='.md', delete=False) as tmp:
|
||||
tmp.write(content)
|
||||
tmp_path = tmp.name
|
||||
|
||||
return result
|
||||
filename = file.filename
|
||||
|
||||
finally:
|
||||
# 清理临时文件,确保在所有情况下都能清理
|
||||
try:
|
||||
if tmp_path and os.path.exists(tmp_path):
|
||||
os.unlink(tmp_path)
|
||||
except Exception as cleanup_error:
|
||||
logger.warning(f"临时文件清理失败: {tmp_path}, error: {cleanup_error}")
|
||||
except Exception as e:
|
||||
logger.error(f"读取 Markdown 文件失败: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail=f"读取文件失败: {str(e)}")
|
||||
|
||||
try:
|
||||
logger.info(f"开始分析 Markdown 文件: {filename}, 分析类型: {analysis_type}, 章节: {section_number}")
|
||||
|
||||
# 调用 AI 分析服务
|
||||
result = await markdown_ai_service.analyze_markdown(
|
||||
file_path=tmp_path,
|
||||
analysis_type=analysis_type,
|
||||
user_prompt=user_prompt,
|
||||
section_number=section_number
|
||||
)
|
||||
|
||||
logger.info(f"Markdown 分析完成: {filename}, 成功: {result['success']}")
|
||||
|
||||
if not result['success']:
|
||||
raise HTTPException(status_code=500, detail=result.get('error', '分析失败'))
|
||||
|
||||
return result
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"Markdown AI 分析过程中出错: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail=f"分析失败: {str(e)}")
|
||||
finally:
|
||||
# 清理临时文件
|
||||
if tmp_path and os.path.exists(tmp_path):
|
||||
try:
|
||||
os.unlink(tmp_path)
|
||||
except Exception as cleanup_error:
|
||||
logger.warning(f"临时文件清理失败: {tmp_path}, error: {cleanup_error}")
|
||||
|
||||
|
||||
@router.post("/analyze/md/stream")
|
||||
@@ -346,67 +447,100 @@ async def get_markdown_outline(
|
||||
|
||||
@router.post("/analyze/txt")
|
||||
async def analyze_txt(
|
||||
file: UploadFile = File(...),
|
||||
file: Optional[UploadFile] = File(None),
|
||||
doc_id: Optional[str] = Form(None, description="文档ID(从数据库读取)"),
|
||||
analysis_type: str = Query("structured", description="分析类型: structured, charts")
|
||||
):
|
||||
"""
|
||||
上传并使用 AI 分析 TXT 文本文件,提取结构化数据
|
||||
上传并使用 AI 分析 TXT 文本文件,提取结构化数据或生成图表
|
||||
|
||||
将非结构化文本转换为结构化表格数据,便于后续填表使用
|
||||
当 analysis_type=charts 时,可生成可视化图表
|
||||
|
||||
Args:
|
||||
file: 上传的 TXT 文件
|
||||
file: 上传的 TXT 文件(与 doc_id 二选一)
|
||||
doc_id: 文档ID(从数据库读取)
|
||||
analysis_type: 分析类型 - "structured"(默认,提取结构化数据)或 "charts"(生成图表)
|
||||
|
||||
Returns:
|
||||
dict: 分析结果,包含结构化表格数据
|
||||
dict: 分析结果,包含结构化表格数据或图表数据
|
||||
"""
|
||||
if not file.filename:
|
||||
raise HTTPException(status_code=400, detail="文件名为空")
|
||||
|
||||
file_ext = file.filename.split('.')[-1].lower()
|
||||
if file_ext not in ['txt', 'text']:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"不支持的文件类型: {file_ext},仅支持 .txt"
|
||||
)
|
||||
|
||||
try:
|
||||
# 读取文件内容
|
||||
content = await file.read()
|
||||
|
||||
# 保存到临时文件
|
||||
with tempfile.NamedTemporaryFile(mode='wb', suffix='.txt', delete=False) as tmp:
|
||||
tmp.write(content)
|
||||
tmp_path = tmp.name
|
||||
filename = None
|
||||
text_content = None
|
||||
|
||||
if doc_id:
|
||||
# 从数据库读取文档
|
||||
try:
|
||||
logger.info(f"开始 AI 分析 TXT 文件: {file.filename}")
|
||||
from app.core.database.mongodb import mongodb
|
||||
doc = await mongodb.get_document(doc_id)
|
||||
if not doc:
|
||||
raise HTTPException(status_code=404, detail=f"文档不存在: {doc_id}")
|
||||
|
||||
# 使用 template_fill_service 的 AI 分析方法
|
||||
result = await template_fill_service.analyze_txt_with_ai(
|
||||
content=content.decode('utf-8', errors='replace'),
|
||||
filename=file.filename
|
||||
filename = doc.get("metadata", {}).get("original_filename", "unknown.txt")
|
||||
file_ext = filename.split('.')[-1].lower()
|
||||
|
||||
if file_ext not in ['txt', 'text']:
|
||||
raise HTTPException(status_code=400, detail=f"文档类型不是 TXT: {file_ext}")
|
||||
|
||||
# 使用数据库中的 content
|
||||
text_content = doc.get("content") or ""
|
||||
|
||||
if not text_content:
|
||||
raise HTTPException(status_code=400, detail="文档内容为空")
|
||||
|
||||
logger.info(f"从数据库加载 TXT 文档: {filename}, 长度: {len(text_content)}")
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"从数据库读取 TXT 文档失败: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail=f"读取文档失败: {str(e)}")
|
||||
else:
|
||||
# 文件上传模式
|
||||
if not file:
|
||||
raise HTTPException(status_code=400, detail="请提供文件或文档ID")
|
||||
|
||||
if not file.filename:
|
||||
raise HTTPException(status_code=400, detail="文件名为空")
|
||||
|
||||
file_ext = file.filename.split('.')[-1].lower()
|
||||
if file_ext not in ['txt', 'text']:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"不支持的文件类型: {file_ext},仅支持 .txt"
|
||||
)
|
||||
|
||||
if result:
|
||||
logger.info(f"TXT AI 分析成功: {file.filename}")
|
||||
return {
|
||||
"success": True,
|
||||
"filename": file.filename,
|
||||
"structured_data": result
|
||||
}
|
||||
else:
|
||||
logger.warning(f"TXT AI 分析返回空结果: {file.filename}")
|
||||
return {
|
||||
"success": False,
|
||||
"filename": file.filename,
|
||||
"error": "AI 分析未能提取到结构化数据",
|
||||
"structured_data": None
|
||||
}
|
||||
# 读取文件内容
|
||||
content = await file.read()
|
||||
text_content = content.decode('utf-8', errors='replace')
|
||||
filename = file.filename
|
||||
|
||||
finally:
|
||||
# 清理临时文件
|
||||
if os.path.exists(tmp_path):
|
||||
os.unlink(tmp_path)
|
||||
try:
|
||||
logger.info(f"开始 AI 分析 TXT 文件: {filename}, analysis_type={analysis_type}")
|
||||
|
||||
# 使用 txt_ai_service 的 AI 分析方法
|
||||
result = await txt_ai_service.analyze_txt_with_ai(
|
||||
content=text_content,
|
||||
filename=filename,
|
||||
analysis_type=analysis_type
|
||||
)
|
||||
|
||||
if result:
|
||||
logger.info(f"TXT AI 分析成功: {filename}")
|
||||
return {
|
||||
"success": result.get("success", True),
|
||||
"filename": filename,
|
||||
"analysis_type": analysis_type,
|
||||
"result": result
|
||||
}
|
||||
else:
|
||||
logger.warning(f"TXT AI 分析返回空结果: {filename}")
|
||||
return {
|
||||
"success": False,
|
||||
"filename": filename,
|
||||
"error": "AI 分析未能提取到结构化数据",
|
||||
"result": None
|
||||
}
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
@@ -419,21 +553,90 @@ async def analyze_txt(
|
||||
|
||||
@router.post("/analyze/word")
|
||||
async def analyze_word(
|
||||
file: UploadFile = File(...),
|
||||
user_hint: str = Query("", description="用户提示词,如'请提取表格数据'")
|
||||
file: Optional[UploadFile] = File(None),
|
||||
doc_id: Optional[str] = Form(None, description="文档ID(从数据库读取)"),
|
||||
user_hint: str = Form("", description="用户提示词,如'请提取表格数据'"),
|
||||
analysis_type: str = Query("structured", description="分析类型: structured, charts")
|
||||
):
|
||||
"""
|
||||
使用 AI 解析 Word 文档,提取结构化数据
|
||||
使用 AI 解析 Word 文档,提取结构化数据或生成图表
|
||||
|
||||
适用于从非结构化的 Word 文档中提取表格数据、键值对等信息
|
||||
当 analysis_type=charts 时,可生成可视化图表
|
||||
|
||||
Args:
|
||||
file: 上传的 Word 文件
|
||||
file: 上传的 Word 文件(与 doc_id 二选一)
|
||||
doc_id: 文档ID(从数据库读取)
|
||||
user_hint: 用户提示词
|
||||
analysis_type: 分析类型 - "structured"(默认,提取结构化数据)或 "charts"(生成图表)
|
||||
|
||||
Returns:
|
||||
dict: 包含结构化数据的解析结果
|
||||
dict: 包含结构化数据的解析结果或图表数据
|
||||
"""
|
||||
# 获取文件名和扩展名
|
||||
filename = None
|
||||
file_ext = None
|
||||
|
||||
if doc_id:
|
||||
# 从数据库读取文档
|
||||
try:
|
||||
from app.core.database.mongodb import mongodb
|
||||
doc = await mongodb.get_document(doc_id)
|
||||
if not doc:
|
||||
raise HTTPException(status_code=404, detail=f"文档不存在: {doc_id}")
|
||||
|
||||
filename = doc.get("metadata", {}).get("original_filename", "unknown.docx")
|
||||
file_ext = filename.split('.')[-1].lower()
|
||||
|
||||
if file_ext not in ['docx']:
|
||||
raise HTTPException(status_code=400, detail=f"文档类型不是 Word: {file_ext}")
|
||||
|
||||
# 使用数据库中的 content 进行分析
|
||||
content = doc.get("content", "") or ""
|
||||
structured_data = doc.get("structured_data") or {}
|
||||
tables = structured_data.get("tables", [])
|
||||
|
||||
# 调用 AI 分析服务,传入数据库内容
|
||||
if analysis_type == "charts":
|
||||
result = await word_ai_service.generate_charts_from_db(
|
||||
content=content,
|
||||
tables=tables,
|
||||
filename=filename,
|
||||
user_hint=user_hint
|
||||
)
|
||||
else:
|
||||
result = await word_ai_service.parse_word_with_ai_from_db(
|
||||
content=content,
|
||||
tables=tables,
|
||||
filename=filename,
|
||||
user_hint=user_hint or "请提取文档中的所有结构化数据,包括表格、键值对等"
|
||||
)
|
||||
|
||||
if result.get("success"):
|
||||
return {
|
||||
"success": True,
|
||||
"filename": filename,
|
||||
"analysis_type": analysis_type,
|
||||
"result": result
|
||||
}
|
||||
else:
|
||||
return {
|
||||
"success": False,
|
||||
"filename": filename,
|
||||
"error": result.get("error", "AI 解析失败"),
|
||||
"result": None
|
||||
}
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"从数据库读取 Word 文档失败: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail=f"读取文档失败: {str(e)}")
|
||||
|
||||
# 文件上传模式
|
||||
if not file:
|
||||
raise HTTPException(status_code=400, detail="请提供文件或文档ID")
|
||||
|
||||
if not file.filename:
|
||||
raise HTTPException(status_code=400, detail="文件名为空")
|
||||
|
||||
@@ -453,16 +656,25 @@ async def analyze_word(
|
||||
tmp_path = tmp.name
|
||||
|
||||
try:
|
||||
# 使用 AI 解析 Word 文档
|
||||
result = await word_ai_service.parse_word_with_ai(
|
||||
file_path=tmp_path,
|
||||
user_hint=user_hint or "请提取文档中的所有结构化数据,包括表格、键值对等"
|
||||
)
|
||||
# 根据 analysis_type 选择处理方式
|
||||
if analysis_type == "charts":
|
||||
# 生成图表
|
||||
result = await word_ai_service.generate_charts(
|
||||
file_path=tmp_path,
|
||||
user_hint=user_hint
|
||||
)
|
||||
else:
|
||||
# 提取结构化数据
|
||||
result = await word_ai_service.parse_word_with_ai(
|
||||
file_path=tmp_path,
|
||||
user_hint=user_hint or "请提取文档中的所有结构化数据,包括表格、键值对等"
|
||||
)
|
||||
|
||||
if result.get("success"):
|
||||
return {
|
||||
"success": True,
|
||||
"filename": file.filename,
|
||||
"analysis_type": analysis_type,
|
||||
"result": result
|
||||
}
|
||||
else:
|
||||
|
||||
98
backend/app/api/endpoints/conversation.py
Normal file
98
backend/app/api/endpoints/conversation.py
Normal file
@@ -0,0 +1,98 @@
|
||||
"""
|
||||
对话历史 API 接口
|
||||
|
||||
提供对话历史的存储和查询功能
|
||||
"""
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import APIRouter, HTTPException
|
||||
from pydantic import BaseModel
|
||||
|
||||
from app.core.database import mongodb
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
router = APIRouter(prefix="/conversation", tags=["对话历史"])
|
||||
|
||||
|
||||
# ==================== 请求/响应模型 ====================
|
||||
|
||||
class ConversationMessage(BaseModel):
    """One message of a conversation.

    Attributes:
        role: Role string attached to the message.
        content: Text of the message.
        intent: Optional intent label; ``None`` when not provided.
    """

    role: str
    content: str
    intent: Optional[str] = None
|
||||
|
||||
|
||||
class ConversationHistoryResponse(BaseModel):
    """Response body of the conversation-history endpoint.

    Attributes:
        success: Whether the history lookup completed without error.
        messages: The returned messages (empty when the lookup failed).
    """

    success: bool
    messages: list
|
||||
|
||||
|
||||
class ConversationListResponse(BaseModel):
    """Response body of the conversation-list endpoint.

    Attributes:
        success: Whether the listing completed without error.
        conversations: The returned conversations (empty when the listing failed).
    """

    success: bool
    conversations: list
|
||||
|
||||
|
||||
# ==================== 接口 ====================
|
||||
|
||||
@router.get("/{conversation_id}/history", response_model=ConversationHistoryResponse)
async def get_conversation_history(conversation_id: str, limit: int = 20):
    """
    Return the message history of one conversation.

    Args:
        conversation_id: ID of the conversation session.
        limit: Number of messages to return (20 by default).

    Returns:
        ConversationHistoryResponse: ``success=True`` with the messages, or
        ``success=False`` with an empty list when anything in the lookup raised.
    """
    try:
        history = await mongodb.get_conversation_history(conversation_id, limit=limit)
        # Built inside the try so a failure while building the response is
        # reported the same way as a failed lookup.
        payload = ConversationHistoryResponse(success=True, messages=history)
    except Exception as exc:
        logger.error(f"获取对话历史失败: {exc}")
        payload = ConversationHistoryResponse(success=False, messages=[])
    return payload
|
||||
|
||||
|
||||
@router.delete("/{conversation_id}")
async def delete_conversation(conversation_id: str):
    """
    Delete a conversation session.

    Args:
        conversation_id: ID of the conversation session.

    Returns:
        dict: ``{"success": <result of the delete call>}``, or
        ``{"success": False, "error": <exception text>}`` when the call raised.
    """
    try:
        deleted = await mongodb.delete_conversation(conversation_id)
        outcome = {"success": deleted}
    except Exception as exc:
        logger.error(f"删除对话失败: {exc}")
        outcome = {"success": False, "error": str(exc)}
    return outcome
|
||||
|
||||
|
||||
@router.get("/all", response_model=ConversationListResponse)
async def list_conversations(limit: int = 50, skip: int = 0):
    """
    Return the list of conversation sessions.

    Args:
        limit: Number of conversations to return (50 by default).
        skip: Number of conversations to skip (0 by default).

    Returns:
        ConversationListResponse: ``success=True`` with the conversations, or
        ``success=False`` with an empty list when anything in the listing raised.
    """
    try:
        sessions = await mongodb.list_conversations(limit=limit, skip=skip)
        # Built inside the try so a failure while building the response is
        # reported the same way as a failed listing.
        payload = ConversationListResponse(success=True, conversations=sessions)
    except Exception as exc:
        logger.error(f"获取会话列表失败: {exc}")
        payload = ConversationListResponse(success=False, conversations=[])
    return payload
|
||||
@@ -4,6 +4,7 @@
|
||||
支持多格式文档(docx/xlsx/md/txt)上传、解析、存储和RAG索引
|
||||
集成 Excel 存储和 AI 生成字段描述
|
||||
"""
|
||||
import asyncio
|
||||
import logging
|
||||
import uuid
|
||||
from typing import List, Optional
|
||||
@@ -258,6 +259,7 @@ async def process_document(
|
||||
)
|
||||
|
||||
# 如果是 Excel,存储到 MySQL + AI生成描述 + RAG索引
|
||||
mysql_table_name = None
|
||||
if doc_type in ["xlsx", "xls"]:
|
||||
await update_task_status(
|
||||
task_id, status="processing",
|
||||
@@ -265,17 +267,29 @@ async def process_document(
|
||||
)
|
||||
|
||||
try:
|
||||
# 使用 TableRAG 服务完成建表和RAG索引
|
||||
# 使用 TableRAG 服务存储到 MySQL(跳过 RAG 索引以提升速度)
|
||||
logger.info(f"开始存储Excel到MySQL: {original_filename}, file_path: {file_path}")
|
||||
rag_result = await table_rag_service.build_table_rag_index(
|
||||
file_path=file_path,
|
||||
filename=original_filename,
|
||||
sheet_name=parse_options.get("sheet_name"),
|
||||
header_row=parse_options.get("header_row", 0)
|
||||
header_row=parse_options.get("header_row", 0),
|
||||
skip_rag_index=True # 跳过 AI 字段描述生成和索引
|
||||
)
|
||||
|
||||
if rag_result.get("success"):
|
||||
logger.info(f"Excel存储到MySQL成功: {original_filename}, table: {rag_result.get('table_name')}")
|
||||
mysql_table_name = rag_result.get('table_name')
|
||||
logger.info(f"Excel存储到MySQL成功: {original_filename}, table: {mysql_table_name}")
|
||||
# 更新 MongoDB 中的 metadata,记录 MySQL 表名
|
||||
try:
|
||||
doc = await mongodb.get_document(doc_id)
|
||||
if doc:
|
||||
metadata = doc.get("metadata", {})
|
||||
metadata["mysql_table_name"] = mysql_table_name
|
||||
await mongodb.update_document_metadata(doc_id, metadata)
|
||||
logger.info(f"已更新 MongoDB 文档的 mysql_table_name: {mysql_table_name}")
|
||||
except Exception as update_err:
|
||||
logger.warning(f"更新 MongoDB mysql_table_name 失败: {update_err}")
|
||||
else:
|
||||
logger.error(f"RAG索引构建失败: {rag_result.get('error')}")
|
||||
except Exception as e:
|
||||
@@ -283,17 +297,16 @@ async def process_document(
|
||||
|
||||
else:
|
||||
# 非结构化文档
|
||||
await update_task_status(
|
||||
task_id, status="processing",
|
||||
progress=60, message="正在建立索引"
|
||||
)
|
||||
|
||||
# 如果文档中有表格数据,提取并存储到 MySQL + RAG
|
||||
structured_data = result.data.get("structured_data", {})
|
||||
tables = structured_data.get("tables", [])
|
||||
|
||||
# 如果文档中有表格数据,提取并存储到 MySQL(不需要 RAG 索引)
|
||||
if tables:
|
||||
# 对每个表格建立 MySQL 表和 RAG 索引
|
||||
await update_task_status(
|
||||
task_id, status="processing",
|
||||
progress=60, message="正在存储表格数据"
|
||||
)
|
||||
# 对每个表格建立 MySQL 表(跳过 RAG 索引,速度更快)
|
||||
for table_info in tables:
|
||||
await table_rag_service.index_document_table(
|
||||
doc_id=doc_id,
|
||||
@@ -302,8 +315,14 @@ async def process_document(
|
||||
source_doc_type=doc_type
|
||||
)
|
||||
|
||||
# 同时对文档内容建立 RAG 索引
|
||||
await index_document_to_rag(doc_id, original_filename, result, doc_type)
|
||||
# 对文档内容建立 RAG 索引(非结构化文本需要语义搜索)
|
||||
content = result.data.get("content", "")
|
||||
if content and len(content) > 50: # 只有内容足够长才建立索引
|
||||
await update_task_status(
|
||||
task_id, status="processing",
|
||||
progress=80, message="正在建立语义索引"
|
||||
)
|
||||
await index_document_to_rag(doc_id, original_filename, result, doc_type)
|
||||
|
||||
# 完成
|
||||
await update_task_status(
|
||||
@@ -328,72 +347,95 @@ async def process_document(
|
||||
|
||||
|
||||
async def process_documents_batch(task_id: str, files: List[dict]):
|
||||
"""批量处理文档"""
|
||||
"""批量并行处理文档"""
|
||||
try:
|
||||
await update_task_status(
|
||||
task_id, status="processing",
|
||||
progress=0, message="开始批量处理"
|
||||
progress=0, message=f"开始批量处理 {len(files)} 个文档",
|
||||
result={"total": len(files), "files": []}
|
||||
)
|
||||
|
||||
results = []
|
||||
for i, file_info in enumerate(files):
|
||||
async def process_single_file(file_info: dict, index: int) -> dict:
|
||||
"""处理单个文件"""
|
||||
filename = file_info["filename"]
|
||||
try:
|
||||
# 解析文档
|
||||
parser = ParserFactory.get_parser(file_info["path"])
|
||||
result = parser.parse(file_info["path"])
|
||||
|
||||
if result.success:
|
||||
doc_id = await mongodb.insert_document(
|
||||
doc_type=file_info["ext"],
|
||||
content=result.data.get("content", ""),
|
||||
metadata={
|
||||
**result.metadata,
|
||||
"original_filename": file_info["filename"],
|
||||
"file_path": file_info["path"]
|
||||
},
|
||||
structured_data=result.data.get("structured_data")
|
||||
if not result.success:
|
||||
return {"index": index, "filename": filename, "success": False, "error": result.error or "解析失败"}
|
||||
|
||||
# 存储到 MongoDB
|
||||
doc_id = await mongodb.insert_document(
|
||||
doc_type=file_info["ext"],
|
||||
content=result.data.get("content", ""),
|
||||
metadata={
|
||||
**result.metadata,
|
||||
"original_filename": filename,
|
||||
"file_path": file_info["path"]
|
||||
},
|
||||
structured_data=result.data.get("structured_data")
|
||||
)
|
||||
|
||||
# Excel 处理
|
||||
if file_info["ext"] in ["xlsx", "xls"]:
|
||||
await table_rag_service.build_table_rag_index(
|
||||
file_path=file_info["path"],
|
||||
filename=filename,
|
||||
skip_rag_index=True # 跳过 AI 字段描述生成和索引
|
||||
)
|
||||
|
||||
# Excel 处理
|
||||
if file_info["ext"] in ["xlsx", "xls"]:
|
||||
await table_rag_service.build_table_rag_index(
|
||||
file_path=file_info["path"],
|
||||
filename=file_info["filename"]
|
||||
)
|
||||
else:
|
||||
# 非结构化文档:处理其中的表格 + 内容索引
|
||||
structured_data = result.data.get("structured_data", {})
|
||||
tables = structured_data.get("tables", [])
|
||||
|
||||
if tables:
|
||||
for table_info in tables:
|
||||
await table_rag_service.index_document_table(
|
||||
doc_id=doc_id,
|
||||
filename=file_info["filename"],
|
||||
table_data=table_info,
|
||||
source_doc_type=file_info["ext"]
|
||||
)
|
||||
|
||||
await index_document_to_rag(doc_id, file_info["filename"], result, file_info["ext"])
|
||||
|
||||
results.append({"filename": file_info["filename"], "doc_id": doc_id, "success": True})
|
||||
else:
|
||||
results.append({"filename": file_info["filename"], "success": False, "error": result.error})
|
||||
# 非结构化文档
|
||||
structured_data = result.data.get("structured_data", {})
|
||||
tables = structured_data.get("tables", [])
|
||||
|
||||
# 表格数据直接存 MySQL(跳过 RAG 索引)
|
||||
if tables:
|
||||
for table_info in tables:
|
||||
await table_rag_service.index_document_table(
|
||||
doc_id=doc_id,
|
||||
filename=filename,
|
||||
table_data=table_info,
|
||||
source_doc_type=file_info["ext"]
|
||||
)
|
||||
|
||||
# 只有内容足够长才建立语义索引
|
||||
content = result.data.get("content", "")
|
||||
if content and len(content) > 50:
|
||||
await index_document_to_rag(doc_id, filename, result, file_info["ext"])
|
||||
|
||||
return {"index": index, "filename": filename, "doc_id": doc_id, "file_path": file_info["path"], "success": True}
|
||||
|
||||
except Exception as e:
|
||||
results.append({"filename": file_info["filename"], "success": False, "error": str(e)})
|
||||
logger.error(f"处理文件 {filename} 失败: {e}")
|
||||
return {"index": index, "filename": filename, "success": False, "error": str(e)}
|
||||
|
||||
progress = int((i + 1) / len(files) * 100)
|
||||
await update_task_status(
|
||||
task_id, status="processing",
|
||||
progress=progress, message=f"已处理 {i+1}/{len(files)}"
|
||||
)
|
||||
# 并行处理所有文档
|
||||
tasks = [process_single_file(f, i) for i, f in enumerate(files)]
|
||||
results = await asyncio.gather(*tasks)
|
||||
|
||||
# 按原始顺序排序
|
||||
results.sort(key=lambda x: x["index"])
|
||||
|
||||
# 统计成功/失败数量
|
||||
success_count = sum(1 for r in results if r["success"])
|
||||
fail_count = len(results) - success_count
|
||||
|
||||
# 更新最终状态
|
||||
await update_task_status(
|
||||
task_id, status="success",
|
||||
progress=100, message="批量处理完成",
|
||||
result={"results": results}
|
||||
progress=100, message=f"批量处理完成: {success_count} 成功, {fail_count} 失败",
|
||||
result={
|
||||
"total": len(files),
|
||||
"success": success_count,
|
||||
"failure": fail_count,
|
||||
"results": results
|
||||
}
|
||||
)
|
||||
|
||||
logger.info(f"批量处理完成: {success_count}/{len(files)} 成功")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"批量处理失败: {str(e)}")
|
||||
await update_task_status(
|
||||
@@ -404,20 +446,20 @@ async def process_documents_batch(task_id: str, files: List[dict]):
|
||||
|
||||
|
||||
async def index_document_to_rag(doc_id: str, filename: str, result: ParseResult, doc_type: str):
|
||||
"""将非结构化文档索引到 RAG(使用分块索引)"""
|
||||
"""将非结构化文档索引到 RAG(使用分块索引,异步执行)"""
|
||||
try:
|
||||
content = result.data.get("content", "")
|
||||
if content:
|
||||
# 将完整内容传递给 RAG 服务自动分块索引
|
||||
rag_service.index_document_content(
|
||||
# 使用异步方法索引,避免阻塞事件循环
|
||||
await rag_service.index_document_content_async(
|
||||
doc_id=doc_id,
|
||||
content=content, # 传递完整内容,由 RAG 服务自动分块
|
||||
content=content,
|
||||
metadata={
|
||||
"filename": filename,
|
||||
"doc_type": doc_type
|
||||
},
|
||||
chunk_size=500, # 每块 500 字符
|
||||
chunk_overlap=50 # 块之间 50 字符重叠
|
||||
chunk_size=1000, # 每块 1000 字符,提升速度
|
||||
chunk_overlap=100 # 块之间 100 字符重叠
|
||||
)
|
||||
logger.info(f"RAG 索引完成: {filename}, doc_id={doc_id}")
|
||||
except Exception as e:
|
||||
|
||||
@@ -25,6 +25,7 @@ class InstructionRequest(BaseModel):
|
||||
instruction: str
|
||||
doc_ids: Optional[List[str]] = None # 关联的文档 ID 列表
|
||||
context: Optional[Dict[str, Any]] = None # 额外上下文
|
||||
conversation_id: Optional[str] = None # 对话会话ID,用于关联历史记录
|
||||
|
||||
|
||||
class IntentRecognitionResponse(BaseModel):
|
||||
@@ -240,7 +241,8 @@ async def instruction_chat(
|
||||
task_id=task_id,
|
||||
instruction=request.instruction,
|
||||
doc_ids=request.doc_ids,
|
||||
context=request.context
|
||||
context=request.context,
|
||||
conversation_id=request.conversation_id
|
||||
)
|
||||
|
||||
return {
|
||||
@@ -251,14 +253,15 @@ async def instruction_chat(
|
||||
}
|
||||
|
||||
# 同步模式:等待执行完成
|
||||
return await _execute_chat_task(task_id, request.instruction, request.doc_ids, request.context)
|
||||
return await _execute_chat_task(task_id, request.instruction, request.doc_ids, request.context, request.conversation_id)
|
||||
|
||||
|
||||
async def _execute_chat_task(
|
||||
task_id: str,
|
||||
instruction: str,
|
||||
doc_ids: Optional[List[str]],
|
||||
context: Optional[Dict[str, Any]]
|
||||
context: Optional[Dict[str, Any]],
|
||||
conversation_id: Optional[str] = None
|
||||
):
|
||||
"""执行指令对话的后台任务"""
|
||||
from app.core.database import mongodb as mongo_client
|
||||
@@ -278,6 +281,13 @@ async def _execute_chat_task(
|
||||
# 构建上下文
|
||||
ctx: Dict[str, Any] = context or {}
|
||||
|
||||
# 获取对话历史
|
||||
if conversation_id:
|
||||
history = await mongo_client.get_conversation_history(conversation_id, limit=20)
|
||||
if history:
|
||||
ctx["conversation_history"] = history
|
||||
logger.info(f"加载对话历史: conversation_id={conversation_id}, 消息数={len(history)}")
|
||||
|
||||
# 获取关联文档
|
||||
if doc_ids:
|
||||
docs = []
|
||||
@@ -291,6 +301,29 @@ async def _execute_chat_task(
|
||||
# 执行指令
|
||||
result = await instruction_executor.execute(instruction, ctx)
|
||||
|
||||
# 存储对话历史
|
||||
if conversation_id:
|
||||
try:
|
||||
# 存储用户消息
|
||||
await mongo_client.insert_conversation(
|
||||
conversation_id=conversation_id,
|
||||
role="user",
|
||||
content=instruction,
|
||||
intent=result.get("intent", "unknown")
|
||||
)
|
||||
# 存储助手回复
|
||||
response_content = result.get("message", "")
|
||||
if response_content:
|
||||
await mongo_client.insert_conversation(
|
||||
conversation_id=conversation_id,
|
||||
role="assistant",
|
||||
content=response_content,
|
||||
intent=result.get("intent", "unknown")
|
||||
)
|
||||
logger.info(f"已存储对话历史: conversation_id={conversation_id}")
|
||||
except Exception as e:
|
||||
logger.error(f"存储对话历史失败: {e}")
|
||||
|
||||
# 根据意图类型添加友好的响应消息
|
||||
response_messages = {
|
||||
"extract": f"已提取 {len(result.get('extracted_data', {}))} 个字段的数据",
|
||||
|
||||
208
backend/app/api/endpoints/pdf_converter.py
Normal file
208
backend/app/api/endpoints/pdf_converter.py
Normal file
@@ -0,0 +1,208 @@
|
||||
"""
|
||||
PDF 转换 API 接口
|
||||
|
||||
提供将 Word、Excel、Txt、Markdown 转换为 PDF 的功能
|
||||
"""
|
||||
import logging
|
||||
import uuid
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import APIRouter, UploadFile, File, Form, HTTPException
|
||||
from fastapi.responses import StreamingResponse
|
||||
|
||||
from app.services.pdf_converter_service import pdf_converter_service
|
||||
from app.services.file_service import file_service
|
||||
|
||||
# Module logger.
logger = logging.getLogger(__name__)

# All routes of this module live under the /pdf prefix.
router = APIRouter(prefix="/pdf", tags=["PDF转换"])

# Temporary store for converted PDFs.
# key: download_id, value: (pdf_content, original_filename)
# NOTE(review): nothing visible in this module inserts entries; confirm who fills it.
_pdf_cache: dict = {}
|
||||
|
||||
|
||||
# ==================== 请求/响应模型 ====================
|
||||
|
||||
class ConvertResponse:
    """Plain value holder describing the outcome of a conversion."""

    def __init__(self, success: bool, message: str = "", filename: str = ""):
        # Store the three fields exactly as given; no validation is performed.
        self.success, self.message, self.filename = success, message, filename
|
||||
|
||||
|
||||
# ==================== 接口 ====================
|
||||
|
||||
@router.post("/convert")
async def convert_to_pdf(
    file: UploadFile = File(...),
):
    """
    Convert one uploaded file to PDF and stream the result back.

    Supported formats: docx, xlsx, txt, md (the authoritative list is
    ``pdf_converter_service.supported_formats``).

    Args:
        file: The uploaded source file.

    Returns:
        StreamingResponse with the PDF bytes, offered as an attachment named
        ``<uploaded base name>.pdf`` (RFC 5987 percent-encoded so non-ASCII
        names survive the header).

    Raises:
        HTTPException: 400 for an unsupported extension or an empty upload;
            500 when the converter reports an error or anything else fails.
    """
    import urllib.parse

    try:
        # Split the upload name once into base name and lower-cased extension.
        filename = file.filename or "document"
        if '.' in filename:
            base_name, raw_ext = filename.rsplit('.', 1)
            file_ext = raw_ext.lower()
        else:
            base_name, file_ext = filename, ''

        if file_ext not in pdf_converter_service.supported_formats:
            raise HTTPException(
                status_code=400,
                detail=f"不支持的格式: {file_ext},支持的格式: {', '.join(pdf_converter_service.supported_formats)}"
            )

        # Read the whole upload into memory.
        content = await file.read()
        if not content:
            raise HTTPException(status_code=400, detail="文件内容为空")

        logger.info(f"开始转换文件: {filename} ({file_ext})")

        pdf_content, error = await pdf_converter_service.convert_to_pdf(
            file_content=content,
            source_format=file_ext,
            filename=base_name
        )

        if error:
            raise HTTPException(status_code=500, detail=error)

        # Name the download after the source file instead of a fixed
        # "converted.pdf"; fall back to that name when the base is empty
        # (e.g. an upload called ".txt"). safe="" also encodes "/".
        encoded_filename = urllib.parse.quote(f"{base_name or 'converted'}.pdf", safe="")

        return StreamingResponse(
            iter([pdf_content]),
            media_type="application/pdf",
            headers={
                "Content-Disposition": f"attachment; filename*=UTF-8''{encoded_filename}"
            }
        )

    except HTTPException:
        # Deliberate HTTP errors pass through unchanged.
        raise
    except Exception as e:
        logger.error(f"PDF转换失败: {e}")
        raise HTTPException(status_code=500, detail=f"转换失败: {str(e)}")
|
||||
|
||||
|
||||
@router.get("/download/{download_id}")
async def download_pdf(download_id: str):
    """
    Download a converted PDF by its download ID (supports interception by IDM).

    The cache entry is removed as part of the lookup, so each ID can be
    downloaded once.

    Raises:
        HTTPException: 404 when the ID is not in the cache.
    """
    try:
        # pop() both fetches and deletes the entry.
        pdf_content, filename = _pdf_cache.pop(download_id)
    except KeyError:
        raise HTTPException(status_code=404, detail="下载链接已过期或不存在")

    from starlette.responses import StreamingResponse
    import urllib.parse

    # RFC 5987: percent-encode the name so Chinese file names survive the header.
    encoded_filename = urllib.parse.quote(f"{filename}.pdf")
    disposition = f"attachment; filename*=UTF-8''{encoded_filename}"

    return StreamingResponse(
        iter([pdf_content]),
        media_type="application/pdf",
        headers={"Content-Disposition": disposition}
    )
|
||||
|
||||
|
||||
@router.get("/formats")
async def get_supported_formats():
    """
    Report which source formats can be converted.

    Returns:
        ``{"success": True, "formats": ...}`` where ``formats`` is whatever
        ``pdf_converter_service.get_supported_formats()`` returns.
    """
    formats = pdf_converter_service.get_supported_formats()
    return {"success": True, "formats": formats}
|
||||
|
||||
|
||||
def _unique_pdf_name(original_name: str, used_names: set) -> str:
    """Build a ``.pdf`` archive member name for ``original_name`` that is not in ``used_names`` yet, and record it."""
    stem = original_name.rsplit('.', 1)[0] if '.' in original_name else original_name
    candidate = f"{stem}.pdf"
    counter = 1
    while candidate in used_names:
        candidate = f"{stem}_{counter}.pdf"
        counter += 1
    used_names.add(candidate)
    return candidate


@router.post("/convert/batch")
async def batch_convert_to_pdf(
    files: list[UploadFile] = File(...),
):
    """
    Convert several uploaded files to PDF and return them in one ZIP.

    Files that cannot be converted are skipped; the request only fails when
    no file at all could be converted.

    Args:
        files: The uploaded source files.

    Returns:
        StreamingResponse with a ZIP archive containing one PDF per
        successfully converted file. Uploads sharing a base name get a
        numeric suffix (``a.pdf``, ``a_1.pdf``) so no member is shadowed.

    Raises:
        HTTPException: 400 when nothing could be converted (the detail lists
            the per-file errors); 500 on any unexpected failure.
    """
    try:
        import io
        import zipfile

        results = []
        errors = []

        for file in files:
            try:
                filename = file.filename or "document"
                if '.' in filename:
                    base_name, raw_ext = filename.rsplit('.', 1)
                    file_ext = raw_ext.lower()
                else:
                    base_name, file_ext = filename, ''

                if file_ext not in pdf_converter_service.supported_formats:
                    errors.append(f"{filename}: 不支持的格式")
                    continue

                content = await file.read()
                if not content:
                    # Same rule as the single-file endpoint: empty uploads are rejected.
                    errors.append(f"{filename}: 文件内容为空")
                    continue

                pdf_content, error = await pdf_converter_service.convert_to_pdf(
                    file_content=content,
                    source_format=file_ext,
                    filename=base_name
                )

                if error:
                    errors.append(f"{filename}: {error}")
                else:
                    results.append((filename, pdf_content))

            except Exception as e:
                # One bad file must not abort the whole batch.
                errors.append(f"{file.filename}: {str(e)}")

        if not results:
            raise HTTPException(
                status_code=400,
                detail=f"没有可转换的文件。错误: {'; '.join(errors)}"
            )

        if errors:
            # Partial failures are not part of the ZIP response; keep a trace of them.
            logger.warning(f"批量PDF转换部分失败: {'; '.join(errors)}")

        # Pack every converted PDF into an in-memory ZIP archive.
        zip_buffer = io.BytesIO()
        used_names = set()
        with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
            for original_name, pdf_content in results:
                zip_file.writestr(_unique_pdf_name(original_name, used_names), pdf_content)

        return StreamingResponse(
            iter([zip_buffer.getvalue()]),
            media_type="application/zip",
            headers={
                "Content-Disposition": "attachment; filename*=UTF-8''converted_pdfs.zip"
            }
        )

    except HTTPException:
        # Deliberate HTTP errors pass through unchanged.
        raise
    except Exception as e:
        logger.error(f"批量PDF转换失败: {e}")
        raise HTTPException(status_code=500, detail=f"批量转换失败: {str(e)}")
|
||||
@@ -87,6 +87,7 @@ class ExportRequest(BaseModel):
|
||||
template_id: str
|
||||
filled_data: dict
|
||||
format: str = "xlsx" # xlsx 或 docx
|
||||
filled_file_path: Optional[str] = None # 已填写的 Word 文件路径(可选)
|
||||
|
||||
|
||||
# ==================== 接口实现 ====================
|
||||
@@ -541,7 +542,7 @@ async def export_filled_template(
|
||||
if request.format == "xlsx":
|
||||
return await _export_to_excel(request.filled_data, request.template_id)
|
||||
elif request.format == "docx":
|
||||
return await _export_to_word(request.filled_data, request.template_id)
|
||||
return await _export_to_word(request.filled_data, request.template_id, request.filled_file_path)
|
||||
else:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
@@ -608,11 +609,12 @@ async def _export_to_excel(filled_data: dict, template_id: str) -> StreamingResp
|
||||
)
|
||||
|
||||
|
||||
async def _export_to_word(filled_data: dict, template_id: str) -> StreamingResponse:
|
||||
async def _export_to_word(filled_data: dict, template_id: str, filled_file_path: Optional[str] = None) -> StreamingResponse:
|
||||
"""导出为 Word 格式"""
|
||||
import re
|
||||
import tempfile
|
||||
import os
|
||||
import urllib.parse
|
||||
from docx import Document
|
||||
from docx.shared import Pt, RGBColor
|
||||
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
||||
@@ -623,12 +625,32 @@ async def _export_to_word(filled_data: dict, template_id: str) -> StreamingRespo
|
||||
return ""
|
||||
# 移除控制字符
|
||||
text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text)
|
||||
# 转义 XML 特殊字符以防破坏文档结构
|
||||
text = text.replace('&', '&').replace('<', '<').replace('>', '>')
|
||||
return text.strip()
|
||||
|
||||
tmp_path = None
|
||||
try:
|
||||
# 先保存到临时文件,再读取到内存,确保文档完整性
|
||||
with tempfile.NamedTemporaryFile(delete=False, suffix='.docx') as tmp_file:
|
||||
tmp_path = tmp_file.name
|
||||
# 如果有已填写的文件(通过 _fill_docx 填写了模板单元格),直接返回该文件
|
||||
if filled_file_path and os.path.exists(filled_file_path):
|
||||
filename = os.path.basename(filled_file_path)
|
||||
with open(filled_file_path, 'rb') as f:
|
||||
file_content = f.read()
|
||||
output = io.BytesIO(file_content)
|
||||
encoded_filename = urllib.parse.quote(filename)
|
||||
return StreamingResponse(
|
||||
output,
|
||||
media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
headers={
|
||||
"Content-Disposition": f"attachment; filename*=UTF-8''{encoded_filename}",
|
||||
"Content-Length": str(len(file_content))
|
||||
}
|
||||
)
|
||||
|
||||
# 没有已填写文件,创建新的 Word 文档(表格形式)
|
||||
# 创建临时文件(立即关闭句柄,避免 Windows 文件锁问题)
|
||||
tmp_fd, tmp_path = tempfile.mkstemp(suffix='.docx')
|
||||
os.close(tmp_fd) # 关闭立即得到的 fd,让 docx 可以写入
|
||||
|
||||
doc = Document()
|
||||
doc.add_heading('填写结果', level=1)
|
||||
@@ -670,19 +692,23 @@ async def _export_to_word(filled_data: dict, template_id: str) -> StreamingRespo
|
||||
|
||||
finally:
|
||||
# 清理临时文件
|
||||
if os.path.exists(tmp_path):
|
||||
if tmp_path and os.path.exists(tmp_path):
|
||||
try:
|
||||
os.unlink(tmp_path)
|
||||
except:
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
output = io.BytesIO(file_content)
|
||||
filename = "filled_template.docx"
|
||||
encoded_filename = urllib.parse.quote(filename)
|
||||
|
||||
return StreamingResponse(
|
||||
output,
|
||||
media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
headers={"Content-Disposition": f"attachment; filename*=UTF-8''{filename}"}
|
||||
headers={
|
||||
"Content-Disposition": f"attachment; filename*=UTF-8''{encoded_filename}",
|
||||
"Content-Length": str(len(file_content))
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
|
||||
27
backend/app/celery_app.py
Normal file
27
backend/app/celery_app.py
Normal file
@@ -0,0 +1,27 @@
|
||||
# ============================================================
|
||||
# Celery 应用配置
|
||||
# ============================================================
|
||||
from celery import Celery
|
||||
|
||||
# 优先使用环境变量,否则使用默认值
|
||||
import os
|
||||
|
||||
# Broker and result-backend URLs: the environment wins, otherwise local Redis defaults.
CELERY_BROKER_URL = os.environ.get("CELERY_BROKER_URL", "redis://localhost:6379/1")
CELERY_RESULT_BACKEND = os.environ.get("CELERY_RESULT_BACKEND", "redis://localhost:6379/2")

# The application object for the "filesread" Celery app.
celery_app = Celery("filesread", broker=CELERY_BROKER_URL, backend=CELERY_RESULT_BACKEND)

celery_app.conf.update(
    # JSON is used for task messages, accepted content and results.
    task_serializer="json",
    accept_content=["json"],
    result_serializer="json",
    timezone="Asia/Shanghai",
    enable_utc=True,
    task_track_started=True,
    # One-hour time limit per task.
    task_time_limit=3600,
    worker_prefetch_multiplier=1,
)
|
||||
@@ -64,6 +64,11 @@ class MongoDB:
|
||||
"""任务集合 - 存储任务历史记录"""
|
||||
return self.db["tasks"]
|
||||
|
||||
@property
def conversations(self):
    """Conversation collection: stores the conversation history records."""
    collection = self.db["conversations"]
    return collection
|
||||
|
||||
# ==================== 文档操作 ====================
|
||||
|
||||
async def insert_document(
|
||||
@@ -117,14 +122,20 @@ class MongoDB:
|
||||
搜索文档
|
||||
|
||||
Args:
|
||||
query: 搜索关键词
|
||||
query: 搜索关键词(支持文件名和内容搜索)
|
||||
doc_type: 文档类型过滤
|
||||
limit: 返回数量
|
||||
|
||||
Returns:
|
||||
文档列表
|
||||
"""
|
||||
filter_query = {"content": {"$regex": query}}
|
||||
filter_query = {
|
||||
"$or": [
|
||||
{"content": {"$regex": query, "$options": "i"}},
|
||||
{"metadata.original_filename": {"$regex": query, "$options": "i"}},
|
||||
{"metadata.filename": {"$regex": query, "$options": "i"}},
|
||||
]
|
||||
}
|
||||
if doc_type:
|
||||
filter_query["doc_type"] = doc_type
|
||||
|
||||
@@ -141,6 +152,15 @@ class MongoDB:
|
||||
result = await self.documents.delete_one({"_id": ObjectId(doc_id)})
|
||||
return result.deleted_count > 0
|
||||
|
||||
async def update_document_metadata(self, doc_id: str, metadata: Dict[str, Any]) -> bool:
    """
    Replace the ``metadata`` field of one document.

    Args:
        doc_id: Document ID as a string; it is converted to an ObjectId.
        metadata: The complete metadata dict to store.

    Returns:
        True when the update reports at least one modified document.
    """
    from bson import ObjectId

    selector = {"_id": ObjectId(doc_id)}
    change = {"$set": {"metadata": metadata}}
    outcome = await self.documents.update_one(selector, change)
    return outcome.modified_count > 0
|
||||
|
||||
# ==================== RAG 索引操作 ====================
|
||||
|
||||
async def insert_rag_entry(
|
||||
@@ -251,6 +271,10 @@ class MongoDB:
|
||||
await self.tasks.create_index("task_id", unique=True)
|
||||
await self.tasks.create_index("created_at")
|
||||
|
||||
# 对话集合索引
|
||||
await self.conversations.create_index("conversation_id")
|
||||
await self.conversations.create_index("created_at")
|
||||
|
||||
logger.info("MongoDB 索引创建完成")
|
||||
|
||||
# ==================== 任务历史操作 ====================
|
||||
@@ -369,6 +393,108 @@ class MongoDB:
|
||||
result = await self.tasks.delete_one({"task_id": task_id})
|
||||
return result.deleted_count > 0
|
||||
|
||||
# ==================== 对话历史操作 ====================
|
||||
|
||||
async def insert_conversation(
    self,
    conversation_id: str,
    role: str,
    content: str,
    intent: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> str:
    """
    Store one conversation message.

    Args:
        conversation_id: ID of the conversation session.
        role: Speaker role (user/assistant).
        content: Message text.
        intent: Intent type, if any.
        metadata: Extra metadata; stored as an empty dict when omitted.

    Returns:
        The ID of the inserted record, as a string.
    """
    record = dict(
        conversation_id=conversation_id,
        role=role,
        content=content,
        intent=intent,
        metadata=metadata if metadata else {},
        # Creation time taken from datetime.utcnow().
        created_at=datetime.utcnow(),
    )
    outcome = await self.conversations.insert_one(record)
    return str(outcome.inserted_id)
|
||||
|
||||
async def get_conversation_history(
    self,
    conversation_id: str,
    limit: int = 20,
) -> List[Dict[str, Any]]:
    """
    Return the most recent messages of a conversation, oldest first.

    Args:
        conversation_id: ID of the conversation session.
        limit: Maximum number of messages to return.

    Returns:
        Up to ``limit`` of the latest messages in chronological order. Each
        message has ``_id`` converted to ``str`` and ``created_at`` (when
        set) converted to an ISO-8601 string.
    """
    # Sort newest-first so the limit keeps the latest messages; sorting
    # oldest-first would return only the start of a long conversation.
    cursor = self.conversations.find(
        {"conversation_id": conversation_id}
    ).sort("created_at", -1).limit(limit)

    messages = []
    async for msg in cursor:
        msg["_id"] = str(msg["_id"])
        if msg.get("created_at"):
            msg["created_at"] = msg["created_at"].isoformat()
        messages.append(msg)

    # Restore chronological order for callers.
    messages.reverse()
    return messages
|
||||
|
||||
async def delete_conversation(self, conversation_id: str) -> bool:
    """Delete every message of a conversation; True when at least one record was removed."""
    selector = {"conversation_id": conversation_id}
    outcome = await self.conversations.delete_many(selector)
    removed = outcome.deleted_count
    return removed > 0
|
||||
|
||||
async def list_conversations(
    self,
    limit: int = 50,
    skip: int = 0,
) -> List[Dict[str, Any]]:
    """
    List conversations, ordered by their most recent message.

    Args:
        limit: Number of conversations to return.
        skip: Number of conversations to skip.

    Returns:
        One entry per conversation: its latest message document, with
        ``_id`` converted to ``str`` and ``created_at`` (when set) converted
        to an ISO-8601 string.
    """
    # Stage 1-2: newest first, then keep the first (latest) message per conversation.
    pick_latest = [
        {"$sort": {"created_at": -1}},
        {"$group": {
            "_id": "$conversation_id",
            "last_message": {"$first": "$$ROOT"},
        }},
    ]
    # Stage 3-6: unwrap that message, re-sort the conversations, then page.
    unwrap_and_page = [
        {"$replaceRoot": {"newRoot": "$last_message"}},
        {"$sort": {"created_at": -1}},
        {"$skip": skip},
        {"$limit": limit},
    ]
    pipeline = pick_latest + unwrap_and_page

    summaries = []
    async for entry in self.conversations.aggregate(pipeline):
        entry["_id"] = str(entry["_id"])
        stamp = entry.get("created_at")
        if stamp:
            entry["created_at"] = stamp.isoformat()
        summaries.append(entry)
    return summaries
|
||||
|
||||
|
||||
# ==================== 全局单例 ====================
|
||||
|
||||
|
||||
@@ -44,6 +44,22 @@ class DocxParser(BaseParser):
|
||||
error=f"文件不存在: {file_path}"
|
||||
)
|
||||
|
||||
# 尝试使用 python-docx 解析,失败则使用备用方法
|
||||
try:
|
||||
return self._parse_with_docx(path)
|
||||
except Exception as e:
|
||||
logger.warning(f"python-docx 解析失败,使用备用方法: {e}")
|
||||
try:
|
||||
return self._parse_fallback(path)
|
||||
except Exception as fallback_error:
|
||||
logger.error(f"备用解析方法也失败: {fallback_error}")
|
||||
return ParseResult(
|
||||
success=False,
|
||||
error=f"解析 Word 文档失败: {str(e)}"
|
||||
)
|
||||
|
||||
def _parse_with_docx(self, path: Path) -> ParseResult:
|
||||
"""使用 python-docx 解析文档"""
|
||||
# 检查文件扩展名
|
||||
if path.suffix.lower() not in self.supported_extensions:
|
||||
return ParseResult(
|
||||
@@ -51,98 +67,181 @@ class DocxParser(BaseParser):
|
||||
error=f"不支持的文件类型: {path.suffix}"
|
||||
)
|
||||
|
||||
# 读取 Word 文档
|
||||
doc = Document(path)
|
||||
|
||||
# 提取文本内容
|
||||
paragraphs = []
|
||||
for para in doc.paragraphs:
|
||||
if para.text.strip():
|
||||
paragraphs.append({
|
||||
"text": para.text,
|
||||
"style": str(para.style.name) if para.style else "Normal"
|
||||
})
|
||||
|
||||
# 提取段落纯文本(用于 AI 解析)
|
||||
paragraphs_text = [p["text"] for p in paragraphs if p["text"].strip()]
|
||||
|
||||
# 提取表格内容
|
||||
tables_data = []
|
||||
for i, table in enumerate(doc.tables):
|
||||
table_rows = []
|
||||
for row in table.rows:
|
||||
row_data = [cell.text.strip() for cell in row.cells]
|
||||
table_rows.append(row_data)
|
||||
|
||||
if table_rows:
|
||||
# 第一行作为表头,其余行作为数据
|
||||
headers = table_rows[0] if table_rows else []
|
||||
data_rows = table_rows[1:] if len(table_rows) > 1 else []
|
||||
tables_data.append({
|
||||
"table_index": i,
|
||||
"headers": headers, # 添加 headers 字段
|
||||
"rows": data_rows, # 数据行(不含表头)
|
||||
"row_count": len(data_rows),
|
||||
"column_count": len(headers) if headers else 0
|
||||
})
|
||||
|
||||
# 提取图片/嵌入式对象信息
|
||||
images_info = self._extract_images_info(doc, path)
|
||||
|
||||
# 合并所有文本(包括图片描述)
|
||||
full_text_parts = []
|
||||
full_text_parts.append("【文档正文】")
|
||||
full_text_parts.extend(paragraphs_text)
|
||||
|
||||
if tables_data:
|
||||
full_text_parts.append("\n【文档表格】")
|
||||
for idx, table in enumerate(tables_data):
|
||||
full_text_parts.append(f"--- 表格 {idx + 1} ---")
|
||||
for row in table["rows"]:
|
||||
full_text_parts.append(" | ".join(str(cell) for cell in row))
|
||||
|
||||
if images_info.get("image_count", 0) > 0:
|
||||
full_text_parts.append(f"\n【文档图片】文档包含 {images_info['image_count']} 张图片/图表")
|
||||
|
||||
full_text = "\n".join(full_text_parts)
|
||||
|
||||
# 构建元数据
|
||||
metadata = {
|
||||
"filename": path.name,
|
||||
"extension": path.suffix.lower(),
|
||||
"paragraph_count": len(paragraphs),
|
||||
"table_count": len(tables_data),
|
||||
"image_count": images_info.get("image_count", 0)
|
||||
}
|
||||
|
||||
return ParseResult(
|
||||
success=True,
|
||||
data={
|
||||
"content": full_text,
|
||||
"paragraphs": paragraphs,
|
||||
"paragraphs_with_style": paragraphs,
|
||||
"tables": tables_data,
|
||||
"images": images_info
|
||||
},
|
||||
metadata=metadata
|
||||
)
|
||||
|
||||
def _parse_fallback(self, path: Path) -> ParseResult:
|
||||
"""备用解析方法:直接解析 docx 的 XML 结构"""
|
||||
import zipfile
|
||||
from xml.etree import ElementTree as ET
|
||||
|
||||
try:
|
||||
# 读取 Word 文档
|
||||
doc = Document(file_path)
|
||||
with zipfile.ZipFile(path, 'r') as zf:
|
||||
# 读取 document.xml
|
||||
if 'word/document.xml' not in zf.namelist():
|
||||
return ParseResult(success=False, error="无效的 docx 文件格式")
|
||||
|
||||
# 提取文本内容
|
||||
paragraphs = []
|
||||
for para in doc.paragraphs:
|
||||
if para.text.strip():
|
||||
paragraphs.append({
|
||||
"text": para.text,
|
||||
"style": str(para.style.name) if para.style else "Normal"
|
||||
xml_content = zf.read('word/document.xml')
|
||||
root = ET.fromstring(xml_content)
|
||||
|
||||
# 命名空间
|
||||
namespaces = {
|
||||
'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
|
||||
}
|
||||
|
||||
paragraphs = []
|
||||
tables = []
|
||||
current_table = []
|
||||
|
||||
for elem in root.iter():
|
||||
if elem.tag.endswith('}p'): # 段落
|
||||
text_parts = []
|
||||
for t in elem.iter():
|
||||
if t.tag.endswith('}t') and t.text:
|
||||
text_parts.append(t.text)
|
||||
text = ''.join(text_parts).strip()
|
||||
if text:
|
||||
paragraphs.append({'text': text, 'style': 'Normal'})
|
||||
elif elem.tag.endswith('}tr'): # 表格行
|
||||
row_data = []
|
||||
for tc in elem.iter():
|
||||
if tc.tag.endswith('}tc'): # 单元格
|
||||
cell_text = []
|
||||
for t in tc.iter():
|
||||
if t.tag.endswith('}t') and t.text:
|
||||
cell_text.append(t.text)
|
||||
row_data.append(''.join(cell_text).strip())
|
||||
if row_data:
|
||||
current_table.append(row_data)
|
||||
else:
|
||||
# 表格结束,保存
|
||||
if current_table:
|
||||
tables.append({
|
||||
'table_index': len(tables),
|
||||
'rows': current_table,
|
||||
'row_count': len(current_table),
|
||||
'column_count': len(current_table[0]) if current_table else 0
|
||||
})
|
||||
current_table = []
|
||||
|
||||
# 保存最后一张表格
|
||||
if current_table:
|
||||
tables.append({
|
||||
'table_index': len(tables),
|
||||
'rows': current_table,
|
||||
'row_count': len(current_table),
|
||||
'column_count': len(current_table[0]) if current_table else 0
|
||||
})
|
||||
|
||||
# 提取段落纯文本(用于 AI 解析)
|
||||
paragraphs_text = [p["text"] for p in paragraphs if p["text"].strip()]
|
||||
# 构建文本
|
||||
paragraphs_text = [p["text"] for p in paragraphs]
|
||||
full_text_parts = ["【文档正文】"] + paragraphs_text
|
||||
|
||||
# 提取表格内容
|
||||
tables_data = []
|
||||
for i, table in enumerate(doc.tables):
|
||||
table_rows = []
|
||||
for row in table.rows:
|
||||
row_data = [cell.text.strip() for cell in row.cells]
|
||||
table_rows.append(row_data)
|
||||
if tables:
|
||||
full_text_parts.append("\n【文档表格】")
|
||||
for idx, table in enumerate(tables):
|
||||
full_text_parts.append(f"--- 表格 {idx + 1} ---")
|
||||
for row in table["rows"]:
|
||||
full_text_parts.append(" | ".join(str(cell) for cell in row))
|
||||
|
||||
if table_rows:
|
||||
tables_data.append({
|
||||
"table_index": i,
|
||||
"rows": table_rows,
|
||||
"row_count": len(table_rows),
|
||||
"column_count": len(table_rows[0]) if table_rows else 0
|
||||
})
|
||||
full_text = "\n".join(full_text_parts)
|
||||
|
||||
# 提取图片/嵌入式对象信息
|
||||
images_info = self._extract_images_info(doc, path)
|
||||
|
||||
# 合并所有文本(包括图片描述)
|
||||
full_text_parts = []
|
||||
full_text_parts.append("【文档正文】")
|
||||
full_text_parts.extend(paragraphs_text)
|
||||
|
||||
if tables_data:
|
||||
full_text_parts.append("\n【文档表格】")
|
||||
for idx, table in enumerate(tables_data):
|
||||
full_text_parts.append(f"--- 表格 {idx + 1} ---")
|
||||
for row in table["rows"]:
|
||||
full_text_parts.append(" | ".join(str(cell) for cell in row))
|
||||
|
||||
if images_info.get("image_count", 0) > 0:
|
||||
full_text_parts.append(f"\n【文档图片】文档包含 {images_info['image_count']} 张图片/图表")
|
||||
|
||||
full_text = "\n".join(full_text_parts)
|
||||
|
||||
# 构建元数据
|
||||
metadata = {
|
||||
"filename": path.name,
|
||||
"extension": path.suffix.lower(),
|
||||
"file_size": path.stat().st_size,
|
||||
"paragraph_count": len(paragraphs),
|
||||
"table_count": len(tables_data),
|
||||
"word_count": len(full_text),
|
||||
"char_count": len(full_text.replace("\n", "")),
|
||||
"has_tables": len(tables_data) > 0,
|
||||
"has_images": images_info.get("image_count", 0) > 0,
|
||||
"image_count": images_info.get("image_count", 0)
|
||||
}
|
||||
|
||||
# 返回结果
|
||||
return ParseResult(
|
||||
success=True,
|
||||
data={
|
||||
"content": full_text,
|
||||
"paragraphs": paragraphs_text,
|
||||
"paragraphs_with_style": paragraphs,
|
||||
"tables": tables_data,
|
||||
"images": images_info,
|
||||
"word_count": len(full_text),
|
||||
"structured_data": {
|
||||
return ParseResult(
|
||||
success=True,
|
||||
data={
|
||||
"content": full_text,
|
||||
"paragraphs": paragraphs,
|
||||
"paragraphs_text": paragraphs_text,
|
||||
"tables": tables_data,
|
||||
"images": images_info
|
||||
"paragraphs_with_style": paragraphs,
|
||||
"tables": tables,
|
||||
"images": {"image_count": 0, "descriptions": []}
|
||||
},
|
||||
metadata={
|
||||
"filename": path.name,
|
||||
"extension": path.suffix.lower(),
|
||||
"paragraph_count": len(paragraphs),
|
||||
"table_count": len(tables),
|
||||
"image_count": 0,
|
||||
"parse_method": "fallback_xml"
|
||||
}
|
||||
},
|
||||
metadata=metadata
|
||||
)
|
||||
)
|
||||
|
||||
except zipfile.BadZipFile:
|
||||
return ParseResult(success=False, error="无效的 ZIP/文档文件")
|
||||
except Exception as e:
|
||||
logger.error(f"解析 Word 文档失败: {str(e)}")
|
||||
return ParseResult(
|
||||
success=False,
|
||||
error=f"解析 Word 文档失败: {str(e)}"
|
||||
)
|
||||
return ParseResult(success=False, error=f"备用解析失败: {str(e)}")
|
||||
|
||||
def extract_images_as_base64(self, file_path: str) -> List[Dict[str, str]]:
|
||||
"""
|
||||
@@ -197,6 +296,83 @@ class DocxParser(BaseParser):
|
||||
logger.info(f"共提取 {len(images)} 张图片")
|
||||
return images
|
||||
|
||||
def extract_text_from_images(self, file_path: str, lang: str = 'chi_sim+eng') -> Dict[str, Any]:
    """
    Run OCR over the images embedded in a Word document.

    The .docx is opened as a ZIP archive; every entry under ``word/media/``
    whose extension is png/jpg/jpeg/gif/bmp is decoded with Pillow and handed
    to Tesseract. A failure on a single image is logged and skipped so the
    remaining images are still processed.

    Args:
        file_path: Path of the Word file.
        lang: Tesseract language code(s); the default is Simplified Chinese
            plus English (``chi_sim+eng``).

    Returns:
        A dict with ``success``, ``image_count`` (the number of images that
        produced non-empty text), ``extracted_text`` (one entry per such image
        holding ``image_index``, ``filename``, ``text`` and ``char_count``)
        and ``total_chars``. When something goes wrong ``success`` is False
        and ``error`` describes the reason; the dict returned for a missing
        OCR dependency carries no ``total_chars`` key.
    """
    import zipfile
    from io import BytesIO

    # Both OCR dependencies are optional: report a missing one through the
    # result dict instead of letting ImportError escape to the caller.
    try:
        import pytesseract
    except ImportError:
        logger.warning("pytesseract 未安装,OCR 功能不可用")
        return {
            "success": False,
            "error": "pytesseract 未安装,请运行: pip install pytesseract",
            "image_count": 0,
            "extracted_text": []
        }

    try:
        from PIL import Image
    except ImportError:
        logger.warning("Pillow 未安装,OCR 功能不可用")
        return {
            "success": False,
            "error": "Pillow 未安装,请运行: pip install Pillow",
            "image_count": 0,
            "extracted_text": []
        }

    results = {
        "success": True,
        "image_count": 0,
        "extracted_text": [],
        "total_chars": 0
    }
    supported_extensions = ('png', 'jpg', 'jpeg', 'gif', 'bmp')

    try:
        with zipfile.ZipFile(file_path, 'r') as zf:
            # Embedded pictures live under word/media/ inside the archive.
            media_files = [f for f in zf.namelist() if f.startswith('word/media/')]

            for idx, filename in enumerate(media_files):
                ext = filename.split('.')[-1].lower()
                if ext not in supported_extensions:
                    continue

                try:
                    image_data = zf.read(filename)
                    # Context manager releases the decoded image promptly.
                    with Image.open(BytesIO(image_data)) as image:
                        text = pytesseract.image_to_string(image, lang=lang)
                    text = text.strip()

                    if text:
                        results["extracted_text"].append({
                            "image_index": idx,
                            "filename": filename,
                            "text": text,
                            "char_count": len(text)
                        })
                        results["total_chars"] += len(text)

                    logger.info(f"图片 {filename} OCR 识别完成,提取 {len(text)} 字符")

                except Exception as e:
                    # Best effort: one unreadable image must not abort the rest.
                    logger.warning(f"图片 {filename} OCR 识别失败: {str(e)}")

            # Counts only the images that yielded text, not every media file.
            results["image_count"] = len(results["extracted_text"])

    except zipfile.BadZipFile:
        results["success"] = False
        results["error"] = "无效的 Word 文档文件"
    except Exception as e:
        results["success"] = False
        results["error"] = f"OCR 处理失败: {str(e)}"

    return results
|
||||
|
||||
def extract_key_sentences(self, text: str, max_sentences: int = 10) -> List[str]:
|
||||
"""
|
||||
从文本中提取关键句子
|
||||
|
||||
@@ -5,9 +5,10 @@
|
||||
"""
|
||||
import logging
|
||||
import json
|
||||
import re
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from app.services.template_fill_service import template_fill_service
|
||||
from app.services.template_fill_service import template_fill_service, TemplateField
|
||||
from app.services.rag_service import rag_service
|
||||
from app.services.markdown_ai_service import markdown_ai_service
|
||||
from app.core.database import mongodb
|
||||
@@ -15,6 +16,31 @@ from app.core.database import mongodb
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _extract_filenames_from_text(text: str) -> List[str]:
|
||||
"""
|
||||
从指令文本中提取文件名列表。
|
||||
|
||||
智能处理用'和'/'与'/'、分隔的多个文件名(尤其是带年号的统计公报)。
|
||||
"""
|
||||
# 先去掉"对比这两个文档"等引导语,只保留文件名部分
|
||||
text = re.sub(r'^(?:对比|比较)这两个?文档[的差异]?[::]?', '', text).strip()
|
||||
text = re.sub(r'两个文档.*$', '', text).strip()
|
||||
if not text:
|
||||
return []
|
||||
|
||||
# 直接查找所有带扩展名的文件名模式
|
||||
results = []
|
||||
for m in re.finditer(r'[^\s,。!?、和与]+(?=\.(?:docx|xlsx|md|txt))', text):
|
||||
start = m.start()
|
||||
ext_match = re.search(r'\.(?:docx|xlsx|md|txt)', text[m.end():])
|
||||
if ext_match:
|
||||
fn = text[start:m.end() + ext_match.end()]
|
||||
if fn:
|
||||
results.append(fn)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
class InstructionExecutor:
|
||||
"""指令执行器"""
|
||||
|
||||
@@ -41,9 +67,10 @@ class InstructionExecutor:
|
||||
self.intent_parser = intent_parser
|
||||
|
||||
context = context or {}
|
||||
context["instruction"] = instruction # 保存原始指令以便后续使用
|
||||
|
||||
# 解析意图
|
||||
intent, params = await self.intent_parser.parse(instruction)
|
||||
# 解析意图(传递对话历史上下文)
|
||||
intent, params = await self.intent_parser.parse(instruction, context)
|
||||
|
||||
# 根据意图类型执行相应操作
|
||||
if intent == "extract":
|
||||
@@ -72,18 +99,48 @@ class InstructionExecutor:
|
||||
async def _execute_extract(self, params: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""执行信息提取"""
|
||||
try:
|
||||
target_fields = params.get("field_refs", [])
|
||||
# target_fields 来自意图解析,field_refs 来自引号/字段关键词匹配
|
||||
target_fields = params.get("target_fields", []) or params.get("field_refs", [])
|
||||
doc_ids = params.get("document_refs", [])
|
||||
instruction_text = context.get("instruction", "")
|
||||
|
||||
# 如果没有指定文档,尝试按文件名精确搜索
|
||||
if not doc_ids or "all_docs" in doc_ids:
|
||||
if instruction_text:
|
||||
import re
|
||||
# 提取引号内的内容或文件名
|
||||
filename_match = re.search(r'["""]([^"""]+)["""]', instruction_text)
|
||||
if filename_match:
|
||||
search_term = filename_match.group(1)
|
||||
else:
|
||||
match = re.search(r'([^\s]+\.(?:docx|xlsx|md|txt))', instruction_text)
|
||||
search_term = match.group(1) if match else None
|
||||
|
||||
if search_term:
|
||||
logger.info(f"提取时搜索文档: {search_term}")
|
||||
searched_docs = await mongodb.search_documents(search_term, limit=5)
|
||||
if searched_docs:
|
||||
# 优先选择文件名完全匹配的文档
|
||||
best_docs = [
|
||||
d for d in searched_docs
|
||||
if search_term.lower() in d.get("metadata", {}).get("original_filename", "").lower()
|
||||
]
|
||||
if not best_docs:
|
||||
best_docs = [searched_docs[0]]
|
||||
context["source_docs"] = best_docs
|
||||
doc_ids = [doc.get("_id", "") for doc in best_docs]
|
||||
logger.info(f"找到 {len(best_docs)} 个文档用于提取,最佳: {best_docs[0].get('metadata', {}).get('original_filename', '?')}")
|
||||
|
||||
if not target_fields:
|
||||
return {
|
||||
"success": False,
|
||||
"intent": "extract",
|
||||
"error": "未指定要提取的字段",
|
||||
"message": "请明确说明要提取哪些字段,如:'提取医院数量和床位数'"
|
||||
}
|
||||
|
||||
# 如果指定了文档,验证文档存在
|
||||
if doc_ids and "all_docs" not in doc_ids:
|
||||
# 如果指定了文档且还没有加载 source_docs,则验证并加载
|
||||
if doc_ids and "all_docs" not in doc_ids and not context.get("source_docs"):
|
||||
valid_docs = []
|
||||
for doc_ref in doc_ids:
|
||||
doc_id = doc_ref.replace("doc_", "")
|
||||
@@ -93,20 +150,22 @@ class InstructionExecutor:
|
||||
if not valid_docs:
|
||||
return {
|
||||
"success": False,
|
||||
"intent": "extract",
|
||||
"error": "指定的文档不存在",
|
||||
"message": "请检查文档编号是否正确"
|
||||
}
|
||||
context["source_docs"] = valid_docs
|
||||
|
||||
# 构建字段列表
|
||||
fields = []
|
||||
for i, field_name in enumerate(target_fields):
|
||||
fields.append({
|
||||
"name": field_name,
|
||||
"cell": f"A{i+1}",
|
||||
"field_type": "text",
|
||||
"required": False
|
||||
})
|
||||
# 构建字段列表(使用 TemplateField dataclass)
|
||||
fields = [
|
||||
TemplateField(
|
||||
name=field_name,
|
||||
cell=f"A{i+1}",
|
||||
field_type="text",
|
||||
required=False
|
||||
)
|
||||
for i, field_name in enumerate(target_fields)
|
||||
]
|
||||
|
||||
# 调用填表服务
|
||||
result = await template_fill_service.fill_template(
|
||||
@@ -143,7 +202,7 @@ class InstructionExecutor:
|
||||
}
|
||||
|
||||
# 获取源文档
|
||||
source_docs = context.get("source_docs", [])
|
||||
source_docs = context.get("source_docs", []) or []
|
||||
source_doc_ids = [doc.get("_id") for doc in source_docs if doc.get("_id")]
|
||||
|
||||
# 获取字段
|
||||
@@ -175,36 +234,103 @@ class InstructionExecutor:
|
||||
}
|
||||
|
||||
async def _execute_summarize(self, params: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""执行摘要总结"""
|
||||
"""执行摘要总结 - 使用 LLM 生成真实摘要"""
|
||||
try:
|
||||
docs = context.get("source_docs", [])
|
||||
import re
|
||||
docs = context.get("source_docs", []) or []
|
||||
instruction_text = context.get("instruction", "")
|
||||
|
||||
# 从指令中提取文件名/关键词,优先搜索精确文档
|
||||
search_term = None
|
||||
if instruction_text:
|
||||
filename_match = re.search(r'["""]([^"""]+)["""]', instruction_text)
|
||||
if filename_match:
|
||||
search_term = filename_match.group(1)
|
||||
else:
|
||||
file_match = re.search(r'([^\s,。!?,]+\.(?:docx|xlsx|md|txt))', instruction_text)
|
||||
if file_match:
|
||||
search_term = file_match.group(1)
|
||||
|
||||
# 如果没有文档或有更精确的搜索词,尝试重新搜索
|
||||
if not docs or search_term:
|
||||
if search_term:
|
||||
logger.info(f"按关键词搜索文档: {search_term}")
|
||||
searched_docs = await mongodb.search_documents(search_term, limit=5)
|
||||
if searched_docs:
|
||||
# 优先使用文件名最匹配的文档
|
||||
docs = sorted(
|
||||
searched_docs,
|
||||
key=lambda d: 1 if search_term.lower() in d.get("metadata", {}).get("original_filename", "").lower() else 0,
|
||||
reverse=True
|
||||
)
|
||||
logger.info(f"找到 {len(docs)} 个文档,最佳匹配: {docs[0].get('metadata', {}).get('original_filename', '?')}")
|
||||
|
||||
if not docs:
|
||||
return {
|
||||
"success": False,
|
||||
"error": "没有可用的文档",
|
||||
"message": "请先上传要总结的文档"
|
||||
"success": True,
|
||||
"intent": "summarize",
|
||||
"action_needed": "provide_document",
|
||||
"message": "我理解了,您想分析文档内容。",
|
||||
"suggestion": "请提供已上传文档的名称(可以是文件名或部分名称),或者上传您想要分析的文档。\n\n支持的格式:docx、xlsx、md、txt\n\n例如:'分析2021年民政事业发展统计公报' 或 '总结卫生健康数据'"
|
||||
}
|
||||
|
||||
summaries = []
|
||||
for doc in docs[:5]: # 最多处理5个文档
|
||||
content = doc.get("content", "")[:5000] # 限制内容长度
|
||||
if content:
|
||||
summaries.append({
|
||||
"filename": doc.get("metadata", {}).get("original_filename", "未知"),
|
||||
"content_preview": content[:500] + "..." if len(content) > 500 else content
|
||||
})
|
||||
# 对第一个(最佳匹配)文档生成 AI 摘要
|
||||
primary_doc = docs[0]
|
||||
content = primary_doc.get("content", "")
|
||||
filename = primary_doc.get("metadata", {}).get("original_filename", "未知文档")
|
||||
|
||||
if not content:
|
||||
return {
|
||||
"success": False,
|
||||
"intent": "summarize",
|
||||
"error": "文档内容为空",
|
||||
"message": f"文档 {filename} 没有可供分析的文本内容"
|
||||
}
|
||||
|
||||
# 使用 LLM 生成摘要
|
||||
content_for_summary = content[:12000] # 最多取前 12000 字
|
||||
user_request = instruction_text or "请总结这份文档"
|
||||
|
||||
prompt = f"""请对以下文档进行全面、有条理的摘要分析。
|
||||
|
||||
文档名称:{filename}
|
||||
用户要求:{user_request}
|
||||
|
||||
文档内容:
|
||||
{content_for_summary}
|
||||
|
||||
请按以下格式输出摘要:
|
||||
1. **文档概述**:简述文档主题和背景(2-3句)
|
||||
2. **主要内容**:列出文档的核心数据和关键信息(用要点列出)
|
||||
3. **重要数据**:提取文档中的重要数字、统计数据
|
||||
4. **主要结论**:归纳文档的主要结论或趋势
|
||||
|
||||
要求:条理清晰,数据准确,不要遗漏关键信息。"""
|
||||
|
||||
from app.services.llm_service import llm_service
|
||||
messages = [
|
||||
{"role": "system", "content": "你是一个专业的文档分析助手,擅长提取关键信息并生成结构化摘要。"},
|
||||
{"role": "user", "content": prompt}
|
||||
]
|
||||
|
||||
response = await llm_service.chat(messages=messages, temperature=0.3, max_tokens=2000)
|
||||
ai_summary = llm_service.extract_message_content(response)
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"intent": "summarize",
|
||||
"summaries": summaries,
|
||||
"message": f"找到 {len(summaries)} 个文档可供参考"
|
||||
"ai_summary": ai_summary,
|
||||
"filename": filename,
|
||||
"doc_id": primary_doc.get("_id", ""),
|
||||
"total_docs_found": len(docs),
|
||||
"message": f"已生成文档摘要"
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"摘要执行失败: {e}")
|
||||
return {
|
||||
"success": False,
|
||||
"intent": "summarize",
|
||||
"error": str(e),
|
||||
"message": f"摘要生成失败: {str(e)}"
|
||||
}
|
||||
@@ -213,17 +339,39 @@ class InstructionExecutor:
|
||||
"""执行问答"""
|
||||
try:
|
||||
question = params.get("question", "")
|
||||
instruction_text = context.get("instruction", "")
|
||||
|
||||
if not question:
|
||||
return {
|
||||
"success": False,
|
||||
"intent": "question",
|
||||
"error": "未提供问题",
|
||||
"message": "请输入要回答的问题"
|
||||
}
|
||||
|
||||
# 使用 RAG 检索相关文档
|
||||
docs = context.get("source_docs", [])
|
||||
rag_results = []
|
||||
docs = context.get("source_docs", []) or []
|
||||
|
||||
# 如果没有文档,尝试从指令中提取文件名搜索
|
||||
if not docs:
|
||||
filename_match = re.search(r'["""]([^"""]+\.(?:docx|xlsx|md|txt))["""]', instruction_text)
|
||||
if not filename_match:
|
||||
filename_match = re.search(r'([^\s,。!?]+\.(?:docx|xlsx|md|txt))', instruction_text)
|
||||
if filename_match:
|
||||
found = await mongodb.search_documents(filename_match.group(1), limit=5)
|
||||
if found:
|
||||
docs = found
|
||||
|
||||
if not docs:
|
||||
return {
|
||||
"success": True,
|
||||
"intent": "question",
|
||||
"question": question,
|
||||
"answer": None,
|
||||
"message": "请先上传文档,我才能回答您的问题"
|
||||
}
|
||||
|
||||
# 使用 RAG 检索相关文档
|
||||
rag_results = []
|
||||
for doc in docs:
|
||||
doc_id = doc.get("_id", "")
|
||||
if doc_id:
|
||||
@@ -241,12 +389,42 @@ class InstructionExecutor:
|
||||
doc.get("content", "")[:3000] for doc in docs[:3] if doc.get("content")
|
||||
])
|
||||
|
||||
if not context_text:
|
||||
return {
|
||||
"success": True,
|
||||
"intent": "question",
|
||||
"question": question,
|
||||
"answer": None,
|
||||
"message": "文档内容为空,无法回答问题"
|
||||
}
|
||||
|
||||
# 使用 LLM 生成答案
|
||||
filename = docs[0].get("metadata", {}).get("original_filename", "文档")
|
||||
prompt = f"""基于以下文档内容,回答用户的问题。
|
||||
|
||||
文档名称:{filename}
|
||||
用户问题:{question}
|
||||
|
||||
文档内容:
|
||||
{context_text[:8000]}
|
||||
|
||||
请根据文档内容准确回答问题。如果文档中没有相关信息,请明确说明。"""
|
||||
|
||||
from app.services.llm_service import llm_service
|
||||
messages = [
|
||||
{"role": "system", "content": "你是一个专业的文档问答助手,根据提供的内容准确回答用户问题。"},
|
||||
{"role": "user", "content": prompt}
|
||||
]
|
||||
response = await llm_service.chat(messages=messages, temperature=0.3, max_tokens=1500)
|
||||
answer = llm_service.extract_message_content(response)
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"intent": "question",
|
||||
"question": question,
|
||||
"context_preview": context_text[:500] + "..." if len(context_text) > 500 else context_text,
|
||||
"message": "已找到相关上下文,可进行问答"
|
||||
"answer": answer,
|
||||
"filename": filename,
|
||||
"message": "已生成回答"
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
@@ -299,12 +477,53 @@ class InstructionExecutor:
|
||||
async def _execute_compare(self, params: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""执行对比分析"""
|
||||
try:
|
||||
docs = context.get("source_docs", [])
|
||||
docs = context.get("source_docs", []) or []
|
||||
instruction_text = context.get("instruction", "")
|
||||
|
||||
# 优先从指令中提取具体的文件名
|
||||
filenames = _extract_filenames_from_text(instruction_text)
|
||||
|
||||
if filenames:
|
||||
# 只选择文件名匹配的那些文档
|
||||
matched_docs = []
|
||||
for doc in docs:
|
||||
fname = doc.get("metadata", {}).get("original_filename", "").lower()
|
||||
for fn in filenames:
|
||||
if fn.lower() in fname or fname in fn.lower():
|
||||
matched_docs.append(doc)
|
||||
break
|
||||
# 如果匹配到足够文档,用匹配的
|
||||
if len(matched_docs) >= 2:
|
||||
docs = matched_docs
|
||||
else:
|
||||
# 匹配不够,尝试按文件名搜索 MongoDB
|
||||
all_found = []
|
||||
for fn in filenames:
|
||||
found = await mongodb.search_documents(fn, limit=5)
|
||||
all_found.extend(found)
|
||||
seen = set()
|
||||
unique_docs = []
|
||||
for d in all_found:
|
||||
did = d.get("_id", "")
|
||||
if did and did not in seen:
|
||||
seen.add(did)
|
||||
unique_docs.append(d)
|
||||
if len(unique_docs) >= 2:
|
||||
docs = unique_docs
|
||||
elif len(unique_docs) == 1 and len(docs) >= 1:
|
||||
# 找到一个指定的 + 用一个通用的
|
||||
docs = unique_docs + docs[:1]
|
||||
elif docs and len(filenames) == 1:
|
||||
# 找到一个指定文件名但只有一个匹配,尝试补充
|
||||
docs = unique_docs + [d for d in docs if d not in unique_docs]
|
||||
docs = docs[:2]
|
||||
|
||||
if len(docs) < 2:
|
||||
return {
|
||||
"success": False,
|
||||
"intent": "compare",
|
||||
"error": "对比需要至少2个文档",
|
||||
"message": "请上传至少2个文档进行对比"
|
||||
"message": "请上传至少2个文档进行对比,或明确说出要对比的文档名称"
|
||||
}
|
||||
|
||||
# 提取文档基本信息
|
||||
@@ -329,6 +548,7 @@ class InstructionExecutor:
|
||||
logger.error(f"对比执行失败: {e}")
|
||||
return {
|
||||
"success": False,
|
||||
"intent": "compare",
|
||||
"error": str(e),
|
||||
"message": f"对比分析失败: {str(e)}"
|
||||
}
|
||||
@@ -336,10 +556,23 @@ class InstructionExecutor:
|
||||
async def _execute_edit(self, params: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""执行文档编辑操作"""
|
||||
try:
|
||||
docs = context.get("source_docs", [])
|
||||
docs = context.get("source_docs", []) or []
|
||||
instruction_text = context.get("instruction", "")
|
||||
|
||||
# 如果没有文档,尝试从指令中提取文件名搜索
|
||||
if not docs:
|
||||
filename_match = re.search(r'["""]([^"""]+\.(?:docx|xlsx|md|txt))["""]', instruction_text)
|
||||
if not filename_match:
|
||||
filename_match = re.search(r'([^\s,。!?]+\.(?:docx|xlsx|md|txt))', instruction_text)
|
||||
if filename_match:
|
||||
found = await mongodb.search_documents(filename_match.group(1), limit=3)
|
||||
if found:
|
||||
docs = found
|
||||
|
||||
if not docs:
|
||||
return {
|
||||
"success": False,
|
||||
"intent": "edit",
|
||||
"error": "没有可用的文档",
|
||||
"message": "请先上传要编辑的文档"
|
||||
}
|
||||
@@ -405,7 +638,7 @@ class InstructionExecutor:
|
||||
- Word -> Markdown
|
||||
"""
|
||||
try:
|
||||
docs = context.get("source_docs", [])
|
||||
docs = context.get("source_docs", []) or []
|
||||
if not docs:
|
||||
return {
|
||||
"success": False,
|
||||
|
||||
@@ -28,7 +28,7 @@ class IntentParser:
|
||||
INTENT_KEYWORDS = {
|
||||
INTENT_EXTRACT: ["提取", "抽取", "获取", "找出", "查找", "识别", "找到"],
|
||||
INTENT_FILL_TABLE: ["填表", "填写", "填充", "录入", "导入到表格", "填写到"],
|
||||
INTENT_SUMMARIZE: ["总结", "摘要", "概括", "概述", "归纳", "提炼"],
|
||||
INTENT_SUMMARIZE: ["总结", "摘要", "概括", "概述", "归纳", "提炼", "分析", "聊聊"],
|
||||
INTENT_QUESTION: ["问答", "回答", "解释", "什么是", "为什么", "如何", "怎样", "多少", "几个"],
|
||||
INTENT_SEARCH: ["搜索", "查找", "检索", "查询", "找"],
|
||||
INTENT_COMPARE: ["对比", "比较", "差异", "区别", "不同"],
|
||||
@@ -47,12 +47,13 @@ class IntentParser:
|
||||
def __init__(self):
|
||||
self.intent_history: List[Dict[str, Any]] = []
|
||||
|
||||
async def parse(self, text: str) -> Tuple[str, Dict[str, Any]]:
|
||||
async def parse(self, text: str, context: Dict[str, Any] = None) -> Tuple[str, Dict[str, Any]]:
|
||||
"""
|
||||
解析自然语言指令
|
||||
|
||||
Args:
|
||||
text: 用户输入的自然语言
|
||||
context: 执行上下文(包含对话历史等)
|
||||
|
||||
Returns:
|
||||
(意图类型, 参数字典)
|
||||
@@ -61,11 +62,17 @@ class IntentParser:
|
||||
if not text:
|
||||
return self.INTENT_UNKNOWN, {}
|
||||
|
||||
# 检查对话历史中的上下文
|
||||
conversation_history = []
|
||||
if context and context.get("conversation_history"):
|
||||
conversation_history = context.get("conversation_history", [])
|
||||
logger.info(f"解析时使用对话历史: {len(conversation_history)} 条消息")
|
||||
|
||||
# 记录历史
|
||||
self.intent_history.append({"text": text, "intent": None})
|
||||
|
||||
# 识别意图
|
||||
intent = self._recognize_intent(text)
|
||||
# 识别意图(考虑对话上下文)
|
||||
intent = self._recognize_intent_with_context(text, conversation_history)
|
||||
|
||||
# 提取参数
|
||||
params = self._extract_params(text, intent)
|
||||
@@ -78,6 +85,42 @@ class IntentParser:
|
||||
|
||||
return intent, params
|
||||
|
||||
def _recognize_intent_with_context(self, text: str, conversation_history: List[Dict[str, Any]]) -> str:
|
||||
"""
|
||||
基于对话历史识别意图
|
||||
|
||||
Args:
|
||||
text: 当前用户输入
|
||||
conversation_history: 对话历史
|
||||
|
||||
Returns:
|
||||
意图类型
|
||||
"""
|
||||
# 如果对话历史为空,使用基础意图识别
|
||||
if not conversation_history:
|
||||
return self._recognize_intent(text)
|
||||
|
||||
# 基于历史上下文进行意图识别
|
||||
# 分析最近的对话了解用户意图的延续性
|
||||
last_intent = None
|
||||
last_topic = None
|
||||
|
||||
for msg in conversation_history[-5:]: # 最多看最近5条消息
|
||||
if msg.get("role") == "assistant":
|
||||
last_intent = msg.get("intent")
|
||||
if msg.get("intent") and msg.get("intent") != "unknown":
|
||||
last_topic = msg.get("intent")
|
||||
|
||||
# 如果当前消息很短(如"继续"、"是的"),可能延续之前的意图
|
||||
short_confirmation = ["是", "是的", "好", "继续", "ok", "好", "接着", "然后", "还有吗"]
|
||||
if text.strip() in short_confirmation or len(text.strip()) <= 3:
|
||||
if last_topic:
|
||||
logger.info(f"简短确认,延续之前的意图: {last_topic}")
|
||||
return last_topic
|
||||
|
||||
# 否则使用标准意图识别
|
||||
return self._recognize_intent(text)
|
||||
|
||||
def _recognize_intent(self, text: str) -> str:
|
||||
"""识别意图类型"""
|
||||
intent_scores: Dict[str, float] = {}
|
||||
@@ -214,18 +257,27 @@ class IntentParser:
|
||||
return template_info if template_info else None
|
||||
|
||||
def _extract_target_fields(self, text: str) -> List[str]:
|
||||
"""提取目标字段"""
|
||||
"""提取目标字段 - 按分隔符切分再逐段清理"""
|
||||
fields = []
|
||||
|
||||
# 匹配 "提取XXX和YYY"、"抽取XXX、YYY"
|
||||
patterns = [
|
||||
r"提取([^(and|,|,)+]+?)(?:和|与|、|,|plus)",
|
||||
r"抽取([^(and|,|,)+]+?)(?:和|与|、|,|plus)",
|
||||
]
|
||||
# 去除提取/抽取前缀
|
||||
cleaned_text = re.sub(r"^(?:提取|抽取)", "", text).strip()
|
||||
|
||||
for pattern in patterns:
|
||||
matches = re.findall(pattern, text)
|
||||
fields.extend([m.strip() for m in matches if m.strip()])
|
||||
# 按'和'、'与'、'、'分割成多段
|
||||
segments = re.split(r"[和与、]", cleaned_text)
|
||||
|
||||
# 常见前缀(这些不是字段名,需要去除)
|
||||
prefixes = ["文档中的", "文档中", "文件中的", "文件中", "内容中的", "内容中"]
|
||||
|
||||
for seg in segments:
|
||||
seg = seg.strip()
|
||||
# 去除常见前缀
|
||||
for p in prefixes:
|
||||
if seg.startswith(p):
|
||||
seg = seg[len(p):]
|
||||
break
|
||||
if seg and 2 <= len(seg) <= 20:
|
||||
fields.append(seg)
|
||||
|
||||
return list(set(fields))
|
||||
|
||||
|
||||
@@ -34,8 +34,8 @@ def setup_logging():
|
||||
# 根日志配置
|
||||
log_level = logging.DEBUG if settings.DEBUG else logging.INFO
|
||||
|
||||
# 日志目录
|
||||
log_dir = Path("data/logs")
|
||||
# 日志目录 (使用 settings.BASE_DIR 确保跨平台一致)
|
||||
log_dir = settings.BASE_DIR / "data" / "logs"
|
||||
log_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# 日志文件路径
|
||||
|
||||
@@ -223,6 +223,177 @@ class ExcelAIService:
|
||||
}
|
||||
}
|
||||
|
||||
async def analyze_excel_file_from_path(
    self,
    file_path: str,
    filename: str,
    user_prompt: str = "",
    analysis_type: str = "general",
    parse_options: Optional[Dict[str, Any]] = None
) -> Dict[str, Any]:
    """
    Parse an Excel file from disk and run the LLM analysis on it.

    Args:
        file_path: Path of the Excel file.
        filename: File name; accepted for interface parity, not read here.
        user_prompt: Custom prompt. When non-blank the template-based analysis
            is used, otherwise the general analysis for ``analysis_type``.
        analysis_type: Analysis type passed to the general analysis.
        parse_options: Extra keyword arguments forwarded to the parser.

    Returns:
        On success: ``success`` True, ``excel`` (``data``, ``metadata``,
        ``saved_path``) and ``analysis`` (the LLM result). If parsing fails:
        ``success`` False, ``error`` and ``analysis`` None. If the LLM step
        raises: the same plus ``excel`` with the parsed ``data``/``metadata``.
    """
    # 1. Parse the Excel file.
    excel_data = None
    parse_result_metadata = None
    try:
        parse_options = parse_options or {}
        parse_result = self.parser.parse(file_path, **parse_options)

        if not parse_result.success:
            return {
                "success": False,
                "error": parse_result.error,
                "analysis": None
            }

        excel_data = parse_result.data
        parse_result_metadata = parse_result.metadata
        logger.info(f"Excel 解析成功: {parse_result_metadata}")

    except Exception as e:
        logger.error(f"Excel 解析失败: {str(e)}")
        return {
            "success": False,
            "error": f"Excel 解析失败: {str(e)}",
            "analysis": None
        }

    # 2. Ask the LLM to analyse the parsed data.
    try:
        if user_prompt and user_prompt.strip():
            llm_result = await self.llm_service.analyze_with_template(
                excel_data,
                user_prompt
            )
        else:
            llm_result = await self.llm_service.analyze_excel_data(
                excel_data,
                user_prompt,
                analysis_type
            )

        # .get() keeps a result without a "success" key from raising inside
        # a log statement and being reported as a failed analysis.
        logger.info(f"AI 分析完成: {llm_result.get('success')}")

        return {
            "success": True,
            "excel": {
                "data": excel_data,
                "metadata": parse_result_metadata,
                "saved_path": file_path
            },
            "analysis": llm_result
        }

    except Exception as e:
        logger.error(f"AI 分析失败: {str(e)}")
        return {
            "success": False,
            "error": f"AI 分析失败: {str(e)}",
            "excel": {
                "data": excel_data,
                "metadata": parse_result_metadata
            },
            "analysis": None
        }
|
||||
|
||||
async def batch_analyze_sheets_from_path(
    self,
    file_path: str,
    filename: str,
    user_prompt: str = "",
    analysis_type: str = "general"
) -> Dict[str, Any]:
    """
    Parse every worksheet of an Excel file from disk and analyse each one.

    Args:
        file_path: Path of the Excel file.
        filename: File name; accepted for interface parity, not read here.
        user_prompt: Custom prompt. When non-blank the template-based analysis
            is used for every sheet, otherwise the general analysis.
        analysis_type: Analysis type passed to the general analysis.

    Returns:
        If parsing fails: ``success`` False, ``error`` and ``analysis`` None.
        Otherwise ``success`` (True only when no sheet failed), ``excel``
        (``sheets``, ``metadata``, ``saved_path``) and ``analysis`` with the
        per-sheet results, ``total_sheets``, ``successful`` and ``errors``
        keyed by sheet name.
    """
    # 1. Parse all worksheets.
    try:
        parse_result = self.parser.parse_all_sheets(file_path)

        if not parse_result.success:
            return {
                "success": False,
                "error": parse_result.error,
                "analysis": None
            }

        sheets_data = parse_result.data.get("sheets", {})
        logger.info(f"Excel 解析成功,共 {len(sheets_data)} 个工作表")

    except Exception as e:
        logger.error(f"Excel 解析失败: {str(e)}")
        return {
            "success": False,
            "error": f"Excel 解析失败: {str(e)}",
            "analysis": None
        }

    # 2. Analyse each worksheet; one failing sheet does not stop the others.
    sheet_analyses = {}
    errors = {}

    for sheet_name, sheet_data in sheets_data.items():
        try:
            if user_prompt and user_prompt.strip():
                llm_result = await self.llm_service.analyze_with_template(
                    sheet_data,
                    user_prompt
                )
            else:
                llm_result = await self.llm_service.analyze_excel_data(
                    sheet_data,
                    user_prompt,
                    analysis_type
                )

            sheet_analyses[sheet_name] = llm_result

            # A result without a truthy "success" counts as a failed sheet.
            if not llm_result.get("success"):
                errors[sheet_name] = llm_result.get("error", "未知错误")

            logger.info(f"工作表 '{sheet_name}' 分析完成")

        except Exception as e:
            logger.error(f"工作表 '{sheet_name}' 分析失败: {str(e)}")
            errors[sheet_name] = str(e)

    # 3. Assemble the result. A sheet whose analysis raised is in ``errors``
    # but not in ``sheet_analyses``, so the success count is derived from the
    # total number of sheets rather than from ``sheet_analyses``.
    return {
        "success": len(errors) == 0,
        "excel": {
            "sheets": sheets_data,
            "metadata": parse_result.metadata,
            "saved_path": file_path
        },
        "analysis": {
            "sheets": sheet_analyses,
            "total_sheets": len(sheets_data),
            "successful": len(sheets_data) - len(errors),
            "errors": errors
        }
    }
|
||||
|
||||
def get_supported_analysis_types(self) -> List[str]:
|
||||
"""获取支持的分析类型"""
|
||||
return [
|
||||
|
||||
@@ -526,9 +526,10 @@ class ExcelStorageService:
|
||||
# 创建表
|
||||
model_class = self._create_table_model(table_name, columns, column_types)
|
||||
|
||||
# 创建表结构
|
||||
# 创建表结构 (使用异步方式)
|
||||
async with self.mysql_db.get_session() as session:
|
||||
model_class.__table__.create(session.bind, checkfirst=True)
|
||||
async with session.bind.begin() as conn:
|
||||
await conn.run_sync(lambda: model_class.__table__.create(checkfirst=True))
|
||||
|
||||
# 插入数据
|
||||
records = []
|
||||
|
||||
@@ -54,15 +54,21 @@ class LLMService:
|
||||
# 添加其他参数
|
||||
payload.update(kwargs)
|
||||
|
||||
import time
|
||||
_start_time = time.time()
|
||||
logger.info(f"🤖 [LLM] 正在调用 DeepSeek API... 模型: {self.model_name}")
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=60.0) as client:
|
||||
async with httpx.AsyncClient(timeout=120.0) as client:
|
||||
response = await client.post(
|
||||
f"{self.base_url}/chat/completions",
|
||||
headers=headers,
|
||||
json=payload
|
||||
)
|
||||
response.raise_for_status()
|
||||
return response.json()
|
||||
result = response.json()
|
||||
_elapsed = time.time() - _start_time
|
||||
logger.info(f"✅ [LLM] DeepSeek API 响应成功 | 模型: {self.model_name} | 耗时: {_elapsed:.2f}s | Token: {result.get('usage', {}).get('total_tokens', 'N/A')}")
|
||||
return result
|
||||
|
||||
except httpx.HTTPStatusError as e:
|
||||
error_detail = e.response.text
|
||||
@@ -78,7 +84,7 @@ class LLMService:
|
||||
pass
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"LLM API 调用异常: {str(e)}")
|
||||
logger.error(f"LLM API 调用异常: {repr(e)} - {str(e)}")
|
||||
raise
|
||||
|
||||
def extract_message_content(self, response: Dict[str, Any]) -> str:
|
||||
@@ -133,6 +139,9 @@ class LLMService:
|
||||
|
||||
payload.update(kwargs)
|
||||
|
||||
import time
|
||||
_start_time = time.time()
|
||||
logger.info(f"🤖 [LLM] 正在调用 DeepSeek API (流式) | 模型: {self.model_name}")
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=120.0) as client:
|
||||
async with client.stream(
|
||||
@@ -141,10 +150,13 @@ class LLMService:
|
||||
headers=headers,
|
||||
json=payload
|
||||
) as response:
|
||||
_elapsed = time.time() - _start_time
|
||||
logger.info(f"✅ [LLM] DeepSeek API 流式响应开始 | 模型: {self.model_name} | 耗时: {_elapsed:.2f}s")
|
||||
async for line in response.aiter_lines():
|
||||
if line.startswith("data: "):
|
||||
data = line[6:] # Remove "data: " prefix
|
||||
if data == "[DONE]":
|
||||
logger.info(f"✅ [LLM] DeepSeek API 流式响应完成")
|
||||
break
|
||||
try:
|
||||
import json as json_module
|
||||
|
||||
403
backend/app/services/pdf_converter_service.py
Normal file
403
backend/app/services/pdf_converter_service.py
Normal file
@@ -0,0 +1,403 @@
|
||||
"""
|
||||
PDF 转换服务
|
||||
|
||||
支持将 Word(docx)、Excel(xlsx)、Txt、Markdown(md) 格式转换为 PDF
|
||||
策略:所有格式先转为 Markdown,再通过 Markdown 转 PDF
|
||||
"""
|
||||
import html
import io
import logging
import platform
import re
import traceback
import zipfile
from pathlib import Path
from typing import List, Optional, Tuple
from xml.sax.saxutils import escape

from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.enums import TA_LEFT, TA_CENTER, TA_JUSTIFY
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class PDFConverterService:
    """Convert Word (docx), Excel (xlsx), plain-text and Markdown files to PDF.

    Every supported input is first normalised to a small Markdown subset
    (headings, bullets, fenced code, pipe tables, plain paragraphs); that
    Markdown is then laid out as a PDF with ReportLab.

    All conversion methods report failure through their return value: they
    return ``(result, error)`` where ``error`` is an empty string on success.
    """

    # Page margin, in points, applied to all four sides of the page.
    _PAGE_MARGIN = 72
    # Upper bound on the width of a single table column, in points.
    _MAX_COLUMN_WIDTH = 100
    # Only the first N non-empty text lines of a DOCX are converted.
    _MAX_DOCX_LINES = 500
    # Only the first N sheets, and the first N rows of each, are converted.
    _MAX_SHEETS = 10
    _MAX_SHEET_ROWS = 50
    # Markdown heading: one to six '#', whitespace, then the heading text.
    _HEADING_RE = re.compile(r'^#{1,6}\s+(.*)$')

    def __init__(self):
        """Set up the format list and eagerly register fonts and styles."""
        self.supported_formats = ["docx", "xlsx", "txt", "md"]
        self._font_name = None
        self._styles = None
        self._page_width = None
        self._page_height = None
        self._setup_fonts()

    def _setup_fonts(self):
        """Register a Chinese-capable font if one is installed and build the styles.

        When no font file is found, or registering it fails, Helvetica is used
        instead (it cannot render Chinese glyphs).

        Raises:
            Exception: any error raised while building the style sheet is
                logged and re-raised.
        """
        try:
            self._page_width, self._page_height = A4

            font_path = self._find_chinese_font()
            if font_path:
                try:
                    pdfmetrics.registerFont(TTFont('ChineseFont', font_path))
                    pdfmetrics.registerFontFamily('ChineseFont', normal='ChineseFont')
                    self._font_name = 'ChineseFont'
                    logger.info(f"成功注册中文字体: {font_path}")
                except Exception as e:
                    logger.warning(f"字体注册失败: {e}, 使用Helvetica")
                    self._font_name = 'Helvetica'
            else:
                self._font_name = 'Helvetica'
                logger.warning("未找到中文字体,使用 Helvetica(不支持中文)")

            styles = getSampleStyleSheet()
            styles.add(ParagraphStyle(
                name='ChineseTitle',
                fontName=self._font_name,
                fontSize=16,
                leading=22,
                alignment=TA_CENTER,
                spaceAfter=12,
            ))
            styles.add(ParagraphStyle(
                name='ChineseHeading',
                fontName=self._font_name,
                fontSize=14,
                leading=20,
                spaceBefore=10,
                spaceAfter=8,
            ))
            styles.add(ParagraphStyle(
                name='ChineseBody',
                fontName=self._font_name,
                fontSize=10,
                leading=14,
                alignment=TA_JUSTIFY,
                spaceAfter=6,
            ))
            # NOTE(review): code blocks use Courier, so Chinese text inside a
            # fenced block presumably will not render - confirm if that matters.
            styles.add(ParagraphStyle(
                name='ChineseCode',
                fontName='Courier',
                fontSize=9,
                leading=12,
            ))

            self._styles = styles
            logger.info("PDF服务初始化完成")

        except Exception as e:
            logger.error(f"PDF服务初始化失败: {e}")
            raise

    def _find_chinese_font(self) -> Optional[str]:
        """Return the path of the first known Chinese font file that exists.

        The candidate list depends on ``platform.system()``.

        Returns:
            The font path, or ``None`` when none of the candidates exist.
        """
        candidates_by_system = {
            "Windows": [
                "C:/Windows/Fonts/simhei.ttf",
                "C:/Windows/Fonts/simsun.ttc",
                "C:/Windows/Fonts/msyh.ttc",
                "C:/Windows/Fonts/simsun.ttf",
            ],
            "Darwin": [
                "/System/Library/Fonts/STHeiti Light.ttc",
                "/System/Library/Fonts/PingFang.ttc",
                "/Library/Fonts/Arial Unicode.ttf",
            ],
        }
        # Any other system is treated as Linux.
        linux_candidates = [
            "/usr/share/fonts/truetype/wqy/wqy-microhei.ttc",
            "/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc",
        ]

        for font in candidates_by_system.get(platform.system(), linux_candidates):
            if Path(font).exists():
                return font
        return None

    def _sanitize_text(self, text: str) -> str:
        """Return ``text`` without NUL characters; falsy input becomes ``""``."""
        if not text:
            return ""
        return text.replace('\x00', '')

    def _escape_for_paragraph(self, text: str) -> str:
        """Sanitise ``text`` and escape ``&``, ``<`` and ``>`` for a ``Paragraph``.

        ReportLab parses paragraph text as XML-like markup, so literal markup
        characters coming from user documents must be escaped first.
        """
        return escape(self._sanitize_text(text))

    async def convert_to_pdf(
        self,
        file_content: bytes,
        source_format: str,
        filename: str = "document"
    ) -> Tuple[bytes, str]:
        """Convert a document to PDF.

        Args:
            file_content: Raw bytes of the source file.
            source_format: One of ``supported_formats`` (case-insensitive).
            filename: Name used as the PDF title and the Markdown heading.

        Returns:
            ``(pdf_bytes, "")`` on success, ``(b"", error_message)`` on failure.
        """
        try:
            if source_format.lower() not in self.supported_formats:
                return b"", f"不支持的格式: {source_format}"

            # Step 1: normalise the input to Markdown.
            markdown_content, error = await self._convert_to_markdown(file_content, source_format, filename)
            if error:
                return b"", error

            # Step 2: render the Markdown as PDF.
            return await self._convert_markdown_to_pdf(markdown_content, filename)

        except Exception as e:
            logger.error(f"PDF转换失败: {e}")
            logger.error(f"详细错误: {traceback.format_exc()}")
            return b"", f"转换失败: {str(e)}"

    async def _convert_to_markdown(
        self,
        file_content: bytes,
        source_format: str,
        filename: str
    ) -> Tuple[str, str]:
        """Dispatch to the Markdown converter for ``source_format``.

        Returns:
            ``(markdown, "")`` on success, ``("", error_message)`` on failure,
            including when ``source_format`` has no converter.
        """
        converters = {
            "docx": self._convert_docx_to_markdown,
            "xlsx": self._convert_xlsx_to_markdown,
            "txt": self._convert_txt_to_markdown,
            "md": self._convert_md_to_markdown,
        }
        converter = converters.get(source_format.lower())
        if converter is None:
            return "", f"不支持的格式: {source_format}"
        return await converter(file_content, filename)

    async def _convert_txt_to_markdown(self, file_content: bytes, filename: str) -> Tuple[str, str]:
        """Decode a text file and prefix it with a ``# filename`` heading."""
        try:
            text = self._sanitize_text(self._decode_content(file_content))
            return f"# {filename}\n\n{text}", ""
        except Exception as e:
            logger.error(f"Txt转Markdown失败: {e}")
            return "", f"文本文件处理失败: {str(e)}"

    async def _convert_md_to_markdown(self, file_content: bytes, filename: str) -> Tuple[str, str]:
        """Decode a Markdown file and prefix it with a ``# filename`` heading.

        The body is otherwise passed through unchanged apart from NUL removal.
        """
        try:
            content = self._sanitize_text(self._decode_content(file_content))
            return f"# {filename}\n\n{content}", ""
        except Exception as e:
            logger.error(f"Markdown处理失败: {e}")
            return "", f"Markdown处理失败: {str(e)}"

    async def _convert_docx_to_markdown(self, file_content: bytes, filename: str) -> Tuple[str, str]:
        """Extract the text of a DOCX file as Markdown lines.

        The archive is read with ``zipfile`` and ``word/document.xml`` is
        reduced to plain text with regular expressions, so documents that a
        strict DOCX parser would reject can still be converted. Formatting and
        table structure are not preserved; each paragraph becomes one line, and
        at most ``_MAX_DOCX_LINES`` non-empty lines are kept.

        Returns:
            ``(markdown, "")`` on success, ``("", error_message)`` on failure.
        """
        try:
            try:
                with zipfile.ZipFile(io.BytesIO(file_content), 'r') as zf:
                    xml_content = zf.read('word/document.xml').decode('utf-8')
            except zipfile.BadZipFile:
                return "", "文件不是有效的 DOCX 格式"
            except KeyError:
                return "", "DOCX 文件损坏:找不到 document.xml"

            # Line breaks and paragraph ends become newlines, then every
            # remaining tag is dropped.
            xml_content = re.sub(r'<w:br[^>]*>', '\n', xml_content)
            xml_content = re.sub(r'</w:p>', '\n', xml_content)
            xml_content = re.sub(r'<[^>]+>', '', xml_content)

            # Decode entities only after the tags are gone, and in a single
            # pass, so an escaped "&amp;lt;" stays "&lt;" rather than turning
            # into "<".
            xml_content = html.unescape(xml_content)

            text_lines = [line.strip() for line in xml_content.split('\n') if line.strip()]
            if len(text_lines) > self._MAX_DOCX_LINES:
                logger.warning(
                    f"DOCX content truncated to {self._MAX_DOCX_LINES} of {len(text_lines)} lines"
                )

            lines = [f"# {filename}", ""]
            lines.extend(text_lines[:self._MAX_DOCX_LINES])
            return '\n'.join(lines), ""

        except Exception as e:
            logger.error(f"Word转Markdown失败: {e}")
            logger.error(traceback.format_exc())
            return "", f"Word文档处理失败: {str(e)}"

    async def _convert_xlsx_to_markdown(self, file_content: bytes, filename: str) -> Tuple[str, str]:
        """Render the first sheets of a workbook as Markdown pipe tables.

        At most ``_MAX_SHEETS`` sheets and ``_MAX_SHEET_ROWS`` rows per sheet
        are converted. Rows whose cells are all empty are skipped, and the
        first non-empty row of each sheet is treated as the table header.

        Returns:
            ``(markdown, "")`` on success, ``("", error_message)`` on failure.
        """
        try:
            import openpyxl

            # Read-only mode streams rows instead of loading the whole
            # workbook; it requires an explicit close().
            wb = openpyxl.load_workbook(io.BytesIO(file_content), read_only=True)
            try:
                lines = [f"# {filename} - Excel数据", ""]

                for sheet_name in wb.sheetnames[:self._MAX_SHEETS]:
                    ws = wb[sheet_name]
                    lines.append(f"## 工作表: {sheet_name}")
                    lines.append("")

                    header_written = False
                    for row in ws.iter_rows(max_row=self._MAX_SHEET_ROWS, values_only=True):
                        # A Markdown table row must stay on one line.
                        row_data = [
                            str(cell).replace('\r', ' ').replace('\n', ' ') if cell is not None else ""
                            for cell in row
                        ]
                        if not any(row_data):
                            continue
                        lines.append("| " + " | ".join(row_data) + " |")
                        # Tie the separator to the first emitted row, not to
                        # row index 0, which may have been skipped as empty.
                        if not header_written:
                            lines.append("| " + " | ".join(["---"] * len(row_data)) + " |")
                            header_written = True

                    lines.append("")
            finally:
                wb.close()

            return "\n".join(lines), ""

        except Exception as e:
            logger.error(f"Excel转Markdown失败: {e}")
            return "", f"Excel处理失败: {str(e)}"

    def _parse_table_rows(self, table_lines: List[str]) -> List[List[str]]:
        """Turn consecutive ``|``-prefixed lines into rectangular cell rows.

        Separator rows (only ``-``, ``:`` and spaces) and rows with no content
        are dropped. Shorter rows are padded with empty cells so that every
        row has the same number of columns.
        """
        rows = []
        for line in table_lines:
            inner = line[1:]
            if inner.endswith('|'):
                inner = inner[:-1]
            cells = [cell.strip() for cell in inner.split('|')]
            if not ''.join(cells).strip('-: '):
                continue
            rows.append(cells)

        width = max((len(row) for row in rows), default=0)
        return [row + [''] * (width - len(row)) for row in rows]

    def _parse_markdown_blocks(self, markdown_content: str) -> List[Tuple[str, object]]:
        """Split Markdown into ``(kind, payload)`` layout blocks.

        Kinds are ``'spacer'`` (payload ``None``), ``'code'``, ``'heading'``,
        ``'bullet'`` and ``'body'`` (payload is the text), and ``'table'``
        (payload is a list of equally long cell rows). Consecutive table lines
        are grouped into a single ``'table'`` block.
        """
        lines = [raw.strip() for raw in markdown_content.split('\n')]
        blocks = []
        in_code = False
        index = 0

        while index < len(lines):
            line = lines[index]
            index += 1

            if line.startswith('```'):
                in_code = not in_code
                blocks.append(('spacer', None))
                continue
            if in_code:
                blocks.append(('code', line))
                continue
            if not line:
                blocks.append(('spacer', None))
                continue

            heading = self._HEADING_RE.match(line)
            if heading:
                blocks.append(('heading', heading.group(1)))
            elif line.startswith('- ') or line.startswith('* '):
                blocks.append(('bullet', "• " + line[2:]))
            elif line.startswith('|'):
                # Consume every following table line so the whole table
                # becomes one block instead of one block per row.
                table_lines = [line]
                while index < len(lines) and lines[index].startswith('|'):
                    table_lines.append(lines[index])
                    index += 1
                rows = self._parse_table_rows(table_lines)
                if rows:
                    blocks.append(('table', rows))
            else:
                blocks.append(('body', line))

        return blocks

    def _build_table(self, rows: List[List[str]]):
        """Build a styled ReportLab ``Table`` whose columns fit inside the margins."""
        column_count = len(rows[0])
        usable_width = self._page_width - 2 * self._PAGE_MARGIN
        column_width = min(self._MAX_COLUMN_WIDTH, usable_width / column_count)

        table = Table(rows, colWidths=[column_width] * column_count)
        table.setStyle(TableStyle([
            ('FONTNAME', (0, 0), (-1, -1), self._font_name),
            ('FONTSIZE', (0, 0), (-1, -1), 9),
            ('GRID', (0, 0), (-1, -1), 0.5, '#999999'),
            ('BACKGROUND', (0, 0), (-1, 0), '#4472C4'),
            ('TEXTCOLOR', (0, 0), (-1, 0), '#FFFFFF'),
        ]))
        return table

    async def _convert_markdown_to_pdf(self, markdown_content: str, filename: str) -> Tuple[bytes, str]:
        """Lay out Markdown as an A4 PDF, with ``filename`` as the title.

        Returns:
            ``(pdf_bytes, "")`` on success, ``(b"", error_message)`` on failure.
        """
        try:
            logger.info(f"Markdown转PDF开始 - filename={filename}, 字体={self._font_name}")

            style_for_kind = {
                'heading': 'ChineseHeading',
                'bullet': 'ChineseBody',
                'body': 'ChineseBody',
                'code': 'ChineseCode',
            }

            story = [
                Paragraph(text=self._escape_for_paragraph(filename), style=self._styles['ChineseTitle']),
                Spacer(1, 12),
            ]

            for kind, payload in self._parse_markdown_blocks(markdown_content):
                if kind == 'spacer':
                    story.append(Spacer(1, 6))
                elif kind == 'table':
                    story.append(self._build_table(payload))
                    story.append(Spacer(1, 6))
                else:
                    story.append(Paragraph(
                        text=self._escape_for_paragraph(payload),
                        style=self._styles[style_for_kind[kind]],
                    ))

            logger.info(f"准备构建PDF,story长度={len(story)}")

            with io.BytesIO() as buffer:
                pdf_doc = SimpleDocTemplate(
                    buffer,
                    pagesize=(self._page_width, self._page_height),
                    rightMargin=self._PAGE_MARGIN,
                    leftMargin=self._PAGE_MARGIN,
                    topMargin=self._PAGE_MARGIN,
                    bottomMargin=self._PAGE_MARGIN
                )
                logger.info("调用pdf_doc.build()")
                pdf_doc.build(story)
                logger.info("pdf_doc.build()完成")
                return buffer.getvalue(), ""

        except Exception as e:
            logger.error(f"Markdown转PDF失败: {e}")
            logger.error(f"详细错误: {traceback.format_exc()}")
            return b"", f"Markdown转PDF失败: {str(e)}"

    def _decode_content(self, file_content: bytes) -> str:
        """Decode bytes by trying UTF-8 (BOM-aware), then the GB family, then Latin-1.

        ``utf-8-sig`` is used so a leading byte-order mark does not end up in
        the text. Latin-1 accepts every byte sequence, so decoding never fails.
        """
        encodings = ['utf-8-sig', 'gbk', 'gb2312', 'gb18030', 'latin-1']
        for enc in encodings:
            try:
                return file_content.decode(enc)
            except (UnicodeDecodeError, LookupError):
                continue
        # Only reachable if the latin-1 codec itself is unavailable.
        return file_content.decode('utf-8', errors='replace')

    def get_supported_formats(self) -> List[str]:
        """Return the list of source formats this service can convert."""
        return self.supported_formats
|
||||
|
||||
|
||||
# 全局单例
|
||||
# Shared module-level instance. Constructing it runs _setup_fonts(), so font
# lookup/registration and style creation happen when this module is imported.
pdf_converter_service = PDFConverterService()
|
||||
@@ -165,9 +165,9 @@ class BM25:
|
||||
class RAGService:
|
||||
"""RAG 检索增强服务"""
|
||||
|
||||
# 默认分块参数
|
||||
DEFAULT_CHUNK_SIZE = 500 # 每个文本块的大小(字符数)
|
||||
DEFAULT_CHUNK_OVERLAP = 50 # 块之间的重叠(字符数)
|
||||
# 默认分块参数 - 增大块大小减少embedding次数
|
||||
DEFAULT_CHUNK_SIZE = 1000 # 每个文本块的大小(字符数),增大以提升速度
|
||||
DEFAULT_CHUNK_OVERLAP = 100 # 块之间的重叠(字符数)
|
||||
|
||||
def __init__(self):
|
||||
self.embedding_model = None
|
||||
@@ -389,6 +389,70 @@ class RAGService:
|
||||
self._add_documents(documents, chunk_ids)
|
||||
logger.info(f"已索引文档 {doc_id},共 {len(chunks)} 个块")
|
||||
|
||||
async def index_document_content_async(
    self,
    doc_id: str,
    content: str,
    metadata: Optional[Dict[str, Any]] = None,
    chunk_size: int = None,
    chunk_overlap: int = None
):
    """
    Split a document into chunks and index them without blocking on the embedding step.

    The chunks are handed to ``_add_documents`` through ``asyncio.to_thread``.
    Vector-store initialisation and chunk splitting still run on the calling
    task before that hand-off.

    Args:
        doc_id: Identifier of the document; chunk ids are ``{doc_id}_chunk_{i}``.
        content: Full text to split and index.
        metadata: Optional metadata copied onto every chunk.
        chunk_size: Chunk length; defaults to ``DEFAULT_CHUNK_SIZE``.
        chunk_overlap: Overlap between chunks; defaults to ``DEFAULT_CHUNK_OVERLAP``.

    Returns:
        None. Nothing is indexed when RAG is disabled, when no embedding model
        is available, or when splitting yields no chunks.
    """
    import asyncio

    if self._disabled:
        logger.info(f"[RAG DISABLED] 文档索引操作已跳过: {doc_id}")
        return

    if not self._initialized:
        self._init_vector_store()

    if self.embedding_model is None:
        logger.debug(f"文档跳过索引 (无嵌入模型): {doc_id}")
        return

    # Fall back to the class defaults for any parameter left as None.
    size = self.DEFAULT_CHUNK_SIZE if chunk_size is None else chunk_size
    overlap = self.DEFAULT_CHUNK_OVERLAP if chunk_overlap is None else chunk_overlap

    pieces = self._split_into_chunks(content, size, overlap)
    if not pieces:
        logger.warning(f"文档内容为空,跳过索引: {doc_id}")
        return

    total = len(pieces)

    def _metadata_for(position: int) -> Dict[str, Any]:
        # Each chunk gets its own copy so chunks never share a metadata dict.
        chunk_metadata = metadata.copy() if metadata else {}
        chunk_metadata.update({
            "chunk_index": position,
            "total_chunks": total,
            "doc_id": doc_id
        })
        return chunk_metadata

    documents = [
        SimpleDocument(page_content=piece, metadata=_metadata_for(position))
        for position, piece in enumerate(pieces)
    ]
    chunk_ids = [f"{doc_id}_chunk_{position}" for position in range(total)]

    # The embedding work inside _add_documents runs in a worker thread.
    await asyncio.to_thread(self._add_documents, documents, chunk_ids)
    logger.info(f"已异步索引文档 {doc_id},共 {len(pieces)} 个块")
|
||||
|
||||
def _add_documents(self, documents: List[SimpleDocument], doc_ids: List[str]):
|
||||
"""批量添加文档到向量索引"""
|
||||
if not documents:
|
||||
@@ -605,7 +669,7 @@ class RAGService:
|
||||
# 按融合分数降序排序
|
||||
fused_results.sort(key=lambda x: x["score"], reverse=True)
|
||||
|
||||
logger.debug(f"混合融合: {len(fused_results)} 个文档, 向量:{len(vector_results)}, BM25:{len(bm25_results)}")
|
||||
logger.info(f"RRF 混合融合: {len(fused_results)} 个文档参与融合, 向量检索命中:{len(vector_results)}, BM25命中:{len(bm25_results)}")
|
||||
|
||||
return fused_results[:top_k]
|
||||
|
||||
|
||||
@@ -300,13 +300,15 @@ class TableRAGService:
|
||||
filename: str,
|
||||
sheet_name: Optional[str] = None,
|
||||
header_row: int = 0,
|
||||
sample_size: int = 10
|
||||
sample_size: int = 10,
|
||||
skip_rag_index: bool = False
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
为 Excel 表构建完整的 RAG 索引
|
||||
|
||||
流程:
|
||||
1. 读取 Excel 获取字段信息
|
||||
2. 如果 skip_rag_index=True,跳过 RAG 索引,直接存 MySQL
|
||||
2. AI 生成每个字段的语义描述
|
||||
3. 将字段描述存入向量数据库
|
||||
|
||||
@@ -367,6 +369,20 @@ class TableRAGService:
|
||||
results["field_count"] = len(df.columns)
|
||||
logger.info(f"表名: {table_name}, 字段数: {len(df.columns)}")
|
||||
|
||||
# 跳过 RAG 索引时直接存 MySQL
|
||||
if skip_rag_index:
|
||||
logger.info(f"跳过 RAG 索引,直接存储到 MySQL")
|
||||
store_result = await self.excel_storage.store_excel(
|
||||
file_path=file_path,
|
||||
filename=filename,
|
||||
sheet_name=sheet_name,
|
||||
header_row=header_row
|
||||
)
|
||||
results["mysql_table"] = store_result.get("table_name") if store_result.get("success") else None
|
||||
results["row_count"] = store_result.get("row_count", len(df))
|
||||
results["indexed_count"] = 0
|
||||
return results
|
||||
|
||||
# 3. 初始化 RAG (如果需要)
|
||||
if not self.rag._initialized:
|
||||
self.rag._init_vector_store()
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
353
backend/app/services/txt_ai_service.py
Normal file
353
backend/app/services/txt_ai_service.py
Normal file
@@ -0,0 +1,353 @@
|
||||
"""
|
||||
TXT 文档 AI 分析服务
|
||||
|
||||
使用 LLM 对 TXT 文本文件进行深度分析,提取结构化数据并生成可视化图表
|
||||
"""
|
||||
import logging
|
||||
import re
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from app.services.llm_service import llm_service
|
||||
from app.services.visualization_service import visualization_service
|
||||
from app.core.document_parser.txt_parser import TxtParser
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TxtAIService:
    """LLM-backed analysis of plain-text documents.

    Offers two analyses: extraction of structured data (tables, key/value
    pairs, list items, summary) and extraction of tabular data that is then
    passed to the visualization service to produce charts.

    Every public method returns a dict with a boolean ``"success"`` key and,
    on failure, an ``"error"`` message; exceptions are caught and reported
    that way rather than propagated.
    """

    # Longest prefix of a document, in characters, that is put into a prompt.
    _MAX_CONTENT_LEN = 8000
    # A comma that directly precedes a closing brace or bracket.
    _TRAILING_COMMA_RE = re.compile(r',\s*([}\]])')

    def __init__(self):
        self.parser = TxtParser()
        self.llm = llm_service

    async def analyze_txt_with_ai(
        self,
        content: str,
        filename: str = "",
        user_hint: str = "",
        analysis_type: str = "structured"
    ) -> Dict[str, Any]:
        """
        Analyse the text of a TXT file with the LLM.

        Args:
            content: Text content of the file.
            filename: File name (optional).
            user_hint: Free-form instructions from the user.
            analysis_type: ``"charts"`` to generate charts; any other value
                (default ``"structured"``) extracts structured data.

        Returns:
            Dict: the analysis result; ``success`` is False with an ``error``
            message when the content is blank or the analysis fails.
        """
        try:
            if not content or not content.strip():
                return {
                    "success": False,
                    "error": "文档内容为空"
                }

            if analysis_type == "charts":
                return await self.generate_charts(content, filename, user_hint)

            return await self._extract_structured_data(content, filename, user_hint)

        except Exception as e:
            logger.error(f"TXT AI 分析失败: {str(e)}")
            return {
                "success": False,
                "error": str(e)
            }

    async def _extract_structured_data(
        self,
        content: str,
        filename: str = "",
        user_hint: str = ""
    ) -> Dict[str, Any]:
        """
        Ask the LLM to extract structured data from the text.

        Only the first ``_MAX_CONTENT_LEN`` characters are sent. When the
        reply cannot be parsed as JSON, a successful result of type
        ``"text"`` carrying the first 500 characters is returned instead.

        Args:
            content: Text content.
            filename: File name (currently not included in the prompt).
            user_hint: Free-form instructions; a generic request is used when empty.

        Returns:
            Dict with ``success`` and either the extracted fields
            (``type``, ``tables``, ``key_values``, ``list_items``,
            ``summary``) or an ``error`` message.
        """
        try:
            text_preview = content[:self._MAX_CONTENT_LEN]
            if len(content) > self._MAX_CONTENT_LEN:
                scope_label = "前" + str(self._MAX_CONTENT_LEN) + "字符,仅显示部分"
            else:
                scope_label = "全文"
            requirement = user_hint if user_hint else "请提取文档中的所有结构化数据,包括表格数据、键值对、列表项等。"

            prompt = f"""你是一个专业的数据提取专家。请从以下文本中提取结构化数据。

【用户需求】
{requirement}

【文档内容】({scope_label})
{text_preview}

请按照以下 JSON 格式输出:
{{
    "type": "structured_text",
    "tables": [{{"headers": [...], "rows": [...]}}],
    "key_values": {{"键1": "值1", "键2": "值2", ...}},
    "list_items": ["项1", "项2", ...],
    "summary": "文档内容摘要"
}}

重点:
- 如果文档包含表格数据(制表符、空格对齐等),提取到 tables 中
- 如果文档包含键值对(如 名称: 张三),提取到 key_values 中
- 如果文档包含列表项,提取到 list_items 中
- 如果无法提取到结构化数据,至少提供一个详细的摘要
"""

            messages = [
                {"role": "system", "content": "你是一个专业的数据提取助手。请严格按JSON格式输出。"},
                {"role": "user", "content": prompt}
            ]

            response = await self.llm.chat(
                messages=messages,
                temperature=0.1,
                max_tokens=8000
            )

            content_text = self.llm.extract_message_content(response)
            result = self._parse_json_response(content_text)

            if not result:
                # No usable JSON: fall back to a plain-text preview.
                return {
                    "success": True,
                    "type": "text",
                    "summary": text_preview[:500],
                    "raw_text_preview": text_preview[:500]
                }

            logger.info(f"TXT 结构化数据提取成功: type={result.get('type')}")
            return {
                "success": True,
                "type": result.get("type", "structured_text"),
                "tables": result.get("tables", []),
                "key_values": result.get("key_values", {}),
                "list_items": result.get("list_items", []),
                "summary": result.get("summary", "")
            }

        except Exception as e:
            logger.error(f"TXT 结构化数据提取失败: {str(e)}")
            return {
                "success": False,
                "error": str(e)
            }

    async def generate_charts(
        self,
        content: str,
        filename: str = "",
        user_hint: str = ""
    ) -> Dict[str, Any]:
        """
        Extract tabular data from the text with the LLM and generate charts.

        Only the first ``_MAX_CONTENT_LEN`` characters are sent, and only the
        first table in the LLM reply is visualised.

        Args:
            content: Text content.
            filename: File name, given to the LLM as the document title.
            user_hint: Free-form instructions; added to the prompt when non-blank.

        Returns:
            Dict with ``success`` and, on success, the charts and statistics
            from the visualization service plus the LLM's ``key_statistics``,
            ``chart_suggestions`` and ``table_description``; otherwise an
            ``error`` message.
        """
        try:
            text_preview = content[:self._MAX_CONTENT_LEN]

            # The hint section is empty when no hint is given, which leaves
            # the prompt exactly as it is without a hint.
            hint = user_hint.strip() if user_hint else ""
            hint_section = f"\n用户需求:{hint}\n" if hint else ""

            prompt = f"""你是一个专业的数据可视化助手。请从以下文本中提取可用于可视化的数据。

文档标题:{filename}
{hint_section}
文档内容:
{text_preview}

请完成以下任务:
1. 识别文本中的表格数据(制表符分隔、空格对齐的表格等)
2. 识别文本中的关键统计数据(百分比、数量、趋势等)
3. 识别可用于比较的分类数据

请用 JSON 格式返回以下结构的数据(如果没有表格数据,返回空结构):
{{
    "tables": [
        {{
            "description": "表格的描述",
            "columns": ["列名1", "列名2", ...],
            "rows": [
                ["值1", "值2", ...],
                ["值1", "值2", ...]
            ]
        }}
    ],
    "key_statistics": [
        {{
            "name": "指标名称",
            "value": "数值",
            "trend": "增长/下降/持平",
            "description": "指标说明"
        }}
    ],
    "chart_suggestions": [
        {{
            "chart_type": "bar/line/pie",
            "title": "图表标题",
            "data_source": "数据来源说明"
        }}
    ]
}}

如果没有表格数据,返回空结构:{{"tables": [], "key_statistics": [], "chart_suggestions": []}}
请确保返回的是合法的 JSON 格式。"""

            messages = [
                {"role": "system", "content": "你是一个专业的数据可视化助手,擅长从文本中提取数据并生成图表。"},
                {"role": "user", "content": prompt}
            ]

            response = await self.llm.chat(
                messages=messages,
                temperature=0.1,
                max_tokens=8000
            )

            content_text = self.llm.extract_message_content(response)
            chart_data = self._parse_json_response(content_text)

            if not chart_data:
                return {
                    "success": False,
                    "error": "无法从文本中提取有效的数据结构"
                }

            tables = chart_data.get("tables", [])
            key_statistics = chart_data.get("key_statistics", [])

            if not tables:
                return {
                    "success": False,
                    "error": "文档中没有可用于图表的表格数据",
                    "key_statistics": key_statistics,
                    "chart_suggestions": chart_data.get("chart_suggestions", [])
                }

            # Only the first table is charted.
            first_table = tables[0]
            columns = first_table.get("columns", [])
            rows = first_table.get("rows", [])

            if not columns or not rows:
                return {
                    "success": False,
                    "error": "表格数据为空"
                }

            viz_data = {
                "columns": columns,
                "rows": rows
            }

            logger.info(f"开始生成图表,列数: {len(columns)}, 行数: {len(rows)}")
            vis_result = visualization_service.analyze_and_visualize(viz_data)

            if not vis_result.get("success"):
                return {
                    "success": False,
                    "error": vis_result.get("error", "可视化生成失败"),
                    "key_statistics": key_statistics
                }

            return {
                "success": True,
                "charts": vis_result.get("charts", {}),
                "statistics": vis_result.get("statistics", {}),
                "distributions": vis_result.get("distributions", {}),
                "row_count": vis_result.get("row_count", 0),
                "column_count": vis_result.get("column_count", 0),
                "key_statistics": key_statistics,
                "chart_suggestions": chart_data.get("chart_suggestions", []),
                "table_description": first_table.get("description", "")
            }

        except Exception as e:
            logger.error(f"TXT 图表生成失败: {str(e)}")
            return {
                "success": False,
                "error": str(e)
            }

    def _parse_json_response(self, content: str) -> Optional[Dict]:
        """Parse the first JSON object found in an LLM reply.

        Text before the first ``{`` (such as a Markdown code fence) and any
        text after the end of the object are ignored. If the object does not
        parse as-is, one repair is attempted: commas directly before ``}`` or
        ``]`` are removed.

        Returns:
            The parsed object, or ``None`` when the reply is empty, contains
            no ``{``, or still fails to parse after the repair.
        """
        if not content:
            return None

        import json

        cleaned = content.strip()
        json_start = cleaned.find('{')
        if json_start == -1:
            logger.warning("无法找到 JSON 开始位置")
            return None

        json_text = cleaned[json_start:]
        decoder = json.JSONDecoder()
        last_error = None

        # raw_decode stops at the end of the first complete value, so trailing
        # text and braces inside string values do not confuse it.
        for candidate in (json_text, self._TRAILING_COMMA_RE.sub(r'\1', json_text)):
            try:
                parsed, _ = decoder.raw_decode(candidate)
                return parsed
            except json.JSONDecodeError as e:
                last_error = e

        logger.warning(f"JSON 修复失败: {last_error}")
        return None
|
||||
|
||||
|
||||
# 全局单例
|
||||
# Shared module-level instance; constructing it creates a TxtParser and binds
# the shared llm_service.
txt_ai_service = TxtAIService()
|
||||
@@ -53,7 +53,11 @@ class VisualizationService:
|
||||
}
|
||||
|
||||
# 转换为 DataFrame
|
||||
df = pd.DataFrame(rows, columns=columns)
|
||||
# 过滤掉行数与列数不匹配的数据
|
||||
valid_rows = [row for row in rows if len(row) == len(columns)]
|
||||
if len(valid_rows) < len(rows):
|
||||
logger.warning(f"过滤了 {len(rows) - len(valid_rows)} 行无效数据(列数不匹配)")
|
||||
df = pd.DataFrame(valid_rows, columns=columns)
|
||||
|
||||
# 根据列类型分类
|
||||
numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()
|
||||
@@ -141,18 +145,18 @@ class VisualizationService:
|
||||
charts = {}
|
||||
|
||||
# 1. 数值型列的直方图
|
||||
charts["histograms"] = []
|
||||
charts["numeric_charts"] = []
|
||||
for col in numeric_columns[:5]: # 限制最多 5 个数值列
|
||||
chart_data = self._create_histogram(df[col], col)
|
||||
if chart_data:
|
||||
charts["histograms"].append(chart_data)
|
||||
charts["numeric_charts"].append(chart_data)
|
||||
|
||||
# 2. 分类型列的条形图
|
||||
charts["bar_charts"] = []
|
||||
charts["categorical_charts"] = []
|
||||
for col in categorical_columns[:5]: # 限制最多 5 个分类型列
|
||||
chart_data = self._create_bar_chart(df[col], col)
|
||||
if chart_data:
|
||||
charts["bar_charts"].append(chart_data)
|
||||
charts["categorical_charts"].append(chart_data)
|
||||
|
||||
# 3. 数值型列的箱线图
|
||||
charts["box_plots"] = []
|
||||
|
||||
@@ -8,6 +8,7 @@ from typing import Dict, Any, List, Optional
|
||||
import json
|
||||
|
||||
from app.services.llm_service import llm_service
|
||||
from app.services.visualization_service import visualization_service
|
||||
from app.core.document_parser.docx_parser import DocxParser
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -183,7 +184,7 @@ class WordAIService:
|
||||
response = await self.llm.chat(
|
||||
messages=messages,
|
||||
temperature=0.1,
|
||||
max_tokens=50000
|
||||
max_tokens=8000
|
||||
)
|
||||
|
||||
content = self.llm.extract_message_content(response)
|
||||
@@ -275,7 +276,7 @@ class WordAIService:
|
||||
response = await self.llm.chat(
|
||||
messages=messages,
|
||||
temperature=0.1,
|
||||
max_tokens=50000
|
||||
max_tokens=8000
|
||||
)
|
||||
|
||||
content = self.llm.extract_message_content(response)
|
||||
@@ -634,6 +635,281 @@ class WordAIService:
|
||||
|
||||
return values
|
||||
|
||||
async def generate_charts(
    self,
    file_path: str,
    user_hint: str = ""
) -> Dict[str, Any]:
    """
    Parse a Word file, extract tabular data with the AI helpers and chart it.

    The document is first run through ``self.parser``. Tables, when present,
    are handed to ``_extract_tables_with_ai``; otherwise the paragraphs go to
    ``_extract_from_text_with_ai``. The first usable headers/rows pair found
    in the extraction result is passed to
    ``visualization_service.analyze_and_visualize``.

    Args:
        file_path: Path of the Word file to parse.
        user_hint: Optional user prompt forwarded to the AI extraction step.

    Returns:
        Dict with ``success``. On success it also carries ``charts``,
        ``statistics``, ``distributions``, ``structured_data``, ``row_count``
        and ``column_count``; on failure it carries ``error`` and
        ``structured_data`` (``None`` when extraction did not succeed).
        Exceptions are caught and reported through the same failure shape.
    """
    try:
        # Step 1: raw extraction with the basic parser.
        parsed = self.parser.parse(file_path)
        if not parsed.success:
            return {
                "success": False,
                "error": parsed.error,
                "structured_data": None
            }

        # Step 2: pull the raw pieces out of the parse result.
        document = parsed.data
        paragraphs = document.get("paragraphs", [])
        doc_tables = document.get("tables", [])
        content = document.get("content", "")

        logger.info(f"Word 基础解析完成: {len(paragraphs)} 个段落, {len(doc_tables)} 个表格")

        # Step 3: tables take priority over free text.
        if doc_tables:
            structured_data = await self._extract_tables_with_ai(
                doc_tables, paragraphs, 0, user_hint, parsed.metadata
            )
        elif paragraphs:
            structured_data = await self._extract_from_text_with_ai(
                paragraphs, content, 0, [], user_hint
            )
        else:
            return {
                "success": False,
                "error": "文档内容为空",
                "structured_data": None
            }

        # Step 4: stop early when the AI extraction itself failed.
        if not structured_data.get("success"):
            return {
                "success": False,
                "error": structured_data.get("error", "解析失败"),
                "structured_data": None
            }

        # Step 5: locate one headers/rows pair that can feed the charts.
        result_type = structured_data.get("type", "")
        header_row_pair = None
        if result_type == "table_data":
            header_row_pair = (
                structured_data.get("headers", []),
                structured_data.get("rows", []),
            )
        elif result_type == "structured_text":
            extracted_tables = structured_data.get("tables", [])
            if extracted_tables:
                # Only the first extracted table is charted.
                leading = extracted_tables[0]
                header_row_pair = (
                    leading.get("headers", []),
                    leading.get("rows", []),
                )

        chart_data = None
        if header_row_pair is not None:
            headers, rows = header_row_pair
            if headers and rows:
                chart_data = {
                    "columns": headers,
                    "rows": rows
                }

        # Step 6: build the charts, or explain why none can be built.
        if not chart_data:
            return {
                "success": False,
                "error": "文档中没有可用于图表的表格数据",
                "structured_data": structured_data
            }

        logger.info(f"开始生成图表,列数: {len(chart_data['columns'])}, 行数: {len(chart_data['rows'])}")
        vis_result = visualization_service.analyze_and_visualize(chart_data)

        if not vis_result.get("success"):
            return {
                "success": False,
                "error": vis_result.get("error", "可视化生成失败"),
                "structured_data": structured_data
            }

        return {
            "success": True,
            "charts": vis_result.get("charts", {}),
            "statistics": vis_result.get("statistics", {}),
            "distributions": vis_result.get("distributions", {}),
            "structured_data": structured_data,
            "row_count": vis_result.get("row_count", 0),
            "column_count": vis_result.get("column_count", 0)
        }

    except Exception as e:
        logger.error(f"Word 文档图表生成失败: {str(e)}")
        return {
            "success": False,
            "error": str(e),
            "structured_data": None
        }
|
||||
|
||||
|
||||
async def parse_word_with_ai_from_db(
    self,
    content: str,
    tables: List[Dict],
    filename: str = "",
    user_hint: str = ""
) -> Dict[str, Any]:
    """
    Run AI structured extraction on Word content that was loaded from storage.

    Args:
        content: Document text; it is split on newlines into paragraphs.
        tables: Table data of the document.
        filename: File name, forwarded as ``{"filename": filename}`` to the
            table extraction helper.
        user_hint: Optional user prompt forwarded to the AI extraction step.

    Returns:
        The dict produced by the extraction helper, a
        ``{"success": True, "type": "empty", ...}`` marker when there is
        neither a table nor a paragraph, or ``{"success": False, "error": ...}``
        when an exception is raised.
    """
    try:
        # Non-blank lines, trimmed, become the paragraph list.
        trimmed_lines = (line.strip() for line in content.split('\n'))
        paragraphs = [line for line in trimmed_lines if line]

        logger.info(f"从数据库解析 Word: {len(paragraphs)} 个段落, {len(tables)} 个表格")

        # Tables take priority over free text.
        if tables:
            return await self._extract_tables_with_ai(
                tables, paragraphs, 0, user_hint, {"filename": filename}
            )

        if paragraphs:
            return await self._extract_from_text_with_ai(
                paragraphs, content, 0, [], user_hint
            )

        return {
            "success": True,
            "type": "empty",
            "message": "文档内容为空"
        }

    except Exception as e:
        logger.error(f"从数据库解析 Word 文档失败: {str(e)}")
        return {
            "success": False,
            "error": str(e)
        }
|
||||
|
||||
async def generate_charts_from_db(
    self,
    content: str,
    tables: List[Dict],
    filename: str = "",
    user_hint: str = ""
) -> Dict[str, Any]:
    """
    Build visualization charts from Word content that was loaded from storage.

    Tables, when present, are handed to ``_extract_tables_with_ai``;
    otherwise the paragraphs go to ``_extract_from_text_with_ai``. The first
    usable headers/rows pair of the extraction result is passed to
    ``visualization_service.analyze_and_visualize``.

    Args:
        content: Document text; split on newlines into paragraphs. ``None``
            is treated as an empty string.
        tables: Table data of the document. ``None`` is treated as no tables.
        filename: File name, forwarded as ``{"filename": filename}`` to the
            table extraction helper.
        user_hint: Optional user prompt forwarded to the AI extraction step.

    Returns:
        Dict with ``success``. On success it also carries ``charts``,
        ``statistics``, ``distributions``, ``structured_data``, ``row_count``
        and ``column_count``; on failure it carries ``error``. Exceptions are
        caught and reported as ``{"success": False, "error": ...}``.
    """
    try:
        # Records read back from storage may lack text or tables; normalise
        # so the length/log calls below cannot fail on None.
        content = content or ""
        tables = tables or []

        paragraphs = [p.strip() for p in content.split('\n') if p.strip()]

        logger.info(f"从数据库生成 Word 图表: {len(paragraphs)} 个段落, {len(tables)} 个表格")

        # Tables take priority over free text.
        if tables:
            structured_data = await self._extract_tables_with_ai(
                tables, paragraphs, 0, user_hint, {"filename": filename}
            )
        elif paragraphs:
            structured_data = await self._extract_from_text_with_ai(
                paragraphs, content, 0, [], user_hint
            )
        else:
            return {
                "success": False,
                "error": "文档内容为空"
            }

        # Surface an explicit extraction failure instead of letting it fall
        # through to the misleading "no table data" message below. Only an
        # explicit False is treated as failure so results that omit the flag
        # keep their previous behaviour.
        if structured_data.get("success") is False:
            return {
                "success": False,
                "error": structured_data.get("error", "解析失败"),
                "structured_data": None
            }

        # Locate one headers/rows pair that can feed the charts.
        chart_data = None
        data_type = structured_data.get("type")
        logger.info(f"准备提取图表数据,structured_data type: {data_type}, keys: {list(structured_data.keys())}")

        if data_type == "table_data":
            headers = structured_data.get("headers", [])
            rows = structured_data.get("rows", [])
            logger.info(f"table_data类型: headers数量={len(headers)}, rows数量={len(rows)}")
            if headers and rows:
                chart_data = {
                    "columns": headers,
                    "rows": rows
                }
        elif data_type == "structured_text":
            tables_data = structured_data.get("tables", [])
            logger.info(f"structured_text类型: tables数量={len(tables_data)}")
            if tables_data:
                # Only the first extracted table is charted.
                first_table = tables_data[0]
                headers = first_table.get("headers", [])
                rows = first_table.get("rows", [])
                logger.info(f"第一个表格: headers={headers[:5]}, rows数量={len(rows)}")
                if headers and rows:
                    chart_data = {
                        "columns": headers,
                        "rows": rows
                    }
        else:
            logger.warning(f"无法识别的structured_data类型: {data_type}")

        if not chart_data:
            return {
                "success": False,
                "error": "文档中没有可用于图表的表格数据",
                "structured_data": structured_data
            }

        logger.info(f"开始生成图表,列数: {len(chart_data['columns'])}, 行数: {len(chart_data['rows'])}")
        vis_result = visualization_service.analyze_and_visualize(chart_data)

        if not vis_result.get("success"):
            return {
                "success": False,
                "error": vis_result.get("error", "可视化生成失败"),
                "structured_data": structured_data
            }

        return {
            "success": True,
            "charts": vis_result.get("charts", {}),
            "statistics": vis_result.get("statistics", {}),
            "distributions": vis_result.get("distributions", {}),
            "structured_data": structured_data,
            "row_count": vis_result.get("row_count", 0),
            "column_count": vis_result.get("column_count", 0)
        }

    except Exception as e:
        # exc_info keeps the traceback; the message alone rarely locates the fault.
        logger.error(f"从数据库生成 Word 图表失败: {str(e)}", exc_info=True)
        return {
            "success": False,
            "error": str(e)
        }
|
||||
|
||||
|
||||
# 全局单例
|
||||
word_ai_service = WordAIService()
|
||||
|
||||
@@ -39,6 +39,11 @@ openpyxl==3.1.2
|
||||
python-docx==0.8.11
|
||||
markdown-it-py==3.0.0
|
||||
chardet==5.2.0
|
||||
Pillow>=10.0.0
|
||||
pytesseract>=0.3.10
|
||||
|
||||
# ==================== PDF 生成 ====================
|
||||
reportlab>=4.0.0
|
||||
|
||||
# ==================== AI / LLM ====================
|
||||
httpx==0.25.2
|
||||
|
||||
203
docker-compose.yml
Normal file
203
docker-compose.yml
Normal file
@@ -0,0 +1,203 @@
|
||||
# ============================================================
|
||||
# FilesReadSystem Docker Compose
|
||||
# 全栈 AI 文档理解与数据融合系统
|
||||
# ============================================================
|
||||
version: "3.8"
|
||||
|
||||
services:
|
||||
# ==================== 数据库服务 ====================
|
||||
|
||||
mongodb:
  # MongoDB instance used by the backend and the Celery worker.
  image: mongo:7.0
  container_name: filesread_mongodb
  restart: unless-stopped
  ports:
    - "27017:27017"
  environment:
    MONGO_INITDB_ROOT_USERNAME: ${MONGO_ROOT_USER:-admin}
    # No fallback on purpose: a default committed to the repository is a
    # publicly known root password. Compose aborts with this message when
    # the variable is missing from the environment / .env file.
    MONGO_INITDB_ROOT_PASSWORD: "${MONGO_ROOT_PASSWORD:?MONGO_ROOT_PASSWORD must be set (see .env.example)}"
    MONGO_INITDB_DATABASE: ${MONGODB_DB_NAME:-document_system}
  volumes:
    - mongodb_data:/data/db
  networks:
    - filesread_network
  healthcheck:
    # Unauthenticated ping: only proves the server answers.
    test: ["CMD", "mongosh", "--eval", "db.adminCommand('ping')", "--quiet"]
    interval: 10s
    timeout: 5s
    retries: 5
    start_period: 30s
|
||||
|
||||
mysql:
  # MySQL instance used by the backend and the Celery worker.
  image: mysql:8.0
  container_name: filesread_mysql
  restart: unless-stopped
  ports:
    - "3306:3306"
  environment:
    # No fallback on purpose: "123456" as a committed default is a publicly
    # known root password. Compose aborts when the variable is missing.
    MYSQL_ROOT_PASSWORD: "${MYSQL_PASSWORD:?MYSQL_PASSWORD must be set (see .env.example)}"
    MYSQL_DATABASE: ${MYSQL_DATABASE:-document}
  volumes:
    - mysql_data:/var/lib/mysql
  networks:
    - filesread_network
  healthcheck:
    # "$$" defers expansion to the container shell, so the password is read
    # from the container environment instead of being baked into the
    # rendered compose configuration.
    test: ["CMD-SHELL", "mysqladmin ping -h localhost -u root -p\"$$MYSQL_ROOT_PASSWORD\""]
    interval: 10s
    timeout: 5s
    retries: 5
    start_period: 30s
|
||||
|
||||
redis:
  # Redis instance: cache plus Celery broker / result backend.
  image: redis:7-alpine
  container_name: filesread_redis
  restart: unless-stopped
  ports:
    - "6379:6379"
  environment:
    # redis-cli reads REDISCLI_AUTH, so the healthcheck can authenticate
    # without putting the password on its command line.
    REDISCLI_AUTH: "${REDIS_PASSWORD:?REDIS_PASSWORD must be set (see .env.example)}"
  volumes:
    - redis_data:/data
  networks:
    - filesread_network
  # The password is required: with an empty value "--requirepass" would be
  # passed without an argument. List form keeps the password a single
  # argument regardless of its content.
  command:
    - redis-server
    - --appendonly
    - "yes"
    - --requirepass
    - "${REDIS_PASSWORD:?REDIS_PASSWORD must be set (see .env.example)}"
  healthcheck:
    # Require an actual PONG so an authentication error is not reported as
    # healthy.
    test: ["CMD-SHELL", "redis-cli ping | grep -q PONG"]
    interval: 10s
    timeout: 5s
    retries: 5
|
||||
|
||||
# ==================== 应用服务 ====================
|
||||
|
||||
backend:
  # Backend API service, built from ./backend and published on port 8000.
  build:
    context: ./backend
    dockerfile: Dockerfile
  container_name: filesread_backend
  restart: unless-stopped
  ports:
    - "8000:8000"
  environment:
    # Application settings
    APP_NAME: FilesReadSystem
    DEBUG: ${DEBUG:-false}
    API_V1_STR: /api/v1

    # MongoDB (reached through the compose service name).
    # The password has no fallback so no credential lives in the repository.
    # NOTE(review): the password is placed in a URL verbatim; characters such
    # as "@", ":" or "/" presumably need percent-encoding - confirm.
    MONGODB_URL: "mongodb://${MONGO_ROOT_USER:-admin}:${MONGO_ROOT_PASSWORD:?MONGO_ROOT_PASSWORD must be set (see .env.example)}@mongodb:27017/admin"
    MONGODB_DB_NAME: ${MONGODB_DB_NAME:-document_system}

    # MySQL
    MYSQL_HOST: mysql
    MYSQL_PORT: 3306
    MYSQL_USER: root
    MYSQL_PASSWORD: "${MYSQL_PASSWORD:?MYSQL_PASSWORD must be set (see .env.example)}"
    MYSQL_DATABASE: ${MYSQL_DATABASE:-document}
    MYSQL_CHARSET: utf8mb4

    # Redis
    REDIS_URL: "redis://:${REDIS_PASSWORD:?REDIS_PASSWORD must be set (see .env.example)}@redis:6379/0"

    # LLM
    LLM_API_KEY: ${LLM_API_KEY}
    LLM_BASE_URL: ${LLM_BASE_URL:-https://api.deepseek.com}
    LLM_MODEL_NAME: ${LLM_MODEL_NAME:-deepseek-chat}

    # Supabase
    SUPABASE_URL: ${SUPABASE_URL}
    SUPABASE_ANON_KEY: ${SUPABASE_ANON_KEY}
    SUPABASE_SERVICE_KEY: ${SUPABASE_SERVICE_KEY}

    # Embedding / RAG
    EMBEDDING_MODEL: ${EMBEDDING_MODEL:-all-MiniLM-L6-v2}
    FAISS_INDEX_DIR: /app/data/faiss

    # File storage (MAX_UPLOAD_SIZE = 100 * 1024 * 1024 bytes)
    UPLOAD_DIR: /app/data/uploads
    MAX_UPLOAD_SIZE: 104857600

    # Celery
    CELERY_BROKER_URL: "redis://:${REDIS_PASSWORD:?REDIS_PASSWORD must be set (see .env.example)}@redis:6379/1"
    CELERY_RESULT_BACKEND: "redis://:${REDIS_PASSWORD:?REDIS_PASSWORD must be set (see .env.example)}@redis:6379/2"
  volumes:
    - backend_data:/app/data
  networks:
    - filesread_network
  depends_on:
    mongodb:
      condition: service_healthy
    mysql:
      condition: service_healthy
    redis:
      condition: service_healthy
  healthcheck:
    # httpx.get() does not raise on 4xx/5xx by itself; raise_for_status()
    # makes an error response fail the check.
    test: ["CMD", "python", "-c", "import httpx; httpx.get('http://localhost:8000/health').raise_for_status()"]
    interval: 30s
    timeout: 10s
    retries: 3
    start_period: 40s
|
||||
|
||||
celery_worker:
  # Celery worker, built from the same ./backend image as the API.
  build:
    context: ./backend
    dockerfile: Dockerfile
  container_name: filesread_celery
  restart: unless-stopped
  command: celery -A app.celery_app worker --loglevel=info --prefetch-multiplier=1
  environment:
    # Celery
    CELERY_BROKER_URL: "redis://:${REDIS_PASSWORD:?REDIS_PASSWORD must be set (see .env.example)}@redis:6379/1"
    CELERY_RESULT_BACKEND: "redis://:${REDIS_PASSWORD:?REDIS_PASSWORD must be set (see .env.example)}@redis:6379/2"

    # Same database settings as the backend service. Passwords have no
    # fallback so no credential lives in the repository.
    MONGODB_URL: "mongodb://${MONGO_ROOT_USER:-admin}:${MONGO_ROOT_PASSWORD:?MONGO_ROOT_PASSWORD must be set (see .env.example)}@mongodb:27017/admin"
    MONGODB_DB_NAME: ${MONGODB_DB_NAME:-document_system}
    MYSQL_HOST: mysql
    MYSQL_PORT: 3306
    MYSQL_USER: root
    MYSQL_PASSWORD: "${MYSQL_PASSWORD:?MYSQL_PASSWORD must be set (see .env.example)}"
    MYSQL_DATABASE: ${MYSQL_DATABASE:-document}
    REDIS_URL: "redis://:${REDIS_PASSWORD:?REDIS_PASSWORD must be set (see .env.example)}@redis:6379/0"

    # LLM
    LLM_API_KEY: ${LLM_API_KEY}
    LLM_BASE_URL: ${LLM_BASE_URL:-https://api.deepseek.com}
    LLM_MODEL_NAME: ${LLM_MODEL_NAME:-deepseek-chat}

    # Embedding
    EMBEDDING_MODEL: ${EMBEDDING_MODEL:-all-MiniLM-L6-v2}
    FAISS_INDEX_DIR: /app/data/faiss
  volumes:
    - backend_data:/app/data
  networks:
    - filesread_network
  # Wait for healthy dependencies, as the backend service does; the short
  # list form only waits for the containers to be started.
  depends_on:
    redis:
      condition: service_healthy
    mongodb:
      condition: service_healthy
    mysql:
      condition: service_healthy
|
||||
|
||||
frontend:
  # Frontend image built from ./frontend and published on port 80.
  build:
    context: ./frontend
    dockerfile: Dockerfile
  container_name: filesread_frontend
  restart: unless-stopped
  ports:
    - "80:80"
  # NOTE(review): these are set on the running container only. Vite normally
  # reads VITE_* variables at build time, so they presumably need to be
  # passed as build args as well - confirm against the frontend Dockerfile.
  environment:
    VITE_APP_ID: ${VITE_APP_ID:-}
    VITE_SUPABASE_URL: ${SUPABASE_URL}
    VITE_SUPABASE_ANON_KEY: ${SUPABASE_ANON_KEY}
    VITE_BACKEND_API_URL: /api/v1
  networks:
    - filesread_network
  depends_on:
    backend:
      condition: service_started
|
||||
|
||||
# Bridge network joined by every service in this file.
networks:
  filesread_network:
    driver: bridge

# Named volumes referenced by the services above.
volumes:
  mongodb_data: {}
  mysql_data: {}
  redis_data: {}
  backend_data: {}
|
||||
169
docs/architecture.drawio
Normal file
169
docs/architecture.drawio
Normal file
@@ -0,0 +1,169 @@
|
||||
<mxfile host="app.diagrams.net" modified="2026-04-16T14:00:00.000Z" agent="Claude" version="24.0.0">
|
||||
<diagram name="系统架构图" id="architecture">
|
||||
<mxGraphModel dx="1200" dy="800" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" pageWidth="1920" pageHeight="1080" math="0" shadow="0">
|
||||
<root>
|
||||
<mxCell id="0" />
|
||||
<mxCell id="1" parent="0" />
|
||||
|
||||
<!-- 用户访问层 -->
|
||||
<mxCell id="layer1" value="用户访问层" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=16;fontStyle=1;fontColor=#1a1a2e;" vertex="1" parent="1">
|
||||
<mxGeometry x="800" y="20" width="120" height="30" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="browser" value="浏览器
(Browser)" style="rounded=1;whiteSpace=wrap;html=1;fillColor=#e3f2fd;strokeColor=#1976d2;fontColor=#0d47a1;" vertex="1" parent="1">
|
||||
<mxGeometry x="860" y="60" width="120" height="50" as="geometry" />
|
||||
</mxCell>
|
||||
|
||||
<!-- 前端展示层 -->
|
||||
<mxCell id="layer2" value="前端展示层" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=16;fontStyle=1;fontColor=#1a1a2e;" vertex="1" parent="1">
|
||||
<mxGeometry x="800" y="140" width="120" height="30" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="frontend_box" value="" style="rounded=1;whiteSpace=wrap;html=1;fillColor=#f3e5f5;strokeColor=#7b1fa2;strokeWidth=2;" vertex="1" parent="1">
|
||||
<mxGeometry x="200" y="180" width="1520" height="140" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="frontend_title" value="React 18 + TypeScript + Vite + shadcn/ui" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=14;fontStyle=1;fontColor=#4a148c;" vertex="1" parent="1">
|
||||
<mxGeometry x="760" y="185" width="280" height="25" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="dashboard" value="Dashboard
首页仪表盘" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#ce93d8;strokeColor=#8e24aa;fontColor=#fff;" vertex="1" parent="1">
|
||||
<mxGeometry x="240" y="220" width="120" height="80" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="documents" value="Documents
文档管理" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#ce93d8;strokeColor=#8e24aa;fontColor=#fff;" vertex="1" parent="1">
|
||||
<mxGeometry x="400" y="220" width="120" height="80" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="template" value="TemplateFill
智能填表" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#ce93d8;strokeColor=#8e24aa;fontColor=#fff;" vertex="1" parent="1">
|
||||
<mxGeometry x="560" y="220" width="120" height="80" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="instruction" value="Instruction
指令助手" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#ce93d8;strokeColor=#8e24aa;fontColor=#fff;" vertex="1" parent="1">
|
||||
<mxGeometry x="720" y="220" width="120" height="80" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="taskhistory" value="TaskHistory
任务历史" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#ce93d8;strokeColor=#8e24aa;fontColor=#fff;" vertex="1" parent="1">
|
||||
<mxGeometry x="880" y="220" width="120" height="80" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="frontend_libs" value="Recharts + Lucide Icons + React Router" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=11;fontColor=#6a1b9a;" vertex="1" parent="1">
|
||||
<mxGeometry x="1040" y="250" width="280" height="25" as="geometry" />
|
||||
</mxCell>
|
||||
|
||||
<!-- 连接线:浏览器到前端 -->
|
||||
<mxCell id="conn1" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;strokeColor=#1976d2;strokeWidth=2;" edge="1" parent="1" source="browser" target="frontend_box">
|
||||
<mxGeometry relative="1" as="geometry" />
|
||||
</mxCell>
|
||||
|
||||
<!-- 后端服务层 -->
|
||||
<mxCell id="layer3" value="后端服务层" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=16;fontStyle=1;fontColor=#1a1a2e;" vertex="1" parent="1">
|
||||
<mxGeometry x="800" y="350" width="120" height="30" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="backend_box" value="" style="rounded=1;whiteSpace=wrap;html=1;fillColor=#e8f5e9;strokeColor=#388e3c;strokeWidth=2;" vertex="1" parent="1">
|
||||
<mxGeometry x="200" y="390" width="1520" height="180" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="backend_title" value="FastAPI + Uvicorn + Celery" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=14;fontStyle=1;fontColor=#1b5e20;" vertex="1" parent="1">
|
||||
<mxGeometry x="800" y="395" width="200" height="25" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="upload" value="文档上传
/upload/*" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#81c784;strokeColor=#2e7d32;fontColor=#1b5e20;" vertex="1" parent="1">
|
||||
<mxGeometry x="240" y="430" width="140" height="60" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="ai" value="AI分析
/ai/*" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#81c784;strokeColor=#2e7d32;fontColor=#1b5e20;" vertex="1" parent="1">
|
||||
<mxGeometry x="420" y="430" width="140" height="60" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="rag" value="RAG检索
/rag/*" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#81c784;strokeColor=#2e7d32;fontColor=#1b5e20;" vertex="1" parent="1">
|
||||
<mxGeometry x="600" y="430" width="140" height="60" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="template_api" value="模板填充
/templates/*" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#81c784;strokeColor=#2e7d32;fontColor=#1b5e20;" vertex="1" parent="1">
|
||||
<mxGeometry x="780" y="430" width="140" height="60" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="instruction_api" value="指令解析
/instruction/*" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#81c784;strokeColor=#2e7d32;fontColor=#1b5e20;" vertex="1" parent="1">
|
||||
<mxGeometry x="960" y="430" width="140" height="60" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="visualization" value="可视化
/visualization/*" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#81c784;strokeColor=#2e7d32;fontColor=#1b5e20;" vertex="1" parent="1">
|
||||
<mxGeometry x="1140" y="430" width="140" height="60" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="celery" value="Celery
任务调度" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#a5d6a7;strokeColor=#2e7d32;fontColor=#1b5e20;" vertex="1" parent="1">
|
||||
<mxGeometry x="1320" y="430" width="120" height="60" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="logging" value="监控日志" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#a5d6a7;strokeColor=#2e7d32;fontColor=#1b5e20;" vertex="1" parent="1">
|
||||
<mxGeometry x="1480" y="430" width="100" height="60" as="geometry" />
|
||||
</mxCell>
|
||||
|
||||
<!-- 连接线:前端到后端 -->
|
||||
<mxCell id="conn2" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;strokeColor=#388e3c;strokeWidth=2;dashed=1;dashPattern=8 8;" edge="1" parent="1" source="frontend_box" target="backend_box">
|
||||
<mxGeometry relative="1" as="geometry" />
|
||||
</mxCell>
|
||||
|
||||
<!-- AI服务层 -->
|
||||
<mxCell id="layer4" value="AI服务层" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=16;fontStyle=1;fontColor=#1a1a2e;" vertex="1" parent="1">
|
||||
<mxGeometry x="800" y="600" width="120" height="30" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="ai_box" value="" style="rounded=1;whiteSpace=wrap;html=1;fillColor=#fff3e0;strokeColor=#f57c00;strokeWidth=2;" vertex="1" parent="1">
|
||||
<mxGeometry x="300" y="640" width="1320" height="120" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="llm_title" value="LLMService - 大模型服务" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=14;fontStyle=1;fontColor=#e65100;" vertex="1" parent="1">
|
||||
<mxGeometry x="820" y="645" width="200" height="25" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="minimax" value="MiniMax-Text-01" style="rounded=1;whiteSpace=wrap;html=1;fillColor=#ffcc80;strokeColor=#ef6c00;fontColor=#e65100;" vertex="1" parent="1">
|
||||
<mxGeometry x="400" y="680" width="150" height="50" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="deepseek" value="DeepSeek-chat" style="rounded=1;whiteSpace=wrap;html=1;fillColor=#ffcc80;strokeColor=#ef6c00;fontColor=#e65100;" vertex="1" parent="1">
|
||||
<mxGeometry x="600" y="680" width="150" height="50" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="excel_ai" value="ExcelAIService" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#ffe0b2;strokeColor=#f57c00;fontColor=#e65100;" vertex="1" parent="1">
|
||||
<mxGeometry x="820" y="680" width="130" height="50" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="word_ai" value="WordAIService" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#ffe0b2;strokeColor=#f57c00;fontColor=#e65100;" vertex="1" parent="1">
|
||||
<mxGeometry x="980" y="680" width="130" height="50" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="md_ai" value="MarkdownAIService" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#ffe0b2;strokeColor=#f57c00;fontColor=#e65100;" vertex="1" parent="1">
|
||||
<mxGeometry x="1140" y="680" width="130" height="50" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="txt_ai" value="TxtAIService" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#ffe0b2;strokeColor=#f57c00;fontColor=#e65100;" vertex="1" parent="1">
|
||||
<mxGeometry x="1300" y="680" width="130" height="50" as="geometry" />
|
||||
</mxCell>
|
||||
|
||||
<!-- 连接线:后端到AI -->
|
||||
<mxCell id="conn3" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;strokeColor=#f57c00;strokeWidth=2;dashed=1;dashPattern=8 8;" edge="1" parent="1" source="backend_box" target="ai_box">
|
||||
<mxGeometry relative="1" as="geometry" />
|
||||
</mxCell>
|
||||
|
||||
<!-- 数据存储层 -->
|
||||
<mxCell id="layer5" value="数据存储层" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=16;fontStyle=1;fontColor=#1a1a2e;" vertex="1" parent="1">
|
||||
<mxGeometry x="800" y="790" width="120" height="30" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="mongodb" value="MongoDB
文档数据库

• 原始文档内容
• 元数据信息
• 文档标签
• 处理状态" style="rounded=1;whiteSpace=wrap;html=1;fillColor=#e0e0e0;strokeColor=#616161;fontColor=#212121;align=left;spacingLeft=10;" vertex="1" parent="1">
|
||||
<mxGeometry x="240" y="830" width="200" height="160" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="mysql" value="MySQL
关系数据库

• Excel表格数据
• 结构化数据
• 字段描述
• RAG索引" style="rounded=1;whiteSpace=wrap;html=1;fillColor=#e0e0e0;strokeColor=#616161;fontColor=#212121;align=left;spacingLeft=10;" vertex="1" parent="1">
|
||||
<mxGeometry x="520" y="830" width="200" height="160" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="redis" value="Redis
缓存/队列

• 会话缓存
• 任务队列
• Celery broker
• 临时数据" style="rounded=1;whiteSpace=wrap;html=1;fillColor=#e0e0e0;strokeColor=#616161;fontColor=#212121;align=left;spacingLeft=10;" vertex="1" parent="1">
|
||||
<mxGeometry x="800" y="830" width="200" height="160" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="faiss" value="FAISS
向量数据库

• 文档向量索引
• 语义相似度
• RAG检索
• sentence-transformers" style="rounded=1;whiteSpace=wrap;html=1;fillColor=#e0e0e0;strokeColor=#616161;fontColor=#212121;align=left;spacingLeft=10;" vertex="1" parent="1">
|
||||
<mxGeometry x="1080" y="830" width="240" height="160" as="geometry" />
|
||||
</mxCell>
|
||||
|
||||
<!-- 连接线:AI到存储 -->
|
||||
<mxCell id="conn4" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;strokeColor=#616161;strokeWidth=2;dashed=1;dashPattern=8 8;" edge="1" parent="1" source="ai_box" target="mongodb">
|
||||
<mxGeometry relative="1" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="conn5" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;strokeColor=#616161;strokeWidth=2;dashed=1;dashPattern=8 8;" edge="1" parent="1" source="ai_box" target="mysql">
|
||||
<mxGeometry relative="1" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="conn6" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;strokeColor=#616161;strokeWidth=2;dashed=1;dashPattern=8 8;" edge="1" parent="1" source="ai_box" target="redis">
|
||||
<mxGeometry relative="1" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="conn7" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;strokeColor=#616161;strokeWidth=2;dashed=1;dashPattern=8 8;" edge="1" parent="1" source="ai_box" target="faiss">
|
||||
<mxGeometry relative="1" as="geometry" />
|
||||
</mxCell>
|
||||
|
||||
<!-- 标注 -->
|
||||
<mxCell id="arrow1" value="HTTP/HTTPS
WebSocket" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=10;fontColor=#1976d2;" vertex="1" parent="1">
|
||||
<mxGeometry x="1020" y="130" width="80" height="30" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="arrow2" value="API调用" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=10;fontColor=#388e3c;" vertex="1" parent="1">
|
||||
<mxGeometry x="1020" y="570" width="60" height="20" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="arrow3" value="数据读写" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=10;fontColor=#616161;" vertex="1" parent="1">
|
||||
<mxGeometry x="1020" y="770" width="60" height="20" as="geometry" />
|
||||
</mxCell>
|
||||
|
||||
</root>
|
||||
</mxGraphModel>
|
||||
</diagram>
|
||||
</mxfile>
|
||||
36
frontend/Dockerfile
Normal file
36
frontend/Dockerfile
Normal file
@@ -0,0 +1,36 @@
|
||||
# ============================================================
|
||||
# FilesReadSystem Frontend - React + Vite
|
||||
# 多阶段构建: Node 构建 -> Nginx 运行
|
||||
# ============================================================
|
||||
|
||||
# === 阶段1: 构建阶段 ===
|
||||
FROM node:20-alpine AS builder

WORKDIR /app

# Copy the manifest and lockfile first so the dependency layer is cached
# independently of source changes.
# NOTE(review): the trailing "*" makes the lockfile optional for COPY, while
# "--frozen-lockfile" below presumably requires it - confirm the intent.
COPY package.json pnpm-lock.yaml* ./

# Install pnpm, then the project dependencies exactly as locked.
RUN npm install -g pnpm \
    && pnpm install --frozen-lockfile

# Copy the rest of the sources.
COPY . .

# Produce the production bundle.
RUN pnpm build

# === Stage 2: runtime ===
FROM nginx:alpine

# Site configuration (static files + reverse proxy).
COPY nginx.conf /etc/nginx/conf.d/default.conf

# Static bundle produced by the builder stage.
COPY --from=builder /app/dist /usr/share/nginx/html

EXPOSE 80

CMD ["nginx", "-g", "daemon off;"]
|
||||
47
frontend/nginx.conf
Normal file
47
frontend/nginx.conf
Normal file
@@ -0,0 +1,47 @@
|
||||
# ============================================================
|
||||
# FilesReadSystem Nginx 配置
|
||||
# 反向代理 API 请求到后端
|
||||
# ============================================================
|
||||
|
||||
server {
    listen 80;
    server_name localhost;

    # Static frontend bundle
    root /usr/share/nginx/html;
    index index.html;

    # Applies to every proxied route, not only /uploads/: without it request
    # bodies sent to /api/ are capped at nginx's 1 MB default.
    client_max_body_size 100M;

    # SPA fallback: unknown paths are answered with index.html
    location / {
        try_files $uri $uri/ /index.html;
    }

    # Long-lived caching for static assets
    location ~* \.(js|css|png|jpg|jpeg|gif|ico|svg|woff|woff2|ttf|eot)$ {
        expires 1y;
        add_header Cache-Control "public, immutable";
    }

    # Reverse proxy for the backend API.
    # "^~" stops nginx from evaluating the regex location above, which would
    # otherwise capture API URLs ending in .png, .js, ... and serve them from
    # the static root.
    location ^~ /api/ {
        proxy_pass http://backend:8000/api/;
        proxy_http_version 1.1;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;

        # Timeouts
        # NOTE(review): 60s may be short for long-running AI requests - confirm.
        proxy_connect_timeout 60s;
        proxy_send_timeout 60s;
        proxy_read_timeout 60s;
    }

    # Reverse proxy for uploaded files ("^~" for the same reason as /api/:
    # stored files commonly end in one of the static-asset extensions).
    location ^~ /uploads/ {
        proxy_pass http://backend:8000/uploads/;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        client_max_body_size 100M;
    }
}
|
||||
@@ -8,7 +8,8 @@ import {
|
||||
Menu,
|
||||
ChevronRight,
|
||||
Sparkles,
|
||||
Clock
|
||||
Clock,
|
||||
FileDown
|
||||
} from 'lucide-react';
|
||||
import { Button } from '@/components/ui/button';
|
||||
import { cn } from '@/lib/utils';
|
||||
@@ -19,6 +20,7 @@ const navItems = [
|
||||
{ name: '文档中心', path: '/documents', icon: FileText },
|
||||
{ name: '智能填表', path: '/form-fill', icon: TableProperties },
|
||||
{ name: '智能助手', path: '/assistant', icon: MessageSquareCode },
|
||||
{ name: '文档转PDF', path: '/pdf-converter', icon: FileDown },
|
||||
{ name: '任务历史', path: '/task-history', icon: Clock },
|
||||
];
|
||||
|
||||
@@ -32,7 +34,7 @@ const MainLayout: React.FC = () => {
|
||||
<FileText size={24} />
|
||||
</div>
|
||||
<div className="flex flex-col">
|
||||
<span className="font-bold text-lg tracking-tight text-sidebar-foreground">智联文档</span>
|
||||
<span className="font-bold text-lg tracking-tight text-sidebar-foreground">表易智融</span>
|
||||
<span className="text-xs text-muted-foreground">多源数据融合平台</span>
|
||||
</div>
|
||||
</div>
|
||||
@@ -66,7 +68,7 @@ const MainLayout: React.FC = () => {
|
||||
<Sparkles size={20} className="text-primary" />
|
||||
</div>
|
||||
<div className="flex flex-col overflow-hidden">
|
||||
<span className="font-semibold text-sm truncate">智联文档</span>
|
||||
<span className="font-semibold text-sm truncate">表易智融</span>
|
||||
<span className="text-[10px] uppercase tracking-wider text-muted-foreground">多源数据融合</span>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
@@ -250,6 +250,98 @@ export interface AIExcelAnalyzeResult {
|
||||
error?: string;
|
||||
}
|
||||
|
||||
// ==================== Word/TXT AI 分析类型 ====================
|
||||
|
||||
/** Analysis modes accepted by the Word AI analysis call (sent as `analysis_type`). */
export type WordAnalysisType =
  | 'structured'
  | 'charts';

/** Analysis modes accepted by the TXT AI analysis call (sent as `analysis_type`). */
export type TxtAnalysisType =
  | 'structured'
  | 'charts';
|
||||
|
||||
/** Inner payload of a structured Word analysis; every field is optional. */
interface WordAIStructuredPayload {
  success?: boolean;
  type?: string;
  /** Column headers of the extracted table. */
  headers?: string[];
  /** Table rows, one string array per row. */
  rows?: string[][];
  key_values?: Record<string, string>;
  list_items?: string[];
  summary?: string;
  error?: string;
}

/** Response envelope for a structured Word analysis. */
export interface WordAIStructuredResult {
  success: boolean;
  result?: WordAIStructuredPayload;
  error?: string;
}
|
||||
|
||||
/** Inner payload of a chart-oriented Word analysis; every field is optional. */
interface WordAIChartsPayload {
  success?: boolean;
  /** Chart data grouped by chart kind; element shapes are not typed here. */
  charts?: {
    histograms?: any[];
    bar_charts?: any[];
    box_plots?: any[];
    correlation?: any;
  };
  /** Per-column statistics, split into numeric and categorical groups. */
  statistics?: {
    numeric?: Record<string, any>;
    categorical?: Record<string, any>;
  };
  distributions?: Record<string, any>;
  row_count?: number;
  column_count?: number;
  error?: string;
}

/** Response envelope for a chart-oriented Word analysis. */
export interface WordAIChartsResult {
  success: boolean;
  result?: WordAIChartsPayload;
  error?: string;
}
|
||||
|
||||
/** One table extracted from a TXT document. */
interface TxtAIExtractedTable {
  headers?: string[];
  rows?: string[][];
}

/** Inner payload of a structured TXT analysis; every field is optional. */
interface TxtAIStructuredPayload {
  success?: boolean;
  type?: string;
  tables?: TxtAIExtractedTable[];
  key_values?: Record<string, string>;
  list_items?: string[];
  summary?: string;
  error?: string;
}

/** Response envelope for a structured TXT analysis. */
export interface TxtAIStructuredResult {
  success: boolean;
  result?: TxtAIStructuredPayload;
  error?: string;
}
|
||||
|
||||
/** A single headline statistic reported for a TXT document. */
interface TxtAIKeyStatistic {
  name?: string;
  value?: string;
  trend?: string;
  description?: string;
}

/** A suggested chart for a TXT document. */
interface TxtAIChartSuggestion {
  chart_type?: string;
  title?: string;
  data_source?: string;
}

/** Inner payload of a chart-oriented TXT analysis; every field is optional. */
interface TxtAIChartsPayload {
  success?: boolean;
  /** Chart data grouped by chart kind; element shapes are not typed here. */
  charts?: {
    histograms?: any[];
    bar_charts?: any[];
    box_plots?: any[];
    correlation?: any;
  };
  /** Per-column statistics, split into numeric and categorical groups. */
  statistics?: {
    numeric?: Record<string, any>;
    categorical?: Record<string, any>;
  };
  distributions?: Record<string, any>;
  row_count?: number;
  column_count?: number;
  key_statistics?: TxtAIKeyStatistic[];
  chart_suggestions?: TxtAIChartSuggestion[];
  error?: string;
}

/** Response envelope for a chart-oriented TXT analysis. */
export interface TxtAIChartsResult {
  success: boolean;
  result?: TxtAIChartsPayload;
  error?: string;
}
|
||||
|
||||
// ==================== API 封装 ====================
|
||||
|
||||
export const backendApi = {
|
||||
@@ -781,7 +873,8 @@ export const backendApi = {
|
||||
async exportFilledTemplate(
|
||||
templateId: string,
|
||||
filledData: Record<string, any>,
|
||||
format: 'xlsx' | 'docx' = 'xlsx'
|
||||
format: 'xlsx' | 'docx' = 'xlsx',
|
||||
filledFilePath?: string
|
||||
): Promise<Blob> {
|
||||
const url = `${BACKEND_BASE_URL}/templates/export`;
|
||||
|
||||
@@ -793,6 +886,7 @@ export const backendApi = {
|
||||
template_id: templateId,
|
||||
filled_data: filledData,
|
||||
format,
|
||||
...(filledFilePath && { filled_file_path: filledFilePath }),
|
||||
}),
|
||||
});
|
||||
|
||||
@@ -964,6 +1058,215 @@ export const backendApi = {
|
||||
throw error;
|
||||
}
|
||||
},
|
||||
|
||||
// ==================== 智能指令 API ====================
|
||||
|
||||
/**
 * Smart chat: POSTs a natural-language instruction to the backend
 * (instruction execution with multi-turn conversation support).
 *
 * @param instruction - The user's instruction text.
 * @param docIds - Optional document ids, sent as `doc_ids`.
 * @param context - Optional extra context object forwarded as-is.
 * @returns The parsed JSON response body.
 * @throws Error with the backend's `detail` text when the error body provides
 *   one as a string; otherwise a generic failure message.
 */
async instructionChat(
  instruction: string,
  docIds?: string[],
  context?: Record<string, any>
): Promise<{
  success: boolean;
  intent: string;
  result: Record<string, any>;
  message: string;
  hint?: string;
}> {
  const url = `${BACKEND_BASE_URL}/instruction/chat`;

  try {
    const response = await fetch(url, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ instruction, doc_ids: docIds, context }),
    });

    if (!response.ok) {
      // The error body is not guaranteed to be JSON (e.g. an HTML error page
      // from a proxy); a parse failure must not mask the real failure.
      const errorBody = await response.json().catch(() => null);
      // Only use `detail` when it is a string, so the message never
      // degrades to "[object Object]".
      const detail = typeof errorBody?.detail === 'string' ? errorBody.detail : '';
      throw new Error(detail || '对话处理失败');
    }

    return await response.json();
  } catch (error) {
    console.error('对话处理失败:', error);
    throw error;
  }
},
|
||||
|
||||
/**
 * Fetch the list of instruction (intent) types supported by the backend.
 *
 * @returns The parsed JSON response body.
 * @throws Error when the response status is not OK; failures are logged and rethrown.
 */
async getSupportedIntents(): Promise<{
  intents: Array<{
    intent: string;
    name: string;
    examples: string[];
    params: string[];
  }>;
}> {
  const endpoint = `${BACKEND_BASE_URL}/instruction/intents`;

  try {
    const res = await fetch(endpoint);
    if (res.ok) {
      return await res.json();
    }
    throw new Error('获取指令列表失败');
  } catch (err) {
    console.error('获取指令列表失败:', err);
    throw err;
  }
},
|
||||
|
||||
/**
 * Execute an instruction (synchronous mode): POSTs the instruction to the
 * backend and waits for the result.
 *
 * @param instruction - The user's instruction text.
 * @param docIds - Optional document ids, sent as `doc_ids`.
 * @param context - Optional extra context object forwarded as-is.
 * @returns The parsed JSON response body.
 * @throws Error with the backend's `detail` text when the error body provides
 *   one as a string; otherwise a generic failure message.
 */
async executeInstruction(
  instruction: string,
  docIds?: string[],
  context?: Record<string, any>
): Promise<{
  success: boolean;
  intent: string;
  result: Record<string, any>;
  message: string;
}> {
  const url = `${BACKEND_BASE_URL}/instruction/execute`;

  try {
    const response = await fetch(url, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ instruction, doc_ids: docIds, context }),
    });

    if (!response.ok) {
      // The error body is not guaranteed to be JSON (e.g. an HTML error page
      // from a proxy); a parse failure must not mask the real failure.
      const errorBody = await response.json().catch(() => null);
      // Only use `detail` when it is a string, so the message never
      // degrades to "[object Object]".
      const detail = typeof errorBody?.detail === 'string' ? errorBody.detail : '';
      throw new Error(detail || '指令执行失败');
    }

    return await response.json();
  } catch (error) {
    console.error('指令执行失败:', error);
    throw error;
  }
},
|
||||
|
||||
// ==================== PDF 转换 API ====================
|
||||
|
||||
/**
 * Convert a file to PDF and immediately trigger a browser download.
 *
 * The conversion itself is delegated to `convertToPdf`, which performs the
 * request with XMLHttpRequest (the original author's note: XHR is used so
 * that the IDM download manager can intercept it).
 *
 * @param file - The source file; the download is named after it with the
 *   extension replaced by `.pdf`.
 * @throws Error when the conversion request fails (see `convertToPdf`).
 */
async convertAndDownloadPdf(file: File): Promise<void> {
  const blob = await backendApi.convertToPdf(file);

  // Create a temporary object URL and click a hidden anchor to start the download.
  const url = URL.createObjectURL(blob);
  try {
    const a = document.createElement('a');
    a.href = url;
    a.download = `${file.name.replace(/\.[^.]+$/, '')}.pdf`;
    document.body.appendChild(a);
    a.click();
    document.body.removeChild(a);
  } finally {
    // Always release the object URL, even if a DOM step throws.
    URL.revokeObjectURL(url);
  }
},
|
||||
|
||||
/**
 * Convert a single file to PDF and return the result as a Blob.
 *
 * The file is uploaded as multipart form data (field `file`) with
 * XMLHttpRequest, and the response is read as a Blob.
 *
 * @param file - The file to convert.
 * @returns The response body as a Blob.
 * @throws Error with the backend's `detail` text when the error body is JSON
 *   with a string `detail`; otherwise `转换失败: <status>`. Network failures
 *   reject with `网络错误`.
 */
async convertToPdf(file: File): Promise<Blob> {
  return new Promise<Blob>((resolve, reject) => {
    const xhr = new XMLHttpRequest();
    xhr.open('POST', `${BACKEND_BASE_URL}/pdf/convert`);
    xhr.responseType = 'blob';

    xhr.onload = () => {
      const body: Blob | null = xhr.response;
      const succeeded = xhr.status >= 200 && xhr.status < 300;

      if (succeeded && body) {
        resolve(body);
        return;
      }

      const fallback = `转换失败: ${xhr.status}`;
      if (!body) {
        reject(new Error(fallback));
        return;
      }

      // With responseType 'blob' the error body also arrives as a Blob; read
      // it as text so the backend's `detail` message is not lost.
      body.text().then(
        (text) => {
          let detail: unknown;
          try {
            detail = JSON.parse(text)?.detail;
          } catch {
            detail = undefined; // Non-JSON error body: keep the status-based message.
          }
          reject(new Error(typeof detail === 'string' && detail ? detail : fallback));
        },
        () => reject(new Error(fallback))
      );
    };

    xhr.onerror = () => {
      reject(new Error('网络错误'));
    };

    const formData = new FormData();
    formData.append('file', file);
    xhr.send(formData);
  });
},
|
||||
|
||||
/**
 * Convert several files to PDF in one request.
 *
 * Every file is appended to the multipart form under the `files` field.
 *
 * @param files - The files to convert.
 * @returns The response body as a Blob.
 * @throws Error with the backend's `detail` text when the error body provides
 *   one as a string; otherwise a generic failure message.
 */
async batchConvertToPdf(files: File[]): Promise<Blob> {
  const formData = new FormData();
  for (const file of files) {
    formData.append('files', file);
  }

  const url = `${BACKEND_BASE_URL}/pdf/convert/batch`;

  try {
    const response = await fetch(url, {
      method: 'POST',
      body: formData,
    });

    if (!response.ok) {
      // The error body is not guaranteed to be JSON (e.g. an HTML error page
      // from a proxy); a parse failure must not mask the real failure.
      const errorBody = await response.json().catch(() => null);
      // Only use `detail` when it is a string, so the message never
      // degrades to "[object Object]".
      const detail = typeof errorBody?.detail === 'string' ? errorBody.detail : '';
      throw new Error(detail || '批量PDF转换失败');
    }

    return await response.blob();
  } catch (error) {
    console.error('批量PDF转换失败:', error);
    throw error;
  }
},
|
||||
|
||||
/**
 * Fetch the file formats that can be converted to PDF.
 *
 * Never rejects: on any failure the error is logged and a built-in default
 * list is returned with `success: false`.
 */
async getPdfSupportedFormats(): Promise<{
  success: boolean;
  formats: string[];
}> {
  const endpoint = `${BACKEND_BASE_URL}/pdf/formats`;

  try {
    const res = await fetch(endpoint);
    if (res.ok) {
      return await res.json();
    }
    throw new Error('获取支持的格式失败');
  } catch (err) {
    console.error('获取支持的格式失败:', err);
    return { success: false, formats: ['docx', 'xlsx', 'txt', 'md'] };
  }
}
|
||||
};
|
||||
|
||||
// ==================== AI 分析 API ====================
|
||||
@@ -998,11 +1301,19 @@ export const aiApi = {
|
||||
* 上传并使用 AI 分析 Excel 文件
|
||||
*/
|
||||
async analyzeExcel(
|
||||
file: File,
|
||||
options: AIAnalyzeOptions = {}
|
||||
file: File | null,
|
||||
options: AIAnalyzeOptions = {},
|
||||
docId: string | null = null
|
||||
): Promise<AIExcelAnalyzeResult> {
|
||||
const formData = new FormData();
|
||||
formData.append('file', file);
|
||||
|
||||
if (docId) {
|
||||
formData.append('doc_id', docId);
|
||||
} else if (file) {
|
||||
formData.append('file', file);
|
||||
} else {
|
||||
throw new Error('必须提供文件或文档ID');
|
||||
}
|
||||
|
||||
const params = new URLSearchParams();
|
||||
if (options.userPrompt) {
|
||||
@@ -1079,7 +1390,9 @@ export const aiApi = {
|
||||
try {
|
||||
const response = await fetch(url);
|
||||
if (!response.ok) throw new Error('获取分析类型失败');
|
||||
return await response.json();
|
||||
const data = await response.json();
|
||||
// 转换后端返回格式 {excel_types: [], markdown_types: []} 为前端期望的 {types: []}
|
||||
return { types: data.excel_types || [] };
|
||||
} catch (error) {
|
||||
console.error('获取分析类型失败:', error);
|
||||
throw error;
|
||||
@@ -1090,15 +1403,21 @@ export const aiApi = {
|
||||
* 上传并使用 AI 分析 Markdown 文件
|
||||
*/
|
||||
async analyzeMarkdown(
|
||||
file: File,
|
||||
file: File | null,
|
||||
options: {
|
||||
docId?: string;
|
||||
analysisType?: MarkdownAnalysisType;
|
||||
userPrompt?: string;
|
||||
sectionNumber?: string;
|
||||
} = {}
|
||||
): Promise<AIMarkdownAnalyzeResult> {
|
||||
const formData = new FormData();
|
||||
formData.append('file', file);
|
||||
if (file) {
|
||||
formData.append('file', file);
|
||||
}
|
||||
if (options.docId) {
|
||||
formData.append('doc_id', options.docId);
|
||||
}
|
||||
|
||||
const params = new URLSearchParams();
|
||||
if (options.analysisType) {
|
||||
@@ -1240,28 +1559,31 @@ export const aiApi = {
|
||||
},
|
||||
|
||||
/**
|
||||
* 上传并使用 AI 分析 TXT 文本文件,提取结构化数据
|
||||
* 上传并使用 AI 分析 TXT 文本文件,提取结构化数据或生成图表
|
||||
*/
|
||||
async analyzeTxt(
|
||||
file: File
|
||||
file: File | null,
|
||||
docId: string | null = null,
|
||||
analysisType: TxtAnalysisType = 'structured'
|
||||
): Promise<{
|
||||
success: boolean;
|
||||
filename?: string;
|
||||
structured_data?: {
|
||||
table?: {
|
||||
columns?: string[];
|
||||
rows?: string[][];
|
||||
};
|
||||
summary?: string;
|
||||
key_value_pairs?: Array<{ key: string; value: string }>;
|
||||
numeric_data?: Array<{ name: string; value: number; unit?: string }>;
|
||||
};
|
||||
analysis_type?: string;
|
||||
result?: any;
|
||||
error?: string;
|
||||
}> {
|
||||
const formData = new FormData();
|
||||
formData.append('file', file);
|
||||
if (file) {
|
||||
formData.append('file', file);
|
||||
}
|
||||
if (docId) {
|
||||
formData.append('doc_id', docId);
|
||||
}
|
||||
|
||||
const url = `${BACKEND_BASE_URL}/ai/analyze/txt`;
|
||||
const params = new URLSearchParams();
|
||||
params.append('analysis_type', analysisType);
|
||||
|
||||
const url = `${BACKEND_BASE_URL}/ai/analyze/txt?${params.toString()}`;
|
||||
|
||||
try {
|
||||
const response = await fetch(url, {
|
||||
@@ -1383,28 +1705,35 @@ export const aiApi = {
|
||||
// ==================== Word AI 解析 ====================
|
||||
|
||||
/**
|
||||
* 使用 AI 解析 Word 文档,提取结构化数据
|
||||
* 使用 AI 解析 Word 文档,提取结构化数据或生成图表
|
||||
*/
|
||||
async analyzeWordWithAI(
|
||||
file: File,
|
||||
userHint: string = ''
|
||||
file: File | null,
|
||||
docId: string | null = null,
|
||||
userHint: string = '',
|
||||
analysisType: WordAnalysisType = 'structured'
|
||||
): Promise<{
|
||||
success: boolean;
|
||||
type?: string;
|
||||
headers?: string[];
|
||||
rows?: string[][];
|
||||
key_values?: Record<string, string>;
|
||||
list_items?: string[];
|
||||
summary?: string;
|
||||
filename?: string;
|
||||
analysis_type?: string;
|
||||
result?: any;
|
||||
error?: string;
|
||||
}> {
|
||||
const formData = new FormData();
|
||||
formData.append('file', file);
|
||||
if (file) {
|
||||
formData.append('file', file);
|
||||
}
|
||||
if (docId) {
|
||||
formData.append('doc_id', docId);
|
||||
}
|
||||
if (userHint) {
|
||||
formData.append('user_hint', userHint);
|
||||
}
|
||||
|
||||
const url = `${BACKEND_BASE_URL}/ai/analyze/word`;
|
||||
const params = new URLSearchParams();
|
||||
params.append('analysis_type', analysisType);
|
||||
|
||||
const url = `${BACKEND_BASE_URL}/ai/analyze/word?${params.toString()}`;
|
||||
|
||||
try {
|
||||
const response = await fetch(url, {
|
||||
@@ -1529,61 +1858,67 @@ export const aiApi = {
|
||||
}
|
||||
},
|
||||
|
||||
// ==================== 对话历史 API ====================
|
||||
|
||||
/**
|
||||
* 智能对话(支持多轮对话的指令执行)
|
||||
* 获取对话历史
|
||||
*/
|
||||
async instructionChat(
|
||||
instruction: string,
|
||||
docIds?: string[],
|
||||
context?: Record<string, any>
|
||||
): Promise<{
|
||||
async getConversationHistory(conversationId: string, limit: number = 20): Promise<{
|
||||
success: boolean;
|
||||
intent: string;
|
||||
result: Record<string, any>;
|
||||
message: string;
|
||||
hint?: string;
|
||||
}> {
|
||||
const url = `${BACKEND_BASE_URL}/instruction/chat`;
|
||||
|
||||
try {
|
||||
const response = await fetch(url, {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({ instruction, doc_ids: docIds, context }),
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
const error = await response.json();
|
||||
throw new Error(error.detail || '对话处理失败');
|
||||
}
|
||||
|
||||
return await response.json();
|
||||
} catch (error) {
|
||||
console.error('对话处理失败:', error);
|
||||
throw error;
|
||||
}
|
||||
},
|
||||
|
||||
/**
|
||||
* 获取支持的指令类型列表
|
||||
*/
|
||||
async getSupportedIntents(): Promise<{
|
||||
intents: Array<{
|
||||
intent: string;
|
||||
name: string;
|
||||
examples: string[];
|
||||
params: string[];
|
||||
messages: Array<{
|
||||
role: string;
|
||||
content: string;
|
||||
intent?: string;
|
||||
created_at: string;
|
||||
}>;
|
||||
}> {
|
||||
const url = `${BACKEND_BASE_URL}/instruction/intents`;
|
||||
const url = `${BACKEND_BASE_URL}/conversation/${conversationId}/history?limit=${limit}`;
|
||||
|
||||
try {
|
||||
const response = await fetch(url);
|
||||
if (!response.ok) throw new Error('获取指令列表失败');
|
||||
if (!response.ok) throw new Error('获取对话历史失败');
|
||||
return await response.json();
|
||||
} catch (error) {
|
||||
console.error('获取指令列表失败:', error);
|
||||
throw error;
|
||||
console.error('获取对话历史失败:', error);
|
||||
return { success: false, messages: [] };
|
||||
}
|
||||
},
|
||||
|
||||
/**
 * Delete the history of one conversation.
 *
 * Never rejects: on any failure the error is logged and
 * `{ success: false }` is returned.
 *
 * @param conversationId - Id of the conversation to delete.
 */
async deleteConversation(conversationId: string): Promise<{
  success: boolean;
}> {
  // Encode the id so characters such as "/", "?" or "#" cannot change the request path.
  const url = `${BACKEND_BASE_URL}/conversation/${encodeURIComponent(conversationId)}`;

  try {
    const response = await fetch(url, { method: 'DELETE' });
    if (!response.ok) throw new Error('删除对话历史失败');
    return await response.json();
  } catch (error) {
    console.error('删除对话历史失败:', error);
    return { success: false };
  }
},
|
||||
|
||||
/**
 * Fetch the list of conversations.
 *
 * Never rejects: on any failure the error is logged and an empty list is
 * returned with `success: false`.
 *
 * @param limit - Sent as the `limit` query parameter (default 50).
 */
async listConversations(limit: number = 50): Promise<{
  success: boolean;
  conversations: Array<any>;
}> {
  const endpoint = `${BACKEND_BASE_URL}/conversation/all?limit=${limit}`;

  try {
    const res = await fetch(endpoint);
    if (res.ok) {
      return await res.json();
    }
    throw new Error('获取会话列表失败');
  } catch (err) {
    console.error('获取会话列表失败:', err);
    return { success: false, conversations: [] };
  }
},
|
||||
|
||||
};
|
||||
|
||||
@@ -41,7 +41,7 @@ const Assistant: React.FC = () => {
|
||||
{
|
||||
id: '1',
|
||||
role: 'assistant',
|
||||
content: '您好!我是智联文档 AI 助手。您可以告诉我您想对文档进行的操作,例如:\n- "帮我列出最近上传的所有 docx 文档"\n- "从 2026 财报文档中提取出关键的利润数据"\n- "帮我创建一个汇总各部门报销单的填表任务"\n\n请问有什么我可以帮您的?',
|
||||
content: '您好!我是表易智融 AI 助手。您可以告诉我您想对文档进行的操作,例如:\n- "帮我列出最近上传的所有 docx 文档"\n- "从 2026 财报文档中提取出关键的利润数据"\n- "帮我创建一个汇总各部门报销单的填表任务"\n\n请问有什么我可以帮您的?',
|
||||
created_at: new Date().toISOString()
|
||||
}
|
||||
]);
|
||||
|
||||
@@ -15,12 +15,14 @@ import {
|
||||
Sparkles,
|
||||
Database,
|
||||
FileSpreadsheet,
|
||||
RefreshCcw
|
||||
RefreshCcw,
|
||||
Trash2
|
||||
} from 'lucide-react';
|
||||
import { backendApi } from '@/db/backend-api';
|
||||
import { formatDistanceToNow } from 'date-fns';
|
||||
import { zhCN } from 'date-fns/locale';
|
||||
import { cn } from '@/lib/utils';
|
||||
import { toast } from 'sonner';
|
||||
|
||||
type DocumentItem = {
|
||||
doc_id: string;
|
||||
@@ -87,7 +89,7 @@ const Dashboard: React.FC = () => {
|
||||
<section className="flex flex-col md:flex-row md:items-center justify-between gap-4">
|
||||
<div className="space-y-1">
|
||||
<h1 className="text-3xl font-extrabold tracking-tight">
|
||||
欢迎使用 <span className="text-primary">智联文档</span> 系统 👋
|
||||
欢迎使用 <span className="text-primary">表易智融</span> 系统 👋
|
||||
</h1>
|
||||
<p className="text-muted-foreground">基于大语言模型的文档理解与多源数据融合系统</p>
|
||||
</div>
|
||||
@@ -108,7 +110,7 @@ const Dashboard: React.FC = () => {
|
||||
<div className="grid grid-cols-1 md:grid-cols-3 gap-6">
|
||||
{[
|
||||
{ label: '已上传文档', value: stats.docs, icon: FileText, color: 'bg-blue-500', trend: '非结构化文档', link: '/documents' },
|
||||
{ label: 'Excel 文件', value: stats.excelFiles, icon: FileSpreadsheet, color: 'bg-emerald-500', trend: '结构化数据', link: '/excel-parse' },
|
||||
{ label: 'Excel 文件', value: stats.excelFiles, icon: FileSpreadsheet, color: 'bg-emerald-500', trend: '结构化数据', link: '/documents' },
|
||||
{ label: '填表任务', value: stats.tasks, icon: TableProperties, color: 'bg-indigo-500', trend: '待实现', link: '/form-fill' }
|
||||
].map((stat, i) => (
|
||||
<Card key={i} className="border-none shadow-md overflow-hidden group hover:shadow-xl transition-all duration-300">
|
||||
@@ -164,8 +166,30 @@ const Dashboard: React.FC = () => {
|
||||
{doc.doc_type.toUpperCase()} • {formatDistanceToNow(new Date(doc.created_at), { addSuffix: true, locale: zhCN })}
|
||||
</p>
|
||||
</div>
|
||||
<div className="px-2 py-1 rounded-full text-[10px] font-bold uppercase tracking-wider bg-muted">
|
||||
{doc.doc_type}
|
||||
<div className="flex items-center gap-2">
|
||||
<div className="px-2 py-1 rounded-full text-[10px] font-bold uppercase tracking-wider bg-muted">
|
||||
{doc.doc_type}
|
||||
</div>
|
||||
<Button
|
||||
variant="ghost"
|
||||
size="icon"
|
||||
className="opacity-0 group-hover:opacity-100 text-destructive hover:bg-destructive/10 transition-opacity"
|
||||
onClick={async (e) => {
|
||||
e.stopPropagation();
|
||||
if (!confirm(`确定要删除 "${doc.original_filename}" 吗?`)) return;
|
||||
try {
|
||||
const result = await backendApi.deleteDocument(doc.doc_id);
|
||||
if (result.success) {
|
||||
setRecentDocs(prev => prev.filter(d => d.doc_id !== doc.doc_id));
|
||||
toast.success('文档已删除');
|
||||
}
|
||||
} catch (err: any) {
|
||||
toast.error(err.message || '删除失败');
|
||||
}
|
||||
}}
|
||||
>
|
||||
<Trash2 size={16} />
|
||||
</Button>
|
||||
</div>
|
||||
</div>
|
||||
))}
|
||||
@@ -197,7 +221,7 @@ const Dashboard: React.FC = () => {
|
||||
<div className="grid grid-cols-1 sm:grid-cols-2 gap-4">
|
||||
{[
|
||||
{ title: '上传文档', desc: '支持 docx/md/txt', icon: FileText, link: '/documents', color: 'bg-blue-500' },
|
||||
{ title: '解析 Excel', desc: '上传并分析数据', icon: FileSpreadsheet, link: '/excel-parse', color: 'bg-emerald-500' },
|
||||
{ title: '解析 Excel', desc: '上传并分析数据', icon: FileSpreadsheet, link: '/documents', color: 'bg-emerald-500' },
|
||||
{ title: '智能填表', desc: '自动填写表格模板', icon: TableProperties, link: '/form-fill', color: 'bg-indigo-500' },
|
||||
{ title: 'AI 助手', desc: '自然语言交互', icon: MessageSquareCode, link: '/assistant', color: 'bg-amber-500' }
|
||||
].map((item, i) => (
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,26 +1,10 @@
|
||||
import React, { useState, useRef, useEffect } from 'react';
|
||||
import {
|
||||
Send,
|
||||
Bot,
|
||||
User,
|
||||
Sparkles,
|
||||
Trash2,
|
||||
RefreshCcw,
|
||||
FileText,
|
||||
TableProperties,
|
||||
ChevronRight,
|
||||
ArrowRight,
|
||||
Loader2,
|
||||
Download,
|
||||
Search,
|
||||
MessageSquare,
|
||||
CheckCircle
|
||||
} from 'lucide-react';
|
||||
import { Send, Bot, User, Sparkles, Trash2, FileText, TableProperties, ArrowRight, Search, MessageSquare } from 'lucide-react';
|
||||
import { Button } from '@/components/ui/button';
|
||||
import { Input } from '@/components/ui/input';
|
||||
import { Card, CardContent, CardHeader, CardTitle } from '@/components/ui/card';
|
||||
import { ScrollArea } from '@/components/ui/scroll-area';
|
||||
import { Badge } from '@/components/ui/badge';
|
||||
import { Markdown } from '@/components/ui/markdown';
|
||||
import { backendApi } from '@/db/backend-api';
|
||||
import { toast } from 'sonner';
|
||||
import { cn } from '@/lib/utils';
|
||||
@@ -39,8 +23,21 @@ const InstructionChat: React.FC = () => {
|
||||
const [input, setInput] = useState('');
|
||||
const [loading, setLoading] = useState(false);
|
||||
const [currentDocIds, setCurrentDocIds] = useState<string[]>([]);
|
||||
const [conversationId, setConversationId] = useState<string>('');
|
||||
const scrollAreaRef = useRef<HTMLDivElement>(null);
|
||||
|
||||
// Initialise the conversation id once on mount: reuse the id persisted in
// localStorage, otherwise generate a new one and persist it.
useEffect(() => {
  const existingId = localStorage.getItem('chat_conversation_id');
  if (existingId) {
    setConversationId(existingId);
    return;
  }

  const timestamp = Date.now();
  const randomSuffix = Math.random().toString(36).substring(7);
  const generatedId = `conv_${timestamp}_${randomSuffix}`;
  setConversationId(generatedId);
  localStorage.setItem('chat_conversation_id', generatedId);
}, []);
|
||||
|
||||
useEffect(() => {
|
||||
// Initial welcome message
|
||||
if (messages.length === 0) {
|
||||
@@ -48,7 +45,7 @@ const InstructionChat: React.FC = () => {
|
||||
{
|
||||
id: 'welcome',
|
||||
role: 'assistant',
|
||||
content: `您好!我是智联文档 AI 助手。
|
||||
content: `您好!我是表易智融 AI 助手。
|
||||
|
||||
**📄 文档智能操作**
|
||||
- "提取文档中的医院数量和床位数"
|
||||
@@ -119,7 +116,8 @@ const InstructionChat: React.FC = () => {
|
||||
// 使用真实的智能指令 API
|
||||
const response = await backendApi.instructionChat(
|
||||
input.trim(),
|
||||
currentDocIds.length > 0 ? currentDocIds : undefined
|
||||
currentDocIds.length > 0 ? currentDocIds : undefined,
|
||||
{ conversation_id: conversationId }
|
||||
);
|
||||
|
||||
// 根据意图类型生成友好响应
|
||||
@@ -135,11 +133,12 @@ const InstructionChat: React.FC = () => {
|
||||
responseContent = `✅ 已提取到 ${keys.length} 个字段的数据:\n\n`;
|
||||
for (const [key, value] of Object.entries(extracted)) {
|
||||
const values = Array.isArray(value) ? value : [value];
|
||||
responseContent += `**${key}**: ${values.slice(0, 3).join(', ')}${values.length > 3 ? '...' : ''}\n`;
|
||||
const displayValues = values.length > 10 ? values.slice(0, 10).join(', ') + ` ...(共${values.length}条)` : values.join(', ');
|
||||
responseContent += `**${key}**: ${displayValues}\n`;
|
||||
}
|
||||
responseContent += `\n💡 您可以将这些数据填入表格。`;
|
||||
responseContent += `\n💡 可直接使用以上数据,或说"填入表格"继续填表操作。`;
|
||||
} else {
|
||||
responseContent = '未能从文档中提取到相关数据。请尝试更明确的字段名称。';
|
||||
responseContent = resultData?.message || '未能从文档中提取到相关数据。请尝试更明确的字段名称。';
|
||||
}
|
||||
break;
|
||||
|
||||
@@ -151,24 +150,24 @@ const InstructionChat: React.FC = () => {
|
||||
responseContent = `✅ 填表完成!成功填写 ${filledKeys.length} 个字段:\n\n`;
|
||||
for (const [key, value] of Object.entries(filled)) {
|
||||
const values = Array.isArray(value) ? value : [value];
|
||||
responseContent += `**${key}**: ${values.slice(0, 3).join(', ')}\n`;
|
||||
const displayValues = values.length > 10 ? values.slice(0, 10).join(', ') + ` ...(共${values.length}条)` : values.join(', ');
|
||||
responseContent += `**${key}**: ${displayValues}\n`;
|
||||
}
|
||||
responseContent += `\n📋 请到【智能填表】页面查看或导出结果。`;
|
||||
} else {
|
||||
responseContent = '填表未能提取到数据。请检查模板表头和数据源内容。';
|
||||
responseContent = resultData?.message || '填表未能提取到数据。请检查模板表头和数据源内容。';
|
||||
}
|
||||
break;
|
||||
|
||||
case 'summarize':
|
||||
// 摘要结果
|
||||
const summaries = resultData?.summaries || [];
|
||||
if (summaries.length > 0) {
|
||||
responseContent = `📄 找到 ${summaries.length} 个文档的摘要:\n\n`;
|
||||
summaries.forEach((s: any, idx: number) => {
|
||||
responseContent += `**${idx + 1}. ${s.filename}**\n${s.content_preview}\n\n`;
|
||||
});
|
||||
if (resultData?.action_needed === 'provide_document' || resultData?.action_needed === 'upload_document') {
|
||||
responseContent = `📋 ${resultData.message}\n\n${resultData.suggestion || ''}`;
|
||||
} else if (resultData?.ai_summary) {
|
||||
// AI 生成的摘要
|
||||
responseContent = `📄 **${resultData.filename}** 摘要分析:\n\n${resultData.ai_summary}`;
|
||||
} else {
|
||||
responseContent = '未能生成摘要。请确保已上传文档。';
|
||||
responseContent = resultData?.message || '未能生成摘要。请确保已上传文档。';
|
||||
}
|
||||
break;
|
||||
|
||||
@@ -176,8 +175,10 @@ const InstructionChat: React.FC = () => {
|
||||
// 问答结果
|
||||
if (resultData?.answer) {
|
||||
responseContent = `**问题**: ${resultData.question}\n\n**答案**: ${resultData.answer}`;
|
||||
} else if (resultData?.context_preview) {
|
||||
responseContent = `**问题**: ${resultData.question}\n\n**相关上下文**:\n${resultData.context_preview}`;
|
||||
} else {
|
||||
responseContent = resultData?.message || '我找到了相关信息,请查看上文。';
|
||||
responseContent = resultData?.message || '请先上传文档,我才能回答您的问题。';
|
||||
}
|
||||
break;
|
||||
|
||||
@@ -207,8 +208,35 @@ const InstructionChat: React.FC = () => {
|
||||
}
|
||||
break;
|
||||
|
||||
case 'edit':
|
||||
// 文档编辑结果
|
||||
if (resultData?.edited_content) {
|
||||
responseContent = `✏️ **${resultData.original_filename}** 编辑完成:\n\n${resultData.edited_content.substring(0, 500)}${resultData.edited_content.length > 500 ? '\n\n...(内容已截断)' : ''}`;
|
||||
} else {
|
||||
responseContent = resultData?.message || '编辑完成。';
|
||||
}
|
||||
break;
|
||||
|
||||
case 'transform':
|
||||
// 格式转换结果
|
||||
if (resultData?.excel_data) {
|
||||
responseContent = `🔄 格式转换完成!\n\n已转换为 **Excel** 格式,共 **${resultData.excel_data.length}** 行数据。\n\n${resultData.message || ''}`;
|
||||
} else if (resultData?.content) {
|
||||
responseContent = `🔄 格式转换完成!\n\n目标格式: **${resultData.target_format?.toUpperCase()}**\n\n${resultData.message || ''}`;
|
||||
} else {
|
||||
responseContent = resultData?.message || '格式转换完成。';
|
||||
}
|
||||
break;
|
||||
|
||||
case 'unknown':
|
||||
responseContent = `我理解您想要: "${input.trim()}"\n\n但我目前无法完成此操作。您可以尝试:\n\n1. **提取数据**: "提取医院数量和床位数"\n2. **填表**: "根据这些数据填表"\n3. **总结**: "总结这份文档"\n4. **问答**: "文档里说了什么?"\n5. **搜索**: "搜索相关内容"`;
|
||||
// 检查是否需要用户上传文档
|
||||
if (resultData?.suggestion) {
|
||||
responseContent = resultData.suggestion;
|
||||
} else if (resultData?.message && resultData.message !== '无法理解该指令,请尝试更明确的描述') {
|
||||
responseContent = resultData.message;
|
||||
} else {
|
||||
responseContent = `我理解您想要: "${input.trim()}"\n\n请尝试以下操作:\n\n1. **提取数据**: "提取医院数量和床位数"\n2. **填表**: "根据这些数据填表"\n3. **总结**: "总结这份文档"\n4. **问答**: "文档里说了什么?"\n5. **搜索**: "搜索相关内容"`;
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
@@ -299,9 +327,11 @@ const InstructionChat: React.FC = () => {
|
||||
? "bg-primary text-primary-foreground shadow-xl shadow-primary/20 rounded-tr-none"
|
||||
: "bg-white border border-border/50 shadow-md rounded-tl-none"
|
||||
)}>
|
||||
<p className="text-sm leading-relaxed whitespace-pre-wrap font-medium">
|
||||
{m.content}
|
||||
</p>
|
||||
{m.role === 'assistant' ? (
|
||||
<Markdown content={m.content} className="text-sm leading-relaxed prose prose-sm max-w-none" />
|
||||
) : (
|
||||
<p className="text-sm leading-relaxed whitespace-pre-wrap font-medium">{m.content}</p>
|
||||
)}
|
||||
<span className={cn(
|
||||
"text-[10px] block opacity-50 font-bold tracking-widest",
|
||||
m.role === 'user' ? "text-right" : "text-left"
|
||||
|
||||
446
frontend/src/pages/PdfConverter.tsx
Normal file
446
frontend/src/pages/PdfConverter.tsx
Normal file
@@ -0,0 +1,446 @@
|
||||
/**
|
||||
* PDF 转换页面
|
||||
* 支持将 Word、Excel、Txt、Markdown 格式转换为 PDF
|
||||
*/
|
||||
import React, { useState, useCallback } from 'react';
|
||||
import { useDropzone } from 'react-dropzone';
|
||||
import {
|
||||
FileText,
|
||||
Upload,
|
||||
Download,
|
||||
FileSpreadsheet,
|
||||
File as FileIcon,
|
||||
Loader2,
|
||||
CheckCircle,
|
||||
AlertCircle,
|
||||
Trash2,
|
||||
FileDown,
|
||||
X,
|
||||
Copy
|
||||
} from 'lucide-react';
|
||||
import { Button } from '@/components/ui/button';
|
||||
import { Card, CardContent, CardHeader, CardTitle, CardDescription } from '@/components/ui/card';
|
||||
import { Badge } from '@/components/ui/badge';
|
||||
import { Label } from '@/components/ui/label';
|
||||
import { toast } from 'sonner';
|
||||
import { cn } from '@/lib/utils';
|
||||
import { backendApi } from '@/db/backend-api';
|
||||
|
||||
/** Per-file conversion state tracked by the PDF converter page. */
type FileState = {
  /** The file that was dropped or selected. */
  file: File;
  /** Where the file is in the conversion flow. */
  status:
    | 'pending'
    | 'converting'
    | 'success'
    | 'failed';
  /** Progress value; the page sets 0, 10 and 100. */
  progress: number;
  /** The converted PDF, stored after a successful conversion. */
  pdfBlob?: Blob;
  /** Failure message, stored after a failed conversion. */
  error?: string;
};
|
||||
|
||||
/** Source formats listed by the converter page: extension, display name, icon and colour key. */
const SUPPORTED_FORMATS = [
  {
    ext: 'docx',
    name: 'Word 文档',
    icon: FileText,
    color: 'blue',
  },
  {
    ext: 'xlsx',
    name: 'Excel 表格',
    icon: FileSpreadsheet,
    color: 'emerald',
  },
  {
    ext: 'txt',
    name: '文本文件',
    icon: FileIcon,
    color: 'gray',
  },
  {
    ext: 'md',
    name: 'Markdown',
    icon: FileText,
    color: 'purple',
  },
];
|
||||
|
||||
const PdfConverter: React.FC = () => {
|
||||
const [files, setFiles] = useState<FileState[]>([]);
|
||||
const [converting, setConverting] = useState(false);
|
||||
const [convertedCount, setConvertedCount] = useState(0);
|
||||
|
||||
// Dropzone callback: queue every accepted file as a pending entry at the end of the list.
const onDrop = useCallback((acceptedFiles: File[]) => {
  const queued = acceptedFiles.map((file): FileState => {
    return { file, status: 'pending', progress: 0 };
  });
  setFiles(current => current.concat(queued));
}, []);
|
||||
|
||||
const { getRootProps, getInputProps, isDragActive } = useDropzone({
|
||||
onDrop,
|
||||
accept: {
|
||||
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': ['.docx'],
|
||||
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': ['.xlsx'],
|
||||
'application/vnd.ms-excel': ['.xls'],
|
||||
'text/markdown': ['.md'],
|
||||
'text/plain': ['.txt'],
|
||||
},
|
||||
multiple: true,
|
||||
});
|
||||
|
||||
const handleConvert = async () => {
|
||||
if (files.length === 0) {
|
||||
toast.error('请先上传文件');
|
||||
return;
|
||||
}
|
||||
|
||||
setConverting(true);
|
||||
setConvertedCount(0);
|
||||
|
||||
const pendingFiles = files.filter(f => f.status === 'pending' || f.status === 'failed');
|
||||
let successCount = 0;
|
||||
|
||||
for (let i = 0; i < pendingFiles.length; i++) {
|
||||
const fileState = pendingFiles[i];
|
||||
const fileIndex = files.findIndex(f => f.file === fileState.file);
|
||||
|
||||
// 更新状态为转换中
|
||||
setFiles(prev => prev.map((f, idx) =>
|
||||
idx === fileIndex ? { ...f, status: 'converting', progress: 10 } : f
|
||||
));
|
||||
|
||||
try {
|
||||
// 获取 PDF blob
|
||||
const pdfBlob = await backendApi.convertToPdf(fileState.file);
|
||||
|
||||
// 触发下载
|
||||
const url = URL.createObjectURL(pdfBlob);
|
||||
const a = document.createElement('a');
|
||||
a.href = url;
|
||||
a.download = `${fileState.file.name.replace(/\.[^.]+$/, '')}.pdf`;
|
||||
document.body.appendChild(a);
|
||||
a.click();
|
||||
document.body.removeChild(a);
|
||||
URL.revokeObjectURL(url);
|
||||
|
||||
// 保存 blob 以便批量下载
|
||||
setFiles(prev => prev.map((f, idx) =>
|
||||
idx === fileIndex ? { ...f, status: 'success', progress: 100, pdfBlob } : f
|
||||
));
|
||||
successCount++;
|
||||
setConvertedCount(successCount);
|
||||
toast.success(`${fileState.file.name} 下载已开始`);
|
||||
} catch (error: any) {
|
||||
setFiles(prev => prev.map((f, idx) =>
|
||||
idx === fileIndex ? { ...f, status: 'failed', error: error.message || '转换失败' } : f
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
setConverting(false);
|
||||
toast.success(`转换完成:${successCount}/${pendingFiles.length} 个文件`);
|
||||
};
|
||||
|
||||
const handleDownload = (fileState: FileState) => {
|
||||
if (!fileState.pdfBlob) return;
|
||||
|
||||
const url = URL.createObjectURL(fileState.pdfBlob);
|
||||
const link = document.createElement('a');
|
||||
link.href = url;
|
||||
link.download = `${fileState.file.name.replace(/\.[^.]+$/, '')}.pdf`;
|
||||
document.body.appendChild(link);
|
||||
link.click();
|
||||
document.body.removeChild(link);
|
||||
URL.revokeObjectURL(url);
|
||||
};
|
||||
|
||||
const handleDownloadAll = async () => {
|
||||
const successFiles = files.filter(f => f.status === 'success' && f.pdfBlob);
|
||||
|
||||
if (successFiles.length === 0) {
|
||||
toast.error('没有可下载的文件');
|
||||
return;
|
||||
}
|
||||
|
||||
if (successFiles.length === 1) {
|
||||
handleDownload(successFiles[0]);
|
||||
return;
|
||||
}
|
||||
|
||||
// 多个文件,下载 ZIP
|
||||
try {
|
||||
const zipBlob = await backendApi.batchConvertToPdf(
|
||||
successFiles.map(f => f.file)
|
||||
);
|
||||
const url = URL.createObjectURL(zipBlob);
|
||||
const link = document.createElement('a');
|
||||
link.href = url;
|
||||
link.download = 'converted_pdfs.zip';
|
||||
document.body.appendChild(link);
|
||||
link.click();
|
||||
document.body.removeChild(link);
|
||||
URL.revokeObjectURL(url);
|
||||
toast.success('ZIP 下载开始');
|
||||
} catch (error: any) {
|
||||
toast.error(error.message || '下载失败');
|
||||
}
|
||||
};
|
||||
|
||||
const handleRemove = (index: number) => {
|
||||
setFiles(prev => prev.filter((_, i) => i !== index));
|
||||
};
|
||||
|
||||
const handleClear = () => {
|
||||
setFiles([]);
|
||||
setConvertedCount(0);
|
||||
};
|
||||
|
||||
const getFileIcon = (filename: string) => {
|
||||
const ext = filename.split('.').pop()?.toLowerCase();
|
||||
const format = SUPPORTED_FORMATS.find(f => f.ext === ext);
|
||||
if (!format) return FileIcon;
|
||||
return format.icon;
|
||||
};
|
||||
|
||||
const getFileColor = (filename: string) => {
|
||||
const ext = filename.split('.').pop()?.toLowerCase();
|
||||
const format = SUPPORTED_FORMATS.find(f => f.ext === ext);
|
||||
return format?.color || 'gray';
|
||||
};
|
||||
|
||||
const colorClasses: Record<string, string> = {
|
||||
blue: 'bg-blue-500/10 text-blue-500',
|
||||
emerald: 'bg-emerald-500/10 text-emerald-500',
|
||||
purple: 'bg-purple-500/10 text-purple-500',
|
||||
gray: 'bg-gray-500/10 text-gray-500',
|
||||
};
|
||||
|
||||
return (
|
||||
<div className="space-y-8 pb-10">
|
||||
<section className="flex flex-col md:flex-row md:items-center justify-between gap-4">
|
||||
<div className="space-y-1">
|
||||
<h1 className="text-3xl font-extrabold tracking-tight">文档转 PDF</h1>
|
||||
<p className="text-muted-foreground">将 Word、Excel、文本、Markdown 文件转换为 PDF 格式</p>
|
||||
</div>
|
||||
{files.length > 0 && (
|
||||
<div className="flex gap-2">
|
||||
<Button variant="outline" onClick={handleClear}>
|
||||
<Trash2 size={18} className="mr-2" />
|
||||
清空
|
||||
</Button>
|
||||
<Button onClick={handleDownloadAll} disabled={files.filter(f => f.status === 'success').length === 0}>
|
||||
<Download size={18} className="mr-2" />
|
||||
打包下载 ({files.filter(f => f.status === 'success').length})
|
||||
</Button>
|
||||
</div>
|
||||
)}
|
||||
</section>
|
||||
|
||||
<div className="grid grid-cols-1 lg:grid-cols-3 gap-6">
|
||||
{/* 左侧:上传区域 */}
|
||||
<div className="lg:col-span-1 space-y-6">
|
||||
{/* 上传卡片 */}
|
||||
<Card className="border-none shadow-md">
|
||||
<CardHeader className="pb-4">
|
||||
<CardTitle className="flex items-center gap-2">
|
||||
<Upload className="text-primary" size={20} />
|
||||
上传文件
|
||||
</CardTitle>
|
||||
<CardDescription>拖拽或点击上传要转换的文件</CardDescription>
|
||||
</CardHeader>
|
||||
<CardContent className="space-y-4">
|
||||
<div
|
||||
{...getRootProps()}
|
||||
className={cn(
|
||||
"border-2 border-dashed rounded-2xl p-8 transition-all duration-300 flex flex-col items-center justify-center text-center cursor-pointer group",
|
||||
isDragActive ? "border-primary bg-primary/5" : "border-muted-foreground/20 hover:border-primary/50 hover:bg-primary/5",
|
||||
converting && "opacity-50 pointer-events-none"
|
||||
)}
|
||||
>
|
||||
<input {...getInputProps()} />
|
||||
<div className="w-14 h-14 rounded-xl bg-primary/10 text-primary flex items-center justify-center mb-4 group-hover:scale-110 transition-transform">
|
||||
{converting ? <Loader2 className="animate-spin" size={28} /> : <Upload size={28} />}
|
||||
</div>
|
||||
<p className="font-semibold text-sm">
|
||||
{isDragActive ? '释放以开始上传' : '点击或拖拽文件到这里'}
|
||||
</p>
|
||||
<div className="mt-4 flex flex-wrap justify-center gap-2">
|
||||
{SUPPORTED_FORMATS.map(format => (
|
||||
<Badge key={format.ext} variant="outline" className={cn("text-xs", colorClasses[format.color])}>
|
||||
{format.name}
|
||||
</Badge>
|
||||
))}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{/* 转换按钮 */}
|
||||
{files.length > 0 && (
|
||||
<Button
|
||||
onClick={handleConvert}
|
||||
disabled={converting || files.filter(f => f.status === 'pending' || f.status === 'failed').length === 0}
|
||||
className="w-full bg-gradient-to-r from-primary to-purple-600 hover:from-primary/90 hover:to-purple-600/90"
|
||||
>
|
||||
{converting ? (
|
||||
<>
|
||||
<Loader2 className="mr-2 animate-spin" size={16} />
|
||||
转换中... ({convertedCount}/{files.filter(f => f.status === 'pending' || f.status === 'failed').length})
|
||||
</>
|
||||
) : (
|
||||
<>
|
||||
<FileDown className="mr-2" size={16} />
|
||||
开始转换 ({files.filter(f => f.status === 'pending' || f.status === 'failed').length})
|
||||
</>
|
||||
)}
|
||||
</Button>
|
||||
)}
|
||||
</CardContent>
|
||||
</Card>
|
||||
|
||||
{/* 格式说明 */}
|
||||
<Card className="border-none shadow-md">
|
||||
<CardHeader className="pb-4">
|
||||
<CardTitle className="flex items-center gap-2">
|
||||
<FileText className="text-primary" size={20} />
|
||||
支持的格式
|
||||
</CardTitle>
|
||||
</CardHeader>
|
||||
<CardContent>
|
||||
<div className="space-y-3">
|
||||
{SUPPORTED_FORMATS.map(format => {
|
||||
const Icon = format.icon;
|
||||
return (
|
||||
<div key={format.ext} className="flex items-center gap-3 p-2 rounded-lg hover:bg-muted/30 transition-colors">
|
||||
<div className={cn("w-8 h-8 rounded flex items-center justify-center", colorClasses[format.color])}>
|
||||
<Icon size={16} />
|
||||
</div>
|
||||
<div className="flex-1">
|
||||
<p className="text-sm font-medium">.{format.ext.toUpperCase()}</p>
|
||||
<p className="text-xs text-muted-foreground">{format.name}</p>
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
})}
|
||||
</div>
|
||||
</CardContent>
|
||||
</Card>
|
||||
</div>
|
||||
|
||||
{/* 右侧:文件列表 */}
|
||||
<div className="lg:col-span-2 space-y-6">
|
||||
<Card className="border-none shadow-md">
|
||||
<CardHeader>
|
||||
<div className="flex items-center justify-between">
|
||||
<div className="space-y-1">
|
||||
<CardTitle className="flex items-center gap-2">
|
||||
<FileIcon className="text-primary" size={20} />
|
||||
文件列表
|
||||
</CardTitle>
|
||||
<CardDescription>
|
||||
共 {files.length} 个文件,已转换 {files.filter(f => f.status === 'success').length} 个
|
||||
</CardDescription>
|
||||
</div>
|
||||
</div>
|
||||
</CardHeader>
|
||||
<CardContent>
|
||||
{files.length === 0 ? (
|
||||
<div className="text-center py-12 text-muted-foreground">
|
||||
<FileIcon size={48} className="mx-auto mb-4 opacity-30" />
|
||||
<p>暂无文件,上传文件开始转换</p>
|
||||
</div>
|
||||
) : (
|
||||
<div className="space-y-3">
|
||||
{files.map((fileState, index) => {
|
||||
const Icon = getFileIcon(fileState.file.name);
|
||||
const color = getFileColor(fileState.file.name);
|
||||
|
||||
return (
|
||||
<div
|
||||
key={index}
|
||||
className="flex items-center gap-4 p-4 rounded-xl border bg-card hover:bg-muted/30 transition-colors"
|
||||
>
|
||||
<div className={cn("w-10 h-10 rounded-lg flex items-center justify-center shrink-0", colorClasses[color])}>
|
||||
<Icon size={20} />
|
||||
</div>
|
||||
<div className="flex-1 min-w-0">
|
||||
<p className="font-semibold truncate">{fileState.file.name}</p>
|
||||
<div className="flex items-center gap-2">
|
||||
<span className="text-xs text-muted-foreground">
|
||||
{(fileState.file.size / 1024).toFixed(1)} KB
|
||||
</span>
|
||||
{fileState.status === 'pending' && (
|
||||
<Badge variant="secondary" className="text-xs">待转换</Badge>
|
||||
)}
|
||||
{fileState.status === 'converting' && (
|
||||
<Badge variant="default" className="text-xs bg-blue-500">转换中</Badge>
|
||||
)}
|
||||
{fileState.status === 'success' && (
|
||||
<Badge variant="default" className="text-xs bg-emerald-500">已转换</Badge>
|
||||
)}
|
||||
{fileState.status === 'failed' && (
|
||||
<Badge variant="destructive" className="text-xs">失败</Badge>
|
||||
)}
|
||||
</div>
|
||||
{fileState.status === 'converting' && (
|
||||
<div className="mt-1 h-1 bg-muted rounded-full overflow-hidden">
|
||||
<div
|
||||
className="h-full bg-primary transition-all duration-300"
|
||||
style={{ width: `${fileState.progress}%` }}
|
||||
/>
|
||||
</div>
|
||||
)}
|
||||
{fileState.error && (
|
||||
<p className="text-xs text-destructive mt-1">{fileState.error}</p>
|
||||
)}
|
||||
</div>
|
||||
<div className="flex items-center gap-2 shrink-0">
|
||||
{fileState.status === 'success' && (
|
||||
<>
|
||||
<Button variant="ghost" size="icon" onClick={() => handleDownload(fileState)}>
|
||||
<Download size={18} className="text-emerald-500" />
|
||||
</Button>
|
||||
<Button
|
||||
variant="ghost"
|
||||
size="icon"
|
||||
onClick={() => {
|
||||
// 复制下载链接到剪贴板
|
||||
if (fileState.pdfBlob) {
|
||||
const url = URL.createObjectURL(fileState.pdfBlob);
|
||||
navigator.clipboard.writeText(url);
|
||||
toast.success('链接已复制');
|
||||
}
|
||||
}}
|
||||
>
|
||||
<Copy size={18} />
|
||||
</Button>
|
||||
</>
|
||||
)}
|
||||
{(fileState.status === 'pending' || fileState.status === 'failed') && (
|
||||
<Button
|
||||
variant="ghost"
|
||||
size="icon"
|
||||
onClick={() => handleRemove(index)}
|
||||
className="text-destructive hover:bg-destructive/10"
|
||||
>
|
||||
<X size={18} />
|
||||
</Button>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
})}
|
||||
</div>
|
||||
)}
|
||||
</CardContent>
|
||||
</Card>
|
||||
|
||||
{/* 使用说明 */}
|
||||
<Card className="border-none shadow-md bg-gradient-to-br from-primary/5 to-purple-500/5">
|
||||
<CardHeader className="pb-4">
|
||||
<CardTitle className="flex items-center gap-2">
|
||||
<FileText className="text-primary" size={20} />
|
||||
使用说明
|
||||
</CardTitle>
|
||||
</CardHeader>
|
||||
<CardContent>
|
||||
<div className="space-y-3 text-sm text-muted-foreground">
|
||||
<div className="flex gap-3">
|
||||
<div className="w-6 h-6 rounded-full bg-primary/10 text-primary flex items-center justify-center shrink-0 text-xs font-bold">1</div>
|
||||
<p>上传要转换的文件,支持 Word(.docx)、Excel(.xlsx)、文本(.txt)、Markdown(.md) 格式</p>
|
||||
</div>
|
||||
<div className="flex gap-3">
|
||||
<div className="w-6 h-6 rounded-full bg-primary/10 text-primary flex items-center justify-center shrink-0 text-xs font-bold">2</div>
|
||||
<p>点击「开始转换」按钮,系统将自动将文件转换为 PDF 格式</p>
|
||||
</div>
|
||||
<div className="flex gap-3">
|
||||
<div className="w-6 h-6 rounded-full bg-primary/10 text-primary flex items-center justify-center shrink-0 text-xs font-bold">3</div>
|
||||
<p>转换完成后,点击下载按钮获取 PDF 文件,或使用「打包下载」一次性下载所有文件</p>
|
||||
</div>
|
||||
</div>
|
||||
</CardContent>
|
||||
</Card>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
};
|
||||
|
||||
export default PdfConverter;
|
||||
@@ -248,15 +248,25 @@ const TemplateFill: React.FC = () => {
|
||||
if (!templateFile || !filledResult) return;
|
||||
|
||||
try {
|
||||
const ext = templateFile.name.split('.').pop()?.toLowerCase();
|
||||
const exportFormat = (ext === 'docx') ? 'docx' : 'xlsx';
|
||||
// 对于 Word 模板,如果已有填写后的文件(已填入表格单元格),传递其路径以便直接下载
|
||||
const filledFilePath = (ext === 'docx' && filledResult.filled_file_path)
|
||||
? filledResult.filled_file_path
|
||||
: undefined;
|
||||
const blob = await backendApi.exportFilledTemplate(
|
||||
templateId || 'temp',
|
||||
filledResult.filled_data || {},
|
||||
'xlsx'
|
||||
exportFormat,
|
||||
filledFilePath
|
||||
);
|
||||
const ext_match = templateFile.name.match(/\.([^.])+$/);
|
||||
const baseName = ext_match ? templateFile.name.replace(ext_match[0], '') : templateFile.name;
|
||||
const downloadName = `filled_${baseName}.${exportFormat}`;
|
||||
const url = URL.createObjectURL(blob);
|
||||
const a = document.createElement('a');
|
||||
a.href = url;
|
||||
a.download = `filled_${templateFile.name}`;
|
||||
a.download = downloadName;
|
||||
a.click();
|
||||
URL.revokeObjectURL(url);
|
||||
toast.success('导出成功');
|
||||
@@ -546,7 +556,7 @@ const TemplateFill: React.FC = () => {
|
||||
</div>
|
||||
<h3 className="text-xl font-bold mb-2">AI 正在智能分析并填表</h3>
|
||||
<p className="text-muted-foreground text-center max-w-md">
|
||||
系统正在从 {sourceFiles.length || sourceFilePaths.length} 份文档中检索相关信息...
|
||||
系统正在从 {sourceFiles.length || sourceFilePaths.length || sourceDocIds.length || 0} 份文档中检索相关信息...
|
||||
</p>
|
||||
</CardContent>
|
||||
</Card>
|
||||
@@ -562,7 +572,7 @@ const TemplateFill: React.FC = () => {
|
||||
填表完成
|
||||
</CardTitle>
|
||||
<CardDescription>
|
||||
系统已根据 {sourceFiles.length || sourceFilePaths.length} 份文档自动完成表格填写
|
||||
系统已根据 {filledResult.source_doc_count || sourceFiles.length || sourceFilePaths.length || sourceDocIds.length} 份文档自动完成表格填写
|
||||
</CardDescription>
|
||||
</CardHeader>
|
||||
<CardContent>
|
||||
|
||||
@@ -4,6 +4,7 @@ import Documents from '@/pages/Documents';
|
||||
import TemplateFill from '@/pages/TemplateFill';
|
||||
import InstructionChat from '@/pages/InstructionChat';
|
||||
import TaskHistory from '@/pages/TaskHistory';
|
||||
import PdfConverter from '@/pages/PdfConverter';
|
||||
import MainLayout from '@/components/layouts/MainLayout';
|
||||
|
||||
export const routes = [
|
||||
@@ -31,6 +32,10 @@ export const routes = [
|
||||
path: '/task-history',
|
||||
element: <TaskHistory />,
|
||||
},
|
||||
{
|
||||
path: '/pdf-converter',
|
||||
element: <PdfConverter />,
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
|
||||
@@ -23,7 +23,6 @@
|
||||
"noUnusedParameters": true,
|
||||
"noFallthroughCasesInSwitch": true,
|
||||
"noUncheckedSideEffectImports": true,
|
||||
"baseUrl": ".",
|
||||
"paths": {
|
||||
"@/*": ["./src/*"]
|
||||
},
|
||||
|
||||
BIN
屏幕截图 2026-04-18 002609.png
Normal file
BIN
屏幕截图 2026-04-18 002609.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 552 KiB |
Reference in New Issue
Block a user