feat: 添加文档转PDF转换功能

- 后端添加 PDF 转换服务，支持将 Word(docx)、Excel(xlsx)、文本(txt)、Markdown(md) 格式转换为 PDF
- 使用 reportlab 库，支持中文字体（simhei.ttf）
- 添加 FastAPI 接口：POST /api/v1/pdf/convert 用于单文件转换，POST /api/v1/pdf/convert/batch 用于批量转换
- 前端添加 PdfConverter 页面，支持拖拽上传、转换进度显示、批量下载
- 转换流程：所有格式先转为 Markdown，再由 Markdown 转为 PDF，保证输出一致性
- DOCX 解析使用 zipfile 直接读取 XML，避免 python-docx 的兼容性问题
2026-04-20 00:00:30 +08:00
parent 581e2b0ae0
commit be302839ee
12 changed files with 1200 additions and 3 deletions
--- a/backend/app/api/init.py
+++ b/backend/app/api/init.py
@@ -15,6 +15,7 @@ from app.api.endpoints import (
    health,
    instruction,    # 智能指令
    conversation,   # 对话历史
+    pdf_converter,  # PDF转换
 )

 # 创建主路由
@@ -33,3 +34,4 @@ api_router.include_router(visualization.router)    # 可视化
 api_router.include_router(analysis_charts.router)  # 分析图表
 api_router.include_router(instruction.router)      # 智能指令
 api_router.include_router(conversation.router)     # 对话历史
+api_router.include_router(pdf_converter.router)   # PDF转换
--- a/backend/app/api/endpoints/pdf_converter.py
+++ b/backend/app/api/endpoints/pdf_converter.py
@@ -0,0 +1,208 @@
+"""
+PDF 转换 API 接口
+
+提供将 Word、Excel、Txt、Markdown 转换为 PDF 的功能
+"""
+import logging
+import uuid
+from typing import Optional
+
+from fastapi import APIRouter, UploadFile, File, Form, HTTPException
+from fastapi.responses import StreamingResponse
+
+from app.services.pdf_converter_service import pdf_converter_service
+from app.services.file_service import file_service
+
+logger = logging.getLogger(__name__)
+
+router = APIRouter(prefix="/pdf", tags=["PDF转换"])
+
+# Temporary store for converted PDFs, keyed by download_id; each value is a
+# (pdf_content, original_filename) tuple. download_pdf pops entries from it.
+# NOTE(review): no code visible in this file adds entries - confirm where the
+# cache is populated.
+_pdf_cache: dict = {}
+
+
+# ==================== 请求/响应模型 ====================
+
+class ConvertResponse:
+    """Plain holder for a conversion outcome: status flag, message, filename."""
+    def __init__(self, success: bool, message: str = "", filename: str = ""):
+        # Copy each constructor argument onto the instance unchanged.
+        self.success, self.message, self.filename = success, message, filename
+
+
+# ==================== 接口 ====================
+
+@router.post("/convert")
+async def convert_to_pdf(
+    file: UploadFile = File(...),
+):
+    """
+    Convert one uploaded file to PDF and return it as an attachment.
+
+    The lower-cased extension of the upload name selects the source format
+    and must be listed in ``pdf_converter_service.supported_formats``.
+
+    Args:
+        file: The uploaded file.
+
+    Returns:
+        A ``StreamingResponse`` carrying the PDF bytes; the download is named
+        after the upload (``<stem>.pdf``).
+
+    Raises:
+        HTTPException: 400 for an unsupported extension or an empty file,
+            500 when the converter reports an error or raises.
+    """
+    # Function-local import, matching how download_pdf pulls in urllib.parse.
+    import urllib.parse
+
+    try:
+        # Split the name once; the format check, the converter call and the
+        # download name all use the same parts.
+        filename = file.filename or "document"
+        stem, dot, suffix = filename.rpartition('.')
+        file_ext = suffix.lower() if dot else ''
+        if not dot:
+            # No extension at all: the whole name is the stem.
+            stem = filename
+
+        if file_ext not in pdf_converter_service.supported_formats:
+            raise HTTPException(
+                status_code=400,
+                detail=f"不支持的格式: {file_ext}，支持的格式: {', '.join(pdf_converter_service.supported_formats)}"
+            )
+
+        content = await file.read()
+        if not content:
+            raise HTTPException(status_code=400, detail="文件内容为空")
+
+        logger.info(f"开始转换文件: {filename} ({file_ext})")
+
+        pdf_content, error = await pdf_converter_service.convert_to_pdf(
+            file_content=content,
+            source_format=file_ext,
+            filename=stem,
+        )
+
+        if error:
+            raise HTTPException(status_code=500, detail=error)
+
+        # RFC 5987 form: percent-encode so non-ASCII names survive the header.
+        # A name such as ".md" has an empty stem, so fall back to "converted".
+        encoded_name = urllib.parse.quote(f"{stem or 'converted'}.pdf")
+
+        return StreamingResponse(
+            iter([pdf_content]),
+            media_type="application/pdf",
+            headers={
+                "Content-Disposition": f"attachment; filename*=UTF-8''{encoded_name}"
+            }
+        )
+
+    except HTTPException:
+        # Already the response we want; let it through untouched.
+        raise
+    except Exception as e:
+        # logger.exception records the traceback along with the message.
+        logger.exception(f"PDF转换失败: {e}")
+        raise HTTPException(status_code=500, detail=f"转换失败: {str(e)}") from e
+
+
+@router.get("/download/{download_id}")
+async def download_pdf(download_id: str):
+    """
+    Serve a cached PDF by its download ID and drop it from the cache.
+
+    Per the original author's note, this GET route exists so that download
+    managers such as IDM can intercept the transfer.
+
+    Args:
+        download_id: Key into the module-level ``_pdf_cache``.
+
+    Returns:
+        A ``StreamingResponse`` with the PDF bytes, named ``<filename>.pdf``.
+
+    Raises:
+        HTTPException: 404 when the ID is not in the cache.
+    """
+    import urllib.parse
+
+    # Pop in a single step: the entry is removed as it is read, so each ID
+    # can be downloaded only once and no separate membership test is needed.
+    try:
+        pdf_content, filename = _pdf_cache.pop(download_id)
+    except KeyError:
+        raise HTTPException(status_code=404, detail="下载链接已过期或不存在") from None
+
+    # RFC 5987 form: percent-encode so non-ASCII filenames survive the header.
+    encoded_filename = urllib.parse.quote(f"{filename}.pdf")
+
+    # StreamingResponse is the module-level import; no local re-import needed.
+    return StreamingResponse(
+        iter([pdf_content]),
+        media_type="application/pdf",
+        headers={
+            "Content-Disposition": f"attachment; filename*=UTF-8''{encoded_filename}"
+        }
+    )
+
+
+@router.get("/formats")
+async def get_supported_formats():
+    """
+    List the source formats the converter accepts.
+
+    Returns:
+        A dict with ``success`` set to True and ``formats`` holding whatever
+        ``pdf_converter_service.get_supported_formats()`` returns.
+    """
+    formats = pdf_converter_service.get_supported_formats()
+    return dict(success=True, formats=formats)
+
+
+@router.post("/convert/batch")
+async def batch_convert_to_pdf(
+    files: list[UploadFile] = File(...),
+):
+    """
+    Convert several uploaded files to PDF and return them in one ZIP archive.
+
+    A file that fails (unsupported extension, empty content, converter error
+    or exception) is skipped rather than aborting the batch. Its message is
+    logged and, when nothing converts at all, returned in the 400 detail.
+
+    Args:
+        files: The uploaded files.
+
+    Returns:
+        A ``StreamingResponse`` carrying a ZIP with one ``<stem>.pdf`` entry
+        per converted file; clashing names get a numeric suffix.
+
+    Raises:
+        HTTPException: 400 when no file could be converted, 500 on an
+            unexpected failure.
+    """
+    try:
+        import io
+        import zipfile
+
+        results = []
+        errors = []
+
+        for file in files:
+            try:
+                # Split the name once into stem and extension.
+                filename = file.filename or "document"
+                stem, dot, suffix = filename.rpartition('.')
+                file_ext = suffix.lower() if dot else ''
+                if not dot:
+                    stem = filename
+
+                if file_ext not in pdf_converter_service.supported_formats:
+                    errors.append(f"{filename}: 不支持的格式")
+                    continue
+
+                content = await file.read()
+                # Same empty-upload check as the single-file endpoint.
+                if not content:
+                    errors.append(f"{filename}: 文件内容为空")
+                    continue
+
+                pdf_content, error = await pdf_converter_service.convert_to_pdf(
+                    file_content=content,
+                    source_format=file_ext,
+                    filename=stem,
+                )
+
+                if error:
+                    errors.append(f"{filename}: {error}")
+                else:
+                    results.append((stem, pdf_content))
+
+            except Exception as e:
+                # Best effort: one bad file must not abort the whole batch.
+                errors.append(f"{file.filename}: {str(e)}")
+
+        if not results:
+            raise HTTPException(
+                status_code=400,
+                detail=f"没有可转换的文件。错误: {'; '.join(errors)}"
+            )
+
+        # Partial failures are otherwise invisible once a ZIP is returned.
+        if errors:
+            logger.warning(f"批量转换部分文件失败: {'; '.join(errors)}")
+
+        zip_buffer = io.BytesIO()
+        used_names = set()
+        with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
+            for stem, pdf_content in results:
+                # Upload names are client-supplied and may carry directory
+                # parts; keep only the last component as the archive name.
+                base = stem.replace('\\', '/').rsplit('/', 1)[-1] or "document"
+
+                # Two uploads with the same stem (report.docx, report.xlsx)
+                # would otherwise become duplicate entries in the archive.
+                pdf_name = f"{base}.pdf"
+                counter = 1
+                while pdf_name in used_names:
+                    pdf_name = f"{base}_{counter}.pdf"
+                    counter += 1
+                used_names.add(pdf_name)
+
+                zip_file.writestr(pdf_name, pdf_content)
+
+        # getvalue() returns the whole buffer regardless of stream position.
+        return StreamingResponse(
+            iter([zip_buffer.getvalue()]),
+            media_type="application/zip",
+            headers={
+                "Content-Disposition": "attachment; filename*=UTF-8''converted_pdfs.zip"
+            }
+        )
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        # logger.exception records the traceback along with the message.
+        logger.exception(f"批量PDF转换失败: {e}")
+        raise HTTPException(status_code=500, detail=f"批量转换失败: {str(e)}") from e