Merge branch 'main' of https://gitea.kronecker.cc/OurCodesAreAllRight/FilesReadSystem

优化联合模板上传逻辑，支持源文档内容解析
- 移除模板文件字段提取步骤，改为直接保存模板文件
- 新增源文档解析功能，提取文档内容、标题和表格数量信息
- 修改模板填充服务，支持传入源文档内容用于AI表头生成
- 更新AI表头生成逻辑，基于源文档内容智能生成合适的表头字段
- 增强日志记录，显示源文档数量和处理进度
2026-04-09 22:44:01 +08:00 · 2026-04-09 22:43:51 +08:00
2 changed files with 59 additions and 14 deletions
--- a/backend/app/api/endpoints/templates.py
+++ b/backend/app/api/endpoints/templates.py
@@ -155,20 +155,17 @@ async def upload_joint_template(
                )
    try:
-        # 1. 保存模板文件并提取字段
+        # 1. 保存模板文件
        template_content = await template_file.read()
        template_path = file_service.save_uploaded_file(
            template_content,
            template_file.filename,
            subfolder="templates"
        )
        template_fields = await template_fill_service.get_template_fields_from_file(
            template_path,
            template_ext
        )
-        # 2. 处理源文档 - 保存文件
+        # 2. 保存并解析源文档 - 提取内容用于生成表头
        source_file_info = []
        source_contents = []
        for sf in source_files:
            if sf.filename:
                sf_content = await sf.read()
@@ -183,6 +180,28 @@ async def upload_joint_template(
                    "filename": sf.filename,
                    "ext": sf_ext
                })
                # 解析源文档获取内容（用于 AI 生成表头）
                try:
                    from app.core.document_parser import ParserFactory
                    parser = ParserFactory.get_parser(sf_path)
                    parse_result = parser.parse(sf_path)
                    if parse_result.success and parse_result.data:
                        source_contents.append({
                            "filename": sf.filename,
                            "doc_type": sf_ext,
                            "content": parse_result.data.get("content", "")[:5000] if parse_result.data.get("content") else "",
                            "titles": parse_result.data.get("titles", [])[:10] if parse_result.data.get("titles") else [],
                            "tables_count": len(parse_result.data.get("tables", [])) if parse_result.data.get("tables") else 0
                        })
                except Exception as e:
                    logger.warning(f"解析源文档失败 {sf.filename}: {e}")
        # 3. 根据源文档内容生成表头
        template_fields = await template_fill_service.get_template_fields_from_file(
            template_path,
            template_ext,
            source_contents=source_contents  # 传递源文档内容
        )
        # 4. 异步处理源文档到MongoDB
        task_id = str(uuid.uuid4())
--- a/backend/app/services/template_fill_service.py
+++ b/backend/app/services/template_fill_service.py
@@ -609,7 +609,8 @@ class TemplateFillService:
    async def get_template_fields_from_file(
        self,
        file_path: str,
-        file_type: str = "xlsx"
+        file_type: str = "xlsx",
        source_contents: List[dict] = None
    ) -> List[TemplateField]:
        """
        从模板文件提取字段定义
@@ -617,11 +618,14 @@ class TemplateFillService:
        Args:
            file_path: 模板文件路径
            file_type: 文件类型 (xlsx/xls/docx)
            source_contents: 源文档内容列表（用于 AI 生成表头）
        Returns:
            字段列表
        """
        fields = []
        if source_contents is None:
            source_contents = []
        try:
            if file_type in ["xlsx", "xls"]:
@@ -637,8 +641,8 @@ class TemplateFillService:
            )
            if needs_ai_generation:
-                logger.info(f"模板表头为空或自动生成，尝试 AI 生成表头... (fields={len(fields)})")
+                logger.info(f"模板表头为空或自动生成，尝试 AI 生成表头... (fields={len(fields)}, source_docs={len(source_contents)})")
-                ai_fields = await self._generate_fields_with_ai(file_path, file_type)
+                ai_fields = await self._generate_fields_with_ai(file_path, file_type, source_contents)
                if ai_fields:
                    fields = ai_fields
                    logger.info(f"AI 生成表头成功: {len(fields)} 个字段")
@@ -1481,7 +1485,8 @@ class TemplateFillService:
    async def _generate_fields_with_ai(
        self,
        file_path: str,
-        file_type: str
+        file_type: str,
        source_contents: List[dict] = None
    ) -> Optional[List[TemplateField]]:
        """
        使用 AI 为空表生成表头字段
@@ -1525,15 +1530,36 @@ class TemplateFillService:
                    content_sample = ""
            # 调用 AI 生成表头
-            prompt = f"""你是一个专业的表格设计助手。请为以下空白表格生成合适的表头字段。
+            # 根据源文档内容生成表头
            source_info = ""
            if source_contents:
                source_info = "\n\n【源文档内容摘要】（根据以下文档内容生成表头）：\n"
                for idx, src in enumerate(source_contents[:5]):  # 最多5个源文档
                    filename = src.get("filename", f"文档{idx+1}")
                    doc_type = src.get("doc_type", "unknown")
                    content = src.get("content", "")[:3000]  # 限制内容长度
                    titles = src.get("titles", [])[:10]  # 最多10个标题
                    tables_count = src.get("tables_count", 0)
-表格内容预览：
+                    source_info += f"\n--- 文档 {idx+1}: {filename} ({doc_type}) ---\n"
-{content_sample[:2000] if content_sample else "空白表格"}
+                    if titles:
                        source_info += f"【章节标题】: {', '.join([t.get('text', '') for t in titles[:5]])}\n"
                    if tables_count > 0:
                        source_info += f"【包含表格数】: {tables_count}\n"
                    if content:
                        source_info += f"【内容预览】: {content[:1500]}...\n"
            prompt = f"""你是一个专业的表格设计助手。请根据源文档内容生成合适的表格表头字段。
 任务：用户有一些源文档（可能包含表格数据、统计信息等），需要填写到表格中。请分析源文档内容，生成适合的表头字段。
 {source_info}
 请生成5-10个简洁的表头字段名，这些字段应该：
 1. 简洁明了，易于理解
 2. 适合作为表格列标题
-3. 之间有明显的区分度
+3. 直接对应源文档中的关键数据项
 4. 字段之间有明显的区分度
 请严格按照以下 JSON 格式输出（只需输出 JSON，不要其他内容）：
 {{