tl

2026-04-10 10:24:52 +08:00
parent e5711b3f05
commit ed66aa346d
4 changed files with 527 additions and 26 deletions
--- a/backend/app/api/endpoints/ai_analyze.py
+++ b/backend/app/api/endpoints/ai_analyze.py
@@ -10,6 +10,7 @@ import os

 from app.services.excel_ai_service import excel_ai_service
 from app.services.markdown_ai_service import markdown_ai_service
+from app.services.template_fill_service import template_fill_service

 logger = logging.getLogger(__name__)

@@ -329,3 +330,74 @@ async def get_markdown_outline(
    except Exception as e:
        logger.error(f"获取 Markdown 大纲失败: {str(e)}")
        raise HTTPException(status_code=500, detail=f"获取大纲失败: {str(e)}")
+
+
+@router.post("/analyze/txt")
+async def analyze_txt(
+    file: UploadFile = File(...),
+):
+    """
+    Upload a TXT file and extract structured data from it with AI.
+
+    The uploaded bytes are decoded as UTF-8 (undecodable bytes are replaced)
+    and passed to ``template_fill_service.analyze_txt_with_ai``; this handler
+    works on the text in memory and does not write it to disk.
+
+    Args:
+        file: Uploaded file; the text after its name's last dot must be txt/text.
+
+    Returns:
+        dict: ``success``, ``filename`` and ``structured_data``. When the
+        analysis yields nothing, ``success`` is False, ``structured_data``
+        is None and an ``error`` message is included.
+
+    Raises:
+        HTTPException: 400 for an empty file name or an unsupported
+            extension, 500 when reading or analyzing the file fails.
+    """
+    if not file.filename:
+        raise HTTPException(status_code=400, detail="文件名为空")
+
+    file_ext = file.filename.split('.')[-1].lower()
+    if file_ext not in ['txt', 'text']:
+        raise HTTPException(
+            status_code=400,
+            detail=f"不支持的文件类型: {file_ext}，仅支持 .txt"
+        )
+
+    try:
+        # Read the whole upload into memory
+        content = await file.read()
+
+        logger.info(f"开始 AI 分析 TXT 文件: {file.filename}")
+
+        # The service takes text rather than a path, so no temporary file is
+        # written; errors='replace' keeps non-UTF-8 uploads from raising here.
+        result = await template_fill_service.analyze_txt_with_ai(
+            content=content.decode('utf-8', errors='replace'),
+            filename=file.filename
+        )
+
+        if result:
+            logger.info(f"TXT AI 分析成功: {file.filename}")
+            return {
+                "success": True,
+                "filename": file.filename,
+                "structured_data": result
+            }
+
+        # An empty result means the service extracted nothing or failed
+        # internally; report that in the payload instead of an HTTP error.
+        logger.warning(f"TXT AI 分析返回空结果: {file.filename}")
+        return {
+            "success": False,
+            "filename": file.filename,
+            "error": "AI 分析未能提取到结构化数据",
+            "structured_data": None
+        }
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"TXT AI 分析过程中出错: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"分析失败: {str(e)}")
--- a/backend/app/services/llm_service.py
+++ b/backend/app/services/llm_service.py
@@ -55,12 +55,20 @@ class LLMService:
        payload.update(kwargs)

        try:
+            logger.info(f"LLM API 请求: model={self.model_name}, temperature={temperature}, max_tokens={max_tokens}")
+            logger.info(f"消息数量: {len(messages)}")
+            for i, msg in enumerate(messages):
+                logger.info(f"消息[{i}]: role={msg.get('role')}, content长度={len(msg.get('content', ''))}")
+
            async with httpx.AsyncClient(timeout=60.0) as client:
                response = await client.post(
                    f"{self.base_url}/chat/completions",
                    headers=headers,
                    json=payload
                )
+                logger.info(f"LLM API 响应状态: {response.status_code}")
+                if response.status_code != 200:
+                    logger.error(f"LLM API 响应内容: {response.text}")
                response.raise_for_status()
                return response.json()

--- a/backend/app/services/template_fill_service.py
+++ b/backend/app/services/template_fill_service.py
@@ -5,6 +5,7 @@
 """
 import logging
 from dataclasses import dataclass, field
+from pathlib import Path
 from typing import Any, Dict, List, Optional

 from app.core.database import mongodb
@@ -32,6 +33,7 @@ class SourceDocument:
    doc_type: str
    content: str = ""
    structured_data: Dict[str, Any] = field(default_factory=dict)
+    ai_structured_data: Optional[Dict[str, Any]] = None  # AI 结构化分析结果缓存


@dataclass
@@ -76,12 +78,14 @@ class TemplateFillService:
        filled_data = {}
        fill_details = []

-        logger.info(f"开始填表: {len(template_fields)} 个字段, {len(source_doc_ids or [])} 个源文档")
+        logger.info(f"开始填表: {len(template_fields)} 个字段, {len(source_doc_ids or [])} 个源文档, {len(source_file_paths or [])} 个文件路径")

        # 1. 加载源文档内容
        source_docs = await self._load_source_documents(source_doc_ids, source_file_paths)

        logger.info(f"加载了 {len(source_docs)} 个源文档")
+        for doc in source_docs:
+            logger.info(f"  - 文档: {doc.filename}, 类型: {doc.doc_type}, 内容长度: {len(doc.content)}, AI分析: {bool(doc.ai_structured_data)}")

        if not source_docs:
            logger.warning("没有找到源文档，填表结果将全部为空")
@@ -140,7 +144,7 @@ class TemplateFillService:
        source_file_paths: Optional[List[str]] = None
    ) -> List[SourceDocument]:
        """
-        加载源文档内容
+        加载源文档内容，并对 TXT 文件进行 AI 结构化分析

        Args:
            source_doc_ids: MongoDB 文档 ID 列表
@@ -157,12 +161,23 @@ class TemplateFillService:
                try:
                    doc = await mongodb.get_document(doc_id)
                    if doc:
+                        doc_type = doc.get("doc_type", "unknown")
+                        content = doc.get("content", "")
+
+                        # 对 TXT 文档进行 AI 结构化分析
+                        ai_structured = None
+                        if doc_type == "txt" and content:
+                            logger.info(f"MongoDB TXT 文档需要 AI 分析: {doc_id}, 内容长度: {len(content)}")
+                            ai_structured = await self._analyze_txt_once(content, doc.get("metadata", {}).get("original_filename", "unknown"))
+                            logger.info(f"AI 分析结果: has_data={ai_structured is not None}")
+
                        source_docs.append(SourceDocument(
                            doc_id=doc_id,
                            filename=doc.get("metadata", {}).get("original_filename", "unknown"),
-                            doc_type=doc.get("doc_type", "unknown"),
-                            content=doc.get("content", ""),
-                            structured_data=doc.get("structured_data", {})
+                            doc_type=doc_type,
+                            content=content,
+                            structured_data=doc.get("structured_data", {}),
+                            ai_structured_data=ai_structured
                        ))
                        logger.info(f"从MongoDB加载文档: {doc_id}")
                except Exception as e:
@@ -170,10 +185,13 @@ class TemplateFillService:

        # 2. 从文件路径加载文档
        if source_file_paths:
+            logger.info(f"开始从文件路径加载 {len(source_file_paths)} 个文档")
            for file_path in source_file_paths:
                try:
+                    logger.info(f"  加载文件: {file_path}")
                    parser = ParserFactory.get_parser(file_path)
                    result = parser.parse(file_path)
+                    logger.info(f"  解析结果: success={result.success}, error={result.error}")
                    if result.success:
                        # result.data 的结构取决于解析器类型:
                        # - Excel 单 sheet: {columns: [...], rows: [...], row_count, column_count}
@@ -182,20 +200,149 @@ class TemplateFillService:
                        doc_data = result.data if result.data else {}
                        doc_content = doc_data.get("content", "") if isinstance(doc_data, dict) else ""
                        doc_structured = doc_data if isinstance(doc_data, dict) and "rows" in doc_data or isinstance(doc_data, dict) and "sheets" in doc_data else {}
+                        doc_type = result.metadata.get("extension", "unknown").replace(".", "").lower()
+                        logger.info(f"  文件类型: {doc_type}, 内容长度: {len(doc_content)}")
+
+                        # 对 TXT 文件进行 AI 结构化分析
+                        ai_structured = None
+                        if doc_type == "txt" and doc_content:
+                            logger.info(f"  检测到 TXT 文件，内容前100字: {doc_content[:100]}")
+                            ai_structured = await self._analyze_txt_once(doc_content, result.metadata.get("filename", Path(file_path).name))
+                            logger.info(f"  AI 分析完成: has_result={ai_structured is not None}")
+                            if ai_structured:
+                                logger.info(f"  AI 结果 keys: {list(ai_structured.keys())}")
+                                if "table" in ai_structured:
+                                    table = ai_structured.get("table", {})
+                                    logger.info(f"  AI 表格: {len(table.get('columns', []))} 列, {len(table.get('rows', []))} 行")

                        source_docs.append(SourceDocument(
                            doc_id=file_path,
-                            filename=result.metadata.get("filename", file_path.split("/")[-1]),
-                            doc_type=result.metadata.get("extension", "unknown").replace(".", ""),
+                            filename=result.metadata.get("filename", Path(file_path).name),
+                            doc_type=doc_type,
                            content=doc_content,
-                            structured_data=doc_structured
+                            structured_data=doc_structured,
+                            ai_structured_data=ai_structured
                        ))
-                        logger.info(f"从文件加载文档: {file_path}, content长度: {len(doc_content)}, structured数据: {bool(doc_structured)}")
+                    else:
+                        logger.warning(f"文档解析失败 {file_path}: {result.error}")
                except Exception as e:
-                    logger.error(f"从文件加载文档失败 {file_path}: {str(e)}")
+                    logger.error(f"从文件加载文档失败 {file_path}: {str(e)}", exc_info=True)

        return source_docs

+    async def _analyze_txt_once(self, content: str, filename: str) -> Optional[Dict[str, Any]]:
+        """
+        Ask the LLM once to turn raw TXT content into a row-preserving table.
+
+        Args:
+            content: Raw text; bytes are decoded as UTF-8, falling back to GBK.
+            filename: File name, used in the prompt and in log messages.
+
+        Returns:
+            {"table": {"columns", "rows"}, "summary"}, or None if nothing usable.
+        """
+        # Normalize bytes to str; only a failed UTF-8 decode triggers the GBK fallback
+        if isinstance(content, bytes):
+            try:
+                content = content.decode('utf-8')
+            except UnicodeDecodeError:
+                content = content.decode('gbk', errors='replace')
+
+        if not content or len(str(content).strip()) < 10:
+            logger.warning(f"TXT 内容过短或为空: {filename}, 类型: {type(content)}")
+            return None
+
+        content = str(content)
+
+        # Cap the text sent to the model to stay within its token budget
+        max_chars = 8000
+        truncated_content = content[:max_chars] if len(content) > max_chars else content
+
+        prompt = f"""你是一个专业的数据提取助手。请从以下文本内容中提取表格数据。
+
+文件名：{filename}
+
+文本内容：
+{truncated_content}
+
+请仔细分析文本中的表格数据，提取所有行。每行是一个完整的数据记录。
+
+请严格按以下 JSON 格式输出，不要添加任何解释：
+{{
+    "table": {{
+        "columns": ["列1", "列2", "列3", ...],
+        "rows": [
+            ["值1", "值2", "值3", ...],
+            ["值1", "值2", "值3", ...]
+        ]
+    }},
+    "summary": "简要说明数据内容"
+}}"""
+
+        messages = [
+            {"role": "system", "content": "你是一个专业的数据提取助手。请严格按JSON格式输出，只输出纯JSON。"},
+            {"role": "user", "content": prompt}
+        ]
+
+        try:
+            logger.info(f"开始 AI 分析 TXT 文件: {filename}, 内容长度: {len(truncated_content)}")
+            response = await self.llm.chat(
+                messages=messages,
+                temperature=0.1,
+                max_tokens=2000
+            )
+
+            ai_content = self.llm.extract_message_content(response)
+            logger.info(f"LLM 返回内容长度: {len(ai_content)}, 内容前200字: {ai_content[:200]}")
+
+            # Parse the JSON payload out of the reply
+            import json
+            import re
+
+            cleaned = ai_content.strip()
+            cleaned = re.sub(r'^```json\s*', '', cleaned, flags=re.MULTILINE)
+            cleaned = re.sub(r'^```\s*', '', cleaned, flags=re.MULTILINE)
+            cleaned = cleaned.strip()
+
+            logger.info(f"清理后内容前200字: {cleaned[:200]}")
+
+            # Take the span from the first '{' to the last '}' of the reply
+            json_start = cleaned.find('{')
+            json_end = cleaned.rfind('}') + 1
+
+            if json_start >= 0 and json_end > json_start:
+                json_str = cleaned[json_start:json_end]
+                logger.info(f"提取的JSON字符串: {json_str[:200]}")
+                try:
+                    result = json.loads(json_str)
+                    # Accept the alternative layouts the model may return
+                    if "table" in result:
+                        table = result["table"]
+                    elif "data" in result:
+                        table = result["data"]
+                    elif "rows" in result:
+                        table = {"columns": result.get("columns", []), "rows": result.get("rows", [])}
+                    else:
+                        # Fall back to treating the root object as the table
+                        table = result
+
+                    if isinstance(table, dict) and ("columns" in table or "rows" in table):
+                        columns = table.get("columns") or []
+                        rows = table.get("rows") or []
+                        logger.info(f"TXT AI 分析成功: {filename}, 列数: {len(columns)}, 行数: {len(rows)}")
+                        return {"table": {"columns": columns, "rows": rows}, "summary": result.get("summary", "")}
+                    else:
+                        logger.warning(f"JSON 中没有找到有效的表格数据: {filename}, result keys: {list(result.keys())}")
+                except json.JSONDecodeError as e:
+                    logger.warning(f"JSON 解析失败: {e}, json_str: {json_str[:200]}")
+
+            logger.warning(f"无法解析 AI 返回的 JSON: {filename}, ai_content: {ai_content[:500]}")
+            return None
+
+        except Exception as e:
+            logger.error(f"AI 分析 TXT 失败: {str(e)}, 文件: {filename}", exc_info=True)
+            return None
+
    async def _extract_field_value(
        self,
        field: TemplateField,
@@ -237,27 +384,25 @@ class TemplateFillService:
        logger.info(f"字段 {field.name} 无法直接从结构化数据提取，使用 LLM...")

        # 构建上下文文本 - 传入字段名，只提取该列数据
-        context_text = self._build_context_text(source_docs, field_name=field.name, max_length=200000)
+        context_text = await self._build_context_text(source_docs, field_name=field.name, max_length=6000)

        # 构建提示词
        hint_text = field.hint if field.hint else f"请提取{field.name}的信息"
        if user_hint:
            hint_text = f"{user_hint}。{hint_text}"

-        prompt = f"""你是一个专业的数据提取专家。请从以下文档内容中提取"{field.name}"字段的所有行数据。
+        prompt = f"""你是一个专业的数据提取专家。请从以下文档内容中提取"{field.name}"字段的值。

-参考文档内容（已提取" {field.name}"列的数据）：
+参考文档内容：
 {context_text}

-请提取上述所有行的" {field.name}"值，存入数组。每一行对应数组中的一个元素。
-如果某行该字段为空，请用空字符串""占位。
+请仔细阅读上述内容，找到所有与"{field.name}"相关的值。
+如果内容是表格格式，请找到对应的列，提取该列所有行的值。
+每一行对应数组中的一个元素，保持行与行的对应关系。
+如果找不到对应的值，返回空数组。

-请严格按照以下 JSON 格式输出，不要添加任何解释：
-{{
-    "values": ["第1行的值", "第2行的值", "第3行的值", ...],
-    "source": "数据来源的文档描述",
-    "confidence": 0.0到1.0之间的置信度
-}}
+请严格按以下JSON格式输出（只输出纯JSON，不要任何解释）：
+{{"values": ["值1", "值2", "值3", ...], "source": "来源说明", "confidence": 0.9}}
 """

        # 调用 LLM
@@ -270,7 +415,7 @@ class TemplateFillService:
            response = await self.llm.chat(
                messages=messages,
                temperature=0.1,
-                max_tokens=50000
+                max_tokens=2000
            )

            content = self.llm.extract_message_content(response)
@@ -280,7 +425,6 @@ class TemplateFillService:
            import re

            extracted_values = []
-            extracted_value = ""
            extracted_source = "LLM生成"
            confidence = 0.5

@@ -368,7 +512,7 @@ class TemplateFillService:
                confidence=0.0
            )

-    def _build_context_text(self, source_docs: List[SourceDocument], field_name: str = None, max_length: int = 8000) -> str:
+    async def _build_context_text(self, source_docs: List[SourceDocument], field_name: str = None, max_length: int = 8000) -> str:
        """
        构建上下文文本

@@ -474,7 +618,54 @@ class TemplateFillService:
                                doc_content += " | ".join(str(cell) for cell in row) + "\n"
                            row_count += 1
            elif doc.content:
-                doc_content = doc.content[:5000]
+                # TXT 文件优先使用 AI 分析后的结构化数据
+                if doc.doc_type == "txt" and doc.ai_structured_data:
+                    # 使用 AI 结构化分析结果
+                    ai_table = doc.ai_structured_data.get("table", {})
+                    columns = ai_table.get("columns", [])
+                    rows = ai_table.get("rows", [])
+
+                    logger.info(f"TXT AI 结构化数据: doc_type={doc.doc_type}, has_ai_data={doc.ai_structured_data is not None}, columns={columns}, rows={len(rows) if rows else 0}")
+
+                    if columns and rows:
+                        doc_content += f"\n【文档: {doc.filename} - AI 结构化表格，共 {len(rows)} 行】\n"
+                        if field_name:
+                            # 查找匹配的列
+                            target_col = None
+                            for col in columns:
+                                if field_name.lower() in str(col).lower() or str(col).lower() in field_name.lower():
+                                    target_col = col
+                                    break
+                            if target_col:
+                                doc_content += f"列名: {target_col}\n"
+                                for row_idx, row in enumerate(rows):
+                                    if isinstance(row, list) and target_col in columns:
+                                        val = row[columns.index(target_col)]
+                                    else:
+                                        val = str(row.get(target_col, "")) if isinstance(row, dict) else ""
+                                    doc_content += f"行{row_idx+1}: {val}\n"
+                                    row_count += 1
+                        else:
+                            # 输出表格
+                            doc_content += " | ".join(str(col) for col in columns) + "\n"
+                            for row in rows:
+                                if isinstance(row, list):
+                                    doc_content += " | ".join(str(cell) for cell in row) + "\n"
+                                else:
+                                    doc_content += " | ".join(str(row.get(col, "")) for col in columns) + "\n"
+                                row_count += 1
+                        logger.info(f"使用 TXT AI 结构化表格: {doc.filename}, {len(columns)} 列, {len(rows)} 行")
+                    else:
+                        # AI 结果无表格，回退到原始内容
+                        doc_content = doc.content[:8000]
+                        logger.warning(f"TXT AI 结果无表格: {doc.filename}, 使用原始内容")
+                elif doc.doc_type == "txt" and doc.content:
+                    # 没有 AI 分析结果，直接使用原始内容
+                    doc_content = doc.content[:8000]
+                    logger.info(f"使用 TXT 原始内容: {doc.filename}, 长度: {len(doc_content)}")
+                else:
+                    # 其他文档类型直接使用内容
+                    doc_content = doc.content[:5000]

            if doc_content:
                doc_context = f"【文档: {doc.filename} ({doc.doc_type})】\n{doc_content}"
@@ -494,6 +685,182 @@ class TemplateFillService:
        logger.info(f"最终上下文长度: {len(result)}")
        return result

+    async def analyze_txt_with_ai(self, content: str, filename: str = "") -> Dict[str, Any]:
+        """
+        Analyze TXT content with the LLM and extract structured data.
+
+        The text is truncated to 15000 characters before it is sent, and
+        Markdown code fences are stripped from the reply before it is parsed.
+        Failures of the LLM call or of JSON parsing are logged and reported as
+        an empty dict rather than raised.
+
+        Args:
+            content: Raw text content.
+            filename: File name, used in the prompt and in log messages.
+
+        Returns:
+            The JSON object parsed from the LLM reply, or {} when the content
+            is too short or no object could be parsed. The prompt asks for:
+            - key_value_pairs: list of {"key", "value"} pairs
+            - tables: list of {"description", "columns", "rows"} tables
+            - numeric_data: list of {"name", "value", "unit"} items
+            - text_summary: short text summary
+        """
+        # Imported ahead of the try block below: a function-level import makes
+        # `json` a local name, so `except json.JSONDecodeError` would itself
+        # raise UnboundLocalError if the LLM call failed before the import ran.
+        import json
+        import re
+
+        # Fewer than 10 non-blank characters: skip the LLM call entirely
+        if not content or len(content.strip()) < 10:
+            logger.warning(f"TXT 内容过短或为空，跳过 AI 分析: {filename}")
+            return {}
+
+        # Truncate long text to stay within the model's token budget
+        max_chars = 15000
+        truncated_content = content[:max_chars] if len(content) > max_chars else content
+
+        system_prompt = """你是一个专业的数据提取专家。请分析提供的文本内容，提取其中包含的结构化信息。
+
+请提取以下类型的数据：
+
+1. **键值对信息**：从文本中提取的名词-值对，如"姓名: 张三"、"年龄: 25"等
+2. **表格数据**：如果文本中包含表格或列表形式的数据，提取出来
+3. **数值数据**：包含数值、金额、百分比、统计数字等
+4. **关键描述**：文本的核心内容摘要
+
+请严格按照以下 JSON 格式输出，不要添加任何 Markdown 标记或解释：
+{
+    "key_value_pairs": [
+        {"key": "键名1", "value": "值1"},
+        {"key": "键名2", "value": "值2"}
+    ],
+    "tables": [
+        {
+            "description": "表格描述",
+            "columns": ["列1", "列2"],
+            "rows": [["值1", "值2"], ["值3", "值4"]]
+        }
+    ],
+    "numeric_data": [
+        {"name": "数据项名称", "value": 123.45, "unit": "单位"}
+    ],
+    "text_summary": "一段简洁的文本摘要，不超过200字"
+}"""
+
+        user_message = f"""请分析以下文本内容，提取结构化数据：
+
+文件名：{filename}
+
+文本内容：
+{truncated_content}
+
+请严格按 JSON 格式输出。"""
+
+        # System message carries the output contract; user message carries the text
+        messages = [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_message}
+        ]
+
+        try:
+            logger.info(f"开始 AI 分析 TXT 文件: {filename}, 内容长度: {len(truncated_content)}")
+            response = await self.llm.chat(
+                messages=messages,
+                temperature=0.1,
+                max_tokens=2000
+            )
+
+            ai_content = self.llm.extract_message_content(response)
+            logger.info(f"AI 返回内容长度: {len(ai_content)}")
+
+            # Strip Markdown code fences the model may add despite the prompt
+            cleaned = ai_content.strip()
+            cleaned = re.sub(r'^```json\s*', '', cleaned, flags=re.MULTILINE)
+            cleaned = re.sub(r'^```\s*', '', cleaned, flags=re.MULTILINE)
+            cleaned = cleaned.strip()
+
+            # Decode the first JSON object in the reply. raw_decode stops at the
+            # end of that object and, unlike counting braces by hand, is not
+            # misled by '{' or '}' characters inside string values.
+            json_start = cleaned.find('{')
+            if json_start >= 0:
+                decoder = json.JSONDecoder()
+                result, _ = decoder.raw_decode(cleaned, json_start)
+                # `or []` keeps a null key_value_pairs from breaking the log line
+                pair_count = len(result.get('key_value_pairs') or [])
+                logger.info(f"TXT AI 分析成功: {filename}, 提取到 {pair_count} 个键值对")
+                return result
+
+            # No '{' anywhere in the reply
+            logger.warning(f"无法从 AI 返回中解析 JSON: {filename}")
+            return {}
+
+        except json.JSONDecodeError as e:
+            # The reply contained a '{' but no well-formed JSON object after it
+            logger.error(f"JSON 解析失败: {str(e)}, 文件: {filename}")
+            return {}
+        except Exception as e:
+            # Best effort: any other failure (e.g. the LLM request) yields {}
+            logger.error(f"AI 分析 TXT 失败: {str(e)}, 文件: {filename}", exc_info=True)
+            return {}
+
+    def _format_structured_for_context(self, structured_data: Dict[str, Any], filename: str) -> str:
+        """
+        Render AI-extracted structured data as plain text for an LLM context.
+
+        Sections are emitted in a fixed order (key/value pairs, tables,
+        numeric data, summary) and each is skipped when empty. Output is
+        capped at 20 pairs, 5 tables of 10 rows each, and 15 numeric items.
+
+        Args:
+            structured_data: Structured data returned by the AI analysis.
+            filename: File name shown in the heading line.
+
+        Returns:
+            The heading and formatted sections joined with newlines.
+        """
+        lines = [f"【文档: {filename} - AI 结构化分析结果】"]
+
+        # Key/value pairs (first 20)
+        pairs = structured_data.get("key_value_pairs", [])
+        if pairs:
+            lines.append("\n## 关键信息：")
+            lines.extend(f"- {p.get('key', '')}: {p.get('value', '')}" for p in pairs[:20])
+
+        # Tables (first 5, 10 rows each); a table lacking columns or rows is skipped
+        table_list = structured_data.get("tables", [])
+        if table_list:
+            lines.append("\n## 表格数据：")
+            for pos, tbl in enumerate(table_list[:5]):
+                header = tbl.get("columns", [])
+                body = tbl.get("rows", [])
+                if not (header and body):
+                    continue
+                title = tbl.get("description", f"表格{pos+1}")
+                lines.append(f"\n### {title}")
+                lines.append("| " + " | ".join(str(c) for c in header) + " |")
+                lines.append("| " + " | ".join(["---"] * len(header)) + " |")
+                lines.extend(
+                    "| " + " | ".join(str(cell) for cell in row) + " |" for row in body[:10]
+                )
+
+        # Numeric data (first 15)
+        numbers = structured_data.get("numeric_data", [])
+        if numbers:
+            lines.append("\n## 数值数据：")
+            lines.extend(
+                f"- {n.get('name', '')}: {n.get('value', '')} {n.get('unit', '')}" for n in numbers[:15]
+            )
+
+        # Text summary
+        summary = structured_data.get("text_summary", "")
+        if summary:
+            lines.append(f"\n## 内容摘要：\n{summary}")
+
+        return "\n".join(lines)
+
    async def get_template_fields_from_file(
        self,
        file_path: str,
@@ -675,7 +1042,7 @@ class TemplateFillService:

    def _extract_values_from_structured_data(self, source_docs: List[SourceDocument], field_name: str) -> List[str]:
        """
-        从结构化数据（Excel rows）中直接提取指定列的值
+        从结构化数据（Excel rows）或 AI 结构化分析结果中直接提取指定列的值

        适用于有 rows 结构的文档数据，无需 LLM 即可提取

@@ -689,6 +1056,18 @@ class TemplateFillService:
        all_values = []

        for doc in source_docs:
+            # 优先从 AI 结构化数据中提取（适用于 TXT 文件）
+            if doc.ai_structured_data:
+                ai_table = doc.ai_structured_data.get("table", {})
+                columns = ai_table.get("columns", [])
+                rows = ai_table.get("rows", [])
+                if columns and rows:
+                    values = self._extract_column_values(rows, columns, field_name)
+                    if values:
+                        all_values.extend(values)
+                        logger.info(f"从 TXT AI 结构化数据提取到 {len(values)} 个值: {doc.filename}")
+                        break
+
            # 尝试从 structured_data 中提取
            structured = doc.structured_data