添加其他格式文档的解析

2026-03-26 23:14:39 +08:00
parent 4bdc3f9707
commit 5bcad4a5fa
9 changed files with 2075 additions and 22 deletions
--- a/backend/app/services/excel_storage_service.py
+++ b/backend/app/services/excel_storage_service.py
@@ -0,0 +1,352 @@
+"""
+Excel 存储服务
+
+将 Excel 数据转换为 MySQL 表结构并存储
+"""
+import logging
+import re
+from datetime import datetime
+from typing import Any, Dict, List, Optional
+
+import pandas as pd
+from sqlalchemy import (
+    Column,
+    DateTime,
+    Float,
+    Integer,
+    String,
+    Text,
+    inspect,
+)
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.core.database.mysql import Base, mysql_db
+
+logger = logging.getLogger(__name__)
+
+
+class ExcelStorageService:
+    """Excel 数据存储服务"""
+
+    def __init__(self):
+        # Keep a reference to the module-level MySQL helper so that every
+        # method goes through ``self.mysql_db``.
+        self.mysql_db = mysql_db
+
+    def _sanitize_table_name(self, filename: str) -> str:
+        """
+        Convert a file name into a legal table name.
+
+        The extension is dropped, every character outside ``[a-zA-Z0-9_]`` is
+        replaced by ``_``, a ``t_`` prefix is added when the name starts with
+        a digit, and the result is capped at 50 characters.
+
+        File names without any ASCII letter or digit (for example purely
+        Chinese names) would otherwise collapse into an empty string or a run
+        of underscores, so unrelated files would end up in the same (or an
+        invalid) table. Such names are replaced by ``t_`` followed by a short,
+        stable digest of the original file name.
+
+        Args:
+            filename: Original file name, with or without an extension.
+
+        Returns:
+            A non-empty table name made of ASCII letters, digits and ``_``.
+        """
+        import hashlib
+
+        # Drop the extension (the text after the last dot), if there is one.
+        name = filename.rsplit('.', 1)[0] if '.' in filename else filename
+
+        # Keep only letters, digits and underscores.
+        name = re.sub(r'[^a-zA-Z0-9_]', '_', name)
+
+        # Nothing meaningful survived: fall back to a digest-based name so
+        # different files still map to different tables.
+        if not name.strip('_'):
+            digest = hashlib.sha256(filename.encode('utf-8')).hexdigest()[:8]
+            return 't_' + digest
+
+        # A table name must not start with a digit.
+        if name[0].isdigit():
+            name = 't_' + name
+
+        # Cap the length.
+        return name[:50]
+
+    def _sanitize_column_name(self, col_name: str) -> str:
+        """
+        Convert a spreadsheet header into a legal column / attribute name.
+
+        Every character outside ``[a-zA-Z0-9_]`` is replaced by ``_`` and the
+        result is capped at 50 characters. A ``col_`` prefix is added when the
+        name starts with a digit or when it would clash with a name the
+        generated model already uses (``id``, ``created_at``, ``updated_at``)
+        or that SQLAlchemy's declarative base reserves (``metadata``).
+
+        Headers without any ASCII letter or digit (for example Chinese
+        headers) would otherwise all collapse into runs of underscores and
+        overwrite each other; they are mapped to ``col_`` plus a short, stable
+        digest of the original header instead. The mapping is deterministic,
+        so repeated calls for the same header agree.
+
+        Args:
+            col_name: Original column header (any object; it is stringified).
+
+        Returns:
+            A non-empty column name made of ASCII letters, digits and ``_``.
+        """
+        import hashlib
+
+        original = str(col_name)
+
+        # Keep only letters, digits and underscores.
+        name = re.sub(r'[^a-zA-Z0-9_]', '_', original)
+
+        # Nothing meaningful survived: use a digest so distinct headers stay
+        # distinct.
+        if not name.strip('_'):
+            digest = hashlib.sha256(original.encode('utf-8')).hexdigest()[:8]
+            return 'col_' + digest
+
+        # Compared case-insensitively because MySQL column names are
+        # case-insensitive.
+        reserved = ('id', 'created_at', 'updated_at', 'metadata')
+        if name[0].isdigit() or name.lower() in reserved:
+            name = 'col_' + name
+
+        # Cap the length.
+        return name[:50]
+
+    def _infer_column_type(self, series: pd.Series) -> str:
+        """
+        Map the dtype of a pandas Series to a coarse column type label.
+
+        Args:
+            series: Column data whose ``dtype`` is inspected.
+
+        Returns:
+            One of ``"INTEGER"``, ``"FLOAT"``, ``"DATETIME"``, ``"BOOLEAN"``
+            or, when none of the dtype checks match, ``"TEXT"``.
+        """
+        # Checked in this order; the first matching predicate wins.
+        dtype_checks = (
+            (pd.api.types.is_integer_dtype, "INTEGER"),
+            (pd.api.types.is_float_dtype, "FLOAT"),
+            (pd.api.types.is_datetime64_any_dtype, "DATETIME"),
+            (pd.api.types.is_bool_dtype, "BOOLEAN"),
+        )
+
+        kind = series.dtype
+        for matches, label in dtype_checks:
+            if matches(kind):
+                return label
+        return "TEXT"
+
+    def _create_table_model(
+        self,
+        table_name: str,
+        columns: List[str],
+        column_types: Dict[str, str]
+    ) -> type:
+        """
+        Build a SQLAlchemy declarative model class at runtime.
+
+        The class gets an auto-increment integer ``id`` primary key, one
+        nullable column per entry in ``columns`` (named through
+        ``_sanitize_column_name``) and the ``created_at`` / ``updated_at``
+        timestamp columns.
+
+        Args:
+            table_name: Name used both as ``__tablename__`` and as class name.
+            columns: Original column names.
+            column_types: Original column name -> type label; missing entries
+                are treated as ``"TEXT"``.
+
+        Returns:
+            The newly created model class (a subclass of ``Base``).
+        """
+        # Type label -> SQLAlchemy type. BOOLEAN is stored in an Integer
+        # column; any unknown label falls back to Text.
+        sa_types = {
+            "INTEGER": Integer,
+            "FLOAT": Float,
+            "DATETIME": DateTime,
+            "BOOLEAN": Integer,
+        }
+
+        namespace = {
+            '__tablename__': table_name,
+            '__table_args__': {'extend_existing': True},
+            # Primary key column
+            'id': Column(Integer, primary_key=True, autoincrement=True),
+        }
+
+        # Data columns
+        for original in columns:
+            attr_name = self._sanitize_column_name(original)
+            label = column_types.get(original, "TEXT")
+            namespace[attr_name] = Column(sa_types.get(label, Text), nullable=True)
+
+        # Bookkeeping columns
+        namespace['created_at'] = Column(DateTime, default=datetime.utcnow)
+        namespace['updated_at'] = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
+
+        return type(table_name, (Base,), namespace)
+
+    async def store_excel(
+        self,
+        file_path: str,
+        filename: str,
+        sheet_name: Optional[str] = None,
+        header_row: int = 0
+    ) -> Dict[str, Any]:
+        """
+        Load one Excel sheet and store its rows in a MySQL table.
+
+        The table name is derived from ``filename``; column names and types
+        are derived from the sheet's header row and pandas dtypes. The table
+        is created if it does not exist yet, then all rows are inserted in a
+        single commit.
+
+        Args:
+            file_path: Path of the Excel file to read.
+            filename: Original file name, used to derive the table name.
+            sheet_name: Sheet to read; pandas' default sheet when omitted.
+            header_row: Zero-based index of the header row.
+
+        Returns:
+            On success ``{"success": True, "table_name", "row_count",
+            "columns"}``; on failure (including an empty sheet)
+            ``{"success": False, "error": <message>}``. Exceptions are logged
+            and reported through the return value, not raised.
+        """
+        table_name = self._sanitize_table_name(filename)
+        results = {
+            "success": True,
+            "table_name": table_name,
+            "row_count": 0,
+            "columns": []
+        }
+
+        try:
+            # Read the sheet.
+            if sheet_name:
+                df = pd.read_excel(file_path, sheet_name=sheet_name, header=header_row)
+            else:
+                df = pd.read_excel(file_path, header=header_row)
+
+            if df.empty:
+                return {"success": False, "error": "Excel 文件为空"}
+
+            # Headers may be numbers or dates; work with strings throughout.
+            df.columns = [str(c) for c in df.columns]
+
+            # Infer the type of every column and compute its sanitized name
+            # once, instead of once per cell.
+            column_types = {}
+            sanitized_names = {}
+            for col in df.columns:
+                col_name = self._sanitize_column_name(col)
+                col_type = self._infer_column_type(df[col])
+                sanitized_names[col] = col_name
+                column_types[col] = col_type
+                results["columns"].append({
+                    "original_name": col,
+                    "sanitized_name": col_name,
+                    "type": col_type
+                })
+
+            model_class = self._create_table_model(table_name, list(df.columns), column_types)
+
+            # Table.create() is a synchronous API. The session used here is
+            # awaited below, i.e. it is an async session, so the DDL has to be
+            # executed through run_sync on the session's connection.
+            async with self.mysql_db.get_session() as session:
+                await session.run_sync(
+                    lambda sync_session: model_class.__table__.create(
+                        sync_session.connection(), checkfirst=True
+                    )
+                )
+
+            # Convert every row into a dict keyed by sanitized column name.
+            records = [
+                {
+                    sanitized_names[col]: self._convert_cell_value(row[col], column_types[col])
+                    for col in df.columns
+                }
+                for _, row in df.iterrows()
+            ]
+
+            # Insert all rows in one transaction.
+            async with self.mysql_db.get_session() as session:
+                session.add_all([model_class(**record) for record in records])
+                await session.commit()
+
+            results["row_count"] = len(records)
+            logger.info(f"Excel 数据已存储到 MySQL 表 {table_name}，共 {len(records)} 行")
+
+            return results
+
+        except Exception as e:
+            # Best-effort API: report the failure to the caller instead of
+            # raising, but keep the traceback in the log.
+            logger.error(f"存储 Excel 到 MySQL 失败: {str(e)}", exc_info=True)
+            return {"success": False, "error": str(e)}
+
+    @staticmethod
+    def _convert_cell_value(value: Any, col_type: str) -> Any:
+        """Convert one spreadsheet cell to a value matching its column type."""
+        # NaN / NaT / None all become SQL NULL.
+        if pd.isna(value):
+            return None
+
+        if col_type == "INTEGER":
+            try:
+                return int(value)
+            except (ValueError, TypeError):
+                return None
+
+        if col_type == "FLOAT":
+            try:
+                return float(value)
+            except (ValueError, TypeError):
+                return None
+
+        if col_type == "BOOLEAN":
+            # BOOLEAN columns are created as Integer columns, so store 0/1
+            # rather than the strings "True"/"False".
+            return int(bool(value))
+
+        if col_type == "DATETIME":
+            # DateTime columns need datetime objects, not their string form.
+            if isinstance(value, pd.Timestamp):
+                return value.to_pydatetime()
+            return value
+
+        return str(value)
+
+    async def query_table(
+        self,
+        table_name: str,
+        columns: Optional[List[str]] = None,
+        where: Optional[str] = None,
+        limit: int = 100
+    ) -> List[Dict[str, Any]]:
+        """
+        Run a SELECT against one table.
+
+        Args:
+            table_name: Table to query.
+            columns: Columns to select; all columns (``*``) when omitted or
+                empty.
+            where: Optional raw SQL condition appended after ``WHERE``.
+            limit: Maximum number of rows to return.
+
+        Returns:
+            Whatever ``mysql_db.execute_query`` returns for the statement, or
+            an empty list when building or running the query fails (the error
+            is logged).
+        """
+        def quote(identifier: Any) -> str:
+            # MySQL identifier quoting: wrap in backticks and double any
+            # embedded backtick so the name cannot terminate the quoting.
+            return "`" + str(identifier).replace("`", "``") + "`"
+
+        try:
+            select_list = ", ".join(quote(c) for c in columns) if columns else "*"
+            sql = f"SELECT {select_list} FROM {quote(table_name)}"
+            if where:
+                # NOTE(review): ``where`` is inserted verbatim and is therefore
+                # an injection vector if it ever carries untrusted input;
+                # callers must only pass trusted conditions.
+                sql += f" WHERE {where}"
+            # Coerce to int so only a number can reach the statement.
+            sql += f" LIMIT {max(0, int(limit))}"
+
+            results = await self.mysql_db.execute_query(sql)
+            return results
+
+        except Exception as e:
+            logger.error(f"查询表失败: {str(e)}")
+            return []
+
+    async def get_table_schema(self, table_name: str) -> Optional[List[Dict[str, Any]]]:
+        """
+        Look up the column definitions of a table in INFORMATION_SCHEMA.
+
+        Args:
+            table_name: Table whose columns are listed (current database).
+
+        Returns:
+            The result of ``mysql_db.execute_query`` for the lookup (column
+            name, data type, nullability, key and comment, ordered by column
+            position), or ``None`` when the lookup fails (the error is logged).
+        """
+        try:
+            # The name ends up inside a single-quoted SQL string literal, so
+            # escape backslashes and single quotes to keep it inside.
+            safe_name = table_name.replace("\\", "\\\\").replace("'", "''")
+            sql = f"""
+                SELECT COLUMN_NAME, DATA_TYPE, IS_NULLABLE, COLUMN_KEY, COLUMN_COMMENT
+                FROM INFORMATION_SCHEMA.COLUMNS
+                WHERE TABLE_SCHEMA = DATABASE() AND TABLE_NAME = '{safe_name}'
+                ORDER BY ORDINAL_POSITION
+            """
+            results = await self.mysql_db.execute_query(sql)
+            return results
+
+        except Exception as e:
+            logger.error(f"获取表结构失败: {str(e)}")
+            return None
+
+    async def delete_table(self, table_name: str) -> bool:
+        """
+        Drop a table if it exists.
+
+        Args:
+            table_name: Name of the table to drop.
+
+        Returns:
+            ``True`` when the DROP statement was executed, ``False`` when the
+            name was rejected or the statement failed (the error is logged).
+        """
+        try:
+            # Safety check: only names containing an underscore may be
+            # dropped. A name starting with "t_" necessarily contains one, so
+            # a separate prefix test is not needed.
+            # NOTE(review): tables created from file names without an
+            # underscore (e.g. "sales") are rejected by this rule as well —
+            # confirm that this is intended.
+            if not table_name or '_' not in table_name:
+                raise ValueError("不允许删除此表")
+
+            # Double embedded backticks so the name cannot break out of the
+            # quoted identifier.
+            quoted_name = "`" + table_name.replace("`", "``") + "`"
+            sql = f"DROP TABLE IF EXISTS {quoted_name}"
+            await self.mysql_db.execute_raw_sql(sql)
+            logger.info(f"表 {table_name} 已删除")
+            return True
+
+        except Exception as e:
+            logger.error(f"删除表失败: {str(e)}")
+            return False
+
+    async def list_tables(self) -> List[str]:
+        """
+        List the base tables of the current database.
+
+        Returns:
+            The ``TABLE_NAME`` value of every returned row, or an empty list
+            when the lookup fails (the error is logged).
+        """
+        sql = """
+                SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES
+                WHERE TABLE_SCHEMA = DATABASE() AND TABLE_TYPE = 'BASE TABLE'
+            """
+        try:
+            rows = await self.mysql_db.execute_query(sql)
+            names = []
+            for entry in rows:
+                names.append(entry['TABLE_NAME'])
+            return names
+        except Exception as e:
+            logger.error(f"列出表失败: {str(e)}")
+            return []
+
+
+# ==================== Global singleton ====================
+# Module-level instance created at import time.
+
+excel_storage_service = ExcelStorageService()
--- a/backend/app/services/prompt_service.py
+++ b/backend/app/services/prompt_service.py
@@ -0,0 +1,444 @@
+"""
+提示词工程服务
+
+管理和优化与大模型交互的提示词
+"""
+import json
+import logging
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import Any, Dict, List, Optional
+
+logger = logging.getLogger(__name__)
+
+
+class PromptType(Enum):
+    """Kinds of prompt handled by the prompt service."""
+    DOCUMENT_PARSING = "document_parsing"       # document parsing
+    FIELD_EXTRACTION = "field_extraction"       # field extraction
+    TABLE_FILLING = "table_filling"             # table filling
+    QUERY_GENERATION = "query_generation"       # query generation
+    TEXT_SUMMARY = "text_summary"               # text summary
+    INTENT_CLASSIFICATION = "intent_classification"  # intent classification
+    DATA_CLASSIFICATION = "data_classification" # data classification
+
+
+@dataclass
+class PromptTemplate:
+    """A system prompt plus a user-message template with ``{name}`` placeholders."""
+    name: str
+    type: PromptType
+    system_prompt: str
+    user_template: str
+    examples: List[Dict[str, str]] = field(default_factory=list)  # few-shot examples
+    rules: List[str] = field(default_factory=list)  # extra output rules
+
+    def format(
+        self,
+        context: Dict[str, Any],
+        user_input: Optional[str] = None
+    ) -> List[Dict[str, str]]:
+        """
+        Render the template into a chat message list.
+
+        The system message is ``system_prompt`` followed by the rules and the
+        few-shot examples (when present); the user message is the filled-in
+        ``user_template``.
+
+        Args:
+            context: Values for the ``{name}`` placeholders of the user template.
+            user_input: Optional free-text request appended to the user message.
+
+        Returns:
+            ``[{"role": "system", ...}, {"role": "user", ...}]``.
+        """
+        parts = [self.system_prompt]
+
+        # Output rules
+        if self.rules:
+            parts.append("\n\n【输出规则】\n" + "\n".join([f"- {rule}" for rule in self.rules]))
+
+        # Few-shot examples
+        if self.examples:
+            parts.append("\n\n【示例】\n")
+            for i, ex in enumerate(self.examples):
+                parts.append(f"\n示例 {i+1}:\n")
+                parts.append(f"输入: {ex.get('input', '')}\n")
+                parts.append(f"输出: {ex.get('output', '')}\n")
+
+        return [
+            {"role": "system", "content": "".join(parts)},
+            {"role": "user", "content": self._format_user_template(context, user_input)},
+        ]
+
+    def _format_user_template(
+        self,
+        context: Dict[str, Any],
+        user_input: Optional[str]
+    ) -> str:
+        """
+        Fill the ``{name}`` placeholders of the user template.
+
+        Substitution is done in a single pass over the template, so text that
+        was inserted for one placeholder is never scanned again. This matters
+        because inserted values are often document content, which may itself
+        contain text such as ``{hint}``. Placeholders without a matching
+        context key are left untouched; dict and list values are rendered as
+        indented JSON, everything else via ``str()``.
+        """
+        import re
+
+        # Keys are matched by their string form, exactly as they would appear
+        # between the braces.
+        lookup = {str(key): value for key, value in context.items()}
+
+        def substitute(match):
+            key = match.group(1)
+            if key not in lookup:
+                return match.group(0)
+            value = lookup[key]
+            if isinstance(value, (dict, list)):
+                return json.dumps(value, ensure_ascii=False, indent=2)
+            return str(value)
+
+        content = re.sub(r"\{([^{}]+)\}", substitute, self.user_template)
+
+        # Append the free-text user request, if any.
+        if user_input:
+            content += f"\n\n【用户需求】\n{user_input}"
+
+        return content
+
+
+class PromptEngineeringService:
+    """提示词工程服务"""
+
+    def __init__(self):
+        # One template per prompt type, filled in by _init_templates().
+        self.templates: Dict[PromptType, PromptTemplate] = {}
+        self._init_templates()
+
+    def _init_templates(self):
+        """
+        Register the built-in template for every ``PromptType``.
+
+        The system prompts are plain strings that are sent verbatim (they are
+        never passed through ``str.format``), so JSON braces in them are
+        written as single ``{`` / ``}``. Only the user templates carry
+        ``{name}`` placeholders.
+        """
+
+        # ==================== Document parsing ====================
+        self.templates[PromptType.DOCUMENT_PARSING] = PromptTemplate(
+            name="文档解析",
+            type=PromptType.DOCUMENT_PARSING,
+            system_prompt="""你是一个专业的文档解析专家。你的任务是从各类文档（Word、Excel、Markdown、纯文本）中提取关键信息。
+
+请严格按照JSON格式输出解析结果：
+{
+    "success": true/false,
+    "document_type": "文档类型",
+    "key_fields": {"字段名": "字段值", ...},
+    "summary": "文档摘要（100字内）",
+    "structured_data": {...}  // 提取的表格或其他结构化数据
+}
+
+重要规则：
+- 只提取明确存在的信息，不要猜测
+- 如果是表格数据，请以数组格式输出
+- 日期请使用 YYYY-MM-DD 格式
+- 金额请使用数字格式
+- 如果无法提取某个字段，设置为 null""",
+            user_template="""请解析以下文档内容：
+
+=== 文档开始 ===
+{content}
+=== 文档结束 ===
+
+请提取文档中的关键信息。""",
+            examples=[
+                {
+                    "input": "合同金额：100万元\n签订日期：2024年1月15日\n甲方：张三\n乙方：某某公司",
+                    "output": '{"success": true, "document_type": "合同", "key_fields": {"金额": 1000000, "日期": "2024-01-15", "甲方": "张三", "乙方": "某某公司"}, "summary": "甲乙双方签订的金额为100万元的合同", "structured_data": null}'
+                }
+            ],
+            rules=[
+                "只输出JSON，不要添加任何解释",
+                "使用严格的JSON格式"
+            ]
+        )
+
+        # ==================== Field extraction ====================
+        self.templates[PromptType.FIELD_EXTRACTION] = PromptTemplate(
+            name="字段提取",
+            type=PromptType.FIELD_EXTRACTION,
+            system_prompt="""你是一个专业的数据提取专家。你的任务是从文档内容中提取指定字段的信息。
+
+请严格按照以下JSON格式输出：
+{
+    "value": "提取到的值，找不到则为空字符串",
+    "source": "数据来源描述",
+    "confidence": 0.0到1.0之间的置信度
+}
+
+重要规则：
+- 严格按字段名称匹配，不要提取无关信息
+- 置信度反映你对提取结果的信心程度
+- 如果字段不存在或无法确定，value设为空字符串，confidence设为0.0
+- value必须是实际值，不能是"未找到"之类的描述""",
+            user_template="""请从以下文档内容中提取指定字段的信息。
+
+【需要提取的字段】
+字段名称：{field_name}
+字段类型：{field_type}
+是否必填：{required}
+
+【用户提示】
+{hint}
+
+【文档内容】
+{context}
+
+请提取字段值。""",
+            examples=[
+                {
+                    "input": "文档内容：姓名张三，电话13800138000，邮箱zhangsan@example.com",
+                    "output": '{"value": "张三", "source": "文档第1行", "confidence": 1.0}'
+                }
+            ],
+            rules=[
+                "只输出JSON，不要添加任何解释"
+            ]
+        )
+
+        # ==================== Table filling ====================
+        self.templates[PromptType.TABLE_FILLING] = PromptTemplate(
+            name="表格填写",
+            type=PromptType.TABLE_FILLING,
+            system_prompt="""你是一个专业的表格填写助手。你的任务是根据提供的文档内容，填写表格模板中的字段。
+
+请严格按照以下JSON格式输出：
+{
+    "filled_data": {"字段1": "值1", "字段2": "值2", ...},
+    "fill_details": [
+        {"field": "字段1", "value": "值1", "source": "来源", "confidence": 0.95},
+        ...
+    ]
+}
+
+重要规则：
+- 只填写模板中存在的字段
+- 值必须来自提供的文档内容，不要编造
+- 如果某个字段在文档中找不到对应值，设为空字符串
+- fill_details 中记录每个字段的详细信息""",
+            user_template="""请根据以下文档内容，填写表格模板。
+
+【表格模板字段】
+{fields}
+
+【用户需求】
+{hint}
+
+【参考文档内容】
+{context}
+
+请填写表格。""",
+            examples=[
+                {
+                    "input": "字段：姓名、电话\n文档：张三，电话是13800138000",
+                    "output": '{"filled_data": {"姓名": "张三", "电话": "13800138000"}, "fill_details": [{"field": "姓名", "value": "张三", "source": "文档第1行", "confidence": 1.0}, {"field": "电话", "value": "13800138000", "source": "文档第1行", "confidence": 1.0}]}'
+                }
+            ],
+            rules=[
+                "只输出JSON，不要添加任何解释"
+            ]
+        )
+
+        # ==================== Query generation ====================
+        self.templates[PromptType.QUERY_GENERATION] = PromptTemplate(
+            name="查询生成",
+            type=PromptType.QUERY_GENERATION,
+            system_prompt="""你是一个SQL查询生成专家。你的任务是根据用户的自然语言需求，生成相应的数据库查询语句。
+
+请严格按照以下JSON格式输出：
+{
+    "sql_query": "生成的SQL查询语句",
+    "explanation": "查询逻辑说明"
+}
+
+重要规则：
+- 只生成 SELECT 查询语句，不要生成 INSERT/UPDATE/DELETE
+- 必须包含 WHERE 条件限制查询范围
+- 表名和字段名使用反引号包裹
+- 确保SQL语法正确
+- 如果无法生成有效的查询，sql_query设为空字符串""",
+            user_template="""根据以下信息生成查询语句。
+
+【数据库表结构】
+{table_schema}
+
+【RAG检索到的上下文】
+{rag_context}
+
+【用户查询需求】
+{user_intent}
+
+请生成SQL查询。""",
+            examples=[
+                {
+                    "input": "表：orders(订单号, 金额, 日期, 客户)\n需求：查询2024年1月销售额超过10000的订单",
+                    # The SQL contains single quotes, so they are escaped as
+                    # \' inside this single-quoted Python literal.
+                    "output": '{"sql_query": "SELECT * FROM `orders` WHERE `日期` >= \'2024-01-01\' AND `日期` < \'2024-02-01\' AND `金额` > 10000", "explanation": "筛选2024年1月销售额超过10000的订单"}'
+                }
+            ],
+            rules=[
+                "只输出JSON，不要添加任何解释",
+                "禁止生成 DROP、DELETE、TRUNCATE 等危险操作"
+            ]
+        )
+
+        # ==================== Text summary ====================
+        self.templates[PromptType.TEXT_SUMMARY] = PromptTemplate(
+            name="文本摘要",
+            type=PromptType.TEXT_SUMMARY,
+            system_prompt="""你是一个专业的文本摘要专家。你的任务是对长文档进行压缩，提取关键信息。
+
+请严格按照以下JSON格式输出：
+{
+    "summary": "摘要内容（不超过200字）",
+    "key_points": ["要点1", "要点2", "要点3"],
+    "keywords": ["关键词1", "关键词2", "关键词3"]
+}""",
+            user_template="""请为以下文档生成摘要：
+
+=== 文档开始 ===
+{content}
+=== 文档结束 ===
+
+生成简明摘要。""",
+            rules=[
+                "只输出JSON，不要添加任何解释"
+            ]
+        )
+
+        # ==================== Intent classification ====================
+        self.templates[PromptType.INTENT_CLASSIFICATION] = PromptTemplate(
+            name="意图分类",
+            type=PromptType.INTENT_CLASSIFICATION,
+            system_prompt="""你是一个意图分类专家。你的任务是分析用户的自然语言输入，判断用户的真实意图。
+
+支持的意图类型：
+- upload: 上传文档
+- parse: 解析文档
+- query: 查询数据
+- fill: 填写表格
+- export: 导出数据
+- analyze: 分析数据
+- other: 其他/未知
+
+请严格按照以下JSON格式输出：
+{
+    "intent": "意图类型",
+    "confidence": 0.0到1.0之间的置信度,
+    "entities": {"实体名": "实体值", ...},  // 识别出的关键实体
+    "suggestion": "建议的下一步操作"
+}""",
+            user_template="""请分析以下用户输入，判断其意图：
+
+【用户输入】
+{user_input}
+
+请分类。""",
+            rules=[
+                "只输出JSON，不要添加任何解释"
+            ]
+        )
+
+        # ==================== Data classification ====================
+        self.templates[PromptType.DATA_CLASSIFICATION] = PromptTemplate(
+            name="数据分类",
+            type=PromptType.DATA_CLASSIFICATION,
+            system_prompt="""你是一个数据分类专家。你的任务是判断数据的类型和格式。
+
+请严格按照以下JSON格式输出：
+{
+    "data_type": "text/number/date/email/phone/url/amount/other",
+    "format": "具体格式描述",
+    "is_valid": true/false,
+    "normalized_value": "规范化后的值"
+}""",
+            user_template="""请分析以下数据的类型和格式：
+
+【数据】
+{value}
+
+【期望类型（如果有）】
+{expected_type}
+
+请分类。""",
+            rules=[
+                "只输出JSON，不要添加任何解释"
+            ]
+        )
+
+    def get_prompt(
+        self,
+        type: PromptType,
+        context: Dict[str, Any],
+        user_input: Optional[str] = None
+    ) -> List[Dict[str, str]]:
+        """
+        Render the template registered for ``type`` into chat messages.
+
+        Args:
+            type: Prompt type whose template is used.
+            context: Placeholder values handed to the template.
+            user_input: Optional free-text user request.
+
+        Returns:
+            The template's formatted message list. When no template is
+            registered for ``type`` a warning is logged and a single user
+            message containing ``str(context)`` is returned instead.
+        """
+        selected = self.templates.get(type)
+        if selected:
+            return selected.format(context, user_input)
+
+        logger.warning(f"未找到提示词模板: {type}")
+        fallback = {"role": "user", "content": str(context)}
+        return [fallback]
+
+    def get_template(self, type: PromptType) -> Optional[PromptTemplate]:
+        """Return the template registered for ``type``, or ``None`` if there is none."""
+        try:
+            return self.templates[type]
+        except KeyError:
+            return None
+
+    def add_template(self, template: PromptTemplate):
+        """Register ``template`` under its own ``type``, replacing any existing entry."""
+        key = template.type
+        self.templates[key] = template
+        logger.info(f"已添加提示词模板: {template.name}")
+
+    def update_template(self, type: PromptType, **kwargs):
+        """
+        Overwrite attributes of the template registered for ``type``.
+
+        Keyword arguments that do not name an existing attribute are ignored,
+        and nothing happens when no template is registered for ``type``.
+        """
+        target = self.templates.get(type)
+        if not target:
+            return
+
+        for attr_name in kwargs:
+            if not hasattr(target, attr_name):
+                continue
+            setattr(target, attr_name, kwargs[attr_name])
+
+    def optimize_prompt(
+        self,
+        type: PromptType,
+        feedback: str,
+        iteration: int = 1
+    ) -> List[Dict[str, str]]:
+        """
+        Add output rules to a template based on feedback keywords.
+
+        Each known keyword found in ``feedback`` adds one rule to the
+        registered template (the template is modified in place). A rule that
+        is already present is not added again, so repeating the same feedback
+        does not pile up duplicate rules.
+
+        Args:
+            type: Prompt type whose template is adjusted.
+            feedback: Free-text feedback that is searched for known keywords.
+            iteration: Iteration counter; currently not used.
+
+        Returns:
+            The template formatted with an empty context, or an empty list
+            when no template is registered for ``type``.
+        """
+        template = self.templates.get(type)
+        if not template:
+            return []
+
+        # Simple strategy: feedback keyword -> rule to add.
+        optimization_rules = {
+            "准确率低": "提高要求，明确指出必须从原文提取，不要猜测",
+            "格式错误": "强调JSON格式要求，提供更详细的格式示例",
+            "遗漏信息": "添加提取更多细节的要求",
+        }
+
+        # Treat missing feedback as "no keywords" rather than failing.
+        feedback = feedback or ""
+
+        for keyword, rule in optimization_rules.items():
+            if keyword in feedback and rule not in template.rules:
+                template.rules.append(rule)
+
+        return template.format({}, None)
+
+
+# ==================== Global singleton ====================
+# Module-level instance created at import time.
+
+prompt_service = PromptEngineeringService()
--- a/backend/app/services/template_fill_service.py
+++ b/backend/app/services/template_fill_service.py
@@ -0,0 +1,307 @@
+"""
+表格模板填写服务
+
+从非结构化文档中检索信息并填写到表格模板
+"""
+import logging
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional
+
+from app.core.database import mongodb
+from app.services.rag_service import rag_service
+from app.services.llm_service import llm_service
+from app.services.excel_storage_service import excel_storage_service
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class TemplateField:
+    """One field of a table template."""
+    cell: str  # cell position, e.g. "A1"
+    name: str  # field name
+    field_type: str = "text"  # field type: text/number/date
+    required: bool = True
+
+
+@dataclass
+class FillResult:
+    """Outcome of filling a single field."""
+    field: str
+    value: Any
+    source: str  # where the value came from (source document)
+    confidence: float = 1.0  # confidence score
+
+
+class TemplateFillService:
+    """表格填写服务"""
+
+    def __init__(self):
+        # Reuse the module-level LLM and RAG service instances.
+        self.llm = llm_service
+        self.rag = rag_service
+
+    async def fill_template(
+        self,
+        template_fields: List[TemplateField],
+        source_doc_ids: Optional[List[str]] = None,
+        user_hint: Optional[str] = None
+    ) -> Dict[str, Any]:
+        """
+        Fill every template field with a value extracted from retrieved documents.
+
+        For each field the relevant context is retrieved first. If something
+        was found, the value is extracted from it; otherwise the field is
+        recorded as empty with confidence 0.0. A failure on one field is
+        logged and recorded as a placeholder value; it does not stop the
+        remaining fields.
+
+        Args:
+            template_fields: Fields to fill.
+            source_doc_ids: Source document IDs. NOTE(review): not used by
+                this method at the moment; retrieval is not restricted by it.
+            user_hint: Optional hint that is passed on to retrieval and
+                extraction.
+
+        Returns:
+            ``{"success": True, "filled_data": {name: value},
+            "fill_details": [per-field dicts]}``.
+        """
+        values_by_field = {}
+        details = []
+
+        for field in template_fields:
+            try:
+                # 1. Retrieve context for this field.
+                snippets = await self._retrieve_context(field.name, user_hint)
+
+                if snippets:
+                    # 2. Extract the value from the retrieved context.
+                    outcome = await self._extract_field_value(
+                        field=field,
+                        rag_context=snippets,
+                        user_hint=user_hint
+                    )
+                else:
+                    # Nothing retrieved: record an empty value.
+                    outcome = FillResult(
+                        field=field.name,
+                        value="",
+                        source="未找到相关数据",
+                        confidence=0.0
+                    )
+
+                # 3. Record the outcome.
+                values_by_field[field.name] = outcome.value
+                entry = {
+                    "field": field.name,
+                    "cell": field.cell,
+                    "value": outcome.value,
+                    "source": outcome.source,
+                    "confidence": outcome.confidence
+                }
+                details.append(entry)
+
+                logger.info(f"字段 {field.name} 填写完成: {outcome.value}")
+
+            except Exception as e:
+                logger.error(f"填写字段 {field.name} 失败: {str(e)}")
+                values_by_field[field.name] = f"[提取失败: {str(e)}]"
+                failure_entry = {
+                    "field": field.name,
+                    "cell": field.cell,
+                    "value": "[提取失败]",
+                    "source": "error",
+                    "confidence": 0.0
+                }
+                details.append(failure_entry)
+
+        return {
+            "success": True,
+            "filled_data": values_by_field,
+            "fill_details": details
+        }
+
+    async def _retrieve_context(
+        self,
+        field_name: str,
+        user_hint: Optional[str] = None
+    ) -> List[Dict[str, Any]]:
+        """
+        Retrieve context for a field from the RAG service.
+
+        Args:
+            field_name: Field name; used as the query text.
+            user_hint: Optional hint that is put in front of the field name.
+
+        Returns:
+            Whatever ``self.rag.retrieve`` returns for the query with
+            ``top_k=5``.
+        """
+        # Query text: "<hint> <field name>" when a hint is given, else the
+        # field name alone.
+        search_text = f"{user_hint} {field_name}" if user_hint else field_name
+        return self.rag.retrieve(query=search_text, top_k=5)
+
+    async def _extract_field_value(
+        self,
+        field: TemplateField,
+        rag_context: List[Dict[str, Any]],
+        user_hint: Optional[str] = None
+    ) -> FillResult:
+        """
+        Ask the LLM to extract one field's value from the retrieved context.
+
+        Args:
+            field: Definition of the field to extract.
+            rag_context: Retrieved snippets; each must have a ``content`` key.
+            user_hint: Optional hint that is included in the prompt.
+
+        Returns:
+            A ``FillResult``. When the reply contains a JSON object, its
+            ``value`` / ``source`` / ``confidence`` are used, with the
+            confidence coerced to a float in ``[0.0, 1.0]`` and JSON ``null``
+            treated as missing. When the reply contains no JSON object, the
+            stripped reply text is used as the value with confidence 0.5.
+            When the LLM call or the JSON parsing fails, the value is empty
+            and the confidence is 0.0.
+
+        Raises:
+            KeyError: If a snippet in ``rag_context`` has no ``content`` key
+                (raised while building the prompt, before the LLM call).
+        """
+        import json
+        import re
+
+        # Concatenate the snippets into one numbered context text.
+        context_text = "\n\n".join([
+            f"【文档 {i+1}】\n{doc['content']}"
+            for i, doc in enumerate(rag_context)
+        ])
+
+        # Build the prompt.
+        prompt = f"""你是一个数据提取专家。请根据以下文档内容，提取指定字段的信息。
+
+需要提取的字段：
+- 字段名称：{field.name}
+- 字段类型：{field.field_type}
+- 是否必填：{'是' if field.required else '否'}
+
+{'用户提示：' + user_hint if user_hint else ''}
+
+参考文档内容：
+{context_text}
+
+请严格按照以下 JSON 格式输出，不要添加任何解释：
+{{
+    "value": "提取到的值，如果没有找到则填写空字符串",
+    "source": "数据来源的文档描述",
+    "confidence": 0.0到1.0之间的置信度
+}}
+"""
+
+        messages = [
+            {"role": "system", "content": "你是一个专业的数据提取助手。请严格按JSON格式输出。"},
+            {"role": "user", "content": prompt}
+        ]
+
+        try:
+            response = await self.llm.chat(
+                messages=messages,
+                temperature=0.1,
+                max_tokens=500
+            )
+
+            content = self.llm.extract_message_content(response)
+
+            # Take everything from the first "{" to the last "}" as the JSON
+            # object.
+            json_match = re.search(r'\{[\s\S]*\}', content)
+            if not json_match:
+                # No JSON object in the reply: fall back to the raw text.
+                return FillResult(
+                    field=field.name,
+                    value=content.strip(),
+                    source="直接提取",
+                    confidence=0.5
+                )
+
+            result = json.loads(json_match.group())
+
+            # The model may answer null for value/source; treat it as missing.
+            value = result.get("value")
+            if value is None:
+                value = ""
+            source = result.get("source") or "LLM生成"
+
+            # The model may return the confidence as a string or outside
+            # [0, 1]; normalize it so callers always get a usable float.
+            try:
+                confidence = float(result.get("confidence", 0.5))
+            except (TypeError, ValueError):
+                confidence = 0.5
+            confidence = min(1.0, max(0.0, confidence))
+
+            return FillResult(
+                field=field.name,
+                value=value,
+                source=source,
+                confidence=confidence
+            )
+
+        except Exception as e:
+            logger.error(f"LLM 提取失败: {str(e)}")
+            return FillResult(
+                field=field.name,
+                value="",
+                source=f"提取失败: {str(e)}",
+                confidence=0.0
+            )
+
+    async def get_template_fields_from_file(
+        self,
+        file_path: str,
+        file_type: str = "xlsx"
+    ) -> List[TemplateField]:
+        """
+        Derive field definitions from a template file.
+
+        For ``xlsx`` / ``xls`` every column header becomes a field whose type
+        is inferred from the first rows. For ``docx`` every non-empty table
+        cell becomes a text field. Other file types yield no fields.
+
+        Args:
+            file_path: Path of the template file.
+            file_type: ``"xlsx"``, ``"xls"`` or ``"docx"``.
+
+        Returns:
+            The fields collected so far; an error while reading is logged and
+            does not raise.
+        """
+        collected = []
+
+        try:
+            if file_type in ["xlsx", "xls"]:
+                # Read the header plus a few rows for type inference.
+                import pandas as pd
+                df = pd.read_excel(file_path, nrows=5)
+
+                for position, header in enumerate(df.columns):
+                    # Column letter for this position (A, B, C, ...).
+                    column_letter = self._column_to_cell(position)
+                    inferred = self._infer_field_type(df[header])
+
+                    collected.append(TemplateField(
+                        cell=column_letter,
+                        name=str(header),
+                        field_type=inferred,
+                        required=True
+                    ))
+
+            elif file_type == "docx":
+                # Walk every cell of every table in the Word document.
+                from docx import Document
+                doc = Document(file_path)
+
+                for table in doc.tables:
+                    for row in table.rows:
+                        for col_idx, cell in enumerate(row.cells):
+                            label = cell.text.strip()
+                            if not label:
+                                continue
+                            collected.append(TemplateField(
+                                cell=self._column_to_cell(col_idx),
+                                name=label,
+                                field_type="text",
+                                required=True
+                            ))
+
+        except Exception as e:
+            logger.error(f"提取模板字段失败: {str(e)}")
+
+        return collected
+
+    def _column_to_cell(self, col_idx: int) -> str:
+        """Convert a zero-based column index to spreadsheet letters (0 -> A, 25 -> Z, 26 -> AA)."""
+        letters = []
+        remaining = col_idx
+        while remaining >= 0:
+            quotient, offset = divmod(remaining, 26)
+            letters.append(chr(ord("A") + offset))
+            # Subtract one because the letter system has no zero digit.
+            remaining = quotient - 1
+        return "".join(reversed(letters))
+
+    def _infer_field_type(self, series) -> str:
+        """Classify a pandas Series as ``"number"``, ``"date"`` or ``"text"`` by its dtype."""
+        import pandas as pd
+
+        dtype_api = pd.api.types
+        if dtype_api.is_numeric_dtype(series):
+            return "number"
+        if dtype_api.is_datetime64_any_dtype(series):
+            return "date"
+        return "text"
+
+
+# ==================== Global singleton ====================
+# Module-level instance created at import time.
+
+template_fill_service = TemplateFillService()