From 718f864926c1de05e3be533ad12d4b3be36dabdd Mon Sep 17 00:00:00 2001
From: dj <431634905@qq.com>
Date: Thu, 9 Apr 2026 20:56:38 +0800
Subject: [PATCH 01/13] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E8=AF=BB=E5=8F=96excel?=
=?UTF-8?q?=E8=A1=A8=E6=97=B6=E5=AD=98=E5=9C=A8=E6=95=B0=E5=AD=97=E6=97=B6?=
=?UTF-8?q?=E6=B5=AE=E7=82=B9=E5=8C=B9=E9=85=8D=E7=94=9F=E6=88=90=E4=B8=8D?=
=?UTF-8?q?=E4=B8=80=E8=87=B4=E9=97=AE=E9=A2=98?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
backend/app/services/template_fill_service.py | 68 +++++++++++++++----
1 file changed, 55 insertions(+), 13 deletions(-)
diff --git a/backend/app/services/template_fill_service.py b/backend/app/services/template_fill_service.py
index 71976a6..f564ab9 100644
--- a/backend/app/services/template_fill_service.py
+++ b/backend/app/services/template_fill_service.py
@@ -757,10 +757,52 @@ class TemplateFillService:
val = row[columns.index(target_col)]
else:
val = ""
- values.append(str(val) if val is not None else "")
+ values.append(self._format_value(val))
return values
+ def _format_value(self, val: Any) -> str:
+ """
+ 格式化值为字符串,保持原始格式
+
+ - 如果是浮点数但实际上等于整数,返回整数格式(如 3.0 -> "3")
+ - 如果是浮点数且有小数部分,保留小数(如 3.5 -> "3.5")
+ - 如果是整数,直接返回(如 3 -> "3")
+ - 其他类型直接转为字符串
+
+ Args:
+ val: 原始值
+
+ Returns:
+ 格式化后的字符串
+ """
+ if val is None:
+ return ""
+
+ # 如果已经是字符串
+ if isinstance(val, str):
+ return val.strip()
+
+ # 如果是布尔值
+ if isinstance(val, bool):
+ return "true" if val else "false"
+
+ # 如果是数字
+ if isinstance(val, (int, float)):
+ # 检查是否是浮点数但等于整数
+ if isinstance(val, float):
+ # 检查是否是小数部分为0
+ if val == int(val):
+ return str(int(val))
+ else:
+ # 去除尾部多余的0,但保留必要的小数位
+ formatted = f"{val:.10f}".rstrip('0').rstrip('.')
+ return formatted
+ else:
+ return str(val)
+
+ return str(val)
+
def _extract_values_from_json(self, result) -> List[str]:
"""
从解析后的 JSON 对象/数组中提取值数组
@@ -774,12 +816,12 @@ class TemplateFillService:
if isinstance(result, dict):
# 优先找 values 数组
if "values" in result and isinstance(result["values"], list):
- vals = [str(v).strip() for v in result["values"] if v and str(v).strip()]
+ vals = [self._format_value(v).strip() for v in result["values"] if self._format_value(v).strip()]
if vals:
return vals
# 尝试找 value 字段
if "value" in result:
- val = str(result["value"]).strip()
+ val = self._format_value(result["value"]).strip()
if val:
return [val]
# 尝试找任何数组类型的键
@@ -787,13 +829,13 @@ class TemplateFillService:
val = result[key]
if isinstance(val, list) and len(val) > 0:
if all(isinstance(v, (str, int, float, bool)) or v is None for v in val):
- vals = [str(v).strip() for v in val if v is not None and str(v).strip()]
+ vals = [self._format_value(v).strip() for v in val if v is not None and self._format_value(v).strip()]
if vals:
return vals
elif isinstance(val, (str, int, float, bool)):
- return [str(val).strip()]
+ return [self._format_value(val).strip()]
elif isinstance(result, list):
- vals = [str(v).strip() for v in result if v is not None and str(v).strip()]
+ vals = [self._format_value(v).strip() for v in result if v is not None and self._format_value(v).strip()]
if vals:
return vals
return []
@@ -930,15 +972,15 @@ class TemplateFillService:
if isinstance(parsed, dict):
# 如果是 {"values": [...]} 格式,提取 values
if "values" in parsed and isinstance(parsed["values"], list):
- return [str(v).strip() for v in parsed["values"] if v and str(v).strip()]
+ return [self._format_value(v).strip() for v in parsed["values"] if self._format_value(v).strip()]
# 如果是其他 dict 格式,尝试找 values 键
for key in ["values", "value", "data", "result"]:
if key in parsed and isinstance(parsed[key], list):
- return [str(v).strip() for v in parsed[key] if v and str(v).strip()]
+ return [self._format_value(v).strip() for v in parsed[key] if self._format_value(v).strip()]
elif key in parsed:
- return [str(parsed[key]).strip()]
+ return [self._format_value(parsed[key]).strip()]
elif isinstance(parsed, list):
- return [str(v).strip() for v in parsed if v and str(v).strip()]
+ return [self._format_value(v).strip() for v in parsed if self._format_value(v).strip()]
except (json.JSONDecodeError, TypeError):
pass
@@ -954,14 +996,14 @@ class TemplateFillService:
result = []
for item in arr:
if isinstance(item, dict) and "values" in item and isinstance(item["values"], list):
- result.extend([str(v).strip() for v in item["values"] if v and str(v).strip()])
+ result.extend([self._format_value(v).strip() for v in item["values"] if self._format_value(v).strip()])
elif isinstance(item, dict):
result.append(str(item))
else:
- result.append(str(item))
+ result.append(self._format_value(item))
if result:
return result
- return [str(v).strip() for v in arr if v and str(v).strip()]
+ return [self._format_value(v).strip() for v in arr if self._format_value(v).strip()]
except:
pass
From d5df5b8283b0e7697b6f829836a157dbe7a44331 Mon Sep 17 00:00:00 2001
From: KiriAky 107
Date: Thu, 9 Apr 2026 21:00:31 +0800
Subject: [PATCH 02/13] =?UTF-8?q?=E5=A2=9E=E5=BC=BA=E6=A8=A1=E6=9D=BF?=
=?UTF-8?q?=E5=A1=AB=E5=85=85=E6=9C=8D=E5=8A=A1=E6=94=AF=E6=8C=81=E9=9D=9E?=
=?UTF-8?q?=E7=BB=93=E6=9E=84=E5=8C=96=E6=96=87=E6=A1=A3AI=E5=88=86?=
=?UTF-8?q?=E6=9E=90?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
- 引入markdown_ai_service服务支持Markdown文档处理
- 实现_analyze_unstructured_docs_for_fields方法对非结构化文档进行AI分析
- 优化LLM提示词,改进数据提取的准确性和格式规范
- 支持从Markdown表格格式{tables: [{headers: [...], rows: [...]}]}中提取数据
- 添加文档章节结构解析,提升上下文理解能力
- 增加JSON响应格式修复功能,提高数据解析成功率
---
backend/app/services/template_fill_service.py | 200 +++++++++++++++++-
1 file changed, 193 insertions(+), 7 deletions(-)
diff --git a/backend/app/services/template_fill_service.py b/backend/app/services/template_fill_service.py
index 71976a6..dfea7f8 100644
--- a/backend/app/services/template_fill_service.py
+++ b/backend/app/services/template_fill_service.py
@@ -10,6 +10,7 @@ from typing import Any, Dict, List, Optional
from app.core.database import mongodb
from app.services.llm_service import llm_service
from app.core.document_parser import ParserFactory
+from app.services.markdown_ai_service import markdown_ai_service
logger = logging.getLogger(__name__)
@@ -233,6 +234,12 @@ class TemplateFillService:
confidence=1.0
)
+ # 无法直接从结构化数据提取,尝试 AI 分析非结构化文档
+ ai_structured = await self._analyze_unstructured_docs_for_fields(source_docs, field, user_hint)
+ if ai_structured:
+ logger.info(f"✅ 字段 {field.name} 通过 AI 分析结构化提取到数据")
+ return ai_structured
+
# 无法从结构化数据提取,使用 LLM
logger.info(f"字段 {field.name} 无法直接从结构化数据提取,使用 LLM...")
@@ -244,18 +251,20 @@ class TemplateFillService:
if user_hint:
hint_text = f"{user_hint}。{hint_text}"
- prompt = f"""你是一个专业的数据提取专家。请从以下文档内容中提取"{field.name}"字段的所有行数据。
+ prompt = f"""你是一个专业的数据提取专家。请从以下文档内容中提取与"{field.name}"相关的所有信息。
-参考文档内容(已提取" {field.name}"列的数据):
+提示词: {hint_text}
+
+文档内容:
{context_text}
-请提取上述所有行的" {field.name}"值,存入数组。每一行对应数组中的一个元素。
-如果某行该字段为空,请用空字符串""占位。
+请分析文档结构(可能包含表格、标题段落等),找出所有与"{field.name}"相关的数据。
+如果找到表格数据,返回多行值;如果是非表格段落,提取关键信息。
-请严格按照以下 JSON 格式输出,不要添加任何解释:
+请严格按照以下 JSON 格式输出:
{{
- "values": ["第1行的值", "第2行的值", "第3行的值", ...],
- "source": "数据来源的文档描述",
+ "values": ["第1行的值", "第2行的值", ...],
+ "source": "数据来源描述",
"confidence": 0.0到1.0之间的置信度
}}
"""
@@ -473,6 +482,29 @@ class TemplateFillService:
elif isinstance(row, list):
doc_content += " | ".join(str(cell) for cell in row) + "\n"
row_count += 1
+ elif doc.structured_data and doc.structured_data.get("tables"):
+ # Markdown 表格格式: {tables: [{headers: [...], rows: [...]}]}
+ tables = doc.structured_data.get("tables", [])
+ for table in tables:
+ if isinstance(table, dict):
+ headers = table.get("headers", [])
+ rows = table.get("rows", [])
+ if rows and headers:
+ doc_content += f"\n【文档: {doc.filename} - 表格】\n"
+ doc_content += " | ".join(str(h) for h in headers) + "\n"
+ for row in rows:
+ if isinstance(row, list):
+ doc_content += " | ".join(str(cell) for cell in row) + "\n"
+ row_count += 1
+ # 如果有标题结构,也添加上下文
+ if doc.structured_data.get("titles"):
+ titles = doc.structured_data.get("titles", [])
+ doc_content += f"\n【文档章节结构】\n"
+ for title in titles[:20]: # 限制前20个标题
+ doc_content += f"{'#' * title.get('level', 1)} {title.get('text', '')}\n"
+ # 如果没有提取到表格内容,使用纯文本
+ if not doc_content.strip():
+ doc_content = doc.content[:5000] if doc.content else ""
elif doc.content:
doc_content = doc.content[:5000]
@@ -720,6 +752,21 @@ class TemplateFillService:
logger.info(f"从文档 {doc.filename} 提取到 {len(values)} 个值")
break
+ # 处理 Markdown 表格格式: {tables: [{headers: [...], rows: [...]}]}
+ elif structured.get("tables"):
+ tables = structured.get("tables", [])
+ for table in tables:
+ if isinstance(table, dict):
+ headers = table.get("headers", [])
+ rows = table.get("rows", [])
+ values = self._extract_column_values(rows, headers, field_name)
+ if values:
+ all_values.extend(values)
+ logger.info(f"从 Markdown 表格提取到 {len(values)} 个值")
+ break
+ if all_values:
+ break
+
return all_values
def _extract_column_values(self, rows: List, columns: List, field_name: str) -> List[str]:
@@ -1005,6 +1052,145 @@ class TemplateFillService:
content = text.strip()[:500] if text.strip() else ""
return [content] if content else []
+ async def _analyze_unstructured_docs_for_fields(
+ self,
+ source_docs: List[SourceDocument],
+ field: TemplateField,
+ user_hint: Optional[str] = None
+ ) -> Optional[FillResult]:
+ """
+ 对非结构化文档进行 AI 分析,尝试提取结构化数据
+
+ 适用于 Markdown 等没有表格格式的文档,通过 AI 分析提取结构化信息
+
+ Args:
+ source_docs: 源文档列表
+ field: 字段定义
+ user_hint: 用户提示
+
+ Returns:
+ FillResult 如果提取成功,否则返回 None
+ """
+ # 找出非结构化的 Markdown/TXT 文档(没有表格的)
+ unstructured_docs = []
+ for doc in source_docs:
+ if doc.doc_type in ["md", "txt", "markdown"]:
+ # 检查是否有表格
+ has_tables = (
+ doc.structured_data and
+ doc.structured_data.get("tables") and
+ len(doc.structured_data.get("tables", [])) > 0
+ )
+ if not has_tables:
+ unstructured_docs.append(doc)
+
+ if not unstructured_docs:
+ return None
+
+ logger.info(f"发现 {len(unstructured_docs)} 个非结构化文档,尝试 AI 分析...")
+
+ # 对每个非结构化文档进行 AI 分析
+ for doc in unstructured_docs:
+ try:
+ # 使用 markdown_ai_service 的 statistics 分析类型
+ # 这种类型专门用于政府统计公报等包含数据的文档
+ hint_text = field.hint if field.hint else f"请提取{field.name}的信息"
+ if user_hint:
+ hint_text = f"{user_hint}。{hint_text}"
+
+ # 构建针对字段提取的提示词
+ prompt = f"""你是一个专业的数据提取专家。请从以下文档内容中提取与"{field.name}"相关的所有数据。
+
+字段提示: {hint_text}
+
+文档内容:
+{doc.content[:8000] if doc.content else ""}
+
+请完成以下任务:
+1. 仔细阅读文档,找出所有与"{field.name}"相关的数据
+2. 如果文档中有表格数据,提取表格中的对应列值
+3. 如果文档中是段落描述,提取其中的关键数值或结论
+4. 返回提取的所有值(可能多个,用数组存储)
+
+请用严格的 JSON 格式返回:
+{{
+ "values": ["值1", "值2", ...],
+ "source": "数据来源说明",
+ "confidence": 0.0到1.0之间的置信度
+}}
+
+如果没有找到相关数据,返回空数组 values: []"""
+
+ messages = [
+ {"role": "system", "content": "你是一个专业的数据提取助手,擅长从政府统计公报等文档中提取数据。请严格按JSON格式输出。"},
+ {"role": "user", "content": prompt}
+ ]
+
+ response = await self.llm.chat(
+ messages=messages,
+ temperature=0.1,
+ max_tokens=5000
+ )
+
+ content = self.llm.extract_message_content(response)
+ logger.info(f"AI 分析返回: {content[:500]}")
+
+ # 解析 JSON
+ import json
+ import re
+
+ # 清理 markdown 格式
+ cleaned = content.strip()
+ cleaned = re.sub(r'^```json\s*', '', cleaned, flags=re.MULTILINE)
+ cleaned = re.sub(r'^```\s*', '', cleaned, flags=re.MULTILINE)
+ cleaned = cleaned.strip()
+
+ # 查找 JSON
+ json_start = -1
+ for i, c in enumerate(cleaned):
+ if c == '{' or c == '[':
+ json_start = i
+ break
+
+ if json_start == -1:
+ continue
+
+ json_text = cleaned[json_start:]
+ try:
+ result = json.loads(json_text)
+ values = self._extract_values_from_json(result)
+ if values:
+ return FillResult(
+ field=field.name,
+ values=values,
+ value=values[0] if values else "",
+ source=f"AI分析: {doc.filename}",
+ confidence=result.get("confidence", 0.8)
+ )
+ except json.JSONDecodeError:
+ # 尝试修复 JSON
+ fixed = self._fix_json(json_text)
+ if fixed:
+ try:
+ result = json.loads(fixed)
+ values = self._extract_values_from_json(result)
+ if values:
+ return FillResult(
+ field=field.name,
+ values=values,
+ value=values[0] if values else "",
+ source=f"AI分析: {doc.filename}",
+ confidence=result.get("confidence", 0.8)
+ )
+ except json.JSONDecodeError:
+ pass
+
+ except Exception as e:
+ logger.warning(f"AI 分析文档 {doc.filename} 失败: {str(e)}")
+ continue
+
+ return None
+
# ==================== 全局单例 ====================
From 78417c898a30c150b478816b94a1234458359b97 Mon Sep 17 00:00:00 2001
From: dj <431634905@qq.com>
Date: Thu, 9 Apr 2026 21:42:07 +0800
Subject: [PATCH 03/13] =?UTF-8?q?=E6=94=B9=E8=BF=9B=E6=99=BA=E8=83=BD?=
=?UTF-8?q?=E5=A1=AB=E8=A1=A8=E5=8A=9F=E8=83=BD=EF=BC=9A=E6=94=AF=E6=8C=81?=
=?UTF-8?q?Markdown=E8=A1=A8=E6=A0=BC=E6=8F=90=E5=8F=96=E5=92=8C=E4=BF=AE?=
=?UTF-8?q?=E5=A4=8DLLM=E8=B0=83=E7=94=A8?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
- 新增对MongoDB存储的tables格式支持,直接从structured_data.tables提取数据
- 修复max_tokens值过大问题(50000→4000),解决DeepSeek API限制
- 增强列名匹配算法,支持模糊匹配
- 添加详细日志便于调试结构化数据提取过程
Co-Authored-By: Claude Opus 4.6
---
backend/app/services/template_fill_service.py | 208 ++++++++++++++++--
1 file changed, 192 insertions(+), 16 deletions(-)
diff --git a/backend/app/services/template_fill_service.py b/backend/app/services/template_fill_service.py
index f564ab9..00fa270 100644
--- a/backend/app/services/template_fill_service.py
+++ b/backend/app/services/template_fill_service.py
@@ -77,12 +77,19 @@ class TemplateFillService:
fill_details = []
logger.info(f"开始填表: {len(template_fields)} 个字段, {len(source_doc_ids or [])} 个源文档")
+ logger.info(f"source_doc_ids: {source_doc_ids}")
+ logger.info(f"source_file_paths: {source_file_paths}")
# 1. 加载源文档内容
source_docs = await self._load_source_documents(source_doc_ids, source_file_paths)
logger.info(f"加载了 {len(source_docs)} 个源文档")
+ # 打印每个加载的文档的详细信息
+ for i, doc in enumerate(source_docs):
+ logger.info(f" 文档[{i}]: id={doc.doc_id}, filename={doc.filename}, doc_type={doc.doc_type}")
+ logger.info(f" content长度: {len(doc.content)}, structured_data keys: {list(doc.structured_data.keys()) if doc.structured_data else 'None'}")
+
if not source_docs:
logger.warning("没有找到源文档,填表结果将全部为空")
@@ -157,14 +164,21 @@ class TemplateFillService:
try:
doc = await mongodb.get_document(doc_id)
if doc:
+ sd = doc.get("structured_data", {})
+ sd_keys = list(sd.keys()) if sd else []
+ logger.info(f"从MongoDB加载文档: {doc_id}, doc_type={doc.get('doc_type')}, structured_data keys={sd_keys}")
+ if sd.get("tables"):
+ logger.info(f" tables数量: {len(sd.get('tables', []))}")
+ if sd["tables"]:
+ first_table = sd["tables"][0]
+ logger.info(f" 第一表格: headers={first_table.get('headers', [])[:3]}..., rows数量={len(first_table.get('rows', []))}")
source_docs.append(SourceDocument(
doc_id=doc_id,
filename=doc.get("metadata", {}).get("original_filename", "unknown"),
doc_type=doc.get("doc_type", "unknown"),
content=doc.get("content", ""),
- structured_data=doc.get("structured_data", {})
+ structured_data=sd
))
- logger.info(f"从MongoDB加载文档: {doc_id}")
except Exception as e:
logger.error(f"从MongoDB加载文档失败 {doc_id}: {str(e)}")
@@ -178,10 +192,48 @@ class TemplateFillService:
# result.data 的结构取决于解析器类型:
# - Excel 单 sheet: {columns: [...], rows: [...], row_count, column_count}
# - Excel 多 sheet: {sheets: {sheet_name: {columns, rows, ...}}}
+ # - Markdown: {content: "...", tables: [...], structured_data: {tables: [...]}}
# - Word/TXT: {content: "...", structured_data: {...}}
doc_data = result.data if result.data else {}
doc_content = doc_data.get("content", "") if isinstance(doc_data, dict) else ""
- doc_structured = doc_data if isinstance(doc_data, dict) and "rows" in doc_data or isinstance(doc_data, dict) and "sheets" in doc_data else {}
+
+ # 检查并提取 structured_data
+ doc_structured = {}
+ if isinstance(doc_data, dict):
+ logger.info(f"文档 {file_path} doc_data keys: {list(doc_data.keys())}")
+
+ # Excel 多 sheet
+ if "sheets" in doc_data:
+ doc_structured = doc_data
+ logger.info(f" -> 使用 Excel 多 sheet 格式")
+ # Excel 单 sheet 或有 rows 的格式
+ elif "rows" in doc_data:
+ doc_structured = doc_data
+ logger.info(f" -> 使用 rows 格式,列数: {len(doc_data.get('columns', []))}")
+ # Markdown 格式:tables 可能直接在 doc_data.tables 或在 structured_data.tables 中
+ elif "tables" in doc_data and doc_data["tables"]:
+ # Markdown: tables 直接在 doc_data 中
+ tables = doc_data["tables"]
+ first_table = tables[0]
+ doc_structured = {
+ "headers": first_table.get("headers", []),
+ "rows": first_table.get("rows", [])
+ }
+ logger.info(f" -> 使用 doc_data.tables 格式,表头: {doc_structured.get('headers', [])[:5]}")
+ elif "structured_data" in doc_data and isinstance(doc_data["structured_data"], dict):
+ # Markdown: tables 在 structured_data 中
+ tables = doc_data["structured_data"].get("tables", [])
+ if tables:
+ first_table = tables[0]
+ doc_structured = {
+ "headers": first_table.get("headers", []),
+ "rows": first_table.get("rows", [])
+ }
+ logger.info(f" -> 使用 structured_data.tables 格式,表头: {doc_structured.get('headers', [])[:5]}")
+ else:
+ logger.warning(f" -> structured_data.tables 为空")
+ else:
+ logger.warning(f" -> 未识别的文档格式,无 structured_data")
source_docs.append(SourceDocument(
doc_id=file_path,
@@ -270,7 +322,7 @@ class TemplateFillService:
response = await self.llm.chat(
messages=messages,
temperature=0.1,
- max_tokens=50000
+ max_tokens=4000
)
content = self.llm.extract_message_content(response)
@@ -675,7 +727,7 @@ class TemplateFillService:
def _extract_values_from_structured_data(self, source_docs: List[SourceDocument], field_name: str) -> List[str]:
"""
- 从结构化数据(Excel rows)中直接提取指定列的值
+ 从结构化数据(Excel rows 或 Markdown tables)中直接提取指定列的值
适用于有 rows 结构的文档数据,无需 LLM 即可提取
@@ -687,10 +739,15 @@ class TemplateFillService:
值列表,如果无法提取则返回空列表
"""
all_values = []
+ logger.info(f"[_extract_values_from_structured_data] 开始提取字段: {field_name}")
+ logger.info(f" source_docs 数量: {len(source_docs)}")
- for doc in source_docs:
+ for doc_idx, doc in enumerate(source_docs):
# 尝试从 structured_data 中提取
structured = doc.structured_data
+ logger.info(f" 文档[{doc_idx}]: {doc.filename}, structured类型: {type(structured)}, 是否为空: {not bool(structured)}")
+ if structured:
+ logger.info(f" structured_data keys: {list(structured.keys())}")
if not structured:
continue
@@ -710,6 +767,33 @@ class TemplateFillService:
if all_values:
break
+ # 处理 Markdown 表格格式: {headers: [...], rows: [...], ...}
+ elif structured.get("headers") and structured.get("rows"):
+ headers = structured.get("headers", [])
+ rows = structured.get("rows", [])
+ values = self._extract_values_from_markdown_table(headers, rows, field_name)
+ if values:
+ all_values.extend(values)
+ logger.info(f"从 Markdown 文档 {doc.filename} 提取到 {len(values)} 个值")
+ break
+
+ # 处理 MongoDB 存储的 tables 格式: {tables: [{headers, rows, ...}, ...]}
+ elif structured.get("tables") and isinstance(structured.get("tables"), list):
+ tables = structured.get("tables", [])
+ logger.info(f" 检测到 tables 格式,共 {len(tables)} 个表")
+ for table_idx, table in enumerate(tables):
+ if isinstance(table, dict):
+ headers = table.get("headers", [])
+ rows = table.get("rows", [])
+ logger.info(f" 表格[{table_idx}]: headers={headers[:3]}..., rows数量={len(rows)}")
+ values = self._extract_values_from_markdown_table(headers, rows, field_name)
+ if values:
+ all_values.extend(values)
+ logger.info(f"从表格[{table_idx}] 提取到 {len(values)} 个值")
+ break
+ if all_values:
+ break
+
# 处理单 sheet 格式: {columns: [...], rows: [...]}
elif structured.get("rows"):
columns = structured.get("columns", [])
@@ -722,6 +806,100 @@ class TemplateFillService:
return all_values
+ def _extract_values_from_markdown_table(self, headers: List, rows: List, field_name: str) -> List[str]:
+ """
+ 从 Markdown 表格中提取指定列的值
+
+ Markdown 表格格式:
+ - headers: ["col1", "col2", ...]
+ - rows: [["val1", "val2", ...], ...]
+
+ Args:
+ headers: 表头列表
+ rows: 数据行列表
+ field_name: 要提取的字段名
+
+ Returns:
+ 值列表
+ """
+ if not rows or not headers:
+ logger.warning(f"Markdown 表格为空: headers={headers}, rows={len(rows) if rows else 0}")
+ return []
+
+ # 查找匹配的列索引 - 使用增强的匹配算法
+ target_idx = self._find_best_matching_column(headers, field_name)
+
+ if target_idx is None:
+ logger.warning(f"未找到匹配列: {field_name}, 可用表头: {headers}")
+ return []
+
+ logger.info(f"列匹配成功: {field_name} -> {headers[target_idx]} (索引: {target_idx})")
+
+ values = []
+ for row in rows:
+ if isinstance(row, list) and target_idx < len(row):
+ val = row[target_idx]
+ else:
+ val = ""
+ values.append(self._format_value(val))
+
+ return values
+
+ def _find_best_matching_column(self, headers: List, field_name: str) -> Optional[int]:
+ """
+ 查找最佳匹配的列索引
+
+ 使用多层匹配策略:
+ 1. 精确匹配(忽略大小写)
+ 2. 子字符串匹配(字段名在表头中,或表头在字段名中)
+ 3. 关键词重叠匹配(中文字符串分割后比对)
+
+ Args:
+ headers: 表头列表
+ field_name: 要匹配的字段名
+
+ Returns:
+ 匹配的列索引,找不到返回 None
+ """
+ field_lower = field_name.lower().strip()
+ field_keywords = set(field_lower.replace(" ", "").split())
+
+ best_match_idx = None
+ best_match_score = 0
+
+ for idx, header in enumerate(headers):
+ header_str = str(header).strip()
+ header_lower = header_str.lower()
+
+ # 策略1: 精确匹配(忽略大小写)
+ if header_lower == field_lower:
+ return idx
+
+ # 策略2: 子字符串匹配
+ if field_lower in header_lower or header_lower in field_lower:
+ # 计算匹配分数(较长匹配更优先)
+ score = max(len(field_lower), len(header_lower)) / min(len(field_lower) + 1, len(header_lower) + 1)
+ if score > best_match_score:
+ best_match_score = score
+ best_match_idx = idx
+ continue
+
+ # 策略3: 关键词重叠匹配(适用于中文)
+ header_keywords = set(header_lower.replace(" ", "").split())
+ overlap = field_keywords & header_keywords
+ if overlap and len(overlap) > 0:
+ score = len(overlap) / max(len(field_keywords), len(header_keywords), 1)
+ if score > best_match_score:
+ best_match_score = score
+ best_match_idx = idx
+
+ # 只有当匹配分数超过阈值时才返回
+ if best_match_score >= 0.3:
+ logger.info(f"模糊匹配: {field_name} -> {headers[best_match_idx]} (分数: {best_match_score:.2f})")
+ return best_match_idx
+
+ return None
+
def _extract_column_values(self, rows: List, columns: List, field_name: str) -> List[str]:
"""
从 rows 和 columns 中提取指定列的值
@@ -737,24 +915,22 @@ class TemplateFillService:
if not rows or not columns:
return []
- # 查找匹配的列(模糊匹配)
- target_col = None
- for col in columns:
- col_str = str(col)
- if field_name.lower() in col_str.lower() or col_str.lower() in field_name.lower():
- target_col = col
- break
+ # 使用增强的匹配算法查找最佳匹配的列索引
+ target_idx = self._find_best_matching_column(columns, field_name)
- if not target_col:
+ if target_idx is None:
logger.warning(f"未找到匹配列: {field_name}, 可用列: {columns}")
return []
+ target_col = columns[target_idx]
+ logger.info(f"列匹配成功: {field_name} -> {target_col} (索引: {target_idx})")
+
values = []
for row in rows:
if isinstance(row, dict):
val = row.get(target_col, "")
- elif isinstance(row, list) and target_col in columns:
- val = row[columns.index(target_col)]
+ elif isinstance(row, list) and target_idx < len(row):
+ val = row[target_idx]
else:
val = ""
values.append(self._format_value(val))
From 7f67fa89de3894b23425f1554d79705937a6bf4d Mon Sep 17 00:00:00 2001
From: KiriAky 107
Date: Thu, 9 Apr 2026 22:15:37 +0800
Subject: [PATCH 04/13] =?UTF-8?q?=E6=B7=BB=E5=8A=A0AI=E7=94=9F=E6=88=90?=
=?UTF-8?q?=E8=A1=A8=E5=A4=B4=E5=8A=9F=E8=83=BD=E5=B9=B6=E9=87=8D=E6=9E=84?=
=?UTF-8?q?=E5=89=8D=E7=AB=AF=E7=8A=B6=E6=80=81=E7=AE=A1=E7=90=86?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
- 后端:实现AI生成表头逻辑,当模板没有提取到字段或所有字段均为自动命名时调用AI分析并生成合适字段
- 后端:添加_is_auto_generated_field方法识别自动生成的无效表头字段
- 后端:修改_get_template_fields_from_excel方法,新增file_type参数(位于file_path之前)
- 前端:创建TemplateFillContext提供全局状态管理
- 前端:将TemplateFill页面状态迁移到Context中统一管理
- 前端:移除页面内重复的状态定义和方法实现
---
backend/app/services/template_fill_service.py | 155 +++++++++++++++++-
frontend/src/App.tsx | 7 +-
frontend/src/context/TemplateFillContext.tsx | 114 +++++++++++++
frontend/src/pages/TemplateFill.tsx | 89 ++--------
4 files changed, 288 insertions(+), 77 deletions(-)
create mode 100644 frontend/src/context/TemplateFillContext.tsx
diff --git a/backend/app/services/template_fill_service.py b/backend/app/services/template_fill_service.py
index dfea7f8..e744d09 100644
--- a/backend/app/services/template_fill_service.py
+++ b/backend/app/services/template_fill_service.py
@@ -545,16 +545,47 @@ class TemplateFillService:
try:
if file_type in ["xlsx", "xls"]:
- fields = await self._get_template_fields_from_excel(file_path)
+ fields = await self._get_template_fields_from_excel(file_type, file_path)
elif file_type == "docx":
fields = await self._get_template_fields_from_docx(file_path)
+ # 检查是否需要 AI 生成表头
+ # 条件:没有字段 OR 所有字段都是自动命名的(如"字段1"、"列1"、"Unnamed"开头)
+ needs_ai_generation = (
+ len(fields) == 0 or
+ all(self._is_auto_generated_field(f.name) for f in fields)
+ )
+
+ if needs_ai_generation:
+ logger.info(f"模板表头为空或自动生成,尝试 AI 生成表头... (fields={len(fields)})")
+ ai_fields = await self._generate_fields_with_ai(file_path, file_type)
+ if ai_fields:
+ fields = ai_fields
+ logger.info(f"AI 生成表头成功: {len(fields)} 个字段")
+
except Exception as e:
logger.error(f"提取模板字段失败: {str(e)}")
return fields
- async def _get_template_fields_from_excel(self, file_path: str) -> List[TemplateField]:
+ def _is_auto_generated_field(self, name: str) -> bool:
+ """检查字段名是否是自动生成的(无效表头)"""
+ import re
+ if not name:
+ return True
+ name_str = str(name).strip()
+ # 匹配 "字段1", "列1", "Field1", "Column1" 等自动生成的名字
+ # 或 "Unnamed: 0" 等 Excel 默认名字
+ if name_str.startswith('Unnamed'):
+ return True
+ if re.match(r'^[列字段ColumnField]+\d+$', name_str, re.IGNORECASE):
+ return True
+ if name_str in ['0', '1', '2'] or name_str.startswith('0.') or name_str.startswith('1.'):
+ # 纯数字或类似 "0.1" 的列名
+ return True
+ return False
+
+ async def _get_template_fields_from_excel(self, file_type: str, file_path: str) -> List[TemplateField]:
"""从 Excel 模板提取字段"""
fields = []
@@ -1191,6 +1222,126 @@ class TemplateFillService:
return None
+ async def _generate_fields_with_ai(
+ self,
+ file_path: str,
+ file_type: str
+ ) -> Optional[List[TemplateField]]:
+ """
+ 使用 AI 为空表生成表头字段
+
+ 当模板文件为空或没有表头时,调用 AI 分析并生成合适的字段名
+
+ Args:
+ file_path: 模板文件路径
+ file_type: 文件类型
+
+ Returns:
+ 生成的字段列表,如果失败返回 None
+ """
+ try:
+ import pandas as pd
+
+ # 读取 Excel 内容检查是否为空
+ if file_type in ["xlsx", "xls"]:
+ df = pd.read_excel(file_path, header=None)
+ if df.shape[0] == 0 or df.shape[1] == 0:
+ logger.info("Excel 表格为空")
+ # 生成默认字段
+ return [TemplateField(
+ cell=self._column_to_cell(i),
+ name=f"字段{i+1}",
+ field_type="text",
+ required=False,
+ hint="请填写此字段"
+ ) for i in range(5)]
+
+ # 表格有数据但没有表头
+ if df.shape[1] > 0:
+ # 读取第一行作为参考,看是否为空
+ first_row = df.iloc[0].tolist() if len(df) > 0 else []
+ if not any(pd.notna(v) and str(v).strip() != '' for v in first_row):
+ # 第一行为空,AI 生成表头
+ content_sample = df.iloc[:10].to_string() if len(df) >= 10 else df.to_string()
+ else:
+ content_sample = df.to_string()
+ else:
+ content_sample = ""
+
+ # 调用 AI 生成表头
+ prompt = f"""你是一个专业的表格设计助手。请为以下空白表格生成合适的表头字段。
+
+表格内容预览:
+{content_sample[:2000] if content_sample else "空白表格"}
+
+请生成5-10个简洁的表头字段名,这些字段应该:
+1. 简洁明了,易于理解
+2. 适合作为表格列标题
+3. 之间有明显的区分度
+
+请严格按照以下 JSON 格式输出(只需输出 JSON,不要其他内容):
+{{
+ "fields": [
+ {{"name": "字段名1", "hint": "字段说明提示1"}},
+ {{"name": "字段名2", "hint": "字段说明提示2"}}
+ ]
+}}
+"""
+ messages = [
+ {"role": "system", "content": "你是一个专业的表格设计助手。请严格按JSON格式输出。"},
+ {"role": "user", "content": prompt}
+ ]
+
+ response = await self.llm.chat(
+ messages=messages,
+ temperature=0.3,
+ max_tokens=2000
+ )
+
+ content = self.llm.extract_message_content(response)
+ logger.info(f"AI 生成表头返回: {content[:500]}")
+
+ # 解析 JSON
+ import json
+ import re
+
+ # 清理 markdown 格式
+ cleaned = content.strip()
+ cleaned = re.sub(r'^```json\s*', '', cleaned, flags=re.MULTILINE)
+ cleaned = re.sub(r'^```\s*', '', cleaned, flags=re.MULTILINE)
+ cleaned = cleaned.strip()
+
+ # 查找 JSON
+ json_start = -1
+ for i, c in enumerate(cleaned):
+ if c == '{':
+ json_start = i
+ break
+
+ if json_start == -1:
+ logger.warning("无法找到 JSON 开始位置")
+ return None
+
+ json_text = cleaned[json_start:]
+ result = json.loads(json_text)
+
+ if result and "fields" in result:
+ fields = []
+ for idx, f in enumerate(result["fields"]):
+ fields.append(TemplateField(
+ cell=self._column_to_cell(idx),
+ name=f.get("name", f"字段{idx+1}"),
+ field_type="text",
+ required=False,
+ hint=f.get("hint", "")
+ ))
+ return fields
+
+ except Exception as e:
+ logger.error(f"AI 生成表头失败: {str(e)}")
+
+ return None
+
# ==================== 全局单例 ====================
diff --git a/frontend/src/App.tsx b/frontend/src/App.tsx
index 877c55f..e764335 100644
--- a/frontend/src/App.tsx
+++ b/frontend/src/App.tsx
@@ -1,13 +1,16 @@
import { RouterProvider } from 'react-router-dom';
import { AuthProvider } from '@/context/AuthContext';
+import { TemplateFillProvider } from '@/context/TemplateFillContext';
import { router } from '@/routes';
import { Toaster } from 'sonner';
function App() {
return (
-
-
+
+
+
+
);
}
diff --git a/frontend/src/context/TemplateFillContext.tsx b/frontend/src/context/TemplateFillContext.tsx
new file mode 100644
index 0000000..76ba073
--- /dev/null
+++ b/frontend/src/context/TemplateFillContext.tsx
@@ -0,0 +1,114 @@
+import React, { createContext, useContext, useState, ReactNode } from 'react';
+
+type SourceFile = {
+ file: File;
+ preview?: string;
+};
+
+type TemplateField = {
+ cell: string;
+ name: string;
+ field_type: string;
+ required: boolean;
+ hint?: string;
+};
+
+type Step = 'upload' | 'filling' | 'preview';
+
+interface TemplateFillState {
+ step: Step;
+ templateFile: File | null;
+ templateFields: TemplateField[];
+ sourceFiles: SourceFile[];
+ sourceFilePaths: string[];
+ templateId: string;
+ filledResult: any;
+ setStep: (step: Step) => void;
+ setTemplateFile: (file: File | null) => void;
+ setTemplateFields: (fields: TemplateField[]) => void;
+ setSourceFiles: (files: SourceFile[]) => void;
+ addSourceFiles: (files: SourceFile[]) => void;
+ removeSourceFile: (index: number) => void;
+ setSourceFilePaths: (paths: string[]) => void;
+ setTemplateId: (id: string) => void;
+ setFilledResult: (result: any) => void;
+ reset: () => void;
+}
+
+const initialState = {
+ step: 'upload' as Step,
+ templateFile: null,
+ templateFields: [],
+ sourceFiles: [],
+ sourceFilePaths: [],
+ templateId: '',
+ filledResult: null,
+ setStep: () => {},
+ setTemplateFile: () => {},
+ setTemplateFields: () => {},
+ setSourceFiles: () => {},
+ addSourceFiles: () => {},
+ removeSourceFile: () => {},
+ setSourceFilePaths: () => {},
+ setTemplateId: () => {},
+ setFilledResult: () => {},
+ reset: () => {},
+};
+
+const TemplateFillContext = createContext(initialState);
+
+export const TemplateFillProvider: React.FC<{ children: ReactNode }> = ({ children }) => {
+ const [step, setStep] = useState('upload');
+ const [templateFile, setTemplateFile] = useState(null);
+ const [templateFields, setTemplateFields] = useState([]);
+ const [sourceFiles, setSourceFiles] = useState([]);
+ const [sourceFilePaths, setSourceFilePaths] = useState([]);
+ const [templateId, setTemplateId] = useState('');
+ const [filledResult, setFilledResult] = useState(null);
+
+ const addSourceFiles = (files: SourceFile[]) => {
+ setSourceFiles(prev => [...prev, ...files]);
+ };
+
+ const removeSourceFile = (index: number) => {
+ setSourceFiles(prev => prev.filter((_, i) => i !== index));
+ };
+
+ const reset = () => {
+ setStep('upload');
+ setTemplateFile(null);
+ setTemplateFields([]);
+ setSourceFiles([]);
+ setSourceFilePaths([]);
+ setTemplateId('');
+ setFilledResult(null);
+ };
+
+ return (
+
+ {children}
+
+ );
+};
+
+export const useTemplateFill = () => useContext(TemplateFillContext);
diff --git a/frontend/src/pages/TemplateFill.tsx b/frontend/src/pages/TemplateFill.tsx
index 1fa7c99..d3e57c9 100644
--- a/frontend/src/pages/TemplateFill.tsx
+++ b/frontend/src/pages/TemplateFill.tsx
@@ -37,6 +37,7 @@ import {
DialogTitle,
} from "@/components/ui/dialog";
import { ScrollArea } from '@/components/ui/scroll-area';
+import { useTemplateFill } from '@/context/TemplateFillContext';
type DocumentItem = {
doc_id: string;
@@ -52,29 +53,19 @@ type DocumentItem = {
};
};
-type SourceFile = {
- file: File;
- preview?: string;
-};
-
-type TemplateField = {
- cell: string;
- name: string;
- field_type: string;
- required: boolean;
- hint?: string;
-};
-
const TemplateFill: React.FC = () => {
- const [step, setStep] = useState<'upload' | 'filling' | 'preview'>('upload');
- const [templateFile, setTemplateFile] = useState(null);
- const [templateFields, setTemplateFields] = useState([]);
- const [sourceFiles, setSourceFiles] = useState([]);
- const [sourceFilePaths, setSourceFilePaths] = useState([]);
- const [templateId, setTemplateId] = useState('');
+ const {
+ step, setStep,
+ templateFile, setTemplateFile,
+ templateFields, setTemplateFields,
+ sourceFiles, setSourceFiles, addSourceFiles, removeSourceFile,
+ sourceFilePaths, setSourceFilePaths,
+ templateId, setTemplateId,
+ filledResult, setFilledResult,
+ reset
+ } = useTemplateFill();
+
const [loading, setLoading] = useState(false);
- const [filling, setFilling] = useState(false);
- const [filledResult, setFilledResult] = useState(null);
const [previewDoc, setPreviewDoc] = useState<{ name: string; content: string } | null>(null);
const [previewOpen, setPreviewOpen] = useState(false);
@@ -103,8 +94,8 @@ const TemplateFill: React.FC = () => {
file: f,
preview: f.type.startsWith('text/') || f.name.endsWith('.md') ? undefined : undefined
}));
- setSourceFiles(prev => [...prev, ...newFiles]);
- }, []);
+ addSourceFiles(newFiles);
+ }, [addSourceFiles]);
const { getRootProps: getSourceProps, getInputProps: getSourceInputProps, isDragActive: isSourceDragActive } = useDropzone({
onDrop: onSourceDrop,
@@ -118,10 +109,6 @@ const TemplateFill: React.FC = () => {
multiple: true
});
- const removeSourceFile = (index: number) => {
- setSourceFiles(prev => prev.filter((_, i) => i !== index));
- };
-
const handleJointUploadAndFill = async () => {
if (!templateFile) {
toast.error('请先上传模板文件');
@@ -164,40 +151,6 @@ const TemplateFill: React.FC = () => {
}
};
- // 传统方式:先上传源文档再填表(兼容已有文档库的场景)
- const handleFillWithExistingDocs = async (selectedDocIds: string[]) => {
- if (!templateFile || selectedDocIds.length === 0) {
- toast.error('请选择数据源文档');
- return;
- }
-
- setLoading(true);
- setStep('filling');
-
- try {
- // 先上传模板获取template_id
- const uploadResult = await backendApi.uploadTemplate(templateFile);
-
- const fillResult = await backendApi.fillTemplate(
- uploadResult.template_id,
- uploadResult.fields || [],
- selectedDocIds,
- [],
- '请从以下文档中提取相关信息填写表格'
- );
-
- setTemplateFields(uploadResult.fields || []);
- setTemplateId(uploadResult.template_id);
- setFilledResult(fillResult);
- setStep('preview');
- toast.success('表格填写完成');
- } catch (err: any) {
- toast.error('填表失败: ' + (err.message || '未知错误'));
- } finally {
- setLoading(false);
- }
- };
-
const handleExport = async () => {
if (!templateFile || !filledResult) return;
@@ -219,16 +172,6 @@ const TemplateFill: React.FC = () => {
}
};
- const resetFlow = () => {
- setStep('upload');
- setTemplateFile(null);
- setTemplateFields([]);
- setSourceFiles([]);
- setSourceFilePaths([]);
- setTemplateId('');
- setFilledResult(null);
- };
-
const getFileIcon = (filename: string) => {
const ext = filename.split('.').pop()?.toLowerCase();
if (['xlsx', 'xls'].includes(ext || '')) {
@@ -253,7 +196,7 @@ const TemplateFill: React.FC = () => {
{step !== 'upload' && (
-