This commit is contained in:
zzz
2026-04-08 20:23:51 +08:00
parent 6f8976cf71
commit 38e41c6eff
6 changed files with 663 additions and 149 deletions

View File

@@ -161,3 +161,133 @@ class DocxParser(BaseParser):
fields[field_name] = match.group(1)
return fields
def parse_tables_for_template(
    self,
    file_path: str
) -> Dict[str, Any]:
    """
    Parse the tables of a Word document and extract template fields.

    Written for competition-style template tables, which this method
    reads as: field name | hint | value to fill in.

    Args:
        file_path: Path to the Word file.

    Returns:
        A dict with three keys:
            "tables": one entry per table with "table_index", "headers"
                (stripped text of the first row), "data_rows" and "rows"
                (stripped cell text of every later row) and "field_hints"
                (field name -> hint).
            "fields": flat list of field records over all tables, each
                with "table_index", "row_index", "field_name", "hint"
                and "expected_value".
            "field_count": number of entries in "fields".
    """
    # Local import: python-docx is loaded when this method runs.
    from docx import Document

    doc = Document(file_path)

    template_info = {
        "tables": [],
        "fields": [],
        "field_count": 0
    }

    for table_idx, table in enumerate(doc.tables):
        table_info = {
            "table_index": table_idx,
            "rows": [],
            "headers": [],
            "data_rows": [],
            "field_hints": {}  # field name -> hint / description
        }

        # Materialise the rows once so the header and the data rows are
        # taken from the same snapshot; a table without rows is recorded
        # with empty lists instead of indexing rows[0].
        rows = list(table.rows)
        if rows:
            # The first row is treated as the header.
            headers = [cell.text.strip() for cell in rows[0].cells]
            table_info["headers"] = headers

            # Hints are only collected when the header row has at least
            # two columns (field name + hint).
            has_hint_column = len(headers) >= 2

            # Data rows are numbered from 1 because row 0 is the header.
            for row_idx, row in enumerate(rows[1:], 1):
                cells = [cell.text.strip() for cell in row.cells]
                table_info["data_rows"].append(cells)
                table_info["rows"].append({
                    "row_index": row_idx,
                    "cells": cells
                })

                # Column 0 is the field name, column 1 the hint and
                # column 2 (when present) the value. Rows whose first
                # cell is blank do not define a field.
                if has_hint_column and len(cells) >= 2 and cells[0]:
                    field_name = cells[0]
                    hint = cells[1]
                    # A repeated field name overwrites the earlier hint
                    # here, but every occurrence is kept in "fields".
                    table_info["field_hints"][field_name] = hint
                    template_info["fields"].append({
                        "table_index": table_idx,
                        "row_index": row_idx,
                        "field_name": field_name,
                        "hint": hint,
                        "expected_value": cells[2] if len(cells) > 2 else ""
                    })

        template_info["tables"].append(table_info)

    template_info["field_count"] = len(template_info["fields"])
    return template_info
def extract_template_fields_from_docx(
    self,
    file_path: str
) -> List[Dict[str, Any]]:
    """
    Build the list of template field definitions for a Word document.

    Meant for competition scoring tables where the first column is the
    field name and the second column is a hint or a fill-in example.

    Args:
        file_path: Path to the Word file.

    Returns:
        One dict per field found by parse_tables_for_template, with the
        keys "cell", "name", "hint", "table_index", "row_index",
        "field_type" and "required" (always True).
    """
    parsed = self.parse_tables_for_template(file_path)
    infer_type = self._infer_field_type_from_hint

    return [
        {
            # "T<table>R<row>" locates the field inside the document.
            "cell": f"T{entry['table_index']}R{entry['row_index']}",
            "name": entry["field_name"],
            "hint": entry["hint"],
            "table_index": entry["table_index"],
            "row_index": entry["row_index"],
            "field_type": infer_type(entry["hint"]),
            "required": True
        }
        for entry in parsed["fields"]
    ]
def _infer_field_type_from_hint(self, hint: str) -> str:
"""
从提示词推断字段类型
Args:
hint: 字段提示词
Returns:
字段类型 (text/number/date)
"""
hint_lower = hint.lower()
# 日期关键词
date_keywords = ["", "", "", "日期", "时间", "出生"]
if any(kw in hint for kw in date_keywords):
return "date"
# 数字关键词
number_keywords = ["数量", "金额", "人数", "面积", "增长", "比率", "%", ""]
if any(kw in hint_lower for kw in number_keywords):
return "number"
return "text"