zyh
This commit is contained in:
@@ -161,3 +161,133 @@ class DocxParser(BaseParser):
|
||||
fields[field_name] = match.group(1)
|
||||
|
||||
return fields
|
||||
|
||||
def parse_tables_for_template(
    self,
    file_path: str
) -> Dict[str, Any]:
    """
    Parse the tables of a Word document and extract template fields.

    Intended for the competition scenario: the document contains table
    templates and this method identifies the fields that need filling in.
    The first row of every table is treated as the header row; every
    following row is a data row. When the header row has at least two
    cells, each data row whose first cell is non-empty is recorded as a
    field (column 1 = field name, column 2 = hint, column 3 = expected
    value when present).

    Args:
        file_path: Path to the Word (.docx) file.

    Returns:
        A dict with the keys:
            "tables": one entry per table holding "table_index", "rows"
                (dicts with "row_index" and "cells"), "headers",
                "data_rows" and "field_hints" (field name -> hint).
            "fields": flat list of field dicts with "table_index",
                "row_index", "field_name", "hint" and "expected_value".
            "field_count": number of entries in "fields".
    """
    # Imported lazily so python-docx is only required when this is called.
    from docx import Document

    doc = Document(file_path)

    template_info: Dict[str, Any] = {
        "tables": [],
        "fields": [],
        "field_count": 0
    }

    for table_idx, table in enumerate(doc.tables):
        table_info: Dict[str, Any] = {
            "table_index": table_idx,
            "rows": [],
            "headers": [],
            "data_rows": [],
            "field_hints": {}  # field name -> hint / description
        }

        # Read the text of every cell exactly once; resolving row.cells is
        # comparatively expensive, so it should not be repeated per pass.
        rows_text = [
            [cell.text.strip() for cell in row.cells]
            for row in table.rows
        ]

        # A table without any row contributes an empty table_info instead
        # of failing on an out-of-range access to the header row.
        if rows_text:
            header_cells = rows_text[0]
            table_info["headers"] = header_cells

            # Competition templates are usually laid out as:
            #   field name | hint | value to fill in
            # so hints are only looked for when the header has >= 2 columns.
            has_hint_column = len(header_cells) >= 2

            # Data rows are numbered from 1 (row 0 is the header).
            for row_idx, cells in enumerate(rows_text[1:], 1):
                table_info["data_rows"].append(cells)
                table_info["rows"].append({
                    "row_index": row_idx,
                    "cells": cells
                })

                if has_hint_column and len(cells) >= 2 and cells[0]:
                    field_name = cells[0]  # first column: field name
                    hint = cells[1]        # second column: hint / description
                    # A repeated field name overwrites the earlier hint here,
                    # but every occurrence is still listed in "fields".
                    table_info["field_hints"][field_name] = hint

                    template_info["fields"].append({
                        "table_index": table_idx,
                        "row_index": row_idx,
                        "field_name": field_name,
                        "hint": hint,
                        "expected_value": cells[2] if len(cells) > 2 else ""
                    })

        template_info["tables"].append(table_info)

    template_info["field_count"] = len(template_info["fields"])
    return template_info
|
||||
|
||||
def extract_template_fields_from_docx(
    self,
    file_path: str
) -> List[Dict[str, Any]]:
    """
    Extract template field definitions from a Word document.

    Meant for competition scoring tables in which the first column holds
    the field name and the second column holds the hint / fill-in example.
    The table parsing itself is delegated to parse_tables_for_template.

    Args:
        file_path: Path to the Word (.docx) file.

    Returns:
        A list of field definitions, one per parsed field, each with the
        keys "cell", "name", "hint", "table_index", "row_index",
        "field_type" and "required".
    """
    parsed = self.parse_tables_for_template(file_path)

    def build_definition(entry: Dict[str, Any]) -> Dict[str, Any]:
        # Convert one parsed field entry into its field definition.
        table_no = entry["table_index"]
        row_no = entry["row_index"]
        return {
            "cell": f"T{table_no}R{row_no}",  # T<table>R<row> locator
            "name": entry["field_name"],
            "hint": entry["hint"],
            "table_index": table_no,
            "row_index": row_no,
            "field_type": self._infer_field_type_from_hint(entry["hint"]),
            "required": True
        }

    return [build_definition(entry) for entry in parsed["fields"]]
|
||||
|
||||
def _infer_field_type_from_hint(self, hint: str) -> str:
|
||||
"""
|
||||
从提示词推断字段类型
|
||||
|
||||
Args:
|
||||
hint: 字段提示词
|
||||
|
||||
Returns:
|
||||
字段类型 (text/number/date)
|
||||
"""
|
||||
hint_lower = hint.lower()
|
||||
|
||||
# 日期关键词
|
||||
date_keywords = ["年", "月", "日", "日期", "时间", "出生"]
|
||||
if any(kw in hint for kw in date_keywords):
|
||||
return "date"
|
||||
|
||||
# 数字关键词
|
||||
number_keywords = ["数量", "金额", "人数", "面积", "增长", "比率", "%", "率"]
|
||||
if any(kw in hint_lower for kw in number_keywords):
|
||||
return "number"
|
||||
|
||||
return "text"
|
||||
|
||||
Reference in New Issue
Block a user