Compare commits
6 Commits
7f67fa89de
...
610d475ce0
| Author | SHA1 | Date | |
|---|---|---|---|
| 610d475ce0 | |||
| 496b96508d | |||
| 07ebdc09bc | |||
| c1886fb68f | |||
| 78417c898a | |||
| 718f864926 |
@@ -317,24 +317,70 @@ class XlsxParser(BaseParser):
|
|||||||
import zipfile
|
import zipfile
|
||||||
from xml.etree import ElementTree as ET
|
from xml.etree import ElementTree as ET
|
||||||
|
|
||||||
|
# 常见的命名空间
|
||||||
|
COMMON_NAMESPACES = [
|
||||||
|
'http://schemas.openxmlformats.org/spreadsheetml/2006/main',
|
||||||
|
'http://schemas.openxmlformats.org/spreadsheetml/2005/main',
|
||||||
|
'http://schemas.openxmlformats.org/spreadsheetml/2004/main',
|
||||||
|
'http://schemas.openxmlformats.org/spreadsheetml/2003/main',
|
||||||
|
]
|
||||||
|
|
||||||
try:
|
try:
|
||||||
with zipfile.ZipFile(file_path, 'r') as z:
|
with zipfile.ZipFile(file_path, 'r') as z:
|
||||||
if 'xl/workbook.xml' not in z.namelist():
|
# 尝试多种可能的 workbook.xml 路径
|
||||||
|
possible_paths = ['xl/workbook.xml', 'xl\\workbook.xml', 'workbook.xml']
|
||||||
|
content = None
|
||||||
|
for path in possible_paths:
|
||||||
|
if path in z.namelist():
|
||||||
|
content = z.read(path)
|
||||||
|
logger.info(f"找到 workbook.xml at: {path}")
|
||||||
|
break
|
||||||
|
|
||||||
|
if content is None:
|
||||||
|
logger.warning(f"未找到 workbook.xml,文件列表: {z.namelist()[:10]}")
|
||||||
return []
|
return []
|
||||||
content = z.read('xl/workbook.xml')
|
|
||||||
root = ET.fromstring(content)
|
root = ET.fromstring(content)
|
||||||
|
|
||||||
# 命名空间
|
|
||||||
ns = {'main': 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'}
|
|
||||||
|
|
||||||
sheet_names = []
|
sheet_names = []
|
||||||
for sheet in root.findall('.//main:sheet', ns):
|
|
||||||
name = sheet.get('name')
|
# 方法1:尝试带命名空间的查找
|
||||||
if name:
|
for ns in COMMON_NAMESPACES:
|
||||||
sheet_names.append(name)
|
sheet_elements = root.findall(f'.//{{{ns}}}sheet')
|
||||||
|
if sheet_elements:
|
||||||
|
for sheet in sheet_elements:
|
||||||
|
name = sheet.get('name')
|
||||||
|
if name:
|
||||||
|
sheet_names.append(name)
|
||||||
|
if sheet_names:
|
||||||
|
logger.info(f"使用命名空间 {ns} 提取工作表: {sheet_names}")
|
||||||
|
return sheet_names
|
||||||
|
|
||||||
|
# 方法2:不使用命名空间,直接查找所有 sheet 元素
|
||||||
|
if not sheet_names:
|
||||||
|
for elem in root.iter():
|
||||||
|
if elem.tag.endswith('sheet') and elem.tag != 'sheets':
|
||||||
|
name = elem.get('name')
|
||||||
|
if name:
|
||||||
|
sheet_names.append(name)
|
||||||
|
for child in elem:
|
||||||
|
if child.tag.endswith('sheet') or child.tag == 'sheet':
|
||||||
|
name = child.get('name')
|
||||||
|
if name and name not in sheet_names:
|
||||||
|
sheet_names.append(name)
|
||||||
|
|
||||||
|
# 方法3:直接从 XML 文本中正则匹配 sheet name
|
||||||
|
if not sheet_names:
|
||||||
|
import re
|
||||||
|
xml_str = content.decode('utf-8', errors='ignore')
|
||||||
|
matches = re.findall(r'<sheet\s+[^>]*name=["\']([^"\']+)["\']', xml_str, re.IGNORECASE)
|
||||||
|
if matches:
|
||||||
|
sheet_names = matches
|
||||||
|
logger.info(f"使用正则提取工作表: {sheet_names}")
|
||||||
|
|
||||||
logger.info(f"从 XML 提取工作表: {sheet_names}")
|
logger.info(f"从 XML 提取工作表: {sheet_names}")
|
||||||
return sheet_names
|
return sheet_names
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"从 XML 提取工作表名称失败: {e}")
|
logger.error(f"从 XML 提取工作表名称失败: {e}")
|
||||||
return []
|
return []
|
||||||
@@ -356,6 +402,32 @@ class XlsxParser(BaseParser):
|
|||||||
import zipfile
|
import zipfile
|
||||||
from xml.etree import ElementTree as ET
|
from xml.etree import ElementTree as ET
|
||||||
|
|
||||||
|
# 常见的命名空间
|
||||||
|
COMMON_NAMESPACES = [
|
||||||
|
'http://schemas.openxmlformats.org/spreadsheetml/2006/main',
|
||||||
|
'http://schemas.openxmlformats.org/spreadsheetml/2005/main',
|
||||||
|
'http://schemas.openxmlformats.org/spreadsheetml/2004/main',
|
||||||
|
'http://schemas.openxmlformats.org/spreadsheetml/2003/main',
|
||||||
|
]
|
||||||
|
|
||||||
|
def find_elements_with_ns(root, tag_name, namespaces=None):
    """Find descendant elements by local tag name, tolerating namespace variations.

    Lookup is two-tiered:
      1. Search for ``tag_name`` qualified with each candidate namespace URI.
      2. Only if tier 1 found nothing, walk the tree and accept any element
         whose local name equals ``tag_name`` -- in any namespace, or with
         no namespace at all.

    Args:
        root: The ElementTree element to search under.
        tag_name: Local (un-prefixed) tag name, e.g. ``'row'`` or ``'si'``.
        namespaces: Optional iterable of namespace URIs to try in tier 1.
            Defaults to the enclosing scope's ``COMMON_NAMESPACES``.

    Returns:
        A list of matching elements (empty when nothing matches).
    """
    if namespaces is None:
        namespaces = COMMON_NAMESPACES

    results = []

    # Tier 1: fully-qualified lookup against each candidate namespace.
    for ns in namespaces:
        try:
            results.extend(root.findall(f'.//{{{ns}}}{tag_name}'))
        except Exception as exc:
            # Best-effort: a failing path expression must not abort parsing,
            # but unlike a bare ``except`` this lets KeyboardInterrupt and
            # SystemExit propagate, and leaves a trace of the failure.
            logger.debug(f"命名空间 {ns} 查找 {tag_name} 失败: {exc}")

    # Tier 2: namespace-agnostic fallback.
    if not results:
        qualified_suffix = '}' + tag_name
        results = [
            elem for elem in root.iter()
            # Comment / processing-instruction nodes carry a non-string tag.
            if isinstance(elem.tag, str)
            and (elem.tag == tag_name or elem.tag.endswith(qualified_suffix))
        ]

    return results
|
||||||
|
|
||||||
with zipfile.ZipFile(file_path, 'r') as z:
|
with zipfile.ZipFile(file_path, 'r') as z:
|
||||||
# 获取工作表名称
|
# 获取工作表名称
|
||||||
sheet_names = self._extract_sheet_names_from_xml(file_path)
|
sheet_names = self._extract_sheet_names_from_xml(file_path)
|
||||||
@@ -366,57 +438,68 @@ class XlsxParser(BaseParser):
|
|||||||
target_sheet = sheet_name if sheet_name and sheet_name in sheet_names else sheet_names[0]
|
target_sheet = sheet_name if sheet_name and sheet_name in sheet_names else sheet_names[0]
|
||||||
sheet_index = sheet_names.index(target_sheet) + 1 # sheet1.xml, sheet2.xml, ...
|
sheet_index = sheet_names.index(target_sheet) + 1 # sheet1.xml, sheet2.xml, ...
|
||||||
|
|
||||||
# 读取 shared strings
|
# 读取 shared strings - 尝试多种路径
|
||||||
shared_strings = []
|
shared_strings = []
|
||||||
if 'xl/sharedStrings.xml' in z.namelist():
|
ss_paths = ['xl/sharedStrings.xml', 'xl\\sharedStrings.xml', 'sharedStrings.xml']
|
||||||
ss_content = z.read('xl/sharedStrings.xml')
|
for ss_path in ss_paths:
|
||||||
ss_root = ET.fromstring(ss_content)
|
if ss_path in z.namelist():
|
||||||
ns = {'main': 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'}
|
try:
|
||||||
for si in ss_root.findall('.//main:si', ns):
|
ss_content = z.read(ss_path)
|
||||||
t = si.find('.//main:t', ns)
|
ss_root = ET.fromstring(ss_content)
|
||||||
if t is not None:
|
for si in find_elements_with_ns(ss_root, 'si'):
|
||||||
shared_strings.append(t.text or '')
|
t_elements = [c for c in si if c.tag.endswith('}t') or c.tag == 't']
|
||||||
else:
|
if t_elements:
|
||||||
shared_strings.append('')
|
shared_strings.append(t_elements[0].text or '')
|
||||||
|
else:
|
||||||
|
shared_strings.append('')
|
||||||
|
break
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"读取 sharedStrings 失败: {e}")
|
||||||
|
|
||||||
# 读取工作表
|
# 读取工作表 - 尝试多种可能的路径
|
||||||
sheet_file = f'xl/worksheets/sheet{sheet_index}.xml'
|
sheet_content = None
|
||||||
if sheet_file not in z.namelist():
|
sheet_paths = [
|
||||||
raise ValueError(f"工作表文件 {sheet_file} 不存在")
|
f'xl/worksheets/sheet{sheet_index}.xml',
|
||||||
|
f'xl\\worksheets\\sheet{sheet_index}.xml',
|
||||||
|
f'worksheets/sheet{sheet_index}.xml',
|
||||||
|
]
|
||||||
|
for sp in sheet_paths:
|
||||||
|
if sp in z.namelist():
|
||||||
|
sheet_content = z.read(sp)
|
||||||
|
break
|
||||||
|
|
||||||
|
if sheet_content is None:
|
||||||
|
raise ValueError(f"工作表文件 sheet{sheet_index}.xml 不存在")
|
||||||
|
|
||||||
sheet_content = z.read(sheet_file)
|
|
||||||
root = ET.fromstring(sheet_content)
|
root = ET.fromstring(sheet_content)
|
||||||
ns = {'main': 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'}
|
|
||||||
|
|
||||||
# 收集所有行数据
|
# 收集所有行数据
|
||||||
all_rows = []
|
all_rows = []
|
||||||
headers = {}
|
headers = {}
|
||||||
|
|
||||||
for row in root.findall('.//main:row', ns):
|
for row in find_elements_with_ns(root, 'row'):
|
||||||
row_idx = int(row.get('r', 0))
|
row_idx = int(row.get('r', 0))
|
||||||
row_cells = {}
|
row_cells = {}
|
||||||
for cell in row.findall('main:c', ns):
|
for cell in find_elements_with_ns(row, 'c'):
|
||||||
cell_ref = cell.get('r', '')
|
cell_ref = cell.get('r', '')
|
||||||
col_letters = ''.join(filter(str.isalpha, cell_ref))
|
col_letters = ''.join(filter(str.isalpha, cell_ref))
|
||||||
cell_type = cell.get('t', 'n')
|
cell_type = cell.get('t', 'n')
|
||||||
v = cell.find('main:v', ns)
|
v_elements = find_elements_with_ns(cell, 'v')
|
||||||
|
v = v_elements[0] if v_elements else None
|
||||||
|
|
||||||
if v is not None and v.text:
|
if v is not None and v.text:
|
||||||
if cell_type == 's':
|
if cell_type == 's':
|
||||||
# shared string
|
|
||||||
try:
|
try:
|
||||||
row_cells[col_letters] = shared_strings[int(v.text)]
|
row_cells[col_letters] = shared_strings[int(v.text)]
|
||||||
except (ValueError, IndexError):
|
except (ValueError, IndexError):
|
||||||
row_cells[col_letters] = v.text
|
row_cells[col_letters] = v.text
|
||||||
elif cell_type == 'b':
|
elif cell_type == 'b':
|
||||||
# boolean
|
|
||||||
row_cells[col_letters] = v.text == '1'
|
row_cells[col_letters] = v.text == '1'
|
||||||
else:
|
else:
|
||||||
row_cells[col_letters] = v.text
|
row_cells[col_letters] = v.text
|
||||||
else:
|
else:
|
||||||
row_cells[col_letters] = None
|
row_cells[col_letters] = None
|
||||||
|
|
||||||
# 处理表头行
|
|
||||||
if row_idx == header_row + 1:
|
if row_idx == header_row + 1:
|
||||||
headers = {**row_cells}
|
headers = {**row_cells}
|
||||||
elif row_idx > header_row + 1:
|
elif row_idx > header_row + 1:
|
||||||
@@ -424,7 +507,6 @@ class XlsxParser(BaseParser):
|
|||||||
|
|
||||||
# 构建 DataFrame
|
# 构建 DataFrame
|
||||||
if headers:
|
if headers:
|
||||||
# 按原始列顺序排列
|
|
||||||
col_order = list(headers.keys())
|
col_order = list(headers.keys())
|
||||||
df = pd.DataFrame(all_rows)
|
df = pd.DataFrame(all_rows)
|
||||||
if not df.empty:
|
if not df.empty:
|
||||||
|
|||||||
@@ -78,12 +78,19 @@ class TemplateFillService:
|
|||||||
fill_details = []
|
fill_details = []
|
||||||
|
|
||||||
logger.info(f"开始填表: {len(template_fields)} 个字段, {len(source_doc_ids or [])} 个源文档")
|
logger.info(f"开始填表: {len(template_fields)} 个字段, {len(source_doc_ids or [])} 个源文档")
|
||||||
|
logger.info(f"source_doc_ids: {source_doc_ids}")
|
||||||
|
logger.info(f"source_file_paths: {source_file_paths}")
|
||||||
|
|
||||||
# 1. 加载源文档内容
|
# 1. 加载源文档内容
|
||||||
source_docs = await self._load_source_documents(source_doc_ids, source_file_paths)
|
source_docs = await self._load_source_documents(source_doc_ids, source_file_paths)
|
||||||
|
|
||||||
logger.info(f"加载了 {len(source_docs)} 个源文档")
|
logger.info(f"加载了 {len(source_docs)} 个源文档")
|
||||||
|
|
||||||
|
# 打印每个加载的文档的详细信息
|
||||||
|
for i, doc in enumerate(source_docs):
|
||||||
|
logger.info(f" 文档[{i}]: id={doc.doc_id}, filename={doc.filename}, doc_type={doc.doc_type}")
|
||||||
|
logger.info(f" content长度: {len(doc.content)}, structured_data keys: {list(doc.structured_data.keys()) if doc.structured_data else 'None'}")
|
||||||
|
|
||||||
if not source_docs:
|
if not source_docs:
|
||||||
logger.warning("没有找到源文档,填表结果将全部为空")
|
logger.warning("没有找到源文档,填表结果将全部为空")
|
||||||
|
|
||||||
@@ -158,14 +165,49 @@ class TemplateFillService:
|
|||||||
try:
|
try:
|
||||||
doc = await mongodb.get_document(doc_id)
|
doc = await mongodb.get_document(doc_id)
|
||||||
if doc:
|
if doc:
|
||||||
|
sd = doc.get("structured_data", {})
|
||||||
|
sd_keys = list(sd.keys()) if sd else []
|
||||||
|
logger.info(f"从MongoDB加载文档: {doc_id}, doc_type={doc.get('doc_type')}, structured_data keys={sd_keys}")
|
||||||
|
|
||||||
|
# 如果 structured_data 为空,但有 file_path,尝试重新解析文件
|
||||||
|
doc_content = doc.get("content", "")
|
||||||
|
if not sd or (not sd.get("tables") and not sd.get("headers") and not sd.get("rows")):
|
||||||
|
file_path = doc.get("metadata", {}).get("file_path")
|
||||||
|
if file_path:
|
||||||
|
logger.info(f" structured_data 为空,尝试重新解析文件: {file_path}")
|
||||||
|
try:
|
||||||
|
parser = ParserFactory.get_parser(file_path)
|
||||||
|
result = parser.parse(file_path)
|
||||||
|
if result.success and result.data:
|
||||||
|
if result.data.get("structured_data"):
|
||||||
|
sd = result.data.get("structured_data")
|
||||||
|
logger.info(f" 重新解析成功,structured_data keys: {list(sd.keys())}")
|
||||||
|
elif result.data.get("tables"):
|
||||||
|
sd = {"tables": result.data.get("tables", [])}
|
||||||
|
logger.info(f" 使用 data.tables,tables数量: {len(sd.get('tables', []))}")
|
||||||
|
elif result.data.get("rows"):
|
||||||
|
sd = result.data
|
||||||
|
logger.info(f" 使用 data.rows 格式")
|
||||||
|
if result.data.get("content"):
|
||||||
|
doc_content = result.data.get("content", "")
|
||||||
|
else:
|
||||||
|
logger.warning(f" 重新解析失败: {result.error if result else 'unknown'}")
|
||||||
|
except Exception as parse_err:
|
||||||
|
logger.error(f" 重新解析文件异常: {str(parse_err)}")
|
||||||
|
|
||||||
|
if sd.get("tables"):
|
||||||
|
logger.info(f" tables数量: {len(sd.get('tables', []))}")
|
||||||
|
if sd["tables"]:
|
||||||
|
first_table = sd["tables"][0]
|
||||||
|
logger.info(f" 第一表格: headers={first_table.get('headers', [])[:3]}..., rows数量={len(first_table.get('rows', []))}")
|
||||||
|
|
||||||
source_docs.append(SourceDocument(
|
source_docs.append(SourceDocument(
|
||||||
doc_id=doc_id,
|
doc_id=doc_id,
|
||||||
filename=doc.get("metadata", {}).get("original_filename", "unknown"),
|
filename=doc.get("metadata", {}).get("original_filename", "unknown"),
|
||||||
doc_type=doc.get("doc_type", "unknown"),
|
doc_type=doc.get("doc_type", "unknown"),
|
||||||
content=doc.get("content", ""),
|
content=doc_content,
|
||||||
structured_data=doc.get("structured_data", {})
|
structured_data=sd
|
||||||
))
|
))
|
||||||
logger.info(f"从MongoDB加载文档: {doc_id}")
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"从MongoDB加载文档失败 {doc_id}: {str(e)}")
|
logger.error(f"从MongoDB加载文档失败 {doc_id}: {str(e)}")
|
||||||
|
|
||||||
@@ -179,10 +221,48 @@ class TemplateFillService:
|
|||||||
# result.data 的结构取决于解析器类型:
|
# result.data 的结构取决于解析器类型:
|
||||||
# - Excel 单 sheet: {columns: [...], rows: [...], row_count, column_count}
|
# - Excel 单 sheet: {columns: [...], rows: [...], row_count, column_count}
|
||||||
# - Excel 多 sheet: {sheets: {sheet_name: {columns, rows, ...}}}
|
# - Excel 多 sheet: {sheets: {sheet_name: {columns, rows, ...}}}
|
||||||
|
# - Markdown: {content: "...", tables: [...], structured_data: {tables: [...]}}
|
||||||
# - Word/TXT: {content: "...", structured_data: {...}}
|
# - Word/TXT: {content: "...", structured_data: {...}}
|
||||||
doc_data = result.data if result.data else {}
|
doc_data = result.data if result.data else {}
|
||||||
doc_content = doc_data.get("content", "") if isinstance(doc_data, dict) else ""
|
doc_content = doc_data.get("content", "") if isinstance(doc_data, dict) else ""
|
||||||
doc_structured = doc_data if isinstance(doc_data, dict) and "rows" in doc_data or isinstance(doc_data, dict) and "sheets" in doc_data else {}
|
|
||||||
|
# 检查并提取 structured_data
|
||||||
|
doc_structured = {}
|
||||||
|
if isinstance(doc_data, dict):
|
||||||
|
logger.info(f"文档 {file_path} doc_data keys: {list(doc_data.keys())}")
|
||||||
|
|
||||||
|
# Excel 多 sheet
|
||||||
|
if "sheets" in doc_data:
|
||||||
|
doc_structured = doc_data
|
||||||
|
logger.info(f" -> 使用 Excel 多 sheet 格式")
|
||||||
|
# Excel 单 sheet 或有 rows 的格式
|
||||||
|
elif "rows" in doc_data:
|
||||||
|
doc_structured = doc_data
|
||||||
|
logger.info(f" -> 使用 rows 格式,列数: {len(doc_data.get('columns', []))}")
|
||||||
|
# Markdown 格式:tables 可能直接在 doc_data.tables 或在 structured_data.tables 中
|
||||||
|
elif "tables" in doc_data and doc_data["tables"]:
|
||||||
|
# Markdown: tables 直接在 doc_data 中
|
||||||
|
tables = doc_data["tables"]
|
||||||
|
first_table = tables[0]
|
||||||
|
doc_structured = {
|
||||||
|
"headers": first_table.get("headers", []),
|
||||||
|
"rows": first_table.get("rows", [])
|
||||||
|
}
|
||||||
|
logger.info(f" -> 使用 doc_data.tables 格式,表头: {doc_structured.get('headers', [])[:5]}")
|
||||||
|
elif "structured_data" in doc_data and isinstance(doc_data["structured_data"], dict):
|
||||||
|
# Markdown: tables 在 structured_data 中
|
||||||
|
tables = doc_data["structured_data"].get("tables", [])
|
||||||
|
if tables:
|
||||||
|
first_table = tables[0]
|
||||||
|
doc_structured = {
|
||||||
|
"headers": first_table.get("headers", []),
|
||||||
|
"rows": first_table.get("rows", [])
|
||||||
|
}
|
||||||
|
logger.info(f" -> 使用 structured_data.tables 格式,表头: {doc_structured.get('headers', [])[:5]}")
|
||||||
|
else:
|
||||||
|
logger.warning(f" -> structured_data.tables 为空")
|
||||||
|
else:
|
||||||
|
logger.warning(f" -> 未识别的文档格式,无 structured_data")
|
||||||
|
|
||||||
source_docs.append(SourceDocument(
|
source_docs.append(SourceDocument(
|
||||||
doc_id=file_path,
|
doc_id=file_path,
|
||||||
@@ -279,7 +359,7 @@ class TemplateFillService:
|
|||||||
response = await self.llm.chat(
|
response = await self.llm.chat(
|
||||||
messages=messages,
|
messages=messages,
|
||||||
temperature=0.1,
|
temperature=0.1,
|
||||||
max_tokens=50000
|
max_tokens=4000
|
||||||
)
|
)
|
||||||
|
|
||||||
content = self.llm.extract_message_content(response)
|
content = self.llm.extract_message_content(response)
|
||||||
@@ -738,7 +818,7 @@ class TemplateFillService:
|
|||||||
|
|
||||||
def _extract_values_from_structured_data(self, source_docs: List[SourceDocument], field_name: str) -> List[str]:
|
def _extract_values_from_structured_data(self, source_docs: List[SourceDocument], field_name: str) -> List[str]:
|
||||||
"""
|
"""
|
||||||
从结构化数据(Excel rows)中直接提取指定列的值
|
从结构化数据(Excel rows 或 Markdown tables)中直接提取指定列的值
|
||||||
|
|
||||||
适用于有 rows 结构的文档数据,无需 LLM 即可提取
|
适用于有 rows 结构的文档数据,无需 LLM 即可提取
|
||||||
|
|
||||||
@@ -750,10 +830,15 @@ class TemplateFillService:
|
|||||||
值列表,如果无法提取则返回空列表
|
值列表,如果无法提取则返回空列表
|
||||||
"""
|
"""
|
||||||
all_values = []
|
all_values = []
|
||||||
|
logger.info(f"[_extract_values_from_structured_data] 开始提取字段: {field_name}")
|
||||||
|
logger.info(f" source_docs 数量: {len(source_docs)}")
|
||||||
|
|
||||||
for doc in source_docs:
|
for doc_idx, doc in enumerate(source_docs):
|
||||||
# 尝试从 structured_data 中提取
|
# 尝试从 structured_data 中提取
|
||||||
structured = doc.structured_data
|
structured = doc.structured_data
|
||||||
|
logger.info(f" 文档[{doc_idx}]: {doc.filename}, structured类型: {type(structured)}, 是否为空: {not bool(structured)}")
|
||||||
|
if structured:
|
||||||
|
logger.info(f" structured_data keys: {list(structured.keys())}")
|
||||||
|
|
||||||
if not structured:
|
if not structured:
|
||||||
continue
|
continue
|
||||||
@@ -773,6 +858,33 @@ class TemplateFillService:
|
|||||||
if all_values:
|
if all_values:
|
||||||
break
|
break
|
||||||
|
|
||||||
|
# 处理 Markdown 表格格式: {headers: [...], rows: [...], ...}
|
||||||
|
elif structured.get("headers") and structured.get("rows"):
|
||||||
|
headers = structured.get("headers", [])
|
||||||
|
rows = structured.get("rows", [])
|
||||||
|
values = self._extract_values_from_markdown_table(headers, rows, field_name)
|
||||||
|
if values:
|
||||||
|
all_values.extend(values)
|
||||||
|
logger.info(f"从 Markdown 文档 {doc.filename} 提取到 {len(values)} 个值")
|
||||||
|
break
|
||||||
|
|
||||||
|
# 处理 MongoDB 存储的 tables 格式: {tables: [{headers, rows, ...}, ...]}
|
||||||
|
elif structured.get("tables") and isinstance(structured.get("tables"), list):
|
||||||
|
tables = structured.get("tables", [])
|
||||||
|
logger.info(f" 检测到 tables 格式,共 {len(tables)} 个表")
|
||||||
|
for table_idx, table in enumerate(tables):
|
||||||
|
if isinstance(table, dict):
|
||||||
|
headers = table.get("headers", [])
|
||||||
|
rows = table.get("rows", [])
|
||||||
|
logger.info(f" 表格[{table_idx}]: headers={headers[:3]}..., rows数量={len(rows)}")
|
||||||
|
values = self._extract_values_from_markdown_table(headers, rows, field_name)
|
||||||
|
if values:
|
||||||
|
all_values.extend(values)
|
||||||
|
logger.info(f"从表格[{table_idx}] 提取到 {len(values)} 个值")
|
||||||
|
break
|
||||||
|
if all_values:
|
||||||
|
break
|
||||||
|
|
||||||
# 处理单 sheet 格式: {columns: [...], rows: [...]}
|
# 处理单 sheet 格式: {columns: [...], rows: [...]}
|
||||||
elif structured.get("rows"):
|
elif structured.get("rows"):
|
||||||
columns = structured.get("columns", [])
|
columns = structured.get("columns", [])
|
||||||
@@ -800,6 +912,100 @@ class TemplateFillService:
|
|||||||
|
|
||||||
return all_values
|
return all_values
|
||||||
|
|
||||||
|
def _extract_values_from_markdown_table(self, headers: List, rows: List, field_name: str) -> List[str]:
    """
    Pull one column's values out of a Markdown-style table.

    Expected table layout:
    - headers: ["col1", "col2", ...]
    - rows: [["val1", "val2", ...], ...]

    Args:
        headers: Header cells of the table.
        rows: Data rows of the table.
        field_name: Name of the field whose column should be extracted.

    Returns:
        One formatted string per row; an empty list when the table is
        empty or no header matches ``field_name``.
    """
    # Guard: nothing to extract from an empty table.
    if not (rows and headers):
        logger.warning(f"Markdown 表格为空: headers={headers}, rows={len(rows) if rows else 0}")
        return []

    # Resolve the column position via the shared header matcher.
    target_idx = self._find_best_matching_column(headers, field_name)
    if target_idx is None:
        logger.warning(f"未找到匹配列: {field_name}, 可用表头: {headers}")
        return []

    logger.info(f"列匹配成功: {field_name} -> {headers[target_idx]} (索引: {target_idx})")

    def _cell(row):
        # Rows that are not lists, or are too short, contribute an empty cell.
        if isinstance(row, list) and target_idx < len(row):
            return row[target_idx]
        return ""

    return [self._format_value(_cell(row)) for row in rows]
|
||||||
|
|
||||||
|
def _find_best_matching_column(self, headers: List, field_name: str) -> Optional[int]:
|
||||||
|
"""
|
||||||
|
查找最佳匹配的列索引
|
||||||
|
|
||||||
|
使用多层匹配策略:
|
||||||
|
1. 精确匹配(忽略大小写)
|
||||||
|
2. 子字符串匹配(字段名在表头中,或表头在字段名中)
|
||||||
|
3. 关键词重叠匹配(中文字符串分割后比对)
|
||||||
|
|
||||||
|
Args:
|
||||||
|
headers: 表头列表
|
||||||
|
field_name: 要匹配的字段名
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
匹配的列索引,找不到返回 None
|
||||||
|
"""
|
||||||
|
field_lower = field_name.lower().strip()
|
||||||
|
field_keywords = set(field_lower.replace(" ", "").split())
|
||||||
|
|
||||||
|
best_match_idx = None
|
||||||
|
best_match_score = 0
|
||||||
|
|
||||||
|
for idx, header in enumerate(headers):
|
||||||
|
header_str = str(header).strip()
|
||||||
|
header_lower = header_str.lower()
|
||||||
|
|
||||||
|
# 策略1: 精确匹配(忽略大小写)
|
||||||
|
if header_lower == field_lower:
|
||||||
|
return idx
|
||||||
|
|
||||||
|
# 策略2: 子字符串匹配
|
||||||
|
if field_lower in header_lower or header_lower in field_lower:
|
||||||
|
# 计算匹配分数(较长匹配更优先)
|
||||||
|
score = max(len(field_lower), len(header_lower)) / min(len(field_lower) + 1, len(header_lower) + 1)
|
||||||
|
if score > best_match_score:
|
||||||
|
best_match_score = score
|
||||||
|
best_match_idx = idx
|
||||||
|
continue
|
||||||
|
|
||||||
|
# 策略3: 关键词重叠匹配(适用于中文)
|
||||||
|
header_keywords = set(header_lower.replace(" ", "").split())
|
||||||
|
overlap = field_keywords & header_keywords
|
||||||
|
if overlap and len(overlap) > 0:
|
||||||
|
score = len(overlap) / max(len(field_keywords), len(header_keywords), 1)
|
||||||
|
if score > best_match_score:
|
||||||
|
best_match_score = score
|
||||||
|
best_match_idx = idx
|
||||||
|
|
||||||
|
# 只有当匹配分数超过阈值时才返回
|
||||||
|
if best_match_score >= 0.3:
|
||||||
|
logger.info(f"模糊匹配: {field_name} -> {headers[best_match_idx]} (分数: {best_match_score:.2f})")
|
||||||
|
return best_match_idx
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
def _extract_column_values(self, rows: List, columns: List, field_name: str) -> List[str]:
|
def _extract_column_values(self, rows: List, columns: List, field_name: str) -> List[str]:
|
||||||
"""
|
"""
|
||||||
从 rows 和 columns 中提取指定列的值
|
从 rows 和 columns 中提取指定列的值
|
||||||
@@ -815,30 +1021,70 @@ class TemplateFillService:
|
|||||||
if not rows or not columns:
|
if not rows or not columns:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
# 查找匹配的列(模糊匹配)
|
# 使用增强的匹配算法查找最佳匹配的列索引
|
||||||
target_col = None
|
target_idx = self._find_best_matching_column(columns, field_name)
|
||||||
for col in columns:
|
|
||||||
col_str = str(col)
|
|
||||||
if field_name.lower() in col_str.lower() or col_str.lower() in field_name.lower():
|
|
||||||
target_col = col
|
|
||||||
break
|
|
||||||
|
|
||||||
if not target_col:
|
if target_idx is None:
|
||||||
logger.warning(f"未找到匹配列: {field_name}, 可用列: {columns}")
|
logger.warning(f"未找到匹配列: {field_name}, 可用列: {columns}")
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
target_col = columns[target_idx]
|
||||||
|
logger.info(f"列匹配成功: {field_name} -> {target_col} (索引: {target_idx})")
|
||||||
|
|
||||||
values = []
|
values = []
|
||||||
for row in rows:
|
for row in rows:
|
||||||
if isinstance(row, dict):
|
if isinstance(row, dict):
|
||||||
val = row.get(target_col, "")
|
val = row.get(target_col, "")
|
||||||
elif isinstance(row, list) and target_col in columns:
|
elif isinstance(row, list) and target_idx < len(row):
|
||||||
val = row[columns.index(target_col)]
|
val = row[target_idx]
|
||||||
else:
|
else:
|
||||||
val = ""
|
val = ""
|
||||||
values.append(str(val) if val is not None else "")
|
values.append(self._format_value(val))
|
||||||
|
|
||||||
return values
|
return values
|
||||||
|
|
||||||
|
def _format_value(self, val: Any) -> str:
|
||||||
|
"""
|
||||||
|
格式化值为字符串,保持原始格式
|
||||||
|
|
||||||
|
- 如果是浮点数但实际上等于整数,返回整数格式(如 3.0 -> "3")
|
||||||
|
- 如果是浮点数且有小数部分,保留小数(如 3.5 -> "3.5")
|
||||||
|
- 如果是整数,直接返回(如 3 -> "3")
|
||||||
|
- 其他类型直接转为字符串
|
||||||
|
|
||||||
|
Args:
|
||||||
|
val: 原始值
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
格式化后的字符串
|
||||||
|
"""
|
||||||
|
if val is None:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
# 如果已经是字符串
|
||||||
|
if isinstance(val, str):
|
||||||
|
return val.strip()
|
||||||
|
|
||||||
|
# 如果是布尔值
|
||||||
|
if isinstance(val, bool):
|
||||||
|
return "true" if val else "false"
|
||||||
|
|
||||||
|
# 如果是数字
|
||||||
|
if isinstance(val, (int, float)):
|
||||||
|
# 检查是否是浮点数但等于整数
|
||||||
|
if isinstance(val, float):
|
||||||
|
# 检查是否是小数部分为0
|
||||||
|
if val == int(val):
|
||||||
|
return str(int(val))
|
||||||
|
else:
|
||||||
|
# 去除尾部多余的0,但保留必要的小数位
|
||||||
|
formatted = f"{val:.10f}".rstrip('0').rstrip('.')
|
||||||
|
return formatted
|
||||||
|
else:
|
||||||
|
return str(val)
|
||||||
|
|
||||||
|
return str(val)
|
||||||
|
|
||||||
def _extract_values_from_json(self, result) -> List[str]:
|
def _extract_values_from_json(self, result) -> List[str]:
|
||||||
"""
|
"""
|
||||||
从解析后的 JSON 对象/数组中提取值数组
|
从解析后的 JSON 对象/数组中提取值数组
|
||||||
@@ -852,12 +1098,12 @@ class TemplateFillService:
|
|||||||
if isinstance(result, dict):
|
if isinstance(result, dict):
|
||||||
# 优先找 values 数组
|
# 优先找 values 数组
|
||||||
if "values" in result and isinstance(result["values"], list):
|
if "values" in result and isinstance(result["values"], list):
|
||||||
vals = [str(v).strip() for v in result["values"] if v and str(v).strip()]
|
vals = [self._format_value(v).strip() for v in result["values"] if self._format_value(v).strip()]
|
||||||
if vals:
|
if vals:
|
||||||
return vals
|
return vals
|
||||||
# 尝试找 value 字段
|
# 尝试找 value 字段
|
||||||
if "value" in result:
|
if "value" in result:
|
||||||
val = str(result["value"]).strip()
|
val = self._format_value(result["value"]).strip()
|
||||||
if val:
|
if val:
|
||||||
return [val]
|
return [val]
|
||||||
# 尝试找任何数组类型的键
|
# 尝试找任何数组类型的键
|
||||||
@@ -865,13 +1111,13 @@ class TemplateFillService:
|
|||||||
val = result[key]
|
val = result[key]
|
||||||
if isinstance(val, list) and len(val) > 0:
|
if isinstance(val, list) and len(val) > 0:
|
||||||
if all(isinstance(v, (str, int, float, bool)) or v is None for v in val):
|
if all(isinstance(v, (str, int, float, bool)) or v is None for v in val):
|
||||||
vals = [str(v).strip() for v in val if v is not None and str(v).strip()]
|
vals = [self._format_value(v).strip() for v in val if v is not None and self._format_value(v).strip()]
|
||||||
if vals:
|
if vals:
|
||||||
return vals
|
return vals
|
||||||
elif isinstance(val, (str, int, float, bool)):
|
elif isinstance(val, (str, int, float, bool)):
|
||||||
return [str(val).strip()]
|
return [self._format_value(val).strip()]
|
||||||
elif isinstance(result, list):
|
elif isinstance(result, list):
|
||||||
vals = [str(v).strip() for v in result if v is not None and str(v).strip()]
|
vals = [self._format_value(v).strip() for v in result if v is not None and self._format_value(v).strip()]
|
||||||
if vals:
|
if vals:
|
||||||
return vals
|
return vals
|
||||||
return []
|
return []
|
||||||
@@ -1008,15 +1254,15 @@ class TemplateFillService:
|
|||||||
if isinstance(parsed, dict):
|
if isinstance(parsed, dict):
|
||||||
# 如果是 {"values": [...]} 格式,提取 values
|
# 如果是 {"values": [...]} 格式,提取 values
|
||||||
if "values" in parsed and isinstance(parsed["values"], list):
|
if "values" in parsed and isinstance(parsed["values"], list):
|
||||||
return [str(v).strip() for v in parsed["values"] if v and str(v).strip()]
|
return [self._format_value(v).strip() for v in parsed["values"] if self._format_value(v).strip()]
|
||||||
# 如果是其他 dict 格式,尝试找 values 键
|
# 如果是其他 dict 格式,尝试找 values 键
|
||||||
for key in ["values", "value", "data", "result"]:
|
for key in ["values", "value", "data", "result"]:
|
||||||
if key in parsed and isinstance(parsed[key], list):
|
if key in parsed and isinstance(parsed[key], list):
|
||||||
return [str(v).strip() for v in parsed[key] if v and str(v).strip()]
|
return [self._format_value(v).strip() for v in parsed[key] if self._format_value(v).strip()]
|
||||||
elif key in parsed:
|
elif key in parsed:
|
||||||
return [str(parsed[key]).strip()]
|
return [self._format_value(parsed[key]).strip()]
|
||||||
elif isinstance(parsed, list):
|
elif isinstance(parsed, list):
|
||||||
return [str(v).strip() for v in parsed if v and str(v).strip()]
|
return [self._format_value(v).strip() for v in parsed if self._format_value(v).strip()]
|
||||||
except (json.JSONDecodeError, TypeError):
|
except (json.JSONDecodeError, TypeError):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@@ -1032,14 +1278,14 @@ class TemplateFillService:
|
|||||||
result = []
|
result = []
|
||||||
for item in arr:
|
for item in arr:
|
||||||
if isinstance(item, dict) and "values" in item and isinstance(item["values"], list):
|
if isinstance(item, dict) and "values" in item and isinstance(item["values"], list):
|
||||||
result.extend([str(v).strip() for v in item["values"] if v and str(v).strip()])
|
result.extend([self._format_value(v).strip() for v in item["values"] if self._format_value(v).strip()])
|
||||||
elif isinstance(item, dict):
|
elif isinstance(item, dict):
|
||||||
result.append(str(item))
|
result.append(str(item))
|
||||||
else:
|
else:
|
||||||
result.append(str(item))
|
result.append(self._format_value(item))
|
||||||
if result:
|
if result:
|
||||||
return result
|
return result
|
||||||
return [str(v).strip() for v in arr if v and str(v).strip()]
|
return [self._format_value(v).strip() for v in arr if self._format_value(v).strip()]
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@@ -1130,27 +1376,37 @@ class TemplateFillService:
|
|||||||
hint_text = f"{user_hint}。{hint_text}"
|
hint_text = f"{user_hint}。{hint_text}"
|
||||||
|
|
||||||
# 构建针对字段提取的提示词
|
# 构建针对字段提取的提示词
|
||||||
prompt = f"""你是一个专业的数据提取专家。请从以下文档内容中提取与"{field.name}"相关的所有数据。
|
prompt = f"""你是一个专业的数据提取专家。请从以下文档内容中提取与"{field.name}"完全匹配的数据。
|
||||||
|
|
||||||
字段提示: {hint_text}
|
【重要】字段名: "{field.name}"
|
||||||
|
【重要】字段提示: {hint_text}
|
||||||
|
|
||||||
|
请严格按照以下步骤操作:
|
||||||
|
1. 在文档中搜索与"{field.name}"完全相同或高度相关的关键词
|
||||||
|
2. 找到后,提取该关键词后的数值(注意:只要数值,不要单位)
|
||||||
|
3. 如果是表格中的数据,直接提取该单元格的数值
|
||||||
|
4. 如果是段落描述,在关键词附近找数值
|
||||||
|
|
||||||
|
【重要】返回值规则:
|
||||||
|
- 只返回纯数值,不要单位(如 "4.9" 而不是 "4.9万亿元")
|
||||||
|
- 如果原文是"4.9万亿元",返回 "4.9"
|
||||||
|
- 如果原文是"144000万册",返回 "144000"
|
||||||
|
- 如果是百分比如"增长7.7%",返回 "7.7"
|
||||||
|
- 如果没有找到完全匹配的数据,返回空数组
|
||||||
|
|
||||||
文档内容:
|
文档内容:
|
||||||
{doc.content[:8000] if doc.content else ""}
|
{doc.content[:10000] if doc.content else ""}
|
||||||
|
|
||||||
请完成以下任务:
|
|
||||||
1. 仔细阅读文档,找出所有与"{field.name}"相关的数据
|
|
||||||
2. 如果文档中有表格数据,提取表格中的对应列值
|
|
||||||
3. 如果文档中是段落描述,提取其中的关键数值或结论
|
|
||||||
4. 返回提取的所有值(可能多个,用数组存储)
|
|
||||||
|
|
||||||
请用严格的 JSON 格式返回:
|
请用严格的 JSON 格式返回:
|
||||||
{{
|
{{
|
||||||
"values": ["值1", "值2", ...],
|
"values": ["值1", "值2", ...], // 只填数值,不要单位
|
||||||
"source": "数据来源说明",
|
"source": "数据来源说明",
|
||||||
"confidence": 0.0到1.0之间的置信度
|
"confidence": 0.0到1.0之间的置信度
|
||||||
}}
|
}}
|
||||||
|
|
||||||
如果没有找到相关数据,返回空数组 values: []"""
|
示例:
|
||||||
|
- 如果字段是"图书馆总藏量(万册)"且文档说"图书总藏量14.4亿册",返回 values: ["144000"]
|
||||||
|
- 如果字段是"国内旅游收入(亿元)"且文档说"国内旅游收入4.9万亿元",返回 values: ["49000"]"""
|
||||||
|
|
||||||
messages = [
|
messages = [
|
||||||
{"role": "system", "content": "你是一个专业的数据提取助手,擅长从政府统计公报等文档中提取数据。请严格按JSON格式输出。"},
|
{"role": "system", "content": "你是一个专业的数据提取助手,擅长从政府统计公报等文档中提取数据。请严格按JSON格式输出。"},
|
||||||
@@ -1160,7 +1416,7 @@ class TemplateFillService:
|
|||||||
response = await self.llm.chat(
|
response = await self.llm.chat(
|
||||||
messages=messages,
|
messages=messages,
|
||||||
temperature=0.1,
|
temperature=0.1,
|
||||||
max_tokens=5000
|
max_tokens=4000
|
||||||
)
|
)
|
||||||
|
|
||||||
content = self.llm.extract_message_content(response)
|
content = self.llm.extract_message_content(response)
|
||||||
|
|||||||
@@ -21,6 +21,7 @@ interface TemplateFillState {
|
|||||||
templateFields: TemplateField[];
|
templateFields: TemplateField[];
|
||||||
sourceFiles: SourceFile[];
|
sourceFiles: SourceFile[];
|
||||||
sourceFilePaths: string[];
|
sourceFilePaths: string[];
|
||||||
|
sourceDocIds: string[];
|
||||||
templateId: string;
|
templateId: string;
|
||||||
filledResult: any;
|
filledResult: any;
|
||||||
setStep: (step: Step) => void;
|
setStep: (step: Step) => void;
|
||||||
@@ -30,6 +31,9 @@ interface TemplateFillState {
|
|||||||
addSourceFiles: (files: SourceFile[]) => void;
|
addSourceFiles: (files: SourceFile[]) => void;
|
||||||
removeSourceFile: (index: number) => void;
|
removeSourceFile: (index: number) => void;
|
||||||
setSourceFilePaths: (paths: string[]) => void;
|
setSourceFilePaths: (paths: string[]) => void;
|
||||||
|
setSourceDocIds: (ids: string[]) => void;
|
||||||
|
addSourceDocId: (id: string) => void;
|
||||||
|
removeSourceDocId: (id: string) => void;
|
||||||
setTemplateId: (id: string) => void;
|
setTemplateId: (id: string) => void;
|
||||||
setFilledResult: (result: any) => void;
|
setFilledResult: (result: any) => void;
|
||||||
reset: () => void;
|
reset: () => void;
|
||||||
@@ -41,6 +45,7 @@ const initialState = {
|
|||||||
templateFields: [],
|
templateFields: [],
|
||||||
sourceFiles: [],
|
sourceFiles: [],
|
||||||
sourceFilePaths: [],
|
sourceFilePaths: [],
|
||||||
|
sourceDocIds: [],
|
||||||
templateId: '',
|
templateId: '',
|
||||||
filledResult: null,
|
filledResult: null,
|
||||||
setStep: () => {},
|
setStep: () => {},
|
||||||
@@ -50,6 +55,9 @@ const initialState = {
|
|||||||
addSourceFiles: () => {},
|
addSourceFiles: () => {},
|
||||||
removeSourceFile: () => {},
|
removeSourceFile: () => {},
|
||||||
setSourceFilePaths: () => {},
|
setSourceFilePaths: () => {},
|
||||||
|
setSourceDocIds: () => {},
|
||||||
|
addSourceDocId: () => {},
|
||||||
|
removeSourceDocId: () => {},
|
||||||
setTemplateId: () => {},
|
setTemplateId: () => {},
|
||||||
setFilledResult: () => {},
|
setFilledResult: () => {},
|
||||||
reset: () => {},
|
reset: () => {},
|
||||||
@@ -63,6 +71,7 @@ export const TemplateFillProvider: React.FC<{ children: ReactNode }> = ({ childr
|
|||||||
const [templateFields, setTemplateFields] = useState<TemplateField[]>([]);
|
const [templateFields, setTemplateFields] = useState<TemplateField[]>([]);
|
||||||
const [sourceFiles, setSourceFiles] = useState<SourceFile[]>([]);
|
const [sourceFiles, setSourceFiles] = useState<SourceFile[]>([]);
|
||||||
const [sourceFilePaths, setSourceFilePaths] = useState<string[]>([]);
|
const [sourceFilePaths, setSourceFilePaths] = useState<string[]>([]);
|
||||||
|
const [sourceDocIds, setSourceDocIds] = useState<string[]>([]);
|
||||||
const [templateId, setTemplateId] = useState<string>('');
|
const [templateId, setTemplateId] = useState<string>('');
|
||||||
const [filledResult, setFilledResult] = useState<any>(null);
|
const [filledResult, setFilledResult] = useState<any>(null);
|
||||||
|
|
||||||
@@ -74,12 +83,21 @@ export const TemplateFillProvider: React.FC<{ children: ReactNode }> = ({ childr
|
|||||||
setSourceFiles(prev => prev.filter((_, i) => i !== index));
|
setSourceFiles(prev => prev.filter((_, i) => i !== index));
|
||||||
};
|
};
|
||||||
|
|
||||||
|
const addSourceDocId = (id: string) => {
|
||||||
|
setSourceDocIds(prev => prev.includes(id) ? prev : [...prev, id]);
|
||||||
|
};
|
||||||
|
|
||||||
|
const removeSourceDocId = (id: string) => {
|
||||||
|
setSourceDocIds(prev => prev.filter(docId => docId !== id));
|
||||||
|
};
|
||||||
|
|
||||||
const reset = () => {
|
const reset = () => {
|
||||||
setStep('upload');
|
setStep('upload');
|
||||||
setTemplateFile(null);
|
setTemplateFile(null);
|
||||||
setTemplateFields([]);
|
setTemplateFields([]);
|
||||||
setSourceFiles([]);
|
setSourceFiles([]);
|
||||||
setSourceFilePaths([]);
|
setSourceFilePaths([]);
|
||||||
|
setSourceDocIds([]);
|
||||||
setTemplateId('');
|
setTemplateId('');
|
||||||
setFilledResult(null);
|
setFilledResult(null);
|
||||||
};
|
};
|
||||||
@@ -92,6 +110,7 @@ export const TemplateFillProvider: React.FC<{ children: ReactNode }> = ({ childr
|
|||||||
templateFields,
|
templateFields,
|
||||||
sourceFiles,
|
sourceFiles,
|
||||||
sourceFilePaths,
|
sourceFilePaths,
|
||||||
|
sourceDocIds,
|
||||||
templateId,
|
templateId,
|
||||||
filledResult,
|
filledResult,
|
||||||
setStep,
|
setStep,
|
||||||
@@ -101,6 +120,9 @@ export const TemplateFillProvider: React.FC<{ children: ReactNode }> = ({ childr
|
|||||||
addSourceFiles,
|
addSourceFiles,
|
||||||
removeSourceFile,
|
removeSourceFile,
|
||||||
setSourceFilePaths,
|
setSourceFilePaths,
|
||||||
|
setSourceDocIds,
|
||||||
|
addSourceDocId,
|
||||||
|
removeSourceDocId,
|
||||||
setTemplateId,
|
setTemplateId,
|
||||||
setFilledResult,
|
setFilledResult,
|
||||||
reset,
|
reset,
|
||||||
|
|||||||
@@ -60,6 +60,7 @@ const TemplateFill: React.FC = () => {
|
|||||||
templateFields, setTemplateFields,
|
templateFields, setTemplateFields,
|
||||||
sourceFiles, setSourceFiles, addSourceFiles, removeSourceFile,
|
sourceFiles, setSourceFiles, addSourceFiles, removeSourceFile,
|
||||||
sourceFilePaths, setSourceFilePaths,
|
sourceFilePaths, setSourceFilePaths,
|
||||||
|
sourceDocIds, setSourceDocIds, addSourceDocId, removeSourceDocId,
|
||||||
templateId, setTemplateId,
|
templateId, setTemplateId,
|
||||||
filledResult, setFilledResult,
|
filledResult, setFilledResult,
|
||||||
reset
|
reset
|
||||||
@@ -68,6 +69,9 @@ const TemplateFill: React.FC = () => {
|
|||||||
const [loading, setLoading] = useState(false);
|
const [loading, setLoading] = useState(false);
|
||||||
const [previewDoc, setPreviewDoc] = useState<{ name: string; content: string } | null>(null);
|
const [previewDoc, setPreviewDoc] = useState<{ name: string; content: string } | null>(null);
|
||||||
const [previewOpen, setPreviewOpen] = useState(false);
|
const [previewOpen, setPreviewOpen] = useState(false);
|
||||||
|
const [sourceMode, setSourceMode] = useState<'upload' | 'select'>('upload');
|
||||||
|
const [uploadedDocuments, setUploadedDocuments] = useState<DocumentItem[]>([]);
|
||||||
|
const [docsLoading, setDocsLoading] = useState(false);
|
||||||
|
|
||||||
// 模板拖拽
|
// 模板拖拽
|
||||||
const onTemplateDrop = useCallback((acceptedFiles: File[]) => {
|
const onTemplateDrop = useCallback((acceptedFiles: File[]) => {
|
||||||
@@ -109,40 +113,118 @@ const TemplateFill: React.FC = () => {
|
|||||||
multiple: true
|
multiple: true
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// 加载已上传文档
|
||||||
|
const loadUploadedDocuments = useCallback(async () => {
|
||||||
|
setDocsLoading(true);
|
||||||
|
try {
|
||||||
|
const result = await backendApi.getDocuments(undefined, 100);
|
||||||
|
if (result.success) {
|
||||||
|
// 过滤可作为数据源的文档类型
|
||||||
|
const docs = (result.documents || []).filter((d: DocumentItem) =>
|
||||||
|
['docx', 'md', 'txt', 'xlsx', 'xls'].includes(d.doc_type)
|
||||||
|
);
|
||||||
|
setUploadedDocuments(docs);
|
||||||
|
}
|
||||||
|
} catch (err: any) {
|
||||||
|
console.error('加载文档失败:', err);
|
||||||
|
} finally {
|
||||||
|
setDocsLoading(false);
|
||||||
|
}
|
||||||
|
}, []);
|
||||||
|
|
||||||
|
// 删除文档
|
||||||
|
const handleDeleteDocument = async (docId: string, e: React.MouseEvent) => {
|
||||||
|
e.stopPropagation();
|
||||||
|
if (!confirm('确定要删除该文档吗?')) return;
|
||||||
|
try {
|
||||||
|
const result = await backendApi.deleteDocument(docId);
|
||||||
|
if (result.success) {
|
||||||
|
setUploadedDocuments(prev => prev.filter(d => d.doc_id !== docId));
|
||||||
|
removeSourceDocId(docId);
|
||||||
|
toast.success('文档已删除');
|
||||||
|
} else {
|
||||||
|
toast.error(result.message || '删除失败');
|
||||||
|
}
|
||||||
|
} catch (err: any) {
|
||||||
|
toast.error('删除失败: ' + (err.message || '未知错误'));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
useEffect(() => {
|
||||||
|
if (sourceMode === 'select') {
|
||||||
|
loadUploadedDocuments();
|
||||||
|
}
|
||||||
|
}, [sourceMode, loadUploadedDocuments]);
|
||||||
|
|
||||||
const handleJointUploadAndFill = async () => {
|
const handleJointUploadAndFill = async () => {
|
||||||
if (!templateFile) {
|
if (!templateFile) {
|
||||||
toast.error('请先上传模板文件');
|
toast.error('请先上传模板文件');
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 检查是否选择了数据源
|
||||||
|
if (sourceMode === 'upload' && sourceFiles.length === 0) {
|
||||||
|
toast.error('请上传源文档或从已上传文档中选择');
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (sourceMode === 'select' && sourceDocIds.length === 0) {
|
||||||
|
toast.error('请选择源文档');
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
setLoading(true);
|
setLoading(true);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
// 使用联合上传API
|
if (sourceMode === 'select') {
|
||||||
const result = await backendApi.uploadTemplateAndSources(
|
// 使用已上传文档作为数据源
|
||||||
templateFile,
|
const result = await backendApi.uploadTemplate(templateFile);
|
||||||
sourceFiles.map(sf => sf.file)
|
|
||||||
);
|
|
||||||
|
|
||||||
if (result.success) {
|
if (result.success) {
|
||||||
setTemplateFields(result.fields || []);
|
setTemplateFields(result.fields || []);
|
||||||
setTemplateId(result.template_id);
|
setTemplateId(result.template_id || 'temp');
|
||||||
setSourceFilePaths(result.source_file_paths || []);
|
toast.success('开始智能填表');
|
||||||
toast.success('文档上传成功,开始智能填表');
|
setStep('filling');
|
||||||
setStep('filling');
|
|
||||||
|
|
||||||
// 自动开始填表
|
// 使用 source_doc_ids 进行填表
|
||||||
const fillResult = await backendApi.fillTemplate(
|
const fillResult = await backendApi.fillTemplate(
|
||||||
result.template_id,
|
result.template_id || 'temp',
|
||||||
result.fields || [],
|
result.fields || [],
|
||||||
[], // 使用 source_file_paths 而非 source_doc_ids
|
sourceDocIds,
|
||||||
result.source_file_paths || [],
|
[],
|
||||||
'请从以下文档中提取相关信息填写表格'
|
'请从以下文档中提取相关信息填写表格'
|
||||||
|
);
|
||||||
|
|
||||||
|
setFilledResult(fillResult);
|
||||||
|
setStep('preview');
|
||||||
|
toast.success('表格填写完成');
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// 使用联合上传API
|
||||||
|
const result = await backendApi.uploadTemplateAndSources(
|
||||||
|
templateFile,
|
||||||
|
sourceFiles.map(sf => sf.file)
|
||||||
);
|
);
|
||||||
|
|
||||||
setFilledResult(fillResult);
|
if (result.success) {
|
||||||
setStep('preview');
|
setTemplateFields(result.fields || []);
|
||||||
toast.success('表格填写完成');
|
setTemplateId(result.template_id);
|
||||||
|
setSourceFilePaths(result.source_file_paths || []);
|
||||||
|
toast.success('文档上传成功,开始智能填表');
|
||||||
|
setStep('filling');
|
||||||
|
|
||||||
|
// 自动开始填表
|
||||||
|
const fillResult = await backendApi.fillTemplate(
|
||||||
|
result.template_id,
|
||||||
|
result.fields || [],
|
||||||
|
[],
|
||||||
|
result.source_file_paths || [],
|
||||||
|
'请从以下文档中提取相关信息填写表格'
|
||||||
|
);
|
||||||
|
|
||||||
|
setFilledResult(fillResult);
|
||||||
|
setStep('preview');
|
||||||
|
toast.success('表格填写完成');
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} catch (err: any) {
|
} catch (err: any) {
|
||||||
toast.error('处理失败: ' + (err.message || '未知错误'));
|
toast.error('处理失败: ' + (err.message || '未知错误'));
|
||||||
@@ -264,47 +346,131 @@ const TemplateFill: React.FC = () => {
|
|||||||
源文档
|
源文档
|
||||||
</CardTitle>
|
</CardTitle>
|
||||||
<CardDescription>
|
<CardDescription>
|
||||||
上传包含数据的源文档(支持多选),可同时上传多个文件
|
选择包含数据的源文档作为填表依据
|
||||||
</CardDescription>
|
</CardDescription>
|
||||||
|
{/* Source Mode Tabs */}
|
||||||
|
<div className="flex gap-2 mt-2">
|
||||||
|
<Button
|
||||||
|
variant={sourceMode === 'upload' ? 'default' : 'outline'}
|
||||||
|
size="sm"
|
||||||
|
onClick={() => setSourceMode('upload')}
|
||||||
|
>
|
||||||
|
<Upload size={14} className="mr-1" />
|
||||||
|
上传文件
|
||||||
|
</Button>
|
||||||
|
<Button
|
||||||
|
variant={sourceMode === 'select' ? 'default' : 'outline'}
|
||||||
|
size="sm"
|
||||||
|
onClick={() => setSourceMode('select')}
|
||||||
|
>
|
||||||
|
<Files size={14} className="mr-1" />
|
||||||
|
从文档中心选择
|
||||||
|
</Button>
|
||||||
|
</div>
|
||||||
</CardHeader>
|
</CardHeader>
|
||||||
<CardContent>
|
<CardContent>
|
||||||
<div
|
{sourceMode === 'upload' ? (
|
||||||
{...getSourceProps()}
|
<>
|
||||||
className={cn(
|
<div
|
||||||
"border-2 border-dashed rounded-2xl p-8 transition-all duration-300 flex flex-col items-center justify-center text-center cursor-pointer group min-h-[200px]",
|
{...getSourceProps()}
|
||||||
isSourceDragActive ? "border-primary bg-primary/5" : "border-muted-foreground/20 hover:border-primary/50 hover:bg-primary/5"
|
className={cn(
|
||||||
)}
|
"border-2 border-dashed rounded-2xl p-8 transition-all duration-300 flex flex-col items-center justify-center text-center cursor-pointer group min-h-[200px]",
|
||||||
>
|
isSourceDragActive ? "border-primary bg-primary/5" : "border-muted-foreground/20 hover:border-primary/50 hover:bg-primary/5"
|
||||||
<input {...getSourceInputProps()} />
|
)}
|
||||||
<div className="w-14 h-14 rounded-xl bg-blue-500/10 text-blue-500 flex items-center justify-center mb-4 group-hover:scale-110 transition-transform">
|
>
|
||||||
{loading ? <Loader2 className="animate-spin" size={28} /> : <Upload size={28} />}
|
<input {...getSourceInputProps()} />
|
||||||
</div>
|
<div className="w-14 h-14 rounded-xl bg-blue-500/10 text-blue-500 flex items-center justify-center mb-4 group-hover:scale-110 transition-transform">
|
||||||
<p className="font-medium">
|
{loading ? <Loader2 className="animate-spin" size={28} /> : <Upload size={28} />}
|
||||||
{isSourceDragActive ? '释放以上传' : '点击或拖拽上传源文档'}
|
|
||||||
</p>
|
|
||||||
<p className="text-xs text-muted-foreground mt-1">
|
|
||||||
支持 .xlsx .xls .docx .md .txt
|
|
||||||
</p>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
{/* Selected Source Files */}
|
|
||||||
{sourceFiles.length > 0 && (
|
|
||||||
<div className="mt-4 space-y-2">
|
|
||||||
{sourceFiles.map((sf, idx) => (
|
|
||||||
<div key={idx} className="flex items-center gap-3 p-3 bg-muted/50 rounded-xl">
|
|
||||||
{getFileIcon(sf.file.name)}
|
|
||||||
<div className="flex-1 min-w-0">
|
|
||||||
<p className="text-sm font-medium truncate">{sf.file.name}</p>
|
|
||||||
<p className="text-xs text-muted-foreground">
|
|
||||||
{(sf.file.size / 1024).toFixed(1)} KB
|
|
||||||
</p>
|
|
||||||
</div>
|
|
||||||
<Button variant="ghost" size="sm" onClick={() => removeSourceFile(idx)}>
|
|
||||||
<Trash2 size={14} className="text-red-500" />
|
|
||||||
</Button>
|
|
||||||
</div>
|
</div>
|
||||||
))}
|
<p className="font-medium">
|
||||||
</div>
|
{isSourceDragActive ? '释放以上传' : '点击或拖拽上传源文档'}
|
||||||
|
</p>
|
||||||
|
<p className="text-xs text-muted-foreground mt-1">
|
||||||
|
支持 .xlsx .xls .docx .md .txt
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{/* Selected Source Files */}
|
||||||
|
{sourceFiles.length > 0 && (
|
||||||
|
<div className="mt-4 space-y-2">
|
||||||
|
{sourceFiles.map((sf, idx) => (
|
||||||
|
<div key={idx} className="flex items-center gap-3 p-3 bg-muted/50 rounded-xl">
|
||||||
|
{getFileIcon(sf.file.name)}
|
||||||
|
<div className="flex-1 min-w-0">
|
||||||
|
<p className="text-sm font-medium truncate">{sf.file.name}</p>
|
||||||
|
<p className="text-xs text-muted-foreground">
|
||||||
|
{(sf.file.size / 1024).toFixed(1)} KB
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
<Button variant="ghost" size="sm" onClick={() => removeSourceFile(idx)}>
|
||||||
|
<Trash2 size={14} className="text-red-500" />
|
||||||
|
</Button>
|
||||||
|
</div>
|
||||||
|
))}
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
</>
|
||||||
|
) : (
|
||||||
|
<>
|
||||||
|
{/* Uploaded Documents Selection */}
|
||||||
|
{docsLoading ? (
|
||||||
|
<div className="space-y-2">
|
||||||
|
{[1, 2, 3].map(i => (
|
||||||
|
<Skeleton key={i} className="h-16 w-full rounded-xl" />
|
||||||
|
))}
|
||||||
|
</div>
|
||||||
|
) : uploadedDocuments.length > 0 ? (
|
||||||
|
<div className="space-y-2 max-h-[300px] overflow-y-auto">
|
||||||
|
{uploadedDocuments.map((doc) => (
|
||||||
|
<div
|
||||||
|
key={doc.doc_id}
|
||||||
|
className={cn(
|
||||||
|
"flex items-center gap-3 p-3 rounded-xl border-2 transition-all cursor-pointer",
|
||||||
|
sourceDocIds.includes(doc.doc_id)
|
||||||
|
? "border-primary bg-primary/5"
|
||||||
|
: "border-border hover:bg-muted/30"
|
||||||
|
)}
|
||||||
|
onClick={() => {
|
||||||
|
if (sourceDocIds.includes(doc.doc_id)) {
|
||||||
|
removeSourceDocId(doc.doc_id);
|
||||||
|
} else {
|
||||||
|
addSourceDocId(doc.doc_id);
|
||||||
|
}
|
||||||
|
}}
|
||||||
|
>
|
||||||
|
<div className={cn(
|
||||||
|
"w-6 h-6 rounded-md border-2 flex items-center justify-center transition-all shrink-0",
|
||||||
|
sourceDocIds.includes(doc.doc_id)
|
||||||
|
? "border-primary bg-primary text-white"
|
||||||
|
: "border-muted-foreground/30"
|
||||||
|
)}>
|
||||||
|
{sourceDocIds.includes(doc.doc_id) && <CheckCircle2 size={14} />}
|
||||||
|
</div>
|
||||||
|
{getFileIcon(doc.original_filename)}
|
||||||
|
<div className="flex-1 min-w-0">
|
||||||
|
<p className="text-sm font-medium truncate">{doc.original_filename}</p>
|
||||||
|
<p className="text-xs text-muted-foreground">
|
||||||
|
{doc.doc_type.toUpperCase()} • {format(new Date(doc.created_at), 'yyyy-MM-dd')}
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
<Button
|
||||||
|
variant="ghost"
|
||||||
|
size="sm"
|
||||||
|
onClick={(e) => handleDeleteDocument(doc.doc_id, e)}
|
||||||
|
className="shrink-0"
|
||||||
|
>
|
||||||
|
<Trash2 size={14} className="text-red-500" />
|
||||||
|
</Button>
|
||||||
|
</div>
|
||||||
|
))}
|
||||||
|
</div>
|
||||||
|
) : (
|
||||||
|
<div className="text-center py-8 text-muted-foreground">
|
||||||
|
<Files size={32} className="mx-auto mb-2 opacity-30" />
|
||||||
|
<p className="text-sm">暂无可用的已上传文档</p>
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
</>
|
||||||
)}
|
)}
|
||||||
</CardContent>
|
</CardContent>
|
||||||
</Card>
|
</Card>
|
||||||
|
|||||||
Reference in New Issue
Block a user