添加XML回退解析机制支持复杂Excel文件

当pandas无法解析某些包含非标准元素的Excel文件时,
添加了XML直接解析功能来提取工作表名称和数据。

- 实现了`_extract_sheet_names_from_xml`方法从XML提取工作表名称
- 实现了`_read_excel_sheet_xml`方法直接解析Excel XML数据
- 添加多种命名空间支持以处理不同Excel格式
- 在pandas解析失败时自动回退到XML解析方式

fix(excel-storage-service): 修复XML解析中的命名空间问题

改进了XML解析逻辑,添加对多种命名空间的支持,
使用通配符查找元素以兼容不同Excel文件格式。

refactor(table-rag-service): 优化XML解析逻辑提高兼容性

统一了XML解析的命名空间处理方式,
改进了元素查找逻辑以更好地支持不同Excel格式。

feat(frontend): 添加RAG向量检索和索引重建功能

- 实现了RAG状态查看、搜索和索引重建接口
- 添加了前端RAG检索界面组件
- 增加了错误处理和加载状态提示
This commit is contained in:
2026-04-08 19:21:40 +08:00
parent 41e5eaaa2d
commit 3b82103e87
6 changed files with 523 additions and 145 deletions

View File

@@ -67,11 +67,14 @@ class XlsxParser(BaseParser):
xls_file = pd.ExcelFile(file_path)
sheet_names = xls_file.sheet_names
# 如果 pandas 返回空列表,尝试从 XML 提取
if not sheet_names:
return ParseResult(
success=False,
error=f"Excel 文件没有找到任何工作表: {file_path}"
)
sheet_names = self._extract_sheet_names_from_xml(file_path)
if not sheet_names:
return ParseResult(
success=False,
error=f"Excel 文件没有找到任何工作表: {file_path}"
)
# 验证请求的工作表索引/名称
target_sheet = None
@@ -88,15 +91,21 @@ class XlsxParser(BaseParser):
target_sheet = sheet_names[0]
# 读取 Excel 文件
df = pd.read_excel(
file_path,
sheet_name=target_sheet,
header=header_row,
**kwargs
)
df = None
try:
df = pd.read_excel(
file_path,
sheet_name=target_sheet,
header=header_row,
**kwargs
)
except Exception as e:
logger.warning(f"pandas 读取 Excel 失败,尝试 XML 方式: {e}")
# pandas 读取失败,尝试 XML 方式
df = self._read_excel_sheet_xml(file_path, sheet_name=target_sheet, header_row=header_row)
# 检查 DataFrame 是否为空
if df.empty:
if df is None or df.empty:
return ParseResult(
success=False,
error=f"工作表 '{target_sheet}' 为空,请检查 Excel 文件内容"
@@ -211,7 +220,26 @@ class XlsxParser(BaseParser):
try:
# 读取所有工作表
all_data = pd.read_excel(file_path, sheet_name=None, **kwargs)
all_data = None
try:
all_data = pd.read_excel(file_path, sheet_name=None, **kwargs)
except Exception as e:
logger.warning(f"pandas 读取所有工作表失败: {e}")
# 如果 pandas 失败,尝试 XML 方式
if all_data is None or len(all_data) == 0:
sheet_names = self._extract_sheet_names_from_xml(file_path)
if not sheet_names:
return ParseResult(
success=False,
error=f"无法读取 Excel 文件或文件为空: {file_path}"
)
# 使用 XML 方式读取每个工作表
all_data = {}
for sheet_name in sheet_names:
df = self._read_excel_sheet_xml(file_path, sheet_name=sheet_name, header_row=0)
if df is not None and not df.empty:
all_data[sheet_name] = df
# 检查是否成功读取到数据
if not all_data or len(all_data) == 0:
@@ -257,13 +285,149 @@ class XlsxParser(BaseParser):
try:
xls = pd.ExcelFile(file_path)
sheet_names = xls.sheet_names
if not sheet_names:
return []
return sheet_names
if sheet_names:
return sheet_names
# pandas 返回空列表,尝试从 XML 提取
return self._extract_sheet_names_from_xml(file_path)
except Exception as e:
logger.error(f"获取工作表名称失败: {str(e)}")
# 尝试从 XML 提取
return self._extract_sheet_names_from_xml(file_path)
def _extract_sheet_names_from_xml(self, file_path: str) -> List[str]:
    """
    Extract worksheet names directly from the workbook XML of an Excel file.

    Some workbooks contain non-standard elements (for example
    ``mc:AlternateContent``) that keep pandas/openpyxl from listing the
    sheets, so the names are read from ``xl/workbook.xml`` instead.

    The lookup ignores the XML namespace, so both the transitional
    (``schemas.openxmlformats.org``) and the Strict OOXML
    (``purl.oclc.org``) variants of SpreadsheetML are understood.

    Args:
        file_path: Path of the Excel (.xlsx) file.

    Returns:
        Worksheet names in workbook order. An empty list is returned when
        the archive has no ``xl/workbook.xml`` or when anything goes wrong
        while reading it (the failure is logged, never raised).
    """
    import zipfile
    from xml.etree import ElementTree as ET

    try:
        with zipfile.ZipFile(file_path, 'r') as z:
            if 'xl/workbook.xml' not in z.namelist():
                return []
            content = z.read('xl/workbook.xml')

        root = ET.fromstring(content)
        # '{*}sheet' matches <sheet> in any namespace (or none), which covers
        # transitional, strict and namespace-less workbooks in one query.
        sheet_names = [
            name
            for name in (sheet.get('name') for sheet in root.findall('.//{*}sheet'))
            if name
        ]
        logger.info(f"从 XML 提取工作表: {sheet_names}")
        return sheet_names
    except Exception as e:
        # Best-effort fallback: callers treat an empty list as "no sheets".
        logger.error(f"从 XML 提取工作表名称失败: {e}")
        return []
def _read_excel_sheet_xml(self, file_path: str, sheet_name: str = None, header_row: int = 0) -> pd.DataFrame:
    """
    Read one worksheet straight from the XML parts of an Excel file.

    Used as a fallback when pandas cannot parse the workbook. The workbook
    part is parsed here (rather than only asking for the sheet names)
    because each sheet's relationship id is needed to locate its XML part.

    Args:
        file_path: Path of the Excel (.xlsx) file.
        sheet_name: Worksheet to read. When it is None, empty, or not
            present in the workbook, the first worksheet is read.
        header_row: 0-indexed row that holds the column headers. Rows
            above it are ignored. ``None`` means "no header row": every
            row is returned as data and columns keep their letters.

    Returns:
        DataFrame with one row per worksheet row below the header. Cell
        values are the raw XML text (numbers stay strings), resolved
        shared/inline strings, or booleans. Empty cells are None and
        cells absent from a row are NaN.

    Raises:
        ValueError: If no worksheet can be found in the workbook, or the
            worksheet's XML part is missing from the archive.
    """
    import posixpath
    import zipfile
    from xml.etree import ElementTree as ET

    def _column_index(letters):
        # 'A' -> 1, 'Z' -> 26, 'AA' -> 27
        index = 0
        for ch in letters:
            index = index * 26 + (ord(ch) - ord('A') + 1)
        return index

    def _column_letters(index):
        # Inverse of _column_index.
        letters = ''
        while index > 0:
            index, remainder = divmod(index - 1, 26)
            letters = chr(ord('A') + remainder) + letters
        return letters

    def _plain_text(node):
        # Text of an <si>/<is> node: a direct <t>, or the concatenation of
        # all rich-text runs <r><t>. Phonetic runs (<rPh>) are left out.
        parts = [t.text or '' for t in node.findall('{*}t')]
        parts.extend(t.text or '' for t in node.findall('{*}r/{*}t'))
        return ''.join(parts)

    with zipfile.ZipFile(file_path, 'r') as z:
        members = set(z.namelist())

        # Collect (name, relationship id) for every named sheet.
        sheets = []
        if 'xl/workbook.xml' in members:
            try:
                wb_root = ET.fromstring(z.read('xl/workbook.xml'))
            except ET.ParseError:
                wb_root = None
            if wb_root is not None:
                for node in wb_root.findall('.//{*}sheet'):
                    name = node.get('name')
                    if not name:
                        continue
                    # r:id lives in the relationships namespace, whose URI
                    # differs between transitional and strict files.
                    rel_id = next(
                        (value for key, value in node.attrib.items() if key.endswith('}id')),
                        None,
                    )
                    sheets.append((name, rel_id))
        if not sheets:
            raise ValueError("无法从 Excel 文件中找到工作表")

        sheet_names = [name for name, _ in sheets]
        target_sheet = sheet_name if sheet_name and sheet_name in sheet_names else sheet_names[0]
        position = sheet_names.index(target_sheet)

        # Resolve the worksheet part through the workbook relationships;
        # the numbering of sheetN.xml need not follow the tab order.
        sheet_file = None
        rel_id = sheets[position][1]
        rels_part = 'xl/_rels/workbook.xml.rels'
        if rel_id and rels_part in members:
            try:
                rels_root = ET.fromstring(z.read(rels_part))
            except ET.ParseError:
                rels_root = None
            if rels_root is not None:
                for rel in rels_root.findall('{*}Relationship'):
                    target = rel.get('Target')
                    if rel.get('Id') != rel_id or not target:
                        continue
                    if target.startswith('/'):
                        candidate = target.lstrip('/')
                    else:
                        candidate = posixpath.normpath(posixpath.join('xl', target))
                    if candidate in members:
                        sheet_file = candidate
                    break
        if sheet_file is None:
            # Fall back to the conventional positional name.
            sheet_file = f'xl/worksheets/sheet{position + 1}.xml'
        if sheet_file not in members:
            raise ValueError(f"工作表文件 {sheet_file} 不存在")

        shared_strings = []
        if 'xl/sharedStrings.xml' in members:
            ss_root = ET.fromstring(z.read('xl/sharedStrings.xml'))
            shared_strings = [_plain_text(si) for si in ss_root.findall('.//{*}si')]

        root = ET.fromstring(z.read(sheet_file))

    # Worksheet rows are 1-indexed; header_row is 0-indexed.
    header_number = None if header_row is None else header_row + 1
    headers = {}
    all_rows = []
    previous_row = 0
    for row in root.findall('.//{*}row'):
        # The 'r' attribute is optional; without it rows are sequential.
        try:
            row_idx = int(row.get('r'))
        except (TypeError, ValueError):
            row_idx = previous_row + 1
        previous_row = row_idx

        row_cells = {}
        previous_col = 0
        for cell in row.findall('{*}c'):
            col_letters = ''.join(filter(str.isalpha, cell.get('r', ''))).upper()
            if col_letters:
                previous_col = _column_index(col_letters)
            else:
                # No cell reference: the cell follows the previous one.
                previous_col += 1
                col_letters = _column_letters(previous_col)

            cell_type = cell.get('t', 'n')
            if cell_type == 'inlineStr':
                inline = cell.find('{*}is')
                text = _plain_text(inline) if inline is not None else ''
                row_cells[col_letters] = text or None
                continue

            v = cell.find('{*}v')
            if v is None or not v.text:
                row_cells[col_letters] = None
            elif cell_type == 's':
                # Index into the shared string table.
                try:
                    row_cells[col_letters] = shared_strings[int(v.text)]
                except (ValueError, IndexError):
                    row_cells[col_letters] = v.text
            elif cell_type == 'b':
                row_cells[col_letters] = v.text == '1'
            else:
                row_cells[col_letters] = v.text

        if header_number is None or row_idx > header_number:
            all_rows.append(row_cells)
        elif row_idx == header_number:
            headers = row_cells

    # Use every column seen in the header or the data, in sheet order, so a
    # header without data cells cannot raise and unlabeled data is kept.
    seen = set(headers)
    for cells in all_rows:
        seen.update(cells)
    col_order = sorted(seen, key=_column_index)

    df = pd.DataFrame(all_rows, columns=col_order)
    if headers:
        # Blank header cells fall back to the column letter.
        df.columns = [
            headers[col] if headers.get(col) not in (None, '') else col
            for col in col_order
        ]
    return df
def _df_to_dict(self, df: pd.DataFrame) -> Dict[str, Any]:
"""
将 DataFrame 转换为字典,处理 NaN 值

View File

@@ -45,8 +45,25 @@ class ExcelStorageService:
return []
content = z.read('xl/workbook.xml')
root = ET.fromstring(content)
ns = {'main': 'http://purl.oclc.org/ooxml/spreadsheetml/main'}
sheets = root.findall('.//main:sheet', ns)
# 尝试多种命名空间
namespaces = [
'http://schemas.openxmlformats.org/spreadsheetml/2006/main',
'http://purl.oclc.org/ooxml/spreadsheetml/main',
]
for ns_uri in namespaces:
ns = {'main': ns_uri}
sheets = root.findall('.//main:sheet', ns)
if sheets:
names = [s.get('name') for s in sheets if s.get('name')]
if names:
return names
# 尝试通配符
sheets = root.findall('.//{*}sheet')
if not sheets:
sheets = root.findall('.//sheet')
return [s.get('name') for s in sheets if s.get('name')]
except Exception:
return []
@@ -79,72 +96,77 @@ class ExcelStorageService:
if 'xl/sharedStrings.xml' in z.namelist():
ss_content = z.read('xl/sharedStrings.xml')
ss_root = ET.fromstring(ss_content)
ns = {'main': 'http://purl.oclc.org/ooxml/spreadsheetml/main'}
for si in ss_root.findall('.//main:si', ns):
t = si.find('.//main:t', ns)
shared_strings.append(t.text if t is not None else '')
for si in ss_root.iter():
if si.tag.endswith('}si') or si.tag == 'si':
t = si.find('.//{*}t')
shared_strings.append(t.text if t is not None and t.text else '')
sheet_file = f'xl/worksheets/sheet{sheet_index}.xml'
sheet_content = z.read(sheet_file)
root = ET.fromstring(sheet_content)
ns = {'main': 'http://purl.oclc.org/ooxml/spreadsheetml/main'}
rows_data = []
for row in root.findall('.//main:row', ns):
row_idx = int(row.get('r', 0))
if row_idx <= header_row + 1:
continue
headers = {}
row_cells = {}
for cell in row.findall('main:c', ns):
cell_ref = cell.get('r', '')
col_letters = ''.join(filter(str.isalpha, cell_ref))
cell_type = cell.get('t', 'n')
v = cell.find('main:v', ns)
for row in root.iter():
if row.tag.endswith('}row') or row.tag == 'row':
row_idx = int(row.get('r', 0))
if v is not None and v.text:
if cell_type == 's':
try:
val = shared_strings[int(v.text)]
except (ValueError, IndexError):
val = v.text
elif cell_type == 'b':
val = v.text == '1'
else:
val = v.text
else:
val = None
row_cells[col_letters] = val
# 收集表头行
if row_idx == header_row + 1:
for cell in row:
if cell.tag.endswith('}c') or cell.tag == 'c':
cell_ref = cell.get('r', '')
col_letters = ''.join(filter(str.isalpha, cell_ref))
cell_type = cell.get('t', 'n')
v = cell.find('{*}v')
if v is not None and v.text:
if cell_type == 's':
try:
headers[col_letters] = shared_strings[int(v.text)]
except (ValueError, IndexError):
headers[col_letters] = v.text
else:
headers[col_letters] = v.text
else:
headers[col_letters] = col_letters
continue
if row_cells:
rows_data.append(row_cells)
if row_idx <= header_row + 1:
continue
row_cells = {}
for cell in row:
if cell.tag.endswith('}c') or cell.tag == 'c':
cell_ref = cell.get('r', '')
col_letters = ''.join(filter(str.isalpha, cell_ref))
cell_type = cell.get('t', 'n')
v = cell.find('{*}v')
if v is not None and v.text:
if cell_type == 's':
try:
val = shared_strings[int(v.text)]
except (ValueError, IndexError):
val = v.text
elif cell_type == 'b':
val = v.text == '1'
else:
val = v.text
else:
val = None
row_cells[col_letters] = val
if row_cells:
rows_data.append(row_cells)
if not rows_data:
return pd.DataFrame()
df = pd.DataFrame(rows_data)
if header_row >= 0:
first_row_sheet = f'xl/worksheets/sheet{sheet_index}.xml'
sheet_content = z.read(first_row_sheet)
root = ET.fromstring(sheet_content)
first_row = root.find(f'.//main:row[@r="{header_row + 1}"]', ns)
if first_row is not None:
headers = {}
for cell in first_row.findall('main:c', ns):
cell_ref = cell.get('r', '')
col_letters = ''.join(filter(str.isalpha, cell_ref))
cell_type = cell.get('t', 'n')
v = cell.find('main:v', ns)
if v is not None and v.text:
if cell_type == 's':
try:
headers[col_letters] = shared_strings[int(v.text)]
except (ValueError, IndexError):
headers[col_letters] = v.text
else:
headers[col_letters] = v.text
df.columns = [headers.get(col, col) for col in df.columns]
if headers:
df.columns = [headers.get(col, col) for col in df.columns]
return df
except Exception as e:

View File

@@ -47,6 +47,12 @@ class TableRAGService:
import zipfile
from xml.etree import ElementTree as ET
# 尝试多种命名空间
namespaces = [
'http://schemas.openxmlformats.org/spreadsheetml/2006/main',
'http://purl.oclc.org/ooxml/spreadsheetml/main',
]
try:
with zipfile.ZipFile(file_path, 'r') as z:
# 读取 workbook.xml
@@ -56,12 +62,27 @@ class TableRAGService:
content = z.read('xl/workbook.xml')
root = ET.fromstring(content)
# 定义命名空间
ns = {'main': 'http://purl.oclc.org/ooxml/spreadsheetml/main'}
# 尝试多种命名空间
for ns_uri in namespaces:
ns = {'main': ns_uri}
sheets = root.findall('.//main:sheet', ns)
if sheets:
names = [s.get('name') for s in sheets if s.get('name')]
if names:
logger.info(f"使用命名空间 {ns_uri} 提取到工作表: {names}")
return names
# 提取所有 sheet 的 name 属性
sheets = root.findall('.//main:sheet', ns)
return [s.get('name') for s in sheets if s.get('name')]
# 如果都没找到,尝试不带命名空间
sheets = root.findall('.//sheet')
if not sheets:
sheets = root.findall('.//{*}sheet')
names = [s.get('name') for s in sheets if s.get('name')]
if names:
logger.info(f"使用通配符提取到工作表: {names}")
return names
logger.warning(f"无法从 XML 提取工作表,尝试的文件: {file_path}")
return []
except Exception as e:
logger.warning(f"从 XML 提取工作表失败: {file_path}, error: {e}")
@@ -84,6 +105,12 @@ class TableRAGService:
import zipfile
from xml.etree import ElementTree as ET
# 定义命名空间
namespaces = [
'http://schemas.openxmlformats.org/spreadsheetml/2006/main',
'http://purl.oclc.org/ooxml/spreadsheetml/main',
]
try:
# 先尝试用 pandas 正常读取
df = pd.read_excel(file_path, sheet_name=sheet_name, header=header_row)
@@ -111,13 +138,14 @@ class TableRAGService:
if 'xl/sharedStrings.xml' in z.namelist():
ss_content = z.read('xl/sharedStrings.xml')
ss_root = ET.fromstring(ss_content)
ns = {'main': 'http://purl.oclc.org/ooxml/spreadsheetml/main'}
for si in ss_root.findall('.//main:si', ns):
t = si.find('.//main:t', ns)
if t is not None:
shared_strings.append(t.text or '')
else:
shared_strings.append('')
# 使用通配符查找所有 si 元素
for si in ss_root.iter():
if si.tag.endswith('}si') or si.tag == 'si':
t = si.find('.//{*}t')
if t is not None and t.text:
shared_strings.append(t.text)
else:
shared_strings.append('')
# 读取工作表
sheet_file = f'xl/worksheets/sheet{sheet_index}.xml'
@@ -126,75 +154,75 @@ class TableRAGService:
sheet_content = z.read(sheet_file)
root = ET.fromstring(sheet_content)
ns = {'main': 'http://purl.oclc.org/ooxml/spreadsheetml/main'}
# 解析行
# 解析行 - 使用通配符查找
rows_data = []
for row in root.findall('.//main:row', ns):
row_idx = int(row.get('r', 0))
# header_row 是 0-indexed,row_idx 是 1-indexed
# 如果 header_row=0 表示第一行是表头,需要跳过 row_idx=1
if row_idx <= header_row + 1:
continue # 跳过表头行
headers = {}
row_cells = {}
for cell in row.findall('main:c', ns):
cell_ref = cell.get('r', '')
col_letters = ''.join(filter(str.isalpha, cell_ref))
cell_type = cell.get('t', 'n')
v = cell.find('main:v', ns)
for row in root.iter():
if row.tag.endswith('}row') or row.tag == 'row':
row_idx = int(row.get('r', 0))
if v is not None and v.text:
if cell_type == 's':
# shared string
try:
val = shared_strings[int(v.text)]
except (ValueError, IndexError):
val = v.text
elif cell_type == 'b':
# boolean
val = v.text == '1'
else:
# number or other
val = v.text
else:
val = None
# 收集表头行
if row_idx == header_row + 1:
for cell in row:
if cell.tag.endswith('}c') or cell.tag == 'c':
cell_ref = cell.get('r', '')
col_letters = ''.join(filter(str.isalpha, cell_ref))
cell_type = cell.get('t', 'n')
v = cell.find('{*}v')
if v is not None and v.text:
if cell_type == 's':
try:
headers[col_letters] = shared_strings[int(v.text)]
except (ValueError, IndexError):
headers[col_letters] = v.text
else:
headers[col_letters] = v.text
else:
headers[col_letters] = col_letters
continue
row_cells[col_letters] = val
# 跳过表头行之后的数据行
if row_idx <= header_row + 1:
continue
if row_cells:
rows_data.append(row_cells)
row_cells = {}
for cell in row:
if cell.tag.endswith('}c') or cell.tag == 'c':
cell_ref = cell.get('r', '')
col_letters = ''.join(filter(str.isalpha, cell_ref))
cell_type = cell.get('t', 'n')
v = cell.find('{*}v')
if v is not None and v.text:
if cell_type == 's':
try:
val = shared_strings[int(v.text)]
except (ValueError, IndexError):
val = v.text
elif cell_type == 'b':
val = v.text == '1'
else:
val = v.text
else:
val = None
row_cells[col_letters] = val
if row_cells:
rows_data.append(row_cells)
# 转换为 DataFrame
if not rows_data:
logger.warning(f"XML 解析结果为空: {file_path}, sheet: {target_sheet}")
return pd.DataFrame()
df = pd.DataFrame(rows_data)
# 如果有 header_row重新设置列名
if header_row >= 0:
# 重新读取第一行作为表头
first_row_sheet = f'xl/worksheets/sheet{sheet_index}.xml'
sheet_content = z.read(first_row_sheet)
root = ET.fromstring(sheet_content)
first_row = root.find(f'.//main:row[@r="{header_row + 1}"]', ns)
if first_row is not None:
headers = {}
for cell in first_row.findall('main:c', ns):
cell_ref = cell.get('r', '')
col_letters = ''.join(filter(str.isalpha, cell_ref))
cell_type = cell.get('t', 'n')
v = cell.find('main:v', ns)
if v is not None and v.text:
if cell_type == 's':
try:
headers[col_letters] = shared_strings[int(v.text)]
except (ValueError, IndexError):
headers[col_letters] = v.text
else:
headers[col_letters] = v.text
# 重命名列
df.columns = [headers.get(col, col) for col in df.columns]
# 应用表头
if headers:
df.columns = [headers.get(col, col) for col in df.columns]
logger.info(f"XML 解析完成: {len(df)} 行, {len(df.columns)}")
return df