添加XML回退解析机制支持复杂Excel文件
当 pandas 无法解析某些包含非标准元素的 Excel 文件时,添加了 XML 直接解析功能来提取工作表名称和数据。

- 实现了 `_extract_sheet_names_from_xml` 方法从 XML 提取工作表名称
- 实现了 `_read_excel_sheet_xml` 方法直接解析 Excel XML 数据
- 添加多种命名空间支持以处理不同 Excel 格式
- 在 pandas 解析失败时自动回退到 XML 解析方式

fix(excel-storage-service): 修复 XML 解析中的命名空间问题

改进了 XML 解析逻辑,添加对多种命名空间的支持,使用通配符查找元素以兼容不同 Excel 文件格式。

refactor(table-rag-service): 优化 XML 解析逻辑提高兼容性

统一了 XML 解析的命名空间处理方式,改进了元素查找逻辑以更好地支持不同 Excel 格式。

feat(frontend): 添加 RAG 向量检索和索引重建功能

- 实现了 RAG 状态查看、搜索和索引重建接口
- 添加了前端 RAG 检索界面组件
- 增加了错误处理和加载状态提示
This commit is contained in:
@@ -67,6 +67,9 @@ class XlsxParser(BaseParser):
|
|||||||
xls_file = pd.ExcelFile(file_path)
|
xls_file = pd.ExcelFile(file_path)
|
||||||
sheet_names = xls_file.sheet_names
|
sheet_names = xls_file.sheet_names
|
||||||
|
|
||||||
|
# 如果 pandas 返回空列表,尝试从 XML 提取
|
||||||
|
if not sheet_names:
|
||||||
|
sheet_names = self._extract_sheet_names_from_xml(file_path)
|
||||||
if not sheet_names:
|
if not sheet_names:
|
||||||
return ParseResult(
|
return ParseResult(
|
||||||
success=False,
|
success=False,
|
||||||
@@ -88,15 +91,21 @@ class XlsxParser(BaseParser):
|
|||||||
target_sheet = sheet_names[0]
|
target_sheet = sheet_names[0]
|
||||||
|
|
||||||
# 读取 Excel 文件
|
# 读取 Excel 文件
|
||||||
|
df = None
|
||||||
|
try:
|
||||||
df = pd.read_excel(
|
df = pd.read_excel(
|
||||||
file_path,
|
file_path,
|
||||||
sheet_name=target_sheet,
|
sheet_name=target_sheet,
|
||||||
header=header_row,
|
header=header_row,
|
||||||
**kwargs
|
**kwargs
|
||||||
)
|
)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"pandas 读取 Excel 失败,尝试 XML 方式: {e}")
|
||||||
|
# pandas 读取失败,尝试 XML 方式
|
||||||
|
df = self._read_excel_sheet_xml(file_path, sheet_name=target_sheet, header_row=header_row)
|
||||||
|
|
||||||
# 检查 DataFrame 是否为空
|
# 检查 DataFrame 是否为空
|
||||||
if df.empty:
|
if df is None or df.empty:
|
||||||
return ParseResult(
|
return ParseResult(
|
||||||
success=False,
|
success=False,
|
||||||
error=f"工作表 '{target_sheet}' 为空,请检查 Excel 文件内容"
|
error=f"工作表 '{target_sheet}' 为空,请检查 Excel 文件内容"
|
||||||
@@ -211,7 +220,26 @@ class XlsxParser(BaseParser):
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
# 读取所有工作表
|
# 读取所有工作表
|
||||||
|
all_data = None
|
||||||
|
try:
|
||||||
all_data = pd.read_excel(file_path, sheet_name=None, **kwargs)
|
all_data = pd.read_excel(file_path, sheet_name=None, **kwargs)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"pandas 读取所有工作表失败: {e}")
|
||||||
|
|
||||||
|
# 如果 pandas 失败,尝试 XML 方式
|
||||||
|
if all_data is None or len(all_data) == 0:
|
||||||
|
sheet_names = self._extract_sheet_names_from_xml(file_path)
|
||||||
|
if not sheet_names:
|
||||||
|
return ParseResult(
|
||||||
|
success=False,
|
||||||
|
error=f"无法读取 Excel 文件或文件为空: {file_path}"
|
||||||
|
)
|
||||||
|
# 使用 XML 方式读取每个工作表
|
||||||
|
all_data = {}
|
||||||
|
for sheet_name in sheet_names:
|
||||||
|
df = self._read_excel_sheet_xml(file_path, sheet_name=sheet_name, header_row=0)
|
||||||
|
if df is not None and not df.empty:
|
||||||
|
all_data[sheet_name] = df
|
||||||
|
|
||||||
# 检查是否成功读取到数据
|
# 检查是否成功读取到数据
|
||||||
if not all_data or len(all_data) == 0:
|
if not all_data or len(all_data) == 0:
|
||||||
@@ -257,12 +285,148 @@ class XlsxParser(BaseParser):
|
|||||||
try:
|
try:
|
||||||
xls = pd.ExcelFile(file_path)
|
xls = pd.ExcelFile(file_path)
|
||||||
sheet_names = xls.sheet_names
|
sheet_names = xls.sheet_names
|
||||||
if not sheet_names:
|
if sheet_names:
|
||||||
return []
|
|
||||||
return sheet_names
|
return sheet_names
|
||||||
|
# pandas 返回空列表,尝试从 XML 提取
|
||||||
|
return self._extract_sheet_names_from_xml(file_path)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"获取工作表名称失败: {str(e)}")
|
logger.error(f"获取工作表名称失败: {str(e)}")
|
||||||
|
# 尝试从 XML 提取
|
||||||
|
return self._extract_sheet_names_from_xml(file_path)
|
||||||
|
|
||||||
|
def _extract_sheet_names_from_xml(self, file_path: str) -> List[str]:
    """
    Extract worksheet names directly from the workbook XML of an Excel file.

    Fallback for workbooks whose sheet list pandas/openpyxl cannot parse
    (for example files containing non-standard elements such as
    mc:AlternateContent).

    ``<sheet>`` elements are matched by local tag name, so the lookup works
    for the transitional SpreadsheetML namespace, the Strict OOXML
    namespace, and documents without a namespace.

    Args:
        file_path: Path to the Excel (.xlsx) file.

    Returns:
        Worksheet names in document order. An empty list is returned when
        the archive has no ``xl/workbook.xml`` or when the file cannot be
        opened or parsed (the error is logged, not raised).
    """
    import zipfile
    from xml.etree import ElementTree as ET

    try:
        with zipfile.ZipFile(file_path, 'r') as z:
            if 'xl/workbook.xml' not in z.namelist():
                return []
            root = ET.fromstring(z.read('xl/workbook.xml'))

        sheet_names = []
        for element in root.iter():
            tag = element.tag
            # Compare only the part after "{namespace}" so that any
            # SpreadsheetML namespace variant is accepted.
            if not isinstance(tag, str) or tag.rsplit('}', 1)[-1] != 'sheet':
                continue
            name = element.get('name')
            if name:
                sheet_names.append(name)

        logger.info(f"从 XML 提取工作表: {sheet_names}")
        return sheet_names
    except Exception as e:
        # Best-effort fallback: callers treat an empty list as "no sheets".
        logger.error(f"从 XML 提取工作表名称失败: {e}")
        return []
|
||||||
|
|
||||||
|
def _read_excel_sheet_xml(self, file_path: str, sheet_name: str = None, header_row: int = 0) -> pd.DataFrame:
    """
    Read one worksheet of an Excel file by parsing its XML directly.

    Used as a fallback when pandas cannot parse the workbook.

    Elements are matched by local tag name, so both the transitional and
    the Strict OOXML namespaces are supported. The worksheet part is
    resolved through ``xl/_rels/workbook.xml.rels`` (sheet order does not
    have to match the ``sheetN.xml`` numbering); if that lookup is not
    possible the positional ``xl/worksheets/sheet{N}.xml`` name is used.

    Cell values are returned as found in the XML: shared and inline strings
    as ``str``, booleans as ``bool``, everything else (numbers, dates, ...)
    as the raw text of ``<v>``; cells without a value become ``None``.

    Args:
        file_path: Path to the Excel (.xlsx) file.
        sheet_name: Worksheet to read. When ``None`` or not present in the
            workbook, the first worksheet is read.
        header_row: 0-indexed row that holds the column headers. Rows above
            it are ignored; rows below it become data rows.

    Returns:
        A DataFrame with one row per data row. Columns are named after the
        header cells; a header cell without a value, or a data column
        without a header cell, keeps its column letter as the name.

    Raises:
        ValueError: If no worksheet can be found in the workbook or the
            worksheet part is missing from the archive.
    """
    import posixpath
    import zipfile
    from xml.etree import ElementTree as ET

    def local_name(tag):
        # Strip the "{namespace}" prefix ElementTree puts in front of tags.
        return tag.rsplit('}', 1)[-1] if isinstance(tag, str) else ''

    def children(parent, name):
        return [el for el in parent if local_name(el.tag) == name]

    def string_text(container):
        # Text of an <si>/<is> item: a plain <t> or the concatenation of all
        # rich-text runs (<r><t>). Phonetic runs (<rPh>) are not cell text.
        parts = []
        for child in container:
            kind = local_name(child.tag)
            if kind == 't':
                parts.append(child.text or '')
            elif kind == 'r':
                parts.extend(t.text or '' for t in children(child, 't'))
        return ''.join(parts)

    def column_number(letters):
        number = 0
        for ch in letters.upper():
            number = number * 26 + (ord(ch) - ord('A') + 1)
        return number

    def column_letters(number):
        letters = ''
        while number > 0:
            number, remainder = divmod(number - 1, 26)
            letters = chr(ord('A') + remainder) + letters
        return letters

    with zipfile.ZipFile(file_path, 'r') as z:
        members = set(z.namelist())

        # Collect (name, relationship id) for every named sheet, in order.
        sheets = []
        if 'xl/workbook.xml' in members:
            try:
                wb_root = ET.fromstring(z.read('xl/workbook.xml'))
            except ET.ParseError:
                wb_root = None
            if wb_root is not None:
                for el in wb_root.iter():
                    if local_name(el.tag) != 'sheet' or not el.get('name'):
                        continue
                    # r:id lives in the relationships namespace, which also
                    # differs between transitional and Strict files.
                    rel_id = next(
                        (value for key, value in el.attrib.items() if key.endswith('}id')),
                        None,
                    )
                    sheets.append((el.get('name'), rel_id))
        if not sheets:
            raise ValueError("无法从 Excel 文件中找到工作表")

        # Pick the requested sheet, falling back to the first one.
        sheet_names = [name for name, _ in sheets]
        target_sheet = sheet_name if sheet_name and sheet_name in sheet_names else sheet_names[0]
        position = sheet_names.index(target_sheet)
        rel_id = sheets[position][1]

        # Resolve the worksheet part through the workbook relationships.
        sheet_file = None
        rels_path = 'xl/_rels/workbook.xml.rels'
        if rel_id and rels_path in members:
            try:
                rels_root = ET.fromstring(z.read(rels_path))
            except ET.ParseError:
                rels_root = None
            if rels_root is not None:
                for rel in rels_root.iter():
                    if local_name(rel.tag) != 'Relationship' or rel.get('Id') != rel_id:
                        continue
                    target = rel.get('Target', '')
                    if target and rel.get('TargetMode') != 'External':
                        if target.startswith('/'):
                            candidate = target.lstrip('/')
                        else:
                            candidate = posixpath.normpath(posixpath.join('xl', target))
                        if candidate in members:
                            sheet_file = candidate
                    break
        if sheet_file is None:
            # Conventional layout: sheet1.xml, sheet2.xml, ...
            sheet_file = f'xl/worksheets/sheet{position + 1}.xml'
        if sheet_file not in members:
            raise ValueError(f"工作表文件 {sheet_file} 不存在")

        # Shared strings table (indexed by the <v> of cells with t="s").
        shared_strings = []
        if 'xl/sharedStrings.xml' in members:
            ss_root = ET.fromstring(z.read('xl/sharedStrings.xml'))
            shared_strings = [
                string_text(si) for si in ss_root.iter() if local_name(si.tag) == 'si'
            ]

        root = ET.fromstring(z.read(sheet_file))

    all_rows = []
    headers = {}
    header_number = header_row + 1  # row numbers in the XML are 1-indexed
    previous_row = 0

    for row in root.iter():
        if local_name(row.tag) != 'row':
            continue
        # The "r" attribute is optional; a row without it follows the previous one.
        row_ref = row.get('r')
        row_idx = int(row_ref) if row_ref else previous_row + 1
        previous_row = row_idx
        if row_idx < header_number:
            continue

        row_cells = {}
        previous_col = 0
        for cell in children(row, 'c'):
            col_letters = ''.join(filter(str.isalpha, cell.get('r', '')))
            if col_letters:
                previous_col = column_number(col_letters)
            else:
                # Cell reference omitted: the cell follows the previous one.
                previous_col += 1
                col_letters = column_letters(previous_col)

            cell_type = cell.get('t', 'n')
            value = None
            if cell_type == 'inlineStr':
                # Inline strings keep their text in <is>, not in <v>.
                inline = children(cell, 'is')
                if inline:
                    value = string_text(inline[0]) or None
            else:
                v = next(iter(children(cell, 'v')), None)
                if v is not None and v.text:
                    if cell_type == 's':
                        try:
                            value = shared_strings[int(v.text)]
                        except (ValueError, IndexError):
                            value = v.text
                    elif cell_type == 'b':
                        value = v.text == '1'
                    else:
                        value = v.text
            row_cells[col_letters] = value

        if row_idx == header_number:
            # An empty header cell keeps its column letter as the name.
            headers = {
                col: (value if value is not None else col)
                for col, value in row_cells.items()
            }
        else:
            all_rows.append(row_cells)

    df = pd.DataFrame(all_rows)
    if headers and not df.empty:
        # Header columns first (sheet order), then data columns that have no
        # header cell. reindex tolerates header columns without any data.
        ordered = list(headers) + [col for col in df.columns if col not in headers]
        df = df.reindex(columns=ordered)
        df.columns = [headers.get(col, col) for col in df.columns]

    return df
|
||||||
|
|
||||||
def _df_to_dict(self, df: pd.DataFrame) -> Dict[str, Any]:
|
def _df_to_dict(self, df: pd.DataFrame) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -45,8 +45,25 @@ class ExcelStorageService:
|
|||||||
return []
|
return []
|
||||||
content = z.read('xl/workbook.xml')
|
content = z.read('xl/workbook.xml')
|
||||||
root = ET.fromstring(content)
|
root = ET.fromstring(content)
|
||||||
ns = {'main': 'http://purl.oclc.org/ooxml/spreadsheetml/main'}
|
|
||||||
|
# 尝试多种命名空间
|
||||||
|
namespaces = [
|
||||||
|
'http://schemas.openxmlformats.org/spreadsheetml/2006/main',
|
||||||
|
'http://purl.oclc.org/ooxml/spreadsheetml/main',
|
||||||
|
]
|
||||||
|
|
||||||
|
for ns_uri in namespaces:
|
||||||
|
ns = {'main': ns_uri}
|
||||||
sheets = root.findall('.//main:sheet', ns)
|
sheets = root.findall('.//main:sheet', ns)
|
||||||
|
if sheets:
|
||||||
|
names = [s.get('name') for s in sheets if s.get('name')]
|
||||||
|
if names:
|
||||||
|
return names
|
||||||
|
|
||||||
|
# 尝试通配符
|
||||||
|
sheets = root.findall('.//{*}sheet')
|
||||||
|
if not sheets:
|
||||||
|
sheets = root.findall('.//sheet')
|
||||||
return [s.get('name') for s in sheets if s.get('name')]
|
return [s.get('name') for s in sheets if s.get('name')]
|
||||||
except Exception:
|
except Exception:
|
||||||
return []
|
return []
|
||||||
@@ -79,28 +96,52 @@ class ExcelStorageService:
|
|||||||
if 'xl/sharedStrings.xml' in z.namelist():
|
if 'xl/sharedStrings.xml' in z.namelist():
|
||||||
ss_content = z.read('xl/sharedStrings.xml')
|
ss_content = z.read('xl/sharedStrings.xml')
|
||||||
ss_root = ET.fromstring(ss_content)
|
ss_root = ET.fromstring(ss_content)
|
||||||
ns = {'main': 'http://purl.oclc.org/ooxml/spreadsheetml/main'}
|
for si in ss_root.iter():
|
||||||
for si in ss_root.findall('.//main:si', ns):
|
if si.tag.endswith('}si') or si.tag == 'si':
|
||||||
t = si.find('.//main:t', ns)
|
t = si.find('.//{*}t')
|
||||||
shared_strings.append(t.text if t is not None else '')
|
shared_strings.append(t.text if t is not None and t.text else '')
|
||||||
|
|
||||||
sheet_file = f'xl/worksheets/sheet{sheet_index}.xml'
|
sheet_file = f'xl/worksheets/sheet{sheet_index}.xml'
|
||||||
sheet_content = z.read(sheet_file)
|
sheet_content = z.read(sheet_file)
|
||||||
root = ET.fromstring(sheet_content)
|
root = ET.fromstring(sheet_content)
|
||||||
ns = {'main': 'http://purl.oclc.org/ooxml/spreadsheetml/main'}
|
|
||||||
|
|
||||||
rows_data = []
|
rows_data = []
|
||||||
for row in root.findall('.//main:row', ns):
|
headers = {}
|
||||||
|
|
||||||
|
for row in root.iter():
|
||||||
|
if row.tag.endswith('}row') or row.tag == 'row':
|
||||||
row_idx = int(row.get('r', 0))
|
row_idx = int(row.get('r', 0))
|
||||||
|
|
||||||
|
# 收集表头行
|
||||||
|
if row_idx == header_row + 1:
|
||||||
|
for cell in row:
|
||||||
|
if cell.tag.endswith('}c') or cell.tag == 'c':
|
||||||
|
cell_ref = cell.get('r', '')
|
||||||
|
col_letters = ''.join(filter(str.isalpha, cell_ref))
|
||||||
|
cell_type = cell.get('t', 'n')
|
||||||
|
v = cell.find('{*}v')
|
||||||
|
if v is not None and v.text:
|
||||||
|
if cell_type == 's':
|
||||||
|
try:
|
||||||
|
headers[col_letters] = shared_strings[int(v.text)]
|
||||||
|
except (ValueError, IndexError):
|
||||||
|
headers[col_letters] = v.text
|
||||||
|
else:
|
||||||
|
headers[col_letters] = v.text
|
||||||
|
else:
|
||||||
|
headers[col_letters] = col_letters
|
||||||
|
continue
|
||||||
|
|
||||||
if row_idx <= header_row + 1:
|
if row_idx <= header_row + 1:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
row_cells = {}
|
row_cells = {}
|
||||||
for cell in row.findall('main:c', ns):
|
for cell in row:
|
||||||
|
if cell.tag.endswith('}c') or cell.tag == 'c':
|
||||||
cell_ref = cell.get('r', '')
|
cell_ref = cell.get('r', '')
|
||||||
col_letters = ''.join(filter(str.isalpha, cell_ref))
|
col_letters = ''.join(filter(str.isalpha, cell_ref))
|
||||||
cell_type = cell.get('t', 'n')
|
cell_type = cell.get('t', 'n')
|
||||||
v = cell.find('main:v', ns)
|
v = cell.find('{*}v')
|
||||||
|
|
||||||
if v is not None and v.text:
|
if v is not None and v.text:
|
||||||
if cell_type == 's':
|
if cell_type == 's':
|
||||||
@@ -124,26 +165,7 @@ class ExcelStorageService:
|
|||||||
|
|
||||||
df = pd.DataFrame(rows_data)
|
df = pd.DataFrame(rows_data)
|
||||||
|
|
||||||
if header_row >= 0:
|
if headers:
|
||||||
first_row_sheet = f'xl/worksheets/sheet{sheet_index}.xml'
|
|
||||||
sheet_content = z.read(first_row_sheet)
|
|
||||||
root = ET.fromstring(sheet_content)
|
|
||||||
first_row = root.find(f'.//main:row[@r="{header_row + 1}"]', ns)
|
|
||||||
if first_row is not None:
|
|
||||||
headers = {}
|
|
||||||
for cell in first_row.findall('main:c', ns):
|
|
||||||
cell_ref = cell.get('r', '')
|
|
||||||
col_letters = ''.join(filter(str.isalpha, cell_ref))
|
|
||||||
cell_type = cell.get('t', 'n')
|
|
||||||
v = cell.find('main:v', ns)
|
|
||||||
if v is not None and v.text:
|
|
||||||
if cell_type == 's':
|
|
||||||
try:
|
|
||||||
headers[col_letters] = shared_strings[int(v.text)]
|
|
||||||
except (ValueError, IndexError):
|
|
||||||
headers[col_letters] = v.text
|
|
||||||
else:
|
|
||||||
headers[col_letters] = v.text
|
|
||||||
df.columns = [headers.get(col, col) for col in df.columns]
|
df.columns = [headers.get(col, col) for col in df.columns]
|
||||||
|
|
||||||
return df
|
return df
|
||||||
|
|||||||
@@ -47,6 +47,12 @@ class TableRAGService:
|
|||||||
import zipfile
|
import zipfile
|
||||||
from xml.etree import ElementTree as ET
|
from xml.etree import ElementTree as ET
|
||||||
|
|
||||||
|
# 尝试多种命名空间
|
||||||
|
namespaces = [
|
||||||
|
'http://schemas.openxmlformats.org/spreadsheetml/2006/main',
|
||||||
|
'http://purl.oclc.org/ooxml/spreadsheetml/main',
|
||||||
|
]
|
||||||
|
|
||||||
try:
|
try:
|
||||||
with zipfile.ZipFile(file_path, 'r') as z:
|
with zipfile.ZipFile(file_path, 'r') as z:
|
||||||
# 读取 workbook.xml
|
# 读取 workbook.xml
|
||||||
@@ -56,12 +62,27 @@ class TableRAGService:
|
|||||||
content = z.read('xl/workbook.xml')
|
content = z.read('xl/workbook.xml')
|
||||||
root = ET.fromstring(content)
|
root = ET.fromstring(content)
|
||||||
|
|
||||||
# 定义命名空间
|
# 尝试多种命名空间
|
||||||
ns = {'main': 'http://purl.oclc.org/ooxml/spreadsheetml/main'}
|
for ns_uri in namespaces:
|
||||||
|
ns = {'main': ns_uri}
|
||||||
# 提取所有 sheet 的 name 属性
|
|
||||||
sheets = root.findall('.//main:sheet', ns)
|
sheets = root.findall('.//main:sheet', ns)
|
||||||
return [s.get('name') for s in sheets if s.get('name')]
|
if sheets:
|
||||||
|
names = [s.get('name') for s in sheets if s.get('name')]
|
||||||
|
if names:
|
||||||
|
logger.info(f"使用命名空间 {ns_uri} 提取到工作表: {names}")
|
||||||
|
return names
|
||||||
|
|
||||||
|
# 如果都没找到,尝试不带命名空间
|
||||||
|
sheets = root.findall('.//sheet')
|
||||||
|
if not sheets:
|
||||||
|
sheets = root.findall('.//{*}sheet')
|
||||||
|
names = [s.get('name') for s in sheets if s.get('name')]
|
||||||
|
if names:
|
||||||
|
logger.info(f"使用通配符提取到工作表: {names}")
|
||||||
|
return names
|
||||||
|
|
||||||
|
logger.warning(f"无法从 XML 提取工作表,尝试的文件: {file_path}")
|
||||||
|
return []
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"从 XML 提取工作表失败: {file_path}, error: {e}")
|
logger.warning(f"从 XML 提取工作表失败: {file_path}, error: {e}")
|
||||||
@@ -84,6 +105,12 @@ class TableRAGService:
|
|||||||
import zipfile
|
import zipfile
|
||||||
from xml.etree import ElementTree as ET
|
from xml.etree import ElementTree as ET
|
||||||
|
|
||||||
|
# 定义命名空间
|
||||||
|
namespaces = [
|
||||||
|
'http://schemas.openxmlformats.org/spreadsheetml/2006/main',
|
||||||
|
'http://purl.oclc.org/ooxml/spreadsheetml/main',
|
||||||
|
]
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# 先尝试用 pandas 正常读取
|
# 先尝试用 pandas 正常读取
|
||||||
df = pd.read_excel(file_path, sheet_name=sheet_name, header=header_row)
|
df = pd.read_excel(file_path, sheet_name=sheet_name, header=header_row)
|
||||||
@@ -111,11 +138,12 @@ class TableRAGService:
|
|||||||
if 'xl/sharedStrings.xml' in z.namelist():
|
if 'xl/sharedStrings.xml' in z.namelist():
|
||||||
ss_content = z.read('xl/sharedStrings.xml')
|
ss_content = z.read('xl/sharedStrings.xml')
|
||||||
ss_root = ET.fromstring(ss_content)
|
ss_root = ET.fromstring(ss_content)
|
||||||
ns = {'main': 'http://purl.oclc.org/ooxml/spreadsheetml/main'}
|
# 使用通配符查找所有 si 元素
|
||||||
for si in ss_root.findall('.//main:si', ns):
|
for si in ss_root.iter():
|
||||||
t = si.find('.//main:t', ns)
|
if si.tag.endswith('}si') or si.tag == 'si':
|
||||||
if t is not None:
|
t = si.find('.//{*}t')
|
||||||
shared_strings.append(t.text or '')
|
if t is not None and t.text:
|
||||||
|
shared_strings.append(t.text)
|
||||||
else:
|
else:
|
||||||
shared_strings.append('')
|
shared_strings.append('')
|
||||||
|
|
||||||
@@ -126,36 +154,56 @@ class TableRAGService:
|
|||||||
|
|
||||||
sheet_content = z.read(sheet_file)
|
sheet_content = z.read(sheet_file)
|
||||||
root = ET.fromstring(sheet_content)
|
root = ET.fromstring(sheet_content)
|
||||||
ns = {'main': 'http://purl.oclc.org/ooxml/spreadsheetml/main'}
|
|
||||||
|
|
||||||
# 解析行
|
# 解析行 - 使用通配符查找
|
||||||
rows_data = []
|
rows_data = []
|
||||||
for row in root.findall('.//main:row', ns):
|
headers = {}
|
||||||
row_idx = int(row.get('r', 0))
|
|
||||||
# header_row 是 0-indexed,row_idx 是 1-indexed
|
|
||||||
# 如果 header_row=0 表示第一行是表头,需要跳过 row_idx=1
|
|
||||||
if row_idx <= header_row + 1:
|
|
||||||
continue # 跳过表头行
|
|
||||||
|
|
||||||
row_cells = {}
|
for row in root.iter():
|
||||||
for cell in row.findall('main:c', ns):
|
if row.tag.endswith('}row') or row.tag == 'row':
|
||||||
|
row_idx = int(row.get('r', 0))
|
||||||
|
|
||||||
|
# 收集表头行
|
||||||
|
if row_idx == header_row + 1:
|
||||||
|
for cell in row:
|
||||||
|
if cell.tag.endswith('}c') or cell.tag == 'c':
|
||||||
cell_ref = cell.get('r', '')
|
cell_ref = cell.get('r', '')
|
||||||
col_letters = ''.join(filter(str.isalpha, cell_ref))
|
col_letters = ''.join(filter(str.isalpha, cell_ref))
|
||||||
cell_type = cell.get('t', 'n')
|
cell_type = cell.get('t', 'n')
|
||||||
v = cell.find('main:v', ns)
|
v = cell.find('{*}v')
|
||||||
|
if v is not None and v.text:
|
||||||
|
if cell_type == 's':
|
||||||
|
try:
|
||||||
|
headers[col_letters] = shared_strings[int(v.text)]
|
||||||
|
except (ValueError, IndexError):
|
||||||
|
headers[col_letters] = v.text
|
||||||
|
else:
|
||||||
|
headers[col_letters] = v.text
|
||||||
|
else:
|
||||||
|
headers[col_letters] = col_letters
|
||||||
|
continue
|
||||||
|
|
||||||
|
# 跳过表头行之后的数据行
|
||||||
|
if row_idx <= header_row + 1:
|
||||||
|
continue
|
||||||
|
|
||||||
|
row_cells = {}
|
||||||
|
for cell in row:
|
||||||
|
if cell.tag.endswith('}c') or cell.tag == 'c':
|
||||||
|
cell_ref = cell.get('r', '')
|
||||||
|
col_letters = ''.join(filter(str.isalpha, cell_ref))
|
||||||
|
cell_type = cell.get('t', 'n')
|
||||||
|
v = cell.find('{*}v')
|
||||||
|
|
||||||
if v is not None and v.text:
|
if v is not None and v.text:
|
||||||
if cell_type == 's':
|
if cell_type == 's':
|
||||||
# shared string
|
|
||||||
try:
|
try:
|
||||||
val = shared_strings[int(v.text)]
|
val = shared_strings[int(v.text)]
|
||||||
except (ValueError, IndexError):
|
except (ValueError, IndexError):
|
||||||
val = v.text
|
val = v.text
|
||||||
elif cell_type == 'b':
|
elif cell_type == 'b':
|
||||||
# boolean
|
|
||||||
val = v.text == '1'
|
val = v.text == '1'
|
||||||
else:
|
else:
|
||||||
# number or other
|
|
||||||
val = v.text
|
val = v.text
|
||||||
else:
|
else:
|
||||||
val = None
|
val = None
|
||||||
@@ -167,33 +215,13 @@ class TableRAGService:
|
|||||||
|
|
||||||
# 转换为 DataFrame
|
# 转换为 DataFrame
|
||||||
if not rows_data:
|
if not rows_data:
|
||||||
|
logger.warning(f"XML 解析结果为空: {file_path}, sheet: {target_sheet}")
|
||||||
return pd.DataFrame()
|
return pd.DataFrame()
|
||||||
|
|
||||||
df = pd.DataFrame(rows_data)
|
df = pd.DataFrame(rows_data)
|
||||||
|
|
||||||
# 如果有 header_row,重新设置列名
|
# 应用表头
|
||||||
if header_row >= 0:
|
if headers:
|
||||||
# 重新读取第一行作为表头
|
|
||||||
first_row_sheet = f'xl/worksheets/sheet{sheet_index}.xml'
|
|
||||||
sheet_content = z.read(first_row_sheet)
|
|
||||||
root = ET.fromstring(sheet_content)
|
|
||||||
first_row = root.find(f'.//main:row[@r="{header_row + 1}"]', ns)
|
|
||||||
if first_row is not None:
|
|
||||||
headers = {}
|
|
||||||
for cell in first_row.findall('main:c', ns):
|
|
||||||
cell_ref = cell.get('r', '')
|
|
||||||
col_letters = ''.join(filter(str.isalpha, cell_ref))
|
|
||||||
cell_type = cell.get('t', 'n')
|
|
||||||
v = cell.find('main:v', ns)
|
|
||||||
if v is not None and v.text:
|
|
||||||
if cell_type == 's':
|
|
||||||
try:
|
|
||||||
headers[col_letters] = shared_strings[int(v.text)]
|
|
||||||
except (ValueError, IndexError):
|
|
||||||
headers[col_letters] = v.text
|
|
||||||
else:
|
|
||||||
headers[col_letters] = v.text
|
|
||||||
# 重命名列
|
|
||||||
df.columns = [headers.get(col, col) for col in df.columns]
|
df.columns = [headers.get(col, col) for col in df.columns]
|
||||||
|
|
||||||
logger.info(f"XML 解析完成: {len(df)} 行, {len(df.columns)} 列")
|
logger.info(f"XML 解析完成: {len(df)} 行, {len(df.columns)} 列")
|
||||||
|
|||||||
1
frontend - 副本
Submodule
1
frontend - 副本
Submodule
Submodule frontend - 副本 added at 797125940b
@@ -563,6 +563,30 @@ export const backendApi = {
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|
||||||
|
/**
 * Rebuild the RAG index.
 *
 * Sends a POST request to `${BACKEND_BASE_URL}/rag/rebuild` and resolves
 * with the parsed JSON response body.
 *
 * @throws Error when the response status is not OK. The message is the
 *   `detail` field of the error body when the body is JSON and has one,
 *   otherwise a generic message. Network errors are logged and re-thrown.
 */
async rebuildRAGIndex(): Promise<{
  success: boolean;
  message: string;
}> {
  const url = `${BACKEND_BASE_URL}/rag/rebuild`;

  try {
    const response = await fetch(url, {
      method: 'POST',
    });
    if (!response.ok) {
      // The error body is not guaranteed to be JSON (e.g. a proxy error
      // page); a parse failure must not mask the actual request failure.
      const error = await response.json().catch(() => null);
      throw new Error(error?.detail || '重建索引失败');
    }
    return await response.json();
  } catch (error) {
    console.error('重建 RAG 索引失败:', error);
    throw error;
  }
},
|
||||||
|
|
||||||
// ==================== 表格填写 ====================
|
// ==================== 表格填写 ====================
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -91,6 +91,13 @@ const Documents: React.FC = () => {
|
|||||||
const [mdStreaming, setMdStreaming] = useState(false);
|
const [mdStreaming, setMdStreaming] = useState(false);
|
||||||
const [mdStreamingContent, setMdStreamingContent] = useState('');
|
const [mdStreamingContent, setMdStreamingContent] = useState('');
|
||||||
|
|
||||||
|
// RAG 向量检索相关状态
|
||||||
|
const [ragStatus, setRagStatus] = useState<{ vector_count: number; collections: string[] } | null>(null);
|
||||||
|
const [ragSearchQuery, setRagSearchQuery] = useState('');
|
||||||
|
const [ragSearching, setRagSearching] = useState(false);
|
||||||
|
const [ragResults, setRagResults] = useState<any[]>([]);
|
||||||
|
const [ragRebuilding, setRagRebuilding] = useState(false);
|
||||||
|
|
||||||
// 解析选项
|
// 解析选项
|
||||||
const [parseOptions, setParseOptions] = useState({
|
const [parseOptions, setParseOptions] = useState({
|
||||||
parseAllSheets: false,
|
parseAllSheets: false,
|
||||||
@@ -146,6 +153,61 @@ const Documents: React.FC = () => {
|
|||||||
loadDocuments();
|
loadDocuments();
|
||||||
}, [loadDocuments]);
|
}, [loadDocuments]);
|
||||||
|
|
||||||
|
// 获取 RAG 状态
|
||||||
|
useEffect(() => {
|
||||||
|
const fetchRagStatus = async () => {
|
||||||
|
try {
|
||||||
|
const status = await backendApi.getRAGStatus();
|
||||||
|
if (status.success) {
|
||||||
|
setRagStatus({ vector_count: status.vector_count, collections: status.collections });
|
||||||
|
}
|
||||||
|
} catch (err) {
|
||||||
|
console.error('获取 RAG 状态失败:', err);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
fetchRagStatus();
|
||||||
|
}, []);
|
||||||
|
|
||||||
|
// Run a RAG search for the current query and store the returned results.
// Shows an error toast when the query is blank, when the backend reports
// success: false, or when the request throws.
const handleRagSearch = async () => {
  // The Enter-key handler is not disabled while a search is running, so
  // guard here to avoid overlapping requests overwriting each other.
  if (ragSearching) {
    return;
  }
  if (!ragSearchQuery.trim()) {
    toast.error('请输入搜索内容');
    return;
  }
  setRagSearching(true);
  setRagResults([]);
  try {
    const result = await backendApi.searchRAG(ragSearchQuery, 5);
    if (result.success) {
      setRagResults(result.results || []);
    } else {
      // Without this the spinner just stops and the user gets no feedback.
      toast.error('搜索失败');
    }
  } catch (err: any) {
    toast.error(err?.message || '搜索失败');
  } finally {
    setRagSearching(false);
  }
};
|
||||||
|
|
||||||
|
// Rebuild the RAG index, then refresh the displayed index status.
// Shows a success or error toast for the rebuild itself; a failure of the
// follow-up status refresh is only logged.
const handleRebuildRag = async () => {
  setRagRebuilding(true);
  try {
    const result = await backendApi.rebuildRAGIndex();
    if (!result.success) {
      // Report a backend-side failure instead of ending silently.
      toast.error(result.message || '重建索引失败');
      return;
    }
    toast.success(result.message || '索引重建成功');

    // Refresh the status separately: the rebuild has already succeeded, so
    // a failing status request must not be reported as a rebuild failure.
    try {
      const status = await backendApi.getRAGStatus();
      if (status.success) {
        setRagStatus({ vector_count: status.vector_count, collections: status.collections });
      }
    } catch (refreshErr) {
      console.error('获取 RAG 状态失败:', refreshErr);
    }
  } catch (err: any) {
    toast.error(err?.message || '重建索引失败');
  } finally {
    setRagRebuilding(false);
  }
};
|
||||||
|
|
||||||
// 文件上传处理
|
// 文件上传处理
|
||||||
const onDrop = async (acceptedFiles: File[]) => {
|
const onDrop = async (acceptedFiles: File[]) => {
|
||||||
const file = acceptedFiles[0];
|
const file = acceptedFiles[0];
|
||||||
@@ -688,7 +750,7 @@ const Documents: React.FC = () => {
|
|||||||
<SelectValue />
|
<SelectValue />
|
||||||
</SelectTrigger>
|
</SelectTrigger>
|
||||||
<SelectContent>
|
<SelectContent>
|
||||||
{analysisTypes.map(type => (
|
{(analysisTypes || []).map(type => (
|
||||||
<SelectItem key={type.value} value={type.value}>
|
<SelectItem key={type.value} value={type.value}>
|
||||||
<div className="flex items-center gap-2">
|
<div className="flex items-center gap-2">
|
||||||
{getAnalysisIcon(type.value)}
|
{getAnalysisIcon(type.value)}
|
||||||
@@ -851,9 +913,9 @@ const Documents: React.FC = () => {
|
|||||||
</div>
|
</div>
|
||||||
</CardHeader>
|
</CardHeader>
|
||||||
<CardContent className="max-h-[400px] overflow-y-auto">
|
<CardContent className="max-h-[400px] overflow-y-auto">
|
||||||
{aiAnalysis.analysis?.sheets ? (
|
{aiAnalysis.analysis?.sheets && typeof aiAnalysis.analysis.sheets === 'object' ? (
|
||||||
<div className="space-y-4">
|
<div className="space-y-4">
|
||||||
{Object.entries(aiAnalysis.analysis.sheets).map(([sheetName, result]: [string, any]) => (
|
{Object.entries(aiAnalysis.analysis.sheets || {}).map(([sheetName, result]: [string, any]) => (
|
||||||
<div key={sheetName} className="p-4 bg-muted/30 rounded-xl">
|
<div key={sheetName} className="p-4 bg-muted/30 rounded-xl">
|
||||||
<div className="flex items-center gap-2 mb-2">
|
<div className="flex items-center gap-2 mb-2">
|
||||||
<FileSpreadsheet size={16} className="text-primary" />
|
<FileSpreadsheet size={16} className="text-primary" />
|
||||||
@@ -940,7 +1002,7 @@ const Documents: React.FC = () => {
|
|||||||
<Table className="text-primary" size={20} />
|
<Table className="text-primary" size={20} />
|
||||||
数据预览
|
数据预览
|
||||||
</CardTitle>
|
</CardTitle>
|
||||||
<CardDescription>{parseResult.data.sheets ? '所有工作表数据' : '工作表数据'}</CardDescription>
|
<CardDescription>{parseResult?.data?.sheets ? '所有工作表数据' : '工作表数据'}</CardDescription>
|
||||||
</div>
|
</div>
|
||||||
<Button variant="outline" size="sm" onClick={openExportDialog} className="gap-2">
|
<Button variant="outline" size="sm" onClick={openExportDialog} className="gap-2">
|
||||||
<Download size={14} />导出
|
<Download size={14} />导出
|
||||||
@@ -948,9 +1010,9 @@ const Documents: React.FC = () => {
|
|||||||
</div>
|
</div>
|
||||||
</CardHeader>
|
</CardHeader>
|
||||||
<CardContent>
|
<CardContent>
|
||||||
{parseResult.data.sheets ? (
|
{parseResult?.data?.sheets && typeof parseResult.data.sheets === 'object' ? (
|
||||||
<div className="space-y-4">
|
<div className="space-y-4">
|
||||||
{Object.entries(parseResult.data.sheets).map(([sheetName, sheetData]: [string, any]) => (
|
{Object.entries(parseResult.data.sheets || {}).map(([sheetName, sheetData]: [string, any]) => (
|
||||||
<div key={sheetName} className="border rounded-xl overflow-hidden">
|
<div key={sheetName} className="border rounded-xl overflow-hidden">
|
||||||
<button
|
<button
|
||||||
onClick={() => setExpandedSheet(expandedSheet === sheetName ? null : sheetName)}
|
onClick={() => setExpandedSheet(expandedSheet === sheetName ? null : sheetName)}
|
||||||
@@ -972,12 +1034,89 @@ const Documents: React.FC = () => {
|
|||||||
))}
|
))}
|
||||||
</div>
|
</div>
|
||||||
) : (
|
) : (
|
||||||
<DataTable columns={parseResult.data.columns || []} rows={parseResult.data.rows || []} />
|
<DataTable columns={parseResult?.data?.columns || []} rows={parseResult?.data?.rows || []} />
|
||||||
)}
|
)}
|
||||||
</CardContent>
|
</CardContent>
|
||||||
</Card>
|
</Card>
|
||||||
)}
|
)}
|
||||||
|
|
||||||
|
{/* RAG 向量检索 */}
|
||||||
|
<Card className="border-none shadow-md bg-gradient-to-br from-violet-500/5 to-cyan-500/5">
|
||||||
|
<CardHeader className="pb-4">
|
||||||
|
<div className="flex items-center justify-between">
|
||||||
|
<div className="space-y-1">
|
||||||
|
<CardTitle className="flex items-center gap-2">
|
||||||
|
<Brain className="text-violet-500" size={20} />
|
||||||
|
RAG 向量检索
|
||||||
|
</CardTitle>
|
||||||
|
<CardDescription>
|
||||||
|
向量索引: {(ragStatus?.vector_count) || 0} 条
|
||||||
|
{ragStatus?.collections && ragStatus.collections.length > 0 && ` | 集合: ${ragStatus.collections.join(', ')}`}
|
||||||
|
</CardDescription>
|
||||||
|
</div>
|
||||||
|
<Button
|
||||||
|
variant="outline"
|
||||||
|
size="sm"
|
||||||
|
onClick={handleRebuildRag}
|
||||||
|
disabled={ragRebuilding}
|
||||||
|
>
|
||||||
|
{ragRebuilding ? <Loader2 className="mr-2 h-4 w-4 animate-spin" /> : <RefreshCcw className="mr-2 h-4 w-4" />}
|
||||||
|
重建索引
|
||||||
|
</Button>
|
||||||
|
</div>
|
||||||
|
</CardHeader>
|
||||||
|
<CardContent className="space-y-4">
|
||||||
|
{/* 搜索框 */}
|
||||||
|
<div className="flex gap-2">
|
||||||
|
<Input
|
||||||
|
placeholder="输入查询内容,例如:查询去年销售额最高的客户..."
|
||||||
|
value={ragSearchQuery}
|
||||||
|
onChange={(e) => setRagSearchQuery(e.target.value)}
|
||||||
|
onKeyDown={(e) => e.key === 'Enter' && handleRagSearch()}
|
||||||
|
className="flex-1"
|
||||||
|
/>
|
||||||
|
<Button onClick={handleRagSearch} disabled={ragSearching}>
|
||||||
|
{ragSearching ? <Loader2 className="h-4 w-4 animate-spin" /> : <Search className="h-4 w-4" />}
|
||||||
|
</Button>
|
||||||
|
</div>
|
||||||
|
{/* 搜索结果 */}
|
||||||
|
{(ragResults?.length ?? 0) > 0 && (
|
||||||
|
<div className="space-y-3">
|
||||||
|
<Label className="text-sm font-medium">检索结果</Label>
|
||||||
|
{(ragResults || []).map((result, index) => (
|
||||||
|
<div key={index} className="p-4 rounded-xl border bg-card hover:bg-muted/30 transition-colors">
|
||||||
|
<div className="flex items-start justify-between gap-2 mb-2">
|
||||||
|
<Badge variant="outline" className="text-xs">
|
||||||
|
相似度: {(result.score * 100).toFixed(1)}%
|
||||||
|
</Badge>
|
||||||
|
{result.metadata?.table_name && (
|
||||||
|
<Badge variant="secondary" className="text-xs">
|
||||||
|
{result.metadata.table_name}
|
||||||
|
</Badge>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
<p className="text-sm whitespace-pre-wrap">{result.content}</p>
|
||||||
|
{result.metadata && (
|
||||||
|
<div className="flex gap-2 mt-2 flex-wrap">
|
||||||
|
{result.metadata.field_name && (
|
||||||
|
<span className="text-xs text-muted-foreground">
|
||||||
|
字段: {result.metadata.field_name}
|
||||||
|
</span>
|
||||||
|
)}
|
||||||
|
{result.metadata.filename && (
|
||||||
|
<span className="text-xs text-muted-foreground">
|
||||||
|
文件: {result.metadata.filename}
|
||||||
|
</span>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
))}
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
</CardContent>
|
||||||
|
</Card>
|
||||||
|
|
||||||
{/* 文档列表 */}
|
{/* 文档列表 */}
|
||||||
<Card className="border-none shadow-md">
|
<Card className="border-none shadow-md">
|
||||||
<CardHeader>
|
<CardHeader>
|
||||||
@@ -1002,9 +1141,9 @@ const Documents: React.FC = () => {
|
|||||||
{/* 文档列表 */}
|
{/* 文档列表 */}
|
||||||
{loading ? (
|
{loading ? (
|
||||||
<div className="space-y-3">{[1, 2, 3].map(i => <Skeleton key={i} className="h-16 w-full rounded-xl" />)}</div>
|
<div className="space-y-3">{[1, 2, 3].map(i => <Skeleton key={i} className="h-16 w-full rounded-xl" />)}</div>
|
||||||
) : filteredDocs.length > 0 ? (
|
) : (filteredDocs?.length ?? 0) > 0 ? (
|
||||||
<div className="space-y-3">
|
<div className="space-y-3">
|
||||||
{filteredDocs.map(doc => (
|
{(filteredDocs || []).map(doc => (
|
||||||
<div key={doc.doc_id} className="flex items-center gap-4 p-4 rounded-xl border border-transparent hover:bg-muted/30 transition-all group">
|
<div key={doc.doc_id} className="flex items-center gap-4 p-4 rounded-xl border border-transparent hover:bg-muted/30 transition-all group">
|
||||||
<div className={cn(
|
<div className={cn(
|
||||||
"w-10 h-10 rounded-lg flex items-center justify-center shrink-0",
|
"w-10 h-10 rounded-lg flex items-center justify-center shrink-0",
|
||||||
|
|||||||
Reference in New Issue
Block a user