添加XML回退解析机制支持复杂Excel文件

当pandas无法解析某些包含非标准元素的Excel文件时，添加了XML直接解析功能来提取工作表名称和数据。

- 实现了`_extract_sheet_names_from_xml`方法，从XML提取工作表名称
- 实现了`_read_excel_sheet_xml`方法，直接解析Excel XML数据
- 添加多种命名空间支持以处理不同Excel格式
- 在pandas解析失败时自动回退到XML解析方式

fix(excel-storage-service): 修复XML解析中的命名空间问题

改进了XML解析逻辑，添加对多种命名空间的支持，使用通配符查找元素以兼容不同Excel文件格式。

refactor(table-rag-service): 优化XML解析逻辑，提高兼容性

统一了XML解析的命名空间处理方式，改进了元素查找逻辑以更好地支持不同Excel格式。

feat(frontend): 添加RAG向量检索和索引重建功能

- 实现了RAG状态查看、搜索和索引重建接口
- 添加了前端RAG检索界面组件
- 增加了错误处理和加载状态提示
2026-04-08 19:21:40 +08:00
parent 41e5eaaa2d
commit 3b82103e87
6 changed files with 523 additions and 145 deletions
--- a/backend/app/core/document_parser/xlsx_parser.py
+++ b/backend/app/core/document_parser/xlsx_parser.py
@@ -67,11 +67,14 @@ class XlsxParser(BaseParser):
            xls_file = pd.ExcelFile(file_path)
            sheet_names = xls_file.sheet_names

+            # 如果 pandas 返回空列表，尝试从 XML 提取
            if not sheet_names:
-                return ParseResult(
-                    success=False,
-                    error=f"Excel 文件没有找到任何工作表: {file_path}"
-                )
+                sheet_names = self._extract_sheet_names_from_xml(file_path)
+                if not sheet_names:
+                    return ParseResult(
+                        success=False,
+                        error=f"Excel 文件没有找到任何工作表: {file_path}"
+                    )

            # 验证请求的工作表索引/名称
            target_sheet = None
@@ -88,15 +91,21 @@ class XlsxParser(BaseParser):
                target_sheet = sheet_names[0]

            # 读取 Excel 文件
-            df = pd.read_excel(
-                file_path,
-                sheet_name=target_sheet,
-                header=header_row,
-                **kwargs
-            )
+            df = None
+            try:
+                df = pd.read_excel(
+                    file_path,
+                    sheet_name=target_sheet,
+                    header=header_row,
+                    **kwargs
+                )
+            except Exception as e:
+                logger.warning(f"pandas 读取 Excel 失败，尝试 XML 方式: {e}")
+                # pandas 读取失败，尝试 XML 方式
+                df = self._read_excel_sheet_xml(file_path, sheet_name=target_sheet, header_row=header_row)

            # 检查 DataFrame 是否为空
-            if df.empty:
+            if df is None or df.empty:
                return ParseResult(
                    success=False,
                    error=f"工作表 '{target_sheet}' 为空，请检查 Excel 文件内容"
@@ -211,7 +220,26 @@ class XlsxParser(BaseParser):

        try:
            # 读取所有工作表
-            all_data = pd.read_excel(file_path, sheet_name=None, **kwargs)
+            all_data = None
+            try:
+                all_data = pd.read_excel(file_path, sheet_name=None, **kwargs)
+            except Exception as e:
+                logger.warning(f"pandas 读取所有工作表失败: {e}")
+
+            # 如果 pandas 失败，尝试 XML 方式
+            if all_data is None or len(all_data) == 0:
+                sheet_names = self._extract_sheet_names_from_xml(file_path)
+                if not sheet_names:
+                    return ParseResult(
+                        success=False,
+                        error=f"无法读取 Excel 文件或文件为空: {file_path}"
+                    )
+                # 使用 XML 方式读取每个工作表
+                all_data = {}
+                for sheet_name in sheet_names:
+                    df = self._read_excel_sheet_xml(file_path, sheet_name=sheet_name, header_row=0)
+                    if df is not None and not df.empty:
+                        all_data[sheet_name] = df

            # 检查是否成功读取到数据
            if not all_data or len(all_data) == 0:
@@ -257,13 +285,149 @@ class XlsxParser(BaseParser):
        try:
            xls = pd.ExcelFile(file_path)
            sheet_names = xls.sheet_names
-            if not sheet_names:
-                return []
-            return sheet_names
+            if sheet_names:
+                return sheet_names
+            # pandas 返回空列表，尝试从 XML 提取
+            return self._extract_sheet_names_from_xml(file_path)
        except Exception as e:
            logger.error(f"获取工作表名称失败: {str(e)}")
+            # 尝试从 XML 提取
+            return self._extract_sheet_names_from_xml(file_path)
+
+    def _extract_sheet_names_from_xml(self, file_path: str) -> List[str]:
+        """
+        从 Excel 文件的 XML 中提取工作表名称
+
+        某些 Excel 文件由于包含非标准元素（如 mc:AlternateContent），
+        pandas/openpyxl 无法正确解析工作表列表，此时需要直接从 XML 中提取。
+
+        Args:
+            file_path: Excel 文件路径
+
+        Returns:
+            工作表名称列表
+        """
+        import zipfile
+        from xml.etree import ElementTree as ET
+
+        try:
+            with zipfile.ZipFile(file_path, 'r') as z:
+                if 'xl/workbook.xml' not in z.namelist():
+                    return []
+                content = z.read('xl/workbook.xml')
+                root = ET.fromstring(content)
+
+                # 命名空间
+                ns = {'main': 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'}
+
+                sheet_names = []
+                for sheet in root.findall('.//main:sheet', ns):
+                    name = sheet.get('name')
+                    if name:
+                        sheet_names.append(name)
+
+                logger.info(f"从 XML 提取工作表: {sheet_names}")
+                return sheet_names
+        except Exception as e:
+            logger.error(f"从 XML 提取工作表名称失败: {e}")
            return []

+    def _read_excel_sheet_xml(self, file_path: str, sheet_name: str = None, header_row: int = 0) -> pd.DataFrame:
+        """
+        从 XML 直接读取 Excel 工作表数据
+
+        当 pandas 无法正确解析时使用此方法。
+
+        Args:
+            file_path: Excel 文件路径
+            sheet_name: 工作表名称（如果为 None，读取第一个工作表）
+            header_row: 表头行号（0-indexed）
+
+        Returns:
+            DataFrame
+        """
+        import zipfile
+        from xml.etree import ElementTree as ET
+
+        with zipfile.ZipFile(file_path, 'r') as z:
+            # 获取工作表名称
+            sheet_names = self._extract_sheet_names_from_xml(file_path)
+            if not sheet_names:
+                raise ValueError("无法从 Excel 文件中找到工作表")
+
+            # 确定要读取的工作表
+            target_sheet = sheet_name if sheet_name and sheet_name in sheet_names else sheet_names[0]
+            sheet_index = sheet_names.index(target_sheet) + 1  # sheet1.xml, sheet2.xml, ...
+
+            # 读取 shared strings
+            shared_strings = []
+            if 'xl/sharedStrings.xml' in z.namelist():
+                ss_content = z.read('xl/sharedStrings.xml')
+                ss_root = ET.fromstring(ss_content)
+                ns = {'main': 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'}
+                for si in ss_root.findall('.//main:si', ns):
+                    t = si.find('.//main:t', ns)
+                    if t is not None:
+                        shared_strings.append(t.text or '')
+                    else:
+                        shared_strings.append('')
+
+            # 读取工作表
+            sheet_file = f'xl/worksheets/sheet{sheet_index}.xml'
+            if sheet_file not in z.namelist():
+                raise ValueError(f"工作表文件 {sheet_file} 不存在")
+
+            sheet_content = z.read(sheet_file)
+            root = ET.fromstring(sheet_content)
+            ns = {'main': 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'}
+
+            # 收集所有行数据
+            all_rows = []
+            headers = {}
+
+            for row in root.findall('.//main:row', ns):
+                row_idx = int(row.get('r', 0))
+                row_cells = {}
+                for cell in row.findall('main:c', ns):
+                    cell_ref = cell.get('r', '')
+                    col_letters = ''.join(filter(str.isalpha, cell_ref))
+                    cell_type = cell.get('t', 'n')
+                    v = cell.find('main:v', ns)
+
+                    if v is not None and v.text:
+                        if cell_type == 's':
+                            # shared string
+                            try:
+                                row_cells[col_letters] = shared_strings[int(v.text)]
+                            except (ValueError, IndexError):
+                                row_cells[col_letters] = v.text
+                        elif cell_type == 'b':
+                            # boolean
+                            row_cells[col_letters] = v.text == '1'
+                        else:
+                            row_cells[col_letters] = v.text
+                    else:
+                        row_cells[col_letters] = None
+
+                # 处理表头行
+                if row_idx == header_row + 1:
+                    headers = {**row_cells}
+                elif row_idx > header_row + 1:
+                    all_rows.append(row_cells)
+
+            # 构建 DataFrame
+            if headers:
+                # 按原始列顺序排列
+                col_order = list(headers.keys())
+                df = pd.DataFrame(all_rows)
+                if not df.empty:
+                    df = df[col_order]
+                df.columns = [headers.get(col, col) for col in df.columns]
+            else:
+                df = pd.DataFrame(all_rows)
+
+            return df
+
    def _df_to_dict(self, df: pd.DataFrame) -> Dict[str, Any]:
        """
        将 DataFrame 转换为字典，处理 NaN 值
--- a/backend/app/services/excel_storage_service.py
+++ b/backend/app/services/excel_storage_service.py
@@ -45,8 +45,25 @@ class ExcelStorageService:
                    return []
                content = z.read('xl/workbook.xml')
                root = ET.fromstring(content)
-                ns = {'main': 'http://purl.oclc.org/ooxml/spreadsheetml/main'}
-                sheets = root.findall('.//main:sheet', ns)
+
+                # 尝试多种命名空间
+                namespaces = [
+                    'http://schemas.openxmlformats.org/spreadsheetml/2006/main',
+                    'http://purl.oclc.org/ooxml/spreadsheetml/main',
+                ]
+
+                for ns_uri in namespaces:
+                    ns = {'main': ns_uri}
+                    sheets = root.findall('.//main:sheet', ns)
+                    if sheets:
+                        names = [s.get('name') for s in sheets if s.get('name')]
+                        if names:
+                            return names
+
+                # 尝试通配符
+                sheets = root.findall('.//{*}sheet')
+                if not sheets:
+                    sheets = root.findall('.//sheet')
                return [s.get('name') for s in sheets if s.get('name')]
        except Exception:
            return []
@@ -79,72 +96,77 @@ class ExcelStorageService:
                if 'xl/sharedStrings.xml' in z.namelist():
                    ss_content = z.read('xl/sharedStrings.xml')
                    ss_root = ET.fromstring(ss_content)
-                    ns = {'main': 'http://purl.oclc.org/ooxml/spreadsheetml/main'}
-                    for si in ss_root.findall('.//main:si', ns):
-                        t = si.find('.//main:t', ns)
-                        shared_strings.append(t.text if t is not None else '')
+                    for si in ss_root.iter():
+                        if si.tag.endswith('}si') or si.tag == 'si':
+                            t = si.find('.//{*}t')
+                            shared_strings.append(t.text if t is not None and t.text else '')

                sheet_file = f'xl/worksheets/sheet{sheet_index}.xml'
                sheet_content = z.read(sheet_file)
                root = ET.fromstring(sheet_content)
-                ns = {'main': 'http://purl.oclc.org/ooxml/spreadsheetml/main'}

                rows_data = []
-                for row in root.findall('.//main:row', ns):
-                    row_idx = int(row.get('r', 0))
-                    if row_idx <= header_row + 1:
-                        continue
+                headers = {}

-                    row_cells = {}
-                    for cell in row.findall('main:c', ns):
-                        cell_ref = cell.get('r', '')
-                        col_letters = ''.join(filter(str.isalpha, cell_ref))
-                        cell_type = cell.get('t', 'n')
-                        v = cell.find('main:v', ns)
+                for row in root.iter():
+                    if row.tag.endswith('}row') or row.tag == 'row':
+                        row_idx = int(row.get('r', 0))

-                        if v is not None and v.text:
-                            if cell_type == 's':
-                                try:
-                                    val = shared_strings[int(v.text)]
-                                except (ValueError, IndexError):
-                                    val = v.text
-                            elif cell_type == 'b':
-                                val = v.text == '1'
-                            else:
-                                val = v.text
-                        else:
-                            val = None
-                        row_cells[col_letters] = val
+                        # 收集表头行
+                        if row_idx == header_row + 1:
+                            for cell in row:
+                                if cell.tag.endswith('}c') or cell.tag == 'c':
+                                    cell_ref = cell.get('r', '')
+                                    col_letters = ''.join(filter(str.isalpha, cell_ref))
+                                    cell_type = cell.get('t', 'n')
+                                    v = cell.find('{*}v')
+                                    if v is not None and v.text:
+                                        if cell_type == 's':
+                                            try:
+                                                headers[col_letters] = shared_strings[int(v.text)]
+                                            except (ValueError, IndexError):
+                                                headers[col_letters] = v.text
+                                        else:
+                                            headers[col_letters] = v.text
+                                    else:
+                                        headers[col_letters] = col_letters
+                            continue

-                    if row_cells:
-                        rows_data.append(row_cells)
+                        if row_idx <= header_row + 1:
+                            continue
+
+                        row_cells = {}
+                        for cell in row:
+                            if cell.tag.endswith('}c') or cell.tag == 'c':
+                                cell_ref = cell.get('r', '')
+                                col_letters = ''.join(filter(str.isalpha, cell_ref))
+                                cell_type = cell.get('t', 'n')
+                                v = cell.find('{*}v')
+
+                                if v is not None and v.text:
+                                    if cell_type == 's':
+                                        try:
+                                            val = shared_strings[int(v.text)]
+                                        except (ValueError, IndexError):
+                                            val = v.text
+                                    elif cell_type == 'b':
+                                        val = v.text == '1'
+                                    else:
+                                        val = v.text
+                                else:
+                                    val = None
+                                row_cells[col_letters] = val
+
+                        if row_cells:
+                            rows_data.append(row_cells)

                if not rows_data:
                    return pd.DataFrame()

                df = pd.DataFrame(rows_data)

-                if header_row >= 0:
-                    first_row_sheet = f'xl/worksheets/sheet{sheet_index}.xml'
-                    sheet_content = z.read(first_row_sheet)
-                    root = ET.fromstring(sheet_content)
-                    first_row = root.find(f'.//main:row[@r="{header_row + 1}"]', ns)
-                    if first_row is not None:
-                        headers = {}
-                        for cell in first_row.findall('main:c', ns):
-                            cell_ref = cell.get('r', '')
-                            col_letters = ''.join(filter(str.isalpha, cell_ref))
-                            cell_type = cell.get('t', 'n')
-                            v = cell.find('main:v', ns)
-                            if v is not None and v.text:
-                                if cell_type == 's':
-                                    try:
-                                        headers[col_letters] = shared_strings[int(v.text)]
-                                    except (ValueError, IndexError):
-                                        headers[col_letters] = v.text
-                                else:
-                                    headers[col_letters] = v.text
-                        df.columns = [headers.get(col, col) for col in df.columns]
+                if headers:
+                    df.columns = [headers.get(col, col) for col in df.columns]

                return df
        except Exception as e:
--- a/backend/app/services/table_rag_service.py
+++ b/backend/app/services/table_rag_service.py
@@ -47,6 +47,12 @@ class TableRAGService:
        import zipfile
        from xml.etree import ElementTree as ET

+        # 尝试多种命名空间
+        namespaces = [
+            'http://schemas.openxmlformats.org/spreadsheetml/2006/main',
+            'http://purl.oclc.org/ooxml/spreadsheetml/main',
+        ]
+
        try:
            with zipfile.ZipFile(file_path, 'r') as z:
                # 读取 workbook.xml
@@ -56,12 +62,27 @@ class TableRAGService:
                content = z.read('xl/workbook.xml')
                root = ET.fromstring(content)

-                # 定义命名空间
-                ns = {'main': 'http://purl.oclc.org/ooxml/spreadsheetml/main'}
+                # 尝试多种命名空间
+                for ns_uri in namespaces:
+                    ns = {'main': ns_uri}
+                    sheets = root.findall('.//main:sheet', ns)
+                    if sheets:
+                        names = [s.get('name') for s in sheets if s.get('name')]
+                        if names:
+                            logger.info(f"使用命名空间 {ns_uri} 提取到工作表: {names}")
+                            return names

-                # 提取所有 sheet 的 name 属性
-                sheets = root.findall('.//main:sheet', ns)
-                return [s.get('name') for s in sheets if s.get('name')]
+                # 如果都没找到，尝试不带命名空间
+                sheets = root.findall('.//sheet')
+                if not sheets:
+                    sheets = root.findall('.//{*}sheet')
+                names = [s.get('name') for s in sheets if s.get('name')]
+                if names:
+                    logger.info(f"使用通配符提取到工作表: {names}")
+                    return names
+
+                logger.warning(f"无法从 XML 提取工作表，尝试的文件: {file_path}")
+                return []

        except Exception as e:
            logger.warning(f"从 XML 提取工作表失败: {file_path}, error: {e}")
@@ -84,6 +105,12 @@ class TableRAGService:
        import zipfile
        from xml.etree import ElementTree as ET

+        # 定义命名空间
+        namespaces = [
+            'http://schemas.openxmlformats.org/spreadsheetml/2006/main',
+            'http://purl.oclc.org/ooxml/spreadsheetml/main',
+        ]
+
        try:
            # 先尝试用 pandas 正常读取
            df = pd.read_excel(file_path, sheet_name=sheet_name, header=header_row)
@@ -111,13 +138,14 @@ class TableRAGService:
                if 'xl/sharedStrings.xml' in z.namelist():
                    ss_content = z.read('xl/sharedStrings.xml')
                    ss_root = ET.fromstring(ss_content)
-                    ns = {'main': 'http://purl.oclc.org/ooxml/spreadsheetml/main'}
-                    for si in ss_root.findall('.//main:si', ns):
-                        t = si.find('.//main:t', ns)
-                        if t is not None:
-                            shared_strings.append(t.text or '')
-                        else:
-                            shared_strings.append('')
+                    # 使用通配符查找所有 si 元素
+                    for si in ss_root.iter():
+                        if si.tag.endswith('}si') or si.tag == 'si':
+                            t = si.find('.//{*}t')
+                            if t is not None and t.text:
+                                shared_strings.append(t.text)
+                            else:
+                                shared_strings.append('')

                # 读取工作表
                sheet_file = f'xl/worksheets/sheet{sheet_index}.xml'
@@ -126,75 +154,75 @@ class TableRAGService:

                sheet_content = z.read(sheet_file)
                root = ET.fromstring(sheet_content)
-                ns = {'main': 'http://purl.oclc.org/ooxml/spreadsheetml/main'}

-                # 解析行
+                # 解析行 - 使用通配符查找
                rows_data = []
-                for row in root.findall('.//main:row', ns):
-                    row_idx = int(row.get('r', 0))
-                    # header_row 是 0-indexed，row_idx 是 1-indexed
-                    # 如果 header_row=0 表示第一行是表头，需要跳过 row_idx=1
-                    if row_idx <= header_row + 1:
-                        continue  # 跳过表头行
+                headers = {}

-                    row_cells = {}
-                    for cell in row.findall('main:c', ns):
-                        cell_ref = cell.get('r', '')
-                        col_letters = ''.join(filter(str.isalpha, cell_ref))
-                        cell_type = cell.get('t', 'n')
-                        v = cell.find('main:v', ns)
+                for row in root.iter():
+                    if row.tag.endswith('}row') or row.tag == 'row':
+                        row_idx = int(row.get('r', 0))

-                        if v is not None and v.text:
-                            if cell_type == 's':
-                                # shared string
-                                try:
-                                    val = shared_strings[int(v.text)]
-                                except (ValueError, IndexError):
-                                    val = v.text
-                            elif cell_type == 'b':
-                                # boolean
-                                val = v.text == '1'
-                            else:
-                                # number or other
-                                val = v.text
-                        else:
-                            val = None
+                        # 收集表头行
+                        if row_idx == header_row + 1:
+                            for cell in row:
+                                if cell.tag.endswith('}c') or cell.tag == 'c':
+                                    cell_ref = cell.get('r', '')
+                                    col_letters = ''.join(filter(str.isalpha, cell_ref))
+                                    cell_type = cell.get('t', 'n')
+                                    v = cell.find('{*}v')
+                                    if v is not None and v.text:
+                                        if cell_type == 's':
+                                            try:
+                                                headers[col_letters] = shared_strings[int(v.text)]
+                                            except (ValueError, IndexError):
+                                                headers[col_letters] = v.text
+                                        else:
+                                            headers[col_letters] = v.text
+                                    else:
+                                        headers[col_letters] = col_letters
+                            continue

-                        row_cells[col_letters] = val
+                        # 跳过表头行之后的数据行
+                        if row_idx <= header_row + 1:
+                            continue

-                    if row_cells:
-                        rows_data.append(row_cells)
+                        row_cells = {}
+                        for cell in row:
+                            if cell.tag.endswith('}c') or cell.tag == 'c':
+                                cell_ref = cell.get('r', '')
+                                col_letters = ''.join(filter(str.isalpha, cell_ref))
+                                cell_type = cell.get('t', 'n')
+                                v = cell.find('{*}v')
+
+                                if v is not None and v.text:
+                                    if cell_type == 's':
+                                        try:
+                                            val = shared_strings[int(v.text)]
+                                        except (ValueError, IndexError):
+                                            val = v.text
+                                    elif cell_type == 'b':
+                                        val = v.text == '1'
+                                    else:
+                                        val = v.text
+                                else:
+                                    val = None
+
+                                row_cells[col_letters] = val
+
+                        if row_cells:
+                            rows_data.append(row_cells)

                # 转换为 DataFrame
                if not rows_data:
+                    logger.warning(f"XML 解析结果为空: {file_path}, sheet: {target_sheet}")
                    return pd.DataFrame()

                df = pd.DataFrame(rows_data)

-                # 如果有 header_row，重新设置列名
-                if header_row >= 0:
-                    # 重新读取第一行作为表头
-                    first_row_sheet = f'xl/worksheets/sheet{sheet_index}.xml'
-                    sheet_content = z.read(first_row_sheet)
-                    root = ET.fromstring(sheet_content)
-                    first_row = root.find(f'.//main:row[@r="{header_row + 1}"]', ns)
-                    if first_row is not None:
-                        headers = {}
-                        for cell in first_row.findall('main:c', ns):
-                            cell_ref = cell.get('r', '')
-                            col_letters = ''.join(filter(str.isalpha, cell_ref))
-                            cell_type = cell.get('t', 'n')
-                            v = cell.find('main:v', ns)
-                            if v is not None and v.text:
-                                if cell_type == 's':
-                                    try:
-                                        headers[col_letters] = shared_strings[int(v.text)]
-                                    except (ValueError, IndexError):
-                                        headers[col_letters] = v.text
-                                else:
-                                    headers[col_letters] = v.text
-                        # 重命名列
-                        df.columns = [headers.get(col, col) for col in df.columns]
+                # 应用表头
+                if headers:
+                    df.columns = [headers.get(col, col) for col in df.columns]

                logger.info(f"XML 解析完成: {len(df)} 行, {len(df.columns)} 列")
                return df