feat(excel): 添加对特殊Excel文件的XML解析支持

添加了从Excel文件XML直接解析工作表名称和数据的功能，以支持pandas无法正确解析的特殊格式Excel文件。同时更新了.gitignore文件，添加了更多忽略规则。修复了markdown AI服务中的正则表达式模式匹配问题。
2026-04-02 13:19:00 +08:00
parent d189ea9620
commit 7c19e49988
4 changed files with 338 additions and 10 deletions
--- a/backend/app/services/excel_storage_service.py
+++ b/backend/app/services/excel_storage_service.py
@@ -34,6 +34,123 @@ class ExcelStorageService:
    def __init__(self):
        self.mysql_db = mysql_db

+    def _extract_sheet_names_from_xml(self, file_path: str) -> list:
+        """Return worksheet names read directly from ``xl/workbook.xml``.
+
+        Elements are matched by local tag name, so both the Strict
+        (``purl.oclc.org``) and the Transitional
+        (``schemas.openxmlformats.org``) SpreadsheetML namespaces work.
+
+        Args:
+            file_path: Path to the ``.xlsx`` archive.
+
+        Returns:
+            Sheet names in document order; an empty list when the archive
+            cannot be opened or parsed, or has no ``xl/workbook.xml``.
+        """
+        import zipfile
+        from xml.etree import ElementTree as ET
+
+        try:
+            with zipfile.ZipFile(file_path, 'r') as z:
+                if 'xl/workbook.xml' not in z.namelist():
+                    return []
+                root = ET.fromstring(z.read('xl/workbook.xml'))
+        except Exception as e:
+            # Best-effort helper: report the problem instead of hiding it.
+            logger.warning(f"从 XML 提取工作表失败: {file_path}, error: {e}")
+            return []
+
+        # Compare local names only; the namespace URI differs between
+        # Strict and Transitional OOXML files.
+        return [
+            node.get('name')
+            for node in root.iter()
+            if node.tag.rsplit('}', 1)[-1] == 'sheet' and node.get('name')
+        ]
+
+    def _read_excel_sheet(self, file_path: str, sheet_name: str = None, header_row: int = 0) -> pd.DataFrame:
+        """Read one worksheet, falling back to raw XML when pandas cannot.
+
+        pandas is tried first. If it raises or yields an empty frame, the
+        ``.xlsx`` archive is parsed directly: the worksheet part is resolved
+        through ``xl/_rels/workbook.xml.rels`` (falling back to the positional
+        ``sheetN.xml`` name), and elements are matched by local tag name so
+        both Strict and Transitional SpreadsheetML namespaces work.
+
+        Args:
+            file_path: Path to the Excel file.
+            sheet_name: Worksheet to read; the first worksheet is used when
+                this is falsy or not present in the workbook.
+            header_row: 0-based index of the header row; ``None`` means the
+                sheet has no header row.
+
+        Returns:
+            The worksheet as a DataFrame. In the XML fallback every
+            non-boolean cell keeps the raw text stored in the file, columns
+            follow spreadsheet order, and columns without a header value are
+            named by their column letters.
+
+        Raises:
+            Exception: Whatever the XML fallback raised (``ValueError`` when
+                no worksheet can be located), but only if pandas failed too;
+                otherwise the empty pandas result is returned instead.
+        """
+        import posixpath
+        import zipfile
+        from xml.etree import ElementTree as ET
+
+        pandas_df = None
+        try:
+            # sheet_name=None would make pandas return a dict of every sheet,
+            # so an unspecified sheet is mapped to the first one (index 0).
+            pandas_df = pd.read_excel(file_path, sheet_name=sheet_name or 0, header=header_row)
+            if pandas_df is not None and not pandas_df.empty:
+                return pandas_df
+        except Exception as e:
+            logger.warning(f"pandas 读取 Excel 失败，改用 XML 解析: {file_path}, error: {e}")
+
+        # pandas failed or returned nothing: parse the archive directly.
+        logger.info(f"使用 XML 方式读取 Excel: {file_path}")
+
+        def local(tag):
+            # Drop the "{namespace}" prefix of an element/attribute name.
+            return tag.rsplit('}', 1)[-1]
+
+        def children(elem, name):
+            return [child for child in elem if local(child.tag) == name]
+
+        def string_item(elem):
+            # Join plain <t> and rich-text <r><t> runs; phonetic runs are skipped.
+            parts = []
+            for child in elem:
+                kind = local(child.tag)
+                if kind == 't':
+                    parts.append(child.text or '')
+                elif kind == 'r':
+                    parts.extend(t.text or '' for t in children(child, 't'))
+            return ''.join(parts)
+
+        def column_number(letters):
+            # "A" -> 1, "Z" -> 26, "AA" -> 27.
+            number = 0
+            for ch in letters.upper():
+                number = number * 26 + (ord(ch) - ord('A') + 1)
+            return number
+
+        def column_letters(number):
+            # Inverse of column_number.
+            letters = ''
+            while number > 0:
+                number, rem = divmod(number - 1, 26)
+                letters = chr(ord('A') + rem) + letters
+            return letters
+
+        def cell_value(cell, shared_strings):
+            cell_type = cell.get('t', 'n')
+            if cell_type == 'inlineStr':
+                inline = children(cell, 'is')
+                return string_item(inline[0]) if inline else None
+            v = children(cell, 'v')
+            raw = v[0].text if v else None
+            if not raw:
+                return None
+            if cell_type == 's':
+                try:
+                    return shared_strings[int(raw)]
+                except (ValueError, IndexError):
+                    return raw
+            if cell_type == 'b':
+                return raw == '1'
+            return raw
+
+        # 1-based sheet row that holds the header (None: no header row).
+        header_index = None if header_row is None else header_row + 1
+
+        try:
+            with zipfile.ZipFile(file_path, 'r') as z:
+                members = set(z.namelist())
+                if 'xl/workbook.xml' not in members:
+                    raise ValueError("无法从 Excel 文件中找到工作表")
+
+                # Collect (name, relationship id) for every sheet, in order.
+                sheets = []
+                for node in ET.fromstring(z.read('xl/workbook.xml')).iter():
+                    if local(node.tag) == 'sheet' and node.get('name'):
+                        rel_id = next(
+                            (val for key, val in node.attrib.items()
+                             if key.startswith('{') and local(key) == 'id'),
+                            None,
+                        )
+                        sheets.append((node.get('name'), rel_id))
+                if not sheets:
+                    raise ValueError("无法从 Excel 文件中找到工作表")
+
+                names = [name for name, _ in sheets]
+                position = names.index(sheet_name) if sheet_name in names else 0
+
+                # Map relationship ids to archive member names.
+                rel_targets = {}
+                if 'xl/_rels/workbook.xml.rels' in members:
+                    for rel in ET.fromstring(z.read('xl/_rels/workbook.xml.rels')).iter():
+                        target = rel.get('Target')
+                        if local(rel.tag) == 'Relationship' and rel.get('Id') and target:
+                            if target.startswith('/'):
+                                rel_targets[rel.get('Id')] = target.lstrip('/')
+                            else:
+                                rel_targets[rel.get('Id')] = posixpath.normpath(posixpath.join('xl', target))
+
+                sheet_file = rel_targets.get(sheets[position][1])
+                if sheet_file not in members:
+                    # No usable relationship: fall back to the positional name.
+                    sheet_file = f'xl/worksheets/sheet{position + 1}.xml'
+                if sheet_file not in members:
+                    raise ValueError(f"工作表文件 {sheet_file} 不存在")
+
+                shared_strings = []
+                if 'xl/sharedStrings.xml' in members:
+                    ss_root = ET.fromstring(z.read('xl/sharedStrings.xml'))
+                    shared_strings = [string_item(si) for si in ss_root.iter() if local(si.tag) == 'si']
+
+                sheet_root = ET.fromstring(z.read(sheet_file))
+
+            # Single pass over the sheet: pick up the header row and data rows.
+            header_cells = {}
+            rows_data = []
+            last_row = 0
+            for row in sheet_root.iter():
+                if local(row.tag) != 'row':
+                    continue
+                # The "r" attribute is optional; without it rows are sequential.
+                row_idx = int(row.get('r') or last_row + 1)
+                last_row = row_idx
+
+                cells = {}
+                last_col = 0
+                for cell in children(row, 'c'):
+                    letters = ''.join(filter(str.isalpha, cell.get('r', '')))
+                    last_col = column_number(letters) if letters else last_col + 1
+                    cells[last_col] = cell_value(cell, shared_strings)
+
+                if row_idx == header_index:
+                    header_cells = cells
+                elif cells and (header_index is None or row_idx > header_index):
+                    rows_data.append(cells)
+
+            if not rows_data:
+                return pandas_df if pandas_df is not None else pd.DataFrame()
+
+            # Spreadsheet column order, independent of which row is seen first.
+            col_numbers = sorted({col for cells in rows_data for col in cells})
+            col_names = [
+                header_cells[col] if header_cells.get(col) is not None else column_letters(col)
+                for col in col_numbers
+            ]
+            return pd.DataFrame(
+                [[cells.get(col) for col in col_numbers] for cells in rows_data],
+                columns=col_names,
+            )
+        except Exception as e:
+            if pandas_df is not None:
+                # pandas did read the sheet (it was just empty); prefer that to failing.
+                logger.warning(f"XML 解析 Excel 失败，返回 pandas 读取结果: {e}")
+                return pandas_df
+            logger.error(f"XML 解析 Excel 失败: {e}")
+            raise
+
    def _sanitize_table_name(self, filename: str) -> str:
        """
        将文件名转换为合法的表名
@@ -227,11 +344,8 @@ class ExcelStorageService:

        try:
            logger.info(f"开始读取Excel文件: {file_path}")
-            # 读取 Excel
-            if sheet_name:
-                df = pd.read_excel(file_path, sheet_name=sheet_name, header=header_row)
-            else:
-                df = pd.read_excel(file_path, header=header_row)
+            # 读取 Excel（使用 fallback 方式支持特殊格式文件）
+            df = self._read_excel_sheet(file_path, sheet_name=sheet_name, header_row=header_row)

            logger.info(f"Excel读取完成，行数: {len(df)}, 列数: {len(df.columns)}")

--- a/backend/app/services/markdown_ai_service.py
+++ b/backend/app/services/markdown_ai_service.py
@@ -48,8 +48,8 @@ class MarkdownAIService:
    # 中文章节编号模式
    CHINESE_NUMBERS = ["一", "二", "三", "四", "五", "六", "七", "八", "九", "十"]
    CHINESE_SUFFIX = "、"
-    PARENTHESIS_PATTERN = re.compile(r'^（([一二三四五六七八九十]+)）\s*(.+)$')
-    CHINESE_SECTION_PATTERN = re.compile(r'^([一二三四五六七八九十]+）、\s*(.+)$')
+    # The closing bracket (full-width or ASCII) is required and is kept out of the title group.
+    PARENTHESIS_PATTERN = re.compile(r'^（([一二三四五六七八九十]+)[）)]\s*(.+)$')
+    CHINESE_SECTION_PATTERN = re.compile(r'^([一二三四五六七八九十]+)、\s*(.+)$')
    ARABIC_SECTION_PATTERN = re.compile(r'^(\d+)\.\s+(.+)$')

    def __init__(self):
--- a/backend/app/services/table_rag_service.py
+++ b/backend/app/services/table_rag_service.py
@@ -31,6 +31,178 @@ class TableRAGService:
        self.rag = rag_service
        self.excel_storage = excel_storage_service

+    def _extract_sheet_names_from_xml(self, file_path: str) -> List[str]:
+        """
+        Extract worksheet names straight from the workbook XML of an Excel file.
+
+        Used when pandas/openpyxl reports no worksheets for a file. Elements
+        are matched by local tag name, so both the Strict (``purl.oclc.org``)
+        and the Transitional (``schemas.openxmlformats.org``) SpreadsheetML
+        namespaces are recognised.
+
+        Args:
+            file_path: Path to the Excel file.
+
+        Returns:
+            Worksheet names in document order; an empty list when the archive
+            cannot be opened or parsed, or has no ``xl/workbook.xml``.
+        """
+        import zipfile
+        from xml.etree import ElementTree as ET
+
+        try:
+            with zipfile.ZipFile(file_path, 'r') as z:
+                if 'xl/workbook.xml' not in z.namelist():
+                    return []
+                root = ET.fromstring(z.read('xl/workbook.xml'))
+        except Exception as e:
+            logger.warning(f"从 XML 提取工作表失败: {file_path}, error: {e}")
+            return []
+
+        # Compare local names only; the namespace URI differs between
+        # Strict and Transitional OOXML files.
+        return [
+            node.get('name')
+            for node in root.iter()
+            if node.tag.rsplit('}', 1)[-1] == 'sheet' and node.get('name')
+        ]
+
+    def _read_excel_sheet(self, file_path: str, sheet_name: str = None, header_row: int = 0) -> pd.DataFrame:
+        """
+        Read one worksheet, falling back to raw XML when pandas cannot.
+
+        pandas is tried first. If it raises or yields an empty frame, the
+        ``.xlsx`` archive is parsed directly: the worksheet part is resolved
+        through ``xl/_rels/workbook.xml.rels`` (falling back to the positional
+        ``sheetN.xml`` name), and elements are matched by local tag name so
+        both Strict and Transitional SpreadsheetML namespaces work.
+
+        Args:
+            file_path: Path to the Excel file.
+            sheet_name: Worksheet to read; the first worksheet is used when
+                this is falsy or not present in the workbook.
+            header_row: 0-based index of the header row; ``None`` means the
+                sheet has no header row.
+
+        Returns:
+            The worksheet as a DataFrame. In the XML fallback every
+            non-boolean cell keeps the raw text stored in the file, columns
+            follow spreadsheet order, and columns without a header value are
+            named by their column letters.
+
+        Raises:
+            Exception: Whatever the XML fallback raised (``ValueError`` when
+                no worksheet can be located), but only if pandas failed too;
+                otherwise the empty pandas result is returned instead.
+        """
+        import posixpath
+        import zipfile
+        from xml.etree import ElementTree as ET
+
+        pandas_df = None
+        try:
+            # sheet_name=None would make pandas return a dict of every sheet,
+            # so an unspecified sheet is mapped to the first one (index 0).
+            pandas_df = pd.read_excel(file_path, sheet_name=sheet_name or 0, header=header_row)
+            if pandas_df is not None and not pandas_df.empty:
+                return pandas_df
+        except Exception as e:
+            logger.warning(f"pandas 读取 Excel 失败，改用 XML 解析: {file_path}, error: {e}")
+
+        # pandas failed or returned nothing: parse the archive directly.
+        logger.info(f"使用 XML 方式读取 Excel: {file_path}")
+
+        def local(tag):
+            # Drop the "{namespace}" prefix of an element/attribute name.
+            return tag.rsplit('}', 1)[-1]
+
+        def children(elem, name):
+            return [child for child in elem if local(child.tag) == name]
+
+        def string_item(elem):
+            # Join plain <t> and rich-text <r><t> runs; phonetic runs are skipped.
+            parts = []
+            for child in elem:
+                kind = local(child.tag)
+                if kind == 't':
+                    parts.append(child.text or '')
+                elif kind == 'r':
+                    parts.extend(t.text or '' for t in children(child, 't'))
+            return ''.join(parts)
+
+        def column_number(letters):
+            # "A" -> 1, "Z" -> 26, "AA" -> 27.
+            number = 0
+            for ch in letters.upper():
+                number = number * 26 + (ord(ch) - ord('A') + 1)
+            return number
+
+        def column_letters(number):
+            # Inverse of column_number.
+            letters = ''
+            while number > 0:
+                number, rem = divmod(number - 1, 26)
+                letters = chr(ord('A') + rem) + letters
+            return letters
+
+        def cell_value(cell, shared_strings):
+            cell_type = cell.get('t', 'n')
+            if cell_type == 'inlineStr':
+                inline = children(cell, 'is')
+                return string_item(inline[0]) if inline else None
+            v = children(cell, 'v')
+            raw = v[0].text if v else None
+            if not raw:
+                return None
+            if cell_type == 's':
+                try:
+                    return shared_strings[int(raw)]
+                except (ValueError, IndexError):
+                    return raw
+            if cell_type == 'b':
+                return raw == '1'
+            return raw
+
+        # 1-based sheet row that holds the header (None: no header row).
+        header_index = None if header_row is None else header_row + 1
+
+        try:
+            with zipfile.ZipFile(file_path, 'r') as z:
+                members = set(z.namelist())
+                if 'xl/workbook.xml' not in members:
+                    raise ValueError("无法从 Excel 文件中找到工作表")
+
+                # Collect (name, relationship id) for every sheet, in order.
+                sheets = []
+                for node in ET.fromstring(z.read('xl/workbook.xml')).iter():
+                    if local(node.tag) == 'sheet' and node.get('name'):
+                        rel_id = next(
+                            (val for key, val in node.attrib.items()
+                             if key.startswith('{') and local(key) == 'id'),
+                            None,
+                        )
+                        sheets.append((node.get('name'), rel_id))
+                if not sheets:
+                    raise ValueError("无法从 Excel 文件中找到工作表")
+
+                names = [name for name, _ in sheets]
+                position = names.index(sheet_name) if sheet_name in names else 0
+
+                # Map relationship ids to archive member names.
+                rel_targets = {}
+                if 'xl/_rels/workbook.xml.rels' in members:
+                    for rel in ET.fromstring(z.read('xl/_rels/workbook.xml.rels')).iter():
+                        target = rel.get('Target')
+                        if local(rel.tag) == 'Relationship' and rel.get('Id') and target:
+                            if target.startswith('/'):
+                                rel_targets[rel.get('Id')] = target.lstrip('/')
+                            else:
+                                rel_targets[rel.get('Id')] = posixpath.normpath(posixpath.join('xl', target))
+
+                sheet_file = rel_targets.get(sheets[position][1])
+                if sheet_file not in members:
+                    # No usable relationship: fall back to the positional name.
+                    sheet_file = f'xl/worksheets/sheet{position + 1}.xml'
+                if sheet_file not in members:
+                    raise ValueError(f"工作表文件 {sheet_file} 不存在")
+
+                shared_strings = []
+                if 'xl/sharedStrings.xml' in members:
+                    ss_root = ET.fromstring(z.read('xl/sharedStrings.xml'))
+                    shared_strings = [string_item(si) for si in ss_root.iter() if local(si.tag) == 'si']
+
+                sheet_root = ET.fromstring(z.read(sheet_file))
+
+            # Single pass over the sheet: pick up the header row and data rows.
+            header_cells = {}
+            rows_data = []
+            last_row = 0
+            for row in sheet_root.iter():
+                if local(row.tag) != 'row':
+                    continue
+                # The "r" attribute is optional; without it rows are sequential.
+                row_idx = int(row.get('r') or last_row + 1)
+                last_row = row_idx
+
+                cells = {}
+                last_col = 0
+                for cell in children(row, 'c'):
+                    letters = ''.join(filter(str.isalpha, cell.get('r', '')))
+                    last_col = column_number(letters) if letters else last_col + 1
+                    cells[last_col] = cell_value(cell, shared_strings)
+
+                if row_idx == header_index:
+                    header_cells = cells
+                elif cells and (header_index is None or row_idx > header_index):
+                    rows_data.append(cells)
+
+            if not rows_data:
+                return pandas_df if pandas_df is not None else pd.DataFrame()
+
+            # Spreadsheet column order, independent of which row is seen first.
+            col_numbers = sorted({col for cells in rows_data for col in cells})
+            col_names = [
+                header_cells[col] if header_cells.get(col) is not None else column_letters(col)
+                for col in col_numbers
+            ]
+            df = pd.DataFrame(
+                [[cells.get(col) for col in col_numbers] for cells in rows_data],
+                columns=col_names,
+            )
+
+            logger.info(f"XML 解析完成: {len(df)} 行, {len(df.columns)} 列")
+            return df
+
+        except Exception as e:
+            if pandas_df is not None:
+                # pandas did read the sheet (it was just empty); prefer that to failing.
+                logger.warning(f"XML 解析 Excel 失败，返回 pandas 读取结果: {e}")
+                return pandas_df
+            logger.error(f"XML 解析 Excel 失败: {e}")
+            raise
+
    async def generate_field_description(
        self,
        table_name: str,
@@ -132,6 +304,12 @@ class TableRAGService:
                xls_file = pd.ExcelFile(file_path)
                sheet_names = xls_file.sheet_names
                logger.info(f"Excel文件工作表: {sheet_names}")
+
+                # 如果 sheet_names 为空，尝试从 XML 中手动提取
+                if not sheet_names:
+                    sheet_names = self._extract_sheet_names_from_xml(file_path)
+                    logger.info(f"从XML提取工作表: {sheet_names}")
+
                if not sheet_names:
                    return {"success": False, "error": "Excel 文件没有工作表"}
            except Exception as e:
@@ -144,9 +322,7 @@ class TableRAGService:
                if sheet_name not in sheet_names:
                    logger.warning(f"指定的工作表 '{sheet_name}' 不存在，使用第一个工作表: {sheet_names[0]}")
                    sheet_name = sheet_names[0]
-                df = pd.read_excel(file_path, sheet_name=sheet_name, header=header_row)
-            else:
-                df = pd.read_excel(file_path, header=header_row)
+            df = self._read_excel_sheet(file_path, sheet_name=sheet_name, header_row=header_row)

            logger.info(f"读取到数据: {len(df)} 行, {len(df.columns)} 列")