def _extract_sheet_names_from_xml(self, file_path: str) -> list:
    """Return the worksheet names declared in ``xl/workbook.xml``.

    The workbook part is parsed directly, bypassing pandas/openpyxl. The
    SpreadsheetML namespace is taken from the document's root element
    instead of being hard-coded, so both Strict OOXML workbooks
    (``http://purl.oclc.org/ooxml/spreadsheetml/main``) and regular
    Transitional ones
    (``http://schemas.openxmlformats.org/spreadsheetml/2006/main``) work.

    Args:
        file_path: Path to the ``.xlsx`` archive.

    Returns:
        Sheet names in document order. An empty list when the archive has
        no ``xl/workbook.xml`` or cannot be opened or parsed.
    """
    import zipfile
    from xml.etree import ElementTree as ET

    try:
        with zipfile.ZipFile(file_path, 'r') as z:
            if 'xl/workbook.xml' not in z.namelist():
                return []
            root = ET.fromstring(z.read('xl/workbook.xml'))
    except Exception as e:
        # Best effort by design: callers treat "no names" as a normal
        # outcome, so report the problem and return an empty list.
        logger.warning(f"从 XML 提取工作表失败: {file_path}, error: {e}")
        return []

    # ElementTree spells qualified tags as "{uri}local"; reuse whatever
    # namespace the workbook root itself is in.
    tag = root.tag
    prefix = tag[:tag.index('}') + 1] if tag.startswith('{') else ''
    return [s.get('name') for s in root.iter(f'{prefix}sheet') if s.get('name')]


def _read_excel_sheet(self, file_path: str, sheet_name: str = None, header_row: int = 0) -> pd.DataFrame:
    """Read one worksheet into a DataFrame, with a raw-XML fallback.

    pandas is tried first. When it raises, or yields an empty frame, the
    sheet is parsed straight from the archive's XML parts.

    Known limits of the XML fallback:
      * The sheet part is located by position (``sheet<N>.xml`` for the
        N-th name in ``workbook.xml``), not through the workbook
        relationships.
      * Numeric and other untyped cells are returned as their raw text.

    Args:
        file_path: Path to the Excel file.
        sheet_name: Worksheet to read. ``None``/empty means the first sheet.
            In the XML fallback an unknown name also falls back to the
            first sheet.
        header_row: 0-based index of the header row. ``None`` or a negative
            value means "no header" for the XML fallback (columns keep
            their spreadsheet letters).

    Returns:
        The worksheet as a DataFrame. When pandas parsed the file but found
        no rows and the XML fallback finds none either (or fails), the
        empty pandas frame is returned.

    Raises:
        Exception: Whatever the XML fallback raised, when pandas also
            failed to read the file.
    """
    import zipfile
    from xml.etree import ElementTree as ET

    def ns_prefix(element):
        """Return the ``{uri}`` prefix of *element*'s tag ('' if unqualified)."""
        tag = element.tag
        return tag[:tag.index('}') + 1] if tag.startswith('{') else ''

    def plain_text(node, p):
        """Join the text of a string item: a direct <t> or every <r>/<t> run."""
        runs = node.findall(f'{p}t') + node.findall(f'{p}r/{p}t')
        return ''.join(t.text or '' for t in runs)

    def column_letters(cell):
        """Extract the column letters from a cell reference such as 'AB12'."""
        return ''.join(filter(str.isalpha, cell.get('r', '')))

    def cell_value(cell, p, shared_strings):
        """Decode one <c> element into a Python value (None when empty)."""
        cell_type = cell.get('t', 'n')
        if cell_type == 'inlineStr':
            inline = cell.find(f'{p}is')
            return plain_text(inline, p) if inline is not None else None
        v = cell.find(f'{p}v')
        if v is None or not v.text:
            return None
        if cell_type == 's':
            try:
                return shared_strings[int(v.text)]
            except (ValueError, IndexError):
                return v.text
        if cell_type == 'b':
            return v.text == '1'
        return v.text

    # pandas interprets sheet_name=None as "load every sheet" and returns a
    # dict, which is not what callers of this method expect. Index 0 selects
    # the first sheet instead.
    pandas_df = None
    try:
        pandas_df = pd.read_excel(file_path, sheet_name=sheet_name or 0, header=header_row)
    except Exception as e:
        # Any parser failure is a reason to try the XML route, but it should
        # be visible in the logs rather than silently discarded.
        logger.warning(f"pandas 读取 Excel 失败,改用 XML 解析: {file_path}, error: {e}")

    if pandas_df is not None and not pandas_df.empty:
        return pandas_df

    logger.info(f"使用 XML 方式读取 Excel: {file_path}")

    has_header = header_row is not None and header_row >= 0
    # Worksheet rows are numbered from 1, header_row from 0.
    header_number = header_row + 1 if has_header else 0

    try:
        with zipfile.ZipFile(file_path, 'r') as z:
            sheet_names = self._extract_sheet_names_from_xml(file_path)
            if not sheet_names:
                raise ValueError("无法从 Excel 文件中找到工作表")

            target_sheet = sheet_name if sheet_name and sheet_name in sheet_names else sheet_names[0]
            sheet_index = sheet_names.index(target_sheet) + 1

            shared_strings = []
            if 'xl/sharedStrings.xml' in z.namelist():
                ss_root = ET.fromstring(z.read('xl/sharedStrings.xml'))
                sp = ns_prefix(ss_root)
                shared_strings = [plain_text(si, sp) for si in ss_root.iter(f'{sp}si')]

            sheet_root = ET.fromstring(z.read(f'xl/worksheets/sheet{sheet_index}.xml'))

        p = ns_prefix(sheet_root)
        rows_data = []
        header_element = None
        previous_idx = 0
        for row in sheet_root.iter(f'{p}row'):
            # The "r" attribute is optional; without it a row follows the
            # previous one.
            ref = row.get('r')
            row_idx = int(ref) if ref else previous_idx + 1
            previous_idx = row_idx

            if row_idx <= header_number:
                if has_header and row_idx == header_number:
                    header_element = row
                continue

            row_cells = {
                column_letters(cell): cell_value(cell, p, shared_strings)
                for cell in row.findall(f'{p}c')
            }
            if row_cells:
                rows_data.append(row_cells)

        if not rows_data:
            # Prefer the (empty) pandas frame: it still carries the header.
            return pandas_df if pandas_df is not None else pd.DataFrame()

        # Order columns as in the spreadsheet (A..Z, AA..) instead of by
        # whichever cells the first data row happened to contain.
        seen = {col for cells in rows_data for col in cells}
        ordered = sorted(seen, key=lambda col: (len(col), col))
        df = pd.DataFrame(rows_data, columns=ordered)

        if header_element is not None:
            headers = {}
            for cell in header_element.findall(f'{p}c'):
                value = cell_value(cell, p, shared_strings)
                if value is not None:
                    headers[column_letters(cell)] = value
            # Columns without a header cell keep their letter.
            df.columns = [headers.get(col, col) for col in df.columns]

        return df
    except Exception as e:
        if pandas_df is not None:
            # pandas parsed the file fine; the sheet is simply empty.
            logger.warning(f"XML 解析 Excel 失败,返回 pandas 读取结果: {e}")
            return pandas_df
        logger.error(f"XML 解析 Excel 失败: {e}")
        raise
def _extract_sheet_names_from_xml(self, file_path: str) -> List[str]:
    """Return the worksheet names declared in ``xl/workbook.xml``.

    Some Excel files contain non-standard elements that keep
    pandas/openpyxl from listing their worksheets; this reads the names
    straight from the workbook XML instead. The SpreadsheetML namespace is
    taken from the document's root element rather than hard-coded, so both
    Strict OOXML workbooks
    (``http://purl.oclc.org/ooxml/spreadsheetml/main``) and regular
    Transitional ones
    (``http://schemas.openxmlformats.org/spreadsheetml/2006/main``) work.

    Args:
        file_path: Path to the ``.xlsx`` archive.

    Returns:
        Sheet names in document order. An empty list when the archive has
        no ``xl/workbook.xml`` or cannot be opened or parsed.
    """
    import zipfile
    from xml.etree import ElementTree as ET

    try:
        with zipfile.ZipFile(file_path, 'r') as z:
            if 'xl/workbook.xml' not in z.namelist():
                return []
            root = ET.fromstring(z.read('xl/workbook.xml'))
    except Exception as e:
        # Best effort by design: callers treat "no names" as a normal
        # outcome, so report the problem and return an empty list.
        logger.warning(f"从 XML 提取工作表失败: {file_path}, error: {e}")
        return []

    # ElementTree spells qualified tags as "{uri}local"; reuse whatever
    # namespace the workbook root itself is in.
    tag = root.tag
    prefix = tag[:tag.index('}') + 1] if tag.startswith('{') else ''
    return [s.get('name') for s in root.iter(f'{prefix}sheet') if s.get('name')]


def _read_excel_sheet(self, file_path: str, sheet_name: str = None, header_row: int = 0) -> pd.DataFrame:
    """Read one worksheet into a DataFrame, with a raw-XML fallback.

    pandas is tried first. When it raises, or yields an empty frame, the
    sheet is parsed straight from the archive's XML parts.

    Known limits of the XML fallback:
      * The sheet part is located by position (``sheet<N>.xml`` for the
        N-th name in ``workbook.xml``), not through the workbook
        relationships.
      * Numeric and other untyped cells are returned as their raw text.

    Args:
        file_path: Path to the Excel file.
        sheet_name: Worksheet to read. ``None``/empty means the first sheet.
            In the XML fallback an unknown name also falls back to the
            first sheet.
        header_row: 0-based index of the header row. ``None`` or a negative
            value means "no header" for the XML fallback (columns keep
            their spreadsheet letters).

    Returns:
        The worksheet as a DataFrame. When pandas parsed the file but found
        no rows and the XML fallback finds none either (or fails), the
        empty pandas frame is returned.

    Raises:
        ValueError: No worksheet could be found in the archive, or the
            positional sheet part does not exist (only when pandas also
            failed).
        Exception: Any other error raised by the XML fallback when pandas
            also failed to read the file.
    """
    import zipfile
    from xml.etree import ElementTree as ET

    def ns_prefix(element):
        """Return the ``{uri}`` prefix of *element*'s tag ('' if unqualified)."""
        tag = element.tag
        return tag[:tag.index('}') + 1] if tag.startswith('{') else ''

    def plain_text(node, p):
        """Join the text of a string item: a direct <t> or every <r>/<t> run."""
        runs = node.findall(f'{p}t') + node.findall(f'{p}r/{p}t')
        return ''.join(t.text or '' for t in runs)

    def column_letters(cell):
        """Extract the column letters from a cell reference such as 'AB12'."""
        return ''.join(filter(str.isalpha, cell.get('r', '')))

    def cell_value(cell, p, shared_strings):
        """Decode one <c> element into a Python value (None when empty)."""
        cell_type = cell.get('t', 'n')
        if cell_type == 'inlineStr':
            inline = cell.find(f'{p}is')
            return plain_text(inline, p) if inline is not None else None
        v = cell.find(f'{p}v')
        if v is None or not v.text:
            return None
        if cell_type == 's':
            # Shared string: the value is an index into sharedStrings.xml.
            try:
                return shared_strings[int(v.text)]
            except (ValueError, IndexError):
                return v.text
        if cell_type == 'b':
            return v.text == '1'
        # Number or any other type: keep the raw text.
        return v.text

    # pandas interprets sheet_name=None as "load every sheet" and returns a
    # dict, which is not what callers of this method expect. Index 0 selects
    # the first sheet instead.
    pandas_df = None
    try:
        pandas_df = pd.read_excel(file_path, sheet_name=sheet_name or 0, header=header_row)
    except Exception as e:
        # Any parser failure is a reason to try the XML route, but it should
        # be visible in the logs rather than silently discarded.
        logger.warning(f"pandas 读取 Excel 失败,改用 XML 解析: {file_path}, error: {e}")

    if pandas_df is not None and not pandas_df.empty:
        return pandas_df

    logger.info(f"使用 XML 方式读取 Excel: {file_path}")

    has_header = header_row is not None and header_row >= 0
    # Worksheet rows are numbered from 1, header_row from 0.
    header_number = header_row + 1 if has_header else 0

    try:
        with zipfile.ZipFile(file_path, 'r') as z:
            sheet_names = self._extract_sheet_names_from_xml(file_path)
            if not sheet_names:
                raise ValueError("无法从 Excel 文件中找到工作表")

            target_sheet = sheet_name if sheet_name and sheet_name in sheet_names else sheet_names[0]
            sheet_index = sheet_names.index(target_sheet) + 1  # sheet1.xml, sheet2.xml, ...

            members = z.namelist()
            shared_strings = []
            if 'xl/sharedStrings.xml' in members:
                ss_root = ET.fromstring(z.read('xl/sharedStrings.xml'))
                sp = ns_prefix(ss_root)
                shared_strings = [plain_text(si, sp) for si in ss_root.iter(f'{sp}si')]

            sheet_file = f'xl/worksheets/sheet{sheet_index}.xml'
            if sheet_file not in members:
                raise ValueError(f"工作表文件 {sheet_file} 不存在")
            sheet_root = ET.fromstring(z.read(sheet_file))

        p = ns_prefix(sheet_root)
        rows_data = []
        header_element = None
        previous_idx = 0
        for row in sheet_root.iter(f'{p}row'):
            # The "r" attribute is optional; without it a row follows the
            # previous one.
            ref = row.get('r')
            row_idx = int(ref) if ref else previous_idx + 1
            previous_idx = row_idx

            if row_idx <= header_number:
                if has_header and row_idx == header_number:
                    header_element = row
                continue

            row_cells = {
                column_letters(cell): cell_value(cell, p, shared_strings)
                for cell in row.findall(f'{p}c')
            }
            if row_cells:
                rows_data.append(row_cells)

        if not rows_data:
            # Prefer the (empty) pandas frame: it still carries the header.
            return pandas_df if pandas_df is not None else pd.DataFrame()

        # Order columns as in the spreadsheet (A..Z, AA..) instead of by
        # whichever cells the first data row happened to contain.
        seen = {col for cells in rows_data for col in cells}
        ordered = sorted(seen, key=lambda col: (len(col), col))
        df = pd.DataFrame(rows_data, columns=ordered)

        if header_element is not None:
            headers = {}
            for cell in header_element.findall(f'{p}c'):
                value = cell_value(cell, p, shared_strings)
                if value is not None:
                    headers[column_letters(cell)] = value
            # Columns without a header cell keep their letter.
            df.columns = [headers.get(col, col) for col in df.columns]

        logger.info(f"XML 解析完成: {len(df)} 行, {len(df.columns)} 列")
        return df
    except Exception as e:
        if pandas_df is not None:
            # pandas parsed the file fine; the sheet is simply empty.
            logger.warning(f"XML 解析 Excel 失败,返回 pandas 读取结果: {e}")
            return pandas_df
        logger.error(f"XML 解析 Excel 失败: {e}")
        raise