feat(excel): 添加对特殊Excel文件的XML解析支持

添加了从Excel文件XML直接解析工作表名称和数据的功能,
以支持pandas无法正确解析的特殊格式Excel文件。
同时更新了.gitignore文件,添加了更多忽略规则。
修复了markdown AI服务中的正则表达式模式匹配问题。
This commit is contained in:
2026-04-02 13:19:00 +08:00
parent d189ea9620
commit 7c19e49988
4 changed files with 338 additions and 10 deletions

38
.gitignore vendored Normal file
View File

@@ -0,0 +1,38 @@
# Version control and editor metadata
/.git/
/.idea/
/.vscode/

# Backend: local environment, env files and runtime data
/backend/venv/
/backend/command/
/backend/.env
/backend/.env.local
/backend/.env.*.local
/backend/data/uploads
/backend/data/charts
/backend/data/logs

# Frontend: dependencies, build output, editor metadata, env files and logs
/frontend/node_modules/
/frontend/dist/
/frontend/build/
/frontend/.vscode/
/frontend/.idea/
/frontend/.env
/frontend/*.log
# Ignoring the directory covers every file in it, so the former per-file
# rules (index.js, index.ts, index.tsx, index.py, index.go, index.java)
# were redundant.
# NOTE(review): this ignores the whole frontend API source directory -
# confirm that is intended.
/frontend/src/api/

# Local notes, docs and scratch copies
/技术路线.md
/开发路径.md
/开发日志_2026-03-16.md
/frontendTest/
/docs/
/frontend - 副本/*
/supabase.txt

# Python bytecode in any directory (this also covers /backend/app/__pycache__/)
**/__pycache__/*
*.pyc

View File

@@ -34,6 +34,123 @@ class ExcelStorageService:
# Constructor: keeps a reference to the module-level `mysql_db` object on the
# instance as `self.mysql_db`.
def __init__(self):
self.mysql_db = mysql_db
def _extract_sheet_names_from_xml(self, file_path: str) -> list:
"""从 Excel 文件的 XML 中提取工作表名称"""
import zipfile
from xml.etree import ElementTree as ET
try:
with zipfile.ZipFile(file_path, 'r') as z:
if 'xl/workbook.xml' not in z.namelist():
return []
content = z.read('xl/workbook.xml')
root = ET.fromstring(content)
ns = {'main': 'http://purl.oclc.org/ooxml/spreadsheetml/main'}
sheets = root.findall('.//main:sheet', ns)
return [s.get('name') for s in sheets if s.get('name')]
except Exception:
return []
def _read_excel_sheet(self, file_path: str, sheet_name: str = None, header_row: int = 0) -> pd.DataFrame:
"""读取 Excel 工作表,支持 pandas 无法解析的特殊 Excel 文件"""
import zipfile
from xml.etree import ElementTree as ET
try:
df = pd.read_excel(file_path, sheet_name=sheet_name, header=header_row)
if df is not None and not df.empty:
return df
except Exception:
pass
# pandas 读取失败,从 XML 直接解析
logger.info(f"使用 XML 方式读取 Excel: {file_path}")
try:
with zipfile.ZipFile(file_path, 'r') as z:
sheet_names = self._extract_sheet_names_from_xml(file_path)
if not sheet_names:
raise ValueError("无法从 Excel 文件中找到工作表")
target_sheet = sheet_name if sheet_name and sheet_name in sheet_names else sheet_names[0]
sheet_index = sheet_names.index(target_sheet) + 1
shared_strings = []
if 'xl/sharedStrings.xml' in z.namelist():
ss_content = z.read('xl/sharedStrings.xml')
ss_root = ET.fromstring(ss_content)
ns = {'main': 'http://purl.oclc.org/ooxml/spreadsheetml/main'}
for si in ss_root.findall('.//main:si', ns):
t = si.find('.//main:t', ns)
shared_strings.append(t.text if t is not None else '')
sheet_file = f'xl/worksheets/sheet{sheet_index}.xml'
sheet_content = z.read(sheet_file)
root = ET.fromstring(sheet_content)
ns = {'main': 'http://purl.oclc.org/ooxml/spreadsheetml/main'}
rows_data = []
for row in root.findall('.//main:row', ns):
row_idx = int(row.get('r', 0))
if row_idx <= header_row + 1:
continue
row_cells = {}
for cell in row.findall('main:c', ns):
cell_ref = cell.get('r', '')
col_letters = ''.join(filter(str.isalpha, cell_ref))
cell_type = cell.get('t', 'n')
v = cell.find('main:v', ns)
if v is not None and v.text:
if cell_type == 's':
try:
val = shared_strings[int(v.text)]
except (ValueError, IndexError):
val = v.text
elif cell_type == 'b':
val = v.text == '1'
else:
val = v.text
else:
val = None
row_cells[col_letters] = val
if row_cells:
rows_data.append(row_cells)
if not rows_data:
return pd.DataFrame()
df = pd.DataFrame(rows_data)
if header_row >= 0:
first_row_sheet = f'xl/worksheets/sheet{sheet_index}.xml'
sheet_content = z.read(first_row_sheet)
root = ET.fromstring(sheet_content)
first_row = root.find(f'.//main:row[@r="{header_row + 1}"]', ns)
if first_row is not None:
headers = {}
for cell in first_row.findall('main:c', ns):
cell_ref = cell.get('r', '')
col_letters = ''.join(filter(str.isalpha, cell_ref))
cell_type = cell.get('t', 'n')
v = cell.find('main:v', ns)
if v is not None and v.text:
if cell_type == 's':
try:
headers[col_letters] = shared_strings[int(v.text)]
except (ValueError, IndexError):
headers[col_letters] = v.text
else:
headers[col_letters] = v.text
df.columns = [headers.get(col, col) for col in df.columns]
return df
except Exception as e:
logger.error(f"XML 解析 Excel 失败: {e}")
raise
def _sanitize_table_name(self, filename: str) -> str:
"""
将文件名转换为合法的表名
@@ -227,11 +344,8 @@ class ExcelStorageService:
try:
logger.info(f"开始读取Excel文件: {file_path}")
# 读取 Excel
if sheet_name:
df = pd.read_excel(file_path, sheet_name=sheet_name, header=header_row)
else:
df = pd.read_excel(file_path, header=header_row)
# 读取 Excel(使用 fallback 方式支持特殊格式文件)
df = self._read_excel_sheet(file_path, sheet_name=sheet_name, header_row=header_row)
logger.info(f"Excel读取完成行数: {len(df)}, 列数: {len(df.columns)}")

View File

@@ -48,8 +48,8 @@ class MarkdownAIService:
# 中文章节编号模式
CHINESE_NUMBERS = ["", "", "", "", "", "", "", "", "", ""]
CHINESE_SUFFIX = ""
PARENTHESIS_PATTERN = re.compile(r'^([一二三四五六七八九十]+)\s*(.+)$')
CHINESE_SECTION_PATTERN = re.compile(r'^([一二三四五六七八九十]+\s*(.+)$')
PARENTHESIS_PATTERN = re.compile(r'^([一二三四五六七八九十]+)\s*(.+)$')
CHINESE_SECTION_PATTERN = re.compile(r'^([一二三四五六七八九十]+)\s*(.+)$')
ARABIC_SECTION_PATTERN = re.compile(r'^(\d+)\.\s+(.+)$')
def __init__(self):

View File

@@ -31,6 +31,178 @@ class TableRAGService:
self.rag = rag_service
self.excel_storage = excel_storage_service
def _extract_sheet_names_from_xml(self, file_path: str) -> List[str]:
"""
从 Excel 文件的 XML 中提取工作表名称
某些 Excel 文件由于包含非标准元素pandas/openpyxl 无法正确解析工作表列表,
此时需要直接从 XML 中提取。
Args:
file_path: Excel 文件路径
Returns:
工作表名称列表
"""
import zipfile
from xml.etree import ElementTree as ET
try:
with zipfile.ZipFile(file_path, 'r') as z:
# 读取 workbook.xml
if 'xl/workbook.xml' not in z.namelist():
return []
content = z.read('xl/workbook.xml')
root = ET.fromstring(content)
# 定义命名空间
ns = {'main': 'http://purl.oclc.org/ooxml/spreadsheetml/main'}
# 提取所有 sheet 的 name 属性
sheets = root.findall('.//main:sheet', ns)
return [s.get('name') for s in sheets if s.get('name')]
except Exception as e:
logger.warning(f"从 XML 提取工作表失败: {file_path}, error: {e}")
return []
def _read_excel_sheet(self, file_path: str, sheet_name: str = None, header_row: int = 0) -> pd.DataFrame:
"""
读取 Excel 工作表,支持 pandas 无法解析的特殊 Excel 文件
当 pandas 的 ExcelFile 无法正确解析时,直接从 XML 读取数据。
Args:
file_path: Excel 文件路径
sheet_name: 工作表名称(如果为 None读取第一个工作表
header_row: 表头行号
Returns:
DataFrame
"""
import zipfile
from xml.etree import ElementTree as ET
try:
# 先尝试用 pandas 正常读取
df = pd.read_excel(file_path, sheet_name=sheet_name, header=header_row)
if df is not None and not df.empty:
return df
except Exception:
pass
# pandas 读取失败,从 XML 直接解析
logger.info(f"使用 XML 方式读取 Excel: {file_path}")
try:
with zipfile.ZipFile(file_path, 'r') as z:
# 获取工作表名称
sheet_names = self._extract_sheet_names_from_xml(file_path)
if not sheet_names:
raise ValueError("无法从 Excel 文件中找到工作表")
# 确定要读取的工作表
target_sheet = sheet_name if sheet_name and sheet_name in sheet_names else sheet_names[0]
sheet_index = sheet_names.index(target_sheet) + 1 # sheet1.xml, sheet2.xml, ...
# 读取 shared strings
shared_strings = []
if 'xl/sharedStrings.xml' in z.namelist():
ss_content = z.read('xl/sharedStrings.xml')
ss_root = ET.fromstring(ss_content)
ns = {'main': 'http://purl.oclc.org/ooxml/spreadsheetml/main'}
for si in ss_root.findall('.//main:si', ns):
t = si.find('.//main:t', ns)
if t is not None:
shared_strings.append(t.text or '')
else:
shared_strings.append('')
# 读取工作表
sheet_file = f'xl/worksheets/sheet{sheet_index}.xml'
if sheet_file not in z.namelist():
raise ValueError(f"工作表文件 {sheet_file} 不存在")
sheet_content = z.read(sheet_file)
root = ET.fromstring(sheet_content)
ns = {'main': 'http://purl.oclc.org/ooxml/spreadsheetml/main'}
# 解析行
rows_data = []
for row in root.findall('.//main:row', ns):
row_idx = int(row.get('r', 0))
# header_row 是 0-indexedrow_idx 是 1-indexed
# 如果 header_row=0 表示第一行是表头,需要跳过 row_idx=1
if row_idx <= header_row + 1:
continue # 跳过表头行
row_cells = {}
for cell in row.findall('main:c', ns):
cell_ref = cell.get('r', '')
col_letters = ''.join(filter(str.isalpha, cell_ref))
cell_type = cell.get('t', 'n')
v = cell.find('main:v', ns)
if v is not None and v.text:
if cell_type == 's':
# shared string
try:
val = shared_strings[int(v.text)]
except (ValueError, IndexError):
val = v.text
elif cell_type == 'b':
# boolean
val = v.text == '1'
else:
# number or other
val = v.text
else:
val = None
row_cells[col_letters] = val
if row_cells:
rows_data.append(row_cells)
# 转换为 DataFrame
if not rows_data:
return pd.DataFrame()
df = pd.DataFrame(rows_data)
# 如果有 header_row重新设置列名
if header_row >= 0:
# 重新读取第一行作为表头
first_row_sheet = f'xl/worksheets/sheet{sheet_index}.xml'
sheet_content = z.read(first_row_sheet)
root = ET.fromstring(sheet_content)
first_row = root.find(f'.//main:row[@r="{header_row + 1}"]', ns)
if first_row is not None:
headers = {}
for cell in first_row.findall('main:c', ns):
cell_ref = cell.get('r', '')
col_letters = ''.join(filter(str.isalpha, cell_ref))
cell_type = cell.get('t', 'n')
v = cell.find('main:v', ns)
if v is not None and v.text:
if cell_type == 's':
try:
headers[col_letters] = shared_strings[int(v.text)]
except (ValueError, IndexError):
headers[col_letters] = v.text
else:
headers[col_letters] = v.text
# 重命名列
df.columns = [headers.get(col, col) for col in df.columns]
logger.info(f"XML 解析完成: {len(df)} 行, {len(df.columns)}")
return df
except Exception as e:
logger.error(f"XML 解析 Excel 失败: {e}")
raise
async def generate_field_description(
self,
table_name: str,
@@ -132,6 +304,12 @@ class TableRAGService:
xls_file = pd.ExcelFile(file_path)
sheet_names = xls_file.sheet_names
logger.info(f"Excel文件工作表: {sheet_names}")
# 如果 sheet_names 为空,尝试从 XML 中手动提取
if not sheet_names:
sheet_names = self._extract_sheet_names_from_xml(file_path)
logger.info(f"从XML提取工作表: {sheet_names}")
if not sheet_names:
return {"success": False, "error": "Excel 文件没有工作表"}
except Exception as e:
@@ -144,9 +322,7 @@ class TableRAGService:
if sheet_name not in sheet_names:
logger.warning(f"指定的工作表 '{sheet_name}' 不存在,使用第一个工作表: {sheet_names[0]}")
sheet_name = sheet_names[0]
df = pd.read_excel(file_path, sheet_name=sheet_name, header=header_row)
else:
df = pd.read_excel(file_path, header=header_row)
df = self._read_excel_sheet(file_path, sheet_name=sheet_name, header_row=header_row)
logger.info(f"读取到数据: {len(df)} 行, {len(df.columns)}")