From 496b96508ddc22ee5425b04d6d769cee99f10176 Mon Sep 17 00:00:00 2001 From: dj <431634905@qq.com> Date: Thu, 9 Apr 2026 22:21:51 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E5=A4=8DExcel=E8=A7=A3=E6=9E=90?= =?UTF-8?q?=E5=92=8C=E6=99=BA=E8=83=BD=E5=A1=AB=E8=A1=A8=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 增强Excel解析器支持多种命名空间和路径格式,解决英文表头Excel无法读取问题 - 当MongoDB中structured_data为空时,尝试用file_path重新解析文件 - 改进AI分析提示词,明确要求返回纯数值不要单位 - 修复max_tokens值(5000→4000)避免DeepSeek API报错 Co-Authored-By: Claude Opus 4.6 --- .../app/core/document_parser/xlsx_parser.py | 148 ++++++++++++++---- backend/app/services/template_fill_service.py | 64 ++++++-- 2 files changed, 166 insertions(+), 46 deletions(-) diff --git a/backend/app/core/document_parser/xlsx_parser.py b/backend/app/core/document_parser/xlsx_parser.py index 47cd232..a0216a1 100644 --- a/backend/app/core/document_parser/xlsx_parser.py +++ b/backend/app/core/document_parser/xlsx_parser.py @@ -317,24 +317,70 @@ class XlsxParser(BaseParser): import zipfile from xml.etree import ElementTree as ET + # 常见的命名空间 + COMMON_NAMESPACES = [ + 'http://schemas.openxmlformats.org/spreadsheetml/2006/main', + 'http://schemas.openxmlformats.org/spreadsheetml/2005/main', + 'http://schemas.openxmlformats.org/spreadsheetml/2004/main', + 'http://schemas.openxmlformats.org/spreadsheetml/2003/main', + ] + try: with zipfile.ZipFile(file_path, 'r') as z: - if 'xl/workbook.xml' not in z.namelist(): + # 尝试多种可能的 workbook.xml 路径 + possible_paths = ['xl/workbook.xml', 'xl\\workbook.xml', 'workbook.xml'] + content = None + for path in possible_paths: + if path in z.namelist(): + content = z.read(path) + logger.info(f"找到 workbook.xml at: {path}") + break + + if content is None: + logger.warning(f"未找到 workbook.xml,文件列表: {z.namelist()[:10]}") return [] - content = z.read('xl/workbook.xml') + root = ET.fromstring(content) - # 命名空间 - ns = {'main': 
'http://schemas.openxmlformats.org/spreadsheetml/2006/main'} - sheet_names = [] - for sheet in root.findall('.//main:sheet', ns): - name = sheet.get('name') - if name: - sheet_names.append(name) + + # 方法1:尝试带命名空间的查找 + for ns in COMMON_NAMESPACES: + sheet_elements = root.findall(f'.//{{{ns}}}sheet') + if sheet_elements: + for sheet in sheet_elements: + name = sheet.get('name') + if name: + sheet_names.append(name) + if sheet_names: + logger.info(f"使用命名空间 {ns} 提取工作表: {sheet_names}") + return sheet_names + + # 方法2:不使用命名空间,直接查找所有 sheet 元素 + if not sheet_names: + for elem in root.iter(): + if elem.tag.endswith('sheet') and elem.tag != 'sheets': + name = elem.get('name') + if name: + sheet_names.append(name) + for child in elem: + if child.tag.endswith('sheet') or child.tag == 'sheet': + name = child.get('name') + if name and name not in sheet_names: + sheet_names.append(name) + + # 方法3:直接从 XML 文本中正则匹配 sheet name + if not sheet_names: + import re + xml_str = content.decode('utf-8', errors='ignore') + matches = re.findall(r'<sheet[^>]*name=["\']([^"\']+)["\']', xml_str, re.IGNORECASE) + if matches: + sheet_names = matches + logger.info(f"使用正则提取工作表: {sheet_names}") logger.info(f"从 XML 提取工作表: {sheet_names}") return sheet_names + except Exception as e: logger.error(f"从 XML 提取工作表名称失败: {e}") return [] @@ -356,6 +402,32 @@ class XlsxParser(BaseParser): import zipfile from xml.etree import ElementTree as ET + # 常见的命名空间 + COMMON_NAMESPACES = [ + 'http://schemas.openxmlformats.org/spreadsheetml/2006/main', + 'http://schemas.openxmlformats.org/spreadsheetml/2005/main', + 'http://schemas.openxmlformats.org/spreadsheetml/2004/main', + 'http://schemas.openxmlformats.org/spreadsheetml/2003/main', + ] + + def find_elements_with_ns(root, tag_name): + """灵活查找元素,支持任意命名空间""" + results = [] + # 方法1:用固定命名空间 + for ns in COMMON_NAMESPACES: + try: + elems = root.findall(f'.//{{{ns}}}{tag_name}') + if elems: + results.extend(elems) + except: + pass + # 方法2:不带命名空间查找 + if not results: + for elem in
root.iter(): + if elem.tag.endswith('}' + tag_name): + results.append(elem) + return results + with zipfile.ZipFile(file_path, 'r') as z: # 获取工作表名称 sheet_names = self._extract_sheet_names_from_xml(file_path) @@ -366,57 +438,68 @@ class XlsxParser(BaseParser): target_sheet = sheet_name if sheet_name and sheet_name in sheet_names else sheet_names[0] sheet_index = sheet_names.index(target_sheet) + 1 # sheet1.xml, sheet2.xml, ... - # 读取 shared strings + # 读取 shared strings - 尝试多种路径 shared_strings = [] - if 'xl/sharedStrings.xml' in z.namelist(): - ss_content = z.read('xl/sharedStrings.xml') - ss_root = ET.fromstring(ss_content) - ns = {'main': 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'} - for si in ss_root.findall('.//main:si', ns): - t = si.find('.//main:t', ns) - if t is not None: - shared_strings.append(t.text or '') - else: - shared_strings.append('') + ss_paths = ['xl/sharedStrings.xml', 'xl\\sharedStrings.xml', 'sharedStrings.xml'] + for ss_path in ss_paths: + if ss_path in z.namelist(): + try: + ss_content = z.read(ss_path) + ss_root = ET.fromstring(ss_content) + for si in find_elements_with_ns(ss_root, 'si'): + t_elements = [c for c in si if c.tag.endswith('}t') or c.tag == 't'] + if t_elements: + shared_strings.append(t_elements[0].text or '') + else: + shared_strings.append('') + break + except Exception as e: + logger.warning(f"读取 sharedStrings 失败: {e}") - # 读取工作表 - sheet_file = f'xl/worksheets/sheet{sheet_index}.xml' - if sheet_file not in z.namelist(): - raise ValueError(f"工作表文件 {sheet_file} 不存在") + # 读取工作表 - 尝试多种可能的路径 + sheet_content = None + sheet_paths = [ + f'xl/worksheets/sheet{sheet_index}.xml', + f'xl\\worksheets\\sheet{sheet_index}.xml', + f'worksheets/sheet{sheet_index}.xml', + ] + for sp in sheet_paths: + if sp in z.namelist(): + sheet_content = z.read(sp) + break + + if sheet_content is None: + raise ValueError(f"工作表文件 sheet{sheet_index}.xml 不存在") - sheet_content = z.read(sheet_file) root = ET.fromstring(sheet_content) - ns = 
{'main': 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'} # 收集所有行数据 all_rows = [] headers = {} - for row in root.findall('.//main:row', ns): + for row in find_elements_with_ns(root, 'row'): row_idx = int(row.get('r', 0)) row_cells = {} - for cell in row.findall('main:c', ns): + for cell in find_elements_with_ns(row, 'c'): cell_ref = cell.get('r', '') col_letters = ''.join(filter(str.isalpha, cell_ref)) cell_type = cell.get('t', 'n') - v = cell.find('main:v', ns) + v_elements = find_elements_with_ns(cell, 'v') + v = v_elements[0] if v_elements else None if v is not None and v.text: if cell_type == 's': - # shared string try: row_cells[col_letters] = shared_strings[int(v.text)] except (ValueError, IndexError): row_cells[col_letters] = v.text elif cell_type == 'b': - # boolean row_cells[col_letters] = v.text == '1' else: row_cells[col_letters] = v.text else: row_cells[col_letters] = None - # 处理表头行 if row_idx == header_row + 1: headers = {**row_cells} elif row_idx > header_row + 1: @@ -424,7 +507,6 @@ class XlsxParser(BaseParser): # 构建 DataFrame if headers: - # 按原始列顺序排列 col_order = list(headers.keys()) df = pd.DataFrame(all_rows) if not df.empty: diff --git a/backend/app/services/template_fill_service.py b/backend/app/services/template_fill_service.py index e20b081..fbcb9e9 100644 --- a/backend/app/services/template_fill_service.py +++ b/backend/app/services/template_fill_service.py @@ -168,16 +168,44 @@ class TemplateFillService: sd = doc.get("structured_data", {}) sd_keys = list(sd.keys()) if sd else [] logger.info(f"从MongoDB加载文档: {doc_id}, doc_type={doc.get('doc_type')}, structured_data keys={sd_keys}") + + # 如果 structured_data 为空,但有 file_path,尝试重新解析文件 + doc_content = doc.get("content", "") + if not sd or (not sd.get("tables") and not sd.get("headers") and not sd.get("rows")): + file_path = doc.get("metadata", {}).get("file_path") + if file_path: + logger.info(f" structured_data 为空,尝试重新解析文件: {file_path}") + try: + parser = 
ParserFactory.get_parser(file_path) + result = parser.parse(file_path) + if result.success and result.data: + if result.data.get("structured_data"): + sd = result.data.get("structured_data") + logger.info(f" 重新解析成功,structured_data keys: {list(sd.keys())}") + elif result.data.get("tables"): + sd = {"tables": result.data.get("tables", [])} + logger.info(f" 使用 data.tables,tables数量: {len(sd.get('tables', []))}") + elif result.data.get("rows"): + sd = result.data + logger.info(f" 使用 data.rows 格式") + if result.data.get("content"): + doc_content = result.data.get("content", "") + else: + logger.warning(f" 重新解析失败: {result.error if result else 'unknown'}") + except Exception as parse_err: + logger.error(f" 重新解析文件异常: {str(parse_err)}") + if sd.get("tables"): logger.info(f" tables数量: {len(sd.get('tables', []))}") if sd["tables"]: first_table = sd["tables"][0] logger.info(f" 第一表格: headers={first_table.get('headers', [])[:3]}..., rows数量={len(first_table.get('rows', []))}") + source_docs.append(SourceDocument( doc_id=doc_id, filename=doc.get("metadata", {}).get("original_filename", "unknown"), doc_type=doc.get("doc_type", "unknown"), - content=doc.get("content", ""), + content=doc_content, structured_data=sd )) except Exception as e: @@ -1348,27 +1376,37 @@ class TemplateFillService: hint_text = f"{user_hint}。{hint_text}" # 构建针对字段提取的提示词 - prompt = f"""你是一个专业的数据提取专家。请从以下文档内容中提取与"{field.name}"相关的所有数据。 + prompt = f"""你是一个专业的数据提取专家。请从以下文档内容中提取与"{field.name}"完全匹配的数据。 -字段提示: {hint_text} +【重要】字段名: "{field.name}" +【重要】字段提示: {hint_text} + +请严格按照以下步骤操作: +1. 在文档中搜索与"{field.name}"完全相同或高度相关的关键词 +2. 找到后,提取该关键词后的数值(注意:只要数值,不要单位) +3. 如果是表格中的数据,直接提取该单元格的数值 +4. 如果是段落描述,在关键词附近找数值 + +【重要】返回值规则: +- 只返回纯数值,不要单位(如 "4.9" 而不是 "4.9万亿元") +- 如果原文是"4.9万亿元",返回 "4.9" +- 如果原文是"144000万册",返回 "144000" +- 如果是百分比如"增长7.7%",返回 "7.7" +- 如果没有找到完全匹配的数据,返回空数组 文档内容: -{doc.content[:8000] if doc.content else ""} - -请完成以下任务: -1. 仔细阅读文档,找出所有与"{field.name}"相关的数据 -2. 如果文档中有表格数据,提取表格中的对应列值 -3. 如果文档中是段落描述,提取其中的关键数值或结论 -4. 
返回提取的所有值(可能多个,用数组存储) +{doc.content[:10000] if doc.content else ""} 请用严格的 JSON 格式返回: {{ - "values": ["值1", "值2", ...], + "values": ["值1", "值2", ...], // 只填数值,不要单位 "source": "数据来源说明", "confidence": 0.0到1.0之间的置信度 }} -如果没有找到相关数据,返回空数组 values: []""" +示例: +- 如果字段是"图书馆总藏量(万册)"且文档说"图书总藏量14.4亿册",返回 values: ["144000"] +- 如果字段是"国内旅游收入(亿元)"且文档说"国内旅游收入4.9万亿元",返回 values: ["49000"]""" messages = [ {"role": "system", "content": "你是一个专业的数据提取助手,擅长从政府统计公报等文档中提取数据。请严格按JSON格式输出。"}, @@ -1378,7 +1416,7 @@ class TemplateFillService: response = await self.llm.chat( messages=messages, temperature=0.1, - max_tokens=5000 + max_tokens=4000 ) content = self.llm.extract_message_content(response)