Add parsing support for other document formats

2026-03-26 23:14:39 +08:00
parent 4bdc3f9707
commit 5bcad4a5fa
9 changed files with 2075 additions and 22 deletions

View File — __init__.py (document parser package)

@@ -2,26 +2,29 @@
 Document parsing module - supports parsing files in multiple formats
 """
 from pathlib import Path
-from typing import Dict, Optional
+from typing import Dict
 from .base import BaseParser, ParseResult
 from .xlsx_parser import XlsxParser
-# Import the other parsers (to be implemented first)
-# from .docx_parser import DocxParser
-# from .md_parser import MarkdownParser
-# from .txt_parser import TxtParser
+from .docx_parser import DocxParser
+from .md_parser import MarkdownParser
+from .txt_parser import TxtParser


 class ParserFactory:
     """Parser factory: returns the parser matching a given file type"""

     _parsers: Dict[str, BaseParser] = {
         # Excel
         '.xlsx': XlsxParser(),
         '.xls': XlsxParser(),
-        # '.docx': DocxParser(),  # TODO: to be implemented
-        # '.md': MarkdownParser(),  # TODO: to be implemented
-        # '.txt': TxtParser(),  # TODO: to be implemented
+        # Word
+        '.docx': DocxParser(),
+        # Markdown
+        '.md': MarkdownParser(),
+        '.markdown': MarkdownParser(),
+        # Plain text
+        '.txt': TxtParser(),
     }

     @classmethod
@@ -30,7 +33,8 @@ class ParserFactory:
         ext = Path(file_path).suffix.lower()
         parser = cls._parsers.get(ext)
         if not parser:
-            raise ValueError(f"Unsupported file format: {ext}; supported formats: {list(cls._parsers.keys())}")
+            supported = list(cls._parsers.keys())
+            raise ValueError(f"Unsupported file format: {ext}; supported formats: {supported}")
         return parser

     @classmethod
@@ -44,5 +48,18 @@ class ParserFactory:
         """Register a new parser"""
         cls._parsers[ext.lower()] = parser

+    @classmethod
+    def get_supported_extensions(cls) -> list:
+        """Return all supported extensions"""
+        return list(cls._parsers.keys())
+

-__all__ = ['BaseParser', 'ParseResult', 'XlsxParser', 'ParserFactory']
+__all__ = [
+    'BaseParser',
+    'ParseResult',
+    'ParserFactory',
+    'XlsxParser',
+    'DocxParser',
+    'MarkdownParser',
+    'TxtParser',
+]
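
For reference, a minimal sketch of driving the updated factory. The `parsers` import path and the sample file name are assumptions, not part of this commit, and the dispatch method is assumed to be a `get_parser` classmethod whose signature sits just outside the second hunk:

    from parsers import ParserFactory  # assumed package name

    # 'report.docx' is a hypothetical input; any registered extension dispatches the same way.
    parser = ParserFactory.get_parser("report.docx")
    result = parser.parse("report.docx")
    if result.success:
        print(result.data["content"][:200])
    else:
        print("parse failed:", result.error)

    print(ParserFactory.get_supported_extensions())
    # ['.xlsx', '.xls', '.docx', '.md', '.markdown', '.txt']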

View File — docx_parser.py

@@ -0,0 +1,163 @@
"""
Word 文档 (.docx) 解析器
"""
import logging
from pathlib import Path
from typing import Any, Dict, List, Optional
from docx import Document
from .base import BaseParser, ParseResult
logger = logging.getLogger(__name__)
class DocxParser(BaseParser):
"""Word 文档解析器"""
def __init__(self):
super().__init__()
self.supported_extensions = ['.docx']
self.parser_name = "docx_parser"
def parse(
self,
file_path: str,
**kwargs
) -> ParseResult:
"""
解析 Word 文档
Args:
file_path: 文件路径
**kwargs: 其他参数
Returns:
ParseResult: 解析结果
"""
path = Path(file_path)
# 检查文件是否存在
if not path.exists():
return ParseResult(
success=False,
error=f"文件不存在: {file_path}"
)
# 检查文件扩展名
if path.suffix.lower() not in self.supported_extensions:
return ParseResult(
success=False,
error=f"不支持的文件类型: {path.suffix}"
)
try:
# 读取 Word 文档
doc = Document(file_path)
# 提取文本内容
paragraphs = []
for para in doc.paragraphs:
if para.text.strip():
paragraphs.append(para.text)
# 提取表格内容
tables_data = []
for i, table in enumerate(doc.tables):
table_rows = []
for row in table.rows:
row_data = [cell.text.strip() for cell in row.cells]
table_rows.append(row_data)
if table_rows:
tables_data.append({
"table_index": i,
"rows": table_rows,
"row_count": len(table_rows),
"column_count": len(table_rows[0]) if table_rows else 0
})
# 合并所有文本
full_text = "\n".join(paragraphs)
# 构建元数据
metadata = {
"filename": path.name,
"extension": path.suffix.lower(),
"file_size": path.stat().st_size,
"paragraph_count": len(paragraphs),
"table_count": len(tables_data),
"word_count": len(full_text),
"char_count": len(full_text.replace("\n", "")),
"has_tables": len(tables_data) > 0
}
# 返回结果
return ParseResult(
success=True,
data={
"content": full_text,
"paragraphs": paragraphs,
"tables": tables_data,
"word_count": len(full_text),
"structured_data": {
"paragraphs": paragraphs,
"tables": tables_data
}
},
metadata=metadata
)
except Exception as e:
logger.error(f"解析 Word 文档失败: {str(e)}")
return ParseResult(
success=False,
error=f"解析 Word 文档失败: {str(e)}"
)
def extract_key_sentences(self, text: str, max_sentences: int = 10) -> List[str]:
"""
从文本中提取关键句子
Args:
text: 文本内容
max_sentences: 最大句子数
Returns:
关键句子列表
"""
# 简单实现按句号分割取前N个句子
sentences = [s.strip() for s in text.split("") if s.strip()]
return sentences[:max_sentences]
def extract_structured_fields(self, text: str) -> Dict[str, Any]:
"""
尝试提取结构化字段
针对合同、简历等有固定格式的文档
Args:
text: 文本内容
Returns:
提取的字段字典
"""
fields = {}
# 常见字段模式
patterns = {
"姓名": r"姓名[:]\s*(\S+)",
"电话": r"电话[:]\s*(\d{11}|\d{3}-\d{8})",
"邮箱": r"邮箱[:]\s*(\S+@\S+)",
"地址": r"地址[:]\s*(.+?)(?:\n|$)",
"金额": r"金额[:]\s*(\d+(?:\.\d+)?)",
"日期": r"日期[:]\s*(\d{4}[年/-]\d{1,2}[月/-]\d{1,2})",
}
import re
for field_name, pattern in patterns.items():
match = re.search(pattern, text)
if match:
fields[field_name] = match.group(1)
return fields
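
A quick driver for the new parser, as a sketch. The file resume.docx is a hypothetical input, and the field labels follow the Chinese patterns defined above:

    parser = DocxParser()
    result = parser.parse("resume.docx")  # hypothetical input file
    if result.success:
        # Fixed-layout fields (name, phone, email) pulled from the plain text
        print(parser.extract_structured_fields(result.data["content"]))
        # First three sentences as a crude summary
        print(parser.extract_key_sentences(result.data["content"], max_sentences=3))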

View File — md_parser.py

@@ -0,0 +1,262 @@
"""
Markdown 文档解析器
"""
import logging
from pathlib import Path
from typing import Any, Dict, List, Optional
import markdown
from .base import BaseParser, ParseResult
logger = logging.getLogger(__name__)
class MarkdownParser(BaseParser):
"""Markdown 文档解析器"""
def __init__(self):
super().__init__()
self.supported_extensions = ['.md', '.markdown']
self.parser_name = "markdown_parser"
def parse(
self,
file_path: str,
**kwargs
) -> ParseResult:
"""
解析 Markdown 文档
Args:
file_path: 文件路径
**kwargs: 其他参数
Returns:
ParseResult: 解析结果
"""
path = Path(file_path)
# 检查文件是否存在
if not path.exists():
return ParseResult(
success=False,
error=f"文件不存在: {file_path}"
)
# 检查文件扩展名
if path.suffix.lower() not in self.supported_extensions:
return ParseResult(
success=False,
error=f"不支持的文件类型: {path.suffix}"
)
try:
# 读取文件内容
with open(file_path, 'r', encoding='utf-8') as f:
raw_content = f.read()
# 解析 Markdown
md = markdown.Markdown(extensions=[
'markdown.extensions.tables',
'markdown.extensions.fenced_code',
'markdown.extensions.codehilite',
'markdown.extensions.toc',
])
html_content = md.convert(raw_content)
# 提取标题结构
titles = self._extract_titles(raw_content)
# 提取代码块
code_blocks = self._extract_code_blocks(raw_content)
# 提取表格
tables = self._extract_tables(raw_content)
# 提取链接和图片
links_images = self._extract_links_images(raw_content)
# 清理后的纯文本(去除 Markdown 语法)
plain_text = self._strip_markdown(raw_content)
# 构建元数据
metadata = {
"filename": path.name,
"extension": path.suffix.lower(),
"file_size": path.stat().st_size,
"word_count": len(plain_text),
"char_count": len(raw_content),
"line_count": len(raw_content.splitlines()),
"title_count": len(titles),
"code_block_count": len(code_blocks),
"table_count": len(tables),
"link_count": len(links_images.get("links", [])),
"image_count": len(links_images.get("images", [])),
}
return ParseResult(
success=True,
data={
"content": plain_text,
"raw_content": raw_content,
"html_content": html_content,
"titles": titles,
"code_blocks": code_blocks,
"tables": tables,
"links_images": links_images,
"word_count": len(plain_text),
"structured_data": {
"titles": titles,
"code_blocks": code_blocks,
"tables": tables
}
},
metadata=metadata
)
except Exception as e:
logger.error(f"解析 Markdown 文档失败: {str(e)}")
return ParseResult(
success=False,
error=f"解析 Markdown 文档失败: {str(e)}"
)
def _extract_titles(self, content: str) -> List[Dict[str, Any]]:
"""提取标题结构"""
import re
titles = []
# 匹配 # 标题
for match in re.finditer(r'^(#{1,6})\s+(.+)$', content, re.MULTILINE):
level = len(match.group(1))
title_text = match.group(2).strip()
titles.append({
"level": level,
"text": title_text,
"line": content[:match.start()].count('\n') + 1
})
return titles
def _extract_code_blocks(self, content: str) -> List[Dict[str, str]]:
"""提取代码块"""
import re
code_blocks = []
# 匹配 ```code ``` 格式
pattern = r'```(\w*)\n(.*?)```'
for match in re.finditer(pattern, content, re.DOTALL):
language = match.group(1) or "text"
code = match.group(2).strip()
code_blocks.append({
"language": language,
"code": code
})
return code_blocks
def _extract_tables(self, content: str) -> List[Dict[str, Any]]:
"""提取表格"""
import re
tables = []
# 简单表格匹配(| col1 | col2 | 格式)
lines = content.split('\n')
i = 0
while i < len(lines):
line = lines[i].strip()
# 检查是否是表格行
if line.startswith('|') and line.endswith('|'):
# 找到表头
header_row = [cell.strip() for cell in line.split('|')[1:-1]]
# 检查下一行是否是分隔符
if i + 1 < len(lines) and re.match(r'^\|[\s\-:|]+\|$', lines[i + 1]):
# 跳过分隔符,读取数据行
data_rows = []
for j in range(i + 2, len(lines)):
row_line = lines[j].strip()
if not (row_line.startswith('|') and row_line.endswith('|')):
break
row_data = [cell.strip() for cell in row_line.split('|')[1:-1]]
data_rows.append(row_data)
if header_row and data_rows:
tables.append({
"headers": header_row,
"rows": data_rows,
"row_count": len(data_rows),
"column_count": len(header_row)
})
i = j - 1
i += 1
return tables
def _extract_links_images(self, content: str) -> Dict[str, List[Dict[str, str]]]:
"""提取链接和图片"""
import re
result = {"links": [], "images": []}
# 提取链接 [text](url)
for match in re.finditer(r'\[([^\]]+)\]\(([^\)]+)\)', content):
result["links"].append({
"text": match.group(1),
"url": match.group(2)
})
# 提取图片 ![alt](url)
for match in re.finditer(r'!\[([^\]]*)\]\(([^\)]+)\)', content):
result["images"].append({
"alt": match.group(1),
"url": match.group(2)
})
return result
def _strip_markdown(self, content: str) -> str:
"""去除 Markdown 语法,获取纯文本"""
import re
# 去除代码块
content = re.sub(r'```[\s\S]*?```', '', content)
# 去除行内代码
content = re.sub(r'`[^`]+`', '', content)
# 去除图片
content = re.sub(r'!\[([^\]]*)\]\([^\)]+\)', r'\1', content)
# 去除链接,保留文本
content = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', content)
# 去除标题标记
content = re.sub(r'^#{1,6}\s+', '', content, flags=re.MULTILINE)
# 去除加粗和斜体
content = re.sub(r'\*\*([^\*]+)\*\*', r'\1', content)
content = re.sub(r'\*([^\*]+)\*', r'\1', content)
content = re.sub(r'__([^_]+)__', r'\1', content)
content = re.sub(r'_([^_]+)_', r'\1', content)
# 去除引用标记
content = re.sub(r'^>\s+', '', content, flags=re.MULTILINE)
# 去除列表标记
content = re.sub(r'^[-*+]\s+', '', content, flags=re.MULTILINE)
content = re.sub(r'^\d+\.\s+', '', content, flags=re.MULTILINE)
# 去除水平线
content = re.sub(r'^[-*_]{3,}$', '', content, flags=re.MULTILINE)
# 去除表格分隔符
content = re.sub(r'^\|[\s\-:|]+\|$', '', content, flags=re.MULTILINE)
# 清理多余空行
content = re.sub(r'\n{3,}', '\n\n', content)
return content.strip()
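
A usage sketch for this parser; README.md is a hypothetical input, and the dict keys match the data payload assembled in parse() above:

    parser = MarkdownParser()
    result = parser.parse("README.md")  # hypothetical input file
    if result.success:
        # Reprint the heading outline
        for title in result.data["titles"]:
            print("#" * title["level"], title["text"])
        print(result.metadata["code_block_count"], "code blocks,",
              result.metadata["table_count"], "tables")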

View File — txt_parser.py

@@ -0,0 +1,278 @@
"""
纯文本 (.txt) 解析器
"""
import logging
import re
from pathlib import Path
from typing import Any, Dict, List, Optional
import chardet
from .base import BaseParser, ParseResult
logger = logging.getLogger(__name__)
class TxtParser(BaseParser):
"""纯文本文档解析器"""
def __init__(self):
super().__init__()
self.supported_extensions = ['.txt']
self.parser_name = "txt_parser"
def parse(
self,
file_path: str,
encoding: Optional[str] = None,
**kwargs
) -> ParseResult:
"""
解析文本文件
Args:
file_path: 文件路径
encoding: 指定编码,不指定则自动检测
**kwargs: 其他参数
Returns:
ParseResult: 解析结果
"""
path = Path(file_path)
# 检查文件是否存在
if not path.exists():
return ParseResult(
success=False,
error=f"文件不存在: {file_path}"
)
# 检查文件扩展名
if path.suffix.lower() not in self.supported_extensions:
return ParseResult(
success=False,
error=f"不支持的文件类型: {path.suffix}"
)
try:
# 检测编码
if not encoding:
encoding = self._detect_encoding(file_path)
# 读取文件内容
with open(file_path, 'r', encoding=encoding) as f:
raw_content = f.read()
# 清理文本
content = self._clean_text(raw_content)
# 提取行信息
lines = content.split('\n')
# 估算字数
word_count = len(content.replace('\n', '').replace(' ', ''))
# 构建元数据
metadata = {
"filename": path.name,
"extension": path.suffix.lower(),
"file_size": path.stat().st_size,
"encoding": encoding,
"line_count": len(lines),
"word_count": word_count,
"char_count": len(content),
"non_empty_line_count": len([l for l in lines if l.strip()])
}
return ParseResult(
success=True,
data={
"content": content,
"raw_content": raw_content,
"lines": lines,
"word_count": word_count,
"char_count": len(content),
"line_count": len(lines),
"structured_data": {
"line_count": len(lines),
"non_empty_line_count": metadata["non_empty_line_count"]
}
},
metadata=metadata
)
except Exception as e:
logger.error(f"解析文本文件失败: {str(e)}")
return ParseResult(
success=False,
error=f"解析文本文件失败: {str(e)}"
)
def _detect_encoding(self, file_path: str) -> str:
"""
自动检测文件编码
Args:
file_path: 文件路径
Returns:
检测到的编码
"""
try:
with open(file_path, 'rb') as f:
raw_data = f.read()
result = chardet.detect(raw_data)
encoding = result.get('encoding', 'utf-8')
# 验证编码是否有效
if encoding:
try:
raw_data.decode(encoding)
return encoding
except (UnicodeDecodeError, LookupError):
pass
return 'utf-8'
except Exception as e:
logger.warning(f"编码检测失败,使用默认编码: {str(e)}")
return 'utf-8'
def _clean_text(self, text: str) -> str:
"""
清理文本内容
- 去除多余空白字符
- 规范化换行符
- 去除特殊控制字符
Args:
text: 原始文本
Returns:
清理后的文本
"""
# 规范化换行符
text = text.replace('\r\n', '\n').replace('\r', '\n')
# 去除控制字符除了换行和tab
text = re.sub(r'[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f]', '', text)
# 将多个连续空格合并为一个
text = re.sub(r'[ \t]+', ' ', text)
# 将多个连续空行合并为一个
text = re.sub(r'\n{3,}', '\n\n', text)
return text.strip()
def extract_structured_data(self, content: str) -> Dict[str, Any]:
"""
尝试从文本中提取结构化数据
支持提取:
- 邮箱地址
- URL
- 电话号码
- 日期
- 金额
Args:
content: 文本内容
Returns:
结构化数据字典
"""
data = {
"emails": [],
"urls": [],
"phones": [],
"dates": [],
"amounts": []
}
# 提取邮箱
emails = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', content)
data["emails"] = list(set(emails))
# 提取 URL
urls = re.findall(r'https?://[^\s<>"{}|\\^`\[\]]+', content)
data["urls"] = list(set(urls))
# 提取电话号码 (支持多种格式)
phone_patterns = [
r'1[3-9]\d{9}', # 手机号
r'\d{3,4}-\d{7,8}', # 固话
]
phones = []
for pattern in phone_patterns:
phones.extend(re.findall(pattern, content))
data["phones"] = list(set(phones))
# 提取日期
date_patterns = [
r'\d{4}[-/年]\d{1,2}[-/月]\d{1,2}[日]?',
r'\d{4}\.\d{1,2}\.\d{1,2}',
]
dates = []
for pattern in date_patterns:
dates.extend(re.findall(pattern, content))
data["dates"] = list(set(dates))
# 提取金额
amount_patterns = [
r'¥\s*\d+(?:\.\d{1,2})?',
r'\$\s*\d+(?:\.\d{1,2})?',
r'\d+(?:\.\d{1,2})?\s*元',
]
amounts = []
for pattern in amount_patterns:
amounts.extend(re.findall(pattern, content))
data["amounts"] = list(set(amounts))
return data
def split_into_chunks(
self,
content: str,
chunk_size: int = 1000,
overlap: int = 100
) -> List[str]:
"""
将长文本分割成块
用于 RAG 索引或 LLM 处理
Args:
content: 文本内容
chunk_size: 每块字符数
overlap: 块之间的重叠字符数
Returns:
文本块列表
"""
if len(content) <= chunk_size:
return [content]
chunks = []
start = 0
while start < len(content):
end = start + chunk_size
chunk = content[start:end]
# 尝试在句子边界分割
if end < len(content):
last_period = chunk.rfind('')
last_newline = chunk.rfind('\n')
split_pos = max(last_period, last_newline)
if split_pos > chunk_size // 2:
chunk = chunk[:split_pos + 1]
end = start + split_pos + 1
chunks.append(chunk)
start = end - overlap if end < len(content) else end
return chunks
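
Finally, a sketch of the text parser feeding the chunking step, e.g. for RAG indexing; notes.txt is a hypothetical input:

    parser = TxtParser()
    result = parser.parse("notes.txt")  # encoding auto-detected via chardet when not given
    if result.success:
        chunks = parser.split_into_chunks(result.data["content"], chunk_size=500, overlap=50)
        print(f"{len(chunks)} chunks, encoding={result.metadata['encoding']}")
        extracted = parser.extract_structured_data(result.data["content"])
        print(extracted["emails"], extracted["phones"])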