添加其他格式文档的解析
This commit is contained in:
@@ -2,26 +2,29 @@
|
||||
文档解析模块 - 支持多种文件格式的解析
|
||||
"""
|
||||
from pathlib import Path
|
||||
from typing import Dict, Optional
|
||||
from typing import Dict
|
||||
|
||||
from .base import BaseParser, ParseResult
|
||||
from .xlsx_parser import XlsxParser
|
||||
|
||||
# 导入其他解析器 (需要先实现)
|
||||
# from .docx_parser import DocxParser
|
||||
# from .md_parser import MarkdownParser
|
||||
# from .txt_parser import TxtParser
|
||||
from .docx_parser import DocxParser
|
||||
from .md_parser import MarkdownParser
|
||||
from .txt_parser import TxtParser
|
||||
|
||||
|
||||
class ParserFactory:
|
||||
"""解析器工厂,根据文件类型返回对应解析器"""
|
||||
|
||||
_parsers: Dict[str, BaseParser] = {
|
||||
# Excel
|
||||
'.xlsx': XlsxParser(),
|
||||
'.xls': XlsxParser(),
|
||||
# '.docx': DocxParser(), # TODO: 待实现
|
||||
# '.md': MarkdownParser(), # TODO: 待实现
|
||||
# '.txt': TxtParser(), # TODO: 待实现
|
||||
# Word
|
||||
'.docx': DocxParser(),
|
||||
# Markdown
|
||||
'.md': MarkdownParser(),
|
||||
'.markdown': MarkdownParser(),
|
||||
# 文本
|
||||
'.txt': TxtParser(),
|
||||
}
|
||||
|
||||
@classmethod
|
||||
@@ -30,7 +33,8 @@ class ParserFactory:
|
||||
ext = Path(file_path).suffix.lower()
|
||||
parser = cls._parsers.get(ext)
|
||||
if not parser:
|
||||
raise ValueError(f"不支持的文件格式: {ext},支持的格式: {list(cls._parsers.keys())}")
|
||||
supported = list(cls._parsers.keys())
|
||||
raise ValueError(f"不支持的文件格式: {ext},支持的格式: {supported}")
|
||||
return parser
|
||||
|
||||
@classmethod
|
||||
@@ -44,5 +48,18 @@ class ParserFactory:
|
||||
"""注册新的解析器"""
|
||||
cls._parsers[ext.lower()] = parser
|
||||
|
||||
@classmethod
|
||||
def get_supported_extensions(cls) -> list:
|
||||
"""获取所有支持的扩展名"""
|
||||
return list(cls._parsers.keys())
|
||||
|
||||
__all__ = ['BaseParser', 'ParseResult', 'XlsxParser', 'ParserFactory']
|
||||
|
||||
__all__ = [
|
||||
'BaseParser',
|
||||
'ParseResult',
|
||||
'ParserFactory',
|
||||
'XlsxParser',
|
||||
'DocxParser',
|
||||
'MarkdownParser',
|
||||
'TxtParser',
|
||||
]
|
||||
|
||||
163
backend/app/core/document_parser/docx_parser.py
Normal file
163
backend/app/core/document_parser/docx_parser.py
Normal file
@@ -0,0 +1,163 @@
|
||||
"""
|
||||
Word 文档 (.docx) 解析器
|
||||
"""
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from docx import Document
|
||||
|
||||
from .base import BaseParser, ParseResult
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DocxParser(BaseParser):
    """Parser for Word (.docx) documents.

    Uses python-docx to extract paragraph text and table contents and
    returns them in a ParseResult together with basic document metadata.
    """

    def __init__(self):
        super().__init__()
        self.supported_extensions = ['.docx']
        self.parser_name = "docx_parser"

    def parse(
        self,
        file_path: str,
        **kwargs
    ) -> ParseResult:
        """
        Parse a Word document.

        Args:
            file_path: Path to the .docx file.
            **kwargs: Unused; accepted for interface compatibility.

        Returns:
            ParseResult: success=True with content/paragraphs/tables and
            metadata, or success=False with an error message.
        """
        path = Path(file_path)

        # Reject missing files early instead of letting Document() raise.
        if not path.exists():
            return ParseResult(
                success=False,
                error=f"文件不存在: {file_path}"
            )

        # Only .docx is supported (python-docx cannot read legacy .doc).
        if path.suffix.lower() not in self.supported_extensions:
            return ParseResult(
                success=False,
                error=f"不支持的文件类型: {path.suffix}"
            )

        try:
            doc = Document(file_path)

            # Non-empty paragraph texts, in document order.
            paragraphs = [para.text for para in doc.paragraphs if para.text.strip()]

            # Each table becomes a list of rows of stripped cell texts.
            tables_data = []
            for i, table in enumerate(doc.tables):
                table_rows = [
                    [cell.text.strip() for cell in row.cells]
                    for row in table.rows
                ]
                if table_rows:
                    tables_data.append({
                        "table_index": i,
                        "rows": table_rows,
                        "row_count": len(table_rows),
                        "column_count": len(table_rows[0]) if table_rows else 0
                    })

            # All paragraph text joined with newlines.
            full_text = "\n".join(paragraphs)

            metadata = {
                "filename": path.name,
                "extension": path.suffix.lower(),
                "file_size": path.stat().st_size,
                "paragraph_count": len(paragraphs),
                "table_count": len(tables_data),
                # NOTE(review): "word_count" is the character length of the
                # joined text, not a true word count — value kept as-is for
                # backward compatibility with existing consumers.
                "word_count": len(full_text),
                "char_count": len(full_text.replace("\n", "")),
                "has_tables": len(tables_data) > 0
            }

            return ParseResult(
                success=True,
                data={
                    "content": full_text,
                    "paragraphs": paragraphs,
                    "tables": tables_data,
                    "word_count": len(full_text),
                    "structured_data": {
                        "paragraphs": paragraphs,
                        "tables": tables_data
                    }
                },
                metadata=metadata
            )

        except Exception as e:
            logger.error(f"解析 Word 文档失败: {str(e)}")
            return ParseResult(
                success=False,
                error=f"解析 Word 文档失败: {str(e)}"
            )

    def extract_key_sentences(self, text: str, max_sentences: int = 10) -> List[str]:
        """
        Extract key sentences from text.

        Naive implementation: split on the Chinese full stop ("。") and
        return the first ``max_sentences`` non-empty sentences.

        Args:
            text: Source text.
            max_sentences: Maximum number of sentences to return.

        Returns:
            List of sentences (may be shorter than ``max_sentences``).
        """
        sentences = [s.strip() for s in text.split("。") if s.strip()]
        return sentences[:max_sentences]

    def extract_structured_fields(self, text: str) -> Dict[str, Any]:
        """
        Best-effort extraction of labelled fields from documents with a
        fixed layout, such as contracts or resumes.

        Args:
            text: Source text.

        Returns:
            Mapping of field name to the first matched value; fields
            without a match are omitted.
        """
        import re  # hoisted to the top of the method (was below the patterns)

        # Common "label: value" patterns (both half- and full-width colons).
        patterns = {
            "姓名": r"姓名[::]\s*(\S+)",
            "电话": r"电话[::]\s*(\d{11}|\d{3}-\d{8})",
            "邮箱": r"邮箱[::]\s*(\S+@\S+)",
            "地址": r"地址[::]\s*(.+?)(?:\n|$)",
            "金额": r"金额[::]\s*(\d+(?:\.\d+)?)",
            "日期": r"日期[::]\s*(\d{4}[年/-]\d{1,2}[月/-]\d{1,2})",
        }

        # Only the first occurrence of each field is captured.
        fields = {}
        for field_name, pattern in patterns.items():
            match = re.search(pattern, text)
            if match:
                fields[field_name] = match.group(1)

        return fields
|
||||
262
backend/app/core/document_parser/md_parser.py
Normal file
262
backend/app/core/document_parser/md_parser.py
Normal file
@@ -0,0 +1,262 @@
|
||||
"""
|
||||
Markdown 文档解析器
|
||||
"""
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import markdown
|
||||
|
||||
from .base import BaseParser, ParseResult
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MarkdownParser(BaseParser):
    """Parser for Markdown documents.

    Converts the source to HTML via the ``markdown`` package and extracts
    the title hierarchy, fenced code blocks, pipe tables, links/images,
    and a plain-text rendition with Markdown syntax stripped.
    """

    def __init__(self):
        super().__init__()
        self.supported_extensions = ['.md', '.markdown']
        self.parser_name = "markdown_parser"

    def parse(
        self,
        file_path: str,
        **kwargs
    ) -> ParseResult:
        """
        Parse a Markdown document.

        Args:
            file_path: Path to the Markdown file (read as UTF-8).
            **kwargs: Unused; accepted for interface compatibility.

        Returns:
            ParseResult: parsed content and structure, or success=False
            with an error message.
        """
        path = Path(file_path)

        # Reject missing files early.
        if not path.exists():
            return ParseResult(
                success=False,
                error=f"文件不存在: {file_path}"
            )

        # Reject unsupported extensions.
        if path.suffix.lower() not in self.supported_extensions:
            return ParseResult(
                success=False,
                error=f"不支持的文件类型: {path.suffix}"
            )

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                raw_content = f.read()

            # Extensions needed for tables, fenced code, highlighting, TOC.
            md = markdown.Markdown(extensions=[
                'markdown.extensions.tables',
                'markdown.extensions.fenced_code',
                'markdown.extensions.codehilite',
                'markdown.extensions.toc',
            ])

            html_content = md.convert(raw_content)

            # Structural extraction runs on the raw source, not the HTML.
            titles = self._extract_titles(raw_content)
            code_blocks = self._extract_code_blocks(raw_content)
            tables = self._extract_tables(raw_content)
            links_images = self._extract_links_images(raw_content)

            # Plain text with Markdown syntax removed.
            plain_text = self._strip_markdown(raw_content)

            metadata = {
                "filename": path.name,
                "extension": path.suffix.lower(),
                "file_size": path.stat().st_size,
                "word_count": len(plain_text),
                "char_count": len(raw_content),
                "line_count": len(raw_content.splitlines()),
                "title_count": len(titles),
                "code_block_count": len(code_blocks),
                "table_count": len(tables),
                "link_count": len(links_images.get("links", [])),
                "image_count": len(links_images.get("images", [])),
            }

            return ParseResult(
                success=True,
                data={
                    "content": plain_text,
                    "raw_content": raw_content,
                    "html_content": html_content,
                    "titles": titles,
                    "code_blocks": code_blocks,
                    "tables": tables,
                    "links_images": links_images,
                    "word_count": len(plain_text),
                    "structured_data": {
                        "titles": titles,
                        "code_blocks": code_blocks,
                        "tables": tables
                    }
                },
                metadata=metadata
            )

        except Exception as e:
            logger.error(f"解析 Markdown 文档失败: {str(e)}")
            return ParseResult(
                success=False,
                error=f"解析 Markdown 文档失败: {str(e)}"
            )

    def _extract_titles(self, content: str) -> List[Dict[str, Any]]:
        """Extract ATX headings (``#`` .. ``######``) with level and line number."""
        import re
        titles = []

        for match in re.finditer(r'^(#{1,6})\s+(.+)$', content, re.MULTILINE):
            level = len(match.group(1))
            title_text = match.group(2).strip()
            titles.append({
                "level": level,
                "text": title_text,
                # 1-based line number of the heading within the source.
                "line": content[:match.start()].count('\n') + 1
            })

        return titles

    def _extract_code_blocks(self, content: str) -> List[Dict[str, str]]:
        """Extract fenced code blocks (``` ... ```), with optional language tag."""
        import re
        code_blocks = []

        pattern = r'```(\w*)\n(.*?)```'
        for match in re.finditer(pattern, content, re.DOTALL):
            language = match.group(1) or "text"
            code = match.group(2).strip()
            code_blocks.append({
                "language": language,
                "code": code
            })

        return code_blocks

    def _extract_tables(self, content: str) -> List[Dict[str, Any]]:
        """Extract pipe tables (``| a | b |`` header + separator + data rows)."""
        import re
        tables = []

        lines = content.split('\n')
        i = 0
        while i < len(lines):
            line = lines[i].strip()

            # A candidate header row starts and ends with a pipe.
            if line.startswith('|') and line.endswith('|'):
                header_row = [cell.strip() for cell in line.split('|')[1:-1]]

                # The next line must be the |---|---| separator.
                # Fix: strip the line first so trailing whitespace does not
                # defeat the anchored match.
                if i + 1 < len(lines) and re.match(r'^\|[\s\-:|]+\|$', lines[i + 1].strip()):
                    # Consume consecutive data rows after the separator.
                    data_rows = []
                    for j in range(i + 2, len(lines)):
                        row_line = lines[j].strip()
                        if not (row_line.startswith('|') and row_line.endswith('|')):
                            break
                        row_data = [cell.strip() for cell in row_line.split('|')[1:-1]]
                        data_rows.append(row_data)

                    if header_row and data_rows:
                        tables.append({
                            "headers": header_row,
                            "rows": data_rows,
                            "row_count": len(data_rows),
                            "column_count": len(header_row)
                        })
                        # Resume scanning just past the consumed rows
                        # (j is defined whenever data_rows is non-empty).
                        i = j - 1

            i += 1

        return tables

    def _extract_links_images(self, content: str) -> Dict[str, List[Dict[str, str]]]:
        """Extract hyperlinks and images.

        Bug fix: the link pattern uses a negative lookbehind ``(?<!!)`` so
        that image syntax ``![alt](url)`` is no longer also counted as a
        link (previously every image was double-counted).
        """
        import re
        result = {"links": [], "images": []}

        # Links: [text](url), excluding the [..](..) tail of an image.
        for match in re.finditer(r'(?<!!)\[([^\]]+)\]\(([^\)]+)\)', content):
            result["links"].append({
                "text": match.group(1),
                "url": match.group(2)
            })

        # Images: ![alt](url)
        for match in re.finditer(r'!\[([^\]]*)\]\(([^\)]+)\)', content):
            result["images"].append({
                "alt": match.group(1),
                "url": match.group(2)
            })

        return result

    def _strip_markdown(self, content: str) -> str:
        """Remove Markdown syntax, returning approximate plain text."""
        import re

        # Fenced code blocks are dropped entirely.
        content = re.sub(r'```[\s\S]*?```', '', content)

        # Inline code is dropped.
        content = re.sub(r'`[^`]+`', '', content)

        # Images: keep the alt text.
        content = re.sub(r'!\[([^\]]*)\]\([^\)]+\)', r'\1', content)

        # Links: keep the link text.
        content = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', content)

        # Heading markers.
        content = re.sub(r'^#{1,6}\s+', '', content, flags=re.MULTILINE)

        # Bold and italic (both * and _ forms).
        content = re.sub(r'\*\*([^\*]+)\*\*', r'\1', content)
        content = re.sub(r'\*([^\*]+)\*', r'\1', content)
        content = re.sub(r'__([^_]+)__', r'\1', content)
        content = re.sub(r'_([^_]+)_', r'\1', content)

        # Blockquote markers.
        content = re.sub(r'^>\s+', '', content, flags=re.MULTILINE)

        # List markers (bulleted and numbered).
        content = re.sub(r'^[-*+]\s+', '', content, flags=re.MULTILINE)
        content = re.sub(r'^\d+\.\s+', '', content, flags=re.MULTILINE)

        # Horizontal rules.
        content = re.sub(r'^[-*_]{3,}$', '', content, flags=re.MULTILINE)

        # Table separator rows.
        content = re.sub(r'^\|[\s\-:|]+\|$', '', content, flags=re.MULTILINE)

        # Collapse runs of blank lines.
        content = re.sub(r'\n{3,}', '\n\n', content)

        return content.strip()
|
||||
278
backend/app/core/document_parser/txt_parser.py
Normal file
278
backend/app/core/document_parser/txt_parser.py
Normal file
@@ -0,0 +1,278 @@
|
||||
"""
|
||||
纯文本 (.txt) 解析器
|
||||
"""
|
||||
import logging
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import chardet
|
||||
|
||||
from .base import BaseParser, ParseResult
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TxtParser(BaseParser):
    """Parser for plain-text (.txt) files.

    Detects the file encoding with chardet, normalises the text
    (newlines, control characters, whitespace), and offers helpers for
    structured-data extraction and chunking for RAG/LLM processing.
    """

    def __init__(self):
        super().__init__()
        self.supported_extensions = ['.txt']
        self.parser_name = "txt_parser"

    def parse(
        self,
        file_path: str,
        encoding: Optional[str] = None,
        **kwargs
    ) -> ParseResult:
        """
        Parse a text file.

        Args:
            file_path: Path to the file.
            encoding: Explicit encoding; auto-detected when omitted.
            **kwargs: Unused; accepted for interface compatibility.

        Returns:
            ParseResult: cleaned content plus line/character statistics,
            or success=False with an error message.
        """
        path = Path(file_path)

        # Reject missing files early.
        if not path.exists():
            return ParseResult(
                success=False,
                error=f"文件不存在: {file_path}"
            )

        # Reject unsupported extensions.
        if path.suffix.lower() not in self.supported_extensions:
            return ParseResult(
                success=False,
                error=f"不支持的文件类型: {path.suffix}"
            )

        try:
            # Detect the encoding only when the caller did not specify one.
            if not encoding:
                encoding = self._detect_encoding(file_path)

            with open(file_path, 'r', encoding=encoding) as f:
                raw_content = f.read()

            # Normalise newlines/whitespace and strip control characters.
            content = self._clean_text(raw_content)

            lines = content.split('\n')

            # NOTE(review): "word_count" is the count of non-space,
            # non-newline characters, not whitespace-separated words.
            word_count = len(content.replace('\n', '').replace(' ', ''))

            metadata = {
                "filename": path.name,
                "extension": path.suffix.lower(),
                "file_size": path.stat().st_size,
                "encoding": encoding,
                "line_count": len(lines),
                "word_count": word_count,
                "char_count": len(content),
                "non_empty_line_count": len([l for l in lines if l.strip()])
            }

            return ParseResult(
                success=True,
                data={
                    "content": content,
                    "raw_content": raw_content,
                    "lines": lines,
                    "word_count": word_count,
                    "char_count": len(content),
                    "line_count": len(lines),
                    "structured_data": {
                        "line_count": len(lines),
                        "non_empty_line_count": metadata["non_empty_line_count"]
                    }
                },
                metadata=metadata
            )

        except Exception as e:
            logger.error(f"解析文本文件失败: {str(e)}")
            return ParseResult(
                success=False,
                error=f"解析文本文件失败: {str(e)}"
            )

    def _detect_encoding(self, file_path: str) -> str:
        """
        Auto-detect the file encoding.

        Args:
            file_path: Path to the file.

        Returns:
            The detected encoding, or 'utf-8' when detection fails or the
            detected encoding cannot actually decode the file.
        """
        try:
            # NOTE(review): reads the whole file for detection; fine for
            # typical documents, could sample a prefix for very large files.
            with open(file_path, 'rb') as f:
                raw_data = f.read()

            result = chardet.detect(raw_data)
            encoding = result.get('encoding', 'utf-8')

            # chardet may return None or a wrong guess — verify it decodes.
            if encoding:
                try:
                    raw_data.decode(encoding)
                    return encoding
                except (UnicodeDecodeError, LookupError):
                    pass

            return 'utf-8'

        except Exception as e:
            logger.warning(f"编码检测失败,使用默认编码: {str(e)}")
            return 'utf-8'

    def _clean_text(self, text: str) -> str:
        """
        Clean text content.

        - Normalise line endings to ``\\n``
        - Remove control characters (except newline and tab)
        - Collapse runs of spaces/tabs and runs of blank lines

        Args:
            text: Raw text.

        Returns:
            Cleaned text, stripped of surrounding whitespace.
        """
        # CRLF / CR -> LF.
        text = text.replace('\r\n', '\n').replace('\r', '\n')

        # Control characters except \t (\x09), \n (\x0a), \r (already gone).
        text = re.sub(r'[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f]', '', text)

        # Collapse horizontal whitespace runs to a single space.
        text = re.sub(r'[ \t]+', ' ', text)

        # Collapse 3+ consecutive newlines to a single blank line.
        text = re.sub(r'\n{3,}', '\n\n', text)

        return text.strip()

    def extract_structured_data(self, content: str) -> Dict[str, Any]:
        """
        Best-effort extraction of structured data from free text.

        Extracts: email addresses, URLs, phone numbers (CN mobile and
        landline), dates, and monetary amounts.

        Args:
            content: Text content.

        Returns:
            Dict with keys "emails", "urls", "phones", "dates", "amounts",
            each a de-duplicated list (order not guaranteed).
        """
        data = {
            "emails": [],
            "urls": [],
            "phones": [],
            "dates": [],
            "amounts": []
        }

        # Email addresses.
        emails = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', content)
        data["emails"] = list(set(emails))

        # URLs.
        urls = re.findall(r'https?://[^\s<>"{}|\\^`\[\]]+', content)
        data["urls"] = list(set(urls))

        # Phone numbers (multiple formats).
        phone_patterns = [
            r'1[3-9]\d{9}',  # CN mobile
            r'\d{3,4}-\d{7,8}',  # landline
        ]
        phones = []
        for pattern in phone_patterns:
            phones.extend(re.findall(pattern, content))
        data["phones"] = list(set(phones))

        # Dates.
        date_patterns = [
            r'\d{4}[-/年]\d{1,2}[-/月]\d{1,2}[日]?',
            r'\d{4}\.\d{1,2}\.\d{1,2}',
        ]
        dates = []
        for pattern in date_patterns:
            dates.extend(re.findall(pattern, content))
        data["dates"] = list(set(dates))

        # Monetary amounts.
        amount_patterns = [
            r'¥\s*\d+(?:\.\d{1,2})?',
            r'\$\s*\d+(?:\.\d{1,2})?',
            r'\d+(?:\.\d{1,2})?\s*元',
        ]
        amounts = []
        for pattern in amount_patterns:
            amounts.extend(re.findall(pattern, content))
        data["amounts"] = list(set(amounts))

        return data

    def split_into_chunks(
        self,
        content: str,
        chunk_size: int = 1000,
        overlap: int = 100
    ) -> List[str]:
        """
        Split long text into overlapping chunks.

        Intended for RAG indexing or LLM processing. Chunks prefer to end
        at a sentence ("。") or line boundary when one falls in the second
        half of the chunk.

        Args:
            content: Text content.
            chunk_size: Target characters per chunk.
            overlap: Characters of overlap between consecutive chunks.

        Returns:
            List of text chunks.
        """
        if len(content) <= chunk_size:
            return [content]

        chunks = []
        start = 0

        while start < len(content):
            end = start + chunk_size
            chunk = content[start:end]

            # Prefer splitting at a sentence or line boundary, but only if
            # it lies in the second half of the chunk.
            if end < len(content):
                last_period = chunk.rfind('。')
                last_newline = chunk.rfind('\n')
                split_pos = max(last_period, last_newline)

                if split_pos > chunk_size // 2:
                    chunk = chunk[:split_pos + 1]
                    end = start + split_pos + 1

            chunks.append(chunk)
            # Bug fix: guarantee forward progress. With a large `overlap`
            # relative to the effective chunk advance, `end - overlap`
            # could be <= the previous `start`, looping forever.
            if end < len(content):
                start = max(end - overlap, start + 1)
            else:
                start = end

        return chunks
|
||||
Reference in New Issue
Block a user