88 lines
2.0 KiB
Python
88 lines
2.0 KiB
Python
"""
解析器基类 - 定义所有解析器的通用接口
"""
|
|
from abc import ABC, abstractmethod
|
|
from typing import Any, Dict, List, Optional
|
|
from pathlib import Path
|
|
|
|
|
|
class ParseResult:
    """Container describing the outcome of one parse attempt.

    Attributes:
        success: Whether the parse succeeded.
        data: Parsed payload; an empty dict when none was supplied.
        error: Error message, or ``None`` when not supplied.
        metadata: Extra information about the parse; an empty dict when
            none was supplied.
    """

    def __init__(
        self,
        success: bool,
        data: Optional[Dict[str, Any]] = None,
        error: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None
    ):
        """Store the result fields.

        Args:
            success: Whether the parse succeeded.
            data: Parsed payload. Any falsy value (``None`` or an empty
                dict) is replaced by a new empty dict.
            error: Optional error message.
            metadata: Optional extra information. Any falsy value is
                replaced by a new empty dict.
        """
        self.success = success
        # A falsy argument is swapped for a fresh dict, so instances never
        # share the fallback container.
        self.data = data if data else {}
        self.error = error
        self.metadata = metadata if metadata else {}

    def to_dict(self) -> Dict[str, Any]:
        """Return the result as a plain dictionary.

        Returns:
            Dict[str, Any]: A new dict with the keys ``success``, ``data``,
            ``error`` and ``metadata``, in that order. The values are the
            instance's own objects, not copies.
        """
        field_names = ("success", "data", "error", "metadata")
        return {field: getattr(self, field) for field in field_names}
|
|
|
|
|
|
class BaseParser(ABC):
    """Abstract base class for document parsers.

    Concrete parsers implement :meth:`parse` and are expected to set
    ``supported_extensions`` and ``parser_name`` after calling
    ``super().__init__()``.
    """

    def __init__(self):
        """Initialise the defaults that subclasses override."""
        # File suffixes (including the leading dot, e.g. ".pdf") this parser
        # accepts. Empty in the base class, so nothing matches by default.
        self.supported_extensions: List[str] = []
        # Identifier reported in the "parser" field of get_file_info().
        self.parser_name: str = "base_parser"

    @abstractmethod
    def parse(self, file_path: str, **kwargs) -> "ParseResult":
        """Parse a file.

        Args:
            file_path: Path of the file to parse.
            **kwargs: Additional parser-specific options.

        Returns:
            ParseResult: The outcome of the parse.
        """
        pass

    def can_parse(self, file_path: str) -> bool:
        """Check whether this parser accepts the given file.

        The decision is based only on the file suffix and is
        case-insensitive on both sides: the suffix of ``file_path`` and the
        entries of ``supported_extensions`` are lower-cased before they are
        compared. The file itself is not accessed.

        Args:
            file_path: Path of the file to check.

        Returns:
            bool: ``True`` if the file's suffix is one of the supported
            extensions.
        """
        suffix = Path(file_path).suffix.lower()
        # Lower-case the declared extensions too; otherwise an entry such as
        # ".PDF" could never equal the lower-cased suffix computed above.
        return any(
            suffix == known.lower() for known in self.supported_extensions
        )

    def get_file_info(self, file_path: str) -> Dict[str, Any]:
        """Collect basic information about a file.

        Args:
            file_path: Path of the file to inspect.

        Returns:
            Dict[str, Any]: ``{"error": "File not found"}`` when the path
            cannot be stat-ed; otherwise a dict with ``filename``,
            ``extension`` (lower-cased suffix), ``size`` (``st_size`` in
            bytes) and ``parser`` (this parser's ``parser_name``).
        """
        path = Path(file_path)
        try:
            # One stat() call serves as both the existence check and the
            # size lookup, so the file cannot disappear between a separate
            # exists() check and the stat().
            size = path.stat().st_size
        except (OSError, ValueError):
            # ValueError covers malformed paths such as an embedded NUL.
            return {"error": "File not found"}

        return {
            "filename": path.name,
            "extension": path.suffix.lower(),
            "size": size,
            "parser": self.parser_name,
        }
|