【智能助手增强】
- 新增对话历史管理:MongoDB新增conversations集合,存储用户与AI的对话上下文,支持多轮对话意图延续
- 新增对话历史API(conversation.py):支持获取(GET)和删除(DELETE)对话历史,以及列出所有会话
- 意图解析增强:支持基于对话历史的意图识别,上下文理解更准确
- 字段提取优化:支持"提取文档中的医院数量"等自然语言模式,智能去除"文档中的"前缀
- 文档对比优化:从指令中提取文件名并精确匹配source_docs,支持"对比A和B两个文档"
- 文档摘要优化:使用LLM生成真实AI摘要而非返回原始文档预览
【Word模板填表核心功能】
- Word模板字段生成:空白Word上传后,自动从源文档(Excel/Word/TXT/MD)内容AI生成字段名
- Word模板填表(_fill_docx):将提取数据写入Word模板表格,支持精确匹配、模糊匹配、追加新行
- 数据润色(_polish_word_filled_data):LLM对多行Excel数据进行统计归纳(合计/平均/极值),转化为专业自然语言描述
- 段落格式输出:使用📌字段名+值段落+分隔线(灰色横线)格式,提升可读性
- 导出链打通:fill_template返回filled_file_path,export直接返回已填好的Word文件
【其他修复】
- 修复Word导出Windows文件锁问题:NamedTemporaryFile改为mkstemp+close
- 修复Word中非法字符显示为方框的问题:扩展clean_text,移除\uFFFD、□等Unicode替换字符以及零宽字符
- 修复文档对比误报"需要至少2个文档"的问题:优先按指令中提取的具体文件名匹配,而非直接取前2个文档
- 修复导出format硬编码:自动识别docx/xlsx格式
- Docx解析器增加备用解析方法和更完整的段落/表格/标题提取
- RAG服务新增MySQL数据源支持
This commit is contained in:
@@ -44,6 +44,22 @@ class DocxParser(BaseParser):
|
||||
error=f"文件不存在: {file_path}"
|
||||
)
|
||||
|
||||
# 尝试使用 python-docx 解析,失败则使用备用方法
|
||||
try:
|
||||
return self._parse_with_docx(path)
|
||||
except Exception as e:
|
||||
logger.warning(f"python-docx 解析失败,使用备用方法: {e}")
|
||||
try:
|
||||
return self._parse_fallback(path)
|
||||
except Exception as fallback_error:
|
||||
logger.error(f"备用解析方法也失败: {fallback_error}")
|
||||
return ParseResult(
|
||||
success=False,
|
||||
error=f"解析 Word 文档失败: {str(e)}"
|
||||
)
|
||||
|
||||
def _parse_with_docx(self, path: Path) -> ParseResult:
|
||||
"""使用 python-docx 解析文档"""
|
||||
# 检查文件扩展名
|
||||
if path.suffix.lower() not in self.supported_extensions:
|
||||
return ParseResult(
|
||||
@@ -51,98 +67,177 @@ class DocxParser(BaseParser):
|
||||
error=f"不支持的文件类型: {path.suffix}"
|
||||
)
|
||||
|
||||
# 读取 Word 文档
|
||||
doc = Document(path)
|
||||
|
||||
# 提取文本内容
|
||||
paragraphs = []
|
||||
for para in doc.paragraphs:
|
||||
if para.text.strip():
|
||||
paragraphs.append({
|
||||
"text": para.text,
|
||||
"style": str(para.style.name) if para.style else "Normal"
|
||||
})
|
||||
|
||||
# 提取段落纯文本(用于 AI 解析)
|
||||
paragraphs_text = [p["text"] for p in paragraphs if p["text"].strip()]
|
||||
|
||||
# 提取表格内容
|
||||
tables_data = []
|
||||
for i, table in enumerate(doc.tables):
|
||||
table_rows = []
|
||||
for row in table.rows:
|
||||
row_data = [cell.text.strip() for cell in row.cells]
|
||||
table_rows.append(row_data)
|
||||
|
||||
if table_rows:
|
||||
tables_data.append({
|
||||
"table_index": i,
|
||||
"rows": table_rows,
|
||||
"row_count": len(table_rows),
|
||||
"column_count": len(table_rows[0]) if table_rows else 0
|
||||
})
|
||||
|
||||
# 提取图片/嵌入式对象信息
|
||||
images_info = self._extract_images_info(doc, path)
|
||||
|
||||
# 合并所有文本(包括图片描述)
|
||||
full_text_parts = []
|
||||
full_text_parts.append("【文档正文】")
|
||||
full_text_parts.extend(paragraphs_text)
|
||||
|
||||
if tables_data:
|
||||
full_text_parts.append("\n【文档表格】")
|
||||
for idx, table in enumerate(tables_data):
|
||||
full_text_parts.append(f"--- 表格 {idx + 1} ---")
|
||||
for row in table["rows"]:
|
||||
full_text_parts.append(" | ".join(str(cell) for cell in row))
|
||||
|
||||
if images_info.get("image_count", 0) > 0:
|
||||
full_text_parts.append(f"\n【文档图片】文档包含 {images_info['image_count']} 张图片/图表")
|
||||
|
||||
full_text = "\n".join(full_text_parts)
|
||||
|
||||
# 构建元数据
|
||||
metadata = {
|
||||
"filename": path.name,
|
||||
"extension": path.suffix.lower(),
|
||||
"paragraph_count": len(paragraphs),
|
||||
"table_count": len(tables_data),
|
||||
"image_count": images_info.get("image_count", 0)
|
||||
}
|
||||
|
||||
return ParseResult(
|
||||
success=True,
|
||||
data={
|
||||
"content": full_text,
|
||||
"paragraphs": paragraphs,
|
||||
"paragraphs_with_style": paragraphs,
|
||||
"tables": tables_data,
|
||||
"images": images_info
|
||||
},
|
||||
metadata=metadata
|
||||
)
|
||||
|
||||
def _parse_fallback(self, path: Path) -> ParseResult:
|
||||
"""备用解析方法:直接解析 docx 的 XML 结构"""
|
||||
import zipfile
|
||||
from xml.etree import ElementTree as ET
|
||||
|
||||
try:
|
||||
# 读取 Word 文档
|
||||
doc = Document(file_path)
|
||||
with zipfile.ZipFile(path, 'r') as zf:
|
||||
# 读取 document.xml
|
||||
if 'word/document.xml' not in zf.namelist():
|
||||
return ParseResult(success=False, error="无效的 docx 文件格式")
|
||||
|
||||
# 提取文本内容
|
||||
paragraphs = []
|
||||
for para in doc.paragraphs:
|
||||
if para.text.strip():
|
||||
paragraphs.append({
|
||||
"text": para.text,
|
||||
"style": str(para.style.name) if para.style else "Normal"
|
||||
xml_content = zf.read('word/document.xml')
|
||||
root = ET.fromstring(xml_content)
|
||||
|
||||
# 命名空间
|
||||
namespaces = {
|
||||
'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
|
||||
}
|
||||
|
||||
paragraphs = []
|
||||
tables = []
|
||||
current_table = []
|
||||
|
||||
for elem in root.iter():
|
||||
if elem.tag.endswith('}p'): # 段落
|
||||
text_parts = []
|
||||
for t in elem.iter():
|
||||
if t.tag.endswith('}t') and t.text:
|
||||
text_parts.append(t.text)
|
||||
text = ''.join(text_parts).strip()
|
||||
if text:
|
||||
paragraphs.append({'text': text, 'style': 'Normal'})
|
||||
elif elem.tag.endswith('}tr'): # 表格行
|
||||
row_data = []
|
||||
for tc in elem.iter():
|
||||
if tc.tag.endswith('}tc'): # 单元格
|
||||
cell_text = []
|
||||
for t in tc.iter():
|
||||
if t.tag.endswith('}t') and t.text:
|
||||
cell_text.append(t.text)
|
||||
row_data.append(''.join(cell_text).strip())
|
||||
if row_data:
|
||||
current_table.append(row_data)
|
||||
else:
|
||||
# 表格结束,保存
|
||||
if current_table:
|
||||
tables.append({
|
||||
'table_index': len(tables),
|
||||
'rows': current_table,
|
||||
'row_count': len(current_table),
|
||||
'column_count': len(current_table[0]) if current_table else 0
|
||||
})
|
||||
current_table = []
|
||||
|
||||
# 保存最后一张表格
|
||||
if current_table:
|
||||
tables.append({
|
||||
'table_index': len(tables),
|
||||
'rows': current_table,
|
||||
'row_count': len(current_table),
|
||||
'column_count': len(current_table[0]) if current_table else 0
|
||||
})
|
||||
|
||||
# 提取段落纯文本(用于 AI 解析)
|
||||
paragraphs_text = [p["text"] for p in paragraphs if p["text"].strip()]
|
||||
# 构建文本
|
||||
paragraphs_text = [p["text"] for p in paragraphs]
|
||||
full_text_parts = ["【文档正文】"] + paragraphs_text
|
||||
|
||||
# 提取表格内容
|
||||
tables_data = []
|
||||
for i, table in enumerate(doc.tables):
|
||||
table_rows = []
|
||||
for row in table.rows:
|
||||
row_data = [cell.text.strip() for cell in row.cells]
|
||||
table_rows.append(row_data)
|
||||
if tables:
|
||||
full_text_parts.append("\n【文档表格】")
|
||||
for idx, table in enumerate(tables):
|
||||
full_text_parts.append(f"--- 表格 {idx + 1} ---")
|
||||
for row in table["rows"]:
|
||||
full_text_parts.append(" | ".join(str(cell) for cell in row))
|
||||
|
||||
if table_rows:
|
||||
tables_data.append({
|
||||
"table_index": i,
|
||||
"rows": table_rows,
|
||||
"row_count": len(table_rows),
|
||||
"column_count": len(table_rows[0]) if table_rows else 0
|
||||
})
|
||||
full_text = "\n".join(full_text_parts)
|
||||
|
||||
# 提取图片/嵌入式对象信息
|
||||
images_info = self._extract_images_info(doc, path)
|
||||
|
||||
# 合并所有文本(包括图片描述)
|
||||
full_text_parts = []
|
||||
full_text_parts.append("【文档正文】")
|
||||
full_text_parts.extend(paragraphs_text)
|
||||
|
||||
if tables_data:
|
||||
full_text_parts.append("\n【文档表格】")
|
||||
for idx, table in enumerate(tables_data):
|
||||
full_text_parts.append(f"--- 表格 {idx + 1} ---")
|
||||
for row in table["rows"]:
|
||||
full_text_parts.append(" | ".join(str(cell) for cell in row))
|
||||
|
||||
if images_info.get("image_count", 0) > 0:
|
||||
full_text_parts.append(f"\n【文档图片】文档包含 {images_info['image_count']} 张图片/图表")
|
||||
|
||||
full_text = "\n".join(full_text_parts)
|
||||
|
||||
# 构建元数据
|
||||
metadata = {
|
||||
"filename": path.name,
|
||||
"extension": path.suffix.lower(),
|
||||
"file_size": path.stat().st_size,
|
||||
"paragraph_count": len(paragraphs),
|
||||
"table_count": len(tables_data),
|
||||
"word_count": len(full_text),
|
||||
"char_count": len(full_text.replace("\n", "")),
|
||||
"has_tables": len(tables_data) > 0,
|
||||
"has_images": images_info.get("image_count", 0) > 0,
|
||||
"image_count": images_info.get("image_count", 0)
|
||||
}
|
||||
|
||||
# 返回结果
|
||||
return ParseResult(
|
||||
success=True,
|
||||
data={
|
||||
"content": full_text,
|
||||
"paragraphs": paragraphs_text,
|
||||
"paragraphs_with_style": paragraphs,
|
||||
"tables": tables_data,
|
||||
"images": images_info,
|
||||
"word_count": len(full_text),
|
||||
"structured_data": {
|
||||
return ParseResult(
|
||||
success=True,
|
||||
data={
|
||||
"content": full_text,
|
||||
"paragraphs": paragraphs,
|
||||
"paragraphs_text": paragraphs_text,
|
||||
"tables": tables_data,
|
||||
"images": images_info
|
||||
"paragraphs_with_style": paragraphs,
|
||||
"tables": tables,
|
||||
"images": {"image_count": 0, "descriptions": []}
|
||||
},
|
||||
metadata={
|
||||
"filename": path.name,
|
||||
"extension": path.suffix.lower(),
|
||||
"paragraph_count": len(paragraphs),
|
||||
"table_count": len(tables),
|
||||
"image_count": 0,
|
||||
"parse_method": "fallback_xml"
|
||||
}
|
||||
},
|
||||
metadata=metadata
|
||||
)
|
||||
)
|
||||
|
||||
except zipfile.BadZipFile:
|
||||
return ParseResult(success=False, error="无效的 ZIP/文档文件")
|
||||
except Exception as e:
|
||||
logger.error(f"解析 Word 文档失败: {str(e)}")
|
||||
return ParseResult(
|
||||
success=False,
|
||||
error=f"解析 Word 文档失败: {str(e)}"
|
||||
)
|
||||
return ParseResult(success=False, error=f"备用解析失败: {str(e)}")
|
||||
|
||||
def extract_images_as_base64(self, file_path: str) -> List[Dict[str, str]]:
|
||||
"""
|
||||
@@ -197,6 +292,83 @@ class DocxParser(BaseParser):
|
||||
logger.info(f"共提取 {len(images)} 张图片")
|
||||
return images
|
||||
|
||||
def extract_text_from_images(self, file_path: str, lang: str = 'chi_sim+eng') -> Dict[str, Any]:
    """
    Run OCR over the images embedded in a Word document.

    The .docx file is opened as a ZIP archive and every entry under
    ``word/media/`` with a png/jpg/jpeg/gif/bmp extension is passed to
    Tesseract through ``pytesseract``. A failure on a single image is
    logged and does not stop the remaining images from being processed.

    Args:
        file_path: Path to the Word (.docx) file.
        lang: Tesseract language code(s); defaults to Simplified Chinese
            plus English (``chi_sim+eng``).

    Returns:
        A dict with the keys:
            success: False when an OCR dependency (Pillow or pytesseract)
                is missing or the archive cannot be processed, True
                otherwise.
            image_count: Number of images that produced non-empty text.
            extracted_text: One entry per such image, holding
                ``image_index`` (position in the ``word/media/`` listing),
                ``filename``, ``text`` and ``char_count``.
            total_chars: Sum of ``char_count`` over all entries.
            error: Present only when ``success`` is False.
    """
    import zipfile
    from io import BytesIO

    def dependency_missing(package: str) -> Dict[str, Any]:
        # Same shape as the normal result (including total_chars) so callers
        # can read every key without checking which path produced the dict.
        logger.warning(f"{package} 未安装,OCR 功能不可用")
        return {
            "success": False,
            "error": f"{package} 未安装,请运行: pip install {package}",
            "image_count": 0,
            "extracted_text": [],
            "total_chars": 0,
        }

    # Pillow is checked first: pytesseract imports PIL itself, so a missing
    # Pillow would otherwise be misreported as a missing pytesseract.
    try:
        from PIL import Image
    except ImportError:
        return dependency_missing("Pillow")

    try:
        import pytesseract
    except ImportError:
        return dependency_missing("pytesseract")

    image_suffixes = ('.png', '.jpg', '.jpeg', '.gif', '.bmp')

    results = {
        "success": True,
        "image_count": 0,
        "extracted_text": [],
        "total_chars": 0,
    }

    try:
        with zipfile.ZipFile(file_path, 'r') as zf:
            # Embedded pictures live under word/media/ inside the archive.
            media_files = [f for f in zf.namelist() if f.startswith('word/media/')]

            for idx, filename in enumerate(media_files):
                if not filename.lower().endswith(image_suffixes):
                    continue

                try:
                    # The context manager releases the decoded image as soon
                    # as OCR is done instead of leaving it to the GC.
                    with Image.open(BytesIO(zf.read(filename))) as image:
                        text = pytesseract.image_to_string(image, lang=lang).strip()

                    if text:
                        results["extracted_text"].append({
                            "image_index": idx,
                            "filename": filename,
                            "text": text,
                            "char_count": len(text),
                        })
                        results["total_chars"] += len(text)

                    logger.info("图片 %s OCR 识别完成,提取 %d 字符", filename, len(text))

                except Exception as e:
                    # Best effort: one unreadable image must not abort the rest.
                    logger.warning("图片 %s OCR 识别失败: %s", filename, e)

            # Counts only the images that yielded text, not every image scanned.
            results["image_count"] = len(results["extracted_text"])

    except zipfile.BadZipFile:
        results["success"] = False
        results["error"] = "无效的 Word 文档文件"
    except Exception as e:
        results["success"] = False
        results["error"] = f"OCR 处理失败: {str(e)}"

    return results
|
||||
|
||||
def extract_key_sentences(self, text: str, max_sentences: int = 10) -> List[str]:
|
||||
"""
|
||||
从文本中提取关键句子
|
||||
|
||||
Reference in New Issue
Block a user