【智能助手增强】

- 新增对话历史管理:MongoDB新增conversations集合,存储用户与AI的对话上下文,支持多轮对话意图延续
- 新增对话历史API(conversation.py):GET/DELETE conversation历史、列出所有会话
- 意图解析增强:支持基于对话历史的意图识别,上下文理解更准确
- 字段提取优化:支持"提取文档中的医院数量"等自然语言模式,智能去除"文档中的"前缀
- 文档对比优化:从指令中提取文件名并精确匹配source_docs,支持"对比A和B两个文档"
- 文档摘要优化:使用LLM生成真实AI摘要而非返回原始文档预览

【Word模板填表核心功能】
- Word模板字段生成:空白Word上传后,自动从源文档(Excel/Word/TXT/MD)内容AI生成字段名
- Word模板填表(_fill_docx):将提取数据写入Word模板表格,支持精确匹配、模糊匹配、追加新行
- 数据润色(_polish_word_filled_data):LLM对多行Excel数据进行统计归纳(合计/平均/极值),转化为专业自然语言描述
- 段落格式输出:使用📌字段名+值段落+分隔线(灰色横线)格式,提升可读性
- 导出链打通:fill_template返回filled_file_path,export直接返回已填好的Word文件

【其他修复】
- 修复Word导出Windows文件锁问题:NamedTemporaryFile改为mkstemp+close
- 修复Word方框非法字符:扩展clean_text移除\uFFFD、□等Unicode替代符和零宽字符
- 修复文档对比"需要至少2个文档":从指令提取具体文件名优先匹配而非取前2个
- 修复导出format硬编码:自动识别docx/xlsx格式
- Docx解析器增加备用解析方法和更完整的段落/表格/标题提取
- RAG服务新增MySQL数据源支持
This commit is contained in:
dj
2026-04-15 23:32:55 +08:00
parent 9e7f9df384
commit e5d4724e82
19 changed files with 2185 additions and 407 deletions

View File

@@ -64,6 +64,11 @@ class MongoDB:
"""任务集合 - 存储任务历史记录"""
return self.db["tasks"]
@property
def conversations(self):
    """Collection holding per-session chat messages (conversation history)."""
    coll = self.db["conversations"]
    return coll
# ==================== 文档操作 ====================
async def insert_document(
@@ -117,14 +122,20 @@ class MongoDB:
搜索文档
Args:
query: 搜索关键词
query: 搜索关键词(支持文件名和内容搜索)
doc_type: 文档类型过滤
limit: 返回数量
Returns:
文档列表
"""
filter_query = {"content": {"$regex": query}}
filter_query = {
"$or": [
{"content": {"$regex": query, "$options": "i"}},
{"metadata.original_filename": {"$regex": query, "$options": "i"}},
{"metadata.filename": {"$regex": query, "$options": "i"}},
]
}
if doc_type:
filter_query["doc_type"] = doc_type
@@ -141,6 +152,15 @@ class MongoDB:
result = await self.documents.delete_one({"_id": ObjectId(doc_id)})
return result.deleted_count > 0
async def update_document_metadata(self, doc_id: str, metadata: Dict[str, Any]) -> bool:
    """Replace a document's ``metadata`` field.

    Args:
        doc_id: Document ObjectId as a hex string.
        metadata: New metadata dict to store.

    Returns:
        True when the stored metadata was actually modified.
    """
    from bson import ObjectId

    outcome = await self.documents.update_one(
        {"_id": ObjectId(doc_id)},
        {"$set": {"metadata": metadata}},
    )
    return bool(outcome.modified_count)
# ==================== RAG 索引操作 ====================
async def insert_rag_entry(
@@ -251,6 +271,10 @@ class MongoDB:
await self.tasks.create_index("task_id", unique=True)
await self.tasks.create_index("created_at")
# 对话集合索引
await self.conversations.create_index("conversation_id")
await self.conversations.create_index("created_at")
logger.info("MongoDB 索引创建完成")
# ==================== 任务历史操作 ====================
@@ -369,6 +393,108 @@ class MongoDB:
result = await self.tasks.delete_one({"task_id": task_id})
return result.deleted_count > 0
# ==================== 对话历史操作 ====================
async def insert_conversation(
    self,
    conversation_id: str,
    role: str,
    content: str,
    intent: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> str:
    """Append one chat message to the conversations collection.

    Args:
        conversation_id: Session identifier the message belongs to.
        role: Speaker role ("user" / "assistant").
        content: Message text.
        intent: Parsed intent label, if any.
        metadata: Extra payload stored alongside the message.

    Returns:
        The inserted document's id as a string.
    """
    record = {
        "conversation_id": conversation_id,
        "role": role,
        "content": content,
        "intent": intent,
        "metadata": {} if metadata is None else metadata,
        "created_at": datetime.utcnow(),
    }
    inserted = await self.conversations.insert_one(record)
    return str(inserted.inserted_id)
async def get_conversation_history(
    self,
    conversation_id: str,
    limit: int = 20,
) -> List[Dict[str, Any]]:
    """Fetch the most recent messages of a conversation, oldest first.

    Bug fix: the previous implementation sorted ascending *before*
    applying ``limit``, which always returned the oldest ``limit``
    messages — once a conversation exceeded the limit, the context
    window never advanced.  We now take the newest ``limit`` messages
    (descending sort + limit) and reverse them back into chronological
    order for the caller.

    Args:
        conversation_id: Session identifier.
        limit: Maximum number of messages to return.

    Returns:
        Message dicts with ``_id`` stringified and ``created_at``
        serialized to ISO format.
    """
    cursor = self.conversations.find(
        {"conversation_id": conversation_id}
    ).sort("created_at", -1).limit(limit)

    messages: List[Dict[str, Any]] = []
    async for msg in cursor:
        msg["_id"] = str(msg["_id"])
        if msg.get("created_at"):
            msg["created_at"] = msg["created_at"].isoformat()
        messages.append(msg)
    # Cursor yields newest-first; callers expect chronological order.
    messages.reverse()
    return messages
async def delete_conversation(self, conversation_id: str) -> bool:
    """Delete every message belonging to the given conversation session.

    Returns:
        True if at least one message was removed.
    """
    outcome = await self.conversations.delete_many(
        {"conversation_id": conversation_id}
    )
    return outcome.deleted_count > 0
async def list_conversations(
    self,
    limit: int = 50,
    skip: int = 0,
) -> List[Dict[str, Any]]:
    """List sessions ordered by most recent activity.

    Each returned entry is the *latest* message of one conversation,
    obtained via an aggregation that groups messages by
    ``conversation_id`` and keeps the newest per group.

    Args:
        limit: Maximum number of sessions to return.
        skip: Number of sessions to skip (pagination offset).

    Returns:
        One serialized message dict per conversation.
    """
    pipeline = [
        {"$sort": {"created_at": -1}},
        {"$group": {
            "_id": "$conversation_id",
            "last_message": {"$first": "$$ROOT"},
        }},
        {"$replaceRoot": {"newRoot": "$last_message"}},
        {"$sort": {"created_at": -1}},
        {"$skip": skip},
        {"$limit": limit},
    ]

    sessions: List[Dict[str, Any]] = []
    async for entry in self.conversations.aggregate(pipeline):
        entry["_id"] = str(entry["_id"])
        if entry.get("created_at"):
            entry["created_at"] = entry["created_at"].isoformat()
        sessions.append(entry)
    return sessions
# ==================== 全局单例 ====================

View File

@@ -44,6 +44,22 @@ class DocxParser(BaseParser):
error=f"文件不存在: {file_path}"
)
# 尝试使用 python-docx 解析,失败则使用备用方法
try:
return self._parse_with_docx(path)
except Exception as e:
logger.warning(f"python-docx 解析失败,使用备用方法: {e}")
try:
return self._parse_fallback(path)
except Exception as fallback_error:
logger.error(f"备用解析方法也失败: {fallback_error}")
return ParseResult(
success=False,
error=f"解析 Word 文档失败: {str(e)}"
)
def _parse_with_docx(self, path: Path) -> ParseResult:
    """Parse a .docx file with python-docx: paragraphs, tables, image info."""
    # Reject unsupported extensions up front.
    if path.suffix.lower() not in self.supported_extensions:
        return ParseResult(
            success=False,
            error=f"不支持的文件类型: {path.suffix}"
        )

    document = Document(path)

    # Non-empty paragraphs, each with its style name.
    paragraphs = [
        {
            "text": para.text,
            "style": str(para.style.name) if para.style else "Normal",
        }
        for para in document.paragraphs
        if para.text.strip()
    ]
    # Plain paragraph text (fed to downstream AI parsing).
    paragraphs_text = [p["text"] for p in paragraphs if p["text"].strip()]

    # Tables as row-major lists of stripped cell strings.
    tables_data = []
    for index, table in enumerate(document.tables):
        rows = [[cell.text.strip() for cell in row.cells] for row in table.rows]
        if rows:
            tables_data.append({
                "table_index": index,
                "rows": rows,
                "row_count": len(rows),
                "column_count": len(rows[0]) if rows else 0,
            })

    # Embedded image / object summary.
    images_info = self._extract_images_info(document, path)

    # Assemble one flat text blob: body, then tables, then image note.
    parts = ["【文档正文】"]
    parts.extend(paragraphs_text)
    if tables_data:
        parts.append("\n【文档表格】")
        for idx, table in enumerate(tables_data):
            parts.append(f"--- 表格 {idx + 1} ---")
            for row in table["rows"]:
                parts.append(" | ".join(str(cell) for cell in row))
    if images_info.get("image_count", 0) > 0:
        parts.append(f"\n【文档图片】文档包含 {images_info['image_count']} 张图片/图表")
    full_text = "\n".join(parts)

    metadata = {
        "filename": path.name,
        "extension": path.suffix.lower(),
        "paragraph_count": len(paragraphs),
        "table_count": len(tables_data),
        "image_count": images_info.get("image_count", 0),
    }

    return ParseResult(
        success=True,
        data={
            "content": full_text,
            "paragraphs": paragraphs,
            "paragraphs_with_style": paragraphs,
            "tables": tables_data,
            "images": images_info,
        },
        metadata=metadata,
    )
def _parse_fallback(self, path: Path) -> ParseResult:
    """Fallback parser: read word/document.xml straight out of the docx zip.

    Used when python-docx fails.  Extracts top-level paragraphs and
    tables only; images are not inspected (reported as zero).

    Bug fix: the previous version walked *every* descendant element with
    ``root.iter()`` and treated any non-``tr`` element as a table
    terminator.  Because a row's own ``tc``/``t`` children follow it in
    document order, each row was flushed as a separate one-row table.
    We now iterate the body's direct children and group rows per ``tbl``
    element, which is the real table boundary.
    """
    import zipfile
    from xml.etree import ElementTree as ET

    # WordprocessingML main namespace.
    W = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"

    def _text(elem) -> str:
        # Concatenate all w:t text runs beneath elem.
        return "".join(t.text for t in elem.iter(f"{{{W}}}t") if t.text).strip()

    try:
        with zipfile.ZipFile(path, "r") as zf:
            if "word/document.xml" not in zf.namelist():
                return ParseResult(success=False, error="无效的 docx 文件格式")
            root = ET.fromstring(zf.read("word/document.xml"))

        paragraphs = []
        tables = []
        body = root.find(f"{{{W}}}body")
        for child in (body if body is not None else root):
            if child.tag == f"{{{W}}}p":  # top-level paragraph
                text = _text(child)
                if text:
                    paragraphs.append({"text": text, "style": "Normal"})
            elif child.tag == f"{{{W}}}tbl":  # whole table element
                rows = []
                for tr in child.iter(f"{{{W}}}tr"):
                    row = [_text(tc) for tc in tr.findall(f"{{{W}}}tc")]
                    if row:
                        rows.append(row)
                if rows:
                    tables.append({
                        "table_index": len(tables),
                        "rows": rows,
                        "row_count": len(rows),
                        "column_count": len(rows[0]) if rows else 0,
                    })

        # Flat text blob mirroring _parse_with_docx's layout.
        paragraphs_text = [p["text"] for p in paragraphs]
        full_text_parts = ["【文档正文】"] + paragraphs_text
        if tables:
            full_text_parts.append("\n【文档表格】")
            for idx, table in enumerate(tables):
                full_text_parts.append(f"--- 表格 {idx + 1} ---")
                for row in table["rows"]:
                    full_text_parts.append(" | ".join(str(cell) for cell in row))
        full_text = "\n".join(full_text_parts)

        return ParseResult(
            success=True,
            data={
                "content": full_text,
                "paragraphs": paragraphs,
                "paragraphs_with_style": paragraphs,
                "tables": tables,
                "images": {"image_count": 0, "descriptions": []},
            },
            metadata={
                "filename": path.name,
                "extension": path.suffix.lower(),
                "paragraph_count": len(paragraphs),
                "table_count": len(tables),
                "image_count": 0,
                "parse_method": "fallback_xml",
            },
        )
    except zipfile.BadZipFile:
        return ParseResult(success=False, error="无效的 ZIP/文档文件")
    except Exception as e:
        return ParseResult(success=False, error=f"备用解析失败: {str(e)}")
def extract_images_as_base64(self, file_path: str) -> List[Dict[str, str]]:
"""
@@ -197,6 +292,83 @@ class DocxParser(BaseParser):
logger.info(f"共提取 {len(images)} 张图片")
return images
def extract_text_from_images(self, file_path: str, lang: str = 'chi_sim+eng') -> Dict[str, Any]:
    """
    Run OCR over the images embedded in a Word document.

    Args:
        file_path: Path to the .docx file.
        lang: Tesseract language code; default is simplified Chinese + English.

    Returns:
        Dict with keys: success, image_count (number of images that
        yielded text), extracted_text (per-image results), total_chars,
        and error on failure.
    """
    import zipfile
    from io import BytesIO

    # Both Pillow and pytesseract are optional dependencies.  The
    # previous version only guarded the pytesseract import, so a missing
    # Pillow raised an uncaught ImportError before OCR even started.
    try:
        from PIL import Image
        import pytesseract
    except ImportError as e:
        logger.warning(f"OCR 依赖未安装,OCR 功能不可用: {e}")
        return {
            "success": False,
            "error": "pytesseract 未安装,请运行: pip install pytesseract",
            "image_count": 0,
            "extracted_text": []
        }

    results = {
        "success": True,
        "image_count": 0,
        "extracted_text": [],
        "total_chars": 0
    }
    try:
        with zipfile.ZipFile(file_path, 'r') as zf:
            # Embedded images live under word/media/ inside the docx zip.
            media_files = [f for f in zf.namelist() if f.startswith('word/media/')]
            for idx, filename in enumerate(media_files):
                ext = filename.split('.')[-1].lower()
                if ext not in ['png', 'jpg', 'jpeg', 'gif', 'bmp']:
                    continue
                try:
                    image = Image.open(BytesIO(zf.read(filename)))
                    text = pytesseract.image_to_string(image, lang=lang).strip()
                    if text:
                        results["extracted_text"].append({
                            "image_index": idx,
                            "filename": filename,
                            "text": text,
                            "char_count": len(text)
                        })
                        results["total_chars"] += len(text)
                        # Log messages previously printed a garbled
                        # "(unknown)" placeholder; report the filename.
                        logger.info(f"图片 {filename} OCR 识别完成,提取 {len(text)} 字符")
                except Exception as e:
                    logger.warning(f"图片 {filename} OCR 识别失败: {str(e)}")
        results["image_count"] = len(results["extracted_text"])
    except zipfile.BadZipFile:
        results["success"] = False
        results["error"] = "无效的 Word 文档文件"
    except Exception as e:
        results["success"] = False
        results["error"] = f"OCR 处理失败: {str(e)}"
    return results
def extract_key_sentences(self, text: str, max_sentences: int = 10) -> List[str]:
"""
从文本中提取关键句子