【智能助手增强】
- 新增对话历史管理:MongoDB新增conversations集合,存储用户与AI的对话上下文,支持多轮对话意图延续
- 新增对话历史API(conversation.py):GET/DELETE conversation历史、列出所有会话
- 意图解析增强:支持基于对话历史的意图识别,上下文理解更准确
- 字段提取优化:支持"提取文档中的医院数量"等自然语言模式,智能去除"文档中的"前缀
- 文档对比优化:从指令中提取文件名并精确匹配source_docs,支持"对比A和B两个文档"
- 文档摘要优化:使用LLM生成真实AI摘要而非返回原始文档预览
【Word模板填表核心功能】
- Word模板字段生成:空白Word上传后,自动从源文档(Excel/Word/TXT/MD)内容AI生成字段名
- Word模板填表(_fill_docx):将提取数据写入Word模板表格,支持精确匹配、模糊匹配、追加新行
- 数据润色(_polish_word_filled_data):LLM对多行Excel数据进行统计归纳(合计/平均/极值),转化为专业自然语言描述
- 段落格式输出:使用📌字段名+值段落+分隔线(灰色横线)格式,提升可读性
- 导出链打通:fill_template返回filled_file_path,export直接返回已填好的Word文件
【其他修复】
- 修复Word导出Windows文件锁问题:NamedTemporaryFile改为mkstemp+close
- 修复Word方框非法字符:扩展clean_text移除\uFFFD、□等Unicode替代符和零宽字符
- 修复文档对比"需要至少2个文档":从指令提取具体文件名优先匹配而非取前2个
- 修复导出format硬编码:自动识别docx/xlsx格式
- Docx解析器增加备用解析方法和更完整的段落/表格/标题提取
- RAG服务新增MySQL数据源支持
This commit is contained in:
@@ -64,6 +64,11 @@ class MongoDB:
|
||||
"""任务集合 - 存储任务历史记录"""
|
||||
return self.db["tasks"]
|
||||
|
||||
@property
def conversations(self):
    """Conversation collection - stores conversation history records."""
    collection = self.db["conversations"]
    return collection
|
||||
|
||||
# ==================== 文档操作 ====================
|
||||
|
||||
async def insert_document(
|
||||
@@ -117,14 +122,20 @@ class MongoDB:
|
||||
搜索文档
|
||||
|
||||
Args:
|
||||
query: 搜索关键词
|
||||
query: 搜索关键词(支持文件名和内容搜索)
|
||||
doc_type: 文档类型过滤
|
||||
limit: 返回数量
|
||||
|
||||
Returns:
|
||||
文档列表
|
||||
"""
|
||||
filter_query = {"content": {"$regex": query}}
|
||||
filter_query = {
|
||||
"$or": [
|
||||
{"content": {"$regex": query, "$options": "i"}},
|
||||
{"metadata.original_filename": {"$regex": query, "$options": "i"}},
|
||||
{"metadata.filename": {"$regex": query, "$options": "i"}},
|
||||
]
|
||||
}
|
||||
if doc_type:
|
||||
filter_query["doc_type"] = doc_type
|
||||
|
||||
@@ -141,6 +152,15 @@ class MongoDB:
|
||||
result = await self.documents.delete_one({"_id": ObjectId(doc_id)})
|
||||
return result.deleted_count > 0
|
||||
|
||||
async def update_document_metadata(self, doc_id: str, metadata: Dict[str, Any]) -> bool:
    """Overwrite the ``metadata`` field of a stored document.

    Args:
        doc_id: String form of the document's ObjectId. It is passed to
            ``ObjectId(...)`` without validation, so a malformed id
            propagates whatever error bson raises.
        metadata: New metadata mapping; replaces the whole ``metadata``
            field via ``$set``.

    Returns:
        True if a document with the given id was found and the update was
        applied to it, False if no document matched.
    """
    from bson import ObjectId

    result = await self.documents.update_one(
        {"_id": ObjectId(doc_id)},
        {"$set": {"metadata": metadata}},
    )
    # Use matched_count, not modified_count: writing metadata identical to
    # what is already stored modifies nothing, yet the update did succeed.
    return result.matched_count > 0
|
||||
|
||||
# ==================== RAG 索引操作 ====================
|
||||
|
||||
async def insert_rag_entry(
|
||||
@@ -251,6 +271,10 @@ class MongoDB:
|
||||
await self.tasks.create_index("task_id", unique=True)
|
||||
await self.tasks.create_index("created_at")
|
||||
|
||||
# 对话集合索引
|
||||
await self.conversations.create_index("conversation_id")
|
||||
await self.conversations.create_index("created_at")
|
||||
|
||||
logger.info("MongoDB 索引创建完成")
|
||||
|
||||
# ==================== 任务历史操作 ====================
|
||||
@@ -369,6 +393,108 @@ class MongoDB:
|
||||
result = await self.tasks.delete_one({"task_id": task_id})
|
||||
return result.deleted_count > 0
|
||||
|
||||
# ==================== 对话历史操作 ====================
|
||||
|
||||
async def insert_conversation(
    self,
    conversation_id: str,
    role: str,
    content: str,
    intent: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> str:
    """Insert one message into the conversations collection.

    Args:
        conversation_id: Conversation session id the message belongs to.
        role: Speaker role (user/assistant).
        content: Message text.
        intent: Intent type, if any.
        metadata: Extra metadata; stored as an empty dict when omitted.

    Returns:
        The id of the inserted document, converted to ``str``.
    """
    # Local import: only ``datetime`` itself is known to be in scope here.
    from datetime import timezone

    message = {
        "conversation_id": conversation_id,
        "role": role,
        "content": content,
        "intent": intent,
        "metadata": metadata or {},
        # datetime.utcnow() is deprecated (Python 3.12+) and returns a naive
        # value; an aware UTC timestamp denotes the same instant explicitly.
        "created_at": datetime.now(timezone.utc),
    }
    result = await self.conversations.insert_one(message)
    return str(result.inserted_id)
|
||||
|
||||
async def get_conversation_history(
    self,
    conversation_id: str,
    limit: int = 20,
) -> List[Dict[str, Any]]:
    """Return the most recent messages of a conversation, oldest first.

    Args:
        conversation_id: Conversation session id.
        limit: Maximum number of messages to return.

    Returns:
        Message dicts in chronological order. ``_id`` is converted to
        ``str`` and a truthy ``created_at`` is converted with
        ``isoformat()``.
    """
    # Query newest-first so that ``limit`` keeps the latest messages.
    # Sorting ascending before limiting would always return the first
    # ``limit`` messages ever stored and hide all newer context once the
    # conversation grows past ``limit``.
    cursor = self.conversations.find(
        {"conversation_id": conversation_id}
    ).sort("created_at", -1).limit(limit)

    messages = []
    async for msg in cursor:
        msg["_id"] = str(msg["_id"])
        if msg.get("created_at"):
            msg["created_at"] = msg["created_at"].isoformat()
        messages.append(msg)

    # Restore chronological (oldest -> newest) order for callers.
    messages.reverse()
    return messages
|
||||
|
||||
async def delete_conversation(self, conversation_id: str) -> bool:
    """Delete every message of a conversation session.

    Returns:
        True if at least one message document was removed.
    """
    selector = {"conversation_id": conversation_id}
    outcome = await self.conversations.delete_many(selector)
    removed_any = outcome.deleted_count > 0
    return removed_any
|
||||
|
||||
async def list_conversations(
    self,
    limit: int = 50,
    skip: int = 0,
) -> List[Dict[str, Any]]:
    """List conversation sessions, ordered by their latest message.

    Args:
        limit: Number of sessions to return.
        skip: Number of sessions to skip.

    Returns:
        One dict per session: the newest message document of that session,
        with ``_id`` converted to ``str`` and a truthy ``created_at``
        converted with ``isoformat()``.
    """
    # Aggregation: order messages newest-first, keep the first (newest)
    # message per conversation_id, promote it to the root document, then
    # order and paginate the sessions.
    pick_latest_per_session = {
        "$group": {
            "_id": "$conversation_id",
            "last_message": {"$first": "$$ROOT"},
        }
    }
    stages = [
        {"$sort": {"created_at": -1}},
        pick_latest_per_session,
        {"$replaceRoot": {"newRoot": "$last_message"}},
        {"$sort": {"created_at": -1}},
        {"$skip": skip},
        {"$limit": limit},
    ]

    sessions = []
    async for entry in self.conversations.aggregate(stages):
        entry["_id"] = str(entry["_id"])
        stamp = entry.get("created_at")
        if stamp:
            entry["created_at"] = stamp.isoformat()
        sessions.append(entry)
    return sessions
|
||||
|
||||
|
||||
# ==================== 全局单例 ====================
|
||||
|
||||
|
||||
@@ -44,6 +44,22 @@ class DocxParser(BaseParser):
|
||||
error=f"文件不存在: {file_path}"
|
||||
)
|
||||
|
||||
# 尝试使用 python-docx 解析,失败则使用备用方法
|
||||
try:
|
||||
return self._parse_with_docx(path)
|
||||
except Exception as e:
|
||||
logger.warning(f"python-docx 解析失败,使用备用方法: {e}")
|
||||
try:
|
||||
return self._parse_fallback(path)
|
||||
except Exception as fallback_error:
|
||||
logger.error(f"备用解析方法也失败: {fallback_error}")
|
||||
return ParseResult(
|
||||
success=False,
|
||||
error=f"解析 Word 文档失败: {str(e)}"
|
||||
)
|
||||
|
||||
def _parse_with_docx(self, path: Path) -> ParseResult:
|
||||
"""使用 python-docx 解析文档"""
|
||||
# 检查文件扩展名
|
||||
if path.suffix.lower() not in self.supported_extensions:
|
||||
return ParseResult(
|
||||
@@ -51,98 +67,177 @@ class DocxParser(BaseParser):
|
||||
error=f"不支持的文件类型: {path.suffix}"
|
||||
)
|
||||
|
||||
# 读取 Word 文档
|
||||
doc = Document(path)
|
||||
|
||||
# 提取文本内容
|
||||
paragraphs = []
|
||||
for para in doc.paragraphs:
|
||||
if para.text.strip():
|
||||
paragraphs.append({
|
||||
"text": para.text,
|
||||
"style": str(para.style.name) if para.style else "Normal"
|
||||
})
|
||||
|
||||
# 提取段落纯文本(用于 AI 解析)
|
||||
paragraphs_text = [p["text"] for p in paragraphs if p["text"].strip()]
|
||||
|
||||
# 提取表格内容
|
||||
tables_data = []
|
||||
for i, table in enumerate(doc.tables):
|
||||
table_rows = []
|
||||
for row in table.rows:
|
||||
row_data = [cell.text.strip() for cell in row.cells]
|
||||
table_rows.append(row_data)
|
||||
|
||||
if table_rows:
|
||||
tables_data.append({
|
||||
"table_index": i,
|
||||
"rows": table_rows,
|
||||
"row_count": len(table_rows),
|
||||
"column_count": len(table_rows[0]) if table_rows else 0
|
||||
})
|
||||
|
||||
# 提取图片/嵌入式对象信息
|
||||
images_info = self._extract_images_info(doc, path)
|
||||
|
||||
# 合并所有文本(包括图片描述)
|
||||
full_text_parts = []
|
||||
full_text_parts.append("【文档正文】")
|
||||
full_text_parts.extend(paragraphs_text)
|
||||
|
||||
if tables_data:
|
||||
full_text_parts.append("\n【文档表格】")
|
||||
for idx, table in enumerate(tables_data):
|
||||
full_text_parts.append(f"--- 表格 {idx + 1} ---")
|
||||
for row in table["rows"]:
|
||||
full_text_parts.append(" | ".join(str(cell) for cell in row))
|
||||
|
||||
if images_info.get("image_count", 0) > 0:
|
||||
full_text_parts.append(f"\n【文档图片】文档包含 {images_info['image_count']} 张图片/图表")
|
||||
|
||||
full_text = "\n".join(full_text_parts)
|
||||
|
||||
# 构建元数据
|
||||
metadata = {
|
||||
"filename": path.name,
|
||||
"extension": path.suffix.lower(),
|
||||
"paragraph_count": len(paragraphs),
|
||||
"table_count": len(tables_data),
|
||||
"image_count": images_info.get("image_count", 0)
|
||||
}
|
||||
|
||||
return ParseResult(
|
||||
success=True,
|
||||
data={
|
||||
"content": full_text,
|
||||
"paragraphs": paragraphs,
|
||||
"paragraphs_with_style": paragraphs,
|
||||
"tables": tables_data,
|
||||
"images": images_info
|
||||
},
|
||||
metadata=metadata
|
||||
)
|
||||
|
||||
def _parse_fallback(self, path: Path) -> ParseResult:
|
||||
"""备用解析方法:直接解析 docx 的 XML 结构"""
|
||||
import zipfile
|
||||
from xml.etree import ElementTree as ET
|
||||
|
||||
try:
|
||||
# 读取 Word 文档
|
||||
doc = Document(file_path)
|
||||
with zipfile.ZipFile(path, 'r') as zf:
|
||||
# 读取 document.xml
|
||||
if 'word/document.xml' not in zf.namelist():
|
||||
return ParseResult(success=False, error="无效的 docx 文件格式")
|
||||
|
||||
# 提取文本内容
|
||||
paragraphs = []
|
||||
for para in doc.paragraphs:
|
||||
if para.text.strip():
|
||||
paragraphs.append({
|
||||
"text": para.text,
|
||||
"style": str(para.style.name) if para.style else "Normal"
|
||||
xml_content = zf.read('word/document.xml')
|
||||
root = ET.fromstring(xml_content)
|
||||
|
||||
# 命名空间
|
||||
namespaces = {
|
||||
'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
|
||||
}
|
||||
|
||||
paragraphs = []
|
||||
tables = []
|
||||
current_table = []
|
||||
|
||||
for elem in root.iter():
|
||||
if elem.tag.endswith('}p'): # 段落
|
||||
text_parts = []
|
||||
for t in elem.iter():
|
||||
if t.tag.endswith('}t') and t.text:
|
||||
text_parts.append(t.text)
|
||||
text = ''.join(text_parts).strip()
|
||||
if text:
|
||||
paragraphs.append({'text': text, 'style': 'Normal'})
|
||||
elif elem.tag.endswith('}tr'): # 表格行
|
||||
row_data = []
|
||||
for tc in elem.iter():
|
||||
if tc.tag.endswith('}tc'): # 单元格
|
||||
cell_text = []
|
||||
for t in tc.iter():
|
||||
if t.tag.endswith('}t') and t.text:
|
||||
cell_text.append(t.text)
|
||||
row_data.append(''.join(cell_text).strip())
|
||||
if row_data:
|
||||
current_table.append(row_data)
|
||||
else:
|
||||
# 表格结束,保存
|
||||
if current_table:
|
||||
tables.append({
|
||||
'table_index': len(tables),
|
||||
'rows': current_table,
|
||||
'row_count': len(current_table),
|
||||
'column_count': len(current_table[0]) if current_table else 0
|
||||
})
|
||||
current_table = []
|
||||
|
||||
# 保存最后一张表格
|
||||
if current_table:
|
||||
tables.append({
|
||||
'table_index': len(tables),
|
||||
'rows': current_table,
|
||||
'row_count': len(current_table),
|
||||
'column_count': len(current_table[0]) if current_table else 0
|
||||
})
|
||||
|
||||
# 提取段落纯文本(用于 AI 解析)
|
||||
paragraphs_text = [p["text"] for p in paragraphs if p["text"].strip()]
|
||||
# 构建文本
|
||||
paragraphs_text = [p["text"] for p in paragraphs]
|
||||
full_text_parts = ["【文档正文】"] + paragraphs_text
|
||||
|
||||
# 提取表格内容
|
||||
tables_data = []
|
||||
for i, table in enumerate(doc.tables):
|
||||
table_rows = []
|
||||
for row in table.rows:
|
||||
row_data = [cell.text.strip() for cell in row.cells]
|
||||
table_rows.append(row_data)
|
||||
if tables:
|
||||
full_text_parts.append("\n【文档表格】")
|
||||
for idx, table in enumerate(tables):
|
||||
full_text_parts.append(f"--- 表格 {idx + 1} ---")
|
||||
for row in table["rows"]:
|
||||
full_text_parts.append(" | ".join(str(cell) for cell in row))
|
||||
|
||||
if table_rows:
|
||||
tables_data.append({
|
||||
"table_index": i,
|
||||
"rows": table_rows,
|
||||
"row_count": len(table_rows),
|
||||
"column_count": len(table_rows[0]) if table_rows else 0
|
||||
})
|
||||
full_text = "\n".join(full_text_parts)
|
||||
|
||||
# 提取图片/嵌入式对象信息
|
||||
images_info = self._extract_images_info(doc, path)
|
||||
|
||||
# 合并所有文本(包括图片描述)
|
||||
full_text_parts = []
|
||||
full_text_parts.append("【文档正文】")
|
||||
full_text_parts.extend(paragraphs_text)
|
||||
|
||||
if tables_data:
|
||||
full_text_parts.append("\n【文档表格】")
|
||||
for idx, table in enumerate(tables_data):
|
||||
full_text_parts.append(f"--- 表格 {idx + 1} ---")
|
||||
for row in table["rows"]:
|
||||
full_text_parts.append(" | ".join(str(cell) for cell in row))
|
||||
|
||||
if images_info.get("image_count", 0) > 0:
|
||||
full_text_parts.append(f"\n【文档图片】文档包含 {images_info['image_count']} 张图片/图表")
|
||||
|
||||
full_text = "\n".join(full_text_parts)
|
||||
|
||||
# 构建元数据
|
||||
metadata = {
|
||||
"filename": path.name,
|
||||
"extension": path.suffix.lower(),
|
||||
"file_size": path.stat().st_size,
|
||||
"paragraph_count": len(paragraphs),
|
||||
"table_count": len(tables_data),
|
||||
"word_count": len(full_text),
|
||||
"char_count": len(full_text.replace("\n", "")),
|
||||
"has_tables": len(tables_data) > 0,
|
||||
"has_images": images_info.get("image_count", 0) > 0,
|
||||
"image_count": images_info.get("image_count", 0)
|
||||
}
|
||||
|
||||
# 返回结果
|
||||
return ParseResult(
|
||||
success=True,
|
||||
data={
|
||||
"content": full_text,
|
||||
"paragraphs": paragraphs_text,
|
||||
"paragraphs_with_style": paragraphs,
|
||||
"tables": tables_data,
|
||||
"images": images_info,
|
||||
"word_count": len(full_text),
|
||||
"structured_data": {
|
||||
return ParseResult(
|
||||
success=True,
|
||||
data={
|
||||
"content": full_text,
|
||||
"paragraphs": paragraphs,
|
||||
"paragraphs_text": paragraphs_text,
|
||||
"tables": tables_data,
|
||||
"images": images_info
|
||||
"paragraphs_with_style": paragraphs,
|
||||
"tables": tables,
|
||||
"images": {"image_count": 0, "descriptions": []}
|
||||
},
|
||||
metadata={
|
||||
"filename": path.name,
|
||||
"extension": path.suffix.lower(),
|
||||
"paragraph_count": len(paragraphs),
|
||||
"table_count": len(tables),
|
||||
"image_count": 0,
|
||||
"parse_method": "fallback_xml"
|
||||
}
|
||||
},
|
||||
metadata=metadata
|
||||
)
|
||||
)
|
||||
|
||||
except zipfile.BadZipFile:
|
||||
return ParseResult(success=False, error="无效的 ZIP/文档文件")
|
||||
except Exception as e:
|
||||
logger.error(f"解析 Word 文档失败: {str(e)}")
|
||||
return ParseResult(
|
||||
success=False,
|
||||
error=f"解析 Word 文档失败: {str(e)}"
|
||||
)
|
||||
return ParseResult(success=False, error=f"备用解析失败: {str(e)}")
|
||||
|
||||
def extract_images_as_base64(self, file_path: str) -> List[Dict[str, str]]:
|
||||
"""
|
||||
@@ -197,6 +292,83 @@ class DocxParser(BaseParser):
|
||||
logger.info(f"共提取 {len(images)} 张图片")
|
||||
return images
|
||||
|
||||
def extract_text_from_images(self, file_path: str, lang: str = 'chi_sim+eng') -> Dict[str, Any]:
    """Run OCR over the images embedded in a Word document.

    Args:
        file_path: Path to the Word file; it is opened as a ZIP archive.
        lang: Tesseract language code(s); defaults to Simplified Chinese
            plus English (``chi_sim+eng``).

    Returns:
        A dict with ``success``, ``image_count`` (the number of images
        that yielded non-empty text) and ``extracted_text`` (one entry per
        such image with ``image_index``, ``filename``, ``text``,
        ``char_count``). ``total_chars`` is included once OCR was
        attempted; ``error`` is set when ``success`` is False.
    """
    import zipfile
    from io import BytesIO

    # Both OCR dependencies are optional. A missing one is reported in the
    # result rather than raised, so Pillow is treated like pytesseract.
    try:
        from PIL import Image
    except ImportError:
        logger.warning("Pillow 未安装,OCR 功能不可用")
        return {
            "success": False,
            "error": "Pillow 未安装,请运行: pip install Pillow",
            "image_count": 0,
            "extracted_text": []
        }

    try:
        import pytesseract
    except ImportError:
        logger.warning("pytesseract 未安装,OCR 功能不可用")
        return {
            "success": False,
            "error": "pytesseract 未安装,请运行: pip install pytesseract",
            "image_count": 0,
            "extracted_text": []
        }

    results = {
        "success": True,
        "image_count": 0,
        "extracted_text": [],
        "total_chars": 0
    }
    image_extensions = ('png', 'jpg', 'jpeg', 'gif', 'bmp')

    try:
        with zipfile.ZipFile(file_path, 'r') as zf:
            # Embedded media lives under word/media/ inside the archive.
            media_files = [f for f in zf.namelist() if f.startswith('word/media/')]

            for idx, filename in enumerate(media_files):
                ext = filename.split('.')[-1].lower()
                if ext not in image_extensions:
                    continue

                # Best effort per image: one unreadable image must not
                # abort OCR for the remaining ones.
                try:
                    image_data = zf.read(filename)
                    # Context manager releases the image's resources even
                    # when OCR raises.
                    with Image.open(BytesIO(image_data)) as image:
                        text = pytesseract.image_to_string(image, lang=lang)
                    text = text.strip()

                    if text:
                        results["extracted_text"].append({
                            "image_index": idx,
                            "filename": filename,
                            "text": text,
                            "char_count": len(text)
                        })
                        results["total_chars"] += len(text)

                    logger.info(f"图片 {filename} OCR 识别完成,提取 {len(text)} 字符")

                except Exception as e:
                    logger.warning(f"图片 {filename} OCR 识别失败: {str(e)}")

            # Counts only the images that produced text.
            results["image_count"] = len(results["extracted_text"])

    except zipfile.BadZipFile:
        results["success"] = False
        results["error"] = "无效的 Word 文档文件"
    except Exception as e:
        results["success"] = False
        results["error"] = f"OCR 处理失败: {str(e)}"

    return results
|
||||
|
||||
def extract_key_sentences(self, text: str, max_sentences: int = 10) -> List[str]:
|
||||
"""
|
||||
从文本中提取关键句子
|
||||
|
||||
Reference in New Issue
Block a user