feat: 实现智能指令的格式转换和文档编辑功能

主要更新:
- 新增 transform 意图:支持 Word/Excel/Markdown 格式互转
- 新增 edit 意图:使用 LLM 润色编辑文档内容
- 智能指令接口增加异步执行模式(async_execute 参数)
- 修复 Word 模板导出文档损坏问题(改用临时文件方式)
- 优化 intent_parser 增加 transform/edit 关键词识别

新增文件:
- app/api/endpoints/instruction.py: 智能指令 API 端点
- app/services/multi_doc_reasoning_service.py: 多文档推理服务

其他优化:
- RAG 服务混合搜索(BM25 + 向量)融合
- 模板填充服务表头匹配增强
- Word AI 解析服务返回结构完善
- 前端 InstructionChat 组件对接真实 API
This commit is contained in:
dj
2026-04-14 20:39:37 +08:00
parent 51350e3002
commit ecad9ccd82
12 changed files with 2943 additions and 196 deletions

View File

@@ -2,11 +2,15 @@
RAG 服务模块 - 检索增强生成
使用 sentence-transformers + Faiss 实现向量检索
支持 BM25 关键词检索 + 向量检索混合融合
"""
import logging
import os
import pickle
from typing import Any, Dict, List, Optional
import re
import math
from typing import Any, Dict, List, Optional, Tuple
from collections import Counter, defaultdict
import faiss
import numpy as np
@@ -32,6 +36,132 @@ class SimpleDocument:
self.metadata = metadata
class BM25:
    """Okapi BM25 keyword-retrieval ranking model.

    Scores documents by term frequency and (smoothed) inverse document
    frequency; it complements dense vector search with exact keyword
    matching.

    Args:
        k1: term-frequency saturation parameter.
        b: document-length normalization parameter.
    """

    def __init__(self, k1: float = 1.5, b: float = 0.75):
        self.k1 = k1  # term-frequency saturation parameter
        self.b = b    # document-length normalization parameter
        self.documents: List[str] = []
        self.doc_ids: List[str] = []
        self.avg_doc_length: float = 0
        self.doc_freqs: Dict[str, int] = {}   # term -> number of documents containing it
        self.idf: Dict[str, float] = {}       # term -> IDF value
        self.doc_lengths: List[int] = []
        self.doc_term_freqs: List[Dict[str, int]] = []  # per-document term frequencies

    def _tokenize(self, text: str) -> List[str]:
        """Tokenize into lowercased CJK runs and alphanumeric words.

        Single-character tokens are dropped to reduce noise; note this
        also drops standalone single CJK characters.
        """
        if not text:
            return []
        tokens = re.findall(r'[\u4e00-\u9fff]+|[a-zA-Z0-9]+', text.lower())
        return [t for t in tokens if len(t) > 1]

    def fit(self, documents: List[str], doc_ids: List[str]):
        """Build the BM25 index from scratch over *documents*.

        Args:
            documents: document content list.
            doc_ids: parallel list of document IDs.
        """
        self.documents = documents
        self.doc_ids = doc_ids
        n = len(documents)
        # Reset ALL derived statistics so that refitting an existing
        # instance cannot keep stale IDF entries from a previous corpus.
        self.doc_freqs = defaultdict(int)
        self.idf = {}
        self.doc_lengths = []
        self.doc_term_freqs = []
        for doc in documents:
            tokens = self._tokenize(doc)
            self.doc_lengths.append(len(tokens))
            doc_tf = Counter(tokens)
            self.doc_term_freqs.append(doc_tf)
            for term in doc_tf:
                self.doc_freqs[term] += 1
        # Average document length (guard against an empty corpus).
        self.avg_doc_length = sum(self.doc_lengths) / n if n > 0 else 0
        # Smoothed IDF: log((n - df + 0.5) / (df + 0.5) + 1) stays positive
        # even for terms appearing in more than half the documents.
        for term, df in self.doc_freqs.items():
            self.idf[term] = math.log((n - df + 0.5) / (df + 0.5) + 1)
        logger.info(f"BM25 索引构建完成: {n} 个文档, {len(self.idf)} 个词项")

    def search(self, query: str, top_k: int = 10) -> List[Tuple[int, float]]:
        """Rank every indexed document against *query*.

        Args:
            query: query text.
            top_k: number of results to return.

        Returns:
            [(document index, BM25 score), ...] sorted by descending score.
        """
        if not self.documents:
            return []
        query_tokens = self._tokenize(query)
        if not query_tokens:
            return []
        scores = [(idx, self._calculate_score(query_tokens, idx))
                  for idx in range(len(self.documents))]
        scores.sort(key=lambda x: x[1], reverse=True)
        return scores[:top_k]

    def _calculate_score(self, query_tokens: List[str], doc_idx: int) -> float:
        """Compute the BM25 score of one document for the tokenized query."""
        doc_tf = self.doc_term_freqs[doc_idx]
        doc_len = self.doc_lengths[doc_idx]
        score = 0.0
        for term in query_tokens:
            if term not in self.idf:
                # Term never seen at fit time contributes nothing.
                continue
            tf = doc_tf.get(term, 0)
            idf = self.idf[term]
            # Core BM25 formula.
            numerator = tf * (self.k1 + 1)
            denominator = tf + self.k1 * (1 - self.b + self.b * doc_len / self.avg_doc_length)
            score += idf * numerator / denominator
        return score

    def get_scores(self, query: str) -> List[float]:
        """Return the BM25 score of every document, index-aligned with the corpus."""
        if not self.documents:
            return []
        query_tokens = self._tokenize(query)
        if not query_tokens:
            return [0.0] * len(self.documents)
        return [self._calculate_score(query_tokens, idx) for idx in range(len(self.documents))]
class RAGService:
"""RAG 检索增强服务"""
@@ -47,12 +177,15 @@ class RAGService:
self._dimension: int = 384 # 默认维度
self._initialized = False
self._persist_dir = settings.FAISS_INDEX_DIR
# BM25 索引
self.bm25: Optional[BM25] = None
self._bm25_enabled = True # 始终启用 BM25
# 检查是否可用
self._disabled = not SENTENCE_TRANSFORMERS_AVAILABLE
if self._disabled:
logger.warning("RAG 服务已禁用sentence-transformers 不可用),将使用关键词匹配作为后备")
logger.warning("RAG 服务已禁用sentence-transformers 不可用),将使用 BM25 关键词检索")
else:
logger.info("RAG 服务已启用")
logger.info("RAG 服务已启用(向量检索 + BM25 混合检索)")
def _init_embeddings(self):
"""初始化嵌入模型"""
@@ -261,11 +394,25 @@ class RAGService:
if not documents:
return
# 总是将文档存储在内存中(用于关键词搜索后备
# 总是将文档存储在内存中(用于 BM25 和关键词搜索)
for doc, did in zip(documents, doc_ids):
self.documents.append({"id": did, "content": doc.page_content, "metadata": doc.metadata})
self.doc_ids.append(did)
# 构建 BM25 索引
if self._bm25_enabled and documents:
bm25_texts = [doc.page_content for doc in documents]
if self.bm25 is None:
self.bm25 = BM25()
self.bm25.fit(bm25_texts, doc_ids)
else:
# 增量添加重新构建BM25 不支持增量)
all_texts = [d["content"] for d in self.documents]
all_ids = self.doc_ids.copy()
self.bm25 = BM25()
self.bm25.fit(all_texts, all_ids)
logger.debug(f"BM25 索引更新: {len(documents)} 个文档")
# 如果没有嵌入模型,跳过向量索引
if self.embedding_model is None:
logger.debug(f"文档跳过向量索引 (无嵌入模型): {len(documents)} 个文档")
@@ -284,7 +431,7 @@ class RAGService:
def retrieve(self, query: str, top_k: int = 5, min_score: float = 0.3) -> List[Dict[str, Any]]:
    """Retrieve relevant document chunks for a query (hybrid: vector + BM25).

    Runs dense vector search and sparse BM25 search, fuses them with
    reciprocal-rank fusion, and degrades gracefully: hybrid results if
    any, else BM25-only results, else plain keyword search.

    Args:
        query: query text.
        top_k: number of chunks to return.
        min_score: minimum similarity score for vector hits.

    Returns:
        Ranked list of result dicts (content / metadata / score / doc_id / ...).
    """
    if not self._initialized:
        self._init_vector_store()
    # Over-fetch from each retriever so the fusion step has enough
    # candidates to merge before truncating to top_k.
    vector_results = self._vector_search(query, top_k * 2, min_score)
    bm25_results = self._bm25_search(query, top_k * 2)
    # Hybrid fusion of the two candidate lists.
    hybrid_results = self._hybrid_fusion(vector_results, bm25_results, top_k)
    if hybrid_results:
        logger.info(f"混合检索到 {len(hybrid_results)} 条相关文档块 (向量:{len(vector_results)}, BM25:{len(bm25_results)})")
        return hybrid_results
    # Degrade: BM25-only results.
    if bm25_results:
        logger.info(f"降级到 BM25 检索: {len(bm25_results)} 条")
        return bm25_results
    # Degrade: plain keyword search as the last resort.
    logger.info("降级到关键词搜索")
    return self._keyword_search(query, top_k)
def _vector_search(self, query: str, top_k: int, min_score: float) -> List[Dict[str, Any]]:
"""向量检索"""
if self.index is None or self.index.ntotal == 0 or self.embedding_model is None:
return []
try:
query_embedding = self.embedding_model.encode([query], convert_to_numpy=True)
query_embedding = self._normalize_vectors(query_embedding).astype('float32')
scores, indices = self.index.search(query_embedding, min(top_k * 2, self.index.ntotal))
results = []
for score, idx in zip(scores[0], indices[0]):
if idx < 0:
continue
if score < min_score:
continue
doc = self.documents[idx]
results.append({
"content": doc["content"],
"metadata": doc["metadata"],
"score": float(score),
"doc_id": doc["id"],
"chunk_index": doc["metadata"].get("chunk_index", 0),
"search_type": "vector"
})
return results
except Exception as e:
logger.warning(f"向量检索失败: {e}")
return []
def _bm25_search(self, query: str, top_k: int) -> List[Dict[str, Any]]:
"""BM25 检索"""
if not self.bm25 or not self.documents:
return []
try:
bm25_scores = self.bm25.get_scores(query)
if not bm25_scores:
return []
# 归一化 BM25 分数到 [0, 1]
max_score = max(bm25_scores) if bm25_scores else 1
min_score_bm = min(bm25_scores) if bm25_scores else 0
score_range = max_score - min_score_bm if max_score != min_score_bm else 1
results = []
for idx, score in enumerate(bm25_scores):
if score <= 0:
continue
# 归一化
normalized_score = (score - min_score_bm) / score_range if score_range > 0 else 0
doc = self.documents[idx]
results.append({
"content": doc["content"],
"metadata": doc["metadata"],
"score": float(normalized_score),
"doc_id": doc["id"],
"chunk_index": doc["metadata"].get("chunk_index", 0),
"search_type": "bm25"
})
# 按分数降序
results.sort(key=lambda x: x["score"], reverse=True)
return results[:top_k]
except Exception as e:
logger.warning(f"BM25 检索失败: {e}")
return []
def _hybrid_fusion(
self,
vector_results: List[Dict[str, Any]],
bm25_results: List[Dict[str, Any]],
top_k: int
) -> List[Dict[str, Any]]:
"""
混合融合向量和 BM25 检索结果
使用 RRFR (Reciprocal Rank Fusion) 算法:
Score = weight_vector * (1 / rank_vector) + weight_bm25 * (1 / rank_bm25)
Args:
vector_results: 向量检索结果
bm25_results: BM25 检索结果
top_k: 返回数量
Returns:
融合后的结果
"""
if not vector_results and not bm25_results:
return []
# 融合权重
weight_vector = 0.6
weight_bm25 = 0.4
# 构建文档分数映射
doc_scores: Dict[str, Dict[str, float]] = {}
# 添加向量检索结果
for rank, result in enumerate(vector_results):
doc_id = result["doc_id"]
if doc_id not in doc_scores:
doc_scores[doc_id] = {"vector": 0, "bm25": 0, "content": result["content"], "metadata": result["metadata"]}
# 使用倒数排名 (Reciprocal Rank)
doc_scores[doc_id]["vector"] = weight_vector / (rank + 1)
# 添加 BM25 检索结果
for rank, result in enumerate(bm25_results):
doc_id = result["doc_id"]
if doc_id not in doc_scores:
doc_scores[doc_id] = {"vector": 0, "bm25": 0, "content": result["content"], "metadata": result["metadata"]}
doc_scores[doc_id]["bm25"] = weight_bm25 / (rank + 1)
# 计算融合分数
fused_results = []
for doc_id, scores in doc_scores.items():
fused_score = scores["vector"] + scores["bm25"]
# 使用向量检索结果的原始分数作为参考
vector_score = next((r["score"] for r in vector_results if r["doc_id"] == doc_id), 0.5)
fused_results.append({
"content": scores["content"],
"metadata": scores["metadata"],
"score": fused_score,
"doc_id": doc_id,
"vector_score": vector_score,
"bm25_score": scores["bm25"],
"search_type": "hybrid"
})
# 按融合分数降序排序
fused_results.sort(key=lambda x: x["score"], reverse=True)
logger.debug(f"混合融合: {len(fused_results)} 个文档, 向量:{len(vector_results)}, BM25:{len(bm25_results)}")
return fused_results[:top_k]
def _keyword_search(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
"""
关键词搜索后备方案