feat: 实现智能指令的格式转换和文档编辑功能
主要更新: - 新增 transform 意图:支持 Word/Excel/Markdown 格式互转 - 新增 edit 意图:使用 LLM 润色编辑文档内容 - 智能指令接口增加异步执行模式(async_execute 参数) - 修复 Word 模板导出文档损坏问题(改用临时文件方式) - 优化 intent_parser 增加 transform/edit 关键词识别 新增文件: - app/api/endpoints/instruction.py: 智能指令 API 端点 - app/services/multi_doc_reasoning_service.py: 多文档推理服务 其他优化: - RAG 服务混合搜索(BM25 + 向量)融合 - 模板填充服务表头匹配增强 - Word AI 解析服务返回结构完善 - 前端 InstructionChat 组件对接真实 API
This commit is contained in:
@@ -2,11 +2,15 @@
|
||||
RAG 服务模块 - 检索增强生成
|
||||
|
||||
使用 sentence-transformers + Faiss 实现向量检索
|
||||
支持 BM25 关键词检索 + 向量检索混合融合
|
||||
"""
|
||||
import logging
|
||||
import os
|
||||
import pickle
|
||||
from typing import Any, Dict, List, Optional
|
||||
import re
|
||||
import math
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
from collections import Counter, defaultdict
|
||||
|
||||
import faiss
|
||||
import numpy as np
|
||||
@@ -32,6 +36,132 @@ class SimpleDocument:
|
||||
self.metadata = metadata
|
||||
|
||||
|
||||
class BM25:
    """Okapi BM25 keyword-retrieval index.

    A term-frequency / inverse-document-frequency ranking function that is
    better suited to exact keyword matching than pure vector search.
    """

    def __init__(self, k1: float = 1.5, b: float = 0.75):
        """
        Args:
            k1: term-frequency saturation parameter
            b: document-length normalization parameter
        """
        self.k1 = k1
        self.b = b
        self.documents: List[str] = []
        self.doc_ids: List[str] = []
        self.avg_doc_length = 0
        self.doc_freqs: Dict[str, int] = {}   # term -> number of documents containing it
        self.idf: Dict[str, float] = {}       # term -> IDF value
        self.doc_lengths: List[int] = []
        self.doc_term_freqs: List[Dict[str, int]] = []  # per-document term frequencies

    def _tokenize(self, text: str) -> List[str]:
        """Naive tokenizer: lowercased runs of CJK characters or ASCII
        alphanumerics; single-character tokens are dropped as noise."""
        if not text:
            return []
        tokens = re.findall(r'[\u4e00-\u9fff]+|[a-zA-Z0-9]+', text.lower())
        return [t for t in tokens if len(t) > 1]

    def fit(self, documents: List[str], doc_ids: List[str]):
        """
        Build the BM25 index from scratch.

        Re-fitting fully replaces any previously indexed state.

        Args:
            documents: document contents
            doc_ids: document id list, parallel to ``documents``
        """
        self.documents = documents
        self.doc_ids = doc_ids
        n = len(documents)

        # Reset all derived state so a re-fit replaces the old index.
        self.doc_freqs = defaultdict(int)
        # BUGFIX: `idf` was never cleared, so re-fitting the same instance
        # kept stale IDF entries for terms absent from the new corpus.
        self.idf = {}
        self.doc_lengths = []
        self.doc_term_freqs = []

        for doc in documents:
            tokens = self._tokenize(doc)
            self.doc_lengths.append(len(tokens))
            doc_tf = Counter(tokens)
            self.doc_term_freqs.append(doc_tf)

            for term in doc_tf:
                self.doc_freqs[term] += 1

        # Average document length (guard against an empty corpus).
        self.avg_doc_length = sum(self.doc_lengths) / n if n > 0 else 0

        # Smoothed IDF: log((n - df + 0.5) / (df + 0.5) + 1) stays positive
        # even for terms that appear in more than half of the documents.
        for term, df in self.doc_freqs.items():
            self.idf[term] = math.log((n - df + 0.5) / (df + 0.5) + 1)

        logger.info(f"BM25 索引构建完成: {n} 个文档, {len(self.idf)} 个词项")

    def search(self, query: str, top_k: int = 10) -> List[Tuple[int, float]]:
        """
        Rank all indexed documents against ``query``.

        Args:
            query: query text
            top_k: number of results to return

        Returns:
            [(document index, BM25 score), ...] sorted by score descending.
            NOTE: zero-score documents may be included to pad up to top_k;
            callers that filter on score > 0 are unaffected.
        """
        if not self.documents:
            return []

        query_tokens = self._tokenize(query)
        if not query_tokens:
            return []

        scores = []
        n = len(self.documents)

        for idx in range(n):
            score = self._calculate_score(query_tokens, idx)
            scores.append((idx, score))

        # Sort by score, best first.
        scores.sort(key=lambda x: x[1], reverse=True)

        return scores[:top_k]

    def _calculate_score(self, query_tokens: List[str], doc_idx: int) -> float:
        """Sum the BM25 contribution of each query term for one document."""
        doc_tf = self.doc_term_freqs[doc_idx]
        doc_len = self.doc_lengths[doc_idx]
        score = 0.0

        for term in query_tokens:
            if term not in self.idf:
                continue  # term absent from the entire corpus

            tf = doc_tf.get(term, 0)
            idf = self.idf[term]

            # BM25: idf * tf*(k1+1) / (tf + k1*(1 - b + b*len/avg_len))
            numerator = tf * (self.k1 + 1)
            denominator = tf + self.k1 * (1 - self.b + self.b * doc_len / self.avg_doc_length)

            score += idf * numerator / denominator

        return score

    def get_scores(self, query: str) -> List[float]:
        """Return the BM25 score of every indexed document for ``query``."""
        if not self.documents:
            return []

        query_tokens = self._tokenize(query)
        if not query_tokens:
            return [0.0] * len(self.documents)

        return [self._calculate_score(query_tokens, idx) for idx in range(len(self.documents))]
|
||||
|
||||
|
||||
class RAGService:
|
||||
"""RAG 检索增强服务"""
|
||||
|
||||
@@ -47,12 +177,15 @@ class RAGService:
|
||||
self._dimension: int = 384 # 默认维度
|
||||
self._initialized = False
|
||||
self._persist_dir = settings.FAISS_INDEX_DIR
|
||||
# BM25 索引
|
||||
self.bm25: Optional[BM25] = None
|
||||
self._bm25_enabled = True # 始终启用 BM25
|
||||
# 检查是否可用
|
||||
self._disabled = not SENTENCE_TRANSFORMERS_AVAILABLE
|
||||
if self._disabled:
|
||||
logger.warning("RAG 服务已禁用(sentence-transformers 不可用),将使用关键词匹配作为后备")
|
||||
logger.warning("RAG 服务已禁用(sentence-transformers 不可用),将使用 BM25 关键词检索")
|
||||
else:
|
||||
logger.info("RAG 服务已启用")
|
||||
logger.info("RAG 服务已启用(向量检索 + BM25 混合检索)")
|
||||
|
||||
def _init_embeddings(self):
|
||||
"""初始化嵌入模型"""
|
||||
@@ -261,11 +394,25 @@ class RAGService:
|
||||
if not documents:
|
||||
return
|
||||
|
||||
# 总是将文档存储在内存中(用于关键词搜索后备)
|
||||
# 总是将文档存储在内存中(用于 BM25 和关键词搜索)
|
||||
for doc, did in zip(documents, doc_ids):
|
||||
self.documents.append({"id": did, "content": doc.page_content, "metadata": doc.metadata})
|
||||
self.doc_ids.append(did)
|
||||
|
||||
# 构建 BM25 索引
|
||||
if self._bm25_enabled and documents:
|
||||
bm25_texts = [doc.page_content for doc in documents]
|
||||
if self.bm25 is None:
|
||||
self.bm25 = BM25()
|
||||
self.bm25.fit(bm25_texts, doc_ids)
|
||||
else:
|
||||
# 增量添加:重新构建(BM25 不支持增量)
|
||||
all_texts = [d["content"] for d in self.documents]
|
||||
all_ids = self.doc_ids.copy()
|
||||
self.bm25 = BM25()
|
||||
self.bm25.fit(all_texts, all_ids)
|
||||
logger.debug(f"BM25 索引更新: {len(documents)} 个文档")
|
||||
|
||||
# 如果没有嵌入模型,跳过向量索引
|
||||
if self.embedding_model is None:
|
||||
logger.debug(f"文档跳过向量索引 (无嵌入模型): {len(documents)} 个文档")
|
||||
@@ -284,7 +431,7 @@ class RAGService:
|
||||
|
||||
def retrieve(self, query: str, top_k: int = 5, min_score: float = 0.3) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
根据查询检索相关文档块
|
||||
根据查询检索相关文档块(混合检索:向量 + BM25)
|
||||
|
||||
Args:
|
||||
query: 查询文本
|
||||
@@ -301,39 +448,167 @@ class RAGService:
|
||||
if not self._initialized:
|
||||
self._init_vector_store()
|
||||
|
||||
# 优先使用向量检索
|
||||
if self.index is not None and self.index.ntotal > 0 and self.embedding_model is not None:
|
||||
try:
|
||||
query_embedding = self.embedding_model.encode([query], convert_to_numpy=True)
|
||||
query_embedding = self._normalize_vectors(query_embedding).astype('float32')
|
||||
# 获取向量检索结果
|
||||
vector_results = self._vector_search(query, top_k * 2, min_score)
|
||||
|
||||
scores, indices = self.index.search(query_embedding, min(top_k, self.index.ntotal))
|
||||
# 获取 BM25 检索结果
|
||||
bm25_results = self._bm25_search(query, top_k * 2)
|
||||
|
||||
results = []
|
||||
for score, idx in zip(scores[0], indices[0]):
|
||||
if idx < 0:
|
||||
continue
|
||||
if score < min_score:
|
||||
continue
|
||||
doc = self.documents[idx]
|
||||
results.append({
|
||||
"content": doc["content"],
|
||||
"metadata": doc["metadata"],
|
||||
"score": float(score),
|
||||
"doc_id": doc["id"],
|
||||
"chunk_index": doc["metadata"].get("chunk_index", 0)
|
||||
})
|
||||
# 混合融合
|
||||
hybrid_results = self._hybrid_fusion(vector_results, bm25_results, top_k)
|
||||
|
||||
if results:
|
||||
logger.debug(f"向量检索到 {len(results)} 条相关文档块")
|
||||
return results
|
||||
except Exception as e:
|
||||
logger.warning(f"向量检索失败,使用关键词搜索后备: {e}")
|
||||
if hybrid_results:
|
||||
logger.info(f"混合检索到 {len(hybrid_results)} 条相关文档块 (向量:{len(vector_results)}, BM25:{len(bm25_results)})")
|
||||
return hybrid_results
|
||||
|
||||
# 后备:使用关键词搜索
|
||||
logger.debug("使用关键词搜索后备方案")
|
||||
# 降级:只使用 BM25
|
||||
if bm25_results:
|
||||
logger.info(f"降级到 BM25 检索: {len(bm25_results)} 条")
|
||||
return bm25_results
|
||||
|
||||
# 降级:使用关键词搜索
|
||||
logger.info("降级到关键词搜索")
|
||||
return self._keyword_search(query, top_k)
|
||||
|
||||
def _vector_search(self, query: str, top_k: int, min_score: float) -> List[Dict[str, Any]]:
|
||||
"""向量检索"""
|
||||
if self.index is None or self.index.ntotal == 0 or self.embedding_model is None:
|
||||
return []
|
||||
|
||||
try:
|
||||
query_embedding = self.embedding_model.encode([query], convert_to_numpy=True)
|
||||
query_embedding = self._normalize_vectors(query_embedding).astype('float32')
|
||||
|
||||
scores, indices = self.index.search(query_embedding, min(top_k * 2, self.index.ntotal))
|
||||
|
||||
results = []
|
||||
for score, idx in zip(scores[0], indices[0]):
|
||||
if idx < 0:
|
||||
continue
|
||||
if score < min_score:
|
||||
continue
|
||||
doc = self.documents[idx]
|
||||
results.append({
|
||||
"content": doc["content"],
|
||||
"metadata": doc["metadata"],
|
||||
"score": float(score),
|
||||
"doc_id": doc["id"],
|
||||
"chunk_index": doc["metadata"].get("chunk_index", 0),
|
||||
"search_type": "vector"
|
||||
})
|
||||
|
||||
return results
|
||||
except Exception as e:
|
||||
logger.warning(f"向量检索失败: {e}")
|
||||
return []
|
||||
|
||||
def _bm25_search(self, query: str, top_k: int) -> List[Dict[str, Any]]:
|
||||
"""BM25 检索"""
|
||||
if not self.bm25 or not self.documents:
|
||||
return []
|
||||
|
||||
try:
|
||||
bm25_scores = self.bm25.get_scores(query)
|
||||
if not bm25_scores:
|
||||
return []
|
||||
|
||||
# 归一化 BM25 分数到 [0, 1]
|
||||
max_score = max(bm25_scores) if bm25_scores else 1
|
||||
min_score_bm = min(bm25_scores) if bm25_scores else 0
|
||||
score_range = max_score - min_score_bm if max_score != min_score_bm else 1
|
||||
|
||||
results = []
|
||||
for idx, score in enumerate(bm25_scores):
|
||||
if score <= 0:
|
||||
continue
|
||||
# 归一化
|
||||
normalized_score = (score - min_score_bm) / score_range if score_range > 0 else 0
|
||||
doc = self.documents[idx]
|
||||
results.append({
|
||||
"content": doc["content"],
|
||||
"metadata": doc["metadata"],
|
||||
"score": float(normalized_score),
|
||||
"doc_id": doc["id"],
|
||||
"chunk_index": doc["metadata"].get("chunk_index", 0),
|
||||
"search_type": "bm25"
|
||||
})
|
||||
|
||||
# 按分数降序
|
||||
results.sort(key=lambda x: x["score"], reverse=True)
|
||||
return results[:top_k]
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"BM25 检索失败: {e}")
|
||||
return []
|
||||
|
||||
def _hybrid_fusion(
|
||||
self,
|
||||
vector_results: List[Dict[str, Any]],
|
||||
bm25_results: List[Dict[str, Any]],
|
||||
top_k: int
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
混合融合向量和 BM25 检索结果
|
||||
|
||||
使用 RRFR (Reciprocal Rank Fusion) 算法:
|
||||
Score = weight_vector * (1 / rank_vector) + weight_bm25 * (1 / rank_bm25)
|
||||
|
||||
Args:
|
||||
vector_results: 向量检索结果
|
||||
bm25_results: BM25 检索结果
|
||||
top_k: 返回数量
|
||||
|
||||
Returns:
|
||||
融合后的结果
|
||||
"""
|
||||
if not vector_results and not bm25_results:
|
||||
return []
|
||||
|
||||
# 融合权重
|
||||
weight_vector = 0.6
|
||||
weight_bm25 = 0.4
|
||||
|
||||
# 构建文档分数映射
|
||||
doc_scores: Dict[str, Dict[str, float]] = {}
|
||||
|
||||
# 添加向量检索结果
|
||||
for rank, result in enumerate(vector_results):
|
||||
doc_id = result["doc_id"]
|
||||
if doc_id not in doc_scores:
|
||||
doc_scores[doc_id] = {"vector": 0, "bm25": 0, "content": result["content"], "metadata": result["metadata"]}
|
||||
# 使用倒数排名 (Reciprocal Rank)
|
||||
doc_scores[doc_id]["vector"] = weight_vector / (rank + 1)
|
||||
|
||||
# 添加 BM25 检索结果
|
||||
for rank, result in enumerate(bm25_results):
|
||||
doc_id = result["doc_id"]
|
||||
if doc_id not in doc_scores:
|
||||
doc_scores[doc_id] = {"vector": 0, "bm25": 0, "content": result["content"], "metadata": result["metadata"]}
|
||||
doc_scores[doc_id]["bm25"] = weight_bm25 / (rank + 1)
|
||||
|
||||
# 计算融合分数
|
||||
fused_results = []
|
||||
for doc_id, scores in doc_scores.items():
|
||||
fused_score = scores["vector"] + scores["bm25"]
|
||||
# 使用向量检索结果的原始分数作为参考
|
||||
vector_score = next((r["score"] for r in vector_results if r["doc_id"] == doc_id), 0.5)
|
||||
fused_results.append({
|
||||
"content": scores["content"],
|
||||
"metadata": scores["metadata"],
|
||||
"score": fused_score,
|
||||
"doc_id": doc_id,
|
||||
"vector_score": vector_score,
|
||||
"bm25_score": scores["bm25"],
|
||||
"search_type": "hybrid"
|
||||
})
|
||||
|
||||
# 按融合分数降序排序
|
||||
fused_results.sort(key=lambda x: x["score"], reverse=True)
|
||||
|
||||
logger.debug(f"混合融合: {len(fused_results)} 个文档, 向量:{len(vector_results)}, BM25:{len(bm25_results)}")
|
||||
|
||||
return fused_results[:top_k]
|
||||
|
||||
def _keyword_search(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
关键词搜索后备方案
|
||||
|
||||
Reference in New Issue
Block a user