zzz
2026-04-10 10:10:41 +08:00
24 changed files with 1852 additions and 2333 deletions

View File

@@ -216,9 +216,12 @@ async def analyze_markdown(
return result
finally:
# 清理临时文件
if os.path.exists(tmp_path):
os.unlink(tmp_path)
# 清理临时文件,确保在所有情况下都能清理
try:
if tmp_path and os.path.exists(tmp_path):
os.unlink(tmp_path)
except Exception as cleanup_error:
logger.warning(f"临时文件清理失败: {tmp_path}, error: {cleanup_error}")
except HTTPException:
raise
@@ -280,8 +283,12 @@ async def analyze_markdown_stream(
)
finally:
if os.path.exists(tmp_path):
os.unlink(tmp_path)
# 清理临时文件,确保在所有情况下都能清理
try:
if tmp_path and os.path.exists(tmp_path):
os.unlink(tmp_path)
except Exception as cleanup_error:
logger.warning(f"临时文件清理失败: {tmp_path}, error: {cleanup_error}")
except HTTPException:
raise
@@ -290,7 +297,7 @@ async def analyze_markdown_stream(
raise HTTPException(status_code=500, detail=f"流式分析失败: {str(e)}")
@router.get("/analyze/md/outline")
@router.post("/analyze/md/outline")
async def get_markdown_outline(
file: UploadFile = File(...)
):
@@ -324,8 +331,12 @@ async def get_markdown_outline(
result = await markdown_ai_service.extract_outline(tmp_path)
return result
finally:
if os.path.exists(tmp_path):
os.unlink(tmp_path)
# 清理临时文件,确保在所有情况下都能清理
try:
if tmp_path and os.path.exists(tmp_path):
os.unlink(tmp_path)
except Exception as cleanup_error:
logger.warning(f"临时文件清理失败: {tmp_path}, error: {cleanup_error}")
except Exception as e:
logger.error(f"获取 Markdown 大纲失败: {str(e)}")

View File

@@ -23,6 +23,52 @@ logger = logging.getLogger(__name__)
router = APIRouter(prefix="/upload", tags=["文档上传"])
# ==================== 辅助函数 ====================
async def update_task_status(
task_id: str,
status: str,
progress: int = 0,
message: str = "",
result: dict = None,
error: str = None
):
"""
更新任务状态,同时写入 Redis 和 MongoDB
Args:
task_id: 任务ID
status: 状态
progress: 进度
message: 消息
result: 结果
error: 错误信息
"""
meta = {"progress": progress, "message": message}
if result:
meta["result"] = result
if error:
meta["error"] = error
# 尝试写入 Redis
try:
await redis_db.set_task_status(task_id, status, meta)
except Exception as e:
logger.warning(f"Redis 任务状态更新失败: {e}")
# 尝试写入 MongoDB(作为备用)
try:
await mongodb.update_task(
task_id=task_id,
status=status,
message=message,
result=result,
error=error
)
except Exception as e:
logger.warning(f"MongoDB 任务状态更新失败: {e}")
# ==================== 请求/响应模型 ====================
class UploadResponse(BaseModel):
@@ -77,6 +123,17 @@ async def upload_document(
task_id = str(uuid.uuid4())
try:
# 保存任务记录到 MongoDB(如果 Redis 不可用时仍能查询)
try:
await mongodb.insert_task(
task_id=task_id,
task_type="document_parse",
status="pending",
message=f"文档 {file.filename} 已提交处理"
)
except Exception as mongo_err:
logger.warning(f"MongoDB 保存任务记录失败: {mongo_err}")
content = await file.read()
saved_path = file_service.save_uploaded_file(
content,
@@ -122,6 +179,17 @@ async def upload_documents(
saved_paths = []
try:
# 保存任务记录到 MongoDB
try:
await mongodb.insert_task(
task_id=task_id,
task_type="batch_parse",
status="pending",
message=f"已提交 {len(files)} 个文档处理"
)
except Exception as mongo_err:
logger.warning(f"MongoDB 保存批量任务记录失败: {mongo_err}")
for file in files:
if not file.filename:
continue
@@ -159,9 +227,9 @@ async def process_document(
"""处理单个文档"""
try:
# 状态: 解析中
await redis_db.set_task_status(
await update_task_status(
task_id, status="processing",
meta={"progress": 10, "message": "正在解析文档"}
progress=10, message="正在解析文档"
)
# 解析文档
@@ -172,9 +240,9 @@ async def process_document(
raise Exception(result.error or "解析失败")
# 状态: 存储中
await redis_db.set_task_status(
await update_task_status(
task_id, status="processing",
meta={"progress": 30, "message": "正在存储数据"}
progress=30, message="正在存储数据"
)
# 存储到 MongoDB
@@ -235,9 +303,9 @@ async def process_document(
# 如果是 Excel存储到 MySQL + AI生成描述 + RAG索引
if doc_type in ["xlsx", "xls"]:
await redis_db.set_task_status(
await update_task_status(
task_id, status="processing",
meta={"progress": 50, "message": "正在存储到MySQL并生成字段描述"}
progress=50, message="正在存储到MySQL并生成字段描述"
)
try:
@@ -259,9 +327,9 @@ async def process_document(
else:
# 非结构化文档
await redis_db.set_task_status(
await update_task_status(
task_id, status="processing",
meta={"progress": 60, "message": "正在建立索引"}
progress=60, message="正在建立索引"
)
# 如果文档中有表格数据,提取并存储到 MySQL + RAG
@@ -282,17 +350,13 @@ async def process_document(
await index_document_to_rag(doc_id, original_filename, result, doc_type)
# 完成
await redis_db.set_task_status(
await update_task_status(
task_id, status="success",
meta={
"progress": 100,
"message": "处理完成",
progress=100, message="处理完成",
result={
"doc_id": doc_id,
"result": {
"doc_id": doc_id,
"doc_type": doc_type,
"filename": original_filename
}
"doc_type": doc_type,
"filename": original_filename
}
)
@@ -300,18 +364,19 @@ async def process_document(
except Exception as e:
logger.error(f"文档处理失败: {str(e)}")
await redis_db.set_task_status(
await update_task_status(
task_id, status="failure",
meta={"error": str(e)}
progress=0, message="处理失败",
error=str(e)
)
async def process_documents_batch(task_id: str, files: List[dict]):
"""批量处理文档"""
try:
await redis_db.set_task_status(
await update_task_status(
task_id, status="processing",
meta={"progress": 0, "message": "开始批量处理"}
progress=0, message="开始批量处理"
)
results = []
@@ -362,21 +427,23 @@ async def process_documents_batch(task_id: str, files: List[dict]):
results.append({"filename": file_info["filename"], "success": False, "error": str(e)})
progress = int((i + 1) / len(files) * 100)
await redis_db.set_task_status(
await update_task_status(
task_id, status="processing",
meta={"progress": progress, "message": f"已处理 {i+1}/{len(files)}"}
progress=progress, message=f"已处理 {i+1}/{len(files)}"
)
await redis_db.set_task_status(
await update_task_status(
task_id, status="success",
meta={"progress": 100, "message": "批量处理完成", "results": results}
progress=100, message="批量处理完成",
result={"results": results}
)
except Exception as e:
logger.error(f"批量处理失败: {str(e)}")
await redis_db.set_task_status(
await update_task_status(
task_id, status="failure",
meta={"error": str(e)}
progress=0, message="批量处理失败",
error=str(e)
)

View File

@@ -19,26 +19,43 @@ async def health_check() -> Dict[str, Any]:
返回各数据库连接状态和应用信息
"""
# 检查各数据库连接状态
mysql_status = "connected"
mongodb_status = "connected"
redis_status = "connected"
mysql_status = "unknown"
mongodb_status = "unknown"
redis_status = "unknown"
try:
if mysql_db.async_engine is None:
mysql_status = "disconnected"
except Exception:
else:
# 实际执行一次查询验证连接
from sqlalchemy import text
async with mysql_db.async_engine.connect() as conn:
await conn.execute(text("SELECT 1"))
mysql_status = "connected"
except Exception as e:
logger.warning(f"MySQL 健康检查失败: {e}")
mysql_status = "error"
try:
if mongodb.client is None:
mongodb_status = "disconnected"
except Exception:
else:
# 实际 ping 验证
await mongodb.client.admin.command('ping')
mongodb_status = "connected"
except Exception as e:
logger.warning(f"MongoDB 健康检查失败: {e}")
mongodb_status = "error"
try:
if not redis_db.is_connected:
if not redis_db.is_connected or redis_db.client is None:
redis_status = "disconnected"
except Exception:
else:
# 实际执行 ping 验证
await redis_db.client.ping()
redis_status = "connected"
except Exception as e:
logger.warning(f"Redis 健康检查失败: {e}")
redis_status = "error"
return {
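Each backend check above follows the same shape: short-circuit to "disconnected" when the client is missing, otherwise run one real round-trip and downgrade to "error" on failure. A condensed sketch of that pattern as a reusable helper (the timeout and helper name are added assumptions, not part of this file):

import asyncio
import logging

logger = logging.getLogger(__name__)

async def probe(name: str, is_configured: bool, ping_factory, timeout: float = 2.0) -> str:
    """Return 'connected', 'disconnected' or 'error' for one backend."""
    try:
        if not is_configured:
            return "disconnected"
        await asyncio.wait_for(ping_factory(), timeout=timeout)
        return "connected"
    except Exception as e:
        logger.warning("%s health check failed: %s", name, e)
        return "error"

# e.g. redis_status = await probe("Redis", redis_db.is_connected, lambda: redis_db.client.ping())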

View File

@@ -1,13 +1,13 @@
"""
任务管理 API 接口
提供异步任务状态查询
提供异步任务状态查询和历史记录
"""
from typing import Optional
from fastapi import APIRouter, HTTPException
from app.core.database import redis_db
from app.core.database import redis_db, mongodb
router = APIRouter(prefix="/tasks", tags=["任务管理"])
@@ -23,25 +23,94 @@ async def get_task_status(task_id: str):
Returns:
任务状态信息
"""
# 优先从 Redis 获取
status = await redis_db.get_task_status(task_id)
if not status:
# Redis不可用时,假设任务已完成(文档已成功处理)
# 前端轮询时会得到这个响应
if status:
return {
"task_id": task_id,
"status": "success",
"progress": 100,
"message": "任务处理完成",
"result": None,
"error": None
"status": status.get("status", "unknown"),
"progress": status.get("meta", {}).get("progress", 0),
"message": status.get("meta", {}).get("message"),
"result": status.get("meta", {}).get("result"),
"error": status.get("meta", {}).get("error")
}
# Redis 不可用时,尝试从 MongoDB 获取
mongo_task = await mongodb.get_task(task_id)
if mongo_task:
return {
"task_id": mongo_task.get("task_id"),
"status": mongo_task.get("status", "unknown"),
"progress": 100 if mongo_task.get("status") == "success" else 0,
"message": mongo_task.get("message"),
"result": mongo_task.get("result"),
"error": mongo_task.get("error")
}
# 任务不存在或状态未知
return {
"task_id": task_id,
"status": status.get("status", "unknown"),
"progress": status.get("meta", {}).get("progress", 0),
"message": status.get("meta", {}).get("message"),
"result": status.get("meta", {}).get("result"),
"error": status.get("meta", {}).get("error")
"status": "unknown",
"progress": 0,
"message": "无法获取任务状态Redis和MongoDB均不可用",
"result": None,
"error": None
}
@router.get("/")
async def list_tasks(limit: int = 50, skip: int = 0):
"""
获取任务历史列表
Args:
limit: 返回数量限制
skip: 跳过数量
Returns:
任务列表
"""
try:
tasks = await mongodb.list_tasks(limit=limit, skip=skip)
return {
"success": True,
"tasks": tasks,
"count": len(tasks)
}
except Exception as e:
# MongoDB 不可用时返回空列表
return {
"success": False,
"tasks": [],
"count": 0,
"error": str(e)
}
@router.delete("/{task_id}")
async def delete_task(task_id: str):
"""
删除任务
Args:
task_id: 任务ID
Returns:
是否删除成功
"""
try:
# 从 Redis 删除
if redis_db.is_connected and redis_db.client:
key = f"task:{task_id}"
await redis_db.client.delete(key)
# 从 MongoDB 删除
deleted = await mongodb.delete_task(task_id)
return {
"success": True,
"deleted": deleted
}
except Exception as e:
raise HTTPException(status_code=500, detail=f"删除任务失败: {str(e)}")

View File

@@ -23,6 +23,44 @@ logger = logging.getLogger(__name__)
router = APIRouter(prefix="/templates", tags=["表格模板"])
# ==================== 辅助函数 ====================
async def update_task_status(
task_id: str,
status: str,
progress: int = 0,
message: str = "",
result: dict = None,
error: str = None
):
"""
更新任务状态,同时写入 Redis 和 MongoDB
"""
from app.core.database import redis_db
meta = {"progress": progress, "message": message}
if result:
meta["result"] = result
if error:
meta["error"] = error
try:
await redis_db.set_task_status(task_id, status, meta)
except Exception as e:
logger.warning(f"Redis 任务状态更新失败: {e}")
try:
await mongodb.update_task(
task_id=task_id,
status=status,
message=message,
result=result,
error=error
)
except Exception as e:
logger.warning(f"MongoDB 任务状态更新失败: {e}")
# ==================== 请求/响应模型 ====================
class TemplateFieldRequest(BaseModel):
@@ -41,6 +79,7 @@ class FillRequest(BaseModel):
source_doc_ids: Optional[List[str]] = None # MongoDB 文档 ID 列表
source_file_paths: Optional[List[str]] = None # 源文档文件路径列表
user_hint: Optional[str] = None
task_id: Optional[str] = None # 可选的任务ID用于任务历史跟踪
class ExportRequest(BaseModel):
@@ -162,20 +201,17 @@ async def upload_joint_template(
)
try:
# 1. 保存模板文件并提取字段
# 1. 保存模板文件
template_content = await template_file.read()
template_path = file_service.save_uploaded_file(
template_content,
template_file.filename,
subfolder="templates"
)
template_fields = await template_fill_service.get_template_fields_from_file(
template_path,
template_ext
)
# 2. 处理源文档 - 保存文件
# 2. 保存并解析源文档 - 提取内容用于生成表头
source_file_info = []
source_contents = []
for sf in source_files:
if sf.filename:
sf_content = await sf.read()
@@ -190,10 +226,81 @@ async def upload_joint_template(
"filename": sf.filename,
"ext": sf_ext
})
# 解析源文档获取内容(用于 AI 生成表头)
try:
from app.core.document_parser import ParserFactory
parser = ParserFactory.get_parser(sf_path)
parse_result = parser.parse(sf_path)
if parse_result.success and parse_result.data:
# 获取原始内容
content = parse_result.data.get("content", "")[:5000] if parse_result.data.get("content") else ""
# 获取标题可能在顶层或structured_data内
titles = parse_result.data.get("titles", [])
if not titles and parse_result.data.get("structured_data"):
titles = parse_result.data.get("structured_data", {}).get("titles", [])
titles = titles[:10] if titles else []
# 获取表格数量可能在顶层或structured_data内
tables = parse_result.data.get("tables", [])
if not tables and parse_result.data.get("structured_data"):
tables = parse_result.data.get("structured_data", {}).get("tables", [])
tables_count = len(tables) if tables else 0
# 获取表格内容摘要(用于 AI 理解源文档结构)
tables_summary = ""
if tables:
tables_summary = "\n【文档中的表格】:\n"
for idx, table in enumerate(tables[:5]): # 最多5个表格
if isinstance(table, dict):
headers = table.get("headers", [])
rows = table.get("rows", [])
if headers:
tables_summary += f"表格{idx+1}表头: {', '.join(str(h) for h in headers)}\n"
if rows:
tables_summary += f"表格{idx+1}前3行: "
for row_idx, row in enumerate(rows[:3]):
if isinstance(row, list):
tables_summary += " | ".join(str(c) for c in row) + "; "
elif isinstance(row, dict):
tables_summary += " | ".join(str(row.get(h, "")) for h in headers if headers) + "; "
tables_summary += "\n"
source_contents.append({
"filename": sf.filename,
"doc_type": sf_ext,
"content": content,
"titles": titles,
"tables_count": tables_count,
"tables_summary": tables_summary
})
logger.info(f"[DEBUG] source_contents built: filename={sf.filename}, content_len={len(content)}, titles_count={len(titles)}, tables_count={tables_count}")
if tables_summary:
logger.info(f"[DEBUG] tables_summary preview: {tables_summary[:300]}")
except Exception as e:
logger.warning(f"解析源文档失败 {sf.filename}: {e}")
# 3. 根据源文档内容生成表头
template_fields = await template_fill_service.get_template_fields_from_file(
template_path,
template_ext,
source_contents=source_contents # 传递源文档内容
)
# 4. 异步处理源文档到 MongoDB
task_id = str(uuid.uuid4())
if source_file_info:
# 保存任务记录到 MongoDB
try:
await mongodb.insert_task(
task_id=task_id,
task_type="source_process",
status="pending",
message=f"开始处理 {len(source_file_info)} 个源文档"
)
except Exception as mongo_err:
logger.warning(f"MongoDB 保存任务记录失败: {mongo_err}")
background_tasks.add_task(
process_source_documents,
task_id=task_id,
@@ -232,12 +339,10 @@ async def upload_joint_template(
async def process_source_documents(task_id: str, files: List[dict]):
"""异步处理源文档存入MongoDB"""
from app.core.database import redis_db
try:
await redis_db.set_task_status(
await update_task_status(
task_id, status="processing",
meta={"progress": 0, "message": "开始处理源文档"}
progress=0, message="开始处理源文档"
)
doc_ids = []
@@ -266,22 +371,24 @@ async def process_source_documents(task_id: str, files: List[dict]):
logger.error(f"源文档处理异常: {file_info['filename']}, error: {str(e)}")
progress = int((i + 1) / len(files) * 100)
await redis_db.set_task_status(
await update_task_status(
task_id, status="processing",
meta={"progress": progress, "message": f"已处理 {i+1}/{len(files)}"}
progress=progress, message=f"已处理 {i+1}/{len(files)}"
)
await redis_db.set_task_status(
await update_task_status(
task_id, status="success",
meta={"progress": 100, "message": "源文档处理完成", "doc_ids": doc_ids}
progress=100, message="源文档处理完成",
result={"doc_ids": doc_ids}
)
logger.info(f"所有源文档处理完成: {len(doc_ids)}")
except Exception as e:
logger.error(f"源文档批量处理失败: {str(e)}")
await redis_db.set_task_status(
await update_task_status(
task_id, status="failure",
meta={"error": str(e)}
progress=0, message="源文档处理失败",
error=str(e)
)
@@ -340,7 +447,27 @@ async def fill_template(
Returns:
填写结果
"""
# 生成或使用传入的 task_id
task_id = request.task_id or str(uuid.uuid4())
try:
# 创建任务记录到 MongoDB
try:
await mongodb.insert_task(
task_id=task_id,
task_type="template_fill",
status="processing",
message=f"开始填表任务: {len(request.template_fields)} 个字段"
)
except Exception as mongo_err:
logger.warning(f"MongoDB 创建任务记录失败: {mongo_err}")
# 更新进度 - 开始
await update_task_status(
task_id, "processing",
progress=0, message="开始处理..."
)
# 转换字段
fields = [
TemplateField(
@@ -353,17 +480,51 @@ async def fill_template(
for f in request.template_fields
]
# 从 template_id 提取文件类型
template_file_type = "xlsx" # 默认类型
if request.template_id:
ext = request.template_id.split('.')[-1].lower()
if ext in ["xlsx", "xls"]:
template_file_type = "xlsx"
elif ext == "docx":
template_file_type = "docx"
# 更新进度 - 准备开始填写
await update_task_status(
task_id, "processing",
progress=10, message=f"准备填写 {len(fields)} 个字段..."
)
# 执行填写
result = await template_fill_service.fill_template(
template_fields=fields,
source_doc_ids=request.source_doc_ids,
source_file_paths=request.source_file_paths,
user_hint=request.user_hint
user_hint=request.user_hint,
template_id=request.template_id,
template_file_type=template_file_type,
task_id=task_id
)
return result
# 更新为成功
await update_task_status(
task_id, "success",
progress=100, message="填表完成",
result={
"field_count": len(fields),
"max_rows": result.get("max_rows", 0)
}
)
return {**result, "task_id": task_id}
except Exception as e:
# 更新为失败
await update_task_status(
task_id, "failure",
progress=0, message="填表失败",
error=str(e)
)
logger.error(f"填写表格失败: {str(e)}")
raise HTTPException(status_code=500, detail=f"填写失败: {str(e)}")

View File

@@ -5,6 +5,7 @@ from fastapi import APIRouter, UploadFile, File, HTTPException, Query
from fastapi.responses import StreamingResponse
from typing import Optional
import logging
import os
import pandas as pd
import io
@@ -126,7 +127,7 @@ async def upload_excel(
content += f"... (共 {len(sheet_data['rows'])} 行)\n\n"
doc_metadata = {
"filename": saved_path.split("/")[-1] if "/" in saved_path else saved_path.split("\\")[-1],
"filename": os.path.basename(saved_path),
"original_filename": file.filename,
"saved_path": saved_path,
"file_size": len(content),
@@ -253,7 +254,7 @@ async def export_excel(
output.seek(0)
# 生成文件名
original_name = file_path.split('/')[-1] if '/' in file_path else file_path
original_name = os.path.basename(file_path)
if columns:
export_name = f"export_{sheet_name or 'data'}_{len(column_list) if columns else 'all'}_cols.xlsx"
else:

View File

@@ -59,6 +59,11 @@ class MongoDB:
"""RAG索引集合 - 存储字段语义索引"""
return self.db["rag_index"]
@property
def tasks(self):
"""任务集合 - 存储任务历史记录"""
return self.db["tasks"]
# ==================== 文档操作 ====================
async def insert_document(
@@ -264,8 +269,128 @@ class MongoDB:
await self.rag_index.create_index("table_name")
await self.rag_index.create_index("field_name")
# 任务集合索引
await self.tasks.create_index("task_id", unique=True)
await self.tasks.create_index("created_at")
logger.info("MongoDB 索引创建完成")
# ==================== 任务历史操作 ====================
async def insert_task(
self,
task_id: str,
task_type: str,
status: str = "pending",
message: str = "",
result: Optional[Dict[str, Any]] = None,
error: Optional[str] = None,
) -> str:
"""
插入任务记录
Args:
task_id: 任务ID
task_type: 任务类型
status: 任务状态
message: 任务消息
result: 任务结果
error: 错误信息
Returns:
插入文档的ID
"""
task = {
"task_id": task_id,
"task_type": task_type,
"status": status,
"message": message,
"result": result,
"error": error,
"created_at": datetime.utcnow(),
"updated_at": datetime.utcnow(),
}
result_obj = await self.tasks.insert_one(task)
return str(result_obj.inserted_id)
async def update_task(
self,
task_id: str,
status: Optional[str] = None,
message: Optional[str] = None,
result: Optional[Dict[str, Any]] = None,
error: Optional[str] = None,
) -> bool:
"""
更新任务状态
Args:
task_id: 任务ID
status: 任务状态
message: 任务消息
result: 任务结果
error: 错误信息
Returns:
是否更新成功
"""
from bson import ObjectId
update_data = {"updated_at": datetime.utcnow()}
if status is not None:
update_data["status"] = status
if message is not None:
update_data["message"] = message
if result is not None:
update_data["result"] = result
if error is not None:
update_data["error"] = error
update_result = await self.tasks.update_one(
{"task_id": task_id},
{"$set": update_data}
)
return update_result.modified_count > 0
async def get_task(self, task_id: str) -> Optional[Dict[str, Any]]:
"""根据task_id获取任务"""
task = await self.tasks.find_one({"task_id": task_id})
if task:
task["_id"] = str(task["_id"])
return task
async def list_tasks(
self,
limit: int = 50,
skip: int = 0,
) -> List[Dict[str, Any]]:
"""
获取任务列表
Args:
limit: 返回数量
skip: 跳过数量
Returns:
任务列表
"""
cursor = self.tasks.find().sort("created_at", -1).skip(skip).limit(limit)
tasks = []
async for task in cursor:
task["_id"] = str(task["_id"])
# 转换 datetime 为字符串
if task.get("created_at"):
task["created_at"] = task["created_at"].isoformat()
if task.get("updated_at"):
task["updated_at"] = task["updated_at"].isoformat()
tasks.append(task)
return tasks
async def delete_task(self, task_id: str) -> bool:
"""删除任务"""
result = await self.tasks.delete_one({"task_id": task_id})
return result.deleted_count > 0
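A short lifecycle sketch of the new task-history helpers, matching the signatures defined above (values are illustrative; mongodb is the module's global singleton):

# from app.core.database import mongodb

async def demo_task_lifecycle():
    task_id = "demo-task-001"
    await mongodb.insert_task(task_id=task_id, task_type="document_parse",
                              status="pending", message="queued")
    await mongodb.update_task(task_id, status="success", message="done",
                              result={"doc_id": "example-doc-id"})
    task = await mongodb.get_task(task_id)        # dict with stringified _id
    recent = await mongodb.list_tasks(limit=10)   # newest first, ISO-formatted timestamps
    await mongodb.delete_task(task_id)
    return task, recent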
# ==================== 全局单例 ====================

View File

@@ -317,24 +317,70 @@ class XlsxParser(BaseParser):
import zipfile
from xml.etree import ElementTree as ET
# 常见的命名空间
COMMON_NAMESPACES = [
'http://schemas.openxmlformats.org/spreadsheetml/2006/main',
'http://schemas.openxmlformats.org/spreadsheetml/2005/main',
'http://schemas.openxmlformats.org/spreadsheetml/2004/main',
'http://schemas.openxmlformats.org/spreadsheetml/2003/main',
]
try:
with zipfile.ZipFile(file_path, 'r') as z:
if 'xl/workbook.xml' not in z.namelist():
# 尝试多种可能的 workbook.xml 路径
possible_paths = ['xl/workbook.xml', 'xl\\workbook.xml', 'workbook.xml']
content = None
for path in possible_paths:
if path in z.namelist():
content = z.read(path)
logger.info(f"找到 workbook.xml at: {path}")
break
if content is None:
logger.warning(f"未找到 workbook.xml文件列表: {z.namelist()[:10]}")
return []
content = z.read('xl/workbook.xml')
root = ET.fromstring(content)
# 命名空间
ns = {'main': 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'}
sheet_names = []
for sheet in root.findall('.//main:sheet', ns):
name = sheet.get('name')
if name:
sheet_names.append(name)
# 方法1:尝试带命名空间的查找
for ns in COMMON_NAMESPACES:
sheet_elements = root.findall(f'.//{{{ns}}}sheet')
if sheet_elements:
for sheet in sheet_elements:
name = sheet.get('name')
if name:
sheet_names.append(name)
if sheet_names:
logger.info(f"使用命名空间 {ns} 提取工作表: {sheet_names}")
return sheet_names
# 方法2:不使用命名空间,直接查找所有 sheet 元素
if not sheet_names:
for elem in root.iter():
if elem.tag.endswith('sheet') and elem.tag != 'sheets':
name = elem.get('name')
if name:
sheet_names.append(name)
for child in elem:
if child.tag.endswith('sheet') or child.tag == 'sheet':
name = child.get('name')
if name and name not in sheet_names:
sheet_names.append(name)
# 方法3:直接从 XML 文本中正则匹配 sheet name
if not sheet_names:
import re
xml_str = content.decode('utf-8', errors='ignore')
matches = re.findall(r'<sheet\s+[^>]*name=["\']([^"\']+)["\']', xml_str, re.IGNORECASE)
if matches:
sheet_names = matches
logger.info(f"使用正则提取工作表: {sheet_names}")
logger.info(f"从 XML 提取工作表: {sheet_names}")
return sheet_names
except Exception as e:
logger.error(f"从 XML 提取工作表名称失败: {e}")
return []
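All three fallbacks above work around the workbook namespace varying between producers. For comparison, matching on the element's local name alone covers the same cases in a few lines; this is only a sketch, not what the parser actually does:

import zipfile
from xml.etree import ElementTree as ET

def sheet_names_by_localname(file_path: str) -> list:
    """Read sheet names from xl/workbook.xml while ignoring the XML namespace."""
    with zipfile.ZipFile(file_path) as z:
        root = ET.fromstring(z.read("xl/workbook.xml"))
    names = []
    for elem in root.iter():
        local = elem.tag.rsplit("}", 1)[-1]   # strips a leading '{namespace}' if present
        if local == "sheet" and elem.get("name"):
            names.append(elem.get("name"))
    return names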
@@ -356,6 +402,32 @@ class XlsxParser(BaseParser):
import zipfile
from xml.etree import ElementTree as ET
# 常见的命名空间
COMMON_NAMESPACES = [
'http://schemas.openxmlformats.org/spreadsheetml/2006/main',
'http://schemas.openxmlformats.org/spreadsheetml/2005/main',
'http://schemas.openxmlformats.org/spreadsheetml/2004/main',
'http://schemas.openxmlformats.org/spreadsheetml/2003/main',
]
def find_elements_with_ns(root, tag_name):
"""灵活查找元素,支持任意命名空间"""
results = []
# 方法1:用固定命名空间
for ns in COMMON_NAMESPACES:
try:
elems = root.findall(f'.//{{{ns}}}{tag_name}')
if elems:
results.extend(elems)
except Exception:
pass
# 方法2:不带命名空间查找
if not results:
for elem in root.iter():
if elem.tag.endswith('}' + tag_name):
results.append(elem)
return results
with zipfile.ZipFile(file_path, 'r') as z:
# 获取工作表名称
sheet_names = self._extract_sheet_names_from_xml(file_path)
@@ -366,57 +438,68 @@ class XlsxParser(BaseParser):
target_sheet = sheet_name if sheet_name and sheet_name in sheet_names else sheet_names[0]
sheet_index = sheet_names.index(target_sheet) + 1 # sheet1.xml, sheet2.xml, ...
# 读取 shared strings
# 读取 shared strings - 尝试多种路径
shared_strings = []
if 'xl/sharedStrings.xml' in z.namelist():
ss_content = z.read('xl/sharedStrings.xml')
ss_root = ET.fromstring(ss_content)
ns = {'main': 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'}
for si in ss_root.findall('.//main:si', ns):
t = si.find('.//main:t', ns)
if t is not None:
shared_strings.append(t.text or '')
else:
shared_strings.append('')
ss_paths = ['xl/sharedStrings.xml', 'xl\\sharedStrings.xml', 'sharedStrings.xml']
for ss_path in ss_paths:
if ss_path in z.namelist():
try:
ss_content = z.read(ss_path)
ss_root = ET.fromstring(ss_content)
for si in find_elements_with_ns(ss_root, 'si'):
t_elements = [c for c in si if c.tag.endswith('}t') or c.tag == 't']
if t_elements:
shared_strings.append(t_elements[0].text or '')
else:
shared_strings.append('')
break
except Exception as e:
logger.warning(f"读取 sharedStrings 失败: {e}")
# 读取工作表
sheet_file = f'xl/worksheets/sheet{sheet_index}.xml'
if sheet_file not in z.namelist():
raise ValueError(f"工作表文件 {sheet_file} 不存在")
# 读取工作表 - 尝试多种可能的路径
sheet_content = None
sheet_paths = [
f'xl/worksheets/sheet{sheet_index}.xml',
f'xl\\worksheets\\sheet{sheet_index}.xml',
f'worksheets/sheet{sheet_index}.xml',
]
for sp in sheet_paths:
if sp in z.namelist():
sheet_content = z.read(sp)
break
if sheet_content is None:
raise ValueError(f"工作表文件 sheet{sheet_index}.xml 不存在")
sheet_content = z.read(sheet_file)
root = ET.fromstring(sheet_content)
ns = {'main': 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'}
# 收集所有行数据
all_rows = []
headers = {}
for row in root.findall('.//main:row', ns):
for row in find_elements_with_ns(root, 'row'):
row_idx = int(row.get('r', 0))
row_cells = {}
for cell in row.findall('main:c', ns):
for cell in find_elements_with_ns(row, 'c'):
cell_ref = cell.get('r', '')
col_letters = ''.join(filter(str.isalpha, cell_ref))
cell_type = cell.get('t', 'n')
v = cell.find('main:v', ns)
v_elements = find_elements_with_ns(cell, 'v')
v = v_elements[0] if v_elements else None
if v is not None and v.text:
if cell_type == 's':
# shared string
try:
row_cells[col_letters] = shared_strings[int(v.text)]
except (ValueError, IndexError):
row_cells[col_letters] = v.text
elif cell_type == 'b':
# boolean
row_cells[col_letters] = v.text == '1'
else:
row_cells[col_letters] = v.text
else:
row_cells[col_letters] = None
# 处理表头行
if row_idx == header_row + 1:
headers = {**row_cells}
elif row_idx > header_row + 1:
@@ -424,7 +507,6 @@ class XlsxParser(BaseParser):
# 构建 DataFrame
if headers:
# 按原始列顺序排列
col_order = list(headers.keys())
df = pd.DataFrame(all_rows)
if not df.empty:

View File

@@ -0,0 +1,15 @@
"""
指令执行模块
注意: 此模块为可选功能,当前尚未实现。
如需启用,请补全 intent_parser.py 和 executor.py 中的默认实现
"""
from .intent_parser import IntentParser, DefaultIntentParser
from .executor import InstructionExecutor, DefaultInstructionExecutor
__all__ = [
"IntentParser",
"DefaultIntentParser",
"InstructionExecutor",
"DefaultInstructionExecutor",
]

View File

@@ -0,0 +1,35 @@
"""
指令执行器模块
将自然语言指令转换为可执行操作
注意: 此模块为可选功能,当前尚未实现。
"""
from abc import ABC, abstractmethod
from typing import Any, Dict
class InstructionExecutor(ABC):
"""指令执行器抽象基类"""
@abstractmethod
async def execute(self, instruction: str, context: Dict[str, Any]) -> Dict[str, Any]:
"""
执行指令
Args:
instruction: 解析后的指令
context: 执行上下文
Returns:
执行结果
"""
pass
class DefaultInstructionExecutor(InstructionExecutor):
"""默认指令执行器"""
async def execute(self, instruction: str, context: Dict[str, Any]) -> Dict[str, Any]:
"""暂未实现"""
raise NotImplementedError("指令执行功能暂未实现")

View File

@@ -0,0 +1,34 @@
"""
意图解析器模块
解析用户自然语言指令,识别意图和参数
注意: 此模块为可选功能,当前尚未实现。
"""
from abc import ABC, abstractmethod
from typing import Any, Dict, Tuple
class IntentParser(ABC):
"""意图解析器抽象基类"""
@abstractmethod
async def parse(self, text: str) -> Tuple[str, Dict[str, Any]]:
"""
解析自然语言指令
Args:
text: 用户输入的自然语言
Returns:
(意图类型, 参数字典)
"""
pass
class DefaultIntentParser(IntentParser):
"""默认意图解析器"""
async def parse(self, text: str) -> Tuple[str, Dict[str, Any]]:
"""暂未实现"""
raise NotImplementedError("意图解析功能暂未实现")
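Both default classes deliberately raise NotImplementedError, so any concrete behaviour has to come from a subclass. A hypothetical keyword-based parser, purely as an illustration of the interface (not part of this commit):

from typing import Any, Dict, Tuple

class KeywordIntentParser(IntentParser):
    """Toy example: map a leading keyword to an intent, keep the rest as a query parameter."""

    KEYWORDS = {"查询": "query", "导出": "export", "删除": "delete"}

    async def parse(self, text: str) -> Tuple[str, Dict[str, Any]]:
        text = text.strip()
        for keyword, intent in self.KEYWORDS.items():
            if text.startswith(keyword):
                return intent, {"query": text[len(keyword):].strip()}
        return "unknown", {"query": text}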

View File

@@ -61,7 +61,10 @@ class TemplateFillService:
template_fields: List[TemplateField],
source_doc_ids: Optional[List[str]] = None,
source_file_paths: Optional[List[str]] = None,
user_hint: Optional[str] = None
user_hint: Optional[str] = None,
template_id: Optional[str] = None,
template_file_type: Optional[str] = "xlsx",
task_id: Optional[str] = None
) -> Dict[str, Any]:
"""
填写表格模板
@@ -71,6 +74,9 @@ class TemplateFillService:
source_doc_ids: 源文档 MongoDB ID 列表
source_file_paths: 源文档文件路径列表
user_hint: 用户提示(如"请从合同文档中提取")
template_id: 模板文件路径(用于重新生成表头)
template_file_type: 模板文件类型
task_id: 可选的任务ID用于任务进度跟踪
Returns:
填写结果
@@ -79,15 +85,94 @@ class TemplateFillService:
fill_details = []
logger.info(f"开始填表: {len(template_fields)} 个字段, {len(source_doc_ids or [])} 个源文档")
logger.info(f"source_doc_ids: {source_doc_ids}")
logger.info(f"source_file_paths: {source_file_paths}")
# 1. 加载源文档内容
source_docs = await self._load_source_documents(source_doc_ids, source_file_paths)
logger.info(f"加载了 {len(source_docs)} 个源文档")
# 打印每个加载的文档的详细信息
for i, doc in enumerate(source_docs):
logger.info(f" 文档[{i}]: id={doc.doc_id}, filename={doc.filename}, doc_type={doc.doc_type}")
logger.info(f" content长度: {len(doc.content)}, structured_data keys: {list(doc.structured_data.keys()) if doc.structured_data else 'None'}")
if not source_docs:
logger.warning("没有找到源文档,填表结果将全部为空")
# 3. 检查是否需要使用源文档重新生成表头
# 条件:源文档已加载 AND 现有字段看起来是自动生成的(如"字段1"、"字段2")
needs_regenerate_headers = (
len(source_docs) > 0 and
len(template_fields) > 0 and
all(self._is_auto_generated_field(f.name) for f in template_fields)
)
if needs_regenerate_headers:
logger.info(f"检测到自动生成表头,尝试使用源文档重新生成... (当前字段: {[f.name for f in template_fields]})")
# 将 SourceDocument 转换为 source_contents 格式
source_contents = []
for doc in source_docs:
structured = doc.structured_data if doc.structured_data else {}
# 获取标题
titles = structured.get("titles", [])
if not titles:
titles = []
# 获取表格
tables = structured.get("tables", [])
tables_count = len(tables) if tables else 0
# 生成表格摘要
tables_summary = ""
if tables:
tables_summary = "\n【文档中的表格】:\n"
for idx, table in enumerate(tables[:5]):
if isinstance(table, dict):
headers = table.get("headers", [])
rows = table.get("rows", [])
if headers:
tables_summary += f"表格{idx+1}表头: {', '.join(str(h) for h in headers)}\n"
if rows:
tables_summary += f"表格{idx+1}前3行: "
for row_idx, row in enumerate(rows[:3]):
if isinstance(row, list):
tables_summary += " | ".join(str(c) for c in row) + "; "
elif isinstance(row, dict):
tables_summary += " | ".join(str(row.get(h, "")) for h in headers if headers) + "; "
tables_summary += "\n"
source_contents.append({
"filename": doc.filename,
"doc_type": doc.doc_type,
"content": doc.content[:5000] if doc.content else "",
"titles": titles[:10] if titles else [],
"tables_count": tables_count,
"tables_summary": tables_summary
})
# 使用源文档内容重新生成表头
if template_id and template_file_type:
logger.info(f"使用源文档重新生成表头: template_id={template_id}, template_file_type={template_file_type}")
new_fields = await self.get_template_fields_from_file(
template_id,
template_file_type,
source_contents=source_contents
)
if new_fields and len(new_fields) > 0:
logger.info(f"成功重新生成表头: {[f.name for f in new_fields]}")
template_fields = new_fields
else:
logger.warning("重新生成表头返回空结果,使用原始字段")
else:
logger.warning("无法重新生成表头:缺少 template_id 或 template_file_type")
else:
if source_docs and template_fields:
logger.info(f"表头看起来正常(非自动生成),无需重新生成: {[f.name for f in template_fields[:5]]}")
# 2. 对每个字段进行提取
for idx, field in enumerate(template_fields):
try:
@@ -99,6 +184,22 @@ class TemplateFillService:
user_hint=user_hint
)
# AI审核验证提取的值是否合理
if result.values and result.values[0]:
logger.info(f"字段 {field.name} 进入AI审核阶段...")
verified_result = await self._verify_field_value(
field=field,
extracted_values=result.values,
source_docs=source_docs,
user_hint=user_hint
)
if verified_result:
# 审核给出了修正结果
result = verified_result
logger.info(f"字段 {field.name} 审核后修正值: {result.values[:3]}")
else:
logger.info(f"字段 {field.name} 审核通过,使用原提取结果")
# 存储结果 - 使用 values 数组
filled_data[field.name] = result.values if result.values else [""]
fill_details.append({
@@ -159,14 +260,49 @@ class TemplateFillService:
try:
doc = await mongodb.get_document(doc_id)
if doc:
sd = doc.get("structured_data", {})
sd_keys = list(sd.keys()) if sd else []
logger.info(f"从MongoDB加载文档: {doc_id}, doc_type={doc.get('doc_type')}, structured_data keys={sd_keys}")
# 如果 structured_data 为空,但有 file_path尝试重新解析文件
doc_content = doc.get("content", "")
if not sd or (not sd.get("tables") and not sd.get("headers") and not sd.get("rows")):
file_path = doc.get("metadata", {}).get("file_path")
if file_path:
logger.info(f" structured_data 为空,尝试重新解析文件: {file_path}")
try:
parser = ParserFactory.get_parser(file_path)
result = parser.parse(file_path)
if result.success and result.data:
if result.data.get("structured_data"):
sd = result.data.get("structured_data")
logger.info(f" 重新解析成功structured_data keys: {list(sd.keys())}")
elif result.data.get("tables"):
sd = {"tables": result.data.get("tables", [])}
logger.info(f" 使用 data.tablestables数量: {len(sd.get('tables', []))}")
elif result.data.get("rows"):
sd = result.data
logger.info(f" 使用 data.rows 格式")
if result.data.get("content"):
doc_content = result.data.get("content", "")
else:
logger.warning(f" 重新解析失败: {result.error if result else 'unknown'}")
except Exception as parse_err:
logger.error(f" 重新解析文件异常: {str(parse_err)}")
if sd.get("tables"):
logger.info(f" tables数量: {len(sd.get('tables', []))}")
if sd["tables"]:
first_table = sd["tables"][0]
logger.info(f" 第一表格: headers={first_table.get('headers', [])[:3]}..., rows数量={len(first_table.get('rows', []))}")
source_docs.append(SourceDocument(
doc_id=doc_id,
filename=doc.get("metadata", {}).get("original_filename", "unknown"),
doc_type=doc.get("doc_type", "unknown"),
content=doc.get("content", ""),
structured_data=doc.get("structured_data", {})
content=doc_content,
structured_data=sd
))
logger.info(f"从MongoDB加载文档: {doc_id}")
except Exception as e:
logger.error(f"从MongoDB加载文档失败 {doc_id}: {str(e)}")
@@ -370,7 +506,7 @@ class TemplateFillService:
response = await self.llm.chat(
messages=messages,
temperature=0.1,
max_tokens=50000
max_tokens=4000
)
content = self.llm.extract_message_content(response)
@@ -476,6 +612,137 @@ class TemplateFillService:
confidence=0.0
)
async def _verify_field_value(
self,
field: TemplateField,
extracted_values: List[str],
source_docs: List[SourceDocument],
user_hint: Optional[str] = None
) -> Optional[FillResult]:
"""
验证并修正提取的字段值
Args:
field: 字段定义
extracted_values: 已提取的值
source_docs: 源文档列表
user_hint: 用户提示
Returns:
验证后的结果如果验证通过返回None使用原结果
"""
if not extracted_values or not extracted_values[0]:
return None
if not source_docs:
return None
try:
# 构建验证上下文
context_text = self._build_context_text(source_docs, field_name=field.name, max_length=15000)
hint_text = field.hint if field.hint else f"请理解{field.name}字段的含义"
if user_hint:
hint_text = f"{user_hint}{hint_text}"
prompt = f"""你是一个数据质量审核专家。请审核以下提取的数据是否合理。
【待审核字段】
字段名:{field.name}
字段说明:{hint_text}
【已提取的值】
{extracted_values[:10]}(最多审核前10个值)
【源文档上下文】
{context_text[:8000]}
【审核要求】
1. 这些值是否符合字段的含义?
2. 值在原文中的原始含义是什么?检查是否有误解或误提取
3. 是否存在明显错误、空值或不合理的数据?
4. 如果表格有多个列,请确认提取的是正确的列
请严格按照以下 JSON 格式输出(只需输出 JSON,不要其他内容)
{{
"is_valid": true或false,
"corrected_values": ["修正后的值列表"] 或 null如果无需修正,
"reason": "审核说明,解释判断理由",
"original_meaning": "值在原文中的原始含义描述"
}}
"""
messages = [
{"role": "system", "content": "你是一个严格的数据质量审核专家。请仔细核对原文和提取的值是否匹配。"},
{"role": "user", "content": prompt}
]
response = await self.llm.chat(
messages=messages,
temperature=0.2,
max_tokens=3000
)
content = self.llm.extract_message_content(response)
logger.info(f"字段 {field.name} 审核返回: {content[:300]}")
# 解析 JSON
import json
import re
cleaned = content.strip()
cleaned = re.sub(r'^```json\s*', '', cleaned, flags=re.MULTILINE)
cleaned = re.sub(r'^```\s*', '', cleaned, flags=re.MULTILINE)
cleaned = cleaned.strip()
json_start = -1
for i, c in enumerate(cleaned):
if c == '{':
json_start = i
break
if json_start == -1:
logger.warning(f"字段 {field.name} 审核:无法找到 JSON")
return None
json_text = cleaned[json_start:]
result = json.loads(json_text)
is_valid = result.get("is_valid", True)
corrected_values = result.get("corrected_values")
reason = result.get("reason", "")
original_meaning = result.get("original_meaning", "")
logger.info(f"字段 {field.name} 审核结果: is_valid={is_valid}, reason={reason[:100]}")
if not is_valid and corrected_values:
# 值有问题且有修正建议,使用修正后的值
logger.info(f"字段 {field.name} 使用修正后的值: {corrected_values[:5]}")
return FillResult(
field=field.name,
values=corrected_values,
value=corrected_values[0] if corrected_values else "",
source=f"AI审核修正: {reason[:100]}",
confidence=0.7
)
elif not is_valid and original_meaning:
# 值有问题但无修正,记录原始含义供用户参考
logger.info(f"字段 {field.name} 审核发现问题: {original_meaning}")
return FillResult(
field=field.name,
values=extracted_values,
value=extracted_values[0] if extracted_values else "",
source=f"AI审核疑问: {original_meaning[:100]}",
confidence=0.5
)
# 验证通过,返回 None 表示使用原结果
return None
except Exception as e:
logger.error(f"字段 {field.name} 审核失败: {str(e)}")
return None
def _build_context_text(self, source_docs: List[SourceDocument], field_name: str = None, max_length: int = 8000) -> str:
"""
构建上下文文本
@@ -625,7 +892,8 @@ class TemplateFillService:
async def get_template_fields_from_file(
self,
file_path: str,
file_type: str = "xlsx"
file_type: str = "xlsx",
source_contents: List[dict] = None
) -> List[TemplateField]:
"""
从模板文件提取字段定义
@@ -633,11 +901,14 @@ class TemplateFillService:
Args:
file_path: 模板文件路径
file_type: 文件类型 (xlsx/xls/docx)
source_contents: 源文档内容列表(用于 AI 生成表头)
Returns:
字段列表
"""
fields = []
if source_contents is None:
source_contents = []
try:
if file_type in ["xlsx", "xls"]:
@@ -653,8 +924,8 @@ class TemplateFillService:
)
if needs_ai_generation:
logger.info(f"模板表头为空或自动生成,尝试 AI 生成表头... (fields={len(fields)})")
ai_fields = await self._generate_fields_with_ai(file_path, file_type)
logger.info(f"模板表头为空或自动生成,尝试 AI 生成表头... (fields={len(fields)}, source_docs={len(source_contents)})")
ai_fields = await self._generate_fields_with_ai(file_path, file_type, source_contents)
if ai_fields:
fields = ai_fields
logger.info(f"AI 生成表头成功: {len(fields)} 个字段")
@@ -857,7 +1128,7 @@ class TemplateFillService:
def _extract_values_from_structured_data(self, source_docs: List[SourceDocument], field_name: str) -> List[str]:
"""
从结构化数据(Excel rows)中直接提取指定列的值
从结构化数据(Excel rows 或 Markdown tables)中直接提取指定列的值
适用于有 rows 结构的文档数据,无需 LLM 即可提取
@@ -869,10 +1140,15 @@ class TemplateFillService:
值列表,如果无法提取则返回空列表
"""
all_values = []
logger.info(f"[_extract_values_from_structured_data] 开始提取字段: {field_name}")
logger.info(f" source_docs 数量: {len(source_docs)}")
for doc in source_docs:
for doc_idx, doc in enumerate(source_docs):
# 尝试从 structured_data 中提取
structured = doc.structured_data
logger.info(f" 文档[{doc_idx}]: {doc.filename}, structured类型: {type(structured)}, 是否为空: {not bool(structured)}")
if structured:
logger.info(f" structured_data keys: {list(structured.keys())}")
if not structured:
continue
@@ -892,6 +1168,33 @@ class TemplateFillService:
if all_values:
break
# 处理 Markdown 表格格式: {headers: [...], rows: [...], ...}
elif structured.get("headers") and structured.get("rows"):
headers = structured.get("headers", [])
rows = structured.get("rows", [])
values = self._extract_values_from_markdown_table(headers, rows, field_name)
if values:
all_values.extend(values)
logger.info(f"从 Markdown 文档 {doc.filename} 提取到 {len(values)} 个值")
break
# 处理 MongoDB 存储的 tables 格式: {tables: [{headers, rows, ...}, ...]}
elif structured.get("tables") and isinstance(structured.get("tables"), list):
tables = structured.get("tables", [])
logger.info(f" 检测到 tables 格式,共 {len(tables)} 个表")
for table_idx, table in enumerate(tables):
if isinstance(table, dict):
headers = table.get("headers", [])
rows = table.get("rows", [])
logger.info(f" 表格[{table_idx}]: headers={headers[:3]}..., rows数量={len(rows)}")
values = self._extract_values_from_markdown_table(headers, rows, field_name)
if values:
all_values.extend(values)
logger.info(f"从表格[{table_idx}] 提取到 {len(values)} 个值")
break
if all_values:
break
# 处理单 sheet 格式: {columns: [...], rows: [...]}
elif structured.get("rows"):
columns = structured.get("columns", [])
@@ -945,16 +1248,18 @@ class TemplateFillService:
if not table_rows or len(table_rows) < 2:
return []
# 第一步:尝试在 header第一行中查找匹配列
target_col_idx = None
for col_idx, col_name in enumerate(header):
col_str = str(col_name).strip()
if field_name.lower() in col_str.lower() or col_str.lower() in field_name.lower():
target_col_idx = col_idx
break
# 使用增强的匹配算法查找最佳匹配的列索引
target_col_idx = self._find_best_matching_column(header, field_name)
# 如果增强匹配没找到,尝试在 header第一行中查找
if target_col_idx is None:
for col_idx, col_name in enumerate(header):
col_str = str(col_name).strip()
if field_name.lower() in col_str.lower() or col_str.lower() in field_name.lower():
target_col_idx = col_idx
break
# 如果 header 中没找到,尝试在 table_rows[1](第二行)中查找
# 这是因为有时第一行是数据而不是表头
if target_col_idx is None and len(table_rows) > 1:
second_row = table_rows[1]
if isinstance(second_row, list):
@@ -970,33 +1275,112 @@ class TemplateFillService:
return []
# 确定从哪一行开始提取数据
# 如果 header 是表头(包含 field_name则从 table_rows[1] 开始提取
# 如果 header 是数据(不包含 field_name则从 table_rows[2] 开始提取
header_contains_field = any(
field_name.lower() in str(col).strip().lower() or str(col).strip().lower() in field_name.lower()
for col in header
)
if header_contains_field:
# header 是表头,从第二行开始提取
data_start_idx = 1
else:
# header 是数据,从第三行开始提取(跳过表头和第一行数据)
data_start_idx = 2
# 提取值
values = []
for row_idx, row in enumerate(table_rows[data_start_idx:], start=data_start_idx):
if isinstance(row, list) and target_col_idx < len(row):
val = str(row[target_col_idx]).strip() if row[target_col_idx] else ""
values.append(val)
val = row[target_col_idx]
values.append(self._format_value(val))
elif isinstance(row, dict):
val = str(row.get(target_col_idx, "")).strip()
values.append(val)
val = row.get(target_col_idx, "")
values.append(self._format_value(val))
logger.info(f"从 Word 表格列 {target_col_idx} 提取到 {len(values)} 个值: {values[:3]}")
return values
def _format_value(self, val: Any) -> str:
"""
格式化值为字符串,保持原始格式
Args:
val: 原始值
Returns:
格式化后的字符串
"""
if val is None:
return ""
if isinstance(val, str):
return val.strip()
if isinstance(val, bool):
return "true" if val else "false"
if isinstance(val, (int, float)):
if isinstance(val, float):
if val == int(val):
return str(int(val))
else:
formatted = f"{val:.10f}".rstrip('0').rstrip('.')
return formatted
else:
return str(val)
return str(val)
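Expected behaviour of the new formatter, for reference (hand-checked examples; the float branch uses the 10-decimal format plus strip shown above):

# _format_value(None)      -> ""
# _format_value("  4.9 ")  -> "4.9"
# _format_value(True)      -> "true"
# _format_value(144000.0)  -> "144000"   (integral float, no trailing ".0")
# _format_value(7.70)      -> "7.7"
# _format_value(0.1 + 0.2) -> "0.3"      (binary float noise trimmed away)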
def _find_best_matching_column(self, headers: List, field_name: str) -> Optional[int]:
"""
查找最佳匹配的列索引
使用多层匹配策略:
1. 精确匹配(忽略大小写)
2. 子字符串匹配(字段名在表头中,或表头在字段名中)
3. 关键词重叠匹配(中文字符串分割后比对)
Args:
headers: 表头列表
field_name: 要匹配的字段名
Returns:
匹配的列索引,找不到返回 None
"""
field_lower = field_name.lower().strip()
field_keywords = set(field_lower.replace(" ", "").split())
best_match_idx = None
best_match_score = 0
for idx, header in enumerate(headers):
header_str = str(header).strip()
header_lower = header_str.lower()
# 策略1: 精确匹配(忽略大小写)
if header_lower == field_lower:
return idx
# 策略2: 子字符串匹配
if field_lower in header_lower or header_lower in field_lower:
score = max(len(field_lower), len(header_lower)) / min(len(field_lower) + 1, len(header_lower) + 1)
if score > best_match_score:
best_match_score = score
best_match_idx = idx
continue
# 策略3: 关键词重叠匹配(适用于中文)
header_keywords = set(header_lower.replace(" ", "").split())
overlap = field_keywords & header_keywords
if overlap and len(overlap) > 0:
score = len(overlap) / max(len(field_keywords), len(header_keywords), 1)
if score > best_match_score:
best_match_score = score
best_match_idx = idx
if best_match_score >= 0.3:
logger.info(f"模糊匹配: {field_name} -> {headers[best_match_idx]} (分数: {best_match_score:.2f})")
return best_match_idx
return None
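A quick illustration of the three matching tiers with invented headers (the 0.3 threshold only gates the fuzzy tiers; exact matches return immediately):

# headers = ["序号", "产品名称", "1995年产量(吨)", "备注"]
# _find_best_matching_column(headers, "产品名称")   -> 1     (tier 1: exact match)
# _find_best_matching_column(headers, "1995年产量") -> 2     (tier 2: substring match)
# _find_best_matching_column(headers, "不存在的列") -> None  (no tier reaches the threshold)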
def _extract_column_values(self, rows: List, columns: List, field_name: str) -> List[str]:
"""
从 rows 和 columns 中提取指定列的值
@@ -1012,27 +1396,25 @@ class TemplateFillService:
if not rows or not columns:
return []
# 查找匹配的列(模糊匹配)
target_col = None
for col in columns:
col_str = str(col)
if field_name.lower() in col_str.lower() or col_str.lower() in field_name.lower():
target_col = col
break
# 使用增强的匹配算法查找最佳匹配的列索引
target_idx = self._find_best_matching_column(columns, field_name)
if not target_col:
if target_idx is None:
logger.warning(f"未找到匹配列: {field_name}, 可用列: {columns}")
return []
target_col = columns[target_idx]
logger.info(f"列匹配成功: {field_name} -> {target_col} (索引: {target_idx})")
values = []
for row in rows:
if isinstance(row, dict):
val = row.get(target_col, "")
elif isinstance(row, list) and target_col in columns:
val = row[columns.index(target_col)]
elif isinstance(row, list) and target_idx < len(row):
val = row[target_idx]
else:
val = ""
values.append(str(val) if val is not None else "")
values.append(self._format_value(val))
return values
@@ -1046,7 +1428,6 @@ class TemplateFillService:
Returns:
(值列表, 置信度) 元组
"""
# 提取置信度
confidence = 0.5
if isinstance(result, dict) and "confidence" in result:
try:
@@ -1057,28 +1438,25 @@ class TemplateFillService:
pass
if isinstance(result, dict):
# 优先找 values 数组
if "values" in result and isinstance(result["values"], list):
vals = [str(v).strip() for v in result["values"] if v and str(v).strip()]
vals = [self._format_value(v).strip() for v in result["values"] if self._format_value(v).strip()]
if vals:
return vals, confidence
# 尝试找 value 字段
if "value" in result:
val = str(result["value"]).strip()
val = self._format_value(result["value"]).strip()
if val:
return [val], confidence
# 尝试找任何数组类型的键
for key in result.keys():
val = result[key]
if isinstance(val, list) and len(val) > 0:
if all(isinstance(v, (str, int, float, bool)) or v is None for v in val):
vals = [str(v).strip() for v in val if v is not None and str(v).strip()]
vals = [self._format_value(v).strip() for v in val if v is not None and self._format_value(v).strip()]
if vals:
return vals, confidence
elif isinstance(val, (str, int, float, bool)):
return [str(val).strip()], confidence
return [self._format_value(val).strip()], confidence
elif isinstance(result, list):
vals = [str(v).strip() for v in result if v is not None and str(v).strip()]
vals = [self._format_value(v).strip() for v in result if v is not None and self._format_value(v).strip()]
if vals:
return vals, confidence
return [], confidence
@@ -1215,15 +1593,15 @@ class TemplateFillService:
if isinstance(parsed, dict):
# 如果是 {"values": [...]} 格式,提取 values
if "values" in parsed and isinstance(parsed["values"], list):
return [str(v).strip() for v in parsed["values"] if v and str(v).strip()]
return [self._format_value(v).strip() for v in parsed["values"] if self._format_value(v).strip()]
# 如果是其他 dict 格式,尝试找 values 键
for key in ["values", "value", "data", "result"]:
if key in parsed and isinstance(parsed[key], list):
return [str(v).strip() for v in parsed[key] if v and str(v).strip()]
return [self._format_value(v).strip() for v in parsed[key] if self._format_value(v).strip()]
elif key in parsed:
return [str(parsed[key]).strip()]
return [self._format_value(parsed[key]).strip()]
elif isinstance(parsed, list):
return [str(v).strip() for v in parsed if v and str(v).strip()]
return [self._format_value(v).strip() for v in parsed if self._format_value(v).strip()]
except (json.JSONDecodeError, TypeError):
pass
@@ -1239,14 +1617,14 @@ class TemplateFillService:
result = []
for item in arr:
if isinstance(item, dict) and "values" in item and isinstance(item["values"], list):
result.extend([str(v).strip() for v in item["values"] if v and str(v).strip()])
result.extend([self._format_value(v).strip() for v in item["values"] if self._format_value(v).strip()])
elif isinstance(item, dict):
result.append(str(item))
else:
result.append(str(item))
result.append(self._format_value(item))
if result:
return result
return [str(v).strip() for v in arr if v and str(v).strip()]
return [self._format_value(v).strip() for v in arr if self._format_value(v).strip()]
except Exception:
pass
@@ -1337,27 +1715,37 @@ class TemplateFillService:
hint_text = f"{user_hint}{hint_text}"
# 构建针对字段提取的提示词
prompt = f"""你是一个专业的数据提取专家。请从以下文档内容中提取与"{field.name}"相关的所有数据。
prompt = f"""你是一个专业的数据提取专家。请从以下文档内容中提取与"{field.name}"完全匹配的数据。
字段提示: {hint_text}
【重要】字段名: "{field.name}"
【重要】字段提示: {hint_text}
请严格按照以下步骤操作:
1. 在文档中搜索与"{field.name}"完全相同或高度相关的关键词
2. 找到后,提取该关键词后的数值(注意:只要数值,不要单位)
3. 如果是表格中的数据,直接提取该单元格的数值
4. 如果是段落描述,在关键词附近找数值
【重要】返回值规则:
- 只返回纯数值,不要单位(如返回 "144000" 而不是 "144000万册")
- 如果字段名中标注了单位,请先换算到该单位再返回数值(如字段为"国内旅游收入(亿元)"、原文是"4.9万亿元",返回 "49000")
- 如果原文单位与字段单位一致(如原文"144000万册"、字段单位为"万册"),直接返回 "144000"
- 如果是百分比如"增长7.7%",返回 "7.7"
- 如果没有找到完全匹配的数据,返回空数组
文档内容:
{doc.content[:8000] if doc.content else ""}
请完成以下任务:
1. 仔细阅读文档,找出所有与"{field.name}"相关的数据
2. 如果文档中有表格数据,提取表格中的对应列值
3. 如果文档中是段落描述,提取其中的关键数值或结论
4. 返回提取的所有值(可能多个,用数组存储)
{doc.content[:10000] if doc.content else ""}
请用严格的 JSON 格式返回:
{{
"values": ["值1", "值2", ...],
"values": ["值1", "值2", ...], // 只填数值,不要单位
"source": "数据来源说明",
"confidence": 0.0到1.0之间的置信度
}}
如果没有找到相关数据,返回空数组 values: []"""
示例:
- 如果字段是"图书馆总藏量(万册)"且文档说"图书总藏量14.4亿册",返回 values: ["144000"]
- 如果字段是"国内旅游收入(亿元)"且文档说"国内旅游收入4.9万亿元",返回 values: ["49000"]"""
messages = [
{"role": "system", "content": "你是一个专业的数据提取助手擅长从政府统计公报等文档中提取数据。请严格按JSON格式输出。"},
@@ -1367,7 +1755,7 @@ class TemplateFillService:
response = await self.llm.chat(
messages=messages,
temperature=0.1,
max_tokens=5000
max_tokens=4000
)
content = self.llm.extract_message_content(response)
@@ -1434,7 +1822,8 @@ class TemplateFillService:
async def _generate_fields_with_ai(
self,
file_path: str,
file_type: str
file_type: str,
source_contents: List[dict] = None
) -> Optional[List[TemplateField]]:
"""
使用 AI 为空表生成表头字段
@@ -1454,28 +1843,35 @@ class TemplateFillService:
content_sample = ""
# 读取 Excel 内容检查是否为空
content_sample = ""
if file_type in ["xlsx", "xls"]:
df = pd.read_excel(file_path, header=None)
if df.shape[0] == 0 or df.shape[1] == 0:
logger.info("Excel 表格为空")
# 生成默认字段
return [TemplateField(
cell=self._column_to_cell(i),
name=f"字段{i+1}",
field_type="text",
required=False,
hint="请填写此字段"
) for i in range(5)]
# 表格有数据但没有表头
if df.shape[1] > 0:
# 读取第一行作为参考,看是否为空
first_row = df.iloc[0].tolist() if len(df) > 0 else []
if not any(pd.notna(v) and str(v).strip() != '' for v in first_row):
# 第一行为空AI 生成表头
content_sample = df.iloc[:10].to_string() if len(df) >= 10 else df.to_string()
# 即使 Excel 为空,如果有源文档,仍然尝试使用 AI 生成表头
if not source_contents:
logger.info("Excel 为空且没有源文档,使用默认字段名")
return [TemplateField(
cell=self._column_to_cell(i),
name=f"字段{i+1}",
field_type="text",
required=False,
hint="请填写此字段"
) for i in range(5)]
# 有源文档,继续调用 AI 生成表头
logger.info("Excel 为空但有源文档,使用源文档内容生成表头...")
else:
# 表格有数据但没有表头
if df.shape[1] > 0:
# 读取第一行作为参考,看是否为空
first_row = df.iloc[0].tolist() if len(df) > 0 else []
if not any(pd.notna(v) and str(v).strip() != '' for v in first_row):
# 第一行为空AI 生成表头
content_sample = df.iloc[:10].to_string() if len(df) >= 10 else df.to_string()
else:
content_sample = df.to_string()
else:
content_sample = df.to_string()
content_sample = ""
elif file_type == "docx":
# Word 文档:尝试使用 docx_parser 提取内容
@@ -1506,21 +1902,56 @@ class TemplateFillService:
return None
# 调用 AI 生成表头
prompt = f"""你是一个专业的表格设计助手。请为以下空白表格生成合适的表头字段。
# 根据源文档内容生成表头
source_info = ""
logger.info(f"[DEBUG] _generate_fields_with_ai received source_contents: {len(source_contents) if source_contents else 0} items")
if source_contents:
for sc in source_contents:
logger.info(f"[DEBUG] source doc: filename={sc.get('filename')}, content_len={len(sc.get('content', ''))}, titles={len(sc.get('titles', []))}, tables_count={sc.get('tables_count', 0)}, has_tables_summary={bool(sc.get('tables_summary'))}")
source_info = "\n\n【源文档内容摘要】(根据以下文档内容生成表头):\n"
for idx, src in enumerate(source_contents[:5]): # 最多5个源文档
filename = src.get("filename", f"文档{idx+1}")
doc_type = src.get("doc_type", "unknown")
content = src.get("content", "")[:3000] # 限制内容长度
titles = src.get("titles", [])[:10] # 最多10个标题
tables_count = src.get("tables_count", 0)
tables_summary = src.get("tables_summary", "")
表格内容预览:
{content_sample[:2000] if content_sample else "空白表格"}
source_info += f"\n--- 文档 {idx+1}: {filename} ({doc_type}) ---\n"
# 处理 titles可能是字符串列表或字典列表
if titles:
title_texts = []
for t in titles[:5]:
if isinstance(t, dict):
title_texts.append(t.get('text', ''))
else:
title_texts.append(str(t))
if title_texts:
source_info += f"【章节标题】: {', '.join(title_texts)}\n"
if tables_count > 0:
source_info += f"【包含表格数】: {tables_count}\n"
if tables_summary:
source_info += f"{tables_summary}\n"
elif content:
source_info += f"【内容预览】: {content[:1500]}...\n"
请生成5-10个简洁的表头字段名,这些字段应该:
1. 简洁明了,易于理解
2. 适合作为表格列标题
3. 之间有明显的区分度
prompt = f"""你是一个专业的表格设计助手。请根据源文档内容生成合适的表格表头字段。
任务:用户有一些源文档(包含表格数据),需要填写到空白表格模板中。源文档中的表格如下:
{source_info}
【重要要求】
1. 请仔细阅读上面的源文档表格,找出所有不同的列名(如"产品名称""1995年产量""按资产总额计算(%)"等)
2. 直接使用这些实际的列名作为表头字段名,不要生成新的或同义词
3. 如果一个源文档有多个表格,请为每个表格选择合适的列名
4. 生成3-8个表头字段优先选择数据量大的表格的列
请严格按照以下 JSON 格式输出(只需输出 JSON,不要其他内容)
{{
"fields": [
{{"name": "字段名1", "hint": "字段说明提示1"}},
{{"name": "字段名2", "hint": "字段说明提示2"}}
{{"name": "实际列名1", "hint": "对该列的说明"}},
{{"name": "实际列名2", "hint": "对该列的说明"}}
]
}}
"""