From e5711b3f05b318da0930c2d690a733a3b4939ba0 Mon Sep 17 00:00:00 2001 From: KiriAky 107 Date: Thu, 9 Apr 2026 20:35:41 +0800 Subject: [PATCH 1/2] =?UTF-8?q?=E6=96=B0=E5=A2=9E=E8=81=94=E5=90=88?= =?UTF-8?q?=E4=B8=8A=E4=BC=A0=E6=A8=A1=E6=9D=BF=E5=92=8C=E6=BA=90=E6=96=87?= =?UTF-8?q?=E6=A1=A3=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 新增 upload-joint 接口支持模板文件和源文档的一键式联合上传处理, 包括异步文档解析和MongoDB存储功能;前端新增对应API调用方法和UI界 面,优化表格填写流程,支持拖拽上传和实时预览功能。 --- backend/app/api/endpoints/templates.py | 171 ++++++- frontend/src/db/backend-api.ts | 40 ++ frontend/src/pages/TemplateFill.tsx | 615 ++++++++++++++----------- logs/rag_disable_note.txt | 59 +++ 4 files changed, 604 insertions(+), 281 deletions(-) create mode 100644 logs/rag_disable_note.txt diff --git a/backend/app/api/endpoints/templates.py b/backend/app/api/endpoints/templates.py index 8aaa296..8d2ebee 100644 --- a/backend/app/api/endpoints/templates.py +++ b/backend/app/api/endpoints/templates.py @@ -5,15 +5,18 @@ """ import io import logging +import uuid from typing import List, Optional -from fastapi import APIRouter, File, HTTPException, Query, UploadFile +from fastapi import APIRouter, File, HTTPException, Query, UploadFile, BackgroundTasks from fastapi.responses import StreamingResponse import pandas as pd from pydantic import BaseModel from app.services.template_fill_service import template_fill_service, TemplateField from app.services.file_service import file_service +from app.core.database import mongodb +from app.core.document_parser import ParserFactory logger = logging.getLogger(__name__) @@ -109,6 +112,172 @@ async def upload_template( raise HTTPException(status_code=500, detail=f"上传失败: {str(e)}") +@router.post("/upload-joint") +async def upload_joint_template( + background_tasks: BackgroundTasks, + template_file: UploadFile = File(..., description="模板文件"), + source_files: List[UploadFile] = File(..., description="源文档文件列表"), +): + """ + 
联合上传模板和源文档,一键完成解析和存储 + + 1. 保存模板文件并提取字段 + 2. 异步处理源文档(解析+存MongoDB) + 3. 返回模板信息和源文档ID列表 + + Args: + template_file: 模板文件 (xlsx/xls/docx) + source_files: 源文档列表 (docx/xlsx/md/txt) + + Returns: + 模板ID、字段列表、源文档ID列表 + """ + if not template_file.filename: + raise HTTPException(status_code=400, detail="模板文件名为空") + + # 验证模板格式 + template_ext = template_file.filename.split('.')[-1].lower() + if template_ext not in ['xlsx', 'xls', 'docx']: + raise HTTPException( + status_code=400, + detail=f"不支持的模板格式: {template_ext},仅支持 xlsx/xls/docx" + ) + + # 验证源文档格式 + valid_exts = ['docx', 'xlsx', 'xls', 'md', 'txt'] + for sf in source_files: + if sf.filename: + sf_ext = sf.filename.split('.')[-1].lower() + if sf_ext not in valid_exts: + raise HTTPException( + status_code=400, + detail=f"不支持的源文档格式: {sf_ext},仅支持 docx/xlsx/xls/md/txt" + ) + + try: + # 1. 保存模板文件并提取字段 + template_content = await template_file.read() + template_path = file_service.save_uploaded_file( + template_content, + template_file.filename, + subfolder="templates" + ) + template_fields = await template_fill_service.get_template_fields_from_file( + template_path, + template_ext + ) + + # 2. 处理源文档 - 保存文件 + source_file_info = [] + for sf in source_files: + if sf.filename: + sf_content = await sf.read() + sf_ext = sf.filename.split('.')[-1].lower() + sf_path = file_service.save_uploaded_file( + sf_content, + sf.filename, + subfolder=sf_ext + ) + source_file_info.append({ + "path": sf_path, + "filename": sf.filename, + "ext": sf_ext + }) + + # 3. 
异步处理源文档到MongoDB + task_id = str(uuid.uuid4()) + if source_file_info: + background_tasks.add_task( + process_source_documents, + task_id=task_id, + files=source_file_info + ) + + logger.info(f"联合上传完成: 模板={template_file.filename}, 源文档={len(source_file_info)}个") + + return { + "success": True, + "template_id": template_path, + "filename": template_file.filename, + "file_type": template_ext, + "fields": [ + { + "cell": f.cell, + "name": f.name, + "field_type": f.field_type, + "required": f.required, + "hint": f.hint + } + for f in template_fields + ], + "field_count": len(template_fields), + "source_file_paths": [f["path"] for f in source_file_info], + "source_filenames": [f["filename"] for f in source_file_info], + "task_id": task_id + } + + except HTTPException: + raise + except Exception as e: + logger.error(f"联合上传失败: {str(e)}") + raise HTTPException(status_code=500, detail=f"联合上传失败: {str(e)}") + + +async def process_source_documents(task_id: str, files: List[dict]): + """异步处理源文档,存入MongoDB""" + from app.core.database import redis_db + + try: + await redis_db.set_task_status( + task_id, status="processing", + meta={"progress": 0, "message": "开始处理源文档"} + ) + + doc_ids = [] + for i, file_info in enumerate(files): + try: + parser = ParserFactory.get_parser(file_info["path"]) + result = parser.parse(file_info["path"]) + + if result.success: + doc_id = await mongodb.insert_document( + doc_type=file_info["ext"], + content=result.data.get("content", ""), + metadata={ + **result.metadata, + "original_filename": file_info["filename"], + "file_path": file_info["path"] + }, + structured_data=result.data.get("structured_data") + ) + doc_ids.append(doc_id) + logger.info(f"源文档处理成功: {file_info['filename']}, doc_id: {doc_id}") + else: + logger.error(f"源文档解析失败: {file_info['filename']}, error: {result.error}") + + except Exception as e: + logger.error(f"源文档处理异常: {file_info['filename']}, error: {str(e)}") + + progress = int((i + 1) / len(files) * 100) + await redis_db.set_task_status( 
+ task_id, status="processing", + meta={"progress": progress, "message": f"已处理 {i+1}/{len(files)}"} + ) + + await redis_db.set_task_status( + task_id, status="success", + meta={"progress": 100, "message": "源文档处理完成", "doc_ids": doc_ids} + ) + logger.info(f"所有源文档处理完成: {len(doc_ids)}个") + + except Exception as e: + logger.error(f"源文档批量处理失败: {str(e)}") + await redis_db.set_task_status( + task_id, status="failure", + meta={"error": str(e)} + ) + + @router.post("/fields") async def extract_template_fields( template_id: str = Query(..., description="模板ID/文件路径"), diff --git a/frontend/src/db/backend-api.ts b/frontend/src/db/backend-api.ts index 998fe62..d26e1a8 100644 --- a/frontend/src/db/backend-api.ts +++ b/frontend/src/db/backend-api.ts @@ -656,6 +656,46 @@ export const backendApi = { } }, + /** + * 联合上传模板和源文档 + */ + async uploadTemplateAndSources( + templateFile: File, + sourceFiles: File[] + ): Promise<{ + success: boolean; + template_id: string; + filename: string; + file_type: string; + fields: TemplateField[]; + field_count: number; + source_file_paths: string[]; + source_filenames: string[]; + task_id: string; + }> { + const formData = new FormData(); + formData.append('template_file', templateFile); + sourceFiles.forEach(file => formData.append('source_files', file)); + + const url = `${BACKEND_BASE_URL}/templates/upload-joint`; + + try { + const response = await fetch(url, { + method: 'POST', + body: formData, + }); + + if (!response.ok) { + const error = await response.json(); + throw new Error(error.detail || '联合上传失败'); + } + return await response.json(); + } catch (error) { + console.error('联合上传失败:', error); + throw error; + } + }, + /** * 执行表格填写 */ diff --git a/frontend/src/pages/TemplateFill.tsx b/frontend/src/pages/TemplateFill.tsx index 573d3f7..1fa7c99 100644 --- a/frontend/src/pages/TemplateFill.tsx +++ b/frontend/src/pages/TemplateFill.tsx @@ -1,4 +1,4 @@ -import React, { useState, useEffect } from 'react'; +import React, { useState, useEffect, 
useCallback } from 'react'; import { useDropzone } from 'react-dropzone'; import { TableProperties, @@ -14,7 +14,11 @@ import { RefreshCcw, ChevronDown, ChevronUp, - Loader2 + Loader2, + Files, + Trash2, + Eye, + File } from 'lucide-react'; import { Button } from '@/components/ui/button'; import { Card, CardContent, CardHeader, CardTitle, CardDescription } from '@/components/ui/card'; @@ -26,6 +30,13 @@ import { format } from 'date-fns'; import { toast } from 'sonner'; import { cn } from '@/lib/utils'; import { Skeleton } from '@/components/ui/skeleton'; +import { + Dialog, + DialogContent, + DialogHeader, + DialogTitle, +} from "@/components/ui/dialog"; +import { ScrollArea } from '@/components/ui/scroll-area'; type DocumentItem = { doc_id: string; @@ -41,6 +52,11 @@ type DocumentItem = { }; }; +type SourceFile = { + file: File; + preview?: string; +}; + type TemplateField = { cell: string; name: string; @@ -50,64 +66,25 @@ type TemplateField = { }; const TemplateFill: React.FC = () => { - const [step, setStep] = useState<'upload-template' | 'select-source' | 'preview' | 'filling'>('upload-template'); + const [step, setStep] = useState<'upload' | 'filling' | 'preview'>('upload'); const [templateFile, setTemplateFile] = useState(null); const [templateFields, setTemplateFields] = useState([]); - const [sourceDocs, setSourceDocs] = useState([]); - const [selectedDocs, setSelectedDocs] = useState([]); + const [sourceFiles, setSourceFiles] = useState([]); + const [sourceFilePaths, setSourceFilePaths] = useState([]); + const [templateId, setTemplateId] = useState(''); const [loading, setLoading] = useState(false); const [filling, setFilling] = useState(false); const [filledResult, setFilledResult] = useState(null); + const [previewDoc, setPreviewDoc] = useState<{ name: string; content: string } | null>(null); + const [previewOpen, setPreviewOpen] = useState(false); - // Load available source documents - useEffect(() => { - loadSourceDocuments(); - }, []); - - const 
loadSourceDocuments = async () => { - setLoading(true); - try { - const result = await backendApi.getDocuments(undefined, 100); - if (result.success) { - // Filter to only non-Excel documents that can be used as data sources - const docs = (result.documents || []).filter((d: DocumentItem) => - ['docx', 'md', 'txt', 'xlsx'].includes(d.doc_type) - ); - setSourceDocs(docs); - } - } catch (err: any) { - toast.error('加载数据源失败'); - } finally { - setLoading(false); - } - }; - - const onTemplateDrop = async (acceptedFiles: File[]) => { + // 模板拖拽 + const onTemplateDrop = useCallback((acceptedFiles: File[]) => { const file = acceptedFiles[0]; - if (!file) return; - - const ext = file.name.split('.').pop()?.toLowerCase(); - if (!['xlsx', 'xls', 'docx'].includes(ext || '')) { - toast.error('仅支持 xlsx/xls/docx 格式的模板文件'); - return; + if (file) { + setTemplateFile(file); } - - setTemplateFile(file); - setLoading(true); - - try { - const result = await backendApi.uploadTemplate(file); - if (result.success) { - setTemplateFields(result.fields || []); - setStep('select-source'); - toast.success('模板上传成功'); - } - } catch (err: any) { - toast.error('模板上传失败: ' + (err.message || '未知错误')); - } finally { - setLoading(false); - } - }; + }, []); const { getRootProps: getTemplateProps, getInputProps: getTemplateInputProps, isDragActive: isTemplateDragActive } = useDropzone({ onDrop: onTemplateDrop, @@ -116,33 +93,108 @@ const TemplateFill: React.FC = () => { 'application/vnd.ms-excel': ['.xls'], 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': ['.docx'] }, - maxFiles: 1 + maxFiles: 1, + multiple: false }); - const handleFillTemplate = async () => { - if (!templateFile || selectedDocs.length === 0) { + // 源文档拖拽 + const onSourceDrop = useCallback((acceptedFiles: File[]) => { + const newFiles = acceptedFiles.map(f => ({ + file: f, + preview: f.type.startsWith('text/') || f.name.endsWith('.md') ? 
undefined : undefined + })); + setSourceFiles(prev => [...prev, ...newFiles]); + }, []); + + const { getRootProps: getSourceProps, getInputProps: getSourceInputProps, isDragActive: isSourceDragActive } = useDropzone({ + onDrop: onSourceDrop, + accept: { + 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': ['.xlsx'], + 'application/vnd.ms-excel': ['.xls'], + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': ['.docx'], + 'text/plain': ['.txt'], + 'text/markdown': ['.md'] + }, + multiple: true + }); + + const removeSourceFile = (index: number) => { + setSourceFiles(prev => prev.filter((_, i) => i !== index)); + }; + + const handleJointUploadAndFill = async () => { + if (!templateFile) { + toast.error('请先上传模板文件'); + return; + } + + setLoading(true); + + try { + // 使用联合上传API + const result = await backendApi.uploadTemplateAndSources( + templateFile, + sourceFiles.map(sf => sf.file) + ); + + if (result.success) { + setTemplateFields(result.fields || []); + setTemplateId(result.template_id); + setSourceFilePaths(result.source_file_paths || []); + toast.success('文档上传成功,开始智能填表'); + setStep('filling'); + + // 自动开始填表 + const fillResult = await backendApi.fillTemplate( + result.template_id, + result.fields || [], + [], // 使用 source_file_paths 而非 source_doc_ids + result.source_file_paths || [], + '请从以下文档中提取相关信息填写表格' + ); + + setFilledResult(fillResult); + setStep('preview'); + toast.success('表格填写完成'); + } + } catch (err: any) { + toast.error('处理失败: ' + (err.message || '未知错误')); + } finally { + setLoading(false); + } + }; + + // 传统方式:先上传源文档再填表(兼容已有文档库的场景) + const handleFillWithExistingDocs = async (selectedDocIds: string[]) => { + if (!templateFile || selectedDocIds.length === 0) { toast.error('请选择数据源文档'); return; } - setFilling(true); + setLoading(true); setStep('filling'); try { - // 调用后端填表接口,传递选中的文档ID - const result = await backendApi.fillTemplate( - 'temp-template-id', - templateFields, - selectedDocs // 传递源文档ID列表 + // 
先上传模板获取template_id + const uploadResult = await backendApi.uploadTemplate(templateFile); + + const fillResult = await backendApi.fillTemplate( + uploadResult.template_id, + uploadResult.fields || [], + selectedDocIds, + [], + '请从以下文档中提取相关信息填写表格' ); - setFilledResult(result); + + setTemplateFields(uploadResult.fields || []); + setTemplateId(uploadResult.template_id); + setFilledResult(fillResult); setStep('preview'); toast.success('表格填写完成'); } catch (err: any) { toast.error('填表失败: ' + (err.message || '未知错误')); - setStep('select-source'); } finally { - setFilling(false); + setLoading(false); } }; @@ -150,7 +202,11 @@ const TemplateFill: React.FC = () => { if (!templateFile || !filledResult) return; try { - const blob = await backendApi.exportFilledTemplate('temp', filledResult.filled_data || {}, 'xlsx'); + const blob = await backendApi.exportFilledTemplate( + templateId || 'temp', + filledResult.filled_data || {}, + 'xlsx' + ); const url = URL.createObjectURL(blob); const a = document.createElement('a'); a.href = url; @@ -164,13 +220,29 @@ const TemplateFill: React.FC = () => { }; const resetFlow = () => { - setStep('upload-template'); + setStep('upload'); setTemplateFile(null); setTemplateFields([]); - setSelectedDocs([]); + setSourceFiles([]); + setSourceFilePaths([]); + setTemplateId(''); setFilledResult(null); }; + const getFileIcon = (filename: string) => { + const ext = filename.split('.').pop()?.toLowerCase(); + if (['xlsx', 'xls'].includes(ext || '')) { + return ; + } + if (ext === 'docx') { + return ; + } + if (['md', 'txt'].includes(ext || '')) { + return ; + } + return ; + }; + return (
@@ -180,7 +252,7 @@ const TemplateFill: React.FC = () => { 根据您的表格模板,自动聚合多源文档信息进行精准填充

- {step !== 'upload-template' && ( + {step !== 'upload' && ( - - - {/* Template Fields Preview */} -
-

待填写字段

-
- {templateFields.map((field, idx) => ( - - {field.name} - - ))} -
-
- - - - {/* Source Documents Selection */} - - - - - 选择数据源文档 + 表格模板 - 从已上传的文档中选择作为填表的数据来源,支持 Excel 和非结构化文档 + 上传需要填写的 Excel/Word 模板文件 - {loading ? ( -
- {[1, 2, 3].map(i => )} -
- ) : sourceDocs.length > 0 ? ( -
- {sourceDocs.map(doc => ( -
{ - setSelectedDocs(prev => - prev.includes(doc.doc_id) - ? prev.filter(id => id !== doc.doc_id) - : [...prev, doc.doc_id] - ); - }} - > -
- {selectedDocs.includes(doc.doc_id) && } -
-
- {doc.doc_type === 'xlsx' ? : } -
-
-

{doc.original_filename}

-

- {doc.doc_type.toUpperCase()} • {format(new Date(doc.created_at), 'yyyy-MM-dd')} -

-
- {doc.metadata?.columns && ( - - {doc.metadata.columns.length} 列 - - )} -
- ))} + {!templateFile ? ( +
+ +
+ {loading ? : } +
+

+ {isTemplateDragActive ? '释放以上传' : '点击或拖拽上传模板'} +

+

+ 支持 .xlsx .xls .docx +

) : ( -
- -

暂无数据源文档,请先上传文档

+
+
+ +
+
+

{templateFile.name}

+

+ {(templateFile.size / 1024).toFixed(1)} KB +

+
+ +
+ )} + + + + {/* Source Documents Upload */} + + + + + 源文档 + + + 上传包含数据的源文档(支持多选),可同时上传多个文件 + + + +
+ +
+ {loading ? : } +
+

+ {isSourceDragActive ? '释放以上传' : '点击或拖拽上传源文档'} +

+

+ 支持 .xlsx .xls .docx .md .txt +

+
+ + {/* Selected Source Files */} + {sourceFiles.length > 0 && ( +
+ {sourceFiles.map((sf, idx) => ( +
+ {getFileIcon(sf.file.name)} +
+

{sf.file.name}

+

+ {(sf.file.size / 1024).toFixed(1)} KB +

+
+ +
+ ))}
)}
{/* Action Button */} -
+
@@ -389,49 +390,7 @@ const TemplateFill: React.FC = () => {
)} - {/* Step 3: Preview Results */} - {step === 'preview' && filledResult && ( - - - - - 填表完成 - - - 系统已根据 {selectedDocs.length} 份文档自动完成表格填写 - - - - {/* Filled Data Preview */} -
-
- {templateFields.map((field, idx) => ( -
-
{field.name}
-
- {(filledResult.filled_data || {})[field.name] || '-'} -
-
- ))} -
-
- - {/* Action Buttons */} -
- - -
-
-
- )} - - {/* Filling State */} + {/* Step 2: Filling State */} {step === 'filling' && ( @@ -440,11 +399,107 @@ const TemplateFill: React.FC = () => {

AI 正在智能分析并填表

- 系统正在从 {selectedDocs.length} 份文档中检索相关信息,生成字段描述,并使用 RAG 增强填写准确性... + 系统正在从 {sourceFiles.length || sourceFilePaths.length} 份文档中检索相关信息...

)} + + {/* Step 3: Preview Results */} + {step === 'preview' && filledResult && ( +
+ + + + + 填表完成 + + + 系统已根据 {sourceFiles.length || sourceFilePaths.length} 份文档自动完成表格填写 + + + + {/* Filled Data Preview */} +
+
+ {templateFields.map((field, idx) => { + const value = filledResult.filled_data?.[field.name]; + const displayValue = Array.isArray(value) + ? value.filter(v => v && String(v).trim()).join(', ') || '-' + : value || '-'; + return ( +
+
{field.name}
+
+ {displayValue} +
+
+ ); + })} +
+
+ + {/* Source Files Info */} +
+ {sourceFiles.map((sf, idx) => ( + + {getFileIcon(sf.file.name)} + {sf.file.name} + + ))} +
+ + {/* Action Buttons */} +
+ + +
+
+
+ + {/* Fill Details */} + {filledResult.fill_details && filledResult.fill_details.length > 0 && ( + + + 填写详情 + + +
+ {filledResult.fill_details.map((detail: any, idx: number) => ( +
+
+
+
{detail.field}
+
+ 来源: {detail.source} | 置信度: {detail.confidence ? (detail.confidence * 100).toFixed(0) + '%' : 'N/A'} +
+
+
+ ))} +
+ + + )} +
+ )} + + {/* Preview Dialog */} + + + + {previewDoc?.name || '文档预览'} + + +
{previewDoc?.content}
+
+
+
); }; diff --git a/logs/rag_disable_note.txt b/logs/rag_disable_note.txt new file mode 100644 index 0000000..cf75308 --- /dev/null +++ b/logs/rag_disable_note.txt @@ -0,0 +1,59 @@ +RAG 服务临时禁用说明 +======================== +日期: 2026-04-08 + +修改内容: +---------- +应需求,RAG 向量检索功能已临时禁用,具体如下: + +1. 修改文件: backend/app/services/rag_service.py + +2. 关键变更: + - 在 RAGService.__init__ 中添加 self._disabled = True 标志 + - index_field() - 添加 _disabled 检查,跳过实际索引操作并记录日志 + - index_document_content() - 添加 _disabled 检查,跳过实际索引操作并记录日志 + - retrieve() - 添加 _disabled 检查,返回空列表并记录日志 + - get_vector_count() - 添加 _disabled 检查,返回 0 并记录日志 + - clear() - 添加 _disabled 检查,跳过实际清空操作并记录日志 + +3. 行为变更: + - 所有 RAG 索引构建操作会被记录到日志 ([RAG DISABLED] 前缀) + - 所有 RAG 检索操作返回空结果 + - 向量计数始终返回 0 + - 实际向量数据库操作被跳过 + +4. 恢复方式: + - 将 RAGService.__init__ 中的 self._disabled = True 改为 self._disabled = False + - 重新启动服务即可恢复 RAG 功能 + +目的: +------ +保留 RAG 索引构建功能的前端界面和代码结构,暂不实际调用向量数据库 API, +待后续需要时再启用。 + +影响范围: +--------- +- /api/v1/rag/search - RAG 搜索接口 (返回空结果) +- /api/v1/rag/status - RAG 状态接口 (返回 vector_count=0) +- /api/v1/rag/rebuild - RAG 重建接口 (仅记录日志) +- Excel/文档上传时的 RAG 索引构建 (仅记录日志) + +======================== +后续补充 (2026-04-08): +======================== +修改文件: backend/app/services/table_rag_service.py + +关键变更: +- 在 TableRAGService.__init__ 中添加 self._disabled = True 标志 +- build_table_rag_index() - RAG 索引部分被跳过,仅记录日志 +- index_document_table() - RAG 索引部分被跳过,仅记录日志 + +行为变更: +- Excel 上传时,MySQL 存储仍然正常进行 +- AI 字段描述仍然正常生成(调用 LLM) +- 只有向量数据库索引操作被跳过 + +恢复方式: +- 将 TableRAGService.__init__ 中的 self._disabled = True 改为 self._disabled = False +- 并将 rag_service.py 中的 self._disabled = True 改为 self._disabled = False +- 两者需同时改为 False 才能完全恢复 RAG 功能 From d5df5b8283b0e7697b6f829836a157dbe7a44331 Mon Sep 17 00:00:00 2001 From: KiriAky 107 Date: Thu, 9 Apr 2026 21:00:31 +0800 Subject: [PATCH 2/2] =?UTF-8?q?=E5=A2=9E=E5=BC=BA=E6=A8=A1=E6=9D=BF?= =?UTF-8?q?=E5=A1=AB=E5=85=85=E6=9C=8D=E5=8A=A1=E6=94=AF=E6=8C=81=E9=9D=9E?=
=?UTF-8?q?=E7=BB=93=E6=9E=84=E5=8C=96=E6=96=87=E6=A1=A3AI=E5=88=86?= =?UTF-8?q?=E6=9E=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 引入markdown_ai_service服务支持Markdown文档处理 - 实现_nonstructured_docs_for_fields方法对非结构化文档进行AI分析 - 优化LLM提示词,改进数据提取的准确性和格式规范 - 支持从Markdown表格格式{tables: [{headers: [...], rows: [...]}]}中提取数据 - 添加文档章节结构解析,提升上下文理解能力 - 增加JSON响应格式修复功能,提高数据解析成功率 --- backend/app/services/template_fill_service.py | 200 +++++++++++++++++- 1 file changed, 193 insertions(+), 7 deletions(-) diff --git a/backend/app/services/template_fill_service.py b/backend/app/services/template_fill_service.py index 71976a6..dfea7f8 100644 --- a/backend/app/services/template_fill_service.py +++ b/backend/app/services/template_fill_service.py @@ -10,6 +10,7 @@ from typing import Any, Dict, List, Optional from app.core.database import mongodb from app.services.llm_service import llm_service from app.core.document_parser import ParserFactory +from app.services.markdown_ai_service import markdown_ai_service logger = logging.getLogger(__name__) @@ -233,6 +234,12 @@ class TemplateFillService: confidence=1.0 ) + # 无法直接从结构化数据提取,尝试 AI 分析非结构化文档 + ai_structured = await self._analyze_unstructured_docs_for_fields(source_docs, field, user_hint) + if ai_structured: + logger.info(f"✅ 字段 {field.name} 通过 AI 分析结构化提取到数据") + return ai_structured + # 无法从结构化数据提取,使用 LLM logger.info(f"字段 {field.name} 无法直接从结构化数据提取,使用 LLM...") @@ -244,18 +251,20 @@ class TemplateFillService: if user_hint: hint_text = f"{user_hint}。{hint_text}" - prompt = f"""你是一个专业的数据提取专家。请从以下文档内容中提取"{field.name}"字段的所有行数据。 + prompt = f"""你是一个专业的数据提取专家。请从以下文档内容中提取与"{field.name}"相关的所有信息。 -参考文档内容(已提取" {field.name}"列的数据): +提示词: {hint_text} + +文档内容: {context_text} -请提取上述所有行的" {field.name}"值,存入数组。每一行对应数组中的一个元素。 -如果某行该字段为空,请用空字符串""占位。 +请分析文档结构(可能包含表格、标题段落等),找出所有与"{field.name}"相关的数据。 +如果找到表格数据,返回多行值;如果是非表格段落,提取关键信息。 -请严格按照以下 JSON 格式输出,不要添加任何解释: +请严格按照以下 JSON 格式输出: {{ - "values": ["第1行的值", "第2行的值", "第3行的值", 
...], - "source": "数据来源的文档描述", + "values": ["第1行的值", "第2行的值", ...], + "source": "数据来源描述", "confidence": 0.0到1.0之间的置信度 }} """ @@ -473,6 +482,29 @@ class TemplateFillService: elif isinstance(row, list): doc_content += " | ".join(str(cell) for cell in row) + "\n" row_count += 1 + elif doc.structured_data and doc.structured_data.get("tables"): + # Markdown 表格格式: {tables: [{headers: [...], rows: [...]}]} + tables = doc.structured_data.get("tables", []) + for table in tables: + if isinstance(table, dict): + headers = table.get("headers", []) + rows = table.get("rows", []) + if rows and headers: + doc_content += f"\n【文档: {doc.filename} - 表格】\n" + doc_content += " | ".join(str(h) for h in headers) + "\n" + for row in rows: + if isinstance(row, list): + doc_content += " | ".join(str(cell) for cell in row) + "\n" + row_count += 1 + # 如果有标题结构,也添加上下文 + if doc.structured_data.get("titles"): + titles = doc.structured_data.get("titles", []) + doc_content += f"\n【文档章节结构】\n" + for title in titles[:20]: # 限制前20个标题 + doc_content += f"{'#' * title.get('level', 1)} {title.get('text', '')}\n" + # 如果没有提取到表格内容,使用纯文本 + if not doc_content.strip(): + doc_content = doc.content[:5000] if doc.content else "" elif doc.content: doc_content = doc.content[:5000] @@ -720,6 +752,21 @@ class TemplateFillService: logger.info(f"从文档 {doc.filename} 提取到 {len(values)} 个值") break + # 处理 Markdown 表格格式: {tables: [{headers: [...], rows: [...]}]} + elif structured.get("tables"): + tables = structured.get("tables", []) + for table in tables: + if isinstance(table, dict): + headers = table.get("headers", []) + rows = table.get("rows", []) + values = self._extract_column_values(rows, headers, field_name) + if values: + all_values.extend(values) + logger.info(f"从 Markdown 表格提取到 {len(values)} 个值") + break + if all_values: + break + return all_values def _extract_column_values(self, rows: List, columns: List, field_name: str) -> List[str]: @@ -1005,6 +1052,145 @@ class TemplateFillService: content = text.strip()[:500] 
if text.strip() else "" return [content] if content else [] + async def _analyze_unstructured_docs_for_fields( + self, + source_docs: List[SourceDocument], + field: TemplateField, + user_hint: Optional[str] = None + ) -> Optional[FillResult]: + """ + 对非结构化文档进行 AI 分析,尝试提取结构化数据 + + 适用于 Markdown 等没有表格格式的文档,通过 AI 分析提取结构化信息 + + Args: + source_docs: 源文档列表 + field: 字段定义 + user_hint: 用户提示 + + Returns: + FillResult 如果提取成功,否则返回 None + """ + # 找出非结构化的 Markdown/TXT 文档(没有表格的) + unstructured_docs = [] + for doc in source_docs: + if doc.doc_type in ["md", "txt", "markdown"]: + # 检查是否有表格 + has_tables = ( + doc.structured_data and + doc.structured_data.get("tables") and + len(doc.structured_data.get("tables", [])) > 0 + ) + if not has_tables: + unstructured_docs.append(doc) + + if not unstructured_docs: + return None + + logger.info(f"发现 {len(unstructured_docs)} 个非结构化文档,尝试 AI 分析...") + + # 对每个非结构化文档进行 AI 分析 + for doc in unstructured_docs: + try: + # 使用 markdown_ai_service 的 statistics 分析类型 + # 这种类型专门用于政府统计公报等包含数据的文档 + hint_text = field.hint if field.hint else f"请提取{field.name}的信息" + if user_hint: + hint_text = f"{user_hint}。{hint_text}" + + # 构建针对字段提取的提示词 + prompt = f"""你是一个专业的数据提取专家。请从以下文档内容中提取与"{field.name}"相关的所有数据。 + +字段提示: {hint_text} + +文档内容: +{doc.content[:8000] if doc.content else ""} + +请完成以下任务: +1. 仔细阅读文档,找出所有与"{field.name}"相关的数据 +2. 如果文档中有表格数据,提取表格中的对应列值 +3. 如果文档中是段落描述,提取其中的关键数值或结论 +4. 
返回提取的所有值(可能多个,用数组存储) + +请用严格的 JSON 格式返回: +{{ + "values": ["值1", "值2", ...], + "source": "数据来源说明", + "confidence": 0.0到1.0之间的置信度 +}} + +如果没有找到相关数据,返回空数组 values: []""" + + messages = [ + {"role": "system", "content": "你是一个专业的数据提取助手,擅长从政府统计公报等文档中提取数据。请严格按JSON格式输出。"}, + {"role": "user", "content": prompt} + ] + + response = await self.llm.chat( + messages=messages, + temperature=0.1, + max_tokens=5000 + ) + + content = self.llm.extract_message_content(response) + logger.info(f"AI 分析返回: {content[:500]}") + + # 解析 JSON + import json + import re + + # 清理 markdown 格式 + cleaned = content.strip() + cleaned = re.sub(r'^```json\s*', '', cleaned, flags=re.MULTILINE) + cleaned = re.sub(r'^```\s*', '', cleaned, flags=re.MULTILINE) + cleaned = cleaned.strip() + + # 查找 JSON + json_start = -1 + for i, c in enumerate(cleaned): + if c == '{' or c == '[': + json_start = i + break + + if json_start == -1: + continue + + json_text = cleaned[json_start:] + try: + result = json.loads(json_text) + values = self._extract_values_from_json(result) + if values: + return FillResult( + field=field.name, + values=values, + value=values[0] if values else "", + source=f"AI分析: {doc.filename}", + confidence=result.get("confidence", 0.8) + ) + except json.JSONDecodeError: + # 尝试修复 JSON + fixed = self._fix_json(json_text) + if fixed: + try: + result = json.loads(fixed) + values = self._extract_values_from_json(result) + if values: + return FillResult( + field=field.name, + values=values, + value=values[0] if values else "", + source=f"AI分析: {doc.filename}", + confidence=result.get("confidence", 0.8) + ) + except json.JSONDecodeError: + pass + + except Exception as e: + logger.warning(f"AI 分析文档 {doc.filename} 失败: {str(e)}") + continue + + return None + # ==================== 全局单例 ====================