diff --git a/backend/app/api/endpoints/ai_analyze.py b/backend/app/api/endpoints/ai_analyze.py index ae13191..9112977 100644 --- a/backend/app/api/endpoints/ai_analyze.py +++ b/backend/app/api/endpoints/ai_analyze.py @@ -12,6 +12,7 @@ from app.services.excel_ai_service import excel_ai_service from app.services.markdown_ai_service import markdown_ai_service from app.services.template_fill_service import template_fill_service from app.services.word_ai_service import word_ai_service +from app.services.txt_ai_service import txt_ai_service logger = logging.getLogger(__name__) @@ -347,17 +348,20 @@ async def get_markdown_outline( @router.post("/analyze/txt") async def analyze_txt( file: UploadFile = File(...), + analysis_type: str = Query("structured", description="分析类型: structured, charts") ): """ - 上传并使用 AI 分析 TXT 文本文件,提取结构化数据 + 上传并使用 AI 分析 TXT 文本文件,提取结构化数据或生成图表 将非结构化文本转换为结构化表格数据,便于后续填表使用 + 当 analysis_type=charts 时,可生成可视化图表 Args: file: 上传的 TXT 文件 + analysis_type: 分析类型 - "structured"(默认,提取结构化数据)或 "charts"(生成图表) Returns: - dict: 分析结果,包含结构化表格数据 + dict: 分析结果,包含结构化表格数据或图表数据 """ if not file.filename: raise HTTPException(status_code=400, detail="文件名为空") @@ -372,6 +376,7 @@ async def analyze_txt( try: # 读取文件内容 content = await file.read() + text_content = content.decode('utf-8', errors='replace') # 保存到临时文件 with tempfile.NamedTemporaryFile(mode='wb', suffix='.txt', delete=False) as tmp: @@ -379,20 +384,22 @@ async def analyze_txt( tmp_path = tmp.name try: - logger.info(f"开始 AI 分析 TXT 文件: {file.filename}") + logger.info(f"开始 AI 分析 TXT 文件: {file.filename}, analysis_type={analysis_type}") - # 使用 template_fill_service 的 AI 分析方法 - result = await template_fill_service.analyze_txt_with_ai( - content=content.decode('utf-8', errors='replace'), - filename=file.filename + # 使用 txt_ai_service 的 AI 分析方法 + result = await txt_ai_service.analyze_txt_with_ai( + content=text_content, + filename=file.filename, + analysis_type=analysis_type ) if result: logger.info(f"TXT AI 分析成功: {file.filename}") 
return { - "success": True, + "success": result.get("success", True), "filename": file.filename, - "structured_data": result + "analysis_type": analysis_type, + "result": result } else: logger.warning(f"TXT AI 分析返回空结果: {file.filename}") @@ -400,7 +407,7 @@ async def analyze_txt( "success": False, "filename": file.filename, "error": "AI 分析未能提取到结构化数据", - "structured_data": None + "result": None } finally: @@ -420,19 +427,22 @@ async def analyze_txt( @router.post("/analyze/word") async def analyze_word( file: UploadFile = File(...), - user_hint: str = Query("", description="用户提示词,如'请提取表格数据'") + user_hint: str = Query("", description="用户提示词,如'请提取表格数据'"), + analysis_type: str = Query("structured", description="分析类型: structured, charts") ): """ - 使用 AI 解析 Word 文档,提取结构化数据 + 使用 AI 解析 Word 文档,提取结构化数据或生成图表 适用于从非结构化的 Word 文档中提取表格数据、键值对等信息 + 当 analysis_type=charts 时,可生成可视化图表 Args: file: 上传的 Word 文件 user_hint: 用户提示词 + analysis_type: 分析类型 - "structured"(默认,提取结构化数据)或 "charts"(生成图表) Returns: - dict: 包含结构化数据的解析结果 + dict: 包含结构化数据的解析结果或图表数据 """ if not file.filename: raise HTTPException(status_code=400, detail="文件名为空") @@ -453,16 +463,25 @@ async def analyze_word( tmp_path = tmp.name try: - # 使用 AI 解析 Word 文档 - result = await word_ai_service.parse_word_with_ai( - file_path=tmp_path, - user_hint=user_hint or "请提取文档中的所有结构化数据,包括表格、键值对等" - ) + # 根据 analysis_type 选择处理方式 + if analysis_type == "charts": + # 生成图表 + result = await word_ai_service.generate_charts( + file_path=tmp_path, + user_hint=user_hint + ) + else: + # 提取结构化数据 + result = await word_ai_service.parse_word_with_ai( + file_path=tmp_path, + user_hint=user_hint or "请提取文档中的所有结构化数据,包括表格、键值对等" + ) if result.get("success"): return { "success": True, "filename": file.filename, + "analysis_type": analysis_type, "result": result } else: diff --git a/backend/app/services/txt_ai_service.py b/backend/app/services/txt_ai_service.py new file mode 100644 index 0000000..740121d --- /dev/null +++ b/backend/app/services/txt_ai_service.py @@ -0,0 
+1,370 @@
+"""
+AI analysis service for TXT documents.
+
+Uses an LLM to analyse plain-text files in depth, extract structured data
+and generate visualization charts.
+"""
+import json
+import logging
+import re
+from typing import Any, Dict, List, Optional
+
+from app.services.llm_service import llm_service
+from app.services.visualization_service import visualization_service
+from app.core.document_parser.txt_parser import TxtParser
+
+logger = logging.getLogger(__name__)
+
+
+class TxtAIService:
+    """AI analysis service for TXT documents."""
+
+    def __init__(self):
+        self.parser = TxtParser()
+        # Every analysis method calls self.llm.chat(); without this binding
+        # each request died with AttributeError and came back as an error dict.
+        self.llm = llm_service
+
+    async def analyze_txt_with_ai(
+        self,
+        content: str,
+        filename: str = "",
+        user_hint: str = "",
+        analysis_type: str = "structured"
+    ) -> Dict[str, Any]:
+        """
+        Analyse the content of a TXT file with the LLM.
+
+        Args:
+            content: Text content.
+            filename: File name (optional).
+            user_hint: User prompt hint.
+            analysis_type: "structured" (default, extract structured data)
+                or "charts" (generate charts).
+
+        Returns:
+            Dict with a "success" flag and either the analysis payload or
+            an "error" message; exceptions are converted to an error dict.
+        """
+        try:
+            if not content or not content.strip():
+                return {
+                    "success": False,
+                    "error": "文档内容为空"
+                }
+
+            # Dispatch on the requested analysis type
+            if analysis_type == "charts":
+                return await self.generate_charts(content, filename, user_hint)
+
+            # Default: extract structured data
+            return await self._extract_structured_data(content, filename, user_hint)
+
+        except Exception as e:
+            logger.error(f"TXT AI 分析失败: {str(e)}")
+            return {
+                "success": False,
+                "error": str(e)
+            }
+
+    async def _extract_structured_data(
+        self,
+        content: str,
+        filename: str = "",
+        user_hint: str = ""
+    ) -> Dict[str, Any]:
+        """
+        Extract structured data from text.
+
+        Args:
+            content: Text content.
+            filename: File name (not referenced by the prompt).
+            user_hint: User prompt hint; a generic extraction request is
+                used when empty.
+
+        Returns:
+            Dict with "success" and the extracted tables / key_values /
+            list_items / summary. If the LLM reply cannot be parsed, a
+            500-character text preview is returned instead.
+        """
+        try:
+            # Truncate the content to stay within the token limit
+            max_content_len = 8000
+            text_preview = content[:max_content_len]
+
+            prompt = f"""你是一个专业的数据提取专家。请从以下文本中提取结构化数据。
+
+【用户需求】
+{user_hint if user_hint else "请提取文档中的所有结构化数据,包括表格数据、键值对、列表项等。"}
+
+【文档内容】({"前" + str(max_content_len) + "字符,仅显示部分" if len(content) > max_content_len else "全文"})
+{text_preview}
+
+请按照以下 JSON 格式输出:
+{{
+    "type": "structured_text",
+    "tables": [{{"headers": [...], "rows": [...]}}],
+    "key_values": {{"键1": "值1", "键2": "值2", ...}},
+    "list_items": ["项1", "项2", ...],
+    "summary": "文档内容摘要"
+}}
+
+重点:
+- 如果文档包含表格数据(制表符、空格对齐等),提取到 tables 中
+- 如果文档包含键值对(如 名称: 张三),提取到 key_values 中
+- 如果文档包含列表项,提取到 list_items 中
+- 如果无法提取到结构化数据,至少提供一个详细的摘要
+"""
+
+            messages = [
+                {"role": "system", "content": "你是一个专业的数据提取助手。请严格按JSON格式输出。"},
+                {"role": "user", "content": prompt}
+            ]
+
+            response = await self.llm.chat(
+                messages=messages,
+                temperature=0.1,
+                max_tokens=50000
+            )
+
+            content_text = self.llm.extract_message_content(response)
+            result = self._parse_json_response(content_text)
+
+            if result:
+                logger.info(f"TXT 结构化数据提取成功: type={result.get('type')}")
+                return {
+                    "success": True,
+                    "type": result.get("type", "structured_text"),
+                    "tables": result.get("tables", []),
+                    "key_values": result.get("key_values", {}),
+                    "list_items": result.get("list_items", []),
+                    "summary": result.get("summary", "")
+                }
+            else:
+                return {
+                    "success": True,
+                    "type": "text",
+                    "summary": text_preview[:500],
+                    "raw_text_preview": text_preview[:500]
+                }
+
+        except Exception as e:
+            logger.error(f"TXT 结构化数据提取失败: {str(e)}")
+            return {
+                "success": False,
+                "error": str(e)
+            }
+
+    async def generate_charts(
+        self,
+        content: str,
+        filename: str = "",
+        user_hint: str = ""
+    ) -> Dict[str, Any]:
+        """
+        Extract data from text and generate visualization charts.
+
+        Args:
+            content: Text content.
+            filename: File name; passed to the LLM as the document title.
+            user_hint: User prompt hint (currently not used by the prompt).
+
+        Returns:
+            Dict with chart data and statistics on success, otherwise
+            "success": False plus an "error" message.
+        """
+        try:
+            # Truncate the content to stay within the token limit
+            max_content_len = 8000
+            text_preview = content[:max_content_len]
+
+            # Ask the LLM for chart-ready data
+            prompt = f"""你是一个专业的数据可视化助手。请从以下文本中提取可用于可视化的数据。
+
+文档标题:{filename}
+
+文档内容:
+{text_preview}
+
+请完成以下任务:
+1. 识别文本中的表格数据(制表符分隔、空格对齐的表格等)
+2. 识别文本中的关键统计数据(百分比、数量、趋势等)
+3. 识别可用于比较的分类数据
+
+请用 JSON 格式返回以下结构的数据(如果没有表格数据,返回空结构):
+{{
+    "tables": [
+        {{
+            "description": "表格的描述",
+            "columns": ["列名1", "列名2", ...],
+            "rows": [
+                ["值1", "值2", ...],
+                ["值1", "值2", ...]
+            ]
+        }}
+    ],
+    "key_statistics": [
+        {{
+            "name": "指标名称",
+            "value": "数值",
+            "trend": "增长/下降/持平",
+            "description": "指标说明"
+        }}
+    ],
+    "chart_suggestions": [
+        {{
+            "chart_type": "bar/line/pie",
+            "title": "图表标题",
+            "data_source": "数据来源说明"
+        }}
+    ]
+}}
+
+如果没有表格数据,返回空结构:{{"tables": [], "key_statistics": [], "chart_suggestions": []}}
+请确保返回的是合法的 JSON 格式。"""
+
+            messages = [
+                {"role": "system", "content": "你是一个专业的数据可视化助手,擅长从文本中提取数据并生成图表。"},
+                {"role": "user", "content": prompt}
+            ]
+
+            response = await self.llm.chat(
+                messages=messages,
+                temperature=0.1,
+                max_tokens=50000
+            )
+
+            content_text = self.llm.extract_message_content(response)
+            chart_data = self._parse_json_response(content_text)
+
+            if not chart_data:
+                return {
+                    "success": False,
+                    "error": "无法从文本中提取有效的数据结构"
+                }
+
+            # Check whether any table data came back
+            tables = chart_data.get("tables", [])
+            key_statistics = chart_data.get("key_statistics", [])
+
+            if not tables:
+                return {
+                    "success": False,
+                    "error": "文档中没有可用于图表的表格数据",
+                    "key_statistics": key_statistics,
+                    "chart_suggestions": chart_data.get("chart_suggestions", [])
+                }
+
+            # Charts are generated from the first table only
+            first_table = tables[0]
+            columns = first_table.get("columns", [])
+            rows = first_table.get("rows", [])
+
+            if not columns or not rows:
+                return {
+                    "success": False,
+                    "error": "表格数据为空"
+                }
+
+            # Convert to the format passed to visualization_service
+            viz_data = {
+                "columns": columns,
+                "rows": rows
+            }
+
+            # Generate the charts
+            logger.info(f"开始生成图表,列数: {len(columns)}, 行数: {len(rows)}")
+            vis_result = visualization_service.analyze_and_visualize(viz_data)
+
+            if vis_result.get("success"):
+                return {
+                    "success": True,
+                    "charts": vis_result.get("charts", {}),
+                    "statistics": vis_result.get("statistics", {}),
+                    "distributions": vis_result.get("distributions", {}),
+                    "row_count": vis_result.get("row_count", 0),
+                    "column_count": vis_result.get("column_count", 0),
+                    "key_statistics": key_statistics,
+                    "chart_suggestions": chart_data.get("chart_suggestions", []),
+                    "table_description": first_table.get("description", "")
+                }
+            else:
+                return {
+                    "success": False,
+                    "error": vis_result.get("error", "可视化生成失败"),
+                    "key_statistics": key_statistics
+                }
+
+        except Exception as e:
+            logger.error(f"TXT 图表生成失败: {str(e)}")
+            return {
+                "success": False,
+                "error": str(e)
+            }
+
+    def _parse_json_response(self, content: str) -> Optional[Dict]:
+        """
+        Parse the LLM reply into a dict, tolerating common format noise.
+
+        Markdown code fences are stripped, text before the first "{" and
+        after the end of the JSON object is ignored, and trailing commas
+        before "}" or "]" are removed as a last resort.
+
+        Args:
+            content: Raw text returned by the LLM.
+
+        Returns:
+            The decoded object, or None when no JSON object can be recovered.
+        """
+        if not content:
+            return None
+
+        # Strip markdown code fences
+        cleaned = content.strip()
+        cleaned = re.sub(r'^```json\s*', '', cleaned, flags=re.MULTILINE)
+        cleaned = re.sub(r'^```\s*', '', cleaned, flags=re.MULTILINE)
+        cleaned = cleaned.strip()
+
+        # Locate the start of the JSON object
+        json_start = cleaned.find('{')
+        if json_start == -1:
+            logger.warning("无法找到 JSON 开始位置")
+            return None
+
+        json_text = cleaned[json_start:]
+
+        # First attempt: decode the leading JSON value; raw_decode ignores any
+        # prose after it and is not confused by braces inside strings.
+        try:
+            parsed, _ = json.JSONDecoder().raw_decode(json_text)
+            return parsed
+        except json.JSONDecodeError:
+            pass
+
+        # Second attempt: cut at the matching closing brace and repair
+        # trailing commas.
+        depth = 0
+        end_pos = -1
+        for i, c in enumerate(json_text):
+            if c == '{':
+                depth += 1
+            elif c == '}':
+                depth -= 1
+                if depth == 0:
+                    end_pos = i + 1
+                    break
+
+        if end_pos > 0:
+            # "[}\]]" matches either closer; the previous "[}]]" only matched
+            # the two-character sequence "}]".
+            fixed = re.sub(r',\s*([}\]])', r'\1', json_text[:end_pos])
+            try:
+                return json.loads(fixed)
+            except json.JSONDecodeError as e:
+                logger.warning(f"JSON 修复失败: {e}")
+
+        return None
+
+
+# Module-level singleton
+txt_ai_service = TxtAIService()
diff --git a/backend/app/services/word_ai_service.py b/backend/app/services/word_ai_service.py index 3a0ab16..817fd8c 100644 --- a/backend/app/services/word_ai_service.py +++ b/backend/app/services/word_ai_service.py @@ -8,6 +8,7 @@ from typing import Dict, Any, List, Optional import json from app.services.llm_service import llm_service +from app.services.visualization_service import visualization_service from app.core.document_parser.docx_parser import DocxParser logger = logging.getLogger(__name__) @@ -634,6 +635,127 @@ class WordAIService: return values + async def generate_charts( + self, + file_path: str, + user_hint: 
str = "" + ) -> Dict[str, Any]: + """ + 使用 AI 解析 Word 文档并生成可视化图表 + + 从 Word 文档中提取表格数据,然后生成统计图表 + + Args: + file_path: Word 文件路径 + user_hint: 用户提示词,指定要提取的内容类型 + + Returns: + Dict: 包含图表数据和统计信息的结果 + """ + try: + # 1. 先用基础解析器提取原始内容 + parse_result = self.parser.parse(file_path) + + if not parse_result.success: + return { + "success": False, + "error": parse_result.error, + "structured_data": None + } + + # 2. 获取原始数据 + raw_data = parse_result.data + paragraphs = raw_data.get("paragraphs", []) + tables = raw_data.get("tables", []) + content = raw_data.get("content", "") + + logger.info(f"Word 基础解析完成: {len(paragraphs)} 个段落, {len(tables)} 个表格") + + # 3. 优先处理表格数据 + if tables and len(tables) > 0: + structured_data = await self._extract_tables_with_ai( + tables, paragraphs, 0, user_hint, parse_result.metadata + ) + elif paragraphs and len(paragraphs) > 0: + structured_data = await self._extract_from_text_with_ai( + paragraphs, content, 0, [], user_hint + ) + else: + return { + "success": False, + "error": "文档内容为空", + "structured_data": None + } + + # 4. 检查是否有表格数据用于可视化 + if not structured_data.get("success"): + return { + "success": False, + "error": structured_data.get("error", "解析失败"), + "structured_data": None + } + + parse_type = structured_data.get("type", "") + + # 5. 提取可用于图表的数据 + chart_data = None + + if parse_type == "table_data": + headers = structured_data.get("headers", []) + rows = structured_data.get("rows", []) + if headers and rows: + chart_data = { + "columns": headers, + "rows": rows + } + elif parse_type == "structured_text": + tables = structured_data.get("tables", []) + if tables and len(tables) > 0: + first_table = tables[0] + headers = first_table.get("headers", []) + rows = first_table.get("rows", []) + if headers and rows: + chart_data = { + "columns": headers, + "rows": rows + } + + # 6. 
生成可视化图表 + if chart_data: + logger.info(f"开始生成图表,列数: {len(chart_data['columns'])}, 行数: {len(chart_data['rows'])}") + vis_result = visualization_service.analyze_and_visualize(chart_data) + + if vis_result.get("success"): + return { + "success": True, + "charts": vis_result.get("charts", {}), + "statistics": vis_result.get("statistics", {}), + "distributions": vis_result.get("distributions", {}), + "structured_data": structured_data, + "row_count": vis_result.get("row_count", 0), + "column_count": vis_result.get("column_count", 0) + } + else: + return { + "success": False, + "error": vis_result.get("error", "可视化生成失败"), + "structured_data": structured_data + } + else: + return { + "success": False, + "error": "文档中没有可用于图表的表格数据", + "structured_data": structured_data + } + + except Exception as e: + logger.error(f"Word 文档图表生成失败: {str(e)}") + return { + "success": False, + "error": str(e), + "structured_data": None + } + # 全局单例 word_ai_service = WordAIService() diff --git a/frontend/src/db/backend-api.ts b/frontend/src/db/backend-api.ts index 7d43424..59243f6 100644 --- a/frontend/src/db/backend-api.ts +++ b/frontend/src/db/backend-api.ts @@ -250,6 +250,98 @@ export interface AIExcelAnalyzeResult { error?: string; } +// ==================== Word/TXT AI 分析类型 ==================== + +export type WordAnalysisType = 'structured' | 'charts'; +export type TxtAnalysisType = 'structured' | 'charts'; + +export interface WordAIStructuredResult { + success: boolean; + result?: { + success?: boolean; + type?: string; + headers?: string[]; + rows?: string[][]; + key_values?: Record; + list_items?: string[]; + summary?: string; + error?: string; + }; + error?: string; +} + +export interface WordAIChartsResult { + success: boolean; + result?: { + success?: boolean; + charts?: { + histograms?: Array; + bar_charts?: Array; + box_plots?: Array; + correlation?: any; + }; + statistics?: { + numeric?: Record; + categorical?: Record; + }; + distributions?: Record; + row_count?: number; + 
column_count?: number; + error?: string; + }; + error?: string; +} + +export interface TxtAIStructuredResult { + success: boolean; + result?: { + success?: boolean; + type?: string; + tables?: Array<{ + headers?: string[]; + rows?: string[][]; + }>; + key_values?: Record; + list_items?: string[]; + summary?: string; + error?: string; + }; + error?: string; +} + +export interface TxtAIChartsResult { + success: boolean; + result?: { + success?: boolean; + charts?: { + histograms?: Array; + bar_charts?: Array; + box_plots?: Array; + correlation?: any; + }; + statistics?: { + numeric?: Record; + categorical?: Record; + }; + distributions?: Record; + row_count?: number; + column_count?: number; + key_statistics?: Array<{ + name?: string; + value?: string; + trend?: string; + description?: string; + }>; + chart_suggestions?: Array<{ + chart_type?: string; + title?: string; + data_source?: string; + }>; + error?: string; + }; + error?: string; +} + // ==================== API 封装 ==================== export const backendApi = { @@ -1337,28 +1429,25 @@ export const aiApi = { }, /** - * 上传并使用 AI 分析 TXT 文本文件,提取结构化数据 + * 上传并使用 AI 分析 TXT 文本文件,提取结构化数据或生成图表 */ async analyzeTxt( - file: File + file: File, + analysisType: TxtAnalysisType = 'structured' ): Promise<{ success: boolean; filename?: string; - structured_data?: { - table?: { - columns?: string[]; - rows?: string[][]; - }; - summary?: string; - key_value_pairs?: Array<{ key: string; value: string }>; - numeric_data?: Array<{ name: string; value: number; unit?: string }>; - }; + analysis_type?: string; + result?: any; error?: string; }> { const formData = new FormData(); formData.append('file', file); - const url = `${BACKEND_BASE_URL}/ai/analyze/txt`; + const params = new URLSearchParams(); + params.append('analysis_type', analysisType); + + const url = `${BACKEND_BASE_URL}/ai/analyze/txt?${params.toString()}`; try { const response = await fetch(url, { @@ -1480,19 +1569,17 @@ export const aiApi = { // ==================== 
Word AI 解析 ==================== /** - * 使用 AI 解析 Word 文档,提取结构化数据 + * 使用 AI 解析 Word 文档,提取结构化数据或生成图表 */ async analyzeWordWithAI( file: File, - userHint: string = '' + userHint: string = '', + analysisType: WordAnalysisType = 'structured' ): Promise<{ success: boolean; - type?: string; - headers?: string[]; - rows?: string[][]; - key_values?: Record; - list_items?: string[]; - summary?: string; + filename?: string; + analysis_type?: string; + result?: any; error?: string; }> { const formData = new FormData(); @@ -1501,7 +1588,10 @@ export const aiApi = { formData.append('user_hint', userHint); } - const url = `${BACKEND_BASE_URL}/ai/analyze/word`; + const params = new URLSearchParams(); + params.append('analysis_type', analysisType); + + const url = `${BACKEND_BASE_URL}/ai/analyze/word?${params.toString()}`; try { const response = await fetch(url, { diff --git a/frontend/src/pages/Documents.tsx b/frontend/src/pages/Documents.tsx index 79af9f5..1cf86af 100644 --- a/frontend/src/pages/Documents.tsx +++ b/frontend/src/pages/Documents.tsx @@ -107,6 +107,15 @@ const Documents: React.FC = () => { const [mdStreaming, setMdStreaming] = useState(false); const [mdStreamingContent, setMdStreamingContent] = useState(''); + // Word AI 分析相关状态 + const [wordAnalysis, setWordAnalysis] = useState(null); + const [wordAnalysisType, setWordAnalysisType] = useState<'structured' | 'charts'>('structured'); + const [wordUserHint, setWordUserHint] = useState(''); + + // TXT AI 分析相关状态 + const [txtAnalysis, setTxtAnalysis] = useState(null); + const [txtAnalysisType, setTxtAnalysisType] = useState<'structured' | 'charts'>('structured'); + // RAG 向量检索相关状态 const [ragStatus, setRagStatus] = useState<{ vector_count: number; collections: string[] } | null>(null); const [ragSearchQuery, setRagSearchQuery] = useState(''); @@ -701,6 +710,62 @@ const Documents: React.FC = () => { } }; + // Word AI 分析处理 + const handleWordAnalyze = async () => { + if (!uploadedFile || !isWordFile(uploadedFile.name)) { + 
toast.error('请先上传 Word 文件'); + return; + } + + setAnalyzing(true); + setWordAnalysis(null); + + try { + const result = await aiApi.analyzeWordWithAI( + uploadedFile, + wordUserHint, + wordAnalysisType + ); + + if (result.success) { + toast.success('Word AI 分析完成'); + setWordAnalysis(result); + } else { + toast.error(result.error || 'AI 分析失败'); + } + } catch (error: any) { + toast.error(error.message || 'AI 分析失败'); + } finally { + setAnalyzing(false); + } + }; + + // TXT AI 分析处理 + const handleTxtAnalyze = async () => { + if (!uploadedFile || !isTxtFile(uploadedFile.name)) { + toast.error('请先上传 TXT 文件'); + return; + } + + setAnalyzing(true); + setTxtAnalysis(null); + + try { + const result = await aiApi.analyzeTxt(uploadedFile, txtAnalysisType); + + if (result.success) { + toast.success('TXT AI 分析完成'); + setTxtAnalysis(result); + } else { + toast.error(result.error || 'AI 分析失败'); + } + } catch (error: any) { + toast.error(error.message || 'AI 分析失败'); + } finally { + setAnalyzing(false); + } + }; + const getMdAnalysisIcon = (type: string) => { switch (type) { case 'summary': return ; @@ -739,6 +804,16 @@ const Documents: React.FC = () => { return ext === 'xlsx' || ext === 'xls'; }; + const isWordFile = (filename: string) => { + const ext = filename.split('.').pop()?.toLowerCase(); + return ext === 'docx'; + }; + + const isTxtFile = (filename: string) => { + const ext = filename.split('.').pop()?.toLowerCase(); + return ext === 'txt'; + }; + return (
@@ -1238,6 +1313,115 @@ const Documents: React.FC = () => { )} + {/* Word AI 分析选项 */} + {uploadedFile && isWordFile(uploadedFile.name) && ( + + + + + Word AI 分析 + + + +
+ + +
+
+ +