前后端基本架构和完成excel表的解析及统计图表的生成以及excel表的导出

This commit is contained in:
2026-03-19 01:51:34 +08:00
parent c23b93bb70
commit 2f630695ff
194 changed files with 23354 additions and 174 deletions

View File

@@ -0,0 +1,349 @@
"""
图表生成服务 - 根据结构化数据生成图表
"""
import io
import base64
import logging
from typing import Dict, Any, List, Optional
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
# 使用字体辅助模块配置中文字体
from app.services.font_helper import configure_matplotlib_fonts
configure_matplotlib_fonts()
logger = logging.getLogger(__name__)
class ChartGeneratorService:
    """Chart generation service.

    Renders matplotlib charts (returned as base64 PNG data URLs) and summary
    statistics from the structured data extracted out of an AI analysis result.
    """

    def __init__(self):
        # Chart output directory: <project root>/data/charts (created eagerly).
        self.output_dir = Path(__file__).resolve().parent.parent.parent / "data" / "charts"
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def generate_charts_from_analysis(
        self,
        structured_data: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Generate charts from extracted structured data.

        Args:
            structured_data: Structured data extracted from an AI analysis
                result; expected keys: "success", "data", optionally "error".

        Returns:
            Dict[str, Any]: On success {"success": True, "charts",
            "statistics", "metadata", "data_source"}; otherwise
            {"success": False, "error"}.
        """
        if not structured_data.get("success"):
            return {
                "success": False,
                "error": structured_data.get("error", "数据提取失败")
            }
        data = structured_data.get("data", {})
        charts = {}
        statistics = {}
        try:
            # 1. Charts for numeric items
            numeric_data = data.get("numeric_data", [])
            if numeric_data:
                charts["numeric_charts"] = self._create_numeric_charts(numeric_data)
                statistics["numeric_summary"] = self._create_numeric_summary(numeric_data)
            # 2. Charts for categorical items
            categorical_data = data.get("categorical_data", [])
            if categorical_data:
                charts["categorical_charts"] = self._create_categorical_charts(categorical_data)
            # 3. Time-series chart
            time_series_data = data.get("time_series_data", [])
            if time_series_data:
                charts["time_series_chart"] = self._create_time_series_chart(time_series_data)
            # 4. Comparison chart
            comparison_data = data.get("comparison_data", [])
            if comparison_data:
                charts["comparison_chart"] = self._create_comparison_chart(comparison_data)
            # 5. Table preview (no rendering, just truncated rows)
            table_data = data.get("table_data")
            if table_data:
                charts["table_preview"] = self._create_table_preview(table_data)
            # Pass extraction metadata through unchanged
            metadata = data.get("metadata", {})
            return {
                "success": True,
                "charts": charts,
                "statistics": statistics,
                "metadata": metadata,
                "data_source": "ai_analysis"
            }
        except Exception as e:
            logger.error(f"生成图表失败: {str(e)}", exc_info=True)
            return {
                "success": False,
                "error": str(e)
            }

    def _create_numeric_charts(self, numeric_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Create a bar chart (always) and a pie chart (<= 10 items) for numeric data."""
        charts = []
        # Labels and values; fall back to the item's index as its label
        names = [item.get("name", f"{i}") for i, item in enumerate(numeric_data)]
        values = [item.get("value", 0) for item in numeric_data]
        if not values:
            return charts
        # 1. Bar chart
        try:
            fig, ax = plt.subplots(figsize=(12, 7))
            colors = plt.cm.Set3(np.linspace(0, 1, len(values)))
            bars = ax.bar(names, values, color=colors, alpha=0.8, edgecolor='black', linewidth=0.5)
            # Numeric label above each bar
            for bar, value in zip(bars, values):
                height = bar.get_height()
                ax.text(bar.get_x() + bar.get_width() / 2., height,
                        f'{value:,.0f}',
                        ha='center', va='bottom', fontsize=9, fontweight='bold')
            ax.set_xlabel('项目', fontsize=10, labelpad=10, fontweight='bold')
            ax.set_ylabel('数值', fontsize=10, labelpad=10, fontweight='bold')
            ax.set_title('数值型数据对比', fontsize=12, fontweight='bold', pad=15)
            # Fix tick locations before relabelling: calling set_xticklabels
            # without set_xticks triggers matplotlib's FixedFormatter warning
            # and can mislabel ticks if the locator changes.
            ax.set_xticks(range(len(names)))
            ax.set_xticklabels(names, rotation=30, ha='right', fontsize=9)
            ax.tick_params(axis='both', which='major', labelsize=9)
            ax.grid(axis='y', alpha=0.3)
            plt.tight_layout(pad=1.5)
            img_base64 = self._figure_to_base64(fig)
            charts.append({
                "type": "bar",
                "title": "数值型数据对比",
                "image": img_base64,
                "data": [{"name": n, "value": v} for n, v in zip(names, values)]
            })
        except Exception as e:
            logger.error(f"创建柱状图失败: {str(e)}")
        # 2. Pie chart — only readable for a small number of slices
        if 0 < len(values) <= 10:
            try:
                fig, ax = plt.subplots(figsize=(10, 10))
                wedges, texts, autotexts = ax.pie(values, labels=names, autopct='%1.1f%%',
                                                  startangle=90, colors=plt.cm.Set3.colors[:len(values)])
                for autotext in autotexts:
                    autotext.set_color('white')
                    autotext.set_fontsize(9)
                    autotext.set_fontweight('bold')
                ax.set_title('数值型数据占比', fontsize=12, fontweight='bold', pad=15)
                img_base64 = self._figure_to_base64(fig)
                charts.append({
                    "type": "pie",
                    "title": "数值型数据占比",
                    "image": img_base64,
                    "data": [{"name": n, "value": v} for n, v in zip(names, values)]
                })
            except Exception as e:
                logger.error(f"创建饼图失败: {str(e)}")
        return charts

    def _create_categorical_charts(self, categorical_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Create a horizontal bar chart of category counts."""
        charts = []
        names = [item.get("name", f"{i}") for i, item in enumerate(categorical_data)]
        counts = [item.get("count", 1) for item in categorical_data]
        if not names or not counts:
            return charts
        try:
            # Height grows with the number of categories so labels stay legible
            fig, ax = plt.subplots(figsize=(10, max(6, len(names) * 0.8)))
            y_pos = np.arange(len(names))
            bars = ax.barh(y_pos, counts, align='center', color='#10b981', alpha=0.8, edgecolor='black', linewidth=0.5)
            # Count label at the end of each bar
            for bar, count in zip(bars, counts):
                width = bar.get_width()
                ax.text(width, bar.get_y() + bar.get_height() / 2.,
                        f'{count}',
                        ha='left', va='center', fontsize=10, fontweight='bold')
            ax.set_yticks(y_pos)
            ax.set_yticklabels(names, fontsize=10)
            ax.invert_yaxis()  # first category on top
            ax.set_xlabel('数量', fontsize=10, labelpad=10, fontweight='bold')
            ax.set_title('分类数据分布', fontsize=12, fontweight='bold', pad=15)
            ax.tick_params(axis='both', which='major', labelsize=9)
            ax.grid(axis='x', alpha=0.3)
            plt.tight_layout(pad=1.5)
            img_base64 = self._figure_to_base64(fig)
            charts.append({
                "type": "barh",
                "title": "分类数据分布",
                "image": img_base64,
                "data": [{"name": n, "count": c} for n, c in zip(names, counts)]
            })
        except Exception as e:
            logger.error(f"创建分类图表失败: {str(e)}")
        return charts

    def _create_time_series_chart(self, time_series_data: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
        """Create a combined bar + line chart; None if fewer than two points."""
        if not time_series_data:
            return None
        try:
            names = [item.get("name", f"时间{i}") for i, item in enumerate(time_series_data)]
            values = [item.get("value", 0) for item in time_series_data]
            if len(values) < 2:
                return None
            fig, ax = plt.subplots(figsize=(14, 7))
            x_pos = np.arange(len(names))
            # Bars for the raw values, an overlaid line for the trend
            ax.bar(x_pos, values, width=0.4, label='数值', color='#3b82f6', alpha=0.7)
            ax.plot(x_pos, values, 'o-', color='#ef4444', linewidth=2.5, markersize=8, label='趋势')
            ax.set_xticks(x_pos)
            ax.set_xticklabels(names, rotation=30, ha='right', fontsize=9)
            ax.set_ylabel('数值', fontsize=10, labelpad=10, fontweight='bold')
            ax.set_title('时间序列数据', fontsize=12, fontweight='bold', pad=15)
            ax.legend(loc='best', fontsize=9)
            ax.tick_params(axis='both', which='major', labelsize=9)
            ax.grid(True, alpha=0.3)
            plt.tight_layout(pad=1.5)
            img_base64 = self._figure_to_base64(fig)
            return {
                "type": "time_series",
                "title": "时间序列数据",
                "image": img_base64,
                "data": [{"name": n, "value": v} for n, v in zip(names, values)]
            }
        except Exception as e:
            logger.error(f"创建时间序列图表失败: {str(e)}")
            return None

    def _create_comparison_chart(self, comparison_data: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
        """Create a bar chart with positive values green and negative values red."""
        if not comparison_data:
            return None
        try:
            names = [item.get("name", f"对比{i}") for i, item in enumerate(comparison_data)]
            values = [item.get("value", 0) for item in comparison_data]
            fig, ax = plt.subplots(figsize=(10, 7))
            # Green for gains, red for losses
            colors = ['#10b981' if v >= 0 else '#ef4444' for v in values]
            bars = ax.bar(names, values, color=colors, alpha=0.8, edgecolor='black', linewidth=0.8)
            # Value labels: above positive bars, below negative ones
            for bar, value in zip(bars, values):
                height = bar.get_height()
                ax.text(bar.get_x() + bar.get_width() / 2., height,
                        f'{value:,.1f}',
                        ha='center', va='bottom' if value >= 0 else 'top',
                        fontsize=10, fontweight='bold')
            # Zero baseline
            ax.axhline(y=0, color='black', linestyle='-', linewidth=1)
            ax.set_ylabel('', fontsize=10, labelpad=10, fontweight='bold')
            ax.set_title('对比数据', fontsize=12, fontweight='bold', pad=15)
            # Fix tick locations before relabelling (see _create_numeric_charts)
            ax.set_xticks(range(len(names)))
            ax.set_xticklabels(names, rotation=30, ha='right', fontsize=9)
            ax.tick_params(axis='both', which='major', labelsize=9)
            ax.grid(axis='y', alpha=0.3)
            plt.tight_layout(pad=1.5)
            img_base64 = self._figure_to_base64(fig)
            return {
                "type": "comparison",
                "title": "对比数据",
                "image": img_base64,
                "data": [{"name": n, "value": v} for n, v in zip(names, values)]
            }
        except Exception as e:
            logger.error(f"创建对比图表失败: {str(e)}")
            return None

    def _create_table_preview(self, table_data: Dict[str, Any]) -> Dict[str, Any]:
        """Return a table preview payload capped at the first 50 rows."""
        if not table_data:
            return {}
        columns = table_data.get("columns", [])
        rows = table_data.get("rows", [])
        return {
            "columns": columns,
            "rows": rows[:50],  # cap the preview at 50 rows
            "total_rows": len(rows),
            "preview_rows": min(50, len(rows))
        }

    def _create_numeric_summary(self, numeric_data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Summarize numeric items; entries whose value is not int/float are skipped."""
        values = [item.get("value", 0) for item in numeric_data if isinstance(item.get("value"), (int, float))]
        if not values:
            return {}
        return {
            "count": len(values),
            "sum": float(sum(values)),
            "mean": float(np.mean(values)),
            "median": float(np.median(values)),
            "min": float(min(values)),
            "max": float(max(values)),
            "std": float(np.std(values)) if len(values) > 1 else 0
        }

    def _figure_to_base64(self, fig) -> str:
        """Serialize a matplotlib figure to a PNG data URL and close the figure."""
        buf = io.BytesIO()
        fig.savefig(
            buf,
            format='png',
            dpi=120,
            bbox_inches='tight',
            pad_inches=0.3,
            facecolor='white',
            edgecolor='none',
            transparent=False
        )
        # Close eagerly so figures do not accumulate across requests
        plt.close(fig)
        buf.seek(0)
        img_base64 = base64.b64encode(buf.read()).decode('utf-8')
        return f"data:image/png;base64,{img_base64}"
# Global singleton instance shared by importers
chart_generator_service = ChartGeneratorService()

View File

@@ -0,0 +1,253 @@
"""
Excel AI 分析服务 - 集成 Excel 解析和 LLM 分析
"""
import logging
from typing import Dict, Any, Optional, List
from app.core.document_parser import XlsxParser
from app.services.file_service import file_service
from app.services.llm_service import llm_service
logger = logging.getLogger(__name__)
class ExcelAIService:
    """Excel AI analysis service: combines Excel parsing with LLM analysis."""

    def __init__(self):
        # Collaborators; file_service and llm_service are module-level singletons.
        self.parser = XlsxParser()
        self.file_service = file_service
        self.llm_service = llm_service

    async def analyze_excel_file(
        self,
        file_content: bytes,
        filename: str,
        user_prompt: str = "",
        analysis_type: str = "general",
        parse_options: Optional[Dict[str, Any]] = None
    ) -> Dict[str, Any]:
        """Analyze a single Excel file end to end (save, parse, LLM analysis).

        Args:
            file_content: Raw file bytes.
            filename: Original file name.
            user_prompt: Optional custom prompt; when non-blank the template
                analysis path is used instead of the standard one.
            analysis_type: Analysis type forwarded to the LLM service.
            parse_options: Optional keyword options forwarded to the parser.

        Returns:
            Dict[str, Any]: {"success", "excel", "analysis"} on success, or an
            error payload with "error" set.
        """
        # 1. Persist the upload to disk.
        try:
            saved_path = self.file_service.save_uploaded_file(
                file_content,
                filename,
                subfolder="excel"
            )
            logger.info(f"文件已保存: {saved_path}")
        except Exception as e:
            logger.error(f"文件保存失败: {str(e)}")
            return {
                "success": False,
                "error": f"文件保存失败: {str(e)}",
                "analysis": None
            }
        # 2. Parse the Excel file.
        try:
            parse_options = parse_options or {}
            parse_result = self.parser.parse(saved_path, **parse_options)
            if not parse_result.success:
                return {
                    "success": False,
                    "error": parse_result.error,
                    "analysis": None
                }
            excel_data = parse_result.data
            logger.info(f"Excel 解析成功: {parse_result.metadata}")
        except Exception as e:
            logger.error(f"Excel 解析失败: {str(e)}")
            return {
                "success": False,
                "error": f"Excel 解析失败: {str(e)}",
                "analysis": None
            }
        # 3. Run the LLM analysis.
        try:
            # A non-blank custom prompt switches to template analysis.
            if user_prompt and user_prompt.strip():
                llm_result = await self.llm_service.analyze_with_template(
                    excel_data,
                    user_prompt
                )
            else:
                # Otherwise run the standard typed analysis.
                llm_result = await self.llm_service.analyze_excel_data(
                    excel_data,
                    user_prompt,
                    analysis_type
                )
            logger.info(f"AI 分析完成: {llm_result['success']}")
            # 4. Combine parse + analysis results.
            return {
                "success": True,
                "excel": {
                    "data": excel_data,
                    "metadata": parse_result.metadata,
                    "saved_path": saved_path
                },
                "analysis": llm_result
            }
        except Exception as e:
            logger.error(f"AI 分析失败: {str(e)}")
            return {
                "success": False,
                "error": f"AI 分析失败: {str(e)}",
                "excel": {
                    "data": excel_data,
                    "metadata": parse_result.metadata
                },
                "analysis": None
            }

    async def batch_analyze_sheets(
        self,
        file_content: bytes,
        filename: str,
        user_prompt: str = "",
        analysis_type: str = "general"
    ) -> Dict[str, Any]:
        """Analyze every worksheet of an Excel file.

        Args:
            file_content: Raw file bytes.
            filename: Original file name.
            user_prompt: Optional custom prompt (template analysis when non-blank).
            analysis_type: Analysis type forwarded to the LLM service.

        Returns:
            Dict[str, Any]: Per-sheet analyses plus aggregate counts; "success"
            is True only when no sheet failed.
        """
        # 1. Persist the upload to disk.
        try:
            saved_path = self.file_service.save_uploaded_file(
                file_content,
                filename,
                subfolder="excel"
            )
            logger.info(f"文件已保存: {saved_path}")
        except Exception as e:
            logger.error(f"文件保存失败: {str(e)}")
            return {
                "success": False,
                "error": f"文件保存失败: {str(e)}",
                "analysis": None
            }
        # 2. Parse all worksheets.
        try:
            parse_result = self.parser.parse_all_sheets(saved_path)
            if not parse_result.success:
                return {
                    "success": False,
                    "error": parse_result.error,
                    "analysis": None
                }
            sheets_data = parse_result.data.get("sheets", {})
            logger.info(f"Excel 解析成功,共 {len(sheets_data)} 个工作表")
        except Exception as e:
            logger.error(f"Excel 解析失败: {str(e)}")
            return {
                "success": False,
                "error": f"Excel 解析失败: {str(e)}",
                "analysis": None
            }
        # 3. Analyze each worksheet; every failure lands exactly once in `errors`
        #    (either an unsuccessful LLM result or a raised exception).
        sheet_analyses = {}
        errors = {}
        for sheet_name, sheet_data in sheets_data.items():
            try:
                if user_prompt and user_prompt.strip():
                    llm_result = await self.llm_service.analyze_with_template(
                        sheet_data,
                        user_prompt
                    )
                else:
                    llm_result = await self.llm_service.analyze_excel_data(
                        sheet_data,
                        user_prompt,
                        analysis_type
                    )
                sheet_analyses[sheet_name] = llm_result
                if not llm_result["success"]:
                    errors[sheet_name] = llm_result.get("error", "未知错误")
                logger.info(f"工作表 '{sheet_name}' 分析完成")
            except Exception as e:
                logger.error(f"工作表 '{sheet_name}' 分析失败: {str(e)}")
                errors[sheet_name] = str(e)
        # 4. Combine results. A sheet succeeded iff it has no entry in `errors`;
        #    the previous `len(sheet_analyses) - len(errors)` undercounted when a
        #    sheet raised (such sheets are in `errors` but not `sheet_analyses`).
        return {
            "success": len(errors) == 0,
            "excel": {
                "sheets": sheets_data,
                "metadata": parse_result.metadata,
                "saved_path": saved_path
            },
            "analysis": {
                "sheets": sheet_analyses,
                "total_sheets": len(sheets_data),
                "successful": len(sheets_data) - len(errors),
                "errors": errors
            }
        }

    def get_supported_analysis_types(self) -> List[Dict[str, str]]:
        """Return the selectable analysis types (value/label/description dicts).

        Note: the return annotation was corrected from List[str] — the method
        has always returned a list of dicts.
        """
        return [
            {
                "value": "general",
                "label": "综合分析",
                "description": "提供数据概览、关键发现、质量评估和建议"
            },
            {
                "value": "summary",
                "label": "数据摘要",
                "description": "快速了解数据的结构、范围和主要内容"
            },
            {
                "value": "statistics",
                "label": "统计分析",
                "description": "数值型列的统计信息和分类列的分布"
            },
            {
                "value": "insights",
                "label": "深度洞察",
                "description": "深入挖掘数据,提供异常值和业务建议"
            }
        ]
# Global singleton instance shared by importers
excel_ai_service = ExcelAIService()

View File

@@ -0,0 +1,132 @@
"""
文件服务模块 - 处理文件存储和读取
"""
import os
import shutil
from pathlib import Path
from datetime import datetime
from typing import Optional
import uuid
from app.config import settings
class FileService:
    """File service: stores, reads and manages uploaded files on disk."""

    def __init__(self):
        self.upload_dir = Path(settings.UPLOAD_DIR)
        self._ensure_upload_dir()

    def _ensure_upload_dir(self):
        """Create the upload root directory if it does not exist yet."""
        self.upload_dir.mkdir(parents=True, exist_ok=True)

    def save_uploaded_file(
        self,
        file_content: bytes,
        filename: str,
        subfolder: Optional[str] = None
    ) -> str:
        """Persist an uploaded file under a collision-free random name.

        Args:
            file_content: Raw file bytes.
            filename: Original file name (only its extension is kept).
            subfolder: Optional sub-directory below the upload root.

        Returns:
            str: Path of the stored file.
        """
        # Random hex name + original extension so same-named uploads never clash.
        unique_name = f"{uuid.uuid4().hex}{Path(filename).suffix}"
        if subfolder:
            target_dir = self.upload_dir / subfolder
            target_dir.mkdir(parents=True, exist_ok=True)
        else:
            target_dir = self.upload_dir
        destination = target_dir / unique_name
        destination.write_bytes(file_content)
        return str(destination)

    def read_file(self, file_path: str) -> bytes:
        """Return the raw bytes of the file at *file_path*.

        Args:
            file_path: Path of the file to read.

        Returns:
            bytes: The file content.
        """
        return Path(file_path).read_bytes()

    def delete_file(self, file_path: str) -> bool:
        """Delete a file if it exists.

        Args:
            file_path: Path of the file to delete.

        Returns:
            bool: True if the file existed and was removed, False otherwise.
        """
        try:
            target = Path(file_path)
            if not target.exists():
                return False
            target.unlink()
            return True
        except Exception:
            return False

    def get_file_info(self, file_path: str) -> dict:
        """Describe a file on disk.

        Args:
            file_path: Path of the file.

        Returns:
            dict: name/path/size/timestamps/extension, or {} when absent.
        """
        target = Path(file_path)
        if not target.exists():
            return {}
        stat = target.stat()
        return {
            "filename": target.name,
            "filepath": str(target),
            "size": stat.st_size,
            "created": datetime.fromtimestamp(stat.st_ctime).isoformat(),
            "modified": datetime.fromtimestamp(stat.st_mtime).isoformat(),
            "extension": target.suffix.lower()
        }

    def get_file_size(self, file_path: str) -> int:
        """Return a file's size in bytes.

        Args:
            file_path: Path of the file.

        Returns:
            int: Size in bytes; 0 when the file does not exist.
        """
        target = Path(file_path)
        return target.stat().st_size if target.exists() else 0
# Global singleton instance shared by importers
file_service = FileService()

View File

@@ -0,0 +1,105 @@
"""
字体辅助模块 - 处理中文字体检测和配置
"""
import matplotlib
import matplotlib.font_manager as fm
import platform
import os
from pathlib import Path
import logging
logger = logging.getLogger(__name__)
def get_chinese_font() -> str:
    """Pick an installed font capable of rendering Chinese text.

    Tries a per-platform preference list first, then any font whose name
    suggests CJK coverage, and finally falls back to matplotlib's generic
    'sans-serif' family.

    Returns:
        str: The chosen font family name.
    """
    # Names of every font matplotlib can see on this machine.
    installed = {font.name for font in fm.fontManager.ttflist}
    system = platform.system()
    if system == 'Windows':
        candidates = [
            'Microsoft YaHei',  # Microsoft YaHei
            'SimHei',           # SimHei
            'SimSun',           # SimSun
            'KaiTi',            # KaiTi
            'FangSong',         # FangSong
            'STXihei',          # ST Xihei
            'STKaiti',          # ST Kaiti
            'STSong',           # ST Song
            'STFangsong',       # ST Fangsong
        ]
    elif system == 'Darwin':
        candidates = [
            'PingFang SC',        # PingFang Simplified
            'PingFang TC',        # PingFang Traditional
            'Heiti SC',           # Heiti Simplified
            'Heiti TC',           # Heiti Traditional
            'STHeiti',            # ST Heiti
            'STSong',             # ST Song
            'STKaiti',            # ST Kaiti
            'Arial Unicode MS',   # Arial Unicode MS
        ]
    else:
        candidates = [
            'Noto Sans CJK SC',      # Noto Sans CJK Simplified Chinese
            'WenQuanYi Micro Hei',   # WenQuanYi Micro Hei
            'AR PL UMing CN',        # AR PL UMing
            'AR PL UKai CN',         # AR PL UKai
            'ZCOOL XiaoWei',         # ZCOOL XiaoWei
        ]
    # Generic fallbacks appended for every platform.
    candidates += [
        'SimHei',
        'Microsoft YaHei',
        'Arial Unicode MS',
        'Droid Sans Fallback',
    ]
    # First preferred candidate that is actually installed wins.
    for candidate in candidates:
        if candidate in installed:
            logger.info(f"找到中文字体: {candidate}")
            return candidate
    # Otherwise accept any font whose name hints at CJK coverage.
    for font in fm.fontManager.ttflist:
        if 'CJK' in font.name or 'SC' in font.name or 'TC' in font.name:
            logger.info(f"使用找到的中文字体: {font.name}")
            return font.name
    # Last resort: generic sans-serif (Chinese glyphs may render as boxes).
    logger.warning("未找到合适的中文字体,使用默认字体")
    return 'sans-serif'
def configure_matplotlib_fonts():
    """Apply the project's matplotlib font configuration.

    Selects a Chinese-capable font via get_chinese_font() and installs it,
    along with the shared DPI and font-size defaults, into rcParams.

    Returns:
        str: The font family name that was configured.
    """
    chinese_font = get_chinese_font()
    matplotlib.rcParams.update({
        # Font family and correct rendering of the minus sign.
        'font.sans-serif': [chinese_font],
        'axes.unicode_minus': False,
        # Figure resolution for display and saved images.
        'figure.dpi': 100,
        'savefig.dpi': 120,
        # Shared font sizes for labels, ticks and legends.
        'font.size': 10,
        'axes.labelsize': 10,
        'axes.titlesize': 11,
        'xtick.labelsize': 9,
        'ytick.labelsize': 9,
        'legend.fontsize': 9,
    })
    logger.info(f"配置完成,使用字体: {chinese_font}")
    return chinese_font

View File

@@ -0,0 +1,268 @@
"""
LLM 服务模块 - 封装大模型 API 调用
"""
import logging
from typing import Dict, Any, List, Optional
import httpx
from app.config import settings
logger = logging.getLogger(__name__)
class LLMService:
    """Large language model service wrapping an OpenAI-compatible chat API."""

    def __init__(self):
        # Connection settings come from application configuration.
        self.api_key = settings.LLM_API_KEY
        self.base_url = settings.LLM_BASE_URL
        self.model_name = settings.LLM_MODEL_NAME

    async def chat(
        self,
        messages: List[Dict[str, str]],
        temperature: float = 0.7,
        max_tokens: Optional[int] = None,
        timeout: float = 60.0,
        **kwargs
    ) -> Dict[str, Any]:
        """Call the chat-completions API.

        Args:
            messages: Message list, e.g. [{"role": "user", "content": "..."}].
            temperature: Sampling temperature controlling randomness.
            max_tokens: Maximum number of tokens to generate, if set.
            timeout: Request timeout in seconds (was a hard-coded 60 s; a
                `timeout` kwarg previously leaked into the request payload).
            **kwargs: Extra fields merged into the request payload.

        Returns:
            Dict[str, Any]: The raw JSON API response.

        Raises:
            httpx.HTTPStatusError: On a non-2xx response.
            Exception: On any other transport/serialization failure.
        """
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }
        payload = {
            "model": self.model_name,
            "messages": messages,
            "temperature": temperature
        }
        # `is not None` (not truthiness) so an explicit max_tokens=0 is honored.
        if max_tokens is not None:
            payload["max_tokens"] = max_tokens
        # Merge any extra API parameters supplied by the caller.
        payload.update(kwargs)
        try:
            async with httpx.AsyncClient(timeout=timeout) as client:
                response = await client.post(
                    f"{self.base_url}/chat/completions",
                    headers=headers,
                    json=payload
                )
                response.raise_for_status()
                return response.json()
        except httpx.HTTPStatusError as e:
            logger.error(f"LLM API 请求失败: {e.response.status_code} - {e.response.text}")
            raise
        except Exception as e:
            logger.error(f"LLM API 调用异常: {str(e)}")
            raise

    def extract_message_content(self, response: Dict[str, Any]) -> str:
        """Extract the assistant message text from an API response.

        Args:
            response: Raw API response dict.

        Returns:
            str: The first choice's message content.

        Raises:
            KeyError, IndexError: When the response shape is unexpected.
        """
        try:
            return response["choices"][0]["message"]["content"]
        except (KeyError, IndexError) as e:
            logger.error(f"解析 API 响应失败: {str(e)}")
            raise

    async def analyze_excel_data(
        self,
        excel_data: Dict[str, Any],
        user_prompt: str,
        analysis_type: str = "general"
    ) -> Dict[str, Any]:
        """Run a typed analysis of parsed Excel data.

        Args:
            excel_data: Parsed Excel payload ("columns", "rows", counts).
            user_prompt: Optional user text appended to the data description.
            analysis_type: One of general / summary / statistics / insights.

        Returns:
            Dict[str, Any]: {"success": True, "analysis", "model",
            "analysis_type"} or {"success": False, "error", "analysis": None}.
        """
        # Build the system + user messages for the chosen analysis type.
        system_prompt = self._get_system_prompt(analysis_type)
        user_message = self._format_user_message(excel_data, user_prompt)
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message}
        ]
        try:
            response = await self.chat(
                messages=messages,
                temperature=0.3,  # low temperature for more stable output
                max_tokens=2000
            )
            content = self.extract_message_content(response)
            return {
                "success": True,
                "analysis": content,
                "model": self.model_name,
                "analysis_type": analysis_type
            }
        except Exception as e:
            logger.error(f"Excel 数据分析失败: {str(e)}")
            return {
                "success": False,
                "error": str(e),
                "analysis": None
            }

    def _get_system_prompt(self, analysis_type: str) -> str:
        """Return the system prompt for *analysis_type*; unknown types fall back to "general"."""
        prompts = {
            "general": """你是一个专业的数据分析师。请分析用户提供的 Excel 数据,提供有价值的见解和建议。
请按照以下格式输出:
1. 数据概览
2. 关键发现
3. 数据质量评估
4. 建议
输出语言:中文""",
            "summary": """你是一个专业的数据分析师。请对用户提供的 Excel 数据进行简洁的总结。
输出格式:
- 数据行数和列数
- 主要列的说明
- 数据范围概述
输出语言:中文""",
            "statistics": """你是一个专业的数据分析师。请对用户提供的 Excel 数据进行统计分析。
请分析:
- 数值型列的统计信息(平均值、中位数、最大值、最小值)
- 分类列的分布情况
- 数据相关性
输出语言:中文,使用表格或结构化格式展示""",
            "insights": """你是一个专业的数据分析师。请深入挖掘用户提供的 Excel 数据,提供有价值的洞察。
请分析:
1. 数据中的异常值或特殊模式
2. 数据之间的潜在关联
3. 基于数据的业务建议
4. 数据趋势分析(如适用)
输出语言:中文,提供详细且可操作的建议"""
        }
        return prompts.get(analysis_type, prompts["general"])

    def _format_user_message(self, excel_data: Dict[str, Any], user_prompt: str) -> str:
        """Render the parsed Excel data (plus the user's request) into one prompt string."""
        columns = excel_data.get("columns", [])
        rows = excel_data.get("rows", [])
        row_count = excel_data.get("row_count", 0)
        column_count = excel_data.get("column_count", 0)
        # Overview header followed by up to five sample rows.
        data_info = f"""
Excel 数据概览:
- 行数: {row_count}
- 列数: {column_count}
- 列名: {', '.join(columns)}
数据样例(前 5 行):
"""
        for i, row in enumerate(rows[:5], 1):
            row_str = " | ".join([f"{col}: {row.get(col, '')}" for col in columns])
            data_info += f"第 {i} 行: {row_str}\n"
        if row_count > 5:
            data_info += f"\n(还有 {row_count - 5} 行数据...)\n"
        # Append the user's request (or a generic one when blank).
        if user_prompt and user_prompt.strip():
            data_info += f"\n用户需求:\n{user_prompt}"
        else:
            data_info += "\n用户需求: 请对上述数据进行分析"
        return data_info

    async def analyze_with_template(
        self,
        excel_data: Dict[str, Any],
        template_prompt: str
    ) -> Dict[str, Any]:
        """Analyze Excel data with a caller-supplied prompt template.

        Args:
            excel_data: Parsed Excel payload.
            template_prompt: The custom prompt text.

        Returns:
            Dict[str, Any]: {"success": True, "analysis", "model",
            "is_template": True} or {"success": False, "error",
            "analysis": None}.
        """
        system_prompt = """你是一个专业的数据分析师。请根据用户提供的自定义提示词分析 Excel 数据。
请严格按照用户的要求进行分析,输出清晰、有条理的结果。
输出语言:中文"""
        user_message = self._format_user_message(excel_data, template_prompt)
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message}
        ]
        try:
            response = await self.chat(
                messages=messages,
                temperature=0.5,
                max_tokens=3000
            )
            content = self.extract_message_content(response)
            return {
                "success": True,
                "analysis": content,
                "model": self.model_name,
                "is_template": True
            }
        except Exception as e:
            logger.error(f"自定义模板分析失败: {str(e)}")
            return {
                "success": False,
                "error": str(e),
                "analysis": None
            }
# Global singleton instance shared by importers
llm_service = LLMService()

View File

@@ -0,0 +1,218 @@
"""
文本分析服务 - 从 AI 分析结果中提取结构化数据用于可视化
"""
import logging
from typing import Dict, Any, List, Optional
import re
import json
from app.services.llm_service import llm_service
logger = logging.getLogger(__name__)
class TextAnalysisService:
    """Text analysis service: extracts chartable structured data from AI analysis text."""

    def __init__(self):
        # LLM client used for the extraction call (module-level singleton).
        self.llm_service = llm_service

    async def extract_structured_data(
        self,
        analysis_text: str,
        original_filename: str = "",
        file_type: str = "text"
    ) -> Dict[str, Any]:
        """Extract structured, chartable data from an AI analysis result.

        Args:
            analysis_text: The AI analysis result text.
            original_filename: Name of the originally analyzed file.
            file_type: Type of the originally analyzed file.

        Returns:
            Dict[str, Any]: The parsed extraction result, or
            {"success": False, "error": ...} when extraction/parsing fails.
        """
        # Truncate the input so the prompt stays within the token budget.
        max_text_length = 8000
        truncated_text = analysis_text[:max_text_length]
        system_prompt = """你是一个专业的数据提取助手。你的任务是从AI分析结果中提取结构化数据用于生成图表。
请按照以下要求提取数据:
1. 数值型数据:
- 提取所有的数值、统计信息、百分比等
- 为每个数值创建一个条目,包含:名称、值、单位(如果有)
- 格式示例:{"name": "销售额", "value": 123456.78, "unit": ""}
2. 分类数据:
- 提取所有的类别、状态、枚举值等
- 为每个类别创建一个条目,包含:名称、值、数量(如果有)
- 格式示例:{"name": "产品类别", "value": "电子产品", "count": 25}
3. 时间序列数据:
- 提取所有的时间相关数据(年月、季度、日期等)
- 格式示例:{"name": "2025年1月", "value": 12345}
4. 对比数据:
- 提取所有的对比、排名、趋势等数据
- 格式示例:{"name": "同比增长", "value": 15.3, "unit": "%"}
5. 表格数据:
- 如果分析结果中包含表格或列表形式的数据,提取出来
- 格式:{"columns": ["列1", "列2"], "rows": [{"列1": "值1", "列2": "值2"}]}
重要规则:
- 只提取明确提到的数据和数值
- 如果某种类型的数据不存在,返回空数组 []
- 确保所有数值都是有效的数字类型
- 保持数据的原始精度
- 返回的 JSON 必须完整且格式正确
- 表格数据最多提取 20 行
请以 JSON 格式返回,不要添加任何 Markdown 标记或解释文字,只返回纯 JSON
{
"success": true,
"data": {
"numeric_data": [
{"name": string, "value": number, "unit": string|null}
],
"categorical_data": [
{"name": string, "value": string, "count": number|null}
],
"time_series_data": [
{"name": string, "value": number}
],
"comparison_data": [
{"name": string, "value": number, "unit": string|null}
],
"table_data": {
"columns": string[],
"rows": object[]
} | null
},
"metadata": {
"total_items": number,
"data_types": string[]
}
}"""
        user_message = f"""请从以下 AI 分析结果中提取结构化数据:
原始文件名:{original_filename}
文件类型:{file_type}
AI 分析结果:
{truncated_text}
请按照系统提示的要求提取数据并返回纯 JSON 格式。"""
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message}
        ]
        try:
            logger.info(f"开始提取结构化数据,文本长度: {len(truncated_text)}")
            # Very low temperature: we want deterministic, well-formed JSON.
            response = await self.llm_service.chat(
                messages=messages,
                temperature=0.1,
                max_tokens=4000
            )
            content = self.llm_service.extract_message_content(response)
            logger.info(f"LLM 返回内容长度: {len(content)}")
            # Best-effort JSON extraction from the free-form LLM reply.
            result = self._extract_json_simple(content)
            if not result:
                logger.error("无法从 LLM 响应中提取有效的 JSON")
                return {
                    "success": False,
                    "error": "AI 返回的数据格式不正确或被截断",
                    "raw_content": content[:500]
                }
            logger.info(f"成功提取结构化数据")
            return result
        except Exception as e:
            logger.error(f"提取结构化数据失败: {str(e)}")
            return {
                "success": False,
                "error": str(e)
            }

    def _extract_json_simple(self, content: str) -> Optional[Dict[str, Any]]:
        """Best-effort JSON extraction from an LLM reply.

        Tries, in order: a fenced ```json code block, the first balanced
        top-level {...} object, and finally parsing the whole string.

        Args:
            content: Raw LLM reply text.

        Returns:
            Optional[Dict[str, Any]]: The parsed JSON, or None on failure.
        """
        try:
            # Method 1: fenced ```json ... ``` block. The previous pattern had
            # no capture group, so group(1) always raised IndexError and valid
            # fenced replies were dropped; capture the fenced body instead.
            code_block_match = re.search(r'```json\s*([\s\S]*?)\s*```', content)
            if code_block_match:
                json_str = code_block_match.group(1)
                logger.info("从代码块中提取 JSON")
                return json.loads(json_str)
            # Method 2: first balanced top-level { ... } object.
            # NOTE(review): brace counting ignores braces inside string
            # literals; acceptable for typical LLM output.
            brace_count = 0
            json_start = -1
            for i in range(len(content)):
                if content[i] == '{':
                    if brace_count == 0:
                        json_start = i
                    brace_count += 1
                elif content[i] == '}':
                    brace_count -= 1
                    if brace_count == 0:
                        # Found a complete JSON object
                        json_end = i + 1
                        json_str = content[json_start:json_end]
                        logger.info(f"从大括号中提取 JSON")
                        return json.loads(json_str)
            # Method 3: try parsing the whole reply directly.
            logger.info("尝试直接解析整个内容")
            return json.loads(content)
        except json.JSONDecodeError as e:
            logger.error(f"JSON 解析失败: {str(e)}")
            logger.error(f"原始内容(前 500 字符): {content[:500]}...")
            return None
        except Exception as e:
            logger.error(f"提取 JSON 失败: {str(e)}")
            return None

    def detect_data_types(self, data: Dict[str, Any]) -> List[str]:
        """Return the list of data categories present in an extraction result."""
        types = []
        d = data.get("data", {})
        if d.get("numeric_data") and len(d["numeric_data"]) > 0:
            types.append("numeric")
        if d.get("categorical_data") and len(d["categorical_data"]) > 0:
            types.append("categorical")
        if d.get("time_series_data") and len(d["time_series_data"]) > 0:
            types.append("time_series")
        if d.get("comparison_data") and len(d["comparison_data"]) > 0:
            types.append("comparison")
        if d.get("table_data") and d["table_data"]:
            types.append("table")
        return types
# Global singleton instance shared by importers
text_analysis_service = TextAnalysisService()

View File

@@ -0,0 +1,388 @@
"""
数据可视化服务 - 使用 matplotlib/plotly 生成统计图表
"""
import io
import base64
import logging
from typing import Dict, Any, List, Optional, Union
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
# 使用字体辅助模块配置中文字体
from app.services.font_helper import configure_matplotlib_fonts
configure_matplotlib_fonts()
logger = logging.getLogger(__name__)
class VisualizationService:
    """Data visualization service: computes statistics and renders matplotlib charts for tabular data."""
    def __init__(self):
        # Chart output directory: <project root>/data/charts (created eagerly).
        self.output_dir = Path(__file__).resolve().parent.parent.parent / "data" / "charts"
        self.output_dir.mkdir(parents=True, exist_ok=True)
def analyze_and_visualize(
self,
excel_data: Dict[str, Any],
analysis_type: str = "statistics"
) -> Dict[str, Any]:
"""
分析数据并生成可视化图表
Args:
excel_data: Excel 解析后的数据
analysis_type: 分析类型
Returns:
Dict[str, Any]: 包含图表数据和统计信息的结果
"""
try:
columns = excel_data.get("columns", [])
rows = excel_data.get("rows", [])
if not columns or not rows:
return {
"success": False,
"error": "没有数据可用于分析"
}
# 转换为 DataFrame
df = pd.DataFrame(rows, columns=columns)
# 根据列类型分类
numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_columns = df.select_dtypes(exclude=[np.number]).columns.tolist()
# 生成统计信息
statistics = self._generate_statistics(df, numeric_columns, categorical_columns)
# 生成图表
charts = self._generate_charts(df, numeric_columns, categorical_columns)
# 生成数据分布信息
distributions = self._generate_distributions(df, categorical_columns)
return {
"success": True,
"statistics": statistics,
"charts": charts,
"distributions": distributions,
"row_count": len(df),
"column_count": len(columns)
}
except Exception as e:
logger.error(f"可视化分析失败: {str(e)}", exc_info=True)
return {
"success": False,
"error": str(e)
}
def _generate_statistics(
self,
df: pd.DataFrame,
numeric_columns: List[str],
categorical_columns: List[str]
) -> Dict[str, Any]:
"""生成统计信息"""
statistics = {
"numeric": {},
"categorical": {}
}
# 数值型列统计
for col in numeric_columns:
try:
stats = {
"count": int(df[col].count()),
"mean": float(df[col].mean()),
"median": float(df[col].median()),
"std": float(df[col].std()) if df[col].count() > 1 else 0,
"min": float(df[col].min()),
"max": float(df[col].max()),
"q25": float(df[col].quantile(0.25)),
"q75": float(df[col].quantile(0.75)),
"missing": int(df[col].isna().sum())
}
statistics["numeric"][col] = stats
except Exception as e:
logger.warning(f"{col} 统计失败: {str(e)}")
# 分类型列统计
for col in categorical_columns:
try:
value_counts = df[col].value_counts()
stats = {
"unique": int(df[col].nunique()),
"most_common": str(value_counts.index[0]) if len(value_counts) > 0 else "",
"most_common_count": int(value_counts.iloc[0]) if len(value_counts) > 0 else 0,
"missing": int(df[col].isna().sum()),
"distribution": {str(k): int(v) for k, v in value_counts.items()}
}
statistics["categorical"][col] = stats
except Exception as e:
logger.warning(f"{col} 统计失败: {str(e)}")
return statistics
def _generate_charts(
self,
df: pd.DataFrame,
numeric_columns: List[str],
categorical_columns: List[str]
) -> Dict[str, Any]:
"""生成图表"""
charts = {}
# 1. 数值型列的直方图
charts["histograms"] = []
for col in numeric_columns[:5]: # 限制最多 5 个数值列
chart_data = self._create_histogram(df[col], col)
if chart_data:
charts["histograms"].append(chart_data)
# 2. 分类型列的条形图
charts["bar_charts"] = []
for col in categorical_columns[:5]: # 限制最多 5 个分类型列
chart_data = self._create_bar_chart(df[col], col)
if chart_data:
charts["bar_charts"].append(chart_data)
# 3. 数值型列的箱线图
charts["box_plots"] = []
if len(numeric_columns) > 0:
chart_data = self._create_box_plot(df[numeric_columns[:5]], numeric_columns[:5])
if chart_data:
charts["box_plots"].append(chart_data)
# 4. 相关性热力图
if len(numeric_columns) >= 2:
chart_data = self._create_correlation_heatmap(df[numeric_columns], numeric_columns)
if chart_data:
charts["correlation"] = chart_data
return charts
def _create_histogram(self, series: pd.Series, column_name: str) -> Optional[Dict[str, Any]]:
"""创建直方图"""
try:
fig, ax = plt.subplots(figsize=(11, 7))
ax.hist(series.dropna(), bins=20, edgecolor='black', alpha=0.7, color='#3b82f6')
ax.set_xlabel(column_name, fontsize=10, labelpad=10)
ax.set_ylabel('频数', fontsize=10, labelpad=10)
ax.set_title(f'{column_name} 分布', fontsize=12, fontweight='bold', pad=15)
ax.grid(True, alpha=0.3, axis='y')
ax.tick_params(axis='both', which='major', labelsize=9)
# 改进布局
plt.tight_layout(pad=1.5, w_pad=1.0, h_pad=1.0)
# 转换为 base64
img_base64 = self._figure_to_base64(fig)
return {
"type": "histogram",
"column": column_name,
"image": img_base64,
"stats": {
"mean": float(series.mean()),
"median": float(series.median()),
"std": float(series.std()) if len(series) > 1 else 0
}
}
except Exception as e:
logger.error(f"创建直方图失败 ({column_name}): {str(e)}")
return None
def _create_bar_chart(self, series: pd.Series, column_name: str) -> Optional[Dict[str, Any]]:
    """Render a bar chart of the ten most frequent values in a column.

    Returns a dict with the base64 chart image and the raw category counts,
    or None if rendering fails (the error is logged, not raised).
    """
    try:
        counts = series.value_counts().head(10)  # top 10 categories only

        def shorten(raw) -> str:
            # Trim labels longer than 15 chars so tick text stays readable.
            text = str(raw)
            return text if len(text) <= 15 else text[:15] + '...'

        positions = range(len(counts))

        fig, ax = plt.subplots(figsize=(12, 7))
        bars = ax.bar(positions, counts.values, color='#10b981', alpha=0.8,
                      edgecolor='black', linewidth=0.5)

        ax.set_xticks(positions)
        ax.set_xticklabels([shorten(v) for v in counts.index],
                           rotation=30, ha='right', fontsize=8)
        ax.set_xlabel(column_name, fontsize=10, labelpad=10)
        ax.set_ylabel('数量', fontsize=10, labelpad=10)
        ax.set_title(f'{column_name} 分布 (Top 10)', fontsize=12, fontweight='bold', pad=15)
        ax.grid(True, alpha=0.3, axis='y')
        ax.tick_params(axis='both', which='major', labelsize=9)

        # Print each bar's count slightly above its top edge.
        tallest = counts.values.max()
        lift = tallest * 0.02 if tallest > 0 else 0.5
        for rect, count in zip(bars, counts.values):
            ax.text(rect.get_x() + rect.get_width() / 2., count + lift,
                    f'{int(count)}',
                    ha='center', va='bottom', fontsize=8, fontweight='bold')

        plt.tight_layout(pad=1.5, w_pad=1.0, h_pad=1.0)

        return {
            "type": "bar_chart",
            "column": column_name,
            "image": self._figure_to_base64(fig),
            "categories": {str(k): int(v) for k, v in counts.items()},
        }
    except Exception as e:
        logger.error(f"创建条形图失败 ({column_name}): {str(e)}")
        return None
def _create_box_plot(self, df: pd.DataFrame, columns: List[str]) -> Optional[Dict[str, Any]]:
    """Draw side-by-side box plots for the given numeric columns.

    Returns a dict with the base64 chart image, or None if rendering fails
    (the error is logged, not raised).
    """
    try:
        fig, ax = plt.subplots(figsize=(14, 7))

        # One NaN-free sample per column.
        samples = [df[name].dropna() for name in columns]
        plot = ax.boxplot(samples, labels=columns, patch_artist=True,
                          notch=True, showcaps=True, showfliers=True)

        # Fill each box from a fixed palette (zip truncates to the shorter).
        palette = ['#3b82f6', '#10b981', '#f59e0b', '#ef4444', '#8b5cf6']
        for box, fill in zip(plot['boxes'], palette):
            box.set_facecolor(fill)
            box.set_alpha(0.6)
            box.set_linewidth(1.5)

        # Thicken the remaining line elements for visibility.
        for part in ('whiskers', 'fliers', 'means', 'medians', 'caps'):
            plt.setp(plot[part], linewidth=1.5)

        ax.set_ylabel('', fontsize=10, labelpad=10)
        ax.set_title('数值型列分布对比', fontsize=12, fontweight='bold', pad=15)
        ax.grid(True, alpha=0.3, axis='y')
        # Rotate x tick labels so long column names do not overlap.
        plt.setp(ax.get_xticklabels(), rotation=30, ha='right', fontsize=9)
        ax.tick_params(axis='both', which='major', labelsize=9)
        plt.tight_layout(pad=1.5, w_pad=1.5, h_pad=1.0)

        return {
            "type": "box_plot",
            "columns": columns,
            "image": self._figure_to_base64(fig),
        }
    except Exception as e:
        logger.error(f"创建箱线图失败: {str(e)}")
        return None
def _create_correlation_heatmap(self, df: pd.DataFrame, columns: List[str]) -> Optional[Dict[str, Any]]:
    """Render a correlation heatmap over the given numeric columns.

    Returns a dict with the base64 chart image and the full correlation
    matrix, or None if rendering fails (the error is logged, not raised).
    """
    try:
        matrix = df.corr()
        size = len(matrix)

        fig, ax = plt.subplots(figsize=(11, 9))
        image = ax.imshow(matrix, cmap='RdBu_r', aspect='auto', vmin=-1, vmax=1)

        # Truncate long column names; the same labels serve both axes.
        tick_labels = [
            str(name) if len(str(name)) <= 10 else str(name)[:10] + '...'
            for name in matrix.columns
        ]
        ax.set_xticks(np.arange(size))
        ax.set_yticks(np.arange(size))
        ax.set_xticklabels(tick_labels, rotation=30, ha='right', fontsize=9)
        ax.set_yticklabels(tick_labels, fontsize=9)

        # Annotate every cell, choosing a text color that contrasts with
        # the cell fill and bolding strong correlations.
        for row in range(size):
            for col in range(size):
                value = matrix.iloc[row, col]
                ax.text(col, row, f'{value:.2f}',
                        ha="center", va="center",
                        color='white' if abs(value) > 0.5 else 'black',
                        fontsize=8,
                        fontweight='bold' if abs(value) > 0.7 else 'normal')

        ax.set_title('数值型列相关性热力图', fontsize=12, fontweight='bold', pad=15)
        ax.tick_params(axis='both', which='major', labelsize=9)

        colorbar = plt.colorbar(image, ax=ax)
        colorbar.set_label('相关系数', rotation=270, labelpad=20, fontsize=10)
        colorbar.ax.tick_params(labelsize=9)

        plt.tight_layout(pad=2.0, w_pad=1.0, h_pad=1.0)

        return {
            "type": "correlation_heatmap",
            "columns": columns,
            "image": self._figure_to_base64(fig),
            "correlation_matrix": matrix.to_dict(),
        }
    except Exception as e:
        logger.error(f"创建相关性热力图失败: {str(e)}")
        return None
def _generate_distributions(
self,
df: pd.DataFrame,
categorical_columns: List[str]
) -> Dict[str, Any]:
"""生成数据分布信息"""
distributions = {}
for col in categorical_columns[:5]:
try:
value_counts = df[col].value_counts()
total = len(df)
distributions[col] = {
"categories": {str(k): int(v) for k, v in value_counts.items()},
"percentages": {str(k): round(v / total * 100, 2) for k, v in value_counts.items()},
"unique_count": len(value_counts)
}
except Exception as e:
logger.warning(f"{col} 分布生成失败: {str(e)}")
return distributions
def _figure_to_base64(self, fig) -> str:
    """Serialize a matplotlib figure to a PNG data-URI string.

    Args:
        fig: The matplotlib Figure to render. It is always closed here,
            so callers must not reuse it after this call.

    Returns:
        A `data:image/png;base64,...` string suitable for an <img> src.
    """
    buf = io.BytesIO()
    try:
        fig.savefig(
            buf,
            format='png',
            dpi=120,
            bbox_inches='tight',
            pad_inches=0.3,
            facecolor='white',
            edgecolor='none',
            transparent=False
        )
    finally:
        # Close even when savefig raises: matplotlib keeps every open
        # figure registered, so a failed render would otherwise leak it.
        plt.close(fig)
    buf.seek(0)
    img_base64 = base64.b64encode(buf.read()).decode('utf-8')
    return f"data:image/png;base64,{img_base64}"
# Global singleton: module-level service instance shared by all importers.
visualization_service = VisualizationService()