前后端基本架构和完整excel表的解析及统计图表的生成以及excel表的导出
This commit is contained in:
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
backend/app/services/__pycache__/file_service.cpython-312.pyc
Normal file
BIN
backend/app/services/__pycache__/file_service.cpython-312.pyc
Normal file
Binary file not shown.
BIN
backend/app/services/__pycache__/font_helper.cpython-312.pyc
Normal file
BIN
backend/app/services/__pycache__/font_helper.cpython-312.pyc
Normal file
Binary file not shown.
BIN
backend/app/services/__pycache__/font_helper.cpython-313.pyc
Normal file
BIN
backend/app/services/__pycache__/font_helper.cpython-313.pyc
Normal file
Binary file not shown.
BIN
backend/app/services/__pycache__/llm_service.cpython-312.pyc
Normal file
BIN
backend/app/services/__pycache__/llm_service.cpython-312.pyc
Normal file
Binary file not shown.
BIN
backend/app/services/__pycache__/llm_service.cpython-313.pyc
Normal file
BIN
backend/app/services/__pycache__/llm_service.cpython-313.pyc
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
349
backend/app/services/chart_generator_service.py
Normal file
349
backend/app/services/chart_generator_service.py
Normal file
@@ -0,0 +1,349 @@
|
||||
"""
|
||||
图表生成服务 - 根据结构化数据生成图表
|
||||
"""
|
||||
import io
|
||||
import base64
|
||||
import logging
|
||||
from typing import Dict, Any, List, Optional
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
import matplotlib
|
||||
import numpy as np
|
||||
|
||||
# 使用字体辅助模块配置中文字体
|
||||
from app.services.font_helper import configure_matplotlib_fonts
|
||||
|
||||
configure_matplotlib_fonts()
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ChartGeneratorService:
    """图表生成服务类

    Turns the structured data extracted from an AI analysis result into
    matplotlib charts, returned as base64-encoded PNG data URIs so the
    frontend can embed them directly.
    """

    def __init__(self):
        # Chart output directory: <backend>/data/charts, created eagerly.
        self.output_dir = Path(__file__).resolve().parent.parent.parent / "data" / "charts"
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def generate_charts_from_analysis(
        self,
        structured_data: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Generate charts from the extracted structured data.

        Args:
            structured_data: structured data extracted from the AI analysis
                result; expected keys: "success", "data", "error".

        Returns:
            Dict[str, Any]: on success {"success": True, "charts": ...,
            "statistics": ..., "metadata": ..., "data_source": "ai_analysis"};
            on failure {"success": False, "error": ...}.
        """
        if not structured_data.get("success"):
            return {
                "success": False,
                "error": structured_data.get("error", "数据提取失败")
            }

        data = structured_data.get("data", {})
        charts: Dict[str, Any] = {}
        statistics: Dict[str, Any] = {}

        try:
            # 1. Numeric data charts
            numeric_data = data.get("numeric_data", [])
            if numeric_data:
                charts["numeric_charts"] = self._create_numeric_charts(numeric_data)
                statistics["numeric_summary"] = self._create_numeric_summary(numeric_data)

            # 2. Categorical data charts
            categorical_data = data.get("categorical_data", [])
            if categorical_data:
                charts["categorical_charts"] = self._create_categorical_charts(categorical_data)

            # 3. Time-series chart
            time_series_data = data.get("time_series_data", [])
            if time_series_data:
                charts["time_series_chart"] = self._create_time_series_chart(time_series_data)

            # 4. Comparison chart
            comparison_data = data.get("comparison_data", [])
            if comparison_data:
                charts["comparison_chart"] = self._create_comparison_chart(comparison_data)

            # 5. Table preview
            table_data = data.get("table_data")
            if table_data:
                charts["table_preview"] = self._create_table_preview(table_data)

            metadata = data.get("metadata", {})

            return {
                "success": True,
                "charts": charts,
                "statistics": statistics,
                "metadata": metadata,
                "data_source": "ai_analysis"
            }

        except Exception as e:
            logger.error(f"生成图表失败: {str(e)}", exc_info=True)
            return {
                "success": False,
                "error": str(e)
            }

    def _create_numeric_charts(self, numeric_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Create a bar chart (and a pie chart when sensible) for numeric items.

        Each item is expected to look like {"name": str, "value": number}.
        Chart failures are logged and skipped so one bad chart does not sink
        the whole response.
        """
        charts: List[Dict[str, Any]] = []

        # Extract labels and values (fall back to positional names/zero)
        names = [item.get("name", f"项{i}") for i, item in enumerate(numeric_data)]
        values = [item.get("value", 0) for item in numeric_data]

        if not values:
            return charts

        # 1. Bar chart
        try:
            fig, ax = plt.subplots(figsize=(12, 7))
            colors = plt.cm.Set3(np.linspace(0, 1, len(values)))
            bars = ax.bar(names, values, color=colors, alpha=0.8, edgecolor='black', linewidth=0.5)

            # Value labels above each bar
            for bar, value in zip(bars, values):
                height = bar.get_height()
                ax.text(bar.get_x() + bar.get_width() / 2., height,
                        f'{value:,.0f}',
                        ha='center', va='bottom', fontsize=9, fontweight='bold')

            ax.set_xlabel('项目', fontsize=10, labelpad=10, fontweight='bold')
            ax.set_ylabel('数值', fontsize=10, labelpad=10, fontweight='bold')
            ax.set_title('数值型数据对比', fontsize=12, fontweight='bold', pad=15)
            # FIX: set_xticklabels without fixing the tick locations first is
            # rejected/mislabelled by recent matplotlib versions.
            ax.set_xticks(range(len(names)))
            ax.set_xticklabels(names, rotation=30, ha='right', fontsize=9)
            ax.tick_params(axis='both', which='major', labelsize=9)
            plt.grid(axis='y', alpha=0.3)
            plt.tight_layout(pad=1.5)

            img_base64 = self._figure_to_base64(fig)
            charts.append({
                "type": "bar",
                "title": "数值型数据对比",
                "image": img_base64,
                "data": [{"name": n, "value": v} for n, v in zip(names, values)]
            })
        except Exception as e:
            logger.error(f"创建柱状图失败: {str(e)}")

        # 2. Pie chart — only for small, strictly meaningful distributions.
        # FIX: matplotlib's pie() raises on negative values and produces
        # nothing useful when everything is zero, so guard instead of
        # relying on the except block.
        if (0 < len(values) <= 10
                and all(v >= 0 for v in values)
                and sum(values) > 0):
            try:
                fig, ax = plt.subplots(figsize=(10, 10))
                wedges, texts, autotexts = ax.pie(values, labels=names, autopct='%1.1f%%',
                                                  startangle=90, colors=plt.cm.Set3.colors[:len(values)])

                for autotext in autotexts:
                    autotext.set_color('white')
                    autotext.set_fontsize(9)
                    autotext.set_fontweight('bold')

                ax.set_title('数值型数据占比', fontsize=12, fontweight='bold', pad=15)

                img_base64 = self._figure_to_base64(fig)
                charts.append({
                    "type": "pie",
                    "title": "数值型数据占比",
                    "image": img_base64,
                    "data": [{"name": n, "value": v} for n, v in zip(names, values)]
                })
            except Exception as e:
                logger.error(f"创建饼图失败: {str(e)}")

        return charts

    def _create_categorical_charts(self, categorical_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Create a horizontal bar chart for categorical counts.

        Each item is expected to look like {"name": str, "count": int}.
        """
        charts: List[Dict[str, Any]] = []

        names = [item.get("name", f"类{i}") for i, item in enumerate(categorical_data)]
        counts = [item.get("count", 1) for item in categorical_data]

        if not names or not counts:
            return charts

        try:
            # Height grows with the number of categories so labels stay legible
            fig, ax = plt.subplots(figsize=(10, max(6, len(names) * 0.8)))
            y_pos = np.arange(len(names))

            bars = ax.barh(y_pos, counts, align='center', color='#10b981', alpha=0.8, edgecolor='black', linewidth=0.5)

            # Count labels at the end of each bar
            for bar, count in zip(bars, counts):
                width = bar.get_width()
                ax.text(width, bar.get_y() + bar.get_height() / 2.,
                        f'{count}',
                        ha='left', va='center', fontsize=10, fontweight='bold')

            ax.set_yticks(y_pos)
            ax.set_yticklabels(names, fontsize=10)
            ax.invert_yaxis()  # largest/first category on top
            ax.set_xlabel('数量', fontsize=10, labelpad=10, fontweight='bold')
            ax.set_title('分类数据分布', fontsize=12, fontweight='bold', pad=15)
            ax.tick_params(axis='both', which='major', labelsize=9)
            ax.grid(axis='x', alpha=0.3)
            plt.tight_layout(pad=1.5)

            img_base64 = self._figure_to_base64(fig)
            charts.append({
                "type": "barh",
                "title": "分类数据分布",
                "image": img_base64,
                "data": [{"name": n, "count": c} for n, c in zip(names, counts)]
            })
        except Exception as e:
            logger.error(f"创建分类图表失败: {str(e)}")

        return charts

    def _create_time_series_chart(self, time_series_data: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
        """Create a combined bar + trend-line chart for time-series items.

        Returns None when fewer than two points are available or on failure.
        """
        if not time_series_data:
            return None

        try:
            names = [item.get("name", f"时间{i}") for i, item in enumerate(time_series_data)]
            values = [item.get("value", 0) for item in time_series_data]

            # A "series" of one point has no trend to draw
            if len(values) < 2:
                return None

            fig, ax = plt.subplots(figsize=(14, 7))

            x_pos = np.arange(len(names))
            bars = ax.bar(x_pos, values, width=0.4, label='数值', color='#3b82f6', alpha=0.7)

            # Overlay a trend line on the same axis
            line = ax.plot(x_pos, values, 'o-', color='#ef4444', linewidth=2.5, markersize=8, label='趋势')

            ax.set_xticks(x_pos)
            ax.set_xticklabels(names, rotation=30, ha='right', fontsize=9)
            ax.set_ylabel('数值', fontsize=10, labelpad=10, fontweight='bold')
            ax.set_title('时间序列数据', fontsize=12, fontweight='bold', pad=15)
            ax.legend(loc='best', fontsize=9)
            ax.tick_params(axis='both', which='major', labelsize=9)
            ax.grid(True, alpha=0.3)
            plt.tight_layout(pad=1.5)

            img_base64 = self._figure_to_base64(fig)
            return {
                "type": "time_series",
                "title": "时间序列数据",
                "image": img_base64,
                "data": [{"name": n, "value": v} for n, v in zip(names, values)]
            }
        except Exception as e:
            logger.error(f"创建时间序列图表失败: {str(e)}")
            return None

    def _create_comparison_chart(self, comparison_data: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
        """Create a signed bar chart (green positive / red negative).

        Returns None when there is nothing to plot or on failure.
        """
        if not comparison_data:
            return None

        try:
            names = [item.get("name", f"对比{i}") for i, item in enumerate(comparison_data)]
            values = [item.get("value", 0) for item in comparison_data]

            fig, ax = plt.subplots(figsize=(10, 7))

            # Color encodes sign
            colors = ['#10b981' if v >= 0 else '#ef4444' for v in values]
            bars = ax.bar(names, values, color=colors, alpha=0.8, edgecolor='black', linewidth=0.8)

            # Value labels: above positive bars, below negative ones
            for bar, value in zip(bars, values):
                height = bar.get_height()
                ax.text(bar.get_x() + bar.get_width() / 2., height,
                        f'{value:,.1f}',
                        ha='center', va='bottom' if value >= 0 else 'top',
                        fontsize=10, fontweight='bold')

            # Zero baseline
            ax.axhline(y=0, color='black', linestyle='-', linewidth=1)

            ax.set_ylabel('值', fontsize=10, labelpad=10, fontweight='bold')
            ax.set_title('对比数据', fontsize=12, fontweight='bold', pad=15)
            # FIX: same set_xticks-before-set_xticklabels issue as the bar chart.
            ax.set_xticks(range(len(names)))
            ax.set_xticklabels(names, rotation=30, ha='right', fontsize=9)
            ax.tick_params(axis='both', which='major', labelsize=9)
            plt.grid(axis='y', alpha=0.3)
            plt.tight_layout(pad=1.5)

            img_base64 = self._figure_to_base64(fig)
            return {
                "type": "comparison",
                "title": "对比数据",
                "image": img_base64,
                "data": [{"name": n, "value": v} for n, v in zip(names, values)]
            }
        except Exception as e:
            logger.error(f"创建对比图表失败: {str(e)}")
            return None

    def _create_table_preview(self, table_data: Dict[str, Any]) -> Dict[str, Any]:
        """Build a truncated (first 50 rows) preview of tabular data."""
        if not table_data:
            return {}

        columns = table_data.get("columns", [])
        rows = table_data.get("rows", [])

        return {
            "columns": columns,
            "rows": rows[:50],  # cap the payload at 50 preview rows
            "total_rows": len(rows),
            "preview_rows": min(50, len(rows))
        }

    def _create_numeric_summary(self, numeric_data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Compute descriptive statistics over the numeric items.

        Non-numeric "value" entries are ignored; returns {} when nothing
        numeric remains.
        """
        values = [item.get("value", 0) for item in numeric_data if isinstance(item.get("value"), (int, float))]

        if not values:
            return {}

        return {
            "count": len(values),
            "sum": float(sum(values)),
            "mean": float(np.mean(values)),
            "median": float(np.median(values)),
            "min": float(min(values)),
            "max": float(max(values)),
            # std of a single sample is meaningless; report 0
            "std": float(np.std(values)) if len(values) > 1 else 0
        }

    def _figure_to_base64(self, fig) -> str:
        """Render a matplotlib figure to a PNG data URI and close it."""
        buf = io.BytesIO()
        fig.savefig(
            buf,
            format='png',
            dpi=120,
            bbox_inches='tight',
            pad_inches=0.3,
            facecolor='white',
            edgecolor='none',
            transparent=False
        )
        # Always close the figure to avoid leaking pyplot state
        plt.close(fig)
        buf.seek(0)
        img_base64 = base64.b64encode(buf.read()).decode('utf-8')
        return f"data:image/png;base64,{img_base64}"
|
||||
|
||||
|
||||
# 全局单例
|
||||
chart_generator_service = ChartGeneratorService()
|
||||
253
backend/app/services/excel_ai_service.py
Normal file
253
backend/app/services/excel_ai_service.py
Normal file
@@ -0,0 +1,253 @@
|
||||
"""
|
||||
Excel AI 分析服务 - 集成 Excel 解析和 LLM 分析
|
||||
"""
|
||||
import logging
|
||||
from typing import Dict, Any, Optional, List
|
||||
|
||||
from app.core.document_parser import XlsxParser
|
||||
from app.services.file_service import file_service
|
||||
from app.services.llm_service import llm_service
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ExcelAIService:
    """Excel AI analysis service.

    Orchestrates the full pipeline: persist the uploaded file, parse it
    with XlsxParser, then hand the parsed data to the LLM service for
    analysis. All failures are converted into structured error dicts so
    the API layer never has to catch here.
    """

    def __init__(self):
        self.parser = XlsxParser()
        self.file_service = file_service
        self.llm_service = llm_service

    async def analyze_excel_file(
        self,
        file_content: bytes,
        filename: str,
        user_prompt: str = "",
        analysis_type: str = "general",
        parse_options: Optional[Dict[str, Any]] = None
    ) -> Dict[str, Any]:
        """Analyse an Excel file (first/selected sheet).

        Args:
            file_content: raw file bytes.
            filename: original file name (used for the saved copy's suffix).
            user_prompt: custom prompt; when non-empty the template-based
                analysis path is used instead of the typed one.
            analysis_type: analysis type passed to the LLM service.
            parse_options: optional kwargs forwarded to XlsxParser.parse.

        Returns:
            Dict[str, Any]: {"success": bool, "excel": {...}, "analysis": ...}
            on success; {"success": False, "error": ..., "analysis": None}
            when saving/parsing fails.
        """
        # 1. Persist the upload
        try:
            saved_path = self.file_service.save_uploaded_file(
                file_content,
                filename,
                subfolder="excel"
            )
            logger.info(f"文件已保存: {saved_path}")
        except Exception as e:
            logger.error(f"文件保存失败: {str(e)}")
            return {
                "success": False,
                "error": f"文件保存失败: {str(e)}",
                "analysis": None
            }

        # 2. Parse the Excel file
        try:
            parse_options = parse_options or {}
            parse_result = self.parser.parse(saved_path, **parse_options)

            if not parse_result.success:
                return {
                    "success": False,
                    "error": parse_result.error,
                    "analysis": None
                }

            excel_data = parse_result.data
            logger.info(f"Excel 解析成功: {parse_result.metadata}")

        except Exception as e:
            logger.error(f"Excel 解析失败: {str(e)}")
            return {
                "success": False,
                "error": f"Excel 解析失败: {str(e)}",
                "analysis": None
            }

        # 3. Run the LLM analysis
        try:
            # A non-blank custom prompt selects the template-based path
            if user_prompt and user_prompt.strip():
                llm_result = await self.llm_service.analyze_with_template(
                    excel_data,
                    user_prompt
                )
            else:
                llm_result = await self.llm_service.analyze_excel_data(
                    excel_data,
                    user_prompt,
                    analysis_type
                )

            logger.info(f"AI 分析完成: {llm_result['success']}")

            # 4. Combine parse + analysis results
            return {
                "success": True,
                "excel": {
                    "data": excel_data,
                    "metadata": parse_result.metadata,
                    "saved_path": saved_path
                },
                "analysis": llm_result
            }

        except Exception as e:
            # Parsing succeeded, so still surface the parsed data
            logger.error(f"AI 分析失败: {str(e)}")
            return {
                "success": False,
                "error": f"AI 分析失败: {str(e)}",
                "excel": {
                    "data": excel_data,
                    "metadata": parse_result.metadata
                },
                "analysis": None
            }

    async def batch_analyze_sheets(
        self,
        file_content: bytes,
        filename: str,
        user_prompt: str = "",
        analysis_type: str = "general"
    ) -> Dict[str, Any]:
        """Analyse every worksheet of an Excel file.

        Args:
            file_content: raw file bytes.
            filename: original file name.
            user_prompt: custom prompt (template path when non-empty).
            analysis_type: analysis type passed to the LLM service.

        Returns:
            Dict[str, Any]: per-sheet analyses plus error map; "success" is
            True only when every sheet analysed cleanly.
        """
        # 1. Persist the upload
        try:
            saved_path = self.file_service.save_uploaded_file(
                file_content,
                filename,
                subfolder="excel"
            )
            logger.info(f"文件已保存: {saved_path}")
        except Exception as e:
            logger.error(f"文件保存失败: {str(e)}")
            return {
                "success": False,
                "error": f"文件保存失败: {str(e)}",
                "analysis": None
            }

        # 2. Parse all worksheets
        try:
            parse_result = self.parser.parse_all_sheets(saved_path)

            if not parse_result.success:
                return {
                    "success": False,
                    "error": parse_result.error,
                    "analysis": None
                }

            sheets_data = parse_result.data.get("sheets", {})
            logger.info(f"Excel 解析成功,共 {len(sheets_data)} 个工作表")

        except Exception as e:
            logger.error(f"Excel 解析失败: {str(e)}")
            return {
                "success": False,
                "error": f"Excel 解析失败: {str(e)}",
                "analysis": None
            }

        # 3. Analyse each worksheet independently; one failure does not
        #    abort the remaining sheets.
        sheet_analyses = {}
        errors = {}

        for sheet_name, sheet_data in sheets_data.items():
            try:
                if user_prompt and user_prompt.strip():
                    llm_result = await self.llm_service.analyze_with_template(
                        sheet_data,
                        user_prompt
                    )
                else:
                    llm_result = await self.llm_service.analyze_excel_data(
                        sheet_data,
                        user_prompt,
                        analysis_type
                    )

                sheet_analyses[sheet_name] = llm_result

                if not llm_result["success"]:
                    errors[sheet_name] = llm_result.get("error", "未知错误")

                logger.info(f"工作表 '{sheet_name}' 分析完成")

            except Exception as e:
                logger.error(f"工作表 '{sheet_name}' 分析失败: {str(e)}")
                errors[sheet_name] = str(e)

        # 4. Combine results
        return {
            "success": len(errors) == 0,
            "excel": {
                "sheets": sheets_data,
                "metadata": parse_result.metadata,
                "saved_path": saved_path
            },
            "analysis": {
                "sheets": sheet_analyses,
                "total_sheets": len(sheets_data),
                "successful": len(sheet_analyses) - len(errors),
                "errors": errors
            }
        }

    def get_supported_analysis_types(self) -> List[Dict[str, str]]:
        """Return the supported analysis types.

        FIX: the previous annotation said List[str], but the method returns
        a list of {"value", "label", "description"} dicts.
        """
        return [
            {
                "value": "general",
                "label": "综合分析",
                "description": "提供数据概览、关键发现、质量评估和建议"
            },
            {
                "value": "summary",
                "label": "数据摘要",
                "description": "快速了解数据的结构、范围和主要内容"
            },
            {
                "value": "statistics",
                "label": "统计分析",
                "description": "数值型列的统计信息和分类列的分布"
            },
            {
                "value": "insights",
                "label": "深度洞察",
                "description": "深入挖掘数据,提供异常值和业务建议"
            }
        ]
|
||||
|
||||
|
||||
# 全局单例
|
||||
excel_ai_service = ExcelAIService()
|
||||
132
backend/app/services/file_service.py
Normal file
132
backend/app/services/file_service.py
Normal file
@@ -0,0 +1,132 @@
|
||||
"""
|
||||
文件服务模块 - 处理文件存储和读取
|
||||
"""
|
||||
import os
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
import uuid
|
||||
|
||||
from app.config import settings
|
||||
|
||||
|
||||
class FileService:
    """File service: storing, reading and managing uploaded files.

    All paths are handled through pathlib; uploads are renamed with a
    random hex stem so concurrent uploads of identically named files
    never collide.
    """

    def __init__(self):
        self.upload_dir = Path(settings.UPLOAD_DIR)
        self._ensure_upload_dir()

    def _ensure_upload_dir(self):
        """Create the upload directory (and parents) if missing."""
        self.upload_dir.mkdir(parents=True, exist_ok=True)

    def save_uploaded_file(
        self,
        file_content: bytes,
        filename: str,
        subfolder: Optional[str] = None
    ) -> str:
        """Persist uploaded bytes under a collision-free random name.

        Args:
            file_content: raw file bytes.
            filename: original file name; only its suffix is kept.
            subfolder: optional sub-directory under the upload root.

        Returns:
            str: absolute/relative path of the stored file.
        """
        # Random hex stem + original suffix avoids overwriting uploads
        unique_name = f"{uuid.uuid4().hex}{Path(filename).suffix}"

        if subfolder:
            target_dir = self.upload_dir / subfolder
            target_dir.mkdir(parents=True, exist_ok=True)
        else:
            target_dir = self.upload_dir

        destination = target_dir / unique_name
        destination.write_bytes(file_content)
        return str(destination)

    def read_file(self, file_path: str) -> bytes:
        """Read and return the entire file content as bytes.

        Args:
            file_path: path of the file to read.

        Returns:
            bytes: the file's content.
        """
        return Path(file_path).read_bytes()

    def delete_file(self, file_path: str) -> bool:
        """Best-effort file deletion.

        Args:
            file_path: path of the file to delete.

        Returns:
            bool: True if the file existed and was removed, False otherwise
            (missing file or any OS error).
        """
        try:
            target = Path(file_path)
            if not target.exists():
                return False
            target.unlink()
            return True
        except Exception:
            # Deliberately best-effort: deletion failures are not fatal
            return False

    def get_file_info(self, file_path: str) -> dict:
        """Collect basic metadata about a file.

        Args:
            file_path: path of the file.

        Returns:
            dict: name, path, size, created/modified ISO timestamps and
            lower-cased extension; {} when the file does not exist.
        """
        target = Path(file_path)
        if not target.exists():
            return {}

        meta = target.stat()
        return {
            "filename": target.name,
            "filepath": str(target),
            "size": meta.st_size,
            "created": datetime.fromtimestamp(meta.st_ctime).isoformat(),
            "modified": datetime.fromtimestamp(meta.st_mtime).isoformat(),
            "extension": target.suffix.lower()
        }

    def get_file_size(self, file_path: str) -> int:
        """Return the file size in bytes, or 0 when the file is missing.

        Args:
            file_path: path of the file.

        Returns:
            int: size in bytes; 0 if the file does not exist.
        """
        target = Path(file_path)
        if not target.exists():
            return 0
        return target.stat().st_size
|
||||
|
||||
|
||||
# 全局单例
|
||||
file_service = FileService()
|
||||
105
backend/app/services/font_helper.py
Normal file
105
backend/app/services/font_helper.py
Normal file
@@ -0,0 +1,105 @@
|
||||
"""
|
||||
字体辅助模块 - 处理中文字体检测和配置
|
||||
"""
|
||||
import matplotlib
|
||||
import matplotlib.font_manager as fm
|
||||
import platform
|
||||
import os
|
||||
from pathlib import Path
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def get_chinese_font() -> str:
    """Pick a Chinese-capable font installed on this machine.

    Probes an OS-specific priority list first, then a generic fallback
    list, then any installed font whose name mentions CJK/SC/TC, and
    finally falls back to matplotlib's generic 'sans-serif'.

    Returns:
        str: the name of a usable Chinese font.
    """
    # Names of every font matplotlib can see on this system
    installed = {font.name for font in fm.fontManager.ttflist}

    system = platform.system()
    if system == 'Windows':
        candidates = [
            'Microsoft YaHei',
            'SimHei',
            'SimSun',
            'KaiTi',
            'FangSong',
            'STXihei',
            'STKaiti',
            'STSong',
            'STFangsong',
        ]
    elif system == 'Darwin':  # macOS
        candidates = [
            'PingFang SC',
            'PingFang TC',
            'Heiti SC',
            'Heiti TC',
            'STHeiti',
            'STSong',
            'STKaiti',
            'Arial Unicode MS',
        ]
    else:  # Linux and everything else
        candidates = [
            'Noto Sans CJK SC',
            'WenQuanYi Micro Hei',
            'AR PL UMing CN',
            'AR PL UKai CN',
            'ZCOOL XiaoWei',
        ]

    # Cross-platform fallbacks tried after the OS-specific list
    candidates += [
        'SimHei',
        'Microsoft YaHei',
        'Arial Unicode MS',
        'Droid Sans Fallback',
    ]

    # First candidate actually present wins
    for candidate in candidates:
        if candidate in installed:
            logger.info(f"找到中文字体: {candidate}")
            return candidate

    # Last-ditch scan: any installed font that looks CJK-capable by name
    for font in fm.fontManager.ttflist:
        if 'CJK' in font.name or 'SC' in font.name or 'TC' in font.name:
            logger.info(f"使用找到的中文字体: {font.name}")
            return font.name

    # Nothing found — let matplotlib use its generic family
    logger.warning("未找到合适的中文字体,使用默认字体")
    return 'sans-serif'
|
||||
|
||||
|
||||
def configure_matplotlib_fonts():
    """Configure matplotlib for Chinese text rendering.

    Selects a Chinese-capable font via get_chinese_font(), then applies
    font family, minus-sign handling, DPI and size defaults in one
    rcParams update.

    Returns:
        str: the font name that was configured.
    """
    chinese_font = get_chinese_font()

    matplotlib.rcParams.update({
        # Font family and the unicode-minus workaround for CJK fonts
        'font.sans-serif': [chinese_font],
        'axes.unicode_minus': False,
        # Render/save resolution
        'figure.dpi': 100,
        'savefig.dpi': 120,
        # Default text sizes
        'font.size': 10,
        'axes.labelsize': 10,
        'axes.titlesize': 11,
        'xtick.labelsize': 9,
        'ytick.labelsize': 9,
        'legend.fontsize': 9,
    })

    logger.info(f"配置完成,使用字体: {chinese_font}")
    return chinese_font
|
||||
268
backend/app/services/llm_service.py
Normal file
268
backend/app/services/llm_service.py
Normal file
@@ -0,0 +1,268 @@
|
||||
"""
|
||||
LLM 服务模块 - 封装大模型 API 调用
|
||||
"""
|
||||
import logging
|
||||
from typing import Dict, Any, List, Optional
|
||||
import httpx
|
||||
|
||||
from app.config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class LLMService:
    """Large-language-model service wrapping an OpenAI-compatible chat API."""

    def __init__(self):
        # Connection settings come from app configuration
        self.api_key = settings.LLM_API_KEY
        self.base_url = settings.LLM_BASE_URL
        self.model_name = settings.LLM_MODEL_NAME

    async def chat(
        self,
        messages: List[Dict[str, str]],
        temperature: float = 0.7,
        max_tokens: Optional[int] = None,
        **kwargs
    ) -> Dict[str, Any]:
        """POST a chat-completions request and return the parsed JSON.

        Args:
            messages: message list, e.g. [{"role": "user", "content": "..."}].
            temperature: sampling temperature (randomness).
            max_tokens: optional generation cap.
            **kwargs: extra payload fields merged into the request body.

        Returns:
            Dict[str, Any]: the raw API response.

        Raises:
            httpx.HTTPStatusError: on non-2xx responses.
            Exception: on any other transport/JSON failure.
        """
        request_headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }

        body: Dict[str, Any] = {
            "model": self.model_name,
            "messages": messages,
            "temperature": temperature,
        }
        if max_tokens:
            body["max_tokens"] = max_tokens
        # Caller-supplied extras may extend/override the payload
        body.update(kwargs)

        try:
            async with httpx.AsyncClient(timeout=60.0) as client:
                response = await client.post(
                    f"{self.base_url}/chat/completions",
                    headers=request_headers,
                    json=body
                )
                response.raise_for_status()
                return response.json()

        except httpx.HTTPStatusError as e:
            logger.error(f"LLM API 请求失败: {e.response.status_code} - {e.response.text}")
            raise
        except Exception as e:
            logger.error(f"LLM API 调用异常: {str(e)}")
            raise

    def extract_message_content(self, response: Dict[str, Any]) -> str:
        """Pull the assistant message text out of an API response.

        Args:
            response: raw API response dict.

        Returns:
            str: the first choice's message content.

        Raises:
            KeyError, IndexError: when the response shape is unexpected.
        """
        try:
            return response["choices"][0]["message"]["content"]
        except (KeyError, IndexError) as e:
            logger.error(f"解析 API 响应失败: {str(e)}")
            raise

    async def analyze_excel_data(
        self,
        excel_data: Dict[str, Any],
        user_prompt: str,
        analysis_type: str = "general"
    ) -> Dict[str, Any]:
        """Analyse parsed Excel data with a typed system prompt.

        Args:
            excel_data: parsed Excel data.
            user_prompt: user-supplied prompt text.
            analysis_type: one of general / summary / statistics / insights.

        Returns:
            Dict[str, Any]: {"success": True, "analysis": str, "model": ...,
            "analysis_type": ...} or {"success": False, "error": ...,
            "analysis": None}.
        """
        conversation = [
            {"role": "system", "content": self._get_system_prompt(analysis_type)},
            {"role": "user", "content": self._format_user_message(excel_data, user_prompt)}
        ]

        try:
            # Low temperature keeps the analysis output stable
            api_response = await self.chat(
                messages=conversation,
                temperature=0.3,
                max_tokens=2000
            )

            return {
                "success": True,
                "analysis": self.extract_message_content(api_response),
                "model": self.model_name,
                "analysis_type": analysis_type
            }

        except Exception as e:
            logger.error(f"Excel 数据分析失败: {str(e)}")
            return {
                "success": False,
                "error": str(e),
                "analysis": None
            }

    def _get_system_prompt(self, analysis_type: str) -> str:
        """Return the system prompt for the analysis type (default: general)."""
        prompts = {
            "general": """你是一个专业的数据分析师。请分析用户提供的 Excel 数据,提供有价值的见解和建议。

请按照以下格式输出:
1. 数据概览
2. 关键发现
3. 数据质量评估
4. 建议

输出语言:中文""",
            "summary": """你是一个专业的数据分析师。请对用户提供的 Excel 数据进行简洁的总结。

输出格式:
- 数据行数和列数
- 主要列的说明
- 数据范围概述

输出语言:中文""",
            "statistics": """你是一个专业的数据分析师。请对用户提供的 Excel 数据进行统计分析。

请分析:
- 数值型列的统计信息(平均值、中位数、最大值、最小值)
- 分类列的分布情况
- 数据相关性

输出语言:中文,使用表格或结构化格式展示""",
            "insights": """你是一个专业的数据分析师。请深入挖掘用户提供的 Excel 数据,提供有价值的洞察。

请分析:
1. 数据中的异常值或特殊模式
2. 数据之间的潜在关联
3. 基于数据的业务建议
4. 数据趋势分析(如适用)

输出语言:中文,提供详细且可操作的建议"""
        }

        return prompts.get(analysis_type, prompts["general"])

    def _format_user_message(self, excel_data: Dict[str, Any], user_prompt: str) -> str:
        """Build the user message: data overview, first rows, user request."""
        columns = excel_data.get("columns", [])
        rows = excel_data.get("rows", [])
        row_count = excel_data.get("row_count", 0)
        column_count = excel_data.get("column_count", 0)

        # Overview header
        data_info = f"""
Excel 数据概览:
- 行数: {row_count}
- 列数: {column_count}
- 列名: {', '.join(columns)}

数据样例(前 5 行):
"""

        # Sample: at most the first five rows
        sample_lines = []
        for index, record in enumerate(rows[:5], 1):
            cells = " | ".join([f"{col}: {record.get(col, '')}" for col in columns])
            sample_lines.append(f"第 {index} 行: {cells}\n")
        data_info += "".join(sample_lines)

        if row_count > 5:
            data_info += f"\n(还有 {row_count - 5} 行数据...)\n"

        # Trailing request: the user's own text, or a default instruction
        if user_prompt and user_prompt.strip():
            data_info += f"\n用户需求:\n{user_prompt}"
        else:
            data_info += "\n用户需求: 请对上述数据进行分析"

        return data_info

    async def analyze_with_template(
        self,
        excel_data: Dict[str, Any],
        template_prompt: str
    ) -> Dict[str, Any]:
        """Analyse parsed Excel data using a caller-supplied prompt template.

        Args:
            excel_data: parsed Excel data.
            template_prompt: custom prompt template text.

        Returns:
            Dict[str, Any]: {"success": True, "analysis": str, "model": ...,
            "is_template": True} or {"success": False, "error": ...,
            "analysis": None}.
        """
        system_prompt = """你是一个专业的数据分析师。请根据用户提供的自定义提示词分析 Excel 数据。

请严格按照用户的要求进行分析,输出清晰、有条理的结果。

输出语言:中文"""

        conversation = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": self._format_user_message(excel_data, template_prompt)}
        ]

        try:
            api_response = await self.chat(
                messages=conversation,
                temperature=0.5,
                max_tokens=3000
            )

            return {
                "success": True,
                "analysis": self.extract_message_content(api_response),
                "model": self.model_name,
                "is_template": True
            }

        except Exception as e:
            logger.error(f"自定义模板分析失败: {str(e)}")
            return {
                "success": False,
                "error": str(e),
                "analysis": None
            }
|
||||
|
||||
|
||||
# 全局单例
|
||||
llm_service = LLMService()
|
||||
218
backend/app/services/text_analysis_service.py
Normal file
218
backend/app/services/text_analysis_service.py
Normal file
@@ -0,0 +1,218 @@
|
||||
"""
|
||||
文本分析服务 - 从 AI 分析结果中提取结构化数据用于可视化
|
||||
"""
|
||||
import logging
|
||||
from typing import Dict, Any, List, Optional
|
||||
import re
|
||||
import json
|
||||
|
||||
from app.services.llm_service import llm_service
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TextAnalysisService:
    """Text analysis service: extracts chart-ready structured data from free-form AI analysis text."""

    def __init__(self):
        # Reuse the module-level LLM client singleton.
        self.llm_service = llm_service

    async def extract_structured_data(
        self,
        analysis_text: str,
        original_filename: str = "",
        file_type: str = "text"
    ) -> Dict[str, Any]:
        """Extract structured data from an AI analysis result via a second LLM pass.

        Args:
            analysis_text: The AI analysis result text to mine for data points.
            original_filename: Original file name (passed to the model as context).
            file_type: File type label (passed to the model as context).

        Returns:
            Dict[str, Any]: The model's structured-extraction JSON on success,
            or ``{"success": False, "error": ...}`` on failure.
        """
        # Cap the input so the request stays within the model's token budget.
        max_text_length = 8000
        truncated_text = analysis_text[:max_text_length]

        system_prompt = """你是一个专业的数据提取助手。你的任务是从AI分析结果中提取结构化数据,用于生成图表。

请按照以下要求提取数据:

1. 数值型数据:
- 提取所有的数值、统计信息、百分比等
- 为每个数值创建一个条目,包含:名称、值、单位(如果有)
- 格式示例:{"name": "销售额", "value": 123456.78, "unit": "元"}

2. 分类数据:
- 提取所有的类别、状态、枚举值等
- 为每个类别创建一个条目,包含:名称、值、数量(如果有)
- 格式示例:{"name": "产品类别", "value": "电子产品", "count": 25}

3. 时间序列数据:
- 提取所有的时间相关数据(年月、季度、日期等)
- 格式示例:{"name": "2025年1月", "value": 12345}

4. 对比数据:
- 提取所有的对比、排名、趋势等数据
- 格式示例:{"name": "同比增长", "value": 15.3, "unit": "%"}

5. 表格数据:
- 如果分析结果中包含表格或列表形式的数据,提取出来
- 格式:{"columns": ["列1", "列2"], "rows": [{"列1": "值1", "列2": "值2"}]}

重要规则:
- 只提取明确提到的数据和数值
- 如果某种类型的数据不存在,返回空数组 []
- 确保所有数值都是有效的数字类型
- 保持数据的原始精度
- 返回的 JSON 必须完整且格式正确
- 表格数据最多提取 20 行

请以 JSON 格式返回,不要添加任何 Markdown 标记或解释文字,只返回纯 JSON:
{
"success": true,
"data": {
"numeric_data": [
{"name": string, "value": number, "unit": string|null}
],
"categorical_data": [
{"name": string, "value": string, "count": number|null}
],
"time_series_data": [
{"name": string, "value": number}
],
"comparison_data": [
{"name": string, "value": number, "unit": string|null}
],
"table_data": {
"columns": string[],
"rows": object[]
} | null
},
"metadata": {
"total_items": number,
"data_types": string[]
}
}"""

        user_message = f"""请从以下 AI 分析结果中提取结构化数据:

原始文件名:{original_filename}
文件类型:{file_type}

AI 分析结果:
{truncated_text}

请按照系统提示的要求提取数据并返回纯 JSON 格式。"""

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message}
        ]

        try:
            logger.info(f"开始提取结构化数据,文本长度: {len(truncated_text)}")

            # Low temperature: extraction should be as deterministic as possible.
            response = await self.llm_service.chat(
                messages=messages,
                temperature=0.1,
                max_tokens=4000
            )

            content = self.llm_service.extract_message_content(response)
            logger.info(f"LLM 返回内容长度: {len(content)}")

            # Parse the (hopefully pure-JSON) response defensively.
            result = self._extract_json_simple(content)

            if not result:
                logger.error("无法从 LLM 响应中提取有效的 JSON")
                return {
                    "success": False,
                    "error": "AI 返回的数据格式不正确或被截断",
                    "raw_content": content[:500]
                }

            logger.info(f"成功提取结构化数据")
            return result

        except Exception as e:
            logger.error(f"提取结构化数据失败: {str(e)}")
            return {
                "success": False,
                "error": str(e)
            }

    def _extract_json_simple(self, content: str) -> Optional[Dict[str, Any]]:
        """Best-effort extraction of one JSON object from an LLM response.

        Strategies, in order: (1) a fenced ```json code block, (2) the first
        balanced top-level ``{...}`` object, (3) parsing the whole string.

        Args:
            content: Raw LLM response text.

        Returns:
            Optional[Dict[str, Any]]: Parsed JSON object, or None on failure.
        """
        try:
            # Method 1: fenced ```json code block.
            # BUGFIX: the original pattern had no capturing group, so
            # .group(1) raised IndexError whenever a fence was present and the
            # broad handler returned None without trying the fallbacks.
            code_block_match = re.search(r'```json\s*(\{[\s\S]*?\})\s*```', content)
            if code_block_match:
                json_str = code_block_match.group(1)
                logger.info("从代码块中提取 JSON")
                try:
                    return json.loads(json_str)
                except json.JSONDecodeError:
                    # Malformed fenced payload — fall through to brace matching.
                    pass

            # Method 2: scan for the first balanced top-level { } object.
            brace_count = 0
            json_start = -1

            for i in range(len(content)):
                if content[i] == '{':
                    if brace_count == 0:
                        json_start = i
                    brace_count += 1
                elif content[i] == '}':
                    brace_count -= 1
                    if brace_count == 0:
                        # Found a complete JSON object candidate.
                        json_end = i + 1
                        json_str = content[json_start:json_end]
                        logger.info(f"从大括号中提取 JSON")
                        return json.loads(json_str)

            # Method 3: try parsing the entire content as-is.
            logger.info("尝试直接解析整个内容")
            return json.loads(content)

        except json.JSONDecodeError as e:
            logger.error(f"JSON 解析失败: {str(e)}")
            logger.error(f"原始内容(前 500 字符): {content[:500]}...")
            return None
        except Exception as e:
            logger.error(f"提取 JSON 失败: {str(e)}")
            return None

    def detect_data_types(self, data: Dict[str, Any]) -> List[str]:
        """Return the list of data categories present in an extraction result."""
        types = []
        d = data.get("data", {})

        if d.get("numeric_data") and len(d["numeric_data"]) > 0:
            types.append("numeric")
        if d.get("categorical_data") and len(d["categorical_data"]) > 0:
            types.append("categorical")
        if d.get("time_series_data") and len(d["time_series_data"]) > 0:
            types.append("time_series")
        if d.get("comparison_data") and len(d["comparison_data"]) > 0:
            types.append("comparison")
        if d.get("table_data") and d["table_data"]:
            types.append("table")

        return types
|
||||
|
||||
|
||||
# Module-level singleton: import this shared instance instead of constructing TextAnalysisService directly.
text_analysis_service = TextAnalysisService()
|
||||
0
backend/app/services/text_analysis_service_fixed.py
Normal file
0
backend/app/services/text_analysis_service_fixed.py
Normal file
388
backend/app/services/visualization_service.py
Normal file
388
backend/app/services/visualization_service.py
Normal file
@@ -0,0 +1,388 @@
|
||||
"""
|
||||
数据可视化服务 - 使用 matplotlib/plotly 生成统计图表
|
||||
"""
|
||||
import io
|
||||
import base64
|
||||
import logging
|
||||
from typing import Dict, Any, List, Optional, Union
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
import matplotlib
|
||||
import numpy as np
|
||||
|
||||
# 使用字体辅助模块配置中文字体
|
||||
from app.services.font_helper import configure_matplotlib_fonts
|
||||
|
||||
configure_matplotlib_fonts()
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class VisualizationService:
    """Data visualization service: builds summary statistics and matplotlib
    charts (returned as base64 data URIs) from parsed Excel data."""

    def __init__(self):
        # Charts are written under <repo>/backend/data/charts; created on demand.
        self.output_dir = Path(__file__).resolve().parent.parent.parent / "data" / "charts"
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def analyze_and_visualize(
        self,
        excel_data: Dict[str, Any],
        analysis_type: str = "statistics"
    ) -> Dict[str, Any]:
        """Analyze the data and generate visualization charts.

        Args:
            excel_data: Parsed Excel payload with "columns" and "rows" keys.
            analysis_type: Analysis type label (currently unused by the body —
                NOTE(review): confirm whether callers rely on it).

        Returns:
            Dict[str, Any]: Chart data and statistics on success, or
            ``{"success": False, "error": ...}`` on failure.
        """
        try:
            columns = excel_data.get("columns", [])
            rows = excel_data.get("rows", [])

            if not columns or not rows:
                return {
                    "success": False,
                    "error": "没有数据可用于分析"
                }

            # Convert to a DataFrame for dtype inspection and aggregation.
            df = pd.DataFrame(rows, columns=columns)

            # Partition columns by inferred dtype (numeric vs everything else).
            numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()
            categorical_columns = df.select_dtypes(exclude=[np.number]).columns.tolist()

            # Summary statistics per column.
            statistics = self._generate_statistics(df, numeric_columns, categorical_columns)

            # Base64-encoded chart images.
            charts = self._generate_charts(df, numeric_columns, categorical_columns)

            # Category frequency/percentage breakdowns.
            distributions = self._generate_distributions(df, categorical_columns)

            return {
                "success": True,
                "statistics": statistics,
                "charts": charts,
                "distributions": distributions,
                "row_count": len(df),
                "column_count": len(columns)
            }

        except Exception as e:
            logger.error(f"可视化分析失败: {str(e)}", exc_info=True)
            return {
                "success": False,
                "error": str(e)
            }

    def _generate_statistics(
        self,
        df: pd.DataFrame,
        numeric_columns: List[str],
        categorical_columns: List[str]
    ) -> Dict[str, Any]:
        """Build per-column summary statistics (numeric and categorical)."""
        statistics = {
            "numeric": {},
            "categorical": {}
        }

        # Numeric columns: standard descriptive stats, cast to plain Python
        # types so the result is JSON-serializable.
        # NOTE(review): an all-NaN column yields NaN for mean/min/max, which
        # json.dumps rejects by default — confirm how callers serialize this.
        for col in numeric_columns:
            try:
                stats = {
                    "count": int(df[col].count()),
                    "mean": float(df[col].mean()),
                    "median": float(df[col].median()),
                    "std": float(df[col].std()) if df[col].count() > 1 else 0,
                    "min": float(df[col].min()),
                    "max": float(df[col].max()),
                    "q25": float(df[col].quantile(0.25)),
                    "q75": float(df[col].quantile(0.75)),
                    "missing": int(df[col].isna().sum())
                }
                statistics["numeric"][col] = stats
            except Exception as e:
                # Best-effort: a single bad column shouldn't sink the report.
                logger.warning(f"列 {col} 统计失败: {str(e)}")

        # Categorical columns: cardinality, mode, and full value distribution.
        for col in categorical_columns:
            try:
                value_counts = df[col].value_counts()
                stats = {
                    "unique": int(df[col].nunique()),
                    "most_common": str(value_counts.index[0]) if len(value_counts) > 0 else "",
                    "most_common_count": int(value_counts.iloc[0]) if len(value_counts) > 0 else 0,
                    "missing": int(df[col].isna().sum()),
                    "distribution": {str(k): int(v) for k, v in value_counts.items()}
                }
                statistics["categorical"][col] = stats
            except Exception as e:
                logger.warning(f"列 {col} 统计失败: {str(e)}")

        return statistics

    def _generate_charts(
        self,
        df: pd.DataFrame,
        numeric_columns: List[str],
        categorical_columns: List[str]
    ) -> Dict[str, Any]:
        """Generate the chart set; each chart is a dict with a base64 image."""
        charts = {}

        # 1. Histograms for numeric columns (capped at 5 to bound chart count).
        charts["histograms"] = []
        for col in numeric_columns[:5]:
            chart_data = self._create_histogram(df[col], col)
            if chart_data:
                charts["histograms"].append(chart_data)

        # 2. Bar charts for categorical columns (capped at 5).
        charts["bar_charts"] = []
        for col in categorical_columns[:5]:
            chart_data = self._create_bar_chart(df[col], col)
            if chart_data:
                charts["bar_charts"].append(chart_data)

        # 3. One combined box plot over up to 5 numeric columns.
        charts["box_plots"] = []
        if len(numeric_columns) > 0:
            chart_data = self._create_box_plot(df[numeric_columns[:5]], numeric_columns[:5])
            if chart_data:
                charts["box_plots"].append(chart_data)

        # 4. Correlation heatmap — only meaningful with 2+ numeric columns.
        if len(numeric_columns) >= 2:
            chart_data = self._create_correlation_heatmap(df[numeric_columns], numeric_columns)
            if chart_data:
                charts["correlation"] = chart_data

        return charts

    def _create_histogram(self, series: pd.Series, column_name: str) -> Optional[Dict[str, Any]]:
        """Create a histogram for one numeric column; None on failure."""
        try:
            fig, ax = plt.subplots(figsize=(11, 7))
            ax.hist(series.dropna(), bins=20, edgecolor='black', alpha=0.7, color='#3b82f6')
            ax.set_xlabel(column_name, fontsize=10, labelpad=10)
            ax.set_ylabel('频数', fontsize=10, labelpad=10)
            ax.set_title(f'{column_name} 分布', fontsize=12, fontweight='bold', pad=15)
            ax.grid(True, alpha=0.3, axis='y')
            ax.tick_params(axis='both', which='major', labelsize=9)

            # Tighten layout so labels don't clip at the figure edge.
            plt.tight_layout(pad=1.5, w_pad=1.0, h_pad=1.0)

            # Encode as a base64 data URI for direct embedding in the frontend.
            img_base64 = self._figure_to_base64(fig)

            return {
                "type": "histogram",
                "column": column_name,
                "image": img_base64,
                "stats": {
                    "mean": float(series.mean()),
                    "median": float(series.median()),
                    "std": float(series.std()) if len(series) > 1 else 0
                }
            }
        except Exception as e:
            logger.error(f"创建直方图失败 ({column_name}): {str(e)}")
            return None

    def _create_bar_chart(self, series: pd.Series, column_name: str) -> Optional[Dict[str, Any]]:
        """Create a top-10 frequency bar chart for one categorical column."""
        try:
            value_counts = series.value_counts().head(10)  # show top 10 categories only
            fig, ax = plt.subplots(figsize=(12, 7))

            # Truncate long category labels so the axis stays readable.
            labels = [str(x)[:15] + '...' if len(str(x)) > 15 else str(x) for x in value_counts.index]
            x_pos = range(len(value_counts))
            bars = ax.bar(x_pos, value_counts.values, color='#10b981', alpha=0.8, edgecolor='black', linewidth=0.5)

            ax.set_xticks(x_pos)
            ax.set_xticklabels(labels, rotation=30, ha='right', fontsize=8)
            ax.set_xlabel(column_name, fontsize=10, labelpad=10)
            ax.set_ylabel('数量', fontsize=10, labelpad=10)
            ax.set_title(f'{column_name} 分布 (Top 10)', fontsize=12, fontweight='bold', pad=15)
            ax.grid(True, alpha=0.3, axis='y')
            ax.tick_params(axis='both', which='major', labelsize=9)

            # Count labels above each bar, nudged up 2% of the max so they
            # don't overlap the bar top.
            max_val = value_counts.values.max()
            y_offset = max_val * 0.02 if max_val > 0 else 0.5
            for bar, value in zip(bars, value_counts.values):
                ax.text(bar.get_x() + bar.get_width() / 2., value + y_offset,
                        f'{int(value)}',
                        ha='center', va='bottom', fontsize=8, fontweight='bold')

            plt.tight_layout(pad=1.5, w_pad=1.0, h_pad=1.0)

            img_base64 = self._figure_to_base64(fig)

            return {
                "type": "bar_chart",
                "column": column_name,
                "image": img_base64,
                "categories": {str(k): int(v) for k, v in value_counts.items()}
            }
        except Exception as e:
            logger.error(f"创建条形图失败 ({column_name}): {str(e)}")
            return None

    def _create_box_plot(self, df: pd.DataFrame, columns: List[str]) -> Optional[Dict[str, Any]]:
        """Create one combined box plot comparing the given numeric columns."""
        try:
            fig, ax = plt.subplots(figsize=(14, 7))

            # Drop NaNs per column — boxplot cannot handle missing values.
            box_data = [df[col].dropna() for col in columns]
            # NOTE(review): the "labels" kwarg was renamed "tick_labels" in
            # Matplotlib 3.9 — confirm the pinned matplotlib version.
            bp = ax.boxplot(box_data, labels=columns, patch_artist=True,
                            notch=True, showcaps=True, showfliers=True)

            # Color each box from a fixed palette (cycles through 5 colors).
            box_colors = ['#3b82f6', '#10b981', '#f59e0b', '#ef4444', '#8b5cf6']
            for patch, color in zip(bp['boxes'], box_colors[:len(bp['boxes'])]):
                patch.set_facecolor(color)
                patch.set_alpha(0.6)
                patch.set_linewidth(1.5)

            # Uniform line weight for the remaining box-plot elements.
            for element in ['whiskers', 'fliers', 'means', 'medians', 'caps']:
                plt.setp(bp[element], linewidth=1.5)

            ax.set_ylabel('值', fontsize=10, labelpad=10)
            ax.set_title('数值型列分布对比', fontsize=12, fontweight='bold', pad=15)
            ax.grid(True, alpha=0.3, axis='y')

            # Rotate x labels to avoid overlap with long column names.
            plt.setp(ax.get_xticklabels(), rotation=30, ha='right', fontsize=9)
            ax.tick_params(axis='both', which='major', labelsize=9)

            plt.tight_layout(pad=1.5, w_pad=1.5, h_pad=1.0)

            img_base64 = self._figure_to_base64(fig)

            return {
                "type": "box_plot",
                "columns": columns,
                "image": img_base64
            }
        except Exception as e:
            logger.error(f"创建箱线图失败: {str(e)}")
            return None

    def _create_correlation_heatmap(self, df: pd.DataFrame, columns: List[str]) -> Optional[Dict[str, Any]]:
        """Create a Pearson-correlation heatmap over the numeric columns."""
        try:
            # Pairwise correlation matrix (pandas default: Pearson).
            corr = df.corr()

            fig, ax = plt.subplots(figsize=(11, 9))
            im = ax.imshow(corr, cmap='RdBu_r', aspect='auto', vmin=-1, vmax=1)

            # One tick per column on both axes.
            n_cols = len(corr)
            ax.set_xticks(np.arange(n_cols))
            ax.set_yticks(np.arange(n_cols))

            # Truncate long column names for the tick labels.
            x_labels = [str(col)[:10] + '...' if len(str(col)) > 10 else str(col) for col in corr.columns]
            y_labels = [str(col)[:10] + '...' if len(str(col)) > 10 else str(col) for col in corr.columns]

            ax.set_xticklabels(x_labels, rotation=30, ha='right', fontsize=9)
            ax.set_yticklabels(y_labels, fontsize=9)

            # Annotate every cell; flip text color on dark backgrounds and
            # bold strong correlations for readability.
            for i in range(n_cols):
                for j in range(n_cols):
                    value = corr.iloc[i, j]
                    text_color = 'white' if abs(value) > 0.5 else 'black'
                    ax.text(j, i, f'{value:.2f}',
                            ha="center", va="center", color=text_color,
                            fontsize=8, fontweight='bold' if abs(value) > 0.7 else 'normal')

            ax.set_title('数值型列相关性热力图', fontsize=12, fontweight='bold', pad=15)
            ax.tick_params(axis='both', which='major', labelsize=9)

            # Color bar maps color intensity back to the correlation value.
            cbar = plt.colorbar(im, ax=ax)
            cbar.set_label('相关系数', rotation=270, labelpad=20, fontsize=10)
            cbar.ax.tick_params(labelsize=9)

            plt.tight_layout(pad=2.0, w_pad=1.0, h_pad=1.0)

            img_base64 = self._figure_to_base64(fig)

            return {
                "type": "correlation_heatmap",
                "columns": columns,
                "image": img_base64,
                "correlation_matrix": corr.to_dict()
            }
        except Exception as e:
            logger.error(f"创建相关性热力图失败: {str(e)}")
            return None

    def _generate_distributions(
        self,
        df: pd.DataFrame,
        categorical_columns: List[str]
    ) -> Dict[str, Any]:
        """Build count + percentage distributions for up to 5 categorical columns."""
        distributions = {}

        for col in categorical_columns[:5]:
            try:
                value_counts = df[col].value_counts()
                total = len(df)

                distributions[col] = {
                    "categories": {str(k): int(v) for k, v in value_counts.items()},
                    "percentages": {str(k): round(v / total * 100, 2) for k, v in value_counts.items()},
                    "unique_count": len(value_counts)
                }
            except Exception as e:
                # Skip problematic columns rather than failing the whole report.
                logger.warning(f"列 {col} 分布生成失败: {str(e)}")

        return distributions

    def _figure_to_base64(self, fig) -> str:
        """Render a matplotlib figure to a base64 PNG data URI and close it."""
        buf = io.BytesIO()
        fig.savefig(
            buf,
            format='png',
            dpi=120,
            bbox_inches='tight',
            pad_inches=0.3,
            facecolor='white',
            edgecolor='none',
            transparent=False
        )
        # Close the figure to release pyplot's global figure registry memory.
        plt.close(fig)
        buf.seek(0)
        img_base64 = base64.b64encode(buf.read()).decode('utf-8')
        return f"data:image/png;base64,{img_base64}"
|
||||
|
||||
|
||||
# Module-level singleton: import this shared instance instead of constructing VisualizationService directly.
visualization_service = VisualizationService()
|
||||
Reference in New Issue
Block a user