Basic frontend/backend architecture, complete Excel workbook parsing, statistical chart generation, and Excel export

This commit is contained in:
2026-03-19 01:51:34 +08:00
parent c23b93bb70
commit 2f630695ff
194 changed files with 23354 additions and 174 deletions

View File

@@ -0,0 +1,14 @@
"""
API 路由注册模块
"""
from fastapi import APIRouter
from app.api.endpoints import upload, ai_analyze, visualization, analysis_charts
# 创建主路由
api_router = APIRouter()
# 注册各模块路由
api_router.include_router(upload.router)
api_router.include_router(ai_analyze.router)
api_router.include_router(visualization.router)
api_router.include_router(analysis_charts.router)

Binary file not shown.

View File

@@ -0,0 +1,144 @@
"""
AI 分析 API 接口
"""
from fastapi import APIRouter, UploadFile, File, HTTPException, Query, Body
from typing import Optional
import logging
from app.services.excel_ai_service import excel_ai_service
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/ai", tags=["AI 分析"])
@router.post("/analyze/excel")
async def analyze_excel(
file: UploadFile = File(...),
user_prompt: str = Query("", description="用户自定义提示词"),
analysis_type: str = Query("general", description="分析类型: general, summary, statistics, insights"),
parse_all_sheets: bool = Query(False, description="是否分析所有工作表")
):
"""
上传并使用 AI 分析 Excel 文件
Args:
file: 上传的 Excel 文件
user_prompt: 用户自定义提示词
analysis_type: 分析类型
parse_all_sheets: 是否分析所有工作表
Returns:
dict: 分析结果,包含 Excel 数据和 AI 分析结果
"""
# 检查文件类型
if not file.filename:
raise HTTPException(status_code=400, detail="文件名为空")
file_ext = file.filename.split('.')[-1].lower()
if file_ext not in ['xlsx', 'xls']:
raise HTTPException(
status_code=400,
detail=f"不支持的文件类型: {file_ext},仅支持 .xlsx 和 .xls"
)
# 验证分析类型
supported_types = ['general', 'summary', 'statistics', 'insights']
if analysis_type not in supported_types:
raise HTTPException(
status_code=400,
detail=f"不支持的分析类型: {analysis_type},支持的类型: {', '.join(supported_types)}"
)
try:
# 读取文件内容
content = await file.read()
logger.info(f"开始分析文件: {file.filename}, 分析类型: {analysis_type}")
# 调用 AI 分析服务
if parse_all_sheets:
result = await excel_ai_service.batch_analyze_sheets(
content,
file.filename,
user_prompt=user_prompt,
analysis_type=analysis_type
)
else:
# 解析选项
parse_options = {"header_row": 0}
result = await excel_ai_service.analyze_excel_file(
content,
file.filename,
user_prompt=user_prompt,
analysis_type=analysis_type,
parse_options=parse_options
)
logger.info(f"文件分析完成: {file.filename}, 成功: {result['success']}")
return result
except HTTPException:
raise
except Exception as e:
logger.error(f"AI 分析过程中出错: {str(e)}")
raise HTTPException(status_code=500, detail=f"分析失败: {str(e)}")
@router.get("/analysis/types")
async def get_analysis_types():
"""
获取支持的分析类型列表
Returns:
dict: 包含支持的分析类型列表
"""
return {
"types": excel_ai_service.get_supported_analysis_types()
}
@router.post("/analyze/text")
async def analyze_text(
excel_data: dict = Body(..., description="Excel 解析后的数据"),
user_prompt: str = Body("", description="用户提示词"),
analysis_type: str = Body("general", description="分析类型")
):
"""
对已解析的 Excel 数据进行 AI 分析
Args:
excel_data: Excel 数据
user_prompt: 用户提示词
analysis_type: 分析类型
Returns:
dict: 分析结果
"""
try:
logger.info(f"开始文本分析, 分析类型: {analysis_type}")
# 调用 LLM 服务
from app.services.llm_service import llm_service
if user_prompt and user_prompt.strip():
result = await llm_service.analyze_with_template(
excel_data,
user_prompt
)
else:
result = await llm_service.analyze_excel_data(
excel_data,
user_prompt,
analysis_type
)
logger.info(f"文本分析完成, 成功: {result['success']}")
return result
except Exception as e:
logger.error(f"文本分析失败: {str(e)}")
raise HTTPException(status_code=500, detail=f"分析失败: {str(e)}")
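A minimal client-side sketch for exercising this endpoint, assuming the router is mounted under an `/api/v1` prefix (whatever `settings.API_V1_STR` resolves to), the server runs on `127.0.0.1:8000`, and `sales.xlsx` is a placeholder file:

```python
# Hedged smoke test for POST /ai/analyze/excel; adjust the prefix if
# settings.API_V1_STR differs from "/api/v1".
import httpx

with open("sales.xlsx", "rb") as f:  # placeholder workbook
    resp = httpx.post(
        "http://127.0.0.1:8000/api/v1/ai/analyze/excel",
        params={"analysis_type": "statistics", "parse_all_sheets": False},
        files={"file": ("sales.xlsx", f,
                        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")},
        timeout=120.0,
    )
resp.raise_for_status()
print(resp.json()["analysis"])  # the llm_result dict returned by the service
```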

View File

@@ -0,0 +1,105 @@
"""
分析结果图表 API - 根据文本分析结果生成图表
"""
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
from typing import Optional
import logging
from app.services.text_analysis_service import text_analysis_service
from app.services.chart_generator_service import chart_generator_service
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/analysis", tags=["分析结果图表"])
class AnalysisChartRequest(BaseModel):
"""分析图表生成请求模型"""
analysis_text: str
original_filename: Optional[str] = ""
file_type: Optional[str] = "text"
@router.post("/extract-and-chart")
async def extract_and_generate_charts(request: AnalysisChartRequest):
"""
从 AI 分析结果中提取数据并生成图表
Args:
request: 包含分析文本的请求
Returns:
dict: 包含图表数据的结果
"""
if not request.analysis_text or not request.analysis_text.strip():
raise HTTPException(status_code=400, detail="分析文本不能为空")
try:
logger.info("开始从分析结果中提取结构化数据...")
# 1. 使用 LLM 提取结构化数据
extract_result = await text_analysis_service.extract_structured_data(
analysis_text=request.analysis_text,
original_filename=request.original_filename or "unknown",
file_type=request.file_type or "text"
)
if not extract_result.get("success"):
raise HTTPException(
status_code=500,
detail=f"提取结构化数据失败: {extract_result.get('error', '未知错误')}"
)
logger.info("结构化数据提取成功,开始生成图表...")
# 2. 根据提取的数据生成图表
chart_result = chart_generator_service.generate_charts_from_analysis(extract_result)
if not chart_result.get("success"):
raise HTTPException(
status_code=500,
detail=f"生成图表失败: {chart_result.get('error', '未知错误')}"
)
logger.info("图表生成成功")
return chart_result
except HTTPException:
raise
except Exception as e:
logger.error(f"分析结果图表生成失败: {str(e)}")
raise HTTPException(
status_code=500,
detail=f"图表生成失败: {str(e)}"
)
@router.post("/analyze-text")
async def analyze_text_only(request: AnalysisChartRequest):
"""
仅提取结构化数据(不生成图表),用于调试
Args:
request: 包含分析文本的请求
Returns:
dict: 提取的结构化数据
"""
if not request.analysis_text or not request.analysis_text.strip():
raise HTTPException(status_code=400, detail="分析文本不能为空")
try:
result = await text_analysis_service.extract_structured_data(
analysis_text=request.analysis_text,
original_filename=request.original_filename or "unknown",
file_type=request.file_type or "text"
)
return result
except Exception as e:
logger.error(f"文本分析失败: {str(e)}")
raise HTTPException(
status_code=500,
detail=f"文本分析失败: {str(e)}"
)

View File

@@ -0,0 +1,205 @@
"""
文件上传 API 接口
"""
from fastapi import APIRouter, UploadFile, File, HTTPException, Query
from fastapi.responses import StreamingResponse
from typing import Optional
import logging
import pandas as pd
import io
from app.services.file_service import file_service
from app.core.document_parser import XlsxParser
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/upload", tags=["文件上传"])
# 初始化解析器
excel_parser = XlsxParser()
@router.post("/excel")
async def upload_excel(
file: UploadFile = File(...),
parse_all_sheets: bool = Query(False, description="是否解析所有工作表"),
sheet_name: Optional[str] = Query(None, description="指定解析的工作表名称"),
header_row: int = Query(0, description="表头所在的行索引")
):
"""
上传并解析 Excel 文件
Args:
file: 上传的 Excel 文件
parse_all_sheets: 是否解析所有工作表
sheet_name: 指定解析的工作表名称
header_row: 表头所在的行索引
Returns:
dict: 解析结果
"""
# 检查文件类型
if not file.filename:
raise HTTPException(status_code=400, detail="文件名为空")
file_ext = file.filename.split('.')[-1].lower()
if file_ext not in ['xlsx', 'xls']:
raise HTTPException(
status_code=400,
detail=f"不支持的文件类型: {file_ext},仅支持 .xlsx 和 .xls"
)
try:
# 读取文件内容
content = await file.read()
# 保存文件
saved_path = file_service.save_uploaded_file(
content,
file.filename,
subfolder="excel"
)
logger.info(f"文件已保存: {saved_path}")
# 解析文件
if parse_all_sheets:
result = excel_parser.parse_all_sheets(saved_path)
else:
# 如果指定了 sheet_name,使用指定的工作表;否则使用默认的第一个
if sheet_name:
result = excel_parser.parse(saved_path, sheet_name=sheet_name, header_row=header_row)
else:
result = excel_parser.parse(saved_path, header_row=header_row)
# 添加文件路径到元数据
if result.metadata:
result.metadata['saved_path'] = saved_path
result.metadata['original_filename'] = file.filename
return result.to_dict()
except HTTPException:
raise
except Exception as e:
logger.error(f"解析 Excel 文件时出错: {str(e)}")
raise HTTPException(status_code=500, detail=f"解析失败: {str(e)}")
@router.get("/excel/preview/{file_path:path}")
async def get_excel_preview(
file_path: str,
sheet_name: Optional[str] = Query(None, description="工作表名称"),
max_rows: int = Query(10, description="最多返回的行数", ge=1, le=100)
):
"""
获取 Excel 文件的预览数据
Args:
file_path: 文件路径
sheet_name: 工作表名称
max_rows: 最多返回的行数
Returns:
dict: 预览数据
"""
try:
# 解析工作表名称参数
sheet_param = sheet_name if sheet_name else 0
result = excel_parser.get_sheet_preview(
file_path,
sheet_name=sheet_param,
max_rows=max_rows
)
return result.to_dict()
except Exception as e:
logger.error(f"获取预览数据时出错: {str(e)}")
raise HTTPException(status_code=500, detail=f"获取预览失败: {str(e)}")
@router.delete("/file")
async def delete_uploaded_file(file_path: str = Query(..., description="要删除的文件路径")):
"""
删除已上传的文件
Args:
file_path: 文件路径
Returns:
dict: 删除结果
"""
try:
success = file_service.delete_file(file_path)
if success:
return {"success": True, "message": "文件删除成功"}
else:
return {"success": False, "message": "文件不存在或删除失败"}
except Exception as e:
logger.error(f"删除文件时出错: {str(e)}")
raise HTTPException(status_code=500, detail=f"删除失败: {str(e)}")
@router.get("/excel/export/{file_path:path}")
async def export_excel(
file_path: str,
sheet_name: Optional[str] = Query(None, description="工作表名称"),
columns: Optional[str] = Query(None, description="要导出的列,逗号分隔")
):
"""
导出 Excel 文件(可选择工作表和列)
Args:
file_path: 原始文件路径
sheet_name: 工作表名称(可选)
columns: 要导出的列名,逗号分隔(可选)
Returns:
StreamingResponse: Excel 文件
"""
try:
# 读取 Excel 文件
if sheet_name:
df = pd.read_excel(file_path, sheet_name=sheet_name)
else:
df = pd.read_excel(file_path)
# 如果指定了列,只选择这些列
if columns:
column_list = [col.strip() for col in columns.split(',')]
# 过滤掉不存在的列
available_columns = [col for col in column_list if col in df.columns]
if available_columns:
df = df[available_columns]
# 创建 Excel 文件
output = io.BytesIO()
with pd.ExcelWriter(output, engine='openpyxl') as writer:
df.to_excel(writer, index=False, sheet_name=sheet_name or 'Sheet1')
output.seek(0)
# 生成文件名
original_name = file_path.split('/')[-1] if '/' in file_path else file_path
if columns:
export_name = f"export_{sheet_name or 'data'}_{len(column_list) if columns else 'all'}_cols.xlsx"
else:
export_name = f"export_{original_name}"
# 返回文件流
return StreamingResponse(
io.BytesIO(output.getvalue()),
media_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
headers={"Content-Disposition": f"attachment; filename={export_name}"}
)
except FileNotFoundError:
logger.error(f"文件不存在: {file_path}")
raise HTTPException(status_code=404, detail="文件不存在")
except Exception as e:
logger.error(f"导出 Excel 文件时出错: {str(e)}")
raise HTTPException(status_code=500, detail=f"导出失败: {str(e)}")

View File

@@ -0,0 +1,90 @@
"""
可视化 API 接口 - 生成统计图表
"""
from fastapi import APIRouter, HTTPException, Body
from typing import Dict, Any
import logging
from app.services.visualization_service import visualization_service
from pydantic import BaseModel
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/visualization", tags=["数据可视化"])
class StatisticsRequest(BaseModel):
"""统计图表生成请求模型"""
excel_data: Dict[str, Any]
analysis_type: str = "statistics"
@router.post("/statistics")
async def generate_statistics(request: StatisticsRequest):
"""
生成统计信息和可视化图表
Args:
request: 包含 excel_data 和 analysis_type 的请求体
Returns:
dict: 包含统计信息和图表数据的结果
"""
excel_data = request.excel_data
analysis_type = request.analysis_type
if not excel_data:
raise HTTPException(status_code=400, detail="未提供 Excel 数据")
try:
result = visualization_service.analyze_and_visualize(
excel_data,
analysis_type
)
if not result.get("success"):
raise HTTPException(status_code=500, detail=result.get("error", "分析失败"))
logger.info("统计图表生成成功")
return result
except HTTPException:
raise
except Exception as e:
logger.error(f"统计图表生成失败: {str(e)}")
raise HTTPException(status_code=500, detail=f"图表生成失败: {str(e)}")
@router.get("/chart-types")
async def get_chart_types():
"""
获取支持的图表类型
Returns:
dict: 支持的图表类型列表
"""
return {
"chart_types": [
{
"value": "histogram",
"label": "直方图",
"description": "显示数值型列的分布情况"
},
{
"value": "bar_chart",
"label": "条形图",
"description": "显示分类列的频次分布"
},
{
"value": "box_plot",
"label": "箱线图",
"description": "显示数值列的四分位数和异常值"
},
{
"value": "correlation_heatmap",
"label": "相关性热力图",
"description": "显示数值列之间的相关性"
}
]
}
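For reference, a sketch of the request body this endpoint expects; `excel_data` mirrors the `columns`/`rows` layout produced by `XlsxParser._df_to_dict`, and the `/api/v1` prefix is an assumption:

```python
import httpx

payload = {
    "excel_data": {
        "columns": ["月份", "销售额"],
        "rows": [{"月份": "1月", "销售额": 1200}, {"月份": "2月", "销售额": 1500}],
        "row_count": 2,
        "column_count": 2,
    },
    "analysis_type": "statistics",
}
resp = httpx.post("http://127.0.0.1:8000/api/v1/visualization/statistics", json=payload)
print(resp.json()["statistics"])  # per-column numeric / categorical summaries
```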

View File

@@ -0,0 +1,7 @@
"""
文档解析模块 - 支持多种文件格式的解析
"""
from .base import BaseParser
from .xlsx_parser import XlsxParser
__all__ = ['BaseParser', 'XlsxParser']

View File

@@ -0,0 +1,87 @@
"""
解析器基类 - 定义所有解析器的通用接口
"""
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional
from pathlib import Path
class ParseResult:
"""解析结果类"""
def __init__(
self,
success: bool,
data: Optional[Dict[str, Any]] = None,
error: Optional[str] = None,
metadata: Optional[Dict[str, Any]] = None
):
self.success = success
self.data = data or {}
self.error = error
self.metadata = metadata or {}
def to_dict(self) -> Dict[str, Any]:
"""转换为字典"""
return {
"success": self.success,
"data": self.data,
"error": self.error,
"metadata": self.metadata
}
class BaseParser(ABC):
"""文档解析器基类"""
def __init__(self):
self.supported_extensions: List[str] = []
self.parser_name: str = "base_parser"
@abstractmethod
def parse(self, file_path: str, **kwargs) -> ParseResult:
"""
解析文件
Args:
file_path: 文件路径
**kwargs: 其他解析参数
Returns:
ParseResult: 解析结果
"""
pass
def can_parse(self, file_path: str) -> bool:
"""
检查是否可以解析该文件
Args:
file_path: 文件路径
Returns:
bool: 是否可以解析
"""
ext = Path(file_path).suffix.lower()
return ext in self.supported_extensions
def get_file_info(self, file_path: str) -> Dict[str, Any]:
"""
获取文件基本信息
Args:
file_path: 文件路径
Returns:
Dict[str, Any]: 文件信息
"""
path = Path(file_path)
if not path.exists():
return {"error": "File not found"}
return {
"filename": path.name,
"extension": path.suffix.lower(),
"size": path.stat().st_size,
"parser": self.parser_name
}
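To illustrate the extension point, a hypothetical subclass (not part of this commit) that reuses `ParseResult` for CSV files; it assumes it would live next to `base.py` inside `app/core/document_parser`:

```python
import pandas as pd
from app.core.document_parser.base import BaseParser, ParseResult

class CsvParser(BaseParser):
    """Hypothetical CSV parser sketched against the BaseParser interface."""

    def __init__(self):
        super().__init__()
        self.supported_extensions = ['.csv']
        self.parser_name = "csv_parser"

    def parse(self, file_path: str, **kwargs) -> ParseResult:
        # Reuse the base class extension check before touching the file
        if not self.can_parse(file_path):
            return ParseResult(success=False, error=f"Unsupported file type: {file_path}")
        try:
            df = pd.read_csv(file_path, **kwargs)
            return ParseResult(
                success=True,
                data={"columns": df.columns.tolist(), "rows": df.to_dict(orient="records")},
                metadata=self.get_file_info(file_path),
            )
        except Exception as e:
            return ParseResult(success=False, error=str(e))
```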

View File

@@ -0,0 +1,120 @@
"""
文档解析工具函数
"""
import re
from typing import List, Optional, Dict, Any
def clean_text(text: str) -> str:
"""
清洗文本,去除多余的空白字符和特殊符号
Args:
text: 原始文本
Returns:
str: 清洗后的文本
"""
if not text:
return ""
# 去除首尾空白
text = text.strip()
# 将多个连续的空白字符替换为单个空格
text = re.sub(r'\s+', ' ', text)
# 去除不可打印字符
text = ''.join(char for char in text if char.isprintable() or char in '\n\r\t')
return text
def chunk_text(
text: str,
chunk_size: int = 1000,
overlap: int = 100
) -> List[str]:
"""
将文本分块
Args:
text: 原始文本
chunk_size: 每块的大小(字符数)
overlap: 重叠区域的大小
Returns:
List[str]: 文本块列表
"""
if not text:
return []
chunks = []
start = 0
text_length = len(text)
while start < text_length:
end = start + chunk_size
chunk = text[start:end]
chunks.append(chunk)
start = end - overlap
return chunks
def normalize_string(s: Any) -> str:
"""
标准化字符串
Args:
s: 输入值
Returns:
str: 标准化后的字符串
"""
if s is None:
return ""
if isinstance(s, (int, float)):
return str(s)
if isinstance(s, str):
return clean_text(s)
return str(s)
def detect_encoding(file_path: str) -> Optional[str]:
"""
检测文件编码(简化版)
Args:
file_path: 文件路径
Returns:
Optional[str]: 编码格式,无法检测则返回 None
"""
import chardet
try:
with open(file_path, 'rb') as f:
raw_data = f.read(10000) # 读取前 10000 字节
result = chardet.detect(raw_data)
return result.get('encoding')
except Exception:
return None
def safe_get(d: Dict[str, Any], key: str, default: Any = None) -> Any:
"""
安全地获取字典值
Args:
d: 字典
key: 键
default: 默认值
Returns:
Any: 字典值或默认值
"""
try:
return d.get(key, default)
except Exception:
return default

View File

@@ -0,0 +1,288 @@
"""
Excel 文件解析器 - 解析 .xlsx 和 .xls 文件
"""
from typing import Any, Dict, List, Optional
from pathlib import Path
import pandas as pd
import logging
from .base import BaseParser, ParseResult
logger = logging.getLogger(__name__)
class XlsxParser(BaseParser):
"""Excel 文件解析器"""
def __init__(self):
super().__init__()
self.supported_extensions = ['.xlsx', '.xls']
self.parser_name = "excel_parser"
def parse(
self,
file_path: str,
sheet_name: Optional[str | int] = 0,
header_row: int = 0,
**kwargs
) -> ParseResult:
"""
解析 Excel 文件
Args:
file_path: 文件路径
sheet_name: 工作表名称或索引,默认为第一个工作表
header_row: 表头所在的行索引,默认为 0
**kwargs: 其他参数传递给 pandas.read_excel
Returns:
ParseResult: 解析结果
"""
path = Path(file_path)
# 检查文件是否存在
if not path.exists():
return ParseResult(
success=False,
error=f"File not found: {file_path}"
)
# 检查文件扩展名
if path.suffix.lower() not in self.supported_extensions:
return ParseResult(
success=False,
error=f"Unsupported file type: {path.suffix}"
)
# 检查文件大小
file_size = path.stat().st_size
if file_size == 0:
return ParseResult(
success=False,
error=f"File is empty: {file_path}"
)
try:
# 尝试读取 Excel 文件,检查是否有工作表
xls_file = pd.ExcelFile(file_path)
sheet_names = xls_file.sheet_names
if not sheet_names:
return ParseResult(
success=False,
error=f"Excel 文件没有找到任何工作表: {file_path}"
)
# 验证请求的工作表索引/名称
target_sheet = None
if sheet_name is not None:
if isinstance(sheet_name, int) and sheet_name < len(sheet_names):
target_sheet = sheet_names[sheet_name]
elif isinstance(sheet_name, str) and sheet_name in sheet_names:
target_sheet = sheet_name
else:
# 如果指定的 sheet_name 无效,使用第一个工作表
target_sheet = sheet_names[0]
else:
# 默认使用第一个工作表
target_sheet = sheet_names[0]
# 读取 Excel 文件
df = pd.read_excel(
file_path,
sheet_name=target_sheet,
header=header_row,
**kwargs
)
# 检查 DataFrame 是否为空
if df.empty:
return ParseResult(
success=False,
error=f"工作表 '{target_sheet}' 为空,请检查 Excel 文件内容"
)
# 转换为可序列化的数据
data = self._df_to_dict(df)
# 构建元数据
metadata = {
"filename": path.name,
"extension": path.suffix.lower(),
"sheet_count": len(sheet_names),
"sheet_names": sheet_names,
"current_sheet": target_sheet,
"row_count": len(df),
"column_count": len(df.columns) if not df.empty else 0,
"columns": df.columns.tolist() if not df.empty else [],
"file_size": file_size
}
return ParseResult(
success=True,
data=data,
metadata=metadata
)
except IndexError as e:
logger.error(f"工作表索引错误: {str(e)}")
# 工作表索引超出范围时,尝试使用第一个工作表
try:
xls_file = pd.ExcelFile(file_path)
sheet_names = xls_file.sheet_names
if sheet_names:
df = pd.read_excel(
file_path,
sheet_name=sheet_names[0],
header=header_row,
**kwargs
)
data = self._df_to_dict(df)
metadata = {
"filename": path.name,
"extension": path.suffix.lower(),
"sheet_count": len(sheet_names),
"sheet_names": sheet_names,
"current_sheet": sheet_names[0],
"row_count": len(df),
"column_count": len(df.columns) if not df.empty else 0,
"columns": df.columns.tolist() if not df.empty else [],
"file_size": path.stat().st_size
}
return ParseResult(
success=True,
data=data,
metadata=metadata
)
else:
return ParseResult(
success=False,
error=f"Excel 文件没有有效的工作表"
)
except Exception as e2:
logger.error(f"重试解析失败: {str(e2)}")
return ParseResult(
success=False,
error=f"无法解析 Excel 文件: {str(e)}"
)
except Exception as e:
logger.error(f"解析 Excel 文件时出错: {str(e)}")
return ParseResult(
success=False,
error=f"Failed to parse Excel file: {str(e)}"
)
def parse_all_sheets(self, file_path: str, **kwargs) -> ParseResult:
"""
解析 Excel 文件的所有工作表
Args:
file_path: 文件路径
**kwargs: 其他参数传递给 pandas.read_excel
Returns:
ParseResult: 解析结果
"""
path = Path(file_path)
# 检查文件是否存在
if not path.exists():
return ParseResult(
success=False,
error=f"File not found: {file_path}"
)
if path.suffix.lower() not in self.supported_extensions:
return ParseResult(
success=False,
error=f"Unsupported file type: {path.suffix}"
)
# 检查文件大小
file_size = path.stat().st_size
if file_size == 0:
return ParseResult(
success=False,
error=f"File is empty: {file_path}"
)
try:
# 读取所有工作表
all_data = pd.read_excel(file_path, sheet_name=None, **kwargs)
# 检查是否成功读取到数据
if not all_data or len(all_data) == 0:
return ParseResult(
success=False,
error=f"无法读取 Excel 文件或文件为空: {file_path}"
)
# 转换为可序列化的数据
sheets_data = {}
for sheet_name, df in all_data.items():
sheets_data[sheet_name] = self._df_to_dict(df)
# 获取所有工作表名称
all_sheets = list(all_data.keys())
# 构建元数据
total_rows = sum(len(df) for df in all_data.values())
metadata = {
"filename": path.name,
"extension": path.suffix.lower(),
"sheet_count": len(all_sheets),
"sheet_names": all_sheets,
"total_rows": total_rows,
"file_size": file_size
}
return ParseResult(
success=True,
data={"sheets": sheets_data},
metadata=metadata
)
except Exception as e:
logger.error(f"Failed to parse Excel file: {str(e)}")
return ParseResult(
success=False,
error=f"Failed to parse Excel file: {str(e)}"
)
def _get_sheet_names(self, file_path: str) -> List[str]:
"""获取 Excel 文件中的所有工作表名称"""
try:
xls = pd.ExcelFile(file_path)
sheet_names = xls.sheet_names
if not sheet_names:
return []
return sheet_names
except Exception as e:
logger.error(f"获取工作表名称失败: {str(e)}")
return []
def _df_to_dict(self, df: pd.DataFrame) -> Dict[str, Any]:
"""
将 DataFrame 转换为字典,处理 NaN 值
Args:
df: pandas DataFrame
Returns:
Dict[str, Any]: 转换后的字典
"""
# 将 NaN 替换为 None
df = df.replace({pd.NA: None, float('nan'): None})
# 转换为字典列表(每一行一个字典)
rows = df.to_dict(orient='records')
return {
"columns": df.columns.tolist(),
"rows": rows,
"row_count": len(rows),
"column_count": len(df.columns) if not df.empty else 0
}
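One gap worth noting: `upload.py` serves `/excel/preview` by calling `excel_parser.get_sheet_preview(...)`, but no such method is defined on `XlsxParser` in this commit. A minimal sketch of what it could look like (an assumption, reusing `parse()` and truncating the rows):

```python
# Sketch only: one plausible get_sheet_preview implementation for XlsxParser,
# matching the call in upload.py (file_path, sheet_name=..., max_rows=...).
def get_sheet_preview(self, file_path: str, sheet_name=0, max_rows: int = 10, **kwargs) -> ParseResult:
    result = self.parse(file_path, sheet_name=sheet_name, **kwargs)
    if not result.success:
        return result
    rows = result.data.get("rows", [])[:max_rows]
    preview = dict(result.data, rows=rows, row_count=len(rows))
    return ParseResult(success=True, data=preview, metadata=result.metadata)
```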

View File

@@ -1,18 +1,61 @@
"""
FastAPI 应用主入口
"""
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from app.config import settings
from app.api import api_router
# 创建 FastAPI 应用实例
app = FastAPI(
title=settings.APP_NAME,
description="基于大语言模型的文档理解与多源数据融合系统",
version="1.0.0",
openapi_url=f"{settings.API_V1_STR}/openapi.json",
docs_url=f"{settings.API_V1_STR}/docs",
redoc_url=f"{settings.API_V1_STR}/redoc"
)
# 配置 CORS 中间件
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# 注册 API 路由
app.include_router(api_router, prefix=settings.API_V1_STR)
@app.get("/")
async def root():
"""根路径"""
return {
"message": f"Welcome to {settings.APP_NAME}",
"status": "online",
"debug_mode": settings.DEBUG
"version": "1.0.0",
"debug_mode": settings.DEBUG,
"api_docs": f"{settings.API_V1_STR}/docs"
}
@app.get("/health")
async def health_check():
"""健康检查接口"""
return {
"status": "healthy",
"service": settings.APP_NAME
}
if __name__ == "__main__":
import uvicorn
uvicorn.run("main:app", host="127.0.0.1", port=8000, reload=True)
uvicorn.run(
"app.main:app",
host="127.0.0.1",
port=8000,
reload=settings.DEBUG
)

View File

@@ -0,0 +1,349 @@
"""
图表生成服务 - 根据结构化数据生成图表
"""
import io
import base64
import logging
from typing import Dict, Any, List, Optional
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
# 使用字体辅助模块配置中文字体
from app.services.font_helper import configure_matplotlib_fonts
configure_matplotlib_fonts()
logger = logging.getLogger(__name__)
class ChartGeneratorService:
"""图表生成服务类"""
def __init__(self):
self.output_dir = Path(__file__).resolve().parent.parent.parent / "data" / "charts"
self.output_dir.mkdir(parents=True, exist_ok=True)
def generate_charts_from_analysis(
self,
structured_data: Dict[str, Any]
) -> Dict[str, Any]:
"""
根据提取的结构化数据生成图表
Args:
structured_data: 从 AI 分析结果中提取的结构化数据
Returns:
Dict[str, Any]: 包含图表数据的结果
"""
if not structured_data.get("success"):
return {
"success": False,
"error": structured_data.get("error", "数据提取失败")
}
data = structured_data.get("data", {})
charts = {}
statistics = {}
try:
# 1. 数值型数据图表
numeric_data = data.get("numeric_data", [])
if numeric_data:
charts["numeric_charts"] = self._create_numeric_charts(numeric_data)
statistics["numeric_summary"] = self._create_numeric_summary(numeric_data)
# 2. 分类数据图表
categorical_data = data.get("categorical_data", [])
if categorical_data:
charts["categorical_charts"] = self._create_categorical_charts(categorical_data)
# 3. 时间序列图表
time_series_data = data.get("time_series_data", [])
if time_series_data:
charts["time_series_chart"] = self._create_time_series_chart(time_series_data)
# 4. 对比数据图表
comparison_data = data.get("comparison_data", [])
if comparison_data:
charts["comparison_chart"] = self._create_comparison_chart(comparison_data)
# 5. 表格数据可视化
table_data = data.get("table_data")
if table_data:
charts["table_preview"] = self._create_table_preview(table_data)
# 元数据
metadata = data.get("metadata", {})
return {
"success": True,
"charts": charts,
"statistics": statistics,
"metadata": metadata,
"data_source": "ai_analysis"
}
except Exception as e:
logger.error(f"生成图表失败: {str(e)}", exc_info=True)
return {
"success": False,
"error": str(e)
}
def _create_numeric_charts(self, numeric_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""创建数值型数据图表"""
charts = []
# 提取数值和标签
names = [item.get("name", f"{i}") for i, item in enumerate(numeric_data)]
values = [item.get("value", 0) for item in numeric_data]
if not values:
return charts
# 1. 柱状图
try:
fig, ax = plt.subplots(figsize=(12, 7))
colors = plt.cm.Set3(np.linspace(0, 1, len(values)))
bars = ax.bar(names, values, color=colors, alpha=0.8, edgecolor='black', linewidth=0.5)
# 添加数值标签
for bar, value in zip(bars, values):
height = bar.get_height()
ax.text(bar.get_x() + bar.get_width() / 2., height,
f'{value:,.0f}',
ha='center', va='bottom', fontsize=9, fontweight='bold')
ax.set_xlabel('项目', fontsize=10, labelpad=10, fontweight='bold')
ax.set_ylabel('数值', fontsize=10, labelpad=10, fontweight='bold')
ax.set_title('数值型数据对比', fontsize=12, fontweight='bold', pad=15)
ax.set_xticks(range(len(names)))
ax.set_xticklabels(names, rotation=30, ha='right', fontsize=9)
ax.tick_params(axis='both', which='major', labelsize=9)
plt.grid(axis='y', alpha=0.3)
plt.tight_layout(pad=1.5)
img_base64 = self._figure_to_base64(fig)
charts.append({
"type": "bar",
"title": "数值型数据对比",
"image": img_base64,
"data": [{"name": n, "value": v} for n, v in zip(names, values)]
})
except Exception as e:
logger.error(f"创建柱状图失败: {str(e)}")
# 2. 饼图
if len(values) > 0 and len(values) <= 10:
try:
fig, ax = plt.subplots(figsize=(10, 10))
wedges, texts, autotexts = ax.pie(values, labels=names, autopct='%1.1f%%',
startangle=90, colors=plt.cm.Set3.colors[:len(values)])
for autotext in autotexts:
autotext.set_color('white')
autotext.set_fontsize(9)
autotext.set_fontweight('bold')
ax.set_title('数值型数据占比', fontsize=12, fontweight='bold', pad=15)
img_base64 = self._figure_to_base64(fig)
charts.append({
"type": "pie",
"title": "数值型数据占比",
"image": img_base64,
"data": [{"name": n, "value": v} for n, v in zip(names, values)]
})
except Exception as e:
logger.error(f"创建饼图失败: {str(e)}")
return charts
def _create_categorical_charts(self, categorical_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""创建分类数据图表"""
charts = []
# 提取数据
names = [item.get("name", f"{i}") for i, item in enumerate(categorical_data)]
counts = [item.get("count", 1) for item in categorical_data]
if not names or not counts:
return charts
# 水平条形图
try:
fig, ax = plt.subplots(figsize=(10, max(6, len(names) * 0.8)))
y_pos = np.arange(len(names))
bars = ax.barh(y_pos, counts, align='center', color='#10b981', alpha=0.8, edgecolor='black', linewidth=0.5)
# 添加数值标签
for bar, count in zip(bars, counts):
width = bar.get_width()
ax.text(width, bar.get_y() + bar.get_height() / 2.,
f'{count}',
ha='left', va='center', fontsize=10, fontweight='bold')
ax.set_yticks(y_pos)
ax.set_yticklabels(names, fontsize=10)
ax.invert_yaxis()
ax.set_xlabel('数量', fontsize=10, labelpad=10, fontweight='bold')
ax.set_title('分类数据分布', fontsize=12, fontweight='bold', pad=15)
ax.tick_params(axis='both', which='major', labelsize=9)
ax.grid(axis='x', alpha=0.3)
plt.tight_layout(pad=1.5)
img_base64 = self._figure_to_base64(fig)
charts.append({
"type": "barh",
"title": "分类数据分布",
"image": img_base64,
"data": [{"name": n, "count": c} for n, c in zip(names, counts)]
})
except Exception as e:
logger.error(f"创建分类图表失败: {str(e)}")
return charts
def _create_time_series_chart(self, time_series_data: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
"""创建时间序列图表"""
if not time_series_data:
return None
try:
names = [item.get("name", f"时间{i}") for i, item in enumerate(time_series_data)]
values = [item.get("value", 0) for item in time_series_data]
if len(values) < 2:
return None
fig, ax = plt.subplots(figsize=(14, 7))
# 绘制折线图和柱状图
x_pos = np.arange(len(names))
bars = ax.bar(x_pos, values, width=0.4, label='数值', color='#3b82f6', alpha=0.7)
# 添加折线
line = ax.plot(x_pos, values, 'o-', color='#ef4444', linewidth=2.5, markersize=8, label='趋势')
ax.set_xticks(x_pos)
ax.set_xticklabels(names, rotation=30, ha='right', fontsize=9)
ax.set_ylabel('数值', fontsize=10, labelpad=10, fontweight='bold')
ax.set_title('时间序列数据', fontsize=12, fontweight='bold', pad=15)
ax.legend(loc='best', fontsize=9)
ax.tick_params(axis='both', which='major', labelsize=9)
ax.grid(True, alpha=0.3)
plt.tight_layout(pad=1.5)
img_base64 = self._figure_to_base64(fig)
return {
"type": "time_series",
"title": "时间序列数据",
"image": img_base64,
"data": [{"name": n, "value": v} for n, v in zip(names, values)]
}
except Exception as e:
logger.error(f"创建时间序列图表失败: {str(e)}")
return None
def _create_comparison_chart(self, comparison_data: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
"""创建对比图表"""
if not comparison_data:
return None
try:
names = [item.get("name", f"对比{i}") for i, item in enumerate(comparison_data)]
values = [item.get("value", 0) for item in comparison_data]
fig, ax = plt.subplots(figsize=(10, 7))
# 区分正负值
colors = ['#10b981' if v >= 0 else '#ef4444' for v in values]
bars = ax.bar(names, values, color=colors, alpha=0.8, edgecolor='black', linewidth=0.8)
# 添加数值标签
for bar, value in zip(bars, values):
height = bar.get_height()
ax.text(bar.get_x() + bar.get_width() / 2., height,
f'{value:,.1f}',
ha='center', va='bottom' if value >= 0 else 'top',
fontsize=10, fontweight='bold')
# 添加零线
ax.axhline(y=0, color='black', linestyle='-', linewidth=1)
ax.set_ylabel('', fontsize=10, labelpad=10, fontweight='bold')
ax.set_title('对比数据', fontsize=12, fontweight='bold', pad=15)
ax.set_xticks(range(len(names)))
ax.set_xticklabels(names, rotation=30, ha='right', fontsize=9)
ax.tick_params(axis='both', which='major', labelsize=9)
plt.grid(axis='y', alpha=0.3)
plt.tight_layout(pad=1.5)
img_base64 = self._figure_to_base64(fig)
return {
"type": "comparison",
"title": "对比数据",
"image": img_base64,
"data": [{"name": n, "value": v} for n, v in zip(names, values)]
}
except Exception as e:
logger.error(f"创建对比图表失败: {str(e)}")
return None
def _create_table_preview(self, table_data: Dict[str, Any]) -> Dict[str, Any]:
"""创建表格预览数据"""
if not table_data:
return {}
columns = table_data.get("columns", [])
rows = table_data.get("rows", [])
return {
"columns": columns,
"rows": rows[:50], # 限制显示前50行
"total_rows": len(rows),
"preview_rows": min(50, len(rows))
}
def _create_numeric_summary(self, numeric_data: List[Dict[str, Any]]) -> Dict[str, Any]:
"""创建数值型数据摘要"""
values = [item.get("value", 0) for item in numeric_data if isinstance(item.get("value"), (int, float))]
if not values:
return {}
return {
"count": len(values),
"sum": float(sum(values)),
"mean": float(np.mean(values)),
"median": float(np.median(values)),
"min": float(min(values)),
"max": float(max(values)),
"std": float(np.std(values)) if len(values) > 1 else 0
}
def _figure_to_base64(self, fig) -> str:
"""将 matplotlib 图形转换为 base64 字符串"""
buf = io.BytesIO()
fig.savefig(
buf,
format='png',
dpi=120,
bbox_inches='tight',
pad_inches=0.3,
facecolor='white',
edgecolor='none',
transparent=False
)
plt.close(fig)
buf.seek(0)
img_base64 = base64.b64encode(buf.read()).decode('utf-8')
return f"data:image/png;base64,{img_base64}"
# 全局单例
chart_generator_service = ChartGeneratorService()
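For orientation, a sketch of the `structured_data` contract this service consumes, i.e. the shape `text_analysis_service.extract_structured_data` is prompted to return; the values below are invented:

```python
from app.services.chart_generator_service import chart_generator_service

sample = {
    "success": True,
    "data": {
        "numeric_data": [{"name": "销售额", "value": 123456.78, "unit": "元"}],
        "categorical_data": [{"name": "电子产品", "value": "电子产品", "count": 25}],
        "time_series_data": [{"name": "2025年1月", "value": 12345},
                             {"name": "2025年2月", "value": 15321}],
        "comparison_data": [],
        "table_data": None,
    },
    "metadata": {"total_items": 4, "data_types": ["numeric", "categorical", "time_series"]},
}
result = chart_generator_service.generate_charts_from_analysis(sample)
print(list(result["charts"].keys()) if result["success"] else result["error"])
```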

View File

@@ -0,0 +1,253 @@
"""
Excel AI 分析服务 - 集成 Excel 解析和 LLM 分析
"""
import logging
from typing import Dict, Any, Optional, List
from app.core.document_parser import XlsxParser
from app.services.file_service import file_service
from app.services.llm_service import llm_service
logger = logging.getLogger(__name__)
class ExcelAIService:
"""Excel AI 分析服务"""
def __init__(self):
self.parser = XlsxParser()
self.file_service = file_service
self.llm_service = llm_service
async def analyze_excel_file(
self,
file_content: bytes,
filename: str,
user_prompt: str = "",
analysis_type: str = "general",
parse_options: Optional[Dict[str, Any]] = None
) -> Dict[str, Any]:
"""
分析 Excel 文件
Args:
file_content: 文件内容字节
filename: 文件名
user_prompt: 用户自定义提示词
analysis_type: 分析类型
parse_options: 解析选项
Returns:
Dict[str, Any]: 分析结果
"""
# 1. 保存文件
try:
saved_path = self.file_service.save_uploaded_file(
file_content,
filename,
subfolder="excel"
)
logger.info(f"文件已保存: {saved_path}")
except Exception as e:
logger.error(f"文件保存失败: {str(e)}")
return {
"success": False,
"error": f"文件保存失败: {str(e)}",
"analysis": None
}
# 2. 解析 Excel 文件
try:
parse_options = parse_options or {}
parse_result = self.parser.parse(saved_path, **parse_options)
if not parse_result.success:
return {
"success": False,
"error": parse_result.error,
"analysis": None
}
excel_data = parse_result.data
logger.info(f"Excel 解析成功: {parse_result.metadata}")
except Exception as e:
logger.error(f"Excel 解析失败: {str(e)}")
return {
"success": False,
"error": f"Excel 解析失败: {str(e)}",
"analysis": None
}
# 3. 调用 LLM 进行分析
try:
# 如果有自定义提示词,使用模板分析
if user_prompt and user_prompt.strip():
llm_result = await self.llm_service.analyze_with_template(
excel_data,
user_prompt
)
else:
# 否则使用标准分析
llm_result = await self.llm_service.analyze_excel_data(
excel_data,
user_prompt,
analysis_type
)
logger.info(f"AI 分析完成: {llm_result['success']}")
# 4. 组合结果
return {
"success": True,
"excel": {
"data": excel_data,
"metadata": parse_result.metadata,
"saved_path": saved_path
},
"analysis": llm_result
}
except Exception as e:
logger.error(f"AI 分析失败: {str(e)}")
return {
"success": False,
"error": f"AI 分析失败: {str(e)}",
"excel": {
"data": excel_data,
"metadata": parse_result.metadata
},
"analysis": None
}
async def batch_analyze_sheets(
self,
file_content: bytes,
filename: str,
user_prompt: str = "",
analysis_type: str = "general"
) -> Dict[str, Any]:
"""
批量分析 Excel 文件的所有工作表
Args:
file_content: 文件内容字节
filename: 文件名
user_prompt: 用户自定义提示词
analysis_type: 分析类型
Returns:
Dict[str, Any]: 分析结果
"""
# 1. 保存文件
try:
saved_path = self.file_service.save_uploaded_file(
file_content,
filename,
subfolder="excel"
)
logger.info(f"文件已保存: {saved_path}")
except Exception as e:
logger.error(f"文件保存失败: {str(e)}")
return {
"success": False,
"error": f"文件保存失败: {str(e)}",
"analysis": None
}
# 2. 解析所有工作表
try:
parse_result = self.parser.parse_all_sheets(saved_path)
if not parse_result.success:
return {
"success": False,
"error": parse_result.error,
"analysis": None
}
sheets_data = parse_result.data.get("sheets", {})
logger.info(f"Excel 解析成功,共 {len(sheets_data)} 个工作表")
except Exception as e:
logger.error(f"Excel 解析失败: {str(e)}")
return {
"success": False,
"error": f"Excel 解析失败: {str(e)}",
"analysis": None
}
# 3. 批量分析每个工作表
sheet_analyses = {}
errors = {}
for sheet_name, sheet_data in sheets_data.items():
try:
# 调用 LLM 分析
if user_prompt and user_prompt.strip():
llm_result = await self.llm_service.analyze_with_template(
sheet_data,
user_prompt
)
else:
llm_result = await self.llm_service.analyze_excel_data(
sheet_data,
user_prompt,
analysis_type
)
sheet_analyses[sheet_name] = llm_result
if not llm_result["success"]:
errors[sheet_name] = llm_result.get("error", "未知错误")
logger.info(f"工作表 '{sheet_name}' 分析完成")
except Exception as e:
logger.error(f"工作表 '{sheet_name}' 分析失败: {str(e)}")
errors[sheet_name] = str(e)
# 4. 组合结果
return {
"success": len(errors) == 0,
"excel": {
"sheets": sheets_data,
"metadata": parse_result.metadata,
"saved_path": saved_path
},
"analysis": {
"sheets": sheet_analyses,
"total_sheets": len(sheets_data),
"successful": len(sheet_analyses) - len(errors),
"errors": errors
}
}
def get_supported_analysis_types(self) -> List[Dict[str, str]]:
"""获取支持的分析类型"""
return [
{
"value": "general",
"label": "综合分析",
"description": "提供数据概览、关键发现、质量评估和建议"
},
{
"value": "summary",
"label": "数据摘要",
"description": "快速了解数据的结构、范围和主要内容"
},
{
"value": "statistics",
"label": "统计分析",
"description": "数值型列的统计信息和分类列的分布"
},
{
"value": "insights",
"label": "深度洞察",
"description": "深入挖掘数据,提供异常值和业务建议"
}
]
# 全局单例
excel_ai_service = ExcelAIService()

View File

@@ -0,0 +1,132 @@
"""
文件服务模块 - 处理文件存储和读取
"""
import os
import shutil
from pathlib import Path
from datetime import datetime
from typing import Optional
import uuid
from app.config import settings
class FileService:
"""文件服务类,负责文件的存储、读取和管理"""
def __init__(self):
self.upload_dir = Path(settings.UPLOAD_DIR)
self._ensure_upload_dir()
def _ensure_upload_dir(self):
"""确保上传目录存在"""
self.upload_dir.mkdir(parents=True, exist_ok=True)
def save_uploaded_file(
self,
file_content: bytes,
filename: str,
subfolder: Optional[str] = None
) -> str:
"""
保存上传的文件
Args:
file_content: 文件内容字节
filename: 原始文件名
subfolder: 可选的子文件夹名称
Returns:
str: 保存后的文件路径
"""
# 生成唯一文件名,避免覆盖
file_ext = Path(filename).suffix
unique_name = f"{uuid.uuid4().hex}{file_ext}"
# 确定保存路径
if subfolder:
save_dir = self.upload_dir / subfolder
save_dir.mkdir(parents=True, exist_ok=True)
else:
save_dir = self.upload_dir
file_path = save_dir / unique_name
# 写入文件
with open(file_path, 'wb') as f:
f.write(file_content)
return str(file_path)
def read_file(self, file_path: str) -> bytes:
"""
读取文件内容
Args:
file_path: 文件路径
Returns:
bytes: 文件内容
"""
with open(file_path, 'rb') as f:
return f.read()
def delete_file(self, file_path: str) -> bool:
"""
删除文件
Args:
file_path: 文件路径
Returns:
bool: 是否删除成功
"""
try:
file = Path(file_path)
if file.exists():
file.unlink()
return True
return False
except Exception:
return False
def get_file_info(self, file_path: str) -> dict:
"""
获取文件信息
Args:
file_path: 文件路径
Returns:
dict: 文件信息
"""
file = Path(file_path)
if not file.exists():
return {}
stat = file.stat()
return {
"filename": file.name,
"filepath": str(file),
"size": stat.st_size,
"created": datetime.fromtimestamp(stat.st_ctime).isoformat(),
"modified": datetime.fromtimestamp(stat.st_mtime).isoformat(),
"extension": file.suffix.lower()
}
def get_file_size(self, file_path: str) -> int:
"""
获取文件大小(字节)
Args:
file_path: 文件路径
Returns:
int: 文件大小,文件不存在返回 0
"""
file = Path(file_path)
return file.stat().st_size if file.exists() else 0
# 全局单例
file_service = FileService()

View File

@@ -0,0 +1,105 @@
"""
字体辅助模块 - 处理中文字体检测和配置
"""
import matplotlib
import matplotlib.font_manager as fm
import platform
import os
from pathlib import Path
import logging
logger = logging.getLogger(__name__)
def get_chinese_font() -> str:
"""
获取可用的中文字体
Returns:
str: 可用的中文字体名称
"""
# 获取系统中所有可用字体
available_fonts = set([f.name for f in fm.fontManager.ttflist])
# 定义字体优先级列表
# Windows 优先
if platform.system() == 'Windows':
font_list = [
'Microsoft YaHei', # 微软雅黑
'SimHei', # 黑体
'SimSun', # 宋体
'KaiTi', # 楷体
'FangSong', # 仿宋
'STXihei', # 华文细黑
'STKaiti', # 华文楷体
'STSong', # 华文宋体
'STFangsong', # 华文仿宋
]
# macOS 优先
elif platform.system() == 'Darwin':
font_list = [
'PingFang SC', # 苹方-简
'PingFang TC', # 苹方-繁
'Heiti SC', # 黑体-简
'Heiti TC', # 黑体-繁
'STHeiti', # 华文黑体
'STSong', # 华文宋体
'STKaiti', # 华文楷体
'Arial Unicode MS', # Arial Unicode MS
]
# Linux 优先
else:
font_list = [
'Noto Sans CJK SC', # Noto Sans CJK 简体中文
'WenQuanYi Micro Hei', # 文泉驿微米黑
'AR PL UMing CN', # AR PL UMing
'AR PL UKai CN', # AR PL UKai
'ZCOOL XiaoWei', # ZCOOL 小薇
]
# 通用备选字体
font_list.extend([
'SimHei',
'Microsoft YaHei',
'Arial Unicode MS',
'Droid Sans Fallback',
])
# 查找第一个可用的字体
for font_name in font_list:
if font_name in available_fonts:
logger.info(f"找到中文字体: {font_name}")
return font_name
# 如果没找到,尝试获取第一个中文字体
for font in fm.fontManager.ttflist:
if 'CJK' in font.name or 'SC' in font.name or 'TC' in font.name:
logger.info(f"使用找到的中文字体: {font.name}")
return font.name
# 最终备选:使用系统默认字体
logger.warning("未找到合适的中文字体,使用默认字体")
return 'sans-serif'
def configure_matplotlib_fonts():
"""
配置 matplotlib 的字体设置
"""
chinese_font = get_chinese_font()
# 配置字体
matplotlib.rcParams['font.sans-serif'] = [chinese_font]
matplotlib.rcParams['axes.unicode_minus'] = False
matplotlib.rcParams['figure.dpi'] = 100
matplotlib.rcParams['savefig.dpi'] = 120
# 字体大小设置
matplotlib.rcParams['font.size'] = 10
matplotlib.rcParams['axes.labelsize'] = 10
matplotlib.rcParams['axes.titlesize'] = 11
matplotlib.rcParams['xtick.labelsize'] = 9
matplotlib.rcParams['ytick.labelsize'] = 9
matplotlib.rcParams['legend.fontsize'] = 9
logger.info(f"配置完成,使用字体: {chinese_font}")
return chinese_font

View File

@@ -0,0 +1,268 @@
"""
LLM 服务模块 - 封装大模型 API 调用
"""
import logging
from typing import Dict, Any, List, Optional
import httpx
from app.config import settings
logger = logging.getLogger(__name__)
class LLMService:
"""大语言模型服务类"""
def __init__(self):
self.api_key = settings.LLM_API_KEY
self.base_url = settings.LLM_BASE_URL
self.model_name = settings.LLM_MODEL_NAME
async def chat(
self,
messages: List[Dict[str, str]],
temperature: float = 0.7,
max_tokens: Optional[int] = None,
**kwargs
) -> Dict[str, Any]:
"""
调用聊天 API
Args:
messages: 消息列表,格式为 [{"role": "user", "content": "..."}]
temperature: 温度参数,控制随机性
max_tokens: 最大生成 token 数
**kwargs: 其他参数
Returns:
Dict[str, Any]: API 响应结果
"""
headers = {
"Authorization": f"Bearer {self.api_key}",
"Content-Type": "application/json"
}
payload = {
"model": self.model_name,
"messages": messages,
"temperature": temperature
}
if max_tokens:
payload["max_tokens"] = max_tokens
# 添加其他参数
payload.update(kwargs)
try:
async with httpx.AsyncClient(timeout=60.0) as client:
response = await client.post(
f"{self.base_url}/chat/completions",
headers=headers,
json=payload
)
response.raise_for_status()
return response.json()
except httpx.HTTPStatusError as e:
logger.error(f"LLM API 请求失败: {e.response.status_code} - {e.response.text}")
raise
except Exception as e:
logger.error(f"LLM API 调用异常: {str(e)}")
raise
def extract_message_content(self, response: Dict[str, Any]) -> str:
"""
从 API 响应中提取消息内容
Args:
response: API 响应
Returns:
str: 消息内容
"""
try:
return response["choices"][0]["message"]["content"]
except (KeyError, IndexError) as e:
logger.error(f"解析 API 响应失败: {str(e)}")
raise
async def analyze_excel_data(
self,
excel_data: Dict[str, Any],
user_prompt: str,
analysis_type: str = "general"
) -> Dict[str, Any]:
"""
分析 Excel 数据
Args:
excel_data: Excel 解析后的数据
user_prompt: 用户提示词
analysis_type: 分析类型 (general, summary, statistics, insights)
Returns:
Dict[str, Any]: 分析结果
"""
# 构建 Prompt
system_prompt = self._get_system_prompt(analysis_type)
user_message = self._format_user_message(excel_data, user_prompt)
messages = [
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_message}
]
try:
response = await self.chat(
messages=messages,
temperature=0.3, # 较低的温度以获得更稳定的输出
max_tokens=2000
)
content = self.extract_message_content(response)
return {
"success": True,
"analysis": content,
"model": self.model_name,
"analysis_type": analysis_type
}
except Exception as e:
logger.error(f"Excel 数据分析失败: {str(e)}")
return {
"success": False,
"error": str(e),
"analysis": None
}
def _get_system_prompt(self, analysis_type: str) -> str:
"""获取系统提示词"""
prompts = {
"general": """你是一个专业的数据分析师。请分析用户提供的 Excel 数据,提供有价值的见解和建议。
请按照以下格式输出:
1. 数据概览
2. 关键发现
3. 数据质量评估
4. 建议
输出语言:中文""",
"summary": """你是一个专业的数据分析师。请对用户提供的 Excel 数据进行简洁的总结。
输出格式:
- 数据行数和列数
- 主要列的说明
- 数据范围概述
输出语言:中文""",
"statistics": """你是一个专业的数据分析师。请对用户提供的 Excel 数据进行统计分析。
请分析:
- 数值型列的统计信息(平均值、中位数、最大值、最小值)
- 分类列的分布情况
- 数据相关性
输出语言:中文,使用表格或结构化格式展示""",
"insights": """你是一个专业的数据分析师。请深入挖掘用户提供的 Excel 数据,提供有价值的洞察。
请分析:
1. 数据中的异常值或特殊模式
2. 数据之间的潜在关联
3. 基于数据的业务建议
4. 数据趋势分析(如适用)
输出语言:中文,提供详细且可操作的建议"""
}
return prompts.get(analysis_type, prompts["general"])
def _format_user_message(self, excel_data: Dict[str, Any], user_prompt: str) -> str:
"""格式化用户消息"""
columns = excel_data.get("columns", [])
rows = excel_data.get("rows", [])
row_count = excel_data.get("row_count", 0)
column_count = excel_data.get("column_count", 0)
# 构建数据描述
data_info = f"""
Excel 数据概览:
- 行数: {row_count}
- 列数: {column_count}
- 列名: {', '.join(columns)}
数据样例(前 5 行):
"""
# 添加数据样例
for i, row in enumerate(rows[:5], 1):
row_str = " | ".join([f"{col}: {row.get(col, '')}" for col in columns])
data_info += f"{i} 行: {row_str}\n"
if row_count > 5:
data_info += f"\n(还有 {row_count - 5} 行数据...)\n"
# 添加用户自定义提示
if user_prompt and user_prompt.strip():
data_info += f"\n用户需求:\n{user_prompt}"
else:
data_info += "\n用户需求: 请对上述数据进行分析"
return data_info
async def analyze_with_template(
self,
excel_data: Dict[str, Any],
template_prompt: str
) -> Dict[str, Any]:
"""
使用自定义模板分析 Excel 数据
Args:
excel_data: Excel 解析后的数据
template_prompt: 自定义提示词模板
Returns:
Dict[str, Any]: 分析结果
"""
system_prompt = """你是一个专业的数据分析师。请根据用户提供的自定义提示词分析 Excel 数据。
请严格按照用户的要求进行分析,输出清晰、有条理的结果。
输出语言:中文"""
user_message = self._format_user_message(excel_data, template_prompt)
messages = [
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_message}
]
try:
response = await self.chat(
messages=messages,
temperature=0.5,
max_tokens=3000
)
content = self.extract_message_content(response)
return {
"success": True,
"analysis": content,
"model": self.model_name,
"is_template": True
}
except Exception as e:
logger.error(f"自定义模板分析失败: {str(e)}")
return {
"success": False,
"error": str(e),
"analysis": None
}
# 全局单例
llm_service = LLMService()
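A sketch of awaiting the service from a script, assuming `LLM_API_KEY`, `LLM_BASE_URL` and `LLM_MODEL_NAME` in `settings` point at an OpenAI-compatible `/chat/completions` endpoint (which is what `chat()` posts to); the sample rows are invented:

```python
import asyncio
from app.services.llm_service import llm_service

async def demo():
    excel_data = {  # mirrors the XlsxParser._df_to_dict output shape
        "columns": ["品类", "数量"],
        "rows": [{"品类": "A", "数量": 3}, {"品类": "B", "数量": 5}],
        "row_count": 2,
        "column_count": 2,
    }
    result = await llm_service.analyze_excel_data(excel_data, "", "summary")
    print(result["analysis"] if result["success"] else result["error"])

asyncio.run(demo())
```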

View File

@@ -0,0 +1,218 @@
"""
文本分析服务 - 从 AI 分析结果中提取结构化数据用于可视化
"""
import logging
from typing import Dict, Any, List, Optional
import re
import json
from app.services.llm_service import llm_service
logger = logging.getLogger(__name__)
class TextAnalysisService:
"""文本分析服务类"""
def __init__(self):
self.llm_service = llm_service
async def extract_structured_data(
self,
analysis_text: str,
original_filename: str = "",
file_type: str = "text"
) -> Dict[str, Any]:
"""
从 AI 分析结果文本中提取结构化数据
Args:
analysis_text: AI 分析结果文本
original_filename: 原始文件名
file_type: 文件类型
Returns:
Dict[str, Any]: 提取的结构化数据
"""
# 限制分析的文本长度,避免 token 超限
max_text_length = 8000
truncated_text = analysis_text[:max_text_length]
system_prompt = """你是一个专业的数据提取助手。你的任务是从AI分析结果中提取结构化数据用于生成图表。
请按照以下要求提取数据:
1. 数值型数据:
- 提取所有的数值、统计信息、百分比等
- 为每个数值创建一个条目,包含:名称、值、单位(如果有)
- 格式示例:{"name": "销售额", "value": 123456.78, "unit": ""}
2. 分类数据:
- 提取所有的类别、状态、枚举值等
- 为每个类别创建一个条目,包含:名称、值、数量(如果有)
- 格式示例:{"name": "产品类别", "value": "电子产品", "count": 25}
3. 时间序列数据:
- 提取所有的时间相关数据(年月、季度、日期等)
- 格式示例:{"name": "2025年1月", "value": 12345}
4. 对比数据:
- 提取所有的对比、排名、趋势等数据
- 格式示例:{"name": "同比增长", "value": 15.3, "unit": "%"}
5. 表格数据:
- 如果分析结果中包含表格或列表形式的数据,提取出来
- 格式:{"columns": ["列1", "列2"], "rows": [{"列1": "值1", "列2": "值2"}]}
重要规则:
- 只提取明确提到的数据和数值
- 如果某种类型的数据不存在,返回空数组 []
- 确保所有数值都是有效的数字类型
- 保持数据的原始精度
- 返回的 JSON 必须完整且格式正确
- 表格数据最多提取 20 行
请以 JSON 格式返回,不要添加任何 Markdown 标记或解释文字,只返回纯 JSON
{
"success": true,
"data": {
"numeric_data": [
{"name": string, "value": number, "unit": string|null}
],
"categorical_data": [
{"name": string, "value": string, "count": number|null}
],
"time_series_data": [
{"name": string, "value": number}
],
"comparison_data": [
{"name": string, "value": number, "unit": string|null}
],
"table_data": {
"columns": string[],
"rows": object[]
} | null
},
"metadata": {
"total_items": number,
"data_types": string[]
}
}"""
user_message = f"""请从以下 AI 分析结果中提取结构化数据:
原始文件名:{original_filename}
文件类型:{file_type}
AI 分析结果:
{truncated_text}
请按照系统提示的要求提取数据并返回纯 JSON 格式。"""
messages = [
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_message}
]
try:
logger.info(f"开始提取结构化数据,文本长度: {len(truncated_text)}")
response = await self.llm_service.chat(
messages=messages,
temperature=0.1,
max_tokens=4000
)
content = self.llm_service.extract_message_content(response)
logger.info(f"LLM 返回内容长度: {len(content)}")
# 使用简单的方法提取 JSON
result = self._extract_json_simple(content)
if not result:
logger.error("无法从 LLM 响应中提取有效的 JSON")
return {
"success": False,
"error": "AI 返回的数据格式不正确或被截断",
"raw_content": content[:500]
}
logger.info(f"成功提取结构化数据")
return result
except Exception as e:
logger.error(f"提取结构化数据失败: {str(e)}")
return {
"success": False,
"error": str(e)
}
def _extract_json_simple(self, content: str) -> Optional[Dict[str, Any]]:
"""
简化的 JSON 提取方法
Args:
content: LLM 返回的内容
Returns:
Optional[Dict[str, Any]]: 解析后的 JSON,解析失败则返回 None
"""
try:
# 方法 1: 查找 ```json 代码块
code_block_match = re.search(r'```json\s*([\s\S]*?)\s*```', content)
if code_block_match:
json_str = code_block_match.group(1)
logger.info("从代码块中提取 JSON")
return json.loads(json_str)
# 方法 2: 查找第一个完整的 { } 对象
brace_count = 0
json_start = -1
for i in range(len(content)):
if content[i] == '{':
if brace_count == 0:
json_start = i
brace_count += 1
elif content[i] == '}':
brace_count -= 1
if brace_count == 0:
# 找到了完整的 JSON 对象
json_end = i + 1
json_str = content[json_start:json_end]
logger.info(f"从大括号中提取 JSON")
return json.loads(json_str)
# 方法 3: 尝试直接解析
logger.info("尝试直接解析整个内容")
return json.loads(content)
except json.JSONDecodeError as e:
logger.error(f"JSON 解析失败: {str(e)}")
logger.error(f"原始内容(前 500 字符): {content[:500]}...")
return None
except Exception as e:
logger.error(f"提取 JSON 失败: {str(e)}")
return None
def detect_data_types(self, data: Dict[str, Any]) -> List[str]:
"""检测数据中包含的类型"""
types = []
d = data.get("data", {})
if d.get("numeric_data") and len(d["numeric_data"]) > 0:
types.append("numeric")
if d.get("categorical_data") and len(d["categorical_data"]) > 0:
types.append("categorical")
if d.get("time_series_data") and len(d["time_series_data"]) > 0:
types.append("time_series")
if d.get("comparison_data") and len(d["comparison_data"]) > 0:
types.append("comparison")
if d.get("table_data") and d["table_data"]:
types.append("table")
return types
# 全局单例
text_analysis_service = TextAnalysisService()

View File

@@ -0,0 +1,388 @@
"""
数据可视化服务 - 使用 matplotlib/plotly 生成统计图表
"""
import io
import base64
import logging
from typing import Dict, Any, List, Optional, Union
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
# 使用字体辅助模块配置中文字体
from app.services.font_helper import configure_matplotlib_fonts
configure_matplotlib_fonts()
logger = logging.getLogger(__name__)
class VisualizationService:
"""数据可视化服务类"""
def __init__(self):
self.output_dir = Path(__file__).resolve().parent.parent.parent / "data" / "charts"
self.output_dir.mkdir(parents=True, exist_ok=True)
def analyze_and_visualize(
self,
excel_data: Dict[str, Any],
analysis_type: str = "statistics"
) -> Dict[str, Any]:
"""
分析数据并生成可视化图表
Args:
excel_data: Excel 解析后的数据
analysis_type: 分析类型
Returns:
Dict[str, Any]: 包含图表数据和统计信息的结果
"""
try:
columns = excel_data.get("columns", [])
rows = excel_data.get("rows", [])
if not columns or not rows:
return {
"success": False,
"error": "没有数据可用于分析"
}
# 转换为 DataFrame
df = pd.DataFrame(rows, columns=columns)
# 根据列类型分类
numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_columns = df.select_dtypes(exclude=[np.number]).columns.tolist()
# 生成统计信息
statistics = self._generate_statistics(df, numeric_columns, categorical_columns)
# 生成图表
charts = self._generate_charts(df, numeric_columns, categorical_columns)
# 生成数据分布信息
distributions = self._generate_distributions(df, categorical_columns)
return {
"success": True,
"statistics": statistics,
"charts": charts,
"distributions": distributions,
"row_count": len(df),
"column_count": len(columns)
}
except Exception as e:
logger.error(f"可视化分析失败: {str(e)}", exc_info=True)
return {
"success": False,
"error": str(e)
}
def _generate_statistics(
self,
df: pd.DataFrame,
numeric_columns: List[str],
categorical_columns: List[str]
) -> Dict[str, Any]:
"""生成统计信息"""
statistics = {
"numeric": {},
"categorical": {}
}
# 数值型列统计
for col in numeric_columns:
try:
stats = {
"count": int(df[col].count()),
"mean": float(df[col].mean()),
"median": float(df[col].median()),
"std": float(df[col].std()) if df[col].count() > 1 else 0,
"min": float(df[col].min()),
"max": float(df[col].max()),
"q25": float(df[col].quantile(0.25)),
"q75": float(df[col].quantile(0.75)),
"missing": int(df[col].isna().sum())
}
statistics["numeric"][col] = stats
except Exception as e:
logger.warning(f"{col} 统计失败: {str(e)}")
# 分类型列统计
for col in categorical_columns:
try:
value_counts = df[col].value_counts()
stats = {
"unique": int(df[col].nunique()),
"most_common": str(value_counts.index[0]) if len(value_counts) > 0 else "",
"most_common_count": int(value_counts.iloc[0]) if len(value_counts) > 0 else 0,
"missing": int(df[col].isna().sum()),
"distribution": {str(k): int(v) for k, v in value_counts.items()}
}
statistics["categorical"][col] = stats
except Exception as e:
logger.warning(f"{col} 统计失败: {str(e)}")
return statistics
def _generate_charts(
self,
df: pd.DataFrame,
numeric_columns: List[str],
categorical_columns: List[str]
) -> Dict[str, Any]:
"""生成图表"""
charts = {}
# 1. 数值型列的直方图
charts["histograms"] = []
for col in numeric_columns[:5]: # 限制最多 5 个数值列
chart_data = self._create_histogram(df[col], col)
if chart_data:
charts["histograms"].append(chart_data)
# 2. 分类型列的条形图
charts["bar_charts"] = []
for col in categorical_columns[:5]: # 限制最多 5 个分类型列
chart_data = self._create_bar_chart(df[col], col)
if chart_data:
charts["bar_charts"].append(chart_data)
# 3. 数值型列的箱线图
charts["box_plots"] = []
if len(numeric_columns) > 0:
chart_data = self._create_box_plot(df[numeric_columns[:5]], numeric_columns[:5])
if chart_data:
charts["box_plots"].append(chart_data)
# 4. 相关性热力图
if len(numeric_columns) >= 2:
chart_data = self._create_correlation_heatmap(df[numeric_columns], numeric_columns)
if chart_data:
charts["correlation"] = chart_data
return charts
def _create_histogram(self, series: pd.Series, column_name: str) -> Optional[Dict[str, Any]]:
"""创建直方图"""
try:
fig, ax = plt.subplots(figsize=(11, 7))
ax.hist(series.dropna(), bins=20, edgecolor='black', alpha=0.7, color='#3b82f6')
ax.set_xlabel(column_name, fontsize=10, labelpad=10)
ax.set_ylabel('频数', fontsize=10, labelpad=10)
ax.set_title(f'{column_name} 分布', fontsize=12, fontweight='bold', pad=15)
ax.grid(True, alpha=0.3, axis='y')
ax.tick_params(axis='both', which='major', labelsize=9)
# 改进布局
plt.tight_layout(pad=1.5, w_pad=1.0, h_pad=1.0)
# 转换为 base64
img_base64 = self._figure_to_base64(fig)
return {
"type": "histogram",
"column": column_name,
"image": img_base64,
"stats": {
"mean": float(series.mean()),
"median": float(series.median()),
"std": float(series.std()) if len(series) > 1 else 0
}
}
except Exception as e:
logger.error(f"创建直方图失败 ({column_name}): {str(e)}")
return None
def _create_bar_chart(self, series: pd.Series, column_name: str) -> Optional[Dict[str, Any]]:
"""创建条形图"""
try:
value_counts = series.value_counts().head(10) # 只显示前 10 个
fig, ax = plt.subplots(figsize=(12, 7))
# 处理标签显示
labels = [str(x)[:15] + '...' if len(str(x)) > 15 else str(x) for x in value_counts.index]
x_pos = range(len(value_counts))
bars = ax.bar(x_pos, value_counts.values, color='#10b981', alpha=0.8, edgecolor='black', linewidth=0.5)
ax.set_xticks(x_pos)
ax.set_xticklabels(labels, rotation=30, ha='right', fontsize=8)
ax.set_xlabel(column_name, fontsize=10, labelpad=10)
ax.set_ylabel('数量', fontsize=10, labelpad=10)
ax.set_title(f'{column_name} 分布 (Top 10)', fontsize=12, fontweight='bold', pad=15)
ax.grid(True, alpha=0.3, axis='y')
ax.tick_params(axis='both', which='major', labelsize=9)
# 添加数值标签(位置稍微上移)
max_val = value_counts.values.max()
y_offset = max_val * 0.02 if max_val > 0 else 0.5
for bar, value in zip(bars, value_counts.values):
ax.text(bar.get_x() + bar.get_width() / 2., value + y_offset,
f'{int(value)}',
ha='center', va='bottom', fontsize=8, fontweight='bold')
# 改进布局
plt.tight_layout(pad=1.5, w_pad=1.0, h_pad=1.0)
# 转换为 base64
img_base64 = self._figure_to_base64(fig)
return {
"type": "bar_chart",
"column": column_name,
"image": img_base64,
"categories": {str(k): int(v) for k, v in value_counts.items()}
}
except Exception as e:
logger.error(f"创建条形图失败 ({column_name}): {str(e)}")
return None
def _create_box_plot(self, df: pd.DataFrame, columns: List[str]) -> Optional[Dict[str, Any]]:
"""创建箱线图"""
try:
fig, ax = plt.subplots(figsize=(14, 7))
# 准备数据
box_data = [df[col].dropna() for col in columns]
bp = ax.boxplot(box_data, labels=columns, patch_artist=True,
notch=True, showcaps=True, showfliers=True)
# 美化箱线图
box_colors = ['#3b82f6', '#10b981', '#f59e0b', '#ef4444', '#8b5cf6']
for patch, color in zip(bp['boxes'], box_colors[:len(bp['boxes'])]):
patch.set_facecolor(color)
patch.set_alpha(0.6)
patch.set_linewidth(1.5)
# 设置其他元素样式
for element in ['whiskers', 'fliers', 'means', 'medians', 'caps']:
plt.setp(bp[element], linewidth=1.5)
ax.set_ylabel('', fontsize=10, labelpad=10)
ax.set_title('数值型列分布对比', fontsize=12, fontweight='bold', pad=15)
ax.grid(True, alpha=0.3, axis='y')
# 旋转 x 轴标签以避免重叠
plt.setp(ax.get_xticklabels(), rotation=30, ha='right', fontsize=9)
ax.tick_params(axis='both', which='major', labelsize=9)
# 改进布局
plt.tight_layout(pad=1.5, w_pad=1.5, h_pad=1.0)
# 转换为 base64
img_base64 = self._figure_to_base64(fig)
return {
"type": "box_plot",
"columns": columns,
"image": img_base64
}
except Exception as e:
logger.error(f"创建箱线图失败: {str(e)}")
return None
def _create_correlation_heatmap(self, df: pd.DataFrame, columns: List[str]) -> Optional[Dict[str, Any]]:
"""创建相关性热力图"""
try:
# 计算相关系数
corr = df.corr()
fig, ax = plt.subplots(figsize=(11, 9))
im = ax.imshow(corr, cmap='RdBu_r', aspect='auto', vmin=-1, vmax=1)
# 设置刻度
n_cols = len(corr)
ax.set_xticks(np.arange(n_cols))
ax.set_yticks(np.arange(n_cols))
# 处理过长的列名
x_labels = [str(col)[:10] + '...' if len(str(col)) > 10 else str(col) for col in corr.columns]
y_labels = [str(col)[:10] + '...' if len(str(col)) > 10 else str(col) for col in corr.columns]
ax.set_xticklabels(x_labels, rotation=30, ha='right', fontsize=9)
ax.set_yticklabels(y_labels, fontsize=9)
# 添加数值标签,根据相关性值选择颜色
for i in range(n_cols):
for j in range(n_cols):
value = corr.iloc[i, j]
# 根据背景色深浅选择文字颜色
text_color = 'white' if abs(value) > 0.5 else 'black'
ax.text(j, i, f'{value:.2f}',
ha="center", va="center", color=text_color,
fontsize=8, fontweight='bold' if abs(value) > 0.7 else 'normal')
ax.set_title('数值型列相关性热力图', fontsize=12, fontweight='bold', pad=15)
ax.tick_params(axis='both', which='major', labelsize=9)
# 添加颜色条
cbar = plt.colorbar(im, ax=ax)
cbar.set_label('相关系数', rotation=270, labelpad=20, fontsize=10)
cbar.ax.tick_params(labelsize=9)
# 改进布局
plt.tight_layout(pad=2.0, w_pad=1.0, h_pad=1.0)
# 转换为 base64
img_base64 = self._figure_to_base64(fig)
return {
"type": "correlation_heatmap",
"columns": columns,
"image": img_base64,
"correlation_matrix": corr.to_dict()
}
except Exception as e:
logger.error(f"创建相关性热力图失败: {str(e)}")
return None
def _generate_distributions(
self,
df: pd.DataFrame,
categorical_columns: List[str]
) -> Dict[str, Any]:
"""生成数据分布信息"""
distributions = {}
for col in categorical_columns[:5]:
try:
value_counts = df[col].value_counts()
total = len(df)
distributions[col] = {
"categories": {str(k): int(v) for k, v in value_counts.items()},
"percentages": {str(k): round(v / total * 100, 2) for k, v in value_counts.items()},
"unique_count": len(value_counts)
}
except Exception as e:
logger.warning(f"{col} 分布生成失败: {str(e)}")
return distributions
def _figure_to_base64(self, fig) -> str:
"""将 matplotlib 图形转换为 base64 字符串"""
buf = io.BytesIO()
fig.savefig(
buf,
format='png',
dpi=120,
bbox_inches='tight',
pad_inches=0.3,
facecolor='white',
edgecolor='none',
transparent=False
)
plt.close(fig)
buf.seek(0)
img_base64 = base64.b64encode(buf.read()).decode('utf-8')
return f"data:image/png;base64,{img_base64}"
# 全局单例
visualization_service = VisualizationService()

View File

@@ -103,6 +103,21 @@ git config user.email #同上
#如果想看全局的,可以加上 --global,例如 git config --global user.name
```
## 启动后端项目
在终端输入以下命令:
```bash
cd backend  # 确保启动时在后端根目录下
./venv/Scripts/python.exe -m uvicorn app.main:app --host 127.0.0.1 --port 8000 --reload  # 启动后端项目
```
先启动后端项目,再启动前端项目
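启动成功后,可用如下示意脚本快速确认服务在线(假设 `API_V1_STR` 为 `/api/v1`,以实际配置为准):

```python
import httpx

print(httpx.get("http://127.0.0.1:8000/health").json())                    # {"status": "healthy", ...}
print(httpx.get("http://127.0.0.1:8000/api/v1/openapi.json").status_code)  # 路由挂载成功时返回 200
```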
记得在你的.gitignore中添加
```
/backend/data/uploads
/backend/data/charts
```
## 预计项目结构:
```bash

View File

@@ -1,6 +1,7 @@
fastapi[all]==0.104.1
uvicorn[standard]==0.24.0
pydantic==2.5.0
pydantic-settings==2.1.0
python-multipart==0.0.6
pymongo==4.5.0
redis==5.0.0
@@ -10,6 +11,8 @@ faiss-cpu==1.8.0
python-docx==0.8.11
pandas==2.1.4
openpyxl==3.1.2
matplotlib==3.8.2
numpy==1.26.2
markdown==3.5.1
langchain==0.1.0
langchain-community==0.0.10
@@ -18,5 +21,4 @@ httpx==0.25.2
python-dotenv==1.0.0
loguru==0.7.2
tqdm==4.66.1
PyYAML==6.0.1

View File

@@ -1 +0,0 @@
print("Hello,World")

View File

@@ -1 +0,0 @@
print("Hello world!")

View File

@@ -1 +0,0 @@
print("hello,world!")

View File

@@ -0,0 +1,71 @@
"""
测试字体配置是否正常工作
"""
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
from app.services.font_helper import configure_matplotlib_fonts
import io
import base64
# 配置字体
font_name = configure_matplotlib_fonts()
print(f"当前使用字体: {font_name}")
print(f"matplotlib 中文字体设置: {matplotlib.rcParams['font.sans-serif']}")
# 创建测试图表
fig, ax = plt.subplots(figsize=(10, 6))
# 测试数据
x = ['销售', '库存', '采购', '退货', '其他']
y = [150, 200, 180, 50, 30]
bars = ax.bar(x, y, color='#3b82f6', alpha=0.8)
ax.set_xlabel('类别', fontsize=12, labelpad=10)
ax.set_ylabel('数值', fontsize=12, labelpad=10)
ax.set_title('测试图表 - 中文显示', fontsize=14, fontweight='bold', pad=15)
ax.tick_params(axis='both', which='major', labelsize=10)
# 添加数值标签
for bar, value in zip(bars, y):
height = bar.get_height()
ax.text(bar.get_x() + bar.get_width() / 2., height,
f'{value}',
ha='center', va='bottom', fontsize=10, fontweight='bold')
plt.grid(axis='y', alpha=0.3)
plt.tight_layout(pad=1.5)
# 转换为 base64
buf = io.BytesIO()
fig.savefig(buf, format='png', dpi=120, bbox_inches='tight', pad_inches=0.3, facecolor='white')
plt.close(fig)
buf.seek(0)
img_base64 = base64.b64encode(buf.read()).decode('utf-8')
data_url = f"data:image/png;base64,{img_base64}"
print("\n=== 测试完成 ===")
print(f"图表大小: {len(img_base64)} 字符")
print("如果看到字体警告,请检查系统是否有安装中文字体")
# 尝试获取所有可用字体
import matplotlib.font_manager as fm
available_fonts = set([f.name for f in fm.fontManager.ttflist])
print(f"\n=== 可用字体列表(部分)===")
chinese_fonts = [f for f in available_fonts if 'CJK' in f or 'Chinese' in f or 'YaHei' in f or 'SimHei' in f or 'PingFang' in f]
for font in sorted(chinese_fonts)[:10]:
print(f" - {font}")
if not chinese_fonts:
print(" 未找到中文字体!")
print("\n=== 推荐安装的中文字体 ===")
print("Windows: Microsoft YaHei (系统自带)")
print("macOS: PingFang SC (系统自带)")
print("Linux: fonts-noto-cjk 或 fonts-wqy-zenhei")
print("\n=== 生成的 base64 数据前100字符===")
print(data_url[:100] + "...")