前后端基本架构和完整excel表的解析及统计图表的生成以及excel表的导出
This commit is contained in:
Binary file not shown.
@@ -0,0 +1,14 @@
|
||||
"""
|
||||
API 路由注册模块
|
||||
"""
|
||||
from fastapi import APIRouter
|
||||
from app.api.endpoints import upload, ai_analyze, visualization, analysis_charts
|
||||
|
||||
# 创建主路由
|
||||
api_router = APIRouter()
|
||||
|
||||
# 注册各模块路由
|
||||
api_router.include_router(upload.router)
|
||||
api_router.include_router(ai_analyze.router)
|
||||
api_router.include_router(visualization.router)
|
||||
api_router.include_router(analysis_charts.router)
|
||||
|
||||
BIN
backend/app/api/__pycache__/__init__.cpython-312.pyc
Normal file
BIN
backend/app/api/__pycache__/__init__.cpython-312.pyc
Normal file
Binary file not shown.
BIN
backend/app/api/endpoints/__pycache__/ai_analyze.cpython-312.pyc
Normal file
BIN
backend/app/api/endpoints/__pycache__/ai_analyze.cpython-312.pyc
Normal file
Binary file not shown.
Binary file not shown.
BIN
backend/app/api/endpoints/__pycache__/upload.cpython-312.pyc
Normal file
BIN
backend/app/api/endpoints/__pycache__/upload.cpython-312.pyc
Normal file
Binary file not shown.
BIN
backend/app/api/endpoints/__pycache__/upload.cpython-313.pyc
Normal file
BIN
backend/app/api/endpoints/__pycache__/upload.cpython-313.pyc
Normal file
Binary file not shown.
Binary file not shown.
144
backend/app/api/endpoints/ai_analyze.py
Normal file
144
backend/app/api/endpoints/ai_analyze.py
Normal file
@@ -0,0 +1,144 @@
|
||||
"""
|
||||
AI 分析 API 接口
|
||||
"""
|
||||
from fastapi import APIRouter, UploadFile, File, HTTPException, Query, Body
|
||||
from typing import Optional
|
||||
import logging
|
||||
|
||||
from app.services.excel_ai_service import excel_ai_service
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
router = APIRouter(prefix="/ai", tags=["AI 分析"])
|
||||
|
||||
|
||||
@router.post("/analyze/excel")
|
||||
async def analyze_excel(
|
||||
file: UploadFile = File(...),
|
||||
user_prompt: str = Query("", description="用户自定义提示词"),
|
||||
analysis_type: str = Query("general", description="分析类型: general, summary, statistics, insights"),
|
||||
parse_all_sheets: bool = Query(False, description="是否分析所有工作表")
|
||||
):
|
||||
"""
|
||||
上传并使用 AI 分析 Excel 文件
|
||||
|
||||
Args:
|
||||
file: 上传的 Excel 文件
|
||||
user_prompt: 用户自定义提示词
|
||||
analysis_type: 分析类型
|
||||
parse_all_sheets: 是否分析所有工作表
|
||||
|
||||
Returns:
|
||||
dict: 分析结果,包含 Excel 数据和 AI 分析结果
|
||||
"""
|
||||
# 检查文件类型
|
||||
if not file.filename:
|
||||
raise HTTPException(status_code=400, detail="文件名为空")
|
||||
|
||||
file_ext = file.filename.split('.')[-1].lower()
|
||||
if file_ext not in ['xlsx', 'xls']:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"不支持的文件类型: {file_ext},仅支持 .xlsx 和 .xls"
|
||||
)
|
||||
|
||||
# 验证分析类型
|
||||
supported_types = ['general', 'summary', 'statistics', 'insights']
|
||||
if analysis_type not in supported_types:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"不支持的分析类型: {analysis_type},支持的类型: {', '.join(supported_types)}"
|
||||
)
|
||||
|
||||
try:
|
||||
# 读取文件内容
|
||||
content = await file.read()
|
||||
|
||||
logger.info(f"开始分析文件: {file.filename}, 分析类型: {analysis_type}")
|
||||
|
||||
# 调用 AI 分析服务
|
||||
if parse_all_sheets:
|
||||
result = await excel_ai_service.batch_analyze_sheets(
|
||||
content,
|
||||
file.filename,
|
||||
user_prompt=user_prompt,
|
||||
analysis_type=analysis_type
|
||||
)
|
||||
else:
|
||||
# 解析选项
|
||||
parse_options = {"header_row": 0}
|
||||
|
||||
result = await excel_ai_service.analyze_excel_file(
|
||||
content,
|
||||
file.filename,
|
||||
user_prompt=user_prompt,
|
||||
analysis_type=analysis_type,
|
||||
parse_options=parse_options
|
||||
)
|
||||
|
||||
logger.info(f"文件分析完成: {file.filename}, 成功: {result['success']}")
|
||||
|
||||
return result
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"AI 分析过程中出错: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail=f"分析失败: {str(e)}")
|
||||
|
||||
|
||||
@router.get("/analysis/types")
|
||||
async def get_analysis_types():
|
||||
"""
|
||||
获取支持的分析类型列表
|
||||
|
||||
Returns:
|
||||
list: 支持的分析类型
|
||||
"""
|
||||
return {
|
||||
"types": excel_ai_service.get_supported_analysis_types()
|
||||
}
|
||||
|
||||
|
||||
@router.post("/analyze/text")
|
||||
async def analyze_text(
|
||||
excel_data: dict = Body(..., description="Excel 解析后的数据"),
|
||||
user_prompt: str = Body("", description="用户提示词"),
|
||||
analysis_type: str = Body("general", description="分析类型")
|
||||
):
|
||||
"""
|
||||
对已解析的 Excel 数据进行 AI 分析
|
||||
|
||||
Args:
|
||||
excel_data: Excel 数据
|
||||
user_prompt: 用户提示词
|
||||
analysis_type: 分析类型
|
||||
|
||||
Returns:
|
||||
dict: 分析结果
|
||||
"""
|
||||
try:
|
||||
logger.info(f"开始文本分析, 分析类型: {analysis_type}")
|
||||
|
||||
# 调用 LLM 服务
|
||||
from app.services.llm_service import llm_service
|
||||
|
||||
if user_prompt and user_prompt.strip():
|
||||
result = await llm_service.analyze_with_template(
|
||||
excel_data,
|
||||
user_prompt
|
||||
)
|
||||
else:
|
||||
result = await llm_service.analyze_excel_data(
|
||||
excel_data,
|
||||
user_prompt,
|
||||
analysis_type
|
||||
)
|
||||
|
||||
logger.info(f"文本分析完成, 成功: {result['success']}")
|
||||
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"文本分析失败: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail=f"分析失败: {str(e)}")
|
||||
105
backend/app/api/endpoints/analysis_charts.py
Normal file
105
backend/app/api/endpoints/analysis_charts.py
Normal file
@@ -0,0 +1,105 @@
|
||||
"""
|
||||
分析结果图表 API - 根据文本分析结果生成图表
|
||||
"""
|
||||
from fastapi import APIRouter, HTTPException
|
||||
from pydantic import BaseModel
|
||||
from typing import Optional
|
||||
import logging
|
||||
|
||||
from app.services.text_analysis_service import text_analysis_service
|
||||
from app.services.chart_generator_service import chart_generator_service
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
router = APIRouter(prefix="/analysis", tags=["分析结果图表"])
|
||||
|
||||
|
||||
class AnalysisChartRequest(BaseModel):
    """Request body for chart generation from an analysis text."""

    # The AI analysis text to extract chart data from.
    analysis_text: str
    # Name of the file the analysis came from, if any.
    original_filename: Optional[str] = ""
    # Kind of the original source; defaults to plain text.
    file_type: Optional[str] = "text"
|
||||
|
||||
|
||||
@router.post("/extract-and-chart")
|
||||
async def extract_and_generate_charts(request: AnalysisChartRequest):
|
||||
"""
|
||||
从 AI 分析结果中提取数据并生成图表
|
||||
|
||||
Args:
|
||||
request: 包含分析文本的请求
|
||||
|
||||
Returns:
|
||||
dict: 包含图表数据的结果
|
||||
"""
|
||||
if not request.analysis_text or not request.analysis_text.strip():
|
||||
raise HTTPException(status_code=400, detail="分析文本不能为空")
|
||||
|
||||
try:
|
||||
logger.info("开始从分析结果中提取结构化数据...")
|
||||
|
||||
# 1. 使用 LLM 提取结构化数据
|
||||
extract_result = await text_analysis_service.extract_structured_data(
|
||||
analysis_text=request.analysis_text,
|
||||
original_filename=request.original_filename or "unknown",
|
||||
file_type=request.file_type or "text"
|
||||
)
|
||||
|
||||
if not extract_result.get("success"):
|
||||
raise HTTPException(
|
||||
status_code=500,
|
||||
detail=f"提取结构化数据失败: {extract_result.get('error', '未知错误')}"
|
||||
)
|
||||
|
||||
logger.info("结构化数据提取成功,开始生成图表...")
|
||||
|
||||
# 2. 根据提取的数据生成图表
|
||||
chart_result = chart_generator_service.generate_charts_from_analysis(extract_result)
|
||||
|
||||
if not chart_result.get("success"):
|
||||
raise HTTPException(
|
||||
status_code=500,
|
||||
detail=f"生成图表失败: {chart_result.get('error', '未知错误')}"
|
||||
)
|
||||
|
||||
logger.info("图表生成成功")
|
||||
|
||||
return chart_result
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"分析结果图表生成失败: {str(e)}")
|
||||
raise HTTPException(
|
||||
status_code=500,
|
||||
detail=f"图表生成失败: {str(e)}"
|
||||
)
|
||||
|
||||
|
||||
@router.post("/analyze-text")
|
||||
async def analyze_text_only(request: AnalysisChartRequest):
|
||||
"""
|
||||
仅提取结构化数据(不生成图表),用于调试
|
||||
|
||||
Args:
|
||||
request: 包含分析文本的请求
|
||||
|
||||
Returns:
|
||||
dict: 提取的结构化数据
|
||||
"""
|
||||
if not request.analysis_text or not request.analysis_text.strip():
|
||||
raise HTTPException(status_code=400, detail="分析文本不能为空")
|
||||
|
||||
try:
|
||||
result = await text_analysis_service.extract_structured_data(
|
||||
analysis_text=request.analysis_text,
|
||||
original_filename=request.original_filename or "unknown",
|
||||
file_type=request.file_type or "text"
|
||||
)
|
||||
return result
|
||||
except Exception as e:
|
||||
logger.error(f"文本分析失败: {str(e)}")
|
||||
raise HTTPException(
|
||||
status_code=500,
|
||||
detail=f"文本分析失败: {str(e)}"
|
||||
)
|
||||
205
backend/app/api/endpoints/upload.py
Normal file
205
backend/app/api/endpoints/upload.py
Normal file
@@ -0,0 +1,205 @@
|
||||
"""
|
||||
文件上传 API 接口
|
||||
"""
|
||||
from fastapi import APIRouter, UploadFile, File, HTTPException, Query
|
||||
from fastapi.responses import StreamingResponse
|
||||
from typing import Optional
|
||||
import logging
|
||||
import pandas as pd
|
||||
import io
|
||||
|
||||
from app.services.file_service import file_service
|
||||
from app.core.document_parser import XlsxParser
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
router = APIRouter(prefix="/upload", tags=["文件上传"])
|
||||
|
||||
# 初始化解析器
|
||||
excel_parser = XlsxParser()
|
||||
|
||||
|
||||
@router.post("/excel")
|
||||
async def upload_excel(
|
||||
file: UploadFile = File(...),
|
||||
parse_all_sheets: bool = Query(False, description="是否解析所有工作表"),
|
||||
sheet_name: Optional[str] = Query(None, description="指定解析的工作表名称"),
|
||||
header_row: int = Query(0, description="表头所在的行索引")
|
||||
):
|
||||
"""
|
||||
上传并解析 Excel 文件
|
||||
|
||||
Args:
|
||||
file: 上传的 Excel 文件
|
||||
parse_all_sheets: 是否解析所有工作表
|
||||
sheet_name: 指定解析的工作表名称
|
||||
header_row: 表头所在的行索引
|
||||
|
||||
Returns:
|
||||
dict: 解析结果
|
||||
"""
|
||||
# 检查文件类型
|
||||
if not file.filename:
|
||||
raise HTTPException(status_code=400, detail="文件名为空")
|
||||
|
||||
file_ext = file.filename.split('.')[-1].lower()
|
||||
if file_ext not in ['xlsx', 'xls']:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"不支持的文件类型: {file_ext},仅支持 .xlsx 和 .xls"
|
||||
)
|
||||
|
||||
try:
|
||||
# 读取文件内容
|
||||
content = await file.read()
|
||||
|
||||
# 保存文件
|
||||
saved_path = file_service.save_uploaded_file(
|
||||
content,
|
||||
file.filename,
|
||||
subfolder="excel"
|
||||
)
|
||||
|
||||
logger.info(f"文件已保存: {saved_path}")
|
||||
|
||||
# 解析文件
|
||||
if parse_all_sheets:
|
||||
result = excel_parser.parse_all_sheets(saved_path)
|
||||
else:
|
||||
# 如果指定了 sheet_name,使用指定的,否则使用默认的第一个
|
||||
if sheet_name:
|
||||
result = excel_parser.parse(saved_path, sheet_name=sheet_name, header_row=header_row)
|
||||
else:
|
||||
result = excel_parser.parse(saved_path, header_row=header_row)
|
||||
|
||||
# 添加文件路径到元数据
|
||||
if result.metadata:
|
||||
result.metadata['saved_path'] = saved_path
|
||||
result.metadata['original_filename'] = file.filename
|
||||
|
||||
return result.to_dict()
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"解析 Excel 文件时出错: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail=f"解析失败: {str(e)}")
|
||||
|
||||
|
||||
@router.get("/excel/preview/{file_path:path}")
|
||||
async def get_excel_preview(
|
||||
file_path: str,
|
||||
sheet_name: Optional[str] = Query(None, description="工作表名称"),
|
||||
max_rows: int = Query(10, description="最多返回的行数", ge=1, le=100)
|
||||
):
|
||||
"""
|
||||
获取 Excel 文件的预览数据
|
||||
|
||||
Args:
|
||||
file_path: 文件路径
|
||||
sheet_name: 工作表名称
|
||||
max_rows: 最多返回的行数
|
||||
|
||||
Returns:
|
||||
dict: 预览数据
|
||||
"""
|
||||
try:
|
||||
# 解析工作表名称参数
|
||||
sheet_param = sheet_name if sheet_name else 0
|
||||
|
||||
result = excel_parser.get_sheet_preview(
|
||||
file_path,
|
||||
sheet_name=sheet_param,
|
||||
max_rows=max_rows
|
||||
)
|
||||
|
||||
return result.to_dict()
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"获取预览数据时出错: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail=f"获取预览失败: {str(e)}")
|
||||
|
||||
|
||||
@router.delete("/file")
|
||||
async def delete_uploaded_file(file_path: str = Query(..., description="要删除的文件路径")):
|
||||
"""
|
||||
删除已上传的文件
|
||||
|
||||
Args:
|
||||
file_path: 文件路径
|
||||
|
||||
Returns:
|
||||
dict: 删除结果
|
||||
"""
|
||||
try:
|
||||
success = file_service.delete_file(file_path)
|
||||
|
||||
if success:
|
||||
return {"success": True, "message": "文件删除成功"}
|
||||
else:
|
||||
return {"success": False, "message": "文件不存在或删除失败"}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"删除文件时出错: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail=f"删除失败: {str(e)}")
|
||||
|
||||
|
||||
@router.get("/excel/export/{file_path:path}")
|
||||
async def export_excel(
|
||||
file_path: str,
|
||||
sheet_name: Optional[str] = Query(None, description="工作表名称"),
|
||||
columns: Optional[str] = Query(None, description="要导出的列,逗号分隔")
|
||||
):
|
||||
"""
|
||||
导出 Excel 文件(可选择工作表和列)
|
||||
|
||||
Args:
|
||||
file_path: 原始文件路径
|
||||
sheet_name: 工作表名称(可选)
|
||||
columns: 要导出的列名,逗号分隔(可选)
|
||||
|
||||
Returns:
|
||||
StreamingResponse: Excel 文件
|
||||
"""
|
||||
try:
|
||||
# 读取 Excel 文件
|
||||
if sheet_name:
|
||||
df = pd.read_excel(file_path, sheet_name=sheet_name)
|
||||
else:
|
||||
df = pd.read_excel(file_path)
|
||||
|
||||
# 如果指定了列,只选择这些列
|
||||
if columns:
|
||||
column_list = [col.strip() for col in columns.split(',')]
|
||||
# 过滤掉不存在的列
|
||||
available_columns = [col for col in column_list if col in df.columns]
|
||||
if available_columns:
|
||||
df = df[available_columns]
|
||||
|
||||
# 创建 Excel 文件
|
||||
output = io.BytesIO()
|
||||
with pd.ExcelWriter(output, engine='openpyxl') as writer:
|
||||
df.to_excel(writer, index=False, sheet_name=sheet_name or 'Sheet1')
|
||||
|
||||
output.seek(0)
|
||||
|
||||
# 生成文件名
|
||||
original_name = file_path.split('/')[-1] if '/' in file_path else file_path
|
||||
if columns:
|
||||
export_name = f"export_{sheet_name or 'data'}_{len(column_list) if columns else 'all'}_cols.xlsx"
|
||||
else:
|
||||
export_name = f"export_{original_name}"
|
||||
|
||||
# 返回文件流
|
||||
return StreamingResponse(
|
||||
io.BytesIO(output.getvalue()),
|
||||
media_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||
headers={"Content-Disposition": f"attachment; filename={export_name}"}
|
||||
)
|
||||
|
||||
except FileNotFoundError:
|
||||
logger.error(f"文件不存在: {file_path}")
|
||||
raise HTTPException(status_code=404, detail="文件不存在")
|
||||
except Exception as e:
|
||||
logger.error(f"导出 Excel 文件时出错: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail=f"导出失败: {str(e)}")
|
||||
90
backend/app/api/endpoints/visualization.py
Normal file
90
backend/app/api/endpoints/visualization.py
Normal file
@@ -0,0 +1,90 @@
|
||||
"""
|
||||
可视化 API 接口 - 生成统计图表
|
||||
"""
|
||||
from fastapi import APIRouter, HTTPException, Body
|
||||
from typing import Dict, Any
|
||||
import logging
|
||||
|
||||
from app.services.visualization_service import visualization_service
|
||||
from pydantic import BaseModel
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
router = APIRouter(prefix="/visualization", tags=["数据可视化"])
|
||||
|
||||
|
||||
class StatisticsRequest(BaseModel):
    """Request body for statistical chart generation."""

    # Parsed Excel payload to analyze.
    excel_data: Dict[str, Any]
    # Analysis mode forwarded to the visualization service.
    analysis_type: str = "statistics"
|
||||
|
||||
|
||||
@router.post("/statistics")
|
||||
async def generate_statistics(request: StatisticsRequest):
|
||||
"""
|
||||
生成统计信息和可视化图表
|
||||
|
||||
Args:
|
||||
request: 包含 excel_data 和 analysis_type 的请求体
|
||||
|
||||
Returns:
|
||||
dict: 包含统计信息和图表数据的结果
|
||||
"""
|
||||
excel_data = request.excel_data
|
||||
analysis_type = request.analysis_type
|
||||
|
||||
if not excel_data:
|
||||
raise HTTPException(status_code=400, detail="未提供 Excel 数据")
|
||||
|
||||
try:
|
||||
result = visualization_service.analyze_and_visualize(
|
||||
excel_data,
|
||||
analysis_type
|
||||
)
|
||||
|
||||
if not result.get("success"):
|
||||
raise HTTPException(status_code=500, detail=result.get("error", "分析失败"))
|
||||
|
||||
logger.info("统计图表生成成功")
|
||||
|
||||
return result
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"统计图表生成失败: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail=f"图表生成失败: {str(e)}")
|
||||
|
||||
|
||||
@router.get("/chart-types")
|
||||
async def get_chart_types():
|
||||
"""
|
||||
获取支持的图表类型
|
||||
|
||||
Returns:
|
||||
dict: 支持的图表类型列表
|
||||
"""
|
||||
return {
|
||||
"chart_types": [
|
||||
{
|
||||
"value": "histogram",
|
||||
"label": "直方图",
|
||||
"description": "显示数值型列的分布情况"
|
||||
},
|
||||
{
|
||||
"value": "bar_chart",
|
||||
"label": "条形图",
|
||||
"description": "显示分类列的频次分布"
|
||||
},
|
||||
{
|
||||
"value": "box_plot",
|
||||
"label": "箱线图",
|
||||
"description": "显示数值列的四分位数和异常值"
|
||||
},
|
||||
{
|
||||
"value": "correlation_heatmap",
|
||||
"label": "相关性热力图",
|
||||
"description": "显示数值列之间的相关性"
|
||||
}
|
||||
]
|
||||
}
|
||||
7
backend/app/core/document_parser/__init__.py
Normal file
7
backend/app/core/document_parser/__init__.py
Normal file
@@ -0,0 +1,7 @@
|
||||
"""
|
||||
文档解析模块 - 支持多种文件格式的解析
|
||||
"""
|
||||
from .base import BaseParser
|
||||
from .xlsx_parser import XlsxParser
|
||||
|
||||
__all__ = ['BaseParser', 'XlsxParser']
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
87
backend/app/core/document_parser/base.py
Normal file
87
backend/app/core/document_parser/base.py
Normal file
@@ -0,0 +1,87 @@
|
||||
"""
|
||||
解析器基类 - 定义所有解析器的通用接口
|
||||
"""
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any, Dict, List, Optional
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
class ParseResult:
    """Outcome of a document parse: payload, error text, and metadata."""

    def __init__(
        self,
        success: bool,
        data: Optional[Dict[str, Any]] = None,
        error: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None
    ):
        # Missing data/metadata default to fresh empty dicts so callers can
        # always treat them as mappings.
        self.success = success
        self.data = {} if data is None else data
        self.error = error
        self.metadata = {} if metadata is None else metadata

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the result into a plain dictionary."""
        return {
            "success": self.success,
            "data": self.data,
            "error": self.error,
            "metadata": self.metadata,
        }
|
||||
|
||||
|
||||
class BaseParser(ABC):
    """Abstract base class for all document parsers."""

    def __init__(self):
        # Subclasses populate the extensions they accept (e.g. ['.xlsx']).
        self.supported_extensions: List[str] = []
        # Human-readable parser identifier, reported in file info.
        self.parser_name: str = "base_parser"

    @abstractmethod
    def parse(self, file_path: str, **kwargs) -> ParseResult:
        """
        Parse the file at ``file_path``.

        Args:
            file_path: path of the file to parse.
            **kwargs: parser-specific options.

        Returns:
            ParseResult: the parse outcome.
        """
        pass

    def can_parse(self, file_path: str) -> bool:
        """
        Report whether this parser accepts the file's extension.

        Args:
            file_path: path of the candidate file.

        Returns:
            bool: True when the suffix is in ``supported_extensions``.
        """
        return Path(file_path).suffix.lower() in self.supported_extensions

    def get_file_info(self, file_path: str) -> Dict[str, Any]:
        """
        Return basic information about a file on disk.

        Args:
            file_path: path of the file.

        Returns:
            Dict[str, Any]: name/extension/size/parser, or an error entry
            when the file does not exist.
        """
        path = Path(file_path)
        if not path.exists():
            return {"error": "File not found"}
        return {
            "filename": path.name,
            "extension": path.suffix.lower(),
            "size": path.stat().st_size,
            "parser": self.parser_name,
        }
|
||||
120
backend/app/core/document_parser/utils.py
Normal file
120
backend/app/core/document_parser/utils.py
Normal file
@@ -0,0 +1,120 @@
|
||||
"""
|
||||
文档解析工具函数
|
||||
"""
|
||||
import re
|
||||
from typing import List, Optional, Dict, Any
|
||||
|
||||
|
||||
def clean_text(text: str) -> str:
    """
    Clean a text string: trim it, collapse runs of whitespace, and drop
    non-printable characters (newlines/CR/tab are kept by the filter but
    have already been collapsed to spaces).

    Args:
        text: the raw text (falsy input yields "").

    Returns:
        str: the cleaned text.
    """
    if not text:
        return ""

    # Trim, then collapse any whitespace run into a single space.
    collapsed = re.sub(r'\s+', ' ', text.strip())

    # Keep printable characters plus the whitelisted control characters.
    return ''.join(ch for ch in collapsed if ch.isprintable() or ch in '\n\r\t')
|
||||
|
||||
|
||||
def chunk_text(
    text: str,
    chunk_size: int = 1000,
    overlap: int = 100
) -> List[str]:
    """
    Split text into overlapping chunks.

    Args:
        text: the source text (falsy input yields []).
        chunk_size: size of each chunk in characters.
        overlap: number of characters shared between consecutive chunks.

    Returns:
        List[str]: the chunks, in order.
    """
    if not text:
        return []

    # Fix: the original advanced by ``end - overlap``; with
    # overlap >= chunk_size the start never moved and the loop ran forever.
    # Clamp the advance to at least one character. For the normal case
    # (overlap < chunk_size) the behavior is unchanged.
    step = max(chunk_size - overlap, 1)

    chunks: List[str] = []
    start = 0
    text_length = len(text)
    while start < text_length:
        chunks.append(text[start:start + chunk_size])
        start += step

    return chunks
|
||||
|
||||
|
||||
def normalize_string(s: Any) -> str:
    """
    Normalize an arbitrary value into a string.

    None becomes "", numbers are stringified, strings are cleaned via
    :func:`clean_text`, and anything else falls back to ``str()``.

    Args:
        s: the input value.

    Returns:
        str: the normalized string.
    """
    if s is None:
        return ""
    if isinstance(s, str):
        return clean_text(s)
    # Numbers and everything else share the same str() fallback.
    return str(s)
|
||||
|
||||
|
||||
def detect_encoding(file_path: str) -> Optional[str]:
    """
    Detect a file's character encoding (best-effort, via chardet).

    Args:
        file_path: path of the file to inspect.

    Returns:
        Optional[str]: the detected encoding, or None when detection fails.
    """
    import chardet

    try:
        with open(file_path, 'rb') as fh:
            # Sample only the first 10000 bytes; enough for detection.
            sample = fh.read(10000)
        return chardet.detect(sample).get('encoding')
    except Exception:
        # Best-effort helper: any I/O or detection failure yields None.
        return None
|
||||
|
||||
|
||||
def safe_get(d: Dict[str, Any], key: str, default: Any = None) -> Any:
    """
    Fetch a dictionary value without raising.

    Args:
        d: the mapping to read (anything without ``.get`` yields ``default``).
        key: the key to look up.
        default: value returned on a missing key or any failure.

    Returns:
        Any: the stored value, or ``default``.
    """
    try:
        return d.get(key, default)
    except Exception:
        # Covers non-mapping inputs such as None.
        return default
|
||||
288
backend/app/core/document_parser/xlsx_parser.py
Normal file
288
backend/app/core/document_parser/xlsx_parser.py
Normal file
@@ -0,0 +1,288 @@
|
||||
"""
|
||||
Excel 文件解析器 - 解析 .xlsx 和 .xls 文件
|
||||
"""
|
||||
from typing import Any, Dict, List, Optional
|
||||
from pathlib import Path
|
||||
import pandas as pd
|
||||
import logging
|
||||
|
||||
from .base import BaseParser, ParseResult
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class XlsxParser(BaseParser):
    """Parser for Excel workbooks (.xlsx / .xls)."""

    def __init__(self):
        super().__init__()
        self.supported_extensions = ['.xlsx', '.xls']
        self.parser_name = "excel_parser"

    def _reject_invalid(self, file_path: str, path: Path) -> Optional[ParseResult]:
        """Return a failure ParseResult for an unusable path, else None."""
        if not path.exists():
            return ParseResult(
                success=False,
                error=f"File not found: {file_path}"
            )
        if path.suffix.lower() not in self.supported_extensions:
            return ParseResult(
                success=False,
                error=f"Unsupported file type: {path.suffix}"
            )
        if path.stat().st_size == 0:
            return ParseResult(
                success=False,
                error=f"File is empty: {file_path}"
            )
        return None

    def _sheet_metadata(
        self,
        path: Path,
        sheet_names: List[str],
        current_sheet: str,
        df: pd.DataFrame
    ) -> Dict[str, Any]:
        """Build the metadata dictionary for a single-sheet parse."""
        return {
            "filename": path.name,
            "extension": path.suffix.lower(),
            "sheet_count": len(sheet_names),
            "sheet_names": sheet_names,
            "current_sheet": current_sheet,
            "row_count": len(df),
            "column_count": len(df.columns) if not df.empty else 0,
            "columns": df.columns.tolist() if not df.empty else [],
            "file_size": path.stat().st_size
        }

    def parse(
        self,
        file_path: str,
        sheet_name: Optional[str | int] = 0,
        header_row: int = 0,
        **kwargs
    ) -> ParseResult:
        """
        Parse one worksheet of an Excel file.

        Args:
            file_path: path of the workbook.
            sheet_name: sheet name or index; invalid selections fall back
                to the first sheet.
            header_row: row index holding the column headers.
            **kwargs: forwarded to ``pandas.read_excel``.

        Returns:
            ParseResult: the parse outcome.
        """
        path = Path(file_path)
        rejection = self._reject_invalid(file_path, path)
        if rejection is not None:
            return rejection

        try:
            sheet_names = pd.ExcelFile(file_path).sheet_names
            if not sheet_names:
                return ParseResult(
                    success=False,
                    error=f"Excel 文件没有找到任何工作表: {file_path}"
                )

            # Resolve the requested sheet; unknown names or out-of-range
            # indices (and None) all fall back to the first sheet.
            target_sheet = sheet_names[0]
            if isinstance(sheet_name, int) and sheet_name < len(sheet_names):
                target_sheet = sheet_names[sheet_name]
            elif isinstance(sheet_name, str) and sheet_name in sheet_names:
                target_sheet = sheet_name

            df = pd.read_excel(
                file_path,
                sheet_name=target_sheet,
                header=header_row,
                **kwargs
            )
            if df.empty:
                return ParseResult(
                    success=False,
                    error=f"工作表 '{target_sheet}' 为空,请检查 Excel 文件内容"
                )

            return ParseResult(
                success=True,
                data=self._df_to_dict(df),
                metadata=self._sheet_metadata(path, sheet_names, target_sheet, df)
            )

        except IndexError as e:
            logger.error(f"工作表索引错误: {str(e)}")
            # Index out of range: retry once against the first sheet.
            return self._retry_first_sheet(file_path, path, header_row, str(e), **kwargs)
        except Exception as e:
            logger.error(f"解析 Excel 文件时出错: {str(e)}")
            return ParseResult(
                success=False,
                error=f"Failed to parse Excel file: {str(e)}"
            )

    def _retry_first_sheet(
        self,
        file_path: str,
        path: Path,
        header_row: int,
        original_error: str,
        **kwargs
    ) -> ParseResult:
        """Fallback used after an IndexError: parse the first sheet."""
        try:
            sheet_names = pd.ExcelFile(file_path).sheet_names
            if not sheet_names:
                return ParseResult(
                    success=False,
                    error="Excel 文件没有有效的工作表"
                )

            df = pd.read_excel(
                file_path,
                sheet_name=sheet_names[0],
                header=header_row,
                **kwargs
            )
            return ParseResult(
                success=True,
                data=self._df_to_dict(df),
                metadata=self._sheet_metadata(path, sheet_names, sheet_names[0], df)
            )
        except Exception as e2:
            logger.error(f"重试解析失败: {str(e2)}")
            # Report the ORIGINAL error, as the first failure is the root cause.
            return ParseResult(
                success=False,
                error=f"无法解析 Excel 文件: {original_error}"
            )

    def parse_all_sheets(self, file_path: str, **kwargs) -> ParseResult:
        """
        Parse every worksheet of an Excel file.

        Args:
            file_path: path of the workbook.
            **kwargs: forwarded to ``pandas.read_excel``.

        Returns:
            ParseResult: outcome with data {"sheets": {name: sheet_dict}}.
        """
        path = Path(file_path)
        rejection = self._reject_invalid(file_path, path)
        if rejection is not None:
            return rejection

        try:
            # sheet_name=None makes pandas return {sheet_name: DataFrame}.
            all_data = pd.read_excel(file_path, sheet_name=None, **kwargs)
            if not all_data:
                return ParseResult(
                    success=False,
                    error=f"无法读取 Excel 文件或文件为空: {file_path}"
                )

            sheets_data = {
                name: self._df_to_dict(df) for name, df in all_data.items()
            }
            all_sheets = list(all_data.keys())

            metadata = {
                "filename": path.name,
                "extension": path.suffix.lower(),
                "sheet_count": len(all_sheets),
                "sheet_names": all_sheets,
                "total_rows": sum(len(df) for df in all_data.values()),
                "file_size": path.stat().st_size
            }

            return ParseResult(
                success=True,
                data={"sheets": sheets_data},
                metadata=metadata
            )

        except Exception as e:
            logger.error(f"Failed to parse Excel file: {str(e)}")
            return ParseResult(
                success=False,
                error=f"Failed to parse Excel file: {str(e)}"
            )

    def _get_sheet_names(self, file_path: str) -> List[str]:
        """Return every worksheet name in the workbook, or [] on failure."""
        try:
            return pd.ExcelFile(file_path).sheet_names or []
        except Exception as e:
            logger.error(f"获取工作表名称失败: {str(e)}")
            return []

    def _df_to_dict(self, df: pd.DataFrame) -> Dict[str, Any]:
        """
        Convert a DataFrame into a JSON-serializable dictionary, mapping
        NaN/NA cells to None.

        Args:
            df: the frame to convert.

        Returns:
            Dict[str, Any]: columns, row records, and counts.
        """
        cleaned = df.replace({pd.NA: None, float('nan'): None})
        rows = cleaned.to_dict(orient='records')
        return {
            "columns": cleaned.columns.tolist(),
            "rows": rows,
            "row_count": len(rows),
            "column_count": len(cleaned.columns) if not cleaned.empty else 0
        }
|
||||
@@ -1,18 +1,61 @@
|
||||
"""
|
||||
FastAPI 应用主入口
|
||||
"""
|
||||
from fastapi import FastAPI
|
||||
from config import settings
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from app.config import settings
|
||||
from app.api import api_router
|
||||
|
||||
# 创建 FastAPI 应用实例
|
||||
app = FastAPI(
|
||||
title=settings.APP_NAME,
|
||||
openapi_url=f"{settings.API_V1_STR}/openapi.json"
|
||||
description="基于大语言模型的文档理解与多源数据融合系统",
|
||||
version="1.0.0",
|
||||
openapi_url=f"{settings.API_V1_STR}/openapi.json",
|
||||
docs_url=f"{settings.API_V1_STR}/docs",
|
||||
redoc_url=f"{settings.API_V1_STR}/redoc"
|
||||
)
|
||||
|
||||
# 配置 CORS 中间件
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=["*"],
|
||||
allow_credentials=True,
|
||||
allow_methods=["*"],
|
||||
allow_headers=["*"],
|
||||
)
|
||||
|
||||
# 注册 API 路由
|
||||
app.include_router(api_router, prefix=settings.API_V1_STR)
|
||||
|
||||
|
||||
@app.get("/")
|
||||
async def root():
|
||||
"""根路径"""
|
||||
return {
|
||||
"message": f"Welcome to {settings.APP_NAME}",
|
||||
"status": "online",
|
||||
"debug_mode": settings.DEBUG
|
||||
"version": "1.0.0",
|
||||
"debug_mode": settings.DEBUG,
|
||||
"api_docs": f"{settings.API_V1_STR}/docs"
|
||||
}
|
||||
|
||||
|
||||
@app.get("/health")
|
||||
async def health_check():
|
||||
"""健康检查接口"""
|
||||
return {
|
||||
"status": "healthy",
|
||||
"service": settings.APP_NAME
|
||||
}
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import uvicorn
|
||||
uvicorn.run("main:app", host="127.0.0.1", port=8000, reload=True)
|
||||
|
||||
uvicorn.run(
|
||||
"app.main:app",
|
||||
host="127.0.0.1",
|
||||
port=8000,
|
||||
reload=settings.DEBUG
|
||||
)
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
backend/app/services/__pycache__/file_service.cpython-312.pyc
Normal file
BIN
backend/app/services/__pycache__/file_service.cpython-312.pyc
Normal file
Binary file not shown.
BIN
backend/app/services/__pycache__/font_helper.cpython-312.pyc
Normal file
BIN
backend/app/services/__pycache__/font_helper.cpython-312.pyc
Normal file
Binary file not shown.
BIN
backend/app/services/__pycache__/font_helper.cpython-313.pyc
Normal file
BIN
backend/app/services/__pycache__/font_helper.cpython-313.pyc
Normal file
Binary file not shown.
BIN
backend/app/services/__pycache__/llm_service.cpython-312.pyc
Normal file
BIN
backend/app/services/__pycache__/llm_service.cpython-312.pyc
Normal file
Binary file not shown.
BIN
backend/app/services/__pycache__/llm_service.cpython-313.pyc
Normal file
BIN
backend/app/services/__pycache__/llm_service.cpython-313.pyc
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
349
backend/app/services/chart_generator_service.py
Normal file
349
backend/app/services/chart_generator_service.py
Normal file
@@ -0,0 +1,349 @@
|
||||
"""
|
||||
图表生成服务 - 根据结构化数据生成图表
|
||||
"""
|
||||
import io
|
||||
import base64
|
||||
import logging
|
||||
from typing import Dict, Any, List, Optional
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
import matplotlib
|
||||
import numpy as np
|
||||
|
||||
# 使用字体辅助模块配置中文字体
|
||||
from app.services.font_helper import configure_matplotlib_fonts
|
||||
|
||||
configure_matplotlib_fonts()
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ChartGeneratorService:
    """Chart generation service.

    Renders matplotlib charts (as base64-encoded PNG data URIs) from the
    structured data extracted out of an AI analysis result.
    """

    def __init__(self):
        # Output directory <backend>/data/charts, created on demand.
        self.output_dir = Path(__file__).resolve().parent.parent.parent / "data" / "charts"
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def generate_charts_from_analysis(
        self,
        structured_data: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Generate charts from AI-extracted structured data.

        Args:
            structured_data: structured data extracted from the AI analysis
                result; expected shape: {"success": bool, "data": {...}}.

        Returns:
            Dict[str, Any]: {"success", "charts", "statistics", "metadata",
            "data_source"} on success, or {"success": False, "error"} on failure.
        """
        if not structured_data.get("success"):
            return {
                "success": False,
                "error": structured_data.get("error", "数据提取失败")
            }

        data = structured_data.get("data", {})
        charts = {}
        statistics = {}

        try:
            # 1. Numeric data charts (bar + optional pie) and summary statistics
            numeric_data = data.get("numeric_data", [])
            if numeric_data:
                charts["numeric_charts"] = self._create_numeric_charts(numeric_data)
                statistics["numeric_summary"] = self._create_numeric_summary(numeric_data)

            # 2. Categorical data charts
            categorical_data = data.get("categorical_data", [])
            if categorical_data:
                charts["categorical_charts"] = self._create_categorical_charts(categorical_data)

            # 3. Time-series chart
            time_series_data = data.get("time_series_data", [])
            if time_series_data:
                charts["time_series_chart"] = self._create_time_series_chart(time_series_data)

            # 4. Comparison chart
            comparison_data = data.get("comparison_data", [])
            if comparison_data:
                charts["comparison_chart"] = self._create_comparison_chart(comparison_data)

            # 5. Table preview (truncated to 50 rows)
            table_data = data.get("table_data")
            if table_data:
                charts["table_preview"] = self._create_table_preview(table_data)

            metadata = data.get("metadata", {})

            return {
                "success": True,
                "charts": charts,
                "statistics": statistics,
                "metadata": metadata,
                "data_source": "ai_analysis"
            }

        except Exception as e:
            logger.error(f"生成图表失败: {str(e)}", exc_info=True)
            return {
                "success": False,
                "error": str(e)
            }

    def _create_numeric_charts(self, numeric_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Create numeric-data charts: a bar chart, plus a pie chart when <= 10 items."""
        charts = []

        # Extract labels and values; items without a name get a positional default.
        names = [item.get("name", f"项{i}") for i, item in enumerate(numeric_data)]
        values = [item.get("value", 0) for item in numeric_data]

        if not values:
            return charts

        # 1. Bar chart
        try:
            fig, ax = plt.subplots(figsize=(12, 7))
            colors = plt.cm.Set3(np.linspace(0, 1, len(values)))
            bars = ax.bar(names, values, color=colors, alpha=0.8, edgecolor='black', linewidth=0.5)

            # Annotate each bar with its value.
            for bar, value in zip(bars, values):
                height = bar.get_height()
                ax.text(bar.get_x() + bar.get_width() / 2., height,
                        f'{value:,.0f}',
                        ha='center', va='bottom', fontsize=9, fontweight='bold')

            ax.set_xlabel('项目', fontsize=10, labelpad=10, fontweight='bold')
            ax.set_ylabel('数值', fontsize=10, labelpad=10, fontweight='bold')
            ax.set_title('数值型数据对比', fontsize=12, fontweight='bold', pad=15)
            # Fix: pin tick locations before setting labels; calling
            # set_xticklabels alone triggers matplotlib's
            # "FixedFormatter without FixedLocator" warning/misalignment.
            ax.set_xticks(range(len(names)))
            ax.set_xticklabels(names, rotation=30, ha='right', fontsize=9)
            ax.tick_params(axis='both', which='major', labelsize=9)
            ax.grid(axis='y', alpha=0.3)
            plt.tight_layout(pad=1.5)

            img_base64 = self._figure_to_base64(fig)
            charts.append({
                "type": "bar",
                "title": "数值型数据对比",
                "image": img_base64,
                "data": [{"name": n, "value": v} for n, v in zip(names, values)]
            })
        except Exception as e:
            logger.error(f"创建柱状图失败: {str(e)}")

        # 2. Pie chart — only sensible for a small number of slices.
        if len(values) > 0 and len(values) <= 10:
            try:
                fig, ax = plt.subplots(figsize=(10, 10))
                wedges, texts, autotexts = ax.pie(values, labels=names, autopct='%1.1f%%',
                                                  startangle=90, colors=plt.cm.Set3.colors[:len(values)])

                for autotext in autotexts:
                    autotext.set_color('white')
                    autotext.set_fontsize(9)
                    autotext.set_fontweight('bold')

                ax.set_title('数值型数据占比', fontsize=12, fontweight='bold', pad=15)

                img_base64 = self._figure_to_base64(fig)
                charts.append({
                    "type": "pie",
                    "title": "数值型数据占比",
                    "image": img_base64,
                    "data": [{"name": n, "value": v} for n, v in zip(names, values)]
                })
            except Exception as e:
                logger.error(f"创建饼图失败: {str(e)}")

        return charts

    def _create_categorical_charts(self, categorical_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Create a horizontal bar chart for categorical counts."""
        charts = []

        names = [item.get("name", f"类{i}") for i, item in enumerate(categorical_data)]
        counts = [item.get("count", 1) for item in categorical_data]

        if not names or not counts:
            return charts

        try:
            # Height scales with the number of categories so labels stay readable.
            fig, ax = plt.subplots(figsize=(10, max(6, len(names) * 0.8)))
            y_pos = np.arange(len(names))

            bars = ax.barh(y_pos, counts, align='center', color='#10b981', alpha=0.8, edgecolor='black', linewidth=0.5)

            # Annotate each bar with its count.
            for bar, count in zip(bars, counts):
                width = bar.get_width()
                ax.text(width, bar.get_y() + bar.get_height() / 2.,
                        f'{count}',
                        ha='left', va='center', fontsize=10, fontweight='bold')

            ax.set_yticks(y_pos)
            ax.set_yticklabels(names, fontsize=10)
            ax.invert_yaxis()  # largest category at the top
            ax.set_xlabel('数量', fontsize=10, labelpad=10, fontweight='bold')
            ax.set_title('分类数据分布', fontsize=12, fontweight='bold', pad=15)
            ax.tick_params(axis='both', which='major', labelsize=9)
            ax.grid(axis='x', alpha=0.3)
            plt.tight_layout(pad=1.5)

            img_base64 = self._figure_to_base64(fig)
            charts.append({
                "type": "barh",
                "title": "分类数据分布",
                "image": img_base64,
                "data": [{"name": n, "count": c} for n, c in zip(names, counts)]
            })
        except Exception as e:
            logger.error(f"创建分类图表失败: {str(e)}")

        return charts

    def _create_time_series_chart(self, time_series_data: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
        """Create a combined bar + line chart for time-series data; None if < 2 points."""
        if not time_series_data:
            return None

        try:
            names = [item.get("name", f"时间{i}") for i, item in enumerate(time_series_data)]
            values = [item.get("value", 0) for item in time_series_data]

            # A trend needs at least two points.
            if len(values) < 2:
                return None

            fig, ax = plt.subplots(figsize=(14, 7))

            # Bars for the raw values, overlaid with a trend line.
            x_pos = np.arange(len(names))
            bars = ax.bar(x_pos, values, width=0.4, label='数值', color='#3b82f6', alpha=0.7)

            line = ax.plot(x_pos, values, 'o-', color='#ef4444', linewidth=2.5, markersize=8, label='趋势')

            ax.set_xticks(x_pos)
            ax.set_xticklabels(names, rotation=30, ha='right', fontsize=9)
            ax.set_ylabel('数值', fontsize=10, labelpad=10, fontweight='bold')
            ax.set_title('时间序列数据', fontsize=12, fontweight='bold', pad=15)
            ax.legend(loc='best', fontsize=9)
            ax.tick_params(axis='both', which='major', labelsize=9)
            ax.grid(True, alpha=0.3)
            plt.tight_layout(pad=1.5)

            img_base64 = self._figure_to_base64(fig)
            return {
                "type": "time_series",
                "title": "时间序列数据",
                "image": img_base64,
                "data": [{"name": n, "value": v} for n, v in zip(names, values)]
            }
        except Exception as e:
            logger.error(f"创建时间序列图表失败: {str(e)}")
            return None

    def _create_comparison_chart(self, comparison_data: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
        """Create a signed bar chart (green positive / red negative) for comparison data."""
        if not comparison_data:
            return None

        try:
            names = [item.get("name", f"对比{i}") for i, item in enumerate(comparison_data)]
            values = [item.get("value", 0) for item in comparison_data]

            fig, ax = plt.subplots(figsize=(10, 7))

            # Color encodes the sign of each value.
            colors = ['#10b981' if v >= 0 else '#ef4444' for v in values]
            bars = ax.bar(names, values, color=colors, alpha=0.8, edgecolor='black', linewidth=0.8)

            # Annotate each bar; negative bars get the label below the baseline.
            for bar, value in zip(bars, values):
                height = bar.get_height()
                ax.text(bar.get_x() + bar.get_width() / 2., height,
                        f'{value:,.1f}',
                        ha='center', va='bottom' if value >= 0 else 'top',
                        fontsize=10, fontweight='bold')

            # Zero reference line.
            ax.axhline(y=0, color='black', linestyle='-', linewidth=1)

            ax.set_ylabel('值', fontsize=10, labelpad=10, fontweight='bold')
            ax.set_title('对比数据', fontsize=12, fontweight='bold', pad=15)
            # Fix: pin tick locations before setting labels (see _create_numeric_charts).
            ax.set_xticks(range(len(names)))
            ax.set_xticklabels(names, rotation=30, ha='right', fontsize=9)
            ax.tick_params(axis='both', which='major', labelsize=9)
            ax.grid(axis='y', alpha=0.3)
            plt.tight_layout(pad=1.5)

            img_base64 = self._figure_to_base64(fig)
            return {
                "type": "comparison",
                "title": "对比数据",
                "image": img_base64,
                "data": [{"name": n, "value": v} for n, v in zip(names, values)]
            }
        except Exception as e:
            logger.error(f"创建对比图表失败: {str(e)}")
            return None

    def _create_table_preview(self, table_data: Dict[str, Any]) -> Dict[str, Any]:
        """Build a preview payload for table data, truncated to the first 50 rows."""
        if not table_data:
            return {}

        columns = table_data.get("columns", [])
        rows = table_data.get("rows", [])

        return {
            "columns": columns,
            "rows": rows[:50],  # cap the payload size sent to the client
            "total_rows": len(rows),
            "preview_rows": min(50, len(rows))
        }

    def _create_numeric_summary(self, numeric_data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Compute descriptive statistics over the numeric entries (non-numbers skipped)."""
        values = [item.get("value", 0) for item in numeric_data if isinstance(item.get("value"), (int, float))]

        if not values:
            return {}

        return {
            "count": len(values),
            "sum": float(sum(values)),
            "mean": float(np.mean(values)),
            "median": float(np.median(values)),
            "min": float(min(values)),
            "max": float(max(values)),
            # std of a single value is meaningless; report 0 instead
            "std": float(np.std(values)) if len(values) > 1 else 0
        }

    def _figure_to_base64(self, fig) -> str:
        """Serialize a matplotlib figure to a PNG data URI and close the figure."""
        buf = io.BytesIO()
        fig.savefig(
            buf,
            format='png',
            dpi=120,
            bbox_inches='tight',
            pad_inches=0.3,
            facecolor='white',
            edgecolor='none',
            transparent=False
        )
        plt.close(fig)  # release figure memory — these run server-side in bulk
        buf.seek(0)
        img_base64 = base64.b64encode(buf.read()).decode('utf-8')
        return f"data:image/png;base64,{img_base64}"


# Module-level singleton
chart_generator_service = ChartGeneratorService()
|
||||
253
backend/app/services/excel_ai_service.py
Normal file
253
backend/app/services/excel_ai_service.py
Normal file
@@ -0,0 +1,253 @@
|
||||
"""
|
||||
Excel AI 分析服务 - 集成 Excel 解析和 LLM 分析
|
||||
"""
|
||||
import logging
|
||||
from typing import Dict, Any, Optional, List
|
||||
|
||||
from app.core.document_parser import XlsxParser
|
||||
from app.services.file_service import file_service
|
||||
from app.services.llm_service import llm_service
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ExcelAIService:
|
||||
"""Excel AI 分析服务"""
|
||||
|
||||
def __init__(self):
|
||||
self.parser = XlsxParser()
|
||||
self.file_service = file_service
|
||||
self.llm_service = llm_service
|
||||
|
||||
async def analyze_excel_file(
|
||||
self,
|
||||
file_content: bytes,
|
||||
filename: str,
|
||||
user_prompt: str = "",
|
||||
analysis_type: str = "general",
|
||||
parse_options: Optional[Dict[str, Any]] = None
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
分析 Excel 文件
|
||||
|
||||
Args:
|
||||
file_content: 文件内容字节
|
||||
filename: 文件名
|
||||
user_prompt: 用户自定义提示词
|
||||
analysis_type: 分析类型
|
||||
parse_options: 解析选项
|
||||
|
||||
Returns:
|
||||
Dict[str, Any]: 分析结果
|
||||
"""
|
||||
# 1. 保存文件
|
||||
try:
|
||||
saved_path = self.file_service.save_uploaded_file(
|
||||
file_content,
|
||||
filename,
|
||||
subfolder="excel"
|
||||
)
|
||||
logger.info(f"文件已保存: {saved_path}")
|
||||
except Exception as e:
|
||||
logger.error(f"文件保存失败: {str(e)}")
|
||||
return {
|
||||
"success": False,
|
||||
"error": f"文件保存失败: {str(e)}",
|
||||
"analysis": None
|
||||
}
|
||||
|
||||
# 2. 解析 Excel 文件
|
||||
try:
|
||||
parse_options = parse_options or {}
|
||||
parse_result = self.parser.parse(saved_path, **parse_options)
|
||||
|
||||
if not parse_result.success:
|
||||
return {
|
||||
"success": False,
|
||||
"error": parse_result.error,
|
||||
"analysis": None
|
||||
}
|
||||
|
||||
excel_data = parse_result.data
|
||||
logger.info(f"Excel 解析成功: {parse_result.metadata}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Excel 解析失败: {str(e)}")
|
||||
return {
|
||||
"success": False,
|
||||
"error": f"Excel 解析失败: {str(e)}",
|
||||
"analysis": None
|
||||
}
|
||||
|
||||
# 3. 调用 LLM 进行分析
|
||||
try:
|
||||
# 如果有自定义提示词,使用模板分析
|
||||
if user_prompt and user_prompt.strip():
|
||||
llm_result = await self.llm_service.analyze_with_template(
|
||||
excel_data,
|
||||
user_prompt
|
||||
)
|
||||
else:
|
||||
# 否则使用标准分析
|
||||
llm_result = await self.llm_service.analyze_excel_data(
|
||||
excel_data,
|
||||
user_prompt,
|
||||
analysis_type
|
||||
)
|
||||
|
||||
logger.info(f"AI 分析完成: {llm_result['success']}")
|
||||
|
||||
# 4. 组合结果
|
||||
return {
|
||||
"success": True,
|
||||
"excel": {
|
||||
"data": excel_data,
|
||||
"metadata": parse_result.metadata,
|
||||
"saved_path": saved_path
|
||||
},
|
||||
"analysis": llm_result
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"AI 分析失败: {str(e)}")
|
||||
return {
|
||||
"success": False,
|
||||
"error": f"AI 分析失败: {str(e)}",
|
||||
"excel": {
|
||||
"data": excel_data,
|
||||
"metadata": parse_result.metadata
|
||||
},
|
||||
"analysis": None
|
||||
}
|
||||
|
||||
async def batch_analyze_sheets(
|
||||
self,
|
||||
file_content: bytes,
|
||||
filename: str,
|
||||
user_prompt: str = "",
|
||||
analysis_type: str = "general"
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
批量分析 Excel 文件的所有工作表
|
||||
|
||||
Args:
|
||||
file_content: 文件内容字节
|
||||
filename: 文件名
|
||||
user_prompt: 用户自定义提示词
|
||||
analysis_type: 分析类型
|
||||
|
||||
Returns:
|
||||
Dict[str, Any]: 分析结果
|
||||
"""
|
||||
# 1. 保存文件
|
||||
try:
|
||||
saved_path = self.file_service.save_uploaded_file(
|
||||
file_content,
|
||||
filename,
|
||||
subfolder="excel"
|
||||
)
|
||||
logger.info(f"文件已保存: {saved_path}")
|
||||
except Exception as e:
|
||||
logger.error(f"文件保存失败: {str(e)}")
|
||||
return {
|
||||
"success": False,
|
||||
"error": f"文件保存失败: {str(e)}",
|
||||
"analysis": None
|
||||
}
|
||||
|
||||
# 2. 解析所有工作表
|
||||
try:
|
||||
parse_result = self.parser.parse_all_sheets(saved_path)
|
||||
|
||||
if not parse_result.success:
|
||||
return {
|
||||
"success": False,
|
||||
"error": parse_result.error,
|
||||
"analysis": None
|
||||
}
|
||||
|
||||
sheets_data = parse_result.data.get("sheets", {})
|
||||
logger.info(f"Excel 解析成功,共 {len(sheets_data)} 个工作表")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Excel 解析失败: {str(e)}")
|
||||
return {
|
||||
"success": False,
|
||||
"error": f"Excel 解析失败: {str(e)}",
|
||||
"analysis": None
|
||||
}
|
||||
|
||||
# 3. 批量分析每个工作表
|
||||
sheet_analyses = {}
|
||||
errors = {}
|
||||
|
||||
for sheet_name, sheet_data in sheets_data.items():
|
||||
try:
|
||||
# 调用 LLM 分析
|
||||
if user_prompt and user_prompt.strip():
|
||||
llm_result = await self.llm_service.analyze_with_template(
|
||||
sheet_data,
|
||||
user_prompt
|
||||
)
|
||||
else:
|
||||
llm_result = await self.llm_service.analyze_excel_data(
|
||||
sheet_data,
|
||||
user_prompt,
|
||||
analysis_type
|
||||
)
|
||||
|
||||
sheet_analyses[sheet_name] = llm_result
|
||||
|
||||
if not llm_result["success"]:
|
||||
errors[sheet_name] = llm_result.get("error", "未知错误")
|
||||
|
||||
logger.info(f"工作表 '{sheet_name}' 分析完成")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"工作表 '{sheet_name}' 分析失败: {str(e)}")
|
||||
errors[sheet_name] = str(e)
|
||||
|
||||
# 4. 组合结果
|
||||
return {
|
||||
"success": len(errors) == 0,
|
||||
"excel": {
|
||||
"sheets": sheets_data,
|
||||
"metadata": parse_result.metadata,
|
||||
"saved_path": saved_path
|
||||
},
|
||||
"analysis": {
|
||||
"sheets": sheet_analyses,
|
||||
"total_sheets": len(sheets_data),
|
||||
"successful": len(sheet_analyses) - len(errors),
|
||||
"errors": errors
|
||||
}
|
||||
}
|
||||
|
||||
def get_supported_analysis_types(self) -> List[str]:
|
||||
"""获取支持的分析类型"""
|
||||
return [
|
||||
{
|
||||
"value": "general",
|
||||
"label": "综合分析",
|
||||
"description": "提供数据概览、关键发现、质量评估和建议"
|
||||
},
|
||||
{
|
||||
"value": "summary",
|
||||
"label": "数据摘要",
|
||||
"description": "快速了解数据的结构、范围和主要内容"
|
||||
},
|
||||
{
|
||||
"value": "statistics",
|
||||
"label": "统计分析",
|
||||
"description": "数值型列的统计信息和分类列的分布"
|
||||
},
|
||||
{
|
||||
"value": "insights",
|
||||
"label": "深度洞察",
|
||||
"description": "深入挖掘数据,提供异常值和业务建议"
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
# 全局单例
|
||||
excel_ai_service = ExcelAIService()
|
||||
132
backend/app/services/file_service.py
Normal file
132
backend/app/services/file_service.py
Normal file
@@ -0,0 +1,132 @@
|
||||
"""
|
||||
文件服务模块 - 处理文件存储和读取
|
||||
"""
|
||||
import os
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
import uuid
|
||||
|
||||
from app.config import settings
|
||||
|
||||
|
||||
class FileService:
|
||||
"""文件服务类,负责文件的存储、读取和管理"""
|
||||
|
||||
def __init__(self):
|
||||
self.upload_dir = Path(settings.UPLOAD_DIR)
|
||||
self._ensure_upload_dir()
|
||||
|
||||
def _ensure_upload_dir(self):
|
||||
"""确保上传目录存在"""
|
||||
self.upload_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
def save_uploaded_file(
|
||||
self,
|
||||
file_content: bytes,
|
||||
filename: str,
|
||||
subfolder: Optional[str] = None
|
||||
) -> str:
|
||||
"""
|
||||
保存上传的文件
|
||||
|
||||
Args:
|
||||
file_content: 文件内容字节
|
||||
filename: 原始文件名
|
||||
subfolder: 可选的子文件夹名称
|
||||
|
||||
Returns:
|
||||
str: 保存后的文件路径
|
||||
"""
|
||||
# 生成唯一文件名,避免覆盖
|
||||
file_ext = Path(filename).suffix
|
||||
unique_name = f"{uuid.uuid4().hex}{file_ext}"
|
||||
|
||||
# 确定保存路径
|
||||
if subfolder:
|
||||
save_dir = self.upload_dir / subfolder
|
||||
save_dir.mkdir(parents=True, exist_ok=True)
|
||||
else:
|
||||
save_dir = self.upload_dir
|
||||
|
||||
file_path = save_dir / unique_name
|
||||
|
||||
# 写入文件
|
||||
with open(file_path, 'wb') as f:
|
||||
f.write(file_content)
|
||||
|
||||
return str(file_path)
|
||||
|
||||
def read_file(self, file_path: str) -> bytes:
|
||||
"""
|
||||
读取文件内容
|
||||
|
||||
Args:
|
||||
file_path: 文件路径
|
||||
|
||||
Returns:
|
||||
bytes: 文件内容
|
||||
"""
|
||||
with open(file_path, 'rb') as f:
|
||||
return f.read()
|
||||
|
||||
def delete_file(self, file_path: str) -> bool:
|
||||
"""
|
||||
删除文件
|
||||
|
||||
Args:
|
||||
file_path: 文件路径
|
||||
|
||||
Returns:
|
||||
bool: 是否删除成功
|
||||
"""
|
||||
try:
|
||||
file = Path(file_path)
|
||||
if file.exists():
|
||||
file.unlink()
|
||||
return True
|
||||
return False
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
def get_file_info(self, file_path: str) -> dict:
|
||||
"""
|
||||
获取文件信息
|
||||
|
||||
Args:
|
||||
file_path: 文件路径
|
||||
|
||||
Returns:
|
||||
dict: 文件信息
|
||||
"""
|
||||
file = Path(file_path)
|
||||
if not file.exists():
|
||||
return {}
|
||||
|
||||
stat = file.stat()
|
||||
return {
|
||||
"filename": file.name,
|
||||
"filepath": str(file),
|
||||
"size": stat.st_size,
|
||||
"created": datetime.fromtimestamp(stat.st_ctime).isoformat(),
|
||||
"modified": datetime.fromtimestamp(stat.st_mtime).isoformat(),
|
||||
"extension": file.suffix.lower()
|
||||
}
|
||||
|
||||
def get_file_size(self, file_path: str) -> int:
|
||||
"""
|
||||
获取文件大小(字节)
|
||||
|
||||
Args:
|
||||
file_path: 文件路径
|
||||
|
||||
Returns:
|
||||
int: 文件大小,文件不存在返回 0
|
||||
"""
|
||||
file = Path(file_path)
|
||||
return file.stat().st_size if file.exists() else 0
|
||||
|
||||
|
||||
# 全局单例
|
||||
file_service = FileService()
|
||||
105
backend/app/services/font_helper.py
Normal file
105
backend/app/services/font_helper.py
Normal file
@@ -0,0 +1,105 @@
|
||||
"""
|
||||
字体辅助模块 - 处理中文字体检测和配置
|
||||
"""
|
||||
import matplotlib
|
||||
import matplotlib.font_manager as fm
|
||||
import platform
|
||||
import os
|
||||
from pathlib import Path
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def get_chinese_font() -> str:
    """
    Pick a Chinese-capable font installed on this system.

    Tries a platform-specific priority list first, then generic fallbacks,
    then any installed font whose name hints at CJK coverage.

    Returns:
        str: a usable font family name, or 'sans-serif' as last resort
    """
    # Names of every font matplotlib can see on this machine.
    installed = {font.name for font in fm.fontManager.ttflist}

    system = platform.system()
    if system == 'Windows':
        candidates = [
            'Microsoft YaHei',   # Microsoft YaHei
            'SimHei',            # SimHei
            'SimSun',            # SimSun
            'KaiTi',             # KaiTi
            'FangSong',          # FangSong
            'STXihei',           # STXihei
            'STKaiti',           # STKaiti
            'STSong',            # STSong
            'STFangsong',        # STFangsong
        ]
    elif system == 'Darwin':
        candidates = [
            'PingFang SC',        # PingFang (Simplified)
            'PingFang TC',        # PingFang (Traditional)
            'Heiti SC',           # Heiti (Simplified)
            'Heiti TC',           # Heiti (Traditional)
            'STHeiti',            # STHeiti
            'STSong',             # STSong
            'STKaiti',            # STKaiti
            'Arial Unicode MS',   # Arial Unicode MS
        ]
    else:
        candidates = [
            'Noto Sans CJK SC',      # Noto Sans CJK (Simplified Chinese)
            'WenQuanYi Micro Hei',   # WenQuanYi Micro Hei
            'AR PL UMing CN',        # AR PL UMing
            'AR PL UKai CN',         # AR PL UKai
            'ZCOOL XiaoWei',         # ZCOOL XiaoWei
        ]

    # Cross-platform fallbacks appended after the platform-specific list.
    candidates += [
        'SimHei',
        'Microsoft YaHei',
        'Arial Unicode MS',
        'Droid Sans Fallback',
    ]

    # First candidate actually present wins.
    for candidate in candidates:
        if candidate in installed:
            logger.info(f"找到中文字体: {candidate}")
            return candidate

    # Nothing from the list: scan for any font whose name suggests CJK support.
    for font in fm.fontManager.ttflist:
        if 'CJK' in font.name or 'SC' in font.name or 'TC' in font.name:
            logger.info(f"使用找到的中文字体: {font.name}")
            return font.name

    # Last resort: generic family (CJK glyphs may render as boxes).
    logger.warning("未找到合适的中文字体,使用默认字体")
    return 'sans-serif'
|
||||
|
||||
|
||||
def configure_matplotlib_fonts():
    """
    Configure matplotlib globally so charts render Chinese text correctly.

    Returns:
        str: the font family that was selected and applied
    """
    chinese_font = get_chinese_font()

    rc = matplotlib.rcParams
    rc['font.sans-serif'] = [chinese_font]
    rc['axes.unicode_minus'] = False  # keep minus signs rendering with CJK fonts
    rc['figure.dpi'] = 100
    rc['savefig.dpi'] = 120

    # Default text sizes for every generated chart.
    sizes = {
        'font.size': 10,
        'axes.labelsize': 10,
        'axes.titlesize': 11,
        'xtick.labelsize': 9,
        'ytick.labelsize': 9,
        'legend.fontsize': 9,
    }
    for key, value in sizes.items():
        rc[key] = value

    logger.info(f"配置完成,使用字体: {chinese_font}")
    return chinese_font
|
||||
268
backend/app/services/llm_service.py
Normal file
268
backend/app/services/llm_service.py
Normal file
@@ -0,0 +1,268 @@
|
||||
"""
|
||||
LLM 服务模块 - 封装大模型 API 调用
|
||||
"""
|
||||
import logging
|
||||
from typing import Dict, Any, List, Optional
|
||||
import httpx
|
||||
|
||||
from app.config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class LLMService:
|
||||
"""大语言模型服务类"""
|
||||
|
||||
def __init__(self):
|
||||
self.api_key = settings.LLM_API_KEY
|
||||
self.base_url = settings.LLM_BASE_URL
|
||||
self.model_name = settings.LLM_MODEL_NAME
|
||||
|
||||
async def chat(
|
||||
self,
|
||||
messages: List[Dict[str, str]],
|
||||
temperature: float = 0.7,
|
||||
max_tokens: Optional[int] = None,
|
||||
**kwargs
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
调用聊天 API
|
||||
|
||||
Args:
|
||||
messages: 消息列表,格式为 [{"role": "user", "content": "..."}]
|
||||
temperature: 温度参数,控制随机性
|
||||
max_tokens: 最大生成 token 数
|
||||
**kwargs: 其他参数
|
||||
|
||||
Returns:
|
||||
Dict[str, Any]: API 响应结果
|
||||
"""
|
||||
headers = {
|
||||
"Authorization": f"Bearer {self.api_key}",
|
||||
"Content-Type": "application/json"
|
||||
}
|
||||
|
||||
payload = {
|
||||
"model": self.model_name,
|
||||
"messages": messages,
|
||||
"temperature": temperature
|
||||
}
|
||||
|
||||
if max_tokens:
|
||||
payload["max_tokens"] = max_tokens
|
||||
|
||||
# 添加其他参数
|
||||
payload.update(kwargs)
|
||||
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=60.0) as client:
|
||||
response = await client.post(
|
||||
f"{self.base_url}/chat/completions",
|
||||
headers=headers,
|
||||
json=payload
|
||||
)
|
||||
response.raise_for_status()
|
||||
return response.json()
|
||||
|
||||
except httpx.HTTPStatusError as e:
|
||||
logger.error(f"LLM API 请求失败: {e.response.status_code} - {e.response.text}")
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"LLM API 调用异常: {str(e)}")
|
||||
raise
|
||||
|
||||
def extract_message_content(self, response: Dict[str, Any]) -> str:
|
||||
"""
|
||||
从 API 响应中提取消息内容
|
||||
|
||||
Args:
|
||||
response: API 响应
|
||||
|
||||
Returns:
|
||||
str: 消息内容
|
||||
"""
|
||||
try:
|
||||
return response["choices"][0]["message"]["content"]
|
||||
except (KeyError, IndexError) as e:
|
||||
logger.error(f"解析 API 响应失败: {str(e)}")
|
||||
raise
|
||||
|
||||
async def analyze_excel_data(
|
||||
self,
|
||||
excel_data: Dict[str, Any],
|
||||
user_prompt: str,
|
||||
analysis_type: str = "general"
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
分析 Excel 数据
|
||||
|
||||
Args:
|
||||
excel_data: Excel 解析后的数据
|
||||
user_prompt: 用户提示词
|
||||
analysis_type: 分析类型 (general, summary, statistics, insights)
|
||||
|
||||
Returns:
|
||||
Dict[str, Any]: 分析结果
|
||||
"""
|
||||
# 构建 Prompt
|
||||
system_prompt = self._get_system_prompt(analysis_type)
|
||||
user_message = self._format_user_message(excel_data, user_prompt)
|
||||
|
||||
messages = [
|
||||
{"role": "system", "content": system_prompt},
|
||||
{"role": "user", "content": user_message}
|
||||
]
|
||||
|
||||
try:
|
||||
response = await self.chat(
|
||||
messages=messages,
|
||||
temperature=0.3, # 较低的温度以获得更稳定的输出
|
||||
max_tokens=2000
|
||||
)
|
||||
|
||||
content = self.extract_message_content(response)
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"analysis": content,
|
||||
"model": self.model_name,
|
||||
"analysis_type": analysis_type
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Excel 数据分析失败: {str(e)}")
|
||||
return {
|
||||
"success": False,
|
||||
"error": str(e),
|
||||
"analysis": None
|
||||
}
|
||||
|
||||
def _get_system_prompt(self, analysis_type: str) -> str:
|
||||
"""获取系统提示词"""
|
||||
prompts = {
|
||||
"general": """你是一个专业的数据分析师。请分析用户提供的 Excel 数据,提供有价值的见解和建议。
|
||||
|
||||
请按照以下格式输出:
|
||||
1. 数据概览
|
||||
2. 关键发现
|
||||
3. 数据质量评估
|
||||
4. 建议
|
||||
|
||||
输出语言:中文""",
|
||||
"summary": """你是一个专业的数据分析师。请对用户提供的 Excel 数据进行简洁的总结。
|
||||
|
||||
输出格式:
|
||||
- 数据行数和列数
|
||||
- 主要列的说明
|
||||
- 数据范围概述
|
||||
|
||||
输出语言:中文""",
|
||||
"statistics": """你是一个专业的数据分析师。请对用户提供的 Excel 数据进行统计分析。
|
||||
|
||||
请分析:
|
||||
- 数值型列的统计信息(平均值、中位数、最大值、最小值)
|
||||
- 分类列的分布情况
|
||||
- 数据相关性
|
||||
|
||||
输出语言:中文,使用表格或结构化格式展示""",
|
||||
"insights": """你是一个专业的数据分析师。请深入挖掘用户提供的 Excel 数据,提供有价值的洞察。
|
||||
|
||||
请分析:
|
||||
1. 数据中的异常值或特殊模式
|
||||
2. 数据之间的潜在关联
|
||||
3. 基于数据的业务建议
|
||||
4. 数据趋势分析(如适用)
|
||||
|
||||
输出语言:中文,提供详细且可操作的建议"""
|
||||
}
|
||||
|
||||
return prompts.get(analysis_type, prompts["general"])
|
||||
|
||||
def _format_user_message(self, excel_data: Dict[str, Any], user_prompt: str) -> str:
|
||||
"""格式化用户消息"""
|
||||
columns = excel_data.get("columns", [])
|
||||
rows = excel_data.get("rows", [])
|
||||
row_count = excel_data.get("row_count", 0)
|
||||
column_count = excel_data.get("column_count", 0)
|
||||
|
||||
# 构建数据描述
|
||||
data_info = f"""
|
||||
Excel 数据概览:
|
||||
- 行数: {row_count}
|
||||
- 列数: {column_count}
|
||||
- 列名: {', '.join(columns)}
|
||||
|
||||
数据样例(前 5 行):
|
||||
"""
|
||||
|
||||
# 添加数据样例
|
||||
for i, row in enumerate(rows[:5], 1):
|
||||
row_str = " | ".join([f"{col}: {row.get(col, '')}" for col in columns])
|
||||
data_info += f"第 {i} 行: {row_str}\n"
|
||||
|
||||
if row_count > 5:
|
||||
data_info += f"\n(还有 {row_count - 5} 行数据...)\n"
|
||||
|
||||
# 添加用户自定义提示
|
||||
if user_prompt and user_prompt.strip():
|
||||
data_info += f"\n用户需求:\n{user_prompt}"
|
||||
else:
|
||||
data_info += "\n用户需求: 请对上述数据进行分析"
|
||||
|
||||
return data_info
|
||||
|
||||
async def analyze_with_template(
|
||||
self,
|
||||
excel_data: Dict[str, Any],
|
||||
template_prompt: str
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
使用自定义模板分析 Excel 数据
|
||||
|
||||
Args:
|
||||
excel_data: Excel 解析后的数据
|
||||
template_prompt: 自定义提示词模板
|
||||
|
||||
Returns:
|
||||
Dict[str, Any]: 分析结果
|
||||
"""
|
||||
system_prompt = """你是一个专业的数据分析师。请根据用户提供的自定义提示词分析 Excel 数据。
|
||||
|
||||
请严格按照用户的要求进行分析,输出清晰、有条理的结果。
|
||||
|
||||
输出语言:中文"""
|
||||
|
||||
user_message = self._format_user_message(excel_data, template_prompt)
|
||||
|
||||
messages = [
|
||||
{"role": "system", "content": system_prompt},
|
||||
{"role": "user", "content": user_message}
|
||||
]
|
||||
|
||||
try:
|
||||
response = await self.chat(
|
||||
messages=messages,
|
||||
temperature=0.5,
|
||||
max_tokens=3000
|
||||
)
|
||||
|
||||
content = self.extract_message_content(response)
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"analysis": content,
|
||||
"model": self.model_name,
|
||||
"is_template": True
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"自定义模板分析失败: {str(e)}")
|
||||
return {
|
||||
"success": False,
|
||||
"error": str(e),
|
||||
"analysis": None
|
||||
}
|
||||
|
||||
|
||||
# 全局单例
|
||||
llm_service = LLMService()
|
||||
218
backend/app/services/text_analysis_service.py
Normal file
218
backend/app/services/text_analysis_service.py
Normal file
@@ -0,0 +1,218 @@
|
||||
"""
|
||||
文本分析服务 - 从 AI 分析结果中提取结构化数据用于可视化
|
||||
"""
|
||||
import logging
|
||||
from typing import Dict, Any, List, Optional
|
||||
import re
|
||||
import json
|
||||
|
||||
from app.services.llm_service import llm_service
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TextAnalysisService:
|
||||
"""文本分析服务类"""
|
||||
|
||||
def __init__(self):
|
||||
self.llm_service = llm_service
|
||||
|
||||
async def extract_structured_data(
|
||||
self,
|
||||
analysis_text: str,
|
||||
original_filename: str = "",
|
||||
file_type: str = "text"
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
从 AI 分析结果文本中提取结构化数据
|
||||
|
||||
Args:
|
||||
analysis_text: AI 分析结果文本
|
||||
original_filename: 原始文件名
|
||||
file_type: 文件类型
|
||||
|
||||
Returns:
|
||||
Dict[str, Any]: 提取的结构化数据
|
||||
"""
|
||||
# 限制分析的文本长度,避免 token 超限
|
||||
max_text_length = 8000
|
||||
truncated_text = analysis_text[:max_text_length]
|
||||
|
||||
system_prompt = """你是一个专业的数据提取助手。你的任务是从AI分析结果中提取结构化数据,用于生成图表。
|
||||
|
||||
请按照以下要求提取数据:
|
||||
|
||||
1. 数值型数据:
|
||||
- 提取所有的数值、统计信息、百分比等
|
||||
- 为每个数值创建一个条目,包含:名称、值、单位(如果有)
|
||||
- 格式示例:{"name": "销售额", "value": 123456.78, "unit": "元"}
|
||||
|
||||
2. 分类数据:
|
||||
- 提取所有的类别、状态、枚举值等
|
||||
- 为每个类别创建一个条目,包含:名称、值、数量(如果有)
|
||||
- 格式示例:{"name": "产品类别", "value": "电子产品", "count": 25}
|
||||
|
||||
3. 时间序列数据:
|
||||
- 提取所有的时间相关数据(年月、季度、日期等)
|
||||
- 格式示例:{"name": "2025年1月", "value": 12345}
|
||||
|
||||
4. 对比数据:
|
||||
- 提取所有的对比、排名、趋势等数据
|
||||
- 格式示例:{"name": "同比增长", "value": 15.3, "unit": "%"}
|
||||
|
||||
5. 表格数据:
|
||||
- 如果分析结果中包含表格或列表形式的数据,提取出来
|
||||
- 格式:{"columns": ["列1", "列2"], "rows": [{"列1": "值1", "列2": "值2"}]}
|
||||
|
||||
重要规则:
|
||||
- 只提取明确提到的数据和数值
|
||||
- 如果某种类型的数据不存在,返回空数组 []
|
||||
- 确保所有数值都是有效的数字类型
|
||||
- 保持数据的原始精度
|
||||
- 返回的 JSON 必须完整且格式正确
|
||||
- 表格数据最多提取 20 行
|
||||
|
||||
请以 JSON 格式返回,不要添加任何 Markdown 标记或解释文字,只返回纯 JSON:
|
||||
{
|
||||
"success": true,
|
||||
"data": {
|
||||
"numeric_data": [
|
||||
{"name": string, "value": number, "unit": string|null}
|
||||
],
|
||||
"categorical_data": [
|
||||
{"name": string, "value": string, "count": number|null}
|
||||
],
|
||||
"time_series_data": [
|
||||
{"name": string, "value": number}
|
||||
],
|
||||
"comparison_data": [
|
||||
{"name": string, "value": number, "unit": string|null}
|
||||
],
|
||||
"table_data": {
|
||||
"columns": string[],
|
||||
"rows": object[]
|
||||
} | null
|
||||
},
|
||||
"metadata": {
|
||||
"total_items": number,
|
||||
"data_types": string[]
|
||||
}
|
||||
}"""
|
||||
|
||||
user_message = f"""请从以下 AI 分析结果中提取结构化数据:
|
||||
|
||||
原始文件名:{original_filename}
|
||||
文件类型:{file_type}
|
||||
|
||||
AI 分析结果:
|
||||
{truncated_text}
|
||||
|
||||
请按照系统提示的要求提取数据并返回纯 JSON 格式。"""
|
||||
|
||||
messages = [
|
||||
{"role": "system", "content": system_prompt},
|
||||
{"role": "user", "content": user_message}
|
||||
]
|
||||
|
||||
try:
|
||||
logger.info(f"开始提取结构化数据,文本长度: {len(truncated_text)}")
|
||||
|
||||
response = await self.llm_service.chat(
|
||||
messages=messages,
|
||||
temperature=0.1,
|
||||
max_tokens=4000
|
||||
)
|
||||
|
||||
content = self.llm_service.extract_message_content(response)
|
||||
logger.info(f"LLM 返回内容长度: {len(content)}")
|
||||
|
||||
# 使用简单的方法提取 JSON
|
||||
result = self._extract_json_simple(content)
|
||||
|
||||
if not result:
|
||||
logger.error("无法从 LLM 响应中提取有效的 JSON")
|
||||
return {
|
||||
"success": False,
|
||||
"error": "AI 返回的数据格式不正确或被截断",
|
||||
"raw_content": content[:500]
|
||||
}
|
||||
|
||||
logger.info(f"成功提取结构化数据")
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"提取结构化数据失败: {str(e)}")
|
||||
return {
|
||||
"success": False,
|
||||
"error": str(e)
|
||||
}
|
||||
|
||||
def _extract_json_simple(self, content: str) -> Optional[Dict[str, Any]]:
|
||||
"""
|
||||
简化的 JSON 提取方法
|
||||
|
||||
Args:
|
||||
content: LLM 返回的内容
|
||||
|
||||
Returns:
|
||||
Optional[Dict[str, Any]]: 解析后的 JSON,失败返回 None
|
||||
"""
|
||||
try:
|
||||
# 方法 1: 查找 ```json 代码块
|
||||
code_block_match = re.search(r'```json\n{[\s\S]*?}[\s\S]*?}\n```', content, re.DOTALL)
|
||||
if code_block_match:
|
||||
json_str = code_block_match.group(1)
|
||||
logger.info("从代码块中提取 JSON")
|
||||
return json.loads(json_str)
|
||||
|
||||
# 方法 2: 查找第一个完整的 { } 对象
|
||||
brace_count = 0
|
||||
json_start = -1
|
||||
|
||||
for i in range(len(content)):
|
||||
if content[i] == '{':
|
||||
if brace_count == 0:
|
||||
json_start = i
|
||||
brace_count += 1
|
||||
elif content[i] == '}':
|
||||
brace_count -= 1
|
||||
if brace_count == 0:
|
||||
# 找到了完整的 JSON 对象
|
||||
json_end = i + 1
|
||||
json_str = content[json_start:json_end]
|
||||
logger.info(f"从大括号中提取 JSON")
|
||||
return json.loads(json_str)
|
||||
|
||||
# 方法 3: 尝试直接解析
|
||||
logger.info("尝试直接解析整个内容")
|
||||
return json.loads(content)
|
||||
|
||||
except json.JSONDecodeError as e:
|
||||
logger.error(f"JSON 解析失败: {str(e)}")
|
||||
logger.error(f"原始内容(前 500 字符): {content[:500]}...")
|
||||
return None
|
||||
except Exception as e:
|
||||
logger.error(f"提取 JSON 失败: {str(e)}")
|
||||
return None
|
||||
|
||||
def detect_data_types(self, data: Dict[str, Any]) -> List[str]:
|
||||
"""检测数据中包含的类型"""
|
||||
types = []
|
||||
d = data.get("data", {})
|
||||
|
||||
if d.get("numeric_data") and len(d["numeric_data"]) > 0:
|
||||
types.append("numeric")
|
||||
if d.get("categorical_data") and len(d["categorical_data"]) > 0:
|
||||
types.append("categorical")
|
||||
if d.get("time_series_data") and len(d["time_series_data"]) > 0:
|
||||
types.append("time_series")
|
||||
if d.get("comparison_data") and len(d["comparison_data"]) > 0:
|
||||
types.append("comparison")
|
||||
if d.get("table_data") and d["table_data"]:
|
||||
types.append("table")
|
||||
|
||||
return types
|
||||
|
||||
|
||||
# 全局单例
|
||||
text_analysis_service = TextAnalysisService()
|
||||
0
backend/app/services/text_analysis_service_fixed.py
Normal file
0
backend/app/services/text_analysis_service_fixed.py
Normal file
388
backend/app/services/visualization_service.py
Normal file
388
backend/app/services/visualization_service.py
Normal file
@@ -0,0 +1,388 @@
|
||||
"""
|
||||
数据可视化服务 - 使用 matplotlib/plotly 生成统计图表
|
||||
"""
|
||||
import io
|
||||
import base64
|
||||
import logging
|
||||
from typing import Dict, Any, List, Optional, Union
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
import matplotlib
|
||||
import numpy as np
|
||||
|
||||
# 使用字体辅助模块配置中文字体
|
||||
from app.services.font_helper import configure_matplotlib_fonts
|
||||
|
||||
configure_matplotlib_fonts()
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class VisualizationService:
|
||||
"""数据可视化服务类"""
|
||||
|
||||
def __init__(self):
|
||||
self.output_dir = Path(__file__).resolve().parent.parent.parent / "data" / "charts"
|
||||
self.output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
def analyze_and_visualize(
|
||||
self,
|
||||
excel_data: Dict[str, Any],
|
||||
analysis_type: str = "statistics"
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
分析数据并生成可视化图表
|
||||
|
||||
Args:
|
||||
excel_data: Excel 解析后的数据
|
||||
analysis_type: 分析类型
|
||||
|
||||
Returns:
|
||||
Dict[str, Any]: 包含图表数据和统计信息的结果
|
||||
"""
|
||||
try:
|
||||
columns = excel_data.get("columns", [])
|
||||
rows = excel_data.get("rows", [])
|
||||
|
||||
if not columns or not rows:
|
||||
return {
|
||||
"success": False,
|
||||
"error": "没有数据可用于分析"
|
||||
}
|
||||
|
||||
# 转换为 DataFrame
|
||||
df = pd.DataFrame(rows, columns=columns)
|
||||
|
||||
# 根据列类型分类
|
||||
numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()
|
||||
categorical_columns = df.select_dtypes(exclude=[np.number]).columns.tolist()
|
||||
|
||||
# 生成统计信息
|
||||
statistics = self._generate_statistics(df, numeric_columns, categorical_columns)
|
||||
|
||||
# 生成图表
|
||||
charts = self._generate_charts(df, numeric_columns, categorical_columns)
|
||||
|
||||
# 生成数据分布信息
|
||||
distributions = self._generate_distributions(df, categorical_columns)
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"statistics": statistics,
|
||||
"charts": charts,
|
||||
"distributions": distributions,
|
||||
"row_count": len(df),
|
||||
"column_count": len(columns)
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"可视化分析失败: {str(e)}", exc_info=True)
|
||||
return {
|
||||
"success": False,
|
||||
"error": str(e)
|
||||
}
|
||||
|
||||
def _generate_statistics(
|
||||
self,
|
||||
df: pd.DataFrame,
|
||||
numeric_columns: List[str],
|
||||
categorical_columns: List[str]
|
||||
) -> Dict[str, Any]:
|
||||
"""生成统计信息"""
|
||||
statistics = {
|
||||
"numeric": {},
|
||||
"categorical": {}
|
||||
}
|
||||
|
||||
# 数值型列统计
|
||||
for col in numeric_columns:
|
||||
try:
|
||||
stats = {
|
||||
"count": int(df[col].count()),
|
||||
"mean": float(df[col].mean()),
|
||||
"median": float(df[col].median()),
|
||||
"std": float(df[col].std()) if df[col].count() > 1 else 0,
|
||||
"min": float(df[col].min()),
|
||||
"max": float(df[col].max()),
|
||||
"q25": float(df[col].quantile(0.25)),
|
||||
"q75": float(df[col].quantile(0.75)),
|
||||
"missing": int(df[col].isna().sum())
|
||||
}
|
||||
statistics["numeric"][col] = stats
|
||||
except Exception as e:
|
||||
logger.warning(f"列 {col} 统计失败: {str(e)}")
|
||||
|
||||
# 分类型列统计
|
||||
for col in categorical_columns:
|
||||
try:
|
||||
value_counts = df[col].value_counts()
|
||||
stats = {
|
||||
"unique": int(df[col].nunique()),
|
||||
"most_common": str(value_counts.index[0]) if len(value_counts) > 0 else "",
|
||||
"most_common_count": int(value_counts.iloc[0]) if len(value_counts) > 0 else 0,
|
||||
"missing": int(df[col].isna().sum()),
|
||||
"distribution": {str(k): int(v) for k, v in value_counts.items()}
|
||||
}
|
||||
statistics["categorical"][col] = stats
|
||||
except Exception as e:
|
||||
logger.warning(f"列 {col} 统计失败: {str(e)}")
|
||||
|
||||
return statistics
|
||||
|
||||
def _generate_charts(
|
||||
self,
|
||||
df: pd.DataFrame,
|
||||
numeric_columns: List[str],
|
||||
categorical_columns: List[str]
|
||||
) -> Dict[str, Any]:
|
||||
"""生成图表"""
|
||||
charts = {}
|
||||
|
||||
# 1. 数值型列的直方图
|
||||
charts["histograms"] = []
|
||||
for col in numeric_columns[:5]: # 限制最多 5 个数值列
|
||||
chart_data = self._create_histogram(df[col], col)
|
||||
if chart_data:
|
||||
charts["histograms"].append(chart_data)
|
||||
|
||||
# 2. 分类型列的条形图
|
||||
charts["bar_charts"] = []
|
||||
for col in categorical_columns[:5]: # 限制最多 5 个分类型列
|
||||
chart_data = self._create_bar_chart(df[col], col)
|
||||
if chart_data:
|
||||
charts["bar_charts"].append(chart_data)
|
||||
|
||||
# 3. 数值型列的箱线图
|
||||
charts["box_plots"] = []
|
||||
if len(numeric_columns) > 0:
|
||||
chart_data = self._create_box_plot(df[numeric_columns[:5]], numeric_columns[:5])
|
||||
if chart_data:
|
||||
charts["box_plots"].append(chart_data)
|
||||
|
||||
# 4. 相关性热力图
|
||||
if len(numeric_columns) >= 2:
|
||||
chart_data = self._create_correlation_heatmap(df[numeric_columns], numeric_columns)
|
||||
if chart_data:
|
||||
charts["correlation"] = chart_data
|
||||
|
||||
return charts
|
||||
|
||||
def _create_histogram(self, series: pd.Series, column_name: str) -> Optional[Dict[str, Any]]:
|
||||
"""创建直方图"""
|
||||
try:
|
||||
fig, ax = plt.subplots(figsize=(11, 7))
|
||||
ax.hist(series.dropna(), bins=20, edgecolor='black', alpha=0.7, color='#3b82f6')
|
||||
ax.set_xlabel(column_name, fontsize=10, labelpad=10)
|
||||
ax.set_ylabel('频数', fontsize=10, labelpad=10)
|
||||
ax.set_title(f'{column_name} 分布', fontsize=12, fontweight='bold', pad=15)
|
||||
ax.grid(True, alpha=0.3, axis='y')
|
||||
ax.tick_params(axis='both', which='major', labelsize=9)
|
||||
|
||||
# 改进布局
|
||||
plt.tight_layout(pad=1.5, w_pad=1.0, h_pad=1.0)
|
||||
|
||||
# 转换为 base64
|
||||
img_base64 = self._figure_to_base64(fig)
|
||||
|
||||
return {
|
||||
"type": "histogram",
|
||||
"column": column_name,
|
||||
"image": img_base64,
|
||||
"stats": {
|
||||
"mean": float(series.mean()),
|
||||
"median": float(series.median()),
|
||||
"std": float(series.std()) if len(series) > 1 else 0
|
||||
}
|
||||
}
|
||||
except Exception as e:
|
||||
logger.error(f"创建直方图失败 ({column_name}): {str(e)}")
|
||||
return None
|
||||
|
||||
def _create_bar_chart(self, series: pd.Series, column_name: str) -> Optional[Dict[str, Any]]:
|
||||
"""创建条形图"""
|
||||
try:
|
||||
value_counts = series.value_counts().head(10) # 只显示前 10 个
|
||||
fig, ax = plt.subplots(figsize=(12, 7))
|
||||
|
||||
# 处理标签显示
|
||||
labels = [str(x)[:15] + '...' if len(str(x)) > 15 else str(x) for x in value_counts.index]
|
||||
x_pos = range(len(value_counts))
|
||||
bars = ax.bar(x_pos, value_counts.values, color='#10b981', alpha=0.8, edgecolor='black', linewidth=0.5)
|
||||
|
||||
ax.set_xticks(x_pos)
|
||||
ax.set_xticklabels(labels, rotation=30, ha='right', fontsize=8)
|
||||
ax.set_xlabel(column_name, fontsize=10, labelpad=10)
|
||||
ax.set_ylabel('数量', fontsize=10, labelpad=10)
|
||||
ax.set_title(f'{column_name} 分布 (Top 10)', fontsize=12, fontweight='bold', pad=15)
|
||||
ax.grid(True, alpha=0.3, axis='y')
|
||||
ax.tick_params(axis='both', which='major', labelsize=9)
|
||||
|
||||
# 添加数值标签(位置稍微上移)
|
||||
max_val = value_counts.values.max()
|
||||
y_offset = max_val * 0.02 if max_val > 0 else 0.5
|
||||
for bar, value in zip(bars, value_counts.values):
|
||||
ax.text(bar.get_x() + bar.get_width() / 2., value + y_offset,
|
||||
f'{int(value)}',
|
||||
ha='center', va='bottom', fontsize=8, fontweight='bold')
|
||||
|
||||
# 改进布局
|
||||
plt.tight_layout(pad=1.5, w_pad=1.0, h_pad=1.0)
|
||||
|
||||
# 转换为 base64
|
||||
img_base64 = self._figure_to_base64(fig)
|
||||
|
||||
return {
|
||||
"type": "bar_chart",
|
||||
"column": column_name,
|
||||
"image": img_base64,
|
||||
"categories": {str(k): int(v) for k, v in value_counts.items()}
|
||||
}
|
||||
except Exception as e:
|
||||
logger.error(f"创建条形图失败 ({column_name}): {str(e)}")
|
||||
return None
|
||||
|
||||
def _create_box_plot(self, df: pd.DataFrame, columns: List[str]) -> Optional[Dict[str, Any]]:
|
||||
"""创建箱线图"""
|
||||
try:
|
||||
fig, ax = plt.subplots(figsize=(14, 7))
|
||||
|
||||
# 准备数据
|
||||
box_data = [df[col].dropna() for col in columns]
|
||||
bp = ax.boxplot(box_data, labels=columns, patch_artist=True,
|
||||
notch=True, showcaps=True, showfliers=True)
|
||||
|
||||
# 美化箱线图
|
||||
box_colors = ['#3b82f6', '#10b981', '#f59e0b', '#ef4444', '#8b5cf6']
|
||||
for patch, color in zip(bp['boxes'], box_colors[:len(bp['boxes'])]):
|
||||
patch.set_facecolor(color)
|
||||
patch.set_alpha(0.6)
|
||||
patch.set_linewidth(1.5)
|
||||
|
||||
# 设置其他元素样式
|
||||
for element in ['whiskers', 'fliers', 'means', 'medians', 'caps']:
|
||||
plt.setp(bp[element], linewidth=1.5)
|
||||
|
||||
ax.set_ylabel('值', fontsize=10, labelpad=10)
|
||||
ax.set_title('数值型列分布对比', fontsize=12, fontweight='bold', pad=15)
|
||||
ax.grid(True, alpha=0.3, axis='y')
|
||||
|
||||
# 旋转 x 轴标签以避免重叠
|
||||
plt.setp(ax.get_xticklabels(), rotation=30, ha='right', fontsize=9)
|
||||
ax.tick_params(axis='both', which='major', labelsize=9)
|
||||
|
||||
# 改进布局
|
||||
plt.tight_layout(pad=1.5, w_pad=1.5, h_pad=1.0)
|
||||
|
||||
# 转换为 base64
|
||||
img_base64 = self._figure_to_base64(fig)
|
||||
|
||||
return {
|
||||
"type": "box_plot",
|
||||
"columns": columns,
|
||||
"image": img_base64
|
||||
}
|
||||
except Exception as e:
|
||||
logger.error(f"创建箱线图失败: {str(e)}")
|
||||
return None
|
||||
|
||||
def _create_correlation_heatmap(self, df: pd.DataFrame, columns: List[str]) -> Optional[Dict[str, Any]]:
|
||||
"""创建相关性热力图"""
|
||||
try:
|
||||
# 计算相关系数
|
||||
corr = df.corr()
|
||||
|
||||
fig, ax = plt.subplots(figsize=(11, 9))
|
||||
im = ax.imshow(corr, cmap='RdBu_r', aspect='auto', vmin=-1, vmax=1)
|
||||
|
||||
# 设置刻度
|
||||
n_cols = len(corr)
|
||||
ax.set_xticks(np.arange(n_cols))
|
||||
ax.set_yticks(np.arange(n_cols))
|
||||
|
||||
# 处理过长的列名
|
||||
x_labels = [str(col)[:10] + '...' if len(str(col)) > 10 else str(col) for col in corr.columns]
|
||||
y_labels = [str(col)[:10] + '...' if len(str(col)) > 10 else str(col) for col in corr.columns]
|
||||
|
||||
ax.set_xticklabels(x_labels, rotation=30, ha='right', fontsize=9)
|
||||
ax.set_yticklabels(y_labels, fontsize=9)
|
||||
|
||||
# 添加数值标签,根据相关性值选择颜色
|
||||
for i in range(n_cols):
|
||||
for j in range(n_cols):
|
||||
value = corr.iloc[i, j]
|
||||
# 根据背景色深浅选择文字颜色
|
||||
text_color = 'white' if abs(value) > 0.5 else 'black'
|
||||
ax.text(j, i, f'{value:.2f}',
|
||||
ha="center", va="center", color=text_color,
|
||||
fontsize=8, fontweight='bold' if abs(value) > 0.7 else 'normal')
|
||||
|
||||
ax.set_title('数值型列相关性热力图', fontsize=12, fontweight='bold', pad=15)
|
||||
ax.tick_params(axis='both', which='major', labelsize=9)
|
||||
|
||||
# 添加颜色条
|
||||
cbar = plt.colorbar(im, ax=ax)
|
||||
cbar.set_label('相关系数', rotation=270, labelpad=20, fontsize=10)
|
||||
cbar.ax.tick_params(labelsize=9)
|
||||
|
||||
# 改进布局
|
||||
plt.tight_layout(pad=2.0, w_pad=1.0, h_pad=1.0)
|
||||
|
||||
# 转换为 base64
|
||||
img_base64 = self._figure_to_base64(fig)
|
||||
|
||||
return {
|
||||
"type": "correlation_heatmap",
|
||||
"columns": columns,
|
||||
"image": img_base64,
|
||||
"correlation_matrix": corr.to_dict()
|
||||
}
|
||||
except Exception as e:
|
||||
logger.error(f"创建相关性热力图失败: {str(e)}")
|
||||
return None
|
||||
|
||||
def _generate_distributions(
|
||||
self,
|
||||
df: pd.DataFrame,
|
||||
categorical_columns: List[str]
|
||||
) -> Dict[str, Any]:
|
||||
"""生成数据分布信息"""
|
||||
distributions = {}
|
||||
|
||||
for col in categorical_columns[:5]:
|
||||
try:
|
||||
value_counts = df[col].value_counts()
|
||||
total = len(df)
|
||||
|
||||
distributions[col] = {
|
||||
"categories": {str(k): int(v) for k, v in value_counts.items()},
|
||||
"percentages": {str(k): round(v / total * 100, 2) for k, v in value_counts.items()},
|
||||
"unique_count": len(value_counts)
|
||||
}
|
||||
except Exception as e:
|
||||
logger.warning(f"列 {col} 分布生成失败: {str(e)}")
|
||||
|
||||
return distributions
|
||||
|
||||
def _figure_to_base64(self, fig) -> str:
|
||||
"""将 matplotlib 图形转换为 base64 字符串"""
|
||||
buf = io.BytesIO()
|
||||
fig.savefig(
|
||||
buf,
|
||||
format='png',
|
||||
dpi=120,
|
||||
bbox_inches='tight',
|
||||
pad_inches=0.3,
|
||||
facecolor='white',
|
||||
edgecolor='none',
|
||||
transparent=False
|
||||
)
|
||||
plt.close(fig)
|
||||
buf.seek(0)
|
||||
img_base64 = base64.b64encode(buf.read()).decode('utf-8')
|
||||
return f"data:image/png;base64,{img_base64}"
|
||||
|
||||
|
||||
# 全局单例
|
||||
visualization_service = VisualizationService()
|
||||
@@ -103,6 +103,21 @@ git config user.email #同上
|
||||
#如果想看全局的,可以加上 --global,例如 git config --global user.name
|
||||
```
|
||||
|
||||
## 启动后端项目
|
||||
在终端输入以下命令:
|
||||
```bash
|
||||
cd backend #确保启动时在后端跟目录下
|
||||
./venv/Scripts/python.exe -m uvicorn app.main:app --host 127.0.0.1 --port 8000
|
||||
--reload #启动后端项目
|
||||
```
|
||||
先启动后端项目,再启动前端项目
|
||||
|
||||
记得在你的.gitignore中添加:
|
||||
```
|
||||
/backend/data/uploads
|
||||
/backend/data/charts
|
||||
```
|
||||
|
||||
## 预计项目结构:
|
||||
|
||||
```bash
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
fastapi[all]==0.104.1
|
||||
fastapi[all]==0.104.1
|
||||
uvicorn[standard]==0.24.0
|
||||
pydantic==2.5.0
|
||||
pydantic-settings==2.1.0
|
||||
python-multipart==0.0.6
|
||||
pymongo==4.5.0
|
||||
redis==5.0.0
|
||||
@@ -10,6 +11,8 @@ faiss-cpu==1.8.0
|
||||
python-docx==0.8.11
|
||||
pandas==2.1.4
|
||||
openpyxl==3.1.2
|
||||
matplotlib==3.8.2
|
||||
numpy==1.26.2
|
||||
markdown==3.5.1
|
||||
langchain==0.1.0
|
||||
langchain-community==0.0.10
|
||||
@@ -18,5 +21,4 @@ httpx==0.25.2
|
||||
python-dotenv==1.0.0
|
||||
loguru==0.7.2
|
||||
tqdm==4.66.1
|
||||
numpy==1.26.2
|
||||
PyYAML==6.0.1
|
||||
PyYAML==6.0.1
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
print("Hello,World")
|
||||
@@ -1 +0,0 @@
|
||||
print("Hello world!")
|
||||
@@ -1 +0,0 @@
|
||||
print("hello,world!")
|
||||
71
backend/test_font_config.py
Normal file
71
backend/test_font_config.py
Normal file
@@ -0,0 +1,71 @@
|
||||
"""
|
||||
测试字体配置是否正常工作
|
||||
"""
|
||||
import matplotlib.pyplot as plt
|
||||
import matplotlib
|
||||
import numpy as np
|
||||
from app.services.font_helper import configure_matplotlib_fonts
|
||||
import io
|
||||
import base64
|
||||
|
||||
# 配置字体
|
||||
font_name = configure_matplotlib_fonts()
|
||||
|
||||
print(f"当前使用字体: {font_name}")
|
||||
print(f"matplotlib 中文字体设置: {matplotlib.rcParams['font.sans-serif']}")
|
||||
|
||||
# 创建测试图表
|
||||
fig, ax = plt.subplots(figsize=(10, 6))
|
||||
|
||||
# 测试数据
|
||||
x = ['销售', '库存', '采购', '退货', '其他']
|
||||
y = [150, 200, 180, 50, 30]
|
||||
|
||||
bars = ax.bar(x, y, color='#3b82f6', alpha=0.8)
|
||||
ax.set_xlabel('类别', fontsize=12, labelpad=10)
|
||||
ax.set_ylabel('数值', fontsize=12, labelpad=10)
|
||||
ax.set_title('测试图表 - 中文显示', fontsize=14, fontweight='bold', pad=15)
|
||||
ax.tick_params(axis='both', which='major', labelsize=10)
|
||||
|
||||
# 添加数值标签
|
||||
for bar, value in zip(bars, y):
|
||||
height = bar.get_height()
|
||||
ax.text(bar.get_x() + bar.get_width() / 2., height,
|
||||
f'{value}',
|
||||
ha='center', va='bottom', fontsize=10, fontweight='bold')
|
||||
|
||||
plt.grid(axis='y', alpha=0.3)
|
||||
plt.tight_layout(pad=1.5)
|
||||
|
||||
# 转换为 base64
|
||||
buf = io.BytesIO()
|
||||
fig.savefig(buf, format='png', dpi=120, bbox_inches='tight', pad_inches=0.3, facecolor='white')
|
||||
plt.close(fig)
|
||||
|
||||
buf.seek(0)
|
||||
img_base64 = base64.b64encode(buf.read()).decode('utf-8')
|
||||
data_url = f"data:image/png;base64,{img_base64}"
|
||||
|
||||
print("\n=== 测试完成 ===")
|
||||
print(f"图表大小: {len(img_base64)} 字符")
|
||||
print("如果看到字体警告,请检查系统是否有安装中文字体")
|
||||
|
||||
# 尝试获取所有可用字体
|
||||
import matplotlib.font_manager as fm
|
||||
available_fonts = set([f.name for f in fm.fontManager.ttflist])
|
||||
|
||||
print(f"\n=== 可用字体列表(部分)===")
|
||||
chinese_fonts = [f for f in available_fonts if 'CJK' in f or 'Chinese' in f or 'YaHei' in f or 'SimHei' in f or 'PingFang' in f]
|
||||
for font in sorted(chinese_fonts)[:10]:
|
||||
print(f" - {font}")
|
||||
|
||||
if not chinese_fonts:
|
||||
print(" 未找到中文字体!")
|
||||
|
||||
print("\n=== 推荐安装的中文字体 ===")
|
||||
print("Windows: Microsoft YaHei (系统自带)")
|
||||
print("macOS: PingFang SC (系统自带)")
|
||||
print("Linux: fonts-noto-cjk 或 fonts-wqy-zenhei")
|
||||
|
||||
print("\n=== 生成的 base64 数据(前100字符)===")
|
||||
print(data_url[:100] + "...")
|
||||
Reference in New Issue
Block a user