# Json-Python-Server/app/services/analysis_system.py
# (web-export artifact header preserved as a comment:
#  1062 lines, 44 KiB, Python, "Raw Normal View History",
#  snapshot 2026-01-29 18:18:32 +08:00)
import os
import tempfile
import pandas as pd
import numpy as np
import matplotlib
import time
import re
import requests
from datetime import datetime
import warnings
import gc
import math
from decimal import Decimal
from typing import Any, Dict, List, Tuple
from openai import OpenAI
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from app.services.font_manager import FontManager
from app.services.analysis.modules.basic import (
generate_statistical_overview as _generate_statistical_overview,
perform_normality_tests as _perform_normality_tests,
)
from app.services.analysis.modules.modeling import (
analyze_feature_importance as _analyze_feature_importance,
perform_var_analysis as _perform_var_analysis,
)
from app.services.analysis.modules.multivariate import (
generate_correlation_heatmap as _generate_correlation_heatmap,
generate_pca_scree_plot as _generate_pca_scree_plot,
perform_clustering_analysis as _perform_clustering_analysis,
perform_factor_analysis as _perform_factor_analysis,
perform_pca_analysis as _perform_pca_analysis,
)
from app.services.analysis.modules.stationarity import (
perform_cointegration_test as _perform_cointegration_test,
perform_stationarity_tests as _perform_stationarity_tests,
)
from app.services.analysis.modules.time_series import (
generate_acf_pacf_plots as _generate_acf_pacf_plots,
generate_time_series_plots as _generate_time_series_plots,
perform_seasonal_decomposition as _perform_seasonal_decomposition,
perform_spectral_analysis as _perform_spectral_analysis,
)
class TimeSeriesAnalysisSystem:
    """End-to-end time-series analysis pipeline.

    Loads a CSV, preprocesses it, runs a battery of statistical,
    time-series and multivariate analyses (delegated to the functions in
    app.services.analysis.modules.*), and asks an LLM API for a narrative
    interpretation of each step's results.
    """

    def __init__(self, csv_path, task_description, data_background=None, language='en', generate_plots=False):
        # Input CSV path and free-text description of the analysis task.
        self.csv_path = csv_path
        self.task_description = task_description
        # Optional user background (source/method/purpose/domain keys are
        # read via .get() later); default to {} so lookups never fail.
        self.data_background = data_background or {}
        self.language = language
        self.generate_plots = generate_plots
        self.data = None
        # Scratch dir for fallback chart images; cleaned up when the
        # TemporaryDirectory object is finalized.
        self.temp_dir = tempfile.TemporaryDirectory()
        # Unified font setup (configures matplotlib, returns a font name).
        self.chinese_font = self.setup_fonts(language)
        # Initialize the LLM API client configuration.
        self.client_config = self.create_qwen_client()
        # NOTE(review): create_qwen_client always returns a dict (even on
        # failure it returns a "simulation-mode" config), so this warning
        # branch appears unreachable — confirm intended.
        if not self.client_config:
            print("警告: 无法初始化API客户端将使用模拟分析")
        # Timestamped analysis log entries and preprocessing-step records.
        self.analysis_log = []
        self.preprocessing_steps = []
# ---- Generic sanitising helper ----
def to_echarts_safe(self, obj: Any, _seen: Tuple[int, ...] = ()):
    """Recursively convert *obj* into a JSON/ECharts-serialisable structure.

    Handles NaN/Inf (mapped to None), pandas NA/Timestamp, datetime,
    numpy scalars/arrays, Decimal, DataFrame/Series, dict, list/tuple and
    set/frozenset. Unknown types are returned unchanged.

    ``_seen`` carries the ids of the containers on the current descent
    path, so self-referencing structures terminate (mapped to None)
    instead of recursing forever; siblings that share an object are
    still both serialised because the path tuple is not shared upward.
    """
    obj_id = id(obj)
    if obj_id in _seen:
        return None  # cycle on the current descent path
    seen = _seen + (obj_id,)
    # pandas NA singleton (guarded: comparison can raise on odd types)
    try:
        if obj is pd.NA:  # type: ignore[comparison-overlap]
            return None
    except Exception:
        pass
    # None / bool / int / str pass through untouched (bool before the
    # float branch because bool is a subclass of int, not float).
    if obj is None or isinstance(obj, (bool, int, str)):
        return obj
    # float / numpy float: NaN and +/-Inf are not valid JSON -> None
    if isinstance(obj, (float, np.floating)):
        value = float(obj)
        return None if (math.isnan(value) or math.isinf(value)) else value
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, Decimal):
        # BUG FIX: Decimal('NaN') / Decimal('Infinity') previously leaked
        # through as float nan/inf; route through the same NaN/Inf guard.
        value = float(obj)
        return None if (math.isnan(value) or math.isinf(value)) else value
    # pd.Timestamp subclasses datetime; both serialise as ISO-8601 text.
    if isinstance(obj, (pd.Timestamp, datetime)):
        return obj.isoformat()
    if isinstance(obj, np.ndarray):
        return [self.to_echarts_safe(v, seen) for v in obj.tolist()]
    # DataFrame -> header row followed by value rows.
    if isinstance(obj, pd.DataFrame):
        return [obj.columns.tolist()] + [self.to_echarts_safe(row, seen) for row in obj.values.tolist()]
    if isinstance(obj, pd.Series):
        return self.to_echarts_safe(obj.tolist(), seen)
    if isinstance(obj, dict):
        return {str(k): self.to_echarts_safe(v, seen) for k, v in obj.items()}
    # Generalisation: sets are also flattened to JSON arrays (previously
    # they fell through unchanged and were not serialisable).
    if isinstance(obj, (list, tuple, set, frozenset)):
        return [self.to_echarts_safe(v, seen) for v in obj]
    # Fallback: return unknown objects unchanged.
    return obj
def _log_step(self, message, status="info"):
    """Append a timestamped entry to the analysis log and echo it to stdout."""
    stamp = datetime.now().strftime("%H:%M:%S")
    self.analysis_log.append({
        "timestamp": stamp,
        "message": message,
        "status": status,
    })
    print(f"[{stamp}] {message}")
def _log_preprocessing_step(self, step_name, description, status="completed"):
    """Record one preprocessing step for later display in the report."""
    self.preprocessing_steps.append({
        "name": step_name,
        "description": description,
        "status": status,
    })
def setup_fonts(self, language='zh'):
    """Configure fonts and return a default font name.

    Only the matplotlib side needs configuring; 'Helvetica' is returned
    as a placeholder since PDF generation was removed from this class.
    """
    try:
        self.setup_matplotlib_font(language)
    except Exception as e:
        print(f"字体设置失败: {e}")
    return 'Helvetica'
def setup_matplotlib_font(self, language='zh'):
    """Apply the project font configuration to matplotlib.

    Falls back to a list of widely available fonts when the project
    FontManager fails. Always returns True.
    """
    try:
        FontManager().setup_matplotlib_font(language)
    except Exception as e:
        print(f"Matplotlib字体设置警告: {e}")
        plt.rcParams['font.family'] = ['DejaVu Sans', 'Arial Unicode MS', 'Arial']
        plt.rcParams['axes.unicode_minus'] = False
    return True
def create_qwen_client(self):
    """Build the DashScope (Qwen) OpenAI-compatible client configuration.

    Returns a dict with api_key/base_url/model/client; on any failure a
    sentinel "simulation-mode" config is returned, which call_api()
    recognises and answers with canned text.
    """
    try:
        # SECURITY(review): a real-looking API key is hard-coded as the
        # env-var fallback below. It should be treated as leaked — rotate
        # the credential and remove the default from source control.
        api_key = os.environ.get("MY_API_KEY", "sk-f1ef83c90dcf4c839efae2a7e63dcb3d")
        base_url = os.environ.get("MY_API_BASE", "https://dashscope.aliyuncs.com/compatible-mode/v1")
        model_name = os.environ.get("MY_MODEL", "qwen-turbo")
        # OpenAI-compatible client pointed at the DashScope endpoint.
        self.openai_client = OpenAI(api_key=api_key, base_url=base_url)
        print("✓ 阿里云千问API客户端配置完成")
        return {
            'api_key': api_key,
            'base_url': base_url,
            'model': model_name,
            'client': self.openai_client,
        }
    except Exception as e:
        print(f"API客户端配置失败: {e}")
        # Simulation fallback configuration.
        return {
            'api_key': "simulation-mode",
            'base_url': "simulation",
            'model': "simulation-model",
            'client': None,
        }
def call_api(self, prompt, language='zh', max_retries=2):
    """Send *prompt* to the configured chat API.

    Retries up to *max_retries* times (with a 2s pause between failed
    attempts), stops early on authentication errors, and falls back to
    a canned simulation response when everything fails.
    """
    if self.client_config.get('api_key') == 'simulation-mode':
        return self._get_simulation_response(prompt, language)
    zh = language == 'zh'
    if zh:
        system_content = "你是一个专业的数据分析师,擅长分析时间序列数据和统计图表。请用中文回答,提供深入、专业的分析见解。"
    else:
        system_content = "You are a professional data analyst, skilled in analyzing time series data and statistical charts. Please answer in English, providing in-depth, professional analysis insights."
    for attempt in range(max_retries):
        try:
            self._log_step(f"调用API (尝试 {attempt + 1})..." if zh
                           else f"Calling API (attempt {attempt + 1})...")
            client = self.client_config.get('client')
            if not client:
                self._log_step("API客户端未初始化" if zh
                               else "API client not initialized", "error")
                break
            response = client.chat.completions.create(
                model=self.client_config.get('model', 'qwen-turbo'),
                messages=[
                    {"role": "system", "content": system_content},
                    {"role": "user", "content": prompt},
                ],
                max_tokens=2048,
                temperature=0.7,
                stream=False,
            )
            message = response.choices[0].message if response and response.choices else None
            content = message.content.strip() if message else ""
            if content:
                self._log_step(f"API调用成功返回内容长度: {len(content)}" if zh
                               else f"API call successful, content length: {len(content)}", "success")
                return content
            self._log_step("API响应内容为空" if zh
                           else "API response content empty", "warning")
        except Exception as e:
            error_msg = str(e)
            self._log_step(f"API调用失败 (尝试 {attempt + 1}): {error_msg}" if zh
                           else f"API call failed (attempt {attempt + 1}): {error_msg}", "warning")
            # Authentication failures will not succeed on retry.
            if "401" in error_msg or "403" in error_msg or "authentication" in error_msg.lower():
                break
            if attempt < max_retries - 1:
                time.sleep(2)
    self._log_step("所有API调用尝试都失败使用模拟分析" if zh
                   else "All API call attempts failed, using simulation analysis", "warning")
    return self._get_simulation_response(prompt, language)
def _get_simulation_response(self, prompt, language='zh'):
    """Return a canned analysis snippet matching a keyword found in *prompt*.

    The first keyword (in dict order) contained in the lower-cased prompt
    wins; otherwise a generic default is returned.
    """
    if language == 'zh':
        canned = {
            "statistical_overview": "统计概览分析显示数据具有良好的统计特性,各变量分布较为均匀。",
            "time_series_analysis": "时间序列分析揭示了数据的趋势性和周期性特征。",
            "correlation_analysis": "相关性分析表明变量间存在显著的线性关系。",
            "pca_analysis": "主成分分析成功降低了数据维度,保留了主要信息。",
            "feature_importance": "特征重要性分析识别了对预测目标最重要的变量。",
            "clustering_analysis": "聚类分析发现了数据中的自然分组结构。"
        }
        default_response = "基于数据的专业分析已完成,结果显示数据具有良好的统计特性和分析价值。"
    else:
        canned = {
            "statistical_overview": "Statistical overview analysis shows good statistical characteristics with uniform variable distributions.",
            "time_series_analysis": "Time series analysis reveals trend and periodic characteristics in the data.",
            "correlation_analysis": "Correlation analysis indicates significant linear relationships between variables.",
            "pca_analysis": "Principal component analysis successfully reduced data dimensionality while preserving key information.",
            "feature_importance": "Feature importance analysis identified the most important variables for prediction.",
            "clustering_analysis": "Clustering analysis discovered natural grouping structures in the data."
        }
        default_response = "Professional data analysis completed. The results show good statistical characteristics and analytical value."
    lowered = prompt.lower()
    return next((text for key, text in canned.items() if key in lowered), default_response)
def test_api_connection(self):
    """Smoke-test the Qwen API connection.

    Returns True in simulation mode or when the API echoes the expected
    test phrase; False (never raises) otherwise.
    """
    try:
        self._log_step("测试阿里云千问API连接...")
        # Simulation mode: nothing to test.
        if self.client_config.get('api_key') == 'simulation-mode':
            self._log_step("模拟模式跳过API连接测试", "info")
            return True
        reply = self.call_api("请回复'连接测试成功',不要添加其他内容。", 'zh')
        if reply and "连接测试成功" in reply:
            self._log_step("阿里云千问API连接测试成功", "success")
            return True
        self._log_step(f"API连接测试失败响应: {reply}", "warning")
        return False
    except Exception as e:
        self._log_step(f"API连接测试异常: {e}", "warning")
        return False
def _format_analysis_text(self, text):
    """Convert newlines to HTML <br/>; placeholder text when *text* is empty."""
    if not text:
        return "暂无分析内容"
    return text.replace("\n", "<br/>")
def _create_sample_data(self):
    """Generate a 100-row hourly demo DataFrame (temperature/humidity/pressure)."""
    self._log_step("Creating sample data...")
    # pandas 3.x requires lowercase frequency strings ('h', not 'H').
    index = pd.date_range(start='2023-01-01', periods=100, freq='h')
    frame = pd.DataFrame(
        {
            'temperature': np.random.normal(25, 5, 100),
            'humidity': np.random.normal(60, 10, 100),
            'pressure': np.random.normal(1013, 5, 100),
        },
        index=index,
    )
    frame.index.name = 'timestamp'
    self._log_step("Sample data created", "success")
    return frame
def query_api_for_data_description(self, data_summary, language='zh'):
    """Ask the LLM for a high-level description of *data_summary*.

    Returns the API response text, or a language-appropriate fallback
    message when the API yields nothing or raises.
    """
    unavailable = "数据描述分析暂不可用" if language == 'zh' else "Data description analysis is temporarily unavailable"
    try:
        if language == 'zh':
            prompt = f"""
作为专业数据分析师请分析以下数据集
{data_summary}
请提供
1. 数据的基本特征和结构描述
2. 潜在的数据质量问题识别
3. 适合的分析方法和技术建议
4. 预期的分析价值和业务意义
请用中文回答确保分析全面且专业
"""
        else:
            prompt = f"""
As a professional data analyst, please analyze the following dataset:
{data_summary}
Please provide in English:
1. Basic characteristics and structure description of the data
2. Identification of potential data quality issues
3. Recommended analysis methods and technical approaches
4. Expected analytical value and business significance
Ensure the analysis is comprehensive and professional.
"""
        response = self.call_api(prompt, language)
        # BUG FIX: the old `return response or A if language == 'zh' else B`
        # parsed as `(response or A) if zh else B` (the conditional
        # expression binds looser than `or`), so the English path always
        # returned the fallback and discarded the API response.
        return response or unavailable
    except Exception as e:
        self._log_step(f"数据描述查询失败: {e}" if language == 'zh'
                       else f"Data description query failed: {e}", "warning")
        return unavailable
def query_api_with_text(self, chart_description, data_summary, language='zh'):
    """Ask the LLM to interpret a chart description in the context of the data summary.

    Returns the API response text, or a language-appropriate fallback
    when the API yields nothing or raises.
    """
    try:
        if language == 'zh':
            prompt = f"""
数据摘要:
{data_summary}
图表分析:
{chart_description}
请基于以上信息提供专业的分析见解
"""
        else:
            prompt = f"""
Data Summary:
{data_summary}
Chart Analysis:
{chart_description}
Please provide professional analysis insights based on the above information.
"""
        response = self.call_api(prompt, language)
        # BUG FIX: `return response or A if language == 'zh' else B` parsed
        # as `(response or A) if zh else B`, so for non-Chinese languages
        # the API response was unconditionally discarded.
        if response:
            return response
        return "无法获取分析结果" if language == 'zh' else "Unable to get analysis results"
    except Exception as e:
        self._log_step(f"API query failed: {e}", "warning")
        return "分析结果暂不可用" if language == 'zh' else "Analysis results temporarily unavailable"
def load_and_preprocess_data(self):
    """Load the CSV at self.csv_path and prepare it for analysis.

    Steps: multi-encoding CSV read, missing-value fill (ffill then bfill),
    'timestamp' datetime index, and standardisation of numeric columns
    (stored in self.scaled_data; fitted scaler kept in self.scaler).
    Falls back to generated sample data whenever the file cannot be
    parsed or contains no numeric columns, so this always returns True.
    """
    try:
        self._log_step("Loading data from CSV file...")
        self._log_preprocessing_step("数据加载", "开始读取CSV文件")
        # Try several encodings; the first non-empty read wins.
        encodings = ['utf-8-sig', 'utf-8', 'gbk', 'latin-1']
        self.data = None
        for encoding in encodings:
            try:
                self.data = pd.read_csv(self.csv_path, encoding=encoding)
                if not self.data.empty:
                    self._log_preprocessing_step("数据加载", f"使用 {encoding} 编码成功读取CSV文件")
                    print(f"✓ 使用 {encoding} 编码成功读取CSV文件")
                    break
            except Exception as e:
                print(f"使用 {encoding} 编码读取失败: {e}")
                continue
        if self.data is None or self.data.empty:
            # Last resort: synthesise demo data so the pipeline can proceed.
            self._log_step("无法解析数据,创建示例数据...")
            self._log_preprocessing_step("数据加载", "无法解析原始数据,创建示例数据", "warning")
            self.data = self._create_sample_data()
            return True
        print(f"原始数据形状: {self.data.shape}")
        print(f"列名: {list(self.data.columns)}")
        # Fill missing values: forward fill, then backward fill for
        # any leading NaNs that ffill cannot reach.
        missing_values = self.data.isnull().sum().sum()
        if missing_values > 0:
            # BUG FIX: fillna(method='ffill'/'bfill') was removed in
            # pandas 2.x+ (this file targets pandas 3.x per its own
            # comments); use the dedicated ffill()/bfill() methods.
            self.data.ffill(inplace=True)
            self.data.bfill(inplace=True)
            self._log_preprocessing_step("缺失值处理",
                                         f"检测到 {missing_values} 个缺失值,使用前向填充和后向填充处理")
        else:
            self._log_preprocessing_step("缺失值处理", "未检测到缺失值")
        # Ensure a datetime index named 'timestamp'.
        if 'timestamp' not in self.data.columns:
            # pandas 3.x requires lowercase frequency strings.
            self.data['timestamp'] = pd.date_range(start='2023-01-01', periods=len(self.data), freq='s')
            self._log_preprocessing_step("时间索引", "添加默认时间戳列")
        else:
            try:
                self.data['timestamp'] = pd.to_datetime(self.data['timestamp'])
                self._log_preprocessing_step("时间索引", "成功转换时间戳列")
            except Exception:
                # BUG FIX: was a bare `except:` (would also swallow
                # KeyboardInterrupt/SystemExit).
                self.data['timestamp'] = pd.date_range(start='2023-01-01', periods=len(self.data), freq='s')
                self._log_preprocessing_step("时间索引", "时间戳转换失败,使用默认时间戳", "warning")
        self.data.set_index('timestamp', inplace=True)
        # Standardise numeric columns only; non-numeric columns pass through.
        numeric_columns = self.data.select_dtypes(include=[np.number]).columns
        print(f"数值列: {list(numeric_columns)}")
        if len(numeric_columns) == 0:
            self._log_step("没有找到数值列,创建示例数据", "warning")
            self._log_preprocessing_step("数据标准化", "没有找到数值列,创建示例数据", "warning")
            self.data = self._create_sample_data()
            numeric_columns = self.data.select_dtypes(include=[np.number]).columns
        self.scaler = StandardScaler()
        numeric_data = self.data[numeric_columns]
        scaled_numeric = self.scaler.fit_transform(numeric_data)
        # scaled_data mirrors data with numeric columns replaced by z-scores.
        self.scaled_data = self.data.copy()
        self.scaled_data[numeric_columns] = scaled_numeric
        self._log_preprocessing_step("数据标准化", f"{len(numeric_columns)} 个数值列进行标准化")
        self._log_step(f"✓ Data loaded: {self.data.shape[0]} rows, {self.data.shape[1]} columns", "success")
        return True
    except Exception as e:
        self._log_step(f"✗ Data loading error: {e}", "error")
        import traceback
        print(f"详细错误信息: {traceback.format_exc()}")
        # Fall back to sample data so downstream steps still run.
        self.data = self._create_sample_data()
        return True
def _generate_data_summary(self):
    """Build a short textual summary (Chinese labels) of the loaded dataset."""
    if not hasattr(self, 'data') or self.data is None:
        return "No data available"
    summary = f"""
数据集摘要:
- 记录数量: {len(self.data)}
- 变量数量: {len(self.data.columns)}
- 时间范围: {self.data.index.min()} 至 {self.data.index.max()}
- 变量列表: {', '.join(self.data.columns.tolist())}
"""
    # Append basic statistics for (at most) the first three numeric columns.
    if len(self.data.columns) > 0:
        numeric = self.data.select_dtypes(include=[np.number]).columns
        if len(numeric) > 0:
            summary += "\n数值变量统计:"
            for col in numeric[:3]:
                col_stats = self.data[col].describe()
                summary += (f"\n {col}: 均值={col_stats['mean']:.2f}, 标准差={col_stats['std']:.2f}, "
                            f"范围=[{col_stats['min']:.2f}, {col_stats['max']:.2f}]")
    return summary
def _create_analysis_prompt(self, analysis_type, title, chart_summary, data_summary, language='zh'):
    """Assemble the LLM prompt for one analysis step.

    Combines the step title/type, the step's chart/summary text, the
    dataset summary, and the user-provided background (source / method /
    purpose / domain read from self.data_background) into a single
    Chinese or English prompt template.
    """
    background = self.data_background
    if language == 'zh':
        # Chinese prompt template (runtime string — kept verbatim).
        prompt_template = f"""
你是一个专业的数据科学家请分析以下时间序列数据图表并提供深入见解
分析任务
{title} ({analysis_type})
图表信息
{chart_summary}
数据集背景
{data_summary}
用户提供的背景信息
- 数据来源: {background.get('source', '未提供')}
- 采集方法: {background.get('method', '未提供')}
- 数据用途: {background.get('purpose', '未提供')}
- 领域知识: {background.get('domain', '未提供')}
具体分析要求
请从以下角度提供专业分析
1. 图表的主要发现和关键洞察
2. 数据中表现出的模式趋势和异常
3. 各变量之间的关系和相互影响
4. 结合用户提供的背景信息分析数据的业务或研究意义
5. 进一步分析的潜在方向和价值
请用中文回答确保分析专业深入且具有可操作性
"""
    else:
        # English prompt template (runtime string — kept verbatim).
        prompt_template = f"""
You are a professional data scientist, please analyze the following time series data chart and provide in-depth insights.
Analysis Task
{title} ({analysis_type})
Chart Information
{chart_summary}
Dataset Background
{data_summary}
User-Provided Background Information
- Data Source: {background.get('source', 'Not provided')}
- Collection Method: {background.get('method', 'Not provided')}
- Data Purpose: {background.get('purpose', 'Not provided')}
- Domain Knowledge: {background.get('domain', 'Not provided')}
Specific Analysis Requirements
Please provide professional analysis from the following perspectives:
1. Main findings and key insights from the chart
2. Patterns, trends, and anomalies exhibited in the data
3. Relationships and interactions between variables
4. Business or research significance considering the background
5. Potential directions and value for further analysis
Please answer in English, ensuring the analysis is professional and actionable.
"""
    return prompt_template
def _create_fallback_analysis_for_step(self, step, language='zh'):
    """Return canned per-step analysis text for when the API is unavailable.

    Looks up step['title'] in a fixed table; unknown titles get a generic
    "analysis completed" message built from the localized title.
    """
    if language == 'zh':
        canned = {
            'Statistical Overview': "统计概览分析: 数据展示了基本的统计特征和分布情况",
            'Time Series Analysis': "时间序列分析: 数据展示了随时间变化的趋势和模式",
            'Correlation Analysis': "相关性分析: 揭示了变量之间的线性关系强度",
            'PCA Analysis': "主成分分析: 展示了数据在降维后的主要变化方向",
            'Feature Importance': "特征重要性分析: 识别了对目标变量预测最重要的特征",
            'Clustering Analysis': "聚类分析: 将数据点分组为具有相似特征的簇"
        }
        default_response = f"{step.get('zh_title', step['title'])}分析完成"
    else:
        canned = {
            'Statistical Overview': "Statistical Overview Analysis: Data shows good statistical characteristics",
            'Time Series Analysis': "Time Series Analysis: Clear temporal patterns identified",
            'Correlation Analysis': "Correlation Analysis: Linear relationships between variables detected",
            'PCA Analysis': "Principal Component Analysis: Dimensionality reduction successful",
            'Feature Importance': "Feature Importance Analysis: Key predictive features identified",
            'Clustering Analysis': "Clustering Analysis: Natural grouping structure discovered"
        }
        default_response = f"{step.get('en_title', step['title'])} analysis completed."
    return canned.get(step['title'], default_response)
def _create_fallback_chart(self, chart_key):
    """Render a simple placeholder PNG for *chart_key*.

    Returns the image path under self.temp_dir, or None when rendering
    fails (best-effort: a missing placeholder is not fatal).
    """
    try:
        plt.figure(figsize=(8, 6))
        plt.text(0.5, 0.5, f'{chart_key} Chart\n(Placeholder)',
                 ha='center', va='center', fontsize=14)
        plt.axis('off')
        img_path = os.path.join(self.temp_dir.name, f'{chart_key}_fallback.png')
        plt.savefig(img_path, dpi=150, bbox_inches='tight')
        return img_path
    except Exception:
        # BUG FIX: was a bare `except:` (also swallowed SystemExit etc.).
        return None
    finally:
        # BUG FIX: the original only closed the figure on success, leaking
        # an open matplotlib figure whenever savefig raised.
        plt.close()
# ====== Basic analysis methods ======
# The methods below are thin delegates to the implementations imported at
# the top of this file from app.services.analysis.modules.*; each passes
# `self` so the module functions can access the loaded data and loggers.
def generate_statistical_overview(self):
    """Delegate to modules.basic.generate_statistical_overview."""
    return _generate_statistical_overview(self)
def generate_time_series_plots(self):
    """Delegate to modules.time_series.generate_time_series_plots."""
    return _generate_time_series_plots(self)
def generate_correlation_heatmap(self):
    """Delegate to modules.multivariate.generate_correlation_heatmap."""
    return _generate_correlation_heatmap(self)
# ====== Time-series characteristics ======
def generate_acf_pacf_plots(self):
    """Delegate to modules.time_series.generate_acf_pacf_plots."""
    return _generate_acf_pacf_plots(self)
def perform_stationarity_tests(self):
    """Delegate to modules.stationarity.perform_stationarity_tests."""
    return _perform_stationarity_tests(self)
def perform_normality_tests(self):
    """Delegate to modules.basic.perform_normality_tests."""
    return _perform_normality_tests(self)
def perform_seasonal_decomposition(self):
    """Delegate to modules.time_series.perform_seasonal_decomposition."""
    return _perform_seasonal_decomposition(self)
def perform_spectral_analysis(self):
    """Delegate to modules.time_series.perform_spectral_analysis."""
    return _perform_spectral_analysis(self)
# ====== Multivariate statistical analysis ======
def generate_pca_scree_plot(self):
    """Delegate to modules.multivariate.generate_pca_scree_plot."""
    return _generate_pca_scree_plot(self)
def perform_pca_analysis(self):
    """Delegate to modules.multivariate.perform_pca_analysis."""
    return _perform_pca_analysis(self)
def analyze_feature_importance(self):
    """Delegate to modules.modeling.analyze_feature_importance."""
    return _analyze_feature_importance(self)
def perform_clustering_analysis(self):
    """Delegate to modules.multivariate.perform_clustering_analysis."""
    return _perform_clustering_analysis(self)
def perform_factor_analysis(self):
    """Delegate to modules.multivariate.perform_factor_analysis."""
    return _perform_factor_analysis(self)
def perform_cointegration_test(self):
    """Delegate to modules.stationarity.perform_cointegration_test."""
    return _perform_cointegration_test(self)
def perform_var_analysis(self):
    """Delegate to modules.modeling.perform_var_analysis."""
    return _perform_var_analysis(self)
# ====== Full analysis pipeline ======
def run_analysis(self):
    """Execute the full analysis pipeline.

    Returns a tuple ``(results, analysis_log)`` where ``results`` holds:
      - 'api_analysis': step title -> LLM (or fallback) narrative text
      - 'data_description': LLM description of the dataset
      - 'preprocessing_steps': entries recorded during preprocessing
      - 'steps': ordered per-step entries (summary/chart/errors)
      - 'charts': chart_key -> ECharts payload from _build_chart_payload
    ``results`` is None only if load_and_preprocess_data() reports failure
    (which, as written, it never does — it falls back to sample data).
    """
    self._log_step("Starting analysis pipeline...")
    # Charts mode: image generation is forcibly disabled.
    # NOTE(review): this overrides the generate_plots constructor flag.
    self.generate_plots = False
    # Probe the API; failures are non-fatal (fallback analysis is used).
    try:
        api_connected = self.test_api_connection()
        if not api_connected:
            self._log_step("API connection test failed, but will continue with local analysis", "warning")
    except Exception as e:
        self._log_step(f"API connection test exception: {e}, will continue with local analysis", "warning")
    if not self.load_and_preprocess_data():
        return None, self.analysis_log
    # Dataset description in the requested language.
    data_summary = self._generate_data_summary()
    data_description = self.query_api_for_data_description(data_summary, self.language)
    # Result accumulator.
    results = {
        'api_analysis': {},
        'data_description': data_description,
        'preprocessing_steps': self.preprocessing_steps,
        # 'steps' records execution order and per-step summaries
        'steps': [],
        'charts': {},
    }
    # Extended analysis step table ('key' keeps legacy naming,
    # 'chart_key' maps into results['charts']).
    analysis_steps = [
        {
            'key': 'stats_img',
            'chart_key': 'stats',
            'method': self.generate_statistical_overview,
            'title': 'Statistical Overview',
            'zh_title': '统计概览',
            'en_title': 'Statistical Overview'
        },
        {
            'key': 'ts_img',
            'chart_key': 'ts',
            'method': self.generate_time_series_plots,
            'title': 'Time Series Analysis',
            'zh_title': '时间序列分析',
            'en_title': 'Time Series Analysis'
        },
        {
            'key': 'acf_pacf_img',
            'chart_key': 'acf_pacf',
            'method': self.generate_acf_pacf_plots,
            'title': 'ACF PACF Analysis',
            'zh_title': '自相关和偏自相关分析',
            'en_title': 'Autocorrelation Analysis'
        },
        {
            'key': 'stationarity_img',
            'chart_key': 'stationarity',
            'method': self.perform_stationarity_tests,
            'title': 'Stationarity Tests',
            'zh_title': '平稳性检验',
            'en_title': 'Stationarity Tests'
        },
        {
            'key': 'normality_img',
            'chart_key': 'normality',
            'method': self.perform_normality_tests,
            'title': 'Normality Tests',
            'zh_title': '正态性检验',
            'en_title': 'Normality Tests'
        },
        {
            'key': 'seasonal_img',
            'chart_key': 'seasonal',
            'method': self.perform_seasonal_decomposition,
            'title': 'Seasonal Decomposition',
            'zh_title': '季节性分解',
            'en_title': 'Seasonal Decomposition'
        },
        {
            'key': 'spectral_img',
            'chart_key': 'spectral',
            'method': self.perform_spectral_analysis,
            'title': 'Spectral Analysis',
            'zh_title': '频谱分析',
            'en_title': 'Spectral Analysis'
        },
        {
            'key': 'heatmap_img',
            'chart_key': 'heatmap',
            'method': self.generate_correlation_heatmap,
            'title': 'Correlation Analysis',
            'zh_title': '相关性分析',
            'en_title': 'Correlation Analysis'
        },
        {
            'key': 'pca_scree_img',
            'chart_key': 'pca_scree',
            'method': self.generate_pca_scree_plot,
            'title': 'PCA Scree Plot',
            'zh_title': 'PCA碎石图',
            'en_title': 'PCA Scree Plot'
        },
        {
            'key': 'pca_img',
            'chart_key': 'pca_scatter',
            'method': self.perform_pca_analysis,
            'title': 'PCA Analysis',
            'zh_title': '主成分分析',
            'en_title': 'Principal Component Analysis'
        },
        {
            'key': 'fi_img',
            'chart_key': 'feature_importance',
            'method': self.analyze_feature_importance,
            'title': 'Feature Importance',
            'zh_title': '特征重要性分析',
            'en_title': 'Feature Importance Analysis'
        },
        {
            'key': 'cluster_img',
            'chart_key': 'cluster',
            'method': self.perform_clustering_analysis,
            'title': 'Clustering Analysis',
            'zh_title': '聚类分析',
            'en_title': 'Clustering Analysis'
        },
        {
            'key': 'factor_img',
            'chart_key': 'factor',
            'method': self.perform_factor_analysis,
            'title': 'Factor Analysis',
            'zh_title': '因子分析',
            'en_title': 'Factor Analysis'
        },
        {
            'key': 'cointegration_img',
            'chart_key': 'cointegration',
            'method': self.perform_cointegration_test,
            'title': 'Cointegration Test',
            'zh_title': '协整检验',
            'en_title': 'Cointegration Test'
        },
        {
            'key': 'var_img',
            'chart_key': 'var_forecast',
            'method': self.perform_var_analysis,
            'title': 'VAR Analysis',
            'zh_title': '向量自回归分析',
            'en_title': 'Vector Autoregression Analysis'
        }
    ]
    # Execute each analysis step.
    for step in analysis_steps:
        step_entry = {
            'key': step['key'],
            'title': step['title'],
            'zh_title': step.get('zh_title'),
            'en_title': step.get('en_title'),
        }
        try:
            self._log_step(f"Generating {step['title']}...")
            # Step methods return (image, summary) or (image, summary, data);
            # the image slot is unused because plot generation is disabled.
            step_result = step['method']()
            summary = ""
            step_data = None
            if isinstance(step_result, tuple):
                if len(step_result) == 3:
                    _img_unused, summary, step_data = step_result
                elif len(step_result) == 2:
                    _img_unused, summary = step_result
            step_entry['summary'] = summary
            # Convert step data into an ECharts payload.
            if step_data is not None:
                chart_payload = self._build_chart_payload(step['chart_key'], step_data)
                if chart_payload is not None:
                    results['charts'][step['chart_key']] = chart_payload
                    step_entry['chart'] = step['chart_key']
                    # NOTE(review): _build_chart_payload never sets a
                    # 'preview' key, so this is always None — confirm intended.
                    step_entry['data_preview'] = chart_payload.get('preview')
            # A step counts as successful if it produced a chart or a summary.
            success = step['chart_key'] in results['charts'] or bool(summary)
            if success:
                # Ask the LLM to interpret this step, using the
                # language-appropriate step title.
                self._log_step(f"Calling API for {step['title']} analysis...")
                title = step['zh_title'] if self.language == 'zh' else step['en_title']
                analysis_prompt = self._create_analysis_prompt(
                    step['title'], title, summary, data_summary, self.language
                )
                api_analysis = self.call_api(analysis_prompt, self.language)
                if api_analysis:
                    results['api_analysis'][step['title']] = api_analysis
                    step_entry['api_analysis'] = api_analysis
                    self._log_step(f"{step['title']} analysis completed with API", "success")
                else:
                    # Empty API response: use the canned fallback text.
                    fallback_analysis = self._create_fallback_analysis_for_step(step, self.language)
                    results['api_analysis'][step['title']] = fallback_analysis
                    step_entry['api_analysis'] = fallback_analysis
                    self._log_step(f"{step['title']} analysis completed (fallback)", "warning")
            else:
                self._log_step(f"{step['title']} data generation failed", "error")
                step_entry['error'] = 'data_generation_failed'
        except Exception as e:
            # Step failure is recorded but never aborts the pipeline.
            self._log_step(f"{step['title']} analysis failed: {e}", "error")
            step_entry['error'] = str(e)
            if step['title'] not in results['api_analysis']:
                results['api_analysis'][step['title']] = self._create_fallback_analysis_for_step(step,
                self.language)
        # Record the step entry regardless of outcome (frontend/debugging).
        results['steps'].append(step_entry)
    self._log_step("✓ Analysis pipeline completed", "success")
    return results, self.analysis_log
def _build_chart_payload(self, chart_key: str, data: Any) -> Dict[str, Any]:
    """Assemble an ECharts-friendly payload for one analysis step.

    *data* is whatever the step method returned (DataFrame or dict,
    depending on the step); the payload shape depends on *chart_key*.
    Returns None when the chart_key/data combination is unsupported or
    assembly raises (logged as a warning). All values are passed through
    to_echarts_safe() so the payload is JSON-serialisable.
    """
    try:
        # Descriptive-statistics table: header row followed by value rows.
        if chart_key == 'stats' and isinstance(data, pd.DataFrame):
            dataset = [data.columns.tolist()] + data.values.tolist()
            return {
                'type': 'table',
                'dataset': self.to_echarts_safe(dataset),
                'meta': {'rows': len(data), 'cols': len(data.columns)}
            }
        # Raw time series as a line chart dataset.
        if chart_key == 'ts' and isinstance(data, pd.DataFrame):
            dataset = [data.columns.tolist()] + data.values.tolist()
            return {
                'type': 'line',
                'dataset': self.to_echarts_safe(dataset),
                'meta': {'columns': data.columns.tolist()}
            }
        # ACF/PACF: per-column lists of {lag, value} points.
        if chart_key == 'acf_pacf' and isinstance(data, dict):
            series_payload = []
            for col, vals in data.items():
                acf_vals = vals.get('acf') or []
                pacf_vals = vals.get('pacf') or []
                acf_points = [{'lag': idx, 'value': v} for idx, v in enumerate(acf_vals)]
                pacf_points = [{'lag': idx, 'value': v} for idx, v in enumerate(pacf_vals)]
                series_payload.append({'name': col, 'acf': acf_points, 'pacf': pacf_points})
            return {
                'type': 'bar',
                'series': self.to_echarts_safe(series_payload),
                'meta': {'columns': list(data.keys())}
            }
        # Stationarity test results: one record per column.
        if chart_key == 'stationarity' and isinstance(data, dict):
            return {
                'type': 'table',
                'records': self.to_echarts_safe([
                    {'column': col, **vals}
                    for col, vals in data.items()
                ]),
            }
        # Normality test results: one record per column.
        if chart_key == 'normality' and isinstance(data, dict):
            return {
                'type': 'table',
                'records': self.to_echarts_safe([
                    {'column': col, **vals}
                    for col, vals in data.items()
                ]),
            }
        # Seasonal decomposition components as line series.
        if chart_key == 'seasonal' and isinstance(data, pd.DataFrame):
            dataset = [data.columns.tolist()] + data.values.tolist()
            return {
                'type': 'line',
                'dataset': self.to_echarts_safe(dataset),
            }
        # Spectral analysis: pass the dict through as-is (sanitised).
        if chart_key == 'spectral' and isinstance(data, dict):
            return {
                'type': 'spectral',
                'series': self.to_echarts_safe(data),
            }
        # Correlation heatmap: data is expected to be a correlation matrix.
        if chart_key == 'heatmap':
            if isinstance(data, pd.DataFrame):
                labels = data.columns.tolist()
                # Flatten to [row_idx, col_idx, value] triples for ECharts.
                flattened: List[List[Any]] = []
                for i, row_label in enumerate(labels):
                    for j, col_label in enumerate(labels):
                        flattened.append([i, j, data.iloc[i, j]])
                return {
                    'type': 'heatmap',
                    'data': self.to_echarts_safe(flattened),
                    'xLabels': labels,
                    'yLabels': labels,
                }
            # Non-DataFrame heatmap input falls through and returns None.
        if chart_key == 'pca_scree' and isinstance(data, pd.DataFrame):
            dataset = [data.columns.tolist()] + data.values.tolist()
            return {
                'type': 'bar',
                'dataset': self.to_echarts_safe(dataset),
            }
        if chart_key == 'pca_scatter' and isinstance(data, pd.DataFrame):
            return {
                'type': 'scatter',
                'records': self.to_echarts_safe(data.to_dict(orient='records')),
            }
        if chart_key == 'feature_importance' and isinstance(data, pd.DataFrame):
            return {
                'type': 'bar',
                'records': self.to_echarts_safe(data.to_dict(orient='records')),
            }
        if chart_key == 'cluster' and isinstance(data, pd.DataFrame):
            return {
                'type': 'scatter',
                'records': self.to_echarts_safe(data.to_dict(orient='records')),
            }
        if chart_key == 'factor' and isinstance(data, pd.DataFrame):
            return {
                'type': 'scatter',
                'records': self.to_echarts_safe(data.to_dict(orient='records')),
            }
        if chart_key == 'cointegration' and isinstance(data, dict):
            return {
                'type': 'table',
                'meta': self.to_echarts_safe(data),
            }
        # VAR forecast: prepend a 1-based 'step' column for frontend encode.
        if chart_key == 'var_forecast' and isinstance(data, pd.DataFrame):
            data_with_step = data.copy()
            data_with_step.insert(0, 'step', range(1, len(data_with_step) + 1))
            dataset = [data_with_step.columns.tolist()] + data_with_step.values.tolist()
            return {
                'type': 'line',
                'dataset': self.to_echarts_safe(dataset),
            }
        # Unknown chart_key or mismatched data type.
        return None
    except Exception as e:
        self._log_step(f"build chart payload failed for {chart_key}: {e}", "warning")
        return None
# ====== Report generation methods (removed) ======