Json-Python-Server/app/services/analysis/modules/multivariate.py

302 lines
12 KiB
Python
Raw Normal View History

2026-01-29 18:18:32 +08:00
import gc
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
def generate_correlation_heatmap(self):
    """Generate a correlation heatmap over the numeric columns of self.data.

    Returns:
        tuple: (image_path or None, summary message, correlation matrix or None).
        The returned matrix has NaN replaced by None for JSON compatibility.
        When self.generate_plots is False, only the matrix is produced
        (image_path is None).
    """
    fig = None
    try:
        self._log_step("Generating correlation heatmap...")
        if not hasattr(self, 'data') or self.data is None:
            self._log_step("Not enough data for correlation analysis", "warning")
            return None, "Not enough data", None
        # Only numeric columns participate in the correlation; require at
        # least two of them. (The previous guard checked the TOTAL column
        # count, so a frame with many non-numeric columns but <2 numeric
        # ones slipped through and produced a degenerate matrix.)
        numeric_cols = self.data.select_dtypes(include=[np.number]).columns
        if len(numeric_cols) <= 1:
            self._log_step("Not enough data for correlation analysis", "warning")
            return None, "Not enough data", None
        corr_matrix = self.data[numeric_cols].corr()
        # Replace NaN with None once, for JSON-safe serialization
        # (was recomputed at every return site).
        json_safe_matrix = corr_matrix.where(pd.notnull(corr_matrix), None)
        summary = "Correlation matrix calculated"
        if not self.generate_plots:
            self._log_step("Correlation analysis completed (data only)", "success")
            return None, summary, json_safe_matrix
        # Draw the heatmap.
        fig = plt.figure(figsize=(8, 6), dpi=100)
        sns.heatmap(
            corr_matrix,
            annot=True,
            fmt=".2f",
            cmap='coolwarm',
            center=0,
            square=True,
            cbar_kws={"shrink": 0.8},
        )
        plt.title('Correlation Heatmap')
        plt.tight_layout()
        img_path = os.path.join(self.temp_dir.name, 'correlation_heatmap.png')
        try:
            plt.savefig(img_path, dpi=100, bbox_inches='tight', format='png')
        except Exception as save_err:
            self._log_step(f"Save error: {save_err}", "error")
            return None, f"Save error: {str(save_err)[:100]}", json_safe_matrix
        finally:
            # Always release the figure to keep matplotlib memory bounded.
            plt.close(fig)
            gc.collect()
        self._log_step("Correlation heatmap generated", "success")
        return img_path, summary, json_safe_matrix
    except Exception as e:
        self._log_step(f"Correlation heatmap failed: {str(e)[:100]}", "error")
        if fig is not None:
            try:
                plt.close(fig)
            except Exception:
                pass
        return None, f"Correlation heatmap failed: {str(e)[:100]}", None
def generate_pca_scree_plot(self):
    """Generate a PCA scree plot (per-component and cumulative explained variance).

    Returns:
        tuple: (image_path or None, summary message, scree DataFrame or None).
        The DataFrame has columns 'component', 'explained_variance',
        'cumulative_variance'. When self.generate_plots is False only the
        DataFrame is produced.
    """
    try:
        self._log_step("Generating PCA scree plot...")
        # Guard clause: standardized data is required for PCA.
        if not hasattr(self, 'scaled_data') or self.scaled_data is None:
            self._log_step("No scaled data available for PCA scree plot", "warning")
            return None, "没有标准化数据可用于PCA碎石图", None
        pca = PCA()
        pca.fit(self.scaled_data)
        explained_variance = pca.explained_variance_ratio_
        cumulative_variance = np.cumsum(explained_variance)
        scree_data = pd.DataFrame({
            'component': range(1, len(explained_variance) + 1),
            'explained_variance': explained_variance,
            'cumulative_variance': cumulative_variance,
        })
        # min(...) keeps the index in range when only one component exists.
        summary = (
            "PCA碎石图生成完成前2个主成分解释 "
            f"{cumulative_variance[min(1, len(cumulative_variance) - 1)]:.2%} 方差"
        )
        if not self.generate_plots:
            self._log_step("PCA scree data generated", "success")
            return None, summary, scree_data
        fig = plt.figure(figsize=(10, 6))
        try:
            # Left panel: per-component explained variance.
            plt.subplot(1, 2, 1)
            plt.plot(range(1, len(explained_variance) + 1), explained_variance, 'bo-')
            plt.title('PCA碎石图')
            plt.xlabel('主成分')
            plt.ylabel('解释方差比例')
            plt.grid(True, alpha=0.3)
            # Right panel: cumulative variance with an 85% reference line.
            plt.subplot(1, 2, 2)
            plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, 'ro-')
            plt.title('累积解释方差')
            plt.xlabel('主成分数量')
            plt.ylabel('累积方差比例')
            plt.axhline(y=0.85, color='g', linestyle='--', label='85% 方差')
            plt.legend()
            plt.grid(True, alpha=0.3)
            plt.tight_layout()
            img_path = os.path.join(self.temp_dir.name, 'pca_scree_plot.png')
            plt.savefig(img_path, dpi=150, bbox_inches='tight')
        finally:
            # Close even when plotting/saving raises, so the figure never
            # leaks (the previous version only closed on the success path).
            plt.close(fig)
        self._log_step("PCA scree plot generated", "success")
        return img_path, summary, scree_data
    except Exception as e:
        self._log_step(f"PCA scree plot failed: {e}", "error")
        return None, f"PCA碎石图生成失败: {e}", None
def perform_pca_analysis(self):
    """Project self.scaled_data onto its first two principal components.

    Returns:
        tuple: (image_path or None, summary message, DataFrame or None).
        The DataFrame has columns 'PC1', 'PC2', 'timestamp'. When
        self.generate_plots is False only the DataFrame is produced.
    """
    try:
        self._log_step("Performing PCA analysis...")
        # Guard clause: need standardized data with at least two columns.
        if (not hasattr(self, 'scaled_data') or self.scaled_data is None
                or len(self.scaled_data.columns) <= 1):
            self._log_step("Not enough data for PCA analysis", "warning")
            return None, "Not enough data for PCA analysis", None
        pca = PCA(n_components=2)
        principal_components = pca.fit_transform(self.scaled_data)
        summary = (
            "PCA analysis completed, explained variance: "
            f"{pca.explained_variance_ratio_[0]:.2%} + {pca.explained_variance_ratio_[1]:.2%}"
        )
        # Build the result frame once (was duplicated in both branches).
        pca_df = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2'])
        # NOTE(review): assumes self.data and self.scaled_data have the same
        # row count and order — confirm upstream preprocessing guarantees it.
        pca_df['timestamp'] = self.data.index.astype(str)
        if not self.generate_plots:
            self._log_step("PCA analysis completed (data only)", "success")
            return None, summary, pca_df
        fig = plt.figure(figsize=(8, 6))
        try:
            plt.scatter(principal_components[:, 0], principal_components[:, 1], alpha=0.7)
            plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%})')
            plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%})')
            plt.title('Principal Component Analysis (PCA)')
            plt.grid(True, alpha=0.3)
            plt.tight_layout()
            img_path = os.path.join(self.temp_dir.name, 'pca_analysis.png')
            plt.savefig(img_path, dpi=150, bbox_inches='tight')
        finally:
            # Close even on failure so the figure never leaks.
            plt.close(fig)
        self._log_step("PCA analysis completed", "success")
        return img_path, summary, pca_df
    except Exception as e:
        self._log_step(f"PCA analysis failed: {e}", "error")
        return None, f"PCA analysis failed: {e}", None
def perform_clustering_analysis(self, n_clusters=3):
    """Run K-means clustering on self.scaled_data.

    Args:
        n_clusters: Number of clusters to fit (default 3, matching the
            previous hard-coded value, so existing callers are unaffected).

    Returns:
        tuple: (image_path or None, summary message, DataFrame or None).
        The DataFrame has columns 'cluster' and 'timestamp'. When
        self.generate_plots is False only the DataFrame is produced.
    """
    try:
        self._log_step("Performing clustering analysis...")
        # Guard clause: need standardized data with at least two columns.
        if (not hasattr(self, 'scaled_data') or self.scaled_data is None
                or len(self.scaled_data.columns) <= 1):
            self._log_step("Not enough data for clustering analysis", "warning")
            return None, "Not enough data for clustering analysis", None
        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        clusters = kmeans.fit_predict(self.scaled_data)
        summary = f"Clustering analysis completed, found {len(np.unique(clusters))} clusters"
        # Build the result frame once (was duplicated in both branches).
        cluster_df = pd.DataFrame({'cluster': clusters})
        # NOTE(review): assumes self.data and self.scaled_data have the same
        # row count and order — confirm upstream preprocessing guarantees it.
        cluster_df['timestamp'] = self.data.index.astype(str)
        if not self.generate_plots:
            self._log_step("Clustering analysis completed (data only)", "success")
            return None, summary, cluster_df
        fig = plt.figure(figsize=(8, 6))
        try:
            if len(self.scaled_data.columns) >= 2:
                # 2D (or more) data: plot the first two original features.
                plt.scatter(
                    self.scaled_data.iloc[:, 0],
                    self.scaled_data.iloc[:, 1],
                    c=clusters,
                    cmap='viridis',
                    alpha=0.7,
                )
                plt.xlabel(self.scaled_data.columns[0])
                plt.ylabel(self.scaled_data.columns[1])
                plt.title('Clustering Analysis')
            else:
                # High-dimensional data: reduce with PCA before plotting.
                pca = PCA(n_components=2)
                reduced_data = pca.fit_transform(self.scaled_data)
                plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=clusters, cmap='viridis', alpha=0.7)
                plt.xlabel('PC1')
                plt.ylabel('PC2')
                plt.title('Clustering Analysis (PCA Reduced)')
            plt.colorbar(label='Cluster')
            plt.tight_layout()
            img_path = os.path.join(self.temp_dir.name, 'clustering_analysis.png')
            plt.savefig(img_path, dpi=150, bbox_inches='tight')
        finally:
            # Close even on failure so the figure never leaks.
            plt.close(fig)
        self._log_step("Clustering analysis completed", "success")
        return img_path, summary, cluster_df
    except Exception as e:
        self._log_step(f"Clustering analysis failed: {e}", "error")
        return None, f"Clustering analysis failed: {e}", None
def perform_factor_analysis(self):
    """Extract two latent factors from self.scaled_data via factor analysis.

    Returns:
        tuple: (image_path or None, summary message, DataFrame or None).
        The DataFrame has columns 'Factor1', 'Factor2', 'timestamp'. When
        self.generate_plots is False only the DataFrame is produced.
    """
    try:
        self._log_step("Performing factor analysis...")
        # Guard clause: need standardized data with at least two columns.
        if (not hasattr(self, 'scaled_data') or self.scaled_data is None
                or len(self.scaled_data.columns) <= 1):
            self._log_step("Not enough data for factor analysis", "warning")
            return None, "数据不足,无法进行因子分析", None
        from sklearn.decomposition import FactorAnalysis
        fa = FactorAnalysis(n_components=2, random_state=42)
        factors = fa.fit_transform(self.scaled_data)
        summary = "因子分析完成提取了2个主要因子"
        # Build the result frame once (was duplicated in both branches).
        factor_df = pd.DataFrame(data=factors, columns=['Factor1', 'Factor2'])
        # NOTE(review): assumes self.data and self.scaled_data have the same
        # row count and order — confirm upstream preprocessing guarantees it.
        factor_df['timestamp'] = self.data.index.astype(str)
        if not self.generate_plots:
            self._log_step("Factor analysis completed (data only)", "success")
            return None, summary, factor_df
        fig = plt.figure(figsize=(10, 8))
        try:
            plt.scatter(factors[:, 0], factors[:, 1], alpha=0.7)
            plt.xlabel('Factor 1')
            plt.ylabel('Factor 2')
            plt.title('Factor Analysis')
            plt.grid(True, alpha=0.3)
            # Label the first 10 sample points with their row index.
            # (Previous comment called these "factor loadings" — they are
            # transformed samples, not the loading matrix fa.components_.)
            for i, (x, y) in enumerate(factors[:10]):
                plt.annotate(str(i), (x, y), xytext=(5, 5), textcoords='offset points', fontsize=8)
            plt.tight_layout()
            img_path = os.path.join(self.temp_dir.name, 'factor_analysis.png')
            plt.savefig(img_path, dpi=150, bbox_inches='tight')
        finally:
            # Close even on failure so the figure never leaks.
            plt.close(fig)
        self._log_step("Factor analysis completed", "success")
        return img_path, summary, factor_df
    except Exception as e:
        self._log_step(f"Factor analysis failed: {e}", "error")
        return None, f"因子分析失败: {e}", None