181 lines
7.1 KiB
Python
181 lines
7.1 KiB
Python
|
|
import gc
|
|||
|
|
import os
|
|||
|
|
|
|||
|
|
import numpy as np
|
|||
|
|
import pandas as pd
|
|||
|
|
import matplotlib.pyplot as plt
|
|||
|
|
from scipy import stats
|
|||
|
|
|
|||
|
|
|
|||
|
|
def generate_statistical_overview(self):
    """Generate the statistical overview (memory-conscious version).

    Builds a ``describe()`` table for the numeric columns of ``self.data``
    and, when ``self.generate_plots`` is truthy, saves a 2x2 grid of line
    plots (one panel per numeric column, at most four) as
    ``stats_overview.png`` inside ``self.temp_dir.name``.

    Returns:
        A 3-tuple ``(img_path, summary, stats_df)``:

        * ``img_path`` -- path of the saved PNG, or ``None`` when plotting
          is disabled, there is nothing to plot, or saving failed.
        * ``summary`` -- human-readable status message.
        * ``stats_df`` -- transposed ``describe()`` table with a
          ``variable`` column, or ``None`` when no statistics could be
          computed.

    This method does not raise: any unexpected error is logged through
    ``self._log_step`` and reported in ``summary``.
    """
    fig = None
    try:
        self._log_step("Generating statistical overview...")

        # Validate the input data.
        if not hasattr(self, 'data') or self.data is None or len(self.data) == 0:
            self._log_step("No data available for statistical overview", "warning")
            return None, "No data available", None

        # Compute the statistics on numeric columns only.
        numeric_cols = self.data.select_dtypes(include=[np.number]).columns
        if len(numeric_cols) == 0:
            # describe() raises on a frame without columns; report this
            # case explicitly instead of as a generic failure.
            self._log_step(
                "No numeric columns available for statistical overview", "warning"
            )
            return None, "No numeric columns available", None

        stats_df = (
            self.data[numeric_cols]
            .describe()
            .T.reset_index()
            .rename(columns={'index': 'variable'})
        )
        summary = f"Generated statistical overview for {len(numeric_cols)} variables"

        if not self.generate_plots:
            self._log_step("Statistical overview generated (data only)", "success")
            return None, summary, stats_df

        # Small figure size and DPI keep memory usage low.
        fig, axes = plt.subplots(2, 2, figsize=(10, 8), dpi=100)
        fig.suptitle('Statistical Overview', fontsize=14)

        # Only the first four numeric variables are drawn to save memory;
        # plotting the same columns that were summarised keeps the image
        # consistent with stats_df.
        panels = axes.ravel()
        plot_cols = numeric_cols[:4]

        for ax, col_name in zip(panels, plot_cols):
            try:
                # Line plot of the column against the frame's index.
                ax.plot(self.data.index, self.data[col_name], linewidth=1)
                ax.set_title(f'{col_name}')
                ax.tick_params(axis='x', rotation=45)
                ax.grid(True, alpha=0.3)
            except Exception as e:
                # Best effort: one broken panel must not abort the figure.
                self._log_step(f"Plotting {col_name} failed: {e}", "warning")
                ax.text(
                    0.5,
                    0.5,
                    f'Error: {str(e)[:30]}',
                    ha='center',
                    va='center',
                    transform=ax.transAxes,
                )

        # Hide panels that have no column assigned to them.
        for ax in panels[len(plot_cols):]:
            ax.set_visible(False)

        fig.tight_layout()

        # Save the image (low DPI on purpose).
        # NOTE(review): temp_dir is presumably a tempfile.TemporaryDirectory
        # (only its .name is used here) -- confirm against the class.
        img_path = os.path.join(self.temp_dir.name, 'stats_overview.png')
        try:
            fig.savefig(img_path, dpi=100, bbox_inches='tight', format='png')
        except Exception as save_error:
            self._log_step(f"Failed to save figure: {save_error}", "error")
            return None, f"Save error: {str(save_error)[:100]}", stats_df

        if not os.path.exists(img_path):
            self._log_step("Failed to save statistical overview image", "error")
            return None, "Failed to save image", stats_df

        self._log_step("Statistical overview generated", "success")
        return img_path, summary, stats_df

    except Exception as e:
        self._log_step(f"Statistical overview failed: {str(e)[:100]}", "error")
        return None, f"Statistical overview failed: {str(e)[:100]}", None

    finally:
        # Single cleanup point: always release the figure, whichever
        # return path was taken above.
        if fig is not None:
            try:
                plt.close(fig)
            except Exception as close_error:
                self._log_step(f"Failed to close figure: {close_error}", "warning")
            gc.collect()
|
|||
|
|
|
|||
|
|
|
|||
|
|
def perform_normality_tests(self):
    """Run normality tests on the first three numeric columns.

    For each tested column of ``self.data`` (NaNs dropped) the result
    holds a 20-bin histogram, a Shapiro-Wilk result when the sample size
    is between 3 and 5000 inclusive, and a Jarque-Bera result. A column is
    flagged ``normal`` when the corresponding p-value exceeds 0.05.

    When ``self.generate_plots`` is truthy and at least one column was
    tested, a figure with one row per column (histogram with a normal
    curve, plus a Q-Q plot) is saved as ``normality_tests.png`` inside
    ``self.temp_dir.name``.

    Returns:
        A 3-tuple ``(img_path, summary, results)``:

        * ``img_path`` -- path of the saved PNG, or ``None`` when no
          figure was produced.
        * ``summary`` -- human-readable status message.
        * ``results`` -- dict mapping column name to its test results, or
          ``None`` when there is no data or the run failed.

    This method does not raise: any unexpected error is logged through
    ``self._log_step`` and reported in ``summary``.
    """
    fig = None
    try:
        self._log_step("Performing normality tests...")

        # Same emptiness check as the statistical overview: a frame with
        # no rows has nothing to test.
        if not hasattr(self, 'data') or self.data is None or len(self.data) == 0:
            self._log_step("No data available for normality tests", "warning")
            return None, "数据不足,无法进行正态性检验", None

        # Only the first three numeric variables are tested.
        test_cols = self.data.select_dtypes(include=[np.number]).columns[:3]
        results = {}

        for col in test_cols:
            series = self.data[col].dropna()
            col_results = {}

            # Histogram binning is done here so consumers receive
            # ready-made bins.
            hist_counts, bin_edges = np.histogram(series, bins=20)
            col_results['histogram'] = [
                {
                    'range_start': float(start),
                    'range_end': float(end),
                    'count': int(count),
                }
                for start, end, count in zip(bin_edges[:-1], bin_edges[1:], hist_counts)
            ]

            # Shapiro-Wilk test, restricted to sample sizes of 3..5000.
            if 3 <= len(series) <= 5000:
                shapiro_result = stats.shapiro(series)
                col_results['Shapiro-Wilk'] = {
                    'statistic': float(shapiro_result[0]),
                    'p_value': float(shapiro_result[1]),
                    'normal': bool(shapiro_result[1] > 0.05),
                }

            # Jarque-Bera test.
            jb_result = stats.jarque_bera(series)
            # SciPy result typing varies by version; keep runtime behavior and silence stub mismatch.
            jb_stat = float(jb_result[0])  # type: ignore[index,arg-type]
            jb_p = float(jb_result[1])  # type: ignore[index,arg-type]
            col_results['Jarque-Bera'] = {
                'statistic': jb_stat,
                'p_value': jb_p,
                'normal': bool(jb_p > 0.05),
            }

            results[col] = col_results

        summary = f"正态性检验完成,测试了 {len(results)} 个变量"

        # Without tested columns there is nothing to draw, and
        # plt.subplots(0, 2) would raise; return the data-only result.
        if not self.generate_plots or not results:
            self._log_step("Normality tests completed (data only)", "success")
            return None, summary, results

        # Build the visualisation: one row per tested column.
        n_rows = len(test_cols)
        # squeeze=False keeps `axes` two-dimensional even for one row.
        fig, axes = plt.subplots(n_rows, 2, figsize=(12, 4 * n_rows), squeeze=False)
        fig.suptitle('正态性检验结果', fontsize=16)

        for (hist_ax, qq_ax), col in zip(axes, test_cols):
            series = self.data[col].dropna()

            # Histogram overlaid with a normal curve using the sample
            # mean and standard deviation.
            hist_ax.hist(series, bins=20, density=True, alpha=0.7, color='skyblue')
            xmin, xmax = hist_ax.get_xlim()
            x = np.linspace(xmin, xmax, 100)
            p = stats.norm.pdf(x, series.mean(), series.std())
            hist_ax.plot(x, p, 'k', linewidth=2)
            hist_ax.set_title(f'{col} - 分布直方图')

            # Q-Q plot against the normal distribution.
            stats.probplot(series, dist="norm", plot=qq_ax)
            qq_ax.set_title(f'{col} - Q-Q图')

        fig.tight_layout()
        # NOTE(review): temp_dir is presumably a tempfile.TemporaryDirectory
        # (only its .name is used here) -- confirm against the class.
        img_path = os.path.join(self.temp_dir.name, 'normality_tests.png')
        fig.savefig(img_path, dpi=150, bbox_inches='tight')

        self._log_step("Normality tests completed", "success")
        return img_path, summary, results

    except Exception as e:
        self._log_step(f"Normality tests failed: {e}", "error")
        return None, f"正态性检验失败: {e}", None

    finally:
        # Always release the figure, including when plotting or saving
        # raised, so a failed run does not leak it.
        if fig is not None:
            try:
                plt.close(fig)
            except Exception as close_error:
                self._log_step(f"Failed to close figure: {close_error}", "warning")
            gc.collect()
|