181 lines
7.1 KiB
Python
181 lines
7.1 KiB
Python
import gc
|
||
import os
|
||
|
||
import numpy as np
|
||
import pandas as pd
|
||
import matplotlib.pyplot as plt
|
||
from scipy import stats
|
||
|
||
|
||
def generate_statistical_overview(self):
    """Generate a statistical overview of the numeric columns (memory-conscious).

    Computes ``describe()`` statistics for every numeric column of
    ``self.data``. When ``self.generate_plots`` is truthy, the first four
    numeric columns are additionally drawn as line plots against the data
    index and saved as ``stats_overview.png`` under ``self.temp_dir.name``.

    Returns:
        tuple: ``(img_path, summary, stats_df)``.

        * ``img_path`` -- path of the saved PNG, or ``None`` when plotting is
          disabled or anything went wrong.
        * ``summary`` -- human-readable status / error message.
        * ``stats_df`` -- transposed ``describe()`` table with the column
          name in a ``variable`` column, or ``None`` when no statistics
          could be computed.

    Exceptions raised while computing or plotting are caught, logged through
    ``self._log_step`` and reported in ``summary`` instead of propagating.
    """
    fig = None
    try:
        self._log_step("Generating statistical overview...")

        # Validate the input data.
        data = getattr(self, 'data', None)
        if data is None or len(data) == 0:
            self._log_step("No data available for statistical overview", "warning")
            return None, "No data available", None

        # Compute the statistics.
        numeric_cols = data.select_dtypes(include=[np.number]).columns
        if len(numeric_cols) == 0:
            # describe() raises on a frame without columns; report it explicitly.
            self._log_step("No numeric columns available for statistical overview", "warning")
            return None, "No numeric columns available", None

        described = data[numeric_cols].describe().T
        if described.index.nlevels == 1:
            # Name the index before resetting it: renaming a column called
            # 'index' afterwards silently does nothing when the original
            # column Index carries its own name.
            described = described.rename_axis('variable')
        stats_df = described.reset_index()
        summary = f"Generated statistical overview for {len(numeric_cols)} variables"

        if not self.generate_plots:
            self._log_step("Statistical overview generated (data only)", "success")
            return None, summary, stats_df

        # Small figure size and DPI keep the memory footprint low.
        fig, axes = plt.subplots(2, 2, figsize=(10, 8), dpi=100)
        fig.suptitle('Statistical Overview', fontsize=14)

        # Only the first four numeric variables are plotted, so the figure
        # shows the same kind of columns the statistics table describes.
        panels = list(axes.flat)
        plot_cols = list(numeric_cols[:len(panels)])

        for ax, col_name in zip(panels, plot_cols):
            try:
                # Line plot of the column against the data index.
                ax.plot(data.index, data[col_name], linewidth=1)
                ax.set_title(f'{col_name}')
                ax.tick_params(axis='x', rotation=45)
                ax.grid(True, alpha=0.3)
            except Exception as e:
                # One bad column must not abort the whole figure.
                self._log_step(f"Plotting {col_name} failed: {e}", "warning")
                ax.text(
                    0.5,
                    0.5,
                    f'Error: {str(e)[:30]}',
                    ha='center',
                    va='center',
                    transform=ax.transAxes,
                )

        # Hide panels that have no variable to show.
        for ax in panels[len(plot_cols):]:
            ax.set_visible(False)

        # Operate on ``fig`` directly rather than on pyplot's "current figure".
        fig.tight_layout()

        # Save the image (low DPI on purpose).
        img_path = os.path.join(self.temp_dir.name, 'stats_overview.png')
        try:
            fig.savefig(img_path, dpi=100, bbox_inches='tight', format='png')
        except Exception as save_error:
            self._log_step(f"Failed to save figure: {save_error}", "error")
            return None, f"Save error: {str(save_error)[:100]}", stats_df

        if not os.path.exists(img_path):
            self._log_step("Failed to save statistical overview image", "error")
            return None, "Failed to save image", stats_df

        self._log_step("Statistical overview generated", "success")
        return img_path, summary, stats_df

    except Exception as e:
        self._log_step(f"Statistical overview failed: {str(e)[:100]}", "error")
        return None, f"Statistical overview failed: {str(e)[:100]}", None

    finally:
        # Single cleanup point: close the figure exactly once on every path.
        if fig is not None:
            plt.close(fig)
            gc.collect()
|
||
|
||
|
||
def perform_normality_tests(self):
    """Run normality tests on the first three numeric columns of ``self.data``.

    For each tested column (NaN values dropped) the result holds:

    * ``'histogram'`` -- 20 equal-width bins as dicts with ``range_start``,
      ``range_end`` and ``count``.
    * ``'Shapiro-Wilk'`` -- only when the column has 3 to 5000 observations.
    * ``'Jarque-Bera'`` -- statistic, p-value and a ``normal`` flag
      (``p_value > 0.05``).

    Columns left without any observation after dropping NaN are skipped with
    a warning, so a single empty column cannot abort the other tests.

    When ``self.generate_plots`` is truthy and at least one column was
    tested, a histogram with a fitted normal curve plus a Q-Q plot is drawn
    per tested column and saved as ``normality_tests.png`` under
    ``self.temp_dir.name``.

    Returns:
        tuple: ``(img_path, summary, results)``. ``img_path`` is ``None``
        when no image was produced. ``results`` maps column name to the
        per-column dict described above, or is ``None`` when no data is
        available or an error occurred.

    Exceptions raised while testing or plotting are caught, logged through
    ``self._log_step`` and reported in ``summary`` instead of propagating.
    """
    fig = None
    try:
        self._log_step("Performing normality tests...")

        data = getattr(self, 'data', None)
        if data is None:
            self._log_step("No data available for normality tests", "warning")
            return None, "数据不足,无法进行正态性检验", None

        numeric_cols = data.select_dtypes(include=[np.number]).columns
        results = {}
        tested = []  # (column, cleaned series) pairs, reused for plotting

        for col in numeric_cols[:3]:  # only the first 3 variables are tested
            series = data[col].dropna()
            if len(series) == 0:
                # Nothing to test; depending on the SciPy version
                # jarque_bera would raise or return NaN here.
                self._log_step(f"Skipping {col}: no valid observations", "warning")
                continue

            col_results = {}

            # Histogram binning is done here so consumers get ready-made bins.
            hist_counts, bin_edges = np.histogram(series, bins=20)
            col_results['histogram'] = [
                {
                    'range_start': float(left),
                    'range_end': float(right),
                    'count': int(count),
                }
                for left, right, count in zip(bin_edges[:-1], bin_edges[1:], hist_counts)
            ]

            # Shapiro-Wilk test (restricted to sample sizes of 3..5000).
            if 3 <= len(series) <= 5000:
                shapiro_result = stats.shapiro(series)
                col_results['Shapiro-Wilk'] = {
                    'statistic': float(shapiro_result[0]),
                    'p_value': float(shapiro_result[1]),
                    'normal': bool(shapiro_result[1] > 0.05),
                }

            # Jarque-Bera test.
            jb_result = stats.jarque_bera(series)
            # SciPy result typing varies by version; keep runtime behavior and silence stub mismatch.
            jb_stat = float(jb_result[0])  # type: ignore[index,arg-type]
            jb_p = float(jb_result[1])  # type: ignore[index,arg-type]
            col_results['Jarque-Bera'] = {
                'statistic': jb_stat,
                'p_value': jb_p,
                'normal': bool(jb_p > 0.05),
            }

            results[col] = col_results
            tested.append((col, series))

        summary = f"正态性检验完成,测试了 {len(results)} 个变量"

        if not self.generate_plots:
            self._log_step("Normality tests completed (data only)", "success")
            return None, summary, results

        if not tested:
            # plt.subplots(0, 2) would raise; return the (empty) results
            # instead of turning them into a failure.
            self._log_step("Normality tests completed (nothing to plot)", "warning")
            return None, summary, results

        # Build the visualisation: one row per tested column.
        n_rows = len(tested)
        # squeeze=False always yields a 2-D axes array, even for one row.
        fig, axes = plt.subplots(n_rows, 2, figsize=(12, 4 * n_rows), squeeze=False)
        fig.suptitle('正态性检验结果', fontsize=16)

        for i, (col, series) in enumerate(tested):
            hist_ax, qq_ax = axes[i, 0], axes[i, 1]

            # Histogram with the fitted normal density on top.
            hist_ax.hist(series, bins=20, density=True, alpha=0.7, color='skyblue')
            xmin, xmax = hist_ax.get_xlim()
            x = np.linspace(xmin, xmax, 100)
            p = stats.norm.pdf(x, series.mean(), series.std())
            hist_ax.plot(x, p, 'k', linewidth=2)
            hist_ax.set_title(f'{col} - 分布直方图')

            # Q-Q plot against the normal distribution.
            stats.probplot(series, dist="norm", plot=qq_ax)
            qq_ax.set_title(f'{col} - Q-Q图')

        # Operate on ``fig`` directly rather than on pyplot's "current figure".
        fig.tight_layout()
        img_path = os.path.join(self.temp_dir.name, 'normality_tests.png')
        fig.savefig(img_path, dpi=150, bbox_inches='tight')

        self._log_step("Normality tests completed", "success")
        return img_path, summary, results

    except Exception as e:
        self._log_step(f"Normality tests failed: {e}", "error")
        return None, f"正态性检验失败: {e}", None

    finally:
        # Close the figure on every path so a failed plot cannot leak it.
        if fig is not None:
            plt.close(fig)
|