import gc
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats


def _close_figure(fig):
    """Close ``fig`` (if one was created) and run a garbage-collection pass.

    Cleanup is best-effort: it runs on error paths too, so a failure to close
    the figure must never mask the error that is already being reported.
    """
    if fig is None:
        return
    try:
        plt.close(fig)
    except Exception:  # deliberate: cleanup must not raise
        pass
    gc.collect()


def _normality_statistics(series):
    """Return histogram bins and normality-test results for one series.

    The returned dict always has ``'histogram'`` (20 bins, each a dict with
    ``range_start`` / ``range_end`` / ``count``) and ``'Jarque-Bera'``.
    ``'Shapiro-Wilk'`` is included only when the series has between 3 and
    5000 observations. Each test entry holds ``statistic``, ``p_value`` and
    ``normal`` (``True`` when ``p_value > 0.05``).
    """
    col_results = {}

    # Binning is done here so consumers receive ready-made histogram buckets.
    hist_counts, bin_edges = np.histogram(series, bins=20)
    col_results['histogram'] = [
        {
            'range_start': float(start),
            'range_end': float(end),
            'count': int(count),
        }
        for count, start, end in zip(hist_counts, bin_edges[:-1], bin_edges[1:])
    ]

    # Shapiro-Wilk is only run inside this sample-size window.
    if 3 <= len(series) <= 5000:
        shapiro_result = stats.shapiro(series)
        col_results['Shapiro-Wilk'] = {
            'statistic': float(shapiro_result[0]),
            'p_value': float(shapiro_result[1]),
            'normal': bool(shapiro_result[1] > 0.05),
        }

    jb_result = stats.jarque_bera(series)
    # SciPy result typing varies by version; keep runtime behavior and
    # silence the stub mismatch.
    jb_stat = float(jb_result[0])  # type: ignore[index,arg-type]
    jb_p = float(jb_result[1])  # type: ignore[index,arg-type]
    col_results['Jarque-Bera'] = {
        'statistic': jb_stat,
        'p_value': jb_p,
        'normal': bool(jb_p > 0.05),
    }
    return col_results


def generate_statistical_overview(self):
    """Build descriptive statistics and, optionally, a 2x2 overview plot.

    Reads ``self.data``, ``self.generate_plots`` and (when plotting)
    ``self.temp_dir.name``; reports progress through ``self._log_step``.

    Returns:
        A ``(image_path, summary, stats_df)`` tuple.

        * No data: ``(None, "No data available", None)``.
        * ``generate_plots`` falsy: ``(None, summary, stats_df)``.
        * Plot saved: ``(path to stats_overview.png, summary, stats_df)``.
        * Saving failed: ``(None, error message, stats_df)``.
        * Any other error: ``(None, error message, None)``.

    The function does not raise; every exception is logged and converted
    into one of the return values above.
    """
    fig = None
    try:
        self._log_step("Generating statistical overview...")

        data = getattr(self, 'data', None)
        if data is None or len(data) == 0:
            self._log_step("No data available for statistical overview", "warning")
            return None, "No data available", None

        # Descriptive statistics cover the numeric columns only.
        numeric_cols = data.select_dtypes(include=[np.number]).columns
        stats_df = (
            data[numeric_cols]
            .describe()
            .T.reset_index()
            .rename(columns={'index': 'variable'})
        )
        summary = f"Generated statistical overview for {len(numeric_cols)} variables"

        if not self.generate_plots:
            self._log_step("Statistical overview generated (data only)", "success")
            return None, summary, stats_df

        # Small figure size and DPI keep the memory footprint down.
        fig, axes = plt.subplots(2, 2, figsize=(10, 8), dpi=100)
        fig.suptitle('Statistical Overview', fontsize=14)

        # Only the first four columns of the frame are drawn, one per panel
        # (row-major order). Note this is the first four columns of the
        # frame, not the first four numeric columns.
        panels = axes.ravel()
        panel_columns = data.columns[:4]
        for ax, col_name in zip(panels, panel_columns):
            try:
                ax.plot(data.index, data[col_name], linewidth=1)
                ax.set_title(f'{col_name}')
                ax.tick_params(axis='x', rotation=45)
                ax.grid(True, alpha=0.3)
            except Exception as e:
                # One bad column must not abort the whole overview; show the
                # error inside the panel instead.
                self._log_step(f"Plotting {col_name} failed: {e}", "warning")
                ax.text(
                    0.5,
                    0.5,
                    f'Error: {str(e)[:30]}',
                    ha='center',
                    va='center',
                    transform=ax.transAxes,
                )

        # Hide panels that have no column so the image has no empty frames.
        for ax in panels[len(panel_columns):]:
            ax.set_visible(False)

        # Operate on ``fig`` explicitly rather than pyplot's "current figure".
        fig.tight_layout()

        img_path = os.path.join(self.temp_dir.name, 'stats_overview.png')
        try:
            fig.savefig(img_path, dpi=100, bbox_inches='tight', format='png')
        except Exception as save_error:
            self._log_step(f"Failed to save figure: {save_error}", "error")
            return None, f"Save error: {str(save_error)[:100]}", stats_df

        if not os.path.exists(img_path):
            self._log_step("Failed to save statistical overview image", "error")
            return None, "Failed to save image", stats_df

        self._log_step("Statistical overview generated", "success")
        return img_path, summary, stats_df

    except Exception as e:
        self._log_step(f"Statistical overview failed: {str(e)[:100]}", "error")
        return None, f"Statistical overview failed: {str(e)[:100]}", None
    finally:
        # Single cleanup point: releases the figure on every exit path.
        _close_figure(fig)


def perform_normality_tests(self):
    """Run normality tests on up to the first three numeric columns.

    Reads ``self.data``, ``self.generate_plots`` and (when plotting)
    ``self.temp_dir.name``; reports progress through ``self._log_step``.
    Missing values are dropped per column, and a column with no remaining
    values is skipped with a warning.

    Returns:
        A ``(image_path, summary, results)`` tuple.

        * No data (missing, ``None`` or zero rows): ``(None, message, None)``.
        * ``generate_plots`` falsy, or nothing could be tested:
          ``(None, summary, results)``.
        * Plot saved: ``(path to normality_tests.png, summary, results)``.
        * Any error: ``(None, error message, None)``.

        ``results`` maps each tested column to a dict with ``'histogram'``,
        ``'Jarque-Bera'`` and, for 3 to 5000 observations, ``'Shapiro-Wilk'``.

    The function does not raise; every exception is logged and converted
    into the error return value.
    """
    fig = None
    try:
        self._log_step("Performing normality tests...")

        data = getattr(self, 'data', None)
        if data is None or len(data) == 0:
            self._log_step("No data available for normality tests", "warning")
            return None, "数据不足,无法进行正态性检验", None

        numeric_cols = data.select_dtypes(include=[np.number]).columns

        results = {}
        tested = []  # (column, cleaned series) pairs, reused for plotting
        for col in numeric_cols[:3]:  # only the first three variables
            series = data[col].dropna()
            if series.empty:
                # An all-missing column would otherwise fail the whole run.
                self._log_step(f"Skipping {col}: no non-missing values", "warning")
                continue
            results[col] = _normality_statistics(series)
            tested.append((col, series))

        summary = f"正态性检验完成,测试了 {len(results)} 个变量"

        if not self.generate_plots:
            self._log_step("Normality tests completed (data only)", "success")
            return None, summary, results

        if not tested:
            # plt.subplots(0, 2) raises, and there is nothing to draw anyway.
            self._log_step("No numeric variables to plot for normality tests", "warning")
            return None, summary, results

        # squeeze=False keeps ``axes`` two-dimensional even for a single row.
        fig, axes = plt.subplots(
            len(tested), 2, figsize=(12, 4 * len(tested)), squeeze=False
        )
        fig.suptitle('正态性检验结果', fontsize=16)

        for (col, series), (hist_ax, qq_ax) in zip(tested, axes):
            # Left panel: density histogram with a fitted normal curve.
            hist_ax.hist(series, bins=20, density=True, alpha=0.7, color='skyblue')
            xmin, xmax = hist_ax.get_xlim()
            x = np.linspace(xmin, xmax, 100)
            density = stats.norm.pdf(x, series.mean(), series.std())
            hist_ax.plot(x, density, 'k', linewidth=2)
            hist_ax.set_title(f'{col} - 分布直方图')

            # Right panel: Q-Q plot against the normal distribution.
            stats.probplot(series, dist="norm", plot=qq_ax)
            qq_ax.set_title(f'{col} - Q-Q图')

        # Operate on ``fig`` explicitly rather than pyplot's "current figure".
        fig.tight_layout()
        img_path = os.path.join(self.temp_dir.name, 'normality_tests.png')
        fig.savefig(img_path, dpi=150, bbox_inches='tight')

        self._log_step("Normality tests completed", "success")
        return img_path, summary, results

    except Exception as e:
        self._log_step(f"Normality tests failed: {e}", "error")
        return None, f"正态性检验失败: {e}", None
    finally:
        # Releases the figure on every exit path, including plotting errors.
        _close_figure(fig)