# Json-Python-Server/app/services/analysis/modules/basic.py

import gc
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats


def generate_statistical_overview(self):
    """Build descriptive statistics and, optionally, a 2x2 overview figure.

    Reads ``self.data``, ``self.generate_plots`` and ``self.temp_dir.name``
    and reports progress through ``self._log_step``.

    Returns:
        A ``(img_path, summary, stats_df)`` tuple.

        * ``img_path``: path of the saved ``stats_overview.png``, or ``None``
          when plotting is disabled or the image could not be produced.
        * ``summary``: status or error message.
        * ``stats_df``: transposed ``describe()`` output for the numeric
          columns (its ``index`` column renamed to ``variable``), or ``None``
          when no statistics could be computed.

    Unexpected errors are caught, logged and reported through ``summary``
    rather than propagated.
    """
    try:
        self._log_step("Generating statistical overview...")

        data = getattr(self, 'data', None)
        if data is None or len(data) == 0:
            self._log_step("No data available for statistical overview", "warning")
            return None, "No data available", None

        numeric_cols = data.select_dtypes(include=[np.number]).columns
        if len(numeric_cols) == 0:
            # describe() raises ValueError on a frame without columns; report
            # the actual cause instead of a generic failure message.
            message = "No numeric columns available for statistical overview"
            self._log_step(message, "warning")
            return None, message, None

        stats_df = (
            data[numeric_cols]
            .describe()
            .T.reset_index()
            .rename(columns={'index': 'variable'})
        )
        summary = f"Generated statistical overview for {len(numeric_cols)} variables"

        if not self.generate_plots:
            self._log_step("Statistical overview generated (data only)", "success")
            return None, summary, stats_df

        # A small figure size and DPI keep memory usage down.
        fig, axes = plt.subplots(2, 2, figsize=(10, 8), dpi=100)
        try:
            fig.suptitle('Statistical Overview', fontsize=14)

            # Only the first columns (one per panel) are drawn to save memory.
            panels = list(axes.flat)
            plot_cols = list(data.columns[:len(panels)])

            for ax, col_name in zip(panels, plot_cols):
                try:
                    # Line plot of the column against the frame's index.
                    ax.plot(data.index, data[col_name], linewidth=1)
                    ax.set_title(f'{col_name}')
                    ax.tick_params(axis='x', rotation=45)
                    ax.grid(True, alpha=0.3)
                except Exception as plot_error:
                    # One unplottable column must not abort the whole figure.
                    self._log_step(f"Plotting {col_name} failed: {plot_error}", "warning")
                    ax.text(
                        0.5,
                        0.5,
                        f'Error: {str(plot_error)[:30]}',
                        ha='center',
                        va='center',
                        transform=ax.transAxes,
                    )

            # Hide panels that have no column to show.
            for ax in panels[len(plot_cols):]:
                ax.set_visible(False)

            # Operate on ``fig`` explicitly instead of pyplot's "current
            # figure", which may be a different figure.
            fig.tight_layout()

            img_path = os.path.join(self.temp_dir.name, 'stats_overview.png')
            try:
                fig.savefig(img_path, dpi=100, bbox_inches='tight', format='png')
            except Exception as save_error:
                self._log_step(f"Failed to save figure: {save_error}", "error")
                return None, f"Save error: {str(save_error)[:100]}", stats_df

            if not os.path.exists(img_path):
                self._log_step("Failed to save statistical overview image", "error")
                return None, "Failed to save image", stats_df
        finally:
            # Always release the figure, whichever path leaves this block.
            plt.close(fig)
            gc.collect()

        self._log_step("Statistical overview generated", "success")
        return img_path, summary, stats_df

    except Exception as e:
        self._log_step(f"Statistical overview failed: {str(e)[:100]}", "error")
        return None, f"Statistical overview failed: {str(e)[:100]}", None


def perform_normality_tests(self):
    """Run normality diagnostics on the first three numeric columns.

    Reads ``self.data``, ``self.generate_plots`` and ``self.temp_dir.name``
    and reports progress through ``self._log_step``.

    For each analysed column (NaN values dropped) the result dict holds:

    * ``'histogram'``: 20 bins, each with ``range_start``, ``range_end``
      and ``count``.
    * ``'Shapiro-Wilk'``: only when the sample has between 3 and 5000
      observations.
    * ``'Jarque-Bera'``: only when the sample is not empty.

    Each test entry contains ``statistic``, ``p_value`` and ``normal``
    (``True`` when ``p_value > 0.05``).

    Returns:
        A ``(img_path, summary, results)`` tuple. ``img_path`` is the saved
        ``normality_tests.png`` or ``None`` when plotting is disabled, there
        is nothing to plot, or the run failed. ``results`` is ``None`` when
        ``self.data`` is missing or an unexpected error occurred.

    Unexpected errors are caught, logged and reported through ``summary``
    rather than propagated.
    """
    max_vars = 3          # only the first few numeric columns are analysed
    n_bins = 20           # histogram resolution (binning is done server-side)
    significance = 0.05   # p-value threshold for the ``normal`` flag

    try:
        self._log_step("Performing normality tests...")

        data = getattr(self, 'data', None)
        if data is None:
            self._log_step("No data available for normality tests", "warning")
            return None, "数据不足，无法进行正态性检验", None

        numeric_cols = data.select_dtypes(include=[np.number]).columns
        test_cols = list(numeric_cols[:max_vars])
        results = {}

        for col in test_cols:
            series = data[col].dropna()
            n_obs = len(series)

            counts, edges = np.histogram(series, bins=n_bins)
            col_results = {
                'histogram': [
                    {
                        'range_start': float(start),
                        'range_end': float(end),
                        'count': int(count),
                    }
                    for start, end, count in zip(edges[:-1], edges[1:], counts)
                ],
            }

            # Shapiro-Wilk is only run within this sample-size window.
            if 3 <= n_obs <= 5000:
                shapiro_result = stats.shapiro(series)
                shapiro_p = float(shapiro_result[1])
                col_results['Shapiro-Wilk'] = {
                    'statistic': float(shapiro_result[0]),
                    'p_value': shapiro_p,
                    'normal': bool(shapiro_p > significance),
                }

            # Jarque-Bera: skip empty samples (e.g. an all-NaN column). Some
            # SciPy releases raise on empty input, which would otherwise
            # abort the analysis of every column.
            if n_obs > 0:
                jb_result = stats.jarque_bera(series)
                # SciPy result typing varies by version; index access works
                # at runtime, the ignores only silence stub mismatches.
                jb_stat = float(jb_result[0])  # type: ignore[index,arg-type]
                jb_p = float(jb_result[1])  # type: ignore[index,arg-type]
                col_results['Jarque-Bera'] = {
                    'statistic': jb_stat,
                    'p_value': jb_p,
                    'normal': bool(jb_p > significance),
                }

            results[col] = col_results

        summary = f"正态性检验完成，测试了 {len(results)} 个变量"

        # Without numeric columns there is nothing to draw, and
        # plt.subplots(0, 2) would raise and discard the results.
        if not self.generate_plots or not test_cols:
            self._log_step("Normality tests completed (data only)", "success")
            return None, summary, results

        # One row per column: histogram with normal curve on the left,
        # Q-Q plot on the right. squeeze=False keeps ``axes`` 2-D even for
        # a single row.
        fig, axes = plt.subplots(
            len(test_cols),
            2,
            figsize=(12, 4 * len(test_cols)),
            squeeze=False,
        )
        try:
            fig.suptitle('正态性检验结果', fontsize=16)

            for (hist_ax, qq_ax), col in zip(axes, test_cols):
                series = data[col].dropna()

                # Density histogram overlaid with the fitted normal PDF.
                hist_ax.hist(series, bins=n_bins, density=True, alpha=0.7, color='skyblue')
                xmin, xmax = hist_ax.get_xlim()
                x = np.linspace(xmin, xmax, 100)
                pdf = stats.norm.pdf(x, series.mean(), series.std())
                hist_ax.plot(x, pdf, 'k', linewidth=2)
                hist_ax.set_title(f'{col} - 分布直方图')

                # Q-Q plot against the normal distribution.
                stats.probplot(series, dist="norm", plot=qq_ax)
                qq_ax.set_title(f'{col} - Q-Q图')

            # Operate on ``fig`` explicitly instead of pyplot's "current
            # figure", which may be a different figure.
            fig.tight_layout()
            img_path = os.path.join(self.temp_dir.name, 'normality_tests.png')
            fig.savefig(img_path, dpi=150, bbox_inches='tight')
        finally:
            # Release the figure even when drawing or saving fails.
            plt.close(fig)

        self._log_step("Normality tests completed", "success")
        return img_path, summary, results

    except Exception as e:
        self._log_step(f"Normality tests failed: {e}", "error")
        return None, f"正态性检验失败: {e}", None