Json-Python-Server/app/services/analysis/modules/basic.py

181 lines
7.1 KiB
Python
Raw Normal View History

2026-01-29 18:18:32 +08:00
import gc
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
def generate_statistical_overview(self):
    """Generate descriptive statistics for numeric columns, plus an optional plot.

    Reads ``self.data`` (a pandas DataFrame), ``self.generate_plots`` and, when
    plotting, ``self.temp_dir.name`` as the output directory. Progress and
    problems are reported through ``self._log_step``; this function does not
    raise.

    Returns:
        tuple: ``(img_path, summary, stats_df)`` where

        * ``img_path`` is the path of the saved PNG, or ``None`` when plotting
          is disabled or anything went wrong;
        * ``summary`` is a human-readable status message;
        * ``stats_df`` is the transposed ``describe()`` table with the column
          name in a ``variable`` column, or ``None`` when no statistics could
          be computed.
    """
    fig = None
    try:
        self._log_step("Generating statistical overview...")

        data = getattr(self, 'data', None)
        if data is None or len(data) == 0:
            self._log_step("No data available for statistical overview", "warning")
            return None, "No data available", None

        numeric_cols = data.select_dtypes(include=[np.number]).columns
        if len(numeric_cols) == 0:
            # describe() raises on a frame without columns; report the real
            # cause instead of a generic failure.
            self._log_step("No numeric columns available for statistical overview", "warning")
            return None, "No numeric columns available", None

        stats_df = (
            data[numeric_cols]
            .describe()
            .T.reset_index()
            .rename(columns={'index': 'variable'})
        )
        summary = f"Generated statistical overview for {len(numeric_cols)} variables"

        if not self.generate_plots:
            self._log_step("Statistical overview generated (data only)", "success")
            return None, summary, stats_df

        # Small figure size and DPI keep the memory footprint low.
        fig, axes = plt.subplots(2, 2, figsize=(10, 8), dpi=100)
        fig.suptitle('Statistical Overview', fontsize=14)

        # Only the first four numeric variables are plotted, matching the
        # columns the statistics above were computed for.
        plot_cols = list(numeric_cols[:4])
        for position, ax in enumerate(axes.flat):
            if position >= len(plot_cols):
                # Hide unused grid cells rather than showing empty frames.
                ax.set_visible(False)
                continue
            col_name = plot_cols[position]
            try:
                ax.plot(data.index, data[col_name], linewidth=1)
                ax.set_title(f'{col_name}')
                ax.tick_params(axis='x', rotation=45)
                ax.grid(True, alpha=0.3)
            except Exception as e:
                # One bad column must not prevent the others from rendering.
                self._log_step(f"Plotting {col_name} failed: {e}", "warning")
                ax.text(
                    0.5,
                    0.5,
                    f'Error: {str(e)[:30]}',
                    ha='center',
                    va='center',
                    transform=ax.transAxes,
                )

        # Operate on ``fig`` directly instead of pyplot's global current
        # figure, which may be a different figure if others are open.
        fig.tight_layout()

        img_path = os.path.join(self.temp_dir.name, 'stats_overview.png')
        try:
            fig.savefig(img_path, dpi=100, bbox_inches='tight', format='png')
        except Exception as save_error:
            self._log_step(f"Failed to save figure: {save_error}", "error")
            return None, f"Save error: {str(save_error)[:100]}", stats_df

        if not os.path.exists(img_path):
            self._log_step("Failed to save statistical overview image", "error")
            return None, "Failed to save image", stats_df

        self._log_step("Statistical overview generated", "success")
        return img_path, summary, stats_df
    except Exception as e:
        self._log_step(f"Statistical overview failed: {str(e)[:100]}", "error")
        return None, f"Statistical overview failed: {str(e)[:100]}", None
    finally:
        # Single cleanup point: runs on every exit path once a figure exists.
        if fig is not None:
            try:
                plt.close(fig)
            except Exception as close_error:
                self._log_step(f"Failed to close figure: {close_error}", "warning")
            gc.collect()
def perform_normality_tests(self):
    """Run normality tests on up to the first three numeric columns.

    Reads ``self.data`` (a pandas DataFrame), ``self.generate_plots`` and, when
    plotting, ``self.temp_dir.name`` as the output directory. Progress and
    problems are reported through ``self._log_step``; this function does not
    raise.

    For each tested column (NaN values dropped) the result holds:

    * ``'histogram'``: 20 bins as dicts with ``range_start``, ``range_end``
      and ``count``;
    * ``'Shapiro-Wilk'``: only when the column has 3 to 5000 observations;
    * ``'Jarque-Bera'``: only when the column has at least 2 observations.

    Each test entry carries ``statistic``, ``p_value`` and ``normal``
    (``True`` when ``p_value > 0.05``).

    Returns:
        tuple: ``(img_path, summary, results)``. ``img_path`` is the saved
        PNG path, or ``None`` when plotting is disabled, there is nothing to
        plot, or an error occurred. ``results`` maps column name to the
        per-column dict described above, or is ``None`` when there is no data
        or the run failed.
    """
    fig = None
    try:
        self._log_step("Performing normality tests...")

        data = getattr(self, 'data', None)
        if data is None:
            self._log_step("No data available for normality tests", "warning")
            return None, "数据不足,无法进行正态性检验", None

        numeric_cols = data.select_dtypes(include=[np.number]).columns
        tested_cols = numeric_cols[:3]  # only the first three variables are tested

        results = {}
        for col in tested_cols:
            series = data[col].dropna()
            n_obs = len(series)
            col_results = {}

            # Histogram binning is done here so consumers receive ready-made bins.
            hist_counts, bin_edges = np.histogram(series, bins=20)
            col_results['histogram'] = [
                {
                    'range_start': float(start),
                    'range_end': float(end),
                    'count': int(count),
                }
                for count, start, end in zip(hist_counts, bin_edges[:-1], bin_edges[1:])
            ]

            # Shapiro-Wilk needs at least 3 points; SciPy documents that its
            # p-value may be inaccurate above 5000 samples.
            if 3 <= n_obs <= 5000:
                shapiro_result = stats.shapiro(series)
                col_results['Shapiro-Wilk'] = {
                    'statistic': float(shapiro_result[0]),
                    'p_value': float(shapiro_result[1]),
                    'normal': bool(shapiro_result[1] > 0.05),
                }

            # Jarque-Bera on an empty sample raises on some SciPy versions,
            # and a single point has no defined variance, so skip both cases
            # rather than failing the whole run.
            if n_obs >= 2:
                jb_result = stats.jarque_bera(series)
                # SciPy result typing varies by version; keep runtime behavior and silence stub mismatch.
                jb_stat = float(jb_result[0])  # type: ignore[index,arg-type]
                jb_p = float(jb_result[1])  # type: ignore[index,arg-type]
                col_results['Jarque-Bera'] = {
                    'statistic': jb_stat,
                    'p_value': jb_p,
                    'normal': bool(jb_p > 0.05),
                }

            results[col] = col_results

        summary = f"正态性检验完成,测试了 {len(results)} 个变量"

        if not self.generate_plots:
            self._log_step("Normality tests completed (data only)", "success")
            return None, summary, results

        n_rows = len(tested_cols)
        if n_rows == 0:
            # plt.subplots(0, 2) would raise; there is nothing to draw, so
            # return the (empty) results instead of reporting a failure.
            self._log_step("No numeric columns to plot for normality tests", "warning")
            return None, summary, results

        # squeeze=False always yields a 2-D axes array, even for one row.
        fig, axes = plt.subplots(n_rows, 2, figsize=(12, 4 * n_rows), squeeze=False)
        fig.suptitle('正态性检验结果', fontsize=16)

        for row, col in enumerate(tested_cols):
            series = data[col].dropna()
            hist_ax, qq_ax = axes[row, 0], axes[row, 1]

            # Density histogram overlaid with a normal curve fitted to the data.
            hist_ax.hist(series, bins=20, density=True, alpha=0.7, color='skyblue')
            xmin, xmax = hist_ax.get_xlim()
            x = np.linspace(xmin, xmax, 100)
            p = stats.norm.pdf(x, series.mean(), series.std())
            hist_ax.plot(x, p, 'k', linewidth=2)
            hist_ax.set_title(f'{col} - 分布直方图')

            # Q-Q plot against the normal distribution.
            stats.probplot(series, dist="norm", plot=qq_ax)
            qq_ax.set_title(f'{col} - Q-Q图')

        # Operate on ``fig`` directly instead of pyplot's global current figure.
        fig.tight_layout()
        img_path = os.path.join(self.temp_dir.name, 'normality_tests.png')
        fig.savefig(img_path, dpi=150, bbox_inches='tight')

        self._log_step("Normality tests completed", "success")
        return img_path, summary, results
    except Exception as e:
        self._log_step(f"Normality tests failed: {e}", "error")
        return None, f"正态性检验失败: {e}", None
    finally:
        # Close the figure on every exit path so a plotting error cannot leak it.
        if fig is not None:
            try:
                plt.close(fig)
            except Exception as close_error:
                self._log_step(f"Failed to close figure: {close_error}", "warning")