Json-Python-Server/app/services/analysis/modules/basic.py
2026-01-29 18:18:32 +08:00

181 lines
7.1 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import gc
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
def generate_statistical_overview(self):
    """Build a descriptive-statistics table and, optionally, an overview plot.

    The numeric columns of ``self.data`` are summarised with
    ``DataFrame.describe`` (one row per variable, column name stored in the
    ``variable`` column). When ``self.generate_plots`` is truthy, up to the
    first four numeric columns are also drawn as line plots against the
    frame's index on a 2x2 grid and written to ``stats_overview.png`` inside
    the directory named by ``self.temp_dir.name``.

    Returns:
        tuple: ``(img_path, summary, stats_df)``.

        * ``img_path`` is the saved PNG path, or ``None`` when plotting is
          disabled, no data is available, or saving failed.
        * ``summary`` is a human-readable status or error message.
        * ``stats_df`` is the statistics table, or ``None`` when there is no
          data to describe or an unexpected error occurred.

    The function never raises: unexpected errors are logged through
    ``self._log_step`` and reported in ``summary``.
    """
    fig = None
    try:
        self._log_step("Generating statistical overview...")

        # Guard: nothing to describe.
        data = getattr(self, 'data', None)
        if data is None or len(data) == 0:
            self._log_step("No data available for statistical overview", "warning")
            return None, "No data available", None

        numeric_cols = data.select_dtypes(include=[np.number]).columns
        # describe() raises on a frame without columns, so report the
        # situation explicitly instead of surfacing a generic failure.
        if len(numeric_cols) == 0:
            self._log_step("No numeric columns available for statistical overview", "warning")
            return None, "No numeric data available", None

        stats_df = (
            data[numeric_cols]
            .describe()
            .T.reset_index()
            .rename(columns={'index': 'variable'})
        )
        summary = f"Generated statistical overview for {len(numeric_cols)} variables"

        if not self.generate_plots:
            self._log_step("Statistical overview generated (data only)", "success")
            return None, summary, stats_df

        # Small figure size and DPI keep memory usage low.
        fig, axes = plt.subplots(2, 2, figsize=(10, 8), dpi=100)
        fig.suptitle('Statistical Overview', fontsize=14)

        # Plot only the first four *numeric* variables so the figure matches
        # the statistics table; zip stops at whichever runs out first.
        for ax, col_name in zip(axes.flat, numeric_cols[:4]):
            try:
                ax.plot(data.index, data[col_name], linewidth=1)
                ax.set_title(f'{col_name}')
                ax.tick_params(axis='x', rotation=45)
                ax.grid(True, alpha=0.3)
            except Exception as e:
                # One bad column must not abort the whole figure; show the
                # error inside that panel instead.
                self._log_step(f"Plotting {col_name} failed: {e}", "warning")
                ax.text(
                    0.5,
                    0.5,
                    f'Error: {str(e)[:30]}',
                    ha='center',
                    va='center',
                    transform=ax.transAxes,
                )

        # Operate on this figure explicitly rather than on pyplot's global
        # "current figure", which may be a different one.
        fig.tight_layout()

        img_path = os.path.join(self.temp_dir.name, 'stats_overview.png')
        try:
            fig.savefig(img_path, dpi=100, bbox_inches='tight', format='png')
            if not os.path.exists(img_path):
                self._log_step("Failed to save statistical overview image", "error")
                return None, "Failed to save image", stats_df
        except Exception as save_error:
            self._log_step(f"Failed to save figure: {save_error}", "error")
            return None, f"Save error: {str(save_error)[:100]}", stats_df
        finally:
            # Always release the figure's memory, whether or not saving worked.
            plt.close(fig)
            gc.collect()

        self._log_step("Statistical overview generated", "success")
        return img_path, summary, stats_df
    except Exception as e:
        self._log_step(f"Statistical overview failed: {str(e)[:100]}", "error")
        if fig is not None:
            # Best-effort cleanup; a failure here must not mask the original error.
            try:
                plt.close(fig)
                gc.collect()
            except Exception:
                pass
        return None, f"Statistical overview failed: {str(e)[:100]}", None
def perform_normality_tests(self):
    """Run normality diagnostics on up to the first three numeric columns.

    For each tested column (after dropping NaN values) the result holds:

    * ``'histogram'``: 20 equal-width bins, each a dict with
      ``range_start``, ``range_end`` and ``count``;
    * ``'Shapiro-Wilk'``: statistic, p-value and a ``normal`` flag
      (p > 0.05), only when the sample size is between 3 and 5000;
    * ``'Jarque-Bera'``: statistic, p-value and a ``normal`` flag (p > 0.05).

    Columns with no non-NaN observations are skipped with a warning so a
    single empty column does not abort the whole analysis.

    When ``self.generate_plots`` is truthy, a figure with one row per tested
    column (histogram with fitted normal curve, and a Q-Q plot) is saved as
    ``normality_tests.png`` inside the directory named by
    ``self.temp_dir.name``.

    Returns:
        tuple: ``(img_path, summary, results)``. ``img_path`` is ``None``
        when plotting is disabled or there is nothing to plot; ``results``
        maps column name to the dict described above and is ``None`` when
        ``self.data`` is missing or an error occurred.

    The function never raises: errors are logged through ``self._log_step``
    and reported in ``summary``.
    """
    try:
        self._log_step("Performing normality tests...")

        data = getattr(self, 'data', None)
        if data is None:
            self._log_step("No data available for normality tests", "warning")
            return None, "数据不足,无法进行正态性检验", None

        numeric_cols = data.select_dtypes(include=[np.number]).columns

        results = {}
        tested = []  # (column, cleaned series) pairs, reused for plotting
        for col in numeric_cols[:3]:  # only the first three variables
            series = data[col].dropna()
            if series.empty:
                # The tests are undefined on an empty sample; skip the
                # column instead of failing the whole run.
                self._log_step(
                    f"Skipping normality tests for {col}: no valid observations",
                    "warning",
                )
                continue

            col_results = {}

            # Histogram binning is done here so consumers receive ready bins.
            hist_counts, bin_edges = np.histogram(series, bins=20)
            col_results['histogram'] = [
                {
                    'range_start': float(start),
                    'range_end': float(end),
                    'count': int(count),
                }
                for start, end, count in zip(bin_edges[:-1], bin_edges[1:], hist_counts)
            ]

            # Shapiro-Wilk is only run for sample sizes between 3 and 5000.
            if 3 <= len(series) <= 5000:
                shapiro_result = stats.shapiro(series)
                col_results['Shapiro-Wilk'] = {
                    'statistic': float(shapiro_result[0]),
                    'p_value': float(shapiro_result[1]),
                    'normal': bool(shapiro_result[1] > 0.05),
                }

            # Jarque-Bera test.
            jb_result = stats.jarque_bera(series)
            # SciPy result typing varies by version; keep runtime behavior and silence stub mismatch.
            jb_stat = float(jb_result[0])  # type: ignore[index,arg-type]
            jb_p = float(jb_result[1])  # type: ignore[index,arg-type]
            col_results['Jarque-Bera'] = {
                'statistic': jb_stat,
                'p_value': jb_p,
                'normal': bool(jb_p > 0.05),
            }

            results[col] = col_results
            tested.append((col, series))

        summary = f"正态性检验完成,测试了 {len(results)} 个变量"

        if not self.generate_plots:
            self._log_step("Normality tests completed (data only)", "success")
            return None, summary, results

        if not tested:
            # plt.subplots cannot build a grid with zero rows.
            self._log_step("Normality tests completed (no numeric data to plot)", "warning")
            return None, summary, results

        n_rows = len(tested)
        # squeeze=False keeps `axes` two-dimensional even for a single row.
        fig, axes = plt.subplots(n_rows, 2, figsize=(12, 4 * n_rows), squeeze=False)
        try:
            fig.suptitle('正态性检验结果', fontsize=16)

            for i, (col, series) in enumerate(tested):
                hist_ax, qq_ax = axes[i, 0], axes[i, 1]

                # Histogram with the fitted normal density on top.
                hist_ax.hist(series, bins=20, density=True, alpha=0.7, color='skyblue')
                xmin, xmax = hist_ax.get_xlim()
                x = np.linspace(xmin, xmax, 100)
                p = stats.norm.pdf(x, series.mean(), series.std())
                hist_ax.plot(x, p, 'k', linewidth=2)
                hist_ax.set_title(f'{col} - 分布直方图')

                # Q-Q plot against the normal distribution.
                stats.probplot(series, dist="norm", plot=qq_ax)
                qq_ax.set_title(f'{col} - Q-Q图')

            fig.tight_layout()
            img_path = os.path.join(self.temp_dir.name, 'normality_tests.png')
            fig.savefig(img_path, dpi=150, bbox_inches='tight')
        finally:
            # Close this specific figure even when plotting or saving fails,
            # otherwise it stays registered with pyplot and leaks memory.
            plt.close(fig)

        self._log_step("Normality tests completed", "success")
        return img_path, summary, results
    except Exception as e:
        self._log_step(f"Normality tests failed: {e}", "error")
        return None, f"正态性检验失败: {e}", None