Json-Python-Server/app/services/analysis/modules/modeling.py

113 lines
4.4 KiB
Python
Raw Permalink Normal View History

2026-01-29 18:18:32 +08:00
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
def analyze_feature_importance(self):
    """Rank feature columns by random-forest importance for predicting the first column.

    The first column of ``self.data`` is the regression target. The remaining
    numeric columns are the features; the target itself is excluded so it
    cannot trivially dominate its own importance ranking.

    Returns:
        A ``(img_path, summary, fi_df)`` tuple.

        * ``img_path`` -- path of the saved bar chart (top 10 features), or
          ``None`` when ``self.generate_plots`` is false or the analysis did
          not run.
        * ``summary`` -- human-readable status message.
        * ``fi_df`` -- DataFrame with columns ``feature`` and ``importance``
          sorted by descending importance, or ``None`` when the analysis did
          not run or failed.

    Any exception is logged through ``self._log_step`` and reported in
    ``summary`` instead of being raised.
    """
    try:
        self._log_step("Analyzing feature importance...")

        data = getattr(self, 'data', None)
        if data is None or len(data.columns) <= 1:
            self._log_step("Not enough data for feature importance analysis", "warning")
            return None, "Not enough data for feature importance analysis", None

        # Use the first column as the target variable.
        y = data.iloc[:, 0]
        # Every other column is a candidate feature. The target must not be
        # part of X (leakage), and the regressor only accepts numeric input.
        X = data.iloc[:, 1:].select_dtypes(include=[np.number])
        if X.shape[1] == 0:
            message = "Not enough numeric feature columns for feature importance analysis"
            self._log_step(message, "warning")
            return None, message, None

        # A reduced number of trees keeps the analysis fast.
        model = RandomForestRegressor(n_estimators=50, random_state=42)
        model.fit(X, y)

        feature_importance = pd.Series(model.feature_importances_, index=X.columns)
        feature_importance = feature_importance.sort_values(ascending=False)

        fi_df = feature_importance.reset_index()
        fi_df.columns = ['feature', 'importance']
        summary = f"Feature importance analysis completed, top feature: {fi_df.iloc[0]['feature']}"

        if not self.generate_plots:
            self._log_step("Feature importance analysis completed (data only)", "success")
            return None, summary, fi_df

        fig = plt.figure(figsize=(8, 6))
        try:
            feature_importance.head(10).plot(kind='bar')
            plt.title('Feature Importance Analysis')
            plt.ylabel('Importance Score')
            plt.tight_layout()
            img_path = os.path.join(self.temp_dir.name, 'feature_importance.png')
            plt.savefig(img_path, dpi=150, bbox_inches='tight')
        finally:
            # Always release the figure, even if plotting or saving fails.
            plt.close(fig)

        self._log_step("Feature importance analysis completed", "success")
        return img_path, summary, fi_df
    except Exception as e:
        self._log_step(f"Feature importance analysis failed: {e}", "error")
        return None, f"Feature importance analysis failed: {e}", None
def perform_var_analysis(self):
    """Fit a vector autoregression (VAR) and forecast 10 periods ahead.

    The model is fitted on at most the first three numeric columns of
    ``self.data``, after dropping rows with missing values. The lag order is
    chosen by AIC with a maximum of 2 lags.

    Returns:
        A ``(img_path, summary, forecast_df)`` tuple.

        * ``img_path`` -- path of the saved actual-vs-forecast chart, or
          ``None`` when ``self.generate_plots`` is false or the analysis did
          not run.
        * ``summary`` -- human-readable status message.
        * ``forecast_df`` -- DataFrame with one ``<column>_forecast`` column
          per modelled variable and one row per forecast period, or ``None``
          when the analysis did not run or failed.

    Any exception is logged through ``self._log_step`` and reported in
    ``summary`` instead of being raised.
    """
    max_lags = 2
    forecast_steps = 10
    try:
        self._log_step("Performing VAR analysis...")

        data = getattr(self, 'data', None)
        if data is None or len(data.columns) <= 1:
            self._log_step("Not enough data for VAR analysis", "warning")
            return None, "数据不足无法进行VAR分析", None

        numeric_data = data.select_dtypes(include=[np.number])
        if len(numeric_data.columns) < 2:
            self._log_step("Not enough numeric columns for VAR analysis", "warning")
            return None, "数值变量不足无法进行VAR分析", None

        # Model at most the first three numeric columns. Rows with missing
        # values are dropped because the estimator cannot handle NaN.
        var_data = numeric_data.iloc[:, :3].dropna()
        if len(var_data) <= max_lags:
            self._log_step("Not enough data for VAR analysis", "warning")
            return None, "数据不足无法进行VAR分析", None

        # Imported lazily so statsmodels is only required when a model is fitted.
        from statsmodels.tsa.api import VAR

        model = VAR(var_data)
        results = model.fit(maxlags=max_lags, ic='aic')
        if results.k_ar == 0:
            # AIC may pick a constant-only model (lag order 0). Forecasting
            # needs at least one lag, and values[-0:] would pass the whole
            # history instead of an empty window, so refit with lag order 1.
            self._log_step("AIC selected lag order 0; refitting VAR with lag order 1", "warning")
            results = model.fit(maxlags=1)
        lag_order = results.k_ar

        history = var_data.values
        forecast = results.forecast(history[-lag_order:], steps=forecast_steps)
        forecast_df = pd.DataFrame(
            data=forecast,
            columns=[f"{col}_forecast" for col in var_data.columns],
        )
        summary = f"VAR分析完成使用滞后阶数: {lag_order}生成了{forecast_steps}期预测"

        if not self.generate_plots:
            self._log_step("VAR analysis completed (data only)", "success")
            return None, summary, forecast_df

        n_obs = len(var_data)
        fig = plt.figure(figsize=(12, 8))
        try:
            for i, col in enumerate(var_data.columns):
                plt.plot(range(n_obs), history[:, i], label=f'{col} (actual)', alpha=0.7)
                plt.plot(
                    range(n_obs, n_obs + forecast_steps),
                    forecast[:, i],
                    label=f'{col} (forecast)',
                    linestyle='--',
                )
            plt.axvline(x=n_obs, color='red', linestyle=':', alpha=0.7, label='Forecast Start')
            plt.xlabel('Time')
            plt.ylabel('Value')
            plt.title('Vector Autoregression (VAR) Forecast')
            plt.legend()
            plt.grid(True, alpha=0.3)
            plt.tight_layout()
            img_path = os.path.join(self.temp_dir.name, 'var_analysis.png')
            plt.savefig(img_path, dpi=150, bbox_inches='tight')
        finally:
            # Always release the figure, even if plotting or saving fails.
            plt.close(fig)

        self._log_step("VAR analysis completed", "success")
        return img_path, summary, forecast_df
    except Exception as e:
        self._log_step(f"VAR analysis failed: {e}", "error")
        return None, f"VAR分析失败: {e}", None