113 lines
4.4 KiB
Python
113 lines
4.4 KiB
Python
import os
|
||
|
||
import numpy as np
|
||
import pandas as pd
|
||
import matplotlib.pyplot as plt
|
||
from sklearn.ensemble import RandomForestRegressor
|
||
|
||
|
||
def analyze_feature_importance(self):
    """Rank the feature columns of ``self.data`` by random-forest importance.

    The first column of ``self.data`` is used as the regression target and
    every remaining column as a predictor. A ``RandomForestRegressor``
    (50 trees, ``random_state=42``) is fitted and its
    ``feature_importances_`` are sorted in descending order.

    Returns:
        A ``(img_path, summary, fi_df)`` tuple.

        * ``img_path`` -- path of ``feature_importance.png`` (bar chart of
          the 10 most important features) inside ``self.temp_dir.name``;
          ``None`` when ``self.generate_plots`` is falsy, when there is not
          enough data, or when the analysis failed.
        * ``summary`` -- human-readable status message.
        * ``fi_df`` -- DataFrame with columns ``feature`` and
          ``importance``; ``None`` when there is not enough data or the
          analysis failed.

    This method does not raise: any exception is logged through
    ``self._log_step`` and reported in ``summary``.
    """
    try:
        self._log_step("Analyzing feature importance...")

        data = getattr(self, 'data', None)
        if data is None or len(data.columns) <= 1:
            self._log_step("Not enough data for feature importance analysis", "warning")
            return None, "Not enough data for feature importance analysis", None

        # Use the first column as the target variable.
        y = data.iloc[:, 0]
        # The target must not be one of the predictors: if it is, the forest
        # simply rediscovers it and it dominates the importance ranking.
        X = data.iloc[:, 1:]

        # A small forest keeps the analysis fast.
        model = RandomForestRegressor(n_estimators=50, random_state=42)
        model.fit(X, y)

        feature_importance = pd.Series(
            model.feature_importances_, index=X.columns
        ).sort_values(ascending=False)

        fi_df = feature_importance.reset_index()
        fi_df.columns = ['feature', 'importance']

        summary = f"Feature importance analysis completed, top feature: {fi_df.iloc[0]['feature']}"

        if not self.generate_plots:
            self._log_step("Feature importance analysis completed (data only)", "success")
            return None, summary, fi_df

        fig = plt.figure(figsize=(8, 6))
        try:
            feature_importance.head(10).plot(kind='bar')
            plt.title('Feature Importance Analysis')
            plt.ylabel('Importance Score')
            plt.tight_layout()

            img_path = os.path.join(self.temp_dir.name, 'feature_importance.png')
            fig.savefig(img_path, dpi=150, bbox_inches='tight')
        finally:
            # Always release the figure, even if plotting or saving failed.
            plt.close(fig)

        self._log_step("Feature importance analysis completed", "success")
        return img_path, summary, fi_df

    except Exception as e:
        self._log_step(f"Feature importance analysis failed: {e}", "error")
        return None, f"Feature importance analysis failed: {e}", None
|
||
|
||
|
||
def perform_var_analysis(self, steps=10):
    """Fit a vector autoregression on ``self.data`` and forecast ahead.

    Only numeric columns are considered, and at most the first three of
    them are modelled. The lag order is chosen by AIC with a maximum of 2.

    Args:
        steps: Number of future periods to forecast. Defaults to 10.

    Returns:
        A ``(img_path, summary, forecast_df)`` tuple.

        * ``img_path`` -- path of ``var_analysis.png`` (actual series plus
          dashed forecasts) inside ``self.temp_dir.name``; ``None`` when
          ``self.generate_plots`` is falsy, when there is not enough data,
          or when the analysis failed.
        * ``summary`` -- human-readable status message.
        * ``forecast_df`` -- DataFrame with one ``<column>_forecast`` column
          per modelled variable and ``steps`` rows; ``None`` when there is
          not enough data or the analysis failed.

    This method does not raise: any exception is logged through
    ``self._log_step`` and reported in ``summary``.
    """
    try:
        self._log_step("Performing VAR analysis...")

        data = getattr(self, 'data', None)
        if data is None or len(data.columns) <= 1:
            self._log_step("Not enough data for VAR analysis", "warning")
            return None, "数据不足,无法进行VAR分析", None

        numeric_data = data.select_dtypes(include=[np.number])
        if len(numeric_data.columns) < 2:
            self._log_step("Not enough numeric columns for VAR analysis", "warning")
            return None, "数值变量不足,无法进行VAR分析", None

        # Imported lazily, and only once the cheap input checks have passed.
        from statsmodels.tsa.api import VAR

        # Cap the model at three variables.
        var_data = numeric_data.iloc[:, :3]

        model = VAR(var_data)
        results = model.fit(maxlags=2, ic='aic')
        if results.k_ar == 0:
            # AIC may select zero lags. A zero-lag model has no lag
            # coefficients to forecast from, and ``values[-0:]`` would be the
            # whole array rather than an empty window, so use one lag instead.
            results = model.fit(1)

        lag_order = results.k_ar
        forecast = results.forecast(var_data.values[-lag_order:], steps=steps)

        forecast_df = pd.DataFrame(
            data=forecast,
            columns=[f"{col}_forecast" for col in var_data.columns],
        )
        summary = f"VAR分析完成,使用滞后阶数: {lag_order},生成了{steps}期预测"

        if not self.generate_plots:
            self._log_step("VAR analysis completed (data only)", "success")
            return None, summary, forecast_df

        n_obs = len(var_data)
        fig = plt.figure(figsize=(12, 8))
        try:
            history_x = range(n_obs)
            future_x = range(n_obs, n_obs + steps)
            for i, col in enumerate(var_data.columns):
                plt.plot(history_x, var_data[col].values, label=f'{col} (actual)', alpha=0.7)
                plt.plot(
                    future_x,
                    forecast[:, i],
                    label=f'{col} (forecast)',
                    linestyle='--',
                )

            plt.axvline(x=n_obs, color='red', linestyle=':', alpha=0.7, label='Forecast Start')
            plt.xlabel('Time')
            plt.ylabel('Value')
            plt.title('Vector Autoregression (VAR) Forecast')
            plt.legend()
            plt.grid(True, alpha=0.3)
            plt.tight_layout()

            img_path = os.path.join(self.temp_dir.name, 'var_analysis.png')
            fig.savefig(img_path, dpi=150, bbox_inches='tight')
        finally:
            # Always release the figure, even if plotting or saving failed.
            plt.close(fig)

        self._log_step("VAR analysis completed", "success")
        return img_path, summary, forecast_df

    except Exception as e:
        self._log_step(f"VAR analysis failed: {e}", "error")
        return None, f"VAR分析失败: {e}", None
|