DAY 55 序列预测任务介绍
知识点回顾
- 序列预测介绍
- 单步预测
- 多步预测的2种方式
- 序列数据的处理:滑动窗口
- 多输入多输出任务的思路
- 经典机器学习在序列任务上的劣势;以随机森林为例
作业:手动构造类似的数据集(例如 cos(x) 数据),观察不同机器学习模型的预测效果差异。
结论:使用 LightGBM 的效果同样非常差——树模型无法外推训练集取值范围之外的趋势。
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import lightgbm as lgb# =============================================================
# ===== Step 1: data preparation (identical to the previous examples) =====
# =============================================================

# Synthetic series: a cosine wave on top of a linear upward trend, plus
# Gaussian noise. NOTE: no random seed is set in this script, so the noise
# (and therefore the reported RMSE values) differs from run to run.
x = np.linspace(start=0, stop=100, num=1000)
y = np.cos(x) + 0.1 * x + np.random.normal(loc=0, scale=0.5, size=1000)

# Experiment parameters: 80% of the points are used for training and each
# sample looks back over 30 time steps.
train_size = int(0.8 * len(y))
seq_length = 30

# Scaling: the scaler is fitted on the training portion only, then applied to
# the whole series. Because of the upward trend, scaled values after
# `train_size` may fall outside [0, 1].
train_data_raw = y[:train_size]
scaler = MinMaxScaler(feature_range=(0, 1)).fit(train_data_raw[:, np.newaxis])
scaled_y = scaler.transform(y[:, np.newaxis]).flatten()

# Helper that builds the sliding-window dataset
def create_sequences(data, seq_length):
    """Build a supervised dataset from a series with a sliding window.

    Sample ``i`` consists of the ``seq_length`` consecutive values
    ``data[i:i + seq_length]`` and its target is the value that follows the
    window, ``data[i + seq_length]``.

    Args:
        data: Sequence or array indexed by time along its first axis.
        seq_length: Number of past time steps in each input window (>= 1).

    Returns:
        A tuple ``(X, y)`` of NumPy arrays. ``X`` has shape
        ``(len(data) - seq_length, seq_length, ...)`` and ``y`` has shape
        ``(len(data) - seq_length, ...)``, where ``...`` stands for any
        trailing dimensions of ``data``. When ``data`` is too short to form a
        single window, both arrays are empty but keep those shapes.

    Raises:
        ValueError: If ``seq_length`` is smaller than 1.
    """
    if seq_length < 1:
        raise ValueError(f"seq_length must be >= 1, got {seq_length}")

    series = np.asarray(data)
    n_windows = len(series) - seq_length

    if n_windows <= 0:
        # Keep the window axis even when there are no samples, so callers can
        # rely on X always having seq_length as its second dimension.
        trailing = series.shape[1:]
        empty_inputs = np.empty((0, seq_length) + trailing, dtype=series.dtype)
        empty_targets = np.empty((0,) + trailing, dtype=series.dtype)
        return empty_inputs, empty_targets

    windows = np.stack([series[i:i + seq_length] for i in range(n_windows)])
    # The target of window i is the element right after it; copy so the
    # result does not alias the caller's array.
    targets = series[seq_length:].copy()
    return windows, targets


# Apply the sliding window to the complete (scaled) series
all_X, all_y = create_sequences(scaled_y, seq_length=seq_length)

# Split the windowed samples into training and test sets.
# The target of window i sits at series index i + seq_length, so the first
# `train_size - seq_length` windows are the ones whose targets lie inside
# the training portion of the series.
split_idx = train_size - seq_length
X_train_np, X_test_np = all_X[:split_idx], all_X[split_idx:]
y_train_np, y_test_np = all_y[:split_idx], all_y[split_idx:]

# =========================================================================
# ===== Step 2: prepare the data for the LightGBM model =====
# =========================================================================

# Reshape X to two dimensions, [n_samples, n_features]: every time step of a
# window becomes one feature column.
n_samples_train = X_train_np.shape[0]
n_samples_test = X_test_np.shape[0]

X_train_lgb = X_train_np.reshape(n_samples_train, -1)
X_test_lgb = X_test_np.reshape(n_samples_test, -1)

print("为LightGBM准备的 X_train 形状:", X_train_lgb.shape)  # (770, 30)
print("为LightGBM准备的 X_test 形状:", X_test_lgb.shape)  # (200, 30)

# =============================================================
# ===== Step 3: create, train and evaluate the LightGBM model =====
# =============================================================

# The scikit-learn wrapper takes NumPy arrays directly, so no lgb.Dataset
# objects are needed. `colsample_bytree` is the wrapper's canonical name for
# LightGBM's `feature_fraction`; using it avoids the alias warning.
lgb_model = lgb.LGBMRegressor(
    n_estimators=100,
    num_leaves=31,
    learning_rate=0.05,
    colsample_bytree=0.9,
    random_state=42,
    n_jobs=-1,
)

# Train the model
print("\n开始训练LightGBM模型...")
lgb_model.fit(X_train_lgb, y_train_np)
print("模型训练完成!")

# Predict on both splits
train_predict = lgb_model.predict(X_train_lgb)
test_predict = lgb_model.predict(X_test_lgb)

# Map the predictions back to the original value scale
train_predict = scaler.inverse_transform(train_predict.reshape(-1, 1))
test_predict = scaler.inverse_transform(test_predict.reshape(-1, 1))

# The targets were scaled too, so they must be mapped back as well
y_train_orig = scaler.inverse_transform(y_train_np.reshape(-1, 1))
y_test_orig = scaler.inverse_transform(y_test_np.reshape(-1, 1))

# Root mean squared error (RMSE) in the original units
train_rmse = np.sqrt(mean_squared_error(y_train_orig, train_predict))
test_rmse = np.sqrt(mean_squared_error(y_test_orig, test_predict))

print(f"\n训练集 RMSE: {train_rmse:.4f}")
print(f"测试集 RMSE: {test_rmse:.4f}")

# =============================================================
# ===== Step 4: visualize the results =====
# =============================================================

# NOTE(review): the labels below are Chinese; matplotlib's default font may
# lack CJK glyphs, in which case a CJK-capable font has to be configured.
plt.figure(figsize=(15, 7))
plt.plot(y, label='原始数据', color='gray', alpha=0.5)

# Training-set predictions. The first prediction corresponds to series index
# `seq_length`, so everything outside the predicted span stays NaN and is not
# drawn.
train_predict_plot = np.full_like(y, np.nan)
train_predict_plot[seq_length:seq_length + len(train_predict)] = train_predict.flatten()
plt.plot(train_predict_plot, label='训练集预测值 (LightGBM)', color='blue')

# Test-set predictions start right after the training predictions.
test_predict_plot = np.full_like(y, np.nan)
test_predict_plot[len(train_predict) + seq_length:len(y)] = test_predict.flatten()
plt.plot(test_predict_plot, label='测试集预测值 (LightGBM)', color='red')

plt.title('时间序列预测结果对比 (LightGBM)')
plt.xlabel('时间步')
plt.ylabel('值')
plt.legend()
plt.grid(True)
plt.show()

# Feature importance (by gain): one feature per time step of the window
lgb.plot_importance(lgb_model, height=0.8, title='特征重要性', importance_type='gain')
plt.show()
@浙大疏锦行