知识点回顾:
- 过拟合的判断:测试集和训练集同步打印指标
- 模型的保存和加载
  - 仅保存权重
  - 保存权重和模型
  - 保存全部信息checkpoint,还包含训练状态
- 早停策略
作业:对信贷数据集训练后保存权重,加载权重后继续训练50轮,并采取早停策略
import pandas as pd
import numpy as np
import random
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, roc_auc_score
import matplotlib.pyplot as plt


def set_seed(seed=42):
    """Seed every random number generator this script relies on.

    Seeds Python's ``random`` module, NumPy, and PyTorch (CPU and all CUDA
    devices), and configures cuDNN for deterministic behaviour.

    Args:
        seed: Integer seed shared by all generators. Defaults to 42.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # cuDNN autotuning may select different kernels from run to run, so it is
    # switched off together with the deterministic flag.
    torch.backends.cudnn.benchmark = False


set_seed(42)

# Load the data
data = pd.read_csv('data.csv')
target_col = 'Credit Default'

# Impute missing values: numeric columns receive their column median, and
# whatever is still missing afterwards is filled with the literal 'Unknown'.
data = data.fillna(data.median(numeric_only=True))
data = data.fillna('Unknown')

categorical_features = [
    'Home Ownership',
    'Purpose',
    'Term',
    'Years in current job',
]
# Every column that is neither categorical nor the target is treated as
# numerical. The exclusion set is built once rather than on every iteration.
_non_numerical = set(categorical_features) | {target_col}
numerical_features = [col for col in data.columns if col not in _non_numerical]

# Replace each categorical column with integer codes; a fresh encoder is
# fitted for every column.
for col in categorical_features:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])

# Take an explicit copy so later column assignments on X never write through
# to (or warn about) the underlying `data` frame.
X = data[categorical_features + numerical_features].copy()
y = data[target_col]

# Split BEFORE scaling so that the held-out rows never influence the
# statistics learned by the scaler.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
# Work on explicit copies so the column assignments below cannot trigger
# pandas' chained-assignment warning. X itself is left unscaled.
X_train = X_train.copy()
X_test = X_test.copy()

scaler = StandardScaler()
# Fit on the training split only, then reuse those statistics for the test split.
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])


class CreditDataset(Dataset):
    """Dataset holding features and labels as float32 tensors.

    Args:
        X: Feature table exposing ``.values`` (e.g. a pandas DataFrame).
        y: Label column exposing ``.values`` (e.g. a pandas Series).
    """

    def __init__(self, X, y):
        # Convert once up front so __getitem__ is a plain tensor index.
        self.X = torch.tensor(X.values, dtype=torch.float32)
        self.y = torch.tensor(y.values, dtype=torch.float32)

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at ``idx``."""
        return self.X[idx], self.y[idx]


train_dataset = CreditDataset(X_train, y_train)
test_dataset = CreditDataset(X_test, y_test)
# NOTE(review): BatchNorm1d cannot run in training mode on a batch holding a
# single sample; if len(train_dataset) % 64 == 1 the final batch would fail.
# Consider drop_last=True for the training loader if that ever happens.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False)


class CreditNet(nn.Module):
    """Feed-forward binary classifier that emits one raw logit per sample.

    Layer stack: Linear(input_dim, 64) -> BatchNorm -> ReLU -> Dropout(0.3)
    -> Linear(64, 32) -> BatchNorm -> ReLU -> Dropout(0.2) -> Linear(32, 1).

    Args:
        input_dim: Number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()
        # The attribute name `model` is part of the state_dict keys, so it
        # must stay stable for saved weights to load.
        self.model = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 32),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(32, 1),
        )

    def forward(self, x):
        """Return the logits with the trailing size-1 dimension removed."""
        return self.model(x).squeeze(1)


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = CreditNet(X_train.shape[1]).to(device)
# The network outputs raw logits, so the sigmoid is folded into the loss.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)


def train(model, loader, criterion, optimizer):
    """Run one training epoch and return the average loss per sample.

    Batches are moved to the module-level ``device`` before the forward pass.

    Args:
        model: Network to optimise; switched to training mode.
        loader: Iterable yielding ``(features, labels)`` batches and exposing
            a ``dataset`` attribute with a length.
        criterion: Loss callable taking ``(outputs, labels)``.
        optimizer: Optimizer stepping the model parameters.

    Returns:
        Sum of batch losses weighted by batch size, divided by
        ``len(loader.dataset)``.
    """
    model.train()
    total_loss = 0.0
    for X_batch, y_batch in loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        # Assumes criterion averages over the batch (its default reduction);
        # weighting by batch size keeps a smaller final batch from skewing
        # the epoch average.
        total_loss += loss.item() * X_batch.size(0)
    return total_loss / len(loader.dataset)


def evaluate(model, loader):
    """Score the model on ``loader`` and return ``(auc, report)``.

    Args:
        model: Network producing logits; switched to evaluation mode.
        loader: Iterable yielding ``(features, labels)`` batches, with the
            labels as CPU tensors.

    Returns:
        A tuple of the ROC AUC computed from sigmoid probabilities and the
        text classification report computed at a 0.5 threshold.
    """
    model.eval()
    prob_chunks, label_chunks = [], []
    with torch.no_grad():
        for X_batch, y_batch in loader:
            X_batch = X_batch.to(device)
            prob_chunks.append(torch.sigmoid(model(X_batch)).cpu().numpy())
            label_chunks.append(y_batch.numpy())
    preds = np.concatenate(prob_chunks)
    targets = np.concatenate(label_chunks)
    preds_label = (preds > 0.5).astype(int)
    auc = roc_auc_score(targets, preds)
    report = classification_report(targets, preds_label, digits=4)
    return auc, report


# Main training loop
epochs = 20
# Per-epoch history, kept for plotting the curves.
train_losses = []
test_aucs = []

for epoch in range(epochs):
    train_loss = train(model, train_loader, criterion, optimizer)
    # Evaluate after every epoch so train loss and test AUC can be compared
    # side by side; the text report is not needed here.
    auc, _ = evaluate(model, test_loader)
    train_losses.append(train_loss)
    test_aucs.append(auc)
    print(f"Epoch {epoch+1}/{epochs} - Train Loss: {train_loss:.4f} - Test AUC: {auc:.4f}")

# Plot the training-loss and AUC curves
# One figure, two side-by-side panels: training loss (left), test AUC (right).
plt.figure(figsize=(12, 5))
epoch_axis = range(1, epochs + 1)
panels = (
    # (series, y-axis label, title, extra plot styling)
    (train_losses, 'Train Loss', 'Training Loss Curve', {}),
    (test_aucs, 'Test AUC', 'Test AUC Curve', {'color': 'orange'}),
)
for position, (series, y_label, heading, extra_style) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(epoch_axis, series, marker='o', **extra_style)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.title(heading)
    plt.grid(True)
plt.tight_layout()
plt.show()

# Save the model weights (state_dict only)
torch.save(model.state_dict(), "credit_model.pth")
# Early-stopping helper
class EarlyStopping:
    """Flag when a monitored score has stopped improving.

    The score is treated as higher-is-better. A call counts as an improvement
    only when the score exceeds the best value seen so far by more than
    ``delta``; otherwise an internal counter is incremented. Once the counter
    reaches ``patience`` the ``early_stop`` flag is set and stays set.

    Args:
        patience: Number of consecutive non-improving calls tolerated before
            ``early_stop`` becomes True.
        delta: Minimum increase over ``best_score`` that counts as an
            improvement.
    """

    def __init__(self, patience=5, delta=1e-4):
        self.patience = patience
        self.delta = delta
        self.best_score = None  # best score observed so far
        self.counter = 0  # consecutive calls without improvement
        self.early_stop = False  # True once patience is exhausted

    def __call__(self, score):
        """Record a new score and update the stopping state.

        Args:
            score: Latest value of the monitored metric.
        """
        if self.best_score is None or score > self.best_score + self.delta:
            # The first observation or a real improvement resets the counter.
            self.best_score = score
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True


# Load the saved weights and continue training
# map_location keeps the load working when the checkpoint was written on a
# different device from the one in use now.
model.load_state_dict(torch.load("credit_model.pth", map_location=device))
epochs_continue = 50
early_stopping = EarlyStopping(patience=5, delta=1e-4)
# Separate histories for the continued run, kept for plotting the curves.
train_losses2 = []
test_aucs2 = []

# The same optimizer object is reused, so its internal state carries over
# from the first training stage.
for epoch in range(epochs_continue):
    train_loss = train(model, train_loader, criterion, optimizer)
    auc, _ = evaluate(model, test_loader)
    train_losses2.append(train_loss)
    test_aucs2.append(auc)
    print(f"[Continue] Epoch {epoch+1}/{epochs_continue} - Train Loss: {train_loss:.4f} - Test AUC: {auc:.4f}")
    # The test AUC is the monitored score: stop once it has failed to
    # improve for `patience` consecutive epochs.
    early_stopping(auc)
    if early_stopping.early_stop:
        print("Early stopping triggered!")
        break

# Plot the curves for the continued training
# Same two-panel layout as before; each x-axis follows the length of its own
# history because early stopping may have cut the run short.
plt.figure(figsize=(12, 5))
continued_panels = (
    # (history, y-axis label, title, extra plot styling)
    (train_losses2, 'Train Loss', 'Continue Training Loss Curve', {}),
    (test_aucs2, 'Test AUC', 'Continue Test AUC Curve', {'color': 'orange'}),
)
for slot, (history, y_label, heading, extra_style) in enumerate(continued_panels, start=1):
    plt.subplot(1, 2, slot)
    plt.plot(range(1, len(history) + 1), history, marker='o', **extra_style)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.title(heading)
    plt.grid(True)
plt.tight_layout()
plt.show()

# Final evaluation
auc, report = evaluate(model, test_loader)
print(f"\nFinal Test AUC: {auc:.4f}")
print("Classification Report:\n", report)