下面是手动实现,写起来比较符合直觉:
def l2_penalty(w):
    """Return half the sum of squares of ``w`` (the L2 penalty term).

    Args:
        w: Weight tensor.

    Returns:
        A 0-dim tensor equal to ``sum(w ** 2) / 2``.
    """
    return torch.sum(w.pow(2)) / 2
def train(lambd):
    """Train linear regression from scratch with an explicit L2 penalty.

    Runs 100 epochs of minibatch SGD (learning rate 0.003), plots the train
    and test loss every 5 epochs, and finally prints the L2 norm of ``w``.

    Relies on ``init_params``, ``l2_penalty``, ``train_iter``, ``test_iter``
    and ``batch_size`` being defined elsewhere in the file/notebook.

    Args:
        lambd: Coefficient multiplying ``l2_penalty(w)`` in the loss;
            ``0`` makes the penalty term vanish.
    """
    w, b = init_params()
    net, loss = lambda X: d2l.linreg(X, w, b), d2l.squared_loss
    num_epochs, lr = 100, 0.003
    animator = d2l.Animator(xlabel='epochs', ylabel='loss', yscale='log',
                            xlim=[5, num_epochs], legend=['train', 'test'])
    for epoch in range(num_epochs):
        for X, y in train_iter:
            # Add the L2-norm penalty term. Broadcasting turns
            # l2_penalty(w) into a vector of length batch_size.
            l = loss(net(X), y) + lambd * l2_penalty(w)
            l.sum().backward()
            d2l.sgd([w, b], lr, batch_size)
        if (epoch + 1) % 5 == 0:
            animator.add(epoch + 1, (d2l.evaluate_loss(net, train_iter, loss),
                                     d2l.evaluate_loss(net, test_iter, loss)))
    print('w的L2范数是:', torch.norm(w).item())
下面是集成(简洁)实现,直接使用优化器内置的 `weight_decay` 参数:
def train_concise(wd):
    """Train linear regression using the optimizer's built-in weight decay.

    Runs 100 epochs of SGD (learning rate 0.003), plots the train and test
    loss every 5 epochs, and finally prints the L2 norm of the layer weight.

    Relies on ``num_inputs``, ``train_iter`` and ``test_iter`` being defined
    elsewhere in the file/notebook.

    Args:
        wd: ``weight_decay`` value passed to the SGD parameter group that
            holds the linear layer's weight.
    """
    net = nn.Sequential(nn.Linear(num_inputs, 1))
    for param in net.parameters():
        param.data.normal_()
    loss = nn.MSELoss(reduction='none')
    num_epochs, lr = 100, 0.003
    # The bias parameter is not decayed: only the weight's parameter group
    # is given a 'weight_decay' entry.
    trainer = torch.optim.SGD([
        {"params": net[0].weight, 'weight_decay': wd},
        {"params": net[0].bias}], lr=lr)
    animator = d2l.Animator(xlabel='epochs', ylabel='loss', yscale='log',
                            xlim=[5, num_epochs], legend=['train', 'test'])
    for epoch in range(num_epochs):
        for X, y in train_iter:
            trainer.zero_grad()
            # `loss` is the nn.MSELoss(reduction='none') created above, so
            # the result is reduced explicitly with .mean() before backward.
            l = loss(net(X), y)
            l.mean().backward()
            trainer.step()
        if (epoch + 1) % 5 == 0:
            animator.add(epoch + 1,
                         (d2l.evaluate_loss(net, train_iter, loss),
                          d2l.evaluate_loss(net, test_iter, loss)))
    print('w的L2范数:', net[0].weight.norm().item())