08 Regularisation#
Overfitting
L2 and L1 regularisation
Backpropagation with regularisation
Dropout
Early stopping
Train/test split
Stability of the fit (overfitting)#
With more flexibility comes more potential for overfitting (poor performance on new data). Here is an example of fitting the two-hidden-layer network on an increasing number of data points:
# previously was at the end of 02 logistic regression
"""
for n in [5, 10, 20, 30, 40, 50, 100, 250]:
    X, y = get_cubic_data(n)
    X = torch.tensor(X, dtype=torch.float)
    y = torch.tensor(y, dtype=torch.float)
    y = y[:, None]

    model = nn.Sequential(
        nn.Linear(in_features=2, out_features=3),
        nn.Sigmoid(),
        nn.Linear(in_features=3, out_features=3),
        nn.Sigmoid(),
        nn.Linear(in_features=3, out_features=1),
        # Sigmoid (applied in loss_fn)
    )

    learning_rate = 0.1
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    model, loss_history = train(X, y, model, loss_fn, optimizer, 30000)

    show_result(X, y, model, loss_history, print_weights=False, suptitle=f"n = {n}")
"""
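One way to make "poor performance on new data" concrete is to hold out a test set drawn from the same distribution and compare accuracies on the training and test points. Below is a minimal, self-contained sketch of that check. It uses a stand-in data generator make_data in place of get_cubic_data, and it assumes the loss is nn.BCEWithLogitsLoss (matching the "Sigmoid (applied in loss_fn)" comment in the cell above); both of these are assumptions for illustration, not the notebook's own helpers.

import torch
from torch import nn

torch.manual_seed(0)

def make_data(n):
    # Placeholder for get_cubic_data: points in [-1, 1]^2, labelled by
    # whether they lie above the curve x2 = x1^3 (assumed cubic boundary).
    X = torch.rand(n, 2) * 2 - 1
    y = (X[:, 1] > X[:, 0] ** 3).float()[:, None]
    return X, y

X_train, y_train = make_data(10)   # small training set, prone to overfitting
X_test, y_test = make_data(250)    # held-out data from the same distribution

model = nn.Sequential(
    nn.Linear(in_features=2, out_features=3),
    nn.Sigmoid(),
    nn.Linear(in_features=3, out_features=3),
    nn.Sigmoid(),
    nn.Linear(in_features=3, out_features=1),
    # Sigmoid (applied in loss_fn)
)
loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

for _ in range(30000):
    optimizer.zero_grad()
    loss = loss_fn(model(X_train), y_train)
    loss.backward()
    optimizer.step()

def accuracy(X, y):
    # Fraction of points classified correctly, thresholding the sigmoid at 0.5.
    with torch.no_grad():
        return ((torch.sigmoid(model(X)) > 0.5).float() == y).float().mean().item()

print(f"train accuracy: {accuracy(X_train, y_train):.2f}")
print(f"test accuracy:  {accuracy(X_test, y_test):.2f}")

With only 10 training points the training accuracy is typically near 1.0 while the test accuracy lags behind; that gap is the overfitting visible in the plots above, and it shrinks as n grows.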