3. Optimization#

This exercise is devoted to:

  • convex and non-convex optimization,

  • how to use different optimizers with PyTorch, and

  • how to apply some common tricks of optimization.

Imports and Utils

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import torch
from torch import nn


def plot_progress(func, track, losses, func_min, torch=False):
    """
    Adapted from 
    https://colab.research.google.com/github/davidbau/how-to-read-pytorch/blob/master/notebooks/3-Pytorch-Optimizers.ipynb
    """
    # Left: contours of the objective function with the track of x; right: the loss over iterations
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

    xs = np.linspace(-2.5, 2.5, 100)
    ys = np.linspace(-1, 4, 100)
    X, Y = np.meshgrid(xs, ys)
    Z = func(np.concatenate([np.expand_dims(X, 0), np.expand_dims(Y, 0)]))
    Z = np.array(Z)

    levels = np.linspace(Z.min(), Z.max(), 100)
    manual_levels = np.array([5, 20, 100])
    levels = np.concatenate(((Z.min()+0.001) * manual_levels, levels))
    levels.sort()

    ax1.contour(X, Y, Z, levels=levels, cmap="bwr", alpha=0.5)
    track = np.stack(track).T  # (2,n)

    for i in range(len(losses)-1):
        ax1.plot(track[0, i:i+2], track[1, i:i+2], marker='o',
                 color='k', alpha=0.2 + i/(len(losses)-1) * 0.3)

    ax1.scatter(func_min[0], func_min[1], s=80, c='g', marker=(5, 1))

    ax1.set_title('progress of x')
    ax1.set_ylim(-1.0, 4)
    ax1.set_xlim(-2.5, 2.5)
    ax1.set_ylabel('x_2')
    ax1.set_xlabel('x_1')

    ax2.set_title('progress of loss')
    ax2.xaxis.set_major_locator(matplotlib.ticker.MaxNLocator(integer=True))
    for i in range(len(losses)-1):
        ax2.plot([i, i+1], losses[i:i+2], marker='o',
                 color='k', alpha=0.2 + i/(len(losses)-1) * 0.3)

    ax2.hlines(func(func_min), 0, len(losses), colors='g')
    # ax2.plot(range(len(losses)), losses, marker='o', color='k')
    ax2.set_ylabel('objective')
    ax2.set_xlabel('iteration')
    plt.show()

3.1. With Analytical Derivatives#

3.1.1. Case Study: Convex vs Non-Convex Optimization#

In the convex optimization case, we look at the Rosenbrock function

\(f_1(x) = (x_2-x_1^2)^2 + 0.01 \cdot (1-x_1)^2\)

The non-convex case emerges if we add a sinusoidal term in \(x_1\), i.e.

\(f_2(x) = (x_2-x_1^2)^2 + 0.01 \cdot (1-x_1)^2 + 0.1 \cdot (1 + \cos(5 x_1))\)
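
The optimizers in this section need the analytical gradient of the objective and, for Newton's method, also its Hessian. For \(f_1\) these are

\[ \nabla f_1(x) = \begin{pmatrix} -4 x_1 (x_2 - x_1^2) - 0.02\,(1 - x_1) \\ 2\,(x_2 - x_1^2) \end{pmatrix}, \qquad \nabla^2 f_1(x) = \begin{pmatrix} 8 x_1^2 - 4 (x_2 - x_1^2) + 0.02 & -4 x_1 \\ -4 x_1 & 2 \end{pmatrix} \]

which is exactly what grad_f1 and hess_f1 below compute; grad_f2 and hess_f2 additionally account for the cosine term.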

def f1(x, use_torch=False):
    """Rosenbrock function
    Has one local and global minimum at f(1,1)=0"""
    return (x[1]-x[0] ** 2) ** 2 + 0.01 * (1-x[0]) ** 2


def grad_f1(x):
    """Gradient of Rosenbrock function"""
    dx0 = - 2 * (x[1] - x[0] ** 2) * 2 * x[0] - 2 * 0.01 * (1-x[0])
    dx1 = 2 * (x[1] - x[0] ** 2)
    return np.array([dx0, dx1])


def hess_f1(x):
    """Hessian of Rosenbrock function"""
    dxdx = 8*x[0]**2 - 4*(x[1]-x[0]**2) + 2*0.01
    dxdy = dydx = -4*x[0]
    dydy = 2
    return np.array([[dxdx, dxdy], [dydx, dydy]])


f1_min = np.array([1.0, 1.0])


def f2(x, use_torch=False):
    """Extended Rosenbrock function
    Has one global minimum f(0.63, 0.40)=0.00137 and many local minima"""
    regular_rosenbrock = (x[1]-x[0] ** 2) ** 2 + 0.01 * (1-x[0]) ** 2

    if use_torch:
        return regular_rosenbrock + 0.1 * (1 + torch.cos(5*x[0]))
    else:
        return regular_rosenbrock + 0.1 * (1 + np.cos(5*x[0]))


def grad_f2(x):
    """Gradient of extended Rosenbrock function"""
    dx0 = - 2 * (x[1] - x[0] ** 2) * 2 * x[0] - 2 * \
        0.01 * (1-x[0]) - 0.1 * np.sin(5*x[0]) * 5
    dx1 = 2 * (x[1] - x[0] ** 2)
    return np.array([dx0, dx1])


def hess_f2(x):
    """Hessian of extended Rosenbrock function"""
    dxdx = 8*x[0]**2 - 4*(x[1]-x[0]**2) + 2*0.01 - 0.1 * np.cos(5*x[0]) * 5 * 5
    dxdy = dydx = -4*x[0]
    dydy = 2
    return np.array([[dxdx, dxdy], [dydx, dydy]])


f2_min = np.array([0.63, 0.40])
selected_function = 1

if selected_function == 1:
    func = f1
    grad_func = grad_f1
    hess_func = hess_f1
    func_min = f1_min
else:
    func = f2
    grad_func = grad_f2
    hess_func = hess_f2
    func_min = f2_min

3.1.2. Gradient Descent#
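
The function gd_step below implements the plain gradient descent update

\[ x_{k+1} = x_k - \eta \, \nabla f(x_k) \]

with a constant learning rate \(\eta = 0.1\).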

num_iters = 100
x = np.array([-2., 2])
track, losses = [x], [func(x)]


def gd_step(x, grad, lr=0.1):
    # GD step
    x = x - lr * grad
    return x


for iter in range(num_iters):
    # apply gradient descent
    grad = grad_func(x)
    x = gd_step(x=x, grad=grad)

    # logging
    track.append(x)
    losses.append(func(x))

plot_progress(func, track, losses, func_min)
../_images/98596483cd9b3572be67d2024ef1fa813f9e6f3e66eaafabee664424dd08b5fe.png

3.1.3. Gradient Descent with Momentum#
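
gd_with_momentum_step below keeps a running velocity of past gradients,

\[ v_{k+1} = \beta \, v_k + \nabla f(x_k), \qquad x_{k+1} = x_k - \eta \, v_{k+1}, \]

with \(\beta = 0.9\) and \(\eta = 0.03\).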

num_iters = 100
x = np.array([-2., 2])
v = np.zeros(x.shape)
track, losses = [x], [func(x)]


def gd_with_momentum_step(x, v, grad, lr=0.03, beta=0.9):
    # GD with momentum step
    v = beta*v + grad
    x = x - lr * v
    return (x, v)


for iter in range(num_iters):
    # apply gradient descent with momentum
    grad = grad_func(x)
    x, v = gd_with_momentum_step(x=x, v=v, grad=grad)

    # logging
    track.append(x)
    losses.append(func(x))

plot_progress(func, track, losses, func_min)
../_images/1689ff1bb4c9e20701041975f1f105019b1cdae85a20f6699d290993ca63e7cf.png

3.1.4. Adam#
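
adam_step below implements the standard Adam update with bias correction,

\[ v_t = \beta_1 v_{t-1} + (1-\beta_1) \, g_t, \qquad s_t = \beta_2 s_{t-1} + (1-\beta_2) \, g_t^2, \]
\[ \hat{v}_t = \frac{v_t}{1-\beta_1^t}, \qquad \hat{s}_t = \frac{s_t}{1-\beta_2^t}, \qquad x_t = x_{t-1} - \eta \, \frac{\hat{v}_t}{\sqrt{\hat{s}_t} + \epsilon}, \]

where \(g_t\) is the gradient at step \(t\) and \(\epsilon\) is a small constant that prevents division by zero.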

num_iters = 100
x = np.array([-2., 2])
v = np.zeros(x.shape)
s = np.zeros(x.shape)
track, losses = [x], [func(x)]


def adam_step(x, v, s, grad, t, lr=.9, beta1=0.9, beta2=0.999):
    # Adam step
    v = beta1 * v + (1 - beta1) * grad
    s = beta2 * s + (1 - beta2) * grad ** 2
    v_hat = v / (1 - beta1 ** t)
    s_hat = s / (1 - beta2 ** t)
    # the small number in the next line is added to avoid divisions by zero
    x = x - lr * v_hat / (np.sqrt(s_hat) + 0.0000001)
    return (x, v, s)


for iter in range(num_iters):
    # apply Adam
    grad = grad_func(x)
    x, v, s = adam_step(x=x, v=v, s=s, grad=grad, t=(iter+1))

    # logging
    track.append(x)
    losses.append(func(x))

plot_progress(func, track, losses, func_min)
../_images/14573c501058d9250d8b01973dce0c9ab24d1e7abd3de312be6305a07b31c6d0.png

3.1.5. Newton Optimizer#
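
newton_step below applies the (optionally damped) Newton update

\[ x_{k+1} = x_k - \eta \, \big[\nabla^2 f(x_k)\big]^{-1} \nabla f(x_k), \]

which exploits curvature information from the Hessian and therefore needs far fewer iterations; this is why only 10 iterations are used here.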

num_iters = 10
x = np.array([-2., 2])
track, losses = [x], [func(x)]


def newton_step(x, grad, hess, lr=1.0):
    # Newton's method step
    hess_inv = np.linalg.inv(hess)
    x = x - lr * hess_inv @ grad
    # equivalent to `x = x - lr * np.dot(hess_inv, grad)`
    return x


for iter in range(num_iters):
    # apply Newton's method
    grad = grad_func(x)
    hess = hess_func(x)

    x = newton_step(x=x, grad=grad, hess=hess)

    # logging
    track.append(x)
    losses.append(func(x))

plot_progress(func, track, losses, func_min)
../_images/1bc3d2e038e8240ba797b05cd58e261bc74c8e9c19aefa996d48039b3fa384b7.png

3.1.6. Derivative-Free Optimization#

Here, we compare two seemingly very similar algorithms: Grid Search and Random Search.

We assume that we know the range of the parameters that contains the minimum. We then explore this region either in a structured way (grid search) or by random sampling (random search); both variants below use the same budget of samples_per_dim**2 = 9 function evaluations, so the comparison is fair.

samples_per_dim = 3

# grid search

xs = np.linspace(-2.0, 2.0, samples_per_dim)
ys = np.linspace(-0.95, 3.0, samples_per_dim)
X, Y = np.meshgrid(xs, ys)
Z = np.zeros(X.shape)
for i in range(len(xs)):
    for j in range(len(ys)):
        Z[i, j] = func(np.array([X[i, j], Y[i, j]]))

plt.scatter(X.flatten(), Y.flatten(), c=Z.flatten(), cmap='bwr')
plt.grid()

idx = Z == Z.min()
print(
    f"Our best guess is x={X[idx]}, y={Y[idx]}, and loss = {func(np.array([X[idx], Y[idx]]))}")
Our best guess is x=[0.], y=[-0.95], and loss = [0.9125]
../_images/2d27e573f0082073adeeed265f214fd6e7859fd2656647859f1a3b92cd80b888.png
# random search

np.random.seed(123)
# randomly sample samples_per_dim**2 points in the (x, y) search box
X = np.random.uniform(-2.0, 2.0, int(samples_per_dim**2))
Y = np.random.uniform(-0.95, 3.0, int(samples_per_dim**2))
# evaluate loss at each of these points
Z = np.zeros(X.shape)
for i in range(len(Z)):
    Z[i] = func(np.array([X[i], Y[i]]))

plt.scatter(X, Y, c=Z, cmap='bwr')
plt.grid()

idx = Z == Z.min()
print(
    f"Our best guess is x={X[idx]}, y={Y[idx]}, and loss = {func(np.array([X[idx], Y[idx]]))}")
Our best guess is x=[0.78587674], y=[0.5988642], and loss = [0.0008096]
../_images/3862c14aeaeee72c969890da1f4e656a3507c650cb63e633f7c1b5bcdaf28e16.png

3.2. With PyTorch#

3.2.1. Gradient Descent#

num_iters = 100
x_init = torch.tensor([-2.0, 2.0])
x = x_init.clone()
x.requires_grad = True
optimizer = torch.optim.SGD([x], lr=0.1)

track = [x.detach().clone().numpy()]
losses = [func(x, use_torch=True).detach().numpy()]

for iter in range(num_iters):
    loss = func(x, use_torch=True)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    track.append(x.detach().clone().numpy())
    losses.append(loss.detach().numpy())

plot_progress(func, track, losses, func_min)
../_images/1b4888445889e5d599a15c271abd44d023a06e855c0954fffc341b09b88a6057.png

3.2.2. Adam#

num_iters = 100
x_init = torch.tensor([-2.0, 2.0])
x = x_init.clone()
x.requires_grad = True
optimizer = torch.optim.Adam([x], lr=1.0, betas=(0.9, 0.95))

track = [x.detach().clone().numpy()]
losses = [func(x, use_torch=True).detach().numpy()]

for iter in range(num_iters):
    loss = func(x, use_torch=True)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    track.append(x.detach().clone().numpy())
    losses.append(loss.detach().numpy())

plot_progress(func, track, losses, func_min)
../_images/e89740009e5bc889723abbd3a000f467f0678105ee7ab037c2134294c22d7a93.png

3.2.3. L-BFGS#

num_iters = 100
x_init = torch.tensor([-2.0, 2.0])
x = x_init.clone()
x.requires_grad = True
optimizer = torch.optim.LBFGS([x], lr=1.0)

track = [x.detach().clone().numpy()]
losses = [func(x, use_torch=True).detach().numpy()]

for iter in range(num_iters):
    # L-BFGS requires a `closure` that re-evaluates the loss and its gradient,
    # because it may evaluate the objective several times per step (e.g. in the line search):
    def closure():
        """
        from the [documentation](https://pytorch.org/docs/stable/generated/torch.optim.LBFGS.html): 
        "A closure that reevaluates the model and returns the loss."

        More information on the closure here:
        https://pytorch.org/docs/stable/optim.html
        """
        optimizer.zero_grad()
        loss = func(x, use_torch=True)
        loss.backward()
        return loss

    optimizer.step(closure)

    track.append(x.detach().clone().numpy())
    losses.append(func(x, use_torch=True).detach().numpy())

plot_progress(func, track, losses, func_min)
../_images/43914610a4799b397d9fef53f552b01059a90f61cd1a93f7004257c73ff09c50.png

3.3. Tricks of Optimization#

# plot util

def plot_progress_polynomial(losses, model, x_data, y_data):
    # visualization
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

    # left plot
    x_linspace = torch.linspace(-1.5, 1.5, 31).unsqueeze(1)

    with torch.no_grad():
        y_pred_np = model(x_linspace).data.squeeze().numpy()

    # for plotting we transfer everything from torch to numpy
    x_data_np = x_data.detach().squeeze().numpy()
    y_data_np = y_data.detach().squeeze().numpy()
    x_np = x_linspace.squeeze().numpy()

    # plot performance
    ax1.plot(x_data_np, y_data_np, 'go', label='True data')
    ax1.plot(x_np, y_pred_np, '--', label='Predictions')
    ax1.set_ylim([-3, 3])
    ax1.legend()
    ax1.grid()

    # right plot
    ax2.plot(losses)
    ax2.set_xlabel('iteration')
    ax2.set_ylabel('loss')
    ax2.grid()

    plt.tight_layout()
    plt.show()


# train util

def train(num_iters, model, optimizer, criterion, x_data, y_data, losses):
    print("## Parameters before training")
    for name, param in model.named_parameters():
        print(name, ": ", param.data)

    for iter in range(num_iters):

        def closure():
            optimizer.zero_grad()
            y_pred = model(x_data)
            loss = criterion(y_pred, y_data)
            loss.backward()
            return loss

        optimizer.step(closure)
        losses.append(closure().detach().item())

    # Caution: LBFGS converges very fast, but will not always converge to the 
    # true minimum. Check the left plot below to verify validity.
    print("## Parameters after training:")
    for name, param in model.named_parameters():
        print(name, ": ", param.data)

    print(f"## Loss at the end of training = {losses[-1]}")

3.3.1. Case Study: Polynomial Linear Regression#

Note: we move back to the typical ML notation with measurement pairs \(\left\{x^{(i)}, y^{(i)}\right\}_{i=1,\dots,m}\), a model \(h(x)\) which approximates \(y\), and a loss \(\mathcal{L}\).

In this specific case we look at polynomial linear regression, i.e.

\[ h(x) = \vartheta_0 + \vartheta_1 \cdot x^1 + ... + \vartheta_p \cdot x^p \]

We work with a very simple artificial dataset consisting of 5 measurement pairs given below.

x_data = torch.tensor([[1.0, 1.5, 1.8, 2.3, 2.5]]).T
y_data = torch.tensor([[3.0, 2., 2.3, 4., 1.]]).T
print(x_data.shape, y_data.shape)   
plt.scatter(x_data, y_data); plt.grid()
plt.xlabel('x'); plt.ylabel('y'); plt.show()
torch.Size([5, 1]) torch.Size([5, 1])
../_images/06d207739d3d1d8adde4a857ab0b605a218bcbbfec2858be2e09769ee9fcd392.png

3.3.1.1. Input/Output Normalization#

Trick Nr. 1: normalize inputs and outputs to zero-mean, unit-standard-deviation variables.

# TODO: vital for an efficient training process

x_mean = x_data.mean()
x_std = x_data.std()
x_data = (x_data-x_mean)/x_std

y_mean = y_data.mean()
y_std = y_data.std()
y_data = (y_data-y_mean)/y_std

plt.scatter(x_data, y_data); plt.grid()
../_images/56a5814368d46273402e451a3884e50d9f6fb57b1a7d8fb0b190298c09c17b60.png
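
Keep the normalization statistics around: predictions made on the normalized scale have to be mapped back with the same mean and standard deviation. A minimal sanity check, added here for illustration and using only the variables defined above:

# undo the standardization to recover the original measurements
x_data_orig = x_data * x_std + x_mean
y_data_orig = y_data * y_std + y_mean
print(x_data_orig.squeeze())  # should reproduce the original x values
print(y_data_orig.squeeze())  # should reproduce the original y values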

3.3.2. Choice of Model Complexity#

Let’s first construct a polynomial model (which is linear in its parameters) and set up an optimization with Adam.

# model

class PolynomialModel(nn.Module):
    """ from https://soham.dev/posts/polynomial-regression-pytorch"""

    def __init__(self, degree):
        super().__init__()
        self._degree = degree
        # 1 is the dimension of the output y
        self.linear = nn.Linear(self._degree, 1)

    def forward(self, x):
        return self.linear(self._polynomial_features(x))

    def _polynomial_features(self, x):
        # x.shape = [B, 1] where B is the batch size; the output has shape [B, degree]
        return torch.cat([x ** i for i in range(1, self._degree + 1)], 1)
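
As a quick, purely illustrative sanity check of the feature construction (not part of the exercise):

m = PolynomialModel(3)
# for a single input x = 2 the generated features are [x, x^2, x^3]
print(m._polynomial_features(torch.tensor([[2.0]])))  # expected: tensor([[2., 4., 8.]])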

Exercise

Try various values for the degree of the polynomial. When does overfitting begin?

Keep in mind: a polynomial of degree d can fit d+1 points exactly, so with our 5 data points a degree-4 polynomial can already interpolate the (normalized) data perfectly.

# TODO: try out different polynomial degrees

polynomial_degree = 3
model = PolynomialModel(polynomial_degree)
# Use LBFGS for fast convergence, but check validity of solution. 
# Remember: LBFGS is only an approximate 2nd order optimizer
# optimizer = torch.optim.LBFGS(model.parameters()) 
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
criterion = nn.MSELoss()

losses = []
num_iters = 100

train(num_iters, model, optimizer, criterion, x_data, y_data, losses)

plot_progress_polynomial(losses, model, x_data, y_data)
## Parameters before training
linear.weight :  tensor([[-0.2872, -0.3925, -0.4374]])
linear.bias :  tensor([-0.4289])
## Parameters after training:
linear.weight :  tensor([[ 2.7548, -1.1361, -2.3317]])
linear.bias :  tensor([0.5757])
## Loss at the end of training = 0.17764988541603088
../_images/39a2c93db2aeda89cabc5ac1bf9c2414736840f1e614683ce9e68d96853ff62d.png

3.3.3. Learning Rate Optimization#

Exercise

Work with polynomial degree = 3.

Apply grid and/or random search over the learning rate of Adam. Also run a combined search over the learning rate and the second-order momentum parameter \(\beta_2\).

# TODO:

# grid search over learning rate for 100 steps

polynomial_degree = 3
num_iters = 100

lrs = [1.0, 0.1, 0.01, 0.001]

fig = plt.figure()

for lr in lrs:
    
    model = PolynomialModel(polynomial_degree)
    # Use LBFGS for fast convergence, but check validity of solution.
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()

    losses = []

    for iter in range(num_iters):

        def closure():
            optimizer.zero_grad()
            y_pred = model(x_data)
            loss = criterion(y_pred, y_data)
            loss.backward()
            return loss

        optimizer.step(closure)
        losses.append(closure().detach().item())

    plt.plot(losses, label=f"lr={lr}")
    

plt.xlabel("iteration")
plt.ylabel("loss")
plt.legend()
plt.grid()
plt.ylim((0, 5))
../_images/163c34e98dbf0593838df317eae83d1849539fbb336113b1302b63bdc53a6412.png
# random search over learning rate and beta_2 for 1k steps

def sample_loguniform(lb, ub, n):
    """Sapmle from the log-uniform distribution.
    This is very useful for explore different orders of magnitude.
    See e.g. this article: https://towardsdatascience.com/why-is-the-log-uniform-distribution-useful-for-hyperparameter-tuning-63c8d331698
    
    lb: lower boundary
    ub: upper boundary
    n: number of samples
    """
    return np.exp(np.random.uniform(np.log(lb), np.log(ub), n,))

polynomial_degree = 3
num_iters = 400

np.random.seed(123)
torch.manual_seed(123)
# log-uniform samples between 0.02 and 0.5
lrs = sample_loguniform(0.5, 0.02, 10)
# samples in the interval (0.9, 0.999), resolved more finely towards 1
beta2s = 1 - sample_loguniform(0.001, 0.1, 10)

# now train the model at each of these points

fig = plt.figure()

final_losses = []
for (lr, beta2) in zip(lrs, beta2s):

    model = PolynomialModel(polynomial_degree)
    # Use LBFGS for fast convergence, but check validity of solution.
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, betas=(0.9, beta2))
    criterion = nn.MSELoss()

    losses = []

    for iter in range(num_iters):

        def closure():
            optimizer.zero_grad()
            y_pred = model(x_data)
            loss = criterion(y_pred, y_data)
            loss.backward()
            return loss

        optimizer.step(closure)
        losses.append(closure().detach().item())
    
    final_losses.append(losses[-1])

    plt.plot(losses, label=f"lr={lr:0.3f}, beta2={beta2:.4f}")


plt.xlabel("iteration")
plt.ylabel("loss")
plt.legend()
plt.ylim((0, 2))
plt.grid()
plt.show()

# second scatter plot
fig2 = plt.figure()
final_losses = np.array(final_losses)
plt.scatter(lrs, beta2s, c=final_losses, cmap='bwr')
plt.xlabel("lr")
plt.ylabel("beta2")
plt.grid()

idx = final_losses == final_losses.min()
print(f"Our best guess is lr={lrs[idx]}, beta2={beta2s[idx]}, and loss = {final_losses[idx]}")
../_images/a961c114ac52b94d1fc85d2b634a677d114590e5d27957485e024bd53e134941.png
Our best guess is lr=[0.2409047], beta2=[0.99246394], and loss = [0.17758173]
../_images/60b062cd1fbf1a10a86bd6bcf199a0126b4f9fffc11bc5efe504cca7841565c4.png

3.3.4. Learning Rate Scheduling#

Exercise

Work with polynomial degree = 3.

Apply a stepwise decreasing learning rate (in PyTorch: MultiStepLR) together with the Adam optimizer.
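
To see what the scheduler does in isolation, here is a minimal sketch (with a dummy single-parameter optimizer, purely for illustration) that records the learning rate over 200 steps:

import torch
from torch.optim.lr_scheduler import MultiStepLR

dummy = torch.nn.Parameter(torch.zeros(1))
opt = torch.optim.Adam([dummy], lr=1.0)
sched = MultiStepLR(opt, milestones=[50, 150], gamma=0.2)

lr_history = []
for step in range(200):
    opt.step()        # no gradients needed; we only care about the schedule
    sched.step()
    lr_history.append(sched.get_last_lr()[0])

print(lr_history[0], lr_history[100], lr_history[199])  # 1.0, then 0.2, then 0.2**2 = 0.04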

from torch.optim.lr_scheduler import MultiStepLR
# TODO:

# MultiStepLR implements a learning rate that decreases by a factor of
# `gamma` at each of the given `milestones` (iteration counts during optimization).

polynomial_degree = 3
num_iters = 200

lr_inits = [1.0, 0.1, 0.01, 0.001]

fig = plt.figure()

for lr in lr_inits:

    model = PolynomialModel(polynomial_degree)
    # This learning rate `lr` given to the optimizer is the initial one
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = MultiStepLR(optimizer, milestones=[50, 150], gamma=0.2)

    criterion = nn.MSELoss()

    losses = []

    for iter in range(num_iters):

        def closure():
            optimizer.zero_grad()
            y_pred = model(x_data)
            loss = criterion(y_pred, y_data)
            loss.backward()
            return loss

        optimizer.step(closure)
        scheduler.step()
        losses.append(closure().detach().item())

    plt.plot(losses, label=f"lr={lr}")


plt.xlabel("iteration")
plt.ylabel("loss")
plt.legend()
plt.grid()
plt.ylim((0, 2))
plt.show()

# In this plot it is not obvious why LR schedulers are so useful, but you
# will see the true benefit once we start working with deep learning models.
../_images/94cb3a564ba931061105d9c1dd38c8f8e5f4b37b6a1788ab1f5263a66dd0f471.png

3.3.5. Regularization#

Exercise

Work with polynomial degree = 5.

Apply L1 and L2 regularization to the weights of the model using the Adam optimizer.

Hint: See this discussion. More precisely, l1_norm = sum(torch.linalg.norm(p, 1) for p in model.parameters()) and l2_norm = sum(torch.linalg.norm(p, 2) for p in model.parameters())
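
In the solution below the regularization enters the objective as an additive penalty,

\[ \mathcal{L}(\vartheta) = \mathrm{MSE}\big(h(x), y\big) + \lambda \, R(\vartheta), \qquad R_{\mathrm{L1}}(\vartheta) = \sum_{p} \lVert p \rVert_1, \qquad R_{\mathrm{L2}}(\vartheta) = \sum_{p} \lVert p \rVert_2, \]

where the sums run over the parameter tensors \(p\) of the model and \(\lambda\) = reg_lambda = 0.1.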

# TODO: 

# Parameters
torch.manual_seed(42)
np.random.seed(42)
num_iters = 10000
# weighting value for regularization term in the loss
reg_lambda = 0.1
dataset_case = "default"  # "default", "polynomial" or "multivariate"

if dataset_case == "default":
    polynomial_degree = 5
elif dataset_case == "polynomial":
    np.random.seed(42)
    n_samples = 20
    x_data = torch.rand(size=(n_samples, 1)) * 3 - 1.5  # Uniformly distributed between -1.5 and 1.5
    y_data =  x_data**2 + 0.01 * torch.randn_like(x_data)
    polynomial_degree = 10
elif dataset_case == "multivariate":
    n_samples = 20
    num_features = 10 # number of features
    coeff_target = torch.zeros(num_features)
    coeff_target[:3] = 1.0
    x_data = torch.randn(n_samples, num_features)
    y_data = x_data @ coeff_target + 0.1 * torch.randn(n_samples)
    y_data = y_data.unsqueeze(1)

def no_reg(model):
    """no regularization"""
    return 0

def l1_reg(model):
    """l1 regularization"""
    return sum(torch.norm(p, 1) for p in model.parameters())

def l2_reg(model):
    """l2 regularization
    With `torch.optim.Adam`, a similar effect (a penalty on the squared L2 norm)
    is available directly via the `weight_decay` parameter.
    """
    return sum(torch.norm(p, 2) for p in model.parameters())

for case, reg in zip(["no", "L1", "L2"], [no_reg, l1_reg, l2_reg]):
    print(f"################# Training with {case} regularization ###################")
    losses = []
    
    if dataset_case in ["polynomial", "default"]:
        model = PolynomialModel(polynomial_degree)
    elif dataset_case == "multivariate":
        model = nn.Linear(num_features, 1)
    # optimizer = torch.optim.SGD(model.parameters(), lr=0.003)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    # optimizer = torch.optim.LBFGS(model.parameters())
    criterion = nn.MSELoss()

    print("## Parameters before training")
    for name, param in model.named_parameters():
        print(name, ": ", param.data)

    for iter in range(num_iters):

        def closure():
            optimizer.zero_grad()
            y_pred = model(x_data)

            # here we apply regularization
            loss = criterion(y_pred, y_data) + reg_lambda * reg(model)
            loss.backward()
            return loss

        optimizer.step(closure)
        losses.append(closure().detach().item())

    print("## Parameters after training:")
    torch.set_printoptions(sci_mode=False)
    for name, param in model.named_parameters():
        print(name, ": ", param.data)

    if dataset_case in ["polynomial", "default"]:
        plot_progress_polynomial(losses, model, x_data, y_data)
################# Training with no regularization ###################
## Parameters before training
linear.weight :  tensor([[ 0.3419,  0.3712, -0.1048,  0.4108, -0.0980]])
linear.bias :  tensor([0.0902])
## Parameters after training:
linear.weight :  tensor([[ 1.5655,  2.3635,  0.2018, -2.2983, -1.4480]])
linear.bias :  tensor([-0.0935])
../_images/b7acf90c2b4fb1b2abcdfb034a1726d17ceb2931ead0301e67e89a1d6da5b92c.png
################# Training with L1 regularization ###################
## Parameters before training
linear.weight :  tensor([[-0.2177,  0.2626,  0.3942, -0.3281,  0.3887]])
linear.bias :  tensor([0.0837])
## Parameters after training:
linear.weight :  tensor([[     1.1930,      0.0004,      0.0004,     -0.5924,     -0.8655]])
linear.bias :  tensor([0.1560])
../_images/8e00b1251a47e539162f864a1605c1551589ae788a7f67bf81c046d85a99b3f6.png
################# Training with L2 regularization ###################
## Parameters before training
linear.weight :  tensor([[ 0.3304,  0.0606,  0.2156, -0.0631,  0.3448]])
linear.bias :  tensor([0.0661])
## Parameters after training:
linear.weight :  tensor([[ 1.3985,  0.7845,  0.0293, -1.1680, -1.0786]])
linear.bias :  tensor([0.0590])
../_images/133d9c8915cfa01ccaf16c4861f53053ff93a8a9fdcb51d5d30c2944f0095463.png