6. Convolutional Neural Networks#

At the end of this exercise you will know how to:

  • train a CNN classifier on CIFAR-10 and monitor the training loss with TensorBoard,

  • evaluate a trained classifier, both overall and per class,

  • move the model and the data to the GPU for training,

  • adapt a pretrained model to a small dataset by finetuning or by freezing all but the last layer.

We will keep this notebook as tight as possible, since many of the questions you might have are already answered in these excellent PyTorch tutorials. Our focus is on the nitty-gritty details that are not explained in depth there.

One further helpful PyTorch resource is the official cheat sheet.

from __future__ import print_function, division

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
import torch.backends.cudnn as cudnn
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import copy
from torch.utils.tensorboard import SummaryWriter

cudnn.benchmark = True
plt.ion()   # interactive mode

# Load the TensorBoard notebook extension for logging training runs
%load_ext tensorboard

6.1. Training a Classifier#

We will be working with the CIFAR-10 dataset, which is probably the second best-known classification dataset, just after MNIST and just before ImageNet. It consists of 60,000 RGB images of shape 3x32x32 from 10 classes, with 6,000 images per class. The Python version of the dataset takes around 160 MB of disk space.

# the transform is a preprocessing step over the data before feeding it to the model.
# The `ToTensor` transform converts the PIL image to a tensor with values in the range
# [0, 1]. The `Normalize` transform applies input normalization which you have seen in
# the "Tricks of Optimization" lecture.
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

batch_size = 4

# if you don't use the `download=True` argument, the data will be downloaded in the
# root directory, if it is not already there.
trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                          shuffle=True, num_workers=2)

testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                         shuffle=False, num_workers=2)

classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
# print("Number of training batches: ", len(trainloader))  # 1250
Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz
100%|██████████| 170498071/170498071 [00:03<00:00, 42983432.61it/s]
Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified
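As a quick sanity check (reusing the `trainset` loaded above), we can verify that `Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))` computes `(x - 0.5) / 0.5` per channel and therefore maps the `[0, 1]` output of `ToTensor` to `[-1, 1]`:

# one transformed sample; values should lie in [-1, 1] after normalization
img, label = trainset[0]
print(img.shape)                           # torch.Size([3, 32, 32])
print(img.min().item(), img.max().item())  # both within [-1.0, 1.0]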

Now, we can look at one random batch.

def imshow(img):
    img = img / 2 + 0.5     # unnormalize
    npimg = img.numpy()
    plt.imshow(np.transpose(npimg, (1, 2, 0)))
    plt.show()

# get some random training images
# first make an iterator over the training data and then get the `next` batch
dataiter = iter(trainloader)
images, labels = next(dataiter)

# show images
imshow(torchvision.utils.make_grid(images))
# print labels
print(' '.join(f'{classes[labels[j]]:5s}' for j in range(batch_size)))
# print shapes
print(f'Shapes of images: {images.shape} and labels: {labels.shape}')
[Image: grid of four random training images from the batch]
dog   car   horse plane
Shapes of images: torch.Size([4, 3, 32, 32]) and labels: torch.Size([4])

We are ready to define our first CNN

class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, (5,5)) # (in_channels, out_channels, kernel_size) -> 6x28x28
        self.pool = nn.MaxPool2d(2) # -> 6x14x14
        self.conv2 = nn.Conv2d(6, 16, 5) # -> 16x10x10
        self.fc1 = nn.Linear(16 * 5 * 5, 120)  # (in_features, out_features)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        # print(f"x.shape = {x.shape}")
        x = self.pool(F.relu(self.conv2(x)))
        # print(f"x.shape = {x.shape}")
        x = torch.flatten(x, 1)  # flatten all dimensions except batch
        # print(f"x.shape = {x.shape}")
        x = F.relu(self.fc1(x))
        # print(f"x.shape = {x.shape}")
        x = F.relu(self.fc2(x))
        # print(f"x.shape = {x.shape}")
        x = self.fc3(x)
        return x
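The spatial sizes in the comments above follow the usual convolution arithmetic: for input width `W`, kernel size `K`, padding `P` and stride `S`, the output width is `floor((W - K + 2P) / S) + 1`. A tiny helper (purely illustrative) reproduces the chain of shapes:

def conv_out(w, k, p=0, s=1):
    # spatial output size of a convolution or pooling layer
    return (w - k + 2 * p) // s + 1

w = conv_out(32, 5)        # conv1: 32 -> 28
w = conv_out(w, 2, s=2)    # pool:  28 -> 14
w = conv_out(w, 5)         # conv2: 14 -> 10
w = conv_out(w, 2, s=2)    # pool:  10 -> 5
print(16 * w * w)          # 400, the `in_features` of fc1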

To build a training loop, we are just missing the loss function (the “criterion”) and the optimizer

net = Net()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
# if we use the commented out code in the model definition, we get these shapes
outputs = net(images)
x.shape = torch.Size([4, 6, 14, 14])
x.shape = torch.Size([4, 16, 5, 5])
x.shape = torch.Size([4, 400])
x.shape = torch.Size([4, 120])
x.shape = torch.Size([4, 84])
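Before training starts, a useful sanity check is the value of the initial loss: an untrained network should spread its probability mass roughly uniformly over the 10 classes, so the cross-entropy should start near `-ln(1/10) ≈ 2.30`. We can check this with the batch from above:

# loss of the untrained network on one batch; expect a value close to ln(10)
print(criterion(outputs, labels).item())   # roughly 2.3
print(np.log(10))                          # 2.3026...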

Let’s train this model

writer = SummaryWriter('runs/cifar10_experiment_1')

for epoch in range(2):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 2000 == 1999:    # print every 2000 mini-batches
            writer.add_scalar('train/loss', running_loss / 2000, epoch * len(trainloader) + i)
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
            running_loss = 0.0

print('Finished Training')
writer.close()
[1,  2000] loss: 2.160
[1,  4000] loss: 1.799
[1,  6000] loss: 1.644
[1,  8000] loss: 1.558
[1, 10000] loss: 1.496
[1, 12000] loss: 1.448
[2,  2000] loss: 1.392
[2,  4000] loss: 1.368
[2,  6000] loss: 1.349
[2,  8000] loss: 1.290
[2, 10000] loss: 1.307
[2, 12000] loss: 1.264
Finished Training
%tensorboard --logdir 'runs/cifar10_experiment_1'
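Once training is done, you will usually want to persist the weights so you do not have to retrain in the next session. The standard pattern (the path is just an example) saves only the `state_dict`:

PATH = './cifar_net.pth'   # example location
torch.save(net.state_dict(), PATH)

# later / in a new session: rebuild the architecture, then load the weights
net = Net()
net.load_state_dict(torch.load(PATH))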

Get a prediction over one of the test batches

# get test sample
dataiter = iter(testloader)
images, labels = next(dataiter)

# evaluate model on test input
outputs = net(images)

# make predictions
_, predicted = torch.max(outputs, 1)

print('Predicted: ', ' '.join(f'{classes[predicted[j]]:5s}' for j in range(4)))
print('GroundTruth: ', ' '.join(f'{classes[labels[j]]:5s}' for j in range(4)))
Predicted:  cat   ship  ship  ship 
GroundTruth:  cat   ship  ship  plane
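Note that `outputs` contains raw, unnormalized logits; `nn.CrossEntropyLoss` applies the softmax internally during training. If you want interpretable class probabilities at inference time, apply `F.softmax` yourself:

probs = F.softmax(outputs, dim=1)        # logits -> probabilities per class
conf, predicted = torch.max(probs, 1)    # same argmax as above, plus confidences
for j in range(batch_size):
    print(f'{classes[predicted[j]]:5s} with probability {conf[j]:.2f}')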

Evaluate the accuracy over the full test set

correct = 0
total = 0
# since we're not training, we don't need to calculate the gradients for our outputs
with torch.no_grad():
    for data in testloader:
        images, labels = data
        # calculate outputs by running images through the network
        outputs = net(images)
        # the class with the highest energy is what we choose as prediction
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(
    f'Accuracy of the network on the 10000 test images: {100 * correct // total} %')
Accuracy of the network on the 10000 test images: 55 %

Or evaluate the accuracy per class

# prepare to count predictions for each class
correct_pred = {classname: 0 for classname in classes}
total_pred = {classname: 0 for classname in classes}

# again no gradients needed
with torch.no_grad():
    for data in testloader:
        images, labels = data
        outputs = net(images)
        _, predictions = torch.max(outputs, 1)
        # collect the correct predictions for each class
        for label, prediction in zip(labels, predictions):
            if label == prediction:
                correct_pred[classes[label]] += 1
            total_pred[classes[label]] += 1


# print accuracy for each class
for classname, correct_count in correct_pred.items():
    accuracy = 100 * float(correct_count) / total_pred[classname]
    print(f'Accuracy for class: {classname:5s} is {accuracy:.1f} %')
Accuracy for class: plane is 59.9 %
Accuracy for class: car   is 74.4 %
Accuracy for class: bird  is 38.6 %
Accuracy for class: cat   is 39.7 %
Accuracy for class: deer  is 51.2 %
Accuracy for class: dog   is 41.8 %
Accuracy for class: frog  is 58.4 %
Accuracy for class: horse is 60.7 %
Accuracy for class: ship  is 76.0 %
Accuracy for class: truck is 56.8 %
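The per-class numbers already suggest that some classes (e.g. cat and bird) are harder than others. To see which classes get confused with which, a confusion matrix can be accumulated with the same evaluation loop (a sketch, reusing `net` and `testloader` from above):

# rows: true class, columns: predicted class
confusion = torch.zeros(10, 10, dtype=torch.long)
with torch.no_grad():
    for images, labels in testloader:
        preds = net(images).argmax(dim=1)
        for t, p in zip(labels, preds):
            confusion[t, p] += 1
print(confusion)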

Let’s see how to train on the GPU

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)
cuda:0
!nvidia-smi
Fri Jan 12 09:12:32 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|=========================================+======================+======================|
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P8               9W /  70W |      3MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                                         
+---------------------------------------------------------------------------------------+
| Processes:                                                                            |
|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |
|        ID   ID                                                             Usage      |
|=======================================================================================|
|  No running processes found                                                           |
+---------------------------------------------------------------------------------------+
# the trick is to move the model as well as the data to the GPU
net = Net().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

writer = SummaryWriter('runs/cifar10_experiment_2')  # a fresh run directory, so this run does not mix with the first experiment

for epoch in range(2):

    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):

        # the only difference is that we move the data to the GPU here
        inputs, labels = data[0].to(device), data[1].to(device)

        optimizer.zero_grad()

        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        if i % 2000 == 1999:
            writer.add_scalar('train/loss', running_loss /
                              2000, epoch * len(trainloader) + i)
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
            running_loss = 0.0

print('Finished Training')
writer.close()
[1,  2000] loss: 2.171
[1,  4000] loss: 1.857
[1,  6000] loss: 1.667
[1,  8000] loss: 1.567
[1, 10000] loss: 1.495
[1, 12000] loss: 1.455
[2,  2000] loss: 1.372
[2,  4000] loss: 1.348
[2,  6000] loss: 1.337
[2,  8000] loss: 1.282
[2, 10000] loss: 1.288
[2, 12000] loss: 1.236
Finished Training
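A very common error in GPU code is a device mismatch between the model and the data (“Expected all tensors to be on the same device...”). When in doubt, check where things live; after the loop above, both should report `cuda:0`:

print(next(net.parameters()).device)   # device of the model parameters
print(inputs.device, labels.device)    # device of the last batch we moved over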

6.2. Pretraining#

The idea of pretraining is to take a large model pretrained on a large dataset and then, with a limited amount of compute, fit it to your own, small dataset. There are two common strategies, both of which we implement below:

  • Freezing all but the last layer: keep the weights of the pretrained model as they are, except for the last layer, and train only this last layer on your data.

  • Finetuning: train the whole pretrained network on your data for a few epochs, but make sure to stop before overfitting kicks in.

Both approaches rely on the pretrained model having been trained on data that is somewhat similar to yours, so that its rich feature-extraction capabilities carry over to your task.


from __future__ import print_function, division

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import torch.backends.cudnn as cudnn
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import copy
import zipfile

cudnn.benchmark = True
plt.ion()   # interactive mode
<contextlib.ExitStack at 0x77ffbf7115a0>
# download the dataset provided here
# https://pytorch.org/tutorials/beginner/transfer_learning_tutorial.html
# and put it in the same directory as this notebook

# if you want to clean the data directory, you can do so with
# !rm -rf data/*

# unzip the data into the data directory
zipfile.ZipFile("hymenoptera_data.zip", "r").extractall("data/")
# Data augmentation and normalization for training
# Just normalization for validation
data_transforms = {
    'train': transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    'val': transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}

data_dir = 'data/hymenoptera_data'
image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x),
                                          data_transforms[x])
                  for x in ['train', 'val']}
dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=4,
                                              shuffle=True, num_workers=4)
               for x in ['train', 'val']}
dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}
class_names = image_datasets['train'].classes

# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")
/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py:557: UserWarning: This DataLoader will create 4 worker processes in total. Our suggested max number of worker in current system is 2, which is smaller than what this DataLoader is going to create. Please be aware that excessive worker creation might get DataLoader running slow or even freeze, lower the worker number to avoid potential slowness/freeze if necessary.
  warnings.warn(_create_warning_msg(
def imshow(inp, title=None):
    """Imshow for Tensor."""
    inp = inp.numpy().transpose((1, 2, 0))
    mean = np.array([0.485, 0.456, 0.406])
    std = np.array([0.229, 0.224, 0.225])
    inp = std * inp + mean
    inp = np.clip(inp, 0, 1)
    plt.imshow(inp)
    if title is not None:
        plt.title(title)
    plt.pause(0.001)  # pause a bit so that plots are updated

# Get a batch of training data
inputs, classes = next(iter(dataloaders['train']))

# Make a grid from batch
out = torchvision.utils.make_grid(inputs)

imshow(out, title=[class_names[x] for x in classes])
[Image: grid of four training images, titled with their class names]
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print(f'Epoch {epoch}/{num_epochs - 1}')
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
            if phase == 'train':
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]

            print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print(
        f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
    print(f'Best val Acc: {best_acc:4f}')

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model
def visualize_model(model, num_images=6):
    was_training = model.training
    model.eval()
    images_so_far = 0
    fig = plt.figure()

    with torch.no_grad():
        for i, (inputs, labels) in enumerate(dataloaders['val']):
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)

            for j in range(inputs.size()[0]):
                images_so_far += 1
                ax = plt.subplot(num_images//2, 2, images_so_far)
                ax.axis('off')
                ax.set_title(f'predicted: {class_names[preds[j]]}')
                imshow(inputs.cpu().data[j])

                if images_so_far == num_images:
                    model.train(mode=was_training)
                    return
        model.train(mode=was_training)

Finetuning a CNN

model_ft = models.resnet18(pretrained=True)
num_ftrs = model_ft.fc.in_features
# Here the size of each output sample is set to 2.
# Alternatively, it can be generalized to nn.Linear(num_ftrs, len(class_names)).
model_ft.fc = nn.Linear(num_ftrs, 2)

model_ft = model_ft.to(device)

criterion = nn.CrossEntropyLoss()

# Observe that all parameters are being optimized
optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)

# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)
/usr/local/lib/python3.10/dist-packages/torchvision/models/_utils.py:208: UserWarning: The parameter 'pretrained' is deprecated since 0.13 and may be removed in the future, please use 'weights' instead.
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and may be removed in the future. The current behavior is equivalent to passing `weights=ResNet18_Weights.IMAGENET1K_V1`. You can also use `weights=ResNet18_Weights.DEFAULT` to get the most up-to-date weights.
  warnings.warn(msg)
Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 95.4MB/s]
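A side note on the scheduler: with `step_size=7` and `gamma=0.1`, `StepLR` multiplies the learning rate by 0.1 after every 7 calls to `scheduler.step()` (one per epoch inside `train_model`), so the rate is 0.001 for epochs 0-6, 0.0001 for epochs 7-13, and so on. A throwaway sketch of that schedule:

# illustrative only: a dummy optimizer/scheduler pair to print the decay
opt = optim.SGD(nn.Linear(2, 2).parameters(), lr=0.001, momentum=0.9)
sched = lr_scheduler.StepLR(opt, step_size=7, gamma=0.1)
for epoch in range(15):
    print(epoch, sched.get_last_lr())
    opt.step()      # step the optimizer first to avoid a scheduler warning
    sched.step()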
model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler,
                       num_epochs=2)
Epoch 0/1
----------
train Loss: 0.5745 Acc: 0.7254
val Loss: 0.2001 Acc: 0.9412

Epoch 1/1
----------
train Loss: 0.5069 Acc: 0.7869
val Loss: 0.2088 Acc: 0.9346

Training complete in 1m 47s
Best val Acc: 0.941176
visualize_model(model_ft)
[Images: six validation images titled with the model’s predictions]

Fixed feature extractor (Freezing all but the last layer)

model_conv = torchvision.models.resnet18(pretrained=True)
for param in model_conv.parameters():
    param.requires_grad = False

# Parameters of newly constructed modules have requires_grad=True by default
num_ftrs = model_conv.fc.in_features
model_conv.fc = nn.Linear(num_ftrs, 2)

model_conv = model_conv.to(device)

criterion = nn.CrossEntropyLoss()

# Observe that only parameters of final layer are being optimized as
# opposed to before.
optimizer_conv = optim.SGD(model_conv.fc.parameters(), lr=0.001, momentum=0.9)

# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_conv, step_size=7, gamma=0.1)
# print the learnable (`requires_grad=True`) model parameters
for name, param in model_conv.named_parameters():
    if param.requires_grad:
        print(name, param.data.shape)
fc.weight torch.Size([2, 512])
fc.bias torch.Size([2])
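Freezing pays off: only the fresh `fc` layer requires gradients, which is a tiny fraction of the network. Counting parameters makes this explicit:

trainable = sum(p.numel() for p in model_conv.parameters() if p.requires_grad)
total = sum(p.numel() for p in model_conv.parameters())
print(f'{trainable} trainable out of {total} total parameters')
# fc.weight (2 * 512) plus fc.bias (2) = 1026 of roughly 11.2 million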
model_conv = train_model(model_conv, criterion, optimizer_conv,
                         exp_lr_scheduler, num_epochs=2)
Epoch 0/1
----------
train Loss: 0.5743 Acc: 0.6844
val Loss: 0.2464 Acc: 0.9281

Epoch 1/1
----------
train Loss: 0.4083 Acc: 0.8115
val Loss: 0.2295 Acc: 0.9216

Training complete in 0m 56s
Best val Acc: 0.928105
visualize_model(model_conv)

plt.ioff()
plt.show()
[Images: six validation images titled with the model’s predictions]