# PyTorch-Tutorial

# Tensors

  • Tensors are a specialized data structure that is very similar to arrays and matrices. In PyTorch, we use tensors to encode the inputs and outputs of a model, as well as the model’s parameters.
  • Benefits
    • Run on GPUs
    • Automatic differentiation
    • Share memory with numpy arrays
  • Created on the CPU by default; they must be moved to the GPU explicitly
import torch
from torch.utils.data import Dataset
from torchvision import datasets, transforms
from torchvision.transforms import ToTensor, Lambda
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from torchvision.io import read_image
from torch.utils.data import DataLoader
from torch import nn
import torchvision.models as models
# Initialize
data = [[1, 2], [3, 4]]
x = torch.tensor(data)
x
tensor([[1, 2],
        [3, 4]])
# Init from numpy
x = np.array(data)
t = torch.from_numpy(x)
t
tensor([[1, 2],
        [3, 4]])
# Init from torch tensor which retains shape
y = torch.rand_like(t, dtype=torch.float)
y
tensor([[0.0644, 0.8114],
        [0.3996, 0.9173]])
# Consts
torch.ones((1, 2)), torch.zeros((3, 2))
(tensor([[1., 1.]]),
 tensor([[0., 0.],
         [0., 0.],
         [0., 0.]]))
# Attributes
y.shape, y.dtype, y.device
(torch.Size([2, 2]), torch.float32, device(type='cpu'))
# Move to GPU
y.to('cuda')
tensor([[0.0644, 0.8114],
        [0.3996, 0.9173]], device='cuda:0')
# Indexing/Slicing
y[0], y[:, 0], y[..., -1]
(tensor([0.0644, 0.8114]), tensor([0.0644, 0.3996]), tensor([0.8114, 0.9173]))
# Replace in place
y[:, 1] = 0
y
tensor([[0.0644, 0.0000],
        [0.3996, 0.0000]])
# Join tensors
torch.cat([t, y])
tensor([[1.0000, 2.0000],
        [3.0000, 4.0000],
        [0.0644, 0.0000],
        [0.3996, 0.0000]])
# Element wise
t * y
tensor([[0.0644, 0.0000],
        [1.1987, 0.0000]])
# Matrix mul
torch.from_numpy(x).type(torch.float) @ y
tensor([[0.8636, 0.0000],
        [1.7915, 0.0000]])
# Scalar
torch.tensor([1, 2, 3]).sum().item()
6
# In-place ops
y.add_(5)
tensor([[5.0644, 5.0000],
        [5.3996, 5.0000]])
# Convert to numpy
y.numpy()
array([[5.06442  , 5.       ],
       [5.3995686, 5.       ]], dtype=float32)

# Datasets & Dataloaders

  • Decouples dataset code and model training code
  • Dataset stores samples with corresponding labels
  • DataLoader wraps iterable over Dataset for easy access
# Load dataset
training_data = datasets.FashionMNIST(
    root="data",
    train=True,
    download=True,
    transform=ToTensor()
)

test_data = datasets.FashionMNIST(
    root="data",
    train=False,
    download=True,
    transform=ToTensor()
)
# Visualize
labels_map = {
    0: "T-Shirt",
    1: "Trouser",
    2: "Pullover",
    3: "Dress",
    4: "Coat",
    5: "Sandal",
    6: "Shirt",
    7: "Sneaker",
    8: "Bag",
    9: "Ankle Boot",
}

figure = plt.figure(figsize=(8, 8))
cols, rows = 3, 3

for i in range(1, cols * rows + 1):
    sample_idx = int(torch.randint(len(training_data), size=(1,)).item())
    img, label = training_data[sample_idx]

    figure.add_subplot(rows, cols, i)
    plt.title(labels_map[label])
    plt.axis("off")
    plt.imshow(img.squeeze(), cmap="gray")
plt.show()

png
png

# Custom dataset
class CustomDataset(Dataset):
    """Image dataset described by a CSV annotations file.

    For every row of the CSV, column 0 is an image path relative to
    ``img_dir`` and column 1 is the label of that image.

    Args:
        file: Path (or file-like object) handed to ``pd.read_csv``.
        img_dir: Directory the image paths in the CSV are joined onto.
        transform: Optional callable applied to each loaded image.
        target_transform: Optional callable applied to each label.
    """

    def __init__(self, file, img_dir, transform=None, target_transform=None) -> None:
        self.img_dir = img_dir
        self.transform = transform
        self.target_transform = target_transform
        # One row per sample: image path in column 0, label in column 1
        self.img_labels = pd.read_csv(file)

    def __len__(self) -> int:
        """Return the number of rows in the annotations table."""
        return len(self.img_labels)

    def __getitem__(self, idx):
        """Load the ``idx``-th image and label, applying any transforms."""
        annotations = self.img_labels
        full_path = os.path.join(self.img_dir, annotations.iloc[idx, 0])
        picture = read_image(full_path)
        target = annotations.iloc[idx, 1]

        # Transforms are optional; skip whichever one was not supplied
        picture = self.transform(picture) if self.transform else picture
        target = self.target_transform(target) if self.target_transform else target

        return picture, target
# Prepare for training
train_dataloader = DataLoader(training_data, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=64, shuffle=True)
train_features, train_labels = next(iter(train_dataloader))  # First iteration batch (64)
print(f"Feature batch shape: {train_features.size()}")
print(f"Labels batch shape: {train_labels.size()}")

img = train_features[0].squeeze()
label = train_labels[0]
plt.imshow(img, cmap="gray")
plt.show()
print(f"Label: {label}")
Feature batch shape: torch.Size([64, 1, 28, 28])
Labels batch shape: torch.Size([64])

png
png

Label: 1

# Transforms

  • Use transforms to transform features, labels to whatever we desire.
  • FashionMNIST features are in PIL Image format, and the labels are integers.
  • For training, we need the features as normalized tensors, and the labels as one-hot encoded tensors.
  • Transforms
    • ToTensor - converts to a FloatTensor and scales to range [0., 1.]
    • Lambda - user-defined lambda function
# Transform

ds = datasets.FashionMNIST(
    root="data",
    train=True,
    download=True,
    transform=ToTensor(),
    # Create 0 tensor of size 10, assign value 1 through scatter based on label
    target_transform=Lambda(lambda y: torch.zeros(10, dtype=torch.float).scatter_(0, torch.tensor(y), value=1))
)

# Build NN

  • torch.nn namespace provides all classes for building a NN
  • Every module subclasses nn.Module
  • A neural network is a module itself that consists of other modules (layers). This nested structure allows for building and managing complex architectures easily.
  • To use the created model, pass input data to the instance and do not call model.forward directly
  • Many layers inside a neural network are parameterized. Subclassing nn.Module automatically tracks all fields defined inside your model object, and makes all parameters accessible using your model’s parameters() or named_parameters() methods.
# Define NN module
class NeuralNetwork(nn.Module):
    """Three-layer fully connected network mapping 28x28 inputs to 10 logits.

    The input is flattened and passed through two 512-unit hidden layers
    with ReLU activations, followed by a 10-unit linear output layer.
    """

    def __init__(self):
        super().__init__()
        hidden_units = 512
        self.flatten = nn.Flatten()
        # Layers are created in execution order so parameter initialisation
        # draws from the RNG in the same sequence every time.
        layers = [
            nn.Linear(28 * 28, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, 10),
        ]
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw (un-normalised) class scores for ``x``."""
        return self.linear_relu_stack(self.flatten(x))


# Prefer the GPU when one is available; otherwise keep the model on the CPU
# so the cell still runs on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = NeuralNetwork().to(device)
print(model)
NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)
# Calling model
# Create the input on whatever device the model's parameters live on, so the
# call works for both CPU and GPU models.
X = torch.rand(1, 28, 28, device=next(model.parameters()).device)
logits = model(X)

pred_probab = nn.Softmax(dim=1)(logits)  # dim=1: probabilities sum to 1 across the class axis
y_pred = pred_probab.argmax(1)

print(f"Predicted class: {y_pred.item()}")
Predicted class: 8
print(f"Model structure: {model}\n\n")

for name, param in model.named_parameters():
    print(f"Layer: {name} | Size: {param.size()} | Values : {param[:2]} \n")
Model structure: NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)


Layer: linear_relu_stack.0.weight | Size: torch.Size([512, 784]) | Values : tensor([[ 0.0023, -0.0056,  0.0165,  ..., -0.0223,  0.0029, -0.0093],
        [-0.0096,  0.0235,  0.0186,  ..., -0.0205,  0.0214, -0.0350]],
       device='cuda:0', grad_fn=<SliceBackward0>) 

Layer: linear_relu_stack.0.bias | Size: torch.Size([512]) | Values : tensor([-0.0055, -0.0158], device='cuda:0', grad_fn=<SliceBackward0>) 

Layer: linear_relu_stack.2.weight | Size: torch.Size([512, 512]) | Values : tensor([[ 0.0238, -0.0208,  0.0025,  ...,  0.0394,  0.0170, -0.0157],
        [-0.0373, -0.0269,  0.0017,  ..., -0.0328,  0.0026, -0.0071]],
       device='cuda:0', grad_fn=<SliceBackward0>) 

Layer: linear_relu_stack.2.bias | Size: torch.Size([512]) | Values : tensor([ 0.0070, -0.0422], device='cuda:0', grad_fn=<SliceBackward0>) 

Layer: linear_relu_stack.4.weight | Size: torch.Size([10, 512]) | Values : tensor([[ 4.2477e-02, -1.4444e-02,  1.1693e-02,  ..., -6.0160e-03,
          2.8273e-02, -1.6820e-02],
        [-2.0542e-02, -3.2362e-02,  4.1182e-02,  ...,  4.6578e-03,
          3.0098e-02,  9.7865e-05]], device='cuda:0', grad_fn=<SliceBackward0>) 

Layer: linear_relu_stack.4.bias | Size: torch.Size([10]) | Values : tensor([-0.0012, -0.0229], device='cuda:0', grad_fn=<SliceBackward0>) 

# Automatic differentiation

  • During training, backpropagation is used: the loss is propagated backward through the network, and each parameter is adjusted according to the gradient of the loss function with respect to that parameter.
  • To compute those gradients, PyTorch has a built-in differentiation engine called torch.autograd
  • Computation graph for the below model is image.png
  • A function that we apply to tensors to construct the computational graph is an object of class Function. It knows how to compute the function in the forward direction and how to compute its derivative during backward propagation; a reference to the backward propagation function is stored in the grad_fn property of a tensor.
  • To optimize weights of parameters in the neural network, we need to compute the derivatives of our loss function with respect to parameters which we do by calling loss.backward() and retrieve values from w.grad & b.grad.
    • We can only obtain the grad properties for the leaf nodes of the computational graph, which have requires_grad property set to True.
    • We can only perform gradient calculations using backward once on a given graph, for performance reasons. If we need to do several calls on the same graph, we need to pass retain_graph=True to the backward call.
  • We might not want gradient computations if model is already trained and we only want to pass input values. We do this by the torch.no_grad block.
    • Speeds up computations
    • Can mark some params as frozen
  • Computation graph
    • A directed acyclic graph containing Function objects.
    • Leaves are input & roots are output tensors.
    • Tracing from roots to leaves we compute gradients.
    • Forward pass
      • Runs operation to compute result tensor
      • Maintains operation's gradient function
    • Backward pass
      • Computes gradients from each grad_fn
      • Accumulates them in the tensor's grad attribute
      • Uses the chain rule to propagate the error all the way to the leaves.
    • DAGs are dynamic: the graph is rebuilt from scratch, and after each .backward() call autograd starts populating a new graph
  • Jacobian product
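    • Used when both the input and the output are tensors (not scalars); the matrix of partial derivatives of the output with respect to the input is the Jacobian, denoted $J$.
    • Instead of computing the Jacobian itself, we can compute the Jacobian product $v^T \cdot J$ for a given vector $v$ by calling backward with $v$ as an argument.
    • The size of $v$ must be the same as that of the original tensor with respect to which we want to compute the product.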
    • Torch accumulates gradients on multiple backward calls.
# A simple NN
x = torch.ones(5)  # Input tensor
y = torch.zeros(3)  # Expected output

w = torch.randn(5, 3, requires_grad=True)
b = torch.randn(3, requires_grad=True)
z = torch.matmul(x, w) + b

loss = torch.nn.functional.binary_cross_entropy_with_logits(z, y)
# Gradient fn for the tensors
print(f"Gradient function for z = {z.grad_fn}")
print(f"Gradient function for loss = {loss.grad_fn}")
Gradient function for z = <AddBackward0 object at 0x7fec40126a10>
Gradient function for loss = <BinaryCrossEntropyWithLogitsBackward0 object at 0x7fec40126dd0>
# Gradient computation
loss.backward()
print(w.grad)
print(b.grad)
tensor([[0.0532, 0.2955, 0.0534],
        [0.0532, 0.2955, 0.0534],
        [0.0532, 0.2955, 0.0534],
        [0.0532, 0.2955, 0.0534],
        [0.0532, 0.2955, 0.0534]])
tensor([0.0532, 0.2955, 0.0534])
# Disable gradient tracking
z = torch.matmul(x, w) + b
print(z.requires_grad)

with torch.no_grad():
    z = torch.matmul(x, w) + b
print(z.requires_grad)

z = torch.matmul(x, w) + b
z_det = z.detach()
print(z_det.requires_grad)
True
False
False
# Jacobian computation
inp = torch.eye(4, 5, requires_grad=True)
out = (inp+1).pow(2).t()

out.backward(torch.ones_like(out), retain_graph=True)
print(f"First call\n{inp.grad}")

out.backward(torch.ones_like(out), retain_graph=True)
print(f"\nSecond call\n{inp.grad}")  # Different as torch accumulates gradients

inp.grad.zero_()
out.backward(torch.ones_like(out), retain_graph=True)
print(f"\nCall after zeroing gradients\n{inp.grad}")
First call
tensor([[4., 2., 2., 2., 2.],
        [2., 4., 2., 2., 2.],
        [2., 2., 4., 2., 2.],
        [2., 2., 2., 4., 2.]])

Second call
tensor([[8., 4., 4., 4., 4.],
        [4., 8., 4., 4., 4.],
        [4., 4., 8., 4., 4.],
        [4., 4., 4., 8., 4.]])

Call after zeroing gradients
tensor([[4., 2., 2., 2., 2.],
        [2., 4., 2., 2., 2.],
        [2., 2., 4., 2., 2.],
        [2., 2., 2., 4., 2.]])

# Optimizing parameters

  • Hyperparameters
    • Adjustable params that help control model optimization process
    • Examples
      • Epochs
      • Batch size
      • Learning rate
  • Optimization loop
    • Each iteration of the loop is called an epoch
    • Consists of train & validation/test loop
  • Loss function
    • Measures the degree of dissimilarity of obtained result to the target value
    • We want to minimize it during training
  • Optimizer
    • Optimization is the process of adjusting model parameters to reduce model error in each training step
    • Happens in 3 steps
      • Call optimizer.zero_grad to reset gradients of model params.
      • Backprop loss with a loss.backward() call
      • Call optimizer.step to adjust params by collected gradients
learning_rate = 1e-3
batch_size = 64
epochs = 5
# Initialize the loss function
loss_fn = nn.CrossEntropyLoss()
# Initialize the optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
# Train
def train_loop(dataloader, model, loss_fn, optimizer):
    """Run one training epoch over ``dataloader``, updating ``model`` in place.

    Each batch is moved to the device holding the model's parameters, so the
    loop works for CPU and GPU models alike. The loss and the number of
    samples seen are printed every 100 batches.

    Args:
        dataloader: Iterable of ``(X, y)`` batches with a ``dataset``
            attribute whose length is used for progress reporting.
        model: The ``nn.Module`` being trained.
        loss_fn: Callable mapping ``(pred, y)`` to a scalar loss tensor.
        optimizer: Optimizer built over ``model``'s parameters.
    """
    size = len(dataloader.dataset)

    # Follow the model's device instead of hard-coding "cuda"; a model without
    # parameters leaves the batches where they already are.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    # Set the model to training mode - important for batch normalization and dropout layers
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        if device is not None:
            X, y = X.to(device), y.to(device)

        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation; gradients are cleared after the step so they do
        # not accumulate into the next batch.
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            loss_value, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss_value:>7f}  [{current:>5d}/{size:>5d}]")
# Test
def test_loop(dataloader, model, loss_fn):
    # Set the model to evaluation mode - important for batch normalization and dropout layers
    model.eval()
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    # Evaluating the model with torch.no_grad() ensures that no gradients are computed during test mode
    with torch.no_grad():
        for X, y in dataloader:
            y = y.to("cuda")
            pred = model(X.to("cuda"))
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
# Train and test
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

epochs = 10
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_dataloader, model, loss_fn, optimizer)
    test_loop(test_dataloader, model, loss_fn)
print("Done!")
Epoch 1
-------------------------------
loss: 2.307636  [   64/60000]
loss: 2.285602  [ 6464/60000]
loss: 2.276582  [12864/60000]
loss: 2.259840  [19264/60000]
loss: 2.235050  [25664/60000]
loss: 2.230984  [32064/60000]
loss: 2.211891  [38464/60000]
loss: 2.190745  [44864/60000]
loss: 2.178002  [51264/60000]
loss: 2.178962  [57664/60000]
Test Error: 
 Accuracy: 48.6%, Avg loss: 2.155461 

Epoch 2
-------------------------------
loss: 2.161340  [   64/60000]
loss: 2.125069  [ 6464/60000]
loss: 2.112151  [12864/60000]
loss: 2.070282  [19264/60000]
loss: 2.053517  [25664/60000]
loss: 2.038269  [32064/60000]
loss: 2.022111  [38464/60000]
loss: 1.967550  [44864/60000]
loss: 1.954207  [51264/60000]
loss: 1.857311  [57664/60000]
Test Error: 
 Accuracy: 61.4%, Avg loss: 1.885582 

Epoch 3
-------------------------------
loss: 1.869350  [   64/60000]
loss: 1.907839  [ 6464/60000]
loss: 1.856524  [12864/60000]
loss: 1.748639  [19264/60000]
loss: 1.753882  [25664/60000]
loss: 1.721729  [32064/60000]
loss: 1.533606  [38464/60000]
loss: 1.569647  [44864/60000]
loss: 1.501750  [51264/60000]
loss: 1.503370  [57664/60000]
Test Error: 
 Accuracy: 62.9%, Avg loss: 1.509759 

Epoch 4
-------------------------------
loss: 1.532161  [   64/60000]
loss: 1.388002  [ 6464/60000]
loss: 1.443119  [12864/60000]
loss: 1.341603  [19264/60000]
loss: 1.281666  [25664/60000]
loss: 1.273293  [32064/60000]
loss: 1.254668  [38464/60000]
loss: 1.331026  [44864/60000]
loss: 1.283661  [51264/60000]
loss: 1.268986  [57664/60000]
Test Error: 
 Accuracy: 63.9%, Avg loss: 1.235178 

Epoch 5
-------------------------------
loss: 1.141766  [   64/60000]
loss: 1.192270  [ 6464/60000]
loss: 1.154393  [12864/60000]
loss: 1.196480  [19264/60000]
loss: 1.135895  [25664/60000]
loss: 1.129671  [32064/60000]
loss: 1.065386  [38464/60000]
loss: 0.970784  [44864/60000]
loss: 1.176631  [51264/60000]
loss: 1.183024  [57664/60000]
Test Error: 
 Accuracy: 65.1%, Avg loss: 1.068337 

Epoch 6
-------------------------------
loss: 1.096738  [   64/60000]
loss: 1.186743  [ 6464/60000]
loss: 1.166200  [12864/60000]
loss: 1.004340  [19264/60000]
loss: 1.163391  [25664/60000]
loss: 0.958063  [32064/60000]
loss: 0.989098  [38464/60000]
loss: 1.110039  [44864/60000]
loss: 0.900861  [51264/60000]
loss: 1.119866  [57664/60000]
Test Error: 
 Accuracy: 66.5%, Avg loss: 0.965297 

Epoch 7
-------------------------------
loss: 1.007180  [   64/60000]
loss: 0.863904  [ 6464/60000]
loss: 0.876791  [12864/60000]
loss: 0.921062  [19264/60000]
loss: 0.920073  [25664/60000]
loss: 0.901771  [32064/60000]
loss: 0.988752  [38464/60000]
loss: 0.816810  [44864/60000]
loss: 0.762370  [51264/60000]
loss: 1.075614  [57664/60000]
Test Error: 
 Accuracy: 67.2%, Avg loss: 0.896018 

Epoch 8
-------------------------------
loss: 0.800400  [   64/60000]
loss: 1.027741  [ 6464/60000]
loss: 0.775136  [12864/60000]
loss: 0.861781  [19264/60000]
loss: 0.763875  [25664/60000]
loss: 0.880276  [32064/60000]
loss: 0.819394  [38464/60000]
loss: 0.827268  [44864/60000]
loss: 0.653719  [51264/60000]
loss: 0.804865  [57664/60000]
Test Error: 
 Accuracy: 69.1%, Avg loss: 0.847345 

Epoch 9
-------------------------------
loss: 0.729509  [   64/60000]
loss: 0.788606  [ 6464/60000]
loss: 0.808654  [12864/60000]
loss: 0.825352  [19264/60000]
loss: 0.932982  [25664/60000]
loss: 0.819240  [32064/60000]
loss: 0.908761  [38464/60000]
loss: 0.676269  [44864/60000]
loss: 0.658318  [51264/60000]
loss: 0.829181  [57664/60000]
Test Error: 
 Accuracy: 70.0%, Avg loss: 0.808802 

Epoch 10
-------------------------------
loss: 0.743573  [   64/60000]
loss: 0.782595  [ 6464/60000]
loss: 0.963787  [12864/60000]
loss: 0.787322  [19264/60000]
loss: 0.694903  [25664/60000]
loss: 0.812387  [32064/60000]
loss: 0.831846  [38464/60000]
loss: 0.861239  [44864/60000]
loss: 0.781885  [51264/60000]
loss: 0.744422  [57664/60000]
Test Error: 
 Accuracy: 71.6%, Avg loss: 0.778174 

Done!

# Save and load model

  • PyTorch models store the learned parameters in an internal state dictionary, called state_dict. These can be persisted via the torch.save method
  • When loading model weights, we need to instantiate the model class first, because the class defines the structure of the network. If we want to save the structure of this class together with the weights, we can pass model itself (rather than model.state_dict()) to torch.save
    • Uses pickle
# Save
model = models.vgg16(weights='IMAGENET1K_V1')
torch.save(model.state_dict(), 'model_weights.pth')
Downloading: "https://download.pytorch.org/models/vgg16-397923af.pth" to /home/deep/.cache/torch/hub/checkpoints/vgg16-397923af.pth
100.0%
# Load
model = models.vgg16() # we do not specify weights, i.e. create untrained model
# weights_only=True restricts unpickling to tensors and plain containers,
# which is all a state_dict needs and avoids executing arbitrary pickled code.
model.load_state_dict(torch.load('model_weights.pth', weights_only=True))
# Switch to evaluation mode so the dropout layers behave deterministically at inference
model.eval()
VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (17): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (18): ReLU(inplace=True)
    (19): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (20): ReLU(inplace=True)
    (21): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (22): ReLU(inplace=True)
    (23): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (24): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (25): ReLU(inplace=True)
    (26): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (27): ReLU(inplace=True)
    (28): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (29): ReLU(inplace=True)
    (30): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(7, 7))
  (classifier): Sequential(
    (0): Linear(in_features=25088, out_features=4096, bias=True)
    (1): ReLU(inplace=True)
    (2): Dropout(p=0.5, inplace=False)
    (3): Linear(in_features=4096, out_features=4096, bias=True)
    (4): ReLU(inplace=True)
    (5): Dropout(p=0.5, inplace=False)
    (6): Linear(in_features=4096, out_features=1000, bias=True)
  )
)
torch.save(model, 'model.pth')
# The file contains the whole pickled module, not just tensors, so it cannot
# be read with weights_only=True (the default in recent PyTorch releases).
# Unpickling can execute arbitrary code - only load files you trust.
model = torch.load('model.pth', weights_only=False)