#
PyTorch-Tutorial
#
Tensors
- Tensors are a specialized data structure that is very similar to arrays and matrices. In PyTorch, we use tensors to encode the inputs and outputs of a model, as well as the model’s parameters.
- Benefits
  - Run on GPUs
  - Automatic differentiation
  - Share memory with numpy arrays
- Tensors are created on the CPU by default and have to be moved to the GPU explicitly
import torch
from torch.utils.data import Dataset
from torchvision import datasets, transforms
from torchvision.transforms import ToTensor, Lambda
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from torchvision.io import read_image
from torch.utils.data import DataLoader
from torch import nn
import torchvision.models as models
# Initialize a tensor directly from nested Python lists
data = [[1, 2], [3, 4]]
x = torch.tensor(data)
x
tensor([[1, 2],
[3, 4]])
# Init from numpy
# NOTE: `x` is rebound here from the tensor above to a NumPy array; `t` is the tensor view of it
x = np.array(data)
t = torch.from_numpy(x)
t
tensor([[1, 2],
[3, 4]])
# Init from another torch tensor: the new tensor retains the shape of `t`,
# while dtype is overridden to float (the values of `t` are integers)
y = torch.rand_like(t, dtype=torch.float)
y
tensor([[0.0644, 0.8114],
[0.3996, 0.9173]])
# Constant-filled tensors; the tuple argument is the desired shape
torch.ones((1, 2)), torch.zeros((3, 2))
(tensor([[1., 1.]]),
tensor([[0., 0.],
[0., 0.],
[0., 0.]]))
# Attributes: shape, element dtype, and the device the tensor is stored on
y.shape, y.dtype, y.device
(torch.Size([2, 2]), torch.float32, device(type='cpu'))
# Move to GPU. The result is not assigned, so `y` itself stays where it was.
# Fall back to the CPU when CUDA is not available instead of raising.
y.to('cuda' if torch.cuda.is_available() else 'cpu')
tensor([[0.0644, 0.8114],
[0.3996, 0.9173]], device='cuda:0')
# Indexing/Slicing: first row, first column, last column
y[0], y[:, 0], y[..., -1]
(tensor([0.0644, 0.8114]), tensor([0.0644, 0.3996]), tensor([0.8114, 0.9173]))
# Replace in place: overwrite the second column of `y` with zeros
y[:, 1] = 0
y
tensor([[0.0644, 0.0000],
[0.3996, 0.0000]])
# Join tensors: stack the rows of `t` on top of the rows of `y`
torch.cat([t, y])
tensor([[1.0000, 2.0000],
[3.0000, 4.0000],
[0.0644, 0.0000],
[0.3996, 0.0000]])
# Element-wise product (entries at the same position are multiplied)
t * y
tensor([[0.0644, 0.0000],
[1.1987, 0.0000]])
# Matrix multiplication; the integer NumPy array `x` is converted to a float tensor first
# so that both operands of `@` are float tensors
torch.from_numpy(x).type(torch.float) @ y
tensor([[0.8636, 0.0000],
[1.7915, 0.0000]])
# Scalar: reduce to a single-element tensor, then extract it as a plain Python number with .item()
torch.tensor([1, 2, 3]).sum().item()
6
# In-place ops (trailing underscore): adds 5 to every element of `y` itself
y.add_(5)
tensor([[5.0644, 5.0000],
[5.3996, 5.0000]])
# Convert to numpy (`y` is still a CPU tensor here: the earlier `.to(...)` result was never assigned)
y.numpy()
array([[5.06442 , 5. ],
[5.3995686, 5. ]], dtype=float32)
#
Datasets & Dataloaders
- Decouples dataset code and model training code
- `Dataset` stores the samples together with their corresponding labels
- `DataLoader` wraps an iterable over a `Dataset` for easy access to the samples
# Load dataset
# Both splits share the storage root, the download flag and the transform;
# only the `train` flag differs between them.
_fashion_mnist_args = {"root": "data", "download": True, "transform": ToTensor()}
training_data = datasets.FashionMNIST(train=True, **_fashion_mnist_args)
test_data = datasets.FashionMNIST(train=False, **_fashion_mnist_args)
# Visualize
# Human-readable names for the ten class indices (position in the list == class index).
labels_map = dict(enumerate([
    "T-Shirt",
    "Trouser",
    "Pullover",
    "Dress",
    "Coat",
    "Sandal",
    "Shirt",
    "Sneaker",
    "Bag",
    "Ankle Boot",
]))
figure = plt.figure(figsize=(8, 8))
cols, rows = 3, 3
for i in range(1, cols * rows + 1):
    # One randomly drawn training sample per grid cell (subplot indices are 1-based).
    sample_idx = torch.randint(len(training_data), size=(1,)).item()
    img, label = training_data[sample_idx]
    ax = figure.add_subplot(rows, cols, i)
    ax.set_title(labels_map[label])
    ax.axis("off")
    # squeeze() drops the size-1 dimensions so imshow receives a 2-D image
    ax.imshow(img.squeeze(), cmap="gray")
plt.show()
# Custom dataset
class CustomDataset(Dataset):
    """Image dataset described by a CSV annotation file plus an image directory.

    For each row of the CSV, column 0 is used as the image file name (relative
    to ``img_dir``) and column 1 as the label.
    """

    def __init__(self, file, img_dir, transform=None, target_transform=None) -> None:
        """Read the annotation table and store the image root and transforms.

        Args:
            file: Path (or buffer) handed to ``pandas.read_csv``.
            img_dir: Directory the file names in the table are joined onto.
            transform: Optional callable applied to each loaded image.
            target_transform: Optional callable applied to each label.
        """
        annotations = pd.read_csv(file)
        self.img_labels = annotations
        self.img_dir = img_dir
        self.transform, self.target_transform = transform, target_transform

    def __len__(self) -> int:
        """Return the number of rows in the annotation table."""
        return self.img_labels.shape[0]

    def __getitem__(self, idx):
        """Return the ``(image, label)`` pair for row ``idx``, after the optional transforms."""
        file_name = self.img_labels.iloc[idx, 0]
        image = read_image(os.path.join(self.img_dir, file_name))
        label = self.img_labels.iloc[idx, 1]
        # Transforms are applied only when one was supplied (truthy).
        image = self.transform(image) if self.transform else image
        label = self.target_transform(label) if self.target_transform else label
        return image, label
# Prepare for training
# Wrap both splits in shuffling loaders that yield mini-batches of 64 samples.
train_dataloader = DataLoader(training_data, shuffle=True, batch_size=64)
test_dataloader = DataLoader(test_data, shuffle=True, batch_size=64)

# Pull only the first mini-batch (64 samples) from a fresh iterator.
train_features, train_labels = next(iter(train_dataloader))
print(f"Feature batch shape: {train_features.shape}")
print(f"Labels batch shape: {train_labels.shape}")

# Display the first sample of that batch together with its label.
img, label = train_features[0].squeeze(), train_labels[0]
plt.imshow(img, cmap="gray")
plt.show()
print(f"Label: {label}")
Feature batch shape: torch.Size([64, 1, 28, 28])
Labels batch shape: torch.Size([64])
Label: 1
#
Transforms
- Use transforms to transform features, labels to whatever we desire.
- FashionMNIST features are in PIL Image format, and the labels are integers.
- For training, we need the features as normalized tensors, and the labels as one-hot encoded tensors.
- Transforms
- ToTensor - converts to a FloatTensor and scales to range [0., 1.]
- Lambda - user-defined lambda function
# Transform
ds = datasets.FashionMNIST(
    root="data",
    train=True,
    download=True,
    # Features: converted to a float tensor scaled to [0., 1.]
    transform=ToTensor(),
    # Labels: integer class -> one-hot float vector. Create a zero tensor of size 10,
    # then scatter_ writes value=1 at the index given by the label.
    target_transform=Lambda(lambda y: torch.zeros(10, dtype=torch.float).scatter_(0, torch.tensor(y), value=1))
)
#
Build NN
- The `torch.nn` namespace provides all the classes needed for building a NN
- Every module subclasses `nn.Module`
- A neural network is a module itself that consists of other modules (layers). This nested structure allows for building and managing complex architectures easily.
- To use the created model, pass the input data to the model instance; do not call `model.forward` directly
- Many layers inside a neural network are parameterized. Subclassing `nn.Module` automatically tracks all fields defined inside your model object, and makes all parameters accessible using your model’s `parameters()` or `named_parameters()` methods.
# Define NN module
class NeuralNetwork(nn.Module):
    """Fully connected classifier: flatten -> 784-512-512-10 with ReLU in between.

    ``forward`` returns the raw (un-normalised) outputs of the last linear layer.
    """

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        in_features, hidden, n_classes = 28 * 28, 512, 10
        # Layers are created in the same order they are applied.
        layers = [
            nn.Linear(in_features, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_classes),
        ]
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Flatten everything after the batch dimension and run the linear/ReLU stack."""
        return self.linear_relu_stack(self.flatten(x))
# Put the model on the GPU when one is available; otherwise stay on the CPU
# so this cell does not raise on machines without CUDA.
model = NeuralNetwork().to("cuda" if torch.cuda.is_available() else "cpu")
print(model)
NeuralNetwork(
(flatten): Flatten(start_dim=1, end_dim=-1)
(linear_relu_stack): Sequential(
(0): Linear(in_features=784, out_features=512, bias=True)
(1): ReLU()
(2): Linear(in_features=512, out_features=512, bias=True)
(3): ReLU()
(4): Linear(in_features=512, out_features=10, bias=True)
)
)
# Calling model
# Create the input on the device the model's parameters live on (instead of a
# hard-coded "cuda") so input and weights can never end up on different devices.
X = torch.rand(1, 28, 28, device=next(model.parameters()).device)
logits = model(X)  # call the instance rather than model.forward
pred_probab = nn.Softmax(dim=1)(logits)  # softmax over dim=1 (the classes), so each row sums to 1
y_pred = pred_probab.argmax(1)  # index of the most probable class
print(f"Predicted class: {y_pred.item()}")
Predicted class: 8
# Show the module tree, then each parameter's name, size and first two entries.
print(f"Model structure: {model}\n\n")
for layer_name, weights in model.named_parameters():
    print(f"Layer: {layer_name} | Size: {weights.size()} | Values : {weights[:2]} \n")
Model structure: NeuralNetwork(
(flatten): Flatten(start_dim=1, end_dim=-1)
(linear_relu_stack): Sequential(
(0): Linear(in_features=784, out_features=512, bias=True)
(1): ReLU()
(2): Linear(in_features=512, out_features=512, bias=True)
(3): ReLU()
(4): Linear(in_features=512, out_features=10, bias=True)
)
)
Layer: linear_relu_stack.0.weight | Size: torch.Size([512, 784]) | Values : tensor([[ 0.0023, -0.0056, 0.0165, ..., -0.0223, 0.0029, -0.0093],
[-0.0096, 0.0235, 0.0186, ..., -0.0205, 0.0214, -0.0350]],
device='cuda:0', grad_fn=<SliceBackward0>)
Layer: linear_relu_stack.0.bias | Size: torch.Size([512]) | Values : tensor([-0.0055, -0.0158], device='cuda:0', grad_fn=<SliceBackward0>)
Layer: linear_relu_stack.2.weight | Size: torch.Size([512, 512]) | Values : tensor([[ 0.0238, -0.0208, 0.0025, ..., 0.0394, 0.0170, -0.0157],
[-0.0373, -0.0269, 0.0017, ..., -0.0328, 0.0026, -0.0071]],
device='cuda:0', grad_fn=<SliceBackward0>)
Layer: linear_relu_stack.2.bias | Size: torch.Size([512]) | Values : tensor([ 0.0070, -0.0422], device='cuda:0', grad_fn=<SliceBackward0>)
Layer: linear_relu_stack.4.weight | Size: torch.Size([10, 512]) | Values : tensor([[ 4.2477e-02, -1.4444e-02, 1.1693e-02, ..., -6.0160e-03,
2.8273e-02, -1.6820e-02],
[-2.0542e-02, -3.2362e-02, 4.1182e-02, ..., 4.6578e-03,
3.0098e-02, 9.7865e-05]], device='cuda:0', grad_fn=<SliceBackward0>)
Layer: linear_relu_stack.4.bias | Size: torch.Size([10]) | Values : tensor([-0.0012, -0.0229], device='cuda:0', grad_fn=<SliceBackward0>)
#
Automatic differentiation
- During training, backpropagation is used to propagate the loss backwards through the network: each parameter is adjusted according to the gradient of the loss function with respect to that parameter.
- To compute those gradients, PyTorch has a built-in differentiation engine called
torch.autograd
- Computation graph for the below model is
- A function that we apply to tensors to construct the computational graph is an object of class `Function`. It computes the forward and the backward propagation steps, and the reference to the backward propagation function is stored in the `grad_fn` property of a tensor.
- To optimize the weights of the parameters in the neural network, we need to compute the derivatives of our loss function with respect to the parameters. We do this by calling `loss.backward()` and then retrieving the values from `w.grad` & `b.grad`.
  - We can only obtain the `grad` properties for the leaf nodes of the computational graph, which have the `requires_grad` property set to True.
  - We can only perform gradient calculations using `backward` once on a given graph, for performance reasons. If we need to do several calls on the same graph, we need to pass `retain_graph=True` to the `backward` call.
- We might not want gradient computations if the model is already trained and we only want to pass input values through it. We do this with a `torch.no_grad` block.
  - Speeds up computations
  - Can mark some params as frozen
- Computation graph
  - A directed acyclic graph (DAG) containing `Function` objects.
  - Leaves are the input tensors & roots are the output tensors.
  - Tracing the graph from roots to leaves, we compute the gradients.
  - Forward pass
    - Runs the operation to compute the result tensor
    - Maintains the operation's gradient function
  - Backward pass
    - Computes gradients from each `grad_fn`
    - Accumulates them in the respective tensor's `grad` attribute
    - Uses the chain rule to propagate the error to the leaves.
  - DAGs are dynamic and are recreated from scratch after every `.backward` call
- Jacobian product
  - Used when both the input and the output are tensors (the output is not a scalar); the gradient is then a Jacobian matrix, denoted by $J$
  - Instead of computing the Jacobian directly, we can compute the Jacobian product $v^T \cdot J$ by calling `backward` with $v$ passed as an argument.
  - The size of $v$ is the same as that of the original tensor with respect to which we want to compute the product (in the example below, `v` has the shape of `out`).
  - Torch accumulates gradients over multiple `backward` calls.
# A simple NN: one linear layer z = x·w + b scored against the target y
x = torch.ones(5)  # Input tensor
y = torch.zeros(3)  # Expected output
# Parameters are created with requires_grad=True so autograd tracks them.
w = torch.randn(5, 3, requires_grad=True)
b = torch.randn(3, requires_grad=True)
z = x @ w + b
loss = torch.nn.functional.binary_cross_entropy_with_logits(z, y)

# Gradient fn for the tensors: each result remembers the backward function that produced it
print(f"Gradient function for z = {z.grad_fn}")
print(f"Gradient function for loss = {loss.grad_fn}")
Gradient function for z = <AddBackward0 object at 0x7fec40126a10>
Gradient function for loss = <BinaryCrossEntropyWithLogitsBackward0 object at 0x7fec40126dd0>
# Gradient computation
# backward() computes d(loss)/d(param); the results are then read from the .grad
# attribute of the tensors created with requires_grad=True (w and b).
loss.backward()
print(w.grad)
print(b.grad)
tensor([[0.0532, 0.2955, 0.0534],
[0.0532, 0.2955, 0.0534],
[0.0532, 0.2955, 0.0534],
[0.0532, 0.2955, 0.0534],
[0.0532, 0.2955, 0.0534]])
tensor([0.0532, 0.2955, 0.0534])
# Disable gradient tracking
# 1) Default: the result depends on w and b, which require grad.
z = x @ w + b
print(z.requires_grad)

# 2) Inside torch.no_grad() the same computation is not tracked.
with torch.no_grad():
    z = x @ w + b
print(z.requires_grad)

# 3) detach() gives an untracked tensor from a tracked one.
z = x @ w + b
z_det = z.detach()
print(z_det.requires_grad)
True
False
False
# Jacobian computation
inp = torch.eye(4, 5, requires_grad=True)
out = (inp + 1).pow(2).t()
# `out` is not a scalar, so backward needs a tensor of the same shape as `out`;
# retain_graph=True keeps the graph alive so backward can be called again.
v = torch.ones_like(out)

out.backward(v, retain_graph=True)
print(f"First call\n{inp.grad}")

out.backward(v, retain_graph=True)
print(f"\nSecond call\n{inp.grad}")  # Different as torch accumulates gradients

inp.grad.zero_()  # reset the accumulated gradient before the next call
out.backward(v, retain_graph=True)
print(f"\nCall after zeroing gradients\n{inp.grad}")
First call
tensor([[4., 2., 2., 2., 2.],
[2., 4., 2., 2., 2.],
[2., 2., 4., 2., 2.],
[2., 2., 2., 4., 2.]])
Second call
tensor([[8., 4., 4., 4., 4.],
[4., 8., 4., 4., 4.],
[4., 4., 8., 4., 4.],
[4., 4., 4., 8., 4.]])
Call after zeroing gradients
tensor([[4., 2., 2., 2., 2.],
[2., 4., 2., 2., 2.],
[2., 2., 4., 2., 2.],
[2., 2., 2., 4., 2.]])
#
Optimizing parameters
- Hyperparameters
- Adjustable params that help control model optimization process
- Examples
- Epochs
- Batch size
- Learning rate
- Optimization loop
- Each iteration of the loop is called an epoch
- Consists of train & validation/test loop
- Loss function
- Measures the degree of dissimilarity of obtained result to the target value
- We want to minimize during training
- Optimizer
- Optimization is the process of adjusting model parameters to reduce model error in each training step
- Happens in 3 steps
  - Call `optimizer.zero_grad` to reset the gradients of the model params.
  - Backprop the loss with a `loss.backward()` call
  - Call `optimizer.step` to adjust the params by the collected gradients
# Hyperparameters
learning_rate = 1e-3  # passed to the SGD optimizer as `lr`
batch_size = 64  # NOTE(review): not referenced below; the DataLoaders above pass batch_size=64 literally
epochs = 5  # NOTE(review): re-assigned to 10 in the "Train and test" cell before the loop runs
# Initialize the loss function
loss_fn = nn.CrossEntropyLoss()
# Initialize the optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
# Train
def train_loop(dataloader, model, loss_fn, optimizer):
    """Run one pass over ``dataloader``, updating ``model`` after every batch.

    Args:
        dataloader: Iterable of ``(X, y)`` batches exposing a ``.dataset`` with a length.
        model: Module being trained; batches are moved to the device of its parameters.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar loss tensor.
        optimizer: Optimizer built over ``model``'s parameters.

    Prints the loss and the number of samples seen every 100 batches; returns None.
    """
    size = len(dataloader.dataset)
    # Follow the model's own device instead of hard-coding "cuda", so the loop also
    # works on CPU-only machines. A model without parameters leaves batches untouched.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None
    # Set the model to training mode - important for batch normalization and dropout layers
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        if device is not None:
            X, y = X.to(device), y.to(device)
        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)
        # Backpropagation; gradients are cleared right after the step so the next
        # batch starts from zero (torch accumulates gradients otherwise).
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        if batch % 100 == 0:
            # Separate name for the float so `loss` keeps referring to the tensor.
            loss_value, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss_value:>7f} [{current:>5d}/{size:>5d}]")
# Test
def test_loop(dataloader, model, loss_fn):
# Set the model to evaluation mode - important for batch normalization and dropout layers
model.eval()
size = len(dataloader.dataset)
num_batches = len(dataloader)
test_loss, correct = 0, 0
# Evaluating the model with torch.no_grad() ensures that no gradients are computed during test mode
with torch.no_grad():
for X, y in dataloader:
y = y.to("cuda")
pred = model(X.to("cuda"))
test_loss += loss_fn(pred, y).item()
correct += (pred.argmax(1) == y).type(torch.float).sum().item()
test_loss /= num_batches
correct /= size
print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
# Train and test
# The loss function and optimizer are re-created here so the cell can be re-run on its own.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
epochs = 10
# The loop variable is named `epoch` rather than `t` so it does not overwrite
# the tensor `t` created in the tensor section of this notebook.
for epoch in range(epochs):
    print(f"Epoch {epoch+1}\n-------------------------------")
    train_loop(train_dataloader, model, loss_fn, optimizer)
    test_loop(test_dataloader, model, loss_fn)
print("Done!")
Epoch 1
-------------------------------
loss: 2.307636 [ 64/60000]
loss: 2.285602 [ 6464/60000]
loss: 2.276582 [12864/60000]
loss: 2.259840 [19264/60000]
loss: 2.235050 [25664/60000]
loss: 2.230984 [32064/60000]
loss: 2.211891 [38464/60000]
loss: 2.190745 [44864/60000]
loss: 2.178002 [51264/60000]
loss: 2.178962 [57664/60000]
Test Error:
Accuracy: 48.6%, Avg loss: 2.155461
Epoch 2
-------------------------------
loss: 2.161340 [ 64/60000]
loss: 2.125069 [ 6464/60000]
loss: 2.112151 [12864/60000]
loss: 2.070282 [19264/60000]
loss: 2.053517 [25664/60000]
loss: 2.038269 [32064/60000]
loss: 2.022111 [38464/60000]
loss: 1.967550 [44864/60000]
loss: 1.954207 [51264/60000]
loss: 1.857311 [57664/60000]
Test Error:
Accuracy: 61.4%, Avg loss: 1.885582
Epoch 3
-------------------------------
loss: 1.869350 [ 64/60000]
loss: 1.907839 [ 6464/60000]
loss: 1.856524 [12864/60000]
loss: 1.748639 [19264/60000]
loss: 1.753882 [25664/60000]
loss: 1.721729 [32064/60000]
loss: 1.533606 [38464/60000]
loss: 1.569647 [44864/60000]
loss: 1.501750 [51264/60000]
loss: 1.503370 [57664/60000]
Test Error:
Accuracy: 62.9%, Avg loss: 1.509759
Epoch 4
-------------------------------
loss: 1.532161 [ 64/60000]
loss: 1.388002 [ 6464/60000]
loss: 1.443119 [12864/60000]
loss: 1.341603 [19264/60000]
loss: 1.281666 [25664/60000]
loss: 1.273293 [32064/60000]
loss: 1.254668 [38464/60000]
loss: 1.331026 [44864/60000]
loss: 1.283661 [51264/60000]
loss: 1.268986 [57664/60000]
Test Error:
Accuracy: 63.9%, Avg loss: 1.235178
Epoch 5
-------------------------------
loss: 1.141766 [ 64/60000]
loss: 1.192270 [ 6464/60000]
loss: 1.154393 [12864/60000]
loss: 1.196480 [19264/60000]
loss: 1.135895 [25664/60000]
loss: 1.129671 [32064/60000]
loss: 1.065386 [38464/60000]
loss: 0.970784 [44864/60000]
loss: 1.176631 [51264/60000]
loss: 1.183024 [57664/60000]
Test Error:
Accuracy: 65.1%, Avg loss: 1.068337
Epoch 6
-------------------------------
loss: 1.096738 [ 64/60000]
loss: 1.186743 [ 6464/60000]
loss: 1.166200 [12864/60000]
loss: 1.004340 [19264/60000]
loss: 1.163391 [25664/60000]
loss: 0.958063 [32064/60000]
loss: 0.989098 [38464/60000]
loss: 1.110039 [44864/60000]
loss: 0.900861 [51264/60000]
loss: 1.119866 [57664/60000]
Test Error:
Accuracy: 66.5%, Avg loss: 0.965297
Epoch 7
-------------------------------
loss: 1.007180 [ 64/60000]
loss: 0.863904 [ 6464/60000]
loss: 0.876791 [12864/60000]
loss: 0.921062 [19264/60000]
loss: 0.920073 [25664/60000]
loss: 0.901771 [32064/60000]
loss: 0.988752 [38464/60000]
loss: 0.816810 [44864/60000]
loss: 0.762370 [51264/60000]
loss: 1.075614 [57664/60000]
Test Error:
Accuracy: 67.2%, Avg loss: 0.896018
Epoch 8
-------------------------------
loss: 0.800400 [ 64/60000]
loss: 1.027741 [ 6464/60000]
loss: 0.775136 [12864/60000]
loss: 0.861781 [19264/60000]
loss: 0.763875 [25664/60000]
loss: 0.880276 [32064/60000]
loss: 0.819394 [38464/60000]
loss: 0.827268 [44864/60000]
loss: 0.653719 [51264/60000]
loss: 0.804865 [57664/60000]
Test Error:
Accuracy: 69.1%, Avg loss: 0.847345
Epoch 9
-------------------------------
loss: 0.729509 [ 64/60000]
loss: 0.788606 [ 6464/60000]
loss: 0.808654 [12864/60000]
loss: 0.825352 [19264/60000]
loss: 0.932982 [25664/60000]
loss: 0.819240 [32064/60000]
loss: 0.908761 [38464/60000]
loss: 0.676269 [44864/60000]
loss: 0.658318 [51264/60000]
loss: 0.829181 [57664/60000]
Test Error:
Accuracy: 70.0%, Avg loss: 0.808802
Epoch 10
-------------------------------
loss: 0.743573 [ 64/60000]
loss: 0.782595 [ 6464/60000]
loss: 0.963787 [12864/60000]
loss: 0.787322 [19264/60000]
loss: 0.694903 [25664/60000]
loss: 0.812387 [32064/60000]
loss: 0.831846 [38464/60000]
loss: 0.861239 [44864/60000]
loss: 0.781885 [51264/60000]
loss: 0.744422 [57664/60000]
Test Error:
Accuracy: 71.6%, Avg loss: 0.778174
Done!
#
Save and load model
- PyTorch models store the learned parameters in an internal state dictionary, called `state_dict`. These can be persisted via the `torch.save` method
- When loading model weights, we need to instantiate the model class first, because the class defines the structure of the network. We might want to save the structure of this class together with the model, in which case we can pass `model` (rather than `model.state_dict()`) to `torch.save`
  - Uses `pickle` to serialize the model
# Save
# NOTE: `model` is rebound here from the NeuralNetwork above to a VGG16 created with the
# 'IMAGENET1K_V1' weights; only its state_dict is written to disk.
model = models.vgg16(weights='IMAGENET1K_V1')
torch.save(model.state_dict(), 'model_weights.pth')
Downloading: "https://download.pytorch.org/models/vgg16-397923af.pth" to /home/deep/.cache/torch/hub/checkpoints/vgg16-397923af.pth
100.0%
# Load
model = models.vgg16() # we do not specify weights, i.e. create untrained model
# weights_only=True limits unpickling to tensors and plain containers, which is all a
# state_dict holds, so a tampered file cannot run arbitrary code while loading.
model.load_state_dict(torch.load('model_weights.pth', weights_only=True))
# Switch to evaluation mode (the printed model contains Dropout layers) before inference.
model.eval()
VGG(
(features): Sequential(
(0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(1): ReLU(inplace=True)
(2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(3): ReLU(inplace=True)
(4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(6): ReLU(inplace=True)
(7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(8): ReLU(inplace=True)
(9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(11): ReLU(inplace=True)
(12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(13): ReLU(inplace=True)
(14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(15): ReLU(inplace=True)
(16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(17): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(18): ReLU(inplace=True)
(19): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(20): ReLU(inplace=True)
(21): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(22): ReLU(inplace=True)
(23): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(24): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(25): ReLU(inplace=True)
(26): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(27): ReLU(inplace=True)
(28): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(29): ReLU(inplace=True)
(30): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
)
(avgpool): AdaptiveAvgPool2d(output_size=(7, 7))
(classifier): Sequential(
(0): Linear(in_features=25088, out_features=4096, bias=True)
(1): ReLU(inplace=True)
(2): Dropout(p=0.5, inplace=False)
(3): Linear(in_features=4096, out_features=4096, bias=True)
(4): ReLU(inplace=True)
(5): Dropout(p=0.5, inplace=False)
(6): Linear(in_features=4096, out_features=1000, bias=True)
)
)
# Save the whole model object (structure + weights) instead of only the state_dict.
torch.save(model, 'model.pth')
# A fully pickled model is an arbitrary Python object, so it must be loaded with
# weights_only=False (recent PyTorch releases default to True and refuse such files).
# Unpickling can execute arbitrary code: only load files from a trusted source.
model = torch.load('model.pth', weights_only=False)