import os # For working with files and directory paths
# Data transformation libraries
import pandas as pd
import numpy as np
import copy
# Visualization libraries
import seaborn as sns
import matplotlib.pyplot as plt
# Scikit-learn packages
from sklearn.preprocessing import LabelEncoder # For label encoding the class labels
from sklearn.model_selection import ShuffleSplit
## Other libraries helpful for CV tasks
import cv2 # OpenCV, for reading and manipulating image data
# Necessary packages for getting image editing capabilities
from PIL import Image
import PIL
from prettytable import PrettyTable # For presenting results in neatly formatted tables
from zipfile import ZipFile # For extracting .zip files
import pickle # For exporting and importing the model files
# Necessary Pytorch libraries
import torch
import torch.nn as nn
from torch.utils.data import Dataset,DataLoader
from torch.utils.data import Subset
import torchvision.transforms.functional as tf
import torch.optim as optim
import torchvision
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.models.detection import FasterRCNN_ResNet50_FPN_Weights
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
# For plotting images inside the notebook
%matplotlib inline
Mission Statement:
Identifying the car in each of the ~16,000 images in the given dataset using Computer Vision techniques.
Objectives:
Importing the necessary libraries and packages
from google.colab import drive
drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/GL_capstone_project')
Mounted at /content/drive
Step 1: Import the data.
Extracting the necessary zip files for the problem; if they are already extracted, the extraction is skipped
try:
    with ZipFile('Car Images.zip', 'r') as z:
        z.extractall()
    with ZipFile('Annotations.zip', 'r') as z:
        z.extractall()
except FileNotFoundError:
    # The zip files are absent (already extracted and removed), so skip extraction
    pass
Step 2: Map training and testing images to their classes.
# Get all the image names into a dataframe; this will later be merged with the class data on the image name
# Creating the placeholders for storing the values while reading the images in train and test data
train_images_df=pd.DataFrame()
test_images_df=pd.DataFrame()
train_car_images_list=list()
train_car_images_path_list=list()
train_car_name_make_list=list()
test_car_images_list=list()
test_car_images_path_list=list()
test_car_name_make_list=list()
# Iterating through train images folder to get image_path, image_name and car_name_make_details
for sub_folder in os.listdir('car Images'):
    if sub_folder=='Train Images':
        for car_folder in os.listdir(os.path.join('car Images','Train Images')):
            # Skipping temporary folders created at runtime and keeping only the car folders
            if car_folder!='.DS_Store' and car_folder!='.ipynb_checkpoints':
                for car_image_name in os.listdir(os.path.join('car Images/Train Images',car_folder)):
                    if car_image_name.endswith('.jpg'):
                        train_car_name_make_list.append(car_folder)
                        train_car_images_list.append(car_image_name)
                        train_car_images_path_list.append('car Images/Train Images/'+car_folder+'/'+car_image_name)
# Iterating through test images folder to get image_path, image_name and car_name_make_details
for sub_folder in os.listdir('car Images'):
    if sub_folder=='Test Images':
        for car_folder in os.listdir(os.path.join('car Images','Test Images')):
            # Skipping temporary folders created at runtime and keeping only the car folders
            if car_folder!='.DS_Store' and car_folder!='.ipynb_checkpoints':
                for car_image_name in os.listdir(os.path.join('car Images/Test Images',car_folder)):
                    if car_image_name.endswith('.jpg'):
                        test_car_name_make_list.append(car_folder)
                        test_car_images_list.append(car_image_name)
                        test_car_images_path_list.append('car Images/Test Images/'+car_folder+'/'+car_image_name)
# Storing all the extracted information into the train and test dataframes created earlier
train_images_df['image_path']=train_car_images_path_list
train_images_df['image_name']=train_car_images_list
train_images_df['car_name_make']=train_car_name_make_list
print('train_images_df shape:',train_images_df.shape)
test_images_df['image_path']=test_car_images_path_list
test_images_df['image_name']=test_car_images_list
test_images_df['car_name_make']=test_car_name_make_list
print('test_images_df shape:',test_images_df.shape)
train_images_df shape: (8144, 3)
test_images_df shape: (8041, 3)
car_name_make_df=pd.read_csv('Car+names+and+make.csv',header=None)
car_name_make_df['image_class']=list(range(1,197))
car_name_make_df.rename(columns={0: 'car_name_make'}, inplace=True)
# In the car names file, a few car names contain '/', so '/' is replaced with '-' to match the image folder names
car_name_make_df['car_name_make']=car_name_make_df['car_name_make'].apply(lambda x:x.replace('/','-'))
car_name_make_df=car_name_make_df[['image_class','car_name_make']]
print('Car names and make file shape:',car_name_make_df.shape)
# Merging with the previous dataframes on 'car_name_make' so that all details
# for train and test each live in a single dataframe
train_df=train_images_df.merge(car_name_make_df,on='car_name_make')
print('train_df shape:',train_df.shape)
test_df=test_images_df.merge(car_name_make_df,on='car_name_make')
print('test_df shape:',test_df.shape)
Car names and make file shape: (196, 2)
train_df shape: (8144, 4)
test_df shape: (8041, 4)
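As a quick sanity check (an addition, not part of the original run), we can confirm that all 196 classes survived the merge and look at the per-class image counts:
print(train_df['image_class'].nunique())                 # expected: 196
print(train_df['car_name_make'].value_counts().head())   # images per class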
Viewing sample output of both the train and test dataframes
train_df.head(5)
|   | image_path | image_name | car_name_make | image_class |
|---|---|---|---|---|
| 0 | car Images/Train Images/Dodge Dakota Crew Cab ... | 03132.jpg | Dodge Dakota Crew Cab 2010 | 90 |
| 1 | car Images/Train Images/Dodge Dakota Crew Cab ... | 03984.jpg | Dodge Dakota Crew Cab 2010 | 90 |
| 2 | car Images/Train Images/Dodge Dakota Crew Cab ... | 08090.jpg | Dodge Dakota Crew Cab 2010 | 90 |
| 3 | car Images/Train Images/Dodge Dakota Crew Cab ... | 02012.jpg | Dodge Dakota Crew Cab 2010 | 90 |
| 4 | car Images/Train Images/Dodge Dakota Crew Cab ... | 07660.jpg | Dodge Dakota Crew Cab 2010 | 90 |
test_df.head(2)
|   | image_path | image_name | car_name_make | image_class |
|---|---|---|---|---|
| 0 | car Images/Test Images/GMC Canyon Extended Cab... | 02668.jpg | GMC Canyon Extended Cab 2012 | 122 |
| 1 | car Images/Test Images/GMC Canyon Extended Cab... | 01433.jpg | GMC Canyon Extended Cab 2012 | 122 |
Step 3: Map training and testing images to their annotations.
# Reading the annotations files for both the train and test folders to get image boundary and image_class details
train_annot_df=pd.read_csv(os.path.join('Annotations/Train Annotations.csv'))
print('Train_annotations shape:',train_annot_df.shape)
test_annot_df=pd.read_csv(os.path.join('Annotations/Test Annotation.csv'))
print('Test_annotations shape:',test_annot_df.shape)
Train_annotations shape: (8144, 6)
Test_annotations shape: (8041, 6)
# Merging with the previous dataframe, joining image_name (left) to 'Image Name' (right)
train_df=train_df.merge(train_annot_df,left_on='image_name', right_on='Image Name')
# Dropping the unnecessary/repetitive columns in the final dataframe
train_df.drop(['Image Name','Image class'],axis=1,inplace=True)
# Extracting a few variables from the car_name_make column of train_df which can be helpful for EDA
train_df['car_brand']=train_df['car_name_make'].apply(lambda x:x.split()[0])
train_df['car_make_yr']=train_df['car_name_make'].apply(lambda x:x.split()[-1])
print('Final_train_data shape:',train_df.shape)
test_df=test_df.merge(test_annot_df,left_on='image_name', right_on='Image Name')
# Dropping the unnecessary/repetitive columns in the final dataframe
test_df.drop(['Image Name','Image class'],axis=1,inplace=True)
# Extracting a few variables from the car_name_make column of test_df which can be helpful for EDA
test_df['car_brand']=test_df['car_name_make'].apply(lambda x:x.split()[0])
test_df['car_make_yr']=test_df['car_name_make'].apply(lambda x:x.split()[-1])
print('Final_test_data shape:',test_df.shape)
Final_train_data shape: (8144, 10)
Final_test_data shape: (8041, 10)
Renaming the coordinate columns so the bounding-box vertices are explicit for plotting
# Renaming the image coordinate columns from the annotations file for better intuition and understanding
# The coordinates were verified against real images before settling on these vertex names for the rectangular bounding box
train_df.rename(columns={'Bounding Box coordinates':'x1',
                         'Unnamed: 2':'y1',
                         'Unnamed: 3':'x2',
                         'Unnamed: 4':'y2'},inplace=True)
test_df.rename(columns={'Bounding Box coordinates':'x1',
                        'Unnamed: 2':'y1',
                        'Unnamed: 3':'x2',
                        'Unnamed: 4':'y2'},inplace=True)
Viewing sample output and columns present in the final dataframe for both train and test data
This final dataframe should have the following details: image_path, image_name, the annotations (bounding-box vertices x1, y1, x2, y2), image_class, car_name_make, car_brand & car_make_yr
train_df.head(2)
|   | image_path | image_name | car_name_make | image_class | x1 | y1 | x2 | y2 | car_brand | car_make_yr |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | car Images/Train Images/Dodge Dakota Crew Cab ... | 03132.jpg | Dodge Dakota Crew Cab 2010 | 90 | 45 | 14 | 261 | 191 | Dodge | 2010 |
| 1 | car Images/Train Images/Dodge Dakota Crew Cab ... | 03984.jpg | Dodge Dakota Crew Cab 2010 | 90 | 8 | 19 | 289 | 180 | Dodge | 2010 |
train_df.columns
Index(['image_path', 'image_name', 'car_name_make', 'image_class', 'x1', 'y1',
'x2', 'y2', 'car_brand', 'car_make_yr'],
dtype='object')
test_df.head(2)
|   | image_path | image_name | car_name_make | image_class | x1 | y1 | x2 | y2 | car_brand | car_make_yr |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | car Images/Test Images/GMC Canyon Extended Cab... | 02668.jpg | GMC Canyon Extended Cab 2012 | 122 | 13 | 66 | 477 | 228 | GMC | 2012 |
| 1 | car Images/Test Images/GMC Canyon Extended Cab... | 01433.jpg | GMC Canyon Extended Cab 2012 | 122 | 23 | 41 | 584 | 395 | GMC | 2012 |
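Before moving on, a quick validity check (an addition, not part of the original run) that the renamed coordinates form well-formed boxes:
# Every box is expected to satisfy x1 < x2 and y1 < y2 after the rename
assert (train_df['x2'] > train_df['x1']).all() and (train_df['y2'] > train_df['y1']).all()
assert (test_df['x2'] > test_df['x1']).all() and (test_df['y2'] > test_df['y1']).all()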
Configuration
train_dataset = train_df.copy()
test_dataset = test_df.copy()
train_dir = "/content/drive/MyDrive/GL_capstone_project/car Images/Train Images"
test_dir = "/content/drive/MyDrive/GL_capstone_project/car Images/Test Images"
image_dir = "/content/drive/MyDrive/GL_capstone_project/"
np.random.seed(42)
torch.manual_seed(42)
device="cuda" if torch.cuda.is_available() else "cpu"
batch_size=4
learning_rate=3e-5
epochs=5
threshold=0.5
iou_threshold=0.8
train_df['image_name'].nunique(), test_df['image_name'].nunique()
print(f'Unique image classes = {train_df.image_class.nunique()}')
Unique image classes = 196
Creating custom dataset
- Creating a new column called `image_id` by stripping the `.jpg` extension from `image_name`
- Creating tensors as required for input to the model.
- We already have our bounding boxes as corner coordinates, so no transformation is needed there. However, the area of each bounding box is calculated as \((x_2-x_1)\times(y_2-y_1)\).
- We will use sklearn's ShuffleSplit here to split the images into train and validation sets (80-20 split).
train_df['image_id'] = train_df['image_name'].str.replace(r'\.jpg$', '', regex=True)
test_df['image_id'] = test_df['image_name'].str.replace(r'\.jpg$', '', regex=True)
"""The input to the model is expected to be a list of tensors, each of shape [C, H, W],
one for each image, and should be in the 0-1 range. Different images can have different
sizes. The behavior of the model changes depending on whether it is in training or
evaluation mode."""
class CarDataset(Dataset):
    def __init__(self,dataframe,image_dir):
        super().__init__()
        self.dataframe=dataframe
        self.img_list=sorted(self.dataframe["image_id"].unique())
        self.img_dir=image_dir
    def __len__(self):
        return len(self.img_list)
    def __getitem__(self,idx):
        img_name=self.img_list[idx]
        # Select the rows for this image_id first, so the path and the boxes always
        # refer to the same image (indexing the dataframe by idx directly would
        # mismatch rows, because img_list is sorted)
        inter=self.dataframe[self.dataframe["image_id"]==img_name]
        img_path=os.path.join(self.img_dir,inter["image_path"].iloc[0])
        img=cv2.imread(img_path)
        img=cv2.cvtColor(img,cv2.COLOR_BGR2RGB)
        img=tf.to_tensor(img)
        boxes=inter[["x1","y1","x2","y2"]].values
        area=(boxes[:,2]-boxes[:,0])*(boxes[:,3]-boxes[:,1])
        # The annotations are already in x1y1x2y2 corner format, so no conversion
        # from x0y0wh format is needed (kept here for reference):
        # boxes[:,2]=boxes[:,0]+boxes[:,2]
        # boxes[:,3]=boxes[:,1]+boxes[:,3]
        labels=torch.ones((boxes.shape[0]),dtype=torch.int64)
        iscrowd=torch.zeros((boxes.shape[0]),dtype=torch.uint8)
        target={}
        target["boxes"]=torch.as_tensor(boxes,dtype=torch.float32)
        target["area"]=torch.as_tensor(area,dtype=torch.float32)
        target["labels"]=labels
        target["iscrowd"]=iscrowd
        target["id"]=torch.tensor(idx)
        return img,target
# Note: train_dataset was copied before the image_id column was added, so the
# datasets are built from train_df, which carries image_id
train_ds=CarDataset(train_df,image_dir)
val_ds=CarDataset(train_df,image_dir)
len(train_ds)
8144
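A quick hedged check (an addition, not part of the original run) that one sample comes back in the format Faster R-CNN expects:
# Fetch a single sample: image tensor in [0, 1] with shape [C, H, W], plus targets
img, target = train_ds[0]
print(img.shape, float(img.min()), float(img.max()))
print(target["boxes"], target["labels"])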
ss=ShuffleSplit(n_splits=1,test_size=0.2,random_state=1)
indices=range(len(train_ds))
for train_idx,val_idx in ss.split(indices):
    print(f"Train dataset length: {len(train_idx)}")
    print(f"Validation dataset length: {len(val_idx)}")
Train dataset length: 6515
Validation dataset length: 1629
train_ds=Subset(train_ds,train_idx)
val_ds=Subset(val_ds,val_idx)
len(train_ds), len(val_ds)
(6515, 1629)
def show(img,boxes):
    # Convert the boxes to integer pixel coordinates and the image tensor back to HWC
    boxes=boxes.detach().numpy().astype(np.int32)
    sample=img.permute(1,2,0).numpy().copy()
    for box in boxes:
        cv2.rectangle(sample,(box[0], box[1]),(box[2], box[3]),(220, 0, 0), 3)
    plt.axis("off");
    plt.imshow(sample);
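A hedged usage example (an addition, not in the original notebook), visualizing one training sample with its ground-truth box:
# Draw the first training image with its bounding box
img, target = train_ds[0]
show(img, target["boxes"])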
DataLoader
def collate_fn(batch):
    # Detection batches hold variable-sized images and per-image targets,
    # so keep them as tuples instead of stacking into a single tensor
    return tuple(zip(*batch))
train_dl=DataLoader(train_ds,batch_size=batch_size,shuffle=True,num_workers=2,
                    pin_memory=torch.cuda.is_available(),
                    collate_fn=collate_fn)
val_dl=DataLoader(val_ds,batch_size=batch_size,shuffle=False,num_workers=2,
                  pin_memory=torch.cuda.is_available(),
                  collate_fn=collate_fn)
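As a sanity check (an addition, not part of the original run), one batch can be pulled to confirm the collate function keeps images and targets as tuples:
imgs, targets = next(iter(train_dl))
print(len(imgs), imgs[0].shape)     # batch_size image tensors of varying H, W
print(targets[0]["boxes"].shape)    # boxes for the first image in the batch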
Model
# load a model pre-trained on COCO
weights=FasterRCNN_ResNet50_FPN_Weights.DEFAULT
model=fasterrcnn_resnet50_fpn(weights=weights)
# replace the classifier with a new one that has a user-defined num_classes;
# note: torchvision detection heads reserve label 0 for background, so strictly
# this would be 196 car classes + 1 background class (kept as 196 here, matching the original run)
num_classes = 196
# get number of input features for the classifier
in_features=model.roi_heads.box_predictor.cls_score.in_features
# replace the pre-trained head with a new one
model.roi_heads.box_predictor=FastRCNNPredictor(in_channels=in_features,
                                                num_classes=num_classes)
model.to(device)
Downloading: "https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth" to /root/.cache/torch/hub/checkpoints/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth
100%|██████████| 160M/160M [00:02<00:00, 79.0MB/s]
FasterRCNN(
(transform): GeneralizedRCNNTransform(
Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
Resize(min_size=(800,), max_size=1333, mode='bilinear')
)
(backbone): BackboneWithFPN(
(body): IntermediateLayerGetter(
(conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
(bn1): FrozenBatchNorm2d(64, eps=0.0)
(relu): ReLU(inplace=True)
(maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
(layer1): Sequential(
(0): Bottleneck(
(conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): FrozenBatchNorm2d(64, eps=0.0)
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): FrozenBatchNorm2d(64, eps=0.0)
(conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): FrozenBatchNorm2d(256, eps=0.0)
(relu): ReLU(inplace=True)
(downsample): Sequential(
(0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): FrozenBatchNorm2d(256, eps=0.0)
)
)
(1): Bottleneck(
(conv1): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): FrozenBatchNorm2d(64, eps=0.0)
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): FrozenBatchNorm2d(64, eps=0.0)
(conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): FrozenBatchNorm2d(256, eps=0.0)
(relu): ReLU(inplace=True)
)
(2): Bottleneck(
(conv1): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): FrozenBatchNorm2d(64, eps=0.0)
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): FrozenBatchNorm2d(64, eps=0.0)
(conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): FrozenBatchNorm2d(256, eps=0.0)
(relu): ReLU(inplace=True)
)
)
(layer2): Sequential(
(0): Bottleneck(
(conv1): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): FrozenBatchNorm2d(128, eps=0.0)
(conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
(bn2): FrozenBatchNorm2d(128, eps=0.0)
(conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): FrozenBatchNorm2d(512, eps=0.0)
(relu): ReLU(inplace=True)
(downsample): Sequential(
(0): Conv2d(256, 512, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): FrozenBatchNorm2d(512, eps=0.0)
)
)
(1): Bottleneck(
(conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): FrozenBatchNorm2d(128, eps=0.0)
(conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): FrozenBatchNorm2d(128, eps=0.0)
(conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): FrozenBatchNorm2d(512, eps=0.0)
(relu): ReLU(inplace=True)
)
(2): Bottleneck(
(conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): FrozenBatchNorm2d(128, eps=0.0)
(conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): FrozenBatchNorm2d(128, eps=0.0)
(conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): FrozenBatchNorm2d(512, eps=0.0)
(relu): ReLU(inplace=True)
)
(3): Bottleneck(
(conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): FrozenBatchNorm2d(128, eps=0.0)
(conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): FrozenBatchNorm2d(128, eps=0.0)
(conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): FrozenBatchNorm2d(512, eps=0.0)
(relu): ReLU(inplace=True)
)
)
(layer3): Sequential(
(0): Bottleneck(
(conv1): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): FrozenBatchNorm2d(256, eps=0.0)
(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
(bn2): FrozenBatchNorm2d(256, eps=0.0)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): FrozenBatchNorm2d(1024, eps=0.0)
(relu): ReLU(inplace=True)
(downsample): Sequential(
(0): Conv2d(512, 1024, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): FrozenBatchNorm2d(1024, eps=0.0)
)
)
(1): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): FrozenBatchNorm2d(256, eps=0.0)
(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): FrozenBatchNorm2d(256, eps=0.0)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): FrozenBatchNorm2d(1024, eps=0.0)
(relu): ReLU(inplace=True)
)
(2): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): FrozenBatchNorm2d(256, eps=0.0)
(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): FrozenBatchNorm2d(256, eps=0.0)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): FrozenBatchNorm2d(1024, eps=0.0)
(relu): ReLU(inplace=True)
)
(3): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): FrozenBatchNorm2d(256, eps=0.0)
(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): FrozenBatchNorm2d(256, eps=0.0)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): FrozenBatchNorm2d(1024, eps=0.0)
(relu): ReLU(inplace=True)
)
(4): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): FrozenBatchNorm2d(256, eps=0.0)
(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): FrozenBatchNorm2d(256, eps=0.0)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): FrozenBatchNorm2d(1024, eps=0.0)
(relu): ReLU(inplace=True)
)
(5): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): FrozenBatchNorm2d(256, eps=0.0)
(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): FrozenBatchNorm2d(256, eps=0.0)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): FrozenBatchNorm2d(1024, eps=0.0)
(relu): ReLU(inplace=True)
)
)
(layer4): Sequential(
(0): Bottleneck(
(conv1): Conv2d(1024, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): FrozenBatchNorm2d(512, eps=0.0)
(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
(bn2): FrozenBatchNorm2d(512, eps=0.0)
(conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): FrozenBatchNorm2d(2048, eps=0.0)
(relu): ReLU(inplace=True)
(downsample): Sequential(
(0): Conv2d(1024, 2048, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): FrozenBatchNorm2d(2048, eps=0.0)
)
)
(1): Bottleneck(
(conv1): Conv2d(2048, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): FrozenBatchNorm2d(512, eps=0.0)
(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): FrozenBatchNorm2d(512, eps=0.0)
(conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): FrozenBatchNorm2d(2048, eps=0.0)
(relu): ReLU(inplace=True)
)
(2): Bottleneck(
(conv1): Conv2d(2048, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): FrozenBatchNorm2d(512, eps=0.0)
(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): FrozenBatchNorm2d(512, eps=0.0)
(conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): FrozenBatchNorm2d(2048, eps=0.0)
(relu): ReLU(inplace=True)
)
)
)
(fpn): FeaturePyramidNetwork(
(inner_blocks): ModuleList(
(0): Conv2dNormActivation(
(0): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))
)
(1): Conv2dNormActivation(
(0): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1))
)
(2): Conv2dNormActivation(
(0): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1))
)
(3): Conv2dNormActivation(
(0): Conv2d(2048, 256, kernel_size=(1, 1), stride=(1, 1))
)
)
(layer_blocks): ModuleList(
(0-3): 4 x Conv2dNormActivation(
(0): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
(extra_blocks): LastLevelMaxPool()
)
)
(rpn): RegionProposalNetwork(
(anchor_generator): AnchorGenerator()
(head): RPNHead(
(conv): Sequential(
(0): Conv2dNormActivation(
(0): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(1): ReLU(inplace=True)
)
)
(cls_logits): Conv2d(256, 3, kernel_size=(1, 1), stride=(1, 1))
(bbox_pred): Conv2d(256, 12, kernel_size=(1, 1), stride=(1, 1))
)
)
(roi_heads): RoIHeads(
(box_roi_pool): MultiScaleRoIAlign(featmap_names=['0', '1', '2', '3'], output_size=(7, 7), sampling_ratio=2)
(box_head): TwoMLPHead(
(fc6): Linear(in_features=12544, out_features=1024, bias=True)
(fc7): Linear(in_features=1024, out_features=1024, bias=True)
)
(box_predictor): FastRCNNPredictor(
(cls_score): Linear(in_features=1024, out_features=196, bias=True)
(bbox_pred): Linear(in_features=1024, out_features=784, bias=True)
)
)
)
for params in model.children():
    print(params)
(output omitted: it repeats the four top-level modules already shown in the model summary above — GeneralizedRCNNTransform, BackboneWithFPN, RegionProposalNetwork, and RoIHeads)
"""ConvNet as fixed feature extractor: Here, we will freeze the weights for the backbone of
the network (resnet50 with feature pyramid network). The Regional Proposal network and
Region of Interest heads will be fine tuned using transfer learning."""
# The last two children are the RPN and RoI heads; everything before them
# (transform + backbone) is frozen
classification_head=list(model.children())[-2:]
for children in list(model.children())[:-2]:
    for params in children.parameters():
        params.requires_grad=False
parameters=[]
for heads in classification_head:
    for params in heads.parameters():
        parameters.append(params)
optimizer=optim.Adam(parameters,lr=learning_rate)
lr_scheduler=optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1,
                                                  patience=8, threshold=0.0001)
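A hedged check (an addition, not in the original notebook) that only the two heads remain trainable after freezing:
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"Trainable parameters: {trainable:,} of {total:,}")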
print(classification_head)
[RegionProposalNetwork(
(anchor_generator): AnchorGenerator()
(head): RPNHead(
(conv): Sequential(
(0): Conv2dNormActivation(
(0): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(1): ReLU(inplace=True)
)
)
(cls_logits): Conv2d(256, 3, kernel_size=(1, 1), stride=(1, 1))
(bbox_pred): Conv2d(256, 12, kernel_size=(1, 1), stride=(1, 1))
)
), RoIHeads(
(box_roi_pool): MultiScaleRoIAlign(featmap_names=['0', '1', '2', '3'], output_size=(7, 7), sampling_ratio=2)
(box_head): TwoMLPHead(
(fc6): Linear(in_features=12544, out_features=1024, bias=True)
(fc7): Linear(in_features=1024, out_features=1024, bias=True)
)
(box_predictor): FastRCNNPredictor(
(cls_score): Linear(in_features=1024, out_features=196, bias=True)
(bbox_pred): Linear(in_features=1024, out_features=784, bias=True)
)
)]
Training The Fine-Tuned Model
def get_lr(optimizer):
    # Return the learning rate of the first parameter group
    for params in optimizer.param_groups:
        return params["lr"]
loss_history={"training_loss":[],
              "validation_loss":[]}
train_len=len(train_dl.dataset)
val_len=len(val_dl.dataset)
best_validation_loss=np.inf
best_weights=copy.deepcopy(model.state_dict())
for epoch in range(epochs):
    training_loss=0.0
    validation_loss=0.0
    current_lr=get_lr(optimizer)
    # During training, the model expects both the input tensors and the targets
    model.train()
    for imgs,targets in train_dl:
        imgs=[img.to(device) for img in imgs]
        targets=[{k:v.to(device) for (k,v) in d.items()} for d in targets]
        """The model returns a Dict[Tensor] during training, containing the classification
        and regression losses for both the RPN and the R-CNN."""
        loss_dict=model(imgs,targets)
        losses=sum(loss for loss in loss_dict.values())
        training_loss+=losses.item()
        optimizer.zero_grad()
        losses.backward()
        optimizer.step()
    # The model stays in train mode (under no_grad) because torchvision detection
    # models only return the loss dict in train mode
    with torch.no_grad():
        for imgs,targets in val_dl:
            imgs=[img.to(device) for img in imgs]
            targets=[{k:v.to(device) for (k,v) in d.items()} for d in targets]
            loss_dict=model(imgs,targets)
            losses=sum(loss for loss in loss_dict.values())
            validation_loss+=losses.item()
    lr_scheduler.step(validation_loss)
    if current_lr!=get_lr(optimizer):
        print("Loading best Model weights")
        model.load_state_dict(best_weights)
    if validation_loss<best_validation_loss:
        best_validation_loss=validation_loss
        best_weights=copy.deepcopy(model.state_dict())
        print("Updating Best Model weights")
    loss_history["training_loss"].append(training_loss/train_len)
    loss_history["validation_loss"].append(validation_loss/val_len)
    print(f"\n{epoch+1}/{epochs}")
    print(f"Training Loss: {training_loss/train_len}")
    print(f"Validation_loss: {validation_loss/val_len}")
    print("\n"+"*"*50)
Updating Best Model weights
1/5
Training Loss: 0.5091633841704151
Validation_loss: 0.4923232738008669
**************************************************
Updating Best Model weights
2/5
Training Loss: 0.34851808806999957
Validation_loss: 0.33369734506891724
**************************************************
Updating Best Model weights
3/5
Training Loss: 0.2804957070411487
Validation_loss: 0.29477986226863406
**************************************************
Updating Best Model weights
4/5
Training Loss: 0.2661970856332907
Validation_loss: 0.285527507111915
**************************************************
5/5
Training Loss: 0.26277425953473665
Validation_loss: 0.28928613928909896
**************************************************
sns.lineplot(x=range(epochs),y=loss_history["training_loss"],label="Train Losses");
sns.lineplot(x=range(epochs),y=loss_history["validation_loss"],label="Validation Losses");
plt.title("Training Validation Datasets Losses Plot");
plt.legend();
plt.savefig("3.jpg")
Model Save
model_weights_file="model.pth"
torch.save(best_weights,model_weights_file)
Next Steps:
- Running inference with the trained model
- Performing data augmentation and hyperparameter tuning, and trying different backbone architectures to compare losses.
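As a starting point for the inference step, a minimal hedged sketch (an addition, not part of the original run; it reuses the variables and the show() helper defined above):
# Reload the best weights and predict on one validation image
model.load_state_dict(torch.load(model_weights_file, map_location=device))
model.eval()
with torch.no_grad():
    img, target = val_ds[0]
    preds = model([img.to(device)])[0]
# Keep only detections above the confidence threshold from the Configuration section
keep = preds["scores"] > threshold
show(img, preds["boxes"][keep].cpu())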