import os # For working with files and directories from inside Python
# Data transformation libraries
import pandas as pd
import numpy as np
import copy
# Visualization libraries
import seaborn as sns
import matplotlib.pyplot as plt
# Scikit-learn packages
from sklearn.preprocessing import LabelEncoder # For label encoding the class labels
from sklearn.model_selection import ShuffleSplit
## Other libraries helpful for CV tasks
import cv2 # Computer vision package for reading and manipulating image data
# Necessary packages for getting image editing capabilities
from PIL import Image
import PIL
from prettytable import PrettyTable # To present results in formatted tables
from zipfile import ZipFile # For extracting .zip files
import pickle # For exporting and importing the model files
# Necessary Pytorch libraries
import torch
import torch.nn as nn
from torch.utils.data import Dataset,DataLoader
from torch.utils.data import Subset
import torchvision.transforms.functional as tf
import torch.optim as optim
import torchvision
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.models.detection import FasterRCNN_ResNet50_FPN_Weights
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
# For plotting images inside the notebook
%matplotlib inline
Mission Statement:
Identify the car in each image of the given dataset of ~16,000 images using Computer Vision techniques.
Objectives:
Importing necessary libraries and packages
from google.colab import drive
drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/GL_capstone_project')
Mounted at /content/drive
Step 1: Import the data.
Extracting the necessary zip files for the problem; if they have already been extracted, the extraction is skipped.
try:
    with ZipFile('Car Images.zip', 'r') as z:
        z.extractall()
    with ZipFile('Annotations.zip', 'r') as z:
        z.extractall()
except:
    pass
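The bare `except: pass` silently swallows every error, not just the "already extracted" case. A more explicit variant (a sketch, assuming the archives extract to the `car Images` and `Annotations` folders used below) would only extract when the destination folder is missing:

```python
# Illustrative alternative: extract only when the destination folder does not exist yet.
# Assumes 'Car Images.zip' -> 'car Images' and 'Annotations.zip' -> 'Annotations'.
for archive, dest in [('Car Images.zip', 'car Images'), ('Annotations.zip', 'Annotations')]:
    if not os.path.isdir(dest):
        with ZipFile(archive, 'r') as z:
            z.extractall()
```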
Step 2: Map training and testing images to their classes.
# Get all the image names into a dataframe, to be merged later with the class data on image name
# Creating the placeholders for storing the values while reading the images in train and test data
train_images_df = pd.DataFrame()
test_images_df = pd.DataFrame()
train_car_images_list = list()
train_car_images_path_list = list()
train_car_name_make_list = list()
test_car_images_list = list()
test_car_images_path_list = list()
test_car_name_make_list = list()
# Iterating through the train images folder to get image_path, image_name and car_name_make details
for sub_folder in os.listdir('car Images'):
    if sub_folder == 'Train Images':
        for car_folder in os.listdir(os.path.join('car Images', 'Train Images')):
            # Neglecting temporary folders created at runtime and taking only the required folders
            if car_folder != '.DS_Store' and car_folder != '.ipynb_checkpoints':
                for car_image_name in os.listdir(os.path.join('car Images/Train Images', car_folder)):
                    if car_image_name.split('.')[1] == 'jpg':
                        train_car_name_make_list.append(car_folder)
                        train_car_images_list.append(car_image_name)
                        train_car_images_path_list.append('car Images/Train Images/' + car_folder + '/' + car_image_name)

# Iterating through the test images folder to get image_path, image_name and car_name_make details
for sub_folder in os.listdir('car Images'):
    if sub_folder == 'Test Images':
        for car_folder in os.listdir(os.path.join('car Images', 'Test Images')):
            # Neglecting temporary folders created at runtime and taking only the required folders
            if car_folder != '.DS_Store' and car_folder != '.ipynb_checkpoints':
                for car_image_name in os.listdir(os.path.join('car Images/Test Images', car_folder)):
                    if car_image_name.split('.')[1] == 'jpg':
                        test_car_name_make_list.append(car_folder)
                        test_car_images_list.append(car_image_name)
                        test_car_images_path_list.append('car Images/Test Images/' + car_folder + '/' + car_image_name)
# Storing all the extracted information into the train and test dataframes created earlier
train_images_df['image_path'] = train_car_images_path_list
train_images_df['image_name'] = train_car_images_list
train_images_df['car_name_make'] = train_car_name_make_list
print('train_images_df shape:', train_images_df.shape)
test_images_df['image_path'] = test_car_images_path_list
test_images_df['image_name'] = test_car_images_list
test_images_df['car_name_make'] = test_car_name_make_list
print('test_images_df shape:', test_images_df.shape)
train_images_df shape: (8144, 3)
test_images_df shape: (8041, 3)
car_name_make_df = pd.read_csv('Car+names+and+make.csv', header=None)
car_name_make_df['image_class'] = list(range(1, 197))
car_name_make_df.rename(columns={0: 'car_name_make'}, inplace=True)
# In the car_name_make file, a few car names contain '/', so replacing '/' with '-' to match the image folder names
car_name_make_df['car_name_make'] = car_name_make_df['car_name_make'].apply(lambda x: x.replace('/', '-'))
car_name_make_df = car_name_make_df[['image_class', 'car_name_make']]
print('Car names and make file shape:', car_name_make_df.shape)
# Merging with the previous dataframes on 'car_name_make' to get all details
# about the train and test data into a single entity
train_df = train_images_df.merge(car_name_make_df, on='car_name_make')
print('train_df shape:', train_df.shape)
test_df = test_images_df.merge(car_name_make_df, on='car_name_make')
print('test_df shape:', test_df.shape)
Car names and make file shape: (196, 2)
train_df shape: (8144, 4)
test_df shape: (8041, 4)
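Since `merge` defaults to an inner join, a quick sanity check (illustrative, not part of the original notebook) can confirm that every image folder name matched a class and no rows were dropped:

```python
# Hypothetical sanity check: row counts unchanged and all 196 classes present
assert len(train_df) == len(train_images_df)
assert len(test_df) == len(test_images_df)
assert train_df['image_class'].nunique() == 196
```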
Viewing sample rows of both the train and test dataframes
train_df.head(5)
|   | image_path | image_name | car_name_make | image_class |
|---|---|---|---|---|
| 0 | car Images/Train Images/Dodge Dakota Crew Cab ... | 03132.jpg | Dodge Dakota Crew Cab 2010 | 90 |
| 1 | car Images/Train Images/Dodge Dakota Crew Cab ... | 03984.jpg | Dodge Dakota Crew Cab 2010 | 90 |
| 2 | car Images/Train Images/Dodge Dakota Crew Cab ... | 08090.jpg | Dodge Dakota Crew Cab 2010 | 90 |
| 3 | car Images/Train Images/Dodge Dakota Crew Cab ... | 02012.jpg | Dodge Dakota Crew Cab 2010 | 90 |
| 4 | car Images/Train Images/Dodge Dakota Crew Cab ... | 07660.jpg | Dodge Dakota Crew Cab 2010 | 90 |
test_df.head(2)
|   | image_path | image_name | car_name_make | image_class |
|---|---|---|---|---|
| 0 | car Images/Test Images/GMC Canyon Extended Cab... | 02668.jpg | GMC Canyon Extended Cab 2012 | 122 |
| 1 | car Images/Test Images/GMC Canyon Extended Cab... | 01433.jpg | GMC Canyon Extended Cab 2012 | 122 |
Step 3: Map training and testing images to their annotations.
# Reading the annotations files for both the train and test folders to get image boundary and image_class details
train_annot_df = pd.read_csv('Annotations/Train Annotations.csv')
print('Train_annotations shape:', train_annot_df.shape)
test_annot_df = pd.read_csv('Annotations/Test Annotation.csv')
print('Test_annotations shape:', test_annot_df.shape)
Train_annotations shape: (8144, 6)
Test_annotations shape: (8041, 6)
# Merging with the previous dataframes on image_name / 'Image Name'
train_df = train_df.merge(train_annot_df, left_on='image_name', right_on='Image Name')
# Dropping the unnecessary/repetitive columns from the final dataframe
train_df.drop(['Image Name', 'Image class'], axis=1, inplace=True)
# Extracting a few variables from the car_name_make column that can be helpful for EDA
train_df['car_brand'] = train_df['car_name_make'].apply(lambda x: x.split()[0])
train_df['car_make_yr'] = train_df['car_name_make'].apply(lambda x: x.split()[-1])
print('Final_train_data shape:', train_df.shape)
test_df = test_df.merge(test_annot_df, left_on='image_name', right_on='Image Name')
# Dropping the unnecessary/repetitive columns from the final dataframe
test_df.drop(['Image Name', 'Image class'], axis=1, inplace=True)
# Extracting a few variables from the car_name_make column that can be helpful for EDA
test_df['car_brand'] = test_df['car_name_make'].apply(lambda x: x.split()[0])
test_df['car_make_yr'] = test_df['car_name_make'].apply(lambda x: x.split()[-1])
print('Final_test_data shape:', test_df.shape)
Final_train_data shape: (8144, 10)
Final_test_data shape: (8041, 10)
Renaming the columns properly to get the correct vertex details for plotting the bounding box
# Renaming the image coordinate columns from the annotations file for better intuition and understanding
# The coordinates were checked against real images before settling on these vertex names for the rectangular bounding box
train_df.rename(columns={'Bounding Box coordinates': 'x1',
                         'Unnamed: 2': 'y1',
                         'Unnamed: 3': 'x2',
                         'Unnamed: 4': 'y2'}, inplace=True)
test_df.rename(columns={'Bounding Box coordinates': 'x1',
                        'Unnamed: 2': 'y1',
                        'Unnamed: 3': 'x2',
                        'Unnamed: 4': 'y2'}, inplace=True)
Viewing sample rows and the columns present in the final dataframes for both train and test data.
The final dataframes should contain the following details: image_path, image_name, annotations (bounding-box vertices x1, y1, x2, y2), image_class, car_name_make, car_brand & car_make_yr.
train_df.head(2)
|   | image_path | image_name | car_name_make | image_class | x1 | y1 | x2 | y2 | car_brand | car_make_yr |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | car Images/Train Images/Dodge Dakota Crew Cab ... | 03132.jpg | Dodge Dakota Crew Cab 2010 | 90 | 45 | 14 | 261 | 191 | Dodge | 2010 |
| 1 | car Images/Train Images/Dodge Dakota Crew Cab ... | 03984.jpg | Dodge Dakota Crew Cab 2010 | 90 | 8 | 19 | 289 | 180 | Dodge | 2010 |
train_df.columns
Index(['image_path', 'image_name', 'car_name_make', 'image_class', 'x1', 'y1',
'x2', 'y2', 'car_brand', 'car_make_yr'],
dtype='object')
test_df.head(2)
|   | image_path | image_name | car_name_make | image_class | x1 | y1 | x2 | y2 | car_brand | car_make_yr |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | car Images/Test Images/GMC Canyon Extended Cab... | 02668.jpg | GMC Canyon Extended Cab 2012 | 122 | 13 | 66 | 477 | 228 | GMC | 2012 |
| 1 | car Images/Test Images/GMC Canyon Extended Cab... | 01433.jpg | GMC Canyon Extended Cab 2012 | 122 | 23 | 41 | 584 | 395 | GMC | 2012 |
Configuration
train_dataset = train_df.copy()
test_dataset = test_df.copy()

train_dir = "/content/drive/MyDrive/GL_capstone_project/car Images/Train Images"
test_dir = "/content/drive/MyDrive/GL_capstone_project/car Images/Test Images"
image_dir = "/content/drive/MyDrive/GL_capstone_project/"

np.random.seed(42)
torch.manual_seed(42)

device = "cuda" if torch.cuda.is_available() else "cpu"

batch_size = 4
learning_rate = 3e-5
epochs = 5

threshold = 0.5
iou_threshold = 0.8
train_df['image_name'].nunique(), test_df['image_name'].nunique()
print(f'Unique image classes = {train_df.image_class.nunique()}')
Unique image classes = 196
Creating custom dataset
- Creating a new column called image_id by stripping the '.jpg' extension from image_name.
- Creating tensors as required by the input to the model.
- We already have our bounding boxes as corner coordinates, so we don't need to perform any transformation there. However, the area of each bounding box is calculated as \((x2-x1)*(y2-y1)\) (a quick numeric check follows this list).
- We will use sklearn's ShuffleSplit function here to split the images into train and validation sets (80-20 split).
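As a quick numeric check of the area formula, using the first annotated box shown in `train_df.head()` above:

```python
# Box from the first train_df row: x1=45, y1=14, x2=261, y2=191
x1, y1, x2, y2 = 45, 14, 261, 191
print((x2 - x1) * (y2 - y1))  # 38232 square pixels
```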
train_df['image_id'] = train_df['image_name'].str.replace(r'\.jpg$', '', regex=True)
test_df['image_id'] = test_df['image_name'].str.replace(r'\.jpg$', '', regex=True)
"""The input to the model is expected to be a list of tensors, each of shape [C, H, W],
one for each image, and should be in 0-1 range. Different images can have different
sizes.The behavior of the model changes depending if it is in training or evaluation
mode."""
class CarDataset(Dataset):
def __init__(self,dataframe,image_dir):
super().__init__()
self.dataframe=dataframe
self.img_list=sorted(self.dataframe["image_id"].unique())
self.img_dir=image_dir
def __len__(self):
return len(self.img_list)
def __getitem__(self,idx):
=self.img_list[idx]
img_name=os.path.join(self.img_dir,self.dataframe["image_path"][idx])
img_path=cv2.imread(img_path)
img=cv2.cvtColor(img,cv2.COLOR_BGR2RGB)
img=tf.to_tensor(img)
img
=self.dataframe[self.dataframe["image_id"]==self.img_list[idx]]
inter=inter[["x1","y1","x2","y2"]].values
boxes=(boxes[:,2]-boxes[:,0])*(boxes[:,3]-boxes[:,1])
area
# converting bounding box from x0y0wh format to x0y0x1y1 format
# boxes[:,2]=boxes[:,0]+boxes[:,2]
# boxes[:,3]=boxes[:,1]+boxes[:,3]
=torch.ones((boxes.shape[0]),dtype=torch.int64)
labels=torch.zeros((boxes.shape[0]),dtype=torch.uint8)
iscrowd
={}
target"boxes"]=torch.as_tensor(boxes,dtype=torch.float32)
target["area"]=torch.as_tensor(area,dtype=torch.float32)
target["labels"]=labels
target["iscrowd"]=iscrowd
target["id"]=torch.tensor(idx)
target[
return img,target
train_ds = CarDataset(train_dataset, image_dir)
val_ds = CarDataset(train_dataset, image_dir)  # same source dataframe; disjoint indices are applied below via Subset
len(train_ds)
8144
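A minimal sketch (not in the original notebook) for inspecting a single sample before wiring up the dataloaders:

```python
# Fetch one sample: a [C, H, W] float tensor in [0, 1] plus its target dict
img, target = train_ds[0]
print(img.shape, img.dtype)  # e.g. torch.Size([3, H, W]) torch.float32
print(target["boxes"], target["labels"])
```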
ss = ShuffleSplit(n_splits=1, test_size=0.2, random_state=1)

indices = range(len(train_ds))
for train_idx, val_idx in ss.split(indices):
    print(f"Train dataset length: {len(train_idx)}")
    print(f"Validation dataset length: {len(val_idx)}")
Train dataset length: 6515
Validation dataset length: 1629
train_ds = Subset(train_ds, train_idx)
val_ds = Subset(val_ds, val_idx)
len(train_ds), len(val_ds)
(6515, 1629)
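Optionally, one can verify that the two subsets do not overlap (an illustrative check, not from the original notebook):

```python
# ShuffleSplit should produce non-overlapping train/validation index sets
assert set(train_idx).isdisjoint(set(val_idx))
```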
def show(img, boxes):
    boxes = boxes.detach().numpy().astype(np.int32)
    sample = img.permute(1, 2, 0).numpy().copy()

    for box in boxes:
        cv2.rectangle(sample, (box[0], box[1]), (box[2], box[3]), (220, 0, 0), 3)

    plt.axis("off");
    plt.imshow(sample);
DataLoader
def collate_fn(batch):
return tuple(zip(*batch))
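Detection batches cannot be stacked into one tensor because the images (and the number of boxes per image) vary in size, so `collate_fn` simply regroups the list of `(img, target)` pairs into a tuple of images and a tuple of targets. A toy illustration with placeholder values:

```python
# tuple(zip(*batch)) transposes a list of pairs into a pair of tuples
batch = [("img_a", {"boxes": "boxes_a"}), ("img_b", {"boxes": "boxes_b"})]
imgs, targets = collate_fn(batch)
print(imgs)     # ('img_a', 'img_b')
print(targets)  # ({'boxes': 'boxes_a'}, {'boxes': 'boxes_b'})
```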
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=2,
                      pin_memory=torch.cuda.is_available(),
                      collate_fn=collate_fn)
val_dl = DataLoader(val_ds, batch_size=batch_size, shuffle=False, num_workers=2,
                    pin_memory=torch.cuda.is_available(),
                    collate_fn=collate_fn)
Model
# Load a model pre-trained on COCO
weights = FasterRCNN_ResNet50_FPN_Weights.DEFAULT
model = fasterrcnn_resnet50_fpn(weights=weights)

# Replace the classifier with a new one that has a
# user-defined number of classes: 196 car classes
num_classes = 196

# Get the number of input features for the classifier
in_features = model.roi_heads.box_predictor.cls_score.in_features

# Replace the pre-trained head with a new one
model.roi_heads.box_predictor = FastRCNNPredictor(in_channels=in_features,
                                                  num_classes=num_classes)

model.to(device)
Downloading: "https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth" to /root/.cache/torch/hub/checkpoints/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth
100%|██████████| 160M/160M [00:02<00:00, 79.0MB/s]
FasterRCNN(
(transform): GeneralizedRCNNTransform(
Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
Resize(min_size=(800,), max_size=1333, mode='bilinear')
)
(backbone): BackboneWithFPN(
(body): IntermediateLayerGetter(
(conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
(bn1): FrozenBatchNorm2d(64, eps=0.0)
(relu): ReLU(inplace=True)
(maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
(layer1): Sequential(
(0): Bottleneck(
(conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): FrozenBatchNorm2d(64, eps=0.0)
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): FrozenBatchNorm2d(64, eps=0.0)
(conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): FrozenBatchNorm2d(256, eps=0.0)
(relu): ReLU(inplace=True)
(downsample): Sequential(
(0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): FrozenBatchNorm2d(256, eps=0.0)
)
)
(1): Bottleneck(
(conv1): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): FrozenBatchNorm2d(64, eps=0.0)
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): FrozenBatchNorm2d(64, eps=0.0)
(conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): FrozenBatchNorm2d(256, eps=0.0)
(relu): ReLU(inplace=True)
)
(2): Bottleneck(
(conv1): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): FrozenBatchNorm2d(64, eps=0.0)
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): FrozenBatchNorm2d(64, eps=0.0)
(conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): FrozenBatchNorm2d(256, eps=0.0)
(relu): ReLU(inplace=True)
)
)
(layer2): Sequential(
(0): Bottleneck(
(conv1): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): FrozenBatchNorm2d(128, eps=0.0)
(conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
(bn2): FrozenBatchNorm2d(128, eps=0.0)
(conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): FrozenBatchNorm2d(512, eps=0.0)
(relu): ReLU(inplace=True)
(downsample): Sequential(
(0): Conv2d(256, 512, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): FrozenBatchNorm2d(512, eps=0.0)
)
)
(1): Bottleneck(
(conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): FrozenBatchNorm2d(128, eps=0.0)
(conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): FrozenBatchNorm2d(128, eps=0.0)
(conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): FrozenBatchNorm2d(512, eps=0.0)
(relu): ReLU(inplace=True)
)
(2): Bottleneck(
(conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): FrozenBatchNorm2d(128, eps=0.0)
(conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): FrozenBatchNorm2d(128, eps=0.0)
(conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): FrozenBatchNorm2d(512, eps=0.0)
(relu): ReLU(inplace=True)
)
(3): Bottleneck(
(conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): FrozenBatchNorm2d(128, eps=0.0)
(conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): FrozenBatchNorm2d(128, eps=0.0)
(conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): FrozenBatchNorm2d(512, eps=0.0)
(relu): ReLU(inplace=True)
)
)
(layer3): Sequential(
(0): Bottleneck(
(conv1): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): FrozenBatchNorm2d(256, eps=0.0)
(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
(bn2): FrozenBatchNorm2d(256, eps=0.0)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): FrozenBatchNorm2d(1024, eps=0.0)
(relu): ReLU(inplace=True)
(downsample): Sequential(
(0): Conv2d(512, 1024, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): FrozenBatchNorm2d(1024, eps=0.0)
)
)
(1): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): FrozenBatchNorm2d(256, eps=0.0)
(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): FrozenBatchNorm2d(256, eps=0.0)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): FrozenBatchNorm2d(1024, eps=0.0)
(relu): ReLU(inplace=True)
)
(2): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): FrozenBatchNorm2d(256, eps=0.0)
(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): FrozenBatchNorm2d(256, eps=0.0)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): FrozenBatchNorm2d(1024, eps=0.0)
(relu): ReLU(inplace=True)
)
(3): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): FrozenBatchNorm2d(256, eps=0.0)
(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): FrozenBatchNorm2d(256, eps=0.0)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): FrozenBatchNorm2d(1024, eps=0.0)
(relu): ReLU(inplace=True)
)
(4): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): FrozenBatchNorm2d(256, eps=0.0)
(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): FrozenBatchNorm2d(256, eps=0.0)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): FrozenBatchNorm2d(1024, eps=0.0)
(relu): ReLU(inplace=True)
)
(5): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): FrozenBatchNorm2d(256, eps=0.0)
(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): FrozenBatchNorm2d(256, eps=0.0)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): FrozenBatchNorm2d(1024, eps=0.0)
(relu): ReLU(inplace=True)
)
)
(layer4): Sequential(
(0): Bottleneck(
(conv1): Conv2d(1024, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): FrozenBatchNorm2d(512, eps=0.0)
(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
(bn2): FrozenBatchNorm2d(512, eps=0.0)
(conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): FrozenBatchNorm2d(2048, eps=0.0)
(relu): ReLU(inplace=True)
(downsample): Sequential(
(0): Conv2d(1024, 2048, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): FrozenBatchNorm2d(2048, eps=0.0)
)
)
(1): Bottleneck(
(conv1): Conv2d(2048, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): FrozenBatchNorm2d(512, eps=0.0)
(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): FrozenBatchNorm2d(512, eps=0.0)
(conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): FrozenBatchNorm2d(2048, eps=0.0)
(relu): ReLU(inplace=True)
)
(2): Bottleneck(
(conv1): Conv2d(2048, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): FrozenBatchNorm2d(512, eps=0.0)
(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): FrozenBatchNorm2d(512, eps=0.0)
(conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): FrozenBatchNorm2d(2048, eps=0.0)
(relu): ReLU(inplace=True)
)
)
)
(fpn): FeaturePyramidNetwork(
(inner_blocks): ModuleList(
(0): Conv2dNormActivation(
(0): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))
)
(1): Conv2dNormActivation(
(0): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1))
)
(2): Conv2dNormActivation(
(0): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1))
)
(3): Conv2dNormActivation(
(0): Conv2d(2048, 256, kernel_size=(1, 1), stride=(1, 1))
)
)
(layer_blocks): ModuleList(
(0-3): 4 x Conv2dNormActivation(
(0): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
(extra_blocks): LastLevelMaxPool()
)
)
(rpn): RegionProposalNetwork(
(anchor_generator): AnchorGenerator()
(head): RPNHead(
(conv): Sequential(
(0): Conv2dNormActivation(
(0): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(1): ReLU(inplace=True)
)
)
(cls_logits): Conv2d(256, 3, kernel_size=(1, 1), stride=(1, 1))
(bbox_pred): Conv2d(256, 12, kernel_size=(1, 1), stride=(1, 1))
)
)
(roi_heads): RoIHeads(
(box_roi_pool): MultiScaleRoIAlign(featmap_names=['0', '1', '2', '3'], output_size=(7, 7), sampling_ratio=2)
(box_head): TwoMLPHead(
(fc6): Linear(in_features=12544, out_features=1024, bias=True)
(fc7): Linear(in_features=1024, out_features=1024, bias=True)
)
(box_predictor): FastRCNNPredictor(
(cls_score): Linear(in_features=1024, out_features=196, bias=True)
(bbox_pred): Linear(in_features=1024, out_features=784, bias=True)
)
)
)
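A quick check (illustrative, not from the original notebook) confirms the new predictor head matches the repr above:

```python
# cls_score has 196 class outputs; bbox_pred has 196 * 4 = 784 box-regression outputs
print(model.roi_heads.box_predictor.cls_score.out_features)  # 196
print(model.roi_heads.box_predictor.bbox_pred.out_features)  # 784
```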
for child in model.children():
    print(child)
GeneralizedRCNNTransform(
Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
Resize(min_size=(800,), max_size=1333, mode='bilinear')
)
BackboneWithFPN(
(body): IntermediateLayerGetter(
(conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
(bn1): FrozenBatchNorm2d(64, eps=0.0)
(relu): ReLU(inplace=True)
(maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
(layer1): Sequential(
(0): Bottleneck(
(conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): FrozenBatchNorm2d(64, eps=0.0)
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): FrozenBatchNorm2d(64, eps=0.0)
(conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): FrozenBatchNorm2d(256, eps=0.0)
(relu): ReLU(inplace=True)
(downsample): Sequential(
(0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): FrozenBatchNorm2d(256, eps=0.0)
)
)
(1): Bottleneck(
(conv1): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): FrozenBatchNorm2d(64, eps=0.0)
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): FrozenBatchNorm2d(64, eps=0.0)
(conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): FrozenBatchNorm2d(256, eps=0.0)
(relu): ReLU(inplace=True)
)
(2): Bottleneck(
(conv1): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): FrozenBatchNorm2d(64, eps=0.0)
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): FrozenBatchNorm2d(64, eps=0.0)
(conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): FrozenBatchNorm2d(256, eps=0.0)
(relu): ReLU(inplace=True)
)
)
(layer2): Sequential(
(0): Bottleneck(
(conv1): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): FrozenBatchNorm2d(128, eps=0.0)
(conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
(bn2): FrozenBatchNorm2d(128, eps=0.0)
(conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): FrozenBatchNorm2d(512, eps=0.0)
(relu): ReLU(inplace=True)
(downsample): Sequential(
(0): Conv2d(256, 512, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): FrozenBatchNorm2d(512, eps=0.0)
)
)
(1): Bottleneck(
(conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): FrozenBatchNorm2d(128, eps=0.0)
(conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): FrozenBatchNorm2d(128, eps=0.0)
(conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): FrozenBatchNorm2d(512, eps=0.0)
(relu): ReLU(inplace=True)
)
(2): Bottleneck(
(conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): FrozenBatchNorm2d(128, eps=0.0)
(conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): FrozenBatchNorm2d(128, eps=0.0)
(conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): FrozenBatchNorm2d(512, eps=0.0)
(relu): ReLU(inplace=True)
)
(3): Bottleneck(
(conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): FrozenBatchNorm2d(128, eps=0.0)
(conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): FrozenBatchNorm2d(128, eps=0.0)
(conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): FrozenBatchNorm2d(512, eps=0.0)
(relu): ReLU(inplace=True)
)
)
(layer3): Sequential(
(0): Bottleneck(
(conv1): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): FrozenBatchNorm2d(256, eps=0.0)
(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
(bn2): FrozenBatchNorm2d(256, eps=0.0)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): FrozenBatchNorm2d(1024, eps=0.0)
(relu): ReLU(inplace=True)
(downsample): Sequential(
(0): Conv2d(512, 1024, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): FrozenBatchNorm2d(1024, eps=0.0)
)
)
(1): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): FrozenBatchNorm2d(256, eps=0.0)
(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): FrozenBatchNorm2d(256, eps=0.0)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): FrozenBatchNorm2d(1024, eps=0.0)
(relu): ReLU(inplace=True)
)
(2): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): FrozenBatchNorm2d(256, eps=0.0)
(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): FrozenBatchNorm2d(256, eps=0.0)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): FrozenBatchNorm2d(1024, eps=0.0)
(relu): ReLU(inplace=True)
)
(3): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): FrozenBatchNorm2d(256, eps=0.0)
(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): FrozenBatchNorm2d(256, eps=0.0)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): FrozenBatchNorm2d(1024, eps=0.0)
(relu): ReLU(inplace=True)
)
(4): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): FrozenBatchNorm2d(256, eps=0.0)
(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): FrozenBatchNorm2d(256, eps=0.0)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): FrozenBatchNorm2d(1024, eps=0.0)
(relu): ReLU(inplace=True)
)
(5): Bottleneck(
(conv1): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): FrozenBatchNorm2d(256, eps=0.0)
(conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): FrozenBatchNorm2d(256, eps=0.0)
(conv3): Conv2d(256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): FrozenBatchNorm2d(1024, eps=0.0)
(relu): ReLU(inplace=True)
)
)
(layer4): Sequential(
(0): Bottleneck(
(conv1): Conv2d(1024, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): FrozenBatchNorm2d(512, eps=0.0)
(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
(bn2): FrozenBatchNorm2d(512, eps=0.0)
(conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): FrozenBatchNorm2d(2048, eps=0.0)
(relu): ReLU(inplace=True)
(downsample): Sequential(
(0): Conv2d(1024, 2048, kernel_size=(1, 1), stride=(2, 2), bias=False)
(1): FrozenBatchNorm2d(2048, eps=0.0)
)
)
(1): Bottleneck(
(conv1): Conv2d(2048, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): FrozenBatchNorm2d(512, eps=0.0)
(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): FrozenBatchNorm2d(512, eps=0.0)
(conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): FrozenBatchNorm2d(2048, eps=0.0)
(relu): ReLU(inplace=True)
)
(2): Bottleneck(
(conv1): Conv2d(2048, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn1): FrozenBatchNorm2d(512, eps=0.0)
(conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): FrozenBatchNorm2d(512, eps=0.0)
(conv3): Conv2d(512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False)
(bn3): FrozenBatchNorm2d(2048, eps=0.0)
(relu): ReLU(inplace=True)
)
)
)
(fpn): FeaturePyramidNetwork(
(inner_blocks): ModuleList(
(0): Conv2dNormActivation(
(0): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))
)
(1): Conv2dNormActivation(
(0): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1))
)
(2): Conv2dNormActivation(
(0): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1))
)
(3): Conv2dNormActivation(
(0): Conv2d(2048, 256, kernel_size=(1, 1), stride=(1, 1))
)
)
(layer_blocks): ModuleList(
(0-3): 4 x Conv2dNormActivation(
(0): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
(extra_blocks): LastLevelMaxPool()
)
)
RegionProposalNetwork(
(anchor_generator): AnchorGenerator()
(head): RPNHead(
(conv): Sequential(
(0): Conv2dNormActivation(
(0): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(1): ReLU(inplace=True)
)
)
(cls_logits): Conv2d(256, 3, kernel_size=(1, 1), stride=(1, 1))
(bbox_pred): Conv2d(256, 12, kernel_size=(1, 1), stride=(1, 1))
)
)
RoIHeads(
(box_roi_pool): MultiScaleRoIAlign(featmap_names=['0', '1', '2', '3'], output_size=(7, 7), sampling_ratio=2)
(box_head): TwoMLPHead(
(fc6): Linear(in_features=12544, out_features=1024, bias=True)
(fc7): Linear(in_features=1024, out_features=1024, bias=True)
)
(box_predictor): FastRCNNPredictor(
(cls_score): Linear(in_features=1024, out_features=196, bias=True)
(bbox_pred): Linear(in_features=1024, out_features=784, bias=True)
)
)
"""ConvNet as fixed feature extractor: Here, we will freeze the weights for the backbone of
the network (resnet50 with feature pyramid network). The Regional Proposal network and
Region of Interest heads will be fine tuned using transfer learning."""
=list(model.children())[-2:]
classification_head
for children in list(model.children())[:-2]:
for params in children.parameters():
=False
params.requires_grad
=[]
parametersfor heads in classification_head:
for params in heads.parameters():
parameters.append(params)
=optim.Adam(parameters,lr=learning_rate)
optimizer=optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1,
lr_scheduler=8, threshold=0.0001)
patience
print(classification_head)
[RegionProposalNetwork(
(anchor_generator): AnchorGenerator()
(head): RPNHead(
(conv): Sequential(
(0): Conv2dNormActivation(
(0): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
(1): ReLU(inplace=True)
)
)
(cls_logits): Conv2d(256, 3, kernel_size=(1, 1), stride=(1, 1))
(bbox_pred): Conv2d(256, 12, kernel_size=(1, 1), stride=(1, 1))
)
), RoIHeads(
(box_roi_pool): MultiScaleRoIAlign(featmap_names=['0', '1', '2', '3'], output_size=(7, 7), sampling_ratio=2)
(box_head): TwoMLPHead(
(fc6): Linear(in_features=12544, out_features=1024, bias=True)
(fc7): Linear(in_features=1024, out_features=1024, bias=True)
)
(box_predictor): FastRCNNPredictor(
(cls_score): Linear(in_features=1024, out_features=196, bias=True)
(bbox_pred): Linear(in_features=1024, out_features=784, bias=True)
)
)]
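To confirm the freeze took effect, a small sketch (not part of the original notebook) counts trainable versus total parameters:

```python
# Only RPN + RoI head parameters should remain trainable after freezing the backbone
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"Trainable parameters: {trainable:,} of {total:,}")
```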
Training The Fine-Tuned Model
def get_lr(optimizer):
    for params in optimizer.param_groups:
        return params["lr"]

loss_history = {"training_loss": [],
                "validation_loss": []}
train_len = len(train_dl.dataset)
val_len = len(val_dl.dataset)

best_validation_loss = np.inf
best_weights = copy.deepcopy(model.state_dict())

for epoch in range(epochs):
    training_loss = 0.0
    validation_loss = 0.0
    current_lr = get_lr(optimizer)

    # During training, the model expects both the input tensors and the targets
    model.train()
    for imgs, targets in train_dl:
        imgs = [img.to(device) for img in imgs]
        targets = [{k: v.to(device) for (k, v) in d.items()} for d in targets]

        # The model returns a Dict[Tensor] during training, containing the classification
        # and regression losses for both the RPN and the R-CNN
        loss_dict = model(imgs, targets)
        losses = sum(loss for loss in loss_dict.values())
        training_loss += losses.item()

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

    # The model is kept in train mode for validation as well, because Faster R-CNN
    # returns the loss dict only in train mode; gradients are disabled instead
    with torch.no_grad():
        for imgs, targets in val_dl:
            imgs = [img.to(device) for img in imgs]
            targets = [{k: v.to(device) for (k, v) in d.items()} for d in targets]

            loss_dict = model(imgs, targets)
            losses = sum(loss for loss in loss_dict.values())
            validation_loss += losses.item()

    lr_scheduler.step(validation_loss)
    if current_lr != get_lr(optimizer):
        print("Loading best Model weights")
        model.load_state_dict(best_weights)
    if validation_loss < best_validation_loss:
        best_validation_loss = validation_loss
        best_weights = copy.deepcopy(model.state_dict())
        print("Updating Best Model weights")
    loss_history["training_loss"].append(training_loss/train_len)
    loss_history["validation_loss"].append(validation_loss/val_len)

    print(f"\n{epoch+1}/{epochs}")
    print(f"Training Loss: {training_loss/train_len}")
    print(f"Validation_loss: {validation_loss/val_len}")
    print("\n" + "*"*50)
Updating Best Model weights
1/5
Training Loss: 0.5091633841704151
Validation_loss: 0.4923232738008669
**************************************************
Updating Best Model weights
2/5
Training Loss: 0.34851808806999957
Validation_loss: 0.33369734506891724
**************************************************
Updating Best Model weights
3/5
Training Loss: 0.2804957070411487
Validation_loss: 0.29477986226863406
**************************************************
Updating Best Model weights
4/5
Training Loss: 0.2661970856332907
Validation_loss: 0.285527507111915
**************************************************
5/5
Training Loss: 0.26277425953473665
Validation_loss: 0.28928613928909896
**************************************************
sns.lineplot(x=range(epochs), y=loss_history["training_loss"], label="Train Losses");
sns.lineplot(x=range(epochs), y=loss_history["validation_loss"], label="Validation Losses");
plt.title("Training and Validation Loss Plot");
plt.legend();
plt.savefig("3.jpg")
Model Save
="model.pth"
model_weights_file torch.save(best_weights,model_weights_file)
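Although inference is listed under Next Steps, a hedged sketch of the intended workflow might look like the following. It reuses the `threshold` (score cutoff) and `iou_threshold` (NMS) values defined in the Configuration section; the exact post-processing is an assumption, not part of the original notebook.

```python
from torchvision.ops import nms

# Reload the best weights and switch to eval mode, where the model returns predictions
model.load_state_dict(torch.load(model_weights_file, map_location=device))
model.eval()

img, _ = val_ds[0]  # any image tensor in [0, 1]
with torch.no_grad():
    pred = model([img.to(device)])[0]

keep = pred["scores"] > threshold          # drop low-confidence detections
boxes, scores = pred["boxes"][keep], pred["scores"][keep]
keep = nms(boxes, scores, iou_threshold)   # suppress heavily overlapping boxes
show(img, boxes[keep].cpu())
```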
Next Steps:
- Running inference with the trained model.
- Performing data augmentation and hyperparameter tuning, and trying different backbone architectures to compare losses.