import numpy as np
import pandas as pd
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader, random_split, Subset
import os
import random
import cv2
import keras
from tensorflow.keras.utils import load_img, img_to_array, array_to_img
from keras.preprocessing.image import ImageDataGenerator
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
cuda:0
num_sample = 9
root = "/kaggle/input/birds23sp/birds/train/"
classes = []
with open("/kaggle/input/birds23sp/birds/names.txt") as f:
classes.extend(f.readlines())
plt.figure(figsize = (20, 10))
for i in range(num_sample):
random_class = random.randint(0, 554)
class_name = classes[random_class].strip().split("\n")[0]
class_path = os.path.join(root, str(random_class))
files = os.listdir(class_path)
image = random.choice(files)
image_path = os.path.join(class_path, image)
image = load_img(image_path)
plt.subplot(3, 3, i + 1)
plt.imshow(image)
plt.title(class_name)
The class distribution is uneven: the largest class has roughly ten times as many images as the smallest, and the standard deviation of the per-class counts is large.
path = "/kaggle/input/birds23sp/birds/train/"
num = 555
class_num = []
for i in range(num):
    class_path = os.path.join(path, str(i))
    files = os.listdir(class_path)
    class_num.append(len(files))
plt.figure(figsize=(5, 5))
plt.xlabel("Images per class")
plt.ylabel("Number of classes")
plt.title("Class size distribution")
plt.hist(class_num)
print(f"total images: {np.sum(class_num)}")
print(f"min class: {classes[np.argmin(class_num)].strip()}, num: {min(class_num)}")
print(f"max class: {classes[np.argmax(class_num)].strip()}, num: {max(class_num)}")
print(f"standard deviation: {np.std(class_num)}")
total images: 38562
min class: Dark-eyed Junco (White-winged), num: 10
max class: White-winged Dove, num: 106
standard deviation: 22.06846640893006
height_list = []
width_list = []
for i in range(num):
    class_path = os.path.join(path, str(i))
    files = os.listdir(class_path)
    for file in files:
        image_path = os.path.join(class_path, file)
        height, width = cv2.imread(image_path).shape[:2]
        height_list.append(height)
        width_list.append(width)
print(f"avg image height: {np.mean(height_list)}")
print(f"avg image width: {np.mean(width_list)}")
print(f"min image width: {min(width_list)}")
print(f"max image width: {max(width_list)}")
print(f"standard deviation: {np.std(width_list)}")
plt.figure(figsize=(5, 5))
plt.xlabel("Image width")
plt.ylabel("Number of images")
plt.title("Image width distribution")
plt.hist(width_list)
avg image height: 711.886001763394
avg image width: 898.0834500285255
min image width: 90
max image width: 1024
standard deviation: 173.71986785146
(array([6.0000e+00, 1.9000e+01, 1.4000e+02, 4.9600e+02, 1.3120e+03, 3.5760e+03, 2.4650e+03, 5.5080e+03, 1.5090e+03, 2.3531e+04]), array([ 90. , 183.4, 276.8, 370.2, 463.6, 557. , 650.4, 743.8, 837.2, 930.6, 1024. ]), <BarContainer object of 10 artists>)
img = load_img("/kaggle/input/birds23sp/birds/train/0/0b23d29cb6364a33a450f1f4fca010ac.jpg")
#plt.imshow(img)
x = img_to_array(img)
x = x.reshape((1,) + x.shape)  # add a batch dimension for datagen.flow
datagen = ImageDataGenerator(rotation_range=40,
                             width_shift_range=0.2,
                             height_shift_range=0.2,
                             shear_range=0.2,
                             zoom_range=0.2,
                             fill_mode="nearest",
                             horizontal_flip=True)  # horizontal_flip expects a boolean, not a probability
os.makedirs("/kaggle/working/augmented_data", exist_ok=True)
i = 0
for img_batch in datagen.flow(x, batch_size=3, save_to_dir="/kaggle/working/augmented_data"):
    img_batch = img_batch.reshape(img_batch.shape[1:])  # drop the batch dimension
    image = array_to_img(img_batch)
    plt.subplot(3, 3, i + 1)
    plt.imshow(image)
    i += 1
    if i >= 9:
        break
# Class-level inverse-frequency weights (one entry per class). Note that
# WeightedRandomSampler draws *indices* according to `weights`, so class-level
# weights alone cannot be used directly as a sampler over the image dataset.
sample_weights = []
for i in range(num):
    sample_weights.append(1 / class_num[i])
sampler = torch.utils.data.WeightedRandomSampler(weights=sample_weights, num_samples=555 * 70)
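Since the sampler draws dataset indices, oversampling rare classes needs one weight per image rather than per class. A minimal sketch of the per-sample construction, assuming the same directory layout (`root` and `class_num` are defined above):

# Sketch: per-sample weights, each image weighted by the inverse size of its class.
targets = torchvision.datasets.ImageFolder(root).targets  # one class index per image
per_sample_weights = [1 / class_num[label] for label in targets]
per_sample_sampler = torch.utils.data.WeightedRandomSampler(weights=per_sample_weights,
                                                            num_samples=len(targets))

get_bird_data below builds exactly this kind of per-sample weighting from the training split.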
Our data preprocessing pipeline below handles image augmentation and the train/validation split.
def get_bird_data(batch_size=64, train_val_pct=0.2):
    transform_train = transforms.Compose([
        transforms.Resize(224),
        transforms.RandomCrop(224, padding=8, padding_mode='edge'),  # Take 224x224 crops from padded images
        transforms.RandomHorizontalFlip(),  # 50% of the time, flip the image horizontally
        transforms.RandomAffine(30, translate=(0.2, 0.2), scale=(0.15, 0.25), shear=0.2),
        transforms.RandomRotation(degrees=(0, 180)),
        transforms.RandomGrayscale(),
        transforms.GaussianBlur(kernel_size=35),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    transform_test = transforms.Compose([
        transforms.Resize(224),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    # Build two ImageFolder views of the same directory so train and val get different
    # transforms. (Setting trainset.dataset.transform and valset.dataset.transform on a
    # single shared dataset would just overwrite one transform with the other.)
    dataset_train = torchvision.datasets.ImageFolder(root='/kaggle/input/birds23sp/birds/train', transform=transform_train)
    dataset_val = torchvision.datasets.ImageFolder(root='/kaggle/input/birds23sp/birds/train', transform=transform_test)
    num_data = len(dataset_train)
    num_val = int(num_data * train_val_pct)
    num_train = num_data - num_val
    indices = torch.randperm(num_data).tolist()
    trainset = Subset(dataset_train, indices[:num_train])
    valset = Subset(dataset_val, indices[num_train:])
    # Inverse-frequency per-sample weights, computed from the labels alone (no image loading)
    train_targets = [dataset_train.targets[i] for i in trainset.indices]
    class_counts = [0] * 555
    for label in train_targets:
        class_counts[label] += 1
    sample_weights = np.array([1 / class_counts[label] for label in train_targets])
    sampler = torch.utils.data.WeightedRandomSampler(weights=sample_weights, num_samples=len(trainset))
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, num_workers=2, pin_memory=True, sampler=sampler)
    valloader = torch.utils.data.DataLoader(valset, batch_size=1, shuffle=False, num_workers=2, pin_memory=True)
    testset = torchvision.datasets.ImageFolder(root='/kaggle/input/birds23sp/birds/test', transform=transform_test)
    testloader = torch.utils.data.DataLoader(testset, batch_size=1, shuffle=False, num_workers=2, pin_memory=True)
    classes = open("/kaggle/input/birds23sp/birds/names.txt").read().strip().split("\n")
    # Backward mapping to original class ids (from folder names) and species names (from names.txt)
    class_to_idx = dataset_train.class_to_idx
    idx_to_class = {int(v): int(k) for k, v in class_to_idx.items()}
    idx_to_name = {k: classes[v] for k, v in idx_to_class.items()}
    return {'train': trainloader, 'val': valloader, 'test': testloader, 'to_class': idx_to_class, 'to_name': idx_to_name}
data = get_bird_data()
trainloader = data['train']
# Get a batch of images from the trainloader
images, labels = next(iter(trainloader))
# Visualize the images (note: they are normalized, so the colors will look shifted)
fig, axes = plt.subplots(figsize=(20, 10), ncols=5)
for i, ax in enumerate(axes):
    ax.imshow(images[i].permute(1, 2, 0))  # Transpose image tensor from CxHxW to HxWxC
    ax.axis('off')
    ax.set_title(data['to_name'][labels[i].item()])
plt.tight_layout()
plt.show()
Our final version of the pipeline does not use the weighted sampler, because we found it actually decreased the performance of our model's predictions on the leaderboard.
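With the sampler disabled, the train loader inside get_bird_data reduces to a plain shuffled DataLoader; `sampler` and `shuffle=True` are mutually exclusive, so the swap is one line (a minimal sketch over the same `trainset`):

# Sketch: plain shuffled loader, used when the weighted sampler is turned off.
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True,
                                          num_workers=2, pin_memory=True)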
def train(net, trainloader, valloader, epochs=10, start_epoch=0, lr=0.01, momentum=0.9, decay=0.0005,
          verbose=1, print_every=10, state=None, schedule={}, checkpoint_path=None):
    net.to(device)
    losses = []
    losses_val = []
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=lr, momentum=momentum, weight_decay=decay)
    # optimizer = optim.Adam(net.parameters(), lr=lr, weight_decay=decay)

    # Load previous training state
    if state:
        net.load_state_dict(state['net'])
        optimizer.load_state_dict(state['optimizer'])
        start_epoch = state['epoch']
        losses = state['losses']
        losses_val = state['losses_val']

    # Fast forward lr schedule through already trained epochs
    for epoch in range(start_epoch):
        if epoch in schedule:
            print("Learning rate: %f" % schedule[epoch])
            for g in optimizer.param_groups:
                g['lr'] = schedule[epoch]

    for epoch in range(start_epoch, epochs):
        net.train()  # re-enable training mode; validation below switches to eval
        sum_loss = 0.0
        running_loss = 0.0
        sum_loss_val = 0.0

        # Update learning rate when scheduled
        if epoch in schedule:
            print("Learning rate: %f" % schedule[epoch])
            for g in optimizer.param_groups:
                g['lr'] = schedule[epoch]

        num_train = 0
        for i, batch in enumerate(trainloader, 0):
            inputs, labels = batch[0].to(device), batch[1].to(device)
            optimizer.zero_grad()
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()   # autograd magic, computes all the partial derivatives
            optimizer.step()  # takes a step in gradient direction
            losses.append(loss.item())
            sum_loss += loss.item()
            running_loss += loss.item()
            num_train += 1
            if i % print_every == print_every - 1:  # print every print_every mini-batches
                if verbose:
                    print('[epoch %d, batch %d] train loss: %.3f' % (epoch, i + 1, running_loss / print_every))
                running_loss = 0.0

        net.eval()
        with torch.no_grad():
            num_val = 0
            for inputs, labels in valloader:
                inputs = inputs.to(device)  # Move inputs to the device
                labels = labels.to(device)
                outputs = net(inputs)  # Forward pass
                loss = criterion(outputs, labels)
                sum_loss_val += loss.item()
                num_val += 1
        losses_val.append(sum_loss_val / num_val)

        if verbose:
            print('[epoch %d] train loss: %.3f, val loss: %.3f' % (epoch, sum_loss / num_train, sum_loss_val / num_val))
        if checkpoint_path:
            state = {'epoch': epoch + 1, 'net': net.state_dict(), 'optimizer': optimizer.state_dict(),
                     'losses': losses, 'losses_val': losses_val}  # save losses_val too, so resuming works
            torch.save(state, checkpoint_path + 'checkpoint-%d.pth' % (epoch + 1))
    return losses, losses_val
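A typical call, with an illustrative step-decay schedule and checkpoint directory (a sketch, with net standing for any of the models adapted below):

# Sketch: train for 10 epochs, dropping the learning rate at epochs 5 and 8,
# and write a checkpoint after every epoch.
losses, losses_val = train(net, data['train'], data['val'], epochs=10,
                           schedule={0: 0.01, 5: 0.001, 8: 0.0001},
                           checkpoint_path='/kaggle/working/')
# Resuming later from a saved state:
# state = torch.load('/kaggle/working/checkpoint-5.pth')
# losses, losses_val = train(net, data['train'], data['val'], epochs=10, state=state)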
Download MaxViT: Multi-Axis Vision Transformer (tiny version). Change the output size of the last layer to 555.
maxvit = torchvision.models.maxvit_t(weights='DEFAULT')
maxvit.classifier[5] = nn.Linear(512, 555)
print(maxvit)
/opt/conda/lib/python3.10/site-packages/torch/functional.py:504: UserWarning: torch.meshgrid: in an upcoming release, it will be required to pass the indexing argument. (Triggered internally at /usr/local/src/pytorch/aten/src/ATen/native/TensorShape.cpp:3483.)
  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
Downloading: "https://download.pytorch.org/models/maxvit_t-bc5ab103.pth" to /root/.cache/torch/hub/checkpoints/maxvit_t-bc5ab103.pth
100%|██████████| 119M/119M [00:01<00:00, 90.8MB/s]
MaxVit(
  (stem): Sequential( Conv2dNormActivation(3 -> 64, stride 2) + Conv2dNormActivation(64 -> 64) )
  (blocks): ModuleList( 4 MaxVitBlocks of 2, 2, 5, 2 MaxVitLayers (MBConv + window attention + grid attention), channels 64 -> 128 -> 256 -> 512, ... )
  (classifier): Sequential(
    (0): AdaptiveAvgPool2d(output_size=1)
    (1): Flatten(start_dim=1, end_dim=-1)
    (2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (3): Linear(in_features=512, out_features=512, bias=True)
    (4): Tanh()
    (5): Linear(in_features=512, out_features=555, bias=True)
  )
)
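A quick sanity check (a sketch, not part of the original run) confirms the resized head emits one logit per class:

# Sketch: a dummy 224x224 batch should map to 555 logits.
maxvit.eval()
with torch.no_grad():
    print(maxvit(torch.randn(1, 3, 224, 224)).shape)  # torch.Size([1, 555])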
Download ConvNeXt ("A ConvNet for the 2020s", small version). Change the output size of the last layer to 555.
convnext = torchvision.models.convnext_small(weights='DEFAULT')
convnext.classifier[2] = nn.Linear(768, 555)
print(convnext)
Downloading: "https://download.pytorch.org/models/convnext_small-0c510722.pth" to /root/.cache/torch/hub/checkpoints/convnext_small-0c510722.pth 100%|██████████| 192M/192M [00:02<00:00, 91.0MB/s]
ConvNeXt(
  (features): Sequential( stem Conv2d(3, 96, kernel_size=(4, 4), stride=(4, 4)) + LayerNorm2d, then CNBlock stages of depth 3, 3, 27, 3 with channels 96 -> 192 -> 384 -> 768 and LayerNorm2d/Conv2d downsampling between stages, ... )
  (avgpool): AdaptiveAvgPool2d(output_size=1)
  (classifier): Sequential(
    (0): LayerNorm2d((768,), eps=1e-06, elementwise_affine=True)
    (1): Flatten(start_dim=1, end_dim=-1)
    (2): Linear(in_features=768, out_features=555, bias=True)
  )
)
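Before choosing between backbones it helps to compare their sizes; a small sketch over the two models adapted so far:

# Sketch: total parameter counts of the adapted models.
for name, model in [('maxvit_t', maxvit), ('convnext_small', convnext)]:
    n_params = sum(p.numel() for p in model.parameters())
    print(f'{name}: {n_params / 1e6:.1f}M parameters')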
Download a pretrained Swin Transformer V2 (the tiny variant from "Swin Transformer V2: Scaling Up Capacity and Resolution") and replace the classification head with a 555-way linear layer.
swin = torchvision.models.swin_v2_t(weights='DEFAULT')
swin.head = nn.Linear(768, 555)
print(swin)
Downloading: "https://download.pytorch.org/models/swin_v2_t-b137f0e2.pth" to /root/.cache/torch/hub/checkpoints/swin_v2_t-b137f0e2.pth 100%|██████████| 109M/109M [00:00<00:00, 147MB/s]
(SwinTransformerV2-Tiny printout: a 4×4 Conv2d patch embedding to 96 channels, then four stages of SwinTransformerBlockV2 (depths 2, 2, 6, 2; widths 96 → 192 → 384 → 768) separated by PatchMergingV2 layers, with stochastic depth rising from p=0.0 to p=0.2, ending in LayerNorm, AdaptiveAvgPool2d, Flatten, and the replaced Linear(768, 555) head.)
# losses, losses_val = train(maxvit, data['train'], data['val'], epochs=10, checkpoint_path='/kaggle/working/', print_every=100, lr=0.005)
cp_max = torch.load('/kaggle/input/birds-classification-models/maxvit-t-checkpoint-8.pth')
maxvit.load_state_dict(cp_max['net'])
<All keys matched successfully>
# losses, losses_val = train(convnext, data['train'], data['val'], epochs=15, checkpoint_path='/kaggle/working/', print_every=100, lr=0.005)
cp_conv = torch.load('/kaggle/input/birds-classification-models/convnext-small-checkpoint-8.pth')
convnext.load_state_dict(cp_conv['net'])
<All keys matched successfully>
# losses, losses_val = train(swin, data['train'], data['val'], epochs=15, checkpoint_path='/kaggle/working/', print_every=100, lr=0.005)
cp_swin = torch.load('/kaggle/input/birds-classification-models/swin-v2-t-checkpoint-7.pth')
swin.load_state_dict(cp_swin['net'])
<All keys matched successfully>
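The three loads above deserialize CUDA tensors and therefore assume a GPU. A minimal, more portable sketch (same checkpoint path, reusing the `device` set at the top of the notebook):
# cp = torch.load('/kaggle/input/birds-classification-models/maxvit-t-checkpoint-8.pth',
#                 map_location=device)  # also works on CPU-only runtimes
# maxvit.load_state_dict(cp['net'])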
def smooth(x, size):
    # Moving-average smoothing, useful for plotting noisy per-batch loss curves.
    return np.convolve(x, np.ones(size)/size, mode='valid')
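A hypothetical usage sketch: `losses` is an assumed variable holding the per-batch training losses that the (commented-out) `train` calls above would return, so the lines stay commented here.
# plt.plot(smooth(losses, 50))  # 50-batch moving average
# plt.xlabel("Iteration")
# plt.ylabel("Training loss")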
def compute_weight_2(net1, net2):
    # Grid-search the ensemble weight alpha in [0.05, 0.95] (step 0.05) that
    # maximizes validation accuracy of alpha*net1 + (1 - alpha)*net2.
    net1.to(device)
    net1.eval()
    net2.to(device)
    net2.eval()
    accuracies = []
    alphas = []
    with torch.no_grad():  # inference only; no gradients needed
        for alpha in range(5, 96, 5):
            print(alpha)
            num_correct = 0
            total = 0
            for i, batch in enumerate(data['val'], 0):
                if i % 100 == 0: print(i, end=' ')
                inputs, labels = batch[0].to(device), batch[1].to(device)
                outputs1 = net1(inputs)
                outputs2 = net2(inputs)
                # Weighted sum of the two models' logits.
                outputs = outputs1 * alpha / 100 + outputs2 * (1 - alpha / 100)
                _, pred = torch.max(outputs.data, 1)
                for p, l in zip(pred, labels):
                    total += 1
                    if p == l: num_correct += 1
                if i == 999: break  # evaluate on at most 1000 validation batches
            print()
            accuracies.append(num_correct / total)
            alphas.append(alpha / 100)
    # Map accuracy -> alpha and sort by accuracy, so the last entry is the best
    # weight (alphas with identical accuracy collapse to the last one seen).
    weights = {accuracies[i]: alphas[i] for i in range(len(accuracies))}
    sortedWeights = dict(sorted(weights.items()))
    print(sortedWeights)
    return sortedWeights
The grid search shows that the best logit weighting between MaxViT and ConvNeXt is 0.6 : 0.4.
# weights = compute_weight_2(maxvit, convnext)
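The returned dict is sorted by accuracy, so the best alpha is the last entry. A minimal sketch, assuming `weights` holds the return value of the (commented-out) call above:
# best_acc, best_alpha = max(weights.items())  # dict keys are accuracies
# print(f"net1 weight: {best_alpha}, val accuracy: {best_acc:.4f}")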
def compute_weight_3(net1, net2, net3, weight1):
    # With the net1:net2 blend fixed at weight1 : (1 - weight1), grid-search the
    # weight alpha given to that pair versus net3.
    net1.to(device)
    net1.eval()
    net2.to(device)
    net2.eval()
    net3.to(device)
    net3.eval()
    accuracies = []
    alphas = []
    with torch.no_grad():
        for alpha in range(5, 96, 5):
            print(alpha)
            num_correct = 0
            total = 0
            for i, batch in enumerate(data['val'], 0):
                if i % 100 == 0: print(i, end=' ')
                inputs, labels = batch[0].to(device), batch[1].to(device)
                outputs1 = net1(inputs)
                outputs2 = net2(inputs)
                # Fixed two-model blend found by compute_weight_2.
                outputs12 = outputs1 * weight1 + outputs2 * (1 - weight1)
                outputs3 = net3(inputs)
                outputs = outputs12 * alpha / 100 + outputs3 * (1 - alpha / 100)
                _, pred = torch.max(outputs.data, 1)
                for p, l in zip(pred, labels):
                    total += 1
                    if p == l: num_correct += 1
                if i == 999: break  # evaluate on at most 1000 validation batches
            print()
            accuracies.append(num_correct / total)
            alphas.append(alpha / 100)
    weights = {accuracies[i]: alphas[i] for i in range(len(accuracies))}
    sortedWeights = dict(sorted(weights.items()))
    print(sortedWeights)
    return sortedWeights
The result shows that the best effective weighting among MaxViT, ConvNeXt, and Swin is 0.39 : 0.26 : 0.35 (with weight1 = 0.6 and the best alpha = 0.65, the pair's weights expand to 0.6 × 0.65 = 0.39 and 0.4 × 0.65 = 0.26, leaving 0.35 for Swin).
# weights = compute_weight_3(maxvit, convnext, swin, 0.6)
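A quick sketch of how the nested two-way weights expand into the three effective coefficients (values taken from the searches above):
weight1, alpha = 0.6, 0.65            # from compute_weight_2 and compute_weight_3
w_maxvit   = weight1 * alpha          # 0.6 * 0.65 = 0.39
w_convnext = (1 - weight1) * alpha    # 0.4 * 0.65 = 0.26
w_swin     = 1 - alpha                # 0.35
print(w_maxvit, w_convnext, w_swin)   # the three weights sum to 1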
Prediction using one model.
def predict(net, dataloader, ofname):
    # Write one "path,class" row per test image. Assumes batch_size = 1,
    # since dataset.samples is indexed by the batch index i.
    out = open(ofname, 'w')
    out.write("path,class\n")
    net.to(device)
    net.eval()
    with torch.no_grad():
        for i, (images, labels) in enumerate(dataloader, 0):
            if i % 100 == 0:
                print(i)
            images, labels = images.to(device), labels.to(device)
            outputs = net(images)
            _, predicted = torch.max(outputs.data, 1)
            fname, _ = dataloader.dataset.samples[i]
            out.write("test/{},{}\n".format(fname.split('/')[-1], data['to_class'][predicted.item()]))
    out.close()
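A hypothetical single-model submission call, mirroring the ensemble call at the end (the output file name is illustrative):
# predict(maxvit, data['test'], "submissions_maxvit_t.csv")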
Prediction using two models.
def predict_ensemble_2(net1, net2, weight, dataloader, ofname):
    # Same as predict(), but blends the two models' logits with the given weight.
    out = open(ofname, 'w')
    out.write("path,class\n")
    net1.to(device)
    net1.eval()
    net2.to(device)
    net2.eval()
    with torch.no_grad():
        for i, (images, labels) in enumerate(dataloader, 0):
            if i % 100 == 0:
                print(i)
            images, labels = images.to(device), labels.to(device)
            outputs1 = net1(images)
            outputs2 = net2(images)
            outputs = outputs1 * weight + outputs2 * (1 - weight)
            _, predicted = torch.max(outputs.data, 1)
            fname, _ = dataloader.dataset.samples[i]
            out.write("test/{},{}\n".format(fname.split('/')[-1], data['to_class'][predicted.item()]))
    out.close()
Prediction using three models.
def predict_ensemble_3(net1, net2, net3, weight1, weight2, dataloader, ofname):
    # Three-model ensemble: blend net1/net2 with weight1, then blend that
    # result against net3 with weight2.
    out = open(ofname, 'w')
    out.write("path,class\n")
    net1.to(device)
    net1.eval()
    net2.to(device)
    net2.eval()
    net3.to(device)
    net3.eval()
    with torch.no_grad():
        for i, (images, labels) in enumerate(dataloader, 0):
            if i % 100 == 0:
                print(i)
            images, labels = images.to(device), labels.to(device)
            outputs1 = net1(images)
            outputs2 = net2(images)
            outputs12 = outputs1 * weight1 + outputs2 * (1 - weight1)
            outputs3 = net3(images)
            outputs = outputs12 * weight2 + outputs3 * (1 - weight2)
            _, predicted = torch.max(outputs.data, 1)
            fname, _ = dataloader.dataset.samples[i]
            out.write("test/{},{}\n".format(fname.split('/')[-1], data['to_class'][predicted.item()]))
    out.close()
Final submission. This three-model ensemble reaches 88% accuracy on the public leaderboard, ranking first.
predict_ensemble_3(maxvit, convnext, swin, 0.6, 0.65, data['test'], "submissions_ensemble_maxvit_t_convnext_s_swin_v2_t.csv")
(progress output truncated: batch indices 0 through 9900, printed every 100 batches)