Confirmation on loss volatility when training a ViT with a Masked Autoencoder (MAE)

Hi, I am training a ViT (patch 16, 224, ImageNet-pretrained) backbone on satellite imagery (the Million-AID dataset, ~900,000 images of varying sizes) in a self-supervised fashion using a Masked Autoencoder (MAE). The loss I get during training is very volatile, and I wanted to confirm whether this behavior is normal when training a ViT in general, or when training a ViT with the MAE approach specifically.

I have pasted the code below in case I have made a mistake. I am using lightly-ssl to simplify training the MAE.

# ==============================================================================
# MAE Implementation : THE PAPER HAS TWO DIFFERENT SETTINGS FOR PRETRAINING & FINETUNING
# What the paper uses | Any changes that deviate from the paper
# Backbone : ViT-L/16 | ViT-B/16
#   Decoder : depth - 8 blocks, Width - 512d
#   Encoder : w/o mask tokens
# Optimizer
#   linear lr scaling (cosine decay) : lr = base_lr x bs / 256, base_lr = 1.5e-3/1e-3, warmup = 40/5 epochs
#   AdamW : Beta1,2 = 0.9,0.95 / 0.9,0.999
# Batch Size : 4096/1024 | 1024/1024
# Loss func : MSE
# Masking : 75%
# Augmentation : RandomResizedCrop
#   Crop Size : 224x224
# Fine-tuning : 50 epochs vs Pretraining : 200 epochs
#   Note that these are for ViT-L
# ==============================================================================
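# Worked example of the lr scaling rule above, using the pretraining values from the table
# (base_lr = 1.5e-3, effective batch size = 1024):
#   lr = base_lr * bs / 256 = 1.5e-3 * 1024 / 256 = 6.0e-3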

import torch
from torch import nn
import pytorch_lightning as pl
from timm.models.vision_transformer import VisionTransformer
from lightly.models.modules import MAEDecoderTIMM, MaskedVisionTransformerTIMM
from lightly.models.utils import (
    get_at_index,
    patchify,
    random_token_mask,
    repeat_token,
    set_at_index,
)
# Assuming the scheduler comes from lightning-bolts; swap in your own source if it lives elsewhere
from pl_bolts.optimizers.lr_scheduler import LinearWarmupCosineAnnealingLR

# `config` is assumed to be loaded elsewhere (e.g. from a YAML file)
mae_params = config["mae_params"]

class MaeBBViT(pl.LightningModule):
    def __init__(self, model_params:dict, backbone:VisionTransformer):
        """
        Inputs 
            - model_params: dictionary containing model parameters such as lr, batch_size
            - backbone : vit model with last layer removed 
        """
        super().__init__()

        # Saving hyperparameters
        hyper_dict = {}
        hyper_dict.update(mae_params)
        hyper_dict.update(model_params)
        self.save_hyperparameters(hyper_dict)

        self.model_params = model_params

        self.backbone = backbone # we will need this later

        self.mask_ratio = mae_params["mask_ratio"] # 0.75
        self.patch_size = backbone.patch_embed.patch_size[0] #(16,16) so index [0] returns 16
        self.masked_encoder = MaskedVisionTransformerTIMM(vit = backbone)
        self.sequence_length = self.masked_encoder.sequence_length #197
        self.decoder = MAEDecoderTIMM(
            num_patches = backbone.patch_embed.num_patches, #196
            patch_size = self.patch_size, #16
            embed_dim = backbone.embed_dim, #768
            decoder_embed_dim = mae_params["decoder_dim"], #512
            decoder_depth = mae_params["decoder_depth"], #8
            mlp_ratio = 4.0,
            proj_drop_rate = 0.0, # dropout rate in the projection head
            attn_drop_rate = 0.0 # dropout rate in the attention layers
        )
        self.criterion = nn.MSELoss()

        self.apply_lr_scheduler = not model_params["lr"] # use the warmup + cosine scheduler only when no fixed lr is given

    def forward_encoder(self, images, idx_keep = None):
        # shape returned by .encode = (bs, num_unmasked_tokens, embed_dim)
        # Only the unmasked tokens are actually run through the encoder:
        # if 50 of the 197 tokens (196 patches + CLS) are kept, the output has 50 tokens
        return self.masked_encoder.encode(images = images, idx_keep = idx_keep) #(*,3,224,224) -> (*,50,768)

    def forward_decoder(self, x_encoded, idx_keep, idx_mask):
        # Build decoder input
        batch_size = x_encoded.shape[0]

        x_decode = self.decoder.embed(x_encoded) #(*,50,512) where 50 is the number of NOT masked tokens and 512 is the decoder dim declared above

        x_masked = repeat_token(
            self.decoder.mask_token,
            (batch_size, self.sequence_length)
        ) #(*,197,512) where 197 is the full sequence length and 512 is the decoder dim declared above

        x_masked = set_at_index(x_masked, idx_keep, x_decode.type_as(x_masked)) #(*,197,512)

        # decoder forward pass
        x_decoded = self.decoder.decode(x_masked) #(*,197,512) 

        # Predict pixel values for masked tokens
        x_pred = get_at_index(x_decoded, idx_mask) #(*,147,512)
        x_pred = self.decoder.predict(x_pred) #(*,147,768)

        return x_pred

    def training_step(self, batch, batch_idx):
        X, y, f = batch
        images = X[0] # (*,3,224,224) there is only a single view, but it is wrapped in a list
        batch_size = images.shape[0]
        # Split the token indices into kept and masked ones. Remember there are 197 tokens including ...
        # the CLS token (14*14 + 1); 14 = 224/16 patches per side, each patch being 16x16 pixels
        idx_keep, idx_mask = random_token_mask(
            size = (batch_size, self.sequence_length),
            mask_ratio = self.mask_ratio,
            device = images.device
        ) #(*,50),(*,147)
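        # Sanity check on the split (numbers taken from the shape comments in this snippet):
        # with mask_ratio = 0.75 and 197 tokens, 147 tokens are masked and 50 are kept
        # (the CLS token plus 49 image patches)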
        x_encoded = self.forward_encoder(
            images = images, 
            idx_keep = idx_keep
        ) #(*,50,768) where 50 is the number of NOT masked tokens
        x_pred = self.forward_decoder(
            x_encoded = x_encoded, 
            idx_keep = idx_keep, 
            idx_mask = idx_mask
        ) #(*,147,768) where 147 is the number of masked patches

        # get image patches for masked tokens
        patches = patchify(images, self.patch_size) #(*,196,768)
        # shift idx_mask by one because the patchified image has no class token
        target = get_at_index(patches, idx_mask - 1) #(*, 147, 768)

        self.loss = self.criterion(x_pred, target) # scalar MSE over the masked patches
        return self.loss

    def on_train_epoch_end(self) -> None:
        # NOTE: self.loss holds the loss of the *last* batch of the epoch, not an epoch average
        self.log("training loss", self.loss)

        if self.apply_lr_scheduler:
            self.log("current lr", self.scheduler.get_lr()[0])
        else:
            self.log("current lr", self.model_params["lr"])

    def configure_optimizers(self):
        if self.apply_lr_scheduler:
            optimizer = torch.optim.AdamW(
                params= self.parameters(), 
                lr = mae_params["base_lr"] * self.model_params["eff_batch_size"] / 256, 
                weight_decay=mae_params["weight_decay"]
            )
            self.scheduler = LinearWarmupCosineAnnealingLR(
                optimizer = optimizer, 
                warmup_epochs=mae_params["warmup_epochs"], # linearly ramp up the lr, then decay it with a cosine schedule as in the paper
                max_epochs=self.model_params["epochs"], 
                warmup_start_lr=mae_params["base_lr"], # NOTE: warmup starts at base_lr (not 0) and ramps up to the scaled lr set on the optimizer
                eta_min=mae_params["eta_min"] #* we keep eta_min at 0 as the DINO paper does not indicate a value
            )
            return [optimizer],[{"scheduler" : self.scheduler, "interval" : "epoch"}] 
        else:
            optimizer = torch.optim.AdamW(
                params = self.parameters(), lr = self.model_params["lr"], weight_decay=mae_params["weight_decay"]
            )
            return optimizer
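
One thing I noticed while writing this up: in on_train_epoch_end I only log the loss of the last batch of each epoch, so the curve I am looking at may exaggerate the volatility. Below is a minimal sketch of a training_step variant that logs both the raw per-step loss and Lightning's per-epoch running mean (assuming standard self.log aggregation; the logged names become training_loss_step and training_loss_epoch):

def training_step(self, batch, batch_idx):
    X, y, f = batch
    images = X[0]
    batch_size = images.shape[0]
    idx_keep, idx_mask = random_token_mask(
        size=(batch_size, self.sequence_length),
        mask_ratio=self.mask_ratio,
        device=images.device,
    )
    x_encoded = self.forward_encoder(images=images, idx_keep=idx_keep)
    x_pred = self.forward_decoder(x_encoded=x_encoded, idx_keep=idx_keep, idx_mask=idx_mask)
    patches = patchify(images, self.patch_size)
    target = get_at_index(patches, idx_mask - 1)
    loss = self.criterion(x_pred, target)
    # on_step=True keeps the noisy batch-level curve, on_epoch=True adds its mean over the epoch
    self.log("training_loss", loss, on_step=True, on_epoch=True, prog_bar=True)
    return loss

This only changes what gets logged, not the optimization itself.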
