Thiết kế website giá rẻ

Question

I want to train vision transformer on Cifare10 , I tried to do fine tuning of hyperparameter to enhance the accuracy but actually I still obtain a bad accuracy , so , please there are not any suggestion to enhance my model thank you in advance I tried to load weight from pretrained vit on Imagenet but it doesn’t work :`

transform = transforms.Compose([
transforms.RandomResizedCrop(size=(IMG_SIZE, IMG_SIZE) ),
transforms.ToTensor(),
transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2470, 0.2435, 0.2616)),
])

class PatchEmbedding(nn.Module):

def __init__(self, 
             in_channels:int=3,
             patch_size:int=16,
             embedding_dim:int=64):
    super().__init__()
    
 
    self.patcher = nn.Conv2d(in_channels=in_channels,
                             out_channels=embedding_dim,
                             kernel_size=patch_size,
                             stride=patch_size,
                             padding=0)




    self.flatten = nn.Flatten(start_dim=2, 
                              end_dim=3)
    self.out_channels = embedding_dim
           
    self.embedding_dim = embedding_dim
    self.patch_size = patch_size




def forward(self, x):

    image_resolution = x.shape[-1]
    assert image_resolution % patch_size == 0, f"Input image size must be divisble by patch size, image shape: {image_resolution}, patch size: {patch_size}"
    

    x_patched = self.patcher(x)
    x_flattened = self.flatten(x_patched) 
    
   
    return x_flattened.permute(0, 2, 1) 


class MultiheadSelfAttentionBlock(nn.Module):

def __init__(self,
             embedding_dim:int=768, 
             num_heads:int=12, 
             attn_dropout:float=0): 
    super().__init__()
    

    self.layer_norm = nn.LayerNorm(normalized_shape=embedding_dim)
    

    self.multihead_attn = nn.MultiheadAttention(embed_dim=embedding_dim,
                                                num_heads=num_heads,
                                                dropout=attn_dropout,
                                                batch_first=True) 
    

def forward(self, x):
    x = self.layer_norm(x)
    attn_output, _ = self.multihead_attn(query=x, 
                                         key=x, 
                                         value=x, 
                                         need_weights=False) 
    return attn_output


class MLPBlock(nn.Module):

def __init__(self,
             embedding_dim:int=768, 
             mlp_size:int=3072,
             dropout:float=0.1): 
    super().__init__()
    

    self.layer_norm = nn.LayerNorm(normalized_shape=embedding_dim)
    

    self.mlp = nn.Sequential(
        nn.Linear(in_features=embedding_dim,
                  out_features=mlp_size),
        nn.GELU(),
        nn.Dropout(p=dropout),
        nn.Linear(in_features=mlp_size, 
                  out_features=embedding_dim), 
        nn.Dropout(p=dropout) 
    )


def forward(self, x):
    x = self.layer_norm(x)
    x = self.mlp(x)
    return x


class TransformerEncoderBlock(nn.Module):

def __init__(self,
             embedding_dim:int=768, 
             num_heads:int=12,
             mlp_size:int=3072, 
             mlp_dropout:float=0.1, 
             attn_dropout:float=0): 
    super().__init__()


    self.msa_block = MultiheadSelfAttentionBlock(embedding_dim=embedding_dim,
                                                 num_heads=num_heads,
                                                 attn_dropout=attn_dropout)
    
    # 4. Create MLP block (equation 3)
    self.mlp_block =  MLPBlock(embedding_dim=embedding_dim,
                               mlp_size=mlp_size,
                               dropout=mlp_dropout)
    
# 5. Create a forward() method  
def forward(self, x):
    
    # 6. Create residual connection for MSA block (add the input to the output)
    x =  self.msa_block(x) + x 
    
    # 7. Create residual connection for MLP block (add the input to the output)
    x = self.mlp_block(x) + x 
    
    return x


class vit_model(nn.Module):


def __init__(self,
             img_size:int=224, 
             in_channels:int=3, 
             patch_size:int=16,
             embedding_dim:int=768, 
             num_transformer_layers:int=12,
             embedding_dropout:float=0.1, 
             Headlayer:int=12, 
             num_classes:int=10 ,
             mlp_size:int=4, 
             num_heads:int=8, 
             mlp_dropout:float=0.1, 
             ): 
    super().__init__()
    
    
    assert img_size % patch_size == 0, f"Image size must be divisible by patch size, image size: {img_size}, patch size: {patch_size}."
    
    
    self.num_patches = (img_size * img_size) // patch_size**2
             
    
    self.class_embedding = nn.Parameter(data=torch.randn(1, 1, embedding_dim),
                                        requires_grad=True)
    
    
    self.position_embedding = nn.Parameter(data=torch.randn(1, self.num_patches+1, embedding_dim),
                                           requires_grad=True)
    
            
    
    self.embedding_dropout = nn.Dropout(p=embedding_dropout)
    
   
    self.patch_embedding = PatchEmbedding(in_channels=in_channels,
                                          patch_size=patch_size,
                                          embedding_dim=embedding_dim)
    self.transformer_encoder = nn.Sequential(*[TransformerEncoderBlock(embedding_dim=embedding_dim,
                                                                        num_heads=num_heads,
                                                                        mlp_size=mlp_size,
                                                                        mlp_dropout=mlp_dropout) for _ in     range(num_transformer_layers)])
    self.classifier = nn.Sequential(
        nn.LayerNorm(normalized_shape=embedding_dim),
        nn.Linear(in_features=embedding_dim, 
                  out_features=num_classes)
    )

    


def forward(self, x):



    
    
    batch_size = x.shape[0]
    
    
    class_token = self.class_embedding.expand(batch_size, -1, -1) # "-1" means to infer the dimension (try this line on its own)

    # 14. Create patch embedding (equation 1)
    x = self.patch_embedding(x)

    # 15. Concat class embedding and patch embedding (equation 1)
    x = torch.cat((class_token, x), dim=1)

    # 16. Add position embedding to patch embedding (equation 1) 
    x = self.position_embedding + x

    # 17. Run embedding dropout (Appendix B.1)
    x = self.embedding_dropout(x)

    x = self.transformer_encoder(x)
    x = self.classifier(x[:, 0]) 

    return x 

image_size = 224
patch_size = 16
num_patches = (image_size // patch_size) ** 2
embed_dim = 768
num_heads = 12
num_layers = 12
mlp_ratio = 4
num_classes = 10

model = vit_model()
model = model.to(device)



cifar_dataset = datasets.CIFAR10(root='./data', train= False, download=True, transform=transform)
train_size = int(0.8 * len(cifar_dataset))
val_size = len(cifar_dataset) - train_size
train_dataset, val_dataset = random_split(cifar_dataset, [train_size, val_size])

trainloader = DataLoader(train_dataset, batch_size= 150, shuffle=True)
valloader = DataLoader(val_dataset, batch_size= 150 , shuffle=False)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)  # Lower learning rate
#scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.1)

train_acc_history = []
test_acc_history = []
train_loss_history = []
test_loss_history = []

num_epochs = 100
for epoch in range(num_epochs): 
    model.train()
    train_loss = 0.0
    train_correct = 0
    for data, target in trainloader:
        optimizer.zero_grad()
        data = data.to(device)
        target = target.to(device)
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        train_loss += loss.item() * data.size(0)
        train_correct += (output.argmax(dim=1) == target).sum().item()
    model.eval()
    with torch.no_grad():
        train_loss = train_loss / len(trainloader.dataset)
        train_accuracy = 100.0 * train_correct / len(trainloader.dataset)
        train_acc_history.append(train_accuracy)
        train_loss_history.append(train_loss)
        print(f"Epoch {epoch + 1}/{num_epochs} - Train Loss: {train_loss:.4f} - Train Accuracy:    {train_accuracy:.2f}%")

    test_loss = 0.0
    test_correct = 0
    for data, target in valloader:
        target = target.to(device)
        data = data.to(device)
        output = model(data)
        test_loss += criterion(output, target).item() * data.size(0)
        test_correct += (output.argmax(dim=1) == target).sum().item()

    test_loss = test_loss / len(valloader.dataset)
    test_accuracy = 100.0 * test_correct / len(valloader.dataset)
    test_acc_history.append(test_accuracy)
    test_loss_history.append(test_loss)

    print(f"Epoch {epoch + 1}/{num_epochs} - Test Loss: {test_loss:.4f} - Test Accuracy: {test_accuracy:.2f}%")

this is the  result of first epoch  : `Epoch 1/100 - Test Loss: 2.1949 - Test Accuracy: 17.90%
Epoch 2/100 - Train Loss: 2.0975 - Train Accuracy: 19.85%
Epoch 2/100 - Test Loss: 2.1168 - Test Accuracy: 20.80%
Epoch 3/100 - Train Loss: 2.0607 - Train Accuracy: 22.43%
Epoch 3/100 - Test Loss: 2.1017 - Test Accuracy: 21.20%
`
I want to enhance the performance of the model  

Improve the accuracy

Thiết kế website giá rẻ

Danh mục

Is there method to enhance the performance of vit?