I am trying to do semi-supervised learning.
I first train on a small labeled set (represented here by training on the test set instead of the normal training set). During validation, I take the predictions with a probability higher than 0.9 and label them (the actual pseudo-labeling is yet to be done, so for now I just keep the given label).
Then I take the data from the test_dataset that has a prediction probability above 0.9, which I want to append/add to the original train_set, and then feed this new train_set back in to train the model.
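Conceptually, the selection step I have in mind is roughly the following (just an illustrative sketch, not my actual code; select_high_confidence is a made-up helper name):

import torch

def select_high_confidence(inputs, outputs, threshold=0.9):
    # outputs are the raw logits for one batch; turn them into probabilities
    probs = torch.softmax(outputs, dim=-1)
    # top probability and the predicted class for every sample
    confidence, pseudo_labels = probs.max(dim=-1)
    # keep only the samples the model is confident about
    mask = confidence > threshold
    return inputs[mask], pseudo_labels[mask]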
Code is below.
# -*- coding: utf-8 -*-
"""
Created on Tue Sep 3 01:20:02 2024
@author: ra064640
"""
import torch
from torch.utils.data import Dataset, Subset
from torchvision import datasets
from torchvision.transforms import ToTensor
from torch.utils.data import DataLoader, ConcatDataset
import torchvision.models as models
import torch.nn as nn
training_data = datasets.CIFAR10(
    root="data",
    train=True,
    download=True,
    transform=ToTensor()
)
test_data = datasets.CIFAR10(
    root="data",
    train=False,
    download=True,
    transform=ToTensor()
)
batch_size = 128
train_dataloader = DataLoader(training_data, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=True)
# loop through the dataloader once to see what a batch looks like.
# for CIFAR10 with batch_size=128, the batch features are torch.Size([128, 3, 32, 32])
# and the batch labels are torch.Size([128])
for train_features, train_label in train_dataloader:
    print(train_features.size())
    print(train_label.size())
    break
model = models.resnet18()
model.fc = nn.Linear(in_features=model.fc.in_features, out_features=10)
# train this model using Adam and the train set. output the test data accuracy
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())
def accuracy(outputs, labels):
    # model output has shape (batch, num_classes); the argmax along the last dim is the prediction
    batch_size = outputs.shape[0]
    val, idx = torch.max(outputs, dim=-1)
    correct = idx.eq(labels).sum() * 1.0
    acc = correct / batch_size
    return acc
# now create the training function definition.
def train_epoch(epoch, training_loader, loss_func, optimizer, model, batch_size=batch_size):
    model.train()  # switch back to train mode (valid_epoch puts the model in eval mode)
    running_loss = 0
    for i, (inputs, labels) in enumerate(training_loader):
        if torch.cuda.is_available():
            inputs = inputs.cuda()
            labels = labels.cuda()
            model = model.cuda()
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = loss_func(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        if i % 50 == 0:
            print(f'i index is {i} and loss is {running_loss}')
    # return the average loss per batch
    return running_loss / len(training_loader)
def valid_epoch(test_loader, loss_func, batch_size, model):
    # collect the high-confidence batches here so they can be added to the training data later
    dataset_list = []
    model.eval()
    for i, data in enumerate(test_loader):
        inputs, labels = data
        if torch.cuda.is_available():
            inputs = inputs.cuda()
            labels = labels.cuda()
            model = model.cuda()
        with torch.no_grad():
            outputs = model(inputs)
            valid_loss = loss_func(outputs, labels)
            # softmax the outputs to get per-class probabilities.
            softmax = nn.Softmax(dim=-1)
            softmax_outputs = softmax(outputs)
            # take the top probability and its index for each sample.
            val, idx = torch.max(softmax_outputs, dim=-1)
            semi_data_boolean = val > 0.9
            semi_data_boolean_count = semi_data_boolean.sum() * 1.0
            indexes = torch.nonzero(semi_data_boolean)
            if i % 25 == 0:
                print(f'the semi_data_boolean count is {semi_data_boolean_count}')
            # intended: keep only the samples of this batch whose top probability exceeds 0.9
            semi_data = Subset(data, indexes)
            dataset_list.append(semi_data)
            # outputs and labels are already on the same device here
            valid_batch_acc = accuracy(outputs, labels)
            if i % 100 == 0:
                print(f'valid accuracy is {valid_batch_acc}')
    return dataset_list, valid_loss, valid_batch_acc
# carry out the training.
epochs = 3
for epoch in range(epochs):
    print(f'epoch is {epoch}')
    data_len = len(test_data)
    print(f'the len of the data is {data_len}')
    # train the model for this epoch (on the small labeled set, here the test set)
    running_loss = train_epoch(epoch=epoch, training_loader=test_dataloader, loss_func=loss_fn, model=model, optimizer=optimizer, batch_size=batch_size)
    # then validate the model for this epoch and collect the high-confidence batches
    dataset_list, valid_loss, valid_acc = valid_epoch(test_loader=train_dataloader, loss_func=loss_fn, batch_size=batch_size, model=model)
    # concatenate the collected subsets with the current training data and repeat the process
    # check if the size matches test_data plus the high-confidence samples
    dataset_list.append(test_data)
    test_data = ConcatDataset(dataset_list)
    # Extract all data from the ConcatDataset
    concat_data = [test_data[i] for i in range(len(test_data))]
    # Create a new dataset from the extracted data
    class CombinedDataset(Dataset):
        def __init__(self, data):
            self.data = data
        def __len__(self):
            return len(self.data)
        def __getitem__(self, idx):
            return self.data[idx]
    # Instantiate the new dataset
    # test_data = CombinedDataset(test_data)
    test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=True)
I have tried several things:
- Just feeding the data loader with ConcatDataset([new_data, train_data]) (roughly as in the sketch after this list)
- Creating a completely new dataset from new_data and train_data
…
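For the ConcatDataset route, this is roughly what I mean (only a sketch; extend_train_set is a made-up helper, high_conf_inputs and high_conf_labels are hypothetical tensors holding the selected samples and their pseudo-labels, and train_data is the original training set from the bullets above):

from torch.utils.data import TensorDataset, ConcatDataset, DataLoader

def extend_train_set(train_data, high_conf_inputs, high_conf_labels, batch_size=128):
    # wrap the selected samples and their pseudo-labels as a dataset of their own
    new_data = TensorDataset(high_conf_inputs.cpu(), high_conf_labels.cpu())
    # stack it with the original training set and build a fresh loader from the result
    combined = ConcatDataset([new_data, train_data])
    return DataLoader(combined, batch_size=batch_size, shuffle=True)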
What would be the best way to achieve this?