I am trying to do semi-supervised learning.
I first train on a small labeled set (represented here by training on the test set instead of the normal training set). During validation, I take the predictions with a probability higher than 0.9 and label them (the actual pseudo-labeling is yet to be done, so for now I just keep the given label).
Then I take the data from the test_dataset that has a prediction probability above 0.9, which I want to append/add to the original train_set, and then feed this new train_set back in to train the model.
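Conceptually, the selection step I have in mind is roughly the following (just an illustrative sketch, not my actual code; select_high_confidence is a made-up helper name):

import torch

def select_high_confidence(inputs, outputs, threshold=0.9):
    # outputs are the raw logits for one batch; turn them into probabilities
    probs = torch.softmax(outputs, dim=-1)
    # top probability and the predicted class for every sample
    confidence, pseudo_labels = probs.max(dim=-1)
    # keep only the samples the model is confident about
    mask = confidence > threshold
    return inputs[mask], pseudo_labels[mask]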
Code is below.
# -*- coding: utf-8 -*-
"""
Created on Tue Sep 3 01:20:02 2024
@author: ra064640
"""
import torch
from torch.utils.data import Dataset, Subset
from torchvision import datasets
from torchvision.transforms import ToTensor
from torch.utils.data import DataLoader, ConcatDataset
import torchvision.models as models
import torch.nn as nn
training_data = datasets.CIFAR10(
    root="data",
    train=True,
    download=True,
    transform=ToTensor()
)
test_data = datasets.CIFAR10(
    root="data",
    train=False,
    download=True,
    transform=ToTensor()
)
batch_size = 128
train_dataloader = DataLoader(training_data, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=True)
# loop through the dataloader once to see what a batch looks like.
# for CIFAR10 with batch_size=128, the batch features are torch.Size([128, 3, 32, 32])
# and the batch labels are torch.Size([128])
for train_features, train_label in train_dataloader:
    print(train_features.size())
    print(train_label.size())
    break
model = models.resnet18()
model.fc = nn.Linear(in_features=model.fc.in_features, out_features=10)
# train this model using Adam and the train set. output the test data accuracy
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())
def accuracy(outputs, labels):
    # model output has shape (batch, num_classes); the argmax along the last dim is the prediction
    batch_size = outputs.shape[0]
    val, idx = torch.max(outputs, dim=-1)
    correct = idx.eq(labels).sum() * 1.0
    acc = correct / batch_size
    return acc
# now create the training function definition.
def train_epoch(epoch, training_loader, loss_func, optimizer, model, batch_size=batch_size):
    model.train()  # switch back to train mode (valid_epoch puts the model in eval mode)
    running_loss = 0
    for i, (inputs, labels) in enumerate(training_loader):
        if torch.cuda.is_available():
            inputs = inputs.cuda()
            labels = labels.cuda()
            model = model.cuda()
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = loss_func(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        if i % 50 == 0:
            print(f'i index is {i} and loss is {running_loss}')
    # return the average loss per batch
    return running_loss / len(training_loader)
def valid_epoch(test_loader, loss_func, batch_size, model):
    # collect the high-confidence batches here so they can be added to the training data later
    dataset_list = []
    model.eval()
    for i, data in enumerate(test_loader):
        inputs, labels = data
        if torch.cuda.is_available():
            inputs = inputs.cuda()
            labels = labels.cuda()
            model = model.cuda()
        with torch.no_grad():
            outputs = model(inputs)
            valid_loss = loss_func(outputs, labels)
            # softmax the outputs to get per-class probabilities.
            softmax = nn.Softmax(dim=-1)
            softmax_outputs = softmax(outputs)
            # take the top probability and its index for each sample.
            val, idx = torch.max(softmax_outputs, dim=-1)
            semi_data_boolean = val > 0.9
            semi_data_boolean_count = semi_data_boolean.sum() * 1.0
            indexes = torch.nonzero(semi_data_boolean)
            if i % 25 == 0:
                print(f'the semi_data_boolean count is {semi_data_boolean_count}')
            # intended: keep only the samples of this batch whose top probability exceeds 0.9
            semi_data = Subset(data, indexes)
            dataset_list.append(semi_data)
            # outputs and labels are already on the same device here
            valid_batch_acc = accuracy(outputs, labels)
            if i % 100 == 0:
                print(f'valid accuracy is {valid_batch_acc}')
    return dataset_list, valid_loss, valid_batch_acc
# carry out the training.
epochs = 3
for epoch in range(epochs):
    print(f'epoch is {epoch}')
    data_len = len(test_data)
    print(f'the len of the data is {data_len}')
    # train the model for this epoch (on the small labeled set, here the test set)
    running_loss = train_epoch(epoch=epoch, training_loader=test_dataloader, loss_func=loss_fn, model=model, optimizer=optimizer, batch_size=batch_size)
    # then validate the model for this epoch and collect the high-confidence batches
    dataset_list, valid_loss, valid_acc = valid_epoch(test_loader=train_dataloader, loss_func=loss_fn, batch_size=batch_size, model=model)
    # concatenate the collected subsets with the current training data and repeat the process
    # check if the size matches test_data plus the high-confidence samples
    dataset_list.append(test_data)
    test_data = ConcatDataset(dataset_list)
    # Extract all data from the ConcatDataset
    concat_data = [test_data[i] for i in range(len(test_data))]
    # Create a new dataset from the extracted data
    class CombinedDataset(Dataset):
        def __init__(self, data):
            self.data = data
        def __len__(self):
            return len(self.data)
        def __getitem__(self, idx):
            return self.data[idx]
    # Instantiate the new dataset
    # test_data = CombinedDataset(test_data)
    test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=True)
I have tried several things:
- Just feeding the data loader with ConcatDataset([new_data, train_data]) (roughly as in the sketch after this list)
- Creating a completely new dataset from new_data and train_data
…
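For the ConcatDataset route, this is roughly what I mean (only a sketch; extend_train_set is a made-up helper, high_conf_inputs and high_conf_labels are hypothetical tensors holding the selected samples and their pseudo-labels, and train_data is the original training set from the bullets above):

from torch.utils.data import TensorDataset, ConcatDataset, DataLoader

def extend_train_set(train_data, high_conf_inputs, high_conf_labels, batch_size=128):
    # wrap the selected samples and their pseudo-labels as a dataset of their own
    new_data = TensorDataset(high_conf_inputs.cpu(), high_conf_labels.cpu())
    # stack it with the original training set and build a fresh loader from the result
    combined = ConcatDataset([new_data, train_data])
    return DataLoader(combined, batch_size=batch_size, shuffle=True)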
What would be the best way to achieve this?