I am using ClearML to explore learning rates in PyTorch. The tuning task stays in DRAFT, and no data shows up on the loss monitoring screen.
step1: (this part works as expected)
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from clearml import Task, Logger
task = Task.init(project_name="CNN Experiments", task_name="non tune 3 class cnn")
logger = Logger.current_logger()
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])
trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)
# keep only the first 3 classes
train_indices = [i for i, (x, y) in enumerate(trainset) if y < 3]
test_indices = [i for i, (x, y) in enumerate(testset) if y < 3]
trainloader = torch.utils.data.DataLoader(
    torch.utils.data.Subset(trainset, train_indices),
    batch_size=64, shuffle=True, num_workers=2
)
testloader = torch.utils.data.DataLoader(
    torch.utils.data.Subset(testset, test_indices),
    batch_size=64, shuffle=False, num_workers=2
)
# hyperparameters to tune (connected to the task so ClearML can override them)
configuration_dict = {'base_lr': 0.001, 'dropout': 0.25, 'number_of_epochs': 3}
configuration_dict = task.connect(configuration_dict)
class SimpleCNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 16, 3, padding=1)   # 3x32x32 -> 16x32x32
        self.conv2 = nn.Conv2d(16, 32, 3, padding=1)  # 16x32x32 -> 32x32x32
        self.conv3 = nn.Conv2d(32, 64, 3, padding=1)  # 32x32x32 -> 64x32x32
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(64 * 4 * 4, 64)          # three 2x2 poolings: 32 -> 16 -> 8 -> 4
        self.dropout = nn.Dropout(p=configuration_dict.get('dropout', 0.25))
        self.fc2 = nn.Linear(64, 3)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        x = torch.flatten(x, 1)
        x = self.dropout(F.relu(self.fc1(x)))  # apply dropout so the tuned value actually has an effect
        x = self.fc2(x)
        return x
model = SimpleCNN()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=configuration_dict.get('base_lr', 0.001), momentum=0.9)
for epoch in range(configuration_dict.get('number_of_epochs', 10)):
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        inputs, labels = data
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        if i % 100 == 99:
            print(f"[{epoch + 1}, {i + 1}] loss: {running_loss / 100:.3f}")
            running_loss = 0.0
    # report the epoch-level loss to ClearML (this is what shows up in the Scalars panel)
    logger.report_scalar(title="train", series="loss", iteration=epoch, value=loss.item())
    print("send loss")
print("Finished Training")
PATH = './cifar_net.pth'
torch.save(model.state_dict(), PATH)
correct = 0
total = 0
with torch.no_grad():
    for data in testloader:
        images, labels = data
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
print(f"Accuracy on test data: {100 * correct / total:.2f}%")
In the code above, the loss is displayed in the Scalars panel.
However, when the following tuning code is executed, the loss for each parameter combination is not recorded.
Is there a problem with my setup?
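In case it helps, this is roughly how I would sanity-check that the step1 task really reported the metric the optimizer is told to watch (a sketch only; the task id below is the same redacted one used in step2):

from clearml import Task

# fetch the finished step1 task and look at what it actually reported
base_task = Task.get_task(task_id='9e9ce547---------2d167936')
scalars = base_task.get_reported_scalars()
print(list(scalars.keys()))                   # I expect to see 'train' here
print(list(scalars.get('train', {}).keys()))  # ...and a 'loss' series inside it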
step2: (tuning)
from clearml import Task, Logger
from clearml.automation import UniformParameterRange, UniformIntegerParameterRange
from clearml.automation import DiscreteParameterRange
from clearml.automation import GridSearch, RandomSearch, HyperParameterOptimizer
from clearml.automation.optuna import OptimizerOptuna
def job_complete_callback(
    job_id,                  # type: str
    objective_value,         # type: float
    objective_iteration,     # type: int
    job_parameters,          # type: dict
    top_performance_job_id   # type: str
):
    print('Job completed!', job_id, objective_value, objective_iteration, job_parameters)
    if job_id == top_performance_job_id:
        print('WOOT WOOT we broke the record! Objective reached {}'.format(objective_value))
task = Task.init(project_name="CNN tuner",
task_name="tune",
task_type=Task.TaskTypes.optimizer,
#reuse_last_task_id=False
)
# base_task_id is the task id of the step1 experiment
an_optimizer = HyperParameterOptimizer(
    base_task_id='9e9ce547---------2d167936',
    hyper_parameters=[
        UniformIntegerParameterRange('number_of_epochs', min_value=2, max_value=12, step_size=2),
        UniformParameterRange('dropout', min_value=0, max_value=0.5, step_size=0.05),
        UniformParameterRange('base_lr', min_value=0.00025, max_value=0.01, step_size=0.00025),
    ],
    objective_metric_title='train',
    objective_metric_series='loss',
    objective_metric_sign='min',
    optimizer_class=OptimizerOptuna,
    # Configuring optimization parameters
    #execution_queue='dan_queue',
    # queue to schedule the experiments for execution
    max_number_of_concurrent_tasks=20,  # number of concurrent experiments
    optimization_time_limit=6.,         # time limit for the optimization process
    compute_time_limit=6,               # compute time limit (sum of execution time on all machines)
    total_max_jobs=20,                  # maximum number of experiments for the optimization.
                                        # Converted to total number of iterations for OptimizerBOHB
    min_iteration_per_job=150,          # minimum number of iterations per experiment, till early stopping
    max_iteration_per_job=1500,         # maximum number of iterations per experiment
)
an_optimizer.set_report_period(0.2)
an_optimizer.start()
an_optimizer.wait()
an_optimizer.stop()
top_exp = an_optimizer.get_top_experiments(top_k=3)
print([t.id for t in top_exp])
print("all fin")
Ideally, I would like to obtain a figure comparing the performance of the different parameter combinations, like the one at the following URL:
https://clear.ml/docs/latest/assets/images/webapp_compare_11-ad1348542d5b42a0cce427e91e9ec2e5.png
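As a fallback, I could pull the numbers out of the top experiments myself and compare them locally, roughly like this (a sketch only; it assumes the child tasks reported the same 'train'/'loss' scalar and that an_optimizer.wait() has already returned):

# rough local comparison of the best experiments
top_exp = an_optimizer.get_top_experiments(top_k=3)
for t in top_exp:
    params = t.get_parameters()             # flat dict of 'Section/name' -> value
    metrics = t.get_last_scalar_metrics()   # nested dict: title -> series -> last/min/max
    last_loss = metrics.get('train', {}).get('loss', {}).get('last')
    print(t.id, params, last_loss)

But I would still prefer the built-in comparison view from the web UI.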
I have watched the official YouTube videos and run the GitHub examples, but I could not achieve this with my own code.
https://github.com/allegroai/clearml/tree/master/examples