I’m trying to train a model that selects the maximum number from a list of 10 numbers.
For example, given the input list [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], the model is supposed to find the highest number, which is 9 in this case. (In my setup, the model’s action is the index of the chosen element.)
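To make the target behavior concrete, here is a tiny standalone check (not part of my training script) of what the trained policy should reproduce, i.e. returning the index of the largest entry:

numbers = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
correct_index = numbers.index(max(numbers))  # index of the highest number
print(correct_index, numbers[correct_index])  # prints "9 9" for this list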
I have the following reward/penalty rules (also restated as code right after the list):
- +10 if the model’s answer is correct.
- -1 if the model’s answer is wrong.
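In code form, the scoring I intend looks like this (just a restatement of the rules; compute_score is only an illustrative name, the actual logic sits inline in my training loop below):

def compute_score(chosen_index, numbers):
    correct_index = numbers.index(max(numbers))  # index of the first occurrence of the maximum
    return 10 if chosen_index == correct_index else -1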
I let my model play this game, and the accuracy always stays around 10%, which is the same as picking completely at random.
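For reference, a quick standalone check (not part of my script) confirms the random baseline: guessing uniformly among the 10 positions hits the maximum about 10% of the time.

import random

trials = 100_000
hits = 0
for _ in range(trials):
    numbers = [random.randint(1, 100) for _ in range(10)]
    guess = random.randrange(10)  # pick one of the 10 positions uniformly at random
    if guess == numbers.index(max(numbers)):
        hits += 1
print(hits / trials)  # roughly 0.10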
What am I doing wrong?
Here is my code:
import random
import torch
import torch.nn as nn
from tensordict import TensorDict
from tensordict.nn import TensorDictModule, InteractionType
from torch import optim
from torch.distributions import Categorical
from torchrl.modules import ProbabilisticActor, ValueOperator
from torchrl.objectives import ClipPPOLoss
from torchrl.objectives.value.functional import generalized_advantage_estimate
def main():
    seed = 3
    torch.manual_seed(seed)
    # Find the maximum number in the list of 10 numbers.
    policy_network = nn.Sequential(
        nn.Linear(10, 64),
        nn.ReLU(),
        nn.Linear(64, 10)
    )
    policy_module = TensorDictModule(
        module=policy_network,
        in_keys=["numbers"],
        out_keys=["logits"]
    )
    actor = ProbabilisticActor(
        module=policy_module,
        in_keys=["logits"],
        out_keys=["action"],
        distribution_class=Categorical,
        default_interaction_type=InteractionType.RANDOM,
        return_log_prob=True
    )
    value_network = nn.Sequential(
        nn.Linear(10, 64),
        nn.ReLU(),
        nn.Linear(64, 1)
    )
    value_operator = ValueOperator(
        module=value_network,
        in_keys=["numbers"],
        out_keys=["value"]
    )
    loss_module = ClipPPOLoss(
        actor_network=actor,
        critic_network=value_operator
    )
    loss_module.set_keys(
        advantage="advantage",
        value_target="value_target",
        value="value",
        action="action",
        sample_log_prob="sample_log_prob"
    )
    # Training
    episode = 0
    max_episode = 1000
    optimizer = optim.Adam(list(policy_network.parameters()) + list(value_network.parameters()))
    number_of_correct_decisions = 0
    while episode < max_episode:
        # Generate a list of 10 random integers.
        numbers = [random.randint(1, 100) for _ in range(10)]
        current_tensor_dict = TensorDict({
            "numbers": torch.FloatTensor(numbers)
        }, batch_size=[])
        # Sample an action (an index between 0 and 9) and its log-probability.
        actor(current_tensor_dict)
        max_index = current_tensor_dict["action"].item()
        value_operator(current_tensor_dict)
        current_tensor_dict["sample_log_prob"] = current_tensor_dict["sample_log_prob"].detach()
        # The episode ends after a single step, so the "next" state is just the same numbers.
        next_tensor_dict = TensorDict({
            "numbers": torch.FloatTensor(numbers)
        }, batch_size=[])
        value_operator(next_tensor_dict)
        correct_index = numbers.index(max(numbers))
        # Reward/Penalty Rules
        score = 0
        if max_index == correct_index:
            score += 10
            number_of_correct_decisions += 1
        else:
            score -= 1
        reward = torch.FloatTensor([[score]])
        # Note that we need to use batched input and the output will be in batched form.
        advantage, value_target = generalized_advantage_estimate(
            gamma=0.98,
            lmbda=0.95,
            state_value=current_tensor_dict["value"].unsqueeze(0),
            next_state_value=next_tensor_dict["value"].unsqueeze(0),
            reward=reward,
            done=torch.BoolTensor([[1]]),
            terminated=torch.BoolTensor([[1]])
        )
        current_tensor_dict["advantage"] = advantage.squeeze(0)
        current_tensor_dict["value_target"] = value_target.squeeze(0)
        loss_tensor_dict = loss_module(current_tensor_dict)
        loss_critic = loss_tensor_dict["loss_critic"]
        loss_entropy = loss_tensor_dict["loss_entropy"]
        loss_objective = loss_tensor_dict["loss_objective"]
        loss = loss_critic + 0.01 * loss_entropy + loss_objective
        print(f"episode: {episode}, score: {score}, numbers: {numbers}, max_num: {numbers[max_index]}")
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        episode += 1
    print(f"Accuracy = {number_of_correct_decisions / max_episode}")
main()