I’m trying to train a model that selects the maximum number from a list of 10 numbers.
For example, given the input list [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], the model is supposed to find the highest number, which is 9 in this case. (In my setup, the model’s action is the index of the chosen element.)
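To make the target behavior concrete, here is a tiny standalone check (not part of my training script) of what the trained policy should reproduce, i.e. returning the index of the largest entry:

numbers = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
correct_index = numbers.index(max(numbers))  # index of the highest number
print(correct_index, numbers[correct_index])  # prints "9 9" for this list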
I have the following reward/penalty rules (also restated as code right after the list):
- +10 if the model’s answer is correct.
- -1 if the model’s answer is wrong.
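In code form, the scoring I intend looks like this (just a restatement of the rules; compute_score is only an illustrative name, the actual logic sits inline in my training loop below):

def compute_score(chosen_index, numbers):
    correct_index = numbers.index(max(numbers))  # index of the first occurrence of the maximum
    return 10 if chosen_index == correct_index else -1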
I let my model play this game, and the accuracy always stays around 10%, which is the same as picking completely at random.
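For reference, a quick standalone check (not part of my script) confirms the random baseline: guessing uniformly among the 10 positions hits the maximum about 10% of the time.

import random

trials = 100_000
hits = 0
for _ in range(trials):
    numbers = [random.randint(1, 100) for _ in range(10)]
    guess = random.randrange(10)  # pick one of the 10 positions uniformly at random
    if guess == numbers.index(max(numbers)):
        hits += 1
print(hits / trials)  # roughly 0.10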
What am I doing wrong?
Here is my code:
import random
import torch
import torch.nn as nn
from tensordict import TensorDict
from tensordict.nn import TensorDictModule, InteractionType
from torch import optim
from torch.distributions import Categorical
from torchrl.modules import ProbabilisticActor, ValueOperator
from torchrl.objectives import ClipPPOLoss
from torchrl.objectives.value.functional import generalized_advantage_estimate
def main():
    seed = 3
    torch.manual_seed(seed)
    # Find the maximum number in the list of 10 numbers.
    policy_network = nn.Sequential(
        nn.Linear(10, 64),
        nn.ReLU(),
        nn.Linear(64, 10)
    )
    policy_module = TensorDictModule(
        module=policy_network,
        in_keys=["numbers"],
        out_keys=["logits"]
    )
    actor = ProbabilisticActor(
        module=policy_module,
        in_keys=["logits"],
        out_keys=["action"],
        distribution_class=Categorical,
        default_interaction_type=InteractionType.RANDOM,
        return_log_prob=True
    )
    value_network = nn.Sequential(
        nn.Linear(10, 64),
        nn.ReLU(),
        nn.Linear(64, 1)
    )
    value_operator = ValueOperator(
        module=value_network,
        in_keys=["numbers"],
        out_keys=["value"]
    )
    loss_module = ClipPPOLoss(
        actor_network=actor,
        critic_network=value_operator
    )
    loss_module.set_keys(
        advantage="advantage",
        value_target="value_target",
        value="value",
        action="action",
        sample_log_prob="sample_log_prob"
    )
    # Training
    episode = 0
    max_episode = 1000
    optimizer = optim.Adam(list(policy_network.parameters()) + list(value_network.parameters()))
    number_of_correct_decisions = 0
    while episode < max_episode:
        # Generate a list of 10 random integers.
        numbers = [random.randint(1, 100) for _ in range(10)]
        current_tensor_dict = TensorDict({
            "numbers": torch.FloatTensor(numbers)
        }, batch_size=[])
        # Sample an action (an index between 0 and 9) and its log-probability.
        actor(current_tensor_dict)
        max_index = current_tensor_dict["action"].item()
        value_operator(current_tensor_dict)
        current_tensor_dict["sample_log_prob"] = current_tensor_dict["sample_log_prob"].detach()
        # The episode ends after a single step, so the "next" state is just the same numbers.
        next_tensor_dict = TensorDict({
            "numbers": torch.FloatTensor(numbers)
        }, batch_size=[])
        value_operator(next_tensor_dict)
        correct_index = numbers.index(max(numbers))
        # Reward/Penalty Rules
        score = 0
        if max_index == correct_index:
            score += 10
            number_of_correct_decisions += 1
        else:
            score -= 1
        reward = torch.FloatTensor([[score]])
        # Note that we need to use batched input and the output will be in batched form.
        advantage, value_target = generalized_advantage_estimate(
            gamma=0.98,
            lmbda=0.95,
            state_value=current_tensor_dict["value"].unsqueeze(0),
            next_state_value=next_tensor_dict["value"].unsqueeze(0),
            reward=reward,
            done=torch.BoolTensor([[1]]),
            terminated=torch.BoolTensor([[1]])
        )
        current_tensor_dict["advantage"] = advantage.squeeze(0)
        current_tensor_dict["value_target"] = value_target.squeeze(0)
        loss_tensor_dict = loss_module(current_tensor_dict)
        loss_critic = loss_tensor_dict["loss_critic"]
        loss_entropy = loss_tensor_dict["loss_entropy"]
        loss_objective = loss_tensor_dict["loss_objective"]
        loss = loss_critic + 0.01 * loss_entropy + loss_objective
        print(f"episode: {episode}, score: {score}, numbers: {numbers}, max_num: {numbers[max_index]}")
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        episode += 1
    print(f"Accuracy = {number_of_correct_decisions / max_episode}")
main()