I am trying to implement Proximal Policy Optimization (PPO) in PyTorch and apply it to the BipedalWalker-v3 Gym environment.
So far the agent is not learning well: the best overall reward I have achieved is -9, while it should ideally be close to 300 according to the documentation (https://www.gymlibrary.dev/environments/box2d/bipedal_walker/).
I am not sure my gradient handling is correct:
- I don’t calculate gradients for the old policy log probabilities (old_log_probs)
- I don’t calculate gradients for the next state values
- I don’t calculate gradients for the target values
- I don’t calculate gradients for the advantages
So basically I calculate gradients only for (see the sketch below):
- the current state values, in order to use them for the value loss calculation
- the current policy log probabilities, in order to use them for the policy loss calculation
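To make it concrete, here is a minimal sketch of the gradient flow I have in mind (the `policy` and `value_net` objects and the `policy.log_prob` method are just placeholders to illustrate the idea, not my actual model):

# Simplified sketch of the intended gradient flow; `policy`, `value_net` and the
# batch tensors are hypothetical placeholders, not my actual classes.
import torch

def ppo_losses(policy, value_net, states, actions, rewards, next_states,
               old_log_probs, discount=0.99, eps_clip=0.2):
    with torch.no_grad():
        # bootstrap target: no gradients flow back through the critic here
        target_values = rewards + discount * value_net(next_states).squeeze(-1)

    state_values = value_net(states).squeeze(-1)    # gradients -> critic
    log_probs = policy.log_prob(states, actions)    # gradients -> actor

    # the advantage is treated as a constant when differentiating the policy loss
    advantages = (target_values - state_values).detach()

    # old log probs come from the rollout and are constants as well
    ratio = (log_probs - old_log_probs.detach()).exp()
    surrogate = torch.min(ratio * advantages,
                          torch.clamp(ratio, 1 - eps_clip, 1 + eps_clip) * advantages)

    policy_loss = -surrogate.mean()
    value_loss = (state_values - target_values).pow(2).mean()
    return policy_loss, value_loss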
I am not sure that my understanding is correct. Here is my implementation of the agent training loop:
import torch

# ActorCriticNet and device are defined elsewhere in my code
class Agent(object):
    """Agent class used for training, saving data and handling the model."""

    def __init__(self, buffer, state_size, action_size, hidden_size, lr_actor, lr_critic, logger, eps_clip, n_epochs,
                 weight_decay, betas, loss_scales, discount, checkpoint_dir="ckpts"):
        self.action_size = action_size
        self.state_size = state_size
        self.buffer = buffer
        self.checkpoint_dir = checkpoint_dir
        self.loss_scales = loss_scales
        self.n_epochs = n_epochs
        self.eps_clip = eps_clip
        self.logger = logger
        self.discount = discount
        self.model = ActorCriticNet(state_size, action_size, hidden_size).to(device)
        self.optimizer = torch.optim.Adam([
            {'params': self.model.actor.parameters(), 'lr': lr_actor, 'weight_decay': weight_decay, 'betas': betas, 'eps': 1e-5},
            {'params': self.model.critic.parameters(), 'lr': lr_critic, 'weight_decay': weight_decay, 'betas': betas, 'eps': 1e-5},
        ])
        self.critic_criterion = torch.nn.MSELoss()

    def train(self):
        for i in range(self.n_epochs):
            # create a dataloader based on the current buffer
            loader = self.buffer.create_dataloader()
            # iterate over the minibatches in the dataloader
            for states, actions, rewards, next_states, old_log_probs in loader:
                rewards = rewards.to(torch.float32).to(device)
                if rewards.shape[0] > 1:
                    rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-7)
                    rewards = torch.clamp(rewards, -10, 10)
                states, actions, rewards, next_states = states.to(device), actions.to(device), rewards.to(device), next_states.to(device)
                old_log_probs = old_log_probs.squeeze().to(device).detach()
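                # the bootstrap target below is computed without tracking gradients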
                with torch.no_grad():
                    # 1) compute target value with next state and reward
                    next_state_values = self.model.critic(next_states).squeeze()
                    target_values = rewards + self.discount * next_state_values
                # 2) compute advantage function from target and current state and action
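                # gradients are tracked here: state_values feed the value loss, log_probs the policy loss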
                state_values, log_probs, entropy = self.model.evaluate(states, actions)
                state_values = state_values.squeeze()
                with torch.no_grad():
                    advantages = target_values - state_values
                    if advantages.numel() > 1:
                        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-7)
                # 3) compute importance sampling ratio from log probabilities
                if old_log_probs.dim() == 1:
                    old_log_probs = old_log_probs.unsqueeze(0)
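                # ratio = exp(sum of new log probs - sum of old log probs) over the action dimensions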
                ratio = (log_probs.sum(1) - old_log_probs.sum(1)).exp()
                # 4) compute surrogate loss with the advantage and clipped surrogate loss
                sur1 = ratio * advantages
                sur2 = torch.clamp(ratio, 1 - self.eps_clip, 1 + self.eps_clip) * advantages
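                # take the pessimistic (clipped) surrogate, negated because we minimize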
                policy_loss = -torch.min(sur1, sur2)
                # 5) compute value and entropy losses
                value_loss = self.critic_criterion(state_values, target_values)
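                # entropy summed over the action dimensions, used as an exploration bonus below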
                entropy_loss = entropy.sum(1)
                # 6) compute total loss with entropy regularization
                loss = self.loss_scales[0] * value_loss + self.loss_scales[1] * policy_loss - self.loss_scales[2] * entropy_loss
                # 7) compute gradients and perform optimization step
                self.optimizer.zero_grad()
                loss.mean().backward()
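                # clip the gradient norms of the actor and critic separately before the update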
                torch.nn.utils.clip_grad_norm_(self.model.actor.parameters(), max_norm=0.5)
                torch.nn.utils.clip_grad_norm_(self.model.critic.parameters(), max_norm=0.5)
                self.optimizer.step()
        # return losses and entropy from the last processed minibatch
        return (loss.mean().detach().cpu().numpy(),
                value_loss.mean().detach().cpu().numpy(),
                policy_loss.mean().detach().cpu().numpy()), entropy.mean().detach().cpu().numpy()