I am trying to implement Proximal Policy Optimization (PPO) in PyTorch and apply it to the BipedalWalker-v3 Gym environment.
So far the agent is not learning well: the best overall reward I have achieved is -9, while it should ideally be close to 300 according to the documentation (https://www.gymlibrary.dev/environments/box2d/bipedal_walker/).
I am not sure my gradient handling is correct:
- I don’t calculate gradients for the old policy log probabilities (old_log_probs)
- I don’t calculate gradients for the next state values
- I don’t calculate gradients for the target values
- I don’t calculate gradients for the advantages
So basically I calculate gradients only for (see the sketch below):
- the current state values, in order to use them for the value loss calculation
- the current policy log probabilities, in order to use them for the policy loss calculation
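To make it concrete, here is a minimal sketch of the gradient flow I have in mind (the `policy` and `value_net` objects and the `policy.log_prob` method are just placeholders to illustrate the idea, not my actual model):

# Simplified sketch of the intended gradient flow; `policy`, `value_net` and the
# batch tensors are hypothetical placeholders, not my actual classes.
import torch

def ppo_losses(policy, value_net, states, actions, rewards, next_states,
               old_log_probs, discount=0.99, eps_clip=0.2):
    with torch.no_grad():
        # bootstrap target: no gradients flow back through the critic here
        target_values = rewards + discount * value_net(next_states).squeeze(-1)

    state_values = value_net(states).squeeze(-1)    # gradients -> critic
    log_probs = policy.log_prob(states, actions)    # gradients -> actor

    # the advantage is treated as a constant when differentiating the policy loss
    advantages = (target_values - state_values).detach()

    # old log probs come from the rollout and are constants as well
    ratio = (log_probs - old_log_probs.detach()).exp()
    surrogate = torch.min(ratio * advantages,
                          torch.clamp(ratio, 1 - eps_clip, 1 + eps_clip) * advantages)

    policy_loss = -surrogate.mean()
    value_loss = (state_values - target_values).pow(2).mean()
    return policy_loss, value_loss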
I am not sure that my understanding is correct. Here is my implementation of the agent training loop:
import torch

# ActorCriticNet and device are defined elsewhere in my code
class Agent(object):
    """Agent class used for training, saving data and handling the model."""

    def __init__(self, buffer, state_size, action_size, hidden_size, lr_actor, lr_critic, logger, eps_clip, n_epochs,
                 weight_decay, betas, loss_scales, discount, checkpoint_dir="ckpts"):
        self.action_size = action_size
        self.state_size = state_size
        self.buffer = buffer
        self.checkpoint_dir = checkpoint_dir
        self.loss_scales = loss_scales
        self.n_epochs = n_epochs
        self.eps_clip = eps_clip
        self.logger = logger
        self.discount = discount
        self.model = ActorCriticNet(state_size, action_size, hidden_size).to(device)
        self.optimizer = torch.optim.Adam([
            {'params': self.model.actor.parameters(), 'lr': lr_actor, 'weight_decay': weight_decay, 'betas': betas, 'eps': 1e-5},
            {'params': self.model.critic.parameters(), 'lr': lr_critic, 'weight_decay': weight_decay, 'betas': betas, 'eps': 1e-5},
        ])
        self.critic_criterion = torch.nn.MSELoss()

    def train(self):
        for i in range(self.n_epochs):
            # create a dataloader based on the current buffer
            loader = self.buffer.create_dataloader()
            # iterate over the minibatches in the dataloader
            for states, actions, rewards, next_states, old_log_probs in loader:
                rewards = rewards.to(torch.float32).to(device)
                if rewards.shape[0] > 1:
                    rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-7)
                    rewards = torch.clamp(rewards, -10, 10)
                states, actions, rewards, next_states = states.to(device), actions.to(device), rewards.to(device), next_states.to(device)
                old_log_probs = old_log_probs.squeeze().to(device).detach()
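                # the bootstrap target below is computed without tracking gradients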
                with torch.no_grad():
                    # 1) compute target value with next state and reward
                    next_state_values = self.model.critic(next_states).squeeze()
                    target_values = rewards + self.discount * next_state_values
                # 2) compute advantage function from target and current state and action
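                # gradients are tracked here: state_values feed the value loss, log_probs the policy loss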
                state_values, log_probs, entropy = self.model.evaluate(states, actions)
                state_values = state_values.squeeze()
                with torch.no_grad():
                    advantages = target_values - state_values
                    if advantages.numel() > 1:
                        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-7)
                # 3) compute importance sampling ratio from log probabilities
                if old_log_probs.dim() == 1:
                    old_log_probs = old_log_probs.unsqueeze(0)
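                # ratio = exp(sum of new log probs - sum of old log probs) over the action dimensions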
                ratio = (log_probs.sum(1) - old_log_probs.sum(1)).exp()
                # 4) compute surrogate loss with the advantage and clipped surrogate loss
                sur1 = ratio * advantages
                sur2 = torch.clamp(ratio, 1 - self.eps_clip, 1 + self.eps_clip) * advantages
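                # take the pessimistic (clipped) surrogate, negated because we minimize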
                policy_loss = -torch.min(sur1, sur2)
                # 5) compute value and entropy losses
                value_loss = self.critic_criterion(state_values, target_values)
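                # entropy summed over the action dimensions, used as an exploration bonus below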
                entropy_loss = entropy.sum(1)
                # 6) compute total loss with entropy regularization
                loss = self.loss_scales[0] * value_loss + self.loss_scales[1] * policy_loss - self.loss_scales[2] * entropy_loss
                # 7) compute gradients and perform optimization step
                self.optimizer.zero_grad()
                loss.mean().backward()
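                # clip the gradient norms of the actor and critic separately before the update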
                torch.nn.utils.clip_grad_norm_(self.model.actor.parameters(), max_norm=0.5)
                torch.nn.utils.clip_grad_norm_(self.model.critic.parameters(), max_norm=0.5)
                self.optimizer.step()
        # return losses and entropy from the last processed minibatch
        return (loss.mean().detach().cpu().numpy(),
                value_loss.mean().detach().cpu().numpy(),
                policy_loss.mean().detach().cpu().numpy()), entropy.mean().detach().cpu().numpy()