I'm currently trying to implement A3C on the InvertedPendulumSwingupBulletEnv-v0 environment. The code runs fine, but the agent doesn't perform well: after only a few steps the policy mean converges to 1 and the standard deviation collapses to its clamped minimum (0.01), so I'm wondering whether there's an error in my implementation.
I have two files, run.py and agent.py. To train the agent, I ran run.py with n_step set to 4 and multi set to 1 for now. Thanks for your help.
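For reference, selecting option 2 in the menu with those inputs boils down to the following minimal, non-interactive sketch (it only restates what run.py below does; I assume it is saved next to run.py so model_free_RL can be imported):

import numpy as np
import torch
import torch.multiprocessing as mp

from run import model_free_RL  # run.py is listed below

if __name__ == '__main__':
    mp.set_start_method('spawn')
    torch.manual_seed(77)
    np.random.seed(1)
    # Same call that menu option 2 makes with n_steps=4 and multi=1
    agent = model_free_RL(4, 1)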
run.py
import warnings
# warnings.filterwarnings("ignore")
import numpy as np
import torch.multiprocessing as mp
from torch.distributions import Normal
from collections import deque
import torch
import time
import gym
from agent import Worker, ActorCritic
import matplotlib.pyplot as plt
def visualize_env(agent=None):
    env = gym.make('InvertedPendulumSwingupBulletEnv-v0')
    env.seed(1)
    env.action_space.seed(1)
    env.render(mode='human')
    state = env.reset()
    total_rewards = 0
    for step in range(200):
        time.sleep(0.016)
        if agent is None:
            action = env.action_space.sample()
        else:
            action = agent.select_action(state)
        next_state, reward, done, _ = env.step(action)
        print("reward:", reward)
        total_rewards = total_rewards + reward
        if done:
            print("total reward:", total_rewards)
            total_rewards = 0
            state = env.reset()
        state = next_state
def evaluate(global_actor, global_epi, sync, finish, multi):
    start_time = time.time()
    env = gym.make('InvertedPendulumSwingupBulletEnv-v0')
    env.seed(1)
    recent_scores = deque(maxlen=20)
    mean_scores = []
    n_epi = 0
    while True:
        if global_epi.value == multi:
            state = env.reset()
            score = 0
            done = False
            while not done:
                with torch.no_grad():
                    mu, std = global_actor.actor(torch.FloatTensor(state))
                    dist = Normal(mu, std)
                    action = dist.sample()
                next_state, reward, done, _ = env.step(action)
                state = next_state
                score += reward
            with sync:
                sync.notify_all()
            with global_epi.get_lock():
                global_epi.value = 0
            recent_scores.append(score)
            mean_score = np.mean(recent_scores)
            mean_scores.append(mean_score)
            n_epi += 1
            print(f'[Episode {n_epi}] Avg. score: {mean_score: .2f}')
            if mean_score >= 600:
                with finish.get_lock():
                    finish.value = 1
                print("Achieved score 600!!!, Time : {:.2f}".format(time.time() - start_time))
            elif n_epi > 1000:
                with finish.get_lock():
                    finish.value = 1
                if np.max(mean_scores) >= 500:
                    print("Max episode finished! Achieved score 500!!!")
                elif np.max(mean_scores) >= 400:
                    print("Max episode finished! Achieved score 400!!!")
                else:
                    print("Max episode finished!")
        if finish.value == 1:
            with sync:
                sync.notify_all()
            break
    plt.figure()
    plt.plot(np.arange(len(mean_scores)), mean_scores)
    plt.axhline(400, linestyle='--')
    plt.axhline(500, linestyle='--')
    plt.axhline(600, linestyle='--')
    plt.xlabel('Episode')
    plt.ylabel('Mean Score')
    plt.savefig('plot.png')
    plt.close()
    print('figure saved')
    env.close()
def model_free_RL(n_steps, multi):
    global_actor = ActorCritic()
    global_actor.share_memory()
    global_epi = mp.Value('i', 0)
    sync = mp.Condition()
    finish = mp.Value('i', 0)
    # Multiprocessing
    processes = []
    for rank in range(multi + 1):
        if rank == 0:
            p = mp.Process(target=evaluate, args=(global_actor, global_epi, sync, finish, multi))
            p.start()
        else:
            worker = Worker(global_actor, global_epi, sync, finish, n_steps, rank)
            p = mp.Process(target=worker.train)
            p.start()
        processes.append(p)
    for p in processes:
        p.join()
    return worker
if __name__ == '__main__':
    while True:
        mp.set_start_method('spawn')
        print("1. visualize without learning")
        print("2. actor-critic training")
        print("3. visualize after learning")
        print("4. exit")
        menu = int(input("select: "))
        if menu == 1:
            visualize_env()
        elif menu == 2:
            n_steps = int(input("n_steps: "))
            multi = int(input("multi: "))
            torch.manual_seed(77)
            np.random.seed(1)
            agent = model_free_RL(n_steps, multi)
        elif menu == 3:
            visualize_env(agent)
        elif menu == 4:
            break
        else:
            print("wrong input!")
agent.py
import gym
import pybullet_envs
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Normal
import numpy as np
ENV = gym.make("InvertedPendulumSwingupBulletEnv-v0")
OBS_DIM = ENV.observation_space.shape[0]
ACT_DIM = ENV.action_space.shape[0]
ACT_LIMIT = ENV.action_space.high[0]
ENV.close()
class NstepBuffer:
    def __init__(self):
        self.states = []
        self.actions = []
        self.rewards = []
        self.next_states = []
        self.dones = []

    def add(self, state, action, reward, next_state, done):
        self.states.append(state)
        self.actions.append(action)
        self.rewards.append(reward)
        self.next_states.append(next_state)
        self.dones.append(done)

    def sample(self):
        states = torch.FloatTensor(np.array(self.states))
        actions = torch.FloatTensor(np.array(self.actions))
        rewards = torch.FloatTensor(np.array(self.rewards))
        next_states = torch.FloatTensor(np.array(self.next_states))
        dones = torch.FloatTensor(np.array(self.dones))
        return states, actions, rewards, next_states, dones

    def reset(self):
        self.states = []
        self.actions = []
        self.rewards = []
        self.next_states = []
        self.dones = []
class ActorCritic(nn.Module):
    def __init__(self):
        super(ActorCritic, self).__init__()
        self.hidden_size = 256
        self.actor_layer_1 = nn.Linear(OBS_DIM, self.hidden_size)
        self.actor_layer_2 = nn.Linear(self.hidden_size, self.hidden_size)
        self.actor_layer_3 = nn.Linear(self.hidden_size, self.hidden_size)
        self.critic_layer_1 = nn.Linear(OBS_DIM, self.hidden_size)
        self.critic_layer_2 = nn.Linear(self.hidden_size, self.hidden_size)
        self.critic_layer_3 = nn.Linear(self.hidden_size, self.hidden_size)
        self.mu_head = nn.Linear(self.hidden_size, ACT_DIM)
        self.std_head = nn.Linear(self.hidden_size, ACT_DIM)
        self.critic_head = nn.Linear(self.hidden_size, 1)
        self._initialize_weights()

    def _initialize_weights(self):
        for layer in self.children():
            if isinstance(layer, nn.Linear):
                nn.init.kaiming_normal_(layer.weight)
                nn.init.constant_(layer.bias, 0)

    def actor(self, states):
        x = F.relu(self.actor_layer_1(states))
        x = F.relu(self.actor_layer_2(x))
        x = F.relu(self.actor_layer_3(x))
        mu = F.tanh(self.mu_head(x))
        std = F.softplus(self.std_head(x)).clamp(min=1e-2, max=1)
        return mu, std

    def critic(self, states):
        x = torch.relu(self.critic_layer_1(states))
        x = torch.relu(self.critic_layer_2(x))
        x = torch.relu(self.critic_layer_3(x))
        return self.critic_head(x)
class Worker(object):
    def __init__(self, global_actor, global_epi, sync, finish, n_step, seed):
        self.env = gym.make('InvertedPendulumSwingupBulletEnv-v0')
        self.env.seed(seed)
        self.lr = 0.001
        self.gamma = 0.99
        self.entropy_coef = 0.01
        self.critic_coef = 1
        self.global_actor = global_actor
        self.global_epi = global_epi
        self.sync = sync
        self.finish = finish
        self.optimizer = optim.Adam(self.global_actor.parameters(), lr=self.lr)
        self.n_step = n_step
        self.local_actor = ActorCritic()
        self.local_actor.load_state_dict(self.global_actor.state_dict())
        self.nstep_buffer = NstepBuffer()

    def select_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0)
        mu, std = self.local_actor.actor(state)
        dist = Normal(mu, std)
        action = dist.sample()
        action = action.clamp(-ACT_LIMIT, ACT_LIMIT).squeeze(0)
        return action.numpy()

    def train_network(self, states, actions, rewards, next_states, dones):
        n_step_return = rewards
        for i in range(rewards.size(0) - 2, -1, -1):
            n_step_return[i] += self.gamma * n_step_return[i + 1] * (1 - dones[i + 1])
        next_values = self.local_actor.critic(next_states[-1]).squeeze(0)
        td_targets = n_step_return + (self.gamma ** self.n_step) * next_values * (1 - dones[-1])
        values = self.local_actor.critic(states).squeeze(1)
        advantages = td_targets - values
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
        critic_loss = F.mse_loss(values, td_targets.detach())
        mu, std = self.local_actor.actor(states)
        dist = Normal(mu, std)
        log_probs = dist.log_prob(actions).sum(-1)
        actor_loss = -(log_probs * advantages.detach()).mean()
        entropy = dist.entropy().mean()
        total_loss = actor_loss + self.critic_coef * critic_loss - self.entropy_coef * entropy
        print(f'values: {values}')
        print(f'td_targets: {td_targets}')
        print(f'mu: {mu}')
        print(f'std: {std}')
        print(f'advantages: {advantages}')
        print(f'log_probs: {log_probs}')
        print(f'actor_loss: {actor_loss}')
        print(f'critic_loss: {critic_loss}')
        print(f'entropy: {entropy}')
        print(f'total_loss: {total_loss}\n')
        self.optimizer.zero_grad()
        total_loss.backward()
        for global_param, local_param in zip(self.global_actor.parameters(), self.local_actor.parameters()):
            global_param._grad = local_param.grad
        self.optimizer.step()
        self.local_actor.load_state_dict(self.global_actor.state_dict())
    def train(self):
        step = 1
        while True:
            state = self.env.reset()
            done = False
            while not done:
                action = self.select_action(state)
                next_state, reward, done, _ = self.env.step(action)
                self.nstep_buffer.add(state, action[0], reward, next_state, done)
                if step % self.n_step == 0 or done:
                    if len(self.nstep_buffer.states) >= self.n_step:
                        self.train_network(*self.nstep_buffer.sample())
                    self.nstep_buffer.reset()
                state = next_state
                step += 1
            with self.global_epi.get_lock():
                self.global_epi.value += 1
            if self.finish.value == 1:
                break
            with self.sync:
                self.sync.wait()
        self.env.close()
I've tried to train the agent, but the episode reward stays around -800. Below is part of the output for some of the key variables printed in train_network; a small standalone recap of the return computation follows the logs.
values: tensor([-67.8674, -72.5005, -77.3016, -82.4779], grad_fn=<SqueezeBackward1>)
td_targets: tensor([-87.2845, -86.4848, -85.6695, -84.8368], grad_fn=<AddBackward0>)
mu: tensor([[1.],
[1.],
[1.],
[1.]], grad_fn=<TanhBackward0>)
std: tensor([[0.0100],
[0.0100],
[0.0100],
[0.0100]], grad_fn=<ClampBackward1>)
advantages: tensor([-1.1434, -0.4026, 0.3633, 1.1827], grad_fn=<DivBackward0>)
log_probs: tensor([14.7439, 14.7439, 14.7439, 14.7439], grad_fn=<SumBackward1>)
actor_loss: -0.0
critic_loss: 162.04246520996094
entropy: -3.1862316131591797
total_loss: 162.07432556152344
values: tensor([ -88.4638, -93.1184, -97.5148, -101.5877],
grad_fn=<SqueezeBackward1>)
td_targets: tensor([-104.7119, -103.8752, -103.0174, -102.1377], grad_fn=<AddBackward0>)
mu: tensor([[1.],
[1.],
[1.],
[1.]], grad_fn=<TanhBackward0>)
std: tensor([[0.0100],
[0.0100],
[0.0100],
[0.0100]], grad_fn=<ClampBackward1>)
advantages: tensor([-1.1810, -0.3687, 0.4085, 1.1412], grad_fn=<DivBackward0>)
log_probs: tensor([13.4080, 13.4080, 13.4080, 13.4080], grad_fn=<SumBackward1>)
actor_loss: -2.384185791015625e-07
critic_loss: 102.57270050048828
entropy: -3.1862316131591797
total_loss: 102.60456085205078
values: tensor([-106.7568, -109.9448, -112.6035, -114.7130],
grad_fn=<SqueezeBackward1>)
td_targets: tensor([-115.3283, -114.4403, -113.5298, -112.5969], grad_fn=<AddBackward0>)
mu: tensor([[1.],
[1.],
[1.],
[1.]], grad_fn=<TanhBackward0>)
std: tensor([[0.0100],
[0.0100],
[0.0100],
[0.0100]], grad_fn=<ClampBackward1>)
advantages: tensor([-1.2153, -0.3311, 0.4432, 1.1032], grad_fn=<DivBackward0>)
log_probs: tensor([14.5895, 14.5895, 14.5895, 14.5895], grad_fn=<SumBackward1>)
actor_loss: -4.76837158203125e-07
critic_loss: 24.75372886657715
entropy: -3.1862316131591797
total_loss: 24.78559112548828
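For reference, the n-step return / TD-target computation in train_network boils down to this standalone sketch (gamma, n_step and the backward loop match my code; the reward, done and value numbers here are invented just to show the shape of the calculation):

import torch

gamma, n_step = 0.99, 4
rewards = torch.tensor([-2.0, -2.0, -2.0, -2.0])   # invented per-step rewards
dones = torch.tensor([0.0, 0.0, 0.0, 0.0])         # no terminal state in this chunk
next_value = torch.tensor(-80.0)                   # invented critic estimate of the bootstrap state

# Accumulate discounted rewards backwards, exactly as in train_network
n_step_return = rewards.clone()
for i in range(rewards.size(0) - 2, -1, -1):
    n_step_return[i] += gamma * n_step_return[i + 1] * (1 - dones[i + 1])

# Every entry is then bootstrapped with the same gamma**n_step * next_value term
td_targets = n_step_return + (gamma ** n_step) * next_value * (1 - dones[-1])
print(td_targets)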