I’m currently trying to implement A3C on the InvertedPendulumSwingupBulletEnv-v0 environment. The code runs fine, but the agent doesn’t perform well. I’ve checked some key variables and found that std either converges to its minimum value (1e-6) or keeps increasing forever, so I’m wondering whether there’s an error in my implementation.
I have two files, run.py and agent.py. To train the agent, I run run.py with n_step set to 4 and multi set to 1 for now. Thank you for your help.
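For context, this is roughly how I checked std during training (a simplified standalone sketch, not part of the two files below; monitor_std is just a throwaway helper I call with a batch of recent states):

import torch

def monitor_std(actor_critic, states):
    # states: FloatTensor batch of observations, shape [N, OBS_DIM]
    with torch.no_grad():
        mu, std = actor_critic.actor(states)
    # print summary statistics of the policy's std for this batch
    print(f"std min: {std.min().item():.6f}, "
          f"max: {std.max().item():.6f}, "
          f"mean: {std.mean().item():.6f}")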
run.py
import warnings
# warnings.filterwarnings("ignore")
import numpy as np
import torch.multiprocessing as mp
from torch.distributions import Normal
from collections import deque
import torch
import time
import gym
from agent import Worker, ActorCritic
import matplotlib.pyplot as plt
def visualize_env(agent=None):
    env = gym.make('InvertedPendulumSwingupBulletEnv-v0')
    env.seed(1)
    env.action_space.seed(1)
    env.render(mode='human')
    state = env.reset()
    total_rewards = 0
    for step in range(200):
        time.sleep(0.016)
        if agent is None:
            action = env.action_space.sample()
        else:
            action = agent.select_action(state)
        next_state, reward, done, _ = env.step(action)
        print("reward:", reward)
        total_rewards = total_rewards + reward
        if done:
            print("total reward:", total_rewards)
            total_rewards = 0
            state = env.reset()
        state = next_state
def evaluate(global_actor, global_epi, sync, finish, multi):
    start_time = time.time()
    env = gym.make('InvertedPendulumSwingupBulletEnv-v0')
    env.seed(1)
    recent_scores = deque(maxlen=20)
    mean_scores = []
    n_epi = 0
    while True:
        if global_epi.value == multi:
            state = env.reset()
            score = 0
            done = False
            while not done:
                with torch.no_grad():
                    mu, std = global_actor.actor(torch.FloatTensor(state))
                    dist = Normal(mu, std)
                    action = dist.sample()
                next_state, reward, done, _ = env.step(action)
                state = next_state
                score += reward
            with sync:
                sync.notify_all()
            with global_epi.get_lock():
                global_epi.value = 0
            recent_scores.append(score)
            mean_score = np.mean(recent_scores)
            mean_scores.append(mean_score)
            n_epi += 1
            print(f'[Episode {n_epi}] Avg. score: {mean_score: .2f}')
            if mean_score >= 600:
                with finish.get_lock():
                    finish.value = 1
                print("Achieved score 600!!!, Time : {:.2f}".format(time.time() - start_time))
            elif n_epi > 1000:
                with finish.get_lock():
                    finish.value = 1
                if np.max(mean_scores) >= 500:
                    print("Max episode finished! Achieved score 500!!!")
                elif np.max(mean_scores) >= 400:
print("Max episode finished! Achievd score 400!!!")
                else:
                    print("Max episode finished!")
        if finish.value == 1:
            with sync:
                sync.notify_all()
            break
    plt.figure()
    plt.plot(np.arange(len(mean_scores)), mean_scores)
    plt.axhline(400, linestyle='--')
    plt.axhline(500, linestyle='--')
    plt.axhline(600, linestyle='--')
    plt.xlabel('Episode')
    plt.ylabel('Mean Score')
    plt.savefig('plot.png')
    plt.close()
    print('figure saved')
    env.close()
def model_free_RL(n_steps, multi):
    global_actor = ActorCritic()
    global_actor.share_memory()
    global_epi = mp.Value('i', 0)
    sync = mp.Condition()
    finish = mp.Value('i', 0)
    # Multiprocessing
    processes = []
    for rank in range(multi + 1):
        if rank == 0:
            p = mp.Process(target=evaluate, args=(global_actor, global_epi, sync, finish, multi))
            p.start()
        else:
            worker = Worker(global_actor, global_epi, sync, finish, n_steps, rank)
            p = mp.Process(target=worker.train)
            p.start()
        processes.append(p)
    for p in processes:
        p.join()
    return worker
if __name__ == '__main__':
    while True:
        mp.set_start_method('spawn')
        print("1. visualize without learning")
        print("2. actor-critic training")
        print("3. visualize after learning")
        print("4. exit")
        menu = int(input("select: "))
        if menu == 1:
            visualize_env()
        elif menu == 2:
            n_steps = int(input("n_steps: "))
            multi = int(input("multi: "))
            torch.manual_seed(77)
            np.random.seed(1)
            agent = model_free_RL(n_steps, multi)
        elif menu == 3:
            visualize_env(agent)
        elif menu == 4:
            break
        else:
            print("wrong input!")
agent.py
import gym
import pybullet_envs
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Normal
import numpy as np
ENV = gym.make("InvertedPendulumSwingupBulletEnv-v0")
OBS_DIM = ENV.observation_space.shape[0]
ACT_DIM = ENV.action_space.shape[0]
ACT_LIMIT = ENV.action_space.high[0]
ENV.close()
class NstepBuffer:
    def __init__(self):
        self.states = []
        self.actions = []
        self.rewards = []
        self.next_states = []
        self.dones = []

    def add(self, state, action, reward, next_state, done):
        self.states.append(state)
        self.actions.append(action)
        self.rewards.append(reward)
        self.next_states.append(next_state)
        self.dones.append(done)

    def sample(self):
        return self.states, self.actions, self.rewards, self.next_states, self.dones

    def reset(self):
        self.states = []
        self.actions = []
        self.rewards = []
        self.next_states = []
        self.dones = []
class ActorCritic(nn.Module):
    def __init__(self):
        super(ActorCritic, self).__init__()
        self.actor_fc1 = nn.Linear(OBS_DIM, 128)
        self.actor_fc2 = nn.Linear(128, 64)
        self.actor_mu = nn.Linear(64, ACT_DIM)
        self.actor_log_std = nn.Linear(64, ACT_DIM)
        self.critic_fc1 = nn.Linear(OBS_DIM, 128)
        self.critic_fc2 = nn.Linear(128, 64)
        self.critic_out = nn.Linear(64, 1)

    def actor(self, states):
        x = F.relu(self.actor_fc1(states))
        x = F.relu(self.actor_fc2(x))
        mu = self.actor_mu(x).clamp(-ACT_LIMIT, ACT_LIMIT)
        std = self.actor_log_std(x).exp() + 1e-6
        return mu, std

    def critic(self, states):
        x = F.relu(self.critic_fc1(states))
        x = F.relu(self.critic_fc2(x))
        value = self.critic_out(x)
        return value
class Worker(object):
    def __init__(self, global_actor, global_epi, sync, finish, n_step, seed):
        self.env = gym.make('InvertedPendulumSwingupBulletEnv-v0')
        self.env.seed(seed)
        self.lr = 0.001
        self.gamma = 0.99
        self.entropy_coef = 0.01
        self.global_actor = global_actor
        self.global_epi = global_epi
        self.sync = sync
        self.finish = finish
        self.optimizer = optim.Adam(self.global_actor.parameters(), lr=self.lr)
        self.n_step = n_step
        self.local_actor = ActorCritic()
        self.nstep_buffer = NstepBuffer()

    def select_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0)
        with torch.no_grad():
            mu, std = self.local_actor.actor(state)
            dist = Normal(mu, std)
            action = dist.sample()
        return action.clamp(-ACT_LIMIT, ACT_LIMIT).numpy()[0]
    def train_network(self, states, actions, rewards, next_states, dones):
        states = torch.FloatTensor(np.array(states))
        actions = torch.FloatTensor(np.array(actions)).unsqueeze(1)
        rewards = torch.FloatTensor(np.array(rewards))
        next_states = torch.FloatTensor(np.array(next_states))
        dones = torch.FloatTensor(np.array(dones))
        # Calculate n-step returns
        returns = torch.zeros_like(rewards)
        G = 0
        for t in reversed(range(len(rewards))):
            G = rewards[t] + self.gamma * G
            returns[t] = G
        values = self.local_actor.critic(states).squeeze(1)
        advantages = returns - values
        # Calculate target value for critic loss
        with torch.no_grad():
            next_values = self.local_actor.critic(next_states).squeeze(1)
            target_values = rewards + self.gamma * (1 - dones) * next_values
        # Critic loss
        critic_loss = F.mse_loss(values, target_values)
        # Actor loss
        mu, std = self.local_actor.actor(states)
        dist = Normal(mu, std)
        log_probs = dist.log_prob(actions).sum(-1)
        entropy = dist.entropy().mean()
        actor_loss = -(log_probs * advantages.detach()).mean()
        # Total loss
        total_loss = critic_loss + actor_loss - self.entropy_coef * entropy
        self.optimizer.zero_grad()
        total_loss.backward()
        for global_param, local_param in zip(self.global_actor.parameters(), self.local_actor.parameters()):
            global_param._grad = local_param.grad
        # Global optimizer update
        self.optimizer.step()
        self.local_actor.load_state_dict(self.global_actor.state_dict())
    def train(self):
        step = 1
        while True:
            state = self.env.reset()
            done = False
            while not done:
                action = self.select_action(state)
                next_state, reward, done, _ = self.env.step(action)
                self.nstep_buffer.add(state, action.item(), reward, next_state, done)
                if step % self.n_step == 0 or done:
                    self.train_network(*self.nstep_buffer.sample())
                    self.nstep_buffer.reset()
                state = next_state
                step += 1
            with self.global_epi.get_lock():
                self.global_epi.value += 1
            if self.finish.value == 1:
                break
            with self.sync:
                self.sync.wait()
        self.env.close()
I’ve tried to train the agent, but the episode reward stays around -800. I’ve checked the values of some key variables and found that after a few episodes, std just keeps increasing.
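For what it’s worth, this is the kind of change to ActorCritic.actor I’ve been wondering about (clamping the log-std output to a fixed range before exponentiating, which I’ve seen in other continuous-control implementations; LOG_STD_MIN and LOG_STD_MAX are just values I picked), though I don’t know whether it addresses the actual problem:

# Possible variant of ActorCritic.actor (not what I'm running above):
# clamp log_std before exponentiating so std stays within a bounded range.
LOG_STD_MIN, LOG_STD_MAX = -20.0, 2.0

    def actor(self, states):
        x = F.relu(self.actor_fc1(states))
        x = F.relu(self.actor_fc2(x))
        mu = self.actor_mu(x).clamp(-ACT_LIMIT, ACT_LIMIT)
        log_std = self.actor_log_std(x).clamp(LOG_STD_MIN, LOG_STD_MAX)
        std = log_std.exp()
        return mu, std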