I’m currently trying to implement A3C on the InvertedPendulumSwingupBulletEnv-v0 environment. The code runs fine, but the agent doesn’t perform well. I’ve checked some key variables and found that std either converges to its minimum value (1e-6) or keeps increasing forever, so I’m wondering whether there’s an error in my implementation.
I have two files, run.py and agent.py. To train the agent, I run run.py with n_step set to 4 and multi set to 1 for now. Thank you for your help.
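For context, this is roughly how I checked std during training (a simplified standalone sketch, not part of the two files below; monitor_std is just a throwaway helper I call with a batch of recent states):

import torch

def monitor_std(actor_critic, states):
    # states: FloatTensor batch of observations, shape [N, OBS_DIM]
    with torch.no_grad():
        mu, std = actor_critic.actor(states)
    # print summary statistics of the policy's std for this batch
    print(f"std min: {std.min().item():.6f}, "
          f"max: {std.max().item():.6f}, "
          f"mean: {std.mean().item():.6f}")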
run.py
import warnings
# warnings.filterwarnings("ignore")
import numpy as np
import torch.multiprocessing as mp
from torch.distributions import Normal
from collections import deque
import torch
import time
import gym
from agent import Worker, ActorCritic
import matplotlib.pyplot as plt
def visualize_env(agent=None):
    env = gym.make('InvertedPendulumSwingupBulletEnv-v0')
    env.seed(1)
    env.action_space.seed(1)
    env.render(mode='human')
    state = env.reset()
    total_rewards = 0
    for step in range(200):
        time.sleep(0.016)
        if agent is None:
            action = env.action_space.sample()
        else:
            action = agent.select_action(state)
        next_state, reward, done, _ = env.step(action)
        print("reward:", reward)
        total_rewards = total_rewards + reward
        if done:
            print("total reward:", total_rewards)
            total_rewards = 0
            state = env.reset()
        state = next_state
def evaluate(global_actor, global_epi, sync, finish, multi):
    start_time = time.time()
    env = gym.make('InvertedPendulumSwingupBulletEnv-v0')
    env.seed(1)
    recent_scores = deque(maxlen=20)
    mean_scores = []
    n_epi = 0
    while True:
        if global_epi.value == multi:
            state = env.reset()
            score = 0
            done = False
            while not done:
                with torch.no_grad():
                    mu, std = global_actor.actor(torch.FloatTensor(state))
                    dist = Normal(mu, std)
                    action = dist.sample()
                next_state, reward, done, _ = env.step(action)
                state = next_state
                score += reward
            with sync:
                sync.notify_all()
            with global_epi.get_lock():
                global_epi.value = 0
            recent_scores.append(score)
            mean_score = np.mean(recent_scores)
            mean_scores.append(mean_score)
            n_epi += 1
            print(f'[Episode {n_epi}] Avg. score: {mean_score: .2f}')
            if mean_score >= 600:
                with finish.get_lock():
                    finish.value = 1
                print("Achieved score 600!!!, Time : {:.2f}".format(time.time() - start_time))
            elif n_epi > 1000:
                with finish.get_lock():
                    finish.value = 1
                if np.max(mean_scores) >= 500:
                    print("Max episode finished! Achieved score 500!!!")
                elif np.max(mean_scores) >= 400:
print("Max episode finished! Achievd score 400!!!")
                else:
                    print("Max episode finished!")
        if finish.value == 1:
            with sync:
                sync.notify_all()
            break
    plt.figure()
    plt.plot(np.arange(len(mean_scores)), mean_scores)
    plt.axhline(400, linestyle='--')
    plt.axhline(500, linestyle='--')
    plt.axhline(600, linestyle='--')
    plt.xlabel('Episode')
    plt.ylabel('Mean Score')
    plt.savefig('plot.png')
    plt.close()
    print('figure saved')
    env.close()
def model_free_RL(n_steps, multi):
    global_actor = ActorCritic()
    global_actor.share_memory()
    global_epi = mp.Value('i', 0)
    sync = mp.Condition()
    finish = mp.Value('i', 0)
    # Multiprocessing
    processes = []
    for rank in range(multi + 1):
        if rank == 0:
            p = mp.Process(target=evaluate, args=(global_actor, global_epi, sync, finish, multi))
            p.start()
        else:
            worker = Worker(global_actor, global_epi, sync, finish, n_steps, rank)
            p = mp.Process(target=worker.train)
            p.start()
        processes.append(p)
    for p in processes:
        p.join()
    return worker
if __name__ == '__main__':
    while True:
        mp.set_start_method('spawn')
        print("1. visualize without learning")
        print("2. actor-critic training")
        print("3. visualize after learning")
        print("4. exit")
        menu = int(input("select: "))
        if menu == 1:
            visualize_env()
        elif menu == 2:
            n_steps = int(input("n_steps: "))
            multi = int(input("multi: "))
            torch.manual_seed(77)
            np.random.seed(1)
            agent = model_free_RL(n_steps, multi)
        elif menu == 3:
            visualize_env(agent)
        elif menu == 4:
            break
        else:
            print("wrong input!")
agent.py
import gym
import pybullet_envs
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Normal
import numpy as np
ENV = gym.make("InvertedPendulumSwingupBulletEnv-v0")
OBS_DIM = ENV.observation_space.shape[0]
ACT_DIM = ENV.action_space.shape[0]
ACT_LIMIT = ENV.action_space.high[0]
ENV.close()
class NstepBuffer:
    def __init__(self):
        self.states = []
        self.actions = []
        self.rewards = []
        self.next_states = []
        self.dones = []

    def add(self, state, action, reward, next_state, done):
        self.states.append(state)
        self.actions.append(action)
        self.rewards.append(reward)
        self.next_states.append(next_state)
        self.dones.append(done)

    def sample(self):
        return self.states, self.actions, self.rewards, self.next_states, self.dones

    def reset(self):
        self.states = []
        self.actions = []
        self.rewards = []
        self.next_states = []
        self.dones = []
class ActorCritic(nn.Module):
    def __init__(self):
        super(ActorCritic, self).__init__()
        self.actor_fc1 = nn.Linear(OBS_DIM, 128)
        self.actor_fc2 = nn.Linear(128, 64)
        self.actor_mu = nn.Linear(64, ACT_DIM)
        self.actor_log_std = nn.Linear(64, ACT_DIM)
        self.critic_fc1 = nn.Linear(OBS_DIM, 128)
        self.critic_fc2 = nn.Linear(128, 64)
        self.critic_out = nn.Linear(64, 1)

    def actor(self, states):
        x = F.relu(self.actor_fc1(states))
        x = F.relu(self.actor_fc2(x))
        mu = self.actor_mu(x).clamp(-ACT_LIMIT, ACT_LIMIT)
        std = self.actor_log_std(x).exp() + 1e-6
        return mu, std

    def critic(self, states):
        x = F.relu(self.critic_fc1(states))
        x = F.relu(self.critic_fc2(x))
        value = self.critic_out(x)
        return value
class Worker(object):
    def __init__(self, global_actor, global_epi, sync, finish, n_step, seed):
        self.env = gym.make('InvertedPendulumSwingupBulletEnv-v0')
        self.env.seed(seed)
        self.lr = 0.001
        self.gamma = 0.99
        self.entropy_coef = 0.01
        self.global_actor = global_actor
        self.global_epi = global_epi
        self.sync = sync
        self.finish = finish
        self.optimizer = optim.Adam(self.global_actor.parameters(), lr=self.lr)
        self.n_step = n_step
        self.local_actor = ActorCritic()
        self.nstep_buffer = NstepBuffer()

    def select_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0)
        with torch.no_grad():
            mu, std = self.local_actor.actor(state)
            dist = Normal(mu, std)
            action = dist.sample()
        return action.clamp(-ACT_LIMIT, ACT_LIMIT).numpy()[0]
    def train_network(self, states, actions, rewards, next_states, dones):
        states = torch.FloatTensor(np.array(states))
        actions = torch.FloatTensor(np.array(actions)).unsqueeze(1)
        rewards = torch.FloatTensor(np.array(rewards))
        next_states = torch.FloatTensor(np.array(next_states))
        dones = torch.FloatTensor(np.array(dones))
        # Calculate n-step returns
        returns = torch.zeros_like(rewards)
        G = 0
        for t in reversed(range(len(rewards))):
            G = rewards[t] + self.gamma * G
            returns[t] = G
        values = self.local_actor.critic(states).squeeze(1)
        advantages = returns - values
        # Calculate target value for critic loss
        with torch.no_grad():
            next_values = self.local_actor.critic(next_states).squeeze(1)
            target_values = rewards + self.gamma * (1 - dones) * next_values
        # Critic loss
        critic_loss = F.mse_loss(values, target_values)
        # Actor loss
        mu, std = self.local_actor.actor(states)
        dist = Normal(mu, std)
        log_probs = dist.log_prob(actions).sum(-1)
        entropy = dist.entropy().mean()
        actor_loss = -(log_probs * advantages.detach()).mean()
        # Total loss
        total_loss = critic_loss + actor_loss - self.entropy_coef * entropy
        self.optimizer.zero_grad()
        total_loss.backward()
        for global_param, local_param in zip(self.global_actor.parameters(), self.local_actor.parameters()):
            global_param._grad = local_param.grad
        # Global optimizer update
        self.optimizer.step()
        self.local_actor.load_state_dict(self.global_actor.state_dict())
    def train(self):
        step = 1
        while True:
            state = self.env.reset()
            done = False
            while not done:
                action = self.select_action(state)
                next_state, reward, done, _ = self.env.step(action)
                self.nstep_buffer.add(state, action.item(), reward, next_state, done)
                if step % self.n_step == 0 or done:
                    self.train_network(*self.nstep_buffer.sample())
                    self.nstep_buffer.reset()
                state = next_state
                step += 1
            with self.global_epi.get_lock():
                self.global_epi.value += 1
            if self.finish.value == 1:
                break
            with self.sync:
                self.sync.wait()
        self.env.close()
I’ve tried to train the agent, but the episode reward stays around -800. I’ve checked the values of some key variables and found that after a few episodes, std just keeps increasing.
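For what it’s worth, this is the kind of change to ActorCritic.actor I’ve been wondering about (clamping the log-std output to a fixed range before exponentiating, which I’ve seen in other continuous-control implementations; LOG_STD_MIN and LOG_STD_MAX are just values I picked), though I don’t know whether it addresses the actual problem:

# Possible variant of ActorCritic.actor (not what I'm running above):
# clamp log_std before exponentiating so std stays within a bounded range.
LOG_STD_MIN, LOG_STD_MAX = -20.0, 2.0

    def actor(self, states):
        x = F.relu(self.actor_fc1(states))
        x = F.relu(self.actor_fc2(x))
        mu = self.actor_mu(x).clamp(-ACT_LIMIT, ACT_LIMIT)
        log_std = self.actor_log_std(x).clamp(LOG_STD_MIN, LOG_STD_MAX)
        std = log_std.exp()
        return mu, std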