import gym
import numpy as np
import matplotlib.pyplot as plt
import pickle
def run(is_training=True, render=False):
env = gym.make('Pendulum-v1', render_mode='human' if render else None)
x_space = np.linspace(-1.0, 1.0, 10)
y_space = np.linspace(-1.0, 1.0, 10)
ang_vel_space = np.linspace(-8.0, 8.0, 10)
if is_training:
q = np.zeros((len(x_space)+1, len(y_space)+1, len(ang_vel_space)+1, env.action_space.shape[0]))
else:
f = open('pendulum.pkl', 'rb')
q = pickle.load(f)
f.close()
learning_rate_a = 0.1
discount_factor_g = 0.99
epsilon = 1
epsilon_min = 0.1
epsilon_decay_rate = 0.00001
rng = np.random.default_rng()
rewards_per_episode = []
i = 0
while True:
state = env.reset()[0]
state_x = np.digitize(state[0], x_space)
state_y = np.digitize(state[1], y_space)
state_av = np.digitize(state[2], ang_vel_space)
terminated = False
truncated = False
rewards = 0
while not terminated:
if is_training and rng.random() < epsilon:
action = env.action_space.sample()
else:
action = np.argmax(q[state_x, state_y, state_av, :])
new_state, reward, terminated,_, _ = env.step(action)
new_state_x = np.digitize(new_state[0], x_space)
new_state_y = np.digitize(new_state[1], y_space)
new_state_av = np.digitize(new_state[2], ang_vel_space)
if is_training:
q[state_x, state_y, state_av, action] = q[state_x, state_y, state_av, action] + learning_rate_a * (
reward + discount_factor_g*np.max(q[new_state_x, new_state_y, new_state_av,:]) - q[state_x, state_y, state_av,action]
)
state = new_state
state_x = new_state_x
state_y = new_state_y
state_av = new_state_av
rewards += reward
if terminated or truncated:
break
if not is_training and rewards % 100 == 0:
print(f'Episode: {i} Rewards: {rewards}')
rewards_per_episode.append(rewards)
mean_rewards = np.mean(rewards_per_episode[max(0, len(rewards_per_episode)-100):])
if is_training and i % 100 == 0:
print(f'Episode: {i} {rewards} Epsilon: {epsilon:0.2f} Rewards: {mean_rewards:0.1f}')
if mean_rewards > -16.27:
break
if is_training and epsilon <= 0.1:
break
i += 1
env.close()
if is_training:
with open('pendulum.pkl', 'wb') as f:
pickle.dump(q, f)
mean_rewards = []
for t in range(i):
mean_rewards.append(np.mean(rewards_per_episode[max(0, t-100):(t+1)]))
plt.plot(mean_rewards)
plt.savefig(f'pendulum.png')
if __name__ == '__main__':
run(is_training=True, render=False)
#run(is_training=False, render=True)
q[state_x, state_y, state_av, action] = q[state_x, state_y, state_av, action] + learning_rate_a * (
~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
IndexError: arrays used as indices must be of integer (or boolean) type
I’m just starting to learn about reinforcement learning. When I used the same code for Cart Pole, it worked correctly. However, I couldn’t get it right for Pendulum because its action space is a bit different. Can you help?
New contributor
Ali Tokbas is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.