I wrote a Q-learning algorithm that works well, so I wanted to evolve it into a Deep Q-learning algorithm to solve the CartPole environment from gym. I have been trying to do that for 2-3 days, without success.
My problem is that I just don't understand why it is not working:
- Is it because I made a dumb mistake?
- Is it because my model doesn't converge since I update it every single step?
- Is it because I'm not using the right parameters?
I'm here because I can't find the answer by myself ^^
To fix the issue, I tried to:
- Try many ways of decaying the epsilon value to balance exploration and exploitation in my epsilon-greedy policy (I also tried some fixed epsilon values); see the decay sketch after this list
- Try a bunch of learning rates for my neural network (from 0.1 to 0.0001)
- Try to find a dumb mistake (I really hope it's not that lol)
- Try different types of optimizers (Adam, SGD)
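For reference, here is a rough sketch of the kind of epsilon decay schedule I experimented with (the decay factor and the minimum value below are just example numbers, not the exact ones I used):

epsilon = 1.0
epsilon_min = 0.01      # example floor value
epsilon_decay = 0.995   # example multiplicative decay factor

for ep in range(nb_episode):
    # ... run one episode with the epsilon-greedy policy ...
    epsilon = max(epsilon_min, epsilon * epsilon_decay)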
I want to stress that I did find Deep Q-learning implementations online that solve this problem, but they all use batch updates; I really want to understand why my algorithm, which learns from every single transition, doesn't improve.
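By batch updates I mean roughly the following (a sketch of what those implementations do, as I understood them, not code I have tested): transitions are stored in a replay memory and the network is trained on random mini-batches sampled from it, instead of on each transition as it arrives. The buffer size and batch size below are arbitrary example values.

import random
from collections import deque

memory = deque(maxlen=10000)  # replay memory, example size
batch_size = 64               # example mini-batch size

def remember(state, action, reward, next_state, done):
    # store one transition in the replay memory
    memory.append((state, action, reward, next_state, done))

def sample_minibatch():
    # draw a random mini-batch of transitions to train on
    return random.sample(memory, batch_size)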
I use PyTorch for the neural network, btw.
I would appreciate any ideas on why this is not working!
Thanks for reading, and here is my code 🙂
The neural network:
import gym
import numpy as np
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from tqdm import tqdm
from IPython.display import clear_output

class Neural_Network(nn.Module):
    def __init__(self, lr=0.1, input_dims=2, fc1_dims=512, fc2_dims=512, output_dims=3):
        super().__init__()
        self.lr = lr
        self.input_dims = input_dims
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.output_dims = output_dims
        # simple fully connected network with two hidden layers
        self.fc1 = nn.Linear(input_dims, fc1_dims)
        self.fc2 = nn.Linear(fc1_dims, fc2_dims)
        self.fc3 = nn.Linear(fc2_dims, output_dims)
        return

    def forward(self, x):
        # accept raw numpy observations and convert them to a float tensor
        x = T.tensor(x, dtype=T.float)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
The agent code:
class Agent_DeepQ:
    def __init__(self, lr=0.01, states_dims=2, nbr_actions=3):
        self.gamma = 0.99
        self.epsilon = 1
        self.NN = Neural_Network(lr=lr, input_dims=states_dims, output_dims=nbr_actions)
        self.input_dims = states_dims
        self.nbr_actions = nbr_actions
        self.lr = lr
        return

    def choose_greedy_action(self, state):
        Q_Values = self.NN(state).detach()
        a = np.argmax(Q_Values)
        return a

    def choose_epsilon_action(self, state):
        # epsilon-greedy policy
        if np.random.rand() < self.epsilon:  # choose a random action
            a = np.random.randint(self.nbr_actions)
        else:  # otherwise choose the greedy action
            Q_values = self.NN(state).detach()
            a = np.argmax(Q_values).numpy()
        return a

    def learn(self, state, action, reward, next_state):
        # reset the gradients of the network parameters
        for param in self.NN.parameters():
            if param.grad is not None:
                param.grad.zero_()
        # estimate the Q value of the (state, action) pair
        Q_state_action = self.NN(state)[action]
        # estimate the best Q value of the next state (detached, so no gradient flows through it)
        Q_next_max = T.max(self.NN(next_state).detach())
        target = reward + self.gamma * Q_next_max
        # loss function: we want our Q value to step toward the target
        loss = (Q_state_action - target) ** 2
        # compute the gradients
        loss.backward()
        # manual gradient descent step
        with T.no_grad():
            for param in self.NN.parameters():
                param -= self.lr * param.grad
The train function:
def train(nb_episode):
    # initialize the agent
    agent = Agent_DeepQ(lr=0.001, states_dims=4, nbr_actions=2)
    # initialize the environment
    env = gym.make("CartPole-v1")
    # track rewards
    reward_history = []
    moving_average = []
    # use an epsilon-greedy policy with epsilon = 0.1
    agent.epsilon = 0.1
    for ep in tqdm(range(nb_episode)):
        observation = env.reset()[0]
        done = False
        score = 0
        while not done:
            # choose an action with the epsilon-greedy policy
            action = agent.choose_epsilon_action(observation)
            # take a step according to the chosen action
            observation_, reward, terminated, truncated, info = env.step(action)
            # check whether the episode is over
            done = terminated or truncated
            # count rewards
            score += reward
            # immediately learn from this single transition
            agent.learn(observation, action, reward, observation_)
            observation = observation_
        # plot the moving average of the last 100 episodes
        reward_history.append(score)
        clear_output(wait=True)
        if len(reward_history) >= 100:
            moving_average.append(np.mean(reward_history[-100:]))
            x = np.arange(100, len(moving_average) + 100)
            plt.plot(x, moving_average)
            plt.show()
    return agent
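I run it with something like the following (the episode count here is just an example), which produces the plot described below:

agent = train(1000)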
Moving average of the last 100 episodes, where you can see that the reward does not improve: