I am trying to batch train an AI to play Connect Four, but for some reason I end up with more reward records than action records. It does not happen with every episode, and when it does, actions has exactly one fewer entry than rewards. There should be one action for each reward recorded. I believe the rewards are correct but some actions are getting skipped. The number of states is also sometimes odd, but it should always be even, since I record the state before and after each action. I trimmed down the code as best I can.
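For reference, this is the invariant I expect to hold for every game at the end of an episode (just a sketch of the check I have in mind, using the same batch_* lists as the code below; it is not part of the training loop):

# Sketch of the bookkeeping I expect per game (not in the actual loop):
# one reward per action, one "before" and one "after" state per action.
for g in range(batch_size):
    assert len(batch_rewards[g]) == len(batch_actions[g])
    assert len(batch_move_types[g]) == len(batch_actions[g])
    assert len(batch_states[g]) == 2 * len(batch_actions[g])  # always even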
for epoch in range(N_EPOCHS):
    replays = []
    for train_episode_id in range(0, N_TRAINING_EPISODES, EPISODE_BATCH_SIZE):
        games = []
        agent_turns = []
        opponent_turns = []
        batch_size = min(EPISODE_BATCH_SIZE, N_TRAINING_EPISODES - train_episode_id)
        # Initialize lists with the correct size
        batch_states = [[] for _ in range(batch_size)]
        batch_rewards = [[] for _ in range(batch_size)]
        batch_actions = [[] for _ in range(batch_size)]
        batch_move_types = [[] for _ in range(batch_size)]
        batch_rewards_adjusted = [[] for _ in range(batch_size)]
        scores = [0] * batch_size
        moves = 0
        for i in range(batch_size):
            games.append(ConnectFour_Game())
            agent_turns.append(1 if (train_episode_id + i) % 2 == 0 else 2)
            opponent_turns.append(2 if (train_episode_id + i) % 2 == 0 else 1)
        # Get indices of games that have not ended
        not_ended_games_indices = []
        recorded_endings = [False] * batch_size
        for i, game in enumerate(games):
            if game.check_winner() == 0 and not game.check_tie():
                not_ended_games_indices.append(i)
        agent_games = []
        opponent_games = []
        for i in range(len(games)):
            if games[i].turn == agent_turns[i] and i in not_ended_games_indices:
                agent_games.append(i)
            elif games[i].turn == opponent_turns[i] and i in not_ended_games_indices:
                opponent_games.append(i)
        while len(not_ended_games_indices) > 0 and moves < 80:
            # print(len(batch_states[0]))
            if len(agent_games) > 0:
                input_states = []
                for i in agent_games:
                    state = games[i].to_tensor(agent_turns[i])
                    batch_states[i].append(state)
                    input_states.append(state.unsqueeze(0).to(device))
                agent_q_values = agent(torch.stack(input_states))
                for i in range(len(agent_games)):
                    action_probs = torch.softmax(agent_q_values[i], dim=0)
                    agent_response = torch.multinomial(action_probs, 1).item()
                    if not games[agent_games[i]].check_valid_move(agent_response):
                        agent_response = random.randint(0, 6)
                        while not games[agent_games[i]].check_valid_move(agent_response):
                            agent_response = random.randint(0, 6)
                        batch_move_types[agent_games[i]].append("Rand")
                    else:
                        batch_move_types[agent_games[i]].append("AI")
                    batch_actions[agent_games[i]].append(agent_response)
                    games[agent_games[i]].player_move(agent_turns[agent_games[i]], agent_response)
                moves += 1
            if len(opponent_games) > 0:
                input_states = []
                for i in opponent_games:
                    state = games[i].to_tensor(opponent_turns[i])
                    if moves > 0:
                        batch_states[i].append(state)
                    # if train_episode_id == 0 and epoch == 0 and i == 0:
                    #     print("state 2")
                    input_states.append(state.unsqueeze(0).to(device))
                opponent_q_values = opponent(torch.stack(input_states))
                for i in range(len(opponent_games)):
                    action_probs = torch.softmax(opponent_q_values[i], dim=0)
                    opponent_response = torch.multinomial(action_probs, 1).item()
                    if not games[opponent_games[i]].check_valid_move(opponent_response):
                        opponent_response = random.randint(0, 6)
                        while not games[opponent_games[i]].check_valid_move(opponent_response):
                            opponent_response = random.randint(0, 6)
                    games[opponent_games[i]].player_move(opponent_turns[opponent_games[i]], opponent_response)
                    if games[opponent_games[i]].check_winner() == 0 and not games[opponent_games[i]].check_tie() and moves < 80:
                        new_score = game.player_score(agent_turns[opponent_games[i]]) - game.player_score(opponent_turns[opponent_games[i]])
                        if moves > 0:
                            # print(len(batch_rewards))
                            batch_rewards[opponent_games[i]].append((new_score - scores[opponent_games[i]]) * DISCOUNT_RATE ** (len(batch_states[opponent_games[i]]) // 2))
                        scores[opponent_games[i]] = new_score
                        # print(batch_rewards[opponent_games[0]][-1])
                moves += 1
            for i in range(len(not_ended_games_indices)):
                if games[i].check_winner() != 0 or games[i].check_tie() or moves >= 80:
                    if not recorded_endings[i]:
                        if games[i].check_winner() == agent_turns[i]:
                            batch_rewards[i].append(160)
                        elif games[i].check_tie():
                            batch_rewards[i].append(0)
                        elif games[i].check_winner() == opponent_turns[i]:
                            batch_rewards[i].append(-160)
                        recorded_endings[i] = True
                        state = games[i].to_tensor(i)
                        batch_states[i].append(state)
            not_ended_games_indices = []
            for i, game in enumerate(games):
                if game.check_winner() == 0 and not game.check_tie():
                    not_ended_games_indices.append(i)
            agent_games = []
            opponent_games = []
            for i in range(len(games)):
                if games[i].turn == agent_turns[i] and i in not_ended_games_indices:
                    agent_games.append(i)
                elif games[i].turn == opponent_turns[i] and i in not_ended_games_indices:
                    opponent_games.append(i)
        # loops over each game
        for i in range(len(batch_states)):
            # loops over each reward position
            batch_rewards_adjusted[i] = [0] * len(batch_rewards[i])
            for j in range(len(batch_rewards[i])):
                # print(i, j, len(batch_rewards[i]))
                batch_rewards_adjusted[i][j] = 0
                for k in range(j, len(batch_rewards[i])):
                    batch_rewards_adjusted[i][j] += batch_rewards[i][k] * DISCOUNT_RATE ** (k - j)
            print(len(batch_rewards[i]), len(batch_actions[i]), len(batch_states[i]), len(batch_move_types[i]))
            for j in range(0, len(batch_states[i]) - 1, 2):
                tmp = batch_rewards[i]
                tmp2 = batch_actions[i]
                replays.append({
                    'state': batch_states[i][j],
                    'new_state': batch_states[i][j + 1],
                    'action': batch_actions[i][j // 2],
                    'reward': batch_rewards_adjusted[i][j // 2],
                    'm_type': batch_move_types[i][j // 2]
                })
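To clarify the alignment that last loop relies on, this is what I expect the lists to look like for a single game where the agent made three moves (a made-up illustration, not real output):

# batch_states[i]  -> [s0, s1, s2, s3, s4, s5]   # state before and after each of the 3 actions
# batch_actions[i] -> [a0, a1, a2]               # one action per agent move
# batch_rewards[i] -> [r0, r1, r2]               # one reward per action
# so replay j pairs (batch_states[i][2*j], batch_states[i][2*j+1]) with batch_actions[i][j]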
I have made several adjustments which improved the issue, but it is still there.