I am trying to batch train an AI to play Connect Four, but for some reason I end up with more reward records than action records. It does not happen with every episode, and when it does, actions has exactly one fewer entry than rewards. There should be one action for each reward recorded. I believe the rewards are correct but some actions are getting skipped. The number of states is also sometimes odd, but it should always be even, since I record the state before and after each action. I trimmed down the code as best I can.
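For reference, this is the invariant I expect to hold for every game at the end of an episode (just a sketch of the check I have in mind, using the same batch_* lists as the code below; it is not part of the training loop):

# Sketch of the bookkeeping I expect per game (not in the actual loop):
# one reward per action, one "before" and one "after" state per action.
for g in range(batch_size):
    assert len(batch_rewards[g]) == len(batch_actions[g])
    assert len(batch_move_types[g]) == len(batch_actions[g])
    assert len(batch_states[g]) == 2 * len(batch_actions[g])  # always even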
for epoch in range(N_EPOCHS):
    replays = []
    for train_episode_id in range(0, N_TRAINING_EPISODES, EPISODE_BATCH_SIZE):
        games = []
        agent_turns = []
        opponent_turns = []
        batch_size = min(EPISODE_BATCH_SIZE, N_TRAINING_EPISODES - train_episode_id)
        # Initialize lists with the correct size
        batch_states = [[] for _ in range(batch_size)]
        batch_rewards = [[] for _ in range(batch_size)]
        batch_actions = [[] for _ in range(batch_size)]
        batch_move_types = [[] for _ in range(batch_size)]
        batch_rewards_adjusted = [[] for _ in range(batch_size)]
        scores = [0] * batch_size
        moves = 0
        for i in range(batch_size):
            games.append(ConnectFour_Game())
            agent_turns.append(1 if (train_episode_id + i) % 2 == 0 else 2)
            opponent_turns.append(2 if (train_episode_id + i) % 2 == 0 else 1)
        # Get indices of games that have not ended
        not_ended_games_indices = []
        recorded_endings = [False] * batch_size
        for i, game in enumerate(games):
            if game.check_winner() == 0 and not game.check_tie():
                not_ended_games_indices.append(i)
        agent_games = []
        opponent_games = []
        for i in range(len(games)):
            if games[i].turn == agent_turns[i] and i in not_ended_games_indices:
                agent_games.append(i)
            elif games[i].turn == opponent_turns[i] and i in not_ended_games_indices:
                opponent_games.append(i)
        while len(not_ended_games_indices) > 0 and moves < 80:
            # print(len(batch_states[0]))
            if len(agent_games) > 0:
                input_states = []
                for i in agent_games:
                    state = games[i].to_tensor(agent_turns[i])
                    batch_states[i].append(state)
                    input_states.append(state.unsqueeze(0).to(device))
                agent_q_values = agent(torch.stack(input_states))
                for i in range(len(agent_games)):
                    action_probs = torch.softmax(agent_q_values[i], dim=0)
                    agent_response = torch.multinomial(action_probs, 1).item()
                    if not games[agent_games[i]].check_valid_move(agent_response):
                        agent_response = random.randint(0, 6)
                        while not games[agent_games[i]].check_valid_move(agent_response):
                            agent_response = random.randint(0, 6)
                        batch_move_types[agent_games[i]].append("Rand")
                    else:
                        batch_move_types[agent_games[i]].append("AI")
                    batch_actions[agent_games[i]].append(agent_response)
                    games[agent_games[i]].player_move(agent_turns[agent_games[i]], agent_response)
                moves += 1
            if len(opponent_games) > 0:
                input_states = []
                for i in opponent_games:
                    state = games[i].to_tensor(opponent_turns[i])
                    if moves > 0:
                        batch_states[i].append(state)
                    # if train_episode_id == 0 and epoch == 0 and i == 0:
                    #     print("state 2")
                    input_states.append(state.unsqueeze(0).to(device))
                opponent_q_values = opponent(torch.stack(input_states))
                for i in range(len(opponent_games)):
                    action_probs = torch.softmax(opponent_q_values[i], dim=0)
                    opponent_response = torch.multinomial(action_probs, 1).item()
                    if not games[opponent_games[i]].check_valid_move(opponent_response):
                        opponent_response = random.randint(0, 6)
                        while not games[opponent_games[i]].check_valid_move(opponent_response):
                            opponent_response = random.randint(0, 6)
                    games[opponent_games[i]].player_move(opponent_turns[opponent_games[i]], opponent_response)
                    if games[opponent_games[i]].check_winner() == 0 and not games[opponent_games[i]].check_tie() and moves < 80:
                        new_score = game.player_score(agent_turns[opponent_games[i]]) - game.player_score(opponent_turns[opponent_games[i]])
                        if moves > 0:
                            # print(len(batch_rewards))
                            batch_rewards[opponent_games[i]].append((new_score - scores[opponent_games[i]]) * DISCOUNT_RATE ** (len(batch_states[opponent_games[i]]) // 2))
                        scores[opponent_games[i]] = new_score
                        # print(batch_rewards[opponent_games[0]][-1])
                moves += 1
            for i in range(len(not_ended_games_indices)):
                if games[i].check_winner() != 0 or games[i].check_tie() or moves >= 80:
                    if not recorded_endings[i]:
                        if games[i].check_winner() == agent_turns[i]:
                            batch_rewards[i].append(160)
                        elif games[i].check_tie():
                            batch_rewards[i].append(0)
                        elif games[i].check_winner() == opponent_turns[i]:
                            batch_rewards[i].append(-160)
                        recorded_endings[i] = True
                        state = games[i].to_tensor(i)
                        batch_states[i].append(state)
            not_ended_games_indices = []
            for i, game in enumerate(games):
                if game.check_winner() == 0 and not game.check_tie():
                    not_ended_games_indices.append(i)
            agent_games = []
            opponent_games = []
            for i in range(len(games)):
                if games[i].turn == agent_turns[i] and i in not_ended_games_indices:
                    agent_games.append(i)
                elif games[i].turn == opponent_turns[i] and i in not_ended_games_indices:
                    opponent_games.append(i)
        # loops over each game
        for i in range(len(batch_states)):
            # loops over each reward position
            batch_rewards_adjusted[i] = [0] * len(batch_rewards[i])
            for j in range(len(batch_rewards[i])):
                # print(i, j, len(batch_rewards[i]))
                batch_rewards_adjusted[i][j] = 0
                for k in range(j, len(batch_rewards[i])):
                    batch_rewards_adjusted[i][j] += batch_rewards[i][k] * DISCOUNT_RATE ** (k - j)
            print(len(batch_rewards[i]), len(batch_actions[i]), len(batch_states[i]), len(batch_move_types[i]))
            for j in range(0, len(batch_states[i]) - 1, 2):
                tmp = batch_rewards[i]
                tmp2 = batch_actions[i]
                replays.append({
                    'state': batch_states[i][j],
                    'new_state': batch_states[i][j + 1],
                    'action': batch_actions[i][j // 2],
                    'reward': batch_rewards_adjusted[i][j // 2],
                    'm_type': batch_move_types[i][j // 2]
                })
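To clarify the alignment that last loop relies on, this is what I expect the lists to look like for a single game where the agent made three moves (a made-up illustration, not real output):

# batch_states[i]  -> [s0, s1, s2, s3, s4, s5]   # state before and after each of the 3 actions
# batch_actions[i] -> [a0, a1, a2]               # one action per agent move
# batch_rewards[i] -> [r0, r1, r2]               # one reward per action
# so replay j pairs (batch_states[i][2*j], batch_states[i][2*j+1]) with batch_actions[i][j]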
I have made several adjustments which improved the issue, but it is still there.