I am trying to build a Q-learning agent that solves a maze, using a reward system where hitting a wall is -1 and reaching the goal is 100. However, when I reach the goal, my total reward is always 100 regardless of how many walls I hit. How do I fix this? I also tried adding print statements where the values are updated, but they never printed, and I am not sure why.
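For context, this is the kind of per-episode bookkeeping I expect to see, written as a minimal sketch; total_reward is a hypothetical name and is not in my actual code below, which only uses my own q_learning_choice and Evironment functions:

# Sketch only: accumulate the reward from every step of one episode
# (total_reward is a hypothetical variable, not in the program below)
total_reward = 0
while not goal:
    action = q_learning_choice(Q_table, state, epsilon)
    n_state, reward = Evironment(state, action, next_rewards, next_states)
    total_reward += reward   # -1 for a wall bump, 100 for the goal
    state = n_state
print("episode total:", total_reward)

My full program is below.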
import random
import time
import copy


# Initializes the Q-table for all positions in the maze
# Each position is given a list of 4 Q-values, corresponding to 4 directions
def Q_table_init(maze):
    Q_table = {}
    for y in range(1, len(maze) - 1):
        for x in range(1, len(maze[0]) - 1):
            pos = (y, x)
            Q_table[pos] = [0.00, 0.00, 0.00, 0.00]
    return Q_table


# Prints the Q-values in a formatted way for easier reading
def print_Qdict(d):
    for key, value in d.items():
        formatted_values = ["{:.2f}".format(num) for num in value]
        print(f"{key}: {formatted_values}")


# General function to print a dictionary in key: value format
def print_dict(d):
    for key, value in d.items():
        print(f"{key}: {value}")


# Reads the maze structure from a file and returns a list of lists representing the maze
def read_maze(file_path):
    with open(file_path, 'r') as file:
        maze = [list(line.strip()) for line in file]
    return maze


# Finds and returns the start position in the maze
def find_start(maze):
    for y, row in enumerate(maze):
        for x, col in enumerate(row):
            if col == 'S':  # Assuming 'S' represents the start position
                return (y, x)
    return None


# Prints the maze layout
def print_maze(maze):
    for row in maze:
        print(' '.join(row))


# Sets up the reward system for the environment based on the maze structure
def Env_rewards(Env):
    rewards = {}
    for i in range(len(Env)):
        for j in range(len(Env[0])):
            pos = (i, j)
            if pos not in rewards:
                rewards[pos] = 0
            if Env[pos[0]][pos[1]] == "G":
                rewards[pos] += 100  # Reward for reaching the goal
                print(rewards)
            elif Env[pos[0]][pos[1]] == "#":
                rewards[pos] -= 1  # Penalty for hitting a wall
                print(rewards)
            elif Env[pos[0]][pos[1]] == " ":
                rewards[pos] += 0  # Neutral reward for empty spaces
    return rewards


# Determines the next possible states from every position in the maze
def Env_next_states(maze, start_pos):
    next_states = {}
    for y in range(1, len(maze) - 1):
        for x in range(1, len(maze[0]) - 1):
            pos = (y, x)
            moves = []
            # up
            if maze[y - 1][x] == 'G' or maze[y - 1][x] == ' ':
                up_move = (y - 1, x)
            else:
                up_move = (y, x)
            moves.append(up_move)
            # down
            if maze[y + 1][x] == 'G' or maze[y + 1][x] == ' ':
                down_move = (y + 1, x)
            else:
                down_move = (y, x)
            moves.append(down_move)
            # left
            if maze[y][x - 1] == 'G' or maze[y][x - 1] == ' ':
                left_move = (y, x - 1)
            else:
                left_move = (y, x)
            moves.append(left_move)
            # right
            if maze[y][x + 1] == 'G' or maze[y][x + 1] == ' ':
                right_move = (y, x + 1)
            else:
                right_move = (y, x)
            moves.append(right_move)
            next_states[pos] = moves
    return next_states


# Returns the next position based on the current position and direction of movement
def reward_pos(pos, direction):
    if direction == "up":
        return (pos[0] - 1, pos[1])
    if direction == "down":
        return (pos[0] + 1, pos[1])
    if direction == "left":
        return (pos[0], pos[1] - 1)
    if direction == "right":
        return (pos[0], pos[1] + 1)


# Simulates the environment's response to an action
# Returns the next state and the associated reward
def Evironment(state, action, env_rewards, env_next_states):
    dir_d = {0: "up", 1: "down", 2: "left", 3: "right"}
    reward = env_rewards[reward_pos(state, dir_d[action])]
    next_state = (env_next_states[state])[action]
    return (next_state, reward)


# Updates the Q-table based on the observed state, reward, and action
# Uses the Q-learning algorithm to update the values
def Q_table_update(Q_table, state, obs, reward, action, alpha, gamma):
    '''
    # Add function code here to update the Q table according the algorithm formula #
    '''
    currentQValue = Q_table[state][action]
    maxQValue = max(Q_table[obs])
    newQValue = currentQValue + alpha * (reward + gamma * maxQValue - currentQValue)
    Q_table[state][action] = newQValue
    return Q_table


# Determines the best action (with the highest Q-value) at a given state
# Randomly chooses between actions if there are multiple best actions
def best_q_action(Q_table, state):
    '''
    # Add function code here to choose the best action according to the Qtable #
    '''
    actions = Q_table[state]
    maxQValue = max(actions)
    bestActions = [i for i, q in enumerate(actions) if q == maxQValue]
    return random.choice(bestActions)


# Chooses an action based on the epsilon-greedy strategy
# Randomly explores with a probability of epsilon, otherwise exploits the best action
def q_learning_choice(Q_table, state, epsilon):
    '''
    # Add function code here to implement epsilon greedy policy #
    '''
    if random.random() > epsilon:
        return random.randint(0, 3)
    else:
        return best_q_action(Q_table, state)


# Main control loop to move agent in the maze
def manual_action():
    '''
    # Add function code here #
    '''
    while True:
        try:
            userInputDirections = int(
                input("Enter a number between 0 and 3, 0 is up, 1 is down, 2 is left and 3 is right"))
            if 0 <= userInputDirections <= 3:
                print(userInputDirections)
                return userInputDirections
            else:
                print("try again and this time enter a valid number")
        except ValueError:
            print("Invalid input. Please enter an integer.")


def random_action():
    return random.randint(0, 3)


def Initialise_Env(maze_file):
    maze_env = read_maze(maze_file)
    start_pos = find_start(maze_env)
    maze_env[start_pos[0]][start_pos[1]] = ' '
    next_rewards = Env_rewards(maze_env)
    next_states = Env_next_states(maze_env, start_pos)
    return maze_env, start_pos, next_rewards, next_states


# Main function to execute the Q-learning algorithm
# Set Q learning parameters like alpha, gamma, epsilon
# Initialize the environment and Q-table
# Run the Q-learning algorithm over a number of episodes
# For each episode, execute actions until the goal is reached
# Update the Q-table based on the observed rewards
def main():
    # Q learning parameters
    alpha = 0.2  # learning rate
    gamma = 0.9  # discount factor
    epsilon = 0.9  # exploration factor
    number_episodes = 1  # Number of times the simulation is run
    quick_training_ep = 0  # The number of episodes to run quickly before showing the behaviour. I would recommend setting this to a few less than the number of episodes when complete
    pause_time = 1  # Number of seconds between steps for final episodes to view Agent behaviour

    maze_file = '/Users/arnavgupta/Downloads/maze_env1.txt'  # change path name to desired maze

    # Initialise Environment from files
    maze_env, start_pos, next_rewards, next_states = Initialise_Env(maze_file)
    state = start_pos

    # Initialise Q_table for Agent
    Q_table = Q_table_init(maze_env)

    # Show Environment tables
    print("Environment Dictionaries")
    print_dict(next_states)
    print_dict(next_rewards)

    for episode in range(number_episodes):
        # Flag to run while loop until goal is reached
        goal = False
        # Optionally print the Q_table after each episode
        print_Qdict(Q_table)
        while not goal:
            # Choose an action according to the q_learning epsilon greedy policy
            # This is initialised as a random move policy
            action = q_learning_choice(Q_table, state, epsilon)
            action = manual_action()
            action = random_action()
            # Evironment(state, action, env_rewards, env_next_states)
            n_state, reward = Evironment(state, action, next_rewards, next_states)
            # Update Q_table for the current state with new reward and new state information
            Q_table = Q_table_update(Q_table, state, n_state, reward, action, alpha, gamma)
            # Update state to new state
            state = n_state
            if maze_env[n_state[0]][n_state[1]] != "G":
                maze_env_pos = copy.deepcopy(maze_env)
                maze_env_pos[state[0]][state[1]] = "A"
                print_maze(maze_env_pos)
                time.sleep(pause_time)
            # If reward is 100 then the goal state has been reached
            if maze_env[n_state[0]][n_state[1]] == "G":
                print("Congratulations. Goal state found")
                print("Episode: ", episode)
                dir_d = {0: "up", 1: "down", 2: "left", 3: "right"}
                print(f"agent makes action {dir_d[action]} and recieves observation {n_state} and reward {reward}")
                goal = True
                print_maze(maze_env)


if __name__ == "__main__":
    main()
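For reference, this is the update rule I am implementing in Q_table_update, written out as a standalone check with made-up numbers (this snippet is separate from the program above):

# Standalone check of the Q-learning update used in Q_table_update:
# Q(s,a) <- Q(s,a) + alpha * (reward + gamma * max_a' Q(s',a') - Q(s,a))
alpha, gamma = 0.2, 0.9
current_q = 0.0                  # Q(s, a) before the update
next_qs = [0.0, 0.0, -1.0, 0.0]  # Q-values of the next state s'
reward = -1                      # wall penalty
new_q = current_q + alpha * (reward + gamma * max(next_qs) - current_q)
print(new_q)                     # prints -0.2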