%matplotlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import hsv_to_rgb
import itertools
# set grid size
grid_length = 12
grid_height = 6
R_matrix = np.full((grid_height, grid_length), -1)  # every non-terminal step receives a reward of -1
R_matrix[-1, 1:grid_length-1] = -100  # main cliff along the bottom row
R_matrix[grid_height-2:-1, grid_length-8:grid_length-4] = -100  # additional cliff cells above the bottom row
R_matrix[grid_height-3:grid_height-2, grid_length-7:grid_length-5] = -100
R_matrix[grid_height-4:grid_height-3, grid_length-3:grid_length] = -100
R_matrix[grid_height-5:grid_height-4, grid_length-9:grid_length-8] = -100
R_matrix[-1, -1] = 100  # objective (goal) cell
R_matrix
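# A quick sanity check of the reward layout (a minimal sketch, not part of the original
# experiment set-up): count how many cells carry each reward value, so the normal (-1),
# cliff (-100) and objective (+100) cells can be confirmed before training.
unique_rewards, counts = np.unique(R_matrix, return_counts=True)
for r, c in zip(unique_rewards, counts):
    print('reward {:>4}: {} cells'.format(r, c))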
# Helper for the renderer: rescale an array of values into [vmin, vmax]
def change_range(values, vmin=0, vmax=1):
start_zero = values - np.min(values)
return (start_zero / (np.max(start_zero) + 1e-7)) * (vmax - vmin) + vmin
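# Minimal usage example of change_range (an illustrative sketch): the renderer uses it to
# squash arbitrary Q-values into [vmin, vmax] for colouring, so the smallest input maps to
# vmin and the largest maps (almost exactly) to vmax.
print(change_range(np.array([-100.0, -1.0, 100.0])))  # ~[0.0, 0.495, 1.0]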
class CliffEnvironment:
    # HSV colour values used by the renderer for each terrain type
terrain_color = dict(normal=[127/360, 0, 96/100],
objective=[26/360, 100/100, 100/100],
cliff=[247/360, 92/100, 70/100],
player=[344/360, 93/100, 100/100])
def __init__(self):
self.player = None
self._create_grid()
self._draw_grid()
def _create_grid(self, initial_grid=None):
self.grid = self.terrain_color['normal'] * np.ones((grid_height, grid_length, 3))
self._set_terrain(self.grid)
def _set_terrain(self, grid):
grid[-1, 1:grid_length-1] = self.terrain_color['cliff']
grid[grid_height-2:-1, grid_length-8:grid_length-4] = self.terrain_color['cliff']
grid[grid_height-3:grid_height-2, grid_length-7:grid_length-5] = self.terrain_color['cliff']
grid[grid_height-4:grid_height-3, grid_length-3:grid_length] = self.terrain_color['cliff']
grid[grid_height-5:grid_height-4, grid_length-9:grid_length-8] = self.terrain_color['cliff']
grid[-1, -1] = self.terrain_color['objective']
def _draw_grid(self):
self.fig, self.ax = plt.subplots(figsize=(grid_length, grid_height))
self.ax.grid(which='minor')
self.q_texts = [self.ax.text(*self._id_to_position(i)[::-1], '0',
fontsize=11, verticalalignment='center',
horizontalalignment='center') for i in range(grid_length * grid_height)]
self.im = self.ax.imshow(hsv_to_rgb(self.grid), cmap='terrain',
interpolation='nearest', vmin=0, vmax=1)
self.ax.set_xticks(np.arange(grid_length))
self.ax.set_xticks(np.arange(grid_length) - 0.5, minor=True)
self.ax.set_yticks(np.arange(grid_height))
self.ax.set_yticks(np.arange(grid_height) - 0.5, minor=True)
def reset(self):
        self.player = (grid_height-1, 0)  # (y_coord, x_coord) tuple giving the agent's current cell
return self._position_to_id(self.player)
def step(self, action):
        # Grid movement rules and constraints
        # UP: only if the agent is not already in the top row
        if action == 0 and self.player[0] > 0:
            self.player = (self.player[0] - 1, self.player[1])
        # DOWN: only if the agent is not already in the bottom row
        if action == 1 and self.player[0] < grid_height-1:
            self.player = (self.player[0] + 1, self.player[1])
        # RIGHT: only if the agent is not already in the right-most column
        if action == 2 and self.player[1] < grid_length-1:
            self.player = (self.player[0], self.player[1] + 1)
        # LEFT: only if the agent is not already in the left-most column
        if action == 3 and self.player[1] > 0:
            self.player = (self.player[0], self.player[1] - 1)
        # Reward for the resulting cell; stepping onto a cliff or the objective ends the episode
        reward = R_matrix[self.player[0], self.player[1]]
        terminal_state = (all(self.grid[self.player] == self.terrain_color['cliff']) or
                          all(self.grid[self.player] == self.terrain_color['objective']))
        return self._position_to_id(self.player), reward, terminal_state
def _position_to_id(self, pos):
        ''' Maps a (y, x) grid position to a unique state ID '''
return pos[0] * grid_length + pos[1]
def _id_to_position(self, idx):
return (idx // grid_length), (idx % grid_length)
def render(self, Q_matrix=None, action=None, max_q=False, colorize_q=False):
assert self.player is not None, 'You first need to call .reset()'
if colorize_q:
assert Q_matrix is not None, 'Q_matrix must not be None for using colorize_q'
grid = self.terrain_color['normal'] * np.ones((grid_height, grid_length, 3))
values = change_range(np.max(Q_matrix, -1)).reshape(grid_height, grid_length)
grid[:, :, 1] = values
self._set_terrain(grid)
else:
grid = self.grid.copy()
grid[self.player] = self.terrain_color['player']
self.im.set_data(hsv_to_rgb(grid))
if Q_matrix is not None:
xs = np.repeat(np.arange(grid_length), grid_height)
ys = np.tile(np.arange(grid_height), grid_length)
for i, text in enumerate(self.q_texts):
if max_q:
q = max(Q_matrix[i])
txt = '{:.2f}'.format(q)
text.set_text(txt)
else:
actions = ['U', 'D', 'R', 'L']
txt = '\n'.join(['{}: {:.2f}'.format(k, q) for k, q in zip(actions, Q_matrix[i])])
text.set_text(txt)
if action is not None:
self.ax.set_title(action, color='r', weight='bold', fontsize=32)
plt.pause(0.01)
#plt.show()
#plt.draw()
UP = 0
DOWN = 1
RIGHT = 2
LEFT = 3
actions = ['UP', 'DOWN', 'RIGHT', 'LEFT']
cliff = CliffEnvironment()
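# Optional smoke test of the environment (a sketch, separate from the experiments below):
# reset the agent to the bottom-left corner and step RIGHT once, which lands on the cliff,
# so the step should return a -100 reward and terminal_state == True.
state = cliff.reset()
next_state, reward, terminal_state = cliff.step(RIGHT)
print('state {} -> state {}, reward {}, terminal {}'.format(state, next_state, reward, terminal_state))
cliff.reset()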
# The number of states is simply the number of "squares" in our grid world, in this case 6 * 12 = 72
num_states = grid_length * grid_height
# We have 4 possible actions, up, down, right and left
num_actions = 4
#Initialise the Q-matrix with an array of zeros.
Q_matrix = np.zeros((num_states, num_actions))
df = pd.DataFrame(Q_matrix, columns=['A0: up', 'A1: down', 'A2: right', 'A3: left'])
df.index.name = 'States'
df.head(5)
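# The Q table is indexed by (state_id, action): the start state in the bottom-left corner
# maps to id (grid_height - 1) * grid_length + 0 = 60, so Q_matrix[60][UP] holds the current
# estimate for moving up from the start (0.0 before any training).
print(Q_matrix[(grid_height - 1) * grid_length][UP])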
def egreedy_policy(Q_matrix, state, epsilon=0.9):
    '''
    Choose an action based on an epsilon-greedy policy: with probability epsilon
    a random action is selected, otherwise the greedy (highest-Q) action is chosen.
    '''
if np.random.random() < epsilon:
return np.random.choice(4)
else:
return np.argmax(Q_matrix[state])
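# Quick empirical check of egreedy_policy (a sketch using a hypothetical one-row Q table):
# with epsilon = 0.9 an action is drawn uniformly at random 90% of the time, so the greedy
# action (index 2 for this dummy row) should be chosen roughly 0.1 + 0.9/4 = 32.5% of the time.
_dummy_Q = np.array([[0.0, 0.0, 5.0, 0.0]])
_choices = [egreedy_policy(_dummy_Q, 0, epsilon=0.9) for _ in range(10000)]
print('observed fraction of greedy choices:', np.mean(np.array(_choices) == 2))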
def q_learning(cliff, num_episodes=5000, render=False, decay = True, epsilon = 0.9,
epsilon_threshold = 0.75, decay1=0.999, decay2=0.9999, alpha_lr =0.1, gamma=0.9):
    '''
    Based on the 1992 technical note 'Q-learning' by Watkins and Dayan
    http://www.gatsby.ucl.ac.uk/~dayan/papers/cjch.pdf
    '''
#Initialise Q(s,a) matrix
Q_matrix = np.zeros((num_states, num_actions))
rewards_list = []
    max_reward = 84  # return of the optimal path on this layout: +100 at the goal, -1 for each step on the way
first_max_threshold = 0
episodes_to_max = 99999
for episode in range(num_episodes):
state = cliff.reset() #initialise state St
terminal_state = False
reward_sum = 0
while not terminal_state:
# Choose action (At) from state (St) using egreedy_policy
action = egreedy_policy(Q_matrix, state, epsilon)
# Execute the action (At) to the environment
#and observe new state (s at t+1)
next_state, reward, terminal_state = cliff.step(action)
#Receive immediate reward (r at t+1)
reward_sum += reward
# Update Q_matrix using Watkins and Dayan 1992 update rule
td_target = reward + gamma * np.max(Q_matrix[next_state])
td_error = td_target - Q_matrix[state][action]
Q_matrix[state][action] += alpha_lr * td_error
state = next_state
if decay:
if epsilon >= epsilon_threshold:
epsilon = decay1 * epsilon
else:
epsilon = decay2 * epsilon
            # Rendering every step slows training considerably, so it is left disabled here
            #if render:
            #    cliff.render(Q_matrix, action=actions[action], colorize_q=True)
rewards_list.append(reward_sum)
        if first_max_threshold < 1 and reward_sum == max_reward:
            print('Found optimal path after {} episodes'.format(episode))
episodes_to_max = episode
first_max_threshold += 1
if episode % 1000 == 0:
print('Episode: {} completed'.format(episode))
return rewards_list, Q_matrix, episodes_to_max
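# Worked example of the Watkins & Dayan update used in q_learning (made-up numbers, not from
# a real run): with alpha_lr = 0.1, gamma = 0.9, a step reward of -1, a current estimate
# Q(s, a) = 2.0 and max_a' Q(s', a') = 4.0, the estimate moves a fraction alpha_lr of the
# way towards the TD target.
_alpha_lr, _gamma = 0.1, 0.9
_q_sa, _max_q_next, _reward = 2.0, 4.0, -1
_td_target = _reward + _gamma * _max_q_next   # -1 + 0.9 * 4.0 = 2.6
_td_error = _td_target - _q_sa                # 2.6 - 2.0 = 0.6
_q_sa += _alpha_lr * _td_error                # 2.0 + 0.1 * 0.6 = 2.06
print(_q_sa)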
def sa_q_learning(cliff, num_episodes=5000, render=False, decay = True, epsilon = 0.9,
epsilon_threshold = 0.5, decay1=0.999, decay2=0.9999, alpha_lr =0.1, gamma=0.9):
    '''
    Based on the 2004 paper by Guo et al., 'A new Q-learning algorithm based on the Metropolis criterion'
    https://ieeexplore.ieee.org/document/1335509
    '''
#Initialise Q(s,a) matrix
Q_matrix = np.zeros((num_states, num_actions))
rewards_list = []
    max_reward = 84  # return of the optimal path on this layout: +100 at the goal, -1 for each step on the way
first_max_threshold = 0
episodes_to_max = 99999
for episode in range(num_episodes):
state = cliff.reset() #initialise state St
terminal_state = False
reward_sum = 0
while not terminal_state:
# Choose action (At) from state (St) using egreedy_policy
action_policy = egreedy_policy(Q_matrix, state, epsilon)
#select an action arbitrarily
action_r = np.random.choice(4)
#extract the Q values for both the policy action and the
# arbitrary action_r
Q_p = Q_matrix[state][action_policy]
Q_r = Q_matrix[state][action_r]
            # Generate a random value ξ ∈ (0, 1) and apply the Metropolis criterion:
            # accept the arbitrary action with probability exp((Q_r - Q_p) / epsilon),
            # treating the decaying epsilon as the annealing temperature
            ξ = np.random.random()
            if ξ < np.exp((Q_r - Q_p)/epsilon):
# Execute the action (At) to the environment
#and observe new state (s at t+1)
next_state, reward, terminal_state = cliff.step(action_r)
#Receive immediate reward (r at t+1)
reward_sum += reward
# Update Q_matrix using Watkins and Dayan 1992 update rule
td_target = reward + gamma * np.max(Q_matrix[next_state])
td_error = td_target - Q_matrix[state][action_r]
Q_matrix[state][action_r] += alpha_lr * td_error
else:
next_state, reward, terminal_state = cliff.step(action_policy)
#Receive immediate reward (r at t+1)
reward_sum += reward
# Update Q_matrix using Watkins and Dayan 1992 update rule
td_target = reward + gamma * np.max(Q_matrix[next_state])
td_error = td_target - Q_matrix[state][action_policy]
Q_matrix[state][action_policy] += alpha_lr * td_error
state = next_state
if decay:
if epsilon >= epsilon_threshold:
epsilon = decay1 * epsilon
else:
epsilon = decay2 * epsilon
            if render:
                # the executed action is either action_r or action_policy, so render without an action title
                cliff.render(Q_matrix, colorize_q=True)
rewards_list.append(reward_sum)
        if first_max_threshold < 1 and reward_sum == max_reward:
            print('Found optimal path after {} episodes'.format(episode))
episodes_to_max = episode
first_max_threshold += 1
if episode % 1000 == 0:
print('Episode: {} completed'.format(episode))
return rewards_list, Q_matrix, episodes_to_max
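# Illustration of the Metropolis acceptance rule used in sa_q_learning (made-up Q-values):
# the arbitrary action is accepted with probability exp((Q_r - Q_p) / epsilon), so an action
# that looks 2.0 worse than the policy action is still accepted occasionally while the
# "temperature" epsilon is high, and essentially never once epsilon has decayed.
_Q_p, _Q_r = 5.0, 3.0
for _eps in [0.9, 0.5, 0.1]:
    print('epsilon = {:.1f}: acceptance probability = {:.6f}'.format(_eps, np.exp((_Q_r - _Q_p) / _eps)))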
def bqsa_learning(cliff, num_episodes=5000, render=False, decay = True, epsilon = 0.9,
epsilon_threshold = 0.8, decay1=0.999, decay2=0.9999, alpha_lr =0.1, gamma=0.9):
    '''
    Backward-Q-based SARSA (BQSA) algorithm,
    based on the 2013 paper by Hao Wang et al.
    https://www.sciencedirect.com/science/article/abs/pii/S0952197613001176
    '''
#Initialise Q(s,a) matrix
Q_matrix = np.zeros((num_states, num_actions))
rewards_list = []
    max_reward = 84  # return of the optimal path on this layout: +100 at the goal, -1 for each step on the way
first_max_threshold = 0
episodes_to_max = 99999
for episode in range(num_episodes):
state = cliff.reset() #initialise state St
# Choose action (At) from state (St) using egreedy_policy
action = egreedy_policy(Q_matrix, state, epsilon)
terminal_state = False
goal_state = False
reward_sum = 0
M = []
N = 0
while not terminal_state and N < 200:
# Execute the action (At) to the environment
#and observe new state (s at t+1)
            next_state, reward, terminal_state = cliff.step(action)
            # Flag a successful episode (the terminal cell is the +100 objective rather than
            # the cliff) so that the backward sweep after the episode is triggered
            if terminal_state and reward == R_matrix[-1, -1]:
                goal_state = True
            #Receive immediate reward (r at t+1)
            reward_sum += reward
# Choose next action
next_action = egreedy_policy(Q_matrix, next_state, epsilon)
            # Store the transition (s_t, a_t, r_t+1, s_t+1) in the episode memory M
            M_i = {"i": N,
                   "st": state,
                   "at": action,
                   "rt+1": reward,
                   "st+1": next_state}
            M.append(M_i)
            # SARSA update: the TD target uses the Q value of the action actually chosen next
            update = alpha_lr * (reward + (gamma * Q_matrix[next_state][next_action]) - Q_matrix[state][action])
            Q_matrix[state][action] += update
# Update state and action
state = next_state
action = next_action
if decay:
if epsilon >= epsilon_threshold:
epsilon = decay1 * epsilon
else:
epsilon = decay2 * epsilon
#if render:
#cliff.render(Q_matrix, action=actions[action], colorize_q=True)
N += 1
rewards_list.append(reward_sum)
        # After a successful episode, replay the stored transitions backwards so the goal
        # reward propagates more quickly towards the earlier states of the episode
        if goal_state:
            for x in reversed(range(N)):
                update = alpha_lr * (M[x]["rt+1"] + (gamma * np.max(Q_matrix[M[x]["st+1"]])) - Q_matrix[M[x]["st"]][M[x]["at"]])
                Q_matrix[M[x]["st"]][M[x]["at"]] += update
        if first_max_threshold < 1 and reward_sum == max_reward:
            print('Found optimal path after {} episodes'.format(episode))
episodes_to_max = episode
first_max_threshold += 1
if episode % 1000 == 0:
print('Episode: {} completed'.format(episode))
return rewards_list, Q_matrix, episodes_to_max, M
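# Tiny illustration of the backward sweep used in bqsa_learning (a sketch with a hypothetical
# two-step memory M and a 3-state, 2-action Q table): after a successful episode the stored
# transitions are replayed from the last step back to the first, so the goal reward already
# reaches the earlier state after a single episode.
_Q = np.zeros((3, 2))
_alpha, _gamma = 0.1, 0.9
_M = [{"i": 0, "st": 0, "at": 1, "rt+1": -1, "st+1": 1},
      {"i": 1, "st": 1, "at": 1, "rt+1": 100, "st+1": 2}]
for _x in reversed(range(len(_M))):
    _m = _M[_x]
    _update = _alpha * (_m["rt+1"] + _gamma * np.max(_Q[_m["st+1"]]) - _Q[_m["st"]][_m["at"]])
    _Q[_m["st"]][_m["at"]] += _update
print(_Q)  # the first state's action value already reflects a discounted share of the +100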
%matplotlib inline
def parse_results_list(results_list):
# from list of dicts to PD dataframe
parsed_results_df = pd.DataFrame(results_list, index=None)
#print top 5 by episodes to optimal solution
print("Top 5 results, by episodes to find optimal path: \n",
parsed_results_df.sort_values('Epi_to_max', ascending = True).head())
#print top 5 by mean reward over 5000 episodes
print("\n \n Top 5 results, by mean reward over 5000 episodes: \n",
parsed_results_df.sort_values('Mean reward', ascending = False).head())
#filter out experiments that did not find optimal path
    completed_parsed_results_df = parsed_results_df[parsed_results_df.Epi_to_max < 99999]
#plot results
if len(completed_parsed_results_df) != 0:
completed_parsed_results_df.plot.scatter(x='Epi_to_max', y='Mean reward')
return completed_parsed_results_df
else:
print("No experiments in this batch made it to the end of the game .")
def run_Q (gamma_param = [0.05,0.1,0.2,0.3,0.4],
alpha_lr_param = [0.05,0.1,0.2,0.3,0.4],
decay_param = [True],
decay1 = [0.999],
decay2 = [0.9999],
verbose = True,
plot_figures = True,
parse_results = True
):
max_experiments = len(gamma_param)*len(alpha_lr_param)*len(decay_param)*len(decay1)*len(decay2)
results_list = []
experiment_id = 1
for d in decay1:
for e in decay2:
for i in gamma_param:
for j in alpha_lr_param:
for k in decay_param:
q_learning_rewards, Q, episodes_to_max = zip(*[q_learning(cliff, decay = k, decay1=d,decay2=e,gamma= i, render=True,
alpha_lr= j ) for _ in range(1)])
avg_rewards = np.mean(q_learning_rewards, axis=0)
mean_reward = [np.mean(avg_rewards)] * len(avg_rewards)
#max_reward = np.max(q_learning_rewards, axis=0)
if plot_figures:
fig, ax = plt.subplots()
ax.set_xlabel('Episodes')
ax.set_ylabel('Rewards')
ax.set_title('Experiment ID: {} \n Params: gamma = {}, alpha_lr = {}, decay = {} \n Mean Reward: {} \n Episodes to find optimal path: {}'.format(experiment_id, i, j, k, mean_reward[0],"Not found" if min(episodes_to_max) == 99999 else min(episodes_to_max) ))
rolling_avg_rewards = pd.Series(avg_rewards).rolling(20, min_periods=20).mean()
ax.plot(rolling_avg_rewards, color='green')
ax.plot(mean_reward, 'g--')
if min(episodes_to_max) != 99999:
ax.axvline(x=min(episodes_to_max), color='r', linestyle='--')
plt.tight_layout()
fig.savefig('{}_results_q.png'.format(experiment_id), pad_inches=1)
results = {'Experiment_ID': experiment_id,
'gamma': i,
'alpha_lr': j,
'decay': k,
'decay1':d,
'decay2':e,
#'Episode Length':steps,
'Mean reward': mean_reward[0],
'Epi_to_max' : min(episodes_to_max)
}
results_list.append(results)
if verbose:
print ('Experiment {} of {} complete'.format(experiment_id, max_experiments))
experiment_id += 1
if parse_results:
parsed_results_df = parse_results_list(results_list)
return parsed_results_df
else:
return results_list
Q_results = run_Q()
Q_results['Algorithm'] = 'Q-Learning'
Q_results.to_csv('/Users/JamesPaulPhelan/Desktop/Q_results', sep='\t')
Q_results
import seaborn as sns
data1 = Q_results.pivot(index="gamma", columns="alpha_lr", values="Mean reward")
ax = sns.heatmap(data1,annot=True,linewidths=.5,cbar_kws={'label': 'Mean Reward'})
fig = ax.get_figure()
fig.savefig('QMR.png')
data2 = Q_results.pivot(index="gamma", columns="alpha_lr", values="Epi_to_max")
ax = sns.heatmap(data2,linewidths=.5,cbar_kws={'label': 'Episodes to Max'})
fig = ax.get_figure()
fig.savefig('QETM.png')
def run_SA_Q (gamma_param = [0.7,0.75,0.8,0.85,0.9],
alpha_lr_param = [0.09,0.095,0.1,0.15,0.2],
decay_param = [True],
decay1 = [0.999],
decay2 = [0.9999],
verbose = True,
plot_figures = True,
parse_results = True
):
max_experiments = len(gamma_param)*len(alpha_lr_param)
results_list = []
experiment_id = 1
for d in decay1:
for e in decay2:
for i in gamma_param:
for j in alpha_lr_param:
rewards_list, Q_matrix, episodes_to_max = zip(*[sa_q_learning(cliff, num_episodes=5000, render=False, epsilon = 0.9,
epsilon_threshold = 0.8, decay1=d, decay2=e, alpha_lr =j, gamma=i) for _ in range(1)])
avg_rewards = np.mean(rewards_list, axis=0)
mean_reward = [np.mean(avg_rewards)] * len(avg_rewards)
#max_reward = np.max(q_learning_rewards, axis=0)
if plot_figures:
fig, ax = plt.subplots()
ax.set_xlabel('Episodes')
ax.set_ylabel('Rewards')
ax.set_title('Experiment ID: {} \n Params: gamma = {}, alpha_lr = {} \n Mean Reward: {} \n Episodes to find optimal path: {}'.format(experiment_id, i, j, mean_reward[0],"Not found" if min(episodes_to_max) == 99999 else min(episodes_to_max) ))
rolling_avg_rewards = pd.Series(avg_rewards).rolling(20, min_periods=20).mean()
ax.plot(rolling_avg_rewards)
ax.plot(mean_reward, 'g--')
if min(episodes_to_max) != 99999:
ax.axvline(x=min(episodes_to_max), color='r', linestyle='--')
plt.tight_layout()
fig.savefig('{}_results_sa_q.jpg'.format(experiment_id), pad_inches=1)
results = {'Experiment_ID': experiment_id,
'gamma': i,
'alpha_lr': j,
'decay1':d,
'decay2':e,
'Mean reward': mean_reward[0],
'Epi_to_max' : min(episodes_to_max)
}
results_list.append(results)
if verbose:
print ('Experiment {} of {} complete'.format(experiment_id, max_experiments))
experiment_id += 1
if parse_results:
parsed_results_df = parse_results_list(results_list)
return parsed_results_df
else:
return results_list
SA_Q_results = run_SA_Q()
SA_Q_results['Algorithm'] = 'SAQ-Learning'
SA_Q_results
SA_Q_results.to_csv('/Users/JamesPaulPhelan/Desktop/SA_Q_results', sep='\t')
data3 = SA_Q_results.pivot(index="gamma", columns="alpha_lr", values="Mean reward")
ax = sns.heatmap(data3,annot=True,linewidths=.5,cbar_kws={'label': 'Mean Reward'})
fig = ax.get_figure()
fig.savefig('SAQMR.png')
data4 = SA_Q_results.pivot(index="gamma", columns="alpha_lr", values="Epi_to_max")
ax = sns.heatmap(data4,linewidths=.5,cbar_kws={'label': 'Episodes to Max'})
fig = ax.get_figure()
fig.savefig('SAQETM.png')
def run_bqsa (gamma_param = [0.09,0.095,0.1,0.15,0.2],
alpha_lr_param = [0.09,0.095,0.1,0.15,0.2],
decay_param = [True],
decay1 = [0.999],
decay2 = [0.999],
verbose = True,
plot_figures = True,
parse_results = True,
render1 = False
):
max_experiments = len(gamma_param)*len(alpha_lr_param)
results_list = []
experiment_id = 1
for d in decay1:
for e in decay2:
for i in gamma_param:
for j in alpha_lr_param:
rewards_list, Q_matrix, episodes_to_max, M = zip(*[bqsa_learning(cliff, alpha_lr =j, gamma=i,decay1=d, decay2=e, epsilon = 0.9,
epsilon_threshold = 0.8) for _ in range(1)])
avg_rewards = np.mean(rewards_list, axis=0)
mean_reward = [np.mean(avg_rewards)] * len(avg_rewards)
#max_reward = np.max(q_learning_rewards, axis=0)
if plot_figures:
fig, ax = plt.subplots()
ax.set_xlabel('Episodes')
ax.set_ylabel('Rewards')
ax.set_title('Experiment ID: {} \n Params: gamma = {}, alpha_lr = {} \n Mean Reward: {} \n Episodes to find optimal path: {}'.format(experiment_id, i, j, mean_reward[0],"Not found" if min(episodes_to_max) == 99999 else min(episodes_to_max) ))
rolling_avg_rewards = pd.Series(avg_rewards).rolling(20, min_periods=20).mean()
ax.plot(rolling_avg_rewards, color='orange')
ax.plot(mean_reward, 'g--')
if min(episodes_to_max) != 99999:
ax.axvline(x=min(episodes_to_max), color='r', linestyle='--')
plt.tight_layout()
plt.subplots_adjust(top=0.88)
fig.savefig('{}_results_bqsa.jpg'.format(experiment_id), pad_inches=1)
results = {'Experiment_ID': experiment_id,
'gamma': i,
'alpha_lr': j,
'decay1':d,
'decay2':e,
'Mean reward': mean_reward[0],
'Epi_to_max' : min(episodes_to_max)
}
results_list.append(results)
if verbose:
print ('Experiment {} of {} complete'.format(experiment_id, max_experiments))
experiment_id += 1
if parse_results:
parsed_results_df = parse_results_list(results_list)
return parsed_results_df
else:
return results_list
bqsa_results = run_bqsa()
bqsa_results['Algorithm'] = 'BQSA-Learning'
data5 = bqsa_results.pivot(index="gamma", columns="alpha_lr", values="Mean reward")
ax = sns.heatmap(data5,annot=True,linewidths=.5,cbar_kws={'label': 'Mean Reward'})
data6 = bqsa_results.pivot(index="gamma", columns="alpha_lr", values="Epi_to_max")
ax = sns.heatmap(data6,annot=False,linewidths=.5,cbar_kws={'label': 'Episodes to Max'})
fig, ax = plt.subplots()
ax.set_xlabel('Episodes to max')
ax.set_ylabel('Mean reward')
ax.set_title('Comparison between the performance of Q-Learning, SAQ-Learning and BQSA-Learning')
ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling
ax.plot(Q_results.Epi_to_max, Q_results['Mean reward'], marker='o', linestyle='', ms=12, label='Q-Learning')
ax.plot(SA_Q_results.Epi_to_max, SA_Q_results['Mean reward'], marker='o', linestyle='', ms=12, label='SAQ-Learning')
ax.plot(bqsa_results.Epi_to_max, bqsa_results['Mean reward'], marker='o', linestyle='', ms=12, label='BQSA-Learning')
ax.legend()
plt.show()
bqsa_results2 = run_bqsa()
bqsa_results2['Algorithm'] = 'BQSA-Learning'
bqsa_results2
bqsa_results2.to_csv('/Users/JamesPaulPhelan/Desktop/bqsa_results2', sep='\t')
data7 = bqsa_results2.pivot(index="gamma", columns="alpha_lr", values="Mean reward")
ax = sns.heatmap(data7,annot=True,linewidths=.5,cbar_kws={'label': 'Mean reward'})
fig = ax.get_figure()
fig.savefig('BQSAMR.png')
data8 = bqsa_results2.pivot(index="gamma", columns="alpha_lr", values="Epi_to_max")
ax = sns.heatmap(data8,annot=False,linewidths=.5,cbar_kws={'label': 'Episodes to Max'})
fig = ax.get_figure()
fig.savefig('BQSAETM.png')
fig, ax = plt.subplots()
ax.set_xlabel('Episodes to max')
ax.set_ylabel('Mean reward')
ax.set_title('Comparison between the performance of Q-Learning, SAQ-Learning and BQSA-Learning')
ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling
ax.plot(Q_results.Epi_to_max, Q_results['Mean reward'], marker='o', linestyle='', ms=12, label='Q-Learning')
ax.plot(SA_Q_results.Epi_to_max, SA_Q_results['Mean reward'], marker='o', linestyle='', ms=12, label='SAQ-Learning')
ax.plot(bqsa_results2.Epi_to_max, bqsa_results2['Mean reward'], marker='o', linestyle='', ms=12, label='BQSA-Learning')
ax.legend()
fig = ax.get_figure()
fig.savefig('Comparison.png')
plt.show()