Commit 1c5202f: Add files via upload
khanm442 committed Nov 27, 2024 (0 parents)

Showing 1 changed file with 113 additions and 0 deletions: Q-Learning.py
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
import pickle

def acrobat(is_training=True, render=False):
    env = gym.make('Acrobot-v1', render_mode='human' if render else None)

    # Discretize each of the six continuous observation dimensions into 15 bins.
    th1_cos = np.linspace(env.observation_space.low[0], env.observation_space.high[0], 15)
    th1_sin = np.linspace(env.observation_space.low[1], env.observation_space.high[1], 15)
    th2_cos = np.linspace(env.observation_space.low[2], env.observation_space.high[2], 15)
    th2_sin = np.linspace(env.observation_space.low[3], env.observation_space.high[3], 15)
    th1_ang = np.linspace(env.observation_space.low[4], env.observation_space.high[4], 15)
    th2_ang = np.linspace(env.observation_space.low[5], env.observation_space.high[5], 15)
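    # Note: np.digitize(x, bins) returns an index in 0..len(bins) inclusive
    # (values below bins[0] map to 0, values at or above bins[-1] to len(bins)),
    # which is why every state axis of the Q-table below has len(bins)+1 = 16 entries.
    # The full table is 16^6 * 3 ≈ 5e7 floats, roughly 400 MB as float64.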

    if is_training:
        q = np.zeros((len(th1_cos)+1, len(th1_sin)+1, len(th2_cos)+1, len(th2_sin)+1,
                      len(th1_ang)+1, len(th2_ang)+1, env.action_space.n))
    else:
        with open('acrobat.pkl', 'rb') as f:
            q = pickle.load(f)

    learning_rate = 0.1
    discount_factor = 0.9
    epsilon = 1                      # exploration rate, starts at 1 and decays to 0.01
    rng = np.random.default_rng()

    rewards_per_episode = []
    i = 0

    while True:

        state = env.reset()[0]

        # Observation layout: [cos(th1), sin(th1), cos(th2), sin(th2), th1 velocity, th2 velocity]
        state_cos1 = np.digitize(state[0], th1_cos)
        state_sin1 = np.digitize(state[1], th1_sin)
        state_cos2 = np.digitize(state[2], th2_cos)
        state_sin2 = np.digitize(state[3], th2_sin)
        state_ang1 = np.digitize(state[4], th1_ang)
        state_ang2 = np.digitize(state[5], th2_ang)

        terminated = False
        truncated = False
        rewards = 0

        while not terminated and not truncated:

            # Take a random action only while epsilon allows it and this state's
            # Q-values are still all equal (i.e. not yet updated); otherwise act greedily.
            if is_training and rng.random() < epsilon and np.all(
                    q[state_cos1, state_sin1, state_cos2, state_sin2, state_ang1, state_ang2, :] ==
                    q[state_cos1, state_sin1, state_cos2, state_sin2, state_ang1, state_ang2, 0]):
                action = env.action_space.sample()
            else:
                action = np.argmax(q[state_cos1, state_sin1, state_cos2, state_sin2, state_ang1, state_ang2, :])

            new_state, reward, terminated, truncated, _ = env.step(action)

            new_state_cos1 = np.digitize(new_state[0], th1_cos)
            new_state_sin1 = np.digitize(new_state[1], th1_sin)
            new_state_cos2 = np.digitize(new_state[2], th2_cos)
            new_state_sin2 = np.digitize(new_state[3], th2_sin)
            new_state_ang1 = np.digitize(new_state[4], th1_ang)
            new_state_ang2 = np.digitize(new_state[5], th2_ang)

            if is_training:
                # Tabular Q-learning update:
                # Q(s,a) += lr * (reward + gamma * max_a' Q(s',a') - Q(s,a))
                q[state_cos1, state_sin1, state_cos2, state_sin2, state_ang1, state_ang2, action] += \
                    learning_rate * (
                        reward +
                        discount_factor * np.max(q[new_state_cos1, new_state_sin1, new_state_cos2,
                                                   new_state_sin2, new_state_ang1, new_state_ang2, :]) -
                        q[state_cos1, state_sin1, state_cos2, state_sin2, state_ang1, state_ang2, action]
                    )

            state = new_state
            state_cos1 = new_state_cos1
            state_sin1 = new_state_sin1
            state_cos2 = new_state_cos2
            state_sin2 = new_state_sin2
            state_ang1 = new_state_ang1
            state_ang2 = new_state_ang2

            rewards += reward

        rewards_per_episode.append(rewards)
        mean_rewards = np.mean(rewards_per_episode[-100:])
        if i % 100 == 0:
            print(f'Episode number: {i}, Epsilon: {epsilon}, Rewards: {rewards}, Mean reward: {mean_rewards}')

        # Stop once the rolling mean (or a single episode) beats -100, or after 10000 episodes.
        if mean_rewards > -100 or rewards > -100 or i == 10000:
            break

        epsilon = max(epsilon - 0.0005, 0.01)   # linear decay with a 0.01 floor
        i += 1

    env.close()

    # Save Q-table to file
    if is_training:
        with open('acrobat.pkl', 'wb') as f:
            pickle.dump(q, f)

    # Plot the 100-episode rolling mean of the reward.
    mean_rewards = []
    for t in range(i):
        mean_rewards.append(np.mean(rewards_per_episode[max(0, t-100):(t+1)]))
    plt.plot(mean_rewards)
    plt.savefig('acrobat.png')

if __name__ == '__main__':
    acrobat()
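A minimal usage sketch (assuming a training run has already written acrobat.pkl to the working directory):

    acrobat(is_training=True, render=False)   # train, save acrobat.pkl and acrobat.png
    acrobat(is_training=False, render=True)   # load acrobat.pkl and render the greedy policy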
