Skip to content
Permalink
5ba1232712
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
James Brusey first version
Latest commit 5ba1232 Oct 15, 2024 History
0 contributors

Users who have contributed to this file

125 lines (92 sloc) 2.76 KB
#!/usr/bin/env python
import numpy as np
# Actions are the integers 0..3: action k applies k quarter-turn rotations
# (np.rot90) to the base outcome grid `aprob` inside psa().
A = {0, 1, 2, 3}

# One display glyph per action index, followed by a filled square that main()
# uses (as index 4) for the slot holding the absorbing state.
ARROWS = '↑←↓→■'

# A hand-written policy with one action per entry of S_ordering.  Nothing in
# the visible code reads it.
PI = np.array([
    3, 3, 3, 0,
    2, 0, 3, 0,
    3, 3, 0, 0,
])

# Outcome probabilities for action 0 over the 3x3 neighbourhood of the current
# cell.  psa() reads it as [y offset + 1][x offset + 1]: 0.8 for the cell in
# the first row, 0.1 for each of the two side cells, 0 everywhere else.
aprob = [
    [0., 0.8, 0.],
    [.1, 0, .1],
    [0., 0., 0.],
]

# Sentinel first coordinate of the absorbing state (AB, 0).  Because no
# ordinary cell can have this coordinate, it is never reached by a move.
AB = 999

# States in index order: (x, y) pairs laid out as 3 rows of 4, with the slot
# where (1, 1) would be taken by the absorbing state instead.  Since (1, 1)
# is therefore not a state, psa() treats moves into it like moves off the grid.
S_ordering = [(col, row) for row in range(3) for col in range(4)]
S_ordering[5] = (AB, 0)
# S_ordering = [(0,0), (0,1), (0,2), (0,3),
# (1,0), (1,2), (1,3),
# (2,0), (2,1), (2,2), (2,3) ]

# Set view of the states for O(1) membership tests.
S = set(S_ordering)

# States from which psa() sends all probability mass to (AB, 0).
TERMINAL = {(AB, 0), (3, 0), (3, 1)}

# Reward attached to every state that is not special-cased in R below.
MV = -0.04

# Per-state reward, indexed like S_ordering: +1 at (3, 0), 0 at the absorbing
# state, -1000 at (3, 1) and MV everywhere else.
R = np.array([
    MV, MV, MV, 1,
    MV, 0, MV, -1000,
    MV, MV, MV, MV,
])

# Discount factor, kept a hair below 1: with GAMMA == 1 the row of vpi()'s
# linear system belonging to the absorbing state would be all zeros.
GAMMA = 1 - 1e-9
def psa(s_i, a):
    """Return the next-state distribution for state index ``s_i`` and action ``a``.

    The result is a 1-D array with one entry per element of ``S_ordering``.
    States in ``TERMINAL`` put all their mass on the ``(AB, 0)`` state.  For
    every other state the 3x3 outcome grid ``aprob``, rotated ``a`` quarter
    turns, is laid over the neighbourhood of the state; any mass that would
    land on a cell that is not a state stays on the current state instead.
    """
    state = S_ordering[s_i]
    dist = np.zeros(len(S_ordering))
    assert a in A
    outcome = np.rot90(aprob, a)
    assert state in S

    if state in TERMINAL:
        dist[S_ordering.index((AB, 0))] = 1
        return dist

    x, y = state
    for dx in range(3):
        for dy in range(3):
            # Grid offsets 0..2 map to displacements -1..+1.
            target = (x + dx - 1, y + dy - 1)
            weight = outcome[dy, dx]
            if target in S:
                dist[S_ordering.index(target)] += weight
            else:
                # Off the grid or into the missing cell: stay where we are.
                dist[s_i] += weight
    return dist
def vpi(pi):
    """Return the state values of policy ``pi`` as a 1-D array.

    ``pi[i]`` is the action taken in state ``S_ordering[i]``.  With ``P`` the
    matrix whose row ``i`` is ``psa(i, pi[i])``, the values are the solution
    of the linear system ``(I - GAMMA * P) V = R``.
    """
    n = len(S_ordering)
    coef = np.zeros((n, n))
    for row in range(n):
        # Row of -GAMMA * P, then add the identity's 1 on the diagonal.
        coef[row, :] = -GAMMA * psa(row, pi[row])
        coef[row, row] += 1
    return np.linalg.solve(coef, R)
def bestaction(V):
    """Return the greedy policy for the state values ``V``.

    For each state the expected next-state value ``sum_s' P(s'|s,a) V[s']`` is
    computed for every action in ``A``, and the action with the largest value
    is chosen (``np.argmax`` keeps the lowest index on ties).  The policy is
    returned as a float array with one entry per element of ``S_ordering``.
    """
    n_states = len(S_ordering)
    pi = np.zeros(n_states)
    for s_i in range(n_states):
        exp_v = np.zeros(len(A))
        for a in A:
            probs = psa(s_i, a)
            # Plain left-to-right accumulation over the successor states.
            exp_v[a] = sum(probs[k] * V[k] for k in range(n_states))
        pi[s_i] = np.argmax(exp_v)
    return pi
def policy_iteration(pi):
    """Alternate policy evaluation and greedy improvement until the policy is stable.

    Starts from policy ``pi`` and returns ``(V, pi)``: the first policy that
    ``bestaction`` leaves unchanged, together with its values from ``vpi``.
    """
    V = vpi(pi)
    improved = bestaction(V)
    while not np.array_equal(pi, improved):
        pi = improved
        V = vpi(pi)
        improved = bestaction(V)
    return (V, pi)
def main():
    """Run policy iteration from the all-zeros policy and print the results.

    Prints, in order: the elapsed solve time in seconds, the raw value
    vector, the values as a 3x4 table, and the policy as a 3x4 table of
    arrow glyphs.
    """
    import time

    # Display layout: S_ordering is listed as 3 rows of 4 states.
    n_rows, n_cols = 3, 4

    # perf_counter() is the clock meant for measuring intervals; time.time()
    # is wall-clock time and can jump if the system clock is adjusted.
    start = time.perf_counter()
    (V, pi) = policy_iteration(np.zeros(len(S_ordering)))
    print(time.perf_counter() - start)

    print(V)
    for row in range(n_rows):
        cells = [f'{V[row * n_cols + col]:.3f}' for col in range(n_cols)]
        print('| ' + ' | '.join(cells) + ' |')

    # Show the slot holding the absorbing state with the last glyph in ARROWS
    # (the filled square) instead of an arrow.  This overwrites the local
    # policy array, which is only used for display from here on.
    pi[S_ordering.index((AB, 0))] = len(ARROWS) - 1
    for row in range(n_rows):
        glyphs = [ARROWS[int(pi[row * n_cols + col])] for col in range(n_cols)]
        print('| ' + ' | '.join(glyphs) + ' |')


if __name__ == "__main__":
    main()