# veepi.py

#!/usr/bin/env python

import numpy as np

# Action indices. Action k uses the `aprob` stencil rotated by k quarter
# turns (see np.rot90 in psa).
A = {0, 1, 2, 3}

# One glyph per action index, plus a filled square (index 4) that main()
# uses for the blocked cell.
ARROWS = '↑←↓→■'

# A hand-written policy laid out like S_ordering (4 columns x 3 rows).
# It is not referenced by the functions in this file.
PI = np.array([
    3, 3, 3, 0,
    2, 0, 3, 0,
    3, 3, 0, 0,
])

# 3x3 stencil of move probabilities for action 0, centred on the current
# cell: 0.8 to the cell in the row above, 0.1 to each side neighbour.
aprob = [
    [0., 0.8, 0.],
    [.1, 0, .1],
    [0., 0., 0.],
]

# Sentinel x-coordinate: (AB, 0) occupies the slot where (1, 1) would be,
# so (1, 1) is never a valid destination. psa() also uses (AB, 0) as the
# absorbing successor of every terminal state.
AB = 999

# State index -> (x, y) coordinate, row by row.
S_ordering = [
    (0, 0), (1, 0), (2, 0), (3, 0),
    (0, 1), (AB, 0), (2, 1), (3, 1),
    (0, 2), (1, 2), (2, 2), (3, 2),
]
# Earlier (row, col) ordering, kept for reference:
# S_ordering = [(0,0), (0,1), (0,2), (0,3),
#               (1,0),        (1,2), (1,3),
#               (2,0), (2,1), (2,2), (2,3) ]

S = set(S_ordering)
TERMINAL = {(AB, 0), (3, 0), (3, 1)}

# Reward assigned to every ordinary (non-terminal, non-blocked) state.
MV = -0.04

# Per-state reward, indexed like S_ordering.
# NOTE(review): the penalty at (3, 1) is -1000 — confirm this magnitude
# is intended.
R = np.array([
    MV, MV, MV, 1,
    MV, 0, MV, -1000,
    MV, MV, MV, MV,
])

# Discount factor, kept a hair below 1.
GAMMA = 1 - 1e-9


def psa(s_i, a):
    """Return the next-state distribution for taking action `a` in state `s_i`.

    Args:
        s_i: Index into S_ordering of the current state.
        a: Action, a member of A; selects how many quarter turns the
            `aprob` stencil is rotated.

    Returns:
        A 1-D array with one probability per entry of S_ordering.
    """
    origin = S_ordering[s_i]
    dist = np.zeros(len(S_ordering))
    assert a in A

    # Rotate the action-0 stencil so that it matches action `a`.
    stencil = np.rot90(aprob, a)

    assert origin in S

    # Terminal states (including the absorbing slot itself) always move to
    # the absorbing slot (AB, 0).
    if origin in TERMINAL:
        dist[S_ordering.index((AB, 0))] = 1
        return dist

    x, y = origin
    for dx in (-1, 0, 1):
        for dy in (-1, 0, 1):
            weight = stencil[dy + 1, dx + 1]
            target = (x + dx, y + dy)
            # A move to a coordinate that is not a state (off the grid or
            # the blocked cell) leaves the agent where it is.
            landing = S_ordering.index(target) if target in S else s_i
            dist[landing] += weight
    return dist


def vpi(pi):
    """Return the value of every state under policy `pi`.

    Solves the linear system (I - GAMMA * P) V = R, where row s of P is
    psa(s, pi[s]). Written out per state s1:

        (1 - GAMMA*P(s1|s1,pi(s1))) V(s1) - GAMMA*P(s2|s1,pi(s1)) V(s2) - ...
            = R(s1)

    Args:
        pi: Sequence giving the action chosen in each state, indexed like
            S_ordering.

    Returns:
        The solution vector from np.linalg.solve, indexed like S_ordering.
    """
    n_states = len(S_ordering)
    coef = np.zeros((n_states, n_states))
    for s_i in range(n_states):
        # Fill the whole row at once, then add the identity term.
        coef[s_i, :] = -GAMMA * psa(s_i, pi[s_i])
        coef[s_i, s_i] += 1

    return np.linalg.solve(coef, R)


def bestaction(V):
    """Return the greedy policy with respect to the value vector `V`.

    For each state, the chosen action is the one whose next-state
    distribution (from psa) has the highest expected value under `V`.

    Args:
        V: Per-state values, indexed like S_ordering.

    Returns:
        A float array holding one action index per state.
    """
    n_states = len(S_ordering)
    greedy = np.zeros(n_states)
    for s_i in range(n_states):
        exp_v = np.zeros(len(A))
        for a in A:
            # Accumulate strictly left to right so the rounding, and hence
            # argmax tie-breaking, does not depend on a library reduction.
            total = exp_v[a]
            for sdash_i, prob in enumerate(psa(s_i, a)):
                total += prob * V[sdash_i]
            exp_v[a] = total
        greedy[s_i] = np.argmax(exp_v)

    return greedy


def policy_iteration(pi):
    """Alternate policy evaluation and greedy improvement until stable.

    Args:
        pi: Initial policy, one action index per state.

    Returns:
        A tuple (V, pi): the first policy that its own greedy improvement
        leaves unchanged, and the value vector computed for it.
    """
    V = vpi(pi)
    improved = bestaction(V)
    while not np.array_equal(pi, improved):
        pi = improved
        V = vpi(pi)
        improved = bestaction(V)
    return (V, pi)


def main():
    """Run policy iteration from the all-zeros policy and print the result.

    Prints, in order: the elapsed time in seconds, the raw value vector,
    the values as a 3x4 grid (three decimals), and the policy as a 3x4
    grid of arrows with a filled square in the blocked cell.
    """
    import time

    n_rows, n_cols = 3, 4

    # perf_counter is monotonic, so the measured interval cannot be skewed
    # by wall-clock adjustments the way time.time() can.
    start = time.perf_counter()
    (V, pi) = policy_iteration(np.zeros(len(S_ordering)))
    print(time.perf_counter() - start)

    print(V)
    for row in range(n_rows):
        cells = [f'{V[row * n_cols + col]:.3f}' for col in range(n_cols)]
        print('| ' + ' | '.join(cells) + ' |')

    # Draw the blocked slot with the last glyph of ARROWS (the square)
    # instead of an arrow; look the slot up rather than hard-coding it.
    pi[S_ordering.index((AB, 0))] = len(ARROWS) - 1
    for row in range(n_rows):
        glyphs = [ARROWS[int(pi[row * n_cols + col])]
                  for col in range(n_cols)]
        print('| ' + ' | '.join(glyphs) + ' |')


# Run the demo only when this file is executed as a script.
if __name__ == "__main__":
    main()
	#!/usr/bin/env python

	import numpy as np

	# Action indices. Action k uses the `aprob` stencil rotated by k quarter
	# turns (see np.rot90 in psa).
	A = {0, 1, 2, 3}

	# One glyph per action index, plus a filled square (index 4) that main()
	# uses for the blocked cell.
	ARROWS = '↑←↓→■'

	# A hand-written policy laid out like S_ordering (4 columns x 3 rows).
	# It is not referenced by the functions in this file.
	PI = np.array([
	    3, 3, 3, 0,
	    2, 0, 3, 0,
	    3, 3, 0, 0,
	])

	# 3x3 stencil of move probabilities for action 0, centred on the current
	# cell: 0.8 to the cell in the row above, 0.1 to each side neighbour.
	aprob = [
	    [0., 0.8, 0.],
	    [.1, 0, .1],
	    [0., 0., 0.],
	]

	# Sentinel x-coordinate: (AB, 0) occupies the slot where (1, 1) would be,
	# so (1, 1) is never a valid destination. psa() also uses (AB, 0) as the
	# absorbing successor of every terminal state.
	AB = 999

	# State index -> (x, y) coordinate, row by row.
	S_ordering = [
	    (0, 0), (1, 0), (2, 0), (3, 0),
	    (0, 1), (AB, 0), (2, 1), (3, 1),
	    (0, 2), (1, 2), (2, 2), (3, 2),
	]
	# Earlier (row, col) ordering, kept for reference:
	# S_ordering = [(0,0), (0,1), (0,2), (0,3),
	#               (1,0),        (1,2), (1,3),
	#               (2,0), (2,1), (2,2), (2,3) ]

	S = set(S_ordering)
	TERMINAL = {(AB, 0), (3, 0), (3, 1)}

	# Reward assigned to every ordinary (non-terminal, non-blocked) state.
	MV = -0.04

	# Per-state reward, indexed like S_ordering.
	# NOTE(review): the penalty at (3, 1) is -1000 — confirm this magnitude
	# is intended.
	R = np.array([
	    MV, MV, MV, 1,
	    MV, 0, MV, -1000,
	    MV, MV, MV, MV,
	])

	# Discount factor, kept a hair below 1.
	GAMMA = 1 - 1e-9


	def psa(s_i, a):
	    """Return the next-state distribution for taking action `a` in `s_i`.

	    Args:
	        s_i: Index into S_ordering of the current state.
	        a: Action, a member of A; selects how many quarter turns the
	            `aprob` stencil is rotated.

	    Returns:
	        A 1-D array with one probability per entry of S_ordering.
	    """
	    origin = S_ordering[s_i]
	    dist = np.zeros(len(S_ordering))
	    assert a in A

	    # Rotate the action-0 stencil so that it matches action `a`.
	    stencil = np.rot90(aprob, a)

	    assert origin in S

	    # Terminal states (including the absorbing slot itself) always move
	    # to the absorbing slot (AB, 0).
	    if origin in TERMINAL:
	        dist[S_ordering.index((AB, 0))] = 1
	        return dist

	    x, y = origin
	    for dx in (-1, 0, 1):
	        for dy in (-1, 0, 1):
	            weight = stencil[dy + 1, dx + 1]
	            target = (x + dx, y + dy)
	            # A move to a coordinate that is not a state (off the grid
	            # or the blocked cell) leaves the agent where it is.
	            landing = S_ordering.index(target) if target in S else s_i
	            dist[landing] += weight
	    return dist


	def vpi(pi):
	    """Return the value of every state under policy `pi`.

	    Solves the linear system (I - GAMMA * P) V = R, where row s of P is
	    psa(s, pi[s]). Written out per state s1:

	        (1 - GAMMA*P(s1|s1,pi(s1))) V(s1) - GAMMA*P(s2|s1,pi(s1)) V(s2)
	            - ... = R(s1)

	    Args:
	        pi: Sequence giving the action chosen in each state, indexed
	            like S_ordering.

	    Returns:
	        The solution vector from np.linalg.solve, indexed like
	        S_ordering.
	    """
	    n_states = len(S_ordering)
	    coef = np.zeros((n_states, n_states))
	    for s_i in range(n_states):
	        # Fill the whole row at once, then add the identity term.
	        coef[s_i, :] = -GAMMA * psa(s_i, pi[s_i])
	        coef[s_i, s_i] += 1

	    return np.linalg.solve(coef, R)


	def bestaction(V):
	    """Return the greedy policy with respect to the value vector `V`.

	    For each state, the chosen action is the one whose next-state
	    distribution (from psa) has the highest expected value under `V`.

	    Args:
	        V: Per-state values, indexed like S_ordering.

	    Returns:
	        A float array holding one action index per state.
	    """
	    n_states = len(S_ordering)
	    greedy = np.zeros(n_states)
	    for s_i in range(n_states):
	        exp_v = np.zeros(len(A))
	        for a in A:
	            # Accumulate strictly left to right so the rounding, and
	            # hence argmax tie-breaking, stays deterministic.
	            total = exp_v[a]
	            for sdash_i, prob in enumerate(psa(s_i, a)):
	                total += prob * V[sdash_i]
	            exp_v[a] = total
	        greedy[s_i] = np.argmax(exp_v)

	    return greedy


	def policy_iteration(pi):
	    """Alternate policy evaluation and greedy improvement until stable.

	    Args:
	        pi: Initial policy, one action index per state.

	    Returns:
	        A tuple (V, pi): the first policy that its own greedy
	        improvement leaves unchanged, and the value vector computed
	        for it.
	    """
	    while True:
	        V = vpi(pi)

	        pi_dash = bestaction(V)
	        if np.array_equal(pi, pi_dash):
	            return (V, pi)
	        pi = pi_dash


	def main():
	    """Run policy iteration from the all-zeros policy and print the result.

	    Prints, in order: the elapsed time in seconds, the raw value vector,
	    the values as a 3x4 grid (three decimals), and the policy as a 3x4
	    grid of arrows with a filled square in the blocked cell.
	    """
	    import time

	    n_rows, n_cols = 3, 4

	    # perf_counter is monotonic, so the measured interval cannot be
	    # skewed by wall-clock adjustments the way time.time() can.
	    start = time.perf_counter()
	    (V, pi) = policy_iteration(np.zeros(len(S_ordering)))
	    print(time.perf_counter() - start)

	    print(V)
	    for row in range(n_rows):
	        cells = [f'{V[row * n_cols + col]:.3f}' for col in range(n_cols)]
	        # Plain '|' separators: the backslash-escaped form would print
	        # a literal backslash before every bar.
	        print('| ' + ' | '.join(cells) + ' |')

	    # Draw the blocked slot with the last glyph of ARROWS (the square)
	    # instead of an arrow; look the slot up rather than hard-coding it.
	    pi[S_ordering.index((AB, 0))] = len(ARROWS) - 1
	    for row in range(n_rows):
	        glyphs = [ARROWS[int(pi[row * n_cols + col])]
	                  for col in range(n_cols)]
	        print('| ' + ' | '.join(glyphs) + ' |')


	# Run the demo only when this file is executed as a script.
	if __name__ == "__main__":
	    main()