天天看點

[MRP]代碼紀錄

MRP

"""這是MRP疊代過程(Markov Reward Process馬爾可夫報酬過程)"""
 
 
# Iteration step
def next_v(_lambda, r_list, old_v_list, weight_list):
    """One synchronous Bellman backup for a Markov Reward Process.

    _lambda: discount factor γ — the present value of future rewards.
    r_list: immediate reward of each state.
    old_v_list: value estimates from the previous sweep; the LAST entry is
        assumed to be the terminal (Sleep) state.
    weight_list: per-state transition lists; row j holds [probability, successor]
        pairs for state j (the terminal row is never read).
    Returns the new value list: v'(j) = r(j) + γ * Σ_k p_jk * v(k),
    with the terminal state pinned to 0.0.
    """
    new_v_list = []
    # Back up every state except the last (terminal) one.
    for j in range(len(old_v_list) - 1):
        # Probability-weighted value of the successor states.
        expected = sum(prob * old_v_list[succ] for prob, succ in weight_list[j])
        new_v_list.append(r_list[j] + _lambda * expected)

    # The terminal state has no successors, so its value is fixed at 0.
    new_v_list.append(0.0)
    return new_v_list
 
 
if __name__ == '__main__':
    # Discount factor γ: 1 means future rewards are not discounted at all.
    gamma = 1

    # Reward order: Class 1, Class 2, Class 3, Pass, Pub, Facebook, Sleep,
    # i.e. indices 0, 1, 2, 3, 4, 5, 6; every list below uses this ordering.
    rewards = [-2., -2., -2., 10., 1., -1., 0.]  # immediate rewards

    # Value function, initialised to zero for every state.
    values = [0, 0, 0, 0, 0, 0, 0]

    # Transition probabilities (kept as nested lists rather than a matrix;
    # a reader may prefer a probability matrix instead).
    # Row j lists [probability, successor] pairs for state j — e.g. the row
    # [[0.5, 1], [0.5, 5]] for Class 1 means:
    #   [0.5, 1] -> move to Class 2 with probability 0.5
    #   [0.5, 5] -> move to Facebook with probability 0.5
    transitions = [[[0.5, 1], [0.5, 5]],
                   [[0.8, 2], [0.2, 6]],
                   [[0.6, 3], [0.4, 4]],
                   [[1, 6]],
                   [[0.2, 0], [0.4, 1], [0.4, 2]],
                   [[0.1, 0], [0.9, 5]],
                   [[0, 0]]]

    # Run a fixed number of sweeps, feeding each result back in as the
    # previous estimate.
    for _ in range(100):
        values = next_v(gamma, rewards, values, transitions)

    print(values)
```

貝爾曼方程

```python
    import numpy as np
 
 # 通過MRPs的Bellman等式求解值函數
# γ
_lambda = 1
 
# 狀态轉移機率矩陣
p = [[0, 0.5, 0, 0, 0, 0.5, 0],
     [0, 0, 0.8, 0, 0, 0, 0.2],
     [0, 0, 0, 0.6, 0.4, 0, 0],
     [0, 0, 0, 0, 0, 0, 1],
     [0.2, 0.4, 0.4, 0, 0, 0, 0],
     [0.1, 0, 0, 0, 0, 0.9, 0],
     [0, 0, 0, 0, 0, 0, 0]]
# 報酬矩陣
r = [[-2],
     [-2],
     [-2],
     [10],
     [1],
     [-1],
     [0]]
# 機關矩陣
i = [[1, 0, 0, 0, 0, 0, 0],
     [0, 1, 0, 0, 0, 0, 0],
     [0, 0, 1, 0, 0, 0, 0],
     [0, 0, 0, 1, 0, 0, 0],
     [0, 0, 0, 0, 1, 0, 0],
     [0, 0, 0, 0, 0, 1, 0],
     [0, 0, 0, 0, 0, 0, 1]]
 
p_mat = np.matrix(p)
r_mat = np.matrix(r)
i_mat = np.matrix(i)
 
# Bellman等式的矩陣形式
v_mat = (i_mat - _lambda * p_mat).I * r_mat
# v_mat = np.dot(np.linalg.inv(i_mat - p_mat), r_mat)
 
print(v_mat)
```

MDP

"""這是MDP疊代過程 馬爾可夫決策過程"""
 
def next_v(pi, r_list, old_v_list, weight_list):
    """One backup of the MDP state-value function under a uniform policy.

    pi: probability the policy assigns to each available action (0.5 = 50/50).
    r_list: reward obtained for taking each action (indexed by action id).
    old_v_list: value estimates from the previous sweep; the LAST entry is
        assumed to be the terminal state.
    weight_list: per-state action lists. Each action is either
        [next_state, reward_index] for a deterministic transition, or
        [[[state, prob], ...], reward_index] for a stochastic one.
    Returns the new value list, with the terminal state pinned to 0.0.
    """
    new_v_list = []
    # Back up every state except the last (terminal) one.
    for j in range(len(old_v_list) - 1):
        j_sum = 0.0
        for target, reward_idx in weight_list[j]:
            # isinstance is the idiomatic type check (was: type(...) is list).
            if isinstance(target, list):
                # Stochastic transition: expected value over successor states.
                follow_on = sum(old_v_list[s] * prob for s, prob in target)
            else:
                # Deterministic transition straight to state `target`.
                follow_on = old_v_list[target]
            j_sum += pi * (r_list[reward_idx] + follow_on)
        new_v_list.append(j_sum)

    # The terminal state has no actions, so its value is fixed at 0.
    new_v_list.append(0.0)
    return new_v_list
 
if __name__ == '__main__':
    # Probability the (uniform) policy assigns to each of the two actions.
    policy_prob = 0.5

    # Reward order: study, pass, pub, facebook, quit, sleep (indices 0-5).
    rewards = [-2., 10., 1., -1., 0., 0.]

    # Value function over the five states, initialised to zero.
    values = [0, 0, 0, 0, 0]

    # Per-state action list. Each action is [next_state, reward_index], or
    # [[[state, prob], ...], reward_index] when the transition is stochastic.
    actions = [[[1, 0], [3, 3]],
               [[2, 0], [4, 5]],
               [[[[0, 0.2], [1, 0.4], [2, 0.4]], 2], [4, 1]],
               [[0, 4], [3, 3]],
               []]

    # Iterate the backup a fixed number of times, reusing each result as the
    # previous estimate.
    for _ in range(100):
        values = next_v(policy_prob, rewards, values, actions)

    print(values)
```

貝爾曼方程

```python
    import numpy as np
 
 
# π、γ
_pi = 0.5
_lambda = 1
 
p = [[0, _pi, 0, _pi, 0],
     [0, 0, _pi, 0, _pi],
     [_pi*0.2, _pi*0.4, _pi*0.4, 0, _pi],
     [_pi, 0, 0, _pi, 0],
     [0, 0, 0, 0, 0]]
r = [[_pi*-2 + _pi*-1],
     [_pi*-2 + _pi*0],
     [_pi*1 + _pi*10],
     [_pi*0 + _pi*-1],
     [0]]
i = [[1, 0, 0, 0, 0],
     [0, 1, 0, 0, 0],
     [0, 0, 1, 0, 0],
     [0, 0, 0, 1, 0],
     [0, 0, 0, 0, 1]]
 
p_mat = np.matrix(p)
r_mat = np.matrix(r)
i_mat = np.matrix(i)
 
v_mat = (i_mat - _lambda * p_mat).I * r_mat
# v_mat = np.dot(np.linalg.inv(i_mat - p_mat), r_mat)
 
print(v_mat)
```

繼續閱讀