Q-Learning決策過程
Q-learning 小例子
-o---T
# T 就是寶藏的位置, o 是探索者的位置
每一次移動後,狀態發生改變,並由環境給出相應的回報
def get_env_feedback(S, A, n_states=None):
    """Apply action ``A`` in state ``S`` and return ``(next_state, reward)``.

    The environment is a 1-D corridor of ``n_states`` cells; the treasure
    sits in the rightmost cell, so moving right from position
    ``n_states - 2`` reaches it and ends the episode.

    Args:
        S: Current state, an integer position in ``[0, n_states - 2]``.
        A: Action, either ``'right'`` or ``'left'`` (anything that is not
           ``'right'`` is treated as a left move).
        n_states: Corridor length. Defaults to the module-level
            ``N_STATES`` constant for backward compatibility.

    Returns:
        Tuple ``(S_, R)`` where ``S_`` is the next state (``'terminal'``
        when the treasure is reached) and ``R`` is the reward (1 only on
        reaching the treasure, otherwise 0).
    """
    if n_states is None:
        n_states = N_STATES  # fall back to the module-level constant
    if A == 'right':  # move right
        if S == n_states - 2:  # next cell is the treasure: terminate
            S_ = 'terminal'
            R = 1
        else:
            S_ = S + 1
            R = 0
    else:  # move left (never rewarded)
        R = 0
        if S == 0:
            S_ = S  # already at the left wall: stay put
        else:
            S_ = S - 1
    return S_, R
RL算法:選擇、更新
def rl():
    """Train a tabular Q-learning agent on the 1-D treasure-hunt task.

    Runs ``MAX_EPISODES`` episodes. In each step the agent picks an
    action via ``choose_action``, observes the environment's feedback,
    and nudges the Q-value toward the TD target
    ``R + GAMMA * max_a Q(S', a)`` (or just ``R`` at the terminal step).

    Returns:
        The learned Q-table (pandas DataFrame indexed by state, with one
        column per action).
    """
    q_table = build_q_table(N_STATES, ACTIONS)  # all-zero initial table
    for episode in range(MAX_EPISODES):
        step = 0
        state = 0  # every episode starts at the leftmost cell
        done = False
        update_env(state, episode, step)  # render the initial environment
        while not done:
            action = choose_action(state, q_table)
            next_state, reward = get_env_feedback(state, action)
            predicted = q_table.loc[state, action]  # current Q estimate
            if next_state == 'terminal':
                target = reward  # no future value beyond the terminal state
                done = True
            else:
                # Bootstrapped TD target: reward plus discounted best
                # action value of the successor state (positional lookup,
                # since non-terminal states are integer row positions).
                target = reward + GAMMA * q_table.iloc[next_state, :].max()
            # Move the estimate a fraction ALPHA toward the target.
            q_table.loc[state, action] += ALPHA * (target - predicted)
            state = next_state
            update_env(state, episode, step + 1)  # render after the move
            step += 1
    return q_table