while not is_terminal(state):
    action = select_action(state)  # select an action with the current policy
    next_state, reward = take_action(state, action)
    episode.append((state, action, reward))
    state = next_state

# Compute the discounted cumulative return
G = 0
for state, action, reward in reversed(episode):
    G = reward + discount_factor * G
    samples.append((state, action, G))

# Train the neural network on the collected samples
train_neural_network(samples)

# Update the policy
update_policy()

# Check the convergence condition
if check_convergence():
    break
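To make the return computation above concrete, here is a minimal runnable Python sketch of the backward pass that turns one finished episode into (state, action, return) training targets. The episode contents and the discount_factor value are toy assumptions for illustration only, not taken from the original.

discount_factor = 0.99

# One finished episode as (state, action, reward) triples; toy values.
episode = [
    ("s0", "a0", 0.0),
    ("s1", "a1", 0.0),
    ("s2", "a2", 1.0),  # terminal reward
]

samples = []
G = 0.0
# Walk the episode backwards so each step's G accumulates all discounted future rewards.
for state, action, reward in reversed(episode):
    G = reward + discount_factor * G
    samples.append((state, action, G))

# samples now holds the Monte Carlo regression targets that a value/Q network could fit.
for state, action, G in samples:
    print(f"{state}, {action}: G = {G:.4f}")

Walking the episode backwards assigns each (state, action) pair its full discounted return in a single pass; these pairs are exactly what train_neural_network(samples) is then expected to regress on.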
Counterfactual Regret Minimization (CFR)
# Initialize the strategy and cumulative regret values
initialize_strategy_and_regret()

for iteration in range(num_iterations):
    # Update the strategy from the accumulated regrets
    update_strategy()

    # Simulate games under the current strategy and update regrets
    for game in range(num_games):
        play_game_and_update_regret()

    # Check the convergence condition
    if check_convergence():
        break
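The pseudocode above leaves play_game_and_update_regret() abstract. Below is a minimal self-contained sketch of the regret-matching update that CFR iterates, applied here to rock-paper-scissors against a fixed opponent mix rather than a full extensive-form game tree; every name and parameter in it is an illustrative assumption, not part of the original.

import random

ACTIONS = ["rock", "paper", "scissors"]
NUM_ACTIONS = len(ACTIONS)

def utility(a, b):
    """Payoff for playing action index a against action index b."""
    if a == b:
        return 0
    # rock(0) beats scissors(2), paper(1) beats rock(0), scissors(2) beats paper(1)
    return 1 if (a - b) % 3 == 1 else -1

def strategy_from_regret(regret_sum):
    """Regret matching: play in proportion to positive cumulative regret."""
    positive = [max(r, 0.0) for r in regret_sum]
    total = sum(positive)
    if total > 0:
        return [p / total for p in positive]
    return [1.0 / NUM_ACTIONS] * NUM_ACTIONS

def train(num_iterations=100_000, opponent_strategy=(0.4, 0.3, 0.3)):
    regret_sum = [0.0] * NUM_ACTIONS
    strategy_sum = [0.0] * NUM_ACTIONS
    for _ in range(num_iterations):
        strategy = strategy_from_regret(regret_sum)
        for a, p in enumerate(strategy):
            strategy_sum[a] += p
        my_action = random.choices(range(NUM_ACTIONS), weights=strategy)[0]
        opp_action = random.choices(range(NUM_ACTIONS), weights=opponent_strategy)[0]
        # Accumulate the regret of not having played each alternative action.
        for a in range(NUM_ACTIONS):
            regret_sum[a] += utility(a, opp_action) - utility(my_action, opp_action)
    total = sum(strategy_sum)
    return [s / total for s in strategy_sum]  # the average strategy is what converges

if __name__ == "__main__":
    avg = train()
    print({name: round(p, 3) for name, p in zip(ACTIONS, avg)})

Note that it is the average strategy, not the last iterate, that converges: against the biased opponent above it concentrates on paper (the best response), and when both players run the same update in self-play the average regret is driven toward zero.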
6. Summary
Deep Monte Carlo (DMC) and Counterfactual Regret Minimization (CFR) each have their own strengths and application scenarios: