# FVMC: reduce Q_track_mc with a max over axis 2 and plot the result against
# optimal_V, first on a linear scale and then on a log scale.
for plot_title, use_log_scale in (
    ('FVMC estimates through time vs. true values', False),
    ('FVMC estimates through time vs. true values (log scale)', True),
):
    plot_value_function(
        plot_title,
        np.max(Q_track_mc, axis=2),
        optimal_V,
        limit_items=limit_items,
        limit_value=limit_value,
        log=use_log_scale,
    )

# Close-up: only the first cu_episodes entries along the leading axis, with
# None passed in place of optimal_V and the cu_* limits instead of the
# regular ones.
plot_value_function(
    'FVMC estimates through time (close up)',
    np.max(Q_track_mc, axis=2)[:cu_episodes],
    None,
    limit_items=cu_limit_items,
    limit_value=cu_limit_value,
    log=False,
)
# SARSA
# Sarsa: reduce Q_track_sarsa with a max over axis 2 and plot the result
# against optimal_V, first on a linear scale and then on a log scale.
for plot_title, use_log_scale in (
    ('Sarsa estimates through time vs. true values', False),
    ('Sarsa estimates through time vs. true values (log scale)', True),
):
    plot_value_function(
        plot_title,
        np.max(Q_track_sarsa, axis=2),
        optimal_V,
        limit_items=limit_items,
        limit_value=limit_value,
        log=use_log_scale,
    )

# Close-up: only the first cu_episodes entries along the leading axis, with
# None passed in place of optimal_V and the cu_* limits instead of the
# regular ones.
plot_value_function(
    'Sarsa estimates through time (close up)',
    np.max(Q_track_sarsa, axis=2)[:cu_episodes],
    None,
    limit_items=cu_limit_items,
    limit_value=cu_limit_value,
    log=False,
)
# Q-learning (Q학습)
# Q-Learning: reduce Q_track_ql with a max over axis 2 and plot the result
# against optimal_V, first on a linear scale and then on a log scale.
for plot_title, use_log_scale in (
    ('Q-Learning estimates through time vs. true values', False),
    ('Q-Learning estimates through time vs. true values (log scale)', True),
):
    plot_value_function(
        plot_title,
        np.max(Q_track_ql, axis=2),
        optimal_V,
        limit_items=limit_items,
        limit_value=limit_value,
        log=use_log_scale,
    )

# Close-up: only the first cu_episodes entries along the leading axis, with
# None passed in place of optimal_V and the cu_* limits instead of the
# regular ones.
plot_value_function(
    'Q-Learning estimates through time (close up)',
    np.max(Q_track_ql, axis=2)[:cu_episodes],
    None,
    limit_items=cu_limit_items,
    limit_value=cu_limit_value,
    log=False,
)
# Double Q-learning (이중 Q학습)
# Double Q-Learning: reduce Q_track_dql with a max over axis 2 and plot the
# result against optimal_V, first on a linear scale and then on a log scale.
for plot_title, use_log_scale in (
    ('Double Q-Learning estimates through time vs. true values', False),
    ('Double Q-Learning estimates through time vs. true values (log scale)', True),
):
    plot_value_function(
        plot_title,
        np.max(Q_track_dql, axis=2),
        optimal_V,
        limit_items=limit_items,
        limit_value=limit_value,
        log=use_log_scale,
    )

# Close-up: only the first cu_episodes entries along the leading axis, with
# None passed in place of optimal_V and the cu_* limits instead of the
# regular ones.
plot_value_function(
    'Double Q-Learning estimates through time (close up)',
    np.max(Q_track_dql, axis=2)[:cu_episodes],
    None,
    limit_items=cu_limit_items,
    limit_value=cu_limit_value,
    log=False,
)
# FVMC: reduce Q_track_mc with a max over axis 2 and plot the result against
# optimal_V, first on a linear scale and then on a log scale.
for plot_title, use_log_scale in (
    ('FVMC estimates through time vs. true values', False),
    ('FVMC estimates through time vs. true values (log scale)', True),
):
    plot_value_function(
        plot_title,
        np.max(Q_track_mc, axis=2),
        optimal_V,
        limit_items=limit_items,
        limit_value=limit_value,
        log=use_log_scale,
    )

# Close-up: only the first cu_episodes entries along the leading axis, with
# None passed in place of optimal_V and the cu_* limits instead of the
# regular ones.
plot_value_function(
    'FVMC estimates through time (close up)',
    np.max(Q_track_mc, axis=2)[:cu_episodes],
    None,
    limit_items=cu_limit_items,
    limit_value=cu_limit_value,
    log=False,
)
# SARSA
# Sarsa: reduce Q_track_sarsa with a max over axis 2 and plot the result
# against optimal_V, first on a linear scale and then on a log scale.
for plot_title, use_log_scale in (
    ('Sarsa estimates through time vs. true values', False),
    ('Sarsa estimates through time vs. true values (log scale)', True),
):
    plot_value_function(
        plot_title,
        np.max(Q_track_sarsa, axis=2),
        optimal_V,
        limit_items=limit_items,
        limit_value=limit_value,
        log=use_log_scale,
    )

# Close-up: only the first cu_episodes entries along the leading axis, with
# None passed in place of optimal_V and the cu_* limits instead of the
# regular ones.
plot_value_function(
    'Sarsa estimates through time (close up)',
    np.max(Q_track_sarsa, axis=2)[:cu_episodes],
    None,
    limit_items=cu_limit_items,
    limit_value=cu_limit_value,
    log=False,
)
# Q-learning (Q학습)
# Q-Learning: reduce Q_track_ql with a max over axis 2 and plot the result
# against optimal_V, first on a linear scale and then on a log scale.
for plot_title, use_log_scale in (
    ('Q-Learning estimates through time vs. true values', False),
    ('Q-Learning estimates through time vs. true values (log scale)', True),
):
    plot_value_function(
        plot_title,
        np.max(Q_track_ql, axis=2),
        optimal_V,
        limit_items=limit_items,
        limit_value=limit_value,
        log=use_log_scale,
    )

# Close-up: only the first cu_episodes entries along the leading axis, with
# None passed in place of optimal_V and the cu_* limits instead of the
# regular ones.
plot_value_function(
    'Q-Learning estimates through time (close up)',
    np.max(Q_track_ql, axis=2)[:cu_episodes],
    None,
    limit_items=cu_limit_items,
    limit_value=cu_limit_value,
    log=False,
)
# Double Q-learning (이중 Q학습)
# Double Q-Learning: reduce Q_track_dql with a max over axis 2 and plot the
# result against optimal_V, first on a linear scale and then on a log scale.
for plot_title, use_log_scale in (
    ('Double Q-Learning estimates through time vs. true values', False),
    ('Double Q-Learning estimates through time vs. true values (log scale)', True),
):
    plot_value_function(
        plot_title,
        np.max(Q_track_dql, axis=2),
        optimal_V,
        limit_items=limit_items,
        limit_value=limit_value,
        log=use_log_scale,
    )

# Close-up: only the first cu_episodes entries along the leading axis, with
# None passed in place of optimal_V and the cu_* limits instead of the
# regular ones.
plot_value_function(
    'Double Q-Learning estimates through time (close up)',
    np.max(Q_track_dql, axis=2)[:cu_episodes],
    None,
    limit_items=cu_limit_items,
    limit_value=cu_limit_value,
    log=False,
)
# FVMC: reduce Q_track_mc with a max over axis 2 and plot the result against
# optimal_V, first on a linear scale and then on a log scale.
for plot_title, use_log_scale in (
    ('FVMC estimates through time vs. true values', False),
    ('FVMC estimates through time vs. true values (log scale)', True),
):
    plot_value_function(
        plot_title,
        np.max(Q_track_mc, axis=2),
        optimal_V,
        limit_items=limit_items,
        limit_value=limit_value,
        log=use_log_scale,
    )

# Close-up: only the first cu_episodes entries along the leading axis, with
# None passed in place of optimal_V and the cu_* limits instead of the
# regular ones.
plot_value_function(
    'FVMC estimates through time (close up)',
    np.max(Q_track_mc, axis=2)[:cu_episodes],
    None,
    limit_items=cu_limit_items,
    limit_value=cu_limit_value,
    log=False,
)
# SARSA
# Sarsa: reduce Q_track_sarsa with a max over axis 2 and plot the result
# against optimal_V, first on a linear scale and then on a log scale.
for plot_title, use_log_scale in (
    ('Sarsa estimates through time vs. true values', False),
    ('Sarsa estimates through time vs. true values (log scale)', True),
):
    plot_value_function(
        plot_title,
        np.max(Q_track_sarsa, axis=2),
        optimal_V,
        limit_items=limit_items,
        limit_value=limit_value,
        log=use_log_scale,
    )

# Close-up: only the first cu_episodes entries along the leading axis, with
# None passed in place of optimal_V and the cu_* limits instead of the
# regular ones.
plot_value_function(
    'Sarsa estimates through time (close up)',
    np.max(Q_track_sarsa, axis=2)[:cu_episodes],
    None,
    limit_items=cu_limit_items,
    limit_value=cu_limit_value,
    log=False,
)
# Q-learning (Q학습)
# Q-Learning: reduce Q_track_ql with a max over axis 2 and plot the result
# against optimal_V, first on a linear scale and then on a log scale.
for plot_title, use_log_scale in (
    ('Q-Learning estimates through time vs. true values', False),
    ('Q-Learning estimates through time vs. true values (log scale)', True),
):
    plot_value_function(
        plot_title,
        np.max(Q_track_ql, axis=2),
        optimal_V,
        limit_items=limit_items,
        limit_value=limit_value,
        log=use_log_scale,
    )

# Close-up: only the first cu_episodes entries along the leading axis, with
# None passed in place of optimal_V and the cu_* limits instead of the
# regular ones.
plot_value_function(
    'Q-Learning estimates through time (close up)',
    np.max(Q_track_ql, axis=2)[:cu_episodes],
    None,
    limit_items=cu_limit_items,
    limit_value=cu_limit_value,
    log=False,
)
# Double Q-learning (이중 Q학습)
# Double Q-Learning: reduce Q_track_dql with a max over axis 2 and plot the
# result against optimal_V, first on a linear scale and then on a log scale.
for plot_title, use_log_scale in (
    ('Double Q-Learning estimates through time vs. true values', False),
    ('Double Q-Learning estimates through time vs. true values (log scale)', True),
):
    plot_value_function(
        plot_title,
        np.max(Q_track_dql, axis=2),
        optimal_V,
        limit_items=limit_items,
        limit_value=limit_value,
        log=use_log_scale,
    )

# Close-up: only the first cu_episodes entries along the leading axis, with
# None passed in place of optimal_V and the cu_* limits instead of the
# regular ones.
plot_value_function(
    'Double Q-Learning estimates through time (close up)',
    np.max(Q_track_dql, axis=2)[:cu_episodes],
    None,
    limit_items=cu_limit_items,
    limit_value=cu_limit_value,
    log=False,
)