class DiscountedCartPole(gym.Wrapper):
    """Cart-pole wrapper that replaces the reward with a failure penalty.

    Every step returns a reward of -1 when the observation lies outside the
    unwrapped environment's ``x_threshold`` / ``theta_threshold_radians``
    limits, and 0 otherwise. The observation, done flag and info value are
    passed through unchanged.

    NOTE(review): relies on the 4-tuple ``step`` return
    ``(obs, reward, done, info)`` of the wrapped environment.
    """

    def __init__(self, env):
        gym.Wrapper.__init__(self, env)

    def reset(self, **kwargs):
        """Reset the wrapped environment, forwarding all keyword arguments."""
        return self.env.reset(**kwargs)

    def _pole_fell(self, observation):
        """Return True if the observation is outside the failure thresholds.

        ``observation`` is unpacked as ``(x, x_dot, theta, theta_dot)``; only
        ``x`` and ``theta`` are checked.
        """
        x, _, theta, _ = observation
        limits = self.env.unwrapped
        return bool(
            abs(x) > limits.x_threshold
            or abs(theta) > limits.theta_threshold_radians
        )

    def step(self, a):
        """Step the wrapped environment and substitute the reward.

        Args:
            a: Action forwarded to the wrapped environment.

        Returns:
            ``(observation, reward, done, info)`` where ``reward`` is -1 if
            the thresholds were exceeded on this step and 0 otherwise.
        """
        o, _, d, info = self.env.step(a)
        r = -1 if self._pole_fell(o) else 0
        return o, r, d, info
class MCCartPole(gym.Wrapper):
    """Cart-pole wrapper that rewrites the reward of the terminal step.

    Non-terminal steps keep the wrapped environment's reward. On the step
    where ``done`` is true, the reward becomes 0 if the observation is
    outside the unwrapped environment's ``x_threshold`` /
    ``theta_threshold_radians`` limits (failure), and
    ``self.env._max_episode_steps`` otherwise (episode ended without the
    thresholds being exceeded).

    NOTE(review): relies on the 4-tuple ``step`` return
    ``(obs, reward, done, info)`` and on the wrapped environment exposing
    ``_max_episode_steps`` -- confirm against the installed gym version.
    """

    def __init__(self, env):
        gym.Wrapper.__init__(self, env)

    def reset(self, **kwargs):
        """Reset the wrapped environment, forwarding all keyword arguments."""
        return self.env.reset(**kwargs)

    def _pole_fell(self, observation):
        """Return True if the observation is outside the failure thresholds.

        ``observation`` is unpacked as ``(x, x_dot, theta, theta_dot)``; only
        ``x`` and ``theta`` are checked.
        """
        x, _, theta, _ = observation
        limits = self.env.unwrapped
        return bool(
            abs(x) > limits.x_threshold
            or abs(theta) > limits.theta_threshold_radians
        )

    def step(self, a):
        """Step the wrapped environment, adjusting the terminal reward.

        Args:
            a: Action forwarded to the wrapped environment.

        Returns:
            ``(observation, reward, done, info)``; ``reward`` is only
            replaced on the step where ``done`` is true.
        """
        o, r, d, info = self.env.step(a)
        if d:
            if self._pole_fell(o):
                r = 0  # done, in failure
            else:
                r = self.env._max_episode_steps  # done, but successfully
        return o, r, d, info
el 00:00:00, ep 0000, ts 000020, ar 10 020.0±000.0, 100 020.0±000.0, ex 100 0.5±0.0, ev 012.0±000.0
el 00:00:30, ep 0403, ts 030072, ar 10 207.1±092.1, 100 164.4±086.7, ex 100 0.3±0.0, ev 296.0±132.1
el 00:01:00, ep 0604, ts 075023, ar 10 374.2±095.5, 100 237.9±112.4, ex 100 0.3±0.0, ev 306.8±145.2
el 00:01:21, ep 0703, ts 107979, ar 10 373.6±129.8, 100 334.2±127.6, ex 100 0.3±0.0, ev 475.5±056.7
--> reached_goal_mean_reward ✓
Training complete.
Final evaluation score 500.00±0.00 in 64.53s training time, 86.24s wall-clock time.
el 00:00:00, ep 0000, ts 000019, ar 10 019.0±000.0, 100 019.0±000.0, ex 100 0.5±0.0, ev 010.0±000.0
el 00:00:30, ep 0485, ts 023911, ar 10 173.6±068.5, 100 110.1±075.4, ex 100 0.3±0.0, ev 239.3±128.7
el 00:01:00, ep 0694, ts 065145, ar 10 223.7±063.6, 100 197.9±093.0, ex 100 0.3±0.0, ev 301.2±134.1
el 00:01:30, ep 0856, ts 110977, ar 10 318.1±094.9, 100 319.8±123.0, ex 100 0.3±0.0, ev 447.4±103.9
el 00:02:00, ep 1058, ts 154511, ar 10 184.5±024.9, 100 301.9±124.2, ex 100 0.3±0.0, ev 385.2±138.5
el 00:02:29, ep 1196, ts 201289, ar 10 316.4±145.6, 100 389.7±115.9, ex 100 0.3±0.0, ev 475.4±058.3
--> reached_goal_mean_reward ✓
Training complete.
Final evaluation score 500.00±0.00 in 119.75s training time, 154.79s wall-clock time.
el 00:00:00, ep 0000, ts 000016, ar 10 016.0±000.0, 100 016.0±000.0, ex 100 0.7±0.0, ev 014.0±000.0
el 00:00:30, ep 0460, ts 023972, ar 10 197.6±096.7, 100 113.7±081.2, ex 100 0.3±0.1, ev 264.3±146.3
el 00:01:00, ep 0716, ts 064889, ar 10 384.3±131.7, 100 258.2±189.5, ex 100 0.3±0.0, ev 321.4±196.5
el 00:01:10, ep 0754, ts 082591, ar 10 494.7±015.9, 100 412.8±123.1, ex 100 0.3±0.0, ev 476.9±076.8
--> reached_goal_mean_reward ✓
Training complete.
Final evaluation score 500.00±0.00 in 56.44s training time, 75.48s wall-clock time.
el 00:00:00, ep 0000, ts 000041, ar 10 041.0±000.0, 100 041.0±000.0, ex 100 0.5±0.0, ev 011.0±000.0
el 00:00:30, ep 0397, ts 029474, ar 10 190.9±089.6, 100 144.0±073.6, ex 100 0.3±0.0, ev 289.7±125.7
el 00:01:00, ep 0601, ts 071386, ar 10 354.8±106.5, 100 264.6±129.0, ex 100 0.3±0.0, ev 426.7±091.1
el 00:01:30, ep 0805, ts 113800, ar 10 446.0±096.3, 100 263.2±146.0, ex 100 0.3±0.0, ev 320.8±162.1
el 00:01:43, ep 0853, ts 134368, ar 10 404.3±083.7, 100 398.4±108.8, ex 100 0.3±0.0, ev 475.0±063.1
--> reached_goal_mean_reward ✓
Training complete.
Final evaluation score 435.82±62.11 in 82.74s training time, 107.79s wall-clock time.
el 00:00:00, ep 0000, ts 000029, ar 10 029.0±000.0, 100 029.0±000.0, ex 100 0.6±0.0, ev 019.0±000.0
el 00:00:30, ep 0459, ts 026790, ar 10 179.2±058.6, 100 119.9±065.1, ex 100 0.3±0.0, ev 205.7±124.6
el 00:01:00, ep 0639, ts 069009, ar 10 398.4±119.3, 100 263.9±153.6, ex 100 0.2±0.0, ev 319.6±169.6
el 00:01:30, ep 0783, ts 117147, ar 10 431.5±096.4, 100 357.9±128.2, ex 100 0.2±0.0, ev 391.5±127.9
el 00:01:37, ep 0812, ts 128183, ar 10 379.3±118.5, 100 409.9±095.6, ex 100 0.2±0.0, ev 475.1±053.5
--> reached_goal_mean_reward ✓
Training complete.
Final evaluation score 500.00±0.00 in 78.07s training time, 102.37s wall-clock time.
# Plot the weighted entropy of a two-action categorical policy as the
# probability of action A sweeps from 0 to 1 (with p(B) = 1 - p(A)).
weight, probs, entropies = -0.001, [], []

# linspace fixes the number of points and both endpoints exactly, which a
# floating-point step in arange does not guarantee.
for p in np.linspace(0, 1, 101):
    probs.append(p)
    # Separate name for the tensor so the loop variable is not rebound.
    action_probs = torch.tensor([p, 1 - p], dtype=torch.float32)
    dist = torch.distributions.Categorical(probs=action_probs)
    entropies.append(weight * dist.entropy().item())

plt.plot(probs, entropies)
plt.xlabel('Probability of action A\np(B)=1-p(A)', labelpad=20)
plt.ylabel('Negative\nweighted\nentropy', labelpad=80, rotation=0)
plt.title('Entropy contribution to the loss function\n{}*entropy(π)'.format(weight), pad=30)
plt.show()
class FCV(nn.Module):
    """Fully connected network that maps a state to a single output value.

    Args:
        input_dim: Size of the input feature vector.
        hidden_dims: Sizes of the hidden layers; must contain at least one
            entry. The first entry is the output size of the input layer.
        activation_fc: Activation applied after the input layer and after
            every hidden layer (not after the output layer).
    """

    def __init__(self, input_dim, hidden_dims=(32, 32), activation_fc=F.relu):
        super().__init__()
        self.activation_fc = activation_fc

        self.input_layer = nn.Linear(input_dim, hidden_dims[0])

        # One linear layer per consecutive pair of hidden sizes; empty when
        # only a single hidden size is given.
        self.hidden_layers = nn.ModuleList()
        for in_dim, out_dim in zip(hidden_dims[:-1], hidden_dims[1:]):
            self.hidden_layers.append(nn.Linear(in_dim, out_dim))

        self.output_layer = nn.Linear(hidden_dims[-1], 1)

    def _format(self, state):
        """Return ``state`` as a tensor.

        Non-tensor input is converted to a float32 tensor and given a
        leading dimension of size 1; tensor input is returned unchanged.
        """
        x = state
        if not isinstance(x, torch.Tensor):
            x = torch.tensor(x, dtype=torch.float32)
            x = x.unsqueeze(0)
        return x

    def forward(self, state):
        """Run the network on ``state`` and return the output layer's result.

        The last dimension of the returned tensor has size 1.
        """
        x = self._format(state)
        x = self.activation_fc(self.input_layer(x))
        for hidden_layer in self.hidden_layers:
            x = self.activation_fc(hidden_layer(x))
        return self.output_layer(x)
el 00:00:00, ep 0000, ts 000015, ar 10 015.0±000.0, 100 015.0±000.0, ex 100 0.7±0.0, ev 022.0±000.0
el 00:00:30, ep 0272, ts 023994, ar 10 204.4±074.4, 100 152.8±083.0, ex 100 0.3±0.0, ev 384.5±108.0
el 00:01:00, ep 0392, ts 056416, ar 10 337.2±072.2, 100 281.3±126.7, ex 100 0.3±0.0, ev 426.8±109.9
el 00:01:30, ep 0475, ts 091685, ar 10 485.0±045.0, 100 412.3±115.2, ex 100 0.3±0.0, ev 469.8±070.5
el 00:01:36, ep 0490, ts 099102, ar 10 491.7±024.9, 100 433.8±106.0, ex 100 0.3±0.0, ev 475.6±064.9
--> reached_goal_mean_reward ✓
Training complete.
Final evaluation score 470.15±48.91 in 80.51s training time, 100.92s wall-clock time.
el 00:00:00, ep 0000, ts 000016, ar 10 016.0±000.0, 100 016.0±000.0, ex 100 0.5±0.0, ev 010.0±000.0
el 00:00:30, ep 0271, ts 023015, ar 10 224.7±140.5, 100 157.4±102.9, ex 100 0.3±0.0, ev 348.5±127.1
el 00:01:00, ep 0378, ts 053590, ar 10 375.6±109.5, 100 289.5±108.4, ex 100 0.3±0.0, ev 470.0±066.1
el 00:01:00, ep 0381, ts 054412, ar 10 364.8±130.6, 100 291.2±109.8, ex 100 0.3±0.0, ev 475.7±059.7
--> reached_goal_mean_reward ✓
Training complete.
Final evaluation score 491.57±29.58 in 49.83s training time, 65.84s wall-clock time.
el 00:00:00, ep 0000, ts 000012, ar 10 012.0±000.0, 100 012.0±000.0, ex 100 0.2±0.0, ev 009.0±000.0
el 00:00:30, ep 0255, ts 021235, ar 10 178.6±065.7, 100 145.2±081.3, ex 100 0.3±0.0, ev 392.6±116.4
el 00:01:00, ep 0367, ts 053476, ar 10 319.4±129.0, 100 298.6±132.4, ex 100 0.3±0.0, ev 456.4±076.7
el 00:01:10, ep 0398, ts 065780, ar 10 434.8±127.8, 100 352.9±128.5, ex 100 0.3±0.0, ev 475.2±052.5
--> reached_goal_mean_reward ✓
Training complete.
Final evaluation score 497.89±9.70 in 56.72s training time, 75.84s wall-clock time.
el 00:00:00, ep 0000, ts 000015, ar 10 015.0±000.0, 100 015.0±000.0, ex 100 0.3±0.0, ev 011.0±000.0
el 00:00:30, ep 0261, ts 023823, ar 10 235.1±082.2, 100 169.4±095.7, ex 100 0.3±0.0, ev 381.2±123.0
el 00:00:54, ep 0344, ts 049251, ar 10 359.2±111.2, 100 285.9±113.9, ex 100 0.3±0.0, ev 476.1±053.5
--> reached_goal_mean_reward ✓
Training complete.
Final evaluation score 499.79±2.09 in 44.73s training time, 59.04s wall-clock time.
el 00:00:00, ep 0000, ts 000019, ar 10 019.0±000.0, 100 019.0±000.0, ex 100 0.3±0.0, ev 009.0±000.0
el 00:00:30, ep 0252, ts 021122, ar 10 233.9±078.6, 100 151.6±082.6, ex 100 0.3±0.0, ev 393.2±115.6
el 00:01:00, ep 0357, ts 053059, ar 10 403.8±121.6, 100 305.5±119.6, ex 100 0.3±0.0, ev 470.1±062.5
el 00:01:03, ep 0366, ts 056699, ar 10 412.9±111.0, 100 321.4±123.6, ex 100 0.3±0.0, ev 476.0±056.4
--> reached_goal_mean_reward ✓
Training complete.
Final evaluation score 500.00±0.00 in 50.90s training time, 68.78s wall-clock time.
el 00:00:00, ep 0000, ts 000015, ar 10 015.0±000.0, 100 015.0±000.0, ex 100 0.7±0.0, ev 022.0±000.0
el 00:00:16, ep 0508, ts 103193, ar 10 477.5±071.2, 100 438.5±098.4, ex 100 0.2±0.0, ev 479.5±058.2
--> reached_goal_mean_reward ✓
Training complete.
Final evaluation score 500.00±0.00 in 113.05s training time, 16.92s wall-clock time.
el 00:00:00, ep 0000, ts 000016, ar 10 016.0±000.0, 100 016.0±000.0, ex 100 0.5±0.0, ev 010.0±000.0
el 00:00:17, ep 0627, ts 102319, ar 10 496.3±011.7, 100 448.3±086.1, ex 100 0.2±0.0, ev 482.0±043.2
--> reached_goal_mean_reward ✓
Training complete.
Final evaluation score 500.00±0.00 in 118.00s training time, 18.00s wall-clock time.
el 00:00:00, ep 0000, ts 000012, ar 10 012.0±000.0, 100 012.0±000.0, ex 100 0.2±0.0, ev 009.0±000.0
el 00:00:30, ep 0890, ts 190166, ar 10 500.0±000.0, 100 479.0±060.0, ex 100 0.2±0.0, ev 486.6±042.5
--> reached_goal_mean_reward ✓
Training complete.
Final evaluation score 500.00±0.00 in 202.03s training time, 30.05s wall-clock time.
el 00:00:00, ep 0000, ts 000015, ar 10 015.0±000.0, 100 015.0±000.0, ex 100 0.3±0.0, ev 011.0±000.0
el 00:00:30, ep 0825, ts 197972, ar 10 481.7±041.5, 100 447.7±089.5, ex 100 0.1±0.0, ev 459.5±075.7
el 00:00:35, ep 0916, ts 241390, ar 10 500.0±000.0, 100 477.4±063.6, ex 100 0.1±0.0, ev 475.9±068.2
--> reached_goal_mean_reward ✓
Training complete.
Final evaluation score 500.00±0.00 in 244.94s training time, 36.04s wall-clock time.
el 00:00:00, ep 0000, ts 000019, ar 10 019.0±000.0, 100 019.0±000.0, ex 100 0.3±0.0, ev 009.0±000.0
el 00:00:14, ep 0527, ts 072236, ar 10 500.0±000.0, 100 342.2±134.6, ex 100 0.3±0.0, ev 478.4±056.6
--> reached_goal_mean_reward ✓
Training complete.
Final evaluation score 500.00±0.00 in 88.76s training time, 14.38s wall-clock time.
el 00:00:00, ep 0000, ts 000015, ar 10 015.0±000.0, 100 015.0±000.0, ex 100 0.7±0.0, ev 022.0±000.0
el 00:00:30, ep 2707, ts 043577, ar 10 012.0±005.5, 100 012.1±003.6, ex 100 0.2±0.1, ev 009.3±001.2
el 00:01:00, ep 5386, ts 076157, ar 10 020.2±009.2, 100 020.6±011.2, ex 100 0.4±0.1, ev 022.5±019.8
el 00:01:30, ep 8070, ts 116692, ar 10 009.3±000.9, 100 009.4±000.9, ex 100 0.0±0.0, ev 009.4±000.7
el 00:01:49, ep 9997, ts 134864, ar 10 008.8±003.2, 100 009.4±001.3, ex 100 0.0±0.0, ev 009.3±001.2
--> reached_max_episodes ✕
Training complete.
Final evaluation score 9.32±0.73 in 834.32s training time, 109.81s wall-clock time.
el 00:00:00, ep 0000, ts 000016, ar 10 016.0±000.0, 100 016.0±000.0, ex 100 0.5±0.0, ev 010.0±000.0
el 00:00:21, ep 0673, ts 111604, ar 10 500.0±000.0, 100 445.1±101.0, ex 100 0.2±0.0, ev 477.6±054.9
--> reached_goal_mean_reward ✓
Training complete.
Final evaluation score 500.00±0.00 in 141.37s training time, 21.34s wall-clock time.
el 00:00:00, ep 0000, ts 000012, ar 10 012.0±000.0, 100 012.0±000.0, ex 100 0.2±0.0, ev 009.0±000.0
el 00:00:17, ep 0646, ts 089063, ar 10 431.9±162.1, 100 421.8±126.1, ex 100 0.3±0.0, ev 480.1±062.6
--> reached_goal_mean_reward ✓
Training complete.
Final evaluation score 500.00±0.00 in 118.79s training time, 17.82s wall-clock time.
el 00:00:00, ep 0000, ts 000015, ar 10 015.0±000.0, 100 015.0±000.0, ex 100 0.3±0.0, ev 011.0±000.0
el 00:00:20, ep 0589, ts 110357, ar 10 500.0±000.0, 100 461.9±078.2, ex 100 0.2±0.0, ev 492.5±036.0
--> reached_goal_mean_reward ✓
Training complete.
Final evaluation score 500.00±0.00 in 136.27s training time, 20.47s wall-clock time.
el 00:00:00, ep 0000, ts 000019, ar 10 019.0±000.0, 100 019.0±000.0, ex 100 0.3±0.0, ev 009.0±000.0
el 00:00:16, ep 0494, ts 075672, ar 10 409.1±093.6, 100 385.4±119.7, ex 100 0.3±0.0, ev 486.7±038.5
--> reached_goal_mean_reward ✓
Training complete.
Final evaluation score 447.38±64.04 in 104.10s training time, 16.17s wall-clock time.
el 00:00:00, ep 0000, ts 000014, ar 10 014.0±000.0, 100 014.0±000.0, ex 100 0.5±0.0, ev 010.0±000.0
el 00:00:30, ep 0635, ts 086300, ar 10 340.0±150.1, 100 268.4±170.6, ex 100 0.2±0.0, ev 393.2±155.1
el 00:01:00, ep 0957, ts 201778, ar 10 230.7±178.3, 100 442.2±132.3, ex 100 0.2±0.0, ev 391.9±157.5
el 00:01:20, ep 1108, ts 270963, ar 10 500.0±000.0, 100 488.8±068.8, ex 100 0.2±0.0, ev 476.1±069.7
--> reached_goal_mean_reward ✓
Training complete.
Final evaluation score 500.00±0.00 in 599.12s training time, 86.05s wall-clock time.
el 00:00:00, ep 0000, ts 000014, ar 10 014.0±000.0, 100 014.0±000.0, ex 100 0.4±0.0, ev 011.0±000.0
el 00:00:30, ep 0700, ts 089210, ar 10 462.8±074.7, 100 313.2±154.9, ex 100 0.2±0.0, ev 367.1±120.6
el 00:00:37, ep 0751, ts 114449, ar 10 500.0±000.0, 100 430.7±129.6, ex 100 0.2±0.0, ev 475.8±056.6
--> reached_goal_mean_reward ✓
Training complete.
Final evaluation score 500.00±0.00 in 274.62s training time, 43.15s wall-clock time.
el 00:00:00, ep 0000, ts 000009, ar 10 009.0±000.0, 100 009.0±000.0, ex 100 0.9±0.0, ev 010.0±000.0
el 00:00:30, ep 0645, ts 092189, ar 10 480.8±039.9, 100 358.5±169.7, ex 100 0.2±0.0, ev 463.5±087.1
el 00:00:32, ep 0662, ts 100689, ar 10 500.0±000.0, 100 384.3±167.2, ex 100 0.2±0.0, ev 475.9±071.8
--> reached_goal_mean_reward ✓
Training complete.
Final evaluation score 500.00±0.00 in 240.26s training time, 38.35s wall-clock time.
el 00:00:00, ep 0000, ts 000010, ar 10 010.0±000.0, 100 010.0±000.0, ex 100 0.1±0.0, ev 009.0±000.0
el 00:00:30, ep 0674, ts 093052, ar 10 488.7±033.9, 100 293.0±158.8, ex 100 0.2±0.0, ev 380.0±116.2
el 00:00:39, ep 0741, ts 125628, ar 10 500.0±000.0, 100 450.3±109.4, ex 100 0.2±0.0, ev 475.6±062.6
--> reached_goal_mean_reward ✓
Training complete.
Final evaluation score 500.00±0.00 in 295.73s training time, 45.91s wall-clock time.
el 00:00:00, ep 0000, ts 000012, ar 10 012.0±000.0, 100 012.0±000.0, ex 100 0.7±0.0, ev 026.0±000.0
el 00:00:30, ep 0614, ts 090673, ar 10 481.3±038.0, 100 365.9±173.1, ex 100 0.2±0.0, ev 454.3±081.1
el 00:00:35, ep 0649, ts 108173, ar 10 500.0±000.0, 100 452.3±121.3, ex 100 0.2±0.0, ev 475.9±064.4
--> reached_goal_mean_reward ✓
Training complete.
Final evaluation score 500.00±0.00 in 259.40s training time, 41.02s wall-clock time.