Gymnasium Deepworlds #101

Draft: wants to merge 2 commits into base: dev
Changed file (path not shown)
@@ -20,7 +20,7 @@ def run():

# Run outer loop until the episodes limit is reached or the task is solved
while not solved and episode_count < episode_limit:
state = env.reset() # Reset robot and get starting observation
state, _ = env.reset() # Reset robot and get starting observation
env.episode_score = 0
action_probs = [] # This list holds the probability of each chosen action

@@ -33,14 +33,14 @@ def run():

# Step the supervisor to get the current selected_action reward, the new state and whether we reached the
# done condition
new_state, reward, done, info = env.step([selected_action])
new_state, reward, terminated, truncated, info = env.step([selected_action])

# Save the current state transition in agent's memory
trans = Transition(state, selected_action, action_prob, reward, new_state)
agent.store_transition(trans)

env.episode_score += reward # Accumulate episode reward
if done:
if terminated or truncated:
# Save the episode's score
env.episode_score_list.append(env.episode_score)
agent.train_step(batch_size=step + 1)
@@ -74,10 +74,10 @@ def run():
env.episode_score = 0
while True:
selected_action, action_prob = agent.work(state, type_="selectActionMax")
state, reward, done, _ = env.step([selected_action])
state, reward, terminated, truncated, _ = env.step([selected_action])
env.episode_score += reward # Accumulate episode reward

if done:
if terminated or truncated:
print("Reward accumulated =", env.episode_score)
env.episode_score = 0
state = env.reset()
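The changes in this controller track the Gymnasium API: `env.reset()` now returns an `(observation, info)` pair, and `env.step()` returns five values, with the old `done` flag split into `terminated` and `truncated`. A minimal sketch of that call pattern against a standalone Gymnasium environment (the environment id and the random policy are illustrative, not part of this PR):

```python
import gymnasium as gym

env = gym.make("CartPole-v1")
state, info = env.reset(seed=0)      # reset() returns (observation, info)

episode_score = 0.0
while True:
    action = env.action_space.sample()   # placeholder policy
    # step() returns five values: `terminated` marks a natural end of the
    # episode, `truncated` an external cut-off such as a time limit.
    state, reward, terminated, truncated, info = env.step(action)
    episode_score += reward
    if terminated or truncated:
        break

print("Reward accumulated =", episode_score)
env.close()
```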
Changed file (path not shown)
@@ -1,7 +1,7 @@
from deepbots.supervisor import RobotSupervisorEnv
from utilities import normalize_to_range

from gym.spaces import Box, Discrete
from gymnasium.spaces import Box, Discrete
import numpy as np


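Only the import source changes here; `Box` and `Discrete` keep the same constructors under `gymnasium.spaces`. A hedged sketch of how a CartPole-style supervisor typically declares its spaces with them (the bounds and sizes are illustrative, not copied from this file):

```python
import numpy as np
from gymnasium.spaces import Box, Discrete

# Four continuous observations (cart position/velocity, pole angle/velocity)
# and two discrete actions (push left, push right); bounds are illustrative.
observation_space = Box(
    low=np.array([-0.4, -np.inf, -1.3, -np.inf], dtype=np.float32),
    high=np.array([0.4, np.inf, 1.3, np.inf], dtype=np.float32),
    dtype=np.float32,
)
action_space = Discrete(2)
```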
@@ -109,7 +109,7 @@ def get_reward(self, action):
"""
return 1

def is_done(self):
def is_terminated(self):
"""
An episode is done if the score is over 195.0, or if the pole is off balance, or the cart position is on the
arena's edges.
@@ -130,6 +130,9 @@ def is_done(self):

return False

def is_truncated(self):
return False

def solved(self):
"""
This method checks whether the CartPole task is solved, so training terminates.
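The rename from `is_done` to `is_terminated`, together with the `is_truncated` stub above, mirrors Gymnasium's split between natural episode endings and external cut-offs. The PR's `is_truncated` always returns False; a hedged sketch of how a step-budget truncation could look if one were wanted later (the class and attribute names are assumed, not taken from this file):

```python
class TruncationExample:
    """Illustrative only; not the deepworlds supervisor class."""

    def __init__(self, max_episode_steps=200):
        self.episode_steps = 0
        self.max_episode_steps = max_episode_steps

    def is_terminated(self):
        # Natural end: the task was solved or failed (pole fell, cart left the track).
        return False  # placeholder for the task-specific checks

    def is_truncated(self):
        # Artificial end: the episode exhausted an external step budget.
        return self.episode_steps >= self.max_episode_steps
```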
Changed file (path not shown)
@@ -1,5 +1,5 @@
from numpy import convolve, ones, mean
import gym
import gymnasium as gym

from robot_supervisor import CartPoleRobotSupervisor
from utilities import plot_data
@@ -13,7 +13,7 @@ def run():
env = CartPoleRobotSupervisor()

# Verify that the environment is working as a gym-style env
#check_env(env)
check_env(env)

# Use the PPO algorithm from the stable baselines having MLP, verbose=1 output the training information
model = PPO("MlpPolicy", env, verbose=1)
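This runner re-enables `check_env`, which validates that the environment exposes the observation/action spaces and reset/step signatures Stable-Baselines3 expects, before handing it to PPO. A minimal sketch of the same pattern with a plain Gymnasium environment (the environment id and timestep budget are illustrative):

```python
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env

env = gym.make("CartPole-v1")

# Warns or raises if the env deviates from the Gymnasium interface SB3 expects.
check_env(env)

model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=10_000)   # illustrative training budget
```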
Changed file (path not shown)
@@ -1,7 +1,7 @@
from deepbots.supervisor import RobotSupervisorEnv
from utilities import normalize_to_range

from gym.spaces import Box, Discrete
from gymnasium.spaces import Box, Discrete
import numpy as np


@@ -109,9 +109,9 @@ def get_reward(self, action):
"""
return 1

def is_done(self):
def is_terminated(self):
"""
An episode is done if the score is over 195.0, or if the pole is off balance, or the cart position is on the
An episode is terminated if the score is over 195.0, or if the pole is off balance, or the cart position is on the
arena's edges.

:return: True if termination conditions are met, False otherwise
@@ -130,6 +130,9 @@ def is_done(self):

return False

def is_truncated(self):
return False

def solved(self):
"""
This method checks whether the CartPole task is solved, so training terminates.
Changed file (path not shown)
@@ -24,7 +24,7 @@ def run():

# Run outer loop until the episodes limit is reached or the task is solved
while not solved and episode_count < episod_limit:
state = supervisor_env.reset() # Reset robot and get starting observation
state, _ = supervisor_env.reset() # Reset robot and get starting observation
supervisor_pre.episodeScore = 0
action_probs = [] # This list holds the probability of each chosen action

@@ -37,14 +37,14 @@ def run():

# Step the supervisor to get the current selected_action reward, the new state and whether we reached the
# done condition
new_state, reward, done, info = supervisor_env.step([selected_action])
new_state, reward, terminated, truncated, info = supervisor_env.step([selected_action])

# Save the current state transition in agent's memory
trans = Transition(state, selected_action, action_prob, reward, new_state)
agent.store_transition(trans)

supervisor_pre.episodeScore += reward # Accumulate episode reward
if done:
if terminated or truncated:
# Save the episode's score
supervisor_pre.episode_score_list.append(supervisor_pre.episodeScore)
agent.train_step(batch_size=step + 1)
@@ -85,10 +85,10 @@ def run():
supervisor_pre.episodeScore = 0
while True:
selected_action, action_prob = agent.work(state, type_="selectActionMax")
state, reward, done, _ = supervisor_env.step([selected_action])
state, reward, terminated, truncated, _ = supervisor_env.step([selected_action])
supervisor_pre.episodeScore += reward # Accumulate episode reward

if done:
if terminated or truncated:
print("Reward accumulated =", supervisor_pre.episodeScore)
supervisor_pre.episodeScore = 0
state = supervisor_env.reset()
Changed file (path not shown)
@@ -21,7 +21,7 @@ def step(self, action):

"R" invokes the environment's reset method resetting the simulation to its initial state.
"""
observation, reward, is_done, info = self.controller.step(action)
observation, reward, terminated, truncated, info = self.controller.step(action)
key = self.keyboard.getKey()

if key == ord("T") and not self.controller.test:
@@ -31,4 +31,4 @@ def step(self, action):
print("User invoked reset method.")
self.controller.reset()

return observation, reward, is_done, info
return observation, reward, terminated, truncated, info
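Since the wrapped controller now returns the five-element Gymnasium tuple, this keyboard wrapper simply unpacks and forwards all five values. A hedged sketch of the same passthrough idea using `gymnasium.Wrapper` (this class is illustrative; the controller above does not subclass it):

```python
import gymnasium as gym

class KeyboardPassthrough(gym.Wrapper):
    """Illustrative wrapper: add side effects while forwarding the 5-tuple unchanged."""

    def step(self, action):
        observation, reward, terminated, truncated, info = self.env.step(action)
        # ...poll the keyboard and trigger test/reset behaviour here...
        return observation, reward, terminated, truncated, info
```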
Changed file (path not shown)
@@ -109,7 +109,7 @@ def get_reward(self, action=None):
"""
return 1

def is_done(self):
def is_terminated(self):
"""
An episode is done if the score is over 195.0, or if the pole is off balance, or the cart position is on the
arena's edges.
@@ -134,6 +134,9 @@ def is_done(self):

return False

def is_truncated(self):
return False

def get_info(self):
"""
Dummy implementation of get_info.
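This last supervisor keeps a dummy `get_info`; under the Gymnasium API, whatever it returns becomes the info dictionary surfaced by `reset()` and `step()`. A hedged sketch of a slightly more informative version (the key and attribute are illustrative, not part of this PR):

```python
def get_info(self):
    # Illustrative only: surface a diagnostic value instead of returning nothing.
    return {"episode_score": self.episode_score}
```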