From 54bc942dcf0397149ac46be61eb17d67996fc310 Mon Sep 17 00:00:00 2001
From: KelvinYang0320
Date: Sat, 30 Sep 2023 22:11:51 +0800
Subject: [PATCH 1/2] Updated is_terminated, is_truncated, and check_env for
 gymnasium in cartpole_discrete_SB3

---
 .../PPO_runner.py       | 4 ++--
 .../robot_supervisor.py | 9 ++++++---
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/examples/cartpole/cartpole_discrete/controllers/robot_supervisor_manager_Stable_Baselines/PPO_runner.py b/examples/cartpole/cartpole_discrete/controllers/robot_supervisor_manager_Stable_Baselines/PPO_runner.py
index 736036cb..92428c90 100644
--- a/examples/cartpole/cartpole_discrete/controllers/robot_supervisor_manager_Stable_Baselines/PPO_runner.py
+++ b/examples/cartpole/cartpole_discrete/controllers/robot_supervisor_manager_Stable_Baselines/PPO_runner.py
@@ -1,5 +1,5 @@
 from numpy import convolve, ones, mean
-import gym
+import gymnasium as gym
 from robot_supervisor import CartPoleRobotSupervisor
 from utilities import plot_data
 
@@ -13,7 +13,7 @@ def run():
     env = CartPoleRobotSupervisor()
 
     # Verify that the environment is working as a gym-style env
-    #check_env(env)
+    check_env(env)
 
     # Use the PPO algorithm from the stable baselines having MLP, verbose=1 output the training information
     model = PPO("MlpPolicy", env, verbose=1)
diff --git a/examples/cartpole/cartpole_discrete/controllers/robot_supervisor_manager_Stable_Baselines/robot_supervisor.py b/examples/cartpole/cartpole_discrete/controllers/robot_supervisor_manager_Stable_Baselines/robot_supervisor.py
index 19c481be..1dd915b8 100644
--- a/examples/cartpole/cartpole_discrete/controllers/robot_supervisor_manager_Stable_Baselines/robot_supervisor.py
+++ b/examples/cartpole/cartpole_discrete/controllers/robot_supervisor_manager_Stable_Baselines/robot_supervisor.py
@@ -1,7 +1,7 @@
 from deepbots.supervisor import RobotSupervisorEnv
 from utilities import normalize_to_range
 
-from gym.spaces import Box, Discrete
+from gymnasium.spaces import Box, Discrete
 import numpy as np
 
 
@@ -109,9 +109,9 @@ def get_reward(self, action):
         """
         return 1
 
-    def is_done(self):
+    def is_terminated(self):
         """
-        An episode is done if the score is over 195.0, or if the pole is off balance, or the cart position is on the
+        An episode is terminated if the score is over 195.0, or if the pole is off balance, or the cart position is on the
         arena's edges.
 
         :return: True if termination conditions are met, False otherwise
@@ -130,6 +130,9 @@
 
         return False
 
+    def is_truncated(self):
+        return False
+
     def solved(self):
         """
         This method checks whether the CartPole task is solved, so training terminates.
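Reviewer note on the API being adopted above: under gymnasium, `reset()` returns an `(observation, info)` tuple and `step()` returns a five-tuple `(observation, reward, terminated, truncated, info)`, splitting the old `done` flag into a terminal-state signal (`terminated`) and an early-cutoff signal (`truncated`). Stable Baselines3's `check_env`, re-enabled in this patch, validates exactly this interface, which is why the call can be restored once the spaces come from `gymnasium.spaces`. A minimal sketch of the contract against a stock environment (`CartPole-v1` is only a stand-in for the Webots env here):

    import gymnasium as gym

    env = gym.make("CartPole-v1")
    obs, info = env.reset(seed=0)  # reset() now returns (observation, info)
    done = False
    while not done:
        action = env.action_space.sample()  # random policy, just to drive the loop
        obs, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated  # an episode ends on either flag
    env.close()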
From 6d854b591d10e8e0864a7efefdc38e5216dfdd81 Mon Sep 17 00:00:00 2001
From: KelvinYang0320
Date: Mon, 2 Oct 2023 21:24:45 +0800
Subject: [PATCH 2/2] gymnasium step and reset

---
 .../controllers/robot_supervisor_manager/PPO_runner.py | 12 ++++++------
 .../robot_supervisor_manager/robot_supervisor.py       |  7 +++++--
 .../controllers/supervisor_manager/PPO_runner.py       | 12 ++++++------
 .../supervisor_manager/keyboard_controller_cartpole.py |  4 ++--
 .../supervisor_manager/supervisor_controller.py        |  5 ++++-
 5 files changed, 23 insertions(+), 17 deletions(-)

diff --git a/examples/cartpole/cartpole_discrete/controllers/robot_supervisor_manager/PPO_runner.py b/examples/cartpole/cartpole_discrete/controllers/robot_supervisor_manager/PPO_runner.py
index 85093ce1..3664eb94 100644
--- a/examples/cartpole/cartpole_discrete/controllers/robot_supervisor_manager/PPO_runner.py
+++ b/examples/cartpole/cartpole_discrete/controllers/robot_supervisor_manager/PPO_runner.py
@@ -20,7 +20,7 @@ def run():
 
     # Run outer loop until the episodes limit is reached or the task is solved
     while not solved and episode_count < episode_limit:
-        state = env.reset()  # Reset robot and get starting observation
+        state, _ = env.reset()  # Reset robot and get starting observation
         env.episode_score = 0
 
         action_probs = []  # This list holds the probability of each chosen action
@@ -33,14 +33,14 @@ def run():
 
             # Step the supervisor to get the current selected_action reward, the new state and whether we reached the
            # done condition
-            new_state, reward, done, info = env.step([selected_action])
+            new_state, reward, terminated, truncated, info = env.step([selected_action])
 
             # Save the current state transition in agent's memory
             trans = Transition(state, selected_action, action_prob, reward, new_state)
             agent.store_transition(trans)
 
             env.episode_score += reward  # Accumulate episode reward
-            if done:
+            if terminated or truncated:
                 # Save the episode's score
                 env.episode_score_list.append(env.episode_score)
                 agent.train_step(batch_size=step + 1)
@@ -74,10 +74,10 @@ def run():
 
     env.episode_score = 0
     while True:
         selected_action, action_prob = agent.work(state, type_="selectActionMax")
-        state, reward, done, _ = env.step([selected_action])
+        state, reward, terminated, truncated, _ = env.step([selected_action])
         env.episode_score += reward  # Accumulate episode reward
-        if done:
+        if terminated or truncated:
             print("Reward accumulated =", env.episode_score)
             env.episode_score = 0
-            state = env.reset()
+            state, _ = env.reset()
diff --git a/examples/cartpole/cartpole_discrete/controllers/robot_supervisor_manager/robot_supervisor.py b/examples/cartpole/cartpole_discrete/controllers/robot_supervisor_manager/robot_supervisor.py
index 622b511a..24293395 100644
--- a/examples/cartpole/cartpole_discrete/controllers/robot_supervisor_manager/robot_supervisor.py
+++ b/examples/cartpole/cartpole_discrete/controllers/robot_supervisor_manager/robot_supervisor.py
@@ -1,7 +1,7 @@
 from deepbots.supervisor import RobotSupervisorEnv
 from utilities import normalize_to_range
 
-from gym.spaces import Box, Discrete
+from gymnasium.spaces import Box, Discrete
 import numpy as np
 
 
@@ -109,7 +109,7 @@ def get_reward(self, action):
         """
         return 1
 
-    def is_done(self):
+    def is_terminated(self):
         """
         An episode is done if the score is over 195.0, or if the pole is off balance, or the cart position is on the
         arena's edges.
@@ -130,6 +130,9 @@
 
         return False
 
+    def is_truncated(self):
+        return False
+
     def solved(self):
         """
         This method checks whether the CartPole task is solved, so training terminates.
diff --git a/examples/cartpole/cartpole_discrete/controllers/supervisor_manager/PPO_runner.py b/examples/cartpole/cartpole_discrete/controllers/supervisor_manager/PPO_runner.py
index ba041479..f258887b 100644
--- a/examples/cartpole/cartpole_discrete/controllers/supervisor_manager/PPO_runner.py
+++ b/examples/cartpole/cartpole_discrete/controllers/supervisor_manager/PPO_runner.py
@@ -24,7 +24,7 @@ def run():
 
     # Run outer loop until the episodes limit is reached or the task is solved
     while not solved and episode_count < episod_limit:
-        state = supervisor_env.reset()  # Reset robot and get starting observation
+        state, _ = supervisor_env.reset()  # Reset robot and get starting observation
         supervisor_pre.episodeScore = 0
 
         action_probs = []  # This list holds the probability of each chosen action
@@ -37,14 +37,14 @@ def run():
 
             # Step the supervisor to get the current selected_action reward, the new state and whether we reached the
             # done condition
-            new_state, reward, done, info = supervisor_env.step([selected_action])
+            new_state, reward, terminated, truncated, info = supervisor_env.step([selected_action])
 
             # Save the current state transition in agent's memory
             trans = Transition(state, selected_action, action_prob, reward, new_state)
             agent.store_transition(trans)
 
             supervisor_pre.episodeScore += reward  # Accumulate episode reward
-            if done:
+            if terminated or truncated:
                 # Save the episode's score
                 supervisor_pre.episode_score_list.append(supervisor_pre.episodeScore)
                 agent.train_step(batch_size=step + 1)
@@ -85,10 +85,10 @@ def run():
 
     supervisor_pre.episodeScore = 0
     while True:
         selected_action, action_prob = agent.work(state, type_="selectActionMax")
-        state, reward, done, _ = supervisor_env.step([selected_action])
+        state, reward, terminated, truncated, _ = supervisor_env.step([selected_action])
         supervisor_pre.episodeScore += reward  # Accumulate episode reward
-        if done:
+        if terminated or truncated:
             print("Reward accumulated =", supervisor_pre.episodeScore)
             supervisor_pre.episodeScore = 0
-            state = supervisor_env.reset()
+            state, _ = supervisor_env.reset()
diff --git a/examples/cartpole/cartpole_discrete/controllers/supervisor_manager/keyboard_controller_cartpole.py b/examples/cartpole/cartpole_discrete/controllers/supervisor_manager/keyboard_controller_cartpole.py
index e67dced6..5f66ac01 100644
--- a/examples/cartpole/cartpole_discrete/controllers/supervisor_manager/keyboard_controller_cartpole.py
+++ b/examples/cartpole/cartpole_discrete/controllers/supervisor_manager/keyboard_controller_cartpole.py
@@ -21,7 +21,7 @@ def step(self, action):
 
         "R" invokes the environment's reset method resetting the simulation to its initial state.
         """
-        observation, reward, is_done, info = self.controller.step(action)
+        observation, reward, terminated, truncated, info = self.controller.step(action)
 
         key = self.keyboard.getKey()
         if key == ord("T") and not self.controller.test:
@@ -31,4 +31,4 @@ def step(self, action):
             print("User invoked reset method.")
             self.controller.reset()
 
-        return observation, reward, is_done, info
+        return observation, reward, terminated, truncated, info
diff --git a/examples/cartpole/cartpole_discrete/controllers/supervisor_manager/supervisor_controller.py b/examples/cartpole/cartpole_discrete/controllers/supervisor_manager/supervisor_controller.py
index 5bf2ac4b..9e7a28f3 100644
--- a/examples/cartpole/cartpole_discrete/controllers/supervisor_manager/supervisor_controller.py
+++ b/examples/cartpole/cartpole_discrete/controllers/supervisor_manager/supervisor_controller.py
@@ -109,7 +109,7 @@ def get_reward(self, action=None):
         """
         return 1
 
-    def is_done(self):
+    def is_terminated(self):
         """
         An episode is done if the score is over 195.0, or if the pole is off balance, or the cart position is on the
         arena's edges.
@@ -134,6 +134,9 @@
 
         return False
 
+    def is_truncated(self):
+        return False
+
     def get_info(self):
         """
         Dummy implementation of get_info.