From 54bc942dcf0397149ac46be61eb17d67996fc310 Mon Sep 17 00:00:00 2001
From: KelvinYang0320
Date: Sat, 30 Sep 2023 22:11:51 +0800
Subject: [PATCH 1/2] Updated is_terminated, is_truncated, and check_env for
 gymnasium in cartpole_discrete_SB3

---
 .../PPO_runner.py       | 4 ++--
 .../robot_supervisor.py | 9 ++++++---
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/examples/cartpole/cartpole_discrete/controllers/robot_supervisor_manager_Stable_Baselines/PPO_runner.py b/examples/cartpole/cartpole_discrete/controllers/robot_supervisor_manager_Stable_Baselines/PPO_runner.py
index 736036cb..92428c90 100644
--- a/examples/cartpole/cartpole_discrete/controllers/robot_supervisor_manager_Stable_Baselines/PPO_runner.py
+++ b/examples/cartpole/cartpole_discrete/controllers/robot_supervisor_manager_Stable_Baselines/PPO_runner.py
@@ -1,5 +1,5 @@
 from numpy import convolve, ones, mean
-import gym
+import gymnasium as gym
 from robot_supervisor import CartPoleRobotSupervisor
 from utilities import plot_data
 
@@ -13,7 +13,7 @@ def run():
     env = CartPoleRobotSupervisor()
 
     # Verify that the environment is working as a gym-style env
-    #check_env(env)
+    check_env(env)
 
     # Use the PPO algorithm from the stable baselines having MLP, verbose=1 output the training information
     model = PPO("MlpPolicy", env, verbose=1)
diff --git a/examples/cartpole/cartpole_discrete/controllers/robot_supervisor_manager_Stable_Baselines/robot_supervisor.py b/examples/cartpole/cartpole_discrete/controllers/robot_supervisor_manager_Stable_Baselines/robot_supervisor.py
index 19c481be..1dd915b8 100644
--- a/examples/cartpole/cartpole_discrete/controllers/robot_supervisor_manager_Stable_Baselines/robot_supervisor.py
+++ b/examples/cartpole/cartpole_discrete/controllers/robot_supervisor_manager_Stable_Baselines/robot_supervisor.py
@@ -1,7 +1,7 @@
 from deepbots.supervisor import RobotSupervisorEnv
 from utilities import normalize_to_range
 
-from gym.spaces import Box, Discrete
+from gymnasium.spaces import Box, Discrete
 import numpy as np
 
 
@@ -109,9 +109,9 @@ def get_reward(self, action):
         """
         return 1
 
-    def is_done(self):
+    def is_terminated(self):
         """
-        An episode is done if the score is over 195.0, or if the pole is off balance, or the cart position is on the
+        An episode is terminated if the score is over 195.0, or if the pole is off balance, or the cart position is on the
         arena's edges.
 
         :return: True if termination conditions are met, False otherwise
@@ -130,6 +130,9 @@
 
         return False
 
+    def is_truncated(self):
+        return False
+
     def solved(self):
         """
         This method checks whether the CartPole task is solved, so training terminates.
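Reviewer note on the API being adopted above: under gymnasium, `reset()` returns an `(observation, info)` tuple and `step()` returns a five-tuple `(observation, reward, terminated, truncated, info)`, splitting the old `done` flag into a terminal-state signal (`terminated`) and an early-cutoff signal (`truncated`). Stable Baselines3's `check_env`, re-enabled in this patch, validates exactly this interface, which is why the call can be restored once the spaces come from `gymnasium.spaces`. A minimal sketch of the contract against a stock environment (`CartPole-v1` is only a stand-in for the Webots env here):

    import gymnasium as gym

    env = gym.make("CartPole-v1")
    obs, info = env.reset(seed=0)  # reset() now returns (observation, info)
    done = False
    while not done:
        action = env.action_space.sample()  # random policy, just to drive the loop
        obs, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated  # an episode ends on either flag
    env.close()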
From 6d854b591d10e8e0864a7efefdc38e5216dfdd81 Mon Sep 17 00:00:00 2001
From: KelvinYang0320
Date: Mon, 2 Oct 2023 21:24:45 +0800
Subject: [PATCH 2/2] gymnasium step and reset

---
 .../controllers/robot_supervisor_manager/PPO_runner.py | 12 ++++++------
 .../robot_supervisor_manager/robot_supervisor.py       |  7 +++++--
 .../controllers/supervisor_manager/PPO_runner.py       | 12 ++++++------
 .../supervisor_manager/keyboard_controller_cartpole.py |  4 ++--
 .../supervisor_manager/supervisor_controller.py        |  5 ++++-
 5 files changed, 23 insertions(+), 17 deletions(-)

diff --git a/examples/cartpole/cartpole_discrete/controllers/robot_supervisor_manager/PPO_runner.py b/examples/cartpole/cartpole_discrete/controllers/robot_supervisor_manager/PPO_runner.py
index 85093ce1..3664eb94 100644
--- a/examples/cartpole/cartpole_discrete/controllers/robot_supervisor_manager/PPO_runner.py
+++ b/examples/cartpole/cartpole_discrete/controllers/robot_supervisor_manager/PPO_runner.py
@@ -20,7 +20,7 @@ def run():
 
     # Run outer loop until the episodes limit is reached or the task is solved
     while not solved and episode_count < episode_limit:
-        state = env.reset()  # Reset robot and get starting observation
+        state, _ = env.reset()  # Reset robot and get starting observation
         env.episode_score = 0
 
         action_probs = []  # This list holds the probability of each chosen action
@@ -33,14 +33,14 @@ def run():
 
             # Step the supervisor to get the current selected_action reward, the new state and whether we reached the
            # done condition
-            new_state, reward, done, info = env.step([selected_action])
+            new_state, reward, terminated, truncated, info = env.step([selected_action])
 
             # Save the current state transition in agent's memory
             trans = Transition(state, selected_action, action_prob, reward, new_state)
             agent.store_transition(trans)
 
             env.episode_score += reward  # Accumulate episode reward
-            if done:
+            if terminated or truncated:
                 # Save the episode's score
                 env.episode_score_list.append(env.episode_score)
                 agent.train_step(batch_size=step + 1)
@@ -74,10 +74,10 @@ def run():
 
     env.episode_score = 0
     while True:
         selected_action, action_prob = agent.work(state, type_="selectActionMax")
-        state, reward, done, _ = env.step([selected_action])
+        state, reward, terminated, truncated, _ = env.step([selected_action])
         env.episode_score += reward  # Accumulate episode reward
-        if done:
+        if terminated or truncated:
             print("Reward accumulated =", env.episode_score)
             env.episode_score = 0
-            state = env.reset()
+            state, _ = env.reset()
diff --git a/examples/cartpole/cartpole_discrete/controllers/robot_supervisor_manager/robot_supervisor.py b/examples/cartpole/cartpole_discrete/controllers/robot_supervisor_manager/robot_supervisor.py
index 622b511a..24293395 100644
--- a/examples/cartpole/cartpole_discrete/controllers/robot_supervisor_manager/robot_supervisor.py
+++ b/examples/cartpole/cartpole_discrete/controllers/robot_supervisor_manager/robot_supervisor.py
@@ -1,7 +1,7 @@
 from deepbots.supervisor import RobotSupervisorEnv
 from utilities import normalize_to_range
 
-from gym.spaces import Box, Discrete
+from gymnasium.spaces import Box, Discrete
 import numpy as np
 
 
@@ -109,7 +109,7 @@ def get_reward(self, action):
         """
         return 1
 
-    def is_done(self):
+    def is_terminated(self):
         """
         An episode is done if the score is over 195.0, or if the pole is off balance, or the cart position is on the
         arena's edges.
@@ -130,6 +130,9 @@
 
         return False
 
+    def is_truncated(self):
+        return False
+
     def solved(self):
         """
         This method checks whether the CartPole task is solved, so training terminates.
diff --git a/examples/cartpole/cartpole_discrete/controllers/supervisor_manager/PPO_runner.py b/examples/cartpole/cartpole_discrete/controllers/supervisor_manager/PPO_runner.py
index ba041479..f258887b 100644
--- a/examples/cartpole/cartpole_discrete/controllers/supervisor_manager/PPO_runner.py
+++ b/examples/cartpole/cartpole_discrete/controllers/supervisor_manager/PPO_runner.py
@@ -24,7 +24,7 @@ def run():
 
     # Run outer loop until the episodes limit is reached or the task is solved
     while not solved and episode_count < episod_limit:
-        state = supervisor_env.reset()  # Reset robot and get starting observation
+        state, _ = supervisor_env.reset()  # Reset robot and get starting observation
         supervisor_pre.episodeScore = 0
 
         action_probs = []  # This list holds the probability of each chosen action
@@ -37,14 +37,14 @@ def run():
 
             # Step the supervisor to get the current selected_action reward, the new state and whether we reached the
             # done condition
-            new_state, reward, done, info = supervisor_env.step([selected_action])
+            new_state, reward, terminated, truncated, info = supervisor_env.step([selected_action])
 
             # Save the current state transition in agent's memory
             trans = Transition(state, selected_action, action_prob, reward, new_state)
             agent.store_transition(trans)
 
             supervisor_pre.episodeScore += reward  # Accumulate episode reward
-            if done:
+            if terminated or truncated:
                 # Save the episode's score
                 supervisor_pre.episode_score_list.append(supervisor_pre.episodeScore)
                 agent.train_step(batch_size=step + 1)
@@ -85,10 +85,10 @@ def run():
 
     supervisor_pre.episodeScore = 0
     while True:
         selected_action, action_prob = agent.work(state, type_="selectActionMax")
-        state, reward, done, _ = supervisor_env.step([selected_action])
+        state, reward, terminated, truncated, _ = supervisor_env.step([selected_action])
         supervisor_pre.episodeScore += reward  # Accumulate episode reward
-        if done:
+        if terminated or truncated:
             print("Reward accumulated =", supervisor_pre.episodeScore)
             supervisor_pre.episodeScore = 0
-            state = supervisor_env.reset()
+            state, _ = supervisor_env.reset()
diff --git a/examples/cartpole/cartpole_discrete/controllers/supervisor_manager/keyboard_controller_cartpole.py b/examples/cartpole/cartpole_discrete/controllers/supervisor_manager/keyboard_controller_cartpole.py
index e67dced6..5f66ac01 100644
--- a/examples/cartpole/cartpole_discrete/controllers/supervisor_manager/keyboard_controller_cartpole.py
+++ b/examples/cartpole/cartpole_discrete/controllers/supervisor_manager/keyboard_controller_cartpole.py
@@ -21,7 +21,7 @@ def step(self, action):
 
         "R" invokes the environment's reset method resetting the simulation to its initial state.
         """
-        observation, reward, is_done, info = self.controller.step(action)
+        observation, reward, terminated, truncated, info = self.controller.step(action)
 
         key = self.keyboard.getKey()
         if key == ord("T") and not self.controller.test:
@@ -31,4 +31,4 @@ def step(self, action):
             print("User invoked reset method.")
             self.controller.reset()
 
-        return observation, reward, is_done, info
+        return observation, reward, terminated, truncated, info
diff --git a/examples/cartpole/cartpole_discrete/controllers/supervisor_manager/supervisor_controller.py b/examples/cartpole/cartpole_discrete/controllers/supervisor_manager/supervisor_controller.py
index 5bf2ac4b..9e7a28f3 100644
--- a/examples/cartpole/cartpole_discrete/controllers/supervisor_manager/supervisor_controller.py
+++ b/examples/cartpole/cartpole_discrete/controllers/supervisor_manager/supervisor_controller.py
@@ -109,7 +109,7 @@ def get_reward(self, action=None):
         """
         return 1
 
-    def is_done(self):
+    def is_terminated(self):
         """
         An episode is done if the score is over 195.0, or if the pole is off balance, or the cart position is on the
         arena's edges.
@@ -134,6 +134,9 @@
 
         return False
 
+    def is_truncated(self):
+        return False
+
     def get_info(self):
         """
         Dummy implementation of get_info.