diff --git a/config/experiments/reality.yaml b/config/experiments/reality.yaml
index 16f01f8..845e1ba 100644
--- a/config/experiments/reality.yaml
+++ b/config/experiments/reality.yaml
@@ -13,10 +13,13 @@ policy:
   exploration_thresh: 0.0
   obstacle_map_area_threshold: 1.5  # in square meters
   text_prompt: "Seems like there is a target_object ahead."
+  min_obstacle_height: 0.3
+  max_obstacle_height: 1.5
+  agent_radius: 0.2
 
 env:
   max_body_cam_depth: 3.5
-  max_gripper_cam_depth: 5.0
+  max_gripper_cam_depth: 10.0
   max_lin_dist: 0.2
   max_ang_dist: 0.523599
   time_step: 0.7
diff --git a/zsos/mapping/object_point_cloud_map.py b/zsos/mapping/object_point_cloud_map.py
index 93ac17b..6ccd168 100644
--- a/zsos/mapping/object_point_cloud_map.py
+++ b/zsos/mapping/object_point_cloud_map.py
@@ -304,4 +304,13 @@ def too_offset(mask: np.ndarray) -> bool:
     third = mask.shape[1] // 3
 
     # Check if the entire bounding rectangle is in the left or right third of the mask
-    return x + w <= third or x >= 2 * third
+    if x + w <= third:
+        # Check if the leftmost point is at the edge of the image
+        # return x == 0
+        return x <= int(0.05 * mask.shape[1])
+    elif x >= 2 * third:
+        # Check if the rightmost point is at the edge of the image
+        # return x + w == mask.shape[1]
+        return x + w >= int(0.95 * mask.shape[1])
+    else:
+        return False
diff --git a/zsos/mapping/obstacle_map.py b/zsos/mapping/obstacle_map.py
index bc8d955..a4f504d 100644
--- a/zsos/mapping/obstacle_map.py
+++ b/zsos/mapping/obstacle_map.py
@@ -58,6 +58,8 @@ def update_map(
         fx: float,
         fy: float,
         topdown_fov: float,
+        explore: bool = True,
+        update_obstacles: bool = True,
     ):
         """
         Adds all obstacles from the current view to the map. Also updates the area
@@ -75,34 +77,40 @@ def update_map(
             fy (float): The focal length of the camera in the y direction.
             topdown_fov (float): The field of view of the depth camera projected
                 onto the topdown map.
+            explore (bool): Whether to update the explored area.
+            update_obstacles (bool): Whether to update the obstacle map.
         """
-        if self._hole_area_thresh == -1:
-            filled_depth = depth.copy()
-            filled_depth[depth == 0] = 1.0
-        else:
-            filled_depth = fill_small_holes(depth, self._hole_area_thresh)
-        scaled_depth = filled_depth * (max_depth - min_depth) + min_depth
-        mask = scaled_depth < max_depth
-        point_cloud_camera_frame = get_point_cloud(scaled_depth, mask, fx, fy)
-        point_cloud_episodic_frame = transform_points(
-            tf_camera_to_episodic, point_cloud_camera_frame
-        )
-        obstacle_cloud = filter_points_by_height(
-            point_cloud_episodic_frame, self._min_height, self._max_height
-        )
-
-        # Populate topdown map with obstacle locations
-        xy_points = obstacle_cloud[:, :2]
-        pixel_points = self._xy_to_px(xy_points)
-        self._map[pixel_points[:, 1], pixel_points[:, 0]] = 1
+        if update_obstacles:
+            if self._hole_area_thresh == -1:
+                filled_depth = depth.copy()
+                filled_depth[depth == 0] = 1.0
+            else:
+                filled_depth = fill_small_holes(depth, self._hole_area_thresh)
+            scaled_depth = filled_depth * (max_depth - min_depth) + min_depth
+            mask = scaled_depth < max_depth
+            point_cloud_camera_frame = get_point_cloud(scaled_depth, mask, fx, fy)
+            point_cloud_episodic_frame = transform_points(
+                tf_camera_to_episodic, point_cloud_camera_frame
+            )
+            obstacle_cloud = filter_points_by_height(
+                point_cloud_episodic_frame, self._min_height, self._max_height
+            )
-
-        # Update the navigable area, which is an inverse of the obstacle map after a
-        # dilation operation to accommodate the robot's radius.
-        self._navigable_map = 1 - cv2.dilate(
-            self._map.astype(np.uint8),
-            self._navigable_kernel,
-            iterations=1,
-        ).astype(bool)
+
+            # Populate topdown map with obstacle locations
+            xy_points = obstacle_cloud[:, :2]
+            pixel_points = self._xy_to_px(xy_points)
+            self._map[pixel_points[:, 1], pixel_points[:, 0]] = 1
+
+            # Update the navigable area, which is an inverse of the obstacle map after a
+            # dilation operation to accommodate the robot's radius.
+            self._navigable_map = 1 - cv2.dilate(
+                self._map.astype(np.uint8),
+                self._navigable_kernel,
+                iterations=1,
+            ).astype(bool)
+
+        if not explore:
+            return
 
         # Update the explored area
         agent_xy_location = tf_camera_to_episodic[:2, 3]
diff --git a/zsos/policy/base_objectnav_policy.py b/zsos/policy/base_objectnav_policy.py
index bd42e89..6a1b521 100644
--- a/zsos/policy/base_objectnav_policy.py
+++ b/zsos/policy/base_objectnav_policy.py
@@ -411,6 +411,7 @@ class ZSOSConfig:
     vqa_prompt: str = "Is this "
     coco_threshold: float = 0.6
     non_coco_threshold: float = 0.4
+    agent_radius: float = 0.18
 
     @classmethod
     @property
diff --git a/zsos/policy/reality_policies.py b/zsos/policy/reality_policies.py
index dc31241..62d3028 100644
--- a/zsos/policy/reality_policies.py
+++ b/zsos/policy/reality_policies.py
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import Any, Dict, Union
+from typing import Any, Dict, List, Union
 
 import numpy as np
 import torch
@@ -11,6 +11,8 @@
 from zsos.policy.base_objectnav_policy import BaseObjectNavPolicy, ZSOSConfig
 from zsos.policy.itm_policy import ITMPolicyV2
 
+INITIAL_ARM_YAWS = np.deg2rad([-90, -60, -30, 0, 30, 60, 90, 0]).tolist()
+
 
 class RealityMixin:
     """
@@ -26,6 +28,7 @@ class RealityMixin:
         "chair . table . tv . laptop . microwave . toaster . sink . refrigerator . book"
         " . clock . vase . scissors . teddy bear . hair drier . toothbrush ."
     )
+    _initial_yaws: List = INITIAL_ARM_YAWS.copy()
 
     def __init__(self: BaseObjectNavPolicy, *args: Any, **kwargs: Any) -> None:
         super().__init__(sync_explored_areas=True, *args, **kwargs)
@@ -42,13 +45,17 @@ def from_config(cls, config: DictConfig, *args_unused, **kwargs_unused):
        return cls(**kwargs)
 
     def act(
-        self: BaseObjectNavPolicy,
+        self: Union["RealityMixin", BaseObjectNavPolicy],
         observations: Dict[str, Any],
         rnn_hidden_states: Any,
         prev_actions: Any,
         masks: Tensor,
         deterministic=False,
     ) -> Dict[str, Any]:
+        if observations["objectgoal"] not in self._non_coco_caption:
+            self._non_coco_caption = (
+                observations["objectgoal"] + " . " + self._non_coco_caption
+            )
         parent_cls: BaseObjectNavPolicy = super()  # type: ignore
         action: Tensor = parent_cls.act(
             observations, rnn_hidden_states, prev_actions, masks, deterministic
@@ -58,11 +65,22 @@ def act(
         # is the linear velocity and the second element is the angular velocity. We
         # convert this numpy array to a dictionary with keys "angular" and "linear" so
         # that it can be passed to the Spot robot.
-        action_dict = {
-            "angular": action[0][0].item(),
-            "linear": action[0][1].item(),
-            "info": self._policy_info,
-        }
+        if self._done_initializing:
+            action_dict = {
+                "angular": action[0][0].item(),
+                "linear": action[0][1].item(),
+                "arm_yaw": -1,
+                "info": self._policy_info,
+            }
+        else:
+            action_dict = {
+                "angular": 0,
+                "linear": 0,
+                "arm_yaw": action[0][0].item(),
+                "info": self._policy_info,
+            }
+
+        self._done_initializing = len(self._initial_yaws) == 0
 
         return action_dict
 
@@ -74,7 +92,12 @@ def get_action(
     def _reset(self: BaseObjectNavPolicy) -> None:
         parent_cls: BaseObjectNavPolicy = super()  # type: ignore
         parent_cls._reset()
-        self._done_initializing = True
+        self._initial_yaws = INITIAL_ARM_YAWS.copy()
+        self._done_initializing = False
+
+    def _initialize(self) -> Tensor:
+        yaw = self._initial_yaws.pop(0)
+        return torch.tensor([[yaw]], dtype=torch.float32)
 
     def _cache_observations(
         self: Union["RealityMixin", BaseObjectNavPolicy], observations: Dict[str, Any]
@@ -88,7 +111,7 @@ def _cache_observations(
             return
 
         self._obstacle_map: ObstacleMap
-        for obs_map_data in observations["obstacle_map_depths"]:
+        for obs_map_data in observations["obstacle_map_depths"][:-1]:
             depth, tf, min_depth, max_depth, fx, fy, topdown_fov = obs_map_data
             self._obstacle_map.update_map(
                 depth,
@@ -98,7 +121,24 @@ def _cache_observations(
                 fx,
                 fy,
                 topdown_fov,
+                explore=False,
             )
+
+        _, tf, min_depth, max_depth, fx, fy, topdown_fov = observations[
+            "obstacle_map_depths"
+        ][-1]
+        self._obstacle_map.update_map(
+            None,
+            tf,
+            min_depth,
+            max_depth,
+            fx,
+            fy,
+            topdown_fov,
+            explore=True,
+            update_obstacles=False,
+        )
+
         self._obstacle_map.update_agent_traj(
             observations["robot_xy"], observations["robot_heading"]
         )
diff --git a/zsos/reality/objectnav_env.py b/zsos/reality/objectnav_env.py
index c81b34b..c001c6f 100644
--- a/zsos/reality/objectnav_env.py
+++ b/zsos/reality/objectnav_env.py
@@ -14,24 +14,24 @@
 LEFT_CROP = 124
 RIGHT_CROP = 60
 
-NOMINAL_ARM_POSE = np.deg2rad([0, -170, 120, 0, 75, 0])
+NOMINAL_ARM_POSE = np.deg2rad([0, -170, 120, 0, 55, 0])
 
 VALUE_MAP_CAMS = [
-    SpotCamIds.BACK_FISHEYE,
-    SpotCamIds.BACK_DEPTH_IN_VISUAL_FRAME,
-    SpotCamIds.LEFT_FISHEYE,
-    SpotCamIds.LEFT_DEPTH_IN_VISUAL_FRAME,
-    SpotCamIds.RIGHT_FISHEYE,
-    SpotCamIds.RIGHT_DEPTH_IN_VISUAL_FRAME,
+    # SpotCamIds.BACK_FISHEYE,
+    # SpotCamIds.BACK_DEPTH_IN_VISUAL_FRAME,
+    # SpotCamIds.LEFT_FISHEYE,
+    # SpotCamIds.LEFT_DEPTH_IN_VISUAL_FRAME,
+    # SpotCamIds.RIGHT_FISHEYE,
+    # SpotCamIds.RIGHT_DEPTH_IN_VISUAL_FRAME,
     SpotCamIds.HAND_COLOR,
 ]
 
 POINT_CLOUD_CAMS = [
-    SpotCamIds.RIGHT_DEPTH_IN_VISUAL_FRAME,
     SpotCamIds.FRONTLEFT_DEPTH,
     SpotCamIds.FRONTRIGHT_DEPTH,
     SpotCamIds.BACK_DEPTH_IN_VISUAL_FRAME,
     SpotCamIds.LEFT_DEPTH_IN_VISUAL_FRAME,
+    SpotCamIds.RIGHT_DEPTH_IN_VISUAL_FRAME,
 ]
 
 ALL_CAMS = list(set(VALUE_MAP_CAMS + POINT_CLOUD_CAMS))
@@ -95,9 +95,17 @@ def step(self, action: Dict[str, np.ndarray]) -> Tuple[Dict, float, bool, Dict]:
         cv2.imshow("Visualization", cv2.resize(vis_img, (0, 0), fx=0.5, fy=0.5))
         cv2.waitKey(1)
 
-        if "gripper_camera_pan" in action:
-            self._pan_gripper_camera(action["gripper_camera_pan"])
-        return super().step(action)
+        if action["arm_yaw"] == -1:
+            return super().step(action)
+
+        new_pose = np.array(NOMINAL_ARM_POSE)
+        new_pose[0] = action["arm_yaw"]
+        self.robot.set_arm_joints(new_pose, travel_time=0.5)
+        time.sleep(0.75)
+        done = False
+        self._num_steps += 1
+
+        return self._get_obs(), 0.0, done, {}  # not using info dict yet
 
     def _get_obs(self) -> Dict[str, np.ndarray]:
         robot_xy, robot_heading = self._get_gps(), self._get_compass()
@@ -174,7 +182,11 @@ def _get_camera_obs(self):
         # Obstacle map output is a list of (depth, tf, min_depth, max_depth, fx, fy,
         # topdown_fov) for each of the cameras in POINT_CLOUD_CAMS
         obstacle_map_depths = []
-        for src in POINT_CLOUD_CAMS:
+        if self._num_steps <= 10:
+            srcs = POINT_CLOUD_CAMS.copy()
+        else:
+            srcs = POINT_CLOUD_CAMS[:2]
+        for src in srcs:
             depth = cam_data[src]["image"]
             fx, fy = cam_data[src]["fx"], cam_data[src]["fy"]
             tf = cam_data[src]["tf_camera_to_global"]
@@ -185,6 +197,15 @@ def _get_camera_obs(self):
             src_data = (depth, tf, min_depth, self._max_body_cam_depth, fx, fy, fov)
             obstacle_map_depths.append(src_data)
 
+        tf = cam_data[SpotCamIds.HAND_COLOR]["tf_camera_to_global"]
+        fx, fy = (
+            cam_data[SpotCamIds.HAND_COLOR]["fx"],
+            cam_data[SpotCamIds.HAND_COLOR]["fy"],
+        )
+        fov = get_fov(fx, cam_data[src]["image"].shape[1])
+        src_data = (None, tf, min_depth, self._max_body_cam_depth, fx, fy, fov)
+        obstacle_map_depths.append(src_data)
+
         # Value map output is a list of (rgb, depth, tf_camera_to_episodic, min_depth,
         # max_depth, fov) for each of the cameras in VALUE_MAP_CAMS
         value_map_rgbd = []
@@ -231,8 +252,3 @@ def _get_compass(self) -> float:
         global_yaw = self.robot.xy_yaw[1]
         episodic_yaw = wrap_heading(global_yaw - self.episodic_start_yaw)
         return episodic_yaw
-
-    def _pan_gripper_camera(self, yaw):
-        new_pose = np.array(NOMINAL_ARM_POSE)
-        new_pose[0] = yaw[0]
-        self.robot.set_arm_joints(new_pose, travel_time=self.time_step * 0.9)
diff --git a/zsos/reality/pointnav_env.py b/zsos/reality/pointnav_env.py
index dc2023c..2edb475 100644
--- a/zsos/reality/pointnav_env.py
+++ b/zsos/reality/pointnav_env.py
@@ -33,6 +33,7 @@ def __init__(
         self._max_ang_dist = max_ang_dist
         self._time_step = time_step
         self._cmd_id = None
+        self._num_steps = 0
 
     def reset(self, goal: Any, relative=True, *args, **kwargs) -> Dict[str, np.ndarray]:
         assert isinstance(goal, np.ndarray)
@@ -70,6 +71,7 @@ def step(self, action: Dict[str, np.ndarray]) -> Tuple[Dict, float, bool, Dict]:
             blocking=False,
         )
         done = ang_dist == 0.0 and lin_dist == 0.0
+        self._num_steps += 1
         return self._get_obs(), 0.0, done, {}  # not using info dict yet
 
     def _compute_velocities(self, action: Dict[str, np.ndarray]) -> Tuple[float, float]:
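
Note: the following is a minimal, hypothetical driver loop sketching the control flow this patch creates; `policy`, `env`, and `goal` are illustrative names, not part of the diff. For the first len(INITIAL_ARM_YAWS) steps after a reset, the policy is still initializing: act() returns zero base velocities plus the next arm yaw popped by _initialize(), and ObjectNavEnv.step() pans the gripper camera (NOMINAL_ARM_POSE with the commanded yaw) instead of driving the base. Once _initial_yaws is exhausted, act() reports arm_yaw == -1 and step() falls through to the normal base-velocity branch.

    import torch

    observations = env.reset(goal)               # hypothetical env/goal wiring
    masks = torch.zeros(1, 1, dtype=torch.bool)  # zero mask marks the episode start
    done = False
    while not done:
        # Initialization steps: {"angular": 0, "linear": 0, "arm_yaw": <radians>, ...}
        # Navigation steps:     {"angular": w, "linear": v, "arm_yaw": -1, ...}
        action = policy.act(observations, None, None, masks, deterministic=True)
        observations, _, done, _ = env.step(action)
        masks = torch.ones(1, 1, dtype=torch.bool)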