adding agent radius to config for obstacle maps; using 5% margin to i…

…dentify edge detection; making gripper cam level; only using gripper cam for semantic value; adding gripper cam panning at the beginning of episode; only using all camera for point clouds for the first 10 steps to reduce noise during journey; allowing obstacle map inputs to update obstacles and/or update explored area; updating obstacle height range used for reality experiments; making inferred depth range farther (10m); adding target object to GDINO caption if doesn't already exist in it
bdaiinstitute · Sep 14, 2023 · 3f8f23c · 3f8f23c
1 parent 4db07d8
commit 3f8f23c
Show file tree

Hide file tree

Showing 7 changed files with 133 additions and 54 deletions.
diff --git a/config/experiments/reality.yaml b/config/experiments/reality.yaml
@@ -13,10 +13,13 @@ policy:
     exploration_thresh: 0.0
     obstacle_map_area_threshold: 1.5  # in square meters
     text_prompt: "Seems like there is a target_object ahead."
+    min_obstacle_height: 0.3
+    max_obstacle_height: 1.5
+    agent_radius: 0.2
 
 env:
   max_body_cam_depth: 3.5
-  max_gripper_cam_depth: 5.0
+  max_gripper_cam_depth: 10.0
   max_lin_dist: 0.2
   max_ang_dist: 0.523599
   time_step: 0.7

diff --git a/zsos/mapping/object_point_cloud_map.py b/zsos/mapping/object_point_cloud_map.py
@@ -304,4 +304,13 @@ def too_offset(mask: np.ndarray) -> bool:
     third = mask.shape[1] // 3
 
     # Check if the entire bounding rectangle is in the left or right third of the mask
-    return x + w <= third or x >= 2 * third
+    if x + w <= third:
+        # Check if the leftmost point is at the edge of the image
+        # return x == 0
+        return x <= int(0.05 * mask.shape[1])
+    elif x >= 2 * third:
+        # Check if the rightmost point is at the edge of the image
+        # return x + w == mask.shape[1]
+        return x + w >= int(0.95 * mask.shape[1])
+    else:
+        return False
diff --git a/zsos/mapping/obstacle_map.py b/zsos/mapping/obstacle_map.py
@@ -58,6 +58,8 @@ def update_map(
         fx: float,
         fy: float,
         topdown_fov: float,
+        explore: bool = True,
+        update_obstacles: bool = True,
     ):
         """
         Adds all obstacles from the current view to the map. Also updates the area
@@ -75,34 +77,40 @@ def update_map(
             fy (float): The focal length of the camera in the y direction.
             topdown_fov (float): The field of view of the depth camera projected onto
                 the topdown map.
+            explore (bool): Whether to update the explored area.
+            update_obstacles (bool): Whether to update the obstacle map.
         """
-        if self._hole_area_thresh == -1:
-            filled_depth = depth.copy()
-            filled_depth[depth == 0] = 1.0
-        else:
-            filled_depth = fill_small_holes(depth, self._hole_area_thresh)
-        scaled_depth = filled_depth * (max_depth - min_depth) + min_depth
-        mask = scaled_depth < max_depth
-        point_cloud_camera_frame = get_point_cloud(scaled_depth, mask, fx, fy)
-        point_cloud_episodic_frame = transform_points(
-            tf_camera_to_episodic, point_cloud_camera_frame
-        )
-        obstacle_cloud = filter_points_by_height(
-            point_cloud_episodic_frame, self._min_height, self._max_height
-        )
-
-        # Populate topdown map with obstacle locations
-        xy_points = obstacle_cloud[:, :2]
-        pixel_points = self._xy_to_px(xy_points)
-        self._map[pixel_points[:, 1], pixel_points[:, 0]] = 1
+        if update_obstacles:
+            if self._hole_area_thresh == -1:
+                filled_depth = depth.copy()
+                filled_depth[depth == 0] = 1.0
+            else:
+                filled_depth = fill_small_holes(depth, self._hole_area_thresh)
+            scaled_depth = filled_depth * (max_depth - min_depth) + min_depth
+            mask = scaled_depth < max_depth
+            point_cloud_camera_frame = get_point_cloud(scaled_depth, mask, fx, fy)
+            point_cloud_episodic_frame = transform_points(
+                tf_camera_to_episodic, point_cloud_camera_frame
+            )
+            obstacle_cloud = filter_points_by_height(
+                point_cloud_episodic_frame, self._min_height, self._max_height
+            )
 
-        # Update the navigable area, which is an inverse of the obstacle map after a
-        # dilation operation to accommodate the robot's radius.
-        self._navigable_map = 1 - cv2.dilate(
-            self._map.astype(np.uint8),
-            self._navigable_kernel,
-            iterations=1,
-        ).astype(bool)
+            # Populate topdown map with obstacle locations
+            xy_points = obstacle_cloud[:, :2]
+            pixel_points = self._xy_to_px(xy_points)
+            self._map[pixel_points[:, 1], pixel_points[:, 0]] = 1
+
+            # Update the navigable area, which is an inverse of the obstacle map after a
+            # dilation operation to accommodate the robot's radius.
+            self._navigable_map = 1 - cv2.dilate(
+                self._map.astype(np.uint8),
+                self._navigable_kernel,
+                iterations=1,
+            ).astype(bool)
+
+        if not explore:
+            return
 
         # Update the explored area
         agent_xy_location = tf_camera_to_episodic[:2, 3]

diff --git a/zsos/policy/base_objectnav_policy.py b/zsos/policy/base_objectnav_policy.py
@@ -411,6 +411,7 @@ class ZSOSConfig:
     vqa_prompt: str = "Is this "
     coco_threshold: float = 0.6
     non_coco_threshold: float = 0.4
+    agent_radius: float = 0.18
 
     @classmethod
     @property

diff --git a/zsos/policy/reality_policies.py b/zsos/policy/reality_policies.py
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import Any, Dict, Union
+from typing import Any, Dict, List, Union
 
 import numpy as np
 import torch
@@ -11,6 +11,8 @@
 from zsos.policy.base_objectnav_policy import BaseObjectNavPolicy, ZSOSConfig
 from zsos.policy.itm_policy import ITMPolicyV2
 
+INITIAL_ARM_YAWS = np.deg2rad([-90, -60, -30, 0, 30, 60, 90, 0]).tolist()
+
 
 class RealityMixin:
     """
@@ -26,6 +28,7 @@ class RealityMixin:
         "chair . table . tv . laptop . microwave . toaster . sink . refrigerator . book"
         " . clock . vase . scissors . teddy bear . hair drier . toothbrush ."
     )
+    _initial_yaws: List = INITIAL_ARM_YAWS.copy()
 
     def __init__(self: BaseObjectNavPolicy, *args: Any, **kwargs: Any) -> None:
         super().__init__(sync_explored_areas=True, *args, **kwargs)
@@ -42,13 +45,17 @@ def from_config(cls, config: DictConfig, *args_unused, **kwargs_unused):
         return cls(**kwargs)
 
     def act(
-        self: BaseObjectNavPolicy,
+        self: Union["RealityMixin", BaseObjectNavPolicy],
         observations: Dict[str, Any],
         rnn_hidden_states: Any,
         prev_actions: Any,
         masks: Tensor,
         deterministic=False,
     ) -> Dict[str, Any]:
+        if observations["objectgoal"] not in self._non_coco_caption:
+            self._non_coco_caption = (
+                observations["objectgoal"] + " . " + self._non_coco_caption
+            )
         parent_cls: BaseObjectNavPolicy = super()  # type: ignore
         action: Tensor = parent_cls.act(
             observations, rnn_hidden_states, prev_actions, masks, deterministic
@@ -58,11 +65,22 @@ def act(
         # is the linear velocity and the second element is the angular velocity. We
         # convert this numpy array to a dictionary with keys "angular" and "linear" so
         # that it can be passed to the Spot robot.
-        action_dict = {
-            "angular": action[0][0].item(),
-            "linear": action[0][1].item(),
-            "info": self._policy_info,
-        }
+        if self._done_initializing:
+            action_dict = {
+                "angular": action[0][0].item(),
+                "linear": action[0][1].item(),
+                "arm_yaw": -1,
+                "info": self._policy_info,
+            }
+        else:
+            action_dict = {
+                "angular": 0,
+                "linear": 0,
+                "arm_yaw": action[0][0].item(),
+                "info": self._policy_info,
+            }
+
+        self._done_initializing = len(self._initial_yaws) == 0
 
         return action_dict
 
@@ -74,7 +92,12 @@ def get_action(
     def _reset(self: BaseObjectNavPolicy) -> None:
         parent_cls: BaseObjectNavPolicy = super()  # type: ignore
         parent_cls._reset()
-        self._done_initializing = True
+        self._initial_yaws = INITIAL_ARM_YAWS.copy()
+        self._done_initializing = False
+
+    def _initialize(self) -> Tensor:
+        yaw = self._initial_yaws.pop(0)
+        return torch.tensor([[yaw]], dtype=torch.float32)
 
     def _cache_observations(
         self: Union["RealityMixin", BaseObjectNavPolicy], observations: Dict[str, Any]
@@ -88,7 +111,7 @@ def _cache_observations(
             return
 
         self._obstacle_map: ObstacleMap
-        for obs_map_data in observations["obstacle_map_depths"]:
+        for obs_map_data in observations["obstacle_map_depths"][:-1]:
             depth, tf, min_depth, max_depth, fx, fy, topdown_fov = obs_map_data
             self._obstacle_map.update_map(
                 depth,
@@ -98,7 +121,24 @@ def _cache_observations(
                 fx,
                 fy,
                 topdown_fov,
+                explore=False,
             )
+
+        _, tf, min_depth, max_depth, fx, fy, topdown_fov = observations[
+            "obstacle_map_depths"
+        ][-1]
+        self._obstacle_map.update_map(
+            None,
+            tf,
+            min_depth,
+            max_depth,
+            fx,
+            fy,
+            topdown_fov,
+            explore=True,
+            update_obstacles=False,
+        )
+
         self._obstacle_map.update_agent_traj(
             observations["robot_xy"], observations["robot_heading"]
         )

diff --git a/zsos/reality/objectnav_env.py b/zsos/reality/objectnav_env.py
@@ -14,24 +14,24 @@
 
 LEFT_CROP = 124
 RIGHT_CROP = 60
-NOMINAL_ARM_POSE = np.deg2rad([0, -170, 120, 0, 75, 0])
+NOMINAL_ARM_POSE = np.deg2rad([0, -170, 120, 0, 55, 0])
 
 VALUE_MAP_CAMS = [
-    SpotCamIds.BACK_FISHEYE,
-    SpotCamIds.BACK_DEPTH_IN_VISUAL_FRAME,
-    SpotCamIds.LEFT_FISHEYE,
-    SpotCamIds.LEFT_DEPTH_IN_VISUAL_FRAME,
-    SpotCamIds.RIGHT_FISHEYE,
-    SpotCamIds.RIGHT_DEPTH_IN_VISUAL_FRAME,
+    # SpotCamIds.BACK_FISHEYE,
+    # SpotCamIds.BACK_DEPTH_IN_VISUAL_FRAME,
+    # SpotCamIds.LEFT_FISHEYE,
+    # SpotCamIds.LEFT_DEPTH_IN_VISUAL_FRAME,
+    # SpotCamIds.RIGHT_FISHEYE,
+    # SpotCamIds.RIGHT_DEPTH_IN_VISUAL_FRAME,
     SpotCamIds.HAND_COLOR,
 ]
 
 POINT_CLOUD_CAMS = [
-    SpotCamIds.RIGHT_DEPTH_IN_VISUAL_FRAME,
     SpotCamIds.FRONTLEFT_DEPTH,
     SpotCamIds.FRONTRIGHT_DEPTH,
     SpotCamIds.BACK_DEPTH_IN_VISUAL_FRAME,
     SpotCamIds.LEFT_DEPTH_IN_VISUAL_FRAME,
+    SpotCamIds.RIGHT_DEPTH_IN_VISUAL_FRAME,
 ]
 
 ALL_CAMS = list(set(VALUE_MAP_CAMS + POINT_CLOUD_CAMS))
@@ -95,9 +95,17 @@ def step(self, action: Dict[str, np.ndarray]) -> Tuple[Dict, float, bool, Dict]:
         cv2.imshow("Visualization", cv2.resize(vis_img, (0, 0), fx=0.5, fy=0.5))
         cv2.waitKey(1)
 
-        if "gripper_camera_pan" in action:
-            self._pan_gripper_camera(action["gripper_camera_pan"])
-        return super().step(action)
+        if action["arm_yaw"] == -1:
+            return super().step(action)
+
+        new_pose = np.array(NOMINAL_ARM_POSE)
+        new_pose[0] = action["arm_yaw"]
+        self.robot.set_arm_joints(new_pose, travel_time=0.5)
+        time.sleep(0.75)
+        done = False
+        self._num_steps += 1
+
+        return self._get_obs(), 0.0, done, {}  # not using info dict yet
 
     def _get_obs(self) -> Dict[str, np.ndarray]:
         robot_xy, robot_heading = self._get_gps(), self._get_compass()
@@ -174,7 +182,11 @@ def _get_camera_obs(self):
         # Obstacle map output is a list of (depth, tf, min_depth, max_depth, fx, fy,
         # topdown_fov) for each of the cameras in POINT_CLOUD_CAMS
         obstacle_map_depths = []
-        for src in POINT_CLOUD_CAMS:
+        if self._num_steps <= 10:
+            srcs = POINT_CLOUD_CAMS.copy()
+        else:
+            srcs = POINT_CLOUD_CAMS[:2]
+        for src in srcs:
             depth = cam_data[src]["image"]
             fx, fy = cam_data[src]["fx"], cam_data[src]["fy"]
             tf = cam_data[src]["tf_camera_to_global"]
@@ -185,6 +197,15 @@ def _get_camera_obs(self):
             src_data = (depth, tf, min_depth, self._max_body_cam_depth, fx, fy, fov)
             obstacle_map_depths.append(src_data)
 
+        tf = cam_data[SpotCamIds.HAND_COLOR]["tf_camera_to_global"]
+        fx, fy = (
+            cam_data[SpotCamIds.HAND_COLOR]["fx"],
+            cam_data[SpotCamIds.HAND_COLOR]["fy"],
+        )
+        fov = get_fov(fx, cam_data[src]["image"].shape[1])
+        src_data = (None, tf, min_depth, self._max_body_cam_depth, fx, fy, fov)
+        obstacle_map_depths.append(src_data)
+
         # Value map output is a list of (rgb, depth, tf_camera_to_episodic, min_depth,
         # max_depth, fov) for each of the cameras in VALUE_MAP_CAMS
         value_map_rgbd = []
@@ -231,8 +252,3 @@ def _get_compass(self) -> float:
         global_yaw = self.robot.xy_yaw[1]
         episodic_yaw = wrap_heading(global_yaw - self.episodic_start_yaw)
         return episodic_yaw
-
-    def _pan_gripper_camera(self, yaw):
-        new_pose = np.array(NOMINAL_ARM_POSE)
-        new_pose[0] = yaw[0]
-        self.robot.set_arm_joints(new_pose, travel_time=self.time_step * 0.9)
diff --git a/zsos/reality/pointnav_env.py b/zsos/reality/pointnav_env.py
@@ -33,6 +33,7 @@ def __init__(
         self._max_ang_dist = max_ang_dist
         self._time_step = time_step
         self._cmd_id = None
+        self._num_steps = 0
 
     def reset(self, goal: Any, relative=True, *args, **kwargs) -> Dict[str, np.ndarray]:
         assert isinstance(goal, np.ndarray)
@@ -70,6 +71,7 @@ def step(self, action: Dict[str, np.ndarray]) -> Tuple[Dict, float, bool, Dict]:
             blocking=False,
         )
         done = ang_dist == 0.0 and lin_dist == 0.0
+        self._num_steps += 1
         return self._get_obs(), 0.0, done, {}  # not using info dict yet
 
     def _compute_velocities(self, action: Dict[str, np.ndarray]) -> Tuple[float, float]: