From 2a0300708f67bfb9b60dbae36e2c3c9e18626eaa Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Mon, 16 Sep 2024 18:28:11 +0100 Subject: [PATCH 01/29] backup wip --- .../aloha_default/left_follower.json | 68 -- .../aloha_default/left_leader.json | 68 -- .../aloha_default/right_follower.json | 68 -- .../aloha_default/right_leader.json | 68 -- lerobot/common/datasets/online_buffer.py | 418 +++++++---- .../robot_devices/cameras/intelrealsense.py | 448 ----------- .../robot_devices/robots/manipulator.py | 703 ------------------ lerobot/configs/robot/aloha.yaml | 115 --- lerobot/configs/robot/koch_bimanual.yaml | 75 -- lerobot/scripts/train.py | 17 +- .../aloha_mobile_cabinet/meta_data/info.json | 11 - 11 files changed, 300 insertions(+), 1759 deletions(-) delete mode 100644 .cache/calibration/aloha_default/left_follower.json delete mode 100644 .cache/calibration/aloha_default/left_leader.json delete mode 100644 .cache/calibration/aloha_default/right_follower.json delete mode 100644 .cache/calibration/aloha_default/right_leader.json delete mode 100644 lerobot/common/robot_devices/cameras/intelrealsense.py delete mode 100644 lerobot/common/robot_devices/robots/manipulator.py delete mode 100644 lerobot/configs/robot/aloha.yaml delete mode 100644 lerobot/configs/robot/koch_bimanual.yaml delete mode 100644 tests/data/lerobot/aloha_mobile_cabinet/meta_data/info.json diff --git a/.cache/calibration/aloha_default/left_follower.json b/.cache/calibration/aloha_default/left_follower.json deleted file mode 100644 index 336c238a0..000000000 --- a/.cache/calibration/aloha_default/left_follower.json +++ /dev/null @@ -1,68 +0,0 @@ -{ - "homing_offset": [ - 2048, - 3072, - 3072, - -1024, - -1024, - 2048, - -2048, - 2048, - -2048 - ], - "drive_mode": [ - 1, - 1, - 1, - 0, - 0, - 1, - 0, - 1, - 0 - ], - "start_pos": [ - 2015, - 3058, - 3061, - 1071, - 1071, - 2035, - 2152, - 2029, - 2499 - ], - "end_pos": [ - -1008, - -1963, - -1966, - 2141, - 2143, - -971, - 3043, - -1077, - 3144 - ], - "calib_mode": [ - "DEGREE", - "DEGREE", - "DEGREE", - "DEGREE", - "DEGREE", - "DEGREE", - "DEGREE", - "DEGREE", - "LINEAR" - ], - "motor_names": [ - "waist", - "shoulder", - "shoulder_shadow", - "elbow", - "elbow_shadow", - "forearm_roll", - "wrist_angle", - "wrist_rotate", - "gripper" - ] -} diff --git a/.cache/calibration/aloha_default/left_leader.json b/.cache/calibration/aloha_default/left_leader.json deleted file mode 100644 index d933f2bab..000000000 --- a/.cache/calibration/aloha_default/left_leader.json +++ /dev/null @@ -1,68 +0,0 @@ -{ - "homing_offset": [ - 2048, - 3072, - 3072, - -1024, - -1024, - 2048, - -2048, - 2048, - -1024 - ], - "drive_mode": [ - 1, - 1, - 1, - 0, - 0, - 1, - 0, - 1, - 0 - ], - "start_pos": [ - 2035, - 3024, - 3019, - 979, - 981, - 1982, - 2166, - 2124, - 1968 - ], - "end_pos": [ - -990, - -2017, - -2015, - 2078, - 2076, - -1030, - 3117, - -1016, - 2556 - ], - "calib_mode": [ - "DEGREE", - "DEGREE", - "DEGREE", - "DEGREE", - "DEGREE", - "DEGREE", - "DEGREE", - "DEGREE", - "LINEAR" - ], - "motor_names": [ - "waist", - "shoulder", - "shoulder_shadow", - "elbow", - "elbow_shadow", - "forearm_roll", - "wrist_angle", - "wrist_rotate", - "gripper" - ] -} diff --git a/.cache/calibration/aloha_default/right_follower.json b/.cache/calibration/aloha_default/right_follower.json deleted file mode 100644 index bc69dfafd..000000000 --- a/.cache/calibration/aloha_default/right_follower.json +++ /dev/null @@ -1,68 +0,0 @@ -{ - "homing_offset": [ - 2048, - 3072, - 3072, - -1024, - -1024, - 2048, - -2048, - 
2048, - -2048 - ], - "drive_mode": [ - 1, - 1, - 1, - 0, - 0, - 1, - 0, - 1, - 0 - ], - "start_pos": [ - 2056, - 2895, - 2896, - 1191, - 1190, - 2018, - 2051, - 2056, - 2509 - ], - "end_pos": [ - -1040, - -2004, - -2006, - 2126, - 2127, - -1010, - 3050, - -1117, - 3143 - ], - "calib_mode": [ - "DEGREE", - "DEGREE", - "DEGREE", - "DEGREE", - "DEGREE", - "DEGREE", - "DEGREE", - "DEGREE", - "LINEAR" - ], - "motor_names": [ - "waist", - "shoulder", - "shoulder_shadow", - "elbow", - "elbow_shadow", - "forearm_roll", - "wrist_angle", - "wrist_rotate", - "gripper" - ] -} diff --git a/.cache/calibration/aloha_default/right_leader.json b/.cache/calibration/aloha_default/right_leader.json deleted file mode 100644 index d96d1de9b..000000000 --- a/.cache/calibration/aloha_default/right_leader.json +++ /dev/null @@ -1,68 +0,0 @@ -{ - "homing_offset": [ - 2048, - 3072, - 3072, - -1024, - -1024, - 2048, - -2048, - 2048, - -2048 - ], - "drive_mode": [ - 1, - 1, - 1, - 0, - 0, - 1, - 0, - 1, - 0 - ], - "start_pos": [ - 2068, - 3034, - 3030, - 1038, - 1041, - 1991, - 1948, - 2090, - 1985 - ], - "end_pos": [ - -1025, - -2014, - -2015, - 2058, - 2060, - -955, - 3091, - -940, - 2576 - ], - "calib_mode": [ - "DEGREE", - "DEGREE", - "DEGREE", - "DEGREE", - "DEGREE", - "DEGREE", - "DEGREE", - "DEGREE", - "LINEAR" - ], - "motor_names": [ - "waist", - "shoulder", - "shoulder_shadow", - "elbow", - "elbow_shadow", - "forearm_roll", - "wrist_angle", - "wrist_rotate", - "gripper" - ] -} diff --git a/lerobot/common/datasets/online_buffer.py b/lerobot/common/datasets/online_buffer.py index 6b093cda7..9bc0580fb 100644 --- a/lerobot/common/datasets/online_buffer.py +++ b/lerobot/common/datasets/online_buffer.py @@ -13,22 +13,22 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""An online buffer for the online training loop in train.py - -Note to maintainers: This duplicates some logic from LeRobotDataset and EpisodeAwareSampler. We should -consider converging to one approach. Here we have opted to use numpy.memmap to back the data buffer. It's much -faster than using HuggingFace Datasets as there's no conversion to an intermediate non-python object. Also it -supports in-place slicing and mutation which is very handy for a dynamic buffer. -""" +"""A data buffer for efficient data management during offline and online training.""" +import logging import os +from itertools import chain from pathlib import Path -from typing import Any +from typing import Any, Callable +import datasets import numpy as np import torch from lerobot.common.datasets.lerobot_dataset import LeRobotDataset +from lerobot.common.datasets.video_utils import VideoFrame, load_from_videos + +MAX_VIDEO_PATH_LENGTH = 100 # TODO(now): Move somewhere more appropriate def _make_memmap_safe(**kwargs) -> np.memmap: @@ -45,59 +45,79 @@ def _make_memmap_safe(**kwargs) -> np.memmap: available_space = stats.f_bavail * stats.f_frsize # bytes if required_space >= available_space * 0.8: raise RuntimeError( - f"You're about to take up {required_space} of {available_space} bytes available." + f"You're about to take up {required_space} of {available_space} bytes available. This " + "exception has been raised to protect your storage device." + "" ) return np.memmap(**kwargs) -class OnlineBuffer(torch.utils.data.Dataset): - """FIFO data buffer for the online training loop in train.py. 
+class DataBuffer(torch.utils.data.Dataset):
+    """Data buffer for efficient dataset management during offline and online training.
+
+    Data is considered to come in the form of "episodes" (an instance of a robot performing a task). Episodes
+    are made up of "frames", which are chronologically ordered and contain timestamp-aligned data, potentially
+    including environment observations and robot actions. NOTE: for the time being, we require that all data
+    modalities be timestamp-aligned. This constraint may be relaxed in the future.
 
-    Follows the protocol of LeRobotDataset as much as is required to have it be used by the online training
-    loop in the same way that a LeRobotDataset would be used.
+    The data is stored in a mapping from data keys to arrays with shape (total_number_of_frames, *data_dim).
+    The compulsory data keys are:
+        - "index": A sequential integer index per frame.
+        - "episode_index": A sequential integer index per episode.
+        - "frame_index": A sequential integer index per frame within an episode (it resets for each episode).
+        - "timestamp": The relative timestamp of the frame within the episode in units of seconds. The choice
+          of reference time is not important.
 
-    The underlying data structure will have data inserted in a circular fashion. Always insert after the
-    last index, and when you reach the end, wrap around to the start.
+    The `add_data` and `add_episode` methods can be used to insert more data in the form of integral
+    episodes (starting from frame 0 and with the frames ordered). The buffer has a compulsory size limit,
+    which must be provided. Data is inserted in a circular fashion, inserting after the most recently added
+    frame, and wrapping around to the start when necessary (in which case older episodes are overwritten).
 
-    The data is stored in a numpy memmap.
+    This class is also a PyTorch Dataset and can be used as such in a dataloader for a training loop. The item
+    getter returns either a single frame, or a slice of a single episode, based on the requested data index.
     """
 
+    # Special key for a (1,) array storing a pointer to the next index to fill from when adding data.
     NEXT_INDEX_KEY = "_next_index"
+    # Since the data buffer is pre-allocated, this boolean mask is used to indicate which frames have "real"
+    # data.
     OCCUPANCY_MASK_KEY = "_occupancy_mask"
+    # This is not a data key used in the buffer. It is a postfix used by the __getitem__ method to indicate
+    # that a frame is padding.
+    IS_PAD_POSTFIX = "_is_pad"
     INDEX_KEY = "index"
     FRAME_INDEX_KEY = "frame_index"
     EPISODE_INDEX_KEY = "episode_index"
     TIMESTAMP_KEY = "timestamp"
-    IS_PAD_POSTFIX = "_is_pad"
+    PRESET_KEYS = {INDEX_KEY, FRAME_INDEX_KEY, EPISODE_INDEX_KEY, TIMESTAMP_KEY}
 
     def __init__(
         self,
-        write_dir: str | Path,
+        storage_dir: str | Path,
         data_spec: dict[str, Any] | None,
         buffer_capacity: int | None,
-        fps: float | None = None,
+        image_transform: Callable[[np.ndarray], np.ndarray] | None = None,
         delta_timestamps: dict[str, list[float]] | dict[str, np.ndarray] | None = None,
+        fps: float | None = None,
     ):
-        """
-        The online buffer can be provided from scratch or you can load an existing online buffer by passing
-        a `write_dir` associated with an existing buffer.
+        """Create a data buffer, reserving the underlying on-disk storage.
 
         Args:
-            write_dir: Where to keep the numpy memmap files. One memmap file will be stored for each data key.
-                Note that if the files already exist, they are opened in read-write mode (used for training
-                resumption.)
+            storage_dir: Where to keep the numpy memmap files. One memmap file will be stored for each data
+                key. Note that if the files already exist, they are opened in read-write mode.
             data_spec: A mapping from data key to data specification, like {data_key: {"shape": tuple[int],
                 "dtype": np.dtype}}. This should include all the data that you wish to record into the buffer,
-                but note that "index", "frame_index" and "episode_index" are already accounted for by this
-                class, so you don't need to include them.
+                but note that "index", "frame_index", "episode_index", and "timestamp" are already handled
+                internally, so you don't need to include them.
             buffer_capacity: How many frames should be stored in the buffer as a maximum. Be aware of your
                 system's available disk space when choosing this.
-            fps: Same as the fps concept in LeRobot dataset. Here it needs to be provided for the
-                delta_timestamps logic. You can pass None if you are not using delta_timestamps.
-            delta_timestamps: Same as the delta_timestamps concept in LeRobotDataset. This is internally
-                converted to dict[str, np.ndarray] for optimization purposes.
+            image_transform: TODO(now)
+            delta_timestamps: TODO(now)
+            fps: TODO(now)
         """
+        if (delta_timestamps is None) ^ (fps is None):
+            raise ValueError("`delta_timestamps` and `fps` should be provided together, or not at all.")
         self.set_delta_timestamps(delta_timestamps)
         self._fps = fps
         # Tolerance in seconds used to discard loaded frames when their timestamps are not close enough from
         # the requested frames. It is only used when `delta_timestamps` is provided.
         # minus 1e-4 to account for possible numerical error
         self.tolerance_s = 1 / self.fps - 1e-4 if fps is not None else None
         self._buffer_capacity = buffer_capacity
+        Path(storage_dir).mkdir(parents=True, exist_ok=True)
         data_spec = self._make_data_spec(data_spec, buffer_capacity)
-        Path(write_dir).mkdir(parents=True, exist_ok=True)
-        self._data = {}
+        self._data: dict[str, np.memmap] = {}
         for k, v in data_spec.items():
             self._data[k] = _make_memmap_safe(
-                filename=Path(write_dir) / k,
+                filename=Path(storage_dir) / k,
                 dtype=v["dtype"] if v is not None else None,
-                mode="r+" if (Path(write_dir) / k).exists() else "w+",
+                mode="r+" if (Path(storage_dir) / k).exists() else "w+",
                 shape=tuple(v["shape"]) if v is not None else None,
             )
+        self.image_transform = image_transform
 
     @property
     def delta_timestamps(self) -> dict[str, np.ndarray] | None:
@@ -123,8 +144,8 @@ def delta_timestamps(self) -> dict[str, np.ndarray] | None:
     def set_delta_timestamps(self, value: dict[str, list[float]] | None):
         """Set delta_timestamps converting the values to numpy arrays.
 
-        The conversion is for an optimization in the __getitem__. The loop is much slower if the arrays
-        need to be converted into numpy arrays.
+        Note: The conversion is for an optimization in the __getitem__. The loop is much slower if lists need
+        to be converted into numpy arrays.
         """
         if value is not None:
             self._delta_timestamps = {k: np.array(v) for k, v in value.items()}
         else:
             self._delta_timestamps = None
 
@@ -132,88 +153,102 @@ def set_delta_timestamps(self, value: dict[str, list[float]] | None):
     def _make_data_spec(self, data_spec: dict[str, Any], buffer_capacity: int) -> dict[str, dict[str, Any]]:
-        """Makes the data spec for np.memmap."""
+        """Makes the necessary data specification keyword arguments for np.memmap."""
         if any(k.startswith("_") for k in data_spec):
             raise ValueError(
                 "data_spec keys should not start with '_'. This prefix is reserved for internal logic."
             )
-        preset_keys = {
-            OnlineBuffer.INDEX_KEY,
-            OnlineBuffer.FRAME_INDEX_KEY,
-            OnlineBuffer.EPISODE_INDEX_KEY,
-            OnlineBuffer.TIMESTAMP_KEY,
-        }
-        if len(intersection := set(data_spec).intersection(preset_keys)) > 0:
+        if len(intersection := set(data_spec).intersection(DataBuffer.PRESET_KEYS)) > 0:
             raise ValueError(
-                f"data_spec should not contain any of {preset_keys} as these are handled internally. "
-                f"The provided data_spec has {intersection}."
+                f"`data_spec` should not contain any of {DataBuffer.PRESET_KEYS} as these are handled "
+                f"internally. The provided data_spec has {intersection}."
             )
         complete_data_spec = {
-            # _next_index will be a pointer to the next index that we should start filling from when we add
-            # more data.
-            OnlineBuffer.NEXT_INDEX_KEY: {"dtype": np.dtype("int64"), "shape": ()},
-            # Since the memmap is initialized with all-zeros, this keeps track of which indices are occupied
-            # with real data rather than the dummy initialization.
-            OnlineBuffer.OCCUPANCY_MASK_KEY: {"dtype": np.dtype("?"), "shape": (buffer_capacity,)},
-            OnlineBuffer.INDEX_KEY: {"dtype": np.dtype("int64"), "shape": (buffer_capacity,)},
-            OnlineBuffer.FRAME_INDEX_KEY: {"dtype": np.dtype("int64"), "shape": (buffer_capacity,)},
-            OnlineBuffer.EPISODE_INDEX_KEY: {"dtype": np.dtype("int64"), "shape": (buffer_capacity,)},
-            OnlineBuffer.TIMESTAMP_KEY: {"dtype": np.dtype("float64"), "shape": (buffer_capacity,)},
+            DataBuffer.NEXT_INDEX_KEY: {"dtype": np.dtype("int64"), "shape": (1,)},
+            DataBuffer.OCCUPANCY_MASK_KEY: {"dtype": np.dtype("?"), "shape": (buffer_capacity,)},
+            DataBuffer.INDEX_KEY: {"dtype": np.dtype("int64"), "shape": (buffer_capacity,)},
+            DataBuffer.FRAME_INDEX_KEY: {"dtype": np.dtype("int64"), "shape": (buffer_capacity,)},
+            DataBuffer.EPISODE_INDEX_KEY: {"dtype": np.dtype("int64"), "shape": (buffer_capacity,)},
+            DataBuffer.TIMESTAMP_KEY: {"dtype": np.dtype("float64"), "shape": (buffer_capacity,)},
         }
         for k, v in data_spec.items():
             complete_data_spec[k] = {"dtype": v["dtype"], "shape": (buffer_capacity, *v["shape"])}
         return complete_data_spec
 
     def add_data(self, data: dict[str, np.ndarray]):
-        """Add new data to the buffer, which could potentially mean shifting old data out.
+        """Add data to the buffer.
+
+        This calls `add_episode` for each unique episode index. See the documentation there for more
+        information.
+        """
+        for episode_index in np.unique(data[DataBuffer.EPISODE_INDEX_KEY]):
+            where_episode = np.where(data[DataBuffer.EPISODE_INDEX_KEY] == episode_index)[0]
+            episode_data = {k: data[k][where_episode] for k in data}
+            self.add_episode(episode_data)
 
-        The new data should contain all the frames (in order) of any number of episodes. The indices should
-        start from 0 (note to the developer: this can easily be generalized). See the `rollout` and
-        `eval_policy` functions in `eval.py` for more information on how the data is constructed.
+    def add_episode(self, data: dict[str, np.ndarray]):
+        """Add data for a single episode to the buffer.
 
-        Shift the incoming data index and episode_index to continue on from the last frame. Note that this
-        will be done in place!
+        `data` should have the same key-to-array mapping as the buffer. It should contain exactly one
+        episode. The episode's frame indices should start from 0 and step up in increments of 1.
+
+        If the episode has more frames than are available up to the end of the buffer, the insertion pointer
+        is reset to the start of the buffer and the episode is inserted there, overwriting existing episode
+        frames.
+
+        When frames of an existing episode are overwritten by a new episode, any remaining frames of that
+        episode are left in place (meaning not all episodes in the buffer are guaranteed to start from their
+        frame 0).
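+
+        Example of usage (a minimal sketch; the storage path, the "observation.state" key, and all shapes
+        and sizes here are illustrative placeholders, not requirements):
+        ```python
+        buffer = DataBuffer(
+            storage_dir="/tmp/data_buffer_example",
+            data_spec={"observation.state": {"shape": (2,), "dtype": np.dtype("float32")}},
+            buffer_capacity=100,
+        )
+        n = 25  # number of frames in the episode
+        buffer.add_episode(
+            {
+                "index": np.arange(n),  # overwritten internally to follow on from the last episode
+                "frame_index": np.arange(n),  # must be 0, 1, ..., n - 1
+                "episode_index": np.zeros(n, dtype=np.int64),  # also overwritten internally
+                "timestamp": np.arange(n) / 10,  # assuming 10 fps
+                "observation.state": np.zeros((n, 2), dtype=np.float32),  # hypothetical data key
+            }
+        )
+        ```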
         """
         if len(missing_keys := (set(self.data_keys).difference(set(data)))) > 0:
             raise ValueError(f"Missing data keys: {missing_keys}")
         new_data_length = len(data[self.data_keys[0]])
+        if new_data_length <= 0:
+            raise ValueError("The episode has 0 frames")
+        if new_data_length > self._buffer_capacity:
+            raise ValueError("The episode length is larger than the buffer capacity.")
         if not all(len(data[k]) == new_data_length for k in self.data_keys):
             raise ValueError("All data items should have the same length")
+        if not np.all(data[DataBuffer.EPISODE_INDEX_KEY] == data[DataBuffer.EPISODE_INDEX_KEY][0]):
+            raise ValueError(
+                "New data should only contain one episode but there is more than one unique episode index."
+            )
+        if not np.array_equal(data[DataBuffer.FRAME_INDEX_KEY], np.arange(new_data_length)):
+            raise ValueError(
+                "Expected frame indices to start from 0 and step up in increments of 1 per frame."
+            )
 
-        next_index = self._data[OnlineBuffer.NEXT_INDEX_KEY]
-
-        # Sanity check to make sure that the new data indices start from 0.
-        assert data[OnlineBuffer.EPISODE_INDEX_KEY][0].item() == 0
-        assert data[OnlineBuffer.INDEX_KEY][0].item() == 0
-
-        # Shift the incoming indices if necessary.
+        # Figure out where to start filling data next, carrying on the data and episode indices from the
+        # last added frame.
+        next_index = self._data[DataBuffer.NEXT_INDEX_KEY][0]
         if self.num_samples > 0:
-            last_episode_index = self._data[OnlineBuffer.EPISODE_INDEX_KEY][next_index - 1]
-            last_data_index = self._data[OnlineBuffer.INDEX_KEY][next_index - 1]
-            data[OnlineBuffer.EPISODE_INDEX_KEY] += last_episode_index + 1
-            data[OnlineBuffer.INDEX_KEY] += last_data_index + 1
+            last_episode_index = self._data[DataBuffer.EPISODE_INDEX_KEY][next_index - 1]
+            last_data_index = self._data[DataBuffer.INDEX_KEY][next_index - 1]
+        else:
+            last_episode_index = -1
+            last_data_index = -1
+        # If there aren't enough slots left in the buffer to accommodate the episode, wrap to the start.
+        if max(0, new_data_length - (self._buffer_capacity - next_index)) > 0:
+            next_index = 0
 
-        # Insert the new data starting from next_index. It may be necessary to wrap around to the start.
-        n_surplus = max(0, new_data_length - (self._buffer_capacity - next_index))
+        # Insert the new data starting from next_index.
         for k in self.data_keys:
-            if n_surplus == 0:
-                slc = slice(next_index, next_index + new_data_length)
-                self._data[k][slc] = data[k]
-                self._data[OnlineBuffer.OCCUPANCY_MASK_KEY][slc] = True
+            slc = slice(next_index, next_index + new_data_length)
+            if k == DataBuffer.EPISODE_INDEX_KEY:
+                self._data[k][slc] = last_episode_index + 1
+            elif k == DataBuffer.INDEX_KEY:
+                self._data[k][slc] = np.arange(last_data_index + 1, last_data_index + 1 + new_data_length)
             else:
-                self._data[k][next_index:] = data[k][:-n_surplus]
-                self._data[OnlineBuffer.OCCUPANCY_MASK_KEY][next_index:] = True
-                self._data[k][:n_surplus] = data[k][-n_surplus:]
-            if n_surplus == 0:
-                self._data[OnlineBuffer.NEXT_INDEX_KEY] = next_index + new_data_length
-            else:
-                self._data[OnlineBuffer.NEXT_INDEX_KEY] = n_surplus
+                self._data[k][slc] = data[k]
+        self._data[DataBuffer.OCCUPANCY_MASK_KEY][slc] = True
+
+        # Update the data pointer.
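+        # Note: the pointer may land exactly at `self._buffer_capacity` here; wrapping back to index 0 is
+        # handled lazily, at the start of the next `add_episode` call.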
+ self._data[DataBuffer.NEXT_INDEX_KEY][0] = next_index + new_data_length @property def data_keys(self) -> list[str]: keys = set(self._data) - keys.remove(OnlineBuffer.OCCUPANCY_MASK_KEY) - keys.remove(OnlineBuffer.NEXT_INDEX_KEY) + keys.remove(DataBuffer.OCCUPANCY_MASK_KEY) + keys.remove(DataBuffer.NEXT_INDEX_KEY) return sorted(keys) @property @@ -223,12 +258,12 @@ def fps(self) -> float | None: @property def num_episodes(self) -> int: return len( - np.unique(self._data[OnlineBuffer.EPISODE_INDEX_KEY][self._data[OnlineBuffer.OCCUPANCY_MASK_KEY]]) + np.unique(self._data[DataBuffer.EPISODE_INDEX_KEY][self._data[DataBuffer.OCCUPANCY_MASK_KEY]]) ) @property def num_samples(self) -> int: - return np.count_nonzero(self._data[OnlineBuffer.OCCUPANCY_MASK_KEY]) + return np.count_nonzero(self._data[DataBuffer.OCCUPANCY_MASK_KEY]) def __len__(self): return self.num_samples @@ -244,62 +279,181 @@ def _item_to_tensors(self, item: dict) -> dict: item_[k] = torch.tensor(v) return item_ + @property + def camera_keys(self) -> list[str]: + """Keys to access image and video stream from cameras.""" + return [k for k in self._data if k.startswith("observation.image")] + + def _optimized_advanced_slice(self, data_key: str, indices: np.ndarray) -> np.ndarray: + """Convert advanced slicing to basic slicing by finding contiguous ranges in the requested indices. + + TODO(now): Is this needed? + """ + indices_diff = np.diff(indices, prepend=indices[0] - 1) + where_not_1 = np.where(indices_diff != 1)[0] + ptr = 0 + ret = [] + for ix in chain(where_not_1, [len(indices)]): + ret.append(self._data[data_key][indices[ptr] : indices[ix - 1] + 1]) + ptr = ix + + # Avoid creating a copy with concatenate if possible. + return np.concatenate(ret) if len(ret) > 1 else ret[0] + + def flush(self): + """Save the data to disk. + + `np.memmap`s keep a portion of the data mirrored in memory. Updates to the in-memory data are not + immediately reflected on disk. Call this method to explicitly save the updates to disk. + """ + for k in self._data: + self._data[k].flush() + + def get_data_by_key(self, key: str) -> torch.Tensor: + """Returns all data for a given data key as a Tensor.""" + return torch.from_numpy(self._data[key][self._data[DataBuffer.OCCUPANCY_MASK_KEY]]) + def __getitem__(self, idx: int) -> dict[str, torch.Tensor]: if idx >= len(self) or idx < -len(self): raise IndexError item = {k: v[idx] for k, v in self._data.items() if not k.startswith("_")} - if self.delta_timestamps is None: - return self._item_to_tensors(item) - - episode_index = item[OnlineBuffer.EPISODE_INDEX_KEY] - current_ts = item[OnlineBuffer.TIMESTAMP_KEY] - episode_data_indices = np.where( - np.bitwise_and( - self._data[OnlineBuffer.EPISODE_INDEX_KEY] == episode_index, - self._data[OnlineBuffer.OCCUPANCY_MASK_KEY], + if self.delta_timestamps is not None: + episode_index = item[DataBuffer.EPISODE_INDEX_KEY] + current_ts = item[DataBuffer.TIMESTAMP_KEY] + episode_data_indices = np.where( + np.bitwise_and( + self._data[DataBuffer.EPISODE_INDEX_KEY] == episode_index, + self._data[DataBuffer.OCCUPANCY_MASK_KEY], + ) + )[0] + episode_timestamps = self._data[DataBuffer.TIMESTAMP_KEY][episode_data_indices] + + if self.video: + video_timestamps = {} # TODO(now): HACK + + for data_key in self.delta_timestamps: + # Get timestamps used as query to retrieve data of previous/future frames. + query_ts = current_ts + self.delta_timestamps[data_key] + + # Compute distances between each query timestamp and all timestamps of all the frames + # belonging to the episode. 
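+                # For example, with 3 entries in `query_ts` and 50 frames in the episode, `dist` has shape
+                # (3, 50); the argmin over axis 1 then gives, for each query timestamp, the index of the
+                # nearest frame in the episode.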
+ dist = np.abs(query_ts[:, None] - episode_timestamps[None, :]) + argmin_ = np.argmin(dist, axis=1) + min_ = dist[np.arange(dist.shape[0]), argmin_] + + is_pad = min_ > self.tolerance_s + + # Check violated query timestamps are all outside the episode range. + err_msg = ( + f"One or several timestamps unexpectedly violate the tolerance ({min_} > " + f"{self.tolerance_s=}) inside the episode range." + ) + try: + assert ( + (query_ts[is_pad] < episode_timestamps[0]) + | (episode_timestamps[-1] < query_ts[is_pad]) + ).all(), err_msg + except AssertionError: + logging.warning(err_msg) + return self.__getitem__(np.random.choice(len(self))) + + if self.video and data_key in self.video_frame_keys: # TODO(now): HACK + video_timestamps[data_key] = self._data[DataBuffer.TIMESTAMP_KEY][ + episode_data_indices[argmin_] + ] + + # Load frames for this data key. + if np.any(np.diff(argmin_) != 1): + item[data_key] = self._data[data_key][episode_data_indices[argmin_]] + # item[data_key] = self._optimized_advanced_slice(data_key, episode_data_indices[argmin_]) + else: + # Do basic slicing where possible + item[data_key] = self._data[data_key][ + episode_data_indices[argmin_.min()] : episode_data_indices[argmin_.max()] + 1 + ] + + item[f"{data_key}{DataBuffer.IS_PAD_POSTFIX}"] = is_pad + + if self.video: + item_ = dict(item) + for k in self.video_frame_keys: # TODO(now): HACK + if self.delta_timestamps is None: + item_[k] = {"path": item[k].decode(), "timestamp": float(item[DataBuffer.TIMESTAMP_KEY])} + else: + query_ts = current_ts + self.delta_timestamps[k] + item_[k] = [ + {"path": item[k][i].decode(), "timestamp": video_timestamps[k][i]} + for i in range(len(item[k])) + ] + item = load_from_videos( + item_, + self.video_frame_keys, + self.videos_dir, + self.tolerance_s, + self.video_backend, ) - )[0] - episode_timestamps = self._data[OnlineBuffer.TIMESTAMP_KEY][episode_data_indices] - - for data_key in self.delta_timestamps: - # Note: The logic in this loop is copied from `load_previous_and_future_frames`. - # Get timestamps used as query to retrieve data of previous/future frames. - query_ts = current_ts + self.delta_timestamps[data_key] - - # Compute distances between each query timestamp and all timestamps of all the frames belonging to - # the episode. - dist = np.abs(query_ts[:, None] - episode_timestamps[None, :]) - argmin_ = np.argmin(dist, axis=1) - min_ = dist[np.arange(dist.shape[0]), argmin_] - - is_pad = min_ > self.tolerance_s - - # Check violated query timestamps are all outside the episode range. - assert ( - (query_ts[is_pad] < episode_timestamps[0]) | (episode_timestamps[-1] < query_ts[is_pad]) - ).all(), ( - f"One or several timestamps unexpectedly violate the tolerance ({min_} > {self.tolerance_s=}" - ") inside the episode range." - ) - - # Load frames for this data key. 
- item[data_key] = self._data[data_key][episode_data_indices[argmin_]] - item[f"{data_key}{OnlineBuffer.IS_PAD_POSTFIX}"] = is_pad + if self.image_transform is not None: + for cam in self.camera_keys: + item[cam] = self.image_transform(item[cam]) return self._item_to_tensors(item) - def get_data_by_key(self, key: str) -> torch.Tensor: - """Returns all data for a given data key as a Tensor.""" - return torch.from_numpy(self._data[key][self._data[OnlineBuffer.OCCUPANCY_MASK_KEY]]) + @classmethod + def from_hf_dataset(cls, hf_dataset: datasets.Dataset, **kwargs) -> "DataBuffer": + data_spec = {} + video_frame_keys = [] + for k, feature in hf_dataset.features.items(): + if k in DataBuffer.PRESET_KEYS: + continue + if isinstance(feature, datasets.features.Image): + example_img = np.array(hf_dataset[0][k]) + data_spec[k] = {"shape": example_img.shape, "dtype": np.dtype("uint8")} + elif isinstance(feature, VideoFrame): + video_frame_keys.append(k) + data_spec[k] = { + "shape": (), + "dtype": np.dtype(f"S{MAX_VIDEO_PATH_LENGTH}"), + } + elif isinstance(feature, datasets.features.Sequence): + data_spec[k] = {"shape": (feature.length,), "dtype": np.dtype(feature.feature.dtype)} + elif isinstance(feature, datasets.features.Value): + data_spec[k] = {"shape": (), "dtype": np.dtype(feature.dtype)} + else: + raise NotImplementedError(f"Dataset feature type {type(feature)} is not handled.") + obj = cls( + **kwargs, + data_spec=data_spec, + buffer_capacity=len(hf_dataset), + ) + data_dict = {} + for k, feature in hf_dataset.features.items(): + if isinstance(feature, datasets.features.Image): + data_dict[k] = np.stack( + [np.array(pil_img).astype(np.float32) / 255 for pil_img in hf_dataset[k]] + ) + elif isinstance(feature, VideoFrame): + data_dict[k] = np.stack( + [np.array(dct["path"], dtype=f"S{MAX_VIDEO_PATH_LENGTH}") for dct in hf_dataset[k]] + ) + else: + data_dict[k] = np.array(hf_dataset[k]) + obj.add_data(data_dict) + obj.video = False # TODO(now): HACK + if len(video_frame_keys) > 0: + obj.video = True # TODO(now): HACK + obj.video_frame_keys = video_frame_keys # TODO(now): HACK + obj.videos_dir = Path(kwargs["storage_dir"]) / "videos" # TODO(now): HACK + obj.video_backend = "pyav" # TODO(now): HACK + return obj def compute_sampler_weights( offline_dataset: LeRobotDataset, offline_drop_n_last_frames: int = 0, - online_dataset: OnlineBuffer | None = None, + online_dataset: DataBuffer | None = None, online_sampling_ratio: float | None = None, online_drop_n_last_frames: int = 0, ) -> torch.Tensor: @@ -308,7 +462,7 @@ def compute_sampler_weights( Args: offline_dataset: The LeRobotDataset used for offline pre-training. online_drop_n_last_frames: Number of frames to drop from the end of each offline dataset episode. - online_dataset: The OnlineBuffer used in online training. + online_dataset: The DataBuffer used in online training. online_sampling_ratio: The proportion of data that should be sampled from the online dataset. If an online dataset is provided, this value must also be provided. online_drop_n_first_frames: See `offline_drop_n_last_frames`. This is the same, but for the online diff --git a/lerobot/common/robot_devices/cameras/intelrealsense.py b/lerobot/common/robot_devices/cameras/intelrealsense.py deleted file mode 100644 index 4806bf785..000000000 --- a/lerobot/common/robot_devices/cameras/intelrealsense.py +++ /dev/null @@ -1,448 +0,0 @@ -""" -This file contains utilities for recording frames from Intel Realsense cameras. 
-""" - -import argparse -import concurrent.futures -import logging -import shutil -import threading -import time -import traceback -from dataclasses import dataclass, replace -from pathlib import Path -from threading import Thread - -import cv2 -import numpy as np -import pyrealsense2 as rs -from PIL import Image - -from lerobot.common.robot_devices.utils import ( - RobotDeviceAlreadyConnectedError, - RobotDeviceNotConnectedError, -) -from lerobot.common.utils.utils import capture_timestamp_utc -from lerobot.scripts.control_robot import busy_wait - -SERIAL_NUMBER_INDEX = 1 - - -def find_camera_indices(raise_when_empty=True) -> list[int]: - """ - Find the serial numbers of the Intel RealSense cameras - connected to the computer. - """ - camera_ids = [] - for device in rs.context().query_devices(): - serial_number = int(device.get_info(rs.camera_info(SERIAL_NUMBER_INDEX))) - camera_ids.append(serial_number) - - if raise_when_empty and len(camera_ids) == 0: - raise OSError( - "Not a single camera was detected. Try re-plugging, or re-installing `librealsense` and its python wrapper `pyrealsense2`, or updating the firmware." - ) - - return camera_ids - - -def save_image(img_array, camera_idx, frame_index, images_dir): - try: - img = Image.fromarray(img_array) - path = images_dir / f"camera_{camera_idx}_frame_{frame_index:06d}.png" - path.parent.mkdir(parents=True, exist_ok=True) - img.save(str(path), quality=100) - logging.info(f"Saved image: {path}") - except Exception as e: - logging.error(f"Failed to save image for camera {camera_idx} frame {frame_index}: {e}") - - -def save_images_from_cameras( - images_dir: Path, - camera_ids: list[int] | None = None, - fps=None, - width=None, - height=None, - record_time_s=2, -): - """ - Initializes all the cameras and saves images to the directory. Useful to visually identify the camera - associated to a given camera index. - """ - if camera_ids is None: - camera_ids = find_camera_indices() - - print("Connecting cameras") - cameras = [] - for cam_idx in camera_ids: - camera = IntelRealSenseCamera(cam_idx, fps=fps, width=width, height=height) - camera.connect() - print( - f"IntelRealSenseCamera({camera.camera_index}, fps={camera.fps}, width={camera.width}, height={camera.height}, color_mode={camera.color_mode})" - ) - cameras.append(camera) - - images_dir = Path(images_dir) - if images_dir.exists(): - shutil.rmtree( - images_dir, - ) - images_dir.mkdir(parents=True, exist_ok=True) - - print(f"Saving images to {images_dir}") - frame_index = 0 - start_time = time.perf_counter() - try: - with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor: - while True: - now = time.perf_counter() - - for camera in cameras: - # If we use async_read when fps is None, the loop will go full speed, and we will end up - # saving the same images from the cameras multiple times until the RAM/disk is full. 
- image = camera.read() if fps is None else camera.async_read() - if image is None: - print("No Frame") - bgr_converted_image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) - - executor.submit( - save_image, - bgr_converted_image, - camera.camera_index, - frame_index, - images_dir, - ) - - if fps is not None: - dt_s = time.perf_counter() - now - busy_wait(1 / fps - dt_s) - - if time.perf_counter() - start_time > record_time_s: - break - - print(f"Frame: {frame_index:04d}\tLatency (ms): {(time.perf_counter() - now) * 1000:.2f}") - - frame_index += 1 - finally: - print(f"Images have been saved to {images_dir}") - for camera in cameras: - camera.disconnect() - - -@dataclass -class IntelRealSenseCameraConfig: - """ - Example of tested options for Intel Real Sense D405: - - ```python - IntelRealSenseCameraConfig(30, 640, 480) - IntelRealSenseCameraConfig(60, 640, 480) - IntelRealSenseCameraConfig(90, 640, 480) - IntelRealSenseCameraConfig(30, 1280, 720) - IntelRealSenseCameraConfig(30, 640, 480, use_depth=True) - ``` - """ - - fps: int | None = None - width: int | None = None - height: int | None = None - color_mode: str = "rgb" - use_depth: bool = False - force_hardware_reset: bool = True - - def __post_init__(self): - if self.color_mode not in ["rgb", "bgr"]: - raise ValueError( - f"`color_mode` is expected to be 'rgb' or 'bgr', but {self.color_mode} is provided." - ) - - if (self.fps or self.width or self.height) and not (self.fps and self.width and self.height): - raise ValueError( - "For `fps`, `width` and `height`, either all of them need to be set, or none of them, " - f"but {self.fps=}, {self.width=}, {self.height=} were provided." - ) - - -class IntelRealSenseCamera: - """ - The IntelRealSenseCamera class is similar to OpenCVCamera class but adds additional features for Intel Real Sense cameras: - - camera_index corresponds to the serial number of the camera, - - camera_index won't randomly change as it can be the case of OpenCVCamera for Linux, - - read is more reliable than OpenCVCamera, - - depth map can be returned. - - To find the camera indices of your cameras, you can run our utility script that will save a few frames for each camera: - ```bash - python lerobot/common/robot_devices/cameras/intelrealsense.py --images-dir outputs/images_from_intelrealsense_cameras - ``` - - When an IntelRealSenseCamera is instantiated, if no specific config is provided, the default fps, width, height and color_mode - of the given camera will be used. 
- - Example of usage: - ```python - camera_index = 128422271347 - camera = IntelRealSenseCamera(camera_index) - camera.connect() - color_image = camera.read() - # when done using the camera, consider disconnecting - camera.disconnect() - ``` - - Example of changing default fps, width, height and color_mode: - ```python - camera = IntelRealSenseCamera(camera_index, fps=30, width=1280, height=720) - camera = connect() # applies the settings, might error out if these settings are not compatible with the camera - - camera = IntelRealSenseCamera(camera_index, fps=90, width=640, height=480) - camera = connect() - - camera = IntelRealSenseCamera(camera_index, fps=90, width=640, height=480, color_mode="bgr") - camera = connect() - ``` - - Example of returning depth: - ```python - camera = IntelRealSenseCamera(camera_index, use_depth=True) - camera.connect() - color_image, depth_map = camera.read() - ``` - """ - - def __init__( - self, - camera_index: int, - config: IntelRealSenseCameraConfig | None = None, - **kwargs, - ): - if config is None: - config = IntelRealSenseCameraConfig() - - # Overwrite the config arguments using kwargs - config = replace(config, **kwargs) - - self.camera_index = camera_index - self.fps = config.fps - self.width = config.width - self.height = config.height - self.color_mode = config.color_mode - self.use_depth = config.use_depth - self.force_hardware_reset = config.force_hardware_reset - - self.camera = None - self.is_connected = False - self.thread = None - self.stop_event = None - self.color_image = None - self.depth_map = None - self.logs = {} - - def connect(self): - if self.is_connected: - raise RobotDeviceAlreadyConnectedError( - f"IntelRealSenseCamera({self.camera_index}) is already connected." - ) - - config = rs.config() - config.enable_device(str(self.camera_index)) - - if self.fps and self.width and self.height: - # TODO(rcadene): can we set rgb8 directly? - config.enable_stream(rs.stream.color, self.width, self.height, rs.format.rgb8, self.fps) - else: - config.enable_stream(rs.stream.color) - - if self.use_depth: - if self.fps and self.width and self.height: - config.enable_stream(rs.stream.depth, self.width, self.height, rs.format.z16, self.fps) - else: - config.enable_stream(rs.stream.depth) - - self.camera = rs.pipeline() - try: - self.camera.start(config) - is_camera_open = True - except RuntimeError: - is_camera_open = False - traceback.print_exc() - - # If the camera doesn't work, display the camera indices corresponding to - # valid cameras. - if not is_camera_open: - # Verify that the provided `camera_index` is valid before printing the traceback - available_cam_ids = find_camera_indices() - if self.camera_index not in available_cam_ids: - raise ValueError( - f"`camera_index` is expected to be one of these available cameras {available_cam_ids}, but {self.camera_index} is provided instead. " - "To find the camera index you should use, run `python lerobot/common/robot_devices/cameras/intelrealsense.py`." - ) - - raise OSError(f"Can't access IntelRealSenseCamera({self.camera_index}).") - - self.is_connected = True - - def read(self, temporary_color: str | None = None) -> np.ndarray | tuple[np.ndarray, np.ndarray]: - """Read a frame from the camera returned in the format height x width x channels (e.g. 480 x 640 x 3) - of type `np.uint8`, contrarily to the pytorch format which is float channel first. - - When `use_depth=True`, returns a tuple `(color_image, depth_map)` with a depth map in the format - height x width (e.g. 480 x 640) of type np.uint16. 
- - Note: Reading a frame is done every `camera.fps` times per second, and it is blocking. - If you are reading data from other sensors, we advise to use `camera.async_read()` which is non blocking version of `camera.read()`. - """ - if not self.is_connected: - raise RobotDeviceNotConnectedError( - f"IntelRealSenseCamera({self.camera_index}) is not connected. Try running `camera.connect()` first." - ) - - start_time = time.perf_counter() - - frame = self.camera.wait_for_frames(timeout_ms=5000) - - color_frame = frame.get_color_frame() - - if not color_frame: - raise OSError(f"Can't capture color image from IntelRealSenseCamera({self.camera_index}).") - - color_image = np.asanyarray(color_frame.get_data()) - - requested_color_mode = self.color_mode if temporary_color is None else temporary_color - if requested_color_mode not in ["rgb", "bgr"]: - raise ValueError( - f"Expected color values are 'rgb' or 'bgr', but {requested_color_mode} is provided." - ) - - # IntelRealSense uses RGB format as default (red, green, blue). - if requested_color_mode == "bgr": - color_image = cv2.cvtColor(color_image, cv2.COLOR_RGB2BGR) - - h, w, _ = color_image.shape - if h != self.height or w != self.width: - raise OSError( - f"Can't capture color image with expected height and width ({self.height} x {self.width}). ({h} x {w}) returned instead." - ) - - # log the number of seconds it took to read the image - self.logs["delta_timestamp_s"] = time.perf_counter() - start_time - - # log the utc time at which the image was received - self.logs["timestamp_utc"] = capture_timestamp_utc() - - if self.use_depth: - depth_frame = frame.get_depth_frame() - if not depth_frame: - raise OSError(f"Can't capture depth image from IntelRealSenseCamera({self.camera_index}).") - - depth_map = np.asanyarray(depth_frame.get_data()) - - h, w = depth_map.shape - if h != self.height or w != self.width: - raise OSError( - f"Can't capture depth map with expected height and width ({self.height} x {self.width}). ({h} x {w}) returned instead." - ) - - return color_image, depth_map - else: - return color_image - - def read_loop(self): - while self.stop_event is None or not self.stop_event.is_set(): - if self.use_depth: - self.color_image, self.depth_map = self.read() - else: - self.color_image = self.read() - - def async_read(self): - """Access the latest color image""" - if not self.is_connected: - raise RobotDeviceNotConnectedError( - f"IntelRealSenseCamera({self.camera_index}) is not connected. Try running `camera.connect()` first." - ) - - if self.thread is None: - self.stop_event = threading.Event() - self.thread = Thread(target=self.read_loop, args=()) - self.thread.daemon = True - self.thread.start() - - num_tries = 0 - while self.color_image is None: - num_tries += 1 - time.sleep(1 / self.fps) - if num_tries > self.fps and (self.thread.ident is None or not self.thread.is_alive()): - raise Exception( - "The thread responsible for `self.async_read()` took too much time to start. There might be an issue. Verify that `self.thread.start()` has been called." - ) - - if self.use_depth: - return self.color_image, self.depth_map - else: - return self.color_image - - def disconnect(self): - if not self.is_connected: - raise RobotDeviceNotConnectedError( - f"IntelRealSenseCamera({self.camera_index}) is not connected. Try running `camera.connect()` first." 
- ) - - if self.thread is not None and self.thread.is_alive(): - # wait for the thread to finish - self.stop_event.set() - self.thread.join() - self.thread = None - self.stop_event = None - - self.camera.stop() - self.camera = None - - self.is_connected = False - - def __del__(self): - if getattr(self, "is_connected", False): - self.disconnect() - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Save a few frames using `IntelRealSenseCamera` for all cameras connected to the computer, or a selected subset." - ) - parser.add_argument( - "--camera-ids", - type=int, - nargs="*", - default=None, - help="List of camera indices used to instantiate the `IntelRealSenseCamera`. If not provided, find and use all available camera indices.", - ) - parser.add_argument( - "--fps", - type=int, - default=30, - help="Set the number of frames recorded per seconds for all cameras. If not provided, use the default fps of each camera.", - ) - parser.add_argument( - "--width", - type=str, - default=640, - help="Set the width for all cameras. If not provided, use the default width of each camera.", - ) - parser.add_argument( - "--height", - type=str, - default=480, - help="Set the height for all cameras. If not provided, use the default height of each camera.", - ) - parser.add_argument( - "--images-dir", - type=Path, - default="outputs/images_from_intelrealsense_cameras", - help="Set directory to save a few frames for each camera.", - ) - parser.add_argument( - "--record-time-s", - type=float, - default=2.0, - help="Set the number of seconds used to record the frames. By default, 2 seconds.", - ) - args = parser.parse_args() - save_images_from_cameras(**vars(args)) diff --git a/lerobot/common/robot_devices/robots/manipulator.py b/lerobot/common/robot_devices/robots/manipulator.py deleted file mode 100644 index 337519765..000000000 --- a/lerobot/common/robot_devices/robots/manipulator.py +++ /dev/null @@ -1,703 +0,0 @@ -import json -import logging -import time -import warnings -from dataclasses import dataclass, field, replace -from pathlib import Path -from typing import Sequence - -import numpy as np -import torch - -from lerobot.common.robot_devices.cameras.utils import Camera -from lerobot.common.robot_devices.motors.dynamixel import ( - CalibrationMode, - TorqueMode, - convert_degrees_to_steps, -) -from lerobot.common.robot_devices.motors.utils import MotorsBus -from lerobot.common.robot_devices.robots.utils import get_arm_id -from lerobot.common.robot_devices.utils import RobotDeviceAlreadyConnectedError, RobotDeviceNotConnectedError - -######################################################################## -# Calibration logic -######################################################################## - -URL_TEMPLATE = ( - "https://raw.githubusercontent.com/huggingface/lerobot/main/media/{robot}/{arm}_{position}.webp" -) - -# The following positions are provided in nominal degree range ]-180, +180[ -# For more info on these constants, see comments in the code where they get used. -ZERO_POSITION_DEGREE = 0 -ROTATED_POSITION_DEGREE = 90 - - -def assert_drive_mode(drive_mode): - # `drive_mode` is in [0,1] with 0 means original rotation direction for the motor, and 1 means inverted. 
- if not np.all(np.isin(drive_mode, [0, 1])): - raise ValueError(f"`drive_mode` contains values other than 0 or 1: ({drive_mode})") - - -def apply_drive_mode(position, drive_mode): - assert_drive_mode(drive_mode) - # Convert `drive_mode` from [0, 1] with 0 indicates original rotation direction and 1 inverted, - # to [-1, 1] with 1 indicates original rotation direction and -1 inverted. - signed_drive_mode = -(drive_mode * 2 - 1) - position *= signed_drive_mode - return position - - -def compute_nearest_rounded_position(position, models): - delta_turn = convert_degrees_to_steps(ROTATED_POSITION_DEGREE, models) - nearest_pos = np.round(position.astype(float) / delta_turn) * delta_turn - return nearest_pos.astype(position.dtype) - - -def run_arm_calibration(arm: MotorsBus, robot_type: str, arm_name: str, arm_type: str): - """This function ensures that a neural network trained on data collected on a given robot - can work on another robot. For instance before calibration, setting a same goal position - for each motor of two different robots will get two very different positions. But after calibration, - the two robots will move to the same position.To this end, this function computes the homing offset - and the drive mode for each motor of a given robot. - - Homing offset is used to shift the motor position to a ]-2048, +2048[ nominal range (when the motor uses 2048 steps - to complete a half a turn). This range is set around an arbitrary "zero position" corresponding to all motor positions - being 0. During the calibration process, you will need to manually move the robot to this "zero position". - - Drive mode is used to invert the rotation direction of the motor. This is useful when some motors have been assembled - in the opposite orientation for some robots. During the calibration process, you will need to manually move the robot - to the "rotated position". - - After calibration, the homing offsets and drive modes are stored in a cache. - - Example of usage: - ```python - run_arm_calibration(arm, "koch", "left", "follower") - ``` - """ - if (arm.read("Torque_Enable") != TorqueMode.DISABLED.value).any(): - raise ValueError("To run calibration, the torque must be disabled on all motors.") - - print(f"\nRunning calibration of {robot_type} {arm_name} {arm_type}...") - - print("\nMove arm to zero position") - print("See: " + URL_TEMPLATE.format(robot=robot_type, arm=arm_type, position="zero")) - input("Press Enter to continue...") - - # We arbitrarily chose our zero target position to be a straight horizontal position with gripper upwards and closed. - # It is easy to identify and all motors are in a "quarter turn" position. Once calibration is done, this position will - # correspond to every motor angle being 0. If you set all 0 as Goal Position, the arm will move in this position. - zero_target_pos = convert_degrees_to_steps(ZERO_POSITION_DEGREE, arm.motor_models) - - # Compute homing offset so that `present_position + homing_offset ~= target_position`. - zero_pos = arm.read("Present_Position") - zero_nearest_pos = compute_nearest_rounded_position(zero_pos, arm.motor_models) - homing_offset = zero_target_pos - zero_nearest_pos - - # The rotated target position corresponds to a rotation of a quarter turn from the zero position. - # This allows to identify the rotation direction of each motor. - # For instance, if the motor rotates 90 degree, and its value is -90 after applying the homing offset, then we know its rotation direction - # is inverted. 
However, for the calibration being successful, we need everyone to follow the same target position. - # Sometimes, there is only one possible rotation direction. For instance, if the gripper is closed, there is only one direction which - # corresponds to opening the gripper. When the rotation direction is ambiguous, we arbitrarely rotate clockwise from the point of view - # of the previous motor in the kinetic chain. - print("\nMove arm to rotated target position") - print("See: " + URL_TEMPLATE.format(robot=robot_type, arm=arm_type, position="rotated")) - input("Press Enter to continue...") - - rotated_target_pos = convert_degrees_to_steps(ROTATED_POSITION_DEGREE, arm.motor_models) - - # Find drive mode by rotating each motor by a quarter of a turn. - # Drive mode indicates if the motor rotation direction should be inverted (=1) or not (=0). - rotated_pos = arm.read("Present_Position") - drive_mode = (rotated_pos < zero_pos).astype(np.int32) - - # Re-compute homing offset to take into account drive mode - rotated_drived_pos = apply_drive_mode(rotated_pos, drive_mode) - rotated_nearest_pos = compute_nearest_rounded_position(rotated_drived_pos, arm.motor_models) - homing_offset = rotated_target_pos - rotated_nearest_pos - - print("\nMove arm to rest position") - print("See: " + URL_TEMPLATE.format(robot=robot_type, arm=arm_type, position="rest")) - input("Press Enter to continue...") - print() - - # Joints with rotational motions are expressed in degrees in nominal range of [-180, 180] - calib_mode = [CalibrationMode.DEGREE.name] * len(arm.motor_names) - - # TODO(rcadene): make type of joints (DEGREE or LINEAR) configurable from yaml? - if robot_type == "aloha" and "gripper" in arm.motor_names: - # Joints with linear motions (like gripper of Aloha) are experessed in nominal range of [0, 100] - calib_idx = arm.motor_names.index("gripper") - calib_mode[calib_idx] = CalibrationMode.LINEAR.name - - calib_data = { - "homing_offset": homing_offset.tolist(), - "drive_mode": drive_mode.tolist(), - "start_pos": zero_pos.tolist(), - "end_pos": rotated_pos.tolist(), - "calib_mode": calib_mode, - "motor_names": arm.motor_names, - } - return calib_data - - -def ensure_safe_goal_position( - goal_pos: torch.Tensor, present_pos: torch.Tensor, max_relative_target: float | list[float] -): - # Cap relative action target magnitude for safety. 
- diff = goal_pos - present_pos - max_relative_target = torch.tensor(max_relative_target) - safe_diff = torch.minimum(diff, max_relative_target) - safe_diff = torch.maximum(safe_diff, -max_relative_target) - safe_goal_pos = present_pos + safe_diff - - if not torch.allclose(goal_pos, safe_goal_pos): - logging.warning( - "Relative goal position magnitude had to be clamped to be safe.\n" - f" requested relative goal position target: {diff}\n" - f" clamped relative goal position target: {safe_diff}" - ) - - return safe_goal_pos - - -######################################################################## -# Manipulator robot -######################################################################## - - -@dataclass -class ManipulatorRobotConfig: - """ - Example of usage: - ```python - ManipulatorRobotConfig() - ``` - """ - - # Define all components of the robot - robot_type: str | None = None - leader_arms: dict[str, MotorsBus] = field(default_factory=lambda: {}) - follower_arms: dict[str, MotorsBus] = field(default_factory=lambda: {}) - cameras: dict[str, Camera] = field(default_factory=lambda: {}) - - # Optionally limit the magnitude of the relative positional target vector for safety purposes. - # Set this to a positive scalar to have the same value for all motors, or a list that is the same length - # as the number of motors in your follower arms (assumes all follower arms have the same number of - # motors). - max_relative_target: list[float] | float | None = None - - # Optionally set the leader arm in torque mode with the gripper motor set to this angle. This makes it - # possible to squeeze the gripper and have it spring back to an open position on its own. If None, the - # gripper is not put in torque mode. - gripper_open_degree: float | None = None - - def __setattr__(self, prop: str, val): - if prop == "max_relative_target" and val is not None and isinstance(val, Sequence): - for name in self.follower_arms: - if len(self.follower_arms[name].motors) != len(val): - raise ValueError( - f"len(max_relative_target)={len(val)} but the follower arm with name {name} has " - f"{len(self.follower_arms[name].motors)} motors. Please make sure that the " - f"`max_relative_target` list has as many parameters as there are motors per arm. " - "Note: This feature does not yet work with robots where different follower arms have " - "different numbers of motors." - ) - super().__setattr__(prop, val) - - -class ManipulatorRobot: - # TODO(rcadene): Implement force feedback - """This class allows to control any manipulator robot of various number of motors. 
- - Non exaustive list of robots: - - [Koch v1.0](https://github.com/AlexanderKoch-Koch/low_cost_robot), with and without the wrist-to-elbow expansion, developed - by Alexander Koch from [Tau Robotics](https://tau-robotics.com) - - [Koch v1.1](https://github.com/jess-moss/koch-v1-1) developed by Jess Moss - - [Aloha](https://www.trossenrobotics.com/aloha-kits) developed by Trossen Robotics - - Example of highest frequency teleoperation without camera: - ```python - # Defines how to communicate with the motors of the leader and follower arms - leader_arms = { - "main": DynamixelMotorsBus( - port="/dev/tty.usbmodem575E0031751", - motors={ - # name: (index, model) - "shoulder_pan": (1, "xl330-m077"), - "shoulder_lift": (2, "xl330-m077"), - "elbow_flex": (3, "xl330-m077"), - "wrist_flex": (4, "xl330-m077"), - "wrist_roll": (5, "xl330-m077"), - "gripper": (6, "xl330-m077"), - }, - ), - } - follower_arms = { - "main": DynamixelMotorsBus( - port="/dev/tty.usbmodem575E0032081", - motors={ - # name: (index, model) - "shoulder_pan": (1, "xl430-w250"), - "shoulder_lift": (2, "xl430-w250"), - "elbow_flex": (3, "xl330-m288"), - "wrist_flex": (4, "xl330-m288"), - "wrist_roll": (5, "xl330-m288"), - "gripper": (6, "xl330-m288"), - }, - ), - } - robot = ManipulatorRobot( - robot_type="koch", - calibration_dir=".cache/calibration/koch", - leader_arms=leader_arms, - follower_arms=follower_arms, - ) - - # Connect motors buses and cameras if any (Required) - robot.connect() - - while True: - robot.teleop_step() - ``` - - Example of highest frequency data collection without camera: - ```python - # Assumes leader and follower arms have been instantiated already (see first example) - robot = ManipulatorRobot( - robot_type="koch", - calibration_dir=".cache/calibration/koch", - leader_arms=leader_arms, - follower_arms=follower_arms, - ) - robot.connect() - while True: - observation, action = robot.teleop_step(record_data=True) - ``` - - Example of highest frequency data collection with cameras: - ```python - # Defines how to communicate with 2 cameras connected to the computer. - # Here, the webcam of the laptop and the phone (connected in USB to the laptop) - # can be reached respectively using the camera indices 0 and 1. These indices can be - # arbitrary. See the documentation of `OpenCVCamera` to find your own camera indices. 
- cameras = { - "laptop": OpenCVCamera(camera_index=0, fps=30, width=640, height=480), - "phone": OpenCVCamera(camera_index=1, fps=30, width=640, height=480), - } - - # Assumes leader and follower arms have been instantiated already (see first example) - robot = ManipulatorRobot( - robot_type="koch", - calibration_dir=".cache/calibration/koch", - leader_arms=leader_arms, - follower_arms=follower_arms, - cameras=cameras, - ) - robot.connect() - while True: - observation, action = robot.teleop_step(record_data=True) - ``` - - Example of controlling the robot with a policy (without running multiple policies in parallel to ensure highest frequency): - ```python - # Assumes leader and follower arms + cameras have been instantiated already (see previous example) - robot = ManipulatorRobot( - robot_type="koch", - calibration_dir=".cache/calibration/koch", - leader_arms=leader_arms, - follower_arms=follower_arms, - cameras=cameras, - ) - robot.connect() - while True: - # Uses the follower arms and cameras to capture an observation - observation = robot.capture_observation() - - # Assumes a policy has been instantiated - with torch.inference_mode(): - action = policy.select_action(observation) - - # Orders the robot to move - robot.send_action(action) - ``` - - Example of disconnecting which is not mandatory since we disconnect when the object is deleted: - ```python - robot.disconnect() - ``` - """ - - def __init__( - self, - config: ManipulatorRobotConfig | None = None, - calibration_dir: Path = ".cache/calibration/koch", - **kwargs, - ): - if config is None: - config = ManipulatorRobotConfig() - # Overwrite config arguments using kwargs - self.config = replace(config, **kwargs) - self.calibration_dir = Path(calibration_dir) - - self.robot_type = self.config.robot_type - self.leader_arms = self.config.leader_arms - self.follower_arms = self.config.follower_arms - self.cameras = self.config.cameras - self.is_connected = False - self.logs = {} - - def connect(self): - if self.is_connected: - raise RobotDeviceAlreadyConnectedError( - "ManipulatorRobot is already connected. Do not run `robot.connect()` twice." - ) - - if not self.leader_arms and not self.follower_arms and not self.cameras: - raise ValueError( - "ManipulatorRobot doesn't have any device to connect. See example of usage in docstring of the class." - ) - - # Connect the arms - for name in self.follower_arms: - print(f"Connecting {name} follower arm.") - self.follower_arms[name].connect() - print(f"Connecting {name} leader arm.") - self.leader_arms[name].connect() - - # We assume that at connection time, arms are in a rest position, and torque can - # be safely disabled to run calibration and/or set robot preset configurations. - for name in self.follower_arms: - self.follower_arms[name].write("Torque_Enable", TorqueMode.DISABLED.value) - for name in self.leader_arms: - self.leader_arms[name].write("Torque_Enable", TorqueMode.DISABLED.value) - - self.activate_calibration() - - # Set robot preset (e.g. 
torque in leader gripper for Koch v1.1) - if self.robot_type == "koch": - self.set_koch_robot_preset() - elif self.robot_type == "aloha": - self.set_aloha_robot_preset() - else: - warnings.warn(f"No preset found for robot type: {self.robot_type}", stacklevel=1) - - # Enable torque on all motors of the follower arms - for name in self.follower_arms: - print(f"Activating torque on {name} follower arm.") - self.follower_arms[name].write("Torque_Enable", 1) - - if self.config.gripper_open_degree is not None: - # Set the leader arm in torque mode with the gripper motor set to an angle. This makes it possible - # to squeeze the gripper and have it spring back to an open position on its own. - for name in self.leader_arms: - self.leader_arms[name].write("Torque_Enable", 1, "gripper") - self.leader_arms[name].write("Goal_Position", self.config.gripper_open_degree, "gripper") - - # Connect the cameras - for name in self.cameras: - self.cameras[name].connect() - - self.is_connected = True - - def activate_calibration(self): - """After calibration all motors function in human interpretable ranges. - Rotations are expressed in degrees in nominal range of [-180, 180], - and linear motions (like gripper of Aloha) in nominal range of [0, 100]. - """ - - def load_or_run_calibration_(name, arm, arm_type): - arm_id = get_arm_id(name, arm_type) - arm_calib_path = self.calibration_dir / f"{arm_id}.json" - - if arm_calib_path.exists(): - with open(arm_calib_path) as f: - calibration = json.load(f) - else: - print(f"Missing calibration file '{arm_calib_path}'") - calibration = run_arm_calibration(arm, self.robot_type, name, arm_type) - - print(f"Calibration is done! Saving calibration file '{arm_calib_path}'") - arm_calib_path.parent.mkdir(parents=True, exist_ok=True) - with open(arm_calib_path, "w") as f: - json.dump(calibration, f) - - return calibration - - for name, arm in self.follower_arms.items(): - calibration = load_or_run_calibration_(name, arm, "follower") - arm.set_calibration(calibration) - for name, arm in self.leader_arms.items(): - calibration = load_or_run_calibration_(name, arm, "leader") - arm.set_calibration(calibration) - - def set_koch_robot_preset(self): - def set_operating_mode_(arm): - if (arm.read("Torque_Enable") != TorqueMode.DISABLED.value).any(): - raise ValueError("To run set robot preset, the torque must be disabled on all motors.") - - # Use 'extended position mode' for all motors except gripper, because in joint mode the servos can't - # rotate more than 360 degrees (from 0 to 4095) And some mistake can happen while assembling the arm, - # you could end up with a servo with a position 0 or 4095 at a crucial point See [ - # https://emanual.robotis.com/docs/en/dxl/x/x_series/#operating-mode11] - all_motors_except_gripper = [name for name in arm.motor_names if name != "gripper"] - if len(all_motors_except_gripper) > 0: - # 4 corresponds to Extended Position on Koch motors - arm.write("Operating_Mode", 4, all_motors_except_gripper) - - # Use 'position control current based' for gripper to be limited by the limit of the current. - # For the follower gripper, it means it can grasp an object without forcing too much even tho, - # it's goal position is a complete grasp (both gripper fingers are ordered to join and reach a touch). - # For the leader gripper, it means we can use it as a physical trigger, since we can force with our finger - # to make it move, and it will move back to its original target position when we release the force. 
- # 5 corresponds to Current Controlled Position on Koch gripper motors "xl330-m077, xl330-m288" - arm.write("Operating_Mode", 5, "gripper") - - for name in self.follower_arms: - set_operating_mode_(self.follower_arms[name]) - - # Set better PID values to close the gap between recorded states and actions - # TODO(rcadene): Implement an automatic procedure to set optimial PID values for each motor - self.follower_arms[name].write("Position_P_Gain", 1500, "elbow_flex") - self.follower_arms[name].write("Position_I_Gain", 0, "elbow_flex") - self.follower_arms[name].write("Position_D_Gain", 600, "elbow_flex") - - if self.config.gripper_open_degree is not None: - for name in self.leader_arms: - set_operating_mode_(self.leader_arms[name]) - - # Enable torque on the gripper of the leader arms, and move it to 45 degrees, - # so that we can use it as a trigger to close the gripper of the follower arms. - self.leader_arms[name].write("Torque_Enable", 1, "gripper") - self.leader_arms[name].write("Goal_Position", self.config.gripper_open_degree, "gripper") - - def set_aloha_robot_preset(self): - def set_shadow_(arm): - # Set secondary/shadow ID for shoulder and elbow. These joints have two motors. - # As a result, if only one of them is required to move to a certain position, - # the other will follow. This is to avoid breaking the motors. - if "shoulder_shadow" in arm.motor_names: - shoulder_idx = arm.read("ID", "shoulder") - arm.write("Secondary_ID", shoulder_idx, "shoulder_shadow") - - if "elbow_shadow" in arm.motor_names: - elbow_idx = arm.read("ID", "elbow") - arm.write("Secondary_ID", elbow_idx, "elbow_shadow") - - for name in self.follower_arms: - set_shadow_(self.follower_arms[name]) - - for name in self.leader_arms: - set_shadow_(self.leader_arms[name]) - - for name in self.follower_arms: - # Set a velocity limit of 131 as advised by Trossen Robotics - self.follower_arms[name].write("Velocity_Limit", 131) - - # Use 'extended position mode' for all motors except gripper, because in joint mode the servos can't - # rotate more than 360 degrees (from 0 to 4095) And some mistake can happen while assembling the arm, - # you could end up with a servo with a position 0 or 4095 at a crucial point See [ - # https://emanual.robotis.com/docs/en/dxl/x/x_series/#operating-mode11] - all_motors_except_gripper = [ - name for name in self.follower_arms[name].motor_names if name != "gripper" - ] - if len(all_motors_except_gripper) > 0: - # 4 corresponds to Extended Position on Aloha motors - self.follower_arms[name].write("Operating_Mode", 4, all_motors_except_gripper) - - # Use 'position control current based' for follower gripper to be limited by the limit of the current. - # It can grasp an object without forcing too much even tho, - # it's goal position is a complete grasp (both gripper fingers are ordered to join and reach a touch). - # 5 corresponds to Current Controlled Position on Aloha gripper follower "xm430-w350" - self.follower_arms[name].write("Operating_Mode", 5, "gripper") - - # Note: We can't enable torque on the leader gripper since "xc430-w150" doesn't have - # a Current Controlled Position mode. 
- - if self.config.gripper_open_degree is not None: - warnings.warn( - f"`gripper_open_degree` is set to {self.config.gripper_open_degree}, but None is expected for Aloha instead", - stacklevel=1, - ) - - def teleop_step( - self, record_data=False - ) -> None | tuple[dict[str, torch.Tensor], dict[str, torch.Tensor]]: - if not self.is_connected: - raise RobotDeviceNotConnectedError( - "ManipulatorRobot is not connected. You need to run `robot.connect()`." - ) - - # Prepare to assign the position of the leader to the follower - leader_pos = {} - for name in self.leader_arms: - before_lread_t = time.perf_counter() - leader_pos[name] = self.leader_arms[name].read("Present_Position") - leader_pos[name] = torch.from_numpy(leader_pos[name]) - self.logs[f"read_leader_{name}_pos_dt_s"] = time.perf_counter() - before_lread_t - - # Send goal position to the follower - follower_goal_pos = {} - for name in self.follower_arms: - before_fwrite_t = time.perf_counter() - goal_pos = leader_pos[name] - - # Cap goal position when too far away from present position. - # Slower fps expected due to reading from the follower. - if self.config.max_relative_target is not None: - present_pos = self.follower_arms[name].read("Present_Position") - present_pos = torch.from_numpy(present_pos) - goal_pos = ensure_safe_goal_position(goal_pos, present_pos, self.config.max_relative_target) - - # Used when record_data=True - follower_goal_pos[name] = goal_pos - - goal_pos = goal_pos.numpy().astype(np.int32) - self.follower_arms[name].write("Goal_Position", goal_pos) - self.logs[f"write_follower_{name}_goal_pos_dt_s"] = time.perf_counter() - before_fwrite_t - - # Early exit when recording data is not requested - if not record_data: - return - - # TODO(rcadene): Add velocity and other info - # Read follower position - follower_pos = {} - for name in self.follower_arms: - before_fread_t = time.perf_counter() - follower_pos[name] = self.follower_arms[name].read("Present_Position") - follower_pos[name] = torch.from_numpy(follower_pos[name]) - self.logs[f"read_follower_{name}_pos_dt_s"] = time.perf_counter() - before_fread_t - - # Create state by concatenating follower current position - state = [] - for name in self.follower_arms: - if name in follower_pos: - state.append(follower_pos[name]) - state = torch.cat(state) - - # Create action by concatenating follower goal position - action = [] - for name in self.follower_arms: - if name in follower_goal_pos: - action.append(follower_goal_pos[name]) - action = torch.cat(action) - - # Capture images from cameras - images = {} - for name in self.cameras: - before_camread_t = time.perf_counter() - images[name] = self.cameras[name].async_read() - images[name] = torch.from_numpy(images[name]) - self.logs[f"read_camera_{name}_dt_s"] = self.cameras[name].logs["delta_timestamp_s"] - self.logs[f"async_read_camera_{name}_dt_s"] = time.perf_counter() - before_camread_t - - # Populate output dictionnaries - obs_dict, action_dict = {}, {} - obs_dict["observation.state"] = state - action_dict["action"] = action - for name in self.cameras: - obs_dict[f"observation.images.{name}"] = images[name] - - return obs_dict, action_dict - - def capture_observation(self): - """The returned observations do not have a batch dimension.""" - if not self.is_connected: - raise RobotDeviceNotConnectedError( - "ManipulatorRobot is not connected. You need to run `robot.connect()`." 
- ) - - # Read follower position - follower_pos = {} - for name in self.follower_arms: - before_fread_t = time.perf_counter() - follower_pos[name] = self.follower_arms[name].read("Present_Position") - follower_pos[name] = torch.from_numpy(follower_pos[name]) - self.logs[f"read_follower_{name}_pos_dt_s"] = time.perf_counter() - before_fread_t - - # Create state by concatenating follower current position - state = [] - for name in self.follower_arms: - if name in follower_pos: - state.append(follower_pos[name]) - state = torch.cat(state) - - # Capture images from cameras - images = {} - for name in self.cameras: - before_camread_t = time.perf_counter() - images[name] = self.cameras[name].async_read() - images[name] = torch.from_numpy(images[name]) - self.logs[f"read_camera_{name}_dt_s"] = self.cameras[name].logs["delta_timestamp_s"] - self.logs[f"async_read_camera_{name}_dt_s"] = time.perf_counter() - before_camread_t - - # Populate output dictionnaries and format to pytorch - obs_dict = {} - obs_dict["observation.state"] = state - for name in self.cameras: - obs_dict[f"observation.images.{name}"] = images[name] - return obs_dict - - def send_action(self, action: torch.Tensor) -> torch.Tensor: - """Command the follower arms to move to a target joint configuration. - - The relative action magnitude may be clipped depending on the configuration parameter - `max_relative_target`. In this case, the action sent differs from original action. - Thus, this function always returns the action actually sent. - - Args: - action: tensor containing the concatenated goal positions for the follower arms. - """ - if not self.is_connected: - raise RobotDeviceNotConnectedError( - "ManipulatorRobot is not connected. You need to run `robot.connect()`." - ) - - from_idx = 0 - to_idx = 0 - action_sent = [] - for name in self.follower_arms: - # Get goal position of each follower arm by splitting the action vector - to_idx += len(self.follower_arms[name].motor_names) - goal_pos = action[from_idx:to_idx] - from_idx = to_idx - - # Cap goal position when too far away from present position. - # Slower fps expected due to reading from the follower. - if self.config.max_relative_target is not None: - present_pos = self.follower_arms[name].read("Present_Position") - present_pos = torch.from_numpy(present_pos) - goal_pos = ensure_safe_goal_position(goal_pos, present_pos, self.config.max_relative_target) - - # Save tensor to concat and return - action_sent.append(goal_pos) - - # Send goal position to each follower - goal_pos = goal_pos.numpy().astype(np.int32) - self.follower_arms[name].write("Goal_Position", goal_pos) - - return torch.cat(action_sent) - - def disconnect(self): - if not self.is_connected: - raise RobotDeviceNotConnectedError( - "ManipulatorRobot is not connected. You need to run `robot.connect()` before disconnecting." 
- ) - - for name in self.follower_arms: - self.follower_arms[name].disconnect() - - for name in self.leader_arms: - self.leader_arms[name].disconnect() - - for name in self.cameras: - self.cameras[name].disconnect() - - self.is_connected = False - - def __del__(self): - if getattr(self, "is_connected", False): - self.disconnect() diff --git a/lerobot/configs/robot/aloha.yaml b/lerobot/configs/robot/aloha.yaml deleted file mode 100644 index 938fa2e3d..000000000 --- a/lerobot/configs/robot/aloha.yaml +++ /dev/null @@ -1,115 +0,0 @@ -# Aloha: A Low-Cost Hardware for Bimanual Teleoperation -# https://aloha-2.github.io -# https://www.trossenrobotics.com/aloha-stationary - -# Requires installing extras packages -# With pip: `pip install -e ".[dynamixel intelrealsense]"` -# With poetry: `poetry install --sync --extras "dynamixel intelrealsense"` - -_target_: lerobot.common.robot_devices.robots.manipulator.ManipulatorRobot -robot_type: aloha -# Specific to Aloha, LeRobot comes with default calibration files. Assuming the motors have been -# properly assembled, no manual calibration step is expected. If you need to run manual calibration, -# simply update this path to ".cache/calibration/aloha" -calibration_dir: .cache/calibration/aloha_default - -# /!\ FOR SAFETY, READ THIS /!\ -# `max_relative_target` limits the magnitude of the relative positional target vector for safety purposes. -# Set this to a positive scalar to have the same value for all motors, or a list that is the same length as -# the number of motors in your follower arms. -# For Aloha, for every goal position request, motor rotations are capped at 5 degrees by default. -# When you feel more confident with teleoperation or running the policy, you can extend -# this safety limit and even removing it by setting it to `null`. 
-# Also, everything is expected to work safely out-of-the-box, but we highly advise to -# first try to teleoperate the grippers only (by commenting out the rest of the motors in this yaml), -# then to gradually add more motors (by uncommenting), until you can teleoperate both arms fully -max_relative_target: 5 - -leader_arms: - left: - _target_: lerobot.common.robot_devices.motors.dynamixel.DynamixelMotorsBus - port: /dev/ttyDXL_leader_left - motors: # window_x - # name: (index, model) - waist: [1, xm430-w350] - shoulder: [2, xm430-w350] - shoulder_shadow: [3, xm430-w350] - elbow: [4, xm430-w350] - elbow_shadow: [5, xm430-w350] - forearm_roll: [6, xm430-w350] - wrist_angle: [7, xm430-w350] - wrist_rotate: [8, xl430-w250] - gripper: [9, xc430-w150] - right: - _target_: lerobot.common.robot_devices.motors.dynamixel.DynamixelMotorsBus - port: /dev/ttyDXL_leader_right - motors: # window_x - # name: (index, model) - waist: [1, xm430-w350] - shoulder: [2, xm430-w350] - shoulder_shadow: [3, xm430-w350] - elbow: [4, xm430-w350] - elbow_shadow: [5, xm430-w350] - forearm_roll: [6, xm430-w350] - wrist_angle: [7, xm430-w350] - wrist_rotate: [8, xl430-w250] - gripper: [9, xc430-w150] - -follower_arms: - left: - _target_: lerobot.common.robot_devices.motors.dynamixel.DynamixelMotorsBus - port: /dev/ttyDXL_follower_left - motors: - # name: [index, model] - waist: [1, xm540-w270] - shoulder: [2, xm540-w270] - shoulder_shadow: [3, xm540-w270] - elbow: [4, xm540-w270] - elbow_shadow: [5, xm540-w270] - forearm_roll: [6, xm540-w270] - wrist_angle: [7, xm540-w270] - wrist_rotate: [8, xm430-w350] - gripper: [9, xm430-w350] - right: - _target_: lerobot.common.robot_devices.motors.dynamixel.DynamixelMotorsBus - port: /dev/ttyDXL_follower_right - motors: - # name: [index, model] - waist: [1, xm540-w270] - shoulder: [2, xm540-w270] - shoulder_shadow: [3, xm540-w270] - elbow: [4, xm540-w270] - elbow_shadow: [5, xm540-w270] - forearm_roll: [6, xm540-w270] - wrist_angle: [7, xm540-w270] - wrist_rotate: [8, xm430-w350] - gripper: [9, xm430-w350] - -# Troubleshooting: If one of your IntelRealSense cameras freeze during -# data recording due to bandwidth limit, you might need to plug the camera -# on another USB hub or PCIe card. -cameras: - cam_high: - _target_: lerobot.common.robot_devices.cameras.intelrealsense.IntelRealSenseCamera - camera_index: 128422271347 - fps: 30 - width: 640 - height: 480 - cam_low: - _target_: lerobot.common.robot_devices.cameras.intelrealsense.IntelRealSenseCamera - camera_index: 130322270656 - fps: 30 - width: 640 - height: 480 - cam_left_wrist: - _target_: lerobot.common.robot_devices.cameras.intelrealsense.IntelRealSenseCamera - camera_index: 218622272670 - fps: 30 - width: 640 - height: 480 - cam_right_wrist: - _target_: lerobot.common.robot_devices.cameras.intelrealsense.IntelRealSenseCamera - camera_index: 130322272300 - fps: 30 - width: 640 - height: 480 diff --git a/lerobot/configs/robot/koch_bimanual.yaml b/lerobot/configs/robot/koch_bimanual.yaml deleted file mode 100644 index 7f8138675..000000000 --- a/lerobot/configs/robot/koch_bimanual.yaml +++ /dev/null @@ -1,75 +0,0 @@ -_target_: lerobot.common.robot_devices.robots.manipulator.ManipulatorRobot -robot_type: koch -calibration_dir: .cache/calibration/koch_bimanual - -# `max_relative_target` limits the magnitude of the relative positional target vector for safety purposes. -# Set this to a positive scalar to have the same value for all motors, or a list that is the same length as -# the number of motors in your follower arms. 
-max_relative_target: null - -leader_arms: - left: - _target_: lerobot.common.robot_devices.motors.dynamixel.DynamixelMotorsBus - port: /dev/tty.usbmodem585A0085511 - motors: - # name: (index, model) - shoulder_pan: [1, "xl330-m077"] - shoulder_lift: [2, "xl330-m077"] - elbow_flex: [3, "xl330-m077"] - wrist_flex: [4, "xl330-m077"] - wrist_roll: [5, "xl330-m077"] - gripper: [6, "xl330-m077"] - right: - _target_: lerobot.common.robot_devices.motors.dynamixel.DynamixelMotorsBus - port: /dev/tty.usbmodem575E0031751 - motors: - # name: (index, model) - shoulder_pan: [1, "xl330-m077"] - shoulder_lift: [2, "xl330-m077"] - elbow_flex: [3, "xl330-m077"] - wrist_flex: [4, "xl330-m077"] - wrist_roll: [5, "xl330-m077"] - gripper: [6, "xl330-m077"] - -follower_arms: - left: - _target_: lerobot.common.robot_devices.motors.dynamixel.DynamixelMotorsBus - port: /dev/tty.usbmodem585A0076891 - motors: - # name: (index, model) - shoulder_pan: [1, "xl430-w250"] - shoulder_lift: [2, "xl430-w250"] - elbow_flex: [3, "xl330-m288"] - wrist_flex: [4, "xl330-m288"] - wrist_roll: [5, "xl330-m288"] - gripper: [6, "xl330-m288"] - right: - _target_: lerobot.common.robot_devices.motors.dynamixel.DynamixelMotorsBus - port: /dev/tty.usbmodem575E0032081 - motors: - # name: (index, model) - shoulder_pan: [1, "xl430-w250"] - shoulder_lift: [2, "xl430-w250"] - elbow_flex: [3, "xl330-m288"] - wrist_flex: [4, "xl330-m288"] - wrist_roll: [5, "xl330-m288"] - gripper: [6, "xl330-m288"] - -cameras: - laptop: - _target_: lerobot.common.robot_devices.cameras.opencv.OpenCVCamera - camera_index: 0 - fps: 30 - width: 640 - height: 480 - phone: - _target_: lerobot.common.robot_devices.cameras.opencv.OpenCVCamera - camera_index: 1 - fps: 30 - width: 640 - height: 480 - -# ~ Koch specific settings ~ -# Sets the leader arm in torque mode with the gripper motor set to this angle. This makes it possible -# to squeeze the gripper and have it spring back to an open position on its own. -gripper_open_degree: 35.156 diff --git a/lerobot/scripts/train.py b/lerobot/scripts/train.py index 45807503f..28c4e8737 100644 --- a/lerobot/scripts/train.py +++ b/lerobot/scripts/train.py @@ -33,7 +33,7 @@ from lerobot.common.datasets.factory import make_dataset, resolve_delta_timestamps from lerobot.common.datasets.lerobot_dataset import MultiLeRobotDataset -from lerobot.common.datasets.online_buffer import OnlineBuffer, compute_sampler_weights +from lerobot.common.datasets.online_buffer import DataBuffer, compute_sampler_weights from lerobot.common.datasets.sampler import EpisodeAwareSampler from lerobot.common.datasets.utils import cycle from lerobot.common.envs.factory import make_env @@ -403,8 +403,19 @@ def evaluate_and_checkpoint_if_needed(step, is_online): else: shuffle = True sampler = None + + if cfg.get("use_lerobot_data_buffer", False): + offline_dataset_for_dataloader = DataBuffer.from_hf_dataset( + offline_dataset.hf_dataset, + storage_dir=f"/tmp/{offline_dataset.repo_id}", + fps=offline_dataset.fps, + delta_timestamps=offline_dataset.delta_timestamps, + ) + else: + offline_dataset_for_dataloader = offline_dataset + dataloader = torch.utils.data.DataLoader( - offline_dataset, + offline_dataset_for_dataloader, num_workers=cfg.training.num_workers, batch_size=cfg.training.batch_size, shuffle=shuffle, @@ -470,7 +481,7 @@ def evaluate_and_checkpoint_if_needed(step, is_online): "was made. This is because the online buffer is updated on disk during training, independently " "of our explicit checkpointing mechanisms." 
) - online_dataset = OnlineBuffer( + online_dataset = DataBuffer( online_buffer_path, data_spec={ **{k: {"shape": v, "dtype": np.dtype("float32")} for k, v in policy.config.input_shapes.items()}, diff --git a/tests/data/lerobot/aloha_mobile_cabinet/meta_data/info.json b/tests/data/lerobot/aloha_mobile_cabinet/meta_data/info.json deleted file mode 100644 index 73d51a20f..000000000 --- a/tests/data/lerobot/aloha_mobile_cabinet/meta_data/info.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "codebase_version": "v1.6", - "fps": 50, - "video": true, - "encoding": { - "vcodec": "libsvtav1", - "pix_fmt": "yuv420p", - "g": 2, - "crf": 30 - } -} \ No newline at end of file From 1140a850efd5e9a5b989e7d39ab6305c270c6ad1 Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Mon, 16 Sep 2024 18:37:31 +0100 Subject: [PATCH 02/29] backup wip --- .../aloha_default/left_follower.json | 68 +++++++++++++++++++ .../aloha_default/left_leader.json | 68 +++++++++++++++++++ .../aloha_default/right_follower.json | 68 +++++++++++++++++++ .../aloha_default/right_leader.json | 68 +++++++++++++++++++ .../aloha_mobile_cabinet/meta_data/info.json | 11 +++ 5 files changed, 283 insertions(+) create mode 100644 .cache/calibration/aloha_default/left_follower.json create mode 100644 .cache/calibration/aloha_default/left_leader.json create mode 100644 .cache/calibration/aloha_default/right_follower.json create mode 100644 .cache/calibration/aloha_default/right_leader.json create mode 100644 tests/data/lerobot/aloha_mobile_cabinet/meta_data/info.json diff --git a/.cache/calibration/aloha_default/left_follower.json b/.cache/calibration/aloha_default/left_follower.json new file mode 100644 index 000000000..336c238a0 --- /dev/null +++ b/.cache/calibration/aloha_default/left_follower.json @@ -0,0 +1,68 @@ +{ + "homing_offset": [ + 2048, + 3072, + 3072, + -1024, + -1024, + 2048, + -2048, + 2048, + -2048 + ], + "drive_mode": [ + 1, + 1, + 1, + 0, + 0, + 1, + 0, + 1, + 0 + ], + "start_pos": [ + 2015, + 3058, + 3061, + 1071, + 1071, + 2035, + 2152, + 2029, + 2499 + ], + "end_pos": [ + -1008, + -1963, + -1966, + 2141, + 2143, + -971, + 3043, + -1077, + 3144 + ], + "calib_mode": [ + "DEGREE", + "DEGREE", + "DEGREE", + "DEGREE", + "DEGREE", + "DEGREE", + "DEGREE", + "DEGREE", + "LINEAR" + ], + "motor_names": [ + "waist", + "shoulder", + "shoulder_shadow", + "elbow", + "elbow_shadow", + "forearm_roll", + "wrist_angle", + "wrist_rotate", + "gripper" + ] +} diff --git a/.cache/calibration/aloha_default/left_leader.json b/.cache/calibration/aloha_default/left_leader.json new file mode 100644 index 000000000..d933f2bab --- /dev/null +++ b/.cache/calibration/aloha_default/left_leader.json @@ -0,0 +1,68 @@ +{ + "homing_offset": [ + 2048, + 3072, + 3072, + -1024, + -1024, + 2048, + -2048, + 2048, + -1024 + ], + "drive_mode": [ + 1, + 1, + 1, + 0, + 0, + 1, + 0, + 1, + 0 + ], + "start_pos": [ + 2035, + 3024, + 3019, + 979, + 981, + 1982, + 2166, + 2124, + 1968 + ], + "end_pos": [ + -990, + -2017, + -2015, + 2078, + 2076, + -1030, + 3117, + -1016, + 2556 + ], + "calib_mode": [ + "DEGREE", + "DEGREE", + "DEGREE", + "DEGREE", + "DEGREE", + "DEGREE", + "DEGREE", + "DEGREE", + "LINEAR" + ], + "motor_names": [ + "waist", + "shoulder", + "shoulder_shadow", + "elbow", + "elbow_shadow", + "forearm_roll", + "wrist_angle", + "wrist_rotate", + "gripper" + ] +} diff --git a/.cache/calibration/aloha_default/right_follower.json b/.cache/calibration/aloha_default/right_follower.json new file mode 100644 index 000000000..bc69dfafd --- /dev/null +++ 
b/.cache/calibration/aloha_default/right_follower.json @@ -0,0 +1,68 @@ +{ + "homing_offset": [ + 2048, + 3072, + 3072, + -1024, + -1024, + 2048, + -2048, + 2048, + -2048 + ], + "drive_mode": [ + 1, + 1, + 1, + 0, + 0, + 1, + 0, + 1, + 0 + ], + "start_pos": [ + 2056, + 2895, + 2896, + 1191, + 1190, + 2018, + 2051, + 2056, + 2509 + ], + "end_pos": [ + -1040, + -2004, + -2006, + 2126, + 2127, + -1010, + 3050, + -1117, + 3143 + ], + "calib_mode": [ + "DEGREE", + "DEGREE", + "DEGREE", + "DEGREE", + "DEGREE", + "DEGREE", + "DEGREE", + "DEGREE", + "LINEAR" + ], + "motor_names": [ + "waist", + "shoulder", + "shoulder_shadow", + "elbow", + "elbow_shadow", + "forearm_roll", + "wrist_angle", + "wrist_rotate", + "gripper" + ] +} diff --git a/.cache/calibration/aloha_default/right_leader.json b/.cache/calibration/aloha_default/right_leader.json new file mode 100644 index 000000000..d96d1de9b --- /dev/null +++ b/.cache/calibration/aloha_default/right_leader.json @@ -0,0 +1,68 @@ +{ + "homing_offset": [ + 2048, + 3072, + 3072, + -1024, + -1024, + 2048, + -2048, + 2048, + -2048 + ], + "drive_mode": [ + 1, + 1, + 1, + 0, + 0, + 1, + 0, + 1, + 0 + ], + "start_pos": [ + 2068, + 3034, + 3030, + 1038, + 1041, + 1991, + 1948, + 2090, + 1985 + ], + "end_pos": [ + -1025, + -2014, + -2015, + 2058, + 2060, + -955, + 3091, + -940, + 2576 + ], + "calib_mode": [ + "DEGREE", + "DEGREE", + "DEGREE", + "DEGREE", + "DEGREE", + "DEGREE", + "DEGREE", + "DEGREE", + "LINEAR" + ], + "motor_names": [ + "waist", + "shoulder", + "shoulder_shadow", + "elbow", + "elbow_shadow", + "forearm_roll", + "wrist_angle", + "wrist_rotate", + "gripper" + ] +} diff --git a/tests/data/lerobot/aloha_mobile_cabinet/meta_data/info.json b/tests/data/lerobot/aloha_mobile_cabinet/meta_data/info.json new file mode 100644 index 000000000..73d51a20f --- /dev/null +++ b/tests/data/lerobot/aloha_mobile_cabinet/meta_data/info.json @@ -0,0 +1,11 @@ +{ + "codebase_version": "v1.6", + "fps": 50, + "video": true, + "encoding": { + "vcodec": "libsvtav1", + "pix_fmt": "yuv420p", + "g": 2, + "crf": 30 + } +} \ No newline at end of file From 745787d0b7b33b39d39c59f5193e83199149f1b2 Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Mon, 16 Sep 2024 18:38:11 +0100 Subject: [PATCH 03/29] backup wip --- .../robot_devices/cameras/intelrealsense.py | 448 +++++++++++ .../robot_devices/robots/manipulator.py | 703 ++++++++++++++++++ lerobot/configs/robot/aloha.yaml | 115 +++ lerobot/configs/robot/koch_bimanual.yaml | 75 ++ 4 files changed, 1341 insertions(+) create mode 100644 lerobot/common/robot_devices/cameras/intelrealsense.py create mode 100644 lerobot/common/robot_devices/robots/manipulator.py create mode 100644 lerobot/configs/robot/aloha.yaml create mode 100644 lerobot/configs/robot/koch_bimanual.yaml diff --git a/lerobot/common/robot_devices/cameras/intelrealsense.py b/lerobot/common/robot_devices/cameras/intelrealsense.py new file mode 100644 index 000000000..4806bf785 --- /dev/null +++ b/lerobot/common/robot_devices/cameras/intelrealsense.py @@ -0,0 +1,448 @@ +""" +This file contains utilities for recording frames from Intel Realsense cameras. 
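+
+Example of recording a few seconds of frames from every connected camera (a minimal
+sketch of the `save_images_from_cameras` utility defined below in this file):
+```python
+from pathlib import Path
+
+from lerobot.common.robot_devices.cameras.intelrealsense import save_images_from_cameras
+
+save_images_from_cameras(Path("outputs/images_from_intelrealsense_cameras"), record_time_s=2)
+```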
+""" + +import argparse +import concurrent.futures +import logging +import shutil +import threading +import time +import traceback +from dataclasses import dataclass, replace +from pathlib import Path +from threading import Thread + +import cv2 +import numpy as np +import pyrealsense2 as rs +from PIL import Image + +from lerobot.common.robot_devices.utils import ( + RobotDeviceAlreadyConnectedError, + RobotDeviceNotConnectedError, +) +from lerobot.common.utils.utils import capture_timestamp_utc +from lerobot.scripts.control_robot import busy_wait + +SERIAL_NUMBER_INDEX = 1 + + +def find_camera_indices(raise_when_empty=True) -> list[int]: + """ + Find the serial numbers of the Intel RealSense cameras + connected to the computer. + """ + camera_ids = [] + for device in rs.context().query_devices(): + serial_number = int(device.get_info(rs.camera_info(SERIAL_NUMBER_INDEX))) + camera_ids.append(serial_number) + + if raise_when_empty and len(camera_ids) == 0: + raise OSError( + "Not a single camera was detected. Try re-plugging, or re-installing `librealsense` and its python wrapper `pyrealsense2`, or updating the firmware." + ) + + return camera_ids + + +def save_image(img_array, camera_idx, frame_index, images_dir): + try: + img = Image.fromarray(img_array) + path = images_dir / f"camera_{camera_idx}_frame_{frame_index:06d}.png" + path.parent.mkdir(parents=True, exist_ok=True) + img.save(str(path), quality=100) + logging.info(f"Saved image: {path}") + except Exception as e: + logging.error(f"Failed to save image for camera {camera_idx} frame {frame_index}: {e}") + + +def save_images_from_cameras( + images_dir: Path, + camera_ids: list[int] | None = None, + fps=None, + width=None, + height=None, + record_time_s=2, +): + """ + Initializes all the cameras and saves images to the directory. Useful to visually identify the camera + associated to a given camera index. + """ + if camera_ids is None: + camera_ids = find_camera_indices() + + print("Connecting cameras") + cameras = [] + for cam_idx in camera_ids: + camera = IntelRealSenseCamera(cam_idx, fps=fps, width=width, height=height) + camera.connect() + print( + f"IntelRealSenseCamera({camera.camera_index}, fps={camera.fps}, width={camera.width}, height={camera.height}, color_mode={camera.color_mode})" + ) + cameras.append(camera) + + images_dir = Path(images_dir) + if images_dir.exists(): + shutil.rmtree( + images_dir, + ) + images_dir.mkdir(parents=True, exist_ok=True) + + print(f"Saving images to {images_dir}") + frame_index = 0 + start_time = time.perf_counter() + try: + with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor: + while True: + now = time.perf_counter() + + for camera in cameras: + # If we use async_read when fps is None, the loop will go full speed, and we will end up + # saving the same images from the cameras multiple times until the RAM/disk is full. 
+                    image = camera.read() if fps is None else camera.async_read()
+                    if image is None:
+                        print("No Frame")
+                        # Skip frames that could not be read rather than passing None to cv2.
+                        continue
+                    bgr_converted_image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
+
+                    executor.submit(
+                        save_image,
+                        bgr_converted_image,
+                        camera.camera_index,
+                        frame_index,
+                        images_dir,
+                    )
+
+                if fps is not None:
+                    dt_s = time.perf_counter() - now
+                    busy_wait(1 / fps - dt_s)
+
+                if time.perf_counter() - start_time > record_time_s:
+                    break
+
+                print(f"Frame: {frame_index:04d}\tLatency (ms): {(time.perf_counter() - now) * 1000:.2f}")
+
+                frame_index += 1
+    finally:
+        print(f"Images have been saved to {images_dir}")
+        for camera in cameras:
+            camera.disconnect()
+
+
+@dataclass
+class IntelRealSenseCameraConfig:
+    """
+    Example of tested options for Intel RealSense D405:
+
+    ```python
+    IntelRealSenseCameraConfig(30, 640, 480)
+    IntelRealSenseCameraConfig(60, 640, 480)
+    IntelRealSenseCameraConfig(90, 640, 480)
+    IntelRealSenseCameraConfig(30, 1280, 720)
+    IntelRealSenseCameraConfig(30, 640, 480, use_depth=True)
+    ```
+    """
+
+    fps: int | None = None
+    width: int | None = None
+    height: int | None = None
+    color_mode: str = "rgb"
+    use_depth: bool = False
+    force_hardware_reset: bool = True
+
+    def __post_init__(self):
+        if self.color_mode not in ["rgb", "bgr"]:
+            raise ValueError(
+                f"`color_mode` is expected to be 'rgb' or 'bgr', but {self.color_mode} is provided."
+            )
+
+        if (self.fps or self.width or self.height) and not (self.fps and self.width and self.height):
+            raise ValueError(
+                "For `fps`, `width` and `height`, either all of them need to be set, or none of them, "
+                f"but {self.fps=}, {self.width=}, {self.height=} were provided."
+            )
+
+
+class IntelRealSenseCamera:
+    """
+    The IntelRealSenseCamera class is similar to the OpenCVCamera class, but adds extra features for Intel RealSense cameras:
+    - camera_index corresponds to the serial number of the camera,
+    - camera_index won't change at random, as can be the case with OpenCVCamera on Linux,
+    - read() is more reliable than OpenCVCamera's,
+    - a depth map can be returned.
+
+    To find the camera indices of your cameras, you can run our utility script that will save a few frames for each camera:
+    ```bash
+    python lerobot/common/robot_devices/cameras/intelrealsense.py --images-dir outputs/images_from_intelrealsense_cameras
+    ```
+
+    When an IntelRealSenseCamera is instantiated, if no specific config is provided, the default fps, width, height and color_mode
+    of the given camera will be used.
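+
+    A programmatic alternative (a small sketch using the `find_camera_indices` helper
+    defined earlier in this file):
+    ```python
+    camera_ids = find_camera_indices()
+    print(camera_ids)  # e.g. [128422271347, 130322270656]
+    ```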
+
+    Example of usage:
+    ```python
+    camera_index = 128422271347
+    camera = IntelRealSenseCamera(camera_index)
+    camera.connect()
+    color_image = camera.read()
+    # when done using the camera, consider disconnecting
+    camera.disconnect()
+    ```
+
+    Example of changing default fps, width, height and color_mode:
+    ```python
+    camera = IntelRealSenseCamera(camera_index, fps=30, width=1280, height=720)
+    camera.connect()  # applies the settings, might error out if these settings are not compatible with the camera
+
+    camera = IntelRealSenseCamera(camera_index, fps=90, width=640, height=480)
+    camera.connect()
+
+    camera = IntelRealSenseCamera(camera_index, fps=90, width=640, height=480, color_mode="bgr")
+    camera.connect()
+    ```
+
+    Example of returning depth:
+    ```python
+    camera = IntelRealSenseCamera(camera_index, use_depth=True)
+    camera.connect()
+    color_image, depth_map = camera.read()
+    ```
+    """
+
+    def __init__(
+        self,
+        camera_index: int,
+        config: IntelRealSenseCameraConfig | None = None,
+        **kwargs,
+    ):
+        if config is None:
+            config = IntelRealSenseCameraConfig()
+
+        # Overwrite the config arguments using kwargs
+        config = replace(config, **kwargs)
+
+        self.camera_index = camera_index
+        self.fps = config.fps
+        self.width = config.width
+        self.height = config.height
+        self.color_mode = config.color_mode
+        self.use_depth = config.use_depth
+        self.force_hardware_reset = config.force_hardware_reset
+
+        self.camera = None
+        self.is_connected = False
+        self.thread = None
+        self.stop_event = None
+        self.color_image = None
+        self.depth_map = None
+        self.logs = {}
+
+    def connect(self):
+        if self.is_connected:
+            raise RobotDeviceAlreadyConnectedError(
+                f"IntelRealSenseCamera({self.camera_index}) is already connected."
+            )
+
+        config = rs.config()
+        config.enable_device(str(self.camera_index))
+
+        if self.fps and self.width and self.height:
+            # TODO(rcadene): can we set rgb8 directly?
+            config.enable_stream(rs.stream.color, self.width, self.height, rs.format.rgb8, self.fps)
+        else:
+            config.enable_stream(rs.stream.color)
+
+        if self.use_depth:
+            if self.fps and self.width and self.height:
+                config.enable_stream(rs.stream.depth, self.width, self.height, rs.format.z16, self.fps)
+            else:
+                config.enable_stream(rs.stream.depth)
+
+        self.camera = rs.pipeline()
+        try:
+            self.camera.start(config)
+            is_camera_open = True
+        except RuntimeError:
+            is_camera_open = False
+            traceback.print_exc()
+
+        # If the camera doesn't work, display the camera indices corresponding to
+        # valid cameras.
+        if not is_camera_open:
+            # Verify that the provided `camera_index` is valid before printing the traceback
+            available_cam_ids = find_camera_indices()
+            if self.camera_index not in available_cam_ids:
+                raise ValueError(
+                    f"`camera_index` is expected to be one of these available cameras {available_cam_ids}, but {self.camera_index} is provided instead. "
+                    "To find the camera index you should use, run `python lerobot/common/robot_devices/cameras/intelrealsense.py`."
+                )
+
+            raise OSError(f"Can't access IntelRealSenseCamera({self.camera_index}).")
+
+        self.is_connected = True
+
+    def read(self, temporary_color: str | None = None) -> np.ndarray | tuple[np.ndarray, np.ndarray]:
+        """Read a frame from the camera returned in the format height x width x channels (e.g. 480 x 640 x 3)
+        of type `np.uint8`, unlike the PyTorch format, which is float, channel-first.
+
+        When `use_depth=True`, returns a tuple `(color_image, depth_map)` with a depth map in the format
+        height x width (e.g. 480 x 640) of type np.uint16.
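+
+        The depth map contains raw uint16 sensor readings (typically millimeters on
+        RealSense devices); use the device's depth scale to convert them to meters.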
+
+        Note: Reading a frame is blocking, and happens at most `camera.fps` times per second.
+        If you are reading data from other sensors, we advise using `camera.async_read()`, which is a non-blocking version of `camera.read()`.
+        """
+        if not self.is_connected:
+            raise RobotDeviceNotConnectedError(
+                f"IntelRealSenseCamera({self.camera_index}) is not connected. Try running `camera.connect()` first."
+            )
+
+        start_time = time.perf_counter()
+
+        frame = self.camera.wait_for_frames(timeout_ms=5000)
+
+        color_frame = frame.get_color_frame()
+
+        if not color_frame:
+            raise OSError(f"Can't capture color image from IntelRealSenseCamera({self.camera_index}).")
+
+        color_image = np.asanyarray(color_frame.get_data())
+
+        requested_color_mode = self.color_mode if temporary_color is None else temporary_color
+        if requested_color_mode not in ["rgb", "bgr"]:
+            raise ValueError(
+                f"Expected color values are 'rgb' or 'bgr', but {requested_color_mode} is provided."
+            )
+
+        # IntelRealSense uses RGB format as default (red, green, blue).
+        if requested_color_mode == "bgr":
+            color_image = cv2.cvtColor(color_image, cv2.COLOR_RGB2BGR)
+
+        h, w, _ = color_image.shape
+        if h != self.height or w != self.width:
+            raise OSError(
+                f"Can't capture color image with expected height and width ({self.height} x {self.width}). ({h} x {w}) returned instead."
+            )
+
+        # log the number of seconds it took to read the image
+        self.logs["delta_timestamp_s"] = time.perf_counter() - start_time
+
+        # log the utc time at which the image was received
+        self.logs["timestamp_utc"] = capture_timestamp_utc()
+
+        if self.use_depth:
+            depth_frame = frame.get_depth_frame()
+            if not depth_frame:
+                raise OSError(f"Can't capture depth image from IntelRealSenseCamera({self.camera_index}).")
+
+            depth_map = np.asanyarray(depth_frame.get_data())
+
+            h, w = depth_map.shape
+            if h != self.height or w != self.width:
+                raise OSError(
+                    f"Can't capture depth map with expected height and width ({self.height} x {self.width}). ({h} x {w}) returned instead."
+                )
+
+            return color_image, depth_map
+        else:
+            return color_image
+
+    def read_loop(self):
+        while self.stop_event is None or not self.stop_event.is_set():
+            if self.use_depth:
+                self.color_image, self.depth_map = self.read()
+            else:
+                self.color_image = self.read()
+
+    def async_read(self):
+        """Access the latest color image"""
+        if not self.is_connected:
+            raise RobotDeviceNotConnectedError(
+                f"IntelRealSenseCamera({self.camera_index}) is not connected. Try running `camera.connect()` first."
+            )
+
+        if self.thread is None:
+            self.stop_event = threading.Event()
+            self.thread = Thread(target=self.read_loop, args=())
+            self.thread.daemon = True
+            self.thread.start()
+
+        num_tries = 0
+        while self.color_image is None:
+            num_tries += 1
+            time.sleep(1 / self.fps)
+            if num_tries > self.fps and (self.thread.ident is None or not self.thread.is_alive()):
+                raise Exception(
+                    "The thread responsible for `self.async_read()` took too much time to start. There might be an issue. Verify that `self.thread.start()` has been called."
+                )
+
+        if self.use_depth:
+            return self.color_image, self.depth_map
+        else:
+            return self.color_image
+
+    def disconnect(self):
+        if not self.is_connected:
+            raise RobotDeviceNotConnectedError(
+                f"IntelRealSenseCamera({self.camera_index}) is not connected. Try running `camera.connect()` first."
+            )
+
+        if self.thread is not None and self.thread.is_alive():
+            # wait for the thread to finish
+            self.stop_event.set()
+            self.thread.join()
+            self.thread = None
+            self.stop_event = None
+
+        self.camera.stop()
+        self.camera = None
+
+        self.is_connected = False
+
+    def __del__(self):
+        if getattr(self, "is_connected", False):
+            self.disconnect()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Save a few frames using `IntelRealSenseCamera` for all cameras connected to the computer, or a selected subset."
+    )
+    parser.add_argument(
+        "--camera-ids",
+        type=int,
+        nargs="*",
+        default=None,
+        help="List of camera indices used to instantiate the `IntelRealSenseCamera`. If not provided, find and use all available camera indices.",
+    )
+    parser.add_argument(
+        "--fps",
+        type=int,
+        default=30,
+        help="Set the number of frames recorded per second for all cameras. If not provided, use the default fps of each camera.",
+    )
+    parser.add_argument(
+        "--width",
+        type=int,
+        default=640,
+        help="Set the width for all cameras. If not provided, use the default width of each camera.",
+    )
+    parser.add_argument(
+        "--height",
+        type=int,
+        default=480,
+        help="Set the height for all cameras. If not provided, use the default height of each camera.",
+    )
+    parser.add_argument(
+        "--images-dir",
+        type=Path,
+        default="outputs/images_from_intelrealsense_cameras",
+        help="Set directory to save a few frames for each camera.",
+    )
+    parser.add_argument(
+        "--record-time-s",
+        type=float,
+        default=2.0,
+        help="Set the number of seconds used to record the frames. By default, 2 seconds.",
+    )
+    args = parser.parse_args()
+    save_images_from_cameras(**vars(args))
diff --git a/lerobot/common/robot_devices/robots/manipulator.py b/lerobot/common/robot_devices/robots/manipulator.py
new file mode 100644
index 000000000..337519765
--- /dev/null
+++ b/lerobot/common/robot_devices/robots/manipulator.py
@@ -0,0 +1,703 @@
+import json
+import logging
+import time
+import warnings
+from dataclasses import dataclass, field, replace
+from pathlib import Path
+from typing import Sequence
+
+import numpy as np
+import torch
+
+from lerobot.common.robot_devices.cameras.utils import Camera
+from lerobot.common.robot_devices.motors.dynamixel import (
+    CalibrationMode,
+    TorqueMode,
+    convert_degrees_to_steps,
+)
+from lerobot.common.robot_devices.motors.utils import MotorsBus
+from lerobot.common.robot_devices.robots.utils import get_arm_id
+from lerobot.common.robot_devices.utils import RobotDeviceAlreadyConnectedError, RobotDeviceNotConnectedError
+
+########################################################################
+# Calibration logic
+########################################################################
+
+URL_TEMPLATE = (
+    "https://raw.githubusercontent.com/huggingface/lerobot/main/media/{robot}/{arm}_{position}.webp"
+)
+
+# The following positions are provided in nominal degree range ]-180, +180[
+# For more info on these constants, see comments in the code where they get used.
+ZERO_POSITION_DEGREE = 0
+ROTATED_POSITION_DEGREE = 90
+
+
+def assert_drive_mode(drive_mode):
+    # `drive_mode` is in [0, 1], where 0 means the motor keeps its original rotation direction and 1 means it is inverted.
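+    # For example, drive_mode = np.array([0, 1, 0]) means that only the second motor's
+    # rotation direction is inverted.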
+    if not np.all(np.isin(drive_mode, [0, 1])):
+        raise ValueError(f"`drive_mode` contains values other than 0 or 1: ({drive_mode})")
+
+
+def apply_drive_mode(position, drive_mode):
+    assert_drive_mode(drive_mode)
+    # Convert `drive_mode` from [0, 1] with 0 indicates original rotation direction and 1 inverted,
+    # to [-1, 1] with 1 indicates original rotation direction and -1 inverted.
+    signed_drive_mode = -(drive_mode * 2 - 1)
+    position *= signed_drive_mode
+    return position
+
+
+def compute_nearest_rounded_position(position, models):
+    delta_turn = convert_degrees_to_steps(ROTATED_POSITION_DEGREE, models)
+    nearest_pos = np.round(position.astype(float) / delta_turn) * delta_turn
+    return nearest_pos.astype(position.dtype)
+
+
+def run_arm_calibration(arm: MotorsBus, robot_type: str, arm_name: str, arm_type: str):
+    """This function ensures that a neural network trained on data collected on a given robot
+    can work on another robot. For instance, before calibration, setting the same goal position
+    for each motor of two different robots will yield two very different positions. But after calibration,
+    the two robots will move to the same position. To this end, this function computes the homing offset
+    and the drive mode for each motor of a given robot.
+
+    Homing offset is used to shift the motor position to a ]-2048, +2048[ nominal range (when the motor uses 2048 steps
+    to complete half a turn). This range is set around an arbitrary "zero position" corresponding to all motor positions
+    being 0. During the calibration process, you will need to manually move the robot to this "zero position".
+
+    Drive mode is used to invert the rotation direction of the motor. This is useful when some motors have been assembled
+    in the opposite orientation for some robots. During the calibration process, you will need to manually move the robot
+    to the "rotated position".
+
+    After calibration, the homing offsets and drive modes are stored in a cache.
+
+    Example of usage:
+    ```python
+    run_arm_calibration(arm, "koch", "left", "follower")
+    ```
+    """
+    if (arm.read("Torque_Enable") != TorqueMode.DISABLED.value).any():
+        raise ValueError("To run calibration, the torque must be disabled on all motors.")
+
+    print(f"\nRunning calibration of {robot_type} {arm_name} {arm_type}...")
+
+    print("\nMove arm to zero position")
+    print("See: " + URL_TEMPLATE.format(robot=robot_type, arm=arm_type, position="zero"))
+    input("Press Enter to continue...")
+
+    # We arbitrarily chose our zero target position to be a straight horizontal position with gripper upwards and closed.
+    # It is easy to identify and all motors are in a "quarter turn" position. Once calibration is done, this position will
+    # correspond to every motor angle being 0. If you set all 0 as Goal Position, the arm will move to this position.
+    zero_target_pos = convert_degrees_to_steps(ZERO_POSITION_DEGREE, arm.motor_models)
+
+    # Compute homing offset so that `present_position + homing_offset ~= target_position`.
+    zero_pos = arm.read("Present_Position")
+    zero_nearest_pos = compute_nearest_rounded_position(zero_pos, arm.motor_models)
+    homing_offset = zero_target_pos - zero_nearest_pos
+
+    # The rotated target position corresponds to a rotation of a quarter turn from the zero position.
+    # This makes it possible to identify the rotation direction of each motor.
+    # For instance, if the motor rotates 90 degrees, and its value is -90 after applying the homing offset, then we know its rotation direction
+    # is inverted. However, for calibration to be successful, we need everyone to follow the same target position.
+    # Sometimes, there is only one possible rotation direction. For instance, if the gripper is closed, there is only one direction which
+    # corresponds to opening the gripper. When the rotation direction is ambiguous, we arbitrarily rotate clockwise from the point of view
+    # of the previous motor in the kinematic chain.
+    print("\nMove arm to rotated target position")
+    print("See: " + URL_TEMPLATE.format(robot=robot_type, arm=arm_type, position="rotated"))
+    input("Press Enter to continue...")
+
+    rotated_target_pos = convert_degrees_to_steps(ROTATED_POSITION_DEGREE, arm.motor_models)
+
+    # Find drive mode by rotating each motor by a quarter of a turn.
+    # Drive mode indicates if the motor rotation direction should be inverted (=1) or not (=0).
+    rotated_pos = arm.read("Present_Position")
+    drive_mode = (rotated_pos < zero_pos).astype(np.int32)
+
+    # Re-compute homing offset to take into account drive mode
+    rotated_drived_pos = apply_drive_mode(rotated_pos, drive_mode)
+    rotated_nearest_pos = compute_nearest_rounded_position(rotated_drived_pos, arm.motor_models)
+    homing_offset = rotated_target_pos - rotated_nearest_pos
+
+    print("\nMove arm to rest position")
+    print("See: " + URL_TEMPLATE.format(robot=robot_type, arm=arm_type, position="rest"))
+    input("Press Enter to continue...")
+    print()
+
+    # Joints with rotational motions are expressed in degrees in nominal range of [-180, 180]
+    calib_mode = [CalibrationMode.DEGREE.name] * len(arm.motor_names)
+
+    # TODO(rcadene): make type of joints (DEGREE or LINEAR) configurable from yaml?
+    if robot_type == "aloha" and "gripper" in arm.motor_names:
+        # Joints with linear motions (like gripper of Aloha) are expressed in nominal range of [0, 100]
+        calib_idx = arm.motor_names.index("gripper")
+        calib_mode[calib_idx] = CalibrationMode.LINEAR.name
+
+    calib_data = {
+        "homing_offset": homing_offset.tolist(),
+        "drive_mode": drive_mode.tolist(),
+        "start_pos": zero_pos.tolist(),
+        "end_pos": rotated_pos.tolist(),
+        "calib_mode": calib_mode,
+        "motor_names": arm.motor_names,
+    }
+    return calib_data
+
+
+def ensure_safe_goal_position(
+    goal_pos: torch.Tensor, present_pos: torch.Tensor, max_relative_target: float | list[float]
+):
+    # Cap relative action target magnitude for safety.
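+    # e.g. with present_pos = 100, goal_pos = 160 and max_relative_target = 20,
+    # diff = 60 is clamped to 20 and the returned safe goal position is 120.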
+    diff = goal_pos - present_pos
+    max_relative_target = torch.tensor(max_relative_target)
+    safe_diff = torch.minimum(diff, max_relative_target)
+    safe_diff = torch.maximum(safe_diff, -max_relative_target)
+    safe_goal_pos = present_pos + safe_diff
+
+    if not torch.allclose(goal_pos, safe_goal_pos):
+        logging.warning(
+            "Relative goal position magnitude had to be clamped to be safe.\n"
+            f"  requested relative goal position target: {diff}\n"
+            f"    clamped relative goal position target: {safe_diff}"
+        )
+
+    return safe_goal_pos
+
+
+########################################################################
+# Manipulator robot
+########################################################################
+
+
+@dataclass
+class ManipulatorRobotConfig:
+    """
+    Example of usage:
+    ```python
+    ManipulatorRobotConfig()
+    ```
+    """
+
+    # Define all components of the robot
+    robot_type: str | None = None
+    leader_arms: dict[str, MotorsBus] = field(default_factory=lambda: {})
+    follower_arms: dict[str, MotorsBus] = field(default_factory=lambda: {})
+    cameras: dict[str, Camera] = field(default_factory=lambda: {})
+
+    # Optionally limit the magnitude of the relative positional target vector for safety purposes.
+    # Set this to a positive scalar to have the same value for all motors, or a list that is the same length
+    # as the number of motors in your follower arms (assumes all follower arms have the same number of
+    # motors).
+    max_relative_target: list[float] | float | None = None
+
+    # Optionally set the leader arm in torque mode with the gripper motor set to this angle. This makes it
+    # possible to squeeze the gripper and have it spring back to an open position on its own. If None, the
+    # gripper is not put in torque mode.
+    gripper_open_degree: float | None = None
+
+    def __setattr__(self, prop: str, val):
+        if prop == "max_relative_target" and val is not None and isinstance(val, Sequence):
+            for name in self.follower_arms:
+                if len(self.follower_arms[name].motors) != len(val):
+                    raise ValueError(
+                        f"len(max_relative_target)={len(val)} but the follower arm with name {name} has "
+                        f"{len(self.follower_arms[name].motors)} motors. Please make sure that the "
+                        f"`max_relative_target` list has as many parameters as there are motors per arm. "
+                        "Note: This feature does not yet work with robots where different follower arms have "
+                        "different numbers of motors."
+                    )
+        super().__setattr__(prop, val)
+
+
+class ManipulatorRobot:
+    # TODO(rcadene): Implement force feedback
+    """This class allows controlling a manipulator robot with an arbitrary number of motors.
+
+    Non-exhaustive list of robots:
+    - [Koch v1.0](https://github.com/AlexanderKoch-Koch/low_cost_robot), with and without the wrist-to-elbow expansion, developed
+      by Alexander Koch from [Tau Robotics](https://tau-robotics.com)
+    - [Koch v1.1](https://github.com/jess-moss/koch-v1-1) developed by Jess Moss
+    - [Aloha](https://www.trossenrobotics.com/aloha-kits) developed by Trossen Robotics
+
+    Example of highest frequency teleoperation without camera:
+    ```python
+    # Defines how to communicate with the motors of the leader and follower arms
+    leader_arms = {
+        "main": DynamixelMotorsBus(
+            port="/dev/tty.usbmodem575E0031751",
+            motors={
+                # name: (index, model)
+                "shoulder_pan": (1, "xl330-m077"),
+                "shoulder_lift": (2, "xl330-m077"),
+                "elbow_flex": (3, "xl330-m077"),
+                "wrist_flex": (4, "xl330-m077"),
+                "wrist_roll": (5, "xl330-m077"),
+                "gripper": (6, "xl330-m077"),
+            },
+        ),
+    }
+    follower_arms = {
+        "main": DynamixelMotorsBus(
+            port="/dev/tty.usbmodem575E0032081",
+            motors={
+                # name: (index, model)
+                "shoulder_pan": (1, "xl430-w250"),
+                "shoulder_lift": (2, "xl430-w250"),
+                "elbow_flex": (3, "xl330-m288"),
+                "wrist_flex": (4, "xl330-m288"),
+                "wrist_roll": (5, "xl330-m288"),
+                "gripper": (6, "xl330-m288"),
+            },
+        ),
+    }
+    robot = ManipulatorRobot(
+        robot_type="koch",
+        calibration_dir=".cache/calibration/koch",
+        leader_arms=leader_arms,
+        follower_arms=follower_arms,
+    )
+
+    # Connect motor buses and cameras, if any (required)
+    robot.connect()
+
+    while True:
+        robot.teleop_step()
+    ```
+
+    Example of highest frequency data collection without camera:
+    ```python
+    # Assumes leader and follower arms have been instantiated already (see first example)
+    robot = ManipulatorRobot(
+        robot_type="koch",
+        calibration_dir=".cache/calibration/koch",
+        leader_arms=leader_arms,
+        follower_arms=follower_arms,
+    )
+    robot.connect()
+    while True:
+        observation, action = robot.teleop_step(record_data=True)
+    ```
+
+    Example of highest frequency data collection with cameras:
+    ```python
+    # Defines how to communicate with 2 cameras connected to the computer.
+    # Here, the webcam of the laptop and the phone (connected via USB to the laptop)
+    # can be reached respectively using the camera indices 0 and 1. These indices can be
+    # arbitrary. See the documentation of `OpenCVCamera` to find your own camera indices.
+ cameras = { + "laptop": OpenCVCamera(camera_index=0, fps=30, width=640, height=480), + "phone": OpenCVCamera(camera_index=1, fps=30, width=640, height=480), + } + + # Assumes leader and follower arms have been instantiated already (see first example) + robot = ManipulatorRobot( + robot_type="koch", + calibration_dir=".cache/calibration/koch", + leader_arms=leader_arms, + follower_arms=follower_arms, + cameras=cameras, + ) + robot.connect() + while True: + observation, action = robot.teleop_step(record_data=True) + ``` + + Example of controlling the robot with a policy (without running multiple policies in parallel to ensure highest frequency): + ```python + # Assumes leader and follower arms + cameras have been instantiated already (see previous example) + robot = ManipulatorRobot( + robot_type="koch", + calibration_dir=".cache/calibration/koch", + leader_arms=leader_arms, + follower_arms=follower_arms, + cameras=cameras, + ) + robot.connect() + while True: + # Uses the follower arms and cameras to capture an observation + observation = robot.capture_observation() + + # Assumes a policy has been instantiated + with torch.inference_mode(): + action = policy.select_action(observation) + + # Orders the robot to move + robot.send_action(action) + ``` + + Example of disconnecting which is not mandatory since we disconnect when the object is deleted: + ```python + robot.disconnect() + ``` + """ + + def __init__( + self, + config: ManipulatorRobotConfig | None = None, + calibration_dir: Path = ".cache/calibration/koch", + **kwargs, + ): + if config is None: + config = ManipulatorRobotConfig() + # Overwrite config arguments using kwargs + self.config = replace(config, **kwargs) + self.calibration_dir = Path(calibration_dir) + + self.robot_type = self.config.robot_type + self.leader_arms = self.config.leader_arms + self.follower_arms = self.config.follower_arms + self.cameras = self.config.cameras + self.is_connected = False + self.logs = {} + + def connect(self): + if self.is_connected: + raise RobotDeviceAlreadyConnectedError( + "ManipulatorRobot is already connected. Do not run `robot.connect()` twice." + ) + + if not self.leader_arms and not self.follower_arms and not self.cameras: + raise ValueError( + "ManipulatorRobot doesn't have any device to connect. See example of usage in docstring of the class." + ) + + # Connect the arms + for name in self.follower_arms: + print(f"Connecting {name} follower arm.") + self.follower_arms[name].connect() + print(f"Connecting {name} leader arm.") + self.leader_arms[name].connect() + + # We assume that at connection time, arms are in a rest position, and torque can + # be safely disabled to run calibration and/or set robot preset configurations. + for name in self.follower_arms: + self.follower_arms[name].write("Torque_Enable", TorqueMode.DISABLED.value) + for name in self.leader_arms: + self.leader_arms[name].write("Torque_Enable", TorqueMode.DISABLED.value) + + self.activate_calibration() + + # Set robot preset (e.g. 
torque in leader gripper for Koch v1.1)
+        if self.robot_type == "koch":
+            self.set_koch_robot_preset()
+        elif self.robot_type == "aloha":
+            self.set_aloha_robot_preset()
+        else:
+            warnings.warn(f"No preset found for robot type: {self.robot_type}", stacklevel=1)
+
+        # Enable torque on all motors of the follower arms
+        for name in self.follower_arms:
+            print(f"Activating torque on {name} follower arm.")
+            self.follower_arms[name].write("Torque_Enable", 1)
+
+        if self.config.gripper_open_degree is not None:
+            # Set the leader arm in torque mode with the gripper motor set to an angle. This makes it possible
+            # to squeeze the gripper and have it spring back to an open position on its own.
+            for name in self.leader_arms:
+                self.leader_arms[name].write("Torque_Enable", 1, "gripper")
+                self.leader_arms[name].write("Goal_Position", self.config.gripper_open_degree, "gripper")
+
+        # Connect the cameras
+        for name in self.cameras:
+            self.cameras[name].connect()
+
+        self.is_connected = True
+
+    def activate_calibration(self):
+        """After calibration, all motors function in human-interpretable ranges.
+
+        Rotations are expressed in degrees in a nominal range of [-180, 180], and linear motions
+        (like the gripper of Aloha) in a nominal range of [0, 100].
+        """
+
+        def load_or_run_calibration_(name, arm, arm_type):
+            arm_id = get_arm_id(name, arm_type)
+            arm_calib_path = self.calibration_dir / f"{arm_id}.json"
+
+            if arm_calib_path.exists():
+                with open(arm_calib_path) as f:
+                    calibration = json.load(f)
+            else:
+                print(f"Missing calibration file '{arm_calib_path}'")
+                calibration = run_arm_calibration(arm, self.robot_type, name, arm_type)
+
+                print(f"Calibration is done! Saving calibration file '{arm_calib_path}'")
+                arm_calib_path.parent.mkdir(parents=True, exist_ok=True)
+                with open(arm_calib_path, "w") as f:
+                    json.dump(calibration, f)
+
+            return calibration
+
+        for name, arm in self.follower_arms.items():
+            calibration = load_or_run_calibration_(name, arm, "follower")
+            arm.set_calibration(calibration)
+        for name, arm in self.leader_arms.items():
+            calibration = load_or_run_calibration_(name, arm, "leader")
+            arm.set_calibration(calibration)
+
+    def set_koch_robot_preset(self):
+        def set_operating_mode_(arm):
+            if (arm.read("Torque_Enable") != TorqueMode.DISABLED.value).any():
+                raise ValueError("To set the robot preset, torque must be disabled on all motors.")
+
+            # Use 'extended position mode' for all motors except the gripper, because in joint mode the
+            # servos can't rotate more than 360 degrees (from 0 to 4095). Also, some mistakes can happen
+            # while assembling the arm, and you could end up with a servo at position 0 or 4095 at a
+            # crucial point. See https://emanual.robotis.com/docs/en/dxl/x/x_series/#operating-mode11
+            all_motors_except_gripper = [name for name in arm.motor_names if name != "gripper"]
+            if len(all_motors_except_gripper) > 0:
+                # 4 corresponds to Extended Position on Koch motors
+                arm.write("Operating_Mode", 4, all_motors_except_gripper)
+
+            # Use 'position control current based' for the gripper so that it is limited by its current
+            # limit. For the follower gripper, this means it can grasp an object without forcing too much,
+            # even though its goal position is a complete grasp (both gripper fingers are ordered to join
+            # and reach a touch). For the leader gripper, it means we can use it as a physical trigger,
+            # since we can force it to move with a finger, and it will move back to its original target
+            # position when we release the force.
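+            # (In current-based position control, the servo still tracks its goal position, but its
+            # torque saturates at the configured current limit, which is what produces this
+            # spring-like behavior.)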
+            # 5 corresponds to Current Controlled Position on Koch gripper motors "xl330-m077, xl330-m288"
+            arm.write("Operating_Mode", 5, "gripper")
+
+        for name in self.follower_arms:
+            set_operating_mode_(self.follower_arms[name])
+
+            # Set better PID values to close the gap between recorded states and actions
+            # TODO(rcadene): Implement an automatic procedure to set optimal PID values for each motor
+            self.follower_arms[name].write("Position_P_Gain", 1500, "elbow_flex")
+            self.follower_arms[name].write("Position_I_Gain", 0, "elbow_flex")
+            self.follower_arms[name].write("Position_D_Gain", 600, "elbow_flex")
+
+        if self.config.gripper_open_degree is not None:
+            for name in self.leader_arms:
+                set_operating_mode_(self.leader_arms[name])
+
+                # Enable torque on the gripper of the leader arms, and move it to 45 degrees,
+                # so that we can use it as a trigger to close the gripper of the follower arms.
+                self.leader_arms[name].write("Torque_Enable", 1, "gripper")
+                self.leader_arms[name].write("Goal_Position", self.config.gripper_open_degree, "gripper")
+
+    def set_aloha_robot_preset(self):
+        def set_shadow_(arm):
+            # Set secondary/shadow ID for shoulder and elbow. These joints have two motors.
+            # As a result, if only one of them is required to move to a certain position,
+            # the other will follow. This is to avoid breaking the motors.
+            if "shoulder_shadow" in arm.motor_names:
+                shoulder_idx = arm.read("ID", "shoulder")
+                arm.write("Secondary_ID", shoulder_idx, "shoulder_shadow")
+
+            if "elbow_shadow" in arm.motor_names:
+                elbow_idx = arm.read("ID", "elbow")
+                arm.write("Secondary_ID", elbow_idx, "elbow_shadow")
+
+        for name in self.follower_arms:
+            set_shadow_(self.follower_arms[name])
+
+        for name in self.leader_arms:
+            set_shadow_(self.leader_arms[name])
+
+        for name in self.follower_arms:
+            # Set a velocity limit of 131 as advised by Trossen Robotics
+            self.follower_arms[name].write("Velocity_Limit", 131)
+
+            # Use 'extended position mode' for all motors except the gripper, because in joint mode the
+            # servos can't rotate more than 360 degrees (from 0 to 4095). Also, some mistakes can happen
+            # while assembling the arm, and you could end up with a servo at position 0 or 4095 at a
+            # crucial point. See https://emanual.robotis.com/docs/en/dxl/x/x_series/#operating-mode11
+            all_motors_except_gripper = [
+                name for name in self.follower_arms[name].motor_names if name != "gripper"
+            ]
+            if len(all_motors_except_gripper) > 0:
+                # 4 corresponds to Extended Position on Aloha motors
+                self.follower_arms[name].write("Operating_Mode", 4, all_motors_except_gripper)
+
+            # Use 'position control current based' for the follower gripper so that it is limited by its
+            # current limit. It can grasp an object without forcing too much, even though its goal
+            # position is a complete grasp (both gripper fingers are ordered to join and reach a touch).
+            # 5 corresponds to Current Controlled Position on the Aloha follower gripper "xm430-w350"
+            self.follower_arms[name].write("Operating_Mode", 5, "gripper")
+
+            # Note: We can't enable torque on the leader gripper since "xc430-w150" doesn't have
+            # a Current Controlled Position mode.
+
+        if self.config.gripper_open_degree is not None:
+            warnings.warn(
+                f"`gripper_open_degree` is set to {self.config.gripper_open_degree}, but None is expected for Aloha",
+                stacklevel=1,
+            )
+
+    def teleop_step(
+        self, record_data=False
+    ) -> None | tuple[dict[str, torch.Tensor], dict[str, torch.Tensor]]:
+        if not self.is_connected:
+            raise RobotDeviceNotConnectedError(
+                "ManipulatorRobot is not connected. You need to run `robot.connect()`."
+            )
+
+        # Prepare to assign the position of the leader to the follower
+        leader_pos = {}
+        for name in self.leader_arms:
+            before_lread_t = time.perf_counter()
+            leader_pos[name] = self.leader_arms[name].read("Present_Position")
+            leader_pos[name] = torch.from_numpy(leader_pos[name])
+            self.logs[f"read_leader_{name}_pos_dt_s"] = time.perf_counter() - before_lread_t
+
+        # Send goal position to the follower
+        follower_goal_pos = {}
+        for name in self.follower_arms:
+            before_fwrite_t = time.perf_counter()
+            goal_pos = leader_pos[name]
+
+            # Cap goal position when too far away from present position.
+            # A slower fps is expected due to reading from the follower.
+            if self.config.max_relative_target is not None:
+                present_pos = self.follower_arms[name].read("Present_Position")
+                present_pos = torch.from_numpy(present_pos)
+                goal_pos = ensure_safe_goal_position(goal_pos, present_pos, self.config.max_relative_target)
+
+            # Used when record_data=True
+            follower_goal_pos[name] = goal_pos
+
+            goal_pos = goal_pos.numpy().astype(np.int32)
+            self.follower_arms[name].write("Goal_Position", goal_pos)
+            self.logs[f"write_follower_{name}_goal_pos_dt_s"] = time.perf_counter() - before_fwrite_t
+
+        # Early exit when recording data is not requested
+        if not record_data:
+            return
+
+        # TODO(rcadene): Add velocity and other info
+        # Read follower position
+        follower_pos = {}
+        for name in self.follower_arms:
+            before_fread_t = time.perf_counter()
+            follower_pos[name] = self.follower_arms[name].read("Present_Position")
+            follower_pos[name] = torch.from_numpy(follower_pos[name])
+            self.logs[f"read_follower_{name}_pos_dt_s"] = time.perf_counter() - before_fread_t
+
+        # Create state by concatenating follower current position
+        state = []
+        for name in self.follower_arms:
+            if name in follower_pos:
+                state.append(follower_pos[name])
+        state = torch.cat(state)
+
+        # Create action by concatenating follower goal position
+        action = []
+        for name in self.follower_arms:
+            if name in follower_goal_pos:
+                action.append(follower_goal_pos[name])
+        action = torch.cat(action)
+
+        # Capture images from cameras
+        images = {}
+        for name in self.cameras:
+            before_camread_t = time.perf_counter()
+            images[name] = self.cameras[name].async_read()
+            images[name] = torch.from_numpy(images[name])
+            self.logs[f"read_camera_{name}_dt_s"] = self.cameras[name].logs["delta_timestamp_s"]
+            self.logs[f"async_read_camera_{name}_dt_s"] = time.perf_counter() - before_camread_t
+
+        # Populate output dictionaries
+        obs_dict, action_dict = {}, {}
+        obs_dict["observation.state"] = state
+        action_dict["action"] = action
+        for name in self.cameras:
+            obs_dict[f"observation.images.{name}"] = images[name]
+
+        return obs_dict, action_dict
+
+    def capture_observation(self):
+        """The returned observations do not have a batch dimension."""
+        if not self.is_connected:
+            raise RobotDeviceNotConnectedError(
+                "ManipulatorRobot is not connected. You need to run `robot.connect()`."
+            )
+
+        # Read follower position
+        follower_pos = {}
+        for name in self.follower_arms:
+            before_fread_t = time.perf_counter()
+            follower_pos[name] = self.follower_arms[name].read("Present_Position")
+            follower_pos[name] = torch.from_numpy(follower_pos[name])
+            self.logs[f"read_follower_{name}_pos_dt_s"] = time.perf_counter() - before_fread_t
+
+        # Create state by concatenating follower current position
+        state = []
+        for name in self.follower_arms:
+            if name in follower_pos:
+                state.append(follower_pos[name])
+        state = torch.cat(state)
+
+        # Capture images from cameras
+        images = {}
+        for name in self.cameras:
+            before_camread_t = time.perf_counter()
+            images[name] = self.cameras[name].async_read()
+            images[name] = torch.from_numpy(images[name])
+            self.logs[f"read_camera_{name}_dt_s"] = self.cameras[name].logs["delta_timestamp_s"]
+            self.logs[f"async_read_camera_{name}_dt_s"] = time.perf_counter() - before_camread_t
+
+        # Populate output dictionaries and format to pytorch
+        obs_dict = {}
+        obs_dict["observation.state"] = state
+        for name in self.cameras:
+            obs_dict[f"observation.images.{name}"] = images[name]
+        return obs_dict
+
+    def send_action(self, action: torch.Tensor) -> torch.Tensor:
+        """Command the follower arms to move to a target joint configuration.
+
+        The relative action magnitude may be clipped depending on the configuration parameter
+        `max_relative_target`. In this case, the action sent differs from the original action.
+        Thus, this function always returns the action actually sent.
+
+        Args:
+            action: tensor containing the concatenated goal positions for the follower arms.
+        """
+        if not self.is_connected:
+            raise RobotDeviceNotConnectedError(
+                "ManipulatorRobot is not connected. You need to run `robot.connect()`."
+            )
+
+        from_idx = 0
+        to_idx = 0
+        action_sent = []
+        for name in self.follower_arms:
+            # Get the goal position of each follower arm by splitting the action vector
+            to_idx += len(self.follower_arms[name].motor_names)
+            goal_pos = action[from_idx:to_idx]
+            from_idx = to_idx
+
+            # Cap goal position when too far away from present position.
+            # A slower fps is expected due to reading from the follower.
+            if self.config.max_relative_target is not None:
+                present_pos = self.follower_arms[name].read("Present_Position")
+                present_pos = torch.from_numpy(present_pos)
+                goal_pos = ensure_safe_goal_position(goal_pos, present_pos, self.config.max_relative_target)
+
+            # Save tensor to concat and return
+            action_sent.append(goal_pos)
+
+            # Send goal position to each follower
+            goal_pos = goal_pos.numpy().astype(np.int32)
+            self.follower_arms[name].write("Goal_Position", goal_pos)
+
+        return torch.cat(action_sent)
+
+    def disconnect(self):
+        if not self.is_connected:
+            raise RobotDeviceNotConnectedError(
+                "ManipulatorRobot is not connected. You need to run `robot.connect()` before disconnecting."
+            )
+
+        for name in self.follower_arms:
+            self.follower_arms[name].disconnect()
+
+        for name in self.leader_arms:
+            self.leader_arms[name].disconnect()
+
+        for name in self.cameras:
+            self.cameras[name].disconnect()
+
+        self.is_connected = False
+
+    def __del__(self):
+        if getattr(self, "is_connected", False):
+            self.disconnect()
diff --git a/lerobot/configs/robot/aloha.yaml b/lerobot/configs/robot/aloha.yaml
new file mode 100644
index 000000000..938fa2e3d
--- /dev/null
+++ b/lerobot/configs/robot/aloha.yaml
@@ -0,0 +1,115 @@
+# Aloha: A Low-Cost Hardware for Bimanual Teleoperation
+# https://aloha-2.github.io
+# https://www.trossenrobotics.com/aloha-stationary

+# Requires installing extras packages
+# With pip: `pip install -e ".[dynamixel intelrealsense]"`
+# With poetry: `poetry install --sync --extras "dynamixel intelrealsense"`

+_target_: lerobot.common.robot_devices.robots.manipulator.ManipulatorRobot
+robot_type: aloha
+# Specific to Aloha, LeRobot comes with default calibration files. Assuming the motors have been
+# properly assembled, no manual calibration step is expected. If you need to run manual calibration,
+# simply update this path to ".cache/calibration/aloha"
+calibration_dir: .cache/calibration/aloha_default
+
+# /!\ FOR SAFETY, READ THIS /!\
+# `max_relative_target` limits the magnitude of the relative positional target vector for safety purposes.
+# Set this to a positive scalar to have the same value for all motors, or a list that is the same length as
+# the number of motors in your follower arms.
+# For Aloha, for every goal position request, motor rotations are capped at 5 degrees by default.
+# When you feel more confident with teleoperation or running the policy, you can extend
+# this safety limit and even remove it by setting it to `null`.
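+# As an illustration, a per-motor cap for the 9 motors of each follower arm could be written as
+# `max_relative_target: [5, 5, 5, 5, 5, 5, 5, 5, 5]` (hypothetical values).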
+# Also, everything is expected to work safely out-of-the-box, but we highly advise first
+# teleoperating the grippers only (by commenting out the rest of the motors in this yaml),
+# then gradually adding more motors (by uncommenting), until you can teleoperate both arms fully.
+max_relative_target: 5
+
+leader_arms:
+  left:
+    _target_: lerobot.common.robot_devices.motors.dynamixel.DynamixelMotorsBus
+    port: /dev/ttyDXL_leader_left
+    motors:
+      # name: (index, model)
+      waist: [1, xm430-w350]
+      shoulder: [2, xm430-w350]
+      shoulder_shadow: [3, xm430-w350]
+      elbow: [4, xm430-w350]
+      elbow_shadow: [5, xm430-w350]
+      forearm_roll: [6, xm430-w350]
+      wrist_angle: [7, xm430-w350]
+      wrist_rotate: [8, xl430-w250]
+      gripper: [9, xc430-w150]
+  right:
+    _target_: lerobot.common.robot_devices.motors.dynamixel.DynamixelMotorsBus
+    port: /dev/ttyDXL_leader_right
+    motors:
+      # name: (index, model)
+      waist: [1, xm430-w350]
+      shoulder: [2, xm430-w350]
+      shoulder_shadow: [3, xm430-w350]
+      elbow: [4, xm430-w350]
+      elbow_shadow: [5, xm430-w350]
+      forearm_roll: [6, xm430-w350]
+      wrist_angle: [7, xm430-w350]
+      wrist_rotate: [8, xl430-w250]
+      gripper: [9, xc430-w150]
+
+follower_arms:
+  left:
+    _target_: lerobot.common.robot_devices.motors.dynamixel.DynamixelMotorsBus
+    port: /dev/ttyDXL_follower_left
+    motors:
+      # name: [index, model]
+      waist: [1, xm540-w270]
+      shoulder: [2, xm540-w270]
+      shoulder_shadow: [3, xm540-w270]
+      elbow: [4, xm540-w270]
+      elbow_shadow: [5, xm540-w270]
+      forearm_roll: [6, xm540-w270]
+      wrist_angle: [7, xm540-w270]
+      wrist_rotate: [8, xm430-w350]
+      gripper: [9, xm430-w350]
+  right:
+    _target_: lerobot.common.robot_devices.motors.dynamixel.DynamixelMotorsBus
+    port: /dev/ttyDXL_follower_right
+    motors:
+      # name: [index, model]
+      waist: [1, xm540-w270]
+      shoulder: [2, xm540-w270]
+      shoulder_shadow: [3, xm540-w270]
+      elbow: [4, xm540-w270]
+      elbow_shadow: [5, xm540-w270]
+      forearm_roll: [6, xm540-w270]
+      wrist_angle: [7, xm540-w270]
+      wrist_rotate: [8, xm430-w350]
+      gripper: [9, xm430-w350]
+
+# Troubleshooting: If one of your IntelRealSense cameras freezes during
+# data recording due to a bandwidth limit, you might need to plug the camera
+# into another USB hub or PCIe card.
+cameras:
+  cam_high:
+    _target_: lerobot.common.robot_devices.cameras.intelrealsense.IntelRealSenseCamera
+    camera_index: 128422271347
+    fps: 30
+    width: 640
+    height: 480
+  cam_low:
+    _target_: lerobot.common.robot_devices.cameras.intelrealsense.IntelRealSenseCamera
+    camera_index: 130322270656
+    fps: 30
+    width: 640
+    height: 480
+  cam_left_wrist:
+    _target_: lerobot.common.robot_devices.cameras.intelrealsense.IntelRealSenseCamera
+    camera_index: 218622272670
+    fps: 30
+    width: 640
+    height: 480
+  cam_right_wrist:
+    _target_: lerobot.common.robot_devices.cameras.intelrealsense.IntelRealSenseCamera
+    camera_index: 130322272300
+    fps: 30
+    width: 640
+    height: 480
diff --git a/lerobot/configs/robot/koch_bimanual.yaml b/lerobot/configs/robot/koch_bimanual.yaml
new file mode 100644
index 000000000..7f8138675
--- /dev/null
+++ b/lerobot/configs/robot/koch_bimanual.yaml
@@ -0,0 +1,75 @@
+_target_: lerobot.common.robot_devices.robots.manipulator.ManipulatorRobot
+robot_type: koch
+calibration_dir: .cache/calibration/koch_bimanual
+
+# `max_relative_target` limits the magnitude of the relative positional target vector for safety purposes.
+# Set this to a positive scalar to have the same value for all motors, or a list that is the same length as
+# the number of motors in your follower arms.
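+# For example, `max_relative_target: 10` would cap every motor at 10 degrees per goal position
+# request (illustrative value); `null` disables the cap entirely.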
+max_relative_target: null + +leader_arms: + left: + _target_: lerobot.common.robot_devices.motors.dynamixel.DynamixelMotorsBus + port: /dev/tty.usbmodem585A0085511 + motors: + # name: (index, model) + shoulder_pan: [1, "xl330-m077"] + shoulder_lift: [2, "xl330-m077"] + elbow_flex: [3, "xl330-m077"] + wrist_flex: [4, "xl330-m077"] + wrist_roll: [5, "xl330-m077"] + gripper: [6, "xl330-m077"] + right: + _target_: lerobot.common.robot_devices.motors.dynamixel.DynamixelMotorsBus + port: /dev/tty.usbmodem575E0031751 + motors: + # name: (index, model) + shoulder_pan: [1, "xl330-m077"] + shoulder_lift: [2, "xl330-m077"] + elbow_flex: [3, "xl330-m077"] + wrist_flex: [4, "xl330-m077"] + wrist_roll: [5, "xl330-m077"] + gripper: [6, "xl330-m077"] + +follower_arms: + left: + _target_: lerobot.common.robot_devices.motors.dynamixel.DynamixelMotorsBus + port: /dev/tty.usbmodem585A0076891 + motors: + # name: (index, model) + shoulder_pan: [1, "xl430-w250"] + shoulder_lift: [2, "xl430-w250"] + elbow_flex: [3, "xl330-m288"] + wrist_flex: [4, "xl330-m288"] + wrist_roll: [5, "xl330-m288"] + gripper: [6, "xl330-m288"] + right: + _target_: lerobot.common.robot_devices.motors.dynamixel.DynamixelMotorsBus + port: /dev/tty.usbmodem575E0032081 + motors: + # name: (index, model) + shoulder_pan: [1, "xl430-w250"] + shoulder_lift: [2, "xl430-w250"] + elbow_flex: [3, "xl330-m288"] + wrist_flex: [4, "xl330-m288"] + wrist_roll: [5, "xl330-m288"] + gripper: [6, "xl330-m288"] + +cameras: + laptop: + _target_: lerobot.common.robot_devices.cameras.opencv.OpenCVCamera + camera_index: 0 + fps: 30 + width: 640 + height: 480 + phone: + _target_: lerobot.common.robot_devices.cameras.opencv.OpenCVCamera + camera_index: 1 + fps: 30 + width: 640 + height: 480 + +# ~ Koch specific settings ~ +# Sets the leader arm in torque mode with the gripper motor set to this angle. This makes it possible +# to squeeze the gripper and have it spring back to an open position on its own. +gripper_open_degree: 35.156 From f455f1b0909efe6748dbec99991eeaa0858c2dde Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Tue, 17 Sep 2024 11:05:30 +0100 Subject: [PATCH 04/29] backup wip --- lerobot/common/datasets/online_buffer.py | 81 +++++++++++++++++++----- lerobot/scripts/train.py | 5 +- 2 files changed, 68 insertions(+), 18 deletions(-) diff --git a/lerobot/common/datasets/online_buffer.py b/lerobot/common/datasets/online_buffer.py index 9bc0580fb..b6395e48f 100644 --- a/lerobot/common/datasets/online_buffer.py +++ b/lerobot/common/datasets/online_buffer.py @@ -25,10 +25,13 @@ import numpy as np import torch -from lerobot.common.datasets.lerobot_dataset import LeRobotDataset -from lerobot.common.datasets.video_utils import VideoFrame, load_from_videos +from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION, LeRobotDataset +from lerobot.common.datasets.utils import load_hf_dataset, load_info, load_videos +from lerobot.common.datasets.video_utils import VideoFrame, decode_video_frames_torchvision, load_from_videos -MAX_VIDEO_PATH_LENGTH = 100 # TODO(now): Move somewhere more appropriate +# TODO(alexander-soare): Move somewhere more appropriate once the DataBuffer class permeates more of the coe +# base. +MAX_VIDEO_PATH_LENGTH = 100 def _make_memmap_safe(**kwargs) -> np.memmap: @@ -111,9 +114,12 @@ def __init__( internally, so you don't need to include them. buffer_capacity: How many frames should be stored in the buffer as a maximum. Be aware of your system's available disk space when choosing this. 
- image_transform: TODO(now) - delta_timestamps: TODO(now) - fps: TODO(now) + image_transform: Transforms to apply in the item getter to all image data (any data whose key + starts with "observation.image"). + delta_timestamps: TODO(alexander-soare): Document this somewhere when + `load_previous_and_future_frames` is refactored. + fps: TODO(alexander-soare): Document this somewhere when `load_previous_and_future_frames` is + refactored. """ if (delta_timestamps is None) ^ (fps is None): @@ -384,7 +390,7 @@ def __getitem__(self, idx: int) -> dict[str, torch.Tensor]: else: query_ts = current_ts + self.delta_timestamps[k] item_[k] = [ - {"path": item[k][i].decode(), "timestamp": video_timestamps[k][i]} + {"path": item[k][i].decode(), "timestamp": float(video_timestamps[k][i])} for i in range(len(item[k])) ] item = load_from_videos( @@ -402,7 +408,20 @@ def __getitem__(self, idx: int) -> dict[str, torch.Tensor]: return self._item_to_tensors(item) @classmethod - def from_hf_dataset(cls, hf_dataset: datasets.Dataset, **kwargs) -> "DataBuffer": + def from_hf_dataset( + cls, + repo_id: str, + decode_video: bool = False, + **kwargs, + ) -> "DataBuffer": + hf_dataset = load_hf_dataset(repo_id, version=CODEBASE_VERSION, root=None, split="train") + lerobot_dataset_info = load_info(repo_id, version=CODEBASE_VERSION, root=None) + is_video_dataset = lerobot_dataset_info.get("video", False) + if not is_video_dataset and decode_video: + raise ValueError(f"The provided dataset is not a video dataset but you have {decode_video=}") + if is_video_dataset: + videos_path = load_videos(repo_id, version=CODEBASE_VERSION, root=None) + data_spec = {} video_frame_keys = [] for k, feature in hf_dataset.features.items(): @@ -412,11 +431,20 @@ def from_hf_dataset(cls, hf_dataset: datasets.Dataset, **kwargs) -> "DataBuffer" example_img = np.array(hf_dataset[0][k]) data_spec[k] = {"shape": example_img.shape, "dtype": np.dtype("uint8")} elif isinstance(feature, VideoFrame): - video_frame_keys.append(k) - data_spec[k] = { - "shape": (), - "dtype": np.dtype(f"S{MAX_VIDEO_PATH_LENGTH}"), - } + if decode_video: + video_dct = hf_dataset[0][k] + example_img = decode_video_frames_torchvision( + videos_path.parent / video_dct["path"], + [video_dct["timestamp"]], + 1 / lerobot_dataset_info["fps"] - 1e-4, + )[0] + data_spec[k] = {"shape": example_img.shape, "dtype": np.dtype("uint8")} + else: + video_frame_keys.append(k) + data_spec[k] = { + "shape": (), + "dtype": np.dtype(f"S{MAX_VIDEO_PATH_LENGTH}"), + } elif isinstance(feature, datasets.features.Sequence): data_spec[k] = {"shape": (feature.length,), "dtype": np.dtype(feature.feature.dtype)} elif isinstance(feature, datasets.features.Value): @@ -435,9 +463,25 @@ def from_hf_dataset(cls, hf_dataset: datasets.Dataset, **kwargs) -> "DataBuffer" [np.array(pil_img).astype(np.float32) / 255 for pil_img in hf_dataset[k]] ) elif isinstance(feature, VideoFrame): - data_dict[k] = np.stack( - [np.array(dct["path"], dtype=f"S{MAX_VIDEO_PATH_LENGTH}") for dct in hf_dataset[k]] - ) + if decode_video: + # Decode all videos into images. 
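+                    # (Each video file stores a single episode, so one decode call per episode
+                    # suffices; the file path is read from that episode's first frame.)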
+ episode_indices = np.array(hf_dataset["episode_index"]) + timestamps = np.array(hf_dataset["timestamp"]) + all_imgs = [] + for episode_index in np.unique(episode_indices): + episode_data_indices = np.where(episode_indices == episode_index)[0] + episode_timestamps = timestamps[episode_indices == episode_index] + episode_imgs = decode_video_frames_torchvision( + videos_path.parent / hf_dataset[k][episode_data_indices[0]]["path"], + episode_timestamps, + 1 / lerobot_dataset_info["fps"] - 1e-4, + ) + all_imgs.extend(episode_imgs.numpy()) + data_dict[k] = np.stack(all_imgs) + else: + data_dict[k] = np.stack( + [np.array(dct["path"], dtype=f"S{MAX_VIDEO_PATH_LENGTH}") for dct in hf_dataset[k]] + ) else: data_dict[k] = np.array(hf_dataset[k]) obj.add_data(data_dict) @@ -445,8 +489,13 @@ def from_hf_dataset(cls, hf_dataset: datasets.Dataset, **kwargs) -> "DataBuffer" if len(video_frame_keys) > 0: obj.video = True # TODO(now): HACK obj.video_frame_keys = video_frame_keys # TODO(now): HACK + # Symlink videos if needed. + obj.videos_dir = kwargs["storage_dir"] / "videos" + if not obj.videos_dir.exists(): + os.symlink(videos_path.absolute(), kwargs["storage_dir"]) obj.videos_dir = Path(kwargs["storage_dir"]) / "videos" # TODO(now): HACK obj.video_backend = "pyav" # TODO(now): HACK + return obj diff --git a/lerobot/scripts/train.py b/lerobot/scripts/train.py index 28c4e8737..a3ea06ead 100644 --- a/lerobot/scripts/train.py +++ b/lerobot/scripts/train.py @@ -406,10 +406,11 @@ def evaluate_and_checkpoint_if_needed(step, is_online): if cfg.get("use_lerobot_data_buffer", False): offline_dataset_for_dataloader = DataBuffer.from_hf_dataset( - offline_dataset.hf_dataset, - storage_dir=f"/tmp/{offline_dataset.repo_id}", + cfg.dataset_repo_id, + storage_dir=Path(f"/tmp/{offline_dataset.repo_id}"), fps=offline_dataset.fps, delta_timestamps=offline_dataset.delta_timestamps, + decode_video=offline_dataset.video, ) else: offline_dataset_for_dataloader = offline_dataset From e6864f0a414f56a4eec15a94142e5837e98fe0f3 Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Tue, 17 Sep 2024 17:45:39 +0100 Subject: [PATCH 05/29] backup wip --- lerobot/common/datasets/online_buffer.py | 587 ----------------------- lerobot/scripts/eval.py | 1 + lerobot/scripts/train.py | 6 +- tests/test_online_buffer.py | 320 ------------ tests/utils.py | 15 + 5 files changed, 19 insertions(+), 910 deletions(-) delete mode 100644 lerobot/common/datasets/online_buffer.py delete mode 100644 tests/test_online_buffer.py diff --git a/lerobot/common/datasets/online_buffer.py b/lerobot/common/datasets/online_buffer.py deleted file mode 100644 index b6395e48f..000000000 --- a/lerobot/common/datasets/online_buffer.py +++ /dev/null @@ -1,587 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""A data buffer for efficient data management during offline and online training.""" - -import logging -import os -from itertools import chain -from pathlib import Path -from typing import Any, Callable - -import datasets -import numpy as np -import torch - -from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION, LeRobotDataset -from lerobot.common.datasets.utils import load_hf_dataset, load_info, load_videos -from lerobot.common.datasets.video_utils import VideoFrame, decode_video_frames_torchvision, load_from_videos - -# TODO(alexander-soare): Move somewhere more appropriate once the DataBuffer class permeates more of the coe -# base. -MAX_VIDEO_PATH_LENGTH = 100 - - -def _make_memmap_safe(**kwargs) -> np.memmap: - """Make a numpy memmap with checks on available disk space first. - - Expected kwargs are: "filename", "dtype" (must by np.dtype), "mode" and "shape" - - For information on dtypes: - https://numpy.org/doc/stable/reference/arrays.dtypes.html#arrays-dtypes-constructing - """ - if kwargs["mode"].startswith("w"): - required_space = kwargs["dtype"].itemsize * np.prod(kwargs["shape"]) # bytes - stats = os.statvfs(Path(kwargs["filename"]).parent) - available_space = stats.f_bavail * stats.f_frsize # bytes - if required_space >= available_space * 0.8: - raise RuntimeError( - f"You're about to take up {required_space} of {available_space} bytes available. This " - "exception has been raised to protect your storage device." - "" - ) - return np.memmap(**kwargs) - - -class DataBuffer(torch.utils.data.Dataset): - """Data buffer for efficient dataset management during offline and online training. - - Data is considered to come in the form of "episodes" (an instance of a robot performing a task). Episodes - are made up of "frames", which are chronoligically ordered and contain timestamp aligned data, potentially - including environment observations, and robot actions. NOTE: for the time being, we require all data - modalities are timestamp aligned. This constraint may be relaxed in the future. - - The data is stored in a mapping from data keys to arrays with shape (total_number_of_frames, *data_dim). - The compulsory data keys are: - - "index": A sequential integer index per frame. - - "episode_index": A sequential integer index per episode. - - "frame_index": A sequential integer index per frame within an episode (it resets for each episode). - - "timestamp": The relative timestamp of the frame within the episode in units of seconds. The choice. - of reference time is not important. - - The `add_data` and `add_episodes` functions can be used to insert more data in the form of integral - episodes (starting from frame 0 and with the frames ordered). The buffer has a compulsory size limit, - which must be provided. Data is inserted in a circular fashion, inserting after the most recently added - frame, and wrapping around to the start when necessary (in which case older episodes are overwritten). - - This class is also a PyTorch Dataset and can be used as such in a dataloader for a training loop. The item - getter returns either a single from, or a slice of a single episode based on the requested data index. - """ - - # Special key for a (1,) array storing a pointer to the next index to fill from when adding data. - NEXT_INDEX_KEY = "_next_index" - # Since the data buffer is pre-allocated, this boolean mask is used to indicate which frames have "real" - # data. - OCCUPANCY_MASK_KEY = "_occupancy_mask" - # This is not a data key used in the buffer. 
It is used to indicate that a frame is padding, added by the - # __getitem__ method. - IS_PAD_POSTFIX = "_is_pad" - INDEX_KEY = "index" - FRAME_INDEX_KEY = "frame_index" - EPISODE_INDEX_KEY = "episode_index" - TIMESTAMP_KEY = "timestamp" - PRESET_KEYS = {INDEX_KEY, FRAME_INDEX_KEY, EPISODE_INDEX_KEY, TIMESTAMP_KEY} - - def __init__( - self, - storage_dir: str | Path, - data_spec: dict[str, Any] | None, - buffer_capacity: int | None, - image_transform: Callable[[np.ndarray], np.ndarray] | None = None, - delta_timestamps: dict[str, list[float]] | dict[str, np.ndarray] | None = None, - fps: float | None = None, - ): - """Create a data buffer including reserving the underlying on-disk storage. - - Args: - storage_dir: Where to keep the numpy memmap files. One memmap file will be stored for each data - key. Note that if the files already exist, they are opened in read-write mode. - data_spec: A mapping from data key to data specification, like {data_key: {"shape": tuple[int], - "dtype": np.dtype}}. This should include all the data that you wish to record into the buffer, - but note that "index", "frame_index", "episode_index", and "timestamp", are already handled - internally, so you don't need to include them. - buffer_capacity: How many frames should be stored in the buffer as a maximum. Be aware of your - system's available disk space when choosing this. - image_transform: Transforms to apply in the item getter to all image data (any data whose key - starts with "observation.image"). - delta_timestamps: TODO(alexander-soare): Document this somewhere when - `load_previous_and_future_frames` is refactored. - fps: TODO(alexander-soare): Document this somewhere when `load_previous_and_future_frames` is - refactored. - - """ - if (delta_timestamps is None) ^ (fps is None): - raise ValueError("`delta_timestamps` and `fps` should be provided together, or not at all.") - self.set_delta_timestamps(delta_timestamps) - self._fps = fps - # Tolerance in seconds used to discard loaded frames when their timestamps are not close enough from - # the requested frames. It is only used when `delta_timestamps` is provided. - # minus 1e-4 to account for possible numerical error - self.tolerance_s = 1 / self.fps - 1e-4 if fps is not None else None - self._buffer_capacity = buffer_capacity - Path(storage_dir).mkdir(parents=True, exist_ok=True) - data_spec = self._make_data_spec(data_spec, buffer_capacity) - self._data: dict[str, np.memmap] = {} - for k, v in data_spec.items(): - self._data[k] = _make_memmap_safe( - filename=Path(storage_dir) / k, - dtype=v["dtype"] if v is not None else None, - mode="r+" if (Path(storage_dir) / k).exists() else "w+", - shape=tuple(v["shape"]) if v is not None else None, - ) - self.image_transform = image_transform - - @property - def delta_timestamps(self) -> dict[str, np.ndarray] | None: - return self._delta_timestamps - - def set_delta_timestamps(self, value: dict[str, list[float]] | None): - """Set delta_timestamps converting the values to numpy arrays. - - Note: The conversion is for an optimization in the __getitem__. The loop is much slower if lists need - to be converted into numpy arrays. 
- """ - if value is not None: - self._delta_timestamps = {k: np.array(v) for k, v in value.items()} - else: - self._delta_timestamps = None - - def _make_data_spec(self, data_spec: dict[str, Any], buffer_capacity: int) -> dict[str, dict[str, Any]]: - """Makes the necessary data specification keyword arguments for np.memmap.""" - if any(k.startswith("_") for k in data_spec): - raise ValueError( - "data_spec keys should not start with '_'. This prefix is reserved for internal logic." - ) - if len(intersection := set(data_spec).intersection(DataBuffer.PRESET_KEYS)) > 0: - raise ValueError( - f"`data_spec` should not contain any of {DataBuffer.PRESET_KEYS} as these are handled " - f"internally. The provided data_spec has {intersection}." - ) - complete_data_spec = { - DataBuffer.NEXT_INDEX_KEY: {"dtype": np.dtype("int64"), "shape": (1,)}, - DataBuffer.OCCUPANCY_MASK_KEY: {"dtype": np.dtype("?"), "shape": (buffer_capacity,)}, - DataBuffer.INDEX_KEY: {"dtype": np.dtype("int64"), "shape": (buffer_capacity,)}, - DataBuffer.FRAME_INDEX_KEY: {"dtype": np.dtype("int64"), "shape": (buffer_capacity,)}, - DataBuffer.EPISODE_INDEX_KEY: {"dtype": np.dtype("int64"), "shape": (buffer_capacity,)}, - DataBuffer.TIMESTAMP_KEY: {"dtype": np.dtype("float64"), "shape": (buffer_capacity,)}, - } - for k, v in data_spec.items(): - complete_data_spec[k] = {"dtype": v["dtype"], "shape": (buffer_capacity, *v["shape"])} - return complete_data_spec - - def add_data(self, data: dict[str, np.ndarray]): - """Add data to the buffer. - - This calls `add_episode` for each unique episode index. See the documentation there for more - information. - """ - for episode_index in np.unique(data[DataBuffer.EPISODE_INDEX_KEY]): - where_episode = np.where(data[DataBuffer.EPISODE_INDEX_KEY] == episode_index)[0] - episode_data = {k: data[k][where_episode] for k in data} - self.add_episode(episode_data) - - def add_episode(self, data: dict[str, np.ndarray]): - """Add data for a single episode to the buffer. - - `data` should have the same key, array mapping as the buffer. It should contain exactly one episode. - The episode should have frame indices that start from 0 and step up in increments of 1. - - If the episode has more frames then are available till the end of the buffer, the pointer is reset - to the start of the buffer and the episode is inserted there, overwriting existing episode frames. - - When episode frames are overwritten by a new episode, by default, any remaining frames belonging to - the existing episode are left in place (meaning not all episodes will be guaranteed to start from - their frame 0). - """ - if len(missing_keys := (set(self.data_keys).difference(set(data)))) > 0: - raise ValueError(f"Missing data keys: {missing_keys}") - new_data_length = len(data[self.data_keys[0]]) - if new_data_length <= 0: - raise ValueError("The episode has 0 frames") - if new_data_length > self._buffer_capacity: - raise ValueError("The episode length is larger than the buffer capacity.") - if not all(len(data[k]) == new_data_length for k in self.data_keys): - raise ValueError("All data items should have the same length") - if not np.all(data[DataBuffer.EPISODE_INDEX_KEY] == data[DataBuffer.EPISODE_INDEX_KEY][0]): - raise ValueError( - "New data should only contain one episode but there is more than one unique episode index." - ) - if not np.array_equal(data[DataBuffer.FRAME_INDEX_KEY], np.arange(new_data_length)): - raise ValueError( - "Expected frame indices to start from 0 and step up in increments of 1 per frame." 
- ) - - # Figure out where we need to start filling data next, and make sure we continue data and episode - # indices. - next_index = self._data[DataBuffer.NEXT_INDEX_KEY][0] - if self.num_samples > 0: - last_episode_index = self._data[DataBuffer.EPISODE_INDEX_KEY][next_index - 1] - last_data_index = self._data[DataBuffer.INDEX_KEY][next_index - 1] - else: - last_episode_index = -1 - last_data_index = -1 - # If there aren't enough slots in the buffer left to accommodate the episode, wrap to the start. - if max(0, new_data_length - (self._buffer_capacity - next_index)) > 0: - next_index = 0 - - # Insert the new data starting from next_index. - for k in self.data_keys: - slc = slice(next_index, next_index + new_data_length) - if k == DataBuffer.EPISODE_INDEX_KEY: - self._data[k][slc] = last_episode_index + 1 - elif k == DataBuffer.INDEX_KEY: - self._data[k][slc] = np.arange(last_data_index + 1, last_data_index + 1 + new_data_length) - else: - self._data[k][slc] = data[k] - self._data[DataBuffer.OCCUPANCY_MASK_KEY][slc] = True - - # Update the data pointer. - self._data[DataBuffer.NEXT_INDEX_KEY][0] = next_index + new_data_length - - @property - def data_keys(self) -> list[str]: - keys = set(self._data) - keys.remove(DataBuffer.OCCUPANCY_MASK_KEY) - keys.remove(DataBuffer.NEXT_INDEX_KEY) - return sorted(keys) - - @property - def fps(self) -> float | None: - return self._fps - - @property - def num_episodes(self) -> int: - return len( - np.unique(self._data[DataBuffer.EPISODE_INDEX_KEY][self._data[DataBuffer.OCCUPANCY_MASK_KEY]]) - ) - - @property - def num_samples(self) -> int: - return np.count_nonzero(self._data[DataBuffer.OCCUPANCY_MASK_KEY]) - - def __len__(self): - return self.num_samples - - def _item_to_tensors(self, item: dict) -> dict: - item_ = {} - for k, v in item.items(): - if isinstance(v, torch.Tensor): - item_[k] = v - elif isinstance(v, np.ndarray): - item_[k] = torch.from_numpy(v) - else: - item_[k] = torch.tensor(v) - return item_ - - @property - def camera_keys(self) -> list[str]: - """Keys to access image and video stream from cameras.""" - return [k for k in self._data if k.startswith("observation.image")] - - def _optimized_advanced_slice(self, data_key: str, indices: np.ndarray) -> np.ndarray: - """Convert advanced slicing to basic slicing by finding contiguous ranges in the requested indices. - - TODO(now): Is this needed? - """ - indices_diff = np.diff(indices, prepend=indices[0] - 1) - where_not_1 = np.where(indices_diff != 1)[0] - ptr = 0 - ret = [] - for ix in chain(where_not_1, [len(indices)]): - ret.append(self._data[data_key][indices[ptr] : indices[ix - 1] + 1]) - ptr = ix - - # Avoid creating a copy with concatenate if possible. - return np.concatenate(ret) if len(ret) > 1 else ret[0] - - def flush(self): - """Save the data to disk. - - `np.memmap`s keep a portion of the data mirrored in memory. Updates to the in-memory data are not - immediately reflected on disk. Call this method to explicitly save the updates to disk. 
- """ - for k in self._data: - self._data[k].flush() - - def get_data_by_key(self, key: str) -> torch.Tensor: - """Returns all data for a given data key as a Tensor.""" - return torch.from_numpy(self._data[key][self._data[DataBuffer.OCCUPANCY_MASK_KEY]]) - - def __getitem__(self, idx: int) -> dict[str, torch.Tensor]: - if idx >= len(self) or idx < -len(self): - raise IndexError - - item = {k: v[idx] for k, v in self._data.items() if not k.startswith("_")} - - if self.delta_timestamps is not None: - episode_index = item[DataBuffer.EPISODE_INDEX_KEY] - current_ts = item[DataBuffer.TIMESTAMP_KEY] - episode_data_indices = np.where( - np.bitwise_and( - self._data[DataBuffer.EPISODE_INDEX_KEY] == episode_index, - self._data[DataBuffer.OCCUPANCY_MASK_KEY], - ) - )[0] - episode_timestamps = self._data[DataBuffer.TIMESTAMP_KEY][episode_data_indices] - - if self.video: - video_timestamps = {} # TODO(now): HACK - - for data_key in self.delta_timestamps: - # Get timestamps used as query to retrieve data of previous/future frames. - query_ts = current_ts + self.delta_timestamps[data_key] - - # Compute distances between each query timestamp and all timestamps of all the frames - # belonging to the episode. - dist = np.abs(query_ts[:, None] - episode_timestamps[None, :]) - argmin_ = np.argmin(dist, axis=1) - min_ = dist[np.arange(dist.shape[0]), argmin_] - - is_pad = min_ > self.tolerance_s - - # Check violated query timestamps are all outside the episode range. - err_msg = ( - f"One or several timestamps unexpectedly violate the tolerance ({min_} > " - f"{self.tolerance_s=}) inside the episode range." - ) - try: - assert ( - (query_ts[is_pad] < episode_timestamps[0]) - | (episode_timestamps[-1] < query_ts[is_pad]) - ).all(), err_msg - except AssertionError: - logging.warning(err_msg) - return self.__getitem__(np.random.choice(len(self))) - - if self.video and data_key in self.video_frame_keys: # TODO(now): HACK - video_timestamps[data_key] = self._data[DataBuffer.TIMESTAMP_KEY][ - episode_data_indices[argmin_] - ] - - # Load frames for this data key. 
- if np.any(np.diff(argmin_) != 1): - item[data_key] = self._data[data_key][episode_data_indices[argmin_]] - # item[data_key] = self._optimized_advanced_slice(data_key, episode_data_indices[argmin_]) - else: - # Do basic slicing where possible - item[data_key] = self._data[data_key][ - episode_data_indices[argmin_.min()] : episode_data_indices[argmin_.max()] + 1 - ] - - item[f"{data_key}{DataBuffer.IS_PAD_POSTFIX}"] = is_pad - - if self.video: - item_ = dict(item) - for k in self.video_frame_keys: # TODO(now): HACK - if self.delta_timestamps is None: - item_[k] = {"path": item[k].decode(), "timestamp": float(item[DataBuffer.TIMESTAMP_KEY])} - else: - query_ts = current_ts + self.delta_timestamps[k] - item_[k] = [ - {"path": item[k][i].decode(), "timestamp": float(video_timestamps[k][i])} - for i in range(len(item[k])) - ] - item = load_from_videos( - item_, - self.video_frame_keys, - self.videos_dir, - self.tolerance_s, - self.video_backend, - ) - - if self.image_transform is not None: - for cam in self.camera_keys: - item[cam] = self.image_transform(item[cam]) - - return self._item_to_tensors(item) - - @classmethod - def from_hf_dataset( - cls, - repo_id: str, - decode_video: bool = False, - **kwargs, - ) -> "DataBuffer": - hf_dataset = load_hf_dataset(repo_id, version=CODEBASE_VERSION, root=None, split="train") - lerobot_dataset_info = load_info(repo_id, version=CODEBASE_VERSION, root=None) - is_video_dataset = lerobot_dataset_info.get("video", False) - if not is_video_dataset and decode_video: - raise ValueError(f"The provided dataset is not a video dataset but you have {decode_video=}") - if is_video_dataset: - videos_path = load_videos(repo_id, version=CODEBASE_VERSION, root=None) - - data_spec = {} - video_frame_keys = [] - for k, feature in hf_dataset.features.items(): - if k in DataBuffer.PRESET_KEYS: - continue - if isinstance(feature, datasets.features.Image): - example_img = np.array(hf_dataset[0][k]) - data_spec[k] = {"shape": example_img.shape, "dtype": np.dtype("uint8")} - elif isinstance(feature, VideoFrame): - if decode_video: - video_dct = hf_dataset[0][k] - example_img = decode_video_frames_torchvision( - videos_path.parent / video_dct["path"], - [video_dct["timestamp"]], - 1 / lerobot_dataset_info["fps"] - 1e-4, - )[0] - data_spec[k] = {"shape": example_img.shape, "dtype": np.dtype("uint8")} - else: - video_frame_keys.append(k) - data_spec[k] = { - "shape": (), - "dtype": np.dtype(f"S{MAX_VIDEO_PATH_LENGTH}"), - } - elif isinstance(feature, datasets.features.Sequence): - data_spec[k] = {"shape": (feature.length,), "dtype": np.dtype(feature.feature.dtype)} - elif isinstance(feature, datasets.features.Value): - data_spec[k] = {"shape": (), "dtype": np.dtype(feature.dtype)} - else: - raise NotImplementedError(f"Dataset feature type {type(feature)} is not handled.") - obj = cls( - **kwargs, - data_spec=data_spec, - buffer_capacity=len(hf_dataset), - ) - data_dict = {} - for k, feature in hf_dataset.features.items(): - if isinstance(feature, datasets.features.Image): - data_dict[k] = np.stack( - [np.array(pil_img).astype(np.float32) / 255 for pil_img in hf_dataset[k]] - ) - elif isinstance(feature, VideoFrame): - if decode_video: - # Decode all videos into images. 
- episode_indices = np.array(hf_dataset["episode_index"]) - timestamps = np.array(hf_dataset["timestamp"]) - all_imgs = [] - for episode_index in np.unique(episode_indices): - episode_data_indices = np.where(episode_indices == episode_index)[0] - episode_timestamps = timestamps[episode_indices == episode_index] - episode_imgs = decode_video_frames_torchvision( - videos_path.parent / hf_dataset[k][episode_data_indices[0]]["path"], - episode_timestamps, - 1 / lerobot_dataset_info["fps"] - 1e-4, - ) - all_imgs.extend(episode_imgs.numpy()) - data_dict[k] = np.stack(all_imgs) - else: - data_dict[k] = np.stack( - [np.array(dct["path"], dtype=f"S{MAX_VIDEO_PATH_LENGTH}") for dct in hf_dataset[k]] - ) - else: - data_dict[k] = np.array(hf_dataset[k]) - obj.add_data(data_dict) - obj.video = False # TODO(now): HACK - if len(video_frame_keys) > 0: - obj.video = True # TODO(now): HACK - obj.video_frame_keys = video_frame_keys # TODO(now): HACK - # Symlink videos if needed. - obj.videos_dir = kwargs["storage_dir"] / "videos" - if not obj.videos_dir.exists(): - os.symlink(videos_path.absolute(), kwargs["storage_dir"]) - obj.videos_dir = Path(kwargs["storage_dir"]) / "videos" # TODO(now): HACK - obj.video_backend = "pyav" # TODO(now): HACK - - return obj - - -def compute_sampler_weights( - offline_dataset: LeRobotDataset, - offline_drop_n_last_frames: int = 0, - online_dataset: DataBuffer | None = None, - online_sampling_ratio: float | None = None, - online_drop_n_last_frames: int = 0, -) -> torch.Tensor: - """Compute the sampling weights for the online training dataloader in train.py. - - Args: - offline_dataset: The LeRobotDataset used for offline pre-training. - online_drop_n_last_frames: Number of frames to drop from the end of each offline dataset episode. - online_dataset: The DataBuffer used in online training. - online_sampling_ratio: The proportion of data that should be sampled from the online dataset. If an - online dataset is provided, this value must also be provided. - online_drop_n_first_frames: See `offline_drop_n_last_frames`. This is the same, but for the online - dataset. - Returns: - Tensor of weights for [offline_dataset; online_dataset], normalized to 1. - - Notes to maintainers: - - This duplicates some logic from EpisodeAwareSampler. We should consider converging to one approach. - - When used with `torch.utils.data.WeightedRandomSampler`, it could completely replace - `EpisodeAwareSampler` as the online dataset related arguments are optional. The only missing feature - is the ability to turn shuffling off. - - Options `drop_first_n_frames` and `episode_indices_to_use` can be added easily. They were not - included here to avoid adding complexity. - """ - if len(offline_dataset) == 0 and (online_dataset is None or len(online_dataset) == 0): - raise ValueError("At least one of `offline_dataset` or `online_dataset` should be contain data.") - if (online_dataset is None) ^ (online_sampling_ratio is None): - raise ValueError( - "`online_dataset` and `online_sampling_ratio` must be provided together or not at all." 
- ) - offline_sampling_ratio = 0 if online_sampling_ratio is None else 1 - online_sampling_ratio - - weights = [] - - if len(offline_dataset) > 0: - offline_data_mask_indices = [] - for start_index, end_index in zip( - offline_dataset.episode_data_index["from"], - offline_dataset.episode_data_index["to"], - strict=True, - ): - offline_data_mask_indices.extend( - range(start_index.item(), end_index.item() - offline_drop_n_last_frames) - ) - offline_data_mask = torch.zeros(len(offline_dataset), dtype=torch.bool) - offline_data_mask[torch.tensor(offline_data_mask_indices)] = True - weights.append( - torch.full( - size=(len(offline_dataset),), - fill_value=offline_sampling_ratio / offline_data_mask.sum(), - ) - * offline_data_mask - ) - - if online_dataset is not None and len(online_dataset) > 0: - online_data_mask_indices = [] - episode_indices = online_dataset.get_data_by_key("episode_index") - for episode_idx in torch.unique(episode_indices): - where_episode = torch.where(episode_indices == episode_idx) - start_index = where_episode[0][0] - end_index = where_episode[0][-1] + 1 - online_data_mask_indices.extend( - range(start_index.item(), end_index.item() - online_drop_n_last_frames) - ) - online_data_mask = torch.zeros(len(online_dataset), dtype=torch.bool) - online_data_mask[torch.tensor(online_data_mask_indices)] = True - weights.append( - torch.full( - size=(len(online_dataset),), - fill_value=online_sampling_ratio / online_data_mask.sum(), - ) - * online_data_mask - ) - - weights = torch.cat(weights) - - if weights.sum() == 0: - weights += 1 / len(weights) - else: - weights /= weights.sum() - - return weights diff --git a/lerobot/scripts/eval.py b/lerobot/scripts/eval.py index 482af786f..cff4bc01b 100644 --- a/lerobot/scripts/eval.py +++ b/lerobot/scripts/eval.py @@ -188,6 +188,7 @@ def rollout( # Track the final observation. if return_observations: observation = preprocess_observation(observation) + # TODO(now): Go uint8 HWC instead all_observations.append(deepcopy(observation)) # Stack the sequence along the first dimension so that we have (batch, sequence, *) tensors. diff --git a/lerobot/scripts/train.py b/lerobot/scripts/train.py index a3ea06ead..288ef83f5 100644 --- a/lerobot/scripts/train.py +++ b/lerobot/scripts/train.py @@ -31,9 +31,9 @@ from torch import nn from torch.cuda.amp import GradScaler +from lerobot.common.datasets.data_buffer import DataBuffer, compute_sampler_weights from lerobot.common.datasets.factory import make_dataset, resolve_delta_timestamps from lerobot.common.datasets.lerobot_dataset import MultiLeRobotDataset -from lerobot.common.datasets.online_buffer import DataBuffer, compute_sampler_weights from lerobot.common.datasets.sampler import EpisodeAwareSampler from lerobot.common.datasets.utils import cycle from lerobot.common.envs.factory import make_env @@ -405,8 +405,8 @@ def evaluate_and_checkpoint_if_needed(step, is_online): sampler = None if cfg.get("use_lerobot_data_buffer", False): - offline_dataset_for_dataloader = DataBuffer.from_hf_dataset( - cfg.dataset_repo_id, + offline_dataset_for_dataloader = DataBuffer.from_huggingface_hub( + offline_dataset.repo_id, storage_dir=Path(f"/tmp/{offline_dataset.repo_id}"), fps=offline_dataset.fps, delta_timestamps=offline_dataset.delta_timestamps, diff --git a/tests/test_online_buffer.py b/tests/test_online_buffer.py deleted file mode 100644 index 37000e4fb..000000000 --- a/tests/test_online_buffer.py +++ /dev/null @@ -1,320 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2024 The HuggingFace Inc. team. 
All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.d -from copy import deepcopy -from uuid import uuid4 - -import numpy as np -import pytest -import torch -from datasets import Dataset - -from lerobot.common.datasets.lerobot_dataset import LeRobotDataset -from lerobot.common.datasets.online_buffer import OnlineBuffer, compute_sampler_weights -from lerobot.common.datasets.utils import hf_transform_to_torch - -# Some constants for OnlineBuffer tests. -data_key = "data" -data_shape = (2, 3) # just some arbitrary > 1D shape -buffer_capacity = 100 -fps = 10 - - -def make_new_buffer( - write_dir: str | None = None, delta_timestamps: dict[str, list[float]] | None = None -) -> tuple[OnlineBuffer, str]: - if write_dir is None: - write_dir = f"/tmp/online_buffer_{uuid4().hex}" - buffer = OnlineBuffer( - write_dir, - data_spec={data_key: {"shape": data_shape, "dtype": np.dtype("float32")}}, - buffer_capacity=buffer_capacity, - fps=fps, - delta_timestamps=delta_timestamps, - ) - return buffer, write_dir - - -def make_spoof_data_frames(n_episodes: int, n_frames_per_episode: int) -> dict[str, np.ndarray]: - new_data = { - data_key: np.arange(n_frames_per_episode * n_episodes * np.prod(data_shape)).reshape(-1, *data_shape), - OnlineBuffer.INDEX_KEY: np.arange(n_frames_per_episode * n_episodes), - OnlineBuffer.EPISODE_INDEX_KEY: np.repeat(np.arange(n_episodes), n_frames_per_episode), - OnlineBuffer.FRAME_INDEX_KEY: np.tile(np.arange(n_frames_per_episode), n_episodes), - OnlineBuffer.TIMESTAMP_KEY: np.tile(np.arange(n_frames_per_episode) / fps, n_episodes), - } - return new_data - - -def test_non_mutate(): - """Checks that the data provided to the add_data method is copied rather than passed by reference. - - This means that mutating the data in the buffer does not mutate the original data. - - NOTE: If this test fails, it means some of the other tests may be compromised. For example, we can't trust - a success case for `test_write_read`. - """ - buffer, _ = make_new_buffer() - new_data = make_spoof_data_frames(2, buffer_capacity // 4) - new_data_copy = deepcopy(new_data) - buffer.add_data(new_data) - buffer._data[data_key][:] += 1 - assert all(np.array_equal(new_data[k], new_data_copy[k]) for k in new_data) - - -def test_index_error_no_data(): - buffer, _ = make_new_buffer() - with pytest.raises(IndexError): - buffer[0] - - -def test_index_error_with_data(): - buffer, _ = make_new_buffer() - n_frames = buffer_capacity // 2 - new_data = make_spoof_data_frames(1, n_frames) - buffer.add_data(new_data) - with pytest.raises(IndexError): - buffer[n_frames] - with pytest.raises(IndexError): - buffer[-n_frames - 1] - - -@pytest.mark.parametrize("do_reload", [False, True]) -def test_write_read(do_reload: bool): - """Checks that data can be added to the buffer and read back. - - If do_reload we delete the buffer object and load the buffer back from disk before reading. 
- """ - buffer, write_dir = make_new_buffer() - n_episodes = 2 - n_frames_per_episode = buffer_capacity // 4 - new_data = make_spoof_data_frames(n_episodes, n_frames_per_episode) - buffer.add_data(new_data) - - if do_reload: - del buffer - buffer, _ = make_new_buffer(write_dir) - - assert len(buffer) == n_frames_per_episode * n_episodes - for i, item in enumerate(buffer): - assert all(isinstance(item[k], torch.Tensor) for k in item) - assert np.array_equal(item[data_key].numpy(), new_data[data_key][i]) - - -def test_read_data_key(): - """Tests that data can be added to a buffer and all data for a. specific key can be read back.""" - buffer, _ = make_new_buffer() - n_episodes = 2 - n_frames_per_episode = buffer_capacity // 4 - new_data = make_spoof_data_frames(n_episodes, n_frames_per_episode) - buffer.add_data(new_data) - - data_from_buffer = buffer.get_data_by_key(data_key) - assert isinstance(data_from_buffer, torch.Tensor) - assert np.array_equal(data_from_buffer.numpy(), new_data[data_key]) - - -def test_fifo(): - """Checks that if data is added beyond the buffer capacity, we discard the oldest data first.""" - buffer, _ = make_new_buffer() - n_frames_per_episode = buffer_capacity // 4 - n_episodes = 3 - new_data = make_spoof_data_frames(n_episodes, n_frames_per_episode) - buffer.add_data(new_data) - n_more_episodes = 2 - # Developer sanity check (in case someone changes the global `buffer_capacity`). - assert ( - n_episodes + n_more_episodes - ) * n_frames_per_episode > buffer_capacity, "Something went wrong with the test code." - more_new_data = make_spoof_data_frames(n_more_episodes, n_frames_per_episode) - buffer.add_data(more_new_data) - assert len(buffer) == buffer_capacity, "The buffer should be full." - - expected_data = {} - for k in new_data: - # Concatenate, left-truncate, then roll, to imitate the cyclical FIFO pattern in OnlineBuffer. - expected_data[k] = np.roll( - np.concatenate([new_data[k], more_new_data[k]])[-buffer_capacity:], - shift=len(new_data[k]) + len(more_new_data[k]) - buffer_capacity, - axis=0, - ) - - for i, item in enumerate(buffer): - assert all(isinstance(item[k], torch.Tensor) for k in item) - assert np.array_equal(item[data_key].numpy(), expected_data[data_key][i]) - - -def test_delta_timestamps_within_tolerance(): - """Check that getting an item with delta_timestamps within tolerance succeeds. - - Note: Copied from `test_datasets.py::test_load_previous_and_future_frames_within_tolerance`. - """ - # Sanity check on global fps as we are assuming it is 10 here. - assert fps == 10, "This test assumes fps==10" - buffer, _ = make_new_buffer(delta_timestamps={"index": [-0.2, 0, 0.139]}) - new_data = make_spoof_data_frames(n_episodes=1, n_frames_per_episode=5) - buffer.add_data(new_data) - buffer.tolerance_s = 0.04 - item = buffer[2] - data, is_pad = item["index"], item[f"index{OnlineBuffer.IS_PAD_POSTFIX}"] - assert torch.allclose(data, torch.tensor([0, 2, 3])), "Data does not match expected values" - assert not is_pad.any(), "Unexpected padding detected" - - -def test_delta_timestamps_outside_tolerance_inside_episode_range(): - """Check that getting an item with delta_timestamps outside of tolerance fails. - - We expect it to fail if and only if the requested timestamps are within the episode range. - - Note: Copied from - `test_datasets.py::test_load_previous_and_future_frames_outside_tolerance_inside_episode_range` - """ - # Sanity check on global fps as we are assuming it is 10 here. 
- assert fps == 10, "This test assumes fps==10" - buffer, _ = make_new_buffer(delta_timestamps={"index": [-0.2, 0, 0.141]}) - new_data = make_spoof_data_frames(n_episodes=1, n_frames_per_episode=5) - buffer.add_data(new_data) - buffer.tolerance_s = 0.04 - with pytest.raises(AssertionError): - buffer[2] - - -def test_delta_timestamps_outside_tolerance_outside_episode_range(): - """Check that copy-padding of timestamps outside of the episode range works. - - Note: Copied from - `test_datasets.py::test_load_previous_and_future_frames_outside_tolerance_outside_episode_range` - """ - # Sanity check on global fps as we are assuming it is 10 here. - assert fps == 10, "This test assumes fps==10" - buffer, _ = make_new_buffer(delta_timestamps={"index": [-0.3, -0.24, 0, 0.26, 0.3]}) - new_data = make_spoof_data_frames(n_episodes=1, n_frames_per_episode=5) - buffer.add_data(new_data) - buffer.tolerance_s = 0.04 - item = buffer[2] - data, is_pad = item["index"], item["index_is_pad"] - assert torch.equal(data, torch.tensor([0, 0, 2, 4, 4])), "Data does not match expected values" - assert torch.equal( - is_pad, torch.tensor([True, False, False, True, True]) - ), "Padding does not match expected values" - - -# Arbitrarily set small dataset sizes, making sure to have uneven sizes. -@pytest.mark.parametrize("offline_dataset_size", [0, 6]) -@pytest.mark.parametrize("online_dataset_size", [0, 4]) -@pytest.mark.parametrize("online_sampling_ratio", [0.0, 1.0]) -def test_compute_sampler_weights_trivial( - offline_dataset_size: int, online_dataset_size: int, online_sampling_ratio: float -): - # Pass/skip the test if both datasets sizes are zero. - if offline_dataset_size + online_dataset_size == 0: - return - # Create spoof offline dataset. - offline_dataset = LeRobotDataset.from_preloaded( - hf_dataset=Dataset.from_dict({"data": list(range(offline_dataset_size))}) - ) - offline_dataset.hf_dataset.set_transform(hf_transform_to_torch) - if offline_dataset_size == 0: - offline_dataset.episode_data_index = {} - else: - # Set up an episode_data_index with at least two episodes. - offline_dataset.episode_data_index = { - "from": torch.tensor([0, offline_dataset_size // 2]), - "to": torch.tensor([offline_dataset_size // 2, offline_dataset_size]), - } - # Create spoof online datset. - online_dataset, _ = make_new_buffer() - if online_dataset_size > 0: - online_dataset.add_data( - make_spoof_data_frames(n_episodes=2, n_frames_per_episode=online_dataset_size // 2) - ) - - weights = compute_sampler_weights( - offline_dataset, online_dataset=online_dataset, online_sampling_ratio=online_sampling_ratio - ) - if offline_dataset_size == 0 or online_dataset_size == 0: - expected_weights = torch.ones(offline_dataset_size + online_dataset_size) - elif online_sampling_ratio == 0: - expected_weights = torch.cat([torch.ones(offline_dataset_size), torch.zeros(online_dataset_size)]) - elif online_sampling_ratio == 1: - expected_weights = torch.cat([torch.zeros(offline_dataset_size), torch.ones(online_dataset_size)]) - expected_weights /= expected_weights.sum() - assert torch.allclose(weights, expected_weights) - - -def test_compute_sampler_weights_nontrivial_ratio(): - # Arbitrarily set small dataset sizes, making sure to have uneven sizes. - # Create spoof offline dataset. 
- offline_dataset = LeRobotDataset.from_preloaded(hf_dataset=Dataset.from_dict({"data": list(range(4))})) - offline_dataset.hf_dataset.set_transform(hf_transform_to_torch) - offline_dataset.episode_data_index = { - "from": torch.tensor([0, 2]), - "to": torch.tensor([2, 4]), - } - # Create spoof online datset. - online_dataset, _ = make_new_buffer() - online_dataset.add_data(make_spoof_data_frames(n_episodes=4, n_frames_per_episode=2)) - online_sampling_ratio = 0.8 - weights = compute_sampler_weights( - offline_dataset, online_dataset=online_dataset, online_sampling_ratio=online_sampling_ratio - ) - assert torch.allclose( - weights, torch.tensor([0.05, 0.05, 0.05, 0.05, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]) - ) - - -def test_compute_sampler_weights_nontrivial_ratio_and_drop_last_n(): - # Arbitrarily set small dataset sizes, making sure to have uneven sizes. - # Create spoof offline dataset. - offline_dataset = LeRobotDataset.from_preloaded(hf_dataset=Dataset.from_dict({"data": list(range(4))})) - offline_dataset.hf_dataset.set_transform(hf_transform_to_torch) - offline_dataset.episode_data_index = { - "from": torch.tensor([0]), - "to": torch.tensor([4]), - } - # Create spoof online datset. - online_dataset, _ = make_new_buffer() - online_dataset.add_data(make_spoof_data_frames(n_episodes=4, n_frames_per_episode=2)) - weights = compute_sampler_weights( - offline_dataset, online_dataset=online_dataset, online_sampling_ratio=0.8, online_drop_n_last_frames=1 - ) - assert torch.allclose( - weights, torch.tensor([0.05, 0.05, 0.05, 0.05, 0.2, 0.0, 0.2, 0.0, 0.2, 0.0, 0.2, 0.0]) - ) - - -def test_compute_sampler_weights_drop_n_last_frames(): - """Note: test copied from test_sampler.""" - data_dict = { - "timestamp": [0, 0.1], - "index": [0, 1], - "episode_index": [0, 0], - "frame_index": [0, 1], - } - offline_dataset = LeRobotDataset.from_preloaded(hf_dataset=Dataset.from_dict(data_dict)) - offline_dataset.hf_dataset.set_transform(hf_transform_to_torch) - offline_dataset.episode_data_index = {"from": torch.tensor([0]), "to": torch.tensor([2])} - - online_dataset, _ = make_new_buffer() - online_dataset.add_data(make_spoof_data_frames(n_episodes=4, n_frames_per_episode=2)) - - weights = compute_sampler_weights( - offline_dataset, - offline_drop_n_last_frames=1, - online_dataset=online_dataset, - online_sampling_ratio=0.5, - online_drop_n_last_frames=1, - ) - assert torch.allclose(weights, torch.tensor([0.5, 0, 0.125, 0, 0.125, 0, 0.125, 0, 0.125, 0])) diff --git a/tests/utils.py b/tests/utils.py index db214aeac..3e27dc4d8 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -183,3 +183,18 @@ def wrapper(*args, **kwargs): return func(*args, **kwargs) return wrapper + + +class DevTestingError(Exception): + """To be raised when a test does not work as expected because of the test code itself. + + This makes it clear that the test failed because of the test itself, not the code being tested. + """ + + def __init__(self, message: str | None = None): + if message is None: + message = ( + "It's likely this test failed because of the test code itself, rather than the code being " + "tested. Please review the test code." 
+ ) + super().__init__(message) From d6304e1c8a215ef094d3152fee5bce94c4dc9477 Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Tue, 17 Sep 2024 18:38:29 +0100 Subject: [PATCH 06/29] from_huggingface_hub base test passing --- lerobot/common/datasets/video_utils.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/lerobot/common/datasets/video_utils.py b/lerobot/common/datasets/video_utils.py index 4d4ac6b0a..c08295013 100644 --- a/lerobot/common/datasets/video_utils.py +++ b/lerobot/common/datasets/video_utils.py @@ -70,6 +70,7 @@ def decode_video_frames_torchvision( tolerance_s: float, backend: str = "pyav", log_loaded_timestamps: bool = False, + to_pytorch_format: bool = True, ) -> torch.Tensor: """Loads frames associated to the requested timestamps of a video @@ -129,8 +130,8 @@ def decode_video_frames_torchvision( reader = None - query_ts = torch.tensor(timestamps) - loaded_ts = torch.tensor(loaded_ts) + query_ts = torch.tensor(timestamps, dtype=torch.float32) + loaded_ts = torch.tensor(loaded_ts, dtype=torch.float32) # compute distances between each query timestamp and timestamps of all loaded frames dist = torch.cdist(query_ts[:, None], loaded_ts[:, None], p=1) @@ -155,8 +156,12 @@ def decode_video_frames_torchvision( if log_loaded_timestamps: logging.info(f"{closest_ts=}") - # convert to the pytorch format which is float32 in [0,1] range (and channel first) - closest_frames = closest_frames.type(torch.float32) / 255 + if to_pytorch_format: + # Return as pytorch format: float32, normalized to [0,1], channel-first. + closest_frames = closest_frames.type(torch.float32) / 255 + else: + # Return in numpy format: np.uint8, in [0, 255], channel-last. + closest_frames = closest_frames.permute(0, 2, 3, 1).numpy() assert len(timestamps) == len(closest_frames) return closest_frames From edae4407992394c3b196899081505d81b2113024 Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Wed, 18 Sep 2024 08:49:12 +0100 Subject: [PATCH 07/29] Tests passing with improved API --- lerobot/common/datasets/data_buffer.py | 682 +++++++++++++++++++++++++ tests/test_data_buffer.py | 395 ++++++++++++++ 2 files changed, 1077 insertions(+) create mode 100644 lerobot/common/datasets/data_buffer.py create mode 100644 tests/test_data_buffer.py diff --git a/lerobot/common/datasets/data_buffer.py b/lerobot/common/datasets/data_buffer.py new file mode 100644 index 000000000..49555952c --- /dev/null +++ b/lerobot/common/datasets/data_buffer.py @@ -0,0 +1,682 @@ +#!/usr/bin/env python + +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+"""A data buffer for efficient data management during offline and online training."""
+
+import json
+import os
+import shutil
+from itertools import chain
+from pathlib import Path
+from typing import Callable
+
+import datasets
+import numpy as np
+import torch
+
+from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION, DATA_DIR, LeRobotDataset
+from lerobot.common.datasets.utils import load_hf_dataset, load_info, load_videos
+from lerobot.common.datasets.video_utils import VideoFrame, decode_video_frames_torchvision, load_from_videos
+
+# TODO(alexander-soare): Move somewhere more appropriate once the DataBuffer class permeates more of the
+# code base.
+MAX_VIDEO_PATH_LENGTH = 100
+
+
+def _make_memmap_safe(**kwargs) -> np.memmap:
+ """Make a numpy memmap with checks on available disk space first.
+
+ Expected kwargs are: "filename", "dtype" (must be an np.dtype), "mode" and "shape".
+
+ For information on dtypes:
+ https://numpy.org/doc/stable/reference/arrays.dtypes.html#arrays-dtypes-constructing
+ """
+ if kwargs["mode"].startswith("w"):
+ required_space = kwargs["dtype"].itemsize * np.prod(kwargs["shape"]) # bytes
+ stats = os.statvfs(Path(kwargs["filename"]).parent)
+ available_space = stats.f_bavail * stats.f_frsize # bytes
+ if required_space >= available_space * 0.8:
+ raise RuntimeError(
+ f"You're about to take up {required_space} of {available_space} bytes available. This "
+ "exception has been raised to protect your storage device."
+ )
+ return np.memmap(**kwargs)
+
+
+class TimestampOutsideToleranceError(Exception):
+ pass
+
+
+class DataBuffer(torch.utils.data.Dataset):
+ """Data buffer and training data item getter.
+
+ Data is considered to come in the form of "episodes" (an instance of a robot performing a task). Episodes
+ are made up of "frames", which are chronologically ordered and contain timestamp-aligned data, potentially
+ including environment observations and robot actions. NOTE: for the time being, we require all data
+ modalities to be timestamp-aligned. This constraint may be relaxed in the future.
+
+ The data is stored in a mapping from data keys to arrays with shape (total_number_of_frames, *data_dim).
+ The compulsory data keys are:
+ - "index": A sequential integer index per frame.
+ - "episode_index": A sequential integer index per episode.
+ - "frame_index": A sequential integer index per frame within an episode (it resets for each episode).
+ - "timestamp": The relative timestamp of the frame within the episode in units of seconds. The choice
+ of reference time is not important.
+
+ Under the hood, the data is stored in `numpy.memmap`s, one for each data key. Loosely speaking, memory
+ mapping (https://en.wikipedia.org/wiki/Memory-mapped_file) allows us to treat a portion of disk space as
+ virtual memory. This allows us to work with more data than can fit in our physical memory, while treating
+ the data as if it were just standard numpy arrays. The associated files are saved in the file system under
+ what we call the "storage directory", and the Python object that allows us to treat them as virtual memory
+ is called the "buffer". The storage directory also contains a "meta.json" file which includes information
+ about the data types and shapes for each memmap. This allows us to load the data without having to specify
+ the data specifications at runtime.
+
+ The `add_episodes` method can be used to insert more data in the form of integral episodes (starting from
+ frame 0 and with the frames ordered).
The buffer has a compulsory size limit, which must be provided when + creating a new one. Data is inserted in a circular fashion, inserting after the most recently added frame, + and wrapping around to the start when necessary (in which case older episodes are overwritten). + + This class is also a PyTorch Dataset and can be used as such in a dataloader for a training loop. The item + getter returns either a single frame, or a slice of a single episode if `delta_timestamps` is set. + """ + + # TODO(now): What should we do about implicit data key specs, like observations.image? + + # Special key for a (1,) array storing a pointer to the next index to fill from when adding data. + NEXT_INDEX_KEY = "_next_index" + # Since the data buffer is pre-allocated, this boolean mask is used to indicate which frames have "real" + # data. + OCCUPANCY_MASK_KEY = "_occupancy_mask" + # This is not a data key used in the buffer. It is used to indicate that a frame is padding, added by the + # __getitem__ method. + IS_PAD_POSTFIX = "_is_pad" + INDEX_KEY = "index" + FRAME_INDEX_KEY = "frame_index" + EPISODE_INDEX_KEY = "episode_index" + TIMESTAMP_KEY = "timestamp" + PRESET_KEYS = {INDEX_KEY, FRAME_INDEX_KEY, EPISODE_INDEX_KEY, TIMESTAMP_KEY} + + METADATA_FILE_NAME = "meta.json" + + def __init__( + self, + storage_dir: str | Path, + buffer_capacity: int | None = None, + image_transform: Callable[[np.ndarray], np.ndarray] | None = None, + delta_timestamps: dict[str, list[float]] | dict[str, np.ndarray] | None = None, + fps: float | None = None, + ): + """ + Args: + storage_dir: Where to keep the numpy memmap files and metadata files. One memmap file will be + stored for each data key. Note that if the storage directory already exist, the memmap files + are opened in read-write mode. If the storage directory does not exist, it will be lazily + created with the first call to `add_episodes`. + buffer_capacity: How many frames should be stored in the buffer as a maximum. Be aware of your + system's available disk space when choosing this. Note that if `storage_dir` references an + existing storage directory, `buffer_capacity` should not be provided, as it is already + included in "meta.json". + image_transform: Transforms to apply in the item getter to all image data (any data whose key + starts with "observation.image"). + delta_timestamps: TODO(alexander-soare): Document this somewhere when + `load_previous_and_future_frames` is refactored. + fps: TODO(alexander-soare): Document this somewhere when `load_previous_and_future_frames` is + refactored. + + """ + # Argument validation. + if (delta_timestamps is None) ^ (fps is None): + raise ValueError("`delta_timestamps` and `fps` should be provided together, or not at all.") + + # Parameters for the data structure. + self._storage_dir = Path(storage_dir) + self._data: dict[str, np.memmap] = {} + self._is_video_dataset = False + # If the storage directory already exists, load the memmaps. + if self._storage_dir.exists(): + if buffer_capacity is not None: + raise ValueError( + "The storage directory already exists, which means you should not provide a " + "buffer_capacity explicitly. Instead, it will be read from 'meta.json' in the storage " + "directory." + ) + data_spec = self._load_data_spec() + self._make_memmaps(data_spec, mode="r+") + self._buffer_capacity = len(self._data[self.INDEX_KEY]) + else: + if buffer_capacity is None: + raise ValueError( + "The storage directory does not exist, which means you need to provide a buffer_capacity." 
+ ) + self._buffer_capacity = buffer_capacity + + # Parameters for the item getter. + self.set_delta_timestamps(delta_timestamps) + self._fps = fps + # Tolerance (in seconds) used to discard loaded frames when their timestamps are not close enough to + # the requested frames. It is only used when `delta_timestamps` is provided. The -1e-4 accounts for + # possible numerical error. + self.tolerance_s = 1 / self.fps - 1e-4 if fps is not None else None + self.image_transform = image_transform + + @property + def storage_dir(self) -> Path: + return self._storage_dir + + @property + def is_video_dataset(self) -> bool: + return self._is_video_dataset + + @property + def data_keys(self) -> list[str]: + keys = set(self._data) + keys.remove(self.OCCUPANCY_MASK_KEY) + keys.remove(self.NEXT_INDEX_KEY) + return sorted(keys) + + @property + def fps(self) -> float | None: + return self._fps + + @property + def num_episodes(self) -> int: + """Total number of unique episode indices in the dataset.""" + if len(self._data) == 0: + # Buffers not created yet. + return 0 + return len(np.unique(self._data[self.EPISODE_INDEX_KEY][self._data[self.OCCUPANCY_MASK_KEY]])) + + @property + def num_samples(self) -> int: + """Total number of unique samples (aka frames) in the dataset. + + TODO(alexander-soare): Rename to num_frames once LeRobotDataset is deprecated. + """ + if len(self._data) == 0: + # Buffers not created yet. + return 0 + return np.count_nonzero(self._data[self.OCCUPANCY_MASK_KEY]) + + @property + def camera_keys(self) -> list[str]: + """Return the names of all data keys pertaining to camera observations. + + By convention, this is all the keys starting with "observation.image". + """ + return [k for k in self._data if k.startswith("observation.image")] + + def _save_data_spec(self, data_spec: dict[str, dict]): + """Save the data type and shape specifications to the storage directory.""" + meta_file = self._storage_dir / self.METADATA_FILE_NAME + with open(meta_file, "w") as f: + for k in data_spec: + data_spec[k]["dtype"] = str(data_spec[k]["dtype"]) + json.dump(data_spec, f) + + def _load_data_spec(self) -> dict[str, dict]: + """Load the data type and shape specifications from the storage directory.""" + meta_file = self._storage_dir / self.METADATA_FILE_NAME + with open(meta_file) as f: + data_spec = json.load(f) + for k in data_spec: + data_spec[k]["dtype"] = np.dtype(data_spec[k]["dtype"]) + return data_spec + + def _make_storage_dir(self, episode_data: dict[str, np.ndarray]): + """Create the storage directory based on example episode data from the first `add_episodes` call.""" + assert ( + not self.storage_dir.exists() + ), "This method should only be called before the storage directory has been created." + + self._storage_dir.mkdir(parents=True, exist_ok=True) + + try: + # Make the data spec for np.memmap + data_spec = { + self.NEXT_INDEX_KEY: {"dtype": np.dtype("int64"), "shape": (1,)}, + self.OCCUPANCY_MASK_KEY: {"dtype": np.dtype("?"), "shape": (self._buffer_capacity,)}, + self.INDEX_KEY: {"dtype": np.dtype("int64"), "shape": (self._buffer_capacity,)}, + self.FRAME_INDEX_KEY: {"dtype": np.dtype("int64"), "shape": (self._buffer_capacity,)}, + self.EPISODE_INDEX_KEY: {"dtype": np.dtype("int64"), "shape": (self._buffer_capacity,)}, + # TODO(now): Should this just be float32? 
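+ # (Descriptive note: each entry maps a data key to the dtype/shape spec for its memmap; the
+ # user-provided modalities are added below with shape (buffer_capacity, *per_frame_shape).)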
+ self.TIMESTAMP_KEY: {"dtype": np.dtype("float64"), "shape": (self._buffer_capacity,)},
+ }
+ for k, v in episode_data.items():
+ if k in data_spec:
+ continue
+ data_spec[k] = {"dtype": v.dtype, "shape": (self._buffer_capacity, *v.shape[1:])}
+
+ self._make_memmaps(data_spec, "w+")
+ self._save_data_spec(data_spec)
+ except Exception as e:
+ # Attempt to clean up by removing the empty storage directory.
+ shutil.rmtree(self._storage_dir)
+ raise e
+
+ def _make_memmaps(self, data_spec: dict[str, dict], mode: str):
+ """Create the memmap buffer objects.
+
+ The underlying storage directory may or may not already exist. Provide the file opening `mode`
+ accordingly.
+ """
+ for k, v in data_spec.items():
+ self._data[k] = _make_memmap_safe(
+ filename=self._storage_dir / k,
+ dtype=v["dtype"] if v is not None else None,
+ mode=mode,
+ shape=tuple(v["shape"]) if v is not None else None,
+ )
+
+ @property
+ def delta_timestamps(self) -> dict[str, np.ndarray] | None:
+ return self._delta_timestamps
+
+ def set_delta_timestamps(self, value: dict[str, list[float]] | None):
+ """Set delta_timestamps, converting the values to numpy arrays.
+
+ Note: The conversion is an optimization for __getitem__: the loop there is much slower if lists
+ need to be converted into numpy arrays on every access.
+ """
+ if value is not None:
+ self._delta_timestamps = {k: np.array(v) for k, v in value.items()}
+ else:
+ self._delta_timestamps = None
+
+ def add_episodes(self, data: dict[str, np.ndarray]):
+ """Add data to the buffer.
+
+ `data` should have the same key-to-array mapping as the buffer. It should contain at least one
+ episode. The episodes should have frame indices that start from 0 and step up in increments of 1.
+
+ Episodes are added to the buffer one-by-one. If an episode has more frames than are available before
+ the end of the buffer, the pointer is reset to the start of the buffer and the episode is inserted
+ there, overwriting existing episode frames. For example, with a capacity of 10 and 8 frames already
+ stored, a 4-frame episode is written entirely to slots 0-3 rather than being split across the wrap
+ point.
+
+ When episode frames are overwritten by a new episode, by default, any remaining frames belonging to
+ the existing episode are left in place (meaning not all episodes will be guaranteed to start from
+ their frame 0).
+
+ After adding the episodes to the buffer, the buffer is flushed to disk.
+ """
+ for episode_index in np.unique(data[self.EPISODE_INDEX_KEY]):
+ where_episode = np.where(data[self.EPISODE_INDEX_KEY] == episode_index)[0]
+ episode_data = {k: data[k][where_episode] for k in data}
+ self._add_episode(episode_data)
+
+ self.flush()
+
+ def _add_episode(self, data: dict[str, np.ndarray]):
+ """Add data for a single episode to the buffer."""
+ if len(self._data) == 0:
+ self._make_storage_dir(data)
+
+ if len(missing_keys := (set(self.data_keys).difference(set(data)))) > 0:
+ raise ValueError(f"Missing data keys: {missing_keys}")
+ new_data_length = len(data[self.data_keys[0]])
+ if new_data_length <= 0:
+ raise ValueError("The episode has 0 frames")
+ if new_data_length > self._buffer_capacity:
+ raise ValueError("The episode length is larger than the buffer capacity.")
+ if not all(len(data[k]) == new_data_length for k in self.data_keys):
+ raise ValueError("All data items should have the same length")
+ if not np.all(data[self.EPISODE_INDEX_KEY] == data[self.EPISODE_INDEX_KEY][0]):
+ raise ValueError(
+ "New data should only contain one episode but there is more than one unique episode index."
+ ) + if not np.array_equal(data[self.FRAME_INDEX_KEY], np.arange(new_data_length)): + raise ValueError( + "Expected frame indices to start from 0 and step up in increments of 1 per frame." + ) + + # Figure out where we need to start filling data next, and make sure we continue data and episode + # indices. + next_index = self._data[self.NEXT_INDEX_KEY][0] + if self.num_samples > 0: + last_episode_index = self._data[self.EPISODE_INDEX_KEY][next_index - 1] + last_data_index = self._data[self.INDEX_KEY][next_index - 1] + else: + last_episode_index = -1 + last_data_index = -1 + # If there aren't enough slots in the buffer left to accommodate the episode, wrap to the start. + if max(0, new_data_length - (self._buffer_capacity - next_index)) > 0: + next_index = 0 + + # Insert the new data starting from next_index. + for k in self.data_keys: + slc = slice(next_index, next_index + new_data_length) + if k == self.EPISODE_INDEX_KEY: + self._data[k][slc] = last_episode_index + 1 + elif k == self.INDEX_KEY: + self._data[k][slc] = np.arange(last_data_index + 1, last_data_index + 1 + new_data_length) + else: + self._data[k][slc] = data[k] + self._data[self.OCCUPANCY_MASK_KEY][slc] = True + + # Update the data pointer. + self._data[self.NEXT_INDEX_KEY][0] = next_index + new_data_length + + def flush(self): + """Save the data to disk. + + `np.memmap`s keep a portion of the data mirrored in memory. Updates to the in-memory data are not + immediately reflected on disk. Call this method to explicitly save the updates to disk. + """ + for k in self._data: + self._data[k].flush() + + @staticmethod + def _item_to_tensors(item: dict) -> dict: + item_ = {} + for k, v in item.items(): + item_[k] = torch.from_numpy(v) + return item_ + + def _optimized_advanced_slice(self, data_key: str, indices: np.ndarray) -> np.ndarray: + """Convert advanced slicing to basic slicing by finding contiguous ranges in the requested indices. + + TODO(now): Is this needed? + """ + indices_diff = np.diff(indices, prepend=indices[0] - 1) + where_not_1 = np.where(indices_diff != 1)[0] + ptr = 0 + ret = [] + for ix in chain(where_not_1, [len(indices)]): + ret.append(self._data[data_key][indices[ptr] : indices[ix - 1] + 1]) + ptr = ix + + # Avoid creating a copy with concatenate if possible. + return np.concatenate(ret) if len(ret) > 1 else ret[0] + + def __len__(self): + return self.num_samples + + def __getitem__(self, idx: int) -> dict[str, torch.Tensor]: + if idx >= len(self) or idx < -len(self): + raise IndexError + + item = {k: v[idx] for k, v in self._data.items() if not k.startswith("_")} + + if self.delta_timestamps is not None: + episode_index = item[self.EPISODE_INDEX_KEY] + current_ts = item[self.TIMESTAMP_KEY] + episode_data_indices = np.where( + np.bitwise_and( + self._data[self.EPISODE_INDEX_KEY] == episode_index, + self._data[self.OCCUPANCY_MASK_KEY], + ) + )[0] + episode_timestamps = self._data[self.TIMESTAMP_KEY][episode_data_indices] + + if self.is_video_dataset: + video_timestamps = {} # TODO(now): HACK + + for data_key in self.delta_timestamps: + # Get timestamps used as query to retrieve data of previous/future frames. + query_ts = current_ts + self.delta_timestamps[data_key] + + # Compute distances between each query timestamp and all timestamps of all the frames + # belonging to the episode. 
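+ # (Illustrative example, assuming fps=10 and an episode with timestamps
+ # [0.0, 0.1, 0.2, 0.3, 0.4]: from the frame at 0.2, a delta of -0.2 queries 0.0 and matches
+ # frame 0 exactly, while a delta of -0.3 queries -0.1, whose nearest frame is 0.1 s away,
+ # beyond tolerance_s, so that slot is marked as padding below.)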
+ dist = np.abs(query_ts[:, None] - episode_timestamps[None, :]) + argmin_ = np.argmin(dist, axis=1) + min_ = dist[np.arange(dist.shape[0]), argmin_] + + is_pad = min_ > self.tolerance_s + + # Check violated query timestamps are all outside the episode range. + if not ( + (query_ts[is_pad] < episode_timestamps[0]) | (episode_timestamps[-1] < query_ts[is_pad]) + ).all(): + raise TimestampOutsideToleranceError( + f"One or several timestamps unexpectedly violate the tolerance ({min_} > " + f"{self.tolerance_s=}) inside the episode range." + ) + + if self.is_video_dataset and data_key.startswith("observation.image"): + video_timestamps[data_key] = self._data[self.TIMESTAMP_KEY][episode_data_indices[argmin_]] + + # Load frames for this data key. + if np.any(np.diff(argmin_) != 1): + item[data_key] = self._data[data_key][episode_data_indices[argmin_]] + # item[data_key] = self._optimized_advanced_slice(data_key, episode_data_indices[argmin_]) + else: + # Do basic slicing where possible + item[data_key] = self._data[data_key][ + episode_data_indices[argmin_.min()] : episode_data_indices[argmin_.max()] + 1 + ] + + item[f"{data_key}{self.IS_PAD_POSTFIX}"] = is_pad + + if self.is_video_dataset: + item_ = dict(item) + for k in self.video_frame_keys: # TODO(now): HACK + if self.delta_timestamps is None: + item_[k] = {"path": item[k].decode(), "timestamp": float(item[self.TIMESTAMP_KEY])} + else: + query_ts = current_ts + self.delta_timestamps[k] + item_[k] = [ + {"path": item[k][i].decode(), "timestamp": float(video_timestamps[k][i])} + for i in range(len(item[k])) + ] + item = load_from_videos( + item_, + self.video_frame_keys, + self.videos_dir, + self.tolerance_s, + self.video_backend, + ) + + if self.image_transform is not None: + for cam in self.camera_keys: + item[cam] = self.image_transform(item[cam]) + + return self._item_to_tensors(item) + + @classmethod + def from_huggingface_hub( + cls, + repo_id: str, + decode_video: bool = False, + root: Path | None = DATA_DIR, + **kwargs, + ) -> "DataBuffer": + """Create a DataBuffer from a data repository on the Hugging Face Hub. + + NOTE: If the DataBuffer already exists in /tmp, this function will reuse it rather than creating a new + one. + + Args: + repo_id: The dataset repository ID. + decode_video: If repo_id refers to a video dataset (the image observations are encoded as videos), + decode the videos and store the frames in a numpy memmap. + root: (will be deprecated) Directory to load the dataset from, instead of the hub. + **kwargs: Other arguments to `self.__init__` except for the `data_spec` and + `buffer_capacity` arguments which are inferred automatically. `storage_dir` is set to + `/tmp/{repo_id}_{hf_dataset._fingerprint}_{decoded?}` unless provided explicitly. + Returns: + The resulting DataBuffer object. + + TODO(now): Consider populating the buffer one episode at a time in order not to kill RAM. + TODO(now): Reuse previously saved? Maybe I need a hash. 
+ """ + for k in ["data_spec", "buffer_capacity"]: + if k in kwargs: + raise ValueError(f"`{k}` should not be provided as it is inferred from the hub dataset.") + + hf_dataset = load_hf_dataset(repo_id, version=CODEBASE_VERSION, root=root, split="train") + hf_dataset.set_transform(lambda x: x) + kwargs.setdefault( + "storage_dir", + DataBuffer._default_storage_dir_from_huggingface_hub( + repo_id, hf_dataset._fingerprint, decode_video + ), + ) + + buffer_already_on_disk = False + if kwargs["storage_dir"].exists(): + buffer_already_on_disk = True + + obj = cls( + **kwargs, + buffer_capacity=len(hf_dataset) if not buffer_already_on_disk else None, + ) + # If we have accessed an existing cached data buffer, just return the object as is. + if buffer_already_on_disk: + return obj + + # Get some metadata necessary for processing videos. + lerobot_dataset_info = load_info(repo_id, version=CODEBASE_VERSION, root=root) + is_video_dataset = lerobot_dataset_info.get("video", False) + if not is_video_dataset and decode_video: + raise ValueError(f"The provided dataset is not a video dataset but you have {decode_video=}") + if is_video_dataset: + videos_path = load_videos(repo_id, version=CODEBASE_VERSION, root=root) + + # Populate the buffer with the data from the dataset. + data_dict = {} + for k, feature in hf_dataset.features.items(): + if isinstance(feature, datasets.features.Image): + data_dict[k] = np.stack([np.array(pil_img) for pil_img in hf_dataset[k]]) + elif isinstance(feature, VideoFrame): + if decode_video: + # Decode all videos into images. + episode_indices = np.array(hf_dataset["episode_index"]) + timestamps = np.array(hf_dataset["timestamp"]) + all_imgs = [] + for episode_index in np.unique(episode_indices): + episode_data_indices = np.where(episode_indices == episode_index)[0] + episode_timestamps = timestamps[episode_indices == episode_index] + episode_imgs = decode_video_frames_torchvision( + videos_path.parent / hf_dataset[k][episode_data_indices[0]]["path"], + episode_timestamps, + 1 / lerobot_dataset_info["fps"] - 1e-4, + to_pytorch_format=False, + ) + all_imgs.extend(episode_imgs) + data_dict[k] = np.stack(all_imgs) + else: + data_dict[k] = np.stack( + [np.array(dct["path"], dtype=f"S{MAX_VIDEO_PATH_LENGTH}") for dct in hf_dataset[k]] + ) + else: + data_dict[k] = np.array(hf_dataset[k]) + obj.add_episodes(data_dict) + if is_video_dataset: + obj._is_video_dataset = True # TODO(now): HACK + # Symlink videos if needed. + obj.videos_dir = kwargs["storage_dir"] / "videos" + if not obj.videos_dir.exists(): + os.symlink(videos_path.absolute(), obj.videos_dir) + obj.videos_dir = kwargs["storage_dir"] / "videos" # TODO(now): HACK + obj.video_backend = "pyav" # TODO(now): HACK + + return obj + + @staticmethod + def _default_storage_dir_from_huggingface_hub(repo_id: str, fingerprint: str, decode_video: bool) -> Path: + """Create the default storage directory used for the `from_huggingface_hub` method. + + Note: This method is really meant for development / testing. + """ + return Path(f"/tmp/{repo_id}_{fingerprint}{'_decoded' if decode_video else ''}") + + +def compute_sampler_weights( + offline_dataset: LeRobotDataset, + offline_drop_n_last_frames: int = 0, + online_dataset: DataBuffer | None = None, + online_sampling_ratio: float | None = None, + online_drop_n_last_frames: int = 0, +) -> torch.Tensor: + """Compute the sampling weights for the online training dataloader in train.py. + + Args: + offline_dataset: The LeRobotDataset used for offline pre-training. 
+ offline_drop_n_last_frames: Number of frames to drop from the end of each offline dataset episode.
+ online_dataset: The DataBuffer used in online training.
+ online_sampling_ratio: The proportion of data that should be sampled from the online dataset. If an
+ online dataset is provided, this value must also be provided.
+ online_drop_n_last_frames: See `offline_drop_n_last_frames`. This is the same, but for the online
+ dataset.
+ Returns:
+ Tensor of weights for [offline_dataset; online_dataset], normalized to 1.
+
+ Notes to maintainers:
+ - This duplicates some logic from EpisodeAwareSampler. We should consider converging to one approach.
+ - When used with `torch.utils.data.WeightedRandomSampler`, it could completely replace
+ `EpisodeAwareSampler` as the online dataset related arguments are optional. The only missing feature
+ is the ability to turn shuffling off.
+ - Options `drop_first_n_frames` and `episode_indices_to_use` can be added easily. They were not
+ included here to avoid adding complexity.
+ """
+ if len(offline_dataset) == 0 and (online_dataset is None or len(online_dataset) == 0):
+ raise ValueError("At least one of `offline_dataset` or `online_dataset` should contain data.")
+ if (online_dataset is None) ^ (online_sampling_ratio is None):
+ raise ValueError(
+ "`online_dataset` and `online_sampling_ratio` must be provided together or not at all."
+ )
+ offline_sampling_ratio = 0 if online_sampling_ratio is None else 1 - online_sampling_ratio
+
+ weights = []
+
+ if len(offline_dataset) > 0:
+ offline_data_mask_indices = []
+ for start_index, end_index in zip(
+ offline_dataset.episode_data_index["from"],
+ offline_dataset.episode_data_index["to"],
+ strict=True,
+ ):
+ offline_data_mask_indices.extend(
+ range(start_index.item(), end_index.item() - offline_drop_n_last_frames)
+ )
+ offline_data_mask = torch.zeros(len(offline_dataset), dtype=torch.bool)
+ offline_data_mask[torch.tensor(offline_data_mask_indices)] = True
+ weights.append(
+ torch.full(
+ size=(len(offline_dataset),),
+ fill_value=offline_sampling_ratio / offline_data_mask.sum(),
+ )
+ * offline_data_mask
+ )
+
+ if online_dataset is not None and len(online_dataset) > 0:
+ online_data_mask_indices = []
+ episode_indices = online_dataset.get_data_by_key("episode_index")
+ for episode_idx in torch.unique(episode_indices):
+ where_episode = torch.where(episode_indices == episode_idx)
+ start_index = where_episode[0][0]
+ end_index = where_episode[0][-1] + 1
+ online_data_mask_indices.extend(
+ range(start_index.item(), end_index.item() - online_drop_n_last_frames)
+ )
+ online_data_mask = torch.zeros(len(online_dataset), dtype=torch.bool)
+ online_data_mask[torch.tensor(online_data_mask_indices)] = True
+ weights.append(
+ torch.full(
+ size=(len(online_dataset),),
+ fill_value=online_sampling_ratio / online_data_mask.sum(),
+ )
+ * online_data_mask
+ )
+
+ weights = torch.cat(weights)
+
+ if weights.sum() == 0:
+ weights += 1 / len(weights)
+ else:
+ weights /= weights.sum()
+
+ return weights
diff --git a/tests/test_data_buffer.py b/tests/test_data_buffer.py
new file mode 100644
index 000000000..d21ff1077
--- /dev/null
+++ b/tests/test_data_buffer.py
@@ -0,0 +1,395 @@
+#!/usr/bin/env python
+
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.d +from copy import deepcopy +from pathlib import Path +from uuid import uuid4 + +import datasets +import numpy as np +import pytest +import torch + +from lerobot.common.datasets.data_buffer import ( + DataBuffer, + TimestampOutsideToleranceError, + compute_sampler_weights, +) +from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION, DATA_DIR, LeRobotDataset +from lerobot.common.datasets.utils import hf_transform_to_torch, load_hf_dataset, load_info, load_videos +from lerobot.common.datasets.video_utils import VideoFrame, decode_video_frames_torchvision +from tests.utils import DevTestingError + +# Some constants for DataBuffer tests. +data_key = "data" +data_shape = (2, 3) # just some arbitrary > 1D shape +buffer_capacity = 100 +fps = 10 + + +def make_new_buffer( + write_dir: str | None = None, delta_timestamps: dict[str, list[float]] | None = None +) -> tuple[DataBuffer, str]: + if write_dir is None: + write_dir = f"/tmp/online_buffer_{uuid4().hex}" + buffer = DataBuffer( + write_dir, + data_spec={data_key: {"shape": data_shape, "dtype": np.dtype("float32")}}, + buffer_capacity=buffer_capacity, + fps=None if delta_timestamps is None else fps, + delta_timestamps=delta_timestamps, + ) + return buffer, write_dir + + +def make_spoof_data_frames(n_episodes: int, n_frames_per_episode: int) -> dict[str, np.ndarray]: + new_data = { + data_key: np.arange(n_frames_per_episode * n_episodes * np.prod(data_shape)).reshape(-1, *data_shape), + DataBuffer.INDEX_KEY: np.arange(n_frames_per_episode * n_episodes), + DataBuffer.EPISODE_INDEX_KEY: np.repeat(np.arange(n_episodes), n_frames_per_episode), + DataBuffer.FRAME_INDEX_KEY: np.tile(np.arange(n_frames_per_episode), n_episodes), + DataBuffer.TIMESTAMP_KEY: np.tile(np.arange(n_frames_per_episode) / fps, n_episodes), + } + return new_data + + +def test_non_mutate(): + """Checks that the data provided to the add_data method is copied rather than passed by reference. + + This means that mutating the data in the buffer does not mutate the original data. + + NOTE: If this test fails, it means some of the other tests may be compromised. For example, we can't trust + a success case for `test_write_read`. + """ + buffer, _ = make_new_buffer() + new_data = make_spoof_data_frames(2, buffer_capacity // 4) + new_data_copy = deepcopy(new_data) + buffer.add_episodes(new_data) + buffer._data[data_key][:] += 1 + assert all(np.array_equal(new_data[k], new_data_copy[k]) for k in new_data) + + +def test_index_error_no_data(): + buffer, _ = make_new_buffer() + with pytest.raises(IndexError): + buffer[0] + + +def test_index_error_with_data(): + buffer, _ = make_new_buffer() + n_frames = buffer_capacity // 2 + new_data = make_spoof_data_frames(1, n_frames) + buffer.add_episodes(new_data) + with pytest.raises(IndexError): + buffer[n_frames] + with pytest.raises(IndexError): + buffer[-n_frames - 1] + + +@pytest.mark.parametrize("do_reload", [False, True]) +def test_write_read(do_reload: bool): + """Checks that data can be added to the buffer and read back. + + If do_reload we delete the buffer object and load the buffer back from disk before reading. 
+ """ + buffer, write_dir = make_new_buffer() + n_episodes = 2 + n_frames_per_episode = buffer_capacity // 4 + new_data = make_spoof_data_frames(n_episodes, n_frames_per_episode) + buffer.add_episodes(new_data) + + if do_reload: + del buffer + buffer, _ = make_new_buffer(write_dir) + + assert len(buffer) == n_frames_per_episode * n_episodes + for i, item in enumerate(buffer): + assert all(isinstance(item[k], torch.Tensor) for k in item) + assert np.array_equal(item[data_key].numpy(), new_data[data_key][i]) + + +def test_read_data_key(): + """Tests that data can be added to a buffer and all data for a. specific key can be read back.""" + buffer, _ = make_new_buffer() + n_episodes = 2 + n_frames_per_episode = buffer_capacity // 4 + new_data = make_spoof_data_frames(n_episodes, n_frames_per_episode) + buffer.add_episodes(new_data) + + data_from_buffer = buffer.get_data_by_key(data_key) + assert isinstance(data_from_buffer, torch.Tensor) + assert np.array_equal(data_from_buffer.numpy(), new_data[data_key]) + + +def test_fifo(): + """Checks that if data is added beyond the buffer capacity, we discard the oldest data first.""" + buffer, _ = make_new_buffer() + n_frames_per_episode = buffer_capacity // 4 + n_episodes = 3 + new_data = make_spoof_data_frames(n_episodes, n_frames_per_episode) + buffer.add_episodes(new_data) + n_more_episodes = 2 + # Developer sanity check (in case someone changes the global `buffer_capacity`). + assert ( + n_episodes + n_more_episodes + ) * n_frames_per_episode > buffer_capacity, "Something went wrong with the test code." + more_new_data = make_spoof_data_frames(n_more_episodes, n_frames_per_episode) + buffer.add_episodes(more_new_data) + assert len(buffer) == buffer_capacity, "The buffer should be full." + + expected_data = {} + for k in new_data: + # Concatenate, left-truncate, then roll, to imitate the cyclical FIFO pattern in DataBuffer. + expected_data[k] = np.roll( + np.concatenate([new_data[k], more_new_data[k]])[-buffer_capacity:], + shift=len(new_data[k]) + len(more_new_data[k]) - buffer_capacity, + axis=0, + ) + + for i, item in enumerate(buffer): + assert all(isinstance(item[k], torch.Tensor) for k in item) + assert np.array_equal(item[data_key].numpy(), expected_data[data_key][i]) + + +def test_delta_timestamps_within_tolerance(): + """Check that getting an item with delta_timestamps within tolerance succeeds. + + Note: Copied from `test_datasets.py::test_load_previous_and_future_frames_within_tolerance`. + """ + # Sanity check on global fps as we are assuming it is 10 here. + assert fps == 10, "This test assumes fps==10" + buffer, _ = make_new_buffer(delta_timestamps={"index": [-0.2, 0, 0.139]}) + new_data = make_spoof_data_frames(n_episodes=1, n_frames_per_episode=5) + buffer.add_episodes(new_data) + buffer.tolerance_s = 0.04 + item = buffer[2] + data, is_pad = item["index"], item[f"index{DataBuffer.IS_PAD_POSTFIX}"] + assert torch.allclose(data, torch.tensor([0, 2, 3])), "Data does not match expected values" + assert not is_pad.any(), "Unexpected padding detected" + + +def test_delta_timestamps_outside_tolerance_inside_episode_range(): + """Check that getting an item with delta_timestamps outside of tolerance fails. + + We expect it to fail if and only if the requested timestamps are within the episode range. + + Note: Copied from + `test_datasets.py::test_load_previous_and_future_frames_outside_tolerance_inside_episode_range` + """ + # Sanity check on global fps as we are assuming it is 10 here. 
+ assert fps == 10, "This test assumes fps==10" + buffer, _ = make_new_buffer(delta_timestamps={"index": [-0.2, 0, 0.141]}) + new_data = make_spoof_data_frames(n_episodes=1, n_frames_per_episode=5) + buffer.add_episodes(new_data) + buffer.tolerance_s = 0.04 + with pytest.raises(TimestampOutsideToleranceError): + buffer[2] + + +def test_delta_timestamps_outside_tolerance_outside_episode_range(): + """Check that copy-padding of timestamps outside of the episode range works. + + Note: Copied from + `test_datasets.py::test_load_previous_and_future_frames_outside_tolerance_outside_episode_range` + """ + # Sanity check on global fps as we are assuming it is 10 here. + assert fps == 10, "This test assumes fps==10" + buffer, _ = make_new_buffer(delta_timestamps={"index": [-0.3, -0.24, 0, 0.26, 0.3]}) + new_data = make_spoof_data_frames(n_episodes=1, n_frames_per_episode=5) + buffer.add_episodes(new_data) + buffer.tolerance_s = 0.04 + item = buffer[2] + data, is_pad = item["index"], item["index_is_pad"] + assert torch.equal(data, torch.tensor([0, 0, 2, 4, 4])), "Data does not match expected values" + assert torch.equal( + is_pad, torch.tensor([True, False, False, True, True]) + ), "Padding does not match expected values" + + +@pytest.mark.parametrize( + ("dataset_repo_id", "decode_video"), + ( + ("lerobot/pusht", True), + ("lerobot/pusht", False), + ("lerobot/pusht_image", False), + ), +) +def test_from_huggingface_hub(tmp_path: Path, dataset_repo_id: str, decode_video: bool): + """Check that we can make a buffer from a Hugging Face Hub dataset repository. + + Check that the buffer we make, accurately reflects the hub dataset. + """ + for iteration in range(2): # do it twice to check that running with an existing cached buffer also works + hf_dataset = load_hf_dataset(dataset_repo_id, version=CODEBASE_VERSION, root=DATA_DIR, split="train") + hf_dataset.set_transform(lambda x: x) + # Note: storage_dir specified explicitly in order to make use of pytest's temporary file fixture. + # This ensures that the first time this loop is run, the storage directory does not already exist. + storage_dir = tmp_path / DataBuffer._default_storage_dir_from_huggingface_hub( + dataset_repo_id, hf_dataset._fingerprint, decode_video + ).relative_to("/tmp") + if iteration == 0 and storage_dir.exists(): + raise DevTestingError("The storage directory should not exist for the first pass of this test.") + buffer = DataBuffer.from_huggingface_hub( + dataset_repo_id, + decode_video, + storage_dir=storage_dir, + ) + assert len(buffer) == len(hf_dataset) + for k, feature in hf_dataset.features.items(): + if isinstance(feature, datasets.features.Image): + assert np.array_equal( + buffer._data[k], np.stack([np.array(pil_img) for pil_img in hf_dataset[k]]) + ) + elif isinstance(feature, VideoFrame): + if decode_video: + # Decode the video here. 
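+ # (This intentionally mirrors the decoding path in DataBuffer.from_huggingface_hub, so the
+ # memmap contents can be compared frame-for-frame against a fresh decode.)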
+ lerobot_dataset_info = load_info(dataset_repo_id, version=CODEBASE_VERSION, root=DATA_DIR) + videos_path = load_videos(dataset_repo_id, version=CODEBASE_VERSION, root=DATA_DIR) + episode_indices = np.array(hf_dataset["episode_index"]) + timestamps = np.array(hf_dataset["timestamp"]) + all_imgs = [] + for episode_index in np.unique(episode_indices): + episode_data_indices = np.where(episode_indices == episode_index)[0] + episode_timestamps = timestamps[episode_indices == episode_index] + episode_imgs = decode_video_frames_torchvision( + videos_path.parent / hf_dataset[k][episode_data_indices[0]]["path"], + episode_timestamps, + 1 / lerobot_dataset_info["fps"] - 1e-4, + to_pytorch_format=False, + ) + all_imgs.extend(episode_imgs) + assert np.array_equal(buffer._data[k], all_imgs) + else: + # Check that the video paths are the same. + assert np.array_equal( + buffer._data[k], [item["path"].encode("ascii") for item in hf_dataset[k]] + ) + elif isinstance(feature, (datasets.features.Sequence, datasets.features.Value)): + assert np.array_equal(buffer._data[k], hf_dataset[k]) + else: + raise DevTestingError(f"Tests not implemented for this feature type: {type(feature)=}") + + +# Arbitrarily set small dataset sizes, making sure to have uneven sizes. +@pytest.mark.parametrize("offline_dataset_size", [0, 6]) +@pytest.mark.parametrize("online_dataset_size", [0, 4]) +@pytest.mark.parametrize("online_sampling_ratio", [0.0, 1.0]) +def test_compute_sampler_weights_trivial( + offline_dataset_size: int, online_dataset_size: int, online_sampling_ratio: float +): + # Pass/skip the test if both datasets sizes are zero. + if offline_dataset_size + online_dataset_size == 0: + return + # Create spoof offline dataset. + offline_dataset = LeRobotDataset.from_preloaded( + hf_dataset=datasets.Dataset.from_dict({"data": list(range(offline_dataset_size))}) + ) + offline_dataset.hf_dataset.set_transform(hf_transform_to_torch) + if offline_dataset_size == 0: + offline_dataset.episode_data_index = {} + else: + # Set up an episode_data_index with at least two episodes. + offline_dataset.episode_data_index = { + "from": torch.tensor([0, offline_dataset_size // 2]), + "to": torch.tensor([offline_dataset_size // 2, offline_dataset_size]), + } + # Create spoof online datset. + online_dataset, _ = make_new_buffer() + if online_dataset_size > 0: + online_dataset.add_episodes( + make_spoof_data_frames(n_episodes=2, n_frames_per_episode=online_dataset_size // 2) + ) + + weights = compute_sampler_weights( + offline_dataset, online_dataset=online_dataset, online_sampling_ratio=online_sampling_ratio + ) + if offline_dataset_size == 0 or online_dataset_size == 0: + expected_weights = torch.ones(offline_dataset_size + online_dataset_size) + elif online_sampling_ratio == 0: + expected_weights = torch.cat([torch.ones(offline_dataset_size), torch.zeros(online_dataset_size)]) + elif online_sampling_ratio == 1: + expected_weights = torch.cat([torch.zeros(offline_dataset_size), torch.ones(online_dataset_size)]) + expected_weights /= expected_weights.sum() + assert torch.allclose(weights, expected_weights) + + +def test_compute_sampler_weights_nontrivial_ratio(): + # Arbitrarily set small dataset sizes, making sure to have uneven sizes. + # Create spoof offline dataset. 
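+ # (For reference, the weights asserted below: the offline ratio 1 - 0.8 = 0.2 is spread over the 4
+ # offline frames, giving 0.05 each, and the online ratio 0.8 over the 8 online frames gives 0.1 each.)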
+ offline_dataset = LeRobotDataset.from_preloaded( + hf_dataset=datasets.Dataset.from_dict({"data": list(range(4))}) + ) + offline_dataset.hf_dataset.set_transform(hf_transform_to_torch) + offline_dataset.episode_data_index = { + "from": torch.tensor([0, 2]), + "to": torch.tensor([2, 4]), + } + # Create spoof online datset. + online_dataset, _ = make_new_buffer() + online_dataset.add_episodes(make_spoof_data_frames(n_episodes=4, n_frames_per_episode=2)) + online_sampling_ratio = 0.8 + weights = compute_sampler_weights( + offline_dataset, online_dataset=online_dataset, online_sampling_ratio=online_sampling_ratio + ) + assert torch.allclose( + weights, torch.tensor([0.05, 0.05, 0.05, 0.05, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]) + ) + + +def test_compute_sampler_weights_nontrivial_ratio_and_drop_last_n(): + # Arbitrarily set small dataset sizes, making sure to have uneven sizes. + # Create spoof offline dataset. + offline_dataset = LeRobotDataset.from_preloaded( + hf_dataset=datasets.Dataset.from_dict({"data": list(range(4))}) + ) + offline_dataset.hf_dataset.set_transform(hf_transform_to_torch) + offline_dataset.episode_data_index = { + "from": torch.tensor([0]), + "to": torch.tensor([4]), + } + # Create spoof online datset. + online_dataset, _ = make_new_buffer() + online_dataset.add_episodes(make_spoof_data_frames(n_episodes=4, n_frames_per_episode=2)) + weights = compute_sampler_weights( + offline_dataset, online_dataset=online_dataset, online_sampling_ratio=0.8, online_drop_n_last_frames=1 + ) + assert torch.allclose( + weights, torch.tensor([0.05, 0.05, 0.05, 0.05, 0.2, 0.0, 0.2, 0.0, 0.2, 0.0, 0.2, 0.0]) + ) + + +def test_compute_sampler_weights_drop_n_last_frames(): + """Note: test copied from test_sampler.""" + data_dict = { + "timestamp": [0, 0.1], + "index": [0, 1], + "episode_index": [0, 0], + "frame_index": [0, 1], + } + offline_dataset = LeRobotDataset.from_preloaded(hf_dataset=datasets.Dataset.from_dict(data_dict)) + offline_dataset.hf_dataset.set_transform(hf_transform_to_torch) + offline_dataset.episode_data_index = {"from": torch.tensor([0]), "to": torch.tensor([2])} + + online_dataset, _ = make_new_buffer() + online_dataset.add_episodes(make_spoof_data_frames(n_episodes=4, n_frames_per_episode=2)) + + weights = compute_sampler_weights( + offline_dataset, + offline_drop_n_last_frames=1, + online_dataset=online_dataset, + online_sampling_ratio=0.5, + online_drop_n_last_frames=1, + ) + assert torch.allclose(weights, torch.tensor([0.5, 0, 0.125, 0, 0.125, 0, 0.125, 0, 0.125, 0])) From 51609febc41371b7e27b55c59ee64302a346a368 Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Wed, 18 Sep 2024 13:34:59 +0100 Subject: [PATCH 08/29] All test_data_buffer passing --- lerobot/common/datasets/data_buffer.py | 16 +- tests/test_data_buffer.py | 206 ++++++++++++------------- 2 files changed, 108 insertions(+), 114 deletions(-) diff --git a/lerobot/common/datasets/data_buffer.py b/lerobot/common/datasets/data_buffer.py index 49555952c..9617a4884 100644 --- a/lerobot/common/datasets/data_buffer.py +++ b/lerobot/common/datasets/data_buffer.py @@ -382,7 +382,10 @@ def flush(self): def _item_to_tensors(item: dict) -> dict: item_ = {} for k, v in item.items(): - item_[k] = torch.from_numpy(v) + if isinstance(v, np.ndarray): + item_[k] = torch.from_numpy(v) + else: + item_[k] = torch.tensor(item[k]) return item_ def _optimized_advanced_slice(self, data_key: str, indices: np.ndarray) -> np.ndarray: @@ -404,6 +407,10 @@ def _optimized_advanced_slice(self, data_key: str, indices: np.ndarray) -> 
np.nd def __len__(self): return self.num_samples + def get_data_by_key(self, key: str) -> np.ndarray: + """Returns all data for a given data key (where the data is valid).""" + return self._data[key][self._data[self.OCCUPANCY_MASK_KEY]] + def __getitem__(self, idx: int) -> dict[str, torch.Tensor]: if idx >= len(self) or idx < -len(self): raise IndexError @@ -479,6 +486,7 @@ def __getitem__(self, idx: int) -> dict[str, torch.Tensor]: self.video_backend, ) + # TODO(now): Where to put transform that goes to torch format for images. if self.image_transform is not None: for cam in self.camera_keys: item[cam] = self.image_transform(item[cam]) @@ -654,9 +662,9 @@ def compute_sampler_weights( if online_dataset is not None and len(online_dataset) > 0: online_data_mask_indices = [] - episode_indices = online_dataset.get_data_by_key("episode_index") - for episode_idx in torch.unique(episode_indices): - where_episode = torch.where(episode_indices == episode_idx) + episode_indices = online_dataset.get_data_by_key(DataBuffer.EPISODE_INDEX_KEY) + for episode_idx in np.unique(episode_indices): + where_episode = np.where(episode_indices == episode_idx) start_index = where_episode[0][0] end_index = where_episode[0][-1] + 1 online_data_mask_indices.extend( diff --git a/tests/test_data_buffer.py b/tests/test_data_buffer.py index d21ff1077..99f8030bd 100644 --- a/tests/test_data_buffer.py +++ b/tests/test_data_buffer.py @@ -40,18 +40,15 @@ def make_new_buffer( - write_dir: str | None = None, delta_timestamps: dict[str, list[float]] | None = None -) -> tuple[DataBuffer, str]: - if write_dir is None: - write_dir = f"/tmp/online_buffer_{uuid4().hex}" + storage_dir: str, storage_dir_exists: bool = False, delta_timestamps: dict[str, list[float]] | None = None +) -> DataBuffer: buffer = DataBuffer( - write_dir, - data_spec={data_key: {"shape": data_shape, "dtype": np.dtype("float32")}}, - buffer_capacity=buffer_capacity, + storage_dir, + buffer_capacity=buffer_capacity if not storage_dir_exists else None, fps=None if delta_timestamps is None else fps, delta_timestamps=delta_timestamps, ) - return buffer, write_dir + return buffer def make_spoof_data_frames(n_episodes: int, n_frames_per_episode: int) -> dict[str, np.ndarray]: @@ -65,7 +62,7 @@ def make_spoof_data_frames(n_episodes: int, n_frames_per_episode: int) -> dict[s return new_data -def test_non_mutate(): +def test_non_mutate(tmp_path: Path): """Checks that the data provided to the add_data method is copied rather than passed by reference. This means that mutating the data in the buffer does not mutate the original data. @@ -73,23 +70,25 @@ def test_non_mutate(): NOTE: If this test fails, it means some of the other tests may be compromised. For example, we can't trust a success case for `test_write_read`. """ - buffer, _ = make_new_buffer() + buffer = make_new_buffer(tmp_path / f"buffer_{uuid4().hex}") + # Note: choices for args to make_spoof_data_frames are arbitrary. 
new_data = make_spoof_data_frames(2, buffer_capacity // 4) new_data_copy = deepcopy(new_data) buffer.add_episodes(new_data) - buffer._data[data_key][:] += 1 + buffer.get_data_by_key(data_key)[:] += 1 assert all(np.array_equal(new_data[k], new_data_copy[k]) for k in new_data) -def test_index_error_no_data(): - buffer, _ = make_new_buffer() +def test_index_error_no_data(tmp_path: Path): + buffer = make_new_buffer(tmp_path / f"buffer_{uuid4().hex}") with pytest.raises(IndexError): buffer[0] -def test_index_error_with_data(): - buffer, _ = make_new_buffer() +def test_index_error_with_data(tmp_path: Path): + buffer = make_new_buffer(tmp_path / f"buffer_{uuid4().hex}") n_frames = buffer_capacity // 2 + # Note: choices for args to make_spoof_data_frames are arbitrary. new_data = make_spoof_data_frames(1, n_frames) buffer.add_episodes(new_data) with pytest.raises(IndexError): @@ -99,12 +98,14 @@ def test_index_error_with_data(): @pytest.mark.parametrize("do_reload", [False, True]) -def test_write_read(do_reload: bool): +def test_write_read(tmp_path: Path, do_reload: bool): """Checks that data can be added to the buffer and read back. If do_reload we delete the buffer object and load the buffer back from disk before reading. """ - buffer, write_dir = make_new_buffer() + storage_dir = tmp_path / f"buffer_{uuid4().hex}" + buffer = make_new_buffer(storage_dir) + # Note: choices for args to make_spoof_data_frames are arbitrary. n_episodes = 2 n_frames_per_episode = buffer_capacity // 4 new_data = make_spoof_data_frames(n_episodes, n_frames_per_episode) @@ -112,7 +113,7 @@ def test_write_read(do_reload: bool): if do_reload: del buffer - buffer, _ = make_new_buffer(write_dir) + buffer = make_new_buffer(storage_dir, storage_dir_exists=True) assert len(buffer) == n_frames_per_episode * n_episodes for i, item in enumerate(buffer): @@ -120,57 +121,67 @@ def test_write_read(do_reload: bool): assert np.array_equal(item[data_key].numpy(), new_data[data_key][i]) -def test_read_data_key(): - """Tests that data can be added to a buffer and all data for a. specific key can be read back.""" - buffer, _ = make_new_buffer() - n_episodes = 2 - n_frames_per_episode = buffer_capacity // 4 - new_data = make_spoof_data_frames(n_episodes, n_frames_per_episode) - buffer.add_episodes(new_data) - - data_from_buffer = buffer.get_data_by_key(data_key) - assert isinstance(data_from_buffer, torch.Tensor) - assert np.array_equal(data_from_buffer.numpy(), new_data[data_key]) - - -def test_fifo(): +def test_fifo(tmp_path: Path): """Checks that if data is added beyond the buffer capacity, we discard the oldest data first.""" - buffer, _ = make_new_buffer() - n_frames_per_episode = buffer_capacity // 4 + buffer = make_new_buffer(tmp_path / f"buffer_{uuid4().hex}") + # Note: choices for args to make_spoof_data_frames are mostly arbitrary. Of interest is: + # - later we need `n_more_episodes` to cause an overflow. + # - we need that overflow to happen *within* an episode such that we're testing the behavior whereby the + # whole episode is wrapped to the start of the buffer, even if part of it can fit. 
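
As context for the wrap-around comment above: when an incoming episode does not fit in the space remaining at the end of the buffer, the whole episode is written from index 0 instead, overwriting the oldest data. A minimal numpy sketch of that policy (a plain array stands in for the memmap-backed storage; this is not the DataBuffer implementation):

import numpy as np

def add_episode(buffer: np.ndarray, next_index: int, episode: np.ndarray) -> int:
    """Write `episode` into `buffer` starting at `next_index`, wrapping whole
    episodes to index 0 when they don't fit. Returns the new write pointer."""
    n = len(episode)
    if n > len(buffer):
        raise ValueError("Episode is longer than the buffer capacity.")
    if next_index + n > len(buffer):
        # Not enough room at the end: wrap the *whole* episode to the start,
        # even if part of it could have fit.
        next_index = 0
    buffer[next_index : next_index + n] = episode
    return next_index + n

buffer = np.zeros(10)
ptr = 0
for ep in [np.full(4, 1.0), np.full(4, 2.0), np.full(4, 3.0)]:
    ptr = add_episode(buffer, ptr, ep)
# The third episode wrapped to the start: [3, 3, 3, 3, 2, 2, 2, 2, 0, 0]
print(buffer)
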
+ n_frames_per_episode = buffer_capacity // 4 + 2 n_episodes = 3 - new_data = make_spoof_data_frames(n_episodes, n_frames_per_episode) - buffer.add_episodes(new_data) + if buffer_capacity - n_frames_per_episode * n_episodes >= n_frames_per_episode: + raise DevTestingError("Make sure to set this up such that adding another episode causes an overflow.") + new_episodes = make_spoof_data_frames(n_episodes, n_frames_per_episode) + buffer.add_episodes(new_episodes) + # Note `n_more_episodes` is chosen to result in an overflow on the first episode. n_more_episodes = 2 - # Developer sanity check (in case someone changes the global `buffer_capacity`). + # Make this slightly larger than the prior episodes to test that there is no issue with overwriting the + # start of an existing episode. + n_frames_per_more_episodes = n_frames_per_episode + 1 + more_new_episodes = make_spoof_data_frames(n_more_episodes, n_frames_per_more_episodes) + buffer.add_episodes(more_new_episodes) assert ( - n_episodes + n_more_episodes - ) * n_frames_per_episode > buffer_capacity, "Something went wrong with the test code." - more_new_data = make_spoof_data_frames(n_more_episodes, n_frames_per_episode) - buffer.add_episodes(more_new_data) - assert len(buffer) == buffer_capacity, "The buffer should be full." + len(buffer) == n_frames_per_episode * n_episodes + ), "The new episode should have wrapped around to the start" expected_data = {} - for k in new_data: - # Concatenate, left-truncate, then roll, to imitate the cyclical FIFO pattern in DataBuffer. - expected_data[k] = np.roll( - np.concatenate([new_data[k], more_new_data[k]])[-buffer_capacity:], - shift=len(new_data[k]) + len(more_new_data[k]) - buffer_capacity, - axis=0, - ) + for k in new_episodes: + expected_data[k] = new_episodes[k] + # The extra new episode should overwrite the start of the buffer. + expected_data[k][: len(more_new_episodes[k])] = more_new_episodes[k] for i, item in enumerate(buffer): - assert all(isinstance(item[k], torch.Tensor) for k in item) assert np.array_equal(item[data_key].numpy(), expected_data[data_key][i]) -def test_delta_timestamps_within_tolerance(): - """Check that getting an item with delta_timestamps within tolerance succeeds. +def test_get_data_by_key(tmp_path: Path): + """Tests that data can be added to a buffer and all data for a specific key can be read back.""" + buffer = make_new_buffer(tmp_path / f"buffer_{uuid4().hex}") + # Note: choices for args to make_spoof_data_frames are mostly arbitrary. The only intentional aspect is to + # make sure the buffer is not full, in order to check that `get_data_by_key` only returns the part of the + # buffer that is occupied. + n_episodes = 2 + n_frames_per_episode = buffer_capacity // 4 + new_episodes = make_spoof_data_frames(n_episodes, n_frames_per_episode) + buffer.add_episodes(new_episodes) - Note: Copied from `test_datasets.py::test_load_previous_and_future_frames_within_tolerance`. - """ - # Sanity check on global fps as we are assuming it is 10 here. 
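
A quick illustration of the occupancy-mask bookkeeping that `get_data_by_key` depends on. Since the storage is pre-allocated to full capacity, a boolean mask is needed to tell real frames apart from unwritten slots. This sketch uses plain numpy arrays and a made-up two-key layout rather than the actual memmaps:

import numpy as np

capacity = 8
data = {
    "index": np.zeros(capacity, dtype=np.int64),
    "_occupancy_mask": np.zeros(capacity, dtype=bool),  # True where a real frame was written
}

# Write 5 real frames into the pre-allocated storage.
n_written = 5
data["index"][:n_written] = np.arange(n_written)
data["_occupancy_mask"][:n_written] = True

# Read back only the valid region, as get_data_by_key does.
valid = data["index"][data["_occupancy_mask"]]
assert np.array_equal(valid, np.arange(n_written))
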
-    assert fps == 10, "This test assumes fps==10"
-    buffer, _ = make_new_buffer(delta_timestamps={"index": [-0.2, 0, 0.139]})
+    data_from_buffer = buffer.get_data_by_key(data_key)
+    assert np.array_equal(data_from_buffer, new_episodes[data_key])
+
+
+def test_getter_return_tensors(tmp_path: Path):
+    buffer = make_new_buffer(tmp_path / f"buffer_{uuid4().hex}")
+    for item in buffer:
+        for k in item:
+            assert isinstance(item[k], torch.tensor)
+
+
+def test_delta_timestamps_within_tolerance(tmp_path: Path):
+    """Check that getting an item with delta_timestamps within tolerance succeeds."""
+    if fps != 10:
+        raise DevTestingError("This test is designed to use fps == 10.")
+    buffer = make_new_buffer(tmp_path / f"buffer_{uuid4().hex}", delta_timestamps={"index": [-0.2, 0, 0.139]})
     new_data = make_spoof_data_frames(n_episodes=1, n_frames_per_episode=5)
     buffer.add_episodes(new_data)
     buffer.tolerance_s = 0.04
@@ -180,17 +191,14 @@ def test_delta_timestamps_within_tolerance():
     assert not is_pad.any(), "Unexpected padding detected"
 
 
-def test_delta_timestamps_outside_tolerance_inside_episode_range():
+def test_delta_timestamps_outside_tolerance_inside_episode_range(tmp_path: Path):
     """Check that getting an item with delta_timestamps outside of tolerance fails.
 
     We expect it to fail if and only if the requested timestamps are within the episode range.
-
-    Note: Copied from
-    `test_datasets.py::test_load_previous_and_future_frames_outside_tolerance_inside_episode_range`
     """
-    # Sanity check on global fps as we are assuming it is 10 here.
-    assert fps == 10, "This test assumes fps==10"
-    buffer, _ = make_new_buffer(delta_timestamps={"index": [-0.2, 0, 0.141]})
+    if fps != 10:
+        raise DevTestingError("This test is designed to use fps == 10.")
+    buffer = make_new_buffer(tmp_path / f"buffer_{uuid4().hex}", delta_timestamps={"index": [-0.2, 0, 0.141]})
     new_data = make_spoof_data_frames(n_episodes=1, n_frames_per_episode=5)
     buffer.add_episodes(new_data)
     buffer.tolerance_s = 0.04
@@ -198,15 +206,13 @@ def test_delta_timestamps_outside_tolerance_inside_episode_range():
     buffer[2]
 
 
-def test_delta_timestamps_outside_tolerance_outside_episode_range():
-    """Check that copy-padding of timestamps outside of the episode range works.
-
-    Note: Copied from
-    `test_datasets.py::test_load_previous_and_future_frames_outside_tolerance_outside_episode_range`
-    """
-    # Sanity check on global fps as we are assuming it is 10 here.
-    assert fps == 10, "This test assumes fps==10"
-    buffer, _ = make_new_buffer(delta_timestamps={"index": [-0.3, -0.24, 0, 0.26, 0.3]})
+def test_delta_timestamps_outside_tolerance_outside_episode_range(tmp_path):
+    """Check that copy-padding of timestamps outside of the episode range works."""
+    if fps != 10:
+        raise DevTestingError("This test is designed to use fps == 10.")
+    buffer = make_new_buffer(
+        tmp_path / f"buffer_{uuid4().hex}", delta_timestamps={"index": [-0.3, -0.24, 0, 0.26, 0.3]}
+    )
     new_data = make_spoof_data_frames(n_episodes=1, n_frames_per_episode=5)
     buffer.add_episodes(new_data)
     buffer.tolerance_s = 0.04
@@ -227,9 +233,11 @@ def test_delta_timestamps_outside_tolerance_outside_episode_range():
     ),
 )
 def test_from_huggingface_hub(tmp_path: Path, dataset_repo_id: str, decode_video: bool):
-    """Check that we can make a buffer from a Hugging Face Hub dataset repository.
-
-    Check that the buffer we make, accurately reflects the hub dataset.
+    """
+    Check that:
+    - We can make a buffer from a Hugging Face Hub dataset repository.
+    - The buffer we make accurately reflects the hub dataset.
+ - If we try to make it a second time, everything still works as expected. """ for iteration in range(2): # do it twice to check that running with an existing cached buffer also works hf_dataset = load_hf_dataset(dataset_repo_id, version=CODEBASE_VERSION, root=DATA_DIR, split="train") @@ -250,7 +258,7 @@ def test_from_huggingface_hub(tmp_path: Path, dataset_repo_id: str, decode_video for k, feature in hf_dataset.features.items(): if isinstance(feature, datasets.features.Image): assert np.array_equal( - buffer._data[k], np.stack([np.array(pil_img) for pil_img in hf_dataset[k]]) + buffer.get_data_by_key(k), np.stack([np.array(pil_img) for pil_img in hf_dataset[k]]) ) elif isinstance(feature, VideoFrame): if decode_video: @@ -270,16 +278,16 @@ def test_from_huggingface_hub(tmp_path: Path, dataset_repo_id: str, decode_video to_pytorch_format=False, ) all_imgs.extend(episode_imgs) - assert np.array_equal(buffer._data[k], all_imgs) + assert np.array_equal(buffer.get_data_by_key(k), all_imgs) else: # Check that the video paths are the same. assert np.array_equal( - buffer._data[k], [item["path"].encode("ascii") for item in hf_dataset[k]] + buffer.get_data_by_key(k), [item["path"].encode("ascii") for item in hf_dataset[k]] ) elif isinstance(feature, (datasets.features.Sequence, datasets.features.Value)): - assert np.array_equal(buffer._data[k], hf_dataset[k]) + assert np.array_equal(buffer.get_data_by_key(k), hf_dataset[k]) else: - raise DevTestingError(f"Tests not implemented for this feature type: {type(feature)=}") + raise DevTestingError(f"Tests not implemented for this feature type: {type(feature)=}.") # Arbitrarily set small dataset sizes, making sure to have uneven sizes. @@ -287,7 +295,7 @@ def test_from_huggingface_hub(tmp_path: Path, dataset_repo_id: str, decode_video @pytest.mark.parametrize("online_dataset_size", [0, 4]) @pytest.mark.parametrize("online_sampling_ratio", [0.0, 1.0]) def test_compute_sampler_weights_trivial( - offline_dataset_size: int, online_dataset_size: int, online_sampling_ratio: float + tmp_path: Path, offline_dataset_size: int, online_dataset_size: int, online_sampling_ratio: float ): # Pass/skip the test if both datasets sizes are zero. if offline_dataset_size + online_dataset_size == 0: @@ -306,7 +314,7 @@ def test_compute_sampler_weights_trivial( "to": torch.tensor([offline_dataset_size // 2, offline_dataset_size]), } # Create spoof online datset. - online_dataset, _ = make_new_buffer() + online_dataset = make_new_buffer(tmp_path / f"buffer_{uuid4().hex}") if online_dataset_size > 0: online_dataset.add_episodes( make_spoof_data_frames(n_episodes=2, n_frames_per_episode=online_dataset_size // 2) @@ -325,7 +333,7 @@ def test_compute_sampler_weights_trivial( assert torch.allclose(weights, expected_weights) -def test_compute_sampler_weights_nontrivial_ratio(): +def test_compute_sampler_weights_nontrivial_ratio(tmp_path: Path): # Arbitrarily set small dataset sizes, making sure to have uneven sizes. # Create spoof offline dataset. offline_dataset = LeRobotDataset.from_preloaded( @@ -337,7 +345,7 @@ def test_compute_sampler_weights_nontrivial_ratio(): "to": torch.tensor([2, 4]), } # Create spoof online datset. 
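
The expected weight vectors in these tests follow one rule: `online_sampling_ratio` is spread uniformly over the eligible online frames and the remainder over the eligible offline frames, with dropped frames pinned to zero. A hedged sketch of that rule (assuming both datasets are non-empty; this is not the `compute_sampler_weights` implementation):

import numpy as np

def sketch_sampler_weights(
    n_offline: int,
    n_online: int,
    online_sampling_ratio: float,
    online_dropped: np.ndarray | None = None,
) -> np.ndarray:
    """Toy version of the weighting rule: `online_sampling_ratio` is spread
    uniformly over eligible online frames, the remainder over offline frames.
    `online_dropped` is a boolean mask of online frames excluded from sampling."""
    offline_w = np.full(n_offline, (1 - online_sampling_ratio) / n_offline)
    if online_dropped is None:
        online_dropped = np.zeros(n_online, dtype=bool)
    n_eligible = int((~online_dropped).sum())
    online_w = np.where(online_dropped, 0.0, online_sampling_ratio / n_eligible)
    return np.concatenate([offline_w, online_w])

# 4 offline frames, 8 online frames, ratio 0.8 -> [0.05 x 4, 0.1 x 8]
print(sketch_sampler_weights(4, 8, 0.8))
# Dropping the last frame of each 2-frame online episode -> [0.05 x 4, 0.2, 0, 0.2, 0, ...]
dropped = np.tile([False, True], 4)
print(sketch_sampler_weights(4, 8, 0.8, dropped))

A weight vector in this form is what `torch.utils.data.WeightedRandomSampler` expects, which is how such weights are typically consumed in a training loop.
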
- online_dataset, _ = make_new_buffer() + online_dataset = make_new_buffer(tmp_path / f"buffer_{uuid4().hex}") online_dataset.add_episodes(make_spoof_data_frames(n_episodes=4, n_frames_per_episode=2)) online_sampling_ratio = 0.8 weights = compute_sampler_weights( @@ -348,30 +356,8 @@ def test_compute_sampler_weights_nontrivial_ratio(): ) -def test_compute_sampler_weights_nontrivial_ratio_and_drop_last_n(): - # Arbitrarily set small dataset sizes, making sure to have uneven sizes. - # Create spoof offline dataset. - offline_dataset = LeRobotDataset.from_preloaded( - hf_dataset=datasets.Dataset.from_dict({"data": list(range(4))}) - ) - offline_dataset.hf_dataset.set_transform(hf_transform_to_torch) - offline_dataset.episode_data_index = { - "from": torch.tensor([0]), - "to": torch.tensor([4]), - } - # Create spoof online datset. - online_dataset, _ = make_new_buffer() - online_dataset.add_episodes(make_spoof_data_frames(n_episodes=4, n_frames_per_episode=2)) - weights = compute_sampler_weights( - offline_dataset, online_dataset=online_dataset, online_sampling_ratio=0.8, online_drop_n_last_frames=1 - ) - assert torch.allclose( - weights, torch.tensor([0.05, 0.05, 0.05, 0.05, 0.2, 0.0, 0.2, 0.0, 0.2, 0.0, 0.2, 0.0]) - ) - - -def test_compute_sampler_weights_drop_n_last_frames(): - """Note: test copied from test_sampler.""" +def test_compute_sampler_weights_drop_n_last_frames(tmp_path: Path): + """Check that the drop_n_last_frames feature works as intended.""" data_dict = { "timestamp": [0, 0.1], "index": [0, 1], @@ -382,7 +368,7 @@ def test_compute_sampler_weights_drop_n_last_frames(): offline_dataset.hf_dataset.set_transform(hf_transform_to_torch) offline_dataset.episode_data_index = {"from": torch.tensor([0]), "to": torch.tensor([2])} - online_dataset, _ = make_new_buffer() + online_dataset = make_new_buffer(tmp_path / f"buffer_{uuid4().hex}") online_dataset.add_episodes(make_spoof_data_frames(n_episodes=4, n_frames_per_episode=2)) weights = compute_sampler_weights( From 59b7e0e71485b386f68b455b93fdc5839bff9115 Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Wed, 18 Sep 2024 16:16:07 +0100 Subject: [PATCH 09/29] ALL tests passing --- lerobot/common/envs/utils.py | 38 ++++++++++++++-------- lerobot/scripts/eval.py | 62 ++++++++++++++++++++---------------- lerobot/scripts/train.py | 10 +----- 3 files changed, 59 insertions(+), 51 deletions(-) diff --git a/lerobot/common/envs/utils.py b/lerobot/common/envs/utils.py index 001973bc1..f8cbb5a8f 100644 --- a/lerobot/common/envs/utils.py +++ b/lerobot/common/envs/utils.py @@ -19,12 +19,16 @@ from torch import Tensor -def preprocess_observation(observations: dict[str, np.ndarray]) -> dict[str, Tensor]: +def preprocess_observation( + observations: dict[str, np.ndarray], to_pytorch_format: bool = True +) -> dict[str, Tensor | np.ndarray]: """Convert environment observation to LeRobot format observation. Args: - observation: Dictionary of observation batches from a Gym vector environment. + observations: Dictionary of observation batches from a Gym vector environment. + to_pytorch_format: Whether to return tensors instead of numpy arrays. For image observations, this + also implies switching to channel-first, float32 normalized to the range [0, 1]. Returns: - Dictionary of observation batches with keys renamed to LeRobot format and values as tensors. 
+        Dictionary of observation batches in LeRobot format.
     """
     # map to expected inputs for the policy
     return_observations = {}
@@ -35,28 +39,34 @@ def preprocess_observation(observations: dict[str, np.ndarray]) -> dict[str, Ten
         imgs = {"observation.image": observations["pixels"]}
 
         for imgkey, img in imgs.items():
-            img = torch.from_numpy(img)
-
             # sanity check that images are channel last
             _, h, w, c = img.shape
             assert c < h and c < w, f"expect channel last images, but instead got {img.shape=}"
 
             # sanity check that images are uint8
-            assert img.dtype == torch.uint8, f"expect torch.uint8, but instead {img.dtype=}"
+            assert img.dtype == np.uint8, f"expected np.uint8, but instead got {img.dtype=}"
 
-            # convert to channel first of type float32 in range [0,1]
-            img = einops.rearrange(img, "b h w c -> b c h w").contiguous()
-            img = img.type(torch.float32)
-            img /= 255
+            if to_pytorch_format:
+                img = torch.from_numpy(img)
+                # convert to channel first of type float32 in range [0,1]
+                img = einops.rearrange(img, "b h w c -> b c h w").contiguous()
+                img = img.type(torch.float32)
+                img /= 255
 
             return_observations[imgkey] = img
 
     if "environment_state" in observations:
-        return_observations["observation.environment_state"] = torch.from_numpy(
-            observations["environment_state"]
-        ).float()
+        return_observations["observation.environment_state"] = observations["environment_state"].astype(
+            np.float32
+        )
+        if to_pytorch_format:
+            return_observations["observation.environment_state"] = torch.from_numpy(
+                return_observations["observation.environment_state"]
+            )
 
     # TODO(rcadene): enable pixels only baseline with `obs_type="pixels"` in environment by removing
     # requirement for "agent_pos"
-    return_observations["observation.state"] = torch.from_numpy(observations["agent_pos"]).float()
+    return_observations["observation.state"] = observations["agent_pos"].astype(np.float32)
+    if to_pytorch_format:
+        return_observations["observation.state"] = torch.from_numpy(return_observations["observation.state"])
 
     return return_observations
 
diff --git a/lerobot/scripts/eval.py b/lerobot/scripts/eval.py
index cff4bc01b..2c763db34 100644
--- a/lerobot/scripts/eval.py
+++ b/lerobot/scripts/eval.py
@@ -47,7 +47,6 @@
 import threading
 import time
 from contextlib import nullcontext
-from copy import deepcopy
 from datetime import datetime as dt
 from pathlib import Path
 from typing import Callable
@@ -145,9 +144,9 @@ def rollout(
     )
     while not np.all(done):
         # Numpy array to tensor and changing dictionary keys to LeRobot policy format.
-        observation = preprocess_observation(observation)
         if return_observations:
-            all_observations.append(deepcopy(observation))
+            all_observations.append(preprocess_observation(observation, to_pytorch_format=False))
+        observation = preprocess_observation(observation)
 
         observation = {key: observation[key].to(device, non_blocking=True) for key in observation}
 
@@ -173,35 +172,32 @@ def rollout(
         # Keep track of which environments are done so far.
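
To make the two output modes of `preprocess_observation` concrete, here is a usage sketch with spoofed vector-env observations (the key names follow the function above; the shapes and values are arbitrary):

import numpy as np

from lerobot.common.envs.utils import preprocess_observation

# A spoofed batch of vector-env observations: 2 envs, 96x96 RGB frames.
observations = {
    "pixels": np.random.randint(0, 256, size=(2, 96, 96, 3), dtype=np.uint8),
    "agent_pos": np.random.randn(2, 2),
}

numpy_obs = preprocess_observation(observations, to_pytorch_format=False)
assert numpy_obs["observation.image"].dtype == np.uint8        # channel-last, in [0, 255]
assert numpy_obs["observation.state"].dtype == np.float32

torch_obs = preprocess_observation(observations)
assert torch_obs["observation.image"].shape == (2, 3, 96, 96)  # channel-first, float32 in [0, 1]
assert torch_obs["observation.image"].max() <= 1.0
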
done = terminated | truncated | done - all_actions.append(torch.from_numpy(action)) - all_rewards.append(torch.from_numpy(reward)) - all_dones.append(torch.from_numpy(done)) - all_successes.append(torch.tensor(successes)) + all_actions.append(action) + all_rewards.append(reward) + all_dones.append(done) + all_successes.append(np.array(successes)) step += 1 - running_success_rate = ( - einops.reduce(torch.stack(all_successes, dim=1), "b n -> b", "any").numpy().mean() - ) + running_success_rate = einops.reduce(np.stack(all_successes, axis=1), "b n -> b", "any").mean() progbar.set_postfix({"running_success_rate": f"{running_success_rate.item() * 100:.1f}%"}) progbar.update() # Track the final observation. if return_observations: - observation = preprocess_observation(observation) - # TODO(now): Go uint8 HWC instead - all_observations.append(deepcopy(observation)) + all_observations.append(preprocess_observation(observation, to_pytorch_format=False)) # Stack the sequence along the first dimension so that we have (batch, sequence, *) tensors. + # All arrays go to float32 for more efficient storage in a DataBuffer. ret = { - "action": torch.stack(all_actions, dim=1), - "reward": torch.stack(all_rewards, dim=1), - "success": torch.stack(all_successes, dim=1), - "done": torch.stack(all_dones, dim=1), + "action": np.stack(all_actions, axis=1).astype(np.float32), + "reward": np.stack(all_rewards, axis=1).astype(np.float32), + "success": np.stack(all_successes, axis=1).astype(bool), + "done": np.stack(all_dones, axis=1).astype(bool), } if return_observations: stacked_observations = {} for key in all_observations[0]: - stacked_observations[key] = torch.stack([obs[key] for obs in all_observations], dim=1) + stacked_observations[key] = np.stack([obs[key] for obs in all_observations], axis=1) ret["observation"] = stacked_observations return ret @@ -293,11 +289,11 @@ def render_frame(env: gym.vector.VectorEnv): # this won't be included). n_steps = rollout_data["done"].shape[1] # Note: this relies on a property of argmax: that it returns the first occurrence as a tiebreaker. - done_indices = torch.argmax(rollout_data["done"].to(int), dim=1) + done_indices = np.argmax(rollout_data["done"], axis=1) # Make a mask with shape (batch, n_steps) to mask out rollout data after the first done # (batch-element-wise). Note the `done_indices + 1` to make sure to keep the data from the done step. - mask = (torch.arange(n_steps) <= einops.repeat(done_indices + 1, "b -> b s", s=n_steps)).int() + mask = np.arange(n_steps) <= einops.repeat(done_indices + 1, "b -> b s", s=n_steps) # Extend metrics. batch_sum_rewards = einops.reduce((rollout_data["reward"] * mask), "b n -> b", "sum") sum_rewards.extend(batch_sum_rewards.tolist()) @@ -417,17 +413,27 @@ def _compile_episode_data( # Here we do `num_frames - 1` as we don't want to include the last observation frame just yet. 
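
The masking in the hunk above leans on a property worth spelling out: `np.argmax` over a boolean array returns the index of the first True, so it finds the first done step per batch element. A small sketch, assuming the (batch, n_steps) layout used in `rollout`:

import einops
import numpy as np

done = np.array([
    [False, False, True, False, True],   # first done at step 2
    [False, False, False, False, True],  # first done at step 4
])
n_steps = done.shape[1]

# np.argmax returns the first occurrence of the max, i.e. the first True.
done_indices = np.argmax(done, axis=1)  # -> [2, 4]

# Keep data up to one step past the first done (hence the +1), matching the rollout code.
mask = np.arange(n_steps) <= einops.repeat(done_indices + 1, "b -> b s", s=n_steps)
print(mask.astype(int))
# [[1 1 1 1 0]
#  [1 1 1 1 1]]
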
ep_dict = { "action": rollout_data["action"][ep_ix, : num_frames - 1], - "episode_index": torch.tensor([start_episode_index + ep_ix] * (num_frames - 1)), - "frame_index": torch.arange(0, num_frames - 1, 1), - "timestamp": torch.arange(0, num_frames - 1, 1) / fps, + "episode_index": np.array([start_episode_index + ep_ix] * (num_frames - 1)), + "frame_index": np.arange(0, num_frames - 1, 1), + "timestamp": np.arange(0, num_frames - 1, 1) / fps, "next.done": rollout_data["done"][ep_ix, : num_frames - 1], "next.success": rollout_data["success"][ep_ix, : num_frames - 1], - "next.reward": rollout_data["reward"][ep_ix, : num_frames - 1].type(torch.float32), + "next.reward": rollout_data["reward"][ep_ix, : num_frames - 1], } - # For the last observation frame, all other keys will just be copy padded. + # For the last observation frame, all other keys will be padded. for k in ep_dict: - ep_dict[k] = torch.cat([ep_dict[k], ep_dict[k][-1:]]) + if k not in ["timestamp", "frame_index"]: + # Copy-pad. + ep_dict[k] = np.concatenate([ep_dict[k], ep_dict[k][-1:]]) + elif k == "timestamp": + # Pad with + 1 / fps + ep_dict[k] = np.append(ep_dict[k], ep_dict[k][-1] + 1 / fps) + elif k == "frame_index": + # Pad with the next index. + ep_dict[k] = np.append(ep_dict[k], ep_dict[k][-1] + 1) + else: + raise AssertionError for key in rollout_data["observation"]: ep_dict[key] = rollout_data["observation"][key][ep_ix, :num_frames] @@ -436,9 +442,9 @@ def _compile_episode_data( data_dict = {} for key in ep_dicts[0]: - data_dict[key] = torch.cat([x[key] for x in ep_dicts]) + data_dict[key] = np.concatenate([x[key] for x in ep_dicts]) - data_dict["index"] = torch.arange(start_data_index, start_data_index + total_frames, 1) + data_dict["index"] = np.arange(start_data_index, start_data_index + total_frames, 1) return data_dict diff --git a/lerobot/scripts/train.py b/lerobot/scripts/train.py index 288ef83f5..0876e0012 100644 --- a/lerobot/scripts/train.py +++ b/lerobot/scripts/train.py @@ -23,7 +23,6 @@ from threading import Lock import hydra -import numpy as np import torch from deepdiff import DeepDiff from omegaconf import DictConfig, ListConfig, OmegaConf @@ -484,13 +483,6 @@ def evaluate_and_checkpoint_if_needed(step, is_online): ) online_dataset = DataBuffer( online_buffer_path, - data_spec={ - **{k: {"shape": v, "dtype": np.dtype("float32")} for k, v in policy.config.input_shapes.items()}, - **{k: {"shape": v, "dtype": np.dtype("float32")} for k, v in policy.config.output_shapes.items()}, - "next.reward": {"shape": (), "dtype": np.dtype("float32")}, - "next.done": {"shape": (), "dtype": np.dtype("?")}, - "next.success": {"shape": (), "dtype": np.dtype("?")}, - }, buffer_capacity=cfg.training.online_buffer_capacity, fps=online_env.unwrapped.metadata["render_fps"], delta_timestamps=cfg.training.delta_timestamps, @@ -570,7 +562,7 @@ def sample_trajectory_and_update_buffer(): with lock: start_update_buffer_time = time.perf_counter() - online_dataset.add_data(eval_info["episodes"]) + online_dataset.add_episodes(eval_info["episodes"]) # Update the concatenated dataset length used during sampling. 
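
The padding scheme in `_compile_episode_data` treats the time-like keys specially: data keys are copy-padded with their last frame, while `timestamp` and `frame_index` keep counting. A toy check of that behaviour (spoofed episode, fps assumed to be 10):

import numpy as np

fps = 10
ep_dict = {
    "action": np.array([[0.1], [0.2], [0.3]]),
    "frame_index": np.arange(3),
    "timestamp": np.arange(3) / fps,
}

for k in ep_dict:
    if k == "timestamp":
        ep_dict[k] = np.append(ep_dict[k], ep_dict[k][-1] + 1 / fps)  # continue the clock
    elif k == "frame_index":
        ep_dict[k] = np.append(ep_dict[k], ep_dict[k][-1] + 1)  # continue the count
    else:
        ep_dict[k] = np.concatenate([ep_dict[k], ep_dict[k][-1:]])  # copy-pad

assert np.allclose(ep_dict["timestamp"], [0.0, 0.1, 0.2, 0.3])
assert np.array_equal(ep_dict["frame_index"], [0, 1, 2, 3])
assert np.array_equal(ep_dict["action"][-1], ep_dict["action"][-2])  # last action is a copy
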
concat_dataset.cumulative_sizes = concat_dataset.cumsum(concat_dataset.datasets) From d4b40f8212dc3532c5f28a2216d53c9777cad937 Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Wed, 18 Sep 2024 18:37:32 +0100 Subject: [PATCH 10/29] backup wip --- lerobot/common/datasets/data_buffer.py | 113 +++++++++++++++++-------- lerobot/common/datasets/video_utils.py | 9 +- tests/test_data_buffer.py | 74 ++++++++++++++-- 3 files changed, 152 insertions(+), 44 deletions(-) diff --git a/lerobot/common/datasets/data_buffer.py b/lerobot/common/datasets/data_buffer.py index 9617a4884..f99a0fe44 100644 --- a/lerobot/common/datasets/data_buffer.py +++ b/lerobot/common/datasets/data_buffer.py @@ -91,11 +91,11 @@ class DataBuffer(torch.utils.data.Dataset): and wrapping around to the start when necessary (in which case older episodes are overwritten). This class is also a PyTorch Dataset and can be used as such in a dataloader for a training loop. The item - getter returns either a single frame, or a slice of a single episode if `delta_timestamps` is set. + getter returns either a single frame, or a slice of a single episode if `delta_timestamps` is set. It also + converts the numpy data to torch tensors, and handles converting images to channel-first, float32 + normalized to the range [0, 1]. """ - # TODO(now): What should we do about implicit data key specs, like observations.image? - # Special key for a (1,) array storing a pointer to the next index to fill from when adding data. NEXT_INDEX_KEY = "_next_index" # Since the data buffer is pre-allocated, this boolean mask is used to indicate which frames have "real" @@ -109,6 +109,8 @@ class DataBuffer(torch.utils.data.Dataset): EPISODE_INDEX_KEY = "episode_index" TIMESTAMP_KEY = "timestamp" PRESET_KEYS = {INDEX_KEY, FRAME_INDEX_KEY, EPISODE_INDEX_KEY, TIMESTAMP_KEY} + # By convention, all images should be stored under a key with this prefix. + IMAGE_KEY_PREFIX = "observation.image" METADATA_FILE_NAME = "meta.json" @@ -138,10 +140,6 @@ def __init__( refactored. """ - # Argument validation. - if (delta_timestamps is None) ^ (fps is None): - raise ValueError("`delta_timestamps` and `fps` should be provided together, or not at all.") - # Parameters for the data structure. self._storage_dir = Path(storage_dir) self._data: dict[str, np.memmap] = {} @@ -165,12 +163,7 @@ def __init__( self._buffer_capacity = buffer_capacity # Parameters for the item getter. - self.set_delta_timestamps(delta_timestamps) - self._fps = fps - # Tolerance (in seconds) used to discard loaded frames when their timestamps are not close enough to - # the requested frames. It is only used when `delta_timestamps` is provided. The -1e-4 accounts for - # possible numerical error. - self.tolerance_s = 1 / self.fps - 1e-4 if fps is not None else None + self.set_delta_timestamps_and_fps(delta_timestamps, fps) self.image_transform = image_transform @property @@ -217,7 +210,7 @@ def camera_keys(self) -> list[str]: By convention, this is all the keys starting with "observation.image". 
""" - return [k for k in self._data if k.startswith("observation.image")] + return [k for k in self._data if k.startswith(self.IMAGE_KEY_PREFIX)] def _save_data_spec(self, data_spec: dict[str, dict]): """Save the data type and shape specifications to the storage directory.""" @@ -285,17 +278,34 @@ def _make_memmaps(self, data_spec: dict[str, dict], mode: str): def delta_timestamps(self) -> dict[str, np.ndarray] | None: return self._delta_timestamps - def set_delta_timestamps(self, value: dict[str, list[float]] | None): - """Set delta_timestamps converting the values to numpy arrays. + def set_delta_timestamps_and_fps( + self, delta_timestamps: dict[str, list[float]] | None, fps: float | None + ): + """Set delta_timestamps converting the values to numpy arrays, fps, and timestamp tolerance. Note: The conversion is for an optimization in the __getitem__. The loop is much slower if lists need to be converted into numpy arrays. + + Note: fps needs to be included as they should be provided together or not at all. + # TODO(now): ugly... + + Tolerance is set according to the fps. """ - if value is not None: - self._delta_timestamps = {k: np.array(v) for k, v in value.items()} + if (delta_timestamps is None) ^ (fps is None): + raise ValueError("`delta_timestamps` and `fps` should be provided together, or not at all.") + + if delta_timestamps is not None: + self._delta_timestamps = {k: np.array(v) for k, v in delta_timestamps.items()} else: self._delta_timestamps = None + self._fps = fps + + # Tolerance (in seconds) used to discard loaded frames when their timestamps are not close enough to + # the requested frames. It is only used when `delta_timestamps` is provided. The -1e-4 accounts for + # possible numerical error. + self.tolerance_s = 1 / self.fps - 1e-4 if fps is not None else None + def add_episodes(self, data: dict[str, np.ndarray]): """Add data to the buffer. @@ -341,6 +351,24 @@ def _add_episode(self, data: dict[str, np.ndarray]): raise ValueError( "Expected frame indices to start from 0 and step up in increments of 1 per frame." ) + # Special checks on image keys. + for k in data: + if not k.startswith(self.IMAGE_KEY_PREFIX): + continue + if self._is_video_dataset: + if data[k].dtype != np.dtype(f"S{MAX_VIDEO_PATH_LENGTH}"): + raise ValueError( + f"Any data key starting with '{self.IMAGE_KEY_PREFIX}' is assumed to be an image, " + "and in a video dataset it should be string data (with a relative path to the video " + "to be loaded)." + ) + else: + _, h, w, c = data[k].shape + if data[k].dtype is not np.dtype("uint8") or c >= min(h, w): + raise ValueError( + f"Any data key starting with '{self.IMAGE_KEY_PREFIX}' is assumed to be an image, " + "and should be of type np.uint8, with channel-last format." + ) # Figure out where we need to start filling data next, and make sure we continue data and episode # indices. @@ -384,8 +412,14 @@ def _item_to_tensors(item: dict) -> dict: for k, v in item.items(): if isinstance(v, np.ndarray): item_[k] = torch.from_numpy(v) + elif isinstance(v, torch.Tensor): + item_[k] = v + elif isinstance(v, np.bool_): + # Note: This is not necessary vs just doing torch.tensor(v), but it dodges a + # DeprecationWarning from torch. 
+ item_[k] = torch.tensor(bool(v)) else: - item_[k] = torch.tensor(item[k]) + item_[k] = torch.tensor(v) return item_ def _optimized_advanced_slice(self, data_key: str, indices: np.ndarray) -> np.ndarray: @@ -412,6 +446,14 @@ def get_data_by_key(self, key: str) -> np.ndarray: return self._data[key][self._data[self.OCCUPANCY_MASK_KEY]] def __getitem__(self, idx: int) -> dict[str, torch.Tensor]: + """Gets an item or slice from the buffer and returns it in PyTorch format. + + Images (any data key starting with "observation.image") get converted from numpy uint8, in range + [0, 255], channel-first to torch float32, in range [0, 1], channel-last. + + If `delta_timestamps` is set... TODO(alexander-soare): Document this somewhere when + `load_previous_and_future_frames` is refactored. + """ if idx >= len(self) or idx < -len(self): raise IndexError @@ -452,7 +494,7 @@ def __getitem__(self, idx: int) -> dict[str, torch.Tensor]: f"{self.tolerance_s=}) inside the episode range." ) - if self.is_video_dataset and data_key.startswith("observation.image"): + if self.is_video_dataset and data_key.startswith(self.IMAGE_KEY_PREFIX): video_timestamps[data_key] = self._data[self.TIMESTAMP_KEY][episode_data_indices[argmin_]] # Load frames for this data key. @@ -469,24 +511,27 @@ def __getitem__(self, idx: int) -> dict[str, torch.Tensor]: if self.is_video_dataset: item_ = dict(item) - for k in self.video_frame_keys: # TODO(now): HACK + for k in self.camera_keys: if self.delta_timestamps is None: - item_[k] = {"path": item[k].decode(), "timestamp": float(item[self.TIMESTAMP_KEY])} + item_[k] = {"path": item[k].decode(), "timestamp": item[self.TIMESTAMP_KEY]} else: - query_ts = current_ts + self.delta_timestamps[k] item_[k] = [ - {"path": item[k][i].decode(), "timestamp": float(video_timestamps[k][i])} + {"path": item[k][i].decode(), "timestamp": video_timestamps[k][i]} for i in range(len(item[k])) ] item = load_from_videos( item_, - self.video_frame_keys, + self.camera_keys, self.videos_dir, - self.tolerance_s, - self.video_backend, + self.tolerance_s or 1e-8, # 1e-8 to account for no delta_timestamps + "pyav", + to_pytorch_format=True, ) + else: + # Convert to PyTorch format: channel-last, float32, normalize to range [0, 1]. + for cam in self.camera_keys: + item[cam] = item[cam].astype(np.float32) / 255.0 - # TODO(now): Where to put transform that goes to torch format for images. if self.image_transform is not None: for cam in self.camera_keys: item[cam] = self.image_transform(item[cam]) @@ -518,7 +563,6 @@ def from_huggingface_hub( The resulting DataBuffer object. TODO(now): Consider populating the buffer one episode at a time in order not to kill RAM. - TODO(now): Reuse previously saved? Maybe I need a hash. """ for k in ["data_spec", "buffer_capacity"]: if k in kwargs: @@ -576,21 +620,20 @@ def from_huggingface_hub( all_imgs.extend(episode_imgs) data_dict[k] = np.stack(all_imgs) else: + # TODO(now): Dynamically create the path instead of storing? data_dict[k] = np.stack( [np.array(dct["path"], dtype=f"S{MAX_VIDEO_PATH_LENGTH}") for dct in hf_dataset[k]] ) else: data_dict[k] = np.array(hf_dataset[k]) - obj.add_episodes(data_dict) - if is_video_dataset: - obj._is_video_dataset = True # TODO(now): HACK + if is_video_dataset and not decode_video: + obj._is_video_dataset = True + obj.add_episodes(data_dict) # note this must happen after setting _is_video_dataset. + if is_video_dataset and not decode_video: # Symlink videos if needed. 
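
One detail of the video-path storage in this patch: because every array in the buffer must have a fixed-size dtype to live in a memmap, relative video paths are stored as fixed-length byte strings (the `S{MAX_VIDEO_PATH_LENGTH}` dtype) and decoded back on read. A small round-trip sketch with a made-up path:

import numpy as np

MAX_VIDEO_PATH_LENGTH = 100  # the module-level constant used by the buffer

# Encode: numpy right-pads the bytes to the fixed width for us.
paths = np.array(
    ["videos/observation.image_episode_000000.mp4"],  # hypothetical relative path
    dtype=f"S{MAX_VIDEO_PATH_LENGTH}",
)
assert paths.itemsize == MAX_VIDEO_PATH_LENGTH  # fixed-size, so memmap-friendly

# Decode: item access strips the trailing padding, and .decode() recovers the string.
assert paths[0].decode() == "videos/observation.image_episode_000000.mp4"
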
obj.videos_dir = kwargs["storage_dir"] / "videos" if not obj.videos_dir.exists(): os.symlink(videos_path.absolute(), obj.videos_dir) - obj.videos_dir = kwargs["storage_dir"] / "videos" # TODO(now): HACK - obj.video_backend = "pyav" # TODO(now): HACK - return obj @staticmethod diff --git a/lerobot/common/datasets/video_utils.py b/lerobot/common/datasets/video_utils.py index c08295013..46997ccc2 100644 --- a/lerobot/common/datasets/video_utils.py +++ b/lerobot/common/datasets/video_utils.py @@ -33,6 +33,7 @@ def load_from_videos( videos_dir: Path, tolerance_s: float, backend: str = "pyav", + to_pytorch_format: bool = True, ): """Note: When using data workers (e.g. DataLoader with num_workers>0), do not call this function in the main process (e.g. by using a second Dataloader with num_workers=0). It will result in a Segmentation Fault. @@ -51,14 +52,18 @@ def load_from_videos( raise NotImplementedError("All video paths are expected to be the same for now.") video_path = data_dir / paths[0] - frames = decode_video_frames_torchvision(video_path, timestamps, tolerance_s, backend) + frames = decode_video_frames_torchvision( + video_path, timestamps, tolerance_s, backend, to_pytorch_format=to_pytorch_format + ) item[key] = frames else: # load one frame timestamps = [item[key]["timestamp"]] video_path = data_dir / item[key]["path"] - frames = decode_video_frames_torchvision(video_path, timestamps, tolerance_s, backend) + frames = decode_video_frames_torchvision( + video_path, timestamps, tolerance_s, backend, to_pytorch_format=to_pytorch_format + ) item[key] = frames[0] return item diff --git a/tests/test_data_buffer.py b/tests/test_data_buffer.py index 99f8030bd..7f8b843a8 100644 --- a/tests/test_data_buffer.py +++ b/tests/test_data_buffer.py @@ -170,13 +170,6 @@ def test_get_data_by_key(tmp_path: Path): assert np.array_equal(data_from_buffer, new_episodes[data_key]) -def test_getter_return_tensors(tmp_path: Path): - buffer = make_new_buffer(tmp_path / f"buffer_{uuid4().hex}") - for item in buffer: - for k in item: - assert isinstance(item[k], torch.tensor) - - def test_delta_timestamps_within_tolerance(tmp_path: Path): """Check that getting an item with delta_timestamps within tolerance succeeds.""" if fps != 10: @@ -224,6 +217,37 @@ def test_delta_timestamps_outside_tolerance_outside_episode_range(tmp_path): ), "Padding does not match expected values" +@pytest.mark.parametrize( + ("dataset_repo_id", "decode_video"), + ( + # ("lerobot/aloha_mobile_cabinet", True), # choose aloha_mobile_cabinet to have multiple image keys + # ("lerobot/aloha_mobile_cabinet", False), + ("lerobot/pusht_image", False), + ), +) +def test_getter_images_with_delta_timestamps(tmp_path: Path, dataset_repo_id: str, decode_video: bool): + """Checks a basic delta_timestamps use case with images. + + Specifically, makes sure that the items returned by the getter have the correct number of frames. + + Note: images deserve particular attention because video decoding is involved, and because they are not + covered by the basic tests above that use simple spoof data. + """ + lerobot_dataset_info = load_info(dataset_repo_id, version=CODEBASE_VERSION, root=DATA_DIR) + fps = lerobot_dataset_info["fps"] + # Note: storage_dir specified explicitly in order to make use of pytest's temporary file fixture. 
+ storage_dir = tmp_path / f"{dataset_repo_id}_{uuid4().hex}_{decode_video}" + buffer = DataBuffer.from_huggingface_hub(dataset_repo_id, decode_video, storage_dir=storage_dir) + + delta_timestamps = [-1 / fps, 0.0, 1 / fps] + buffer.set_delta_timestamps_and_fps({k: [-1 / fps, 0.0, 1 / fps] for k in buffer.camera_keys}, fps) + for item in buffer: + for k in buffer.camera_keys: + assert item[k].shape[0] == len(delta_timestamps) + + # TODO(now) check that the frames come out the same with / without decode video. + + @pytest.mark.parametrize( ("dataset_repo_id", "decode_video"), ( @@ -290,6 +314,42 @@ def test_from_huggingface_hub(tmp_path: Path, dataset_repo_id: str, decode_video raise DevTestingError(f"Tests not implemented for this feature type: {type(feature)=}.") +@pytest.mark.parametrize( + ("dataset_repo_id", "decode_video"), + ( + ("lerobot/aloha_mobile_cabinet", True), # choose aloha_mobile_cabinet to have multiple image keys + ("lerobot/aloha_mobile_cabinet", False), + ("lerobot/pusht", False), + ), +) +def test_camera_keys(tmp_path: Path, dataset_repo_id: str, decode_video: bool): + """Check that the camera_keys property returns all relevant keys.""" + storage_dir = tmp_path / f"{dataset_repo_id}_{uuid4().hex}_{decode_video}" + buffer = DataBuffer.from_huggingface_hub(dataset_repo_id, decode_video, storage_dir=storage_dir) + assert set(buffer.camera_keys) == {k for k in buffer._data if k.startswith("observation.image")} + + +@pytest.mark.parametrize( + ("dataset_repo_id", "decode_video"), + ( + ("lerobot/aloha_mobile_cabinet", True), # choose aloha_mobile_cabinet to have multiple image keys + ("lerobot/aloha_mobile_cabinet", False), + ("lerobot/pusht", False), + ), +) +def test_getter_returns_pytorch_format(tmp_path: Path, dataset_repo_id: str, decode_video: bool): + # Note: storage_dir specified explicitly in order to make use of pytest's temporary file fixture. + storage_dir = tmp_path / f"{dataset_repo_id}_{uuid4().hex}_{decode_video}" + buffer = DataBuffer.from_huggingface_hub(dataset_repo_id, decode_video, storage_dir=storage_dir) + for item in buffer: + for k in item: + assert isinstance(item[k], torch.Tensor) + if k in buffer.camera_keys: + assert item[k].dtype == torch.float32, "images aren't float32" + h, w, c = item.shape + assert c < min(h, w), "images are not channel-first" + + # Arbitrarily set small dataset sizes, making sure to have uneven sizes. 
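
On the `[-1 / fps, 0.0, 1 / fps]` pattern used in the tests above: each delta is an offset in seconds from the current frame, so this window requests the previous, current, and next frames. A sketch of how the offsets resolve to frame indices via nearest-neighbour lookup (fps assumed to be 10, mirroring the argmin-based lookup in the item getter):

import numpy as np

fps = 10
episode_timestamps = np.arange(5) / fps    # [0.0, 0.1, 0.2, 0.3, 0.4]
current_ts = episode_timestamps[2]         # 0.2
delta_timestamps = np.array([-1 / fps, 0.0, 1 / fps])

query_ts = current_ts + delta_timestamps   # [0.1, 0.2, 0.3]
dist = np.abs(query_ts[:, None] - episode_timestamps[None, :])
argmin = np.argmin(dist, axis=1)
assert np.array_equal(argmin, [1, 2, 3])   # previous, current, next frame
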
@pytest.mark.parametrize("offline_dataset_size", [0, 6]) @pytest.mark.parametrize("online_dataset_size", [0, 4]) From 731634d698bd8623d9a3ddffe7db1d603d71efbe Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Thu, 19 Sep 2024 08:07:30 +0100 Subject: [PATCH 11/29] ALL tests passing --- lerobot/common/datasets/data_buffer.py | 6 +++++- tests/test_data_buffer.py | 13 +++++++++---- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/lerobot/common/datasets/data_buffer.py b/lerobot/common/datasets/data_buffer.py index f99a0fe44..02a11149a 100644 --- a/lerobot/common/datasets/data_buffer.py +++ b/lerobot/common/datasets/data_buffer.py @@ -25,10 +25,12 @@ import datasets import numpy as np import torch +from tqdm import tqdm from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION, DATA_DIR, LeRobotDataset from lerobot.common.datasets.utils import load_hf_dataset, load_info, load_videos from lerobot.common.datasets.video_utils import VideoFrame, decode_video_frames_torchvision, load_from_videos +from lerobot.common.utils.utils import inside_slurm # TODO(alexander-soare): Move somewhere more appropriate once the DataBuffer class permeates more of the coe # base. @@ -608,7 +610,9 @@ def from_huggingface_hub( episode_indices = np.array(hf_dataset["episode_index"]) timestamps = np.array(hf_dataset["timestamp"]) all_imgs = [] - for episode_index in np.unique(episode_indices): + for episode_index in tqdm( + np.unique(episode_indices), desc="Decoding videos", disable=inside_slurm() + ): episode_data_indices = np.where(episode_indices == episode_index)[0] episode_timestamps = timestamps[episode_indices == episode_index] episode_imgs = decode_video_frames_torchvision( diff --git a/tests/test_data_buffer.py b/tests/test_data_buffer.py index 7f8b843a8..fd890b48f 100644 --- a/tests/test_data_buffer.py +++ b/tests/test_data_buffer.py @@ -220,8 +220,8 @@ def test_delta_timestamps_outside_tolerance_outside_episode_range(tmp_path): @pytest.mark.parametrize( ("dataset_repo_id", "decode_video"), ( - # ("lerobot/aloha_mobile_cabinet", True), # choose aloha_mobile_cabinet to have multiple image keys - # ("lerobot/aloha_mobile_cabinet", False), + ("lerobot/aloha_mobile_cabinet", True), # choose aloha_mobile_cabinet to have multiple image keys + ("lerobot/aloha_mobile_cabinet", False), ("lerobot/pusht_image", False), ), ) @@ -241,7 +241,10 @@ def test_getter_images_with_delta_timestamps(tmp_path: Path, dataset_repo_id: st delta_timestamps = [-1 / fps, 0.0, 1 / fps] buffer.set_delta_timestamps_and_fps({k: [-1 / fps, 0.0, 1 / fps] for k in buffer.camera_keys}, fps) - for item in buffer: + + # Just iterate through the start and end of the dataset to make the test faster. + for i in np.concatenate([np.arange(0, 10), np.arange(-10, 0)]): + item = buffer[i] for k in buffer.camera_keys: assert item[k].shape[0] == len(delta_timestamps) @@ -341,7 +344,9 @@ def test_getter_returns_pytorch_format(tmp_path: Path, dataset_repo_id: str, dec # Note: storage_dir specified explicitly in order to make use of pytest's temporary file fixture. storage_dir = tmp_path / f"{dataset_repo_id}_{uuid4().hex}_{decode_video}" buffer = DataBuffer.from_huggingface_hub(dataset_repo_id, decode_video, storage_dir=storage_dir) - for item in buffer: + # Just iterate through the start and end of the dataset to make the test faster. 
+ for i in np.concatenate([np.arange(0, 10), np.arange(-10, 0)]): + item = buffer[i] for k in item: assert isinstance(item[k], torch.Tensor) if k in buffer.camera_keys: From 6944a2715480778fe812d732736162bd2472d271 Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Thu, 19 Sep 2024 09:41:49 +0100 Subject: [PATCH 12/29] backup wip --- Makefile | 2 + lerobot/common/datasets/data_buffer.py | 61 +++++------ lerobot/common/datasets/video_utils.py | 4 +- tests/test_data_buffer.py | 136 +++++++++++++++---------- 4 files changed, 119 insertions(+), 84 deletions(-) diff --git a/Makefile b/Makefile index f6517497e..409178f63 100644 --- a/Makefile +++ b/Makefile @@ -31,6 +31,8 @@ test-end-to-end: ${MAKE} DEVICE=$(DEVICE) test-default-ete-eval ${MAKE} DEVICE=$(DEVICE) test-act-pusht-tutorial +# TODO(now): Add tests with siphon + test-act-ete-train: python lerobot/scripts/train.py \ policy=act \ diff --git a/lerobot/common/datasets/data_buffer.py b/lerobot/common/datasets/data_buffer.py index 02a11149a..b1ac908b2 100644 --- a/lerobot/common/datasets/data_buffer.py +++ b/lerobot/common/datasets/data_buffer.py @@ -23,6 +23,7 @@ from typing import Callable import datasets +import einops import numpy as np import torch from tqdm import tqdm @@ -145,7 +146,9 @@ def __init__( # Parameters for the data structure. self._storage_dir = Path(storage_dir) self._data: dict[str, np.memmap] = {} - self._is_video_dataset = False + self._is_video_dataset = False # may be switched to True by `from_huggingface_hub` + self._videos_dir: str | None = None # may be set by `from_huggingface_hub` + # If the storage directory already exists, load the memmaps. if self._storage_dir.exists(): if buffer_capacity is not None: @@ -165,7 +168,8 @@ def __init__( self._buffer_capacity = buffer_capacity # Parameters for the item getter. - self.set_delta_timestamps_and_fps(delta_timestamps, fps) + self._fps = fps + self.set_delta_timestamps(delta_timestamps) self.image_transform = image_transform @property @@ -247,8 +251,7 @@ def _make_storage_dir(self, episode_data: dict[str, np.ndarray]): self.INDEX_KEY: {"dtype": np.dtype("int64"), "shape": (self._buffer_capacity,)}, self.FRAME_INDEX_KEY: {"dtype": np.dtype("int64"), "shape": (self._buffer_capacity,)}, self.EPISODE_INDEX_KEY: {"dtype": np.dtype("int64"), "shape": (self._buffer_capacity,)}, - # TODO(now): Should this just be float32? - self.TIMESTAMP_KEY: {"dtype": np.dtype("float64"), "shape": (self._buffer_capacity,)}, + self.TIMESTAMP_KEY: {"dtype": np.dtype("float32"), "shape": (self._buffer_capacity,)}, } for k, v in episode_data.items(): if k in data_spec: @@ -280,34 +283,33 @@ def _make_memmaps(self, data_spec: dict[str, dict], mode: str): def delta_timestamps(self) -> dict[str, np.ndarray] | None: return self._delta_timestamps - def set_delta_timestamps_and_fps( - self, delta_timestamps: dict[str, list[float]] | None, fps: float | None - ): - """Set delta_timestamps converting the values to numpy arrays, fps, and timestamp tolerance. + @property + def tolerance_s(self) -> float | None: + """ + Tolerance (in seconds) used to discard loaded frames when their timestamps are not close enough to + the requested frames. It is only used when `delta_timestamps` is provided. The -1e-4 accounts for + possible numerical error. + """ + if self._fps is None: + return None + return 1 / self._fps - 1e-4 + + def set_delta_timestamps(self, delta_timestamps: dict[str, list[float]] | None): + """Set delta_timestamps converting the values to numpy arrays. 
Note: The conversion is for an optimization in the __getitem__. The loop is much slower if lists need to be converted into numpy arrays. - - Note: fps needs to be included as they should be provided together or not at all. - # TODO(now): ugly... - - Tolerance is set according to the fps. """ - if (delta_timestamps is None) ^ (fps is None): - raise ValueError("`delta_timestamps` and `fps` should be provided together, or not at all.") + if delta_timestamps is not None and self._fps is None: + raise ValueError( + "`fps` must be provided to `__init__` if you want to provide `delta_timestamps`." + ) if delta_timestamps is not None: self._delta_timestamps = {k: np.array(v) for k, v in delta_timestamps.items()} else: self._delta_timestamps = None - self._fps = fps - - # Tolerance (in seconds) used to discard loaded frames when their timestamps are not close enough to - # the requested frames. It is only used when `delta_timestamps` is provided. The -1e-4 accounts for - # possible numerical error. - self.tolerance_s = 1 / self.fps - 1e-4 if fps is not None else None - def add_episodes(self, data: dict[str, np.ndarray]): """Add data to the buffer. @@ -524,7 +526,7 @@ def __getitem__(self, idx: int) -> dict[str, torch.Tensor]: item = load_from_videos( item_, self.camera_keys, - self.videos_dir, + self._videos_dir, self.tolerance_s or 1e-8, # 1e-8 to account for no delta_timestamps "pyav", to_pytorch_format=True, @@ -532,7 +534,9 @@ def __getitem__(self, idx: int) -> dict[str, torch.Tensor]: else: # Convert to PyTorch format: channel-last, float32, normalize to range [0, 1]. for cam in self.camera_keys: - item[cam] = item[cam].astype(np.float32) / 255.0 + item[cam] = einops.rearrange( + torch.from_numpy(item[cam].astype(np.float32) / 255.0), "... h w c -> ... c h w" + ) if self.image_transform is not None: for cam in self.camera_keys: @@ -611,7 +615,7 @@ def from_huggingface_hub( timestamps = np.array(hf_dataset["timestamp"]) all_imgs = [] for episode_index in tqdm( - np.unique(episode_indices), desc="Decoding videos", disable=inside_slurm() + np.unique(episode_indices), desc=f"Decoding videos for {k}", disable=inside_slurm() ): episode_data_indices = np.where(episode_indices == episode_index)[0] episode_timestamps = timestamps[episode_indices == episode_index] @@ -624,7 +628,6 @@ def from_huggingface_hub( all_imgs.extend(episode_imgs) data_dict[k] = np.stack(all_imgs) else: - # TODO(now): Dynamically create the path instead of storing? data_dict[k] = np.stack( [np.array(dct["path"], dtype=f"S{MAX_VIDEO_PATH_LENGTH}") for dct in hf_dataset[k]] ) @@ -635,9 +638,9 @@ def from_huggingface_hub( obj.add_episodes(data_dict) # note this must happen after setting _is_video_dataset. if is_video_dataset and not decode_video: # Symlink videos if needed. 
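
A numeric sanity check on the `tolerance_s` property introduced in this patch: `1 / fps - 1e-4` sits just under one frame period, so a query that lands on (or numerically near) an existing frame is accepted, while one whose nearest frame is a full period away is rejected:

fps = 10
tolerance_s = 1 / fps - 1e-4  # 0.0999, just under the 0.1 s frame period
frame_period = 1 / fps

# A query that hits a frame exactly (distance ~0) is comfortably within tolerance.
assert 1e-6 < tolerance_s

# A query whose nearest frame is a full period away (i.e. the requested frame is
# missing) is rejected, since 0.1 >= 0.0999.
assert frame_period > tolerance_s
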
- obj.videos_dir = kwargs["storage_dir"] / "videos" - if not obj.videos_dir.exists(): - os.symlink(videos_path.absolute(), obj.videos_dir) + obj._videos_dir = kwargs["storage_dir"] / "videos" + if not obj._videos_dir.exists(): + os.symlink(videos_path.absolute(), obj._videos_dir) return obj @staticmethod diff --git a/lerobot/common/datasets/video_utils.py b/lerobot/common/datasets/video_utils.py index 46997ccc2..af60222e8 100644 --- a/lerobot/common/datasets/video_utils.py +++ b/lerobot/common/datasets/video_utils.py @@ -21,6 +21,7 @@ from pathlib import Path from typing import Any, ClassVar +import einops import pyarrow as pa import torch import torchvision @@ -161,12 +162,13 @@ def decode_video_frames_torchvision( if log_loaded_timestamps: logging.info(f"{closest_ts=}") + # Note that at this point the images are in torch.uint8, in [0, 255], channel-first. if to_pytorch_format: # Return as pytorch format: float32, normalized to [0,1], channel-first. closest_frames = closest_frames.type(torch.float32) / 255 else: # Return in numpy format: np.uint8, in [0, 255], channel-last. - closest_frames = closest_frames.permute(0, 2, 3, 1).numpy() + closest_frames = einops.rearrange(closest_frames.numpy(), "... c h w -> ... h w c") assert len(timestamps) == len(closest_frames) return closest_frames diff --git a/tests/test_data_buffer.py b/tests/test_data_buffer.py index fd890b48f..ef01cc9a7 100644 --- a/tests/test_data_buffer.py +++ b/tests/test_data_buffer.py @@ -174,10 +174,9 @@ def test_delta_timestamps_within_tolerance(tmp_path: Path): """Check that getting an item with delta_timestamps within tolerance succeeds.""" if fps != 10: raise DevTestingError("This test is designed to use fps == 10.") - buffer = make_new_buffer(tmp_path / f"buffer_{uuid4().hex}", delta_timestamps={"index": [-0.2, 0, 0.139]}) + buffer = make_new_buffer(tmp_path / f"buffer_{uuid4().hex}", delta_timestamps={"index": [-0.2, 0, 0.1]}) new_data = make_spoof_data_frames(n_episodes=1, n_frames_per_episode=5) buffer.add_episodes(new_data) - buffer.tolerance_s = 0.04 item = buffer[2] data, is_pad = item["index"], item[f"index{DataBuffer.IS_PAD_POSTFIX}"] assert torch.allclose(data, torch.tensor([0, 2, 3])), "Data does not match expected values" @@ -191,10 +190,11 @@ def test_delta_timestamps_outside_tolerance_inside_episode_range(tmp_path: Path) """ if fps != 10: raise DevTestingError("This test is designed to use fps == 10.") - buffer = make_new_buffer(tmp_path / f"buffer_{uuid4().hex}", delta_timestamps={"index": [-0.2, 0, 0.141]}) + buffer = make_new_buffer(tmp_path / f"buffer_{uuid4().hex}", delta_timestamps={"index": [-0.2, 0, 0.1]}) new_data = make_spoof_data_frames(n_episodes=1, n_frames_per_episode=5) + # Hack the timestamps to invoke a tolerance error. 
+ new_data["timestamp"][3] += 0.1 buffer.add_episodes(new_data) - buffer.tolerance_s = 0.04 with pytest.raises(TimestampOutsideToleranceError): buffer[2] @@ -204,16 +204,15 @@ def test_delta_timestamps_outside_tolerance_outside_episode_range(tmp_path): if fps != 10: raise DevTestingError("This test is designed to use fps == 10.") buffer = make_new_buffer( - tmp_path / f"buffer_{uuid4().hex}", delta_timestamps={"index": [-0.3, -0.24, 0, 0.26, 0.3]} + tmp_path / f"buffer_{uuid4().hex}", delta_timestamps={"index": [-0.3, -0.2, 0, 0.2, 0.3]} ) new_data = make_spoof_data_frames(n_episodes=1, n_frames_per_episode=5) buffer.add_episodes(new_data) - buffer.tolerance_s = 0.04 item = buffer[2] data, is_pad = item["index"], item["index_is_pad"] assert torch.equal(data, torch.tensor([0, 0, 2, 4, 4])), "Data does not match expected values" assert torch.equal( - is_pad, torch.tensor([True, False, False, True, True]) + is_pad, torch.tensor([True, False, False, False, True]) ), "Padding does not match expected values" @@ -222,25 +221,59 @@ def test_delta_timestamps_outside_tolerance_outside_episode_range(tmp_path): ( ("lerobot/aloha_mobile_cabinet", True), # choose aloha_mobile_cabinet to have multiple image keys ("lerobot/aloha_mobile_cabinet", False), - ("lerobot/pusht_image", False), + ("lerobot/pusht", False), + ), +) +def test_camera_keys(tmp_path: Path, dataset_repo_id: str, decode_video: bool): + """Check that the camera_keys property returns all relevant keys.""" + storage_dir = tmp_path / f"{dataset_repo_id}_{uuid4().hex}_{decode_video}" + buffer = DataBuffer.from_huggingface_hub(dataset_repo_id, decode_video, storage_dir=storage_dir) + assert set(buffer.camera_keys) == {k for k in buffer._data if k.startswith("observation.image")} + + +# TODO(now): Make the test lighter. +@pytest.mark.parametrize( + ("dataset_repo_id", "decode_video"), + ( + ("lerobot/aloha_mobile_cabinet", True), # choose aloha_mobile_cabinet to have multiple image keys + ("lerobot/aloha_mobile_cabinet", False), + ("lerobot/pusht", False), ), ) -def test_getter_images_with_delta_timestamps(tmp_path: Path, dataset_repo_id: str, decode_video: bool): +def test_getter_returns_pytorch_format(tmp_path: Path, dataset_repo_id: str, decode_video: bool): + """Checks that tensors are returned and that images are torch.float32, in range [0, 1], channel-first.""" + # Note: storage_dir specified explicitly in order to make use of pytest's temporary file fixture. + storage_dir = tmp_path / f"{dataset_repo_id}_{uuid4().hex}_{decode_video}" + buffer = DataBuffer.from_huggingface_hub(dataset_repo_id, decode_video, storage_dir=storage_dir) + # Just iterate through the start and end of the dataset to make the test faster. + for i in np.concatenate([np.arange(0, 10), np.arange(-10, 0)]): + item = buffer[i] + for k in item: + assert isinstance(item[k], torch.Tensor) + for k in buffer.camera_keys: + assert item[k].dtype == torch.float32, "images aren't float32" + assert item[k].max() <= 1, "image values go above max of range [0, 1]" + assert item[k].min() >= 0, "image values go below min of range [0, 1]" + c, h, w = item[k].shape + assert c < min(h, w), "images are not channel-first" + + +def test_getter_images_with_delta_timestamps(tmp_path: Path): """Checks a basic delta_timestamps use case with images. Specifically, makes sure that the items returned by the getter have the correct number of frames. 
- Note: images deserve particular attention because video decoding is involved, and because they are not - covered by the basic tests above that use simple spoof data. + Note: images deserve particular attention because they are not covered by the basic tests above that use + simple spoof data. """ + dataset_repo_id = "lerobot/pusht_image" lerobot_dataset_info = load_info(dataset_repo_id, version=CODEBASE_VERSION, root=DATA_DIR) fps = lerobot_dataset_info["fps"] # Note: storage_dir specified explicitly in order to make use of pytest's temporary file fixture. - storage_dir = tmp_path / f"{dataset_repo_id}_{uuid4().hex}_{decode_video}" - buffer = DataBuffer.from_huggingface_hub(dataset_repo_id, decode_video, storage_dir=storage_dir) - + storage_dir = tmp_path / f"{dataset_repo_id}_{uuid4().hex}" + buffer = DataBuffer.from_huggingface_hub(dataset_repo_id, storage_dir=storage_dir, fps=fps) delta_timestamps = [-1 / fps, 0.0, 1 / fps] - buffer.set_delta_timestamps_and_fps({k: [-1 / fps, 0.0, 1 / fps] for k in buffer.camera_keys}, fps) + buffer.set_delta_timestamps({k: delta_timestamps for k in buffer.camera_keys}) # Just iterate through the start and end of the dataset to make the test faster. for i in np.concatenate([np.arange(0, 10), np.arange(-10, 0)]): @@ -248,7 +281,40 @@ def test_getter_images_with_delta_timestamps(tmp_path: Path, dataset_repo_id: st for k in buffer.camera_keys: assert item[k].shape[0] == len(delta_timestamps) - # TODO(now) check that the frames come out the same with / without decode video. + +def test_getter_video_images_with_delta_timestamps(tmp_path: Path): + """Checks that images from the video dataset and decoded video dataset are the same. + + Adds to `test_getter_images_with_delta_timestamps` by testing with a video dataset. + + Note: images deserve particular attention because video decoding is involved, and because they are not + covered by the basic tests above that use simple spoof data. + """ + dataset_repo_id = "lerobot/aloha_mobile_cabinet" + lerobot_dataset_info = load_info(dataset_repo_id, version=CODEBASE_VERSION, root=DATA_DIR) + fps = lerobot_dataset_info["fps"] + # Note: storage_dir specified explicitly in order to make use of pytest's temporary file fixture. + buffer_video = DataBuffer.from_huggingface_hub( + dataset_repo_id, False, storage_dir=tmp_path / f"{dataset_repo_id}_{uuid4().hex}_False", fps=fps + ) + buffer_decode = DataBuffer.from_huggingface_hub( + dataset_repo_id, True, storage_dir=tmp_path / f"{dataset_repo_id}_{uuid4().hex}_True", fps=fps + ) + + assert set(buffer_video.camera_keys) == set(buffer_decode.camera_keys) + + delta_timestamps = [-1 / fps, 0.0, 1 / fps] + buffer_video.set_delta_timestamps({k: delta_timestamps for k in buffer_video.camera_keys}) + buffer_decode.set_delta_timestamps({k: delta_timestamps for k in buffer_decode.camera_keys}) + + # Just iterate through the start and end of the datasets to make the test faster. 
+ for i in np.concatenate([np.arange(0, 10), np.arange(-10, 0)]): + item_video = buffer_video[i] + item_decode = buffer_decode[i] + for k in buffer_video.camera_keys: + assert item_video[k].shape[0] == len(delta_timestamps) + assert item_decode[k].shape[0] == len(delta_timestamps) + assert torch.equal(item_video[k], item_decode[k]) @pytest.mark.parametrize( @@ -317,44 +383,6 @@ def test_from_huggingface_hub(tmp_path: Path, dataset_repo_id: str, decode_video raise DevTestingError(f"Tests not implemented for this feature type: {type(feature)=}.") -@pytest.mark.parametrize( - ("dataset_repo_id", "decode_video"), - ( - ("lerobot/aloha_mobile_cabinet", True), # choose aloha_mobile_cabinet to have multiple image keys - ("lerobot/aloha_mobile_cabinet", False), - ("lerobot/pusht", False), - ), -) -def test_camera_keys(tmp_path: Path, dataset_repo_id: str, decode_video: bool): - """Check that the camera_keys property returns all relevant keys.""" - storage_dir = tmp_path / f"{dataset_repo_id}_{uuid4().hex}_{decode_video}" - buffer = DataBuffer.from_huggingface_hub(dataset_repo_id, decode_video, storage_dir=storage_dir) - assert set(buffer.camera_keys) == {k for k in buffer._data if k.startswith("observation.image")} - - -@pytest.mark.parametrize( - ("dataset_repo_id", "decode_video"), - ( - ("lerobot/aloha_mobile_cabinet", True), # choose aloha_mobile_cabinet to have multiple image keys - ("lerobot/aloha_mobile_cabinet", False), - ("lerobot/pusht", False), - ), -) -def test_getter_returns_pytorch_format(tmp_path: Path, dataset_repo_id: str, decode_video: bool): - # Note: storage_dir specified explicitly in order to make use of pytest's temporary file fixture. - storage_dir = tmp_path / f"{dataset_repo_id}_{uuid4().hex}_{decode_video}" - buffer = DataBuffer.from_huggingface_hub(dataset_repo_id, decode_video, storage_dir=storage_dir) - # Just iterate through the start and end of the dataset to make the test faster. - for i in np.concatenate([np.arange(0, 10), np.arange(-10, 0)]): - item = buffer[i] - for k in item: - assert isinstance(item[k], torch.Tensor) - if k in buffer.camera_keys: - assert item[k].dtype == torch.float32, "images aren't float32" - h, w, c = item.shape - assert c < min(h, w), "images are not channel-first" - - # Arbitrarily set small dataset sizes, making sure to have uneven sizes. 
@pytest.mark.parametrize("offline_dataset_size", [0, 6]) @pytest.mark.parametrize("online_dataset_size", [0, 4]) From 52da98d2171b6b272605f854f6b4c26eed471b2e Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Thu, 19 Sep 2024 09:46:29 +0100 Subject: [PATCH 13/29] backup wip --- tests/test_data_buffer.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tests/test_data_buffer.py b/tests/test_data_buffer.py index ef01cc9a7..5cf58ea68 100644 --- a/tests/test_data_buffer.py +++ b/tests/test_data_buffer.py @@ -219,8 +219,9 @@ def test_delta_timestamps_outside_tolerance_outside_episode_range(tmp_path): @pytest.mark.parametrize( ("dataset_repo_id", "decode_video"), ( - ("lerobot/aloha_mobile_cabinet", True), # choose aloha_mobile_cabinet to have multiple image keys - ("lerobot/aloha_mobile_cabinet", False), + # choose unitreeh1_two_robot_greeting to have multiple image keys + ("lerobot/unitreeh1_two_robot_greeting", True), + ("lerobot/unitreeh1_two_robot_greeting", False), ("lerobot/pusht", False), ), ) @@ -235,8 +236,9 @@ def test_camera_keys(tmp_path: Path, dataset_repo_id: str, decode_video: bool): @pytest.mark.parametrize( ("dataset_repo_id", "decode_video"), ( - ("lerobot/aloha_mobile_cabinet", True), # choose aloha_mobile_cabinet to have multiple image keys - ("lerobot/aloha_mobile_cabinet", False), + # choose unitreeh1_two_robot_greeting to have multiple image keys + ("lerobot/unitreeh1_two_robot_greeting", True), + ("lerobot/unitreeh1_two_robot_greeting", False), ("lerobot/pusht", False), ), ) @@ -290,7 +292,8 @@ def test_getter_video_images_with_delta_timestamps(tmp_path: Path): Note: images deserve particular attention because video decoding is involved, and because they are not covered by the basic tests above that use simple spoof data. """ - dataset_repo_id = "lerobot/aloha_mobile_cabinet" + # choose unitreeh1_two_robot_greeting to have multiple image keys + dataset_repo_id = "lerobot/unitreeh1_two_robot_greeting" lerobot_dataset_info = load_info(dataset_repo_id, version=CODEBASE_VERSION, root=DATA_DIR) fps = lerobot_dataset_info["fps"] # Note: storage_dir specified explicitly in order to make use of pytest's temporary file fixture. 
From 05cb6ffc5cc66c8036a22ab19542055d12d14325 Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Thu, 19 Sep 2024 13:43:10 +0100 Subject: [PATCH 14/29] backup wip --- lerobot/common/datasets/data_buffer.py | 8 ++++++-- lerobot/scripts/train.py | 1 - tests/test_data_buffer.py | 4 ++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/lerobot/common/datasets/data_buffer.py b/lerobot/common/datasets/data_buffer.py index b1ac908b2..609aa7683 100644 --- a/lerobot/common/datasets/data_buffer.py +++ b/lerobot/common/datasets/data_buffer.py @@ -224,7 +224,7 @@ def _save_data_spec(self, data_spec: dict[str, dict]): with open(meta_file, "w") as f: for k in data_spec: data_spec[k]["dtype"] = str(data_spec[k]["dtype"]) - json.dump(data_spec, f) + json.dump(data_spec, f, indent=2) def _load_data_spec(self) -> dict[str, dict]: """Load the data type and shape specifications from the storage directory.""" @@ -631,8 +631,12 @@ def from_huggingface_hub( data_dict[k] = np.stack( [np.array(dct["path"], dtype=f"S{MAX_VIDEO_PATH_LENGTH}") for dct in hf_dataset[k]] ) + elif isinstance(feature, datasets.features.Sequence): + data_dict[k] = np.array(hf_dataset[k], dtype=np.dtype(feature.feature.dtype)) + elif isinstance(feature, datasets.features.Value): + data_dict[k] = np.array(hf_dataset[k], dtype=np.dtype(feature.dtype)) else: - data_dict[k] = np.array(hf_dataset[k]) + raise NotImplementedError(f"feature type {type(feature)} is not handled.") if is_video_dataset and not decode_video: obj._is_video_dataset = True obj.add_episodes(data_dict) # note this must happen after setting _is_video_dataset. diff --git a/lerobot/scripts/train.py b/lerobot/scripts/train.py index 0876e0012..197327279 100644 --- a/lerobot/scripts/train.py +++ b/lerobot/scripts/train.py @@ -406,7 +406,6 @@ def evaluate_and_checkpoint_if_needed(step, is_online): if cfg.get("use_lerobot_data_buffer", False): offline_dataset_for_dataloader = DataBuffer.from_huggingface_hub( offline_dataset.repo_id, - storage_dir=Path(f"/tmp/{offline_dataset.repo_id}"), fps=offline_dataset.fps, delta_timestamps=offline_dataset.delta_timestamps, decode_video=offline_dataset.video, diff --git a/tests/test_data_buffer.py b/tests/test_data_buffer.py index 5cf58ea68..09883496e 100644 --- a/tests/test_data_buffer.py +++ b/tests/test_data_buffer.py @@ -219,7 +219,7 @@ def test_delta_timestamps_outside_tolerance_outside_episode_range(tmp_path): @pytest.mark.parametrize( ("dataset_repo_id", "decode_video"), ( - # choose unitreeh1_two_robot_greeting to have multiple image keys + # choose unitreeh1_two_robot_greeting to have multiple image keys (with minimal video data) ("lerobot/unitreeh1_two_robot_greeting", True), ("lerobot/unitreeh1_two_robot_greeting", False), ("lerobot/pusht", False), @@ -232,7 +232,6 @@ def test_camera_keys(tmp_path: Path, dataset_repo_id: str, decode_video: bool): assert set(buffer.camera_keys) == {k for k in buffer._data if k.startswith("observation.image")} -# TODO(now): Make the test lighter. @pytest.mark.parametrize( ("dataset_repo_id", "decode_video"), ( @@ -334,6 +333,7 @@ def test_from_huggingface_hub(tmp_path: Path, dataset_repo_id: str, decode_video - We can make a buffer from a Hugging Face Hub dataset repository. - The buffer we make, accurately reflects the hub dataset. - If we try to make it a second time, everything still works as expected. 
+ - All non-image data """ for iteration in range(2): # do it twice to check that running with an existing cached buffer also works hf_dataset = load_hf_dataset(dataset_repo_id, version=CODEBASE_VERSION, root=DATA_DIR, split="train") From 7da0988666df84606b9648747c89e570e0302341 Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Thu, 19 Sep 2024 15:09:01 +0100 Subject: [PATCH 15/29] backup wip --- Makefile | 45 ++++++++++++++++- lerobot/common/datasets/data_buffer.py | 69 ++++++++++---------------- lerobot/scripts/train.py | 10 +++- tests/test_data_buffer.py | 4 +- 4 files changed, 80 insertions(+), 48 deletions(-) diff --git a/Makefile b/Makefile index 409178f63..afb18c7da 100644 --- a/Makefile +++ b/Makefile @@ -20,6 +20,8 @@ build-gpu: test-end-to-end: ${MAKE} DEVICE=$(DEVICE) test-act-ete-train + ${MAKE} DEVICE=$(DEVICE) test-act-ete-train-data-buffer + ${MAKE} DEVICE=$(DEVICE) test-act-ete-train-data-buffer-decode-video ${MAKE} DEVICE=$(DEVICE) test-act-ete-eval ${MAKE} DEVICE=$(DEVICE) test-act-ete-train-amp ${MAKE} DEVICE=$(DEVICE) test-act-ete-eval-amp @@ -31,8 +33,6 @@ test-end-to-end: ${MAKE} DEVICE=$(DEVICE) test-default-ete-eval ${MAKE} DEVICE=$(DEVICE) test-act-pusht-tutorial -# TODO(now): Add tests with siphon - test-act-ete-train: python lerobot/scripts/train.py \ policy=act \ @@ -52,6 +52,47 @@ test-act-ete-train: training.image_transforms.enable=true \ hydra.run.dir=tests/outputs/act/ +test-act-ete-train-data-buffer: + python lerobot/scripts/train.py \ + policy=act \ + policy.dim_model=64 \ + env=aloha \ + wandb.enable=False \ + training.offline_steps=2 \ + training.online_steps=0 \ + eval.n_episodes=1 \ + eval.batch_size=1 \ + device=$(DEVICE) \ + training.save_checkpoint=true \ + training.save_freq=2 \ + policy.n_action_steps=20 \ + policy.chunk_size=20 \ + training.batch_size=2 \ + training.image_transforms.enable=true \ + hydra.run.dir=tests/outputs/act_buffer/ \ + +use_lerobot_data_buffer=true \ + +test-act-ete-train-data-buffer-decode-video: + python lerobot/scripts/train.py \ + policy=act \ + policy.dim_model=64 \ + env=aloha \ + wandb.enable=False \ + training.offline_steps=2 \ + training.online_steps=0 \ + eval.n_episodes=1 \ + eval.batch_size=1 \ + device=$(DEVICE) \ + training.save_checkpoint=true \ + training.save_freq=2 \ + policy.n_action_steps=20 \ + policy.chunk_size=20 \ + training.batch_size=2 \ + training.image_transforms.enable=true \ + hydra.run.dir=tests/outputs/act_buffer_decode_video/ \ + +use_lerobot_data_buffer=true \ + +lerobot_data_buffer_decode_video=true \ + test-act-ete-eval: python lerobot/scripts/eval.py \ -p tests/outputs/act/checkpoints/000002/pretrained_model \ diff --git a/lerobot/common/datasets/data_buffer.py b/lerobot/common/datasets/data_buffer.py index 609aa7683..bb41ed22c 100644 --- a/lerobot/common/datasets/data_buffer.py +++ b/lerobot/common/datasets/data_buffer.py @@ -18,7 +18,6 @@ import json import os import shutil -from itertools import chain from pathlib import Path from typing import Callable @@ -426,22 +425,6 @@ def _item_to_tensors(item: dict) -> dict: item_[k] = torch.tensor(v) return item_ - def _optimized_advanced_slice(self, data_key: str, indices: np.ndarray) -> np.ndarray: - """Convert advanced slicing to basic slicing by finding contiguous ranges in the requested indices. - - TODO(now): Is this needed? 
- """ - indices_diff = np.diff(indices, prepend=indices[0] - 1) - where_not_1 = np.where(indices_diff != 1)[0] - ptr = 0 - ret = [] - for ix in chain(where_not_1, [len(indices)]): - ret.append(self._data[data_key][indices[ptr] : indices[ix - 1] + 1]) - ptr = ix - - # Avoid creating a copy with concatenate if possible. - return np.concatenate(ret) if len(ret) > 1 else ret[0] - def __len__(self): return self.num_samples @@ -501,22 +484,16 @@ def __getitem__(self, idx: int) -> dict[str, torch.Tensor]: if self.is_video_dataset and data_key.startswith(self.IMAGE_KEY_PREFIX): video_timestamps[data_key] = self._data[self.TIMESTAMP_KEY][episode_data_indices[argmin_]] - # Load frames for this data key. - if np.any(np.diff(argmin_) != 1): - item[data_key] = self._data[data_key][episode_data_indices[argmin_]] - # item[data_key] = self._optimized_advanced_slice(data_key, episode_data_indices[argmin_]) - else: - # Do basic slicing where possible - item[data_key] = self._data[data_key][ - episode_data_indices[argmin_.min()] : episode_data_indices[argmin_.max()] + 1 - ] + item[data_key] = self._data[data_key][episode_data_indices[argmin_]] item[f"{data_key}{self.IS_PAD_POSTFIX}"] = is_pad if self.is_video_dataset: item_ = dict(item) for k in self.camera_keys: - if self.delta_timestamps is None: + if self.delta_timestamps is None or not any( + k in self.delta_timestamps for k in self.camera_keys + ): item_[k] = {"path": item[k].decode(), "timestamp": item[self.TIMESTAMP_KEY]} else: item_[k] = [ @@ -550,6 +527,7 @@ def from_huggingface_hub( repo_id: str, decode_video: bool = False, root: Path | None = DATA_DIR, + verbose: bool = False, **kwargs, ) -> "DataBuffer": """Create a DataBuffer from a data repository on the Hugging Face Hub. @@ -567,8 +545,6 @@ def from_huggingface_hub( `/tmp/{repo_id}_{hf_dataset._fingerprint}_{decoded?}` unless provided explicitly. Returns: The resulting DataBuffer object. - - TODO(now): Consider populating the buffer one episode at a time in order not to kill RAM. """ for k in ["data_spec", "buffer_capacity"]: if k in kwargs: @@ -576,6 +552,13 @@ def from_huggingface_hub( hf_dataset = load_hf_dataset(repo_id, version=CODEBASE_VERSION, root=root, split="train") hf_dataset.set_transform(lambda x: x) + # Get some metadata necessary for processing videos. + lerobot_dataset_info = load_info(repo_id, version=CODEBASE_VERSION, root=root) + if not lerobot_dataset_info.get("video", False) and decode_video: + raise ValueError(f"The provided dataset is not a video dataset but you have {decode_video=}") + if lerobot_dataset_info.get("video", False): + videos_path = load_videos(repo_id, version=CODEBASE_VERSION, root=root) + kwargs.setdefault( "storage_dir", DataBuffer._default_storage_dir_from_huggingface_hub( @@ -587,22 +570,23 @@ def from_huggingface_hub( if kwargs["storage_dir"].exists(): buffer_already_on_disk = True + # Create the DataBuffer object. Reminder: if the storage directory already exists, this reads it. + # Otherwise, the storage directory is not created until later when we make the first call to + # `add_episodes`. obj = cls( **kwargs, buffer_capacity=len(hf_dataset) if not buffer_already_on_disk else None, ) + + if lerobot_dataset_info.get("video", False) and not decode_video: + obj._is_video_dataset = True + if obj._is_video_dataset: + obj._videos_dir = kwargs["storage_dir"] / "videos" + # If we have accessed an existing cached data buffer, just return the object as is. if buffer_already_on_disk: return obj - # Get some metadata necessary for processing videos. 
- lerobot_dataset_info = load_info(repo_id, version=CODEBASE_VERSION, root=root) - is_video_dataset = lerobot_dataset_info.get("video", False) - if not is_video_dataset and decode_video: - raise ValueError(f"The provided dataset is not a video dataset but you have {decode_video=}") - if is_video_dataset: - videos_path = load_videos(repo_id, version=CODEBASE_VERSION, root=root) - # Populate the buffer with the data from the dataset. data_dict = {} for k, feature in hf_dataset.features.items(): @@ -637,14 +621,11 @@ def from_huggingface_hub( data_dict[k] = np.array(hf_dataset[k], dtype=np.dtype(feature.dtype)) else: raise NotImplementedError(f"feature type {type(feature)} is not handled.") - if is_video_dataset and not decode_video: - obj._is_video_dataset = True + obj.add_episodes(data_dict) # note this must happen after setting _is_video_dataset. - if is_video_dataset and not decode_video: - # Symlink videos if needed. - obj._videos_dir = kwargs["storage_dir"] / "videos" - if not obj._videos_dir.exists(): - os.symlink(videos_path.absolute(), obj._videos_dir) + # Symlink vidoes if needed. + if obj._is_video_dataset and not obj._videos_dir.exists(): + os.symlink(videos_path.absolute(), obj._videos_dir) return obj @staticmethod diff --git a/lerobot/scripts/train.py b/lerobot/scripts/train.py index 197327279..6ff372ffa 100644 --- a/lerobot/scripts/train.py +++ b/lerobot/scripts/train.py @@ -404,11 +404,19 @@ def evaluate_and_checkpoint_if_needed(step, is_online): sampler = None if cfg.get("use_lerobot_data_buffer", False): + logging.info("Siphoning the dataset into a DataBuffer.") + decode_video = offline_dataset.video and cfg.get("lerobot_data_buffer_decode_video", False) + if decode_video: + logging.info( + "You have chosen to decode the video. It could take some time to populate the buffer " + "depending on the amount of data (but it only needs to happen once, and data loading will be " + "fast!)" + ) offline_dataset_for_dataloader = DataBuffer.from_huggingface_hub( offline_dataset.repo_id, fps=offline_dataset.fps, delta_timestamps=offline_dataset.delta_timestamps, - decode_video=offline_dataset.video, + decode_video=decode_video, ) else: offline_dataset_for_dataloader = offline_dataset diff --git a/tests/test_data_buffer.py b/tests/test_data_buffer.py index 09883496e..596fd5cbb 100644 --- a/tests/test_data_buffer.py +++ b/tests/test_data_buffer.py @@ -332,8 +332,8 @@ def test_from_huggingface_hub(tmp_path: Path, dataset_repo_id: str, decode_video Check that: - We can make a buffer from a Hugging Face Hub dataset repository. - The buffer we make, accurately reflects the hub dataset. + - We can get an item from the buffer. - If we try to make it a second time, everything still works as expected. - - All non-image data """ for iteration in range(2): # do it twice to check that running with an existing cached buffer also works hf_dataset = load_hf_dataset(dataset_repo_id, version=CODEBASE_VERSION, root=DATA_DIR, split="train") @@ -384,6 +384,8 @@ def test_from_huggingface_hub(tmp_path: Path, dataset_repo_id: str, decode_video assert np.array_equal(buffer.get_data_by_key(k), hf_dataset[k]) else: raise DevTestingError(f"Tests not implemented for this feature type: {type(feature)=}.") + # Check that we can get an item. + _ = buffer[0] # Arbitrarily set small dataset sizes, making sure to have uneven sizes. 
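One detail worth calling out in the diffs above: video paths are stored as fixed-width byte strings (dtype `S{MAX_VIDEO_PATH_LENGTH}`) so that they can live in a numpy memmap alongside the numeric data, and are `.decode()`d back to `str` in the item getter. A minimal sketch of that round trip follows; the example paths here are made up for illustration.

```python
import numpy as np

MAX_VIDEO_PATH_LENGTH = 100  # mirrors the module-level constant

paths = [
    "videos/observation.image_episode_000000.mp4",  # hypothetical relative paths
    "videos/observation.image_episode_000001.mp4",
]
# Fixed-width bytes give every row the same itemsize, which memmaps require.
arr = np.array(paths, dtype=f"S{MAX_VIDEO_PATH_LENGTH}")
assert arr.itemsize == MAX_VIDEO_PATH_LENGTH

# Decoding back to str is what the item getter does before handing the
# relative path to the video decoder.
assert arr[0].decode() == paths[0]
```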
From 5bc1500c6b67c912700c999e391e3b0878d92e57 Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Thu, 19 Sep 2024 15:52:53 +0100 Subject: [PATCH 16/29] ready for review --- lerobot/common/datasets/data_buffer.py | 58 ++++++++++++++------------ 1 file changed, 31 insertions(+), 27 deletions(-) diff --git a/lerobot/common/datasets/data_buffer.py b/lerobot/common/datasets/data_buffer.py index bb41ed22c..ea8dae8ef 100644 --- a/lerobot/common/datasets/data_buffer.py +++ b/lerobot/common/datasets/data_buffer.py @@ -29,7 +29,7 @@ from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION, DATA_DIR, LeRobotDataset from lerobot.common.datasets.utils import load_hf_dataset, load_info, load_videos -from lerobot.common.datasets.video_utils import VideoFrame, decode_video_frames_torchvision, load_from_videos +from lerobot.common.datasets.video_utils import VideoFrame, decode_video_frames_torchvision from lerobot.common.utils.utils import inside_slurm # TODO(alexander-soare): Move somewhere more appropriate once the DataBuffer class permeates more of the coe @@ -446,6 +446,7 @@ def __getitem__(self, idx: int) -> dict[str, torch.Tensor]: item = {k: v[idx] for k, v in self._data.items() if not k.startswith("_")} + # If we are using delta_timestamps take slices of the data. if self.delta_timestamps is not None: episode_index = item[self.EPISODE_INDEX_KEY] current_ts = item[self.TIMESTAMP_KEY] @@ -458,7 +459,8 @@ def __getitem__(self, idx: int) -> dict[str, torch.Tensor]: episode_timestamps = self._data[self.TIMESTAMP_KEY][episode_data_indices] if self.is_video_dataset: - video_timestamps = {} # TODO(now): HACK + # We'll use this for `decode_video_frames_torchvision`. + video_delta_timestamps = {} for data_key in self.delta_timestamps: # Get timestamps used as query to retrieve data of previous/future frames. @@ -482,42 +484,44 @@ def __getitem__(self, idx: int) -> dict[str, torch.Tensor]: ) if self.is_video_dataset and data_key.startswith(self.IMAGE_KEY_PREFIX): - video_timestamps[data_key] = self._data[self.TIMESTAMP_KEY][episode_data_indices[argmin_]] - - item[data_key] = self._data[data_key][episode_data_indices[argmin_]] + video_delta_timestamps[data_key] = self._data[self.TIMESTAMP_KEY][ + episode_data_indices[argmin_] + ] + else: + item[data_key] = self._data[data_key][episode_data_indices[argmin_]] item[f"{data_key}{self.IS_PAD_POSTFIX}"] = is_pad if self.is_video_dataset: - item_ = dict(item) + # Decode the required video frames. 
for k in self.camera_keys:
- if self.delta_timestamps is None or not any(
- k in self.delta_timestamps for k in self.camera_keys
- ):
- item_[k] = {"path": item[k].decode(), "timestamp": item[self.TIMESTAMP_KEY]}
+ this_key_has_delta_timestamps = (
+ self.delta_timestamps is not None and k in self.delta_timestamps
+ )
+ requested_timestamps = (
+ video_delta_timestamps[k] if this_key_has_delta_timestamps else [item[self.TIMESTAMP_KEY]]
+ )
+ img_or_imgs = decode_video_frames_torchvision(
+ video_path=self.storage_dir / item[k].decode(),
+ timestamps=requested_timestamps,
+ tolerance_s=self.tolerance_s or 1e-8, # 1e-8 to account for no fps setting
+ backend="pyav",
+ to_pytorch_format=True,
+ )
+ if this_key_has_delta_timestamps:
+ item[k] = img_or_imgs
 else:
- item_[k] = [
- {"path": item[k][i].decode(), "timestamp": video_timestamps[k][i]}
- for i in range(len(item[k]))
- ]
- item = load_from_videos(
- item_,
- self.camera_keys,
- self._videos_dir,
- self.tolerance_s or 1e-8, # 1e-8 to account for no delta_timestamps
- "pyav",
- to_pytorch_format=True,
- )
+ item[k] = img_or_imgs[0] # in this case we don't want a temporal dimension
 else:
 # Convert to PyTorch format: channel-first, float32, normalize to range [0, 1].
- for cam in self.camera_keys:
- item[cam] = einops.rearrange(
- torch.from_numpy(item[cam].astype(np.float32) / 255.0), "... h w c -> ... c h w"
+ for k in self.camera_keys:
+ item[k] = einops.rearrange(
+ torch.from_numpy(item[k].astype(np.float32) / 255.0), "... h w c -> ... c h w"
 )
 if self.image_transform is not None:
- for cam in self.camera_keys:
- item[cam] = self.image_transform(item[cam])
+ for k in self.camera_keys:
+ item[k] = self.image_transform(item[k])
 return self._item_to_tensors(item)

From 63c23203ec24c4a7626ae26e14a6061a04864979 Mon Sep 17 00:00:00 2001
From: Alexander Soare
Date: Thu, 19 Sep 2024 17:46:13 +0100
Subject: [PATCH 17/29] temporarily revert file names for diff

---
 lerobot/common/datasets/{data_buffer.py => online_buffer.py} | 0
 lerobot/scripts/train.py | 2 +-
 tests/{test_data_buffer.py => test_online_buffer.py} | 4 ++--
 3 files changed, 3 insertions(+), 3 deletions(-)
 rename lerobot/common/datasets/{data_buffer.py => online_buffer.py} (100%)
 rename tests/{test_data_buffer.py => test_online_buffer.py} (99%)

diff --git a/lerobot/common/datasets/data_buffer.py b/lerobot/common/datasets/online_buffer.py
similarity index 100%
rename from lerobot/common/datasets/data_buffer.py
rename to lerobot/common/datasets/online_buffer.py
diff --git a/lerobot/scripts/train.py b/lerobot/scripts/train.py
index 6ff372ffa..7b8e52081 100644
--- a/lerobot/scripts/train.py
+++ b/lerobot/scripts/train.py
@@ -30,9 +30,9 @@
 from torch import nn
 from torch.cuda.amp import GradScaler

-from lerobot.common.datasets.data_buffer import DataBuffer, compute_sampler_weights
 from lerobot.common.datasets.factory import make_dataset, resolve_delta_timestamps
 from lerobot.common.datasets.lerobot_dataset import MultiLeRobotDataset
+from lerobot.common.datasets.online_buffer import DataBuffer, compute_sampler_weights
 from lerobot.common.datasets.sampler import EpisodeAwareSampler
 from lerobot.common.datasets.utils import cycle
 from lerobot.common.envs.factory import make_env
diff --git a/tests/test_data_buffer.py b/tests/test_online_buffer.py
similarity index 99%
rename from tests/test_data_buffer.py
rename to tests/test_online_buffer.py
index 596fd5cbb..612a2eee1 100644
--- a/tests/test_data_buffer.py
+++ b/tests/test_online_buffer.py
@@ -22,12 +22,12 @@
 import pytest
 import torch

-from 
lerobot.common.datasets.data_buffer import (
+from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION, DATA_DIR, LeRobotDataset
+from lerobot.common.datasets.online_buffer import (
 DataBuffer,
 TimestampOutsideToleranceError,
 compute_sampler_weights,
 )
-from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION, DATA_DIR, LeRobotDataset
 from lerobot.common.datasets.utils import hf_transform_to_torch, load_hf_dataset, load_info, load_videos
 from lerobot.common.datasets.video_utils import VideoFrame, decode_video_frames_torchvision
 from tests.utils import DevTestingError

From f0848cdc775b0cbc3bfedf665219069683ee0b77 Mon Sep 17 00:00:00 2001
From: Alexander Soare
Date: Fri, 20 Sep 2024 16:03:34 +0100
Subject: [PATCH 18/29] remove redundant kwarg

---
 lerobot/common/datasets/online_buffer.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/lerobot/common/datasets/online_buffer.py b/lerobot/common/datasets/online_buffer.py
index ea8dae8ef..b208fddfb 100644
--- a/lerobot/common/datasets/online_buffer.py
+++ b/lerobot/common/datasets/online_buffer.py
@@ -531,7 +531,6 @@ def from_huggingface_hub(
 repo_id: str,
 decode_video: bool = False,
 root: Path | None = DATA_DIR,
- verbose: bool = False,
 **kwargs,
 ) -> "DataBuffer":
 """Create a DataBuffer from a data repository on the Hugging Face Hub.

From f7f2972eb37ee2d161b4b40ceaff1a0b150dfee0 Mon Sep 17 00:00:00 2001
From: Alexander Soare
Date: Mon, 23 Sep 2024 11:31:45 +0100
Subject: [PATCH 19/29] revision

---
 lerobot/common/datasets/online_buffer.py | 13 +++++++------
 lerobot/scripts/train.py | 2 ++
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/lerobot/common/datasets/online_buffer.py b/lerobot/common/datasets/online_buffer.py
index b208fddfb..3bded9656 100644
--- a/lerobot/common/datasets/online_buffer.py
+++ b/lerobot/common/datasets/online_buffer.py
@@ -66,7 +66,7 @@ class DataBuffer(torch.utils.data.Dataset):
 """Data buffer and training data item getter.

 Data is considered to come in the form of "episodes" (an instance of a robot performing a task). Episodes
- are made up of "frames", which are chronoligically ordered and contain timestamp aligned data, potentially
+ are made up of "frames", which are chronologically ordered and contain timestamp aligned data, potentially
 including environment observations, and robot actions. NOTE: for the time being, we require all data
 modalities to be timestamp aligned. This constraint may be relaxed in the future.

@@ -87,10 +87,11 @@ class DataBuffer(torch.utils.data.Dataset):
 about the data types and shapes for each memmap. This allows us to load the data without having to
 specify the data specifications at runtime.

- The `add_episodes` method can be used to insert more data in the form of integral episodes (starting from
- frame 0 and with the frames ordered). The buffer has a compulsory size limit, which must be provided when
- creating a new one. Data is inserted in a circular fashion, inserting after the most recently added frame,
- and wrapping around to the start when necessary (in which case older episodes are overwritten).
+ A size limit must be specified when creating a new buffer (to know how much space to reserve on disk). The
+ `add_episodes` method can be used to insert data in the form of integral episodes (starting from frame 0
+ and with the frames ordered). 
Data is inserted in a circular fashion, inserting after the most recently + added frame, and wrapping around to the start of the buffer when necessary (in which case older episode + frames are overwritten). This class is also a PyTorch Dataset and can be used as such in a dataloader for a training loop. The item getter returns either a single frame, or a slice of a single episode if `delta_timestamps` is set. It also @@ -554,7 +555,7 @@ def from_huggingface_hub( raise ValueError(f"`{k}` should not be provided as it is inferred from the hub dataset.") hf_dataset = load_hf_dataset(repo_id, version=CODEBASE_VERSION, root=root, split="train") - hf_dataset.set_transform(lambda x: x) + hf_dataset.set_transform(lambda x: x) # there is a default transform in place. reset it # Get some metadata necessary for processing videos. lerobot_dataset_info = load_info(repo_id, version=CODEBASE_VERSION, root=root) if not lerobot_dataset_info.get("video", False) and decode_video: diff --git a/lerobot/scripts/train.py b/lerobot/scripts/train.py index 7b8e52081..169b7c063 100644 --- a/lerobot/scripts/train.py +++ b/lerobot/scripts/train.py @@ -417,6 +417,7 @@ def evaluate_and_checkpoint_if_needed(step, is_online): fps=offline_dataset.fps, delta_timestamps=offline_dataset.delta_timestamps, decode_video=decode_video, + image_transform=offline_dataset.image_transforms, ) else: offline_dataset_for_dataloader = offline_dataset @@ -493,6 +494,7 @@ def evaluate_and_checkpoint_if_needed(step, is_online): buffer_capacity=cfg.training.online_buffer_capacity, fps=online_env.unwrapped.metadata["render_fps"], delta_timestamps=cfg.training.delta_timestamps, + image_transform=offline_dataset.image_transforms, ) # If we are doing online rollouts asynchronously, deepcopy the policy to use for online rollouts (this From 2d05318159ddf933a6b5374dd130015af7365514 Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Tue, 24 Sep 2024 14:52:12 +0100 Subject: [PATCH 20/29] backup wip --- lerobot/common/datasets/online_buffer.py | 78 ++++++++++++++++-------- lerobot/scripts/train.py | 2 +- 2 files changed, 54 insertions(+), 26 deletions(-) diff --git a/lerobot/common/datasets/online_buffer.py b/lerobot/common/datasets/online_buffer.py index 3bded9656..6f7086bad 100644 --- a/lerobot/common/datasets/online_buffer.py +++ b/lerobot/common/datasets/online_buffer.py @@ -25,7 +25,9 @@ import einops import numpy as np import torch -from tqdm import tqdm +import torchvision +from PIL import Image +from tqdm import tqdm, trange from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION, DATA_DIR, LeRobotDataset from lerobot.common.datasets.utils import load_hf_dataset, load_info, load_videos @@ -146,11 +148,11 @@ def __init__( # Parameters for the data structure. self._storage_dir = Path(storage_dir) self._data: dict[str, np.memmap] = {} - self._is_video_dataset = False # may be switched to True by `from_huggingface_hub` + self._image_storage_mode = "memmap" self._videos_dir: str | None = None # may be set by `from_huggingface_hub` # If the storage directory already exists, load the memmaps. 
- if self._storage_dir.exists(): + if (self._storage_dir / self.METADATA_FILE_NAME).exists(): if buffer_capacity is not None: raise ValueError( "The storage directory already exists, which means you should not provide a " @@ -237,9 +239,9 @@ def _load_data_spec(self) -> dict[str, dict]: def _make_storage_dir(self, episode_data: dict[str, np.ndarray]): """Create the storage directory based on example episode data from the first `add_episodes` call.""" - assert ( - not self.storage_dir.exists() - ), "This method should only be called before the storage directory has been created." + assert not ( + self.storage_dir / self.METADATA_FILE_NAME + ).exists(), "This method should only be called before the storage directory has been created." self._storage_dir.mkdir(parents=True, exist_ok=True) @@ -359,12 +361,11 @@ def _add_episode(self, data: dict[str, np.ndarray]): for k in data: if not k.startswith(self.IMAGE_KEY_PREFIX): continue - if self._is_video_dataset: + if self._image_storage_mode in ["video", "png"]: if data[k].dtype != np.dtype(f"S{MAX_VIDEO_PATH_LENGTH}"): raise ValueError( f"Any data key starting with '{self.IMAGE_KEY_PREFIX}' is assumed to be an image, " - "and in a video dataset it should be string data (with a relative path to the video " - "to be loaded)." + "and it should be string data (with a relative path to the video/png to be loaded)." ) else: _, h, w, c = data[k].shape @@ -459,7 +460,7 @@ def __getitem__(self, idx: int) -> dict[str, torch.Tensor]: )[0] episode_timestamps = self._data[self.TIMESTAMP_KEY][episode_data_indices] - if self.is_video_dataset: + if self._image_storage_mode == "video": # We'll use this for `decode_video_frames_torchvision`. video_delta_timestamps = {} @@ -484,7 +485,7 @@ def __getitem__(self, idx: int) -> dict[str, torch.Tensor]: f"{self.tolerance_s=}) inside the episode range." ) - if self.is_video_dataset and data_key.startswith(self.IMAGE_KEY_PREFIX): + if self._image_storage_mode == "video" and data_key.startswith(self.IMAGE_KEY_PREFIX): video_delta_timestamps[data_key] = self._data[self.TIMESTAMP_KEY][ episode_data_indices[argmin_] ] @@ -493,7 +494,7 @@ def __getitem__(self, idx: int) -> dict[str, torch.Tensor]: item[f"{data_key}{self.IS_PAD_POSTFIX}"] = is_pad - if self.is_video_dataset: + if self._image_storage_mode == "video": # Decode the required video frames. for k in self.camera_keys: this_key_has_delta_timestamps = ( @@ -513,6 +514,14 @@ def __getitem__(self, idx: int) -> dict[str, torch.Tensor]: item[k] = img_or_imgs else: item[k] = img_or_imgs[0] # in this case we don't want a temporal dimension + elif self._image_storage_mode == "png": + imgs = [] + for k in self.camera_keys: + for rel_path in item[k]: + imgs.append( + torchvision.transforms.ToTensor()(Image.open(self.storage_dir / rel_path.decode())) + ) + item[k] = torch.stack(imgs) else: # Convert to PyTorch format: channel-last, float32, normalize to range [0, 1]. for k in self.camera_keys: @@ -530,7 +539,7 @@ def __getitem__(self, idx: int) -> dict[str, torch.Tensor]: def from_huggingface_hub( cls, repo_id: str, - decode_video: bool = False, + decode_images: bool = False, root: Path | None = DATA_DIR, **kwargs, ) -> "DataBuffer": @@ -541,8 +550,9 @@ def from_huggingface_hub( Args: repo_id: The dataset repository ID. - decode_video: If repo_id refers to a video dataset (the image observations are encoded as videos), - decode the videos and store the frames in a numpy memmap. + decode_images: Optionally decode videos files or image files into a numpy memmap up front. 
This + provides large speed benefits for data access but only if your storage can handle it (decoded + images take up a lot more storage space than videos or png files). root: (will be deprecated) Directory to load the dataset from, instead of the hub. **kwargs: Other arguments to `self.__init__` except for the `data_spec` and `buffer_capacity` arguments which are inferred automatically. `storage_dir` is set to @@ -558,20 +568,20 @@ def from_huggingface_hub( hf_dataset.set_transform(lambda x: x) # there is a default transform in place. reset it # Get some metadata necessary for processing videos. lerobot_dataset_info = load_info(repo_id, version=CODEBASE_VERSION, root=root) - if not lerobot_dataset_info.get("video", False) and decode_video: - raise ValueError(f"The provided dataset is not a video dataset but you have {decode_video=}") + # if not lerobot_dataset_info.get("video", False) and decode_images: + # raise ValueError(f"The provided dataset is not a video dataset but you have {decode_images=}") if lerobot_dataset_info.get("video", False): videos_path = load_videos(repo_id, version=CODEBASE_VERSION, root=root) kwargs.setdefault( "storage_dir", DataBuffer._default_storage_dir_from_huggingface_hub( - repo_id, hf_dataset._fingerprint, decode_video + repo_id, hf_dataset._fingerprint, decode_images ), ) buffer_already_on_disk = False - if kwargs["storage_dir"].exists(): + if Path(kwargs["storage_dir"] / DataBuffer.METADATA_FILE_NAME).exists(): buffer_already_on_disk = True # Create the DataBuffer object. Reminder: if the storage directory already exists, this reads it. @@ -582,11 +592,15 @@ def from_huggingface_hub( buffer_capacity=len(hf_dataset) if not buffer_already_on_disk else None, ) - if lerobot_dataset_info.get("video", False) and not decode_video: - obj._is_video_dataset = True - if obj._is_video_dataset: + if lerobot_dataset_info.get("video", False) and not decode_images: + obj._image_storage_mode = "video" + elif not decode_images: + obj._image_storage_mode = "png" + if obj._image_storage_mode == "video": obj._videos_dir = kwargs["storage_dir"] / "videos" + os.makedirs(obj.storage_dir, exist_ok=True) + # If we have accessed an existing cached data buffer, just return the object as is. if buffer_already_on_disk: return obj @@ -595,9 +609,23 @@ def from_huggingface_hub( data_dict = {} for k, feature in hf_dataset.features.items(): if isinstance(feature, datasets.features.Image): - data_dict[k] = np.stack([np.array(pil_img) for pil_img in hf_dataset[k]]) + if decode_images: + data_dict[k] = np.stack([np.array(pil_img) for pil_img in hf_dataset[k]]) + else: + relative_paths = [] + for i in trange(len(hf_dataset)): + item = hf_dataset[i] + pil_img = item[k] + frame_index = item[DataBuffer.FRAME_INDEX_KEY] + episode_index = item[DataBuffer.EPISODE_INDEX_KEY] + relative_path = f"images/{k}_episode_{episode_index:06d}_frame_{frame_index:06d}.png" + relative_paths.append(np.array(relative_path, dtype=f"S{MAX_VIDEO_PATH_LENGTH}")) + absolute_path = obj.storage_dir / relative_path + os.makedirs(absolute_path.parent, exist_ok=True) + pil_img.save(absolute_path) + data_dict[k] = np.stack(relative_paths, dtype=f"S{MAX_VIDEO_PATH_LENGTH}") elif isinstance(feature, VideoFrame): - if decode_video: + if decode_images: # Decode all videos into images. episode_indices = np.array(hf_dataset["episode_index"]) timestamps = np.array(hf_dataset["timestamp"]) @@ -628,7 +656,7 @@ def from_huggingface_hub( obj.add_episodes(data_dict) # note this must happen after setting _is_video_dataset. 
# Symlink vidoes if needed. - if obj._is_video_dataset and not obj._videos_dir.exists(): + if obj._image_storage_mode == "video" and not obj._videos_dir.exists(): os.symlink(videos_path.absolute(), obj._videos_dir) return obj diff --git a/lerobot/scripts/train.py b/lerobot/scripts/train.py index 169b7c063..964d547a6 100644 --- a/lerobot/scripts/train.py +++ b/lerobot/scripts/train.py @@ -416,7 +416,7 @@ def evaluate_and_checkpoint_if_needed(step, is_online): offline_dataset.repo_id, fps=offline_dataset.fps, delta_timestamps=offline_dataset.delta_timestamps, - decode_video=decode_video, + decode_images=decode_video, image_transform=offline_dataset.image_transforms, ) else: From c86c755c381e29df87c62a28a543998712130be4 Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Wed, 25 Sep 2024 11:46:23 +0100 Subject: [PATCH 21/29] add png option --- lerobot/common/datasets/online_buffer.py | 103 +++++++++++++++++------ 1 file changed, 79 insertions(+), 24 deletions(-) diff --git a/lerobot/common/datasets/online_buffer.py b/lerobot/common/datasets/online_buffer.py index 6f7086bad..851d9a851 100644 --- a/lerobot/common/datasets/online_buffer.py +++ b/lerobot/common/datasets/online_buffer.py @@ -18,6 +18,7 @@ import json import os import shutil +from enum import Enum from pathlib import Path from typing import Callable @@ -64,6 +65,12 @@ class TimestampOutsideToleranceError(Exception): pass +class DataBufferImageMode(Enum): + MEMMAP = "memmap" + PNG = "png" + VIDEO = "video" + + class DataBuffer(torch.utils.data.Dataset): """Data buffer and training data item getter. @@ -72,14 +79,25 @@ class DataBuffer(torch.utils.data.Dataset): including environment observations, and robot actions. NOTE: for the time being, we require all data modalities to be timestamp aligned. This constraint may be relaxed in the future. - The data is stored in a mapping from data keys to arrays with shape (total_number_of_frames, *data_dim). - The compulsory data keys are: + Non-image data is stored in a mapping from data keys to arrays with shape (total_number_of_frames, + *data_dim). The compulsory data keys are: - "index": A sequential integer index per frame. - "episode_index": A sequential integer index per episode. - "frame_index": A sequential integer index per frame within an episode (it resets for each episode). - "timestamp": The relative timestamp of the frame within the episode in units of seconds. The choice. of reference time is not important. + Image data may be stored in at least one of the following formats: video files, PNG image files, as raw + pixel arrays (in which case they also have a data key to array mapping as discussed above). Some words on + each of these storage methods: + - Video files: These are the most compact in terms of storage space. Decoding image frames from the + video files takes non-trivial time and one would be advised to keep an eye out for data loading + bottlenecks during policy training. + - Numpy memmaps (more on memmaps below): This is the least compact in terms of storage space (high + resolution camera feeds can rapidly overwhelm your storage capacity). On the other hand, random + access of image frames at least an order of magnitude faster than with video decoding. + - TODO(now) + Under the hood, the data is stored in `numpy.memmap`s, one for each data key. Loosely speaking, memory mapping (https://en.wikipedia.org/wiki/Memory-mapped_file) allows us to treat a portion of disk space as virtual memory. 
This allows us to work with more data than can fit in our physical memory, while treating @@ -117,12 +135,16 @@ class DataBuffer(torch.utils.data.Dataset): # By convention, all images should be stored under a key with this prefix. IMAGE_KEY_PREFIX = "observation.image" - METADATA_FILE_NAME = "meta.json" + METADATA_FILE_NAME = "metadata.json" + + VIDEOS_DIR = "videos" # directory (relative to storage directory), to store videos + IMAGES_DIR = "images" # directory (relative to storage directory), to store images def __init__( self, storage_dir: str | Path, buffer_capacity: int | None = None, + image_mode: DataBufferImageMode | None = None, image_transform: Callable[[np.ndarray], np.ndarray] | None = None, delta_timestamps: dict[str, list[float]] | dict[str, np.ndarray] | None = None, fps: float | None = None, @@ -137,6 +159,7 @@ def __init__( system's available disk space when choosing this. Note that if `storage_dir` references an existing storage directory, `buffer_capacity` should not be provided, as it is already included in "meta.json". + image_mode: TODO(now) image_transform: Transforms to apply in the item getter to all image data (any data whose key starts with "observation.image"). delta_timestamps: TODO(alexander-soare): Document this somewhere when @@ -148,10 +171,14 @@ def __init__( # Parameters for the data structure. self._storage_dir = Path(storage_dir) self._data: dict[str, np.memmap] = {} - self._image_storage_mode = "memmap" - self._videos_dir: str | None = None # may be set by `from_huggingface_hub` - # If the storage directory already exists, load the memmaps. + # Default assumption is that the image storage mode is memmaps meaning we don't need a video or image + # directory. + self._image_mode = DataBufferImageMode.MEMMAP + self._videos_dir: str | None = None + self._images_dir: str | None = None + + # If the storage directory and metadata files already exists, load the memmaps. if (self._storage_dir / self.METADATA_FILE_NAME).exists(): if buffer_capacity is not None: raise ValueError( @@ -162,6 +189,22 @@ def __init__( data_spec = self._load_data_spec() self._make_memmaps(data_spec, mode="r+") self._buffer_capacity = len(self._data[self.INDEX_KEY]) + # Set image mode based on what's available in the storage directory and/or the user's selection. + possible_image_modes = self._infer_image_modes() + if image_mode is not None: + if image_mode not in possible_image_modes: + raise ValueError( + f"Provided {image_mode=} not available with this storage directory. Modes available: " + f"{possible_image_modes}" + ) + self._image_mode = image_mode + else: + # If image_mode is not provided, default to VIDEO, followed by PNG, followed by MEMMAP + # (already set above). 
+ if DataBufferImageMode.VIDEO in possible_image_modes: + self._image_mode = DataBufferImageMode.VIDEO + elif DataBufferImageMode.PNG in possible_image_modes: + self._image_mode = DataBufferImageMode.PNG else: if buffer_capacity is None: raise ValueError( @@ -178,10 +221,6 @@ def __init__( def storage_dir(self) -> Path: return self._storage_dir - @property - def is_video_dataset(self) -> bool: - return self._is_video_dataset - @property def data_keys(self) -> list[str]: keys = set(self._data) @@ -220,6 +259,17 @@ def camera_keys(self) -> list[str]: """ return [k for k in self._data if k.startswith(self.IMAGE_KEY_PREFIX)] + def _infer_image_modes(self) -> list[DataBufferImageMode]: + """Infer which image modes are available according to what is in the storage directory""" + image_modes = [] + if (self.storage_dir / self.VIDEOS_DIR).exists(): + image_modes.append(DataBufferImageMode.VIDEO) + if (self.storage_dir / self.IMAGES_DIR).exists(): + image_modes.append(DataBufferImageMode.PNG) + if any(k.startswith("observation.image") for k in self._load_data_spec()): + image_modes.append(DataBufferImageMode.MEMMAP) + return image_modes + def _save_data_spec(self, data_spec: dict[str, dict]): """Save the data type and shape specifications to the storage directory.""" meta_file = self._storage_dir / self.METADATA_FILE_NAME @@ -361,7 +411,7 @@ def _add_episode(self, data: dict[str, np.ndarray]): for k in data: if not k.startswith(self.IMAGE_KEY_PREFIX): continue - if self._image_storage_mode in ["video", "png"]: + if self._image_mode in [DataBufferImageMode.PNG, DataBufferImageMode.VIDEO]: if data[k].dtype != np.dtype(f"S{MAX_VIDEO_PATH_LENGTH}"): raise ValueError( f"Any data key starting with '{self.IMAGE_KEY_PREFIX}' is assumed to be an image, " @@ -460,7 +510,7 @@ def __getitem__(self, idx: int) -> dict[str, torch.Tensor]: )[0] episode_timestamps = self._data[self.TIMESTAMP_KEY][episode_data_indices] - if self._image_storage_mode == "video": + if self._image_mode == DataBufferImageMode.VIDEO: # We'll use this for `decode_video_frames_torchvision`. video_delta_timestamps = {} @@ -485,7 +535,9 @@ def __getitem__(self, idx: int) -> dict[str, torch.Tensor]: f"{self.tolerance_s=}) inside the episode range." ) - if self._image_storage_mode == "video" and data_key.startswith(self.IMAGE_KEY_PREFIX): + if self._image_mode == DataBufferImageMode.VIDEO and data_key.startswith( + self.IMAGE_KEY_PREFIX + ): video_delta_timestamps[data_key] = self._data[self.TIMESTAMP_KEY][ episode_data_indices[argmin_] ] @@ -494,7 +546,7 @@ def __getitem__(self, idx: int) -> dict[str, torch.Tensor]: item[f"{data_key}{self.IS_PAD_POSTFIX}"] = is_pad - if self._image_storage_mode == "video": + if self._image_mode == DataBufferImageMode.VIDEO: # Decode the required video frames. for k in self.camera_keys: this_key_has_delta_timestamps = ( @@ -514,7 +566,7 @@ def __getitem__(self, idx: int) -> dict[str, torch.Tensor]: item[k] = img_or_imgs else: item[k] = img_or_imgs[0] # in this case we don't want a temporal dimension - elif self._image_storage_mode == "png": + elif self._image_mode == DataBufferImageMode.PNG: imgs = [] for k in self.camera_keys: for rel_path in item[k]: @@ -559,6 +611,8 @@ def from_huggingface_hub( `/tmp/{repo_id}_{hf_dataset._fingerprint}_{decoded?}` unless provided explicitly. Returns: The resulting DataBuffer object. + + # TODO(now): Add one episode at a time. 
""" for k in ["data_spec", "buffer_capacity"]: if k in kwargs: @@ -592,14 +646,15 @@ def from_huggingface_hub( buffer_capacity=len(hf_dataset) if not buffer_already_on_disk else None, ) - if lerobot_dataset_info.get("video", False) and not decode_images: - obj._image_storage_mode = "video" - elif not decode_images: - obj._image_storage_mode = "png" - if obj._image_storage_mode == "video": - obj._videos_dir = kwargs["storage_dir"] / "videos" - - os.makedirs(obj.storage_dir, exist_ok=True) + if decode_images: + obj._image_mode = DataBufferImageMode.MEMMAP + else: + if lerobot_dataset_info.get("video", False): + obj._image_mode = DataBufferImageMode.VIDEO + obj._videos_dir = kwargs["storage_dir"] / DataBuffer.VIDEOS_DIR + else: + obj._image_mode = DataBufferImageMode.PNG + obj._images_dir = kwargs["storage_dir"] / DataBuffer.IMAGES_DIR # If we have accessed an existing cached data buffer, just return the object as is. if buffer_already_on_disk: @@ -656,7 +711,7 @@ def from_huggingface_hub( obj.add_episodes(data_dict) # note this must happen after setting _is_video_dataset. # Symlink vidoes if needed. - if obj._image_storage_mode == "video" and not obj._videos_dir.exists(): + if obj._image_mode == DataBufferImageMode.VIDEO and not obj._videos_dir.exists(): os.symlink(videos_path.absolute(), obj._videos_dir) return obj From 375c5f984cf3c5bb75111fdad5e5f6c25e824dcd Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Wed, 25 Sep 2024 15:01:18 +0100 Subject: [PATCH 22/29] from_huggingface_hub adds episodes one at at time --- lerobot/common/datasets/online_buffer.py | 249 +++++++++++++++-------- 1 file changed, 159 insertions(+), 90 deletions(-) diff --git a/lerobot/common/datasets/online_buffer.py b/lerobot/common/datasets/online_buffer.py index 851d9a851..d28715181 100644 --- a/lerobot/common/datasets/online_buffer.py +++ b/lerobot/common/datasets/online_buffer.py @@ -28,12 +28,11 @@ import torch import torchvision from PIL import Image -from tqdm import tqdm, trange +from tqdm import tqdm from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION, DATA_DIR, LeRobotDataset -from lerobot.common.datasets.utils import load_hf_dataset, load_info, load_videos +from lerobot.common.datasets.utils import load_episode_data_index, load_hf_dataset, load_info, load_videos from lerobot.common.datasets.video_utils import VideoFrame, decode_video_frames_torchvision -from lerobot.common.utils.utils import inside_slurm # TODO(alexander-soare): Move somewhere more appropriate once the DataBuffer class permeates more of the coe # base. @@ -74,49 +73,91 @@ class DataBufferImageMode(Enum): class DataBuffer(torch.utils.data.Dataset): """Data buffer and training data item getter. + This class is intended for the following main use cases: + - Downloading and using datasets from the Hugging Face Hub. + - Creating new datasets and potentially uploading them to the Hugging Face Hub. + - Use as an online experience replay buffer during training. + Data is considered to come in the form of "episodes" (an instance of a robot performing a task). Episodes are made up of "frames", which are chronologically ordered and contain timestamp aligned data, potentially including environment observations, and robot actions. NOTE: for the time being, we require all data modalities to be timestamp aligned. This constraint may be relaxed in the future. - Non-image data is stored in a mapping from data keys to arrays with shape (total_number_of_frames, - *data_dim). 
The compulsory data keys are: + Data is stored in a mapping from data keys to arrays with shape (total_number_of_frames, *data_dim). The + compulsory data keys are: - "index": A sequential integer index per frame. - "episode_index": A sequential integer index per episode. - "frame_index": A sequential integer index per frame within an episode (it resets for each episode). - "timestamp": The relative timestamp of the frame within the episode in units of seconds. The choice. of reference time is not important. + Other data keys may be included, and when using LeRobot policies, one should note LeRobot's implicit + naming conventions for data keys: + - "action": A 1D vector for the robot command. + - "observation.state": A 1D vector for the proprioceptive state. + - "observation.environment_state": A 1D vector encoding the environment state (for example: the + poses of objects in the environment). + - Any key starting with "observation.image" is considered to be a camera image. Note that this doesn't + necessarily get stored in this data structure. Instead, we might have video files or PNG files to + decode (more on that below). Image data may be stored in at least one of the following formats: video files, PNG image files, as raw - pixel arrays (in which case they also have a data key to array mapping as discussed above). Some words on - each of these storage methods: - - Video files: These are the most compact in terms of storage space. Decoding image frames from the - video files takes non-trivial time and one would be advised to keep an eye out for data loading - bottlenecks during policy training. - - Numpy memmaps (more on memmaps below): This is the least compact in terms of storage space (high - resolution camera feeds can rapidly overwhelm your storage capacity). On the other hand, random - access of image frames at least an order of magnitude faster than with video decoding. - - TODO(now) - - Under the hood, the data is stored in `numpy.memmap`s, one for each data key. Loosely speaking, memory - mapping (https://en.wikipedia.org/wiki/Memory-mapped_file) allows us to treat a portion of disk space as - virtual memory. This allows us to work with more data than can fit in our physical memory, while treating - the data as if it were just standard numpy arrays. The associated files are saved in the file system under - what we call the "storage directory", and the Python object that allows us to treat them as virtual memory - is called the "buffer". The storage directory also contains a "meta.json" file which includes information - about the date types and shapes for each memmap. This allows us to load the data without having to specify - the data specifications at runtime. - - A size limit must be specified when creating a new buffer (to know how much space to reserve on disk). The_ - `add_episodes` method can be used to insert data in the form of integral episodes (starting from frame 0 - and with the frames ordered). Data is inserted in a circular fashion, inserting after the most recently - added frame, and wrapping around to the start of the buffer when necessary (in which case older episode - frames are overwritten). + pixel arrays (in which case they also have a data key to array mapping as discussed above). For each of + these methods, we care about storage capacity and random access data loading speed. + - Video files: These are the most compact in terms of storage space. 
Video decoding can be faster + than PNG image file decoding when we need to access multiple sequential frames at a time, + otherwise it is generally slower. + - PNG files: The are the less compact than videos in terms of storage space. When randomly accessing + individual image frames from the dataset, decoding PNG files can be significantly faster than + decoding video files. + - Numpy memmaps (more on memmaps below): These are by far the least compact in terms of storage space. + They are also the fastest option in terms of data loading, but under certain settings video files + and PNG images can get surprisingly close when using a PyTorch DataLoader with multiple workers. + + About `numpy.memmap`s: Loosely speaking, + memory mapping (https://en.wikipedia.org/wiki/Memory-mapped_file) allows us to treat a portion of disk + space as virtual memory. This allows us to work with more data than can fit in our physical memory, while + treating the data as if it were just standard numpy arrays. The associated files are saved in the file + system under what we call the "storage directory", and the Python object that allows us to treat them as + virtual memory is called the "buffer". The storage directory also contains a "metadata.json" file which + includes information about the date types and shapes for each memmap. This allows us to load the data + without having to specify the data specifications at runtime. + + A size limit must be specified when creating a new buffer (to know how much space to reserve on disk for + the `memmaps`). The `add_episodes` method can be used to insert data in the form of integral episodes + (starting from frame 0 and with the frames ordered). For the purposes of a limited-capacity experience + replay buffer, data is inserted in a circular fashion, inserting after the most recently added frame, and + wrapping around to the start of the buffer when necessary (in which case older episode frames are + overwritten). This class is also a PyTorch Dataset and can be used as such in a dataloader for a training loop. The item getter returns either a single frame, or a slice of a single episode if `delta_timestamps` is set. It also converts the numpy data to torch tensors, and handles converting images to channel-first, float32 normalized to the range [0, 1]. + + Example usage: you want to use a dataset from the Hugging Face hub for training a policy: + + ```python + dataset = DataBuffer.from_huggingface_hub("lerobot/pusht") + dataloader = torch.utils.data.DataLoader(dataset) + ``` + + Example usage: you want to create a new dataset and upload it to the hub + + COMING SOON + + Example usage: you need an experience replay buffer for an online RL policy like TD-MPC + + ```python + dataset = DataBuffer(storage_dir="online_buffer", buffer_capacity=10000) + iter_dataloader = iter(torch.utils.data.DataLoader(dataset)) + + # training loop + while True: + data_dict = do_online_rollouts() + dataset.add_episodes(data_dict) + batch = next(iter_dataloader) + # Policy forward, backward, gradient step. + ``` """ # Special key for a (1,) array storing a pointer to the next index to fill from when adding data. @@ -151,15 +192,17 @@ def __init__( ): """ Args: - storage_dir: Where to keep the numpy memmap files and metadata files. One memmap file will be - stored for each data key. Note that if the storage directory already exist, the memmap files - are opened in read-write mode. If the storage directory does not exist, it will be lazily - created with the first call to `add_episodes`. 
- buffer_capacity: How many frames should be stored in the buffer as a maximum. Be aware of your - system's available disk space when choosing this. Note that if `storage_dir` references an - existing storage directory, `buffer_capacity` should not be provided, as it is already - included in "meta.json". - image_mode: TODO(now) + storage_dir: Where to keep the numpy memmap files, metadata file, video files, and/or image files. + One memmap file will be stored for each data key. Note that if the storage directory already + exists, the memmap files are opened in read-write mode. If the storage directory does not + exist, it will be lazily created with the first call to `add_episodes`. + buffer_capacity: How many frames should be stored in the buffer as a maximum. Note that if + `storage_dir` references an existing storage directory, `buffer_capacity` should not be + provided, as it is already included in "metadata.json". + image_mode: The image storage mode used for the item getter. See notes above on the various + options. If not provided: when creating a new dataset it defaults to "video" mode, and when + loading an existing dataset it defaults to the first mode available from "video", "png", + "memmap", in that order. image_transform: Transforms to apply in the item getter to all image data (any data whose key starts with "observation.image"). delta_timestamps: TODO(alexander-soare): Document this somewhere when @@ -362,7 +405,7 @@ def set_delta_timestamps(self, delta_timestamps: dict[str, list[float]] | None): else: self._delta_timestamps = None - def add_episodes(self, data: dict[str, np.ndarray]): + def add_episodes(self, data: dict[str, np.ndarray], no_flush: bool = False): """Add data to the buffer. `data` should have the same key, array mapping as the buffer. It should contain at least one episode. @@ -383,7 +426,8 @@ def add_episodes(self, data: dict[str, np.ndarray]): episode_data = {k: data[k][where_episode] for k in data} self._add_episode(episode_data) - self.flush() + if not no_flush: + self.flush() def _add_episode(self, data: dict[str, np.ndarray]): """Add data for a single episode to the buffer.""" @@ -567,13 +611,23 @@ def __getitem__(self, idx: int) -> dict[str, torch.Tensor]: else: item[k] = img_or_imgs[0] # in this case we don't want a temporal dimension elif self._image_mode == DataBufferImageMode.PNG: - imgs = [] for k in self.camera_keys: - for rel_path in item[k]: - imgs.append( - torchvision.transforms.ToTensor()(Image.open(self.storage_dir / rel_path.decode())) + this_key_has_delta_timestamps = ( + self.delta_timestamps is not None and k in self.delta_timestamps + ) + if this_key_has_delta_timestamps: + imgs = [] + for rel_path in item[k]: + imgs.append( + torchvision.transforms.ToTensor()( + Image.open(self.storage_dir / rel_path.decode()) + ) + ) + item[k] = torch.stack(imgs) + else: + item[k] = torchvision.transforms.ToTensor()( + Image.open(self.storage_dir / item[k].decode()) ) - item[k] = torch.stack(imgs) else: # Convert to PyTorch format: channel-last, float32, normalize to range [0, 1]. for k in self.camera_keys: @@ -619,13 +673,14 @@ def from_huggingface_hub( raise ValueError(f"`{k}` should not be provided as it is inferred from the hub dataset.") hf_dataset = load_hf_dataset(repo_id, version=CODEBASE_VERSION, root=root, split="train") + episode_data_index = load_episode_data_index(repo_id, version=CODEBASE_VERSION, root=root) hf_dataset.set_transform(lambda x: x) # there is a default transform in place. 
reset it # Get some metadata necessary for processing videos. lerobot_dataset_info = load_info(repo_id, version=CODEBASE_VERSION, root=root) # if not lerobot_dataset_info.get("video", False) and decode_images: # raise ValueError(f"The provided dataset is not a video dataset but you have {decode_images=}") if lerobot_dataset_info.get("video", False): - videos_path = load_videos(repo_id, version=CODEBASE_VERSION, root=root) + lerobot_dataset_videos_path = load_videos(repo_id, version=CODEBASE_VERSION, root=root) kwargs.setdefault( "storage_dir", @@ -658,61 +713,75 @@ def from_huggingface_hub( # If we have accessed an existing cached data buffer, just return the object as is. if buffer_already_on_disk: - return obj + if len(obj) == len(hf_dataset): + # All episodes are already in the storage directory. + return obj + else: + # Only some episodes are in the storage directory. Reset the data pointer and start from + # scratch. + obj._data[DataBuffer.NEXT_INDEX_KEY][0] = 0 # Populate the buffer with the data from the dataset. - data_dict = {} - for k, feature in hf_dataset.features.items(): - if isinstance(feature, datasets.features.Image): - if decode_images: - data_dict[k] = np.stack([np.array(pil_img) for pil_img in hf_dataset[k]]) - else: - relative_paths = [] - for i in trange(len(hf_dataset)): - item = hf_dataset[i] - pil_img = item[k] - frame_index = item[DataBuffer.FRAME_INDEX_KEY] - episode_index = item[DataBuffer.EPISODE_INDEX_KEY] - relative_path = f"images/{k}_episode_{episode_index:06d}_frame_{frame_index:06d}.png" - relative_paths.append(np.array(relative_path, dtype=f"S{MAX_VIDEO_PATH_LENGTH}")) - absolute_path = obj.storage_dir / relative_path - os.makedirs(absolute_path.parent, exist_ok=True) - pil_img.save(absolute_path) - data_dict[k] = np.stack(relative_paths, dtype=f"S{MAX_VIDEO_PATH_LENGTH}") - elif isinstance(feature, VideoFrame): - if decode_images: - # Decode all videos into images. 
- episode_indices = np.array(hf_dataset["episode_index"]) - timestamps = np.array(hf_dataset["timestamp"]) - all_imgs = [] - for episode_index in tqdm( - np.unique(episode_indices), desc=f"Decoding videos for {k}", disable=inside_slurm() - ): - episode_data_indices = np.where(episode_indices == episode_index)[0] - episode_timestamps = timestamps[episode_indices == episode_index] + episode_indices = np.unique(hf_dataset["episode_index"]) + for episode_index in tqdm(episode_indices, desc="Siphoning episodes into local data structure"): + data_dict = {} + hf_episode_data = hf_dataset[ + episode_data_index["from"][episode_index].item() : episode_data_index["to"][ + episode_index + ].item() + ] + + for k, feature in hf_dataset.features.items(): + if isinstance(feature, datasets.features.Image): + if decode_images: + data_dict[k] = np.stack([np.array(pil_img) for pil_img in hf_episode_data[k]]) + else: + relative_paths = [] + for i in range(len(hf_episode_data[k])): + pil_img = hf_episode_data[k][i] + frame_index = hf_episode_data[DataBuffer.FRAME_INDEX_KEY][i] + episode_index = hf_episode_data[DataBuffer.EPISODE_INDEX_KEY][i] + relative_path = ( + f"images/{k}_episode_{episode_index:06d}_frame_{frame_index:06d}.png" + ) + relative_paths.append(np.array(relative_path, dtype=f"S{MAX_VIDEO_PATH_LENGTH}")) + absolute_path = obj.storage_dir / relative_path + os.makedirs(absolute_path.parent, exist_ok=True) + pil_img.save(absolute_path) + data_dict[k] = np.stack(relative_paths, dtype=f"S{MAX_VIDEO_PATH_LENGTH}") + elif isinstance(feature, VideoFrame): + if decode_images: + # Decode all videos into images. + all_imgs = [] episode_imgs = decode_video_frames_torchvision( - videos_path.parent / hf_dataset[k][episode_data_indices[0]]["path"], - episode_timestamps, + lerobot_dataset_videos_path.parent / hf_episode_data[k][0]["path"], + np.array(hf_episode_data["timestamp"]), 1 / lerobot_dataset_info["fps"] - 1e-4, to_pytorch_format=False, ) all_imgs.extend(episode_imgs) - data_dict[k] = np.stack(all_imgs) + data_dict[k] = np.stack(all_imgs) + else: + data_dict[k] = np.stack( + [ + np.array(dct["path"], dtype=f"S{MAX_VIDEO_PATH_LENGTH}") + for dct in hf_episode_data[k] + ] + ) + elif isinstance(feature, datasets.features.Sequence): + data_dict[k] = np.array(hf_episode_data[k], dtype=np.dtype(feature.feature.dtype)) + elif isinstance(feature, datasets.features.Value): + data_dict[k] = np.array(hf_episode_data[k], dtype=np.dtype(feature.dtype)) else: - data_dict[k] = np.stack( - [np.array(dct["path"], dtype=f"S{MAX_VIDEO_PATH_LENGTH}") for dct in hf_dataset[k]] - ) - elif isinstance(feature, datasets.features.Sequence): - data_dict[k] = np.array(hf_dataset[k], dtype=np.dtype(feature.feature.dtype)) - elif isinstance(feature, datasets.features.Value): - data_dict[k] = np.array(hf_dataset[k], dtype=np.dtype(feature.dtype)) - else: - raise NotImplementedError(f"feature type {type(feature)} is not handled.") + raise NotImplementedError(f"feature type {type(feature)} is not handled.") + + obj.add_episodes(data_dict, no_flush=True) + + obj.flush() - obj.add_episodes(data_dict) # note this must happen after setting _is_video_dataset. # Symlink vidoes if needed. 
if obj._image_mode == DataBufferImageMode.VIDEO and not obj._videos_dir.exists(): - os.symlink(videos_path.absolute(), obj._videos_dir) + os.symlink(lerobot_dataset_videos_path.absolute(), obj._videos_dir) return obj @staticmethod From e88ab2d247e13e28e076b1618830e10cd778677c Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Wed, 25 Sep 2024 15:02:39 +0100 Subject: [PATCH 23/29] nicer list comp --- lerobot/common/datasets/online_buffer.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lerobot/common/datasets/online_buffer.py b/lerobot/common/datasets/online_buffer.py index d28715181..62079527b 100644 --- a/lerobot/common/datasets/online_buffer.py +++ b/lerobot/common/datasets/online_buffer.py @@ -616,14 +616,14 @@ def __getitem__(self, idx: int) -> dict[str, torch.Tensor]: self.delta_timestamps is not None and k in self.delta_timestamps ) if this_key_has_delta_timestamps: - imgs = [] - for rel_path in item[k]: - imgs.append( + item[k] = torch.stack( + [ torchvision.transforms.ToTensor()( Image.open(self.storage_dir / rel_path.decode()) ) - ) - item[k] = torch.stack(imgs) + for rel_path in item[k] + ] + ) else: item[k] = torchvision.transforms.ToTensor()( Image.open(self.storage_dir / item[k].decode()) From d13104d99763fb690b004e021464d2814783dd5a Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Thu, 26 Sep 2024 08:50:11 +0100 Subject: [PATCH 24/29] backup wip --- lerobot/common/datasets/online_buffer.py | 39 +++++++++++------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/lerobot/common/datasets/online_buffer.py b/lerobot/common/datasets/online_buffer.py index 62079527b..67cfb7251 100644 --- a/lerobot/common/datasets/online_buffer.py +++ b/lerobot/common/datasets/online_buffer.py @@ -106,21 +106,20 @@ class DataBuffer(torch.utils.data.Dataset): - Video files: These are the most compact in terms of storage space. Video decoding can be faster than PNG image file decoding when we need to access multiple sequential frames at a time, otherwise it is generally slower. - - PNG files: The are the less compact than videos in terms of storage space. When randomly accessing + - PNG files: The are less compact than videos in terms of storage space. When randomly accessing individual image frames from the dataset, decoding PNG files can be significantly faster than decoding video files. - Numpy memmaps (more on memmaps below): These are by far the least compact in terms of storage space. They are also the fastest option in terms of data loading, but under certain settings video files and PNG images can get surprisingly close when using a PyTorch DataLoader with multiple workers. - About `numpy.memmap`s: Loosely speaking, - memory mapping (https://en.wikipedia.org/wiki/Memory-mapped_file) allows us to treat a portion of disk - space as virtual memory. This allows us to work with more data than can fit in our physical memory, while - treating the data as if it were just standard numpy arrays. The associated files are saved in the file - system under what we call the "storage directory", and the Python object that allows us to treat them as - virtual memory is called the "buffer". The storage directory also contains a "metadata.json" file which - includes information about the date types and shapes for each memmap. This allows us to load the data - without having to specify the data specifications at runtime. 
+ About `numpy.memmap`s: Loosely speaking, memory mapping (https://en.wikipedia.org/wiki/Memory-mapped_file) + allows us to treat a portion of disk space as virtual memory. This allows us to work with more data than + can fit in our physical memory, while treating the data as if it were just standard numpy arrays. The + associated files are saved in the file system under what we call the "storage directory", and the Python + object that allows us to treat them as virtual memory is called the "buffer". The storage directory also + contains a "metadata.json" file which includes information about the date types and shapes for each + memmap. This allows us to load the data without having to specify the data specifications at runtime. A size limit must be specified when creating a new buffer (to know how much space to reserve on disk for the `memmaps`). The `add_episodes` method can be used to insert data in the form of integral episodes @@ -615,19 +614,13 @@ def __getitem__(self, idx: int) -> dict[str, torch.Tensor]: this_key_has_delta_timestamps = ( self.delta_timestamps is not None and k in self.delta_timestamps ) + to_tensor = torchvision.transforms.ToTensor() if this_key_has_delta_timestamps: item[k] = torch.stack( - [ - torchvision.transforms.ToTensor()( - Image.open(self.storage_dir / rel_path.decode()) - ) - for rel_path in item[k] - ] + [to_tensor(Image.open(self.storage_dir / rel_path.decode())) for rel_path in item[k]] ) else: - item[k] = torchvision.transforms.ToTensor()( - Image.open(self.storage_dir / item[k].decode()) - ) + item[k] = to_tensor(Image.open(self.storage_dir / item[k].decode())) else: # Convert to PyTorch format: channel-last, float32, normalize to range [0, 1]. for k in self.camera_keys: @@ -665,8 +658,6 @@ def from_huggingface_hub( `/tmp/{repo_id}_{hf_dataset._fingerprint}_{decoded?}` unless provided explicitly. Returns: The resulting DataBuffer object. - - # TODO(now): Add one episode at a time. """ for k in ["data_spec", "buffer_capacity"]: if k in kwargs: @@ -709,6 +700,9 @@ def from_huggingface_hub( obj._videos_dir = kwargs["storage_dir"] / DataBuffer.VIDEOS_DIR else: obj._image_mode = DataBufferImageMode.PNG + for k, feature in hf_dataset.features.items(): + if isinstance(feature, datasets.features.Image): + hf_dataset = hf_dataset.cast_column(k, datasets.features.Image(decode=False)) obj._images_dir = kwargs["storage_dir"] / DataBuffer.IMAGES_DIR # If we have accessed an existing cached data buffer, just return the object as is. 
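The memmap-plus-metadata pattern described in the docstring above is easiest to see in isolation, so here is a minimal standalone sketch. It is illustrative only and not this class's actual code: the directory path, the spec layout, and the data key are invented for the example.

```python
import json
from pathlib import Path

import numpy as np

storage_dir = Path("/tmp/memmap_sketch")  # hypothetical storage directory
storage_dir.mkdir(parents=True, exist_ok=True)

# Reserve disk space up front for 1000 frames of a 6-dim state vector, and record the
# dtype/shape in a metadata file so the data can be reopened without re-specifying them.
spec = {"observation.state": {"dtype": "float32", "shape": [1000, 6]}}
with open(storage_dir / "metadata.json", "w") as f:
    json.dump(spec, f)

arr = np.memmap(storage_dir / "observation.state", dtype="float32", mode="w+", shape=(1000, 6))
arr[:10] = np.random.rand(10, 6)  # in-place writes, just like a regular ndarray
arr.flush()  # persist the written frames to disk

# Later (perhaps in another process): recover the spec and reopen in read-write mode.
with open(storage_dir / "metadata.json") as f:
    spec = json.load(f)["observation.state"]
arr = np.memmap(
    storage_dir / "observation.state",
    dtype=spec["dtype"],
    mode="r+",
    shape=tuple(spec["shape"]),
)
```

The metadata file carries exactly what the docstring says it should: the dtype and shape needed to map the file back into memory, and nothing more.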
@@ -738,7 +732,6 @@ def from_huggingface_hub( else: relative_paths = [] for i in range(len(hf_episode_data[k])): - pil_img = hf_episode_data[k][i] frame_index = hf_episode_data[DataBuffer.FRAME_INDEX_KEY][i] episode_index = hf_episode_data[DataBuffer.EPISODE_INDEX_KEY][i] relative_path = ( @@ -747,7 +740,9 @@ def from_huggingface_hub( relative_paths.append(np.array(relative_path, dtype=f"S{MAX_VIDEO_PATH_LENGTH}")) absolute_path = obj.storage_dir / relative_path os.makedirs(absolute_path.parent, exist_ok=True) - pil_img.save(absolute_path) + img_bytes = hf_episode_data[k][i]["bytes"] + with open(absolute_path, "wb") as f: + f.write(img_bytes) data_dict[k] = np.stack(relative_paths, dtype=f"S{MAX_VIDEO_PATH_LENGTH}") elif isinstance(feature, VideoFrame): if decode_images: From 7dd7cb37fc49e39717cfa595937cff529b099d2f Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Thu, 26 Sep 2024 15:16:30 +0100 Subject: [PATCH 25/29] backup wip --- lerobot/common/datasets/online_buffer.py | 469 +++++++++++++---------- lerobot/scripts/train.py | 6 +- tests/test_online_buffer.py | 30 +- 3 files changed, 291 insertions(+), 214 deletions(-) diff --git a/lerobot/common/datasets/online_buffer.py b/lerobot/common/datasets/online_buffer.py index 67cfb7251..e726d40ad 100644 --- a/lerobot/common/datasets/online_buffer.py +++ b/lerobot/common/datasets/online_buffer.py @@ -13,7 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""A data buffer for efficient data management during offline and online training.""" +"""A dataset class for efficient data management during offline and online training.""" import json import os @@ -26,7 +26,6 @@ import einops import numpy as np import torch -import torchvision from PIL import Image from tqdm import tqdm @@ -34,10 +33,6 @@ from lerobot.common.datasets.utils import load_episode_data_index, load_hf_dataset, load_info, load_videos from lerobot.common.datasets.video_utils import VideoFrame, decode_video_frames_torchvision -# TODO(alexander-soare): Move somewhere more appropriate once the DataBuffer class permeates more of the coe -# base. -MAX_VIDEO_PATH_LENGTH = 100 - def _make_memmap_safe(**kwargs) -> np.memmap: """Make a numpy memmap with checks on available disk space first. @@ -64,27 +59,40 @@ class TimestampOutsideToleranceError(Exception): pass -class DataBufferImageMode(Enum): +class LeRobotDatasetV2ImageMode(Enum): MEMMAP = "memmap" PNG = "png" VIDEO = "video" -class DataBuffer(torch.utils.data.Dataset): - """Data buffer and training data item getter. - +class LeRobotDatasetV2(torch.utils.data.Dataset): + """ This class is intended for the following main use cases: - Downloading and using datasets from the Hugging Face Hub. - Creating new datasets and potentially uploading them to the Hugging Face Hub. - Use as an online experience replay buffer during training. - Data is considered to come in the form of "episodes" (an instance of a robot performing a task). Episodes - are made up of "frames", which are chronologically ordered and contain timestamp aligned data, potentially - including environment observations, and robot actions. NOTE: for the time being, we require all data - modalities to be timestamp aligned. This constraint may be relaxed in the future. + Conceptually, data is considered to come in the form of "episodes" (an instance of a robot performing a + task). 
Episodes are made up of "frames", which are chronologically ordered and contain timestamp aligned + data, potentially including environment observations, and robot actions. NOTE: for the time being, we + require all data modalities to be timestamp aligned. This constraint may be relaxed in the future. + + Physically, data is stored on disk in one or more of three forms: video files, PNG files, `numpy.memmap`s. + Image data can be stored in any of these forms, while non-image data is only stored in `numpy.memmaps`. + + A quick note on `numpy.memmap`s: memory mapping (https://en.wikipedia.org/wiki/Memory-mapped_file) + allows us to treat a portion of disk space as virtual memory. This allows us to work with more data than + can fit in our physical memory, while treating the data as if it were just standard numpy arrays. + + All of the on-disk storage goes in one directory on your file system. In here you will find: + - One memmap file for each non-image data key, and perhaps one for each image data key. + - Perhaps a directory called "videos". + - Perhaps a directory called "images" + - A "metadata.json" file which contains human-readable information about the dataset and some + configuration parameters. - Data is stored in a mapping from data keys to arrays with shape (total_number_of_frames, *data_dim). The - compulsory data keys are: + `numpy.memmap` data is stored in a mapping from data keys to arrays with shape (total_number_of_frames, + *data_dim). The compulsory data keys are: - "index": A sequential integer index per frame. - "episode_index": A sequential integer index per episode. - "frame_index": A sequential integer index per frame within an episode (it resets for each episode). @@ -100,46 +108,51 @@ class DataBuffer(torch.utils.data.Dataset): necessarily get stored in this data structure. Instead, we might have video files or PNG files to decode (more on that below). - Image data may be stored in at least one of the following formats: video files, PNG image files, as raw - pixel arrays (in which case they also have a data key to array mapping as discussed above). For each of - these methods, we care about storage capacity and random access data loading speed. + Image data may be stored in at least one of the following formats: video files, PNG image files, and as + raw pixel arrays (in which case they also have a data key to array mapping as discussed above). For each + of these methods, we care about storage capacity, random access data loading speed, and in some use cases, + how fast we can add new data. - Video files: These are the most compact in terms of storage space. Video decoding can be faster than PNG image file decoding when we need to access multiple sequential frames at a time, - otherwise it is generally slower. - - PNG files: The are less compact than videos in terms of storage space. When randomly accessing + otherwise it is generally slower. Video encoding time, when adding new data, is also something to + consider. + - PNG files: These are less compact than videos in terms of storage space. When randomly accessing individual image frames from the dataset, decoding PNG files can be significantly faster than - decoding video files. + decoding video files. PNG encoding time, when adding new data, is also something to consider. - Numpy memmaps (more on memmaps below): These are by far the least compact in terms of storage space. 
-          They are also the fastest option in terms of data loading, but under certain settings video files
-          and PNG images can get surprisingly close when using a PyTorch DataLoader with multiple workers.
+          They are also the fastest option for adding new episodes to the dataset and data loading. But
+          under certain settings, video files and PNG images can get surprisingly close when using a PyTorch
+          DataLoader with multiple workers.

    This class is also a PyTorch Dataset and can be used as such in a dataloader for a training loop. The
    item getter returns either a single frame, or a slice of a single episode if `delta_timestamps` is set.
    It also converts the numpy data to torch tensors, and handles converting images to channel-first, float32
    normalized to the range [0, 1].

-    Example usage: you want to use a dataset from the Hugging Face hub for training a policy:
+    USE CASES AND EXAMPLES:
+
+    The most common use case will be to download a dataset from Hugging Face Hub for training a policy.
+    The following example downloads the PushT dataset from the hub (or uses the locally cached dataset if it
+    exists) and passes it to a PyTorch dataloader.

    ```python
-    dataset = DataBuffer.from_huggingface_hub("lerobot/pusht")
+    dataset = LeRobotDatasetV2.from_huggingface_hub("lerobot/pusht")
    dataloader = torch.utils.data.DataLoader(dataset)
    ```

+    TODO(now): Finish the docs below
+
+    The next most common use case is to create a dataset from scratch. TODO(now): What to do about the need
+    for up-front buffer capacity.
+
+    A size limit must be specified when creating a new dataset (to know how much space to reserve on disk for
+    the `numpy.memmap`s). The `add_episodes` method can be used to insert data in the form of integral
+    episodes (starting from frame 0 and with the frames ordered). For the purposes of a limited-capacity
+    experience replay buffer, data is inserted in a circular fashion, inserting after the most recently added
+    frame, and wrapping around to the start of the buffer when necessary (in which case older episode frames
+    are overwritten).
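The wrap-around insertion just described can be illustrated with a toy sketch. This is not the class's implementation (which operates per data key on `numpy.memmap`s and also maintains an occupancy mask and a stored next-index pointer); the names below are invented for the example.

```python
import numpy as np

class RingBuffer:
    """Toy fixed-capacity frame buffer using the wrap-around insertion described above."""

    def __init__(self, capacity: int):
        self.capacity = capacity
        self.frames = np.full(capacity, -1)  # stand-in for a per-key memmap
        self.next_index = 0  # next slot to fill, like the dataset's "_next_index" pointer

    def add_episode(self, episode: np.ndarray):
        n = len(episode)
        if self.next_index + n > self.capacity:
            # Not enough room at the tail: wrap to the start and overwrite the oldest frames.
            self.next_index = 0
        self.frames[self.next_index : self.next_index + n] = episode
        self.next_index += n

buf = RingBuffer(capacity=10)
buf.add_episode(np.arange(4))  # occupies slots 0-3
buf.add_episode(np.arange(4))  # occupies slots 4-7
buf.add_episode(np.arange(4))  # wraps around, overwriting slots 0-3
```

Note that the oldest frames are the ones sacrificed, which is what makes this usable as a limited-capacity experience replay buffer.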
+ + Example usage: you want to create a new dataset and upload it to the hub COMING SOON @@ -147,7 +160,7 @@ class DataBuffer(torch.utils.data.Dataset): Example usage: you need an experience replay buffer for an online RL policy like TD-MPC ```python - dataset = DataBuffer(storage_dir="online_buffer", buffer_capacity=10000) + dataset = LeRobotDatasetV2(storage_dir="online_buffer", buffer_capacity=10000) iter_dataloader = iter(torch.utils.data.DataLoader(dataset)) # training loop @@ -161,11 +174,11 @@ class DataBuffer(torch.utils.data.Dataset): # Special key for a (1,) array storing a pointer to the next index to fill from when adding data. NEXT_INDEX_KEY = "_next_index" - # Since the data buffer is pre-allocated, this boolean mask is used to indicate which frames have "real" - # data. + # Since the numpy.memmap storage is pre-allocated, this boolean mask is used to indicate which frames have + # "real" data. OCCUPANCY_MASK_KEY = "_occupancy_mask" - # This is not a data key used in the buffer. It is used to indicate that a frame is padding, added by the - # __getitem__ method. + # IS_PAD_POSTFIX is added by the __getitem__ method, and is not stored in the numpy.memmap. It is used to + # indicate that a frame is padding. IS_PAD_POSTFIX = "_is_pad" INDEX_KEY = "index" FRAME_INDEX_KEY = "frame_index" @@ -178,13 +191,15 @@ class DataBuffer(torch.utils.data.Dataset): METADATA_FILE_NAME = "metadata.json" VIDEOS_DIR = "videos" # directory (relative to storage directory), to store videos - IMAGES_DIR = "images" # directory (relative to storage directory), to store images + VIDEO_NAME_FSTRING = "{data_key}_episode_{episode_index:06d}.mp4" + PNGS_DIR = "images" # directory (relative to storage directory), to store images + PNG_NAME_FSTRING = "{data_key}_episode_{episode_index:06d}_frame_{frame_index:06d}.png" def __init__( self, storage_dir: str | Path, buffer_capacity: int | None = None, - image_mode: DataBufferImageMode | None = None, + image_mode: LeRobotDatasetV2ImageMode | None = None, image_transform: Callable[[np.ndarray], np.ndarray] | None = None, delta_timestamps: dict[str, list[float]] | dict[str, np.ndarray] | None = None, fps: float | None = None, @@ -199,9 +214,7 @@ def __init__( `storage_dir` references an existing storage directory, `buffer_capacity` should not be provided, as it is already included in "metadata.json". image_mode: The image storage mode used for the item getter. See notes above on the various - options. If not provided: when creating a new dataset it defaults to "video" mode, and when - loading an existing dataset it defaults to the first mode available from "video", "png", - "memmap", in that order. + options. If not provided it defaults to memmap mode. image_transform: Transforms to apply in the item getter to all image data (any data whose key starts with "observation.image"). delta_timestamps: TODO(alexander-soare): Document this somewhere when @@ -216,7 +229,7 @@ def __init__( # Default assumption is that the image storage mode is memmaps meaning we don't need a video or image # directory. - self._image_mode = DataBufferImageMode.MEMMAP + self._image_mode = LeRobotDatasetV2ImageMode.MEMMAP self._videos_dir: str | None = None self._images_dir: str | None = None @@ -228,7 +241,7 @@ def __init__( "buffer_capacity explicitly. Instead, it will be read from 'meta.json' in the storage " "directory." 
) - data_spec = self._load_data_spec() + data_spec = self._load_metadata() self._make_memmaps(data_spec, mode="r+") self._buffer_capacity = len(self._data[self.INDEX_KEY]) # Set image mode based on what's available in the storage directory and/or the user's selection. @@ -240,14 +253,14 @@ def __init__( f"{possible_image_modes}" ) self._image_mode = image_mode - else: - # If image_mode is not provided, default to VIDEO, followed by PNG, followed by MEMMAP - # (already set above). - if DataBufferImageMode.VIDEO in possible_image_modes: - self._image_mode = DataBufferImageMode.VIDEO - elif DataBufferImageMode.PNG in possible_image_modes: - self._image_mode = DataBufferImageMode.PNG else: + if image_mode is not None and image_mode in [ + LeRobotDatasetV2ImageMode.VIDEO, + LeRobotDatasetV2ImageMode.PNG, + ]: + raise NotImplementedError( + f"Creating a new dataset from scratch with {image_mode=} is not yet supported." + ) if buffer_capacity is None: raise ValueError( "The storage directory does not exist, which means you need to provide a buffer_capacity." @@ -278,7 +291,6 @@ def fps(self) -> float | None: def num_episodes(self) -> int: """Total number of unique episode indices in the dataset.""" if len(self._data) == 0: - # Buffers not created yet. return 0 return len(np.unique(self._data[self.EPISODE_INDEX_KEY][self._data[self.OCCUPANCY_MASK_KEY]])) @@ -289,7 +301,6 @@ def num_samples(self) -> int: TODO(alexander-soare): Rename to num_frames once LeRobotDataset is deprecated. """ if len(self._data) == 0: - # Buffers not created yet. return 0 return np.count_nonzero(self._data[self.OCCUPANCY_MASK_KEY]) @@ -299,28 +310,40 @@ def camera_keys(self) -> list[str]: By convention, this is all the keys starting with "observation.image". """ - return [k for k in self._data if k.startswith(self.IMAGE_KEY_PREFIX)] + if self._image_mode == LeRobotDatasetV2ImageMode.MEMMAP: + return [k for k in self._data if k.startswith(self.IMAGE_KEY_PREFIX)] + elif self._image_mode == LeRobotDatasetV2ImageMode.VIDEO: + # TODO(now): Should this be just gotten from the metadata instead? + return list( + {name.split("_episode", 1)[0] for name in os.listdir(self.storage_dir / self.VIDEOS_DIR)} + ) + elif self._image_mode == LeRobotDatasetV2ImageMode.PNG: + # TODO(now): Should this be just gotten from the metadata instead? 
+ return list( + {name.split("_episode", 1)[0] for name in os.listdir(self.storage_dir / self.PNGS_DIR)} + ) - def _infer_image_modes(self) -> list[DataBufferImageMode]: + def _infer_image_modes(self) -> list[LeRobotDatasetV2ImageMode]: """Infer which image modes are available according to what is in the storage directory""" image_modes = [] if (self.storage_dir / self.VIDEOS_DIR).exists(): - image_modes.append(DataBufferImageMode.VIDEO) - if (self.storage_dir / self.IMAGES_DIR).exists(): - image_modes.append(DataBufferImageMode.PNG) - if any(k.startswith("observation.image") for k in self._load_data_spec()): - image_modes.append(DataBufferImageMode.MEMMAP) + image_modes.append(LeRobotDatasetV2ImageMode.VIDEO) + if (self.storage_dir / self.PNGS_DIR).exists(): + image_modes.append(LeRobotDatasetV2ImageMode.PNG) + if any(k.startswith("observation.image") for k in self._load_metadata()): + image_modes.append(LeRobotDatasetV2ImageMode.MEMMAP) return image_modes - def _save_data_spec(self, data_spec: dict[str, dict]): + def _save_metadata(self, data_spec: dict[str, dict]): """Save the data type and shape specifications to the storage directory.""" + # TODO(now): save f-strings meta_file = self._storage_dir / self.METADATA_FILE_NAME with open(meta_file, "w") as f: for k in data_spec: data_spec[k]["dtype"] = str(data_spec[k]["dtype"]) json.dump(data_spec, f, indent=2) - def _load_data_spec(self) -> dict[str, dict]: + def _load_metadata(self) -> dict[str, dict]: """Load the data type and shape specifications from the storage directory.""" meta_file = self._storage_dir / self.METADATA_FILE_NAME with open(meta_file) as f: @@ -353,14 +376,14 @@ def _make_storage_dir(self, episode_data: dict[str, np.ndarray]): data_spec[k] = {"dtype": v.dtype, "shape": (self._buffer_capacity, *v.shape[1:])} self._make_memmaps(data_spec, "w+") - self._save_data_spec(data_spec) + self._save_metadata(data_spec) except Exception as e: # Attempt to clean up by removing the empty storage directory. shutil.rmtree(self._storage_dir) raise e def _make_memmaps(self, data_spec: dict[str, dict], mode: str): - """Create the memmap buffer objects. + """Create the memmap objects. The underlying storage directory may or may not already exist. Provide the file opening `mode` accordingly. @@ -405,20 +428,20 @@ def set_delta_timestamps(self, delta_timestamps: dict[str, list[float]] | None): self._delta_timestamps = None def add_episodes(self, data: dict[str, np.ndarray], no_flush: bool = False): - """Add data to the buffer. + """Add episodes to the dataset. - `data` should have the same key, array mapping as the buffer. It should contain at least one episode. - The episodes should have frame indices that start from 0 and step up in increments of 1. + `data` should be a mapping of data key to data in array format. It should contain at least one + episode. The episodes should have frame indices that start from 0 and step up in increments of 1. - Episodes are added to the buffer one-by-one. If an episode has more frames then are available till the - end of the buffer, the pointer is reset to the start of the buffer and the episode is inserted there, - overwriting existing episode frames. + Episodes are added to the dataset one-by-one. If an episode has more frames then are available till + the end of the numpy.memmap buffer, the pointer is reset to the start of the buffer and the episode is + inserted there, overwriting existing episode frames. 
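To make the expected input concrete, here is a hedged sketch of assembling one episode for `add_episodes`, as described above. The compulsory keys follow the class docstring; the shapes, fps, feature dimensions, and storage directory are assumptions for the example.

```python
import numpy as np

from lerobot.common.datasets.online_buffer import LeRobotDatasetV2

fps = 10       # assumed frame rate
n_frames = 50  # illustrative episode length

episode = {
    "index": np.arange(n_frames),                    # global frame index
    "episode_index": np.zeros(n_frames, dtype=int),  # all frames belong to episode 0 (fresh dataset)
    "frame_index": np.arange(n_frames),              # starts from 0 and steps up by 1 per frame
    "timestamp": np.arange(n_frames) / fps,          # seconds, relative to the episode start
    "observation.state": np.zeros((n_frames, 6), dtype=np.float32),
    "action": np.zeros((n_frames, 6), dtype=np.float32),
}

# Assumes a new dataset whose buffer capacity is at least n_frames.
dataset = LeRobotDatasetV2("my_dataset_storage", buffer_capacity=1000, fps=fps)
dataset.add_episodes(episode)
```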
# TODO(now): Maybe I only want this if the user intends to use the dataset as a replay buffer When episode frames are overwritten by a new episode, by default, any remaining frames belonging to the existing episode are left in place (meaning not all episodes will be guaranteed to start from - their frame 0). + their frame 0). # TODO(now): Maybe I only want this if the user intends to use the dataset as a replay buffer - After adding the episodes to the buffer, the buffer is flushed to disk. + After adding the episodes to the dataset, the numpy.memmap buffer is flushed to disk. """ for episode_index in np.unique(data[self.EPISODE_INDEX_KEY]): where_episode = np.where(data[self.EPISODE_INDEX_KEY] == episode_index)[0] @@ -429,7 +452,7 @@ def add_episodes(self, data: dict[str, np.ndarray], no_flush: bool = False): self.flush() def _add_episode(self, data: dict[str, np.ndarray]): - """Add data for a single episode to the buffer.""" + """Add a single episode to the dataset.""" if len(self._data) == 0: self._make_storage_dir(data) @@ -450,23 +473,16 @@ def _add_episode(self, data: dict[str, np.ndarray]): raise ValueError( "Expected frame indices to start from 0 and step up in increments of 1 per frame." ) - # Special checks on image keys. + # Special check on image keys. for k in data: if not k.startswith(self.IMAGE_KEY_PREFIX): continue - if self._image_mode in [DataBufferImageMode.PNG, DataBufferImageMode.VIDEO]: - if data[k].dtype != np.dtype(f"S{MAX_VIDEO_PATH_LENGTH}"): - raise ValueError( - f"Any data key starting with '{self.IMAGE_KEY_PREFIX}' is assumed to be an image, " - "and it should be string data (with a relative path to the video/png to be loaded)." - ) - else: - _, h, w, c = data[k].shape - if data[k].dtype is not np.dtype("uint8") or c >= min(h, w): - raise ValueError( - f"Any data key starting with '{self.IMAGE_KEY_PREFIX}' is assumed to be an image, " - "and should be of type np.uint8, with channel-last format." - ) + _, h, w, c = data[k].shape + if data[k].dtype is not np.dtype("uint8") or c >= min(h, w): + raise ValueError( + f"Any data key starting with '{self.IMAGE_KEY_PREFIX}' is assumed to be an image, " + "and should be of type np.uint8, with channel-last format." + ) # Figure out where we need to start filling data next, and make sure we continue data and episode # indices. @@ -524,11 +540,27 @@ def __len__(self): return self.num_samples def get_data_by_key(self, key: str) -> np.ndarray: - """Returns all data for a given data key (where the data is valid).""" + """Returns all data for a given data key in the numpy.memmap data.""" + if key.startswith(self.IMAGE_KEY_PREFIX) and self._image_mode in [ + LeRobotDatasetV2ImageMode.VIDEO, + LeRobotDatasetV2ImageMode.PNG, + ]: + raise ValueError( + f"Can't get all data for image keys when {self._image_mode=}. This would require decoding " + "the whole dataset!" + ) return self._data[key][self._data[self.OCCUPANCY_MASK_KEY]] + @staticmethod + def _numpy_img_to_tensor(img: np.ndarray) -> torch.Tensor: + """ + Converts img from numpy uint8, in range [0, 255], channel-first to torch float32, in range [0, 1], + channel-last. + """ + return torch.from_numpy(einops.rearrange(img.astype(np.float32) / 255.0, "... h w c -> ... c h w")) + def __getitem__(self, idx: int) -> dict[str, torch.Tensor]: - """Gets an item or slice from the buffer and returns it in PyTorch format. + """Gets an item or slice from the dataset and returns it in PyTorch format. 
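A short usage sketch of the `delta_timestamps` slicing may help here (hedged: the exact semantics are still marked as TODO elsewhere in this file, and this follows the pattern used in the tests; `fps`, the camera key, and the storage directory are assumptions).

```python
from lerobot.common.datasets.online_buffer import LeRobotDatasetV2

fps = 10  # assumed frame rate
dataset = LeRobotDatasetV2("my_dataset_storage", fps=fps)  # assumes this storage directory exists

# Request the previous, current, and next camera frame around each queried index.
dataset.set_delta_timestamps({"observation.image": [-1 / fps, 0.0, 1 / fps]})

item = dataset[10]
item["observation.image"]         # stacked frames: shape (3, c, h, w), float32 in [0, 1]
item["observation.image_is_pad"]  # bool tensor flagging queries that fell outside the episode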
Images (any data key starting with "observation.image") get converted from numpy uint8, in range [0, 255], channel-first to torch float32, in range [0, 1], channel-last. @@ -539,10 +571,30 @@ def __getitem__(self, idx: int) -> dict[str, torch.Tensor]: if idx >= len(self) or idx < -len(self): raise IndexError - item = {k: v[idx] for k, v in self._data.items() if not k.startswith("_")} + # Grab the relevant frame from the memmap data. At this point, this may be incomplete for one of two + # reasons: TODO(now) + # 1. We are using delta_timestamps, so some of these will act as key frames for a temporal chunk that + # wish to extract. + # 2. The image mode is either "video" or "png", in which case we will need to decode the image frames + # further down. + item = {} + for k in self._data: + if k.startswith("_"): + # Reserved key. + continue + if ( + self._image_mode + in [ + LeRobotDatasetV2ImageMode.VIDEO, + LeRobotDatasetV2ImageMode.PNG, + ] + and k in self.camera_keys + ): + continue + item[k] = self._data[k][idx] - # If we are using delta_timestamps take slices of the data. if self.delta_timestamps is not None: + # We are using delta timestamps, so extract the appropriate temporal chunks of data. episode_index = item[self.EPISODE_INDEX_KEY] current_ts = item[self.TIMESTAMP_KEY] episode_data_indices = np.where( @@ -553,10 +605,6 @@ def __getitem__(self, idx: int) -> dict[str, torch.Tensor]: )[0] episode_timestamps = self._data[self.TIMESTAMP_KEY][episode_data_indices] - if self._image_mode == DataBufferImageMode.VIDEO: - # We'll use this for `decode_video_frames_torchvision`. - video_delta_timestamps = {} - for data_key in self.delta_timestamps: # Get timestamps used as query to retrieve data of previous/future frames. query_ts = current_ts + self.delta_timestamps[data_key] @@ -578,55 +626,81 @@ def __getitem__(self, idx: int) -> dict[str, torch.Tensor]: f"{self.tolerance_s=}) inside the episode range." ) - if self._image_mode == DataBufferImageMode.VIDEO and data_key.startswith( - self.IMAGE_KEY_PREFIX + # For image data, either decode video frames, load PNG files, or slice the memmap. For all + # other data, slice the memmap. 
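+                # (For example, with fps=10 and delta_timestamps={"observation.state": [-0.1, 0.0]}, the
+                # lookup below gathers two frames for that key: the frame 0.1 s before the current one,
+                # and the current frame itself.)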
+ episode_data_indices_to_retrieve = episode_data_indices[argmin_] + if ( + data_key.startswith(self.IMAGE_KEY_PREFIX) + and self._image_mode == LeRobotDatasetV2ImageMode.VIDEO ): - video_delta_timestamps[data_key] = self._data[self.TIMESTAMP_KEY][ - episode_data_indices[argmin_] - ] + item[data_key] = decode_video_frames_torchvision( + video_path=self.storage_dir + / self.VIDEOS_DIR + / self.VIDEO_NAME_FSTRING.format(data_key=data_key, episode_index=episode_index), + timestamps=self._data[self.TIMESTAMP_KEY][episode_data_indices_to_retrieve], + tolerance_s=self.tolerance_s, + backend="pyav", + to_pytorch_format=True, + ) + elif ( + data_key.startswith(self.IMAGE_KEY_PREFIX) + and self._image_mode == LeRobotDatasetV2ImageMode.PNG + ): + imgs = [] + for frame_index in self._data[self.FRAME_INDEX_KEY][episode_data_indices_to_retrieve]: + img_path = ( + self.storage_dir + / self.PNGS_DIR + / self.PNG_NAME_FSTRING.format( + data_key=data_key, + episode_index=episode_index, + frame_index=frame_index, + ) + ) + imgs.append(self._numpy_img_to_tensor(np.array(Image.open(img_path)))) + item[data_key] = torch.stack(imgs) else: - item[data_key] = self._data[data_key][episode_data_indices[argmin_]] + item[data_key] = self._data[data_key][episode_data_indices_to_retrieve] item[f"{data_key}{self.IS_PAD_POSTFIX}"] = is_pad - if self._image_mode == DataBufferImageMode.VIDEO: - # Decode the required video frames. + # For video and png images, we may still need to load frames if we have not already done it in the + # delta timestamps logic. For memmap images, we just need to finalize by converting the images to + # pytorch format. + if self._image_mode == LeRobotDatasetV2ImageMode.VIDEO: for k in self.camera_keys: - this_key_has_delta_timestamps = ( - self.delta_timestamps is not None and k in self.delta_timestamps - ) - requested_timestamps = ( - video_delta_timestamps[k] if this_key_has_delta_timestamps else [item[self.TIMESTAMP_KEY]] - ) - img_or_imgs = decode_video_frames_torchvision( - video_path=self.storage_dir / item[k].decode(), - timestamps=requested_timestamps, - tolerance_s=self.tolerance_s or 1e-8, # 1e-8 to account for no fps setting + if k in item: + # We have already populated it in the delta_timestamps logic. + assert self.delta_timestamps is not None and k in self.delta_timestamps # sanity check + continue + item[k] = decode_video_frames_torchvision( + video_path=self.storage_dir + / self.VIDEOS_DIR + / self.VIDEO_NAME_FSTRING.format(data_key=k, episode_index=item[self.EPISODE_INDEX_KEY]), + timestamps=[item[self.TIMESTAMP_KEY]], + tolerance_s=1e-8, # we expect the timestamp to match exactly backend="pyav", to_pytorch_format=True, - ) - if this_key_has_delta_timestamps: - item[k] = img_or_imgs - else: - item[k] = img_or_imgs[0] # in this case we don't want a temporal dimension - elif self._image_mode == DataBufferImageMode.PNG: + )[0] + elif self._image_mode == LeRobotDatasetV2ImageMode.PNG: for k in self.camera_keys: - this_key_has_delta_timestamps = ( - self.delta_timestamps is not None and k in self.delta_timestamps - ) - to_tensor = torchvision.transforms.ToTensor() - if this_key_has_delta_timestamps: - item[k] = torch.stack( - [to_tensor(Image.open(self.storage_dir / rel_path.decode())) for rel_path in item[k]] + if k in item: + # We have already populated it in the delta_timestamps logic. 
+ assert self.delta_timestamps is not None and k in self.delta_timestamps # sanity check + continue + img_path = ( + self.storage_dir + / self.PNGS_DIR + / self.PNG_NAME_FSTRING.format( + data_key=k, + episode_index=item[self.EPISODE_INDEX_KEY], + frame_index=item[self.FRAME_INDEX_KEY], ) - else: - item[k] = to_tensor(Image.open(self.storage_dir / item[k].decode())) - else: - # Convert to PyTorch format: channel-last, float32, normalize to range [0, 1]. - for k in self.camera_keys: - item[k] = einops.rearrange( - torch.from_numpy(item[k].astype(np.float32) / 255.0), "... h w c -> ... c h w" ) + item[k] = self._numpy_img_to_tensor(np.array(Image.open(img_path))) + elif self._image_mode == LeRobotDatasetV2ImageMode.MEMMAP: + for k in self.camera_keys: + item[k] = self._numpy_img_to_tensor(item[k]) if self.image_transform is not None: for k in self.camera_keys: @@ -641,10 +715,10 @@ def from_huggingface_hub( decode_images: bool = False, root: Path | None = DATA_DIR, **kwargs, - ) -> "DataBuffer": - """Create a DataBuffer from a data repository on the Hugging Face Hub. + ) -> "LeRobotDatasetV2": + """Create a LeRobotDatasetV2 from a data repository on the Hugging Face Hub. - NOTE: If the DataBuffer already exists in /tmp, this function will reuse it rather than creating a new + NOTE: If the LeRobotDatasetV2 already exists in /tmp, this function will reuse it rather than creating a new one. Args: @@ -657,13 +731,16 @@ def from_huggingface_hub( `buffer_capacity` arguments which are inferred automatically. `storage_dir` is set to `/tmp/{repo_id}_{hf_dataset._fingerprint}_{decoded?}` unless provided explicitly. Returns: - The resulting DataBuffer object. + The resulting LeRobotDatasetV2 object. """ for k in ["data_spec", "buffer_capacity"]: if k in kwargs: raise ValueError(f"`{k}` should not be provided as it is inferred from the hub dataset.") hf_dataset = load_hf_dataset(repo_id, version=CODEBASE_VERSION, root=root, split="train") + hf_dataset_camera_keys = [ + k for k in hf_dataset.features if k.startswith(LeRobotDatasetV2.IMAGE_KEY_PREFIX) + ] episode_data_index = load_episode_data_index(repo_id, version=CODEBASE_VERSION, root=root) hf_dataset.set_transform(lambda x: x) # there is a default transform in place. reset it # Get some metadata necessary for processing videos. @@ -675,47 +752,47 @@ def from_huggingface_hub( kwargs.setdefault( "storage_dir", - DataBuffer._default_storage_dir_from_huggingface_hub( + LeRobotDatasetV2._default_storage_dir_from_huggingface_hub( repo_id, hf_dataset._fingerprint, decode_images ), ) - buffer_already_on_disk = False - if Path(kwargs["storage_dir"] / DataBuffer.METADATA_FILE_NAME).exists(): - buffer_already_on_disk = True + dataset_already_on_disk = False + if Path(kwargs["storage_dir"] / LeRobotDatasetV2.METADATA_FILE_NAME).exists(): + dataset_already_on_disk = True - # Create the DataBuffer object. Reminder: if the storage directory already exists, this reads it. + # Create the LeRobotDatasetV2 object. Reminder: if the storage directory already exists, this reads it. # Otherwise, the storage directory is not created until later when we make the first call to # `add_episodes`. obj = cls( **kwargs, - buffer_capacity=len(hf_dataset) if not buffer_already_on_disk else None, + buffer_capacity=len(hf_dataset) if not dataset_already_on_disk else None, ) - if decode_images: - obj._image_mode = DataBufferImageMode.MEMMAP + # Set the image mode based on the provided HF dataset and whether we are decoding images. 
+ if len(hf_dataset_camera_keys) == 0 or decode_images: + obj._image_mode = LeRobotDatasetV2ImageMode.MEMMAP + elif lerobot_dataset_info.get("video", False): + obj._image_mode = LeRobotDatasetV2ImageMode.VIDEO + obj._videos_dir = kwargs["storage_dir"] / LeRobotDatasetV2.VIDEOS_DIR else: - if lerobot_dataset_info.get("video", False): - obj._image_mode = DataBufferImageMode.VIDEO - obj._videos_dir = kwargs["storage_dir"] / DataBuffer.VIDEOS_DIR - else: - obj._image_mode = DataBufferImageMode.PNG - for k, feature in hf_dataset.features.items(): - if isinstance(feature, datasets.features.Image): - hf_dataset = hf_dataset.cast_column(k, datasets.features.Image(decode=False)) - obj._images_dir = kwargs["storage_dir"] / DataBuffer.IMAGES_DIR - - # If we have accessed an existing cached data buffer, just return the object as is. - if buffer_already_on_disk: + obj._image_mode = LeRobotDatasetV2ImageMode.PNG + for k, feature in hf_dataset.features.items(): + if isinstance(feature, datasets.features.Image): + hf_dataset = hf_dataset.cast_column(k, datasets.features.Image(decode=False)) + obj._images_dir = kwargs["storage_dir"] / LeRobotDatasetV2.PNGS_DIR + + # If we have accessed an existing cached dataset, just return the object as is. + if dataset_already_on_disk: if len(obj) == len(hf_dataset): # All episodes are already in the storage directory. return obj else: # Only some episodes are in the storage directory. Reset the data pointer and start from # scratch. - obj._data[DataBuffer.NEXT_INDEX_KEY][0] = 0 + obj._data[LeRobotDatasetV2.NEXT_INDEX_KEY][0] = 0 - # Populate the buffer with the data from the dataset. + # Siphon the data from the Hugging Face dataset into our dataset object. episode_indices = np.unique(hf_dataset["episode_index"]) for episode_index in tqdm(episode_indices, desc="Siphoning episodes into local data structure"): data_dict = {} @@ -730,38 +807,40 @@ def from_huggingface_hub( if decode_images: data_dict[k] = np.stack([np.array(pil_img) for pil_img in hf_episode_data[k]]) else: - relative_paths = [] for i in range(len(hf_episode_data[k])): - frame_index = hf_episode_data[DataBuffer.FRAME_INDEX_KEY][i] - episode_index = hf_episode_data[DataBuffer.EPISODE_INDEX_KEY][i] - relative_path = ( - f"images/{k}_episode_{episode_index:06d}_frame_{frame_index:06d}.png" + frame_index = hf_episode_data[LeRobotDatasetV2.FRAME_INDEX_KEY][i] + img_path = ( + obj.storage_dir + / LeRobotDatasetV2.PNGS_DIR + / LeRobotDatasetV2.PNG_NAME_FSTRING.format( + data_key=k, episode_index=episode_index, frame_index=frame_index + ) ) - relative_paths.append(np.array(relative_path, dtype=f"S{MAX_VIDEO_PATH_LENGTH}")) - absolute_path = obj.storage_dir / relative_path - os.makedirs(absolute_path.parent, exist_ok=True) + os.makedirs(img_path.parent, exist_ok=True) img_bytes = hf_episode_data[k][i]["bytes"] - with open(absolute_path, "wb") as f: + with open(img_path, "wb") as f: f.write(img_bytes) - data_dict[k] = np.stack(relative_paths, dtype=f"S{MAX_VIDEO_PATH_LENGTH}") elif isinstance(feature, VideoFrame): if decode_images: - # Decode all videos into images. - all_imgs = [] - episode_imgs = decode_video_frames_torchvision( + # Decode all frames of the episode. 
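+                        # (A near-zero tolerance is used below because each stored timestamp is expected
+                        # to match a frame of the episode's video file exactly.)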
+ data_dict[k] = decode_video_frames_torchvision( lerobot_dataset_videos_path.parent / hf_episode_data[k][0]["path"], - np.array(hf_episode_data["timestamp"]), - 1 / lerobot_dataset_info["fps"] - 1e-4, + timestamps=np.array(hf_episode_data["timestamp"]), + tolerance_s=1e-8, to_pytorch_format=False, ) - all_imgs.extend(episode_imgs) - data_dict[k] = np.stack(all_imgs) else: - data_dict[k] = np.stack( - [ - np.array(dct["path"], dtype=f"S{MAX_VIDEO_PATH_LENGTH}") - for dct in hf_episode_data[k] - ] + # Copy video to storage directory. + new_video_path = ( + obj.storage_dir + / LeRobotDatasetV2.VIDEOS_DIR + / LeRobotDatasetV2.VIDEO_NAME_FSTRING.format( + data_key=k, episode_index=episode_index + ) + ) + os.makedirs(new_video_path.parent, exist_ok=True) + os.symlink( + (lerobot_dataset_videos_path / new_video_path.name).absolute(), new_video_path ) elif isinstance(feature, datasets.features.Sequence): data_dict[k] = np.array(hf_episode_data[k], dtype=np.dtype(feature.feature.dtype)) @@ -770,13 +849,11 @@ def from_huggingface_hub( else: raise NotImplementedError(f"feature type {type(feature)} is not handled.") + # no_flush=True makes it a lot faster. We will flush once after the loop. obj.add_episodes(data_dict, no_flush=True) obj.flush() - # Symlink vidoes if needed. - if obj._image_mode == DataBufferImageMode.VIDEO and not obj._videos_dir.exists(): - os.symlink(lerobot_dataset_videos_path.absolute(), obj._videos_dir) return obj @staticmethod @@ -791,7 +868,7 @@ def _default_storage_dir_from_huggingface_hub(repo_id: str, fingerprint: str, de def compute_sampler_weights( offline_dataset: LeRobotDataset, offline_drop_n_last_frames: int = 0, - online_dataset: DataBuffer | None = None, + online_dataset: LeRobotDatasetV2 | None = None, online_sampling_ratio: float | None = None, online_drop_n_last_frames: int = 0, ) -> torch.Tensor: @@ -800,7 +877,7 @@ def compute_sampler_weights( Args: offline_dataset: The LeRobotDataset used for offline pre-training. online_drop_n_last_frames: Number of frames to drop from the end of each offline dataset episode. - online_dataset: The DataBuffer used in online training. + online_dataset: The LeRobotDatasetV2 used in online training. online_sampling_ratio: The proportion of data that should be sampled from the online dataset. If an online dataset is provided, this value must also be provided. online_drop_n_first_frames: See `offline_drop_n_last_frames`. 
This is the same, but for the online @@ -848,7 +925,7 @@ def compute_sampler_weights( if online_dataset is not None and len(online_dataset) > 0: online_data_mask_indices = [] - episode_indices = online_dataset.get_data_by_key(DataBuffer.EPISODE_INDEX_KEY) + episode_indices = online_dataset.get_data_by_key(LeRobotDatasetV2.EPISODE_INDEX_KEY) for episode_idx in np.unique(episode_indices): where_episode = np.where(episode_indices == episode_idx) start_index = where_episode[0][0] diff --git a/lerobot/scripts/train.py b/lerobot/scripts/train.py index 964d547a6..0a7ca0f95 100644 --- a/lerobot/scripts/train.py +++ b/lerobot/scripts/train.py @@ -32,7 +32,7 @@ from lerobot.common.datasets.factory import make_dataset, resolve_delta_timestamps from lerobot.common.datasets.lerobot_dataset import MultiLeRobotDataset -from lerobot.common.datasets.online_buffer import DataBuffer, compute_sampler_weights +from lerobot.common.datasets.online_buffer import LeRobotDatasetV2, compute_sampler_weights from lerobot.common.datasets.sampler import EpisodeAwareSampler from lerobot.common.datasets.utils import cycle from lerobot.common.envs.factory import make_env @@ -412,7 +412,7 @@ def evaluate_and_checkpoint_if_needed(step, is_online): "depending on the amount of data (but it only needs to happen once, and data loading will be " "fast!)" ) - offline_dataset_for_dataloader = DataBuffer.from_huggingface_hub( + offline_dataset_for_dataloader = LeRobotDatasetV2.from_huggingface_hub( offline_dataset.repo_id, fps=offline_dataset.fps, delta_timestamps=offline_dataset.delta_timestamps, @@ -489,7 +489,7 @@ def evaluate_and_checkpoint_if_needed(step, is_online): "was made. This is because the online buffer is updated on disk during training, independently " "of our explicit checkpointing mechanisms." 
) - online_dataset = DataBuffer( + online_dataset = LeRobotDatasetV2( online_buffer_path, buffer_capacity=cfg.training.online_buffer_capacity, fps=online_env.unwrapped.metadata["render_fps"], diff --git a/tests/test_online_buffer.py b/tests/test_online_buffer.py index 612a2eee1..8d6f5f93e 100644 --- a/tests/test_online_buffer.py +++ b/tests/test_online_buffer.py @@ -24,7 +24,7 @@ from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION, DATA_DIR, LeRobotDataset from lerobot.common.datasets.online_buffer import ( - DataBuffer, + LeRobotDatasetV2, TimestampOutsideToleranceError, compute_sampler_weights, ) @@ -41,8 +41,8 @@ def make_new_buffer( storage_dir: str, storage_dir_exists: bool = False, delta_timestamps: dict[str, list[float]] | None = None -) -> DataBuffer: - buffer = DataBuffer( +) -> LeRobotDatasetV2: + buffer = LeRobotDatasetV2( storage_dir, buffer_capacity=buffer_capacity if not storage_dir_exists else None, fps=None if delta_timestamps is None else fps, @@ -54,10 +54,10 @@ def make_new_buffer( def make_spoof_data_frames(n_episodes: int, n_frames_per_episode: int) -> dict[str, np.ndarray]: new_data = { data_key: np.arange(n_frames_per_episode * n_episodes * np.prod(data_shape)).reshape(-1, *data_shape), - DataBuffer.INDEX_KEY: np.arange(n_frames_per_episode * n_episodes), - DataBuffer.EPISODE_INDEX_KEY: np.repeat(np.arange(n_episodes), n_frames_per_episode), - DataBuffer.FRAME_INDEX_KEY: np.tile(np.arange(n_frames_per_episode), n_episodes), - DataBuffer.TIMESTAMP_KEY: np.tile(np.arange(n_frames_per_episode) / fps, n_episodes), + LeRobotDatasetV2.INDEX_KEY: np.arange(n_frames_per_episode * n_episodes), + LeRobotDatasetV2.EPISODE_INDEX_KEY: np.repeat(np.arange(n_episodes), n_frames_per_episode), + LeRobotDatasetV2.FRAME_INDEX_KEY: np.tile(np.arange(n_frames_per_episode), n_episodes), + LeRobotDatasetV2.TIMESTAMP_KEY: np.tile(np.arange(n_frames_per_episode) / fps, n_episodes), } return new_data @@ -178,7 +178,7 @@ def test_delta_timestamps_within_tolerance(tmp_path: Path): new_data = make_spoof_data_frames(n_episodes=1, n_frames_per_episode=5) buffer.add_episodes(new_data) item = buffer[2] - data, is_pad = item["index"], item[f"index{DataBuffer.IS_PAD_POSTFIX}"] + data, is_pad = item["index"], item[f"index{LeRobotDatasetV2.IS_PAD_POSTFIX}"] assert torch.allclose(data, torch.tensor([0, 2, 3])), "Data does not match expected values" assert not is_pad.any(), "Unexpected padding detected" @@ -228,7 +228,7 @@ def test_delta_timestamps_outside_tolerance_outside_episode_range(tmp_path): def test_camera_keys(tmp_path: Path, dataset_repo_id: str, decode_video: bool): """Check that the camera_keys property returns all relevant keys.""" storage_dir = tmp_path / f"{dataset_repo_id}_{uuid4().hex}_{decode_video}" - buffer = DataBuffer.from_huggingface_hub(dataset_repo_id, decode_video, storage_dir=storage_dir) + buffer = LeRobotDatasetV2.from_huggingface_hub(dataset_repo_id, decode_video, storage_dir=storage_dir) assert set(buffer.camera_keys) == {k for k in buffer._data if k.startswith("observation.image")} @@ -245,7 +245,7 @@ def test_getter_returns_pytorch_format(tmp_path: Path, dataset_repo_id: str, dec """Checks that tensors are returned and that images are torch.float32, in range [0, 1], channel-first.""" # Note: storage_dir specified explicitly in order to make use of pytest's temporary file fixture. 
storage_dir = tmp_path / f"{dataset_repo_id}_{uuid4().hex}_{decode_video}" - buffer = DataBuffer.from_huggingface_hub(dataset_repo_id, decode_video, storage_dir=storage_dir) + buffer = LeRobotDatasetV2.from_huggingface_hub(dataset_repo_id, decode_video, storage_dir=storage_dir) # Just iterate through the start and end of the dataset to make the test faster. for i in np.concatenate([np.arange(0, 10), np.arange(-10, 0)]): item = buffer[i] @@ -272,7 +272,7 @@ def test_getter_images_with_delta_timestamps(tmp_path: Path): fps = lerobot_dataset_info["fps"] # Note: storage_dir specified explicitly in order to make use of pytest's temporary file fixture. storage_dir = tmp_path / f"{dataset_repo_id}_{uuid4().hex}" - buffer = DataBuffer.from_huggingface_hub(dataset_repo_id, storage_dir=storage_dir, fps=fps) + buffer = LeRobotDatasetV2.from_huggingface_hub(dataset_repo_id, storage_dir=storage_dir, fps=fps) delta_timestamps = [-1 / fps, 0.0, 1 / fps] buffer.set_delta_timestamps({k: delta_timestamps for k in buffer.camera_keys}) @@ -296,10 +296,10 @@ def test_getter_video_images_with_delta_timestamps(tmp_path: Path): lerobot_dataset_info = load_info(dataset_repo_id, version=CODEBASE_VERSION, root=DATA_DIR) fps = lerobot_dataset_info["fps"] # Note: storage_dir specified explicitly in order to make use of pytest's temporary file fixture. - buffer_video = DataBuffer.from_huggingface_hub( + buffer_video = LeRobotDatasetV2.from_huggingface_hub( dataset_repo_id, False, storage_dir=tmp_path / f"{dataset_repo_id}_{uuid4().hex}_False", fps=fps ) - buffer_decode = DataBuffer.from_huggingface_hub( + buffer_decode = LeRobotDatasetV2.from_huggingface_hub( dataset_repo_id, True, storage_dir=tmp_path / f"{dataset_repo_id}_{uuid4().hex}_True", fps=fps ) @@ -340,12 +340,12 @@ def test_from_huggingface_hub(tmp_path: Path, dataset_repo_id: str, decode_video hf_dataset.set_transform(lambda x: x) # Note: storage_dir specified explicitly in order to make use of pytest's temporary file fixture. # This ensures that the first time this loop is run, the storage directory does not already exist. - storage_dir = tmp_path / DataBuffer._default_storage_dir_from_huggingface_hub( + storage_dir = tmp_path / LeRobotDatasetV2._default_storage_dir_from_huggingface_hub( dataset_repo_id, hf_dataset._fingerprint, decode_video ).relative_to("/tmp") if iteration == 0 and storage_dir.exists(): raise DevTestingError("The storage directory should not exist for the first pass of this test.") - buffer = DataBuffer.from_huggingface_hub( + buffer = LeRobotDatasetV2.from_huggingface_hub( dataset_repo_id, decode_video, storage_dir=storage_dir, From 2e72b6a95cde502b8103cfd6b66c36169f10f7d9 Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Thu, 26 Sep 2024 18:03:42 +0100 Subject: [PATCH 26/29] backup wip --- lerobot/common/datasets/online_buffer.py | 170 ++++++++++++++++------- 1 file changed, 117 insertions(+), 53 deletions(-) diff --git a/lerobot/common/datasets/online_buffer.py b/lerobot/common/datasets/online_buffer.py index e726d40ad..546dca9d6 100644 --- a/lerobot/common/datasets/online_buffer.py +++ b/lerobot/common/datasets/online_buffer.py @@ -18,6 +18,7 @@ import json import os import shutil +from copy import copy from enum import Enum from pathlib import Path from typing import Callable @@ -210,9 +211,11 @@ def __init__( One memmap file will be stored for each data key. Note that if the storage directory already exists, the memmap files are opened in read-write mode. 
If the storage directory does not exist, it will be lazily created with the first call to `add_episodes`. - buffer_capacity: How many frames should be stored in the buffer as a maximum. Note that if - `storage_dir` references an existing storage directory, `buffer_capacity` should not be - provided, as it is already included in "metadata.json". + buffer_capacity: When using this class as an online replay buffer, this specifies how many frames + should be stored in the dataset as a maximum. Calls to `add_episode` beyond the maximum + capacity, will result in the oldest episodes being pushed out of the buffer to make way + for the new ones (first-in-last-out aka FILO). When loading an existing dataset, this + parameter must still be provided if you wish to make use of this FILO mechanism. image_mode: The image storage mode used for the item getter. See notes above on the various options. If not provided it defaults to memmap mode. image_transform: Transforms to apply in the item getter to all image data (any data whose key @@ -235,15 +238,18 @@ def __init__( # If the storage directory and metadata files already exists, load the memmaps. if (self._storage_dir / self.METADATA_FILE_NAME).exists(): - if buffer_capacity is not None: - raise ValueError( - "The storage directory already exists, which means you should not provide a " - "buffer_capacity explicitly. Instead, it will be read from 'meta.json' in the storage " - "directory." - ) - data_spec = self._load_metadata() + data_spec = self._load_metadata()["_data_spec"] self._make_memmaps(data_spec, mode="r+") - self._buffer_capacity = len(self._data[self.INDEX_KEY]) + if buffer_capacity is not None: + if buffer_capacity < len(self): + raise ValueError( + f"The storage directory already exists and contains a dataset with {len(self)} " + f"frames. But you have provided {buffer_capacity=}. It is required that " + "buffer_capacity >= {len(self)}" + ) + if buffer_capacity > len(self._data[self.INDEX_KEY]): + self._extend_memmaps(new_length=buffer_capacity) + self._buffer_capacity = buffer_capacity # Set image mode based on what's available in the storage directory and/or the user's selection. possible_image_modes = self._infer_image_modes() if image_mode is not None: @@ -261,10 +267,6 @@ def __init__( raise NotImplementedError( f"Creating a new dataset from scratch with {image_mode=} is not yet supported." ) - if buffer_capacity is None: - raise ValueError( - "The storage directory does not exist, which means you need to provide a buffer_capacity." - ) self._buffer_capacity = buffer_capacity # Parameters for the item getter. @@ -310,18 +312,8 @@ def camera_keys(self) -> list[str]: By convention, this is all the keys starting with "observation.image". """ - if self._image_mode == LeRobotDatasetV2ImageMode.MEMMAP: - return [k for k in self._data if k.startswith(self.IMAGE_KEY_PREFIX)] - elif self._image_mode == LeRobotDatasetV2ImageMode.VIDEO: - # TODO(now): Should this be just gotten from the metadata instead? - return list( - {name.split("_episode", 1)[0] for name in os.listdir(self.storage_dir / self.VIDEOS_DIR)} - ) - elif self._image_mode == LeRobotDatasetV2ImageMode.PNG: - # TODO(now): Should this be just gotten from the metadata instead? 
-            return list(
-                {name.split("_episode", 1)[0] for name in os.listdir(self.storage_dir / self.PNGS_DIR)}
-            )
+        metadata = self._load_metadata()
+        return [k for k in metadata["data_keys"] if k.startswith(self.IMAGE_KEY_PREFIX)]
 
     def _infer_image_modes(self) -> list[LeRobotDatasetV2ImageMode]:
         """Infer which image modes are available according to what is in the storage directory"""
@@ -334,23 +326,47 @@ def _infer_image_modes(self) -> list[LeRobotDatasetV2ImageMode]:
             image_modes.append(LeRobotDatasetV2ImageMode.MEMMAP)
         return image_modes
 
-    def _save_metadata(self, data_spec: dict[str, dict]):
-        """Save the data type and shape specifications to the storage directory."""
-        # TODO(now): save f-strings
-        meta_file = self._storage_dir / self.METADATA_FILE_NAME
-        with open(meta_file, "w") as f:
-            for k in data_spec:
-                data_spec[k]["dtype"] = str(data_spec[k]["dtype"])
-            json.dump(data_spec, f, indent=2)
+    def _save_metadata(
+        self,
+        data_keys: list[str] | None = None,
+        data_spec: dict | None = None,
+    ):
+        """Save or update the metadata file in the storage directory.
+
+        If the metadata file already exists, it is updated with the provided parameters; otherwise a new
+        metadata file is created. There is no mechanism for clearing a field in the metadata file.
 
-    def _load_metadata(self) -> dict[str, dict]:
+        Args:
+            data_keys: All the data keys of the dataset. Used for human readability.
+            data_spec: `numpy.memmap` data spec used for loading the memmaps.
+        """
+        metadata_file = self._storage_dir / self.METADATA_FILE_NAME
+        if not metadata_file.exists() and any([data_spec is None, data_keys is None]):
+            raise AssertionError("The first time _save_metadata is called, all arguments should be provided.")
+        metadata = self._load_metadata() if metadata_file.exists() else {}
+
+        # Go through each provided argument, updating the metadata.
+        if data_keys is not None:
+            metadata["data_keys"] = copy(data_keys)
+        if data_spec is not None:
+            # Copy the inner dicts as well, so that the dtype serialization below can't mutate the caller's
+            # data_spec.
+            metadata["_data_spec"] = {k: dict(v) for k, v in data_spec.items()}
+
+        # Custom serialization.
+        for k in metadata["_data_spec"]:
+            metadata["_data_spec"][k]["dtype"] = str(metadata["_data_spec"][k]["dtype"])
+
+        with open(metadata_file, "w") as f:
+            json.dump(metadata, f, indent=2)
+
+    def _load_metadata(self) -> dict:
         """Load the data type and shape specifications from the storage directory."""
-        meta_file = self._storage_dir / self.METADATA_FILE_NAME
-        with open(meta_file) as f:
-            data_spec = json.load(f)
-        for k in data_spec:
-            data_spec[k]["dtype"] = np.dtype(data_spec[k]["dtype"])
-        return data_spec
+        with open(self._storage_dir / self.METADATA_FILE_NAME) as f:
+            metadata = json.load(f)
+        # Custom deserialization.
+        for k in metadata["_data_spec"]:
+            metadata["_data_spec"][k]["dtype"] = np.dtype(metadata["_data_spec"][k]["dtype"])
+        return metadata
 
     def _make_storage_dir(self, episode_data: dict[str, np.ndarray]):
         """Create the storage directory based on example episode data from the first `add_episodes` call."""
@@ -360,23 +376,36 @@ def _make_storage_dir(self, episode_data: dict[str, np.ndarray]):
 
         self._storage_dir.mkdir(parents=True, exist_ok=True)
 
+        if self._buffer_capacity is None:
+            # Reserve enough storage for one episode. Storage will be extended as needed.
+            num_frames = len(episode_data[self.INDEX_KEY])
+        else:
+            num_frames = self._buffer_capacity
+
+        # TODO(now): Warn if attempting to store too much.
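+        # (One assumed way to resolve the TODO above: mirror the free-space check in `_make_memmap_safe`,
+        # which compares the requested size against the space reported by `os.statvfs` before creating each
+        # memmap.)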
+
         try:
             # Make the data spec for np.memmap
             data_spec = {
                 self.NEXT_INDEX_KEY: {"dtype": np.dtype("int64"), "shape": (1,)},
-                self.OCCUPANCY_MASK_KEY: {"dtype": np.dtype("?"), "shape": (self._buffer_capacity,)},
-                self.INDEX_KEY: {"dtype": np.dtype("int64"), "shape": (self._buffer_capacity,)},
-                self.FRAME_INDEX_KEY: {"dtype": np.dtype("int64"), "shape": (self._buffer_capacity,)},
-                self.EPISODE_INDEX_KEY: {"dtype": np.dtype("int64"), "shape": (self._buffer_capacity,)},
-                self.TIMESTAMP_KEY: {"dtype": np.dtype("float32"), "shape": (self._buffer_capacity,)},
+                self.OCCUPANCY_MASK_KEY: {"dtype": np.dtype("?"), "shape": (num_frames,)},
+                self.INDEX_KEY: {"dtype": np.dtype("int64"), "shape": (num_frames,)},
+                self.FRAME_INDEX_KEY: {"dtype": np.dtype("int64"), "shape": (num_frames,)},
+                self.EPISODE_INDEX_KEY: {"dtype": np.dtype("int64"), "shape": (num_frames,)},
+                self.TIMESTAMP_KEY: {"dtype": np.dtype("float32"), "shape": (num_frames,)},
             }
             for k, v in episode_data.items():
                 if k in data_spec:
                     continue
-                data_spec[k] = {"dtype": v.dtype, "shape": (self._buffer_capacity, *v.shape[1:])}
+                if k.startswith(self.IMAGE_KEY_PREFIX):
+                    if self._image_mode == LeRobotDatasetV2ImageMode.VIDEO:
+                        (self._storage_dir / self.VIDEOS_DIR).mkdir(exist_ok=True)
+                    elif self._image_mode == LeRobotDatasetV2ImageMode.PNG:
+                        (self._storage_dir / self.PNGS_DIR).mkdir(exist_ok=True)
+                data_spec[k] = {"dtype": v.dtype, "shape": (num_frames, *v.shape[1:])}
 
             self._make_memmaps(data_spec, "w+")
-            self._save_metadata(data_spec)
+            self._save_metadata(data_spec=data_spec, data_keys=list(episode_data))
         except Exception as e:
             # Attempt to clean up by removing the empty storage directory.
             shutil.rmtree(self._storage_dir)
@@ -396,6 +425,20 @@ def _make_memmaps(self, data_spec: dict[str, dict], mode: str):
             shape=tuple(v["shape"]) if v is not None else None,
         )
 
+    def _extend_memmaps(self, new_length: int):
+        """Increase the frame capacity of the memmaps to new_length."""
+        # TODO(now): Warn if attempting to store too much.
+        assert (
+            len(self._data[self.INDEX_KEY]) < new_length
+        ), "new_length must be more than the current capacity of the memmaps"
+        data_spec = self._load_metadata()["_data_spec"]
+        for k in data_spec:
+            if k == self.NEXT_INDEX_KEY:
+                continue
+            data_spec[k]["shape"][0] = new_length
+        self._make_memmaps(data_spec=data_spec, mode="r+")
+        self._save_metadata(data_spec=data_spec)
+
     @property
     def delta_timestamps(self) -> dict[str, np.ndarray] | None:
         return self._delta_timestamps
@@ -493,12 +536,23 @@ def _add_episode(self, data: dict[str, np.ndarray]):
         else:
             last_episode_index = -1
             last_data_index = -1
-        # If there aren't enough slots in the buffer left to accommodate the episode, wrap to the start.
-        if max(0, new_data_length - (self._buffer_capacity - next_index)) > 0:
-            next_index = 0
+
+        # If there aren't enough slots left in the reserved memmap space to accommodate the episode, either
+        # extend the memmaps, or wrap to the start if we have an explicit buffer capacity.
+        if self._buffer_capacity is None:
+            # A buffer capacity was not explicitly provided, so dynamically resize the memmaps (double the
+            # capacity).
+            if max(0, new_data_length - (len(self._data[self.INDEX_KEY]) - next_index)) > 0:
+                self._extend_memmaps(len(self._data[self.INDEX_KEY]) * 2)
+        else:
+            # A buffer capacity was provided. Wrap to the start.
+            if max(0, new_data_length - (self._buffer_capacity - next_index)) > 0:
+                next_index = 0
 
         # Insert the new data starting from next_index.
         
for k in self.data_keys: + # TODO(now): Handle "png" and "video", including deleting images and videos when in online buffer + # mode. slc = slice(next_index, next_index + new_data_length) if k == self.EPISODE_INDEX_KEY: self._data[k][slc] = last_episode_index + 1 @@ -572,7 +626,7 @@ def __getitem__(self, idx: int) -> dict[str, torch.Tensor]: raise IndexError # Grab the relevant frame from the memmap data. At this point, this may be incomplete for one of two - # reasons: TODO(now) + # reasons: # 1. We are using delta_timestamps, so some of these will act as key frames for a temporal chunk that # wish to extract. # 2. The image mode is either "video" or "png", in which case we will need to decode the image frames @@ -793,6 +847,9 @@ def from_huggingface_hub( obj._data[LeRobotDatasetV2.NEXT_INDEX_KEY][0] = 0 # Siphon the data from the Hugging Face dataset into our dataset object. + # If the image mode is either "png" or "video" we will apply a small hack. Rather than passing image + # arrays to `add_episodes` to be encoded, we will reuse the pre-existing files. This means that we'll + # need to manually update metadata["data_keys"] afterwards. episode_indices = np.unique(hf_dataset["episode_index"]) for episode_index in tqdm(episode_indices, desc="Siphoning episodes into local data structure"): data_dict = {} @@ -854,6 +911,13 @@ def from_huggingface_hub( obj.flush() + if obj._image_mode in [LeRobotDatasetV2ImageMode.VIDEO, LeRobotDatasetV2ImageMode.PNG]: + # We didn't pass the image keys into the first call to `add_episodes` so manually update + # metadata["data_keys"]. + data_keys: list[str] = obj._load_metadata()["data_keys"] + data_keys.extend(hf_dataset_camera_keys) + obj._save_metadata(data_keys=data_keys) + return obj @staticmethod From 90ea3dfd4658e9ebef5700427a2ee27bc5934ca0 Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Fri, 27 Sep 2024 14:21:48 +0100 Subject: [PATCH 27/29] backup wip --- lerobot/common/datasets/online_buffer.py | 261 +++++++++++++++-------- tests/test_online_buffer.py | 189 +++++++++------- 2 files changed, 282 insertions(+), 168 deletions(-) diff --git a/lerobot/common/datasets/online_buffer.py b/lerobot/common/datasets/online_buffer.py index 546dca9d6..f80169e1a 100644 --- a/lerobot/common/datasets/online_buffer.py +++ b/lerobot/common/datasets/online_buffer.py @@ -18,6 +18,7 @@ import json import os import shutil +import tempfile from copy import copy from enum import Enum from pathlib import Path @@ -32,7 +33,11 @@ from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION, DATA_DIR, LeRobotDataset from lerobot.common.datasets.utils import load_episode_data_index, load_hf_dataset, load_info, load_videos -from lerobot.common.datasets.video_utils import VideoFrame, decode_video_frames_torchvision +from lerobot.common.datasets.video_utils import ( + VideoFrame, + decode_video_frames_torchvision, + encode_video_frames, +) def _make_memmap_safe(**kwargs) -> np.memmap: @@ -65,6 +70,11 @@ class LeRobotDatasetV2ImageMode(Enum): PNG = "png" VIDEO = "video" + @staticmethod + def needs_decoding(mode: "LeRobotDatasetV2ImageMode") -> bool: + """Return true if the provided image mode means decoding is needed, else False.""" + return mode in [LeRobotDatasetV2ImageMode.PNG, LeRobotDatasetV2ImageMode.VIDEO] + class LeRobotDatasetV2(torch.utils.data.Dataset): """ @@ -200,6 +210,7 @@ def __init__( self, storage_dir: str | Path, buffer_capacity: int | None = None, + use_as_filo_buffer: bool = False, image_mode: LeRobotDatasetV2ImageMode | None = None, 
image_transform: Callable[[np.ndarray], np.ndarray] | None = None, delta_timestamps: dict[str, list[float]] | dict[str, np.ndarray] | None = None, @@ -211,11 +222,16 @@ def __init__( One memmap file will be stored for each data key. Note that if the storage directory already exists, the memmap files are opened in read-write mode. If the storage directory does not exist, it will be lazily created with the first call to `add_episodes`. - buffer_capacity: When using this class as an online replay buffer, this specifies how many frames - should be stored in the dataset as a maximum. Calls to `add_episode` beyond the maximum - capacity, will result in the oldest episodes being pushed out of the buffer to make way - for the new ones (first-in-last-out aka FILO). When loading an existing dataset, this - parameter must still be provided if you wish to make use of this FILO mechanism. + buffer_capacity: Sets the size of the preallocated storage space for the memmaps in terms of + frames. If not provided, the memmap storage space is dynamically expanded as episodes are + added (doubling every time extra space is needed). If you know the number of frames you plan + to add in advance, it is recommended that this parameter be provided for efficiency. If + provided, attempts to add data beyond the capacity will result in an exception (unless + `use_as_filo_buffer` is set). If provided with an existing storage directory, the existing + memmaps will be expanded to the provided capacity if needed. TODO(now): test all this + use_as_filo_buffer: Set this to use the dataset as an online replay buffer. Calls to `add_episode` + beyond the buffer_capacity, will result in the oldest episodes being pushed out of the buffer + to make way for the new ones (first-in-last-out aka FILO). image_mode: The image storage mode used for the item getter. See notes above on the various options. If not provided it defaults to memmap mode. image_transform: Transforms to apply in the item getter to all image data (any data whose key @@ -225,52 +241,52 @@ def __init__( fps: TODO(alexander-soare): Document this somewhere when `load_previous_and_future_frames` is refactored. + + TODO(now): How to deal with corrupt state of the storage directory. """ - # Parameters for the data structure. + if use_as_filo_buffer and buffer_capacity is None: + raise ValueError(f"A buffer_capacity must be provided if {use_as_filo_buffer=}.") + self._storage_dir = Path(storage_dir) self._data: dict[str, np.memmap] = {} - - # Default assumption is that the image storage mode is memmaps meaning we don't need a video or image - # directory. + self._use_as_filo_buffer = use_as_filo_buffer self._image_mode = LeRobotDatasetV2ImageMode.MEMMAP self._videos_dir: str | None = None self._images_dir: str | None = None + # TODO(now): Put fps in metadata. + self._fps = fps # TODO(now): Decide how to treat fps (required or not?) # If the storage directory and metadata files already exists, load the memmaps. if (self._storage_dir / self.METADATA_FILE_NAME).exists(): data_spec = self._load_metadata()["_data_spec"] self._make_memmaps(data_spec, mode="r+") if buffer_capacity is not None: - if buffer_capacity < len(self): + current_capacity = len(self._data[self.INDEX_KEY]) + if buffer_capacity < current_capacity: raise ValueError( - f"The storage directory already exists and contains a dataset with {len(self)} " - f"frames. But you have provided {buffer_capacity=}. 
It is required that " - "buffer_capacity >= {len(self)}" + f"The storage directory already exists and contains a memmaps with capacity: " + f"{current_capacity} frames. But you have provided " + f"{buffer_capacity=}. It is required that buffer_capacity >= {current_capacity}" ) - if buffer_capacity > len(self._data[self.INDEX_KEY]): + if buffer_capacity > current_capacity: self._extend_memmaps(new_length=buffer_capacity) self._buffer_capacity = buffer_capacity + else: + self._buffer_capacity = buffer_capacity # Set image mode based on what's available in the storage directory and/or the user's selection. possible_image_modes = self._infer_image_modes() if image_mode is not None: if image_mode not in possible_image_modes: raise ValueError( - f"Provided {image_mode=} not available with this storage directory. Modes available: " - f"{possible_image_modes}" + f"Provided image_mode {str(image_mode)} not available with this storage directory. " + f"Modes available: {[str(m) for m in possible_image_modes]}" ) self._image_mode = image_mode else: - if image_mode is not None and image_mode in [ - LeRobotDatasetV2ImageMode.VIDEO, - LeRobotDatasetV2ImageMode.PNG, - ]: - raise NotImplementedError( - f"Creating a new dataset from scratch with {image_mode=} is not yet supported." - ) self._buffer_capacity = buffer_capacity + if image_mode is not None: + self._image_mode = image_mode - # Parameters for the item getter. - self._fps = fps self.set_delta_timestamps(delta_timestamps) self.image_transform = image_transform @@ -280,10 +296,23 @@ def storage_dir(self) -> Path: @property def data_keys(self) -> list[str]: - keys = set(self._data) - keys.remove(self.OCCUPANCY_MASK_KEY) - keys.remove(self.NEXT_INDEX_KEY) - return sorted(keys) + """Return the names of all data keys in the dataset. + + Exclude "private" internal keys. + + TODO(now): Test + """ + metadata = self._load_metadata() + return list(metadata["data_keys"]) + + @property + def camera_keys(self) -> list[str]: + """Return the names of all data keys pertaining to camera observations. + + By convention, this is all the keys starting with "observation.image". + """ + metadata = self._load_metadata() + return [k for k in metadata["data_keys"] if k.startswith(self.IMAGE_KEY_PREFIX)] @property def fps(self) -> float | None: @@ -306,14 +335,9 @@ def num_samples(self) -> int: return 0 return np.count_nonzero(self._data[self.OCCUPANCY_MASK_KEY]) - @property - def camera_keys(self) -> list[str]: - """Return the names of all data keys pertaining to camera observations. - - By convention, this is all the keys starting with "observation.image". - """ - metadata = self._load_metadata() - return [k for k in metadata["data_keys"] if k.startswith(self.IMAGE_KEY_PREFIX)] + def get_unique_episode_indices(self) -> np.ndarray: + # TODO(now): test + return np.unique(self._data[self.EPISODE_INDEX_KEY][self._data[self.OCCUPANCY_MASK_KEY]]) def _infer_image_modes(self) -> list[LeRobotDatasetV2ImageMode]: """Infer which image modes are available according to what is in the storage directory""" @@ -370,6 +394,7 @@ def _load_metadata(self) -> dict: def _make_storage_dir(self, episode_data: dict[str, np.ndarray]): """Create the storage directory based on example episode data from the first `add_episodes` call.""" + # TODO(now): Do I really want this? assert not ( self.storage_dir / self.METADATA_FILE_NAME ).exists(), "This method should only be called before the storage directory has been created." 
@@ -397,12 +422,13 @@ def _make_storage_dir(self, episode_data: dict[str, np.ndarray]):
             for k, v in episode_data.items():
                 if k in data_spec:
                     continue
-                if k.startswith(self.IMAGE_KEY_PREFIX):
-                    if self._image_mode == LeRobotDatasetV2ImageMode.VIDEO:
-                        (self._storage_dir / self.VIDEOS_DIR).mkdir(exist_ok=True)
-                    elif self._image_mode == LeRobotDatasetV2ImageMode.PNG:
-                        (self._storage_dir / self.PNGS_DIR).mkdir(exist_ok=True)
-                data_spec[k] = {"dtype": v.dtype, "shape": (num_frames, *v.shape[1:])}
+                is_image_key = k.startswith(self.IMAGE_KEY_PREFIX)
+                if is_image_key and self._image_mode == LeRobotDatasetV2ImageMode.VIDEO:
+                    (self._storage_dir / self.VIDEOS_DIR).mkdir()
+                elif is_image_key and self._image_mode == LeRobotDatasetV2ImageMode.PNG:
+                    (self._storage_dir / self.PNGS_DIR).mkdir()
+                else:
+                    data_spec[k] = {"dtype": v.dtype, "shape": (num_frames, *v.shape[1:])}
 
             self._make_memmaps(data_spec, "w+")
             self._save_metadata(data_spec=data_spec, data_keys=list(episode_data))
@@ -504,8 +530,8 @@ def _add_episode(self, data: dict[str, np.ndarray]):
         new_data_length = len(data[self.data_keys[0]])
         if new_data_length <= 0:
             raise ValueError("The episode has 0 frames")
-        if new_data_length > self._buffer_capacity:
-            raise ValueError("The episode length is larger than the buffer capacity.")
+        if self._buffer_capacity is not None and new_data_length > self._buffer_capacity:
+            raise ValueError("The episode length is larger than the total buffer capacity.")
         if not all(len(data[k]) == new_data_length for k in self.data_keys):
             raise ValueError("All data items should have the same length")
         if not np.all(data[self.EPISODE_INDEX_KEY] == data[self.EPISODE_INDEX_KEY][0]):
@@ -537,34 +563,68 @@ def _add_episode(self, data: dict[str, np.ndarray]):
             last_episode_index = -1
             last_data_index = -1
 
-        # If there aren't enough slots left in the reserved memmap space to accommodate the episode, either
-        # extend the memmaps, or wrap to the start if we have an explicit buffer capacity.
-        if self._buffer_capacity is None:
-            # A buffer capacity was not explicitly provided, so dynamically resize the memmaps (double the
-            # capacity).
-            if max(0, new_data_length - (len(self._data[self.INDEX_KEY]) - next_index)) > 0:
-                self._extend_memmaps(len(self._data[self.INDEX_KEY]) * 2)
-        else:
-            # A buffer capacity was provided. Wrap to the start.
-            if max(0, new_data_length - (self._buffer_capacity - next_index)) > 0:
+        new_episode_index = last_episode_index + 1
+
+        # Handle the situation where there aren't enough empty frames left in the memmaps for the new episode.
+        capacity = len(self._data[self.INDEX_KEY]) if self._buffer_capacity is None else self._buffer_capacity
+        n_excess = max(0, new_data_length - (capacity - next_index))
+        if n_excess > 0:
+            if self._buffer_capacity is None:
+                # A buffer capacity was not explicitly provided, so dynamically resize the memmaps (double the
+                # capacity).
+                self._extend_memmaps(capacity * 2)
+            elif self._use_as_filo_buffer:
+                # A buffer capacity was provided and we wish to use the dataset as a FILO buffer. Wrap to the
+                # start.
                 next_index = 0
+            else:
+                # A buffer capacity was provided, meaning we intend to keep the dataset size fixed.
+                raise RuntimeError(
+                    "Can't add this episode as it would exceed the provided buffer_capacity "
+                    f"({self._buffer_capacity} frames) by {n_excess} frames."
+                )
 
         # Insert the new data starting from next_index.
         for k in self.data_keys:
-            # TODO(now): Handle "png" and "video", including deleting images and videos when in online buffer
-            # mode.
- slc = slice(next_index, next_index + new_data_length) - if k == self.EPISODE_INDEX_KEY: - self._data[k][slc] = last_episode_index + 1 - elif k == self.INDEX_KEY: - self._data[k][slc] = np.arange(last_data_index + 1, last_data_index + 1 + new_data_length) + # Special treatment of image keys depending on the image mode. + if self._image_mode == LeRobotDatasetV2ImageMode.VIDEO and k.startswith(self.IMAGE_KEY_PREFIX): + # Encode all frames of the episode into a video and save to disk. + with tempfile.TemporaryDirectory() as tmpdirname: + for frame_index, img in zip(data[self.FRAME_INDEX_KEY], data[k], strict=True): + Image.fromarray(img).save(Path(tmpdirname) / f"frame_{frame_index:06d}.png") + encode_video_frames( + Path(tmpdirname), + self._storage_dir + / self.VIDEOS_DIR + / self.VIDEO_NAME_FSTRING.format(data_key=k, episode_index=new_episode_index), + self.fps, + ) + elif self._image_mode == LeRobotDatasetV2ImageMode.PNG and k.startswith(self.IMAGE_KEY_PREFIX): + # Encode images to PNG and save to disk. + for frame_index, img in zip(data[self.FRAME_INDEX_KEY], data[k], strict=True): + Image.fromarray(img).save( + self._storage_dir + / self.PNGS_DIR + / self.PNG_NAME_FSTRING.format( + data_key=k, episode_index=new_episode_index, frame_index=frame_index + ) + ) else: - self._data[k][slc] = data[k] - self._data[self.OCCUPANCY_MASK_KEY][slc] = True + # Insertion into the memmap for non-image keys or image keys in memmap mode. + slc = slice(next_index, next_index + new_data_length) + if k == self.EPISODE_INDEX_KEY: + self._data[k][slc] = new_episode_index + elif k == self.INDEX_KEY: + self._data[k][slc] = np.arange(last_data_index + 1, last_data_index + 1 + new_data_length) + else: + self._data[k][slc] = data[k] + self._data[self.OCCUPANCY_MASK_KEY][slc] = True # Update the data pointer. self._data[self.NEXT_INDEX_KEY][0] = next_index + new_data_length + # TODO(now): Remove videos/images if data is overwritten. + def flush(self): """Save the data to disk. @@ -595,16 +655,55 @@ def __len__(self): def get_data_by_key(self, key: str) -> np.ndarray: """Returns all data for a given data key in the numpy.memmap data.""" - if key.startswith(self.IMAGE_KEY_PREFIX) and self._image_mode in [ - LeRobotDatasetV2ImageMode.VIDEO, - LeRobotDatasetV2ImageMode.PNG, - ]: + if key.startswith(self.IMAGE_KEY_PREFIX) and LeRobotDatasetV2ImageMode.needs_decoding( + self._image_mode + ): raise ValueError( f"Can't get all data for image keys when {self._image_mode=}. This would require decoding " "the whole dataset!" ) return self._data[key][self._data[self.OCCUPANCY_MASK_KEY]] + def get_episode(self, episode_index: int) -> dict[str, np.ndarray]: + """Returns a whole episode as a key-array mapping. + + Includes decoding videos or PNG files where necessary. 
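+
+        For example (illustrative, assuming the dataset has an "action" key): `get_episode(0)["action"]`
+        returns the episode's full action sequence as an array with the episode length as its leading
+        dimension.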
+ + TODO(now) + """ + episode_data = {} + data_mask = np.bitwise_and( + self._data[self.EPISODE_INDEX_KEY] == episode_index, self._data[self.OCCUPANCY_MASK_KEY] + ) + for k in self.data_keys: + if self._image_mode == LeRobotDatasetV2ImageMode.VIDEO and k in self.camera_keys: + episode_data[k] = decode_video_frames_torchvision( + video_path=self.storage_dir + / self.VIDEOS_DIR + / self.VIDEO_NAME_FSTRING.format(data_key=k, episode_index=episode_index), + timestamps=self._data[self.TIMESTAMP_KEY][data_mask], + tolerance_s=self.tolerance_s, + backend="pyav", + to_pytorch_format=False, + ) + elif self._image_mode == LeRobotDatasetV2ImageMode.PNG and k in self.camera_keys: + imgs = [] + for frame_index in self._data[self.FRAME_INDEX_KEY][data_mask]: + img_path = ( + self.storage_dir + / self.PNGS_DIR + / self.PNG_NAME_FSTRING.format( + data_key=k, + episode_index=episode_index, + frame_index=frame_index, + ) + ) + imgs.append(np.array(Image.open(img_path))) + episode_data[k] = np.stack(imgs) + else: + episode_data[k] = self._data[k][data_mask] + return episode_data + @staticmethod def _numpy_img_to_tensor(img: np.ndarray) -> torch.Tensor: """ @@ -632,18 +731,8 @@ def __getitem__(self, idx: int) -> dict[str, torch.Tensor]: # 2. The image mode is either "video" or "png", in which case we will need to decode the image frames # further down. item = {} - for k in self._data: - if k.startswith("_"): - # Reserved key. - continue - if ( - self._image_mode - in [ - LeRobotDatasetV2ImageMode.VIDEO, - LeRobotDatasetV2ImageMode.PNG, - ] - and k in self.camera_keys - ): + for k in self.data_keys: + if LeRobotDatasetV2ImageMode.needs_decoding(self._image_mode) and k in self.camera_keys: continue item[k] = self._data[k][idx] @@ -781,13 +870,13 @@ def from_huggingface_hub( provides large speed benefits for data access but only if your storage can handle it (decoded images take up a lot more storage space than videos or png files). root: (will be deprecated) Directory to load the dataset from, instead of the hub. - **kwargs: Other arguments to `self.__init__` except for the `data_spec` and - `buffer_capacity` arguments which are inferred automatically. `storage_dir` is set to + **kwargs: Other arguments to `self.__init__` except for the `data_spec`, `buffer_capacity` and + `fps` arguments which are inferred automatically. `storage_dir` is set to `/tmp/{repo_id}_{hf_dataset._fingerprint}_{decoded?}` unless provided explicitly. Returns: The resulting LeRobotDatasetV2 object. """ - for k in ["data_spec", "buffer_capacity"]: + for k in ["data_spec", "buffer_capacity", "fps"]: if k in kwargs: raise ValueError(f"`{k}` should not be provided as it is inferred from the hub dataset.") @@ -811,6 +900,8 @@ def from_huggingface_hub( ), ) + kwargs["fps"] = lerobot_dataset_info["fps"] + dataset_already_on_disk = False if Path(kwargs["storage_dir"] / LeRobotDatasetV2.METADATA_FILE_NAME).exists(): dataset_already_on_disk = True @@ -911,7 +1002,7 @@ def from_huggingface_hub( obj.flush() - if obj._image_mode in [LeRobotDatasetV2ImageMode.VIDEO, LeRobotDatasetV2ImageMode.PNG]: + if LeRobotDatasetV2ImageMode.needs_decoding(obj._image_mode): # We didn't pass the image keys into the first call to `add_episodes` so manually update # metadata["data_keys"]. 
data_keys: list[str] = obj._load_metadata()["data_keys"] diff --git a/tests/test_online_buffer.py b/tests/test_online_buffer.py index 8d6f5f93e..9cf5fb56f 100644 --- a/tests/test_online_buffer.py +++ b/tests/test_online_buffer.py @@ -12,7 +12,12 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License.d +# limitations under the License. + +# TODO(now): Test that the data_keys are saved correctly in the metadata +# TODO(now): Test adding episodes in video mode and png mode. +# TODO(now): Test ._extend_memmaps + from copy import deepcopy from pathlib import Path from uuid import uuid4 @@ -33,25 +38,27 @@ from tests.utils import DevTestingError # Some constants for DataBuffer tests. +# TODO(now): remove these globals data_key = "data" data_shape = (2, 3) # just some arbitrary > 1D shape -buffer_capacity = 100 -fps = 10 +# buffer_capacity = 100 +# fps = 10 -def make_new_buffer( - storage_dir: str, storage_dir_exists: bool = False, delta_timestamps: dict[str, list[float]] | None = None -) -> LeRobotDatasetV2: - buffer = LeRobotDatasetV2( - storage_dir, - buffer_capacity=buffer_capacity if not storage_dir_exists else None, - fps=None if delta_timestamps is None else fps, - delta_timestamps=delta_timestamps, - ) - return buffer +# def make_new_dataset( +# storage_dir: str, storage_dir_exists: bool = False, delta_timestamps: dict[str, list[float]] | None = None +# ) -> LeRobotDatasetV2: +# buffer = LeRobotDatasetV2( +# storage_dir, +# buffer_capacity=buffer_capacity if not storage_dir_exists else None, +# fps=None if delta_timestamps is None else fps, +# delta_timestamps=delta_timestamps, +# ) +# return buffer def make_spoof_data_frames(n_episodes: int, n_frames_per_episode: int) -> dict[str, np.ndarray]: + fps = 10 new_data = { data_key: np.arange(n_frames_per_episode * n_episodes * np.prod(data_shape)).reshape(-1, *data_shape), LeRobotDatasetV2.INDEX_KEY: np.arange(n_frames_per_episode * n_episodes), @@ -70,60 +77,68 @@ def test_non_mutate(tmp_path: Path): NOTE: If this test fails, it means some of the other tests may be compromised. For example, we can't trust a success case for `test_write_read`. """ - buffer = make_new_buffer(tmp_path / f"buffer_{uuid4().hex}") + dataset = LeRobotDatasetV2(tmp_path / f"dataset_{uuid4().hex}") # Note: choices for args to make_spoof_data_frames are arbitrary. - new_data = make_spoof_data_frames(2, buffer_capacity // 4) + new_data = make_spoof_data_frames(2, 25) new_data_copy = deepcopy(new_data) - buffer.add_episodes(new_data) - buffer.get_data_by_key(data_key)[:] += 1 + dataset.add_episodes(new_data) + dataset.get_data_by_key(data_key)[:] += 1 assert all(np.array_equal(new_data[k], new_data_copy[k]) for k in new_data) def test_index_error_no_data(tmp_path: Path): - buffer = make_new_buffer(tmp_path / f"buffer_{uuid4().hex}") + dataset = LeRobotDatasetV2(tmp_path / f"dataset_{uuid4().hex}") with pytest.raises(IndexError): - buffer[0] + dataset[0] def test_index_error_with_data(tmp_path: Path): - buffer = make_new_buffer(tmp_path / f"buffer_{uuid4().hex}") - n_frames = buffer_capacity // 2 + dataset = LeRobotDatasetV2(tmp_path / f"dataset_{uuid4().hex}") + n_frames = 50 # Note: choices for args to make_spoof_data_frames are arbitrary. 
new_data = make_spoof_data_frames(1, n_frames) - buffer.add_episodes(new_data) + dataset.add_episodes(new_data) with pytest.raises(IndexError): - buffer[n_frames] + dataset[n_frames] with pytest.raises(IndexError): - buffer[-n_frames - 1] + dataset[-n_frames - 1] @pytest.mark.parametrize("do_reload", [False, True]) def test_write_read(tmp_path: Path, do_reload: bool): - """Checks that data can be added to the buffer and read back. + """Checks that data can be added to the dataset and read back. - If do_reload we delete the buffer object and load the buffer back from disk before reading. + If do_reload we delete the dataset object and load the dataset back from disk before reading. """ - storage_dir = tmp_path / f"buffer_{uuid4().hex}" - buffer = make_new_buffer(storage_dir) + storage_dir = tmp_path / f"dataset_{uuid4().hex}" + dataset = LeRobotDatasetV2(storage_dir) # Note: choices for args to make_spoof_data_frames are arbitrary. n_episodes = 2 - n_frames_per_episode = buffer_capacity // 4 + n_frames_per_episode = 25 new_data = make_spoof_data_frames(n_episodes, n_frames_per_episode) - buffer.add_episodes(new_data) + dataset.add_episodes(new_data) if do_reload: - del buffer - buffer = make_new_buffer(storage_dir, storage_dir_exists=True) + del dataset + dataset = LeRobotDatasetV2(storage_dir) - assert len(buffer) == n_frames_per_episode * n_episodes - for i, item in enumerate(buffer): + assert len(dataset) == n_frames_per_episode * n_episodes + for i, item in enumerate(dataset): assert all(isinstance(item[k], torch.Tensor) for k in item) assert np.array_equal(item[data_key].numpy(), new_data[data_key][i]) -def test_fifo(tmp_path: Path): +def test_filo_needs_buffer_capacity(tmp_path): + with pytest.raises(ValueError): + LeRobotDatasetV2(tmp_path / f"dataset_{uuid4().hex}", use_as_filo_buffer=True) + + +def test_filo(tmp_path: Path): """Checks that if data is added beyond the buffer capacity, we discard the oldest data first.""" - buffer = make_new_buffer(tmp_path / f"buffer_{uuid4().hex}") + buffer_capacity = 100 + dataset = LeRobotDatasetV2( + tmp_path / f"dataset_{uuid4().hex}", buffer_capacity=buffer_capacity, use_as_filo_buffer=True + ) # Note: choices for args to make_spoof_data_frames are mostly arbitrary. Of interest is: # - later we need `n_more_episodes` to cause an overflow. # - we need that overflow to happen *within* an episode such that we're testing the behavior whereby the @@ -133,16 +148,16 @@ def test_fifo(tmp_path: Path): if buffer_capacity - n_frames_per_episode * n_episodes >= n_frames_per_episode: raise DevTestingError("Make sure to set this up such that adding another episode causes an overflow.") new_episodes = make_spoof_data_frames(n_episodes, n_frames_per_episode) - buffer.add_episodes(new_episodes) + dataset.add_episodes(new_episodes) # Note `n_more_episodes` is chosen to result in an overflow on the first episode. n_more_episodes = 2 # Make this slightly larger than the prior episodes to test that there is no issue with overwriting the # start of an existing episode. 
n_frames_per_more_episodes = n_frames_per_episode + 1 more_new_episodes = make_spoof_data_frames(n_more_episodes, n_frames_per_more_episodes) - buffer.add_episodes(more_new_episodes) + dataset.add_episodes(more_new_episodes) assert ( - len(buffer) == n_frames_per_episode * n_episodes + len(dataset) == n_frames_per_episode * n_episodes ), "The new episode should have wrapped around to the start" expected_data = {} @@ -151,30 +166,32 @@ def test_fifo(tmp_path: Path): # The extra new episode should overwrite the start of the buffer. expected_data[k][: len(more_new_episodes[k])] = more_new_episodes[k] - for i, item in enumerate(buffer): + for i, item in enumerate(dataset): assert np.array_equal(item[data_key].numpy(), expected_data[data_key][i]) + # TODO(now): Test that videos and pngs are removed as needed. + def test_get_data_by_key(tmp_path: Path): - """Tests that data can be added to a buffer and all data for a specific key can be read back.""" - buffer = make_new_buffer(tmp_path / f"buffer_{uuid4().hex}") + """Tests that data can be added to a dataset and all data for a specific key can be read back.""" + dataset = LeRobotDatasetV2(tmp_path / f"dataset_{uuid4().hex}") # Note: choices for args to make_spoof_data_frames are mostly arbitrary. The only intentional aspect is to # make sure the buffer is not full, in order to check that `get_data_by_key` only returns the part of the # buffer that is occupied. n_episodes = 2 - n_frames_per_episode = buffer_capacity // 4 + n_frames_per_episode = 25 new_episodes = make_spoof_data_frames(n_episodes, n_frames_per_episode) - buffer.add_episodes(new_episodes) + dataset.add_episodes(new_episodes) - data_from_buffer = buffer.get_data_by_key(data_key) - assert np.array_equal(data_from_buffer, new_episodes[data_key]) + data = dataset.get_data_by_key(data_key) + assert np.array_equal(data, new_episodes[data_key]) def test_delta_timestamps_within_tolerance(tmp_path: Path): """Check that getting an item with delta_timestamps within tolerance succeeds.""" - if fps != 10: - raise DevTestingError("This test is designed to use fps == 10.") - buffer = make_new_buffer(tmp_path / f"buffer_{uuid4().hex}", delta_timestamps={"index": [-0.2, 0, 0.1]}) + buffer = LeRobotDatasetV2( + tmp_path / f"dataset_{uuid4().hex}", fps=10, delta_timestamps={"index": [-0.2, 0, 0.1]} + ) new_data = make_spoof_data_frames(n_episodes=1, n_frames_per_episode=5) buffer.add_episodes(new_data) item = buffer[2] @@ -188,9 +205,9 @@ def test_delta_timestamps_outside_tolerance_inside_episode_range(tmp_path: Path) We expect it to fail if and only if the requested timestamps are within the episode range. """ - if fps != 10: - raise DevTestingError("This test is designed to use fps == 10.") - buffer = make_new_buffer(tmp_path / f"buffer_{uuid4().hex}", delta_timestamps={"index": [-0.2, 0, 0.1]}) + buffer = LeRobotDatasetV2( + tmp_path / f"dataset_{uuid4().hex}", fps=10, delta_timestamps={"index": [-0.2, 0, 0.1]} + ) new_data = make_spoof_data_frames(n_episodes=1, n_frames_per_episode=5) # Hack the timestamps to invoke a tolerance error. 
new_data["timestamp"][3] += 0.1 @@ -201,10 +218,8 @@ def test_delta_timestamps_outside_tolerance_inside_episode_range(tmp_path: Path) def test_delta_timestamps_outside_tolerance_outside_episode_range(tmp_path): """Check that copy-padding of timestamps outside of the episode range works.""" - if fps != 10: - raise DevTestingError("This test is designed to use fps == 10.") - buffer = make_new_buffer( - tmp_path / f"buffer_{uuid4().hex}", delta_timestamps={"index": [-0.3, -0.2, 0, 0.2, 0.3]} + buffer = LeRobotDatasetV2( + tmp_path / f"dataset_{uuid4().hex}", fps=10, delta_timestamps={"index": [-0.3, -0.2, 0, 0.2, 0.3]} ) new_data = make_spoof_data_frames(n_episodes=1, n_frames_per_episode=5) buffer.add_episodes(new_data) @@ -217,35 +232,40 @@ def test_delta_timestamps_outside_tolerance_outside_episode_range(tmp_path): @pytest.mark.parametrize( - ("dataset_repo_id", "decode_video"), + ("dataset_repo_id", "decode_images"), ( # choose unitreeh1_two_robot_greeting to have multiple image keys (with minimal video data) ("lerobot/unitreeh1_two_robot_greeting", True), ("lerobot/unitreeh1_two_robot_greeting", False), ("lerobot/pusht", False), + ("lerobot/pusht_image", False), + ("lerobot/pusht_image", True), ), ) -def test_camera_keys(tmp_path: Path, dataset_repo_id: str, decode_video: bool): +def test_camera_keys(tmp_path: Path, dataset_repo_id: str, decode_images: bool): """Check that the camera_keys property returns all relevant keys.""" - storage_dir = tmp_path / f"{dataset_repo_id}_{uuid4().hex}_{decode_video}" - buffer = LeRobotDatasetV2.from_huggingface_hub(dataset_repo_id, decode_video, storage_dir=storage_dir) - assert set(buffer.camera_keys) == {k for k in buffer._data if k.startswith("observation.image")} + storage_dir = tmp_path / f"{dataset_repo_id}_{uuid4().hex}_{decode_images}" + buffer = LeRobotDatasetV2.from_huggingface_hub(dataset_repo_id, decode_images, storage_dir=storage_dir) + lerobot_dataset = LeRobotDataset(dataset_repo_id) + assert set(buffer.camera_keys) == set(lerobot_dataset.camera_keys) @pytest.mark.parametrize( - ("dataset_repo_id", "decode_video"), + ("dataset_repo_id", "decode_images"), ( # choose unitreeh1_two_robot_greeting to have multiple image keys ("lerobot/unitreeh1_two_robot_greeting", True), ("lerobot/unitreeh1_two_robot_greeting", False), ("lerobot/pusht", False), + ("lerobot/pusht_image", False), + ("lerobot/pusht_image", True), ), ) -def test_getter_returns_pytorch_format(tmp_path: Path, dataset_repo_id: str, decode_video: bool): +def test_getter_returns_pytorch_format(tmp_path: Path, dataset_repo_id: str, decode_images: bool): """Checks that tensors are returned and that images are torch.float32, in range [0, 1], channel-first.""" # Note: storage_dir specified explicitly in order to make use of pytest's temporary file fixture. - storage_dir = tmp_path / f"{dataset_repo_id}_{uuid4().hex}_{decode_video}" - buffer = LeRobotDatasetV2.from_huggingface_hub(dataset_repo_id, decode_video, storage_dir=storage_dir) + storage_dir = tmp_path / f"{dataset_repo_id}_{uuid4().hex}_{decode_images}" + buffer = LeRobotDatasetV2.from_huggingface_hub(dataset_repo_id, decode_images, storage_dir=storage_dir) # Just iterate through the start and end of the dataset to make the test faster. for i in np.concatenate([np.arange(0, 10), np.arange(-10, 0)]): item = buffer[i] @@ -272,7 +292,7 @@ def test_getter_images_with_delta_timestamps(tmp_path: Path): fps = lerobot_dataset_info["fps"] # Note: storage_dir specified explicitly in order to make use of pytest's temporary file fixture. 
storage_dir = tmp_path / f"{dataset_repo_id}_{uuid4().hex}" - buffer = LeRobotDatasetV2.from_huggingface_hub(dataset_repo_id, storage_dir=storage_dir, fps=fps) + buffer = LeRobotDatasetV2.from_huggingface_hub(dataset_repo_id, storage_dir=storage_dir) delta_timestamps = [-1 / fps, 0.0, 1 / fps] buffer.set_delta_timestamps({k: delta_timestamps for k in buffer.camera_keys}) @@ -297,10 +317,10 @@ def test_getter_video_images_with_delta_timestamps(tmp_path: Path): fps = lerobot_dataset_info["fps"] # Note: storage_dir specified explicitly in order to make use of pytest's temporary file fixture. buffer_video = LeRobotDatasetV2.from_huggingface_hub( - dataset_repo_id, False, storage_dir=tmp_path / f"{dataset_repo_id}_{uuid4().hex}_False", fps=fps + dataset_repo_id, False, storage_dir=tmp_path / f"{dataset_repo_id}_{uuid4().hex}_False" ) buffer_decode = LeRobotDatasetV2.from_huggingface_hub( - dataset_repo_id, True, storage_dir=tmp_path / f"{dataset_repo_id}_{uuid4().hex}_True", fps=fps + dataset_repo_id, True, storage_dir=tmp_path / f"{dataset_repo_id}_{uuid4().hex}_True" ) assert set(buffer_video.camera_keys) == set(buffer_decode.camera_keys) @@ -320,14 +340,15 @@ def test_getter_video_images_with_delta_timestamps(tmp_path: Path): @pytest.mark.parametrize( - ("dataset_repo_id", "decode_video"), + ("dataset_repo_id", "decode_images"), ( ("lerobot/pusht", True), ("lerobot/pusht", False), + ("lerobot/pusht_image", True), ("lerobot/pusht_image", False), ), ) -def test_from_huggingface_hub(tmp_path: Path, dataset_repo_id: str, decode_video: bool): +def test_from_huggingface_hub(tmp_path: Path, dataset_repo_id: str, decode_images: bool): """ Check that: - We can make a buffer from a Hugging Face Hub dataset repository. @@ -341,23 +362,27 @@ def test_from_huggingface_hub(tmp_path: Path, dataset_repo_id: str, decode_video # Note: storage_dir specified explicitly in order to make use of pytest's temporary file fixture. # This ensures that the first time this loop is run, the storage directory does not already exist. storage_dir = tmp_path / LeRobotDatasetV2._default_storage_dir_from_huggingface_hub( - dataset_repo_id, hf_dataset._fingerprint, decode_video + dataset_repo_id, hf_dataset._fingerprint, decode_images ).relative_to("/tmp") if iteration == 0 and storage_dir.exists(): raise DevTestingError("The storage directory should not exist for the first pass of this test.") buffer = LeRobotDatasetV2.from_huggingface_hub( dataset_repo_id, - decode_video, + decode_images, storage_dir=storage_dir, ) assert len(buffer) == len(hf_dataset) for k, feature in hf_dataset.features.items(): if isinstance(feature, datasets.features.Image): - assert np.array_equal( - buffer.get_data_by_key(k), np.stack([np.array(pil_img) for pil_img in hf_dataset[k]]) - ) + if decode_images: + assert np.array_equal( + buffer.get_data_by_key(k), np.stack([np.array(pil_img) for pil_img in hf_dataset[k]]) + ) + else: + # TODO(now) + pass elif isinstance(feature, VideoFrame): - if decode_video: + if decode_images: # Decode the video here. lerobot_dataset_info = load_info(dataset_repo_id, version=CODEBASE_VERSION, root=DATA_DIR) videos_path = load_videos(dataset_repo_id, version=CODEBASE_VERSION, root=DATA_DIR) @@ -376,10 +401,8 @@ def test_from_huggingface_hub(tmp_path: Path, dataset_repo_id: str, decode_video all_imgs.extend(episode_imgs) assert np.array_equal(buffer.get_data_by_key(k), all_imgs) else: - # Check that the video paths are the same. 
- assert np.array_equal( - buffer.get_data_by_key(k), [item["path"].encode("ascii") for item in hf_dataset[k]] - ) + # TODO(now) + pass elif isinstance(feature, (datasets.features.Sequence, datasets.features.Value)): assert np.array_equal(buffer.get_data_by_key(k), hf_dataset[k]) else: @@ -412,7 +435,7 @@ def test_compute_sampler_weights_trivial( "to": torch.tensor([offline_dataset_size // 2, offline_dataset_size]), } # Create spoof online datset. - online_dataset = make_new_buffer(tmp_path / f"buffer_{uuid4().hex}") + online_dataset = LeRobotDatasetV2(tmp_path / f"dataset_{uuid4().hex}") if online_dataset_size > 0: online_dataset.add_episodes( make_spoof_data_frames(n_episodes=2, n_frames_per_episode=online_dataset_size // 2) @@ -443,7 +466,7 @@ def test_compute_sampler_weights_nontrivial_ratio(tmp_path: Path): "to": torch.tensor([2, 4]), } # Create spoof online datset. - online_dataset = make_new_buffer(tmp_path / f"buffer_{uuid4().hex}") + online_dataset = LeRobotDatasetV2(tmp_path / f"dataset_{uuid4().hex}") online_dataset.add_episodes(make_spoof_data_frames(n_episodes=4, n_frames_per_episode=2)) online_sampling_ratio = 0.8 weights = compute_sampler_weights( @@ -466,7 +489,7 @@ def test_compute_sampler_weights_drop_n_last_frames(tmp_path: Path): offline_dataset.hf_dataset.set_transform(hf_transform_to_torch) offline_dataset.episode_data_index = {"from": torch.tensor([0]), "to": torch.tensor([2])} - online_dataset = make_new_buffer(tmp_path / f"buffer_{uuid4().hex}") + online_dataset = LeRobotDatasetV2(tmp_path / f"dataset_{uuid4().hex}") online_dataset.add_episodes(make_spoof_data_frames(n_episodes=4, n_frames_per_episode=2)) weights = compute_sampler_weights( From fb6841d00cd4419f76aa96f9bf398e7dde942df1 Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Fri, 27 Sep 2024 15:44:23 +0100 Subject: [PATCH 28/29] tests passing --- lerobot/common/datasets/online_buffer.py | 43 ++++++----- tests/test_online_buffer.py | 94 +++++++++++++----------- 2 files changed, 73 insertions(+), 64 deletions(-) diff --git a/lerobot/common/datasets/online_buffer.py b/lerobot/common/datasets/online_buffer.py index f80169e1a..54b1ad4cf 100644 --- a/lerobot/common/datasets/online_buffer.py +++ b/lerobot/common/datasets/online_buffer.py @@ -151,32 +151,34 @@ class LeRobotDatasetV2(torch.utils.data.Dataset): dataloader = torch.utils.data.DataLoader(dataset) ``` - TODO(now): Finish the docs below + The next most common use case is to create a dataset from scratch, and push it to the hub in video format. + For example: - The next most common use case is to create a dataset from scratch. TODO(now): What to do about the need - for up-front buffer capacity. - - A size limit must be specified when creating a new dataset (to know how much space to reserve on disk for - the `numpy.memmap`s). The `add_episodes` method can be used to insert data in the form of integral - episodes (starting from frame 0 and with the frames ordered). For the purposes of a limited-capacity - experience replay buffer, data is inserted in a circular fashion, inserting after the most recently added - frame, and wrapping around to the start of the buffer when necessary (in which case older episode frames - are overwritten). - - - Example usage: you want to create a new dataset and upload it to the hub - - COMING SOON + ```python + # TODO(now): Should it default to video mode? 
+ dataset = LeRobotDatasetV2("path/to/new/dataset", image_mode="video") + # OR, if you know the size of the dataset (in number of frames) up front, provide a buffer capacity up + # front for more efficient handling of the underlying memmaps. + dataset = LeRobotDatasetV2("path/to/new/dataset", image_mode="video", buffer_capacity=25000) + + # Add episodes to the dataset. + for _ in range(num_episodes): + # Create a dictionary mapping data keys to arrays of shape (num_frames_in_episode, *). + dataset.add_episodes(data_dict) + TODO(alexander-soare): Push to hub + ``` - Example usage: you need an experience replay buffer for an online RL policy like TD-MPC + Finally, one may also use LeRobotDatasetV2 as an experience replay buffer for online RL algorithms. ```python - dataset = LeRobotDatasetV2(storage_dir="online_buffer", buffer_capacity=10000) + dataset = LeRobotDatasetV2("online_buffer", buffer_capacity=10000, use_as_filo_buffer=True) iter_dataloader = iter(torch.utils.data.DataLoader(dataset)) # training loop while True: data_dict = do_online_rollouts() + # Here, if the new frames exceed the capacity of the buffer, the oldest frames are shifted out to + # make space. dataset.add_episodes(data_dict) batch = next(iter_dataloader) # Policy forward, backward, gradient step. @@ -211,7 +213,7 @@ def __init__( storage_dir: str | Path, buffer_capacity: int | None = None, use_as_filo_buffer: bool = False, - image_mode: LeRobotDatasetV2ImageMode | None = None, + image_mode: LeRobotDatasetV2ImageMode | str | None = None, image_transform: Callable[[np.ndarray], np.ndarray] | None = None, delta_timestamps: dict[str, list[float]] | dict[str, np.ndarray] | None = None, fps: float | None = None, @@ -232,8 +234,9 @@ def __init__( use_as_filo_buffer: Set this to use the dataset as an online replay buffer. Calls to `add_episode` beyond the buffer_capacity, will result in the oldest episodes being pushed out of the buffer to make way for the new ones (first-in-last-out aka FILO). - image_mode: The image storage mode used for the item getter. See notes above on the various - options. If not provided it defaults to memmap mode. + image_mode: The image storage mode used for the item getter. Options are: "video", "png", + "memmap". See notes above for more information on the various options. If not provided it + defaults to memmap mode. TODO(now): should it default to video mode? image_transform: Transforms to apply in the item getter to all image data (any data whose key starts with "observation.image"). delta_timestamps: TODO(alexander-soare): Document this somewhere when diff --git a/tests/test_online_buffer.py b/tests/test_online_buffer.py index 9cf5fb56f..4f7856b79 100644 --- a/tests/test_online_buffer.py +++ b/tests/test_online_buffer.py @@ -35,40 +35,59 @@ ) from lerobot.common.datasets.utils import hf_transform_to_torch, load_hf_dataset, load_info, load_videos from lerobot.common.datasets.video_utils import VideoFrame, decode_video_frames_torchvision +from lerobot.common.utils.utils import seeded_context from tests.utils import DevTestingError # Some constants for DataBuffer tests. 
# TODO(now): remove these globals -data_key = "data" -data_shape = (2, 3) # just some arbitrary > 1D shape -# buffer_capacity = 100 -# fps = 10 - - -# def make_new_dataset( -# storage_dir: str, storage_dir_exists: bool = False, delta_timestamps: dict[str, list[float]] | None = None -# ) -> LeRobotDatasetV2: -# buffer = LeRobotDatasetV2( -# storage_dir, -# buffer_capacity=buffer_capacity if not storage_dir_exists else None, -# fps=None if delta_timestamps is None else fps, -# delta_timestamps=delta_timestamps, -# ) -# return buffer +# data_key = "data" def make_spoof_data_frames(n_episodes: int, n_frames_per_episode: int) -> dict[str, np.ndarray]: fps = 10 - new_data = { - data_key: np.arange(n_frames_per_episode * n_episodes * np.prod(data_shape)).reshape(-1, *data_shape), - LeRobotDatasetV2.INDEX_KEY: np.arange(n_frames_per_episode * n_episodes), - LeRobotDatasetV2.EPISODE_INDEX_KEY: np.repeat(np.arange(n_episodes), n_frames_per_episode), - LeRobotDatasetV2.FRAME_INDEX_KEY: np.tile(np.arange(n_frames_per_episode), n_episodes), - LeRobotDatasetV2.TIMESTAMP_KEY: np.tile(np.arange(n_frames_per_episode) / fps, n_episodes), - } + # Arbitrary choice of dimension. `proprio_dim` and `action_dim` are intentionally different. + proprio_dim = 6 + action_dim = 4 + img_size = (32, 32) + n_frames = n_frames_per_episode * n_episodes + with seeded_context(0): + new_data = { + "observation.image": np.random.randint(0, 256, size=(n_frames, *img_size, 3)).astype(np.uint8), + "observation.state": np.random.normal(size=(n_frames, proprio_dim)).astype(np.float32), + "action": np.random.normal(size=(n_frames, action_dim)).astype(np.float32), + LeRobotDatasetV2.INDEX_KEY: np.arange(n_frames), + LeRobotDatasetV2.EPISODE_INDEX_KEY: np.repeat(np.arange(n_episodes), n_frames_per_episode), + LeRobotDatasetV2.FRAME_INDEX_KEY: np.tile(np.arange(n_frames_per_episode), n_episodes), + LeRobotDatasetV2.TIMESTAMP_KEY: np.tile( + np.arange(n_frames_per_episode, dtype=np.float32) / fps, n_episodes + ), + } return new_data +def test_get_data_keys(tmp_path: Path): + dataset = LeRobotDatasetV2(tmp_path / f"dataset_{uuid4().hex}") + # Note: choices for args to make_spoof_data_frames are totally arbitrary. + new_episodes = make_spoof_data_frames(n_episodes=2, n_frames_per_episode=25) + dataset.add_episodes(new_episodes) + assert set(dataset.data_keys) == set(new_episodes) + + +def test_get_data_by_key(tmp_path: Path): + """Tests that data can be added to a dataset and all data for a specific key can be read back.""" + dataset = LeRobotDatasetV2(tmp_path / f"dataset_{uuid4().hex}") + # Note: choices for args to make_spoof_data_frames are mostly arbitrary. The only intentional aspect is to + # make sure the buffer is not full, in order to check that `get_data_by_key` only returns the part of the + # buffer that is occupied. + n_episodes = 2 + n_frames_per_episode = 25 + new_episodes = make_spoof_data_frames(n_episodes, n_frames_per_episode) + dataset.add_episodes(new_episodes) + + for k in new_episodes: + assert np.array_equal(new_episodes[k], dataset.get_data_by_key(k)) + + def test_non_mutate(tmp_path: Path): """Checks that the data provided to the add_data method is copied rather than passed by reference. 
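One property of the rewritten `make_spoof_data_frames` above worth keeping in mind: because every call runs
under `seeded_context(0)`, repeated calls produce identical random payloads, so tests that overwrite data can
only distinguish episodes via the bookkeeping keys (`index`, `episode_index`). A quick sketch of that
determinism (illustrative, assuming `seeded_context` seeds and restores numpy's global RNG):

```python
a = make_spoof_data_frames(n_episodes=1, n_frames_per_episode=5)
b = make_spoof_data_frames(n_episodes=1, n_frames_per_episode=5)
# The random payloads match exactly across calls...
assert np.array_equal(a["action"], b["action"])
# ...so only keys like "index" and "episode_index" can tell data written at different times apart.
```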
@@ -82,7 +101,7 @@ def test_non_mutate(tmp_path: Path): new_data = make_spoof_data_frames(2, 25) new_data_copy = deepcopy(new_data) dataset.add_episodes(new_data) - dataset.get_data_by_key(data_key)[:] += 1 + dataset.get_data_by_key("action")[:] += 1 assert all(np.array_equal(new_data[k], new_data_copy[k]) for k in new_data) @@ -123,9 +142,8 @@ def test_write_read(tmp_path: Path, do_reload: bool): dataset = LeRobotDatasetV2(storage_dir) assert len(dataset) == n_frames_per_episode * n_episodes - for i, item in enumerate(dataset): - assert all(isinstance(item[k], torch.Tensor) for k in item) - assert np.array_equal(item[data_key].numpy(), new_data[data_key][i]) + for k in dataset.data_keys: + assert np.array_equal(dataset.get_data_by_key(k), new_data[k]) def test_filo_needs_buffer_capacity(tmp_path): @@ -160,33 +178,21 @@ def test_filo(tmp_path: Path): len(dataset) == n_frames_per_episode * n_episodes ), "The new episode should have wrapped around to the start" + # Shift indices to match what the `add_episodes` method would do. + more_new_episodes[LeRobotDatasetV2.INDEX_KEY] += n_episodes * n_frames_per_episode + more_new_episodes[LeRobotDatasetV2.EPISODE_INDEX_KEY] += n_episodes expected_data = {} for k in new_episodes: expected_data[k] = new_episodes[k] # The extra new episode should overwrite the start of the buffer. expected_data[k][: len(more_new_episodes[k])] = more_new_episodes[k] - for i, item in enumerate(dataset): - assert np.array_equal(item[data_key].numpy(), expected_data[data_key][i]) + for k in dataset.data_keys: + assert np.array_equal(dataset.get_data_by_key(k), expected_data[k]) # TODO(now): Test that videos and pngs are removed as needed. -def test_get_data_by_key(tmp_path: Path): - """Tests that data can be added to a dataset and all data for a specific key can be read back.""" - dataset = LeRobotDatasetV2(tmp_path / f"dataset_{uuid4().hex}") - # Note: choices for args to make_spoof_data_frames are mostly arbitrary. The only intentional aspect is to - # make sure the buffer is not full, in order to check that `get_data_by_key` only returns the part of the - # buffer that is occupied. 
-    n_episodes = 2
-    n_frames_per_episode = 25
-    new_episodes = make_spoof_data_frames(n_episodes, n_frames_per_episode)
-    dataset.add_episodes(new_episodes)
-
-    data = dataset.get_data_by_key(data_key)
-    assert np.array_equal(data, new_episodes[data_key])
-
-
 def test_delta_timestamps_within_tolerance(tmp_path: Path):
     """Check that getting an item with delta_timestamps within tolerance succeeds."""
     buffer = LeRobotDatasetV2(

From c312d261cd0da54b14f337aec2a0aa6c0f5eebf0 Mon Sep 17 00:00:00 2001
From: Alexander Soare
Date: Fri, 27 Sep 2024 18:31:33 +0100
Subject: [PATCH 29/29] backup wip

---
 lerobot/common/datasets/online_buffer.py |  87 +++++++++---------
 lerobot/common/datasets/video_utils.py   |  38 +++++++-
 tests/test_online_buffer.py              | 110 +++++++++++++++++++----
 3 files changed, 172 insertions(+), 63 deletions(-)

diff --git a/lerobot/common/datasets/online_buffer.py b/lerobot/common/datasets/online_buffer.py
index 54b1ad4cf..33b57cf56 100644
--- a/lerobot/common/datasets/online_buffer.py
+++ b/lerobot/common/datasets/online_buffer.py
@@ -213,7 +213,7 @@ def __init__(
         storage_dir: str | Path,
         buffer_capacity: int | None = None,
         use_as_filo_buffer: bool = False,
-        image_mode: LeRobotDatasetV2ImageMode | str | None = None,
+        image_mode: LeRobotDatasetV2ImageMode | str = LeRobotDatasetV2ImageMode.MEMMAP,
         image_transform: Callable[[np.ndarray], np.ndarray] | None = None,
         delta_timestamps: dict[str, list[float]] | dict[str, np.ndarray] | None = None,
         fps: float | None = None,
@@ -230,7 +230,7 @@ def __init__(
             to add in advance, it is recommended that this parameter be provided for efficiency. If
             provided, attempts to add data beyond the capacity will result in an exception (unless
             `use_as_filo_buffer` is set). If provided with an existing storage directory, the existing
-            memmaps will be expanded to the provided capacity if needed. TODO(now): test all this
+            memmaps will be expanded to the provided capacity if needed.
             use_as_filo_buffer: Set this to use the dataset as an online replay buffer. Calls to
                 `add_episodes` beyond the buffer_capacity will result in the oldest episodes being pushed
                 out of the buffer to make way for the new ones (first-in-last-out aka FILO).
@@ -253,10 +253,9 @@ def __init__(
         self._storage_dir = Path(storage_dir)
         self._data: dict[str, np.memmap] = {}
         self._use_as_filo_buffer = use_as_filo_buffer
-        self._image_mode = LeRobotDatasetV2ImageMode.MEMMAP
         self._videos_dir: str | None = None
         self._images_dir: str | None = None
-        # TODO(now): Put fps in metadata.
+        # TODO(now): Put fps in metadata. What if fps is provided when loading a dataset?
         self._fps = fps  # TODO(now): Decide how to treat fps (required or not?)
 
         # If the storage directory and metadata files already exist, load the memmaps.
@@ -278,17 +277,15 @@ def __init__(
             self._buffer_capacity = buffer_capacity
             # Set image mode based on what's available in the storage directory and/or the user's selection.
             possible_image_modes = self._infer_image_modes()
-            if image_mode is not None:
-                if image_mode not in possible_image_modes:
-                    raise ValueError(
-                        f"Provided image_mode {str(image_mode)} not available with this storage directory. "
-                        f"Modes available: {[str(m) for m in possible_image_modes]}"
-                    )
-                self._image_mode = image_mode
+            if image_mode not in possible_image_modes:
+                raise ValueError(
+                    f"Provided image_mode {str(image_mode)} not available with this storage directory. "
" + f"Modes available: {[str(m) for m in possible_image_modes]}" + ) else: self._buffer_capacity = buffer_capacity - if image_mode is not None: - self._image_mode = image_mode + + self._image_mode = LeRobotDatasetV2ImageMode(image_mode) self.set_delta_timestamps(delta_timestamps) self.image_transform = image_transform @@ -302,8 +299,6 @@ def data_keys(self) -> list[str]: """Return the names of all data keys in the dataset. Exclude "private" internal keys. - - TODO(now): Test """ metadata = self._load_metadata() return list(metadata["data_keys"]) @@ -339,7 +334,6 @@ def num_samples(self) -> int: return np.count_nonzero(self._data[self.OCCUPANCY_MASK_KEY]) def get_unique_episode_indices(self) -> np.ndarray: - # TODO(now): test return np.unique(self._data[self.EPISODE_INDEX_KEY][self._data[self.OCCUPANCY_MASK_KEY]]) def _infer_image_modes(self) -> list[LeRobotDatasetV2ImageMode]: @@ -349,7 +343,7 @@ def _infer_image_modes(self) -> list[LeRobotDatasetV2ImageMode]: image_modes.append(LeRobotDatasetV2ImageMode.VIDEO) if (self.storage_dir / self.PNGS_DIR).exists(): image_modes.append(LeRobotDatasetV2ImageMode.PNG) - if any(k.startswith("observation.image") for k in self._load_metadata()): + if any(k.startswith(self.IMAGE_KEY_PREFIX) for k in self._load_metadata()["_data_spec"]): image_modes.append(LeRobotDatasetV2ImageMode.MEMMAP) return image_modes @@ -504,16 +498,22 @@ def add_episodes(self, data: dict[str, np.ndarray], no_flush: bool = False): `data` should be a mapping of data key to data in array format. It should contain at least one episode. The episodes should have frame indices that start from 0 and step up in increments of 1. + All image data should be np.uint8, channel-last. Episodes are added to the dataset one-by-one. If an episode has more frames then are available till - the end of the numpy.memmap buffer, the pointer is reset to the start of the buffer and the episode is - inserted there, overwriting existing episode frames. # TODO(now): Maybe I only want this if the user intends to use the dataset as a replay buffer - - When episode frames are overwritten by a new episode, by default, any remaining frames belonging to - the existing episode are left in place (meaning not all episodes will be guaranteed to start from - their frame 0). # TODO(now): Maybe I only want this if the user intends to use the dataset as a replay buffer - - After adding the episodes to the dataset, the numpy.memmap buffer is flushed to disk. + the end of the numpy.memmap buffer, there are several possibilities: + - If `buffer_capacity` was not provided at initialization, the memmaps are doubled in size. + - If `buffer_capacity` was provided and `use_as_filo_buffer=False`, an exception will be raised. + - If `buffer_capacity` was provided and `use_as_filo_buffer=True`, the data insertion pointer is + reset to the start of the memmap and the episode is inserted there, overwriting existing + episode frames. When episode frames are overwritten by a new episode, any remaining frames + belonging to the existing episode are left in place (meaning not all episodes will be + guaranteed to start from their frame 0). + + After adding the episodes to the dataset, the numpy.memmap buffer is flushed to disk, unless + `no_flush=True` is provided. In a loop where one episode is added at a time, providing `no_flush=True` + may provide speed advantages. The user should just remember to call `flush()` after finishing the + loop. 
""" for episode_index in np.unique(data[self.EPISODE_INDEX_KEY]): where_episode = np.where(data[self.EPISODE_INDEX_KEY] == episode_index)[0] @@ -582,7 +582,7 @@ def _add_episode(self, data: dict[str, np.ndarray]): next_index = 0 else: # A buffer capacity was provided meaning we intend to fix the dataset size. - raise RuntimeError( + raise ValueError( "Can't add this episode as it would exceed the provided buffer_capacity " f"({self._buffer_capacity} frames) by {n_excess} frames." ) @@ -601,6 +601,7 @@ def _add_episode(self, data: dict[str, np.ndarray]): / self.VIDEOS_DIR / self.VIDEO_NAME_FSTRING.format(data_key=k, episode_index=new_episode_index), self.fps, + # crf=0, # TODO(now) ) elif self._image_mode == LeRobotDatasetV2ImageMode.PNG and k.startswith(self.IMAGE_KEY_PREFIX): # Encode images to PNG and save to disk. @@ -671,13 +672,13 @@ def get_episode(self, episode_index: int) -> dict[str, np.ndarray]: """Returns a whole episode as a key-array mapping. Includes decoding videos or PNG files where necessary. - - TODO(now) """ episode_data = {} data_mask = np.bitwise_and( self._data[self.EPISODE_INDEX_KEY] == episode_index, self._data[self.OCCUPANCY_MASK_KEY] ) + if np.count_nonzero(data_mask) == 0: + raise IndexError(f"Episode index {episode_index} is not in the dataset.") for k in self.data_keys: if self._image_mode == LeRobotDatasetV2ImageMode.VIDEO and k in self.camera_keys: episode_data[k] = decode_video_frames_torchvision( @@ -685,7 +686,7 @@ def get_episode(self, episode_index: int) -> dict[str, np.ndarray]: / self.VIDEOS_DIR / self.VIDEO_NAME_FSTRING.format(data_key=k, episode_index=episode_index), timestamps=self._data[self.TIMESTAMP_KEY][data_mask], - tolerance_s=self.tolerance_s, + tolerance_s=1e-8, backend="pyav", to_pytorch_format=False, ) @@ -903,32 +904,30 @@ def from_huggingface_hub( ), ) - kwargs["fps"] = lerobot_dataset_info["fps"] - dataset_already_on_disk = False if Path(kwargs["storage_dir"] / LeRobotDatasetV2.METADATA_FILE_NAME).exists(): dataset_already_on_disk = True - # Create the LeRobotDatasetV2 object. Reminder: if the storage directory already exists, this reads it. - # Otherwise, the storage directory is not created until later when we make the first call to - # `add_episodes`. - obj = cls( - **kwargs, - buffer_capacity=len(hf_dataset) if not dataset_already_on_disk else None, - ) - # Set the image mode based on the provided HF dataset and whether we are decoding images. if len(hf_dataset_camera_keys) == 0 or decode_images: - obj._image_mode = LeRobotDatasetV2ImageMode.MEMMAP + image_mode = LeRobotDatasetV2ImageMode.MEMMAP elif lerobot_dataset_info.get("video", False): - obj._image_mode = LeRobotDatasetV2ImageMode.VIDEO - obj._videos_dir = kwargs["storage_dir"] / LeRobotDatasetV2.VIDEOS_DIR + image_mode = LeRobotDatasetV2ImageMode.VIDEO else: - obj._image_mode = LeRobotDatasetV2ImageMode.PNG + image_mode = LeRobotDatasetV2ImageMode.PNG for k, feature in hf_dataset.features.items(): if isinstance(feature, datasets.features.Image): hf_dataset = hf_dataset.cast_column(k, datasets.features.Image(decode=False)) - obj._images_dir = kwargs["storage_dir"] / LeRobotDatasetV2.PNGS_DIR + + # Create the LeRobotDatasetV2 object. Reminder: if the storage directory already exists, this reads it. + # Otherwise, the storage directory is not created until later when we make the first call to + # `add_episodes`. 
+        obj = cls(
+            **kwargs,
+            buffer_capacity=len(hf_dataset) if not dataset_already_on_disk else None,
+            fps=lerobot_dataset_info["fps"],
+            image_mode=image_mode,
+        )
 
         # If we have accessed an existing cached dataset, just return the object as is.
         if dataset_already_on_disk:
diff --git a/lerobot/common/datasets/video_utils.py b/lerobot/common/datasets/video_utils.py
index af60222e8..29f856604 100644
--- a/lerobot/common/datasets/video_utils.py
+++ b/lerobot/common/datasets/video_utils.py
@@ -14,9 +14,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
+import os
 import subprocess
 import warnings
 from collections import OrderedDict
+from contextlib import contextmanager
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any, ClassVar
@@ -174,6 +176,35 @@ def decode_video_frames_torchvision(
     return closest_frames
 
 
+@contextmanager
+def _ffmpeg_log_level_to_svt(ffmpeg_log_level: str | None):
+    """
+    Context manager for running an ffmpeg command, cascading the ffmpeg log level to the corresponding
+    SVT log level.
+    """
+    if ffmpeg_log_level is not None:
+        prior_svt_log_value = os.environ.get("SVT_LOG")
+        # Mapping of ffmpeg log levels to the corresponding SVT log levels.
+        svt_log_levels = {
+            "quiet": 0,  # SVT_LOG_FATAL
+            "panic": 0,  # SVT_LOG_FATAL
+            "fatal": 0,  # SVT_LOG_FATAL
+            "error": 1,  # SVT_LOG_ERROR
+            "warning": 2,  # SVT_LOG_WARN
+            "info": 3,  # SVT_LOG_INFO
+            "verbose": 3,  # SVT_LOG_INFO
+            "debug": 4,  # SVT_LOG_DEBUG
+            "trace": -1,  # SVT_LOG_ALL
+        }
+        os.environ["SVT_LOG"] = str(svt_log_levels[ffmpeg_log_level])
+        try:
+            yield
+        finally:
+            # Restore the prior environment, even if the wrapped command raises.
+            if prior_svt_log_value is not None:
+                os.environ["SVT_LOG"] = prior_svt_log_value
+            else:
+                os.environ.pop("SVT_LOG")
+    else:
+        yield
+
+
 def encode_video_frames(
     imgs_dir: Path,
     video_path: Path,
@@ -187,6 +218,7 @@
     overwrite: bool = False,
 ) -> None:
     """More info on ffmpeg argument tuning in `benchmark/video/README.md`"""
+    video_path = Path(video_path)
 
     video_path.parent.mkdir(parents=True, exist_ok=True)
@@ -219,8 +251,10 @@ def encode_video_frames(
     ffmpeg_args.append("-y")
 
     ffmpeg_cmd = ["ffmpeg"] + ffmpeg_args + [str(video_path)]
-    # redirect stdin to subprocess.DEVNULL to prevent reading random keyboard inputs from terminal
-    subprocess.run(ffmpeg_cmd, check=True, stdin=subprocess.DEVNULL)
+
+    with _ffmpeg_log_level_to_svt(log_level):
+        # redirect stdin to subprocess.DEVNULL to prevent reading random keyboard inputs from terminal
+        subprocess.run(ffmpeg_cmd, check=True, stdin=subprocess.DEVNULL)
 
     if not video_path.exists():
         raise OSError(
diff --git a/tests/test_online_buffer.py b/tests/test_online_buffer.py
index 4f7856b79..0d26c5ff2 100644
--- a/tests/test_online_buffer.py
+++ b/tests/test_online_buffer.py
@@ -14,9 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# TODO(now): Test that the data_keys are saved correctly in the metadata
-# TODO(now): Test adding episodes in video mode and png mode.
-# TODO(now): Test ._extend_memmaps
+# TODO(now): Test relevant functions in all image modes.
 from copy import deepcopy
 from pathlib import Path
 
@@ -30,6 +28,7 @@
 from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION, DATA_DIR, LeRobotDataset
 from lerobot.common.datasets.online_buffer import (
     LeRobotDatasetV2,
+    LeRobotDatasetV2ImageMode,
     TimestampOutsideToleranceError,
     compute_sampler_weights,
 )
@@ -38,19 +37,17 @@
 from lerobot.common.utils.utils import seeded_context
 from tests.utils import DevTestingError
 
-# Some constants for DataBuffer tests.
-# TODO(now): remove these globals
-# data_key = "data"
-
 
-def make_spoof_data_frames(n_episodes: int, n_frames_per_episode: int) -> dict[str, np.ndarray]:
+def make_spoof_data_frames(
+    n_episodes: int, n_frames_per_episode: int, seed: int = 0
+) -> dict[str, np.ndarray]:
     fps = 10
-    # Arbitrary choice of dimension. `proprio_dim` and `action_dim` are intentionally different.
     proprio_dim = 6
     action_dim = 4
-    img_size = (32, 32)
+    # This is the minimum size allowed for encoding.
+    img_size = (64, 64)
     n_frames = n_frames_per_episode * n_episodes
-    with seeded_context(0):
+    with seeded_context(seed):
         new_data = {
             "observation.image": np.random.randint(0, 256, size=(n_frames, *img_size, 3)).astype(np.uint8),
             "observation.state": np.random.normal(size=(n_frames, proprio_dim)).astype(np.float32),
             "action": np.random.normal(size=(n_frames, action_dim)).astype(np.float32),
@@ -88,6 +85,16 @@ def test_get_data_by_key(tmp_path: Path):
         assert np.array_equal(new_episodes[k], dataset.get_data_by_key(k))
 
 
+def test_get_unique_episode_indices(tmp_path: Path):
+    dataset = LeRobotDatasetV2(tmp_path / f"dataset_{uuid4().hex}")
+    # Note: choices for args to make_spoof_data_frames are mostly arbitrary. Just make sure there is more
+    # than one episode.
+    n_episodes = 2
+    new_episodes = make_spoof_data_frames(n_episodes=n_episodes, n_frames_per_episode=25)
+    dataset.add_episodes(new_episodes)
+    assert np.array_equal(np.sort(dataset.get_unique_episode_indices()), np.arange(n_episodes))
+
+
 def test_non_mutate(tmp_path: Path):
     """Checks that the data provided to the add_episodes method is copied rather than passed by reference.
 
@@ -124,13 +131,17 @@ def test_index_error_with_data(tmp_path: Path):
 
 
 @pytest.mark.parametrize("do_reload", [False, True])
-def test_write_read(tmp_path: Path, do_reload: bool):
+@pytest.mark.parametrize("image_mode", list(LeRobotDatasetV2ImageMode))
+def test_write_read_and_get_episode(tmp_path: Path, do_reload: bool, image_mode: str):
     """Checks that data can be added to the dataset and read back.
 
     If do_reload, we delete the dataset object and load the dataset back from disk before reading.
+
+    This also tests that `get_episode` returns the correct data.
     """
     storage_dir = tmp_path / f"dataset_{uuid4().hex}"
-    dataset = LeRobotDatasetV2(storage_dir)
+    fps = 10
+    dataset = LeRobotDatasetV2(storage_dir, image_mode=image_mode, fps=fps)
     # Note: choices for args to make_spoof_data_frames are arbitrary.
     n_episodes = 2
     n_frames_per_episode = 25
@@ -139,14 +150,79 @@
 
     if do_reload:
         del dataset
-        dataset = LeRobotDatasetV2(storage_dir)
+        dataset = LeRobotDatasetV2(storage_dir, image_mode=image_mode, fps=fps)
 
     assert len(dataset) == n_frames_per_episode * n_episodes
 
-    for k in dataset.data_keys:
-        assert np.array_equal(dataset.get_data_by_key(k), new_data[k])
+    for episode_index in range(n_episodes):
+        data_mask = new_data[LeRobotDatasetV2.EPISODE_INDEX_KEY] == episode_index
+        episode_data = dataset.get_episode(episode_index)
+        for k in dataset.data_keys:
+            if image_mode == LeRobotDatasetV2ImageMode.VIDEO and k.startswith(
+                LeRobotDatasetV2.IMAGE_KEY_PREFIX
+            ):
+                # Unfortunately we can't check array equality here because the videos were encoded lossily.
+                # We'll settle for a shape check.
+                # TODO(now): This should work with crf=0
+                assert episode_data[k].shape == new_data[k][data_mask].shape
+            else:
+                assert np.array_equal(episode_data[k], new_data[k][data_mask]), f"Mismatch for {k=}"
 
 
-def test_filo_needs_buffer_capacity(tmp_path):
+def test_get_episode_index_error(tmp_path: Path):
+    """Test that get_episode raises an IndexError with an invalid episode index."""
+    dataset = LeRobotDatasetV2(tmp_path / f"dataset_{uuid4().hex}")
+    # Note: args to make_spoof_data_frames are mostly arbitrary.
+    n_episodes = 1
+    new_episodes = make_spoof_data_frames(n_episodes=n_episodes, n_frames_per_episode=25)
+    dataset.add_episodes(new_episodes)
+    with pytest.raises(IndexError):
+        dataset.get_episode(n_episodes)
+
+
+def test_buffer_capacity(tmp_path: Path):
+    """Check that explicitly providing a buffer capacity, then overflowing it, causes an exception."""
+    dataset = LeRobotDatasetV2(tmp_path / f"dataset_{uuid4().hex}", buffer_capacity=60)
+    # Note: choices for args to make_spoof_data_frames are totally arbitrary.
+    new_episodes = make_spoof_data_frames(n_episodes=3, n_frames_per_episode=25)
+    with pytest.raises(ValueError):
+        dataset.add_episodes(new_episodes)
+
+
+def test_dynamic_memmap_size(tmp_path: Path):
+    """Check that the memmap is dynamically resized when trying to add episodes beyond the current capacity.
+
+    Also check that the existing data is not corrupted.
+    """
+    dataset = LeRobotDatasetV2(tmp_path / f"dataset_{uuid4().hex}")
+    n_frames_per_episode = 5
+    new_episodes = make_spoof_data_frames(n_episodes=1, n_frames_per_episode=n_frames_per_episode, seed=0)
+    all_episodes = new_episodes
+    # This should trigger the creation of memmaps with capacity for one episode (5 frames).
+    dataset.add_episodes(new_episodes)
+    expected_capacity = n_frames_per_episode
+    used_capacity = n_frames_per_episode
+    assert len(dataset._data[LeRobotDatasetV2.INDEX_KEY]) == expected_capacity
+    for i in range(1, 51):  # 50 iterations makes the capacity grow to 320
+        new_episodes = make_spoof_data_frames(n_episodes=1, n_frames_per_episode=n_frames_per_episode, seed=i)
+        # This should trigger the memmaps to double in size whenever the new episode doesn't fit.
+        dataset.add_episodes(new_episodes)
+        # Track the data we expect to see in the memmaps.
+        for k in new_episodes:
+            # Shift indices the same way we expect `add_episodes` to.
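+            # (`add_episodes` relabels incoming episodes so that the global frame index and the episode
+            # index both continue on from the last values already present in the dataset.)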
+ if k == LeRobotDatasetV2.INDEX_KEY: + new_episodes[k] += all_episodes[k][-1] + 1 + if k == LeRobotDatasetV2.EPISODE_INDEX_KEY: + new_episodes[k] += all_episodes[k][-1] + 1 + all_episodes[k] = np.concatenate([all_episodes[k], new_episodes[k]], axis=0) + used_capacity += n_frames_per_episode + if used_capacity > expected_capacity: + expected_capacity *= 2 + assert len(dataset._data[LeRobotDatasetV2.INDEX_KEY]) == expected_capacity + for k in dataset.data_keys: + assert np.array_equal(dataset.get_data_by_key(k), all_episodes[k]) + + +def test_filo_needs_buffer_capacity(tmp_path: Path): with pytest.raises(ValueError): LeRobotDatasetV2(tmp_path / f"dataset_{uuid4().hex}", use_as_filo_buffer=True)