From 5a18d94f6392a5e7566dfec92185e2e8817a9109 Mon Sep 17 00:00:00 2001
From: yuecideng <dengyueci@qq.com>
Date: Thu, 19 Feb 2026 09:22:48 +0000
Subject: [PATCH 1/8] wip

---
 embodichain/lab/gym/envs/embodied_env.py | 11 +++++--
 embodichain/lab/gym/utils/gym_utils.py   | 38 ++++++++++++++----------
 2 files changed, 32 insertions(+), 17 deletions(-)

diff --git a/embodichain/lab/gym/envs/embodied_env.py b/embodichain/lab/gym/envs/embodied_env.py
index f89621c7..b412babb 100644
--- a/embodichain/lab/gym/envs/embodied_env.py
+++ b/embodichain/lab/gym/envs/embodied_env.py
@@ -125,6 +125,13 @@ class EnvLightCfg:
     This is useful when we want to disable visual randomization for debug motion and physics issues.
     """
 
+    filter_dataset_saving: bool = False
+    """Whether to filter out dataset saving
+    
+    This is useful when we want to disable dataset saving for debug motion and physics issues.
+    If no dataset manager is configured, this flag will have no effect.
+    """
+
 
 @register_env("EmbodiedEnv-v1")
 class EmbodiedEnv(BaseEnv):
@@ -201,7 +208,7 @@ def _init_sim_state(self, **kwargs):
         if self.cfg.rewards:
             self.reward_manager = RewardManager(self.cfg.rewards, self)
 
-        if self.cfg.dataset:
+        if self.cfg.dataset and not self.cfg.filter_dataset_saving:
             self.dataset_manager = DatasetManager(self.cfg.dataset, self)
 
     def _apply_functor_filter(self) -> None:
@@ -376,7 +383,7 @@ def _initialize_episode(
             env_ids_to_process = list(env_ids)
 
         # Save dataset before clearing buffers for environments that are being reset
-        if save_data and self.cfg.dataset:
+        if save_data and self.dataset_manager:
             if "save" in self.dataset_manager.available_modes:
 
                 # Filter to only save successful episodes
diff --git a/embodichain/lab/gym/utils/gym_utils.py b/embodichain/lab/gym/utils/gym_utils.py
index 72d5fb40..4482b2a8 100644
--- a/embodichain/lab/gym/utils/gym_utils.py
+++ b/embodichain/lab/gym/utils/gym_utils.py
@@ -704,15 +704,16 @@ def add_env_launcher_args_to_parser(parser: argparse.ArgumentParser) -> None:
     """Add common environment launcher arguments to an existing argparse parser.
 
     This function adds the following arguments to the provided parser:
-        - --num_envs: Number of environments to run in parallel (default: 1)
-        - --device: Device to run the environment on (default: 'cpu')
-        - --headless: Whether to perform the simulation in headless mode (default: False)
-        - --enable_rt: Whether to use RTX rendering backend for the simulation (default: False)
-        - --gpu_id: The GPU ID to use for the simulation (default: 0)
-        - --filter_visual_rand: Whether to filter out visual randomization (default: False)
-        - --gym_config: Path to gym config file (default: '')
-        - --action_config: Path to action config file (default: None)
-        - --preview: Whether to preview the environment after launching (default: False)
+        --num_envs: Number of environments to run in parallel (default: 1)
+        --device: Device to run the environment on (default: 'cpu')
+        --headless: Whether to perform the simulation in headless mode (default: False)
+        --enable_rt: Whether to use RTX rendering backend for the simulation (default: False)
+        --gpu_id: The GPU ID to use for the simulation (default: 0)
+        --gym_config: Path to gym config file (default: '')
+        --action_config: Path to action config file (default: None)
+        --preview: Whether to preview the environment after launching (default: False)
+        --filter_visual_rand: Whether to filter out visual randomization (default: False)
+        --filter_dataset_saving: Whether to filter out dataset saving (default: False)
 
     Note:
         1. In preview mode, the environment will be launched and keep running in a loop for user interaction.
@@ -750,12 +751,6 @@ def add_env_launcher_args_to_parser(parser: argparse.ArgumentParser) -> None:
         default=0,
         type=int,
     )
-    parser.add_argument(
-        "--filter_visual_rand",
-        help="Whether to filter out visual randomization.",
-        default=False,
-        action="store_true",
-    )
     parser.add_argument(
         "--gym_config",
         type=str,
@@ -772,6 +767,18 @@ def add_env_launcher_args_to_parser(parser: argparse.ArgumentParser) -> None:
         default=False,
         action="store_true",
     )
+    parser.add_argument(
+        "--filter_visual_rand",
+        help="Whether to filter out visual randomization.",
+        default=False,
+        action="store_true",
+    )
+    parser.add_argument(
+        "--filter_dataset_saving",
+        help="Whether to filter out dataset saving.",
+        default=False,
+        action="store_true",
+    )
 
 
 def build_env_cfg_from_args(
@@ -795,6 +802,7 @@ def build_env_cfg_from_args(
         gym_config, manager_modules=DEFAULT_MANAGER_MODULES
     )
     cfg.filter_visual_rand = args.filter_visual_rand
+    cfg.filter_dataset_saving = args.filter_dataset_saving
 
     action_config = {}
     if args.action_config is not None:

From 78c649551f650bb6ab0e5f3c7edb99863cdcd325 Mon Sep 17 00:00:00 2001
From: yuecideng <dengyueci@qq.com>
Date: Fri, 20 Feb 2026 08:30:28 +0000
Subject: [PATCH 2/8] wip

---
 .../gym/envs/managers/randomization/spatial.py  | 17 ++++++++++++-----
 embodichain/lab/gym/utils/gym_utils.py          |  3 +++
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/embodichain/lab/gym/envs/managers/randomization/spatial.py b/embodichain/lab/gym/envs/managers/randomization/spatial.py
index dda5658d..408fa079 100644
--- a/embodichain/lab/gym/envs/managers/randomization/spatial.py
+++ b/embodichain/lab/gym/envs/managers/randomization/spatial.py
@@ -110,6 +110,7 @@ def randomize_rigid_object_pose(
     rotation_range: tuple[list[float], list[float]] | None = None,
     relative_position: bool = True,
     relative_rotation: bool = False,
+    physics_update_step: int = -1,
 ) -> None:
     """Randomize the pose of a rigid object in the environment.
 
@@ -122,6 +123,7 @@ def randomize_rigid_object_pose(
             The rotation is represented as Euler angles (roll, pitch, yaw) in degree.
         relative_position (bool): Whether to randomize the position relative to the object's initial position. Default is True.
         relative_rotation (bool): Whether to randomize the rotation relative to the object's initial rotation. Default is False.
+        physics_update_step (int): The number of physics update steps to apply after randomization. Default is -1 (no update).
     """
 
     if entity_cfg.uid not in env.sim.get_rigid_object_uid_list():
@@ -155,6 +157,9 @@ def randomize_rigid_object_pose(
     rigid_object.set_local_pose(pose, env_ids=env_ids)
     rigid_object.clear_dynamics()
 
+    if physics_update_step > 0:
+        env.sim.update(step=physics_update_step)
+
 
 def randomize_robot_eef_pose(
     env: EmbodiedEnv,
@@ -202,7 +207,7 @@ def set_random_eef_pose(joint_ids: List[int], robot: Robot) -> None:
         )
 
         new_qpos[ret == False] = current_qpos[ret == False]
-        robot.set_qpos(new_qpos, env_ids=env_ids, joint_ids=joint_ids)
+        robot.set_qpos(new_qpos, env_ids=env_ids, joint_ids=joint_ids, target=False)
 
     robot = env.sim.get_robot(entity_cfg.uid)
 
@@ -215,8 +220,8 @@ def set_random_eef_pose(joint_ids: List[int], robot: Robot) -> None:
             joint_ids = robot.get_joint_ids(part)
             set_random_eef_pose(joint_ids, robot)
 
-    # simulate 10 steps to let the robot reach the target pose.
-    env.sim.update(step=10)
+    # simulate 1 steps to let the robot reach the target pose.
+    env.sim.update(step=1)
 
 
 def randomize_robot_qpos(
@@ -264,8 +269,10 @@ def randomize_robot_qpos(
     else:
         current_qpos = qpos
 
-    robot.set_qpos(qpos=current_qpos, env_ids=env_ids, joint_ids=joint_ids)
-    env.sim.update(step=100)
+    robot.set_qpos(
+        qpos=current_qpos, env_ids=env_ids, joint_ids=joint_ids, target=False
+    )
+    env.sim.update(step=1)
 
 
 def randomize_target_pose(
diff --git a/embodichain/lab/gym/utils/gym_utils.py b/embodichain/lab/gym/utils/gym_utils.py
index 4482b2a8..8cc93ae4 100644
--- a/embodichain/lab/gym/utils/gym_utils.py
+++ b/embodichain/lab/gym/utils/gym_utils.py
@@ -803,6 +803,9 @@ def build_env_cfg_from_args(
     )
     cfg.filter_visual_rand = args.filter_visual_rand
     cfg.filter_dataset_saving = args.filter_dataset_saving
+    if args.preview:
+        # In preview mode, we typically don't want to save data
+        cfg.filter_dataset_saving = True  
 
     action_config = {}
     if args.action_config is not None:

From 536c17a4ec5371241f0a68d904aa6409a7f75448 Mon Sep 17 00:00:00 2001
From: yuecideng <dengyueci@qq.com>
Date: Fri, 20 Feb 2026 08:38:49 +0000
Subject: [PATCH 3/8] wip

---
 embodichain/lab/gym/utils/gym_utils.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/embodichain/lab/gym/utils/gym_utils.py b/embodichain/lab/gym/utils/gym_utils.py
index 4482b2a8..8cc93ae4 100644
--- a/embodichain/lab/gym/utils/gym_utils.py
+++ b/embodichain/lab/gym/utils/gym_utils.py
@@ -803,6 +803,9 @@ def build_env_cfg_from_args(
     )
     cfg.filter_visual_rand = args.filter_visual_rand
     cfg.filter_dataset_saving = args.filter_dataset_saving
+    if args.preview:
+        # In preview mode, we typically don't want to save data
+        cfg.filter_dataset_saving = True  
 
     action_config = {}
     if args.action_config is not None:

From 2cac45f930dac86746899f5f76e9f90cf8345200 Mon Sep 17 00:00:00 2001
From: yuecideng <dengyueci@qq.com>
Date: Fri, 20 Feb 2026 09:30:09 +0000
Subject: [PATCH 4/8] wip

---
 .../lab/gym/envs/managers/observations.py     | 27 +++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/embodichain/lab/gym/envs/managers/observations.py b/embodichain/lab/gym/envs/managers/observations.py
index 1eb83fa4..52df7c7a 100644
--- a/embodichain/lab/gym/envs/managers/observations.py
+++ b/embodichain/lab/gym/envs/managers/observations.py
@@ -61,6 +61,33 @@ def get_rigid_object_pose(
     return obj.get_local_pose(to_matrix=True)
 
 
+def get_rigid_object_velocity(
+    env: EmbodiedEnv,
+    obs: EnvObs,
+    entity_cfg: SceneEntityCfg,
+) -> torch.Tensor:
+    """Get the world velocities of the rigid objects in the environment.
+
+    If the rigid object with the specified UID does not exist in the environment,
+    a zero tensor will be returned.
+
+    Args:
+        env: The environment instance.
+        obs: The observation dictionary.
+        entity_cfg: The configuration of the scene entity.
+
+    Returns:
+        A tensor of shape (num_envs, 6) representing the linear and angular velocities of the rigid objects.
+    """
+
+    if entity_cfg.uid not in env.sim.get_rigid_object_uid_list():
+        return torch.zeros((env.num_envs, 6), dtype=torch.float32)
+
+    obj = env.sim.get_rigid_object(entity_cfg.uid)
+
+    return obj.body_data.vel
+
+
 def normalize_robot_joint_data(
     env: EmbodiedEnv,
     data: torch.Tensor,

From b659ad19b1c076e8e81c431b68416d7c8133cd41 Mon Sep 17 00:00:00 2001
From: yuecideng <dengyueci@qq.com>
Date: Sat, 21 Feb 2026 09:27:23 +0000
Subject: [PATCH 5/8] wip

---
 embodichain/lab/gym/envs/base_env.py          |  20 +-
 embodichain/lab/gym/envs/embodied_env.py      |   9 +-
 embodichain/lab/gym/envs/managers/datasets.py | 203 +++++++++++-------
 .../lab/gym/envs/managers/observations.py     |   4 +-
 embodichain/lab/gym/utils/gym_utils.py        |   2 +
 5 files changed, 147 insertions(+), 91 deletions(-)

diff --git a/embodichain/lab/gym/envs/base_env.py b/embodichain/lab/gym/envs/base_env.py
index 99f4c211..2471889d 100644
--- a/embodichain/lab/gym/envs/base_env.py
+++ b/embodichain/lab/gym/envs/base_env.py
@@ -164,21 +164,15 @@ def device(self) -> torch.device:
 
     @cached_property
     def single_observation_space(self) -> gym.spaces.Space:
-        if self.num_envs == 1:
-            return gym_utils.convert_observation_to_space(self._init_raw_obs)
-        else:
-            return gym_utils.convert_observation_to_space(
-                self._init_raw_obs, unbatched=True
-            )
+        return gym_utils.convert_observation_to_space(
+            self._init_raw_obs, unbatched=True
+        )
 
     @cached_property
     def observation_space(self) -> gym.spaces.Space:
-        if self.num_envs == 1:
-            return self.single_observation_space
-        else:
-            return gym.vector.utils.batch_space(
-                self.single_observation_space, n=self.num_envs
-            )
+        return gym_utils.convert_observation_to_space(
+            self._init_raw_obs, unbatched=False
+        )
 
     @cached_property
     def flattened_observation_space(self) -> gym.spaces.Box:
@@ -576,10 +570,10 @@ def reset(
         self.sim.reset_objects_state(
             env_ids=reset_ids, excluded_uids=self._detached_uids_for_reset
         )
-        self._elapsed_steps[reset_ids] = 0
 
         # Reset hook for user to perform any custom reset logic.
         self._initialize_episode(reset_ids, **options)
+        self._elapsed_steps[reset_ids] = 0
 
         return self.get_obs(**options), self.get_info(**options)
 
diff --git a/embodichain/lab/gym/envs/embodied_env.py b/embodichain/lab/gym/envs/embodied_env.py
index b412babb..5db51660 100644
--- a/embodichain/lab/gym/envs/embodied_env.py
+++ b/embodichain/lab/gym/envs/embodied_env.py
@@ -178,6 +178,9 @@ def __init__(self, cfg: EmbodiedEnvCfg, **kwargs):
 
         super().__init__(cfg, **kwargs)
 
+        if self.cfg.dataset and not self.cfg.filter_dataset_saving:
+            self.dataset_manager = DatasetManager(self.cfg.dataset, self)
+
         self.episode_obs_buffer: Dict[int, List[EnvObs]] = {
             i: [] for i in range(self.num_envs)
         }
@@ -208,9 +211,6 @@ def _init_sim_state(self, **kwargs):
         if self.cfg.rewards:
             self.reward_manager = RewardManager(self.cfg.rewards, self)
 
-        if self.cfg.dataset and not self.cfg.filter_dataset_saving:
-            self.dataset_manager = DatasetManager(self.cfg.dataset, self)
-
     def _apply_functor_filter(self) -> None:
         """Apply functor filters to the environment components based on configuration.
 
@@ -397,6 +397,7 @@ def _initialize_episode(
                 ]
 
                 if successful_env_ids:
+
                     # Convert back to tensor if needed
                     successful_env_ids_tensor = torch.tensor(
                         successful_env_ids, device=self.device
@@ -405,8 +406,6 @@ def _initialize_episode(
                         mode="save",
                         env_ids=successful_env_ids_tensor,
                     )
-                else:
-                    logger.log_warning("No successful episodes to save.")
 
         # Clear episode buffers and reset success status for environments being reset
         for env_id in env_ids_to_process:
diff --git a/embodichain/lab/gym/envs/managers/datasets.py b/embodichain/lab/gym/envs/managers/datasets.py
index 70b62aa7..5d148ec5 100644
--- a/embodichain/lab/gym/envs/managers/datasets.py
+++ b/embodichain/lab/gym/envs/managers/datasets.py
@@ -22,7 +22,9 @@
 from typing import TYPE_CHECKING, Any, Dict, Optional, Union
 
 import numpy as np
+import gymnasium as gym
 import torch
+import tqdm
 
 from embodichain.utils import logger
 from embodichain.data.constants import EMBODICHAIN_DEFAULT_DATASET_ROOT
@@ -87,6 +89,10 @@ def __init__(self, cfg: DatasetFunctorCfg, env: EmbodiedEnv):
         # Optional parameters
         self.instruction = params.get("instruction", None)
         self.extra = params.get("extra", {})
+
+        # Experimental parameters for extra episode info saving.
+        self.extra_episode_info = self.extra.get("episode_info", {})
+        self.extra_episode_info_buffer = {}
         self.use_videos = params.get("use_videos", False)
 
         # LeRobot dataset instance
@@ -165,10 +171,14 @@ def _save_episodes(
             episode_extra_info = extra_info.copy()
             self.total_time += current_episode_time
             episode_extra_info["total_time"] = self.total_time
-            self._update_dataset_info({"extra": episode_extra_info})
+            self._save_extra_episode_meta_info(env_id)
 
             try:
-                for obs, action in zip(obs_list, action_list):
+                for obs, action in tqdm.tqdm(
+                    zip(obs_list, action_list),
+                    total=len(obs_list),
+                    desc=f"Converting env {env_id} episode to LeRobot format",
+                ):
                     frame = self._convert_frame_to_lerobot(obs, action, task)
                     self.dataset.add_frame(frame)
 
@@ -183,6 +193,28 @@ def _save_episodes(
             except Exception as e:
                 logger.log_error(f"Failed to save episode {env_id}: {e}")
 
+    def _save_extra_episode_meta_info(self, env_id: int) -> None:
+        """Save extra episode meta info for a specific environment ID."""
+
+        curr_extra_episode_info = {}
+        if self.extra_episode_info:
+            for key, attr_list in self.extra_episode_info.items():
+                if key == "rigid_object_physcis_attributes":
+                    rigid_obj_list = self._env.sim.get_rigid_object_uid_list()
+                    for obj_uid in rigid_obj_list:
+                        curr_extra_episode_info[obj_uid] = {}
+                        obj = self._env.sim.get_rigid_object(obj_uid)
+                        for attr in attr_list:
+                            if attr == "mass":
+                                curr_extra_episode_info[obj_uid]["mass"] = round(
+                                    obj.get_mass(env_ids=[env_id]).squeeze_().item(), 5
+                                )
+
+        self.extra_episode_info_buffer[self.curr_episode] = curr_extra_episode_info
+        self._update_dataset_info(
+            {"extra_episode_info": self.extra_episode_info_buffer}
+        )
+
     def finalize(self) -> Optional[str]:
         """Finalize the dataset."""
         # Save any remaining episodes
@@ -260,43 +292,89 @@ def _initialize_dataset(self) -> None:
     def _build_features(self) -> Dict:
         """Build LeRobot features dict."""
         features = {}
-        extra_vision_config = self.robot_meta.get("observation", {}).get("vision", {})
 
-        for camera_name in extra_vision_config.keys():
-            sensor = self._env.get_sensor(camera_name)
+        # Setup robot joint state features based on control_parts or all joints if not specified.
+        control_parts = self.robot_meta.get("control_parts", None)
+        if control_parts is not None:
+            self._joint_ids = []
+            for part in control_parts:
+                part_joint_ids = self._env.robot.get_joint_ids(part, remove_mimic=True)
+                self._joint_ids.extend(part_joint_ids)
+        else:
+            self._joint_ids = self._env.robot.get_joint_ids(remove_mimic=True)
+
+        state_dim = len(self._joint_ids)
+        # Create joint names as strings: joint_0, joint_1, ...
+        joint_indices = [f"{i}" for i in self._joint_ids]
+        features["observation.qpos"] = {
+            "dtype": "float32",
+            "shape": (state_dim,),
+            "names": joint_indices,
+        }
+        features["observation.qvel"] = {
+            "dtype": "float32",
+            "shape": (state_dim,),
+            "names": joint_indices,
+        }
+        features["observation.qf"] = {
+            "dtype": "float32",
+            "shape": (state_dim,),
+            "names": joint_indices,
+        }
+
+        # Use full qpos dimension for action (includes gripper)
+        action_dim = len(self._joint_ids)
+        features["action"] = {
+            "dtype": "float32",
+            "shape": (action_dim,),
+            "names": joint_indices,
+        }
+
+        # Setup sensor observation features based env.observation.sensor
+        sensor_obs_space: dict = self._env.single_observation_space["sensor"]
+
+        for sensor_name, value in sensor_obs_space.items():
+            sensor = self._env.get_sensor(sensor_name)
             is_stereo = is_stereocam(sensor)
-            img_shape = (sensor.cfg.height, sensor.cfg.width, 3)
 
-            features[camera_name] = {
-                "dtype": "video" if self.use_videos else "image",
-                "shape": img_shape,
-                "names": ["height", "width", "channel"],
-            }
+            for frame_name, space in value.items():
+                # TODO: Support depth (uint16) and mask (also uint16 or uint8)
+                if frame_name not in ["color", "color_right"]:
+                    logger.log_error(
+                        f"Only support 'color' frame for vision sensors, but got '{frame_name}' in sensor '{sensor_name}'"
+                    )
 
-            if is_stereo:
-                features[get_right_name(camera_name)] = {
+                features[f"{sensor_name}.{frame_name}"] = {
                     "dtype": "video" if self.use_videos else "image",
-                    "shape": img_shape,
+                    "shape": (sensor.cfg.height, sensor.cfg.width, 3),
                     "names": ["height", "width", "channel"],
                 }
 
-        qpos = self._env.robot.get_qpos()
-        state_dim = qpos.shape[1]
+                if is_stereo:
+                    features[f"{sensor_name}.{frame_name}_right"] = {
+                        "dtype": "video" if self.use_videos else "image",
+                        "shape": (sensor.cfg.height, sensor.cfg.width, 3),
+                        "names": ["height", "width", "channel"],
+                    }
+
+        # TODO: The extra observation features are supposed to be defined in a flattened way in the observation space.
+        # Lerobot requires a flat feature dict, so we may need to support nested dicts to flatten dict conversion in the future.
+        # Add any extra features specified in observation space excluding 'robot' and 'sensor'
+        for key, space in self._env.single_observation_space.items():
+            if key in ["robot", "sensor"]:
+                continue
 
-        if state_dim > 0:
-            features["observation.state"] = {
-                "dtype": "float32",
-                "shape": (state_dim,),
-                "names": ["state"],
-            }
+            names = key
+            if "vel" in key:
+                names = ["lin_x", "lin_y", "lin_z", "ang_x", "ang_y", "ang_z"]
+            elif "pose" in key:
+                names = ["x", "y", "z", "qw", "qx", "qy", "qz"]
 
-        # Use full qpos dimension for action (includes gripper)
-        action_dim = state_dim
-        features["action"] = {
-            "dtype": "float32",
-            "shape": (action_dim,),
-            "names": ["action"],
-        }
+            features[f"observation.{key}"] = {
+                "dtype": str(space.dtype),
+                "shape": space.shape,
+                "names": names,
+            }
 
         return features
 
@@ -314,51 +392,38 @@ def _convert_frame_to_lerobot(
             Frame dict in LeRobot format with numpy arrays
         """
         frame = {"task": task}
-        extra_vision_config = self.robot_meta.get("observation", {}).get("vision", {})
+        sensor_obs_space: dict = self._env.single_observation_space["sensor"]
 
         # Add images
-        for camera_name in extra_vision_config.keys():
-            if camera_name in obs.get("sensor", {}):
-                sensor = self._env.get_sensor(camera_name)
-                is_stereo = is_stereocam(sensor)
-
-                color_data = obs["sensor"][camera_name]["color"]
-                if isinstance(color_data, torch.Tensor):
-                    color_img = color_data[:, :, :3].cpu().numpy()
-                else:
-                    color_img = np.array(color_data)[:, :, :3]
-
-                if color_img.dtype in [np.float32, np.float64]:
-                    color_img = (color_img * 255).astype(np.uint8)
-
-                frame[camera_name] = color_img
-
-                if is_stereo:
-                    color_right_data = obs["sensor"][camera_name]["color_right"]
-                    if isinstance(color_right_data, torch.Tensor):
-                        color_right_img = color_right_data[:, :, :3].cpu().numpy()
-                    else:
-                        color_right_img = np.array(color_right_data)[:, :, :3]
+        for sensor_name, value in sensor_obs_space.items():
+            sensor = self._env.get_sensor(sensor_name)
+            is_stereo = is_stereocam(sensor)
 
-                    if color_right_img.dtype in [np.float32, np.float64]:
-                        color_right_img = (color_right_img * 255).astype(np.uint8)
+            color_data = obs["sensor"][sensor_name]["color"]
+            color_img = color_data[:, :, :3].cpu()
+            frame[f"{sensor_name}.color"] = color_img
 
-                    frame[get_right_name(camera_name)] = color_right_img
+            if is_stereo:
+                color_right_data = obs["sensor"][sensor_name]["color_right"]
+                color_right_img = color_right_data[:, :, :3].cpu()
+                frame[f"{sensor_name}.color_right"] = color_right_img
 
         # Add state
-        qpos = obs["robot"][JointType.QPOS.value]
-        if isinstance(qpos, torch.Tensor):
-            state_data = qpos.cpu().numpy().astype(np.float32)
-        else:
-            state_data = np.array(qpos).astype(np.float32)
+        frame["observation.qpos"] = obs["robot"]["qpos"][self._joint_ids].cpu()
+        frame["observation.qvel"] = obs["robot"]["qvel"][self._joint_ids].cpu()
+        frame["observation.qf"] = obs["robot"]["qf"][self._joint_ids].cpu()
 
-        frame["observation.state"] = state_data
+        # Add extra observation features if they exist
+        for key in obs:
+            if key in ["robot", "sensor"]:
+                continue
 
-        # Add action (save complete qpos including gripper)
+            frame[f"observation.{key}"] = obs[key].cpu()
+
+        # Add action.
+        action = action[self._joint_ids]
         if isinstance(action, torch.Tensor):
-            action_data = action.cpu().numpy()
-        elif isinstance(action, np.ndarray):
-            action_data = action
+            action_data = action.cpu()
         elif isinstance(action, dict):
             # Extract qpos from action dict
             action_tensor = action.get(
@@ -372,13 +437,7 @@ def _convert_frame_to_lerobot(
                         break
 
             if isinstance(action_tensor, torch.Tensor):
-                action_data = action_tensor.cpu().numpy()
-            elif isinstance(action_tensor, np.ndarray):
-                action_data = action_tensor
-            else:
-                action_data = np.array(action_tensor)
-        else:
-            action_data = np.array(action)
+                action_data = action_tensor.cpu()
 
         frame["action"] = action_data
 
diff --git a/embodichain/lab/gym/envs/managers/observations.py b/embodichain/lab/gym/envs/managers/observations.py
index 52df7c7a..abf220af 100644
--- a/embodichain/lab/gym/envs/managers/observations.py
+++ b/embodichain/lab/gym/envs/managers/observations.py
@@ -38,6 +38,7 @@ def get_rigid_object_pose(
     env: EmbodiedEnv,
     obs: EnvObs,
     entity_cfg: SceneEntityCfg,
+    to_matrix: bool = True,
 ) -> torch.Tensor:
     """Get the world poses of the rigid objects in the environment.
 
@@ -48,6 +49,7 @@ def get_rigid_object_pose(
         env: The environment instance.
         obs: The observation dictionary.
         entity_cfg: The configuration of the scene entity.
+        to_matrix: Whether to return the pose as a 4x4 transformation matrix. If False, returns as (position, quaternion).
 
     Returns:
         A tensor of shape (num_envs, 4, 4) representing the world poses of the rigid objects.
@@ -58,7 +60,7 @@ def get_rigid_object_pose(
 
     obj = env.sim.get_rigid_object(entity_cfg.uid)
 
-    return obj.get_local_pose(to_matrix=True)
+    return obj.get_local_pose(to_matrix=to_matrix)
 
 
 def get_rigid_object_velocity(
diff --git a/embodichain/lab/gym/utils/gym_utils.py b/embodichain/lab/gym/utils/gym_utils.py
index 637afcb9..d8b4427f 100644
--- a/embodichain/lab/gym/utils/gym_utils.py
+++ b/embodichain/lab/gym/utils/gym_utils.py
@@ -91,7 +91,9 @@ def convert_observation_to_space(
             torch.float32: np.float32,
             torch.float64: np.float64,
             torch.int8: np.int8,
+            torch.uint8: np.uint8,
             torch.int16: np.int16,
+            torch.uint16: np.uint16,
             torch.int32: np.int32,
             torch.int64: np.int64,
             torch.bool: np.bool_,

From a1c2e4b1b3b44c225c3f1c3d8a1f89d098ab4b95 Mon Sep 17 00:00:00 2001
From: yuecideng <dengyueci@qq.com>
Date: Sat, 21 Feb 2026 10:54:14 +0000
Subject: [PATCH 6/8] wip

---
 .../agent/pour_water_agent/fast_gym_config.json    |  8 +-------
 .../agent/rearrangement_agent/fast_gym_config.json |  8 +-------
 configs/gym/pour_water/gym_config.json             | 14 +-------------
 configs/gym/pour_water/gym_config_simple.json      | 12 +-----------
 configs/gym/special/simple_task_ur10.json          | 11 +----------
 5 files changed, 5 insertions(+), 48 deletions(-)

diff --git a/configs/gym/agent/pour_water_agent/fast_gym_config.json b/configs/gym/agent/pour_water_agent/fast_gym_config.json
index 15c011ec..e56d77b1 100644
--- a/configs/gym/agent/pour_water_agent/fast_gym_config.json
+++ b/configs/gym/agent/pour_water_agent/fast_gym_config.json
@@ -251,14 +251,8 @@
                 "mode": "save",
                 "params": {
                     "robot_meta": {
-                        "arm_dofs": 12,
                         "control_freq": 25,
-                        "control_parts": ["left_arm", "left_eef", "right_arm", "right_eef"],
-                        "observation": {
-                            "vision": {},
-                            "states": ["qpos"]
-                        },
-                        "min_len_steps": 5
+                        "control_parts": ["left_arm", "left_eef", "right_arm", "right_eef"]
                     },
                     "instruction": {
                         "lang": "Pour water from the bottle into the mug."
diff --git a/configs/gym/agent/rearrangement_agent/fast_gym_config.json b/configs/gym/agent/rearrangement_agent/fast_gym_config.json
index 0d03ba65..ec94fb1f 100644
--- a/configs/gym/agent/rearrangement_agent/fast_gym_config.json
+++ b/configs/gym/agent/rearrangement_agent/fast_gym_config.json
@@ -236,14 +236,8 @@
                 "mode": "save",
                 "params": {
                     "robot_meta": {
-                        "arm_dofs": 12,
                         "control_freq": 25,
-                        "control_parts": ["left_arm", "left_eef", "right_arm", "right_eef"],
-                        "observation": {
-                            "vision": {},
-                            "states": ["qpos"]
-                        },
-                        "min_len_steps": 125
+                        "control_parts": ["left_arm", "left_eef", "right_arm", "right_eef"]
                     },
                     "instruction": {
                         "lang": "Place the spoon and fork neatly into the plate on the table."
diff --git a/configs/gym/pour_water/gym_config.json b/configs/gym/pour_water/gym_config.json
index 3e0bcafb..840c3726 100644
--- a/configs/gym/pour_water/gym_config.json
+++ b/configs/gym/pour_water/gym_config.json
@@ -264,20 +264,8 @@
                 "params": {
                     "robot_meta": {
                         "robot_type": "CobotMagic",
-                        "arm_dofs": 12,
                         "control_freq": 25,
-                        "control_parts": ["left_arm", "left_eef", "right_arm", "right_eef"],
-                        "observation": {
-                            "vision": {
-                                "cam_high": ["mask"],
-                                "cam_right_wrist": ["mask"],
-                                "cam_left_wrist": ["mask"]
-                            },
-                            "states": ["qpos"],
-                            "exteroception": ["cam_high", "cam_right_wrist", "cam_left_wrist"]
-                        },
-                        "action": "qpos_with_eef_pose",
-                        "min_len_steps": 5
+                        "control_parts": ["left_arm", "left_eef", "right_arm", "right_eef"]
                     },
                     "instruction": {
                         "lang": "Pour water from bottle to cup"
diff --git a/configs/gym/pour_water/gym_config_simple.json b/configs/gym/pour_water/gym_config_simple.json
index 3b709145..5cf1b217 100644
--- a/configs/gym/pour_water/gym_config_simple.json
+++ b/configs/gym/pour_water/gym_config_simple.json
@@ -213,18 +213,8 @@
                 "params": {
                     "robot_meta": {
                         "robot_type": "CobotMagic",
-                        "arm_dofs": 12,
                         "control_freq": 25,
-                        "control_parts": ["left_arm", "left_eef", "right_arm", "right_eef"],
-                        "observation": {
-                            "vision": {
-                                "cam_high": ["mask"]
-                            },
-                            "states": ["qpos"],
-                            "exteroception": ["cam_high", "cam_right_wrist", "cam_left_wrist"]
-                        },
-                        "action": "qpos_with_eef_pose",
-                        "min_len_steps": 5
+                        "control_parts": ["left_arm", "left_eef", "right_arm", "right_eef"]
                     },
                     "instruction": {
                         "lang": "Pour water from bottle to cup"
diff --git a/configs/gym/special/simple_task_ur10.json b/configs/gym/special/simple_task_ur10.json
index 8596d7a0..1c722b1a 100644
--- a/configs/gym/special/simple_task_ur10.json
+++ b/configs/gym/special/simple_task_ur10.json
@@ -33,16 +33,7 @@
                 "params": {
                     "robot_meta": {
                         "robot_type": "UR10",
-                        "arm_dofs": 6,
-                        "control_freq": 3,
-                        "observation": {
-                            "vision": {
-                                "cam_high": []
-                            },
-                            "states": ["qpos"]
-                        },
-                        "action": "qpos",
-                        "min_len_steps": 5
+                        "control_freq": 3
                     },
                     "instruction": {
                         "lang": "Acting with Oscillatory motion"

From f2f9fd0e9ad04d70d6c0cb7b00f370d489f05e2a Mon Sep 17 00:00:00 2001
From: yuecideng <dengyueci@qq.com>
Date: Sat, 21 Feb 2026 11:58:10 +0000
Subject: [PATCH 7/8] wip

---
 embodichain/lab/gym/envs/base_env.py          |  5 ++
 embodichain/lab/gym/envs/managers/datasets.py | 85 +++++++++++--------
 .../lab/gym/envs/managers/observations.py     | 13 ++-
 tests/agents/test_rl.py                       |  2 -
 4 files changed, 63 insertions(+), 42 deletions(-)

diff --git a/embodichain/lab/gym/envs/base_env.py b/embodichain/lab/gym/envs/base_env.py
index 2471889d..a637dc7d 100644
--- a/embodichain/lab/gym/envs/base_env.py
+++ b/embodichain/lab/gym/envs/base_env.py
@@ -202,6 +202,11 @@ def action_space(self) -> gym.spaces.Space:
     def elapsed_steps(self) -> Union[int, torch.Tensor]:
         return self._elapsed_steps
 
+    @property
+    def has_sensors(self) -> bool:
+        """Return whether the environment has sensors."""
+        return len(self.sensors) > 0
+
     def get_sensor(self, name: str, **kwargs) -> BaseSensor:
         """Get the sensor instance by name.
 
diff --git a/embodichain/lab/gym/envs/managers/datasets.py b/embodichain/lab/gym/envs/managers/datasets.py
index 5d148ec5..b3326236 100644
--- a/embodichain/lab/gym/envs/managers/datasets.py
+++ b/embodichain/lab/gym/envs/managers/datasets.py
@@ -199,7 +199,7 @@ def _save_extra_episode_meta_info(self, env_id: int) -> None:
         curr_extra_episode_info = {}
         if self.extra_episode_info:
             for key, attr_list in self.extra_episode_info.items():
-                if key == "rigid_object_physcis_attributes":
+                if key == "rigid_object_physics_attributes":
                     rigid_obj_list = self._env.sim.get_rigid_object_uid_list()
                     for obj_uid in rigid_obj_list:
                         curr_extra_episode_info[obj_uid] = {}
@@ -304,22 +304,23 @@ def _build_features(self) -> Dict:
             self._joint_ids = self._env.robot.get_joint_ids(remove_mimic=True)
 
         state_dim = len(self._joint_ids)
-        # Create joint names as strings: joint_0, joint_1, ...
-        joint_indices = [f"{i}" for i in self._joint_ids]
+        # Create joint names.
+        joint_names = [self._env.robot.joint_names[i] for i in self._joint_ids]
+
         features["observation.qpos"] = {
             "dtype": "float32",
             "shape": (state_dim,),
-            "names": joint_indices,
+            "names": joint_names,
         }
         features["observation.qvel"] = {
             "dtype": "float32",
             "shape": (state_dim,),
-            "names": joint_indices,
+            "names": joint_names,
         }
         features["observation.qf"] = {
             "dtype": "float32",
             "shape": (state_dim,),
-            "names": joint_indices,
+            "names": joint_names,
         }
 
         # Use full qpos dimension for action (includes gripper)
@@ -327,36 +328,37 @@ def _build_features(self) -> Dict:
         features["action"] = {
             "dtype": "float32",
             "shape": (action_dim,),
-            "names": joint_indices,
+            "names": joint_names,
         }
 
         # Setup sensor observation features based env.observation.sensor
-        sensor_obs_space: dict = self._env.single_observation_space["sensor"]
-
-        for sensor_name, value in sensor_obs_space.items():
-            sensor = self._env.get_sensor(sensor_name)
-            is_stereo = is_stereocam(sensor)
+        if self._env.has_sensors:
+            sensor_obs_space: dict = self._env.single_observation_space["sensor"]
 
-            for frame_name, space in value.items():
-                # TODO: Support depth (uint16) and mask (also uint16 or uint8)
-                if frame_name not in ["color", "color_right"]:
-                    logger.log_error(
-                        f"Only support 'color' frame for vision sensors, but got '{frame_name}' in sensor '{sensor_name}'"
-                    )
+            for sensor_name, value in sensor_obs_space.items():
+                sensor = self._env.get_sensor(sensor_name)
+                is_stereo = is_stereocam(sensor)
 
-                features[f"{sensor_name}.{frame_name}"] = {
-                    "dtype": "video" if self.use_videos else "image",
-                    "shape": (sensor.cfg.height, sensor.cfg.width, 3),
-                    "names": ["height", "width", "channel"],
-                }
+                for frame_name, space in value.items():
+                    # TODO: Support depth (uint16) and mask (also uint16 or uint8)
+                    if frame_name not in ["color", "color_right"]:
+                        logger.log_error(
+                            f"Only support 'color' frame for vision sensors, but got '{frame_name}' in sensor '{sensor_name}'"
+                        )
 
-                if is_stereo:
-                    features[f"{sensor_name}.{frame_name}_right"] = {
+                    features[f"{sensor_name}.{frame_name}"] = {
                         "dtype": "video" if self.use_videos else "image",
                         "shape": (sensor.cfg.height, sensor.cfg.width, 3),
                         "names": ["height", "width", "channel"],
                     }
 
+                    if is_stereo:
+                        features[f"{sensor_name}.{frame_name}_right"] = {
+                            "dtype": "video" if self.use_videos else "image",
+                            "shape": (sensor.cfg.height, sensor.cfg.width, 3),
+                            "names": ["height", "width", "channel"],
+                        }
+
         # TODO: The extra observation features are supposed to be defined in a flattened way in the observation space.
         # Lerobot requires a flat feature dict, so we may need to support nested dicts to flatten dict conversion in the future.
         # Add any extra features specified in observation space excluding 'robot' and 'sensor'
@@ -364,6 +366,13 @@ def _build_features(self) -> Dict:
             if key in ["robot", "sensor"]:
                 continue
 
+            if isinstance(space, gym.spaces.Dict):
+                logger.log_warning(
+                    f"Nested Dict observation space for key '{key}' is not directly supported. "
+                    f"Please flatten it or specify features manually. Skipping '{key}'."
+                )
+                continue
+
             names = key
             if "vel" in key:
                 names = ["lin_x", "lin_y", "lin_z", "ang_x", "ang_y", "ang_z"]
@@ -392,21 +401,23 @@ def _convert_frame_to_lerobot(
             Frame dict in LeRobot format with numpy arrays
         """
         frame = {"task": task}
-        sensor_obs_space: dict = self._env.single_observation_space["sensor"]
 
-        # Add images
-        for sensor_name, value in sensor_obs_space.items():
-            sensor = self._env.get_sensor(sensor_name)
-            is_stereo = is_stereocam(sensor)
+        if self._env.has_sensors:
+            sensor_obs_space: dict = self._env.single_observation_space["sensor"]
 
-            color_data = obs["sensor"][sensor_name]["color"]
-            color_img = color_data[:, :, :3].cpu()
-            frame[f"{sensor_name}.color"] = color_img
+            # Add images
+            for sensor_name, value in sensor_obs_space.items():
+                sensor = self._env.get_sensor(sensor_name)
+                is_stereo = is_stereocam(sensor)
 
-            if is_stereo:
-                color_right_data = obs["sensor"][sensor_name]["color_right"]
-                color_right_img = color_right_data[:, :, :3].cpu()
-                frame[f"{sensor_name}.color_right"] = color_right_img
+                color_data = obs["sensor"][sensor_name]["color"]
+                color_img = color_data[:, :, :3].cpu()
+                frame[f"{sensor_name}.color"] = color_img
+
+                if is_stereo:
+                    color_right_data = obs["sensor"][sensor_name]["color_right"]
+                    color_right_img = color_right_data[:, :, :3].cpu()
+                    frame[f"{sensor_name}.color_right"] = color_right_img
 
         # Add state
         frame["observation.qpos"] = obs["robot"]["qpos"][self._joint_ids].cpu()
diff --git a/embodichain/lab/gym/envs/managers/observations.py b/embodichain/lab/gym/envs/managers/observations.py
index abf220af..537485af 100644
--- a/embodichain/lab/gym/envs/managers/observations.py
+++ b/embodichain/lab/gym/envs/managers/observations.py
@@ -52,11 +52,18 @@ def get_rigid_object_pose(
         to_matrix: Whether to return the pose as a 4x4 transformation matrix. If False, returns as (position, quaternion).
 
     Returns:
-        A tensor of shape (num_envs, 4, 4) representing the world poses of the rigid objects.
+        A tensor of shape (num_envs, 7) or (num_envs, 4, 4) representing the world poses of the rigid objects.
     """
 
     if entity_cfg.uid not in env.sim.get_rigid_object_uid_list():
-        return torch.zeros((env.num_envs, 4, 4), dtype=torch.float32)
+        if to_matrix:
+            return torch.zeros(
+                (env.num_envs, 4, 4), dtype=torch.float32, device=env.device
+            )
+        else:
+            return torch.zeros(
+                (env.num_envs, 7), dtype=torch.float32, device=env.device
+            )
 
     obj = env.sim.get_rigid_object(entity_cfg.uid)
 
@@ -83,7 +90,7 @@ def get_rigid_object_velocity(
     """
 
     if entity_cfg.uid not in env.sim.get_rigid_object_uid_list():
-        return torch.zeros((env.num_envs, 6), dtype=torch.float32)
+        return torch.zeros((env.num_envs, 6), dtype=torch.float32, device=env.device)
 
     obj = env.sim.get_rigid_object(entity_cfg.uid)
 
diff --git a/tests/agents/test_rl.py b/tests/agents/test_rl.py
index 09fdeeff..77246fe6 100644
--- a/tests/agents/test_rl.py
+++ b/tests/agents/test_rl.py
@@ -48,8 +48,6 @@ def setup_method(self):
                     "robot_meta": {
                         "robot_type": "UR10_DH_Gripper",
                         "control_freq": 25,
-                        "arm_dofs": 6,
-                        "observation": {"vision": {}, "states": ["qpos"]},
                     },
                     "instruction": {"lang": "push_cube_to_target"},
                     "extra": {

From 4926e1ff6e4cba2edb3e7b7b35c1a63a2b9b16c9 Mon Sep 17 00:00:00 2001
From: yuecideng <dengyueci@qq.com>
Date: Sat, 21 Feb 2026 13:31:26 +0000
Subject: [PATCH 8/8] wip

---
 tests/agents/test_rl.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/agents/test_rl.py b/tests/agents/test_rl.py
index 77246fe6..e0a5beaf 100644
--- a/tests/agents/test_rl.py
+++ b/tests/agents/test_rl.py
@@ -91,6 +91,11 @@ def teardown_method(self):
             os.remove(self.temp_gym_config_path)
             self.temp_gym_config_path = None
 
+        from embodichain.lab.sim import SimulationManager
+
+        sim = SimulationManager.get_instance()
+        sim.destroy()
+
     def test_training_pipeline(self):
         """Test RL training pipeline with multiple parallel environments."""
         from embodichain.agents.rl.train import train_from_config