From 9a49bedb889a98016d81ef8ffa1f158a7e290041 Mon Sep 17 00:00:00 2001 From: yuanhaonan Date: Tue, 20 Jan 2026 15:09:41 +0800 Subject: [PATCH 01/12] action preprocess --- configs/agents/rl/push_cube/gym_config.json | 1 + configs/agents/rl/push_cube/train_config.json | 4 +- embodichain/agents/rl/algo/ppo.py | 6 +- embodichain/agents/rl/utils/trainer.py | 9 +- embodichain/lab/gym/envs/__init__.py | 2 + embodichain/lab/gym/envs/base_env.py | 18 +- embodichain/lab/gym/envs/embodied_env.py | 23 +- embodichain/lab/gym/envs/managers/rewards.py | 29 +- embodichain/lab/gym/envs/rl_env.py | 295 ++++++++++++++++++ .../lab/gym/envs/tasks/rl/push_cube.py | 103 +----- 10 files changed, 383 insertions(+), 107 deletions(-) create mode 100644 embodichain/lab/gym/envs/rl_env.py diff --git a/configs/agents/rl/push_cube/gym_config.json b/configs/agents/rl/push_cube/gym_config.json index 766d12c4..fce093dd 100644 --- a/configs/agents/rl/push_cube/gym_config.json +++ b/configs/agents/rl/push_cube/gym_config.json @@ -114,6 +114,7 @@ }, "extensions": { "obs_mode": "state", + "action_type": "delta_qpos", "episode_length": 100, "joint_limits": 0.5, "action_scale": 0.1, diff --git a/configs/agents/rl/push_cube/train_config.json b/configs/agents/rl/push_cube/train_config.json index 3c3bda05..3b5c8e3a 100644 --- a/configs/agents/rl/push_cube/train_config.json +++ b/configs/agents/rl/push_cube/train_config.json @@ -4,10 +4,10 @@ "gym_config": "configs/agents/rl/push_cube/gym_config.json", "seed": 42, "device": "cuda:0", - "headless": false, + "headless": true, "enable_rt": false, "gpu_id": 0, - "num_envs": 8, + "num_envs": 512, "iterations": 1000, "rollout_steps": 1024, "eval_freq": 200, diff --git a/embodichain/agents/rl/algo/ppo.py b/embodichain/agents/rl/algo/ppo.py index 5253e897..8f26905b 100644 --- a/embodichain/agents/rl/algo/ppo.py +++ b/embodichain/agents/rl/algo/ppo.py @@ -94,8 +94,12 @@ def collect_rollout( current_obs, deterministic=False ) + # Wrap action as dict for env processing + action_type = getattr(env, "action_type", "qpos") + action_dict = {action_type: actions} + # Step environment - result = env.step(actions) + result = env.step(action_dict) next_obs, reward, terminated, truncated, env_info = result done = terminated | truncated # Light dtype normalization diff --git a/embodichain/agents/rl/utils/trainer.py b/embodichain/agents/rl/utils/trainer.py index 34dc1919..94efaecf 100644 --- a/embodichain/agents/rl/utils/trainer.py +++ b/embodichain/agents/rl/utils/trainer.py @@ -239,7 +239,14 @@ def _eval_once(self, num_episodes: int = 5): ep_ret = torch.zeros(num_envs_eval, dtype=torch.float32, device=self.device) while not done_any.any(): actions, _, _ = self.policy.get_action(obs, deterministic=True) - obs, reward, terminated, truncated, info = self.eval_env.step(actions) + + # Wrap action as dict for env processing + action_type = getattr(self.eval_env, "action_type", "qpos") + action_dict = {action_type: actions} + + obs, reward, terminated, truncated, info = self.eval_env.step( + action_dict + ) # Flatten dict observation from step if isinstance(obs, dict): diff --git a/embodichain/lab/gym/envs/__init__.py b/embodichain/lab/gym/envs/__init__.py index 88257690..4bd4321f 100644 --- a/embodichain/lab/gym/envs/__init__.py +++ b/embodichain/lab/gym/envs/__init__.py @@ -16,10 +16,12 @@ from .base_env import * from .embodied_env import * +from .rl_env import * from .tasks import * from .wrapper import * from embodichain.lab.gym.envs.embodied_env import EmbodiedEnv +from embodichain.lab.gym.envs.rl_env 
import RLEnv # Specific task environments from embodichain.lab.gym.envs.tasks.tableware.pour_water.pour_water import ( diff --git a/embodichain/lab/gym/envs/base_env.py b/embodichain/lab/gym/envs/base_env.py index 99743607..fd0f873c 100644 --- a/embodichain/lab/gym/envs/base_env.py +++ b/embodichain/lab/gym/envs/base_env.py @@ -482,6 +482,22 @@ def get_reward( return rewards + def _preprocess_action(self, action: EnvAction) -> EnvAction: + """Preprocess action before sending to robot. + + Override this method to add custom preprocessing like: + - Action scaling + - Coordinate transformation (e.g., EEF pose to joint positions) + - Action space conversion + + Args: + action: Raw action from policy + + Returns: + Preprocessed action + """ + return action + def _step_action(self, action: EnvAction) -> EnvAction: """Set action control command into simulation. @@ -536,7 +552,7 @@ def step( """ self._elapsed_steps += 1 - # TODO: may be add hook for action preprocessing. + action = self._preprocess_action(action=action) action = self._step_action(action=action) self.sim.update(self.sim_cfg.physics_dt, self.cfg.sim_steps_per_control) self._update_sim_state(**kwargs) diff --git a/embodichain/lab/gym/envs/embodied_env.py b/embodichain/lab/gym/envs/embodied_env.py index 4e6b1b98..c449c10a 100644 --- a/embodichain/lab/gym/envs/embodied_env.py +++ b/embodichain/lab/gym/envs/embodied_env.py @@ -344,16 +344,31 @@ def _initialize_episode( def _step_action(self, action: EnvAction) -> EnvAction: """Set action control command into simulation. + Supports multiple action formats: + 1. torch.Tensor: Interpreted as qpos (joint positions) + 2. Dict with keys: + - "qpos": Joint positions + - "qvel": Joint velocities + - "qf": Joint forces/torques + Args: action: The action applied to the robot agent. Returns: The action return. """ - # TODO: Support data structure action input such as struct. - qpos = action - - self.robot.set_qpos(qpos=qpos) + if isinstance(action, dict): + # Support multiple control modes simultaneously + if "qpos" in action: + self.robot.set_qpos(qpos=action["qpos"]) + if "qvel" in action: + self.robot.set_qvel(qvel=action["qvel"]) + if "qf" in action: + self.robot.set_qf(qf=action["qf"]) + elif isinstance(action, torch.Tensor): + self.robot.set_qpos(qpos=action) + else: + logger.error(f"Unsupported action type: {type(action)}") return action diff --git a/embodichain/lab/gym/envs/managers/rewards.py b/embodichain/lab/gym/envs/managers/rewards.py index 5cfbef55..2ac38ea2 100644 --- a/embodichain/lab/gym/envs/managers/rewards.py +++ b/embodichain/lab/gym/envs/managers/rewards.py @@ -30,7 +30,7 @@ def distance_between_objects( env: EmbodiedEnv, obs: dict, - action: torch.Tensor, + action: dict, info: dict, source_entity_cfg: SceneEntityCfg = None, target_entity_cfg: SceneEntityCfg = None, @@ -92,7 +92,7 @@ def distance_between_objects( def joint_velocity_penalty( env: EmbodiedEnv, obs: dict, - action: torch.Tensor, + action: dict, info: dict, robot_uid: str = "robot", joint_ids: slice | list[int] | None = None, @@ -146,7 +146,7 @@ def joint_velocity_penalty( def action_smoothness_penalty( env: EmbodiedEnv, obs: dict, - action: torch.Tensor, + action: dict, info: dict, ) -> torch.Tensor: """Penalize large action changes between consecutive timesteps. 
@@ -171,20 +171,27 @@ def action_smoothness_penalty( } ``` """ + # Extract action tensor from dict (priority order) + action_tensor = None + for key in ["qpos", "delta_qpos", "qvel", "qf", "eef_pose"]: + if key in action: + action_tensor = action[key] + break + # Use dictionary-based state management if not hasattr(env, "_reward_states"): env._reward_states = {} # compute difference between current and previous action if "prev_actions" in env._reward_states: - action_diff = action - env._reward_states["prev_actions"] + action_diff = action_tensor - env._reward_states["prev_actions"] penalty = -torch.norm(action_diff, dim=-1) else: # no previous action, no penalty penalty = torch.zeros(env.num_envs, device=env.device) # store current action for next step - env._reward_states["prev_actions"] = action.clone() + env._reward_states["prev_actions"] = action_tensor.clone() return penalty @@ -192,7 +199,7 @@ def action_smoothness_penalty( def joint_limit_penalty( env: EmbodiedEnv, obs: dict, - action: torch.Tensor, + action: dict, info: dict, robot_uid: str = "robot", joint_ids: slice | list[int] = slice(None), @@ -258,7 +265,7 @@ def joint_limit_penalty( def orientation_alignment( env: EmbodiedEnv, obs: dict, - action: torch.Tensor, + action: dict, info: dict, source_entity_cfg: SceneEntityCfg = None, target_entity_cfg: SceneEntityCfg = None, @@ -313,7 +320,7 @@ def orientation_alignment( def success_reward( env: EmbodiedEnv, obs: dict, - action: torch.Tensor, + action: dict, info: dict, ) -> torch.Tensor: """Sparse bonus reward when task succeeds. @@ -361,7 +368,7 @@ def success_reward( def reaching_behind_object( env: EmbodiedEnv, obs: dict, - action: torch.Tensor, + action: dict, info: dict, object_cfg: SceneEntityCfg = None, target_pose_key: str = "goal_pose", @@ -452,7 +459,7 @@ def reaching_behind_object( def distance_to_target( env: "EmbodiedEnv", obs: dict, - action: torch.Tensor, + action: dict, info: dict, source_entity_cfg: SceneEntityCfg = None, target_pose_key: str = "target_pose", @@ -529,7 +536,7 @@ def distance_to_target( def incremental_distance_to_target( env: "EmbodiedEnv", obs: dict, - action: torch.Tensor, + action: dict, info: dict, source_entity_cfg: SceneEntityCfg = None, target_pose_key: str = "target_pose", diff --git a/embodichain/lab/gym/envs/rl_env.py b/embodichain/lab/gym/envs/rl_env.py new file mode 100644 index 00000000..986c9f31 --- /dev/null +++ b/embodichain/lab/gym/envs/rl_env.py @@ -0,0 +1,295 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2025 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ---------------------------------------------------------------------------- + +"""Base environment for reinforcement learning tasks.""" + +import torch +from typing import Dict, Any, Sequence, Optional, Tuple, Literal + +from embodichain.lab.gym.envs import EmbodiedEnv, EmbodiedEnvCfg +from embodichain.lab.sim.cfg import MarkerCfg +from embodichain.lab.sim.types import EnvObs, EnvAction +from embodichain.utils.math import matrix_from_quat, matrix_from_euler + + +__all__ = ["RLEnv"] + + +class RLEnv(EmbodiedEnv): + """Base class for reinforcement learning tasks. + + Provides common utilities for RL tasks: + - Goal pose management and visualization + - Flexible action preprocessing (scaling, IK, normalization) + - Standardized info dictionary structure + """ + + def __init__(self, cfg: EmbodiedEnvCfg = None, **kwargs): + if cfg is None: + cfg = EmbodiedEnvCfg() + super().__init__(cfg, **kwargs) + + @property + def goal_pose(self) -> Optional[torch.Tensor]: + """Get current goal poses (4x4 matrices) for all environments.""" + return self._goal_pose + + def draw_goal_marker(self, env_ids: Sequence[int] | None = None): + """Draw axis markers at goal positions for visualization. + + Args: + env_ids: Environment IDs to draw markers for (None = all envs) + """ + if self.goal_pose is None: + return + + if env_ids is None: + env_ids = range(self.num_envs) + + for arena_idx in env_ids: + marker_name = f"goal_marker_{arena_idx}" + self.sim.remove_marker(marker_name) + goal_pose = self.goal_pose[arena_idx].detach().cpu().numpy() + marker_cfg = MarkerCfg( + name=marker_name, + marker_type="axis", + axis_xpos=[goal_pose], + axis_size=0.003, + axis_len=0.02, + arena_index=arena_idx, + ) + self.sim.draw_marker(cfg=marker_cfg) + + def _preprocess_action(self, action: EnvAction) -> EnvAction: + """Preprocess action for RL tasks with flexible transformation. + + Supports multiple action formats: + 1. Dict input (keys specify action type): + - {"delta_qpos": tensor}: Delta joint positions (scaled and added to current) + - {"qpos": tensor}: Absolute joint positions (scaled) + - {"qpos_normalized": tensor}: Normalized qpos in [-1, 1] + - {"eef_pose": tensor}: End-effector pose (6D or 7D) converted via IK + - {"qvel": tensor}: Joint velocities (scaled) + - {"qf": tensor}: Joint forces/torques (scaled) + + 2. 
Tensor input: Interpreted based on self.action_type attribute + (default: "qpos") + + Args: + action: Raw action from policy (tensor or dict) + + Returns: + Dict action ready for robot control + """ + # Convert tensor input to dict based on action_type + if not isinstance(action, dict): + action_type = getattr(self, "action_type", "qpos") + action = {action_type: action} + + # Step 1: Scale all action values by action_scale + scaled_action = {} + for key, value in action.items(): + if isinstance(value, torch.Tensor): + scaled_action[key] = value * self.action_scale + else: + scaled_action[key] = value + + # Step 2: Process based on dict keys + result = {} + + if "qpos" in scaled_action: + result["qpos"] = scaled_action["qpos"] + elif "delta_qpos" in scaled_action: + result["qpos"] = self._process_delta_qpos(scaled_action["delta_qpos"]) + elif "qpos_normalized" in scaled_action: + result["qpos"] = self._denormalize_action(scaled_action["qpos_normalized"]) + elif "eef_pose" in scaled_action: + result["qpos"] = self._process_eef_pose(scaled_action["eef_pose"]) + + # Velocity and force controls + if "qvel" in scaled_action: + result["qvel"] = scaled_action["qvel"] + + if "qf" in scaled_action: + result["qf"] = scaled_action["qf"] + + return result + + def _denormalize_action(self, action: torch.Tensor) -> torch.Tensor: + """Denormalize action from [-1, 1] to actual range. + + Args: + action: Normalized action in [-1, 1] + + Returns: + Denormalized action + """ + qpos_limits = self.robot.body_data.qpos_limits[0] + low = qpos_limits[:, 0] + high = qpos_limits[:, 1] + + # Map [-1, 1] to [low, high] + return low + (action + 1.0) * 0.5 * (high - low) + + def _process_delta_qpos(self, action: torch.Tensor) -> torch.Tensor: + """Process delta joint position action. + + Args: + action: Delta joint positions + + Returns: + Absolute joint positions + """ + current_qpos = self.robot.get_qpos() + return current_qpos + action + + def _process_eef_pose(self, action: torch.Tensor) -> torch.Tensor: + """Process end-effector pose action via inverse kinematics. + + Args: + action: End-effector pose (position + orientation) + Shape: (num_envs, 6) for pos+euler or (num_envs, 7) for pos+quat + + Returns: + Joint positions from IK + """ + # Auto-select control part if not specified + control_part_name = self.control_part_name + if control_part_name is None: + if self.robot.control_parts is None: + raise ValueError( + "Robot has no control_parts defined for eef_pose action" + ) + + available_parts = list(self.robot.control_parts.keys()) + if not available_parts: + raise ValueError( + "Robot has no control_parts defined for eef_pose action" + ) + + if len(available_parts) == 1: + control_part_name = available_parts[0] + else: + raise ValueError( + f"Multiple control_parts found {available_parts}. " + f"Please specify control_part_name explicitly." 
+ ) + + # Get current joint positions as IK seed + current_qpos = self.robot.get_qpos() + control_joint_ids = self.robot.get_joint_ids(name=control_part_name) + seed_qpos = current_qpos[:, control_joint_ids] + + # Convert action to target pose matrix (4x4) + batch_size = action.shape[0] + target_pose = ( + torch.eye(4, device=self.device).unsqueeze(0).repeat(batch_size, 1, 1) + ) + + if action.shape[-1] == 6: + # pos (3) + euler angles (3) + target_pose[:, :3, 3] = action[:, :3] + target_pose[:, :3, :3] = matrix_from_euler(action[:, 3:6]) + elif action.shape[-1] == 7: + # pos (3) + quaternion (4) + target_pose[:, :3, 3] = action[:, :3] + target_pose[:, :3, :3] = matrix_from_quat(action[:, 3:7]) + else: + raise ValueError( + f"EEF pose action must be 6D or 7D, got {action.shape[-1]}D" + ) + + # Solve IK for each environment + ik_solutions = [] + for env_idx in range(self.num_envs): + qpos_ik = self.robot.compute_ik( + pose=target_pose[env_idx], + joint_seed=seed_qpos[env_idx], + name=control_part_name, + ) + ik_solutions.append(qpos_ik) + + # Update full qpos with IK solutions + result_qpos = current_qpos.clone() + ik_qpos = torch.stack(ik_solutions, dim=0) + result_qpos[:, control_joint_ids] = ik_qpos + + return result_qpos + + def compute_task_state( + self, **kwargs + ) -> Tuple[torch.Tensor, torch.Tensor, Dict[str, Any]]: + """Compute task-specific state: success, fail, and metrics. + + Override this method in subclass to define task-specific logic. + + Returns: + Tuple of (success, fail, metrics): + - success: Boolean tensor of shape (num_envs,) + - fail: Boolean tensor of shape (num_envs,) + - metrics: Dict of metric tensors + """ + success = torch.zeros(self.num_envs, device=self.device, dtype=torch.bool) + fail = torch.zeros(self.num_envs, device=self.device, dtype=torch.bool) + metrics = {} + return success, fail, metrics + + def get_info(self, **kwargs) -> Dict[str, Any]: + """Get environment info dictionary. + + Calls compute_task_state() to get task-specific success/fail/metrics. + Subclasses should override compute_task_state() instead of this method. + + Returns: + Info dictionary with success, fail, elapsed_steps, goal_pose, metrics + """ + success, fail, metrics = self.compute_task_state(**kwargs) + + info = { + "success": success, + "fail": fail, + "elapsed_steps": self._elapsed_steps, + "goal_pose": self.goal_pose, + } + info["metrics"] = metrics + return info + + def check_truncated(self, obs: EnvObs, info: Dict[str, Any]) -> torch.Tensor: + """Check if episode should be truncated (timeout). + + Args: + obs: Current observation + info: Info dictionary + + Returns: + Boolean tensor of shape (num_envs,) + """ + return self._elapsed_steps >= self.episode_length + + def evaluate(self, **kwargs) -> Dict[str, Any]: + """Evaluate the environment state. 
+ + Returns: + Evaluation dictionary with success and metrics + """ + info = self.get_info(**kwargs) + eval_dict = { + "success": info["success"][0].item(), + } + if "metrics" in info: + for key, value in info["metrics"].items(): + eval_dict[key] = value + return eval_dict diff --git a/embodichain/lab/gym/envs/tasks/rl/push_cube.py b/embodichain/lab/gym/envs/tasks/rl/push_cube.py index 072574fc..276a433a 100644 --- a/embodichain/lab/gym/envs/tasks/rl/push_cube.py +++ b/embodichain/lab/gym/envs/tasks/rl/push_cube.py @@ -15,19 +15,16 @@ # ---------------------------------------------------------------------------- import torch -import numpy as np -from typing import Dict, Any, Sequence -from gymnasium import spaces +from typing import Dict, Any, Tuple from embodichain.lab.gym.utils.registration import register_env -from embodichain.lab.gym.envs import EmbodiedEnv, EmbodiedEnvCfg -from embodichain.lab.sim.cfg import MarkerCfg -from embodichain.lab.sim.types import EnvObs, EnvAction -from embodichain.utils import logger +from embodichain.lab.gym.envs.rl_env import RLEnv +from embodichain.lab.gym.envs import EmbodiedEnvCfg +from embodichain.lab.sim.types import EnvObs @register_env("PushCubeRL", max_episode_steps=50, override=True) -class PushCubeEnv(EmbodiedEnv): +class PushCubeEnv(RLEnv): """Push cube task for reinforcement learning. The task involves pushing a cube to a target goal position using a robotic arm. @@ -37,100 +34,32 @@ class PushCubeEnv(EmbodiedEnv): def __init__(self, cfg=None, **kwargs): if cfg is None: cfg = EmbodiedEnvCfg() - super().__init__(cfg, **kwargs) - @property - def goal_pose(self) -> torch.Tensor: - """Get current goal poses (4x4 matrices) for all environments.""" - return self._goal_pose - - def _draw_goal_marker(self): - """Draw axis marker at goal position for visualization.""" - goal_sphere = self.sim.get_rigid_object("goal_sphere") - if goal_sphere is None: - return - - num_envs = self.cfg.num_envs - - # Get actual goal positions from each arena - goal_poses = goal_sphere.get_local_pose(to_matrix=True) # (num_envs, 4, 4) - - # Draw marker for each arena separately - for arena_idx in range(num_envs): - marker_name = f"goal_marker_{arena_idx}" - - self.sim.remove_marker(marker_name) - - goal_pose = goal_poses[arena_idx].detach().cpu().numpy() - marker_cfg = MarkerCfg( - name=marker_name, - marker_type="axis", - axis_xpos=[goal_pose], - axis_size=0.003, - axis_len=0.02, - arena_index=arena_idx, - ) - self.sim.draw_marker(cfg=marker_cfg) - - def _initialize_episode( - self, env_ids: Sequence[int] | None = None, **kwargs - ) -> None: - super()._initialize_episode(env_ids=env_ids, **kwargs) - - # Draw marker at goal position - # self._draw_goal_marker() - - def _step_action(self, action: EnvAction) -> EnvAction: - scaled_action = action * self.action_scale - scaled_action = torch.clamp( - scaled_action, -self.joint_limits, self.joint_limits - ) - current_qpos = self.robot.body_data.qpos - target_qpos = current_qpos.clone() - target_qpos[:, :6] += scaled_action[:, :6] - self.robot.set_qpos(qpos=target_qpos) - return scaled_action - - def get_info(self, **kwargs) -> Dict[str, Any]: + def compute_task_state( + self, **kwargs + ) -> Tuple[torch.Tensor, torch.Tensor, Dict[str, Any]]: cube = self.sim.get_rigid_object("cube") - cube_pos = cube.body_data.pose[:, :3] + cube_pos = cube.get_local_pose(to_matrix=True)[:, :3, 3] - # Get goal position from event-managed goal pose if self.goal_pose is not None: goal_pos = self.goal_pose[:, :3, 3] xy_distance = 
torch.norm(cube_pos[:, :2] - goal_pos[:, :2], dim=1) is_success = xy_distance < self.success_threshold else: - # Goal not yet set by randomize_target_pose event (e.g., before first reset) - xy_distance = torch.zeros(self.cfg.num_envs, device=self.device) + xy_distance = torch.zeros(self.num_envs, device=self.device) is_success = torch.zeros( - self.cfg.num_envs, device=self.device, dtype=torch.bool + self.num_envs, device=self.device, dtype=torch.bool ) - info = { - "success": is_success, - "fail": torch.zeros( - self.cfg.num_envs, device=self.device, dtype=torch.bool - ), - "elapsed_steps": self._elapsed_steps, - "goal_pose": self.goal_pose, - } - info["metrics"] = { - "distance_to_goal": xy_distance, - } - return info + is_fail = torch.zeros(self.num_envs, device=self.device, dtype=torch.bool) + metrics = {"distance_to_goal": xy_distance} + + return is_success, is_fail, metrics def check_truncated(self, obs: EnvObs, info: Dict[str, Any]) -> torch.Tensor: is_timeout = self._elapsed_steps >= self.episode_length cube = self.sim.get_rigid_object("cube") - cube_pos = cube.body_data.pose[:, :3] + cube_pos = cube.get_local_pose(to_matrix=True)[:, :3, 3] is_fallen = cube_pos[:, 2] < -0.1 return is_timeout | is_fallen - - def evaluate(self, **kwargs) -> Dict[str, Any]: - info = self.get_info(**kwargs) - return { - "success": info["success"][0].item(), - "distance_to_goal": info["distance_to_goal"], - } From 61a497ddcbe53e5987aa9152cf8c3115582d83b1 Mon Sep 17 00:00:00 2001 From: yuanhaonan Date: Tue, 20 Jan 2026 15:37:17 +0800 Subject: [PATCH 02/12] wip --- configs/agents/rl/push_cube/gym_config.json | 2 -- docs/source/overview/gym/env.md | 5 +++-- docs/source/tutorial/rl.rst | 2 +- embodichain/agents/rl/algo/ppo.py | 2 +- embodichain/agents/rl/utils/trainer.py | 2 +- embodichain/lab/gym/envs/embodied_env.py | 3 +-- embodichain/lab/gym/envs/managers/rewards.py | 5 ++--- embodichain/lab/gym/envs/rl_env.py | 10 ++++++++-- 8 files changed, 17 insertions(+), 14 deletions(-) diff --git a/configs/agents/rl/push_cube/gym_config.json b/configs/agents/rl/push_cube/gym_config.json index fce093dd..3e0e0445 100644 --- a/configs/agents/rl/push_cube/gym_config.json +++ b/configs/agents/rl/push_cube/gym_config.json @@ -113,10 +113,8 @@ } }, "extensions": { - "obs_mode": "state", "action_type": "delta_qpos", "episode_length": 100, - "joint_limits": 0.5, "action_scale": 0.1, "success_threshold": 0.1 } diff --git a/docs/source/overview/gym/env.md b/docs/source/overview/gym/env.md index a15d26f9..68f4d585 100644 --- a/docs/source/overview/gym/env.md +++ b/docs/source/overview/gym/env.md @@ -77,7 +77,7 @@ The {class}`~envs.EmbodiedEnvCfg` class exposes the following additional paramet Dataset collection settings. Defaults to None, in which case no dataset collection is performed. Please refer to the {class}`~envs.managers.DatasetManager` class for more details. * **extensions** (Union[Dict[str, Any], None]): - Task-specific extension parameters that are automatically bound to the environment instance. This allows passing custom parameters (e.g., ``episode_length``, ``obs_mode``, ``action_scale``) without modifying the base configuration class. These parameters are accessible as instance attributes after environment initialization. For example, if ``extensions = {"episode_length": 500}``, you can access it via ``self.episode_length``. Defaults to None. + Task-specific extension parameters that are automatically bound to the environment instance. 
This allows passing custom parameters (e.g., ``episode_length``, ``action_type``, ``action_scale``) without modifying the base configuration class. These parameters are accessible as instance attributes after environment initialization. For example, if ``extensions = {"episode_length": 500}``, you can access it via ``self.episode_length``. Defaults to None. * **filter_visual_rand** (bool): Whether to filter out visual randomization functors. Useful for debugging motion and physics issues when visual randomization interferes with the debugging process. Defaults to ``False``. @@ -108,7 +108,8 @@ class MyTaskEnvCfg(EmbodiedEnvCfg): # 4. Task Extensions extensions = { # Task-specific parameters "episode_length": 500, - "obs_mode": "state", + "action_type": "delta_qpos", + "action_scale": 0.1, } ``` diff --git a/docs/source/tutorial/rl.rst b/docs/source/tutorial/rl.rst index 7be8c73c..ac05f64f 100644 --- a/docs/source/tutorial/rl.rst +++ b/docs/source/tutorial/rl.rst @@ -86,7 +86,7 @@ Example: "id": "PushCubeRL", "cfg": { "num_envs": 4, - "obs_mode": "state", + "action_type": "delta_qpos", "episode_length": 100, "action_scale": 0.1, "success_threshold": 0.1 diff --git a/embodichain/agents/rl/algo/ppo.py b/embodichain/agents/rl/algo/ppo.py index 8f26905b..f11fbe37 100644 --- a/embodichain/agents/rl/algo/ppo.py +++ b/embodichain/agents/rl/algo/ppo.py @@ -95,7 +95,7 @@ def collect_rollout( ) # Wrap action as dict for env processing - action_type = getattr(env, "action_type", "qpos") + action_type = getattr(env, "action_type", "delta_qpos") action_dict = {action_type: actions} # Step environment diff --git a/embodichain/agents/rl/utils/trainer.py b/embodichain/agents/rl/utils/trainer.py index 94efaecf..333a0bcd 100644 --- a/embodichain/agents/rl/utils/trainer.py +++ b/embodichain/agents/rl/utils/trainer.py @@ -241,7 +241,7 @@ def _eval_once(self, num_episodes: int = 5): actions, _, _ = self.policy.get_action(obs, deterministic=True) # Wrap action as dict for env processing - action_type = getattr(self.eval_env, "action_type", "qpos") + action_type = getattr(self.eval_env, "action_type", "delta_qpos") action_dict = {action_type: actions} obs, reward, terminated, truncated, info = self.eval_env.step( diff --git a/embodichain/lab/gym/envs/embodied_env.py b/embodichain/lab/gym/envs/embodied_env.py index c449c10a..9e787bb2 100644 --- a/embodichain/lab/gym/envs/embodied_env.py +++ b/embodichain/lab/gym/envs/embodied_env.py @@ -110,10 +110,9 @@ class EnvLightCfg: This field can be used to pass additional parameters that are specific to certain environments or tasks without modifying the base configuration class. 
For example: - - obs_mode: Observation mode (e.g., "state", "image") - episode_length: Maximum episode length - - joint_limits: Joint limit constraints - action_scale: Action scaling factor + - action_type: Action type (e.g., "delta_qpos", "qpos", "qvel") - vr_joint_mapping: VR joint mapping for teleoperation - control_frequency: Control frequency for VR teleoperation """ diff --git a/embodichain/lab/gym/envs/managers/rewards.py b/embodichain/lab/gym/envs/managers/rewards.py index 2ac38ea2..dfd2872b 100644 --- a/embodichain/lab/gym/envs/managers/rewards.py +++ b/embodichain/lab/gym/envs/managers/rewards.py @@ -171,9 +171,8 @@ def action_smoothness_penalty( } ``` """ - # Extract action tensor from dict (priority order) - action_tensor = None - for key in ["qpos", "delta_qpos", "qvel", "qf", "eef_pose"]: + # Extract action tensor from dict + for key in ["qpos", "qvel", "qf"]: if key in action: action_tensor = action[key] break diff --git a/embodichain/lab/gym/envs/rl_env.py b/embodichain/lab/gym/envs/rl_env.py index 986c9f31..3a8f4118 100644 --- a/embodichain/lab/gym/envs/rl_env.py +++ b/embodichain/lab/gym/envs/rl_env.py @@ -17,7 +17,7 @@ """Base environment for reinforcement learning tasks.""" import torch -from typing import Dict, Any, Sequence, Optional, Tuple, Literal +from typing import Dict, Any, Sequence, Optional, Tuple from embodichain.lab.gym.envs import EmbodiedEnv, EmbodiedEnvCfg from embodichain.lab.sim.cfg import MarkerCfg @@ -42,6 +42,12 @@ def __init__(self, cfg: EmbodiedEnvCfg = None, **kwargs): cfg = EmbodiedEnvCfg() super().__init__(cfg, **kwargs) + # Set default values for common RL parameters + if not hasattr(self, "action_scale"): + self.action_scale = 1.0 + if not hasattr(self, "episode_length"): + self.episode_length = 1000 + @property def goal_pose(self) -> Optional[torch.Tensor]: """Get current goal poses (4x4 matrices) for all environments.""" @@ -96,7 +102,7 @@ def _preprocess_action(self, action: EnvAction) -> EnvAction: """ # Convert tensor input to dict based on action_type if not isinstance(action, dict): - action_type = getattr(self, "action_type", "qpos") + action_type = getattr(self, "action_type", "delta_qpos") action = {action_type: action} # Step 1: Scale all action values by action_scale From 22af95655240d9ffc513f1f3426f7cca49fa987e Mon Sep 17 00:00:00 2001 From: yuanhaonan Date: Tue, 20 Jan 2026 15:42:08 +0800 Subject: [PATCH 03/12] update rl docs --- docs/source/overview/gym/env.md | 99 +++++++++++++++++++++++++-------- docs/source/tutorial/rl.rst | 70 ++++++++++++++++++----- 2 files changed, 133 insertions(+), 36 deletions(-) diff --git a/docs/source/overview/gym/env.md b/docs/source/overview/gym/env.md index 68f4d585..88f9cac9 100644 --- a/docs/source/overview/gym/env.md +++ b/docs/source/overview/gym/env.md @@ -5,14 +5,19 @@ The {class}`~envs.EmbodiedEnv` is the core environment class in EmbodiChain designed for complex Embodied AI tasks. It adopts a **configuration-driven** architecture, allowing users to define robots, sensors, objects, lighting, and automated behaviors (events) purely through configuration classes, minimizing the need for boilerplate code. +For **Reinforcement Learning** tasks, EmbodiChain provides {class}`~envs.RLEnv`, a specialized subclass that extends {class}`~envs.EmbodiedEnv` with RL-specific utilities such as flexible action preprocessing, goal management, and standardized info structure. 
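+
+The snippet below is a minimal stepping sketch: it assumes ``env`` is an already-constructed {class}`~envs.RLEnv` subclass (such as ``PushCubeRL``) and ``actions`` is the raw policy output tensor; construction from a gym config is omitted here. The action is wrapped in a dict whose key selects the control mode:
+
+```python
+# `env` and `actions` are assumed to exist (see note above).
+# The dict key tells RLEnv._preprocess_action how to interpret the tensor,
+# e.g. "delta_qpos" means scaled increments added to the current joint positions.
+action_type = getattr(env, "action_type", "delta_qpos")
+action_dict = {action_type: actions}
+
+obs, reward, terminated, truncated, info = env.step(action_dict)
+done = terminated | truncated  # single done flag, as the RL trainer expects
+```
+
+Scaling by ``action_scale`` and any IK conversion happen inside ``_preprocess_action``, so the policy stays agnostic to the underlying control mode.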
+ ## Core Architecture -Unlike the standard {class}`~envs.BaseEnv`, the {class}`~envs.EmbodiedEnv` integrates several manager systems to handle the complexity of simulation: +EmbodiChain provides a hierarchy of environment classes for different task types: -* **Scene Management**: Automatically loads and manages robots, sensors, and scene objects defined in the configuration. -* **Event Manager**: Handles automated behaviors such as domain randomization, scene setup, and dynamic asset swapping. -* **Observation Manager**: Allows flexible extension of observation spaces without modifying the environment code. -* **Dataset Manager**: Built-in support for collecting demonstration data during simulation steps. +* **{class}`~envs.BaseEnv`**: Minimal environment for simple tasks with custom simulation logic. +* **{class}`~envs.EmbodiedEnv`**: Feature-rich environment for Embodied AI tasks (IL, custom control). Integrates manager systems: + * **Scene Management**: Automatically loads and manages robots, sensors, and scene objects. + * **Event Manager**: Domain randomization, scene setup, and dynamic asset swapping. + * **Observation Manager**: Flexible observation space extensions. + * **Dataset Manager**: Built-in support for demonstration data collection. +* **{class}`~envs.RLEnv`**: Specialized environment for RL tasks, extending {class}`~envs.EmbodiedEnv` with action preprocessing, goal management, and standardized reward/info structure. ## Configuration System @@ -166,54 +171,104 @@ The manager operates in a single mode ``"save"`` which handles both recording an The dataset manager is called automatically during {meth}`~envs.Env.step()`, ensuring all observation-action pairs are recorded without additional user code. +## Reinforcement Learning Environment + +For RL tasks, EmbodiChain provides {class}`~envs.RLEnv`, a specialized base class that extends {class}`~envs.EmbodiedEnv` with RL-specific utilities: + +* **Action Preprocessing**: Flexible action transformation supporting delta_qpos, absolute qpos, joint velocity, joint force, and end-effector pose (with IK). +* **Goal Management**: Built-in goal pose tracking and visualization with axis markers. +* **Standardized Info Structure**: Template methods for computing task-specific success/failure conditions and metrics. +* **Episode Management**: Configurable episode length and truncation logic. + +### Configuration Extensions for RL + +RL environments use the ``extensions`` field to pass task-specific parameters: + +```python +extensions = { + "action_type": "delta_qpos", # Action type: delta_qpos, qpos, qvel, qf, eef_pose + "action_scale": 0.1, # Scaling factor applied to all actions + "episode_length": 100, # Maximum episode length + "success_threshold": 0.1, # Task-specific success threshold (optional) +} +``` + ## Creating a Custom Task -To create a new task, inherit from {class}`~envs.EmbodiedEnv` and implement the task-specific logic. 
+### For Reinforcement Learning Tasks + +Inherit from {class}`~envs.RLEnv` and implement the task-specific logic: + +```python +from embodichain.lab.gym.envs import RLEnv, EmbodiedEnvCfg +from embodichain.lab.gym.utils.registration import register_env + +@register_env("MyRLTask-v0", max_episode_steps=100) +class MyRLTaskEnv(RLEnv): + def __init__(self, cfg: MyTaskEnvCfg, **kwargs): + super().__init__(cfg, **kwargs) + + def compute_task_state(self, **kwargs): + # Required: Compute task-specific success/failure and metrics + # Returns: Tuple[success, fail, metrics] + # - success: torch.Tensor of shape (num_envs,) with boolean values + # - fail: torch.Tensor of shape (num_envs,) with boolean values + # - metrics: Dict of metric tensors for logging + + is_success = ... # Compute success condition + is_fail = torch.zeros_like(is_success) + metrics = {"distance": ..., "angle_error": ...} + + return is_success, is_fail, metrics + + def check_truncated(self, obs, info): + # Optional: Override to add custom truncation conditions + # Default: episode_length timeout + is_timeout = super().check_truncated(obs, info) + is_fallen = ... # Custom condition (e.g., robot fell) + return is_timeout | is_fallen +``` + +Configure rewards through the {class}`~envs.managers.RewardManager` in your environment config rather than overriding ``get_reward``. + +### For Imitation Learning Tasks + +Inherit from {class}`~envs.EmbodiedEnv` for IL tasks: ```python from embodichain.lab.gym.envs import EmbodiedEnv, EmbodiedEnvCfg from embodichain.lab.gym.utils.registration import register_env -@register_env("MyTask-v0", max_episode_steps=500) -class MyTaskEnv(EmbodiedEnv): +@register_env("MyILTask-v0", max_episode_steps=500) +class MyILTaskEnv(EmbodiedEnv): def __init__(self, cfg: MyTaskEnvCfg, **kwargs): super().__init__(cfg, **kwargs) def create_demo_action_list(self, *args, **kwargs): - # Optional: Implement for expert demonstration data generation (for Imitation Learning) - # This method is used to generate scripted demonstrations for IL data collection. + # Required: Generate scripted demonstrations for data collection # Must set self.action_length = len(action_list) if returning actions pass def is_task_success(self, **kwargs): - # Optional: Define success criteria (mainly for IL data collection) + # Required: Define success criteria for filtering successful episodes # Returns: torch.Tensor of shape (num_envs,) with boolean values return success_tensor - def get_reward(self, obs, action, info): - # Optional: Override for RL tasks - # Returns: torch.Tensor of shape (num_envs,) - return super().get_reward(obs, action, info) - def get_info(self, **kwargs): # Optional: Override to add custom info fields - # Should include "success" and "fail" keys for termination info = super().get_info(**kwargs) info["custom_metric"] = ... return info ``` -```{note} -The {meth}`~envs.EmbodiedEnv.create_demo_action_list` method is specifically designed for expert demonstration data generation in Imitation Learning scenarios. For Reinforcement Learning tasks, you should override the {meth}`~envs.EmbodiedEnv.get_reward` method instead. -``` - For a complete example of a modular environment setup, please refer to the {ref}`tutorial_modular_env` tutorial. 
## See Also - {ref}`tutorial_create_basic_env` - Creating basic environments - {ref}`tutorial_modular_env` - Advanced modular environment setup -- {doc}`/api_reference/embodichain/embodichain.lab.gym.envs` - Complete API reference for EmbodiedEnv and EmbodiedEnvCfg +- {ref}`tutorial_rl` - Reinforcement learning training guide +- {doc}`/api_reference/embodichain/embodichain.lab.gym.envs` - Complete API reference for EmbodiedEnv, RLEnv, and configurations ```{toctree} :maxdepth: 1 diff --git a/docs/source/tutorial/rl.rst b/docs/source/tutorial/rl.rst index ac05f64f..cbc011b2 100644 --- a/docs/source/tutorial/rl.rst +++ b/docs/source/tutorial/rl.rst @@ -78,6 +78,13 @@ The ``env`` section defines the task environment: - **id**: Environment registry ID (e.g., "PushCubeRL") - **cfg**: Environment-specific configuration parameters +For RL environments (inheriting from ``RLEnv``), use the ``extensions`` field for RL-specific parameters: + +- **action_type**: Action type - "delta_qpos" (default), "qpos", "qvel", "qf", "eef_pose" +- **action_scale**: Scaling factor applied to all actions (default: 1.0) +- **episode_length**: Maximum episode length (default: 1000) +- **success_threshold**: Task-specific success threshold (optional) + Example: .. code-block:: json @@ -86,10 +93,12 @@ Example: "id": "PushCubeRL", "cfg": { "num_envs": 4, - "action_type": "delta_qpos", - "episode_length": 100, - "action_scale": 0.1, - "success_threshold": 0.1 + "extensions": { + "action_type": "delta_qpos", + "action_scale": 0.1, + "episode_length": 100, + "success_threshold": 0.1 + } } } @@ -321,41 +330,74 @@ Adding a New Environment To add a new RL environment: -1. Create an environment class inheriting from ``EmbodiedEnv`` -2. Register it with the Gymnasium registry: +1. Create an environment class inheriting from ``RLEnv`` (which provides action preprocessing, goal management, and standardized info structure): .. code-block:: python + from embodichain.lab.gym.envs import RLEnv, EmbodiedEnvCfg from embodichain.lab.gym.utils.registration import register_env + import torch @register_env("MyTaskRL", max_episode_steps=100, override=True) - class MyTaskEnv(EmbodiedEnv): - cfg: MyTaskEnvCfg - ... + class MyTaskEnv(RLEnv): + def __init__(self, cfg: EmbodiedEnvCfg = None, **kwargs): + super().__init__(cfg, **kwargs) + + def compute_task_state(self, **kwargs): + """Compute success/failure conditions and metrics.""" + is_success = ... # Define success condition + is_fail = torch.zeros_like(is_success) + metrics = {"distance": ..., "error": ...} + return is_success, is_fail, metrics + + def check_truncated(self, obs, info): + """Optional: Add custom truncation conditions.""" + is_timeout = super().check_truncated(obs, info) + # Add custom conditions if needed + return is_timeout -3. Use the environment ID in your JSON config: +2. Configure the environment in your JSON config with RL-specific extensions: .. code-block:: json "env": { "id": "MyTaskRL", "cfg": { - ... 
+ "num_envs": 4, + "extensions": { + "action_type": "delta_qpos", + "action_scale": 0.1, + "episode_length": 100, + "success_threshold": 0.05 + } } } +The ``RLEnv`` base class provides: + +- **Action Preprocessing**: Automatically handles different action types (delta_qpos, qpos, qvel, qf, eef_pose) +- **Action Scaling**: Applies ``action_scale`` to all actions +- **Goal Management**: Built-in goal pose tracking and visualization +- **Standardized Info**: Implements ``get_info()`` using ``compute_task_state()`` template method + Best Practices ~~~~~~~~~~~~~~ -- **Device Management**: Device is single-sourced from ``runtime.cuda``. All components (trainer/algorithm/policy/env) share the same device. +- **Use RLEnv for RL Tasks**: Always inherit from ``RLEnv`` for reinforcement learning tasks. It provides action preprocessing, goal management, and standardized info structure out of the box. + +- **Action Type Configuration**: Configure ``action_type`` in the environment's ``extensions`` field. The default is "delta_qpos" (incremental joint positions). Other options: "qpos" (absolute), "qvel" (velocity), "qf" (force), "eef_pose" (end-effector pose with IK). -- **Action Scaling**: Keep action scaling in the environment, not in the policy. +- **Action Scaling**: Use ``action_scale`` in the environment's ``extensions`` field to scale actions. This is applied in ``RLEnv._preprocess_action()`` before robot control. + +- **Device Management**: Device is single-sourced from ``runtime.cuda``. All components (trainer/algorithm/policy/env) share the same device. - **Observation Format**: Environments should provide consistent observation shape/types (torch.float32) and a single ``done = terminated | truncated``. - **Algorithm Interface**: Algorithms must implement ``initialize_buffer()``, ``collect_rollout()``, and ``update()`` methods. The algorithm completely controls data collection and buffer management. -- **Reward Components**: Organize reward components in ``info["rewards"]`` dictionary and metrics in ``info["metrics"]`` dictionary. The trainer performs dense per-step logging directly from environment info. +- **Reward Configuration**: Use the ``RewardManager`` in your environment config to define reward components. Organize reward components in ``info["rewards"]`` dictionary and metrics in ``info["metrics"]`` dictionary. The trainer performs dense per-step logging directly from environment info. + +- **Template Methods**: Override ``compute_task_state()`` to define success/failure conditions and metrics. Override ``check_truncated()`` for custom truncation logic. - **Configuration**: Use JSON for all hyperparameters. This makes experiments reproducible and easy to track. 
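+
+As a reference for the **Reward Configuration** bullet above, the ``rewards`` section of the gym config wires reward functors to weights. A trimmed sketch (functor names taken from the push_cube example config; parameters are illustrative):
+
+.. code-block:: json
+
+    "rewards": {
+        "place_reward": {
+            "func": "incremental_distance_to_target",
+            "mode": "add",
+            "weight": 1.0,
+            "params": {
+                "source_entity_cfg": {"uid": "cube"},
+                "target_pose_key": "goal_pose"
+            }
+        },
+        "success_bonus": {
+            "func": "success_reward",
+            "mode": "add",
+            "weight": 10.0,
+            "params": {}
+        }
+    }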
From e037ceaa5c824b26f9cb7243b36fe4dae1941d83 Mon Sep 17 00:00:00 2001 From: yuanhaonan Date: Wed, 21 Jan 2026 14:50:14 +0800 Subject: [PATCH 04/12] add unit test for RL --- embodichain/agents/rl/train.py | 20 ++++++++++-- tests/agents/test_rl.py | 58 ++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+), 3 deletions(-) create mode 100644 tests/agents/test_rl.py diff --git a/embodichain/agents/rl/train.py b/embodichain/agents/rl/train.py index ca3c2996..89055cb8 100644 --- a/embodichain/agents/rl/train.py +++ b/embodichain/agents/rl/train.py @@ -39,12 +39,20 @@ from embodichain.lab.gym.envs.managers.cfg import EventCfg -def main(): +def parse_args(): + """Parse command line arguments.""" parser = argparse.ArgumentParser() parser.add_argument("--config", type=str, required=True, help="Path to JSON config") - args = parser.parse_args() + return parser.parse_args() + + +def train_from_config(config_path: str): + """Run training from a config file path. - with open(args.config, "r") as f: + Args: + config_path: Path to the JSON config file + """ + with open(config_path, "r") as f: cfg_json = json.load(f) trainer_cfg = cfg_json["trainer"] @@ -277,5 +285,11 @@ def main(): logger.log_info("Training finished") +def main(): + """Main entry point for command-line training.""" + args = parse_args() + train_from_config(args.config) + + if __name__ == "__main__": main() diff --git a/tests/agents/test_rl.py b/tests/agents/test_rl.py new file mode 100644 index 00000000..2588f998 --- /dev/null +++ b/tests/agents/test_rl.py @@ -0,0 +1,58 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2025 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ---------------------------------------------------------------------------- + +import os +import json +import pytest +import tempfile + + +def test_rl_training(): + """Test RL training pipeline by running a few iterations.""" + # Load the existing push_cube config + config_path = "configs/agents/rl/push_cube/train_config.json" + + with open(config_path, "r") as f: + config = json.load(f) + + # Create temporary config with reduced iterations for testing + test_config = config.copy() + test_config["trainer"]["iterations"] = 2 # Only 2 iterations for testing + test_config["trainer"]["rollout_steps"] = 32 # Fewer rollout steps + test_config["trainer"]["eval_freq"] = 1000000 # Disable eval + test_config["trainer"]["save_freq"] = 1000000 # Disable save + test_config["trainer"]["headless"] = True + test_config["trainer"]["use_wandb"] = False + test_config["trainer"]["num_envs"] = 2 # Only 2 parallel envs for testing + + # Save temporary config + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + json.dump(test_config, f) + temp_config_path = f.name + + try: + from embodichain.agents.rl.train import train_from_config + + train_from_config(temp_config_path) + assert True + + finally: + if os.path.exists(temp_config_path): + os.remove(temp_config_path) + + +if __name__ == "__main__": + pytest.main([__file__, "-v", "-s"]) From 24ac4170580d6965e1fb0afee2e21fb33d80a193 Mon Sep 17 00:00:00 2001 From: yuanhaonan Date: Wed, 21 Jan 2026 18:12:39 +0800 Subject: [PATCH 05/12] update rl env and dataset manager --- configs/agents/rl/push_cube/gym_config.json | 25 ++ .../rl/push_cube/gym_config_vision.json | 244 ++++++++++++++++++ configs/agents/rl/push_cube/train_config.json | 2 +- embodichain/lab/gym/envs/embodied_env.py | 46 ++++ embodichain/lab/gym/envs/managers/datasets.py | 152 +++++------ embodichain/lab/gym/envs/managers/rewards.py | 86 +++--- embodichain/lab/gym/envs/rl_env.py | 36 +-- 7 files changed, 455 insertions(+), 136 deletions(-) create mode 100644 configs/agents/rl/push_cube/gym_config_vision.json diff --git a/configs/agents/rl/push_cube/gym_config.json b/configs/agents/rl/push_cube/gym_config.json index 3e0e0445..7858d18b 100644 --- a/configs/agents/rl/push_cube/gym_config.json +++ b/configs/agents/rl/push_cube/gym_config.json @@ -117,6 +117,31 @@ "episode_length": 100, "action_scale": 0.1, "success_threshold": 0.1 + }, + "dataset": { + "lerobot": { + "func": "LeRobotRecorder", + "mode": "save", + "params": { + "robot_meta": { + "robot_type": "UR10_DH_Gripper", + "control_freq": 25, + "arm_dofs": 6, + "observation": { + "vision": {} + } + }, + "instruction": { + "lang": "push_cube_to_target" + }, + "extra": { + "scene_type": "tabletop", + "task_description": "push_cube_rl" + }, + "use_videos": true, + "export_success_only": false + } + } } }, "robot": { diff --git a/configs/agents/rl/push_cube/gym_config_vision.json b/configs/agents/rl/push_cube/gym_config_vision.json new file mode 100644 index 00000000..032d4812 --- /dev/null +++ b/configs/agents/rl/push_cube/gym_config_vision.json @@ -0,0 +1,244 @@ +{ + "id": "PushCubeRL", + "max_episodes": 5, + "env": { + "num_envs": 128, + "sim_steps_per_control": 4, + "events": { + "randomize_cube": { + "func": "randomize_rigid_object_pose", + "mode": "reset", + "params": { + "entity_cfg": { + "uid": "cube" + }, + "position_range": [ + [-0.2, -0.2, 0.0], + [0.2, 0.2, 0.0] + ], + "relative_position": true + } + }, + "randomize_goal": { + "func": "randomize_target_pose", + "mode": "reset", + "params": { + "position_range": 
[ + [-0.3, -0.3, 0.05], + [0.3, 0.3, 0.05] + ], + "relative_position": false, + "store_key": "goal_pose" + } + } + }, + "observations": { + "robot_qpos": { + "func": "normalize_robot_joint_data", + "mode": "modify", + "name": "robot/qpos", + "params": { + "joint_ids": [0, 1, 2, 3, 4, 5] + } + }, + "robot_ee_pos": { + "func": "get_robot_eef_pose", + "mode": "add", + "name": "robot/ee_pos", + "params": { + "part_name": "arm" + } + }, + "cube_pos": { + "func": "get_rigid_object_pose", + "mode": "add", + "name": "object/cube_pos", + "params": { + "entity_cfg": { + "uid": "cube" + } + } + }, + "goal_pos": { + "func": "target_position", + "mode": "add", + "name": "object/goal_pos", + "params": { + "target_pose_key": "goal_pose" + } + } + }, + "rewards": { + "reaching_reward": { + "func": "reaching_behind_object", + "mode": "add", + "weight": 0.1, + "params": { + "object_cfg": { + "uid": "cube" + }, + "target_pose_key": "goal_pose", + "behind_offset": 0.015, + "height_offset": 0.015, + "distance_scale": 5.0, + "part_name": "arm" + } + }, + "place_reward": { + "func": "incremental_distance_to_target", + "mode": "add", + "weight": 1.0, + "params": { + "source_entity_cfg": { + "uid": "cube" + }, + "target_pose_key": "goal_pose", + "tanh_scale": 10.0, + "positive_weight": 2.0, + "negative_weight": 0.5, + "use_xy_only": true + } + }, + "action_penalty": { + "func": "action_smoothness_penalty", + "mode": "add", + "weight": 0.01, + "params": {} + }, + "success_bonus": { + "func": "success_reward", + "mode": "add", + "weight": 10.0, + "params": {} + } + }, + "extensions": { + "action_type": "delta_qpos", + "episode_length": 100, + "action_scale": 0.1, + "success_threshold": 0.1 + }, + "dataset": { + "lerobot": { + "func": "LeRobotRecorder", + "mode": "save", + "params": { + "robot_meta": { + "robot_type": "UR10_DH_Gripper", + "control_freq": 25, + "arm_dofs": 6, + "observation": { + "vision": { + "wrist_camera": [], + "third_person_camera": [] + }, + "states": ["qpos"] + } + }, + "instruction": { + "lang": "push_cube_to_target" + }, + "extra": { + "scene_type": "tabletop", + "task_description": "push_cube_rl" + }, + "use_videos": true, + "export_success_only": false + } + } + } + }, + "robot": { + "uid": "Manipulator", + "urdf_cfg": { + "components": [ + { + "component_type": "arm", + "urdf_path": "UniversalRobots/UR10/UR10.urdf" + }, + { + "component_type": "hand", + "urdf_path": "DH_PGI_140_80/DH_PGI_140_80.urdf" + } + ] + }, + "init_pos": [0.0, 0.0, 0.0], + "init_rot": [0.0, 0.0, 0.0], + "init_qpos": [0.0, -1.57, 1.57, -1.57, -1.57, 0.0, 0.04, 0.04], + "drive_pros": { + "drive_type": "force", + "stiffness": 100000.0, + "damping": 1000.0, + "max_velocity": 2.0, + "max_effort": 500.0 + }, + "solver_cfg": { + "arm": { + "class_type": "PytorchSolver", + "end_link_name": "ee_link", + "root_link_name": "base_link", + "tcp": [ + [1.0, 0.0, 0.0, 0.0], + [0.0, 1.0, 0.0, 0.0], + [0.0, 0.0, 1.0, 0.16], + [0.0, 0.0, 0.0, 1.0] + ] + } + }, + "control_parts": { + "arm": ["JOINT[1-6]"] + } + }, + "sensor": [ + { + "sensor_type": "Camera", + "uid": "wrist_camera", + "width": 640, + "height": 480, + "intrinsics": [90.0, 90.0, 64.0, 64.0], + "extrinsics": { + "parent": "ee_link" + } + }, + { + "sensor_type": "Camera", + "uid": "third_person_camera", + "width": 640, + "height": 480, + "intrinsics": [90.0, 90.0, 64.0, 64.0], + "extrinsics": { + "eye": [0.5, 0.5, 0.8], + "target": [0.0, 0.0, 0.3], + "up": [0.0, 0.0, 1.0] + } + } + ], + "light": {}, + "background": [], + "rigid_object": [ + { + "uid": "cube", + "shape": { 
+ "shape_type": "Cube", + "size": [0.1, 0.1, 0.1] + }, + "body_type": "dynamic", + "init_pos": [-0.6, -0.4, 0.05], + "attrs": { + "mass": 10.0, + "static_friction": 3.0, + "dynamic_friction": 2.0, + "linear_damping": 2.0, + "angular_damping": 2.0, + "contact_offset": 0.003, + "rest_offset": 0.001, + "restitution": 0.1, + "max_depenetration_velocity": 10.0, + "max_linear_velocity": 1.0, + "max_angular_velocity": 1.0 + } + } + ], + "rigid_object_group": [], + "articulation": [] +} diff --git a/configs/agents/rl/push_cube/train_config.json b/configs/agents/rl/push_cube/train_config.json index 3b5c8e3a..005c940e 100644 --- a/configs/agents/rl/push_cube/train_config.json +++ b/configs/agents/rl/push_cube/train_config.json @@ -7,7 +7,7 @@ "headless": true, "enable_rt": false, "gpu_id": 0, - "num_envs": 512, + "num_envs": 8, "iterations": 1000, "rollout_steps": 1024, "eval_freq": 200, diff --git a/embodichain/lab/gym/envs/embodied_env.py b/embodichain/lab/gym/envs/embodied_env.py index 9e787bb2..af101935 100644 --- a/embodichain/lab/gym/envs/embodied_env.py +++ b/embodichain/lab/gym/envs/embodied_env.py @@ -170,6 +170,13 @@ def __init__(self, cfg: EmbodiedEnvCfg, **kwargs): super().__init__(cfg, **kwargs) + self.episode_obs_buffer: Dict[int, List[EnvObs]] = { + i: [] for i in range(self.num_envs) + } + self.episode_action_buffer: Dict[int, List[EnvAction]] = { + i: [] for i in range(self.num_envs) + } + def _init_sim_state(self, **kwargs): """Initialize the simulation state at the beginning of scene creation.""" @@ -271,6 +278,29 @@ def get_affordance(self, key: str, default: Any = None): """ return self.affordance_datas.get(key, default) + def _extract_single_env_data(self, data: Any, env_id: int) -> Any: + """Extract single environment data from batched data. 
+ + Args: + data: Batched data (dict, tensor, list, or primitive) + env_id: Environment index + + Returns: + Data for the specified environment + """ + if isinstance(data, dict): + return { + k: self._extract_single_env_data(v, env_id) for k, v in data.items() + } + elif isinstance(data, torch.Tensor): + return data[env_id] if data.ndim > 0 else data + elif isinstance(data, (list, tuple)): + return type(data)( + self._extract_single_env_data(item, env_id) for item in data + ) + else: + return data + def _hook_after_sim_step( self, obs: EnvObs, @@ -280,6 +310,13 @@ def _hook_after_sim_step( info: Dict, **kwargs, ): + # Extract and append data for each environment + for env_id in range(self.num_envs): + single_obs = self._extract_single_env_data(obs, env_id) + single_action = self._extract_single_env_data(action, env_id) + self.episode_obs_buffer[env_id].append(single_obs) + self.episode_action_buffer[env_id].append(single_action) + # Call dataset manager with mode="save": it will record and auto-save if dones=True if self.cfg.dataset: if "save" in self.dataset_manager.available_modes: @@ -331,6 +368,15 @@ def _update_sim_state(self, **kwargs) -> None: def _initialize_episode( self, env_ids: Sequence[int] | None = None, **kwargs ) -> None: + # Clear episode buffers for environments that are being reset + if env_ids is None: + env_ids = range(self.num_envs) + for env_id in env_ids: + # Convert to int if it's a tensor + env_id = int(env_id) if isinstance(env_id, torch.Tensor) else env_id + self.episode_obs_buffer[env_id].clear() + self.episode_action_buffer[env_id].clear() + # apply events such as randomization for environments that need a reset if self.cfg.events: if "reset" in self.event_manager.available_modes: diff --git a/embodichain/lab/gym/envs/managers/datasets.py b/embodichain/lab/gym/envs/managers/datasets.py index 625ce8e7..1533d97b 100644 --- a/embodichain/lab/gym/envs/managers/datasets.py +++ b/embodichain/lab/gym/envs/managers/datasets.py @@ -92,10 +92,6 @@ def __init__(self, cfg: DatasetFunctorCfg, env: EmbodiedEnv): self.use_videos = params.get("use_videos", False) self.export_success_only = params.get("export_success_only", False) - # Episode data buffers - self.episode_obs_list: List[Dict] = [] - self.episode_action_list: List[Any] = [] - # LeRobot dataset instance self.dataset: Optional[LeRobotDataset] = None self.dataset_full_path: Optional[Path] = None @@ -114,14 +110,6 @@ def dataset_path(self) -> str: str(self.dataset_full_path) if self.dataset_full_path else "Not initialized" ) - def reset(self, env_ids: Optional[torch.Tensor] = None) -> None: - """Reset the recorder buffers. - - Args: - env_ids: Environment IDs to reset (currently clears all data). - """ - self._reset_buffer() - def __call__( self, env: EmbodiedEnv, @@ -160,8 +148,6 @@ def __call__( use_videos: Whether to save videos (already set in __init__). export_success_only: Whether to export only successful episodes (already set in __init__). 
""" - # Always record the step - self._record_step(obs, action) # Check if any episodes are done and save them done_env_ids = dones.nonzero(as_tuple=False).squeeze(-1) @@ -169,43 +155,38 @@ def __call__( # Save completed episodes self._save_episodes(done_env_ids, terminateds, info) - def _record_step(self, obs: EnvObs, action: EnvAction) -> None: - """Record a single step.""" - self.episode_obs_list.append(obs) - self.episode_action_list.append(action) - def _save_episodes( self, env_ids: torch.Tensor, terminateds: Optional[torch.Tensor] = None, info: Optional[Dict[str, Any]] = None, ) -> None: - """Save completed episodes.""" - if len(self.episode_obs_list) == 0: - logger.log_warning("No episode data to save") - return - - obs_list = self.episode_obs_list - action_list = self.episode_action_list + """Save completed episodes for specified environments.""" + task = self.instruction.get("lang", "unknown_task") - # Align obs and action - if len(obs_list) > len(action_list): - obs_list = obs_list[:-1] + # Process each environment + for env_id in env_ids.cpu().tolist(): + # Get buffer for this environment (already contains single-env data) + obs_list = self._env.episode_obs_buffer[env_id] + action_list = self._env.episode_action_buffer[env_id] - task = self.instruction.get("lang", "unknown_task") + if len(obs_list) == 0: + logger.log_warning(f"No episode data to save for env {env_id}") + continue - # Update metadata - extra_info = self.extra.copy() if self.extra else {} - fps = self.dataset.meta.info.get("fps", 30) - current_episode_time = (len(obs_list) * len(env_ids)) / fps if fps > 0 else 0 + # Align obs and action + if len(obs_list) > len(action_list): + obs_list = obs_list[:-1] - episode_extra_info = extra_info.copy() - self.total_time += current_episode_time - episode_extra_info["total_time"] = self.total_time - self._update_dataset_info({"extra": episode_extra_info}) + # Update metadata + extra_info = self.extra.copy() if self.extra else {} + fps = self.dataset.meta.info.get("fps", 30) + current_episode_time = len(obs_list) / fps if fps > 0 else 0 - # Process each environment - for env_id in env_ids.cpu().tolist(): + episode_extra_info = extra_info.copy() + self.total_time += current_episode_time + episode_extra_info["total_time"] = self.total_time + self._update_dataset_info({"extra": episode_extra_info}) is_success = False if info is not None and "success" in info: success_tensor = info["success"] @@ -222,42 +203,48 @@ def _save_episodes( try: for obs, action in zip(obs_list, action_list): - frame = self._convert_frame_to_lerobot(obs, action, task, env_id) + frame = self._convert_frame_to_lerobot(obs, action, task) self.dataset.add_frame(frame) self.dataset.save_episode() + logger.log_info( - f"Auto-saved {'successful' if is_success else 'failed'} " - f"episode {self.curr_episode} for env {env_id} with {len(obs_list)} frames" + f"[LeRobotRecorder] Saved dataset to: {self.dataset_path}\n" + f" Episode {self.curr_episode} (env {env_id}): " + f"{'successful' if is_success else 'failed'}, {len(obs_list)} frames" ) + self.curr_episode += 1 except Exception as e: logger.log_error(f"Failed to save episode {env_id}: {e}") - self._reset_buffer() - def finalize(self) -> Optional[str]: """Finalize the dataset.""" - if len(self.episode_obs_list) > 0: - active_env_ids = torch.arange(self.num_envs, device=self.device) + # Save any remaining episodes + env_ids_with_data = [] + for env_id in range(self.num_envs): + if len(self._env.episode_obs_buffer[env_id]) > 0: + env_ids_with_data.append(env_id) + 
+ if env_ids_with_data: + active_env_ids = torch.tensor(env_ids_with_data, device=self.device) self._save_episodes(active_env_ids) try: if self.dataset is not None: self.dataset.finalize() - logger.log_info(f"Dataset finalized at: {self.dataset_path}") + logger.log_info( + f"[LeRobotRecorder] Dataset finalized successfully\n" + f" Path: {self.dataset_path}\n" + f" Total episodes: {self.curr_episode}\n" + f" Total time: {self.total_time:.2f}s" + ) return self.dataset_path except Exception as e: - logger.log_error(f"Failed to finalize dataset: {e}") + logger.log_error(f"[LeRobotRecorder] Failed to finalize dataset: {e}") return None - def _reset_buffer(self) -> None: - """Reset episode buffers.""" - self.episode_obs_list.clear() - self.episode_action_list.clear() - logger.log_info("Reset buffers (cleared all batched data)") - def _initialize_dataset(self) -> None: """Initialize the LeRobot dataset.""" robot_type = self.robot_meta.get("robot_type", "robot") @@ -348,9 +335,18 @@ def _build_features(self) -> Dict: return features def _convert_frame_to_lerobot( - self, obs: Dict[str, Any], action: Any, task: str, env_id: int + self, obs: Dict[str, Any], action: Any, task: str ) -> Dict: - """Convert a single frame to LeRobot format.""" + """Convert a single frame to LeRobot format. + + Args: + obs: Single environment observation (already extracted from batch) + action: Single environment action (already extracted from batch) + task: Task name + + Returns: + Frame dict in LeRobot format with numpy arrays + """ frame = {"task": task} extra_vision_config = self.robot_meta.get("observation", {}).get("vision", {}) arm_dofs = self.robot_meta.get("arm_dofs", 7) @@ -363,9 +359,9 @@ def _convert_frame_to_lerobot( color_data = obs["sensor"][camera_name]["color"] if isinstance(color_data, torch.Tensor): - color_img = color_data[env_id][:, :, :3].cpu().numpy() + color_img = color_data[:, :, :3].cpu().numpy() else: - color_img = np.array(color_data)[env_id][:, :, :3] + color_img = np.array(color_data)[:, :, :3] if color_img.dtype in [np.float32, np.float64]: color_img = (color_img * 255).astype(np.uint8) @@ -375,11 +371,9 @@ def _convert_frame_to_lerobot( if is_stereo: color_right_data = obs["sensor"][camera_name]["color_right"] if isinstance(color_right_data, torch.Tensor): - color_right_img = ( - color_right_data[env_id][:, :, :3].cpu().numpy() - ) + color_right_img = color_right_data[:, :, :3].cpu().numpy() else: - color_right_img = np.array(color_right_data)[env_id][:, :, :3] + color_right_img = np.array(color_right_data)[:, :, :3] if color_right_img.dtype in [np.float32, np.float64]: color_right_img = (color_right_img * 255).astype(np.uint8) @@ -389,25 +383,37 @@ def _convert_frame_to_lerobot( # Add state qpos = obs["robot"][JointType.QPOS.value] if isinstance(qpos, torch.Tensor): - state_data = qpos[env_id].cpu().numpy().astype(np.float32) + state_data = qpos.cpu().numpy().astype(np.float32) else: - state_data = np.array(qpos)[env_id].astype(np.float32) + state_data = np.array(qpos).astype(np.float32) frame["observation.state"] = state_data # Add action if isinstance(action, torch.Tensor): - action_data = action[env_id, :arm_dofs].cpu().numpy() + action_data = action[:arm_dofs].cpu().numpy() elif isinstance(action, np.ndarray): - action_data = action[env_id, :arm_dofs] + action_data = action[:arm_dofs] elif isinstance(action, dict): - action_data = action.get("action", action.get("arm_action", action)) - if isinstance(action_data, torch.Tensor): - action_data = action_data[env_id, 
:arm_dofs].cpu().numpy() - elif isinstance(action_data, np.ndarray): - action_data = action_data[env_id, :arm_dofs] + # Extract qpos from action dict + action_tensor = action.get( + "qpos", action.get("delta_qpos", action.get("action", None)) + ) + if action_tensor is None: + # Fallback to first tensor value + for v in action.values(): + if isinstance(v, (torch.Tensor, np.ndarray)): + action_tensor = v + break + + if isinstance(action_tensor, torch.Tensor): + action_data = action_tensor[:arm_dofs].cpu().numpy() + elif isinstance(action_tensor, np.ndarray): + action_data = action_tensor[:arm_dofs] + else: + action_data = np.array(action_tensor)[:arm_dofs] else: - action_data = np.array(action)[env_id, :arm_dofs] + action_data = np.array(action)[:arm_dofs] frame["action"] = action_data diff --git a/embodichain/lab/gym/envs/managers/rewards.py b/embodichain/lab/gym/envs/managers/rewards.py index dfd2872b..dad89b7f 100644 --- a/embodichain/lab/gym/envs/managers/rewards.py +++ b/embodichain/lab/gym/envs/managers/rewards.py @@ -30,7 +30,7 @@ def distance_between_objects( env: EmbodiedEnv, obs: dict, - action: dict, + action: torch.Tensor | dict, info: dict, source_entity_cfg: SceneEntityCfg = None, target_entity_cfg: SceneEntityCfg = None, @@ -92,7 +92,7 @@ def distance_between_objects( def joint_velocity_penalty( env: EmbodiedEnv, obs: dict, - action: dict, + action: torch.Tensor | dict, info: dict, robot_uid: str = "robot", joint_ids: slice | list[int] | None = None, @@ -146,21 +146,21 @@ def joint_velocity_penalty( def action_smoothness_penalty( env: EmbodiedEnv, obs: dict, - action: dict, + action: torch.Tensor | dict, info: dict, ) -> torch.Tensor: """Penalize large action changes between consecutive timesteps. Encourages smooth control commands by penalizing sudden changes in actions. - Stores previous action in env._reward_states for comparison. + Gets previous action from env.episode_action_buffer. Returns: - Penalty tensor of shape (num_envs,). Zero on first call (no previous action), - negative on subsequent calls (larger change = more negative). + Penalty tensor of shape (num_envs,). Zero on first step (no previous action), + negative on subsequent steps (larger change = more negative). Note: - This function maintains state across calls using env._reward_states['prev_actions']. - State is automatically reset when the environment resets. + This function reads from env.episode_action_buffer, which is automatically + cleared when the environment resets. 
Example: ```json @@ -171,26 +171,48 @@ def action_smoothness_penalty( } ``` """ - # Extract action tensor from dict - for key in ["qpos", "qvel", "qf"]: - if key in action: - action_tensor = action[key] - break - - # Use dictionary-based state management - if not hasattr(env, "_reward_states"): - env._reward_states = {} - - # compute difference between current and previous action - if "prev_actions" in env._reward_states: - action_diff = action_tensor - env._reward_states["prev_actions"] - penalty = -torch.norm(action_diff, dim=-1) + # Extract current action tensor + if isinstance(action, torch.Tensor): + current_action_tensor = action + elif isinstance(action, dict): + # Extract from dict + current_action_tensor = None + for key in ["qpos", "qvel", "qf"]: + if key in action: + current_action_tensor = action[key] + break + if current_action_tensor is None: + return torch.zeros(env.num_envs, device=env.device) else: - # no previous action, no penalty - penalty = torch.zeros(env.num_envs, device=env.device) + return torch.zeros(env.num_envs, device=env.device) - # store current action for next step - env._reward_states["prev_actions"] = action_tensor.clone() + # Get previous action from buffer for each environment + penalty = torch.zeros(env.num_envs, device=env.device) + + for env_id in range(env.num_envs): + action_buffer = env.episode_action_buffer[env_id] + + if len(action_buffer) > 0: + # Get previous action (last in buffer) + prev_action = action_buffer[-1] + + # Extract tensor from previous action + if isinstance(prev_action, torch.Tensor): + prev_action_tensor = prev_action + elif isinstance(prev_action, dict): + prev_action_tensor = None + for key in ["qpos", "qvel", "qf"]: + if key in prev_action: + prev_action_tensor = prev_action[key] + break + else: + prev_action_tensor = None + + if prev_action_tensor is not None: + # Compute difference and penalty + action_diff = current_action_tensor[env_id] - prev_action_tensor + penalty[env_id] = -torch.norm(action_diff) + # else: first step, no penalty (already zero) return penalty @@ -198,7 +220,7 @@ def action_smoothness_penalty( def joint_limit_penalty( env: EmbodiedEnv, obs: dict, - action: dict, + action: torch.Tensor | dict, info: dict, robot_uid: str = "robot", joint_ids: slice | list[int] = slice(None), @@ -264,7 +286,7 @@ def joint_limit_penalty( def orientation_alignment( env: EmbodiedEnv, obs: dict, - action: dict, + action: torch.Tensor | dict, info: dict, source_entity_cfg: SceneEntityCfg = None, target_entity_cfg: SceneEntityCfg = None, @@ -319,7 +341,7 @@ def orientation_alignment( def success_reward( env: EmbodiedEnv, obs: dict, - action: dict, + action: torch.Tensor | dict, info: dict, ) -> torch.Tensor: """Sparse bonus reward when task succeeds. 
@@ -367,7 +389,7 @@ def success_reward( def reaching_behind_object( env: EmbodiedEnv, obs: dict, - action: dict, + action: torch.Tensor | dict, info: dict, object_cfg: SceneEntityCfg = None, target_pose_key: str = "goal_pose", @@ -458,7 +480,7 @@ def reaching_behind_object( def distance_to_target( env: "EmbodiedEnv", obs: dict, - action: dict, + action: torch.Tensor | dict, info: dict, source_entity_cfg: SceneEntityCfg = None, target_pose_key: str = "target_pose", @@ -535,7 +557,7 @@ def distance_to_target( def incremental_distance_to_target( env: "EmbodiedEnv", obs: dict, - action: dict, + action: torch.Tensor | dict, info: dict, source_entity_cfg: SceneEntityCfg = None, target_pose_key: str = "target_pose", diff --git a/embodichain/lab/gym/envs/rl_env.py b/embodichain/lab/gym/envs/rl_env.py index 3a8f4118..9ce6b5f1 100644 --- a/embodichain/lab/gym/envs/rl_env.py +++ b/embodichain/lab/gym/envs/rl_env.py @@ -165,6 +165,9 @@ def _process_delta_qpos(self, action: torch.Tensor) -> torch.Tensor: def _process_eef_pose(self, action: torch.Tensor) -> torch.Tensor: """Process end-effector pose action via inverse kinematics. + TODO: Currently only supports single-arm robots (6-axis or 7-axis). + For multi-arm or complex robots, please use qpos/delta_qpos actions instead. + Args: action: End-effector pose (position + orientation) Shape: (num_envs, 6) for pos+euler or (num_envs, 7) for pos+quat @@ -172,32 +175,8 @@ def _process_eef_pose(self, action: torch.Tensor) -> torch.Tensor: Returns: Joint positions from IK """ - # Auto-select control part if not specified - control_part_name = self.control_part_name - if control_part_name is None: - if self.robot.control_parts is None: - raise ValueError( - "Robot has no control_parts defined for eef_pose action" - ) - - available_parts = list(self.robot.control_parts.keys()) - if not available_parts: - raise ValueError( - "Robot has no control_parts defined for eef_pose action" - ) - - if len(available_parts) == 1: - control_part_name = available_parts[0] - else: - raise ValueError( - f"Multiple control_parts found {available_parts}. " - f"Please specify control_part_name explicitly." 
- ) - # Get current joint positions as IK seed current_qpos = self.robot.get_qpos() - control_joint_ids = self.robot.get_joint_ids(name=control_part_name) - seed_qpos = current_qpos[:, control_joint_ids] # Convert action to target pose matrix (4x4) batch_size = action.shape[0] @@ -223,15 +202,12 @@ def _process_eef_pose(self, action: torch.Tensor) -> torch.Tensor: for env_idx in range(self.num_envs): qpos_ik = self.robot.compute_ik( pose=target_pose[env_idx], - joint_seed=seed_qpos[env_idx], - name=control_part_name, + joint_seed=current_qpos[env_idx], ) ik_solutions.append(qpos_ik) - # Update full qpos with IK solutions - result_qpos = current_qpos.clone() - ik_qpos = torch.stack(ik_solutions, dim=0) - result_qpos[:, control_joint_ids] = ik_qpos + # Stack IK solutions + result_qpos = torch.stack(ik_solutions, dim=0) return result_qpos From 1d569017f06d49c9a9c6ce005de8edc104d20fe8 Mon Sep 17 00:00:00 2001 From: yuanhaonan Date: Thu, 22 Jan 2026 09:41:36 +0800 Subject: [PATCH 06/12] update tensorboard --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 50fef254..658ba0e9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,7 +40,7 @@ dependencies = [ "PyYAML>=6.0", "accelerate>=1.10.0", "wandb==0.20.1", - "tensorboard", + "tensorboard>=2.20.0", "transformers>=4.53.0", "diffusers>=0.32.1", "deepspeed>=0.16.2", From c96f4d3d66e06c4a2e744330952267e08e36f2ad Mon Sep 17 00:00:00 2001 From: yuanhaonan Date: Thu, 22 Jan 2026 15:28:47 +0800 Subject: [PATCH 07/12] remove dataset from config && fix CI --- .github/workflows/main.yml | 2 +- configs/agents/rl/push_cube/gym_config.json | 25 ------ .../rl/push_cube/gym_config_vision.json | 39 +-------- configs/agents/rl/push_cube/train_config.json | 6 +- pyproject.toml | 2 +- tests/agents/test_rl.py | 80 ++++++++++++++----- 6 files changed, 68 insertions(+), 86 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 4cb35e58..63b697e6 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -48,7 +48,7 @@ jobs: - uses: actions/checkout@v4 - name: Build docs run: | - pip install -e . --extra-index-url http://pyp.open3dv.site:2345/simple/ --trusted-host pyp.open3dv.site + pip install -e .[lerobot] --extra-index-url http://pyp.open3dv.site:2345/simple/ --trusted-host pyp.open3dv.site pip install -r docs/requirements.txt cd ${GITHUB_WORKSPACE}/docs echo "Start Building docs..." 
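Note on the `_process_eef_pose` simplification above: the control-part bookkeeping is dropped and the method reduces to a per-environment IK loop seeded with the current joint positions. A minimal standalone sketch of that loop is given below for reference; the per-env `compute_ik(pose=..., joint_seed=...)` call mirrors the patch, but the free-standing function name and the `solver` stand-in object are illustrative assumptions, not part of the EmbodiChain robot API.

    import torch

    def eef_pose_to_qpos(solver, target_pose: torch.Tensor, current_qpos: torch.Tensor) -> torch.Tensor:
        # target_pose: (num_envs, 4, 4) homogeneous target poses
        # current_qpos: (num_envs, dof) joint positions used as the IK seed
        ik_solutions = []
        for env_idx in range(target_pose.shape[0]):
            # Solve IK per environment, seeded with that environment's current qpos
            qpos_ik = solver.compute_ik(
                pose=target_pose[env_idx],
                joint_seed=current_qpos[env_idx],
            )
            ik_solutions.append(qpos_ik)
        # Stack the per-environment solutions back into a (num_envs, dof) batch
        return torch.stack(ik_solutions, dim=0)
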
diff --git a/configs/agents/rl/push_cube/gym_config.json b/configs/agents/rl/push_cube/gym_config.json index 7858d18b..3e0e0445 100644 --- a/configs/agents/rl/push_cube/gym_config.json +++ b/configs/agents/rl/push_cube/gym_config.json @@ -117,31 +117,6 @@ "episode_length": 100, "action_scale": 0.1, "success_threshold": 0.1 - }, - "dataset": { - "lerobot": { - "func": "LeRobotRecorder", - "mode": "save", - "params": { - "robot_meta": { - "robot_type": "UR10_DH_Gripper", - "control_freq": 25, - "arm_dofs": 6, - "observation": { - "vision": {} - } - }, - "instruction": { - "lang": "push_cube_to_target" - }, - "extra": { - "scene_type": "tabletop", - "task_description": "push_cube_rl" - }, - "use_videos": true, - "export_success_only": false - } - } } }, "robot": { diff --git a/configs/agents/rl/push_cube/gym_config_vision.json b/configs/agents/rl/push_cube/gym_config_vision.json index 032d4812..e64290e0 100644 --- a/configs/agents/rl/push_cube/gym_config_vision.json +++ b/configs/agents/rl/push_cube/gym_config_vision.json @@ -2,8 +2,6 @@ "id": "PushCubeRL", "max_episodes": 5, "env": { - "num_envs": 128, - "sim_steps_per_control": 4, "events": { "randomize_cube": { "func": "randomize_rigid_object_pose", @@ -117,35 +115,6 @@ "episode_length": 100, "action_scale": 0.1, "success_threshold": 0.1 - }, - "dataset": { - "lerobot": { - "func": "LeRobotRecorder", - "mode": "save", - "params": { - "robot_meta": { - "robot_type": "UR10_DH_Gripper", - "control_freq": 25, - "arm_dofs": 6, - "observation": { - "vision": { - "wrist_camera": [], - "third_person_camera": [] - }, - "states": ["qpos"] - } - }, - "instruction": { - "lang": "push_cube_to_target" - }, - "extra": { - "scene_type": "tabletop", - "task_description": "push_cube_rl" - }, - "use_videos": true, - "export_success_only": false - } - } } }, "robot": { @@ -193,8 +162,8 @@ { "sensor_type": "Camera", "uid": "wrist_camera", - "width": 640, - "height": 480, + "width": 120, + "height": 120, "intrinsics": [90.0, 90.0, 64.0, 64.0], "extrinsics": { "parent": "ee_link" @@ -203,8 +172,8 @@ { "sensor_type": "Camera", "uid": "third_person_camera", - "width": 640, - "height": 480, + "width": 120, + "height": 120, "intrinsics": [90.0, 90.0, 64.0, 64.0], "extrinsics": { "eye": [0.5, 0.5, 0.8], diff --git a/configs/agents/rl/push_cube/train_config.json b/configs/agents/rl/push_cube/train_config.json index 005c940e..1c683654 100644 --- a/configs/agents/rl/push_cube/train_config.json +++ b/configs/agents/rl/push_cube/train_config.json @@ -1,18 +1,18 @@ { "trainer": { "exp_name": "push_cube_ppo", - "gym_config": "configs/agents/rl/push_cube/gym_config.json", + "gym_config": "configs/agents/rl/push_cube/gym_config_vision.json", "seed": 42, "device": "cuda:0", "headless": true, "enable_rt": false, "gpu_id": 0, - "num_envs": 8, + "num_envs": 64, "iterations": 1000, "rollout_steps": 1024, "eval_freq": 200, "save_freq": 200, - "use_wandb": true, + "use_wandb": false, "wandb_project_name": "embodychain-push_cube", "events": { "eval": { diff --git a/pyproject.toml b/pyproject.toml index 658ba0e9..134acbcc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,7 +39,7 @@ dependencies = [ "polars==1.31.0", "PyYAML>=6.0", "accelerate>=1.10.0", - "wandb==0.20.1", + "wandb>=0.22.3", "tensorboard>=2.20.0", "transformers>=4.53.0", "diffusers>=0.32.1", diff --git a/tests/agents/test_rl.py b/tests/agents/test_rl.py index 2588f998..827e670f 100644 --- a/tests/agents/test_rl.py +++ b/tests/agents/test_rl.py @@ -23,35 +23,73 @@ def test_rl_training(): """Test RL training 
pipeline by running a few iterations.""" # Load the existing push_cube config - config_path = "configs/agents/rl/push_cube/train_config.json" - - with open(config_path, "r") as f: - config = json.load(f) - - # Create temporary config with reduced iterations for testing - test_config = config.copy() - test_config["trainer"]["iterations"] = 2 # Only 2 iterations for testing - test_config["trainer"]["rollout_steps"] = 32 # Fewer rollout steps - test_config["trainer"]["eval_freq"] = 1000000 # Disable eval - test_config["trainer"]["save_freq"] = 1000000 # Disable save - test_config["trainer"]["headless"] = True - test_config["trainer"]["use_wandb"] = False - test_config["trainer"]["num_envs"] = 2 # Only 2 parallel envs for testing - - # Save temporary config + train_config_path = "configs/agents/rl/push_cube/train_config.json" + gym_config_path = "configs/agents/rl/push_cube/gym_config.json" + + with open(train_config_path, "r") as f: + train_config = json.load(f) + + with open(gym_config_path, "r") as f: + gym_config = json.load(f) + + # Add dataset configuration dynamically to gym_config + gym_config["env"]["dataset"] = { + "lerobot": { + "func": "LeRobotRecorder", + "mode": "save", + "params": { + "robot_meta": { + "robot_type": "UR10_DH_Gripper", + "control_freq": 25, + "arm_dofs": 6, + "observation": {"vision": {}, "states": ["qpos"]}, + }, + "instruction": {"lang": "push_cube_to_target"}, + "extra": { + "scene_type": "tabletop", + "task_description": "push_cube_rl_test", + }, + "use_videos": False, + "export_success_only": False, + }, + } + } + + # Create temporary gym config with dataset + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + json.dump(gym_config, f) + temp_gym_config_path = f.name + + # Create temporary train config with reduced iterations for testing + test_train_config = train_config.copy() + test_train_config["trainer"][ + "gym_config" + ] = temp_gym_config_path # Point to temp gym config + test_train_config["trainer"]["iterations"] = 2 # Only 2 iterations for testing + test_train_config["trainer"]["rollout_steps"] = 32 # Fewer rollout steps + test_train_config["trainer"]["eval_freq"] = 1000000 # Disable eval + test_train_config["trainer"]["save_freq"] = 1000000 # Disable save + test_train_config["trainer"]["headless"] = True + test_train_config["trainer"]["use_wandb"] = False + test_train_config["trainer"]["num_envs"] = 2 # Only 2 parallel envs for testing + + # Save temporary train config with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: - json.dump(test_config, f) - temp_config_path = f.name + json.dump(test_train_config, f) + temp_train_config_path = f.name try: from embodichain.agents.rl.train import train_from_config - train_from_config(temp_config_path) + train_from_config(temp_train_config_path) assert True finally: - if os.path.exists(temp_config_path): - os.remove(temp_config_path) + # Clean up temporary files + if os.path.exists(temp_train_config_path): + os.remove(temp_train_config_path) + if os.path.exists(temp_gym_config_path): + os.remove(temp_gym_config_path) if __name__ == "__main__": From f03df006f92b0a4b7c52e5c409d63a28a041a42b Mon Sep 17 00:00:00 2001 From: yuanhaonan Date: Thu, 22 Jan 2026 16:24:19 +0800 Subject: [PATCH 08/12] fix wandb version --- configs/agents/rl/push_cube/train_config.json | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/agents/rl/push_cube/train_config.json b/configs/agents/rl/push_cube/train_config.json index 
1c683654..64e2ecb2 100644 --- a/configs/agents/rl/push_cube/train_config.json +++ b/configs/agents/rl/push_cube/train_config.json @@ -1,7 +1,7 @@ { "trainer": { "exp_name": "push_cube_ppo", - "gym_config": "configs/agents/rl/push_cube/gym_config_vision.json", + "gym_config": "configs/agents/rl/push_cube/gym_config.json", "seed": 42, "device": "cuda:0", "headless": true, diff --git a/pyproject.toml b/pyproject.toml index 134acbcc..d834132e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,7 +39,7 @@ dependencies = [ "polars==1.31.0", "PyYAML>=6.0", "accelerate>=1.10.0", - "wandb>=0.22.3", + "wandb>=0.21.0", "tensorboard>=2.20.0", "transformers>=4.53.0", "diffusers>=0.32.1", From 287365726a7bfd724eafb6e738db232b4ec8370d Mon Sep 17 00:00:00 2001 From: yuanhaonan Date: Thu, 22 Jan 2026 16:25:36 +0800 Subject: [PATCH 09/12] remove --- .../rl/push_cube/gym_config_vision.json | 213 ------------------ 1 file changed, 213 deletions(-) delete mode 100644 configs/agents/rl/push_cube/gym_config_vision.json diff --git a/configs/agents/rl/push_cube/gym_config_vision.json b/configs/agents/rl/push_cube/gym_config_vision.json deleted file mode 100644 index e64290e0..00000000 --- a/configs/agents/rl/push_cube/gym_config_vision.json +++ /dev/null @@ -1,213 +0,0 @@ -{ - "id": "PushCubeRL", - "max_episodes": 5, - "env": { - "events": { - "randomize_cube": { - "func": "randomize_rigid_object_pose", - "mode": "reset", - "params": { - "entity_cfg": { - "uid": "cube" - }, - "position_range": [ - [-0.2, -0.2, 0.0], - [0.2, 0.2, 0.0] - ], - "relative_position": true - } - }, - "randomize_goal": { - "func": "randomize_target_pose", - "mode": "reset", - "params": { - "position_range": [ - [-0.3, -0.3, 0.05], - [0.3, 0.3, 0.05] - ], - "relative_position": false, - "store_key": "goal_pose" - } - } - }, - "observations": { - "robot_qpos": { - "func": "normalize_robot_joint_data", - "mode": "modify", - "name": "robot/qpos", - "params": { - "joint_ids": [0, 1, 2, 3, 4, 5] - } - }, - "robot_ee_pos": { - "func": "get_robot_eef_pose", - "mode": "add", - "name": "robot/ee_pos", - "params": { - "part_name": "arm" - } - }, - "cube_pos": { - "func": "get_rigid_object_pose", - "mode": "add", - "name": "object/cube_pos", - "params": { - "entity_cfg": { - "uid": "cube" - } - } - }, - "goal_pos": { - "func": "target_position", - "mode": "add", - "name": "object/goal_pos", - "params": { - "target_pose_key": "goal_pose" - } - } - }, - "rewards": { - "reaching_reward": { - "func": "reaching_behind_object", - "mode": "add", - "weight": 0.1, - "params": { - "object_cfg": { - "uid": "cube" - }, - "target_pose_key": "goal_pose", - "behind_offset": 0.015, - "height_offset": 0.015, - "distance_scale": 5.0, - "part_name": "arm" - } - }, - "place_reward": { - "func": "incremental_distance_to_target", - "mode": "add", - "weight": 1.0, - "params": { - "source_entity_cfg": { - "uid": "cube" - }, - "target_pose_key": "goal_pose", - "tanh_scale": 10.0, - "positive_weight": 2.0, - "negative_weight": 0.5, - "use_xy_only": true - } - }, - "action_penalty": { - "func": "action_smoothness_penalty", - "mode": "add", - "weight": 0.01, - "params": {} - }, - "success_bonus": { - "func": "success_reward", - "mode": "add", - "weight": 10.0, - "params": {} - } - }, - "extensions": { - "action_type": "delta_qpos", - "episode_length": 100, - "action_scale": 0.1, - "success_threshold": 0.1 - } - }, - "robot": { - "uid": "Manipulator", - "urdf_cfg": { - "components": [ - { - "component_type": "arm", - "urdf_path": "UniversalRobots/UR10/UR10.urdf" - }, - { - 
"component_type": "hand", - "urdf_path": "DH_PGI_140_80/DH_PGI_140_80.urdf" - } - ] - }, - "init_pos": [0.0, 0.0, 0.0], - "init_rot": [0.0, 0.0, 0.0], - "init_qpos": [0.0, -1.57, 1.57, -1.57, -1.57, 0.0, 0.04, 0.04], - "drive_pros": { - "drive_type": "force", - "stiffness": 100000.0, - "damping": 1000.0, - "max_velocity": 2.0, - "max_effort": 500.0 - }, - "solver_cfg": { - "arm": { - "class_type": "PytorchSolver", - "end_link_name": "ee_link", - "root_link_name": "base_link", - "tcp": [ - [1.0, 0.0, 0.0, 0.0], - [0.0, 1.0, 0.0, 0.0], - [0.0, 0.0, 1.0, 0.16], - [0.0, 0.0, 0.0, 1.0] - ] - } - }, - "control_parts": { - "arm": ["JOINT[1-6]"] - } - }, - "sensor": [ - { - "sensor_type": "Camera", - "uid": "wrist_camera", - "width": 120, - "height": 120, - "intrinsics": [90.0, 90.0, 64.0, 64.0], - "extrinsics": { - "parent": "ee_link" - } - }, - { - "sensor_type": "Camera", - "uid": "third_person_camera", - "width": 120, - "height": 120, - "intrinsics": [90.0, 90.0, 64.0, 64.0], - "extrinsics": { - "eye": [0.5, 0.5, 0.8], - "target": [0.0, 0.0, 0.3], - "up": [0.0, 0.0, 1.0] - } - } - ], - "light": {}, - "background": [], - "rigid_object": [ - { - "uid": "cube", - "shape": { - "shape_type": "Cube", - "size": [0.1, 0.1, 0.1] - }, - "body_type": "dynamic", - "init_pos": [-0.6, -0.4, 0.05], - "attrs": { - "mass": 10.0, - "static_friction": 3.0, - "dynamic_friction": 2.0, - "linear_damping": 2.0, - "angular_damping": 2.0, - "contact_offset": 0.003, - "rest_offset": 0.001, - "restitution": 0.1, - "max_depenetration_velocity": 10.0, - "max_linear_velocity": 1.0, - "max_angular_velocity": 1.0 - } - } - ], - "rigid_object_group": [], - "articulation": [] -} From 6399c1390bf10422de3baa31ecdfe92d2aa4fcb0 Mon Sep 17 00:00:00 2001 From: yuanhaonan Date: Thu, 22 Jan 2026 19:02:22 +0800 Subject: [PATCH 10/12] update rl unit test --- embodichain/agents/rl/train.py | 14 +++ tests/agents/test_rl.py | 165 ++++++++++++++++++++------------- 2 files changed, 114 insertions(+), 65 deletions(-) diff --git a/embodichain/agents/rl/train.py b/embodichain/agents/rl/train.py index 89055cb8..b7e292e3 100644 --- a/embodichain/agents/rl/train.py +++ b/embodichain/agents/rl/train.py @@ -282,6 +282,20 @@ def train_from_config(config_path: str): wandb.finish() except Exception: pass + + # Clean up environments to prevent resource leaks + try: + if env is not None: + env.close() + except Exception as e: + logger.log_warning(f"Failed to close training environment: {e}") + + try: + if eval_env is not None: + eval_env.close() + except Exception as e: + logger.log_warning(f"Failed to close evaluation environment: {e}") + logger.log_info("Training finished") diff --git a/tests/agents/test_rl.py b/tests/agents/test_rl.py index 827e670f..475ffa1d 100644 --- a/tests/agents/test_rl.py +++ b/tests/agents/test_rl.py @@ -18,78 +18,113 @@ import json import pytest import tempfile +from pathlib import Path -def test_rl_training(): - """Test RL training pipeline by running a few iterations.""" - # Load the existing push_cube config - train_config_path = "configs/agents/rl/push_cube/train_config.json" - gym_config_path = "configs/agents/rl/push_cube/gym_config.json" - - with open(train_config_path, "r") as f: - train_config = json.load(f) - - with open(gym_config_path, "r") as f: - gym_config = json.load(f) - - # Add dataset configuration dynamically to gym_config - gym_config["env"]["dataset"] = { - "lerobot": { - "func": "LeRobotRecorder", - "mode": "save", - "params": { - "robot_meta": { - "robot_type": "UR10_DH_Gripper", - "control_freq": 25, 
- "arm_dofs": 6, - "observation": {"vision": {}, "states": ["qpos"]}, - }, - "instruction": {"lang": "push_cube_to_target"}, - "extra": { - "scene_type": "tabletop", - "task_description": "push_cube_rl_test", +class TestRLTraining: + """Test suite for RL training pipeline.""" + + temp_gym_config_path = None + temp_train_config_path = None + + def setup_method(self): + """Set up test configuration before each test method.""" + # Load the existing push_cube config + train_config_path = "configs/agents/rl/push_cube/train_config.json" + gym_config_path = "configs/agents/rl/push_cube/gym_config.json" + + with open(train_config_path, "r") as f: + train_config = json.load(f) + + with open(gym_config_path, "r") as f: + gym_config = json.load(f) + + # Add dataset configuration dynamically to gym_config + gym_config["env"]["dataset"] = { + "lerobot": { + "func": "LeRobotRecorder", + "mode": "save", + "params": { + "robot_meta": { + "robot_type": "UR10_DH_Gripper", + "control_freq": 25, + "arm_dofs": 6, + "observation": {"vision": {}, "states": ["qpos"]}, + }, + "instruction": {"lang": "push_cube_to_target"}, + "extra": { + "scene_type": "tabletop", + "task_description": "push_cube_rl_test", + }, + "use_videos": False, + "export_success_only": False, }, - "use_videos": False, - "export_success_only": False, - }, + } } - } - - # Create temporary gym config with dataset - with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: - json.dump(gym_config, f) - temp_gym_config_path = f.name - - # Create temporary train config with reduced iterations for testing - test_train_config = train_config.copy() - test_train_config["trainer"][ - "gym_config" - ] = temp_gym_config_path # Point to temp gym config - test_train_config["trainer"]["iterations"] = 2 # Only 2 iterations for testing - test_train_config["trainer"]["rollout_steps"] = 32 # Fewer rollout steps - test_train_config["trainer"]["eval_freq"] = 1000000 # Disable eval - test_train_config["trainer"]["save_freq"] = 1000000 # Disable save - test_train_config["trainer"]["headless"] = True - test_train_config["trainer"]["use_wandb"] = False - test_train_config["trainer"]["num_envs"] = 2 # Only 2 parallel envs for testing - - # Save temporary train config - with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: - json.dump(test_train_config, f) - temp_train_config_path = f.name - - try: - from embodichain.agents.rl.train import train_from_config - train_from_config(temp_train_config_path) - assert True + # Create temporary gym config with dataset + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + json.dump(gym_config, f) + self.temp_gym_config_path = f.name + + # Create temporary train config with reduced iterations for testing + test_train_config = train_config.copy() + test_train_config["trainer"]["gym_config"] = self.temp_gym_config_path + test_train_config["trainer"]["iterations"] = 2 + test_train_config["trainer"]["rollout_steps"] = 32 + test_train_config["trainer"]["eval_freq"] = 1000000 # Disable eval + test_train_config["trainer"]["save_freq"] = 1000000 # Disable save + test_train_config["trainer"]["headless"] = True + test_train_config["trainer"]["use_wandb"] = False + test_train_config["trainer"]["num_envs"] = 2 + + # Save temporary train config + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + json.dump(test_train_config, f) + self.temp_train_config_path = f.name - finally: + def teardown_method(self): + """Clean up resources after each 
test method.""" # Clean up temporary files - if os.path.exists(temp_train_config_path): - os.remove(temp_train_config_path) - if os.path.exists(temp_gym_config_path): - os.remove(temp_gym_config_path) + if self.temp_train_config_path and os.path.exists(self.temp_train_config_path): + os.remove(self.temp_train_config_path) + self.temp_train_config_path = None + + if self.temp_gym_config_path and os.path.exists(self.temp_gym_config_path): + os.remove(self.temp_gym_config_path) + self.temp_gym_config_path = None + + def test_training_pipeline(self): + """Test basic RL training pipeline with minimal iterations.""" + from embodichain.agents.rl.train import train_from_config + + # This should run without errors + train_from_config(self.temp_train_config_path) + + @pytest.mark.parametrize("num_envs", [1, 2, 4]) + def test_multi_env_training(self, num_envs: int): + """Test training with different numbers of parallel environments.""" + # Reload and modify config for this specific test + with open(self.temp_train_config_path, "r") as f: + config = json.load(f) + + config["trainer"]["num_envs"] = num_envs + config["trainer"][ + "iterations" + ] = 1 # Even fewer iterations for parameterized test + + # Save modified config + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + json.dump(config, f) + temp_config = f.name + + try: + from embodichain.agents.rl.train import train_from_config + + train_from_config(temp_config) + finally: + if os.path.exists(temp_config): + os.remove(temp_config) if __name__ == "__main__": From 924f7afb360e64aba17e4cc85915db0f1bc400e5 Mon Sep 17 00:00:00 2001 From: yuecideng Date: Thu, 22 Jan 2026 19:12:42 +0800 Subject: [PATCH 11/12] fix ci issue --- .github/workflows/main.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 63b697e6..f17f3ebe 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -54,6 +54,7 @@ jobs: echo "Start Building docs..." pip uninstall pymeshlab -y pip install pymeshlab==2023.12.post3 + pip install matplotlib==3.10.8 make html - name: Upload docs artifact if: github.event_name == 'push' && github.ref == 'refs/heads/main' From 1f66db16f8d6073f6d285f5d9a66987402db8c5c Mon Sep 17 00:00:00 2001 From: yuecideng Date: Thu, 22 Jan 2026 19:26:41 +0800 Subject: [PATCH 12/12] wip --- .github/workflows/main.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index f17f3ebe..0b83af33 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -48,13 +48,12 @@ jobs: - uses: actions/checkout@v4 - name: Build docs run: | - pip install -e .[lerobot] --extra-index-url http://pyp.open3dv.site:2345/simple/ --trusted-host pyp.open3dv.site + pip install -e . --extra-index-url http://pyp.open3dv.site:2345/simple/ --trusted-host pyp.open3dv.site pip install -r docs/requirements.txt cd ${GITHUB_WORKSPACE}/docs echo "Start Building docs..." pip uninstall pymeshlab -y pip install pymeshlab==2023.12.post3 - pip install matplotlib==3.10.8 make html - name: Upload docs artifact if: github.event_name == 'push' && github.ref == 'refs/heads/main' @@ -76,7 +75,7 @@ jobs: - uses: actions/checkout@v4 - name: Run tests run: | - pip install -e . 
--extra-index-url http://pyp.open3dv.site:2345/simple/ --trusted-host pyp.open3dv.site + pip install -e .[lerobot] --extra-index-url http://pyp.open3dv.site:2345/simple/ --trusted-host pyp.open3dv.site echo "Unit test Start" export HF_ENDPOINT=https://hf-mirror.com pip uninstall pymeshlab -y
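The test refactor in patch 10 repeats a small load-override-dump pattern for both the gym and train configs: read the JSON, patch a few trainer keys, and write the result to a temporary file that the trainer is then pointed at. A compact sketch of that pattern using only the standard library is shown below; the helper name `make_temp_config` is illustrative, while the override values mirror the ones set in `setup_method`.

    import json
    import tempfile

    def make_temp_config(base_path: str, overrides: dict) -> str:
        # Load the base JSON config, apply overrides to the "trainer" section,
        # and write the result to a temporary file whose path is returned.
        with open(base_path, "r") as f:
            config = json.load(f)
        config["trainer"].update(overrides)
        with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
            json.dump(config, f)
            return f.name

    # Usage mirroring the smoke test: tiny rollout, no logging, two parallel envs.
    # path = make_temp_config(
    #     "configs/agents/rl/push_cube/train_config.json",
    #     {"iterations": 2, "rollout_steps": 32, "num_envs": 2,
    #      "use_wandb": False, "headless": True},
    # )
    # The caller removes the file afterwards, e.g. in a finally block or teardown_method.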