From 9a49bedb889a98016d81ef8ffa1f158a7e290041 Mon Sep 17 00:00:00 2001 From: yuanhaonan Date: Tue, 20 Jan 2026 15:09:41 +0800 Subject: [PATCH 01/12] action preprocess --- configs/agents/rl/push_cube/gym_config.json | 1 + configs/agents/rl/push_cube/train_config.json | 4 +- embodichain/agents/rl/algo/ppo.py | 6 +- embodichain/agents/rl/utils/trainer.py | 9 +- embodichain/lab/gym/envs/__init__.py | 2 + embodichain/lab/gym/envs/base_env.py | 18 +- embodichain/lab/gym/envs/embodied_env.py | 23 +- embodichain/lab/gym/envs/managers/rewards.py | 29 +- embodichain/lab/gym/envs/rl_env.py | 295 ++++++++++++++++++ .../lab/gym/envs/tasks/rl/push_cube.py | 103 +----- 10 files changed, 383 insertions(+), 107 deletions(-) create mode 100644 embodichain/lab/gym/envs/rl_env.py diff --git a/configs/agents/rl/push_cube/gym_config.json b/configs/agents/rl/push_cube/gym_config.json index 766d12c4..fce093dd 100644 --- a/configs/agents/rl/push_cube/gym_config.json +++ b/configs/agents/rl/push_cube/gym_config.json @@ -114,6 +114,7 @@ }, "extensions": { "obs_mode": "state", + "action_type": "delta_qpos", "episode_length": 100, "joint_limits": 0.5, "action_scale": 0.1, diff --git a/configs/agents/rl/push_cube/train_config.json b/configs/agents/rl/push_cube/train_config.json index 3c3bda05..3b5c8e3a 100644 --- a/configs/agents/rl/push_cube/train_config.json +++ b/configs/agents/rl/push_cube/train_config.json @@ -4,10 +4,10 @@ "gym_config": "configs/agents/rl/push_cube/gym_config.json", "seed": 42, "device": "cuda:0", - "headless": false, + "headless": true, "enable_rt": false, "gpu_id": 0, - "num_envs": 8, + "num_envs": 512, "iterations": 1000, "rollout_steps": 1024, "eval_freq": 200, diff --git a/embodichain/agents/rl/algo/ppo.py b/embodichain/agents/rl/algo/ppo.py index 5253e897..8f26905b 100644 --- a/embodichain/agents/rl/algo/ppo.py +++ b/embodichain/agents/rl/algo/ppo.py @@ -94,8 +94,12 @@ def collect_rollout( current_obs, deterministic=False ) + # Wrap action as dict for env processing + action_type = getattr(env, "action_type", "qpos") + action_dict = {action_type: actions} + # Step environment - result = env.step(actions) + result = env.step(action_dict) next_obs, reward, terminated, truncated, env_info = result done = terminated | truncated # Light dtype normalization diff --git a/embodichain/agents/rl/utils/trainer.py b/embodichain/agents/rl/utils/trainer.py index 34dc1919..94efaecf 100644 --- a/embodichain/agents/rl/utils/trainer.py +++ b/embodichain/agents/rl/utils/trainer.py @@ -239,7 +239,14 @@ def _eval_once(self, num_episodes: int = 5): ep_ret = torch.zeros(num_envs_eval, dtype=torch.float32, device=self.device) while not done_any.any(): actions, _, _ = self.policy.get_action(obs, deterministic=True) - obs, reward, terminated, truncated, info = self.eval_env.step(actions) + + # Wrap action as dict for env processing + action_type = getattr(self.eval_env, "action_type", "qpos") + action_dict = {action_type: actions} + + obs, reward, terminated, truncated, info = self.eval_env.step( + action_dict + ) # Flatten dict observation from step if isinstance(obs, dict): diff --git a/embodichain/lab/gym/envs/__init__.py b/embodichain/lab/gym/envs/__init__.py index 88257690..4bd4321f 100644 --- a/embodichain/lab/gym/envs/__init__.py +++ b/embodichain/lab/gym/envs/__init__.py @@ -16,10 +16,12 @@ from .base_env import * from .embodied_env import * +from .rl_env import * from .tasks import * from .wrapper import * from embodichain.lab.gym.envs.embodied_env import EmbodiedEnv +from embodichain.lab.gym.envs.rl_env 
import RLEnv # Specific task environments from embodichain.lab.gym.envs.tasks.tableware.pour_water.pour_water import ( diff --git a/embodichain/lab/gym/envs/base_env.py b/embodichain/lab/gym/envs/base_env.py index 99743607..fd0f873c 100644 --- a/embodichain/lab/gym/envs/base_env.py +++ b/embodichain/lab/gym/envs/base_env.py @@ -482,6 +482,22 @@ def get_reward( return rewards + def _preprocess_action(self, action: EnvAction) -> EnvAction: + """Preprocess action before sending to robot. + + Override this method to add custom preprocessing like: + - Action scaling + - Coordinate transformation (e.g., EEF pose to joint positions) + - Action space conversion + + Args: + action: Raw action from policy + + Returns: + Preprocessed action + """ + return action + def _step_action(self, action: EnvAction) -> EnvAction: """Set action control command into simulation. @@ -536,7 +552,7 @@ def step( """ self._elapsed_steps += 1 - # TODO: may be add hook for action preprocessing. + action = self._preprocess_action(action=action) action = self._step_action(action=action) self.sim.update(self.sim_cfg.physics_dt, self.cfg.sim_steps_per_control) self._update_sim_state(**kwargs) diff --git a/embodichain/lab/gym/envs/embodied_env.py b/embodichain/lab/gym/envs/embodied_env.py index 4e6b1b98..c449c10a 100644 --- a/embodichain/lab/gym/envs/embodied_env.py +++ b/embodichain/lab/gym/envs/embodied_env.py @@ -344,16 +344,31 @@ def _initialize_episode( def _step_action(self, action: EnvAction) -> EnvAction: """Set action control command into simulation. + Supports multiple action formats: + 1. torch.Tensor: Interpreted as qpos (joint positions) + 2. Dict with keys: + - "qpos": Joint positions + - "qvel": Joint velocities + - "qf": Joint forces/torques + Args: action: The action applied to the robot agent. Returns: The action return. """ - # TODO: Support data structure action input such as struct. - qpos = action - - self.robot.set_qpos(qpos=qpos) + if isinstance(action, dict): + # Support multiple control modes simultaneously + if "qpos" in action: + self.robot.set_qpos(qpos=action["qpos"]) + if "qvel" in action: + self.robot.set_qvel(qvel=action["qvel"]) + if "qf" in action: + self.robot.set_qf(qf=action["qf"]) + elif isinstance(action, torch.Tensor): + self.robot.set_qpos(qpos=action) + else: + logger.error(f"Unsupported action type: {type(action)}") return action diff --git a/embodichain/lab/gym/envs/managers/rewards.py b/embodichain/lab/gym/envs/managers/rewards.py index 5cfbef55..2ac38ea2 100644 --- a/embodichain/lab/gym/envs/managers/rewards.py +++ b/embodichain/lab/gym/envs/managers/rewards.py @@ -30,7 +30,7 @@ def distance_between_objects( env: EmbodiedEnv, obs: dict, - action: torch.Tensor, + action: dict, info: dict, source_entity_cfg: SceneEntityCfg = None, target_entity_cfg: SceneEntityCfg = None, @@ -92,7 +92,7 @@ def distance_between_objects( def joint_velocity_penalty( env: EmbodiedEnv, obs: dict, - action: torch.Tensor, + action: dict, info: dict, robot_uid: str = "robot", joint_ids: slice | list[int] | None = None, @@ -146,7 +146,7 @@ def joint_velocity_penalty( def action_smoothness_penalty( env: EmbodiedEnv, obs: dict, - action: torch.Tensor, + action: dict, info: dict, ) -> torch.Tensor: """Penalize large action changes between consecutive timesteps. 
@@ -171,20 +171,27 @@ def action_smoothness_penalty( } ``` """ + # Extract action tensor from dict (priority order) + action_tensor = None + for key in ["qpos", "delta_qpos", "qvel", "qf", "eef_pose"]: + if key in action: + action_tensor = action[key] + break + # Use dictionary-based state management if not hasattr(env, "_reward_states"): env._reward_states = {} # compute difference between current and previous action if "prev_actions" in env._reward_states: - action_diff = action - env._reward_states["prev_actions"] + action_diff = action_tensor - env._reward_states["prev_actions"] penalty = -torch.norm(action_diff, dim=-1) else: # no previous action, no penalty penalty = torch.zeros(env.num_envs, device=env.device) # store current action for next step - env._reward_states["prev_actions"] = action.clone() + env._reward_states["prev_actions"] = action_tensor.clone() return penalty @@ -192,7 +199,7 @@ def action_smoothness_penalty( def joint_limit_penalty( env: EmbodiedEnv, obs: dict, - action: torch.Tensor, + action: dict, info: dict, robot_uid: str = "robot", joint_ids: slice | list[int] = slice(None), @@ -258,7 +265,7 @@ def joint_limit_penalty( def orientation_alignment( env: EmbodiedEnv, obs: dict, - action: torch.Tensor, + action: dict, info: dict, source_entity_cfg: SceneEntityCfg = None, target_entity_cfg: SceneEntityCfg = None, @@ -313,7 +320,7 @@ def orientation_alignment( def success_reward( env: EmbodiedEnv, obs: dict, - action: torch.Tensor, + action: dict, info: dict, ) -> torch.Tensor: """Sparse bonus reward when task succeeds. @@ -361,7 +368,7 @@ def success_reward( def reaching_behind_object( env: EmbodiedEnv, obs: dict, - action: torch.Tensor, + action: dict, info: dict, object_cfg: SceneEntityCfg = None, target_pose_key: str = "goal_pose", @@ -452,7 +459,7 @@ def reaching_behind_object( def distance_to_target( env: "EmbodiedEnv", obs: dict, - action: torch.Tensor, + action: dict, info: dict, source_entity_cfg: SceneEntityCfg = None, target_pose_key: str = "target_pose", @@ -529,7 +536,7 @@ def distance_to_target( def incremental_distance_to_target( env: "EmbodiedEnv", obs: dict, - action: torch.Tensor, + action: dict, info: dict, source_entity_cfg: SceneEntityCfg = None, target_pose_key: str = "target_pose", diff --git a/embodichain/lab/gym/envs/rl_env.py b/embodichain/lab/gym/envs/rl_env.py new file mode 100644 index 00000000..986c9f31 --- /dev/null +++ b/embodichain/lab/gym/envs/rl_env.py @@ -0,0 +1,295 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2025 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ---------------------------------------------------------------------------- + +"""Base environment for reinforcement learning tasks.""" + +import torch +from typing import Dict, Any, Sequence, Optional, Tuple, Literal + +from embodichain.lab.gym.envs import EmbodiedEnv, EmbodiedEnvCfg +from embodichain.lab.sim.cfg import MarkerCfg +from embodichain.lab.sim.types import EnvObs, EnvAction +from embodichain.utils.math import matrix_from_quat, matrix_from_euler + + +__all__ = ["RLEnv"] + + +class RLEnv(EmbodiedEnv): + """Base class for reinforcement learning tasks. + + Provides common utilities for RL tasks: + - Goal pose management and visualization + - Flexible action preprocessing (scaling, IK, normalization) + - Standardized info dictionary structure + """ + + def __init__(self, cfg: EmbodiedEnvCfg = None, **kwargs): + if cfg is None: + cfg = EmbodiedEnvCfg() + super().__init__(cfg, **kwargs) + + @property + def goal_pose(self) -> Optional[torch.Tensor]: + """Get current goal poses (4x4 matrices) for all environments.""" + return self._goal_pose + + def draw_goal_marker(self, env_ids: Sequence[int] | None = None): + """Draw axis markers at goal positions for visualization. + + Args: + env_ids: Environment IDs to draw markers for (None = all envs) + """ + if self.goal_pose is None: + return + + if env_ids is None: + env_ids = range(self.num_envs) + + for arena_idx in env_ids: + marker_name = f"goal_marker_{arena_idx}" + self.sim.remove_marker(marker_name) + goal_pose = self.goal_pose[arena_idx].detach().cpu().numpy() + marker_cfg = MarkerCfg( + name=marker_name, + marker_type="axis", + axis_xpos=[goal_pose], + axis_size=0.003, + axis_len=0.02, + arena_index=arena_idx, + ) + self.sim.draw_marker(cfg=marker_cfg) + + def _preprocess_action(self, action: EnvAction) -> EnvAction: + """Preprocess action for RL tasks with flexible transformation. + + Supports multiple action formats: + 1. Dict input (keys specify action type): + - {"delta_qpos": tensor}: Delta joint positions (scaled and added to current) + - {"qpos": tensor}: Absolute joint positions (scaled) + - {"qpos_normalized": tensor}: Normalized qpos in [-1, 1] + - {"eef_pose": tensor}: End-effector pose (6D or 7D) converted via IK + - {"qvel": tensor}: Joint velocities (scaled) + - {"qf": tensor}: Joint forces/torques (scaled) + + 2. 
Tensor input: Interpreted based on self.action_type attribute + (default: "qpos") + + Args: + action: Raw action from policy (tensor or dict) + + Returns: + Dict action ready for robot control + """ + # Convert tensor input to dict based on action_type + if not isinstance(action, dict): + action_type = getattr(self, "action_type", "qpos") + action = {action_type: action} + + # Step 1: Scale all action values by action_scale + scaled_action = {} + for key, value in action.items(): + if isinstance(value, torch.Tensor): + scaled_action[key] = value * self.action_scale + else: + scaled_action[key] = value + + # Step 2: Process based on dict keys + result = {} + + if "qpos" in scaled_action: + result["qpos"] = scaled_action["qpos"] + elif "delta_qpos" in scaled_action: + result["qpos"] = self._process_delta_qpos(scaled_action["delta_qpos"]) + elif "qpos_normalized" in scaled_action: + result["qpos"] = self._denormalize_action(scaled_action["qpos_normalized"]) + elif "eef_pose" in scaled_action: + result["qpos"] = self._process_eef_pose(scaled_action["eef_pose"]) + + # Velocity and force controls + if "qvel" in scaled_action: + result["qvel"] = scaled_action["qvel"] + + if "qf" in scaled_action: + result["qf"] = scaled_action["qf"] + + return result + + def _denormalize_action(self, action: torch.Tensor) -> torch.Tensor: + """Denormalize action from [-1, 1] to actual range. + + Args: + action: Normalized action in [-1, 1] + + Returns: + Denormalized action + """ + qpos_limits = self.robot.body_data.qpos_limits[0] + low = qpos_limits[:, 0] + high = qpos_limits[:, 1] + + # Map [-1, 1] to [low, high] + return low + (action + 1.0) * 0.5 * (high - low) + + def _process_delta_qpos(self, action: torch.Tensor) -> torch.Tensor: + """Process delta joint position action. + + Args: + action: Delta joint positions + + Returns: + Absolute joint positions + """ + current_qpos = self.robot.get_qpos() + return current_qpos + action + + def _process_eef_pose(self, action: torch.Tensor) -> torch.Tensor: + """Process end-effector pose action via inverse kinematics. + + Args: + action: End-effector pose (position + orientation) + Shape: (num_envs, 6) for pos+euler or (num_envs, 7) for pos+quat + + Returns: + Joint positions from IK + """ + # Auto-select control part if not specified + control_part_name = self.control_part_name + if control_part_name is None: + if self.robot.control_parts is None: + raise ValueError( + "Robot has no control_parts defined for eef_pose action" + ) + + available_parts = list(self.robot.control_parts.keys()) + if not available_parts: + raise ValueError( + "Robot has no control_parts defined for eef_pose action" + ) + + if len(available_parts) == 1: + control_part_name = available_parts[0] + else: + raise ValueError( + f"Multiple control_parts found {available_parts}. " + f"Please specify control_part_name explicitly." 
+ ) + + # Get current joint positions as IK seed + current_qpos = self.robot.get_qpos() + control_joint_ids = self.robot.get_joint_ids(name=control_part_name) + seed_qpos = current_qpos[:, control_joint_ids] + + # Convert action to target pose matrix (4x4) + batch_size = action.shape[0] + target_pose = ( + torch.eye(4, device=self.device).unsqueeze(0).repeat(batch_size, 1, 1) + ) + + if action.shape[-1] == 6: + # pos (3) + euler angles (3) + target_pose[:, :3, 3] = action[:, :3] + target_pose[:, :3, :3] = matrix_from_euler(action[:, 3:6]) + elif action.shape[-1] == 7: + # pos (3) + quaternion (4) + target_pose[:, :3, 3] = action[:, :3] + target_pose[:, :3, :3] = matrix_from_quat(action[:, 3:7]) + else: + raise ValueError( + f"EEF pose action must be 6D or 7D, got {action.shape[-1]}D" + ) + + # Solve IK for each environment + ik_solutions = [] + for env_idx in range(self.num_envs): + qpos_ik = self.robot.compute_ik( + pose=target_pose[env_idx], + joint_seed=seed_qpos[env_idx], + name=control_part_name, + ) + ik_solutions.append(qpos_ik) + + # Update full qpos with IK solutions + result_qpos = current_qpos.clone() + ik_qpos = torch.stack(ik_solutions, dim=0) + result_qpos[:, control_joint_ids] = ik_qpos + + return result_qpos + + def compute_task_state( + self, **kwargs + ) -> Tuple[torch.Tensor, torch.Tensor, Dict[str, Any]]: + """Compute task-specific state: success, fail, and metrics. + + Override this method in subclass to define task-specific logic. + + Returns: + Tuple of (success, fail, metrics): + - success: Boolean tensor of shape (num_envs,) + - fail: Boolean tensor of shape (num_envs,) + - metrics: Dict of metric tensors + """ + success = torch.zeros(self.num_envs, device=self.device, dtype=torch.bool) + fail = torch.zeros(self.num_envs, device=self.device, dtype=torch.bool) + metrics = {} + return success, fail, metrics + + def get_info(self, **kwargs) -> Dict[str, Any]: + """Get environment info dictionary. + + Calls compute_task_state() to get task-specific success/fail/metrics. + Subclasses should override compute_task_state() instead of this method. + + Returns: + Info dictionary with success, fail, elapsed_steps, goal_pose, metrics + """ + success, fail, metrics = self.compute_task_state(**kwargs) + + info = { + "success": success, + "fail": fail, + "elapsed_steps": self._elapsed_steps, + "goal_pose": self.goal_pose, + } + info["metrics"] = metrics + return info + + def check_truncated(self, obs: EnvObs, info: Dict[str, Any]) -> torch.Tensor: + """Check if episode should be truncated (timeout). + + Args: + obs: Current observation + info: Info dictionary + + Returns: + Boolean tensor of shape (num_envs,) + """ + return self._elapsed_steps >= self.episode_length + + def evaluate(self, **kwargs) -> Dict[str, Any]: + """Evaluate the environment state. 
+ + Returns: + Evaluation dictionary with success and metrics + """ + info = self.get_info(**kwargs) + eval_dict = { + "success": info["success"][0].item(), + } + if "metrics" in info: + for key, value in info["metrics"].items(): + eval_dict[key] = value + return eval_dict diff --git a/embodichain/lab/gym/envs/tasks/rl/push_cube.py b/embodichain/lab/gym/envs/tasks/rl/push_cube.py index 072574fc..276a433a 100644 --- a/embodichain/lab/gym/envs/tasks/rl/push_cube.py +++ b/embodichain/lab/gym/envs/tasks/rl/push_cube.py @@ -15,19 +15,16 @@ # ---------------------------------------------------------------------------- import torch -import numpy as np -from typing import Dict, Any, Sequence -from gymnasium import spaces +from typing import Dict, Any, Tuple from embodichain.lab.gym.utils.registration import register_env -from embodichain.lab.gym.envs import EmbodiedEnv, EmbodiedEnvCfg -from embodichain.lab.sim.cfg import MarkerCfg -from embodichain.lab.sim.types import EnvObs, EnvAction -from embodichain.utils import logger +from embodichain.lab.gym.envs.rl_env import RLEnv +from embodichain.lab.gym.envs import EmbodiedEnvCfg +from embodichain.lab.sim.types import EnvObs @register_env("PushCubeRL", max_episode_steps=50, override=True) -class PushCubeEnv(EmbodiedEnv): +class PushCubeEnv(RLEnv): """Push cube task for reinforcement learning. The task involves pushing a cube to a target goal position using a robotic arm. @@ -37,100 +34,32 @@ class PushCubeEnv(EmbodiedEnv): def __init__(self, cfg=None, **kwargs): if cfg is None: cfg = EmbodiedEnvCfg() - super().__init__(cfg, **kwargs) - @property - def goal_pose(self) -> torch.Tensor: - """Get current goal poses (4x4 matrices) for all environments.""" - return self._goal_pose - - def _draw_goal_marker(self): - """Draw axis marker at goal position for visualization.""" - goal_sphere = self.sim.get_rigid_object("goal_sphere") - if goal_sphere is None: - return - - num_envs = self.cfg.num_envs - - # Get actual goal positions from each arena - goal_poses = goal_sphere.get_local_pose(to_matrix=True) # (num_envs, 4, 4) - - # Draw marker for each arena separately - for arena_idx in range(num_envs): - marker_name = f"goal_marker_{arena_idx}" - - self.sim.remove_marker(marker_name) - - goal_pose = goal_poses[arena_idx].detach().cpu().numpy() - marker_cfg = MarkerCfg( - name=marker_name, - marker_type="axis", - axis_xpos=[goal_pose], - axis_size=0.003, - axis_len=0.02, - arena_index=arena_idx, - ) - self.sim.draw_marker(cfg=marker_cfg) - - def _initialize_episode( - self, env_ids: Sequence[int] | None = None, **kwargs - ) -> None: - super()._initialize_episode(env_ids=env_ids, **kwargs) - - # Draw marker at goal position - # self._draw_goal_marker() - - def _step_action(self, action: EnvAction) -> EnvAction: - scaled_action = action * self.action_scale - scaled_action = torch.clamp( - scaled_action, -self.joint_limits, self.joint_limits - ) - current_qpos = self.robot.body_data.qpos - target_qpos = current_qpos.clone() - target_qpos[:, :6] += scaled_action[:, :6] - self.robot.set_qpos(qpos=target_qpos) - return scaled_action - - def get_info(self, **kwargs) -> Dict[str, Any]: + def compute_task_state( + self, **kwargs + ) -> Tuple[torch.Tensor, torch.Tensor, Dict[str, Any]]: cube = self.sim.get_rigid_object("cube") - cube_pos = cube.body_data.pose[:, :3] + cube_pos = cube.get_local_pose(to_matrix=True)[:, :3, 3] - # Get goal position from event-managed goal pose if self.goal_pose is not None: goal_pos = self.goal_pose[:, :3, 3] xy_distance = 
torch.norm(cube_pos[:, :2] - goal_pos[:, :2], dim=1) is_success = xy_distance < self.success_threshold else: - # Goal not yet set by randomize_target_pose event (e.g., before first reset) - xy_distance = torch.zeros(self.cfg.num_envs, device=self.device) + xy_distance = torch.zeros(self.num_envs, device=self.device) is_success = torch.zeros( - self.cfg.num_envs, device=self.device, dtype=torch.bool + self.num_envs, device=self.device, dtype=torch.bool ) - info = { - "success": is_success, - "fail": torch.zeros( - self.cfg.num_envs, device=self.device, dtype=torch.bool - ), - "elapsed_steps": self._elapsed_steps, - "goal_pose": self.goal_pose, - } - info["metrics"] = { - "distance_to_goal": xy_distance, - } - return info + is_fail = torch.zeros(self.num_envs, device=self.device, dtype=torch.bool) + metrics = {"distance_to_goal": xy_distance} + + return is_success, is_fail, metrics def check_truncated(self, obs: EnvObs, info: Dict[str, Any]) -> torch.Tensor: is_timeout = self._elapsed_steps >= self.episode_length cube = self.sim.get_rigid_object("cube") - cube_pos = cube.body_data.pose[:, :3] + cube_pos = cube.get_local_pose(to_matrix=True)[:, :3, 3] is_fallen = cube_pos[:, 2] < -0.1 return is_timeout | is_fallen - - def evaluate(self, **kwargs) -> Dict[str, Any]: - info = self.get_info(**kwargs) - return { - "success": info["success"][0].item(), - "distance_to_goal": info["distance_to_goal"], - } From 61a497ddcbe53e5987aa9152cf8c3115582d83b1 Mon Sep 17 00:00:00 2001 From: yuanhaonan Date: Tue, 20 Jan 2026 15:37:17 +0800 Subject: [PATCH 02/12] wip --- configs/agents/rl/push_cube/gym_config.json | 2 -- docs/source/overview/gym/env.md | 5 +++-- docs/source/tutorial/rl.rst | 2 +- embodichain/agents/rl/algo/ppo.py | 2 +- embodichain/agents/rl/utils/trainer.py | 2 +- embodichain/lab/gym/envs/embodied_env.py | 3 +-- embodichain/lab/gym/envs/managers/rewards.py | 5 ++--- embodichain/lab/gym/envs/rl_env.py | 10 ++++++++-- 8 files changed, 17 insertions(+), 14 deletions(-) diff --git a/configs/agents/rl/push_cube/gym_config.json b/configs/agents/rl/push_cube/gym_config.json index fce093dd..3e0e0445 100644 --- a/configs/agents/rl/push_cube/gym_config.json +++ b/configs/agents/rl/push_cube/gym_config.json @@ -113,10 +113,8 @@ } }, "extensions": { - "obs_mode": "state", "action_type": "delta_qpos", "episode_length": 100, - "joint_limits": 0.5, "action_scale": 0.1, "success_threshold": 0.1 } diff --git a/docs/source/overview/gym/env.md b/docs/source/overview/gym/env.md index a15d26f9..68f4d585 100644 --- a/docs/source/overview/gym/env.md +++ b/docs/source/overview/gym/env.md @@ -77,7 +77,7 @@ The {class}`~envs.EmbodiedEnvCfg` class exposes the following additional paramet Dataset collection settings. Defaults to None, in which case no dataset collection is performed. Please refer to the {class}`~envs.managers.DatasetManager` class for more details. * **extensions** (Union[Dict[str, Any], None]): - Task-specific extension parameters that are automatically bound to the environment instance. This allows passing custom parameters (e.g., ``episode_length``, ``obs_mode``, ``action_scale``) without modifying the base configuration class. These parameters are accessible as instance attributes after environment initialization. For example, if ``extensions = {"episode_length": 500}``, you can access it via ``self.episode_length``. Defaults to None. + Task-specific extension parameters that are automatically bound to the environment instance. 
This allows passing custom parameters (e.g., ``episode_length``, ``action_type``, ``action_scale``) without modifying the base configuration class. These parameters are accessible as instance attributes after environment initialization. For example, if ``extensions = {"episode_length": 500}``, you can access it via ``self.episode_length``. Defaults to None. * **filter_visual_rand** (bool): Whether to filter out visual randomization functors. Useful for debugging motion and physics issues when visual randomization interferes with the debugging process. Defaults to ``False``. @@ -108,7 +108,8 @@ class MyTaskEnvCfg(EmbodiedEnvCfg): # 4. Task Extensions extensions = { # Task-specific parameters "episode_length": 500, - "obs_mode": "state", + "action_type": "delta_qpos", + "action_scale": 0.1, } ``` diff --git a/docs/source/tutorial/rl.rst b/docs/source/tutorial/rl.rst index 7be8c73c..ac05f64f 100644 --- a/docs/source/tutorial/rl.rst +++ b/docs/source/tutorial/rl.rst @@ -86,7 +86,7 @@ Example: "id": "PushCubeRL", "cfg": { "num_envs": 4, - "obs_mode": "state", + "action_type": "delta_qpos", "episode_length": 100, "action_scale": 0.1, "success_threshold": 0.1 diff --git a/embodichain/agents/rl/algo/ppo.py b/embodichain/agents/rl/algo/ppo.py index 8f26905b..f11fbe37 100644 --- a/embodichain/agents/rl/algo/ppo.py +++ b/embodichain/agents/rl/algo/ppo.py @@ -95,7 +95,7 @@ def collect_rollout( ) # Wrap action as dict for env processing - action_type = getattr(env, "action_type", "qpos") + action_type = getattr(env, "action_type", "delta_qpos") action_dict = {action_type: actions} # Step environment diff --git a/embodichain/agents/rl/utils/trainer.py b/embodichain/agents/rl/utils/trainer.py index 94efaecf..333a0bcd 100644 --- a/embodichain/agents/rl/utils/trainer.py +++ b/embodichain/agents/rl/utils/trainer.py @@ -241,7 +241,7 @@ def _eval_once(self, num_episodes: int = 5): actions, _, _ = self.policy.get_action(obs, deterministic=True) # Wrap action as dict for env processing - action_type = getattr(self.eval_env, "action_type", "qpos") + action_type = getattr(self.eval_env, "action_type", "delta_qpos") action_dict = {action_type: actions} obs, reward, terminated, truncated, info = self.eval_env.step( diff --git a/embodichain/lab/gym/envs/embodied_env.py b/embodichain/lab/gym/envs/embodied_env.py index c449c10a..9e787bb2 100644 --- a/embodichain/lab/gym/envs/embodied_env.py +++ b/embodichain/lab/gym/envs/embodied_env.py @@ -110,10 +110,9 @@ class EnvLightCfg: This field can be used to pass additional parameters that are specific to certain environments or tasks without modifying the base configuration class. 
For example: - - obs_mode: Observation mode (e.g., "state", "image") - episode_length: Maximum episode length - - joint_limits: Joint limit constraints - action_scale: Action scaling factor + - action_type: Action type (e.g., "delta_qpos", "qpos", "qvel") - vr_joint_mapping: VR joint mapping for teleoperation - control_frequency: Control frequency for VR teleoperation """ diff --git a/embodichain/lab/gym/envs/managers/rewards.py b/embodichain/lab/gym/envs/managers/rewards.py index 2ac38ea2..dfd2872b 100644 --- a/embodichain/lab/gym/envs/managers/rewards.py +++ b/embodichain/lab/gym/envs/managers/rewards.py @@ -171,9 +171,8 @@ def action_smoothness_penalty( } ``` """ - # Extract action tensor from dict (priority order) - action_tensor = None - for key in ["qpos", "delta_qpos", "qvel", "qf", "eef_pose"]: + # Extract action tensor from dict + for key in ["qpos", "qvel", "qf"]: if key in action: action_tensor = action[key] break diff --git a/embodichain/lab/gym/envs/rl_env.py b/embodichain/lab/gym/envs/rl_env.py index 986c9f31..3a8f4118 100644 --- a/embodichain/lab/gym/envs/rl_env.py +++ b/embodichain/lab/gym/envs/rl_env.py @@ -17,7 +17,7 @@ """Base environment for reinforcement learning tasks.""" import torch -from typing import Dict, Any, Sequence, Optional, Tuple, Literal +from typing import Dict, Any, Sequence, Optional, Tuple from embodichain.lab.gym.envs import EmbodiedEnv, EmbodiedEnvCfg from embodichain.lab.sim.cfg import MarkerCfg @@ -42,6 +42,12 @@ def __init__(self, cfg: EmbodiedEnvCfg = None, **kwargs): cfg = EmbodiedEnvCfg() super().__init__(cfg, **kwargs) + # Set default values for common RL parameters + if not hasattr(self, "action_scale"): + self.action_scale = 1.0 + if not hasattr(self, "episode_length"): + self.episode_length = 1000 + @property def goal_pose(self) -> Optional[torch.Tensor]: """Get current goal poses (4x4 matrices) for all environments.""" @@ -96,7 +102,7 @@ def _preprocess_action(self, action: EnvAction) -> EnvAction: """ # Convert tensor input to dict based on action_type if not isinstance(action, dict): - action_type = getattr(self, "action_type", "qpos") + action_type = getattr(self, "action_type", "delta_qpos") action = {action_type: action} # Step 1: Scale all action values by action_scale From 22af95655240d9ffc513f1f3426f7cca49fa987e Mon Sep 17 00:00:00 2001 From: yuanhaonan Date: Tue, 20 Jan 2026 15:42:08 +0800 Subject: [PATCH 03/12] update rl docs --- docs/source/overview/gym/env.md | 99 +++++++++++++++++++++++++-------- docs/source/tutorial/rl.rst | 70 ++++++++++++++++++----- 2 files changed, 133 insertions(+), 36 deletions(-) diff --git a/docs/source/overview/gym/env.md b/docs/source/overview/gym/env.md index 68f4d585..88f9cac9 100644 --- a/docs/source/overview/gym/env.md +++ b/docs/source/overview/gym/env.md @@ -5,14 +5,19 @@ The {class}`~envs.EmbodiedEnv` is the core environment class in EmbodiChain designed for complex Embodied AI tasks. It adopts a **configuration-driven** architecture, allowing users to define robots, sensors, objects, lighting, and automated behaviors (events) purely through configuration classes, minimizing the need for boilerplate code. +For **Reinforcement Learning** tasks, EmbodiChain provides {class}`~envs.RLEnv`, a specialized subclass that extends {class}`~envs.EmbodiedEnv` with RL-specific utilities such as flexible action preprocessing, goal management, and standardized info structure. 
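+
+The snippet below is a minimal stepping sketch: it assumes ``env`` is an already-constructed {class}`~envs.RLEnv` subclass (such as ``PushCubeRL``) and ``actions`` is the raw policy output tensor; construction from a gym config is omitted here. The action is wrapped in a dict whose key selects the control mode:
+
+```python
+# `env` and `actions` are assumed to exist (see note above).
+# The dict key tells RLEnv._preprocess_action how to interpret the tensor,
+# e.g. "delta_qpos" means scaled increments added to the current joint positions.
+action_type = getattr(env, "action_type", "delta_qpos")
+action_dict = {action_type: actions}
+
+obs, reward, terminated, truncated, info = env.step(action_dict)
+done = terminated | truncated  # single done flag, as the RL trainer expects
+```
+
+Scaling by ``action_scale`` and any IK conversion happen inside ``_preprocess_action``, so the policy stays agnostic to the underlying control mode.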
+ ## Core Architecture -Unlike the standard {class}`~envs.BaseEnv`, the {class}`~envs.EmbodiedEnv` integrates several manager systems to handle the complexity of simulation: +EmbodiChain provides a hierarchy of environment classes for different task types: -* **Scene Management**: Automatically loads and manages robots, sensors, and scene objects defined in the configuration. -* **Event Manager**: Handles automated behaviors such as domain randomization, scene setup, and dynamic asset swapping. -* **Observation Manager**: Allows flexible extension of observation spaces without modifying the environment code. -* **Dataset Manager**: Built-in support for collecting demonstration data during simulation steps. +* **{class}`~envs.BaseEnv`**: Minimal environment for simple tasks with custom simulation logic. +* **{class}`~envs.EmbodiedEnv`**: Feature-rich environment for Embodied AI tasks (IL, custom control). Integrates manager systems: + * **Scene Management**: Automatically loads and manages robots, sensors, and scene objects. + * **Event Manager**: Domain randomization, scene setup, and dynamic asset swapping. + * **Observation Manager**: Flexible observation space extensions. + * **Dataset Manager**: Built-in support for demonstration data collection. +* **{class}`~envs.RLEnv`**: Specialized environment for RL tasks, extending {class}`~envs.EmbodiedEnv` with action preprocessing, goal management, and standardized reward/info structure. ## Configuration System @@ -166,54 +171,104 @@ The manager operates in a single mode ``"save"`` which handles both recording an The dataset manager is called automatically during {meth}`~envs.Env.step()`, ensuring all observation-action pairs are recorded without additional user code. +## Reinforcement Learning Environment + +For RL tasks, EmbodiChain provides {class}`~envs.RLEnv`, a specialized base class that extends {class}`~envs.EmbodiedEnv` with RL-specific utilities: + +* **Action Preprocessing**: Flexible action transformation supporting delta_qpos, absolute qpos, joint velocity, joint force, and end-effector pose (with IK). +* **Goal Management**: Built-in goal pose tracking and visualization with axis markers. +* **Standardized Info Structure**: Template methods for computing task-specific success/failure conditions and metrics. +* **Episode Management**: Configurable episode length and truncation logic. + +### Configuration Extensions for RL + +RL environments use the ``extensions`` field to pass task-specific parameters: + +```python +extensions = { + "action_type": "delta_qpos", # Action type: delta_qpos, qpos, qvel, qf, eef_pose + "action_scale": 0.1, # Scaling factor applied to all actions + "episode_length": 100, # Maximum episode length + "success_threshold": 0.1, # Task-specific success threshold (optional) +} +``` + ## Creating a Custom Task -To create a new task, inherit from {class}`~envs.EmbodiedEnv` and implement the task-specific logic. 
+### For Reinforcement Learning Tasks + +Inherit from {class}`~envs.RLEnv` and implement the task-specific logic: + +```python +from embodichain.lab.gym.envs import RLEnv, EmbodiedEnvCfg +from embodichain.lab.gym.utils.registration import register_env + +@register_env("MyRLTask-v0", max_episode_steps=100) +class MyRLTaskEnv(RLEnv): + def __init__(self, cfg: MyTaskEnvCfg, **kwargs): + super().__init__(cfg, **kwargs) + + def compute_task_state(self, **kwargs): + # Required: Compute task-specific success/failure and metrics + # Returns: Tuple[success, fail, metrics] + # - success: torch.Tensor of shape (num_envs,) with boolean values + # - fail: torch.Tensor of shape (num_envs,) with boolean values + # - metrics: Dict of metric tensors for logging + + is_success = ... # Compute success condition + is_fail = torch.zeros_like(is_success) + metrics = {"distance": ..., "angle_error": ...} + + return is_success, is_fail, metrics + + def check_truncated(self, obs, info): + # Optional: Override to add custom truncation conditions + # Default: episode_length timeout + is_timeout = super().check_truncated(obs, info) + is_fallen = ... # Custom condition (e.g., robot fell) + return is_timeout | is_fallen +``` + +Configure rewards through the {class}`~envs.managers.RewardManager` in your environment config rather than overriding ``get_reward``. + +### For Imitation Learning Tasks + +Inherit from {class}`~envs.EmbodiedEnv` for IL tasks: ```python from embodichain.lab.gym.envs import EmbodiedEnv, EmbodiedEnvCfg from embodichain.lab.gym.utils.registration import register_env -@register_env("MyTask-v0", max_episode_steps=500) -class MyTaskEnv(EmbodiedEnv): +@register_env("MyILTask-v0", max_episode_steps=500) +class MyILTaskEnv(EmbodiedEnv): def __init__(self, cfg: MyTaskEnvCfg, **kwargs): super().__init__(cfg, **kwargs) def create_demo_action_list(self, *args, **kwargs): - # Optional: Implement for expert demonstration data generation (for Imitation Learning) - # This method is used to generate scripted demonstrations for IL data collection. + # Required: Generate scripted demonstrations for data collection # Must set self.action_length = len(action_list) if returning actions pass def is_task_success(self, **kwargs): - # Optional: Define success criteria (mainly for IL data collection) + # Required: Define success criteria for filtering successful episodes # Returns: torch.Tensor of shape (num_envs,) with boolean values return success_tensor - def get_reward(self, obs, action, info): - # Optional: Override for RL tasks - # Returns: torch.Tensor of shape (num_envs,) - return super().get_reward(obs, action, info) - def get_info(self, **kwargs): # Optional: Override to add custom info fields - # Should include "success" and "fail" keys for termination info = super().get_info(**kwargs) info["custom_metric"] = ... return info ``` -```{note} -The {meth}`~envs.EmbodiedEnv.create_demo_action_list` method is specifically designed for expert demonstration data generation in Imitation Learning scenarios. For Reinforcement Learning tasks, you should override the {meth}`~envs.EmbodiedEnv.get_reward` method instead. -``` - For a complete example of a modular environment setup, please refer to the {ref}`tutorial_modular_env` tutorial. 
## See Also - {ref}`tutorial_create_basic_env` - Creating basic environments - {ref}`tutorial_modular_env` - Advanced modular environment setup -- {doc}`/api_reference/embodichain/embodichain.lab.gym.envs` - Complete API reference for EmbodiedEnv and EmbodiedEnvCfg +- {ref}`tutorial_rl` - Reinforcement learning training guide +- {doc}`/api_reference/embodichain/embodichain.lab.gym.envs` - Complete API reference for EmbodiedEnv, RLEnv, and configurations ```{toctree} :maxdepth: 1 diff --git a/docs/source/tutorial/rl.rst b/docs/source/tutorial/rl.rst index ac05f64f..cbc011b2 100644 --- a/docs/source/tutorial/rl.rst +++ b/docs/source/tutorial/rl.rst @@ -78,6 +78,13 @@ The ``env`` section defines the task environment: - **id**: Environment registry ID (e.g., "PushCubeRL") - **cfg**: Environment-specific configuration parameters +For RL environments (inheriting from ``RLEnv``), use the ``extensions`` field for RL-specific parameters: + +- **action_type**: Action type - "delta_qpos" (default), "qpos", "qvel", "qf", "eef_pose" +- **action_scale**: Scaling factor applied to all actions (default: 1.0) +- **episode_length**: Maximum episode length (default: 1000) +- **success_threshold**: Task-specific success threshold (optional) + Example: .. code-block:: json @@ -86,10 +93,12 @@ Example: "id": "PushCubeRL", "cfg": { "num_envs": 4, - "action_type": "delta_qpos", - "episode_length": 100, - "action_scale": 0.1, - "success_threshold": 0.1 + "extensions": { + "action_type": "delta_qpos", + "action_scale": 0.1, + "episode_length": 100, + "success_threshold": 0.1 + } } } @@ -321,41 +330,74 @@ Adding a New Environment To add a new RL environment: -1. Create an environment class inheriting from ``EmbodiedEnv`` -2. Register it with the Gymnasium registry: +1. Create an environment class inheriting from ``RLEnv`` (which provides action preprocessing, goal management, and standardized info structure): .. code-block:: python + from embodichain.lab.gym.envs import RLEnv, EmbodiedEnvCfg from embodichain.lab.gym.utils.registration import register_env + import torch @register_env("MyTaskRL", max_episode_steps=100, override=True) - class MyTaskEnv(EmbodiedEnv): - cfg: MyTaskEnvCfg - ... + class MyTaskEnv(RLEnv): + def __init__(self, cfg: EmbodiedEnvCfg = None, **kwargs): + super().__init__(cfg, **kwargs) + + def compute_task_state(self, **kwargs): + """Compute success/failure conditions and metrics.""" + is_success = ... # Define success condition + is_fail = torch.zeros_like(is_success) + metrics = {"distance": ..., "error": ...} + return is_success, is_fail, metrics + + def check_truncated(self, obs, info): + """Optional: Add custom truncation conditions.""" + is_timeout = super().check_truncated(obs, info) + # Add custom conditions if needed + return is_timeout -3. Use the environment ID in your JSON config: +2. Configure the environment in your JSON config with RL-specific extensions: .. code-block:: json "env": { "id": "MyTaskRL", "cfg": { - ... 
+ "num_envs": 4, + "extensions": { + "action_type": "delta_qpos", + "action_scale": 0.1, + "episode_length": 100, + "success_threshold": 0.05 + } } } +The ``RLEnv`` base class provides: + +- **Action Preprocessing**: Automatically handles different action types (delta_qpos, qpos, qvel, qf, eef_pose) +- **Action Scaling**: Applies ``action_scale`` to all actions +- **Goal Management**: Built-in goal pose tracking and visualization +- **Standardized Info**: Implements ``get_info()`` using ``compute_task_state()`` template method + Best Practices ~~~~~~~~~~~~~~ -- **Device Management**: Device is single-sourced from ``runtime.cuda``. All components (trainer/algorithm/policy/env) share the same device. +- **Use RLEnv for RL Tasks**: Always inherit from ``RLEnv`` for reinforcement learning tasks. It provides action preprocessing, goal management, and standardized info structure out of the box. + +- **Action Type Configuration**: Configure ``action_type`` in the environment's ``extensions`` field. The default is "delta_qpos" (incremental joint positions). Other options: "qpos" (absolute), "qvel" (velocity), "qf" (force), "eef_pose" (end-effector pose with IK). -- **Action Scaling**: Keep action scaling in the environment, not in the policy. +- **Action Scaling**: Use ``action_scale`` in the environment's ``extensions`` field to scale actions. This is applied in ``RLEnv._preprocess_action()`` before robot control. + +- **Device Management**: Device is single-sourced from ``runtime.cuda``. All components (trainer/algorithm/policy/env) share the same device. - **Observation Format**: Environments should provide consistent observation shape/types (torch.float32) and a single ``done = terminated | truncated``. - **Algorithm Interface**: Algorithms must implement ``initialize_buffer()``, ``collect_rollout()``, and ``update()`` methods. The algorithm completely controls data collection and buffer management. -- **Reward Components**: Organize reward components in ``info["rewards"]`` dictionary and metrics in ``info["metrics"]`` dictionary. The trainer performs dense per-step logging directly from environment info. +- **Reward Configuration**: Use the ``RewardManager`` in your environment config to define reward components. Organize reward components in ``info["rewards"]`` dictionary and metrics in ``info["metrics"]`` dictionary. The trainer performs dense per-step logging directly from environment info. + +- **Template Methods**: Override ``compute_task_state()`` to define success/failure conditions and metrics. Override ``check_truncated()`` for custom truncation logic. - **Configuration**: Use JSON for all hyperparameters. This makes experiments reproducible and easy to track. 
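+
+As a reference for the **Reward Configuration** bullet above, the ``rewards`` section of the gym config wires reward functors to weights. A trimmed sketch (functor names taken from the push_cube example config; parameters are illustrative):
+
+.. code-block:: json
+
+    "rewards": {
+        "place_reward": {
+            "func": "incremental_distance_to_target",
+            "mode": "add",
+            "weight": 1.0,
+            "params": {
+                "source_entity_cfg": {"uid": "cube"},
+                "target_pose_key": "goal_pose"
+            }
+        },
+        "success_bonus": {
+            "func": "success_reward",
+            "mode": "add",
+            "weight": 10.0,
+            "params": {}
+        }
+    }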
From e037ceaa5c824b26f9cb7243b36fe4dae1941d83 Mon Sep 17 00:00:00 2001 From: yuanhaonan Date: Wed, 21 Jan 2026 14:50:14 +0800 Subject: [PATCH 04/12] add unit test for RL --- embodichain/agents/rl/train.py | 20 ++++++++++-- tests/agents/test_rl.py | 58 ++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+), 3 deletions(-) create mode 100644 tests/agents/test_rl.py diff --git a/embodichain/agents/rl/train.py b/embodichain/agents/rl/train.py index ca3c2996..89055cb8 100644 --- a/embodichain/agents/rl/train.py +++ b/embodichain/agents/rl/train.py @@ -39,12 +39,20 @@ from embodichain.lab.gym.envs.managers.cfg import EventCfg -def main(): +def parse_args(): + """Parse command line arguments.""" parser = argparse.ArgumentParser() parser.add_argument("--config", type=str, required=True, help="Path to JSON config") - args = parser.parse_args() + return parser.parse_args() + + +def train_from_config(config_path: str): + """Run training from a config file path. - with open(args.config, "r") as f: + Args: + config_path: Path to the JSON config file + """ + with open(config_path, "r") as f: cfg_json = json.load(f) trainer_cfg = cfg_json["trainer"] @@ -277,5 +285,11 @@ def main(): logger.log_info("Training finished") +def main(): + """Main entry point for command-line training.""" + args = parse_args() + train_from_config(args.config) + + if __name__ == "__main__": main() diff --git a/tests/agents/test_rl.py b/tests/agents/test_rl.py new file mode 100644 index 00000000..2588f998 --- /dev/null +++ b/tests/agents/test_rl.py @@ -0,0 +1,58 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2025 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ---------------------------------------------------------------------------- + +import os +import json +import pytest +import tempfile + + +def test_rl_training(): + """Test RL training pipeline by running a few iterations.""" + # Load the existing push_cube config + config_path = "configs/agents/rl/push_cube/train_config.json" + + with open(config_path, "r") as f: + config = json.load(f) + + # Create temporary config with reduced iterations for testing + test_config = config.copy() + test_config["trainer"]["iterations"] = 2 # Only 2 iterations for testing + test_config["trainer"]["rollout_steps"] = 32 # Fewer rollout steps + test_config["trainer"]["eval_freq"] = 1000000 # Disable eval + test_config["trainer"]["save_freq"] = 1000000 # Disable save + test_config["trainer"]["headless"] = True + test_config["trainer"]["use_wandb"] = False + test_config["trainer"]["num_envs"] = 2 # Only 2 parallel envs for testing + + # Save temporary config + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + json.dump(test_config, f) + temp_config_path = f.name + + try: + from embodichain.agents.rl.train import train_from_config + + train_from_config(temp_config_path) + assert True + + finally: + if os.path.exists(temp_config_path): + os.remove(temp_config_path) + + +if __name__ == "__main__": + pytest.main([__file__, "-v", "-s"]) From 24ac4170580d6965e1fb0afee2e21fb33d80a193 Mon Sep 17 00:00:00 2001 From: yuanhaonan Date: Wed, 21 Jan 2026 18:12:39 +0800 Subject: [PATCH 05/12] update rl env and dataset manager --- configs/agents/rl/push_cube/gym_config.json | 25 ++ .../rl/push_cube/gym_config_vision.json | 244 ++++++++++++++++++ configs/agents/rl/push_cube/train_config.json | 2 +- embodichain/lab/gym/envs/embodied_env.py | 46 ++++ embodichain/lab/gym/envs/managers/datasets.py | 152 +++++------ embodichain/lab/gym/envs/managers/rewards.py | 86 +++--- embodichain/lab/gym/envs/rl_env.py | 36 +-- 7 files changed, 455 insertions(+), 136 deletions(-) create mode 100644 configs/agents/rl/push_cube/gym_config_vision.json diff --git a/configs/agents/rl/push_cube/gym_config.json b/configs/agents/rl/push_cube/gym_config.json index 3e0e0445..7858d18b 100644 --- a/configs/agents/rl/push_cube/gym_config.json +++ b/configs/agents/rl/push_cube/gym_config.json @@ -117,6 +117,31 @@ "episode_length": 100, "action_scale": 0.1, "success_threshold": 0.1 + }, + "dataset": { + "lerobot": { + "func": "LeRobotRecorder", + "mode": "save", + "params": { + "robot_meta": { + "robot_type": "UR10_DH_Gripper", + "control_freq": 25, + "arm_dofs": 6, + "observation": { + "vision": {} + } + }, + "instruction": { + "lang": "push_cube_to_target" + }, + "extra": { + "scene_type": "tabletop", + "task_description": "push_cube_rl" + }, + "use_videos": true, + "export_success_only": false + } + } } }, "robot": { diff --git a/configs/agents/rl/push_cube/gym_config_vision.json b/configs/agents/rl/push_cube/gym_config_vision.json new file mode 100644 index 00000000..032d4812 --- /dev/null +++ b/configs/agents/rl/push_cube/gym_config_vision.json @@ -0,0 +1,244 @@ +{ + "id": "PushCubeRL", + "max_episodes": 5, + "env": { + "num_envs": 128, + "sim_steps_per_control": 4, + "events": { + "randomize_cube": { + "func": "randomize_rigid_object_pose", + "mode": "reset", + "params": { + "entity_cfg": { + "uid": "cube" + }, + "position_range": [ + [-0.2, -0.2, 0.0], + [0.2, 0.2, 0.0] + ], + "relative_position": true + } + }, + "randomize_goal": { + "func": "randomize_target_pose", + "mode": "reset", + "params": { + "position_range": 
[ + [-0.3, -0.3, 0.05], + [0.3, 0.3, 0.05] + ], + "relative_position": false, + "store_key": "goal_pose" + } + } + }, + "observations": { + "robot_qpos": { + "func": "normalize_robot_joint_data", + "mode": "modify", + "name": "robot/qpos", + "params": { + "joint_ids": [0, 1, 2, 3, 4, 5] + } + }, + "robot_ee_pos": { + "func": "get_robot_eef_pose", + "mode": "add", + "name": "robot/ee_pos", + "params": { + "part_name": "arm" + } + }, + "cube_pos": { + "func": "get_rigid_object_pose", + "mode": "add", + "name": "object/cube_pos", + "params": { + "entity_cfg": { + "uid": "cube" + } + } + }, + "goal_pos": { + "func": "target_position", + "mode": "add", + "name": "object/goal_pos", + "params": { + "target_pose_key": "goal_pose" + } + } + }, + "rewards": { + "reaching_reward": { + "func": "reaching_behind_object", + "mode": "add", + "weight": 0.1, + "params": { + "object_cfg": { + "uid": "cube" + }, + "target_pose_key": "goal_pose", + "behind_offset": 0.015, + "height_offset": 0.015, + "distance_scale": 5.0, + "part_name": "arm" + } + }, + "place_reward": { + "func": "incremental_distance_to_target", + "mode": "add", + "weight": 1.0, + "params": { + "source_entity_cfg": { + "uid": "cube" + }, + "target_pose_key": "goal_pose", + "tanh_scale": 10.0, + "positive_weight": 2.0, + "negative_weight": 0.5, + "use_xy_only": true + } + }, + "action_penalty": { + "func": "action_smoothness_penalty", + "mode": "add", + "weight": 0.01, + "params": {} + }, + "success_bonus": { + "func": "success_reward", + "mode": "add", + "weight": 10.0, + "params": {} + } + }, + "extensions": { + "action_type": "delta_qpos", + "episode_length": 100, + "action_scale": 0.1, + "success_threshold": 0.1 + }, + "dataset": { + "lerobot": { + "func": "LeRobotRecorder", + "mode": "save", + "params": { + "robot_meta": { + "robot_type": "UR10_DH_Gripper", + "control_freq": 25, + "arm_dofs": 6, + "observation": { + "vision": { + "wrist_camera": [], + "third_person_camera": [] + }, + "states": ["qpos"] + } + }, + "instruction": { + "lang": "push_cube_to_target" + }, + "extra": { + "scene_type": "tabletop", + "task_description": "push_cube_rl" + }, + "use_videos": true, + "export_success_only": false + } + } + } + }, + "robot": { + "uid": "Manipulator", + "urdf_cfg": { + "components": [ + { + "component_type": "arm", + "urdf_path": "UniversalRobots/UR10/UR10.urdf" + }, + { + "component_type": "hand", + "urdf_path": "DH_PGI_140_80/DH_PGI_140_80.urdf" + } + ] + }, + "init_pos": [0.0, 0.0, 0.0], + "init_rot": [0.0, 0.0, 0.0], + "init_qpos": [0.0, -1.57, 1.57, -1.57, -1.57, 0.0, 0.04, 0.04], + "drive_pros": { + "drive_type": "force", + "stiffness": 100000.0, + "damping": 1000.0, + "max_velocity": 2.0, + "max_effort": 500.0 + }, + "solver_cfg": { + "arm": { + "class_type": "PytorchSolver", + "end_link_name": "ee_link", + "root_link_name": "base_link", + "tcp": [ + [1.0, 0.0, 0.0, 0.0], + [0.0, 1.0, 0.0, 0.0], + [0.0, 0.0, 1.0, 0.16], + [0.0, 0.0, 0.0, 1.0] + ] + } + }, + "control_parts": { + "arm": ["JOINT[1-6]"] + } + }, + "sensor": [ + { + "sensor_type": "Camera", + "uid": "wrist_camera", + "width": 640, + "height": 480, + "intrinsics": [90.0, 90.0, 64.0, 64.0], + "extrinsics": { + "parent": "ee_link" + } + }, + { + "sensor_type": "Camera", + "uid": "third_person_camera", + "width": 640, + "height": 480, + "intrinsics": [90.0, 90.0, 64.0, 64.0], + "extrinsics": { + "eye": [0.5, 0.5, 0.8], + "target": [0.0, 0.0, 0.3], + "up": [0.0, 0.0, 1.0] + } + } + ], + "light": {}, + "background": [], + "rigid_object": [ + { + "uid": "cube", + "shape": { 
+ "shape_type": "Cube", + "size": [0.1, 0.1, 0.1] + }, + "body_type": "dynamic", + "init_pos": [-0.6, -0.4, 0.05], + "attrs": { + "mass": 10.0, + "static_friction": 3.0, + "dynamic_friction": 2.0, + "linear_damping": 2.0, + "angular_damping": 2.0, + "contact_offset": 0.003, + "rest_offset": 0.001, + "restitution": 0.1, + "max_depenetration_velocity": 10.0, + "max_linear_velocity": 1.0, + "max_angular_velocity": 1.0 + } + } + ], + "rigid_object_group": [], + "articulation": [] +} diff --git a/configs/agents/rl/push_cube/train_config.json b/configs/agents/rl/push_cube/train_config.json index 3b5c8e3a..005c940e 100644 --- a/configs/agents/rl/push_cube/train_config.json +++ b/configs/agents/rl/push_cube/train_config.json @@ -7,7 +7,7 @@ "headless": true, "enable_rt": false, "gpu_id": 0, - "num_envs": 512, + "num_envs": 8, "iterations": 1000, "rollout_steps": 1024, "eval_freq": 200, diff --git a/embodichain/lab/gym/envs/embodied_env.py b/embodichain/lab/gym/envs/embodied_env.py index 9e787bb2..af101935 100644 --- a/embodichain/lab/gym/envs/embodied_env.py +++ b/embodichain/lab/gym/envs/embodied_env.py @@ -170,6 +170,13 @@ def __init__(self, cfg: EmbodiedEnvCfg, **kwargs): super().__init__(cfg, **kwargs) + self.episode_obs_buffer: Dict[int, List[EnvObs]] = { + i: [] for i in range(self.num_envs) + } + self.episode_action_buffer: Dict[int, List[EnvAction]] = { + i: [] for i in range(self.num_envs) + } + def _init_sim_state(self, **kwargs): """Initialize the simulation state at the beginning of scene creation.""" @@ -271,6 +278,29 @@ def get_affordance(self, key: str, default: Any = None): """ return self.affordance_datas.get(key, default) + def _extract_single_env_data(self, data: Any, env_id: int) -> Any: + """Extract single environment data from batched data. 
+ + Args: + data: Batched data (dict, tensor, list, or primitive) + env_id: Environment index + + Returns: + Data for the specified environment + """ + if isinstance(data, dict): + return { + k: self._extract_single_env_data(v, env_id) for k, v in data.items() + } + elif isinstance(data, torch.Tensor): + return data[env_id] if data.ndim > 0 else data + elif isinstance(data, (list, tuple)): + return type(data)( + self._extract_single_env_data(item, env_id) for item in data + ) + else: + return data + def _hook_after_sim_step( self, obs: EnvObs, @@ -280,6 +310,13 @@ def _hook_after_sim_step( info: Dict, **kwargs, ): + # Extract and append data for each environment + for env_id in range(self.num_envs): + single_obs = self._extract_single_env_data(obs, env_id) + single_action = self._extract_single_env_data(action, env_id) + self.episode_obs_buffer[env_id].append(single_obs) + self.episode_action_buffer[env_id].append(single_action) + # Call dataset manager with mode="save": it will record and auto-save if dones=True if self.cfg.dataset: if "save" in self.dataset_manager.available_modes: @@ -331,6 +368,15 @@ def _update_sim_state(self, **kwargs) -> None: def _initialize_episode( self, env_ids: Sequence[int] | None = None, **kwargs ) -> None: + # Clear episode buffers for environments that are being reset + if env_ids is None: + env_ids = range(self.num_envs) + for env_id in env_ids: + # Convert to int if it's a tensor + env_id = int(env_id) if isinstance(env_id, torch.Tensor) else env_id + self.episode_obs_buffer[env_id].clear() + self.episode_action_buffer[env_id].clear() + # apply events such as randomization for environments that need a reset if self.cfg.events: if "reset" in self.event_manager.available_modes: diff --git a/embodichain/lab/gym/envs/managers/datasets.py b/embodichain/lab/gym/envs/managers/datasets.py index 625ce8e7..1533d97b 100644 --- a/embodichain/lab/gym/envs/managers/datasets.py +++ b/embodichain/lab/gym/envs/managers/datasets.py @@ -92,10 +92,6 @@ def __init__(self, cfg: DatasetFunctorCfg, env: EmbodiedEnv): self.use_videos = params.get("use_videos", False) self.export_success_only = params.get("export_success_only", False) - # Episode data buffers - self.episode_obs_list: List[Dict] = [] - self.episode_action_list: List[Any] = [] - # LeRobot dataset instance self.dataset: Optional[LeRobotDataset] = None self.dataset_full_path: Optional[Path] = None @@ -114,14 +110,6 @@ def dataset_path(self) -> str: str(self.dataset_full_path) if self.dataset_full_path else "Not initialized" ) - def reset(self, env_ids: Optional[torch.Tensor] = None) -> None: - """Reset the recorder buffers. - - Args: - env_ids: Environment IDs to reset (currently clears all data). - """ - self._reset_buffer() - def __call__( self, env: EmbodiedEnv, @@ -160,8 +148,6 @@ def __call__( use_videos: Whether to save videos (already set in __init__). export_success_only: Whether to export only successful episodes (already set in __init__). 
""" - # Always record the step - self._record_step(obs, action) # Check if any episodes are done and save them done_env_ids = dones.nonzero(as_tuple=False).squeeze(-1) @@ -169,43 +155,38 @@ def __call__( # Save completed episodes self._save_episodes(done_env_ids, terminateds, info) - def _record_step(self, obs: EnvObs, action: EnvAction) -> None: - """Record a single step.""" - self.episode_obs_list.append(obs) - self.episode_action_list.append(action) - def _save_episodes( self, env_ids: torch.Tensor, terminateds: Optional[torch.Tensor] = None, info: Optional[Dict[str, Any]] = None, ) -> None: - """Save completed episodes.""" - if len(self.episode_obs_list) == 0: - logger.log_warning("No episode data to save") - return - - obs_list = self.episode_obs_list - action_list = self.episode_action_list + """Save completed episodes for specified environments.""" + task = self.instruction.get("lang", "unknown_task") - # Align obs and action - if len(obs_list) > len(action_list): - obs_list = obs_list[:-1] + # Process each environment + for env_id in env_ids.cpu().tolist(): + # Get buffer for this environment (already contains single-env data) + obs_list = self._env.episode_obs_buffer[env_id] + action_list = self._env.episode_action_buffer[env_id] - task = self.instruction.get("lang", "unknown_task") + if len(obs_list) == 0: + logger.log_warning(f"No episode data to save for env {env_id}") + continue - # Update metadata - extra_info = self.extra.copy() if self.extra else {} - fps = self.dataset.meta.info.get("fps", 30) - current_episode_time = (len(obs_list) * len(env_ids)) / fps if fps > 0 else 0 + # Align obs and action + if len(obs_list) > len(action_list): + obs_list = obs_list[:-1] - episode_extra_info = extra_info.copy() - self.total_time += current_episode_time - episode_extra_info["total_time"] = self.total_time - self._update_dataset_info({"extra": episode_extra_info}) + # Update metadata + extra_info = self.extra.copy() if self.extra else {} + fps = self.dataset.meta.info.get("fps", 30) + current_episode_time = len(obs_list) / fps if fps > 0 else 0 - # Process each environment - for env_id in env_ids.cpu().tolist(): + episode_extra_info = extra_info.copy() + self.total_time += current_episode_time + episode_extra_info["total_time"] = self.total_time + self._update_dataset_info({"extra": episode_extra_info}) is_success = False if info is not None and "success" in info: success_tensor = info["success"] @@ -222,42 +203,48 @@ def _save_episodes( try: for obs, action in zip(obs_list, action_list): - frame = self._convert_frame_to_lerobot(obs, action, task, env_id) + frame = self._convert_frame_to_lerobot(obs, action, task) self.dataset.add_frame(frame) self.dataset.save_episode() + logger.log_info( - f"Auto-saved {'successful' if is_success else 'failed'} " - f"episode {self.curr_episode} for env {env_id} with {len(obs_list)} frames" + f"[LeRobotRecorder] Saved dataset to: {self.dataset_path}\n" + f" Episode {self.curr_episode} (env {env_id}): " + f"{'successful' if is_success else 'failed'}, {len(obs_list)} frames" ) + self.curr_episode += 1 except Exception as e: logger.log_error(f"Failed to save episode {env_id}: {e}") - self._reset_buffer() - def finalize(self) -> Optional[str]: """Finalize the dataset.""" - if len(self.episode_obs_list) > 0: - active_env_ids = torch.arange(self.num_envs, device=self.device) + # Save any remaining episodes + env_ids_with_data = [] + for env_id in range(self.num_envs): + if len(self._env.episode_obs_buffer[env_id]) > 0: + env_ids_with_data.append(env_id) + 
+ if env_ids_with_data: + active_env_ids = torch.tensor(env_ids_with_data, device=self.device) self._save_episodes(active_env_ids) try: if self.dataset is not None: self.dataset.finalize() - logger.log_info(f"Dataset finalized at: {self.dataset_path}") + logger.log_info( + f"[LeRobotRecorder] Dataset finalized successfully\n" + f" Path: {self.dataset_path}\n" + f" Total episodes: {self.curr_episode}\n" + f" Total time: {self.total_time:.2f}s" + ) return self.dataset_path except Exception as e: - logger.log_error(f"Failed to finalize dataset: {e}") + logger.log_error(f"[LeRobotRecorder] Failed to finalize dataset: {e}") return None - def _reset_buffer(self) -> None: - """Reset episode buffers.""" - self.episode_obs_list.clear() - self.episode_action_list.clear() - logger.log_info("Reset buffers (cleared all batched data)") - def _initialize_dataset(self) -> None: """Initialize the LeRobot dataset.""" robot_type = self.robot_meta.get("robot_type", "robot") @@ -348,9 +335,18 @@ def _build_features(self) -> Dict: return features def _convert_frame_to_lerobot( - self, obs: Dict[str, Any], action: Any, task: str, env_id: int + self, obs: Dict[str, Any], action: Any, task: str ) -> Dict: - """Convert a single frame to LeRobot format.""" + """Convert a single frame to LeRobot format. + + Args: + obs: Single environment observation (already extracted from batch) + action: Single environment action (already extracted from batch) + task: Task name + + Returns: + Frame dict in LeRobot format with numpy arrays + """ frame = {"task": task} extra_vision_config = self.robot_meta.get("observation", {}).get("vision", {}) arm_dofs = self.robot_meta.get("arm_dofs", 7) @@ -363,9 +359,9 @@ def _convert_frame_to_lerobot( color_data = obs["sensor"][camera_name]["color"] if isinstance(color_data, torch.Tensor): - color_img = color_data[env_id][:, :, :3].cpu().numpy() + color_img = color_data[:, :, :3].cpu().numpy() else: - color_img = np.array(color_data)[env_id][:, :, :3] + color_img = np.array(color_data)[:, :, :3] if color_img.dtype in [np.float32, np.float64]: color_img = (color_img * 255).astype(np.uint8) @@ -375,11 +371,9 @@ def _convert_frame_to_lerobot( if is_stereo: color_right_data = obs["sensor"][camera_name]["color_right"] if isinstance(color_right_data, torch.Tensor): - color_right_img = ( - color_right_data[env_id][:, :, :3].cpu().numpy() - ) + color_right_img = color_right_data[:, :, :3].cpu().numpy() else: - color_right_img = np.array(color_right_data)[env_id][:, :, :3] + color_right_img = np.array(color_right_data)[:, :, :3] if color_right_img.dtype in [np.float32, np.float64]: color_right_img = (color_right_img * 255).astype(np.uint8) @@ -389,25 +383,37 @@ def _convert_frame_to_lerobot( # Add state qpos = obs["robot"][JointType.QPOS.value] if isinstance(qpos, torch.Tensor): - state_data = qpos[env_id].cpu().numpy().astype(np.float32) + state_data = qpos.cpu().numpy().astype(np.float32) else: - state_data = np.array(qpos)[env_id].astype(np.float32) + state_data = np.array(qpos).astype(np.float32) frame["observation.state"] = state_data # Add action if isinstance(action, torch.Tensor): - action_data = action[env_id, :arm_dofs].cpu().numpy() + action_data = action[:arm_dofs].cpu().numpy() elif isinstance(action, np.ndarray): - action_data = action[env_id, :arm_dofs] + action_data = action[:arm_dofs] elif isinstance(action, dict): - action_data = action.get("action", action.get("arm_action", action)) - if isinstance(action_data, torch.Tensor): - action_data = action_data[env_id, 
:arm_dofs].cpu().numpy() - elif isinstance(action_data, np.ndarray): - action_data = action_data[env_id, :arm_dofs] + # Extract qpos from action dict + action_tensor = action.get( + "qpos", action.get("delta_qpos", action.get("action", None)) + ) + if action_tensor is None: + # Fallback to first tensor value + for v in action.values(): + if isinstance(v, (torch.Tensor, np.ndarray)): + action_tensor = v + break + + if isinstance(action_tensor, torch.Tensor): + action_data = action_tensor[:arm_dofs].cpu().numpy() + elif isinstance(action_tensor, np.ndarray): + action_data = action_tensor[:arm_dofs] + else: + action_data = np.array(action_tensor)[:arm_dofs] else: - action_data = np.array(action)[env_id, :arm_dofs] + action_data = np.array(action)[:arm_dofs] frame["action"] = action_data diff --git a/embodichain/lab/gym/envs/managers/rewards.py b/embodichain/lab/gym/envs/managers/rewards.py index dfd2872b..dad89b7f 100644 --- a/embodichain/lab/gym/envs/managers/rewards.py +++ b/embodichain/lab/gym/envs/managers/rewards.py @@ -30,7 +30,7 @@ def distance_between_objects( env: EmbodiedEnv, obs: dict, - action: dict, + action: torch.Tensor | dict, info: dict, source_entity_cfg: SceneEntityCfg = None, target_entity_cfg: SceneEntityCfg = None, @@ -92,7 +92,7 @@ def distance_between_objects( def joint_velocity_penalty( env: EmbodiedEnv, obs: dict, - action: dict, + action: torch.Tensor | dict, info: dict, robot_uid: str = "robot", joint_ids: slice | list[int] | None = None, @@ -146,21 +146,21 @@ def joint_velocity_penalty( def action_smoothness_penalty( env: EmbodiedEnv, obs: dict, - action: dict, + action: torch.Tensor | dict, info: dict, ) -> torch.Tensor: """Penalize large action changes between consecutive timesteps. Encourages smooth control commands by penalizing sudden changes in actions. - Stores previous action in env._reward_states for comparison. + Gets previous action from env.episode_action_buffer. Returns: - Penalty tensor of shape (num_envs,). Zero on first call (no previous action), - negative on subsequent calls (larger change = more negative). + Penalty tensor of shape (num_envs,). Zero on first step (no previous action), + negative on subsequent steps (larger change = more negative). Note: - This function maintains state across calls using env._reward_states['prev_actions']. - State is automatically reset when the environment resets. + This function reads from env.episode_action_buffer, which is automatically + cleared when the environment resets. 
Example: ```json @@ -171,26 +171,48 @@ def action_smoothness_penalty( } ``` """ - # Extract action tensor from dict - for key in ["qpos", "qvel", "qf"]: - if key in action: - action_tensor = action[key] - break - - # Use dictionary-based state management - if not hasattr(env, "_reward_states"): - env._reward_states = {} - - # compute difference between current and previous action - if "prev_actions" in env._reward_states: - action_diff = action_tensor - env._reward_states["prev_actions"] - penalty = -torch.norm(action_diff, dim=-1) + # Extract current action tensor + if isinstance(action, torch.Tensor): + current_action_tensor = action + elif isinstance(action, dict): + # Extract from dict + current_action_tensor = None + for key in ["qpos", "qvel", "qf"]: + if key in action: + current_action_tensor = action[key] + break + if current_action_tensor is None: + return torch.zeros(env.num_envs, device=env.device) else: - # no previous action, no penalty - penalty = torch.zeros(env.num_envs, device=env.device) + return torch.zeros(env.num_envs, device=env.device) - # store current action for next step - env._reward_states["prev_actions"] = action_tensor.clone() + # Get previous action from buffer for each environment + penalty = torch.zeros(env.num_envs, device=env.device) + + for env_id in range(env.num_envs): + action_buffer = env.episode_action_buffer[env_id] + + if len(action_buffer) > 0: + # Get previous action (last in buffer) + prev_action = action_buffer[-1] + + # Extract tensor from previous action + if isinstance(prev_action, torch.Tensor): + prev_action_tensor = prev_action + elif isinstance(prev_action, dict): + prev_action_tensor = None + for key in ["qpos", "qvel", "qf"]: + if key in prev_action: + prev_action_tensor = prev_action[key] + break + else: + prev_action_tensor = None + + if prev_action_tensor is not None: + # Compute difference and penalty + action_diff = current_action_tensor[env_id] - prev_action_tensor + penalty[env_id] = -torch.norm(action_diff) + # else: first step, no penalty (already zero) return penalty @@ -198,7 +220,7 @@ def action_smoothness_penalty( def joint_limit_penalty( env: EmbodiedEnv, obs: dict, - action: dict, + action: torch.Tensor | dict, info: dict, robot_uid: str = "robot", joint_ids: slice | list[int] = slice(None), @@ -264,7 +286,7 @@ def joint_limit_penalty( def orientation_alignment( env: EmbodiedEnv, obs: dict, - action: dict, + action: torch.Tensor | dict, info: dict, source_entity_cfg: SceneEntityCfg = None, target_entity_cfg: SceneEntityCfg = None, @@ -319,7 +341,7 @@ def orientation_alignment( def success_reward( env: EmbodiedEnv, obs: dict, - action: dict, + action: torch.Tensor | dict, info: dict, ) -> torch.Tensor: """Sparse bonus reward when task succeeds. 
@@ -367,7 +389,7 @@ def success_reward( def reaching_behind_object( env: EmbodiedEnv, obs: dict, - action: dict, + action: torch.Tensor | dict, info: dict, object_cfg: SceneEntityCfg = None, target_pose_key: str = "goal_pose", @@ -458,7 +480,7 @@ def reaching_behind_object( def distance_to_target( env: "EmbodiedEnv", obs: dict, - action: dict, + action: torch.Tensor | dict, info: dict, source_entity_cfg: SceneEntityCfg = None, target_pose_key: str = "target_pose", @@ -535,7 +557,7 @@ def distance_to_target( def incremental_distance_to_target( env: "EmbodiedEnv", obs: dict, - action: dict, + action: torch.Tensor | dict, info: dict, source_entity_cfg: SceneEntityCfg = None, target_pose_key: str = "target_pose", diff --git a/embodichain/lab/gym/envs/rl_env.py b/embodichain/lab/gym/envs/rl_env.py index 3a8f4118..9ce6b5f1 100644 --- a/embodichain/lab/gym/envs/rl_env.py +++ b/embodichain/lab/gym/envs/rl_env.py @@ -165,6 +165,9 @@ def _process_delta_qpos(self, action: torch.Tensor) -> torch.Tensor: def _process_eef_pose(self, action: torch.Tensor) -> torch.Tensor: """Process end-effector pose action via inverse kinematics. + TODO: Currently only supports single-arm robots (6-axis or 7-axis). + For multi-arm or complex robots, please use qpos/delta_qpos actions instead. + Args: action: End-effector pose (position + orientation) Shape: (num_envs, 6) for pos+euler or (num_envs, 7) for pos+quat @@ -172,32 +175,8 @@ def _process_eef_pose(self, action: torch.Tensor) -> torch.Tensor: Returns: Joint positions from IK """ - # Auto-select control part if not specified - control_part_name = self.control_part_name - if control_part_name is None: - if self.robot.control_parts is None: - raise ValueError( - "Robot has no control_parts defined for eef_pose action" - ) - - available_parts = list(self.robot.control_parts.keys()) - if not available_parts: - raise ValueError( - "Robot has no control_parts defined for eef_pose action" - ) - - if len(available_parts) == 1: - control_part_name = available_parts[0] - else: - raise ValueError( - f"Multiple control_parts found {available_parts}. " - f"Please specify control_part_name explicitly." 
- ) - # Get current joint positions as IK seed current_qpos = self.robot.get_qpos() - control_joint_ids = self.robot.get_joint_ids(name=control_part_name) - seed_qpos = current_qpos[:, control_joint_ids] # Convert action to target pose matrix (4x4) batch_size = action.shape[0] @@ -223,15 +202,12 @@ def _process_eef_pose(self, action: torch.Tensor) -> torch.Tensor: for env_idx in range(self.num_envs): qpos_ik = self.robot.compute_ik( pose=target_pose[env_idx], - joint_seed=seed_qpos[env_idx], - name=control_part_name, + joint_seed=current_qpos[env_idx], ) ik_solutions.append(qpos_ik) - # Update full qpos with IK solutions - result_qpos = current_qpos.clone() - ik_qpos = torch.stack(ik_solutions, dim=0) - result_qpos[:, control_joint_ids] = ik_qpos + # Stack IK solutions + result_qpos = torch.stack(ik_solutions, dim=0) return result_qpos From 1d569017f06d49c9a9c6ce005de8edc104d20fe8 Mon Sep 17 00:00:00 2001 From: yuanhaonan Date: Thu, 22 Jan 2026 09:41:36 +0800 Subject: [PATCH 06/12] update tensorboard --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 50fef254..658ba0e9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,7 +40,7 @@ dependencies = [ "PyYAML>=6.0", "accelerate>=1.10.0", "wandb==0.20.1", - "tensorboard", + "tensorboard>=2.20.0", "transformers>=4.53.0", "diffusers>=0.32.1", "deepspeed>=0.16.2", From c96f4d3d66e06c4a2e744330952267e08e36f2ad Mon Sep 17 00:00:00 2001 From: yuanhaonan Date: Thu, 22 Jan 2026 15:28:47 +0800 Subject: [PATCH 07/12] remove dataset from config && fix CI --- .github/workflows/main.yml | 2 +- configs/agents/rl/push_cube/gym_config.json | 25 ------ .../rl/push_cube/gym_config_vision.json | 39 +-------- configs/agents/rl/push_cube/train_config.json | 6 +- pyproject.toml | 2 +- tests/agents/test_rl.py | 80 ++++++++++++++----- 6 files changed, 68 insertions(+), 86 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 4cb35e58..63b697e6 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -48,7 +48,7 @@ jobs: - uses: actions/checkout@v4 - name: Build docs run: | - pip install -e . --extra-index-url http://pyp.open3dv.site:2345/simple/ --trusted-host pyp.open3dv.site + pip install -e .[lerobot] --extra-index-url http://pyp.open3dv.site:2345/simple/ --trusted-host pyp.open3dv.site pip install -r docs/requirements.txt cd ${GITHUB_WORKSPACE}/docs echo "Start Building docs..." 
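Note on the `_process_eef_pose` simplification above: the control-part bookkeeping is dropped and the method reduces to a per-environment IK loop seeded with the current joint positions. A minimal standalone sketch of that loop is given below for reference; the per-env `compute_ik(pose=..., joint_seed=...)` call mirrors the patch, but the free-standing function name and the `solver` stand-in object are illustrative assumptions, not part of the EmbodiChain robot API.

    import torch

    def eef_pose_to_qpos(solver, target_pose: torch.Tensor, current_qpos: torch.Tensor) -> torch.Tensor:
        # target_pose: (num_envs, 4, 4) homogeneous target poses
        # current_qpos: (num_envs, dof) joint positions used as the IK seed
        ik_solutions = []
        for env_idx in range(target_pose.shape[0]):
            # Solve IK per environment, seeded with that environment's current qpos
            qpos_ik = solver.compute_ik(
                pose=target_pose[env_idx],
                joint_seed=current_qpos[env_idx],
            )
            ik_solutions.append(qpos_ik)
        # Stack the per-environment solutions back into a (num_envs, dof) batch
        return torch.stack(ik_solutions, dim=0)
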
diff --git a/configs/agents/rl/push_cube/gym_config.json b/configs/agents/rl/push_cube/gym_config.json index 7858d18b..3e0e0445 100644 --- a/configs/agents/rl/push_cube/gym_config.json +++ b/configs/agents/rl/push_cube/gym_config.json @@ -117,31 +117,6 @@ "episode_length": 100, "action_scale": 0.1, "success_threshold": 0.1 - }, - "dataset": { - "lerobot": { - "func": "LeRobotRecorder", - "mode": "save", - "params": { - "robot_meta": { - "robot_type": "UR10_DH_Gripper", - "control_freq": 25, - "arm_dofs": 6, - "observation": { - "vision": {} - } - }, - "instruction": { - "lang": "push_cube_to_target" - }, - "extra": { - "scene_type": "tabletop", - "task_description": "push_cube_rl" - }, - "use_videos": true, - "export_success_only": false - } - } } }, "robot": { diff --git a/configs/agents/rl/push_cube/gym_config_vision.json b/configs/agents/rl/push_cube/gym_config_vision.json index 032d4812..e64290e0 100644 --- a/configs/agents/rl/push_cube/gym_config_vision.json +++ b/configs/agents/rl/push_cube/gym_config_vision.json @@ -2,8 +2,6 @@ "id": "PushCubeRL", "max_episodes": 5, "env": { - "num_envs": 128, - "sim_steps_per_control": 4, "events": { "randomize_cube": { "func": "randomize_rigid_object_pose", @@ -117,35 +115,6 @@ "episode_length": 100, "action_scale": 0.1, "success_threshold": 0.1 - }, - "dataset": { - "lerobot": { - "func": "LeRobotRecorder", - "mode": "save", - "params": { - "robot_meta": { - "robot_type": "UR10_DH_Gripper", - "control_freq": 25, - "arm_dofs": 6, - "observation": { - "vision": { - "wrist_camera": [], - "third_person_camera": [] - }, - "states": ["qpos"] - } - }, - "instruction": { - "lang": "push_cube_to_target" - }, - "extra": { - "scene_type": "tabletop", - "task_description": "push_cube_rl" - }, - "use_videos": true, - "export_success_only": false - } - } } }, "robot": { @@ -193,8 +162,8 @@ { "sensor_type": "Camera", "uid": "wrist_camera", - "width": 640, - "height": 480, + "width": 120, + "height": 120, "intrinsics": [90.0, 90.0, 64.0, 64.0], "extrinsics": { "parent": "ee_link" @@ -203,8 +172,8 @@ { "sensor_type": "Camera", "uid": "third_person_camera", - "width": 640, - "height": 480, + "width": 120, + "height": 120, "intrinsics": [90.0, 90.0, 64.0, 64.0], "extrinsics": { "eye": [0.5, 0.5, 0.8], diff --git a/configs/agents/rl/push_cube/train_config.json b/configs/agents/rl/push_cube/train_config.json index 005c940e..1c683654 100644 --- a/configs/agents/rl/push_cube/train_config.json +++ b/configs/agents/rl/push_cube/train_config.json @@ -1,18 +1,18 @@ { "trainer": { "exp_name": "push_cube_ppo", - "gym_config": "configs/agents/rl/push_cube/gym_config.json", + "gym_config": "configs/agents/rl/push_cube/gym_config_vision.json", "seed": 42, "device": "cuda:0", "headless": true, "enable_rt": false, "gpu_id": 0, - "num_envs": 8, + "num_envs": 64, "iterations": 1000, "rollout_steps": 1024, "eval_freq": 200, "save_freq": 200, - "use_wandb": true, + "use_wandb": false, "wandb_project_name": "embodychain-push_cube", "events": { "eval": { diff --git a/pyproject.toml b/pyproject.toml index 658ba0e9..134acbcc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,7 +39,7 @@ dependencies = [ "polars==1.31.0", "PyYAML>=6.0", "accelerate>=1.10.0", - "wandb==0.20.1", + "wandb>=0.22.3", "tensorboard>=2.20.0", "transformers>=4.53.0", "diffusers>=0.32.1", diff --git a/tests/agents/test_rl.py b/tests/agents/test_rl.py index 2588f998..827e670f 100644 --- a/tests/agents/test_rl.py +++ b/tests/agents/test_rl.py @@ -23,35 +23,73 @@ def test_rl_training(): """Test RL training 
pipeline by running a few iterations.""" # Load the existing push_cube config - config_path = "configs/agents/rl/push_cube/train_config.json" - - with open(config_path, "r") as f: - config = json.load(f) - - # Create temporary config with reduced iterations for testing - test_config = config.copy() - test_config["trainer"]["iterations"] = 2 # Only 2 iterations for testing - test_config["trainer"]["rollout_steps"] = 32 # Fewer rollout steps - test_config["trainer"]["eval_freq"] = 1000000 # Disable eval - test_config["trainer"]["save_freq"] = 1000000 # Disable save - test_config["trainer"]["headless"] = True - test_config["trainer"]["use_wandb"] = False - test_config["trainer"]["num_envs"] = 2 # Only 2 parallel envs for testing - - # Save temporary config + train_config_path = "configs/agents/rl/push_cube/train_config.json" + gym_config_path = "configs/agents/rl/push_cube/gym_config.json" + + with open(train_config_path, "r") as f: + train_config = json.load(f) + + with open(gym_config_path, "r") as f: + gym_config = json.load(f) + + # Add dataset configuration dynamically to gym_config + gym_config["env"]["dataset"] = { + "lerobot": { + "func": "LeRobotRecorder", + "mode": "save", + "params": { + "robot_meta": { + "robot_type": "UR10_DH_Gripper", + "control_freq": 25, + "arm_dofs": 6, + "observation": {"vision": {}, "states": ["qpos"]}, + }, + "instruction": {"lang": "push_cube_to_target"}, + "extra": { + "scene_type": "tabletop", + "task_description": "push_cube_rl_test", + }, + "use_videos": False, + "export_success_only": False, + }, + } + } + + # Create temporary gym config with dataset + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + json.dump(gym_config, f) + temp_gym_config_path = f.name + + # Create temporary train config with reduced iterations for testing + test_train_config = train_config.copy() + test_train_config["trainer"][ + "gym_config" + ] = temp_gym_config_path # Point to temp gym config + test_train_config["trainer"]["iterations"] = 2 # Only 2 iterations for testing + test_train_config["trainer"]["rollout_steps"] = 32 # Fewer rollout steps + test_train_config["trainer"]["eval_freq"] = 1000000 # Disable eval + test_train_config["trainer"]["save_freq"] = 1000000 # Disable save + test_train_config["trainer"]["headless"] = True + test_train_config["trainer"]["use_wandb"] = False + test_train_config["trainer"]["num_envs"] = 2 # Only 2 parallel envs for testing + + # Save temporary train config with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: - json.dump(test_config, f) - temp_config_path = f.name + json.dump(test_train_config, f) + temp_train_config_path = f.name try: from embodichain.agents.rl.train import train_from_config - train_from_config(temp_config_path) + train_from_config(temp_train_config_path) assert True finally: - if os.path.exists(temp_config_path): - os.remove(temp_config_path) + # Clean up temporary files + if os.path.exists(temp_train_config_path): + os.remove(temp_train_config_path) + if os.path.exists(temp_gym_config_path): + os.remove(temp_gym_config_path) if __name__ == "__main__": From f03df006f92b0a4b7c52e5c409d63a28a041a42b Mon Sep 17 00:00:00 2001 From: yuanhaonan Date: Thu, 22 Jan 2026 16:24:19 +0800 Subject: [PATCH 08/12] fix wandb version --- configs/agents/rl/push_cube/train_config.json | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/agents/rl/push_cube/train_config.json b/configs/agents/rl/push_cube/train_config.json index 
1c683654..64e2ecb2 100644 --- a/configs/agents/rl/push_cube/train_config.json +++ b/configs/agents/rl/push_cube/train_config.json @@ -1,7 +1,7 @@ { "trainer": { "exp_name": "push_cube_ppo", - "gym_config": "configs/agents/rl/push_cube/gym_config_vision.json", + "gym_config": "configs/agents/rl/push_cube/gym_config.json", "seed": 42, "device": "cuda:0", "headless": true, diff --git a/pyproject.toml b/pyproject.toml index 134acbcc..d834132e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,7 +39,7 @@ dependencies = [ "polars==1.31.0", "PyYAML>=6.0", "accelerate>=1.10.0", - "wandb>=0.22.3", + "wandb>=0.21.0", "tensorboard>=2.20.0", "transformers>=4.53.0", "diffusers>=0.32.1", From 287365726a7bfd724eafb6e738db232b4ec8370d Mon Sep 17 00:00:00 2001 From: yuanhaonan Date: Thu, 22 Jan 2026 16:25:36 +0800 Subject: [PATCH 09/12] remove --- .../rl/push_cube/gym_config_vision.json | 213 ------------------ 1 file changed, 213 deletions(-) delete mode 100644 configs/agents/rl/push_cube/gym_config_vision.json diff --git a/configs/agents/rl/push_cube/gym_config_vision.json b/configs/agents/rl/push_cube/gym_config_vision.json deleted file mode 100644 index e64290e0..00000000 --- a/configs/agents/rl/push_cube/gym_config_vision.json +++ /dev/null @@ -1,213 +0,0 @@ -{ - "id": "PushCubeRL", - "max_episodes": 5, - "env": { - "events": { - "randomize_cube": { - "func": "randomize_rigid_object_pose", - "mode": "reset", - "params": { - "entity_cfg": { - "uid": "cube" - }, - "position_range": [ - [-0.2, -0.2, 0.0], - [0.2, 0.2, 0.0] - ], - "relative_position": true - } - }, - "randomize_goal": { - "func": "randomize_target_pose", - "mode": "reset", - "params": { - "position_range": [ - [-0.3, -0.3, 0.05], - [0.3, 0.3, 0.05] - ], - "relative_position": false, - "store_key": "goal_pose" - } - } - }, - "observations": { - "robot_qpos": { - "func": "normalize_robot_joint_data", - "mode": "modify", - "name": "robot/qpos", - "params": { - "joint_ids": [0, 1, 2, 3, 4, 5] - } - }, - "robot_ee_pos": { - "func": "get_robot_eef_pose", - "mode": "add", - "name": "robot/ee_pos", - "params": { - "part_name": "arm" - } - }, - "cube_pos": { - "func": "get_rigid_object_pose", - "mode": "add", - "name": "object/cube_pos", - "params": { - "entity_cfg": { - "uid": "cube" - } - } - }, - "goal_pos": { - "func": "target_position", - "mode": "add", - "name": "object/goal_pos", - "params": { - "target_pose_key": "goal_pose" - } - } - }, - "rewards": { - "reaching_reward": { - "func": "reaching_behind_object", - "mode": "add", - "weight": 0.1, - "params": { - "object_cfg": { - "uid": "cube" - }, - "target_pose_key": "goal_pose", - "behind_offset": 0.015, - "height_offset": 0.015, - "distance_scale": 5.0, - "part_name": "arm" - } - }, - "place_reward": { - "func": "incremental_distance_to_target", - "mode": "add", - "weight": 1.0, - "params": { - "source_entity_cfg": { - "uid": "cube" - }, - "target_pose_key": "goal_pose", - "tanh_scale": 10.0, - "positive_weight": 2.0, - "negative_weight": 0.5, - "use_xy_only": true - } - }, - "action_penalty": { - "func": "action_smoothness_penalty", - "mode": "add", - "weight": 0.01, - "params": {} - }, - "success_bonus": { - "func": "success_reward", - "mode": "add", - "weight": 10.0, - "params": {} - } - }, - "extensions": { - "action_type": "delta_qpos", - "episode_length": 100, - "action_scale": 0.1, - "success_threshold": 0.1 - } - }, - "robot": { - "uid": "Manipulator", - "urdf_cfg": { - "components": [ - { - "component_type": "arm", - "urdf_path": "UniversalRobots/UR10/UR10.urdf" - }, - { - 
"component_type": "hand", - "urdf_path": "DH_PGI_140_80/DH_PGI_140_80.urdf" - } - ] - }, - "init_pos": [0.0, 0.0, 0.0], - "init_rot": [0.0, 0.0, 0.0], - "init_qpos": [0.0, -1.57, 1.57, -1.57, -1.57, 0.0, 0.04, 0.04], - "drive_pros": { - "drive_type": "force", - "stiffness": 100000.0, - "damping": 1000.0, - "max_velocity": 2.0, - "max_effort": 500.0 - }, - "solver_cfg": { - "arm": { - "class_type": "PytorchSolver", - "end_link_name": "ee_link", - "root_link_name": "base_link", - "tcp": [ - [1.0, 0.0, 0.0, 0.0], - [0.0, 1.0, 0.0, 0.0], - [0.0, 0.0, 1.0, 0.16], - [0.0, 0.0, 0.0, 1.0] - ] - } - }, - "control_parts": { - "arm": ["JOINT[1-6]"] - } - }, - "sensor": [ - { - "sensor_type": "Camera", - "uid": "wrist_camera", - "width": 120, - "height": 120, - "intrinsics": [90.0, 90.0, 64.0, 64.0], - "extrinsics": { - "parent": "ee_link" - } - }, - { - "sensor_type": "Camera", - "uid": "third_person_camera", - "width": 120, - "height": 120, - "intrinsics": [90.0, 90.0, 64.0, 64.0], - "extrinsics": { - "eye": [0.5, 0.5, 0.8], - "target": [0.0, 0.0, 0.3], - "up": [0.0, 0.0, 1.0] - } - } - ], - "light": {}, - "background": [], - "rigid_object": [ - { - "uid": "cube", - "shape": { - "shape_type": "Cube", - "size": [0.1, 0.1, 0.1] - }, - "body_type": "dynamic", - "init_pos": [-0.6, -0.4, 0.05], - "attrs": { - "mass": 10.0, - "static_friction": 3.0, - "dynamic_friction": 2.0, - "linear_damping": 2.0, - "angular_damping": 2.0, - "contact_offset": 0.003, - "rest_offset": 0.001, - "restitution": 0.1, - "max_depenetration_velocity": 10.0, - "max_linear_velocity": 1.0, - "max_angular_velocity": 1.0 - } - } - ], - "rigid_object_group": [], - "articulation": [] -} From 6399c1390bf10422de3baa31ecdfe92d2aa4fcb0 Mon Sep 17 00:00:00 2001 From: yuanhaonan Date: Thu, 22 Jan 2026 19:02:22 +0800 Subject: [PATCH 10/12] update rl unit test --- embodichain/agents/rl/train.py | 14 +++ tests/agents/test_rl.py | 165 ++++++++++++++++++++------------- 2 files changed, 114 insertions(+), 65 deletions(-) diff --git a/embodichain/agents/rl/train.py b/embodichain/agents/rl/train.py index 89055cb8..b7e292e3 100644 --- a/embodichain/agents/rl/train.py +++ b/embodichain/agents/rl/train.py @@ -282,6 +282,20 @@ def train_from_config(config_path: str): wandb.finish() except Exception: pass + + # Clean up environments to prevent resource leaks + try: + if env is not None: + env.close() + except Exception as e: + logger.log_warning(f"Failed to close training environment: {e}") + + try: + if eval_env is not None: + eval_env.close() + except Exception as e: + logger.log_warning(f"Failed to close evaluation environment: {e}") + logger.log_info("Training finished") diff --git a/tests/agents/test_rl.py b/tests/agents/test_rl.py index 827e670f..475ffa1d 100644 --- a/tests/agents/test_rl.py +++ b/tests/agents/test_rl.py @@ -18,78 +18,113 @@ import json import pytest import tempfile +from pathlib import Path -def test_rl_training(): - """Test RL training pipeline by running a few iterations.""" - # Load the existing push_cube config - train_config_path = "configs/agents/rl/push_cube/train_config.json" - gym_config_path = "configs/agents/rl/push_cube/gym_config.json" - - with open(train_config_path, "r") as f: - train_config = json.load(f) - - with open(gym_config_path, "r") as f: - gym_config = json.load(f) - - # Add dataset configuration dynamically to gym_config - gym_config["env"]["dataset"] = { - "lerobot": { - "func": "LeRobotRecorder", - "mode": "save", - "params": { - "robot_meta": { - "robot_type": "UR10_DH_Gripper", - "control_freq": 25, 
- "arm_dofs": 6, - "observation": {"vision": {}, "states": ["qpos"]}, - }, - "instruction": {"lang": "push_cube_to_target"}, - "extra": { - "scene_type": "tabletop", - "task_description": "push_cube_rl_test", +class TestRLTraining: + """Test suite for RL training pipeline.""" + + temp_gym_config_path = None + temp_train_config_path = None + + def setup_method(self): + """Set up test configuration before each test method.""" + # Load the existing push_cube config + train_config_path = "configs/agents/rl/push_cube/train_config.json" + gym_config_path = "configs/agents/rl/push_cube/gym_config.json" + + with open(train_config_path, "r") as f: + train_config = json.load(f) + + with open(gym_config_path, "r") as f: + gym_config = json.load(f) + + # Add dataset configuration dynamically to gym_config + gym_config["env"]["dataset"] = { + "lerobot": { + "func": "LeRobotRecorder", + "mode": "save", + "params": { + "robot_meta": { + "robot_type": "UR10_DH_Gripper", + "control_freq": 25, + "arm_dofs": 6, + "observation": {"vision": {}, "states": ["qpos"]}, + }, + "instruction": {"lang": "push_cube_to_target"}, + "extra": { + "scene_type": "tabletop", + "task_description": "push_cube_rl_test", + }, + "use_videos": False, + "export_success_only": False, }, - "use_videos": False, - "export_success_only": False, - }, + } } - } - - # Create temporary gym config with dataset - with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: - json.dump(gym_config, f) - temp_gym_config_path = f.name - - # Create temporary train config with reduced iterations for testing - test_train_config = train_config.copy() - test_train_config["trainer"][ - "gym_config" - ] = temp_gym_config_path # Point to temp gym config - test_train_config["trainer"]["iterations"] = 2 # Only 2 iterations for testing - test_train_config["trainer"]["rollout_steps"] = 32 # Fewer rollout steps - test_train_config["trainer"]["eval_freq"] = 1000000 # Disable eval - test_train_config["trainer"]["save_freq"] = 1000000 # Disable save - test_train_config["trainer"]["headless"] = True - test_train_config["trainer"]["use_wandb"] = False - test_train_config["trainer"]["num_envs"] = 2 # Only 2 parallel envs for testing - - # Save temporary train config - with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: - json.dump(test_train_config, f) - temp_train_config_path = f.name - - try: - from embodichain.agents.rl.train import train_from_config - train_from_config(temp_train_config_path) - assert True + # Create temporary gym config with dataset + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + json.dump(gym_config, f) + self.temp_gym_config_path = f.name + + # Create temporary train config with reduced iterations for testing + test_train_config = train_config.copy() + test_train_config["trainer"]["gym_config"] = self.temp_gym_config_path + test_train_config["trainer"]["iterations"] = 2 + test_train_config["trainer"]["rollout_steps"] = 32 + test_train_config["trainer"]["eval_freq"] = 1000000 # Disable eval + test_train_config["trainer"]["save_freq"] = 1000000 # Disable save + test_train_config["trainer"]["headless"] = True + test_train_config["trainer"]["use_wandb"] = False + test_train_config["trainer"]["num_envs"] = 2 + + # Save temporary train config + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + json.dump(test_train_config, f) + self.temp_train_config_path = f.name - finally: + def teardown_method(self): + """Clean up resources after each 
test method.""" # Clean up temporary files - if os.path.exists(temp_train_config_path): - os.remove(temp_train_config_path) - if os.path.exists(temp_gym_config_path): - os.remove(temp_gym_config_path) + if self.temp_train_config_path and os.path.exists(self.temp_train_config_path): + os.remove(self.temp_train_config_path) + self.temp_train_config_path = None + + if self.temp_gym_config_path and os.path.exists(self.temp_gym_config_path): + os.remove(self.temp_gym_config_path) + self.temp_gym_config_path = None + + def test_training_pipeline(self): + """Test basic RL training pipeline with minimal iterations.""" + from embodichain.agents.rl.train import train_from_config + + # This should run without errors + train_from_config(self.temp_train_config_path) + + @pytest.mark.parametrize("num_envs", [1, 2, 4]) + def test_multi_env_training(self, num_envs: int): + """Test training with different numbers of parallel environments.""" + # Reload and modify config for this specific test + with open(self.temp_train_config_path, "r") as f: + config = json.load(f) + + config["trainer"]["num_envs"] = num_envs + config["trainer"][ + "iterations" + ] = 1 # Even fewer iterations for parameterized test + + # Save modified config + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + json.dump(config, f) + temp_config = f.name + + try: + from embodichain.agents.rl.train import train_from_config + + train_from_config(temp_config) + finally: + if os.path.exists(temp_config): + os.remove(temp_config) if __name__ == "__main__": From 924f7afb360e64aba17e4cc85915db0f1bc400e5 Mon Sep 17 00:00:00 2001 From: yuecideng Date: Thu, 22 Jan 2026 19:12:42 +0800 Subject: [PATCH 11/12] fix ci issue --- .github/workflows/main.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 63b697e6..f17f3ebe 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -54,6 +54,7 @@ jobs: echo "Start Building docs..." pip uninstall pymeshlab -y pip install pymeshlab==2023.12.post3 + pip install matplotlib==3.10.8 make html - name: Upload docs artifact if: github.event_name == 'push' && github.ref == 'refs/heads/main' From 1f66db16f8d6073f6d285f5d9a66987402db8c5c Mon Sep 17 00:00:00 2001 From: yuecideng Date: Thu, 22 Jan 2026 19:26:41 +0800 Subject: [PATCH 12/12] wip --- .github/workflows/main.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index f17f3ebe..0b83af33 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -48,13 +48,12 @@ jobs: - uses: actions/checkout@v4 - name: Build docs run: | - pip install -e .[lerobot] --extra-index-url http://pyp.open3dv.site:2345/simple/ --trusted-host pyp.open3dv.site + pip install -e . --extra-index-url http://pyp.open3dv.site:2345/simple/ --trusted-host pyp.open3dv.site pip install -r docs/requirements.txt cd ${GITHUB_WORKSPACE}/docs echo "Start Building docs..." pip uninstall pymeshlab -y pip install pymeshlab==2023.12.post3 - pip install matplotlib==3.10.8 make html - name: Upload docs artifact if: github.event_name == 'push' && github.ref == 'refs/heads/main' @@ -76,7 +75,7 @@ jobs: - uses: actions/checkout@v4 - name: Run tests run: | - pip install -e . 
--extra-index-url http://pyp.open3dv.site:2345/simple/ --trusted-host pyp.open3dv.site + pip install -e .[lerobot] --extra-index-url http://pyp.open3dv.site:2345/simple/ --trusted-host pyp.open3dv.site echo "Unit test Start" export HF_ENDPOINT=https://hf-mirror.com pip uninstall pymeshlab -y
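The test refactor in patch 10 repeats a small load-override-dump pattern for both the gym and train configs: read the JSON, patch a few trainer keys, and write the result to a temporary file that the trainer is then pointed at. A compact sketch of that pattern using only the standard library is shown below; the helper name `make_temp_config` is illustrative, while the override values mirror the ones set in `setup_method`.

    import json
    import tempfile

    def make_temp_config(base_path: str, overrides: dict) -> str:
        # Load the base JSON config, apply overrides to the "trainer" section,
        # and write the result to a temporary file whose path is returned.
        with open(base_path, "r") as f:
            config = json.load(f)
        config["trainer"].update(overrides)
        with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
            json.dump(config, f)
            return f.name

    # Usage mirroring the smoke test: tiny rollout, no logging, two parallel envs.
    # path = make_temp_config(
    #     "configs/agents/rl/push_cube/train_config.json",
    #     {"iterations": 2, "rollout_steps": 32, "num_envs": 2,
    #      "use_wandb": False, "headless": True},
    # )
    # The caller removes the file afterwards, e.g. in a finally block or teardown_method.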