From 88e46586dec800ecba225da38531aab9d12487ef Mon Sep 17 00:00:00 2001 From: cc <1716911340@qq.com> Date: Thu, 25 Sep 2025 11:36:08 +0800 Subject: [PATCH 01/57] fix(kl_divergence): fix wrong passed param when using kl_penalty (#129) Signed-off-by: Bo Dai --- rlinf/algorithms/losses.py | 2 +- rlinf/workers/actor/megatron_actor_worker.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/rlinf/algorithms/losses.py b/rlinf/algorithms/losses.py index 6900d5f68..237e9fb1a 100644 --- a/rlinf/algorithms/losses.py +++ b/rlinf/algorithms/losses.py @@ -233,7 +233,7 @@ def compute_math_ppo_actor_loss(**kwargs): dual_clip_mask.logical_and_(loss_mask) clip_fraction = clip_mask.logical_and_(loss_mask).count_nonzero() / loss_mask_count - approx_kl = approx_kl.sum() / loss_mask_count + approx_kl = -approx_kl.sum() / loss_mask_count dual_cliped_ratio = torch.where(dual_clip_mask, ratio, 0) diff --git a/rlinf/workers/actor/megatron_actor_worker.py b/rlinf/workers/actor/megatron_actor_worker.py index ae9beada5..469f3416b 100644 --- a/rlinf/workers/actor/megatron_actor_worker.py +++ b/rlinf/workers/actor/megatron_actor_worker.py @@ -394,7 +394,7 @@ def loss_func(output): kl_loss = torch.tensor(0.0, device=torch.cuda.current_device()) if self.kl_beta > 0 and ref_logprobs is not None: - kld = kl_penalty(ref_logprobs, curr_logprobs, self.kl_penalty_type) + kld = kl_penalty(curr_logprobs, ref_logprobs, self.kl_penalty_type) kl_loss = self.loss_agg_func(kld, mask) loss = loss + kl_loss * self.kl_beta From 92b44d70e28fe42c58ac01b85d7205682714eea0 Mon Sep 17 00:00:00 2001 From: QuanluZhang Date: Sat, 27 Sep 2025 16:26:47 +0800 Subject: [PATCH 02/57] docs(example-gallery): update example gallery doc pages (#137) Signed-off-by: QuanluZhang --- README.md | 3 +- docs/source-en/rst_source/examples/index.rst | 193 ++++++++++++++- docs/source-en/rst_source/examples/libero.rst | 232 ++++++++++++++++++ .../examples/{embodied.rst => maniskill.rst} | 80 +----- .../rst_source/examples/reasoning.rst | 4 +- docs/source-zh/rst_source/examples/index.rst | 186 +++++++++++++- docs/source-zh/rst_source/examples/libero.rst | 222 +++++++++++++++++ .../examples/{embodied.rst => maniskill.rst} | 68 +---- .../rst_source/examples/reasoning.rst | 4 +- 9 files changed, 838 insertions(+), 154 deletions(-) create mode 100644 docs/source-en/rst_source/examples/libero.rst rename docs/source-en/rst_source/examples/{embodied.rst => maniskill.rst} (74%) create mode 100644 docs/source-zh/rst_source/examples/libero.rst rename docs/source-zh/rst_source/examples/{embodied.rst => maniskill.rst} (78%) diff --git a/README.md b/README.md index 9858cc577..5b673c23c 100644 --- a/README.md +++ b/README.md @@ -23,8 +23,9 @@ RLinf is a flexible and scalable open-source infrastructure designed for post-tr ## What's NEW! +- [2025/09] [Example Gallery](https://rlinf.readthedocs.io/en/latest/rst_source/examples/index.html) is updated, users can find various off-the-shelf examples! +- [2025/09] The paper [RLinf: Flexible and Efficient Large-scale Reinforcement Learning via Macro-to-Micro Flow Transformation](https://arxiv.org/abs/2509.15965) is released. - [2025/08] RLinf is open-sourced. The formal v0.1 will be released soon. -- [2025/09] The paper [RLinf: Flexible and Efficient Large-scale Reinforcement Learning via Macro-to-Micro Flow Transformation](https://arxiv.org/abs/2509.15965) is released. 
## Key Features diff --git a/docs/source-en/rst_source/examples/index.rst b/docs/source-en/rst_source/examples/index.rst index 678f2d68c..4b3984a5e 100644 --- a/docs/source-en/rst_source/examples/index.rst +++ b/docs/source-en/rst_source/examples/index.rst @@ -4,20 +4,195 @@ Example Gallery This section presents the collection of **examples currently supported by RLinf**, showcasing how the framework can be applied across different scenarios and demonstrating its efficiency in practice. +This example gallery is continuously expanding, covering new scenarios and tasks to highlight RLinf's flexibility and efficiency. -At present, we provide two major categories of examples: +Embodied Intelligence Scenarios +------------------------------- -- **Embodied Agent Scenario**: Training **VLA** models for embodied intelligence. - See :doc:`embodied`. -- **Reasoner Scenario**: Training **LLM** models for advanced reasoning tasks. - See :doc:`reasoning`. +This category includes embodied training examples with SOTA models (e.g., pi0, pi0.5, OpenVLA-OFT) and different simulators (e.g., LIBERO, ManiSkill, RoboTwin), +as well as reinforcement learning training examples on real robots. + +.. raw:: html + +
+   - **RL with ManiSkill Simulator**: ManiSkill + OpenVLA + PPO/GRPO achieves SOTA performance
+   - **RL with LIBERO Simulator**: LIBERO + OpenVLA-OFT + GRPO reaches 99% success rate
+   - **[Ongoing] RL on π₀ Models**: Significant improvement in RL training on π₀
+   - **[Ongoing] RL with RoboTwin**: RoboTwin + OpenVLA-OFT + PPO achieves SOTA performance
+   - **[Ongoing] Real-World RL with Franka**: RLinf worker seamlessly integrates with the Franka robotic arm
+   - **[Ongoing] RL with World Models**: Training with integrated UnifoLM-WMA-0 world models
+
+Reasoning Scenarios
+-------------------
+
+Reinforcement learning is a key approach to improving reasoning capabilities. RLinf supports mainstream models such as Qwen and Qwen-next for RL training in tasks like Math, achieving SOTA results.
+
+.. raw:: html
+
+   - **RL Training for Math Reasoning**: Achieves SOTA results on AIME24/AIME25/GPQA-diamond benchmarks
+   - **[Ongoing] RL Training for MoE Models**: RL training speed improved by xx% compared to other tools
+   - **[Ongoing] RL Training for Qwen-next**: Achieves SOTA training performance with Qwen-next
+
+Agent Scenarios
+---------------
+
+RLinf's worker abstraction, flexible communication modules, and support for various accelerators make it naturally suited for building agent workflows and training agents.
+The following examples include agent workflow construction, online RL training, and environment integration.
+
+.. raw:: html
+
+   - **Open-Source Online RL for Code Completion**: End-to-end online RL with RLinf + Continue, improving model performance by xx%
+   - **[Ongoing] rStar2-agent RL Training**: Flexible resource allocation and scheduling across components
+   - **[Ongoing] SWE-agent**: Unified deployment, inference, and training with high flexibility and performance
+
+Practical System Features
+-------------------------
+
+RLinf's overall design is simple and modular.
+Workers abstract components for RL and agents, with a flexible and efficient communication library enabling inter-component interaction.
+Thanks to this decoupled design, workers can be flexibly and dynamically scheduled to computing resources or assigned to the most suitable accelerators.
+
+.. raw:: html
+
+   - **[Ongoing] Hot Scaling/Switching of Workers (Components)**: Hot switching reduces training time by 50%+
+   - **[Ongoing] Hybrid Training on Heterogeneous Accelerators**: Flexible inter-operability between components on different accelerators to build training workflows
-The example gallery will continue to expand with new scenarios and tasks over time, -illustrating the versatility and scalability of RLinf. .. toctree:: :hidden: :maxdepth: 2 - embodied - reasoning + maniskill + libero + reasoning \ No newline at end of file diff --git a/docs/source-en/rst_source/examples/libero.rst b/docs/source-en/rst_source/examples/libero.rst new file mode 100644 index 000000000..7836ca9c8 --- /dev/null +++ b/docs/source-en/rst_source/examples/libero.rst @@ -0,0 +1,232 @@ +RL with LIBERO Simulator +======================== + +.. |huggingface| image:: /_static/svg/hf-logo.svg + :width: 16px + :height: 16px + :class: inline-icon + +This document provides a comprehensive guide to launching and managing the +Vision-Language-Action Models (VLAs) training task within the RLinf framework, +focusing on finetuning a VLA model for robotic manipulation in the LIBERO environment. + +The primary objective is to develop a model capable of performing robotic manipulation by: + +1. **Visual Understanding**: Processing RGB images from the robot's camera. +2. **Language Comprehension**: Interpreting natural-language task descriptions. +3. **Action Generation**: Producing precise robotic actions (position, rotation, gripper control). +4. **Reinforcement Learning**: Optimizing the policy via the PPO with environment feedback. + +Environment +----------------------- + +**LIBERO Environment** + +- **Environment**: LIBERO simulation benchmark built on top of *robosuite* (MuJoCo). +- **Task**: Command a 7-DoF robotic arm to perform a variety of household manipulation skills (pick-and-place, stacking, opening drawers, spatial rearrangement). +- **Observation**: RGB images (typical resolutions 128 × 128 or 224 × 224) captured by off-screen cameras placed around the workspace. +- **Action Space**: 7-dimensional continuous actions + - 3D end-effector position control (x, y, z) + - 3D rotation control (roll, pitch, yaw) + - Gripper control (open / close) + +**Task Description Format** + +.. code-block:: text + + In: What action should the robot take to [task_description]? + Out: + +**Data Structure** + +- **Images**: RGB tensors ``[batch_size, 3, 224, 224]`` +- **Task Descriptions**: Natural-language instructions +- **Actions**: Normalized continuous values converted to discrete tokens +- **Rewards**: Step-level rewards based on task completion + +Algorithm +----------------------------------------- + +**Core Algorithm Components** + +1. **PPO (Proximal Policy Optimization)** + + - Advantage estimation using GAE (Generalized Advantage Estimation) + + - Policy clipping with ratio limits + + - Value function clipping + + - Entropy regularization + +2. **GRPO (Group Relative Policy Optimization)** + + - For every state / prompt the policy generates *G* independent actions + + - Compute the advantage of each action by subtracting the group’s mean reward. + + +3. **Vision-Language-Action Model** + + - OpenVLA architecture with multimodal fusion + + - Action tokenization and de-tokenization + + - Value head for critic function + +Running the Script +------------------- + +**1. Key Parameters Configuration** + +.. code-block:: yaml + + cluster: + num_nodes: 2 + component_placement: + env: 0-7 + rollout: 8-15 + actor: 0-15 + + rollout: + pipeline_stage_num: 2 + +Here you can flexibly configure the GPU count for env, rollout, and actor components. +Using the above configuration, you can achieve pipeline overlap between env and rollout, and sharing with actor. 
+Additionally, by setting `pipeline_stage_num = 2` in the configuration, you can achieve pipeline overlap between rollout and actor, improving rollout efficiency. + +.. code-block:: yaml + + cluster: + num_nodes: 1 + component_placement: + env,rollout,actor: all + +You can also reconfigure the placement to achieve complete sharing, where env, rollout, and actor components all share all GPUs. + +.. code-block:: yaml + + cluster: + num_nodes: 2 + component_placement: + env: 0-3 + rollout: 4-7 + actor: 8-15 + +You can also reconfigure the placement to achieve complete separation, where env, rollout, and actor components each use their own GPUs without interference, eliminating the need for offload functionality. + +**2. Configuration Files** + +We currently support training in two environments: **ManiSkill3** and **LIBERO**. + +We support the **OpenVLA-OFT** model with both **PPO** and **GRPO** algorithms. +The corresponding configuration files are: + +- **OpenVLA-OFT + PPO**: ``examples/embodiment/config/libero_10_ppo_openvlaoft.yaml`` +- **OpenVLA-OFT + GRPO**: ``examples/embodiment/config/libero_10_grpo_openvlaoft.yaml`` + +**3. Launch Commands** + +To start training with a chosen configuration, run the following command: + +.. code-block:: bash + + bash examples/embodiment/run_embodiment.sh CHOSEN_CONFIG + +For example, to train the OpenVLA model using the PPO algorithm in the ManiSkill3 environment, run: + +.. code-block:: bash + + bash examples/embodiment/run_embodiment.sh libero_10_ppo_openvlaoft + + +Visualization and Results +------------------------- + +**1. TensorBoard Logging** + +.. code-block:: bash + + # Start TensorBoard + tensorboard --logdir ./logs --port 6006 + +**2. Key Metrics Tracked** + +- **Training Metrics**: + + - ``actor/loss``: PPO policy loss + - ``actor/value_loss``: Value function loss + - ``actor/entropy``: Policy entropy + - ``actor/grad_norm``: Gradient norm + - ``actor/lr``: Learning rate + +- **Rollout Metrics**: + + - ``rollout/reward_mean``: Average episode reward + - ``rollout/reward_std``: Reward standard deviation + - ``rollout/episode_length``: Average episode length + - ``rollout/success_rate``: Task completion rate + +- **Environment Metrics**: + + - ``env/success_rate``: Success rate across environments + - ``env/step_reward``: Step-by-step reward + - ``env/termination_rate``: Episode termination rate + +**3. Video Generation** + +.. code-block:: yaml + + video_cfg: + save_video: True + info_on_video: True + video_base_dir: ./logs/video/train + +**4. WandB Integration** + +.. code-block:: yaml + + trainer: + logger: + wandb: + enable: True + project_name: "RLinf" + experiment_name: "openvla-maniskill" + + +LIBERO Results +~~~~~~~~~~~~~~~~~~~ + +Furthermore, we trained OpenVLA-OFT in the LIBERO environment using the GRPO algorithm. The improvements achieved through our RL fine-tuning are shown below: + +.. list-table:: **OpenVLA-OFT model results on LIBERO** + :header-rows: 1 + + * - Model + - `Spatial `_ + - `Goal `_ + - `Object `_ + - `Long `_ + - Average + * - OpenVLA-OFT-SFT (one-shot) + - 56.5% + - 45.6% + - 25.6% + - 9.7% + - 34.4% + * - OpenVLA-OFT-RLinf + - **99.0%** + - **99.0%** + - **99.0%** + - **94.4%** + - **97.9%** + * - Improvement + - +42.5% + - +53.4% + - +73.4% + - +84.7% + - +63.5% + +For the Libero experiment, we were inspired by +`SimpleVLA `_, +with only minor modifications. We thank the authors for releasing their open-source code. 
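+
+As a minimal illustration of the group-relative advantage described in the Algorithm section above, the sketch below subtracts each group's mean reward from its *G* sampled rollouts. This is a simplified example, not RLinf's actual implementation; the tensor shape ``[num_groups, G]`` is an assumption made for illustration:
+
+.. code-block:: python
+
+   import torch
+
+   def grpo_advantages(rewards: torch.Tensor) -> torch.Tensor:
+       # rewards: [num_groups, G] scalar rewards for G rollouts per prompt/state.
+       # Baseline each rollout against its group's mean reward
+       # (group-relative advantage, as used by GRPO).
+       baseline = rewards.mean(dim=-1, keepdim=True)
+       return rewards - baseline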
diff --git a/docs/source-en/rst_source/examples/embodied.rst b/docs/source-en/rst_source/examples/maniskill.rst similarity index 74% rename from docs/source-en/rst_source/examples/embodied.rst rename to docs/source-en/rst_source/examples/maniskill.rst index e35f9c4ef..de1992441 100644 --- a/docs/source-en/rst_source/examples/embodied.rst +++ b/docs/source-en/rst_source/examples/maniskill.rst @@ -1,5 +1,5 @@ -Agentic RL-VLA -======================== +RL with ManiSkill Simulator +=========================== .. |huggingface| image:: /_static/svg/hf-logo.svg :width: 16px @@ -8,7 +8,7 @@ Agentic RL-VLA This document provides a comprehensive guide to launching and managing the Vision-Language-Action Models (VLAs) training task within the RLinf framework, -focusing on finetuning a VLA model for robotic manipulation in the ManiSkill3/LIBERO environment. +focusing on finetuning a VLA model for robotic manipulation in the ManiSkill3 environment. The primary objective is to develop a model capable of performing robotic manipulation by: @@ -18,7 +18,7 @@ The primary objective is to develop a model capable of performing robotic manipu 4. **Reinforcement Learning**: Optimizing the policy via the PPO with environment feedback. Environment ------------------------ +----------- **ManiSkill3 Environment** @@ -30,16 +30,6 @@ Environment - 3D rotation control (roll, pitch, yaw) - Gripper control (open/close) -**LIBERO Environment** - -- **Environment**: LIBERO simulation benchmark built on top of *robosuite* (MuJoCo). -- **Task**: Command a 7-DoF robotic arm to perform a variety of household manipulation skills (pick-and-place, stacking, opening drawers, spatial rearrangement). -- **Observation**: RGB images (typical resolutions 128 × 128 or 224 × 224) captured by off-screen cameras placed around the workspace. -- **Action Space**: 7-dimensional continuous actions - - 3D end-effector position control (x, y, z) - - 3D rotation control (roll, pitch, yaw) - - Gripper control (open / close) - **Task Description Format** .. code-block:: text @@ -129,23 +119,13 @@ You can also reconfigure the placement to achieve complete separation, where env We currently support training in two environments: **ManiSkill3** and **LIBERO**. -1. **ManiSkill3 Environment** - - We support two models: **OpenVLA** and **OpenVLA-OFT**, along with two algorithms: **PPO** and **GRPO**. - The corresponding configuration files are: - - - **OpenVLA + PPO**: ``examples/embodiment/config/maniskill_ppo_openvla.yaml`` - - **OpenVLA-OFT + PPO**: ``examples/embodiment/config/maniskill_ppo_openvlaoft.yaml`` - - **OpenVLA + GRPO**: ``examples/embodiment/config/maniskill_grpo_openvla.yaml`` - - **OpenVLA-OFT + GRPO**: ``examples/embodiment/config/maniskill_grpo_openvlaoft.yaml`` +We support two models: **OpenVLA** and **OpenVLA-OFT**, along with two algorithms: **PPO** and **GRPO**. +The corresponding configuration files are: -2. **LIBERO Environment** - - We support the **OpenVLA-OFT** model with both **PPO** and **GRPO** algorithms. 
- The corresponding configuration files are: - - - **OpenVLA-OFT + PPO**: ``examples/embodiment/config/libero_10_ppo_openvlaoft.yaml`` - - **OpenVLA-OFT + GRPO**: ``examples/embodiment/config/libero_10_grpo_openvlaoft.yaml`` +- **OpenVLA + PPO**: ``examples/embodiment/config/maniskill_ppo_openvla.yaml`` +- **OpenVLA-OFT + PPO**: ``examples/embodiment/config/maniskill_ppo_openvlaoft.yaml`` +- **OpenVLA + GRPO**: ``examples/embodiment/config/maniskill_grpo_openvla.yaml`` +- **OpenVLA-OFT + GRPO**: ``examples/embodiment/config/maniskill_grpo_openvlaoft.yaml`` **3. Launch Commands** @@ -216,7 +196,7 @@ Visualization and Results experiment_name: "openvla-maniskill" ManiSkill3 Results -~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~ As an illustrative example, we present the training results of the PPO algorithm in the ManiSkill3 environment. Running on a single 8-GPU H100 machine, OpenVLA (left) and OpenVLA-OFT (right) achieved up to 90% success on ManiSkill3’s plate-25-main task, after 48 and 24 hours of PPO training, respectively. @@ -289,41 +269,3 @@ using the PPO algorithm within the RLinf framework. Your browser does not support the video tag. - - -LIBERO Results -~~~~~~~~~~~~~~~~~~~ - -Furthermore, we trained OpenVLA-OFT in the LIBERO environment using the GRPO algorithm. The improvements achieved through our RL fine-tuning are shown below: - -.. list-table:: **OpenVLA-OFT model results on LIBERO** - :header-rows: 1 - - * - Model - - `Spatial `_ - - `Goal `_ - - `Object `_ - - `Long `_ - - Average - * - OpenVLA-OFT-SFT (one-shot) - - 56.5% - - 45.6% - - 25.6% - - 9.7% - - 34.4% - * - OpenVLA-OFT-RLinf - - **99.0%** - - **99.0%** - - **99.0%** - - **94.4%** - - **97.9%** - * - Improvement - - +42.5% - - +53.4% - - +73.4% - - +84.7% - - +63.5% - -For the Libero experiment, we were inspired by -`SimpleVLA `_, -with only minor modifications. We thank the authors for releasing their open-source code. \ No newline at end of file diff --git a/docs/source-en/rst_source/examples/reasoning.rst b/docs/source-en/rst_source/examples/reasoning.rst index d786791cf..fdac14524 100644 --- a/docs/source-en/rst_source/examples/reasoning.rst +++ b/docs/source-en/rst_source/examples/reasoning.rst @@ -1,5 +1,5 @@ -Reasoning RL-LLM -================= +Math Reasoning RL Training +========================== .. |huggingface| image:: /_static/svg/hf-logo.svg :width: 16px diff --git a/docs/source-zh/rst_source/examples/index.rst b/docs/source-zh/rst_source/examples/index.rst index b1f1c2122..70acd04a9 100644 --- a/docs/source-zh/rst_source/examples/index.rst +++ b/docs/source-zh/rst_source/examples/index.rst @@ -2,19 +2,191 @@ =============== 本节展示了 **RLinf 目前支持的示例集合**, -展示该框架如何应用于不同场景,并演示其在实际中的高效性。 +展示该框架如何应用于不同场景,并演示其在实际中的高效性。示例库会随着时间不断扩展,涵盖新的场景和任务,以展示 RLinf 的多样性和可扩展性。 -目前,我们提供两大类示例: +具身智能场景 +---------------- -- **具身智能场景**:训练 **VLA** 模型用于具身智能。参见 :doc:`embodied`。 -- **推理场景**:训练 **LLM** 模型用于高级推理任务。参见 :doc:`reasoning`。 +具身智能场景包含SOTA模型(如pi0、pi0.5、OpenVLA-OFT)和不同模拟器(如LIBERO、ManiSkill、RoboTwin)的训练示例,以及真机强化学习训练示例等。 + +.. raw:: html + +
+   - **基于ManiSkill的强化学习**:ManiSkill+OpenVLA+PPO/GRPO达到SOTA训练效果
+   - **基于LIBERO的强化学习**:LIBERO+OpenVLA-OFT+GRPO成功率达99%
+   - **[开发中]π₀模型强化学习训练**:在π₀上实现强化学习的效果跃升
+   - **[开发中]基于RoboTwin的强化学习**:RoboTwin+OpenVLA-OFT+PPO达到SOTA训练效果
+   - **[开发中]Franka真机强化学习**:RLinf worker无缝对接Franka机械臂
+   - **[开发中]基于世界模型的强化学习**:集成UnifoLM-WMA-0世界模型的强化学习训练
+
+推理场景
+--------------
+
+强化学习是提升模型推理能力的关键手段,RLinf支持主流模型如Qwen、Qwen-next在Math等场景的强化学习训练,并达到SOTA的训练效果。
+
+.. raw:: html
+
+   - **Math推理的强化学习训练**:AIME24/AIME25/GPQA-diamond评测结果达到SOTA
+   - **[开发中]MoE模型强化学习训练**:MoE RL训练速度相比同类工具提升xx%
+   - **[开发中]Qwen-next强化学习训练**:Qwen-next强化学习训练效果达到SOTA
+
+智能体场景
+--------------
+
+RLinf的worker抽象、灵活的通信组件、以及对不同类型加速器的支持使RLinf天然支持智能体工作流的构建,以及智能体的训练。以下示例包含智能体工作流构建、在线强化学习训练、环境接入等示例。
+
+.. raw:: html
+
+   - **代码补全在线强化学习开源版**:基于RLinf+continue实现端到端在线强化学习,模型效果提升xx%
+   - **[适配中]rStar2-agent强化学习**:支持各组件所用资源量的灵活配置与调度
+   - **[适配中]SWE-agent**:部署、推理、训练一体,高灵活性、高性能
+
+实用系统功能
+--------------------
+
+RLinf的整体设计简洁且模块化,以Worker为抽象封装强化学习训练、智能体中的组件,提供灵活高效的通信库做组件间通信。基于这种解耦的设计,可以灵活调度Worker所使用的计算资源,也可以将Worker分配到更适配的加速器上。
+
+.. raw:: html
+
+   - **[开发中]Worker(组件)间秒级热切换**:秒级热切换提升训练速度50%+
+   - **[开发中]异构加速器混合训练**:使用不同加速器运行的组件间灵活互通,构建训练工作流
-示例库会随着时间不断扩展,涵盖新的场景和任务, -以展示 RLinf 的多样性和可扩展性。 .. toctree:: :hidden: :maxdepth: 2 - embodied + maniskill + libero reasoning diff --git a/docs/source-zh/rst_source/examples/libero.rst b/docs/source-zh/rst_source/examples/libero.rst new file mode 100644 index 000000000..14b27a7c7 --- /dev/null +++ b/docs/source-zh/rst_source/examples/libero.rst @@ -0,0 +1,222 @@ +基于LIBERO模拟器的强化学习训练 +=========================================================== + +.. |huggingface| image:: /_static/svg/hf-logo.svg + :width: 16px + :height: 16px + :class: inline-icon + +本文档给出在 RLinf 框架内启动与管理 **Vision-Language-Action Models (VLAs)** 训练任务的完整指南, +在 LIBERO 环境中微调 VLA 模型以完成机器人操作。 + +主要目标是让模型具备以下能力: + +1. **视觉理解**:处理来自机器人相机的 RGB 图像。 +2. **语言理解**:理解自然语言的任务描述。 +3. **动作生成**:产生精确的机器人动作(位置、旋转、夹爪控制)。 +4. **强化学习**:结合环境反馈,使用 PPO 优化策略。 + +环境 +----------------------- + +**LIBERO 环境** + +- **Environment**:基于 *robosuite*(MuJoCo)的 LIBERO 仿真基准 +- **Task**:指挥一台 7 自由度机械臂完成多种家居操作技能(抓取放置、叠放、开抽屉、空间重排等) +- **Observation**:工作区周围离屏相机采集的 RGB 图像(常见分辨率 128×128 或 224×224) +- **Action Space**:7 维连续动作 + - 末端执行器三维位置控制(x, y, z) + - 三维旋转控制(roll, pitch, yaw) + - 夹爪控制(开/合) + +**任务描述格式** + +.. code-block:: text + + In: What action should the robot take to [task_description]? + Out: + +**数据结构** + +- **Images**:RGB 张量 ``[batch_size, 3, 224, 224]`` +- **Task Descriptions**:自然语言指令 +- **Actions**:归一化的连续值,转换为离散 tokens +- **Rewards**:基于任务完成度的逐步奖励 + +算法 +----------------------------------------- + +**核心算法组件** + +1. **PPO(Proximal Policy Optimization)** + + - 使用 GAE(Generalized Advantage Estimation)进行优势估计 + - 基于比率的策略裁剪 + - 价值函数裁剪 + - 熵正则化 + +2. **GRPO(Group Relative Policy Optimization)** + + - 对于每个状态/提示,策略生成 *G* 个独立动作 + - 以组内平均奖励为基线,计算每个动作的相对优势 + +3. **Vision-Language-Action 模型** + + - OpenVLA 架构,多模态融合 + - 动作 token 化与反 token 化 + - 带 Value Head 的 Critic 功能 + +运行脚本 +------------------- + +**1. 关键参数配置** + +.. code-block:: yaml + + cluster: + num_nodes: 2 + component_placement: + env: 0-7 + rollout: 8-15 + actor: 0-15 + + rollout: + pipeline_stage_num: 2 + +你可以灵活配置 env、rollout、actor 三个组件使用的 GPU 数量。 +使用上述配置,可以让 env 与 rollout 之间流水线重叠,并与 actor 共享。 +此外,在配置中设置 `pipeline_stage_num = 2`,可实现 **rollout 与 actor** 之间的流水线重叠,从而提升 rollout 效率。 + +.. code-block:: yaml + + cluster: + num_nodes: 1 + component_placement: + env,rollout,actor: all + +你也可以重新配置 Placement,实现 **完全共享**:env、rollout、actor 三个组件共享全部 GPU。 + +.. code-block:: yaml + + cluster: + num_nodes: 2 + component_placement: + env: 0-3 + rollout: 4-7 + actor: 8-15 + +你还可以重新配置 Placement,实现 **完全分离**:env、rollout、actor 各用各的 GPU、互不干扰, +这样就不需要 offload 功能。 + +**2. 配置文件** + + 支持 **OpenVLA-OFT** 模型,算法为 **PPO** 与 **GRPO**。 + 对应配置文件: + + - **OpenVLA-OFT + PPO**:``examples/embodiment/config/libero_10_ppo_openvlaoft.yaml`` + - **OpenVLA-OFT + GRPO**:``examples/embodiment/config/libero_10_grpo_openvlaoft.yaml`` + +**3. 启动命令** + +选择配置后,运行以下命令开始训练: + +.. code-block:: bash + + bash examples/embodiment/run_embodiment.sh CHOSEN_CONFIG + +例如,在 LIBERO 环境中使用 PPO 训练 OpenVLA 模型: + +.. code-block:: bash + + bash examples/embodiment/run_embodiment.sh libero_10_ppo_openvlaoft + +可视化与结果 +------------------------- + +**1. TensorBoard 日志** + +.. code-block:: bash + + # 启动 TensorBoard + tensorboard --logdir ./logs --port 6006 + +**2. 
关键监控指标** + +- **训练指标**: + + - ``actor/loss``:PPO 策略损失 + - ``actor/value_loss``:价值函数损失 + - ``actor/entropy``:策略熵 + - ``actor/grad_norm``:梯度范数 + - ``actor/lr``:学习率 + +- **Rollout 指标**: + + - ``rollout/reward_mean``:平均回合奖励 + - ``rollout/reward_std``:奖励标准差 + - ``rollout/episode_length``:平均回合长度 + - ``rollout/success_rate``:任务完成率 + +- **环境指标**: + + - ``env/success_rate``:各环境的成功率 + - ``env/step_reward``:逐步奖励 + - ``env/termination_rate``:回合终止率 + +**3. 视频生成** + +.. code-block:: yaml + + video_cfg: + save_video: True + info_on_video: True + video_base_dir: ./logs/video/train + +**4. WandB 集成** + +.. code-block:: yaml + + trainer: + logger: + wandb: + enable: True + project_name: "RLinf" + experiment_name: "openvla-libero" + +LIBERO 结果 +~~~~~~~~~~~~~~~~~~~ + +此外,我们在 LIBERO 环境中使用 GRPO 训练了 OpenVLA-OFT。 +通过 RL 微调所获得的改进如下: + +.. list-table:: **LIBERO 上 OpenVLA-OFT 的模型结果** + :header-rows: 1 + + * - 模型 + - `Spatial `_ + - `Goal `_ + - `Object `_ + - `Long `_ + - 平均值 + * - OpenVLA-OFT-SFT (one-shot) + - 56.5% + - 45.6% + - 25.6% + - 9.7% + - 34.4% + * - OpenVLA-OFT-RLinf + - **99.0%** + - **99.0%** + - **99.0%** + - **94.4%** + - **97.9%** + * - 提升 + - +42.5% + - +53.4% + - +73.4% + - +84.7% + - +63.5% + +在 Libero 实验中,我们参考了 +`SimpleVLA `_,仅做了少量改动。 +感谢作者开源代码。 diff --git a/docs/source-zh/rst_source/examples/embodied.rst b/docs/source-zh/rst_source/examples/maniskill.rst similarity index 78% rename from docs/source-zh/rst_source/examples/embodied.rst rename to docs/source-zh/rst_source/examples/maniskill.rst index cf510b986..e9d00fb7f 100644 --- a/docs/source-zh/rst_source/examples/embodied.rst +++ b/docs/source-zh/rst_source/examples/maniskill.rst @@ -1,13 +1,13 @@ -具身智能 RL-VLA -======================== +基于ManiSkill模拟器的强化学习训练 +====================================================== .. |huggingface| image:: /_static/svg/hf-logo.svg :width: 16px :height: 16px :class: inline-icon -本文档给出在 RLinf 框架内启动与管理 **Vision-Language-Action Models (VLAs)** 训练任务的完整指南, -重点是在 ManiSkill3/LIBERO 环境中微调 VLA 模型以完成机器人操作。 +本文档给出在 RLinf 框架内启动与管理 **Vision-Language-Action Models (VLAs)** 训练任务的完整指南, +在ManiSkill3环境中微调VLA模型以完成机器人操作。 主要目标是让模型具备以下能力: @@ -29,16 +29,6 @@ - 三维旋转控制(roll, pitch, yaw) - 夹爪控制(开/合) -**LIBERO 环境** - -- **Environment**:基于 *robosuite*(MuJoCo)的 LIBERO 仿真基准 -- **Task**:指挥一台 7 自由度机械臂完成多种家居操作技能(抓取放置、叠放、开抽屉、空间重排等) -- **Observation**:工作区周围离屏相机采集的 RGB 图像(常见分辨率 128×128 或 224×224) -- **Action Space**:7 维连续动作 - - 末端执行器三维位置控制(x, y, z) - - 三维旋转控制(roll, pitch, yaw) - - 夹爪控制(开/合) - **任务描述格式** .. code-block:: text @@ -120,10 +110,6 @@ **2. 配置文件** -当前我们支持两个环境:**ManiSkill3** 与 **LIBERO**。 - -1. **ManiSkill3 环境** - 支持两种模型:**OpenVLA** 与 **OpenVLA-OFT**;两种算法:**PPO** 与 **GRPO**。 对应配置文件: @@ -132,14 +118,6 @@ - **OpenVLA + GRPO**:``examples/embodiment/config/maniskill_grpo_openvla.yaml`` - **OpenVLA-OFT + GRPO**:``examples/embodiment/config/maniskill_grpo_openvlaoft.yaml`` -2. **LIBERO 环境** - - 支持 **OpenVLA-OFT** 模型,算法为 **PPO** 与 **GRPO**。 - 对应配置文件: - - - **OpenVLA-OFT + PPO**:``examples/embodiment/config/libero_10_ppo_openvlaoft.yaml`` - - **OpenVLA-OFT + GRPO**:``examples/embodiment/config/libero_10_grpo_openvlaoft.yaml`` - **3. 启动命令** 选择配置后,运行以下命令开始训练: @@ -279,41 +257,3 @@ ManiSkill3 结果 Your browser does not support the video tag. - -LIBERO 结果 -~~~~~~~~~~~~~~~~~~~ - -此外,我们在 LIBERO 环境中使用 GRPO 训练了 OpenVLA-OFT。 -通过 RL 微调所获得的改进如下: - -.. 
list-table:: **LIBERO 上 OpenVLA-OFT 的模型结果** - :header-rows: 1 - - * - 模型 - - `Spatial `_ - - `Goal `_ - - `Object `_ - - `Long `_ - - 平均值 - * - OpenVLA-OFT-SFT (one-shot) - - 56.5% - - 45.6% - - 25.6% - - 9.7% - - 34.4% - * - OpenVLA-OFT-RLinf - - **99.0%** - - **99.0%** - - **99.0%** - - **94.4%** - - **97.9%** - * - 提升 - - +42.5% - - +53.4% - - +73.4% - - +84.7% - - +63.5% - -在 Libero 实验中,我们参考了 -`SimpleVLA `_,仅做了少量改动。 -感谢作者开源代码。 diff --git a/docs/source-zh/rst_source/examples/reasoning.rst b/docs/source-zh/rst_source/examples/reasoning.rst index 2ac1e1791..a0b5d4b93 100644 --- a/docs/source-zh/rst_source/examples/reasoning.rst +++ b/docs/source-zh/rst_source/examples/reasoning.rst @@ -1,5 +1,5 @@ -推理 RL-LLM -================= +Math推理的强化学习训练 +================================ .. |huggingface| image:: /_static/svg/hf-logo.svg :width: 16px From 59cc43e04729065f1f441acbe8876aaf48e89afe Mon Sep 17 00:00:00 2001 From: Andy Lin <32576375+andylin-hao@users.noreply.github.com> Date: Sun, 28 Sep 2025 16:14:43 +0800 Subject: [PATCH 03/57] fix: handle num_nodes configuration mismatch with actual node number (#139) Signed-off-by: Hao Lin --- rlinf/scheduler/cluster.py | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/rlinf/scheduler/cluster.py b/rlinf/scheduler/cluster.py index 506d09455..8882d082a 100644 --- a/rlinf/scheduler/cluster.py +++ b/rlinf/scheduler/cluster.py @@ -12,10 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +import logging import os import signal import sys import time +import warnings from dataclasses import dataclass from importlib.metadata import version from typing import TYPE_CHECKING, Dict, List, Optional, Type @@ -106,6 +108,22 @@ def __init__(self, num_nodes: Optional[int] = None): self._has_initialized = True def _init_and_launch_managers(self, num_nodes: int): + assert num_nodes > 0, "num_nodes must be greater than 0." + + # Add logger + self._logger = logging.getLogger(Cluster.SYS_NAME) + self._logger.setLevel(Cluster.LOGGING_LEVEL) + self._logger.propagate = False + for handler in self._logger.handlers: + self._logger.removeHandler(handler) + handler = logging.StreamHandler() + formatter = logging.Formatter( + fmt="[%(levelname)s %(asctime)s %(name)s] %(message)s", + datefmt="%H:%M:%S", + ) + handler.setFormatter(formatter) + self._logger.addHandler(handler) + self._num_nodes = num_nodes self._set_default_env_vars() @@ -143,9 +161,8 @@ def _init_and_launch_managers(self, num_nodes: int): # Wait for the cluster to be ready while len(ray.nodes()) < self._num_nodes: - print( - f"Waiting for {self._num_nodes} nodes to be ready, currently {len(ray.nodes())} nodes available.", - flush=True, + self._logger.warning( + f"Waiting for {self._num_nodes} nodes to be ready, currently {len(ray.nodes())} nodes available." ) time.sleep(1) @@ -177,6 +194,17 @@ def _init_and_launch_managers(self, num_nodes: int): node for nodes in nodes_group_by_accel_type.values() for node in nodes ] + # Handle num_nodes configuration mismatch with actual node number + if len(self._nodes) > self._num_nodes: + warnings.warn( + f"The cluster is initialized with {self._num_nodes} nodes, but detected {len(self._nodes)} nodes have joined the ray cluster. So only the first {self._num_nodes} nodes are used." 
+ ) + self._nodes = self._nodes[: self._num_nodes] + + self._logger.info( + f"{Cluster.SYS_NAME} is running on a cluster with {len(self._nodes)} node{'s' if len(self._nodes) > 1 else ''} and {self.num_accelerators_in_cluster} accelerator{'s' if self.num_accelerators_in_cluster > 1 else ''}. The nodes' details are: {self._nodes}" + ) + # Launch managers from .manager import ( CollectiveManager, From 7875bdda491573f45b780a0731e3baa415820b5f Mon Sep 17 00:00:00 2001 From: Hongzhi Zang Date: Sun, 28 Sep 2025 21:56:54 +0800 Subject: [PATCH 04/57] feat: simplify the maniskill reset id (#135) * feat: simplify the maniskill reset id * feat: fix offload wrapper for all reset state id Signed-off-by: hongzhi --- rlinf/envs/maniskill/maniskill_env.py | 5 +---- rlinf/envs/offload_wrapper/maniskill_wrapper.py | 2 -- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/rlinf/envs/maniskill/maniskill_env.py b/rlinf/envs/maniskill/maniskill_env.py index 715d490fc..b80096ab8 100644 --- a/rlinf/envs/maniskill/maniskill_env.py +++ b/rlinf/envs/maniskill/maniskill_env.py @@ -111,15 +111,12 @@ def instruction(self): def _init_reset_state_ids(self): self._generator = torch.Generator() self._generator.manual_seed(self.seed) - self.all_reset_state_ids = torch.randperm( - self.total_num_group_envs, generator=self._generator - ).to(self.device) self.update_reset_state_ids() def update_reset_state_ids(self): reset_state_ids = torch.randint( low=0, - high=len(self.all_reset_state_ids), + high=self.total_num_group_envs, size=(self.num_group,), generator=self._generator, ) diff --git a/rlinf/envs/offload_wrapper/maniskill_wrapper.py b/rlinf/envs/offload_wrapper/maniskill_wrapper.py index 9317a6ea8..734cd6c50 100644 --- a/rlinf/envs/offload_wrapper/maniskill_wrapper.py +++ b/rlinf/envs/offload_wrapper/maniskill_wrapper.py @@ -65,7 +65,6 @@ def get_state(self) -> bytes: "action_space_state": action_space_state, "prev_step_reward": self.prev_step_reward.cpu(), "reset_state_ids": self.reset_state_ids.cpu(), - "all_reset_state_ids": self.all_reset_state_ids.cpu(), "generator_state": self._generator.get_state(), "is_start": self.is_start, "video_cnt": self.video_cnt, @@ -176,7 +175,6 @@ def load_state(self, state_buffer: bytes): # Restore simulator task state self.prev_step_reward = state["prev_step_reward"].to(self.device) self.reset_state_ids = state["reset_state_ids"].to(self.device) - self.all_reset_state_ids = state["all_reset_state_ids"].to(self.device) self._generator.set_state(state["generator_state"]) self.is_start = state["is_start"] From fca00cea6d6f7255e86fb2512686cb08f06846a8 Mon Sep 17 00:00:00 2001 From: XuS1994 <154795934+XuS1994@users.noreply.github.com> Date: Sun, 28 Sep 2025 22:00:59 +0800 Subject: [PATCH 05/57] fix(embodied): bug for single node test (#141) Signed-off-by: xusi --- rlinf/hybrid_engines/fsdp/fsdp_model_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rlinf/hybrid_engines/fsdp/fsdp_model_manager.py b/rlinf/hybrid_engines/fsdp/fsdp_model_manager.py index 5b2fba92d..9b333fcc5 100644 --- a/rlinf/hybrid_engines/fsdp/fsdp_model_manager.py +++ b/rlinf/hybrid_engines/fsdp/fsdp_model_manager.py @@ -101,7 +101,7 @@ def setup_model_and_optimizer(self): self.model = FSDP( module, param_init_fn=init_fn, - use_orig_params=True, + use_orig_params=False, auto_wrap_policy=auto_wrap_policy, device_id=int(os.environ["LOCAL_RANK"]), sharding_strategy=sharding_strategy, # zero3 From 35f6a4bf31f0def3b912e0ed624d479019fad4c1 Mon Sep 17 00:00:00 2001 From: Andy Lin 
<32576375+andylin-hao@users.noreply.github.com> Date: Mon, 29 Sep 2025 21:53:28 +0800 Subject: [PATCH 06/57] fix: openvla/oft dependency issues (#143) Signed-off-by: Hao Lin --- .pre-commit-config.yaml | 3 +-- docker/embodied/Dockerfile.openvla.hf.fsdp | 6 ++++++ docker/embodied/Dockerfile.openvlaoft.hf.fsdp | 6 ++++++ docker/embodied/Dockerfile.pi0.hf.fsdp | 3 +++ docs/source-en/rst_source/start/index.rst | 2 +- docs/source-en/rst_source/start/installation.rst | 9 ++++++--- docs/source-en/rst_source/start/llm.rst | 5 +++-- docs/source-en/rst_source/start/vla.rst | 7 ++++--- docs/source-zh/rst_source/start/installation.rst | 11 +++++++---- docs/source-zh/rst_source/start/llm.rst | 5 +++-- docs/source-zh/rst_source/start/vla.rst | 7 ++++--- pyproject.toml | 6 ++++++ requirements/README.md | 9 ++++++--- requirements/install_embodied_deps.sh | 3 +++ requirements/openvla.txt | 2 -- requirements/openvla_oft.txt | 9 +++++++++ rlinf/scheduler/worker/worker_group.py | 4 ++++ 17 files changed, 72 insertions(+), 25 deletions(-) create mode 100644 requirements/openvla_oft.txt diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b78ba74a6..9230a8f13 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,9 +3,8 @@ repos: rev: "v0.12.9" hooks: - id: ruff - args: ["--preview"] + args: ["--preview", "--fix"] - id: ruff-format - args: ["--check"] - repo: https://github.com/commit-check/commit-check rev: "v0.10.2" diff --git a/docker/embodied/Dockerfile.openvla.hf.fsdp b/docker/embodied/Dockerfile.openvla.hf.fsdp index b02a975f7..87fa200c7 100644 --- a/docker/embodied/Dockerfile.openvla.hf.fsdp +++ b/docker/embodied/Dockerfile.openvla.hf.fsdp @@ -85,6 +85,12 @@ RUN pip install \ RUN pip install -r /workspace/openvla/experiments/robot/libero/libero_requirements.txt +# OpenVLA overrides torch with v2.2, needs to be reset +RUN pip install torch==2.5.1 + +RUN python -m mani_skill.utils.download_asset bridge_v2_real2sim +RUN python -m mani_skill.utils.download_asset widowx250s + RUN echo "source /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc RUN echo "conda activate" >> ~/.bashrc diff --git a/docker/embodied/Dockerfile.openvlaoft.hf.fsdp b/docker/embodied/Dockerfile.openvlaoft.hf.fsdp index e5a24e24e..7c306b390 100644 --- a/docker/embodied/Dockerfile.openvlaoft.hf.fsdp +++ b/docker/embodied/Dockerfile.openvlaoft.hf.fsdp @@ -85,6 +85,12 @@ RUN pip install \ RUN pip install -r /workspace/openvla_oft/experiments/robot/libero/libero_requirements.txt +# OpenVLA overrides torch with v2.2, needs to be reset +RUN pip install torch==2.5.1 + +RUN python -m mani_skill.utils.download_asset bridge_v2_real2sim +RUN python -m mani_skill.utils.download_asset widowx250s + RUN echo "source /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc RUN echo "conda activate" >> ~/.bashrc diff --git a/docker/embodied/Dockerfile.pi0.hf.fsdp b/docker/embodied/Dockerfile.pi0.hf.fsdp index d85608a1e..240f6321f 100644 --- a/docker/embodied/Dockerfile.pi0.hf.fsdp +++ b/docker/embodied/Dockerfile.pi0.hf.fsdp @@ -85,6 +85,9 @@ RUN pip install \ -e /workspace/libero \ -e /workspace/lerobot +RUN python -m mani_skill.utils.download_asset bridge_v2_real2sim +RUN python -m mani_skill.utils.download_asset widowx250s + RUN echo "source /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc RUN echo "conda activate" >> ~/.bashrc diff --git a/docs/source-en/rst_source/start/index.rst b/docs/source-en/rst_source/start/index.rst index 3d338d783..19a97f017 100644 --- a/docs/source-en/rst_source/start/index.rst +++ 
b/docs/source-en/rst_source/start/index.rst @@ -13,7 +13,7 @@ We present three concise examples to demonstrate the framework's workflow and he - **Distributed training:** Multi-node training for mathematical tasks (see :doc:`distribute`). -- **Evaluation:** Assessing model performance on embodied intelligence (see :doc:`vlm-eval`) and assessing model performance on long-chain-of-thought mathematical reasoning (see :doc:`llm-eval`). +- **Evaluation:** Assessing model performance on embodied intelligence (see :doc:`vla-eval`) and assessing model performance on long-chain-of-thought mathematical reasoning (see :doc:`llm-eval`). .. toctree:: :hidden: diff --git a/docs/source-en/rst_source/start/installation.rst b/docs/source-en/rst_source/start/installation.rst index fce45b6fa..217c28a96 100644 --- a/docs/source-en/rst_source/start/installation.rst +++ b/docs/source-en/rst_source/start/installation.rst @@ -186,23 +186,26 @@ vLLM installation: .. _embodied-dependencies: -Additional Embodied Dependencies +Embodied Dependencies ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ For embodied experiments, first install the necessary system dependencies (currently only supported on Debian/Ubuntu via ``apt``): .. code-block:: shell - bash requirements/install_embodied_deps.sh uv sync --extra embodied + bash requirements/install_embodied_deps.sh # Must be run after the above command Then, depending on the experiment type, install the required packages for ``openvla``, ``openvla-oft`` and ``pi0``: .. code-block:: shell - # For OpenVLA/OpenVLA-oft experiments + # For OpenVLA experiments UV_TORCH_BACKEND=auto uv pip install -r requirements/openvla.txt --no-build-isolation + # For OpenVLA-oft experiment + UV_TORCH_BACKEND=auto uv pip install -r requirements/openvla_oft.txt --no-build-isolation + # For Pi0 experiments UV_TORCH_BACKEND=auto uv pip install -r requirements/pi0.txt --no-build-isolation diff --git a/docs/source-en/rst_source/start/llm.rst b/docs/source-en/rst_source/start/llm.rst index 795b8c61a..5cdde520f 100644 --- a/docs/source-en/rst_source/start/llm.rst +++ b/docs/source-en/rst_source/start/llm.rst @@ -53,14 +53,15 @@ we highly recommend updating the following configuration option in ``cluster.component_placement``. -You can dynamically set it to **1, 2, 4, or 8** depending on your available resources. +You can set it to **0-1**, **0-3** or **0-7** to use 2/4/8 GPUs depending on your available resources. +Refer to :doc:`../tutorials/user/yaml` for a more detailed explanation of the placement configuration. .. code-block:: yaml cluster: num_nodes: 1 component_placement: - actor,rollout: all + actor,rollout: 0 Finally, before running the script, you need to modify the corresponding configuration options in the YAML file according to the download paths of the model and dataset. Specifically, update: diff --git a/docs/source-en/rst_source/start/vla.rst b/docs/source-en/rst_source/start/vla.rst index 4f4646ae0..e79ed101d 100644 --- a/docs/source-en/rst_source/start/vla.rst +++ b/docs/source-en/rst_source/start/vla.rst @@ -34,20 +34,21 @@ the model is cited in `paper `_ **Step 2: Execute the provided launch script:** -For user convenience, our configuration file is set up to run with a single GPU by default. +For user convenience, our configuration file is set up to run with at least two GPUs by default. 
However, if you have multiple GPUs and wish to accelerate the quickstart process, we highly recommend updating the following configuration option in ``./examples/embodiment/config/maniskill_ppo_openvla_quickstart.yaml``: ``cluster.component_placement``. -You can dynamically set it to **1, 2, 4, or 8** depending on your available resources. +You can set it to **0-3** or **0-7** to use 4/8 GPUs depending on your available resources. +Refer to :doc:`../tutorials/user/yaml` for a more detailed explanation of the placement configuration. .. code-block:: yaml cluster: num_nodes: 1 component_placement: - actor,rollout: all + actor,rollout: 0-1 Finally, before running the script, you need to modify the corresponding configuration options in the YAML file according to the download paths of the model and dataset. Specifically, update: diff --git a/docs/source-zh/rst_source/start/installation.rst b/docs/source-zh/rst_source/start/installation.rst index 55ada5280..6f97ed8a9 100644 --- a/docs/source-zh/rst_source/start/installation.rst +++ b/docs/source-zh/rst_source/start/installation.rst @@ -121,10 +121,10 @@ RLinf 提供两种安装方式。我们 **推荐使用 Docker**,因为这可 这一步已经包括了 **FSDP + Huggingface** 的完整配置。 第二步,如果你的实验使用的是 **Megatron 和 SGLang/vLLM** 后端, -请参考 :ref:`Megatron 及 SGLang/vLLM 依赖 ` 安装相应依赖。 +请参考 :ref:`Megatron 和 SGLang/vLLM 依赖 ` 安装相应依赖。 第三步,如果你要运行具身智能相关实验(如 OpenVLA、OpenVLA-OFT、Pi0), -请参考 :ref:`具身智能依赖 ` 安装专用依赖项。 +请参考 :ref:`具身智能相关依赖 ` 安装专用依赖项。 .. _common-dependencies: @@ -188,15 +188,18 @@ vLLM 安装: .. code-block:: shell - bash requirements/install_embodied_deps.sh uv sync --extra embodied + bash requirements/install_embodied_deps.sh # Must be run after the above command 接着,根据具体实验类型安装对应的 Python 包: .. code-block:: shell - # OpenVLA / OpenVLA-OFT 实验所需依赖 + # OpenVLA 实验所需依赖 UV_TORCH_BACKEND=auto uv pip install -r requirements/openvla.txt --no-build-isolation + # OpenVLA-oft 实验所需依赖 + UV_TORCH_BACKEND=auto uv pip install -r requirements/openvla_oft.txt --no-build-isolation + # Pi0 实验所需依赖 UV_TORCH_BACKEND=auto uv pip install -r requirements/pi0.txt --no-build-isolation diff --git a/docs/source-zh/rst_source/start/llm.rst b/docs/source-zh/rst_source/start/llm.rst index 4435b15bf..37155e02e 100644 --- a/docs/source-zh/rst_source/start/llm.rst +++ b/docs/source-zh/rst_source/start/llm.rst @@ -50,14 +50,15 @@ 我们推荐你修改配置文件 ``./examples/math/config/qwen2.5-1.5b-single-gpu.yaml`` 中的参数 ``cluster.component_placement``。 -你可以根据资源情况将其动态设置为 **1, 2, 4 或 8**。 +你可以根据实际资源将该项设置为 **0-1**, **0-3** 或 **0-7**来使用 2/4/8 张 GPU。 +查看 :doc:`../tutorials/user/yaml` 以获取有关 Placement 配置的更详细说明。 .. code-block:: yaml cluster: num_nodes: 1 component_placement: - actor,rollout: all + actor,rollout: 0 在运行脚本之前,请根据你的模型和数据集下载路径, 在 YAML 配置文件中修改以下字段: diff --git a/docs/source-zh/rst_source/start/vla.rst b/docs/source-zh/rst_source/start/vla.rst index b92451c08..fa0dede6d 100644 --- a/docs/source-zh/rst_source/start/vla.rst +++ b/docs/source-zh/rst_source/start/vla.rst @@ -34,20 +34,21 @@ ManiSkill3 是一个基于 GPU 加速的机器人研究仿真平台, **步骤 2:运行官方提供的训练脚本** -为方便使用,我们提供的配置文件默认支持单卡训练。 +为方便使用,我们提供的配置文件需要至少双卡进行训练。 如果你有多张 GPU 并希望加快训练速度, 建议你修改配置文件 ``./examples/embodiment/config/maniskill_ppo_openvla_quickstart.yaml`` 中的参数 ``cluster.component_placement``。 -你可以根据实际资源设置为 **1、2、4 或 8**。 +你可以根据实际资源将该项设置为 **0-3** 或 **0-7**来使用 4/8 张 GPU。 +查看 :doc:`../tutorials/user/yaml` 以获取有关 Placement 配置的更详细说明。 .. 
code-block:: yaml cluster: num_nodes: 1 component_placement: - actor,rollout: all + actor,rollout: 0-1 运行脚本之前,请根据你下载的模型和数据集路径,修改 YAML 文件中的以下字段: diff --git a/pyproject.toml b/pyproject.toml index 7adf0a799..c3c35624e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,6 +44,7 @@ dependencies = [ "pybind11", "torch-memory-saver", "setuptools>=69.5.1,<75.9", + "ninja", # Logging "swanlab", @@ -81,6 +82,11 @@ conflicts = [ { extra = "embodied" }, ], ] +override-dependencies = [ + "torch==2.6.0", + "torchvision==0.21.0", + "torchaudio==2.6.0" +] [tool.ruff] line-length = 88 diff --git a/requirements/README.md b/requirements/README.md index c9be5abb4..9ccd5f391 100644 --- a/requirements/README.md +++ b/requirements/README.md @@ -50,14 +50,17 @@ uv sync --extra vllm ### Embodied Dependencies For embodied experiments, first install the necessary system dependencies (currently only Debian/Ubuntu `apt` package management is supported). ```shell -bash requirements/install_embodied_deps.sh uv sync --extra embodied +bash requirements/install_embodied_deps.sh # Must be run after the above command ``` -Next, depending on the experiment types, install the `openvla` or `pi0` dependencies. +Next, depending on the experiment types, install the `openvla`, `openvla_oft` or `pi0` dependencies. ```shell -# For OpenVLA/OpenVLA-oft experiments +# For OpenVLA experiments UV_TORCH_BACKEND=auto uv pip install -r requirements/openvla.txt --no-build-isolation +# For OpenVLA-oft experiment +UV_TORCH_BACKEND=auto uv pip install -r requirements/openvla_oft.txt --no-build-isolation + # For Pi0 experiment UV_TORCH_BACKEND=auto uv pip install -r requirements/pi0.txt --no-build-isolation ``` \ No newline at end of file diff --git a/requirements/install_embodied_deps.sh b/requirements/install_embodied_deps.sh index caeaf4477..79ca51a6d 100755 --- a/requirements/install_embodied_deps.sh +++ b/requirements/install_embodied_deps.sh @@ -20,4 +20,7 @@ apt-get install -y --no-install-recommends \ libxrender-dev \ libgomp1 \ +python -m mani_skill.utils.download_asset bridge_v2_real2sim +python -m mani_skill.utils.download_asset widowx250s + diff --git a/requirements/openvla.txt b/requirements/openvla.txt index 7fdcdce13..cb202fc27 100644 --- a/requirements/openvla.txt +++ b/requirements/openvla.txt @@ -1,6 +1,4 @@ openvla @ git+https://github.com/openvla/openvla.git -openvla_oft @ git+https://github.com/moojink/openvla-oft.git -# https://github.com/openvla/openvla/blob/main/experiments/robot/libero/libero_requirements.txt flash-attn==2.5.5 imageio[ffmpeg] robosuite==1.4.1 diff --git a/requirements/openvla_oft.txt b/requirements/openvla_oft.txt new file mode 100644 index 000000000..424c466c5 --- /dev/null +++ b/requirements/openvla_oft.txt @@ -0,0 +1,9 @@ +openvla_oft @ git+https://github.com/moojink/openvla-oft.git +# https://github.com/openvla/openvla/blob/main/experiments/robot/libero/libero_requirements.txt +flash-attn==2.5.5 +imageio[ffmpeg] +robosuite==1.4.1 +bddl +easydict +cloudpickle +gym \ No newline at end of file diff --git a/rlinf/scheduler/worker/worker_group.py b/rlinf/scheduler/worker/worker_group.py index 8dd483870..02e0cb4bb 100644 --- a/rlinf/scheduler/worker/worker_group.py +++ b/rlinf/scheduler/worker/worker_group.py @@ -154,6 +154,9 @@ def _create_workers(self): placements = self._placement_strategy.get_placement( self._cluster, self._isolate_gpu ) + master_addr = next( + self._cluster.get_node_ip(p.node_id) for p in placements if p.rank == 0 + ) self._world_size = len(placements) for placement in placements: 
worker_name = WorkerAddress.from_parent_name_rank( @@ -165,6 +168,7 @@ def _create_workers(self): env_vars = { "GROUP_NAME": self._worker_group_name, "WORKER_NAME": worker_name, + "MASTER_ADDR": master_addr, "WORLD_SIZE": str(self._world_size), "RANK": str(placement.rank), "NODE_RANK": str(placement.node_rank), From 547c8ceece349fc11dac22964ee1ab0576d76f4b Mon Sep 17 00:00:00 2001 From: Andy Lin <32576375+andylin-hao@users.noreply.github.com> Date: Tue, 30 Sep 2025 18:13:49 +0800 Subject: [PATCH 07/57] fix: default gloo group due to prismatic import & dep enhancement (#145) Signed-off-by: Hao Lin --- docker/embodied/Dockerfile.openvla.hf.fsdp | 4 ++-- docker/embodied/Dockerfile.openvlaoft.hf.fsdp | 4 ++-- docker/embodied/Dockerfile.pi0.hf.fsdp | 4 ++-- docs/source-en/rst_source/start/installation.rst | 16 ++-------------- docs/source-zh/rst_source/start/installation.rst | 16 ++-------------- pyproject.toml | 11 ++++------- requirements/README.md | 15 ++------------- requirements/install_embodied_deps.sh | 6 +++--- rlinf/hybrid_engines/fsdp/utils.py | 7 ++++++- rlinf/workers/rollout/sglang/__init__.py | 2 +- 10 files changed, 26 insertions(+), 59 deletions(-) diff --git a/docker/embodied/Dockerfile.openvla.hf.fsdp b/docker/embodied/Dockerfile.openvla.hf.fsdp index 87fa200c7..52c9caa84 100644 --- a/docker/embodied/Dockerfile.openvla.hf.fsdp +++ b/docker/embodied/Dockerfile.openvla.hf.fsdp @@ -88,8 +88,8 @@ RUN pip install -r /workspace/openvla/experiments/robot/libero/libero_requiremen # OpenVLA overrides torch with v2.2, needs to be reset RUN pip install torch==2.5.1 -RUN python -m mani_skill.utils.download_asset bridge_v2_real2sim -RUN python -m mani_skill.utils.download_asset widowx250s +RUN python -m mani_skill.utils.download_asset bridge_v2_real2sim -y +RUN python -m mani_skill.utils.download_asset widowx250s -y RUN echo "source /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc RUN echo "conda activate" >> ~/.bashrc diff --git a/docker/embodied/Dockerfile.openvlaoft.hf.fsdp b/docker/embodied/Dockerfile.openvlaoft.hf.fsdp index 7c306b390..61e955dda 100644 --- a/docker/embodied/Dockerfile.openvlaoft.hf.fsdp +++ b/docker/embodied/Dockerfile.openvlaoft.hf.fsdp @@ -88,8 +88,8 @@ RUN pip install -r /workspace/openvla_oft/experiments/robot/libero/libero_requir # OpenVLA overrides torch with v2.2, needs to be reset RUN pip install torch==2.5.1 -RUN python -m mani_skill.utils.download_asset bridge_v2_real2sim -RUN python -m mani_skill.utils.download_asset widowx250s +RUN python -m mani_skill.utils.download_asset bridge_v2_real2sim -y +RUN python -m mani_skill.utils.download_asset widowx250s -y RUN echo "source /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc RUN echo "conda activate" >> ~/.bashrc diff --git a/docker/embodied/Dockerfile.pi0.hf.fsdp b/docker/embodied/Dockerfile.pi0.hf.fsdp index 240f6321f..71ee57523 100644 --- a/docker/embodied/Dockerfile.pi0.hf.fsdp +++ b/docker/embodied/Dockerfile.pi0.hf.fsdp @@ -85,8 +85,8 @@ RUN pip install \ -e /workspace/libero \ -e /workspace/lerobot -RUN python -m mani_skill.utils.download_asset bridge_v2_real2sim -RUN python -m mani_skill.utils.download_asset widowx250s +RUN python -m mani_skill.utils.download_asset bridge_v2_real2sim -y +RUN python -m mani_skill.utils.download_asset widowx250s -y RUN echo "source /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc RUN echo "conda activate" >> ~/.bashrc diff --git a/docs/source-en/rst_source/start/installation.rst b/docs/source-en/rst_source/start/installation.rst index 217c28a96..ba4758150 100644 --- 
a/docs/source-en/rst_source/start/installation.rst +++ b/docs/source-en/rst_source/start/installation.rst @@ -162,9 +162,9 @@ Run the following commands to install Megatron, SGLang or vLLM, and their depend .. code-block:: shell - uv sync --extra sgl_vllm + uv sync --extra sglang-vllm mkdir -p /opt && git clone https://github.com/NVIDIA/Megatron-LM.git -b core_r0.13.0 /opt/Megatron-LM - APEX_CPP_EXT=1 APEX_CUDA_EXT=1 uv pip install -r requirements/megatron.txt --no-build-isolation + APEX_CPP_EXT=1 APEX_CUDA_EXT=1 NVCC_APPEND_FLAGS="--threads 24" APEX_PARALLEL_BUILD=24 uv pip install -r requirements/megatron.txt --no-build-isolation Before using Megatron, ensure its path is added to the ``PYTHONPATH`` environment variable: @@ -172,18 +172,6 @@ Before using Megatron, ensure its path is added to the ``PYTHONPATH`` environmen export PYTHONPATH=/opt/Megatron-LM:$PYTHONPATH -SGLang installation: - -.. code-block:: shell - - uv sync --extra sglang - -vLLM installation: - -.. code-block:: shell - - uv sync --extra vllm - .. _embodied-dependencies: Embodied Dependencies diff --git a/docs/source-zh/rst_source/start/installation.rst b/docs/source-zh/rst_source/start/installation.rst index 6f97ed8a9..9c03edc76 100644 --- a/docs/source-zh/rst_source/start/installation.rst +++ b/docs/source-zh/rst_source/start/installation.rst @@ -157,9 +157,9 @@ Megatron 和 SGLang/vLLM 依赖 .. code-block:: shell - uv sync --extra sgl_vllm + uv sync --extra sglang-vllm mkdir -p /opt && git clone https://github.com/NVIDIA/Megatron-LM.git -b core_r0.13.0 /opt/Megatron-LM - APEX_CPP_EXT=1 APEX_CUDA_EXT=1 uv pip install -r requirements/megatron.txt --no-build-isolation + APEX_CPP_EXT=1 APEX_CUDA_EXT=1 NVCC_APPEND_FLAGS="--threads 24" APEX_PARALLEL_BUILD=24 uv pip install -r requirements/megatron.txt --no-build-isolation 使用 Megatron 前,请将其路径加入 ``PYTHONPATH`` 环境变量: @@ -167,18 +167,6 @@ Megatron 和 SGLang/vLLM 依赖 export PYTHONPATH=/opt/Megatron-LM:$PYTHONPATH -SGLang 安装: - -.. code-block:: shell - - uv sync --extra sglang - -vLLM 安装: - -.. code-block:: shell - - uv sync --extra vllm - .. _embodied-dependencies: 具身智能相关依赖 diff --git a/pyproject.toml b/pyproject.toml index c3c35624e..b6a00cbcb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,12 +52,9 @@ dependencies = [ ] [project.optional-dependencies] -sglang = [ +sglang-vllm = [ "transformers==4.51.1", "sglang[all]==0.4.6.post5", -] -vllm = [ - "transformers==4.51.1", "vllm==0.8.5", ] embodied = [ @@ -77,15 +74,15 @@ embodied = [ prerelease = "allow" conflicts = [ [ - { extra = "sglang" }, - { extra = "vllm" }, + { extra = "sglang-vllm" }, { extra = "embodied" }, ], ] override-dependencies = [ "torch==2.6.0", "torchvision==0.21.0", - "torchaudio==2.6.0" + "torchaudio==2.6.0", + "xgrammar==0.1.19" ] [tool.ruff] diff --git a/requirements/README.md b/requirements/README.md index 9ccd5f391..531bf40bb 100644 --- a/requirements/README.md +++ b/requirements/README.md @@ -26,27 +26,16 @@ UV_TORCH_BACKEND=auto uv sync ### Megatron and SGLang/vLLM Dependencies Run the following to install Megatron, SGLang or vLLM and their dependencies. 
-Megatron installation: ```shell -uv sync --extra sgl_vllm +uv sync --extra sglang-vllm mkdir -p /opt && git clone https://github.com/NVIDIA/Megatron-LM.git -b core_r0.13.0 /opt/Megatron-LM -APEX_CPP_EXT=1 APEX_CUDA_EXT=1 uv pip install -r requirements/megatron.txt --no-build-isolation +APEX_CPP_EXT=1 APEX_CUDA_EXT=1 NVCC_APPEND_FLAGS="--threads 24" APEX_PARALLEL_BUILD=24 uv pip install -r requirements/megatron.txt --no-build-isolation ``` Before using Megatron, make sure it's path is added to the `PYTHONPATH` environment variables. ```shell export PYTHONPATH=/opt/Megatron-LM:$PYTHONPATH ``` -SGLang installation: -```shell -uv sync --extra sglang -``` - -vLLM installation: -```shell -uv sync --extra vllm -``` - ### Embodied Dependencies For embodied experiments, first install the necessary system dependencies (currently only Debian/Ubuntu `apt` package management is supported). ```shell diff --git a/requirements/install_embodied_deps.sh b/requirements/install_embodied_deps.sh index 79ca51a6d..bed7887ec 100755 --- a/requirements/install_embodied_deps.sh +++ b/requirements/install_embodied_deps.sh @@ -18,9 +18,9 @@ apt-get install -y --no-install-recommends \ libsm6 \ libxext6 \ libxrender-dev \ - libgomp1 \ + libgomp1 -python -m mani_skill.utils.download_asset bridge_v2_real2sim -python -m mani_skill.utils.download_asset widowx250s +python -m mani_skill.utils.download_asset bridge_v2_real2sim -y +python -m mani_skill.utils.download_asset widowx250s -y diff --git a/rlinf/hybrid_engines/fsdp/utils.py b/rlinf/hybrid_engines/fsdp/utils.py index 0e136bffb..2334b1fa7 100644 --- a/rlinf/hybrid_engines/fsdp/utils.py +++ b/rlinf/hybrid_engines/fsdp/utils.py @@ -30,7 +30,6 @@ import torch from accelerate import init_empty_weights -from prismatic.extern.hf.modeling_prismatic import PrismaticProjector from torch.distributed.fsdp.wrap import ( transformer_auto_wrap_policy, ) @@ -111,6 +110,12 @@ def get_fsdp_wrap_policy(module, config=None, is_lora=False): policies.append(vit_wrap_policy) # Prismatic projector policy for VLA models + # The prismatic package initializes a DistributedOverwatch by default, + # which initializes accelerate.PartialState, which in turn + # initializes a torch.distributed process group in gloo. + # This results in default group being gloo, which does not support CUDA tensors and allreduce average. 
+ from prismatic.extern.hf.modeling_prismatic import PrismaticProjector + prismatic_fsdp_wrapping_policy = functools.partial( _module_wrap_policy, module_classes={PrismaticProjector}, diff --git a/rlinf/workers/rollout/sglang/__init__.py b/rlinf/workers/rollout/sglang/__init__.py index 0e903531c..5e0fb219f 100644 --- a/rlinf/workers/rollout/sglang/__init__.py +++ b/rlinf/workers/rollout/sglang/__init__.py @@ -30,7 +30,7 @@ def get_version(pkg): sglang_version = None if package_version is None: - raise ValueError(f"vllm version {package_version} not supported") + raise ValueError(f"sglang version {package_version} not supported") elif package_version >= parse("0.4.4") and package_version < parse("0.4.6.post2"): sglang_version = package_version from rlinf.hybrid_engines.sglang.sglang_0_4_4 import io_struct From cae65b8f0a41407d93986dca2b44946031c22f64 Mon Sep 17 00:00:00 2001 From: cc <1716911340@qq.com> Date: Tue, 30 Sep 2025 22:54:37 +0800 Subject: [PATCH 08/57] feat(async_vllm): add async vllm worker and unified sync/async interfaces (#90) * feat(async_vllm): add async vllm worker and unified sync/async interfaces Signed-off-by: Bo Dai --- .github/workflows/math_e2e.yml | 5 + .../workflows/math_e2e_rollout_logprobs.yml | 6 + .../qwen2.5-1.5b-grpo-megatron-pipeline.yaml | 2 + .../config/qwen2.5-1.5b-grpo-megatron.yaml | 2 + .../math/config/qwen2.5-1.5b-single-gpu.yaml | 2 + .../config/qwen2.5-32b-grpo-megatron.yaml | 2 + .../math/config/qwen2.5-7b-grpo-megatron.yaml | 2 + rlinf/config.py | 2 + rlinf/data/io_struct.py | 96 ++++- .../vllm/vllm_0_8_5/vllm_engine.py | 166 --------- .../hybrid_engines/vllm/vllm_0_8_5/worker.py | 5 +- rlinf/workers/rollout/utils.py | 9 +- rlinf/workers/rollout/vllm/__init__.py | 4 +- rlinf/workers/rollout/vllm/vllm_worker.py | 338 ++++++++++++++---- ...1.5b-grpo-collocated-rollout-logprobs.yaml | 2 + .../sglang/qwen2.5-1.5b-grpo-collocated.yaml | 2 + ...5-1.5b-grpo-pipeline-rollout-logprobs.yaml | 2 + .../sglang/qwen2.5-1.5b-grpo-pipeline.yaml | 2 + ...1.5b-grpo-collocated-rollout-logprobs.yaml | 2 + .../vllm/qwen2.5-1.5b-grpo-collocated.yaml | 2 + ...5-1.5b-grpo-pipeline-rollout-logprobs.yaml | 268 ++++++++++++++ .../math/vllm/qwen2.5-1.5b-grpo-pipeline.yaml | 282 +++++++++++++++ tests/e2e_tests/math/vllm/run_pipeline.sh | 17 + 23 files changed, 971 insertions(+), 249 deletions(-) delete mode 100644 rlinf/hybrid_engines/vllm/vllm_0_8_5/vllm_engine.py create mode 100644 tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml create mode 100644 tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-pipeline.yaml create mode 100644 tests/e2e_tests/math/vllm/run_pipeline.sh diff --git a/.github/workflows/math_e2e.yml b/.github/workflows/math_e2e.yml index 921dc4076..3e31864b1 100644 --- a/.github/workflows/math_e2e.yml +++ b/.github/workflows/math_e2e.yml @@ -67,6 +67,11 @@ jobs: export REPO_PATH=$(pwd) bash tests/e2e_tests/math/sglang/run_pipeline.sh + - name: vLLM Pipeline mode + run: | + export REPO_PATH=$(pwd) + bash tests/e2e_tests/math/vllm/run_pipeline.sh + qwen-grpo-test-sglang044: runs-on: rlinf container: diff --git a/.github/workflows/math_e2e_rollout_logprobs.yml b/.github/workflows/math_e2e_rollout_logprobs.yml index fde67e297..914efb1d2 100644 --- a/.github/workflows/math_e2e_rollout_logprobs.yml +++ b/.github/workflows/math_e2e_rollout_logprobs.yml @@ -66,3 +66,9 @@ jobs: run: | export REPO_PATH=$(pwd) bash tests/e2e_tests/math/sglang/run_pipeline.sh qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml + + - name: vLLM Pipeline mode + run: | + 
export REPO_PATH=$(pwd) + bash tests/e2e_tests/math/vllm/run_pipeline.sh qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml + diff --git a/examples/math/config/qwen2.5-1.5b-grpo-megatron-pipeline.yaml b/examples/math/config/qwen2.5-1.5b-grpo-megatron-pipeline.yaml index a9ad232ef..9ba641d15 100644 --- a/examples/math/config/qwen2.5-1.5b-grpo-megatron-pipeline.yaml +++ b/examples/math/config/qwen2.5-1.5b-grpo-megatron-pipeline.yaml @@ -117,6 +117,8 @@ rollout: enable_chunked_prefill: True # enable vllm to use chunked_prefill. enable_prefix_caching: True # enable vllm to use prefix_caching. enable_flash_infer_sampler: True # if True, vllm will use flashinfer to do sampling. + max_num_batched_tokens: null # the maximum number of tokens to be batched together in vllm. If set to null, vllm will use its default value. + torch_profiler_dir: null # if not null, vllm will enable torch profiler and save the result to the specified directory. return_logprobs: ${not:${algorithm.recompute_logprobs}} diff --git a/examples/math/config/qwen2.5-1.5b-grpo-megatron.yaml b/examples/math/config/qwen2.5-1.5b-grpo-megatron.yaml index c63d1f7be..6f75dc38e 100644 --- a/examples/math/config/qwen2.5-1.5b-grpo-megatron.yaml +++ b/examples/math/config/qwen2.5-1.5b-grpo-megatron.yaml @@ -104,6 +104,8 @@ rollout: enable_chunked_prefill: True # enable vllm to use chunked_prefill. enable_prefix_caching: True # enable vllm to use prefix_caching. enable_flash_infer_sampler: True # if True, vllm will use flashinfer to do sampling. + max_num_batched_tokens: null # the maximum number of tokens to be batched together in vllm. If set to null, vllm will use its default value. + torch_profiler_dir: null # if not null, vllm will enable torch profiler and save the result to the specified directory. return_logprobs: ${not:${algorithm.recompute_logprobs}} diff --git a/examples/math/config/qwen2.5-1.5b-single-gpu.yaml b/examples/math/config/qwen2.5-1.5b-single-gpu.yaml index af2a94a13..a9d58f3a2 100644 --- a/examples/math/config/qwen2.5-1.5b-single-gpu.yaml +++ b/examples/math/config/qwen2.5-1.5b-single-gpu.yaml @@ -104,6 +104,8 @@ rollout: enable_chunked_prefill: True # enable vllm to use chunked_prefill. enable_prefix_caching: True # enable vllm to use prefix_caching. enable_flash_infer_sampler: True # if True, vllm will use flashinfer to do sampling. + max_num_batched_tokens: null # the maximum number of tokens to be batched together in vllm. If set to null, vllm will use its default value. + torch_profiler_dir: null # if not null, vllm will enable torch profiler and save the result to the specified directory. return_logprobs: ${not:${algorithm.recompute_logprobs}} diff --git a/examples/math/config/qwen2.5-32b-grpo-megatron.yaml b/examples/math/config/qwen2.5-32b-grpo-megatron.yaml index 4aa80cb4c..6e397dfda 100644 --- a/examples/math/config/qwen2.5-32b-grpo-megatron.yaml +++ b/examples/math/config/qwen2.5-32b-grpo-megatron.yaml @@ -104,6 +104,8 @@ rollout: enable_chunked_prefill: True # enable vllm to use chunked_prefill. enable_prefix_caching: True # enable vllm to use prefix_caching. enable_flash_infer_sampler: True # if True, vllm will use flashinfer to do sampling. + max_num_batched_tokens: null # the maximum number of tokens to be batched together in vllm. If set to null, vllm will use its default value. + torch_profiler_dir: null # if not null, vllm will enable torch profiler and save the result to the specified directory. 
     return_logprobs: ${not:${algorithm.recompute_logprobs}}
 
diff --git a/examples/math/config/qwen2.5-7b-grpo-megatron.yaml b/examples/math/config/qwen2.5-7b-grpo-megatron.yaml
index 29c4f1d5b..63146687e 100644
--- a/examples/math/config/qwen2.5-7b-grpo-megatron.yaml
+++ b/examples/math/config/qwen2.5-7b-grpo-megatron.yaml
@@ -104,6 +104,8 @@ rollout:
     enable_chunked_prefill: True # enable vllm to use chunked_prefill.
     enable_prefix_caching: True # enable vllm to use prefix_caching.
     enable_flash_infer_sampler: True # if True, vllm will use flashinfer to do sampling.
+    max_num_batched_tokens: null # the maximum number of tokens to be batched together in vllm. If set to null, vllm will use its default value.
+    torch_profiler_dir: null # if not null, vllm will enable torch profiler and save the result to the specified directory.
 
     return_logprobs: ${not:${algorithm.recompute_logprobs}}
 
diff --git a/rlinf/config.py b/rlinf/config.py
index 5700a2172..1934adeb4 100644
--- a/rlinf/config.py
+++ b/rlinf/config.py
@@ -164,6 +164,8 @@ def validate_vllm_cfg(cfg):
         cfg.enable_chunked_prefill = cfg.get("enable_chunked_prefill", True)
         cfg.enable_prefix_caching = cfg.get("enable_prefix_caching", True)
         cfg.enable_flash_infer_sampler = cfg.get("enable_flash_infer_sampler", True)
+        cfg.max_num_batched_tokens = cfg.get("max_num_batched_tokens", None)
+        cfg.torch_profiler_dir = cfg.get("torch_profiler_dir", None)
 
         return cfg
 
     with open_dict(cfg):
diff --git a/rlinf/data/io_struct.py b/rlinf/data/io_struct.py
index d34f3a32c..1c455fa70 100644
--- a/rlinf/data/io_struct.py
+++ b/rlinf/data/io_struct.py
@@ -56,6 +56,60 @@ class RolloutRequest:
     input_ids: List[List[int]]
     answers: List[str]
 
+    def repeat(self) -> "RolloutRequest":
+        """Repeat each input in the RolloutRequest ``n`` times.
+
+        The repeat count comes from the request's own ``n`` field, so this
+        method takes no arguments.
+
+        Returns:
+            RolloutRequest: A new RolloutRequest with repeated inputs.
+        """
+        assert self.n > 0, "n must be greater than 0"
+
+        input_ids, answers = zip(
+            *[
+                (input_id, answer)
+                for input_id, answer in zip(self.input_ids, self.answers)
+                for _ in range(self.n)
+            ]
+        )
+        return RolloutRequest(
+            n=self.n,
+            input_ids=list(input_ids),
+            answers=list(answers),
+        )
+
+    def split(self, num_splits: int) -> List["RolloutRequest"]:
+        """Split the RolloutRequest into multiple smaller requests.
+
+        Args:
+            num_splits (int): The number of splits to create.
+
+        Returns:
+            List[RolloutRequest]: A list of smaller RolloutRequest instances.
+ """ + assert num_splits > 0, "num_splits must be greater than 0" + assert len(self.input_ids) % num_splits == 0, ( + f"Input IDs length {len(self.input_ids)} is not divisible by num_splits {num_splits}" + ) + + input_ids_split_list = split_list(self.input_ids, num_splits) + answers_split_list = split_list(self.answers, num_splits) + + splitted_requests = [] + for input_ids_batch, answers_batch in zip( + input_ids_split_list, answers_split_list + ): + request = RolloutRequest( + n=self.n, + input_ids=input_ids_batch, + answers=answers_batch, + ) + splitted_requests.append(request) + + return splitted_requests + def repeat_and_split( self, rollout_batch_size: Optional[int] = None ) -> List["RolloutRequest"]: @@ -255,7 +309,7 @@ def _get_attention_masks_and_position_ids( def from_vllm_results( group_size: int, results: List[VllmRequestOutput], - answers: Optional[List[List[int]]] = None, + answers: Optional[List[str]] = None, return_logprobs: bool = False, ) -> "RolloutResult": def get_logprobs( @@ -270,7 +324,7 @@ def get_logprobs( logprobs.append(logprob[response_ids[i]].logprob) return logprobs - num_sequences = len(results) + num_sequences = len(results) * group_size prompt_lengths = [] prompt_ids = [] @@ -278,26 +332,42 @@ def get_logprobs( response_ids = [] logprobs = [] is_end = [] - for _, res in enumerate(results): - if res.prompt_token_ids is not None: - prompt_ids.append(res.prompt_token_ids) - prompt_lengths.append(len(res.prompt_token_ids)) + response_texts = [] + rollout_answers = ( + [answer for answer in answers for _ in range(group_size)] + if answers + else None + ) + for vllm_result in results: + if vllm_result.prompt_token_ids is not None: + prompt_ids.extend([vllm_result.prompt_token_ids] * group_size) + prompt_lengths.extend([len(vllm_result.prompt_token_ids)] * group_size) else: - return NotImplementedError("vllm should return tokenized prompt.") - response_id = list(res.outputs[0].token_ids) - response_ids.append(response_id) - response_lengths.append(len(response_id)) - is_end.append(res.finished) + raise NotImplementedError("vllm should return tokenized prompt.") + response_ids.extend( + [list(output.token_ids) for output in vllm_result.outputs] + ) + response_texts.extend([output.text for output in vllm_result.outputs]) + response_lengths.extend( + [len(output.token_ids) for output in vllm_result.outputs] + ) + is_end.extend([vllm_result.finished] * group_size) if return_logprobs: - logprobs.append(get_logprobs(response_id, res.outputs[0])) + logprobs.extend( + [ + get_logprobs(list(output.token_ids), output) + for output in vllm_result.outputs + ] + ) result: RolloutResult = RolloutResult( group_size=group_size, num_sequence=num_sequences, - answers=answers, + answers=rollout_answers, prompt_ids=prompt_ids, prompt_lengths=prompt_lengths, response_ids=response_ids, response_lengths=response_lengths, + response_texts=response_texts, is_end=is_end, ) if return_logprobs: diff --git a/rlinf/hybrid_engines/vllm/vllm_0_8_5/vllm_engine.py b/rlinf/hybrid_engines/vllm/vllm_0_8_5/vllm_engine.py deleted file mode 100644 index 4bf155a55..000000000 --- a/rlinf/hybrid_engines/vllm/vllm_0_8_5/vllm_engine.py +++ /dev/null @@ -1,166 +0,0 @@ -# Copyright 2025 The RLinf Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from functools import partial -from typing import List, Optional, Union - -from omegaconf import DictConfig -from vllm.config import VllmConfig -from vllm.inputs.data import TextPrompt, TokensPrompt -from vllm.outputs import RequestOutput -from vllm.pooling_params import PoolingParams -from vllm.sampling_params import SamplingParams -from vllm.utils import Counter -from vllm.v1.engine.llm_engine import LLMEngine as _LLMEngine - -from rlinf.scheduler.manager.worker_manager import WorkerAddress -from rlinf.utils.placement import ModelParallelComponentPlacement - - -class VLLMEngine: - def __init__( - self, - vllm_config: VllmConfig, - log_stats: bool, - dp_rank: int, - rlinf_config: DictConfig, - parent_address: WorkerAddress, - placement: ModelParallelComponentPlacement, - multiprocess_model: bool = False, - ): - # vllm_worker_cls = partial(VLLMWorker, rlinf_config=rlinf_config) - vllm_worker_cls = "rlinf.hybrid_engines.vllm.vllm_0_8_5.worker.VLLMWorker" - vllm_config.parallel_config.worker_cls = vllm_worker_cls - - from rlinf.hybrid_engines.vllm.vllm_0_8_5.executor import VLLMExecutor - - executor_factory = partial( - VLLMExecutor, - rlinf_config=rlinf_config, - parent_address=parent_address, - placement=placement, - dp_rank=dp_rank, - ) - - self._engine = _LLMEngine( - vllm_config=vllm_config, - executor_class=executor_factory, - log_stats=log_stats, - multiprocess_mode=multiprocess_model, - ) - self.request_counter = Counter() - - def generate( - self, - input_ids: Union[List[List[int]], List[int]], - sampling_params: Union[SamplingParams, PoolingParams], - prompt_texts: Optional[Union[List[str], str]] = None, - return_logprobs: bool = False, - ) -> List[RequestOutput]: - """ - Use the VLLM engine to generate text based on input token IDs or prompt text. - - Args: - input_ids: A list of lists of input token IDs, or a single list of input - token IDs. - sampling_params: Sampling parameters for generation. - prompt_text: Optional; A list of prompt strings or a single prompt string, - if provided, it will be used instead of input_ids. - return_logprobs: Whether to return log probabilities of the generated tokens. - - Returns: - A list of RequestOutput objects containing the results of the generation. - """ - sampling_params.logprobs = 0 if return_logprobs else None - self._add_requests( - input_ids=input_ids, - prompt_texts=prompt_texts, - sampling_params=sampling_params, - ) - results: List[RequestOutput] = self._run_engine() - return results - - def _add_requests( - self, - input_ids: Union[List[List[int]], List[int]], - sampling_params: Union[SamplingParams, PoolingParams], - prompt_texts: Optional[Union[List[str], str]] = None, - ) -> None: - """ - Add generation requests to the engine. - - Args: - input_ids: A list of lists of input token IDs, or a single list of input token IDs. - prompt_texts: Optional; A list of prompt strings or a single prompt string, if provided, - it will be used instead of input_ids. - sampling_params: Optional; Sampling parameters for generation. 
- """ - if prompt_texts is not None: - # if not None, we use prompt_text rather than input_ids - if isinstance(prompt_texts, str): - prompt_texts = [prompt_texts] - assert isinstance(prompt_texts, list), ( - f"Expected list for prompt_texts, got {type(prompt_texts)}" - ) - for prompt_text in prompt_texts: - request_id = str(next(self.request_counter)) - text_prompt = TextPrompt(prompt=prompt_text) - self._engine.add_request( - request_id=request_id, - prompt=text_prompt, - params=sampling_params, - ) - return - - assert isinstance(input_ids, list), ( - f"Expected list for input_ids, got {type(input_ids)}" - ) - if not isinstance(input_ids[0], list): - input_ids = [input_ids] - - for input_id in input_ids: - request_id = str(next(self.request_counter)) - tokens_prompt = TokensPrompt(prompt_token_ids=input_id) - self._engine.add_request( - request_id=request_id, - prompt=tokens_prompt, - params=sampling_params, - ) - - def _run_engine(self) -> List[RequestOutput]: - """ - Run the engine until all requests are finished. - - Returns: - A list of RequestOutput objects containing the results of the generation. - """ - outputs: List[RequestOutput] = [] - - while self._engine.has_unfinished_requests(): - step_outputs = self._engine.step() - for output in step_outputs: - if output.finished: - outputs.append(output) - return sorted(outputs, key=lambda x: int(x.request_id)) - - def offload_model_weights(self) -> None: - """ - Offload most graphic memory vllm used, including model's weights, buffers and kv cache. - """ - self._engine.collective_rpc("offload_model_weights") - - def sync_hf_weight(self) -> None: - """ - Sync model weights from actor to the vllm workers. - """ - self._engine.collective_rpc("sync_hf_weight") diff --git a/rlinf/hybrid_engines/vllm/vllm_0_8_5/worker.py b/rlinf/hybrid_engines/vllm/vllm_0_8_5/worker.py index fe82c7d9d..3e021e922 100644 --- a/rlinf/hybrid_engines/vllm/vllm_0_8_5/worker.py +++ b/rlinf/hybrid_engines/vllm/vllm_0_8_5/worker.py @@ -82,7 +82,10 @@ def sync_hf_weight(self) -> None: state_dict = self._rlinf_worker.recv( src_group_name=self._actor_group_name, src_rank=self.actor_weight_rank ) - super().wake_up() + if self.placement_mode == PlacementMode.COLLOCATED: + # in disaggregated mode, rollout backend will never offload weights + # so we don't need to wake up when placement is disaggregated + super().wake_up() model = self.model_runner.model if colocate: diff --git a/rlinf/workers/rollout/utils.py b/rlinf/workers/rollout/utils.py index 0e1c534ee..f3845ef2f 100644 --- a/rlinf/workers/rollout/utils.py +++ b/rlinf/workers/rollout/utils.py @@ -550,12 +550,11 @@ def get_rollout_backend_worker( if rollout_backend == "vllm": from rlinf.workers.rollout.vllm.vllm_worker import VLLMWorker - if placement.placement_mode == PlacementMode.COLLOCATED: + if ( + placement.placement_mode == PlacementMode.COLLOCATED + or placement.placement_mode == PlacementMode.DISAGGREGATED + ): return VLLMWorker - elif placement.placement_mode == PlacementMode.DISAGGREGATED: - raise NotImplementedError( - "vLLM rollout backend does not support the pipeline mode." 
- ) else: raise ValueError(f"Unsupported placement mode: {placement.placement_mode}") elif rollout_backend == "sglang": diff --git a/rlinf/workers/rollout/vllm/__init__.py b/rlinf/workers/rollout/vllm/__init__.py index 1a43de500..7237a1038 100644 --- a/rlinf/workers/rollout/vllm/__init__.py +++ b/rlinf/workers/rollout/vllm/__init__.py @@ -34,8 +34,8 @@ def get_version(pkg): "vllm package is not installed or its version could not be determined." ) elif package_version >= parse("0.8.5") and package_version < parse("0.9.0"): - from rlinf.hybrid_engines.vllm.vllm_0_8_5.vllm_engine import VLLMEngine + from rlinf.hybrid_engines.vllm.vllm_0_8_5.executor import VLLMExecutor else: raise ValueError(f"vllm version {package_version} not supported") -__all__ = ["VLLMEngine"] +__all__ = ["VLLMExecutor"] diff --git a/rlinf/workers/rollout/vllm/vllm_worker.py b/rlinf/workers/rollout/vllm/vllm_worker.py index 6bfb001fb..d5ecd11fc 100644 --- a/rlinf/workers/rollout/vllm/vllm_worker.py +++ b/rlinf/workers/rollout/vllm/vllm_worker.py @@ -12,22 +12,33 @@ # See the License for the specific language governing permissions and # limitations under the License. +import asyncio +import io import os -from typing import List +from functools import partial +from typing import AsyncGenerator, List, Optional, Union +import requests +import torch from omegaconf import DictConfig +from PIL.Image import Image from transformers import AutoTokenizer from vllm.config import VllmConfig from vllm.engine.arg_utils import EngineArgs +from vllm.inputs.data import PromptType, TextPrompt, TokensPrompt +from vllm.outputs import RequestOutput from vllm.sampling_params import RequestOutputKind, SamplingParams +from vllm.utils import Counter +from vllm.v1.engine.async_llm import AsyncLLM as AsyncLLMEngine from rlinf.config import torch_dtype_from_precision from rlinf.data.io_struct import RolloutRequest, RolloutResult from rlinf.scheduler import Channel, Worker from rlinf.utils.placement import ComponentPlacement from rlinf.workers.rollout.utils import print_vllm_outputs +from toolkits.math_verifier.verify import MathRewardModel, math_verify_call -from . import VLLMEngine +from . import VLLMExecutor class VLLMWorker(Worker): @@ -58,6 +69,8 @@ def __init__(self, config: DictConfig, placement: ComponentPlacement): "The capital of France is", "The future of AI is", ] + self._reward_model = MathRewardModel(self._cfg.reward.reward_scale) + self.request_counter = Counter() def _prepare_vllm_environment(self) -> None: """ @@ -70,9 +83,15 @@ def _prepare_vllm_environment(self) -> None: ) # use spawn to avoid fork issues with CUDA os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" - # set False to use Inproclient, which uses sync calls. - os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0" os.environ["VLLM_ATTENTION_BACKEND"] = self._cfg.rollout.vllm.attention_backend + # set True to use AsyncMPClient, which uses async calls. 
+        os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "1"
+        if self._cfg.rollout.vllm.torch_profiler_dir is not None:
+            os.environ["VLLM_TORCH_PROFILER_DIR"] = (
+                self._cfg.rollout.vllm.torch_profiler_dir
+            )
+            if not os.path.exists(self._cfg.rollout.vllm.torch_profiler_dir):
+                os.makedirs(self._cfg.rollout.vllm.torch_profiler_dir)
 
     def _get_sampling_params_from_config(self) -> SamplingParams:
         """
@@ -84,6 +103,8 @@
                 temperature=0,
                 max_tokens=cfg_sampling_params.max_new_tokens,
                 output_kind=RequestOutputKind.FINAL_ONLY,
+                n=self._cfg.algorithm.group_size,
+                logprobs=0 if self._return_logprobs else None,
             )
         else:
             sampling_params = SamplingParams(
@@ -93,39 +114,195 @@
                 repetition_penalty=cfg_sampling_params.repetition_penalty,
                 max_tokens=cfg_sampling_params.max_new_tokens,
                 output_kind=RequestOutputKind.FINAL_ONLY,
+                n=self._cfg.algorithm.group_size,
+                logprobs=0 if self._return_logprobs else None,
             )
         return sampling_params
 
-    def _validate_weight_at_first(self) -> None:
+    def _process_image_data(
+        self, image_data: Optional[List[Union[bytes, str]]]
+    ) -> Optional[List[Image]]:
+        """
+        Process image data items, each of which can be raw bytes or an image path.
+
+        Args:
+            image_data (Optional[List[Union[bytes, str]]]): A list of image
+                data items; each item can be bytes or an image path (local or URL).
+        Returns:
+            Optional[List[Image]]: A list of PIL Images. If input is None,
+            return None.
+        """
+        if image_data is None:
+            return None
+        if not isinstance(image_data, list):
+            raise ValueError("image_data should be a list of image data items.")
+        image_list = []
+        for img in image_data:
+            if isinstance(img, bytes):
+                image = Image.open(io.BytesIO(img))
+            elif isinstance(img, str):
+                if img.startswith("http://") or img.startswith("https://"):
+                    response = requests.get(img)
+                    image = Image.open(io.BytesIO(response.content))
+                else:
+                    image = Image.open(img)
+            else:
+                raise ValueError("Unsupported image data type.")
+            image_list.append(image)
+        return image_list
+
+    async def _validate_weight_at_first(self) -> None:
         """
         Validate the model weights before starting to rollout formally.
         """
         if self._cfg.rollout.detokenize:
-            outputs = self._vllm_engine.generate(
+            vllm_outputs = await self.generate(
                 input_ids=None,
                 sampling_params=self._validate_sampling_params,
-                return_logprobs=False,
                 prompt_texts=self._validate_prompts,
             )
         else:
             prompt_ids = self._tokenizer(self._validate_prompts).input_ids
-            outputs = self._vllm_engine.generate(
+            vllm_outputs = await self.generate(
                 input_ids=prompt_ids,
                 sampling_params=self._validate_sampling_params,
-                return_logprobs=False,
             )
-        print_vllm_outputs(outputs=outputs)
-        print("===============================", flush=True)
+        for request_output in vllm_outputs:
+            print_vllm_outputs(request_output, self._tokenizer)
+
+    async def offload_model_weights(self) -> None:
+        """
+        Use the async engine to offload model weights and KV cache.
+        """
+        await self._async_engine.reset_prefix_cache()
+        await self._async_engine.collective_rpc("offload_model_weights")
 
-    def sync_model_from_actor(self) -> None:
+    async def sync_model_from_actor(self) -> None:
         """
-        Use vllm_engine to sync model weights from the actor.
+        Sync model weights from actor to the vllm workers.
""" - self._vllm_engine.sync_hf_weight() + await self._async_engine.collective_rpc("sync_hf_weight") + await self._async_engine.reset_prefix_cache() - def init_worker(self) -> None: + async def _get_output_from_async_generator( + self, async_generator: AsyncGenerator[RequestOutput, None] + ) -> RequestOutput: """ - Use EngineArgs and VllmConfig to initialize the VLLM engine. + Helper function to get the final output from an async generator. + """ + output: RequestOutput = None + async for out in async_generator: + output = out + assert output is not None, "Async generator returned no output." + return output + + def _pre_process_rollout_request( + self, + request: RolloutRequest, + ) -> List[List[RolloutRequest]]: + if self._rollout_batch_size is not None: + # NOTE: + # it's different from sglang, here a request's sample count + # instead of sample count x group_size should be divisible by rollout_batch_size + assert len(request.input_ids) % self._rollout_batch_size == 0, ( + f"rollout_batch_size {self._rollout_batch_size} must divide the total number of requests {len(request.input_ids)}" + ) + num_batch = len(request.input_ids) // self._rollout_batch_size + else: + num_batch = 1 + + split_requests = request.split(num_batch) + if self._placement.is_disaggregated: + num_prompts_per_request = len(split_requests[0].input_ids) + return [r.split(num_prompts_per_request) for r in split_requests] + else: + return [r.split(1) for r in split_requests] + + async def generate( + self, + input_ids: Union[List[List[int]], List[int]], + sampling_params: SamplingParams, + prompt_texts: Optional[Union[List[str], str]] = None, + image_data: Optional[ + Union[List[List[Union[bytes, str]]], List[Union[bytes, str]]] + ] = None, + ) -> List[RequestOutput]: + def check_input_ids() -> List[List[int]]: + assert isinstance(input_ids, list), ( + "input_ids should be a list or list of list of int." + ) + assert len(input_ids) > 0, "input_ids should not be empty." + if isinstance(input_ids[0], int): + return [input_ids] + else: + return input_ids + + def check_prompt_text() -> Optional[List[str]]: + if prompt_texts is None: + return None + assert isinstance(prompt_texts, list) or isinstance(prompt_texts, str), ( + "prompt_text should be a string or list of strings." + ) + if isinstance(prompt_texts, str): + return [prompt_texts] + else: + assert len(prompt_texts) > 0, "prompt_text should not be empty." + return prompt_texts + + def check_image_data() -> Optional[List[List[Image]]]: + if image_data is None: + return None + assert isinstance(image_data, list), "image_data should be a list." 
+            if isinstance(image_data[0], list):
+                return image_data
+            else:
+                return [image_data]
+
+        input_ids = check_input_ids()
+        prompt_texts = check_prompt_text()
+        image_list = check_image_data()
+
+        inputs: List[PromptType] = []
+        outputs: List[RequestOutput] = []
+        if prompt_texts is not None:
+            for i, prompt_text in enumerate(prompt_texts):
+                if image_list is not None:
+                    # Process this prompt's images into PIL objects without
+                    # clobbering the outer image_list.
+                    images = self._process_image_data(image_data=image_list[i])
+                    inputs.append(
+                        TextPrompt(prompt=prompt_text, multi_modal_data=images)
+                    )
+                else:
+                    inputs.append(TextPrompt(prompt=prompt_text))
+        else:
+            for i, input_id in enumerate(input_ids):
+                if image_list is not None:
+                    images = self._process_image_data(image_data=image_list[i])
+                    inputs.append(
+                        TokensPrompt(
+                            prompt_token_ids=input_id, multi_modal_data=images
+                        )
+                    )
+                else:
+                    inputs.append(TokensPrompt(prompt_token_ids=input_id))
+
+        outputs = await asyncio.gather(
+            *[
+                self._get_output_from_async_generator(
+                    self._async_engine.generate(
+                        prompt=inp,
+                        sampling_params=sampling_params,
+                        request_id=str(next(self.request_counter)),
+                    )
+                )
+                for inp in inputs
+            ]
+        )
+
+        return outputs
+
+    async def init_worker(self) -> None:
+        """
+        Use EngineArgs and VllmConfig to initialize the VLLM async engine.
         Then offload the model weights, ready to use weights sent from actor.
         """
         engine_args: EngineArgs = EngineArgs(
@@ -136,28 +313,46 @@ def init_worker(self) -> None:
             enforce_eager=self._cfg.rollout.enforce_eager,
             enable_chunked_prefill=self._cfg.rollout.vllm.enable_chunked_prefill,
             enable_prefix_caching=self._cfg.rollout.vllm.enable_prefix_caching,
+            max_num_batched_tokens=self._cfg.rollout.vllm.max_num_batched_tokens,
             task="generate",
+            load_format="dummy" if not self._cfg.rollout.validate_weight else "auto",
             trust_remote_code=self._cfg.actor.tokenizer.trust_remote_code,
             max_model_len=self._cfg.runner.seq_length,
             max_num_seqs=self._cfg.rollout.max_running_requests,
             enable_sleep_mode=True,  # enables weight offloading
         )
         vllm_config: VllmConfig = engine_args.create_engine_config()
+
+        # set the custom worker class for the VLLM engine
+        vllm_worker_cls = "rlinf.hybrid_engines.vllm.vllm_0_8_5.worker.VLLMWorker"
+        vllm_config.parallel_config.worker_cls = vllm_worker_cls
+
         self.log_info(f"vllm_config is {vllm_config}")
-        self.log_info(f"[LLM dp {self._rank}] start to initialize VLLM engine")
-        self._vllm_engine = VLLMEngine(
+
+        executor_class = partial(
+            VLLMExecutor,
             rlinf_config=self._cfg,
-            vllm_config=vllm_config,
-            log_stats=not self._cfg.rollout.disable_log_stats,
-            multiprocess_model=False,  # use Inproclient
             parent_address=self.worker_address,
             placement=self._placement,
             dp_rank=self._rank,
         )
+
+        self._async_engine = AsyncLLMEngine(
+            vllm_config=vllm_config,
+            executor_class=executor_class,
+            log_stats=not self._cfg.rollout.disable_log_stats,
+            log_requests=False,  # do not need to log each request
+        )
+
         self.log_info(f"[LLM dp {self._rank}] VLLM engine initialized.")
-        self._vllm_engine.offload_model_weights()
 
-    def _stop(self) -> None:
+        if not self._placement.is_disaggregated:
+            await self.offload_model_weights()
+
+    async def _put_result(self, result: RolloutResult, output_channel: Channel) -> None:
+        await output_channel.put(result, async_op=True).async_wait()
+
+    async def _stop(self) -> None:
         """
         Helper function to stop the VLLM engine and offload model weights.
         This should only be called when vllm engine has no more requests to process.
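
Taken together, the vllm_worker.py hunks above replace the old synchronous `step()` loop with per-request coroutines: `AsyncLLM.generate` yields a stream of `RequestOutput` snapshots, `_get_output_from_async_generator` drains each stream down to its final item, and `asyncio.gather` runs one drain per prompt. Below is a minimal, self-contained sketch of that drain-and-gather pattern; the generator here is an illustrative stand-in, not vLLM's API.

```python
import asyncio
from typing import AsyncGenerator, List


async def fake_generate(prompt: str) -> AsyncGenerator[str, None]:
    # Stand-in for AsyncLLM.generate: yields partial snapshots, then the final output.
    for step in range(3):
        await asyncio.sleep(0)  # yield control, as the engine does between steps
        yield f"{prompt} ... step {step}"
    yield f"{prompt} -> finished"


async def drain(gen: AsyncGenerator[str, None]) -> str:
    # Mirrors _get_output_from_async_generator: keep only the last yielded item.
    output = None
    async for out in gen:
        output = out
    assert output is not None, "generator yielded no output"
    return output


async def main() -> None:
    prompts = ["p0", "p1", "p2"]
    # One concurrent drain per request, like the asyncio.gather call in generate().
    finals: List[str] = await asyncio.gather(
        *(drain(fake_generate(p)) for p in prompts)
    )
    print(finals)


if __name__ == "__main__":
    asyncio.run(main())
```

Because each request is drained in its own task, a slow request overlaps with faster ones instead of serializing the whole batch, which is what makes the disaggregated (pipeline) placement mode viable for the vLLM backend.
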
@@ -165,50 +360,69 @@ def _stop(self) -> None: self.log_debug( f"[LLM dp {self._rank}] Received None input tokens, rollout end." ) - self._vllm_engine.offload_model_weights() + if not self._placement.is_disaggregated: + await self.offload_model_weights() - def rollout(self, input_channel: Channel, output_channel: Channel) -> None: + async def _compute_reward_and_advantage(self, rollout_result: RolloutResult): """ - The main rollout function to interact with the VLLM engine. - It receives RolloutRequest from input_channel, and sends back RolloutResult - to output_channel. - - Args: - input_channel (Channel): The channel to receive RolloutRequest. - output_channel (Channel): The channel to send RolloutResult. + Compute rewards and advantages for the rollout result using math verification. """ - request: RolloutRequest = input_channel.get() + answers = rollout_result.answers + outputs = rollout_result.response_texts + num_sequence = rollout_result.num_sequence + assert len(answers) == len(outputs), ( + f"Answers length {len(answers)} != outputs length {len(outputs)}" + ) + assert len(answers) == num_sequence, ( + f"Answers length {len(answers)} != num_sequence {num_sequence}" + ) + + math_verify_results = math_verify_call(outputs, answers) + rewards = [ + (1 if r else -1) * self._reward_model.scale for r in math_verify_results + ] + rewards_tensor = torch.tensor(rewards, dtype=torch.float) + rollout_result.rewards = rewards_tensor.reshape(-1, 1) - requests: List[RolloutRequest] = request.repeat_and_split( - self._rollout_batch_size + mean = rewards_tensor.mean() + std = rewards_tensor.std(unbiased=False) + advantages = (rewards_tensor - mean) / (std + 1e-6) + rollout_result.advantages = advantages.tolist() + + async def rollout_and_return( + self, request: RolloutRequest, output_channel: Channel + ): + vllm_results: List[RequestOutput] = await self.generate( + input_ids=request.input_ids, sampling_params=self._sampling_params + ) + rollout_result: RolloutResult = RolloutResult.from_vllm_results( + group_size=self._cfg.algorithm.group_size, + results=vllm_results, + answers=request.answers, + return_logprobs=self._return_logprobs, ) + if self._placement.is_disaggregated: + await self._compute_reward_and_advantage(rollout_result) - # Acquire the GPUs to ensure no one is using them during rollout - output_channel.device_lock.acquire() + await self._put_result(result=rollout_result, output_channel=output_channel) - rollout_results: List[RolloutResult] = [] - for request in requests: - with self.worker_timer(): - vllm_results = self._vllm_engine.generate( - input_ids=request.input_ids, - sampling_params=self._sampling_params, - return_logprobs=self._return_logprobs, - ) - # should be converted by _vllm_engine side. 
- results = RolloutResult.from_vllm_results( - group_size=self._cfg.algorithm.group_size, - results=vllm_results, - answers=request.answers, - return_logprobs=self._return_logprobs, - ) - rollout_results.append(results) - if self._cfg.rollout.print_outputs: - print_vllm_outputs(outputs=vllm_results) - - # Stop and offload SGLang first before putting into channel - # This avoids running SGLang and Megatron simultaneously - self._stop() - # Release the GPUs once the engine has offloaded + async def rollout(self, input_channel: Channel, output_channel: Channel) -> None: + rollout_request: RolloutRequest = await input_channel.get( + async_op=True + ).async_wait() + output_channel.device_lock.acquire() + batched_requests = self._pre_process_rollout_request(rollout_request) + with self.worker_timer(): + for requests in batched_requests: + rollout_tasks: List[asyncio.Task] = [] + for request in requests: + rollout_tasks.append( + asyncio.create_task( + self.rollout_and_return( + request=request, output_channel=output_channel + ) + ) + ) + await asyncio.gather(*rollout_tasks) + await self._stop() output_channel.device_lock.release() - rollout_result = RolloutResult.merge_result_list(rollout_results) - output_channel.put(rollout_result) diff --git a/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml b/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml index 72744a4ed..3253f327b 100644 --- a/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml +++ b/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml @@ -104,6 +104,8 @@ rollout: enable_chunked_prefill: True # enable vllm to use chunked_prefill. enable_prefix_caching: True # enable vllm to use prefix_caching. enable_flash_infer_sampler: True # if True, vllm will use flashinfer to do sampling. + max_num_batched_tokens: null # the maximum number of tokens to be batched together in vllm. If set to null, vllm will use its default value. + torch_profiler_dir: null # if not null, vllm will enable torch profiler and save the result to the specified directory. return_logprobs: ${not:${algorithm.recompute_logprobs}} diff --git a/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-collocated.yaml b/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-collocated.yaml index 33835a003..a3d8ee225 100644 --- a/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-collocated.yaml +++ b/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-collocated.yaml @@ -104,6 +104,8 @@ rollout: enable_chunked_prefill: True # enable vllm to use chunked_prefill. enable_prefix_caching: True # enable vllm to use prefix_caching. enable_flash_infer_sampler: True # if True, vllm will use flashinfer to do sampling. + max_num_batched_tokens: null # the maximum number of tokens to be batched together in vllm. If set to null, vllm will use its default value. + torch_profiler_dir: null # if not null, vllm will enable torch profiler and save the result to the specified directory. return_logprobs: ${not:${algorithm.recompute_logprobs}} diff --git a/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml b/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml index 8054bc911..d5f57e660 100644 --- a/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml +++ b/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml @@ -107,6 +107,8 @@ rollout: enable_chunked_prefill: True # enable vllm to use chunked_prefill. 
enable_prefix_caching: True # enable vllm to use prefix_caching. enable_flash_infer_sampler: True # if True, vllm will use flashinfer to do sampling. + max_num_batched_tokens: null # the maximum number of tokens to be batched together in vllm. If set to null, vllm will use its default value. + torch_profiler_dir: null # if not null, vllm will enable torch profiler and save the result to the specified directory. return_logprobs: ${not:${algorithm.recompute_logprobs}} diff --git a/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-pipeline.yaml b/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-pipeline.yaml index 03d24a2ac..ad84fcec4 100644 --- a/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-pipeline.yaml +++ b/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-pipeline.yaml @@ -117,6 +117,8 @@ rollout: enable_chunked_prefill: True # enable vllm to use chunked_prefill. enable_prefix_caching: True # enable vllm to use prefix_caching. enable_flash_infer_sampler: True # if True, vllm will use flashinfer to do sampling. + max_num_batched_tokens: null # the maximum number of tokens to be batched together in vllm. If set to null, vllm will use its default value. + torch_profiler_dir: null # if not null, vllm will enable torch profiler and save the result to the specified directory. return_logprobs: ${not:${algorithm.recompute_logprobs}} diff --git a/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml b/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml index 07f30cc8c..6ba79956d 100644 --- a/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml +++ b/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml @@ -104,6 +104,8 @@ rollout: enable_chunked_prefill: True # enable vllm to use chunked_prefill. enable_prefix_caching: True # enable vllm to use prefix_caching. enable_flash_infer_sampler: True # if True, vllm will use flashinfer to do sampling. + max_num_batched_tokens: null # the maximum number of tokens to be batched together in vllm. If set to null, vllm will use its default value. + torch_profiler_dir: null # if not null, vllm will enable torch profiler and save the result to the specified directory. return_logprobs: ${not:${algorithm.recompute_logprobs}} diff --git a/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-collocated.yaml b/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-collocated.yaml index 03e024c2e..f06f46ce0 100644 --- a/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-collocated.yaml +++ b/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-collocated.yaml @@ -104,6 +104,8 @@ rollout: enable_chunked_prefill: True # enable vllm to use chunked_prefill. enable_prefix_caching: True # enable vllm to use prefix_caching. enable_flash_infer_sampler: True # if True, vllm will use flashinfer to do sampling. + max_num_batched_tokens: null # the maximum number of tokens to be batched together in vllm. If set to null, vllm will use its default value. + torch_profiler_dir: null # if not null, vllm will enable torch profiler and save the result to the specified directory. return_logprobs: ${not:${algorithm.recompute_logprobs}} diff --git a/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml b/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml new file mode 100644 index 000000000..496589988 --- /dev/null +++ b/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml @@ -0,0 +1,268 @@ +defaults: + - override hydra/job_logging: stdout + +hydra: + run: + dir: . 
+ output_subdir: null + +cluster: + num_nodes: 1 + num_gpus_per_node: 8 + component_placement: + rollout: 0-3 + actor: 4-7 + +runner: + task_type: math + logger: + log_path: /workspace/results/ + project_name: rlinf + experiment_name: "ci-test" + logger_backends: ["tensorboard"] # wandb, swanlab + + max_epochs: 1 + max_steps: 3 + + val_check_interval: 1 + save_interval: -1 + + seq_length: 1024 + + enable_dynamic_batch_size: False + max_tokens_per_mbs: 1024 + + resume_dir: null + experiment_name: grpo-1.5b + output_dir: /workspace/results + +algorithm: + group_size: 2 + + n_minibatches: 4 + training_batch_size_per_gpu: 1 # micro batch size + rollout_batch_size_per_gpu: null # If set to null, rollout_batch_size will be evenly divided across all inference instances. You can reduce this parameter if inference consumes too much GPU memory. + + # mbs to do log prob inference, can be set to + # lower than rollout_batch_size_per_gpu to reduce + # memory usage + logprob_forward_micro_batch_size: 1 # ${.rollout_batch_size_per_gpu} + + # val rollout mbs + val_rollout_batch_size_per_gpu: 4 # ${.rollout_batch_size_per_gpu} + + recompute_logprobs: False + shuffle_rollout: False + + max_num_gen_batches: 1 + + # GRPO loss params + loss_type: math_ppo_actor + loss_agg_func: "token-mean" + kl_beta: 0.0 # 0.001 + kl_penalty_type: low_var_kl + ratio_clip_eps: 0.2 + entropy_bonus: 0.0 + calculate_entropy: True + clip_ratio_c: null # 3.0 + + adv_type: grpo + normalize_advantages: False + early_stop_imp_ratio: 5.0 + use_valid_token_scale: True + + # params for rollout + sampling_params: + use_greedy: False + temperature: 1.0 + top_k: 1000000 + top_p: 1.0 + repetition_penalty: 1.0 + max_new_tokens: ${subtract:${runner.seq_length}, ${data.max_prompt_length}} + min_new_tokens: 1 + +rollout: + group_name: "RolloutGroup" + + gpu_memory_utilization: 0.55 + + model_dir: /workspace/dataset/DeepSeek-R1-Distill-Qwen-1.5B + model_arch: qwen2.5 + enforce_eager: False # if False, vllm will capture cuda graph, which will take more time to initialize. + distributed_executor_backend: mp # ray or mp + disable_log_stats: False + detokenize: False # Whether to detokenize the output. During RL we actually don't need to detokenize it. Can be set to True for debugging. + padding: null # will be tokenizer.pad_token_id if null. it is used to filter megatron's padding for vllm rollout + eos: null # will be tokenizer.eos_token_id if null. + + rollout_backend: vllm # [sglang, vllm] + + sglang: + attention_backend: triton # [flashinfer, triton] for more, see sglang's doc + decode_log_interval: 500000 # the interval for SGLang to log the decode time and other stats. + use_torch_compile: False # enable torch_compile in SGLang for rollout. + torch_compile_max_bs: 128 # the maximum batch size for torch compile. If the batch size is larger than this, torch compile will not be used. + + vllm: + attention_backend: FLASH_ATTN #[FLASH_ATTN,XFORMERS] for more, see vllm's doc + enable_chunked_prefill: True # enable vllm to use chunked_prefill. + enable_prefix_caching: True # enable vllm to use prefix_caching. + enable_flash_infer_sampler: True # if True, vllm will use flashinfer to do sampling. + max_num_batched_tokens: null # the maximum number of tokens to be batched together in vllm. If set to null, vllm will use its default value. + torch_profiler_dir: null # if not null, vllm will enable torch profiler and save the result to the specified directory. 
+ + return_logprobs: ${not:${algorithm.recompute_logprobs}} + + tensor_parallel_size: 1 + pipeline_parallel_size: 1 + + validate_weight: False # whether to send all weights at first for weight comparison. + validate_save_dir: null # the directory to save the weights for comparison. If validate_weight is True, this will be used to save the weights for comparison. + print_outputs: False # whether to print the outputs (token ids, texts, etc.) of inference engine. + + max_running_requests: 64 # the maximum number of running requests in the inference engine. + cuda_graph_max_bs: 128 # the maximum batch size for cuda graph. If the batch size is larger than this, cuda graph will not be used. + +data: + type: math + max_prompt_length: 256 + rollout_batch_size: 8 + val_rollout_batch_size: null + num_workers: 2 + prompt_key: prompt + shuffle: True + validation_shuffle: True + seed: 1 + train_data_paths: ["/workspace/dataset/boba_106k_0319_prompt_1024.jsonl"] + val_data_paths: ["/workspace/dataset/boba_106k_0319_prompt_1024.jsonl"] + +actor: + group_name: "ActorGroup" + training_backend: megatron + mcore_gpt: True + spec_name: decoder_gpt + + checkpoint_load_path: null + + offload_optimizer: True + offload_weight: True + offload_grad: True + + enable_dp_load_balance: False + + calculate_flops: True + + seed: 1234 + + model: + precision: fp16 + add_bias_linear: False + + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 + + activation: swiglu + sequence_parallel: True + # recompute_method: block + # recompute_granularity: selective + + recompute_method: block + recompute_granularity: full + recompute_num_layers: 20 + + seq_length: ${runner.seq_length} + encoder_seq_length: ${runner.seq_length} + + normalization: rmsnorm + + position_embedding_type: rope + + apply_rope_fusion: True + bias_dropout_fusion: False + persist_layer_norm: False + bias_activation_fusion: False + attention_softmax_in_fp32: True + batch_p2p_comm: False + variable_seq_lengths: True + gradient_accumulation_fusion: False + moe_token_dispatcher_type: alltoall + use_cpu_initialization: False + + optim: + optimizer: adam + bf16: False + fp16: True + lr: 2e-05 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_eps: 1.0e-05 + min_lr: 2.0e-6 + weight_decay: 0.05 + use_distributed_optimizer: True + overlap_grad_reduce: True + overlap_param_gather: True + optimizer_enable_pin: false + overlap_param_gather_with_optimizer_step: False + clip_grad: 0.8 + loss_scale: 65536 + lr_sched: + lr_warmup_fraction: 0.01 + lr_warmup_init: 0.0 + lr_warmup_iters: 0 + max_lr: 2.0e-5 + min_lr: 0.0 + lr_decay_style: constant + lr_decay_iters: 10 + + tokenizer: + tokenizer_model: /workspace/dataset/DeepSeek-R1-Distill-Qwen-1.5B + use_fast: False + trust_remote_code: True + padding_side: 'right' + + megatron: + ddp_bucket_size: null + distributed_backend: nccl # Support 'nccl' and 'gloo' + distributed_timeout_minutes: 30 + ckpt_format: torch + use_dist_ckpt: False + tp_comm_bootstrap_backend: nccl + tp_comm_overlap_cfg: null # tp_comm_overlap_cfg.yaml + use_hf_ckpt: False # if true, will transfer hf model to generate megatron checkpoint and use it for training. 
+  use_profiler: False # if true, will enable torch profiler during training; note that it impacts performance
+
+  ckpt_convertor: # config for ckpt convertor
+    model: DeepSeek-R1-Distill-Qwen-1.5B
+    model_type: null # will be set by hf model's config if null
+    hf_model_path: ${rollout.model_dir} # path to the hf model
+    save_path: ${runner.output_dir}/${runner.experiment_name}/converted_ckpts/actor
+    use_gpu_num: 0
+    use_gpu_index: null
+    process_num: 16 # number of processes to use for checkpointing
+    tensor_model_parallel_size: ${actor.model.tensor_model_parallel_size}
+    pipeline_model_parallel_size: ${actor.model.pipeline_model_parallel_size}
+
+  profiler: # profile megatron during inference and training
+    output_dir: ${runner.output_dir}/${runner.experiment_name}/profiler
+    activities: ["cpu", "cuda"]
+    record_shapes: False
+    profile_memory: False
+    with_stack: False
+    with_flops: False
+    with_modules: True
+    export_tensorboard: True
+    export_chrome_trace: False
+    chrome_filename_prefix: "chrome_trace"
+    schedule_warmup: 2
+    schedule_active: 1
+    schedule_repeat: 1 # the number of times inference and training will repeat
+    # schedule_wait: it will be set at runtime
+
+
+reward:
+  use_reward_model: false
+  reward_type: 'math'
+  reward_scale: 5.0
+
+critic:
+  use_critic_model: false
\ No newline at end of file
diff --git a/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-pipeline.yaml b/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-pipeline.yaml
new file mode 100644
index 000000000..2100f6535
--- /dev/null
+++ b/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-pipeline.yaml
@@ -0,0 +1,282 @@
+defaults:
+  - override hydra/job_logging: stdout
+
+hydra:
+  run:
+    dir: .
+  output_subdir: null
+
+cluster:
+  num_nodes: 1
+  num_gpus_per_node: 8
+  component_placement:
+    rollout: 0-3
+    inference: 4-5
+    actor: 6-7
+
+runner:
+  task_type: math
+  logger:
+    log_path: /workspace/results/
+    project_name: rlinf
+    experiment_name: "ci-test"
+    logger_backends: ["tensorboard"] # wandb, swanlab
+
+  max_epochs: 1
+  max_steps: 3
+
+  val_check_interval: 1
+  save_interval: -1
+
+  seq_length: 1024
+
+  enable_dynamic_batch_size: False
+  max_tokens_per_mbs: 1024
+
+  resume_dir: null
+  experiment_name: grpo-1.5b
+  output_dir: /workspace/results
+
+algorithm:
+  group_size: 2
+
+  n_minibatches: 4
+  training_batch_size_per_gpu: 1 # micro batch size
+  rollout_batch_size_per_gpu: null # If set to null, rollout_batch_size will be evenly divided across all inference instances. You can reduce this parameter if inference consumes too much GPU memory.
+ + # mbs to do log prob inference, can be set to + # lower than rollout_batch_size_per_gpu to reduce + # memory usage + logprob_forward_micro_batch_size: 1 # ${.rollout_batch_size_per_gpu} + + # val rollout mbs + val_rollout_batch_size_per_gpu: 4 # ${.rollout_batch_size_per_gpu} + + recompute_logprobs: True + shuffle_rollout: False + + max_num_gen_batches: 1 + + # GRPO loss params + loss_type: math_ppo_actor + loss_agg_func: "token-mean" + kl_beta: 0.0 # 0.001 + kl_penalty_type: low_var_kl + ratio_clip_eps: 0.2 + entropy_bonus: 0.0 + calculate_entropy: True + clip_ratio_c: null # 3.0 + + adv_type: grpo + normalize_advantages: False + early_stop_imp_ratio: 5.0 + use_valid_token_scale: True + + # params for rollout + sampling_params: + use_greedy: False + temperature: 1.0 + top_k: 1000000 + top_p: 1.0 + repetition_penalty: 1.0 + max_new_tokens: ${subtract:${runner.seq_length}, ${data.max_prompt_length}} + min_new_tokens: 1 + +inference: + model_arch: ${rollout.model_arch} + group_name: "InferenceGroup" + load_from_actor: True + model: + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 + sequence_parallel: True + +rollout: + group_name: "RolloutGroup" + + gpu_memory_utilization: 0.55 + + model_dir: /workspace/dataset/DeepSeek-R1-Distill-Qwen-1.5B + model_arch: qwen2.5 + enforce_eager: False # if False, rollout engine will capture cuda graph, which will take more time to initialize. + distributed_executor_backend: mp # ray or mp + disable_log_stats: False + detokenize: False # Whether to detokenize the output. During RL we actually don't need to detokenize it. Can be set to True for debugging. + padding: null # will be tokenizer.pad_token_id if null. it is used to filter megatron's padding for rollout engine + eos: null # will be tokenizer.eos_token_id if null. + + rollout_backend: vllm # [sglang, vllm] + + sglang: + attention_backend: triton # [flashinfer, triton] for more, see sglang's doc + decode_log_interval: 500000 # the interval for SGLang to log the decode time and other stats. + use_torch_compile: False # enable torch_compile in SGLang for rollout. + torch_compile_max_bs: 128 # the maximum batch size for torch compile. If the batch size is larger than this, torch compile will not be used. + + vllm: + attention_backend: FLASH_ATTN #[FLASH_ATTN,XFORMERS] for more, see vllm's doc + enable_chunked_prefill: True # enable vllm to use chunked_prefill. + enable_prefix_caching: True # enable vllm to use prefix_caching. + enable_flash_infer_sampler: True # if True, vllm will use flashinfer to do sampling. + max_num_batched_tokens: null # the maximum number of tokens to be batched together in vllm. If set to null, vllm will use its default value. + torch_profiler_dir: null # if not null, vllm will enable torch profiler and save the result to the specified directory. + + return_logprobs: ${not:${algorithm.recompute_logprobs}} + + tensor_parallel_size: 1 + pipeline_parallel_size: 1 + + validate_weight: False # whether to send all weights at first for weight comparison. + validate_save_dir: null # the directory to save the weights for comparison. If validate_weight is True, this will be used to save the weights for comparison. + print_outputs: False # whether to print the outputs (token ids, texts, etc.) of rollout engine. + + sglang_decode_log_interval: 500000 # the interval for SGLang to log the decode time and other stats. + max_running_requests: 64 # the maximum number of running requests in the rollout engine. + cuda_graph_max_bs: 128 # the maximum batch size for cuda graph. 
If the batch size is larger than this, cuda graph will not be used. + + use_torch_compile: False # enable torch_compile in SGLang for rollout. + torch_compile_max_bs: 128 # the maximum batch size for torch compile. If the batch size is larger than this, torch compile will not be used. + +data: + type: math + max_prompt_length: 256 + rollout_batch_size: 8 + val_rollout_batch_size: null + num_workers: 2 + prompt_key: prompt + shuffle: True + validation_shuffle: True + seed: 1 + train_data_paths: ["/workspace/dataset/boba_106k_0319_prompt_1024.jsonl"] + val_data_paths: ["/workspace/dataset/boba_106k_0319_prompt_1024.jsonl"] + +actor: + group_name: "ActorGroup" + training_backend: megatron + mcore_gpt: True + spec_name: decoder_gpt + + checkpoint_load_path: null + + offload_optimizer: True + offload_weight: True + offload_grad: True + + enable_dp_load_balance: False + + calculate_flops: True + + seed: 1234 + + model: + precision: fp16 + add_bias_linear: False + + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 + + activation: swiglu + sequence_parallel: True + # recompute_method: block + # recompute_granularity: selective + + recompute_method: block + recompute_granularity: full + recompute_num_layers: 20 + + seq_length: ${runner.seq_length} + encoder_seq_length: ${runner.seq_length} + + normalization: rmsnorm + + position_embedding_type: rope + + apply_rope_fusion: True + bias_dropout_fusion: False + persist_layer_norm: False + bias_activation_fusion: False + attention_softmax_in_fp32: True + batch_p2p_comm: False + variable_seq_lengths: True + gradient_accumulation_fusion: False + moe_token_dispatcher_type: alltoall + use_cpu_initialization: False + + optim: + optimizer: adam + bf16: False + fp16: True + lr: 2e-05 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_eps: 1.0e-05 + min_lr: 2.0e-6 + weight_decay: 0.05 + use_distributed_optimizer: True + overlap_grad_reduce: True + overlap_param_gather: True + optimizer_enable_pin: false + overlap_param_gather_with_optimizer_step: False + clip_grad: 0.8 + loss_scale: 65536 + lr_sched: + lr_warmup_fraction: 0.01 + lr_warmup_init: 0.0 + lr_warmup_iters: 0 + max_lr: 2.0e-5 + min_lr: 0.0 + lr_decay_style: constant + lr_decay_iters: 10 + + tokenizer: + tokenizer_model: /workspace/dataset/DeepSeek-R1-Distill-Qwen-1.5B + use_fast: False + trust_remote_code: True + padding_side: 'right' + + megatron: + ddp_bucket_size: null + distributed_backend: nccl # Support 'nccl' and 'gloo' + distributed_timeout_minutes: 30 + ckpt_format: torch + use_dist_ckpt: False + tp_comm_bootstrap_backend: nccl + tp_comm_overlap_cfg: null # tp_comm_overlap_cfg.yaml + use_hf_ckpt: False # if true, will transfer hf model to generate megatron checkpoint and use it for training. 
+    use_profiler: False # if true, enable the torch profiler during training; note that profiling affects performance
+
+    ckpt_convertor: # config for ckpt convertor
+      model: DeepSeek-R1-Distill-Qwen-1.5B
+      model_type: null # will be set by hf model's config if null
+      hf_model_path: ${rollout.model_dir} # path to the hf model
+      save_path: ${runner.output_dir}/${runner.experiment_name}/converted_ckpts/actor
+      use_gpu_num: 0
+      use_gpu_index: null
+      process_num: 16 # number of processes to use for checkpointing
+      tensor_model_parallel_size: ${actor.model.tensor_model_parallel_size}
+      pipeline_model_parallel_size: ${actor.model.pipeline_model_parallel_size}
+
+    profiler: # profile Megatron during inference and training
+      output_dir: ${runner.output_dir}/${runner.experiment_name}/profiler
+      activities: ["cpu", "cuda"]
+      record_shapes: False
+      profile_memory: False
+      with_stack: False
+      with_flops: False
+      with_modules: True
+      export_tensorboard: True
+      export_chrome_trace: False
+      chrome_filename_prefix: "chrome_trace"
+      schedule_warmup: 2
+      schedule_active: 1
+      schedule_repeat: 1 # inference and training will repeat this many times
+      # schedule_wait: it will be set at runtime
+
+
+reward:
+  use_reward_model: false
+  reward_type: 'math'
+  reward_scale: 5.0
+
+critic:
+  use_critic_model: false
\ No newline at end of file
diff --git a/tests/e2e_tests/math/vllm/run_pipeline.sh b/tests/e2e_tests/math/vllm/run_pipeline.sh
new file mode 100644
index 000000000..0a21368f6
--- /dev/null
+++ b/tests/e2e_tests/math/vllm/run_pipeline.sh
@@ -0,0 +1,17 @@
+#! /bin/bash
+set -x
+
+tabs 4
+export VLLM_ATTENTION_BACKEND=XFORMERS
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+export TOKENIZERS_PARALLELISM=false
+
+export PYTHONPATH=${REPO_PATH}:$PYTHONPATH
+
+if [ -z "$1" ]; then
+    CONFIG_NAME="qwen2.5-1.5b-grpo-pipeline"
+else
+    CONFIG_NAME=$1
+fi
+
+python ${REPO_PATH}/examples/math/main_math.py --config-path $REPO_PATH/tests/e2e_tests/math/vllm --config-name $CONFIG_NAME
\ No newline at end of file

From 1d8f5ffbe676386298241748ca3e56659f9130e7 Mon Sep 17 00:00:00 2001
From: LiuYiwei <71703069+secretsites@users.noreply.github.com>
Date: Thu, 9 Oct 2025 11:28:14 +0800
Subject: [PATCH 09/57] chore: fix README-EN and add README-CN (#152)

* chore: fix README-EN and add README-CN

Signed-off-by: LiuYiwei <1252642155@qq.com>
---
 README.md       |  45 ++++--
 README.zh-CN.md | 383 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 416 insertions(+), 12 deletions(-)
 create mode 100644 README.zh-CN.md

diff --git a/README.md b/README.md
index 5b673c23c..ac99ffec5 100644
--- a/README.md
+++ b/README.md
@@ -11,6 +11,13 @@
+ +[![English](https://img.shields.io/badge/lang-English-blue.svg)](README.md) +[![简体中文](https://img.shields.io/badge/语言-简体中文-red.svg)](README.zh-CN.md) + +
+

RLinf: Reinforcement Learning Infrastructure for Agentic AI

@@ -25,6 +32,7 @@ RLinf is a flexible and scalable open-source infrastructure designed for post-tr ## What's NEW! - [2025/09] [Example Gallery](https://rlinf.readthedocs.io/en/latest/rst_source/examples/index.html) is updated, users can find various off-the-shelf examples! - [2025/09] The paper [RLinf: Flexible and Efficient Large-scale Reinforcement Learning via Macro-to-Micro Flow Transformation](https://arxiv.org/abs/2509.15965) is released. +- [2025/09] The [report on RLinf by Machine Heart](https://mp.weixin.qq.com/s/Xtv4gDu3lhDDGadLrzt6Aw) is released. - [2025/08] RLinf is open-sourced. The formal v0.1 will be released soon. ## Key Features @@ -68,7 +76,7 @@ RLinf is a flexible and scalable open-source infrastructure designed for post-tr
[The bodies of hunks @@ -68,7 +76,7 @@ (above), @@ -120,10 +128,10 @@, @@ -166,9 +174,9 @@, and @@ -211,9 +219,9 @@ edit cells of the README results tables; their HTML table markup did not survive extraction, leaving only bare -/+ markers, so they are summarized by this placeholder.]
@@ -330,7 +338,20 @@ If you find **RLinf** helpful, please cite the paper:
 }
 ```
 
-If you use RL+VLA in RLinf, you can also cite our empirical study paper:
+If you use RL+VLA in RLinf, you can also cite our technical report and empirical study paper:
+
+```bibtex
+@misc{zang2025rlinfvlaunifiedefficientframework,
+  title={RLinf-VLA: A Unified and Efficient Framework for VLA+RL Training},
+  author={Hongzhi Zang and Mingjie Wei and Si Xu and Yongji Wu and Zhen Guo and Yuanqing Wang and Hao Lin and Liangzhi Shi and Yuqing Xie and Zhexuan Xu and Zhihao Liu and Kang Chen and Wenhao Tang and Quanlu Zhang and Weinan Zhang and Chao Yu and Yu Wang},
+  year={2025},
+  eprint={2510.06710},
+  archivePrefix={arXiv},
+  primaryClass={cs.RO},
+  url={https://arxiv.org/abs/2510.06710},
+}
+```
+
 ```bibtex
 @misc{liu2025rlbringvlageneralization,
   title={What Can RL Bring to VLA Generalization? An Empirical Study},
   author={Jijia Liu and Feng Gao and Bingwen Wei and Xinlei Chen and Qingmin Liao and Yi Wu and Chao Yu and Yu Wang},
   year={2025},
   eprint={2505.19789},
   archivePrefix={arXiv},
   primaryClass={cs.LG},
   url={https://arxiv.org/abs/2505.19789},
 }
 ```
diff --git a/README.zh-CN.md b/README.zh-CN.md
new file mode 100644
index 000000000..b0801c144
--- /dev/null
+++ b/README.zh-CN.md
@@ -0,0 +1,383 @@
+
+ RLinf-logo +
+ +
+ +Hugging Face + + +Ask DeepWiki + +
+ +
+ +[![English](https://img.shields.io/badge/lang-English-blue.svg)](README.md) +[![简体中文](https://img.shields.io/badge/语言-简体中文-red.svg)](README.zh-CN.md) + +
+ +

+ RLinf: 为Agentic AI而生的强化学习框架 +

+ +RLinf 是一个灵活且可扩展的开源框架,专为利用强化学习进行基础模型的后训练而设计。名称中的 “inf” 既代表 `Infrastructure`,强调其作为新一代训练坚实基础的作用;也代表 `Infinite`,寓意其支持开放式学习、持续泛化以及智能发展的无限可能。 + +
+ RLinf-overview +
+ + +## 最新动态 +- [2025/09] [示例库](https://rlinf.readthedocs.io/en/latest/rst_source/examples/index.html) 已更新,用户可以在其中找到多种可直接使用的示例! +- [2025/09] 我们的论文 [《RLinf: Flexible and Efficient Large-scale Reinforcement Learning via Macro-to-Micro Flow Transformation》](https://arxiv.org/abs/2509.15965)已正式发布。 +- [2025/09] 机器之心关于 RLinf 的报道[《首个为具身智能而生的大规模强化学习框架RLinf!清华、北京中关村学院、无问芯穹等重磅开源》](https://mp.weixin.qq.com/s/Xtv4gDu3lhDDGadLrzt6Aw)已经发布。 +- [2025/08] RLinf 已经开源,正式的 v0.1 版本即将发布。 + + +## ✨ 核心特性 + + +**RLinf 的独特之处在于:** +- 宏工作流到微执行流的映射机制(Macro-to-Micro Flow):一种全新的 M2Flow 范式,通过微观层次的执行流来驱动宏观层次的逻辑流,实现逻辑工作流构建(可编程)与物理通信和调度(高效性)的解耦。 + +- 灵活的执行模式 + + - 共享式(Collocated Mode):用户可以配置组件是否同时常驻于 GPU 内存,或通过卸载 / 重新加载机制交替使用 GPU。 + - 分离式(Disaggregated Mode):组件既可以顺序运行(可能导致 GPU 空闲),也可以以流水线方式执行,从而确保所有 GPU 都处于忙碌状态。 + - 混合式(Hybrid Mode):进一步扩展了灵活性,支持自定义组合不同的放置形式。典型案例是 Generator 和 GPU-based Simulator 执行分离式细粒度流水,二者与 Inference 和 Trainer 执行共享式。 + +- 自动调度策略: 根据训练任务自动选择最合适的执行模式,无需手动分配资源。 + +- 具身智能体支持 + - 主流 VLA 模型的快速自适应支持: [OpenVLA](https://github.com/openvla/openvla), [OpenVLA-OFT](https://github.com/moojink/openvla-oft), [π₀](https://github.com/Physical-Intelligence/openpi) 和 [π₀.₅](https://github.com/Physical-Intelligence/openpi). + - 支持主流基于 CPU 与 GPU 的模拟器(通过标准化 RL 接口): [ManiSkill3](https://github.com/haosulab/ManiSkill), [LIBERO](https://github.com/Lifelong-Robot-Learning/LIBERO). + - 首次实现对带有 flow-matching action expert 的 $\pi_0$ 和 $\pi_{0.5}$ 模型家族的 RL 微调。 + +**RLinf 的高效性体现在:** + +- 细粒度流水化的混合式模式: 相较于其他框架,实现了 120%+ 的吞吐量提升。 +- 秒级显卡自动扩缩: 可动态扩展训练资源,支持在数秒内完成 GPU 切换,在保持 RL 算法 on-policy 特性的同时,进一步提升 20–40% 的效率。 + +**RLinf 的灵活性与易用性体现在:** + +- 多后端集成 + + - FSDP + Hugging Face: 快速适配新模型与新算法,非常适合初学者和快速原型开发。 + - Megatron + SGLang: 针对大规模训练进行了优化,为专家用户提供最大化效率。 + +- 自适应通信: 通过异步通信通道实现高效交互。 + +- 内置支持主流 RL 方法: 包括 [PPO](https://arxiv.org/abs/1707.06347), [GRPO](https://arxiv.org/abs/2402.03300), [DAPO](https://arxiv.org/abs/2503.14476), [Reinforce++](https://arxiv.org/abs/2501.03262) 等。 + +## 主要成果 +### 具身智能 + +
+
OpenVLA-OFT model results on ManiSkill3OpenVLA and OpenVLA-OFT model results on ManiSkill3
Model
ModelSpatialGoalObjectLongHFSpatialHFGoalHFObjectHFLong Average
ModelAIME 24AIME 25GPQA-diamondAIME 24AIME 25GPQA-diamond Average
ModelAIME 24AIME 25GPQA-diamondAIME 24AIME 25GPQA-diamond Average
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+**OpenVLA-OFT 模型在 ManiSkill3 上的实验结果**
+
+| Model | Vision | Semantic | Position | Average |
+| --- | --- | --- | --- | --- |
+| rl4vla | 76.6% | 75.4% | 77.6% | 76.1% |
+| GRPO-OpenVLA-OFT | 84.6% | 51.6% | 42.9% | 61.5% |
+| PPO-OpenVLA-OFT | 80.5% | 56.6% | 56.1% | 64.5% |
+| PPO-OpenVLA | 82.0% | 80.6% | 89.3% | 82.2% |
+| GRPO-OpenVLA | 74.7% | 74.4% | 81.6% | 75.5% |
+
+**OpenVLA-OFT 模型在 LIBERO 上的实验结果**
+
+| Model | Spatial | Goal | Object | Long | Average |
+| --- | --- | --- | --- | --- | --- |
+| OpenVLA-OFT-SFT (one-shot) | 56.5% | 45.6% | 25.6% | 9.7% | 34.4% |
+| OpenVLA-OFT-RLinf | 99.0% | 99.0% | 99.0% | 94.4% | 97.9% |
+| Improvement | +42.5% | +53.4% | +73.4% | +84.7% | +63.5% |
+
+ +- RLinf 同时支持 PPO 与 GRPO 算法,为视觉-语言-动作(Vision-Language-Action, VLA)模型提供最先进的训练能力。 +- 该框架与主流具身智能基准测试(如 ManiSkill3 与 LIBERO)无缝集成,并在多样化的评测指标上均取得了优异表现。 + + +### 数学推理 + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+**1.5B model results**
+
+| Model | AIME 24 | AIME 25 | GPQA-diamond | Average |
+| --- | --- | --- | --- | --- |
+| DeepSeek-R1-Distill-Qwen-1.5B (base model) | 28.33 | 24.90 | 27.45 | 26.89 |
+| DeepMath-1.5B | 37.80 | 30.42 | 32.11 | 33.44 |
+| DeepScaleR-1.5B-Preview | 40.41 | 30.93 | 27.54 | 32.96 |
+| AReaL-1.5B-Preview-Stage-3 | 40.73 | 31.56 | 28.10 | 33.46 |
+| AReaL-1.5B-retrain* | 44.42 | 34.27 | 33.81 | 37.50 |
+| FastCuRL-1.5B-V3 | 43.65 | 32.49 | 35.00 | 37.05 |
+| RLinf-math-1.5B | 48.44 | 35.63 | 38.46 | 40.84 |
+
+ +\* 我们使用默认设置对模型进行了 600 步的重新训练。 + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+**7B model results**
+
+| Model | AIME 24 | AIME 25 | GPQA-diamond | Average |
+| --- | --- | --- | --- | --- |
+| DeepSeek-R1-Distill-Qwen-7B (base model) | 54.90 | 40.20 | 45.48 | 46.86 |
+| AReaL-boba-RL-7B | 61.66 | 49.38 | 46.93 | 52.66 |
+| Skywork-OR1-7B | 66.87 | 52.49 | 44.43 | 54.60 |
+| Polaris-7B-Preview | 68.55 | 51.24 | 43.88 | 54.56 |
+| AceMath-RL-Nemotron-7B | 67.30 | 55.00 | 45.57 | 55.96 |
+| RLinf-math-7B | 68.33 | 52.19 | 48.18 | 56.23 |
+
+ +- RLinf 在数学推理任务上实现了当前最先进的性能,在多个基准测试(AIME 24、AIME 25、GPQA-diamond)中,1.5B 与 7B 规模的模型均稳定超越现有方法。 + +## 路线图 + +### 1. 系统级增强 +- [ ] 支持异构 GPU + +- [ ] 支持异步流水线执行 + +- [ ] 支持专家混合(Mixture of Experts, MoE) + +- [ ] 支持 vLLM 推理后端 + +### 2. 应用级扩展 +- [ ] 支持视觉-语言模型(VLMs)训练 + +- [ ] 支持深度搜索智能体训练 + +- [ ] 支持多智能体训练 +- [ ] 支持更多具身模拟器的集成 (如 [Meta-World](https://github.com/Farama-Foundation/Metaworld), [GENESIS](https://github.com/Genesis-Embodied-AI/Genesis), [RoboTwin](https://github.com/RoboTwin-Platform/RoboTwin)) +- [ ] 支持更多VLA模型,比如 [GR00T](https://github.com/NVIDIA/Isaac-GR00T), [WALL-OSS](https://huggingface.co/x-square-robot/wall-oss-flow) +- [ ] 支持世界模型(World Model) + +- [ ] 支持真实世界的具身智能强化学习 + + +## 快速开始 + +完整的 RLinf 文档请见[**这里**](https://rlinf.readthedocs.io/en/latest/). + +**快速上手** + + - [安装指南](https://rlinf.readthedocs.io/en/latest/rst_source/start/installation.html) + - [快速上手 1:在 ManiSkill3 上进行 VLA 的 PPO 训练](https://rlinf.readthedocs.io/en/latest/rst_source/start/vla.html) + - [快速上手 2:在 MATH 上进行 LLM 的 GRPO 训练](https://rlinf.readthedocs.io/en/latest/rst_source/start/llm.html) + - [多节点训练](https://rlinf.readthedocs.io/en/latest/rst_source/start/distribute.html) + - [模型评估](https://rlinf.readthedocs.io/en/latest/rst_source/start/eval.html) + +**关键设计** + - [统一用户接口使用](https://rlinf.readthedocs.io/en/latest/rst_source/tutorials/user/index.html) + - [灵活的执行模式](https://rlinf.readthedocs.io/en/latest/rst_source/tutorials/mode/index.html) + - [自动调度支持](https://rlinf.readthedocs.io/en/latest/rst_source/tutorials/scheduler/index.html) + - [弹性通信](https://rlinf.readthedocs.io/en/latest/rst_source/tutorials/communication/index.html) + +**示例库** + + - [具身智能 VLA 模型训练](https://rlinf.readthedocs.io/en/latest/rst_source/examples/embodied.html) + - [数学推理模型训练](https://rlinf.readthedocs.io/en/latest/rst_source/examples/reasoning.html) + +**高级特性** + + - [Megatron-LM 的 5D 并行配置](https://rlinf.readthedocs.io/en/latest/rst_source/tutorials/advance/5D.html) + - [LoRA 集成以实现高效微调](https://rlinf.readthedocs.io/en/latest/rst_source/tutorials/advance/lora.html) + - [在不同版本的 SGLang 之间切换](https://rlinf.readthedocs.io/en/latest/rst_source/tutorials/advance/version.html) + - [检查点恢复与重启支持](https://rlinf.readthedocs.io/en/latest/rst_source/tutorials/advance/resume.html) + +**框架扩展** + + - [添加新环境](https://rlinf.readthedocs.io/en/latest/rst_source/tutorials/extend/new_env.html) + - [基于 FSDP+Hugging Face 后端添加新模型](https://rlinf.readthedocs.io/en/latest/rst_source/tutorials/extend/new_model_fsdp.html) + - [基于 Megatron+SGLang 后端添加新模型](https://rlinf.readthedocs.io/en/latest/rst_source/tutorials/extend/new_model_megatron.html) + +**博客** + + - [与 VeRL 的对比](https://rlinf.readthedocs.io/en/latest/rst_source/blog/compare_with_verl.html) + +## 构建状态 + +| Type | Status | +| :--------------: | :----: | +| 推理 RL-MATH | [![Build Status](https://github.com/RLinf/RLinf/actions/workflows/math_e2e.yml/badge.svg)](https://github.com/RLinf/RLinf/actions/workflows/math_e2e.yml) | +| 具身 RL-VLA | [![Build Status](https://github.com/RLinf/RLinf/actions/workflows/embodied_e2e.yml/badge.svg)](https://github.com/RLinf/RLinf/actions/workflows/embodied_e2e.yml) | + + +## 贡献指南 +我们欢迎对 RLinf 的贡献。在参与之前,请先阅读 [贡献指南](https://rlinf.readthedocs.io/en/latest/index.html#contribution-guidelines)。 + +## 引用与致谢 + +如果您觉得 **RLinf** 对您的研究或工作有所帮助,请引用以下论文: + +```bibtex +@misc{yu2025rlinfflexibleefficientlargescale, + title={RLinf: Flexible and Efficient Large-scale Reinforcement Learning via Macro-to-Micro Flow Transformation}, + author={Chao Yu and Yuanqing Wang and Zhen Guo and Hao Lin and Si Xu 
and Hongzhi Zang and Quanlu Zhang and Yongji Wu and Chunyang Zhu and Junhao Hu and Zixiao Huang and Mingjie Wei and Yuqing Xie and Ke Yang and Bo Dai and Zhexuan Xu and Xiangyuan Wang and Xu Fu and Zhihao Liu and Kang Chen and Weilin Liu and Gang Liu and Boxun Li and Jianlei Yang and Zhi Yang and Guohao Dai and Yu Wang}, + year={2025}, + eprint={2509.15965}, + archivePrefix={arXiv}, + primaryClass={cs.LG}, + url={https://arxiv.org/abs/2509.15965}, +} +``` + +如果你在 RLinf 中使用了 RL+VLA,欢迎引用我们的算法技术报告和实证研究论文: + +```bibtex +@misc{zang2025rlinfvlaunifiedefficientframework, + title={RLinf-VLA: A Unified and Efficient Framework for VLA+RL Training}, + author={Hongzhi Zang and Mingjie Wei and Si Xu and Yongji Wu and Zhen Guo and Yuanqing Wang and Hao Lin and Liangzhi Shi and Yuqing Xie and Zhexuan Xu and Zhihao Liu and Kang Chen and Wenhao Tang and Quanlu Zhang and Weinan Zhang and Chao Yu and Yu Wang}, + year={2025}, + eprint={2510.06710}, + archivePrefix={arXiv}, + primaryClass={cs.RO}, + url={https://arxiv.org/abs/2510.06710}, +} +``` + +```bibtex +@misc{liu2025rlbringvlageneralization, + title={What Can RL Bring to VLA Generalization? An Empirical Study}, + author={Jijia Liu and Feng Gao and Bingwen Wei and Xinlei Chen and Qingmin Liao and Yi Wu and Chao Yu and Yu Wang}, + year={2025}, + eprint={2505.19789}, + archivePrefix={arXiv}, + primaryClass={cs.LG}, + url={https://arxiv.org/abs/2505.19789}, +} +``` + +**致谢** +RLinf 的灵感来源并受益于更广泛开源社区的思想与工具。 +我们特别感谢 VeRL、AReaL、Megatron-LM、SGLang 和 PyTorch Fully Sharded Data Parallel (FSDP) 的团队与贡献者。 +如果我们不慎遗漏了您的项目或贡献,请提交 issue 或 pull request,以便我们能够给予您应有的致谢。 + +**联系方式:** +我们欢迎博士后、博士/硕士研究生以及实习生的加入。 +诚邀您共同塑造强化学习基础设施与具身智能的未来! +- Chao Yu: zoeyuchao@gmail.com +- Yu Wang: yu-wang@tsinghua.edu.cn \ No newline at end of file From 2e6850eacf674fddb50512c7ab06ec788b461f56 Mon Sep 17 00:00:00 2001 From: Andy Lin <32576375+andylin-hao@users.noreply.github.com> Date: Fri, 10 Oct 2025 13:10:03 +0800 Subject: [PATCH 10/57] fix: remove env channel timeout (#156) Signed-off-by: Hao Lin --- rlinf/envs/env_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rlinf/envs/env_manager.py b/rlinf/envs/env_manager.py index f1cc38ffa..dc34b1f63 100644 --- a/rlinf/envs/env_manager.py +++ b/rlinf/envs/env_manager.py @@ -213,7 +213,7 @@ def start_simulator(self): self.process.start() # Wait for initialization - result = self.result_queue.get(timeout=60) + result = self.result_queue.get() if result["status"] != "ready": raise RuntimeError(f"Simulator initialization failed: {result}") From 89e398e8112fa9ffd4b3127a4fa788acf1e3ceeb Mon Sep 17 00:00:00 2001 From: Hongzhi Zang Date: Fri, 10 Oct 2025 15:12:52 +0800 Subject: [PATCH 11/57] fix: optimal embodied yaml and doc update (#144) Signed-off-by: hongzhi --- .../rst_source/start/installation.rst | 6 +- .../rst_source/tutorials/user/yaml.rst | 15 +- .../rst_source/start/installation.rst | 2 + .../rst_source/tutorials/user/yaml.rst | 17 +- .../env/eval/maniskill_ood_template.yaml | 4 +- .../env/train/PutOnPlateInScene25Main.yaml | 2 +- .../config/maniskill_grpo_openvla.yaml | 26 +-- .../config/maniskill_grpo_openvlaoft.yaml | 37 ++-- .../config/maniskill_ppo_openvla.yaml | 23 ++- .../config/maniskill_ppo_openvla_eval.yaml | 160 ------------------ .../config/maniskill_ppo_openvlaoft.yaml | 28 +-- .../{eval_all.sh => eval_mani_ood.sh} | 44 +++-- tests/e2e_tests/embodied/ppo_openvla.yaml | 7 - 13 files changed, 100 insertions(+), 271 deletions(-) delete mode 100644 examples/embodiment/config/maniskill_ppo_openvla_eval.yaml 
rename examples/embodiment/{eval_all.sh => eval_mani_ood.sh} (51%) diff --git a/docs/source-en/rst_source/start/installation.rst b/docs/source-en/rst_source/start/installation.rst index ba4758150..306e8956c 100644 --- a/docs/source-en/rst_source/start/installation.rst +++ b/docs/source-en/rst_source/start/installation.rst @@ -113,12 +113,12 @@ Inside the container, clone the RLinf repository: .. tip:: - For multi-node training, make sure to clone the repository in shared storage so that every node has access to it. - - + - For multi-node training, make sure to clone the repository in shared storage so that every node has access to it. + - To use ManiSkill settings, refer to the README at ``https://huggingface.co/datasets/RLinf/maniskill_assets`` for instructions on downloading the required files. Install from Custom Environment ------------------------------- +**If you have already used the Docker image, you can skip the following steps.** Installation is divided into three parts depending on the type of experiments you plan to run. diff --git a/docs/source-en/rst_source/tutorials/user/yaml.rst b/docs/source-en/rst_source/tutorials/user/yaml.rst index f70366ed7..bac3f47be 100644 --- a/docs/source-en/rst_source/tutorials/user/yaml.rst +++ b/docs/source-en/rst_source/tutorials/user/yaml.rst @@ -750,7 +750,6 @@ algorithm n_chunk_steps: 10 n_eval_chunk_steps: 10 - rollout_micro_batch_size: 256 num_group_envs: 32 rollout_epoch: 1 @@ -766,7 +765,7 @@ algorithm ``algorithm.auto_reset``: Automatically reset environments when episodes terminate. -``algorithm.ignore_terminations``: Ignore episode terminations during training. +``algorithm.ignore_terminations``: Ignore episode terminations during training (if enabled, episode only ends when it reaches the ``max_episode_steps``). ``algorithm.use_fixed_reset_state_ids``: Use fixed reset state IDs (false for randomization). Always True for GRPO, default be False for PPO. @@ -774,13 +773,9 @@ algorithm ``algorithm.normalize_advantages``: Normalize advantages across the batch. -``algorithm.kl_penalty``: KL divergence estimation method (kl or kl_penalty). +``algorithm.n_chunk_steps``: Number of chunks (i.e., times the model is called to predict action chunks) within one rollout epoch. -``algorithm.n_chunk_steps``: Number of action steps per chunk. - -``algorithm.n_eval_chunk_steps``: Number of action steps per evaluation chunk. - -``algorithm.rollout_micro_batch_size``: Micro-batch size for rollout generation. +``algorithm.n_eval_chunk_steps``: Number of chunks in evaluation. ``algorithm.num_group_envs``: Number of environment groups. @@ -850,10 +845,6 @@ rollout ``rollout.backend``: Model backend (huggingface, vllm). -``rollout.enforce_eager``: Disable CUDA graph capture for faster initialization. - -``rollout.enable_offload``: Enable model offloading to reduce memory usage. - ``rollout.pipeline_stage_num``: Number of pipeline stages for model parallelism. 
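[Putting the sizing fields above together, a back-of-envelope for how much data one rollout epoch produces. This is a sketch under stated assumptions: in the GRPO examples each reset state is replicated `group_size` times, the PPO configs use `group_size: 1`, and `num_action_chunks` comes from the `actor.model` section:]

```python
num_group_envs = 32    # algorithm.num_group_envs
group_size = 8         # algorithm.group_size
n_chunk_steps = 10     # model calls (action chunks) per rollout epoch
num_action_chunks = 1  # low-level actions predicted per model call

parallel_envs = num_group_envs * group_size                    # 256 environments
env_steps = parallel_envs * n_chunk_steps * num_action_chunks  # 2560 env steps per epoch
print(parallel_envs, env_steps)
```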
actor diff --git a/docs/source-zh/rst_source/start/installation.rst b/docs/source-zh/rst_source/start/installation.rst index 9c03edc76..730e1ab0b 100644 --- a/docs/source-zh/rst_source/start/installation.rst +++ b/docs/source-zh/rst_source/start/installation.rst @@ -114,6 +114,7 @@ RLinf 提供两种安装方式。我们 **推荐使用 Docker**,因为这可 自定义环境安装 ------------------------------- +**如果你已经使用了 Docker 镜像,下面步骤可跳过。** 根据你的实验类型,安装分为三步进行: @@ -122,6 +123,7 @@ RLinf 提供两种安装方式。我们 **推荐使用 Docker**,因为这可 第二步,如果你的实验使用的是 **Megatron 和 SGLang/vLLM** 后端, 请参考 :ref:`Megatron 和 SGLang/vLLM 依赖 ` 安装相应依赖。 +(具身智能实验此步可忽略) 第三步,如果你要运行具身智能相关实验(如 OpenVLA、OpenVLA-OFT、Pi0), 请参考 :ref:`具身智能相关依赖 ` 安装专用依赖项。 diff --git a/docs/source-zh/rst_source/tutorials/user/yaml.rst b/docs/source-zh/rst_source/tutorials/user/yaml.rst index 82a7d61c4..e5c00f776 100644 --- a/docs/source-zh/rst_source/tutorials/user/yaml.rst +++ b/docs/source-zh/rst_source/tutorials/user/yaml.rst @@ -695,7 +695,6 @@ algorithm n_chunk_steps: 10 n_eval_chunk_steps: 10 - rollout_micro_batch_size: 256 num_group_envs: 32 rollout_epoch: 1 @@ -710,21 +709,17 @@ algorithm ``algorithm.auto_reset``:是否在 episode 结束时自动重置环境。 -``algorithm.ignore_terminations``:训练时是否忽略 episode 的终止信号。 +``algorithm.ignore_terminations``:训练时是否忽略 episode 的终止信号(若开启,episode 仅在达到最大步数时结束)。 ``algorithm.use_fixed_reset_state_ids``:是否使用固定 reset 状态 ID(GRPO 推荐 True,PPO 默认为 False,旨在随机化)。 -``algorithm.require_values``:是否需要同时计算值函数。 +``algorithm.require_values``:是否需要同时计算价值函数。 ``algorithm.normalize_advantages``:是否对优势值归一化处理。 -``algorithm.kl_penalty``:KL 散度的估算方式(kl 或 kl_penalty)。 +``algorithm.n_chunk_steps``:每个 rollout epoch 中的 chunk 数量(调用模型 predict 的次数)。 -``algorithm.n_chunk_steps``:每个 chunk 的动作步数。 - -``algorithm.n_eval_chunk_steps``:评估模式下每个 chunk 的动作步数。 - -``algorithm.rollout_micro_batch_size``:Rollout 生成时的微批大小。 +``algorithm.n_eval_chunk_steps``:评估模式下的 chunk 数量。 ``algorithm.num_group_envs``:环境组数量(用于并行)。 @@ -793,10 +788,6 @@ rollout ``rollout.backend``:模型后端(huggingface、vllm)。 -``rollout.enforce_eager``:禁用 CUDA graph,以更快完成初始化。 - -``rollout.enable_offload``:启用模型下放以降低内存占用。 - ``rollout.pipeline_stage_num``:模型并行的流水线阶段数。 actor diff --git a/examples/embodiment/config/env/eval/maniskill_ood_template.yaml b/examples/embodiment/config/env/eval/maniskill_ood_template.yaml index d224576fb..42e1279a2 100644 --- a/examples/embodiment/config/env/eval/maniskill_ood_template.yaml +++ b/examples/embodiment/config/env/eval/maniskill_ood_template.yaml @@ -16,7 +16,7 @@ video_cfg: video_base_dir: ${runner.logger.log_path}/video/eval init_params: - id: null + id: "PutOnPlateInScene25Main-v3" num_envs: ${env.eval.num_envs} obs_mode: ${env.train.init_params.obs_mode} control_mode: None @@ -25,4 +25,4 @@ init_params: max_episode_steps: ${env.train.init_params.max_episode_steps} sensor_configs: ${env.train.init_params.sensor_configs} render_mode: sensors - obj_set: test \ No newline at end of file + obj_set: train \ No newline at end of file diff --git a/examples/embodiment/config/env/train/PutOnPlateInScene25Main.yaml b/examples/embodiment/config/env/train/PutOnPlateInScene25Main.yaml index 39083cefb..45fc9a424 100644 --- a/examples/embodiment/config/env/train/PutOnPlateInScene25Main.yaml +++ b/examples/embodiment/config/env/train/PutOnPlateInScene25Main.yaml @@ -10,7 +10,7 @@ only_eval: False max_episode_steps: 80 video_cfg: - save_video: True + save_video: False info_on_video: True video_base_dir: ${runner.logger.log_path}/video/train diff --git a/examples/embodiment/config/maniskill_grpo_openvla.yaml 
b/examples/embodiment/config/maniskill_grpo_openvla.yaml index f947e658a..7c77d581c 100644 --- a/examples/embodiment/config/maniskill_grpo_openvla.yaml +++ b/examples/embodiment/config/maniskill_grpo_openvla.yaml @@ -1,6 +1,6 @@ defaults: - - env/train: PutCarrotOnPlateInScene - - env/eval: PutCarrotOnPlateInScene + - env/train: PutOnPlateInScene25Main + - env/eval: PutOnPlateInScene25Main - override hydra/job_logging: stdout hydra: @@ -12,10 +12,11 @@ hydra: cluster: num_nodes: 1 + num_gpus_per_node: 8 component_placement: - actor: 0-3 - env: 4-5 - rollout: 6-7 + actor: 0-7 + env: 0-3 + rollout: 4-7 runner: task_type: embodied @@ -38,15 +39,18 @@ algorithm: auto_reset: False ignore_terminations: False use_fixed_reset_state_ids: True + require_values: False - shuffle_samples: True normalize_advantages: True kl_penalty: kl # how to estimate kl divergence: kl or kl_penalty group_size: 8 - n_chunk_steps: 80 - n_eval_steps: 80 - num_group_envs: 16 + + num_group_envs: 32 rollout_epoch: 1 + + n_chunk_steps: 80 + n_eval_chunk_steps: 80 + reward_type: step_level logprob_type: token_level entropy_type: token_level @@ -65,6 +69,7 @@ algorithm: gamma: 0.99 gae_lambda: 0.95 + # params for generation sampling_params: use_greedy: False temperature_train: 1.0 @@ -86,7 +91,7 @@ env: name: "env_buffer_list" queue_name: "obs_buffer" queue_size: 0 - enable_offload: True + enable_offload: False rollout: group_name: "RolloutGroup" @@ -123,7 +128,6 @@ actor: model: model_name: "openvla" - value_type: ${algorithm.reward_type} # 'action' or 'token' action_dim: 7 num_action_chunks: 1 use_proprio: False diff --git a/examples/embodiment/config/maniskill_grpo_openvlaoft.yaml b/examples/embodiment/config/maniskill_grpo_openvlaoft.yaml index 2e2a6cc1f..b15116e4c 100644 --- a/examples/embodiment/config/maniskill_grpo_openvlaoft.yaml +++ b/examples/embodiment/config/maniskill_grpo_openvlaoft.yaml @@ -1,6 +1,6 @@ defaults: - - env/train: libero_10_grpo - - env/eval: libero_10 + - env/train: PutOnPlateInScene25Main + - env/eval: maniskill_ood_template - override hydra/job_logging: stdout hydra: @@ -13,9 +13,9 @@ hydra: cluster: num_nodes: 1 component_placement: - actor: 0-3 - env: 4-5 - rollout: 6-7 + actor: 0-7 + env: 0-7 + rollout: 0-7 runner: task_type: embodied @@ -30,7 +30,7 @@ runner: only_eval: False val_check_interval: -1 - save_interval: 25 + save_interval: 40 seq_length: 4096 max_prompt_length: 30 @@ -39,15 +39,15 @@ algorithm: ignore_terminations: False use_fixed_reset_state_ids: True require_values: False - shuffle_samples: True normalize_advantages: True kl_penalty: kl # how to estimate kl divergence: kl or kl_penalty group_size: 8 n_chunk_steps: 10 n_eval_chunk_steps: 10 - num_group_envs: 8 + num_group_envs: 32 rollout_epoch: 1 - reward_type: step_level # step_level or chunk_level + + reward_type: step_level logprob_type: token_level entropy_type: token_level @@ -65,11 +65,12 @@ algorithm: gamma: 0.99 gae_lambda: 0.95 + # params for rollout sampling_params: use_greedy: False temperature_train: 1.0 temperature_eval: 0.6 - top_k: 50 + top_k: 0 top_p: 1.0 repetition_penalty: 1.0 @@ -86,7 +87,7 @@ env: name: "env_buffer_list" queue_name: "obs_buffer" queue_size: 0 - enable_offload: True + enable_offload: False rollout: group_name: "RolloutGroup" @@ -97,8 +98,8 @@ rollout: mode: "colocate" generation_backend: "huggingface" model_dir: "/path/to/model/Openvla-oft-SFT-libero10-trajall/" - enable_offload: False - pipeline_stage_num: 2 + enable_offload: True + pipeline_stage_num: 1 actor: group_name: "ActorGroup" @@ 
-109,8 +110,8 @@ actor: training_backend: "fsdp" checkpoint_load_path: "/path/to/model/Openvla-oft-SFT-libero10-trajall/" checkpoint_save_path: "../results" - micro_batch_size: 8 - global_batch_size: 160 + micro_batch_size: 40 + global_batch_size: 640 seed: 1234 enable_offload: True @@ -148,12 +149,12 @@ actor: padding_side: "right" optim: - lr: 5.0e-6 + lr: 1.0e-4 value_lr: 3.0e-3 adam_beta1: 0.9 - adam_beta2: 0.95 + adam_beta2: 0.999 adam_eps: 1.0e-05 - clip_grad: 1.0 + clip_grad: 10.0 reward: use_reward_model: False diff --git a/examples/embodiment/config/maniskill_ppo_openvla.yaml b/examples/embodiment/config/maniskill_ppo_openvla.yaml index f76f18830..7af5a9d9f 100644 --- a/examples/embodiment/config/maniskill_ppo_openvla.yaml +++ b/examples/embodiment/config/maniskill_ppo_openvla.yaml @@ -1,6 +1,6 @@ defaults: - - env/train: PutCarrotOnPlateInScene - - env/eval: PutCarrotOnPlateInScene + - env/train: PutOnPlateInScene25Main + - env/eval: maniskill_ood_template - override hydra/job_logging: stdout hydra: @@ -36,16 +36,15 @@ runner: algorithm: auto_reset: True - ignore_terminations: True + ignore_terminations: False use_fixed_reset_state_ids: False require_values: True - shuffle_samples: True normalize_advantages: True kl_penalty: kl # how to estimate kl divergence: kl or kl_penalty group_size: 1 - n_chunk_steps: 80 + n_chunk_steps: 160 n_eval_chunk_steps: 80 - num_group_envs: 120 + num_group_envs: 128 rollout_epoch: 1 reward_type: step_level logprob_type: action_level @@ -70,7 +69,7 @@ algorithm: use_greedy: False temperature_train: 1.0 temperature_eval: 0.6 - top_k: 50 + top_k: 0 top_p: 1.0 repetition_penalty: 1.0 @@ -87,7 +86,7 @@ env: name: "env_buffer_list" queue_name: "obs_buffer" queue_size: 0 - enable_offload: True + enable_offload: False rollout: group_name: "RolloutGroup" @@ -110,8 +109,8 @@ actor: training_backend: "fsdp" checkpoint_load_path: "/path/to/model/rl4vla/openvla-7b-rlvla-warmup/" checkpoint_save_path: "../results" - micro_batch_size: 20 - global_batch_size: 160 + micro_batch_size: 40 + global_batch_size: 640 seed: 1234 enable_offload: True @@ -143,9 +142,9 @@ actor: lr: 1.0e-4 value_lr: 3.0e-3 adam_beta1: 0.9 - adam_beta2: 0.95 + adam_beta2: 0.999 adam_eps: 1.0e-05 - clip_grad: 1.0 + clip_grad: 10.0 tokenizer: tokenizer_type: "HuggingFaceTokenizer" diff --git a/examples/embodiment/config/maniskill_ppo_openvla_eval.yaml b/examples/embodiment/config/maniskill_ppo_openvla_eval.yaml deleted file mode 100644 index d8aaac28c..000000000 --- a/examples/embodiment/config/maniskill_ppo_openvla_eval.yaml +++ /dev/null @@ -1,160 +0,0 @@ -defaults: - - env/eval: maniskill_ood_template - - env/train: PutCarrotOnPlateInScene - - override hydra/job_logging: stdout - -hydra: - run: - dir: . 
- output_subdir: null - searchpath: - - file://${oc.env:EMBODIED_PATH}/config/ - -cluster: - num_nodes: 1 - component_placement: - actor: 0-3 - env: 0-3 - rollout: 0-3 - -runner: - task_type: embodied - logger: - log_path: "./logs" - project_name: rlinf - experiment_name: "test_openvla" - logger_backends: ["tensorboard"] # wandb, swanlab - - max_epochs: 1000 - max_steps: -1 - - only_eval: True - val_check_interval: -1 - save_interval: 40 - seq_length: 4096 - max_prompt_length: 30 - -algorithm: - auto_reset: True - ignore_terminations: True - use_fixed_reset_state_ids: False - require_values: True - shuffle_samples: True - normalize_advantages: True - kl_penalty: kl # how to estimate kl divergence: kl or kl_penalty - group_size: 1 - n_chunk_steps: 80 - n_eval_chunk_steps: 80 - num_group_envs: 120 - rollout_epoch: 1 - reward_type: step_level - logprob_type: action_level - entropy_type: action_level - - adv_type: embodied_gae - loss_type: embodied_ppo - loss_agg_func: "token-mean" - kl_beta: 0.0 - entropy_bonus: 0 - clip_ratio_high: 0.2 - clip_ratio_low: 0.2 - clip_ratio_c: 3.0 - value_clip: 0.2 - huber_delta: 10.0 - - gamma: 0.99 - gae_lambda: 0.95 - - sampling_params: - use_greedy: False - temperature_train: 1.0 - temperature_eval: 0.6 - top_k: 50 - top_p: 1.0 - repetition_penalty: 1.0 - - # length argument for autoregressive sampling - # max length means max amount of tokens to generate - length_params: - max_new_token: 7 - max_length: 1024 - min_length: 1 - -env: - group_name: "EnvGroup" - channel: - name: "env_buffer_list" - queue_name: "obs_buffer" - queue_size: 0 - enable_offload: True - -rollout: - group_name: "RolloutGroup" - channel: - name: ${env.channel.name} - queue_name: "action_buffer" - queue_size: 0 - mode: "colocate" - backend: "huggingface" - model_dir: "/path/to/model/openvla-7b-rlvla-rl/" - enable_offload: True - pipeline_stage_num: 2 - -actor: - group_name: "ActorGroup" - channel: - name: ${env.channel.name} - queue_name: "replay_buffer" - queue_size: 0 - training_backend: "fsdp" - checkpoint_load_path: "/path/to/model/openvla-7b-rlvla-rl/" - checkpoint_save_path: "/workspace/results" - micro_batch_size: 20 - global_batch_size: 160 - seed: 1234 - enable_offload: True - - tokenizer: - tokenizer_type: "HuggingFaceTokenizer" - tokenizer_model: "/path/to/model/openvla-7b-rlvla-rl/" - use_fast: False - trust_remote_code: True - padding_side: "right" - - model: - model_name: "openvla" - action_dim: 7 - num_action_chunks: 1 - use_proprio: False - unnorm_key: bridge_orig - center_crop: True - do_sample: False - - precision: "bf16" - add_bias_linear: False - add_qkv_bias: True - vocab_size: 32000 - hidden_size: 4096 - policy_setup: "widowx_bridge" - vh_mode: "a0" - image_size: [224, 224] - is_lora: False - lora_rank: 32 - ckpt_path: null - attn_implementation: "flash_attention_2" - low_cpu_mem_usage: True - trust_remote_code: True - - optim: - lr: 1.0e-4 - value_lr: 3.0e-3 - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-05 - clip_grad: 1.0 - -reward: - use_reward_model: False - -critic: - use_critic_model: False \ No newline at end of file diff --git a/examples/embodiment/config/maniskill_ppo_openvlaoft.yaml b/examples/embodiment/config/maniskill_ppo_openvlaoft.yaml index 35680f9d6..d9016cece 100644 --- a/examples/embodiment/config/maniskill_ppo_openvlaoft.yaml +++ b/examples/embodiment/config/maniskill_ppo_openvlaoft.yaml @@ -1,6 +1,6 @@ defaults: - - env/train: PutCarrotOnPlateInScene - - env/eval: PutCarrotOnPlateInScene + - env/train: PutOnPlateInScene25Main + - 
env/eval: maniskill_ood_template - override hydra/job_logging: stdout hydra: @@ -13,9 +13,9 @@ hydra: cluster: num_nodes: 1 component_placement: - actor: 0-1 - env: 2-3 - rollout: 4-5 + actor: 0-7 + env: 0-3 + rollout: 4-7 runner: task_type: embodied @@ -36,18 +36,18 @@ runner: algorithm: auto_reset: True - ignore_terminations: True + ignore_terminations: False use_fixed_reset_state_ids: False require_values: True shuffle_samples: True normalize_advantages: True kl_penalty: kl # how to estimate kl divergence: kl or kl_penalty group_size: 1 - n_chunk_steps: 10 + n_chunk_steps: 20 n_eval_chunk_steps: 10 - num_group_envs: 64 + num_group_envs: 128 rollout_epoch: 1 - reward_type: chunk_level + reward_type: step_level logprob_type: token_level entropy_type: token_level @@ -71,7 +71,7 @@ algorithm: use_greedy: False temperature_train: 1.0 temperature_eval: 0.6 - top_k: 50 + top_k: 0 top_p: 1.0 repetition_penalty: 1.0 @@ -88,7 +88,7 @@ env: name: "env_buffer_list" queue_name: "obs_buffer" queue_size: 0 - enable_offload: True + enable_offload: False rollout: group_name: "RolloutGroup" @@ -100,7 +100,7 @@ rollout: backend: "huggingface" model_dir: "/path/to/model/Openvla-oft-SFT-libero10-trajall/" enable_offload: True - pipeline_stage_num: 2 + pipeline_stage_num: 1 actor: group_name: "ActorGroup" @@ -111,8 +111,8 @@ actor: training_backend: "fsdp" checkpoint_load_path: "/path/to/model/Openvla-oft-SFT-libero10-trajall/" checkpoint_save_path: "../results" - micro_batch_size: 8 - global_batch_size: 160 + micro_batch_size: 40 + global_batch_size: 640 seed: 1234 enable_offload: True diff --git a/examples/embodiment/eval_all.sh b/examples/embodiment/eval_mani_ood.sh similarity index 51% rename from examples/embodiment/eval_all.sh rename to examples/embodiment/eval_mani_ood.sh index 86d4a9c4d..a9b49a8fe 100644 --- a/examples/embodiment/eval_all.sh +++ b/examples/embodiment/eval_mani_ood.sh @@ -5,24 +5,12 @@ export EMBODIED_PATH="$( cd "$(dirname "${BASH_SOURCE[0]}" )" && pwd )" export REPO_PATH=$(dirname $(dirname "$EMBODIED_PATH")) export SRC_FILE="${EMBODIED_PATH}/eval_embodied_agent.py" -export MUJOCO_GL="egl" -export PYOPENGL_PLATFORM="egl" -export PYTHONPATH=${REPO_PATH}:$PYTHONPATH -# NOTE: set LIBERO_REPO_PATH to the path of the LIBERO repo -export LIBERO_REPO_PATH="/path/to/repo/LIBERO" -# NOTE: set LIBERO_CONFIG_PATH for libero/libero/__init__.py -export LIBERO_CONFIG_PATH=${LIBERO_REPO_PATH} - -export PYTHONPATH=${LIBERO_REPO_PATH}:$PYTHONPATH export CUDA_LAUNCH_BLOCKING=1 export HYDRA_FULL_ERROR=1 - -if [ -z "$1" ]; then - CONFIG_NAME="maniskill_ppo_openvla_eval" -else - CONFIG_NAME=$1 -fi +EVAL_NAME=YOUR_EVAL_NAME +CKPT_PATH=YOUR_CKPT_PATH # .pt file +CONFIG_NAME=YOUR_CFG_NAME # env.eval must be maniskill_ood_template for env_id in \ "PutOnPlateInScene25VisionImage-v1" "PutOnPlateInScene25VisionTexture03-v1" "PutOnPlateInScene25VisionTexture05-v1" \ @@ -32,14 +20,34 @@ for env_id in \ "PutOnPlateInScene25Position-v1" "PutOnPlateInScene25EEPose-v1" "PutOnPlateInScene25PositionChangeTo-v1" ; \ do obj_set="test" - LOG_DIR="${REPO_PATH}/logs/eval/$(date +'%Y%m%d-%H:%M:%S')-${env_id}-${obj_set}" + LOG_DIR="${REPO_PATH}/logs/eval/${EVAL_NAME}/$(date +'%Y%m%d-%H:%M:%S')-${env_id}-${obj_set}" MEGA_LOG_FILE="${LOG_DIR}/run_ppo.log" mkdir -p "${LOG_DIR}" - CMD="python ${SRC_FILE} --config-path ${EMBODIED_PATH}/config/ --config-name ${CONFIG_NAME} \ + CMD="python ${SRC_FILE} --config-path ${EMBODIED_PATH}/config/ \ + --config-name ${CONFIG_NAME} \ runner.logger.log_path=${LOG_DIR} \ 
env.eval.init_params.id=${env_id} \ - env.eval.init_params.obj_set=$obj_set" + env.eval.init_params.obj_set=${obj_set} \ + actor.model.ckpt_path=${CKPT_PATH}" echo ${CMD} > ${MEGA_LOG_FILE} ${CMD} 2>&1 | tee -a ${MEGA_LOG_FILE} +done + +for env_id in \ + "PutOnPlateInScene25Carrot-v1" "PutOnPlateInScene25MultiCarrot-v1" \ + "PutOnPlateInScene25MultiPlate-v1" ; \ +do + obj_set="train" + LOG_DIR="${REPO_PATH}/logs/eval/${EVAL_NAME}/$(date +'%Y%m%d-%H:%M:%S')-${env_id}-${obj_set}" + MEGA_LOG_FILE="${LOG_DIR}/run_ppo.log" + mkdir -p "${LOG_DIR}" + CMD="python ${SRC_FILE} --config-path ${EMBODIED_PATH}/config/ \ + --config-name ${CONFIG_NAME} \ + runner.logger.log_path=${LOG_DIR} \ + env.eval.init_params.id=${env_id} \ + env.eval.init_params.obj_set=${obj_set} \ + actor.model.ckpt_path=${CKPT_PATH}" + echo ${CMD} > ${MEGA_LOG_FILE} + ${CMD} 2>&1 | tee -a ${MEGA_LOG_FILE} done \ No newline at end of file diff --git a/tests/e2e_tests/embodied/ppo_openvla.yaml b/tests/e2e_tests/embodied/ppo_openvla.yaml index a5c3c9487..f1f566b82 100644 --- a/tests/e2e_tests/embodied/ppo_openvla.yaml +++ b/tests/e2e_tests/embodied/ppo_openvla.yaml @@ -43,16 +43,9 @@ algorithm: n_chunk_steps: 10 n_eval_chunk_steps: 10 - # training rollout mbs - rollout_micro_batch_size: 64 num_group_envs: 8 rollout_epoch: 1 - # mbs to do log prob inference, can be set to - # lower than rollout_micro_batch_size to reduce - # memory usage - logprob_forward_micro_batch_size: 16 # ${.rollout_micro_batch_size} - adv_type: embodied_gae loss_type: embodied_ppo reward_type: chunk_level From 55a11732daae693b1bef8a0d9ea53db763262f68 Mon Sep 17 00:00:00 2001 From: LiuYiwei <71703069+secretsites@users.noreply.github.com> Date: Fri, 10 Oct 2025 18:27:18 +0800 Subject: [PATCH 12/57] chore: update README (#158) * chore: fix README-EN and add README-CN Signed-off-by: LiuYiwei <1252642155@qq.com> --- README.md | 180 ++++++++++++++++++++++++++++++++---------------- README.zh-CN.md | 175 ++++++++++++++++++++++++++++++---------------- 2 files changed, 237 insertions(+), 118 deletions(-) diff --git a/README.md b/README.md index ac99ffec5..41b8193a6 100644 --- a/README.md +++ b/README.md @@ -30,6 +30,7 @@ RLinf is a flexible and scalable open-source infrastructure designed for post-tr ## What's NEW! +- [2025/10] The RLinf Algorithm Technical Report [RLinf-VLA: A Unified and Efficient Framework for VLA+RL Training](https://arxiv.org/abs/2510.06710) is released. - [2025/09] [Example Gallery](https://rlinf.readthedocs.io/en/latest/rst_source/examples/index.html) is updated, users can find various off-the-shelf examples! - [2025/09] The paper [RLinf: Flexible and Efficient Large-scale Reinforcement Learning via Macro-to-Micro Flow Transformation](https://arxiv.org/abs/2509.15965) is released. - [2025/09] The [report on RLinf by Machine Heart](https://mp.weixin.qq.com/s/Xtv4gDu3lhDDGadLrzt6Aw) is released. @@ -73,90 +74,149 @@ RLinf is a flexible and scalable open-source infrastructure designed for post-tr ## Main Results ### Embodied Intelligence + +
+ + + + + +
+ mani_openvla +
+ OpenVLA +
+ mani_openvlaoft +
+ OpenVLA-OFT +
+
+ +- Training curves on ManiSkill “PutOnPlateInScene25Mani-v3” with OpenVLA and +OpenVLA-OFT models, using PPO and GRPO algorithms. PPO consistently outperforms GRPO +and exhibits greater stability. +
- +
- + - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - + + + + + + - - - - - + - - - - - + + + + + + - - - - - + + + + + + - - - - - + + + + + +
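[For readers comparing the PPO and GRPO curves above: PPO here uses a learned value head with GAE, while GRPO drops the critic and normalizes each return against its group of `group_size` rollouts that share a reset state. A schematic of the GRPO-style advantage, as an illustration only; RLinf's actual `adv_type: grpo` / `embodied_gae` code lives under `rlinf/algorithms`:]

```python
import torch

def grpo_advantages(returns, group_size, eps=1e-6):
    # returns: [num_groups * group_size] episode returns; one group = one shared reset state.
    r = returns.view(-1, group_size)
    adv = (r - r.mean(dim=1, keepdim=True)) / (r.std(dim=1, keepdim=True) + eps)
    return adv.view(-1)

adv = grpo_advantages(torch.randn(32 * 8), group_size=8)
```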
+**Evaluation results on ManiSkill. Values denote success rates**
+
+| Model | In-Distribution | OOD: Vision | OOD: Semantic | OOD: Execution | OOD: Avg. |
+| --- | --- | --- | --- | --- | --- |
+| OpenVLA (Base) | 53.91% | 38.75% | 35.94% | 42.11% | 39.10% |
+| RL4VLA (PPO) | 93.75% | 80.47% | 75.00% | 81.77% | 79.15% |
+| OpenVLA (RLinf-GRPO) | 84.38% | 74.69% | 72.99% | 77.86% | 75.15% |
+| OpenVLA (RLinf-PPO) | 96.09% | 82.03% | 78.35% | 85.42% | 81.93% |
+| OpenVLA-OFT (Base) | 28.13% | 27.73% | 12.95% | 11.72% | 18.29% |
+| OpenVLA-OFT (RLinf-GRPO) | 94.14% | 84.69% | 45.54% | 44.66% | 60.64% |
+| OpenVLA-OFT (RLinf-PPO) | 97.66% | 92.11% | 64.84% | 73.57% | 77.05% |
+
+ - +
+
- + - - - - - - + + + + + + + - - - - - - + + + + + + + - - - - - - + + + + + + + - - - - - - + + + + + + +
+**Evaluation results of the unified model on the five LIBERO task groups**
+
+| Model | Spatial | Object | Goal | 10 | 90 | Avg. |
+| --- | --- | --- | --- | --- | --- | --- |
+| OpenVLA-OFT (Base) | 72.18% | 71.48% | 64.06% | 48.44% | 70.97% | 65.43% |
+| OpenVLA-OFT (RLinf-GRPO) | 99.40% | 99.80% | 98.79% | 93.95% | 98.59% | 98.11% |
+| Δ Improvement | +27.22 | +28.32 | +34.73 | +45.51 | +27.62 | +32.68 |
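[A quick sanity check on the rebuilt table: `Avg.` is the unweighted mean of the five task groups, and `Δ Improvement` is the RLinf-GRPO row minus the base row, in percentage points:]

```python
base = [72.18, 71.48, 64.06, 48.44, 70.97]
rlinf = [99.40, 99.80, 98.79, 93.95, 98.59]
print(round(sum(base) / 5, 2), round(sum(rlinf) / 5, 2))  # 65.43 98.11
print([round(b - a, 2) for a, b in zip(base, rlinf)])     # [27.22, 28.32, 34.73, 45.51, 27.62]
```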
@@ -168,7 +228,7 @@ RLinf is a flexible and scalable open-source infrastructure designed for post-tr ### Math Reasoning
- +
@@ -213,7 +273,7 @@ RLinf is a flexible and scalable open-source infrastructure designed for post-tr \* We retrain the model using the default settings for 600 steps.
-
1.5B model results
+
diff --git a/README.zh-CN.md b/README.zh-CN.md index b0801c144..4eb14f6da 100644 --- a/README.zh-CN.md +++ b/README.zh-CN.md @@ -30,6 +30,7 @@ RLinf 是一个灵活且可扩展的开源框架,专为利用强化学习进 ## 最新动态 +- [2025/10] RLinf算法技术报告 [《RLinf-VLA:一个统一且高效的VLA+RL训练框架》](https://arxiv.org/abs/2510.06710) 已正式发布。 - [2025/09] [示例库](https://rlinf.readthedocs.io/en/latest/rst_source/examples/index.html) 已更新,用户可以在其中找到多种可直接使用的示例! - [2025/09] 我们的论文 [《RLinf: Flexible and Efficient Large-scale Reinforcement Learning via Macro-to-Micro Flow Transformation》](https://arxiv.org/abs/2509.15965)已正式发布。 - [2025/09] 机器之心关于 RLinf 的报道[《首个为具身智能而生的大规模强化学习框架RLinf!清华、北京中关村学院、无问芯穹等重磅开源》](https://mp.weixin.qq.com/s/Xtv4gDu3lhDDGadLrzt6Aw)已经发布。 @@ -74,94 +75,152 @@ RLinf 是一个灵活且可扩展的开源框架,专为利用强化学习进 ## 主要成果 ### 具身智能 +
-
7B model results
+
+ + + + +
+ mani_openvla +
+ OpenVLA +
+ mani_openvlaoft +
+ OpenVLA-OFT +
+
+ +- 在 ManiSkill 环境 “PutOnPlateInScene25Mani-v3” 上,使用 OpenVLA 与 OpenVLA-OFT 模型进行训练。结果显示,在 PPO 与 GRPO 算法的对比中,PPO 始终表现优于 GRPO,且训练过程更加稳定。 + +
+ - + - - - - - + + + + + + + + + + + - - - - - + + + + + + - - - - - + + + + + + - - - - - + + + + + + - - - - - + + + + + + - - - - - + + + + + + + + + + + + + + + + + + + + + + + + +
+**在 ManiSkill 上的评测结果。表中数值表示任务的成功率(Success Rate)**
+
+| Model | In-Distribution | OOD: Vision | OOD: Semantic | OOD: Execution | OOD: Avg. |
+| --- | --- | --- | --- | --- | --- |
+| OpenVLA (Base) | 53.91% | 38.75% | 35.94% | 42.11% | 39.10% |
+| RL4VLA (PPO) | 93.75% | 80.47% | 75.00% | 81.77% | 79.15% |
+| OpenVLA (RLinf-GRPO) | 84.38% | 74.69% | 72.99% | 77.86% | 75.15% |
+| OpenVLA (RLinf-PPO) | 96.09% | 82.03% | 78.35% | 85.42% | 81.93% |
+| OpenVLA-OFT (Base) | 28.13% | 27.73% | 12.95% | 11.72% | 18.29% |
+| OpenVLA-OFT (RLinf-GRPO) | 94.14% | 84.69% | 45.54% | 44.66% | 60.64% |
+| OpenVLA-OFT (RLinf-PPO) | 97.66% | 92.11% | 64.84% | 73.57% | 77.05% |
+
- + +
+
- + - - - - - - + + + + + + + - - - - - - + + + + + + + - - - - - - + + + + + + + - - - - - - + + + + + + +
+**统一模型在五个 LIBERO 任务组上的评测结果**
+
+| Model | Spatial | Object | Goal | 10 | 90 | Avg. |
+| --- | --- | --- | --- | --- | --- | --- |
+| OpenVLA-OFT (Base) | 72.18% | 71.48% | 64.06% | 48.44% | 70.97% | 65.43% |
+| OpenVLA-OFT (RLinf-GRPO) | 99.40% | 99.80% | 98.79% | 93.95% | 98.59% | 98.11% |
+| Δ Improvement | +27.22 | +28.32 | +34.73 | +45.51 | +27.62 | +32.68 |
+ - RLinf 同时支持 PPO 与 GRPO 算法,为视觉-语言-动作(Vision-Language-Action, VLA)模型提供最先进的训练能力。 - 该框架与主流具身智能基准测试(如 ManiSkill3 与 LIBERO)无缝集成,并在多样化的评测指标上均取得了优异表现。 From d611e9163de225ee7b880c5f6519f47a60ea6509 Mon Sep 17 00:00:00 2001 From: Andy Lin <32576375+andylin-hao@users.noreply.github.com> Date: Sat, 11 Oct 2025 13:19:57 +0800 Subject: [PATCH 13/57] feat: unified docker image and dockerfile (#147) Signed-off-by: Hao Lin --- .github/ISSUE_TEMPLATE/bug-report.yml | 20 ++- .github/workflows/auto_placement.yml | 1 + .github/workflows/embodied_e2e.yml | 5 +- .github/workflows/math_e2e.yml | 5 + .../workflows/math_e2e_rollout_logprobs.yml | 4 + .github/workflows/unit_test.yml | 3 + CONTRIBUTING.md | 6 +- docker/README.md | 24 +++ docker/embodied/Dockerfile.openvla.hf.fsdp | 97 ------------ docker/embodied/Dockerfile.openvlaoft.hf.fsdp | 97 ------------ docker/embodied/Dockerfile.pi0.hf.fsdp | 94 ------------ ...sglang0.4.4.vllm0.7.1.megatron0.11.te2.1.0 | 32 ---- ...g0.4.6post5.vllm0.8.5.megatron0.13.te2.1.0 | 56 ------- docker/torch-2.6/Dockerfile | 142 ++++++++++++++++++ docker/torch-2.6/project | 128 ++++++++++++++++ .../rst_source/start/installation.rst | 53 ++++--- docs/source-en/rst_source/start/vla.rst | 8 + .../rst_source/start/installation.rst | 50 +++--- docs/source-zh/rst_source/start/vla.rst | 8 + examples/embodiment/run_embodiment.sh | 7 +- pyproject.toml | 9 +- requirements/install_embodied_deps.sh | 6 + requirements/openvla.txt | 8 +- requirements/openvla_oft.txt | 8 +- rlinf/scheduler/worker/worker.py | 2 + tests/e2e_tests/embodied/run_openvla.sh | 1 + 26 files changed, 426 insertions(+), 448 deletions(-) create mode 100644 docker/README.md delete mode 100644 docker/embodied/Dockerfile.openvla.hf.fsdp delete mode 100644 docker/embodied/Dockerfile.openvlaoft.hf.fsdp delete mode 100644 docker/embodied/Dockerfile.pi0.hf.fsdp delete mode 100644 docker/math/Dockerfile.sglang0.4.4.vllm0.7.1.megatron0.11.te2.1.0 delete mode 100644 docker/math/Dockerfile.sglang0.4.6post5.vllm0.8.5.megatron0.13.te2.1.0 create mode 100644 docker/torch-2.6/Dockerfile create mode 100644 docker/torch-2.6/project diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml index 0145478f0..b928e91b3 100644 --- a/.github/ISSUE_TEMPLATE/bug-report.yml +++ b/.github/ISSUE_TEMPLATE/bug-report.yml @@ -18,6 +18,22 @@ body: Please provide a clear and concise description of what the bug is. validations: required: true +- type: textarea + attributes: + label: Log file + description: | + Please provide upload the log file or paste the full log messages when the bug happens. + value: | + You can find the log file in `logs/` folder or the $output_dir/$experiment_name folder (defined in the yaml config) if you are using our example scripts. + + Log file: + + If you cannot find the log, please provide the full log messages here. 
+ ``` + + ``` + validations: + required: true - type: textarea attributes: label: Environment @@ -28,13 +44,13 @@ body: pip list | grep -E "torch|sglang|vllm|ray|transformers" nvidia-smi ``` - Additionally, please provide the RLinf version, Megatron version if you are using Megatron, and docker image version if you are using our images + Additionally, please provide the RLinf version, Megatron version if you are using Megatron, and docker image tag if you are using our images value: | Python version: PIP list: RLinf version: Megatron version: - Docker image version: + Docker image tag: nvidia-smi: validations: required: true diff --git a/.github/workflows/auto_placement.yml b/.github/workflows/auto_placement.yml index 346a7226f..169e48186 100644 --- a/.github/workflows/auto_placement.yml +++ b/.github/workflows/auto_placement.yml @@ -53,6 +53,7 @@ jobs: uses: actions/checkout@v4 - name: auto-placement + timeout-minutes: 20 run: | export REPO_PATH=$(pwd) bash tests/e2e_tests/auto_placement/run_auto_placement.sh \ No newline at end of file diff --git a/.github/workflows/embodied_e2e.yml b/.github/workflows/embodied_e2e.yml index 259c76d41..5e8a999bc 100644 --- a/.github/workflows/embodied_e2e.yml +++ b/.github/workflows/embodied_e2e.yml @@ -42,16 +42,17 @@ jobs: openvla-ppo-test: runs-on: rlinf container: - image: rlinf/rlinf:agentic-openvla-rlinf0.1-torch2.5.1-transformer4.40 + image: rlinf/rlinf:agentic-rlinf0.1-torch2.6.0-openvla-openvlaoft-pi0 volumes: - /mnt/public/dataset:/workspace/dataset - /mnt/public/tokenizer:/workspace/tokenizer - options: --gpus="all" --shm-size=2g -e NVIDIA_DRIVER_CAPABILITIES="compute,utility,graphics" + options: --gpus="all" --shm-size=2g steps: - name: Checkout code uses: actions/checkout@v4 - name: OpenVLA test + timeout-minutes: 20 run: | export REPO_PATH=$(pwd) bash tests/e2e_tests/embodied/run_openvla.sh \ No newline at end of file diff --git a/.github/workflows/math_e2e.yml b/.github/workflows/math_e2e.yml index 3e31864b1..ffd474253 100644 --- a/.github/workflows/math_e2e.yml +++ b/.github/workflows/math_e2e.yml @@ -53,21 +53,25 @@ jobs: uses: actions/checkout@v4 - name: SGLang Collocated mode + timeout-minutes: 20 run: | export REPO_PATH=$(pwd) bash tests/e2e_tests/math/sglang/run_collocated.sh - name: vLLM Collocated mode + timeout-minutes: 20 run: | export REPO_PATH=$(pwd) bash tests/e2e_tests/math/vllm/run_collocated.sh - name: SGLang Pipeline mode + timeout-minutes: 20 run: | export REPO_PATH=$(pwd) bash tests/e2e_tests/math/sglang/run_pipeline.sh - name: vLLM Pipeline mode + timeout-minutes: 20 run: | export REPO_PATH=$(pwd) bash tests/e2e_tests/math/vllm/run_pipeline.sh @@ -86,6 +90,7 @@ jobs: uses: actions/checkout@v4 - name: SGLang Collocated mode + timeout-minutes: 20 run: | export REPO_PATH=$(pwd) bash tests/e2e_tests/math/sglang/run_collocated.sh diff --git a/.github/workflows/math_e2e_rollout_logprobs.yml b/.github/workflows/math_e2e_rollout_logprobs.yml index 914efb1d2..168737332 100644 --- a/.github/workflows/math_e2e_rollout_logprobs.yml +++ b/.github/workflows/math_e2e_rollout_logprobs.yml @@ -53,21 +53,25 @@ jobs: uses: actions/checkout@v4 - name: SGLang Collocated mode + timeout-minutes: 20 run: | export REPO_PATH=$(pwd) bash tests/e2e_tests/math/sglang/run_collocated.sh qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml - name: vLLM Collocated mode + timeout-minutes: 20 run: | export REPO_PATH=$(pwd) bash tests/e2e_tests/math/vllm/run_collocated.sh qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml - name: SGLang Pipeline mode + 
timeout-minutes: 20 run: | export REPO_PATH=$(pwd) bash tests/e2e_tests/math/sglang/run_pipeline.sh qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml - name: vLLM Pipeline mode + timeout-minutes: 20 run: | export REPO_PATH=$(pwd) bash tests/e2e_tests/math/vllm/run_pipeline.sh qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml diff --git a/.github/workflows/unit_test.yml b/.github/workflows/unit_test.yml index aa7747de6..634579f5f 100644 --- a/.github/workflows/unit_test.yml +++ b/.github/workflows/unit_test.yml @@ -48,11 +48,13 @@ jobs: run: pip install pytest - name: Run pytest + timeout-minutes: 20 run: | export PYTHONPATH=$(pwd):$(pwd)/megatron:$(pwd)/tests/unit_tests pytest tests/unit_tests - name: Run doctest + timeout-minutes: 20 run: | pytest --doctest-modules rlinf/scheduler @@ -69,6 +71,7 @@ jobs: run: pip install pytest - name: Run pytest + timeout-minutes: 20 run: | export PYTHONPATH=$(pwd):$(pwd)/megatron:$(pwd)/tests/unit_tests pytest tests/unit_tests \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 314cd313a..d151594d5 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -21,7 +21,7 @@ All types of contributions are encouraged and valued. See the [Table of Contents ## I Want To Contribute -> All contributions (including the project team's contribution) takes the form of [GitHub Pull Requests](https://github.com/RLinf/RLinf/pulls). +All contributions (including the project team's contribution) takes the form of [GitHub Pull Requests](https://github.com/RLinf/RLinf/pulls). To contribute, first you need to [fork the repository](https://github.com/RLinf/RLinf/fork) and clone it to your local machine. Then, create a new development branch from `main` for your contribution: ```bash @@ -29,7 +29,7 @@ git checkout main git pull origin main git checkout -b feature/your-feature-name ``` -After you have made your changes, commit them with a clear and descriptive commit message: +After you have made your changes, commit them with a clear and descriptive commit message. The `-s` flag is necessary, which adds a "Signed-off-by" line at the end of the commit message: ```bash git add . git commit -m "feat(embodied): add a clear and descriptive commit message" -s @@ -47,8 +47,6 @@ Where `` commonly includes the following (others can be found in the [Conv - `test`: adding missing tests, refactoring tests; no production code change - `chore`: updating build tasks, package manager configs, etc; no production code change. -The `-s` flag is necessary, which adds a "Signed-off-by" line at the end of the commit message, certifying that you have the right to submit this work under the project's license. - Finally, before pushing your changes to your fork, please run the pre-commit checks to ensure that your code adheres to the project's coding standards: ```bash pip install pre-commit diff --git a/docker/README.md b/docker/README.md new file mode 100644 index 000000000..2610efcd6 --- /dev/null +++ b/docker/README.md @@ -0,0 +1,24 @@ +## Building Docker Images + +RLinf provides a unified Dockerfile for both the math reasoning and embodied images, and can switch between the two images using the `BUILD_TARGET` build argument, which can be `reason` or `embodied`. +To build the Docker image, run the following command in the `docker/torch-x.x` directory, replacing `x.x` with the desired PyTorch version (e.g., `2.6` or `2.7`): + +```shell +export BUILD_TARGET=reason # or embodied for the embodied image +docker build --build-arg BUILD_TARGET=$BUILD_TARGET -t rlinf:$BUILD_TARGET . 
+``` + +If you are building the `reason` image and run into OOM during build, it might be because the `APEX` package's number of compile threads is set too high (default 24 and may require over 200 GB memory). +You can reduce the number of compile threads by adding `--build-arg APEX_BUILD_THREADS=` to the `docker build` command, where `` is the number of threads you want to use (e.g., 8 or 12). + +# Using the Docker Image + +The built Docker image contains one or multiple Python virtual environments (venv) in the `/opt/venv` directory, depending on the `BUILD_TARGET`. + +Currently, the reasoning image contains one venv named `reason` in `/opt/venv/reason`, while the embodied image contains three venvs named `openvla`, `openvla-oft` and `pi0` in `/opt/venv/`. + +To switch to the desired venv, we have a built-in script `switch_env` that can switch among venvs in a single command. + +```shell +source switch_env # e.g., source switch_env openvla-oft, source switch_env pi0, etc. +``` \ No newline at end of file diff --git a/docker/embodied/Dockerfile.openvla.hf.fsdp b/docker/embodied/Dockerfile.openvla.hf.fsdp deleted file mode 100644 index 52c9caa84..000000000 --- a/docker/embodied/Dockerfile.openvla.hf.fsdp +++ /dev/null @@ -1,97 +0,0 @@ -FROM pytorch/pytorch:2.5.1-cuda12.4-cudnn9-devel - -ENV DEBIAN_FRONTEND=noninteractive \ - PATH=/opt/conda/bin:$PATH \ - CUDNN_PATH=/opt/conda/lib/python3.11/site-packages/nvidia/cudnn/ - -RUN sed -i 's|http://archive.ubuntu.com/ubuntu/|http://mirrors.tuna.tsinghua.edu.cn/ubuntu/|g' /etc/apt/sources.list && \ - sed -i 's|http://security.ubuntu.com/ubuntu/|http://mirrors.tuna.tsinghua.edu.cn/ubuntu/|g' /etc/apt/sources.list && \ - apt-get update && \ - apt-get install -y --no-install-recommends \ - git \ - vim \ - libibverbs-dev \ - openssh-server \ - sudo \ - runit \ - runit-systemd \ - tmux \ - wget \ - curl \ - ca-certificates \ - mesa-utils \ - libosmesa6-dev \ - freeglut3-dev \ - libglew-dev \ - libegl1 \ - libgles2 \ - libglvnd-dev \ - libglfw3-dev \ - libgl1-mesa-dev \ - libgl1-mesa-glx \ - libglib2.0-0 \ - libsm6 \ - libxext6 \ - libxrender-dev \ - libgomp1 \ - && rm -rf /var/lib/apt/lists/* \ - && apt-get clean - -RUN python -m pip install -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple --upgrade pip && \ - pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple - -RUN pip install \ - hydra-core==1.4.0.dev1 \ - torchdata \ - word2number \ - setuptools==69.5.1 \ - datasets \ - sentencepiece \ - regex \ - einops \ - scipy \ - wandb \ - tensorboard \ - nvitop \ - accelerate \ - pylatexenc \ - pybind11 \ - torch_memory_saver \ - ray[default]==2.47.0 \ - draccus \ - rich \ - tensorflow_graphics \ - peft==0.11.1 \ - timm==0.9.10 \ - tensordict \ - transformers==4.40.1 - -RUN pip install --no-build-isolation --use-pep517 flash-attn==2.5.5 - -WORKDIR /workspace - -RUN git clone https://github.com/openvla/openvla.git /workspace/openvla && \ - git clone https://github.com/Lifelong-Robot-Learning/LIBERO.git /workspace/libero && \ - git clone https://github.com/haosulab/ManiSkill.git /workspace/maniskill - -RUN cd /workspace/maniskill && \ - git checkout fa22a46ecf54a4035a762dade27f8cb3f907aa46 && \ - cd /workspace - -RUN pip install \ - -e /workspace/maniskill \ - -e /workspace/libero \ - -e /workspace/openvla - -RUN pip install -r /workspace/openvla/experiments/robot/libero/libero_requirements.txt - -# OpenVLA overrides torch with v2.2, needs to be reset -RUN pip install torch==2.5.1 - -RUN python -m 
mani_skill.utils.download_asset bridge_v2_real2sim -y -RUN python -m mani_skill.utils.download_asset widowx250s -y - -RUN echo "source /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc -RUN echo "conda activate" >> ~/.bashrc - -CMD ["/bin/bash"] \ No newline at end of file diff --git a/docker/embodied/Dockerfile.openvlaoft.hf.fsdp b/docker/embodied/Dockerfile.openvlaoft.hf.fsdp deleted file mode 100644 index 61e955dda..000000000 --- a/docker/embodied/Dockerfile.openvlaoft.hf.fsdp +++ /dev/null @@ -1,97 +0,0 @@ -FROM pytorch/pytorch:2.5.1-cuda12.4-cudnn9-devel - -ENV DEBIAN_FRONTEND=noninteractive \ - PATH=/opt/conda/bin:$PATH \ - CUDNN_PATH=/opt/conda/lib/python3.11/site-packages/nvidia/cudnn/ - -RUN sed -i 's|http://archive.ubuntu.com/ubuntu/|http://mirrors.tuna.tsinghua.edu.cn/ubuntu/|g' /etc/apt/sources.list && \ - sed -i 's|http://security.ubuntu.com/ubuntu/|http://mirrors.tuna.tsinghua.edu.cn/ubuntu/|g' /etc/apt/sources.list && \ - apt-get update && \ - apt-get install -y --no-install-recommends \ - git \ - vim \ - libibverbs-dev \ - openssh-server \ - sudo \ - runit \ - runit-systemd \ - tmux \ - wget \ - curl \ - ca-certificates \ - mesa-utils \ - libosmesa6-dev \ - freeglut3-dev \ - libglew-dev \ - libegl1 \ - libgles2 \ - libglvnd-dev \ - libglfw3-dev \ - libgl1-mesa-dev \ - libgl1-mesa-glx \ - libglib2.0-0 \ - libsm6 \ - libxext6 \ - libxrender-dev \ - libgomp1 \ - && rm -rf /var/lib/apt/lists/* \ - && apt-get clean - -RUN python -m pip install -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple --upgrade pip && \ - pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple - -RUN pip install \ - hydra-core==1.4.0.dev1 \ - torchdata \ - word2number \ - setuptools==69.5.1 \ - datasets \ - sentencepiece \ - regex \ - einops \ - scipy \ - wandb \ - tensorboard \ - nvitop \ - accelerate \ - pylatexenc \ - pybind11 \ - torch_memory_saver \ - ray[default]==2.47.0 \ - draccus \ - rich \ - tensorflow_graphics \ - peft==0.11.1 \ - timm==0.9.10 \ - tensordict \ - transformers==4.40.1 - -RUN pip install --no-build-isolation --use-pep517 flash-attn==2.5.5 - -WORKDIR /workspace - -RUN git clone https://github.com/moojink/openvla-oft.git /workspace/openvla_oft && \ - git clone https://github.com/Lifelong-Robot-Learning/LIBERO.git /workspace/libero && \ - git clone https://github.com/haosulab/ManiSkill.git /workspace/maniskill - -RUN cd /workspace/maniskill && \ - git checkout fa22a46ecf54a4035a762dade27f8cb3f907aa46 && \ - cd /workspace - -RUN pip install \ - -e /workspace/maniskill \ - -e /workspace/libero \ - -e /workspace/openvla_oft - -RUN pip install -r /workspace/openvla_oft/experiments/robot/libero/libero_requirements.txt - -# OpenVLA overrides torch with v2.2, needs to be reset -RUN pip install torch==2.5.1 - -RUN python -m mani_skill.utils.download_asset bridge_v2_real2sim -y -RUN python -m mani_skill.utils.download_asset widowx250s -y - -RUN echo "source /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc -RUN echo "conda activate" >> ~/.bashrc - -CMD ["/bin/bash"] \ No newline at end of file diff --git a/docker/embodied/Dockerfile.pi0.hf.fsdp b/docker/embodied/Dockerfile.pi0.hf.fsdp deleted file mode 100644 index 71ee57523..000000000 --- a/docker/embodied/Dockerfile.pi0.hf.fsdp +++ /dev/null @@ -1,94 +0,0 @@ -FROM pytorch/pytorch:2.5.1-cuda12.4-cudnn9-devel - -ENV DEBIAN_FRONTEND=noninteractive \ - PATH=/opt/conda/bin:$PATH \ - CUDNN_PATH=/opt/conda/lib/python3.11/site-packages/nvidia/cudnn/ - -RUN sed -i 
's|http://archive.ubuntu.com/ubuntu/|http://mirrors.tuna.tsinghua.edu.cn/ubuntu/|g' /etc/apt/sources.list && \ - sed -i 's|http://security.ubuntu.com/ubuntu/|http://mirrors.tuna.tsinghua.edu.cn/ubuntu/|g' /etc/apt/sources.list && \ - apt-get update && \ - apt-get install -y --no-install-recommends \ - git \ - vim \ - libibverbs-dev \ - openssh-server \ - sudo \ - runit \ - runit-systemd \ - tmux \ - wget \ - curl \ - ca-certificates \ - mesa-utils \ - libosmesa6-dev \ - freeglut3-dev \ - libglew-dev \ - libegl1 \ - libgles2 \ - libglvnd-dev \ - libglfw3-dev \ - libgl1-mesa-dev \ - libgl1-mesa-glx \ - libglib2.0-0 \ - libsm6 \ - libxext6 \ - libxrender-dev \ - libgomp1 \ - && rm -rf /var/lib/apt/lists/* \ - && apt-get clean - -RUN python -m pip install -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple --upgrade pip && \ - pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple - -RUN pip install \ - hydra-core==1.4.0.dev1 \ - torchdata \ - word2number \ - setuptools==69.5.1 \ - datasets \ - sentencepiece \ - regex \ - einops \ - scipy \ - wandb \ - tensorboard \ - nvitop \ - accelerate \ - pylatexenc \ - pybind11 \ - torch_memory_saver \ - ray[default]==2.47.0 \ - draccus \ - rich \ - tensorflow_graphics \ - peft==0.11.1 \ - timm==0.9.10 \ - tensordict \ - transformers==4.53.0 \ - tokenizers==0.21.4 \ - av==15.0.0 - -RUN pip install --no-build-isolation --use-pep517 flash-attn==2.5.5 - -WORKDIR /workspace - -RUN git clone https://github.com/huggingface/lerobot.git /workspace/lerobot && \ - git clone https://github.com/Lifelong-Robot-Learning/LIBERO.git /workspace/libero && \ - git clone https://github.com/haosulab/ManiSkill.git /workspace/maniskill - -RUN cd /workspace/maniskill && \ - git checkout fa22a46ecf54a4035a762dade27f8cb3f907aa46 && \ - cd /workspace - -RUN pip install \ - -e /workspace/maniskill \ - -e /workspace/libero \ - -e /workspace/lerobot - -RUN python -m mani_skill.utils.download_asset bridge_v2_real2sim -y -RUN python -m mani_skill.utils.download_asset widowx250s -y - -RUN echo "source /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc -RUN echo "conda activate" >> ~/.bashrc - -CMD ["/bin/bash"] \ No newline at end of file diff --git a/docker/math/Dockerfile.sglang0.4.4.vllm0.7.1.megatron0.11.te2.1.0 b/docker/math/Dockerfile.sglang0.4.4.vllm0.7.1.megatron0.11.te2.1.0 deleted file mode 100644 index 41ffd17c2..000000000 --- a/docker/math/Dockerfile.sglang0.4.4.vllm0.7.1.megatron0.11.te2.1.0 +++ /dev/null @@ -1,32 +0,0 @@ -FROM pytorch/pytorch:2.5.1-cuda12.4-cudnn9-devel -ENV DEBIAN_FRONTEND=noninteractive -RUN apt update && apt install git vim libibverbs-dev openssh-server sudo runit runit-systemd tmux -y -RUN python -m pip install -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple --upgrade pip -RUN pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple -WORKDIR /opt -RUN git clone https://github.com/NVIDIA/Megatron-LM.git && cd Megatron-LM && git checkout v0.11.0 -WORKDIR /opt -RUN git clone https://github.com/RLinf/latex2sympy2.git && cd latex2sympy2 && pip install -e . 
-RUN pip install hydra-core==1.4.0.dev1 -RUN pip install torchdata -RUN pip install word2number -RUN pip install vllm==0.7.1 -RUN pip install setuptools==69.5.1 datasets sentencepiece regex einops scipy wandb tensorboard nvitop accelerate pylatexenc pybind11 torch_memory_saver swanlab -RUN pip install ray[default]==2.47.0 -ENV CUDNN_PATH=/opt/conda/lib/python3.11/site-packages/nvidia/cudnn/ -RUN pip install transformer_engine[pytorch]==2.1.0 -WORKDIR /opt -RUN git clone https://github.com/NVIDIA/apex && cd apex && pip install -v \ - --disable-pip-version-check \ - --no-cache-dir \ - --no-build-isolation \ - --config-settings "--build-option=--cpp_ext" \ - --config-settings "--build-option=--cuda_ext" ./ -WORKDIR /opt -RUN pip install https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.5cxx11abiFALSE-cp311-cp311-linux_x86_64.whl -# install sglang -RUN pip install sglang[all]==0.4.4 -ENV PATH=/opt/conda/bin:$PATH -ENV PYTHONPATH=/opt/Megatron-LM:$PYTHONPATH -WORKDIR /workspace -CMD [ "/bin/bash" ] \ No newline at end of file diff --git a/docker/math/Dockerfile.sglang0.4.6post5.vllm0.8.5.megatron0.13.te2.1.0 b/docker/math/Dockerfile.sglang0.4.6post5.vllm0.8.5.megatron0.13.te2.1.0 deleted file mode 100644 index 5e0a1bcb8..000000000 --- a/docker/math/Dockerfile.sglang0.4.6post5.vllm0.8.5.megatron0.13.te2.1.0 +++ /dev/null @@ -1,56 +0,0 @@ -FROM pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel - -ENV DEBIAN_FRONTEND=noninteractive -RUN apt-get update && apt-get install -y --no-install-recommends \ - git vim libibverbs-dev openssh-server sudo runit runit-systemd tmux \ - build-essential python3-dev cmake pkg-config \ - && rm -rf /var/lib/apt/lists/* - -RUN python -m pip install -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple --upgrade pip setuptools wheel -RUN pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple - -ENV HF_HOME=/opt/.cache/huggingface -RUN mkdir -p $HF_HOME - -WORKDIR /opt -RUN git clone https://github.com/NVIDIA/Megatron-LM.git && cd Megatron-LM && git checkout core_r0.13.0 -WORKDIR /opt -RUN git clone --depth=1 https://github.com/RLinf/latex2sympy2.git && cd latex2sympy2 && pip install -e . 
-
-RUN pip install hydra-core==1.4.0.dev1 torchdata word2number vllm==0.8.5 \
-    datasets sentencepiece regex einops scipy wandb tensorboard nvitop accelerate pylatexenc pybind11 \
-    torch_memory_saver swanlab ray[default]==2.47.0
-
-ENV CUDNN_PATH=/opt/conda/lib/python3.11/site-packages/nvidia/cudnn/
-RUN pip install 'transformer_engine[pytorch]==2.1.0'
-
-WORKDIR /opt
-RUN git clone --depth=1 https://github.com/NVIDIA/apex && cd apex && \
-    pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation \
-    --config-settings "--build-option=--cpp_ext" \
-    --config-settings "--build-option=--cuda_ext" ./
-
-# RUN pip install 'flash-attn'==2.7.4.post1 --no-build-isolation
-RUN pip install https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl
-
-
-RUN pip install 'sglang[all]==0.4.6.post5'
-
-RUN pip install flashinfer-python==0.2.2
-
-RUN pip install triton==3.1.0
-
-RUN pip uninstall pynvml -y
-
-ENV PATH=/opt/conda/bin:$PATH
-ENV PYTHONPATH=/opt/Megatron-LM:$PYTHONPATH
-WORKDIR /workspace
-
-RUN python - <<'PY'
-import torch, vllm, flash_attn_cuda
-import apex
-print("Torch:", torch.__version__, "CUDA:", torch.version.cuda)
-print("CUDA available:", torch.cuda.is_available())
-PY
-
-CMD ["/bin/bash"]
diff --git a/docker/torch-2.6/Dockerfile b/docker/torch-2.6/Dockerfile
new file mode 100644
index 000000000..95da4683e
--- /dev/null
+++ b/docker/torch-2.6/Dockerfile
@@ -0,0 +1,142 @@
+ARG BUILD_TARGET=reason
+
+FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 AS base-image
+
+SHELL ["/bin/bash", "-c"]
+ENV PATH=/opt/conda/bin:$PATH
+ENV DEBIAN_FRONTEND=noninteractive
+RUN sed -i 's@//.*archive.ubuntu.com@//mirrors.ustc.edu.cn@g' /etc/apt/sources.list
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    git vim libibverbs-dev openssh-server sudo runit runit-systemd tmux \
+    build-essential python3-dev cmake pkg-config iproute2 pciutils python3 python3-pip
+
+RUN pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
+RUN python3 -m pip install -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple --upgrade pip setuptools wheel uv
+
+ENV HF_HOME=/opt/.cache/huggingface
+RUN mkdir -p $HF_HOME
+
+# UV index
+RUN mkdir -p /etc/uv
+RUN cat <<EOF > /etc/uv/uv.toml
+[[index]]
+url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
+default = true
+EOF
+
+# UV setup
+ENV UV_PATH=/opt/venv
+RUN mkdir $UV_PATH
+WORKDIR $UV_PATH
+COPY project $UV_PATH/pyproject.toml
+ENV UV_LINK_MODE=symlink
+ENV UV_CACHE_DIR=$UV_PATH/.cache
+
+FROM base-image AS reason-image
+
+ARG APEX_BUILD_THREADS=24
+
+# Install Megatron-LM
+RUN git clone https://github.com/NVIDIA/Megatron-LM.git -b core_r0.13.0 /opt/Megatron-LM
+ENV PYTHONPATH=/opt/Megatron-LM:$PYTHONPATH
+
+# Install reasoning env
+RUN uv venv reason && source ${UV_PATH}/reason/bin/activate && \
+    UV_TORCH_BACKEND=auto uv sync --active && \
+    uv sync --extra sglang-vllm --active && \
+    APEX_CPP_EXT=1 APEX_CUDA_EXT=1 NVCC_APPEND_FLAGS="--threads $APEX_BUILD_THREADS" APEX_PARALLEL_BUILD=$APEX_BUILD_THREADS uv pip install git+https://github.com/NVIDIA/apex.git --no-build-isolation && \
+    uv pip install transformer_engine[pytorch]==2.1.0 --no-build-isolation && \
+    uv pip install https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl && \
+    uv pip uninstall pynvml
+
+# Set default env
+RUN echo "source
${UV_PATH}/reason/bin/activate" >> ~/.bashrc
+
+FROM base-image AS embodied-image
+
+# Embodied NVIDIA_DRIVER_CAPABILITIES
+ENV NVIDIA_DRIVER_CAPABILITIES="compute,utility,graphics"
+
+# Embodied system dependencies
+RUN apt-get install -y --no-install-recommends \
+    wget \
+    unzip \
+    mesa-utils \
+    libosmesa6-dev \
+    freeglut3-dev \
+    libglew-dev \
+    libegl1 \
+    libgles2 \
+    libglvnd-dev \
+    libglfw3-dev \
+    libgl1-mesa-dev \
+    libgl1-mesa-glx \
+    libglib2.0-0 \
+    libsm6 \
+    libxext6 \
+    libxrender-dev \
+    libgomp1
+
+# Install openvla env
+RUN uv venv openvla && source ${UV_PATH}/openvla/bin/activate && \
+    UV_TORCH_BACKEND=auto uv sync --active && \
+    uv sync --extra embodied --active && \
+    uv pip install git+https://github.com/openvla/openvla.git --no-build-isolation && \
+    uv pip install flash-attn==2.5.5 --no-build-isolation && \
+    uv pip uninstall pynvml
+
+# Install openvla-oft env
+RUN uv venv openvla-oft && source ${UV_PATH}/openvla-oft/bin/activate && \
+    UV_TORCH_BACKEND=auto uv sync --active && \
+    uv sync --extra embodied --active && \
+    uv pip install git+https://github.com/moojink/openvla-oft.git --no-build-isolation && \
+    uv pip install flash-attn==2.5.5 --no-build-isolation && \
+    uv pip uninstall pynvml
+
+# Install pi0 env
+RUN uv venv pi0 && source ${UV_PATH}/pi0/bin/activate && \
+    UV_TORCH_BACKEND=auto uv sync --active && \
+    uv sync --extra embodied --active && \
+    uv pip install "lerobot>=0.3.3" && \
+    uv pip install flash-attn==2.5.5 --no-build-isolation && \
+    uv pip uninstall pynvml
+
+# Install maniskill assets
+RUN source ${UV_PATH}/openvla/bin/activate && \
+    python -m mani_skill.utils.download_asset bridge_v2_real2sim -y && \
+    python -m mani_skill.utils.download_asset widowx250s -y
+
+# Install SAPIEN PhysX Patch
+RUN export PHYSX_VERSION=105.1-physx-5.3.1.patch0 && \
+    export PHYSX_DIR=~/.sapien/physx/$PHYSX_VERSION && \
+    mkdir -p $PHYSX_DIR && \
+    wget -O $PHYSX_DIR/linux-so.zip https://github.com/sapien-sim/physx-precompiled/releases/download/$PHYSX_VERSION/linux-so.zip && \
+    unzip $PHYSX_DIR/linux-so.zip -d $PHYSX_DIR && rm $PHYSX_DIR/linux-so.zip
+
+# Set default env
+RUN echo "source ${UV_PATH}/openvla/bin/activate" >> ~/.bashrc
+
+FROM ${BUILD_TARGET}-image AS final-image
+
+# switch_env utility
+RUN cat <<EOF > /usr/local/bin/switch_env
+#!/bin/bash
+if [ -z "\$1" ]; then
+    echo "Usage: switch_env <venv_name>"
+    exit 1
+fi
+if [ ! -d "${UV_PATH}/\$1" ]; then
+    echo "Environment \$1 does not exist in ${UV_PATH}."
+ exit 1 +fi +source ${UV_PATH}/\$1/bin/activate +EOF +RUN chmod +x /usr/local/bin/switch_env + +# Clean up +RUN uv clean prune +RUN rm -rf /var/lib/apt/lists/* + +WORKDIR /workspace + +CMD ["/bin/bash"] \ No newline at end of file diff --git a/docker/torch-2.6/project b/docker/torch-2.6/project new file mode 100644 index 000000000..ebf2c57e4 --- /dev/null +++ b/docker/torch-2.6/project @@ -0,0 +1,128 @@ +[project] +name = "RLinf" +version = "0.1.0" +readme = {file = "README.md", content-type = "text/markdown"} +requires-python = "==3.11.10" +keywords = [ + "reinforcement-learning", + "embodied-intelligence", + "large-language-models", +] +classifiers = [ + # 2 - Pre-Alpha + # 3 - Alpha + # 4 - Beta + # 5 - Production/Stable + "Development Status :: 2 - Pre-Alpha", + "Environment :: GPU :: NVIDIA CUDA :: 12 :: 12.4", + "Intended Audience :: Developers", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.11", +] + +dependencies = [ + # Core System + "accelerate", + "ray[default]>=2.47.0", + "torch==2.6.0", + + # Data processing + "pylatexenc", + "datasets", + "latex2sympy2 @ git+https://github.com/RLinf/latex2sympy2.git", + "sentencepiece", + "torchdata", + "wandb", + "word2number", + "regex", + "scipy", + + # Utilities + "hydra-core==1.4.0.dev1", + "einops", + "nvitop", + "pybind11", + "torch-memory-saver", + "setuptools>=69.5.1,<75.9", + "ninja", + + # Logging + "swanlab", + "tensorboard", +] + +[project.optional-dependencies] +sglang-vllm = [ + "transformers==4.51.1", + "sglang[all]==0.4.6.post5", + "vllm==0.8.5", +] +embodied = [ + "prismatic", + "transformers==4.40.1", + "draccus", + "rich", + "tensorflow_graphics", + "peft==0.11.1", + "timm==0.9.10", + "sapien==3.0.1;platform_system=='Linux'", + "mani_skill @ git+https://github.com/haosulab/ManiSkill.git", + "tensordict", + "libero @ git+https://github.com/RLinf/LIBERO.git", + "imageio[ffmpeg]", + "robosuite==1.4.1", + "bddl", + "easydict", + "cloudpickle", + "gym", +] + +[tool.uv] +prerelease = "allow" +conflicts = [ + [ + { extra = "sglang-vllm" }, + { extra = "embodied" }, + ], +] +override-dependencies = [ + "torch==2.6.0", + "torchvision==0.21.0", + "torchaudio==2.6.0", + "xgrammar==0.1.19" +] + +[tool.ruff] +line-length = 88 +indent-width = 4 +target-version = "py38" + +[tool.ruff.lint] +isort = {known-first-party = ["rlinf"]} +select = ["C", "E", "F", "I", "W", "CPY001", "RUF013", "UP006", "PERF102", "PLC1802", "PLC0208", "D", "RUF002"] +ignore = [ + "C901", # "complex-structure" + "E501", # "line-too-long" + "E741", # "ambiguous-variable-name" + "D100", # "Missing docstring in public module" + "D104", # "Missing docstring in public package" + "D203", # "incorrect-blank-line-before-class", conflict with D211, + "D213", # "multi-line-summary-second-line", conflict with D212 +] +fixable = ["ALL"] +unfixable = [] + +[tool.ruff.lint.per-file-ignores] +# Only enable docstring check for the scheduler module for now. 
+"!rlinf/scheduler/**.py" = ["D", "RUF002"] + +[tool.ruff.lint.pydocstyle] +convention = "google" + +[tool.ruff.format] +quote-style = "double" +indent-style = "space" +skip-magic-trailing-comma = false +line-ending = "auto" +docstring-code-format = true +docstring-code-line-length = "dynamic" \ No newline at end of file diff --git a/docs/source-en/rst_source/start/installation.rst b/docs/source-en/rst_source/start/installation.rst index 306e8956c..58288582e 100644 --- a/docs/source-en/rst_source/start/installation.rst +++ b/docs/source-en/rst_source/start/installation.rst @@ -21,12 +21,6 @@ Backend Engines - **Huggingface**: Easy to use, with native APIs provided by the Huggingface ecosystem. -Installation Methods --------------------- - -RLinf provides two installation options. We **recommend using Docker**, as it provides the fastest and most reproducible environment. -However, if your system is incompatible with the Docker image, you can also install RLinf manually in a Python environment. - Hardware Requirements ~~~~~~~~~~~~~~~~~~~~~~~ @@ -71,21 +65,25 @@ Software Requirements * - NVIDIA Container Toolkit - 1.17.8 +Installation Methods +-------------------- + +RLinf provides two installation options. We **recommend using Docker**, as it provides the fastest and most reproducible environment. +However, if your system is incompatible with the Docker image, you can also install RLinf manually in a Python environment. + -Install from Docker Image +Installation Method 1: Docker Image ------------------------- We provide two official Docker images optimized for different backend configurations: -- **Megatron + SGLang/vLLM**: +- **Math reasoning with Megatron + SGLang/vLLM**: - ``rlinf/rlinf:math-rlinf0.1-torch2.5.1-sglang0.4.4-vllm0.7.1-megatron0.11.0-te2.1`` (used for enhancing LLM reasoning on MATH tasks) -- **FSDP + Huggingface**: - - - ``rlinf/rlinf:agentic-openvla-rlinf0.1-torch2.5.1`` (for the OpenVLA model) - - ``rlinf/rlinf:agentic-openvlaoft-rlinf0.1-torch2.5.1`` (for the OpenVLA-OFT model) +- **Embodied with FSDP + Huggingface**: + - ``rlinf/rlinf:agentic-rlinf0.1-torch2.6.0-openvla-openvlaoft-pi0`` (for the OpenVLA/OpenVLA-OFT/Pi0 model) Once you've identified the appropriate image for your setup, pull the Docker image: @@ -100,7 +98,6 @@ Then, start the container using the pulled image: docker run -it --gpus all \ --shm-size 100g \ --net=host \ - --env NVIDIA_DRIVER_CAPABILITIES=compute,utility,graphics \ --name rlinf \ rlinf/rlinf:CHOSEN_IMAGE /bin/bash @@ -111,25 +108,32 @@ Inside the container, clone the RLinf repository: git clone https://github.com/RLinf/RLinf.git cd RLinf +The embodied image contains multiple Python virtual environments (venv) located in the `/opt/venv` directory for different models, namely ``openvla``, ``openvla-oft``, and ``pi0``. +The default environment is set to ``openvla``. +To switch to the desired venv, use the built-in script `switch_env`: + +.. code-block:: bash + + source switch_env # e.g., source switch_env openvla-oft, source switch_env pi0, etc. + .. tip:: - For multi-node training, make sure to clone the repository in shared storage so that every node has access to it. - To use ManiSkill settings, refer to the README at ``https://huggingface.co/datasets/RLinf/maniskill_assets`` for instructions on downloading the required files. 
-Install from Custom Environment
--------------------------------
+Installation Method 2: UV Custom Environment
+--------------------------------------------

**If you have already used the Docker image, you can skip the following steps.**

-Installation is divided into three parts depending on the type of experiments you plan to run.
+Installation is divided into two parts depending on the type of experiments you plan to run.
+
+First, for all experiments, follow the :ref:`Common Dependencies <common-dependencies>` section to install the shared dependencies.

-First, for all experiments, follow the :ref:`Common Dependencies ` section to install the shared dependencies.
-This already includes the full backend setup for **FSDP + Huggingface**.
+Next, install the specific dependencies based on your experiment type.

-Second, for experiments using **Megatron** and **SGLang/vLLM** backends,
-follow the :ref:`Megatron and SGLang/vLLM Dependencies ` section to install all required packages.
+* For reasoning experiments using **Megatron** and **SGLang/vLLM** backends, follow the :ref:`Megatron and SGLang/vLLM Dependencies ` section to install all required packages.

-Third, for embodied intelligence experiments (e.g., OpenVLA, OpenVLA-OFT and Pi0),
-follow the :ref:`Embodied Dependencies ` section to install their specific dependencies.
+* For embodied intelligence experiments (e.g., OpenVLA, OpenVLA-OFT and Pi0), follow the :ref:`Embodied Dependencies ` section to install their specific dependencies.

.. _common-dependencies:

@@ -158,6 +162,10 @@ After installing ``uv``, create a virtual environment and install PyTorch along
Megatron and SGLang/vLLM Dependencies
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+.. note::
+   If you are running embodied experiments, there is no need to install these dependencies.
+   Please proceed directly to the :ref:`Embodied Dependencies ` section.
+
Run the following commands to install Megatron, SGLang or vLLM, and their dependencies:

.. code-block:: shell

@@ -195,5 +203,4 @@ Then, depending on the experiment type, install the required packages for ``open
   UV_TORCH_BACKEND=auto uv pip install -r requirements/openvla_oft.txt --no-build-isolation

   # For Pi0 experiments
-   UV_TORCH_BACKEND=auto uv pip install -r requirements/pi0.txt --no-build-isolation
-
+   UV_TORCH_BACKEND=auto uv pip install -r requirements/pi0.txt --no-build-isolation
\ No newline at end of file
diff --git a/docs/source-en/rst_source/start/vla.rst b/docs/source-en/rst_source/start/vla.rst
index e79ed101d..32357ba91 100644
--- a/docs/source-en/rst_source/start/vla.rst
+++ b/docs/source-en/rst_source/start/vla.rst
@@ -34,6 +34,14 @@ the model is cited in `paper `_

**Step 2: Execute the provided launch script:**

+.. note::
+   If you have installed RLinf via the Docker image (see :doc:`./installation`), please make sure you have switched to the right Python environment for the target model.
+   The default environment is set to ``openvla``.
+   To switch to OpenVLA-OFT or Pi0, use the built-in script `switch_env`:
+   ``source switch_env openvla-oft`` or ``source switch_env pi0``.
+
+   If you have installed RLinf in a custom environment, please ensure that you have installed the model's corresponding dependencies as described in :doc:`./installation`.
+
For user convenience, our configuration file is set up to run with at least two GPUs by default.
However, if you have multiple GPUs and wish to accelerate the quickstart process, we highly recommend updating the following configuration option in
diff --git a/docs/source-zh/rst_source/start/installation.rst b/docs/source-zh/rst_source/start/installation.rst
index 730e1ab0b..220d973d1 100644
--- a/docs/source-zh/rst_source/start/installation.rst
+++ b/docs/source-zh/rst_source/start/installation.rst
@@ -21,12 +21,6 @@ RLinf 支持多种后端引擎,用于训练和推理。目前支持以下配

- **Huggingface**:简单易用,配套 Huggingface 生态提供的原生 API。

-安装方式
---------------------
-
-RLinf 提供两种安装方式。我们 **推荐使用 Docker**,因为这可以提供最快速、最可复现的环境。
-如果你的系统无法使用 Docker 镜像,也可以选择在本地 Python 环境中手动安装。
-
硬件要求
~~~~~~~~~~~~~~~~~~~~~~~

@@ -70,19 +64,25 @@ RLinf 提供两种安装方式。我们 **推荐使用 Docker**,因为这可
 * - NVIDIA Container Toolkit
   - 1.17.8

-使用 Docker 镜像安装
+
+安装方式
+--------------------
+
+RLinf 提供两种安装方式。我们 **推荐使用 Docker**,因为这可以提供最快速、最可复现的环境。
+如果你的系统无法使用 Docker 镜像,也可以选择在本地 Python 环境中手动安装。
+
+安装方式1: Docker 镜像
-------------------------

我们提供了两个官方镜像,分别针对不同后端配置进行了优化:

-- **Megatron + SGLang/vLLM**:
+- **基于Megatron + SGLang/vLLM的数学推理镜像**:

  - ``rlinf/rlinf:math-rlinf0.1-torch2.5.1-sglang0.4.4-vllm0.7.1-megatron0.11.0-te2.1`` (用于增强大语言模型在 MATH 任务中的推理能力)

-- **FSDP + Huggingface**:
+- **基于FSDP + Huggingface的具身智能镜像**:

-  - ``rlinf/rlinf:agentic-openvla-rlinf0.1-torch2.5.1`` (适用于 OpenVLA 模型)
-  - ``rlinf/rlinf:agentic-openvlaoft-rlinf0.1-torch2.5.1`` (适用于 OpenVLA-OFT 模型)
+  - ``rlinf/rlinf:agentic-rlinf0.1-torch2.6.0-openvla-openvlaoft-pi0`` (适用于 OpenVLA/OpenVLA-OFT/Pi0 模型)

确认适合你任务的镜像后,拉取镜像:

@@ -97,7 +97,6 @@ RLinf 提供两种安装方式。我们 **推荐使用 Docker**,因为这可
   docker run -it --gpus all \
     --shm-size 100g \
     --net=host \
-     --env NVIDIA_DRIVER_CAPABILITIES=compute,utility,graphics \
     --name rlinf \
     rlinf/rlinf:CHOSEN_IMAGE /bin/bash

@@ -108,25 +107,30 @@ RLinf 提供两种安装方式。我们 **推荐使用 Docker**,因为这可
   git clone https://github.com/RLinf/RLinf.git
   cd RLinf

+具身智能镜像中包含多个 Python 虚拟环境(venv),位于 ``/opt/venv`` 目录下,分别对应不同模型,即 ``openvla``、``openvla-oft`` 和 ``pi0``。
+默认环境设置为 ``openvla``。
+要切换到所需的 venv,可以使用内置脚本 `switch_env`:
+
+.. code-block:: bash
+
+   source switch_env <venv_name>  # 例如,source switch_env openvla-oft, source switch_env pi0 等
+
.. tip::
   如果进行多节点训练,请将仓库克隆到共享存储路径,确保每个节点都能访问该代码。

-自定义环境安装
+安装方式2:UV 自定义环境
-------------------------------

**如果你已经使用了 Docker 镜像,下面步骤可跳过。**

-根据你的实验类型,安装分为三步进行:
+根据你的实验类型,安装分为两步进行:

-第一步,对于所有实验,请先完成 :ref:`共同依赖 ` 中的依赖安装,
-这一步已经包括了 **FSDP + Huggingface** 的完整配置。
+第一步,对于所有实验类型,请先完成 :ref:`共同依赖 <common-dependencies>` 中的依赖安装。

-第二步,如果你的实验使用的是 **Megatron 和 SGLang/vLLM** 后端,
-请参考 :ref:`Megatron 和 SGLang/vLLM 依赖 ` 安装相应依赖。
-(具身智能实验此步可忽略)
+第二步,根据你的实验类型,安装对应的依赖。

-第三步,如果你要运行具身智能相关实验(如 OpenVLA、OpenVLA-OFT、Pi0),
-请参考 :ref:`具身智能相关依赖 ` 安装专用依赖项。
+* 如果你要运行数学推理实验,需要安装 **Megatron 和 SGLang/vLLM** 后端,请参考 :ref:`Megatron 和 SGLang/vLLM 依赖 ` 安装相应依赖。
+
+* 如果你要运行具身智能相关实验(如 OpenVLA、OpenVLA-OFT、Pi0),请参考 :ref:`具身智能相关依赖 ` 安装专用依赖项。

.. _common-dependencies:

@@ -155,6 +159,10 @@ RLinf 提供两种安装方式。我们 **推荐使用 Docker**,因为这可
Megatron 和 SGLang/vLLM 依赖
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+.. note::
+   如果你运行的是具身智能实验,则无需安装这些依赖。
+   请直接跳转到 :ref:`具身智能相关依赖 ` 部分。
+
运行以下命令,安装 Megatron、SGLang/vLLM 及其所需依赖:

.. code-block:: shell
diff --git a/docs/source-zh/rst_source/start/vla.rst b/docs/source-zh/rst_source/start/vla.rst
index fa0dede6d..fc7b5a537 100644
--- a/docs/source-zh/rst_source/start/vla.rst
+++ b/docs/source-zh/rst_source/start/vla.rst
@@ -34,6 +34,14 @@ ManiSkill3 是一个基于 GPU 加速的机器人研究仿真平台,

**步骤 2:运行官方提供的训练脚本**

+..
note:: + 如果你是通过 Docker 镜像安装的 **RLinf**(见 :doc:`./installation`),请确保已切换到目标模型对应的 Python 环境。 + 默认环境为 ``openvla``。 + 若使用 OpenVLA-OFT 或 Pi0,请使用内置脚本 `switch_env` 切换环境: + ``source switch_env openvla-oft`` 或 ``source switch_env pi0``。 + + 如果你是通过自定义环境安装的 **RLinf**,请确保已安装对应模型的依赖,详见 :doc:`./installation`。 + 为方便使用,我们提供的配置文件需要至少双卡进行训练。 如果你有多张 GPU 并希望加快训练速度, 建议你修改配置文件 diff --git a/examples/embodiment/run_embodiment.sh b/examples/embodiment/run_embodiment.sh index 19134dfbc..6f31890c6 100644 --- a/examples/embodiment/run_embodiment.sh +++ b/examples/embodiment/run_embodiment.sh @@ -7,12 +7,8 @@ export SRC_FILE="${EMBODIED_PATH}/train_embodied_agent.py" export MUJOCO_GL="egl" export PYOPENGL_PLATFORM="egl" export PYTHONPATH=${REPO_PATH}:$PYTHONPATH -# NOTE: set LIBERO_REPO_PATH to the path of the LIBERO repo -export LIBERO_REPO_PATH="/path/to/repo/LIBERO" -# NOTE: set LIBERO_CONFIG_PATH for libero/libero/__init__.py -export LIBERO_CONFIG_PATH=${LIBERO_REPO_PATH} +export LIBERO_CONFIG_PATH="/path/to/repo/LIBERO_CONFIG_PATH" -export PYTHONPATH=${LIBERO_REPO_PATH}:$PYTHONPATH export CUDA_LAUNCH_BLOCKING=1 export HYDRA_FULL_ERROR=1 @@ -23,6 +19,7 @@ else CONFIG_NAME=$1 fi +echo "Using Python at $(which python)" LOG_DIR="${REPO_PATH}/logs/$(date +'%Y%m%d-%H:%M:%S')" #/$(date +'%Y%m%d-%H:%M:%S')" MEGA_LOG_FILE="${LOG_DIR}/run_embodiment.log" mkdir -p "${LOG_DIR}" diff --git a/pyproject.toml b/pyproject.toml index b6a00cbcb..ebf2c57e4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,9 +65,16 @@ embodied = [ "tensorflow_graphics", "peft==0.11.1", "timm==0.9.10", + "sapien==3.0.1;platform_system=='Linux'", "mani_skill @ git+https://github.com/haosulab/ManiSkill.git", "tensordict", - "libero @ git+https://github.com/Lifelong-Robot-Learning/LIBERO.git" + "libero @ git+https://github.com/RLinf/LIBERO.git", + "imageio[ffmpeg]", + "robosuite==1.4.1", + "bddl", + "easydict", + "cloudpickle", + "gym", ] [tool.uv] diff --git a/requirements/install_embodied_deps.sh b/requirements/install_embodied_deps.sh index bed7887ec..1bb90fd8b 100755 --- a/requirements/install_embodied_deps.sh +++ b/requirements/install_embodied_deps.sh @@ -3,6 +3,8 @@ # Embodied dependencies apt-get update -y apt-get install -y --no-install-recommends \ + wget \ + unzip \ libibverbs-dev \ mesa-utils \ libosmesa6-dev \ @@ -23,4 +25,8 @@ apt-get install -y --no-install-recommends \ python -m mani_skill.utils.download_asset bridge_v2_real2sim -y python -m mani_skill.utils.download_asset widowx250s -y +PHYSX_VERSION=105.1-physx-5.3.1.patch0 +PHYSX_DIR=~/.sapien/physx/$PHYSX_VERSION +mkdir -p $PHYSX_DIR && wget -O $PHYSX_DIR/linux-so.zip https://github.com/sapien-sim/physx-precompiled/releases/download/$PHYSX_VERSION/linux-so.zip && unzip $PHYSX_DIR/linux-so.zip -d $PHYSX_DIR && rm $PHYSX_DIR/linux-so.zip + diff --git a/requirements/openvla.txt b/requirements/openvla.txt index cb202fc27..c9d8a97cc 100644 --- a/requirements/openvla.txt +++ b/requirements/openvla.txt @@ -1,8 +1,2 @@ openvla @ git+https://github.com/openvla/openvla.git -flash-attn==2.5.5 -imageio[ffmpeg] -robosuite==1.4.1 -bddl -easydict -cloudpickle -gym \ No newline at end of file +flash-attn==2.5.5 \ No newline at end of file diff --git a/requirements/openvla_oft.txt b/requirements/openvla_oft.txt index 424c466c5..886134df0 100644 --- a/requirements/openvla_oft.txt +++ b/requirements/openvla_oft.txt @@ -1,9 +1,3 @@ openvla_oft @ git+https://github.com/moojink/openvla-oft.git # https://github.com/openvla/openvla/blob/main/experiments/robot/libero/libero_requirements.txt 
-flash-attn==2.5.5 -imageio[ffmpeg] -robosuite==1.4.1 -bddl -easydict -cloudpickle -gym \ No newline at end of file +flash-attn==2.5.5 \ No newline at end of file diff --git a/rlinf/scheduler/worker/worker.py b/rlinf/scheduler/worker/worker.py index 7e2c73191..98d485635 100644 --- a/rlinf/scheduler/worker/worker.py +++ b/rlinf/scheduler/worker/worker.py @@ -313,7 +313,9 @@ class Worker(metaclass=WorkerMeta): PID = None current_worker = None + logging.basicConfig() logger = logging.getLogger(Cluster.SYS_NAME) + logger.setLevel(logging.INFO) torch_platform = torch.cuda torch_device_type = "cuda" diff --git a/tests/e2e_tests/embodied/run_openvla.sh b/tests/e2e_tests/embodied/run_openvla.sh index 34a9b7ac9..d5304dece 100644 --- a/tests/e2e_tests/embodied/run_openvla.sh +++ b/tests/e2e_tests/embodied/run_openvla.sh @@ -6,4 +6,5 @@ tabs 4 export PYTHONPATH=${REPO_PATH}:$PYTHONPATH unset HOME # GitHub action sets HOME to a wrong path (/github/home), breaking simulator +source switch_env openvla python ${REPO_PATH}/examples/embodiment/train_embodied_agent.py --config-path ${REPO_PATH}/tests/e2e_tests/embodied --config-name ppo_openvla \ No newline at end of file From a475bc4811a822e895e55d9e671ffc6d9f803fcc Mon Sep 17 00:00:00 2001 From: WinstonWmj <983289917@qq.com> Date: Sat, 11 Oct 2025 14:46:34 +0800 Subject: [PATCH 14/57] fix(embodied): fix function of ordered reset id; make gradient ckpt into optional; override get_benchmark to support multi-task training; (#153) * fix(embodied): fix function of ordered reset id; make gradient ckpt into optional; override get_benchmark to support multi-task training; change the default gradient ckpt as True; add yaml for libero90 and libero130; change the way of overriding get_benchmark; use logging.info; Signed-off-by: weimingjie --- .github/workflows/embodied_e2e.yml | 18 +- .../config/env/eval/libero_130.yaml | 27 +++ .../embodiment/config/env/eval/libero_90.yaml | 27 +++ .../{libero_10_ppo.yaml => libero_10.yaml} | 4 +- .../config/env/train/libero_130.yaml | 29 ++++ .../{libero_10_grpo.yaml => libero_90.yaml} | 2 +- .../config/libero_10_grpo_openvlaoft.yaml | 1 + .../libero_10_grpo_openvlaoft_eval.yaml | 1 + .../config/libero_10_ppo_openvlaoft.yaml | 1 + .../config/libero_130_grpo_openvlaoft.yaml | 164 ++++++++++++++++++ .../config/libero_90_grpo_openvlaoft.yaml | 164 ++++++++++++++++++ .../config/libero_goal_grpo_openvlaoft.yaml | 1 + .../config/libero_object_grpo_openvlaoft.yaml | 1 + .../libero_spatial_grpo_openvlaoft.yaml | 1 + .../config/maniskill_grpo_openvla.yaml | 1 + .../config/maniskill_grpo_openvlaoft.yaml | 1 + .../config/maniskill_ppo_openvla.yaml | 1 + .../config/maniskill_ppo_openvla_eval.yaml | 161 +++++++++++++++++ .../maniskill_ppo_openvla_quickstart.yaml | 1 + .../config/maniskill_ppo_openvlaoft.yaml | 1 + .../config/robotwin_ppo_openvlaoft.yaml | 1 + rlinf/envs/libero/libero_env.py | 11 +- rlinf/envs/libero/utils.py | 46 +++++ .../hybrid_engines/fsdp/fsdp_model_manager.py | 9 +- rlinf/utils/logging.py | 20 +++ .../embodied/libero_130_grpo_openvlaoft.yaml | 164 ++++++++++++++++++ .../embodied/run_openvlaoft_libero130.sh | 13 ++ 27 files changed, 862 insertions(+), 9 deletions(-) create mode 100644 examples/embodiment/config/env/eval/libero_130.yaml create mode 100644 examples/embodiment/config/env/eval/libero_90.yaml rename examples/embodiment/config/env/train/{libero_10_ppo.yaml => libero_10.yaml} (91%) create mode 100644 examples/embodiment/config/env/train/libero_130.yaml rename 
examples/embodiment/config/env/train/{libero_10_grpo.yaml => libero_90.yaml} (95%) create mode 100644 examples/embodiment/config/libero_130_grpo_openvlaoft.yaml create mode 100644 examples/embodiment/config/libero_90_grpo_openvlaoft.yaml create mode 100644 examples/embodiment/config/maniskill_ppo_openvla_eval.yaml create mode 100644 rlinf/utils/logging.py create mode 100644 tests/e2e_tests/embodied/libero_130_grpo_openvlaoft.yaml create mode 100644 tests/e2e_tests/embodied/run_openvlaoft_libero130.sh diff --git a/.github/workflows/embodied_e2e.yml b/.github/workflows/embodied_e2e.yml index 5e8a999bc..16fb7a275 100644 --- a/.github/workflows/embodied_e2e.yml +++ b/.github/workflows/embodied_e2e.yml @@ -55,4 +55,20 @@ jobs: timeout-minutes: 20 run: | export REPO_PATH=$(pwd) - bash tests/e2e_tests/embodied/run_openvla.sh \ No newline at end of file + bash tests/e2e_tests/embodied/run_openvla.sh + openvlaoft-grpo-test: + runs-on: rlinf + container: + image: rlinf/rlinf:agentic-openvlaoft-rlinf0.1-torch2.5.1 + volumes: + - /mnt/public/dataset:/workspace/dataset + - /mnt/public/tokenizer:/workspace/tokenizer + options: --gpus="all" --shm-size=2g -e NVIDIA_DRIVER_CAPABILITIES="compute,utility,graphics" + + steps: + - name: Checkout code + uses: actions/checkout@v4 + - name: OpenVLA-OFT test + run: | + export REPO_PATH=$(pwd) + bash tests/e2e_tests/embodied/run_openvlaoft_libero130.sh \ No newline at end of file diff --git a/examples/embodiment/config/env/eval/libero_130.yaml b/examples/embodiment/config/env/eval/libero_130.yaml new file mode 100644 index 000000000..c30f619af --- /dev/null +++ b/examples/embodiment/config/env/eval/libero_130.yaml @@ -0,0 +1,27 @@ +simulator_type: libero +task_suite_name: libero_130 + +auto_reset: True +ignore_terminations: True +max_episode_steps: 512 + +use_rel_reward: True +reward_coef: 5.0 +only_eval: True + +seed: 0 +num_group: ${env.eval.num_envs} +group_size: 1 +use_fixed_reset_state_ids: True +num_images_in_input: 1 + +num_envs: 500 + +video_cfg: + save_video: True + info_on_video: True + video_base_dir: ${runner.logger.log_path}/video/train + +init_params: + camera_heights: 256 + camera_widths: 256 \ No newline at end of file diff --git a/examples/embodiment/config/env/eval/libero_90.yaml b/examples/embodiment/config/env/eval/libero_90.yaml new file mode 100644 index 000000000..be380c000 --- /dev/null +++ b/examples/embodiment/config/env/eval/libero_90.yaml @@ -0,0 +1,27 @@ +simulator_type: libero +task_suite_name: libero_90 + +auto_reset: True +ignore_terminations: True +max_episode_steps: 512 + +use_rel_reward: True +reward_coef: 5.0 +only_eval: True + +seed: 0 +num_group: ${env.eval.num_envs} +group_size: 1 +use_fixed_reset_state_ids: True +num_images_in_input: 1 + +num_envs: 500 + +video_cfg: + save_video: True + info_on_video: True + video_base_dir: ${runner.logger.log_path}/video/train + +init_params: + camera_heights: 256 + camera_widths: 256 \ No newline at end of file diff --git a/examples/embodiment/config/env/train/libero_10_ppo.yaml b/examples/embodiment/config/env/train/libero_10.yaml similarity index 91% rename from examples/embodiment/config/env/train/libero_10_ppo.yaml rename to examples/embodiment/config/env/train/libero_10.yaml index 0147d3b1a..d6dc7807a 100644 --- a/examples/embodiment/config/env/train/libero_10_ppo.yaml +++ b/examples/embodiment/config/env/train/libero_10.yaml @@ -24,4 +24,6 @@ video_cfg: init_params: camera_heights: 256 - camera_widths: 256 \ No newline at end of file + camera_widths: 256 + 
+use_ordered_reset_state_ids: True diff --git a/examples/embodiment/config/env/train/libero_130.yaml b/examples/embodiment/config/env/train/libero_130.yaml new file mode 100644 index 000000000..5be83a8a0 --- /dev/null +++ b/examples/embodiment/config/env/train/libero_130.yaml @@ -0,0 +1,29 @@ +simulator_type: libero +task_suite_name: libero_130 + +auto_reset: ${algorithm.auto_reset} +ignore_terminations: ${algorithm.ignore_terminations} +max_episode_steps: 512 + +use_rel_reward: True +reward_coef: 5.0 +only_eval: False + +seed: 0 +num_group: ${algorithm.num_group_envs} +group_size: ${algorithm.group_size} +use_fixed_reset_state_ids: ${algorithm.use_fixed_reset_state_ids} +num_images_in_input: 1 + +num_envs: ${multiply:${algorithm.group_size}, ${algorithm.num_group_envs}} + +video_cfg: + save_video: True + info_on_video: True + video_base_dir: ${runner.logger.log_path}/video/train + +init_params: + camera_heights: 256 + camera_widths: 256 + +use_ordered_reset_state_ids: True \ No newline at end of file diff --git a/examples/embodiment/config/env/train/libero_10_grpo.yaml b/examples/embodiment/config/env/train/libero_90.yaml similarity index 95% rename from examples/embodiment/config/env/train/libero_10_grpo.yaml rename to examples/embodiment/config/env/train/libero_90.yaml index d25e1a57b..97c074b87 100644 --- a/examples/embodiment/config/env/train/libero_10_grpo.yaml +++ b/examples/embodiment/config/env/train/libero_90.yaml @@ -1,5 +1,5 @@ simulator_type: libero -task_suite_name: libero_10 +task_suite_name: libero_90 auto_reset: ${algorithm.auto_reset} ignore_terminations: ${algorithm.ignore_terminations} diff --git a/examples/embodiment/config/libero_10_grpo_openvlaoft.yaml b/examples/embodiment/config/libero_10_grpo_openvlaoft.yaml index 3f15ff2ba..7d76f2c5c 100644 --- a/examples/embodiment/config/libero_10_grpo_openvlaoft.yaml +++ b/examples/embodiment/config/libero_10_grpo_openvlaoft.yaml @@ -140,6 +140,7 @@ actor: attn_implementation: "flash_attention_2" low_cpu_mem_usage: True trust_remote_code: True + gradient_checkpointing: False optim: lr: 2.0e-5 diff --git a/examples/embodiment/config/libero_10_grpo_openvlaoft_eval.yaml b/examples/embodiment/config/libero_10_grpo_openvlaoft_eval.yaml index f409707d9..73a0d1511 100644 --- a/examples/embodiment/config/libero_10_grpo_openvlaoft_eval.yaml +++ b/examples/embodiment/config/libero_10_grpo_openvlaoft_eval.yaml @@ -141,6 +141,7 @@ actor: attn_implementation: "flash_attention_2" low_cpu_mem_usage: True trust_remote_code: True + gradient_checkpointing: False optim: lr: 2.0e-5 diff --git a/examples/embodiment/config/libero_10_ppo_openvlaoft.yaml b/examples/embodiment/config/libero_10_ppo_openvlaoft.yaml index ba0d90d0d..af0fb7316 100644 --- a/examples/embodiment/config/libero_10_ppo_openvlaoft.yaml +++ b/examples/embodiment/config/libero_10_ppo_openvlaoft.yaml @@ -135,6 +135,7 @@ actor: attn_implementation: "flash_attention_2" low_cpu_mem_usage: True trust_remote_code: True + gradient_checkpointing: False optim: lr: 5e-6 diff --git a/examples/embodiment/config/libero_130_grpo_openvlaoft.yaml b/examples/embodiment/config/libero_130_grpo_openvlaoft.yaml new file mode 100644 index 000000000..1e71c0868 --- /dev/null +++ b/examples/embodiment/config/libero_130_grpo_openvlaoft.yaml @@ -0,0 +1,164 @@ +defaults: + - env/train: libero_130 + - env/eval: libero_130 + - override hydra/job_logging: stdout + +hydra: + run: + dir: . 
+ output_subdir: null + searchpath: + - file://${oc.env:EMBODIED_PATH}/config/ + +cluster: + num_nodes: 1 + component_placement: + actor,env,rollout: all + +runner: + task_type: embodied + logger: + log_path: "../results" + project_name: rlinf + experiment_name: "test_openvla" + logger_backends: ["tensorboard"] # wandb, swanlab + + max_epochs: 1000 + max_steps: -1 + + only_eval: False + val_check_interval: -1 + save_interval: 25 + seq_length: 4096 + max_prompt_length: 128 + +algorithm: + auto_reset: False + ignore_terminations: False + use_fixed_reset_state_ids: True + require_values: False + shuffle_samples: True + normalize_advantages: True + kl_penalty: kl # how to estimate kl divergence: kl or kl_penalty + group_size: 8 + n_chunk_steps: ${int_div:${env.train.max_episode_steps}, ${actor.model.num_action_chunks}} + n_eval_chunk_steps: ${int_div:${env.train.max_episode_steps}, ${actor.model.num_action_chunks}} + num_group_envs: 8 + rollout_epoch: 8 + reward_type: step_level # step_level or chunk_level + logprob_type: token_level + entropy_type: token_level + + adv_type: embodied_grpo + loss_type: embodied_grpo + loss_agg_func: "token-mean" + kl_beta: 0.0 + entropy_bonus: 0 + clip_ratio_high: 0.28 + clip_ratio_low: 0.2 + clip_ratio_c: 3.0 + value_clip: 0.2 + huber_delta: 10.0 + + gamma: 0.99 + gae_lambda: 0.95 + + sampling_params: + use_greedy: False + temperature_train: 1.6 + temperature_eval: 1.6 + top_k: -1 + top_p: 1.0 + repetition_penalty: 1.0 + + # length argument for autoregressive sampling + # max length means max amount of tokens to generate + length_params: + max_new_token: null + max_length: 1024 + min_length: 1 + + filter_rewards: True + rewards_lower_bound: 0.5 + rewards_upper_bound: 4.5 + +env: + group_name: "EnvGroup" + channel: + name: "env_buffer_list" + queue_name: "obs_buffer" + queue_size: 0 + enable_offload: False + +rollout: + group_name: "RolloutGroup" + channel: + name: ${env.channel.name} + queue_name: "action_buffer" + queue_size: 0 + mode: "colocate" + generation_backend: "huggingface" + model_dir: "/path/to/model/Openvla-oft-SFT-libero130-traj1/" + + enable_offload: False + pipeline_stage_num: 1 + +actor: + group_name: "ActorGroup" + channel: + name: ${env.channel.name} + queue_name: "replay_buffer" + queue_size: 0 + training_backend: "fsdp" + checkpoint_load_path: "/path/to/model/Openvla-oft-SFT-libero130-traj1/" + checkpoint_save_path: "../results" + micro_batch_size: 32 + global_batch_size: 8192 + seed: 1234 + enable_offload: False + + model: + model_name: "openvla_oft" + value_type: ${algorithm.reward_type} # 'action' or 'token' + action_dim: 7 + num_action_chunks: 8 + use_proprio: False + unnorm_key: libero_130_no_noops + center_crop: True + + precision: "bf16" + add_bias_linear: False + add_qkv_bias: True + vocab_size: 32000 + hidden_size: 4096 + policy_setup: "widowx_bridge" + vh_mode: "a0" + image_size: [224, 224] + is_lora: False + lora_rank: 32 + num_images_in_input: 1 + attn_implementation: "flash_attention_2" + low_cpu_mem_usage: True + trust_remote_code: True + gradient_checkpointing: False + + optim: + lr: 2.0e-5 + value_lr: 3.0e-3 + adam_beta1: 0.9 + adam_beta2: 0.999 + adam_eps: 1.0e-05 + clip_grad: 1.0 + + tokenizer: + tokenizer_type: "HuggingFaceTokenizer" + tokenizer_model: "/path/to/model/RLinf-OpenVLAOFT-GRPO-LIBERO-130/" + use_fast: False + trust_remote_code: True + padding_side: "right" + +reward: + use_reward_model: False + +critic: + use_critic_model: False \ No newline at end of file diff --git 
a/examples/embodiment/config/libero_90_grpo_openvlaoft.yaml b/examples/embodiment/config/libero_90_grpo_openvlaoft.yaml new file mode 100644 index 000000000..ec400877f --- /dev/null +++ b/examples/embodiment/config/libero_90_grpo_openvlaoft.yaml @@ -0,0 +1,164 @@ +defaults: + - env/train: libero_90 + - env/eval: libero_90 + - override hydra/job_logging: stdout + +hydra: + run: + dir: . + output_subdir: null + searchpath: + - file://${oc.env:EMBODIED_PATH}/config/ + +cluster: + num_nodes: 1 + component_placement: + actor,env,rollout: all + +runner: + task_type: embodied + logger: + log_path: "../results" + project_name: rlinf + experiment_name: "test_openvla" + logger_backends: ["tensorboard"] # wandb, swanlab + + max_epochs: 1000 + max_steps: -1 + + only_eval: False + val_check_interval: -1 + save_interval: 25 + seq_length: 4096 + max_prompt_length: 128 + +algorithm: + auto_reset: False + ignore_terminations: False + use_fixed_reset_state_ids: True + require_values: False + shuffle_samples: True + normalize_advantages: True + kl_penalty: kl # how to estimate kl divergence: kl or kl_penalty + group_size: 8 + n_chunk_steps: ${int_div:${env.train.max_episode_steps}, ${actor.model.num_action_chunks}} + n_eval_chunk_steps: ${int_div:${env.train.max_episode_steps}, ${actor.model.num_action_chunks}} + num_group_envs: 8 + rollout_epoch: 8 + reward_type: step_level # step_level or chunk_level + logprob_type: token_level + entropy_type: token_level + + adv_type: embodied_grpo + loss_type: embodied_grpo + loss_agg_func: "token-mean" + kl_beta: 0.0 + entropy_bonus: 0 + clip_ratio_high: 0.28 + clip_ratio_low: 0.2 + clip_ratio_c: 3.0 + value_clip: 0.2 + huber_delta: 10.0 + + gamma: 0.99 + gae_lambda: 0.95 + + sampling_params: + use_greedy: False + temperature_train: 1.6 + temperature_eval: 1.6 + top_k: -1 + top_p: 1.0 + repetition_penalty: 1.0 + + # length argument for autoregressive sampling + # max length means max amount of tokens to generate + length_params: + max_new_token: null + max_length: 1024 + min_length: 1 + + filter_rewards: True + rewards_lower_bound: 0.5 + rewards_upper_bound: 4.5 + +env: + group_name: "EnvGroup" + channel: + name: "env_buffer_list" + queue_name: "obs_buffer" + queue_size: 0 + enable_offload: False + +rollout: + group_name: "RolloutGroup" + channel: + name: ${env.channel.name} + queue_name: "action_buffer" + queue_size: 0 + mode: "colocate" + generation_backend: "huggingface" + model_dir: "/path/to/model/RLinf-OpenVLAOFT-GRPO-LIBERO-90/" + + enable_offload: False + pipeline_stage_num: 1 + +actor: + group_name: "ActorGroup" + channel: + name: ${env.channel.name} + queue_name: "replay_buffer" + queue_size: 0 + training_backend: "fsdp" + checkpoint_load_path: "/path/to/model/RLinf-OpenVLAOFT-GRPO-LIBERO-90/" + checkpoint_save_path: "../results" + micro_batch_size: 32 + global_batch_size: 8192 + seed: 1234 + enable_offload: False + + model: + model_name: "openvla_oft" + value_type: ${algorithm.reward_type} # 'action' or 'token' + action_dim: 7 + num_action_chunks: 8 + use_proprio: False + unnorm_key: libero_90_no_noops + center_crop: True + + precision: "bf16" + add_bias_linear: False + add_qkv_bias: True + vocab_size: 32000 + hidden_size: 4096 + policy_setup: "widowx_bridge" + vh_mode: "a0" + image_size: [224, 224] + is_lora: False + lora_rank: 32 + num_images_in_input: 1 + attn_implementation: "flash_attention_2" + low_cpu_mem_usage: True + trust_remote_code: True + gradient_checkpointing: False + + optim: + lr: 2.0e-5 + value_lr: 3.0e-3 + adam_beta1: 0.9 + adam_beta2: 
0.999 + adam_eps: 1.0e-05 + clip_grad: 1.0 + + tokenizer: + tokenizer_type: "HuggingFaceTokenizer" + tokenizer_model: "/path/to/model/RLinf-OpenVLAOFT-GRPO-LIBERO-90/" + use_fast: False + trust_remote_code: True + padding_side: "right" + +reward: + use_reward_model: False + +critic: + use_critic_model: False \ No newline at end of file diff --git a/examples/embodiment/config/libero_goal_grpo_openvlaoft.yaml b/examples/embodiment/config/libero_goal_grpo_openvlaoft.yaml index 80a691848..b95bad55c 100644 --- a/examples/embodiment/config/libero_goal_grpo_openvlaoft.yaml +++ b/examples/embodiment/config/libero_goal_grpo_openvlaoft.yaml @@ -139,6 +139,7 @@ actor: attn_implementation: "flash_attention_2" low_cpu_mem_usage: True trust_remote_code: True + gradient_checkpointing: False optim: lr: 2.0e-5 diff --git a/examples/embodiment/config/libero_object_grpo_openvlaoft.yaml b/examples/embodiment/config/libero_object_grpo_openvlaoft.yaml index 37326ef1e..a75a87596 100644 --- a/examples/embodiment/config/libero_object_grpo_openvlaoft.yaml +++ b/examples/embodiment/config/libero_object_grpo_openvlaoft.yaml @@ -139,6 +139,7 @@ actor: attn_implementation: "flash_attention_2" low_cpu_mem_usage: True trust_remote_code: True + gradient_checkpointing: False optim: lr: 2.0e-5 diff --git a/examples/embodiment/config/libero_spatial_grpo_openvlaoft.yaml b/examples/embodiment/config/libero_spatial_grpo_openvlaoft.yaml index eb6f1cf0c..bab9bd4cd 100644 --- a/examples/embodiment/config/libero_spatial_grpo_openvlaoft.yaml +++ b/examples/embodiment/config/libero_spatial_grpo_openvlaoft.yaml @@ -139,6 +139,7 @@ actor: attn_implementation: "flash_attention_2" low_cpu_mem_usage: True trust_remote_code: True + gradient_checkpointing: False optim: lr: 2.0e-5 diff --git a/examples/embodiment/config/maniskill_grpo_openvla.yaml b/examples/embodiment/config/maniskill_grpo_openvla.yaml index 7c77d581c..4b0500673 100644 --- a/examples/embodiment/config/maniskill_grpo_openvla.yaml +++ b/examples/embodiment/config/maniskill_grpo_openvla.yaml @@ -148,6 +148,7 @@ actor: attn_implementation: "flash_attention_2" low_cpu_mem_usage: True trust_remote_code: True + gradient_checkpointing: False optim: lr: 1.0e-5 diff --git a/examples/embodiment/config/maniskill_grpo_openvlaoft.yaml b/examples/embodiment/config/maniskill_grpo_openvlaoft.yaml index b15116e4c..cb752180c 100644 --- a/examples/embodiment/config/maniskill_grpo_openvlaoft.yaml +++ b/examples/embodiment/config/maniskill_grpo_openvlaoft.yaml @@ -140,6 +140,7 @@ actor: attn_implementation: "flash_attention_2" low_cpu_mem_usage: True trust_remote_code: True + gradient_checkpointing: False tokenizer: tokenizer_type: "HuggingFaceTokenizer" diff --git a/examples/embodiment/config/maniskill_ppo_openvla.yaml b/examples/embodiment/config/maniskill_ppo_openvla.yaml index 7af5a9d9f..9315b7db8 100644 --- a/examples/embodiment/config/maniskill_ppo_openvla.yaml +++ b/examples/embodiment/config/maniskill_ppo_openvla.yaml @@ -137,6 +137,7 @@ actor: attn_implementation: "flash_attention_2" low_cpu_mem_usage: True trust_remote_code: True + gradient_checkpointing: False optim: lr: 1.0e-4 diff --git a/examples/embodiment/config/maniskill_ppo_openvla_eval.yaml b/examples/embodiment/config/maniskill_ppo_openvla_eval.yaml new file mode 100644 index 000000000..034130c7b --- /dev/null +++ b/examples/embodiment/config/maniskill_ppo_openvla_eval.yaml @@ -0,0 +1,161 @@ +defaults: + - env/eval: maniskill_ood_template + - env/train: PutCarrotOnPlateInScene + - override hydra/job_logging: stdout + +hydra: 
+ run: + dir: . + output_subdir: null + searchpath: + - file://${oc.env:EMBODIED_PATH}/config/ + +cluster: + num_nodes: 1 + component_placement: + actor: 0-3 + env: 0-3 + rollout: 0-3 + +runner: + task_type: embodied + logger: + log_path: "./logs" + project_name: rlinf + experiment_name: "test_openvla" + logger_backends: ["tensorboard"] # wandb, swanlab + + max_epochs: 1000 + max_steps: -1 + + only_eval: True + val_check_interval: -1 + save_interval: 40 + seq_length: 4096 + max_prompt_length: 30 + +algorithm: + auto_reset: True + ignore_terminations: True + use_fixed_reset_state_ids: False + require_values: True + shuffle_samples: True + normalize_advantages: True + kl_penalty: kl # how to estimate kl divergence: kl or kl_penalty + group_size: 1 + n_chunk_steps: 80 + n_eval_chunk_steps: 80 + num_group_envs: 120 + rollout_epoch: 1 + reward_type: step_level + logprob_type: action_level + entropy_type: action_level + + adv_type: embodied_gae + loss_type: embodied_ppo + loss_agg_func: "token-mean" + kl_beta: 0.0 + entropy_bonus: 0 + clip_ratio_high: 0.2 + clip_ratio_low: 0.2 + clip_ratio_c: 3.0 + value_clip: 0.2 + huber_delta: 10.0 + + gamma: 0.99 + gae_lambda: 0.95 + + sampling_params: + use_greedy: False + temperature_train: 1.0 + temperature_eval: 0.6 + top_k: 50 + top_p: 1.0 + repetition_penalty: 1.0 + + # length argument for autoregressive sampling + # max length means max amount of tokens to generate + length_params: + max_new_token: 7 + max_length: 1024 + min_length: 1 + +env: + group_name: "EnvGroup" + channel: + name: "env_buffer_list" + queue_name: "obs_buffer" + queue_size: 0 + enable_offload: True + +rollout: + group_name: "RolloutGroup" + channel: + name: ${env.channel.name} + queue_name: "action_buffer" + queue_size: 0 + mode: "colocate" + backend: "huggingface" + model_dir: "/path/to/model/openvla-7b-rlvla-rl/" + enable_offload: True + pipeline_stage_num: 2 + +actor: + group_name: "ActorGroup" + channel: + name: ${env.channel.name} + queue_name: "replay_buffer" + queue_size: 0 + training_backend: "fsdp" + checkpoint_load_path: "/path/to/model/openvla-7b-rlvla-rl/" + checkpoint_save_path: "/workspace/results" + micro_batch_size: 20 + global_batch_size: 160 + seed: 1234 + enable_offload: True + + tokenizer: + tokenizer_type: "HuggingFaceTokenizer" + tokenizer_model: "/path/to/model/openvla-7b-rlvla-rl/" + use_fast: False + trust_remote_code: True + padding_side: "right" + + model: + model_name: "openvla" + action_dim: 7 + num_action_chunks: 1 + use_proprio: False + unnorm_key: bridge_orig + center_crop: True + do_sample: False + + precision: "bf16" + add_bias_linear: False + add_qkv_bias: True + vocab_size: 32000 + hidden_size: 4096 + policy_setup: "widowx_bridge" + vh_mode: "a0" + image_size: [224, 224] + is_lora: False + lora_rank: 32 + ckpt_path: null + attn_implementation: "flash_attention_2" + low_cpu_mem_usage: True + trust_remote_code: True + gradient_checkpointing: False + + optim: + lr: 1.0e-4 + value_lr: 3.0e-3 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_eps: 1.0e-05 + clip_grad: 1.0 + +reward: + use_reward_model: False + +critic: + use_critic_model: False \ No newline at end of file diff --git a/examples/embodiment/config/maniskill_ppo_openvla_quickstart.yaml b/examples/embodiment/config/maniskill_ppo_openvla_quickstart.yaml index baad8b402..b277418f4 100644 --- a/examples/embodiment/config/maniskill_ppo_openvla_quickstart.yaml +++ b/examples/embodiment/config/maniskill_ppo_openvla_quickstart.yaml @@ -160,6 +160,7 @@ actor: attn_implementation: "flash_attention_2" 
low_cpu_mem_usage: True trust_remote_code: True + gradient_checkpointing: False optim: lr: 1.0e-4 diff --git a/examples/embodiment/config/maniskill_ppo_openvlaoft.yaml b/examples/embodiment/config/maniskill_ppo_openvlaoft.yaml index d9016cece..1e5893f43 100644 --- a/examples/embodiment/config/maniskill_ppo_openvlaoft.yaml +++ b/examples/embodiment/config/maniskill_ppo_openvlaoft.yaml @@ -142,6 +142,7 @@ actor: attn_implementation: "flash_attention_2" low_cpu_mem_usage: True trust_remote_code: True + gradient_checkpointing: False optim: lr: 1.0e-4 diff --git a/examples/embodiment/config/robotwin_ppo_openvlaoft.yaml b/examples/embodiment/config/robotwin_ppo_openvlaoft.yaml index 48686107a..30700e1c2 100644 --- a/examples/embodiment/config/robotwin_ppo_openvlaoft.yaml +++ b/examples/embodiment/config/robotwin_ppo_openvlaoft.yaml @@ -141,6 +141,7 @@ actor: attn_implementation: "flash_attention_2" low_cpu_mem_usage: True trust_remote_code: True + gradient_checkpointing: False optim: lr: 1.0e-4 diff --git a/rlinf/envs/libero/libero_env.py b/rlinf/envs/libero/libero_env.py index 8f916c0f2..ab117d0c1 100644 --- a/rlinf/envs/libero/libero_env.py +++ b/rlinf/envs/libero/libero_env.py @@ -20,11 +20,12 @@ import numpy as np import torch from libero.libero import get_libero_path -from libero.libero.benchmark import Benchmark, get_benchmark +from libero.libero.benchmark import Benchmark from libero.libero.envs import OffScreenRenderEnv from omegaconf.omegaconf import OmegaConf from rlinf.envs.libero.utils import ( + get_benchmark_overridden, get_libero_image, get_libero_wrist_image, list_of_dict_to_dict_of_list, @@ -56,7 +57,7 @@ def __init__(self, cfg, rank, world_size): self._generator_ordered = np.random.default_rng(seed=0) self.start_idx = 0 - self.task_suite: Benchmark = get_benchmark(cfg.task_suite_name)() + self.task_suite: Benchmark = get_benchmark_overridden(cfg.task_suite_name)() self._compute_total_num_group_envs() self.reset_state_ids_all = self.get_reset_state_ids_all() @@ -154,13 +155,13 @@ def get_reset_state_ids_all(self): return reset_state_ids def _get_ordered_reset_state_ids(self, num_reset_states): + if self.start_idx + num_reset_states > len(self.reset_state_ids_all[0]): + self.reset_state_ids_all = self.get_reset_state_ids_all() + self.start_idx = 0 reset_state_ids = self.reset_state_ids_all[self.rank][ self.start_idx : self.start_idx + num_reset_states ] self.start_idx = self.start_idx + num_reset_states - if self.start_idx >= len(self.reset_state_ids_all[0]): - self.reset_state_ids_all = self.get_reset_state_ids_all() - self.start_idx = 0 return reset_state_ids def _get_task_and_trial_ids_from_reset_state_ids(self, reset_state_ids): diff --git a/rlinf/envs/libero/utils.py b/rlinf/envs/libero/utils.py index e55c353e7..3c92f30d9 100644 --- a/rlinf/envs/libero/utils.py +++ b/rlinf/envs/libero/utils.py @@ -19,9 +19,11 @@ from typing import Any, Dict, List, Optional, Tuple, Union import imageio +import libero.libero.benchmark as benchmark import numpy as np import torch from libero.libero import get_libero_path +from libero.libero.benchmark import Benchmark from libero.libero.envs import OffScreenRenderEnv from PIL import Image, ImageDraw, ImageFont @@ -447,3 +449,47 @@ def save_rollout_video( for img in rollout_images: video_writer.append_data(img) video_writer.close() + + +def get_benchmark_overridden(benchmark_name) -> Benchmark: + """ + Return the Benchmark class for a given name. + For "libero_130": return a dynamically aggregated class from all suites. 
+    For others: delegate to the original LIBERO get_benchmark.
+
+    Args:
+        benchmark_name: Name of the benchmark to get
+
+    Returns:
+        Benchmark class
+    """
+    name = str(benchmark_name).lower()
+    if name != "libero_130":
+        return benchmark.get_benchmark(benchmark_name)
+
+    libero_cls = benchmark.BENCHMARK_MAPPING.get("libero_130", None)
+    if libero_cls is not None:
+        return libero_cls
+
+    # Build aggregated task map once, preserving order and de-duplicating by task name
+    aggregated_task_map: Dict[str, benchmark.Task] = {}
+    for suite_name in getattr(benchmark, "libero_suites", []):
+        suite_map = benchmark.task_maps.get(suite_name, {})
+        for task_name, task in suite_map.items():
+            if task_name not in aggregated_task_map:
+                aggregated_task_map[task_name] = task
+
+    class LIBERO_ALL(Benchmark):
+        def __init__(self, task_order_index=0):
+            super().__init__(task_order_index=task_order_index)
+            self.name = "libero_130"
+            self._make_benchmark()
+
+        def _make_benchmark(self):
+            tasks = list(aggregated_task_map.values())
+            self.tasks = tasks
+            self.n_tasks = len(self.tasks)
+
+    # Register for discoverability/help
+    benchmark.BENCHMARK_MAPPING["libero_130"] = LIBERO_ALL
+    return LIBERO_ALL
diff --git a/rlinf/hybrid_engines/fsdp/fsdp_model_manager.py b/rlinf/hybrid_engines/fsdp/fsdp_model_manager.py
index 9b333fcc5..8e3b9f16c 100644
--- a/rlinf/hybrid_engines/fsdp/fsdp_model_manager.py
+++ b/rlinf/hybrid_engines/fsdp/fsdp_model_manager.py
@@ -26,6 +26,7 @@
     get_fsdp_wrap_policy,
     init_fn,
 )
+from rlinf.utils.logging import get_logger
 from rlinf.utils.utils import clear_memory
 
@@ -36,6 +37,7 @@ class FSDPModelManager:
     def __init__(self, cfg: DictConfig):
         self._cfg = cfg
+        self.logger = get_logger()
         self.torch_dtype = torch_dtype_from_precision(self._cfg.model.precision)
 
         assert (
@@ -78,7 +80,12 @@ def setup_model_and_optimizer(self):
         """Setup model and optimizer."""
         module = self.model_provider_func()
 
-        module.gradient_checkpointing_enable()
+        # Enable gradient checkpointing if configured
+        if self._cfg.model.get("gradient_checkpointing", False):
+            self.logger.info("[FSDP] Enabling gradient checkpointing")
+            module.gradient_checkpointing_enable()
+        else:
+            self.logger.info("[FSDP] Gradient checkpointing is disabled")
 
         mixed_precision = MixedPrecision(
             param_dtype=self.torch_dtype,
diff --git a/rlinf/utils/logging.py b/rlinf/utils/logging.py
new file mode 100644
index 000000000..e80b6cf4f
--- /dev/null
+++ b/rlinf/utils/logging.py
@@ -0,0 +1,20 @@
+# Copyright 2025 The RLinf Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
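+
+# Illustrative usage (mirrors the FSDPModelManager call site above):
+#
+#     from rlinf.utils.logging import get_logger
+#
+#     logger = get_logger()
+#     logger.info("[FSDP] Enabling gradient checkpointing")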
+ + +def get_logger(): + """Get the logger instance of the current worker.""" + from rlinf.scheduler.worker import Worker + + return Worker.logger diff --git a/tests/e2e_tests/embodied/libero_130_grpo_openvlaoft.yaml b/tests/e2e_tests/embodied/libero_130_grpo_openvlaoft.yaml new file mode 100644 index 000000000..c5f5744b3 --- /dev/null +++ b/tests/e2e_tests/embodied/libero_130_grpo_openvlaoft.yaml @@ -0,0 +1,164 @@ +defaults: + - env/train: libero_130 + - env/eval: libero_130 + - override hydra/job_logging: stdout + +hydra: + run: + dir: . + output_subdir: null + searchpath: + - file://${oc.env:REPO_PATH}/examples/embodiment/config + +cluster: + num_nodes: 1 + component_placement: + actor,env,rollout: all + +runner: + task_type: embodied + logger: + log_path: /workspace/results/ + project_name: rlinf + experiment_name: "ci-test" + logger_backends: ["tensorboard"] # wandb, swanlab + + max_epochs: 3 + max_steps: -1 + + only_eval: False + val_check_interval: -1 + save_interval: -1 + seq_length: 4096 + max_prompt_length: 128 + +algorithm: + auto_reset: False + ignore_terminations: False + use_fixed_reset_state_ids: True + require_values: False + shuffle_samples: True + normalize_advantages: True + kl_penalty: kl # how to estimate kl divergence: kl or kl_penalty + group_size: 8 + n_chunk_steps: ${int_div:${env.train.max_episode_steps}, ${actor.model.num_action_chunks}} + n_eval_chunk_steps: ${int_div:${env.train.max_episode_steps}, ${actor.model.num_action_chunks}} + num_group_envs: 8 + rollout_epoch: 1 + reward_type: step_level # step_level or chunk_level + logprob_type: token_level + entropy_type: token_level + + adv_type: embodied_grpo + loss_type: embodied_grpo + loss_agg_func: "token-mean" + kl_beta: 0.0 + entropy_bonus: 0 + clip_ratio_high: 0.28 + clip_ratio_low: 0.2 + clip_ratio_c: 3.0 + value_clip: 0.2 + huber_delta: 10.0 + + gamma: 0.99 + gae_lambda: 0.95 + + sampling_params: + use_greedy: False + temperature_train: 1.6 + temperature_eval: 1.6 + top_k: -1 + top_p: 1.0 + repetition_penalty: 1.0 + + # length argument for autoregressive sampling + # max length means max amount of tokens to generate + length_params: + max_new_token: null + max_length: 1024 + min_length: 1 + + filter_rewards: True + rewards_lower_bound: 0.5 + rewards_upper_bound: 4.5 + +env: + group_name: "EnvGroup" + channel: + name: "env_buffer_list" + queue_name: "obs_buffer" + queue_size: 0 + enable_offload: False + +rollout: + group_name: "RolloutGroup" + channel: + name: ${env.channel.name} + queue_name: "action_buffer" + queue_size: 0 + mode: "colocate" + generation_backend: "huggingface" + model_dir: "/workspace/dataset/RLinf-OpenVLAOFT-LIBERO-130-Base-Lora" + + enable_offload: True + pipeline_stage_num: 1 + +actor: + group_name: "ActorGroup" + channel: + name: ${env.channel.name} + queue_name: "replay_buffer" + queue_size: 0 + training_backend: "fsdp" + checkpoint_load_path: "/workspace/dataset/RLinf-OpenVLAOFT-LIBERO-130-Base-Lora" + checkpoint_save_path: "/workspace/results" + micro_batch_size: 2 + global_batch_size: 256 + seed: 1234 + enable_offload: True + + model: + model_name: "openvla_oft" + value_type: ${algorithm.reward_type} # 'action' or 'token' + action_dim: 7 + num_action_chunks: 8 + use_proprio: False + unnorm_key: libero_130_no_noops_trajall + center_crop: True + + precision: "bf16" + add_bias_linear: False + add_qkv_bias: True + vocab_size: 32000 + hidden_size: 4096 + policy_setup: "widowx_bridge" + vh_mode: "a0" + image_size: [224, 224] + is_lora: False + lora_rank: 32 + num_images_in_input: 1 + 
attn_implementation: "flash_attention_2" + low_cpu_mem_usage: True + trust_remote_code: True + gradient_checkpointing: False + + optim: + lr: 2.0e-5 + value_lr: 3.0e-3 + adam_beta1: 0.9 + adam_beta2: 0.999 + adam_eps: 1.0e-05 + clip_grad: 1.0 + + tokenizer: + tokenizer_type: "HuggingFaceTokenizer" + tokenizer_model: "/workspace/dataset/RLinf-OpenVLAOFT-LIBERO-130-Base-Lora" + use_fast: False + trust_remote_code: True + padding_side: "right" + +reward: + use_reward_model: False + +critic: + use_critic_model: False \ No newline at end of file diff --git a/tests/e2e_tests/embodied/run_openvlaoft_libero130.sh b/tests/e2e_tests/embodied/run_openvlaoft_libero130.sh new file mode 100644 index 000000000..596146ff3 --- /dev/null +++ b/tests/e2e_tests/embodied/run_openvlaoft_libero130.sh @@ -0,0 +1,13 @@ +#! /bin/bash +set -x + +tabs 4 +export MUJOCO_GL="osmesa" +export PYOPENGL_PLATFORM="osmesa" +export LIBERO_REPO_PATH="/workspace/libero" +export LIBERO_CONFIG_PATH=${LIBERO_REPO_PATH} +export PYTHONPATH=${LIBERO_REPO_PATH}:$PYTHONPATH +export PYTHONPATH=${REPO_PATH}:$PYTHONPATH +unset HOME # GitHub action sets HOME to a wrong path (/github/home), breaking simulator + +python ${REPO_PATH}/examples/embodiment/train_embodied_agent.py --config-path ${REPO_PATH}/tests/e2e_tests/embodied --config-name libero_130_grpo_openvlaoft \ No newline at end of file From e77a20ddd644572b11d85353b2af031ca2b74a88 Mon Sep 17 00:00:00 2001 From: qurakchin Date: Sat, 11 Oct 2025 15:22:03 +0800 Subject: [PATCH 15/57] feat: the open source version of cursor online rl for tab completion (#136) Signed-off-by: qurakchin Signed-off-by: yaozhuyu Co-authored-by: huangzx02 Co-authored-by: zhuchunyang Co-authored-by: yaozhuyu --- .github/workflows/coding_online_rl_e2e.yml | 62 +++ .../rst_source/examples/coding_online_rl.rst | 180 +++++++++ docs/source-en/rst_source/examples/index.rst | 5 +- .../rst_source/examples/coding_online_rl.rst | 181 +++++++++ docs/source-zh/rst_source/examples/index.rst | 3 +- .../config/qwen2.5-1.5b-ppo.yaml | 300 ++++++++++++++ .../config/tp_comm_overlap_cfg.yaml | 47 +++ .../coding_online_rl/main_coding_online_rl.py | 106 +++++ .../run_main_coding_online_rl.sh | 21 + .../coding_online_rl/simple_test_client.py | 110 +++++ .../math/config/qwen2.5-1.5b-single-gpu.yaml | 4 +- rlinf/algorithms/advantages.py | 162 ++++++++ rlinf/algorithms/losses.py | 16 +- rlinf/algorithms/registry.py | 19 + rlinf/config.py | 50 +++ rlinf/runners/coding_online_rl_runner.py | 318 +++++++++++++++ rlinf/runners/math_runner.py | 14 +- rlinf/utils/utils.py | 60 +-- rlinf/workers/actor/megatron_actor_worker.py | 24 +- .../rollout/server/online_router_worker.py | 250 ++++++++++++ .../rollout/server/server_rollout_worker.py | 377 ++++++++++++++++++ rlinf/workers/rollout/sglang/sglang_worker.py | 18 +- .../coding_online_rl/qwen2.5-1.5b-ppo.yaml | 300 ++++++++++++++ .../coding_online_rl/run_coding_online_rl.sh | 13 + toolkits/__init__.py | 19 + .../convert_hf_to_middle_file.py | 30 +- .../convert_middle_file_to_hf.py | 11 - toolkits/ckpt_convertor/default_args.yaml | 10 + toolkits/code_verifier/__init__.py | 13 + toolkits/code_verifier/verify.py | 42 ++ toolkits/math_verifier/verify.py | 2 + 31 files changed, 2676 insertions(+), 91 deletions(-) create mode 100644 .github/workflows/coding_online_rl_e2e.yml create mode 100644 docs/source-en/rst_source/examples/coding_online_rl.rst create mode 100644 docs/source-zh/rst_source/examples/coding_online_rl.rst create mode 100644 examples/coding_online_rl/config/qwen2.5-1.5b-ppo.yaml create 
mode 100644 examples/coding_online_rl/config/tp_comm_overlap_cfg.yaml create mode 100644 examples/coding_online_rl/main_coding_online_rl.py create mode 100644 examples/coding_online_rl/run_main_coding_online_rl.sh create mode 100644 examples/coding_online_rl/simple_test_client.py create mode 100644 rlinf/runners/coding_online_rl_runner.py create mode 100644 rlinf/workers/rollout/server/online_router_worker.py create mode 100644 rlinf/workers/rollout/server/server_rollout_worker.py create mode 100644 tests/e2e_tests/coding_online_rl/qwen2.5-1.5b-ppo.yaml create mode 100644 tests/e2e_tests/coding_online_rl/run_coding_online_rl.sh create mode 100644 toolkits/code_verifier/__init__.py create mode 100644 toolkits/code_verifier/verify.py diff --git a/.github/workflows/coding_online_rl_e2e.yml b/.github/workflows/coding_online_rl_e2e.yml new file mode 100644 index 000000000..8618b39c7 --- /dev/null +++ b/.github/workflows/coding_online_rl_e2e.yml @@ -0,0 +1,62 @@ +name: Coding Online RL End2End + +on: + push: + branches: + - 'release/v[0-9].[0-9]' + - main + paths: + - '**/*.py' + - 'tests/**' + - '.github/workflows/*.yml' + - '!docs/**' + - '!README.md' + - '!*.yaml' + - '!*.toml' + - '!ray_utils/**' + - '!requirements/**' + + pull_request: + branches: + - 'release/v[0-9].[0-9]' + - main + paths: + - '**/*.py' + - 'tests/**' + - '.github/workflows/*.yml' + - '!docs/**' + - '!README.md' + - '*.yaml' + - '*.toml' + - '!ray_utils/**' + - '!requirements/**' + +permissions: + contents: read + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + qwen-ppo-test: + runs-on: rlinf + container: + image: rlinf/rlinf:math-rlinf0.1-torch2.6.0-sglang0.4.6.post5-vllm0.8.5-megatron0.13.0-te2.1 + volumes: + - /mnt/public/dataset:/workspace/dataset + - /mnt/public/tokenizer:/workspace/tokenizer + options: --gpus="all" --shm-size=80g + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Install dependencies + run: | + pip install httpx asyncio fuzzywuzzy + + - name: SGLang Collocated mode + run: | + export REPO_PATH=$(pwd) + bash tests/e2e_tests/coding_online_rl/run_coding_online_rl.sh diff --git a/docs/source-en/rst_source/examples/coding_online_rl.rst b/docs/source-en/rst_source/examples/coding_online_rl.rst new file mode 100644 index 000000000..d8cc50daa --- /dev/null +++ b/docs/source-en/rst_source/examples/coding_online_rl.rst @@ -0,0 +1,180 @@ +Online Reinforcement Learning for Code Completion Agent +======================================================= + +Online Reinforcement Learning for Code Completion Agent is an important application scenario in the RLinf framework. +Through integration with code editors like Continue, we can collect user preference feedback on code completions, enabling near real-time code generation and feedback learning to quickly improve code completion quality and align with user preferences. +This example demonstrates how to use the RLinf framework to train a model capable of online code completion tasks. + +Overview +-------- + +The online reinforcement learning for code completion agent system works through the following process: + +1. **Real-time Interaction**: The system receives code completion requests from editors like Continue +2. **Model Inference**: Uses trained models to generate code completion suggestions +3. **User Feedback**: Collects user acceptance/rejection feedback on generated code +4. 
**Online Learning**: Updates model parameters in real-time based on user feedback

This real-time learning mechanism allows the model to quickly adapt to user programming habits and preferences.
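Concretely, this request/feedback loop consists of two HTTP calls. The following is a trimmed, synchronous sketch of the loop in ``simple_test_client.py`` (ports and routes follow the default config; the full client also sends identifiers and timing metadata with each feedback record):

.. code-block:: python

   import httpx

   prefix, suffix = "def add(a, b):\n    return ", "\n"

   # 1) Ask the completion service (port 8081) for a fill-in-the-middle completion.
   response = httpx.post(
       "http://127.0.0.1:8081/v1/completions",
       json={
           "model": "test-model",
           "prompt": f"<|fim_prefix|>{prefix}<|fim_suffix|>{suffix}<|fim_middle|>",
           "max_tokens": 50,
           "temperature": 0.7,
       },
       timeout=15.0,
   )
   completion = response.json()["choices"][0]["text"]

   # 2) Report the user's accept/reject decision to the feedback endpoint (port 8082).
   httpx.post(
       "http://127.0.0.1:8082/api/training/submit",
       headers={"Authorization": "Bearer test-token"},
       json={
           "prefix": prefix,
           "suffix": suffix,
           "completion": completion,
           "accepted": True,
       },
       timeout=15.0,
   )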
Running the Script
------------------

**Environment Setup**

First, ensure you have installed the RLinf framework and its dependencies:

.. code-block:: bash

   # Install additional dependencies
   pip install httpx asyncio fuzzywuzzy

**Configure Continue Integration**

1. **Install Continue Extension**

   Since the current Continue does not support uploading user preference feedback on code completions, we have modified the Continue source code to add this capability.
   Users can get the compiled modified Continue plugin from `here `_ or build it themselves.

   After downloading the compiled Continue plugin, install it in VS Code.

   Method 1: code --install-extension /path/to/continue-1.3.9.vsix

   Method 2: In VS Code, press Cmd+Shift+P, type 'Extensions: Install from VSIX', and select the above file

2. **Configure Continue Settings**

   The Continue configuration file path is:

   .. code-block:: bash

      ~/.continue/config.yaml

   Add the following settings to your Continue configuration file:

   .. code-block:: yaml

      # Please replace http://xxx:xx/ with the actual RLinf online code completion service address

      models:
        # Add a model for code completion
        - name: my-autocomplete
          provider: openai
          model: Qwen2.5-Coder-1.5B
          apiBase: http://xxx:8081/v1
          apiKey: xxx
          roles:
            - autocomplete

      # Add sending user feedback on whether to accept code completions
      tabAutocompleteOptions:
        enableCompletionTracking: true
        completionTrackingUrl: http://xxx:8082/api/training/submit
        completionTrackingHeaders:
          Authorization: "Bearer test-token"
          X-Project-ID: "test-project"
        maxPromptTokens: 1024
        debounceDelay: 350
        multilineCompletions: "auto"

   After modifying and saving, open the Continue extension from the left panel, click the "Settings" gear button in the top right corner, and ensure "Autocomplete Model" is set to my-autocomplete on the "Models" page.

**Start Training Service**

1. **Prepare Model and Configuration**

   Ensure you have pre-trained model weights, and edit the configuration file so that model paths, ports, and similar settings match your environment.

   .. code-block:: yaml

      rollout:
        model_dir: /path/to/your/model/Qwen2.5-Coder-1.5B/

      actor:
        tokenizer:
          tokenizer_model: /path/to/your/model/Qwen2.5-Coder-1.5B/

2. **Start RLinf Training Service**

   .. code-block:: bash

      # Navigate to project directory
      cd /path/to/rlinf_online_rl

      # Start training service
      bash examples/coding_online_rl/run_main_coding_online_rl.sh qwen2.5-1.5b-ppo

   This will start the following services:

   - **Inference Service**: Provides the code completion API on port 8081
   - **Training Service**: Receives user feedback data on port 8082

**Integration with Continue**

1. **Start Continue**

   Launch the Continue extension in VS Code, ensuring it connects to the correct API endpoints.

2. **Begin Programming**

   Start writing code in Continue. The system will:

   - Automatically send code completion requests to the inference service
   - Receive model-generated code suggestions
   - Collect your acceptance/rejection feedback on suggestions

3. **Real-time Learning**

   The system processes your feedback in real-time:

   - Accepted suggestions are marked as positive feedback
   - Rejected suggestions are marked as negative feedback
   - Model parameters are updated online based on feedback

**Monitor Training Process**

You can monitor the training process through the following methods:

1. **View Log Output**

   .. code-block:: bash

      # View training logs
      tail -f results/ppo-1.5b/train.log

2. **Use TensorBoard**

   .. code-block:: bash

      # Start TensorBoard
      tensorboard --logdir results/ppo-1.5b

3. **Check Model Checkpoints**

   Model checkpoints are periodically saved to the `results/ppo-1.5b/checkpoints/` directory during training.

**Test Client**

You can use the provided test client to verify system functionality:

.. code-block:: bash

   # Run test client
   python examples/coding_online_rl/simple_test_client.py

The test client simulates Continue behavior by sending code completion requests and submitting feedback data.

**Troubleshooting**

Common issues and solutions:

1. **Port Conflicts**

   If ports 8081 or 8082 are occupied, modify the port settings in the configuration file.

2. **Model Loading Failure**

   Check that the model path is correct and ensure model files exist and are accessible.

3. **Continue Connection Failure**

   Ensure the API endpoint addresses in the Continue configuration are correct and check network connectivity. You can also use simple_test_client to test whether feedback data can be received normally.

Through these steps, you can successfully run the online reinforcement learning for code completion agent system and achieve seamless integration with the Continue editor.
diff --git a/docs/source-en/rst_source/examples/index.rst b/docs/source-en/rst_source/examples/index.rst
index 4b3984a5e..01b6fbd8a 100644
--- a/docs/source-en/rst_source/examples/index.rst
+++ b/docs/source-en/rst_source/examples/index.rst
@@ -132,7 +132,7 @@ The following examples include agent workflow construction, online RL training,

- + Open-Source Online RL for Code Completion
End-to-end online RL with RLinf + Continue, improving model performance by xx%
@@ -195,4 +195,5 @@

    maniskill
    libero
-   reasoning
\ No newline at end of file
+   reasoning
+   coding_online_rl
diff --git a/docs/source-zh/rst_source/examples/coding_online_rl.rst b/docs/source-zh/rst_source/examples/coding_online_rl.rst
new file mode 100644
index 000000000..8917a5999
--- /dev/null
+++ b/docs/source-zh/rst_source/examples/coding_online_rl.rst
@@ -0,0 +1,181 @@
+代码补全在线强化学习
+=================
+
+代码补全在线强化学习(Online Coding RL)是 RLinf 框架中的一个重要应用场景。
+通过与 Continue 等代码编辑器的集成,获取用户对代码补全的偏好反馈,可以实现近乎实时的代码生成和反馈学习,快速提高代码补全的质量,并对齐用户的偏好。
+本示例展示了如何使用 RLinf 框架训练一个能够进行在线代码补全任务的模型。
+
+概述
+----
+
+代码补全在线强化学习系统通过以下方式工作:
+
+1. **实时交互**:系统接收来自 Continue 等编辑器的代码补全请求
+2. **模型推理**:使用训练好的模型生成代码补全建议
+3. **用户反馈**:收集用户对生成代码的接受/拒绝反馈
+4. **在线学习**:基于用户反馈实时更新模型参数
+
+这种实时学习机制使得模型能够快速适应用户的编程习惯和偏好。
+
+运行脚本
+-------
+
+**环境准备**
+
+首先确保您已经安装了 RLinf 框架及其依赖:
+
+.. code-block:: bash
+
+   # 安装额外依赖
+   pip install httpx asyncio fuzzywuzzy
+
+**配置 Continue 集成**
+
+1. **安装 Continue 扩展**
+
+   由于当前 Continue 未支持上传用户对代码补全的偏好反馈,因此我们修改了 Continue 的源码,增加了这一能力。
+   用户可从 `这里 `_ 获取编译好的修改后的 Continue 插件,或自行构建。
+
+   下载编译好的 Continue 插件后,在 VS Code 中安装。
+
+   方法1: code --install-extension /path/to/continue-1.3.9.vsix
+
+   方法2: 在 VSCode 中按 Cmd+Shift+P ,输入 'Extensions: Install from VSIX',选择上述文件
+
+2. **配置 Continue 设置**
+
+   Continue 的配置文件路径为:
+
+   .. code-block:: bash
+
+      ~/.continue/config.yaml
+
+   在 Continue 的配置文件中添加以下设置:
+
+   .. code-block:: yaml
+
+      # 请将 http://xxx:xx/ 替换为实际的 RLinf 在线代码补全服务地址
+
+      models:
+        # 添加一个模型,用于代码补全
+        - name: my-autocomplete
+          provider: openai
+          model: Qwen2.5-Coder-1.5B
+          apiBase: http://xxx:8081/v1
+          apiKey: xxx
+          roles:
+            - autocomplete
+
+      # 添加发送用户是否接受代码补全的反馈
+      tabAutocompleteOptions:
+        enableCompletionTracking: true
+        completionTrackingUrl: http://xxx:8082/api/training/submit
+        completionTrackingHeaders:
+          Authorization: "Bearer test-token"
+          X-Project-ID: "test-project"
+        maxPromptTokens: 1024
+        debounceDelay: 350
+        multilineCompletions: "auto"
+
+   修改并保存完成后,从左侧面板打开 Continue 扩展,点击右上角的 "设置" 齿轮按钮,在 "Models" 页面确保 "Autocomplete 模型" 选用 my-autocomplete。
+
+**启动训练服务**
+
+1. **准备模型和配置**
+
+   确保您有预训练的模型权重,并修改配置文件,匹配模型路径、需要使用的端口等
+
+   .. code-block:: yaml
+
+      rollout:
+        model_dir: /path/to/your/model/Qwen2.5-Coder-1.5B/
+
+      actor:
+        tokenizer:
+          tokenizer_model: /path/to/your/model/Qwen2.5-Coder-1.5B/
+
+2. **启动 RLinf 训练服务**
+
+   .. code-block:: bash
+
+      # 进入项目目录
+      cd /path/to/rlinf_online_rl
+
+      # 启动训练服务
+      bash examples/coding_online_rl/run_main_coding_online_rl.sh qwen2.5-1.5b-ppo
+
+   这将启动以下服务:
+
+   - **推理服务**:在端口 8081 提供代码补全 API
+   - **训练服务**:在端口 8082 接收用户反馈数据
+
+**与 Continue 联动**
+
+1. **启动 Continue**
+
+   在 VS Code 中启动 Continue 扩展,确保它连接到正确的 API 端点。
+
+2. **开始编程**
+
+   在 Continue 中开始编写代码,系统将:
+
+   - 自动发送代码补全请求到推理服务
+   - 接收模型生成的代码建议
+   - 收集您对建议的接受/拒绝反馈
+
+3. **实时学习**
+
+   系统会实时处理您的反馈:
+
+   - 接受的建议被标记为正面反馈
+   - 拒绝的建议被标记为负面反馈
+   - 模型参数根据反馈进行在线更新
+
+**监控训练过程**
+
+您可以通过以下方式监控训练过程:
+
+1. **查看日志输出**
+
+   .. code-block:: bash
+
+      # 查看训练日志
+      tail -f results/ppo-1.5b/train.log
+
+2. **使用 TensorBoard**
+
+   .. code-block:: bash
+
+      # 启动 TensorBoard
+      tensorboard --logdir results/ppo-1.5b
+
+3. **检查模型检查点**
+
+   训练过程中会定期保存模型检查点到 `results/ppo-1.5b/checkpoints/` 目录。
+
+**测试客户端**
+
+您可以使用提供的测试客户端来验证系统功能:
+
+..
code-block:: bash + + # 运行测试客户端 + python examples/coding_online_rl/simple_test_client.py + +测试客户端会模拟 Continue 的行为,发送代码补全请求并提交反馈数据。 + +**故障排除** + +常见问题及解决方案: + +1. **端口冲突** + + 如果端口 8081 或 8082 被占用,请修改配置文件中的端口设置。 + +2. **模型加载失败** + + 检查模型路径是否正确,确保模型文件存在且可访问。 + +3. **Continue 连接失败** + + 确保 Continue 配置中的 API 端点地址正确,检查网络连接。还可使用 simple_test_client 测试是否能正常收到反馈数据。 + +通过以上步骤,您就可以成功运行代码补全在线强化学习系统,并实现与 Continue 编辑器的无缝集成。 diff --git a/docs/source-zh/rst_source/examples/index.rst b/docs/source-zh/rst_source/examples/index.rst index 70acd04a9..00e7759c7 100644 --- a/docs/source-zh/rst_source/examples/index.rst +++ b/docs/source-zh/rst_source/examples/index.rst @@ -128,7 +128,7 @@ RLinf的worker抽象、灵活的通信组件、以及对不同类型加速器的

- + 代码补全在线强化学习开源版
基于RLinf+continue实现端到端在线强化学习,模型效果提升xx% @@ -190,3 +190,4 @@ RLinf的整体设计简洁且模块化,以Worker为抽象封装强化学习训 maniskill libero reasoning + coding_online_rl diff --git a/examples/coding_online_rl/config/qwen2.5-1.5b-ppo.yaml b/examples/coding_online_rl/config/qwen2.5-1.5b-ppo.yaml new file mode 100644 index 000000000..35d9b1833 --- /dev/null +++ b/examples/coding_online_rl/config/qwen2.5-1.5b-ppo.yaml @@ -0,0 +1,300 @@ +defaults: + - override hydra/job_logging: stdout + +hydra: + run: + dir: . + output_subdir: null + +cluster: + num_nodes: 1 + component_placement: + rollout: 0-3 + inference: 4-5 + actor: 6-7 + +runner: + task_type: coding_online_rl + logger: + log_path: ${runner.output_dir}/${runner.experiment_name} + project_name: rlinf + experiment_name: ${runner.experiment_name} + logger_backends: ["tensorboard"] # wandb, swanlab + + max_epochs: 10 + max_steps: -1 + + val_check_interval: 1 + save_interval: 10 + + seq_length: 2560 + + enable_dynamic_batch_size: False + max_tokens_per_mbs: 2560 + + resume_dir: null + experiment_name: online-ppo-1.5b-pipeline + output_dir: /mnt/public/zhuchunyang_rl/logs + +algorithm: + group_size: 1 + + n_minibatches: 2 + training_batch_size_per_gpu: 1 # micro batch size + rollout_batch_size_per_gpu: null # If set to null, rollout_batch_size will be evenly divided across all inference instances. You can reduce this parameter if inference consumes too much GPU memory. + + # mbs to do log prob inference, can be set to + # lower than rollout_batch_size_per_gpu to reduce + # memory usage + logprob_forward_micro_batch_size: 1 # ${.rollout_batch_size_per_gpu} + + # val rollout mbs + val_rollout_batch_size_per_gpu: 4 # ${.rollout_batch_size_per_gpu} + + recompute_logprobs: True + shuffle_rollout: False + + max_num_gen_batches: 1 + + # PPO loss params (no critic model) + loss_type: math_ppo_actor + loss_agg_func: "token-mean" + kl_beta: 0.0 # 0.001 + kl_penalty_type: low_var_kl + ratio_clip_eps: 0.2 + entropy_bonus: 0.0 + calculate_entropy: False + clip_ratio_c: null # 3.0 + + # Control critic usage (similar to AReaL's disable_head) + use_critic: False # Disable critic model + use_value_loss: False # Disable value loss computation + + # PPO parameters for no-critic setup + gamma: 0.99 + gae_lambda: 0.95 + # value_clip and huber_delta not needed without critic + + # Use no-critic GAE advantage computation + adv_type: math_gae_no_critic + normalize_advantages: False + early_stop_imp_ratio: 5.0 + use_valid_token_scale: False + + # params for rollout + sampling_params: + use_greedy: False + temperature: 0.1 + top_k: 1000000 + top_p: 1.0 + repetition_penalty: 1.0 + max_new_tokens: ${subtract:${runner.seq_length}, ${data.max_prompt_length}} + min_new_tokens: 1 + stop: [ + "<|endoftext|>", + "<|fim_prefix|>", + "<|fim_middle|>", + "<|fim_suffix|>", + "<|fim_pad|>", + "<|repo_name|>", + "<|file_sep|>", + "<|im_start|>", + "<|im_end|>", + ] + +inference: + model_arch: ${rollout.model_arch} + group_name: "InferenceGroup" + load_from_actor: True + model: + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 + sequence_parallel: True + +rollout: + group_name: "RolloutGroup" + + gpu_memory_utilization: 0.55 + + model_dir: /mnt/public/hf_models/Qwen2.5-Coder-1.5B + model_arch: qwen2.5 + enforce_eager: False # if False, rollout engine will capture cuda graph, which will take more time to initialize. + distributed_executor_backend: mp # ray or mp + disable_log_stats: False + detokenize: True # Whether to detokenize the output. During RL we actually don't need to detokenize it. 
Can be set to True for debugging. + padding: null # will be tokenizer.pad_token_id if null. it is used to filter megatron's padding for rollout engine + eos: null # will be tokenizer.eos_token_id if null. + + rollout_backend: sglang # online_rl now only support sglang + + sglang: + attention_backend: triton # [flashinfer, triton] for more, see sglang's doc + decode_log_interval: 500000 # the interval for SGLang to log the decode time and other stats. + use_torch_compile: False # enable torch_compile in SGLang for rollout. + torch_compile_max_bs: 128 # the maximum batch size for torch compile. If the batch size is larger than this, torch compile will not be used. + + vllm: + # not used, but reserved to pass config validate + attention_backend: FLASH_ATTN #[FLASH_ATTN,XFORMERS] for more, see vllm's doc + enable_chunked_prefill: True # enable vllm to use chunked_prefill. + enable_prefix_caching: True # enable vllm to use prefix_caching. + enable_flash_infer_sampler: True # if True, vllm will use flashinfer to do sampling. + + return_logprobs: ${not:${algorithm.recompute_logprobs}} + + tensor_parallel_size: 1 + pipeline_parallel_size: 1 + + validate_weight: False # whether to send all weights at first for weight comparison. + validate_save_dir: null # the directory to save the weights for comparison. If validate_weight is True, this will be used to save the weights for comparison. + print_outputs: False # whether to print the outputs (token ids, texts, etc.) of rollout engine. + + max_running_requests: 64 # the maximum number of running requests in rollout engine. + cuda_graph_max_bs: 128 # the maximum batch size for cuda graph. If the batch size is larger than this, cuda graph will not be used. + +data: + max_prompt_length: 1024 + rollout_batch_size: 16 + seed: 1234 + +actor: + group_name: "ActorGroup" + training_backend: megatron + mcore_gpt: True + spec_name: decoder_gpt + + checkpoint_load_path: null + + offload_optimizer: True + offload_weight: True + offload_grad: True + + enable_dp_load_balance: False + + calculate_flops: False + + seed: 1234 + + model: + precision: fp16 + add_bias_linear: False + + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 + + activation: swiglu + sequence_parallel: True + # recompute_method: block + # recompute_granularity: selective + + recompute_method: block + recompute_granularity: full + recompute_num_layers: 20 + + seq_length: ${runner.seq_length} + encoder_seq_length: ${runner.seq_length} + + normalization: rmsnorm + + position_embedding_type: rope + + apply_rope_fusion: True + bias_dropout_fusion: False + persist_layer_norm: False + bias_activation_fusion: False + attention_softmax_in_fp32: True + batch_p2p_comm: False + variable_seq_lengths: True + gradient_accumulation_fusion: False + moe_token_dispatcher_type: alltoall + use_cpu_initialization: False + + optim: + optimizer: adam + bf16: True + fp16: False + lr: 2e-06 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_eps: 1.0e-05 + min_lr: 2.0e-7 + weight_decay: 0.05 + use_distributed_optimizer: True + overlap_grad_reduce: False + overlap_param_gather: False + optimizer_enable_pin: False + overlap_param_gather_with_optimizer_step: False + clip_grad: 0.8 + loss_scale: 65536 + + lr_sched: + lr_warmup_fraction: 0.01 + lr_warmup_init: 0.0 + lr_warmup_iters: 0 + max_lr: 2.0e-6 + min_lr: 0.0 + lr_decay_style: constant + lr_decay_iters: 10 + + tokenizer: + tokenizer_model: ${rollout.model_dir} + use_fast: False + trust_remote_code: True + padding_side: 'right' + + megatron: + ddp_bucket_size: null + 
distributed_backend: nccl # Support 'nccl' and 'gloo' + distributed_timeout_minutes: 30 + ckpt_format: torch + use_dist_ckpt: False + tp_comm_bootstrap_backend: nccl + tp_comm_overlap_cfg: null # tp_comm_overlap_cfg.yaml + use_hf_ckpt: True # if true, will transfer hf model to generate megatron checkpoint and use it for training. + use_profiler: False # if true, will enable torch profiler when training, pay attention it has influence on performance + + ckpt_convertor: # config for ckpt convertor + model: Qwen2.5-Coder-1.5B + model_type: null # will be set by hf model's config if null + hf_model_path: ${rollout.model_dir} # path to the hf model + save_path: ${runner.output_dir}/${runner.experiment_name}/converted_ckpts/actor + use_gpu_num : 0 + use_gpu_index: null + process_num: 16 # number of processes to use for checkpointing + tensor_model_parallel_size: ${actor.model.tensor_model_parallel_size} + pipeline_model_parallel_size: ${actor.model.pipeline_model_parallel_size} + + profiler: # profile megatron when inference and traning + output_dir: ${runner.output_dir}/${runner.experiment_name}/profiler + activities: ["cpu", "cuda"] + record_shapes: False + profile_memory: False + with_stack: False + with_flops: False + with_modules: True + export_tensorboard: True + export_chrome_trace: False + chrome_filename_prefix: "chrome_trace" + schedule_warmup: 2 + schedule_active: 1 + schedule_repeat: 1 # inference and training will repeat such times + # schedule_wait: it will be set at runtime + +reward: + use_reward_model: False + reward_type: fim_verify_call + reward_scale: 5.0 + +critic: + use_critic_model: False + +server: + # online serving and user reward track + online_router: + host: 0.0.0.0 + port: 8081 + + tracking_rollout: + host: 0.0.0.0 + port: 8082 + enable_dummy_data: True diff --git a/examples/coding_online_rl/config/tp_comm_overlap_cfg.yaml b/examples/coding_online_rl/config/tp_comm_overlap_cfg.yaml new file mode 100644 index 000000000..97fc8b391 --- /dev/null +++ b/examples/coding_online_rl/config/tp_comm_overlap_cfg.yaml @@ -0,0 +1,47 @@ +qkv_fprop: + method: ring_exchange + aggregate: 1 + +proj_fprop: + method: pipeline + num_sm: 4 + num_splits: 4 + set_sm_margin: 0 + +fc1_fprop: + method: ring_exchange + aggregate: 0 + +fc2_fprop: + method: pipeline + num_sm: 4 + num_splits: 4 + set_sm_margin: 0 + +fc2_dgrad: + method: ring_exchange + aggregate: 0 + +fc1_dgrad: + method: bulk + num_sm: 2 + set_sm_margin: 0 + +fc1_wgrad: + method: bulk + num_sm: 2 + set_sm_margin: 0 + +proj_dgrad: + method: ring_exchange + aggregate: 1 + +qkv_dgrad: + method: bulk + num_sm: 2 + set_sm_margin: 0 + +qkv_wgrad: + method: bulk + num_sm: 2 + set_sm_margin: 0 \ No newline at end of file diff --git a/examples/coding_online_rl/main_coding_online_rl.py b/examples/coding_online_rl/main_coding_online_rl.py new file mode 100644 index 000000000..c5246292c --- /dev/null +++ b/examples/coding_online_rl/main_coding_online_rl.py @@ -0,0 +1,106 @@ +# Copyright 2025 The RLinf Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+
+import hydra
+import torch.multiprocessing as mp
+from omegaconf.omegaconf import OmegaConf
+
+from rlinf.config import validate_cfg
+from rlinf.runners.coding_online_rl_runner import CodingOnlineRLRunner
+from rlinf.scheduler import Cluster
+from rlinf.scheduler.placement import PackedPlacementStrategy
+from rlinf.utils.placement import ModelParallelComponentPlacement, PlacementMode
+from rlinf.utils.utils import output_redirector
+from rlinf.workers.actor.megatron_actor_worker import MegatronActor
+from rlinf.workers.inference.megatron_inference_worker import MegatronInference
+from rlinf.workers.rollout.server.online_router_worker import OnlineRouterWorker
+from rlinf.workers.rollout.server.server_rollout_worker import ServerRolloutWorker
+from rlinf.workers.rollout.utils import get_rollout_backend_worker
+
+"""Script to start coding online RL training"""
+mp.set_start_method("spawn", force=True)
+
+
+@hydra.main(version_base="1.1")
+@output_redirector
+def main(cfg) -> None:
+    cfg = validate_cfg(cfg)
+    print(json.dumps(OmegaConf.to_container(cfg, resolve=True), indent=2))
+
+    cluster = Cluster(num_nodes=cfg.cluster.num_nodes)
+    component_placement = ModelParallelComponentPlacement(cfg, cluster)
+
+    singleton_placement_strategy = PackedPlacementStrategy(
+        start_accelerator_id=0, end_accelerator_id=0
+    )
+    online_router = OnlineRouterWorker.create_group(cfg, component_placement).launch(
+        cluster=cluster,
+        name="OnlineRouterWorker",
+        placement_strategy=singleton_placement_strategy,
+    )
+    server_rollout = ServerRolloutWorker.create_group(cfg).launch(
+        cluster=cluster,
+        name="ServerRolloutWorker",
+        placement_strategy=singleton_placement_strategy,
+    )
+
+    rollout_worker_cls = get_rollout_backend_worker(cfg, component_placement)
+
+    # Rollout group
+    rollout_placement_strategy = component_placement.get_strategy("rollout")
+    rollout_group = rollout_worker_cls.create_group(cfg, component_placement).launch(
+        cluster,
+        name=cfg.rollout.group_name,
+        placement_strategy=rollout_placement_strategy,
+    )
+
+    # Inference group
+    inference_group = None
+    if (
+        component_placement.placement_mode == PlacementMode.DISAGGREGATED
+        and cfg.algorithm.recompute_logprobs
+    ):
+        inference_placement_strategy = component_placement.get_strategy("inference")
+        inference_group = MegatronInference.create_group(
+            cfg, component_placement
+        ).launch(
+            cluster,
+            name=cfg.inference.group_name,
+            placement_strategy=inference_placement_strategy,
+        )
+
+    # Actor group
+    actor_placement_strategy = component_placement.get_strategy("actor")
+    actor_group = MegatronActor.create_group(cfg, component_placement).launch(
+        cluster, name=cfg.actor.group_name, placement_strategy=actor_placement_strategy
+    )
+
+    runner = CodingOnlineRLRunner(
+        cfg=cfg,
+        placement=component_placement,
+        rollout=rollout_group,
+        inference=inference_group,
+        actor=actor_group,
+        online_router=online_router,
+        server_rollout=server_rollout,
+    )
+
+    runner.init_workers()
+    runner.run()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/coding_online_rl/run_main_coding_online_rl.sh b/examples/coding_online_rl/run_main_coding_online_rl.sh
new file mode 100644
index 000000000..a547d773f
--- /dev/null
+++ b/examples/coding_online_rl/run_main_coding_online_rl.sh
@@ -0,0 +1,21 @@
+#!
/bin/bash +set -x + +tabs 4 +export VLLM_ATTENTION_BACKEND=XFORMERS +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export TOKENIZERS_PARALLELISM=false +export RAY_DEDUP_LOGS=0 + +CONFIG_PATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +REPO_PATH=$(dirname $(dirname "$CONFIG_PATH")) +MEGATRON_PATH=/opt/Megatron-LM +export PYTHONPATH=${REPO_PATH}:${MEGATRON_PATH}:$PYTHONPATH + +if [ -z "$1" ]; then + CONFIG_NAME="qwen2.5-1.5b-ppo" +else + CONFIG_NAME=$1 +fi + +python ${REPO_PATH}/examples/coding_online_rl/main_coding_online_rl.py --config-path ${CONFIG_PATH}/config/ --config-name $CONFIG_NAME \ No newline at end of file diff --git a/examples/coding_online_rl/simple_test_client.py b/examples/coding_online_rl/simple_test_client.py new file mode 100644 index 000000000..570e6e842 --- /dev/null +++ b/examples/coding_online_rl/simple_test_client.py @@ -0,0 +1,110 @@ +# Copyright 2025 The RLinf Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import uuid +from datetime import datetime + +import httpx + +batch_size = 16 +epoch = 10 * 2 + + +async def agenerate(prefix, suffix): + TARGET_URL = "http://127.0.0.1:8081/v1/completions" + + headers = { + "Content-Type": "application/json", + "Authorization": "Bearer test-token", + } + body = { + "model": "test-model", + "prompt": f"<|fim_prefix|>{prefix}<|fim_suffix|>{suffix}<|fim_middle|>", + "max_tokens": 50, + "temperature": 0.7, + "stream": False, + } + + async with httpx.AsyncClient() as client: + response = await client.post( + TARGET_URL, + headers=headers, + json=body, + timeout=15.0, + ) + print(f"agenerate get response: {response.json()}") + return response.json()["choices"][0]["text"] + + +async def atrack(prefix, suffix, completion, accepted): + TARGET_URL = "http://127.0.0.1:8082/api/training/submit" + + headers = { + "Content-Type": "application/json", + "Authorization": "Bearer test-token", + } + + body = { + "completionId": str(uuid.uuid4()), + "filepath": "file:///Users/qurakchin/.vscode/extensions/continue.continue-1.2.3-darwin-arm64/continue_tutorial.py", + "prefix": prefix, + "suffix": suffix, + "prompt": f"<|fim_prefix|>{prefix}<|fim_suffix|>{suffix}<|fim_middle|>", + "completion": completion, + "modelProvider": "openai", + "modelName": "Qwen2.5-Coder-1.5B-Q8_0.gguf", + "accepted": accepted, + "timestamp": datetime.now().isoformat(), + "time": 4294, + "uniqueId": str(uuid.uuid4()), + "numLines": 1, + "cacheHit": False, + } + + async with httpx.AsyncClient() as client: + response = await client.post( + TARGET_URL, + headers=headers, + json=body, + timeout=15.0, + ) + print(f"atrack get response: {response.json()}") + + +async def single_iteration(prefix, suffix): + await asyncio.sleep(0.001) + completion = await agenerate(prefix=prefix, suffix=suffix) + await asyncio.sleep(0.001) + await atrack(prefix=prefix, suffix=suffix, completion=completion, accepted=True) + + +async def loop(): + prefix = "if x[j] > x[j + 1]:\n x[j], x[j + 1] = x[j + 1], x[j]\n return x\n\ndef han" + suffix = '\n# 
————————————————————————————————————————————————— Agent ————————————————————————————————————————————————— #\n# Agent equips the Chat model with the tools needed to handle a wide range of coding tasks, allowing\n# the model to make decisions and save you the work of manually finding context and performing actions.\n\n# 1. Switch from "Chat" to "Agent" mode using the dropdown in the bottom left of the input box' + + tasks = [] + for i in range(batch_size * epoch): + task = asyncio.create_task(single_iteration(prefix, suffix)) + tasks.append(task) + + if i % batch_size == 0: + await asyncio.gather(*tasks) + tasks = [] + + await asyncio.gather(*tasks) + + +if __name__ == "__main__": + asyncio.run(loop()) diff --git a/examples/math/config/qwen2.5-1.5b-single-gpu.yaml b/examples/math/config/qwen2.5-1.5b-single-gpu.yaml index a9d58f3a2..e3f5bf28d 100644 --- a/examples/math/config/qwen2.5-1.5b-single-gpu.yaml +++ b/examples/math/config/qwen2.5-1.5b-single-gpu.yaml @@ -90,7 +90,7 @@ rollout: detokenize: False # Whether to detokenize the output. During RL we actually don't need to detokenize it. Can be set to True for debugging. padding: null # will be tokenizer.pad_token_id if null. it is used to filter megatron's padding for rollout engine eos: null # will be tokenizer.eos_token_id if null. - + rollout_backend: sglang # here choose which backend to rollout,support [sglang, vllm] sglang: @@ -239,7 +239,7 @@ actor: process_num: 16 # number of processes to use for checkpointing tensor_model_parallel_size: ${actor.model.tensor_model_parallel_size} pipeline_model_parallel_size: ${actor.model.pipeline_model_parallel_size} - + profiler: # profile megatron when inference and traning output_dir: ${runner.output_dir}/${runner.experiment_name}/profiler activities: ["cpu", "cuda"] diff --git a/rlinf/algorithms/advantages.py b/rlinf/algorithms/advantages.py index abb654533..4ce0338e0 100644 --- a/rlinf/algorithms/advantages.py +++ b/rlinf/algorithms/advantages.py @@ -155,6 +155,168 @@ def compute_embodied_grpo_advantages( return advantages, advantages +@register_advantage("math_gae_no_critic") +def compute_math_gae_no_critic_advantages_and_returns(**kwargs): + """ + Calculate advantages and returns for math tasks using GAE without critic model. + + This function implements a simplified advantage estimation for math tasks + without requiring a value function, similar to AReaL's disable_head approach. 
+
+    Args:
+        reward_scores (torch.Tensor): Reward scores for math responses
+        mask (torch.Tensor): Attention mask of shape [bsz, seq_len] or [bsz, max_seq_len]
+        gamma (float): Discount factor
+        gae_lambda (float): GAE lambda parameter
+        normalize_advantages (bool): Whether to normalize advantages
+        normalize_returns (bool): Whether to normalize returns
+
+    Returns:
+        Tuple[torch.Tensor, torch.Tensor]: (advantages, returns) tensors
+    """
+    reward_scores = kwargs["reward_scores"]
+    mask = kwargs["mask"]
+    gamma = kwargs.get("gamma", 1.0)
+    normalize_advantages = kwargs.get("normalize_advantages", True)
+    normalize_returns = kwargs.get("normalize_returns", False)
+
+    # For math tasks without critic, we use reward-to-go as baseline
+    bsz, seq_len = mask.shape
+
+    # Create reward structure: reward at the end of sequence
+    rewards = torch.zeros_like(mask, dtype=torch.float32)
+    rewards[:, -1] = reward_scores  # Put reward at the end of sequence
+
+    # Create done flags (episode ends at the last token)
+    dones = torch.zeros_like(mask, dtype=torch.bool)
+    dones[:, -1] = True
+
+    # Compute reward-to-go (cumulative discounted rewards)
+    returns = torch.zeros_like(mask, dtype=torch.float32)
+    cumulative_reward = 0
+
+    for t in reversed(range(seq_len)):
+        cumulative_reward = rewards[:, t] + gamma * cumulative_reward * (~dones[:, t])
+        returns[:, t] = cumulative_reward
+
+    # For no-critic setup, advantages are computed using reward-to-go
+    # with a simple baseline subtraction
+    advantages = returns.clone()
+
+    # Apply mask
+    advantages = advantages * mask
+    returns = returns * mask
+
+    # Simple baseline subtraction (mean of valid advantages)
+    if normalize_advantages:
+        valid_advantages = advantages[mask.bool()]
+        if len(valid_advantages) > 0:
+            mean_advantages = valid_advantages.mean()
+            std_advantages = valid_advantages.std()
+            advantages = (advantages - mean_advantages) / (std_advantages + 1e-5)
+
+    # Normalize returns if requested
+    if normalize_returns:
+        valid_returns = returns[mask.bool()]
+        if len(valid_returns) > 0:
+            mean_returns = valid_returns.mean()
+            std_returns = valid_returns.std()
+            returns = (returns - mean_returns) / (std_returns + 1e-5)
+
+    return advantages, returns
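+
+# Worked example for the recursion above (illustrative sketch): with
+# gamma = 0.9, seq_len = 3 and a terminal reward of 1.0, the backward pass
+# yields returns [0.81, 0.9, 1.0], i.e. the reward discounted toward earlier
+# tokens; before normalization, the advantages equal these masked returns.
+
+
+@register_advantage("math_gae")
+def compute_math_gae_advantages_and_returns(**kwargs):
+    """
+    Calculate advantages and returns for math tasks using GAE.
+
+    This function implements Generalized Advantage Estimation (GAE) specifically
+    designed for math tasks, which may have different data structures compared
+    to embodied tasks.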
+ + Args: + reward_scores (torch.Tensor): Reward scores for math responses + values (torch.Tensor): Value predictions of shape [bsz, seq_len] or [bsz, max_seq_len] + mask (torch.Tensor): Attention mask of shape [bsz, seq_len] or [bsz, max_seq_len] + gamma (float): Discount factor + gae_lambda (float): GAE lambda parameter + normalize_advantages (bool): Whether to normalize advantages + normalize_returns (bool): Whether to normalize returns + + Returns: + Tuple[torch.Tensor, torch.Tensor]: (advantages, returns) tensors + """ + reward_scores = kwargs["reward_scores"] + values = kwargs["values"] + mask = kwargs["mask"] + gamma = kwargs.get("gamma", 1.0) + gae_lambda = kwargs.get("gae_lambda", 1.0) + normalize_advantages = kwargs.get("normalize_advantages", True) + normalize_returns = kwargs.get("normalize_returns", False) + + # For math tasks, we typically have [bsz, seq_len] tensors + bsz, seq_len = values.shape + + # Create a simple reward structure for math tasks + # The reward is typically given at the end of the sequence + rewards = torch.zeros_like(values) + rewards[:, -1] = reward_scores # Put reward at the end of sequence + + # Create done flags (episode ends at the last token) + dones = torch.zeros_like(values, dtype=torch.bool) + dones[:, -1] = True + + # Add bootstrap value for the next state (after the sequence) + next_values = torch.zeros(bsz, 1, device=values.device, dtype=values.dtype) + + # Compute GAE advantages + advantages = torch.zeros_like(values) + returns = torch.zeros_like(values) + + gae = 0 + for t in reversed(range(seq_len)): + if t == seq_len - 1: + # Last timestep + delta = ( + rewards[:, t] + + gamma * next_values[:, 0] * (~dones[:, t]) + - values[:, t] + ) + else: + # Regular timestep + delta = ( + rewards[:, t] + gamma * values[:, t + 1] * (~dones[:, t]) - values[:, t] + ) + + gae = delta + gamma * gae_lambda * (~dones[:, t]) * gae + advantages[:, t] = gae + returns[:, t] = gae + values[:, t] + + # Apply mask to advantages and returns + advantages = advantages * mask + returns = returns * mask + + # Normalize advantages if requested + if normalize_advantages: + # Only normalize over valid (masked) positions + valid_advantages = advantages[mask.bool()] + if len(valid_advantages) > 0: + mean_advantages = valid_advantages.mean() + std_advantages = valid_advantages.std() + advantages = (advantages - mean_advantages) / (std_advantages + 1e-5) + + # Normalize returns if requested + if normalize_returns: + valid_returns = returns[mask.bool()] + if len(valid_returns) > 0: + mean_returns = valid_returns.mean() + std_returns = valid_returns.std() + returns = (returns - mean_returns) / (std_returns + 1e-5) + + return advantages, returns + + @register_advantage("math_grpo") def compute_math_grpo_advantages(**kwargs): reward_scores = kwargs["reward_scores"] diff --git a/rlinf/algorithms/losses.py b/rlinf/algorithms/losses.py index 237e9fb1a..3980f9136 100644 --- a/rlinf/algorithms/losses.py +++ b/rlinf/algorithms/losses.py @@ -267,18 +267,9 @@ def compute_math_ppo_actor_loss(**kwargs): "loss_mask": loss_mask, "loss_agg_func": lambda x, mask: (x * mask).sum() / (mask.sum() or 1), } - ( - loss, - clip_fraction, - approx_kl, - ratio, - clipped_ratio, - dual_cliped_ratio, - ) = compute_math_ppo_actor_loss(**kwargs) - print(f"{loss=}, {clip_fraction=}, {approx_kl=}") - print(f"{ratio=}") - print(f"{clipped_ratio=}") - print(f"{dual_cliped_ratio=}") + loss, metrics_data = compute_math_ppo_actor_loss(**kwargs) + print(f"Policy loss: {loss=}") + print(f"Metrics: {metrics_data}") # 
test grpo_actor_loss_fn
    torch.manual_seed(0)
@@ -298,6 +289,7 @@ def compute_math_ppo_actor_loss(**kwargs):
         "clip_ratio_high": clip_ratio_high,
         "loss_mask": loss_mask,
         "loss_mask_sum": loss_mask.sum(),
+        "max_episode_steps": 512,
     }
     loss, metrics_data = compute_embodied_grpo_actor_loss_fn(**kwargs)
     print(f"{loss=}, {metrics_data=}")
diff --git a/rlinf/algorithms/registry.py b/rlinf/algorithms/registry.py
index 11bfc6c6a..96b19f2b4 100644
--- a/rlinf/algorithms/registry.py
+++ b/rlinf/algorithms/registry.py
@@ -73,3 +73,22 @@ def calculate_adv_and_returns(**kwargs) -> Tuple[torch.Tensor, Optional[torch.Te
     adv_type = kwargs["adv_type"]
     fn = get_adv_and_returns(adv_type)
     return fn(**kwargs)
+
+
+REWARD_REGISTRY: Dict[str, Callable] = {}
+
+
+def register_reward_fn(name: str):
+    def decorator(fn):
+        REWARD_REGISTRY[name] = fn
+        return fn
+
+    return decorator
+
+
+def get_reward_fn(name: Optional[str]):
+    if name is None:
+        return None
+    if name not in REWARD_REGISTRY:
+        raise ValueError(f"Reward function {name} not registered")
+    return REWARD_REGISTRY[name]
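
# Usage sketch for the registry above (illustrative: the decorator and lookup
# are from this patch, while the reward function below is hypothetical; the
# real one is selected via cfg.reward.reward_type, e.g. "fim_verify_call"):
#
#     from rlinf.algorithms.registry import get_reward_fn, register_reward_fn
#
#     @register_reward_fn("accept_bonus")
#     def accept_bonus(completion: str, accepted: bool) -> float:
#         # Reward 1.0 when the user accepted the suggested completion.
#         return 1.0 if accepted else 0.0
#
#     reward_fn = get_reward_fn("accept_bonus")
#     assert reward_fn("x = 1", accepted=True) == 1.0

diff --git a/rlinf/config.py b/rlinf/config.py
index 1934adeb4..86198334a 100644
--- a/rlinf/config.py
+++ b/rlinf/config.py
@@ -24,6 +24,9 @@
 from omegaconf.dictconfig import DictConfig
 from transformers import AutoConfig
 
+from rlinf.scheduler.cluster import Cluster
+from rlinf.utils.placement import ModelParallelComponentPlacement, PlacementMode
+
 if TYPE_CHECKING:
     from megatron.core.model_parallel_config import ModelParallelConfig
     from megatron.core.transformer.transformer_config import TransformerConfig
@@ -555,6 +558,51 @@ def validate_math_cfg(cfg: DictConfig) -> DictConfig:
     return cfg
 
 
+def validate_coding_online_rl_cfg(cfg: DictConfig) -> DictConfig:
+    assert cfg.rollout.model_arch == "qwen2.5", (
+        f"Model {cfg.rollout.model_arch} is not supported"
+    )
+
+    assert cfg.algorithm.recompute_logprobs != cfg.rollout.return_logprobs, (
+        "Exactly one of `algorithm.recompute_logprobs` or `rollout.return_logprobs` must be True to compute `prev_logprobs`."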
+ ) + + assert cfg.algorithm.recompute_logprobs, ( + "Online coding task must use recompute_logprobs" + ) + + assert cfg.actor.training_backend == "megatron", ( + "Online coding task must use megatron training backend" + ) + + cluster = Cluster(num_nodes=cfg.cluster.num_nodes) + component_placement = ModelParallelComponentPlacement(cfg, cluster) + assert component_placement.placement_mode == PlacementMode.DISAGGREGATED, ( + "Online coding task must use disaggregated placement mode" + ) + + with open_dict(cfg): + cfg.algorithm.training_batch_size_per_gpu = cfg.algorithm.get( + "training_batch_size_per_gpu", 1 + ) + cfg.algorithm.n_minibatches = cfg.algorithm.get("n_minibatches", 1) + cfg.algorithm.max_num_gen_batches = cfg.algorithm.get("max_num_gen_batches", 1) + cfg.actor.micro_batch_size = cfg.algorithm.training_batch_size_per_gpu + cfg.actor.global_batch_size = ( + cfg.data.rollout_batch_size + * cfg.algorithm.group_size + // cfg.algorithm.n_minibatches + ) + assert cfg.actor.micro_batch_size >= 1 + assert cfg.actor.global_batch_size >= 1 + assert cfg.runner.seq_length > cfg.data.max_prompt_length, ( + f"runner.seq_length ({cfg.runner.seq_length}) must be greater than data.max_prompt_length ({cfg.data.max_prompt_length})" + ) + + cfg.rollout = validate_rollout_cfg(cfg.rollout) + return cfg + + def validate_cfg(cfg: DictConfig) -> DictConfig: OmegaConf.set_struct(cfg, True) @@ -562,6 +610,8 @@ def validate_cfg(cfg: DictConfig) -> DictConfig: cfg = validate_embodied_cfg(cfg) if cfg.runner.task_type == "math": cfg = validate_math_cfg(cfg) + if cfg.runner.task_type == "coding_online_rl": + cfg = validate_coding_online_rl_cfg(cfg) if ( cfg.algorithm.adv_type == "embodied_grpo" diff --git a/rlinf/runners/coding_online_rl_runner.py b/rlinf/runners/coding_online_rl_runner.py new file mode 100644 index 000000000..46be3423a --- /dev/null +++ b/rlinf/runners/coding_online_rl_runner.py @@ -0,0 +1,318 @@ +# Copyright 2025 The RLinf Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
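+
+# Control flow of one iteration of run()'s training loop below: sync actor
+# weights to the rollout (and, in disaggregated mode, inference) workers,
+# collect a batch of live user feedback through ServerRolloutWorker, recompute
+# prev/ref logprobs, compute advantages and returns, then run one round of
+# actor training before the next weight sync.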
+ +import logging +import os +from typing import Optional + +import pandas as pd +from omegaconf.dictconfig import DictConfig +from tqdm import tqdm + +from rlinf.scheduler import Channel +from rlinf.scheduler import WorkerGroupFuncResult as Handle +from rlinf.utils.distributed import ScopedTimer +from rlinf.utils.metric_logger import MetricLogger +from rlinf.utils.placement import ModelParallelComponentPlacement +from rlinf.utils.runner_utils import check_progress +from rlinf.utils.timers import Timer +from rlinf.workers.actor.megatron_actor_worker import MegatronActor +from rlinf.workers.inference.megatron_inference_worker import MegatronInference +from rlinf.workers.rollout.server.online_router_worker import OnlineRouterWorker +from rlinf.workers.rollout.server.server_rollout_worker import ServerRolloutWorker +from rlinf.workers.rollout.sglang.sglang_worker import SGLangWorker + +logging.getLogger().setLevel(logging.INFO) + + +class CodingOnlineRLRunner: + """Runner for online coding model training.""" + + def __init__( + self, + cfg: DictConfig, + placement: ModelParallelComponentPlacement, + rollout: SGLangWorker, + inference: Optional[MegatronInference], + actor: MegatronActor, + online_router: OnlineRouterWorker, + server_rollout: ServerRolloutWorker, + ): + """""" + self.cfg = cfg + self.component_placement = placement + self.is_pipeline = self.component_placement.is_disaggregated + self.has_dedicated_inference = inference is not None + + # Workers + self.rollout = rollout + self.actor = actor + self.online_router = online_router + self.server_rollout = server_rollout + # Collocated mode uses actor as inference + self.inference = inference if self.has_dedicated_inference else self.actor + + # Data channels + self.dataloader_channel = Channel.create("DataLoader") + # Create a local channel (i.e., a channel that is different in every process) + # if inference is not a dedicated worker + self.inference_channel = Channel.create( + "Inference", local=not self.has_dedicated_inference + ) + self.actor_channel = Channel.create("Actor", local=True) + + # Configurations + self.compute_ref_logprobs = self.cfg.algorithm.kl_beta > 0 + self.recompute_logprobs = self.cfg.algorithm.recompute_logprobs + assert self.recompute_logprobs, "online rl must recompute logprobs" + self.consumed_samples = 0 + self.global_steps = 0 + + # Build dataloader and compute `max_steps` + self.max_steps = self.cfg.runner.get("max_steps", -self.global_steps) + + # Wandb table + self.train_df = pd.DataFrame(columns=["step", "prompt", "response", "reward"]) + self.val_df = pd.DataFrame(columns=["step", "prompt", "response", "reward"]) + + # Timers + self.timer = ScopedTimer(reduction="max", sync_cuda=False) + self.run_timer = Timer(None) # Timer that checks if we should stop training + + self.metric_logger = MetricLogger(cfg) + + def init_workers(self): + # Must be done before actor init + if self.cfg.runner.resume_dir is None: + logging.info("Training from scratch") + if ( + self.cfg.actor.training_backend == "megatron" + and self.cfg.actor.megatron.use_hf_ckpt + ): + from toolkits.ckpt_convertor.convert_hf_to_mg import convert_hf_to_mg + + convert_hf_to_mg( + self.cfg.actor.megatron.ckpt_convertor.hf_model_path, + self.cfg.actor.megatron.ckpt_convertor, + ) + + # Init workers + self.rollout.init_worker().wait() + self.actor.init_worker().wait() + self.online_router.init_worker(self.rollout).wait() + self.server_rollout.init_worker().wait() + if self.has_dedicated_inference: + self.inference.init_worker().wait() + + if 
self.cfg.runner.resume_dir is None:
            return

        # Checkpoint loading
        logging.info(f"Load from checkpoint folder: {self.cfg.runner.resume_dir}")
        # Recover the global step from the checkpoint directory name
        self.global_steps = int(self.cfg.runner.resume_dir.split("global_step_")[-1])
        logging.info(f"Setting global step to {self.global_steps}")

        actor_checkpoint_path = os.path.join(self.cfg.runner.resume_dir, "actor")
        self.actor.load_checkpoint(actor_checkpoint_path).wait()

    def _compute_flops_metrics(self, time_metrics, act_rollout_metrics) -> dict:
        # Default missing timers to -1 so they never pass the > 0 checks below
        rollout_time = time_metrics.get("rollout", -1)
        inference_time = time_metrics.get("inference", -1)
        training_time = time_metrics.get("training", -1)

        num_gpus_actor = self.component_placement.actor_world_size
        num_gpus_rollout = self.component_placement.rollout_world_size

        rollout_tflops = act_rollout_metrics["rollout_tflops"]
        inference_tflops = act_rollout_metrics["inference_tflops"]
        training_tflops = act_rollout_metrics["training_tflops"]

        flops_metrics = {
            "rollout_tflops_per_gpu": 0.0,
            "inference_tflops_per_gpu": 0.0,
            "training_tflops_per_gpu": 0.0,
        }
        if rollout_time > 0 and rollout_tflops > 0:
            flops_metrics["rollout_tflops_per_gpu"] = (
                rollout_tflops / rollout_time / num_gpus_rollout
            )

        if inference_time > 0 and inference_tflops > 0:
            num_gpus_inference = self.component_placement.inference_world_size
            if num_gpus_inference == 0:
                num_gpus_inference = self.component_placement.actor_world_size
            flops_metrics["inference_tflops_per_gpu"] = (
                inference_tflops / inference_time / num_gpus_inference
            )

        if training_time > 0 and training_tflops > 0:
            flops_metrics["training_tflops_per_gpu"] = (
                training_tflops / training_time / num_gpus_actor
            )

        return flops_metrics

    def _save_checkpoint(self):
        base_output_dir = os.path.join(
            self.cfg.runner.output_dir,
            self.cfg.runner.experiment_name,
            f"checkpoints/global_step_{self.global_steps}",
        )
        actor_save_path = os.path.join(base_output_dir, "actor")

        # actor
        self.actor.save_checkpoint(actor_save_path, self.global_steps).wait()

    def _sync_weights(self):
        # Pause the router first so no request is in flight during the sync
        self.online_router.sync_model_start()
        self.actor.sync_model_to_rollout()
        self.rollout.sync_model_from_actor().wait()
        self.actor.del_reshard_state_dict().wait()

        if self.has_dedicated_inference:
            self.actor.sync_model_to_inference()
            self.inference.sync_model_from_actor().wait()
        self.online_router.sync_model_end()

    def run(self):
        global_pbar = tqdm(
            initial=0,
            total=self.cfg.runner.max_epochs,
            desc="Global Step",
            ncols=620,
        )

        self.online_router.server_start()
        self.server_rollout.server_start()
        self.run_timer.start_time()
        for _ in range(self.cfg.runner.max_epochs):
            with self.timer("step"):
                with self.timer("sync_weights"):
                    self._sync_weights()

                rollout_handle: Handle = self.server_rollout.rollout(
                    output_channel=self.dataloader_channel,
                )

                if self.recompute_logprobs:
                    # Recompute the previous-policy (and optionally reference) logprobs
                    infer_handle: Handle = self.inference.run_inference(
                        input_channel=self.dataloader_channel,
                        output_channel=self.inference_channel,
                        compute_ref_logprobs=self.compute_ref_logprobs,
                    )
                    inference_channel = self.inference_channel
                else:
                    infer_handle = None
                    inference_channel = self.dataloader_channel

                # Advantages and returns
                adv_handle: Handle = self.actor.compute_advantages_and_returns(
                    input_channel=self.inference_channel,
                    output_channel=self.actor_channel,
                )

            # Actor training
            actor_input_channel = self.actor_channel
            if self.is_pipeline:
                # In pipeline mode, the rollout results already contain the
                # advantages and returns, so the two steps above are no-ops and
                # the inference channel feeds the actor directly.
                actor_input_channel = inference_channel
            actor_handle: Handle = self.actor.run_training(
                input_channel=actor_input_channel,
            )

            metrics = actor_handle.wait()
            actor_rollout_metrics = metrics[0][0]
            actor_training_metrics = metrics[0][1]
            self.global_steps += 1

            run_time_exceeded = self.run_timer.is_finished()
            _, save_model, is_train_end = check_progress(
                self.global_steps,
                self.max_steps,
                self.cfg.runner.val_check_interval,
                self.cfg.runner.save_interval,
                1.0,
                run_time_exceeded=run_time_exceeded,
            )

            if save_model:
                self._save_checkpoint()

            if is_train_end:
                logging.info(
                    f"Step limit given by max_steps={self.max_steps} reached. Stopping run"
                )
                return

            if run_time_exceeded:
                logging.info(
                    f"Time limit given by run_timer={self.run_timer} reached. Stopping run"
                )
                return

            # Wait until the router has quiesced (old requests finished, new
            # requests blocked) so that it is safe to sync weights into SGLang.
            rollout_handle.wait()

            time_metrics = self.timer.consume_durations()
            time_metrics["training"] = actor_handle.consume_duration()
            time_metrics["advantage"] = adv_handle.consume_duration()
            if infer_handle is not None:
                # Inference time should be the min across ranks: different DP
                # ranks receive rollout results at different times, but a timer
                # barrier at the beginning of the PP schedule makes them all end
                # together, so only the min duration is meaningful.
                time_metrics["inference"] = infer_handle.consume_duration(
                    reduction_type="min"
                )

            logging_steps = (self.global_steps - 1) * self.cfg.algorithm.n_minibatches
            # Add prefixes to the metrics
            log_time_metrics = {f"time/{k}": v for k, v in time_metrics.items()}
            rollout_metrics = {
                f"rollout/{k}": v for k, v in actor_rollout_metrics.items()
            }

            self.metric_logger.log(log_time_metrics, logging_steps)
            self.metric_logger.log(rollout_metrics, logging_steps)
            for i in range(self.cfg.algorithm.n_minibatches):
                training_metrics = {
                    f"train/{k}": v for k, v in actor_training_metrics[i].items()
                }
                self.metric_logger.log(training_metrics, logging_steps + i)

            logging_metrics = time_metrics

            if self.cfg.actor.get("calculate_flops", False):
                flops_metrics = self._compute_flops_metrics(
                    time_metrics, actor_rollout_metrics
                )
                flops_metrics = {f"flops/{k}": v for k, v in flops_metrics.items()}
                self.metric_logger.log(flops_metrics, logging_steps)
                logging_metrics.update(flops_metrics)

            logging_metrics.update(actor_rollout_metrics)
            logging_metrics.update(actor_training_metrics[-1])

            global_pbar.set_postfix(logging_metrics)
            global_pbar.update(1)

        self.server_rollout.shutdown()
        self.online_router.server_stop()
        self.server_rollout.server_stop()
        # No need to wait for rollout_handle since the rollout service runs continuously
        self.metric_logger.finish()
diff --git a/rlinf/runners/math_runner.py b/rlinf/runners/math_runner.py
index 9dadbb78b..e3f5b3750 100644
--- a/rlinf/runners/math_runner.py
+++ b/rlinf/runners/math_runner.py
@@ -362,6 +362,8 @@ def run(self):
             )
 
             metrics = actor_handle.wait()
+            actor_rollout_metrics = metrics[0][0]
+            actor_training_metrics = metrics[0][1]
             self.global_steps += 1
 
             run_time_exceeded = 
self.run_timer.is_finished() @@ -407,13 +409,15 @@ def run(self): ) * self.cfg.algorithm.n_minibatches # add prefix to the metrics log_time_metrics = {f"time/{k}": v for k, v in time_metrics.items()} - rollout_metrics = {f"rollout/{k}": v for k, v in metrics[0][0].items()} + rollout_metrics = { + f"rollout/{k}": v for k, v in actor_rollout_metrics.items() + } self.metric_logger.log(log_time_metrics, logging_steps) self.metric_logger.log(rollout_metrics, logging_steps) for i in range(self.cfg.algorithm.n_minibatches): training_metrics = { - f"train/{k}": v for k, v in metrics[0][1][i].items() + f"train/{k}": v for k, v in actor_training_metrics[i].items() } self.metric_logger.log(training_metrics, logging_steps + i) @@ -421,14 +425,14 @@ def run(self): if self.cfg.actor.get("calculate_flops", False): flops_metrics = self._compute_flops_metrics( - time_metrics, metrics[0][0] + time_metrics, actor_rollout_metrics ) flops_metrics = {f"flops/{k}": v for k, v in flops_metrics.items()} self.metric_logger.log(flops_metrics, logging_steps) logging_metrics.update(flops_metrics) - logging_metrics.update(metrics[0][0]) - logging_metrics.update(metrics[0][1][-1]) + logging_metrics.update(actor_rollout_metrics) + logging_metrics.update(actor_training_metrics[-1]) global_pbar.set_postfix(logging_metrics) global_pbar.update(1) diff --git a/rlinf/utils/utils.py b/rlinf/utils/utils.py index a2fc29cdd..449412f8b 100644 --- a/rlinf/utils/utils.py +++ b/rlinf/utils/utils.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import atexit import gc import os import sys @@ -166,32 +167,37 @@ def wrapper(cfg, *args, **kwargs): ) os.makedirs(os.path.dirname(log_path), exist_ok=True) - with open(log_path, "w", encoding="utf-8", buffering=1) as f: - dual_out = DualOutput(f, sys.stdout) - dual_err = DualOutput(f, sys.stderr) - - old_stdout = sys.stdout - old_stderr = sys.stderr - try: - sys.stdout = dual_out - sys.stderr = dual_err - return func(cfg, *args, **kwargs) - - except Exception as e: - import traceback - - error_msg = f"\nException occurred: {e}\n{traceback.format_exc()}\n" - dual_err.write(error_msg) - dual_err.flush() - f.flush() - raise - - finally: - sys.stdout = old_stdout - sys.stderr = old_stderr - - dual_out.flush() - dual_err.flush() - f.flush() + f = open(log_path, "w", encoding="utf-8", buffering=1) + + def close(): + dual_out.flush() + dual_err.flush() + f.flush() + f.close() + + atexit.register(close) + + dual_out = DualOutput(f, sys.stdout) + dual_err = DualOutput(f, sys.stderr) + + old_stdout = sys.stdout + old_stderr = sys.stderr + try: + sys.stdout = dual_out + sys.stderr = dual_err + return func(cfg, *args, **kwargs) + + except Exception as e: + import traceback + + error_msg = f"\nException occurred: {e}\n{traceback.format_exc()}\n" + dual_err.write(error_msg) + dual_err.flush() + f.flush() + raise + + finally: + sys.stdout = old_stdout + sys.stderr = old_stderr return wrapper diff --git a/rlinf/workers/actor/megatron_actor_worker.py b/rlinf/workers/actor/megatron_actor_worker.py index 469f3416b..54376e1a5 100644 --- a/rlinf/workers/actor/megatron_actor_worker.py +++ b/rlinf/workers/actor/megatron_actor_worker.py @@ -31,6 +31,7 @@ from rlinf.algorithms.registry import ( actor_loss, calculate_adv_and_returns, + get_reward_fn, ) from rlinf.algorithms.utils import kl_penalty from rlinf.data.io_struct import ( @@ -79,7 +80,7 @@ seq_mean_token_sum, ) from rlinf.workers.rollout.utils import RankMapper -from 
toolkits.math_verifier.verify import math_verify_call +from toolkits import register_rewards class MegatronActor(MegatronModelManager, Worker): @@ -145,8 +146,8 @@ def __init__( # Reward configurations if not self.cfg.reward.use_reward_model: - assert self.cfg.reward.reward_type == "math", "only support math" - self.reward_fn = math_verify_call + register_rewards() + self.reward_fn = get_reward_fn(self.cfg.reward.reward_type) # Rollout configurations self.rollout_group_name = self.cfg.rollout.group_name @@ -873,6 +874,7 @@ def compute_rewards(self, input_channel: Channel, output_channel: Channel): input_channel: The input channel to read from. output_channel: The output channel to send results to. """ + assert self.reward_fn is not None, "reward_fn is not set" if self.is_pipeline: # In pipeline mode, rewards are computed in the rollout with self.worker_timer(): @@ -915,12 +917,16 @@ def _compute_batch_rewards( if torch.distributed.get_rank() == parallel_state.get_model_parallel_src_rank(): rewards = self.reward_fn(texts, answers) - reward_scores = [ - self.cfg.reward.reward_scale - if reward == 1 - else -self.cfg.reward.reward_scale - for reward in rewards - ] + if self.cfg.reward.reward_type == "math": + reward_scores = [ + self.cfg.reward.reward_scale + if reward == 1 + else -self.cfg.reward.reward_scale + for reward in rewards + ] + else: + reward_scores = rewards + all_reward_scores.extend(reward_scores) if len(all_reward_scores) > 0: diff --git a/rlinf/workers/rollout/server/online_router_worker.py b/rlinf/workers/rollout/server/online_router_worker.py new file mode 100644 index 000000000..ea2959dc6 --- /dev/null +++ b/rlinf/workers/rollout/server/online_router_worker.py @@ -0,0 +1,250 @@ +# Copyright 2025 The RLinf Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
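+
+# Usage sketch (illustrative only, not exercised by the tests in this patch):
+# the endpoint below mirrors the OpenAI completions API, and the host/port
+# defaults come from the server.online_router section of the YAML config, so a
+# client could call it roughly as follows:
+#
+#     import requests
+#
+#     resp = requests.post(
+#         "http://localhost:8081/v1/completions",
+#         json={"prompt": "def add(a, b):", "max_tokens": 64, "stream": False},
+#     )
+#     print(resp.json()["choices"][0]["text"])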
+ +import asyncio +import json +import random +import time +import uuid +from typing import Any, Dict, List, Optional + +import uvicorn +from fastapi import FastAPI +from fastapi.responses import StreamingResponse +from omegaconf.dictconfig import DictConfig +from pydantic import BaseModel + +from rlinf.scheduler import Worker +from rlinf.utils.placement import ComponentPlacement +from rlinf.workers.rollout.sglang.sglang_worker import AsyncSGLangWorker + + +class CompleteRequest(BaseModel): + """Complete request model.""" + + prompt: str + model: Optional[str] = None + max_tokens: Optional[int] = 1024 + temperature: Optional[float] = 0.7 + top_p: Optional[float] = 0.9 + top_k: Optional[int] = 50 + repetition_penalty: Optional[float] = 1.0 + stop: Optional[List[str]] = None + stream: Optional[bool] = False + + +class CompleteResponse(BaseModel): + """Complete response model.""" + + id: str + choices: List[Dict[str, Any]] + model: str + created: int + object: str = "text_completion" + + +class OnlineRouterWorker(Worker): + """Online router worker with FastAPI server for handling complete and completionTrack requests.""" + + def __init__(self, cfg: DictConfig, placement: ComponentPlacement): + Worker.__init__(self) + + self._cfg = cfg + + # Configuration + self._server_host = cfg.server.online_router.get("host", "0.0.0.0") + self._server_port = cfg.server.online_router.get("port", 8081) + self._rollout_instance_num = placement.rollout_dp_size + + # Sync weight state management + self._sync_model_lock = asyncio.Lock() + self._sync_model_in_progress = False + self._pending_requests: List[asyncio.Future] = [] + + # Request synchronization state + self._sync_in_progress = False + self._old_requests_complete = asyncio.Event() + self._new_requests_blocked = asyncio.Event() + self._new_requests_blocked.set() # Initially allow new requests + self._blocked_requests: List[asyncio.Future] = [] + + # Request tracking + self._active_requests: Dict[str, asyncio.Future] = {} + + # Setup FastAPI routes + self._setup_routes() + self._server_task = None + + def _setup_routes(self): + """Setup FastAPI routes.""" + app = FastAPI(title="OnlineRouterWorker", version="1.0.0") + app.add_api_route("/v1/completions", self._handle_complete, methods=["POST"]) + + # Init the HTTP server + self._server = uvicorn.Server( + uvicorn.Config( + app, host=self._server_host, port=self._server_port, log_level="info" + ) + ) + + def server_start(self): + """Start service.""" + assert self._server_task is None + + # Start server in background task + self._server_task = asyncio.create_task(self._server.serve()) + + self.log_info(f"service started on {self._server_host}:{self._server_port}") + + async def server_stop(self): + """Stop service.""" + assert self._server_task is not None + + # Stop the HTTP server + self._server.should_exit = True + + # Wait the HTTP server to stop + await self._server_task + + self._server_task = None + self.log_info("service stopped") + + async def _handle_complete(self, request: CompleteRequest): + """Handle complete requests with synchronization support.""" + request_id = str(uuid.uuid4()) + start_time = time.time() + + # Check if sync is in progress + if self._sync_in_progress: + # Wait for old requests to complete + await self._old_requests_complete.wait() + # Block new requests during sync + await self._new_requests_blocked.wait() + + # Create future for this request + future = asyncio.Future() + self._active_requests[request_id] = future + + try: + # Forward request to rollout worker + 
sglang_instance_id = random.randint(0, self._rollout_instance_num - 1) + generate_result = ( + await self.rollout_worker.execute_on(sglang_instance_id) + .agenerate(request.prompt, stop=request.stop) + .async_wait() + ) + generated_text = generate_result[0]["text"] + + if not request.stream: + # Create response + response = CompleteResponse( + id=str(request_id), + choices=[ + { + "text": generated_text, + "index": 0, + "logprobs": None, + "finish_reason": generate_result[0]["meta_info"][ + "finish_reason" + ]["type"], + } + ], + created=int(start_time), + model="test-model", + object="text_completion", + ) + else: + + def generate_stream(): + # Send final chunk with finish_reason + final_data = { + "id": request_id, + "object": "text_completion.chunk", + "created": int(start_time), + "model": "test-model", + "choices": [ + { + "text": generated_text, + "index": 0, + "logprobs": None, + "finish_reason": "stop", + } + ], + } + yield f"data: {json.dumps(final_data)}\n\n" + yield "data: [DONE]\n\n" + + response = StreamingResponse( + generate_stream(), + media_type="text/event-stream", + headers={ + "Cache-Control": "no-cache", + "Connection": "keep-alive", + "X-Accel-Buffering": "no", # Disable nginx buffering + }, + ) + + # Set future result + future.set_result(response) + return response + + finally: + # Clean up + if request_id in self._active_requests: + del self._active_requests[request_id] + + async def init_worker(self, rollout_worker: AsyncSGLangWorker): + """Initialize the worker.""" + self.rollout_worker = rollout_worker + + async def sync_model_start(self): + """Start model synchronization. Block new requests and wait for old ones to complete.""" + async with self._sync_model_lock: + assert not self._sync_in_progress + + self.log_info("Starting model synchronization...") + self._sync_in_progress = True + + # Clear the event to block new requests + self._new_requests_blocked.clear() + + # Wait for all existing requests to complete + if self._active_requests: + self.log_info( + f"Waiting for {len(self._active_requests)} active requests to complete..." + ) + # Wait for all active requests to finish + await asyncio.gather( + *self._active_requests.values(), return_exceptions=True + ) + + # Set event to indicate old requests are complete + self._old_requests_complete.set() + self.log_info("All old requests completed, sync can proceed") + + async def sync_model_end(self): + """End model synchronization. Resume processing of blocked requests.""" + async with self._sync_model_lock: + assert self._sync_in_progress + + self.log_info("Ending model synchronization...") + + # Reset sync state + self._sync_in_progress = False + self._old_requests_complete.clear() + + # Allow new requests to proceed + self._new_requests_blocked.set() + + self.log_info("Model synchronization completed, new requests can proceed") diff --git a/rlinf/workers/rollout/server/server_rollout_worker.py b/rlinf/workers/rollout/server/server_rollout_worker.py new file mode 100644 index 000000000..d95135462 --- /dev/null +++ b/rlinf/workers/rollout/server/server_rollout_worker.py @@ -0,0 +1,377 @@ +# Copyright 2025 The RLinf Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import json +import time +from datetime import datetime +from pathlib import Path +from typing import Any, Dict, Optional + +import torch +import uvicorn +from fastapi import FastAPI, Request, Response +from omegaconf import DictConfig +from transformers import AutoTokenizer + +from rlinf.data.io_struct import ( + RolloutResult, +) +from rlinf.scheduler import Channel, Worker + + +class TrainingDataStorage: + """Storage manager for training data received via HTTP API.""" + + def __init__(self, storage_config: Optional[Dict[str, Any]] = None): + """ + Initialize storage manager. + + Args: + storage_config: Configuration dict with options: + - enabled: bool, whether to enable storage (default: True) + - storage_dir: str, directory to store files (default: "./training_data") + - max_files_per_dir: int, max files per directory (default: 1000) + - compress: bool, whether to compress files (default: False) + """ + if storage_config is None: + storage_config = {} + + self.enabled = storage_config.get("enabled", True) + self.storage_dir = Path(storage_config.get("storage_dir", "./training_data")) + self.max_files_per_dir = storage_config.get("max_files_per_dir", 1000) + self.compress = storage_config.get("compress", False) + + # Create storage directory if enabled + if self.enabled: + self.storage_dir.mkdir(parents=True, exist_ok=True) + + # Track current file and entry count + self._current_file_path = None + self._entries_in_current_file = 0 + + def store_training_data(self, training_data: Dict[str, Any]) -> Optional[str]: + """ + Store training data to file. 
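+        Entries are appended in JSON Lines format (one JSON object per line),
+        which keeps appends cheap and lets stored data be replayed by reading
+        the file line by line.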
+ + Args: + training_data: The training data dictionary to store + + Returns: + Path to the stored file, or None if storage is disabled + """ + if not self.enabled: + return None + + # Add metadata + storage_entry = { + "timestamp": datetime.utcnow().isoformat(), + "stored_at": time.time(), + "data": training_data, + } + + # Get or create file for writing + file_path = self._get_current_file_path() + + # Write data based on format + self._write_jsonl_entry(file_path, storage_entry) + + return str(file_path) + + def _get_current_file_path(self) -> Path: + """Get the current file path for writing, creating new file if needed.""" + # Check if we need a new file + if ( + self._current_file_path is None + or self._entries_in_current_file >= self.max_files_per_dir + ): + # Create new file path + timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S_%f")[ + :-3 + ] # microseconds to milliseconds + filename = f"training_data_{timestamp}.jsonl" + if self.compress: + filename += ".gz" + + self._current_file_path = self.storage_dir / filename + self._entries_in_current_file = 0 + + return self._current_file_path + + def _write_jsonl_entry(self, file_path: Path, entry: Dict[str, Any]): + """Write entry to JSONL file (one JSON per line).""" + # JSONL is more efficient for appending + with open(file_path, "a", encoding="utf-8") as f: + json.dump(entry, f, ensure_ascii=False) + f.write("\n") + + self._entries_in_current_file += 1 + + def get_storage_stats(self) -> Dict[str, Any]: + """Get storage statistics.""" + if not self.enabled: + return {"enabled": False} + + stats = { + "enabled": True, + "storage_dir": str(self.storage_dir), + "current_file": str(self._current_file_path) + if self._current_file_path + else None, + "entries_in_current_file": self._entries_in_current_file, + "total_files": 0, + "total_size_bytes": 0, + } + + # Count files and calculate total size + if self.storage_dir.exists(): + for file_path in self.storage_dir.iterdir(): + if file_path.is_file(): + stats["total_files"] += 1 + stats["total_size_bytes"] += file_path.stat().st_size + + return stats + + +class ServerRolloutWorker(Worker): + """ + ServerRolloutWorker that supports both HTTP API and Channel interfaces. + It can receive training data from router's feedback_worker via HTTP + and also work with CodingOnlineRLRunner via Channel interface. 
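+    Incoming JSON is posted to /api/training/submit, optionally persisted via
+    TrainingDataStorage, and converted into RolloutResult batches for the
+    training pipeline.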
+
+    Key features:
+    - Unified data processing for both HTTP and Channel inputs
+    - Automatic rollout processing after server startup
+    - Compatible with CodingOnlineRLRunner interface
+    """
+
+    def __init__(self, cfg: DictConfig):
+        Worker.__init__(self)
+
+        self._cfg = cfg
+
+        # Initialize tokenizer for text processing
+        self._tokenizer = AutoTokenizer.from_pretrained(self._cfg.rollout.model_dir)
+
+        # Configuration
+        self._server_host = cfg.server.tracking_rollout.get("host", "0.0.0.0")
+        self._server_port = cfg.server.tracking_rollout.get("port", 8082)
+        self._enable_dummy_data = cfg.server.tracking_rollout.get(
+            "enable_dummy_data", False
+        )
+
+        # Unified data source for both HTTP and Channel data
+        self._data_source = asyncio.Queue()
+
+        # Initialize training data storage
+        # storage_config = getattr(self._cfg, 'storage', None)
+        storage_config = None
+        if storage_config is not None:
+            storage_config = dict(storage_config)
+        self._storage = TrainingDataStorage(storage_config)
+
+        # Processing configuration
+        self._max_new_tokens = getattr(
+            self._cfg.algorithm.sampling_params, "max_new_tokens", 512
+        )
+        self._batch_size = cfg.data.rollout_batch_size * cfg.algorithm.group_size
+
+        # Processing control
+        self._track_data_enable = False
+
+        # Set up FastAPI routes
+        self._setup_routes()
+        self._server_task = None
+
+    def _setup_routes(self):
+        """Set up FastAPI routes."""
+        app = FastAPI(title="ServerRolloutWorker", version="1.0.0")
+        app.add_route("/api/training/submit", self._handle_track, methods=["POST"])
+
+        # Init the HTTP server
+        self._server = uvicorn.Server(
+            uvicorn.Config(
+                app, host=self._server_host, port=self._server_port, log_level="info"
+            )
+        )
+
+    def server_start(self):
+        """Start service."""
+        assert self._server_task is None
+
+        # Start server in background task
+        self._server_task = asyncio.create_task(self._server.serve())
+
+        self.log_info(f"service started on {self._server_host}:{self._server_port}")
+
+    async def server_stop(self):
+        """Stop service."""
+        assert self._server_task is not None
+
+        # Stop the HTTP server
+        self._server.should_exit = True
+
+        # Wait for the HTTP server to stop
+        await self._server_task
+
+        self._server_task = None
+        self.log_info("service stopped")
+
+    async def _handle_track(self, request: Request):
+        """Handle training data submission from router's feedback_worker."""
+        # Parse incoming training data
+        training_data = await request.json()
+
+        self.log_debug(
+            f"Received training data: {training_data.get('metadata', {}).get('request_id', 'unknown')}"
+        )
+
+        training_data["received_at"] = time.time()
+
+        if self._track_data_enable:
+            # Store training data to file (async, non-blocking)
+            storage_path = self._storage.store_training_data(training_data)
+            if storage_path:
+                training_data["storage_path"] = storage_path
+                self.log_debug(f"Training data stored to: {storage_path}")
+
+            # Put data into unified data source
+            await self._data_source.put(training_data)
+
+        # Return response to router
+        response_data = {
+            "status": "submitted",
+            "message": "Training data submitted successfully",
+            "queue_position": self._data_source.qsize(),
+        }
+
+        return Response(
+            content=json.dumps(response_data),
+            media_type="application/json",
+        )
+
+    def _convert_training_data_to_rollout_result(
+        self, training_data: Dict[str, Any]
+    ) -> RolloutResult:
+        """Convert training data from HTTP request into RolloutResult format."""
+        # Extract text data
+        input_text = training_data.get("prompt", "")
+        
output_text = training_data.get("completion", "") + reward_score = training_data.get("accepted", 0.0) + assert input_text is not None + assert output_text is not None + + # Tokenize texts + input_encoding = self._tokenizer( + input_text, + return_tensors="pt", + truncation=True, + max_length=self._cfg.runner.seq_length - self._max_new_tokens, + ) + input_ids = input_encoding["input_ids"][0].tolist() + + output_encoding = self._tokenizer( + text=output_text, + return_tensors="pt", + truncation=True, + max_length=self._max_new_tokens, + ) + output_ids = output_encoding["input_ids"][0].tolist() + + # Create RolloutResult with the feedback data + group_size = getattr(self._cfg.algorithm, "group_size", 1) + + rollout_result = RolloutResult( + num_sequence=1, + group_size=group_size, + prompt_lengths=[len(input_ids)], + prompt_ids=[input_ids], + response_lengths=[len(output_ids)], + response_ids=[output_ids], + is_end=[True], # Assume the response is complete + rewards=torch.tensor([reward_score], dtype=torch.float32).reshape(-1, 1), + advantages=[0.0], # Will be computed later in the training pipeline + prompt_texts=[input_text], + response_texts=[output_text], + answers=[output_text], + ) + + self.log_debug( + f"Created RolloutResult from HTTP data with reward {reward_score}" + ) + + return rollout_result + + async def _process_unified_data_continuously(self, output_channel: Channel): + """Continuously process data from the unified data source.""" + self.log_info("Starting continuous unified data processing") + + # clear existing data in self._data_source + while not self._data_source.empty(): + self._data_source.get_nowait() + + # start tracking new data + self._track_data_enable = True + if self._enable_dummy_data: + for i in range(self._batch_size): + data = { + "prompt": "Hello, world!", + "completion": "Hello, world!", + "accepted": 1.0, + } + await self._data_source.put(data) + + for i in range(self._batch_size): + # Get data from unified source (either HTTP or Channel) + data = await self._data_source.get() + + # Convert data to RolloutResult based on source type + rollout_result = self._convert_training_data_to_rollout_result(data) + + # Send result to output channel if available + await output_channel.put(item=rollout_result, async_op=True).async_wait() + # log the qsize of the output channel + self.log_debug(f"Output channel qsize: {output_channel.qsize()}") + + # Mark task as done + self._data_source.task_done() + self._track_data_enable = False + + self.log_info("Continuous unified data processing stopped") + + async def rollout(self, output_channel: Channel): + """Run HTTP server and start automatic data processing.""" + + # Start automatic processing + await self._process_unified_data_continuously(output_channel) + + self.log_info( + "ServerRolloutWorker is running with HTTP server and auto processing" + ) + + def init_worker(self): + """Initialize the worker (sync version).""" + + self.log_info("ServerRolloutWorker initialized") + + async def shutdown(self): + """Shutdown the server and cleanup resources.""" + self.log_info("Shutting down ServerRolloutWorker") + + while not self._data_source.empty(): + self._data_source.get_nowait() + + self.log_info("ServerRolloutWorker shutdown complete") diff --git a/rlinf/workers/rollout/sglang/sglang_worker.py b/rlinf/workers/rollout/sglang/sglang_worker.py index 96c723284..8d4a15cb7 100644 --- a/rlinf/workers/rollout/sglang/sglang_worker.py +++ b/rlinf/workers/rollout/sglang/sglang_worker.py @@ -13,8 +13,9 @@ # limitations under the 
License. import asyncio +import copy import dataclasses -from typing import Dict, List, Tuple +from typing import Dict, List, Optional, Tuple import numpy as np import torch @@ -144,6 +145,8 @@ def _get_sampling_param_from_config(self) -> dict: "repetition_penalty": cfg_sampling_params.repetition_penalty, "max_new_tokens": cfg_sampling_params.max_new_tokens, } + if "stop" in cfg_sampling_params: + sampling_params["stop"] = cfg_sampling_params["stop"] return sampling_params def _stop(self): @@ -391,3 +394,16 @@ def shutdown(self): self.log_info(f"Shutting down SGLang worker {self._rank} ...") self._engine.shutdown() self.log_info(f"SGLang worker {self._rank} shutdown complete.") + + async def agenerate(self, prompt: str, stop: Optional[List[str]] = None): + sampling_params = self._sampling_params + if stop is not None: + sampling_params = copy.deepcopy(sampling_params) + sampling_params["stop"] = stop + + result = await self._engine.async_generate( + prompt=prompt, + sampling_params=sampling_params, + return_logprob=self._return_logprobs, + ) + return result diff --git a/tests/e2e_tests/coding_online_rl/qwen2.5-1.5b-ppo.yaml b/tests/e2e_tests/coding_online_rl/qwen2.5-1.5b-ppo.yaml new file mode 100644 index 000000000..1de266648 --- /dev/null +++ b/tests/e2e_tests/coding_online_rl/qwen2.5-1.5b-ppo.yaml @@ -0,0 +1,300 @@ +defaults: + - override hydra/job_logging: stdout + +hydra: + run: + dir: . + output_subdir: null + +cluster: + num_nodes: 1 + component_placement: + rollout: 0-3 + inference: 4-5 + actor: 6-7 + +runner: + task_type: coding_online_rl + logger: + log_path: ${runner.output_dir}/${runner.experiment_name} + project_name: rlinf + experiment_name: ${runner.experiment_name} + logger_backends: ["tensorboard"] # wandb, swanlab + + max_epochs: 10 + max_steps: -1 + + val_check_interval: 1 + save_interval: 10 + + seq_length: 2560 + + enable_dynamic_batch_size: False + max_tokens_per_mbs: 2560 + + resume_dir: null + experiment_name: online-ppo-1.5b-pipeline + output_dir: ../results + +algorithm: + group_size: 1 + + n_minibatches: 2 + training_batch_size_per_gpu: 1 # micro batch size + rollout_batch_size_per_gpu: null # If set to null, rollout_batch_size will be evenly divided across all inference instances. You can reduce this parameter if inference consumes too much GPU memory. 
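+  # Worked example: with data.rollout_batch_size=16 and group_size=1 as set
+  # in this file, the server rollout worker consumes 16 tracked completions
+  # from its queue per training step.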
+ + # mbs to do log prob inference, can be set to + # lower than rollout_batch_size_per_gpu to reduce + # memory usage + logprob_forward_micro_batch_size: 1 # ${.rollout_batch_size_per_gpu} + + # val rollout mbs + val_rollout_batch_size_per_gpu: 4 # ${.rollout_batch_size_per_gpu} + + recompute_logprobs: True + shuffle_rollout: False + + max_num_gen_batches: 1 + + # PPO loss params (no critic model) + loss_type: math_ppo_actor + loss_agg_func: "token-mean" + kl_beta: 0.0 # 0.001 + kl_penalty_type: low_var_kl + ratio_clip_eps: 0.2 + entropy_bonus: 0.0 + calculate_entropy: False + clip_ratio_c: null # 3.0 + + # Control critic usage (similar to AReaL's disable_head) + use_critic: False # Disable critic model + use_value_loss: False # Disable value loss computation + + # PPO parameters for no-critic setup + gamma: 0.99 + gae_lambda: 0.95 + # value_clip and huber_delta not needed without critic + + # Use no-critic GAE advantage computation + adv_type: math_gae_no_critic + normalize_advantages: False + early_stop_imp_ratio: 5.0 + use_valid_token_scale: False + + # params for rollout + sampling_params: + use_greedy: False + temperature: 0.1 + top_k: 1000000 + top_p: 1.0 + repetition_penalty: 1.0 + max_new_tokens: ${subtract:${runner.seq_length}, ${data.max_prompt_length}} + min_new_tokens: 1 + stop: [ + "<|endoftext|>", + "<|fim_prefix|>", + "<|fim_middle|>", + "<|fim_suffix|>", + "<|fim_pad|>", + "<|repo_name|>", + "<|file_sep|>", + "<|im_start|>", + "<|im_end|>", + ] + +inference: + model_arch: ${rollout.model_arch} + group_name: "InferenceGroup" + load_from_actor: True + model: + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 + sequence_parallel: True + +rollout: + group_name: "RolloutGroup" + + gpu_memory_utilization: 0.55 + + model_dir: /workspace/dataset/Qwen2.5-Coder-1.5B + model_arch: qwen2.5 + enforce_eager: False # if False, rollout engine will capture cuda graph, which will take more time to initialize. + distributed_executor_backend: mp # ray or mp + disable_log_stats: False + detokenize: True # Whether to detokenize the output. During RL we actually don't need to detokenize it. Can be set to True for debugging. + padding: null # will be tokenizer.pad_token_id if null. it is used to filter megatron's padding for rollout engine + eos: null # will be tokenizer.eos_token_id if null. + + rollout_backend: sglang # online_rl now only support sglang + + sglang: + attention_backend: triton # [flashinfer, triton] for more, see sglang's doc + decode_log_interval: 500000 # the interval for SGLang to log the decode time and other stats. + use_torch_compile: False # enable torch_compile in SGLang for rollout. + torch_compile_max_bs: 128 # the maximum batch size for torch compile. If the batch size is larger than this, torch compile will not be used. + + vllm: + # not used, but reserved to pass config validate + attention_backend: FLASH_ATTN #[FLASH_ATTN,XFORMERS] for more, see vllm's doc + enable_chunked_prefill: True # enable vllm to use chunked_prefill. + enable_prefix_caching: True # enable vllm to use prefix_caching. + enable_flash_infer_sampler: True # if True, vllm will use flashinfer to do sampling. + + return_logprobs: ${not:${algorithm.recompute_logprobs}} + + tensor_parallel_size: 1 + pipeline_parallel_size: 1 + + validate_weight: False # whether to send all weights at first for weight comparison. + validate_save_dir: null # the directory to save the weights for comparison. If validate_weight is True, this will be used to save the weights for comparison. 
+ print_outputs: False # whether to print the outputs (token ids, texts, etc.) of rollout engine. + + max_running_requests: 64 # the maximum number of running requests in rollout engine. + cuda_graph_max_bs: 128 # the maximum batch size for cuda graph. If the batch size is larger than this, cuda graph will not be used. + +data: + max_prompt_length: 1024 + rollout_batch_size: 16 + seed: 1234 + +actor: + group_name: "ActorGroup" + training_backend: megatron + mcore_gpt: True + spec_name: decoder_gpt + + checkpoint_load_path: null + + offload_optimizer: True + offload_weight: True + offload_grad: True + + enable_dp_load_balance: False + + calculate_flops: False + + seed: 1234 + + model: + precision: fp16 + add_bias_linear: False + + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 + + activation: swiglu + sequence_parallel: True + # recompute_method: block + # recompute_granularity: selective + + recompute_method: block + recompute_granularity: full + recompute_num_layers: 20 + + seq_length: ${runner.seq_length} + encoder_seq_length: ${runner.seq_length} + + normalization: rmsnorm + + position_embedding_type: rope + + apply_rope_fusion: True + bias_dropout_fusion: False + persist_layer_norm: False + bias_activation_fusion: False + attention_softmax_in_fp32: True + batch_p2p_comm: False + variable_seq_lengths: True + gradient_accumulation_fusion: False + moe_token_dispatcher_type: alltoall + use_cpu_initialization: False + + optim: + optimizer: adam + bf16: True + fp16: False + lr: 2e-06 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_eps: 1.0e-05 + min_lr: 2.0e-7 + weight_decay: 0.05 + use_distributed_optimizer: True + overlap_grad_reduce: False + overlap_param_gather: False + optimizer_enable_pin: False + overlap_param_gather_with_optimizer_step: False + clip_grad: 0.8 + loss_scale: 65536 + + lr_sched: + lr_warmup_fraction: 0.01 + lr_warmup_init: 0.0 + lr_warmup_iters: 0 + max_lr: 2.0e-6 + min_lr: 0.0 + lr_decay_style: constant + lr_decay_iters: 10 + + tokenizer: + tokenizer_model: ${rollout.model_dir} + use_fast: False + trust_remote_code: True + padding_side: 'right' + + megatron: + ddp_bucket_size: null + distributed_backend: nccl # Support 'nccl' and 'gloo' + distributed_timeout_minutes: 30 + ckpt_format: torch + use_dist_ckpt: False + tp_comm_bootstrap_backend: nccl + tp_comm_overlap_cfg: null # tp_comm_overlap_cfg.yaml + use_hf_ckpt: True # if true, will transfer hf model to generate megatron checkpoint and use it for training. 
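+    # When use_hf_ckpt is true, the runner calls toolkits.ckpt_convertor's
+    # convert_hf_to_mg at startup to materialize a Megatron checkpoint from
+    # ckpt_convertor.hf_model_path before training starts.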
+ use_profiler: False # if true, will enable torch profiler when training, pay attention it has influence on performance + + ckpt_convertor: # config for ckpt convertor + model: Qwen2.5-Coder-1.5B + model_type: null # will be set by hf model's config if null + hf_model_path: ${rollout.model_dir} # path to the hf model + save_path: ${runner.output_dir}/${runner.experiment_name}/converted_ckpts/actor + use_gpu_num : 0 + use_gpu_index: null + process_num: 16 # number of processes to use for checkpointing + tensor_model_parallel_size: ${actor.model.tensor_model_parallel_size} + pipeline_model_parallel_size: ${actor.model.pipeline_model_parallel_size} + + profiler: # profile megatron when inference and traning + output_dir: ${runner.output_dir}/${runner.experiment_name}/profiler + activities: ["cpu", "cuda"] + record_shapes: False + profile_memory: False + with_stack: False + with_flops: False + with_modules: True + export_tensorboard: True + export_chrome_trace: False + chrome_filename_prefix: "chrome_trace" + schedule_warmup: 2 + schedule_active: 1 + schedule_repeat: 1 # inference and training will repeat such times + # schedule_wait: it will be set at runtime + +reward: + use_reward_model: False + reward_type: fim_verify_call + reward_scale: 5.0 + +critic: + use_critic_model: False + +server: + # online serving and user reward track + online_router: + host: 0.0.0.0 + port: 8081 + + tracking_rollout: + host: 0.0.0.0 + port: 8082 + enable_dummy_data: True diff --git a/tests/e2e_tests/coding_online_rl/run_coding_online_rl.sh b/tests/e2e_tests/coding_online_rl/run_coding_online_rl.sh new file mode 100644 index 000000000..7d089861e --- /dev/null +++ b/tests/e2e_tests/coding_online_rl/run_coding_online_rl.sh @@ -0,0 +1,13 @@ +#! /bin/bash +set -x + +tabs 4 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export TOKENIZERS_PARALLELISM=false +export RAY_DEDUP_LOGS=0 + +export PYTHONPATH=${REPO_PATH}:$PYTHONPATH + + +python ${REPO_PATH}/examples/coding_online_rl/main_coding_online_rl.py --config-path ${REPO_PATH}/tests/e2e_tests/coding_online_rl --config-name qwen2.5-1.5b-ppo + diff --git a/toolkits/__init__.py b/toolkits/__init__.py index 5b365ea1e..8b6f0114a 100644 --- a/toolkits/__init__.py +++ b/toolkits/__init__.py @@ -11,3 +11,22 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
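+
+# Reward functions self-register on import. Registration is best-effort: a
+# missing optional dependency (e.g. fuzzywuzzy for the code verifier) skips
+# that reward instead of breaking unrelated runs. Typical lookup (sketch):
+#
+#     register_rewards()
+#     reward_fn = get_reward_fn("fim_verify_call")
+#     scores = reward_fn(responses, references)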
+ + +from rlinf.algorithms.registry import get_reward_fn + + +def register_rewards(): + try: + from toolkits.code_verifier.verify import fim_verify_call + + assert get_reward_fn("fim_verify_call") == fim_verify_call + except ImportError: + pass + + try: + from toolkits.math_verifier.verify import math_verify_call + + assert get_reward_fn("math") == math_verify_call + except ImportError: + pass diff --git a/toolkits/ckpt_convertor/convert_hf_to_middle_file.py b/toolkits/ckpt_convertor/convert_hf_to_middle_file.py index 92983ffbb..40e3d615a 100644 --- a/toolkits/ckpt_convertor/convert_hf_to_middle_file.py +++ b/toolkits/ckpt_convertor/convert_hf_to_middle_file.py @@ -116,27 +116,15 @@ def convert_layer( should_load_prefix.update( (k for k in hfst_loader.keys() if k.startswith("model.embed_tokens.")) ) - if not convert_config.tie_word_embeddings: - model_strategy_map.update( - { - "embedding.word_embeddings.weight": ( - "copy", - linear_trans, - "model.embed_tokens.weight", - ), - } - ) - else: - model_strategy_map.update( - { - "embedding.word_embeddings.weight": ( - "copy_equal", - linear_trans, - "model.embed_tokens.weight", - "lm_head.weight", - ), - } - ) + model_strategy_map.update( + { + "embedding.word_embeddings.weight": ( + "copy", + linear_trans, + "model.embed_tokens.weight", + ), + } + ) elif layer_idx == num_layers + 1: should_load_prefix.update( ( diff --git a/toolkits/ckpt_convertor/convert_middle_file_to_hf.py b/toolkits/ckpt_convertor/convert_middle_file_to_hf.py index 01208f18a..c48246a5f 100644 --- a/toolkits/ckpt_convertor/convert_middle_file_to_hf.py +++ b/toolkits/ckpt_convertor/convert_middle_file_to_hf.py @@ -293,7 +293,6 @@ def convert_layer(args, mfst_loader: STLoaderLazy, saver: HFSTSaver, layer_idx): layernorm_trans, "decoder.final_layernorm.weight", ), - "lm_head.weight": ("copy", linear_trans, "output_layer.weight"), } ) if not args.tie_word_embeddings: @@ -302,16 +301,6 @@ def convert_layer(args, mfst_loader: STLoaderLazy, saver: HFSTSaver, layer_idx): "lm_head.weight": ("copy", linear_trans, "output_layer.weight"), } ) - else: - model_strategy_map.update( - { - "lm_head.weight": ( - "copy", - linear_trans, - "embedding.word_embeddings.weight", - ), - } - ) else: should_load_prefix.update( ( diff --git a/toolkits/ckpt_convertor/default_args.yaml b/toolkits/ckpt_convertor/default_args.yaml index 4065992e2..1c89ee0e4 100644 --- a/toolkits/ckpt_convertor/default_args.yaml +++ b/toolkits/ckpt_convertor/default_args.yaml @@ -65,6 +65,16 @@ explict_model: head_dim: 128 num_layers: 28 tie_word_embeddings: true + 'Qwen2.5-Coder-1.5B': + model_type: qwen_2 + num_attention_heads: 12 + num_query_groups: 2 + head_dim: 128 + num_layers: 28 + te_ln_linear_qkv: True + te_ln_linear_mlp_fc1: True + te_ln_add_extra_state: True # pay attention if precision is fp8 mixture + tie_word_embeddings: true model_type: deepseek: diff --git a/toolkits/code_verifier/__init__.py b/toolkits/code_verifier/__init__.py new file mode 100644 index 000000000..5b365ea1e --- /dev/null +++ b/toolkits/code_verifier/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2025 The RLinf Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/toolkits/code_verifier/verify.py b/toolkits/code_verifier/verify.py new file mode 100644 index 000000000..0ad756d02 --- /dev/null +++ b/toolkits/code_verifier/verify.py @@ -0,0 +1,42 @@ +# Copyright 2025 The RLinf Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List + +try: + from fuzzywuzzy import fuzz + + FUZZY_AVAILABLE = True +except ImportError: + fuzz = None + FUZZY_AVAILABLE = False +from rlinf.algorithms.registry import register_reward_fn + + +@register_reward_fn("fim_verify_call") +def fim_verify_call( + responses: List[str], + references: List[str], +) -> List: + assert FUZZY_AVAILABLE, "fuzzywuzzy is not installed" + assert len(responses) == len(references), ( + len(responses), + len(references), + ) + + rewards = [] + for resp, ref in zip(responses, references): + fuzzy_sim = fuzz.ratio(resp.strip(), ref.strip()) / 100 + rewards.append(fuzzy_sim) + return rewards diff --git a/toolkits/math_verifier/verify.py b/toolkits/math_verifier/verify.py index 89cd9ef1e..80bf4b552 100644 --- a/toolkits/math_verifier/verify.py +++ b/toolkits/math_verifier/verify.py @@ -23,6 +23,7 @@ from sympy.parsing.latex import parse_latex from sympy.parsing.sympy_parser import parse_expr +from rlinf.algorithms.registry import register_reward_fn from toolkits.math_verifier.parser import extract_answer global_executor = ProcessPoolExecutor(max_workers=40) @@ -382,6 +383,7 @@ def verify_math_solution(answer: str, solution: str): return process_results(answer, solution)[0] +@register_reward_fn("math") def math_verify_call( responses: List[str], references: List[str], From 5cdfe4fc7330c87240335c4322259e5d7e841e0b Mon Sep 17 00:00:00 2001 From: Hongzhi Zang Date: Sat, 11 Oct 2025 15:30:31 +0800 Subject: [PATCH 16/57] fix: maniskill render, sim stage seed, rename step-level to action-level for reward type; support chunk-level with mask setting (#150) * fix: maniskill_render; simulator_stage_seed; quickstart; reward_type step_level to action_level; support chunk-level reward with loss mask in grpo; del abs path; split rank and seed-offset; fix eval pipeline bug; also fix for libero and robotwin Signed-off-by: hongzhi --- .../rst_source/tutorials/user/yaml.rst | 2 +- .../rst_source/tutorials/user/yaml.rst | 2 +- .../config/libero_10_grpo_openvlaoft.yaml | 2 +- .../libero_10_grpo_openvlaoft_eval.yaml | 2 +- .../config/libero_goal_grpo_openvlaoft.yaml | 2 +- .../config/libero_object_grpo_openvlaoft.yaml | 2 +- .../libero_spatial_grpo_openvlaoft.yaml | 2 +- .../config/maniskill_grpo_openvla.yaml | 2 +- .../config/maniskill_grpo_openvlaoft.yaml | 3 +- 
.../config/maniskill_ppo_openvla.yaml | 2 +- .../maniskill_ppo_openvla_quickstart.yaml | 12 +++---- rlinf/envs/env_manager.py | 20 +++++++---- rlinf/envs/libero/libero_env.py | 18 +++++----- rlinf/envs/maniskill/maniskill_env.py | 15 +++++--- .../maniskill/tasks/put_on_in_scene_multi.py | 32 +++++++---------- rlinf/envs/robotwin/RoboTwin_env.py | 6 ++-- rlinf/workers/actor/fsdp_actor_worker.py | 4 +++ rlinf/workers/env/env_worker.py | 36 +++++++++++-------- .../workers/rollout/hf/huggingface_worker.py | 12 ++++--- 19 files changed, 96 insertions(+), 80 deletions(-) diff --git a/docs/source-en/rst_source/tutorials/user/yaml.rst b/docs/source-en/rst_source/tutorials/user/yaml.rst index bac3f47be..51c0c691e 100644 --- a/docs/source-en/rst_source/tutorials/user/yaml.rst +++ b/docs/source-en/rst_source/tutorials/user/yaml.rst @@ -781,7 +781,7 @@ algorithm ``algorithm.rollout_epoch``: Number of rollout epochs per training step. -``algorithm.reward_type``: Reward aggregation level (chunk_level, token_level, step_level). +``algorithm.reward_type``: Reward aggregation level (chunk_level, action_level). ``algorithm.logprob_type``: Log probability computation level. diff --git a/docs/source-zh/rst_source/tutorials/user/yaml.rst b/docs/source-zh/rst_source/tutorials/user/yaml.rst index e5c00f776..b169c4ae8 100644 --- a/docs/source-zh/rst_source/tutorials/user/yaml.rst +++ b/docs/source-zh/rst_source/tutorials/user/yaml.rst @@ -725,7 +725,7 @@ algorithm ``algorithm.rollout_epoch``:每个训练步骤前的 rollout 轮数。 -``algorithm.reward_type``:奖励聚合层级(chunk_level、token_level、step_level)。 +``algorithm.reward_type``:奖励聚合层级(chunk_level、action_level)。 ``algorithm.logprob_type``:对数概率的计算层级。 diff --git a/examples/embodiment/config/libero_10_grpo_openvlaoft.yaml b/examples/embodiment/config/libero_10_grpo_openvlaoft.yaml index 7d76f2c5c..40a43e3aa 100644 --- a/examples/embodiment/config/libero_10_grpo_openvlaoft.yaml +++ b/examples/embodiment/config/libero_10_grpo_openvlaoft.yaml @@ -45,7 +45,7 @@ algorithm: n_eval_chunk_steps: ${int_div:${env.train.max_episode_steps}, ${actor.model.num_action_chunks}} num_group_envs: 8 rollout_epoch: 8 - reward_type: step_level # step_level or chunk_level + reward_type: action_level # action_level or chunk_level logprob_type: token_level entropy_type: token_level diff --git a/examples/embodiment/config/libero_10_grpo_openvlaoft_eval.yaml b/examples/embodiment/config/libero_10_grpo_openvlaoft_eval.yaml index 73a0d1511..4aad72ffa 100644 --- a/examples/embodiment/config/libero_10_grpo_openvlaoft_eval.yaml +++ b/examples/embodiment/config/libero_10_grpo_openvlaoft_eval.yaml @@ -45,7 +45,7 @@ algorithm: n_eval_chunk_steps: ${int_div:${env.train.max_episode_steps}, ${actor.model.num_action_chunks}} num_group_envs: 8 rollout_epoch: 8 - reward_type: step_level # step_level or chunk_level + reward_type: action_level # action_level or chunk_level logprob_type: token_level entropy_type: token_level diff --git a/examples/embodiment/config/libero_goal_grpo_openvlaoft.yaml b/examples/embodiment/config/libero_goal_grpo_openvlaoft.yaml index b95bad55c..d6356c314 100644 --- a/examples/embodiment/config/libero_goal_grpo_openvlaoft.yaml +++ b/examples/embodiment/config/libero_goal_grpo_openvlaoft.yaml @@ -45,7 +45,7 @@ algorithm: n_eval_chunk_steps: ${int_div:${env.train.max_episode_steps}, ${actor.model.num_action_chunks}} num_group_envs: 8 rollout_epoch: 8 - reward_type: step_level # step_level or chunk_level + reward_type: action_level # action_level or chunk_level logprob_type: token_level entropy_type: 
token_level diff --git a/examples/embodiment/config/libero_object_grpo_openvlaoft.yaml b/examples/embodiment/config/libero_object_grpo_openvlaoft.yaml index a75a87596..b767fd25a 100644 --- a/examples/embodiment/config/libero_object_grpo_openvlaoft.yaml +++ b/examples/embodiment/config/libero_object_grpo_openvlaoft.yaml @@ -45,7 +45,7 @@ algorithm: n_eval_chunk_steps: ${int_div:${env.train.max_episode_steps}, ${actor.model.num_action_chunks}} num_group_envs: 8 rollout_epoch: 8 - reward_type: step_level # step_level or chunk_level + reward_type: action_level # action_level or chunk_level logprob_type: token_level entropy_type: token_level diff --git a/examples/embodiment/config/libero_spatial_grpo_openvlaoft.yaml b/examples/embodiment/config/libero_spatial_grpo_openvlaoft.yaml index bab9bd4cd..69aec20eb 100644 --- a/examples/embodiment/config/libero_spatial_grpo_openvlaoft.yaml +++ b/examples/embodiment/config/libero_spatial_grpo_openvlaoft.yaml @@ -45,7 +45,7 @@ algorithm: n_eval_chunk_steps: ${int_div:${env.train.max_episode_steps}, ${actor.model.num_action_chunks}} num_group_envs: 8 rollout_epoch: 8 - reward_type: step_level # step_level or chunk_level + reward_type: action_level # action_level or chunk_level logprob_type: token_level entropy_type: token_level diff --git a/examples/embodiment/config/maniskill_grpo_openvla.yaml b/examples/embodiment/config/maniskill_grpo_openvla.yaml index 4b0500673..16dc2af06 100644 --- a/examples/embodiment/config/maniskill_grpo_openvla.yaml +++ b/examples/embodiment/config/maniskill_grpo_openvla.yaml @@ -51,7 +51,7 @@ algorithm: n_chunk_steps: 80 n_eval_chunk_steps: 80 - reward_type: step_level + reward_type: action_level logprob_type: token_level entropy_type: token_level diff --git a/examples/embodiment/config/maniskill_grpo_openvlaoft.yaml b/examples/embodiment/config/maniskill_grpo_openvlaoft.yaml index cb752180c..def45aafb 100644 --- a/examples/embodiment/config/maniskill_grpo_openvlaoft.yaml +++ b/examples/embodiment/config/maniskill_grpo_openvlaoft.yaml @@ -46,8 +46,7 @@ algorithm: n_eval_chunk_steps: 10 num_group_envs: 32 rollout_epoch: 1 - - reward_type: step_level + reward_type: action_level # action_level or chunk_level logprob_type: token_level entropy_type: token_level diff --git a/examples/embodiment/config/maniskill_ppo_openvla.yaml b/examples/embodiment/config/maniskill_ppo_openvla.yaml index 9315b7db8..6aeae632d 100644 --- a/examples/embodiment/config/maniskill_ppo_openvla.yaml +++ b/examples/embodiment/config/maniskill_ppo_openvla.yaml @@ -46,7 +46,7 @@ algorithm: n_eval_chunk_steps: 80 num_group_envs: 128 rollout_epoch: 1 - reward_type: step_level + reward_type: action_level logprob_type: action_level entropy_type: action_level diff --git a/examples/embodiment/config/maniskill_ppo_openvla_quickstart.yaml b/examples/embodiment/config/maniskill_ppo_openvla_quickstart.yaml index b277418f4..969dc85cb 100644 --- a/examples/embodiment/config/maniskill_ppo_openvla_quickstart.yaml +++ b/examples/embodiment/config/maniskill_ppo_openvla_quickstart.yaml @@ -23,7 +23,7 @@ runner: experiment_name: "test_openvla" logger_backends: ["tensorboard"] # wandb, swanlab - max_epochs: 5 + max_epochs: 1000 max_steps: -1 only_eval: False @@ -41,14 +41,14 @@ algorithm: kl_penalty: kl # how to estimate kl divergence: kl or kl_penalty group_size: 1 - n_chunk_steps: 10 - n_eval_chunk_steps: 10 + n_chunk_steps: 80 + n_eval_chunk_steps: 80 # training rollout mbs rollout_micro_batch_size: 64 - num_group_envs: 8 + num_group_envs: 32 rollout_epoch: 1 - reward_type: 
step_level + reward_type: action_level logprob_type: action_level entropy_type: action_level @@ -123,7 +123,7 @@ actor: checkpoint_save_path: "../results" micro_batch_size: 20 - global_batch_size: 80 + global_batch_size: 160 seed: 1234 enable_offload: True diff --git a/rlinf/envs/env_manager.py b/rlinf/envs/env_manager.py index dc34b1f63..18d9bacbd 100644 --- a/rlinf/envs/env_manager.py +++ b/rlinf/envs/env_manager.py @@ -157,10 +157,13 @@ def recursive_to_own(obj): class EnvManager: - def __init__(self, cfg, rank, world_size, env_cls, enable_offload=False): + def __init__( + self, cfg, rank, seed_offset, total_num_processes, env_cls, enable_offload=False + ): self.cfg = cfg self.rank = rank - self.world_size = world_size + self.seed_offset = seed_offset + self.total_num_processes = total_num_processes self.process: Optional[mp.Process] = None self.command_queue: Optional[mp.Queue] = None self.result_queue: Optional[mp.Queue] = None @@ -181,7 +184,7 @@ def __init__(self, cfg, rank, world_size, env_cls, enable_offload=False): self.env = None else: self.env_cls = env_cls - self.env = self.env_cls(cfg, rank, world_size) + self.env = self.env_cls(cfg, seed_offset, total_num_processes) def start_simulator(self): """Start simulator process with shared memory queues""" @@ -202,7 +205,8 @@ def start_simulator(self): args=( self.cfg, self.rank, - self.world_size, + self.seed_offset, + self.total_num_processes, self.env_cls, self.command_queue, self.result_queue, @@ -277,7 +281,8 @@ def __setattr__(self, name, value): if name in [ "cfg", "rank", - "world_size", + "seed_offset", + "total_num_processes", "process", "command_queue", "result_queue", @@ -321,7 +326,8 @@ def __setattr__(self, name, value): def _simulator_worker( cfg, rank, - world_size, + seed_offset, + total_num_processes, env_cls, command_queue, result_queue, @@ -340,7 +346,7 @@ def _simulator_worker( omegaconf_register() try: - simulator = env_cls(cfg, rank, world_size) + simulator = env_cls(cfg, seed_offset, total_num_processes) assert isinstance(simulator, EnvOffloadMixin), ( f"Environment class {env_cls.__name__} must inherit from EnvOffloadMixin" ) diff --git a/rlinf/envs/libero/libero_env.py b/rlinf/envs/libero/libero_env.py index ab117d0c1..6471bf770 100644 --- a/rlinf/envs/libero/libero_env.py +++ b/rlinf/envs/libero/libero_env.py @@ -39,11 +39,11 @@ class LiberoEnv(gym.Env): - def __init__(self, cfg, rank, world_size): - self.rank = rank + def __init__(self, cfg, seed_offset, total_num_processes): + self.seed_offset = seed_offset self.cfg = cfg - self.world_size = world_size - self.seed = self.cfg.seed + rank + self.total_num_processes = total_num_processes + self.seed = self.cfg.seed + seed_offset self._is_start = True self.num_envs = self.cfg.num_envs self.group_size = self.cfg.group_size @@ -148,17 +148,19 @@ def _get_random_reset_state_ids(self, num_reset_states): def get_reset_state_ids_all(self): reset_state_ids = np.arange(self.total_num_group_envs) - valid_size = len(reset_state_ids) - (len(reset_state_ids) % self.world_size) + valid_size = len(reset_state_ids) - ( + len(reset_state_ids) % self.total_num_processes + ) self._generator_ordered.shuffle(reset_state_ids) reset_state_ids = reset_state_ids[:valid_size] - reset_state_ids = reset_state_ids.reshape(self.world_size, -1) + reset_state_ids = reset_state_ids.reshape(self.total_num_processes, -1) return reset_state_ids def _get_ordered_reset_state_ids(self, num_reset_states): if self.start_idx + num_reset_states > len(self.reset_state_ids_all[0]): 
self.reset_state_ids_all = self.get_reset_state_ids_all() self.start_idx = 0 - reset_state_ids = self.reset_state_ids_all[self.rank][ + reset_state_ids = self.reset_state_ids_all[self.seed_offset][ self.start_idx : self.start_idx + num_reset_states ] self.start_idx = self.start_idx + num_reset_states @@ -474,7 +476,7 @@ def add_new_frames(self, raw_obs, plot_infos): self.render_images.append(full_image) def flush_video(self, video_sub_dir: Optional[str] = None): - output_dir = os.path.join(self.video_cfg.video_base_dir, f"rank_{self.rank}") + output_dir = os.path.join(self.video_cfg.video_base_dir, f"seed_{self.seed}") if video_sub_dir is not None: output_dir = os.path.join(output_dir, f"{video_sub_dir}") save_rollout_video( diff --git a/rlinf/envs/maniskill/maniskill_env.py b/rlinf/envs/maniskill/maniskill_env.py index b80096ab8..8d79315e2 100644 --- a/rlinf/envs/maniskill/maniskill_env.py +++ b/rlinf/envs/maniskill/maniskill_env.py @@ -47,11 +47,10 @@ def extract_termination_from_info(info, num_envs, device): class ManiskillEnv(gym.Env): - def __init__(self, cfg, rank, world_size, record_metrics=True): + def __init__(self, cfg, seed_offset, total_num_processes, record_metrics=True): env_seed = cfg.seed - self.seed = env_seed + rank - self.rank = rank - self.world_size = world_size + self.seed = env_seed + seed_offset + self.total_num_processes = total_num_processes self.auto_reset = cfg.auto_reset self.use_rel_reward = cfg.use_rel_reward self.ignore_terminations = cfg.ignore_terminations @@ -367,8 +366,14 @@ def add_new_frames(self, infos, rewards=None): image = self.render(infos, rewards) self.render_images.append(image) + def add_new_frames_from_obs(self, raw_obs): + """For debugging render""" + raw_imgs = common.to_numpy(raw_obs["images"].permute(0, 2, 3, 1)) + raw_full_img = tile_images(raw_imgs, nrows=int(np.sqrt(self.num_envs))) + self.render_images.append(raw_full_img) + def flush_video(self, video_sub_dir: Optional[str] = None): - output_dir = os.path.join(self.video_cfg.video_base_dir, f"rank_{self.rank}") + output_dir = os.path.join(self.video_cfg.video_base_dir, f"seed_{self.seed}") if video_sub_dir is not None: output_dir = os.path.join(output_dir, f"{video_sub_dir}") images_to_video( diff --git a/rlinf/envs/maniskill/tasks/put_on_in_scene_multi.py b/rlinf/envs/maniskill/tasks/put_on_in_scene_multi.py index 85e92a057..94de7ebc3 100644 --- a/rlinf/envs/maniskill/tasks/put_on_in_scene_multi.py +++ b/rlinf/envs/maniskill/tasks/put_on_in_scene_multi.py @@ -14,7 +14,6 @@ import os from pathlib import Path -from typing import Optional import cv2 import numpy as np @@ -745,11 +744,8 @@ def _green_sceen_rgb( return rgb_ret - def get_obs(self, info: Optional[dict] = None, unflattened=True): - assert unflattened - obs = super().get_obs(info) - - # "greenscreen" process + def _get_obs_sensor_data(self, apply_texture_transforms=True): + sensor_obs = super()._get_obs_sensor_data(apply_texture_transforms) if ( self.obs_mode_struct.visual.rgb and self.obs_mode_struct.visual.segmentation @@ -757,27 +753,23 @@ def get_obs(self, info: Optional[dict] = None, unflattened=True): ): # get the actor ids of objects to manipulate; note that objects here are not articulated camera_name = self.rgb_camera_name - assert "segmentation" in obs["sensor_data"][camera_name].keys() + assert "segmentation" in sensor_obs[camera_name].keys() - overlay_img = self.overlay_images.to( - obs["sensor_data"][camera_name]["rgb"].device - ) - overlay_texture = self.overlay_textures.to( - 
obs["sensor_data"][camera_name]["rgb"].device - ) - overlay_mix = self.overlay_mix.to( - obs["sensor_data"][camera_name]["rgb"].device - ) + raw_rgb_device = sensor_obs[camera_name]["rgb"].device + + overlay_img = self.overlay_images.to(raw_rgb_device) + overlay_texture = self.overlay_textures.to(raw_rgb_device) + overlay_mix = self.overlay_mix.to(raw_rgb_device) green_screened_rgb = self._green_sceen_rgb( - obs["sensor_data"][camera_name]["rgb"], - obs["sensor_data"][camera_name]["segmentation"], + sensor_obs[camera_name]["rgb"], + sensor_obs[camera_name]["segmentation"], overlay_img, overlay_texture, overlay_mix, ) - obs["sensor_data"][camera_name]["rgb"] = green_screened_rgb - return obs + sensor_obs[camera_name]["rgb"] = green_screened_rgb + return sensor_obs # widowx @property diff --git a/rlinf/envs/robotwin/RoboTwin_env.py b/rlinf/envs/robotwin/RoboTwin_env.py index b43f98c9a..aac0be6aa 100644 --- a/rlinf/envs/robotwin/RoboTwin_env.py +++ b/rlinf/envs/robotwin/RoboTwin_env.py @@ -247,11 +247,11 @@ def update_obs(observation): class RoboTwin(gym.Env): - def __init__(self, cfg, rank, world_size, record_metrics=True): + def __init__(self, cfg, seed_offset, total_num_processes, record_metrics=True): # Get parameters from configuration self.cfg = cfg - self.rank = rank - self.world_size = world_size + self.seed_offset = seed_offset + self.total_num_processes = total_num_processes self.record_metrics = record_metrics self._is_start = True self.info_logging_keys = ["is_src_obj_grasped", "consecutive_grasp", "success"] diff --git a/rlinf/workers/actor/fsdp_actor_worker.py b/rlinf/workers/actor/fsdp_actor_worker.py index 51c0a9533..61b05c9b4 100644 --- a/rlinf/workers/actor/fsdp_actor_worker.py +++ b/rlinf/workers/actor/fsdp_actor_worker.py @@ -159,6 +159,10 @@ def _process_received_rollout_batch(self, rollout_batch): ] # [n_chunk_step, rollout_epoch x bsz, num_action_chunks] loss_mask, loss_mask_sum = compute_loss_mask(dones) + if self.cfg.algorithm.reward_type == "chunk_level": + loss_mask = loss_mask.any(dim=-1, keepdim=True) + loss_mask_sum = loss_mask_sum[..., -1:] + rollout_batch["loss_mask"] = loss_mask rollout_batch["loss_mask_sum"] = loss_mask_sum diff --git a/rlinf/workers/env/env_worker.py b/rlinf/workers/env/env_worker.py index 13d24d1e1..06375ed98 100644 --- a/rlinf/workers/env/env_worker.py +++ b/rlinf/workers/env/env_worker.py @@ -96,23 +96,25 @@ def init_worker(self): from rlinf.envs.maniskill.maniskill_env import ManiskillEnv if not only_eval: - for _ in range(self.stage_num): + for stage_id in range(self.stage_num): self.simulator_list.append( EnvManager( self.cfg.env.train, rank=self._rank, - world_size=self._world_size, + seed_offset=self._rank * self.stage_num + stage_id, + total_num_processes=self._world_size, env_cls=ManiskillEnv, enable_offload=enable_offload, ) ) if self.cfg.runner.val_check_interval > 0 or only_eval: - for _ in range(self.stage_num): + for stage_id in range(self.stage_num): self.eval_simulator_list.append( EnvManager( self.cfg.env.eval, rank=self._rank, - world_size=self._world_size, + seed_offset=self._rank * self.stage_num + stage_id, + total_num_processes=self._world_size, env_cls=ManiskillEnv, enable_offload=enable_offload, ) @@ -121,23 +123,25 @@ def init_worker(self): from rlinf.envs.libero.libero_env import LiberoEnv if not only_eval: - for _ in range(self.stage_num): + for stage_id in range(self.stage_num): self.simulator_list.append( EnvManager( self.cfg.env.train, rank=self._rank, - world_size=self._world_size, + seed_offset=self._rank * 
self.stage_num + stage_id, + total_num_processes=self._world_size * self.stage_num, env_cls=LiberoEnv, enable_offload=enable_offload, ) ) if self.cfg.runner.val_check_interval > 0 or only_eval: - for _ in range(self.stage_num): + for stage_id in range(self.stage_num): self.eval_simulator_list.append( EnvManager( self.cfg.env.eval, rank=self._rank, - world_size=self._world_size, + seed_offset=self._rank * self.stage_num + stage_id, + total_num_processes=self._world_size * self.stage_num, env_cls=LiberoEnv, enable_offload=enable_offload, ) @@ -146,24 +150,26 @@ def init_worker(self): from rlinf.envs.robotwin.RoboTwin_env import RoboTwin if not only_eval: - for _ in range(self.stage_num): + for stage_id in range(self.stage_num): self.simulator_list.append( EnvManager( self.cfg.env.train, rank=self._rank, - world_size=self._world_size, + seed_offset=self._rank * self.stage_num + stage_id, + total_num_processes=self._world_size * self.stage_num, env_cls=RoboTwin, enable_offload=enable_offload, ) - # RoboTwin(self.cfg.env.train, rank=self._rank, world_size=self._world_size) + # RoboTwin(self.cfg.env.train, rank=self._rank, total_num_processes=self._world_size) ) if self.cfg.runner.val_check_interval > 0 or only_eval: - for _ in range(self.stage_num): + for stage_id in range(self.stage_num): self.eval_simulator_list.append( EnvManager( self.cfg.env.eval, rank=self._rank, - world_size=self._world_size, + seed_offset=self._rank * self.stage_num + stage_id, + total_num_processes=self._world_size, env_cls=RoboTwin, enable_offload=enable_offload, ) @@ -283,13 +289,13 @@ def finish_rollout(self, mode="train"): if mode == "train": if self.cfg.env.train.video_cfg.save_video: for i in range(self.stage_num): - self.simulator_list[i].flush_video(video_sub_dir=f"stage_{i}") + self.simulator_list[i].flush_video() for i in range(self.stage_num): self.simulator_list[i].update_reset_state_ids() elif mode == "eval": if self.cfg.env.eval.video_cfg.save_video: for i in range(self.stage_num): - self.eval_simulator_list[i].flush_video(video_sub_dir=f"stage_{i}") + self.eval_simulator_list[i].flush_video() def split_env_batch(self, env_batch, gather_id, mode): env_batch_i = {} diff --git a/rlinf/workers/rollout/hf/huggingface_worker.py b/rlinf/workers/rollout/hf/huggingface_worker.py index 911cbe54e..c45be2991 100644 --- a/rlinf/workers/rollout/hf/huggingface_worker.py +++ b/rlinf/workers/rollout/hf/huggingface_worker.py @@ -302,11 +302,13 @@ async def evaluate(self): for key, value in env_info_list.items(): eval_info[f"env_info/{key}"].append(value) - env_batch = await self.recv_env_batch() - if "meta" in env_batch: - env_info_list = env_batch["meta"] - for key, value in env_info_list.items(): - eval_info[f"env_info/{key}"].append(value) + for i in range(self.stage_num): + env_batch = await self.recv_env_batch() + if "meta" in env_batch: + env_info_list = env_batch["meta"] + for key, value in env_info_list.items(): + eval_info[f"env_info/{key}"].append(value) + eval_metrics = create_rollout_batch(eval_info) if self.cfg.rollout.get("enable_offload", False): self.offload_model() From 69324c60d69ac09bc26e75b0a7ed5aa91118bf6a Mon Sep 17 00:00:00 2001 From: WinstonWmj <983289917@qq.com> Date: Sat, 11 Oct 2025 16:13:51 +0800 Subject: [PATCH 17/57] fix(embodied): urgent fix for libero_10 yaml; (#161) Signed-off-by: infiniAI_wmj Co-authored-by: infiniAI_wmj --- examples/embodiment/config/libero_10_grpo_openvlaoft.yaml | 2 +- examples/embodiment/config/libero_10_grpo_openvlaoft_eval.yaml | 2 +- 
examples/embodiment/config/libero_10_ppo_openvlaoft.yaml | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/embodiment/config/libero_10_grpo_openvlaoft.yaml b/examples/embodiment/config/libero_10_grpo_openvlaoft.yaml
index 40a43e3aa..1d9720fdd 100644
--- a/examples/embodiment/config/libero_10_grpo_openvlaoft.yaml
+++ b/examples/embodiment/config/libero_10_grpo_openvlaoft.yaml
@@ -1,5 +1,5 @@
 defaults:
-  - env/train: libero_10_grpo
+  - env/train: libero_10
   - env/eval: libero_10
   - override hydra/job_logging: stdout
diff --git a/examples/embodiment/config/libero_10_grpo_openvlaoft_eval.yaml b/examples/embodiment/config/libero_10_grpo_openvlaoft_eval.yaml
index 4aad72ffa..628272ed1 100644
--- a/examples/embodiment/config/libero_10_grpo_openvlaoft_eval.yaml
+++ b/examples/embodiment/config/libero_10_grpo_openvlaoft_eval.yaml
@@ -1,5 +1,5 @@
 defaults:
-  - env/train: libero_10_grpo
+  - env/train: libero_10
   - env/eval: libero_10
   - override hydra/job_logging: stdout
diff --git a/examples/embodiment/config/libero_10_ppo_openvlaoft.yaml b/examples/embodiment/config/libero_10_ppo_openvlaoft.yaml
index af0fb7316..bf7e31667 100644
--- a/examples/embodiment/config/libero_10_ppo_openvlaoft.yaml
+++ b/examples/embodiment/config/libero_10_ppo_openvlaoft.yaml
@@ -1,5 +1,5 @@
 defaults:
-  - env/train: libero_10_ppo
+  - env/train: libero_10
   - env/eval: libero_10
   - override hydra/job_logging: stdout

From af32ed06e06d790b836ad7aa3ce2ed58e0b42f44 Mon Sep 17 00:00:00 2001
From: Andy Lin <32576375+andylin-hao@users.noreply.github.com>
Date: Sat, 11 Oct 2025 16:31:11 +0800
Subject: [PATCH 18/57] fix: libero not fully installed in image (#160)

Signed-off-by: Hao Lin
---
 .github/CODEOWNERS | 2 +-
 docker/torch-2.6/Dockerfile | 3 +++
 docs/source-en/rst_source/start/installation.rst | 14 +++++++++++++-
 docs/source-zh/rst_source/start/installation.rst | 12 ++++++++++++
 examples/embodiment/run_embodiment.sh | 4 +++-
 requirements/README.md | 10 ++++++++++
 6 files changed, 42 insertions(+), 3 deletions(-)

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 8f7bc741f..def6b99ac 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -38,7 +38,7 @@
 /tests @andylin-hao

 /toolkits/auto_placement @i-Taozi
-/toolkits/ckpt_convertor @Louis-J
+/toolkits/ckpt_convertor @qurakchin

 /.pre-commit-config.yaml @andylin-hao
 /pyproject.toml @andylin-hao
diff --git a/docker/torch-2.6/Dockerfile b/docker/torch-2.6/Dockerfile
index 95da4683e..085d5c55d 100644
--- a/docker/torch-2.6/Dockerfile
+++ b/docker/torch-2.6/Dockerfile
@@ -113,6 +113,9 @@ RUN export PHYSX_VERSION=105.1-physx-5.3.1.patch0 && \
     wget -O $PHYSX_DIR/linux-so.zip https://github.com/sapien-sim/physx-precompiled/releases/download/$PHYSX_VERSION/linux-so.zip && \
     unzip $PHYSX_DIR/linux-so.zip -d $PHYSX_DIR && rm $PHYSX_DIR/linux-so.zip

+RUN git clone https://github.com/RLinf/LIBERO.git /opt/libero
+ENV PYTHONPATH=/opt/libero:$PYTHONPATH
+
 # Set default env
 RUN echo "source ${UV_PATH}/openvla/bin/activate" >> ~/.bashrc
diff --git a/docs/source-en/rst_source/start/installation.rst b/docs/source-en/rst_source/start/installation.rst
index 58288582e..534e467f9 100644
--- a/docs/source-en/rst_source/start/installation.rst
+++ b/docs/source-en/rst_source/start/installation.rst
@@ -203,4 +203,16 @@ Then, depending on the experiment type, install the required packages for ``open
    UV_TORCH_BACKEND=auto uv pip install -r requirements/openvla_oft.txt --no-build-isolation

    # For Pi0 experiments
-   UV_TORCH_BACKEND=auto uv pip install -r requirements/pi0.txt --no-build-isolation
\ No newline at end of file
+   UV_TORCH_BACKEND=auto uv pip install -r requirements/pi0.txt --no-build-isolation
+
+Finally, run the following command to install the LIBERO dependency.
+
+.. code-block:: shell
+
+   mkdir -p /opt && git clone https://github.com/RLinf/LIBERO.git /opt/libero
+
+Before using LIBERO, make sure its path is added to the `PYTHONPATH` environment variable.
+
+.. code-block:: shell
+
+   export PYTHONPATH=/opt/libero:$PYTHONPATH
\ No newline at end of file
diff --git a/docs/source-zh/rst_source/start/installation.rst b/docs/source-zh/rst_source/start/installation.rst
index 220d973d1..84038f8de 100644
--- a/docs/source-zh/rst_source/start/installation.rst
+++ b/docs/source-zh/rst_source/start/installation.rst
@@ -201,3 +201,15 @@ Megatron 和 SGLang/vLLM 依赖

   # Pi0 实验所需依赖
   UV_TORCH_BACKEND=auto uv pip install -r requirements/pi0.txt --no-build-isolation
+
+最后,运行以下命令安装 libero 依赖。
+
+.. code-block:: shell
+
+   mkdir -p /opt && git clone https://github.com/RLinf/LIBERO.git /opt/libero
+
+在使用 LIBERO 前,请确保将其路径添加到 ``PYTHONPATH`` 环境变量中:
+
+.. code-block:: shell
+
+   export PYTHONPATH=/opt/libero:$PYTHONPATH
diff --git a/examples/embodiment/run_embodiment.sh b/examples/embodiment/run_embodiment.sh
index 6f31890c6..cf5d91d4f 100644
--- a/examples/embodiment/run_embodiment.sh
+++ b/examples/embodiment/run_embodiment.sh
@@ -7,7 +7,9 @@ export SRC_FILE="${EMBODIED_PATH}/train_embodied_agent.py"
 export MUJOCO_GL="egl"
 export PYOPENGL_PLATFORM="egl"
 export PYTHONPATH=${REPO_PATH}:$PYTHONPATH
-export LIBERO_CONFIG_PATH="/path/to/repo/LIBERO_CONFIG_PATH"
+
+LIBERO_PATH=/opt/libero
+export PYTHONPATH=${LIBERO_PATH}:$PYTHONPATH

 export CUDA_LAUNCH_BLOCKING=1
 export HYDRA_FULL_ERROR=1
diff --git a/requirements/README.md b/requirements/README.md
index 531bf40bb..d8f565a1e 100644
--- a/requirements/README.md
+++ b/requirements/README.md
@@ -52,4 +52,14 @@ UV_TORCH_BACKEND=auto uv pip install -r requirements/openvla_oft.txt --no-build-

 # For Pi0 experiment
 UV_TORCH_BACKEND=auto uv pip install -r requirements/pi0.txt --no-build-isolation
+```
+
+Finally, run the following command to install the LIBERO dependency.
+
+```shell
+mkdir -p /opt && git clone https://github.com/RLinf/LIBERO.git /opt/libero
+```
+Before using LIBERO, make sure its path is added to the `PYTHONPATH` environment variable.
+```shell +export PYTHONPATH=/opt/libero:$PYTHONPATH ``` \ No newline at end of file From 5e73652f1eadf2002b305bc9f414ec65b9a6f961 Mon Sep 17 00:00:00 2001 From: Andy Lin <32576375+andylin-hao@users.noreply.github.com> Date: Sun, 12 Oct 2025 10:24:06 +0800 Subject: [PATCH 19/57] ci: use new CI machines and accelerate CI process (#162) Signed-off-by: Hao Lin --- .github/workflows/auto_placement.yml | 59 ---- .github/workflows/code-test.yml | 291 ++++++++++++++++++ .github/workflows/coding_online_rl_e2e.yml | 62 ---- .github/workflows/commit_check.yml | 2 +- .github/workflows/embodied_e2e.yml | 74 ----- .github/workflows/lint.yml | 2 +- .github/workflows/math_e2e.yml | 96 ------ .../workflows/math_e2e_rollout_logprobs.yml | 78 ----- .github/workflows/unit_test.yml | 77 ----- docs/source-en/rst_source/start/vla-eval.rst | 5 +- docs/source-zh/rst_source/start/vla-eval.rst | 5 +- examples/embodiment/eval_embodiment.sh | 8 +- .../coding_online_rl/run_coding_online_rl.sh | 1 - .../embodied/libero_130_grpo_openvlaoft.yaml | 2 +- tests/e2e_tests/embodied/run_openvla.sh | 2 - .../embodied/run_openvlaoft_libero130.sh | 4 - .../sglang/qwen2.5-1.5b-grpo-collocated.yaml | 4 +- ...5-1.5b-grpo-pipeline-rollout-logprobs.yaml | 2 +- .../sglang/qwen2.5-1.5b-grpo-pipeline.yaml | 2 +- ...1.5b-grpo-collocated-rollout-logprobs.yaml | 4 +- .../vllm/qwen2.5-1.5b-grpo-collocated.yaml | 4 +- ...5-1.5b-grpo-pipeline-rollout-logprobs.yaml | 2 +- .../math/vllm/qwen2.5-1.5b-grpo-pipeline.yaml | 2 +- 23 files changed, 311 insertions(+), 477 deletions(-) delete mode 100644 .github/workflows/auto_placement.yml create mode 100644 .github/workflows/code-test.yml delete mode 100644 .github/workflows/coding_online_rl_e2e.yml delete mode 100644 .github/workflows/embodied_e2e.yml delete mode 100644 .github/workflows/math_e2e.yml delete mode 100644 .github/workflows/math_e2e_rollout_logprobs.yml delete mode 100644 .github/workflows/unit_test.yml diff --git a/.github/workflows/auto_placement.yml b/.github/workflows/auto_placement.yml deleted file mode 100644 index 169e48186..000000000 --- a/.github/workflows/auto_placement.yml +++ /dev/null @@ -1,59 +0,0 @@ -name: Math Auto Placement - -on: - push: - branches: - - 'release/v[0-9].[0-9]' - - main - paths: - - '**/*.py' - - 'tests/**' - - '.github/workflows/*.yml' - - '!docs/**' - - '!README.md' - - '!*.yaml' - - '!*.toml' - - '!ray_utils/**' - - '!requirements/**' - - pull_request: - branches: - - 'release/v[0-9].[0-9]' - - main - paths: - - '**/*.py' - - 'tests/**' - - '.github/workflows/*.yml' - - '!docs/**' - - '!README.md' - - '*.yaml' - - '*.toml' - - '!ray_utils/**' - - '!requirements/**' - -permissions: - contents: read - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - qwen-grpo-test: - runs-on: rlinf - container: - image: rlinf/rlinf:math-rlinf0.1-torch2.6.0-sglang0.4.6.post5-vllm0.8.5-megatron0.13.0-te2.1 - volumes: - - /mnt/public/dataset:/workspace/dataset - - /mnt/public/tokenizer:/workspace/tokenizer - options: --gpus="all" --shm-size=80g - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: auto-placement - timeout-minutes: 20 - run: | - export REPO_PATH=$(pwd) - bash tests/e2e_tests/auto_placement/run_auto_placement.sh \ No newline at end of file diff --git a/.github/workflows/code-test.yml b/.github/workflows/code-test.yml new file mode 100644 index 000000000..b61817bfe --- /dev/null +++ b/.github/workflows/code-test.yml @@ -0,0 +1,291 @@ +name: Code Test + +on: + push: + branches: + - 
"release/v[0-9].[0-9]" + - main + pull_request: + branches: [main] + types: [synchronize, labeled] + workflow_dispatch: + +concurrency: + group: code-test-${{ github.ref }} + cancel-in-progress: true + +jobs: + # =============================================== check changes ==================================================== + check-changes: + runs-on: ubuntu-latest + outputs: + file_filter: ${{ steps.filter.outputs.file_filter }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Fail if the PR does not have the 'run-ci' label + if: github.event_name == 'pull_request' && !contains(github.event.pull_request.labels.*.name, 'run-ci') + run: | + echo "This pull request does not have the 'run-ci' label. Failing the workflow." + exit 1 + + - name: Fail if the PR is a draft + if: github.event_name == 'pull_request' && github.event.pull_request.draft == true + run: | + echo "This pull request is a draft. Failing the workflow." + exit 1 + + - name: Detect file changes + id: filter + uses: dorny/paths-filter@v3 + with: + filters: | + file_filter: + - '**/*.py' + - 'tests/**' + - '.github/workflows/*.yml' + - '!docs/**' + - '!README.md' + - '*.yaml' + - '*.toml' + - '!ray_utils/**' + - '!requirements/**' + + # =============================================== unit tests ==================================================== + + unit-tests-cuda: + needs: [check-changes] + if: needs.check-changes.outputs.file_filter == 'true' + runs-on: reason + steps: + - name: Checkout code + uses: actions/checkout@v5 + + - name: Install pytest + run: | + source switch_env reason + uv pip install pytest + + - name: Run pytest + timeout-minutes: 20 + run: | + export PYTHONPATH=$(pwd):$(pwd)/megatron:$(pwd)/tests/unit_tests + source switch_env reason + pytest tests/unit_tests + + - name: Run doctest + timeout-minutes: 20 + run: | + source switch_env reason + pytest --doctest-modules rlinf/scheduler + + unit-tests-cpu: + needs: [check-changes] + if: needs.check-changes.outputs.file_filter == 'true' + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v5 + + - name: Install dependencies + run: | + pip install uv + uv venv + source .venv/bin/activate + UV_TORCH_BACKEND=auto uv sync + uv pip install pytest + + - name: Run pytest + timeout-minutes: 20 + run: | + export PYTHONPATH=$(pwd):$(pwd)/megatron:$(pwd)/tests/unit_tests + source .venv/bin/activate + pytest tests/unit_tests + + # =============================================== reason e2e tests ==================================================== + + reason-qwen-grpo-test: + needs: [check-changes] + if: needs.check-changes.outputs.file_filter == 'true' + runs-on: reason + steps: + - name: Checkout code + uses: actions/checkout@v5 + + - name: SGLang Collocated mode + timeout-minutes: 20 + run: | + export REPO_PATH=$(pwd) + source switch_env reason + bash tests/e2e_tests/math/sglang/run_collocated.sh + + - name: vLLM Collocated mode + timeout-minutes: 20 + run: | + export REPO_PATH=$(pwd) + source switch_env reason + bash tests/e2e_tests/math/vllm/run_collocated.sh + + - name: SGLang Pipeline mode + timeout-minutes: 20 + run: | + export REPO_PATH=$(pwd) + source switch_env reason + bash tests/e2e_tests/math/sglang/run_pipeline.sh + + - name: vLLM Pipeline mode + timeout-minutes: 20 + run: | + export REPO_PATH=$(pwd) + source switch_env reason + bash tests/e2e_tests/math/vllm/run_pipeline.sh + + reason-qwen-grpo-test-rollout-logprobs: + needs: [check-changes] + if: needs.check-changes.outputs.file_filter == 'true' + 
runs-on: reason + steps: + - name: Checkout code + uses: actions/checkout@v5 + + - name: SGLang Collocated mode + timeout-minutes: 20 + run: | + export REPO_PATH=$(pwd) + source switch_env reason + bash tests/e2e_tests/math/sglang/run_collocated.sh qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml + + - name: vLLM Collocated mode + timeout-minutes: 20 + run: | + export REPO_PATH=$(pwd) + source switch_env reason + bash tests/e2e_tests/math/vllm/run_collocated.sh qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml + + - name: SGLang Pipeline mode + timeout-minutes: 20 + run: | + export REPO_PATH=$(pwd) + source switch_env reason + bash tests/e2e_tests/math/sglang/run_pipeline.sh qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml + + - name: vLLM Pipeline mode + timeout-minutes: 20 + run: | + export REPO_PATH=$(pwd) + source switch_env reason + bash tests/e2e_tests/math/vllm/run_pipeline.sh qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml + + coding-online-rl-qwen-ppo-test: + needs: [check-changes] + if: needs.check-changes.outputs.file_filter == 'true' + runs-on: reason + steps: + - name: Checkout code + uses: actions/checkout@v5 + + - name: Install dependencies + run: | + pip install httpx asyncio fuzzywuzzy + + - name: SGLang Collocated mode + run: | + export REPO_PATH=$(pwd) + source switch_env reason + bash tests/e2e_tests/coding_online_rl/run_coding_online_rl.sh + + # =============================================== embodied e2e tests ==================================================== + + embodied-openvla-ppo-test: + needs: [check-changes] + if: needs.check-changes.outputs.file_filter == 'true' + runs-on: embodied + steps: + - name: Checkout code + uses: actions/checkout@v5 + - name: OpenVLA test + timeout-minutes: 20 + run: | + export REPO_PATH=$(pwd) + source switch_env openvla + bash tests/e2e_tests/embodied/run_openvla.sh + + embodied-openvlaoft-grpo-test: + needs: [check-changes] + if: needs.check-changes.outputs.file_filter == 'true' + runs-on: embodied + steps: + - name: Checkout code + uses: actions/checkout@v5 + - name: OpenVLA-OFT test + run: | + export REPO_PATH=$(pwd) + source switch_env openvla-oft + bash tests/e2e_tests/embodied/run_openvlaoft_libero130.sh + + # =============================================== auto placement tests ==================================================== + + static-auto-placement-test: + needs: [check-changes] + if: needs.check-changes.outputs.file_filter == 'true' + runs-on: reason + steps: + - name: Checkout code + uses: actions/checkout@v5 + + - name: auto-placement + timeout-minutes: 20 + run: | + export REPO_PATH=$(pwd) + source switch_env reason + bash tests/e2e_tests/auto_placement/run_auto_placement.sh + +# =============================================== finale ==================================================== + + pr-test-finish: + needs: [ + check-changes, + + # Unit tests + unit-tests-cuda, unit-tests-cpu, + + # Reason e2e tests + reason-qwen-grpo-test, reason-qwen-grpo-test-rollout-logprobs, + coding-online-rl-qwen-ppo-test, + + # Embodied e2e tests + embodied-openvla-ppo-test, embodied-openvlaoft-grpo-test, + + # Auto placement tests + static-auto-placement-test + ] + if: always() + runs-on: ubuntu-latest + steps: + # Refer to https://github.com/sgl-project/sglang/blob/main/.github/workflows/pr-test.yml + - name: Check all dependent job statuses + run: | + # Convert the 'needs' context to a JSON string + json_needs='${{ toJson(needs) }}' + + # Get a list of all job names from the JSON keys + job_names=$(echo "$json_needs" | jq -r 
'keys_unsorted[]') + + for job in $job_names; do + # For each job, extract its result + result=$(echo "$json_needs" | jq -r --arg j "$job" '.[$j].result') + + # Print the job name and its result + echo "$job: $result" + + # Check for failure or cancellation and exit if found + if [[ "$result" == "failure" || "$result" == "cancelled" ]]; then + echo "The above jobs failed." + exit 1 + fi + done + + # If the loop completes, all jobs were successful + echo "All jobs completed successfully" + exit 0 diff --git a/.github/workflows/coding_online_rl_e2e.yml b/.github/workflows/coding_online_rl_e2e.yml deleted file mode 100644 index 8618b39c7..000000000 --- a/.github/workflows/coding_online_rl_e2e.yml +++ /dev/null @@ -1,62 +0,0 @@ -name: Coding Online RL End2End - -on: - push: - branches: - - 'release/v[0-9].[0-9]' - - main - paths: - - '**/*.py' - - 'tests/**' - - '.github/workflows/*.yml' - - '!docs/**' - - '!README.md' - - '!*.yaml' - - '!*.toml' - - '!ray_utils/**' - - '!requirements/**' - - pull_request: - branches: - - 'release/v[0-9].[0-9]' - - main - paths: - - '**/*.py' - - 'tests/**' - - '.github/workflows/*.yml' - - '!docs/**' - - '!README.md' - - '*.yaml' - - '*.toml' - - '!ray_utils/**' - - '!requirements/**' - -permissions: - contents: read - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - qwen-ppo-test: - runs-on: rlinf - container: - image: rlinf/rlinf:math-rlinf0.1-torch2.6.0-sglang0.4.6.post5-vllm0.8.5-megatron0.13.0-te2.1 - volumes: - - /mnt/public/dataset:/workspace/dataset - - /mnt/public/tokenizer:/workspace/tokenizer - options: --gpus="all" --shm-size=80g - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Install dependencies - run: | - pip install httpx asyncio fuzzywuzzy - - - name: SGLang Collocated mode - run: | - export REPO_PATH=$(pwd) - bash tests/e2e_tests/coding_online_rl/run_coding_online_rl.sh diff --git a/.github/workflows/commit_check.yml b/.github/workflows/commit_check.yml index e1391abe6..7f17365ce 100644 --- a/.github/workflows/commit_check.yml +++ b/.github/workflows/commit_check.yml @@ -12,7 +12,7 @@ jobs: contents: read pull-requests: write steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 with: ref: ${{ github.event.pull_request.head.sha }} # checkout PR HEAD commit fetch-depth: 0 # required for merge-base check diff --git a/.github/workflows/embodied_e2e.yml b/.github/workflows/embodied_e2e.yml deleted file mode 100644 index 16fb7a275..000000000 --- a/.github/workflows/embodied_e2e.yml +++ /dev/null @@ -1,74 +0,0 @@ -name: Embodied End2End - -on: - push: - branches: - - 'release/v[0-9].[0-9]' - - main - paths: - - '**/*.py' - - 'tests/**' - - '.github/workflows/*.yml' - - '!docs/**' - - '!README.md' - - '!*.yaml' - - '!*.toml' - - '!ray_utils/**' - - '!requirements/**' - - pull_request: - branches: - - 'release/v[0-9].[0-9]' - - main - paths: - - '**/*.py' - - 'tests/**' - - '.github/workflows/*.yml' - - '!docs/**' - - '!README.md' - - '!*.yaml' - - '!*.toml' - - '!ray_utils/**' - - '!requirements/**' - -permissions: - contents: read - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - openvla-ppo-test: - runs-on: rlinf - container: - image: rlinf/rlinf:agentic-rlinf0.1-torch2.6.0-openvla-openvlaoft-pi0 - volumes: - - /mnt/public/dataset:/workspace/dataset - - /mnt/public/tokenizer:/workspace/tokenizer - options: --gpus="all" --shm-size=2g - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - 
name: OpenVLA test - timeout-minutes: 20 - run: | - export REPO_PATH=$(pwd) - bash tests/e2e_tests/embodied/run_openvla.sh - openvlaoft-grpo-test: - runs-on: rlinf - container: - image: rlinf/rlinf:agentic-openvlaoft-rlinf0.1-torch2.5.1 - volumes: - - /mnt/public/dataset:/workspace/dataset - - /mnt/public/tokenizer:/workspace/tokenizer - options: --gpus="all" --shm-size=2g -e NVIDIA_DRIVER_CAPABILITIES="compute,utility,graphics" - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - name: OpenVLA-OFT test - run: | - export REPO_PATH=$(pwd) - bash tests/e2e_tests/embodied/run_openvlaoft_libero130.sh \ No newline at end of file diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index e21ced73b..921232e28 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -9,6 +9,6 @@ jobs: lint: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v5 - uses: actions/setup-python@v3 - uses: pre-commit/action@v3.0.1 \ No newline at end of file diff --git a/.github/workflows/math_e2e.yml b/.github/workflows/math_e2e.yml deleted file mode 100644 index ffd474253..000000000 --- a/.github/workflows/math_e2e.yml +++ /dev/null @@ -1,96 +0,0 @@ -name: Math End2End - -on: - push: - branches: - - 'release/v[0-9].[0-9]' - - main - paths: - - '**/*.py' - - 'tests/**' - - '.github/workflows/*.yml' - - '!docs/**' - - '!README.md' - - '!*.yaml' - - '!*.toml' - - '!ray_utils/**' - - '!requirements/**' - - pull_request: - branches: - - 'release/v[0-9].[0-9]' - - main - paths: - - '**/*.py' - - 'tests/**' - - '.github/workflows/*.yml' - - '!docs/**' - - '!README.md' - - '*.yaml' - - '*.toml' - - '!ray_utils/**' - - '!requirements/**' - -permissions: - contents: read - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - qwen-grpo-test: - runs-on: rlinf - container: - image: rlinf/rlinf:math-rlinf0.1-torch2.6.0-sglang0.4.6.post5-vllm0.8.5-megatron0.13.0-te2.1 - volumes: - - /mnt/public/dataset:/workspace/dataset - - /mnt/public/tokenizer:/workspace/tokenizer - options: --gpus="all" --shm-size=80g - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: SGLang Collocated mode - timeout-minutes: 20 - run: | - export REPO_PATH=$(pwd) - bash tests/e2e_tests/math/sglang/run_collocated.sh - - - name: vLLM Collocated mode - timeout-minutes: 20 - run: | - export REPO_PATH=$(pwd) - bash tests/e2e_tests/math/vllm/run_collocated.sh - - - name: SGLang Pipeline mode - timeout-minutes: 20 - run: | - export REPO_PATH=$(pwd) - bash tests/e2e_tests/math/sglang/run_pipeline.sh - - - name: vLLM Pipeline mode - timeout-minutes: 20 - run: | - export REPO_PATH=$(pwd) - bash tests/e2e_tests/math/vllm/run_pipeline.sh - - qwen-grpo-test-sglang044: - runs-on: rlinf - container: - image: rlinf/rlinf:math-rlinf0.1-torch2.5.1-sglang0.4.4-vllm0.7.1-megatron0.11.0-te2.1 - volumes: - - /mnt/public/dataset:/workspace/dataset - - /mnt/public/tokenizer:/workspace/tokenizer - options: --gpus="all" --shm-size=80g - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: SGLang Collocated mode - timeout-minutes: 20 - run: | - export REPO_PATH=$(pwd) - bash tests/e2e_tests/math/sglang/run_collocated.sh diff --git a/.github/workflows/math_e2e_rollout_logprobs.yml b/.github/workflows/math_e2e_rollout_logprobs.yml deleted file mode 100644 index 168737332..000000000 --- a/.github/workflows/math_e2e_rollout_logprobs.yml +++ /dev/null @@ -1,78 +0,0 @@ -name: Math End2End Rollout Logprobs - -on: - push: 
- branches: - - 'release/v[0-9].[0-9]' - - main - paths: - - '**/*.py' - - 'tests/**' - - '.github/workflows/*.yml' - - '!docs/**' - - '!README.md' - - '!*.yaml' - - '!*.toml' - - '!ray_utils/**' - - '!requirements/**' - - pull_request: - branches: - - 'release/v[0-9].[0-9]' - - main - paths: - - '**/*.py' - - 'tests/**' - - '.github/workflows/*.yml' - - '!docs/**' - - '!README.md' - - '*.yaml' - - '*.toml' - - '!ray_utils/**' - - '!requirements/**' - -permissions: - contents: read - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - qwen-grpo-test: - runs-on: rlinf - container: - image: rlinf/rlinf:math-rlinf0.1-torch2.6.0-sglang0.4.6.post5-vllm0.8.5-megatron0.13.0-te2.1 - volumes: - - /mnt/public/dataset:/workspace/dataset - - /mnt/public/tokenizer:/workspace/tokenizer - options: --gpus="all" --shm-size=80g - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: SGLang Collocated mode - timeout-minutes: 20 - run: | - export REPO_PATH=$(pwd) - bash tests/e2e_tests/math/sglang/run_collocated.sh qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml - - - name: vLLM Collocated mode - timeout-minutes: 20 - run: | - export REPO_PATH=$(pwd) - bash tests/e2e_tests/math/vllm/run_collocated.sh qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml - - - name: SGLang Pipeline mode - timeout-minutes: 20 - run: | - export REPO_PATH=$(pwd) - bash tests/e2e_tests/math/sglang/run_pipeline.sh qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml - - - name: vLLM Pipeline mode - timeout-minutes: 20 - run: | - export REPO_PATH=$(pwd) - bash tests/e2e_tests/math/vllm/run_pipeline.sh qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml - diff --git a/.github/workflows/unit_test.yml b/.github/workflows/unit_test.yml deleted file mode 100644 index 634579f5f..000000000 --- a/.github/workflows/unit_test.yml +++ /dev/null @@ -1,77 +0,0 @@ -name: Unit Tests - -on: - push: - branches: - - 'v[0-9].[0-9]' - - main - paths: - - '**/*.py' - - 'tests/**' - - '.github/workflows/*.yml' - - '!docs/**' - - '!README.md' - - '!*.yaml' - - '!*.toml' - - pull_request: - branches: - - 'v[0-9].[0-9]' - - main - paths: - - '**/*.py' - - 'tests/**' - - '.github/workflows/*.yml' - - '!docs/**' - - '!README.md' - - '*.yaml' - - '*.toml' - -permissions: - contents: read - -jobs: - unit-tests-cuda: - runs-on: rlinf - container: - image: rlinf/rlinf:math-rlinf0.1-torch2.6.0-sglang0.4.6.post5-vllm0.8.5-megatron0.13.0-te2.1 - volumes: - - /mnt/public/dataset:/workspace/dataset - - /mnt/public/tokenizer:/workspace/tokenizer - options: --gpus="all" --shm-size=80g - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Install pytest - run: pip install pytest - - - name: Run pytest - timeout-minutes: 20 - run: | - export PYTHONPATH=$(pwd):$(pwd)/megatron:$(pwd)/tests/unit_tests - pytest tests/unit_tests - - - name: Run doctest - timeout-minutes: 20 - run: | - pytest --doctest-modules rlinf/scheduler - - unit-tests-cpu: - runs-on: ubuntu-latest - container: - image: rlinf/rlinf:math-rlinf0.1-torch2.6.0-sglang0.4.6.post5-vllm0.8.5-megatron0.13.0-te2.1 - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Install pytest - run: pip install pytest - - - name: Run pytest - timeout-minutes: 20 - run: | - export PYTHONPATH=$(pwd):$(pwd)/megatron:$(pwd)/tests/unit_tests - pytest tests/unit_tests \ No newline at end of file diff --git a/docs/source-en/rst_source/start/vla-eval.rst b/docs/source-en/rst_source/start/vla-eval.rst index fc1d93ae8..db9ca221b 
100644
--- a/docs/source-en/rst_source/start/vla-eval.rst
+++ b/docs/source-en/rst_source/start/vla-eval.rst
@@ -100,9 +100,8 @@ Quick Start — LIBERO
    export PYTHONPATH=${REPO_PATH}:$PYTHONPATH

    # path to the LIBERO repo
-   export LIBERO_REPO_PATH="/root/LIBERO"
-   export LIBERO_CONFIG_PATH=${LIBERO_REPO_PATH}
-   export PYTHONPATH=${LIBERO_REPO_PATH}:$PYTHONPATH
+   export LIBERO_PATH="/opt/libero"
+   export PYTHONPATH=${LIBERO_PATH}:$PYTHONPATH

    export CUDA_LAUNCH_BLOCKING=1
    export HYDRA_FULL_ERROR=1
diff --git a/docs/source-zh/rst_source/start/vla-eval.rst b/docs/source-zh/rst_source/start/vla-eval.rst
index 499777f2f..e671ddc73 100644
--- a/docs/source-zh/rst_source/start/vla-eval.rst
+++ b/docs/source-zh/rst_source/start/vla-eval.rst
@@ -94,9 +94,8 @@ RLinf 提供了 **即开即用的评估脚本**,用于在 *训练分布内*
    export PYTHONPATH=${REPO_PATH}:$PYTHONPATH

    # LIBERO 仓库路径
-   export LIBERO_REPO_PATH="/root/LIBERO"
-   export LIBERO_CONFIG_PATH=${LIBERO_REPO_PATH}
-   export PYTHONPATH=${LIBERO_REPO_PATH}:$PYTHONPATH
+   export LIBERO_PATH="/opt/libero"
+   export PYTHONPATH=${LIBERO_PATH}:$PYTHONPATH

    export CUDA_LAUNCH_BLOCKING=1
    export HYDRA_FULL_ERROR=1
diff --git a/examples/embodiment/eval_embodiment.sh b/examples/embodiment/eval_embodiment.sh
index eec38eee0..316076ae8 100644
--- a/examples/embodiment/eval_embodiment.sh
+++ b/examples/embodiment/eval_embodiment.sh
@@ -7,12 +7,10 @@ export SRC_FILE="${EMBODIED_PATH}/eval_embodied_agent.py"
 export MUJOCO_GL="osmesa"
 export PYOPENGL_PLATFORM="osmesa"
 export PYTHONPATH=${REPO_PATH}:$PYTHONPATH
-# NOTE: set LIBERO_REPO_PATH to the path of the LIBERO repo
-export LIBERO_REPO_PATH="/path/to/repo/LIBERO"
-# NOTE: set LIBERO_CONFIG_PATH for libero/libero/__init__.py
-export LIBERO_CONFIG_PATH=${LIBERO_REPO_PATH}
-export PYTHONPATH=${LIBERO_REPO_PATH}:$PYTHONPATH
+LIBERO_PATH=/opt/libero
+export PYTHONPATH=${LIBERO_PATH}:$PYTHONPATH
+
 export CUDA_LAUNCH_BLOCKING=1
 export HYDRA_FULL_ERROR=1
diff --git a/tests/e2e_tests/coding_online_rl/run_coding_online_rl.sh b/tests/e2e_tests/coding_online_rl/run_coding_online_rl.sh
index 7d089861e..9bf7b8df7 100644
--- a/tests/e2e_tests/coding_online_rl/run_coding_online_rl.sh
+++ b/tests/e2e_tests/coding_online_rl/run_coding_online_rl.sh
@@ -8,6 +8,5 @@ export RAY_DEDUP_LOGS=0

 export PYTHONPATH=${REPO_PATH}:$PYTHONPATH
-
 python ${REPO_PATH}/examples/coding_online_rl/main_coding_online_rl.py --config-path ${REPO_PATH}/tests/e2e_tests/coding_online_rl --config-name qwen2.5-1.5b-ppo
diff --git a/tests/e2e_tests/embodied/libero_130_grpo_openvlaoft.yaml b/tests/e2e_tests/embodied/libero_130_grpo_openvlaoft.yaml
index c5f5744b3..9f649b48c 100644
--- a/tests/e2e_tests/embodied/libero_130_grpo_openvlaoft.yaml
+++ b/tests/e2e_tests/embodied/libero_130_grpo_openvlaoft.yaml
@@ -23,7 +23,7 @@ runner:
     experiment_name: "ci-test"
     logger_backends: ["tensorboard"] # wandb, swanlab

-  max_epochs: 3
+  max_epochs: 1
   max_steps: -1

   only_eval: False
diff --git a/tests/e2e_tests/embodied/run_openvla.sh b/tests/e2e_tests/embodied/run_openvla.sh
index d5304dece..b28ee4848 100644
--- a/tests/e2e_tests/embodied/run_openvla.sh
+++ b/tests/e2e_tests/embodied/run_openvla.sh
@@ -4,7 +4,5 @@ set -x

 tabs 4
 export PYTHONPATH=${REPO_PATH}:$PYTHONPATH
-unset HOME # GitHub action sets HOME to a wrong path (/github/home), breaking simulator
-source switch_env openvla

 python ${REPO_PATH}/examples/embodiment/train_embodied_agent.py --config-path ${REPO_PATH}/tests/e2e_tests/embodied --config-name ppo_openvla
\ No newline at end of file
diff --git
a/tests/e2e_tests/embodied/run_openvlaoft_libero130.sh b/tests/e2e_tests/embodied/run_openvlaoft_libero130.sh index 596146ff3..003b40910 100644 --- a/tests/e2e_tests/embodied/run_openvlaoft_libero130.sh +++ b/tests/e2e_tests/embodied/run_openvlaoft_libero130.sh @@ -4,10 +4,6 @@ set -x tabs 4 export MUJOCO_GL="osmesa" export PYOPENGL_PLATFORM="osmesa" -export LIBERO_REPO_PATH="/workspace/libero" -export LIBERO_CONFIG_PATH=${LIBERO_REPO_PATH} -export PYTHONPATH=${LIBERO_REPO_PATH}:$PYTHONPATH export PYTHONPATH=${REPO_PATH}:$PYTHONPATH -unset HOME # GitHub action sets HOME to a wrong path (/github/home), breaking simulator python ${REPO_PATH}/examples/embodiment/train_embodied_agent.py --config-path ${REPO_PATH}/tests/e2e_tests/embodied --config-name libero_130_grpo_openvlaoft \ No newline at end of file diff --git a/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-collocated.yaml b/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-collocated.yaml index a3d8ee225..172a33650 100644 --- a/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-collocated.yaml +++ b/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-collocated.yaml @@ -144,7 +144,7 @@ actor: mcore_gpt: True spec_name: decoder_gpt - checkpoint_load_path: null + checkpoint_load_path: /workspace/dataset/DeepSeek-R1-Distill-Qwen-1.5B-tp2-pp1 offload_optimizer: True offload_weight: True @@ -232,7 +232,7 @@ actor: use_dist_ckpt: False tp_comm_bootstrap_backend: nccl tp_comm_overlap_cfg: null # tp_comm_overlap_cfg.yaml - use_hf_ckpt: True # if true, will transfer hf model to generate megatron checkpoint and use it for training. + use_hf_ckpt: False # if true, will transfer hf model to generate megatron checkpoint and use it for training. use_profiler: False # if true, will enable torch profiler when training, pay attention it has influence on performance ckpt_convertor: # config for ckpt convertor diff --git a/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml b/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml index d5f57e660..b1264a9f9 100644 --- a/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml +++ b/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml @@ -141,7 +141,7 @@ actor: mcore_gpt: True spec_name: decoder_gpt - checkpoint_load_path: null + checkpoint_load_path: /workspace/dataset/DeepSeek-R1-Distill-Qwen-1.5B-tp2-pp1 offload_optimizer: True offload_weight: True diff --git a/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-pipeline.yaml b/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-pipeline.yaml index ad84fcec4..04232611a 100644 --- a/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-pipeline.yaml +++ b/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-pipeline.yaml @@ -155,7 +155,7 @@ actor: mcore_gpt: True spec_name: decoder_gpt - checkpoint_load_path: null + checkpoint_load_path: /workspace/dataset/DeepSeek-R1-Distill-Qwen-1.5B-tp2-pp1 offload_optimizer: True offload_weight: True diff --git a/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml b/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml index 6ba79956d..17a8c1384 100644 --- a/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml +++ b/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml @@ -139,7 +139,7 @@ actor: mcore_gpt: True spec_name: decoder_gpt - checkpoint_load_path: null + checkpoint_load_path: /workspace/dataset/DeepSeek-R1-Distill-Qwen-1.5B-tp2-pp1 offload_optimizer: True 
offload_weight: True

@@ -227,7 +227,7 @@ actor:
     use_dist_ckpt: False
     tp_comm_bootstrap_backend: nccl
     tp_comm_overlap_cfg: null # tp_comm_overlap_cfg.yaml
-    use_hf_ckpt: True # if true, will transfer hf model to generate megatron checkpoint and use it for training.
+    use_hf_ckpt: False # if true, will transfer hf model to generate megatron checkpoint and use it for training.
     use_profiler: False # if true, will enable torch profiler when training, pay attention it has influence on performance

     ckpt_convertor: # config for ckpt convertor
diff --git a/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-collocated.yaml b/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-collocated.yaml
index f06f46ce0..c62990fda 100644
--- a/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-collocated.yaml
+++ b/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-collocated.yaml
@@ -140,7 +140,7 @@ actor:
     mcore_gpt: True
     spec_name: decoder_gpt

-    checkpoint_load_path: null
+    checkpoint_load_path: /workspace/dataset/DeepSeek-R1-Distill-Qwen-1.5B-tp2-pp1

     offload_optimizer: True
     offload_weight: True
@@ -228,7 +228,7 @@ actor:
     use_dist_ckpt: False
     tp_comm_bootstrap_backend: nccl
     tp_comm_overlap_cfg: null # tp_comm_overlap_cfg.yaml
-    use_hf_ckpt: True # if true, will transfer hf model to generate megatron checkpoint and use it for training.
+    use_hf_ckpt: False # if true, will transfer hf model to generate megatron checkpoint and use it for training.
     use_profiler: False # if true, will enable torch profiler when training, pay attention it has influence on performance

     ckpt_convertor: # config for ckpt convertor
diff --git a/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml b/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml
index 496589988..dd4e1a6a6 100644
--- a/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml
+++ b/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml
@@ -142,7 +142,7 @@ actor:
     mcore_gpt: True
     spec_name: decoder_gpt

-    checkpoint_load_path: null
+    checkpoint_load_path: /workspace/dataset/DeepSeek-R1-Distill-Qwen-1.5B-tp2-pp1

     offload_optimizer: True
     offload_weight: True
diff --git a/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-pipeline.yaml b/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-pipeline.yaml
index 2100f6535..5785508b3 100644
--- a/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-pipeline.yaml
+++ b/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-pipeline.yaml
@@ -156,7 +156,7 @@ actor:
     mcore_gpt: True
     spec_name: decoder_gpt

-    checkpoint_load_path: null
+    checkpoint_load_path: /workspace/dataset/DeepSeek-R1-Distill-Qwen-1.5B-tp2-pp1

     offload_optimizer: True
     offload_weight: True

From 78ddc653764c10d7187ee68fce59e5ff715653ac Mon Sep 17 00:00:00 2001
From: Andy Lin <32576375+andylin-hao@users.noreply.github.com>
Date: Mon, 13 Oct 2025 01:58:54 +0800
Subject: [PATCH 20/57] feat: unified CI script and new embodied CI tests (#163)

Signed-off-by: Hao Lin
---
 .github/workflows/code-test.yml | 41 ++++-
 .../embodied/libero_130_grpo_openvlaoft.yaml | 2 +-
 .../embodied/libero_goal_grpo_openvlaoft.yaml | 163 ++++++++++++++++++
 .../embodied/maniskill_grpo_openvlaoft.yaml | 162 +++++++++++++++++
 ...penvla.yaml => maniskill_ppo_openvla.yaml} | 8 +-
 .../embodied/{run_openvla.sh => run.sh} | 7 +-
 .../embodied/run_openvlaoft_libero130.sh | 9 -
 ...1.5b-grpo-collocated-rollout-logprobs.yaml | 2 +-
 .../sglang/qwen2.5-1.5b-grpo-collocated.yaml | 2 +-
 ...5-1.5b-grpo-pipeline-rollout-logprobs.yaml | 2 +-
 .../sglang/qwen2.5-1.5b-grpo-pipeline.yaml | 2 +-
...1.5b-grpo-collocated-rollout-logprobs.yaml | 2 +-
 .../vllm/qwen2.5-1.5b-grpo-collocated.yaml | 2 +-
 ...5-1.5b-grpo-pipeline-rollout-logprobs.yaml | 2 +-
 .../math/vllm/qwen2.5-1.5b-grpo-pipeline.yaml | 2 +-
 15 files changed, 380 insertions(+), 28 deletions(-)
 create mode 100644 tests/e2e_tests/embodied/libero_goal_grpo_openvlaoft.yaml
 create mode 100644 tests/e2e_tests/embodied/maniskill_grpo_openvlaoft.yaml
 rename tests/e2e_tests/embodied/{ppo_openvla.yaml => maniskill_ppo_openvla.yaml} (97%)
 rename tests/e2e_tests/embodied/{run_openvla.sh => run.sh} (65%)
 delete mode 100644 tests/e2e_tests/embodied/run_openvlaoft_libero130.sh

diff --git a/.github/workflows/code-test.yml b/.github/workflows/code-test.yml
index b61817bfe..c9c34a6c6 100644
--- a/.github/workflows/code-test.yml
+++ b/.github/workflows/code-test.yml
@@ -190,6 +190,7 @@ jobs:
         pip install httpx asyncio fuzzywuzzy

       - name: SGLang Collocated mode
+        timeout-minutes: 20
         run: |
           export REPO_PATH=$(pwd)
           source switch_env reason
@@ -197,7 +198,7 @@ jobs:

   # =============================================== embodied e2e tests ====================================================

-  embodied-openvla-ppo-test:
+  embodied-maniskill-ppo-openvla-test:
     needs: [check-changes]
     if: needs.check-changes.outputs.file_filter == 'true'
     runs-on: embodied
@@ -209,9 +210,24 @@ jobs:
       run: |
         export REPO_PATH=$(pwd)
         source switch_env openvla
-        bash tests/e2e_tests/embodied/run_openvla.sh
+        bash tests/e2e_tests/embodied/run.sh maniskill_ppo_openvla
+
+  embodied-maniskill-grpo-openvlaoft-test:
+    needs: [check-changes]
+    if: needs.check-changes.outputs.file_filter == 'true'
+    runs-on: embodied
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v5
+      - name: OpenVLA-OFT test
+        timeout-minutes: 20
+        run: |
+          export REPO_PATH=$(pwd)
+          source switch_env openvla-oft
+          cp -r /workspace/dataset/maniskill_assets/assets ${REPO_PATH}/rlinf/envs/maniskill/
+          bash tests/e2e_tests/embodied/run.sh maniskill_grpo_openvlaoft

-  embodied-openvlaoft-grpo-test:
+  embodied-libero-goal-grpo-openvlaoft-test:
     needs: [check-changes]
     if: needs.check-changes.outputs.file_filter == 'true'
     runs-on: embodied
@@ -219,10 +235,25 @@ jobs:
     - name: Checkout code
       uses: actions/checkout@v5
     - name: OpenVLA-OFT test
+      timeout-minutes: 20
+      run: |
+        export REPO_PATH=$(pwd)
+        source switch_env openvla-oft
+        bash tests/e2e_tests/embodied/run.sh libero_goal_grpo_openvlaoft
+
+  embodied-libero-130-grpo-openvlaoft-test:
+    needs: [check-changes]
+    if: needs.check-changes.outputs.file_filter == 'true'
+    runs-on: embodied
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v5
+    - name: OpenVLA-OFT test
+      timeout-minutes: 20
       run: |
         export REPO_PATH=$(pwd)
         source switch_env openvla-oft
-        bash tests/e2e_tests/embodied/run_openvlaoft_libero130.sh
+        bash tests/e2e_tests/embodied/run.sh libero_130_grpo_openvlaoft

   # =============================================== auto placement tests ====================================================

@@ -255,7 +286,7 @@ jobs:
       coding-online-rl-qwen-ppo-test,

       # Embodied e2e tests
-      embodied-openvla-ppo-test, embodied-openvlaoft-grpo-test,
+      embodied-maniskill-ppo-openvla-test, embodied-maniskill-grpo-openvlaoft-test, embodied-libero-goal-grpo-openvlaoft-test, embodied-libero-130-grpo-openvlaoft-test,

       # Auto placement tests
       static-auto-placement-test
diff --git a/tests/e2e_tests/embodied/libero_130_grpo_openvlaoft.yaml b/tests/e2e_tests/embodied/libero_130_grpo_openvlaoft.yaml
index 9f649b48c..2365b5a86 100644
--- a/tests/e2e_tests/embodied/libero_130_grpo_openvlaoft.yaml
+++
b/tests/e2e_tests/embodied/libero_130_grpo_openvlaoft.yaml @@ -13,7 +13,7 @@ hydra: cluster: num_nodes: 1 component_placement: - actor,env,rollout: all + actor,env,rollout: 0-3 runner: task_type: embodied diff --git a/tests/e2e_tests/embodied/libero_goal_grpo_openvlaoft.yaml b/tests/e2e_tests/embodied/libero_goal_grpo_openvlaoft.yaml new file mode 100644 index 000000000..ddfe4a500 --- /dev/null +++ b/tests/e2e_tests/embodied/libero_goal_grpo_openvlaoft.yaml @@ -0,0 +1,163 @@ +defaults: + - env/train: libero_goal + - env/eval: libero_goal + - override hydra/job_logging: stdout + +hydra: + run: + dir: . + output_subdir: null + searchpath: + - file://${oc.env:REPO_PATH}/examples/embodiment/config + +cluster: + num_nodes: 1 + component_placement: + actor,env,rollout: 0-3 + +runner: + task_type: embodied + logger: + log_path: /workspace/results/ + project_name: rlinf + experiment_name: "ci-test" + logger_backends: ["tensorboard"] # wandb, swanlab + + max_epochs: 1 + max_steps: -1 + + only_eval: False + val_check_interval: -1 + save_interval: -1 + seq_length: 4096 + max_prompt_length: 128 + +algorithm: + auto_reset: False + ignore_terminations: False + use_fixed_reset_state_ids: True + require_values: False + shuffle_samples: True + normalize_advantages: True + kl_penalty: kl # how to estimate kl divergence: kl or kl_penalty + group_size: 8 + n_chunk_steps: ${int_div:${env.train.max_episode_steps}, ${actor.model.num_action_chunks}} + n_eval_chunk_steps: ${int_div:${env.train.max_episode_steps}, ${actor.model.num_action_chunks}} + num_group_envs: 8 + rollout_epoch: 1 + reward_type: action_level # action_level or chunk_level + logprob_type: token_level + entropy_type: token_level + + adv_type: embodied_grpo + loss_type: embodied_grpo + loss_agg_func: "token-mean" + kl_beta: 0.0 + entropy_bonus: 0 + clip_ratio_high: 0.28 + clip_ratio_low: 0.2 + clip_ratio_c: 3.0 + value_clip: 0.2 + huber_delta: 10.0 + + gamma: 0.99 + gae_lambda: 0.95 + + sampling_params: + use_greedy: False + temperature_train: 1.6 + temperature_eval: 1.6 + top_k: -1 + top_p: 1.0 + repetition_penalty: 1.0 + + # length argument for autoregressive sampling + # max length means max amount of tokens to generate + length_params: + max_new_token: null + max_length: 1024 + min_length: 1 + + filter_rewards: True + rewards_lower_bound: 0.5 + rewards_upper_bound: 4.5 + +env: + group_name: "EnvGroup" + channel: + name: "env_buffer_list" + queue_name: "obs_buffer" + queue_size: 0 + enable_offload: False + +rollout: + group_name: "RolloutGroup" + channel: + name: ${env.channel.name} + queue_name: "action_buffer" + queue_size: 0 + mode: "colocate" + generation_backend: "huggingface" + model_dir: "/workspace/dataset/Openvla-oft-SFT-libero-goal-traj1/" + enable_offload: False + pipeline_stage_num: 1 + +actor: + group_name: "ActorGroup" + channel: + name: ${env.channel.name} + queue_name: "replay_buffer" + queue_size: 0 + training_backend: "fsdp" + checkpoint_load_path: "/workspace/dataset/Openvla-oft-SFT-libero-goal-traj1/" + checkpoint_save_path: "/workspace/results" + micro_batch_size: 2 + global_batch_size: 256 + seed: 1234 + enable_offload: False + + model: + model_name: "openvla_oft" + value_type: ${algorithm.reward_type} # 'action' or 'token' + action_dim: 7 + num_action_chunks: 8 + use_proprio: False + unnorm_key: libero_goal_no_noops + center_crop: True + + precision: "bf16" + add_bias_linear: False + add_qkv_bias: True + vocab_size: 32000 + hidden_size: 4096 + policy_setup: "widowx_bridge" + vh_mode: "a0" + image_size: [224, 224] + 
is_lora: False + lora_rank: 32 + num_images_in_input: 1 + attn_implementation: "flash_attention_2" + low_cpu_mem_usage: True + trust_remote_code: True + gradient_checkpointing: False + + optim: + lr: 2.0e-5 + value_lr: 3.0e-3 + adam_beta1: 0.9 + adam_beta2: 0.999 + adam_eps: 1.0e-05 + clip_grad: 1.0 + + tokenizer: + tokenizer_type: "HuggingFaceTokenizer" + tokenizer_model: "/workspace/dataset/Openvla-oft-SFT-libero-goal-traj1/" + use_fast: False + trust_remote_code: True + padding_side: "right" + +reward: + use_reward_model: False + +critic: + use_critic_model: False \ No newline at end of file diff --git a/tests/e2e_tests/embodied/maniskill_grpo_openvlaoft.yaml b/tests/e2e_tests/embodied/maniskill_grpo_openvlaoft.yaml new file mode 100644 index 000000000..ab384947a --- /dev/null +++ b/tests/e2e_tests/embodied/maniskill_grpo_openvlaoft.yaml @@ -0,0 +1,162 @@ +defaults: + - env/train: PutOnPlateInScene25Main + - env/eval: maniskill_ood_template + - override hydra/job_logging: stdout + +hydra: + run: + dir: . + output_subdir: null + searchpath: + - file://${oc.env:REPO_PATH}/examples/embodiment/config + +cluster: + num_nodes: 1 + component_placement: + actor,env,rollout: 0-3 + +runner: + task_type: embodied + logger: + log_path: /workspace/results/ + project_name: rlinf + experiment_name: "ci-test" + logger_backends: ["tensorboard"] # wandb, swanlab + + max_epochs: 2 + max_steps: -1 + + val_check_interval: -1 + save_interval: -1 + seq_length: 1024 + max_prompt_length: 30 + +algorithm: + auto_reset: False + ignore_terminations: False + use_fixed_reset_state_ids: True + require_values: False + normalize_advantages: True + kl_penalty: kl # how to estimate kl divergence: kl or kl_penalty + group_size: 2 + + n_chunk_steps: 10 + n_eval_chunk_steps: 10 + num_group_envs: 8 + rollout_epoch: 1 + reward_type: action_level # action_level or chunk_level + logprob_type: token_level + entropy_type: token_level + + adv_type: embodied_grpo + loss_type: embodied_grpo + loss_agg_func: "token-mean" + kl_beta: 0.0 + entropy_bonus: 0 + clip_ratio_high: 0.28 + clip_ratio_low: 0.2 + clip_ratio_c: 3.0 + value_clip: 0.2 + huber_delta: 10.0 + + gamma: 0.99 + gae_lambda: 0.95 + + # params for rollout + sampling_params: + use_greedy: False + temperature_train: 1.0 + temperature_eval: 0.6 + top_k: 0 + top_p: 1.0 + repetition_penalty: 1.0 + + # length argument for autoregressive sampling + # max length means max amount of tokens to generate + length_params: + max_new_token: null + max_length: 1024 + min_length: 1 + +env: + group_name: "EnvGroup" + channel: + name: "env_buffer_list" + queue_name: "obs_buffer" + queue_size: 0 + enable_offload: False + +rollout: + group_name: "RolloutGroup" + channel: + name: ${env.channel.name} + queue_name: "action_buffer" + queue_size: 0 + mode: "colocate" + generation_backend: "huggingface" + model_dir: "/workspace/dataset/Openvla-oft-SFT-libero-goal-traj1/" + enable_offload: True + pipeline_stage_num: 1 + +actor: + group_name: "ActorGroup" + channel: + name: ${env.channel.name} + queue_name: "replay_buffer" + queue_size: 0 + training_backend: "fsdp" + checkpoint_load_path: "/workspace/dataset/Openvla-oft-SFT-libero-goal-traj1/" + checkpoint_save_path: "/workspace/results" + + micro_batch_size: 10 + global_batch_size: 40 + seed: 1234 + enable_offload: True + + model: + model_name: "openvla_oft" + value_type: ${algorithm.reward_type} # 'action' or 'token' + action_dim: 7 + num_action_chunks: 8 + use_proprio: False + unnorm_key: bridge_orig + center_crop: True + + precision: "bf16" + 
add_bias_linear: False + add_qkv_bias: True + vocab_size: 32000 + hidden_size: 4096 + policy_setup: "widowx_bridge" + vh_mode: "a0" + image_size: [224, 224] + is_lora: True + lora_rank: 32 + lora_path: /workspace/dataset/RLinf-OpenVLAOFT-LIBERO-130-Base-Lora/lora_adapter + ckpt_path: null + num_images_in_input: 1 + attn_implementation: "flash_attention_2" + low_cpu_mem_usage: True + trust_remote_code: True + gradient_checkpointing: False + + tokenizer: + tokenizer_type: "HuggingFaceTokenizer" + tokenizer_model: "/workspace/dataset/Openvla-oft-SFT-libero-goal-traj1/" + use_fast: False + trust_remote_code: True + padding_side: "right" + + optim: + lr: 1.0e-4 + value_lr: 3.0e-3 + adam_beta1: 0.9 + adam_beta2: 0.999 + adam_eps: 1.0e-05 + clip_grad: 10.0 + +reward: + use_reward_model: False + +critic: + use_critic_model: False \ No newline at end of file diff --git a/tests/e2e_tests/embodied/ppo_openvla.yaml b/tests/e2e_tests/embodied/maniskill_ppo_openvla.yaml similarity index 97% rename from tests/e2e_tests/embodied/ppo_openvla.yaml rename to tests/e2e_tests/embodied/maniskill_ppo_openvla.yaml index f1f566b82..2063ab483 100644 --- a/tests/e2e_tests/embodied/ppo_openvla.yaml +++ b/tests/e2e_tests/embodied/maniskill_ppo_openvla.yaml @@ -24,12 +24,12 @@ runner: experiment_name: "ci-test" logger_backends: ["tensorboard"] # wandb, swanlab - max_epochs: 3 + max_epochs: 2 max_steps: -1 val_check_interval: -1 save_interval: -1 - seq_length: 4096 + seq_length: 1024 max_prompt_length: 30 algorithm: @@ -115,8 +115,8 @@ actor: checkpoint_load_path: "/workspace/dataset/openvla-7b" checkpoint_save_path: "/workspace/results" - micro_batch_size: 20 - global_batch_size: 80 + micro_batch_size: 10 + global_batch_size: 40 seed: 1234 enable_offload: True diff --git a/tests/e2e_tests/embodied/run_openvla.sh b/tests/e2e_tests/embodied/run.sh similarity index 65% rename from tests/e2e_tests/embodied/run_openvla.sh rename to tests/e2e_tests/embodied/run.sh index b28ee4848..09d44d7cc 100644 --- a/tests/e2e_tests/embodied/run_openvla.sh +++ b/tests/e2e_tests/embodied/run.sh @@ -3,6 +3,11 @@ set -x tabs 4 +CONFIG=$1 +BACKEND=${2:-"egl"} + +export MUJOCO_GL=${BACKEND} +export PYOPENGL_PLATFORM=${BACKEND} export PYTHONPATH=${REPO_PATH}:$PYTHONPATH -python ${REPO_PATH}/examples/embodiment/train_embodied_agent.py --config-path ${REPO_PATH}/tests/e2e_tests/embodied --config-name ppo_openvla \ No newline at end of file +python ${REPO_PATH}/examples/embodiment/train_embodied_agent.py --config-path ${REPO_PATH}/tests/e2e_tests/embodied --config-name ${CONFIG} \ No newline at end of file diff --git a/tests/e2e_tests/embodied/run_openvlaoft_libero130.sh b/tests/e2e_tests/embodied/run_openvlaoft_libero130.sh deleted file mode 100644 index 003b40910..000000000 --- a/tests/e2e_tests/embodied/run_openvlaoft_libero130.sh +++ /dev/null @@ -1,9 +0,0 @@ -#! 
/bin/bash -set -x - -tabs 4 -export MUJOCO_GL="osmesa" -export PYOPENGL_PLATFORM="osmesa" -export PYTHONPATH=${REPO_PATH}:$PYTHONPATH - -python ${REPO_PATH}/examples/embodiment/train_embodied_agent.py --config-path ${REPO_PATH}/tests/e2e_tests/embodied --config-name libero_130_grpo_openvlaoft \ No newline at end of file diff --git a/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml b/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml index 3253f327b..7e2c5164a 100644 --- a/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml +++ b/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml @@ -20,7 +20,7 @@ runner: logger_backends: ["tensorboard"] # wandb, swanlab max_epochs: 1 - max_steps: 3 + max_steps: 2 val_check_interval: 1 save_interval: -1 diff --git a/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-collocated.yaml b/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-collocated.yaml index 172a33650..1dfe47aeb 100644 --- a/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-collocated.yaml +++ b/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-collocated.yaml @@ -20,7 +20,7 @@ runner: logger_backends: ["tensorboard"] # wandb, swanlab max_epochs: 1 - max_steps: 3 + max_steps: 2 val_check_interval: 1 save_interval: -1 diff --git a/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml b/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml index b1264a9f9..25344d0bf 100644 --- a/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml +++ b/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml @@ -21,7 +21,7 @@ runner: logger_backends: ["tensorboard"] # wandb, swanlab max_epochs: 1 - max_steps: 3 + max_steps: 2 val_check_interval: 1 save_interval: -1 diff --git a/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-pipeline.yaml b/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-pipeline.yaml index 04232611a..d5cd2b4a4 100644 --- a/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-pipeline.yaml +++ b/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-pipeline.yaml @@ -22,7 +22,7 @@ runner: logger_backends: ["tensorboard"] # wandb, swanlab max_epochs: 1 - max_steps: 3 + max_steps: 2 val_check_interval: 1 save_interval: -1 diff --git a/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml b/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml index 17a8c1384..fe61ab16c 100644 --- a/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml +++ b/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml @@ -20,7 +20,7 @@ runner: logger_backends: ["tensorboard"] # wandb, swanlab max_epochs: 1 - max_steps: 3 + max_steps: 2 val_check_interval: 1 save_interval: -1 diff --git a/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-collocated.yaml b/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-collocated.yaml index c62990fda..099fe7268 100644 --- a/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-collocated.yaml +++ b/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-collocated.yaml @@ -20,7 +20,7 @@ runner: logger_backends: ["tensorboard"] # wandb, swanlab max_epochs: 1 - max_steps: 3 + max_steps: 2 val_check_interval: 1 save_interval: -1 diff --git a/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml b/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml index dd4e1a6a6..62d0c5247 100644 --- 
a/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml +++ b/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml @@ -22,7 +22,7 @@ runner: logger_backends: ["tensorboard"] # wandb, swanlab max_epochs: 1 - max_steps: 3 + max_steps: 2 val_check_interval: 1 save_interval: -1 diff --git a/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-pipeline.yaml b/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-pipeline.yaml index 5785508b3..3f7821587 100644 --- a/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-pipeline.yaml +++ b/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-pipeline.yaml @@ -23,7 +23,7 @@ runner: logger_backends: ["tensorboard"] # wandb, swanlab max_epochs: 1 - max_steps: 3 + max_steps: 2 val_check_interval: 1 save_interval: -1 From 461e4e645a8481e756723dd75c06a168c0603fab Mon Sep 17 00:00:00 2001 From: cc <1716911340@qq.com> Date: Mon, 8 Sep 2025 18:38:19 +0800 Subject: [PATCH 21/57] Merge pull request #1 from anHappyDog/feature/weight_convertor feat(weight): refactor and add qwen2.5-vl mg2hf convertor --- rlinf/utils/convertor/__init__.py | 13 + rlinf/utils/convertor/utils.py | 457 ++++++++++++++++++ .../utils/resharding/mcore_weight_reshard.py | 2 +- rlinf/utils/resharding/reshard_config.py | 9 +- rlinf/utils/resharding/utils.py | 218 --------- 5 files changed, 477 insertions(+), 222 deletions(-) create mode 100644 rlinf/utils/convertor/__init__.py create mode 100644 rlinf/utils/convertor/utils.py diff --git a/rlinf/utils/convertor/__init__.py b/rlinf/utils/convertor/__init__.py new file mode 100644 index 000000000..5b365ea1e --- /dev/null +++ b/rlinf/utils/convertor/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2025 The RLinf Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/rlinf/utils/convertor/utils.py b/rlinf/utils/convertor/utils.py new file mode 100644 index 000000000..5d15e3639 --- /dev/null +++ b/rlinf/utils/convertor/utils.py @@ -0,0 +1,457 @@ +# Copyright 2025 The RLinf Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
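+
+# Usage sketch (illustrative): `reshard_config` is assumed to be the
+# ReshardConfig-style object that exposes `model_config` and
+# `reshard_tp_size`, as read by the transform helpers below.
+#
+#   convertor = get_mg2hf_convertor("qwen2.5", reshard_config, strict=True)
+#   hf_state_dict = convertor.convert(megatron_state_dict)
+#
+# convert() matches every Megatron parameter name against the registered
+# regex rules and emits HuggingFace-named tensors, splitting fused QKV and
+# fused fc1 (gate/up) weights where required.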
+import re +from dataclasses import dataclass +from enum import Enum +from typing import Callable, Dict, List, Optional, Tuple + +import torch + + +class TransformType(Enum): + SPLIT_QKV = "split_qkv" + SPLIT_QKV_BIAS = "split_qkv_bias" + SPLIT_FC1 = "split_fc1" + SPLIT_NONE = "split_none" + + +class TransformFunc: + @staticmethod + def _split_gqa_tensor( + tensor: torch.Tensor, new_statedict: dict, weight_names: List[str], config + ) -> None: + """ + Private helper to split a GQA-combined tensor (weight or bias). + """ + hidden_size = config.model_config.hidden_size + num_attention_heads = config.model_config.num_attention_heads + num_key_value_heads = ( + config.model_config.num_query_groups or num_attention_heads + ) + head_dim = hidden_size // num_attention_heads + + tp_size = config.model_config.tensor_model_parallel_size + + assert num_key_value_heads % tp_size == 0, ( + "num_key_value_heads must be divisible by tensor parallel size" + ) + + q_heads_per_rank = num_attention_heads // tp_size + kv_heads_per_rank = num_key_value_heads // tp_size + + q_shard_size = q_heads_per_rank * head_dim + k_shard_size = kv_heads_per_rank * head_dim + v_shard_size = kv_heads_per_rank * head_dim + + shard_size = q_shard_size + k_shard_size + v_shard_size + + q_shards, k_shards, v_shards = [], [], [] + + # [Qi,Ki,Vi] + for shard in tensor.split(shard_size, dim=0): + # Qi, Ki, Vi + q_shard, k_shard, v_shard = shard.split( + [q_shard_size, k_shard_size, v_shard_size], dim=0 + ) + q_shards.append(q_shard) + k_shards.append(k_shard) + v_shards.append(v_shard) + + # cat + q_full = torch.cat(q_shards, dim=0) + k_full = torch.cat(k_shards, dim=0) + v_full = torch.cat(v_shards, dim=0) + + # saved + new_statedict[weight_names[0]] = q_full.clone() + new_statedict[weight_names[1]] = k_full.clone() + new_statedict[weight_names[2]] = v_full.clone() + + @staticmethod + def split_fc1( + linear_fc1: torch.Tensor, new_statedict: dict, weight_names: List[str], config + ) -> None: + assert weight_names is not None and len(weight_names) == 2, ( + f"split_fc1 transform expects two weight names, got {weight_names}" + ) + + tp_size = config.model_config.tensor_model_parallel_size + target_tp = config.reshard_tp_size + split_size = linear_fc1.shape[0] // (tp_size // target_tp) + linear_fc1_slice = torch.split(linear_fc1, split_size, dim=0) + + gate_proj_shards = [] + up_proj_shards = [] + for weight in linear_fc1_slice: + assert weight.shape[0] % 2 == 0, ( + f"linear_fc1 weight shape {weight.shape} is not even along dim 0" + ) + weight_chunk = torch.chunk(weight, 2, dim=0) + gate_proj_shards.append(weight_chunk[0]) + up_proj_shards.append(weight_chunk[1]) + gate_proj = torch.cat(gate_proj_shards, dim=0) + up_proj = torch.cat(up_proj_shards, dim=0) + + new_statedict[weight_names[0]] = gate_proj.clone() + new_statedict[weight_names[1]] = up_proj.clone() + + @staticmethod + def split_none( + tensor: torch.Tensor, new_statedict: dict, weight_names: List[str] + ) -> None: + assert weight_names is not None and len(weight_names) == 1, ( + f"split_none transform expects one weight name, got {weight_names}" + ) + new_statedict[weight_names[0]] = tensor.clone() + + +@dataclass +class ConvertorRule: + pattern: re.Pattern + transform: TransformType + targets: List[str] + post: Optional[Callable] = None + + +class BaseConvertor: + def __init__(self, config, strict: bool = False): + self.cfg = config + self.strict = strict + self.rules = self.build_rules() + + def map_name(self, name: str) -> Optional[Tuple[TransformType, List[str]]]: + 
def _get_targets_from_match(templates: list[str], m: re.Match) -> list[str]: + gd = m.groupdict() + out = [] + for t in templates: + if "{" in t and "}" in t: + out.append(t.format(**gd)) + else: + out.append(m.expand(t)) + return out + + for r in self.rules: + m = r.pattern.fullmatch(name) + if not m: + continue + targets = r.targets + if r.post: + targets = r.post(targets, m) + full_names = _get_targets_from_match(targets, m) + return r.transform, full_names + return None + + def convert(self, state_dict: Dict) -> Dict: + converted = {} + for k, v in state_dict.items(): + mapped = self.map_name(k) + if mapped is None: + if self.strict: + raise KeyError(f"Unmapped key {k}") + continue + transform, targets = mapped + if transform in (TransformType.SPLIT_QKV, TransformType.SPLIT_QKV_BIAS): + TransformFunc._split_gqa_tensor(v, converted, targets, self.cfg) + elif transform == TransformType.SPLIT_FC1: + TransformFunc.split_fc1(v, converted, targets, self.cfg) + elif transform == TransformType.SPLIT_NONE: + TransformFunc.split_none(v, converted, targets) + else: + raise ValueError(f"Unknown transform type {transform}") + return converted + + def build_rules(self) -> List[ConvertorRule]: + """ + Should be implemented in subclass to build the conversion rules. + """ + raise NotImplementedError + + +class Qwen2_5Convertor(BaseConvertor): + def build_rules(self) -> List[ConvertorRule]: + LID = r"(?P\d+)" + WB = r"(?Pweight|bias)" + + return [ + # embeddings + ConvertorRule( + re.compile(r"embedding\.word_embeddings\.weight$"), + TransformType.SPLIT_NONE, + [r"model.embed_tokens.weight"], + ), + # final_layernorm + ConvertorRule( + re.compile(r"decoder\.final_layernorm\.weight$"), + TransformType.SPLIT_NONE, + [r"model.norm.weight"], + ), + # lm_head + ConvertorRule( + re.compile(r"output_layer\.weight$"), + TransformType.SPLIT_NONE, + [r"lm_head.weight"], + ), + # attn qkv norm + ConvertorRule( + re.compile( + rf"decoder\.layers\.{LID}\.self_attention\.linear_qkv\.layer_norm_weight$" + ), + TransformType.SPLIT_NONE, + [r"model.layers.\g.input_layernorm.weight"], + ), + # attn qkv weights/bias + ConvertorRule( + re.compile( + rf"decoder\.layers\.{LID}\.self_attention\.linear_qkv\.{WB}$" + ), + TransformType.SPLIT_QKV, + [ + r"model.layers.\g.self_attn.q_proj.\g", + r"model.layers.\g.self_attn.k_proj.\g", + r"model.layers.\g.self_attn.v_proj.\g", + ], + ), + # attn o proj + ConvertorRule( + re.compile( + rf"decoder\.layers\.{LID}\.self_attention\.linear_proj\.{WB}$" + ), + TransformType.SPLIT_NONE, + [r"model.layers.\g.self_attn.o_proj.\g"], + ), + # mlp fc1 + ConvertorRule( + re.compile(rf"decoder\.layers\.{LID}\.mlp\.linear_fc1\.{WB}$"), + TransformType.SPLIT_FC1, + [ + r"model.layers.\g.mlp.gate_proj.\g", + r"model.layers.\g.mlp.up_proj.\g", + ], + ), + # mlp fc2 + ConvertorRule( + re.compile(rf"decoder\.layers\.{LID}\.mlp\.linear_fc2\.{WB}$"), + TransformType.SPLIT_NONE, + [r"model.layers.\g.mlp.down_proj.\g"], + ), + # mlp norms + ConvertorRule( + re.compile( + rf"decoder\.layers\.{LID}\.mlp\.linear_fc1\.layer_norm_weight$" + ), + TransformType.SPLIT_NONE, + [r"model.layers.\g.post_attention_layernorm.weight"], + ), + ] + + +class Qwen2_5VLConvertor(BaseConvertor): + def _build_vision_rules(self) -> List[ConvertorRule]: + B = r"(?P\d+)" + WB = r"(?Pweight|bias)" + HF_V_PREFIX = "model.visual" + HF_V_DECODER_PREFIX = f"{HF_V_PREFIX}.blocks" + MG_V_PREFIX = "vision_model" + MG_V_DECODER_PREFIX = rf"{MG_V_PREFIX}\.decoder\.layers" + + vision_rules = [ + # vision patch embed + ConvertorRule( + 
re.compile(rf"^{MG_V_PREFIX}\.patch_embed\.proj\.weight$"), + TransformType.SPLIT_NONE, + [f"{HF_V_PREFIX}.patch_embed.proj.weight"], + ), + # final layer norm + ConvertorRule( + re.compile(rf"^{MG_V_PREFIX}\.decoder\.final_layernorm\.weight$"), + TransformType.SPLIT_NONE, + [f"{HF_V_PREFIX}.merger.ln_q.weight"], + ), + # attn norm + ConvertorRule( + re.compile( + rf"^{MG_V_DECODER_PREFIX}\.{B}\.self_attention\.layer_norm_weight$" + ), + TransformType.SPLIT_NONE, + [f"{HF_V_DECODER_PREFIX}" + r".\g.norm1.weight"], + ), + # attn qkv + ConvertorRule( + re.compile( + rf"^{MG_V_DECODER_PREFIX}\.{B}\.self_attention\.linear_qkv\.{WB}$" + ), + TransformType.SPLIT_NONE, + [f"{HF_V_DECODER_PREFIX}" + r".\g.attn.qkv.\g"], + ), + # attn proj + ConvertorRule( + re.compile( + rf"^{MG_V_DECODER_PREFIX}\.{B}\.self_attention\.linear_proj\.{WB}$" + ), + TransformType.SPLIT_NONE, + [f"{HF_V_DECODER_PREFIX}" + r".\g.attn.proj.\g"], + ), + # mlp fc1 + ConvertorRule( + re.compile(rf"^{MG_V_DECODER_PREFIX}\.{B}\.mlp\.linear_fc1\.{WB}$"), + TransformType.SPLIT_FC1, + [ + f"{HF_V_DECODER_PREFIX}" + r".\g.mlp.gate_proj.\g", + f"{HF_V_DECODER_PREFIX}" + r".\g.mlp.up_proj.\g", + ], + ), + # mlp fc2 + ConvertorRule( + re.compile(rf"^{MG_V_DECODER_PREFIX}\.{B}\.mlp\.linear_fc2\.{WB}$"), + TransformType.SPLIT_NONE, + [f"{HF_V_DECODER_PREFIX}" + r".\g.mlp.down_proj.\g"], + ), + # mlp norm + ConvertorRule( + re.compile( + rf"^{MG_V_DECODER_PREFIX}\.{B}\.mlp\.linear_fc1\.layer_norm_weight$" + ), + TransformType.SPLIT_NONE, + [f"{HF_V_DECODER_PREFIX}" + r".\g.norm2.weight"], + ), + ] + return vision_rules + + def _build_llm_rules(self) -> List[ConvertorRule]: + B = r"(?P\d+)" + WB = r"(?Pweight|bias)" + HF_LLM_PREFIX = "model.language_model" + MG_LLM_PREFIX = "language_model" + MG_LLM_DECODER_PREFIX = rf"{MG_LLM_PREFIX}\.decoder\.layers" + + llm_rules = [ + # embeddings + ConvertorRule( + re.compile(rf"^{MG_LLM_PREFIX}\.embed_tokens\.weight$"), + TransformType.SPLIT_NONE, + [f"{HF_LLM_PREFIX}.embedding.weight"], + ), + # final_layernorm + ConvertorRule( + re.compile(rf"^{MG_LLM_PREFIX}\.final_layernorm\.weight$"), + TransformType.SPLIT_NONE, + [f"{HF_LLM_PREFIX}.norm.weight"], + ), + # attn norm + ConvertorRule( + re.compile( + rf"^{MG_LLM_DECODER_PREFIX}\.{B}\.self_attention\.layer_norm_weight$" + ), + TransformType.SPLIT_NONE, + [f"{HF_LLM_PREFIX}" + r".decoder.layers.\g.input_layernorm.weight"], + ), + # attn qkv + ConvertorRule( + re.compile( + rf"^{MG_LLM_DECODER_PREFIX}\.{B}\.self_attention\.linear_qkv\.{WB}$" + ), + TransformType.SPLIT_QKV, + [ + f"{HF_LLM_PREFIX}" + + r".decoder.layers.\g.self_attn.q_proj.\g", + f"{HF_LLM_PREFIX}" + + r".decoder.layers.\g.self_attn.k_proj.\g", + f"{HF_LLM_PREFIX}" + + r".decoder.layers.\g.self_attn.v_proj.\g", + ], + ), + # attn proj + ConvertorRule( + re.compile( + rf"^{MG_LLM_DECODER_PREFIX}\.{B}\.self_attention\.linear_proj\.{WB}$" + ), + TransformType.SPLIT_NONE, + [f"{HF_LLM_PREFIX}" + r".decoder.layers.\g.self_attn.o_proj.\g"], + ), + # mlp fc1 + ConvertorRule( + re.compile(rf"^{MG_LLM_DECODER_PREFIX}\.{B}\.mlp\.linear_fc1\.{WB}$"), + TransformType.SPLIT_FC1, + [ + f"{HF_LLM_PREFIX}" + r".decoder.layers.\g.mlp.gate_proj.\g", + f"{HF_LLM_PREFIX}" + r".decoder.layers.\g.mlp.up_proj.\g", + ], + ), + # mlp fc2 + ConvertorRule( + re.compile(rf"^{MG_LLM_DECODER_PREFIX}\.{B}\.mlp\.linear_fc2\.{WB}$"), + TransformType.SPLIT_NONE, + [f"{HF_LLM_PREFIX}" + r".decoder.layers.\g.mlp.down_proj.\g"], + ), + # mlp norm + ConvertorRule( + re.compile( + 
rf"^{MG_LLM_DECODER_PREFIX}\.{B}\.mlp\.linear_fc1\.layer_norm_weight$" + ), + TransformType.SPLIT_NONE, + [ + f"{HF_LLM_PREFIX}" + + r".decoder.layers.\g.post_attention_layernorm.weight" + ], + ), + ] + return llm_rules + + def _build_projector_rules(self) -> List[ConvertorRule]: + HF_PROJECTOR_PREFIX = "model.visual.merger" + MG_PROJECTOR_PREFIX = "vision_model.protection.encoder" + WB = r"(?Pweight|bias)" + + projector_rules = [ + # projector fc1 + ConvertorRule( + re.compile(rf"^{MG_PROJECTOR_PREFIX}\.linear_fc1\.{WB}$"), + TransformType.SPLIT_NONE, + [f"{HF_PROJECTOR_PREFIX}" + r".mlp.0.\g"], + ), + # projector fc2 + ConvertorRule( + re.compile(rf"^{MG_PROJECTOR_PREFIX}\.linear_fc2\.{WB}$"), + TransformType.SPLIT_NONE, + [f"{HF_PROJECTOR_PREFIX}" + r".mlp.2.\g"], + ), + ] + return projector_rules + + def build_rules(self) -> List[ConvertorRule]: + rules = [] + rules.extend(self._build_vision_rules()) + rules.extend(self._build_llm_rules()) + rules.extend(self._build_projector_rules()) + return rules + + +_MG2HF_CONVERTOR_REGISTRY = {} + + +def register_mg2hf_convertor(model_arch: str, convertor_cls: Callable) -> None: + if model_arch in _MG2HF_CONVERTOR_REGISTRY: + raise ValueError(f"Convertor for {model_arch} already registered") + _MG2HF_CONVERTOR_REGISTRY[model_arch] = convertor_cls + + +register_mg2hf_convertor("qwen2.5", Qwen2_5Convertor) +register_mg2hf_convertor("qwen2.5-vl", Qwen2_5VLConvertor) + + +def get_mg2hf_convertor(model_arch: str, config, strict: bool = False) -> BaseConvertor: + if model_arch not in _MG2HF_CONVERTOR_REGISTRY: + raise ValueError(f"No convertor registered for {model_arch}") + convertor_cls = _MG2HF_CONVERTOR_REGISTRY[model_arch] + return convertor_cls(config=config, strict=strict) diff --git a/rlinf/utils/resharding/mcore_weight_reshard.py b/rlinf/utils/resharding/mcore_weight_reshard.py index 9ec44eba0..90d8277fa 100644 --- a/rlinf/utils/resharding/mcore_weight_reshard.py +++ b/rlinf/utils/resharding/mcore_weight_reshard.py @@ -183,6 +183,6 @@ def get_layer_num(param_name): ) if self.config.convert_fn is not None: - model_state_dict = self.config.convert_fn(model_state_dict, self.config) + model_state_dict = self.config.convert_fn(model_state_dict) return model_state_dict diff --git a/rlinf/utils/resharding/reshard_config.py b/rlinf/utils/resharding/reshard_config.py index 2b493a839..79e089bcd 100644 --- a/rlinf/utils/resharding/reshard_config.py +++ b/rlinf/utils/resharding/reshard_config.py @@ -17,7 +17,9 @@ from megatron.core.transformer import TransformerConfig -from .utils import get_convert_fn, get_pp_reshard_fn, get_tp_reshard_fn +from rlinf.utils.convertor.utils import get_mg2hf_convertor + +from .utils import get_pp_reshard_fn, get_tp_reshard_fn @dataclass @@ -37,7 +39,7 @@ class ReshardConfig: """Resharding pp size.""" convert_fn: Callable = None - """Convert function to use for converting the model parameters' weight and name from training engine to rollout engine.""" + """Function to convert the model weights from megatron format to HuggingFace format.""" tp_reshard_fn: Callable = None """Resharding function to use for resharding the model parallelism from tensor_model_parallel_size to reshard_tp_size.""" @@ -59,7 +61,8 @@ def __post_init__(self): ) if self.convert_fn is None and self.reshard_weights_format != "mcore": - self.convert_fn = get_convert_fn(self.model_arch) + self._convertor = get_mg2hf_convertor(self.model_arch, self, strict=True) + self.convert_fn = self._convertor.convert if self.tp_reshard_fn is None: self.tp_reshard_fn = 
get_tp_reshard_fn(self.model_arch) diff --git a/rlinf/utils/resharding/utils.py b/rlinf/utils/resharding/utils.py index 1fae2b05a..82ca3eadf 100644 --- a/rlinf/utils/resharding/utils.py +++ b/rlinf/utils/resharding/utils.py @@ -12,23 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -import re -from enum import Enum -from typing import List, Tuple import torch from megatron.core import parallel_state -def get_convert_fn(model_arch: str): - if model_arch == "qwen2.5": - return TransformFunc.convert_mega_qwen2_5_to_hf - else: - raise NotImplementedError( - f"get_convert_fn for model_arch {model_arch} is not implemented" - ) - - def get_tp_reshard_fn(model_arch: str): if model_arch == "qwen2.5": return tp_reshard_fn_qwen2_5 @@ -47,212 +35,6 @@ def get_pp_reshard_fn(model_arch: str): ) -########################### -# convert fn implementation -########################### - - -class TransformType(Enum): - SPLIT_QKV = "split_qkv" - SPLIT_QKV_BIAS = "split_qkv_bias" - SPLIT_FC1 = "split_fc1" - SPLIT_NONE = "split_none" - - -class TransformFunc: - @staticmethod - def _split_gqa_tensor( - tensor: torch.Tensor, new_statedict: dict, weight_names: List[str], config - ) -> None: - hidden_size = config.model_config.hidden_size - num_attention_heads = config.model_config.num_attention_heads - num_query_groups = config.model_config.num_query_groups or num_attention_heads - head_dim = hidden_size // num_attention_heads - - target_tp = config.reshard_tp_size - assert num_query_groups % target_tp == 0, ( - "num_query_groups must be divisible by reshard_tp_size" - ) - local_num_query_groups = num_query_groups // target_tp - - # heads per query group - assert num_attention_heads % num_query_groups == 0, ( - "num_attention_heads must be divisible by num_query_groups" - ) - q_heads_per_group = num_attention_heads // num_query_groups - - num_channel_qkv = q_heads_per_group + 2 - - if tensor.ndim == 2: - # Weight: [out_features, in_features] - out_features, in_features = tensor.shape - expected_out = local_num_query_groups * num_channel_qkv * head_dim - assert out_features == expected_out, ( - f"Unexpected fused QKV weight shape {tensor.shape}, expect " - f"[{expected_out}, {in_features}] (local groups={local_num_query_groups})" - ) - - qkv = tensor.view( - local_num_query_groups, num_channel_qkv, head_dim, in_features - ) - q, k, v = torch.split( - qkv, [q_heads_per_group, 1, 1], dim=1 - ) # shapes: [G, qh, D, In], [G,1,D,In], [G,1,D,In] - q_full = q.reshape(-1, in_features).contiguous() - k_full = k.reshape(-1, in_features).contiguous() - v_full = v.reshape(-1, in_features).contiguous() - else: - # Bias: [out_features] - out_features = tensor.shape[0] - expected_out = local_num_query_groups * num_channel_qkv * head_dim - assert out_features == expected_out, ( - f"Unexpected fused QKV bias shape {tensor.shape}, expect " - f"[{expected_out}] (local groups={local_num_query_groups})" - ) - - qkv = tensor.view(local_num_query_groups, num_channel_qkv, head_dim) - q, k, v = torch.split(qkv, [q_heads_per_group, 1, 1], dim=1) - q_full = q.reshape(-1).contiguous() - k_full = k.reshape(-1).contiguous() - v_full = v.reshape(-1).contiguous() - - # Save to target names - new_statedict[weight_names[0]] = q_full.clone() - new_statedict[weight_names[1]] = k_full.clone() - new_statedict[weight_names[2]] = v_full.clone() - - @staticmethod - def split_fc1( - linear_fc1: torch.Tensor, new_statedict: dict, weight_names: List[str], config - ) -> None: - assert weight_names 
is not None and len(weight_names) == 2, ( - f"split_fc1 transform expects two weight names, got {weight_names}" - ) - - tp_size = config.model_config.tensor_model_parallel_size - target_tp = config.reshard_tp_size - split_size = linear_fc1.shape[0] // (tp_size // target_tp) - linear_fc1_slice = torch.split(linear_fc1, split_size, dim=0) - - gate_proj_shards = [] - up_proj_shards = [] - for weight in linear_fc1_slice: - assert weight.shape[0] % 2 == 0, ( - f"linear_fc1 weight shape {weight.shape} is not even along dim 0" - ) - weight_chunk = torch.chunk(weight, 2, dim=0) - gate_proj_shards.append(weight_chunk[0]) - up_proj_shards.append(weight_chunk[1]) - gate_proj = torch.cat(gate_proj_shards, dim=0) - up_proj = torch.cat(up_proj_shards, dim=0) - - new_statedict[weight_names[0]] = gate_proj.clone() - new_statedict[weight_names[1]] = up_proj.clone() - - @staticmethod - def split_none( - tensor: torch.Tensor, new_statedict: dict, weight_names: List[str] - ) -> None: - assert weight_names is not None and len(weight_names) == 1, ( - f"split_none transform expects one weight name, got {weight_names}" - ) - new_statedict[weight_names[0]] = tensor.clone() - - @staticmethod - def mega_name_qwen2_5_to_hf(name: str) -> Tuple[TransformType, List[str]]: - """ - Convert qwen2_5 model weight megatron name to hf name and do shape transform if needed. - - Args: - name (str): megatron model weight name - - Returns: - (TransformType, List[str]): transform type and the corresponding hf model weight name - """ - if "embedding.word_embeddings.weight" in name: - return (TransformType.SPLIT_NONE, ["model.embed_tokens.weight"]) - if "decoder.final_layernorm.weight" in name: - return (TransformType.SPLIT_NONE, ["model.norm.weight"]) - if "output_layer.weight" in name: - return (TransformType.SPLIT_NONE, ["lm_head.weight"]) - layer_id, suffix = TransformFunc.extract_layer_info(name) - assert layer_id is not None, f"Cannot extract layer info from {name}" - result_pattern = "model.layers.{}.{}" - nmap = { - "self_attention.linear_proj.weight": ( - TransformType.SPLIT_NONE, - ["self_attn.o_proj.weight"], - ), - "self_attention.linear_qkv.layer_norm_weight": ( - TransformType.SPLIT_NONE, - ["input_layernorm.weight"], - ), - "self_attention.linear_qkv.weight": ( - TransformType.SPLIT_QKV, - [ - "self_attn.q_proj.weight", - "self_attn.k_proj.weight", - "self_attn.v_proj.weight", - ], - ), - "self_attention.linear_qkv.bias": ( - TransformType.SPLIT_QKV_BIAS, - [ - "self_attn.q_proj.bias", - "self_attn.k_proj.bias", - "self_attn.v_proj.bias", - ], - ), - "mlp.linear_fc1.layer_norm_weight": ( - TransformType.SPLIT_NONE, - ["post_attention_layernorm.weight"], - ), - "mlp.linear_fc1.weight": ( - TransformType.SPLIT_FC1, - ["mlp.gate_proj.weight", "mlp.up_proj.weight"], - ), - "mlp.linear_fc2.weight": ( - TransformType.SPLIT_NONE, - ["mlp.down_proj.weight"], - ), - } - - assert suffix in nmap, f"Cannot find mapping for {suffix}" - - transform_type, suffixes = nmap[suffix] - return ( - transform_type, - [result_pattern.format(layer_id, suffix) for suffix in suffixes], - ) - - @staticmethod - def convert_mega_qwen2_5_to_hf(model_state_dict: dict, config) -> dict: - new_statedict = {} - for name, param in model_state_dict.items(): - transform_type, hf_names = TransformFunc.mega_name_qwen2_5_to_hf(name) - if transform_type == TransformType.SPLIT_QKV: - TransformFunc._split_gqa_tensor(param, new_statedict, hf_names, config) - elif transform_type == TransformType.SPLIT_QKV_BIAS: - TransformFunc._split_gqa_tensor(param, 
new_statedict, hf_names, config) - elif transform_type == TransformType.SPLIT_FC1: - TransformFunc.split_fc1(param, new_statedict, hf_names, config) - elif transform_type == TransformType.SPLIT_NONE: - TransformFunc.split_none(param, new_statedict, hf_names) - else: - raise NotImplementedError( - f"Transform type {transform_type} not implemented" - ) - return new_statedict - - @staticmethod - def extract_layer_info(s): - pattern = r"layers\.(\d+)\.(.+)" - match = re.search(pattern, s) - if match: - return match.group(1), match.group(2) - return None, None - - ############################## # tp reshard fn implementation ############################## From 5db6fab7125716285b2dbf17fe6abccaf27acf79 Mon Sep 17 00:00:00 2001 From: cc <1716911340@qq.com> Date: Fri, 12 Sep 2025 15:40:01 +0800 Subject: [PATCH 22/57] feat(rollout_mm): add multimodal input/output for rollout backend (#2) * Merge pull request #1 from anHappyDog/feature/weight_convertor feat(weight): refactor and add qwen2.5-vl mg2hf convertor * feat(mm_input): add basic vision-language dataset processor and yaml config Signed-off-by: Bo Dai * feat(mm_input): add vLLM multimodal support Signed-off-by: Bo Dai --------- Signed-off-by: Bo Dai --- .../config/qwen2.5-vl-3b-grpo-megatron.yaml | 262 ++++++++++++++++++ examples/vlm/main_vlm.py | 99 +++++++ examples/vlm/run_main_vlm_grpo_megatron.sh | 21 ++ rlinf/config.py | 12 +- rlinf/data/datasets.py | 245 +++++++++++++--- rlinf/data/io_struct.py | 28 +- rlinf/runners/math_runner.py | 7 +- rlinf/workers/rollout/sglang/sglang_worker.py | 4 +- 8 files changed, 629 insertions(+), 49 deletions(-) create mode 100644 examples/vlm/config/qwen2.5-vl-3b-grpo-megatron.yaml create mode 100644 examples/vlm/main_vlm.py create mode 100644 examples/vlm/run_main_vlm_grpo_megatron.sh diff --git a/examples/vlm/config/qwen2.5-vl-3b-grpo-megatron.yaml b/examples/vlm/config/qwen2.5-vl-3b-grpo-megatron.yaml new file mode 100644 index 000000000..e04fe0118 --- /dev/null +++ b/examples/vlm/config/qwen2.5-vl-3b-grpo-megatron.yaml @@ -0,0 +1,262 @@ +defaults: + - override hydra/job_logging: stdout + +hydra: + run: + dir: . + output_subdir: null + +cluster: + num_nodes: 1 + num_gpus_per_node: 8 + component_placement: + actor,rollout: all + +runner: + task_type: math + logger: + log_path: ${runner.output_dir}/${runner.experiment_name} + project_name: rlinf + experiment_name: ${runner.experiment_name} + logger_backends: ["tensorboard"] # wandb, swanlab + + max_epochs: 5 + max_steps: -1 + + val_check_interval: 1 + save_interval: 50 + + seq_length: 2048 + + enable_dynamic_batch_size: False + max_tokens_per_mbs: 28672 + + resume_dir: null + experiment_name: qwen2.5-vl-3b-grpo + output_dir: /mnt/public/daibo/results + +algorithm: + group_size: 16 + + n_minibatches: 4 + training_batch_size_per_gpu: 1 # micro batch size + rollout_batch_size_per_gpu: null # If set to null, rollout_batch_size will be evenly divided across all inference instances. You can reduce this parameter if inference consumes too much GPU memory. 
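+  # Illustration (hypothetical placement): with data.rollout_batch_size: 512,
+  # 4 rollout DP instances, and this key left null, each instance receives
+  # 512 / 4 = 128 prompts per step, each expanded group_size (16) times.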
+ + # mbs to do log prob inference, can be set to + # lower than rollout_batch_size_per_gpu to reduce + # memory usage + logprob_forward_micro_batch_size: 1 # ${.rollout_batch_size_per_gpu} + + # val rollout mbs + val_rollout_batch_size_per_gpu: 4 # ${.rollout_batch_size_per_gpu} + + recompute_logprobs: True + shuffle_rollout: False + + # GRPO loss params + loss_type: ppo + loss_agg_func: "token-mean" + kl_beta: 0.0 # 0.001 + kl_penalty_type: low_var_kl + ratio_clip_eps: 0.2 + entropy_bonus: 0.0 + calculate_entropy: False + clip_ratio_c: null # 3.0 + + adv_type: grpo + normalize_advantages: True + early_stop_imp_ratio: 5.0 + use_valid_token_scale: False + + # params for rollout + sampling_params: + use_greedy: False + temperature: 1.0 + top_k: 1000000 + top_p: 1.0 + repetition_penalty: 1.0 + max_new_tokens: ${subtract:${runner.seq_length}, ${data.max_prompt_length}} + min_new_tokens: 1 + +rollout: + group_name: "RolloutGroup" + + gpu_memory_utilization: 0.55 + + model_dir: /mnt/public/hf_models/qwen2.5-VL-3B/ + model_arch: qwen2.5-vl + enforce_eager: False # if False, rollout engine will capture cuda graph, which will take more time to initialize. + distributed_executor_backend: mp # ray or mp + disable_log_stats: False + detokenize: False # Whether to detokenize the output. During RL we actually don't need to detokenize it. Can be set to True for debugging. + padding: null # will be tokenizer.pad_token_id if null. it is used to filter megatron's padding for rollout engine + eos: null # will be tokenizer.eos_token_id if null. + + attention_backend: triton + + return_logprobs: ${not:${algorithm.recompute_logprobs}} + + tensor_parallel_size: 1 + pipeline_parallel_size: 1 + + validate_weight: False # whether to send all weights at first for weight comparison. + validate_save_dir: null # the directory to save the weights for comparison. If validate_weight is True, this will be used to save the weights for comparison. + print_outputs: False # whether to print the outputs (token ids, texts, etc.) of rollout engine. + + sglang_decode_log_interval: 500000 # the interval for SGLang to log the decode time and other stats. + max_running_requests: 64 # the maximum number of running requests in the rollout engine. + cuda_graph_max_bs: 128 # the maximum batch size for cuda graph. If the batch size is larger than this, cuda graph will not be used. + + use_torch_compile: False # enable torch_compile in SGLang for rollout. + torch_compile_max_bs: 128 # the maximum batch size for torch compile. If the batch size is larger than this, torch compile will not be used. 
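+
+# The vision_language dataset below may read several image columns per row.
+# An illustrative (hypothetical) multi-image layout:
+#   image_keys: ["image_1", "image_2"]
+# Each row is expected to store images as raw bytes or a {"bytes": ...} dict,
+# matching the checks in rlinf/data/datasets.py (VisionLanguageDataset).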
+ +data: + type: vision_language + max_prompt_length: 1024 + filter_prompt_by_length: True + rollout_batch_size: 512 + val_rollout_batch_size: null + num_workers: 2 + prompt_key: prompt + image_keys: ["image"] # some vlm datasets may have multiple image columns + choice_key: "choices" + answer_key: "answer" + solution_key: "solution" + use_chat_template: True + shuffle: True + validation_shuffle: True + seed: 1234 + train_data_paths: ["/mnt/public/daibo/dataset/science_qa/data/train-00000-of-00001-1028f23e353fbe3e.parquet"] + val_data_paths: ["/mnt/public/daibo/dataset/science_qa/data/validation-00000-of-00001-6c7328ff6c84284c.parquet"] + +actor: + group_name: "ActorGroup" + training_backend: megatron + mcore_gpt: True + spec_name: decoder_gpt + + checkpoint_load_path: /mnt/public/mg_ckpts/qwen2.5-VL-3B-tp2-pp1/ + + offload_optimizer: True + offload_weight: True + offload_grad: True + + enable_dp_load_balance: False + + calculate_flops: False + + seed: 1234 + + model: + precision: fp16 + add_bias_linear: False + + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 + + activation: swiglu + sequence_parallel: True + # recompute_method: block + # recompute_granularity: selective + + recompute_method: block + recompute_granularity: full + recompute_num_layers: 20 + + seq_length: ${runner.seq_length} + encoder_seq_length: ${runner.seq_length} + + normalization: rmsnorm + + position_embedding_type: rope + + apply_rope_fusion: True + bias_dropout_fusion: False + persist_layer_norm: False + bias_activation_fusion: False + attention_softmax_in_fp32: True + batch_p2p_comm: False + variable_seq_lengths: True + gradient_accumulation_fusion: False + moe_token_dispatcher_type: alltoall + use_cpu_initialization: False + + optim: + optimizer: adam + bf16: False + fp16: True + lr: 2e-05 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_eps: 1.0e-05 + min_lr: 2.0e-6 + weight_decay: 0.05 + use_distributed_optimizer: True + overlap_grad_reduce: False + overlap_param_gather: False + optimizer_enable_pin: false + overlap_param_gather_with_optimizer_step: False + clip_grad: 0.8 + loss_scale: 65536 + + lr_sched: + lr_warmup_fraction: 0.01 + lr_warmup_init: 0.0 + lr_warmup_iters: 0 + max_lr: 2.0e-5 + min_lr: 0.0 + lr_decay_style: constant + lr_decay_iters: 10 + + tokenizer: + tokenizer_model: /mnt/public/hf_models/qwen2.5-VL-3B/ + use_fast: False + trust_remote_code: True + padding_side: 'right' + + megatron: + ddp_bucket_size: null + distributed_backend: nccl # Support 'nccl' and 'gloo' + distributed_timeout_minutes: 30 + ckpt_format: torch + use_dist_ckpt: False + tp_comm_bootstrap_backend: nccl + tp_comm_overlap_cfg: null # tp_comm_overlap_cfg.yaml + use_hf_ckpt: False # if true, will transfer hf model to generate megatron checkpoint and use it for training. 
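+    # When use_hf_ckpt is True, the ckpt_convertor block below drives the
+    # conversion: it reads the HF weights at rollout.model_dir and writes a
+    # Megatron checkpoint (tp2 / pp1 in this config) under ckpt_convertor.save_path.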
+ use_profiler: False # if true, will enable torch profiler when training, pay attention it has influence on performance + + ckpt_convertor: # config for ckpt convertor + model: DeepSeek-R1-Distill-Qwen-1.5B + model_type: null # will be set by hf model's config if null + hf_model_path: ${rollout.model_dir} # path to the hf model + save_path: ${runner.output_dir}/${runner.experiment_name}/converted_ckpts/actor + use_gpu_num : 0 + use_gpu_index: null + process_num: 16 # number of processes to use for checkpointing + tensor_model_parallel_size: ${actor.model.tensor_model_parallel_size} + pipeline_model_parallel_size: ${actor.model.pipeline_model_parallel_size} + + profiler: # profile megatron when inference and traning + output_dir: ${runner.output_dir}/${runner.experiment_name}/profiler + activities: ["cpu", "cuda"] + record_shapes: False + profile_memory: False + with_stack: False + with_flops: False + with_modules: True + export_tensorboard: True + export_chrome_trace: False + chrome_filename_prefix: "chrome_trace" + schedule_warmup: 2 + schedule_active: 1 + schedule_repeat: 1 # inference and training will repeat such times + # schedule_wait: it will be set at runtime + + +reward: + use_reward_model: false + reward_type: 'math' + reward_scale: 5.0 + +critic: + use_critic_model: false \ No newline at end of file diff --git a/examples/vlm/main_vlm.py b/examples/vlm/main_vlm.py new file mode 100644 index 000000000..32466fa12 --- /dev/null +++ b/examples/vlm/main_vlm.py @@ -0,0 +1,99 @@ +# Copyright 2025 The RLinf Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
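+
+# Launch sketch (illustrative; mirrors the helper script added below in this
+# patch). The config name is optional and defaults to the same value:
+#
+#   bash examples/vlm/run_main_vlm_grpo_megatron.sh qwen2.5-vl-3b-grpo-megatron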
+ +import json + +import hydra +import torch.multiprocessing as mp +from omegaconf.omegaconf import OmegaConf + +from rlinf.config import validate_cfg +from rlinf.data.datasets import create_rl_dataset +from rlinf.data.tokenizers import hf_tokenizer +from rlinf.runners.math_runner import MathRunner +from rlinf.scheduler import Cluster +from rlinf.utils.placement import ModelParallelComponentPlacement, PlacementMode +from rlinf.utils.utils import output_redirector +from rlinf.workers.actor.megatron_actor_worker import MegatronActor +from rlinf.workers.inference.megatron_inference_worker import MegatronInference +from rlinf.workers.rollout.sglang.sglang_worker import AsyncSGLangWorker, SGLangWorker + +"""Script to start GRPO training""" +mp.set_start_method("spawn", force=True) + + +@hydra.main(version_base="1.1") +@output_redirector +def main(cfg) -> None: + cfg = validate_cfg(cfg) + print(json.dumps(OmegaConf.to_container(cfg, resolve=True), indent=2)) + + cluster = Cluster( + num_nodes=cfg.cluster.num_nodes, num_gpus_per_node=cfg.cluster.num_gpus_per_node + ) + component_placement = ModelParallelComponentPlacement(cfg) + + # Rollout group + rollout_placement_strategy = component_placement.get_strategy("rollout") + SGLangWorkerCls = ( + SGLangWorker + if component_placement.placement_mode == PlacementMode.COLLOCATED + else AsyncSGLangWorker + ) + rollout_group = SGLangWorkerCls.create_group(cfg, component_placement).launch( + cluster, + name=cfg.rollout.group_name, + placement_strategy=rollout_placement_strategy, + ) + + # Inference group + inference_group = None + if ( + component_placement.placement_mode == PlacementMode.DISAGGREGATED + and cfg.algorithm.recompute_logprobs + ): + inference_placement_strategy = component_placement.get_strategy("inference") + inference_group = MegatronInference.create_group( + cfg, component_placement + ).launch( + cluster, + name=cfg.inference.group_name, + placement_strategy=inference_placement_strategy, + ) + + # GRPO Actor group + actor_placement_strategy = component_placement.get_strategy("actor") + actor_group = MegatronActor.create_group(cfg, component_placement).launch( + cluster, name=cfg.actor.group_name, placement_strategy=actor_placement_strategy + ) + + tokenizer = hf_tokenizer(cfg.actor.tokenizer.tokenizer_model) + train_ds, val_ds = create_rl_dataset(cfg.data, tokenizer) + + runner = MathRunner( + cfg=cfg, + placement=component_placement, + train_dataset=train_ds, + val_dataset=val_ds, + rollout=rollout_group, + inference=inference_group, + actor=actor_group, + ) + + runner.init_workers() + runner.run() + + +if __name__ == "__main__": + main() diff --git a/examples/vlm/run_main_vlm_grpo_megatron.sh b/examples/vlm/run_main_vlm_grpo_megatron.sh new file mode 100644 index 000000000..2e5a75e3a --- /dev/null +++ b/examples/vlm/run_main_vlm_grpo_megatron.sh @@ -0,0 +1,21 @@ +#! 
/bin/bash +set -x + +tabs 4 +export VLLM_ATTENTION_BACKEND=XFORMERS +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export TOKENIZERS_PARALLELISM=false +export RAY_DEDUP_LOGS=0 + +CONFIG_PATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +REPO_PATH=$(dirname $(dirname "$CONFIG_PATH")) +MEGATRON_PATH=/opt/Megatron-LM +export PYTHONPATH=${REPO_PATH}:${MEGATRON_PATH}:$PYTHONPATH + +if [ -z "$1" ]; then + CONFIG_NAME="qwen2.5-vl-3b-grpo-megatron" +else + CONFIG_NAME=$1 +fi + +python ${REPO_PATH}/examples/vlm/main_vlm.py --config-path ${CONFIG_PATH}/config/ --config-name $CONFIG_NAME \ No newline at end of file diff --git a/rlinf/config.py b/rlinf/config.py index 86198334a..f3b2984cd 100644 --- a/rlinf/config.py +++ b/rlinf/config.py @@ -199,8 +199,16 @@ def validate_model_cfg_by_hf_config(cfg, hf_model_path): qkv_bias = getattr(hf_config, "attention_bias", False) with open_dict(cfg): - if hf_config.rope_scaling is not None: - cfg.model.seq_len_interpolation_factor = hf_config.rope_scaling["factor"] + rs = getattr(hf_config, "rope_scaling", None) + if isinstance(rs, dict): + rtype = rs.get("type", "") + if rtype in {"linear", "dynamic", "ntk", "yarn"}: + f = rs.get("factor") + if f is not None: + cfg.model.seq_len_interpolation_factor = float(f) + else: + # mrope + cfg.model.seq_len_interpolation_factor = None cfg.model.override_vocab_size = hf_config.vocab_size cfg.model.max_position_embeddings = hf_config.max_position_embeddings cfg.model.rotary_base = hf_config.rope_theta diff --git a/rlinf/data/datasets.py b/rlinf/data/datasets.py index fcce53f47..e33667703 100644 --- a/rlinf/data/datasets.py +++ b/rlinf/data/datasets.py @@ -15,11 +15,12 @@ import json import logging import os -from collections import defaultdict -from typing import List +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Union -import numpy as np +import pandas as pd import torch +from omegaconf import DictConfig from torch.utils.data import Dataset @@ -60,6 +61,18 @@ def batch_pad_to_fixed_len( return batch_pad +@dataclass +class DatasetItem: + prompt: torch.Tensor + length: int + answer: str + idx: int + solution: Optional[str] = None + image_data: Optional[List[Union[bytes, str]]] = None + prompt_text: Optional[str] = None + meta: Optional[Dict[str, Any]] = None + + class MathDataset(Dataset): def __init__(self, data_paths, config, tokenizer): super().__init__() @@ -151,16 +164,160 @@ def __getitem__(self, idx): self.tokenizer.eos_token_id, left_pad=True, )[0] - - output = { - "prompt": prompt_tokens_tensor, - "length": prompt_length, - "answer": answer, - "idx": idx, - } + output = DatasetItem( + prompt=prompt_tokens_tensor, + length=prompt_length, + answer=answer, + idx=idx, + image_data=[], + ) return output +class VisionLanguageDataset(Dataset): + def __init__( + self, data_paths: Union[List[str], str], config: DictConfig, tokenizer + ): + super().__init__() + self.data_paths = data_paths + self.use_chat_template = config.use_chat_template + + self.image_keys = config.image_keys + self.prompt_key = config.prompt_key + self.choice_key = config.choice_key + self.answer_key = config.answer_key + self.solution_key = config.solution_key + + if isinstance(self.data_paths, str): + self.data_paths = [self.data_paths] + + self.max_prompt_length = config.max_prompt_length + self.tokenizer = tokenizer + self.data = self._load_data() + self.post_process() + + def post_process(self) -> None: + def get_image_list( + dataitem: Dict, image_keys: Optional[List[str]] + ) -> List[Union[bytes, str]]: + 
image_list: List[Union[bytes, str]] = []
+            if image_keys:
+                for key in image_keys:
+                    image_content = dataitem.get(key, None)
+                    if image_content is None:
+                        continue
+                    # Parquet-style image columns store {"bytes": ...} dicts; unwrap them.
+                    if isinstance(image_content, dict) and "bytes" in image_content:
+                        image_content = image_content["bytes"]
+                    # Only raw bytes are handled for now; str (URL/path) inputs are
+                    # declared in the type but not yet supported.
+                    assert isinstance(image_content, bytes), (
+                        f"image content should be bytes, but got {type(image_content)}, content is {image_content}"
+                    )
+                    image_list.append(image_content)
+            return image_list
+
+        def process_prompt(
+            data_item: Dict, image_count: int
+        ) -> Tuple[str, torch.Tensor, int]:
+            question = data_item.get(self.prompt_key, "")
+            options = data_item.get(self.choice_key, [])
+            if not isinstance(options, list):
+                options = [options]
+            prompt_text = question
+            if options:
+                prompt_text += f"{options}\n"
+            if self.use_chat_template:
+                message_content: List = []
+                for _ in range(image_count):
+                    message_content.append({"type": "image"})
+                message_content.append({"type": "text", "text": prompt_text})
+                messages = [{"role": "user", "content": message_content}]
+                prompt_text = self.tokenizer.apply_chat_template(
+                    messages, tokenize=False, add_generation_prompt=True
+                )
+                prompt_ids = self.tokenizer.apply_chat_template(
+                    messages,
+                    tokenize=True,
+                    add_generation_prompt=True,
+                    return_tensors="pt",
+                )
+                if isinstance(prompt_ids, torch.Tensor):
+                    if prompt_ids.dim() == 2 and prompt_ids.size(0) == 1:
+                        prompt_ids = prompt_ids.squeeze(0)  # [L]
+                    prompt_ids = prompt_ids.to(dtype=torch.long)
+                else:
+                    prompt_ids = torch.tensor(prompt_ids, dtype=torch.long)
+                prompt_length = len(prompt_ids)
+
+                return prompt_text, prompt_ids, prompt_length
+            else:
+                raise NotImplementedError("Non-chat template not implemented yet.")
+
+        processed_data: List[DatasetItem] = []
+        for idx, item in enumerate(self.data):
+            image_list: List[Union[bytes, str]] = get_image_list(item, self.image_keys)
+            prompt_text, prompt_ids, prompt_length = process_prompt(
+                item, len(image_list)
+            )
+
+            if prompt_length > self.max_prompt_length:
+                print(
+                    f"prompt_ids length {prompt_length} exceeds the max_prompt_length {self.max_prompt_length}",
+                )
+                prompt_ids = prompt_ids[: self.max_prompt_length]
+                prompt_length = self.max_prompt_length
+            prompt_ids = batch_pad_to_fixed_len(
+                [prompt_ids],
+                self.max_prompt_length,
+                self.tokenizer.eos_token_id,
+                left_pad=True,
+            )[0]
+            answer = item.get(self.answer_key, None)
+            solution = item.get(self.solution_key, None)
+
+            data_item = DatasetItem(
+                prompt_text=prompt_text,
+                prompt=prompt_ids,
+                length=prompt_length,
+                image_data=image_list,
+                answer=answer,
+                solution=solution,
+                idx=idx,
+            )
+            processed_data.append(data_item)
+        self.data = processed_data
+
+    def _load_data(self) -> List:
+        merged_data = []
+        for path in self.data_paths:
+            _, file_extension = os.path.splitext(path)
+            try:
+                if file_extension == ".parquet":
+                    loaded_data: List = pd.read_parquet(path).to_dict(orient="records")
+                    merged_data.extend(loaded_data)
+                elif file_extension == ".jsonl":
+                    with open(path, "r", encoding="utf-8") as file:
+                        loaded_data = [json.loads(line.strip()) for line in file]
+                    merged_data.extend(loaded_data)
+                elif file_extension == ".json":
+                    with open(path, "r", encoding="utf-8") as file:
+                        content = json.load(file)
+                    if isinstance(content, list):
+                        merged_data.extend(content)
+                    else:
+                        merged_data.append(content)
+                else:
+                    print(f"Unsupported file extension {file_extension}, skipping: {path}")
+            except Exception as e:
+                raise RuntimeError(f"Failed to load data from {path}: {e}") from e
+        return merged_data
+
+    def __len__(self) -> int:
+        return
len(self.data) + + def __getitem__(self, index): + return self.data[index] + + def create_rl_dataset(data_config, tokenizer): """Create rl datasets. @@ -176,6 +333,8 @@ def create_rl_dataset(data_config, tokenizer): if data_config.type == "math": dataset_cls = MathDataset + elif data_config.type == "vision_language": + dataset_cls = VisionLanguageDataset else: return None, None @@ -197,32 +356,46 @@ def create_rl_dataset(data_config, tokenizer): return train_dataset, val_dataset -def collate_fn(data_list: list[dict]) -> dict: - r""" - Collate a batch of sample dicts into batched tensors and arrays. - - Args: - data_list: List of dicts mapping feature names to torch.Tensor or other values. - - Returns: - Dict where tensor entries are stacked into a torch.Tensor of shape - (batch_size, \*dims) and non-tensor entries are converted to - np.ndarray of dtype object with shape (batch_size,). - """ - tensors = defaultdict(list) - non_tensors = defaultdict(list) - - for data in data_list: - for key, val in data.items(): - if isinstance(val, torch.Tensor): - tensors[key].append(val) - else: - non_tensors[key].append(val) +def collate_fn(data_list: List["DatasetItem"]) -> Dict[str, Any]: + prompts = [] + lens = [] + for it in data_list: + p = ( + it.prompt + if isinstance(it.prompt, torch.Tensor) + else torch.as_tensor(it.prompt, dtype=torch.long) + ) + if p.dim() == 2 and p.size(0) == 1: + p = p.squeeze(0) + assert p.dim() == 1, ( + f"DatasetItem.prompt must be 1-D tensor, current shape is: {p.shape}" + ) + prompts.append(p) + lens.append(p.numel()) - for key, val in tensors.items(): - tensors[key] = torch.stack(val, dim=0) + if len(set(lens)) == 1: + target_len = lens[0] + else: + target_len = min(lens) + prompts = [p[-target_len:] if p.numel() > target_len else p for p in prompts] - for key, val in non_tensors.items(): - non_tensors[key] = np.array(val, dtype=object) + batch_prompt = torch.stack(prompts, dim=0) # [B, L] + batch_length = torch.tensor( + [min(int(it.length), target_len) for it in data_list], dtype=torch.long + ) - return {**tensors, **non_tensors} + batch_idx = torch.tensor([int(it.idx) for it in data_list], dtype=torch.long) + + batch: Dict[str, Any] = { + "prompt": batch_prompt, # [B, L] + "length": batch_length, # [B] + "answer": [it.answer for it in data_list], # List[str] + "idx": batch_idx, # [B] + "solution": [it.solution for it in data_list], # List[Optional[str]] + "image_data": [ + it.image_data for it in data_list + ], # List[Optional[List[bytes|str]]] + "prompt_text": [it.prompt_text for it in data_list], # List[Optional[str]] + "meta": [it.meta for it in data_list], # List[Optional[dict]] + } + return batch diff --git a/rlinf/data/io_struct.py b/rlinf/data/io_struct.py index 1c455fa70..510acb292 100644 --- a/rlinf/data/io_struct.py +++ b/rlinf/data/io_struct.py @@ -13,7 +13,7 @@ # limitations under the License. 
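+
+# Sketch of the multimodal request flow added in this file (values are
+# illustrative; `jpeg_bytes` is a placeholder):
+#
+#   req = RolloutRequest(
+#       n=2,
+#       input_ids=[[1, 2], [3, 4]],
+#       answers=["a", "b"],
+#       image_data=[[jpeg_bytes], []],  # first prompt carries one image
+#   )
+#   batches = req.repeat_and_split(rollout_batch_size=2)  # -> 2 requests of 2
+#
+# repeat_and_split() duplicates each (input_ids, answer, image_data) triple
+# n times, keeping every per-prompt image list aligned with its prompt,
+# before splitting into per-batch RolloutRequests.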
from dataclasses import dataclass -from typing import Callable, Dict, List, Optional, Tuple +from typing import Callable, Dict, List, Optional, Tuple, Union import torch from omegaconf import DictConfig @@ -49,12 +49,14 @@ class RolloutRequest: n: Number of completions to generate for each input idx: List of unique identifiers for the requests, used for tracking input_lengths: List of lengths of the input sequences, corresponding to input_ids + image_data: list of image data (bytes or URLs) for multimodal inputs answers: Optional list of answers for the requests, if available """ n: int input_ids: List[List[int]] answers: List[str] + image_data: Union[List[List[bytes]], List[List[str]]] def repeat(self) -> "RolloutRequest": """Repeat each input in the RolloutRequest a specified number of times. @@ -113,14 +115,20 @@ def split(self, num_splits: int) -> List["RolloutRequest"]: def repeat_and_split( self, rollout_batch_size: Optional[int] = None ) -> List["RolloutRequest"]: - input_ids, answers = zip( + input_ids, answers, image_data = zip( *[ - (input_id, answer) - for input_id, answer in zip(self.input_ids, self.answers) + (input_id, answer, image_data) + for input_id, answer, image_data in zip( + self.input_ids, self.answers, self.image_data + ) for _ in range(self.n) ] ) - input_ids, answers = (list(input_ids), list(answers)) + input_ids, answers, image_data = ( + list(input_ids), + list(answers), + list(image_data), + ) # Split input ids based on rollout_batch_size_per_gpu if rollout_batch_size is None: @@ -134,14 +142,16 @@ def repeat_and_split( splitted_requests = [] input_ids_split_list = split_list(input_ids, num_batches) answers_split_list = split_list(answers, num_batches) + image_data_split_list = split_list(image_data, num_batches) - for input_ids_batch, answers_batch in zip( - input_ids_split_list, answers_split_list + for input_ids_batch, answers_batch, image_data_batch in zip( + input_ids_split_list, answers_split_list, image_data_split_list ): request = RolloutRequest( n=self.n, input_ids=input_ids_batch, answers=answers_batch, + image_data=image_data_batch, ) splitted_requests.append(request) @@ -257,7 +267,7 @@ class RolloutResult: prompt_texts: Optional[List[str]] = None response_texts: Optional[List[str]] = None answers: Optional[List[str]] = None - + image_data: Optional[Union[List[List[bytes]], List[List[str]]]] = None # Inference # Only set when recompute_logprobs is False rollout_logprobs: Optional[List[List[float]]] = None @@ -380,6 +390,7 @@ def from_sglang_results( group_size: int, input_ids: List[List[int]], answers: Optional[List[List[int]]] = None, + image_data: Optional[Union[List[List[bytes]], List[List[str]]]] = None, return_logprobs: bool = False, ) -> "RolloutResult": """Create a MathRolloutResult from the given results and input IDs. 
@@ -406,6 +417,7 @@ def from_sglang_results( response_lengths=[len(res["output_ids"]) for res in results], response_ids=[res["output_ids"] for res in results], answers=answers, + image_data=image_data, is_end=[ res["meta_info"]["finish_reason"]["type"] == "stop" for res in results ], diff --git a/rlinf/runners/math_runner.py b/rlinf/runners/math_runner.py index e3f5b3750..a88826f5a 100644 --- a/rlinf/runners/math_runner.py +++ b/rlinf/runners/math_runner.py @@ -274,18 +274,21 @@ def epoch(self): def _put_batch(self, batch: Dict[str, torch.Tensor]): prompt_ids = batch["prompt"].tolist() lengths = batch["length"].tolist() - answers = batch["answer"].tolist() + answers = batch["answer"] + image_data = batch["image_data"] prompts = [ids[-pmp_len:] for ids, pmp_len in zip(prompt_ids, lengths)] rollout_dp_size = self.component_placement.rollout_dp_size - for input_ids, answers in zip( + for input_ids, answers, image_data in zip( split_list(prompts, rollout_dp_size, enforce_divisible_batch=False), split_list(answers, rollout_dp_size, enforce_divisible_batch=False), + split_list(image_data, rollout_dp_size, enforce_divisible_batch=False), ): request = RolloutRequest( n=self.cfg.algorithm.group_size, input_ids=input_ids, answers=answers, + image_data=image_data, ) self.dataloader_channel.put(request, async_op=True) diff --git a/rlinf/workers/rollout/sglang/sglang_worker.py b/rlinf/workers/rollout/sglang/sglang_worker.py index 8d4a15cb7..6a921c0bd 100644 --- a/rlinf/workers/rollout/sglang/sglang_worker.py +++ b/rlinf/workers/rollout/sglang/sglang_worker.py @@ -169,7 +169,6 @@ def sync_model_from_actor(self): def rollout(self, input_channel: Channel, output_channel: Channel): request: RolloutRequest = input_channel.get() - # Repeat prompts based on the group_size config requests = request.repeat_and_split(self._rollout_batch_size) @@ -181,6 +180,8 @@ def rollout(self, input_channel: Channel, output_channel: Channel): with self.worker_timer(): results = self._engine.generate( input_ids=request.input_ids, + # 0.4.4 has modality bug,can't pass non-None image_data + image_data=request.image_data if any(request.image_data) else None, sampling_params=self._sampling_params, return_logprob=self._return_logprobs, ) @@ -191,6 +192,7 @@ def rollout(self, input_channel: Channel, output_channel: Channel): request.n, request.input_ids, request.answers, + request.image_data, self._return_logprobs, ) rollout_results.append(rollout_result) From 90a13089eed7b521baba5a24812c5ab77e7e5930 Mon Sep 17 00:00:00 2001 From: guozhen <37097045+guozhen1997@users.noreply.github.com> Date: Thu, 18 Sep 2025 18:13:33 +0800 Subject: [PATCH 23/57] feat(vlm): support VLM sglang rollout and fsdp training (#6) Signed-off-by: guozhen1997 <2997871698@qq.com> Signed-off-by: Bo Dai Co-authored-by: Bo Dai --- .../math/config/qwen2.5-1.5b-grpo-fsdp.yaml | 203 ++++++++ examples/math/main_math.py | 7 +- ...grpo_megatron.sh => run_main_math_grpo.sh} | 2 +- .../run_main_math_pipeline_grpo_megatron.sh | 21 - .../config/qwen2.5-vl-3b-grpo-megatron.yaml | 132 ++--- examples/vlm/main_vlm.py | 18 +- rlinf/algorithms/losses.py | 17 +- rlinf/config.py | 16 +- rlinf/data/datasets.py | 62 ++- rlinf/data/io_struct.py | 146 ++++++ .../hybrid_engines/fsdp/fsdp_model_manager.py | 46 +- rlinf/hybrid_engines/fsdp/utils.py | 8 +- .../sglang/sglang_0_4_4/sgl_scheduler.py | 5 +- .../sglang/sglang_0_4_6/sgl_scheduler.py | 6 +- .../sglang/sglang_0_4_9/sgl_scheduler.py | 5 +- rlinf/models/__init__.py | 3 +- rlinf/models/embodiment/model_utils.py | 25 +- 
rlinf/runners/math_runner.py | 2 +- rlinf/utils/convertor/utils.py | 2 +- rlinf/utils/distributed.py | 27 +- rlinf/utils/placement.py | 36 +- rlinf/utils/resharding/utils.py | 3 +- rlinf/utils/utils.py | 52 ++ rlinf/workers/actor/__init__.py | 17 + rlinf/workers/actor/fsdp_actor_worker.py | 455 +++++++++++++++++- rlinf/workers/rollout/sglang/sglang_worker.py | 5 +- rlinf/workers/rollout/utils.py | 6 + 27 files changed, 1068 insertions(+), 259 deletions(-) create mode 100644 examples/math/config/qwen2.5-1.5b-grpo-fsdp.yaml rename examples/math/{run_main_math_grpo_megatron.sh => run_main_math_grpo.sh} (92%) delete mode 100644 examples/math/run_main_math_pipeline_grpo_megatron.sh diff --git a/examples/math/config/qwen2.5-1.5b-grpo-fsdp.yaml b/examples/math/config/qwen2.5-1.5b-grpo-fsdp.yaml new file mode 100644 index 000000000..d8a1e8c3f --- /dev/null +++ b/examples/math/config/qwen2.5-1.5b-grpo-fsdp.yaml @@ -0,0 +1,203 @@ +defaults: + - override hydra/job_logging: stdout + +hydra: + run: + dir: . + output_subdir: null + +cluster: + num_nodes: 1 + num_gpus_per_node: 8 + component_placement: + actor,rollout: all + +runner: + task_type: math + logger: + log_path: ${runner.output_dir}/${runner.experiment_name} + project_name: rlinf + experiment_name: ${runner.experiment_name} + logger_backends: ["tensorboard"] # wandb, swanlab + + max_epochs: 5 + max_steps: -1 + + val_check_interval: 1 + save_interval: 50 + + seq_length: 2048 + + enable_dynamic_batch_size: False + max_tokens_per_mbs: 28672 + + resume_dir: null + experiment_name: grpo-1.5b + output_dir: ../results + +algorithm: + group_size: 8 + + n_minibatches: 4 + training_batch_size_per_gpu: 1 # micro batch size + rollout_batch_size_per_gpu: null # If set to null, rollout_batch_size will be evenly divided across all inference instances. You can reduce this parameter if inference consumes too much GPU memory. + + # mbs to do log prob inference, can be set to + # lower than rollout_batch_size_per_gpu to reduce + # memory usage + logprob_forward_micro_batch_size: 1 # ${.rollout_batch_size_per_gpu} + + # val rollout mbs + val_rollout_batch_size_per_gpu: 4 # ${.rollout_batch_size_per_gpu} + + recompute_logprobs: False + shuffle_rollout: False + + # GRPO loss params + loss_type: math_ppo_actor + loss_agg_func: "token-mean" + kl_beta: 0.0 # 0.001 + kl_penalty_type: low_var_kl + ratio_clip_eps: 0.2 + entropy_bonus: 0.0 + calculate_entropy: False + clip_ratio_c: null # 3.0 + clip_ratio_low: null + clip_ratio_high: null + + adv_type: math_grpo + normalize_advantages: True + early_stop_imp_ratio: 5.0 + use_valid_token_scale: False + + # params for rollout + sampling_params: + use_greedy: False + temperature: 1.0 + top_k: 1000000 + top_p: 1.0 + repetition_penalty: 1.0 + max_new_tokens: ${subtract:${runner.seq_length}, ${data.max_prompt_length}} + min_new_tokens: 1 + +rollout: + group_name: "RolloutGroup" + + gpu_memory_utilization: 0.55 + + model_dir: /mnt/public/hf_models/DeepSeek-R1-Distill-Qwen-1.5B + model_arch: qwen2.5 + enforce_eager: False # if False, rollout engine will capture cuda graph, which will take more time to initialize. + distributed_executor_backend: mp # ray or mp + disable_log_stats: False + detokenize: False # Whether to detokenize the output. During RL we actually don't need to detokenize it. Can be set to True for debugging. + padding: null # will be tokenizer.pad_token_id if null. it is used to filter megatron's padding for rollout engine + eos: null # will be tokenizer.eos_token_id if null. 
+
+  rollout_backend: sglang # choose which rollout backend to use; supports [sglang, vllm]
+
+  sglang:
+    attention_backend: triton # [flashinfer, triton] for more, see sglang's doc
+    decode_log_interval: 500000 # the interval for SGLang to log the decode time and other stats.
+    use_torch_compile: False # enable torch_compile in SGLang for rollout.
+    torch_compile_max_bs: 128 # the maximum batch size for torch compile. If the batch size is larger than this, torch compile will not be used.
+
+  vllm:
+    attention_backend: FLASH_ATTN # [FLASH_ATTN, XFORMERS] for more, see vllm's doc
+    enable_chunked_prefill: True # enable vllm to use chunked_prefill.
+    enable_prefix_caching: True # enable vllm to use prefix_caching.
+    enable_flash_infer_sampler: True # if True, vllm will use flashinfer to do sampling.
+
+  return_logprobs: ${not:${algorithm.recompute_logprobs}}
+
+  tensor_parallel_size: 2
+  pipeline_parallel_size: 1
+
+  validate_weight: False # whether to send all weights at first for weight comparison.
+  validate_save_dir: null # the directory to save the weights for comparison. If validate_weight is True, this will be used to save the weights for comparison.
+  print_outputs: False # whether to print the outputs (token ids, texts, etc.) of rollout engine.
+
+  max_running_requests: 64 # the maximum number of running requests in the rollout engine.
+  cuda_graph_max_bs: 128 # the maximum batch size for cuda graph. If the batch size is larger than this, cuda graph will not be used.
+
+data:
+  type: math
+  max_prompt_length: 1024
+  filter_prompt_by_length: True
+  rollout_batch_size: 8
+  val_rollout_batch_size: null
+  num_workers: 2
+  prompt_key: prompt
+  shuffle: True
+  validation_shuffle: True
+  seed: 1234
+  train_data_paths: ["/mnt/public/guozhen/data/boba_106k_0319_prompt_1024.jsonl"]
+  val_data_paths: ["/mnt/public/guozhen/data/boba_106k_0319_prompt_1024.jsonl"]
+
+actor:
+  group_name: "ActorGroup"
+  training_backend: fsdp
+  mcore_gpt: True
+  spec_name: decoder_gpt
+
+  enable_offload: True
+  checkpoint_load_path: null
+
+  global_batch_size: 8
+  micro_batch_size: 1
+
+  enable_dp_load_balance: False
+
+  calculate_flops: False
+
+  seed: 1234
+
+  model:
+    precision: fp16
+    sharding_strategy: full_shard
+    is_lora: False
+
+    seq_length: ${runner.seq_length}
+    encoder_seq_length: ${runner.seq_length}
+    model_path: /mnt/public/hf_models/DeepSeek-R1-Distill-Qwen-1.5B
+
+  optim:
+    optimizer: adam
+    bf16: False
+    fp16: True
+    lr: 2e-05
+    adam_beta1: 0.9
+    adam_beta2: 0.95
+    adam_eps: 1.0e-05
+    min_lr: 2.0e-6
+    weight_decay: 0.05
+    use_distributed_optimizer: True
+    overlap_grad_reduce: False
+    overlap_param_gather: False
+    optimizer_enable_pin: false
+    overlap_param_gather_with_optimizer_step: False
+    clip_grad: 0.8
+    loss_scale: 65536
+
+  lr_sched:
+    lr_warmup_fraction: 0.01
+    lr_warmup_init: 0.0
+    lr_warmup_iters: 0
+    max_lr: 2.0e-5
+    min_lr: 0.0
+    lr_decay_style: constant
+    lr_decay_iters: 10
+
+  tokenizer:
+    tokenizer_model: /mnt/public/hf_models/DeepSeek-R1-Distill-Qwen-1.5B
+    use_fast: False
+    trust_remote_code: True
+    padding_side: 'right'
+
+reward:
+  use_reward_model: false
+  reward_type: 'math'
+  reward_scale: 5.0
+
+critic:
+  use_critic_model: false
\ No newline at end of file
diff --git a/examples/math/main_math.py b/examples/math/main_math.py
index 7150566fb..80f408bd0 100644
--- a/examples/math/main_math.py
+++ b/examples/math/main_math.py
@@ -25,7 +25,7 @@
 from rlinf.scheduler import Cluster
 from rlinf.utils.placement import ModelParallelComponentPlacement, PlacementMode
 from rlinf.utils.utils
import output_redirector -from rlinf.workers.actor.megatron_actor_worker import MegatronActor +from rlinf.workers.actor import get_actor_worker from rlinf.workers.inference.megatron_inference_worker import MegatronInference from rlinf.workers.rollout.utils import get_rollout_backend_worker @@ -68,13 +68,14 @@ def main(cfg) -> None: ) # GRPO Actor group + actor_worker_cls = get_actor_worker(cfg) actor_placement_strategy = component_placement.get_strategy("actor") - actor_group = MegatronActor.create_group(cfg, component_placement).launch( + actor_group = actor_worker_cls.create_group(cfg, component_placement).launch( cluster, name=cfg.actor.group_name, placement_strategy=actor_placement_strategy ) tokenizer = hf_tokenizer(cfg.actor.tokenizer.tokenizer_model) - train_ds, val_ds = create_rl_dataset(cfg.data, tokenizer) + train_ds, val_ds = create_rl_dataset(cfg, tokenizer) runner = MathRunner( cfg=cfg, diff --git a/examples/math/run_main_math_grpo_megatron.sh b/examples/math/run_main_math_grpo.sh similarity index 92% rename from examples/math/run_main_math_grpo_megatron.sh rename to examples/math/run_main_math_grpo.sh index f826f882f..dc2f75ee0 100644 --- a/examples/math/run_main_math_grpo_megatron.sh +++ b/examples/math/run_main_math_grpo.sh @@ -13,7 +13,7 @@ MEGATRON_PATH=/opt/Megatron-LM export PYTHONPATH=${REPO_PATH}:${MEGATRON_PATH}:$PYTHONPATH if [ -z "$1" ]; then - CONFIG_NAME="qwen2.5-1.5b-grpo-megatron" + CONFIG_NAME="qwen2.5-1.5b-grpo-fsdp" else CONFIG_NAME=$1 fi diff --git a/examples/math/run_main_math_pipeline_grpo_megatron.sh b/examples/math/run_main_math_pipeline_grpo_megatron.sh deleted file mode 100644 index 7deb96519..000000000 --- a/examples/math/run_main_math_pipeline_grpo_megatron.sh +++ /dev/null @@ -1,21 +0,0 @@ -#! /bin/bash -set -x - -tabs 4 -export VLLM_ATTENTION_BACKEND=XFORMERS -export CUDA_DEVICE_MAX_CONNECTIONS=1 -export TOKENIZERS_PARALLELISM=false -export RAY_DEDUP_LOGS=0 - -CONFIG_PATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" -REPO_PATH=$(dirname $(dirname "$CONFIG_PATH")) -MEGATRON_PATH=/opt/Megatron-LM -export PYTHONPATH=${REPO_PATH}:${MEGATRON_PATH}:$PYTHONPATH - -if [ -z "$1" ]; then - CONFIG_NAME="qwen2.5-1.5b-grpo-megatron-pipeline" -else - CONFIG_NAME=$1 -fi - -python ${REPO_PATH}/examples/math/main_math.py --config-path ${CONFIG_PATH}/config/ --config-name $CONFIG_NAME \ No newline at end of file diff --git a/examples/vlm/config/qwen2.5-vl-3b-grpo-megatron.yaml b/examples/vlm/config/qwen2.5-vl-3b-grpo-megatron.yaml index e04fe0118..cfe4febe7 100644 --- a/examples/vlm/config/qwen2.5-vl-3b-grpo-megatron.yaml +++ b/examples/vlm/config/qwen2.5-vl-3b-grpo-megatron.yaml @@ -32,11 +32,11 @@ runner: max_tokens_per_mbs: 28672 resume_dir: null - experiment_name: qwen2.5-vl-3b-grpo - output_dir: /mnt/public/daibo/results + experiment_name: grpo-1.5b + output_dir: ../results algorithm: - group_size: 16 + group_size: 8 n_minibatches: 4 training_batch_size_per_gpu: 1 # micro batch size @@ -50,11 +50,11 @@ algorithm: # val rollout mbs val_rollout_batch_size_per_gpu: 4 # ${.rollout_batch_size_per_gpu} - recompute_logprobs: True + recompute_logprobs: False shuffle_rollout: False # GRPO loss params - loss_type: ppo + loss_type: math_ppo_actor loss_agg_func: "token-mean" kl_beta: 0.0 # 0.001 kl_penalty_type: low_var_kl @@ -62,8 +62,10 @@ algorithm: entropy_bonus: 0.0 calculate_entropy: False clip_ratio_c: null # 3.0 + clip_ratio_low: null + clip_ratio_high: null - adv_type: grpo + adv_type: math_grpo normalize_advantages: True early_stop_imp_ratio: 5.0 
   use_valid_token_scale: False
@@ -83,38 +85,46 @@ rollout:
 
   gpu_memory_utilization: 0.55
 
-  model_dir: /mnt/public/hf_models/qwen2.5-VL-3B/
-  model_arch: qwen2.5-vl
+  model_dir: /mnt/public/hf_models/Qwen2.5-VL-3B-Instruct
+  model_arch: qwen2.5_vl #qwen2.5
   enforce_eager: False # if False, rollout engine will capture cuda graph, which will take more time to initialize.
   distributed_executor_backend: mp # ray or mp
   disable_log_stats: False
   detokenize: False # Whether to detokenize the output. During RL we actually don't need to detokenize it. Can be set to True for debugging.
   padding: null # will be tokenizer.pad_token_id if null. it is used to filter megatron's padding for rollout engine
   eos: null # will be tokenizer.eos_token_id if null.
+
+  rollout_backend: sglang # choose which rollout backend to use; supports [sglang, vllm]
 
-  attention_backend: triton
+  sglang:
+    attention_backend: triton # [flashinfer, triton] for more, see sglang's doc
+    decode_log_interval: 500000 # the interval for SGLang to log the decode time and other stats.
+    use_torch_compile: False # enable torch_compile in SGLang for rollout.
+    torch_compile_max_bs: 128 # the maximum batch size for torch compile. If the batch size is larger than this, torch compile will not be used.
+
+  vllm:
+    attention_backend: FLASH_ATTN # [FLASH_ATTN, XFORMERS] for more, see vllm's doc
+    enable_chunked_prefill: True # enable vllm to use chunked_prefill.
+    enable_prefix_caching: True # enable vllm to use prefix_caching.
+    enable_flash_infer_sampler: True # if True, vllm will use flashinfer to do sampling.
 
   return_logprobs: ${not:${algorithm.recompute_logprobs}}
 
-  tensor_parallel_size: 1
+  tensor_parallel_size: 2
   pipeline_parallel_size: 1
 
   validate_weight: False # whether to send all weights at first for weight comparison.
   validate_save_dir: null # the directory to save the weights for comparison. If validate_weight is True, this will be used to save the weights for comparison.
   print_outputs: False # whether to print the outputs (token ids, texts, etc.) of rollout engine.
 
-  sglang_decode_log_interval: 500000 # the interval for SGLang to log the decode time and other stats.
   max_running_requests: 64 # the maximum number of running requests in the rollout engine.
   cuda_graph_max_bs: 128 # the maximum batch size for cuda graph. If the batch size is larger than this, cuda graph will not be used.
-  use_torch_compile: False # enable torch_compile in SGLang for rollout.
-  torch_compile_max_bs: 128 # the maximum batch size for torch compile. If the batch size is larger than this, torch compile will not be used.
- data: type: vision_language max_prompt_length: 1024 filter_prompt_by_length: True - rollout_batch_size: 512 + rollout_batch_size: 8 val_rollout_batch_size: null num_workers: 2 prompt_key: prompt @@ -126,20 +136,20 @@ data: shuffle: True validation_shuffle: True seed: 1234 - train_data_paths: ["/mnt/public/daibo/dataset/science_qa/data/train-00000-of-00001-1028f23e353fbe3e.parquet"] - val_data_paths: ["/mnt/public/daibo/dataset/science_qa/data/validation-00000-of-00001-6c7328ff6c84284c.parquet"] + train_data_paths: ["/mnt/public/guozhen/data/science_qa/train-00000-of-00001-1028f23e353fbe3e.parquet"] + val_data_paths: ["/mnt/public/guozhen/data/science_qa/test-00000-of-00001-f0e719df791966ff.parquet"] actor: group_name: "ActorGroup" - training_backend: megatron + training_backend: fsdp mcore_gpt: True spec_name: decoder_gpt - checkpoint_load_path: /mnt/public/mg_ckpts/qwen2.5-VL-3B-tp2-pp1/ + enable_offload: True + checkpoint_load_path: null - offload_optimizer: True - offload_weight: True - offload_grad: True + global_batch_size: 8 + micro_batch_size: 1 enable_dp_load_balance: False @@ -148,43 +158,20 @@ actor: seed: 1234 model: - precision: fp16 - add_bias_linear: False - - tensor_model_parallel_size: 2 - pipeline_model_parallel_size: 1 - - activation: swiglu - sequence_parallel: True - # recompute_method: block - # recompute_granularity: selective - - recompute_method: block - recompute_granularity: full - recompute_num_layers: 20 + precision: bf16 + sharding_strategy: full_shard + is_lora: False seq_length: ${runner.seq_length} encoder_seq_length: ${runner.seq_length} + model_path: /mnt/public/hf_models/Qwen2.5-VL-3B-Instruct/ - normalization: rmsnorm - - position_embedding_type: rope - - apply_rope_fusion: True - bias_dropout_fusion: False - persist_layer_norm: False - bias_activation_fusion: False - attention_softmax_in_fp32: True - batch_p2p_comm: False - variable_seq_lengths: True - gradient_accumulation_fusion: False - moe_token_dispatcher_type: alltoall - use_cpu_initialization: False + model_arch: ${rollout.model_arch} optim: optimizer: adam - bf16: False - fp16: True + bf16: True #False + fp16: False #True lr: 2e-05 adam_beta1: 0.9 adam_beta2: 0.95 @@ -209,50 +196,11 @@ actor: lr_decay_iters: 10 tokenizer: - tokenizer_model: /mnt/public/hf_models/qwen2.5-VL-3B/ + tokenizer_model: /mnt/public/hf_models/Qwen2.5-VL-3B-Instruct use_fast: False trust_remote_code: True padding_side: 'right' - megatron: - ddp_bucket_size: null - distributed_backend: nccl # Support 'nccl' and 'gloo' - distributed_timeout_minutes: 30 - ckpt_format: torch - use_dist_ckpt: False - tp_comm_bootstrap_backend: nccl - tp_comm_overlap_cfg: null # tp_comm_overlap_cfg.yaml - use_hf_ckpt: False # if true, will transfer hf model to generate megatron checkpoint and use it for training. 
- use_profiler: False # if true, will enable torch profiler when training, pay attention it has influence on performance - - ckpt_convertor: # config for ckpt convertor - model: DeepSeek-R1-Distill-Qwen-1.5B - model_type: null # will be set by hf model's config if null - hf_model_path: ${rollout.model_dir} # path to the hf model - save_path: ${runner.output_dir}/${runner.experiment_name}/converted_ckpts/actor - use_gpu_num : 0 - use_gpu_index: null - process_num: 16 # number of processes to use for checkpointing - tensor_model_parallel_size: ${actor.model.tensor_model_parallel_size} - pipeline_model_parallel_size: ${actor.model.pipeline_model_parallel_size} - - profiler: # profile megatron when inference and traning - output_dir: ${runner.output_dir}/${runner.experiment_name}/profiler - activities: ["cpu", "cuda"] - record_shapes: False - profile_memory: False - with_stack: False - with_flops: False - with_modules: True - export_tensorboard: True - export_chrome_trace: False - chrome_filename_prefix: "chrome_trace" - schedule_warmup: 2 - schedule_active: 1 - schedule_repeat: 1 # inference and training will repeat such times - # schedule_wait: it will be set at runtime - - reward: use_reward_model: false reward_type: 'math' diff --git a/examples/vlm/main_vlm.py b/examples/vlm/main_vlm.py index 32466fa12..605577fba 100644 --- a/examples/vlm/main_vlm.py +++ b/examples/vlm/main_vlm.py @@ -25,9 +25,9 @@ from rlinf.scheduler import Cluster from rlinf.utils.placement import ModelParallelComponentPlacement, PlacementMode from rlinf.utils.utils import output_redirector -from rlinf.workers.actor.megatron_actor_worker import MegatronActor +from rlinf.workers.actor import get_actor_worker from rlinf.workers.inference.megatron_inference_worker import MegatronInference -from rlinf.workers.rollout.sglang.sglang_worker import AsyncSGLangWorker, SGLangWorker +from rlinf.workers.rollout.utils import get_rollout_backend_worker """Script to start GRPO training""" mp.set_start_method("spawn", force=True) @@ -44,14 +44,11 @@ def main(cfg) -> None: ) component_placement = ModelParallelComponentPlacement(cfg) + rollout_worker_cls = get_rollout_backend_worker(cfg, component_placement) + # Rollout group rollout_placement_strategy = component_placement.get_strategy("rollout") - SGLangWorkerCls = ( - SGLangWorker - if component_placement.placement_mode == PlacementMode.COLLOCATED - else AsyncSGLangWorker - ) - rollout_group = SGLangWorkerCls.create_group(cfg, component_placement).launch( + rollout_group = rollout_worker_cls.create_group(cfg, component_placement).launch( cluster, name=cfg.rollout.group_name, placement_strategy=rollout_placement_strategy, @@ -73,13 +70,14 @@ def main(cfg) -> None: ) # GRPO Actor group + actor_worker_cls = get_actor_worker(cfg) actor_placement_strategy = component_placement.get_strategy("actor") - actor_group = MegatronActor.create_group(cfg, component_placement).launch( + actor_group = actor_worker_cls.create_group(cfg, component_placement).launch( cluster, name=cfg.actor.group_name, placement_strategy=actor_placement_strategy ) tokenizer = hf_tokenizer(cfg.actor.tokenizer.tokenizer_model) - train_ds, val_ds = create_rl_dataset(cfg.data, tokenizer) + train_ds, val_ds = create_rl_dataset(cfg, tokenizer) runner = MathRunner( cfg=cfg, diff --git a/rlinf/algorithms/losses.py b/rlinf/algorithms/losses.py index 3980f9136..798e5330f 100644 --- a/rlinf/algorithms/losses.py +++ b/rlinf/algorithms/losses.py @@ -196,7 +196,8 @@ def compute_math_ppo_actor_loss(**kwargs): loss_agg_func = 
kwargs["loss_agg_func"]
     logprobs = kwargs["logprobs"]
     old_logprobs = kwargs["old_logprobs"]
-    eps_clip = kwargs["eps_clip"]
+    clip_ratio_low = kwargs["clip_ratio_low"]
+    clip_ratio_high = kwargs["clip_ratio_high"]
     advantages = kwargs["advantages"]
     loss_mask = kwargs.get("loss_mask", None)
     c_clip = kwargs.get("c_clip", None)
@@ -212,7 +213,7 @@ def compute_math_ppo_actor_loss(**kwargs):
     ratio = torch.where(loss_mask, torch.exp(logprobs - old_logprobs), 0)
     approx_kl = torch.where(loss_mask, (logprobs - old_logprobs).detach(), 0.0)
 
-    clipped_ratio = torch.clamp(ratio, 1.0 - eps_clip, 1.0 + eps_clip)
+    clipped_ratio = torch.clamp(ratio, 1.0 - clip_ratio_low, 1.0 + clip_ratio_high)
 
     policy_loss1 = -advantages * ratio
     policy_loss2 = -advantages * clipped_ratio
@@ -239,12 +240,12 @@ def compute_math_ppo_actor_loss(**kwargs):
 
     # Compile metrics for logging
     metrics_data = {
-        "policy_loss": masked_mean(policy_loss.detach(), loss_mask),
-        "ratio": masked_mean(ratio.detach(), loss_mask),
-        "clipped_ratio": masked_mean(clipped_ratio.detach(), loss_mask),
-        "dual_cliped_ratio": masked_mean(dual_cliped_ratio.detach(), loss_mask),
-        "approx_kl": approx_kl.detach(),
-        "clip_fraction": clip_fraction.detach(),
+        "policy_loss": masked_mean(policy_loss.detach(), loss_mask).cpu(),
+        "ratio": masked_mean(ratio.detach(), loss_mask).cpu(),
+        "clipped_ratio": masked_mean(clipped_ratio.detach(), loss_mask).cpu(),
+        "dual_cliped_ratio": masked_mean(dual_cliped_ratio.detach(), loss_mask).cpu(),
+        "approx_kl": approx_kl.detach().cpu(),
+        "clip_fraction": clip_fraction.detach().cpu(),
     }
 
     return policy_loss, metrics_data
diff --git a/rlinf/config.py b/rlinf/config.py
index f3b2984cd..0f3a21903 100644
--- a/rlinf/config.py
+++ b/rlinf/config.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import dataclasses
+import importlib.util
 import logging
 import os
 from dataclasses import asdict
@@ -33,15 +34,7 @@
 
 logging.getLogger().setLevel(logging.INFO)
 
-try:
-    import transformer_engine
-
-    HAVE_TE = True
-except ImportError:
-    transformer_engine = None
-    HAVE_TE = False
-
-SUPPORTED_MODEL_ARCHS = ["qwen2.5", "openvla", "openvla_oft"]
+SUPPORTED_MODEL_ARCHS = ["qwen2.5", "qwen2.5_vl", "openvla", "openvla_oft"]
 SUPPORTED_ROLLOUT_BACKENDS = ["sglang", "vllm"]
 
 __all__ = ["build_config"]
@@ -764,7 +757,10 @@ def build_transformer_config(cfg) -> "TransformerConfig":
     tp_only_amax_red = cfg.get("tp_only_amax_red", False)
 
     if cfg.get("enable_cuda_graph", False):
-        assert HAVE_TE, "Transformer Engine is required for cudagraphs."
+        if importlib.util.find_spec("transformer_engine") is None:
+            raise ImportError(
+                "Cannot import transformer_engine, which is required for cudagraphs."
+            )
         assert cfg.get("use_te_rng_tracker", False), (
             "Transformer engine's RNG tracker is required for cudagraphs, this can be enabled with \
 'use_te_rng_tracker=True'."
diff --git a/rlinf/data/datasets.py b/rlinf/data/datasets.py
index e33667703..162d7b94d 100644
--- a/rlinf/data/datasets.py
+++ b/rlinf/data/datasets.py
@@ -21,7 +21,9 @@
 import pandas as pd
 import torch
 from omegaconf import DictConfig
+from PIL.Image import Image
 from torch.utils.data import Dataset
+from transformers import AutoProcessor
 
 
 def batch_pad_to_fixed_len(
@@ -80,12 +82,12 @@ def __init__(self, data_paths, config, tokenizer):
         if isinstance(self.data_paths, str):
             self.data_paths = [self.data_paths]
 
-        self.max_prompt_length = config.max_prompt_length
+        self.max_prompt_length = config.data.max_prompt_length
         self.tokenizer = tokenizer
-        self.prompt_key = config.prompt_key
+        self.prompt_key = config.data.prompt_key
         self.data = self._load_data()
 
-        if config.get("filter_prompt_by_length", False):
+        if config.data.get("filter_prompt_by_length", False):
             total = len(self.data)
             filtered = []
             failed = 0
@@ -180,19 +182,20 @@ def __init__(
     ):
         super().__init__()
         self.data_paths = data_paths
-        self.use_chat_template = config.use_chat_template
+        self.use_chat_template = config.data.use_chat_template
 
-        self.image_keys = config.image_keys
-        self.prompt_key = config.prompt_key
-        self.choice_key = config.choice_key
-        self.answer_key = config.answer_key
-        self.solution_key = config.solution_key
+        self.image_keys = config.data.image_keys
+        self.prompt_key = config.data.prompt_key
+        self.choice_key = config.data.choice_key
+        self.answer_key = config.data.answer_key
+        self.solution_key = config.data.solution_key
 
         if isinstance(self.data_paths, str):
             self.data_paths = [self.data_paths]
 
-        self.max_prompt_length = config.max_prompt_length
+        self.max_prompt_length = config.data.max_prompt_length
         self.tokenizer = tokenizer
+        self.processor = AutoProcessor.from_pretrained(config.actor.model.model_path)
 
         self.data = self._load_data()
         self.post_process()
@@ -206,17 +209,26 @@ def get_image_list(
                 image_content = dataitem.get(key, None)
                 if image_content is None:
                     continue
+                if isinstance(image_content, Image):
+                    image_list.append(image_content)
+                    continue
                 if isinstance(image_content, dict) and "bytes" in image_content:
                     image_content = image_content["bytes"]
                 assert isinstance(image_content, bytes), (
                     f"image content should be bytes, but got {type(image_content)} , content is {image_content}"
                 )
                 image_list.append(image_content)
+            if image_list == []:
+                return [None]
             return image_list
 
         def process_prompt(
             data_item: Dict, image_count: int
-        ) -> Tuple[str, List[int], int]:
+        ) -> Tuple[
+            str,
+            List[int],
+            int,
+        ]:
             question = data_item.get(self.prompt_key, "")
             options = data_item.get(self.choice_key, [])
             if not isinstance(options, list):
@@ -230,15 +241,14 @@ def process_prompt(
                 message_content.append({"type": "image"})
             message_content.append({"type": "text", "text": prompt_text})
             messages = [{"role": "user", "content": message_content}]
-            prompt_text = self.tokenizer.apply_chat_template(
+            prompt_text = self.processor.apply_chat_template(
                 messages, tokenize=False, add_generation_prompt=True
             )
-            prompt_ids = self.tokenizer.apply_chat_template(
-                messages,
-                tokenize=True,
-                add_generation_prompt=True,
+            prompt_ids = self.processor(
+                text=[prompt_text],
+                padding=True,
                 return_tensors="pt",
-            )
+            )["input_ids"]
             if isinstance(prompt_ids, torch.Tensor):
                 if prompt_ids.dim() == 2 and prompt_ids.size(0) == 1:
                     prompt_ids = prompt_ids.squeeze(0)  # [L]
@@ -278,7 +288,7 @@ def process_prompt(
                 prompt=prompt_ids,
                 length=prompt_length,
                 image_data=image_list,
-                answer=answer,
+                answer=str(answer),
                 solution=solution,
                 idx=idx,
             )
@@ -318,11 
+328,11 @@ def __getitem__(self, index): return self.data[index] -def create_rl_dataset(data_config, tokenizer): +def create_rl_dataset(config: DictConfig, tokenizer): """Create rl datasets. Arguments: - data_config: The data config. + config: The RLinf config. tokenizer (Tokenizer): The tokenizer. Returns: @@ -331,9 +341,9 @@ def create_rl_dataset(data_config, tokenizer): val_dataset (Dataset): The validation dataset. """ - if data_config.type == "math": + if config.data.type == "math": dataset_cls = MathDataset - elif data_config.type == "vision_language": + elif config.data.type == "vision_language": dataset_cls = VisionLanguageDataset else: return None, None @@ -342,14 +352,14 @@ def create_rl_dataset(data_config, tokenizer): # Instantiate the dataset using the determined dataset class train_dataset = dataset_cls( - data_paths=data_config.train_data_paths, - config=data_config, + data_paths=config.data.train_data_paths, + config=config, tokenizer=tokenizer, ) val_dataset = dataset_cls( - data_paths=data_config.val_data_paths, - config=data_config, + data_paths=config.data.val_data_paths, + config=config, tokenizer=tokenizer, ) diff --git a/rlinf/data/io_struct.py b/rlinf/data/io_struct.py index 510acb292..e40fed973 100644 --- a/rlinf/data/io_struct.py +++ b/rlinf/data/io_struct.py @@ -519,6 +519,152 @@ def merge_list(dst_list: List, src_list: List): return merged_result + @staticmethod + def split_result_list_by_group( + rollout_results: List["RolloutResult"], + ) -> List["RolloutResult"]: + """ + Split RolloutResult objects by group_size. + + If input has only one RolloutResult, split it into multiple RolloutResult objects by group_size. + If input has multiple RolloutResult objects, split each one and merge the results. + + Args: + rollout_results: List of input RolloutResult objects + + Returns: + List of RolloutResult objects grouped by group_size + """ + assert len(rollout_results) > 0, "No rollout results to split." + + all_split_results = [] + + for rollout_result in rollout_results: + split_results = RolloutResult._split_single_result_by_group(rollout_result) + all_split_results.extend(split_results) + + return all_split_results + + @staticmethod + def _split_single_result_by_group( + rollout_result: "RolloutResult", + ) -> List["RolloutResult"]: + """ + Split a single RolloutResult into multiple RolloutResult objects by group_size. 
+ + Args: + rollout_result: The RolloutResult to be split + + Returns: + List of split RolloutResult objects + """ + group_size = rollout_result.group_size + num_sequence = rollout_result.num_sequence + + assert num_sequence % group_size == 0, ( + f"num_sequence ({num_sequence}) must be divisible by group_size ({group_size})" + ) + + num_groups = num_sequence // group_size + split_results = [] + + # Split list fields + prompt_lengths_split = split_list(rollout_result.prompt_lengths, num_groups) + prompt_ids_split = split_list(rollout_result.prompt_ids, num_groups) + response_lengths_split = split_list(rollout_result.response_lengths, num_groups) + response_ids_split = split_list(rollout_result.response_ids, num_groups) + is_end_split = split_list(rollout_result.is_end, num_groups) + + # Handle optional fields + answers_split = None + if rollout_result.answers is not None: + answers_split = split_list(rollout_result.answers, num_groups) + + image_data_split = None + if rollout_result.image_data is not None: + image_data_split = split_list(rollout_result.image_data, num_groups) + + prompt_texts_split = None + if rollout_result.prompt_texts is not None: + prompt_texts_split = split_list(rollout_result.prompt_texts, num_groups) + + response_texts_split = None + if rollout_result.response_texts is not None: + response_texts_split = split_list(rollout_result.response_texts, num_groups) + + rollout_logprobs_split = None + if rollout_result.rollout_logprobs is not None: + rollout_logprobs_split = split_list( + rollout_result.rollout_logprobs, num_groups + ) + + # Handle tensor fields + rewards_split = None + if rollout_result.rewards is not None: + if isinstance(rollout_result.rewards, torch.Tensor): + rewards_split = torch.chunk(rollout_result.rewards, num_groups, dim=0) + else: + rewards_split = split_list(rollout_result.rewards, num_groups) + + advantages_split = None + if rollout_result.advantages is not None: + if isinstance(rollout_result.advantages, torch.Tensor): + advantages_split = torch.chunk( + rollout_result.advantages, num_groups, dim=0 + ) + else: + advantages_split = split_list(rollout_result.advantages, num_groups) + + prev_logprobs_split = None + if rollout_result.prev_logprobs is not None: + prev_logprobs_split = torch.chunk( + rollout_result.prev_logprobs, num_groups, dim=0 + ) + + ref_logprobs_split = None + if rollout_result.ref_logprobs is not None: + ref_logprobs_split = torch.chunk( + rollout_result.ref_logprobs, num_groups, dim=0 + ) + + # Create split RolloutResult objects + for i in range(num_groups): + split_result = RolloutResult( + num_sequence=group_size, + group_size=group_size, + prompt_lengths=prompt_lengths_split[i], + prompt_ids=prompt_ids_split[i], + response_lengths=response_lengths_split[i], + response_ids=response_ids_split[i], + is_end=is_end_split[i], + answers=answers_split[i] if answers_split is not None else None, + image_data=image_data_split[i] + if image_data_split is not None + else None, + prompt_texts=prompt_texts_split[i] + if prompt_texts_split is not None + else None, + response_texts=response_texts_split[i] + if response_texts_split is not None + else None, + rollout_logprobs=rollout_logprobs_split[i] + if rollout_logprobs_split is not None + else None, + rewards=rewards_split[i] if rewards_split is not None else None, + advantages=advantages_split[i] + if advantages_split is not None + else None, + prev_logprobs=prev_logprobs_split[i] + if prev_logprobs_split is not None + else None, + ref_logprobs=ref_logprobs_split[i] + if 
ref_logprobs_split is not None + else None, + ) + split_results.append(split_result) + + return split_results + def to_actor_batch( self, data_seq_length: int, diff --git a/rlinf/hybrid_engines/fsdp/fsdp_model_manager.py b/rlinf/hybrid_engines/fsdp/fsdp_model_manager.py index 8e3b9f16c..5f04d633f 100644 --- a/rlinf/hybrid_engines/fsdp/fsdp_model_manager.py +++ b/rlinf/hybrid_engines/fsdp/fsdp_model_manager.py @@ -19,9 +19,10 @@ from omegaconf import DictConfig from torch.distributed.fsdp import FullyShardedDataParallel as FSDP from torch.distributed.fsdp import MixedPrecision, ShardingStrategy, StateDictType -from transformers import AutoModelForCausalLM +from transformers import AutoConfig, AutoModelForCausalLM, AutoModelForVision2Seq from rlinf.config import torch_dtype_from_precision +from rlinf.data.tokenizers import hf_tokenizer from rlinf.hybrid_engines.fsdp.utils import ( get_fsdp_wrap_policy, init_fn, @@ -40,13 +41,15 @@ def __init__(self, cfg: DictConfig): self.logger = get_logger() self.torch_dtype = torch_dtype_from_precision(self._cfg.model.precision) - assert ( - self.torch_dtype == torch.float16 or self.torch_dtype == torch.bfloat16 - ), ( - f"Precision {self._cfg.model.precision} is not supported, only support bf16 and fp16." - ) + self.tokenizer = hf_tokenizer(cfg.tokenizer.tokenizer_model) def model_provider_func(self) -> torch.nn.Module: + model_config = AutoConfig.from_pretrained( + self._cfg.model.model_path, + trust_remote_code=True, + attn_implementation="flash_attention_2", + ) + if self._cfg.model.get("gptq_model", False): from auto_gptq import AutoGPTQForCausalLM @@ -57,23 +60,31 @@ def model_provider_func(self) -> torch.nn.Module: elif self._cfg.model.get("load_in_8bit", False): model = AutoModelForCausalLM.from_pretrained( self._cfg.model.model_path, - device_map=self._cfg.model.get("device_map", "auto"), load_in_8bit=True, ) else: + if type(model_config) in AutoModelForVision2Seq._model_mapping.keys(): + auto_model_class = AutoModelForVision2Seq + else: + auto_model_class = AutoModelForCausalLM + + # TODO: fix this, load model in float16/bfloat16 may cause optimizer in bf16, which is incorrect # default load in float16 - model = AutoModelForCausalLM.from_pretrained( + model = auto_model_class.from_pretrained( self._cfg.model.model_path, torch_dtype=self.torch_dtype, - device_map=self._cfg.model.get("device_map", "auto"), + config=model_config, trust_remote_code=True, - use_safetensors=self._cfg.model.get("use_safetensors", False), ) - if torch.cuda.is_available(): - model = model.cuda() - if self.torch_dtype == torch.float16: - model = model.half() + model.to(self.torch_dtype) + + if torch.cuda.is_available(): + model = model.cuda() + if self.torch_dtype == torch.float16: + model = model.half() + + torch.distributed.barrier() return model def setup_model_and_optimizer(self): @@ -89,8 +100,8 @@ def setup_model_and_optimizer(self): mixed_precision = MixedPrecision( param_dtype=self.torch_dtype, - reduce_dtype=self.torch_dtype, - buffer_dtype=self.torch_dtype, + reduce_dtype=torch.float32, + buffer_dtype=torch.float32, ) if self._cfg.model.sharding_strategy == "full_shard": @@ -108,7 +119,6 @@ def setup_model_and_optimizer(self): self.model = FSDP( module, param_init_fn=init_fn, - use_orig_params=False, auto_wrap_policy=auto_wrap_policy, device_id=int(os.environ["LOCAL_RANK"]), sharding_strategy=sharding_strategy, # zero3 @@ -130,7 +140,7 @@ def setup_model_and_optimizer(self): }, ] - if self._cfg.model.vh_mode in ["a", "a0", "a6"]: + if 
self._cfg.model.get("vh_mode", None) in ["a", "a0", "a6"]: param_groups.append( { "params": [ diff --git a/rlinf/hybrid_engines/fsdp/utils.py b/rlinf/hybrid_engines/fsdp/utils.py index 2334b1fa7..2461f7006 100644 --- a/rlinf/hybrid_engines/fsdp/utils.py +++ b/rlinf/hybrid_engines/fsdp/utils.py @@ -58,7 +58,7 @@ def cpu_init_weights(): return init_context -def get_fsdp_wrap_policy(module, config=None, is_lora=False): +def get_fsdp_wrap_policy(module, config=None, is_lora=False, is_vla_model=False): """ FSDP wrap policy that handles both standard transformer models and VLA models. @@ -76,11 +76,8 @@ def get_fsdp_wrap_policy(module, config=None, is_lora=False): if config.get("disable", False): return None - # Check if this is a VLA model by looking for language_model attribute - is_vla_model = hasattr(module, "language_model") - # Get transformer layer classes to wrap - if is_vla_model: + if hasattr(module, "language_model"): # For VLA models, get transformer classes from language_model submodule default_transformer_cls_names_to_wrap = getattr( module.language_model, "_no_split_modules", None @@ -100,6 +97,7 @@ def get_fsdp_wrap_policy(module, config=None, is_lora=False): # Add vision transformer policies for VLA models if is_vla_model: + from prismatic.extern.hf.modeling_prismatic import PrismaticProjector from timm.models.vision_transformer import VisionTransformer from torch.distributed.fsdp.wrap import _module_wrap_policy, _or_policy diff --git a/rlinf/hybrid_engines/sglang/sglang_0_4_4/sgl_scheduler.py b/rlinf/hybrid_engines/sglang/sglang_0_4_4/sgl_scheduler.py index 509c56bab..87f736798 100644 --- a/rlinf/hybrid_engines/sglang/sglang_0_4_4/sgl_scheduler.py +++ b/rlinf/hybrid_engines/sglang/sglang_0_4_4/sgl_scheduler.py @@ -108,10 +108,13 @@ def __init__( placement )[(self.get_parent_rank(), self._rank)] + use_presharded_weights = ( + False if self.cfg.actor.training_backend == "fsdp" else True + ) # it's important to use load_weight to load resharded weight from megatron for _, module in self.tp_worker.worker.model_runner.model.named_modules(): if hasattr(module, "use_presharded_weights"): - module.use_presharded_weights = True + module.use_presharded_weights = use_presharded_weights self._logger.info( f"Running Scheduler dp rank {self.get_parent_rank()}, tp rank {self.tp_rank}, corresponding actor weight rank = {self.actor_weight_rank}" diff --git a/rlinf/hybrid_engines/sglang/sglang_0_4_6/sgl_scheduler.py b/rlinf/hybrid_engines/sglang/sglang_0_4_6/sgl_scheduler.py index ef503527b..684f8d333 100644 --- a/rlinf/hybrid_engines/sglang/sglang_0_4_6/sgl_scheduler.py +++ b/rlinf/hybrid_engines/sglang/sglang_0_4_6/sgl_scheduler.py @@ -29,6 +29,7 @@ ReleaseMemoryOccupationReqInput, ResumeMemoryOccupationReqInput, ) +from sglang.srt.managers.mm_utils import init_embedding_cache from sglang.srt.managers.scheduler import Scheduler as _Scheduler from sglang.srt.managers.scheduler import logger from sglang.srt.server_args import PortArgs, ServerArgs @@ -110,10 +111,13 @@ def __init__( self.actor_weight_rank = RankMapper.get_rollout_rank_to_actor_rank_map( placement )[(self.get_parent_rank(), self._rank)] + use_presharded_weights = ( + False if self.cfg.actor.training_backend == "fsdp" else True + ) # it's important to use load_weight to load resharded weight from megatron for _, module in self.tp_worker.worker.model_runner.model.named_modules(): if hasattr(module, "use_presharded_weights"): - module.use_presharded_weights = True + module.use_presharded_weights = use_presharded_weights 
self._logger.info( f"Running Scheduler dp rank {self.get_parent_rank()}, tp rank {self.tp_rank}, corresponding actor weight rank = {self.actor_weight_rank}" diff --git a/rlinf/hybrid_engines/sglang/sglang_0_4_9/sgl_scheduler.py b/rlinf/hybrid_engines/sglang/sglang_0_4_9/sgl_scheduler.py index a7057a161..1f9beb409 100644 --- a/rlinf/hybrid_engines/sglang/sglang_0_4_9/sgl_scheduler.py +++ b/rlinf/hybrid_engines/sglang/sglang_0_4_9/sgl_scheduler.py @@ -112,10 +112,13 @@ def __init__( self.actor_weight_rank = RankMapper.get_rollout_rank_to_actor_rank_map( placement )[(self.get_parent_rank(), self._rank)] + use_presharded_weights = ( + False if self.cfg.actor.training_backend == "fsdp" else True + ) # it's important to use load_weight to load resharded weight from megatron for _, module in self.tp_worker.worker.model_runner.model.named_modules(): if hasattr(module, "use_presharded_weights"): - module.use_presharded_weights = True + module.use_presharded_weights = use_presharded_weights self._logger.info( f"Running Scheduler dp rank {self.get_parent_rank()}, tp rank {self.tp_rank}, corresponding actor weight rank = {self.actor_weight_rank}" diff --git a/rlinf/models/__init__.py b/rlinf/models/__init__.py index 08207900a..617ac7467 100644 --- a/rlinf/models/__init__.py +++ b/rlinf/models/__init__.py @@ -17,7 +17,6 @@ import torch from omegaconf import DictConfig -from peft import LoraConfig, PeftModel, get_peft_model from transformers import ( AutoConfig, AutoImageProcessor, @@ -172,6 +171,8 @@ def get_model(model_path, cfg: DictConfig, override_config_kwargs=None): model = model.cuda() if cfg.is_lora: + from peft import LoraConfig, PeftModel, get_peft_model + if not hasattr(cfg, "lora_path") or cfg.lora_path is None: lora_config = LoraConfig( r=cfg.lora_rank, diff --git a/rlinf/models/embodiment/model_utils.py b/rlinf/models/embodiment/model_utils.py index 04425cfdc..8e7aebeb1 100644 --- a/rlinf/models/embodiment/model_utils.py +++ b/rlinf/models/embodiment/model_utils.py @@ -15,9 +15,10 @@ from typing import Any, Optional import torch -import torch.nn.functional as F from transformers.generation import TopKLogitsWarper +from rlinf.utils.utils import compute_entropy_from_logits, compute_logprobs_from_logits + def default_logits_processor(logits, action_tokens, vocab_size, n_action_bins): logits = logits.permute(0, 2, 1) # [B, vocab-size, action-dim] @@ -34,28 +35,6 @@ def default_logits_processor(logits, action_tokens, vocab_size, n_action_bins): return ret -def compute_logprobs_from_logits(logits, target): - logprobs = -F.cross_entropy( - logits, target=target, reduction="none" - ) # [B, action-dim] - return logprobs - - -def compute_entropy_from_logits(logits, epsilon=1e-10): - """ - Compute entropy by logits. 
- - Args: - logits: [B, vocab-size, seq-len] - Returns: - entropy: [B, seq-len] - """ - all_probs = F.softmax(logits, dim=1) # [B, vocab-size, seq-len] - all_log_probs = torch.log(all_probs + epsilon) - entropy = -torch.sum(all_probs * all_log_probs, dim=1) # [B, seq-len] - return entropy - - def custom_forward( model, input_ids, diff --git a/rlinf/runners/math_runner.py b/rlinf/runners/math_runner.py index a88826f5a..be2cfac1f 100644 --- a/rlinf/runners/math_runner.py +++ b/rlinf/runners/math_runner.py @@ -424,7 +424,7 @@ def run(self): } self.metric_logger.log(training_metrics, logging_steps + i) - logging_metrics = time_metrics + logging_metrics = {f"{k}_time": v for k, v in time_metrics.items()} if self.cfg.actor.get("calculate_flops", False): flops_metrics = self._compute_flops_metrics( diff --git a/rlinf/utils/convertor/utils.py b/rlinf/utils/convertor/utils.py index 5d15e3639..e187bb919 100644 --- a/rlinf/utils/convertor/utils.py +++ b/rlinf/utils/convertor/utils.py @@ -447,7 +447,7 @@ def register_mg2hf_convertor(model_arch: str, convertor_cls: Callable) -> None: register_mg2hf_convertor("qwen2.5", Qwen2_5Convertor) -register_mg2hf_convertor("qwen2.5-vl", Qwen2_5VLConvertor) +register_mg2hf_convertor("qwen2.5_vl", Qwen2_5VLConvertor) def get_mg2hf_convertor(model_arch: str, config, strict: bool = False) -> BaseConvertor: diff --git a/rlinf/utils/distributed.py b/rlinf/utils/distributed.py index e9da8d6da..a54f444d8 100644 --- a/rlinf/utils/distributed.py +++ b/rlinf/utils/distributed.py @@ -31,7 +31,12 @@ def compute_rollout_metrics( - rollout_batch, max_prompt_len, response_len, use_critic=False + rollout_batch, + max_prompt_len, + response_len, + dp_world_size, + dp_group=None, + use_critic=False, ): device = torch.device(f"cuda:{torch.cuda.current_device()}") advantages = rollout_batch["advantages"].to(device=device) @@ -41,8 +46,6 @@ def compute_rollout_metrics( reward_scores = rollout_batch["rewards"].clone().to(device=device) is_end = rollout_batch["is_end"].clone().float().to(device=device) - dp_world_size = parallel_state.get_data_parallel_world_size() - prompt_lengths_list = [ torch.empty_like(prompt_lengths) for _ in range(dp_world_size) ] @@ -52,12 +55,12 @@ def compute_rollout_metrics( torch.distributed.all_gather( prompt_lengths_list, prompt_lengths, - group=parallel_state.get_data_parallel_group(), + group=dp_group, ) torch.distributed.all_gather( decode_lengths_list, response_lengths, - group=parallel_state.get_data_parallel_group(), + group=dp_group, ) total_prompt_lengths = torch.cat(prompt_lengths_list, dim=0) @@ -66,22 +69,22 @@ def compute_rollout_metrics( torch.distributed.all_reduce( prompt_lengths, torch.distributed.ReduceOp.AVG, - group=parallel_state.get_data_parallel_group(), + group=dp_group, ) torch.distributed.all_reduce( response_lengths, torch.distributed.ReduceOp.AVG, - group=parallel_state.get_data_parallel_group(), + group=dp_group, ) torch.distributed.all_reduce( reward_scores, torch.distributed.ReduceOp.AVG, - group=parallel_state.get_data_parallel_group(), + group=dp_group, ) torch.distributed.all_reduce( is_end, torch.distributed.ReduceOp.AVG, - group=parallel_state.get_data_parallel_group(), + group=dp_group, ) valid_adv = torch.masked_select(advantages, mask) @@ -90,12 +93,12 @@ def compute_rollout_metrics( torch.distributed.all_reduce( n_valid_token, op=torch.distributed.ReduceOp.SUM, - group=parallel_state.get_data_parallel_group(), + group=dp_group, ) torch.distributed.all_reduce( adv_sum, op=torch.distributed.ReduceOp.SUM, - 
group=parallel_state.get_data_parallel_group(),
+        group=dp_group,
     )
 
     adv_mean = adv_sum / n_valid_token
@@ -107,7 +110,7 @@
     torch.distributed.all_reduce(
         reduce_tensor,
         torch.distributed.ReduceOp.MAX,
-        group=parallel_state.get_data_parallel_group(),
+        group=dp_group,
     )
     adv_min, adv_max = reduce_tensor.tolist()
 
diff --git a/rlinf/utils/placement.py b/rlinf/utils/placement.py
index 7b707894e..10738ce9e 100644
--- a/rlinf/utils/placement.py
+++ b/rlinf/utils/placement.py
@@ -226,9 +226,6 @@ def __init__(self, config: DictConfig, cluster: Cluster):
         self._rollout_num_gpus = len(self._rollout_gpus)
 
         if self._is_collocated():
-            assert self.actor_tp_size >= self.rollout_tp_size, (
-                f"Actor TP size {self.actor_tp_size} must be greater or equal to Rollout TP size {self.rollout_tp_size}."
-            )
             assert self._inference_gpus is None, (
                 "Inference GPUs must not be specified in collocated mode."
             )
@@ -282,11 +279,13 @@ def _generate_placements(self):
         actor_tp_size = self._config.actor.model.tensor_model_parallel_size
         rollout_tp_size = self._config.rollout.tensor_parallel_size
 
-        assert actor_tp_size >= rollout_tp_size, (
-            f"Actor TP size ({actor_tp_size}) must be greater or equal to Rollout TP size ({rollout_tp_size})"
-        )
-        assert actor_tp_size % rollout_tp_size == 0, (
-            f"Actor TP size ({actor_tp_size}) must be divisible by Rollout TP size ({rollout_tp_size})"
+        if actor_tp_size > rollout_tp_size:
+            assert actor_tp_size % rollout_tp_size == 0, (
+                f"Actor TP size ({actor_tp_size}) must be divisible by Rollout TP size ({rollout_tp_size})"
+            )
+        stride = (
+            self.actor_tp_size // self.rollout_tp_size
+            if self.actor_tp_size > self.rollout_tp_size
+            else 1
         )
-        stride = actor_tp_size // rollout_tp_size
         self._placements["rollout"] = PackedPlacementStrategy(
@@ -325,18 +325,18 @@ def has_dedicated_inference(self):
     @property
     def actor_dp_size(self) -> int:
         return self._actor_num_gpus // (
-            self._config.actor.model.tensor_model_parallel_size
-            * self._config.actor.model.context_parallel_size
-            * self._config.actor.model.pipeline_model_parallel_size
+            self._config.actor.model.get("tensor_model_parallel_size", 1)
+            * self._config.actor.model.get("context_parallel_size", 1)
+            * self._config.actor.model.get("pipeline_model_parallel_size", 1)
         )
 
     @property
     def actor_tp_size(self) -> int:
-        return self._config.actor.model.tensor_model_parallel_size
+        return self._config.actor.model.get("tensor_model_parallel_size", 1)
 
     @property
     def actor_pp_size(self) -> int:
-        return self._config.actor.model.pipeline_model_parallel_size
+        return self._config.actor.model.get("pipeline_model_parallel_size", 1)
 
     @property
     def actor_world_size(self) -> int:
@@ -349,7 +349,7 @@ def inference_tp_size(self) -> int:
             and hasattr(self._config.inference, "model")
             and hasattr(self._config.inference.model, "tensor_model_parallel_size")
         ):
-            return self._config.inference.model.tensor_model_parallel_size
+            return self._config.inference.model.get("tensor_model_parallel_size", 1)
         else:
             return self.actor_tp_size
 
@@ -360,7 +360,7 @@ def inference_pp_size(self) -> int:
             and hasattr(self._config.inference, "model")
             and hasattr(self._config.inference.model, "pipeline_model_parallel_size")
         ):
-            return self._config.inference.model.pipeline_model_parallel_size
+            return self._config.inference.model.get("pipeline_model_parallel_size", 1)
         else:
            return self.actor_pp_size
 
@@ -377,13 +377,13 @@ def inference_world_size(self) -> int:
     @property
     def rollout_dp_size(self) -> int:
         return self._rollout_num_gpus // (
-
self._config.rollout.tensor_parallel_size - * self._config.rollout.pipeline_parallel_size + self._config.rollout.get("tensor_parallel_size", 1) + * self._config.rollout.get("pipeline_parallel_size", 1) ) @property def rollout_tp_size(self) -> int: - return self._config.rollout.tensor_parallel_size + return self._config.rollout.get("tensor_parallel_size", 1) @property def rollout_world_size(self) -> int: diff --git a/rlinf/utils/resharding/utils.py b/rlinf/utils/resharding/utils.py index 82ca3eadf..d7a4af231 100644 --- a/rlinf/utils/resharding/utils.py +++ b/rlinf/utils/resharding/utils.py @@ -14,7 +14,6 @@ import torch -from megatron.core import parallel_state def get_tp_reshard_fn(model_arch: str): @@ -96,6 +95,8 @@ def _gather_pp_group_tensor_and_reshard( def pp_reshard_fn_qwen2_5(model_state_dict, pp_group, dtype): + from megatron.core import parallel_state + pp_first_rank = parallel_state.get_pipeline_model_parallel_first_rank() pp_last_rank = parallel_state.get_pipeline_model_parallel_last_rank() diff --git a/rlinf/utils/utils.py b/rlinf/utils/utils.py index 449412f8b..d117f4dc4 100644 --- a/rlinf/utils/utils.py +++ b/rlinf/utils/utils.py @@ -20,6 +20,7 @@ from functools import partial, wraps import torch +import torch.nn.functional as F def clear_memory(sync=True): @@ -124,6 +125,57 @@ def seq_mean_token_mean(values, mask): return loss +def logprobs_from_logits_flash_attn(logits, labels, inplace_backward=True): + from flash_attn.ops.triton.cross_entropy import cross_entropy_loss + + output = cross_entropy_loss(logits, labels, inplace_backward=inplace_backward) + assert isinstance(output, tuple), ( + "please make sure flash-attn>=2.4.3 where cross_entropy_loss returns Tuple[losses, z_losses]." + ) + return -output[0] + + +def compute_logprobs_from_logits(logits, target, task_type="embodied"): + if task_type == "embodied": + logprobs = -F.cross_entropy( + logits, target=target, reduction="none" + ) # [B, action-dim] + return logprobs + batch_dim = logits.shape[:-1] + last_dim = logits.shape[-1] + logits = logits.reshape(-1, last_dim) + labels = target.reshape(-1) + logprobs = logprobs_from_logits_flash_attn( + logits, labels=labels, inplace_backward=False + ) + logprobs = logprobs.view(*batch_dim) + return logprobs + + +def entropy_from_logits(logits: torch.Tensor): + """Calculate entropy from logits.""" + pd = torch.nn.functional.softmax(logits, dim=-1) + entropy = torch.logsumexp(logits, dim=-1) - torch.sum(pd * logits, dim=-1) + return entropy + + +def compute_entropy_from_logits(logits, epsilon=1e-10, task_type="embodied"): + """ + Compute entropy by logits. + + Args: + logits: [B, vocab-size, seq-len] + Returns: + entropy: [B, seq-len] + """ + if task_type == "embodied": + all_probs = F.softmax(logits, dim=1) # [B, vocab-size, seq-len] + all_log_probs = torch.log(all_probs + epsilon) + entropy = -torch.sum(all_probs * all_log_probs, dim=1) # [B, seq-len] + return entropy + return entropy_from_logits(logits=logits) + + class DualOutput: def __init__(self, file, terminal): self.file = file diff --git a/rlinf/workers/actor/__init__.py b/rlinf/workers/actor/__init__.py index 5b365ea1e..2d315469e 100644 --- a/rlinf/workers/actor/__init__.py +++ b/rlinf/workers/actor/__init__.py @@ -11,3 +11,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ +from omegaconf import DictConfig + +from rlinf.scheduler.worker.worker import Worker + + +def get_actor_worker(cfg: DictConfig) -> Worker: + if cfg.actor.training_backend == "fsdp": + from .fsdp_actor_worker import FSDPActor + + return FSDPActor + elif cfg.actor.training_backend == "megatron": + from .megatron_actor_worker import MegatronActor + + return MegatronActor + else: + raise ValueError(f"Unsupported training backend: {cfg.actor.training_backend}") diff --git a/rlinf/workers/actor/fsdp_actor_worker.py b/rlinf/workers/actor/fsdp_actor_worker.py index 61b05c9b4..37dd1d9ba 100644 --- a/rlinf/workers/actor/fsdp_actor_worker.py +++ b/rlinf/workers/actor/fsdp_actor_worker.py @@ -14,31 +14,480 @@ import gc import os +from typing import Dict, List, Tuple import numpy as np import torch from omegaconf import DictConfig from torch.distributed.device_mesh import init_device_mesh +from torch.multiprocessing.reductions import reduce_tensor from tqdm import tqdm import rlinf.algorithms # noqa: F401 from rlinf.algorithms.registry import actor_loss, calculate_adv_and_returns -from rlinf.algorithms.utils import preprocess_advantages_inputs, preprocess_loss_inputs +from rlinf.algorithms.utils import ( + kl_penalty, + preprocess_advantages_inputs, + preprocess_loss_inputs, +) +from rlinf.data.io_struct import RolloutResult from rlinf.hybrid_engines.fsdp.fsdp_model_manager import ( FSDPModelManager, ) from rlinf.models import get_model from rlinf.models.embodiment.model_utils import custom_forward -from rlinf.scheduler import Cluster, Worker +from rlinf.scheduler import Channel, Cluster, Worker from rlinf.utils.data_iter_utils import get_iterator_k_split from rlinf.utils.distributed import all_reduce_dict +from rlinf.utils.distributed import ( + compute_rollout_metrics as compute_math_rollout_metrics, +) from rlinf.utils.metric_utils import ( append_to_dict, compute_loss_mask, compute_rollout_metrics, compute_split_num, ) -from rlinf.utils.placement import HybridComponentPlacement +from rlinf.utils.placement import ( + HybridComponentPlacement, + ModelParallelComponentPlacement, +) +from rlinf.utils.utils import ( + compute_entropy_from_logits, + compute_logprobs_from_logits, + masked_mean, + seq_mean_token_mean, + seq_mean_token_sum, +) +from rlinf.workers.rollout.utils import RankMapper +from toolkits.math_verifier.verify import math_verify_call + + +class FSDPActor(FSDPModelManager, Worker): + def __init__(self, cfg: DictConfig, placement: ModelParallelComponentPlacement): + Worker.__init__(self) + super().__init__(cfg.actor) + + self.cfg = cfg + + self.response_len = ( + cfg.actor.model.encoder_seq_length - cfg.data.max_prompt_length + ) + self.calculate_entropy = self.cfg.algorithm.calculate_entropy + self.calculate_entropy_loss = ( + self.cfg.algorithm.entropy_bonus > 0 and self.calculate_entropy + ) + self.kl_beta = self.cfg.algorithm.kl_beta + self.kl_penalty_type = self.cfg.algorithm.kl_penalty_type + + self.total_batch_size_per_dp = ( + self.cfg.data.rollout_batch_size + * self.cfg.algorithm.group_size + // self._world_size + ) + + torch.cuda.set_device(int(os.environ["LOCAL_RANK"])) + self.device = torch.cuda.current_device() + world_size = self._world_size + self.device_mesh = init_device_mesh( + "cuda", mesh_shape=(world_size,), mesh_dim_names=["fsdp"] + ) + + self._rollout_group_name = cfg.rollout.group_name + self._component_placement = placement + self.is_data_io_rank = True + + if self.cfg.algorithm.loss_agg_func == "token-mean": + self.loss_agg_func = masked_mean + elif 
self.cfg.algorithm.loss_agg_func == "seq-mean-token-sum":
+            self.loss_agg_func = seq_mean_token_sum
+        elif self.cfg.algorithm.loss_agg_func == "seq-mean-token-mean":
+            self.loss_agg_func = seq_mean_token_mean
+        else:
+            raise NotImplementedError(
+                f"algorithm.loss_agg_func={self.cfg.algorithm.loss_agg_func} is not supported!"
+            )
+
+        # Reward configurations
+        if not self.cfg.reward.use_reward_model:
+            assert self.cfg.reward.reward_type == "math", "only support math"
+            self.reward_fn = math_verify_call
+
+    def init_worker(self):
+        self.setup_model_and_optimizer()
+        if self.cfg.actor.get("enable_offload", False):
+            self.offload_fsdp_param_and_grad()
+            self.offload_fsdp_optimizer()
+        torch.cuda.synchronize()
+        gc.collect()
+        torch.cuda.empty_cache()
+        self._setup_rollout_weight_dst_ranks()
+
+    def _setup_rollout_weight_dst_ranks(self):
+        """Set up destination ranks for token and weight communication."""
+        rank_map = RankMapper.get_actor_rank_to_rollout_rank_map(
+            self._component_placement
+        )
+        self._weight_dst_rank_in_rollout = rank_map[self._rank]
+        self.log_info(
+            f"Actor rank {self._rank} will send weights to {self._weight_dst_rank_in_rollout}"
+        )
+
+    def del_reshard_state_dict(self):
+        if hasattr(self, "rollout_state_dict"):
+            del self.rollout_state_dict
+
+    def sync_model_to_rollout(self):
+        if next(self.model.parameters()).is_cpu:
+            self.load_fsdp_param_and_grad(self.device)
+
+        self.rollout_state_dict = self.get_model_state_dict()
+
+        if self._weight_dst_rank_in_rollout is not None:
+
+            def transform_key(k):
+                if k.startswith("model.language_model."):
+                    return "model." + k[21:]
+                elif k.startswith("model."):
+                    return k[6:]
+                else:
+                    return k
+
+            handle = {
+                transform_key(k): reduce_tensor(v)
+                for k, v in self.rollout_state_dict.items()
+            }
+
+            self.send(
+                handle, self._rollout_group_name, self._weight_dst_rank_in_rollout
+            )
+        if self.cfg.actor.get("enable_offload", False):
+            self.offload_fsdp_param_and_grad()
+        torch.cuda.synchronize()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def compute_logprobs(self):
+        self.model.eval()
+        self.rollout_batch["logprob"] = self.rollout_batch["prev_logprobs"]
+
+    def get_batch(
+        self, channel: Channel
+    ) -> Tuple[Dict[str, torch.Tensor], RolloutResult]:
+        result: RolloutResult = channel.get()
+
+        batch = result.to_actor_batch(
+            self.cfg.data.max_prompt_length,
+            self.cfg.actor.model.encoder_seq_length,
+            self.tokenizer.eos_token_id,
+        )
+        return batch, result
+
+    def put_result(self, result: RolloutResult, channel: Channel):
+        if channel.is_local:
+            # Local channel, every process will put its own data locally
+            # No need to broadcast
+            channel.put(result)
+        else:
+            if self.is_data_io_rank:
+                channel.put(result)
+
+    def _load_weight_and_optimizer(self, channel: Channel):
+        # Acquire the GPUs to ensure that no one is using them before loading models
+        # Otherwise, it may lead to OOM
+        with channel.gpu_lock:
+            if self.cfg.actor.get("enable_offload", False):
+                self.load_fsdp_param_and_grad(self.device)
+                self.load_fsdp_optimizer(self.device)
+
+    def run_training(self, input_channel: Channel):
+        # Get all batches for this DP
+        batches = []
+        recv_batch_size = 0
+        while recv_batch_size < self.total_batch_size_per_dp:
+            batch, rollout_result = self.get_batch(input_channel)
+            batches.append(batch)
+            recv_batch_size += rollout_result.num_sequence
+        assert recv_batch_size == self.total_batch_size_per_dp, (
+            f"Expected {self.total_batch_size_per_dp} sequences from channel, but got {recv_batch_size}"
+        )
+        batch = RolloutResult.merge_batches(batches)
+
+
        # Must be called after batch is retrieved, which is when rollout has stopped
        # Otherwise, loading the model might cause OOM
        self._load_weight_and_optimizer(input_channel)

        global_batches = get_iterator_k_split(
            batch,
            num_splits=self.cfg.algorithm.n_minibatches,
            shuffle=self.cfg.algorithm.get("shuffle_rollout", True),
            shuffle_seed=self.cfg.actor.seed,
        )

        self.model.train()
        assert (
            self.cfg.actor.global_batch_size
            % (self.cfg.actor.micro_batch_size * self._world_size)
            == 0
        )

        self.gradient_accumulation = (
            self.cfg.actor.global_batch_size
            // self.cfg.actor.micro_batch_size
            // self._world_size
        )

        training_metrics_list = []
        # Global batch iterations
        with self.worker_timer():
            for global_batch in global_batches:
                train_global_batch_size = global_batch["input_ids"].shape[0]
                assert (
                    train_global_batch_size
                    == self.cfg.actor.global_batch_size
                    // torch.distributed.get_world_size()
                )
                assert train_global_batch_size % self.cfg.actor.micro_batch_size == 0, (
                    f"{train_global_batch_size=}, {self.cfg.actor.micro_batch_size}"
                )

                self.gradient_accumulation = (
                    self.cfg.actor.global_batch_size
                    // self.cfg.actor.micro_batch_size
                    // self._world_size
                )
                # split batch into micro_batches
                train_micro_batches = get_iterator_k_split(
                    global_batch,
                    train_global_batch_size // self.cfg.actor.micro_batch_size,
                )

                self.optimizer.zero_grad()
                metrics = {}
                for _, m_batch in enumerate(train_micro_batches):
                    for k, v in m_batch.items():
                        m_batch[k] = v.to(f"cuda:{int(os.environ['LOCAL_RANK'])}")

                    multi_modal_inputs = {}
                    if "multi_modal_inputs" in m_batch.keys():
                        if (
                            "image_bound" in m_batch["multi_modal_inputs"][0]
                        ):  # minicpm-o logic
                            for key in m_batch["multi_modal_inputs"][0].keys():
                                multi_modal_inputs[key] = [
                                    inputs[key]
                                    for inputs in m_batch["multi_modal_inputs"]
                                ]
                        else:
                            for key in m_batch["multi_modal_inputs"][0].keys():
                                multi_modal_inputs[key] = torch.cat(
                                    [
                                        inputs[key]
                                        for inputs in m_batch["multi_modal_inputs"]
                                    ],
                                    dim=0,
                                )

                    input_ids = m_batch["input_ids"]
                    attention_mask = m_batch["attention_mask"]
                    position_ids = m_batch["position_ids"]
                    prev_logprobs = m_batch["prev_logprobs"]
                    advantages = m_batch["advantages"]
                    ref_logprobs = None
                    if "ref_logprobs" in m_batch:
                        ref_logprobs = m_batch["ref_logprobs"]

                    loss_mask = m_batch["attention_mask"][:, -self.response_len :]

                    output = self.model(
                        input_ids=input_ids,
                        attention_mask=attention_mask,
                        position_ids=position_ids,
                        **multi_modal_inputs,
                        use_cache=False,
                    )  # prevent the model from thinking we are generating

                    logits = output.logits

                    logits.div_(self.cfg.algorithm.sampling_params.temperature)

                    responses = input_ids[:, -self.response_len :]
                    logits = logits[
                        :, -self.response_len - 1 : -1, :
                    ]  # (bsz, response_length, vocab_size)
                    logprobs = compute_logprobs_from_logits(
                        logits, responses, task_type=self.cfg.runner.task_type
                    )
                    if self.calculate_entropy:
                        entropy = compute_entropy_from_logits(
                            logits, task_type=self.cfg.runner.task_type
                        )  # (bsz, response_length)

                    clip_ratio = self.cfg.algorithm.ratio_clip_eps
                    clip_ratio_low = (
                        self.cfg.algorithm.clip_ratio_low
                        if self.cfg.algorithm.clip_ratio_low is not None
                        else clip_ratio
                    )
                    clip_ratio_high = (
                        self.cfg.algorithm.clip_ratio_high
                        if self.cfg.algorithm.clip_ratio_high is not None
                        else clip_ratio
                    )
                    clip_ratio_c = self.cfg.algorithm.get("clip_ratio_c", 3.0)

                    loss, mbs_metrics_data = actor_loss(
                        loss_type=self.cfg.algorithm.loss_type,
                        loss_agg_func=self.loss_agg_func,
                        logprobs=logprobs,
                        old_logprobs=prev_logprobs,
                        advantages=advantages,
                        clip_ratio_low=clip_ratio_low,
                        clip_ratio_high=clip_ratio_high,
                        clip_ratio_c=clip_ratio_c,
                        loss_mask=loss_mask,
                    )

                    entropy_loss = torch.tensor(0.0, device=torch.cuda.current_device())
                    if self.calculate_entropy:
                        # entropy was computed from the response-window logits above
                        entropy_loss = self.loss_agg_func(entropy, mask=loss_mask)
                        if self.calculate_entropy_loss:
                            loss = (
                                loss - self.cfg.algorithm.entropy_bonus * entropy_loss
                            )

                    kl_loss = torch.tensor(0.0, device=torch.cuda.current_device())
                    if self.kl_beta > 0 and ref_logprobs is not None:
                        kld = kl_penalty(logprobs, ref_logprobs, self.kl_penalty_type)
                        kl_loss = self.loss_agg_func(kld, loss_mask)
                        loss = loss + kl_loss * self.kl_beta

                    # add to log
                    mbs_metrics_data.update(
                        {
                            "final_loss": loss.detach().cpu(),
                            "entropy_loss": entropy_loss.detach().cpu(),
                            "kl_loss": kl_loss.detach().cpu(),
                        }
                    )

                    append_to_dict(metrics, mbs_metrics_data)

                mean_metric_dict = {
                    key: np.mean(value) for key, value in metrics.items()
                }
                mean_metric_dict = all_reduce_dict(
                    mean_metric_dict, op=torch.distributed.ReduceOp.AVG
                )
                training_metrics_list.append(mean_metric_dict)

        # Rollout metrics
        rollout_metrics, _, _ = compute_math_rollout_metrics(
            batch, self.cfg.data.max_prompt_length, self.response_len, self._world_size
        )

        return rollout_metrics, training_metrics_list

    def save_checkpoint(self, save_base_path, step):
        torch.distributed.barrier()
        model_state = self.get_model_state_dict()
        optim_state = self.get_optimizer_state_dict()
        if self._rank == 0:
            os.makedirs(save_base_path, exist_ok=True)
            torch.save(model_state, os.path.join(save_base_path, "model.pt"))
            torch.save(optim_state, os.path.join(save_base_path, "optim.pt"))
        torch.distributed.barrier()

    def _compute_batch_rewards(
        self, batch: Dict[str, torch.Tensor], answers: List[str]
    ):
        """Reward computation using non-model based reward."""
        texts = []
        for response, response_len in zip(
            batch["input_ids"],
            batch["response_lengths"],
        ):
            response = response[
                self.cfg.data.max_prompt_length : self.cfg.data.max_prompt_length
                + response_len
            ]
            texts.append(
                self.tokenizer.decode(response.tolist(), skip_special_tokens=True)
            )
        rewards = self.reward_fn(texts, answers)
        reward_scores = [
            self.cfg.reward.reward_scale
            if reward == 1
            else -self.cfg.reward.reward_scale
            for reward in rewards
        ]
        all_reward_scores = torch.as_tensor(
            reward_scores,
            dtype=torch.float,
            device=torch.device("cpu"),
        ).view(-1, 1)
        return all_reward_scores.flatten()

    # Rewards
    def compute_rewards(self, input_channel: Channel, output_channel: Channel):
        """Compute rewards.

        Args:
            input_channel: The input channel to read from.
            output_channel: The output channel to send results to.
+ """ + recv_batch_size = 0 + while recv_batch_size < self.total_batch_size_per_dp: + batch, rollout_result = self.get_batch(input_channel) + recv_batch_size += rollout_result.num_sequence + + # Compute rule-based reward + with self.worker_timer(): + if rollout_result.rewards is None: + rollout_result.rewards = self._compute_batch_rewards( + batch, rollout_result.answers + ) + + self.put_result(rollout_result, output_channel) + + assert recv_batch_size == self.total_batch_size_per_dp, ( + f"Expected {self.total_batch_size_per_dp} sequences from channel, but got {recv_batch_size}" + ) + + # Advantages and returns + def compute_advantages_and_returns( + self, input_channel: Channel, output_channel: Channel + ): + """Compute the advantages and returns. + + Args: + input_channel: The input channel to read from. + output_channel: The output channel to send results to. + """ + recv_batch_size = 0 + while recv_batch_size < self.total_batch_size_per_dp: + batch, rollout_result = self.get_batch(input_channel) + recv_batch_size += rollout_result.num_sequence + + with self.worker_timer(): + if rollout_result.advantages is None: + mask = batch["attention_mask"][:, -self.response_len :] + advantages, returns = calculate_adv_and_returns( + adv_type=self.cfg.algorithm.adv_type, + reward_scores=batch["rewards"].cuda(), + mask=mask.cuda(), + num_responses=self.cfg.algorithm.group_size, + ) + rollout_result.advantages = advantages.cpu() + + self.put_result(rollout_result, output_channel) + + assert recv_batch_size == self.total_batch_size_per_dp, ( + f"Expected {self.total_batch_size_per_dp} sequences from channel, but got {recv_batch_size}" + ) class EmbodiedFSDPActor(FSDPModelManager, Worker): diff --git a/rlinf/workers/rollout/sglang/sglang_worker.py b/rlinf/workers/rollout/sglang/sglang_worker.py index 6a921c0bd..41f026879 100644 --- a/rlinf/workers/rollout/sglang/sglang_worker.py +++ b/rlinf/workers/rollout/sglang/sglang_worker.py @@ -207,8 +207,9 @@ def rollout(self, input_channel: Channel, output_channel: Channel): self._stop() # Release the GPUs once the engine has offloaded output_channel.device_lock.release() - rollout_result = RolloutResult.merge_result_list(rollout_results) - output_channel.put(rollout_result) + rollout_result_list = RolloutResult.split_result_list_by_group(rollout_results) + for rollout_result in rollout_result_list: + output_channel.put(rollout_result) def all_floats_equal(float_list: list[float], epsilon: float = 1e-9) -> bool: diff --git a/rlinf/workers/rollout/utils.py b/rlinf/workers/rollout/utils.py index f3845ef2f..f92e48caa 100644 --- a/rlinf/workers/rollout/utils.py +++ b/rlinf/workers/rollout/utils.py @@ -376,6 +376,12 @@ def get_actor_rank_to_rollout_rank_map( """ Get the global mapping from actor 1D rank to rollout 2D rank as dict. 
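
        Example (illustrative): with rollout_tp_size=2 and actor_tp_size=1,
        four actor ranks map to {0: (0, 0), 1: (0, 1), 2: (1, 0), 3: (1, 1)},
        i.e. rank -> (dp, tp).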
""" + # rank -> (dp, tp) + if actor_tp_size == 1: + return { + rank: (rank // rollout_tp_size, rank % rollout_tp_size) + for rank in range(actor_world_size) + } rank_map = {} for actor_rank in range(actor_world_size): rank_map[actor_rank] = cls._get_actor_rank_to_rollout_rank( From 7c382c8114912217117d5335e3e1a26bfe2194b2 Mon Sep 17 00:00:00 2001 From: Bo Dai Date: Fri, 19 Sep 2025 08:50:10 +0000 Subject: [PATCH 24/57] feat(dataset): refactor and add lazy loader process Signed-off-by: Bo Dai --- rlinf/data/datasets.py | 562 +++++++++++++++++++++++++++++++---------- 1 file changed, 424 insertions(+), 138 deletions(-) diff --git a/rlinf/data/datasets.py b/rlinf/data/datasets.py index 162d7b94d..2be41ce3b 100644 --- a/rlinf/data/datasets.py +++ b/rlinf/data/datasets.py @@ -16,14 +16,14 @@ import logging import os from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import pandas as pd import torch from omegaconf import DictConfig from PIL.Image import Image from torch.utils.data import Dataset -from transformers import AutoProcessor +from transformers import AutoProcessor, AutoTokenizer def batch_pad_to_fixed_len( @@ -176,156 +176,424 @@ def __getitem__(self, idx): return output -class VisionLanguageDataset(Dataset): +class VLMBaseDataset(Dataset): + def __init__( - self, data_paths: Union[List[str], str], config: DictConfig, tokenizer - ): + self, + data_paths: Union[List[str], str], + config: DictConfig, + tokenizer: AutoTokenizer, + *, + lazy_loading: bool = False, + ) -> None: super().__init__() - self.data_paths = data_paths - self.use_chat_template = config.data.use_chat_template + self.cfg = config + raw_paths = [data_paths] if isinstance(data_paths, str) else list(data_paths) + # Expand directories into file lists recursively (json/jsonl/parquet) + self.data_paths = self._expand_data_paths(raw_paths) + self.tokenizer = tokenizer + # Delay processor creation; only needed when use_chat_template is True + self._processor = None - self.image_keys = config.data.image_keys + self.use_chat_template = bool(config.data.use_chat_template) + self.image_keys = list(config.data.image_keys or []) self.prompt_key = config.data.prompt_key - self.choice_key = config.data.choice_key - self.answer_key = config.data.answer_key - self.solution_key = config.data.solution_key + self.choice_key = config.data.get("choice_key", None) + self.answer_key = config.data.get("answer_key", None) + self.solution_key = config.data.get("solution_key", None) + self.max_prompt_length = int(config.data.max_prompt_length) + self.eos_id = int(self.tokenizer.eos_token_id) - if isinstance(self.data_paths, str): - self.data_paths = [self.data_paths] + # Loading mode + self.lazy_loading = bool(getattr(config.data, "lazy_loading", lazy_loading)) - self.max_prompt_length = config.data.max_prompt_length - self.tokenizer = tokenizer - self.processor = AutoProcessor.from_pretrained(config.actor.model.model_path) - self.data = self._load_data() - self.post_process() - - def post_process(self) -> None: - def get_image_list( - dataitem: Dict, image_keys: Optional[List[str]] - ) -> List[Union[bytes, str]]: - image_list: List[Union[bytes, str]] = [] - if image_keys: - for key in image_keys: - image_content = dataitem.get(key, None) - if image_content is None: - continue - if isinstance(image_content, Image): - image_content.append(image_content) - if isinstance(image_content, dict) and "bytes" in image_content: - image_content = 
image_content["bytes"] - assert isinstance(image_content, bytes), ( - f"image content should be bytes, but got {type(image_content)} , content is {image_content}" - ) - image_list.append(image_content) - if image_list == []: - return [None] - return image_list - - def process_prompt( - data_item: Dict, image_count: int - ) -> Tuple[ - str, - List[int], - int, - ]: - question = data_item.get(self.prompt_key, "") - options = data_item.get(self.choice_key, []) - if not isinstance(options, list): - options = [options] - prompt_text = question - if options: - prompt_text += f"{options}\n" - if self.use_chat_template: - message_content: List = [] - for i in range(image_count): - message_content.append({"type": "image"}) - message_content.append({"type": "text", "text": prompt_text}) - messages = [{"role": "user", "content": message_content}] - prompt_text = self.processor.apply_chat_template( - messages, tokenize=False, add_generation_prompt=True - ) - prompt_ids = self.processor( - text=[prompt_text], - padding=True, - return_tensors="pt", - )["input_ids"] - if isinstance(prompt_ids, torch.Tensor): - if prompt_ids.dim() == 2 and prompt_ids.size(0) == 1: - prompt_ids = prompt_ids.squeeze(0) # [L] - prompt_ids = prompt_ids.to(dtype=torch.long) - else: - prompt_ids = torch.tensor(prompt_ids, dtype=torch.long) - prompt_length = len(prompt_ids) - - return prompt_text, prompt_ids, prompt_length - else: - raise NotImplementedError("Non-chat template not implemented yet.") + self._records = [] + self._indices = [] # (path, fmt, row_index_or_offset) - processed_data: List[DatasetItem] = [] - for idx, item in enumerate(self.data): - image_list: List[Union[bytes, str]] = get_image_list(item, self.image_keys) - prompt_text, prompt_ids, prompt_length = process_prompt( - item, len(image_list) - ) + if self.lazy_loading: + self._build_lazy_indices() + else: + self._eager_load_all() - if prompt_length > self.max_prompt_length: - print( - f"prompt_ids length {prompt_length} exceeds the max_prompt_length {self.max_prompt_length}", + def __len__(self) -> int: + return len(self._indices) if self.lazy_loading else len(self._records) + + def __getitem__(self, idx: int) -> DatasetItem: + if self.lazy_loading: + path, fmt, key = self._indices[idx] + raw = self._load_single_lazy(path, fmt, key) + return self._process_raw_record(raw, idx) + else: + raw = self._records[idx] + return self._process_raw_record(raw, idx) + + # Ensure dataset is picklable for multi-process DataLoader by removing + # unpicklable cache objects like pyarrow.ParquetFile from state. 
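    # A minimal sketch of the failure this guards against (assumed usage, not
    # code from this repo): DataLoader(dataset, num_workers=4) pickles the
    # dataset into each worker process, and an open pyarrow.parquet.ParquetFile
    # handle would make that pickling fail; the caches are therefore dropped
    # here and rebuilt lazily inside each worker.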
+ def __getstate__(self): + state = self.__dict__.copy() + # Drop heavy/unpicklable caches; they will be rebuilt on-demand in workers + for k in ("_parquet_cache", "_parquet_df_cache"): + if k in state: + state[k] = {} + return state + + def __setstate__(self, state): + # Restore state and ensure cache dicts exist + self.__dict__.update(state) + self._parquet_cache = getattr(self, "_parquet_cache", {}) + self._parquet_df_cache = getattr(self, "_parquet_df_cache", {}) + + def get_image_list(self, dataitem: Dict[str, Any]) -> List[Union[bytes, str, None]]: + images: List[Union[bytes, str, None]] = [] + for k in self.image_keys: + v = dataitem.get(k, None) + if v is None: + continue + if isinstance(v, Image): + images.append(v) + elif isinstance(v, dict) and "bytes" in v: + images.append(v["bytes"]) + else: + images.append(v) # path or url + if not images: + images = [None] + return images + + def build_prompt_text(self, data_item: Dict[str, Any]) -> str: + # Default: prompt + optional choices rendered inline + q = data_item.get(self.prompt_key, "") + choices = data_item.get(self.choice_key, []) if self.choice_key else [] + if not isinstance(choices, list): + choices = [choices] + if choices: + return f"{q}{choices}\n" + return str(q) + + def encode_prompt( + self, prompt_text: str, image_count: int + ) -> Tuple[torch.Tensor, int, Optional[str]]: + """ + Return (token_ids[L], length, prompt_text_used). If using chat template, encode with processor. + Subclasses may override to support alternative prompting. + """ + if self.use_chat_template: + if self._processor is None: + self._processor = AutoProcessor.from_pretrained( + self.cfg.actor.model.model_path ) - prompt_ids = prompt_ids[: self.max_prompt_length] - prompt_length = self.max_prompt_length - prompt_ids = batch_pad_to_fixed_len( - [prompt_ids], - self.max_prompt_length, - self.tokenizer.eos_token_id, - left_pad=True, - )[0] - answer = item.get(self.answer_key, None) - solution = item.get(self.solution_key, None) - - data_item = DatasetItem( - prompt_text=prompt_text, - prompt=prompt_ids, - length=prompt_length, - image_data=image_list, - answer=str(answer), - solution=solution, - idx=idx, + content: List[Dict[str, Any]] = [] + for _ in range(max(0, image_count)): + content.append({"type": "image"}) + content.append({"type": "text", "text": prompt_text}) + messages = [{"role": "user", "content": content}] + rendered = self._processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True ) - processed_data.append(data_item) - self.data = processed_data + ids = self._processor(text=[rendered], padding=True, return_tensors="pt")[ + "input_ids" + ] + if isinstance(ids, torch.Tensor): + if ids.dim() == 2 and ids.size(0) == 1: + ids = ids.squeeze(0) + ids = ids.to(dtype=torch.long) + else: + ids = torch.tensor(ids, dtype=torch.long) + return ids, int(ids.numel()), rendered + else: + # fallback: tokenizer only + ids_list = self.tokenizer.encode(prompt_text) + ids = torch.as_tensor(ids_list, dtype=torch.long) + return ids, int(ids.numel()), prompt_text + + def postprocess_dataset_item( + self, item: DatasetItem, raw: Dict[str, Any] + ) -> DatasetItem: + return item + + def _expand_data_paths(self, inputs: List[str]) -> List[str]: + exts = {".jsonl", ".json", ".parquet"} + files: List[str] = [] + for p in inputs: + if os.path.isdir(p): + for root, _, fnames in os.walk(p): + for fn in fnames: + ext = os.path.splitext(fn)[1].lower() + if ext in exts: + files.append(os.path.join(root, fn)) + else: + files.append(p) + files = 
sorted(set(files)) + return files - def _load_data(self) -> List: - merged_data = [] + def _eager_load_all(self) -> None: + merged: List[Dict[str, Any]] = [] for path in self.data_paths: - _, file_extension = os.path.splitext(path) + fmt = os.path.splitext(path)[1].lower() + if fmt == ".jsonl": + with open(path, "r", encoding="utf-8") as f: + merged.extend(json.loads(l) for l in f) + elif fmt == ".json": + with open(path, "r", encoding="utf-8") as f: + content = json.load(f) + if isinstance(content, list): + merged.extend(content) + else: + merged.append(content) + elif fmt == ".parquet": + try: + merged.extend(pd.read_parquet(path).to_dict(orient="records")) + except Exception as e: + raise RuntimeError(f"Failed to load parquet eagerly: {path}: {e}") + else: + logging.warning(f"Unsupported format {fmt} for path {path}, skipping.") + self._records = merged + # Build indices for consistency + self._indices = [("", "eager", i) for i in range(len(self._records))] + + def _build_lazy_indices(self) -> None: + self._indices.clear() + for path in self.data_paths: + fmt = os.path.splitext(path)[1].lower() + if fmt == ".jsonl": + # index by byte offsets for each line + offsets: List[int] = [] + with open(path, "rb") as fb: + pos = 0 + for line in fb: + offsets.append(pos) + pos += len(line) + self._indices.extend((path, "jsonl", off) for off in offsets) + elif fmt == ".json": + try: + with open(path, "r", encoding="utf-8") as f: + content = json.load(f) + if not isinstance(content, list): + content = [content] + # store the content to avoid re-reading + # keep perfile cache + self._json_cache = getattr(self, "_json_cache", {}) + self._json_cache[path] = content + self._indices.extend((path, "json", i) for i in range(len(content))) + except Exception as e: + raise RuntimeError(f"Failed to index json lazily: {path}: {e}") + elif fmt == ".parquet": + try: + import pyarrow.parquet as pq # type: ignore + + pf = pq.ParquetFile(path) + num_rows = pf.metadata.num_rows + # file handle cache + self._parquet_cache = getattr(self, "_parquet_cache", {}) + self._parquet_cache[path] = pf + self._indices.extend((path, "parquet", i) for i in range(num_rows)) + except Exception: + df = pd.read_parquet(path) + self._parquet_df_cache = getattr(self, "_parquet_df_cache", {}) + self._parquet_df_cache[path] = df + self._indices.extend( + (path, "parquet_pd", i) for i in range(len(df)) + ) + else: + logging.warning(f"Unsupported format {fmt} for path {path}, skipping.") + + def _load_single_lazy(self, path: str, fmt: str, key: Any) -> Dict[str, Any]: + if fmt == "eager": + return self._records[int(key)] + if fmt == "jsonl": + with open(path, "rb") as fb: + fb.seek(int(key)) + line = fb.readline() + return json.loads(line.decode("utf-8").strip()) + if fmt == "json": + return self._json_cache[path][int(key)] # type: ignore[attr-defined] + if fmt == "parquet": + # Try to use pyarrow lazily; rebuild cache if missing + self._parquet_cache = getattr(self, "_parquet_cache", {}) + pf = self._parquet_cache.get(path) + if pf is None: + try: + import pyarrow.parquet as pq # type: ignore + + pf = pq.ParquetFile(path) + self._parquet_cache[path] = pf + except Exception: + # Fall back to pandas-based cache + self._parquet_df_cache = getattr(self, "_parquet_df_cache", {}) + df = self._parquet_df_cache.get(path) + if df is None: + df = pd.read_parquet(path) + self._parquet_df_cache[path] = df + return df.iloc[int(key)].to_dict() + table = pf.read_row_group(key // max(1, pf.metadata.num_rows), columns=None) try: - pass - if file_extension 
== ".parquet": - loaded_data: List = pd.read_parquet(path).to_dict(orient="records") - merged_data.extend(loaded_data) - elif file_extension == ".jsonl": - with open(path, "r", encoding="utf-8") as file: - loaded_data = [json.loads(line.strip()) for line in file] - merged_data.extend(loaded_data) - elif file_extension == ".json": - with open(path, "r", encoding="utf-8") as file: - content = json.load(file) - if isinstance(content, list): - merged_data.extend(content) - else: - merged_data.append(content) - else: - print(f"Unsupport {file_extension}, skip: {path}") - except Exception as e: - raise RuntimeError(f"Load data error: {e}") - return merged_data + df = table.to_pandas() + return df.iloc[int(key) % len(df)].to_dict() + except Exception: + df_all = pf.read().to_pandas() + return df_all.iloc[int(key)].to_dict() + if fmt == "parquet_pd": + self._parquet_df_cache = getattr(self, "_parquet_df_cache", {}) + df = self._parquet_df_cache.get(path) + if df is None: + df = pd.read_parquet(path) + self._parquet_df_cache[path] = df + return df.iloc[int(key)].to_dict() + raise RuntimeError(f"Unknown lazy fmt {fmt}") + + def _process_raw_record(self, raw: Dict[str, Any], idx: int) -> DatasetItem: + images = self.get_image_list(raw) + prompt_text = self.build_prompt_text(raw) + prompt_ids, plen, rendered_text = self.encode_prompt(prompt_text, len(images)) + + if plen > self.max_prompt_length: + prompt_ids = prompt_ids[: self.max_prompt_length] + plen = self.max_prompt_length + prompt_ids = batch_pad_to_fixed_len( + [prompt_ids], self.max_prompt_length, self.eos_id, left_pad=True + )[0] - def __len__(self) -> int: - return len(self.data) + answer_val = raw.get(self.answer_key, None) if self.answer_key else None + solution_val = raw.get(self.solution_key, None) if self.solution_key else None + item = DatasetItem( + prompt=prompt_ids, + length=plen, + answer=str(answer_val) if answer_val is not None else None, + idx=idx, + image_data=images, + prompt_text=rendered_text or prompt_text, + solution=solution_val, + meta=None, + ) + return self.postprocess_dataset_item(item, raw) + + +class VLMDatasetRegistry: + registry: Dict[str, Callable[..., VLMBaseDataset]] = {} + + @classmethod + def register( + cls, name: str + ) -> Callable[[Callable[..., VLMBaseDataset]], Callable[..., VLMBaseDataset]]: + def decorator(klass: Callable[..., VLMBaseDataset]): + cls.registry[name] = klass + return klass + + return decorator + + @classmethod + def create( + cls, + dataset_name: Optional[str], + *, + data_paths: Union[List[str], str], + config: DictConfig, + tokenizer: AutoTokenizer, + ) -> VLMBaseDataset: + key = dataset_name.lower() + klass = cls.registry.get(key) + return klass(data_paths=data_paths, config=config, tokenizer=tokenizer) + + +@VLMDatasetRegistry.register("robo2vlm") +class Robo2VLMDataset(VLMBaseDataset): + def get_image_list(self, dataitem: Dict[str, Any]) -> List[Union[bytes, str, None]]: + # Prefer common robo2vlm fields if present, else fallback to configured keys + images: List[Any] = [] + if "images" in dataitem: + v = dataitem.get("images") + if isinstance(v, list): + images = list(v) + elif v is not None: + images = [v] + else: + images = [None] + elif "image" in dataitem: + v = dataitem.get("image") + if v is not None: + images = [v] + else: + images = [None] + else: + # fallback to base behavior using configured image_keys + return super().get_image_list(dataitem) + + # Normalize each element similar to base behavior + normed: List[Union[bytes, str, None]] = [] + for v in images: + if v is 
None: + continue + if isinstance(v, Image): + normed.append(v) + elif isinstance(v, dict) and "bytes" in v: + normed.append(v["bytes"]) # raw bytes + else: + normed.append(v) # path/uri/string + if not normed: + normed = [None] + return normed + + def build_prompt_text(self, data_item: Dict[str, Any]) -> str: + # Use 'question' and 'choices' if present; else fallback to base using configured prompt/choice keys + question = data_item.get("question", None) + choices = data_item.get("choices", None) + if question is None: + return super().build_prompt_text(data_item) + # normalize choices + if isinstance(choices, str): + try: + import ast - def __getitem__(self, index): - return self.data[index] + choices = ast.literal_eval(choices) + except Exception: + choices = [choices] + if not isinstance(choices, list): + choices = [choices] if choices is not None else [] + + text = f"{question}\n" + if choices: + text += "Choices:\n" + for i, c in enumerate(choices): + text += f"{chr(65 + i)}. {c}\n" + return text + + def postprocess_dataset_item( + self, item: DatasetItem, raw: Dict[str, Any] + ) -> DatasetItem: + # Derive answer from 'correct_answer' and 'choices' if not provided + if not item.answer or str(item.answer).lower() in {"none", "", "null"}: + choices = raw.get("choices") + ca = raw.get("correct_answer") + try: + # Normalize choices + if isinstance(choices, str): + import ast + + choices = ast.literal_eval(choices) + if not isinstance(choices, list): + choices = [choices] if choices is not None else [] + + ans_val: Optional[str] = None + if isinstance(ca, int) and 0 <= ca < len(choices): + ans_val = str(choices[ca]) + elif isinstance(ca, str): + cstr = ca.strip() + # Letter index like 'A', 'B', ... + if len(cstr) == 1 and "A" <= cstr <= "Z": + idx = ord(cstr) - ord("A") + if 0 <= idx < len(choices): + ans_val = str(choices[idx]) + # Direct match to a choice value + if ans_val is None and choices: + for ch in choices: + if str(ch) == cstr: + ans_val = cstr + break + if ans_val is not None: + item.answer = ans_val + except Exception: + # Keep original if any + pass + return item def create_rl_dataset(config: DictConfig, tokenizer): @@ -344,7 +612,25 @@ def create_rl_dataset(config: DictConfig, tokenizer): if config.data.type == "math": dataset_cls = MathDataset elif config.data.type == "vision_language": - dataset_cls = VisionLanguageDataset + # Prefer new factory-based VLM datasets; fallback to legacy if requested + dataset_name = getattr(config.data, "dataset_name", None) + lazy_loading = bool(getattr(config.data, "lazy_loading", False)) + + print(f"Using VLM dataset: name={dataset_name}, lazy_loading={lazy_loading}") + + train_dataset = VLMDatasetRegistry.create( + dataset_name, + data_paths=config.data.train_data_paths, + config=config, + tokenizer=tokenizer, + ) + val_dataset = VLMDatasetRegistry.create( + dataset_name, + data_paths=config.data.val_data_paths, + config=config, + tokenizer=tokenizer, + ) + return train_dataset, val_dataset else: return None, None From fa0fc753d30a2c7a386b6a42df23063f7ba4608a Mon Sep 17 00:00:00 2001 From: Bo Dai Date: Fri, 19 Sep 2025 11:13:33 +0000 Subject: [PATCH 25/57] fix(vllm): fix wrong image_data param when running vlm in vllm Signed-off-by: Bo Dai --- .../math/config/qwen2.5-1.5b-grpo-fsdp.yaml | 28 +++++++++++-------- rlinf/data/datasets.py | 1 - .../hybrid_engines/vllm/vllm_0_8_5/worker.py | 5 +++- 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/examples/math/config/qwen2.5-1.5b-grpo-fsdp.yaml 
b/examples/math/config/qwen2.5-1.5b-grpo-fsdp.yaml index d8a1e8c3f..55285baa8 100644 --- a/examples/math/config/qwen2.5-1.5b-grpo-fsdp.yaml +++ b/examples/math/config/qwen2.5-1.5b-grpo-fsdp.yaml @@ -8,7 +8,7 @@ hydra: cluster: num_nodes: 1 - num_gpus_per_node: 8 + num_gpus_per_node: 4 component_placement: actor,rollout: all @@ -33,8 +33,7 @@ runner: resume_dir: null experiment_name: grpo-1.5b - output_dir: ../results - + output_dir: /mnt/public/daibo/results algorithm: group_size: 8 @@ -85,7 +84,7 @@ rollout: gpu_memory_utilization: 0.55 - model_dir: /mnt/public/hf_models/DeepSeek-R1-Distill-Qwen-1.5B + model_dir: /mnt/public/hf_models/qwen2.5-VL-3B/ model_arch: qwen2.5 enforce_eager: False # if False, rollout engine will capture cuda graph, which will take more time to initialize. distributed_executor_backend: mp # ray or mp @@ -94,7 +93,7 @@ rollout: padding: null # will be tokenizer.pad_token_id if null. it is used to filter megatron's padding for rollout engine eos: null # will be tokenizer.eos_token_id if null. - rollout_backend: sglang # here choose which backend to rollout,support [sglang, vllm] + rollout_backend: vllm # here choose which backend to rollout,support [sglang, vllm] sglang: attention_backend: triton # [flashinfer, triton] for more, see sglang's doc @@ -121,18 +120,25 @@ rollout: cuda_graph_max_bs: 128 # the maximum batch size for cuda graph. If the batch size is larger than this, cuda graph will not be used. data: - type: math + type: vision_language + dataset_name: robo2vlm max_prompt_length: 1024 filter_prompt_by_length: True rollout_batch_size: 8 val_rollout_batch_size: null num_workers: 2 - prompt_key: prompt shuffle: True validation_shuffle: True seed: 1234 - train_data_paths: ["/mnt/public/guozhen/data/boba_106k_0319_prompt_1024.jsonl"] - val_data_paths: ["/mnt/public/guozhen/data/boba_106k_0319_prompt_1024.jsonl"] + train_data_paths: ["/mnt/public/daibo/dataset/robo2vlm-1/data/"] + val_data_paths: ["/mnt/public/daibo/dataset/robo2vlm-1/data/"] + prompt_key: question + image_keys: [image] + answer_key: answer + choice_key: choices + solution_key: null + use_chat_template: True + lazy_loading: True actor: group_name: "ActorGroup" @@ -159,7 +165,7 @@ actor: seq_length: ${runner.seq_length} encoder_seq_length: ${runner.seq_length} - model_path: /mnt/public/hf_models/DeepSeek-R1-Distill-Qwen-1.5B + model_path: /mnt/public/hf_models/qwen2.5-VL-3B/ optim: optimizer: adam @@ -189,7 +195,7 @@ actor: lr_decay_iters: 10 tokenizer: - tokenizer_model: /mnt/public/hf_models/DeepSeek-R1-Distill-Qwen-1.5B + tokenizer_model: /mnt/public/hf_models/qwen2.5-VL-3B/ use_fast: False trust_remote_code: True padding_side: 'right' diff --git a/rlinf/data/datasets.py b/rlinf/data/datasets.py index 2be41ce3b..107e89d55 100644 --- a/rlinf/data/datasets.py +++ b/rlinf/data/datasets.py @@ -177,7 +177,6 @@ def __getitem__(self, idx): class VLMBaseDataset(Dataset): - def __init__( self, data_paths: Union[List[str], str], diff --git a/rlinf/hybrid_engines/vllm/vllm_0_8_5/worker.py b/rlinf/hybrid_engines/vllm/vllm_0_8_5/worker.py index 3e021e922..519895e49 100644 --- a/rlinf/hybrid_engines/vllm/vllm_0_8_5/worker.py +++ b/rlinf/hybrid_engines/vllm/vllm_0_8_5/worker.py @@ -48,6 +48,9 @@ def __init__( ) # rlinf specific self.rlinf_config = rlinf_config + self.using_sharded_weight = ( + False if self.rlinf_config.actor.training_backend == "fsdp" else True + ) self._rlinf_worker = _RLinfWorker( parent_address=parent_address, world_size=vllm_config.parallel_config.world_size, @@ -103,7 +106,7 @@ def 
sync_hf_weight(self) -> None: def use_sharded_weights(self) -> None: model = self.model_runner.model for _, param in model.named_parameters(): - setattr(param, "is_sharded_weight", True) + setattr(param, "is_sharded_weight", self.using_sharded_weight) def get_dp_rank(self) -> int: return self._rlinf_worker.get_parent_rank() From 6a7d4bc3b8d7cf69ed3732ca24c56c069af26899 Mon Sep 17 00:00:00 2001 From: guozhen1997 <2997871698@qq.com> Date: Mon, 22 Sep 2025 21:22:50 +0800 Subject: [PATCH 26/57] feat: add vqa reward function, unify math and vqa reward Signed-off-by: guozhen1997 <2997871698@qq.com> --- rlinf/algorithms/rewards/__init__.py | 15 +++++ rlinf/algorithms/rewards/math/__init__.py | 24 +++++++ rlinf/algorithms/rewards/vqa/__init__.py | 42 +++++++++++++ .../algorithms/rewards/vqa/format_rewards.py | 53 ++++++++++++++++ rlinf/algorithms/rewards/vqa/qa_rewards.py | 63 +++++++++++++++++++ rlinf/data/datasets.py | 4 +- .../hybrid_engines/fsdp/fsdp_model_manager.py | 1 - rlinf/workers/actor/fsdp_actor_worker.py | 16 ++--- 8 files changed, 205 insertions(+), 13 deletions(-) create mode 100644 rlinf/algorithms/rewards/__init__.py create mode 100644 rlinf/algorithms/rewards/math/__init__.py create mode 100644 rlinf/algorithms/rewards/vqa/__init__.py create mode 100644 rlinf/algorithms/rewards/vqa/format_rewards.py create mode 100644 rlinf/algorithms/rewards/vqa/qa_rewards.py diff --git a/rlinf/algorithms/rewards/__init__.py b/rlinf/algorithms/rewards/__init__.py new file mode 100644 index 000000000..3a48b84b6 --- /dev/null +++ b/rlinf/algorithms/rewards/__init__.py @@ -0,0 +1,15 @@ +from .math import MathReward +from .vqa import VQAReward + +def register_reward(name: str, reward_class: type): + assert name not in reward_registry, f"Reward {name} already registered" + reward_registry[name] = reward_class + +def get_reward_class(name: str): + assert name in reward_registry, f"Reward {name} not found" + return reward_registry[name] + +reward_registry = {} + +register_reward("math", MathReward) +register_reward("vqa", VQAReward) \ No newline at end of file diff --git a/rlinf/algorithms/rewards/math/__init__.py b/rlinf/algorithms/rewards/math/__init__.py new file mode 100644 index 000000000..a94ff2dc4 --- /dev/null +++ b/rlinf/algorithms/rewards/math/__init__.py @@ -0,0 +1,24 @@ +from typing import List +from omegaconf import DictConfig +from toolkits.math_verifier.verify import math_verify_call + + +class MathReward: + def __init__(self, config: DictConfig): + self.scale = config.get("scale", 1.0) + + def get_reward( + self, response: List[str], reference: List[List[str]] + ) -> List[float]: + """ + Calculates reward scores for a list of responses compared to corresponding lists of reference answers. + For each response, the function checks if it matches any of the provided references using the `process_results` function. + The reward for each response is computed as the first element of the result (converted to float) multiplied by `self.scale`. + Args: + response (List[str]): A list of response strings to be evaluated. + reference (List[List[str]]): A list where each element is a list of reference strings corresponding to each response. + Returns: + List[float]: A list of reward scores, one for each response. 
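+
+        Example (illustrative only; assumes the verifier scores a matching
+        response as 1.0):
+            get_reward(["\\boxed{4}"], [["4"]]) -> [1.0] with the default
+            scale of 1.0.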
+        """
+
+        return math_verify_call(response, reference) * self.scale
\ No newline at end of file
diff --git a/rlinf/algorithms/rewards/vqa/__init__.py b/rlinf/algorithms/rewards/vqa/__init__.py
new file mode 100644
index 000000000..d4dd55f20
--- /dev/null
+++ b/rlinf/algorithms/rewards/vqa/__init__.py
@@ -0,0 +1,42 @@
+import torch
+from typing import List
+from omegaconf import DictConfig
+from .qa_rewards import qa_accuracy_reward
+from .format_rewards import think_format_reward, answer_format_reward
+
+
+class VQAReward:
+    def __init__(self, config: DictConfig):
+        self.reward_weights = config.get("reward_weights", {
+            "qa_accuracy": 1.0,
+            "think_format": 0.0,
+            "answer_format": 0.0,
+        })
+        for reward_name, reward_weight in self.reward_weights.items():
+            assert reward_name in ["qa_accuracy", "think_format", "answer_format"], f"Reward {reward_name} not supported"
+            assert reward_weight >= 0, f"Reward weight {reward_weight} must be non-negative"
+        self.reward_weights = [reward_weight["qa_accuracy"], reward_weight["think_format"], reward_weight["answer_format"]]
+
+        self.reward_functions = [qa_accuracy_reward, think_format_reward, answer_format_reward]
+
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    def get_reward(self, completions: List[str], answers: List[str]) -> List[float]:
+        rewards = []
+        for i, reward_function in enumerate(self.reward_functions):
+            if self.reward_weights[i] > 0:
+                rewards.append(reward_function(completions, answers))
+            else:
+                rewards.append([0.0] * len(completions))
+
+        # Apply weights to each reward function's output and sum
+
+        # rewards [num_reward_functions, len(completions)]
+        rewards_tensor = torch.tensor(rewards, device=self.device)
+        weights_tensor = torch.tensor(self.reward_weights, device=self.device)
+
+        # [num_reward_functions, num_completions] * [num_reward_functions, 1] -> [num_completions]
+        final_rewards = (rewards_tensor * weights_tensor.unsqueeze(1)).sum(dim=0)
+
+        return final_rewards.tolist()
+
\ No newline at end of file
diff --git a/rlinf/algorithms/rewards/vqa/format_rewards.py b/rlinf/algorithms/rewards/vqa/format_rewards.py
new file mode 100644
index 000000000..2f926e733
--- /dev/null
+++ b/rlinf/algorithms/rewards/vqa/format_rewards.py
@@ -0,0 +1,53 @@
+import re
+from typing import List
+
+
+def think_format_reward(completions, answers) -> List[float]:
+    """
+    Think format reward function compatible with GRPO training.
+
+    Reward function that checks if reasoning is enclosed within <think></think> tags.
+
+    Args:
+        completions: List of model completions (text strings)
+
+    Returns:
+        List of reward scores (1.0 for correct format, 0.0 otherwise)
+    """
+    pattern = r"^<think>(?!.*<think>)(.*?)</think>.*$"
+    rewards = []
+
+    for completion in completions:
+        completion_text = str(completion).strip()
+        match = re.match(pattern, completion_text, re.DOTALL | re.MULTILINE)
+        rewards.append(1.0 if match else 0.0)
+
+    return rewards
+
+
+def answer_format_reward(completions, answers) -> List[float]:
+    """
+    Reward function that checks for proper answer formatting.
+
+    Expected format: <answer>X. content</answer> where X is a choice letter.
+
+    Args:
+        completions: List of model completions (text strings)
+
+    Returns:
+        List of reward scores (1.0 for correct format, 0.0 otherwise)
+    """
+    rewards = []
+
+    for completion in completions:
+        completion_text = str(completion).strip()
+
+        # Check for proper answer format: <answer>X. content</answer>
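+        # e.g. (illustrative completion that passes this check):
+        #   "<think>the gripper is closed</think> <answer>B. closed</answer>"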
+        answer_pattern = r'<answer>\s*[A-E]\.\s*.+?\s*</answer>'
+        has_proper_answer = bool(re.search(
+            answer_pattern, completion_text, re.DOTALL | re.IGNORECASE
+        ))
+
+        rewards.append(1.0 if has_proper_answer else 0.0)
+
+    return rewards
\ No newline at end of file
diff --git a/rlinf/algorithms/rewards/vqa/qa_rewards.py b/rlinf/algorithms/rewards/vqa/qa_rewards.py
new file mode 100644
index 000000000..ce7a3443b
--- /dev/null
+++ b/rlinf/algorithms/rewards/vqa/qa_rewards.py
@@ -0,0 +1,63 @@
+import re
+from typing import List
+
+
+def qa_accuracy_reward(completions, answers) -> List[float]:
+    """
+    Reward function that evaluates question-answering accuracy for VQA tasks.
+
+    Based on TRL's accuracy_reward pattern but adapted for multiple choice VQA.
+
+    Args:
+        completions: List of model completions (text strings)
+        answers: List of correct answers (text strings)
+
+    Returns:
+        List of reward scores (1.0 for correct, 0.0 for incorrect)
+    """
+    rewards = []
+
+    for completion, answer in zip(completions, answers):
+        completion_text = str(completion).strip()
+
+        # Extract answer from completion - look for <answer>X. content</answer>
+        patterns = [
+            r'<answer>\s*[A-E]\.\s*(.*?)\s*</answer>',
+            r'<answer>\s*[A-E]\s*(.*?)\s*</answer>',
+            r'<answer>\s*(.*?)\s*</answer>',
+        ]
+
+        answer_match = None
+        for pattern in patterns:
+            answer_match = re.search(pattern, completion_text, re.DOTALL | re.IGNORECASE)
+            if answer_match:
+                break
+
+        if not answer_match:
+            rewards.append(0.0)
+            continue
+
+        predicted_content = answer_match.group(1).strip()
+
+        content_match = _compare_choice_content(predicted_content, answer)
+
+        rewards.append(1.0 if content_match else 0.0)
+
+    return rewards
+
+
+def _compare_choice_content(predicted: str, correct: str) -> bool:
+    """Compare predicted choice content with correct content."""
+    # Simple normalized comparison
+    pred_normalized = predicted.lower().strip()
+    correct_normalized = correct.lower().strip()
+
+    # Direct match
+    if pred_normalized == correct_normalized:
+        return True
+
+    # Partial match for more flexibility
+    if pred_normalized in correct_normalized or correct_normalized in pred_normalized:
+        return True
+
+    return False
\ No newline at end of file
diff --git a/rlinf/data/datasets.py b/rlinf/data/datasets.py
index 107e89d55..2669b14e0 100644
--- a/rlinf/data/datasets.py
+++ b/rlinf/data/datasets.py
@@ -489,8 +489,8 @@ def create(
         tokenizer: AutoTokenizer,
     ) -> VLMBaseDataset:
         key = dataset_name.lower()
-        klass = cls.registry.get(key)
-        return klass(data_paths=data_paths, config=config, tokenizer=tokenizer)
+        dataset_class = cls.registry.get(key)
+        return dataset_class(data_paths=data_paths, config=config, tokenizer=tokenizer)


 @VLMDatasetRegistry.register("robo2vlm")
diff --git a/rlinf/hybrid_engines/fsdp/fsdp_model_manager.py b/rlinf/hybrid_engines/fsdp/fsdp_model_manager.py
index 5f04d633f..c3bd9475a 100644
--- a/rlinf/hybrid_engines/fsdp/fsdp_model_manager.py
+++ b/rlinf/hybrid_engines/fsdp/fsdp_model_manager.py
@@ -68,7 +68,6 @@ def model_provider_func(self) -> torch.nn.Module:
         else:
             auto_model_class = AutoModelForCausalLM

-        # TODO: fix this, load model in float16/bfloat16 may cause optimizer in bf16, which is incorrect
         # default load in float16
         model = auto_model_class.from_pretrained(
             self._cfg.model.model_path,
diff --git a/rlinf/workers/actor/fsdp_actor_worker.py b/rlinf/workers/actor/fsdp_actor_worker.py
index 37dd1d9ba..e5432efe9 100644
--- a/rlinf/workers/actor/fsdp_actor_worker.py
+++ b/rlinf/workers/actor/fsdp_actor_worker.py
@@ -17,6 +17,7 @@
 from typing import Dict, List, Tuple

 import numpy as np
+from
RLinf.rlinf.algorithms.rewards import get_reward_class import torch from omegaconf import DictConfig from torch.distributed.device_mesh import init_device_mesh @@ -60,7 +61,6 @@ seq_mean_token_sum, ) from rlinf.workers.rollout.utils import RankMapper -from toolkits.math_verifier.verify import math_verify_call class FSDPActor(FSDPModelManager, Worker): @@ -110,8 +110,9 @@ def __init__(self, cfg: DictConfig, placement: ModelParallelComponentPlacement): # Reward configurations if not self.cfg.reward.use_reward_model: - assert self.cfg.reward.reward_type == "math", "only support math" - self.reward_fn = math_verify_call + assert self.cfg.reward.reward_type in ["math", "vqa"], "only support math and vqa reward!" + reward_cls = get_reward_class(self.cfg.reward.reward_type) + self.reward = reward_cls(self.cfg.reward) def init_worker(self): self.setup_model_and_optimizer() @@ -417,13 +418,8 @@ def _compute_batch_rewards( texts.append( self.tokenizer.decode(response.tolist(), skip_special_tokens=True) ) - rewards = self.reward_fn(texts, answers) - reward_scores = [ - self.cfg.reward.reward_scale - if reward == 1 - else -self.cfg.reward.reward_scale - for reward in rewards - ] + reward_scores = self.reward.get_reward(texts, answers) + all_reward_scores = torch.as_tensor( reward_scores, dtype=torch.float, From 7100e6b7eb36555810fe1861268ee20cf2ddf7b9 Mon Sep 17 00:00:00 2001 From: guozhen1997 <2997871698@qq.com> Date: Mon, 22 Sep 2025 22:03:39 +0800 Subject: [PATCH 27/57] feat: add reward worker Signed-off-by: guozhen1997 <2997871698@qq.com> --- .../math/config/qwen2.5-1.5b-grpo-fsdp.yaml | 9 +- examples/math/main_math.py | 8 ++ rlinf/runners/math_runner.py | 3 +- rlinf/utils/placement.py | 8 ++ rlinf/workers/reward/reward_worker.py | 101 ++++++++++++++++++ 5 files changed, 126 insertions(+), 3 deletions(-) create mode 100644 rlinf/workers/reward/reward_worker.py diff --git a/examples/math/config/qwen2.5-1.5b-grpo-fsdp.yaml b/examples/math/config/qwen2.5-1.5b-grpo-fsdp.yaml index 55285baa8..822008f04 100644 --- a/examples/math/config/qwen2.5-1.5b-grpo-fsdp.yaml +++ b/examples/math/config/qwen2.5-1.5b-grpo-fsdp.yaml @@ -201,9 +201,14 @@ actor: padding_side: 'right' reward: + group_name: "ActorGroup" use_reward_model: false - reward_type: 'math' - reward_scale: 5.0 + reward_type: 'vqa' + # reward_scale: 5.0 + reward_weights: + qa_accuracy: 1.0 + think_format: 0.0 + answer_format: 0.0 critic: use_critic_model: false \ No newline at end of file diff --git a/examples/math/main_math.py b/examples/math/main_math.py index 80f408bd0..37c75aca8 100644 --- a/examples/math/main_math.py +++ b/examples/math/main_math.py @@ -28,6 +28,7 @@ from rlinf.workers.actor import get_actor_worker from rlinf.workers.inference.megatron_inference_worker import MegatronInference from rlinf.workers.rollout.utils import get_rollout_backend_worker +from rlinf.workers.reward.reward_worker import RewardWorker """Script to start GRPO training""" mp.set_start_method("spawn", force=True) @@ -66,6 +67,12 @@ def main(cfg) -> None: name=cfg.inference.group_name, placement_strategy=inference_placement_strategy, ) + + # Reward group + reward_placement_strategy = component_placement.get_strategy("reward") + reward_group = RewardWorker.create_group(cfg, component_placement).launch( + cluster, name=cfg.reward.group_name, placement_strategy=reward_placement_strategy + ) # GRPO Actor group actor_worker_cls = get_actor_worker(cfg) @@ -85,6 +92,7 @@ def main(cfg) -> None: rollout=rollout_group, inference=inference_group, actor=actor_group, + 
reward=reward_group,
    )

    runner.init_workers()
diff --git a/rlinf/runners/math_runner.py b/rlinf/runners/math_runner.py
index be2cfac1f..c0326a8f7 100644
--- a/rlinf/runners/math_runner.py
+++ b/rlinf/runners/math_runner.py
@@ -35,6 +35,7 @@
 from rlinf.utils.timers import Timer
 from rlinf.workers.actor.megatron_actor_worker import MegatronActor
 from rlinf.workers.inference.megatron_inference_worker import MegatronInference
+from rlinf.workers.reward.reward_worker import RewardWorker

 if typing.TYPE_CHECKING:
     from rlinf.workers.rollout.sglang.sglang_worker import SGLangWorker
@@ -55,7 +56,7 @@ def __init__(
         rollout: Union["SGLangWorker", "VLLMWorker"],
         inference: Optional[MegatronInference],
         actor: MegatronActor,
-        reward: Optional[Worker] = None,
+        reward: Optional[RewardWorker] = None,
     ):
         """"""
         self.cfg = cfg
diff --git a/rlinf/utils/placement.py b/rlinf/utils/placement.py
index 10738ce9e..d6fa798c8 100644
--- a/rlinf/utils/placement.py
+++ b/rlinf/utils/placement.py
@@ -202,6 +202,7 @@ def __init__(self, config: DictConfig, cluster: Cluster):
         self._actor_gpus = self._component_gpu_map.get("actor", None)
         self._inference_gpus = self._component_gpu_map.get("inference", None)
         self._rollout_gpus = self._component_gpu_map.get("rollout", None)
+        self._reward_gpus = self._component_gpu_map.get("reward", None)
         assert self._actor_gpus is not None, (
             "Actor GPUs must be specified in the component_placement config."
         )
@@ -224,6 +225,7 @@ def __init__(self, config: DictConfig, cluster: Cluster):
             len(self._inference_gpus) if self._inference_gpus else 0
         )
         self._rollout_num_gpus = len(self._rollout_gpus)
+        self._reward_num_gpus = len(self._reward_gpus) if self._reward_gpus else 0

         if self._is_collocated():
             assert self._inference_gpus is None, (
@@ -295,6 +297,10 @@ def _generate_placements(self):
                 num_accelerators_per_process=rollout_tp_size,
                 stride=stride,
             )
+            if self._reward_gpus is not None:
+                self._placements["reward"] = PackedPlacementStrategy(
+                    self._reward_gpus[0], self._reward_gpus[-1]
+                )
         elif self._placement_mode == PlacementMode.DISAGGREGATED:
             # Generate continuous placement strategies for components in a cluster.
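            # Illustrative (hypothetical) component_placement that exercises
            # this disaggregated branch, in the style of the example YAML
            # configs (the GPU-range syntax here is assumed):
            #   component_placement:
            #     rollout: 0-3
            #     inference: 4-5
            #     actor: 6
            #     reward: 7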
num_gpus_per_rollout_dp = len(self._rollout_gpus) // self.rollout_dp_size @@ -310,6 +315,9 @@ def _generate_placements(self): self._placements["actor"] = PackedPlacementStrategy( self._actor_gpus[0], self._actor_gpus[-1] ) + self._placements["reward"] = PackedPlacementStrategy( + self._reward_gpus[0], self._reward_gpus[-1] + ) @property def is_disaggregated(self): diff --git a/rlinf/workers/reward/reward_worker.py b/rlinf/workers/reward/reward_worker.py new file mode 100644 index 000000000..bc98d5a6a --- /dev/null +++ b/rlinf/workers/reward/reward_worker.py @@ -0,0 +1,101 @@ +import torch +from typing import Dict, Tuple, List +from omegaconf import DictConfig +from rlinf.hybrid_engines.fsdp.fsdp_model_manager import FSDPModelManager +from rlinf.scheduler import Worker, Channel +from rlinf.algorithms.rewards import get_reward_class +from rlinf.data.io_struct import RolloutResult + + +class RewardWorker(Worker, FSDPModelManager): + def __init__(self, cfg: DictConfig): + Worker.__init__(self) + super().__init__(cfg.reward) + self.cfg = cfg + + self.total_batch_size_per_dp = ( + self.cfg.data.rollout_batch_size + * self.cfg.algorithm.get("group_size", 1) + // self._world_size + ) + + def init_worker(self): + if self.cfg.reward.use_reward_model: + self.setup_model_and_optimizer() + self.offload_fsdp_param_and_grad() + self.offload_fsdp_optimizer() + else: + self.reward = get_reward_class(self.cfg.reward.name)(self.cfg.reward) + + def get_batch( + self, channel: Channel + ) -> Tuple[Dict[str, torch.Tensor], RolloutResult]: + result: RolloutResult = channel.get() + + batch = result.to_actor_batch( + self.cfg.data.max_prompt_length, + self.cfg.actor.model.encoder_seq_length, + self.tokenizer.eos_token_id, + ) + return batch, result + + def compute_rewards(self, input_channel: Channel, output_channel: Channel): + """Compute rewards. + + Args: + input_channel: The input channel to read from. + output_channel: The output channel to send results to. 
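+
+        Sketch of the intended flow (as wired up elsewhere in this series,
+        not an API contract): rollout results arrive on ``input_channel``,
+        rule-based or model-based rewards are attached to each RolloutResult,
+        and the result is forwarded to ``output_channel`` for advantage
+        computation.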
+ """ + + with self.worker_timer(): + recv_batch_size = 0 + while recv_batch_size < self.total_batch_size_per_dp: + batch, rollout_result = self.get_batch(input_channel) + recv_batch_size += rollout_result.num_sequence + + # Compute rule-based reward + if rollout_result.rewards is None: + rollout_result.rewards = self._compute_batch_rewards( + batch, rollout_result.answers + ) + output_channel.put(rollout_result) + + assert recv_batch_size == self.total_batch_size_per_dp, ( + f"Expected {self.total_batch_size_per_dp} sequences from channel, but got {recv_batch_size}" + ) + + def _compute_batch_rewards( + self, batch: Dict[str, torch.Tensor], answers: List[str] + ): + """Reward computation using non-model based reward.""" + + if self.cfg.reward.use_reward_model: + return self.compute_batch_rewards_with_model(batch) + + texts = [] + for response, response_len in zip( + batch["input_ids"], + batch["response_lengths"], + ): + response = response[ + self.cfg.data.max_prompt_length : self.cfg.data.max_prompt_length + + response_len + ] + texts.append( + self.tokenizer.decode(response.tolist(), skip_special_tokens=True) + ) + reward_scores = self.reward.get_reward(texts, answers) + + all_reward_scores = torch.as_tensor( + reward_scores, + dtype=torch.float, + device=torch.device("cpu"), + ).view(-1, 1) + return all_reward_scores.flatten() + + def compute_batch_rewards_with_model(self, batch: Dict[str, torch.Tensor]): + self.model.eval() + with torch.no_grad(): + # TODO: fix this + rewards = self.model(batch["input_ids"], batch["attention_mask"]) + return rewards \ No newline at end of file From f7c2fbae79743a8b8a990850293700dce80df549 Mon Sep 17 00:00:00 2001 From: guozhen1997 <2997871698@qq.com> Date: Tue, 23 Sep 2025 16:47:31 +0800 Subject: [PATCH 28/57] fix: fix vqa reward bugs and ruff format Signed-off-by: guozhen1997 <2997871698@qq.com> --- examples/math/main_math.py | 8 +- ...tron.yaml => qwen2.5-vl-3b-grpo-fsdp.yaml} | 25 ++++- examples/vlm/main_vlm.py | 10 ++ examples/vlm/run_main_vlm_grpo_megatron.sh | 2 +- rlinf/algorithms/rewards/__init__.py | 19 +++- rlinf/algorithms/rewards/math/__init__.py | 18 ++- rlinf/algorithms/rewards/vqa/__init__.py | 64 ++++++++--- .../algorithms/rewards/vqa/format_rewards.py | 50 ++++++--- rlinf/algorithms/rewards/vqa/qa_rewards.py | 105 +++++++++++++----- rlinf/data/datasets.py | 72 ++++++------ rlinf/data/io_struct.py | 2 +- rlinf/runners/math_runner.py | 2 +- rlinf/workers/actor/fsdp_actor_worker.py | 6 +- rlinf/workers/reward/reward_worker.py | 34 ++++-- 14 files changed, 290 insertions(+), 127 deletions(-) rename examples/vlm/config/{qwen2.5-vl-3b-grpo-megatron.yaml => qwen2.5-vl-3b-grpo-fsdp.yaml} (90%) diff --git a/examples/math/main_math.py b/examples/math/main_math.py index 37c75aca8..5b9ee61e1 100644 --- a/examples/math/main_math.py +++ b/examples/math/main_math.py @@ -27,8 +27,8 @@ from rlinf.utils.utils import output_redirector from rlinf.workers.actor import get_actor_worker from rlinf.workers.inference.megatron_inference_worker import MegatronInference -from rlinf.workers.rollout.utils import get_rollout_backend_worker from rlinf.workers.reward.reward_worker import RewardWorker +from rlinf.workers.rollout.utils import get_rollout_backend_worker """Script to start GRPO training""" mp.set_start_method("spawn", force=True) @@ -67,11 +67,13 @@ def main(cfg) -> None: name=cfg.inference.group_name, placement_strategy=inference_placement_strategy, ) - + # Reward group reward_placement_strategy = component_placement.get_strategy("reward") reward_group 
= RewardWorker.create_group(cfg, component_placement).launch( - cluster, name=cfg.reward.group_name, placement_strategy=reward_placement_strategy + cluster, + name=cfg.reward.group_name, + placement_strategy=reward_placement_strategy, ) # GRPO Actor group diff --git a/examples/vlm/config/qwen2.5-vl-3b-grpo-megatron.yaml b/examples/vlm/config/qwen2.5-vl-3b-grpo-fsdp.yaml similarity index 90% rename from examples/vlm/config/qwen2.5-vl-3b-grpo-megatron.yaml rename to examples/vlm/config/qwen2.5-vl-3b-grpo-fsdp.yaml index cfe4febe7..02b5d8ea4 100644 --- a/examples/vlm/config/qwen2.5-vl-3b-grpo-megatron.yaml +++ b/examples/vlm/config/qwen2.5-vl-3b-grpo-fsdp.yaml @@ -10,7 +10,7 @@ cluster: num_nodes: 1 num_gpus_per_node: 8 component_placement: - actor,rollout: all + actor,rollout,reward: all runner: task_type: math @@ -94,7 +94,7 @@ rollout: padding: null # will be tokenizer.pad_token_id if null. it is used to filter megatron's padding for rollout engine eos: null # will be tokenizer.eos_token_id if null. - rollout_backend: sglang # here choose which backend to rollout,support [sglang, vllm] + rollout_backend: vllm # here choose which backend to rollout,support [sglang, vllm] sglang: attention_backend: triton # [flashinfer, triton] for more, see sglang's doc @@ -122,6 +122,7 @@ rollout: data: type: vision_language + dataset_name: robo2vlm max_prompt_length: 1024 filter_prompt_by_length: True rollout_batch_size: 8 @@ -133,11 +134,12 @@ data: answer_key: "answer" solution_key: "solution" use_chat_template: True + lazy_loading: True shuffle: True validation_shuffle: True seed: 1234 - train_data_paths: ["/mnt/public/guozhen/data/science_qa/train-00000-of-00001-1028f23e353fbe3e.parquet"] - val_data_paths: ["/mnt/public/guozhen/data/science_qa/test-00000-of-00001-f0e719df791966ff.parquet"] + train_data_paths: ["/mnt/public/guozhen/data/robo2vlm/train/"] + val_data_paths: ["/mnt/public/guozhen/data/robo2vlm/test/"] actor: group_name: "ActorGroup" @@ -202,9 +204,20 @@ actor: padding_side: 'right' reward: + group_name: "RewardGroup" use_reward_model: false - reward_type: 'math' - reward_scale: 5.0 + reward_type: 'vqa' + # reward_scale: 5.0 + reward_weights: + qa_accuracy: 1.0 + think_format: 0.0 + answer_format: 0.0 + + tokenizer: + tokenizer_model: /mnt/public/hf_models/Qwen2.5-VL-3B-Instruct + use_fast: False + trust_remote_code: True + padding_side: 'right' critic: use_critic_model: false \ No newline at end of file diff --git a/examples/vlm/main_vlm.py b/examples/vlm/main_vlm.py index 605577fba..6ed11dd75 100644 --- a/examples/vlm/main_vlm.py +++ b/examples/vlm/main_vlm.py @@ -27,6 +27,7 @@ from rlinf.utils.utils import output_redirector from rlinf.workers.actor import get_actor_worker from rlinf.workers.inference.megatron_inference_worker import MegatronInference +from rlinf.workers.reward.reward_worker import RewardWorker from rlinf.workers.rollout.utils import get_rollout_backend_worker """Script to start GRPO training""" @@ -69,6 +70,14 @@ def main(cfg) -> None: placement_strategy=inference_placement_strategy, ) + # Reward group + reward_placement_strategy = component_placement.get_strategy("reward") + reward_group = RewardWorker.create_group(cfg, component_placement).launch( + cluster, + name=cfg.reward.group_name, + placement_strategy=reward_placement_strategy, + ) + # GRPO Actor group actor_worker_cls = get_actor_worker(cfg) actor_placement_strategy = component_placement.get_strategy("actor") @@ -87,6 +96,7 @@ def main(cfg) -> None: rollout=rollout_group, inference=inference_group, 
actor=actor_group, + reward=reward_group, ) runner.init_workers() diff --git a/examples/vlm/run_main_vlm_grpo_megatron.sh b/examples/vlm/run_main_vlm_grpo_megatron.sh index 2e5a75e3a..99165babb 100644 --- a/examples/vlm/run_main_vlm_grpo_megatron.sh +++ b/examples/vlm/run_main_vlm_grpo_megatron.sh @@ -13,7 +13,7 @@ MEGATRON_PATH=/opt/Megatron-LM export PYTHONPATH=${REPO_PATH}:${MEGATRON_PATH}:$PYTHONPATH if [ -z "$1" ]; then - CONFIG_NAME="qwen2.5-vl-3b-grpo-megatron" + CONFIG_NAME="qwen2.5-vl-3b-grpo-fsdp" else CONFIG_NAME=$1 fi diff --git a/rlinf/algorithms/rewards/__init__.py b/rlinf/algorithms/rewards/__init__.py index 3a48b84b6..3d354437b 100644 --- a/rlinf/algorithms/rewards/__init__.py +++ b/rlinf/algorithms/rewards/__init__.py @@ -1,15 +1,32 @@ +# Copyright 2025 The RLinf Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from .math import MathReward from .vqa import VQAReward + def register_reward(name: str, reward_class: type): assert name not in reward_registry, f"Reward {name} already registered" reward_registry[name] = reward_class + def get_reward_class(name: str): assert name in reward_registry, f"Reward {name} not found" return reward_registry[name] + reward_registry = {} register_reward("math", MathReward) -register_reward("vqa", VQAReward) \ No newline at end of file +register_reward("vqa", VQAReward) diff --git a/rlinf/algorithms/rewards/math/__init__.py b/rlinf/algorithms/rewards/math/__init__.py index a94ff2dc4..7eb6401a8 100644 --- a/rlinf/algorithms/rewards/math/__init__.py +++ b/rlinf/algorithms/rewards/math/__init__.py @@ -1,5 +1,21 @@ +# Copyright 2025 The RLinf Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import List + from omegaconf import DictConfig + from toolkits.math_verifier.verify import math_verify_call @@ -21,4 +37,4 @@ def get_reward( List[float]: A list of reward scores, one for each response. """ - return math_verify_call(response, reference) * self.scale \ No newline at end of file + return math_verify_call(response, reference) * self.scale diff --git a/rlinf/algorithms/rewards/vqa/__init__.py b/rlinf/algorithms/rewards/vqa/__init__.py index d4dd55f20..8175d72a1 100644 --- a/rlinf/algorithms/rewards/vqa/__init__.py +++ b/rlinf/algorithms/rewards/vqa/__init__.py @@ -1,27 +1,58 @@ -import torch +# Copyright 2025 The RLinf Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
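# ---------------------------------------------------------------------------
# [Reviewer note - illustrative sketch, not part of the patch above]
# The registry added in rlinf/algorithms/rewards/__init__.py is a plain
# name -> class mapping. Note that the module defines the two functions
# before `reward_registry = {}`; that is valid because the global is only
# looked up when register_reward()/get_reward_class() are actually called.
# A self-contained replica of the pattern, with a hypothetical DummyReward
# standing in for MathReward/VQAReward:

reward_registry = {}

def register_reward(name: str, reward_class: type) -> None:
    # refuse double registration, mirroring the assert in the patch
    assert name not in reward_registry, f"Reward {name} already registered"
    reward_registry[name] = reward_class

def get_reward_class(name: str) -> type:
    assert name in reward_registry, f"Reward {name} not found"
    return reward_registry[name]

class DummyReward:  # hypothetical stand-in
    def __init__(self, config):
        self.config = config

register_reward("dummy", DummyReward)
assert get_reward_class("dummy") is DummyReward
# ---------------------------------------------------------------------------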
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from typing import List + +import torch from omegaconf import DictConfig + +from .format_rewards import answer_format_reward, think_format_reward from .qa_rewards import qa_accuracy_reward -from .format_rewards import think_format_reward, answer_format_reward class VQAReward: def __init__(self, config: DictConfig): - self.reward_weights = config.get("reward_weights", { - "qa_accuracy": 1.0, - "think_format": 0.0, - "answer_format": 0.0, - }) - for reward_name, reward_weight in self.reward_weights.items(): - assert reward_name in ["qa_accuracy", "think_format", "answer_format"], f"Reward {reward_name} not supported" - assert reward_weight >= 0, f"Reward weight {reward_weight} must be non-negative" - self.reward_weights = [reward_weight["qa_accuracy"], reward_weight["think_format"], reward_weight["answer_format"]] - - self.reward_functions = [qa_accuracy_reward, think_format_reward, answer_format_reward] + reward_weights_config = config.get( + "reward_weights", + { + "qa_accuracy": 1.0, + "think_format": 0.0, + "answer_format": 0.0, + }, + ) + for reward_name, reward_weight in reward_weights_config.items(): + assert reward_name in ["qa_accuracy", "think_format", "answer_format"], ( + f"Reward {reward_name} not supported" + ) + assert reward_weight >= 0, ( + f"Reward weight {reward_weight} must be non-negative" + ) + self.reward_weights = [ + reward_weights_config["qa_accuracy"], + reward_weights_config["think_format"], + reward_weights_config["answer_format"], + ] + + self.reward_functions = [ + qa_accuracy_reward, + think_format_reward, + answer_format_reward, + ] self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - def get_reward(self, completions: List[str], answers: List[str]) -> List[float]: + def get_reward(self, completions: List[str], answers: List[dict]) -> List[float]: rewards = [] for i, reward_function in enumerate(self.reward_functions): if self.reward_weights[i] > 0: @@ -34,9 +65,8 @@ def get_reward(self, completions: List[str], answers: List[str]) -> List[float]: # rewards [num_reward_functions, len(completions)] rewards_tensor = torch.tensor(rewards, device=self.device) weights_tensor = torch.tensor(self.reward_weights, device=self.device) - + # [num_reward_functions, num_completions] * [num_reward_functions, 1] -> [num_completions] final_rewards = (rewards_tensor * weights_tensor.unsqueeze(1)).sum(dim=0) - + return final_rewards.tolist() - \ No newline at end of file diff --git a/rlinf/algorithms/rewards/vqa/format_rewards.py b/rlinf/algorithms/rewards/vqa/format_rewards.py index 2f926e733..205bbe336 100644 --- a/rlinf/algorithms/rewards/vqa/format_rewards.py +++ b/rlinf/algorithms/rewards/vqa/format_rewards.py @@ -1,3 +1,17 @@ +# Copyright 2025 The RLinf Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
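# ---------------------------------------------------------------------------
# [Reviewer note - illustrative sketch, not part of the patch above]
# VQAReward.get_reward stacks one reward vector per reward function and
# reduces them with per-function weights (zero-weight functions are skipped
# at evaluation time and contribute zeros). A tiny numeric check of that
# reduction, with made-up scores for 3 functions over 2 completions:
import torch

rewards = torch.tensor([[1.0, 0.0],   # qa_accuracy per completion
                        [1.0, 1.0],   # think_format per completion
                        [0.0, 1.0]])  # answer_format per completion
weights = torch.tensor([1.0, 0.0, 0.0])

# [num_fns, num_completions] * [num_fns, 1] -> sum over fns -> [num_completions]
final = (rewards * weights.unsqueeze(1)).sum(dim=0)
assert final.tolist() == [1.0, 0.0]
# ---------------------------------------------------------------------------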
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import re from typing import List @@ -5,49 +19,49 @@ def think_format_reward(completions, answers) -> List[float]: """ Think format reward function compatible with GRPO training. - + Reward function that checks if reasoning is enclosed within tags. - + Args: completions: List of model completions (text strings) - + Returns: List of reward scores (1.0 for correct format, 0.0 otherwise) """ pattern = r"^(?!.*)(.*?).*$" rewards = [] - + for completion in completions: completion_text = str(completion).strip() match = re.match(pattern, completion_text, re.DOTALL | re.MULTILINE) rewards.append(1.0 if match else 0.0) - + return rewards def answer_format_reward(completions, answers) -> List[float]: """ Reward function that checks for proper answer formatting. - + Expected format: X. content where X is a choice letter. - + Args: - completions: List of model completions (text strings) - + completions: List of model completions (text strings) + Returns: List of reward scores (1.0 for correct format, 0.0 otherwise) """ rewards = [] - + for completion in completions: completion_text = str(completion).strip() - + # Check for proper answer format: X. content - answer_pattern = r'\s*[A-E]\.\s*.+?\s*' - has_proper_answer = bool(re.search( - answer_pattern, completion_text, re.DOTALL | re.IGNORECASE - )) - + answer_pattern = r"\s*[A-E]\.\s*.+?\s*" + has_proper_answer = bool( + re.search(answer_pattern, completion_text, re.DOTALL | re.IGNORECASE) + ) + rewards.append(1.0 if has_proper_answer else 0.0) - - return rewards \ No newline at end of file + + return rewards diff --git a/rlinf/algorithms/rewards/vqa/qa_rewards.py b/rlinf/algorithms/rewards/vqa/qa_rewards.py index ce7a3443b..2bc9540d3 100644 --- a/rlinf/algorithms/rewards/vqa/qa_rewards.py +++ b/rlinf/algorithms/rewards/vqa/qa_rewards.py @@ -1,3 +1,17 @@ +# Copyright 2025 The RLinf Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import re from typing import List @@ -5,44 +19,77 @@ def qa_accuracy_reward(completions, answers) -> List[float]: """ Reward function that evaluates question-answering accuracy for VQA tasks. - + Based on TRL's accuracy_reward pattern but adapted for multiple choice VQA. - + Args: completions: List of model completions (text strings) - answers: List of correct answers (text strings) - + answers: List of correct answers (dict) + Returns: List of reward scores (1.0 for correct, 0.0 for incorrect) """ rewards = [] - + for completion, answer in zip(completions, answers): completion_text = str(completion).strip() - + # Extract answer from completion - look for X. 
content - patterns = [ - r'\s*[A-E]\.\s*(.*?)\s*', - r'\s*[A-E]\s*(.*?)\s*', - r'\s*(.*?)\s*', - ] - - answer_match = None - for pattern in patterns: - answer_match = re.search(pattern, completion_text, re.DOTALL | re.IGNORECASE) - if answer_match: - break - + answer_match = re.search( + r"\s*([A-E])\.\s*(.*?)\s*", + completion_text, + re.DOTALL | re.IGNORECASE, + ) + if not answer_match: rewards.append(0.0) continue - - predicted_content = answer_match.group(1).strip() - - content_match = _compare_choice_content(predicted_content, answer) - - rewards.append(1.0 if content_match else 0.0) - + + predicted_letter = answer_match.group(1).upper() + predicted_content = answer_match.group(2).strip() + + # Get ground truth from kwargs + correct_answer = answer.get("correct_answer", None) + choices = answer.get("choices", None) + + if correct_answer is None or choices is None: + rewards.append(0.0) + continue + + # Normalize correct_answer to letter format + if isinstance(correct_answer, int): + correct_letter = chr(65 + correct_answer) # 0->A, 1->B, etc. + elif isinstance(correct_answer, str): + correct_letter = correct_answer.strip().upper() + else: + rewards.append(0.0) + continue + + # Parse choices if string format + if isinstance(choices, str): + try: + import ast + + choices = ast.literal_eval(choices) + except (ValueError, SyntaxError): + choices = [str(choices)] + + # Get correct choice content + letter_to_idx = {"A": 0, "B": 1, "C": 2, "D": 3, "E": 4} + if correct_letter in letter_to_idx and letter_to_idx[correct_letter] < len( + choices + ): + correct_content = choices[letter_to_idx[correct_letter]].strip() + else: + rewards.append(0.0) + continue + + # Check accuracy: both letter and content must match + letter_match = predicted_letter == correct_letter + content_match = _compare_choice_content(predicted_content, correct_content) + + rewards.append(1.0 if (letter_match and content_match) else 0.0) + return rewards @@ -51,13 +98,13 @@ def _compare_choice_content(predicted: str, correct: str) -> bool: # Simple normalized comparison pred_normalized = predicted.lower().strip() correct_normalized = correct.lower().strip() - + # Direct match if pred_normalized == correct_normalized: return True - + # Partial match for more flexibility if pred_normalized in correct_normalized or correct_normalized in pred_normalized: return True - - return False \ No newline at end of file + + return False diff --git a/rlinf/data/datasets.py b/rlinf/data/datasets.py index 2669b14e0..75922780b 100644 --- a/rlinf/data/datasets.py +++ b/rlinf/data/datasets.py @@ -67,7 +67,7 @@ def batch_pad_to_fixed_len( class DatasetItem: prompt: torch.Tensor length: int - answer: str + answer: str | dict idx: int solution: Optional[str] = None image_data: Optional[List[Union[bytes, str]]] = None @@ -182,8 +182,6 @@ def __init__( data_paths: Union[List[str], str], config: DictConfig, tokenizer: AutoTokenizer, - *, - lazy_loading: bool = False, ) -> None: super().__init__() self.cfg = config @@ -194,6 +192,7 @@ def __init__( # Delay processor creation; only needed when use_chat_template is True self._processor = None + self.system_prompt = config.data.get("system_prompt", None) self.use_chat_template = bool(config.data.use_chat_template) self.image_keys = list(config.data.image_keys or []) self.prompt_key = config.data.prompt_key @@ -204,7 +203,7 @@ def __init__( self.eos_id = int(self.tokenizer.eos_token_id) # Loading mode - self.lazy_loading = bool(getattr(config.data, "lazy_loading", lazy_loading)) + self.lazy_loading = 
bool(getattr(config.data, "lazy_loading", False)) self._records = [] self._indices = [] # (path, fmt, row_index_or_offset) @@ -280,11 +279,20 @@ def encode_prompt( self._processor = AutoProcessor.from_pretrained( self.cfg.actor.model.model_path ) + messages = [] + if self.system_prompt is not None: + messages.append( + { + "role": "system", + "content": [{"type": "text", "text": self.system_prompt}], + } + ) + content: List[Dict[str, Any]] = [] for _ in range(max(0, image_count)): content.append({"type": "image"}) content.append({"type": "text", "text": prompt_text}) - messages = [{"role": "user", "content": content}] + messages.append({"role": "user", "content": content}) rendered = self._processor.apply_chat_template( messages, tokenize=False, add_generation_prompt=True ) @@ -495,6 +503,20 @@ def create( @VLMDatasetRegistry.register("robo2vlm") class Robo2VLMDataset(VLMBaseDataset): + def __init__( + self, + data_paths: Union[List[str], str], + config: DictConfig, + tokenizer: AutoTokenizer, + ) -> None: + super().__init__(data_paths, config, tokenizer) + self.system_prompt = ( + "You are a helpful robotic vision assistant specialized in " + "answering questions about robotic manipulation tasks. " + "Use tags to show your reasoning process, " + "then provide your final answer in tags." + ) + def get_image_list(self, dataitem: Dict[str, Any]) -> List[Union[bytes, str, None]]: # Prefer common robo2vlm fields if present, else fallback to configured keys images: List[Any] = [] @@ -558,40 +580,12 @@ def build_prompt_text(self, data_item: Dict[str, Any]) -> str: def postprocess_dataset_item( self, item: DatasetItem, raw: Dict[str, Any] ) -> DatasetItem: - # Derive answer from 'correct_answer' and 'choices' if not provided - if not item.answer or str(item.answer).lower() in {"none", "", "null"}: - choices = raw.get("choices") - ca = raw.get("correct_answer") - try: - # Normalize choices - if isinstance(choices, str): - import ast - - choices = ast.literal_eval(choices) - if not isinstance(choices, list): - choices = [choices] if choices is not None else [] - - ans_val: Optional[str] = None - if isinstance(ca, int) and 0 <= ca < len(choices): - ans_val = str(choices[ca]) - elif isinstance(ca, str): - cstr = ca.strip() - # Letter index like 'A', 'B', ... 
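# ---------------------------------------------------------------------------
# [Reviewer note - illustrative sketch, not part of the patch above]
# qa_accuracy_reward pairs an extracted "<answer>X. content</answer>" span
# with the structured {"correct_answer", "choices"} dict that the dataset
# now forwards. The XML-ish tags appear to have been stripped from the
# regexes in this rendering of the patch; they are restored below as an
# assumption. End-to-end check with a made-up sample:
import re

completion = "<think>the gripper is shut</think><answer>B. closed</answer>"
m = re.search(r"<answer>\s*([A-E])\.\s*(.*?)\s*</answer>",
              completion, re.DOTALL | re.IGNORECASE)
assert m is not None
letter, content = m.group(1).upper(), m.group(2).strip()

answer = {"choices": ["open", "closed"], "correct_answer": 1}  # made-up row
ca = answer["correct_answer"]
# normalize an int index (1 -> "B") or a letter string ("b" -> "B")
correct_letter = chr(65 + ca) if isinstance(ca, int) else str(ca).strip().upper()
idx = {"A": 0, "B": 1, "C": 2, "D": 3, "E": 4}[correct_letter]
assert letter == correct_letter and content == answer["choices"][idx]
# ---------------------------------------------------------------------------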
- if len(cstr) == 1 and "A" <= cstr <= "Z": - idx = ord(cstr) - ord("A") - if 0 <= idx < len(choices): - ans_val = str(choices[idx]) - # Direct match to a choice value - if ans_val is None and choices: - for ch in choices: - if str(ch) == cstr: - ans_val = cstr - break - if ans_val is not None: - item.answer = ans_val - except Exception: - # Keep original if any - pass + answer_dict = { + "choices": raw.get("choices", None), + "correct_answer": raw.get("correct_answer", None), + } + item.answer = answer_dict + return item diff --git a/rlinf/data/io_struct.py b/rlinf/data/io_struct.py index e40fed973..78459e548 100644 --- a/rlinf/data/io_struct.py +++ b/rlinf/data/io_struct.py @@ -266,7 +266,7 @@ class RolloutResult: advantages: Optional[List[float] | torch.Tensor] = None prompt_texts: Optional[List[str]] = None response_texts: Optional[List[str]] = None - answers: Optional[List[str]] = None + answers: Optional[List[str | dict]] = None image_data: Optional[Union[List[List[bytes]], List[List[str]]]] = None # Inference # Only set when recompute_logprobs is False diff --git a/rlinf/runners/math_runner.py b/rlinf/runners/math_runner.py index c0326a8f7..dca52c0c0 100644 --- a/rlinf/runners/math_runner.py +++ b/rlinf/runners/math_runner.py @@ -25,7 +25,7 @@ from tqdm import tqdm from rlinf.data.io_struct import RolloutRequest -from rlinf.scheduler import Channel, Worker +from rlinf.scheduler import Channel from rlinf.scheduler import WorkerGroupFuncResult as Handle from rlinf.utils.data_iter_utils import split_list from rlinf.utils.distributed import ScopedTimer diff --git a/rlinf/workers/actor/fsdp_actor_worker.py b/rlinf/workers/actor/fsdp_actor_worker.py index e5432efe9..06c28797d 100644 --- a/rlinf/workers/actor/fsdp_actor_worker.py +++ b/rlinf/workers/actor/fsdp_actor_worker.py @@ -17,7 +17,6 @@ from typing import Dict, List, Tuple import numpy as np -from RLinf.rlinf.algorithms.rewards import get_reward_class import torch from omegaconf import DictConfig from torch.distributed.device_mesh import init_device_mesh @@ -26,6 +25,7 @@ import rlinf.algorithms # noqa: F401 from rlinf.algorithms.registry import actor_loss, calculate_adv_and_returns +from rlinf.algorithms.rewards import get_reward_class from rlinf.algorithms.utils import ( kl_penalty, preprocess_advantages_inputs, @@ -110,7 +110,9 @@ def __init__(self, cfg: DictConfig, placement: ModelParallelComponentPlacement): # Reward configurations if not self.cfg.reward.use_reward_model: - assert self.cfg.reward.reward_type in ["math", "vqa"], "only support math and vqa reward!" + assert self.cfg.reward.reward_type in ["math", "vqa"], ( + "only support math and vqa reward!" + ) reward_cls = get_reward_class(self.cfg.reward.reward_type) self.reward = reward_cls(self.cfg.reward) diff --git a/rlinf/workers/reward/reward_worker.py b/rlinf/workers/reward/reward_worker.py index bc98d5a6a..9290c23d6 100644 --- a/rlinf/workers/reward/reward_worker.py +++ b/rlinf/workers/reward/reward_worker.py @@ -1,17 +1,35 @@ +# Copyright 2025 The RLinf Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, List, Tuple + import torch -from typing import Dict, Tuple, List from omegaconf import DictConfig -from rlinf.hybrid_engines.fsdp.fsdp_model_manager import FSDPModelManager -from rlinf.scheduler import Worker, Channel + from rlinf.algorithms.rewards import get_reward_class from rlinf.data.io_struct import RolloutResult +from rlinf.hybrid_engines.fsdp.fsdp_model_manager import FSDPModelManager +from rlinf.scheduler import Channel, Worker +from rlinf.utils.placement import ModelParallelComponentPlacement -class RewardWorker(Worker, FSDPModelManager): - def __init__(self, cfg: DictConfig): +class RewardWorker(FSDPModelManager, Worker): + def __init__(self, cfg: DictConfig, placement: ModelParallelComponentPlacement): Worker.__init__(self) super().__init__(cfg.reward) self.cfg = cfg + self.component_placement = placement self.total_batch_size_per_dp = ( self.cfg.data.rollout_batch_size @@ -25,7 +43,7 @@ def init_worker(self): self.offload_fsdp_param_and_grad() self.offload_fsdp_optimizer() else: - self.reward = get_reward_class(self.cfg.reward.name)(self.cfg.reward) + self.reward = get_reward_class(self.cfg.reward.reward_type)(self.cfg.reward) def get_batch( self, channel: Channel @@ -65,7 +83,7 @@ def compute_rewards(self, input_channel: Channel, output_channel: Channel): ) def _compute_batch_rewards( - self, batch: Dict[str, torch.Tensor], answers: List[str] + self, batch: Dict[str, torch.Tensor], answers: List[str | dict] ): """Reward computation using non-model based reward.""" @@ -98,4 +116,4 @@ def compute_batch_rewards_with_model(self, batch: Dict[str, torch.Tensor]): with torch.no_grad(): # TODO: fix this rewards = self.model(batch["input_ids"], batch["attention_mask"]) - return rewards \ No newline at end of file + return rewards From 0c38831a87ba3d3473624e56d7ecf6f79732f58c Mon Sep 17 00:00:00 2001 From: guozhen1997 <2997871698@qq.com> Date: Tue, 23 Sep 2025 18:37:55 +0800 Subject: [PATCH 29/57] feat: rename and reorganize example config Signed-off-by: guozhen1997 <2997871698@qq.com> --- .../config/math}/qwen2.5-1.5b-grpo-fsdp.yaml | 2 +- .../qwen2.5-1.5b-grpo-megatron-pipeline.yaml | 0 .../math}/qwen2.5-1.5b-grpo-megatron.yaml | 0 .../config/math}/qwen2.5-1.5b-single-gpu.yaml | 0 .../math}/qwen2.5-32b-grpo-megatron.yaml | 0 .../math}/qwen2.5-7b-grpo-megatron.yaml | 0 .../config/tp_comm_overlap_cfg.yaml | 0 .../config/vqa}/qwen2.5-vl-3b-grpo-fsdp.yaml | 4 +- .../main_math.py => reasoning/main_grpo.py} | 0 .../run_main_grpo_math.sh} | 4 +- .../run_main_grpo_vqa.sh} | 2 +- .../run_placement_autotune.sh | 0 examples/vlm/main_vlm.py | 107 ------------------ rlinf/config.py | 12 +- 14 files changed, 14 insertions(+), 117 deletions(-) rename examples/{math/config => reasoning/config/math}/qwen2.5-1.5b-grpo-fsdp.yaml (99%) rename examples/{math/config => reasoning/config/math}/qwen2.5-1.5b-grpo-megatron-pipeline.yaml (100%) rename examples/{math/config => reasoning/config/math}/qwen2.5-1.5b-grpo-megatron.yaml (100%) rename examples/{math/config => reasoning/config/math}/qwen2.5-1.5b-single-gpu.yaml (100%) rename examples/{math/config => reasoning/config/math}/qwen2.5-32b-grpo-megatron.yaml (100%) rename examples/{math/config => reasoning/config/math}/qwen2.5-7b-grpo-megatron.yaml (100%) rename examples/{math => reasoning}/config/tp_comm_overlap_cfg.yaml (100%) rename examples/{vlm/config => reasoning/config/vqa}/qwen2.5-vl-3b-grpo-fsdp.yaml (99%) rename 
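# ---------------------------------------------------------------------------
# [Reviewer note - illustrative sketch, not part of the patch above]
# The base-order swap to `class RewardWorker(FSDPModelManager, Worker)`
# changes Python's MRO so that `super().__init__(cfg.reward)` resolves to
# FSDPModelManager while Worker is initialized explicitly. Minimal
# illustration with stand-in classes (the real ones take other arguments):
class Worker:
    def __init__(self):
        self.worker_ready = True

class FSDPModelManager:
    def __init__(self, cfg):
        self.cfg = cfg

class RewardWorker(FSDPModelManager, Worker):
    def __init__(self, cfg):
        Worker.__init__(self)      # explicit, as in the patch
        super().__init__(cfg)      # MRO -> FSDPModelManager.__init__

w = RewardWorker(cfg={"reward_type": "math"})
assert w.worker_ready and w.cfg["reward_type"] == "math"
# ---------------------------------------------------------------------------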
examples/{math/main_math.py => reasoning/main_grpo.py} (100%) rename examples/{math/run_main_math_grpo.sh => reasoning/run_main_grpo_math.sh} (71%) rename examples/{vlm/run_main_vlm_grpo_megatron.sh => reasoning/run_main_grpo_vqa.sh} (79%) rename examples/{math => reasoning}/run_placement_autotune.sh (100%) delete mode 100644 examples/vlm/main_vlm.py diff --git a/examples/math/config/qwen2.5-1.5b-grpo-fsdp.yaml b/examples/reasoning/config/math/qwen2.5-1.5b-grpo-fsdp.yaml similarity index 99% rename from examples/math/config/qwen2.5-1.5b-grpo-fsdp.yaml rename to examples/reasoning/config/math/qwen2.5-1.5b-grpo-fsdp.yaml index 822008f04..bc83ef381 100644 --- a/examples/math/config/qwen2.5-1.5b-grpo-fsdp.yaml +++ b/examples/reasoning/config/math/qwen2.5-1.5b-grpo-fsdp.yaml @@ -13,7 +13,7 @@ cluster: actor,rollout: all runner: - task_type: math + task_type: reasoning logger: log_path: ${runner.output_dir}/${runner.experiment_name} project_name: rlinf diff --git a/examples/math/config/qwen2.5-1.5b-grpo-megatron-pipeline.yaml b/examples/reasoning/config/math/qwen2.5-1.5b-grpo-megatron-pipeline.yaml similarity index 100% rename from examples/math/config/qwen2.5-1.5b-grpo-megatron-pipeline.yaml rename to examples/reasoning/config/math/qwen2.5-1.5b-grpo-megatron-pipeline.yaml diff --git a/examples/math/config/qwen2.5-1.5b-grpo-megatron.yaml b/examples/reasoning/config/math/qwen2.5-1.5b-grpo-megatron.yaml similarity index 100% rename from examples/math/config/qwen2.5-1.5b-grpo-megatron.yaml rename to examples/reasoning/config/math/qwen2.5-1.5b-grpo-megatron.yaml diff --git a/examples/math/config/qwen2.5-1.5b-single-gpu.yaml b/examples/reasoning/config/math/qwen2.5-1.5b-single-gpu.yaml similarity index 100% rename from examples/math/config/qwen2.5-1.5b-single-gpu.yaml rename to examples/reasoning/config/math/qwen2.5-1.5b-single-gpu.yaml diff --git a/examples/math/config/qwen2.5-32b-grpo-megatron.yaml b/examples/reasoning/config/math/qwen2.5-32b-grpo-megatron.yaml similarity index 100% rename from examples/math/config/qwen2.5-32b-grpo-megatron.yaml rename to examples/reasoning/config/math/qwen2.5-32b-grpo-megatron.yaml diff --git a/examples/math/config/qwen2.5-7b-grpo-megatron.yaml b/examples/reasoning/config/math/qwen2.5-7b-grpo-megatron.yaml similarity index 100% rename from examples/math/config/qwen2.5-7b-grpo-megatron.yaml rename to examples/reasoning/config/math/qwen2.5-7b-grpo-megatron.yaml diff --git a/examples/math/config/tp_comm_overlap_cfg.yaml b/examples/reasoning/config/tp_comm_overlap_cfg.yaml similarity index 100% rename from examples/math/config/tp_comm_overlap_cfg.yaml rename to examples/reasoning/config/tp_comm_overlap_cfg.yaml diff --git a/examples/vlm/config/qwen2.5-vl-3b-grpo-fsdp.yaml b/examples/reasoning/config/vqa/qwen2.5-vl-3b-grpo-fsdp.yaml similarity index 99% rename from examples/vlm/config/qwen2.5-vl-3b-grpo-fsdp.yaml rename to examples/reasoning/config/vqa/qwen2.5-vl-3b-grpo-fsdp.yaml index 02b5d8ea4..c96445a06 100644 --- a/examples/vlm/config/qwen2.5-vl-3b-grpo-fsdp.yaml +++ b/examples/reasoning/config/vqa/qwen2.5-vl-3b-grpo-fsdp.yaml @@ -13,7 +13,7 @@ cluster: actor,rollout,reward: all runner: - task_type: math + task_type: reasoning logger: log_path: ${runner.output_dir}/${runner.experiment_name} project_name: rlinf @@ -207,7 +207,7 @@ reward: group_name: "RewardGroup" use_reward_model: false reward_type: 'vqa' - # reward_scale: 5.0 + reward_scale: 1.0 reward_weights: qa_accuracy: 1.0 think_format: 0.0 diff --git a/examples/math/main_math.py 
b/examples/reasoning/main_grpo.py similarity index 100% rename from examples/math/main_math.py rename to examples/reasoning/main_grpo.py diff --git a/examples/math/run_main_math_grpo.sh b/examples/reasoning/run_main_grpo_math.sh similarity index 71% rename from examples/math/run_main_math_grpo.sh rename to examples/reasoning/run_main_grpo_math.sh index dc2f75ee0..56e13c7c2 100644 --- a/examples/math/run_main_math_grpo.sh +++ b/examples/reasoning/run_main_grpo_math.sh @@ -13,9 +13,9 @@ MEGATRON_PATH=/opt/Megatron-LM export PYTHONPATH=${REPO_PATH}:${MEGATRON_PATH}:$PYTHONPATH if [ -z "$1" ]; then - CONFIG_NAME="qwen2.5-1.5b-grpo-fsdp" + CONFIG_NAME="qwen2.5-1.5b-grpo-megatron" else CONFIG_NAME=$1 fi -python ${REPO_PATH}/examples/math/main_math.py --config-path ${CONFIG_PATH}/config/ --config-name $CONFIG_NAME \ No newline at end of file +python ${REPO_PATH}/examples/reasoning/main_grpo.py --config-path ${CONFIG_PATH}/config/math/ --config-name $CONFIG_NAME \ No newline at end of file diff --git a/examples/vlm/run_main_vlm_grpo_megatron.sh b/examples/reasoning/run_main_grpo_vqa.sh similarity index 79% rename from examples/vlm/run_main_vlm_grpo_megatron.sh rename to examples/reasoning/run_main_grpo_vqa.sh index 99165babb..1b41f415c 100644 --- a/examples/vlm/run_main_vlm_grpo_megatron.sh +++ b/examples/reasoning/run_main_grpo_vqa.sh @@ -18,4 +18,4 @@ else CONFIG_NAME=$1 fi -python ${REPO_PATH}/examples/vlm/main_vlm.py --config-path ${CONFIG_PATH}/config/ --config-name $CONFIG_NAME \ No newline at end of file +python ${REPO_PATH}/examples/reasoning/main_grpo.py --config-path ${CONFIG_PATH}/config/vqa/ --config-name $CONFIG_NAME \ No newline at end of file diff --git a/examples/math/run_placement_autotune.sh b/examples/reasoning/run_placement_autotune.sh similarity index 100% rename from examples/math/run_placement_autotune.sh rename to examples/reasoning/run_placement_autotune.sh diff --git a/examples/vlm/main_vlm.py b/examples/vlm/main_vlm.py deleted file mode 100644 index 6ed11dd75..000000000 --- a/examples/vlm/main_vlm.py +++ /dev/null @@ -1,107 +0,0 @@ -# Copyright 2025 The RLinf Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import json - -import hydra -import torch.multiprocessing as mp -from omegaconf.omegaconf import OmegaConf - -from rlinf.config import validate_cfg -from rlinf.data.datasets import create_rl_dataset -from rlinf.data.tokenizers import hf_tokenizer -from rlinf.runners.math_runner import MathRunner -from rlinf.scheduler import Cluster -from rlinf.utils.placement import ModelParallelComponentPlacement, PlacementMode -from rlinf.utils.utils import output_redirector -from rlinf.workers.actor import get_actor_worker -from rlinf.workers.inference.megatron_inference_worker import MegatronInference -from rlinf.workers.reward.reward_worker import RewardWorker -from rlinf.workers.rollout.utils import get_rollout_backend_worker - -"""Script to start GRPO training""" -mp.set_start_method("spawn", force=True) - - -@hydra.main(version_base="1.1") -@output_redirector -def main(cfg) -> None: - cfg = validate_cfg(cfg) - print(json.dumps(OmegaConf.to_container(cfg, resolve=True), indent=2)) - - cluster = Cluster( - num_nodes=cfg.cluster.num_nodes, num_gpus_per_node=cfg.cluster.num_gpus_per_node - ) - component_placement = ModelParallelComponentPlacement(cfg) - - rollout_worker_cls = get_rollout_backend_worker(cfg, component_placement) - - # Rollout group - rollout_placement_strategy = component_placement.get_strategy("rollout") - rollout_group = rollout_worker_cls.create_group(cfg, component_placement).launch( - cluster, - name=cfg.rollout.group_name, - placement_strategy=rollout_placement_strategy, - ) - - # Inference group - inference_group = None - if ( - component_placement.placement_mode == PlacementMode.DISAGGREGATED - and cfg.algorithm.recompute_logprobs - ): - inference_placement_strategy = component_placement.get_strategy("inference") - inference_group = MegatronInference.create_group( - cfg, component_placement - ).launch( - cluster, - name=cfg.inference.group_name, - placement_strategy=inference_placement_strategy, - ) - - # Reward group - reward_placement_strategy = component_placement.get_strategy("reward") - reward_group = RewardWorker.create_group(cfg, component_placement).launch( - cluster, - name=cfg.reward.group_name, - placement_strategy=reward_placement_strategy, - ) - - # GRPO Actor group - actor_worker_cls = get_actor_worker(cfg) - actor_placement_strategy = component_placement.get_strategy("actor") - actor_group = actor_worker_cls.create_group(cfg, component_placement).launch( - cluster, name=cfg.actor.group_name, placement_strategy=actor_placement_strategy - ) - - tokenizer = hf_tokenizer(cfg.actor.tokenizer.tokenizer_model) - train_ds, val_ds = create_rl_dataset(cfg, tokenizer) - - runner = MathRunner( - cfg=cfg, - placement=component_placement, - train_dataset=train_ds, - val_dataset=val_ds, - rollout=rollout_group, - inference=inference_group, - actor=actor_group, - reward=reward_group, - ) - - runner.init_workers() - runner.run() - - -if __name__ == "__main__": - main() diff --git a/rlinf/config.py b/rlinf/config.py index 0f3a21903..9c31cd49f 100644 --- a/rlinf/config.py +++ b/rlinf/config.py @@ -36,6 +36,7 @@ SUPPORTED_MODEL_ARCHS = ["qwen2.5", "qwen2.5_vl", "openvla", "openvla_oft"] SUPPORTED_ROLLOUT_BACKENDS = ["sglang", "vllm"] +SUPPORTED_TASK_TYPE = ["embodied", "reasoning", "coding_online_rl"] __all__ = ["build_config"] @@ -528,7 +529,7 @@ def get_robot_control_mode(robot: str): return cfg -def validate_math_cfg(cfg: DictConfig) -> DictConfig: +def validate_reasoning_cfg(cfg: DictConfig) -> DictConfig: assert cfg.rollout.model_arch in SUPPORTED_MODEL_ARCHS, ( f"Model 
{cfg.rollout.model_arch} is not supported" ) @@ -607,11 +608,14 @@ def validate_coding_online_rl_cfg(cfg: DictConfig) -> DictConfig: def validate_cfg(cfg: DictConfig) -> DictConfig: OmegaConf.set_struct(cfg, True) + assert cfg.runner.task_type in SUPPORTED_TASK_TYPE, ( + f"task_type must be one of {SUPPORTED_TASK_TYPE}" + ) if cfg.runner.task_type == "embodied": cfg = validate_embodied_cfg(cfg) - if cfg.runner.task_type == "math": - cfg = validate_math_cfg(cfg) - if cfg.runner.task_type == "coding_online_rl": + elif cfg.runner.task_type == "reasoning": + cfg = validate_reasoning_cfg(cfg) + elif cfg.runner.task_type == "coding_online_rl": cfg = validate_coding_online_rl_cfg(cfg) if ( From e6ebd609404a79e0ff3cb6f735020a24e2493b5e Mon Sep 17 00:00:00 2001 From: guozhen1997 <2997871698@qq.com> Date: Tue, 23 Sep 2025 20:43:14 +0800 Subject: [PATCH 30/57] fix: fix ruff, fix merge bugs Signed-off-by: guozhen1997 <2997871698@qq.com> --- .../sglang/sglang_0_4_6/sgl_scheduler.py | 1 - rlinf/utils/convertor/utils.py | 72 +++++++++++-------- 2 files changed, 41 insertions(+), 32 deletions(-) diff --git a/rlinf/hybrid_engines/sglang/sglang_0_4_6/sgl_scheduler.py b/rlinf/hybrid_engines/sglang/sglang_0_4_6/sgl_scheduler.py index 684f8d333..9a69b8548 100644 --- a/rlinf/hybrid_engines/sglang/sglang_0_4_6/sgl_scheduler.py +++ b/rlinf/hybrid_engines/sglang/sglang_0_4_6/sgl_scheduler.py @@ -29,7 +29,6 @@ ReleaseMemoryOccupationReqInput, ResumeMemoryOccupationReqInput, ) -from sglang.srt.managers.mm_utils import init_embedding_cache from sglang.srt.managers.scheduler import Scheduler as _Scheduler from sglang.srt.managers.scheduler import logger from sglang.srt.server_args import PortArgs, ServerArgs diff --git a/rlinf/utils/convertor/utils.py b/rlinf/utils/convertor/utils.py index e187bb919..761218650 100644 --- a/rlinf/utils/convertor/utils.py +++ b/rlinf/utils/convertor/utils.py @@ -31,49 +31,59 @@ class TransformFunc: def _split_gqa_tensor( tensor: torch.Tensor, new_statedict: dict, weight_names: List[str], config ) -> None: - """ - Private helper to split a GQA-combined tensor (weight or bias). 
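# ---------------------------------------------------------------------------
# [Reviewer note - illustrative sketch, not part of the patch above]
# The _split_gqa_tensor rewrite in the hunk below assumes Megatron's fused
# QKV layout: within each query group the rows are [q_0 .. q_{qh-1}, k, v],
# each spanning head_dim rows. Shape check of that view/split with tiny
# made-up sizes (2 groups, 2 q-heads per group, head_dim 3, in_features 5):
import torch

G, qh, D, In = 2, 2, 3, 5
fused = torch.arange(G * (qh + 2) * D * In, dtype=torch.float32).view(-1, In)

qkv = fused.view(G, qh + 2, D, In)
q, k, v = torch.split(qkv, [qh, 1, 1], dim=1)
q_full, k_full, v_full = (t.reshape(-1, In) for t in (q, k, v))

assert q_full.shape == (G * qh * D, In)
assert k_full.shape == (G * D, In) and v_full.shape == (G * D, In)
# group 0's first k row sits directly after that group's q rows
assert torch.equal(k_full[0], fused[qh * D])
# ---------------------------------------------------------------------------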
- """ hidden_size = config.model_config.hidden_size num_attention_heads = config.model_config.num_attention_heads - num_key_value_heads = ( - config.model_config.num_query_groups or num_attention_heads - ) + num_query_groups = config.model_config.num_query_groups or num_attention_heads head_dim = hidden_size // num_attention_heads - tp_size = config.model_config.tensor_model_parallel_size - - assert num_key_value_heads % tp_size == 0, ( - "num_key_value_heads must be divisible by tensor parallel size" + target_tp = config.reshard_tp_size + assert num_query_groups % target_tp == 0, ( + "num_query_groups must be divisible by reshard_tp_size" ) + local_num_query_groups = num_query_groups // target_tp - q_heads_per_rank = num_attention_heads // tp_size - kv_heads_per_rank = num_key_value_heads // tp_size - - q_shard_size = q_heads_per_rank * head_dim - k_shard_size = kv_heads_per_rank * head_dim - v_shard_size = kv_heads_per_rank * head_dim + # heads per query group + assert num_attention_heads % num_query_groups == 0, ( + "num_attention_heads must be divisible by num_query_groups" + ) + q_heads_per_group = num_attention_heads // num_query_groups - shard_size = q_shard_size + k_shard_size + v_shard_size + num_channel_qkv = q_heads_per_group + 2 - q_shards, k_shards, v_shards = [], [], [] + if tensor.ndim == 2: + # Weight: [out_features, in_features] + out_features, in_features = tensor.shape + expected_out = local_num_query_groups * num_channel_qkv * head_dim + assert out_features == expected_out, ( + f"Unexpected fused QKV weight shape {tensor.shape}, expect " + f"[{expected_out}, {in_features}] (local groups={local_num_query_groups})" + ) - # [Qi,Ki,Vi] - for shard in tensor.split(shard_size, dim=0): - # Qi, Ki, Vi - q_shard, k_shard, v_shard = shard.split( - [q_shard_size, k_shard_size, v_shard_size], dim=0 + qkv = tensor.view( + local_num_query_groups, num_channel_qkv, head_dim, in_features + ) + q, k, v = torch.split( + qkv, [q_heads_per_group, 1, 1], dim=1 + ) # shapes: [G, qh, D, In], [G,1,D,In], [G,1,D,In] + q_full = q.reshape(-1, in_features).contiguous() + k_full = k.reshape(-1, in_features).contiguous() + v_full = v.reshape(-1, in_features).contiguous() + else: + # Bias: [out_features] + out_features = tensor.shape[0] + expected_out = local_num_query_groups * num_channel_qkv * head_dim + assert out_features == expected_out, ( + f"Unexpected fused QKV bias shape {tensor.shape}, expect " + f"[{expected_out}] (local groups={local_num_query_groups})" ) - q_shards.append(q_shard) - k_shards.append(k_shard) - v_shards.append(v_shard) - # cat - q_full = torch.cat(q_shards, dim=0) - k_full = torch.cat(k_shards, dim=0) - v_full = torch.cat(v_shards, dim=0) + qkv = tensor.view(local_num_query_groups, num_channel_qkv, head_dim) + q, k, v = torch.split(qkv, [q_heads_per_group, 1, 1], dim=1) + q_full = q.reshape(-1).contiguous() + k_full = k.reshape(-1).contiguous() + v_full = v.reshape(-1).contiguous() - # saved + # Save to target names new_statedict[weight_names[0]] = q_full.clone() new_statedict[weight_names[1]] = k_full.clone() new_statedict[weight_names[2]] = v_full.clone() From dc446fc91b6b01066ccbb4b71ec30c1ea6a316ef Mon Sep 17 00:00:00 2001 From: guozhen1997 <2997871698@qq.com> Date: Thu, 25 Sep 2025 14:38:26 +0800 Subject: [PATCH 31/57] fix: fix multi modal inputs Signed-off-by: guozhen1997 <2997871698@qq.com> --- .../config/math/qwen2.5-1.5b-grpo-fsdp.yaml | 4 +- .../config/vqa/qwen2.5-vl-3b-grpo-fsdp.yaml | 1 - examples/reasoning/main_grpo.py | 4 +- rlinf/data/datasets.py | 49 
++++++-- rlinf/data/io_struct.py | 110 ++++++++++++++++-- .../{math_runner.py => reasoning_runner.py} | 15 ++- rlinf/utils/placement.py | 17 +-- rlinf/workers/actor/fsdp_actor_worker.py | 45 ++----- rlinf/workers/rollout/sglang/sglang_worker.py | 1 + rlinf/workers/rollout/vllm/vllm_worker.py | 1 + 10 files changed, 174 insertions(+), 73 deletions(-) rename rlinf/runners/{math_runner.py => reasoning_runner.py} (97%) diff --git a/examples/reasoning/config/math/qwen2.5-1.5b-grpo-fsdp.yaml b/examples/reasoning/config/math/qwen2.5-1.5b-grpo-fsdp.yaml index bc83ef381..e17486fe2 100644 --- a/examples/reasoning/config/math/qwen2.5-1.5b-grpo-fsdp.yaml +++ b/examples/reasoning/config/math/qwen2.5-1.5b-grpo-fsdp.yaml @@ -8,9 +8,9 @@ hydra: cluster: num_nodes: 1 - num_gpus_per_node: 4 component_placement: - actor,rollout: all + actor: 0-3 + rollout: 4-7 runner: task_type: reasoning diff --git a/examples/reasoning/config/vqa/qwen2.5-vl-3b-grpo-fsdp.yaml b/examples/reasoning/config/vqa/qwen2.5-vl-3b-grpo-fsdp.yaml index c96445a06..6643e74bd 100644 --- a/examples/reasoning/config/vqa/qwen2.5-vl-3b-grpo-fsdp.yaml +++ b/examples/reasoning/config/vqa/qwen2.5-vl-3b-grpo-fsdp.yaml @@ -8,7 +8,6 @@ hydra: cluster: num_nodes: 1 - num_gpus_per_node: 8 component_placement: actor,rollout,reward: all diff --git a/examples/reasoning/main_grpo.py b/examples/reasoning/main_grpo.py index 5b9ee61e1..30073d562 100644 --- a/examples/reasoning/main_grpo.py +++ b/examples/reasoning/main_grpo.py @@ -21,7 +21,7 @@ from rlinf.config import validate_cfg from rlinf.data.datasets import create_rl_dataset from rlinf.data.tokenizers import hf_tokenizer -from rlinf.runners.math_runner import MathRunner +from rlinf.runners.reasoning_runner import ReasoningRunner from rlinf.scheduler import Cluster from rlinf.utils.placement import ModelParallelComponentPlacement, PlacementMode from rlinf.utils.utils import output_redirector @@ -86,7 +86,7 @@ def main(cfg) -> None: tokenizer = hf_tokenizer(cfg.actor.tokenizer.tokenizer_model) train_ds, val_ds = create_rl_dataset(cfg, tokenizer) - runner = MathRunner( + runner = ReasoningRunner( cfg=cfg, placement=component_placement, train_dataset=train_ds, diff --git a/rlinf/data/datasets.py b/rlinf/data/datasets.py index 75922780b..677377a68 100644 --- a/rlinf/data/datasets.py +++ b/rlinf/data/datasets.py @@ -16,12 +16,13 @@ import logging import os from dataclasses import dataclass +from io import BytesIO from typing import Any, Callable, Dict, List, Optional, Tuple, Union import pandas as pd import torch from omegaconf import DictConfig -from PIL.Image import Image +from PIL import Image from torch.utils.data import Dataset from transformers import AutoProcessor, AutoTokenizer @@ -73,6 +74,7 @@ class DatasetItem: image_data: Optional[List[Union[bytes, str]]] = None prompt_text: Optional[str] = None meta: Optional[Dict[str, Any]] = None + multi_modal_inputs: Optional[Dict[str, Any]] = None class MathDataset(Dataset): @@ -247,7 +249,7 @@ def get_image_list(self, dataitem: Dict[str, Any]) -> List[Union[bytes, str, Non v = dataitem.get(k, None) if v is None: continue - if isinstance(v, Image): + if isinstance(v, Image.Image): images.append(v) elif isinstance(v, dict) and "bytes" in v: images.append(v["bytes"]) @@ -268,7 +270,7 @@ def build_prompt_text(self, data_item: Dict[str, Any]) -> str: return str(q) def encode_prompt( - self, prompt_text: str, image_count: int + self, prompt_text: str, images ) -> Tuple[torch.Tensor, int, Optional[str]]: """ Return (token_ids[L], length, prompt_text_used). 
If using chat template, encode with processor. @@ -289,28 +291,47 @@ def encode_prompt( ) content: List[Dict[str, Any]] = [] - for _ in range(max(0, image_count)): + for _ in range(max(0, len(images))): content.append({"type": "image"}) content.append({"type": "text", "text": prompt_text}) messages.append({"role": "user", "content": content}) rendered = self._processor.apply_chat_template( messages, tokenize=False, add_generation_prompt=True ) - ids = self._processor(text=[rendered], padding=True, return_tensors="pt")[ - "input_ids" - ] + + images_inputs = [] + for image in images: + image_obj = None + if isinstance(image, Image.Image): + image_obj = image.convert("RGB") + if isinstance(image, (bytes, bytearray)): + image_obj = Image.open(BytesIO(image)).convert("RGB") + images_inputs.append(image_obj) + + inputs = self._processor( + text=[rendered], images=images_inputs, padding=True, return_tensors="pt" + ) + inputs.pop("attention_mask") + inputs.pop("input_ids") + ids = self._processor( + text=[rendered], images=None, padding=True, return_tensors="pt" + )["input_ids"] if isinstance(ids, torch.Tensor): if ids.dim() == 2 and ids.size(0) == 1: ids = ids.squeeze(0) ids = ids.to(dtype=torch.long) else: ids = torch.tensor(ids, dtype=torch.long) - return ids, int(ids.numel()), rendered + + multi_modal_inputs = {} + for k, v in inputs.items(): + multi_modal_inputs[k] = v + return ids, int(ids.numel()), rendered, multi_modal_inputs else: # fallback: tokenizer only ids_list = self.tokenizer.encode(prompt_text) ids = torch.as_tensor(ids_list, dtype=torch.long) - return ids, int(ids.numel()), prompt_text + return ids, int(ids.numel()), prompt_text, {} def postprocess_dataset_item( self, item: DatasetItem, raw: Dict[str, Any] @@ -450,7 +471,9 @@ def _load_single_lazy(self, path: str, fmt: str, key: Any) -> Dict[str, Any]: def _process_raw_record(self, raw: Dict[str, Any], idx: int) -> DatasetItem: images = self.get_image_list(raw) prompt_text = self.build_prompt_text(raw) - prompt_ids, plen, rendered_text = self.encode_prompt(prompt_text, len(images)) + prompt_ids, plen, rendered_text, multi_modal_inputs = self.encode_prompt( + prompt_text, images + ) if plen > self.max_prompt_length: prompt_ids = prompt_ids[: self.max_prompt_length] @@ -470,6 +493,7 @@ def _process_raw_record(self, raw: Dict[str, Any], idx: int) -> DatasetItem: prompt_text=rendered_text or prompt_text, solution=solution_val, meta=None, + multi_modal_inputs=multi_modal_inputs, ) return self.postprocess_dataset_item(item, raw) @@ -543,7 +567,7 @@ def get_image_list(self, dataitem: Dict[str, Any]) -> List[Union[bytes, str, Non for v in images: if v is None: continue - if isinstance(v, Image): + if isinstance(v, Image.Image): normed.append(v) elif isinstance(v, dict) and "bytes" in v: normed.append(v["bytes"]) # raw bytes @@ -686,5 +710,8 @@ def collate_fn(data_list: List["DatasetItem"]) -> Dict[str, Any]: ], # List[Optional[List[bytes|str]]] "prompt_text": [it.prompt_text for it in data_list], # List[Optional[str]] "meta": [it.meta for it in data_list], # List[Optional[dict]] + "multi_modal_inputs": [ + it.multi_modal_inputs for it in data_list + ], # List[Optional[dict]] } return batch diff --git a/rlinf/data/io_struct.py b/rlinf/data/io_struct.py index 78459e548..2c07bc9b2 100644 --- a/rlinf/data/io_struct.py +++ b/rlinf/data/io_struct.py @@ -47,16 +47,70 @@ class RolloutRequest: Attr input_ids: List of input token IDs for rollout n: Number of completions to generate for each input - idx: List of unique identifiers for the requests, 
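# ---------------------------------------------------------------------------
# [Reviewer note - illustrative sketch, not part of the patch above]
# encode_prompt now calls the HF processor twice: once with images to get
# the vision tensors and once text-only to get the prompt token ids, popping
# input_ids/attention_mask so that only vision inputs (for Qwen2.5-VL these
# are pixel_values and image_grid_thw) travel as multi_modal_inputs.
# Stand-in check of that separation with fake tensors, no processor needed:
import torch

inputs = {
    "input_ids": torch.tensor([[1, 2, 3]]),
    "attention_mask": torch.tensor([[1, 1, 1]]),
    "pixel_values": torch.randn(4, 1176),          # made-up vision features
    "image_grid_thw": torch.tensor([[1, 2, 2]]),
}
inputs.pop("attention_mask")
inputs.pop("input_ids")
multi_modal_inputs = {k: v for k, v in inputs.items()}
assert set(multi_modal_inputs) == {"pixel_values", "image_grid_thw"}
# ---------------------------------------------------------------------------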
used for tracking - input_lengths: List of lengths of the input sequences, corresponding to input_ids image_data: list of image data (bytes or URLs) for multimodal inputs answers: Optional list of answers for the requests, if available + multi_modal_inputs: list of multi-modal inputs for the requests """ n: int input_ids: List[List[int]] - answers: List[str] image_data: Union[List[List[bytes]], List[List[str]]] + answers: List[str] + multi_modal_inputs: List[Dict] + + def repeat(self) -> "RolloutRequest": + """Repeat each input in the RolloutRequest a specified number of times. + + Args: + times (int): The number of times to repeat each input. + + Returns: + RolloutRequest: A new RolloutRequest with repeated inputs. + """ + assert self.n > 0, "n must be greater than 0" + + input_ids, answers = zip( + *[ + (input_id, answer) + for input_id, answer in zip(self.input_ids, self.answers) + for _ in range(self.n) + ] + ) + return RolloutRequest( + n=self.n, + input_ids=list(input_ids), + answers=list(answers), + ) + + def split(self, num_splits: int) -> List["RolloutRequest"]: + """Split the RolloutRequest into multiple smaller requests. + + Args: + num_splits (int): The number of splits to create. + + Returns: + List[RolloutRequest]: A list of smaller RolloutRequest instances. + """ + assert num_splits > 0, "num_splits must be greater than 0" + assert len(self.input_ids) % num_splits == 0, ( + f"Input IDs length {len(self.input_ids)} is not divisible by num_splits {num_splits}" + ) + + input_ids_split_list = split_list(self.input_ids, num_splits) + answers_split_list = split_list(self.answers, num_splits) + + splitted_requests = [] + for input_ids_batch, answers_batch in zip( + input_ids_split_list, answers_split_list + ): + request = RolloutRequest( + n=self.n, + input_ids=input_ids_batch, + answers=answers_batch, + ) + splitted_requests.append(request) + + return splitted_requests def repeat(self) -> "RolloutRequest": """Repeat each input in the RolloutRequest a specified number of times. 
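# ---------------------------------------------------------------------------
# [Reviewer note - illustrative sketch, not part of the patch above]
# The repeat()/split() pair added above relies on two invariants: repeat()
# keeps the n samples of each prompt adjacent (so one GRPO group stays
# contiguous), and split() assumes the flat list divides evenly across
# shards. A sketch of both, without the real split_list helper:
n = 2
input_ids = [[10], [20]]
repeated = [ids for ids in input_ids for _ in range(n)]
assert repeated == [[10], [10], [20], [20]]      # group-contiguous

num_splits = 2
assert len(repeated) % num_splits == 0           # mirrors the assert above
shard = len(repeated) // num_splits
shards = [repeated[i * shard:(i + 1) * shard] for i in range(num_splits)]
assert shards == [[[10], [10]], [[20], [20]]]
# ---------------------------------------------------------------------------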
@@ -115,19 +169,23 @@ def split(self, num_splits: int) -> List["RolloutRequest"]: def repeat_and_split( self, rollout_batch_size: Optional[int] = None ) -> List["RolloutRequest"]: - input_ids, answers, image_data = zip( + input_ids, answers, image_data, multi_modal_inputs = zip( *[ - (input_id, answer, image_data) - for input_id, answer, image_data in zip( - self.input_ids, self.answers, self.image_data + (input_id, answer, image_data, multi_modal_inputs) + for input_id, answer, image_data, multi_modal_inputs in zip( + self.input_ids, + self.answers, + self.image_data, + self.multi_modal_inputs, ) for _ in range(self.n) ] ) - input_ids, answers, image_data = ( + input_ids, answers, image_data, multi_modal_inputs = ( list(input_ids), list(answers), list(image_data), + list(multi_modal_inputs), ) # Split input ids based on rollout_batch_size_per_gpu @@ -143,15 +201,25 @@ def repeat_and_split( input_ids_split_list = split_list(input_ids, num_batches) answers_split_list = split_list(answers, num_batches) image_data_split_list = split_list(image_data, num_batches) - - for input_ids_batch, answers_batch, image_data_batch in zip( - input_ids_split_list, answers_split_list, image_data_split_list + multi_modal_inputs_split_list = split_list(multi_modal_inputs, num_batches) + + for ( + input_ids_batch, + answers_batch, + image_data_batch, + multi_modal_inputs_batch, + ) in zip( + input_ids_split_list, + answers_split_list, + image_data_split_list, + multi_modal_inputs_split_list, ): request = RolloutRequest( n=self.n, input_ids=input_ids_batch, answers=answers_batch, image_data=image_data_batch, + multi_modal_inputs=multi_modal_inputs_batch, ) splitted_requests.append(request) @@ -268,6 +336,7 @@ class RolloutResult: response_texts: Optional[List[str]] = None answers: Optional[List[str | dict]] = None image_data: Optional[Union[List[List[bytes]], List[List[str]]]] = None + multi_modal_inputs: Optional[List[dict]] = None # Inference # Only set when recompute_logprobs is False rollout_logprobs: Optional[List[List[float]]] = None @@ -320,6 +389,7 @@ def from_vllm_results( group_size: int, results: List[VllmRequestOutput], answers: Optional[List[str]] = None, + multi_modal_inputs: Optional[List[Dict]] = None, return_logprobs: bool = False, ) -> "RolloutResult": def get_logprobs( @@ -378,6 +448,7 @@ def get_logprobs( response_ids=response_ids, response_lengths=response_lengths, response_texts=response_texts, + multi_modal_inputs=multi_modal_inputs, is_end=is_end, ) if return_logprobs: @@ -391,6 +462,7 @@ def from_sglang_results( input_ids: List[List[int]], answers: Optional[List[List[int]]] = None, image_data: Optional[Union[List[List[bytes]], List[List[str]]]] = None, + multi_modal_inputs: Optional[List[Dict]] = None, return_logprobs: bool = False, ) -> "RolloutResult": """Create a MathRolloutResult from the given results and input IDs. 
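# ---------------------------------------------------------------------------
# [Reviewer note - illustrative sketch, not part of the patch above]
# repeat_and_split repeats four parallel lists in lockstep via zip(*) over a
# product-style comprehension, so ids/answers/image data/multi-modal inputs
# stay aligned per sample. Two-field check of the idiom:
n = 2
ids, ans = [1, 2], ["a", "b"]
ids_r, ans_r = zip(*[(i, a) for i, a in zip(ids, ans) for _ in range(n)])
assert list(ids_r) == [1, 1, 2, 2]
assert list(ans_r) == ["a", "a", "b", "b"]
# ---------------------------------------------------------------------------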
@@ -418,6 +490,7 @@ def from_sglang_results( response_ids=[res["output_ids"] for res in results], answers=answers, image_data=image_data, + multi_modal_inputs=multi_modal_inputs, is_end=[ res["meta_info"]["finish_reason"]["type"] == "stop" for res in results ], @@ -584,6 +657,12 @@ def _split_single_result_by_group( if rollout_result.image_data is not None: image_data_split = split_list(rollout_result.image_data, num_groups) + multi_modal_inputs_split = None + if rollout_result.multi_modal_inputs is not None: + multi_modal_inputs_split = split_list( + rollout_result.multi_modal_inputs, num_groups + ) + prompt_texts_split = None if rollout_result.prompt_texts is not None: prompt_texts_split = split_list(rollout_result.prompt_texts, num_groups) @@ -641,6 +720,9 @@ def _split_single_result_by_group( image_data=image_data_split[i] if image_data_split is not None else None, + multi_modal_inputs=multi_modal_inputs_split[i] + if multi_modal_inputs_split is not None + else None, prompt_texts=prompt_texts_split[i] if prompt_texts_split is not None else None, @@ -761,6 +843,12 @@ def to_actor_batch( "response_lengths": response_lengths.cuda(), } + if ( + self.multi_modal_inputs is not None + and self.multi_modal_inputs[0] is not None + ): + batch["multi_modal_inputs"] = self.multi_modal_inputs + if self.advantages is not None: if isinstance(self.advantages, torch.Tensor): batch["advantages"] = self.advantages.cuda() diff --git a/rlinf/runners/math_runner.py b/rlinf/runners/reasoning_runner.py similarity index 97% rename from rlinf/runners/math_runner.py rename to rlinf/runners/reasoning_runner.py index dca52c0c0..404154fe2 100644 --- a/rlinf/runners/math_runner.py +++ b/rlinf/runners/reasoning_runner.py @@ -44,8 +44,8 @@ logging.getLogger().setLevel(logging.INFO) -class MathRunner: - """Runner for math model training.""" +class ReasoningRunner: + """Runner for reasoning task RL training.""" def __init__( self, @@ -277,19 +277,24 @@ def _put_batch(self, batch: Dict[str, torch.Tensor]): lengths = batch["length"].tolist() answers = batch["answer"] image_data = batch["image_data"] - prompts = [ids[-pmp_len:] for ids, pmp_len in zip(prompt_ids, lengths)] + multi_modal_inputs = batch["multi_modal_inputs"] + prompt_ids = [ids[-pmp_len:] for ids, pmp_len in zip(prompt_ids, lengths)] rollout_dp_size = self.component_placement.rollout_dp_size - for input_ids, answers, image_data in zip( - split_list(prompts, rollout_dp_size, enforce_divisible_batch=False), + for input_ids, answers, image_data, multi_modal_inputs in zip( + split_list(prompt_ids, rollout_dp_size, enforce_divisible_batch=False), split_list(answers, rollout_dp_size, enforce_divisible_batch=False), split_list(image_data, rollout_dp_size, enforce_divisible_batch=False), + split_list( + multi_modal_inputs, rollout_dp_size, enforce_divisible_batch=False + ), ): request = RolloutRequest( n=self.cfg.algorithm.group_size, input_ids=input_ids, answers=answers, image_data=image_data, + multi_modal_inputs=multi_modal_inputs, ) self.dataloader_channel.put(request, async_op=True) diff --git a/rlinf/utils/placement.py b/rlinf/utils/placement.py index d6fa798c8..6ecea767f 100644 --- a/rlinf/utils/placement.py +++ b/rlinf/utils/placement.py @@ -225,7 +225,7 @@ def __init__(self, config: DictConfig, cluster: Cluster): len(self._inference_gpus) if self._inference_gpus else 0 ) self._rollout_num_gpus = len(self._rollout_gpus) - self._reward_num_gpus = len(self._reward_gpus) + self._reward_num_gpus = len(self._reward_gpus) if self._reward_gpus else 0 if 
self._is_collocated(): assert self._inference_gpus is None, ( @@ -279,22 +279,19 @@ def _generate_placements(self): self._actor_gpus[0], self._actor_gpus[-1] ) - actor_tp_size = self._config.actor.model.tensor_model_parallel_size - rollout_tp_size = self._config.rollout.tensor_parallel_size - if actor_tp_size > rollout_tp_size: - assert actor_tp_size % rollout_tp_size == 0, ( - f"Actor TP size ({actor_tp_size}) must be divisible by Rollout TP size ({rollout_tp_size})" + if self.actor_tp_size > self.rollout_tp_size: + assert self.actor_tp_size % self.rollout_tp_size == 0, ( + f"Actor TP size ({self.actor_tp_size}) must be divisible by Rollout TP size ({self.rollout_tp_size})" ) stride = ( self.actor_tp_size // self.rollout_tp_size if self.actor_tp_size > self.rollout_tp_size else 1 ) - stride = actor_tp_size // rollout_tp_size self._placements["rollout"] = PackedPlacementStrategy( self._rollout_gpus[0], self._rollout_gpus[-1], - num_accelerators_per_process=rollout_tp_size, + num_accelerators_per_process=self.rollout_tp_size, stride=stride, ) self._placements["reward"] = PackedPlacementStrategy( @@ -396,3 +393,7 @@ def rollout_tp_size(self) -> int: @property def rollout_world_size(self) -> int: return self._rollout_num_gpus + + @property + def reward_world_size(self) -> int: + return self._reward_num_gpus diff --git a/rlinf/workers/actor/fsdp_actor_worker.py b/rlinf/workers/actor/fsdp_actor_worker.py index 06c28797d..2604c0bb0 100644 --- a/rlinf/workers/actor/fsdp_actor_worker.py +++ b/rlinf/workers/actor/fsdp_actor_worker.py @@ -198,7 +198,7 @@ def put_result(self, result: RolloutResult, channel: Channel): def _load_weight_and_optimizer(self, channel: Channel): # Acquire the GPUs to ensure that no one is using them before loading models # Otherwise, it may lead to OOM - with channel.gpu_lock: + with channel.device_lock: if self.cfg.actor.get("enable_offload", False): self.load_fsdp_param_and_grad(self.device) self.load_fsdp_optimizer(self.device) @@ -234,30 +234,18 @@ def run_training(self, input_channel: Channel): == 0 ) - self.gradient_accumulation = ( - self.cfg.actor.global_batch_size - // self.cfg.actor.micro_batch_size - // self._world_size - ) - training_metrics_list = [] # Global batch iterations with self.worker_timer(): for global_batch in global_batches: train_global_batch_size = global_batch["input_ids"].shape[0] - assert ( - train_global_batch_size - == self.cfg.actor.global_batch_size - // torch.distributed.get_world_size() - ) + assert train_global_batch_size % self.cfg.actor.micro_batch_size == 0, ( - f"{train_global_batch_size=}, {self.cfg.actor.micro_batch_size}" + f"{train_global_batch_size=}, {self.cfg.actor.micro_batch_size=}" ) self.gradient_accumulation = ( - self.cfg.actor.global_batch_size - // self.cfg.actor.micro_batch_size - // self._world_size + train_global_batch_size // self.cfg.actor.micro_batch_size ) # split batch into micro_batches train_micro_batches = get_iterator_k_split( @@ -269,27 +257,18 @@ def run_training(self, input_channel: Channel): metrics = {} for _, m_batch in enumerate(train_micro_batches): for k, v in m_batch.items(): - m_batch[k] = v.to(f"cuda:{int(os.environ['LOCAL_RANK'])}") + m_batch[k] = v.cuda() if isinstance(v, torch.Tensor) else v multi_modal_inputs = {} if "multi_modal_inputs" in m_batch.keys(): - if ( - "image_bound" in m_batch["multi_modal_inputs"][0] - ): # minicpm-o logic - for key in m_batch["multi_modal_inputs"][0].keys(): - multi_modal_inputs[key] = [ + for key in m_batch["multi_modal_inputs"][0].keys(): + 
multi_modal_inputs[key] = torch.cat( + [ inputs[key] for inputs in m_batch["multi_modal_inputs"] - ] - else: - for key in m_batch["multi_modal_inputs"][0].keys(): - multi_modal_inputs[key] = torch.cat( - [ - inputs[key] - for inputs in m_batch["multi_modal_inputs"] - ], - dim=0, - ) + ], + dim=0, + ).cuda() input_ids = m_batch["input_ids"] attention_mask = m_batch["attention_mask"] @@ -308,7 +287,7 @@ def run_training(self, input_channel: Channel): position_ids=position_ids, **multi_modal_inputs, use_cache=False, - ) # prevent model thinks we are generating + ) logits = output.logits diff --git a/rlinf/workers/rollout/sglang/sglang_worker.py b/rlinf/workers/rollout/sglang/sglang_worker.py index 41f026879..764702c63 100644 --- a/rlinf/workers/rollout/sglang/sglang_worker.py +++ b/rlinf/workers/rollout/sglang/sglang_worker.py @@ -193,6 +193,7 @@ def rollout(self, input_channel: Channel, output_channel: Channel): request.input_ids, request.answers, request.image_data, + request.multi_modal_inputs, self._return_logprobs, ) rollout_results.append(rollout_result) diff --git a/rlinf/workers/rollout/vllm/vllm_worker.py b/rlinf/workers/rollout/vllm/vllm_worker.py index d5ecd11fc..54c1c7ee5 100644 --- a/rlinf/workers/rollout/vllm/vllm_worker.py +++ b/rlinf/workers/rollout/vllm/vllm_worker.py @@ -399,6 +399,7 @@ async def rollout_and_return( group_size=self._cfg.algorithm.group_size, results=vllm_results, answers=request.answers, + multi_modal_inputs=request.multi_modal_inputs, return_logprobs=self._return_logprobs, ) if self._placement.is_disaggregated: From a04f2d0ac8c44a85da8d6ab2b0bc35cb6ca66861 Mon Sep 17 00:00:00 2001 From: Bo Dai Date: Mon, 29 Sep 2025 03:51:36 +0000 Subject: [PATCH 32/57] fix(math): fix some bugs when running math model Signed-off-by: Bo Dai --- .../config/math/qwen2.5-1.5b-grpo-fsdp.yaml | 33 +++++++++++-------- rlinf/algorithms/rewards/math/__init__.py | 5 +-- rlinf/runners/reasoning_runner.py | 12 +++---- rlinf/workers/actor/fsdp_actor_worker.py | 14 +------- 4 files changed, 27 insertions(+), 37 deletions(-) diff --git a/examples/reasoning/config/math/qwen2.5-1.5b-grpo-fsdp.yaml b/examples/reasoning/config/math/qwen2.5-1.5b-grpo-fsdp.yaml index e17486fe2..c4c646808 100644 --- a/examples/reasoning/config/math/qwen2.5-1.5b-grpo-fsdp.yaml +++ b/examples/reasoning/config/math/qwen2.5-1.5b-grpo-fsdp.yaml @@ -9,8 +9,7 @@ hydra: cluster: num_nodes: 1 component_placement: - actor: 0-3 - rollout: 4-7 + actor,rollout,reward: all runner: task_type: reasoning @@ -33,7 +32,7 @@ runner: resume_dir: null experiment_name: grpo-1.5b - output_dir: /mnt/public/daibo/results + output_dir: ../results algorithm: group_size: 8 @@ -84,7 +83,7 @@ rollout: gpu_memory_utilization: 0.55 - model_dir: /mnt/public/hf_models/qwen2.5-VL-3B/ + model_dir: /path/to/model/DeepSeek-R1-Distill-Qwen-1.5B/ model_arch: qwen2.5 enforce_eager: False # if False, rollout engine will capture cuda graph, which will take more time to initialize. distributed_executor_backend: mp # ray or mp @@ -120,8 +119,8 @@ rollout: cuda_graph_max_bs: 128 # the maximum batch size for cuda graph. If the batch size is larger than this, cuda graph will not be used. 
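+# NOTE: data.type selects the dataset family; the RLinf dataset factory
+# currently recognizes "math" and "vision_language".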
data: - type: vision_language - dataset_name: robo2vlm + type: math + dataset_name: boba max_prompt_length: 1024 filter_prompt_by_length: True rollout_batch_size: 8 @@ -130,9 +129,9 @@ data: shuffle: True validation_shuffle: True seed: 1234 - train_data_paths: ["/mnt/public/daibo/dataset/robo2vlm-1/data/"] - val_data_paths: ["/mnt/public/daibo/dataset/robo2vlm-1/data/"] - prompt_key: question + train_data_paths: ["/dataset/boba/AReaL-boba-106k.jsonl"] + val_data_paths: ["/dataset/boba/AReaL-boba-106k.jsonl"] + prompt_key: prompt image_keys: [image] answer_key: answer choice_key: choices @@ -165,7 +164,7 @@ actor: seq_length: ${runner.seq_length} encoder_seq_length: ${runner.seq_length} - model_path: /mnt/public/hf_models/qwen2.5-VL-3B/ + model_path: /path/to/model/DeepSeek-R1-Distill-Qwen-1.5B/ optim: optimizer: adam @@ -195,20 +194,26 @@ actor: lr_decay_iters: 10 tokenizer: - tokenizer_model: /mnt/public/hf_models/qwen2.5-VL-3B/ + tokenizer_model: /path/to/model/DeepSeek-R1-Distill-Qwen-1.5B/ use_fast: False trust_remote_code: True padding_side: 'right' reward: - group_name: "ActorGroup" + group_name: "RewardGroup" use_reward_model: false - reward_type: 'vqa' - # reward_scale: 5.0 + reward_type: 'math' + reward_scale: 5.0 reward_weights: qa_accuracy: 1.0 think_format: 0.0 answer_format: 0.0 + tokenizer: + tokenizer_model: ${actor.tokenizer.tokenizer_model} + use_fast: ${actor.tokenizer.use_fast} + trust_remote_code: ${actor.tokenizer.trust_remote_code} + padding_side: ${actor.tokenizer.padding_side} + critic: use_critic_model: false \ No newline at end of file diff --git a/rlinf/algorithms/rewards/math/__init__.py b/rlinf/algorithms/rewards/math/__init__.py index 7eb6401a8..1a67e80e1 100644 --- a/rlinf/algorithms/rewards/math/__init__.py +++ b/rlinf/algorithms/rewards/math/__init__.py @@ -21,7 +21,7 @@ class MathReward: def __init__(self, config: DictConfig): - self.scale = config.get("scale", 1.0) + self.scale = config.get("reward_scale", 1.0) def get_reward( self, response: List[str], reference: List[List[str]] @@ -37,4 +37,5 @@ def get_reward( List[float]: A list of reward scores, one for each response. 
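+
+        Example:
+            With ``reward_scale`` set to 5.0, verifier outputs of ``[1, 0]``
+            are scaled to rewards of ``[5.0, 0.0]``.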
""" - return math_verify_call(response, reference) * self.scale + rewards = math_verify_call(response, reference) + return [float(reward) * self.scale for reward in rewards] diff --git a/rlinf/runners/reasoning_runner.py b/rlinf/runners/reasoning_runner.py index 404154fe2..1d2bf64d9 100644 --- a/rlinf/runners/reasoning_runner.py +++ b/rlinf/runners/reasoning_runner.py @@ -56,21 +56,20 @@ def __init__( rollout: Union["SGLangWorker", "VLLMWorker"], inference: Optional[MegatronInference], actor: MegatronActor, - reward: Optional[RewardWorker] = None, + reward: RewardWorker, ): """""" self.cfg = cfg self.component_placement = placement self.is_pipeline = self.component_placement.is_disaggregated self.has_dedicated_inference = inference is not None - self.has_dedicated_reward = reward is not None # Workers self.rollout = rollout self.actor = actor # Collocated mode uses actor as inference self.inference = inference if self.has_dedicated_inference else self.actor - self.reward = reward if self.has_dedicated_reward else self.actor + self.reward = reward # Data channels self.dataloader_channel = Channel.create("DataLoader") @@ -80,9 +79,7 @@ def __init__( self.inference_channel = Channel.create( "Inference", local=not self.has_dedicated_inference ) - self.reward_channel = Channel.create( - "Reward", local=not self.has_dedicated_reward - ) + self.reward_channel = Channel.create("Reward") self.actor_channel = Channel.create("Actor", local=True) # Configurations @@ -180,8 +177,7 @@ def init_workers(self): self.actor.init_worker().wait() if self.has_dedicated_inference: self.inference.init_worker().wait() - if self.has_dedicated_reward: - self.reward.init_worker().wait() + self.reward.init_worker().wait() if self.cfg.runner.resume_dir is None: return diff --git a/rlinf/workers/actor/fsdp_actor_worker.py b/rlinf/workers/actor/fsdp_actor_worker.py index 2604c0bb0..a26c03c16 100644 --- a/rlinf/workers/actor/fsdp_actor_worker.py +++ b/rlinf/workers/actor/fsdp_actor_worker.py @@ -147,19 +147,7 @@ def sync_model_to_rollout(self): self.rollou_state_dict = self.get_model_state_dict() if self._weight_dst_rank_in_rollout is not None: - - def transform_key(k): - if k.startswith("model.language_model."): - return "model." 
+ k[21:]
-            elif k.startswith("model."):
-                return k[6:]
-            else:
-                return k
-
-            handle = {
-                transform_key(k): reduce_tensor(v)
-                for k, v in self.rollou_state_dict.items()
-            }
+            handle = {k: reduce_tensor(v) for k, v in self.rollou_state_dict.items()}

             self.send(
                 handle, self._rollout_group_name, self._weight_dst_rank_in_rollout

From a7df8fc21fb9c2422a06f5a5ffc4ad5be51376b Mon Sep 17 00:00:00 2001
From: Bo Dai
Date: Mon, 29 Sep 2025 07:59:01 +0000
Subject: [PATCH 33/57] fix(math): fix merge_batch when item is not a tensor;
 add support for special key prefixes

Signed-off-by: Bo Dai
---
 rlinf/workers/actor/fsdp_actor_worker.py | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/rlinf/workers/actor/fsdp_actor_worker.py b/rlinf/workers/actor/fsdp_actor_worker.py
index a26c03c16..ccf3bc68e 100644
--- a/rlinf/workers/actor/fsdp_actor_worker.py
+++ b/rlinf/workers/actor/fsdp_actor_worker.py
@@ -138,19 +138,30 @@ def _setup_rollout_weight_dst_ranks(self):

     def del_reshard_state_dict(self):
-        if hasattr(self, "rollou_state_dict"):
-            del self.rollou_state_dict
+        if hasattr(self, "rollout_state_dict"):
+            del self.rollout_state_dict

     def sync_model_to_rollout(self):
         if next(self.model.parameters()).is_cpu:
             self.load_fsdp_param_and_grad(self.device)
-        self.rollou_state_dict = self.get_model_state_dict()
+        self.rollout_state_dict = self.get_model_state_dict()
+
+        has_visual = any("visual." in k for k in self.rollout_state_dict.keys())
+
+        state_dict = {}

         if self._weight_dst_rank_in_rollout is not None:
-            handle = {k: reduce_tensor(v) for k, v in self.rollou_state_dict.items()}
+            for k, v in self.rollout_state_dict.items():
+                name = k
+                if has_visual:
+                    if name.startswith("model.language_model."):
+                        name = "model." + name[21:]
+                    elif name.startswith("model."):
+                        name = name[6:]
+                state_dict[name] = reduce_tensor(v)

             self.send(
-                handle, self._rollout_group_name, self._weight_dst_rank_in_rollout
+                state_dict, self._rollout_group_name, self._weight_dst_rank_in_rollout
             )

         if self.cfg.actor.get("enable_offload", False):
             self.offload_fsdp_param_and_grad()

From 14cbdf0bb387848ebc334d673631bb5c50390963 Mon Sep 17 00:00:00 2001
From: Bo Dai
Date: Mon, 29 Sep 2025 08:46:48 +0000
Subject: [PATCH 34/57] chore: update yaml configs for RewardModel and other
 configuration changes

Signed-off-by: Bo Dai
---
 .../qwen2.5-1.5b-grpo-megatron-pipeline.yaml  | 11 +++++++++-
 .../math/qwen2.5-1.5b-grpo-megatron.yaml      | 20 ++++++++++++++++---
 .../config/math/qwen2.5-1.5b-single-gpu.yaml  | 11 ++++++++--
 .../math/qwen2.5-32b-grpo-megatron.yaml       | 10 ++++++++--
 .../config/math/qwen2.5-7b-grpo-megatron.yaml | 11 ++++++++--
 rlinf/data/io_struct.py                       | 16 ++++++++-------
 6 files changed, 62 insertions(+), 17 deletions(-)

diff --git a/examples/reasoning/config/math/qwen2.5-1.5b-grpo-megatron-pipeline.yaml b/examples/reasoning/config/math/qwen2.5-1.5b-grpo-megatron-pipeline.yaml
index 9ba641d15..0815011f3 100644
--- a/examples/reasoning/config/math/qwen2.5-1.5b-grpo-megatron-pipeline.yaml
+++ b/examples/reasoning/config/math/qwen2.5-1.5b-grpo-megatron-pipeline.yaml
@@ -12,9 +12,10 @@ cluster:
     rollout: 0-15
     inference: 16-23
     actor: 24-63
+    reward: 0-15

 runner:
-  task_type: math
+  task_type: reasoning
   logger:
     log_path: ${runner.output_dir}/${runner.experiment_name}
     project_name: rlinf
@@ -134,6 +135,7 @@ rollout:

 data:
   type: math
+  dataset_name: boba
   max_prompt_length: 1024
   filter_prompt_by_length: True
   rollout_batch_size: 512
@@ -146,6 +148,7 @@ data:
   train_data_paths: ["/dataset/boba/AReaL-boba-106k.jsonl"]
   val_data_paths:
["/dataset/boba/AReaL-boba-106k.jsonl"] + actor: group_name: "ActorGroup" training_backend: megatron @@ -271,9 +274,15 @@ actor: reward: + group_name: "RewardGroup" use_reward_model: false reward_type: 'math' reward_scale: 5.0 + tokenizer: + tokenizer_model: ${actor.tokenizer.tokenizer_model} + use_fast: ${actor.tokenizer.use_fast} + trust_remote_code: ${actor.tokenizer.trust_remote_code} + padding_side: ${actor.tokenizer.padding_side} critic: use_critic_model: false diff --git a/examples/reasoning/config/math/qwen2.5-1.5b-grpo-megatron.yaml b/examples/reasoning/config/math/qwen2.5-1.5b-grpo-megatron.yaml index 6f75dc38e..28a5ca960 100644 --- a/examples/reasoning/config/math/qwen2.5-1.5b-grpo-megatron.yaml +++ b/examples/reasoning/config/math/qwen2.5-1.5b-grpo-megatron.yaml @@ -9,10 +9,10 @@ hydra: cluster: num_nodes: 16 component_placement: - actor,rollout: all + actor,rollout,reward: all runner: - task_type: math + task_type: reasoning logger: log_path: ${runner.output_dir}/${runner.experiment_name} project_name: rlinf @@ -121,17 +121,24 @@ rollout: data: type: math + dataset_name: boba max_prompt_length: 1024 filter_prompt_by_length: True rollout_batch_size: 512 val_rollout_batch_size: null num_workers: 2 - prompt_key: prompt shuffle: True validation_shuffle: True seed: 1234 train_data_paths: ["/dataset/boba/AReaL-boba-106k.jsonl"] val_data_paths: ["/dataset/boba/AReaL-boba-106k.jsonl"] + prompt_key: prompt + image_keys: [image] + answer_key: answer + choice_key: choices + solution_key: null + use_chat_template: True + lazy_loading: True actor: group_name: "ActorGroup" @@ -258,9 +265,16 @@ actor: reward: + group_name: "RewardGroup" use_reward_model: false reward_type: 'math' reward_scale: 5.0 + tokenizer: + tokenizer_model: ${actor.tokenizer.tokenizer_model} + use_fast: ${actor.tokenizer.use_fast} + trust_remote_code: ${actor.tokenizer.trust_remote_code} + padding_side: ${actor.tokenizer.padding_side} + critic: use_critic_model: false \ No newline at end of file diff --git a/examples/reasoning/config/math/qwen2.5-1.5b-single-gpu.yaml b/examples/reasoning/config/math/qwen2.5-1.5b-single-gpu.yaml index e3f5bf28d..d654c6522 100644 --- a/examples/reasoning/config/math/qwen2.5-1.5b-single-gpu.yaml +++ b/examples/reasoning/config/math/qwen2.5-1.5b-single-gpu.yaml @@ -9,10 +9,10 @@ hydra: cluster: num_nodes: 1 component_placement: - actor,rollout: 0 + actor,rollout,reward: 0 runner: - task_type: math + task_type: reasoning logger: log_path: ${runner.output_dir}/${runner.experiment_name} project_name: rlinf @@ -258,9 +258,16 @@ actor: reward: + group_name: "RewardGroup" use_reward_model: false reward_type: 'math' reward_scale: 5.0 + tokenizer: + tokenizer_model: ${actor.tokenizer.tokenizer_model} + use_fast: ${actor.tokenizer.use_fast} + trust_remote_code: ${actor.tokenizer.trust_remote_code} + padding_side: ${actor.tokenizer.padding_side} + critic: use_critic_model: false \ No newline at end of file diff --git a/examples/reasoning/config/math/qwen2.5-32b-grpo-megatron.yaml b/examples/reasoning/config/math/qwen2.5-32b-grpo-megatron.yaml index 6e397dfda..f7fb2e16c 100644 --- a/examples/reasoning/config/math/qwen2.5-32b-grpo-megatron.yaml +++ b/examples/reasoning/config/math/qwen2.5-32b-grpo-megatron.yaml @@ -9,10 +9,10 @@ hydra: cluster: num_nodes: 32 component_placement: - actor,rollout: all + actor,rollout,reward: all runner: - task_type: math + task_type: reasoning logger: log_path: ${runner.output_dir}/${runner.experiment_name} project_name: rlinf @@ -259,5 +259,11 @@ reward: reward_type: 
'math' reward_scale: 5.0 + tokenizer: + tokenizer_model: ${actor.tokenizer.tokenizer_model} + use_fast: ${actor.tokenizer.use_fast} + trust_remote_code: ${actor.tokenizer.trust_remote_code} + padding_side: ${actor.tokenizer.padding_side} + critic: use_critic_model: false \ No newline at end of file diff --git a/examples/reasoning/config/math/qwen2.5-7b-grpo-megatron.yaml b/examples/reasoning/config/math/qwen2.5-7b-grpo-megatron.yaml index 63146687e..5f33d9cb2 100644 --- a/examples/reasoning/config/math/qwen2.5-7b-grpo-megatron.yaml +++ b/examples/reasoning/config/math/qwen2.5-7b-grpo-megatron.yaml @@ -9,10 +9,10 @@ hydra: cluster: num_nodes: 16 component_placement: - actor,rollout: all + actor,rollout,reward: all runner: - task_type: math + task_type: reasoning logger: log_path: ${runner.output_dir}/${runner.experiment_name} project_name: rlinf @@ -257,9 +257,16 @@ actor: reward: + group_name: "RewardGroup" use_reward_model: false reward_type: 'math' reward_scale: 5.0 + tokenizer: + tokenizer_model: ${actor.tokenizer.tokenizer_model} + use_fast: ${actor.tokenizer.use_fast} + trust_remote_code: ${actor.tokenizer.trust_remote_code} + padding_side: ${actor.tokenizer.padding_side} + critic: use_critic_model: false \ No newline at end of file diff --git a/rlinf/data/io_struct.py b/rlinf/data/io_struct.py index 2c07bc9b2..ea78685b5 100644 --- a/rlinf/data/io_struct.py +++ b/rlinf/data/io_struct.py @@ -894,14 +894,16 @@ def merge_batches( return merged_batch if len(batches) == 1: return batches[0] + for key in batches[0].keys(): - assert torch.is_tensor(batches[0][key]), ( - f"Expected tensor for key {key} in batches, got {type(batches[0][key])}" - ) - assert torch.is_tensor(batches[0][key]), ( - f"Expected tensor for key {key} in batches, got {type(batches[0][key])}" - ) - merged_batch[key] = torch.cat([batch[key] for batch in batches], dim=0) + if torch.is_tensor(batches[0][key]): + merged_batch[key] = torch.cat([batch[key] for batch in batches], dim=0) + elif isinstance(batches[0][key], list): + merged_batch[key] = [] + for batch in batches: + merged_batch[key].extend(batch[key]) + else: + raise ValueError(f"Unsupported batch key type: {type(batches[0][key])}") return merged_batch From 124598874d4eb11bd163ee154a92a41695ca1aea Mon Sep 17 00:00:00 2001 From: Bo Dai Date: Tue, 30 Sep 2025 04:05:18 +0000 Subject: [PATCH 35/57] fix(megatron): apply corresponding changes due to fsdp Signed-off-by: Bo Dai --- .../qwen2.5-1.5b-grpo-megatron-pipeline.yaml | 2 + .../math/qwen2.5-1.5b-grpo-megatron.yaml | 30 +++++++-------- .../config/math/qwen2.5-1.5b-single-gpu.yaml | 2 + .../math/qwen2.5-32b-grpo-megatron.yaml | 2 + .../config/math/qwen2.5-7b-grpo-megatron.yaml | 2 + rlinf/algorithms/losses.py | 12 +++--- rlinf/runners/reasoning_runner.py | 29 ++++++--------- rlinf/workers/actor/megatron_actor_worker.py | 24 +++++++++--- rlinf/workers/reward/reward_worker.py | 3 +- rlinf/workers/rollout/sglang/sglang_worker.py | 37 +++++++++---------- 10 files changed, 75 insertions(+), 68 deletions(-) diff --git a/examples/reasoning/config/math/qwen2.5-1.5b-grpo-megatron-pipeline.yaml b/examples/reasoning/config/math/qwen2.5-1.5b-grpo-megatron-pipeline.yaml index 0815011f3..ea4606003 100644 --- a/examples/reasoning/config/math/qwen2.5-1.5b-grpo-megatron-pipeline.yaml +++ b/examples/reasoning/config/math/qwen2.5-1.5b-grpo-megatron-pipeline.yaml @@ -66,6 +66,8 @@ algorithm: entropy_bonus: 0.0 calculate_entropy: True clip_ratio_c: null # 3.0 + clip_ratio_low: null # if null or not set, will use ratio_clip_eps + 
clip_ratio_high: null # if null or not set, will use ratio_clip_eps adv_type: math_grpo normalize_advantages: False diff --git a/examples/reasoning/config/math/qwen2.5-1.5b-grpo-megatron.yaml b/examples/reasoning/config/math/qwen2.5-1.5b-grpo-megatron.yaml index 28a5ca960..63e972b3a 100644 --- a/examples/reasoning/config/math/qwen2.5-1.5b-grpo-megatron.yaml +++ b/examples/reasoning/config/math/qwen2.5-1.5b-grpo-megatron.yaml @@ -7,7 +7,7 @@ hydra: output_subdir: null cluster: - num_nodes: 16 + num_nodes: 1 component_placement: actor,rollout,reward: all @@ -25,14 +25,14 @@ runner: val_check_interval: 1 save_interval: 50 - seq_length: 28672 + seq_length: 10240 enable_dynamic_batch_size: False max_tokens_per_mbs: 28672 resume_dir: null - experiment_name: grpo-1.5b - output_dir: ../results + experiment_name: megatron-vllm-1.5b-math-test + output_dir: /mnt/public/daibo/results algorithm: group_size: 16 @@ -61,6 +61,8 @@ algorithm: entropy_bonus: 0.0 calculate_entropy: False clip_ratio_c: null # 3.0 + clip_ratio_low: null # if null or not set, will use ratio_clip_eps + clip_ratio_high: null # if null or not set, will use ratio_clip_eps adv_type: math_grpo normalize_advantages: True @@ -82,7 +84,7 @@ rollout: gpu_memory_utilization: 0.55 - model_dir: /path/to/model/DeepSeek-R1-Distill-Qwen-1.5B/ + model_dir: /mnt/public/hf_models/DeepSeek-R1-Distill-Qwen-1.5B/ model_arch: qwen2.5 enforce_eager: False # if False, rollout engine will capture cuda graph, which will take more time to initialize. distributed_executor_backend: mp # ray or mp @@ -91,7 +93,7 @@ rollout: padding: null # will be tokenizer.pad_token_id if null. it is used to filter megatron's padding for rollout engine eos: null # will be tokenizer.eos_token_id if null. - rollout_backend: sglang # here choose which backend to rollout,support [sglang, vllm] + rollout_backend: vllm # here choose which backend to rollout,support [sglang, vllm] sglang: attention_backend: triton # [flashinfer, triton] for more, see sglang's doc @@ -124,21 +126,15 @@ data: dataset_name: boba max_prompt_length: 1024 filter_prompt_by_length: True - rollout_batch_size: 512 + rollout_batch_size: 8 val_rollout_batch_size: null num_workers: 2 + prompt_key: prompt shuffle: True validation_shuffle: True seed: 1234 - train_data_paths: ["/dataset/boba/AReaL-boba-106k.jsonl"] - val_data_paths: ["/dataset/boba/AReaL-boba-106k.jsonl"] - prompt_key: prompt - image_keys: [image] - answer_key: answer - choice_key: choices - solution_key: null - use_chat_template: True - lazy_loading: True + train_data_paths: ["/mnt/public/daibo/dataset/boba_106k_0319_prompt_1024.jsonl"] + val_data_paths: ["/mnt/public/daibo/dataset/boba_106k_0319_prompt_1024.jsonl"] actor: group_name: "ActorGroup" @@ -220,7 +216,7 @@ actor: lr_decay_iters: 10 tokenizer: - tokenizer_model: /path/to/model/DeepSeek-R1-Distill-Qwen-1.5B/ + tokenizer_model: /mnt/public/hf_models/DeepSeek-R1-Distill-Qwen-1.5B/ use_fast: False trust_remote_code: True padding_side: 'right' diff --git a/examples/reasoning/config/math/qwen2.5-1.5b-single-gpu.yaml b/examples/reasoning/config/math/qwen2.5-1.5b-single-gpu.yaml index d654c6522..1829050c1 100644 --- a/examples/reasoning/config/math/qwen2.5-1.5b-single-gpu.yaml +++ b/examples/reasoning/config/math/qwen2.5-1.5b-single-gpu.yaml @@ -61,6 +61,8 @@ algorithm: entropy_bonus: 0.0 calculate_entropy: False clip_ratio_c: null # 3.0 + clip_ratio_low: null # if null or not set, will use ratio_clip_eps + clip_ratio_high: null # if null or not set, will use ratio_clip_eps adv_type: math_grpo 
normalize_advantages: True
diff --git a/examples/reasoning/config/math/qwen2.5-32b-grpo-megatron.yaml b/examples/reasoning/config/math/qwen2.5-32b-grpo-megatron.yaml
index f7fb2e16c..e9eb2089e 100644
--- a/examples/reasoning/config/math/qwen2.5-32b-grpo-megatron.yaml
+++ b/examples/reasoning/config/math/qwen2.5-32b-grpo-megatron.yaml
@@ -61,6 +61,8 @@ algorithm:
   entropy_bonus: 0.0
   calculate_entropy: False
   clip_ratio_c: null # 3.0
+  clip_ratio_low: null # if null or not set, will use ratio_clip_eps
+  clip_ratio_high: null # if null or not set, will use ratio_clip_eps

   adv_type: math_grpo
   normalize_advantages: True
diff --git a/examples/reasoning/config/math/qwen2.5-7b-grpo-megatron.yaml b/examples/reasoning/config/math/qwen2.5-7b-grpo-megatron.yaml
index 63146687e..b2a70d6f8 100644
--- a/examples/reasoning/config/math/qwen2.5-7b-grpo-megatron.yaml
+++ b/examples/reasoning/config/math/qwen2.5-7b-grpo-megatron.yaml
@@ -61,6 +61,8 @@ algorithm:
   entropy_bonus: 0.0
   calculate_entropy: False
   clip_ratio_c: null # 3.0
+  clip_ratio_low: null # if null or not set, will use ratio_clip_eps
+  clip_ratio_high: null # if null or not set, will use ratio_clip_eps

   adv_type: math_grpo
   normalize_advantages: True
diff --git a/rlinf/algorithms/losses.py b/rlinf/algorithms/losses.py
index 798e5330f..1d66885ea 100644
--- a/rlinf/algorithms/losses.py
+++ b/rlinf/algorithms/losses.py
@@ -240,12 +240,12 @@ def compute_math_ppo_actor_loss(**kwargs):

     # Compile metrics for logging
     metrics_data = {
-        "policy_loss": masked_mean(policy_loss.detach(), loss_mask).cpu(),
-        "ratio": masked_mean(ratio.detach(), loss_mask).cpu(),
-        "clipped_ratio": masked_mean(clipped_ratio.detach(), loss_mask).cpu(),
-        "dual_cliped_ratio": masked_mean(dual_cliped_ratio.detach(), loss_mask).cpu(),
-        "approx_kl": approx_kl.detach().cpu(),
-        "clip_fraction": clip_fraction.detach().cpu(),
+        "policy_loss": masked_mean(policy_loss.detach(), loss_mask),
+        "ratio": masked_mean(ratio.detach(), loss_mask),
+        "clipped_ratio": masked_mean(clipped_ratio.detach(), loss_mask),
+        "dual_cliped_ratio": masked_mean(dual_cliped_ratio.detach(), loss_mask),
+        "approx_kl": approx_kl.detach(),
+        "clip_fraction": clip_fraction.detach(),
     }

     return policy_loss, metrics_data
diff --git a/rlinf/runners/reasoning_runner.py b/rlinf/runners/reasoning_runner.py
index 1d2bf64d9..d68abf72a 100644
--- a/rlinf/runners/reasoning_runner.py
+++ b/rlinf/runners/reasoning_runner.py
@@ -76,9 +76,5 @@ def __init__(
         self.rollout_channel = Channel.create("Rollout")
-        # Create a local channel (i.e., a channel that is different in every process)
-        # if inference is not a dedicated worker
-        self.inference_channel = Channel.create(
-            "Inference", local=not self.has_dedicated_inference
-        )
+        self.inference_channel = Channel.create("Inference")
         self.reward_channel = Channel.create("Reward")
         self.actor_channel = Channel.create("Actor", local=True)

@@ -332,38 +328,33 @@ def run(self):
             output_channel=self.rollout_channel,
         )

+        # Rewards
+        reward_handle: Handle =
self.reward.compute_rewards( - input_channel=inference_channel, - output_channel=self.reward_channel, - ) + inference_channel = self.reward_channel # Advantages and returns adv_handle: Handle = self.actor.compute_advantages_and_returns( - input_channel=self.reward_channel, + input_channel=inference_channel, output_channel=self.actor_channel, ) # Actor training - actor_input_channel = self.actor_channel - if self.is_pipeline: - # In pipeline mode, the rollout already contains the advantages and returns - # So the above two steps are in fact no-ops, and we should directly use the inference channel as the input - actor_input_channel = inference_channel actor_handle: Handle = self.actor.run_training( - input_channel=actor_input_channel, + input_channel=self.actor_channel, ) metrics = actor_handle.wait() diff --git a/rlinf/workers/actor/megatron_actor_worker.py b/rlinf/workers/actor/megatron_actor_worker.py index 54376e1a5..57c54dbb9 100644 --- a/rlinf/workers/actor/megatron_actor_worker.py +++ b/rlinf/workers/actor/megatron_actor_worker.py @@ -115,10 +115,21 @@ def __init__( self.calculate_entropy_loss = ( self.cfg.algorithm.entropy_bonus > 0 and self.calculate_entropy ) - self.ratio_eps = self.cfg.algorithm.ratio_clip_eps + clip_ratio = self.cfg.algorithm.ratio_clip_eps + self.clip_ratio_low = ( + self.cfg.algorithm.get("clip_ratio_low") + if self.cfg.algorithm.get("clip_ratio_low") is not None + else clip_ratio + ) + self.clip_ratio_high = ( + self.cfg.algorithm.get("clip_ratio_high") + if self.cfg.algorithm.get("clip_ratio_high") is not None + else clip_ratio + ) self.logprob_forward_micro_batch_size = ( self.cfg.algorithm.logprob_forward_micro_batch_size ) + self.kl_beta = self.cfg.algorithm.kl_beta self.kl_penalty_type = self.cfg.algorithm.kl_penalty_type self.clip_ratio_c = self.cfg.algorithm.clip_ratio_c @@ -382,7 +393,8 @@ def loss_func(output): logprobs=curr_logprobs, old_logprobs=prev_logprobs, advantages=advantages, - eps_clip=self.ratio_eps, + clip_ratio_low=self.clip_ratio_low, + clip_ratio_high=self.clip_ratio_high, loss_mask=mask, ) @@ -843,7 +855,6 @@ def run_inference( while recv_batch_size < self.total_batch_size_per_dp: batch, rollout_result = self.get_batch(input_channel) recv_batch_size += rollout_result.num_sequence - # Must be called after batch is retrieved, suggesting that rollout has stopped # Otherwise, loading model might cause OOM in the collocated mode self._load_weight_and_optimizer(input_channel) @@ -859,7 +870,6 @@ def run_inference( with cpu_weight_swap(self.model[0], self.ref_policy_state_dict): ref_logprobs = self.inference_step(batch) rollout_result.ref_logprobs = ref_logprobs.cpu() - self.put_result(rollout_result, output_channel) assert recv_batch_size == self.total_batch_size_per_dp, ( @@ -963,7 +973,6 @@ def compute_advantages_and_returns( while recv_batch_size < self.total_batch_size_per_dp: batch, rollout_result = self.get_batch(input_channel) recv_batch_size += rollout_result.num_sequence - with self.worker_timer(): if rollout_result.advantages is None: mask = batch["attention_mask"][:, -self.response_len :] @@ -1033,7 +1042,10 @@ def sync_model_to_rollout(self): def _compute_rollout_metrics(self, batch): rollout_metrics, total_prompt_lengths, total_decode_lengths = ( compute_rollout_metrics( - batch, self.cfg.data.max_prompt_length, self.response_len + batch, + self.cfg.data.max_prompt_length, + self.response_len, + self._world_size, ) ) diff --git a/rlinf/workers/reward/reward_worker.py b/rlinf/workers/reward/reward_worker.py index 
9290c23d6..fefd422d6 100644 --- a/rlinf/workers/reward/reward_worker.py +++ b/rlinf/workers/reward/reward_worker.py @@ -49,7 +49,6 @@ def get_batch( self, channel: Channel ) -> Tuple[Dict[str, torch.Tensor], RolloutResult]: result: RolloutResult = channel.get() - batch = result.to_actor_batch( self.cfg.data.max_prompt_length, self.cfg.actor.model.encoder_seq_length, @@ -69,8 +68,8 @@ def compute_rewards(self, input_channel: Channel, output_channel: Channel): recv_batch_size = 0 while recv_batch_size < self.total_batch_size_per_dp: batch, rollout_result = self.get_batch(input_channel) - recv_batch_size += rollout_result.num_sequence + recv_batch_size += rollout_result.num_sequence # Compute rule-based reward if rollout_result.rewards is None: rollout_result.rewards = self._compute_batch_rewards( diff --git a/rlinf/workers/rollout/sglang/sglang_worker.py b/rlinf/workers/rollout/sglang/sglang_worker.py index 764702c63..a339edd9e 100644 --- a/rlinf/workers/rollout/sglang/sglang_worker.py +++ b/rlinf/workers/rollout/sglang/sglang_worker.py @@ -323,7 +323,6 @@ async def rollout(self, input_channel: Channel, output_channel: Channel): total_reqs = len(rollout_tasks) required_reqs = total_reqs // self._cfg.algorithm.max_num_gen_batches - droped_reqs = 0 finished_reqs = 0 abort_flag = False @@ -334,20 +333,20 @@ async def rollout(self, input_channel: Channel, output_channel: Channel): if self._completion_info.is_completed(hash_id): results = self._completion_info.get_results(hash_id) - ( - rewards, - advantages, - ) = await self._compute_reward_and_advantage( - results, - self._current_request.answers[raw_id], - ) - if ( - all_floats_equal(rewards) - and self._cfg.algorithm.get("max_num_gen_batches", 1) > 1 - ): - if (total_reqs - droped_reqs) > required_reqs: - droped_reqs += rollout_request.n - continue + # ( + # rewards, + # advantages, + # ) = await self._compute_reward_and_advantage( + # results, + # self._current_request.answers[raw_id], + # ) + # if ( + # all_floats_equal(rewards) + # and self._cfg.algorithm.get("max_num_gen_batches", 1) > 1 + # ): + # if (total_reqs - droped_reqs) > required_reqs: + # droped_reqs += rollout_request.n + # continue input_ids = [input_ids] * len(results) rollout_result = RolloutResult.from_sglang_results( @@ -356,10 +355,10 @@ async def rollout(self, input_channel: Channel, output_channel: Channel): input_ids, return_logprobs=self._return_logprobs, ) - rollout_result.rewards = torch.tensor( - rewards, dtype=torch.float32 - ).reshape(-1, 1) - rollout_result.advantages = advantages + # rollout_result.rewards = torch.tensor( + # rewards, dtype=torch.float32 + # ).reshape(-1, 1) + # rollout_result.advantages = advantages return_tasks.append( asyncio.create_task( self._put_result(rollout_result, output_channel) From 4bf2d812ef4c864ede6ce6150782d56c06834059 Mon Sep 17 00:00:00 2001 From: Bo Dai Date: Tue, 30 Sep 2025 10:21:46 +0000 Subject: [PATCH 36/57] fix(reward): change math_verify_call's result from {0,1} to {-1,1} Signed-off-by: Bo Dai --- rlinf/data/io_struct.py | 2 +- rlinf/workers/actor/megatron_actor_worker.py | 81 +------------------- rlinf/workers/rollout/vllm/vllm_worker.py | 2 +- toolkits/math_verifier/verify.py | 12 ++- 4 files changed, 8 insertions(+), 89 deletions(-) diff --git a/rlinf/data/io_struct.py b/rlinf/data/io_struct.py index ea78685b5..b69b680bd 100644 --- a/rlinf/data/io_struct.py +++ b/rlinf/data/io_struct.py @@ -55,7 +55,7 @@ class RolloutRequest: n: int input_ids: List[List[int]] image_data: Union[List[List[bytes]], List[List[str]]] - 
answers: List[str] + answers: List[str] multi_modal_inputs: List[Dict] def repeat(self) -> "RolloutRequest": diff --git a/rlinf/workers/actor/megatron_actor_worker.py b/rlinf/workers/actor/megatron_actor_worker.py index 57c54dbb9..a08415ea9 100644 --- a/rlinf/workers/actor/megatron_actor_worker.py +++ b/rlinf/workers/actor/megatron_actor_worker.py @@ -14,7 +14,7 @@ import copy from functools import partial -from typing import Dict, List, Optional, Tuple +from typing import Dict, Optional, Tuple import torch import torch.distributed @@ -53,7 +53,6 @@ ) from rlinf.utils.distributed import ( RolloutDataBalance, - broadcast_tensor_within_mp, broadcast_tensor_within_pp, compute_rollout_metrics, masked_normalization, @@ -876,84 +875,6 @@ def run_inference( f"Expected {self.total_batch_size_per_dp} sequences from channel, but got {recv_batch_size}" ) - # Rewards - def compute_rewards(self, input_channel: Channel, output_channel: Channel): - """Compute rewards. - - Args: - input_channel: The input channel to read from. - output_channel: The output channel to send results to. - """ - assert self.reward_fn is not None, "reward_fn is not set" - if self.is_pipeline: - # In pipeline mode, rewards are computed in the rollout - with self.worker_timer(): - return - recv_batch_size = 0 - while recv_batch_size < self.total_batch_size_per_dp: - batch, rollout_result = self.get_batch(input_channel) - recv_batch_size += rollout_result.num_sequence - - # Compute rule-based reward - with self.worker_timer(): - if rollout_result.rewards is None: - rollout_result.rewards = self._compute_batch_rewards( - batch, rollout_result.answers - ).cpu() - - self.put_result(rollout_result, output_channel) - - assert recv_batch_size == self.total_batch_size_per_dp, ( - f"Expected {self.total_batch_size_per_dp} sequences from channel, but got {recv_batch_size}" - ) - - def _compute_batch_rewards( - self, batch: Dict[str, torch.Tensor], answers: List[str] - ): - """Reward computation using non-model based reward.""" - all_reward_scores = [] - texts = [] - for response, response_len in zip( - batch["input_ids"], - batch["response_lengths"], - ): - response = response[ - self.cfg.data.max_prompt_length : self.cfg.data.max_prompt_length - + response_len - ] - texts.append( - self.tokenizer.decode(response.tolist(), skip_special_tokens=True) - ) - - if torch.distributed.get_rank() == parallel_state.get_model_parallel_src_rank(): - rewards = self.reward_fn(texts, answers) - if self.cfg.reward.reward_type == "math": - reward_scores = [ - self.cfg.reward.reward_scale - if reward == 1 - else -self.cfg.reward.reward_scale - for reward in rewards - ] - else: - reward_scores = rewards - - all_reward_scores.extend(reward_scores) - - if len(all_reward_scores) > 0: - new_all_rewards = [] - - for response in all_reward_scores: - if response is None: - response = 0.0 - new_all_rewards.append(response) - - all_reward_scores = torch.as_tensor( - new_all_rewards, - dtype=torch.float, - device=torch.cuda.current_device(), - ).view(-1, 1) - return broadcast_tensor_within_mp(all_reward_scores).flatten().to("cpu") - # Advantages and returns def compute_advantages_and_returns( self, input_channel: Channel, output_channel: Channel diff --git a/rlinf/workers/rollout/vllm/vllm_worker.py b/rlinf/workers/rollout/vllm/vllm_worker.py index 54c1c7ee5..899e07f1e 100644 --- a/rlinf/workers/rollout/vllm/vllm_worker.py +++ b/rlinf/workers/rollout/vllm/vllm_worker.py @@ -399,7 +399,7 @@ async def rollout_and_return( group_size=self._cfg.algorithm.group_size, 
results=vllm_results, answers=request.answers, - multi_modal_inputs=request.multi_modal_inputs, + multi_modal_inputs=request.multi_modal_inputs, return_logprobs=self._return_logprobs, ) if self._placement.is_disaggregated: diff --git a/toolkits/math_verifier/verify.py b/toolkits/math_verifier/verify.py index 80bf4b552..31d92c280 100644 --- a/toolkits/math_verifier/verify.py +++ b/toolkits/math_verifier/verify.py @@ -348,22 +348,22 @@ def process_results(answer, solution): extracted_solution = extract_answer(solution, "math", use_last_number=True) if extracted_answer is None or extracted_answer.strip() in ["None", "none", ""]: - retval = 0 + retval = -1 elif extracted_solution is None or extracted_solution.strip() in [ "None", "none", "", ]: - retval = 0 + retval = -1 elif math_equal(extracted_answer, extracted_solution, timeout=False): # elif call_with_timeout(math_equal, extracted_answer, extracted_solution): retval = 1 else: - retval = 0 + retval = -1 return retval, (extracted_answer, extracted_solution) except Exception: - return 0, ("None", "None") + return -1, ("None", "None") def process_results_process(a, b, output_queue): @@ -406,14 +406,12 @@ def math_verify_call( labels = [] has_timeout = False for jobs in all_jobs: - label = 0 try: for job in as_completed(jobs, timeout=timeout): x = job.result() - label = label or x + labels.append(x) except TimeoutError: has_timeout = True - labels.append(label) if has_timeout: reset_global_process_pool() From fc77e2aac77324d5720643cdba6c04d9bb86a005 Mon Sep 17 00:00:00 2001 From: Bo Dai Date: Tue, 30 Sep 2025 10:46:13 +0000 Subject: [PATCH 37/57] feat(ci): change corresponding ci config for refactored code Signed-off-by: Bo Dai --- rlinf/data/io_struct.py | 54 ------------------- rlinf/hybrid_engines/fsdp/utils.py | 1 - .../auto_placement/qwen2.5-1.5b-grpo.yaml | 12 ++++- ...1.5b-grpo-collocated-rollout-logprobs.yaml | 12 ++++- .../sglang/qwen2.5-1.5b-grpo-collocated.yaml | 12 ++++- ...5-1.5b-grpo-pipeline-rollout-logprobs.yaml | 12 ++++- .../sglang/qwen2.5-1.5b-grpo-pipeline.yaml | 11 +++- tests/e2e_tests/math/sglang/run_collocated.sh | 2 +- tests/e2e_tests/math/sglang/run_pipeline.sh | 2 +- ...1.5b-grpo-collocated-rollout-logprobs.yaml | 12 ++++- .../vllm/qwen2.5-1.5b-grpo-collocated.yaml | 12 ++++- tests/e2e_tests/math/vllm/run_collocated.sh | 2 +- tests/unit_tests/test_placement.py | 28 ---------- 13 files changed, 73 insertions(+), 99 deletions(-) diff --git a/rlinf/data/io_struct.py b/rlinf/data/io_struct.py index b69b680bd..b7dc359b5 100644 --- a/rlinf/data/io_struct.py +++ b/rlinf/data/io_struct.py @@ -58,60 +58,6 @@ class RolloutRequest: answers: List[str] multi_modal_inputs: List[Dict] - def repeat(self) -> "RolloutRequest": - """Repeat each input in the RolloutRequest a specified number of times. - - Args: - times (int): The number of times to repeat each input. - - Returns: - RolloutRequest: A new RolloutRequest with repeated inputs. - """ - assert self.n > 0, "n must be greater than 0" - - input_ids, answers = zip( - *[ - (input_id, answer) - for input_id, answer in zip(self.input_ids, self.answers) - for _ in range(self.n) - ] - ) - return RolloutRequest( - n=self.n, - input_ids=list(input_ids), - answers=list(answers), - ) - - def split(self, num_splits: int) -> List["RolloutRequest"]: - """Split the RolloutRequest into multiple smaller requests. - - Args: - num_splits (int): The number of splits to create. - - Returns: - List[RolloutRequest]: A list of smaller RolloutRequest instances. 
- """ - assert num_splits > 0, "num_splits must be greater than 0" - assert len(self.input_ids) % num_splits == 0, ( - f"Input IDs length {len(self.input_ids)} is not divisible by num_splits {num_splits}" - ) - - input_ids_split_list = split_list(self.input_ids, num_splits) - answers_split_list = split_list(self.answers, num_splits) - - splitted_requests = [] - for input_ids_batch, answers_batch in zip( - input_ids_split_list, answers_split_list - ): - request = RolloutRequest( - n=self.n, - input_ids=input_ids_batch, - answers=answers_batch, - ) - splitted_requests.append(request) - - return splitted_requests - def repeat(self) -> "RolloutRequest": """Repeat each input in the RolloutRequest a specified number of times. diff --git a/rlinf/hybrid_engines/fsdp/utils.py b/rlinf/hybrid_engines/fsdp/utils.py index 2461f7006..0a9f0054d 100644 --- a/rlinf/hybrid_engines/fsdp/utils.py +++ b/rlinf/hybrid_engines/fsdp/utils.py @@ -97,7 +97,6 @@ def get_fsdp_wrap_policy(module, config=None, is_lora=False, is_vla_model=False) # Add vision transformer policies for VLA models if is_vla_model: - from prismatic.extern.hf.modeling_prismatic import PrismaticProjector from timm.models.vision_transformer import VisionTransformer from torch.distributed.fsdp.wrap import _module_wrap_policy, _or_policy diff --git a/tests/e2e_tests/auto_placement/qwen2.5-1.5b-grpo.yaml b/tests/e2e_tests/auto_placement/qwen2.5-1.5b-grpo.yaml index 6555cb9bf..d1c65161b 100644 --- a/tests/e2e_tests/auto_placement/qwen2.5-1.5b-grpo.yaml +++ b/tests/e2e_tests/auto_placement/qwen2.5-1.5b-grpo.yaml @@ -9,10 +9,10 @@ hydra: cluster: num_nodes: 1 component_placement: - actor,rollout: all + actor,rollout,reward: all runner: - task_type: math + task_type: reasoning logger: log_path: ${runner.output_dir}/${runner.experiment_name} project_name: rlinf @@ -119,6 +119,7 @@ rollout: data: type: math + dataset_name: boba max_prompt_length: 1024 filter_prompt_by_length: True rollout_batch_size: 128 @@ -256,13 +257,20 @@ actor: reward: + group_name: "RewardGroup" use_reward_model: false reward_type: 'math' reward_scale: 5.0 + tokenizer: + tokenizer_model: ${actor.tokenizer.tokenizer_model} + use_fast: ${actor.tokenizer.use_fast} + trust_remote_code: ${actor.tokenizer.trust_remote_code} + padding_side: ${actor.tokenizer.padding_side} critic: use_critic_model: false + profile_data: actor_cost: 95.7 inference_cost: 30.8 diff --git a/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml b/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml index 7e2c5164a..3516fe44b 100644 --- a/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml +++ b/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml @@ -9,10 +9,10 @@ hydra: cluster: num_nodes: 1 component_placement: - actor,rollout: all + actor,rollout,reward: all runner: - task_type: math + task_type: reasoning logger: log_path: /workspace/results/ project_name: rlinf @@ -121,6 +121,7 @@ rollout: data: type: math + dataset_name: boba max_prompt_length: 256 filter_prompt_by_length: True rollout_batch_size: 8 @@ -259,9 +260,16 @@ actor: reward: + group_name: "RewardGroup" use_reward_model: false reward_type: 'math' reward_scale: 5.0 + tokenizer: + tokenizer_model: ${actor.tokenizer.tokenizer_model} + use_fast: ${actor.tokenizer.use_fast} + trust_remote_code: ${actor.tokenizer.trust_remote_code} + padding_side: ${actor.tokenizer.padding_side} + critic: use_critic_model: false \ No newline at end of file diff --git 
a/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-collocated.yaml b/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-collocated.yaml index 1dfe47aeb..79b5e1595 100644 --- a/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-collocated.yaml +++ b/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-collocated.yaml @@ -9,10 +9,10 @@ hydra: cluster: num_nodes: 1 component_placement: - actor,rollout: all + actor,rollout,reward: all runner: - task_type: math + task_type: reasoning logger: log_path: /workspace/results/ project_name: rlinf @@ -126,6 +126,7 @@ rollout: data: type: math + dataset_name: boba max_prompt_length: 256 filter_prompt_by_length: True rollout_batch_size: 8 @@ -264,9 +265,16 @@ actor: reward: + group_name: "RewardGroup" use_reward_model: false reward_type: 'math' reward_scale: 5.0 + tokenizer: + tokenizer_model: ${actor.tokenizer.tokenizer_model} + use_fast: ${actor.tokenizer.use_fast} + trust_remote_code: ${actor.tokenizer.trust_remote_code} + padding_side: ${actor.tokenizer.padding_side} + critic: use_critic_model: false \ No newline at end of file diff --git a/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml b/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml index 25344d0bf..34bcff492 100644 --- a/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml +++ b/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml @@ -11,9 +11,10 @@ cluster: component_placement: rollout: 0-3 actor: 4-7 + reward: 0-3 runner: - task_type: math + task_type: reasoning logger: log_path: /workspace/results/ project_name: rlinf @@ -124,6 +125,7 @@ rollout: data: type: math + dataset_name: boba max_prompt_length: 256 rollout_batch_size: 8 val_rollout_batch_size: null @@ -257,11 +259,17 @@ actor: schedule_repeat: 1 # inference and training will repeat such times # schedule_wait: it will be set at runtime - reward: + group_name: "RewardGroup" use_reward_model: false reward_type: 'math' reward_scale: 5.0 + tokenizer: + tokenizer_model: ${actor.tokenizer.tokenizer_model} + use_fast: ${actor.tokenizer.use_fast} + trust_remote_code: ${actor.tokenizer.trust_remote_code} + padding_side: ${actor.tokenizer.padding_side} + critic: use_critic_model: false \ No newline at end of file diff --git a/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-pipeline.yaml b/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-pipeline.yaml index d5cd2b4a4..b48eb6057 100644 --- a/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-pipeline.yaml +++ b/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-pipeline.yaml @@ -12,9 +12,10 @@ cluster: rollout: 0-3 inference: 4-5 actor: 6-7 + reward: 0-3 runner: - task_type: math + task_type: reasoning logger: log_path: /workspace/results/ project_name: rlinf @@ -138,6 +139,7 @@ rollout: data: type: math + dataset_name: boba max_prompt_length: 256 rollout_batch_size: 8 val_rollout_batch_size: null @@ -273,9 +275,16 @@ actor: reward: + group_name: "RewardGroup" use_reward_model: false reward_type: 'math' reward_scale: 5.0 + tokenizer: + tokenizer_model: ${actor.tokenizer.tokenizer_model} + use_fast: ${actor.tokenizer.use_fast} + trust_remote_code: ${actor.tokenizer.trust_remote_code} + padding_side: ${actor.tokenizer.padding_side} + critic: use_critic_model: false \ No newline at end of file diff --git a/tests/e2e_tests/math/sglang/run_collocated.sh b/tests/e2e_tests/math/sglang/run_collocated.sh index 1610f8fac..5911653e7 100644 --- a/tests/e2e_tests/math/sglang/run_collocated.sh +++ 
b/tests/e2e_tests/math/sglang/run_collocated.sh @@ -14,4 +14,4 @@ else CONFIG_NAME=$1 fi -python ${REPO_PATH}/examples/math/main_math.py --config-path $REPO_PATH/tests/e2e_tests/math/sglang --config-name $CONFIG_NAME \ No newline at end of file +python ${REPO_PATH}/examples/reasoning/main_grpo.py --config-path $REPO_PATH/tests/e2e_tests/math/sglang --config-name $CONFIG_NAME \ No newline at end of file diff --git a/tests/e2e_tests/math/sglang/run_pipeline.sh b/tests/e2e_tests/math/sglang/run_pipeline.sh index 85e2e5c2d..f18012bb4 100644 --- a/tests/e2e_tests/math/sglang/run_pipeline.sh +++ b/tests/e2e_tests/math/sglang/run_pipeline.sh @@ -14,4 +14,4 @@ else CONFIG_NAME=$1 fi -python ${REPO_PATH}/examples/math/main_math.py --config-path $REPO_PATH/tests/e2e_tests/math/sglang --config-name $CONFIG_NAME \ No newline at end of file +python ${REPO_PATH}/examples/reasoning/main_grpo.py --config-path $REPO_PATH/tests/e2e_tests/math/sglang --config-name $CONFIG_NAME \ No newline at end of file diff --git a/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml b/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml index fe61ab16c..edeaee9c3 100644 --- a/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml +++ b/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml @@ -9,10 +9,10 @@ hydra: cluster: num_nodes: 1 component_placement: - actor,rollout: all + actor,rollout,reward: all runner: - task_type: math + task_type: reasoning logger: log_path: /workspace/results/ project_name: rlinf @@ -121,6 +121,7 @@ rollout: data: type: math + dataset_name: boba max_prompt_length: 256 filter_prompt_by_length: True rollout_batch_size: 8 @@ -259,9 +260,16 @@ actor: reward: + group_name: "RewardGroup" use_reward_model: false reward_type: 'math' reward_scale: 5.0 + tokenizer: + tokenizer_model: ${actor.tokenizer.tokenizer_model} + use_fast: ${actor.tokenizer.use_fast} + trust_remote_code: ${actor.tokenizer.trust_remote_code} + padding_side: ${actor.tokenizer.padding_side} + critic: use_critic_model: false \ No newline at end of file diff --git a/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-collocated.yaml b/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-collocated.yaml index 099fe7268..09df84ca8 100644 --- a/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-collocated.yaml +++ b/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-collocated.yaml @@ -9,10 +9,10 @@ hydra: cluster: num_nodes: 1 component_placement: - actor,rollout: all + actor,rollout,reward: all runner: - task_type: math + task_type: reasoning logger: log_path: /workspace/results/ project_name: rlinf @@ -122,6 +122,7 @@ rollout: data: type: math + dataset_name: boba max_prompt_length: 256 filter_prompt_by_length: True rollout_batch_size: 8 @@ -260,9 +261,16 @@ actor: reward: + group_name: "RewardGroup" use_reward_model: false reward_type: 'math' reward_scale: 5.0 + tokenizer: + tokenizer_model: ${actor.tokenizer.tokenizer_model} + use_fast: ${actor.tokenizer.use_fast} + trust_remote_code: ${actor.tokenizer.trust_remote_code} + padding_side: ${actor.tokenizer.padding_side} + critic: use_critic_model: false \ No newline at end of file diff --git a/tests/e2e_tests/math/vllm/run_collocated.sh b/tests/e2e_tests/math/vllm/run_collocated.sh index b4e924b1d..6ce4067fd 100644 --- a/tests/e2e_tests/math/vllm/run_collocated.sh +++ b/tests/e2e_tests/math/vllm/run_collocated.sh @@ -14,4 +14,4 @@ else CONFIG_NAME=$1 fi -python ${REPO_PATH}/examples/math/main_math.py --config-path 
$REPO_PATH/tests/e2e_tests/math/vllm --config-name $CONFIG_NAME \ No newline at end of file +python ${REPO_PATH}/examples/reasoning/main_grpo.py --config-path $REPO_PATH/tests/e2e_tests/math/vllm --config-name $CONFIG_NAME \ No newline at end of file diff --git a/tests/unit_tests/test_placement.py b/tests/unit_tests/test_placement.py index c16ff7fb9..22c75ab68 100644 --- a/tests/unit_tests/test_placement.py +++ b/tests/unit_tests/test_placement.py @@ -1087,34 +1087,6 @@ def test_model_parallel_component_placement_init_missing_rollout_gpus(self): cluster = mock_cluster(num_nodes=1, num_accelerators_per_node=4) ModelParallelComponentPlacement(config, cluster) - def test_model_parallel_component_placement_init_collocated_mode_invalid_tp_sizes( - self, - ): - """Test ModelParallelComponentPlacement raises error when actor TP size < rollout TP size in collocated mode.""" - config = DictConfig( - { - "cluster": { - "num_nodes": 1, - "component_placement": {"actor,rollout": "0-3"}, - }, - "actor": { - "model": { - "tensor_model_parallel_size": 2, - "context_parallel_size": 1, - "pipeline_model_parallel_size": 1, - } - }, - "rollout": {"tensor_parallel_size": 4, "pipeline_parallel_size": 1}, - } - ) - - with pytest.raises( - AssertionError, - match="Actor TP size 2 must be greater or equal to Rollout TP size 4", - ): - cluster = mock_cluster(num_nodes=1, num_accelerators_per_node=4) - ModelParallelComponentPlacement(config, cluster) - def test_model_parallel_component_placement_init_collocated_mode_with_inference_gpus( self, ): From 9d40cb40186189dec4e2c4cb0051b16d0d045ab2 Mon Sep 17 00:00:00 2001 From: Bo Dai Date: Thu, 2 Oct 2025 09:03:44 +0000 Subject: [PATCH 38/57] chore: refactor dataset parts Signed-off-by: Bo Dai --- rlinf/data/datasets/__init__.py | 150 ++++++++++ rlinf/data/datasets/item.py | 59 ++++ rlinf/data/datasets/math.py | 153 ++++++++++ rlinf/data/datasets/utils.py | 68 +++++ rlinf/data/{datasets.py => datasets/vlm.py} | 275 ++---------------- rlinf/data/io_struct.py | 29 +- rlinf/workers/actor/fsdp_actor_worker.py | 2 +- rlinf/workers/reward/reward_worker.py | 4 +- rlinf/workers/rollout/sglang/sglang_worker.py | 21 +- rlinf/workers/rollout/vllm/vllm_worker.py | 31 +- tests/unit_tests/test_io_struct.py | 135 +++++++++ 11 files changed, 613 insertions(+), 314 deletions(-) create mode 100644 rlinf/data/datasets/__init__.py create mode 100644 rlinf/data/datasets/item.py create mode 100644 rlinf/data/datasets/math.py create mode 100644 rlinf/data/datasets/utils.py rename rlinf/data/{datasets.py => datasets/vlm.py} (68%) create mode 100644 tests/unit_tests/test_io_struct.py diff --git a/rlinf/data/datasets/__init__.py b/rlinf/data/datasets/__init__.py new file mode 100644 index 000000000..f86f3576e --- /dev/null +++ b/rlinf/data/datasets/__init__.py @@ -0,0 +1,150 @@ +# Copyright 2025 The RLinf Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from typing import Any, Dict, List, Tuple
+
+import torch
+from omegaconf import DictConfig
+from torch.utils.data import Dataset
+from transformers import AutoTokenizer
+
+from rlinf.data.datasets.item import DatasetItem
+from rlinf.data.datasets.math import MathDataset
+from rlinf.data.datasets.vlm import VLMDatasetRegistry
+
+
+def create_rl_dataset(
+    config: DictConfig, tokenizer: AutoTokenizer
+) -> Tuple[Dataset, Dataset]:
+    """Create RL datasets.
+
+    Arguments:
+        config: The RLinf config.
+        tokenizer (Tokenizer): The tokenizer.
+
+    Returns:
+        train_dataset (Dataset): The training dataset.
+
+        val_dataset (Dataset): The validation dataset.
+    """
+
+    if config.data.type == "math":
+        dataset_cls = MathDataset
+    elif config.data.type == "vision_language":
+        # Use the factory-based VLM dataset registry
+        dataset_name = getattr(config.data, "dataset_name", None)
+        lazy_loading = bool(getattr(config.data, "lazy_loading", False))
+
+        logging.info(
+            f"Using VLM dataset: name={dataset_name}, lazy_loading={lazy_loading}"
+        )
+
+        train_dataset = VLMDatasetRegistry.create(
+            dataset_name,
+            data_paths=config.data.train_data_paths,
+            config=config,
+            tokenizer=tokenizer,
+        )
+        val_dataset = VLMDatasetRegistry.create(
+            dataset_name,
+            data_paths=config.data.val_data_paths,
+            config=config,
+            tokenizer=tokenizer,
+        )
+        return train_dataset, val_dataset
+    else:
+        return None, None
+
+    logging.info(f"Using dataset class: {dataset_cls.__name__}")
+
+    # Instantiate the dataset using the determined dataset class
+    train_dataset = dataset_cls(
+        data_paths=config.data.train_data_paths,
+        config=config,
+        tokenizer=tokenizer,
+    )
+
+    val_dataset = dataset_cls(
+        data_paths=config.data.val_data_paths,
+        config=config,
+        tokenizer=tokenizer,
+    )
+
+    return train_dataset, val_dataset
+
+
+def collate_fn(data_list: List["DatasetItem"]) -> Dict[str, Any]:
+    """
+    Collate function for batching dataset items.
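+
+    Prompts are stacked into a [B, L] tensor; when lengths differ, the batch
+    is truncated to the shortest prompt, keeping the trailing tokens of the
+    longer prompts. All non-tensor fields are returned as plain lists.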
+ """ + prompts = [] + lens = [] + for it in data_list: + p = ( + it.prompt + if isinstance(it.prompt, torch.Tensor) + else torch.as_tensor(it.prompt, dtype=torch.long) + ) + if p.dim() == 2 and p.size(0) == 1: + p = p.squeeze(0) + assert p.dim() == 1, ( + f"DatasetItem.prompt must be 1-D tensor, current shape is: {p.shape}" + ) + prompts.append(p) + lens.append(p.numel()) + + if len(set(lens)) == 1: + target_len = lens[0] + else: + target_len = min(lens) + prompts = [p[-target_len:] if p.numel() > target_len else p for p in prompts] + + batch_prompt = torch.stack(prompts, dim=0) # [B, L] + batch_length = torch.tensor( + [min(int(it.length), target_len) for it in data_list], dtype=torch.long + ) + + batch_idx = torch.tensor([int(it.idx) for it in data_list], dtype=torch.long) + + batch: Dict[str, Any] = { + "prompt": batch_prompt, # [B, L] + "length": batch_length, # [B] + "answer": [it.answer for it in data_list], # List[str] + "idx": batch_idx, # [B] + "solution": [it.solution for it in data_list], # List[Optional[str]] + "image_data": [ + it.image_data for it in data_list + ], # List[Optional[List[bytes|str]]] + "prompt_text": [it.prompt_text for it in data_list], # List[Optional[str]] + "meta": [it.meta for it in data_list], # List[Optional[dict]] + "multi_modal_inputs": [ + it.multi_modal_inputs for it in data_list + ], # List[Optional[dict]] + } + return batch diff --git a/rlinf/data/datasets/item.py b/rlinf/data/datasets/item.py new file mode 100644 index 000000000..e75155dcb --- /dev/null +++ b/rlinf/data/datasets/item.py @@ -0,0 +1,59 @@ +# Copyright 2025 The RLinf Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Union + +import torch + + +@dataclass +class DatasetItem: + """ + A single item in processed dataset. + + Attributes: + prompt (torch.Tensor): Tokenized prompt input_ids tensor. + length (int): Length of the prompt input_ids. + answer (str | dict): The answer associated with the prompt. + idx (int): Index of the item in the dataset. + solution (Optional[str]): Optional solution text if exists. + prompt_text (Optional[str]): Optional original prompt text before tokenization. + meta (Optional[Dict[str, Any]]): Optional metadata dictionary. + multi_modal_inputs (Optional[Dict[str, Any]]): Optional dictionary for additional multi-modal inputs. 
+ """ + + prompt: torch.Tensor + length: int + answer: str | dict + idx: int + solution: Optional[str] = None + image_data: Optional[List[Union[bytes, str]]] = None + prompt_text: Optional[str] = None + meta: Optional[Dict[str, Any]] = None + multi_modal_inputs: Optional[Dict[str, Any]] = None diff --git a/rlinf/data/datasets/math.py b/rlinf/data/datasets/math.py new file mode 100644 index 000000000..821074bf6 --- /dev/null +++ b/rlinf/data/datasets/math.py @@ -0,0 +1,153 @@ +# Copyright 2025 The RLinf Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import json +import logging +import os +from typing import Any, List, Tuple, Union + +import torch +from omegaconf import DictConfig +from torch.utils.data import Dataset +from transformers import AutoTokenizer + +from rlinf.data.datasets.item import DatasetItem +from rlinf.data.datasets.utils import batch_pad_to_fixed_len + + +class MathDataset(Dataset): + def __init__( + self, + data_paths: Union[str, List[str]], + config: DictConfig, + tokenizer: AutoTokenizer, + ): + super().__init__() + self.data_paths = data_paths + if isinstance(self.data_paths, str): + self.data_paths = [self.data_paths] + + self.max_prompt_length = config.data.max_prompt_length + self.tokenizer = tokenizer + self.prompt_key = config.data.prompt_key + + self.data = self._load_data() + if config.data.get("filter_prompt_by_length", False): + total = len(self.data) + filtered = [] + failed = 0 + + for item in self.data: + try: + _, L = self.encode(item[self.prompt_key]) + if L <= self.max_prompt_length: + filtered.append(item) + except Exception: + failed += 1 + + self.data = filtered + assert len(self.data) > 0, ( + f"No samples found within max_prompt_length={self.max_prompt_length}. " + "Please check your dataset or increase max_prompt_length." + ) + + if failed > 0: + logging.warning( + f"{failed} samples were skipped due to format issues " + f"(kept {len(self.data)} / {total})." + ) + + def _load_data(self) -> List[Any]: + """ + Load and merge data from multiple files(json or jsonl). 
+
+        Returns:
+            A list of all records merged across the configured data paths.
+        """
+        merged_data = []
+
+        for path in self.data_paths:
+            _, file_extension = os.path.splitext(path)
+            try:
+                with open(path, "r", encoding="utf-8") as file:
+                    if file_extension == ".jsonl":
+                        # Skip blank lines; json.loads tolerates surrounding whitespace.
+                        merged_data.extend(
+                            json.loads(line) for line in file if line.strip()
+                        )
+                    elif file_extension == ".json":
+                        content = json.load(file)
+                        if isinstance(content, list):
+                            merged_data.extend(content)
+                        else:
+                            merged_data.append(content)
+                    else:
+                        logging.warning(
+                            f"Unsupported file extension {file_extension}, skipping: {path}"
+                        )
+            except Exception as e:
+                raise RuntimeError(f"Failed to load data from {path}") from e
+
+        return merged_data
+
+    def __len__(self):
+        return len(self.data)
+
+    def encode(self, text: str) -> Tuple[List[int], int]:
+        """
+        Use the tokenizer to encode the text and return the token ids and length.
+        """
+        text_ids = self.tokenizer.encode(text)
+        return text_ids, len(text_ids)
+
+    def __getitem__(self, idx):
+        """
+        Return a single prompt as a DatasetItem.
+        """
+
+        prompt = self.data[idx][self.prompt_key]
+
+        answer = self.data[idx]["solutions"]
+
+        prompt_tokens, prompt_length = self.encode(prompt)
+        prompt_tokens_tensor = torch.as_tensor(prompt_tokens, dtype=torch.int64)
+
+        if prompt_length > self.max_prompt_length:
+            logging.warning(
+                f"Prompt length {prompt_length} exceeds max_prompt_length "
+                f"{self.max_prompt_length}; truncating."
+            )
+            prompt_tokens_tensor = prompt_tokens_tensor[: self.max_prompt_length]
+            prompt_length = self.max_prompt_length
+
+        prompt_tokens_tensor = batch_pad_to_fixed_len(
+            [prompt_tokens_tensor],
+            self.max_prompt_length,
+            self.tokenizer.eos_token_id,
+            left_pad=True,
+        )[0]
+        output = DatasetItem(
+            prompt=prompt_tokens_tensor,
+            length=prompt_length,
+            answer=answer,
+            idx=idx,
+            image_data=[],
+        )
+        return output
diff --git a/rlinf/data/datasets/utils.py b/rlinf/data/datasets/utils.py
new file mode 100644
index 000000000..db4dbdb58
--- /dev/null
+++ b/rlinf/data/datasets/utils.py
@@ -0,0 +1,68 @@
+# Copyright 2025 The RLinf Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
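For orientation, MathDataset above assumes only two fields per record: whatever key config.data.prompt_key names, plus a hard-coded "solutions" key. A minimal sketch of a compatible JSONL file (the key name "prompt" and the file name are illustrative, not fixed by the code):

    import json

    # Each record needs the configured prompt key plus "solutions"; extra
    # fields are loaded but ignored by __getitem__.
    records = [
        {"prompt": "What is 2 + 2?", "solutions": "4"},
        {"prompt": "Factor x^2 - 1.", "solutions": "(x - 1)(x + 1)"},
    ]
    with open("train.jsonl", "w", encoding="utf-8") as f:
        for rec in records:
            f.write(json.dumps(rec) + "\n")

Each resulting DatasetItem.prompt is then left-padded with eos_token_id up to max_prompt_length by the batch_pad_to_fixed_len helper that follows, while DatasetItem.length records the unpadded token count.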
+ +from typing import List + +import torch + + +def batch_pad_to_fixed_len( + batch: List[torch.Tensor], + max_batch_len: int, + pad_token: int, + left_pad: bool = False, +) -> torch.Tensor: + if left_pad: + batch_pad = torch.stack( + [ + torch.cat( + [ + torch.full( + (max_batch_len - len(seq),), pad_token, dtype=seq.dtype + ), # pad on the left + seq, + ] + ) + for seq in batch + ] + ) + else: + batch_pad = torch.stack( + [ + torch.cat( + [ + seq, + torch.full( + (max_batch_len - len(seq),), pad_token, dtype=seq.dtype + ), + ] + ) + for seq in batch + ] + ) + return batch_pad diff --git a/rlinf/data/datasets.py b/rlinf/data/datasets/vlm.py similarity index 68% rename from rlinf/data/datasets.py rename to rlinf/data/datasets/vlm.py index 677377a68..18956dd81 100644 --- a/rlinf/data/datasets.py +++ b/rlinf/data/datasets/vlm.py @@ -12,10 +12,24 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + import json import logging import os -from dataclasses import dataclass from io import BytesIO from typing import Any, Callable, Dict, List, Optional, Tuple, Union @@ -26,156 +40,8 @@ from torch.utils.data import Dataset from transformers import AutoProcessor, AutoTokenizer - -def batch_pad_to_fixed_len( - batch: List[torch.Tensor], - max_batch_len: int, - pad_token: int, - left_pad: bool = False, -) -> torch.Tensor: - if left_pad: - batch_pad = torch.stack( - [ - torch.cat( - [ - torch.full( - (max_batch_len - len(seq),), pad_token, dtype=seq.dtype - ), # pad on the left - seq, - ] - ) - for seq in batch - ] - ) - else: - batch_pad = torch.stack( - [ - torch.cat( - [ - seq, - torch.full( - (max_batch_len - len(seq),), pad_token, dtype=seq.dtype - ), - ] - ) - for seq in batch - ] - ) - return batch_pad - - -@dataclass -class DatasetItem: - prompt: torch.Tensor - length: int - answer: str | dict - idx: int - solution: Optional[str] = None - image_data: Optional[List[Union[bytes, str]]] = None - prompt_text: Optional[str] = None - meta: Optional[Dict[str, Any]] = None - multi_modal_inputs: Optional[Dict[str, Any]] = None - - -class MathDataset(Dataset): - def __init__(self, data_paths, config, tokenizer): - super().__init__() - self.data_paths = data_paths - if isinstance(self.data_paths, str): - self.data_paths = [self.data_paths] - - self.max_prompt_length = config.data.max_prompt_length - self.tokenizer = tokenizer - self.prompt_key = config.data.prompt_key - - self.data = self._load_data() - if config.data.get("filter_prompt_by_length", False): - total = len(self.data) - filtered = [] - failed = 0 - - for item in self.data: - try: - _, L = self.encode(item[self.prompt_key]) - if L <= self.max_prompt_length: - filtered.append(item) - except Exception: - failed += 1 - - self.data = filtered - assert len(self.data) > 0, ( - f"No samples found within max_prompt_length={self.max_prompt_length}. 
" - "Please check your dataset or increase max_prompt_length." - ) - - if failed > 0: - logging.warning( - f"{failed} samples were skipped due to format issues " - f"(kept {len(self.data)} / {total})." - ) - - def _load_data(self): - merged_data = [] - - for path in self.data_paths: - _, file_extension = os.path.splitext(path) - try: - with open(path, "r", encoding="utf-8") as file: - if file_extension == ".jsonl": - merged_data.extend([json.loads(line.strip()) for line in file]) - elif file_extension == ".json": - content = json.load(file) - if isinstance(content, list): - merged_data.extend(content) - else: - merged_data.append(content) - else: - print(f"Unsupport {file_extension}, skip: {path}") - except Exception: - raise RuntimeError("Load data error") - - return merged_data - - def __len__(self): - return len(self.data) - - def encode(self, text): - text_ids = self.tokenizer.encode(text) - return text_ids, len(text_ids) - - def __getitem__(self, idx): - """ - Return a single prompt. - """ - - prompt = self.data[idx][self.prompt_key] - - answer = self.data[idx]["solutions"] - - prompt_tokens, prompt_length = self.encode(prompt) - prompt_tokens_tensor = torch.as_tensor(prompt_tokens, dtype=torch.int64) - - if prompt_length > self.max_prompt_length: - print( - f"prompt_tokens_tensor length {prompt_length} exceeds the max_prompt_length {self.max_prompt_length}", - ) - prompt_tokens_tensor = prompt_tokens_tensor[: self.max_prompt_length] - prompt_length = self.max_prompt_length - - prompt_tokens_tensor = batch_pad_to_fixed_len( - [prompt_tokens_tensor], - self.max_prompt_length, - self.tokenizer.eos_token_id, - left_pad=True, - )[0] - output = DatasetItem( - prompt=prompt_tokens_tensor, - length=prompt_length, - answer=answer, - idx=idx, - image_data=[], - ) - return output +from rlinf.data.datasets.item import DatasetItem +from rlinf.data.datasets.utils import batch_pad_to_fixed_len class VLMBaseDataset(Dataset): @@ -542,7 +408,6 @@ def __init__( ) def get_image_list(self, dataitem: Dict[str, Any]) -> List[Union[bytes, str, None]]: - # Prefer common robo2vlm fields if present, else fallback to configured keys images: List[Any] = [] if "images" in dataitem: v = dataitem.get("images") @@ -559,10 +424,8 @@ def get_image_list(self, dataitem: Dict[str, Any]) -> List[Union[bytes, str, Non else: images = [None] else: - # fallback to base behavior using configured image_keys return super().get_image_list(dataitem) - # Normalize each element similar to base behavior normed: List[Union[bytes, str, None]] = [] for v in images: if v is None: @@ -611,107 +474,3 @@ def postprocess_dataset_item( item.answer = answer_dict return item - - -def create_rl_dataset(config: DictConfig, tokenizer): - """Create rl datasets. - - Arguments: - config: The RLinf config. - tokenizer (Tokenizer): The tokenizer. - - Returns: - train_dataset (Dataset): The training dataset. - - val_dataset (Dataset): The validation dataset. 
- """ - - if config.data.type == "math": - dataset_cls = MathDataset - elif config.data.type == "vision_language": - # Prefer new factory-based VLM datasets; fallback to legacy if requested - dataset_name = getattr(config.data, "dataset_name", None) - lazy_loading = bool(getattr(config.data, "lazy_loading", False)) - - print(f"Using VLM dataset: name={dataset_name}, lazy_loading={lazy_loading}") - - train_dataset = VLMDatasetRegistry.create( - dataset_name, - data_paths=config.data.train_data_paths, - config=config, - tokenizer=tokenizer, - ) - val_dataset = VLMDatasetRegistry.create( - dataset_name, - data_paths=config.data.val_data_paths, - config=config, - tokenizer=tokenizer, - ) - return train_dataset, val_dataset - else: - return None, None - - print(f"Using dataset class: {dataset_cls.__name__}") - - # Instantiate the dataset using the determined dataset class - train_dataset = dataset_cls( - data_paths=config.data.train_data_paths, - config=config, - tokenizer=tokenizer, - ) - - val_dataset = dataset_cls( - data_paths=config.data.val_data_paths, - config=config, - tokenizer=tokenizer, - ) - - return train_dataset, val_dataset - - -def collate_fn(data_list: List["DatasetItem"]) -> Dict[str, Any]: - prompts = [] - lens = [] - for it in data_list: - p = ( - it.prompt - if isinstance(it.prompt, torch.Tensor) - else torch.as_tensor(it.prompt, dtype=torch.long) - ) - if p.dim() == 2 and p.size(0) == 1: - p = p.squeeze(0) - assert p.dim() == 1, ( - f"DatasetItem.prompt must be 1-D tensor, current shape is: {p.shape}" - ) - prompts.append(p) - lens.append(p.numel()) - - if len(set(lens)) == 1: - target_len = lens[0] - else: - target_len = min(lens) - prompts = [p[-target_len:] if p.numel() > target_len else p for p in prompts] - - batch_prompt = torch.stack(prompts, dim=0) # [B, L] - batch_length = torch.tensor( - [min(int(it.length), target_len) for it in data_list], dtype=torch.long - ) - - batch_idx = torch.tensor([int(it.idx) for it in data_list], dtype=torch.long) - - batch: Dict[str, Any] = { - "prompt": batch_prompt, # [B, L] - "length": batch_length, # [B] - "answer": [it.answer for it in data_list], # List[str] - "idx": batch_idx, # [B] - "solution": [it.solution for it in data_list], # List[Optional[str]] - "image_data": [ - it.image_data for it in data_list - ], # List[Optional[List[bytes|str]]] - "prompt_text": [it.prompt_text for it in data_list], # List[Optional[str]] - "meta": [it.meta for it in data_list], # List[Optional[dict]] - "multi_modal_inputs": [ - it.multi_modal_inputs for it in data_list - ], # List[Optional[dict]] - } - return batch diff --git a/rlinf/data/io_struct.py b/rlinf/data/io_struct.py index b7dc359b5..cb5e6133e 100644 --- a/rlinf/data/io_struct.py +++ b/rlinf/data/io_struct.py @@ -69,10 +69,15 @@ def repeat(self) -> "RolloutRequest": """ assert self.n > 0, "n must be greater than 0" - input_ids, answers = zip( + input_ids, answers, image_data, multi_modal_inputs = zip( *[ - (input_id, answer) - for input_id, answer in zip(self.input_ids, self.answers) + (input_id, answer, image_data, multi_modal_inputs) + for input_id, answer, image_data, multi_modal_inputs in zip( + self.input_ids, + self.answers, + self.image_data, + self.multi_modal_inputs, + ) for _ in range(self.n) ] ) @@ -80,6 +85,8 @@ def repeat(self) -> "RolloutRequest": n=self.n, input_ids=list(input_ids), answers=list(answers), + image_data=list(image_data), + multi_modal_inputs=list(multi_modal_inputs), ) def split(self, num_splits: int) -> List["RolloutRequest"]: @@ -98,15 +105,27 @@ def 
split(self, num_splits: int) -> List["RolloutRequest"]: input_ids_split_list = split_list(self.input_ids, num_splits) answers_split_list = split_list(self.answers, num_splits) + image_data_split_list = split_list(self.image_data, num_splits) + multi_modal_inputs_split_list = split_list(self.multi_modal_inputs, num_splits) splitted_requests = [] - for input_ids_batch, answers_batch in zip( - input_ids_split_list, answers_split_list + for ( + input_ids_batch, + answers_batch, + image_data_batch, + multi_modal_inputs_batch, + ) in zip( + input_ids_split_list, + answers_split_list, + image_data_split_list, + multi_modal_inputs_split_list, ): request = RolloutRequest( n=self.n, input_ids=input_ids_batch, answers=answers_batch, + image_data=image_data_batch, + multi_modal_inputs=multi_modal_inputs_batch, ) splitted_requests.append(request) diff --git a/rlinf/workers/actor/fsdp_actor_worker.py b/rlinf/workers/actor/fsdp_actor_worker.py index ccf3bc68e..429476402 100644 --- a/rlinf/workers/actor/fsdp_actor_worker.py +++ b/rlinf/workers/actor/fsdp_actor_worker.py @@ -137,7 +137,7 @@ def _setup_rollout_weight_dst_ranks(self): ) def del_reshard_state_dict(self): - if hasattr(self, "rollou_state_dict"): + if hasattr(self, "rollout_state_dict"): del self.rollout_state_dict def sync_model_to_rollout(self): diff --git a/rlinf/workers/reward/reward_worker.py b/rlinf/workers/reward/reward_worker.py index fefd422d6..03b2311af 100644 --- a/rlinf/workers/reward/reward_worker.py +++ b/rlinf/workers/reward/reward_worker.py @@ -114,5 +114,7 @@ def compute_batch_rewards_with_model(self, batch: Dict[str, torch.Tensor]): self.model.eval() with torch.no_grad(): # TODO: fix this - rewards = self.model(batch["input_ids"], batch["attention_mask"]) + rewards = ( + self.model(batch["input_ids"], batch["attention_mask"]).detach().cpu() + ) return rewards diff --git a/rlinf/workers/rollout/sglang/sglang_worker.py b/rlinf/workers/rollout/sglang/sglang_worker.py index a339edd9e..93f268d6c 100644 --- a/rlinf/workers/rollout/sglang/sglang_worker.py +++ b/rlinf/workers/rollout/sglang/sglang_worker.py @@ -276,7 +276,7 @@ async def _compute_reward_and_advantage( ) results = math_verify_call(texts, answers) - rewards = [(1 if r else -1) * self._reward_model.scale for r in results] + rewards = [r * self._reward_model.scale for r in results] rewards_tensor = torch.tensor(rewards, dtype=torch.float) mean = rewards_tensor.mean() @@ -333,20 +333,6 @@ async def rollout(self, input_channel: Channel, output_channel: Channel): if self._completion_info.is_completed(hash_id): results = self._completion_info.get_results(hash_id) - # ( - # rewards, - # advantages, - # ) = await self._compute_reward_and_advantage( - # results, - # self._current_request.answers[raw_id], - # ) - # if ( - # all_floats_equal(rewards) - # and self._cfg.algorithm.get("max_num_gen_batches", 1) > 1 - # ): - # if (total_reqs - droped_reqs) > required_reqs: - # droped_reqs += rollout_request.n - # continue input_ids = [input_ids] * len(results) rollout_result = RolloutResult.from_sglang_results( @@ -355,10 +341,7 @@ async def rollout(self, input_channel: Channel, output_channel: Channel): input_ids, return_logprobs=self._return_logprobs, ) - # rollout_result.rewards = torch.tensor( - # rewards, dtype=torch.float32 - # ).reshape(-1, 1) - # rollout_result.advantages = advantages + return_tasks.append( asyncio.create_task( self._put_result(rollout_result, output_channel) diff --git a/rlinf/workers/rollout/vllm/vllm_worker.py b/rlinf/workers/rollout/vllm/vllm_worker.py index 
899e07f1e..edeab9f0d 100644 --- a/rlinf/workers/rollout/vllm/vllm_worker.py +++ b/rlinf/workers/rollout/vllm/vllm_worker.py @@ -19,7 +19,6 @@ from typing import AsyncGenerator, List, Optional, Union import requests -import torch from omegaconf import DictConfig from PIL.Image import Image from transformers import AutoTokenizer @@ -36,7 +35,7 @@ from rlinf.scheduler import Channel, Worker from rlinf.utils.placement import ComponentPlacement from rlinf.workers.rollout.utils import print_vllm_outputs -from toolkits.math_verifier.verify import MathRewardModel, math_verify_call +from toolkits.math_verifier.verify import MathRewardModel from . import VLLMExecutor @@ -363,32 +362,6 @@ async def _stop(self) -> None: if not self._placement.is_disaggregated: await self.offload_model_weights() - async def _compute_reward_and_advantage(self, rollout_result: RolloutResult): - """ - Compute rewards and advantages for the rollout result using math verification. - """ - answers = rollout_result.answers - outputs = rollout_result.response_texts - num_sequence = rollout_result.num_sequence - assert len(answers) == len(outputs), ( - f"Answers length {len(answers)} != outputs length {len(outputs)}" - ) - assert len(answers) == num_sequence, ( - f"Answers length {len(answers)} != num_sequence {num_sequence}" - ) - - math_verify_results = math_verify_call(outputs, answers) - rewards = [ - (1 if r else -1) * self._reward_model.scale for r in math_verify_results - ] - rewards_tensor = torch.tensor(rewards, dtype=torch.float) - rollout_result.rewards = rewards_tensor.reshape(-1, 1) - - mean = rewards_tensor.mean() - std = rewards_tensor.std(unbiased=False) - advantages = (rewards_tensor - mean) / (std + 1e-6) - rollout_result.advantages = advantages.tolist() - async def rollout_and_return( self, request: RolloutRequest, output_channel: Channel ): @@ -402,8 +375,6 @@ async def rollout_and_return( multi_modal_inputs=request.multi_modal_inputs, return_logprobs=self._return_logprobs, ) - if self._placement.is_disaggregated: - await self._compute_reward_and_advantage(rollout_result) await self._put_result(result=rollout_result, output_channel=output_channel) diff --git a/tests/unit_tests/test_io_struct.py b/tests/unit_tests/test_io_struct.py new file mode 100644 index 000000000..7104c277a --- /dev/null +++ b/tests/unit_tests/test_io_struct.py @@ -0,0 +1,135 @@ +# Copyright 2025 The RLinf Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
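The tests below pin down the interleaving contract of RolloutRequest.repeat(): every per-sample field (input_ids, answers, image_data, multi_modal_inputs) is duplicated n times in place, with the copies of each sample kept adjacent. A compact sketch of the expected ordering (values illustrative):

    # repeat() with n == 2 turns [A, B] into [A, A, B, B] for every parallel
    # per-sample list, so each group occupies one contiguous chunk of size n.
    before = ["A", "B"]
    n = 2
    after = [x for x in before for _ in range(n)]
    assert after == ["A", "A", "B", "B"]

That adjacency is what later lets split_result_list_by_group slice a merged result into per-group chunks without reshuffling.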
+ + +import torch + +from rlinf.data.io_struct import RolloutRequest, RolloutResult + + +def test_rollout_request_repeat_preserves_multimodal(): + request = RolloutRequest( + n=2, + input_ids=[[1, 2, 3], [4, 5]], + image_data=[[b"img1-1", b"img1-2"], []], + answers=["ans1", "ans2"], + multi_modal_inputs=[{"pixels": [1, 2]}, {"pixels": [3]}], + ) + + repeated = request.repeat() + + assert repeated.n == 2 + assert repeated.input_ids == [[1, 2, 3], [1, 2, 3], [4, 5], [4, 5]] + assert repeated.answers == ["ans1", "ans1", "ans2", "ans2"] + assert repeated.image_data == [ + [b"img1-1", b"img1-2"], + [b"img1-1", b"img1-2"], + [], + [], + ] + assert repeated.multi_modal_inputs == [ + {"pixels": [1, 2]}, + {"pixels": [1, 2]}, + {"pixels": [3]}, + {"pixels": [3]}, + ] + + +def _make_rollout_result(): + num_sequence = 4 + group_size = 2 + return RolloutResult( + num_sequence=num_sequence, + group_size=group_size, + prompt_lengths=[3, 3, 4, 4], + prompt_ids=[[11, 12, 13], [11, 12, 13], [21, 22, 23, 24], [21, 22, 23, 24]], + response_lengths=[2, 2, 2, 2], + response_ids=[[101, 102], [201, 202], [301, 302], [401, 402]], + is_end=[True, False, True, True], + answers=[{"answer": "a"}, {"answer": "b"}, {"answer": "c"}, {"answer": "d"}], + image_data=[[b"a"], [b"b"], [b"c"], [b"d"]], + multi_modal_inputs=[ + {"vision": "img-a"}, + {"vision": "img-b"}, + {"vision": "img-c"}, + {"vision": "img-d"}, + ], + prompt_texts=["prompt-a", "prompt-a", "prompt-b", "prompt-b"], + response_texts=["resp-a1", "resp-a2", "resp-b1", "resp-b2"], + rollout_logprobs=[[0.1, 0.2], [0.3, 0.4], [0.5, 0.6], [0.7, 0.8]], + rewards=torch.tensor([[1.0], [0.5], [0.2], [0.1]]), + advantages=[0.1, 0.2, 0.3, 0.4], + prev_logprobs=torch.tensor( + [ + [0.01, 0.02], + [0.03, 0.04], + [0.05, 0.06], + [0.07, 0.08], + ] + ), + ref_logprobs=torch.tensor( + [ + [0.11, 0.12], + [0.13, 0.14], + [0.15, 0.16], + [0.17, 0.18], + ] + ), + ) + + +def test_rollout_result_split_and_merge_roundtrip(): + result = _make_rollout_result() + + split_results = RolloutResult.split_result_list_by_group([result]) + + assert len(split_results) == result.num_sequence // result.group_size + first, second = split_results + + assert first.num_sequence == result.group_size + assert second.num_sequence == result.group_size + assert first.prompt_ids == result.prompt_ids[: result.group_size] + assert second.prompt_ids == result.prompt_ids[result.group_size :] + assert first.response_ids == result.response_ids[: result.group_size] + assert second.response_ids == result.response_ids[result.group_size :] + assert first.prompt_texts == result.prompt_texts[: result.group_size] + assert second.prompt_texts == result.prompt_texts[result.group_size :] + assert first.response_texts == result.response_texts[: result.group_size] + assert second.response_texts == result.response_texts[result.group_size :] + assert first.image_data == result.image_data[: result.group_size] + assert second.image_data == result.image_data[result.group_size :] + assert first.multi_modal_inputs == result.multi_modal_inputs[: result.group_size] + assert second.multi_modal_inputs == result.multi_modal_inputs[result.group_size :] + assert first.rollout_logprobs == result.rollout_logprobs[: result.group_size] + assert second.rollout_logprobs == result.rollout_logprobs[result.group_size :] + assert torch.equal(first.rewards, result.rewards[: result.group_size]) + assert torch.equal(second.rewards, result.rewards[result.group_size :]) + assert first.advantages == result.advantages[: result.group_size] + assert 
second.advantages == result.advantages[result.group_size :] + + merged = RolloutResult.merge_result_list(split_results) + + assert merged.num_sequence == result.num_sequence + assert merged.group_size == result.group_size + assert merged.prompt_ids == result.prompt_ids + assert merged.prompt_lengths == result.prompt_lengths + assert merged.response_ids == result.response_ids + assert merged.response_lengths == result.response_lengths + assert merged.is_end == result.is_end + assert merged.answers == result.answers + assert merged.rollout_logprobs == result.rollout_logprobs + assert merged.advantages == result.advantages + assert torch.equal(merged.rewards, result.rewards) + assert torch.equal(merged.prev_logprobs, result.prev_logprobs) + assert torch.equal(merged.ref_logprobs, result.ref_logprobs) From ecb1ed0b463ec7df5ef4c0417942be9ee0ad4a3e Mon Sep 17 00:00:00 2001 From: Bo Dai Date: Thu, 2 Oct 2025 11:34:25 +0000 Subject: [PATCH 39/57] fix(mm_data): unify vllm/sglang's mm_data passing Signed-off-by: Bo Dai --- rlinf/data/datasets/vlm.py | 5 +- rlinf/data/io_struct.py | 2 +- rlinf/workers/actor/fsdp_actor_worker.py | 4 +- rlinf/workers/rollout/sglang/sglang_worker.py | 4 +- rlinf/workers/rollout/vllm/vllm_worker.py | 62 ++++++++++++++++--- 5 files changed, 64 insertions(+), 13 deletions(-) diff --git a/rlinf/data/datasets/vlm.py b/rlinf/data/datasets/vlm.py index 18956dd81..73509c32a 100644 --- a/rlinf/data/datasets/vlm.py +++ b/rlinf/data/datasets/vlm.py @@ -178,7 +178,10 @@ def encode_prompt( text=[rendered], images=images_inputs, padding=True, return_tensors="pt" ) inputs.pop("attention_mask") - inputs.pop("input_ids") + # NOTE: + # we use these input_ids in inputs rather than belows + # because sglang need corresponding pixel_values len's placeholder + # in input_ids, while vllm does not need. ids = self._processor( text=[rendered], images=None, padding=True, return_tensors="pt" )["input_ids"] diff --git a/rlinf/data/io_struct.py b/rlinf/data/io_struct.py index cb5e6133e..8a01ddc1c 100644 --- a/rlinf/data/io_struct.py +++ b/rlinf/data/io_struct.py @@ -20,7 +20,7 @@ from vllm.outputs import CompletionOutput from vllm.outputs import RequestOutput as VllmRequestOutput -from rlinf.data.datasets import batch_pad_to_fixed_len +from rlinf.data.datasets.utils import batch_pad_to_fixed_len from rlinf.utils.data_iter_utils import ( get_iterator_k_split, split_list, diff --git a/rlinf/workers/actor/fsdp_actor_worker.py b/rlinf/workers/actor/fsdp_actor_worker.py index 429476402..1e9e2c598 100644 --- a/rlinf/workers/actor/fsdp_actor_worker.py +++ b/rlinf/workers/actor/fsdp_actor_worker.py @@ -356,9 +356,9 @@ def run_training(self, input_channel: Channel): ) append_to_dict(metrics, mbs_metrics_data) - mean_metric_dict = { - key: np.mean(value) for key, value in metrics.items() + key: torch.mean(torch.stack(value)) + for key, value in metrics.items() } mean_metric_dict = all_reduce_dict( mean_metric_dict, op=torch.distributed.ReduceOp.AVG diff --git a/rlinf/workers/rollout/sglang/sglang_worker.py b/rlinf/workers/rollout/sglang/sglang_worker.py index 93f268d6c..3fcda8fca 100644 --- a/rlinf/workers/rollout/sglang/sglang_worker.py +++ b/rlinf/workers/rollout/sglang/sglang_worker.py @@ -179,7 +179,9 @@ def rollout(self, input_channel: Channel, output_channel: Channel): # Generate outputs using the SGLang engine. 
with self.worker_timer(): results = self._engine.generate( - input_ids=request.input_ids, + input_ids=request.input_ids + if request.multi_modal_inputs + else request.multi_modal_inputs["input_ids"], # 0.4.4 has modality bug,can't pass non-None image_data image_data=request.image_data if any(request.image_data) else None, sampling_params=self._sampling_params, diff --git a/rlinf/workers/rollout/vllm/vllm_worker.py b/rlinf/workers/rollout/vllm/vllm_worker.py index edeab9f0d..b3f82d9d9 100644 --- a/rlinf/workers/rollout/vllm/vllm_worker.py +++ b/rlinf/workers/rollout/vllm/vllm_worker.py @@ -20,7 +20,7 @@ import requests from omegaconf import DictConfig -from PIL.Image import Image +from PIL import Image from transformers import AutoTokenizer from vllm.config import VllmConfig from vllm.engine.arg_utils import EngineArgs @@ -226,6 +226,23 @@ async def generate( Union[List[List[Union[bytes, str]]], List[Union[bytes, str]]] ] = None, ) -> List[RequestOutput]: + """ + Do Generate Task using the vllm async engine. + + Args: + input_ids: The input token ids to generate. It can be a list of list of int, + or a list of int (single prompt). + sampling_params: The sampling parameters to use for generation. + prompt_texts: The input prompt texts to generate. It can be a list of strings + or a single string. If provided, it will be used instead of input_ids. + image_data: The input multi-modal data to generate. It can be a list of list + of bytes or image paths (local or URL), or a list of bytes or image paths + (single prompt). + + Returns: + List[RequestOutput]: A list of RequestOutput from vllm engine. + """ + def check_input_ids() -> List[List[int]]: assert isinstance(input_ids, list), ( "input_ids should be a list or list of list of int." @@ -266,19 +283,22 @@ def check_image_data() -> Optional[List[List[Image]]]: if prompt_texts is not None: for i, prompt_text in enumerate(prompt_texts): if image_list is not None: - image_list = self._process_image_data(image_data=image_list[i]) + images = self._process_image_data(image_data=image_list[i]) inputs.append( - TextPrompt(prompt=prompt_text, multi_modal_data=image_list) + TextPrompt( + prompt=prompt_text, multi_modal_data={"image": images} + ) ) else: inputs.append(TextPrompt(prompt=prompt_text)) else: for i, input_id in enumerate(input_ids): if image_list is not None: - image_list = self._process_image_data(image_data=image_list[i]) + images = self._process_image_data(image_data=image_list[i]) inputs.append( TokensPrompt( - prompt_token_ids=input_id, multi_modal_data=image_list + prompt_token_ids=input_id, + multi_modal_data={"image": images}, ) ) else: @@ -302,7 +322,8 @@ def check_image_data() -> Optional[List[List[Image]]]: async def init_worker(self) -> None: """ Use EngineArgs and VllmConfig to initialize VLLM async engine. - Then offload the model weights, ready to use weights sent from actor. + If mode is collocated, it will additionally offload model weights, + ready to use parameters sent from actor. """ engine_args: EngineArgs = EngineArgs( model=self._cfg.rollout.model_dir, @@ -349,6 +370,13 @@ async def init_worker(self) -> None: await self.offload_model_weights() async def _put_result(self, result: RolloutResult, output_channel: Channel) -> None: + """ + Helper function to put the result to output channel. + + Args: + result: The RolloutResult to put to the channel. + output_channel: The output channel to send results to. 
+ """ await output_channel.put(result, async_op=True).async_wait() async def _stop(self) -> None: @@ -365,8 +393,18 @@ async def _stop(self) -> None: async def rollout_and_return( self, request: RolloutRequest, output_channel: Channel ): + """ + Helper function to rollout for a single RolloutRequest and build RolloutResult then + put it to output channel. + + Args: + request: The RolloutRequest to process. + output_channel: The output channel to send results to. + """ vllm_results: List[RequestOutput] = await self.generate( - input_ids=request.input_ids, sampling_params=self._sampling_params + input_ids=request.input_ids, + image_data=request.image_data, + sampling_params=self._sampling_params, ) rollout_result: RolloutResult = RolloutResult.from_vllm_results( group_size=self._cfg.algorithm.group_size, @@ -375,10 +413,18 @@ async def rollout_and_return( multi_modal_inputs=request.multi_modal_inputs, return_logprobs=self._return_logprobs, ) - await self._put_result(result=rollout_result, output_channel=output_channel) async def rollout(self, input_channel: Channel, output_channel: Channel) -> None: + """ + Perform rollout using vllm engine. + It will read `RolloutRequest` from input_channel and put `RolloutResult` to output_channel. + If the input request is None, it will stop the rollout. + + Args: + input_channel: The input channel to read from. + output_channel: The output channel to send results to. + """ rollout_request: RolloutRequest = await input_channel.get( async_op=True ).async_wait() From 582a438525ea79d66e7044937bc8a5ea766b5908 Mon Sep 17 00:00:00 2001 From: Bo Dai Date: Thu, 2 Oct 2025 12:59:02 +0000 Subject: [PATCH 40/57] fix(rollout): fix some problems in sglang/vllm, now both are ok Signed-off-by: Bo Dai --- rlinf/data/datasets/vlm.py | 18 +++++++++++------- rlinf/data/io_struct.py | 9 ++++++++- rlinf/utils/data_iter_utils.py | 8 ++++++-- rlinf/workers/actor/fsdp_actor_worker.py | 2 -- rlinf/workers/rollout/sglang/sglang_worker.py | 4 +--- rlinf/workers/rollout/vllm/vllm_worker.py | 9 ++++++++- 6 files changed, 34 insertions(+), 16 deletions(-) diff --git a/rlinf/data/datasets/vlm.py b/rlinf/data/datasets/vlm.py index 73509c32a..da9e952b9 100644 --- a/rlinf/data/datasets/vlm.py +++ b/rlinf/data/datasets/vlm.py @@ -178,13 +178,17 @@ def encode_prompt( text=[rendered], images=images_inputs, padding=True, return_tensors="pt" ) inputs.pop("attention_mask") - # NOTE: - # we use these input_ids in inputs rather than belows - # because sglang need corresponding pixel_values len's placeholder - # in input_ids, while vllm does not need. 
- ids = self._processor( - text=[rendered], images=None, padding=True, return_tensors="pt" - )["input_ids"] + if self.cfg.rollout.rollout_backend == "sglang": + ids = inputs.pop("input_ids") + elif self.cfg.rollout.rollout_backend == "vllm": + inputs.pop("input_ids") + ids = self._processor( + text=[rendered], images=None, padding=True, return_tensors="pt" + )["input_ids"] + else: + raise ValueError( + f"Unsupported rollout backend {self.cfg.rollout.rollout_backend}" + ) if isinstance(ids, torch.Tensor): if ids.dim() == 2 and ids.size(0) == 1: ids = ids.squeeze(0) diff --git a/rlinf/data/io_struct.py b/rlinf/data/io_struct.py index 8a01ddc1c..bb1978754 100644 --- a/rlinf/data/io_struct.py +++ b/rlinf/data/io_struct.py @@ -371,6 +371,13 @@ def get_logprobs( num_sequences = len(results) * group_size + if multi_modal_inputs: + mm_inputs = [] + for mm_input in multi_modal_inputs: + mm_inputs.extend([mm_input] * group_size) + else: + mm_inputs = None + prompt_lengths = [] prompt_ids = [] response_lengths = [] @@ -413,7 +420,7 @@ def get_logprobs( response_ids=response_ids, response_lengths=response_lengths, response_texts=response_texts, - multi_modal_inputs=multi_modal_inputs, + multi_modal_inputs=mm_inputs, is_end=is_end, ) if return_logprobs: diff --git a/rlinf/utils/data_iter_utils.py b/rlinf/utils/data_iter_utils.py index 27bbd7215..31e440201 100644 --- a/rlinf/utils/data_iter_utils.py +++ b/rlinf/utils/data_iter_utils.py @@ -60,13 +60,17 @@ def concat_dict_list(list_of_dicts: List[Dict[str, Any]]) -> Dict[str, Any]: return result -def split_list(inputs, num_chunks, enforce_divisible_batch: Optional[bool] = True): +def split_list( + inputs: List, num_chunks: int, enforce_divisible_batch: Optional[bool] = True +): """ Split a list into equal sized chunks """ if enforce_divisible_batch: chunk_size = len(inputs) // num_chunks - assert len(inputs) % chunk_size == 0, "Issue with batch size configuration!" + assert len(inputs) % chunk_size == 0, ( + f"Issue with batch size configuration! inputs len:{len(inputs)} num_chunks:{num_chunks}" + ) return [inputs[i : i + chunk_size] for i in range(0, len(inputs), chunk_size)] else: k, m = divmod(len(inputs), num_chunks) diff --git a/rlinf/workers/actor/fsdp_actor_worker.py b/rlinf/workers/actor/fsdp_actor_worker.py index 1e9e2c598..7bc376fba 100644 --- a/rlinf/workers/actor/fsdp_actor_worker.py +++ b/rlinf/workers/actor/fsdp_actor_worker.py @@ -214,7 +214,6 @@ def run_training(self, input_channel: Channel): f"Expected {self.total_batch_size_per_dp} sequences from channel, but got {recv_batch_size}" ) batch = RolloutResult.merge_batches(batches) - # Must be called after batch is retrieved, which is when rollout has stopped # Otherwise, loading model might cause OOM self._load_weight_and_optimizer(input_channel) @@ -279,7 +278,6 @@ def run_training(self, input_channel: Channel): ref_logprobs = m_batch["ref_logprobs"] loss_mask = m_batch["attention_mask"][:, -self.response_len :] - output = self.model( input_ids=input_ids, attention_mask=attention_mask, diff --git a/rlinf/workers/rollout/sglang/sglang_worker.py b/rlinf/workers/rollout/sglang/sglang_worker.py index 3fcda8fca..93f268d6c 100644 --- a/rlinf/workers/rollout/sglang/sglang_worker.py +++ b/rlinf/workers/rollout/sglang/sglang_worker.py @@ -179,9 +179,7 @@ def rollout(self, input_channel: Channel, output_channel: Channel): # Generate outputs using the SGLang engine. 
with self.worker_timer():
                results = self._engine.generate(
-                    input_ids=request.input_ids
-                    if request.multi_modal_inputs
-                    else request.multi_modal_inputs["input_ids"],
+                    input_ids=request.input_ids,
                     # 0.4.4 has modality bug,can't pass non-None image_data
                     image_data=request.image_data if any(request.image_data) else None,
                     sampling_params=self._sampling_params,
diff --git a/rlinf/workers/rollout/vllm/vllm_worker.py b/rlinf/workers/rollout/vllm/vllm_worker.py
index b3f82d9d9..1c6e859a5 100644
--- a/rlinf/workers/rollout/vllm/vllm_worker.py
+++ b/rlinf/workers/rollout/vllm/vllm_worker.py
@@ -377,7 +377,14 @@ async def _put_result(self, result: RolloutResult, output_channel: Channel) -> N
         result: The RolloutResult to put to the channel.
         output_channel: The output channel to send results to.
         """
-        await output_channel.put(result, async_op=True).async_wait()
+        # NOTE: Split the result into per-group chunks before sending, so that
+        # the number of items on the channel matches what the reward and actor
+        # workers expect to receive.
+        split_results = RolloutResult.split_result_list_by_group([result])
+        put_tasks = [
+            output_channel.put(r, async_op=True).async_wait() for r in split_results
+        ]
+        await asyncio.gather(*put_tasks)

From fa5b8610d1aeb4cb71768c42bad593b719e5e484 Mon Sep 17 00:00:00 2001
From: Bo Dai
Date: Thu, 2 Oct 2025 16:23:30 +0000
Subject: [PATCH 41/57] fix(ci): add ci for vqa

Signed-off-by: Bo Dai
---
 .github/workflows/vqa_e2e.yml                 |  63 +++++
 rlinf/algorithms/rewards/__init__.py          |   4 +-
 .../sglang/qwen2.5-vl-3b-grpo-collocated.yaml | 222 ++++++++++++++++++
 tests/e2e_tests/vqa/sglang/run_collocated.sh  |  17 ++
 .../vllm/qwen2.5-vl-3b-grpo-collocated.yaml   | 222 ++++++++++++++++++
 tests/e2e_tests/vqa/vllm/run_collocated.sh    |  17 ++
 6 files changed, 543 insertions(+), 2 deletions(-)
 create mode 100644 .github/workflows/vqa_e2e.yml
 create mode 100644 tests/e2e_tests/vqa/sglang/qwen2.5-vl-3b-grpo-collocated.yaml
 create mode 100644 tests/e2e_tests/vqa/sglang/run_collocated.sh
 create mode 100644 tests/e2e_tests/vqa/vllm/qwen2.5-vl-3b-grpo-collocated.yaml
 create mode 100644 tests/e2e_tests/vqa/vllm/run_collocated.sh

diff --git a/.github/workflows/vqa_e2e.yml b/.github/workflows/vqa_e2e.yml
new file mode 100644
index 000000000..58ce056e2
--- /dev/null
+++ b/.github/workflows/vqa_e2e.yml
@@ -0,0 +1,63 @@
+name: VQA End2End
+
+on:
+  push:
+    branches:
+      - 'release/v[0-9].[0-9]'
+      - main
+    paths:
+      - '**/*.py'
+      - 'tests/**'
+      - '.github/workflows/*.yml'
+      - '!docs/**'
+      - '!README.md'
+      - '!*.yaml'
+      - '!*.toml'
+      - '!ray_utils/**'
+      - '!requirements/**'
+
+  pull_request:
+    branches:
+      - 'release/v[0-9].[0-9]'
+      - main
+    paths:
+      - '**/*.py'
+      - 'tests/**'
+      - '.github/workflows/*.yml'
+      - '!docs/**'
+      - '!README.md'
+      - '*.yaml'
+      - '*.toml'
+      - '!ray_utils/**'
+      - '!requirements/**'
+
+permissions:
+  contents: read
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  qwen-vl-grpo-test:
+    runs-on: rlinf
+    container:
+      image: rlinf/rlinf:math-rlinf0.1-torch2.6.0-sglang0.4.6.post5-vllm0.8.5-megatron0.13.0-te2.1
+      volumes:
+        - /mnt/public/dataset:/workspace/dataset
+        - /mnt/public/tokenizer:/workspace/tokenizer
+      options: --gpus="all" --shm-size=80g
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: SGLang Collocated mode
+        run: |
+          export REPO_PATH=$(pwd)
+          bash tests/e2e_tests/vqa/sglang/run_collocated.sh
+
+      - name: vLLM Collocated mode
+        run: |
+          export REPO_PATH=$(pwd)
+          bash tests/e2e_tests/vqa/vllm/run_collocated.sh
diff --git
a/rlinf/algorithms/rewards/__init__.py b/rlinf/algorithms/rewards/__init__.py index 3d354437b..2ab6528ca 100644 --- a/rlinf/algorithms/rewards/__init__.py +++ b/rlinf/algorithms/rewards/__init__.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .math import MathReward -from .vqa import VQAReward +from rlinf.algorithms.rewards.math import MathReward +from rlinf.algorithms.rewards.vqa import VQAReward def register_reward(name: str, reward_class: type): diff --git a/tests/e2e_tests/vqa/sglang/qwen2.5-vl-3b-grpo-collocated.yaml b/tests/e2e_tests/vqa/sglang/qwen2.5-vl-3b-grpo-collocated.yaml new file mode 100644 index 000000000..30c7a150c --- /dev/null +++ b/tests/e2e_tests/vqa/sglang/qwen2.5-vl-3b-grpo-collocated.yaml @@ -0,0 +1,222 @@ +defaults: + - override hydra/job_logging: stdout + +hydra: + run: + dir: . + output_subdir: null + +cluster: + num_nodes: 1 + component_placement: + actor,rollout,reward: all + +runner: + task_type: reasoning + logger: + log_path: ${runner.output_dir}/${runner.experiment_name} + project_name: rlinf + experiment_name: ${runner.experiment_name} + logger_backends: ["tensorboard"] # wandb, swanlab + + max_epochs: 5 + max_steps: -1 + + val_check_interval: 1 + save_interval: 50 + + seq_length: 2048 + + enable_dynamic_batch_size: False + max_tokens_per_mbs: 28672 + + resume_dir: null + experiment_name: grpo-qwen2.5-vl-3b + output_dir: /workspace/results + +algorithm: + group_size: 8 + + n_minibatches: 4 + training_batch_size_per_gpu: 1 # micro batch size + rollout_batch_size_per_gpu: null # If set to null, rollout_batch_size will be evenly divided across all inference instances. You can reduce this parameter if inference consumes too much GPU memory. + + # mbs to do log prob inference, can be set to + # lower than rollout_batch_size_per_gpu to reduce + # memory usage + logprob_forward_micro_batch_size: 1 # ${.rollout_batch_size_per_gpu} + + # val rollout mbs + val_rollout_batch_size_per_gpu: 4 # ${.rollout_batch_size_per_gpu} + + recompute_logprobs: False + shuffle_rollout: False + + # GRPO loss params + loss_type: math_ppo_actor + loss_agg_func: "token-mean" + kl_beta: 0.0 # 0.001 + kl_penalty_type: low_var_kl + ratio_clip_eps: 0.2 + entropy_bonus: 0.0 + calculate_entropy: False + clip_ratio_c: null # 3.0 + clip_ratio_low: null + clip_ratio_high: null + + adv_type: math_grpo + normalize_advantages: True + early_stop_imp_ratio: 5.0 + use_valid_token_scale: False + + # params for rollout + sampling_params: + use_greedy: False + temperature: 1.0 + top_k: 1000000 + top_p: 1.0 + repetition_penalty: 1.0 + max_new_tokens: ${subtract:${runner.seq_length}, ${data.max_prompt_length}} + min_new_tokens: 1 + +rollout: + group_name: "RolloutGroup" + + gpu_memory_utilization: 0.55 + + model_dir: /workspace/dataset/Qwen2.5-VL-3B-Instruct + model_arch: qwen2.5_vl #qwen2.5 + enforce_eager: False # if False, rollout engine will capture cuda graph, which will take more time to initialize. + distributed_executor_backend: mp # ray or mp + disable_log_stats: False + detokenize: False # Whether to detokenize the output. During RL we actually don't need to detokenize it. Can be set to True for debugging. + padding: null # will be tokenizer.pad_token_id if null. it is used to filter megatron's padding for rollout engine + eos: null # will be tokenizer.eos_token_id if null. 
+ + rollout_backend: sglang # here choose which backend to rollout,support [sglang, vllm] + + sglang: + attention_backend: triton # [flashinfer, triton] for more, see sglang's doc + decode_log_interval: 500000 # the interval for SGLang to log the decode time and other stats. + use_torch_compile: False # enable torch_compile in SGLang for rollout. + torch_compile_max_bs: 128 # the maximum batch size for torch compile. If the batch size is larger than this, torch compile will not be used. + + vllm: + attention_backend: FLASH_ATTN #[FLASH_ATTN,XFORMERS] for more, see vllm's doc + enable_chunked_prefill: True # enable vllm to use chunked_prefill. + enable_prefix_caching: True # enable vllm to use prefix_caching. + enable_flash_infer_sampler: True # if True, vllm will use flashinfer to do sampling. + + return_logprobs: ${not:${algorithm.recompute_logprobs}} + + tensor_parallel_size: 2 + pipeline_parallel_size: 1 + + validate_weight: False # whether to send all weights at first for weight comparison. + validate_save_dir: null # the directory to save the weights for comparison. If validate_weight is True, this will be used to save the weights for comparison. + print_outputs: False # whether to print the outputs (token ids, texts, etc.) of rollout engine. + + max_running_requests: 64 # the maximum number of running requests in the rollout engine. + cuda_graph_max_bs: 128 # the maximum batch size for cuda graph. If the batch size is larger than this, cuda graph will not be used. + +data: + type: vision_language + dataset_name: robo2vlm + max_prompt_length: 1024 + filter_prompt_by_length: True + rollout_batch_size: 8 + val_rollout_batch_size: null + num_workers: 2 + prompt_key: prompt + image_keys: ["image"] # some vlm datasets may have multiple image columns + choice_key: "choices" + answer_key: "answer" + solution_key: "solution" + use_chat_template: True + lazy_loading: True + shuffle: True + validation_shuffle: True + seed: 1234 + train_data_paths: ["/workspace/dataset/robo2vlm-1/data/train-00000-of-00262.parquet"] + val_data_paths: ["/workspace/dataset/robo2vlm-1/data/test-00000-of-00003.parquet"] + +actor: + group_name: "ActorGroup" + training_backend: fsdp + mcore_gpt: True + spec_name: decoder_gpt + + enable_offload: True + checkpoint_load_path: null + + global_batch_size: 8 + micro_batch_size: 1 + + enable_dp_load_balance: False + + calculate_flops: False + + seed: 1234 + + model: + precision: bf16 + sharding_strategy: full_shard + is_lora: False + + seq_length: ${runner.seq_length} + encoder_seq_length: ${runner.seq_length} + model_path: /workspace/dataset/Qwen2.5-VL-3B-Instruct + + model_arch: ${rollout.model_arch} + + optim: + optimizer: adam + bf16: True #False + fp16: False #True + lr: 2e-05 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_eps: 1.0e-05 + min_lr: 2.0e-6 + weight_decay: 0.05 + use_distributed_optimizer: True + overlap_grad_reduce: False + overlap_param_gather: False + optimizer_enable_pin: false + overlap_param_gather_with_optimizer_step: False + clip_grad: 0.8 + loss_scale: 65536 + + lr_sched: + lr_warmup_fraction: 0.01 + lr_warmup_init: 0.0 + lr_warmup_iters: 0 + max_lr: 2.0e-5 + min_lr: 0.0 + lr_decay_style: constant + lr_decay_iters: 10 + + tokenizer: + tokenizer_model: /workspace/dataset/Qwen2.5-VL-3B-Instruct + use_fast: False + trust_remote_code: True + padding_side: 'right' + +reward: + group_name: "RewardGroup" + use_reward_model: false + reward_type: 'vqa' + reward_scale: 1.0 + reward_weights: + qa_accuracy: 1.0 + think_format: 0.0 + answer_format: 0.0 + + 
tokenizer: + tokenizer_model: ${actor.tokenizer.tokenizer_model} + use_fast: ${actor.tokenizer.use_fast} + trust_remote_code: ${actor.tokenizer.trust_remote_code} + padding_side: ${actor.tokenizer.padding_side} + +critic: + use_critic_model: false \ No newline at end of file diff --git a/tests/e2e_tests/vqa/sglang/run_collocated.sh b/tests/e2e_tests/vqa/sglang/run_collocated.sh new file mode 100644 index 000000000..43fa65fd0 --- /dev/null +++ b/tests/e2e_tests/vqa/sglang/run_collocated.sh @@ -0,0 +1,17 @@ +#! /bin/bash +set -x + +tabs 4 +export VLLM_ATTENTION_BACKEND=XFORMERS +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export TOKENIZERS_PARALLELISM=false + +export PYTHONPATH=${REPO_PATH}:$PYTHONPATH + +if [ -z "$1" ]; then + CONFIG_NAME="qwen2.5-vl-3b-grpo-collocated" +else + CONFIG_NAME=$1 +fi + +python ${REPO_PATH}/examples/reasoning/main_grpo.py --config-path $REPO_PATH/tests/e2e_tests/math/sglang --config-name $CONFIG_NAME \ No newline at end of file diff --git a/tests/e2e_tests/vqa/vllm/qwen2.5-vl-3b-grpo-collocated.yaml b/tests/e2e_tests/vqa/vllm/qwen2.5-vl-3b-grpo-collocated.yaml new file mode 100644 index 000000000..aef3ec271 --- /dev/null +++ b/tests/e2e_tests/vqa/vllm/qwen2.5-vl-3b-grpo-collocated.yaml @@ -0,0 +1,222 @@ +defaults: + - override hydra/job_logging: stdout + +hydra: + run: + dir: . + output_subdir: null + +cluster: + num_nodes: 1 + component_placement: + actor,rollout,reward: all + +runner: + task_type: reasoning + logger: + log_path: ${runner.output_dir}/${runner.experiment_name} + project_name: rlinf + experiment_name: ${runner.experiment_name} + logger_backends: ["tensorboard"] # wandb, swanlab + + max_epochs: 5 + max_steps: -1 + + val_check_interval: 1 + save_interval: 50 + + seq_length: 2048 + + enable_dynamic_batch_size: False + max_tokens_per_mbs: 28672 + + resume_dir: null + experiment_name: grpo-qwen2.5-vl-3b + output_dir: /workspace/results + +algorithm: + group_size: 8 + + n_minibatches: 4 + training_batch_size_per_gpu: 1 # micro batch size + rollout_batch_size_per_gpu: null # If set to null, rollout_batch_size will be evenly divided across all inference instances. You can reduce this parameter if inference consumes too much GPU memory. + + # mbs to do log prob inference, can be set to + # lower than rollout_batch_size_per_gpu to reduce + # memory usage + logprob_forward_micro_batch_size: 1 # ${.rollout_batch_size_per_gpu} + + # val rollout mbs + val_rollout_batch_size_per_gpu: 4 # ${.rollout_batch_size_per_gpu} + + recompute_logprobs: False + shuffle_rollout: False + + # GRPO loss params + loss_type: math_ppo_actor + loss_agg_func: "token-mean" + kl_beta: 0.0 # 0.001 + kl_penalty_type: low_var_kl + ratio_clip_eps: 0.2 + entropy_bonus: 0.0 + calculate_entropy: False + clip_ratio_c: null # 3.0 + clip_ratio_low: null + clip_ratio_high: null + + adv_type: math_grpo + normalize_advantages: True + early_stop_imp_ratio: 5.0 + use_valid_token_scale: False + + # params for rollout + sampling_params: + use_greedy: False + temperature: 1.0 + top_k: 1000000 + top_p: 1.0 + repetition_penalty: 1.0 + max_new_tokens: ${subtract:${runner.seq_length}, ${data.max_prompt_length}} + min_new_tokens: 1 + +rollout: + group_name: "RolloutGroup" + + gpu_memory_utilization: 0.55 + + model_dir: /workspace/dataset/Qwen2.5-VL-3B-Instruct + model_arch: qwen2.5_vl #qwen2.5 + enforce_eager: False # if False, rollout engine will capture cuda graph, which will take more time to initialize. 
+ distributed_executor_backend: mp # ray or mp + disable_log_stats: False + detokenize: False # Whether to detokenize the output. During RL we actually don't need to detokenize it. Can be set to True for debugging. + padding: null # will be tokenizer.pad_token_id if null. it is used to filter megatron's padding for rollout engine + eos: null # will be tokenizer.eos_token_id if null. + + rollout_backend: vllm # here choose which backend to rollout,support [sglang, vllm] + + sglang: + attention_backend: triton # [flashinfer, triton] for more, see sglang's doc + decode_log_interval: 500000 # the interval for SGLang to log the decode time and other stats. + use_torch_compile: False # enable torch_compile in SGLang for rollout. + torch_compile_max_bs: 128 # the maximum batch size for torch compile. If the batch size is larger than this, torch compile will not be used. + + vllm: + attention_backend: FLASH_ATTN #[FLASH_ATTN,XFORMERS] for more, see vllm's doc + enable_chunked_prefill: True # enable vllm to use chunked_prefill. + enable_prefix_caching: True # enable vllm to use prefix_caching. + enable_flash_infer_sampler: True # if True, vllm will use flashinfer to do sampling. + + return_logprobs: ${not:${algorithm.recompute_logprobs}} + + tensor_parallel_size: 2 + pipeline_parallel_size: 1 + + validate_weight: False # whether to send all weights at first for weight comparison. + validate_save_dir: null # the directory to save the weights for comparison. If validate_weight is True, this will be used to save the weights for comparison. + print_outputs: False # whether to print the outputs (token ids, texts, etc.) of rollout engine. + + max_running_requests: 64 # the maximum number of running requests in the rollout engine. + cuda_graph_max_bs: 128 # the maximum batch size for cuda graph. If the batch size is larger than this, cuda graph will not be used. 
+ +data: + type: vision_language + dataset_name: robo2vlm + max_prompt_length: 1024 + filter_prompt_by_length: True + rollout_batch_size: 8 + val_rollout_batch_size: null + num_workers: 2 + prompt_key: prompt + image_keys: ["image"] # some vlm datasets may have multiple image columns + choice_key: "choices" + answer_key: "answer" + solution_key: "solution" + use_chat_template: True + lazy_loading: True + shuffle: True + validation_shuffle: True + seed: 1234 + train_data_paths: ["/workspace/dataset/robo2vlm-1/data/train-00000-of-00262.parquet"] + val_data_paths: ["/workspace/dataset/robo2vlm-1/data/test-00000-of-00003.parquet"] + +actor: + group_name: "ActorGroup" + training_backend: fsdp + mcore_gpt: True + spec_name: decoder_gpt + + enable_offload: True + checkpoint_load_path: null + + global_batch_size: 8 + micro_batch_size: 1 + + enable_dp_load_balance: False + + calculate_flops: False + + seed: 1234 + + model: + precision: bf16 + sharding_strategy: full_shard + is_lora: False + + seq_length: ${runner.seq_length} + encoder_seq_length: ${runner.seq_length} + model_path: /workspace/dataset/Qwen2.5-VL-3B-Instruct + + model_arch: ${rollout.model_arch} + + optim: + optimizer: adam + bf16: True #False + fp16: False #True + lr: 2e-05 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_eps: 1.0e-05 + min_lr: 2.0e-6 + weight_decay: 0.05 + use_distributed_optimizer: True + overlap_grad_reduce: False + overlap_param_gather: False + optimizer_enable_pin: false + overlap_param_gather_with_optimizer_step: False + clip_grad: 0.8 + loss_scale: 65536 + + lr_sched: + lr_warmup_fraction: 0.01 + lr_warmup_init: 0.0 + lr_warmup_iters: 0 + max_lr: 2.0e-5 + min_lr: 0.0 + lr_decay_style: constant + lr_decay_iters: 10 + + tokenizer: + tokenizer_model: /workspace/dataset/Qwen2.5-VL-3B-Instruct + use_fast: False + trust_remote_code: True + padding_side: 'right' + +reward: + group_name: "RewardGroup" + use_reward_model: false + reward_type: 'vqa' + reward_scale: 1.0 + reward_weights: + qa_accuracy: 1.0 + think_format: 0.0 + answer_format: 0.0 + + tokenizer: + tokenizer_model: ${actor.tokenizer.tokenizer_model} + use_fast: ${actor.tokenizer.use_fast} + trust_remote_code: ${actor.tokenizer.trust_remote_code} + padding_side: ${actor.tokenizer.padding_side} + +critic: + use_critic_model: false \ No newline at end of file diff --git a/tests/e2e_tests/vqa/vllm/run_collocated.sh b/tests/e2e_tests/vqa/vllm/run_collocated.sh new file mode 100644 index 000000000..ab406c4ea --- /dev/null +++ b/tests/e2e_tests/vqa/vllm/run_collocated.sh @@ -0,0 +1,17 @@ +#! 
/bin/bash +set -x + +tabs 4 +export VLLM_ATTENTION_BACKEND=XFORMERS +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export TOKENIZERS_PARALLELISM=false + +export PYTHONPATH=${REPO_PATH}:$PYTHONPATH + +if [ -z "$1" ]; then + CONFIG_NAME="qwen2.5-vl-3b-grpo-collocated" +else + CONFIG_NAME=$1 +fi + +python ${REPO_PATH}/examples/reasoning/main_grpo.py --config-path $REPO_PATH/tests/e2e_tests/vqa/vllm --config-name $CONFIG_NAME \ No newline at end of file From 74535e68e99e1160849750bef670f810ee9b236d Mon Sep 17 00:00:00 2001 From: Bo Dai Date: Fri, 3 Oct 2025 04:57:47 +0000 Subject: [PATCH 42/57] fix(ci): fix some bugs in ci Signed-off-by: Bo Dai --- rlinf/data/io_struct.py | 12 +++++++----- .../megatron/megatron_model_manager.py | 1 + rlinf/workers/actor/fsdp_actor_worker.py | 3 ++- rlinf/workers/actor/megatron_actor_worker.py | 4 ---- rlinf/workers/rollout/sglang/sglang_worker.py | 2 ++ rlinf/workers/rollout/vllm/vllm_worker.py | 4 ++-- ...5-1.5b-grpo-pipeline-rollout-logprobs.yaml | 12 ++++++++++-- .../math/vllm/qwen2.5-1.5b-grpo-pipeline.yaml | 12 ++++++++++-- tests/e2e_tests/math/vllm/run_pipeline.sh | 2 +- tests/e2e_tests/vqa/sglang/run_collocated.sh | 2 +- tests/unit_tests/test_auto_placement.py | 4 ++-- toolkits/auto_placement/scheduler_task.py | 10 ++++++---- toolkits/math_verifier/verify.py | 19 +++++++++++++++---- 13 files changed, 59 insertions(+), 28 deletions(-) diff --git a/rlinf/data/io_struct.py b/rlinf/data/io_struct.py index bb1978754..4948131a6 100644 --- a/rlinf/data/io_struct.py +++ b/rlinf/data/io_struct.py @@ -13,12 +13,14 @@ # limitations under the License. from dataclasses import dataclass -from typing import Callable, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Union import torch from omegaconf import DictConfig -from vllm.outputs import CompletionOutput -from vllm.outputs import RequestOutput as VllmRequestOutput + +if TYPE_CHECKING: + from vllm.outputs import CompletionOutput + from vllm.outputs import RequestOutput as VllmRequestOutput from rlinf.data.datasets.utils import batch_pad_to_fixed_len from rlinf.utils.data_iter_utils import ( @@ -352,13 +354,13 @@ def _get_attention_masks_and_position_ids( @staticmethod def from_vllm_results( group_size: int, - results: List[VllmRequestOutput], + results: List["VllmRequestOutput"], answers: Optional[List[str]] = None, multi_modal_inputs: Optional[List[Dict]] = None, return_logprobs: bool = False, ) -> "RolloutResult": def get_logprobs( - response_ids: List[int], output: CompletionOutput + response_ids: List[int], output: "CompletionOutput" ) -> List[float]: logprobs = [] returned_logprobs = output.logprobs diff --git a/rlinf/hybrid_engines/megatron/megatron_model_manager.py b/rlinf/hybrid_engines/megatron/megatron_model_manager.py index 57f34dbf5..6fe4fbfd6 100644 --- a/rlinf/hybrid_engines/megatron/megatron_model_manager.py +++ b/rlinf/hybrid_engines/megatron/megatron_model_manager.py @@ -184,6 +184,7 @@ def model_provider_func(self, pre_process, post_process): return model def optimizer_step(self, increment): + clear_memory() success, grad_norm, num_zeros_in_grad = self.optimizer.step() self.lr_scheduler.step(increment=increment) diff --git a/rlinf/workers/actor/fsdp_actor_worker.py b/rlinf/workers/actor/fsdp_actor_worker.py index 7bc376fba..e13a8949f 100644 --- a/rlinf/workers/actor/fsdp_actor_worker.py +++ b/rlinf/workers/actor/fsdp_actor_worker.py @@ -25,7 +25,6 @@ import rlinf.algorithms # noqa: F401 from rlinf.algorithms.registry import actor_loss, 
calculate_adv_and_returns -from rlinf.algorithms.rewards import get_reward_class from rlinf.algorithms.utils import ( kl_penalty, preprocess_advantages_inputs, @@ -113,6 +112,8 @@ def __init__(self, cfg: DictConfig, placement: ModelParallelComponentPlacement): assert self.cfg.reward.reward_type in ["math", "vqa"], ( "only support math and vqa reward!" ) + from rlinf.algorithms.rewards import get_reward_class + reward_cls = get_reward_class(self.cfg.reward.reward_type) self.reward = reward_cls(self.cfg.reward) diff --git a/rlinf/workers/actor/megatron_actor_worker.py b/rlinf/workers/actor/megatron_actor_worker.py index a08415ea9..bbac0701c 100644 --- a/rlinf/workers/actor/megatron_actor_worker.py +++ b/rlinf/workers/actor/megatron_actor_worker.py @@ -885,10 +885,6 @@ def compute_advantages_and_returns( input_channel: The input channel to read from. output_channel: The output channel to send results to. """ - if self.is_pipeline: - # In pipeline mode, advantages are computed in the rollout - with self.worker_timer(): - return clear_memory() recv_batch_size = 0 while recv_batch_size < self.total_batch_size_per_dp: diff --git a/rlinf/workers/rollout/sglang/sglang_worker.py b/rlinf/workers/rollout/sglang/sglang_worker.py index 93f268d6c..abdf59365 100644 --- a/rlinf/workers/rollout/sglang/sglang_worker.py +++ b/rlinf/workers/rollout/sglang/sglang_worker.py @@ -335,10 +335,12 @@ async def rollout(self, input_channel: Channel, output_channel: Channel): results = self._completion_info.get_results(hash_id) input_ids = [input_ids] * len(results) + answers = [rollout_request.answers[raw_id]] * len(results) rollout_result = RolloutResult.from_sglang_results( results, rollout_request.n, input_ids, + answers=answers, return_logprobs=self._return_logprobs, ) diff --git a/rlinf/workers/rollout/vllm/vllm_worker.py b/rlinf/workers/rollout/vllm/vllm_worker.py index 1c6e859a5..a67d967ed 100644 --- a/rlinf/workers/rollout/vllm/vllm_worker.py +++ b/rlinf/workers/rollout/vllm/vllm_worker.py @@ -265,8 +265,8 @@ def check_prompt_text() -> Optional[List[str]]: assert len(prompt_texts) > 0, "prompt_text should not be empty." return prompt_texts - def check_image_data() -> Optional[List[List[Image]]]: - if image_data is None: + def check_image_data() -> Optional[List[List[Image.Image]]]: + if image_data is None or not any(image_data): return None assert isinstance(image_data, list), "image_data should be a list." 
if isinstance(image_data[0], list): diff --git a/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml b/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml index 62d0c5247..111969ff4 100644 --- a/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml +++ b/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml @@ -12,9 +12,10 @@ cluster: component_placement: rollout: 0-3 actor: 4-7 + reward: 0-3 runner: - task_type: math + task_type: reasoning logger: log_path: /workspace/results/ project_name: rlinf @@ -66,7 +67,7 @@ algorithm: calculate_entropy: True clip_ratio_c: null # 3.0 - adv_type: grpo + adv_type: math_grpo normalize_advantages: False early_stop_imp_ratio: 5.0 use_valid_token_scale: True @@ -260,9 +261,16 @@ actor: reward: + group_name: "RewardGroup" use_reward_model: false reward_type: 'math' reward_scale: 5.0 + tokenizer: + tokenizer_model: ${actor.tokenizer.tokenizer_model} + use_fast: ${actor.tokenizer.use_fast} + trust_remote_code: ${actor.tokenizer.trust_remote_code} + padding_side: ${actor.tokenizer.padding_side} + critic: use_critic_model: false \ No newline at end of file diff --git a/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-pipeline.yaml b/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-pipeline.yaml index 3f7821587..705757c31 100644 --- a/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-pipeline.yaml +++ b/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-pipeline.yaml @@ -13,9 +13,10 @@ cluster: rollout: 0-3 inference: 4-5 actor: 6-7 + reward: 0-3 runner: - task_type: math + task_type: reasoning logger: log_path: /workspace/results/ project_name: rlinf @@ -67,7 +68,7 @@ algorithm: calculate_entropy: True clip_ratio_c: null # 3.0 - adv_type: grpo + adv_type: math_grpo normalize_advantages: False early_stop_imp_ratio: 5.0 use_valid_token_scale: True @@ -274,9 +275,16 @@ actor: reward: + group_name: "RewardGroup" use_reward_model: false reward_type: 'math' reward_scale: 5.0 + tokenizer: + tokenizer_model: ${actor.tokenizer.tokenizer_model} + use_fast: ${actor.tokenizer.use_fast} + trust_remote_code: ${actor.tokenizer.trust_remote_code} + padding_side: ${actor.tokenizer.padding_side} + critic: use_critic_model: false \ No newline at end of file diff --git a/tests/e2e_tests/math/vllm/run_pipeline.sh b/tests/e2e_tests/math/vllm/run_pipeline.sh index 0a21368f6..59fb19454 100644 --- a/tests/e2e_tests/math/vllm/run_pipeline.sh +++ b/tests/e2e_tests/math/vllm/run_pipeline.sh @@ -14,4 +14,4 @@ else CONFIG_NAME=$1 fi -python ${REPO_PATH}/examples/math/main_math.py --config-path $REPO_PATH/tests/e2e_tests/math/vllm --config-name $CONFIG_NAME \ No newline at end of file +python ${REPO_PATH}/examples/reasoning/main_grpo.py --config-path $REPO_PATH/tests/e2e_tests/math/vllm --config-name $CONFIG_NAME \ No newline at end of file diff --git a/tests/e2e_tests/vqa/sglang/run_collocated.sh b/tests/e2e_tests/vqa/sglang/run_collocated.sh index 43fa65fd0..793c92417 100644 --- a/tests/e2e_tests/vqa/sglang/run_collocated.sh +++ b/tests/e2e_tests/vqa/sglang/run_collocated.sh @@ -14,4 +14,4 @@ else CONFIG_NAME=$1 fi -python ${REPO_PATH}/examples/reasoning/main_grpo.py --config-path $REPO_PATH/tests/e2e_tests/math/sglang --config-name $CONFIG_NAME \ No newline at end of file +python ${REPO_PATH}/examples/reasoning/main_grpo.py --config-path $REPO_PATH/tests/e2e_tests/vqa/sglang --config-name $CONFIG_NAME \ No newline at end of file diff --git a/tests/unit_tests/test_auto_placement.py b/tests/unit_tests/test_auto_placement.py 
index a70b01c9d..559229763 100644 --- a/tests/unit_tests/test_auto_placement.py +++ b/tests/unit_tests/test_auto_placement.py @@ -598,7 +598,7 @@ def test_scheduler_task_initialization(self, mock_validate): """Test SchedulerTask initialization.""" # Create a mock config mock_cfg = MagicMock() - mock_cfg.runner.task_type = "math" + mock_cfg.runner.task_type = "reasoning" mock_cfg.actor.model.tensor_model_parallel_size = 2 mock_cfg.actor.model.pipeline_model_parallel_size = 1 mock_cfg.rollout.tensor_parallel_size = 1 @@ -620,7 +620,7 @@ def test_scheduler_task_initialization(self, mock_validate): scheduler_task = SchedulerTask(mock_cfg, mock_cluster) - assert scheduler_task.is_math is True + assert scheduler_task.is_reasoning is True assert scheduler_task.total_gpus == 8 assert scheduler_task.group_size == 4 assert "actor" in scheduler_task.components_config diff --git a/toolkits/auto_placement/scheduler_task.py b/toolkits/auto_placement/scheduler_task.py index 00d8ea8aa..b3be46012 100644 --- a/toolkits/auto_placement/scheduler_task.py +++ b/toolkits/auto_placement/scheduler_task.py @@ -31,8 +31,10 @@ def __init__( workflow_graph: Optional[Dict[ComponentNode, List[ComponentNode]]] = None, ): self.cfg = cfg - self.is_math = cfg.runner.task_type == "math" - assert self.is_math, "Only math task is supported" + self.is_reasoning = cfg.runner.task_type == "reasoning" + assert self.is_reasoning, ( + f"Only reasoning task is supported, current task type: {cfg.runner.task_type}" + ) self.components_config = { "actor": { @@ -71,7 +73,7 @@ def __init__( self.global_step_batch_size = self.rollout_batch_size * self.group_size if workflow_graph is None: - if self.is_math: + if self.is_reasoning: actor = ComponentNode("actor") inference = ComponentNode("inference") rollout = ComponentNode("rollout") @@ -179,7 +181,7 @@ def parse_partition_allocation_to_cfg( def time_division_multiplexing(self) -> List[Dict[str, Workflow]]: partitions: List[Dict[str, Workflow]] = get_workflow_partition(self.workflow) - if self.is_math: + if self.is_reasoning: valid_partitions = [ i for i in partitions if len(i) in [1, len(self.components_config)] ] diff --git a/toolkits/math_verifier/verify.py b/toolkits/math_verifier/verify.py index 31d92c280..988b86045 100644 --- a/toolkits/math_verifier/verify.py +++ b/toolkits/math_verifier/verify.py @@ -14,7 +14,13 @@ import multiprocessing import re -from concurrent.futures import ProcessPoolExecutor, as_completed +from concurrent.futures import ( + ProcessPoolExecutor, + as_completed, +) +from concurrent.futures import ( + TimeoutError as FuturesTimeoutError, +) from typing import List, Union import regex @@ -403,15 +409,20 @@ def math_verify_call( jobs.append(job) all_jobs.append(jobs) - labels = [] + labels: List[int] = [] has_timeout = False for jobs in all_jobs: + label = 0 try: for job in as_completed(jobs, timeout=timeout): x = job.result() - labels.append(x) - except TimeoutError: + label = label or x + except FuturesTimeoutError: has_timeout = True + for job in jobs: + job.cancel() + finally: + labels.append(label) if has_timeout: reset_global_process_pool() From 62df3139450a83baa087bc06425ff638721b7f8c Mon Sep 17 00:00:00 2001 From: Bo Dai Date: Sun, 5 Oct 2025 00:28:27 +0000 Subject: [PATCH 43/57] fix(fsdp): add forgotten backward and optimizer step Signed-off-by: Bo Dai --- rlinf/workers/actor/fsdp_actor_worker.py | 29 ++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/rlinf/workers/actor/fsdp_actor_worker.py b/rlinf/workers/actor/fsdp_actor_worker.py 
index e13a8949f..5e4a48848 100644 --- a/rlinf/workers/actor/fsdp_actor_worker.py +++ b/rlinf/workers/actor/fsdp_actor_worker.py @@ -346,6 +346,10 @@ def run_training(self, input_channel: Channel): loss = loss + kl_loss * self.kl_beta # add to log + # scale loss for gradient accumulation and backprop + loss = loss / self.gradient_accumulation + loss.backward() + mbs_metrics_data.update( { "final_loss": loss.detach().cpu(), @@ -355,6 +359,18 @@ def run_training(self, input_channel: Channel): ) append_to_dict(metrics, mbs_metrics_data) + # apply gradient clipping and optimizer step at the end of a global batch + grad_norm = None + try: + grad_norm = self.model.clip_grad_norm_( + max_norm=self.cfg.actor.optim.clip_grad + ) + except Exception: + pass + self.optimizer.step() + self.optimizer.zero_grad() + + # aggregate metrics across micro-batches mean_metric_dict = { key: torch.mean(torch.stack(value)) for key, value in metrics.items() @@ -362,6 +378,19 @@ def run_training(self, input_channel: Channel): mean_metric_dict = all_reduce_dict( mean_metric_dict, op=torch.distributed.ReduceOp.AVG ) + # add optimizer stats + if grad_norm is not None: + mean_metric_dict["actor/grad_norm"] = ( + torch.as_tensor( + grad_norm + if torch.is_tensor(grad_norm) + else float(grad_norm) + ) + .float() + .cpu() + ) + lr = self.optimizer.param_groups[0]["lr"] + mean_metric_dict["actor/lr"] = torch.as_tensor(lr).float().cpu() training_metrics_list.append(mean_metric_dict) # Rollout metrics From 1ec7e5450948764605e05a0bc812b9603d4bee96 Mon Sep 17 00:00:00 2001 From: Bo Dai Date: Sun, 5 Oct 2025 11:08:01 +0000 Subject: [PATCH 44/57] fix(collocated): fix inference/rollout do jobs parallelly which causes oom in collocated mode Signed-off-by: Bo Dai --- rlinf/runners/reasoning_runner.py | 1 + rlinf/workers/actor/megatron_actor_worker.py | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/rlinf/runners/reasoning_runner.py b/rlinf/runners/reasoning_runner.py index d68abf72a..b53010e18 100644 --- a/rlinf/runners/reasoning_runner.py +++ b/rlinf/runners/reasoning_runner.py @@ -341,6 +341,7 @@ def run(self): infer_handle: Handle = self.inference.run_inference( input_channel=self.reward_channel, output_channel=self.inference_channel, + rollout_channel=self.rollout_channel, compute_ref_logprobs=self.compute_ref_logprobs, ) inference_channel = self.inference_channel diff --git a/rlinf/workers/actor/megatron_actor_worker.py b/rlinf/workers/actor/megatron_actor_worker.py index bbac0701c..cb74b0d00 100644 --- a/rlinf/workers/actor/megatron_actor_worker.py +++ b/rlinf/workers/actor/megatron_actor_worker.py @@ -841,6 +841,7 @@ def run_inference( self, input_channel: Channel, output_channel: Channel, + rollout_channel: Channel, compute_ref_logprobs: bool, ): """Compute prev/ref logprobs using the actor Model's forward. @@ -851,6 +852,8 @@ def run_inference( compute_ref_logprobs: Whether to compute reference logprobs. 
""" recv_batch_size = 0 + if not self.is_pipeline: + rollout_channel.device_lock.acquire() while recv_batch_size < self.total_batch_size_per_dp: batch, rollout_result = self.get_batch(input_channel) recv_batch_size += rollout_result.num_sequence @@ -874,6 +877,8 @@ def run_inference( assert recv_batch_size == self.total_batch_size_per_dp, ( f"Expected {self.total_batch_size_per_dp} sequences from channel, but got {recv_batch_size}" ) + if not self.is_pipeline: + rollout_channel.device_lock.release() # Advantages and returns def compute_advantages_and_returns( From e57d10df7007f71cff52484cb22b896f946c6c64 Mon Sep 17 00:00:00 2001 From: Bo Dai Date: Wed, 8 Oct 2025 18:44:16 +0000 Subject: [PATCH 45/57] fix(sync_weight): fix oom bugs Signed-off-by: Bo Dai --- rlinf/runners/reasoning_runner.py | 1 - rlinf/workers/actor/fsdp_actor_worker.py | 16 +++-- rlinf/workers/actor/megatron_actor_worker.py | 5 -- rlinf/workers/reward/reward_worker.py | 63 +++++++++----------- 4 files changed, 36 insertions(+), 49 deletions(-) diff --git a/rlinf/runners/reasoning_runner.py b/rlinf/runners/reasoning_runner.py index b53010e18..d68abf72a 100644 --- a/rlinf/runners/reasoning_runner.py +++ b/rlinf/runners/reasoning_runner.py @@ -341,7 +341,6 @@ def run(self): infer_handle: Handle = self.inference.run_inference( input_channel=self.reward_channel, output_channel=self.inference_channel, - rollout_channel=self.rollout_channel, compute_ref_logprobs=self.compute_ref_logprobs, ) inference_channel = self.inference_channel diff --git a/rlinf/workers/actor/fsdp_actor_worker.py b/rlinf/workers/actor/fsdp_actor_worker.py index 5e4a48848..ba35362e0 100644 --- a/rlinf/workers/actor/fsdp_actor_worker.py +++ b/rlinf/workers/actor/fsdp_actor_worker.py @@ -144,9 +144,12 @@ def del_reshard_state_dict(self): def sync_model_to_rollout(self): if next(self.model.parameters()).is_cpu: self.load_fsdp_param_and_grad(self.device) - self.rollout_state_dict = self.get_model_state_dict() + if self.cfg.actor.get("enable_offload", False): + self.offload_fsdp_param_and_grad(offload_grad=True) + self.offload_fsdp_optimizer() + has_visual = any("visual." in k for k in self.rollout_state_dict.keys()) state_dict = {} @@ -161,14 +164,9 @@ def sync_model_to_rollout(self): name = name[6:] state_dict[name] = reduce_tensor(v) - self.send( - state_dict, self._rollout_group_name, self._weight_dst_rank_in_rollout - ) - if self.cfg.actor.get("enable_offload", False): - self.offload_fsdp_param_and_grad() - torch.cuda.synchronize() - gc.collect() - torch.cuda.empty_cache() + self.send( + state_dict, self._rollout_group_name, self._weight_dst_rank_in_rollout + ) def compute_logprobs(self): self.model.eval() diff --git a/rlinf/workers/actor/megatron_actor_worker.py b/rlinf/workers/actor/megatron_actor_worker.py index cb74b0d00..bbac0701c 100644 --- a/rlinf/workers/actor/megatron_actor_worker.py +++ b/rlinf/workers/actor/megatron_actor_worker.py @@ -841,7 +841,6 @@ def run_inference( self, input_channel: Channel, output_channel: Channel, - rollout_channel: Channel, compute_ref_logprobs: bool, ): """Compute prev/ref logprobs using the actor Model's forward. @@ -852,8 +851,6 @@ def run_inference( compute_ref_logprobs: Whether to compute reference logprobs. 
""" recv_batch_size = 0 - if not self.is_pipeline: - rollout_channel.device_lock.acquire() while recv_batch_size < self.total_batch_size_per_dp: batch, rollout_result = self.get_batch(input_channel) recv_batch_size += rollout_result.num_sequence @@ -877,8 +874,6 @@ def run_inference( assert recv_batch_size == self.total_batch_size_per_dp, ( f"Expected {self.total_batch_size_per_dp} sequences from channel, but got {recv_batch_size}" ) - if not self.is_pipeline: - rollout_channel.device_lock.release() # Advantages and returns def compute_advantages_and_returns( diff --git a/rlinf/workers/reward/reward_worker.py b/rlinf/workers/reward/reward_worker.py index 03b2311af..e186eebac 100644 --- a/rlinf/workers/reward/reward_worker.py +++ b/rlinf/workers/reward/reward_worker.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Dict, List, Tuple +from typing import Dict, Tuple import torch from omegaconf import DictConfig @@ -67,48 +67,43 @@ def compute_rewards(self, input_channel: Channel, output_channel: Channel): with self.worker_timer(): recv_batch_size = 0 while recv_batch_size < self.total_batch_size_per_dp: - batch, rollout_result = self.get_batch(input_channel) - + rollout_result: RolloutResult = input_channel.get() recv_batch_size += rollout_result.num_sequence - # Compute rule-based reward + if rollout_result.rewards is None: - rollout_result.rewards = self._compute_batch_rewards( - batch, rollout_result.answers - ) + if self.cfg.reward.use_reward_model: + with input_channel.device_lock: + batch = rollout_result.to_actor_batch( + self.cfg.data.max_prompt_length, + self.cfg.actor.model.encoder_seq_length, + self.tokenizer.eos_token_id, + ) + rollout_result.rewards = ( + self.compute_batch_rewards_with_model(batch) + ) + else: + rollout_result.rewards = self._compute_rule_based_rewards( + rollout_result + ) + output_channel.put(rollout_result) assert recv_batch_size == self.total_batch_size_per_dp, ( f"Expected {self.total_batch_size_per_dp} sequences from channel, but got {recv_batch_size}" ) - def _compute_batch_rewards( - self, batch: Dict[str, torch.Tensor], answers: List[str | dict] - ): - """Reward computation using non-model based reward.""" + def _compute_rule_based_rewards(self, rollout_result: RolloutResult): + # Decode only the generated tokens; response_ids are already the post-prompt tokens + texts = self.tokenizer.batch_decode( + rollout_result.response_ids, skip_special_tokens=True + ) - if self.cfg.reward.use_reward_model: - return self.compute_batch_rewards_with_model(batch) - - texts = [] - for response, response_len in zip( - batch["input_ids"], - batch["response_lengths"], - ): - response = response[ - self.cfg.data.max_prompt_length : self.cfg.data.max_prompt_length - + response_len - ] - texts.append( - self.tokenizer.decode(response.tolist(), skip_special_tokens=True) - ) - reward_scores = self.reward.get_reward(texts, answers) - - all_reward_scores = torch.as_tensor( - reward_scores, - dtype=torch.float, - device=torch.device("cpu"), - ).view(-1, 1) - return all_reward_scores.flatten() + scores = self.reward.get_reward(texts, rollout_result.answers) + return ( + torch.as_tensor(scores, dtype=torch.float, device=torch.device("cpu")) + .view(-1, 1) + .flatten() + ) def compute_batch_rewards_with_model(self, batch: Dict[str, torch.Tensor]): self.model.eval() From d0edcd0dfc0b9ce24c2032c2f0d6690403ec366d Mon Sep 17 00:00:00 2001 From: Bo Dai Date: Thu, 9 Oct 2025 06:36:55 +0000 
Subject: [PATCH 46/57] fix(vlm): the torch 2.6.0 image ships transformers 4.51.1, which works; use 4.51.1 rather than 4.56.1

Signed-off-by: Bo Dai
---
 .../math/qwen2.5-1.5b-grpo-megatron.yaml      | 20 +++++++++----------
 rlinf/runners/reasoning_runner.py             |  1 +
 rlinf/workers/actor/fsdp_actor_worker.py      |  8 ++++++--
 rlinf/workers/actor/megatron_actor_worker.py  |  5 ++++-
 .../sglang/qwen2.5-vl-3b-grpo-collocated.yaml |  4 ++--
 .../vllm/qwen2.5-vl-3b-grpo-collocated.yaml   |  4 ++--
 6 files changed, 25 insertions(+), 17 deletions(-)

diff --git a/examples/reasoning/config/math/qwen2.5-1.5b-grpo-megatron.yaml b/examples/reasoning/config/math/qwen2.5-1.5b-grpo-megatron.yaml
index 63e972b3a..4d851f894 100644
--- a/examples/reasoning/config/math/qwen2.5-1.5b-grpo-megatron.yaml
+++ b/examples/reasoning/config/math/qwen2.5-1.5b-grpo-megatron.yaml
@@ -7,7 +7,7 @@ hydra:
   output_subdir: null
 
 cluster:
-  num_nodes: 1
+  num_nodes: 16
   component_placement:
     actor,rollout,reward: all
 
@@ -25,14 +25,14 @@ runner:
   val_check_interval: 1
   save_interval: 50
 
-  seq_length: 10240
+  seq_length: 28672
 
   enable_dynamic_batch_size: False
   max_tokens_per_mbs: 28672
 
   resume_dir: null
-  experiment_name: megatron-vllm-1.5b-math-test
-  output_dir: /mnt/public/daibo/results
+  experiment_name: grpo-1.5b
+  output_dir: ../results
 
 algorithm:
   group_size: 16
@@ -84,7 +84,7 @@ rollout:
 
   gpu_memory_utilization: 0.55
 
-  model_dir: /mnt/public/hf_models/DeepSeek-R1-Distill-Qwen-1.5B/
+  model_dir: /path/to/model/DeepSeek-R1-Distill-Qwen-1.5B/
   model_arch: qwen2.5
   enforce_eager: False # if False, rollout engine will capture cuda graph, which will take more time to initialize.
   distributed_executor_backend: mp # ray or mp
@@ -93,7 +93,7 @@ rollout:
   padding: null # will be tokenizer.pad_token_id if null. it is used to filter megatron's padding for rollout engine
   eos: null # will be tokenizer.eos_token_id if null.
-  rollout_backend: vllm # here choose which backend to rollout,support [sglang, vllm]
+  rollout_backend: sglang # here choose which backend to rollout,support [sglang, vllm]
 
   sglang:
     attention_backend: triton # [flashinfer, triton] for more, see sglang's doc
@@ -126,15 +126,15 @@ data:
   dataset_name: boba
   max_prompt_length: 1024
   filter_prompt_by_length: True
-  rollout_batch_size: 8
+  rollout_batch_size: 512
   val_rollout_batch_size: null
   num_workers: 2
   prompt_key: prompt
   shuffle: True
   validation_shuffle: True
   seed: 1234
-  train_data_paths: ["/mnt/public/daibo/dataset/boba_106k_0319_prompt_1024.jsonl"]
-  val_data_paths: ["/mnt/public/daibo/dataset/boba_106k_0319_prompt_1024.jsonl"]
+  train_data_paths: ["/dataset/boba/AReaL-boba-106k.jsonl"]
+  val_data_paths: ["/dataset/boba/AReaL-boba-106k.jsonl"]
 
 actor:
   group_name: "ActorGroup"
@@ -216,7 +216,7 @@ actor:
     lr_decay_iters: 10
 
   tokenizer:
-    tokenizer_model: /mnt/public/hf_models/DeepSeek-R1-Distill-Qwen-1.5B/
+    tokenizer_model: /path/to/model/DeepSeek-R1-Distill-Qwen-1.5B/
     use_fast: False
     trust_remote_code: True
     padding_side: 'right'
diff --git a/rlinf/runners/reasoning_runner.py b/rlinf/runners/reasoning_runner.py
index d68abf72a..b53010e18 100644
--- a/rlinf/runners/reasoning_runner.py
+++ b/rlinf/runners/reasoning_runner.py
@@ -341,6 +341,7 @@ def run(self):
         infer_handle: Handle = self.inference.run_inference(
             input_channel=self.reward_channel,
             output_channel=self.inference_channel,
+            rollout_channel=self.rollout_channel,
             compute_ref_logprobs=self.compute_ref_logprobs,
         )
         inference_channel = self.inference_channel
diff --git a/rlinf/workers/actor/fsdp_actor_worker.py b/rlinf/workers/actor/fsdp_actor_worker.py
index ba35362e0..f4a047766 100644
--- a/rlinf/workers/actor/fsdp_actor_worker.py
+++ b/rlinf/workers/actor/fsdp_actor_worker.py
@@ -160,8 +160,12 @@ def sync_model_to_rollout(self):
         if has_visual:
             if name.startswith("model.language_model."):
                 name = "model." + name[21:]
-            elif name.startswith("model."):
-                name = name[6:]
+            # NOTE:
+            # with transformers 4.56.1 (the version used previously; versions
+            # in between are untested), the following lines should be uncommented:
+
+            # elif name.startswith("model."):
+            #     name = name[6:]
             state_dict[name] = reduce_tensor(v)
 
         self.send(
diff --git a/rlinf/workers/actor/megatron_actor_worker.py b/rlinf/workers/actor/megatron_actor_worker.py
index bbac0701c..6736e550b 100644
--- a/rlinf/workers/actor/megatron_actor_worker.py
+++ b/rlinf/workers/actor/megatron_actor_worker.py
@@ -841,6 +841,7 @@ def run_inference(
         self,
         input_channel: Channel,
         output_channel: Channel,
+        rollout_channel: Channel,
         compute_ref_logprobs: bool,
     ):
         """Compute prev/ref logprobs using the actor Model's forward.
@@ -856,7 +857,9 @@ def run_inference( recv_batch_size += rollout_result.num_sequence # Must be called after batch is retrieved, suggesting that rollout has stopped # Otherwise, loading model might cause OOM in the collocated mode - self._load_weight_and_optimizer(input_channel) + self._load_weight_and_optimizer( + input_channel if self.is_pipeline else rollout_channel + ) # Prev logprobs with self.worker_timer(): diff --git a/tests/e2e_tests/vqa/sglang/qwen2.5-vl-3b-grpo-collocated.yaml b/tests/e2e_tests/vqa/sglang/qwen2.5-vl-3b-grpo-collocated.yaml index 30c7a150c..bcbae2f94 100644 --- a/tests/e2e_tests/vqa/sglang/qwen2.5-vl-3b-grpo-collocated.yaml +++ b/tests/e2e_tests/vqa/sglang/qwen2.5-vl-3b-grpo-collocated.yaml @@ -19,8 +19,8 @@ runner: experiment_name: ${runner.experiment_name} logger_backends: ["tensorboard"] # wandb, swanlab - max_epochs: 5 - max_steps: -1 + max_epochs: 1 + max_steps: 3 val_check_interval: 1 save_interval: 50 diff --git a/tests/e2e_tests/vqa/vllm/qwen2.5-vl-3b-grpo-collocated.yaml b/tests/e2e_tests/vqa/vllm/qwen2.5-vl-3b-grpo-collocated.yaml index aef3ec271..f555a7292 100644 --- a/tests/e2e_tests/vqa/vllm/qwen2.5-vl-3b-grpo-collocated.yaml +++ b/tests/e2e_tests/vqa/vllm/qwen2.5-vl-3b-grpo-collocated.yaml @@ -19,8 +19,8 @@ runner: experiment_name: ${runner.experiment_name} logger_backends: ["tensorboard"] # wandb, swanlab - max_epochs: 5 - max_steps: -1 + max_epochs: 1 + max_steps: 3 val_check_interval: 1 save_interval: 50 From 19f2a278489154aca4a198200c3ae6460b2c0c4a Mon Sep 17 00:00:00 2001 From: Bo Dai Date: Fri, 10 Oct 2025 09:08:22 +0000 Subject: [PATCH 47/57] fix(fsdp): use bf16 instead of fp16 for training Signed-off-by: Bo Dai --- .../config/math/qwen2.5-1.5b-grpo-fsdp.yaml | 6 +- rlinf/algorithms/losses.py | 16 ++- rlinf/workers/actor/fsdp_actor_worker.py | 104 +++++------------- rlinf/workers/rollout/vllm/vllm_worker.py | 2 + 4 files changed, 41 insertions(+), 87 deletions(-) diff --git a/examples/reasoning/config/math/qwen2.5-1.5b-grpo-fsdp.yaml b/examples/reasoning/config/math/qwen2.5-1.5b-grpo-fsdp.yaml index c4c646808..e1e97c215 100644 --- a/examples/reasoning/config/math/qwen2.5-1.5b-grpo-fsdp.yaml +++ b/examples/reasoning/config/math/qwen2.5-1.5b-grpo-fsdp.yaml @@ -158,7 +158,7 @@ actor: seed: 1234 model: - precision: fp16 + precision: bf16 sharding_strategy: full_shard is_lora: False @@ -168,8 +168,8 @@ actor: optim: optimizer: adam - bf16: False - fp16: True + bf16: True + fp16: False lr: 2e-05 adam_beta1: 0.9 adam_beta2: 0.95 diff --git a/rlinf/algorithms/losses.py b/rlinf/algorithms/losses.py index 1d66885ea..f1bf025cd 100644 --- a/rlinf/algorithms/losses.py +++ b/rlinf/algorithms/losses.py @@ -233,17 +233,21 @@ def compute_math_ppo_actor_loss(**kwargs): clip_mask = policy_loss1.detach() < policy_loss2.detach() dual_clip_mask.logical_and_(loss_mask) - clip_fraction = clip_mask.logical_and_(loss_mask).count_nonzero() / loss_mask_count - approx_kl = -approx_kl.sum() / loss_mask_count + num_clipped = clip_mask.logical_and_(loss_mask).count_nonzero() + + clip_fraction = num_clipped.float() / float(loss_mask_count) + approx_kl = -approx_kl.sum() / float(loss_mask_count) dual_cliped_ratio = torch.where(dual_clip_mask, ratio, 0) # Compile metrics for logging metrics_data = { - "policy_loss": masked_mean(policy_loss.detach(), loss_mask), - "ratio": masked_mean(ratio.detach(), loss_mask), - "clipped_ratio": masked_mean(clipped_ratio.detach(), loss_mask), - "dual_cliped_ratio": masked_mean(dual_cliped_ratio.detach(), loss_mask), + "policy_loss": 
masked_mean(policy_loss.detach(), loss_mask).detach(), + "ratio": masked_mean(ratio.detach(), loss_mask).detach(), + "clipped_ratio": masked_mean(clipped_ratio.detach(), loss_mask).detach(), + "dual_cliped_ratio": masked_mean( + dual_cliped_ratio.detach(), loss_mask + ).detach(), "approx_kl": approx_kl.detach(), "clip_fraction": clip_fraction.detach(), } diff --git a/rlinf/workers/actor/fsdp_actor_worker.py b/rlinf/workers/actor/fsdp_actor_worker.py index f4a047766..8c467d553 100644 --- a/rlinf/workers/actor/fsdp_actor_worker.py +++ b/rlinf/workers/actor/fsdp_actor_worker.py @@ -14,7 +14,7 @@ import gc import os -from typing import Dict, List, Tuple +from typing import Dict, Tuple import numpy as np import torch @@ -142,14 +142,13 @@ def del_reshard_state_dict(self): del self.rollout_state_dict def sync_model_to_rollout(self): + if self.cfg.actor.get("enable_offload", False): + self.offload_fsdp_optimizer() + if next(self.model.parameters()).is_cpu: self.load_fsdp_param_and_grad(self.device) self.rollout_state_dict = self.get_model_state_dict() - if self.cfg.actor.get("enable_offload", False): - self.offload_fsdp_param_and_grad(offload_grad=True) - self.offload_fsdp_optimizer() - has_visual = any("visual." in k for k in self.rollout_state_dict.keys()) state_dict = {} @@ -168,9 +167,12 @@ def sync_model_to_rollout(self): # name = name[6:] state_dict[name] = reduce_tensor(v) - self.send( - state_dict, self._rollout_group_name, self._weight_dst_rank_in_rollout - ) + self.send( + state_dict, self._rollout_group_name, self._weight_dst_rank_in_rollout + ) + + if self.cfg.actor.get("enable_offload", False): + self.offload_fsdp_param_and_grad() def compute_logprobs(self): self.model.eval() @@ -354,22 +356,23 @@ def run_training(self, input_channel: Channel): mbs_metrics_data.update( { - "final_loss": loss.detach().cpu(), - "entropy_loss": entropy_loss.detach().cpu(), - "kl_loss": kl_loss.detach().cpu(), + "final_loss": loss.detach(), + "entropy_loss": entropy_loss.detach(), + "kl_loss": kl_loss.detach(), } ) append_to_dict(metrics, mbs_metrics_data) # apply gradient clipping and optimizer step at the end of a global batch - grad_norm = None - try: - grad_norm = self.model.clip_grad_norm_( - max_norm=self.cfg.actor.optim.clip_grad + grad_norm = self.model.clip_grad_norm_( + max_norm=self.cfg.actor.optim.clip_grad + ) + if not torch.isfinite(grad_norm).all(): + self.log_warning( + "grad norm is not finite, skip this optimizer step." 
) - except Exception: - pass - self.optimizer.step() + else: + self.optimizer.step() self.optimizer.zero_grad() # aggregate metrics across micro-batches @@ -381,16 +384,12 @@ def run_training(self, input_channel: Channel): mean_metric_dict, op=torch.distributed.ReduceOp.AVG ) # add optimizer stats - if grad_norm is not None: - mean_metric_dict["actor/grad_norm"] = ( - torch.as_tensor( - grad_norm - if torch.is_tensor(grad_norm) - else float(grad_norm) - ) - .float() - .cpu() + if torch.is_tensor(grad_norm): + mean_metric_dict["actor/grad_norm"] = float( + grad_norm.detach().item() ) + else: + mean_metric_dict["actor/grad_norm"] = float(grad_norm) lr = self.optimizer.param_groups[0]["lr"] mean_metric_dict["actor/lr"] = torch.as_tensor(lr).float().cpu() training_metrics_list.append(mean_metric_dict) @@ -412,57 +411,6 @@ def save_checkpoint(self, save_base_path, step): torch.save(optim_state, os.path.join(save_base_path, "optim.pt")) torch.distributed.barrier() - def _compute_batch_rewards( - self, batch: Dict[str, torch.Tensor], answers: List[str] - ): - """Reward computation using non-model based reward.""" - texts = [] - for response, response_len in zip( - batch["input_ids"], - batch["response_lengths"], - ): - response = response[ - self.cfg.data.max_prompt_length : self.cfg.data.max_prompt_length - + response_len - ] - texts.append( - self.tokenizer.decode(response.tolist(), skip_special_tokens=True) - ) - reward_scores = self.reward.get_reward(texts, answers) - - all_reward_scores = torch.as_tensor( - reward_scores, - dtype=torch.float, - device=torch.device("cpu"), - ).view(-1, 1) - return all_reward_scores.flatten() - - # Rewards - def compute_rewards(self, input_channel: Channel, output_channel: Channel): - """Compute rewards. - - Args: - input_channel: The input channel to read from. - output_channel: The output channel to send results to. 
- """ - recv_batch_size = 0 - while recv_batch_size < self.total_batch_size_per_dp: - batch, rollout_result = self.get_batch(input_channel) - recv_batch_size += rollout_result.num_sequence - - # Compute rule-based reward - with self.worker_timer(): - if rollout_result.rewards is None: - rollout_result.rewards = self._compute_batch_rewards( - batch, rollout_result.answers - ) - - self.put_result(rollout_result, output_channel) - - assert recv_batch_size == self.total_batch_size_per_dp, ( - f"Expected {self.total_batch_size_per_dp} sequences from channel, but got {recv_batch_size}" - ) - # Advantages and returns def compute_advantages_and_returns( self, input_channel: Channel, output_channel: Channel diff --git a/rlinf/workers/rollout/vllm/vllm_worker.py b/rlinf/workers/rollout/vllm/vllm_worker.py index a67d967ed..b3629b170 100644 --- a/rlinf/workers/rollout/vllm/vllm_worker.py +++ b/rlinf/workers/rollout/vllm/vllm_worker.py @@ -420,6 +420,8 @@ async def rollout_and_return( multi_modal_inputs=request.multi_modal_inputs, return_logprobs=self._return_logprobs, ) + if self._cfg.rollout.print_outputs: + print_vllm_outputs(outputs=vllm_results) await self._put_result(result=rollout_result, output_channel=output_channel) async def rollout(self, input_channel: Channel, output_channel: Channel) -> None: From d67365ca444fef81f00f2f6a5d74e4200e33e0cf Mon Sep 17 00:00:00 2001 From: Bo Dai Date: Fri, 10 Oct 2025 10:09:43 +0000 Subject: [PATCH 48/57] feat(ci): add fsdp ci Signed-off-by: Bo Dai --- .github/workflows/code-test.yml | 77 +++++-- .github/workflows/vqa_e2e.yml | 4 +- examples/reasoning/run_main_grpo_math.sh | 1 - examples/reasoning/run_main_grpo_vqa.sh | 1 - tests/e2e_tests/math/sglang/run_collocated.sh | 17 -- tests/e2e_tests/math/vllm/run_collocated.sh | 17 -- tests/e2e_tests/math/vllm/run_pipeline.sh | 17 -- ...-collocated-fsdp-sgl-rollout-logprobs.yaml | 213 ++++++++++++++++++ ...qwen2.5-1.5b-grpo-collocated-fsdp-sgl.yaml | 213 ++++++++++++++++++ ...collocated-fsdp-vllm-rollout-logprobs.yaml | 213 ++++++++++++++++++ ...wen2.5-1.5b-grpo-collocated-fsdp-vllm.yaml | 213 ++++++++++++++++++ ...o-collocated-mg-sgl-rollout-logprobs.yaml} | 8 +- .../qwen2.5-1.5b-grpo-collocated-mg-sgl.yaml} | 5 - ...-collocated-mg-vllm-rollout-logprobs.yaml} | 0 ...qwen2.5-1.5b-grpo-collocated-mg-vllm.yaml} | 0 ...rpo-pipeline-mg-sgl-rollout-logprobs.yaml} | 0 .../qwen2.5-1.5b-grpo-pipeline-mg-sgl.yaml} | 0 ...po-pipeline-mg-vllm-rollout-logprobs.yaml} | 0 .../qwen2.5-1.5b-grpo-pipeline-mg-vllm.yaml} | 0 ...en2.5-vl-3b-grpo-collocated-fsdp-sgl.yaml} | 0 ...n2.5-vl-3b-grpo-collocated-fsdp-vllm.yaml} | 0 .../sglang => reasoning}/run_collocated.sh | 7 +- .../sglang => reasoning}/run_pipeline.sh | 6 +- tests/e2e_tests/vqa/vllm/run_collocated.sh | 17 -- 24 files changed, 917 insertions(+), 112 deletions(-) delete mode 100644 tests/e2e_tests/math/sglang/run_collocated.sh delete mode 100644 tests/e2e_tests/math/vllm/run_collocated.sh delete mode 100644 tests/e2e_tests/math/vllm/run_pipeline.sh create mode 100644 tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-sgl-rollout-logprobs.yaml create mode 100644 tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-sgl.yaml create mode 100644 tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-vllm-rollout-logprobs.yaml create mode 100644 tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-vllm.yaml rename tests/e2e_tests/{math/sglang/qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml => 
reasoning/qwen2.5-1.5b-grpo-collocated-mg-sgl-rollout-logprobs.yaml} (96%)
 rename tests/e2e_tests/{math/sglang/qwen2.5-1.5b-grpo-collocated.yaml => reasoning/qwen2.5-1.5b-grpo-collocated-mg-sgl.yaml} (96%)
 rename tests/e2e_tests/{math/vllm/qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml => reasoning/qwen2.5-1.5b-grpo-collocated-mg-vllm-rollout-logprobs.yaml} (100%)
 rename tests/e2e_tests/{math/vllm/qwen2.5-1.5b-grpo-collocated.yaml => reasoning/qwen2.5-1.5b-grpo-collocated-mg-vllm.yaml} (100%)
 rename tests/e2e_tests/{math/sglang/qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml => reasoning/qwen2.5-1.5b-grpo-pipeline-mg-sgl-rollout-logprobs.yaml} (100%)
 rename tests/e2e_tests/{math/sglang/qwen2.5-1.5b-grpo-pipeline.yaml => reasoning/qwen2.5-1.5b-grpo-pipeline-mg-sgl.yaml} (100%)
 rename tests/e2e_tests/{math/vllm/qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml => reasoning/qwen2.5-1.5b-grpo-pipeline-mg-vllm-rollout-logprobs.yaml} (100%)
 rename tests/e2e_tests/{math/vllm/qwen2.5-1.5b-grpo-pipeline.yaml => reasoning/qwen2.5-1.5b-grpo-pipeline-mg-vllm.yaml} (100%)
 rename tests/e2e_tests/{vqa/sglang/qwen2.5-vl-3b-grpo-collocated.yaml => reasoning/qwen2.5-vl-3b-grpo-collocated-fsdp-sgl.yaml} (100%)
 rename tests/e2e_tests/{vqa/vllm/qwen2.5-vl-3b-grpo-collocated.yaml => reasoning/qwen2.5-vl-3b-grpo-collocated-fsdp-vllm.yaml} (100%)
 rename tests/e2e_tests/{vqa/sglang => reasoning}/run_collocated.sh (63%)
 rename tests/e2e_tests/{math/sglang => reasoning}/run_pipeline.sh (63%)
 delete mode 100644 tests/e2e_tests/vqa/vllm/run_collocated.sh

diff --git a/.github/workflows/code-test.yml b/.github/workflows/code-test.yml
index c9c34a6c6..6fd1c1409 100644
--- a/.github/workflows/code-test.yml
+++ b/.github/workflows/code-test.yml
@@ -113,33 +113,48 @@ jobs:
       - name: Checkout code
         uses: actions/checkout@v5
 
-      - name: SGLang Collocated mode
+      - name: Megatron SGLang Collocated mode
         timeout-minutes: 20
         run: |
           export REPO_PATH=$(pwd)
-          source switch_env reason
-          bash tests/e2e_tests/math/sglang/run_collocated.sh
+          source switch_env reasoning
+          bash tests/e2e_tests/reasoning/run_collocated.sh qwen2.5-1.5b-grpo-collocated-mg-sgl
 
-      - name: vLLM Collocated mode
+      - name: Megatron vLLM Collocated mode
         timeout-minutes: 20
         run: |
           export REPO_PATH=$(pwd)
-          source switch_env reason
-          bash tests/e2e_tests/math/vllm/run_collocated.sh
+          source switch_env reasoning
+          bash tests/e2e_tests/reasoning/run_collocated.sh qwen2.5-1.5b-grpo-collocated-mg-vllm
 
-      - name: SGLang Pipeline mode
+      - name: Megatron SGLang Pipeline mode
         timeout-minutes: 20
         run: |
           export REPO_PATH=$(pwd)
-          source switch_env reason
-          bash tests/e2e_tests/math/sglang/run_pipeline.sh
+          source switch_env reasoning
+          bash tests/e2e_tests/reasoning/run_pipeline.sh qwen2.5-1.5b-grpo-pipeline-mg-sgl
 
-      - name: vLLM Pipeline mode
+      - name: Megatron vLLM Pipeline mode
         timeout-minutes: 20
         run: |
           export REPO_PATH=$(pwd)
-          source switch_env reason
-          bash tests/e2e_tests/math/vllm/run_pipeline.sh
+          source switch_env reasoning
+          bash tests/e2e_tests/reasoning/run_pipeline.sh qwen2.5-1.5b-grpo-pipeline-mg-vllm
+
+      - name: FSDP SGLang Collocated mode
+        timeout-minutes: 20
+        run: |
+          export REPO_PATH=$(pwd)
+          source switch_env reasoning
+          bash tests/e2e_tests/reasoning/run_collocated.sh qwen2.5-1.5b-grpo-collocated-fsdp-sgl
+
+      - name: FSDP vLLM Collocated mode
+        timeout-minutes: 20
+        run: |
+          export REPO_PATH=$(pwd)
+          source switch_env reasoning
+          bash tests/e2e_tests/reasoning/run_collocated.sh qwen2.5-1.5b-grpo-collocated-fsdp-vllm
+
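+      # Config naming: qwen2.5-1.5b-grpo-<placement>-<mg|fsdp>-<sgl|vllm>[-rollout-logprobs]
+      # under tests/e2e_tests/reasoning/ (mg = Megatron, fsdp = FSDP, sgl = SGLang; the
+      # -rollout-logprobs variants train on engine-returned logprobs, see the renames above).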
reason-qwen-grpo-test-rollout-logprobs:
     needs: [check-changes]
 
@@ -149,33 +164,47 @@ jobs:
       - name: Checkout code
         uses: actions/checkout@v5
 
-      - name: SGLang Collocated mode
+      - name: Megatron SGLang Collocated mode
         timeout-minutes: 20
         run: |
           export REPO_PATH=$(pwd)
-          source switch_env reason
-          bash tests/e2e_tests/math/sglang/run_collocated.sh qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml
+          source switch_env reasoning
+          bash tests/e2e_tests/reasoning/run_collocated.sh qwen2.5-1.5b-grpo-collocated-mg-sgl-rollout-logprobs
 
-      - name: vLLM Collocated mode
+      - name: Megatron vLLM Collocated mode
         timeout-minutes: 20
         run: |
           export REPO_PATH=$(pwd)
-          source switch_env reason
-          bash tests/e2e_tests/math/vllm/run_collocated.sh qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml
+          source switch_env reasoning
+          bash tests/e2e_tests/reasoning/run_collocated.sh qwen2.5-1.5b-grpo-collocated-mg-vllm-rollout-logprobs
 
-      - name: SGLang Pipeline mode
+      - name: Megatron SGLang Pipeline mode
         timeout-minutes: 20
         run: |
           export REPO_PATH=$(pwd)
-          source switch_env reason
-          bash tests/e2e_tests/math/sglang/run_pipeline.sh qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml
+          source switch_env reasoning
+          bash tests/e2e_tests/reasoning/run_pipeline.sh qwen2.5-1.5b-grpo-pipeline-mg-sgl-rollout-logprobs
 
-      - name: vLLM Pipeline mode
+      - name: Megatron vLLM Pipeline mode
         timeout-minutes: 20
         run: |
           export REPO_PATH=$(pwd)
-          source switch_env reason
-          bash tests/e2e_tests/math/vllm/run_pipeline.sh qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml
+          source switch_env reasoning
+          bash tests/e2e_tests/reasoning/run_pipeline.sh qwen2.5-1.5b-grpo-pipeline-mg-vllm-rollout-logprobs
+
+      - name: FSDP SGLang Collocated mode
+        timeout-minutes: 20
+        run: |
+          export REPO_PATH=$(pwd)
+          source switch_env reasoning
+          bash tests/e2e_tests/reasoning/run_collocated.sh qwen2.5-1.5b-grpo-collocated-fsdp-sgl-rollout-logprobs
+
+      - name: FSDP vLLM Collocated mode
+        timeout-minutes: 20
+        run: |
+          export REPO_PATH=$(pwd)
+          source switch_env reasoning
+          bash tests/e2e_tests/reasoning/run_collocated.sh qwen2.5-1.5b-grpo-collocated-fsdp-vllm-rollout-logprobs
 
   coding-online-rl-qwen-ppo-test:
     needs: [check-changes]
diff --git a/.github/workflows/vqa_e2e.yml b/.github/workflows/vqa_e2e.yml
index 58ce056e2..50b6217aa 100644
--- a/.github/workflows/vqa_e2e.yml
+++ b/.github/workflows/vqa_e2e.yml
@@ -55,9 +55,9 @@ jobs:
       - name: SGLang Collocated mode
         run: |
           export REPO_PATH=$(pwd)
-          bash tests/e2e_tests/vqa/sglang/run_collocated.sh
+          bash tests/e2e_tests/reasoning/run_collocated.sh qwen2.5-vl-3b-grpo-collocated-fsdp-sgl
 
       - name: vLLM Collocated mode
         run: |
           export REPO_PATH=$(pwd)
-          bash tests/e2e_tests/vqa/vllm/run_collocated.sh
+          bash tests/e2e_tests/reasoning/run_collocated.sh qwen2.5-vl-3b-grpo-collocated-fsdp-vllm
diff --git a/examples/reasoning/run_main_grpo_math.sh b/examples/reasoning/run_main_grpo_math.sh
index 56e13c7c2..18a48d780 100644
--- a/examples/reasoning/run_main_grpo_math.sh
+++ b/examples/reasoning/run_main_grpo_math.sh
@@ -2,7 +2,6 @@ set -x
 
 tabs 4
-export VLLM_ATTENTION_BACKEND=XFORMERS
 export CUDA_DEVICE_MAX_CONNECTIONS=1
 export TOKENIZERS_PARALLELISM=false
 export RAY_DEDUP_LOGS=0
diff --git a/examples/reasoning/run_main_grpo_vqa.sh b/examples/reasoning/run_main_grpo_vqa.sh
index 1b41f415c..3cc526f0e 100644
--- a/examples/reasoning/run_main_grpo_vqa.sh
+++ b/examples/reasoning/run_main_grpo_vqa.sh
@@ -2,7 +2,6 @@ set -x
 
 tabs 4
-export VLLM_ATTENTION_BACKEND=XFORMERS
 export CUDA_DEVICE_MAX_CONNECTIONS=1
 export
TOKENIZERS_PARALLELISM=false export RAY_DEDUP_LOGS=0 diff --git a/tests/e2e_tests/math/sglang/run_collocated.sh b/tests/e2e_tests/math/sglang/run_collocated.sh deleted file mode 100644 index 5911653e7..000000000 --- a/tests/e2e_tests/math/sglang/run_collocated.sh +++ /dev/null @@ -1,17 +0,0 @@ -#! /bin/bash -set -x - -tabs 4 -export VLLM_ATTENTION_BACKEND=XFORMERS -export CUDA_DEVICE_MAX_CONNECTIONS=1 -export TOKENIZERS_PARALLELISM=false - -export PYTHONPATH=${REPO_PATH}:$PYTHONPATH - -if [ -z "$1" ]; then - CONFIG_NAME="qwen2.5-1.5b-grpo-collocated" -else - CONFIG_NAME=$1 -fi - -python ${REPO_PATH}/examples/reasoning/main_grpo.py --config-path $REPO_PATH/tests/e2e_tests/math/sglang --config-name $CONFIG_NAME \ No newline at end of file diff --git a/tests/e2e_tests/math/vllm/run_collocated.sh b/tests/e2e_tests/math/vllm/run_collocated.sh deleted file mode 100644 index 6ce4067fd..000000000 --- a/tests/e2e_tests/math/vllm/run_collocated.sh +++ /dev/null @@ -1,17 +0,0 @@ -#! /bin/bash -set -x - -tabs 4 -export VLLM_ATTENTION_BACKEND=XFORMERS -export CUDA_DEVICE_MAX_CONNECTIONS=1 -export TOKENIZERS_PARALLELISM=false - -export PYTHONPATH=${REPO_PATH}:$PYTHONPATH - -if [ -z "$1" ]; then - CONFIG_NAME="qwen2.5-1.5b-grpo-collocated" -else - CONFIG_NAME=$1 -fi - -python ${REPO_PATH}/examples/reasoning/main_grpo.py --config-path $REPO_PATH/tests/e2e_tests/math/vllm --config-name $CONFIG_NAME \ No newline at end of file diff --git a/tests/e2e_tests/math/vllm/run_pipeline.sh b/tests/e2e_tests/math/vllm/run_pipeline.sh deleted file mode 100644 index 59fb19454..000000000 --- a/tests/e2e_tests/math/vllm/run_pipeline.sh +++ /dev/null @@ -1,17 +0,0 @@ -#! /bin/bash -set -x - -tabs 4 -export VLLM_ATTENTION_BACKEND=XFORMERS -export CUDA_DEVICE_MAX_CONNECTIONS=1 -export TOKENIZERS_PARALLELISM=false - -export PYTHONPATH=${REPO_PATH}:$PYTHONPATH - -if [ -z "$1" ]; then - CONFIG_NAME="qwen2.5-1.5b-grpo-pipeline" -else - CONFIG_NAME=$1 -fi - -python ${REPO_PATH}/examples/reasoning/main_grpo.py --config-path $REPO_PATH/tests/e2e_tests/math/vllm --config-name $CONFIG_NAME \ No newline at end of file diff --git a/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-sgl-rollout-logprobs.yaml b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-sgl-rollout-logprobs.yaml new file mode 100644 index 000000000..fd208e21b --- /dev/null +++ b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-sgl-rollout-logprobs.yaml @@ -0,0 +1,213 @@ +defaults: + - override hydra/job_logging: stdout + +hydra: + run: + dir: . + output_subdir: null + +cluster: + num_nodes: 1 + component_placement: + actor,rollout,reward: all + +runner: + task_type: reasoning + logger: + log_path: /workspace/results/ + project_name: rlinf + experiment_name: "ci-test" + logger_backends: ["tensorboard"] # wandb, swanlab + + max_epochs: 1 + max_steps: 3 + + val_check_interval: 1 + save_interval: -1 + + seq_length: 1024 + + enable_dynamic_batch_size: True + max_tokens_per_mbs: 1024 + + resume_dir: null + experiment_name: grpo-1.5b + output_dir: /workspace/results +algorithm: + group_size: 2 + + n_minibatches: 4 + training_batch_size_per_gpu: 1 # micro batch size + rollout_batch_size_per_gpu: null # If set to null, rollout_batch_size will be evenly divided across all inference instances. You can reduce this parameter if inference consumes too much GPU memory. 
+ + # mbs to do log prob inference, can be set to + # lower than rollout_batch_size_per_gpu to reduce + # memory usage + logprob_forward_micro_batch_size: 1 # ${.rollout_batch_size_per_gpu} + + # val rollout mbs + val_rollout_batch_size_per_gpu: 4 # ${.rollout_batch_size_per_gpu} + + recompute_logprobs: False + shuffle_rollout: False + + # GRPO loss params + loss_type: math_ppo_actor + loss_agg_func: "token-mean" + kl_beta: 0.0 # 0.001 + kl_penalty_type: low_var_kl + ratio_clip_eps: 0.2 + entropy_bonus: 0.0 + calculate_entropy: False + clip_ratio_c: null # 3.0 + clip_ratio_low: null + clip_ratio_high: null + + adv_type: math_grpo + normalize_advantages: True + early_stop_imp_ratio: 5.0 + use_valid_token_scale: False + + # params for rollout + sampling_params: + use_greedy: False + temperature: 1.0 + top_k: 1000000 + top_p: 1.0 + repetition_penalty: 1.0 + max_new_tokens: ${subtract:${runner.seq_length}, ${data.max_prompt_length}} + min_new_tokens: 1 + +rollout: + group_name: "RolloutGroup" + + gpu_memory_utilization: 0.55 + + model_dir: /workspace/dataset/DeepSeek-R1-Distill-Qwen-1.5B + model_arch: qwen2.5 + enforce_eager: False # if False, rollout engine will capture cuda graph, which will take more time to initialize. + distributed_executor_backend: mp # ray or mp + disable_log_stats: False + detokenize: False # Whether to detokenize the output. During RL we actually don't need to detokenize it. Can be set to True for debugging. + padding: null # will be tokenizer.pad_token_id if null. it is used to filter megatron's padding for rollout engine + eos: null # will be tokenizer.eos_token_id if null. + + rollout_backend: sglang # [sglang, vllm] + + sglang: + attention_backend: triton # [flashinfer, triton] for more, see sglang's doc + decode_log_interval: 500000 # the interval for SGLang to log the decode time and other stats. + use_torch_compile: False # enable torch_compile in SGLang for rollout. + torch_compile_max_bs: 128 # the maximum batch size for torch compile. If the batch size is larger than this, torch compile will not be used. + + vllm: + attention_backend: FLASH_ATTN #[FLASH_ATTN,XFORMERS] for more, see vllm's doc + enable_chunked_prefill: True # enable vllm to use chunked_prefill. + enable_prefix_caching: True # enable vllm to use prefix_caching. + enable_flash_infer_sampler: True # if True, vllm will use flashinfer to do sampling. + + return_logprobs: ${not:${algorithm.recompute_logprobs}} + + tensor_parallel_size: 1 + pipeline_parallel_size: 1 + + validate_weight: False # whether to send all weights at first for weight comparison. + validate_save_dir: null # the directory to save the weights for comparison. If validate_weight is True, this will be used to save the weights for comparison. + print_outputs: False # whether to print the outputs (token ids, texts, etc.) of rollout engine. + + max_running_requests: 64 # the maximum number of running requests in the rollout engine. + cuda_graph_max_bs: 128 # the maximum batch size for cuda graph. If the batch size is larger than this, cuda graph will not be used. 
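+  # NOTE: this variant consumes the logprobs returned by the rollout engine
+  # (recompute_logprobs: False above); the companion
+  # qwen2.5-1.5b-grpo-collocated-fsdp-sgl.yaml sets recompute_logprobs: True and
+  # recomputes them during training instead.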
+ +data: + type: math + dataset_name: boba + max_prompt_length: 256 + filter_prompt_by_length: True + rollout_batch_size: 8 + val_rollout_batch_size: null + num_workers: 2 + prompt_key: prompt + shuffle: True + validation_shuffle: True + seed: 1 + train_data_paths: ["/workspace/dataset/boba_106k_0319_prompt_1024.jsonl"] + val_data_paths: ["/workspace/dataset/boba_106k_0319_prompt_1024.jsonl"] + +actor: + group_name: "ActorGroup" + training_backend: fsdp + mcore_gpt: True + spec_name: decoder_gpt + + enable_offload: True + checkpoint_load_path: null + + global_batch_size: 8 + micro_batch_size: 1 + + enable_dp_load_balance: False + + calculate_flops: False + + seed: 1234 + + model: + precision: bf16 + sharding_strategy: full_shard + is_lora: False + + seq_length: ${runner.seq_length} + encoder_seq_length: ${runner.seq_length} + model_path: /workspace/dataset/DeepSeek-R1-Distill-Qwen-1.5B + + optim: + optimizer: adam + bf16: True + fp16: False + lr: 2e-05 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_eps: 1.0e-05 + min_lr: 2.0e-6 + weight_decay: 0.05 + use_distributed_optimizer: True + overlap_grad_reduce: False + overlap_param_gather: False + optimizer_enable_pin: false + overlap_param_gather_with_optimizer_step: False + clip_grad: 0.8 + loss_scale: 65536 + + lr_sched: + lr_warmup_fraction: 0.01 + lr_warmup_init: 0.0 + lr_warmup_iters: 0 + max_lr: 2.0e-5 + min_lr: 0.0 + lr_decay_style: constant + lr_decay_iters: 10 + + tokenizer: + tokenizer_model: /workspace/dataset/DeepSeek-R1-Distill-Qwen-1.5B + use_fast: False + trust_remote_code: True + padding_side: 'right' + +reward: + group_name: "RewardGroup" + use_reward_model: false + reward_type: 'math' + reward_scale: 5.0 + reward_weights: + qa_accuracy: 1.0 + think_format: 0.0 + answer_format: 0.0 + + tokenizer: + tokenizer_model: ${actor.tokenizer.tokenizer_model} + use_fast: ${actor.tokenizer.use_fast} + trust_remote_code: ${actor.tokenizer.trust_remote_code} + padding_side: ${actor.tokenizer.padding_side} + +critic: + use_critic_model: false \ No newline at end of file diff --git a/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-sgl.yaml b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-sgl.yaml new file mode 100644 index 000000000..bc7ab77e2 --- /dev/null +++ b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-sgl.yaml @@ -0,0 +1,213 @@ +defaults: + - override hydra/job_logging: stdout + +hydra: + run: + dir: . + output_subdir: null + +cluster: + num_nodes: 1 + component_placement: + actor,rollout,reward: all + +runner: + task_type: reasoning + logger: + log_path: /workspace/results/ + project_name: rlinf + experiment_name: "ci-test" + logger_backends: ["tensorboard"] # wandb, swanlab + + max_epochs: 1 + max_steps: 3 + + val_check_interval: 1 + save_interval: -1 + + seq_length: 1024 + + enable_dynamic_batch_size: True + max_tokens_per_mbs: 1024 + + resume_dir: null + experiment_name: grpo-1.5b + output_dir: /workspace/results +algorithm: + group_size: 2 + + n_minibatches: 4 + training_batch_size_per_gpu: 1 # micro batch size + rollout_batch_size_per_gpu: null # If set to null, rollout_batch_size will be evenly divided across all inference instances. You can reduce this parameter if inference consumes too much GPU memory. 
+ + # mbs to do log prob inference, can be set to + # lower than rollout_batch_size_per_gpu to reduce + # memory usage + logprob_forward_micro_batch_size: 1 # ${.rollout_batch_size_per_gpu} + + # val rollout mbs + val_rollout_batch_size_per_gpu: 4 # ${.rollout_batch_size_per_gpu} + + recompute_logprobs: True + shuffle_rollout: False + + # GRPO loss params + loss_type: math_ppo_actor + loss_agg_func: "token-mean" + kl_beta: 0.0 # 0.001 + kl_penalty_type: low_var_kl + ratio_clip_eps: 0.2 + entropy_bonus: 0.0 + calculate_entropy: False + clip_ratio_c: null # 3.0 + clip_ratio_low: null + clip_ratio_high: null + + adv_type: math_grpo + normalize_advantages: True + early_stop_imp_ratio: 5.0 + use_valid_token_scale: False + + # params for rollout + sampling_params: + use_greedy: False + temperature: 1.0 + top_k: 1000000 + top_p: 1.0 + repetition_penalty: 1.0 + max_new_tokens: ${subtract:${runner.seq_length}, ${data.max_prompt_length}} + min_new_tokens: 1 + +rollout: + group_name: "RolloutGroup" + + gpu_memory_utilization: 0.55 + + model_dir: /workspace/dataset/DeepSeek-R1-Distill-Qwen-1.5B + model_arch: qwen2.5 + enforce_eager: False # if False, rollout engine will capture cuda graph, which will take more time to initialize. + distributed_executor_backend: mp # ray or mp + disable_log_stats: False + detokenize: False # Whether to detokenize the output. During RL we actually don't need to detokenize it. Can be set to True for debugging. + padding: null # will be tokenizer.pad_token_id if null. it is used to filter megatron's padding for rollout engine + eos: null # will be tokenizer.eos_token_id if null. + + rollout_backend: sglang # [sglang, vllm] + + sglang: + attention_backend: triton # [flashinfer, triton] for more, see sglang's doc + decode_log_interval: 500000 # the interval for SGLang to log the decode time and other stats. + use_torch_compile: False # enable torch_compile in SGLang for rollout. + torch_compile_max_bs: 128 # the maximum batch size for torch compile. If the batch size is larger than this, torch compile will not be used. + + vllm: + attention_backend: FLASH_ATTN #[FLASH_ATTN,XFORMERS] for more, see vllm's doc + enable_chunked_prefill: True # enable vllm to use chunked_prefill. + enable_prefix_caching: True # enable vllm to use prefix_caching. + enable_flash_infer_sampler: True # if True, vllm will use flashinfer to do sampling. + + return_logprobs: ${not:${algorithm.recompute_logprobs}} + + tensor_parallel_size: 1 + pipeline_parallel_size: 1 + + validate_weight: False # whether to send all weights at first for weight comparison. + validate_save_dir: null # the directory to save the weights for comparison. If validate_weight is True, this will be used to save the weights for comparison. + print_outputs: False # whether to print the outputs (token ids, texts, etc.) of rollout engine. + + max_running_requests: 64 # the maximum number of running requests in the rollout engine. + cuda_graph_max_bs: 128 # the maximum batch size for cuda graph. If the batch size is larger than this, cuda graph will not be used. 
+ +data: + type: math + dataset_name: boba + max_prompt_length: 256 + filter_prompt_by_length: True + rollout_batch_size: 8 + val_rollout_batch_size: null + num_workers: 2 + prompt_key: prompt + shuffle: True + validation_shuffle: True + seed: 1 + train_data_paths: ["/workspace/dataset/boba_106k_0319_prompt_1024.jsonl"] + val_data_paths: ["/workspace/dataset/boba_106k_0319_prompt_1024.jsonl"] + +actor: + group_name: "ActorGroup" + training_backend: fsdp + mcore_gpt: True + spec_name: decoder_gpt + + enable_offload: True + checkpoint_load_path: null + + global_batch_size: 8 + micro_batch_size: 1 + + enable_dp_load_balance: False + + calculate_flops: False + + seed: 1234 + + model: + precision: bf16 + sharding_strategy: full_shard + is_lora: False + + seq_length: ${runner.seq_length} + encoder_seq_length: ${runner.seq_length} + model_path: /workspace/dataset/DeepSeek-R1-Distill-Qwen-1.5B + + optim: + optimizer: adam + bf16: True + fp16: False + lr: 2e-05 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_eps: 1.0e-05 + min_lr: 2.0e-6 + weight_decay: 0.05 + use_distributed_optimizer: True + overlap_grad_reduce: False + overlap_param_gather: False + optimizer_enable_pin: false + overlap_param_gather_with_optimizer_step: False + clip_grad: 0.8 + loss_scale: 65536 + + lr_sched: + lr_warmup_fraction: 0.01 + lr_warmup_init: 0.0 + lr_warmup_iters: 0 + max_lr: 2.0e-5 + min_lr: 0.0 + lr_decay_style: constant + lr_decay_iters: 10 + + tokenizer: + tokenizer_model: /workspace/dataset/DeepSeek-R1-Distill-Qwen-1.5B + use_fast: False + trust_remote_code: True + padding_side: 'right' + +reward: + group_name: "RewardGroup" + use_reward_model: false + reward_type: 'math' + reward_scale: 5.0 + reward_weights: + qa_accuracy: 1.0 + think_format: 0.0 + answer_format: 0.0 + + tokenizer: + tokenizer_model: ${actor.tokenizer.tokenizer_model} + use_fast: ${actor.tokenizer.use_fast} + trust_remote_code: ${actor.tokenizer.trust_remote_code} + padding_side: ${actor.tokenizer.padding_side} + +critic: + use_critic_model: false \ No newline at end of file diff --git a/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-vllm-rollout-logprobs.yaml b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-vllm-rollout-logprobs.yaml new file mode 100644 index 000000000..f2b15f9cb --- /dev/null +++ b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-vllm-rollout-logprobs.yaml @@ -0,0 +1,213 @@ +defaults: + - override hydra/job_logging: stdout + +hydra: + run: + dir: . + output_subdir: null + +cluster: + num_nodes: 1 + component_placement: + actor,rollout,reward: all + +runner: + task_type: reasoning + logger: + log_path: /workspace/results/ + project_name: rlinf + experiment_name: "ci-test" + logger_backends: ["tensorboard"] # wandb, swanlab + + max_epochs: 1 + max_steps: 3 + + val_check_interval: 1 + save_interval: -1 + + seq_length: 1024 + + enable_dynamic_batch_size: True + max_tokens_per_mbs: 1024 + + resume_dir: null + experiment_name: grpo-1.5b + output_dir: /workspace/results +algorithm: + group_size: 2 + + n_minibatches: 4 + training_batch_size_per_gpu: 1 # micro batch size + rollout_batch_size_per_gpu: null # If set to null, rollout_batch_size will be evenly divided across all inference instances. You can reduce this parameter if inference consumes too much GPU memory. 
+ + # mbs to do log prob inference, can be set to + # lower than rollout_batch_size_per_gpu to reduce + # memory usage + logprob_forward_micro_batch_size: 1 # ${.rollout_batch_size_per_gpu} + + # val rollout mbs + val_rollout_batch_size_per_gpu: 4 # ${.rollout_batch_size_per_gpu} + + recompute_logprobs: False + shuffle_rollout: False + + # GRPO loss params + loss_type: math_ppo_actor + loss_agg_func: "token-mean" + kl_beta: 0.0 # 0.001 + kl_penalty_type: low_var_kl + ratio_clip_eps: 0.2 + entropy_bonus: 0.0 + calculate_entropy: False + clip_ratio_c: null # 3.0 + clip_ratio_low: null + clip_ratio_high: null + + adv_type: math_grpo + normalize_advantages: True + early_stop_imp_ratio: 5.0 + use_valid_token_scale: False + + # params for rollout + sampling_params: + use_greedy: False + temperature: 1.0 + top_k: 1000000 + top_p: 1.0 + repetition_penalty: 1.0 + max_new_tokens: ${subtract:${runner.seq_length}, ${data.max_prompt_length}} + min_new_tokens: 1 + +rollout: + group_name: "RolloutGroup" + + gpu_memory_utilization: 0.55 + + model_dir: /workspace/dataset/DeepSeek-R1-Distill-Qwen-1.5B + model_arch: qwen2.5 + enforce_eager: False # if False, rollout engine will capture cuda graph, which will take more time to initialize. + distributed_executor_backend: mp # ray or mp + disable_log_stats: False + detokenize: False # Whether to detokenize the output. During RL we actually don't need to detokenize it. Can be set to True for debugging. + padding: null # will be tokenizer.pad_token_id if null. it is used to filter megatron's padding for rollout engine + eos: null # will be tokenizer.eos_token_id if null. + + rollout_backend: vllm # [sglang, vllm] + + sglang: + attention_backend: triton # [flashinfer, triton] for more, see sglang's doc + decode_log_interval: 500000 # the interval for SGLang to log the decode time and other stats. + use_torch_compile: False # enable torch_compile in SGLang for rollout. + torch_compile_max_bs: 128 # the maximum batch size for torch compile. If the batch size is larger than this, torch compile will not be used. + + vllm: + attention_backend: FLASH_ATTN #[FLASH_ATTN,XFORMERS] for more, see vllm's doc + enable_chunked_prefill: True # enable vllm to use chunked_prefill. + enable_prefix_caching: True # enable vllm to use prefix_caching. + enable_flash_infer_sampler: True # if True, vllm will use flashinfer to do sampling. + + return_logprobs: ${not:${algorithm.recompute_logprobs}} + + tensor_parallel_size: 1 + pipeline_parallel_size: 1 + + validate_weight: False # whether to send all weights at first for weight comparison. + validate_save_dir: null # the directory to save the weights for comparison. If validate_weight is True, this will be used to save the weights for comparison. + print_outputs: False # whether to print the outputs (token ids, texts, etc.) of rollout engine. + + max_running_requests: 64 # the maximum number of running requests in the rollout engine. + cuda_graph_max_bs: 128 # the maximum batch size for cuda graph. If the batch size is larger than this, cuda graph will not be used. 
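  # Note on this "-rollout-logprobs" variant (added commentary, not part of the
  # committed file): algorithm.recompute_logprobs is False here, so the
  # ${not:...} resolver above flips return_logprobs to True and training reuses
  # the logprobs the vLLM engine reports at generation time, skipping the extra
  # actor forward pass that the recompute variants perform.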
+ +data: + type: math + dataset_name: boba + max_prompt_length: 256 + filter_prompt_by_length: True + rollout_batch_size: 8 + val_rollout_batch_size: null + num_workers: 2 + prompt_key: prompt + shuffle: True + validation_shuffle: True + seed: 1 + train_data_paths: ["/workspace/dataset/boba_106k_0319_prompt_1024.jsonl"] + val_data_paths: ["/workspace/dataset/boba_106k_0319_prompt_1024.jsonl"] + +actor: + group_name: "ActorGroup" + training_backend: fsdp + mcore_gpt: True + spec_name: decoder_gpt + + enable_offload: True + checkpoint_load_path: null + + global_batch_size: 8 + micro_batch_size: 1 + + enable_dp_load_balance: False + + calculate_flops: False + + seed: 1234 + + model: + precision: bf16 + sharding_strategy: full_shard + is_lora: False + + seq_length: ${runner.seq_length} + encoder_seq_length: ${runner.seq_length} + model_path: /workspace/dataset/DeepSeek-R1-Distill-Qwen-1.5B + + optim: + optimizer: adam + bf16: True + fp16: False + lr: 2e-05 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_eps: 1.0e-05 + min_lr: 2.0e-6 + weight_decay: 0.05 + use_distributed_optimizer: True + overlap_grad_reduce: False + overlap_param_gather: False + optimizer_enable_pin: false + overlap_param_gather_with_optimizer_step: False + clip_grad: 0.8 + loss_scale: 65536 + + lr_sched: + lr_warmup_fraction: 0.01 + lr_warmup_init: 0.0 + lr_warmup_iters: 0 + max_lr: 2.0e-5 + min_lr: 0.0 + lr_decay_style: constant + lr_decay_iters: 10 + + tokenizer: + tokenizer_model: /workspace/dataset/DeepSeek-R1-Distill-Qwen-1.5B + use_fast: False + trust_remote_code: True + padding_side: 'right' + +reward: + group_name: "RewardGroup" + use_reward_model: false + reward_type: 'math' + reward_scale: 5.0 + reward_weights: + qa_accuracy: 1.0 + think_format: 0.0 + answer_format: 0.0 + + tokenizer: + tokenizer_model: ${actor.tokenizer.tokenizer_model} + use_fast: ${actor.tokenizer.use_fast} + trust_remote_code: ${actor.tokenizer.trust_remote_code} + padding_side: ${actor.tokenizer.padding_side} + +critic: + use_critic_model: false \ No newline at end of file diff --git a/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-vllm.yaml b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-vllm.yaml new file mode 100644 index 000000000..19a76629f --- /dev/null +++ b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-vllm.yaml @@ -0,0 +1,213 @@ +defaults: + - override hydra/job_logging: stdout + +hydra: + run: + dir: . + output_subdir: null + +cluster: + num_nodes: 1 + component_placement: + actor,rollout,reward: all + +runner: + task_type: reasoning + logger: + log_path: /workspace/results/ + project_name: rlinf + experiment_name: "ci-test" + logger_backends: ["tensorboard"] # wandb, swanlab + + max_epochs: 1 + max_steps: 3 + + val_check_interval: 1 + save_interval: -1 + + seq_length: 1024 + + enable_dynamic_batch_size: True + max_tokens_per_mbs: 1024 + + resume_dir: null + experiment_name: grpo-1.5b + output_dir: /workspace/results +algorithm: + group_size: 2 + + n_minibatches: 4 + training_batch_size_per_gpu: 1 # micro batch size + rollout_batch_size_per_gpu: null # If set to null, rollout_batch_size will be evenly divided across all inference instances. You can reduce this parameter if inference consumes too much GPU memory. 
+ + # mbs to do log prob inference, can be set to + # lower than rollout_batch_size_per_gpu to reduce + # memory usage + logprob_forward_micro_batch_size: 1 # ${.rollout_batch_size_per_gpu} + + # val rollout mbs + val_rollout_batch_size_per_gpu: 4 # ${.rollout_batch_size_per_gpu} + + recompute_logprobs: True + shuffle_rollout: False + + # GRPO loss params + loss_type: math_ppo_actor + loss_agg_func: "token-mean" + kl_beta: 0.0 # 0.001 + kl_penalty_type: low_var_kl + ratio_clip_eps: 0.2 + entropy_bonus: 0.0 + calculate_entropy: False + clip_ratio_c: null # 3.0 + clip_ratio_low: null + clip_ratio_high: null + + adv_type: math_grpo + normalize_advantages: True + early_stop_imp_ratio: 5.0 + use_valid_token_scale: False + + # params for rollout + sampling_params: + use_greedy: False + temperature: 1.0 + top_k: 1000000 + top_p: 1.0 + repetition_penalty: 1.0 + max_new_tokens: ${subtract:${runner.seq_length}, ${data.max_prompt_length}} + min_new_tokens: 1 + +rollout: + group_name: "RolloutGroup" + + gpu_memory_utilization: 0.55 + + model_dir: /workspace/dataset/DeepSeek-R1-Distill-Qwen-1.5B + model_arch: qwen2.5 + enforce_eager: False # if False, rollout engine will capture cuda graph, which will take more time to initialize. + distributed_executor_backend: mp # ray or mp + disable_log_stats: False + detokenize: False # Whether to detokenize the output. During RL we actually don't need to detokenize it. Can be set to True for debugging. + padding: null # will be tokenizer.pad_token_id if null. it is used to filter megatron's padding for rollout engine + eos: null # will be tokenizer.eos_token_id if null. + + rollout_backend: vllm # [sglang, vllm] + + sglang: + attention_backend: triton # [flashinfer, triton] for more, see sglang's doc + decode_log_interval: 500000 # the interval for SGLang to log the decode time and other stats. + use_torch_compile: False # enable torch_compile in SGLang for rollout. + torch_compile_max_bs: 128 # the maximum batch size for torch compile. If the batch size is larger than this, torch compile will not be used. + + vllm: + attention_backend: FLASH_ATTN #[FLASH_ATTN,XFORMERS] for more, see vllm's doc + enable_chunked_prefill: True # enable vllm to use chunked_prefill. + enable_prefix_caching: True # enable vllm to use prefix_caching. + enable_flash_infer_sampler: True # if True, vllm will use flashinfer to do sampling. + + return_logprobs: ${not:${algorithm.recompute_logprobs}} + + tensor_parallel_size: 1 + pipeline_parallel_size: 1 + + validate_weight: False # whether to send all weights at first for weight comparison. + validate_save_dir: null # the directory to save the weights for comparison. If validate_weight is True, this will be used to save the weights for comparison. + print_outputs: False # whether to print the outputs (token ids, texts, etc.) of rollout engine. + + max_running_requests: 64 # the maximum number of running requests in the rollout engine. + cuda_graph_max_bs: 128 # the maximum batch size for cuda graph. If the batch size is larger than this, cuda graph will not be used. 
+ +data: + type: math + dataset_name: boba + max_prompt_length: 256 + filter_prompt_by_length: True + rollout_batch_size: 8 + val_rollout_batch_size: null + num_workers: 2 + prompt_key: prompt + shuffle: True + validation_shuffle: True + seed: 1 + train_data_paths: ["/workspace/dataset/boba_106k_0319_prompt_1024.jsonl"] + val_data_paths: ["/workspace/dataset/boba_106k_0319_prompt_1024.jsonl"] + +actor: + group_name: "ActorGroup" + training_backend: fsdp + mcore_gpt: True + spec_name: decoder_gpt + + enable_offload: True + checkpoint_load_path: null + + global_batch_size: 8 + micro_batch_size: 1 + + enable_dp_load_balance: False + + calculate_flops: False + + seed: 1234 + + model: + precision: bf16 + sharding_strategy: full_shard + is_lora: False + + seq_length: ${runner.seq_length} + encoder_seq_length: ${runner.seq_length} + model_path: /workspace/dataset/DeepSeek-R1-Distill-Qwen-1.5B + + optim: + optimizer: adam + bf16: True + fp16: False + lr: 2e-05 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_eps: 1.0e-05 + min_lr: 2.0e-6 + weight_decay: 0.05 + use_distributed_optimizer: True + overlap_grad_reduce: False + overlap_param_gather: False + optimizer_enable_pin: false + overlap_param_gather_with_optimizer_step: False + clip_grad: 0.8 + loss_scale: 65536 + + lr_sched: + lr_warmup_fraction: 0.01 + lr_warmup_init: 0.0 + lr_warmup_iters: 0 + max_lr: 2.0e-5 + min_lr: 0.0 + lr_decay_style: constant + lr_decay_iters: 10 + + tokenizer: + tokenizer_model: /workspace/dataset/DeepSeek-R1-Distill-Qwen-1.5B + use_fast: False + trust_remote_code: True + padding_side: 'right' + +reward: + group_name: "RewardGroup" + use_reward_model: false + reward_type: 'math' + reward_scale: 5.0 + reward_weights: + qa_accuracy: 1.0 + think_format: 0.0 + answer_format: 0.0 + + tokenizer: + tokenizer_model: ${actor.tokenizer.tokenizer_model} + use_fast: ${actor.tokenizer.use_fast} + trust_remote_code: ${actor.tokenizer.trust_remote_code} + padding_side: ${actor.tokenizer.padding_side} + +critic: + use_critic_model: false \ No newline at end of file diff --git a/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-mg-sgl-rollout-logprobs.yaml similarity index 96% rename from tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml rename to tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-mg-sgl-rollout-logprobs.yaml index 3516fe44b..4edc14979 100644 --- a/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml +++ b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-mg-sgl-rollout-logprobs.yaml @@ -84,11 +84,11 @@ rollout: model_dir: /workspace/dataset/DeepSeek-R1-Distill-Qwen-1.5B model_arch: qwen2.5 - enforce_eager: False # if False, vllm will capture cuda graph, which will take more time to initialize. + enforce_eager: False # if False, rollout engine will capture cuda graph, which will take more time to initialize. distributed_executor_backend: mp # ray or mp disable_log_stats: False detokenize: False # Whether to detokenize the output. During RL we actually don't need to detokenize it. Can be set to True for debugging. - padding: null # will be tokenizer.pad_token_id if null. it is used to filter megatron's padding for vllm rollout + padding: null # will be tokenizer.pad_token_id if null. it is used to filter megatron's padding for rollout engine eos: null # will be tokenizer.eos_token_id if null. 
rollout_backend: sglang # [sglang, vllm] @@ -114,9 +114,9 @@ rollout: validate_weight: False # whether to send all weights at first for weight comparison. validate_save_dir: null # the directory to save the weights for comparison. If validate_weight is True, this will be used to save the weights for comparison. - print_outputs: False # whether to print the outputs (token ids, texts, etc.) of inference engine. + print_outputs: False # whether to print the outputs (token ids, texts, etc.) of rollout engine. - max_running_requests: 64 # the maximum number of running requests in the inference engine. + max_running_requests: 64 # the maximum number of running requests in the rollout engine. cuda_graph_max_bs: 128 # the maximum batch size for cuda graph. If the batch size is larger than this, cuda graph will not be used. data: diff --git a/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-collocated.yaml b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-mg-sgl.yaml similarity index 96% rename from tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-collocated.yaml rename to tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-mg-sgl.yaml index 79b5e1595..1854fd8c3 100644 --- a/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-collocated.yaml +++ b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-mg-sgl.yaml @@ -107,7 +107,6 @@ rollout: max_num_batched_tokens: null # the maximum number of tokens to be batched together in vllm. If set to null, vllm will use its default value. torch_profiler_dir: null # if not null, vllm will enable torch profiler and save the result to the specified directory. - return_logprobs: ${not:${algorithm.recompute_logprobs}} tensor_parallel_size: 1 @@ -117,13 +116,9 @@ rollout: validate_save_dir: null # the directory to save the weights for comparison. If validate_weight is True, this will be used to save the weights for comparison. print_outputs: False # whether to print the outputs (token ids, texts, etc.) of rollout engine. - sglang_decode_log_interval: 500000 # the interval for SGLang to log the decode time and other stats. max_running_requests: 64 # the maximum number of running requests in the rollout engine. cuda_graph_max_bs: 128 # the maximum batch size for cuda graph. If the batch size is larger than this, cuda graph will not be used. - use_torch_compile: False # enable torch_compile in SGLang for rollout. - torch_compile_max_bs: 128 # the maximum batch size for torch compile. If the batch size is larger than this, torch compile will not be used. 
- data: type: math dataset_name: boba diff --git a/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-mg-vllm-rollout-logprobs.yaml similarity index 100% rename from tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-collocated-rollout-logprobs.yaml rename to tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-mg-vllm-rollout-logprobs.yaml diff --git a/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-collocated.yaml b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-mg-vllm.yaml similarity index 100% rename from tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-collocated.yaml rename to tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-mg-vllm.yaml diff --git a/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-pipeline-mg-sgl-rollout-logprobs.yaml similarity index 100% rename from tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml rename to tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-pipeline-mg-sgl-rollout-logprobs.yaml diff --git a/tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-pipeline.yaml b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-pipeline-mg-sgl.yaml similarity index 100% rename from tests/e2e_tests/math/sglang/qwen2.5-1.5b-grpo-pipeline.yaml rename to tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-pipeline-mg-sgl.yaml diff --git a/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-pipeline-mg-vllm-rollout-logprobs.yaml similarity index 100% rename from tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-pipeline-rollout-logprobs.yaml rename to tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-pipeline-mg-vllm-rollout-logprobs.yaml diff --git a/tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-pipeline.yaml b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-pipeline-mg-vllm.yaml similarity index 100% rename from tests/e2e_tests/math/vllm/qwen2.5-1.5b-grpo-pipeline.yaml rename to tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-pipeline-mg-vllm.yaml diff --git a/tests/e2e_tests/vqa/sglang/qwen2.5-vl-3b-grpo-collocated.yaml b/tests/e2e_tests/reasoning/qwen2.5-vl-3b-grpo-collocated-fsdp-sgl.yaml similarity index 100% rename from tests/e2e_tests/vqa/sglang/qwen2.5-vl-3b-grpo-collocated.yaml rename to tests/e2e_tests/reasoning/qwen2.5-vl-3b-grpo-collocated-fsdp-sgl.yaml diff --git a/tests/e2e_tests/vqa/vllm/qwen2.5-vl-3b-grpo-collocated.yaml b/tests/e2e_tests/reasoning/qwen2.5-vl-3b-grpo-collocated-fsdp-vllm.yaml similarity index 100% rename from tests/e2e_tests/vqa/vllm/qwen2.5-vl-3b-grpo-collocated.yaml rename to tests/e2e_tests/reasoning/qwen2.5-vl-3b-grpo-collocated-fsdp-vllm.yaml diff --git a/tests/e2e_tests/vqa/sglang/run_collocated.sh b/tests/e2e_tests/reasoning/run_collocated.sh similarity index 63% rename from tests/e2e_tests/vqa/sglang/run_collocated.sh rename to tests/e2e_tests/reasoning/run_collocated.sh index 793c92417..92e43866f 100644 --- a/tests/e2e_tests/vqa/sglang/run_collocated.sh +++ b/tests/e2e_tests/reasoning/run_collocated.sh @@ -2,16 +2,15 @@ set -x tabs 4 -export VLLM_ATTENTION_BACKEND=XFORMERS export CUDA_DEVICE_MAX_CONNECTIONS=1 export TOKENIZERS_PARALLELISM=false export PYTHONPATH=${REPO_PATH}:$PYTHONPATH if [ -z "$1" ]; then - CONFIG_NAME="qwen2.5-vl-3b-grpo-collocated" + echo "Please provide a config name as the first argument." 
+ exit 1 else CONFIG_NAME=$1 fi - -python ${REPO_PATH}/examples/reasoning/main_grpo.py --config-path $REPO_PATH/tests/e2e_tests/vqa/sglang --config-name $CONFIG_NAME \ No newline at end of file +python ${REPO_PATH}/examples/reasoning/main_grpo.py --config-path $REPO_PATH/tests/e2e_tests/reasoning/ --config-name $CONFIG_NAME \ No newline at end of file diff --git a/tests/e2e_tests/math/sglang/run_pipeline.sh b/tests/e2e_tests/reasoning/run_pipeline.sh similarity index 63% rename from tests/e2e_tests/math/sglang/run_pipeline.sh rename to tests/e2e_tests/reasoning/run_pipeline.sh index f18012bb4..3ca1574f0 100644 --- a/tests/e2e_tests/math/sglang/run_pipeline.sh +++ b/tests/e2e_tests/reasoning/run_pipeline.sh @@ -2,16 +2,16 @@ set -x tabs 4 -export VLLM_ATTENTION_BACKEND=XFORMERS export CUDA_DEVICE_MAX_CONNECTIONS=1 export TOKENIZERS_PARALLELISM=false export PYTHONPATH=${REPO_PATH}:$PYTHONPATH if [ -z "$1" ]; then - CONFIG_NAME="qwen2.5-1.5b-grpo-pipeline" + echo "Please provide a config name as the first argument." + exit 1 else CONFIG_NAME=$1 fi -python ${REPO_PATH}/examples/reasoning/main_grpo.py --config-path $REPO_PATH/tests/e2e_tests/math/sglang --config-name $CONFIG_NAME \ No newline at end of file +python ${REPO_PATH}/examples/reasoning/main_grpo.py --config-path $REPO_PATH/tests/e2e_tests/reasoning --config-name $CONFIG_NAME \ No newline at end of file diff --git a/tests/e2e_tests/vqa/vllm/run_collocated.sh b/tests/e2e_tests/vqa/vllm/run_collocated.sh deleted file mode 100644 index ab406c4ea..000000000 --- a/tests/e2e_tests/vqa/vllm/run_collocated.sh +++ /dev/null @@ -1,17 +0,0 @@ -#! /bin/bash -set -x - -tabs 4 -export VLLM_ATTENTION_BACKEND=XFORMERS -export CUDA_DEVICE_MAX_CONNECTIONS=1 -export TOKENIZERS_PARALLELISM=false - -export PYTHONPATH=${REPO_PATH}:$PYTHONPATH - -if [ -z "$1" ]; then - CONFIG_NAME="qwen2.5-vl-3b-grpo-collocated" -else - CONFIG_NAME=$1 -fi - -python ${REPO_PATH}/examples/reasoning/main_grpo.py --config-path $REPO_PATH/tests/e2e_tests/vqa/vllm --config-name $CONFIG_NAME \ No newline at end of file From 01f95ff36ea5d4d72d880ba25680331fd06fcfb6 Mon Sep 17 00:00:00 2001 From: Bo Dai Date: Fri, 10 Oct 2025 15:15:26 +0000 Subject: [PATCH 49/57] feat(fsdp): fix ci, add fsdp optimizations like overlap and gradient accumulation Signed-off-by: Bo Dai --- .../config/libero_10_grpo_openvlaoft.yaml | 6 ++++++ .../libero_10_grpo_openvlaoft_eval.yaml | 6 ++++++ .../config/libero_10_ppo_openvlaoft.yaml | 6 ++++++ .../config/libero_goal_grpo_openvlaoft.yaml | 6 ++++++ .../config/libero_object_grpo_openvlaoft.yaml | 6 ++++++ .../libero_spatial_grpo_openvlaoft.yaml | 6 ++++++ .../config/maniskill_grpo_openvla.yaml | 6 ++++++ .../config/maniskill_grpo_openvlaoft.yaml | 6 ++++++ .../config/maniskill_ppo_openvla.yaml | 6 ++++++ .../maniskill_ppo_openvla_quickstart.yaml | 6 ++++++ .../config/maniskill_ppo_openvlaoft.yaml | 6 ++++++ .../config/robotwin_ppo_openvlaoft.yaml | 6 ++++++ .../config/math/qwen2.5-1.5b-grpo-fsdp.yaml | 8 ++++++- .../config/vqa/qwen2.5-vl-3b-grpo-fsdp.yaml | 18 ++++++++++------ rlinf/config.py | 21 +++++++++++++++++++ .../hybrid_engines/fsdp/fsdp_model_manager.py | 15 ++++++++++++- rlinf/workers/actor/fsdp_actor_worker.py | 11 ++++++++-- .../embodied/maniskill_ppo_openvla.yaml | 6 ++++++ ...-collocated-fsdp-sgl-rollout-logprobs.yaml | 6 ++++++ ...qwen2.5-1.5b-grpo-collocated-fsdp-sgl.yaml | 6 ++++++ ...collocated-fsdp-vllm-rollout-logprobs.yaml | 6 ++++++ ...wen2.5-1.5b-grpo-collocated-fsdp-vllm.yaml | 6 ++++++ ...wen2.5-vl-3b-grpo-collocated-fsdp-sgl.yaml 
| 6 ++++++ ...en2.5-vl-3b-grpo-collocated-fsdp-vllm.yaml | 6 ++++++ 24 files changed, 177 insertions(+), 10 deletions(-) diff --git a/examples/embodiment/config/libero_10_grpo_openvlaoft.yaml b/examples/embodiment/config/libero_10_grpo_openvlaoft.yaml index 1d9720fdd..525b1c951 100644 --- a/examples/embodiment/config/libero_10_grpo_openvlaoft.yaml +++ b/examples/embodiment/config/libero_10_grpo_openvlaoft.yaml @@ -157,6 +157,12 @@ actor: trust_remote_code: True padding_side: "right" + fsdp: + forward_prefetch: False + limit_all_gathers: False + backward_prefetch: False + use_orig_params: False + reward: use_reward_model: False diff --git a/examples/embodiment/config/libero_10_grpo_openvlaoft_eval.yaml b/examples/embodiment/config/libero_10_grpo_openvlaoft_eval.yaml index 628272ed1..69708e7c0 100644 --- a/examples/embodiment/config/libero_10_grpo_openvlaoft_eval.yaml +++ b/examples/embodiment/config/libero_10_grpo_openvlaoft_eval.yaml @@ -158,6 +158,12 @@ actor: trust_remote_code: True padding_side: "right" + fsdp: + forward_prefetch: False + limit_all_gathers: False + backward_prefetch: False + use_orig_params: False + reward: use_reward_model: False diff --git a/examples/embodiment/config/libero_10_ppo_openvlaoft.yaml b/examples/embodiment/config/libero_10_ppo_openvlaoft.yaml index bf7e31667..15b33bbbc 100644 --- a/examples/embodiment/config/libero_10_ppo_openvlaoft.yaml +++ b/examples/embodiment/config/libero_10_ppo_openvlaoft.yaml @@ -152,6 +152,12 @@ actor: trust_remote_code: True padding_side: "right" + fsdp: + forward_prefetch: False + limit_all_gathers: False + backward_prefetch: False + use_orig_params: False + reward: use_reward_model: False diff --git a/examples/embodiment/config/libero_goal_grpo_openvlaoft.yaml b/examples/embodiment/config/libero_goal_grpo_openvlaoft.yaml index d6356c314..699d7ab40 100644 --- a/examples/embodiment/config/libero_goal_grpo_openvlaoft.yaml +++ b/examples/embodiment/config/libero_goal_grpo_openvlaoft.yaml @@ -156,6 +156,12 @@ actor: trust_remote_code: True padding_side: "right" + fsdp: + forward_prefetch: False + limit_all_gathers: False + backward_prefetch: False + use_orig_params: False + reward: use_reward_model: False diff --git a/examples/embodiment/config/libero_object_grpo_openvlaoft.yaml b/examples/embodiment/config/libero_object_grpo_openvlaoft.yaml index b767fd25a..d2bacd05e 100644 --- a/examples/embodiment/config/libero_object_grpo_openvlaoft.yaml +++ b/examples/embodiment/config/libero_object_grpo_openvlaoft.yaml @@ -156,6 +156,12 @@ actor: trust_remote_code: True padding_side: "right" + fsdp: + forward_prefetch: False + limit_all_gathers: False + backward_prefetch: False + use_orig_params: False + reward: use_reward_model: False diff --git a/examples/embodiment/config/libero_spatial_grpo_openvlaoft.yaml b/examples/embodiment/config/libero_spatial_grpo_openvlaoft.yaml index 69aec20eb..9469166d9 100644 --- a/examples/embodiment/config/libero_spatial_grpo_openvlaoft.yaml +++ b/examples/embodiment/config/libero_spatial_grpo_openvlaoft.yaml @@ -156,6 +156,12 @@ actor: trust_remote_code: True padding_side: "right" + fsdp: + forward_prefetch: False + limit_all_gathers: False + backward_prefetch: False + use_orig_params: False + reward: use_reward_model: False diff --git a/examples/embodiment/config/maniskill_grpo_openvla.yaml b/examples/embodiment/config/maniskill_grpo_openvla.yaml index 16dc2af06..3679d533d 100644 --- a/examples/embodiment/config/maniskill_grpo_openvla.yaml +++ b/examples/embodiment/config/maniskill_grpo_openvla.yaml @@ 
-158,6 +158,12 @@ actor: adam_eps: 1.0e-05 clip_grad: 1.0 + fsdp: + forward_prefetch: False + limit_all_gathers: False + backward_prefetch: False + use_orig_params: False + reward: use_reward_model: False diff --git a/examples/embodiment/config/maniskill_grpo_openvlaoft.yaml b/examples/embodiment/config/maniskill_grpo_openvlaoft.yaml index def45aafb..7bd32855b 100644 --- a/examples/embodiment/config/maniskill_grpo_openvlaoft.yaml +++ b/examples/embodiment/config/maniskill_grpo_openvlaoft.yaml @@ -156,6 +156,12 @@ actor: adam_eps: 1.0e-05 clip_grad: 10.0 + fsdp: + forward_prefetch: False + limit_all_gathers: False + backward_prefetch: False + use_orig_params: False + reward: use_reward_model: False diff --git a/examples/embodiment/config/maniskill_ppo_openvla.yaml b/examples/embodiment/config/maniskill_ppo_openvla.yaml index 6aeae632d..bf9cab8eb 100644 --- a/examples/embodiment/config/maniskill_ppo_openvla.yaml +++ b/examples/embodiment/config/maniskill_ppo_openvla.yaml @@ -154,6 +154,12 @@ actor: trust_remote_code: True padding_side: "right" + fsdp: + forward_prefetch: False + limit_all_gathers: False + backward_prefetch: False + use_orig_params: False + reward: use_reward_model: False diff --git a/examples/embodiment/config/maniskill_ppo_openvla_quickstart.yaml b/examples/embodiment/config/maniskill_ppo_openvla_quickstart.yaml index 969dc85cb..b81d1390f 100644 --- a/examples/embodiment/config/maniskill_ppo_openvla_quickstart.yaml +++ b/examples/embodiment/config/maniskill_ppo_openvla_quickstart.yaml @@ -170,6 +170,12 @@ actor: adam_eps: 1.0e-05 clip_grad: 1.0 + fsdp: + forward_prefetch: False + limit_all_gathers: False + backward_prefetch: False + use_orig_params: False + reward: use_reward_model: False diff --git a/examples/embodiment/config/maniskill_ppo_openvlaoft.yaml b/examples/embodiment/config/maniskill_ppo_openvlaoft.yaml index 1e5893f43..f957f1997 100644 --- a/examples/embodiment/config/maniskill_ppo_openvlaoft.yaml +++ b/examples/embodiment/config/maniskill_ppo_openvlaoft.yaml @@ -159,6 +159,12 @@ actor: trust_remote_code: True padding_side: "right" + fsdp: + forward_prefetch: False + limit_all_gathers: False + backward_prefetch: False + use_orig_params: False + reward: use_reward_model: False diff --git a/examples/embodiment/config/robotwin_ppo_openvlaoft.yaml b/examples/embodiment/config/robotwin_ppo_openvlaoft.yaml index 30700e1c2..56d936659 100644 --- a/examples/embodiment/config/robotwin_ppo_openvlaoft.yaml +++ b/examples/embodiment/config/robotwin_ppo_openvlaoft.yaml @@ -158,6 +158,12 @@ actor: trust_remote_code: True padding_side: "right" + fsdp: + forward_prefetch: False + limit_all_gathers: False + backward_prefetch: False + use_orig_params: False + reward: use_reward_model: False diff --git a/examples/reasoning/config/math/qwen2.5-1.5b-grpo-fsdp.yaml b/examples/reasoning/config/math/qwen2.5-1.5b-grpo-fsdp.yaml index e1e97c215..6a7bef298 100644 --- a/examples/reasoning/config/math/qwen2.5-1.5b-grpo-fsdp.yaml +++ b/examples/reasoning/config/math/qwen2.5-1.5b-grpo-fsdp.yaml @@ -25,7 +25,7 @@ runner: val_check_interval: 1 save_interval: 50 - seq_length: 2048 + seq_length: 28672 enable_dynamic_batch_size: False max_tokens_per_mbs: 28672 @@ -199,6 +199,12 @@ actor: trust_remote_code: True padding_side: 'right' + fsdp: + forward_prefetch: True + limit_all_gathers: True + backward_prefetch: True + use_orig_params: True + reward: group_name: "RewardGroup" use_reward_model: false diff --git a/examples/reasoning/config/vqa/qwen2.5-vl-3b-grpo-fsdp.yaml 
b/examples/reasoning/config/vqa/qwen2.5-vl-3b-grpo-fsdp.yaml index 6643e74bd..8d32a6a33 100644 --- a/examples/reasoning/config/vqa/qwen2.5-vl-3b-grpo-fsdp.yaml +++ b/examples/reasoning/config/vqa/qwen2.5-vl-3b-grpo-fsdp.yaml @@ -84,7 +84,7 @@ rollout: gpu_memory_utilization: 0.55 - model_dir: /mnt/public/hf_models/Qwen2.5-VL-3B-Instruct + model_dir: /path/to/model/Qwen2.5-VL-3B-Instruct model_arch: qwen2.5_vl #qwen2.5 enforce_eager: False # if False, rollout engine will capture cuda graph, which will take more time to initialize. distributed_executor_backend: mp # ray or mp @@ -137,8 +137,8 @@ data: shuffle: True validation_shuffle: True seed: 1234 - train_data_paths: ["/mnt/public/guozhen/data/robo2vlm/train/"] - val_data_paths: ["/mnt/public/guozhen/data/robo2vlm/test/"] + train_data_paths: ["/dataset/robo2vlm-1/data/train/"] + val_data_paths: ["/dataset/robo2vlm-1/data/val/"] actor: group_name: "ActorGroup" @@ -165,7 +165,7 @@ actor: seq_length: ${runner.seq_length} encoder_seq_length: ${runner.seq_length} - model_path: /mnt/public/hf_models/Qwen2.5-VL-3B-Instruct/ + model_path: /path/to/model/Qwen2.5-VL-3B-Instruct/ model_arch: ${rollout.model_arch} @@ -197,11 +197,17 @@ actor: lr_decay_iters: 10 tokenizer: - tokenizer_model: /mnt/public/hf_models/Qwen2.5-VL-3B-Instruct + tokenizer_model: /path/to/model/Qwen2.5-VL-3B-Instruct use_fast: False trust_remote_code: True padding_side: 'right' + fsdp: + forward_prefetch: True + limit_all_gathers: True + backward_prefetch: True + use_orig_params: True + reward: group_name: "RewardGroup" use_reward_model: false @@ -213,7 +219,7 @@ reward: answer_format: 0.0 tokenizer: - tokenizer_model: /mnt/public/hf_models/Qwen2.5-VL-3B-Instruct + tokenizer_model: /path/to/model/Qwen2.5-VL-3B-Instruct use_fast: False trust_remote_code: True padding_side: 'right' diff --git a/rlinf/config.py b/rlinf/config.py index 9c31cd49f..16f69f2e8 100644 --- a/rlinf/config.py +++ b/rlinf/config.py @@ -37,6 +37,7 @@ SUPPORTED_MODEL_ARCHS = ["qwen2.5", "qwen2.5_vl", "openvla", "openvla_oft"] SUPPORTED_ROLLOUT_BACKENDS = ["sglang", "vllm"] SUPPORTED_TASK_TYPE = ["embodied", "reasoning", "coding_online_rl"] +SUPPORTED_TRAINING_BACKENDS = ["megatron", "fsdp"] __all__ = ["build_config"] @@ -222,6 +223,16 @@ def validate_model_cfg_by_hf_config(cfg, hf_model_path): return cfg +def validate_fsdp_cfg(cfg: DictConfig) -> DictConfig: + OmegaConf.set_struct(cfg, True) + with open_dict(cfg): + cfg.fsdp.forward_prefetch = cfg.fsdp.get("forward_prefetch", False) + cfg.fsdp.limit_all_gathers = cfg.fsdp.get("limit_all_gathers", False) + cfg.fsdp.backward_prefetch = cfg.fsdp.get("backward_prefetch", False) + cfg.fsdp.use_orig_params = cfg.fsdp.get("use_orig_params", False) + return cfg + + def validate_megatron_cfg(cfg: DictConfig) -> DictConfig: OmegaConf.set_struct(cfg, True) @@ -624,13 +635,23 @@ def validate_cfg(cfg: DictConfig) -> DictConfig: ): assert cfg.algorithm.group_size > 1 + assert cfg.actor.training_backend in SUPPORTED_TRAINING_BACKENDS, ( + f"Unsupported training_backend {cfg.actor.training_backend}. Supported training backends are {SUPPORTED_TRAINING_BACKENDS}." 
+    )
 
     if cfg.actor.training_backend == "megatron":
         cfg.actor = validate_megatron_cfg(cfg.actor)
         cfg.actor = validate_model_cfg_by_hf_config(cfg.actor, cfg.rollout.model_dir)
+    elif cfg.actor.training_backend == "fsdp":
+        cfg.actor = validate_fsdp_cfg(cfg.actor)
+        cfg.actor = validate_model_cfg_by_hf_config(cfg.actor, cfg.rollout.model_dir)
 
     if cfg.critic.use_critic_model and cfg.critic.training_backend == "megatron":
         cfg.critic = validate_megatron_cfg(cfg.critic)
         cfg = validate_model_cfg_by_hf_config(cfg.critic, cfg.rollout.model_dir)
+    elif cfg.critic.use_critic_model and cfg.critic.training_backend == "fsdp":
+        cfg.critic = validate_fsdp_cfg(cfg.critic)
+        cfg.critic = validate_model_cfg_by_hf_config(cfg.critic, cfg.rollout.model_dir)
 
     return cfg
 
diff --git a/rlinf/hybrid_engines/fsdp/fsdp_model_manager.py b/rlinf/hybrid_engines/fsdp/fsdp_model_manager.py
index c3bd9475a..9679514c0 100644
--- a/rlinf/hybrid_engines/fsdp/fsdp_model_manager.py
+++ b/rlinf/hybrid_engines/fsdp/fsdp_model_manager.py
@@ -17,8 +17,13 @@
 import torch
 import torch.optim as optim
 from omegaconf import DictConfig
+from torch.distributed.fsdp import (
+    BackwardPrefetch,
+    MixedPrecision,
+    ShardingStrategy,
+    StateDictType,
+)
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
-from torch.distributed.fsdp import MixedPrecision, ShardingStrategy, StateDictType
 from transformers import AutoConfig, AutoModelForCausalLM, AutoModelForVision2Seq
 
 from rlinf.config import torch_dtype_from_precision
@@ -123,6 +128,14 @@ def setup_model_and_optimizer(self):
         sharding_strategy=sharding_strategy, # zero3
         mixed_precision=mixed_precision,
         sync_module_states=True,
+        forward_prefetch=self._cfg.fsdp.forward_prefetch,
+        backward_prefetch=(
+            BackwardPrefetch.BACKWARD_PRE
+            if self._cfg.fsdp.backward_prefetch
+            else None  # the BackwardPrefetch enum has no NONE member; None disables prefetching
+        ),
+        limit_all_gathers=self._cfg.fsdp.limit_all_gathers,
+        use_orig_params=self._cfg.fsdp.use_orig_params,
     )
 
     # NOTE: Currently we assume that only the value head contains "value_head" in its name. 
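The four new fsdp.* toggles map one-to-one onto arguments of
torch.distributed.fsdp.FullyShardedDataParallel. Below is a minimal,
self-contained sketch of the wrapping they control; TinyPolicy and the
fsdp_cfg dict are illustrative stand-ins rather than RLinf code, and the real
manager additionally passes an auto-wrap policy, device placement, and
sync_module_states=True, all under an initialized process group.

    import torch
    import torch.nn as nn
    from torch.distributed.fsdp import BackwardPrefetch, MixedPrecision, ShardingStrategy
    from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

    class TinyPolicy(nn.Module):
        """Stand-in for the actor model; any nn.Module works here."""

        def __init__(self) -> None:
            super().__init__()
            self.proj = nn.Linear(16, 16)

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            return self.proj(x)

    def wrap_with_fsdp(model: nn.Module, fsdp_cfg: dict) -> FSDP:
        # Each config toggle feeds exactly one FSDP constructor argument.
        return FSDP(
            model,
            sharding_strategy=ShardingStrategy.FULL_SHARD,  # the "full_shard" setting
            mixed_precision=MixedPrecision(param_dtype=torch.bfloat16),
            forward_prefetch=fsdp_cfg["forward_prefetch"],
            # backward_prefetch is Optional[BackwardPrefetch]: BACKWARD_PRE overlaps
            # the next all-gather with gradient computation; None disables prefetching.
            backward_prefetch=(
                BackwardPrefetch.BACKWARD_PRE if fsdp_cfg["backward_prefetch"] else None
            ),
            limit_all_gathers=fsdp_cfg["limit_all_gathers"],  # throttle eager all-gathers
            use_orig_params=fsdp_cfg["use_orig_params"],  # keep original nn.Parameters visible
        )

BACKWARD_PRE buys communication/compute overlap at the cost of holding one
extra unsharded parameter group in memory, while limit_all_gathers throttles
eager all-gathers to bound that peak; both matter most in collocated runs
where the rollout engine has already reserved part of the device.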
diff --git a/rlinf/workers/actor/fsdp_actor_worker.py b/rlinf/workers/actor/fsdp_actor_worker.py index 8c467d553..3276cd744 100644 --- a/rlinf/workers/actor/fsdp_actor_worker.py +++ b/rlinf/workers/actor/fsdp_actor_worker.py @@ -14,6 +14,7 @@ import gc import os +from contextlib import nullcontext from typing import Dict, Tuple import numpy as np @@ -258,7 +259,12 @@ def run_training(self, input_channel: Channel): self.optimizer.zero_grad() metrics = {} - for _, m_batch in enumerate(train_micro_batches): + for idx, m_batch in enumerate(train_micro_batches): + backward_ctx = ( + self.model.no_sync() + if idx < self.gradient_accumulation - 1 + else nullcontext() + ) for k, v in m_batch.items(): m_batch[k] = v.cuda() if isinstance(v, torch.Tensor) else v @@ -352,7 +358,8 @@ def run_training(self, input_channel: Channel): # add to log # scale loss for gradient accumulation and backprop loss = loss / self.gradient_accumulation - loss.backward() + with backward_ctx: + loss.backward() mbs_metrics_data.update( { diff --git a/tests/e2e_tests/embodied/maniskill_ppo_openvla.yaml b/tests/e2e_tests/embodied/maniskill_ppo_openvla.yaml index 2063ab483..5d9258743 100644 --- a/tests/e2e_tests/embodied/maniskill_ppo_openvla.yaml +++ b/tests/e2e_tests/embodied/maniskill_ppo_openvla.yaml @@ -165,6 +165,12 @@ actor: adam_eps: 1.0e-05 clip_grad: 1.0 + fsdp: + forward_prefetch: True + limit_all_gathers: True + backward_prefetch: True + use_orig_params: True + reward: use_reward_model: False diff --git a/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-sgl-rollout-logprobs.yaml b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-sgl-rollout-logprobs.yaml index fd208e21b..658a730ca 100644 --- a/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-sgl-rollout-logprobs.yaml +++ b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-sgl-rollout-logprobs.yaml @@ -193,6 +193,12 @@ actor: trust_remote_code: True padding_side: 'right' + fsdp: + forward_prefetch: True + limit_all_gathers: True + backward_prefetch: True + use_orig_params: True + reward: group_name: "RewardGroup" use_reward_model: false diff --git a/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-sgl.yaml b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-sgl.yaml index bc7ab77e2..29ac00d07 100644 --- a/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-sgl.yaml +++ b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-sgl.yaml @@ -193,6 +193,12 @@ actor: trust_remote_code: True padding_side: 'right' + fsdp: + forward_prefetch: True + limit_all_gathers: True + backward_prefetch: True + use_orig_params: True + reward: group_name: "RewardGroup" use_reward_model: false diff --git a/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-vllm-rollout-logprobs.yaml b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-vllm-rollout-logprobs.yaml index f2b15f9cb..6b94ddf21 100644 --- a/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-vllm-rollout-logprobs.yaml +++ b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-vllm-rollout-logprobs.yaml @@ -193,6 +193,12 @@ actor: trust_remote_code: True padding_side: 'right' + fsdp: + forward_prefetch: True + limit_all_gathers: True + backward_prefetch: True + use_orig_params: True + reward: group_name: "RewardGroup" use_reward_model: false diff --git a/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-vllm.yaml b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-vllm.yaml index 19a76629f..ad06eabf3 
100644 --- a/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-vllm.yaml +++ b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-vllm.yaml @@ -193,6 +193,12 @@ actor: trust_remote_code: True padding_side: 'right' + fsdp: + forward_prefetch: True + limit_all_gathers: True + backward_prefetch: True + use_orig_params: True + reward: group_name: "RewardGroup" use_reward_model: false diff --git a/tests/e2e_tests/reasoning/qwen2.5-vl-3b-grpo-collocated-fsdp-sgl.yaml b/tests/e2e_tests/reasoning/qwen2.5-vl-3b-grpo-collocated-fsdp-sgl.yaml index bcbae2f94..45575d9e7 100644 --- a/tests/e2e_tests/reasoning/qwen2.5-vl-3b-grpo-collocated-fsdp-sgl.yaml +++ b/tests/e2e_tests/reasoning/qwen2.5-vl-3b-grpo-collocated-fsdp-sgl.yaml @@ -202,6 +202,12 @@ actor: trust_remote_code: True padding_side: 'right' + fsdp: + forward_prefetch: True + limit_all_gathers: True + backward_prefetch: True + use_orig_params: True + reward: group_name: "RewardGroup" use_reward_model: false diff --git a/tests/e2e_tests/reasoning/qwen2.5-vl-3b-grpo-collocated-fsdp-vllm.yaml b/tests/e2e_tests/reasoning/qwen2.5-vl-3b-grpo-collocated-fsdp-vllm.yaml index f555a7292..870731fbb 100644 --- a/tests/e2e_tests/reasoning/qwen2.5-vl-3b-grpo-collocated-fsdp-vllm.yaml +++ b/tests/e2e_tests/reasoning/qwen2.5-vl-3b-grpo-collocated-fsdp-vllm.yaml @@ -202,6 +202,12 @@ actor: trust_remote_code: True padding_side: 'right' + fsdp: + forward_prefetch: True + limit_all_gathers: True + backward_prefetch: True + use_orig_params: True + reward: group_name: "RewardGroup" use_reward_model: false From a8023c82d16c80f2fcc43acb5872117b79640fc9 Mon Sep 17 00:00:00 2001 From: Bo Dai Date: Sat, 11 Oct 2025 09:35:28 +0000 Subject: [PATCH 50/57] fix(ci): add fsdp's run_inference, fix ci Signed-off-by: Bo Dai --- .github/workflows/code-test.yml | 24 ++-- .../config/math/qwen2.5-1.5b-grpo-fsdp.yaml | 2 +- rlinf/config.py | 2 +- rlinf/workers/actor/fsdp_actor_worker.py | 103 ++++++++++++++++-- rlinf/workers/actor/megatron_actor_worker.py | 4 +- rlinf/workers/reward/reward_worker.py | 16 +-- ...-collocated-fsdp-sgl-rollout-logprobs.yaml | 2 +- ...qwen2.5-1.5b-grpo-collocated-fsdp-sgl.yaml | 2 +- ...collocated-fsdp-vllm-rollout-logprobs.yaml | 2 +- ...wen2.5-1.5b-grpo-collocated-fsdp-vllm.yaml | 2 +- ...wen2.5-vl-3b-grpo-collocated-fsdp-sgl.yaml | 2 +- ...en2.5-vl-3b-grpo-collocated-fsdp-vllm.yaml | 2 +- 12 files changed, 119 insertions(+), 44 deletions(-) diff --git a/.github/workflows/code-test.yml b/.github/workflows/code-test.yml index 6fd1c1409..075b7ac8d 100644 --- a/.github/workflows/code-test.yml +++ b/.github/workflows/code-test.yml @@ -117,42 +117,42 @@ jobs: timeout-minutes: 20 run: | export REPO_PATH=$(pwd) - source switch_env reasoning + source switch_env reason bash tests/e2e_tests/reasoning/run_collocated.sh qwen2.5-1.5b-grpo-collocated-mg-sgl - name: Megatron vLLM Collocated mode timeout-minutes: 20 run: | export REPO_PATH=$(pwd) - source switch_env reasoning + source switch_env reason bash tests/e2e_tests/reasoning/run_collocated.sh qwen2.5-1.5b-grpo-collocated-mg-vllm - name: Megatron SGLang Pipeline mode timeout-minutes: 20 run: | export REPO_PATH=$(pwd) - source switch_env reasoning + source switch_env reason bash tests/e2e_tests/reasoning/run_pipeline.sh qwen2.5-1.5b-grpo-pipeline-mg-sgl - name: Megatron vLLM Pipeline mode timeout-minutes: 20 run: | export REPO_PATH=$(pwd) - source switch_env reasoning + source switch_env reason bash tests/e2e_tests/reasoning/run_pipeline.sh qwen2.5-1.5b-grpo-pipeline-mg-vllm - name: FSDO SGLang 
Collocated mode timeout-minutes: 20 run: | export REPO_PATH=$(pwd) - source switch_env reasoning + source switch_env reason bash tests/e2e_tests/reasoning/run_collocated.sh qwen2.5-1.5b-grpo-collocated-fsdp-sgl - name: FSDP vLLM Collocated mode timeout-minutes: 20 run: | export REPO_PATH=$(pwd) - source switch_env reasoning + source switch_env reason bash tests/e2e_tests/reasoning/run_collocated.sh qwen2.5-1.5b-grpo-collocated-fsdp-vllm @@ -168,42 +168,42 @@ jobs: timeout-minutes: 20 run: | export REPO_PATH=$(pwd) - source switch_env reasoning + source switch_env reason bash tests/e2e_tests/reasoning/run_collocated.sh qwen2.5-1.5b-grpo-collocated-mg-sgl-rollout-logprobs - name: Megatron vLLM Collocated mode timeout-minutes: 20 run: | export REPO_PATH=$(pwd) - source switch_env reasoning + source switch_env reason bash tests/e2e_tests/reasoning/run_collocated.sh qwen2.5-1.5b-grpo-collocated-mg-vllm-rollout-logprobs - name: Megatron SGLang Pipeline mode timeout-minutes: 20 run: | export REPO_PATH=$(pwd) - source switch_env reasoning + source switch_env reason bash tests/e2e_tests/reasoning/run_pipeline.sh qwen2.5-1.5b-grpo-pipeline-mg-sgl-rollout-logprobs - name: Megatron vLLM Pipeline mode timeout-minutes: 20 run: | export REPO_PATH=$(pwd) - source switch_env reasoning + source switch_env reason bash tests/e2e_tests/reasoning/run_pipeline.sh qwen2.5-1.5b-grpo-pipeline-mg-vllm-rollout-logprobs - name: FSDP SGLang Collocated mode timeout-minutes: 20 run: | export REPO_PATH=$(pwd) - source switch_env reasoning + source switch_env reason bash tests/e2e_tests/reasoning/run_collocated.sh qwen2.5-1.5b-grpo-collocated-fsdp-sglang-rollout-logprobs - name: FSDP vLLM Collocated mode timeout-minutes: 20 run: | export REPO_PATH=$(pwd) - source switch_env reasoning + source switch_env reason bash tests/e2e_tests/reasoning/run_collocated.sh qwen2.5-1.5b-grpo-collocated-fsdp-vllm-rollout-logprobs coding-online-rl-qwen-ppo-test: diff --git a/examples/reasoning/config/math/qwen2.5-1.5b-grpo-fsdp.yaml b/examples/reasoning/config/math/qwen2.5-1.5b-grpo-fsdp.yaml index 6a7bef298..19281cbc9 100644 --- a/examples/reasoning/config/math/qwen2.5-1.5b-grpo-fsdp.yaml +++ b/examples/reasoning/config/math/qwen2.5-1.5b-grpo-fsdp.yaml @@ -123,7 +123,7 @@ data: dataset_name: boba max_prompt_length: 1024 filter_prompt_by_length: True - rollout_batch_size: 8 + rollout_batch_size: 512 val_rollout_batch_size: null num_workers: 2 shuffle: True diff --git a/rlinf/config.py b/rlinf/config.py index 16f69f2e8..ae7b85985 100644 --- a/rlinf/config.py +++ b/rlinf/config.py @@ -186,7 +186,7 @@ def validate_vllm_cfg(cfg): def validate_model_cfg_by_hf_config(cfg, hf_model_path): # validate by hf config - hf_config = AutoConfig.from_pretrained(hf_model_path) + hf_config = AutoConfig.from_pretrained(hf_model_path, trust_remote_code=True) if "Qwen2ForCausalLM" in hf_config.architectures: qkv_bias = True diff --git a/rlinf/workers/actor/fsdp_actor_worker.py b/rlinf/workers/actor/fsdp_actor_worker.py index 3276cd744..3b18c8e1e 100644 --- a/rlinf/workers/actor/fsdp_actor_worker.py +++ b/rlinf/workers/actor/fsdp_actor_worker.py @@ -56,7 +56,9 @@ from rlinf.utils.utils import ( compute_entropy_from_logits, compute_logprobs_from_logits, + cpu_weight_swap, masked_mean, + retrieve_model_state_dict_in_cpu, seq_mean_token_mean, seq_mean_token_sum, ) @@ -96,6 +98,8 @@ def __init__(self, cfg: DictConfig, placement: ModelParallelComponentPlacement): self._rollout_group_name = cfg.rollout.group_name self._component_placement = placement 
self.is_data_io_rank = True + self.is_pipeline = self._component_placement.is_disaggregated + self.ref_policy_state_dict = None if self.cfg.algorithm.loss_agg_func == "token-mean": self.loss_agg_func = masked_mean @@ -118,8 +122,13 @@ def __init__(self, cfg: DictConfig, placement: ModelParallelComponentPlacement): reward_cls = get_reward_class(self.cfg.reward.reward_type) self.reward = reward_cls(self.cfg.reward) - def init_worker(self): + def init_worker(self) -> None: self.setup_model_and_optimizer() + if self.cfg.algorithm.kl_beta > 0 and self.cfg.actor.get( + "combine_reference_model", True + ): + self.ref_policy_state_dict = retrieve_model_state_dict_in_cpu(self.model) + if self.cfg.actor.get("enable_offload", False): self.offload_fsdp_param_and_grad() self.offload_fsdp_optimizer() @@ -128,7 +137,7 @@ def init_worker(self): torch.cuda.empty_cache() self._setup_rollout_weight_dst_ranks() - def _setup_rollout_weight_dst_ranks(self): + def _setup_rollout_weight_dst_ranks(self) -> None: """Setup destination ranks for token and weight communication.""" rank_map = RankMapper.get_actor_rank_to_rollout_rank_map( self._component_placement @@ -138,11 +147,11 @@ def _setup_rollout_weight_dst_ranks(self): f"Actor rank {self._rank} will send weights to {self._weight_dst_rank_in_rollout}" ) - def del_reshard_state_dict(self): + def del_reshard_state_dict(self) -> None: if hasattr(self, "rollout_state_dict"): del self.rollout_state_dict - def sync_model_to_rollout(self): + def sync_model_to_rollout(self) -> None: if self.cfg.actor.get("enable_offload", False): self.offload_fsdp_optimizer() @@ -175,7 +184,7 @@ def sync_model_to_rollout(self): if self.cfg.actor.get("enable_offload", False): self.offload_fsdp_param_and_grad() - def compute_logprobs(self): + def compute_logprobs(self) -> None: self.model.eval() self.rollout_batch["logprob"] = self.rollout_batch["prev_logprobs"] @@ -191,7 +200,7 @@ def get_batch( ) return batch, result - def put_result(self, result: RolloutResult, channel: Channel): + def put_result(self, result: RolloutResult, channel: Channel) -> None: if channel.is_local: # Local channel, every process will put its own data locally # No need to broadcast @@ -200,7 +209,7 @@ def put_result(self, result: RolloutResult, channel: Channel): if self.is_data_io_rank: channel.put(result) - def _load_weight_and_optimizer(self, channel: Channel): + def _load_weight_and_optimizer(self, channel: Channel) -> None: # Acquire the GPUs to ensure that no one is using them before loading models # Otherwise, it may lead to OOM with channel.device_lock: @@ -208,7 +217,81 @@ def _load_weight_and_optimizer(self, channel: Channel): self.load_fsdp_param_and_grad(self.device) self.load_fsdp_optimizer(self.device) - def run_training(self, input_channel: Channel): + @torch.no_grad() + def inference_step(self, batch: Dict[str, torch.Tensor]) -> torch.Tensor: + self.model.eval() + input_ids = batch["input_ids"] + attention_mask = batch["attention_mask"] + position_ids = batch["position_ids"] + + multi_modal_inputs = {} + if "multi_modal_inputs" in batch.keys(): + for key in batch["multi_modal_inputs"][0].keys(): + multi_modal_inputs[key] = torch.cat( + [inputs[key] for inputs in batch["multi_modal_inputs"]], + dim=0, + ).cuda() + + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + use_cache=False, + **multi_modal_inputs, + ) + + logits = outputs.logits + logits = logits[:, -self.response_len - 1 : -1, :] + logits = logits / 
self.cfg.algorithm.sampling_params.temperature + + responses = input_ids[:, -self.response_len :] + logprobs = compute_logprobs_from_logits( + logits, responses, task_type=self.cfg.runner.task_type + ) + return logprobs + + def run_inference( + self, + input_channel: Channel, + output_channel: Channel, + rollout_channel: Channel, + compute_ref_logprobs: bool, + ) -> None: + """ + Compute prev/ref logprobs using the actor Model's forward. + + Args: + input_channel: The input channel to read from. + output_channel: The output channel to send results to. + rollout_channel: get the rollout channel's device lock in case of collision. + compute_ref_logprobs: Whether to compute reference logprobs. + """ + recv_batch_size = 0 + while recv_batch_size < self.total_batch_size_per_dp: + batch, rollout_result = self.get_batch(input_channel) + recv_batch_size += rollout_result.num_sequence + self._load_weight_and_optimizer( + input_channel if self.is_pipeline else rollout_channel + ) + + with self.worker_timer(): + prev_logprobs = self.inference_step(batch) + rollout_result.prev_logprobs = prev_logprobs.cpu() + + if compute_ref_logprobs: + assert self.ref_policy_state_dict is not None, ( + "Reference policy state dict is None but compute_ref_logprobs is True" + ) + with cpu_weight_swap(self.model, self.ref_policy_state_dict): + ref_logprobs = self.inference_step(batch) + rollout_result.ref_logprobs = ref_logprobs.cpu() + self.put_result(rollout_result, output_channel) + + assert recv_batch_size == self.total_batch_size_per_dp, ( + f"Expected {self.total_batch_size_per_dp} sequences from channel, but got {recv_batch_size}" + ) + + def run_training(self, input_channel: Channel) -> Tuple[Dict, list]: # Get all batches for this DP batches = [] recv_batch_size = 0 @@ -408,7 +491,7 @@ def run_training(self, input_channel: Channel): return rollout_metrics, training_metrics_list - def save_checkpoint(self, save_base_path, step): + def save_checkpoint(self, save_base_path: str, step: int) -> None: torch.distributed.barrier() model_state = self.get_model_state_dict() optim_state = self.get_optimizer_state_dict() @@ -421,7 +504,7 @@ def save_checkpoint(self, save_base_path, step): # Advantages and returns def compute_advantages_and_returns( self, input_channel: Channel, output_channel: Channel - ): + ) -> None: """Compute the advantages and returns. Args: diff --git a/rlinf/workers/actor/megatron_actor_worker.py b/rlinf/workers/actor/megatron_actor_worker.py index 6736e550b..c7129a321 100644 --- a/rlinf/workers/actor/megatron_actor_worker.py +++ b/rlinf/workers/actor/megatron_actor_worker.py @@ -844,11 +844,13 @@ def run_inference( rollout_channel: Channel, compute_ref_logprobs: bool, ): - """Compute prev/ref logprobs using the actor Model's forward. + """ + Compute prev/ref logprobs using the actor Model's forward. Args: input_channel: The input channel to read from. output_channel: The output channel to send results to. + rollout_channel: get the rollout channel's device lock in case of collision. compute_ref_logprobs: Whether to compute reference logprobs. 
""" recv_batch_size = 0 diff --git a/rlinf/workers/reward/reward_worker.py b/rlinf/workers/reward/reward_worker.py index e186eebac..eea493117 100644 --- a/rlinf/workers/reward/reward_worker.py +++ b/rlinf/workers/reward/reward_worker.py @@ -19,15 +19,13 @@ from rlinf.algorithms.rewards import get_reward_class from rlinf.data.io_struct import RolloutResult -from rlinf.hybrid_engines.fsdp.fsdp_model_manager import FSDPModelManager from rlinf.scheduler import Channel, Worker from rlinf.utils.placement import ModelParallelComponentPlacement -class RewardWorker(FSDPModelManager, Worker): +class RewardWorker(Worker): def __init__(self, cfg: DictConfig, placement: ModelParallelComponentPlacement): Worker.__init__(self) - super().__init__(cfg.reward) self.cfg = cfg self.component_placement = placement @@ -39,9 +37,7 @@ def __init__(self, cfg: DictConfig, placement: ModelParallelComponentPlacement): def init_worker(self): if self.cfg.reward.use_reward_model: - self.setup_model_and_optimizer() - self.offload_fsdp_param_and_grad() - self.offload_fsdp_optimizer() + raise NotImplementedError("Reward model is not implemented yet.") else: self.reward = get_reward_class(self.cfg.reward.reward_type)(self.cfg.reward) @@ -106,10 +102,4 @@ def _compute_rule_based_rewards(self, rollout_result: RolloutResult): ) def compute_batch_rewards_with_model(self, batch: Dict[str, torch.Tensor]): - self.model.eval() - with torch.no_grad(): - # TODO: fix this - rewards = ( - self.model(batch["input_ids"], batch["attention_mask"]).detach().cpu() - ) - return rewards + raise NotImplementedError("Reward model is not implemented yet.") diff --git a/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-sgl-rollout-logprobs.yaml b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-sgl-rollout-logprobs.yaml index 658a730ca..86c6f362c 100644 --- a/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-sgl-rollout-logprobs.yaml +++ b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-sgl-rollout-logprobs.yaml @@ -123,7 +123,7 @@ data: dataset_name: boba max_prompt_length: 256 filter_prompt_by_length: True - rollout_batch_size: 8 + rollout_batch_size: 16 val_rollout_batch_size: null num_workers: 2 prompt_key: prompt diff --git a/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-sgl.yaml b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-sgl.yaml index 29ac00d07..dbf7925f2 100644 --- a/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-sgl.yaml +++ b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-sgl.yaml @@ -123,7 +123,7 @@ data: dataset_name: boba max_prompt_length: 256 filter_prompt_by_length: True - rollout_batch_size: 8 + rollout_batch_size: 16 val_rollout_batch_size: null num_workers: 2 prompt_key: prompt diff --git a/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-vllm-rollout-logprobs.yaml b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-vllm-rollout-logprobs.yaml index 6b94ddf21..e85fd4146 100644 --- a/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-vllm-rollout-logprobs.yaml +++ b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-vllm-rollout-logprobs.yaml @@ -123,7 +123,7 @@ data: dataset_name: boba max_prompt_length: 256 filter_prompt_by_length: True - rollout_batch_size: 8 + rollout_batch_size: 16 val_rollout_batch_size: null num_workers: 2 prompt_key: prompt diff --git a/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-vllm.yaml 
b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-vllm.yaml index ad06eabf3..1aab4cb7b 100644 --- a/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-vllm.yaml +++ b/tests/e2e_tests/reasoning/qwen2.5-1.5b-grpo-collocated-fsdp-vllm.yaml @@ -123,7 +123,7 @@ data: dataset_name: boba max_prompt_length: 256 filter_prompt_by_length: True - rollout_batch_size: 8 + rollout_batch_size: 16 val_rollout_batch_size: null num_workers: 2 prompt_key: prompt diff --git a/tests/e2e_tests/reasoning/qwen2.5-vl-3b-grpo-collocated-fsdp-sgl.yaml b/tests/e2e_tests/reasoning/qwen2.5-vl-3b-grpo-collocated-fsdp-sgl.yaml index 45575d9e7..46278b949 100644 --- a/tests/e2e_tests/reasoning/qwen2.5-vl-3b-grpo-collocated-fsdp-sgl.yaml +++ b/tests/e2e_tests/reasoning/qwen2.5-vl-3b-grpo-collocated-fsdp-sgl.yaml @@ -124,7 +124,7 @@ data: dataset_name: robo2vlm max_prompt_length: 1024 filter_prompt_by_length: True - rollout_batch_size: 8 + rollout_batch_size: 16 val_rollout_batch_size: null num_workers: 2 prompt_key: prompt diff --git a/tests/e2e_tests/reasoning/qwen2.5-vl-3b-grpo-collocated-fsdp-vllm.yaml b/tests/e2e_tests/reasoning/qwen2.5-vl-3b-grpo-collocated-fsdp-vllm.yaml index 870731fbb..fb7c8aa02 100644 --- a/tests/e2e_tests/reasoning/qwen2.5-vl-3b-grpo-collocated-fsdp-vllm.yaml +++ b/tests/e2e_tests/reasoning/qwen2.5-vl-3b-grpo-collocated-fsdp-vllm.yaml @@ -124,7 +124,7 @@ data: dataset_name: robo2vlm max_prompt_length: 1024 filter_prompt_by_length: True - rollout_batch_size: 8 + rollout_batch_size: 16 val_rollout_batch_size: null num_workers: 2 prompt_key: prompt From 803c4c68be2b43b87611209f824464b32ae6791b Mon Sep 17 00:00:00 2001 From: Bo Dai Date: Sun, 12 Oct 2025 12:20:23 +0000 Subject: [PATCH 51/57] fix(ci): fix some errors Signed-off-by: Bo Dai --- .github/workflows/code-test.yml | 28 +++- .github/workflows/vqa_e2e.yml | 63 -------- rlinf/config.py | 4 +- .../hybrid_engines/fsdp/fsdp_model_manager.py | 38 ++--- .../hybrid_engines/vllm/vllm_0_8_5/worker.py | 13 ++ rlinf/runners/coding_online_rl_runner.py | 1 + rlinf/workers/actor/fsdp_actor_worker.py | 8 -- rlinf/workers/actor/megatron_actor_worker.py | 6 +- rlinf/workers/reward/reward_worker.py | 3 +- .../coding_online_rl/qwen2.5-1.5b-ppo.yaml | 1 + .../embodied/libero_130_grpo_openvlaoft.yaml | 6 + tests/unit_tests/test_io_struct.py | 135 ------------------ 12 files changed, 75 insertions(+), 231 deletions(-) delete mode 100644 .github/workflows/vqa_e2e.yml delete mode 100644 tests/unit_tests/test_io_struct.py diff --git a/.github/workflows/code-test.yml b/.github/workflows/code-test.yml index 075b7ac8d..830ee82e7 100644 --- a/.github/workflows/code-test.yml +++ b/.github/workflows/code-test.yml @@ -141,7 +141,7 @@ jobs: source switch_env reason bash tests/e2e_tests/reasoning/run_pipeline.sh qwen2.5-1.5b-grpo-pipeline-mg-vllm - - name: FSDO SGLang Collocated mode + - name: FSDP SGLang Collocated mode timeout-minutes: 20 run: | export REPO_PATH=$(pwd) @@ -197,7 +197,7 @@ jobs: run: | export REPO_PATH=$(pwd) source switch_env reason - bash tests/e2e_tests/reasoning/run_collocated.sh qwen2.5-1.5b-grpo-collocated-fsdp-sglang-rollout-logprobs + bash tests/e2e_tests/reasoning/run_collocated.sh qwen2.5-1.5b-grpo-collocated-fsdp-sgl-rollout-logprobs - name: FSDP vLLM Collocated mode timeout-minutes: 20 @@ -225,6 +225,28 @@ jobs: source switch_env reason bash tests/e2e_tests/coding_online_rl/run_coding_online_rl.sh + qwen-vl-grpo-test: + needs: [check-changes] + if: needs.check-changes.outputs.file_filter == 'true' + runs-on: reason + steps: + - 
name: Checkout code + uses: actions/checkout@v5 + + - name: FSDP SGLang Collocated mode + timeout-minutes: 20 + run: | + export REPO_PATH=$(pwd) + source switch_env reason + bash tests/e2e_tests/reasoning/run_collocated.sh qwen2.5-vl-3b-grpo-collocated-fsdp-sgl + + - name: FSDP vLLM Collocated mode + timeout-minutes: 20 + run: | + export REPO_PATH=$(pwd) + source switch_env reason + bash tests/e2e_tests/reasoning/run_collocated.sh qwen2.5-vl-3b-grpo-collocated-fsdp-vllm + # =============================================== embodied e2e tests ==================================================== embodied-maniskill-ppo-openvla-test: @@ -312,7 +334,7 @@ jobs: # Reason e2e tests reason-qwen-grpo-test, reason-qwen-grpo-test-rollout-logprobs, - coding-online-rl-qwen-ppo-test, + coding-online-rl-qwen-ppo-test, qwen-vl-grpo-test, # Embodied e2e tests embodied-maniskill-ppo-openvla-test, embodied-maniskill-grpo-openvlaoft-test, embodied-libero-goal-grpo-openvlaoft-test,embodied-libero-130-grpo-openvlaoft-test, diff --git a/.github/workflows/vqa_e2e.yml b/.github/workflows/vqa_e2e.yml deleted file mode 100644 index 50b6217aa..000000000 --- a/.github/workflows/vqa_e2e.yml +++ /dev/null @@ -1,63 +0,0 @@ -name: VQA End2End - -on: - push: - branches: - - 'release/v[0-9].[0-9]' - - main - paths: - - '**/*.py' - - 'tests/**' - - '.github/workflows/*.yml' - - '!docs/**' - - '!README.md' - - '!*.yaml' - - '!*.toml' - - '!ray_utils/**' - - '!requirements/**' - - pull_request: - branches: - - 'release/v[0-9].[0-9]' - - main - paths: - - '**/*.py' - - 'tests/**' - - '.github/workflows/*.yml' - - '!docs/**' - - '!README.md' - - '*.yaml' - - '*.toml' - - '!ray_utils/**' - - '!requirements/**' - -permissions: - contents: read - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - qwen-vl-grpo-test: - runs-on: rlinf - container: - image: rlinf/rlinf:math-rlinf0.1-torch2.6.0-sglang0.4.6.post5-vllm0.8.5-megatron0.13.0-te2.1 - volumes: - - /mnt/public/dataset:/workspace/dataset - - /mnt/public/tokenizer:/workspace/tokenizer - options: --gpus="all" --shm-size=80g - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: SGLang Collocated mode - run: | - export REPO_PATH=$(pwd) - bash tests/e2e_tests/reasoning/run_collocated.sh qwen2.5-vl-3b-grpo-collocated-fsdp-sgl - - - name: vLLM Collocated mode - run: | - export REPO_PATH=$(pwd) - bash tests/e2e_tests/reasoning/run_collocated.sh qwen2.5-vl-3b-grpo-collocated-fsdp-vllm diff --git a/rlinf/config.py b/rlinf/config.py index ae7b85985..05de15254 100644 --- a/rlinf/config.py +++ b/rlinf/config.py @@ -644,14 +644,12 @@ def validate_cfg(cfg: DictConfig) -> DictConfig: cfg.actor = validate_model_cfg_by_hf_config(cfg.actor, cfg.rollout.model_dir) elif cfg.actor.training_backend == "fsdp": cfg.actor = validate_fsdp_cfg(cfg.actor) - cfg.actor = validate_model_cfg_by_hf_config(cfg.actor, cfg.rollout.model_dir) if cfg.critic.use_critic_model and cfg.critic.training_backend == "megatron": cfg.critic = validate_megatron_cfg(cfg.critic) - cfg = validate_model_cfg_by_hf_config(cfg.critic, cfg.rollout.model_dir) + cfg.critic = validate_model_cfg_by_hf_config(cfg.critic, cfg.rollout.model_dir) elif cfg.critic.use_critic_model and cfg.critic.training_backend == "fsdp": cfg.critic = validate_fsdp_cfg(cfg.critic) - cfg.critic = validate_model_cfg_by_hf_config(cfg.critic, cfg.rollout.model_dir) return cfg diff --git a/rlinf/hybrid_engines/fsdp/fsdp_model_manager.py b/rlinf/hybrid_engines/fsdp/fsdp_model_manager.py index 
9679514c0..aad2c130f 100644 --- a/rlinf/hybrid_engines/fsdp/fsdp_model_manager.py +++ b/rlinf/hybrid_engines/fsdp/fsdp_model_manager.py @@ -49,22 +49,35 @@ def __init__(self, cfg: DictConfig): self.tokenizer = hf_tokenizer(cfg.tokenizer.tokenizer_model) def model_provider_func(self) -> torch.nn.Module: + cfg = self._cfg + use_gptq = cfg.model.get("gptq_model", False) + load_in_8bit = cfg.model.get("load_in_8bit", False) + + use_triton = cfg.get("use_triton", True) + + assert torch.cuda.is_available(), "CUDA is not available." + local_rank = int(os.environ.get("LOCAL_RANK", 0)) + device = torch.device(f"cuda:{local_rank}") + model_config = AutoConfig.from_pretrained( - self._cfg.model.model_path, + cfg.model.model_path, trust_remote_code=True, attn_implementation="flash_attention_2", ) - if self._cfg.model.get("gptq_model", False): + if use_gptq: from auto_gptq import AutoGPTQForCausalLM model_wrapper = AutoGPTQForCausalLM.from_quantized( - self._cfg.model.model_path, device="cuda:0", use_triton=True + cfg.model.model_path, + device=device, + use_triton=use_triton, ) model = model_wrapper.model - elif self._cfg.model.get("load_in_8bit", False): + elif load_in_8bit: model = AutoModelForCausalLM.from_pretrained( - self._cfg.model.model_path, + cfg.model.model_path, + config=model_config, load_in_8bit=True, ) else: @@ -73,22 +86,15 @@ def model_provider_func(self) -> torch.nn.Module: else: auto_model_class = AutoModelForCausalLM - # default load in float16 model = auto_model_class.from_pretrained( - self._cfg.model.model_path, + cfg.model.model_path, torch_dtype=self.torch_dtype, config=model_config, trust_remote_code=True, ) - model.to(self.torch_dtype) - - if torch.cuda.is_available(): - model = model.cuda() - if self.torch_dtype == torch.float16: - model = model.half() - - torch.distributed.barrier() + if torch.distributed.is_initialized(): + torch.distributed.barrier() return model def setup_model_and_optimizer(self): @@ -132,7 +138,7 @@ def setup_model_and_optimizer(self): backward_prefetch=( BackwardPrefetch.BACKWARD_PRE if self._cfg.fsdp.backward_prefetch - else BackwardPrefetch.NONE + else None ), limit_all_gathers=self._cfg.fsdp.limit_all_gathers, use_orig_params=self._cfg.fsdp.use_orig_params, diff --git a/rlinf/hybrid_engines/vllm/vllm_0_8_5/worker.py b/rlinf/hybrid_engines/vllm/vllm_0_8_5/worker.py index 519895e49..104334c1a 100644 --- a/rlinf/hybrid_engines/vllm/vllm_0_8_5/worker.py +++ b/rlinf/hybrid_engines/vllm/vllm_0_8_5/worker.py @@ -75,6 +75,19 @@ def initialize_from_config(self, kv_cache_config: KVCacheConfig) -> None: super().initialize_from_config(kv_cache_config) def offload_model_weights(self) -> None: + torch.cuda.synchronize() + + model = self.model_runner.model + with torch.no_grad(): + for mod in model.modules(): + for name, buf in list(getattr(mod, "_buffers", {}).items()): + if isinstance(buf, torch.Tensor) and buf.is_cuda: + cpu_buf = ( + buf.detach().to("cpu", non_blocking=False).contiguous() + ) + mod._buffers[name] = cpu_buf + torch.cuda.empty_cache() + super().sleep(level=2) def sync_hf_weight(self) -> None: diff --git a/rlinf/runners/coding_online_rl_runner.py b/rlinf/runners/coding_online_rl_runner.py index 46be3423a..4ea2ec024 100644 --- a/rlinf/runners/coding_online_rl_runner.py +++ b/rlinf/runners/coding_online_rl_runner.py @@ -212,6 +212,7 @@ def run(self): infer_handle: Handle = self.inference.run_inference( input_channel=self.dataloader_channel, output_channel=self.inference_channel, + rollout_channel=None, 
compute_ref_logprobs=self.compute_ref_logprobs, ) inference_channel = self.inference_channel diff --git a/rlinf/workers/actor/fsdp_actor_worker.py b/rlinf/workers/actor/fsdp_actor_worker.py index 3b18c8e1e..17b9f42ec 100644 --- a/rlinf/workers/actor/fsdp_actor_worker.py +++ b/rlinf/workers/actor/fsdp_actor_worker.py @@ -54,7 +54,6 @@ ModelParallelComponentPlacement, ) from rlinf.utils.utils import ( - compute_entropy_from_logits, compute_logprobs_from_logits, cpu_weight_swap, masked_mean, @@ -132,9 +131,6 @@ def init_worker(self) -> None: if self.cfg.actor.get("enable_offload", False): self.offload_fsdp_param_and_grad() self.offload_fsdp_optimizer() - torch.cuda.synchronize() - gc.collect() - torch.cuda.empty_cache() self._setup_rollout_weight_dst_ranks() def _setup_rollout_weight_dst_ranks(self) -> None: @@ -391,10 +387,6 @@ def run_training(self, input_channel: Channel) -> Tuple[Dict, list]: logprobs = compute_logprobs_from_logits( logits, responses, task_type=self.cfg.runner.task_type ) - if self.calculate_entropy: - entropy = compute_entropy_from_logits( - logits, task_type=self.cfg.runner.task_type - ) # (bsz, response_length) clip_ratio = self.cfg.algorithm.ratio_clip_eps clip_ratio_low = ( diff --git a/rlinf/workers/actor/megatron_actor_worker.py b/rlinf/workers/actor/megatron_actor_worker.py index c7129a321..f7bb3631e 100644 --- a/rlinf/workers/actor/megatron_actor_worker.py +++ b/rlinf/workers/actor/megatron_actor_worker.py @@ -841,7 +841,7 @@ def run_inference( self, input_channel: Channel, output_channel: Channel, - rollout_channel: Channel, + rollout_channel: Optional[Channel], compute_ref_logprobs: bool, ): """ @@ -860,7 +860,9 @@ def run_inference( # Must be called after batch is retrieved, suggesting that rollout has stopped # Otherwise, loading model might cause OOM in the collocated mode self._load_weight_and_optimizer( - input_channel if self.is_pipeline else rollout_channel + input_channel + if self.is_pipeline or rollout_channel is None + else rollout_channel ) # Prev logprobs diff --git a/rlinf/workers/reward/reward_worker.py b/rlinf/workers/reward/reward_worker.py index eea493117..fd2472611 100644 --- a/rlinf/workers/reward/reward_worker.py +++ b/rlinf/workers/reward/reward_worker.py @@ -19,6 +19,7 @@ from rlinf.algorithms.rewards import get_reward_class from rlinf.data.io_struct import RolloutResult +from rlinf.data.tokenizers import hf_tokenizer from rlinf.scheduler import Channel, Worker from rlinf.utils.placement import ModelParallelComponentPlacement @@ -28,7 +29,7 @@ def __init__(self, cfg: DictConfig, placement: ModelParallelComponentPlacement): Worker.__init__(self) self.cfg = cfg self.component_placement = placement - + self.tokenizer = hf_tokenizer(cfg.reward.tokenizer.tokenizer_model) self.total_batch_size_per_dp = ( self.cfg.data.rollout_batch_size * self.cfg.algorithm.get("group_size", 1) diff --git a/tests/e2e_tests/coding_online_rl/qwen2.5-1.5b-ppo.yaml b/tests/e2e_tests/coding_online_rl/qwen2.5-1.5b-ppo.yaml index 1de266648..a87c3a474 100644 --- a/tests/e2e_tests/coding_online_rl/qwen2.5-1.5b-ppo.yaml +++ b/tests/e2e_tests/coding_online_rl/qwen2.5-1.5b-ppo.yaml @@ -12,6 +12,7 @@ cluster: rollout: 0-3 inference: 4-5 actor: 6-7 + reward: 0-3 runner: task_type: coding_online_rl diff --git a/tests/e2e_tests/embodied/libero_130_grpo_openvlaoft.yaml b/tests/e2e_tests/embodied/libero_130_grpo_openvlaoft.yaml index 2365b5a86..2672dcd01 100644 --- a/tests/e2e_tests/embodied/libero_130_grpo_openvlaoft.yaml +++ 
b/tests/e2e_tests/embodied/libero_130_grpo_openvlaoft.yaml @@ -157,6 +157,12 @@ actor: trust_remote_code: True padding_side: "right" + fsdp: + forward_prefetch: True + limit_all_gathers: True + backward_prefetch: True + use_orig_params: True + reward: use_reward_model: False diff --git a/tests/unit_tests/test_io_struct.py b/tests/unit_tests/test_io_struct.py deleted file mode 100644 index 7104c277a..000000000 --- a/tests/unit_tests/test_io_struct.py +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright 2025 The RLinf Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import torch - -from rlinf.data.io_struct import RolloutRequest, RolloutResult - - -def test_rollout_request_repeat_preserves_multimodal(): - request = RolloutRequest( - n=2, - input_ids=[[1, 2, 3], [4, 5]], - image_data=[[b"img1-1", b"img1-2"], []], - answers=["ans1", "ans2"], - multi_modal_inputs=[{"pixels": [1, 2]}, {"pixels": [3]}], - ) - - repeated = request.repeat() - - assert repeated.n == 2 - assert repeated.input_ids == [[1, 2, 3], [1, 2, 3], [4, 5], [4, 5]] - assert repeated.answers == ["ans1", "ans1", "ans2", "ans2"] - assert repeated.image_data == [ - [b"img1-1", b"img1-2"], - [b"img1-1", b"img1-2"], - [], - [], - ] - assert repeated.multi_modal_inputs == [ - {"pixels": [1, 2]}, - {"pixels": [1, 2]}, - {"pixels": [3]}, - {"pixels": [3]}, - ] - - -def _make_rollout_result(): - num_sequence = 4 - group_size = 2 - return RolloutResult( - num_sequence=num_sequence, - group_size=group_size, - prompt_lengths=[3, 3, 4, 4], - prompt_ids=[[11, 12, 13], [11, 12, 13], [21, 22, 23, 24], [21, 22, 23, 24]], - response_lengths=[2, 2, 2, 2], - response_ids=[[101, 102], [201, 202], [301, 302], [401, 402]], - is_end=[True, False, True, True], - answers=[{"answer": "a"}, {"answer": "b"}, {"answer": "c"}, {"answer": "d"}], - image_data=[[b"a"], [b"b"], [b"c"], [b"d"]], - multi_modal_inputs=[ - {"vision": "img-a"}, - {"vision": "img-b"}, - {"vision": "img-c"}, - {"vision": "img-d"}, - ], - prompt_texts=["prompt-a", "prompt-a", "prompt-b", "prompt-b"], - response_texts=["resp-a1", "resp-a2", "resp-b1", "resp-b2"], - rollout_logprobs=[[0.1, 0.2], [0.3, 0.4], [0.5, 0.6], [0.7, 0.8]], - rewards=torch.tensor([[1.0], [0.5], [0.2], [0.1]]), - advantages=[0.1, 0.2, 0.3, 0.4], - prev_logprobs=torch.tensor( - [ - [0.01, 0.02], - [0.03, 0.04], - [0.05, 0.06], - [0.07, 0.08], - ] - ), - ref_logprobs=torch.tensor( - [ - [0.11, 0.12], - [0.13, 0.14], - [0.15, 0.16], - [0.17, 0.18], - ] - ), - ) - - -def test_rollout_result_split_and_merge_roundtrip(): - result = _make_rollout_result() - - split_results = RolloutResult.split_result_list_by_group([result]) - - assert len(split_results) == result.num_sequence // result.group_size - first, second = split_results - - assert first.num_sequence == result.group_size - assert second.num_sequence == result.group_size - assert first.prompt_ids == result.prompt_ids[: result.group_size] - assert second.prompt_ids == result.prompt_ids[result.group_size :] - assert first.response_ids == result.response_ids[: 
result.group_size] - assert second.response_ids == result.response_ids[result.group_size :] - assert first.prompt_texts == result.prompt_texts[: result.group_size] - assert second.prompt_texts == result.prompt_texts[result.group_size :] - assert first.response_texts == result.response_texts[: result.group_size] - assert second.response_texts == result.response_texts[result.group_size :] - assert first.image_data == result.image_data[: result.group_size] - assert second.image_data == result.image_data[result.group_size :] - assert first.multi_modal_inputs == result.multi_modal_inputs[: result.group_size] - assert second.multi_modal_inputs == result.multi_modal_inputs[result.group_size :] - assert first.rollout_logprobs == result.rollout_logprobs[: result.group_size] - assert second.rollout_logprobs == result.rollout_logprobs[result.group_size :] - assert torch.equal(first.rewards, result.rewards[: result.group_size]) - assert torch.equal(second.rewards, result.rewards[result.group_size :]) - assert first.advantages == result.advantages[: result.group_size] - assert second.advantages == result.advantages[result.group_size :] - - merged = RolloutResult.merge_result_list(split_results) - - assert merged.num_sequence == result.num_sequence - assert merged.group_size == result.group_size - assert merged.prompt_ids == result.prompt_ids - assert merged.prompt_lengths == result.prompt_lengths - assert merged.response_ids == result.response_ids - assert merged.response_lengths == result.response_lengths - assert merged.is_end == result.is_end - assert merged.answers == result.answers - assert merged.rollout_logprobs == result.rollout_logprobs - assert merged.advantages == result.advantages - assert torch.equal(merged.rewards, result.rewards) - assert torch.equal(merged.prev_logprobs, result.prev_logprobs) - assert torch.equal(merged.ref_logprobs, result.ref_logprobs) From c1a74b0afe6391fd9b1197d1e2099914e2e2e8dd Mon Sep 17 00:00:00 2001 From: Bo Dai Date: Mon, 13 Oct 2025 02:17:56 +0000 Subject: [PATCH 52/57] feat(ci): fix ci Signed-off-by: Bo Dai --- .github/workflows/code-test.yml | 32 +++++++++---------- .../hybrid_engines/fsdp/fsdp_model_manager.py | 4 +-- .../hybrid_engines/vllm/vllm_0_8_5/worker.py | 11 ++----- rlinf/runners/coding_online_rl_runner.py | 6 +--- .../{run_auto_placement.sh => run.sh} | 0 .../{run_coding_online_rl.sh => run.sh} | 0 .../embodied/libero_goal_grpo_openvlaoft.yaml | 6 ++++ .../embodied/maniskill_grpo_openvlaoft.yaml | 6 ++++ ...wen2.5-vl-3b-grpo-collocated-fsdp-sgl.yaml | 4 +-- ...en2.5-vl-3b-grpo-collocated-fsdp-vllm.yaml | 4 +-- .../reasoning/{run_collocated.sh => run.sh} | 0 tests/e2e_tests/reasoning/run_pipeline.sh | 17 ---------- 12 files changed, 38 insertions(+), 52 deletions(-) rename tests/e2e_tests/auto_placement/{run_auto_placement.sh => run.sh} (100%) rename tests/e2e_tests/coding_online_rl/{run_coding_online_rl.sh => run.sh} (100%) rename tests/e2e_tests/reasoning/{run_collocated.sh => run.sh} (100%) delete mode 100644 tests/e2e_tests/reasoning/run_pipeline.sh diff --git a/.github/workflows/code-test.yml b/.github/workflows/code-test.yml index 830ee82e7..0ec8ca3ef 100644 --- a/.github/workflows/code-test.yml +++ b/.github/workflows/code-test.yml @@ -118,42 +118,42 @@ jobs: run: | export REPO_PATH=$(pwd) source switch_env reason - bash tests/e2e_tests/reasoning/run_collocated.sh qwen2.5-1.5b-grpo-collocated-mg-sgl + bash tests/e2e_tests/reasoning/run.sh qwen2.5-1.5b-grpo-collocated-mg-sgl - name: Megatron vLLM Collocated mode timeout-minutes: 20 run: | 
export REPO_PATH=$(pwd) source switch_env reason - bash tests/e2e_tests/reasoning/run_collocated.sh qwen2.5-1.5b-grpo-collocated-mg-vllm + bash tests/e2e_tests/reasoning/run.sh qwen2.5-1.5b-grpo-collocated-mg-vllm - name: Megatron SGLang Pipeline mode timeout-minutes: 20 run: | export REPO_PATH=$(pwd) source switch_env reason - bash tests/e2e_tests/reasoning/run_pipeline.sh qwen2.5-1.5b-grpo-pipeline-mg-sgl + bash tests/e2e_tests/reasoning/run.sh qwen2.5-1.5b-grpo-pipeline-mg-sgl - name: Megatron vLLM Pipeline mode timeout-minutes: 20 run: | export REPO_PATH=$(pwd) source switch_env reason - bash tests/e2e_tests/reasoning/run_pipeline.sh qwen2.5-1.5b-grpo-pipeline-mg-vllm + bash tests/e2e_tests/reasoning/run.sh qwen2.5-1.5b-grpo-pipeline-mg-vllm - name: FSDP SGLang Collocated mode timeout-minutes: 20 run: | export REPO_PATH=$(pwd) source switch_env reason - bash tests/e2e_tests/reasoning/run_collocated.sh qwen2.5-1.5b-grpo-collocated-fsdp-sgl + bash tests/e2e_tests/reasoning/run.sh qwen2.5-1.5b-grpo-collocated-fsdp-sgl - name: FSDP vLLM Collocated mode timeout-minutes: 20 run: | export REPO_PATH=$(pwd) source switch_env reason - bash tests/e2e_tests/reasoning/run_collocated.sh qwen2.5-1.5b-grpo-collocated-fsdp-vllm + bash tests/e2e_tests/reasoning/run.sh qwen2.5-1.5b-grpo-collocated-fsdp-vllm reason-qwen-grpo-test-rollout-logprobs: @@ -169,42 +169,42 @@ jobs: run: | export REPO_PATH=$(pwd) source switch_env reason - bash tests/e2e_tests/reasoning/run_collocated.sh qwen2.5-1.5b-grpo-collocated-mg-sgl-rollout-logprobs + bash tests/e2e_tests/reasoning/run.sh qwen2.5-1.5b-grpo-collocated-mg-sgl-rollout-logprobs - name: Megatron vLLM Collocated mode timeout-minutes: 20 run: | export REPO_PATH=$(pwd) source switch_env reason - bash tests/e2e_tests/reasoning/run_collocated.sh qwen2.5-1.5b-grpo-collocated-mg-vllm-rollout-logprobs + bash tests/e2e_tests/reasoning/run.sh qwen2.5-1.5b-grpo-collocated-mg-vllm-rollout-logprobs - name: Megatron SGLang Pipeline mode timeout-minutes: 20 run: | export REPO_PATH=$(pwd) source switch_env reason - bash tests/e2e_tests/reasoning/run_pipeline.sh qwen2.5-1.5b-grpo-pipeline-mg-sgl-rollout-logprobs + bash tests/e2e_tests/reasoning/run.sh qwen2.5-1.5b-grpo-pipeline-mg-sgl-rollout-logprobs - name: Megatron vLLM Pipeline mode timeout-minutes: 20 run: | export REPO_PATH=$(pwd) source switch_env reason - bash tests/e2e_tests/reasoning/run_pipeline.sh qwen2.5-1.5b-grpo-pipeline-mg-vllm-rollout-logprobs + bash tests/e2e_tests/reasoning/run.sh qwen2.5-1.5b-grpo-pipeline-mg-vllm-rollout-logprobs - name: FSDP SGLang Collocated mode timeout-minutes: 20 run: | export REPO_PATH=$(pwd) source switch_env reason - bash tests/e2e_tests/reasoning/run_collocated.sh qwen2.5-1.5b-grpo-collocated-fsdp-sgl-rollout-logprobs + bash tests/e2e_tests/reasoning/run.sh qwen2.5-1.5b-grpo-collocated-fsdp-sgl-rollout-logprobs - name: FSDP vLLM Collocated mode timeout-minutes: 20 run: | export REPO_PATH=$(pwd) source switch_env reason - bash tests/e2e_tests/reasoning/run_collocated.sh qwen2.5-1.5b-grpo-collocated-fsdp-vllm-rollout-logprobs + bash tests/e2e_tests/reasoning/run.sh qwen2.5-1.5b-grpo-collocated-fsdp-vllm-rollout-logprobs coding-online-rl-qwen-ppo-test: needs: [check-changes] @@ -223,7 +223,7 @@ jobs: run: | export REPO_PATH=$(pwd) source switch_env reason - bash tests/e2e_tests/coding_online_rl/run_coding_online_rl.sh + bash tests/e2e_tests/coding_online_rl/run.sh qwen-vl-grpo-test: needs: [check-changes] @@ -238,14 +238,14 @@ jobs: run: | export REPO_PATH=$(pwd) source switch_env 
reason - bash tests/e2e_tests/reasoning/run_collocated.sh qwen2.5-vl-3b-grpo-collocated-fsdp-sgl + bash tests/e2e_tests/reasoning/run.sh qwen2.5-vl-3b-grpo-collocated-fsdp-sgl - name: FSDP vLLM Collocated mode timeout-minutes: 20 run: | export REPO_PATH=$(pwd) source switch_env reason - bash tests/e2e_tests/reasoning/run_collocated.sh qwen2.5-vl-3b-grpo-collocated-fsdp-vllm + bash tests/e2e_tests/reasoning/run.sh qwen2.5-vl-3b-grpo-collocated-fsdp-vllm # =============================================== embodied e2e tests ==================================================== @@ -321,7 +321,7 @@ jobs: run: | export REPO_PATH=$(pwd) source switch_env reason - bash tests/e2e_tests/auto_placement/run_auto_placement.sh + bash tests/e2e_tests/auto_placement/run.sh # =============================================== finale ==================================================== diff --git a/rlinf/hybrid_engines/fsdp/fsdp_model_manager.py b/rlinf/hybrid_engines/fsdp/fsdp_model_manager.py index aad2c130f..c16ddc3a8 100644 --- a/rlinf/hybrid_engines/fsdp/fsdp_model_manager.py +++ b/rlinf/hybrid_engines/fsdp/fsdp_model_manager.py @@ -110,8 +110,8 @@ def setup_model_and_optimizer(self): mixed_precision = MixedPrecision( param_dtype=self.torch_dtype, - reduce_dtype=torch.float32, - buffer_dtype=torch.float32, + reduce_dtype=self.torch_dtype, + buffer_dtype=self.torch_dtype, ) if self._cfg.model.sharding_strategy == "full_shard": diff --git a/rlinf/hybrid_engines/vllm/vllm_0_8_5/worker.py b/rlinf/hybrid_engines/vllm/vllm_0_8_5/worker.py index 104334c1a..9a3cf4d85 100644 --- a/rlinf/hybrid_engines/vllm/vllm_0_8_5/worker.py +++ b/rlinf/hybrid_engines/vllm/vllm_0_8_5/worker.py @@ -78,14 +78,9 @@ def offload_model_weights(self) -> None: torch.cuda.synchronize() model = self.model_runner.model - with torch.no_grad(): - for mod in model.modules(): - for name, buf in list(getattr(mod, "_buffers", {}).items()): - if isinstance(buf, torch.Tensor) and buf.is_cuda: - cpu_buf = ( - buf.detach().to("cpu", non_blocking=False).contiguous() - ) - mod._buffers[name] = cpu_buf + self._sleep_saved_buffers = { + name: buffer.cpu().clone() for name, buffer in model.named_buffers() + } torch.cuda.empty_cache() super().sleep(level=2) diff --git a/rlinf/runners/coding_online_rl_runner.py b/rlinf/runners/coding_online_rl_runner.py index 4ea2ec024..377667523 100644 --- a/rlinf/runners/coding_online_rl_runner.py +++ b/rlinf/runners/coding_online_rl_runner.py @@ -222,16 +222,12 @@ def run(self): # Advantages and returns adv_handle: Handle = self.actor.compute_advantages_and_returns( - input_channel=self.inference_channel, + input_channel=inference_channel, output_channel=self.actor_channel, ) # Actor training actor_input_channel = self.actor_channel - if self.is_pipeline: - # In pipeline mode, the rollout already contains the advantages and returns - # So the above two steps are in fact no-ops, and we should directly use the inference channel as the input - actor_input_channel = inference_channel actor_handle: Handle = self.actor.run_training( input_channel=actor_input_channel, ) diff --git a/tests/e2e_tests/auto_placement/run_auto_placement.sh b/tests/e2e_tests/auto_placement/run.sh similarity index 100% rename from tests/e2e_tests/auto_placement/run_auto_placement.sh rename to tests/e2e_tests/auto_placement/run.sh diff --git a/tests/e2e_tests/coding_online_rl/run_coding_online_rl.sh b/tests/e2e_tests/coding_online_rl/run.sh similarity index 100% rename from tests/e2e_tests/coding_online_rl/run_coding_online_rl.sh rename to 
tests/e2e_tests/coding_online_rl/run.sh diff --git a/tests/e2e_tests/embodied/libero_goal_grpo_openvlaoft.yaml b/tests/e2e_tests/embodied/libero_goal_grpo_openvlaoft.yaml index ddfe4a500..6dc7893d3 100644 --- a/tests/e2e_tests/embodied/libero_goal_grpo_openvlaoft.yaml +++ b/tests/e2e_tests/embodied/libero_goal_grpo_openvlaoft.yaml @@ -156,6 +156,12 @@ actor: trust_remote_code: True padding_side: "right" + fsdp: + forward_prefetch: False + limit_all_gathers: False + backward_prefetch: False + use_orig_params: False + reward: use_reward_model: False diff --git a/tests/e2e_tests/embodied/maniskill_grpo_openvlaoft.yaml b/tests/e2e_tests/embodied/maniskill_grpo_openvlaoft.yaml index ab384947a..f26ce7c3d 100644 --- a/tests/e2e_tests/embodied/maniskill_grpo_openvlaoft.yaml +++ b/tests/e2e_tests/embodied/maniskill_grpo_openvlaoft.yaml @@ -155,6 +155,12 @@ actor: adam_eps: 1.0e-05 clip_grad: 10.0 + fsdp: + forward_prefetch: False + limit_all_gathers: False + backward_prefetch: False + use_orig_params: False + reward: use_reward_model: False diff --git a/tests/e2e_tests/reasoning/qwen2.5-vl-3b-grpo-collocated-fsdp-sgl.yaml b/tests/e2e_tests/reasoning/qwen2.5-vl-3b-grpo-collocated-fsdp-sgl.yaml index 46278b949..ddc087f99 100644 --- a/tests/e2e_tests/reasoning/qwen2.5-vl-3b-grpo-collocated-fsdp-sgl.yaml +++ b/tests/e2e_tests/reasoning/qwen2.5-vl-3b-grpo-collocated-fsdp-sgl.yaml @@ -20,10 +20,10 @@ runner: logger_backends: ["tensorboard"] # wandb, swanlab max_epochs: 1 - max_steps: 3 + max_steps: 2 val_check_interval: 1 - save_interval: 50 + save_interval: -1 seq_length: 2048 diff --git a/tests/e2e_tests/reasoning/qwen2.5-vl-3b-grpo-collocated-fsdp-vllm.yaml b/tests/e2e_tests/reasoning/qwen2.5-vl-3b-grpo-collocated-fsdp-vllm.yaml index fb7c8aa02..86c64ebe0 100644 --- a/tests/e2e_tests/reasoning/qwen2.5-vl-3b-grpo-collocated-fsdp-vllm.yaml +++ b/tests/e2e_tests/reasoning/qwen2.5-vl-3b-grpo-collocated-fsdp-vllm.yaml @@ -20,10 +20,10 @@ runner: logger_backends: ["tensorboard"] # wandb, swanlab max_epochs: 1 - max_steps: 3 + max_steps: 2 val_check_interval: 1 - save_interval: 50 + save_interval: -1 seq_length: 2048 diff --git a/tests/e2e_tests/reasoning/run_collocated.sh b/tests/e2e_tests/reasoning/run.sh similarity index 100% rename from tests/e2e_tests/reasoning/run_collocated.sh rename to tests/e2e_tests/reasoning/run.sh diff --git a/tests/e2e_tests/reasoning/run_pipeline.sh b/tests/e2e_tests/reasoning/run_pipeline.sh deleted file mode 100644 index 3ca1574f0..000000000 --- a/tests/e2e_tests/reasoning/run_pipeline.sh +++ /dev/null @@ -1,17 +0,0 @@ -#! /bin/bash -set -x - -tabs 4 -export CUDA_DEVICE_MAX_CONNECTIONS=1 -export TOKENIZERS_PARALLELISM=false - -export PYTHONPATH=${REPO_PATH}:$PYTHONPATH - -if [ -z "$1" ]; then - echo "Please provide a config name as the first argument." 
- exit 1 -else - CONFIG_NAME=$1 -fi - -python ${REPO_PATH}/examples/reasoning/main_grpo.py --config-path $REPO_PATH/tests/e2e_tests/reasoning --config-name $CONFIG_NAME \ No newline at end of file From 2d5131394fc6b5f9ac73049e86c768f39bb3312b Mon Sep 17 00:00:00 2001 From: Bo Dai Date: Mon, 13 Oct 2025 12:14:24 +0000 Subject: [PATCH 53/57] fix(reward): remove redundant reward definitions Signed-off-by: Bo Dai --- rlinf/algorithms/registry.py | 19 ------- rlinf/algorithms/rewards/__init__.py | 2 + rlinf/algorithms/rewards/code/__init__.py | 30 ++++++++++ rlinf/algorithms/rewards/vqa/__init__.py | 56 ++++++++----------- rlinf/workers/actor/fsdp_actor_worker.py | 10 ---- rlinf/workers/actor/megatron_actor_worker.py | 11 ++-- rlinf/workers/rollout/sglang/sglang_worker.py | 26 --------- rlinf/workers/rollout/vllm/vllm_worker.py | 2 - .../coding_online_rl/qwen2.5-1.5b-ppo.yaml | 2 +- toolkits/__init__.py | 19 ------- toolkits/code_verifier/verify.py | 2 - toolkits/math_verifier/verify.py | 27 --------- 12 files changed, 60 insertions(+), 146 deletions(-) create mode 100644 rlinf/algorithms/rewards/code/__init__.py diff --git a/rlinf/algorithms/registry.py b/rlinf/algorithms/registry.py index 96b19f2b4..11bfc6c6a 100644 --- a/rlinf/algorithms/registry.py +++ b/rlinf/algorithms/registry.py @@ -73,22 +73,3 @@ def calculate_adv_and_returns(**kwargs) -> Tuple[torch.Tensor, Optional[torch.Te adv_type = kwargs["adv_type"] fn = get_adv_and_returns(adv_type) return fn(**kwargs) - - -REWARD_REGISTRY: Dict[str, Callable] = {} - - -def register_reward_fn(name: str): - def decorator(fn): - REWARD_REGISTRY[name] = fn - return fn - - return decorator - - -def get_reward_fn(name: Optional[str]): - if name is None: - return None - if name not in REWARD_REGISTRY: - raise ValueError(f"Reward function {name} not registered") - return REWARD_REGISTRY[name] diff --git a/rlinf/algorithms/rewards/__init__.py b/rlinf/algorithms/rewards/__init__.py index 2ab6528ca..380cfa102 100644 --- a/rlinf/algorithms/rewards/__init__.py +++ b/rlinf/algorithms/rewards/__init__.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from rlinf.algorithms.rewards.code import CodeReward from rlinf.algorithms.rewards.math import MathReward from rlinf.algorithms.rewards.vqa import VQAReward @@ -30,3 +31,4 @@ def get_reward_class(name: str): register_reward("math", MathReward) register_reward("vqa", VQAReward) +register_reward("code", CodeReward) diff --git a/rlinf/algorithms/rewards/code/__init__.py b/rlinf/algorithms/rewards/code/__init__.py new file mode 100644 index 000000000..0fc75f971 --- /dev/null +++ b/rlinf/algorithms/rewards/code/__init__.py @@ -0,0 +1,30 @@ +# Copyright 2025 The RLinf Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import List + +from omegaconf import DictConfig + +from toolkits.code_verifier.verify import fim_verify_call + + +class CodeReward: + def __init__(self, config: DictConfig): + self.scale = config.get("reward_scale", 1.0) + + def get_reward( + self, response: List[str], reference: List[List[str]] + ) -> List[float]: + rewards = fim_verify_call(response, reference) + return [float(reward) * self.scale for reward in rewards] diff --git a/rlinf/algorithms/rewards/vqa/__init__.py b/rlinf/algorithms/rewards/vqa/__init__.py index 8175d72a1..77b009369 100644 --- a/rlinf/algorithms/rewards/vqa/__init__.py +++ b/rlinf/algorithms/rewards/vqa/__init__.py @@ -22,51 +22,41 @@ class VQAReward: - def __init__(self, config: DictConfig): - reward_weights_config = config.get( - "reward_weights", - { - "qa_accuracy": 1.0, - "think_format": 0.0, - "answer_format": 0.0, - }, - ) - for reward_name, reward_weight in reward_weights_config.items(): - assert reward_name in ["qa_accuracy", "think_format", "answer_format"], ( - f"Reward {reward_name} not supported" - ) - assert reward_weight >= 0, ( - f"Reward weight {reward_weight} must be non-negative" - ) - self.reward_weights = [ - reward_weights_config["qa_accuracy"], - reward_weights_config["think_format"], - reward_weights_config["answer_format"], - ] + NEEDED_REWARD_FUNCTIONS = { + "qa_accuracy": qa_accuracy_reward, + "think_format": think_format_reward, + "answer_format": answer_format_reward, + } - self.reward_functions = [ - qa_accuracy_reward, - think_format_reward, - answer_format_reward, - ] + def __init__(self, config: DictConfig): + assert "reward_weights" in config, "VQAReward requires reward_weights in config" + self.reward_weights_config = config.reward_weights + assert set(self.reward_weights_config.keys()) == set( + self.NEEDED_REWARD_FUNCTIONS.keys() + ), ( + f"Reward weights must contains all of: {self.NEEDED_REWARD_FUNCTIONS.keys()} but got {list(self.reward_weights_config.keys())}" + ) + assert all( + reward_weight >= 0 for reward_weight in self.reward_weights_config.values() + ), ( + f"All reward weights must be non-negative but got {list(self.reward_weights_config.values())}" + ) self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") def get_reward(self, completions: List[str], answers: List[dict]) -> List[float]: rewards = [] - for i, reward_function in enumerate(self.reward_functions): - if self.reward_weights[i] > 0: + reward_weights = [] + for reward_name, reward_function in self.NEEDED_REWARD_FUNCTIONS.items(): + if self.reward_weights_config[reward_name] > 0: rewards.append(reward_function(completions, answers)) else: rewards.append([0.0] * len(completions)) + reward_weights.append(self.reward_weights_config[reward_name]) - # Apply weights to each reward function's output and sum - - # rewards [num_reward_functions, len(completions)] rewards_tensor = torch.tensor(rewards, device=self.device) - weights_tensor = torch.tensor(self.reward_weights, device=self.device) + weights_tensor = torch.tensor(reward_weights, device=self.device) - # [num_reward_functions, num_completions] * [num_reward_functions, 1] -> [num_completions] final_rewards = (rewards_tensor * weights_tensor.unsqueeze(1)).sum(dim=0) return final_rewards.tolist() diff --git a/rlinf/workers/actor/fsdp_actor_worker.py b/rlinf/workers/actor/fsdp_actor_worker.py index 17b9f42ec..06607903f 100644 --- a/rlinf/workers/actor/fsdp_actor_worker.py +++ b/rlinf/workers/actor/fsdp_actor_worker.py @@ -111,16 +111,6 @@ def __init__(self, cfg: DictConfig, 
placement: ModelParallelComponentPlacement): f"algorithm.loss_agg_func={self.cfg.algorithm.loss_agg_func} is not supported!" ) - # Reward configurations - if not self.cfg.reward.use_reward_model: - assert self.cfg.reward.reward_type in ["math", "vqa"], ( - "only support math and vqa reward!" - ) - from rlinf.algorithms.rewards import get_reward_class - - reward_cls = get_reward_class(self.cfg.reward.reward_type) - self.reward = reward_cls(self.cfg.reward) - def init_worker(self) -> None: self.setup_model_and_optimizer() if self.cfg.algorithm.kl_beta > 0 and self.cfg.actor.get( diff --git a/rlinf/workers/actor/megatron_actor_worker.py b/rlinf/workers/actor/megatron_actor_worker.py index f7bb3631e..5bd19ddc5 100644 --- a/rlinf/workers/actor/megatron_actor_worker.py +++ b/rlinf/workers/actor/megatron_actor_worker.py @@ -31,7 +31,6 @@ from rlinf.algorithms.registry import ( actor_loss, calculate_adv_and_returns, - get_reward_fn, ) from rlinf.algorithms.utils import kl_penalty from rlinf.data.io_struct import ( @@ -79,7 +78,6 @@ seq_mean_token_sum, ) from rlinf.workers.rollout.utils import RankMapper -from toolkits import register_rewards class MegatronActor(MegatronModelManager, Worker): @@ -102,6 +100,10 @@ def __init__( self.cfg = cfg self.component_placement = placement + # check placement validity when actor backend is megatron + assert placement.rollout_tp_size <= placement.actor_tp_size, ( + f" rollout tensor parallel size {placement.rollout_tp_size} must be less than or equal to actor tensor parallel size {placement.actor_tp_size}." + ) # Data configurations self.response_len = ( role_cfg.model.encoder_seq_length - cfg.data.max_prompt_length @@ -154,11 +156,6 @@ def __init__( self.ref_policy_state_dict = None self.is_pipeline = self.component_placement.is_disaggregated - # Reward configurations - if not self.cfg.reward.use_reward_model: - register_rewards() - self.reward_fn = get_reward_fn(self.cfg.reward.reward_type) - # Rollout configurations self.rollout_group_name = self.cfg.rollout.group_name diff --git a/rlinf/workers/rollout/sglang/sglang_worker.py b/rlinf/workers/rollout/sglang/sglang_worker.py index abdf59365..4d5c51552 100644 --- a/rlinf/workers/rollout/sglang/sglang_worker.py +++ b/rlinf/workers/rollout/sglang/sglang_worker.py @@ -18,7 +18,6 @@ from typing import Dict, List, Optional, Tuple import numpy as np -import torch from omegaconf import DictConfig from sglang.srt.server_args import ServerArgs from transformers import AutoTokenizer @@ -35,7 +34,6 @@ from rlinf.workers.rollout.utils import ( print_sglang_outputs, ) -from toolkits.math_verifier.verify import MathRewardModel, math_verify_call class SGLangWorker(Worker): @@ -233,7 +231,6 @@ def __init__(self, config: DictConfig, placement: ComponentPlacement): self._rollout_end_event = asyncio.Event() self._sync_weight_end_event = asyncio.Event() - self._reward_model = MathRewardModel(scale=self._cfg.reward.reward_scale) assert self._rollout_batch_size is None, ( "rollout_batch_size_per_gpu is not supported in AsyncSGLangWorker" ) @@ -262,29 +259,6 @@ async def init_worker(self): if self._cfg.rollout.validate_weight: await self._validate_weight_at_first() - async def _compute_reward_and_advantage( - self, engine_results: List[Dict], answer: str - ): - answers = [answer] * len(engine_results) - texts: List[str] = [] - for res in engine_results: - if hasattr(res, "text"): - texts.append(res["text"]) - else: - texts.append( - self._tokenizer.decode(res["output_ids"], skip_special_tokens=True) - ) - - results = 
math_verify_call(texts, answers) - rewards = [r * self._reward_model.scale for r in results] - rewards_tensor = torch.tensor(rewards, dtype=torch.float) - - mean = rewards_tensor.mean() - std = rewards_tensor.std() - advantages = (rewards_tensor - mean) / (std + 1e-6) - - return rewards, advantages.tolist() - async def _async_generate( self, raw_id: int, input_ids: List[int], sampling_params: dict ): diff --git a/rlinf/workers/rollout/vllm/vllm_worker.py b/rlinf/workers/rollout/vllm/vllm_worker.py index b3629b170..3d36b9a44 100644 --- a/rlinf/workers/rollout/vllm/vllm_worker.py +++ b/rlinf/workers/rollout/vllm/vllm_worker.py @@ -35,7 +35,6 @@ from rlinf.scheduler import Channel, Worker from rlinf.utils.placement import ComponentPlacement from rlinf.workers.rollout.utils import print_vllm_outputs -from toolkits.math_verifier.verify import MathRewardModel from . import VLLMExecutor @@ -68,7 +67,6 @@ def __init__(self, config: DictConfig, placement: ComponentPlacement): "The capital of France is", "The future of AI is", ] - self._reward_model = MathRewardModel(self._cfg.reward.reward_scale) self.request_counter = Counter() def _prepare_vllm_environment(self) -> None: diff --git a/tests/e2e_tests/coding_online_rl/qwen2.5-1.5b-ppo.yaml b/tests/e2e_tests/coding_online_rl/qwen2.5-1.5b-ppo.yaml index a87c3a474..f34ee4cae 100644 --- a/tests/e2e_tests/coding_online_rl/qwen2.5-1.5b-ppo.yaml +++ b/tests/e2e_tests/coding_online_rl/qwen2.5-1.5b-ppo.yaml @@ -283,7 +283,7 @@ actor: reward: use_reward_model: False - reward_type: fim_verify_call + reward_type: code reward_scale: 5.0 critic: diff --git a/toolkits/__init__.py b/toolkits/__init__.py index 8b6f0114a..5b365ea1e 100644 --- a/toolkits/__init__.py +++ b/toolkits/__init__.py @@ -11,22 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- - -from rlinf.algorithms.registry import get_reward_fn - - -def register_rewards(): - try: - from toolkits.code_verifier.verify import fim_verify_call - - assert get_reward_fn("fim_verify_call") == fim_verify_call - except ImportError: - pass - - try: - from toolkits.math_verifier.verify import math_verify_call - - assert get_reward_fn("math") == math_verify_call - except ImportError: - pass diff --git a/toolkits/code_verifier/verify.py b/toolkits/code_verifier/verify.py index 0ad756d02..5017b9e54 100644 --- a/toolkits/code_verifier/verify.py +++ b/toolkits/code_verifier/verify.py @@ -21,10 +21,8 @@ except ImportError: fuzz = None FUZZY_AVAILABLE = False -from rlinf.algorithms.registry import register_reward_fn -@register_reward_fn("fim_verify_call") def fim_verify_call( responses: List[str], references: List[str], diff --git a/toolkits/math_verifier/verify.py b/toolkits/math_verifier/verify.py index 988b86045..8d0cbdb11 100644 --- a/toolkits/math_verifier/verify.py +++ b/toolkits/math_verifier/verify.py @@ -29,7 +29,6 @@ from sympy.parsing.latex import parse_latex from sympy.parsing.sympy_parser import parse_expr -from rlinf.algorithms.registry import register_reward_fn from toolkits.math_verifier.parser import extract_answer global_executor = ProcessPoolExecutor(max_workers=40) @@ -389,7 +388,6 @@ def verify_math_solution(answer: str, solution: str): return process_results(answer, solution)[0] -@register_reward_fn("math") def math_verify_call( responses: List[str], references: List[str], @@ -429,31 +427,6 @@ def math_verify_call( return labels -class MathRewardModel: - def __init__(self, scale: float): - self.scale = scale - - def get_reward( - self, response: List[str], reference: List[List[str]] - ) -> List[float]: - """ - Calculates reward scores for a list of responses compared to corresponding lists of reference answers. - For each response, the function checks if it matches any of the provided references using the `process_results` function. - The reward for each response is computed as the first element of the result (converted to float) multiplied by `self.scale`. - Args: - response (List[str]): A list of response strings to be evaluated. - reference (List[List[str]]): A list where each element is a list of reference strings corresponding to each response. - Returns: - List[float]: A list of reward scores, one for each response. 
- """ - - results = [] - for resp, refs in zip(response, reference): - result = any(process_results(resp, ref)[0] for ref in refs) - results.append((1 if result else -1) * self.scale) - return results - - if __name__ == "__main__": sample = { "answers": ["\\boxed{-\\frac{2}{3}}"], From fbc9be75f6eb807cc7772956d38e0f185a3c8661 Mon Sep 17 00:00:00 2001 From: Bo Dai Date: Mon, 13 Oct 2025 14:19:05 +0000 Subject: [PATCH 54/57] fix(lock): set fsdp's recompute_logprobs True for lock competition safety Signed-off-by: Bo Dai --- examples/reasoning/config/math/qwen2.5-1.5b-grpo-fsdp.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/reasoning/config/math/qwen2.5-1.5b-grpo-fsdp.yaml b/examples/reasoning/config/math/qwen2.5-1.5b-grpo-fsdp.yaml index 19281cbc9..6d76c780e 100644 --- a/examples/reasoning/config/math/qwen2.5-1.5b-grpo-fsdp.yaml +++ b/examples/reasoning/config/math/qwen2.5-1.5b-grpo-fsdp.yaml @@ -48,7 +48,7 @@ algorithm: # val rollout mbs val_rollout_batch_size_per_gpu: 4 # ${.rollout_batch_size_per_gpu} - recompute_logprobs: False + recompute_logprobs: True shuffle_rollout: False # GRPO loss params From 29693590fcc4cc706f22829e5d946a62c73209d0 Mon Sep 17 00:00:00 2001 From: Bo Dai Date: Tue, 14 Oct 2025 03:33:41 +0000 Subject: [PATCH 55/57] chore: remove useless code, add correct dp_group param for mg Signed-off-by: Bo Dai --- rlinf/hybrid_engines/vllm/vllm_0_8_5/worker.py | 8 -------- rlinf/workers/actor/megatron_actor_worker.py | 1 + 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/rlinf/hybrid_engines/vllm/vllm_0_8_5/worker.py b/rlinf/hybrid_engines/vllm/vllm_0_8_5/worker.py index 9a3cf4d85..519895e49 100644 --- a/rlinf/hybrid_engines/vllm/vllm_0_8_5/worker.py +++ b/rlinf/hybrid_engines/vllm/vllm_0_8_5/worker.py @@ -75,14 +75,6 @@ def initialize_from_config(self, kv_cache_config: KVCacheConfig) -> None: super().initialize_from_config(kv_cache_config) def offload_model_weights(self) -> None: - torch.cuda.synchronize() - - model = self.model_runner.model - self._sleep_saved_buffers = { - name: buffer.cpu().clone() for name, buffer in model.named_buffers() - } - torch.cuda.empty_cache() - super().sleep(level=2) def sync_hf_weight(self) -> None: diff --git a/rlinf/workers/actor/megatron_actor_worker.py b/rlinf/workers/actor/megatron_actor_worker.py index 5bd19ddc5..40289ddf0 100644 --- a/rlinf/workers/actor/megatron_actor_worker.py +++ b/rlinf/workers/actor/megatron_actor_worker.py @@ -967,6 +967,7 @@ def _compute_rollout_metrics(self, batch): self.cfg.data.max_prompt_length, self.response_len, self._world_size, + dp_group=parallel_state.get_data_parallel_group(), ) ) From 9dae32e8e9024bec7f27c29c429ddbe01024c150 Mon Sep 17 00:00:00 2001 From: Bo Dai Date: Tue, 14 Oct 2025 03:55:36 +0000 Subject: [PATCH 56/57] fix(reward): move reward worker's timer to where reward computation really happens Signed-off-by: Bo Dai --- rlinf/workers/reward/reward_worker.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/rlinf/workers/reward/reward_worker.py b/rlinf/workers/reward/reward_worker.py index fd2472611..88be65ddc 100644 --- a/rlinf/workers/reward/reward_worker.py +++ b/rlinf/workers/reward/reward_worker.py @@ -60,13 +60,11 @@ def compute_rewards(self, input_channel: Channel, output_channel: Channel): input_channel: The input channel to read from. output_channel: The output channel to send results to. 
""" - - with self.worker_timer(): - recv_batch_size = 0 - while recv_batch_size < self.total_batch_size_per_dp: - rollout_result: RolloutResult = input_channel.get() - recv_batch_size += rollout_result.num_sequence - + recv_batch_size = 0 + while recv_batch_size < self.total_batch_size_per_dp: + rollout_result: RolloutResult = input_channel.get() + recv_batch_size += rollout_result.num_sequence + with self.worker_timer(): if rollout_result.rewards is None: if self.cfg.reward.use_reward_model: with input_channel.device_lock: @@ -83,11 +81,11 @@ def compute_rewards(self, input_channel: Channel, output_channel: Channel): rollout_result ) - output_channel.put(rollout_result) + output_channel.put(rollout_result) - assert recv_batch_size == self.total_batch_size_per_dp, ( - f"Expected {self.total_batch_size_per_dp} sequences from channel, but got {recv_batch_size}" - ) + assert recv_batch_size == self.total_batch_size_per_dp, ( + f"Expected {self.total_batch_size_per_dp} sequences from channel, but got {recv_batch_size}" + ) def _compute_rule_based_rewards(self, rollout_result: RolloutResult): # Decode only the generated tokens; response_ids are already the post-prompt tokens From b334af1a8e0dc7a4172e15cb8ab74ea6a4d0f783 Mon Sep 17 00:00:00 2001 From: Taoxu-unihannover <2423750782@qq.com> Date: Fri, 7 Nov 2025 10:03:15 +0800 Subject: [PATCH 57/57] Rlinf-Acend --- .gitignore | 3 +- .../config/math/qwen2.5-1.5b-grpo-fsdp.yaml | 26 +- .../config/vqa/qwen2.5-vl-3b-grpo-fsdp.yaml | 20 +- .../vqa/qwen2.5-vl-3b-grpo-fsdp_baak.yaml | 222 +++++ examples/reasoning/run_main_grpo_math.sh | 2 +- examples/reasoning/run_main_grpo_vqa.sh | 0 fusion_result.json | 21 + logs-lxs/cmd.sh | 3 + .../tensorboard/config.json | 205 ++++ ...events.1761824220.hostname-kjuul.1028618.0 | Bin 0 -> 88 bytes ...events.1761825588.hostname-kjuul.1562049.0 | Bin 0 -> 88 bytes .../tensorboard/config.json | 205 ++++ ...events.1761829334.hostname-kjuul.2113735.0 | Bin 0 -> 88 bytes .../tensorboard/config.json | 205 ++++ ...events.1761898947.hostname-kjuul.3557129.0 | Bin 0 -> 88 bytes ...tfevents.1761900889.hostname-kjuul.30866.0 | Bin 0 -> 30524 bytes .../tensorboard/config.json | 205 ++++ ...events.1761820416.hostname-kjuul.3651077.0 | Bin 0 -> 88 bytes ...tfevents.1761820811.hostname-kjuul.84956.0 | Bin 0 -> 88 bytes ...fevents.1761821088.hostname-kjuul.261840.0 | Bin 0 -> 88 bytes logs-lxs/fsdp-sglang/tensorboard/config.json | 205 ++++ ...events.1761816870.hostname-kjuul.2287170.0 | Bin 0 -> 88 bytes ...events.1761817980.hostname-kjuul.2660597.0 | Bin 0 -> 3420 bytes ...events.1761819019.hostname-kjuul.2893398.0 | Bin 0 -> 6808 bytes ...events.1761819424.hostname-kjuul.3154814.0 | Bin 0 -> 6808 bytes logs-lxs/fusion_result.json | 1 + logs-lxs/run_sglang.py | 32 + rlinf/algorithms/losses.py | 14 +- rlinf/data/io_struct.py | 28 +- .../hybrid_engines/fsdp/fsdp_model_manager.py | 5 +- .../sglang/sglang_0_5_2/__init__.py | 13 + .../sglang/sglang_0_5_2/io_struct.py | 59 ++ .../sglang/sglang_0_5_2/sgl_engine.py | 363 +++++++ .../sglang/sglang_0_5_2/sgl_scheduler.py | 476 +++++++++ .../sglang/sglang_0_5_2/tokenizer_manager.py | 129 +++ rlinf/utils/distributed.py | 44 +- rlinf/utils/utils.py | 28 +- rlinf/workers/actor/fsdp_actor_worker.py | 66 +- rlinf/workers/actor/fsdp_actor_worker_bak.py | 903 ++++++++++++++++++ rlinf/workers/rollout/sglang/__init__.py | 6 + rlinf/workers/rollout/sglang/sglang_worker.py | 5 + rlinf/workers/rollout/vllm/vllm_worker.py | 3 +- test.py | 3 + 43 files changed, 3394 insertions(+), 106 deletions(-) create 
mode 100644 examples/reasoning/config/vqa/qwen2.5-vl-3b-grpo-fsdp_baak.yaml mode change 100644 => 100755 examples/reasoning/run_main_grpo_vqa.sh create mode 100644 fusion_result.json create mode 100644 logs-lxs/cmd.sh create mode 100644 logs-lxs/fsdp-sglang-256*8-16card-kernel-timeout/tensorboard/config.json create mode 100644 logs-lxs/fsdp-sglang-256*8-16card-kernel-timeout/tensorboard/events.out.tfevents.1761824220.hostname-kjuul.1028618.0 create mode 100644 logs-lxs/fsdp-sglang-256*8-16card-kernel-timeout/tensorboard/events.out.tfevents.1761825588.hostname-kjuul.1562049.0 create mode 100644 logs-lxs/fsdp-sglang-512*8-16card-kernel-timeout/tensorboard/config.json create mode 100644 logs-lxs/fsdp-sglang-512*8-16card-kernel-timeout/tensorboard/events.out.tfevents.1761829334.hostname-kjuul.2113735.0 create mode 100644 logs-lxs/fsdp-sglang-512*8-16card-timeout/tensorboard/config.json create mode 100644 logs-lxs/fsdp-sglang-512*8-16card-timeout/tensorboard/events.out.tfevents.1761898947.hostname-kjuul.3557129.0 create mode 100644 logs-lxs/fsdp-sglang-512*8-16card-timeout/tensorboard/events.out.tfevents.1761900889.hostname-kjuul.30866.0 create mode 100644 logs-lxs/fsdp-sglang-512*8-16card/tensorboard/config.json create mode 100644 logs-lxs/fsdp-sglang-512*8-16card/tensorboard/events.out.tfevents.1761820416.hostname-kjuul.3651077.0 create mode 100644 logs-lxs/fsdp-sglang-512*8-16card/tensorboard/events.out.tfevents.1761820811.hostname-kjuul.84956.0 create mode 100644 logs-lxs/fsdp-sglang-512*8-16card/tensorboard/events.out.tfevents.1761821088.hostname-kjuul.261840.0 create mode 100644 logs-lxs/fsdp-sglang/tensorboard/config.json create mode 100644 logs-lxs/fsdp-sglang/tensorboard/events.out.tfevents.1761816870.hostname-kjuul.2287170.0 create mode 100644 logs-lxs/fsdp-sglang/tensorboard/events.out.tfevents.1761817980.hostname-kjuul.2660597.0 create mode 100644 logs-lxs/fsdp-sglang/tensorboard/events.out.tfevents.1761819019.hostname-kjuul.2893398.0 create mode 100644 logs-lxs/fsdp-sglang/tensorboard/events.out.tfevents.1761819424.hostname-kjuul.3154814.0 create mode 100644 logs-lxs/fusion_result.json create mode 100644 logs-lxs/run_sglang.py create mode 100644 rlinf/hybrid_engines/sglang/sglang_0_5_2/__init__.py create mode 100644 rlinf/hybrid_engines/sglang/sglang_0_5_2/io_struct.py create mode 100644 rlinf/hybrid_engines/sglang/sglang_0_5_2/sgl_engine.py create mode 100644 rlinf/hybrid_engines/sglang/sglang_0_5_2/sgl_scheduler.py create mode 100644 rlinf/hybrid_engines/sglang/sglang_0_5_2/tokenizer_manager.py create mode 100644 rlinf/workers/actor/fsdp_actor_worker_bak.py create mode 100644 test.py diff --git a/.gitignore b/.gitignore index c6d3a86ea..49fc83734 100644 --- a/.gitignore +++ b/.gitignore @@ -15,4 +15,5 @@ logs/* *.so .venv uv.lock -assets/ \ No newline at end of file +assets/ +kernel_meta/ \ No newline at end of file diff --git a/examples/reasoning/config/math/qwen2.5-1.5b-grpo-fsdp.yaml b/examples/reasoning/config/math/qwen2.5-1.5b-grpo-fsdp.yaml index 6d76c780e..c6e8251ac 100644 --- a/examples/reasoning/config/math/qwen2.5-1.5b-grpo-fsdp.yaml +++ b/examples/reasoning/config/math/qwen2.5-1.5b-grpo-fsdp.yaml @@ -31,8 +31,8 @@ runner: max_tokens_per_mbs: 28672 resume_dir: null - experiment_name: grpo-1.5b - output_dir: ../results + experiment_name: fsdp-sglang-512*8-16card-timeout + output_dir: ./logs algorithm: group_size: 8 @@ -83,7 +83,7 @@ rollout: gpu_memory_utilization: 0.55 - model_dir: /path/to/model/DeepSeek-R1-Distill-Qwen-1.5B/ + model_dir: 
/home/weight/DeepSeek-R1-Distill-Qwen-1.5B-2layer/ model_arch: qwen2.5 enforce_eager: False # if False, rollout engine will capture cuda graph, which will take more time to initialize. distributed_executor_backend: mp # ray or mp @@ -92,11 +92,11 @@ rollout: padding: null # will be tokenizer.pad_token_id if null. it is used to filter megatron's padding for rollout engine eos: null # will be tokenizer.eos_token_id if null. - rollout_backend: vllm # here choose which backend to rollout,support [sglang, vllm] + rollout_backend: sglang # here choose which backend to rollout,support [sglang, vllm] sglang: - attention_backend: triton # [flashinfer, triton] for more, see sglang's doc - decode_log_interval: 500000 # the interval for SGLang to log the decode time and other stats. + attention_backend: ascend # [flashinfer, triton] for more, see sglang's doc + decode_log_interval: 4096 # the interval for SGLang to log the decode time and other stats. use_torch_compile: False # enable torch_compile in SGLang for rollout. torch_compile_max_bs: 128 # the maximum batch size for torch compile. If the batch size is larger than this, torch compile will not be used. @@ -108,14 +108,14 @@ rollout: return_logprobs: ${not:${algorithm.recompute_logprobs}} - tensor_parallel_size: 2 + tensor_parallel_size: 1 pipeline_parallel_size: 1 validate_weight: False # whether to send all weights at first for weight comparison. validate_save_dir: null # the directory to save the weights for comparison. If validate_weight is True, this will be used to save the weights for comparison. print_outputs: False # whether to print the outputs (token ids, texts, etc.) of rollout engine. - max_running_requests: 64 # the maximum number of running requests in the rollout engine. + max_running_requests: 128 # the maximum number of running requests in the rollout engine. cuda_graph_max_bs: 128 # the maximum batch size for cuda graph. If the batch size is larger than this, cuda graph will not be used. 
 
 data:
@@ -123,14 +123,14 @@ data:
   dataset_name: boba
   max_prompt_length: 1024
   filter_prompt_by_length: True
-  rollout_batch_size: 512
+  rollout_batch_size: 32
   val_rollout_batch_size: null
   num_workers: 2
   shuffle: True
   validation_shuffle: True
   seed: 1234
-  train_data_paths: ["/dataset/boba/AReaL-boba-106k.jsonl"]
-  val_data_paths: ["/dataset/boba/AReaL-boba-106k.jsonl"]
+  train_data_paths: ["/home/dataset/boba/AReaL-boba-106k.jsonl"]
+  val_data_paths: ["/home/dataset/boba/AReaL-boba-106k.jsonl"]
   prompt_key: prompt
   image_keys: [image]
   answer_key: answer
@@ -164,7 +164,7 @@ actor:
     seq_length: ${runner.seq_length}
     encoder_seq_length: ${runner.seq_length}
 
-    model_path: /path/to/model/DeepSeek-R1-Distill-Qwen-1.5B/
+    model_path: /home/weight/DeepSeek-R1-Distill-Qwen-1.5B-2layer/
 
   optim:
     optimizer: adam
@@ -194,7 +194,7 @@ actor:
     lr_decay_iters: 10
 
   tokenizer:
-    tokenizer_model: /path/to/model/DeepSeek-R1-Distill-Qwen-1.5B/
+    tokenizer_model: /home/weight/DeepSeek-R1-Distill-Qwen-1.5B-2layer/
     use_fast: False
     trust_remote_code: True
     padding_side: 'right'
diff --git a/examples/reasoning/config/vqa/qwen2.5-vl-3b-grpo-fsdp.yaml b/examples/reasoning/config/vqa/qwen2.5-vl-3b-grpo-fsdp.yaml
index 8d32a6a33..17c192fa2 100644
--- a/examples/reasoning/config/vqa/qwen2.5-vl-3b-grpo-fsdp.yaml
+++ b/examples/reasoning/config/vqa/qwen2.5-vl-3b-grpo-fsdp.yaml
@@ -84,7 +84,7 @@ rollout:
 
   gpu_memory_utilization: 0.55
 
-  model_dir: /path/to/model/Qwen2.5-VL-3B-Instruct
+  model_dir: /home/x00922209/models/Qwen/Qwen2.5-VL-3B-Instruct
   model_arch: qwen2.5_vl #qwen2.5
   enforce_eager: False # if False, rollout engine will capture cuda graph, which will take more time to initialize.
   distributed_executor_backend: mp # ray or mp
@@ -93,10 +93,10 @@ rollout:
   padding: null # will be tokenizer.pad_token_id if null. it is used to filter megatron's padding for rollout engine
   eos: null # will be tokenizer.eos_token_id if null.
 
-  rollout_backend: vllm # here choose which backend to rollout,support [sglang, vllm]
+  rollout_backend: sglang # choose which backend to use for rollout; supported: [sglang, vllm]
 
   sglang:
-    attention_backend: triton # [flashinfer, triton] for more, see sglang's doc
+    attention_backend: ascend # [flashinfer, triton, ascend]; for more, see sglang's doc
     decode_log_interval: 500000 # the interval for SGLang to log the decode time and other stats.
     use_torch_compile: False # enable torch_compile in SGLang for rollout.
     torch_compile_max_bs: 128 # the maximum batch size for torch compile. If the batch size is larger than this, torch compile will not be used.
@@ -109,7 +109,7 @@ rollout:
 
   return_logprobs: ${not:${algorithm.recompute_logprobs}}
 
-  tensor_parallel_size: 2
+  tensor_parallel_size: 1
   pipeline_parallel_size: 1
 
   validate_weight: False # whether to send all weights at first for weight comparison.
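`validate_weight` enables a one-time debugging pass that ships the full set of actor weights to the rollout engine so the two copies can be compared. A minimal sketch of what such a comparison can look like, assuming nothing about RLinf's actual implementation:

# Illustrative only: fingerprint both sides' state dicts and diff them.
import hashlib

import torch


def weight_digest(state_dict: dict) -> dict:
    """Fingerprint each parameter; cast to float32 so bf16 tensors hash cleanly."""
    return {
        name: hashlib.sha256(
            t.detach().to(torch.float32).cpu().contiguous().numpy().tobytes()
        ).hexdigest()[:16]
        for name, t in state_dict.items()
    }


def mismatched_params(actor_digest: dict, rollout_digest: dict) -> list:
    """Parameter names whose bytes differ between trainer and rollout engine."""
    return sorted(k for k in actor_digest if rollout_digest.get(k) != actor_digest[k])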
@@ -137,8 +137,8 @@ data: shuffle: True validation_shuffle: True seed: 1234 - train_data_paths: ["/dataset/robo2vlm-1/data/train/"] - val_data_paths: ["/dataset/robo2vlm-1/data/val/"] + train_data_paths: ["/home/x00922209/datasets/advaitgupta/robo2VLM/data/"] + val_data_paths: ["/home/x00922209/datasets/advaitgupta/robo2VLM/data/"] actor: group_name: "ActorGroup" @@ -165,7 +165,7 @@ actor: seq_length: ${runner.seq_length} encoder_seq_length: ${runner.seq_length} - model_path: /path/to/model/Qwen2.5-VL-3B-Instruct/ + model_path: /home/x00922209/models/Qwen/Qwen2.5-VL-3B-Instruct model_arch: ${rollout.model_arch} @@ -197,7 +197,7 @@ actor: lr_decay_iters: 10 tokenizer: - tokenizer_model: /path/to/model/Qwen2.5-VL-3B-Instruct + tokenizer_model: /home/x00922209/models/Qwen/Qwen2.5-VL-3B-Instruct use_fast: False trust_remote_code: True padding_side: 'right' @@ -219,10 +219,10 @@ reward: answer_format: 0.0 tokenizer: - tokenizer_model: /path/to/model/Qwen2.5-VL-3B-Instruct + tokenizer_model: /home/x00922209/models/Qwen/Qwen2.5-VL-3B-Instruct use_fast: False trust_remote_code: True padding_side: 'right' critic: - use_critic_model: false \ No newline at end of file + use_critic_model: false diff --git a/examples/reasoning/config/vqa/qwen2.5-vl-3b-grpo-fsdp_baak.yaml b/examples/reasoning/config/vqa/qwen2.5-vl-3b-grpo-fsdp_baak.yaml new file mode 100644 index 000000000..fa5b16080 --- /dev/null +++ b/examples/reasoning/config/vqa/qwen2.5-vl-3b-grpo-fsdp_baak.yaml @@ -0,0 +1,222 @@ +defaults: + - override hydra/job_logging: stdout + +hydra: + run: + dir: . + output_subdir: null + +cluster: + num_nodes: 1 + component_placement: + actor,rollout,reward: all + +runner: + task_type: reasoning + logger: + log_path: ${runner.output_dir}/${runner.experiment_name} + project_name: rlinf + experiment_name: ${runner.experiment_name} + logger_backends: ["tensorboard"] # wandb, swanlab + + max_epochs: 5 + max_steps: -1 + + val_check_interval: 1 + save_interval: 50 + + seq_length: 2048 + + enable_dynamic_batch_size: False + max_tokens_per_mbs: 28672 + + resume_dir: null + experiment_name: grpo-1.5b + output_dir: ../results + +algorithm: + group_size: 8 + + n_minibatches: 4 + training_batch_size_per_gpu: 1 # micro batch size + rollout_batch_size_per_gpu: null # If set to null, rollout_batch_size will be evenly divided across all inference instances. You can reduce this parameter if inference consumes too much GPU memory. 
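Per the comment above on `rollout_batch_size_per_gpu`, a null value is resolved by dividing `data.rollout_batch_size` evenly across the inference instances. A hypothetical helper showing that arithmetic (function and parameter names are illustrative, not RLinf's API):

# Illustrative sketch of the null-handling described in the comment above.
from typing import Optional


def resolve_rollout_batch_size_per_gpu(
    rollout_batch_size: int,
    num_inference_instances: int,
    configured: Optional[int] = None,
) -> int:
    if configured is not None:
        return configured
    per_gpu, remainder = divmod(rollout_batch_size, num_inference_instances)
    if remainder:
        raise ValueError(
            f"rollout_batch_size={rollout_batch_size} does not divide evenly "
            f"across {num_inference_instances} inference instances"
        )
    return per_gpu


# e.g. 512 prompts spread over 16 inference instances -> 32 per instance
assert resolve_rollout_batch_size_per_gpu(512, 16) == 32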
+
+  # mbs to do log prob inference, can be set to
+  # lower than rollout_batch_size_per_gpu to reduce
+  # memory usage
+  logprob_forward_micro_batch_size: 1 # ${.rollout_batch_size_per_gpu}
+
+  # val rollout mbs
+  val_rollout_batch_size_per_gpu: 4 # ${.rollout_batch_size_per_gpu}
+
+  recompute_logprobs: False
+  shuffle_rollout: False
+
+  # GRPO loss params
+  loss_type: math_ppo_actor
+  loss_agg_func: "token-mean"
+  kl_beta: 0.0 # 0.001
+  kl_penalty_type: low_var_kl
+  ratio_clip_eps: 0.2
+  entropy_bonus: 0.0
+  calculate_entropy: False
+  clip_ratio_c: null # 3.0
+  clip_ratio_low: null
+  clip_ratio_high: null
+
+  adv_type: math_grpo
+  normalize_advantages: True
+  early_stop_imp_ratio: 5.0
+  use_valid_token_scale: False
+
+  # params for rollout
+  sampling_params:
+    use_greedy: False
+    temperature: 1.0
+    top_k: 1000000
+    top_p: 1.0
+    repetition_penalty: 1.0
+    max_new_tokens: ${subtract:${runner.seq_length}, ${data.max_prompt_length}}
+    min_new_tokens: 1
+
+rollout:
+  group_name: "RolloutGroup"
+
+  gpu_memory_utilization: 0.55
+
+  model_dir: /home/x00922209/models/Qwen/Qwen2.5-VL-3B-Instruct
+  model_arch: qwen2.5_vl #qwen2.5
+  enforce_eager: False # if False, rollout engine will capture cuda graph, which will take more time to initialize.
+  distributed_executor_backend: mp # ray or mp
+  disable_log_stats: False
+  detokenize: False # Whether to detokenize the output. During RL we actually don't need to detokenize it. Can be set to True for debugging.
+  padding: null # will be tokenizer.pad_token_id if null. it is used to filter megatron's padding for rollout engine
+  eos: null # will be tokenizer.eos_token_id if null.
+
+  rollout_backend: sglang # choose which backend to use for rollout; supported: [sglang, vllm]
+
+  sglang:
+    attention_backend: ascend # [flashinfer, triton, ascend]; for more, see sglang's doc
+    decode_log_interval: 500000 # the interval for SGLang to log the decode time and other stats.
+    use_torch_compile: False # enable torch_compile in SGLang for rollout.
+    torch_compile_max_bs: 128 # the maximum batch size for torch compile. If the batch size is larger than this, torch compile will not be used.
+
+  vllm:
+    attention_backend: FLASH_ATTN #[FLASH_ATTN,XFORMERS] for more, see vllm's doc
+    enable_chunked_prefill: True # enable vllm to use chunked_prefill.
+    enable_prefix_caching: True # enable vllm to use prefix_caching.
+    enable_flash_infer_sampler: True # if True, vllm will use flashinfer to do sampling.
+
+  return_logprobs: ${not:${algorithm.recompute_logprobs}}
+
+  tensor_parallel_size: 2
+  pipeline_parallel_size: 1
+
+  validate_weight: False # whether to send all weights at first for weight comparison.
+  validate_save_dir: null # the directory to save the weights for comparison. If validate_weight is True, this will be used to save the weights for comparison.
+  print_outputs: False # whether to print the outputs (token ids, texts, etc.) of rollout engine.
+
+  max_running_requests: 64 # the maximum number of running requests in the rollout engine.
+  cuda_graph_max_bs: 128 # the maximum batch size for cuda graph. If the batch size is larger than this, cuda graph will not be used.
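With `use_greedy: False`, `temperature: 1.0`, `top_p: 1.0`, and an effectively unbounded `top_k`, the `sampling_params` block above amounts to plain ancestral sampling. A sketch of how such a block might be translated into the dict a rollout engine consumes (engine-side key names are assumptions, not SGLang's or vLLM's confirmed API):

def to_engine_sampling_params(p: dict) -> dict:
    """Translate the YAML sampling_params block into an engine-style dict (illustrative)."""
    if p["use_greedy"]:
        # Greedy decoding is conventionally expressed as temperature 0.0.
        return {"temperature": 0.0, "max_new_tokens": p["max_new_tokens"]}
    return {
        "temperature": p["temperature"],
        "top_k": p["top_k"],
        "top_p": p["top_p"],
        "repetition_penalty": p["repetition_penalty"],
        "max_new_tokens": p["max_new_tokens"],
        "min_new_tokens": p["min_new_tokens"],
    }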
+
+data:
+  type: vision_language
+  dataset_name: robo2vlm
+  max_prompt_length: 1024
+  filter_prompt_by_length: True
+  rollout_batch_size: 8
+  val_rollout_batch_size: null
+  num_workers: 2
+  prompt_key: prompt
+  image_keys: ["image"] # some vlm datasets may have multiple image columns
+  choice_key: "choices"
+  answer_key: "answer"
+  solution_key: "solution"
+  use_chat_template: True
+  lazy_loading: True
+  shuffle: True
+  validation_shuffle: True
+  seed: 1234
+  train_data_paths: ["/home/x00922209/datasets/advaitgupta/robo2VLM/data/"]
+  val_data_paths: ["/home/x00922209/datasets/advaitgupta/robo2VLM/data/"]
+
+actor:
+  group_name: "ActorGroup"
+  training_backend: fsdp
+  mcore_gpt: True
+  spec_name: decoder_gpt
+
+  enable_offload: True
+  checkpoint_load_path: null
+
+  global_batch_size: 8
+  micro_batch_size: 1
+
+  enable_dp_load_balance: False
+
+  calculate_flops: False
+
+  seed: 1234
+
+  model:
+    precision: bf16
+    sharding_strategy: full_shard
+    is_lora: False
+
+    seq_length: ${runner.seq_length}
+    encoder_seq_length: ${runner.seq_length}
+    model_path: /home/x00922209/models/Qwen/Qwen2.5-VL-3B-Instruct
+
+  model_arch: ${rollout.model_arch}
+
+  optim:
+    optimizer: adam
+    bf16: True #False
+    fp16: False #True
+    lr: 2e-05
+    adam_beta1: 0.9
+    adam_beta2: 0.95
+    adam_eps: 1.0e-05
+    min_lr: 2.0e-6
+    weight_decay: 0.05
+    use_distributed_optimizer: True
+    overlap_grad_reduce: False
+    overlap_param_gather: False
+    optimizer_enable_pin: false
+    overlap_param_gather_with_optimizer_step: False
+    clip_grad: 0.8
+    loss_scale: 65536
+
+  lr_sched:
+    lr_warmup_fraction: 0.01
+    lr_warmup_init: 0.0
+    lr_warmup_iters: 0
+    max_lr: 2.0e-5
+    min_lr: 0.0
+    lr_decay_style: constant
+    lr_decay_iters: 10
+
+  tokenizer:
+    tokenizer_model: /home/x00922209/models/Qwen/Qwen2.5-VL-3B-Instruct
+    use_fast: False
+    trust_remote_code: True
+    padding_side: 'right'
+
+reward:
+  group_name: "RewardGroup"
+  use_reward_model: false
+  reward_type: 'vqa'
+  reward_scale: 1.0
+  reward_weights:
+    qa_accuracy: 1.0
+    think_format: 0.0
+    answer_format: 0.0
+
+  tokenizer:
+    tokenizer_model: /home/x00922209/models/Qwen/Qwen2.5-VL-3B-Instruct
+    use_fast: False
+    trust_remote_code: True
+    padding_side: 'right'
+
+critic:
+  use_critic_model: false
\ No newline at end of file
diff --git a/examples/reasoning/run_main_grpo_math.sh b/examples/reasoning/run_main_grpo_math.sh
index 18a48d780..81f4fcd56 100644
--- a/examples/reasoning/run_main_grpo_math.sh
+++ b/examples/reasoning/run_main_grpo_math.sh
@@ -14,7 +14,7 @@ export PYTHONPATH=${REPO_PATH}:${MEGATRON_PATH}:$PYTHONPATH
 if [ -z "$1" ]; then
-    CONFIG_NAME="qwen2.5-1.5b-grpo-megatron"
+    CONFIG_NAME="qwen2.5-1.5b-grpo-fsdp"
 else
     CONFIG_NAME=$1
 fi
 
 python ${REPO_PATH}/examples/reasoning/main_grpo.py --config-path ${CONFIG_PATH}/config/math/ --config-name $CONFIG_NAME
\ No newline at end of file
diff --git a/examples/reasoning/run_main_grpo_vqa.sh b/examples/reasoning/run_main_grpo_vqa.sh
old mode 100644
new mode 100755
diff --git a/fusion_result.json b/fusion_result.json
new file mode 100644
index 000000000..20b56950d
--- /dev/null
+++ b/fusion_result.json
@@ -0,0 +1,21 @@
+{
+    "session_and_graph_id_0_0": {
+        "graph_fusion": {
+            "IndexByTensorStaticFusionPass": {
+                "effect_times": "0",
+                "match_times": "1"
+            },
+            "RefreshInt64ToInt32FusionPass": {
+                "effect_times": "0",
+                "match_times": "1"
+            }
+        },
+        "ub_fusion": {
+            "AutomaticUbFusion": {
+                "effect_times": "0",
+                "match_times": "2",
+                "repository_hit_times": "0"
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/logs-lxs/cmd.sh
b/logs-lxs/cmd.sh new file mode 100644 index 000000000..31c8fc1ee --- /dev/null +++ b/logs-lxs/cmd.sh @@ -0,0 +1,3 @@ +nohup bash examples/reasoning/run_main_grpo_math.sh examples/reasoning/config/math/qwen2.5-1.5b-grpo-fsdp.yaml > test_math.log 2>&1 + +bash examples/reasoning/run_main_grpo_math.sh qwen2.5-1.5b-grpo-fsdp \ No newline at end of file diff --git a/logs-lxs/fsdp-sglang-256*8-16card-kernel-timeout/tensorboard/config.json b/logs-lxs/fsdp-sglang-256*8-16card-kernel-timeout/tensorboard/config.json new file mode 100644 index 000000000..1e6eeade1 --- /dev/null +++ b/logs-lxs/fsdp-sglang-256*8-16card-kernel-timeout/tensorboard/config.json @@ -0,0 +1,205 @@ +{ + "cluster": { + "num_nodes": 1, + "component_placement": { + "actor,rollout,reward": "all" + } + }, + "runner": { + "task_type": "reasoning", + "logger": { + "log_path": "./logs/fsdp-sglang-256*8-16card", + "project_name": "rlinf", + "experiment_name": "fsdp-sglang-256*8-16card", + "logger_backends": [ + "tensorboard" + ] + }, + "max_epochs": 5, + "max_steps": -1, + "val_check_interval": 1, + "save_interval": 50, + "seq_length": 28672, + "enable_dynamic_batch_size": false, + "max_tokens_per_mbs": 28672, + "resume_dir": null, + "experiment_name": "fsdp-sglang-256*8-16card", + "output_dir": "./logs" + }, + "algorithm": { + "group_size": 8, + "n_minibatches": 4, + "training_batch_size_per_gpu": 1, + "rollout_batch_size_per_gpu": null, + "logprob_forward_micro_batch_size": 1, + "val_rollout_batch_size_per_gpu": 4, + "recompute_logprobs": true, + "shuffle_rollout": false, + "loss_type": "math_ppo_actor", + "loss_agg_func": "token-mean", + "kl_beta": 0.0, + "kl_penalty_type": "low_var_kl", + "ratio_clip_eps": 0.2, + "entropy_bonus": 0.0, + "calculate_entropy": false, + "clip_ratio_c": null, + "clip_ratio_low": null, + "clip_ratio_high": null, + "adv_type": "math_grpo", + "normalize_advantages": true, + "early_stop_imp_ratio": 5.0, + "use_valid_token_scale": false, + "sampling_params": { + "use_greedy": false, + "temperature": 1.0, + "top_k": 1000000, + "top_p": 1.0, + "repetition_penalty": 1.0, + "max_new_tokens": 27648, + "min_new_tokens": 1 + }, + "max_num_gen_batches": 1 + }, + "rollout": { + "group_name": "RolloutGroup", + "gpu_memory_utilization": 0.55, + "model_dir": "/home/weight/DeepSeek-R1-Distill-Qwen-1.5B-2layer/", + "model_arch": "qwen2.5", + "enforce_eager": false, + "distributed_executor_backend": "mp", + "disable_log_stats": false, + "detokenize": false, + "padding": null, + "eos": null, + "rollout_backend": "sglang", + "sglang": { + "attention_backend": "ascend", + "decode_log_interval": 4096, + "use_torch_compile": false, + "torch_compile_max_bs": 128 + }, + "vllm": { + "attention_backend": "FLASH_ATTN", + "enable_chunked_prefill": true, + "enable_prefix_caching": true, + "enable_flash_infer_sampler": true, + "max_num_batched_tokens": null, + "torch_profiler_dir": null + }, + "return_logprobs": false, + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "validate_weight": false, + "validate_save_dir": null, + "print_outputs": false, + "max_running_requests": 128, + "cuda_graph_max_bs": 128 + }, + "data": { + "type": "math", + "dataset_name": "boba", + "max_prompt_length": 1024, + "filter_prompt_by_length": true, + "rollout_batch_size": 256, + "val_rollout_batch_size": null, + "num_workers": 2, + "shuffle": true, + "validation_shuffle": true, + "seed": 1234, + "train_data_paths": [ + "/home/dataset/boba/AReaL-boba-106k.jsonl" + ], + "val_data_paths": [ + "/home/dataset/boba/AReaL-boba-106k.jsonl" + ], + 
"prompt_key": "prompt", + "image_keys": [ + "image" + ], + "answer_key": "answer", + "choice_key": "choices", + "solution_key": null, + "use_chat_template": true, + "lazy_loading": true + }, + "actor": { + "group_name": "ActorGroup", + "training_backend": "fsdp", + "mcore_gpt": true, + "spec_name": "decoder_gpt", + "enable_offload": true, + "checkpoint_load_path": null, + "global_batch_size": 512, + "micro_batch_size": 1, + "enable_dp_load_balance": false, + "calculate_flops": false, + "seed": 1234, + "model": { + "precision": "bf16", + "sharding_strategy": "full_shard", + "is_lora": false, + "seq_length": 28672, + "encoder_seq_length": 28672, + "model_path": "/home/weight/DeepSeek-R1-Distill-Qwen-1.5B-2layer/" + }, + "optim": { + "optimizer": "adam", + "bf16": true, + "fp16": false, + "lr": 2e-05, + "adam_beta1": 0.9, + "adam_beta2": 0.95, + "adam_eps": 1e-05, + "min_lr": 2e-06, + "weight_decay": 0.05, + "use_distributed_optimizer": true, + "overlap_grad_reduce": false, + "overlap_param_gather": false, + "optimizer_enable_pin": false, + "overlap_param_gather_with_optimizer_step": false, + "clip_grad": 0.8, + "loss_scale": 65536 + }, + "lr_sched": { + "lr_warmup_fraction": 0.01, + "lr_warmup_init": 0.0, + "lr_warmup_iters": 0, + "max_lr": 2e-05, + "min_lr": 0.0, + "lr_decay_style": "constant", + "lr_decay_iters": 10 + }, + "tokenizer": { + "tokenizer_model": "/home/weight/DeepSeek-R1-Distill-Qwen-1.5B-2layer/", + "use_fast": false, + "trust_remote_code": true, + "padding_side": "right" + }, + "fsdp": { + "forward_prefetch": true, + "limit_all_gathers": true, + "backward_prefetch": true, + "use_orig_params": true + } + }, + "reward": { + "group_name": "RewardGroup", + "use_reward_model": false, + "reward_type": "math", + "reward_scale": 5.0, + "reward_weights": { + "qa_accuracy": 1.0, + "think_format": 0.0, + "answer_format": 0.0 + }, + "tokenizer": { + "tokenizer_model": "/home/weight/DeepSeek-R1-Distill-Qwen-1.5B-2layer/", + "use_fast": false, + "trust_remote_code": true, + "padding_side": "right" + } + }, + "critic": { + "use_critic_model": false + } +} \ No newline at end of file diff --git a/logs-lxs/fsdp-sglang-256*8-16card-kernel-timeout/tensorboard/events.out.tfevents.1761824220.hostname-kjuul.1028618.0 b/logs-lxs/fsdp-sglang-256*8-16card-kernel-timeout/tensorboard/events.out.tfevents.1761824220.hostname-kjuul.1028618.0 new file mode 100644 index 0000000000000000000000000000000000000000..853a8b900036ac502df01abe80138a76f85a7d6e GIT binary patch literal 88 zcmeZZfPjCKJmzxF{-;!a+2NL>6mL>dVrHJ6YguYuiIq{19+yr@YF=@EQBrM*}QKepaQD#YMkzOiDReV}zPHH?<dVrHJ6YguYuiIq{19+yr@YF=@EQBrM*}QKepaQD#YMkzOiDReV}zPHH?vq$?;+6#ywzAIbm# literal 0 HcmV?d00001 diff --git a/logs-lxs/fsdp-sglang-512*8-16card-kernel-timeout/tensorboard/config.json b/logs-lxs/fsdp-sglang-512*8-16card-kernel-timeout/tensorboard/config.json new file mode 100644 index 000000000..13cd40304 --- /dev/null +++ b/logs-lxs/fsdp-sglang-512*8-16card-kernel-timeout/tensorboard/config.json @@ -0,0 +1,205 @@ +{ + "cluster": { + "num_nodes": 1, + "component_placement": { + "actor,rollout,reward": "all" + } + }, + "runner": { + "task_type": "reasoning", + "logger": { + "log_path": "./logs/fsdp-sglang-512*8-16card-timeout", + "project_name": "rlinf", + "experiment_name": "fsdp-sglang-512*8-16card-timeout", + "logger_backends": [ + "tensorboard" + ] + }, + "max_epochs": 5, + "max_steps": -1, + "val_check_interval": 1, + "save_interval": 50, + "seq_length": 28672, + "enable_dynamic_batch_size": false, + "max_tokens_per_mbs": 28672, + "resume_dir": 
null, + "experiment_name": "fsdp-sglang-512*8-16card-timeout", + "output_dir": "./logs" + }, + "algorithm": { + "group_size": 8, + "n_minibatches": 4, + "training_batch_size_per_gpu": 1, + "rollout_batch_size_per_gpu": null, + "logprob_forward_micro_batch_size": 1, + "val_rollout_batch_size_per_gpu": 4, + "recompute_logprobs": true, + "shuffle_rollout": false, + "loss_type": "math_ppo_actor", + "loss_agg_func": "token-mean", + "kl_beta": 0.0, + "kl_penalty_type": "low_var_kl", + "ratio_clip_eps": 0.2, + "entropy_bonus": 0.0, + "calculate_entropy": false, + "clip_ratio_c": null, + "clip_ratio_low": null, + "clip_ratio_high": null, + "adv_type": "math_grpo", + "normalize_advantages": true, + "early_stop_imp_ratio": 5.0, + "use_valid_token_scale": false, + "sampling_params": { + "use_greedy": false, + "temperature": 1.0, + "top_k": 1000000, + "top_p": 1.0, + "repetition_penalty": 1.0, + "max_new_tokens": 27648, + "min_new_tokens": 1 + }, + "max_num_gen_batches": 1 + }, + "rollout": { + "group_name": "RolloutGroup", + "gpu_memory_utilization": 0.55, + "model_dir": "/home/weight/DeepSeek-R1-Distill-Qwen-1.5B-2layer/", + "model_arch": "qwen2.5", + "enforce_eager": false, + "distributed_executor_backend": "mp", + "disable_log_stats": false, + "detokenize": false, + "padding": null, + "eos": null, + "rollout_backend": "sglang", + "sglang": { + "attention_backend": "ascend", + "decode_log_interval": 4096, + "use_torch_compile": false, + "torch_compile_max_bs": 128 + }, + "vllm": { + "attention_backend": "FLASH_ATTN", + "enable_chunked_prefill": true, + "enable_prefix_caching": true, + "enable_flash_infer_sampler": true, + "max_num_batched_tokens": null, + "torch_profiler_dir": null + }, + "return_logprobs": false, + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "validate_weight": false, + "validate_save_dir": null, + "print_outputs": false, + "max_running_requests": 128, + "cuda_graph_max_bs": 128 + }, + "data": { + "type": "math", + "dataset_name": "boba", + "max_prompt_length": 1024, + "filter_prompt_by_length": true, + "rollout_batch_size": 512, + "val_rollout_batch_size": null, + "num_workers": 2, + "shuffle": true, + "validation_shuffle": true, + "seed": 1234, + "train_data_paths": [ + "/home/dataset/boba/AReaL-boba-106k.jsonl" + ], + "val_data_paths": [ + "/home/dataset/boba/AReaL-boba-106k.jsonl" + ], + "prompt_key": "prompt", + "image_keys": [ + "image" + ], + "answer_key": "answer", + "choice_key": "choices", + "solution_key": null, + "use_chat_template": true, + "lazy_loading": true + }, + "actor": { + "group_name": "ActorGroup", + "training_backend": "fsdp", + "mcore_gpt": true, + "spec_name": "decoder_gpt", + "enable_offload": true, + "checkpoint_load_path": null, + "global_batch_size": 1024, + "micro_batch_size": 1, + "enable_dp_load_balance": false, + "calculate_flops": false, + "seed": 1234, + "model": { + "precision": "bf16", + "sharding_strategy": "full_shard", + "is_lora": false, + "seq_length": 28672, + "encoder_seq_length": 28672, + "model_path": "/home/weight/DeepSeek-R1-Distill-Qwen-1.5B-2layer/" + }, + "optim": { + "optimizer": "adam", + "bf16": true, + "fp16": false, + "lr": 2e-05, + "adam_beta1": 0.9, + "adam_beta2": 0.95, + "adam_eps": 1e-05, + "min_lr": 2e-06, + "weight_decay": 0.05, + "use_distributed_optimizer": true, + "overlap_grad_reduce": false, + "overlap_param_gather": false, + "optimizer_enable_pin": false, + "overlap_param_gather_with_optimizer_step": false, + "clip_grad": 0.8, + "loss_scale": 65536 + }, + "lr_sched": { + "lr_warmup_fraction": 
0.01, + "lr_warmup_init": 0.0, + "lr_warmup_iters": 0, + "max_lr": 2e-05, + "min_lr": 0.0, + "lr_decay_style": "constant", + "lr_decay_iters": 10 + }, + "tokenizer": { + "tokenizer_model": "/home/weight/DeepSeek-R1-Distill-Qwen-1.5B-2layer/", + "use_fast": false, + "trust_remote_code": true, + "padding_side": "right" + }, + "fsdp": { + "forward_prefetch": true, + "limit_all_gathers": true, + "backward_prefetch": true, + "use_orig_params": true + } + }, + "reward": { + "group_name": "RewardGroup", + "use_reward_model": false, + "reward_type": "math", + "reward_scale": 5.0, + "reward_weights": { + "qa_accuracy": 1.0, + "think_format": 0.0, + "answer_format": 0.0 + }, + "tokenizer": { + "tokenizer_model": "/home/weight/DeepSeek-R1-Distill-Qwen-1.5B-2layer/", + "use_fast": false, + "trust_remote_code": true, + "padding_side": "right" + } + }, + "critic": { + "use_critic_model": false + } +} \ No newline at end of file diff --git a/logs-lxs/fsdp-sglang-512*8-16card-kernel-timeout/tensorboard/events.out.tfevents.1761829334.hostname-kjuul.2113735.0 b/logs-lxs/fsdp-sglang-512*8-16card-kernel-timeout/tensorboard/events.out.tfevents.1761829334.hostname-kjuul.2113735.0 new file mode 100644 index 0000000000000000000000000000000000000000..a4cd17a41f7f3213d45cde32d74dc0818a4de3f1 GIT binary patch literal 88 zcmeZZfPjCKJmzxl>045I!{L^r6mL>dVrHJ6YguYuiIq{19+yr@YF=@EQBrM*}QKepaQD#YMkzOiDReV}zPHH?vB*i(<002kwAO8RV literal 0 HcmV?d00001 diff --git a/logs-lxs/fsdp-sglang-512*8-16card-timeout/tensorboard/config.json b/logs-lxs/fsdp-sglang-512*8-16card-timeout/tensorboard/config.json new file mode 100644 index 000000000..c1197da7b --- /dev/null +++ b/logs-lxs/fsdp-sglang-512*8-16card-timeout/tensorboard/config.json @@ -0,0 +1,205 @@ +{ + "cluster": { + "num_nodes": 1, + "component_placement": { + "actor,rollout,reward": "all" + } + }, + "runner": { + "task_type": "reasoning", + "logger": { + "log_path": "./logs/fsdp-sglang-512*8-16card-timeout", + "project_name": "rlinf", + "experiment_name": "fsdp-sglang-512*8-16card-timeout", + "logger_backends": [ + "tensorboard" + ] + }, + "max_epochs": 5, + "max_steps": -1, + "val_check_interval": 1, + "save_interval": 50, + "seq_length": 28672, + "enable_dynamic_batch_size": false, + "max_tokens_per_mbs": 28672, + "resume_dir": null, + "experiment_name": "fsdp-sglang-512*8-16card-timeout", + "output_dir": "./logs" + }, + "algorithm": { + "group_size": 8, + "n_minibatches": 4, + "training_batch_size_per_gpu": 1, + "rollout_batch_size_per_gpu": null, + "logprob_forward_micro_batch_size": 1, + "val_rollout_batch_size_per_gpu": 4, + "recompute_logprobs": true, + "shuffle_rollout": false, + "loss_type": "math_ppo_actor", + "loss_agg_func": "token-mean", + "kl_beta": 0.0, + "kl_penalty_type": "low_var_kl", + "ratio_clip_eps": 0.2, + "entropy_bonus": 0.0, + "calculate_entropy": false, + "clip_ratio_c": null, + "clip_ratio_low": null, + "clip_ratio_high": null, + "adv_type": "math_grpo", + "normalize_advantages": true, + "early_stop_imp_ratio": 5.0, + "use_valid_token_scale": false, + "sampling_params": { + "use_greedy": false, + "temperature": 1.0, + "top_k": 1000000, + "top_p": 1.0, + "repetition_penalty": 1.0, + "max_new_tokens": 27648, + "min_new_tokens": 1 + }, + "max_num_gen_batches": 1 + }, + "rollout": { + "group_name": "RolloutGroup", + "gpu_memory_utilization": 0.55, + "model_dir": "/home/weight/DeepSeek-R1-Distill-Qwen-1.5B-2layer/", + "model_arch": "qwen2.5", + "enforce_eager": false, + "distributed_executor_backend": "mp", + "disable_log_stats": false, + "detokenize": 
false, + "padding": null, + "eos": null, + "rollout_backend": "sglang", + "sglang": { + "attention_backend": "ascend", + "decode_log_interval": 4096, + "use_torch_compile": false, + "torch_compile_max_bs": 128 + }, + "vllm": { + "attention_backend": "FLASH_ATTN", + "enable_chunked_prefill": true, + "enable_prefix_caching": true, + "enable_flash_infer_sampler": true, + "max_num_batched_tokens": null, + "torch_profiler_dir": null + }, + "return_logprobs": false, + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "validate_weight": false, + "validate_save_dir": null, + "print_outputs": false, + "max_running_requests": 128, + "cuda_graph_max_bs": 128 + }, + "data": { + "type": "math", + "dataset_name": "boba", + "max_prompt_length": 1024, + "filter_prompt_by_length": true, + "rollout_batch_size": 16, + "val_rollout_batch_size": null, + "num_workers": 2, + "shuffle": true, + "validation_shuffle": true, + "seed": 1234, + "train_data_paths": [ + "/home/dataset/boba/AReaL-boba-106k.jsonl" + ], + "val_data_paths": [ + "/home/dataset/boba/AReaL-boba-106k.jsonl" + ], + "prompt_key": "prompt", + "image_keys": [ + "image" + ], + "answer_key": "answer", + "choice_key": "choices", + "solution_key": null, + "use_chat_template": true, + "lazy_loading": true + }, + "actor": { + "group_name": "ActorGroup", + "training_backend": "fsdp", + "mcore_gpt": true, + "spec_name": "decoder_gpt", + "enable_offload": true, + "checkpoint_load_path": null, + "global_batch_size": 32, + "micro_batch_size": 1, + "enable_dp_load_balance": false, + "calculate_flops": false, + "seed": 1234, + "model": { + "precision": "bf16", + "sharding_strategy": "full_shard", + "is_lora": false, + "seq_length": 28672, + "encoder_seq_length": 28672, + "model_path": "/home/weight/DeepSeek-R1-Distill-Qwen-1.5B-2layer/" + }, + "optim": { + "optimizer": "adam", + "bf16": true, + "fp16": false, + "lr": 2e-05, + "adam_beta1": 0.9, + "adam_beta2": 0.95, + "adam_eps": 1e-05, + "min_lr": 2e-06, + "weight_decay": 0.05, + "use_distributed_optimizer": true, + "overlap_grad_reduce": false, + "overlap_param_gather": false, + "optimizer_enable_pin": false, + "overlap_param_gather_with_optimizer_step": false, + "clip_grad": 0.8, + "loss_scale": 65536 + }, + "lr_sched": { + "lr_warmup_fraction": 0.01, + "lr_warmup_init": 0.0, + "lr_warmup_iters": 0, + "max_lr": 2e-05, + "min_lr": 0.0, + "lr_decay_style": "constant", + "lr_decay_iters": 10 + }, + "tokenizer": { + "tokenizer_model": "/home/weight/DeepSeek-R1-Distill-Qwen-1.5B-2layer/", + "use_fast": false, + "trust_remote_code": true, + "padding_side": "right" + }, + "fsdp": { + "forward_prefetch": true, + "limit_all_gathers": true, + "backward_prefetch": true, + "use_orig_params": true + } + }, + "reward": { + "group_name": "RewardGroup", + "use_reward_model": false, + "reward_type": "math", + "reward_scale": 5.0, + "reward_weights": { + "qa_accuracy": 1.0, + "think_format": 0.0, + "answer_format": 0.0 + }, + "tokenizer": { + "tokenizer_model": "/home/weight/DeepSeek-R1-Distill-Qwen-1.5B-2layer/", + "use_fast": false, + "trust_remote_code": true, + "padding_side": "right" + } + }, + "critic": { + "use_critic_model": false + } +} \ No newline at end of file diff --git a/logs-lxs/fsdp-sglang-512*8-16card-timeout/tensorboard/events.out.tfevents.1761898947.hostname-kjuul.3557129.0 b/logs-lxs/fsdp-sglang-512*8-16card-timeout/tensorboard/events.out.tfevents.1761898947.hostname-kjuul.3557129.0 new file mode 100644 index 0000000000000000000000000000000000000000..f39457f2038a58c16257dc7446862e1e7d7a70e5 GIT 
binary patch literal 88 zcmeZZfPjCKJmzw)uDnzr<9N$aiZ`h!F*8rkwJbHS#L6g0k4vW{HLp0oC@DX&C`GTh hG&eV~s8X-ID6=HBNG}znDn2bUCp8`-^3~??bO0E4Aj$v$ literal 0 HcmV?d00001 diff --git a/logs-lxs/fsdp-sglang-512*8-16card-timeout/tensorboard/events.out.tfevents.1761900889.hostname-kjuul.30866.0 b/logs-lxs/fsdp-sglang-512*8-16card-timeout/tensorboard/events.out.tfevents.1761900889.hostname-kjuul.30866.0 new file mode 100644 index 0000000000000000000000000000000000000000..ddcafb574647696a958f6dd3f987d13b63b45865 GIT binary patch literal 30524 zcmb82d0fof|Hmi$4DCuqNm)ZeX=O4I$&%f*B{hgPl|970$)0^_v+qMl^FGgW-k+D5rmiB@&p*q$riokEoiV_sAv`=NdWz$u=+Ibsw4=QIs=!g9 zVe&xLza)DpECTEF{LUq2cmYQPJ|KplEquNKkB$S^QtK zyl!3bN(9I+z5*08B_cR*l00jVNv{~fd(XxGMNfx$hpxN6U z0b-k3>M5B?OeM9HKy+kSSmcCQv)Dnkd^S5p&8e@xzPEoT3-J)m^|d7?lA5aP<&zX8 zG}~-+Q#$BIda0fI^2x^Se~QyIm)DV)Neq;i2Zc-wiiiywEjN2;mD1j(WyT&L%x9}7 z#1WwpqvX-@h+w(dwujH9uQnb1+EP7+SI7B3#M3l{oy1mRsh)kSkwLM+V*+DBr^*AP zYCJC z4bG`s;4l890c|BVk~$hNS{@S>84)Abp!t$RzWV2Se`~5n%c9J?iER_qw`eV?E2*tP zVqTm? zt)#WYK{Mc}=%C=((8!3u$WehYLE%wh@|ZxyjVr1ZHYHFV5h4#!yxQarFB2cdz@4{F zQokD9IW22UV7NReqP*Io{XZ|icTi6xW6{#D;(kliXnl#b#9T9xU(uk+O7e^7>1rV7 zEfpd|Rb#p}Y2VIXJ!aOrNj~CpE7hch#9U&i+zqXQq7*keIdEKno03tw!oetR$u?BseTIDo|50C6*WN^cP?^*67Bf#TG|bYT>kq=gvOZ1Cs1BC$Go{tIUjQS^ zsU>jM-Az{mssmC9L`8;$2LCd%MaNec!0F<#UJa>E7bV11&fsv;0N4-%H>fXHZV1(l zD4dRrZZ$eOC?qfFq9p z`Fy&*ujyE@&}7PLq!>Kv}Jz#o*S}T!Q)BnjhSx61Y|OC~<7J&W`P*4{piwh+?>v z@#xk^a4Yy2Av|tnOUt6ctr^D&)9F@G?X?+j+ONweK)|gP)kgY*TO&^r%5tkozv7YL zR$_sgsmyk(|Kr`xpvb;c1o67{taF`I0B@XTf#}wVaMOF>*3Uu$1>Fj5ES?5#Srie> z=hm1@1>?c3&S!~ZyY*y4+m7H?>^Y(sZZ%sJR1Mr(bDj_$w;~_D=nrm9yGWQ$w>oyM z-3Huxc8LH1w+8pKjsdrVt`N#{YxDU+3EWn|s}~Q-GmudgRtggBH1KNL^W9Ovp6kLO z2L)vMd6#~6YX#2+4zFo6tt51GU`L-gUzaDaOcC7Ws%0vVXv#-~@+>q%)I(NvLtzbg zD%iy#71PkJC3-$4rrk!u3O(lt)e4mlqIw`Vt0Z!lehfSB0%dOShHgSt=5m0Zf6c;o zzVwiqe?AY)AYBZXiE`%VN<*DS{%oaw#0wv09U$JwG5;bJ`R-2}0wO)&aHU9^Cw1*y zR3iJm_lAA2lWV`FJ5=su4XT}t;xmr;&adfs3!d`pz9*QSjp8>S^&LC4MFPy&^b?UV zqh?D3V*|DKNqb#P9stp^oYTSBh~6`*DQB_S1`Oy1T)J(({V z7dngcz7Z^xk;3QP`vC0KQejq|lAc-Jdjfsp!C%BOpKrT+{do_L`?r4+#M>t>Nk=>c zQ1%ZC#6Dqbbha(jHl&O|!9Fp)ZroMq6Mu+aYR)UZJ`sJRU;^}s`Fg~$`$Wd~>^X1% zt*%5AqfZPKy;}w+qg@q3c>09ptQT$J6lz|TFr9ru&uwKQY?l0L1PJtrr@c10!eQhm zCY04DvSxlA3T}PSFjd*lAT4``bOuF4HPr)Eb>MaD`h>DY0G2XAbjv*G}L1Gsg?h!7sPWFyO3 zfm=t63DfD8e?s63aBH6l0RnE7p0H>OZhbH%l;u{ZK>v2&mYtcJsm6Az^5k1xK#_*# z1o67{eNF6o0PELbf#_DZ(r+SgtG)$+f^HS=jxh(ftSt%Vb8A!^moMO!ffaFVx5g~4 z(G%P%NBwYeo&-F1A=(nil}z>34kIy7Km<{r;a}lZq2kOP|&T% z-49*{w?vHy=5wpy#hTmTR>Q``vE5p8`s6WiYjabg7;ZV7?ll(N`q_*S9=Gy;-t`5y zteX?2)2+7wZ?=M4{aX+q;8xg?UUR{%Pb~>$xn($M#bvmyl*TPg>X&Dy8qDl;CE4@S z)LDA~^J}65gY2|=Mu2y%rpsGGc3Qw;TG=V5EWjswd8J9POhw$~s%3)gw4!L5?8t$+ z(;+)mZb}oYn4L}~RPk}}3VjPJ)R7}pEA&ftY9H858Yo?K7s@<^15{@@lD1&_5q z&GrG4sf*#V(~r!XrCo5)Pc5)*0 zzh|dEIxHV>uLvrT(uQ6SDxfPnb?^4_2IQabZ3!02PFaqneSp<>5oXod>Cn|pr=d?& zbt9IUoqEh$-VKiXF75>J_KC_j->w31qz4PcJ`vla({!kl58JC%8bLnA3qccAvP{!l^$TKy#%;G5W;X6rUnE8CQD~!qX>GlD7AOQ)q2R!gThD zs8Lf!z-C!1BS4@}80@M%2@WHt&V;i1#JWxwoWZT78b-{128kMQC=e7$_f-!h=Fd(m z+NGxhxS|USM7L^*6VHKLE4vaX=+?A|0g2#NmLI`)=*qcS3mFidv$d0d7_8NtjNze5}WKfLo~n1PHjbEBp#JI+MPHvfT2o zm*WL)P3)&;By6`{O!|8yC~~AfLA>WyhP20R03`!iAiC9kviLB#)oBocf^PME;(s07 zn)n;Rd~OB4Oeh4mk_HpUcB`-bv#;RR+o41;+}iW!i<97%bT}bAZq;kMz!==>A4r%^ zx6BTwJOj7lMi3z2RwMEMJ%MxURuG{qx5nlfH2}A21gn{vY`1pjpYs7lj6(?Gb?cv; zV=@5EV! 
zq8M)785LI(-1;z%5FWSAzjdtzZhZ(NOs8A5F25@Vx4wrHAmCP9pX#CDRzMV?EVqIO zEOLU|Y8kFCQ)Z`H%?F1A>HX-zu|U|>RZF=dpi>59a2X3Y8B$&^w`LUg|!L2I0iDSD}dg5?CxMjYFD27{o z!&ilYTfb!x!sAxj^1-*kt$CS*>2zyw`~G_1)`ctr1l*clsr5H-%OaalmRpzm4{Zr< zdG1#;hHST56y-&LB6D*H;&rP-JwJTE{NMo=h;GGQ>M{}BdVY{VLAO?XDcuil*&HUA z&#kjVTwjA*e!0Z4-3sl|GZEYxeuOB7TkSeJT>!U!KS~IXTf=hBZU?t^9m7R_kd-w#)isi6O+1%x-WrFPVKbf=4CGheR$WCuKq+)imxhL{j zEU(%KR>&@mZbh|1zhtMvhVz}(I%J6`6 zN8h1uAv;~-n17L-zFh7#2}G)vPWPo0NfUsyv(tT3DI&;D(#?ea_w2N=L$PuE zB&a~>7J5CXfOdB3HKMU+-Shvu4f$umR)YDmQ|}NP?^w&z!-3tnO_)_=r}tHIj;((q zx($6IbqBG`?9_fh!>iCIj_o9fw@=tMIdK`l;$18d`$Rq0#dc8JZ@UQ;>=Suin*0uZ z!Z?FqzCLlatoI+#C+stc)7>ZDS8?$gTXSV096;T(h+_1KtDXz@!^tS$O9-P+DAvwv zuw#||a0*S|N0`n&k>XRb1~$u@Yyx=uM4?ye#ij;u7}*>kl+`DuhlK=zTQfC`vF>M( z0P{u%leXM41x3~$R1aj#pPgK-rd0!Q-ys&rxfLyrm&(#d>;$(?9VSrFE%An3JHf4M zxdijMb@oKHci`6JBgE-;E7{b=>u{ZJF5uRiqeL;>in=4k_Xf)H2w}LDVybYfuz@GI zRqHrm72H~=sN$sM-QvKl2KfZ=x|QNpx+m!~xV7vAp)9wW=1&<5Ze^ZSGbXy-vdekq zwqp5{`=Cf(0YS`jYx$(=uG^M|V>w zYXtMTWm~V&DRAr4b>eipHAdgX%W0~29GqK~ZW6_CYv|DiH=rj@L5WwqJzSl(G>OH}&Eq4iJx#hU`$}_mFTOZRmDYH{; zW_H@T&9kiU3SYn`f6{?Lb}IJh>YX|4!ac}Nc^sydotpRX_o>--+Gkj%AKc}tWrFOK zd@@EB-!1txWGCm(G_i`=sb!~1J|>@f#=r`NafE7xe#uUo+_y>hziVNNW##}?nYFT0 zpSK?2XC_($b59q;Wv6Xx4oX9IySYPlvj0N2@-MQ}?Nd>`L8M_Et`tcVfV8vIF`2J- zlkdmc?391)kni*fQTY2#3rc9(?Cg}k{QNv-{E5j zY?pt8S@ru)muDv8@5QbXt<&sfW_HTn+NTN}_qX&2V)lvhnSOoc_XWVpm02LyC(6GY z!Y%a!+T~D%K*2t7`lqoQ^ogNW3Fhk)26iQTp-;@OMx5?GQGSoD76y!f1L$~lq8NQ* zQ&R4HI2nJ`AcWB;%11gp%fk{*p=J`oD)fo+Dz-DdgTEZuza{~^eL^IvQUAhCIE+jU z2xaw&drdO6vXh1})BOw*J1WI3zRsSVpvX=`^+0C)+397o-0A6U2LN^=?rEzj4URh6QqN)vcs(>%y?!;8t*b0tMZ2E*>-x z+)A(|n9r>RJ@*X0%rS;&p46p_K(l*~y6oa&GNc@M6-5=3Y^>73t$v;a z2)NZDGG`;Wb;*lRmRl>9G{#mR9CGivGCNr?vy*d%r_Vq4@iz-U>B1m88CUM@ozw5c z9>`AZ)Xhn<~7n~(WUcr>ON%-CWakuc+m*~u&6_qN{U&!CDQ<#$C44kPrxXD7D~VJ~u} zP=Uwc^mt?;@Ev6ZlGsdIDnF;62<5fJu}YTg+8%)8X-J=BJ=o*mv9Pgh$E~*pD3?lFYBVE zuvs=tCqST2EVj!T4u_F`JfW;Ukz2Dk2HcvXVJvk&gH$uW>=ya(QZgu#FiSmXn0hK~H51#)ghRE?Kbzx511`*Q3L0tMYNc5Sd7+`5-QFrQnkZ$&!7Gsycn#IfC) zxF*97+^Rd5D27|+zH24mR>OIO@VHen^~F+ftNDDwD!4UKQN=prCVGQgT^A4_;FgV* zG5)?&v&Dq6+)6&T7{B9YpN6s0?bes{u5KSn-aP=ft|X}kvf_8^*~H!90RFjz1#)g> zD4xv1+r|G5=T_-b0tMYl4V3wUTUC}5%;(m+#!EYaTQyb?$9C&c$*KL|mi|hj7;bgB zU2O}vRc{p`JZ`-^@V*(i)nqkc72GgsmurL(@G8CS=0~Q@?J+UpIhGFhhb0bx1Kn*TRZl61%g|@Z6J!_R!C^mjnzCExV3mQ0RnDyxbx@~xb&8_nq!@m#dZuvQu$R zglze+S7+dTC)4#bv5MJg-aja!?n;WhcYKhoxWN?->W#sm}(wm4A_)He8zTi6U{hQY1|P(#}q$vPLfk zY=oVBnnRU4S%X%{PHs8Ag>UXBKz7pGLeu8SPEH4WS2%XWZ?{O>MkLHwD?2GZv-kQ` z+DY&7;H3CbW~V#b3H|TcX>x}nmX<%E0zY=p>p=yyvs25f&U;ff^|%E2$8s0JeA%h| zGkX&Z61{+R+%3$iKeKoG58HU?6KykyWoD-n>D~+BxR+-V#M>t>*{5a#n2^N+xjs?; z)ZcGk@f@h_*1ZG@_K9TsyCb1b6zn6IuTR7mAD;uyApgrIPIsTkE~uW@wBDjV&?oeB zh+^~!ADb)7;biQ2fDlHX$Sz3CbB*hN4Njpk2MMdtC(5gMrH0cs*xrv05y0CgF1Fv7 zI|IL?z2GRJtUj^Y@pe6Mt6iR&vDW{lky4XbE`nq6F;*z?F4bU-I7?icrC~@ zOa-@Qog|9k)~dhOjRv>o6%fL3OJb4Z6*;Z_LvU-sDZ(naRi>!og(22wz^%2X3E*`r z+sm^1z8}yWOA85Qxpl~>F@DxKwn)v`=yvP;TCq? 
z@;MgBxwT#rFO7Mq9|~^0I!~aWTcVN8vcavg3k37IrB^AgGn`x1FA=BPtrb=-UY{Cg znuA-lE)&IY>*t(mO~EbGD}*rIT49yrHQ8#}7jVn$Dq$7eDpgdmV9oT;;FkS00(jk8 z@AbH#Bm>-q4ivy&||I}JVJd3WQL-hee}rUQfQ)ULIUcbmogOCUQ% zaF|wh%K6RRXVJyR7O+fN+~uleg6#D6W(!$Ev3UYyr#Bo@F+0s3r04T@(y(Ayq2>;B zE22d|wx9_e+qS-aFE` z?Cg}EaMJf;@=Sc+>9sSFFk`Lkr1;F<(k-Up-pb^p&QAT>5&GY=lT(M-Pk-az^4RN2 zuLl*-&Q9fXIN2~X2l9{DonXG~RL&*_8nptpkB2a;e&4D4^gR5VYg@dCWo9S)A4k5y zabKkaLA-q;#dfwIfFq?Wkn0oWPyO|8&!`D@9rY$quursheYyntgoBJ=zCKa>a?4EU z6El2>)7>Y^SHrTXn=c$d*Et^ME`Bv!*T(ESY3!R7CEE$2I!gCduD zs0V7m|K7l|>3U@V*6+mvIk!$K{^xOxQC=MYC(Sf@hFx zg9s3CtBz#|y0vsLp)9wo+n<{cZXF+@X6$skCGv{Qe$%WwejfD9P=a{Rt)$s=)`456 zhOt1-tvQM(v*8mS#DZIe!wD30Yg6?tY2a2tAi;cYof^Bh0l2k)1aWM)rW$%&1-Cv0 z5yfz;xpj9-aH~czAv|u~?YD3Zxb>66D!8>+QN{H5k?p~)Zy_{}fLmWZK4rs0l;0>q zS#E6^Zu|h;>OWe|*z0!7_N2t^m1O-XP^9-5f_UAUEov(Tuv;h#+~NmvE9{!&!2(C6V3aI0?=0RnDWcYGZNZp|N0D9f!TO~(!TfB0;(X#fBK literal 0 HcmV?d00001 diff --git a/logs-lxs/fsdp-sglang-512*8-16card/tensorboard/config.json b/logs-lxs/fsdp-sglang-512*8-16card/tensorboard/config.json new file mode 100644 index 000000000..3ffdafacb --- /dev/null +++ b/logs-lxs/fsdp-sglang-512*8-16card/tensorboard/config.json @@ -0,0 +1,205 @@ +{ + "cluster": { + "num_nodes": 1, + "component_placement": { + "actor,rollout,reward": "all" + } + }, + "runner": { + "task_type": "reasoning", + "logger": { + "log_path": "./logs/fsdp-sglang-512*8-16card", + "project_name": "rlinf", + "experiment_name": "fsdp-sglang-512*8-16card", + "logger_backends": [ + "tensorboard" + ] + }, + "max_epochs": 5, + "max_steps": -1, + "val_check_interval": 1, + "save_interval": 50, + "seq_length": 28672, + "enable_dynamic_batch_size": false, + "max_tokens_per_mbs": 28672, + "resume_dir": null, + "experiment_name": "fsdp-sglang-512*8-16card", + "output_dir": "./logs" + }, + "algorithm": { + "group_size": 8, + "n_minibatches": 4, + "training_batch_size_per_gpu": 1, + "rollout_batch_size_per_gpu": null, + "logprob_forward_micro_batch_size": 1, + "val_rollout_batch_size_per_gpu": 4, + "recompute_logprobs": true, + "shuffle_rollout": false, + "loss_type": "math_ppo_actor", + "loss_agg_func": "token-mean", + "kl_beta": 0.0, + "kl_penalty_type": "low_var_kl", + "ratio_clip_eps": 0.2, + "entropy_bonus": 0.0, + "calculate_entropy": false, + "clip_ratio_c": null, + "clip_ratio_low": null, + "clip_ratio_high": null, + "adv_type": "math_grpo", + "normalize_advantages": true, + "early_stop_imp_ratio": 5.0, + "use_valid_token_scale": false, + "sampling_params": { + "use_greedy": false, + "temperature": 1.0, + "top_k": 1000000, + "top_p": 1.0, + "repetition_penalty": 1.0, + "max_new_tokens": 27648, + "min_new_tokens": 1 + }, + "max_num_gen_batches": 1 + }, + "rollout": { + "group_name": "RolloutGroup", + "gpu_memory_utilization": 0.55, + "model_dir": "/home/weight/DeepSeek-R1-Distill-Qwen-1.5B-2layer/", + "model_arch": "qwen2.5", + "enforce_eager": false, + "distributed_executor_backend": "mp", + "disable_log_stats": false, + "detokenize": false, + "padding": null, + "eos": null, + "rollout_backend": "sglang", + "sglang": { + "attention_backend": "ascend", + "decode_log_interval": 4096, + "use_torch_compile": false, + "torch_compile_max_bs": 128 + }, + "vllm": { + "attention_backend": "FLASH_ATTN", + "enable_chunked_prefill": true, + "enable_prefix_caching": true, + "enable_flash_infer_sampler": true, + "max_num_batched_tokens": null, + "torch_profiler_dir": null + }, + "return_logprobs": false, + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "validate_weight": false, + "validate_save_dir": null, 
+ "print_outputs": false, + "max_running_requests": 128, + "cuda_graph_max_bs": 128 + }, + "data": { + "type": "math", + "dataset_name": "boba", + "max_prompt_length": 1024, + "filter_prompt_by_length": true, + "rollout_batch_size": 512, + "val_rollout_batch_size": null, + "num_workers": 2, + "shuffle": true, + "validation_shuffle": true, + "seed": 1234, + "train_data_paths": [ + "/home/dataset/boba/AReaL-boba-106k.jsonl" + ], + "val_data_paths": [ + "/home/dataset/boba/AReaL-boba-106k.jsonl" + ], + "prompt_key": "prompt", + "image_keys": [ + "image" + ], + "answer_key": "answer", + "choice_key": "choices", + "solution_key": null, + "use_chat_template": true, + "lazy_loading": true + }, + "actor": { + "group_name": "ActorGroup", + "training_backend": "fsdp", + "mcore_gpt": true, + "spec_name": "decoder_gpt", + "enable_offload": true, + "checkpoint_load_path": null, + "global_batch_size": 1024, + "micro_batch_size": 1, + "enable_dp_load_balance": false, + "calculate_flops": false, + "seed": 1234, + "model": { + "precision": "bf16", + "sharding_strategy": "full_shard", + "is_lora": false, + "seq_length": 28672, + "encoder_seq_length": 28672, + "model_path": "/home/weight/DeepSeek-R1-Distill-Qwen-1.5B-2layer/" + }, + "optim": { + "optimizer": "adam", + "bf16": true, + "fp16": false, + "lr": 2e-05, + "adam_beta1": 0.9, + "adam_beta2": 0.95, + "adam_eps": 1e-05, + "min_lr": 2e-06, + "weight_decay": 0.05, + "use_distributed_optimizer": true, + "overlap_grad_reduce": false, + "overlap_param_gather": false, + "optimizer_enable_pin": false, + "overlap_param_gather_with_optimizer_step": false, + "clip_grad": 0.8, + "loss_scale": 65536 + }, + "lr_sched": { + "lr_warmup_fraction": 0.01, + "lr_warmup_init": 0.0, + "lr_warmup_iters": 0, + "max_lr": 2e-05, + "min_lr": 0.0, + "lr_decay_style": "constant", + "lr_decay_iters": 10 + }, + "tokenizer": { + "tokenizer_model": "/home/weight/DeepSeek-R1-Distill-Qwen-1.5B-2layer/", + "use_fast": false, + "trust_remote_code": true, + "padding_side": "right" + }, + "fsdp": { + "forward_prefetch": true, + "limit_all_gathers": true, + "backward_prefetch": true, + "use_orig_params": true + } + }, + "reward": { + "group_name": "RewardGroup", + "use_reward_model": false, + "reward_type": "math", + "reward_scale": 5.0, + "reward_weights": { + "qa_accuracy": 1.0, + "think_format": 0.0, + "answer_format": 0.0 + }, + "tokenizer": { + "tokenizer_model": "/home/weight/DeepSeek-R1-Distill-Qwen-1.5B-2layer/", + "use_fast": false, + "trust_remote_code": true, + "padding_side": "right" + } + }, + "critic": { + "use_critic_model": false + } +} \ No newline at end of file diff --git a/logs-lxs/fsdp-sglang-512*8-16card/tensorboard/events.out.tfevents.1761820416.hostname-kjuul.3651077.0 b/logs-lxs/fsdp-sglang-512*8-16card/tensorboard/events.out.tfevents.1761820416.hostname-kjuul.3651077.0 new file mode 100644 index 0000000000000000000000000000000000000000..095a18e7191e570ba7f55d97894a93deca4aa0aa GIT binary patch literal 88 zcmeZZfPjCKJmzxNhpQhr?{LdeiZ`h!F*8rkwJbHS#L6g0k4vW{HLp0oC@DX&C`GTh hG&eV~s8X-ID6=HBNG}znDn2bUCp8`-QqPzF000%*AOrva literal 0 HcmV?d00001 diff --git a/logs-lxs/fsdp-sglang-512*8-16card/tensorboard/events.out.tfevents.1761820811.hostname-kjuul.84956.0 b/logs-lxs/fsdp-sglang-512*8-16card/tensorboard/events.out.tfevents.1761820811.hostname-kjuul.84956.0 new file mode 100644 index 0000000000000000000000000000000000000000..1ae80ab661ce91685b07ef65329a99aa18ce1d15 GIT binary patch literal 88 
zcmeZZfPjCKJmzw8?Ruhg!Qqyp6mL>dVrHJ6YguYuiIq{19+yr@YF=@EQBrM*}QKepaQD#YMkzOiDReV}zPHH?vM96LG2>=-tAL{@B literal 0 HcmV?d00001 diff --git a/logs-lxs/fsdp-sglang-512*8-16card/tensorboard/events.out.tfevents.1761821088.hostname-kjuul.261840.0 b/logs-lxs/fsdp-sglang-512*8-16card/tensorboard/events.out.tfevents.1761821088.hostname-kjuul.261840.0 new file mode 100644 index 0000000000000000000000000000000000000000..96fc0177f32d83e61b0cc7e0b80ff17318f9e64d GIT binary patch literal 88 zcmeZZfPjCKJmzw8Zj;Zr;Bd=PiZ`h!F*8rkwJbHS#L6g0k4vW{HLp0oC@DX&C`GTh hG&eV~s8X-ID6=HBNG}znDn2bUCp8`-vNJ^Q5&-oWAHM(q literal 0 HcmV?d00001 diff --git a/logs-lxs/fsdp-sglang/tensorboard/config.json b/logs-lxs/fsdp-sglang/tensorboard/config.json new file mode 100644 index 000000000..5299b4cdb --- /dev/null +++ b/logs-lxs/fsdp-sglang/tensorboard/config.json @@ -0,0 +1,205 @@ +{ + "cluster": { + "num_nodes": 1, + "component_placement": { + "actor,rollout,reward": "0-1" + } + }, + "runner": { + "task_type": "reasoning", + "logger": { + "log_path": "./logs/fsdp-sglang", + "project_name": "rlinf", + "experiment_name": "fsdp-sglang", + "logger_backends": [ + "tensorboard" + ] + }, + "max_epochs": 5, + "max_steps": -1, + "val_check_interval": 1, + "save_interval": 50, + "seq_length": 28672, + "enable_dynamic_batch_size": false, + "max_tokens_per_mbs": 28672, + "resume_dir": null, + "experiment_name": "fsdp-sglang", + "output_dir": "./logs" + }, + "algorithm": { + "group_size": 8, + "n_minibatches": 4, + "training_batch_size_per_gpu": 1, + "rollout_batch_size_per_gpu": null, + "logprob_forward_micro_batch_size": 1, + "val_rollout_batch_size_per_gpu": 4, + "recompute_logprobs": true, + "shuffle_rollout": false, + "loss_type": "math_ppo_actor", + "loss_agg_func": "token-mean", + "kl_beta": 0.0, + "kl_penalty_type": "low_var_kl", + "ratio_clip_eps": 0.2, + "entropy_bonus": 0.0, + "calculate_entropy": false, + "clip_ratio_c": null, + "clip_ratio_low": null, + "clip_ratio_high": null, + "adv_type": "math_grpo", + "normalize_advantages": true, + "early_stop_imp_ratio": 5.0, + "use_valid_token_scale": false, + "sampling_params": { + "use_greedy": false, + "temperature": 1.0, + "top_k": 1000000, + "top_p": 1.0, + "repetition_penalty": 1.0, + "max_new_tokens": 27648, + "min_new_tokens": 1 + }, + "max_num_gen_batches": 1 + }, + "rollout": { + "group_name": "RolloutGroup", + "gpu_memory_utilization": 0.55, + "model_dir": "/home/weight/DeepSeek-R1-Distill-Qwen-1.5B-2layer/", + "model_arch": "qwen2.5", + "enforce_eager": false, + "distributed_executor_backend": "mp", + "disable_log_stats": false, + "detokenize": false, + "padding": null, + "eos": null, + "rollout_backend": "sglang", + "sglang": { + "attention_backend": "ascend", + "decode_log_interval": 2000, + "use_torch_compile": false, + "torch_compile_max_bs": 128 + }, + "vllm": { + "attention_backend": "FLASH_ATTN", + "enable_chunked_prefill": true, + "enable_prefix_caching": true, + "enable_flash_infer_sampler": true, + "max_num_batched_tokens": null, + "torch_profiler_dir": null + }, + "return_logprobs": false, + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "validate_weight": false, + "validate_save_dir": null, + "print_outputs": false, + "max_running_requests": 64, + "cuda_graph_max_bs": 128 + }, + "data": { + "type": "math", + "dataset_name": "boba", + "max_prompt_length": 1024, + "filter_prompt_by_length": true, + "rollout_batch_size": 16, + "val_rollout_batch_size": null, + "num_workers": 2, + "shuffle": true, + "validation_shuffle": true, + "seed": 1234, + 
"train_data_paths": [ + "/home/dataset/boba/AReaL-boba-106k.jsonl" + ], + "val_data_paths": [ + "/home/dataset/boba/AReaL-boba-106k.jsonl" + ], + "prompt_key": "prompt", + "image_keys": [ + "image" + ], + "answer_key": "answer", + "choice_key": "choices", + "solution_key": null, + "use_chat_template": true, + "lazy_loading": true + }, + "actor": { + "group_name": "ActorGroup", + "training_backend": "fsdp", + "mcore_gpt": true, + "spec_name": "decoder_gpt", + "enable_offload": true, + "checkpoint_load_path": null, + "global_batch_size": 32, + "micro_batch_size": 1, + "enable_dp_load_balance": false, + "calculate_flops": false, + "seed": 1234, + "model": { + "precision": "bf16", + "sharding_strategy": "full_shard", + "is_lora": false, + "seq_length": 28672, + "encoder_seq_length": 28672, + "model_path": "/home/weight/DeepSeek-R1-Distill-Qwen-1.5B-2layer/" + }, + "optim": { + "optimizer": "adam", + "bf16": true, + "fp16": false, + "lr": 2e-05, + "adam_beta1": 0.9, + "adam_beta2": 0.95, + "adam_eps": 1e-05, + "min_lr": 2e-06, + "weight_decay": 0.05, + "use_distributed_optimizer": true, + "overlap_grad_reduce": false, + "overlap_param_gather": false, + "optimizer_enable_pin": false, + "overlap_param_gather_with_optimizer_step": false, + "clip_grad": 0.8, + "loss_scale": 65536 + }, + "lr_sched": { + "lr_warmup_fraction": 0.01, + "lr_warmup_init": 0.0, + "lr_warmup_iters": 0, + "max_lr": 2e-05, + "min_lr": 0.0, + "lr_decay_style": "constant", + "lr_decay_iters": 10 + }, + "tokenizer": { + "tokenizer_model": "/home/weight/DeepSeek-R1-Distill-Qwen-1.5B-2layer/", + "use_fast": false, + "trust_remote_code": true, + "padding_side": "right" + }, + "fsdp": { + "forward_prefetch": true, + "limit_all_gathers": true, + "backward_prefetch": true, + "use_orig_params": true + } + }, + "reward": { + "group_name": "RewardGroup", + "use_reward_model": false, + "reward_type": "math", + "reward_scale": 5.0, + "reward_weights": { + "qa_accuracy": 1.0, + "think_format": 0.0, + "answer_format": 0.0 + }, + "tokenizer": { + "tokenizer_model": "/home/weight/DeepSeek-R1-Distill-Qwen-1.5B-2layer/", + "use_fast": false, + "trust_remote_code": true, + "padding_side": "right" + } + }, + "critic": { + "use_critic_model": false + } +} \ No newline at end of file diff --git a/logs-lxs/fsdp-sglang/tensorboard/events.out.tfevents.1761816870.hostname-kjuul.2287170.0 b/logs-lxs/fsdp-sglang/tensorboard/events.out.tfevents.1761816870.hostname-kjuul.2287170.0 new file mode 100644 index 0000000000000000000000000000000000000000..af851bb2db014e9130f0d3684629ba52fb343229 GIT binary patch literal 88 zcmeZZfPjCKJmzw0yY2Qo<8aGSiZ`h!F*8rkwJbHS#L6g0k4vW{HLp0oC@DX&C`GTh hG&eV~s8X-ID6=HBNG}znDn2bUCp8`-@;^*J4FCceAIAUy literal 0 HcmV?d00001 diff --git a/logs-lxs/fsdp-sglang/tensorboard/events.out.tfevents.1761817980.hostname-kjuul.2660597.0 b/logs-lxs/fsdp-sglang/tensorboard/events.out.tfevents.1761817980.hostname-kjuul.2660597.0 new file mode 100644 index 0000000000000000000000000000000000000000..ebbf405436fdad78db6887e766b37f129c934669 GIT binary patch literal 3420 zcmai$du$X{6o;o2(7B2f%Ivm0=rTk4W*P)DIiFo5D@>sVY+*FS(%*~XQsQ| zYPCqD5d#St3`ojDf}{itB#6dpAjM);Y?^9IAW-_Y0qR-7v11VsH>JLDdBtwal zA?x8(j-hs6$V+f#9wQ+6`~N{oSa7o;==W_@6=r-BU$~#2xicLlZE#(gKsT7sNRR|* zJxNia#H^{>S#-Q@bLDtcobYDH0~$1<1REFyhUvv>v~R!QV?ue~6*?EbFvfv^$#-g3 z(o0Q%0mW1MuKOIdKl71$mBF0ntvg$ zYj*}JFCQ`Q5#3^z+W`Zf)5|$ekP}qS2boAjnwRA}ouo7>y}4ND73H5!LEzTW=a$h0Cg=cTf!zdUs7RuqKohMh zX(|ry-TLSA2OX*8ZC^H?52BS<=6;+PMT%@`9y|;=^5NAEyGSFFNef+HA7PWA# 
z$pdGH_AcI1fC4YI6h`O?zanr3m=3bdK(EZXRlg{(qL)>;fW$+E)f(5P#fMoac%Vmn znxjCR>R3;7&bOkc5uM}Ls;~jb3DId&i}v>QWTS(8zI|seUDJlpaUdPo%!BktxpJLs zI{M{Ng1me&9_dG!E1GgApv)8dLod?pe;_gi*uZf8Hn=!RYqXr*#Oq-nIA1@Wg^;@T zlZ)t??FbnI(m=9-xOuKFdT0RaRW1=nfMd4N}2K1JyQ{_7cM^FDg+TC~8m3%94qQDwfT1|x@r?J7NibHBJX$i zJDH>Q%57r1)R~a6U3BQl@P&s{5@YUZXkAH!EFr2wW&E}#QVfvTG)Z%SU$DBopFk=@Z~$0vo`9&jAk2gFIRM8 z%EGC}>(k~C7i{fajKr|2Gyl<1!red@#s>7tcBZ(RxRE9IFc8PAb9I#)h`V*F8>3ck zb>}bsjd1I74`K#cyOnD%t0fe_Ay4S+&cTos)K}EZ}cH1(b}zp*PF?Yq9B!{ zf7~V|X2*1E(f;DVh{mOWJbH9j+ParZa=~c0NpE67wzX)h zZLwX7#qR1JtKD@gwbezpoYqCHX!THw7Vxo@1=N6G1$9AQV!_R0GU3DAWS2iV^Sj^7 zH^2F2z9b1ue`C9o_7jZ_tvOf3&51#tIUKEEqouDgtajOg7o*9!T87oAxEvMD$)h!! z^YdwLOLPIJ&@x;!V;Du{DwGUm`cL6W|EwA4yZ25lenhw*{p>x%cO!$y0Mc8l$Y)~I z9HXW=hLY1-S|~_rUKTr=G9N(w&qA6ltc)sP6nPu88sYoy#p1%qx3-6%NZBzrrxU5i zP$VDHizJMZG+IV23~f3u31mg()6igt-}ej>d(4CVNk7ulI9O{~w}QzRBhG@{aRm^TwauNbSfI3nX)o~nD%!x8$Z&A^e~NQ z3iK%y_9QjJU*-Cxq5agp*-pAewzl4y~1KpfrjihEg*eC07dt zf?~-V^4_;+BY4SEN!!RY8W$m_LW9yZKcjR0-;aU{>j9NlW2({WH9MxMzty}t;C$mb!qIo zZqvYAGtM9m%#|~ypVG)w`r!IS9z6SMKt$7W6!51;aTl?q27wF7`DCOyU@k|?vo9;q1#q{N`;y-r_6|*F8L@H zV05E(OwGtCj@GAIj}<2*?5_ML$mjlR2s7r>6ivQa?}8!ZI8-1I&AYg6CYq=wy+L7BbRPkx0%-))Dy2d;Zf1*CNyk8SS^NM&rs`sZ zESW2Nx#|cw5R!*zxbZ-k7NU1r#l_@tw47p9T)r^_2dLyWP-Lh<1e%JBF^&mTa>6e! z99}Z)k`9;SJY*ug93pu=iVpg82bkXr7RaVaeuh4;fdJg;#0NsR9M5e#1l-!{j6u6@J-#R`0B-%m6~ngN zQm3tb2e{?qjyZm}3JUh_0&c|-nBsBkEB3vGz^!x=BR1SRTv#RqZuNR#%&J@7p&<-# zE6x)GcHCN6Q`-RCnl~Avd~W^5KGz;<6g z5V{q9_gV{ZtH%$6cHR2o!;|&EEl+<8+j48}H+>1fEmsld_}%(LRhtU9_3J=P@wgTI z{jRmZtrI~QvEf$YjO6ctTP?vDv+CBZy1feE*3c9T*m29XY||s)RzoO8`P|C=bNmQc zR!H^j{e~XM%bB+e8osei`O4#40J~xh1AWkv<&omqUe4*@Z4i12N1McOkvv;}##8_F46BUxISu@vl(<7qr6(+(Me zm>%T9Z%gtMj#5R}L7L+%fGN!e0Kb2QYi^u)_w zACf^dTA2T#D+X`y;{f zf$dY@0U3zBjK2pmfVX*EhxF3&HwWjU$@CwqlMo+iH_n!Fv&ANTy zp1AWb4BNUVDt1(E1NVfk33L4S1i3OZ3lyNbW=!$i6WP_Re*%^9cP$vPaZl85yj%^c zkg^qHR_}>Jbapv7EL6V*Hr~_bTJ0A$$N1%^;1Gl;?m{qq1F17dqxBmSzj$_9yB9u)7 zZe8leD4$!o!LJqpw^Dl$=)qV`vacfqQ&K0=jH;7R_w_Fc$Zv(eth7jY%@0M<4Ul&kh z#xRC#x|MnGoCtu=Kjs6WTP@w$3DB((4BB<8q$J1(y7dIZw%oecrs;!j{en4uw{D2G zMgg~iMlr?XR%zWEe!wlKF^t%7YokX-6L71~f?0K|xyR`aaI5Vpj$_BI@@yw(;MOz; Rov9CUw6y0le0;-Eqm87U|ooyugnxo|M=Wrjcb?(dy@&;8wV z?@6S<^!Fk=ayQ}K;MmY2xe+pP>~dBrSNVspq*dDBz{URLbS=_#$wFOU-;D-wb+? zZlt3bWND3>QE6y1x=^Nz$jIO7?v2m^wL>&9atoTUE9pW`Fr!*VD^*}_TzN#RtIyxT zTvvU8I51b%n0`tV&*+2e=eqA48<4hh4hs0gw~`K`Z5smLBHtu^%mEWvX}ng>s3<0Z z(nytR1+Ag<3)g2$kxtPn87(uUY)%Jl^9xwHQx^IfSUKKS4W*=|Dm^L?#Cpf}`=E&& z-J11;n3;`GchZ$~Fi&JCDorzzt#0$P0J&_3HBxShx##WPRxcFu_}+MAGgg8Q0pG-MS!V58UZz`F@XwJR9ts3=t6(i z8nB#vL8(T0ITg~*sYZKAjY)Iys>7_@3r z)sk38P?x-mFwEyud%ydAU}5B8%yAoacuKtuIG4+cF~#8&*--2W4%mt#7~!$%`p3`n zfV=Ebj9K)mzu(RcoXEAuFkr>3wrzf0;B0jr$0(OuPjxS50Jo-;B4#AFTQgN@4nUEa zConWj5m=@6G-$tn!-x^-*mtN{Qvp5p?cTbQHrS>egTqbRWN6Wh zPXt3R^rvwvLLJ)vwi~<(E)F6ucSEtt^+ADW)AV1Tj6xMZ(XcBn6pQ zpMo?84+5q%8vy+N6$y^$W`>1#6$6H`#0>UC@7ul=l2znuBSud=WvuH$qHK`)AG)G< zmYls0M0zxc8$~j|)Q9y&lBc1wu6um|PO@hiPKog(o6+H&(WYo!g!s^i?ckMPtHUsN zZ`2?ukMP_&`XrcfZzU$dj9>1KoWd^6m7rXCJNE4)rk=s*|KA^}=Zu_}2Qv_M7Jm<9 z0B`e<4(Zlp^2u~GnZBdV;l!dU3|s4wYUl^n!S$bf2Aszh}Hxjakn3?e>Vr z8$r2O)L@ABp0J}oIRU`UwOrt^d!oqw9jez+-4k0X7Jdcp ziDUJcM zH)FucJ>j-t+j&roHnd=r>z+8Hj(H5;Af0Ar6!#m1I?!1S6zRT%0*xBp0>Sq!TU8ni zz@IO3fzT~&KyE&8tFIMJNPQ7*S) z20ruzZk@V;m{+;o^3FXJ2^2Yh6GOapt0QC9Y5>;V;sT*tbx!vyfm=;KV$iBvS03g) z18z0_1H*i7&4_9o0B*J1#vHd>)qYR50Jj?2Fva0kh;Vu?aI5(aMtIy34s!W)Z<#OwI>*im8TkqaS%xG@6vhJ;Y6)3W#14F!S zxm^7B8UQZq torch.nn.Module: use_triton = cfg.get("use_triton", True) - assert torch.cuda.is_available(), "CUDA is not available." 
+    assert torch.npu.is_available(), "NPU is not available."
 
     local_rank = int(os.environ.get("LOCAL_RANK", 0))
-    device = torch.device(f"cuda:{local_rank}")
+    device = torch.device(f"npu:{local_rank}")
 
     model_config = AutoConfig.from_pretrained(
         cfg.model.model_path,
diff --git a/rlinf/hybrid_engines/sglang/sglang_0_5_2/__init__.py b/rlinf/hybrid_engines/sglang/sglang_0_5_2/__init__.py
new file mode 100644
index 000000000..5b365ea1e
--- /dev/null
+++ b/rlinf/hybrid_engines/sglang/sglang_0_5_2/__init__.py
@@ -0,0 +1,13 @@
+# Copyright 2025 The RLinf Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/rlinf/hybrid_engines/sglang/sglang_0_5_2/io_struct.py b/rlinf/hybrid_engines/sglang/sglang_0_5_2/io_struct.py
new file mode 100644
index 000000000..960d40eb0
--- /dev/null
+++ b/rlinf/hybrid_engines/sglang/sglang_0_5_2/io_struct.py
@@ -0,0 +1,59 @@
+# Copyright 2025 The RLinf Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional
+
+
+@dataclass
+class TaskMethodInput:
+    method_name: str
+    args: List[Any] = field(default_factory=list)
+    kwargs: Dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class TaskMethodOutput:
+    method_name: str
+    result: Optional[Any] = None
+
+
+@dataclass
+class OffloadReqInput:
+    pass
+
+
+@dataclass
+class OffloadReqOutput:
+    pass
+
+
+@dataclass
+class SyncWeightInput:
+    pass
+
+
+@dataclass
+class SyncWeightOutput:
+    pass
+
+
+@dataclass
+class SyncHFWeightInput:
+    pass
+
+
+@dataclass
+class SyncHFWeightOutput:
+    pass
diff --git a/rlinf/hybrid_engines/sglang/sglang_0_5_2/sgl_engine.py b/rlinf/hybrid_engines/sglang/sglang_0_5_2/sgl_engine.py
new file mode 100644
index 000000000..e8e05c88b
--- /dev/null
+++ b/rlinf/hybrid_engines/sglang/sglang_0_5_2/sgl_engine.py
@@ -0,0 +1,363 @@
+# Copyright 2025 The RLinf Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
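The `TaskMethodInput`/`TaskMethodOutput` pair defined in `io_struct.py` above is a generic call-by-name envelope. A minimal sketch of the kind of dispatch it enables; the `dispatch` helper here is illustrative, not the actual RLinf scheduler loop:

# Illustrative only: call an arbitrary engine method by name and wrap its result.
from rlinf.hybrid_engines.sglang.sglang_0_5_2.io_struct import (
    TaskMethodInput,
    TaskMethodOutput,
)


def dispatch(engine, req: TaskMethodInput) -> TaskMethodOutput:
    """Resolve req.method_name on the engine and return the wrapped result."""
    method = getattr(engine, req.method_name)
    return TaskMethodOutput(
        method_name=req.method_name,
        result=method(*req.args, **req.kwargs),
    )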
+
+import asyncio
+import atexit
+import logging
+import multiprocessing as mp
+import os
+import random
+import signal
+import threading
+import time
+from typing import Dict, Optional, Tuple
+
+import uvloop
+import zmq
+from omegaconf import DictConfig
+from sglang.srt.entrypoints.engine import Engine as _Engine
+from sglang.srt.managers.data_parallel_controller import (
+    run_data_parallel_controller_process,
+)
+from sglang.srt.managers.detokenizer_manager import run_detokenizer_process
+from sglang.srt.managers.multi_tokenizer_mixin import MultiTokenizerRouter
+
+# from sglang.srt.managers.scheduler import run_scheduler_process
+# from sglang.srt.managers.tokenizer_manager import TokenizerManager
+from sglang.srt.managers.template_manager import TemplateManager
+from sglang.srt.server_args import PortArgs, ServerArgs
+from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
+from sglang.srt.utils import (
+    assert_pkg_version,
+    configure_logger,
+    get_bool_env_var,
+    get_zmq_socket,
+    is_cuda,
+    kill_process_tree,
+    launch_dummy_health_check_server,
+    prepare_model_and_tokenizer,
+    set_prometheus_multiproc_dir,
+    set_ulimit,
+)
+
+from rlinf.scheduler import WorkerAddress
+from rlinf.utils.placement import ComponentPlacement
+
+from .io_struct import OffloadReqInput, SyncHFWeightInput, SyncWeightInput
+from .sgl_scheduler import run_scheduler_process
+from .tokenizer_manager import TokenizerManager
+
+# Work around a bug in Python threading's atexit handling
+setattr(threading, "_register_atexit", lambda *args, **kwargs: None)
+
+logger = logging.getLogger(__name__)
+asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
+
+_is_cuda = is_cuda()
+
+
+class Engine(_Engine):
+    def __init__(
+        self,
+        parent_address: WorkerAddress,
+        placement: ComponentPlacement,
+        config: DictConfig,
+        dp_rank: int,
+        **kwargs,
+    ):
+        """
+        The arguments of this function are the same as `sglang/srt/server_args.py::ServerArgs`.
+        Please refer to `ServerArgs` for the documentation.
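+
+        A minimal construction sketch (hypothetical values; any `ServerArgs`
+        field may be passed as a keyword argument):
+
+            engine = Engine(
+                parent_address=parent_address,
+                placement=placement,
+                config=cfg,
+                dp_rank=0,
+                model_path="/path/to/model",
+                tp_size=1,
+            )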
+ """ + if "server_args" in kwargs: + # Directly load server_args + server_args = kwargs["server_args"] + else: + # Construct server_args from kwargs + if "log_level" not in kwargs: + # Do not print logs by default + kwargs["log_level"] = "error" + server_args = ServerArgs(**kwargs) + + # Shutdown the subprocesses automatically when the program exits + atexit.register(self.shutdown) + + # Allocate ports for inter-process communications + self.port_args = PortArgs.init_new(server_args) + + # Launch subprocesses + tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses( + parent_address=parent_address, + placement=placement, + config=config, + dp_rank=dp_rank, + server_args=server_args, + port_args=self.port_args, + ) + + self.server_args = server_args + self.tokenizer_manager = tokenizer_manager + self.scheduler_info = scheduler_info + self.template_manager = template_manager + + context = zmq.Context(2) + self.send_to_rpc = get_zmq_socket( + context, zmq.DEALER, self.port_args.rpc_ipc_name, True + ) + + def offload_model_weights(self): + """Offload model weights to meta.""" + obj = OffloadReqInput() + loop = asyncio.get_event_loop() + return loop.run_until_complete( + self.tokenizer_manager.offload_model_weights(obj, None) + ) + + def sync_hf_weight(self): + obj = SyncHFWeightInput() + loop = asyncio.get_event_loop() + return loop.run_until_complete(self.tokenizer_manager.sync_hf_weight(obj)) + + def sync_weight(self): + obj = SyncWeightInput() + loop = asyncio.get_event_loop() + return loop.run_until_complete(self.tokenizer_manager.sync_weight(obj)) + + +def _set_envs_and_config(server_args: ServerArgs): + # Set global environments + os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" + os.environ["NCCL_CUMEM_ENABLE"] = str(int(server_args.enable_symm_mem)) + if not server_args.enable_symm_mem: + os.environ["NCCL_NVLS_ENABLE"] = str(int(server_args.enable_nccl_nvls)) + os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "4" + os.environ["CUDA_MODULE_LOADING"] = "AUTO" + # flashinfer uses this environment variable for various kernels from MoE to quant kernels + os.environ["TRTLLM_ENABLE_PDL"] = "1" + + # Can also be passed as argument + os.environ["SGLANG_RUN_ID"] = ( + f"sglang-run-{time.time()}-{random.randint(0, 100000000)}" + ) + + # Set prometheus env vars + if server_args.enable_metrics: + set_prometheus_multiproc_dir() + + # Set ulimit + set_ulimit() + + # Check flashinfer version + if server_args.attention_backend == "flashinfer": + assert_pkg_version( + "flashinfer_python", + "0.3.0", + "Please uninstall the old version and " + "reinstall the latest version by following the instructions " + "at https://docs.flashinfer.ai/installation.html.", + ) + if _is_cuda and not get_bool_env_var("SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK"): + assert_pkg_version( + "sgl-kernel", + "0.3.8", + "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`", + ) + + if True: # Keep this check for internal code compatibility + # Register the signal handler. + # The child processes will send SIGQUIT to this process when any error happens + # This process then clean up the whole process tree + # Note: This sigquit handler is used in the launch phase, and may be replaced by + # the running_phase_sigquit_handler in the tokenizer manager after the grpc server is launched. + def launch_phase_sigquit_handler(signum, frame): + logger.error( + "Received sigquit from a child process. It usually means the child failed." 
+ ) + kill_process_tree(os.getpid()) + + signal.signal(signal.SIGQUIT, launch_phase_sigquit_handler) + + # Set mp start method + mp.set_start_method("spawn", force=True) + + +def _launch_subprocesses( + parent_address: WorkerAddress, + placement: ComponentPlacement, + config: DictConfig, + dp_rank: int, + server_args: ServerArgs, + port_args: Optional[PortArgs] = None, +) -> Tuple[TokenizerManager, TemplateManager, Dict]: + """ + Launch the TokenizerManager in the main process, the Scheduler in a subprocess, and the DetokenizerManager in another subprocess. + """ + + assert server_args.pp_size == 1, ( + "RLinf currently only supports and validates pp_size=1." + ) + + # Configure global environment + configure_logger(server_args) + server_args.check_server_args() + _set_envs_and_config(server_args) + + # Allocate ports for inter-process communications + if port_args is None: + port_args = PortArgs.init_new(server_args) + logger.info(f"{server_args=}") + + # If using model from www.modelscope.cn, first download the model. + server_args.model_path, server_args.tokenizer_path = prepare_model_and_tokenizer( + server_args.model_path, server_args.tokenizer_path + ) + + scheduler_procs = [] + if server_args.dp_size == 1: + memory_saver_adapter = TorchMemorySaverAdapter.create( + enable=server_args.enable_memory_saver + ) + scheduler_pipe_readers = [] + + nnodes_per_tp_group = max(server_args.nnodes // server_args.pp_size, 1) + tp_size_per_node = server_args.tp_size // nnodes_per_tp_group + tp_rank_range = range( + tp_size_per_node * (server_args.node_rank % nnodes_per_tp_group), + tp_size_per_node * (server_args.node_rank % nnodes_per_tp_group + 1), + ) + + pp_size_per_node = max(server_args.pp_size // server_args.nnodes, 1) + pp_rank_range = range( + pp_size_per_node * (server_args.node_rank // nnodes_per_tp_group), + pp_size_per_node * (server_args.node_rank // nnodes_per_tp_group + 1), + ) + + for pp_rank in pp_rank_range: + for tp_rank in tp_rank_range: + reader, writer = mp.Pipe(duplex=False) + gpu_id = ( + server_args.base_gpu_id + + ((pp_rank % pp_size_per_node) * tp_size_per_node) + + (tp_rank % tp_size_per_node) * server_args.gpu_id_step + ) + moe_ep_rank = tp_rank // (server_args.tp_size // server_args.ep_size) + proc = mp.Process( + target=run_scheduler_process, + args=( + parent_address, + placement, + config, + server_args.tp_size * server_args.pp_size, + tp_rank + pp_rank * server_args.pp_size, + server_args, + port_args, + gpu_id, + tp_rank, + moe_ep_rank, + pp_rank, + None, + writer, + None, + ), + ) + + with memory_saver_adapter.configure_subprocess(): + proc.start() + scheduler_procs.append(proc) + scheduler_pipe_readers.append(reader) + else: + # Launch the data parallel controller + reader, writer = mp.Pipe(duplex=False) + scheduler_pipe_readers = [reader] + proc = mp.Process( + target=run_data_parallel_controller_process, + args=(server_args, port_args, writer), + ) + proc.start() + scheduler_procs.append(proc) + + if server_args.node_rank >= 1: + # In multi-node cases, non-zero rank nodes do not need to run tokenizer or detokenizer, + # so they can just wait here. + + for reader in scheduler_pipe_readers: + data = reader.recv() + assert data["status"] == "ready" + + if os.getenv("SGLANG_BLOCK_NONZERO_RANK_CHILDREN") == "0": + # When using `Engine` as a Python API, we don't want to block here. 
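+                # (Setting SGLANG_BLOCK_NONZERO_RANK_CHILDREN=0 opts out of
+                # the blocking wait on non-zero-rank nodes.)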
+ return None, None, None + + launch_dummy_health_check_server( + server_args.host, server_args.port, server_args.enable_metrics + ) + + for proc in scheduler_procs: + proc.join() + logger.error( + f"Scheduler or DataParallelController {proc.pid} terminated with {proc.exitcode}" + ) + return None, None, None + + # Launch detokenizer process + detoken_proc = mp.Process( + target=run_detokenizer_process, + args=( + server_args, + port_args, + ), + ) + detoken_proc.start() + if server_args.tokenizer_worker_num > 1: + # Launch multi-tokenizer router + tokenizer_manager = MultiTokenizerRouter(server_args, port_args) + + # Initialize templates + template_manager = None + else: + # Launch tokenizer process + tokenizer_manager = TokenizerManager(server_args, port_args) + + # Initialize templates + template_manager = TemplateManager() + template_manager.initialize_templates( + tokenizer_manager=tokenizer_manager, + model_path=server_args.model_path, + chat_template=server_args.chat_template, + completion_template=server_args.completion_template, + ) + + # Wait for the model to finish loading + scheduler_infos = [] + for i in range(len(scheduler_pipe_readers)): + try: + data = scheduler_pipe_readers[i].recv() + except EOFError: + logger.error( + f"Rank {i} scheduler is dead. Please check if there are relevant logs." + ) + scheduler_procs[i].join() + logger.error(f"Exit code: {scheduler_procs[i].exitcode}") + raise + + if data["status"] != "ready": + raise RuntimeError( + "Initialization failed. Please see the error messages above." + ) + scheduler_infos.append(data) + + # Assume all schedulers have the same scheduler_info + scheduler_info = scheduler_infos[0] + tokenizer_manager.max_req_input_len = scheduler_info["max_req_input_len"] + return tokenizer_manager, template_manager, scheduler_info diff --git a/rlinf/hybrid_engines/sglang/sglang_0_5_2/sgl_scheduler.py b/rlinf/hybrid_engines/sglang/sglang_0_5_2/sgl_scheduler.py new file mode 100644 index 000000000..2cfb69abc --- /dev/null +++ b/rlinf/hybrid_engines/sglang/sglang_0_5_2/sgl_scheduler.py @@ -0,0 +1,476 @@ +# Copyright 2025 The RLinf Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
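+#
+# SGLang scheduler subclass for RLinf: registers the weight-sync and offload
+# requests from `io_struct.py` with the scheduler's request dispatcher so the
+# actor can drive this rollout engine between RL steps.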
+
+import faulthandler
+import logging
+import os
+import signal
+from typing import Optional
+
+import psutil
+import setproctitle
+import torch
+import torch_npu  # noqa: F401  # registers the torch.npu device backend
+from omegaconf import DictConfig
+from sglang.srt.disaggregation.utils import (
+    DisaggregationMode,
+)
+from sglang.srt.managers.io_struct import (
+    ReleaseMemoryOccupationReqInput,
+    ResumeMemoryOccupationReqInput,
+)
+from sglang.srt.managers.scheduler import Scheduler as _Scheduler
+from sglang.srt.managers.scheduler import logger
+from sglang.srt.managers.utils import DPBalanceMeta
+from sglang.srt.server_args import PortArgs, ServerArgs
+from sglang.srt.utils import (
+    broadcast_pyobj,
+    configure_logger,
+    get_bool_env_var,
+    kill_itself_when_parent_died,
+    set_gpu_proc_affinity,
+    suppress_other_loggers,
+)
+from sglang.utils import get_exception_traceback
+
+from rlinf.scheduler import Worker, WorkerAddress
+from rlinf.utils.placement import ModelParallelComponentPlacement, PlacementMode
+from rlinf.workers.rollout.utils import (
+    RankMapper,
+    get_module_from_name,
+    rebind_param_attr,
+    swap_tensor_pointer,
+)
+
+from .io_struct import (
+    OffloadReqInput,
+    OffloadReqOutput,
+    SyncHFWeightInput,
+    SyncHFWeightOutput,
+    SyncWeightInput,
+    SyncWeightOutput,
+    TaskMethodInput,
+    TaskMethodOutput,
+)
+
+logger.setLevel(logging.INFO)
+
+
+def safe_load_weights(model, weights: list):
+    """
+    Safely load weights while accepting both naming conventions:
+      - 'visual.xxx' (Hugging Face Qwen-VL format)
+      - 'model.visual.xxx' (some SGLang or custom formats)
+
+    Parameters:
+        model: a PyTorch model instance (must provide named_parameters())
+        weights: list of (name, torch.Tensor) pairs
+    """
+    params_dict = dict(model.named_parameters())
+
+    # Build a mapping: normalized key -> actual parameter name.
+    # E.g. 'visual.patch_embed.proj.weight' may appear in params_dict as
+    # either 'visual...' or 'model.visual...'.
+    normalized_to_actual = {}
+    for param_name in params_dict.keys():
+        if param_name.startswith("model.visual."):
+            # Map to the normalized name without the 'model.' prefix
+            normalized = param_name[len("model.") :]  # "visual.xxx"
+            normalized_to_actual[normalized] = param_name
+            # Keep the original name as well
+            normalized_to_actual[param_name] = param_name
+        elif param_name.startswith("visual."):
+            normalized = param_name
+            normalized_to_actual[normalized] = param_name
+            # Also accept incoming names that carry the 'model.' prefix
+            normalized_to_actual["model." + normalized] = param_name
+        else:
+            # Non-visual parameters map directly
+            normalized_to_actual[param_name] = param_name
+
+    # Load each weight
+    for name, loaded_weight in weights:
+        if name in normalized_to_actual:
+            actual_name = normalized_to_actual[name]
+            param = params_dict[actual_name]
+            assert param.shape == loaded_weight.shape, (
+                f"Shape mismatch for {name}: expected {param.shape}, got {loaded_weight.shape}"
+            )
+            param.copy_(loaded_weight)
+        else:
+            # Skip entries that do not exist in the model
+            # (e.g. optimizer state or other non-model parameters).
+            logger.warning(f"Skipping weight not in model: {name}")
+            continue
+
+
+class Scheduler(_Scheduler, Worker):
+    """
+    Override of SGLang's TP-worker scheduler class `_Scheduler`.
+    A Scheduler is a task that manages the TP worker and performs the
+    necessary weight synchronization with the actor as well as weight
+    offloading.
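+
+    The extra control requests registered in `__init__` dispatch as follows:
+
+        TaskMethodInput   -> run_task_method
+        OffloadReqInput   -> offload_model_weights
+        SyncWeightInput   -> sync_weight
+        SyncHFWeightInput -> sync_hf_weight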
+ """ + + def __init__( + self, + parent_address: WorkerAddress, + placement: ModelParallelComponentPlacement, + config: DictConfig, + world_size: int, + rank: int, + server_args: ServerArgs, + port_args: PortArgs, + gpu_id: int, + tp_rank: int, + moe_ep_rank: int, + pp_rank: int, + dp_rank: Optional[int], + dp_balance_meta: Optional[DPBalanceMeta] = None, + ): + Worker.__init__( + self, parent_address=parent_address, world_size=world_size, rank=rank + ) + + # since 0.4.6.post2, pp_rank is added into Scheduler init's parameters + # but we don't use it in our implementation, so we set it to 0 + _Scheduler.__init__( + self, + server_args, + port_args, + gpu_id, + tp_rank, + moe_ep_rank, + pp_rank, + dp_rank, + dp_balance_meta, + ) + # `TpModelWorkerClient` is used when ServerArgs.enable_overlap=True, and it has 'worker' attribute. + # But in early SGLang version, `TpModelWorker` doesn't have 'worker' attribute. + if not hasattr(self.tp_worker, "worker"): + self.tp_worker.worker = self.tp_worker + + self._request_dispatcher._mapping.extend( + [ + (TaskMethodInput, self.run_task_method), + (OffloadReqInput, self.offload_model_weights), + (SyncWeightInput, self.sync_weight), + (SyncHFWeightInput, self.sync_hf_weight), + ] + ) + self.cfg = config + self.binded_attr = {} + + self._actor_group_name = self.cfg.actor.group_name + self.placement_mode = placement.placement_mode + self.actor_weight_rank = RankMapper.get_rollout_rank_to_actor_rank_map( + placement + )[(self.get_parent_rank(), self._rank)] + # it's important to use load_weight to load resharded weight from megatron + for _, module in self.tp_worker.worker.model_runner.model.named_modules(): + if hasattr(module, "use_presharded_weights"): + module.use_presharded_weights = False + + self._logger.info( + f"Running Scheduler dp rank {self.get_parent_rank()}, tp rank {self.tp_rank}, corresponding actor weight rank = {self.actor_weight_rank}" + ) + + def sync_in_tp(self, fn: str = ""): + broadcast_pyobj( + [], self.tp_rank, self.tp_worker.worker.model_runner.tp_group.cpu_group + ) + # logger.info(f"{fn}: Sync in tp success!") + + def cuda_info(self, text: str = ""): + free_gpu_memory, total_gpu_memory = torch.npu.mem_get_info() + free_gpu_memory /= 2**30 + total_gpu_memory /= 2**30 + + memory_allocated = torch.npu.memory_allocated() / 2**30 + memory_reserved = torch.npu.memory_reserved() / 2**30 + + self._logger.info( + f"[dp {self.get_parent_rank()}-tp {self.tp_rank}] {text} " + f"{memory_allocated=:.2f} GiB, {memory_reserved=:.2f} GiB, " + f"{free_gpu_memory=:.2f} GiB, {total_gpu_memory=:.2f} GiB" + ) + + def offload_model_weights(self, recv_req: OffloadReqInput): + use_cudagraph = not self.cfg.rollout.enforce_eager + colocate = self.placement_mode == PlacementMode.COLLOCATED + if not colocate: + assert use_cudagraph, "If not colocate, use_cudagraph must be True now." 
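+
+        # With CUDA graphs enabled (or in disaggregated placement), the model
+        # is released through SGLang's memory-occupation API; otherwise the
+        # weights are moved to the meta device and the KV-cache buffers are
+        # freed manually below.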
+        if use_cudagraph or not colocate:
+            self.release_memory_occupation(ReleaseMemoryOccupationReqInput())
+            return OffloadReqOutput()
+
+        # Manual offload: keep copies of the buffers and parameter attributes
+        # so they can be restored on the next weight sync.
+        self.named_buffers = {
+            n: buf.clone()
+            for n, buf in self.tp_worker.worker.model_runner.model.named_buffers()
+        }
+
+        self.binded_attr = {
+            name: param.__dict__
+            for name, param in self.tp_worker.worker.model_runner.model.named_parameters()
+        }
+
+        # Offload parameters
+        self.tp_worker.worker.model_runner.model.to("meta")
+
+        # Offload the KV cache
+        self.tp_worker.worker.model_runner.token_to_kv_pool._clear_buffers()
+
+        self.flush_cache()
+        self.sync_in_tp("offload_model_weights")
+        return OffloadReqOutput()
+
+    def sync_hf_weight(self, recv_req: SyncHFWeightInput):
+        use_cudagraph = not self.cfg.rollout.enforce_eager
+        colocate = self.placement_mode == PlacementMode.COLLOCATED
+
+        assert use_cudagraph, "use_cudagraph must be True now."
+
+        state_dict = self.recv(
+            src_group_name=self._actor_group_name,
+            src_rank=self.actor_weight_rank,
+        )
+
+        model = self.tp_worker.worker.model_runner.model
+
+        if colocate:
+            self.resume_memory_occupation(ResumeMemoryOccupationReqInput())
+            import inspect
+
+            for name, handle in state_dict.items():
+                func, args = handle
+                sig = inspect.signature(func)
+                param_names = list(sig.parameters.keys())
+
+                # Convert the positional args to kwargs so that the device
+                # argument can be rewritten by name.
+                kwargs = {}
+                args = list(args)
+                for i, param_name in enumerate(param_names):
+                    if i < len(args):
+                        kwargs[param_name] = args[i]
+                    else:
+                        break
+
+                # Rewrite the device argument to the current device, in case
+                # the two processes have different visible devices (assumes
+                # the rebuild function names it 'map_location' or 'device').
+                if "map_location" in kwargs:
+                    kwargs["map_location"] = f"npu:{torch.npu.current_device()}"
+                elif "device" in kwargs:
+                    kwargs["device"] = torch.npu.current_device()
+
+                new_weight = func(**kwargs)
+                model.load_weights([(name, new_weight)])
+                del new_weight
+        else:
+            # Disaggregated mode: the tensors are received directly
+            for name, tensor in state_dict.items():
+                model.load_weights([(name, tensor)])
+        self.flush_cache()
+        self.sync_in_tp("sync_hf_weight")
+        return SyncHFWeightOutput()
+
+    def sync_weight(self, recv_req: SyncWeightInput):
+        use_cudagraph = not self.cfg.rollout.enforce_eager
+        colocate = self.placement_mode == PlacementMode.COLLOCATED
+        if not colocate:
+            assert use_cudagraph, "If not colocate, use_cudagraph must be True now."
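+
+        # The actor sends a map of parameter name -> (rebuild_func, args)
+        # shared-memory handles when colocated, and plain tensors when
+        # disaggregated; both cases are handled below.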
+ + state_dict = self.recv( + src_group_name=self._actor_group_name, + src_rank=self.actor_weight_rank, + ) + model = self.tp_worker.worker.model_runner.model + + if use_cudagraph and colocate: + self.resume_memory_occupation(ResumeMemoryOccupationReqInput()) + + if colocate: + if use_cudagraph: + for name, handle in state_dict.items(): + func, args = handle + list_args = list(args) + # NOTE: the key is to change device id to the current device id + # in case two processes have different CUDA_VISIBLE_DEVICES + list_args[6] = torch.npu.current_device() + new_weight = func(*list_args) + + self.tp_worker.worker.model_runner.update_weights_from_tensor( + [(name, new_weight)], load_format="direct" + ) + del new_weight + + else: + named_params = dict(model.named_parameters()) + for name, handle in state_dict.items(): + rebind_param_attr(model, name, self.binded_attr, materialize=False) + func, args = handle + list_args = list(args) + list_args[6] = torch.npu.current_device() + new_weight = func(*list_args) + vllm_weight = named_params[name] + assert vllm_weight.shape == new_weight.shape, ( + f"{name}: {vllm_weight.shape=}, {new_weight.shape=}" + ) + assert vllm_weight.dtype == new_weight.dtype, ( + f"{name}: {vllm_weight.dtype=}, {new_weight.dtype=}" + ) + + swap_tensor_pointer(vllm_weight, new_weight) + del new_weight + + for name, buffer in self.named_buffers.items(): + vllm_buffer = get_module_from_name(model, name) + assert vllm_buffer.shape == buffer.shape + assert vllm_buffer.dtype == buffer.dtype + swap_tensor_pointer(vllm_buffer, buffer) + + self.named_buffers = {} + + self.tp_worker.worker.model_runner.token_to_kv_pool._create_buffers() + else: + # disaggregate mode, recv tensor directly + named_tensors = [(n, p) for n, p in state_dict.items()] + self.tp_worker.worker.model_runner.update_weights_from_tensor( + named_tensors, load_format="direct" + ) + self.sync_in_tp("sync_weight") + + return SyncWeightOutput() + + def run_task_method(self, obj: TaskMethodInput): + """ + Run a CommTask method with the given name and arguments. + NOTE: will call wait() if async_op is True. 
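+
+        Example (hypothetical method name and args):
+
+            out = scheduler.run_task_method(
+                TaskMethodInput(method_name="sync_in_tp", args=["sync_weight"])
+            )
+            # out.result holds the return value of self.sync_in_tp("sync_weight")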
+ """ + result = getattr(self, obj.method_name)(*obj.args, **obj.kwargs) + if "async_op" in obj.kwargs and obj.kwargs["async_op"]: + result = result.wait() + return TaskMethodOutput(method_name=obj.method_name, result=result) + + +def run_scheduler_process( + parent_address: WorkerAddress, + placement: ModelParallelComponentPlacement, + config: DictConfig, + world_size: int, + rank: int, + server_args: ServerArgs, + port_args: PortArgs, + gpu_id: int, + tp_rank: int, + moe_ep_rank: int, + pp_rank: int, + dp_rank: Optional[int], + pipe_writer, + balance_meta: Optional[DPBalanceMeta] = None, +): + # Generate the prefix + prefix = "" + if dp_rank is not None: + prefix += f" DP{dp_rank}" + if server_args.tp_size > 1: + prefix += f" TP{tp_rank}" + if server_args.ep_size > 1: + prefix += f" EP{moe_ep_rank}" + if server_args.pp_size > 1: + prefix += f" PP{pp_rank}" + + # Config the process + setproctitle.setproctitle(f"sglang::scheduler{prefix.replace(' ', '_')}") + faulthandler.enable() + kill_itself_when_parent_died() + parent_process = psutil.Process().parent() + + # [For Router] if env var "SGLANG_DP_RANK" exist, set dp_rank to the value of the env var + if dp_rank is None and "SGLANG_DP_RANK" in os.environ: + dp_rank = int(os.environ["SGLANG_DP_RANK"]) + + # Configure the logger + configure_logger(server_args, prefix=prefix) + suppress_other_loggers() + + # Set cpu affinity to this gpu process + if get_bool_env_var("SGLANG_SET_CPU_AFFINITY"): + set_gpu_proc_affinity(server_args.tp_size, server_args.nnodes, gpu_id) + + # Create a scheduler and run the event loop + try: + scheduler = Scheduler( + parent_address, + placement, + config, + world_size, + rank, + server_args, + port_args, + gpu_id, + tp_rank, + moe_ep_rank, + pp_rank, + dp_rank, + dp_balance_meta=balance_meta, + ) + pipe_writer.send( + { + "status": "ready", + "max_total_num_tokens": scheduler.max_total_num_tokens, + "max_req_input_len": scheduler.max_req_input_len, + } + ) + + disaggregation_mode: DisaggregationMode = scheduler.disaggregation_mode + if disaggregation_mode == DisaggregationMode.NULL: + if server_args.pp_size > 1: + scheduler.event_loop_pp() + elif scheduler.enable_overlap: + scheduler.event_loop_overlap() + else: + scheduler.event_loop_normal() + elif disaggregation_mode == DisaggregationMode.PREFILL: + if scheduler.enable_overlap: + scheduler.event_loop_overlap_disagg_prefill() + else: + if server_args.pp_size > 1: + scheduler.event_loop_pp_disagg_prefill() + else: + scheduler.event_loop_normal_disagg_prefill() + + elif disaggregation_mode == DisaggregationMode.DECODE: + if scheduler.enable_overlap: + scheduler.event_loop_overlap_disagg_decode() + else: + scheduler.event_loop_normal_disagg_decode() + + except Exception: + traceback = get_exception_traceback() + logger.error(f"Scheduler hit an exception: {traceback}") + parent_process.send_signal(signal.SIGQUIT) diff --git a/rlinf/hybrid_engines/sglang/sglang_0_5_2/tokenizer_manager.py b/rlinf/hybrid_engines/sglang/sglang_0_5_2/tokenizer_manager.py new file mode 100644 index 000000000..07b77b200 --- /dev/null +++ b/rlinf/hybrid_engines/sglang/sglang_0_5_2/tokenizer_manager.py @@ -0,0 +1,129 @@ +# Copyright 2025 The RLinf Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Optional + +import fastapi +from sglang.srt.managers.io_struct import AbortReq +from sglang.srt.managers.tokenizer_manager import TokenizerManager as _TokenizerManager +#from sglang.srt.managers.tokenizer_manager import _Communicator +from sglang.srt.managers.tokenizer_communicator_mixin import _Communicator +from sglang.srt.server_args import PortArgs, ServerArgs +from .io_struct import ( + OffloadReqInput, + OffloadReqOutput, + SyncHFWeightInput, + SyncHFWeightOutput, + SyncWeightInput, + SyncWeightOutput, + TaskMethodInput, + TaskMethodOutput, +) + + +# Add two methods and their communicators, input/output structs. +class TokenizerManager(_TokenizerManager): + def __init__( + self, + server_args: ServerArgs, + port_args: PortArgs, + ): + super().__init__( + server_args=server_args, + port_args=port_args, + ) + + self.run_task_method_communicator = _Communicator( + self.send_to_scheduler, + fan_out=server_args.dp_size, + ) + self.offload_model_weights_communicator = _Communicator( + self.send_to_scheduler, server_args.dp_size + ) + self.sync_weight_communicator = _Communicator( + self.send_to_scheduler, server_args.dp_size + ) + self.sync_hf_weight_communicator = _Communicator( + self.send_to_scheduler, server_args.dp_size + ) + + self._result_dispatcher._mapping.extend( + [ + ( + TaskMethodOutput, + self.run_task_method_communicator.handle_recv, + ), + ( + OffloadReqOutput, + self.offload_model_weights_communicator.handle_recv, + ), + ( + SyncWeightOutput, + self.sync_weight_communicator.handle_recv, + ), + ( + SyncHFWeightOutput, + self.sync_hf_weight_communicator.handle_recv, + ), + ] + ) + + async def run_task_method( + self, + obj: TaskMethodInput = None, + request: Optional[fastapi.Request] = None, + ): + """ + Run a task method with the given name and arguments. 
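+
+        `obj` may be either a `TaskMethodInput` or a bare method-name string,
+        e.g. (hypothetical): `await tokenizer_manager.run_task_method("sync_in_tp")`.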
+ """ + self.auto_create_handle_loop() + if isinstance(obj, str): + obj = TaskMethodInput(method_name=obj) + res: List[TaskMethodOutput] = await self.run_task_method_communicator(obj) + return res[0].result + + async def offload_model_weights( + self, + obj: OffloadReqInput = None, + request: Optional[fastapi.Request] = None, + ): + self.auto_create_handle_loop() + if obj is None: + obj = OffloadReqInput() + await self.offload_model_weights_communicator(obj) + + async def sync_hf_weight( + self, + obj: SyncHFWeightInput, + request: Optional[fastapi.Request] = None, + ): + self.auto_create_handle_loop() + await self.sync_hf_weight_communicator(obj) + + async def sync_weight( + self, + obj: SyncWeightInput, + request: Optional[fastapi.Request] = None, + ): + self.auto_create_handle_loop() + await self.sync_weight_communicator(obj) + + def abort_request(self, rid: str): + if rid != "" and rid not in self.rid_to_state: + return + req = AbortReq(rid) + self.send_to_scheduler.send_pyobj(req) + + async def pause_generation(self): + self.abort_request("") diff --git a/rlinf/utils/distributed.py b/rlinf/utils/distributed.py index a54f444d8..1f5ddc9c6 100644 --- a/rlinf/utils/distributed.py +++ b/rlinf/utils/distributed.py @@ -29,7 +29,7 @@ from rlinf.utils.timers import NamedTimer - +import torch_npu def compute_rollout_metrics( rollout_batch, max_prompt_len, @@ -38,7 +38,7 @@ def compute_rollout_metrics( dp_group=None, use_critic=False, ): - device = torch.device(f"cuda:{torch.cuda.current_device()}") + device = torch.device(f"npu:{torch.npu.current_device()}") advantages = rollout_batch["advantages"].to(device=device) mask = rollout_batch["attention_mask"][:, -response_len:].to(device=device) prompt_lengths = rollout_batch["prompt_lengths"].clone().to(device=device) @@ -105,7 +105,7 @@ def compute_rollout_metrics( adv_max = torch.max(valid_adv).detach().item() adv_min = torch.min(valid_adv).detach().item() reduce_tensor = torch.as_tensor( - [-adv_min, adv_max], device=torch.cuda.current_device(), dtype=torch.float32 + [-adv_min, adv_max], device=torch.npu.current_device(), dtype=torch.float32 ) torch.distributed.all_reduce( reduce_tensor, @@ -175,7 +175,7 @@ def from_rollout_batches( dp_group: Optional[ProcessGroup], partitioning_tool: Callable, ) -> Self: - current_device = torch.cuda.current_device() + current_device = torch.npu.current_device() attn_mask = rollout_batches.get("attention_mask") current_num_samples = attn_mask.size(0) @@ -406,12 +406,12 @@ def rebalance_nd_tensor(tensor, group): NOTE: assumes all other (i.e., non-zero) dimensions are equal. 
""" num_samples = torch.as_tensor( - tensor.size(0), dtype=torch.int64, device=torch.cuda.current_device() + tensor.size(0), dtype=torch.int64, device=torch.npu.current_device() ) batch_num_per_rank = torch.zeros( torch.distributed.get_world_size(group), dtype=torch.int64, - device=torch.cuda.current_device(), + device=torch.npu.current_device(), ) torch.distributed.all_gather_into_tensor( batch_num_per_rank, num_samples, group=group @@ -422,7 +422,7 @@ def rebalance_nd_tensor(tensor, group): indices = batch_num_per_rank.cumsum(dim=0) output_tensor = torch.zeros( - B, *other_dims, dtype=tensor.dtype, device=torch.cuda.current_device() + B, *other_dims, dtype=tensor.dtype, device=torch.npu.current_device() ) # tensor_split is a view we can copy into @@ -454,7 +454,7 @@ def broadcast_tensor( """ if torch.distributed.get_rank() == src: - tensor = tensor.cuda() + tensor = tensor.npu() if dtype: tensor = tensor.to(dtype) @@ -467,7 +467,7 @@ def broadcast_tensor( torch.distributed.broadcast_object_list(metadata, src, group) dtype, input_shape = metadata - tensor = torch.empty(input_shape, dtype=dtype, device="cuda") + tensor = torch.empty(input_shape, dtype=dtype, device="npu") torch.distributed.broadcast(tensor, src, group) return tensor @@ -519,7 +519,7 @@ def broadcast_tensor_within_dp(tensor: torch.Tensor, dtype: torch.dtype): def gather_tensor(tensor, dst, group, dtype=None): """Gather any tensor to the dst rank from every other rank in the given group. All the ranks that send or receive data must call this function.""" - tensor = tensor.to(device=torch.cuda.current_device(), dtype=dtype) + tensor = tensor.to(device=torch.npu.current_device(), dtype=dtype) if torch.distributed.get_rank() == dst: gather_list = [ torch.empty_like(tensor) @@ -549,8 +549,8 @@ def normalize_tensor(tensor, mask, group=None): """normalizes a tensor using global mean and std""" dtype = torch.float64 tensor = tensor.to(dtype) - tensor = tensor.to(device=torch.cuda.current_device()) - mask = mask.to(device=torch.cuda.current_device()) + tensor = tensor.to(device=torch.npu.current_device()) + mask = mask.to(device=torch.npu.current_device()) tensor_global_mean, tensor_global_var = masked_global_mean_var( tensor, mask, group=group @@ -589,7 +589,7 @@ def masked_normalization( Normalized x, with the same shape as x. 
""" dtype = torch.float64 if high_precision else torch.float32 - x = x.to(dtype=dtype).cuda() + x = x.to(dtype=dtype).npu() if not inplace: x = x.clone() if dim is None: @@ -599,7 +599,7 @@ def masked_normalization( np.prod([x.shape[d] for d in dim]), dtype=dtype, device=x.device ) else: - mask = mask.to(dtype=dtype).cuda() + mask = mask.to(dtype=dtype).npu() assert len(mask.shape) == len(x.shape), (mask.shape, x.shape, dim) for i in range(len(x.shape)): if i in dim: @@ -643,8 +643,8 @@ def masked_global_mean_var(values, mask, group=None): mask and values must have same shape, with mask being {0,1} with 1 being the values we want to keep """ assert values.shape == mask.shape, (values.shape, mask.shape) - values = values.to(device=torch.cuda.current_device()) - mask = mask.to(device=torch.cuda.current_device()) + values = values.to(device=torch.npu.current_device()) + mask = mask.to(device=torch.npu.current_device()) values = values * mask @@ -652,7 +652,7 @@ def masked_global_mean_var(values, mask, group=None): sum_and_count = torch.tensor( [values.sum(), mask.sum()], dtype=torch.float64, - device=torch.cuda.current_device(), + device=torch.npu.current_device(), ) torch.distributed.all_reduce(sum_and_count, group=group) global_sum, global_count = sum_and_count @@ -660,7 +660,7 @@ def masked_global_mean_var(values, mask, group=None): variance_summed = ( (((values - global_mean) ** 2) * mask) .sum() - .to(device=torch.cuda.current_device(), dtype=torch.float64) + .to(device=torch.npu.current_device(), dtype=torch.float64) ) torch.distributed.all_reduce(variance_summed, group=group) @@ -669,12 +669,12 @@ def masked_global_mean_var(values, mask, group=None): def report_device_info(info_str): - free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info() + free_gpu_memory, total_gpu_memory = torch.npu.mem_get_info() free_gpu_memory /= 2**30 total_gpu_memory /= 2**30 - memory_allocated = torch.cuda.memory_allocated() / 2**30 - memory_reserved = torch.cuda.memory_reserved() / 2**30 + memory_allocated = torch.npu.memory_allocated() / 2**30 + memory_reserved = torch.npu.memory_reserved() / 2**30 print( f"[Rank {torch.distributed.get_rank()}] {info_str}, {free_gpu_memory=:.2f} GiB, {total_gpu_memory=:.2f} GiB, {memory_allocated=:.2f} GiB, {memory_reserved=:.2f} GiB" @@ -725,7 +725,7 @@ def all_reduce_dict( ): keys = sorted(dictionary) tensor = torch.as_tensor( - [dictionary[k] for k in keys], dtype=dtype, device=torch.cuda.current_device() + [dictionary[k] for k in keys], dtype=dtype, device=torch.npu.current_device() ) torch.distributed.all_reduce(tensor, op=op, group=group) return dict(zip(keys, tensor.tolist())) diff --git a/rlinf/utils/utils.py b/rlinf/utils/utils.py index d117f4dc4..b840538cc 100644 --- a/rlinf/utils/utils.py +++ b/rlinf/utils/utils.py @@ -21,13 +21,13 @@ import torch import torch.nn.functional as F - +import torch_npu def clear_memory(sync=True): if sync: - torch.cuda.synchronize() + torch.npu.synchronize() gc.collect() - torch.cuda.empty_cache() + torch.npu.empty_cache() def apply_func_to_dict(func, dictionary): @@ -54,7 +54,7 @@ def retrieve_model_state_dict_in_cpu(model): cpu_dict[name] = item - torch.cuda.synchronize() + torch.npu.synchronize() return cpu_dict @@ -126,13 +126,21 @@ def seq_mean_token_mean(values, mask): def logprobs_from_logits_flash_attn(logits, labels, inplace_backward=True): - from flash_attn.ops.triton.cross_entropy import cross_entropy_loss + #from flash_attn.ops.triton.cross_entropy import cross_entropy_loss - output = cross_entropy_loss(logits, 
labels, inplace_backward=inplace_backward)
-    assert isinstance(output, tuple), (
-        "please make sure flash-attn>=2.4.3 where cross_entropy_loss returns Tuple[losses, z_losses]."
-    )
-    return -output[0]
+    # flash-attn's Triton cross-entropy kernel is not available on NPU, so
+    # fall back to a numerically stable log_softmax.
+    log_probs = F.log_softmax(logits, dim=-1)
+
+    # Gather the logprob of each target label
+    labels = labels.unsqueeze(-1)
+    return torch.gather(log_probs, -1, labels).squeeze(-1)
 
 
 def compute_logprobs_from_logits(logits, target, task_type="embodied"):
diff --git a/rlinf/workers/actor/fsdp_actor_worker.py b/rlinf/workers/actor/fsdp_actor_worker.py
index 06607903f..1b3b015bb 100644
--- a/rlinf/workers/actor/fsdp_actor_worker.py
+++ b/rlinf/workers/actor/fsdp_actor_worker.py
@@ -62,7 +62,7 @@
     seq_mean_token_sum,
 )
 from rlinf.workers.rollout.utils import RankMapper
-
+import torch_npu
 
 class FSDPActor(FSDPModelManager, Worker):
     def __init__(self, cfg: DictConfig, placement: ModelParallelComponentPlacement):
@@ -87,11 +87,11 @@ def __init__(self, cfg: DictConfig, placement: ModelParallelComponentPlacement):
             // self._world_size
         )
 
-        torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
-        self.device = torch.cuda.current_device()
+        torch.npu.set_device(int(os.environ["LOCAL_RANK"]))
+        self.device = torch.npu.current_device()
         world_size = self._world_size
         self.device_mesh = init_device_mesh(
-            "cuda", mesh_shape=(world_size,), mesh_dim_names=["fsdp"]
+            "npu", mesh_shape=(world_size,), mesh_dim_names=["fsdp"]
         )
 
         self._rollout_group_name = cfg.rollout.group_name
@@ -216,7 +216,7 @@ def inference_step(self, batch: Dict[str, torch.Tensor]) -> torch.Tensor:
                 multi_modal_inputs[key] = torch.cat(
                     [inputs[key] for inputs in batch["multi_modal_inputs"]],
                     dim=0,
-                ).cuda()
+                ).npu()
 
         outputs = self.model(
             input_ids=input_ids,
@@ -257,20 +257,32 @@ def run_inference(
             batch, rollout_result = self.get_batch(input_channel)
             recv_batch_size += rollout_result.num_sequence
             self._load_weight_and_optimizer(
-                input_channel if self.is_pipeline else rollout_channel
+                input_channel if self.is_pipeline else rollout_channel
+            )
+            num_splits = (
+                rollout_result.num_sequence
+                // self.cfg.algorithm.logprob_forward_micro_batch_size
             )
+            micro_batches_iter = get_iterator_k_split(
+                batch,
+                num_splits=num_splits,
+            )
+            micro_batches = list(micro_batches_iter)
+            prev_logprobs = []
 
             with self.worker_timer():
-                prev_logprobs = self.inference_step(batch)
-            rollout_result.prev_logprobs = prev_logprobs.cpu()
-
+                for micro_batch in micro_batches:
+                    prev_logprobs.append(self.inference_step(micro_batch).cpu())
+            rollout_result.prev_logprobs = torch.cat(prev_logprobs)
             if compute_ref_logprobs:
                 assert self.ref_policy_state_dict is not None, (
                     "Reference policy state dict is None but compute_ref_logprobs is True"
                 )
+                ref_logprobs = []
                 with cpu_weight_swap(self.model, self.ref_policy_state_dict):
-                    ref_logprobs = self.inference_step(batch)
-                rollout_result.ref_logprobs = ref_logprobs.cpu()
+                    for micro_batch in micro_batches:
+                        ref_logprobs.append(self.inference_step(micro_batch).cpu())
+                rollout_result.ref_logprobs = torch.cat(ref_logprobs)
             self.put_result(rollout_result, output_channel)
 
         assert recv_batch_size == self.total_batch_size_per_dp, (
@@ -335,7 +347,7 @@ def run_training(self, input_channel: Channel) -> Tuple[Dict, list]:
                     else nullcontext()
                 )
                 for k, v in
m_batch.items(): - m_batch[k] = v.cuda() if isinstance(v, torch.Tensor) else v + m_batch[k] = v.npu() if isinstance(v, torch.Tensor) else v multi_modal_inputs = {} if "multi_modal_inputs" in m_batch.keys(): @@ -346,7 +358,7 @@ def run_training(self, input_channel: Channel) -> Tuple[Dict, list]: for inputs in m_batch["multi_modal_inputs"] ], dim=0, - ).cuda() + ).npu() input_ids = m_batch["input_ids"] attention_mask = m_batch["attention_mask"] @@ -403,7 +415,7 @@ def run_training(self, input_channel: Channel) -> Tuple[Dict, list]: loss_mask=loss_mask, ) - entropy_loss = torch.tensor(0.0, device=torch.cuda.current_device()) + entropy_loss = torch.tensor(0.0, device=torch.npu.current_device()) if self.calculate_entropy: entropy = output["entropy"][ :, -self.response_len - 1 : -1 @@ -414,7 +426,7 @@ def run_training(self, input_channel: Channel) -> Tuple[Dict, list]: loss - self.cfg.algorithm.entropy_bonus * entropy_loss ) - kl_loss = torch.tensor(0.0, device=torch.cuda.current_device()) + kl_loss = torch.tensor(0.0, device=torch.npu.current_device()) if self.kl_beta > 0 and ref_logprobs is not None: kld = kl_penalty(ref_logprobs, logprobs, self.kl_penalty_type) kl_loss = self.loss_agg_func(kld, loss_mask) @@ -503,8 +515,8 @@ def compute_advantages_and_returns( mask = batch["attention_mask"][:, -self.response_len :] advantages, returns = calculate_adv_and_returns( adv_type=self.cfg.algorithm.adv_type, - reward_scores=batch["rewards"].cuda(), - mask=mask.cuda(), + reward_scores=batch["rewards"].npu(), + mask=mask.npu(), num_responses=self.cfg.algorithm.group_size, ) rollout_result.advantages = advantages.cpu() @@ -522,11 +534,11 @@ def __init__(self, cfg: DictConfig): super().__init__(cfg.actor) self.cfg = cfg - torch.cuda.set_device(int(os.environ["LOCAL_RANK"])) - self.device = torch.cuda.current_device() + torch.npu.set_device(int(os.environ["LOCAL_RANK"])) + self.device = torch.npu.current_device() world_size = self._world_size self.device_mesh = init_device_mesh( - "cuda", mesh_shape=(world_size,), mesh_dim_names=["fsdp"] + "npu", mesh_shape=(world_size,), mesh_dim_names=["fsdp"] ) self._env_group_name = cfg.env.group_name @@ -554,9 +566,9 @@ def init_worker(self): if self.cfg.actor.get("enable_offload", False): self.offload_fsdp_param_and_grad() self.offload_fsdp_optimizer() - torch.cuda.synchronize() + torch.npu.synchronize() gc.collect() - torch.cuda.empty_cache() + torch.npu.empty_cache() def model_provider_func(self): model = get_model(self.cfg.actor.checkpoint_load_path, self.cfg.actor.model) @@ -577,10 +589,10 @@ def sync_model_to_rollout(self): if self.cfg.actor.get("enable_offload", False): self.offload_fsdp_param_and_grad() self.offload_fsdp_optimizer() - torch.cuda.synchronize() + torch.npu.synchronize() del state_dict gc.collect() - torch.cuda.empty_cache() + torch.npu.empty_cache() async def recv_rollout_batch(self): send_num = self._component_placement.get_world_size("rollout") * self.stage_num @@ -864,7 +876,7 @@ def run_training(self): metrics_data["loss"] = loss.detach().item() append_to_dict(metrics, metrics_data) - torch.cuda.empty_cache() + torch.npu.empty_cache() grad_norm = self.model.clip_grad_norm_( max_norm=self.cfg.actor.optim.clip_grad @@ -886,9 +898,9 @@ def run_training(self): ) self.optimizer.zero_grad() - torch.cuda.synchronize() + torch.npu.synchronize() torch.distributed.barrier() - torch.cuda.empty_cache() + torch.npu.empty_cache() return mean_metric_dict diff --git a/rlinf/workers/actor/fsdp_actor_worker_bak.py 
b/rlinf/workers/actor/fsdp_actor_worker_bak.py new file mode 100644 index 000000000..84d8e09c5 --- /dev/null +++ b/rlinf/workers/actor/fsdp_actor_worker_bak.py @@ -0,0 +1,903 @@ +# Copyright 2025 The RLinf Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import gc +import os +from contextlib import nullcontext +from typing import Dict, Tuple + +import numpy as np +import torch +from omegaconf import DictConfig +from torch.distributed.device_mesh import init_device_mesh +from torch.multiprocessing.reductions import reduce_tensor +from tqdm import tqdm + +import rlinf.algorithms # noqa: F401 +from rlinf.algorithms.registry import actor_loss, calculate_adv_and_returns +from rlinf.algorithms.utils import ( + kl_penalty, + preprocess_advantages_inputs, + preprocess_loss_inputs, +) +from rlinf.data.io_struct import RolloutResult +from rlinf.hybrid_engines.fsdp.fsdp_model_manager import ( + FSDPModelManager, +) +from rlinf.models import get_model +from rlinf.models.embodiment.model_utils import custom_forward +from rlinf.scheduler import Channel, Cluster, Worker +from rlinf.utils.data_iter_utils import get_iterator_k_split +from rlinf.utils.distributed import all_reduce_dict +from rlinf.utils.distributed import ( + compute_rollout_metrics as compute_math_rollout_metrics, +) +from rlinf.utils.metric_utils import ( + append_to_dict, + compute_loss_mask, + compute_rollout_metrics, + compute_split_num, +) +from rlinf.utils.placement import ( + HybridComponentPlacement, + ModelParallelComponentPlacement, +) +from rlinf.utils.utils import ( + compute_logprobs_from_logits, + cpu_weight_swap, + masked_mean, + retrieve_model_state_dict_in_cpu, + seq_mean_token_mean, + seq_mean_token_sum, +) +from rlinf.workers.rollout.utils import RankMapper +import torch_npu + +class FSDPActor(FSDPModelManager, Worker): + def __init__(self, cfg: DictConfig, placement: ModelParallelComponentPlacement): + Worker.__init__(self) + super().__init__(cfg.actor) + + self.cfg = cfg + + self.response_len = ( + cfg.actor.model.encoder_seq_length - cfg.data.max_prompt_length + ) + self.calculate_entropy = self.cfg.algorithm.calculate_entropy + self.calculate_entropy_loss = ( + self.cfg.algorithm.entropy_bonus > 0 and self.calculate_entropy + ) + self.kl_beta = self.cfg.algorithm.kl_beta + self.kl_penalty_type = self.cfg.algorithm.kl_penalty_type + + self.total_batch_size_per_dp = ( + self.cfg.data.rollout_batch_size + * self.cfg.algorithm.group_size + // self._world_size + ) + + torch.npu.set_device(int(os.environ["LOCAL_RANK"])) + self.device = torch.npu.current_device() + world_size = self._world_size + self.device_mesh = init_device_mesh( + "npu", mesh_shape=(world_size,), mesh_dim_names=["fsdp"] + ) + + self._rollout_group_name = cfg.rollout.group_name + self._component_placement = placement + self.is_data_io_rank = True + self.is_pipeline = self._component_placement.is_disaggregated + self.ref_policy_state_dict = None + + if self.cfg.algorithm.loss_agg_func == "token-mean": + self.loss_agg_func = masked_mean + elif 
self.cfg.algorithm.loss_agg_func == "seq-mean-token-sum": + self.loss_agg_func = seq_mean_token_sum + elif self.cfg.algorithm.loss_agg_func == "seq-mean-token-mean": + self.loss_agg_func = seq_mean_token_mean + else: + raise NotImplementedError( + f"algorithm.loss_agg_func={self.cfg.algorithm.loss_agg_func} is not supported!" + ) + + def init_worker(self) -> None: + self.setup_model_and_optimizer() + if self.cfg.algorithm.kl_beta > 0 and self.cfg.actor.get( + "combine_reference_model", True + ): + self.ref_policy_state_dict = retrieve_model_state_dict_in_cpu(self.model) + + if self.cfg.actor.get("enable_offload", False): + self.offload_fsdp_param_and_grad() + self.offload_fsdp_optimizer() + self._setup_rollout_weight_dst_ranks() + + def _setup_rollout_weight_dst_ranks(self) -> None: + """Setup destination ranks for token and weight communication.""" + rank_map = RankMapper.get_actor_rank_to_rollout_rank_map( + self._component_placement + ) + self._weight_dst_rank_in_rollout = rank_map[self._rank] + self.log_info( + f"Actor rank {self._rank} will send weights to {self._weight_dst_rank_in_rollout}" + ) + + def del_reshard_state_dict(self) -> None: + if hasattr(self, "rollout_state_dict"): + del self.rollout_state_dict + + def sync_model_to_rollout(self) -> None: + if self.cfg.actor.get("enable_offload", False): + self.offload_fsdp_optimizer() + + if next(self.model.parameters()).is_cpu: + self.load_fsdp_param_and_grad(self.device) + self.rollout_state_dict = self.get_model_state_dict() + + has_visual = any("visual." in k for k in self.rollout_state_dict.keys()) + + state_dict = {} + + if self._weight_dst_rank_in_rollout is not None: + for k, v in self.rollout_state_dict.items(): + name = k + if has_visual: + if name.startswith("model.language_model."): + name = "model." 
+ name[21:] + # NOTE: + # if transformers version is 4.56.1 or older(not tested), + # the following line should be uncommented + + # elif name.startswith("model."): + # name = name[6:] + state_dict[name] = reduce_tensor(v) + + self.send( + state_dict, self._rollout_group_name, self._weight_dst_rank_in_rollout + ) + + if self.cfg.actor.get("enable_offload", False): + self.offload_fsdp_param_and_grad() + + def compute_logprobs(self) -> None: + self.model.eval() + self.rollout_batch["logprob"] = self.rollout_batch["prev_logprobs"] + + def get_batch( + self, channel: Channel + ) -> Tuple[Dict[str, torch.Tensor], RolloutResult]: + result: RolloutResult = channel.get() + + batch = result.to_actor_batch( + self.cfg.data.max_prompt_length, + self.cfg.actor.model.encoder_seq_length, + self.tokenizer.eos_token_id, + ) + return batch, result + + def put_result(self, result: RolloutResult, channel: Channel) -> None: + if channel.is_local: + # Local channel, every process will put its own data locally + # No need to broadcast + channel.put(result) + else: + if self.is_data_io_rank: + channel.put(result) + + def _load_weight_and_optimizer(self, channel: Channel) -> None: + # Acquire the GPUs to ensure that no one is using them before loading models + # Otherwise, it may lead to OOM + with channel.device_lock: + if self.cfg.actor.get("enable_offload", False): + self.load_fsdp_param_and_grad(self.device) + self.load_fsdp_optimizer(self.device) + + @torch.no_grad() + def inference_step(self, batch: Dict[str, torch.Tensor]) -> torch.Tensor: + self.model.eval() + input_ids = batch["input_ids"] + attention_mask = batch["attention_mask"] + position_ids = batch["position_ids"] + + multi_modal_inputs = {} + if "multi_modal_inputs" in batch.keys(): + for key in batch["multi_modal_inputs"][0].keys(): + multi_modal_inputs[key] = torch.cat( + [inputs[key] for inputs in batch["multi_modal_inputs"]], + dim=0, + ).npu() + + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + use_cache=False, + **multi_modal_inputs, + ) + + logits = outputs.logits + logits = logits[:, -self.response_len - 1 : -1, :] + logits = logits / self.cfg.algorithm.sampling_params.temperature + + responses = input_ids[:, -self.response_len :] + logprobs = compute_logprobs_from_logits( + logits, responses, task_type=self.cfg.runner.task_type + ) + return logprobs + + def run_inference( + self, + input_channel: Channel, + output_channel: Channel, + rollout_channel: Channel, + compute_ref_logprobs: bool, + ) -> None: + """ + Compute prev/ref logprobs using the actor Model's forward. + + Args: + input_channel: The input channel to read from. + output_channel: The output channel to send results to. + rollout_channel: get the rollout channel's device lock in case of collision. + compute_ref_logprobs: Whether to compute reference logprobs. 
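+
+        A typical invocation (hypothetical channel objects) looks like:
+
+            actor.run_inference(
+                input_channel=rollout_output_channel,
+                output_channel=inference_channel,
+                rollout_channel=rollout_output_channel,
+                compute_ref_logprobs=cfg.algorithm.kl_beta > 0,
+            )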
+ """ + recv_batch_size = 0 + while recv_batch_size < self.total_batch_size_per_dp: + batch, rollout_result = self.get_batch(input_channel) + recv_batch_size += rollout_result.num_sequence + self._load_weight_and_optimizer( + input_channel if self.is_pipeline else rollout_channel + ) + + with self.worker_timer(): + prev_logprobs = self.inference_step(batch) + rollout_result.prev_logprobs = prev_logprobs.cpu() + + if compute_ref_logprobs: + assert self.ref_policy_state_dict is not None, ( + "Reference policy state dict is None but compute_ref_logprobs is True" + ) + with cpu_weight_swap(self.model, self.ref_policy_state_dict): + ref_logprobs = self.inference_step(batch) + rollout_result.ref_logprobs = ref_logprobs.cpu() + self.put_result(rollout_result, output_channel) + + assert recv_batch_size == self.total_batch_size_per_dp, ( + f"Expected {self.total_batch_size_per_dp} sequences from channel, but got {recv_batch_size}" + ) + + def run_training(self, input_channel: Channel) -> Tuple[Dict, list]: + # Get all batches for this DP + batches = [] + recv_batch_size = 0 + while recv_batch_size < self.total_batch_size_per_dp: + batch, rollout_result = self.get_batch(input_channel) + batches.append(batch) + recv_batch_size += rollout_result.num_sequence + assert recv_batch_size == self.total_batch_size_per_dp, ( + f"Expected {self.total_batch_size_per_dp} sequences from channel, but got {recv_batch_size}" + ) + batch = RolloutResult.merge_batches(batches) + # Must be called after batch is retrieved, which is when rollout has stopped + # Otherwise, loading model might cause OOM + self._load_weight_and_optimizer(input_channel) + + global_batches = get_iterator_k_split( + batch, + num_splits=self.cfg.algorithm.n_minibatches, + shuffle=self.cfg.algorithm.get("shuffle_rollout", True), + shuffle_seed=self.cfg.actor.seed, + ) + + self.model.train() + assert ( + self.cfg.actor.global_batch_size + % (self.cfg.actor.micro_batch_size * self._world_size) + == 0 + ) + + training_metrics_list = [] + # Global batch iterations + with self.worker_timer(): + for global_batch in global_batches: + train_global_batch_size = global_batch["input_ids"].shape[0] + + assert train_global_batch_size % self.cfg.actor.micro_batch_size == 0, ( + f"{train_global_batch_size=}, {self.cfg.actor.micro_batch_size=}" + ) + + self.gradient_accumulation = ( + train_global_batch_size // self.cfg.actor.micro_batch_size + ) + # split batch into micro_batches + train_micro_batches = get_iterator_k_split( + global_batch, + train_global_batch_size // self.cfg.actor.micro_batch_size, + ) + + self.optimizer.zero_grad() + metrics = {} + for idx, m_batch in enumerate(train_micro_batches): + backward_ctx = ( + self.model.no_sync() + if idx < self.gradient_accumulation - 1 + else nullcontext() + ) + for k, v in m_batch.items(): + m_batch[k] = v.npu() if isinstance(v, torch.Tensor) else v + + multi_modal_inputs = {} + if "multi_modal_inputs" in m_batch.keys(): + for key in m_batch["multi_modal_inputs"][0].keys(): + multi_modal_inputs[key] = torch.cat( + [ + inputs[key] + for inputs in m_batch["multi_modal_inputs"] + ], + dim=0, + ).npu() + + input_ids = m_batch["input_ids"] + attention_mask = m_batch["attention_mask"] + position_ids = m_batch["position_ids"] + prev_logprobs = m_batch["prev_logprobs"] + advantages = m_batch["advantages"] + ref_logprobs = None + if "ref_logprobs" in m_batch: + ref_logprobs = m_batch["ref_logprobs"] + + loss_mask = m_batch["attention_mask"][:, -self.response_len :] + output = self.model( + input_ids=input_ids, + 
attention_mask=attention_mask, + position_ids=position_ids, + **multi_modal_inputs, + use_cache=False, + ) + + logits = output.logits + + logits.div_(self.cfg.algorithm.sampling_params.temperature) + + responses = input_ids[:, -self.response_len :] + logits = logits[ + :, -self.response_len - 1 : -1, : + ] # (bsz, response_length, vocab_size) + logprobs = compute_logprobs_from_logits( + logits, responses, task_type=self.cfg.runner.task_type + ) + + clip_ratio = self.cfg.algorithm.ratio_clip_eps + clip_ratio_low = ( + self.cfg.algorithm.clip_ratio_low + if self.cfg.algorithm.clip_ratio_low is not None + else clip_ratio + ) + clip_ratio_high = ( + self.cfg.algorithm.clip_ratio_high + if self.cfg.algorithm.clip_ratio_high is not None + else clip_ratio + ) + clip_ratio_c = self.cfg.algorithm.get("clip_ratio_c", 3.0) + + loss, mbs_metrics_data = actor_loss( + loss_type=self.cfg.algorithm.loss_type, + loss_agg_func=self.loss_agg_func, + logprobs=logprobs, + old_logprobs=prev_logprobs, + advantages=advantages, + clip_ratio_low=clip_ratio_low, + clip_ratio_high=clip_ratio_high, + clip_ratio_c=clip_ratio_c, + loss_mask=loss_mask, + ) + + entropy_loss = torch.tensor(0.0, device=torch.npu.current_device()) + if self.calculate_entropy: + entropy = output["entropy"][ + :, -self.response_len - 1 : -1 + ].contiguous() + entropy_loss = self.loss_agg_func(entropy, mask=loss_mask) + if self.calculate_entropy_loss: + loss = ( + loss - self.cfg.algorithm.entropy_bonus * entropy_loss + ) + + kl_loss = torch.tensor(0.0, device=torch.npu.current_device()) + if self.kl_beta > 0 and ref_logprobs is not None: + kld = kl_penalty(ref_logprobs, logprobs, self.kl_penalty_type) + kl_loss = self.loss_agg_func(kld, loss_mask) + loss = loss + kl_loss * self.kl_beta + + # add to log + # scale loss for gradient accumulation and backprop + loss = loss / self.gradient_accumulation + with backward_ctx: + loss.backward() + + mbs_metrics_data.update( + { + "final_loss": loss.detach(), + "entropy_loss": entropy_loss.detach(), + "kl_loss": kl_loss.detach(), + } + ) + + append_to_dict(metrics, mbs_metrics_data) + # apply gradient clipping and optimizer step at the end of a global batch + grad_norm = self.model.clip_grad_norm_( + max_norm=self.cfg.actor.optim.clip_grad + ) + if not torch.isfinite(grad_norm).all(): + self.log_warning( + "grad norm is not finite, skip this optimizer step." 
+                    )
+                else:
+                    self.optimizer.step()
+                    self.optimizer.zero_grad()
+
+                # aggregate metrics across micro-batches
+                mean_metric_dict = {
+                    key: torch.mean(torch.stack(value))
+                    for key, value in metrics.items()
+                }
+                mean_metric_dict = all_reduce_dict(
+                    mean_metric_dict, op=torch.distributed.ReduceOp.AVG
+                )
+                # add optimizer stats
+                if torch.is_tensor(grad_norm):
+                    mean_metric_dict["actor/grad_norm"] = float(
+                        grad_norm.detach().item()
+                    )
+                else:
+                    mean_metric_dict["actor/grad_norm"] = float(grad_norm)
+                lr = self.optimizer.param_groups[0]["lr"]
+                mean_metric_dict["actor/lr"] = torch.as_tensor(lr).float().cpu()
+                training_metrics_list.append(mean_metric_dict)
+
+        # Rollout metrics
+        rollout_metrics, _, _ = compute_math_rollout_metrics(
+            batch, self.cfg.data.max_prompt_length, self.response_len, self._world_size
+        )
+
+        return rollout_metrics, training_metrics_list
+
+    def save_checkpoint(self, save_base_path: str, step: int) -> None:
+        torch.distributed.barrier()
+        model_state = self.get_model_state_dict()
+        optim_state = self.get_optimizer_state_dict()
+        if self._rank == 0:
+            os.makedirs(save_base_path, exist_ok=True)
+            torch.save(model_state, os.path.join(save_base_path, "model.pt"))
+            torch.save(optim_state, os.path.join(save_base_path, "optim.pt"))
+        torch.distributed.barrier()
+
+    # Advantages and returns
+    def compute_advantages_and_returns(
+        self, input_channel: Channel, output_channel: Channel
+    ) -> None:
+        """Compute the advantages and returns.
+
+        Args:
+            input_channel: The input channel to read from.
+            output_channel: The output channel to send results to.
+        """
+        recv_batch_size = 0
+        while recv_batch_size < self.total_batch_size_per_dp:
+            batch, rollout_result = self.get_batch(input_channel)
+            recv_batch_size += rollout_result.num_sequence
+
+            with self.worker_timer():
+                if rollout_result.advantages is None:
+                    mask = batch["attention_mask"][:, -self.response_len :]
+                    advantages, returns = calculate_adv_and_returns(
+                        adv_type=self.cfg.algorithm.adv_type,
+                        reward_scores=batch["rewards"].npu(),
+                        mask=mask.npu(),
+                        num_responses=self.cfg.algorithm.group_size,
+                    )
+                    rollout_result.advantages = advantages.cpu()
+
+            self.put_result(rollout_result, output_channel)
+
+        assert recv_batch_size == self.total_batch_size_per_dp, (
+            f"Expected {self.total_batch_size_per_dp} sequences from channel, but got {recv_batch_size}"
+        )
+
+
+class EmbodiedFSDPActor(FSDPModelManager, Worker):
+    def __init__(self, cfg: DictConfig):
+        Worker.__init__(self)
+        super().__init__(cfg.actor)
+
+        self.cfg = cfg
+        torch.npu.set_device(int(os.environ["LOCAL_RANK"]))
+        self.device = torch.npu.current_device()
+        world_size = self._world_size
+        self.device_mesh = init_device_mesh(
+            "npu", mesh_shape=(world_size,), mesh_dim_names=["fsdp"]
+        )
+
+        self._env_group_name = cfg.env.group_name
+        self._rollout_group_name = cfg.rollout.group_name
+        self._component_placement = HybridComponentPlacement(cfg, Cluster())
+        self._weight_dst_rank_in_rollout = self._rank
+        if self._weight_dst_rank_in_rollout >= self._component_placement.get_world_size(
+            "rollout"
+        ):
+            self._weight_dst_rank_in_rollout = None
+
+        self._obs_queue_name = cfg.env.channel.queue_name
+        self._action_queue_name = cfg.rollout.channel.queue_name
+        self._replay_buffer_name = cfg.actor.channel.queue_name
+        # stage_num: defaults to 2; used for the pipelined rollout process
+        self.stage_num = cfg.rollout.pipeline_stage_num
+
+        self.channel = self.connect_channel(cfg.actor.channel.name)
+        self.channel.create_queue(
+            cfg.actor.channel.queue_name,
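+            # replay-buffer queue shared between the rollout and actor workers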
+            maxsize=cfg.actor.channel.queue_size,
+        )
+
+    def init_worker(self):
+        self.setup_model_and_optimizer()
+        if self.cfg.actor.get("enable_offload", False):
+            self.offload_fsdp_param_and_grad()
+            self.offload_fsdp_optimizer()
+        torch.npu.synchronize()
+        gc.collect()
+        torch.npu.empty_cache()
+
+    def model_provider_func(self):
+        model = get_model(self.cfg.actor.checkpoint_load_path, self.cfg.actor.model)
+        if model is not None:
+            return model
+        return super().model_provider_func()
+
+    def sync_model_to_rollout(self):
+        if next(self.model.parameters()).is_cpu:
+            self.load_fsdp_param_and_grad(self.device)
+            self.load_fsdp_optimizer(self.device)
+
+        state_dict = self.get_model_state_dict()
+        if self._weight_dst_rank_in_rollout is not None:
+            self.send(
+                state_dict, self._rollout_group_name, self._weight_dst_rank_in_rollout
+            )
+        if self.cfg.actor.get("enable_offload", False):
+            self.offload_fsdp_param_and_grad()
+            self.offload_fsdp_optimizer()
+        torch.npu.synchronize()
+        del state_dict
+        gc.collect()
+        torch.npu.empty_cache()
+
+    async def recv_rollout_batch(self):
+        send_num = self._component_placement.get_world_size("rollout") * self.stage_num
+        recv_num = self._component_placement.get_world_size("actor")
+        split_num = compute_split_num(send_num, recv_num)
+
+        self.rollout_batch = {}
+        recv_list = []
+        for i in range(split_num):
+            recv_list.append(
+                await self.channel.get(
+                    queue_name=self._replay_buffer_name, async_op=True
+                ).async_wait()
+            )
+
+        # shape [num_chunk, bsz, chunk_size], cat dim 1
+        for key in recv_list[0].keys():
+            if "env_info/" not in key:
+                self.rollout_batch[key] = torch.cat(
+                    [recv_list[i][key] for i in range(split_num)], dim=1
+                )
+            else:
+                self.rollout_batch[key] = torch.cat(
+                    [recv_list[i][key] for i in range(split_num)], dim=0
+                )
+
+        self.rollout_batch = self._process_received_rollout_batch(self.rollout_batch)
+
+    def _process_received_rollout_batch(self, rollout_batch):
+        """
+        original shape: [rollout_epoch x n_chunk_steps, bsz, num_action_chunks, ...]
+        target shape:   [n_chunk_steps, rollout_epoch x bsz, num_action_chunks, ...]
+        """
+        rollout_epoch = self.cfg.algorithm.rollout_epoch
+        for key, value in rollout_batch.items():
+            new_value = value.reshape(
+                rollout_epoch, -1, *value.shape[1:]
+            )  # [rollout_epoch, n_chunk_step, bsz, ...]
+            new_value = new_value.transpose(
+                0, 1
+            )  # [n_chunk_step, rollout_epoch, bsz, ...]
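+            # Worked example under assumed sizes: with rollout_epoch=2,
+            # n_chunk_steps=4, bsz=3, a [8, 3, ...] tensor becomes
+            # [2, 4, 3, ...] -> [4, 2, 3, ...] -> [4, 6, ...], folding the
+            # rollout epochs into the batch dimension at each chunk step.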
+            new_value = new_value.reshape(new_value.shape[0], -1, *new_value.shape[3:])
+            rollout_batch[key] = new_value
+
+        if (
+            not self.cfg.env.train.auto_reset
+            and not self.cfg.env.train.ignore_terminations
+        ):
+            dones = rollout_batch[
+                "dones"
+            ]  # [n_chunk_step, rollout_epoch x bsz, num_action_chunks]
+            loss_mask, loss_mask_sum = compute_loss_mask(dones)
+
+            if self.cfg.algorithm.reward_type == "chunk_level":
+                loss_mask = loss_mask.any(dim=-1, keepdim=True)
+                loss_mask_sum = loss_mask_sum[..., -1:]
+
+            rollout_batch["loss_mask"] = loss_mask
+            rollout_batch["loss_mask_sum"] = loss_mask_sum
+
+        # filter data by rewards
+        if self.cfg.algorithm.get("filter_rewards", False):
+            rewards = rollout_batch[
+                "rewards"
+            ]  # [n_chunk_step, batch, num_action_chunks]
+            if rollout_batch.get("loss_mask", None) is not None:
+                rewards = rewards * rollout_batch["loss_mask"]
+            n_chunk_step, batch_size, num_action_chunks = rewards.shape
+
+            group_size = self.cfg.algorithm.group_size
+            assert batch_size % group_size == 0, (
+                f"batch {batch_size} not divisible by group_size {group_size}"
+            )
+            n_prompts = batch_size // group_size
+
+            # compute per-prompt rewards
+            rewards = rewards.transpose(
+                0, 1
+            )  # [batch, n_chunk_step, num_action_chunks]
+            rewards = rewards.reshape(rewards.shape[0], -1)  # [batch, n_step]
+            reward_matrix = rewards.reshape(
+                n_prompts, group_size, rewards.shape[-1]
+            )  # [n_prompts, group_size, n_step]
+            reward_matrix = reward_matrix.sum(dim=-1)  # [n_prompts, group_size]
+            mean_reward_in_group = reward_matrix.mean(dim=1)  # [n_prompts]
+
+            # keep prompts whose group-mean reward lies within the configured bounds
+            reward_filter_mask = (
+                mean_reward_in_group >= self.cfg.algorithm.rewards_lower_bound
+            ) & (
+                mean_reward_in_group <= self.cfg.algorithm.rewards_upper_bound
+            )  # [n_prompts]
+
+            # broadcast the mask back to [n_chunk_step, batch, 1]
+            reward_filter_mask = reward_filter_mask.repeat_interleave(
+                group_size
+            )  # [batch]
+            reward_filter_mask = (
+                reward_filter_mask.unsqueeze(0).expand(n_chunk_step, -1).unsqueeze(-1)
+            )  # [n_chunk_step, batch, 1]
+
+            # update loss_mask
+            if rollout_batch.get("loss_mask", None) is not None:
+                rollout_batch["loss_mask"] = (
+                    reward_filter_mask & rollout_batch["loss_mask"]
+                )
+            else:
+                rollout_batch["loss_mask"] = reward_filter_mask
+
+        return rollout_batch
+
+    def compute_logprobs(self):
+        self.model.eval()
+        self.rollout_batch["logprob"] = self.rollout_batch["prev_logprobs"]
+
+    def compute_advantages_and_returns(self):
+        stage_num = self.cfg.rollout.pipeline_stage_num
+        env_world_size = self._component_placement.get_world_size("env")
+        actor_world_size = self._component_placement.get_world_size("actor")
+        num_group_envs_for_train = (
+            self.cfg.algorithm.num_group_envs
+            * stage_num
+            * env_world_size
+            // actor_world_size
+        )
+
+        kwargs = {
+            "adv_type": self.cfg.algorithm.adv_type,
+            "rewards": self.rollout_batch["rewards"],
+            "dones": self.rollout_batch["dones"],
+            "normalize_advantages": self.cfg.algorithm.get(
+                "normalize_advantages", True
+            ),
+            "values": self.rollout_batch.get("prev_values", None),
+            "gamma": self.cfg.algorithm.get("gamma", 1),
+            "gae_lambda": self.cfg.algorithm.get("gae_lambda", 1),
+            "num_group_envs": num_group_envs_for_train,
+            "group_size": self.cfg.algorithm.get("group_size", 8),
+            "reward_type": self.cfg.algorithm.reward_type,
+            "loss_mask": self.rollout_batch.get("loss_mask", None),
+            "rollout_epoch": self.cfg.algorithm.get("rollout_epoch", 1),
+        }
+        kwargs = preprocess_advantages_inputs(**kwargs)
+        advantages, returns = calculate_adv_and_returns(**kwargs)
+
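+        # Assumed invariant: advantages/returns mirror the rewards layout
+        # ([n_chunk_step, batch, ...]) so they can be stored with the other
+        # rollout tensors and sliced identically during training.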
+        self.rollout_batch.update({"advantages": advantages, "returns": returns})
+        rollout_metrics = compute_rollout_metrics(self.rollout_batch)
+        return rollout_metrics
+
+    def run_training(self):
+        if self.cfg.actor.get("enable_offload", False):
+            self.load_fsdp_param_and_grad(self.device)
+            self.load_fsdp_optimizer(self.device)
+
+        self.model.train()
+        self.optimizer.zero_grad()
+        rollout_size = (
+            self.rollout_batch["input_ids"].shape[0]
+            * self.rollout_batch["input_ids"].shape[1]
+        )
+        shuffle_id = torch.randperm(rollout_size)
+
+        for key, value in self.rollout_batch.items():
+            self.log_on_first_rank(f"run training, {key}: {value.shape}")
+
+        with torch.no_grad():
+            for key, value in self.rollout_batch.items():
+                if key in ["dones", "prev_values"]:
+                    value = value[:-1]
+                if "env_info" in key:
+                    continue
+                value = value.reshape(rollout_size, *value.shape[2:])
+                self.rollout_batch[key] = value[shuffle_id]
+
+        assert (
+            self.cfg.actor.global_batch_size
+            % (self.cfg.actor.micro_batch_size * self._world_size)
+            == 0
+        )
+
+        self.gradient_accumulation = (
+            self.cfg.actor.global_batch_size
+            // self.cfg.actor.micro_batch_size
+            // self._world_size
+        )
+
+        # Split into a minibatch iterator for updating the actor.
+        # See the PPO paper for details: https://arxiv.org/abs/1707.06347
+        rollout_size = self.rollout_batch["input_ids"].size(0)
+        batch_size_per_rank = self.cfg.actor.global_batch_size // self._world_size
+        assert rollout_size % batch_size_per_rank == 0, (
+            f"{rollout_size} is not divisible by {batch_size_per_rank}"
+        )
+        rollout_dataloader_iter = get_iterator_k_split(
+            self.rollout_batch,
+            rollout_size // batch_size_per_rank,
+        )
+
+        metrics = {}
+        for _, train_global_batch in tqdm(
+            enumerate(rollout_dataloader_iter), desc="get loss and metrics"
+        ):
+            # split the global batch into micro-batches
+            train_global_batch_size = train_global_batch["input_ids"].shape[0]
+            assert (
+                train_global_batch_size
+                == self.cfg.actor.global_batch_size
+                // torch.distributed.get_world_size()
+            )
+            assert train_global_batch_size % self.cfg.actor.micro_batch_size == 0, (
+                f"{train_global_batch_size=}, {self.cfg.actor.micro_batch_size=}"
+            )
+            train_micro_batch = get_iterator_k_split(
+                train_global_batch,
+                train_global_batch_size // self.cfg.actor.micro_batch_size,
+            )
+
+            self.optimizer.zero_grad()
+            for data_idx, data in enumerate(train_micro_batch):
+                for k, v in data.items():
+                    data[k] = v.to(f"npu:{int(os.environ['LOCAL_RANK'])}")
+
+                data = self.model.preprocess_for_train(data)
+                input_ids = data["input_ids"]
+                action_tokens = data["action_tokens"]
+                attention_mask = data["attention_mask"]
+                pixel_values = data["pixel_values"]
+
+                action_token_len = self.model.action_dim * self.model.num_action_chunks
+
+                logits_processor_args = {
+                    "action_tokens": action_tokens,
+                    "vocab_size": self.model.vocab_size,
+                    "n_action_bins": self.model.config.n_action_bins,
+                }
+
+                output_dict = custom_forward(
+                    self.model,
+                    input_ids=input_ids,
+                    attention_mask=attention_mask,
+                    pixel_values=pixel_values,
+                    action_token_len=action_token_len,
+                    value_model=True
+                    if self.cfg.algorithm.adv_type == "embodied_gae"
+                    else False,
+                    value_head_mode=self.cfg.actor.model.get("vh_mode", None),
+                    temperature=self.cfg.algorithm.sampling_params.temperature_train,
+                    top_k=self.cfg.algorithm.sampling_params.top_k,
+                    logits_processor_args=logits_processor_args,
+                )
+
+                kwargs = {
+                    "loss_type": self.cfg.algorithm.loss_type,
+                    "logprob_type": self.cfg.algorithm.logprob_type,
+                    "entropy_type": self.cfg.algorithm.entropy_type,
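+                    # remaining entries: action geometry, model outputs, and
+                    # rollout statistics consumed by actor_loss after
+                    # preprocess_loss_inputs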
"single_action_dim": self.model.action_dim, + "logprobs": output_dict["logprobs"], + "entropy": output_dict["entropy"], + "values": output_dict.get("values", None), + "old_logprobs": data["prev_logprobs"], + "advantages": data["advantages"], + "returns": data["returns"], + "prev_values": data["prev_values"], + "clip_ratio_high": self.cfg.algorithm.clip_ratio_high, + "clip_ratio_low": self.cfg.algorithm.clip_ratio_low, + "value_clip": self.cfg.algorithm.get("value_clip", None), + "huber_delta": self.cfg.algorithm.get("huber_delta", None), + "entropy_bonus": self.cfg.algorithm.entropy_bonus, + "loss_mask": data.get("loss_mask", None), + "loss_mask_sum": data.get("loss_mask_sum", None), + "max_episode_steps": self.cfg.env.train.max_episode_steps, + } + + kwargs = preprocess_loss_inputs(**kwargs) + + loss, metrics_data = actor_loss(**kwargs) + + loss /= self.gradient_accumulation + loss.backward() + + metrics_data["loss"] = loss.detach().item() + append_to_dict(metrics, metrics_data) + + torch.npu.empty_cache() + + grad_norm = self.model.clip_grad_norm_( + max_norm=self.cfg.actor.optim.clip_grad + ) + self.optimizer.step() + + self.optimizer.zero_grad() + data = { + "actor/grad_norm": grad_norm.detach().item(), + "actor/lr": self.optimizer.param_groups[0]["lr"], + } + if self.cfg.algorithm.adv_type == "embodied_gae": + data["critic/lr"] = self.optimizer.param_groups[1]["lr"] + append_to_dict(metrics, data) + + mean_metric_dict = {key: np.mean(value) for key, value in metrics.items()} + mean_metric_dict = all_reduce_dict( + mean_metric_dict, op=torch.distributed.ReduceOp.AVG + ) + + self.optimizer.zero_grad() + torch.npu.synchronize() + torch.distributed.barrier() + torch.npu.empty_cache() + + return mean_metric_dict + + def save_checkpoint(self, save_base_path, step): + torch.distributed.barrier() + model_state = self.get_model_state_dict() + optim_state = self.get_optimizer_state_dict() + if self._rank == 0: + os.makedirs(save_base_path, exist_ok=True) + torch.save(model_state, os.path.join(save_base_path, "model.pt")) + torch.save(optim_state, os.path.join(save_base_path, "optim.pt")) + torch.distributed.barrier() diff --git a/rlinf/workers/rollout/sglang/__init__.py b/rlinf/workers/rollout/sglang/__init__.py index 5e0fb219f..cc78162b9 100644 --- a/rlinf/workers/rollout/sglang/__init__.py +++ b/rlinf/workers/rollout/sglang/__init__.py @@ -49,6 +49,12 @@ def get_version(pkg): from rlinf.hybrid_engines.sglang.sglang_0_4_9.sgl_engine import ( Engine, ) +elif package_version >= parse("0.5.0") and package_version < parse("0.5.3"): + sglang_version = package_version + from rlinf.hybrid_engines.sglang.sglang_0_5_2 import io_struct + from rlinf.hybrid_engines.sglang.sglang_0_5_2.sgl_engine import ( + Engine, + ) else: raise ValueError(f"sglang version {package_version} not supported") diff --git a/rlinf/workers/rollout/sglang/sglang_worker.py b/rlinf/workers/rollout/sglang/sglang_worker.py index 4d5c51552..e58027ba5 100644 --- a/rlinf/workers/rollout/sglang/sglang_worker.py +++ b/rlinf/workers/rollout/sglang/sglang_worker.py @@ -113,6 +113,8 @@ def _init_engine(self): log_level="info", max_running_requests=self._cfg.rollout.max_running_requests, dist_init_addr=f"127.0.0.1:{str(Cluster.find_free_port())}", + device="npu", + watchdog_timeout=3600, ) self.log_on_first_rank(f"{server_args=}") @@ -176,6 +178,7 @@ def rollout(self, input_channel: Channel, output_channel: Channel): for request in requests: # Generate outputs using the SGLang engine. 
             with self.worker_timer():
+                self.log_info(f"Generating {len(request.input_ids)} samples...")
                 results = self._engine.generate(
                     input_ids=request.input_ids,
                     # 0.4.4 has a modality bug; can't pass non-None image_data
@@ -184,6 +187,8 @@ def rollout(self, input_channel: Channel, output_channel: Channel):
                     return_logprob=self._return_logprobs,
                 )
 
+            self.log_info(f"Generation for {len(request.input_ids)} samples done.")
+
             # Create RolloutResult from the outputs.
             rollout_result = RolloutResult.from_sglang_results(
                 results,
diff --git a/rlinf/workers/rollout/vllm/vllm_worker.py b/rlinf/workers/rollout/vllm/vllm_worker.py
index 3d36b9a44..7324fe841 100644
--- a/rlinf/workers/rollout/vllm/vllm_worker.py
+++ b/rlinf/workers/rollout/vllm/vllm_worker.py
@@ -337,7 +337,8 @@ async def init_worker(self) -> None:
             trust_remote_code=self._cfg.actor.tokenizer.trust_remote_code,
             max_model_len=self._cfg.runner.seq_length,
             max_num_seqs=self._cfg.rollout.max_running_requests,
-            enable_sleep_mode=True,  # it enables offload weights
+            enable_sleep_mode=False,
+            device="npu",
         )
 
         vllm_config: VllmConfig = engine_args.create_engine_config()
diff --git a/test.py b/test.py
new file mode 100644
index 000000000..485f51f99
--- /dev/null
+++ b/test.py
@@ -0,0 +1,5 @@
+from safetensors import safe_open
+with safe_open("/home/weight/Qwen2.5-VL-3B-Instruct/model-00001-of-00002.safetensors", framework="pt") as f:
+    print(f.keys())
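+# NOTE: ad-hoc debugging helper; the hard-coded shard path above is
+# machine-specific and assumed to exist only in the development environment.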