From 57cce29d655be265d87ec5c027e9f8f7fd7e3ff0 Mon Sep 17 00:00:00 2001
From: xyuzh <xinyzng@gmail.com>
Date: Sun, 22 Feb 2026 20:25:32 -0800
Subject: [PATCH 1/7] Add Miles Qwen3-8B GRPO training example on H100

Single-node RL training of Qwen3-8B with GRPO on 8x H100-80GB using
Anyscale. Includes Dockerfile, job config, and entrypoint script that
handles model download, weight conversion, and async GRPO training
with Megatron backend (TP=2, DP=2) and 3 SGLang rollout engines.
---
 miles_qwen3_8b_h100/Dockerfile.anyscale | 118 +++++++++++++++++
 miles_qwen3_8b_h100/README.md           |  77 +++++++++++
 miles_qwen3_8b_h100/entrypoint.sh       | 168 ++++++++++++++++++++++++
 miles_qwen3_8b_h100/job.yaml            |  33 +++++
 4 files changed, 396 insertions(+)
 create mode 100644 miles_qwen3_8b_h100/Dockerfile.anyscale
 create mode 100644 miles_qwen3_8b_h100/README.md
 create mode 100755 miles_qwen3_8b_h100/entrypoint.sh
 create mode 100644 miles_qwen3_8b_h100/job.yaml

diff --git a/miles_qwen3_8b_h100/Dockerfile.anyscale b/miles_qwen3_8b_h100/Dockerfile.anyscale
new file mode 100644
index 0000000..72640ea
--- /dev/null
+++ b/miles_qwen3_8b_h100/Dockerfile.anyscale
@@ -0,0 +1,118 @@
+FROM anyscale/ray:2.54.0-py312-cu129
+
+ARG PATCH_VERSION=latest
+ARG MEGATRON_COMMIT=3714d81d418c9f1bca4594fc35f9e8289f652862
+ARG SGLANG_COMMIT=24c91001cf99ba642be791e099d358f4dfe955f5
+ARG MILES_REF=main
+
+# Anyscale base image runs as non-root; switch to root for system installs.
+USER root
+WORKDIR /root
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends git rsync dnsutils nvtop && \
+    rm -rf /var/lib/apt/lists/*
+
+# Keep pip tooling current and pin numpy to 1.x for Megatron compatibility.
+RUN python -m pip install --upgrade pip setuptools wheel && \
+    python -m pip install "numpy<2" huggingface_hub
+
+# Pin PyTorch 2.9.1 — matches sgl_kernel from PyPI (compiled for torch 2.9.x)
+# and has a pre-built flash-attn 2.8.3 wheel available.
+RUN python -m pip install torch==2.9.1 torchvision torchaudio \
+    --index-url https://download.pytorch.org/whl/cu128
+
+# Pre-built flash-attn wheel for torch 2.9 + cu12 (source compilation
+# exceeds Anyscale's ~60 min build timeout).
+RUN python -m pip install https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3%2Bcu12torch2.9cxx11abiTRUE-cp312-cp312-linux_x86_64.whl
+
+# Apex: install Python-only (no CUDA extensions) to stay within Anyscale's
+# ~60 min build timeout.  Megatron falls back to PyTorch-native kernels.
+RUN git clone --filter=blob:none https://github.com/NVIDIA/apex.git /tmp/apex && \
+    cd /tmp/apex && \
+    git checkout 10417aceddd7d5d05d7cbf7b0fc2daad1105f8b4 && \
+    python -m pip install --disable-pip-version-check --no-cache-dir \
+    --no-build-isolation . && \
+    rm -rf /tmp/apex
+
+# Install SGLang from source.  sgl_kernel comes from PyPI, pre-compiled
+# for torch 2.9.x — no need to rebuild from source.
+RUN git clone https://github.com/sgl-project/sglang.git /root/sglang && \
+    cd /root/sglang && \
+    git checkout ${SGLANG_COMMIT} && \
+    python -m pip install -e "python[all]"
+
+# Install Megatron-LM from source.
+RUN git clone --recursive https://github.com/NVIDIA/Megatron-LM.git /root/Megatron-LM && \
+    cd /root/Megatron-LM && \
+    git checkout ${MEGATRON_COMMIT} && \
+    python -m pip install -e .
+
+# Pull Miles source for patches and dependency manifests.
+RUN git clone https://github.com/radixark/miles.git /tmp/miles && \
+    cd /tmp/miles && \
+    git checkout ${MILES_REF}
+
+# Apply SGLang patch.
+RUN cd /root/sglang && \
+    cp /tmp/miles/docker/patch/${PATCH_VERSION}/sglang.patch ./sglang.patch && \
+    git update-index --refresh && \
+    git apply sglang.patch --3way && \
+    if grep -R -n '^<<<<<<< ' .; then \
+      echo "SGLang patch failed to apply cleanly. Please resolve conflicts." && \
+      exit 1; \
+    fi && \
+    rm sglang.patch
+
+# Apply Megatron-LM patch.
+RUN cd /root/Megatron-LM && \
+    cp /tmp/miles/docker/patch/${PATCH_VERSION}/megatron.patch ./megatron.patch && \
+    git update-index --refresh && \
+    git apply megatron.patch --3way && \
+    if grep -R -n '^<<<<<<< ' .; then \
+      echo "Megatron patch failed to apply cleanly. Please resolve conflicts." && \
+      exit 1; \
+    fi && \
+    rm megatron.patch
+
+# Install Miles dependencies.
+RUN python -m pip install git+https://github.com/ISEEKYAN/mbridge.git@89eb10887887bc74853f89a4de258c0702932a1c --no-deps && \
+    python -m pip install git+https://github.com/fzyzcjy/torch_memory_saver.git@dc6876905830430b5054325fa4211ff302169c6b --no-cache-dir --force-reinstall && \
+    python -m pip install git+https://github.com/fzyzcjy/Megatron-Bridge.git@dev_rl --no-build-isolation && \
+    python -m pip install "nvidia-modelopt[torch]>=0.37.0" --no-build-isolation
+
+# Make MXFP8 quantizer import conditional — mxfp8_group_quantize was added
+# in a newer SGLang than our pinned commit. Not needed for Qwen3-8B training.
+RUN python -c "\
+import pathlib; \
+p = pathlib.Path('/tmp/miles/miles/backends/megatron_utils/megatron_to_hf/processors/quantizer_mxfp8.py'); \
+t = p.read_text(); \
+t = t.replace( \
+    'from sglang.srt.layers.quantization.fp8_utils import mxfp8_group_quantize', \
+    'try:\\n    from sglang.srt.layers.quantization.fp8_utils import mxfp8_group_quantize\\nexcept ImportError:\\n    mxfp8_group_quantize = None' \
+); \
+p.write_text(t)"
+
+# Install Miles itself.
+RUN python -m pip install -r /tmp/miles/requirements.txt && \
+    python -m pip install -e /tmp/miles --no-deps && \
+    cd /tmp/miles/miles/backends/megatron_utils/kernels/int4_qat && \
+    python -m pip install . --no-build-isolation
+
+# Re-pin PyTorch 2.9.1 and reinstall flash-attn + TE at the end.
+# Earlier installs may have upgraded torch, breaking pre-built binary wheels.
+RUN python -c "import torch; print(f'Before re-pin: PyTorch {torch.__version__}')"
+RUN python -m pip install torch==2.9.1 torchvision torchaudio \
+    --index-url https://download.pytorch.org/whl/cu128
+RUN python -m pip install --force-reinstall --no-deps \
+    https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3%2Bcu12torch2.9cxx11abiTRUE-cp312-cp312-linux_x86_64.whl
+RUN python -m pip install --no-build-isolation "transformer_engine[pytorch]==2.10.0"
+
+# Verify torch + flash-attn ABI compatibility.
+# sgl_kernel is skipped here — it requires libcuda.so.1 (GPU hardware) to import.
+RUN python -c "\
+import torch; print(f'PyTorch: {torch.__version__}, CUDA: {torch.version.cuda}'); \
+assert torch.__version__.startswith('2.9'), f'Expected 2.9.x, got {torch.__version__}'; \
+from flash_attn import flash_attn_func; print('flash-attn OK')"
+
+WORKDIR /tmp/miles
diff --git a/miles_qwen3_8b_h100/README.md b/miles_qwen3_8b_h100/README.md
new file mode 100644
index 0000000..083b7b2
--- /dev/null
+++ b/miles_qwen3_8b_h100/README.md
@@ -0,0 +1,77 @@
+# Qwen3-8B GRPO Training on Anyscale (H100)
+
+Single-node RL training of Qwen3-8B with GRPO on **8x H100-80GB** using Anyscale, following the pattern from [anyscale/examples#43](https://github.com/anyscale/examples/pull/43).
+
+## Cluster Layout
+
+```
+Head node (m5.2xlarge):  driver only, no GPUs
+Worker 0 (8x H100-80GB):
+  GPU 0-3: Training (TP=2, DP=2)
+  GPU 4-7: Rollout (3 SGLang engines + 1 driver)
+```
+
+- **Training**: 4 GPUs — TP=2 x DP=2 (Megatron backend)
+- **Rollout**: 3 GPUs — disaggregated SGLang inference, 1 GPU per engine (1 GPU reserved for driver)
+- **Algorithm**: GRPO with DAPO-style asymmetric clipping
+- **Dataset**: DAPO-Math-17k (integer math, deterministic reward)
+
+## Files
+
+| File | Description |
+|------|-------------|
+| `job.yaml` | Anyscale job config (`m5.2xlarge` head + 1x `p5.48xlarge` worker) |
+| `Dockerfile.anyscale` | Docker image with Miles, Megatron-LM, SGLang, flash-attn, TE |
+| `entrypoint.sh` | Downloads model/data, converts weights, runs async GRPO training |
+
+## Quick Start
+
+```bash
+pip install -U anyscale
+anyscale login
+
+cd examples/anyscale_qwen3_8b_h100
+anyscale job submit -f job.yaml
+```
+
+The entrypoint automatically:
+1. Downloads `Qwen/Qwen3-8B` and `zhuzilin/dapo-math-17k` to `/mnt/cluster_storage`
+2. Converts HF weights to Megatron torch_dist format (on GPU worker)
+3. Runs async GRPO training with `dapo` reward model via `train_async.py`
+
+## Key Differences from the Slime A10G Example (PR #43)
+
+| | Slime A10G (PR #43) | This Example |
+|---|---|---|
+| GPUs | 2x4 A10G (24GB) | 1x8 H100 (80GB) |
+| Model | Qwen3-1.7B | Qwen3-8B |
+| Training | `train.py` (sync) | `train_async.py` (pipelined async) |
+| Parallelism | TP=2, PP=2 across nodes | TP=2, DP=2, single node |
+| A10G patches | sgl_kernel, Triton, multi_platform | Not needed (H100 = SM90) |
+| Batch size | 64 (16 prompts x 4 samples) | 256 (32 prompts x 8 samples) |
+| Max tokens/GPU | 4096 | 9216 |
+| Attention | FA2 only (Ampere) | FA2 (FA3 available with custom image) |
+
+## Verification
+
+A successful run shows:
+- SGLang engine startup on rollout GPUs
+- Weight conversion completes (first run only)
+- Training loss values printed each step
+- Reward gradually increasing over rollouts
+- Weight sync between training and rollout engines
+
+## If You Hit OOM
+
+**Training GPUs:**
+1. `--max-tokens-per-gpu` -> `4096`
+2. `--rollout-max-response-len` -> `4096`
+3. `--n-samples-per-prompt` -> `4` and `--global-batch-size` -> `128`
+
+**Rollout GPUs:**
+1. `--sglang-mem-fraction-static` -> `0.5`
+2. Add `--sglang-chunked-prefill-size 4096`
+
+## View the Job
+
+View the job in the [jobs tab](https://console.anyscale.com/jobs) of the Anyscale console.
diff --git a/miles_qwen3_8b_h100/entrypoint.sh b/miles_qwen3_8b_h100/entrypoint.sh
new file mode 100755
index 0000000..505ee89
--- /dev/null
+++ b/miles_qwen3_8b_h100/entrypoint.sh
@@ -0,0 +1,168 @@
+#!/bin/bash
+# Anyscale entrypoint: Qwen3-8B GRPO training on 1 worker × 8x H100-80GB
+# Downloads model/dataset, converts weights, and runs async RL training.
+#
+# Head node (m5.2xlarge): driver only, no GPUs
+# Layout (GPU worker):
+#   Worker 0 (8x H100):
+#     GPU 0-3: Training (TP=2, DP=2)
+#     GPU 4-7: Rollout (4 SGLang engines, 1 GPU each)
+
+set -ex
+
+export PYTHONBUFFERED=16
+STORAGE=/mnt/cluster_storage
+
+SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
+
+# Qwen3-8B model architecture args (from scripts/models/qwen3-8B.sh)
+MODEL_ARGS=(
+   --swiglu
+   --num-layers 36
+   --hidden-size 4096
+   --ffn-hidden-size 12288
+   --num-attention-heads 32
+   --group-query-attention
+   --num-query-groups 8
+   --use-rotary-position-embeddings
+   --disable-bias-linear
+   --normalization "RMSNorm"
+   --norm-epsilon 1e-6
+   --rotary-base 1000000
+   --vocab-size 151936
+   --kv-channels 128
+   --qk-layernorm
+   --untie-embeddings-and-output-weights
+)
+
+# ======================== Step 1: Download model & dataset ========================
+
+echo "=== Downloading model ==="
+huggingface-cli download Qwen/Qwen3-8B --local-dir ${STORAGE}/Qwen3-8B
+
+echo "=== Downloading dataset ==="
+huggingface-cli download --repo-type dataset zhuzilin/dapo-math-17k --local-dir ${STORAGE}/dapo-math-17k
+
+# ======================== Step 2: Convert HF weights to torch_dist ========================
+
+if [ ! -d "${STORAGE}/Qwen3-8B_torch_dist/iter_0000000" ]; then
+  echo "=== Converting weights (HF -> torch_dist) on GPU worker ==="
+  CONVERT_ENV_JSON='{
+    "env_vars": {
+      "PYTHONPATH": "/root/Megatron-LM/"
+    }
+  }'
+  ray job submit --address="http://127.0.0.1:8265" \
+    --runtime-env-json="${CONVERT_ENV_JSON}" \
+    --entrypoint-num-gpus 1 \
+    -- python3 /tmp/miles/tools/convert_hf_to_torch_dist.py \
+      ${MODEL_ARGS[@]} \
+      --no-gradient-accumulation-fusion \
+      --hf-checkpoint ${STORAGE}/Qwen3-8B \
+      --save ${STORAGE}/Qwen3-8B_torch_dist
+else
+  echo "=== Converted weights already exist, skipping ==="
+fi
+
+# ======================== Step 3: Run training ========================
+
+CKPT_ARGS=(
+   --hf-checkpoint ${STORAGE}/Qwen3-8B
+   --ref-load ${STORAGE}/Qwen3-8B_torch_dist
+   --load ${STORAGE}/Qwen3-8B_torch_dist
+   --save ${STORAGE}/Qwen3-8B_miles/
+   --save-interval 20
+)
+
+ROLLOUT_ARGS=(
+   --prompt-data ${STORAGE}/dapo-math-17k/dapo-math-17k.jsonl
+   --input-key prompt
+   --label-key label
+   --apply-chat-template
+   --rollout-shuffle
+   --balance-data
+   --rm-type dapo
+   --reward-key score
+   --num-rollout 3000
+   --rollout-batch-size 32
+   --n-samples-per-prompt 8
+   --rollout-max-response-len 8192
+   --rollout-temperature 1
+   --global-batch-size 256
+)
+
+PERF_ARGS=(
+   --tensor-model-parallel-size 2
+   --sequence-parallel
+   --pipeline-model-parallel-size 1
+   --context-parallel-size 1
+   --expert-model-parallel-size 1
+   --expert-tensor-parallel-size 1
+
+   --recompute-granularity full
+   --recompute-method uniform
+   --recompute-num-layers 1
+
+   --use-dynamic-batch-size
+   --max-tokens-per-gpu 9216
+)
+
+GRPO_ARGS=(
+   --advantage-estimator grpo
+   --use-kl-loss
+   --kl-loss-coef 0.00
+   --kl-loss-type low_var_kl
+   --entropy-coef 0.00
+   --eps-clip 0.2
+   --eps-clip-high 0.28
+)
+
+OPTIMIZER_ARGS=(
+   --optimizer adam
+   --lr 1e-6
+   --lr-decay-style constant
+   --weight-decay 0.1
+   --adam-beta1 0.9
+   --adam-beta2 0.98
+)
+
+SGLANG_ARGS=(
+   --rollout-num-gpus-per-engine 1
+   --sglang-mem-fraction-static 0.7
+)
+
+MISC_ARGS=(
+   --no-gradient-accumulation-fusion
+   --attention-dropout 0.0
+   --hidden-dropout 0.0
+   --accumulate-allreduce-grads-in-fp32
+   --attention-softmax-in-fp32
+   --attention-backend flash
+   --use-tensorboard
+   --tensorboard-dir ${STORAGE}/tensorboard_logs
+)
+
+RUNTIME_ENV_JSON='{
+  "env_vars": {
+    "PYTHONPATH": "/root/Megatron-LM/",
+    "CUDA_DEVICE_MAX_CONNECTIONS": "1",
+    "TENSORBOARD_DIR": "/mnt/cluster_storage/tensorboard_logs"
+  }
+}'
+
+echo "=== Submitting training job ==="
+ray job submit --address="http://127.0.0.1:8265" \
+   --runtime-env-json="${RUNTIME_ENV_JSON}" \
+   --entrypoint-num-gpus 1 \
+   -- python3 /tmp/miles/train_async.py \
+   --actor-num-nodes 1 \
+   --actor-num-gpus-per-node 4 \
+   --rollout-num-gpus 3 \
+   ${MODEL_ARGS[@]} \
+   ${CKPT_ARGS[@]} \
+   ${ROLLOUT_ARGS[@]} \
+   ${OPTIMIZER_ARGS[@]} \
+   ${GRPO_ARGS[@]} \
+   ${PERF_ARGS[@]} \
+   ${SGLANG_ARGS[@]} \
+   ${MISC_ARGS[@]}
diff --git a/miles_qwen3_8b_h100/job.yaml b/miles_qwen3_8b_h100/job.yaml
new file mode 100644
index 0000000..10682e1
--- /dev/null
+++ b/miles_qwen3_8b_h100/job.yaml
@@ -0,0 +1,33 @@
+# Anyscale job config: Miles Qwen3-8B GRPO training on H100
+# Single node × 8x H100-80GB
+#
+# Layout:
+#   Head node (m5.2xlarge): driver only, no GPUs
+#   Worker 0 (8x H100):  [GPU 0-3: Training TP=2 DP=2] [GPU 4-7: Rollout (4 engines)]
+#
+# Submit with:
+#   cd examples/anyscale_qwen3_8b_h100
+#   anyscale job submit -f job.yaml
+cloud: anyscale-v2-cloud-us-east-1
+
+name: miles-qwen3-8b-grpo-h100
+
+containerfile: ./Dockerfile.anyscale
+
+compute_config:
+  head_node:
+    instance_type: m5.2xlarge       # CPU-only, runs driver script
+  worker_nodes:
+    - instance_type: p5.48xlarge    # 8x H100-80GB, 192 vCPU, 2048 GB RAM
+      min_nodes: 1
+      max_nodes: 1
+      advanced_instance_config:
+        CapacityReservationSpecification:
+          CapacityReservationTarget:
+            CapacityReservationId: cr-0dfe1157d299ae5fc
+
+working_dir: .
+
+entrypoint: bash entrypoint.sh
+
+max_retries: 0

From 2c18184e689f3297a4779b0fb2971ec3764b2cc4 Mon Sep 17 00:00:00 2001
From: Xinyu Zhang <60529799+xyuzh@users.noreply.github.com>
Date: Thu, 26 Feb 2026 10:57:11 -0800
Subject: [PATCH 2/7] Change num-rollout from 3000 to 5

---
 miles_qwen3_8b_h100/entrypoint.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/miles_qwen3_8b_h100/entrypoint.sh b/miles_qwen3_8b_h100/entrypoint.sh
index 505ee89..1be7221 100755
--- a/miles_qwen3_8b_h100/entrypoint.sh
+++ b/miles_qwen3_8b_h100/entrypoint.sh
@@ -83,7 +83,7 @@ ROLLOUT_ARGS=(
    --balance-data
    --rm-type dapo
    --reward-key score
-   --num-rollout 3000
+   --num-rollout 5
    --rollout-batch-size 32
    --n-samples-per-prompt 8
    --rollout-max-response-len 8192

From 6ca34bcf972503a6e8a4c847643b1deb8aca6446 Mon Sep 17 00:00:00 2001
From: Robert Nishihara <rkn@anyscale.com>
Date: Fri, 27 Feb 2026 13:30:57 -0800
Subject: [PATCH 3/7] Polish Qwen3-8B GRPO example

- Remove ray job submit, call python directly
- Move env vars to appropriate locations (PYTHONPATH in Dockerfile, CUDA_DEVICE_MAX_CONNECTIONS in job.yaml)
- Simplify entrypoint.sh (remove unused vars, fix paths)
- Add timeout_s to job.yaml
- Restructure README to match other examples pattern
- Rename Dockerfile.anyscale -> Dockerfile
- Change python3 -> python throughout

Signed-off-by: Robert Nishihara <rkn@anyscale.com>
---
 .../{Dockerfile.anyscale => Dockerfile}       | 25 +++---
 miles_qwen3_8b_h100/README.md                 | 84 +++++--------------
 miles_qwen3_8b_h100/entrypoint.sh             | 39 ++-------
 miles_qwen3_8b_h100/job.yaml                  | 17 ++--
 4 files changed, 53 insertions(+), 112 deletions(-)
 rename miles_qwen3_8b_h100/{Dockerfile.anyscale => Dockerfile} (90%)

diff --git a/miles_qwen3_8b_h100/Dockerfile.anyscale b/miles_qwen3_8b_h100/Dockerfile
similarity index 90%
rename from miles_qwen3_8b_h100/Dockerfile.anyscale
rename to miles_qwen3_8b_h100/Dockerfile
index 72640ea..265e250 100644
--- a/miles_qwen3_8b_h100/Dockerfile.anyscale
+++ b/miles_qwen3_8b_h100/Dockerfile
@@ -5,13 +5,12 @@ ARG MEGATRON_COMMIT=3714d81d418c9f1bca4594fc35f9e8289f652862
 ARG SGLANG_COMMIT=24c91001cf99ba642be791e099d358f4dfe955f5
 ARG MILES_REF=main
 
-# Anyscale base image runs as non-root; switch to root for system installs.
-USER root
-WORKDIR /root
+# Anyscale base image runs as non-root; use sudo for system installs.
+WORKDIR /home/ray
 
-RUN apt-get update && \
-    apt-get install -y --no-install-recommends git rsync dnsutils nvtop && \
-    rm -rf /var/lib/apt/lists/*
+RUN sudo apt-get update && \
+    sudo apt-get install -y --no-install-recommends git rsync dnsutils nvtop && \
+    sudo rm -rf /var/lib/apt/lists/*
 
 # Keep pip tooling current and pin numpy to 1.x for Megatron compatibility.
 RUN python -m pip install --upgrade pip setuptools wheel && \
@@ -37,14 +36,14 @@ RUN git clone --filter=blob:none https://github.com/NVIDIA/apex.git /tmp/apex &&
 
 # Install SGLang from source.  sgl_kernel comes from PyPI, pre-compiled
 # for torch 2.9.x — no need to rebuild from source.
-RUN git clone https://github.com/sgl-project/sglang.git /root/sglang && \
-    cd /root/sglang && \
+RUN git clone https://github.com/sgl-project/sglang.git /home/ray/sglang && \
+    cd /home/ray/sglang && \
     git checkout ${SGLANG_COMMIT} && \
     python -m pip install -e "python[all]"
 
 # Install Megatron-LM from source.
-RUN git clone --recursive https://github.com/NVIDIA/Megatron-LM.git /root/Megatron-LM && \
-    cd /root/Megatron-LM && \
+RUN git clone --recursive https://github.com/NVIDIA/Megatron-LM.git /home/ray/Megatron-LM && \
+    cd /home/ray/Megatron-LM && \
     git checkout ${MEGATRON_COMMIT} && \
     python -m pip install -e .
 
@@ -54,7 +53,7 @@ RUN git clone https://github.com/radixark/miles.git /tmp/miles && \
     git checkout ${MILES_REF}
 
 # Apply SGLang patch.
-RUN cd /root/sglang && \
+RUN cd /home/ray/sglang && \
     cp /tmp/miles/docker/patch/${PATCH_VERSION}/sglang.patch ./sglang.patch && \
     git update-index --refresh && \
     git apply sglang.patch --3way && \
@@ -65,7 +64,7 @@ RUN cd /root/sglang && \
     rm sglang.patch
 
 # Apply Megatron-LM patch.
-RUN cd /root/Megatron-LM && \
+RUN cd /home/ray/Megatron-LM && \
     cp /tmp/miles/docker/patch/${PATCH_VERSION}/megatron.patch ./megatron.patch && \
     git update-index --refresh && \
     git apply megatron.patch --3way && \
@@ -115,4 +114,8 @@ import torch; print(f'PyTorch: {torch.__version__}, CUDA: {torch.version.cuda}')
 assert torch.__version__.startswith('2.9'), f'Expected 2.9.x, got {torch.__version__}'; \
 from flash_attn import flash_attn_func; print('flash-attn OK')"
 
+# Make the patched Megatron-LM checkout importable. The ${VAR:+...} form appends
+# the previous value only when one is set, so no trailing ":" (empty entry) is left.
+ENV PYTHONPATH=/home/ray/Megatron-LM${PYTHONPATH:+:${PYTHONPATH}}
+
 WORKDIR /tmp/miles
diff --git a/miles_qwen3_8b_h100/README.md b/miles_qwen3_8b_h100/README.md
index 083b7b2..fcd12b4 100644
--- a/miles_qwen3_8b_h100/README.md
+++ b/miles_qwen3_8b_h100/README.md
@@ -1,77 +1,39 @@
-# Qwen3-8B GRPO Training on Anyscale (H100)
+# GRPO Training for Qwen3-8B with MILES
 
-Single-node RL training of Qwen3-8B with GRPO on **8x H100-80GB** using Anyscale, following the pattern from [anyscale/examples#43](https://github.com/anyscale/examples/pull/43).
+This example demonstrates reinforcement learning fine-tuning of Qwen3-8B using **Group Relative Policy Optimization (GRPO)** on the DAPO-Math-17k dataset. It uses the [MILES](https://github.com/radixark/miles) framework for distributed RL training with disaggregated rollouts on Anyscale.
 
-## Cluster Layout
+The training runs on a single node with **8x H100-80GB GPUs**, using:
+- **4 GPUs for training** (TP=2, DP=2 with Megatron-LM)
+- **4 GPUs for rollout inference** (disaggregated SGLang engines)
 
-```
-Head node (m5.2xlarge):  driver only, no GPUs
-Worker 0 (8x H100-80GB):
-  GPU 0-3: Training (TP=2, DP=2)
-  GPU 4-7: Rollout (3 SGLang engines + 1 driver)
-```
-
-- **Training**: 4 GPUs — TP=2 x DP=2 (Megatron backend)
-- **Rollout**: 3 GPUs — disaggregated SGLang inference, 1 GPU per engine (1 GPU reserved for driver)
-- **Algorithm**: GRPO with DAPO-style asymmetric clipping
-- **Dataset**: DAPO-Math-17k (integer math, deterministic reward)
-
-## Files
-
-| File | Description |
-|------|-------------|
-| `job.yaml` | Anyscale job config (`m5.2xlarge` head + 1x `p5.48xlarge` worker) |
-| `Dockerfile.anyscale` | Docker image with Miles, Megatron-LM, SGLang, flash-attn, TE |
-| `entrypoint.sh` | Downloads model/data, converts weights, runs async GRPO training |
-
-## Quick Start
+## Install the Anyscale CLI
 
 ```bash
 pip install -U anyscale
 anyscale login
-
-cd examples/anyscale_qwen3_8b_h100
-anyscale job submit -f job.yaml
 ```
 
-The entrypoint automatically:
-1. Downloads `Qwen/Qwen3-8B` and `zhuzilin/dapo-math-17k` to `/mnt/cluster_storage`
-2. Converts HF weights to Megatron torch_dist format (on GPU worker)
-3. Runs async GRPO training with `dapo` reward model via `train_async.py`
+## Submit the job
 
-## Key Differences from the Slime A10G Example (PR #43)
+Clone the example from GitHub.
 
-| | Slime A10G (PR #43) | This Example |
-|---|---|---|
-| GPUs | 2x4 A10G (24GB) | 1x8 H100 (80GB) |
-| Model | Qwen3-1.7B | Qwen3-8B |
-| Training | `train.py` (sync) | `train_async.py` (pipelined async) |
-| Parallelism | TP=2, PP=2 across nodes | TP=2, DP=2, single node |
-| A10G patches | sgl_kernel, Triton, multi_platform | Not needed (H100 = SM90) |
-| Batch size | 64 (16 prompts x 4 samples) | 256 (32 prompts x 8 samples) |
-| Max tokens/GPU | 4096 | 9216 |
-| Attention | FA2 only (Ampere) | FA2 (FA3 available with custom image) |
-
-## Verification
-
-A successful run shows:
-- SGLang engine startup on rollout GPUs
-- Weight conversion completes (first run only)
-- Training loss values printed each step
-- Reward gradually increasing over rollouts
-- Weight sync between training and rollout engines
+```bash
+git clone https://github.com/anyscale/examples.git
+cd examples/miles_qwen3_8b_h100
+```
 
-## If You Hit OOM
+Submit the job.
 
-**Training GPUs:**
-1. `--max-tokens-per-gpu` -> `4096`
-2. `--rollout-max-response-len` -> `4096`
-3. `--n-samples-per-prompt` -> `4` and `--global-batch-size` -> `128`
+```bash
+anyscale job submit -f job.yaml
+```
 
-**Rollout GPUs:**
-1. `--sglang-mem-fraction-static` -> `0.5`
-2. Add `--sglang-chunked-prefill-size 4096`
+The entrypoint will automatically download the model and dataset, convert weights to Megatron format, and start training. Training progress can be monitored via TensorBoard logs in `/mnt/cluster_storage/tensorboard_logs`.
 
-## View the Job
+## Understanding the example
 
-View the job in the [jobs tab](https://console.anyscale.com/jobs) of the Anyscale console.
+- **Algorithm**: This example uses GRPO with DAPO-style asymmetric clipping (ε_low=0.2, ε_high=0.28), which is particularly effective for math reasoning tasks.
+- **Dataset**: [DAPO-Math-17k](https://huggingface.co/datasets/zhuzilin/dapo-math-17k) contains 17k integer math problems with deterministic reward signals based on answer correctness.
+- **Disaggregated architecture**: Training and rollout happen on separate GPUs for maximum throughput. The 4 SGLang rollout engines run inference in parallel while the training GPUs perform gradient updates.
+- **Weight conversion**: On the first run, HuggingFace weights are converted to Megatron-LM's `torch_dist` format. Converted weights are cached in `/mnt/cluster_storage/Qwen3-8B_torch_dist` for subsequent runs.
+- **Async training**: The pipeline uses `train_async.py` which overlaps rollout generation and policy updates for better GPU utilization.
diff --git a/miles_qwen3_8b_h100/entrypoint.sh b/miles_qwen3_8b_h100/entrypoint.sh
index 1be7221..79f1a06 100755
--- a/miles_qwen3_8b_h100/entrypoint.sh
+++ b/miles_qwen3_8b_h100/entrypoint.sh
@@ -13,8 +13,6 @@ set -ex
 export PYTHONBUFFERED=16
 STORAGE=/mnt/cluster_storage
 
-SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
-
 # Qwen3-8B model architecture args (from scripts/models/qwen3-8B.sh)
 MODEL_ARGS=(
    --swiglu
@@ -46,20 +44,12 @@ huggingface-cli download --repo-type dataset zhuzilin/dapo-math-17k --local-dir
 # ======================== Step 2: Convert HF weights to torch_dist ========================
 
 if [ ! -d "${STORAGE}/Qwen3-8B_torch_dist/iter_0000000" ]; then
-  echo "=== Converting weights (HF -> torch_dist) on GPU worker ==="
-  CONVERT_ENV_JSON='{
-    "env_vars": {
-      "PYTHONPATH": "/root/Megatron-LM/"
-    }
-  }'
-  ray job submit --address="http://127.0.0.1:8265" \
-    --runtime-env-json="${CONVERT_ENV_JSON}" \
-    --entrypoint-num-gpus 1 \
-    -- python3 /tmp/miles/tools/convert_hf_to_torch_dist.py \
-      ${MODEL_ARGS[@]} \
-      --no-gradient-accumulation-fusion \
-      --hf-checkpoint ${STORAGE}/Qwen3-8B \
-      --save ${STORAGE}/Qwen3-8B_torch_dist
+  echo "=== Converting weights (HF -> torch_dist) ==="
+  python /tmp/miles/tools/convert_hf_to_torch_dist.py \
+    ${MODEL_ARGS[@]} \
+    --no-gradient-accumulation-fusion \
+    --hf-checkpoint ${STORAGE}/Qwen3-8B \
+    --save ${STORAGE}/Qwen3-8B_torch_dist
 else
   echo "=== Converted weights already exist, skipping ==="
 fi
@@ -142,22 +132,11 @@ MISC_ARGS=(
    --tensorboard-dir ${STORAGE}/tensorboard_logs
 )
 
-RUNTIME_ENV_JSON='{
-  "env_vars": {
-    "PYTHONPATH": "/root/Megatron-LM/",
-    "CUDA_DEVICE_MAX_CONNECTIONS": "1",
-    "TENSORBOARD_DIR": "/mnt/cluster_storage/tensorboard_logs"
-  }
-}'
-
-echo "=== Submitting training job ==="
-ray job submit --address="http://127.0.0.1:8265" \
-   --runtime-env-json="${RUNTIME_ENV_JSON}" \
-   --entrypoint-num-gpus 1 \
-   -- python3 /tmp/miles/train_async.py \
+echo "=== Starting training ==="
+python /tmp/miles/train_async.py \
    --actor-num-nodes 1 \
    --actor-num-gpus-per-node 4 \
-   --rollout-num-gpus 3 \
+   --rollout-num-gpus 4 \
    ${MODEL_ARGS[@]} \
    ${CKPT_ARGS[@]} \
    ${ROLLOUT_ARGS[@]} \
diff --git a/miles_qwen3_8b_h100/job.yaml b/miles_qwen3_8b_h100/job.yaml
index 10682e1..7c65bac 100644
--- a/miles_qwen3_8b_h100/job.yaml
+++ b/miles_qwen3_8b_h100/job.yaml
@@ -6,28 +6,27 @@
 #   Worker 0 (8x H100):  [GPU 0-3: Training TP=2 DP=2] [GPU 4-7: Rollout (4 engines)]
 #
 # Submit with:
-#   cd examples/anyscale_qwen3_8b_h100
+#   cd miles_qwen3_8b_h100
 #   anyscale job submit -f job.yaml
-cloud: anyscale-v2-cloud-us-east-1
 
 name: miles-qwen3-8b-grpo-h100
 
-containerfile: ./Dockerfile.anyscale
+containerfile: ./Dockerfile
 
 compute_config:
   head_node:
-    instance_type: m5.2xlarge       # CPU-only, runs driver script
+    instance_type: m5.2xlarge
   worker_nodes:
-    - instance_type: p5.48xlarge    # 8x H100-80GB, 192 vCPU, 2048 GB RAM
+    - instance_type: p5.48xlarge    # 8x H100-80GB
       min_nodes: 1
       max_nodes: 1
-      advanced_instance_config:
-        CapacityReservationSpecification:
-          CapacityReservationTarget:
-            CapacityReservationId: cr-0dfe1157d299ae5fc
 
 working_dir: .
 
 entrypoint: bash entrypoint.sh
 
+env_vars:
+  CUDA_DEVICE_MAX_CONNECTIONS: "1"
+
 max_retries: 0
+timeout_s: 7200

From e7aa67c39a51ed3703a1e3c737e3a90c3fd2687c Mon Sep 17 00:00:00 2001
From: Robert Nishihara <rkn@anyscale.com>
Date: Fri, 27 Feb 2026 13:42:58 -0800
Subject: [PATCH 4/7] Use declarative compute config instead of hardcoded
 instance types

- Replace instance_type with required_resources and required_labels
- Specify H100 accelerator type using ray.io/accelerator-type label
- Define resource requirements: 8 CPUs/32Gi for head, 96 CPUs/512Gi/8 GPUs for workers
- Allows Anyscale to select optimal H100 instance type (e.g., p5.48xlarge)

Signed-off-by: Robert Nishihara <rkn@anyscale.com>
---
 miles_qwen3_8b_h100/job.yaml | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/miles_qwen3_8b_h100/job.yaml b/miles_qwen3_8b_h100/job.yaml
index 7c65bac..6f54dc6 100644
--- a/miles_qwen3_8b_h100/job.yaml
+++ b/miles_qwen3_8b_h100/job.yaml
@@ -15,9 +15,17 @@ containerfile: ./Dockerfile
 
 compute_config:
   head_node:
-    instance_type: m5.2xlarge
+    required_resources:
+      CPU: 8
+      memory: 32Gi
   worker_nodes:
-    - instance_type: p5.48xlarge    # 8x H100-80GB
+    - name: h100-workers
+      required_resources:
+        CPU: 96
+        memory: 512Gi
+        GPU: 8
+      required_labels:
+        ray.io/accelerator-type: H100
       min_nodes: 1
       max_nodes: 1
 

From 4c572960cdc0d314833d0f38cac5c23b5575d058 Mon Sep 17 00:00:00 2001
From: Robert Nishihara <rkn@anyscale.com>
Date: Fri, 27 Feb 2026 16:15:20 -0800
Subject: [PATCH 5/7] Fix declarative compute config resource requirements

- Update worker resources to match p5.48xlarge specs: 192 vCPUs, 2048Gi memory
- Keeps 8 H100 GPUs with H100 accelerator type label

Signed-off-by: Robert Nishihara <rkn@anyscale.com>
---
 miles_qwen3_8b_h100/job.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/miles_qwen3_8b_h100/job.yaml b/miles_qwen3_8b_h100/job.yaml
index 6f54dc6..53ec596 100644
--- a/miles_qwen3_8b_h100/job.yaml
+++ b/miles_qwen3_8b_h100/job.yaml
@@ -21,8 +21,8 @@ compute_config:
   worker_nodes:
     - name: h100-workers
       required_resources:
-        CPU: 96
-        memory: 512Gi
+        CPU: 192
+        memory: 2048Gi
         GPU: 8
       required_labels:
         ray.io/accelerator-type: H100

From 7e690cdf917c0f26454b81983b48053d907e2e73 Mon Sep 17 00:00:00 2001
From: Robert Nishihara <rkn@anyscale.com>
Date: Fri, 27 Feb 2026 16:24:36 -0800
Subject: [PATCH 6/7] Use Ray remote to run weight conversion on GPU worker

- Add convert_weights_remote.py wrapper with @ray.remote(num_gpus=1)
- Ensures weight conversion runs on GPU worker instead of head node
- Fixes 'No NVIDIA driver' error when running conversion

Signed-off-by: Robert Nishihara <rkn@anyscale.com>
---
 miles_qwen3_8b_h100/convert_weights_remote.py | 30 +++++++++++++++++++
 miles_qwen3_8b_h100/entrypoint.sh             |  4 +--
 2 files changed, 32 insertions(+), 2 deletions(-)
 create mode 100644 miles_qwen3_8b_h100/convert_weights_remote.py

diff --git a/miles_qwen3_8b_h100/convert_weights_remote.py b/miles_qwen3_8b_h100/convert_weights_remote.py
new file mode 100644
index 0000000..f7aaa4d
--- /dev/null
+++ b/miles_qwen3_8b_h100/convert_weights_remote.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python
+"""Ray remote wrapper for weight conversion - ensures it runs on a GPU worker.
+
+Usage: python convert_weights_remote.py <convert_hf_to_torch_dist.py args...>
+"""
+import subprocess
+import sys
+
+import ray
+
+
+@ray.remote(num_gpus=1)  # reserving a GPU places the task on a GPU node
+def convert_weights(cmd_args):
+    """Run convert_hf_to_torch_dist.py with ``cmd_args``; return its exit code.
+
+    The child inherits this worker's stdout/stderr instead of being captured,
+    so its output is emitted while it runs rather than only after it exits.
+    """
+    result = subprocess.run(
+        [sys.executable, "/tmp/miles/tools/convert_hf_to_torch_dist.py", *cmd_args],
+        check=False,  # the exit code is returned to the caller instead of raising
+    )
+    return result.returncode
+
+
+if __name__ == "__main__":
+    # Forward every command-line argument unchanged to the conversion script.
+    returncode = ray.get(convert_weights.remote(sys.argv[1:]))
+    # Exit with the child's status; entrypoint.sh runs under `set -e`.
+    sys.exit(returncode)
diff --git a/miles_qwen3_8b_h100/entrypoint.sh b/miles_qwen3_8b_h100/entrypoint.sh
index 79f1a06..04345e8 100755
--- a/miles_qwen3_8b_h100/entrypoint.sh
+++ b/miles_qwen3_8b_h100/entrypoint.sh
@@ -44,8 +44,8 @@ huggingface-cli download --repo-type dataset zhuzilin/dapo-math-17k --local-dir
 # ======================== Step 2: Convert HF weights to torch_dist ========================
 
 if [ ! -d "${STORAGE}/Qwen3-8B_torch_dist/iter_0000000" ]; then
-  echo "=== Converting weights (HF -> torch_dist) ==="
-  python /tmp/miles/tools/convert_hf_to_torch_dist.py \
+  echo "=== Converting weights (HF -> torch_dist) on GPU worker ==="
+  python convert_weights_remote.py \
     ${MODEL_ARGS[@]} \
     --no-gradient-accumulation-fusion \
     --hf-checkpoint ${STORAGE}/Qwen3-8B \

From eba802aca5c5fab30e1dd3de0b051ba13222d9a6 Mon Sep 17 00:00:00 2001
From: Robert Nishihara <rkn@anyscale.com>
Date: Fri, 27 Feb 2026 18:23:18 -0800
Subject: [PATCH 7/7] Add Ray remote wrapper for training script

- Create train_remote.py with @ray.remote(num_gpus=4)
- Ensures training runs on GPU workers instead of head node
- Both weight conversion and training now use Ray remote

Signed-off-by: Robert Nishihara <rkn@anyscale.com>
---
 miles_qwen3_8b_h100/entrypoint.sh   |  2 +-
 miles_qwen3_8b_h100/train_remote.py | 24 ++++++++++++++++++++++++
 2 files changed, 25 insertions(+), 1 deletion(-)
 create mode 100644 miles_qwen3_8b_h100/train_remote.py

diff --git a/miles_qwen3_8b_h100/entrypoint.sh b/miles_qwen3_8b_h100/entrypoint.sh
index 04345e8..eba3188 100755
--- a/miles_qwen3_8b_h100/entrypoint.sh
+++ b/miles_qwen3_8b_h100/entrypoint.sh
@@ -133,7 +133,7 @@ MISC_ARGS=(
 )
 
 echo "=== Starting training ==="
-python /tmp/miles/train_async.py \
+python train_remote.py \
    --actor-num-nodes 1 \
    --actor-num-gpus-per-node 4 \
    --rollout-num-gpus 4 \
diff --git a/miles_qwen3_8b_h100/train_remote.py b/miles_qwen3_8b_h100/train_remote.py
new file mode 100644
index 0000000..d95fbaa
--- /dev/null
+++ b/miles_qwen3_8b_h100/train_remote.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python
+"""Ray remote wrapper for training - runs the driver on the GPU node.
+
+Usage: python train_remote.py <train_async.py args...>
+
+The wrapper task reserves no GPUs. entrypoint.sh passes
+--actor-num-gpus-per-node 4 and --rollout-num-gpus 4 to train_async.py, which
+adds up to all 8 GPUs of the single worker node, so GPUs held by this wrapper
+would not be available to the training and rollout processes.
+NOTE(review): assumes train_async.py acquires those GPUs through Ray itself
+- confirm against the Miles sources.
+"""
+import os
+import subprocess
+import sys
+
+import ray
+
+
+# accelerator_type pins the task to an H100 node without claiming a whole GPU.
+@ray.remote(num_gpus=0, accelerator_type="H100")
+def run_training(cmd_args):
+    """Run train_async.py with ``cmd_args``; return its exit code."""
+    env = dict(os.environ)
+    # Ray may restrict CUDA_VISIBLE_DEVICES for tasks that hold no GPU; drop it
+    # so the child sees the node's GPUs the way a plain process on the node would.
+    env.pop("CUDA_VISIBLE_DEVICES", None)
+    result = subprocess.run(
+        [sys.executable, "/tmp/miles/train_async.py", *cmd_args],
+        env=env,
+        check=False,  # output streams through; the exit code is returned
+    )
+    return result.returncode
+
+
+if __name__ == "__main__":
+    # Forward every command-line argument unchanged to train_async.py.
+    returncode = ray.get(run_training.remote(sys.argv[1:]))
+    # Exit with the child's status so a failed run fails the job.
+    sys.exit(returncode)