From 57cce29d655be265d87ec5c027e9f8f7fd7e3ff0 Mon Sep 17 00:00:00 2001 From: xyuzh Date: Sun, 22 Feb 2026 20:25:32 -0800 Subject: [PATCH 1/7] Add Miles Qwen3-8B GRPO training example on H100 Single-node RL training of Qwen3-8B with GRPO on 8x H100-80GB using Anyscale. Includes Dockerfile, job config, and entrypoint script that handles model download, weight conversion, and async GRPO training with Megatron backend (TP=2, DP=2) and 3 SGLang rollout engines. --- miles_qwen3_8b_h100/Dockerfile.anyscale | 118 +++++++++++++++++ miles_qwen3_8b_h100/README.md | 77 +++++++++++ miles_qwen3_8b_h100/entrypoint.sh | 168 ++++++++++++++++++++++++ miles_qwen3_8b_h100/job.yaml | 33 +++++ 4 files changed, 396 insertions(+) create mode 100644 miles_qwen3_8b_h100/Dockerfile.anyscale create mode 100644 miles_qwen3_8b_h100/README.md create mode 100755 miles_qwen3_8b_h100/entrypoint.sh create mode 100644 miles_qwen3_8b_h100/job.yaml diff --git a/miles_qwen3_8b_h100/Dockerfile.anyscale b/miles_qwen3_8b_h100/Dockerfile.anyscale new file mode 100644 index 0000000..72640ea --- /dev/null +++ b/miles_qwen3_8b_h100/Dockerfile.anyscale @@ -0,0 +1,118 @@ +FROM anyscale/ray:2.54.0-py312-cu129 + +ARG PATCH_VERSION=latest +ARG MEGATRON_COMMIT=3714d81d418c9f1bca4594fc35f9e8289f652862 +ARG SGLANG_COMMIT=24c91001cf99ba642be791e099d358f4dfe955f5 +ARG MILES_REF=main + +# Anyscale base image runs as non-root; switch to root for system installs. +USER root +WORKDIR /root + +RUN apt-get update && \ + apt-get install -y --no-install-recommends git rsync dnsutils nvtop && \ + rm -rf /var/lib/apt/lists/* + +# Keep pip tooling current and pin numpy to 1.x for Megatron compatibility. +RUN python -m pip install --upgrade pip setuptools wheel && \ + python -m pip install "numpy<2" huggingface_hub + +# Pin PyTorch 2.9.1 — matches sgl_kernel from PyPI (compiled for torch 2.9.x) +# and has a pre-built flash-attn 2.8.3 wheel available. 
+RUN python -m pip install torch==2.9.1 torchvision torchaudio \ + --index-url https://download.pytorch.org/whl/cu128 + +# Pre-built flash-attn wheel for torch 2.9 + cu12 (source compilation +# exceeds Anyscale's ~60 min build timeout). +RUN python -m pip install https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3%2Bcu12torch2.9cxx11abiTRUE-cp312-cp312-linux_x86_64.whl + +# Apex: install Python-only (no CUDA extensions) to stay within Anyscale's +# ~60 min build timeout. Megatron falls back to PyTorch-native kernels. +RUN git clone --filter=blob:none https://github.com/NVIDIA/apex.git /tmp/apex && \ + cd /tmp/apex && \ + git checkout 10417aceddd7d5d05d7cbf7b0fc2daad1105f8b4 && \ + python -m pip install --disable-pip-version-check --no-cache-dir \ + --no-build-isolation . && \ + rm -rf /tmp/apex + +# Install SGLang from source. sgl_kernel comes from PyPI, pre-compiled +# for torch 2.9.x — no need to rebuild from source. +RUN git clone https://github.com/sgl-project/sglang.git /root/sglang && \ + cd /root/sglang && \ + git checkout ${SGLANG_COMMIT} && \ + python -m pip install -e "python[all]" + +# Install Megatron-LM from source. +RUN git clone --recursive https://github.com/NVIDIA/Megatron-LM.git /root/Megatron-LM && \ + cd /root/Megatron-LM && \ + git checkout ${MEGATRON_COMMIT} && \ + python -m pip install -e . + +# Pull Miles source for patches and dependency manifests. +RUN git clone https://github.com/radixark/miles.git /tmp/miles && \ + cd /tmp/miles && \ + git checkout ${MILES_REF} + +# Apply SGLang patch. +RUN cd /root/sglang && \ + cp /tmp/miles/docker/patch/${PATCH_VERSION}/sglang.patch ./sglang.patch && \ + git update-index --refresh && \ + git apply sglang.patch --3way && \ + if grep -R -n '^<<<<<<< ' .; then \ + echo "SGLang patch failed to apply cleanly. Please resolve conflicts." && \ + exit 1; \ + fi && \ + rm sglang.patch + +# Apply Megatron-LM patch. 
+RUN cd /root/Megatron-LM && \ + cp /tmp/miles/docker/patch/${PATCH_VERSION}/megatron.patch ./megatron.patch && \ + git update-index --refresh && \ + git apply megatron.patch --3way && \ + if grep -R -n '^<<<<<<< ' .; then \ + echo "Megatron patch failed to apply cleanly. Please resolve conflicts." && \ + exit 1; \ + fi && \ + rm megatron.patch + +# Install Miles dependencies. +RUN python -m pip install git+https://github.com/ISEEKYAN/mbridge.git@89eb10887887bc74853f89a4de258c0702932a1c --no-deps && \ + python -m pip install git+https://github.com/fzyzcjy/torch_memory_saver.git@dc6876905830430b5054325fa4211ff302169c6b --no-cache-dir --force-reinstall && \ + python -m pip install git+https://github.com/fzyzcjy/Megatron-Bridge.git@dev_rl --no-build-isolation && \ + python -m pip install "nvidia-modelopt[torch]>=0.37.0" --no-build-isolation + +# Make MXFP8 quantizer import conditional — mxfp8_group_quantize was added +# in a newer SGLang than our pinned commit. Not needed for Qwen3-8B training. +RUN python -c "\ +import pathlib; \ +p = pathlib.Path('/tmp/miles/miles/backends/megatron_utils/megatron_to_hf/processors/quantizer_mxfp8.py'); \ +t = p.read_text(); \ +t = t.replace( \ + 'from sglang.srt.layers.quantization.fp8_utils import mxfp8_group_quantize', \ + 'try:\\n from sglang.srt.layers.quantization.fp8_utils import mxfp8_group_quantize\\nexcept ImportError:\\n mxfp8_group_quantize = None' \ +); \ +p.write_text(t)" + +# Install Miles itself. +RUN python -m pip install -r /tmp/miles/requirements.txt && \ + python -m pip install -e /tmp/miles --no-deps && \ + cd /tmp/miles/miles/backends/megatron_utils/kernels/int4_qat && \ + python -m pip install . --no-build-isolation + +# Re-pin PyTorch 2.9.1 and reinstall flash-attn + TE at the end. +# Earlier installs may have upgraded torch, breaking pre-built binary wheels. 
+RUN python -c "import torch; print(f'Before re-pin: PyTorch {torch.__version__}')" +RUN python -m pip install torch==2.9.1 torchvision torchaudio \ + --index-url https://download.pytorch.org/whl/cu128 +RUN python -m pip install --force-reinstall --no-deps \ + https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3%2Bcu12torch2.9cxx11abiTRUE-cp312-cp312-linux_x86_64.whl +RUN python -m pip install --no-build-isolation "transformer_engine[pytorch]==2.10.0" + +# Verify torch + flash-attn ABI compatibility. +# sgl_kernel is skipped here — it requires libcuda.so.1 (GPU hardware) to import. +RUN python -c "\ +import torch; print(f'PyTorch: {torch.__version__}, CUDA: {torch.version.cuda}'); \ +assert torch.__version__.startswith('2.9'), f'Expected 2.9.x, got {torch.__version__}'; \ +from flash_attn import flash_attn_func; print('flash-attn OK')" + +WORKDIR /tmp/miles diff --git a/miles_qwen3_8b_h100/README.md b/miles_qwen3_8b_h100/README.md new file mode 100644 index 0000000..083b7b2 --- /dev/null +++ b/miles_qwen3_8b_h100/README.md @@ -0,0 +1,77 @@ +# Qwen3-8B GRPO Training on Anyscale (H100) + +Single-node RL training of Qwen3-8B with GRPO on **8x H100-80GB** using Anyscale, following the pattern from [anyscale/examples#43](https://github.com/anyscale/examples/pull/43). 
+ +## Cluster Layout + +``` +Head node (m5.2xlarge): driver only, no GPUs +Worker 0 (8x H100-80GB): + GPU 0-3: Training (TP=2, DP=2) + GPU 4-7: Rollout (3 SGLang engines + 1 driver) +``` + +- **Training**: 4 GPUs — TP=2 x DP=2 (Megatron backend) +- **Rollout**: 3 GPUs — disaggregated SGLang inference, 1 GPU per engine (1 GPU reserved for driver) +- **Algorithm**: GRPO with DAPO-style asymmetric clipping +- **Dataset**: DAPO-Math-17k (integer math, deterministic reward) + +## Files + +| File | Description | +|------|-------------| +| `job.yaml` | Anyscale job config (`m5.2xlarge` head + 1x `p5.48xlarge` worker) | +| `Dockerfile.anyscale` | Docker image with Miles, Megatron-LM, SGLang, flash-attn, TE | +| `entrypoint.sh` | Downloads model/data, converts weights, runs async GRPO training | + +## Quick Start + +```bash +pip install -U anyscale +anyscale login + +cd examples/anyscale_qwen3_8b_h100 +anyscale job submit -f job.yaml +``` + +The entrypoint automatically: +1. Downloads `Qwen/Qwen3-8B` and `zhuzilin/dapo-math-17k` to `/mnt/cluster_storage` +2. Converts HF weights to Megatron torch_dist format (on GPU worker) +3. 
Runs async GRPO training with `dapo` reward model via `train_async.py` + +## Key Differences from the Slime A10G Example (PR #43) + +| | Slime A10G (PR #43) | This Example | +|---|---|---| +| GPUs | 2x4 A10G (24GB) | 1x8 H100 (80GB) | +| Model | Qwen3-1.7B | Qwen3-8B | +| Training | `train.py` (sync) | `train_async.py` (pipelined async) | +| Parallelism | TP=2, PP=2 across nodes | TP=2, DP=2, single node | +| A10G patches | sgl_kernel, Triton, multi_platform | Not needed (H100 = SM90) | +| Batch size | 64 (16 prompts x 4 samples) | 256 (32 prompts x 8 samples) | +| Max tokens/GPU | 4096 | 9216 | +| Attention | FA2 only (Ampere) | FA2 (FA3 available with custom image) | + +## Verification + +A successful run shows: +- SGLang engine startup on rollout GPUs +- Weight conversion completes (first run only) +- Training loss values printed each step +- Reward gradually increasing over rollouts +- Weight sync between training and rollout engines + +## If You Hit OOM + +**Training GPUs:** +1. `--max-tokens-per-gpu` -> `4096` +2. `--rollout-max-response-len` -> `4096` +3. `--n-samples-per-prompt` -> `4` and `--global-batch-size` -> `128` + +**Rollout GPUs:** +1. `--sglang-mem-fraction-static` -> `0.5` +2. Add `--sglang-chunked-prefill-size 4096` + +## View the Job + +View the job in the [jobs tab](https://console.anyscale.com/jobs) of the Anyscale console. diff --git a/miles_qwen3_8b_h100/entrypoint.sh b/miles_qwen3_8b_h100/entrypoint.sh new file mode 100755 index 0000000..505ee89 --- /dev/null +++ b/miles_qwen3_8b_h100/entrypoint.sh @@ -0,0 +1,168 @@ +#!/bin/bash +# Anyscale entrypoint: Qwen3-8B GRPO training on 1 worker × 8x H100-80GB +# Downloads model/dataset, converts weights, and runs async RL training. 
+# +# Head node (m5.2xlarge): driver only, no GPUs +# Layout (GPU worker): +# Worker 0 (8x H100): +# GPU 0-3: Training (TP=2, DP=2) +# GPU 4-7: Rollout (4 SGLang engines, 1 GPU each) + +set -ex + +export PYTHONBUFFERED=16 +STORAGE=/mnt/cluster_storage + +SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" + +# Qwen3-8B model architecture args (from scripts/models/qwen3-8B.sh) +MODEL_ARGS=( + --swiglu + --num-layers 36 + --hidden-size 4096 + --ffn-hidden-size 12288 + --num-attention-heads 32 + --group-query-attention + --num-query-groups 8 + --use-rotary-position-embeddings + --disable-bias-linear + --normalization "RMSNorm" + --norm-epsilon 1e-6 + --rotary-base 1000000 + --vocab-size 151936 + --kv-channels 128 + --qk-layernorm + --untie-embeddings-and-output-weights +) + +# ======================== Step 1: Download model & dataset ======================== + +echo "=== Downloading model ===" +huggingface-cli download Qwen/Qwen3-8B --local-dir ${STORAGE}/Qwen3-8B + +echo "=== Downloading dataset ===" +huggingface-cli download --repo-type dataset zhuzilin/dapo-math-17k --local-dir ${STORAGE}/dapo-math-17k + +# ======================== Step 2: Convert HF weights to torch_dist ======================== + +if [ ! 
-d "${STORAGE}/Qwen3-8B_torch_dist/iter_0000000" ]; then + echo "=== Converting weights (HF -> torch_dist) on GPU worker ===" + CONVERT_ENV_JSON='{ + "env_vars": { + "PYTHONPATH": "/root/Megatron-LM/" + } + }' + ray job submit --address="http://127.0.0.1:8265" \ + --runtime-env-json="${CONVERT_ENV_JSON}" \ + --entrypoint-num-gpus 1 \ + -- python3 /tmp/miles/tools/convert_hf_to_torch_dist.py \ + ${MODEL_ARGS[@]} \ + --no-gradient-accumulation-fusion \ + --hf-checkpoint ${STORAGE}/Qwen3-8B \ + --save ${STORAGE}/Qwen3-8B_torch_dist +else + echo "=== Converted weights already exist, skipping ===" +fi + +# ======================== Step 3: Run training ======================== + +CKPT_ARGS=( + --hf-checkpoint ${STORAGE}/Qwen3-8B + --ref-load ${STORAGE}/Qwen3-8B_torch_dist + --load ${STORAGE}/Qwen3-8B_torch_dist + --save ${STORAGE}/Qwen3-8B_miles/ + --save-interval 20 +) + +ROLLOUT_ARGS=( + --prompt-data ${STORAGE}/dapo-math-17k/dapo-math-17k.jsonl + --input-key prompt + --label-key label + --apply-chat-template + --rollout-shuffle + --balance-data + --rm-type dapo + --reward-key score + --num-rollout 3000 + --rollout-batch-size 32 + --n-samples-per-prompt 8 + --rollout-max-response-len 8192 + --rollout-temperature 1 + --global-batch-size 256 +) + +PERF_ARGS=( + --tensor-model-parallel-size 2 + --sequence-parallel + --pipeline-model-parallel-size 1 + --context-parallel-size 1 + --expert-model-parallel-size 1 + --expert-tensor-parallel-size 1 + + --recompute-granularity full + --recompute-method uniform + --recompute-num-layers 1 + + --use-dynamic-batch-size + --max-tokens-per-gpu 9216 +) + +GRPO_ARGS=( + --advantage-estimator grpo + --use-kl-loss + --kl-loss-coef 0.00 + --kl-loss-type low_var_kl + --entropy-coef 0.00 + --eps-clip 0.2 + --eps-clip-high 0.28 +) + +OPTIMIZER_ARGS=( + --optimizer adam + --lr 1e-6 + --lr-decay-style constant + --weight-decay 0.1 + --adam-beta1 0.9 + --adam-beta2 0.98 +) + +SGLANG_ARGS=( + --rollout-num-gpus-per-engine 1 + 
--sglang-mem-fraction-static 0.7 +) + +MISC_ARGS=( + --no-gradient-accumulation-fusion + --attention-dropout 0.0 + --hidden-dropout 0.0 + --accumulate-allreduce-grads-in-fp32 + --attention-softmax-in-fp32 + --attention-backend flash + --use-tensorboard + --tensorboard-dir ${STORAGE}/tensorboard_logs +) + +RUNTIME_ENV_JSON='{ + "env_vars": { + "PYTHONPATH": "/root/Megatron-LM/", + "CUDA_DEVICE_MAX_CONNECTIONS": "1", + "TENSORBOARD_DIR": "/mnt/cluster_storage/tensorboard_logs" + } +}' + +echo "=== Submitting training job ===" +ray job submit --address="http://127.0.0.1:8265" \ + --runtime-env-json="${RUNTIME_ENV_JSON}" \ + --entrypoint-num-gpus 1 \ + -- python3 /tmp/miles/train_async.py \ + --actor-num-nodes 1 \ + --actor-num-gpus-per-node 4 \ + --rollout-num-gpus 3 \ + ${MODEL_ARGS[@]} \ + ${CKPT_ARGS[@]} \ + ${ROLLOUT_ARGS[@]} \ + ${OPTIMIZER_ARGS[@]} \ + ${GRPO_ARGS[@]} \ + ${PERF_ARGS[@]} \ + ${SGLANG_ARGS[@]} \ + ${MISC_ARGS[@]} diff --git a/miles_qwen3_8b_h100/job.yaml b/miles_qwen3_8b_h100/job.yaml new file mode 100644 index 0000000..10682e1 --- /dev/null +++ b/miles_qwen3_8b_h100/job.yaml @@ -0,0 +1,33 @@ +# Anyscale job config: Miles Qwen3-8B GRPO training on H100 +# Single node × 8x H100-80GB +# +# Layout: +# Head node (m5.2xlarge): driver only, no GPUs +# Worker 0 (8x H100): [GPU 0-3: Training TP=2 DP=2] [GPU 4-7: Rollout (4 engines)] +# +# Submit with: +# cd examples/anyscale_qwen3_8b_h100 +# anyscale job submit -f job.yaml +cloud: anyscale-v2-cloud-us-east-1 + +name: miles-qwen3-8b-grpo-h100 + +containerfile: ./Dockerfile.anyscale + +compute_config: + head_node: + instance_type: m5.2xlarge # CPU-only, runs driver script + worker_nodes: + - instance_type: p5.48xlarge # 8x H100-80GB, 192 vCPU, 2048 GB RAM + min_nodes: 1 + max_nodes: 1 + advanced_instance_config: + CapacityReservationSpecification: + CapacityReservationTarget: + CapacityReservationId: cr-0dfe1157d299ae5fc + +working_dir: . 
+ +entrypoint: bash entrypoint.sh + +max_retries: 0 From 2c18184e689f3297a4779b0fb2971ec3764b2cc4 Mon Sep 17 00:00:00 2001 From: Xinyu Zhang <60529799+xyuzh@users.noreply.github.com> Date: Thu, 26 Feb 2026 10:57:11 -0800 Subject: [PATCH 2/7] Change num-rollout from 3000 to 5 --- miles_qwen3_8b_h100/entrypoint.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/miles_qwen3_8b_h100/entrypoint.sh b/miles_qwen3_8b_h100/entrypoint.sh index 505ee89..1be7221 100755 --- a/miles_qwen3_8b_h100/entrypoint.sh +++ b/miles_qwen3_8b_h100/entrypoint.sh @@ -83,7 +83,7 @@ ROLLOUT_ARGS=( --balance-data --rm-type dapo --reward-key score - --num-rollout 3000 + --num-rollout 5 --rollout-batch-size 32 --n-samples-per-prompt 8 --rollout-max-response-len 8192 From 6ca34bcf972503a6e8a4c847643b1deb8aca6446 Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Fri, 27 Feb 2026 13:30:57 -0800 Subject: [PATCH 3/7] Polish Qwen3-8B GRPO example - Remove ray job submit, call python directly - Move env vars to appropriate locations (PYTHONPATH in Dockerfile, CUDA_DEVICE_MAX_CONNECTIONS in job.yaml) - Simplify entrypoint.sh (remove unused vars, fix paths) - Add timeout_s to job.yaml - Restructure README to match other examples pattern - Rename Dockerfile.anyscale -> Dockerfile - Change python3 -> python throughout Signed-off-by: Robert Nishihara --- .../{Dockerfile.anyscale => Dockerfile} | 25 +++--- miles_qwen3_8b_h100/README.md | 84 +++++-------------- miles_qwen3_8b_h100/entrypoint.sh | 39 ++------- miles_qwen3_8b_h100/job.yaml | 17 ++-- 4 files changed, 53 insertions(+), 112 deletions(-) rename miles_qwen3_8b_h100/{Dockerfile.anyscale => Dockerfile} (90%) diff --git a/miles_qwen3_8b_h100/Dockerfile.anyscale b/miles_qwen3_8b_h100/Dockerfile similarity index 90% rename from miles_qwen3_8b_h100/Dockerfile.anyscale rename to miles_qwen3_8b_h100/Dockerfile index 72640ea..265e250 100644 --- a/miles_qwen3_8b_h100/Dockerfile.anyscale +++ b/miles_qwen3_8b_h100/Dockerfile @@ -5,13 +5,12 
@@ ARG MEGATRON_COMMIT=3714d81d418c9f1bca4594fc35f9e8289f652862 ARG SGLANG_COMMIT=24c91001cf99ba642be791e099d358f4dfe955f5 ARG MILES_REF=main -# Anyscale base image runs as non-root; switch to root for system installs. -USER root -WORKDIR /root +# Anyscale base image runs as non-root; use sudo for system installs. +WORKDIR /home/ray -RUN apt-get update && \ - apt-get install -y --no-install-recommends git rsync dnsutils nvtop && \ - rm -rf /var/lib/apt/lists/* +RUN sudo apt-get update && \ + sudo apt-get install -y --no-install-recommends git rsync dnsutils nvtop && \ + sudo rm -rf /var/lib/apt/lists/* # Keep pip tooling current and pin numpy to 1.x for Megatron compatibility. RUN python -m pip install --upgrade pip setuptools wheel && \ @@ -37,14 +36,14 @@ RUN git clone --filter=blob:none https://github.com/NVIDIA/apex.git /tmp/apex && # Install SGLang from source. sgl_kernel comes from PyPI, pre-compiled # for torch 2.9.x — no need to rebuild from source. -RUN git clone https://github.com/sgl-project/sglang.git /root/sglang && \ - cd /root/sglang && \ +RUN git clone https://github.com/sgl-project/sglang.git /home/ray/sglang && \ + cd /home/ray/sglang && \ git checkout ${SGLANG_COMMIT} && \ python -m pip install -e "python[all]" # Install Megatron-LM from source. -RUN git clone --recursive https://github.com/NVIDIA/Megatron-LM.git /root/Megatron-LM && \ - cd /root/Megatron-LM && \ +RUN git clone --recursive https://github.com/NVIDIA/Megatron-LM.git /home/ray/Megatron-LM && \ + cd /home/ray/Megatron-LM && \ git checkout ${MEGATRON_COMMIT} && \ python -m pip install -e . @@ -54,7 +53,7 @@ RUN git clone https://github.com/radixark/miles.git /tmp/miles && \ git checkout ${MILES_REF} # Apply SGLang patch. 
-RUN cd /root/sglang && \
+RUN cd /home/ray/sglang && \
     cp /tmp/miles/docker/patch/${PATCH_VERSION}/sglang.patch ./sglang.patch && \
     git update-index --refresh && \
     git apply sglang.patch --3way && \
@@ -65,7 +64,7 @@ RUN cd /root/sglang && \
     rm sglang.patch
 
 # Apply Megatron-LM patch.
-RUN cd /root/Megatron-LM && \
+RUN cd /home/ray/Megatron-LM && \
     cp /tmp/miles/docker/patch/${PATCH_VERSION}/megatron.patch ./megatron.patch && \
     git update-index --refresh && \
     git apply megatron.patch --3way && \
@@ -115,4 +114,6 @@ import torch; print(f'PyTorch: {torch.__version__}, CUDA: {torch.version.cuda}')
 assert torch.__version__.startswith('2.9'), f'Expected 2.9.x, got {torch.__version__}'; \
 from flash_attn import flash_attn_func; print('flash-attn OK')"
 
+# Put Megatron-LM on the import path; ":+" appends the old value only if PYTHONPATH is already set, so no empty entry is left behind.
+ENV PYTHONPATH=/home/ray/Megatron-LM${PYTHONPATH:+:${PYTHONPATH}}
 WORKDIR /tmp/miles
diff --git a/miles_qwen3_8b_h100/README.md b/miles_qwen3_8b_h100/README.md
index 083b7b2..fcd12b4 100644
--- a/miles_qwen3_8b_h100/README.md
+++ b/miles_qwen3_8b_h100/README.md
@@ -1,77 +1,39 @@
-# Qwen3-8B GRPO Training on Anyscale (H100)
+# GRPO Training for Qwen3-8B with MILES
 
-Single-node RL training of Qwen3-8B with GRPO on **8x H100-80GB** using Anyscale, following the pattern from [anyscale/examples#43](https://github.com/anyscale/examples/pull/43).
+This example demonstrates reinforcement learning fine-tuning of Qwen3-8B using **Group Relative Policy Optimization (GRPO)** on the DAPO-Math-17k dataset. It uses the [MILES](https://github.com/radixark/miles) framework for distributed RL training with disaggregated rollouts on Anyscale.
 
-## Cluster Layout +The training runs on a single node with **8x H100-80GB GPUs**, using: +- **4 GPUs for training** (TP=2, DP=2 with Megatron-LM) +- **4 GPUs for rollout inference** (disaggregated SGLang engines) -``` -Head node (m5.2xlarge): driver only, no GPUs -Worker 0 (8x H100-80GB): - GPU 0-3: Training (TP=2, DP=2) - GPU 4-7: Rollout (3 SGLang engines + 1 driver) -``` - -- **Training**: 4 GPUs — TP=2 x DP=2 (Megatron backend) -- **Rollout**: 3 GPUs — disaggregated SGLang inference, 1 GPU per engine (1 GPU reserved for driver) -- **Algorithm**: GRPO with DAPO-style asymmetric clipping -- **Dataset**: DAPO-Math-17k (integer math, deterministic reward) - -## Files - -| File | Description | -|------|-------------| -| `job.yaml` | Anyscale job config (`m5.2xlarge` head + 1x `p5.48xlarge` worker) | -| `Dockerfile.anyscale` | Docker image with Miles, Megatron-LM, SGLang, flash-attn, TE | -| `entrypoint.sh` | Downloads model/data, converts weights, runs async GRPO training | - -## Quick Start +## Install the Anyscale CLI ```bash pip install -U anyscale anyscale login - -cd examples/anyscale_qwen3_8b_h100 -anyscale job submit -f job.yaml ``` -The entrypoint automatically: -1. Downloads `Qwen/Qwen3-8B` and `zhuzilin/dapo-math-17k` to `/mnt/cluster_storage` -2. Converts HF weights to Megatron torch_dist format (on GPU worker) -3. Runs async GRPO training with `dapo` reward model via `train_async.py` +## Submit the job -## Key Differences from the Slime A10G Example (PR #43) +Clone the example from GitHub. 
-| | Slime A10G (PR #43) | This Example | -|---|---|---| -| GPUs | 2x4 A10G (24GB) | 1x8 H100 (80GB) | -| Model | Qwen3-1.7B | Qwen3-8B | -| Training | `train.py` (sync) | `train_async.py` (pipelined async) | -| Parallelism | TP=2, PP=2 across nodes | TP=2, DP=2, single node | -| A10G patches | sgl_kernel, Triton, multi_platform | Not needed (H100 = SM90) | -| Batch size | 64 (16 prompts x 4 samples) | 256 (32 prompts x 8 samples) | -| Max tokens/GPU | 4096 | 9216 | -| Attention | FA2 only (Ampere) | FA2 (FA3 available with custom image) | - -## Verification - -A successful run shows: -- SGLang engine startup on rollout GPUs -- Weight conversion completes (first run only) -- Training loss values printed each step -- Reward gradually increasing over rollouts -- Weight sync between training and rollout engines +```bash +git clone https://github.com/anyscale/examples.git +cd examples/miles_qwen3_8b_h100 +``` -## If You Hit OOM +Submit the job. -**Training GPUs:** -1. `--max-tokens-per-gpu` -> `4096` -2. `--rollout-max-response-len` -> `4096` -3. `--n-samples-per-prompt` -> `4` and `--global-batch-size` -> `128` +```bash +anyscale job submit -f job.yaml +``` -**Rollout GPUs:** -1. `--sglang-mem-fraction-static` -> `0.5` -2. Add `--sglang-chunked-prefill-size 4096` +The entrypoint will automatically download the model and dataset, convert weights to Megatron format, and start training. Training progress can be monitored via TensorBoard logs in `/mnt/cluster_storage/tensorboard_logs`. -## View the Job +## Understanding the example -View the job in the [jobs tab](https://console.anyscale.com/jobs) of the Anyscale console. +- **Algorithm**: This example uses GRPO with DAPO-style asymmetric clipping (ε_low=0.2, ε_high=0.28), which is particularly effective for math reasoning tasks. +- **Dataset**: [DAPO-Math-17k](https://huggingface.co/datasets/zhuzilin/dapo-math-17k) contains 17k integer math problems with deterministic reward signals based on answer correctness. 
+- **Disaggregated architecture**: Training and rollout happen on separate GPUs for maximum throughput. The 4 SGLang rollout engines run inference in parallel while the training GPUs perform gradient updates. +- **Weight conversion**: On the first run, HuggingFace weights are converted to Megatron-LM's `torch_dist` format. Converted weights are cached in `/mnt/cluster_storage/Qwen3-8B_torch_dist` for subsequent runs. +- **Async training**: The pipeline uses `train_async.py` which overlaps rollout generation and policy updates for better GPU utilization. diff --git a/miles_qwen3_8b_h100/entrypoint.sh b/miles_qwen3_8b_h100/entrypoint.sh index 1be7221..79f1a06 100755 --- a/miles_qwen3_8b_h100/entrypoint.sh +++ b/miles_qwen3_8b_h100/entrypoint.sh @@ -13,8 +13,6 @@ set -ex export PYTHONBUFFERED=16 STORAGE=/mnt/cluster_storage -SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" - # Qwen3-8B model architecture args (from scripts/models/qwen3-8B.sh) MODEL_ARGS=( --swiglu @@ -46,20 +44,12 @@ huggingface-cli download --repo-type dataset zhuzilin/dapo-math-17k --local-dir # ======================== Step 2: Convert HF weights to torch_dist ======================== if [ ! 
-d "${STORAGE}/Qwen3-8B_torch_dist/iter_0000000" ]; then - echo "=== Converting weights (HF -> torch_dist) on GPU worker ===" - CONVERT_ENV_JSON='{ - "env_vars": { - "PYTHONPATH": "/root/Megatron-LM/" - } - }' - ray job submit --address="http://127.0.0.1:8265" \ - --runtime-env-json="${CONVERT_ENV_JSON}" \ - --entrypoint-num-gpus 1 \ - -- python3 /tmp/miles/tools/convert_hf_to_torch_dist.py \ - ${MODEL_ARGS[@]} \ - --no-gradient-accumulation-fusion \ - --hf-checkpoint ${STORAGE}/Qwen3-8B \ - --save ${STORAGE}/Qwen3-8B_torch_dist + echo "=== Converting weights (HF -> torch_dist) ===" + python /tmp/miles/tools/convert_hf_to_torch_dist.py \ + ${MODEL_ARGS[@]} \ + --no-gradient-accumulation-fusion \ + --hf-checkpoint ${STORAGE}/Qwen3-8B \ + --save ${STORAGE}/Qwen3-8B_torch_dist else echo "=== Converted weights already exist, skipping ===" fi @@ -142,22 +132,11 @@ MISC_ARGS=( --tensorboard-dir ${STORAGE}/tensorboard_logs ) -RUNTIME_ENV_JSON='{ - "env_vars": { - "PYTHONPATH": "/root/Megatron-LM/", - "CUDA_DEVICE_MAX_CONNECTIONS": "1", - "TENSORBOARD_DIR": "/mnt/cluster_storage/tensorboard_logs" - } -}' - -echo "=== Submitting training job ===" -ray job submit --address="http://127.0.0.1:8265" \ - --runtime-env-json="${RUNTIME_ENV_JSON}" \ - --entrypoint-num-gpus 1 \ - -- python3 /tmp/miles/train_async.py \ +echo "=== Starting training ===" +python /tmp/miles/train_async.py \ --actor-num-nodes 1 \ --actor-num-gpus-per-node 4 \ - --rollout-num-gpus 3 \ + --rollout-num-gpus 4 \ ${MODEL_ARGS[@]} \ ${CKPT_ARGS[@]} \ ${ROLLOUT_ARGS[@]} \ diff --git a/miles_qwen3_8b_h100/job.yaml b/miles_qwen3_8b_h100/job.yaml index 10682e1..7c65bac 100644 --- a/miles_qwen3_8b_h100/job.yaml +++ b/miles_qwen3_8b_h100/job.yaml @@ -6,28 +6,27 @@ # Worker 0 (8x H100): [GPU 0-3: Training TP=2 DP=2] [GPU 4-7: Rollout (4 engines)] # # Submit with: -# cd examples/anyscale_qwen3_8b_h100 +# cd miles_qwen3_8b_h100 # anyscale job submit -f job.yaml -cloud: anyscale-v2-cloud-us-east-1 name: 
miles-qwen3-8b-grpo-h100 -containerfile: ./Dockerfile.anyscale +containerfile: ./Dockerfile compute_config: head_node: - instance_type: m5.2xlarge # CPU-only, runs driver script + instance_type: m5.2xlarge worker_nodes: - - instance_type: p5.48xlarge # 8x H100-80GB, 192 vCPU, 2048 GB RAM + - instance_type: p5.48xlarge # 8x H100-80GB min_nodes: 1 max_nodes: 1 - advanced_instance_config: - CapacityReservationSpecification: - CapacityReservationTarget: - CapacityReservationId: cr-0dfe1157d299ae5fc working_dir: . entrypoint: bash entrypoint.sh +env_vars: + CUDA_DEVICE_MAX_CONNECTIONS: "1" + max_retries: 0 +timeout_s: 7200 From e7aa67c39a51ed3703a1e3c737e3a90c3fd2687c Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Fri, 27 Feb 2026 13:42:58 -0800 Subject: [PATCH 4/7] Use declarative compute config instead of hardcoded instance types - Replace instance_type with required_resources and required_labels - Specify H100 accelerator type using ray.io/accelerator-type label - Define resource requirements: 8 CPUs/32Gi for head, 96 CPUs/512Gi/8 GPUs for workers - Allows Anyscale to select optimal H100 instance type (e.g., p5.48xlarge) Signed-off-by: Robert Nishihara --- miles_qwen3_8b_h100/job.yaml | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/miles_qwen3_8b_h100/job.yaml b/miles_qwen3_8b_h100/job.yaml index 7c65bac..6f54dc6 100644 --- a/miles_qwen3_8b_h100/job.yaml +++ b/miles_qwen3_8b_h100/job.yaml @@ -15,9 +15,17 @@ containerfile: ./Dockerfile compute_config: head_node: - instance_type: m5.2xlarge + required_resources: + CPU: 8 + memory: 32Gi worker_nodes: - - instance_type: p5.48xlarge # 8x H100-80GB + - name: h100-workers + required_resources: + CPU: 96 + memory: 512Gi + GPU: 8 + required_labels: + ray.io/accelerator-type: H100 min_nodes: 1 max_nodes: 1 From 4c572960cdc0d314833d0f38cac5c23b5575d058 Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Fri, 27 Feb 2026 16:15:20 -0800 Subject: [PATCH 5/7] Fix declarative compute config 
resource requirements - Update worker resources to match p5.48xlarge specs: 192 vCPUs, 2048Gi memory - Keeps 8 H100 GPUs with H100 accelerator type label Signed-off-by: Robert Nishihara --- miles_qwen3_8b_h100/job.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/miles_qwen3_8b_h100/job.yaml b/miles_qwen3_8b_h100/job.yaml index 6f54dc6..53ec596 100644 --- a/miles_qwen3_8b_h100/job.yaml +++ b/miles_qwen3_8b_h100/job.yaml @@ -21,8 +21,8 @@ compute_config: worker_nodes: - name: h100-workers required_resources: - CPU: 96 - memory: 512Gi + CPU: 192 + memory: 2048Gi GPU: 8 required_labels: ray.io/accelerator-type: H100 From 7e690cdf917c0f26454b81983b48053d907e2e73 Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Fri, 27 Feb 2026 16:24:36 -0800 Subject: [PATCH 6/7] Use Ray remote to run weight conversion on GPU worker - Add convert_weights_remote.py wrapper with @ray.remote(num_gpus=1) - Ensures weight conversion runs on GPU worker instead of head node - Fixes 'No NVIDIA driver' error when running conversion Signed-off-by: Robert Nishihara --- miles_qwen3_8b_h100/convert_weights_remote.py | 30 +++++++++++++++++++ miles_qwen3_8b_h100/entrypoint.sh | 4 +-- 2 files changed, 32 insertions(+), 2 deletions(-) create mode 100644 miles_qwen3_8b_h100/convert_weights_remote.py diff --git a/miles_qwen3_8b_h100/convert_weights_remote.py b/miles_qwen3_8b_h100/convert_weights_remote.py new file mode 100644 index 0000000..f7aaa4d --- /dev/null +++ b/miles_qwen3_8b_h100/convert_weights_remote.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python +"""Ray remote wrapper for weight conversion - ensures it runs on a GPU worker.""" +import sys +import subprocess +import ray + +@ray.remote(num_gpus=1) +def convert_weights(cmd_args): + """Run weight conversion on a GPU worker.""" + result = subprocess.run( + ["python", "/tmp/miles/tools/convert_hf_to_torch_dist.py"] + cmd_args, + capture_output=True, + text=True + ) + return result.returncode, result.stdout, result.stderr + 
+if __name__ == "__main__":
+    # Forward every command-line argument unchanged to the conversion script.
+    cmd_args = sys.argv[1:]
+
+    # Block until the remote task finishes and unpack its (returncode, stdout, stderr).
+    returncode, stdout, stderr = ray.get(convert_weights.remote(cmd_args))
+
+    # Replay the returned output now that the task is done (nothing is streamed live).
+    if stdout:
+        print(stdout, end="")
+    if stderr:
+        print(stderr, end="", file=sys.stderr)
+
+    sys.exit(returncode)
diff --git a/miles_qwen3_8b_h100/entrypoint.sh b/miles_qwen3_8b_h100/entrypoint.sh
index 79f1a06..04345e8 100755
--- a/miles_qwen3_8b_h100/entrypoint.sh
+++ b/miles_qwen3_8b_h100/entrypoint.sh
@@ -44,8 +44,8 @@ huggingface-cli download --repo-type dataset zhuzilin/dapo-math-17k --local-dir
 # ======================== Step 2: Convert HF weights to torch_dist ========================
 
 if [ ! -d "${STORAGE}/Qwen3-8B_torch_dist/iter_0000000" ]; then
-    echo "=== Converting weights (HF -> torch_dist) ==="
-    python /tmp/miles/tools/convert_hf_to_torch_dist.py \
+    echo "=== Converting weights (HF -> torch_dist) on GPU worker ==="
+    python convert_weights_remote.py \
         ${MODEL_ARGS[@]} \
         --no-gradient-accumulation-fusion \
         --hf-checkpoint ${STORAGE}/Qwen3-8B \

From eba802aca5c5fab30e1dd3de0b051ba13222d9a6 Mon Sep 17 00:00:00 2001
From: Robert Nishihara
Date: Fri, 27 Feb 2026 18:23:18 -0800
Subject: [PATCH 7/7] Add Ray remote wrapper for training script

- Create train_remote.py with @ray.remote(num_gpus=4)
- Ensures training runs on GPU workers instead of head node
- Both weight conversion and training now use Ray remote

Signed-off-by: Robert Nishihara
---
 miles_qwen3_8b_h100/entrypoint.sh   |  2 +-
 miles_qwen3_8b_h100/train_remote.py | 24 ++++++++++++++++++++++++
 2 files changed, 25 insertions(+), 1 deletion(-)
 create mode 100644 miles_qwen3_8b_h100/train_remote.py

diff --git a/miles_qwen3_8b_h100/entrypoint.sh b/miles_qwen3_8b_h100/entrypoint.sh
index 04345e8..eba3188 100755
--- a/miles_qwen3_8b_h100/entrypoint.sh
+++ b/miles_qwen3_8b_h100/entrypoint.sh
@@ -133,7 +133,7 @@ MISC_ARGS=(
 )
 
 echo "=== Starting training ==="
-python /tmp/miles/train_async.py \
+python train_remote.py \
     --actor-num-nodes 1 \
     --actor-num-gpus-per-node 4 \
     --rollout-num-gpus 4 \
diff --git a/miles_qwen3_8b_h100/train_remote.py b/miles_qwen3_8b_h100/train_remote.py
new file mode 100644
index 0000000..d95fbaa
--- /dev/null
+++ b/miles_qwen3_8b_h100/train_remote.py
@@ -0,0 +1,24 @@
+#!/usr/bin/env python
+"""Ray remote wrapper for training - ensures it runs on GPU workers."""
+import sys
+import subprocess
+import ray
+
+@ray.remote(num_gpus=4)  # NOTE(review): reserves 4 GPUs for this wrapper task itself; entrypoint.sh also passes --actor-num-gpus-per-node 4 and --rollout-num-gpus 4 - confirm the total fits the 8-GPU worker
+def run_training(cmd_args):
+    """Run train_async.py with cmd_args in a subprocess and return its exit code."""
+    result = subprocess.run(
+        ["python", "/tmp/miles/train_async.py"] + cmd_args,
+        capture_output=False,  # Not captured: the child writes to the stdout/stderr it inherits
+        text=True
+    )
+    return result.returncode
+
+if __name__ == "__main__":
+    # Forward every command-line argument unchanged to train_async.py.
+    cmd_args = sys.argv[1:]
+
+    # Block until the remote task returns the subprocess exit code.
+    returncode = ray.get(run_training.remote(cmd_args))
+
+    sys.exit(returncode)