69 changes: 69 additions & 0 deletions swift_training/Dockerfile
@@ -0,0 +1,69 @@
# Dockerfile for Megatron-SWIFT with Ray Train
#
# This image provides SWIFT with Megatron-LM support for distributed LLM training.
#
# Build:
# docker build -t swift-megatron:latest .
#
# For Anyscale, this Dockerfile serves as a reference for the container build configuration.

FROM anyscale/ray:2.53.0-py312-cu128

# Pin PyTorch CUDA stack for this training workflow.
# PyTorch v2.9 has a bug with conv3d. See https://github.com/pytorch/pytorch/issues/166122 for more details.
# As of 2026-02-11, SWIFT recommends using PyTorch v2.8.0 (https://github.com/modelscope/ms-swift/issues/6744).
RUN pip install --no-cache-dir --index-url https://download.pytorch.org/whl/cu128 \
"torch==2.8.0"

# Install core dependencies.
# Version specifiers are quoted so the shell does not treat `>` as output redirection.
RUN pip install --no-cache-dir \
    "transformers>=4.50.0" \
    "datasets>=2.0.0" \
    "tensorboard>=2.19.0" \
    "accelerate>=0.25.0" \
    "peft>=0.14.0" \
    "tqdm>=4.67.1" \
    rich \
    "wandb>=0.19.10" \
    "modelscope>=1.20.0"

# Install NVIDIA packages for Megatron performance (optional, allow failure)
RUN pip install --no-cache-dir nvidia-modelopt || true
RUN pip install --no-cache-dir nvidia-resiliency-ext || true

# Install Transformer Engine with PyTorch extensions.
# --no-build-isolation is required so TE can find torch during CUDA extension compilation.
# The Anyscale build environment has CUDA toolkit available (verified working).
RUN pip install --no-cache-dir --no-build-isolation "transformer-engine[pytorch]==2.8.0"

# NOTE: FlashAttention is NOT installed here because its CUDA kernel compilation
# takes >1 hour, exceeding Anyscale's image build timeout.

# Install SWIFT with Megatron support
# Note: ms-swift automatically handles Megatron-LM setup (clones core_r0.15.0)
RUN pip install --no-cache-dir "ms-swift>=3.3.0"

# Patch SWIFT bug: get_padding_to crashes with attention_backend=fused when
# padding_to is None (TypeError: '>' not supported between int and NoneType).
# Fix: use `padding_to or 0` instead of bare `padding_to` in the max() call.
# Note: str.replace is idempotent — safe to run even if already patched.
RUN python -c "import swift.megatron.utils.utils as m; import pathlib; p=pathlib.Path(m.__file__); t=p.read_text(); p.write_text(t.replace('max(padding_to,', 'max(padding_to or 0,')); print('Patched get_padding_to')"

# Set environment variables for Megatron
ENV CUDA_DEVICE_MAX_CONNECTIONS=1
ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
ENV RAY_TRAIN_V2_ENABLED=1

# Pre-download Megatron-LM to avoid runtime delays
# SWIFT will clone this automatically, but pre-downloading helps with reproducibility
RUN python -c "from swift.megatron import megatron_sft_main; print('SWIFT Megatron module loaded successfully')" || true

# Pre-download the tutorial model (Qwen2.5-1.5B-Instruct) into the image.
# Without this, all 4 workers attempt a slow download at runtime (~1 MB/s),
# hitting the NCCL 30-minute barrier timeout and crashing.
RUN python -c "\
from modelscope.hub.snapshot_download import snapshot_download; \
snapshot_download('Qwen/Qwen2.5-1.5B-Instruct'); \
print('Model pre-downloaded successfully')"

WORKDIR /app
141 changes: 141 additions & 0 deletions swift_training/README.md
@@ -0,0 +1,141 @@
# Ray Train + Megatron-SWIFT LLM Fine-tuning Example

This example demonstrates distributed LLM fine-tuning using:
- **Ray Train**: Orchestrates distributed workers across GPUs/nodes
- **Megatron-SWIFT**: Provides efficient tensor and pipeline parallelism for training (See the [document](https://swift.readthedocs.io/en/latest/Megatron-SWIFT/Quick-start.html) for more details)

## Overview

The integration combines SWIFT's easy-to-use training interface with Megatron-LM's parallelism capabilities, orchestrated by Ray Train for multi-node scaling.

### Architecture

```
main() -> TorchTrainer -> train_loop() on each GPU
|
megatron_sft_main()
|
Megatron pretrain()
```
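Schematically, the wiring might look like the sketch below (a non-runnable illustration with assumed names — the real entry point is `llm_sft_ray_train_swift.py`):

```python
# Sketch only: each Ray Train worker becomes one Megatron rank.
from ray.train import ScalingConfig
from ray.train.torch import TorchTrainer

def train_loop(config):
    # Ray Train sets up the torch.distributed env vars; SWIFT's Megatron
    # entry point picks them up and runs Megatron's pretrain() loop.
    from swift.megatron import megatron_sft_main
    megatron_sft_main(config["megatron_args"])  # args assembled in main()

def main():
    trainer = TorchTrainer(
        train_loop,
        train_loop_config={"megatron_args": ...},  # parsed CLI flags
        scaling_config=ScalingConfig(num_workers=4, use_gpu=True),
    )
    trainer.fit()
```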

## Files

- `llm_sft_ray_train_swift.py` - Main training script with Ray Train integration
- `job.yaml` - Anyscale job configuration for cloud deployment
- `Dockerfile` - Container image with SWIFT and Megatron dependencies

## Prerequisites

The job builds a Docker image from the included `Dockerfile` at submit time. No pre-built image is needed.

## Quick Start

```bash
# Submit job to Anyscale (passes HF token for model downloads)
anyscale job submit -f job.yaml --env HF_TOKEN=$HF_TOKEN

# Monitor logs
anyscale job logs <job-id>
```

**What this job does:**
1. **Builds** a Docker image with SWIFT and Megatron dependencies (using `Dockerfile`).
2. **Provisions** 4 GPUs (tested with 4×L4 GPUs).
3. **Runs** the distributed training script `llm_sft_ray_train_swift.py`.

## Configuration

### Parallelism Settings

| Parameter | Default | Description |
|-----------|---------|-------------|
| `--tensor_parallel_size` | 2 | Shard each layer's weights across GPUs |
| `--pipeline_parallel_size` | 1 | Split the model into sequential stages across GPUs |
| `--num_workers` | 4 | Total number of Ray Train workers (one per GPU) |

**Note**: `num_workers` must be divisible by `tensor_parallel_size * pipeline_parallel_size`.

Data Parallelism (DP) is automatically calculated as:
```
DP = num_workers / (TP * PP)
```
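The divisibility rule and the DP formula can be checked with a small helper (illustrative only; this function is not part of the training script):

```python
def data_parallel_size(num_workers: int, tp: int, pp: int) -> int:
    """Compute DP = num_workers / (TP * PP), enforcing divisibility."""
    model_parallel = tp * pp
    if num_workers % model_parallel != 0:
        raise ValueError(
            f"num_workers={num_workers} must be divisible by TP*PP={model_parallel}"
        )
    return num_workers // model_parallel

print(data_parallel_size(4, 2, 1))  # default layout -> 2
```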

### Training Settings

| Parameter | Default | Description |
|-----------|---------|-------------|
| `--model` | Qwen/Qwen2.5-1.5B-Instruct | HuggingFace model ID |
| `--dataset` | AI-ModelScope/alpaca-gpt4-data-en#500 | Dataset ID (append `#N` to sample N examples) |
| `--train_iters` | 100 | Number of training iterations |
| `--micro_batch_size` | 2 | Batch size per GPU |
| `--seq_length` | 512 | Maximum sequence length |
| `--learning_rate` | 1e-5 | Learning rate |

### LoRA Settings

Enable parameter-efficient fine-tuning with LoRA:

```bash
python llm_sft_ray_train_swift.py --use_lora --lora_rank 8 --lora_alpha 32
```

## Example Configurations

### 8 GPUs with TP=2, PP=1 (DP=4)

```bash
python llm_sft_ray_train_swift.py \
--num_workers 8 \
--tensor_parallel_size 2 \
--pipeline_parallel_size 1 \
--model Qwen/Qwen2.5-7B-Instruct \
--train_iters 200
```

### 8 GPUs with TP=4, PP=2 (DP=1)

For larger models requiring more parallelism:

```bash
python llm_sft_ray_train_swift.py \
--num_workers 8 \
--tensor_parallel_size 4 \
--pipeline_parallel_size 2 \
--model Qwen/Qwen2.5-72B-Instruct \
--micro_batch_size 1
```

## Supported Models

SWIFT's Megatron integration supports many HuggingFace models, including:
- Qwen2/Qwen2.5 series
- Llama 2/3 series
- Mistral/Mixtral series
- DeepSeek series

Check [SWIFT documentation](https://swift.readthedocs.io/en/latest/Megatron-SWIFT/index.html) for the full list.

## Troubleshooting

### CUDA Out of Memory

- Reduce `--micro_batch_size`
- Reduce `--seq_length`
- Increase `--tensor_parallel_size`
- Enable LoRA with `--use_lora`

### Distributed Initialization Errors

- Ensure all GPUs are visible: `echo $CUDA_VISIBLE_DEVICES`
- For multi-node, ensure `MODELSCOPE_CACHE` points to shared storage

### Slow Data Loading

- For multi-node training, set `MODELSCOPE_CACHE` to a shared storage path
- Consider using streaming datasets for large datasets
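
For example (the mount point below is an assumption — use whatever shared path your cluster exposes):

```shell
# Share one ModelScope cache across nodes so the model is downloaded once.
# /mnt/cluster_storage is a hypothetical shared mount, not from this repo.
export MODELSCOPE_CACHE=/mnt/cluster_storage/modelscope
```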

## References

- [SWIFT Megatron Documentation](https://swift.readthedocs.io/en/latest/Megatron-SWIFT/index.html)
- [Ray Train Documentation](https://docs.ray.io/en/latest/train/train.html)
- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM)
38 changes: 38 additions & 0 deletions swift_training/job.yaml
@@ -0,0 +1,38 @@
# Anyscale Job configuration for Ray Train + Megatron-SWIFT training
#
# Submit with:
# anyscale job submit -f job.yaml --env HF_TOKEN=$HF_TOKEN

name: swift-ray-train-sft
max_retries: 0

# Build image from local Dockerfile at submit time
containerfile: ./Dockerfile
# When left empty, Anyscale auto-selects the cloud and instance types.
cloud:
compute_config:

working_dir: .

env_vars:
CUDA_DEVICE_MAX_CONNECTIONS: "1"
PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True"
RAY_TRAIN_V2_ENABLED: "1"
NCCL_DEBUG: "WARN"
PYTHONUNBUFFERED: "1"

entrypoint: |
python llm_sft_ray_train_swift.py \
--model Qwen/Qwen2.5-1.5B-Instruct \
--dataset "AI-ModelScope/alpaca-gpt4-data-en#500" \
--num_workers 4 \
--tensor_parallel_size 2 \
--pipeline_parallel_size 1 \
--train_iters 100 \
--save_interval 50 \
--log_interval 10 \
--micro_batch_size 1 \
--storage_path /mnt/local_storage \
--attention_backend auto \
--padding_free false