39 changes: 39 additions & 0 deletions sglang_inference/Dockerfile
FROM anyscale/ray:2.53.0-py312-cu129

# System dependencies for building SGLang extensions
RUN sudo apt-get update && \
    sudo apt-get install -y --no-install-recommends \
        build-essential \
        cmake \
        ninja-build \
        libnuma-dev \
        curl \
    && sudo rm -rf /var/lib/apt/lists/*

# CUDA toolkit (nvcc) for compiling CUDA kernels
RUN curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb -o /tmp/cuda-keyring.deb && \
    sudo dpkg -i /tmp/cuda-keyring.deb && \
    rm /tmp/cuda-keyring.deb && \
    sudo apt-get update && \
    sudo apt-get install -y --no-install-recommends \
        cuda-nvcc-12-9 \
        cuda-cudart-dev-12-9 \
        cuda-crt-12-9 \
    && sudo rm -rf /var/lib/apt/lists/* && \
    sudo rm -rf /usr/local/cuda && \
    sudo ln -s /usr/local/cuda-12.9 /usr/local/cuda

ENV PATH="/usr/local/cuda/bin:${PATH}"
ENV CUDA_HOME="/usr/local/cuda"
ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:${LD_LIBRARY_PATH}"

# Python dependencies
RUN curl -LsSf https://astral.sh/uv/install.sh | sh
ENV PATH="/home/ray/.local/bin:${PATH}"

# Install SGLang from a feature branch that supports Ray actor scheduling.
# Replace with a released version after this feature is merged into a future SGLang release.
RUN uv pip install --system \
    "sglang[all] @ git+https://github.com/xyuzh/sglang.git@feature/ray-actor-scheduler#subdirectory=python"

WORKDIR /home/ray/default
93 changes: 93 additions & 0 deletions sglang_inference/README.md
# Deploy SGLang Multi-Node Inference

This example deploys [SGLang](https://github.com/sgl-project/sglang) for multi-node tensor-parallel inference using Ray on Anyscale.

## Install the Anyscale CLI

```bash
pip install -U anyscale
anyscale login
```

## Clone the example

```bash
git clone https://github.com/anyscale/examples.git
cd examples/sglang_inference
```

## Batch inference

Run batch inference as an Anyscale job:

```bash
anyscale job submit -f job.yaml
```

Or with the larger model:

```bash
anyscale job submit -f job.yaml --env MODEL_PATH=Qwen/Qwen3-30B-A3B-Instruct-2507
```

## Deploy as a service

Deploy as an HTTP endpoint with Ray Serve:

```bash
anyscale service deploy -f service.yaml
```

Or with the larger model:

```bash
anyscale service deploy -f service.yaml --env MODEL_PATH=Qwen/Qwen3-30B-A3B-Instruct-2507
```

Wait for the service to be ready:

```bash
anyscale service wait --name sglang-inference --state RUNNING --timeout-s 900
```

The `anyscale service deploy` command outputs a line that looks like:

```text
curl -H "Authorization: Bearer <SERVICE_TOKEN>" <SERVICE_URL>
```

Set the environment variables from this output and query the model:

```bash
export SERVICE_URL=<SERVICE_URL>
export SERVICE_TOKEN=<SERVICE_TOKEN>

pip install requests
python query.py
```

Shut down the service when you're done:

```bash
anyscale service terminate --name sglang-inference
```

## Understanding the example

- [serve.py](https://github.com/anyscale/examples/blob/main/sglang_inference/serve.py) uses Ray Serve's [`placement_group_bundles`](https://docs.ray.io/en/latest/serve/advanced-guides/replica-scheduling.html) to reserve GPUs across multiple nodes for tensor-parallel inference.
- [driver_offline.py](https://github.com/anyscale/examples/blob/main/sglang_inference/driver_offline.py) wraps SGLang in a Ray actor for batch inference.
- SGLang is imported inside the actor because it initializes CUDA and cannot be imported on CPU-only nodes.
- The default configuration uses TP=4, PP=2 across 2 nodes (8 GPUs per replica) on A10G GPUs. Other GPU types like L4, L40S, A100, and H100 would also work.
- The service autoscales from 1-4 replicas based on queue depth. See [AutoscalingConfig](https://docs.ray.io/en/latest/serve/api/doc/ray.serve.config.AutoscalingConfig.html) for tuning.
- The [Dockerfile](https://github.com/anyscale/examples/blob/main/sglang_inference/Dockerfile) installs CUDA toolkit and SGLang dependencies on top of the Ray base image.
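
The bundle math that both `serve.py` and `driver_offline.py` rely on can be sketched in isolation. This is a minimal illustration using the default values above, not part of the example's files: each replica needs `TP_SIZE * PP_SIZE` GPUs in total, spread evenly across `NUM_NODES` nodes, with one placement-group bundle per node.

```python
import os

# Same defaults as serve.py and driver_offline.py.
TP_SIZE = int(os.environ.get("TP_SIZE", "4"))
PP_SIZE = int(os.environ.get("PP_SIZE", "2"))
NUM_NODES = int(os.environ.get("NUM_NODES", "2"))

# One bundle per node; each bundle reserves that node's share of GPUs.
gpus_per_node = (TP_SIZE * PP_SIZE) // NUM_NODES
bundles = [{"CPU": 1, "GPU": gpus_per_node}] * NUM_NODES

print(bundles)  # [{'CPU': 1, 'GPU': 4}, {'CPU': 1, 'GPU': 4}]
```

The same `bundles` list is passed to `placement_group(...)` in the batch script and to `placement_group_bundles=` in the Serve deployment.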

**Environment variables:**

Override any variable at deploy/submit time with `--env`:

| Variable | Default | Description |
|----------|---------|-------------|
| `MODEL_PATH` | `Qwen/Qwen3-1.7B` | HuggingFace model ID |
| `TP_SIZE` | `4` | Tensor parallelism (GPUs per pipeline stage) |
| `PP_SIZE` | `2` | Pipeline parallelism (number of stages) |
| `NUM_NODES` | `2` | Nodes per replica |
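
Note that the scripts compute GPUs per node with integer division, so these three variables should satisfy `TP_SIZE * PP_SIZE % NUM_NODES == 0`; otherwise the placement group under-reserves GPUs. A hedged sketch of a validation helper (hypothetical, not part of the example's files):

```python
def gpus_per_node(tp_size: int, pp_size: int, num_nodes: int) -> int:
    """Return the number of GPUs each node must contribute for one replica."""
    total_gpus = tp_size * pp_size
    if total_gpus % num_nodes != 0:
        raise ValueError(
            f"TP_SIZE * PP_SIZE ({total_gpus}) must be divisible by "
            f"NUM_NODES ({num_nodes})"
        )
    return total_gpus // num_nodes

# Defaults: TP=4, PP=2 across 2 nodes -> 4 GPUs per node (one g5.12xlarge).
print(gpus_per_node(4, 2, 2))  # 4
```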
105 changes: 105 additions & 0 deletions sglang_inference/driver_offline.py
"""
Offline (batch) inference with SGLang on Ray.

Wraps sglang.Engine in a Ray actor for multi-node batch generation.
The driver (head node) needs no GPU — sglang is imported only inside the actor.

Usage:
python driver_offline.py
"""

import os
import time

import ray
from ray.util.placement_group import placement_group
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy

# Configuration from environment (same as serve.py)
MODEL_PATH = os.environ.get("MODEL_PATH", "Qwen/Qwen3-1.7B")
TP_SIZE = int(os.environ.get("TP_SIZE", "4"))
PP_SIZE = int(os.environ.get("PP_SIZE", "2"))
NUM_NODES = int(os.environ.get("NUM_NODES", "2"))


@ray.remote
class EngineActor:
    """Thin wrapper that creates an sglang.Engine inside a Ray actor.

    We import sglang inside the actor because it initializes CUDA and
    cannot be imported on the CPU-only head node where the driver runs.
    """

    def __init__(self, **kwargs):
        from sglang import Engine

        self.engine = Engine(**kwargs)

    def generate(self, prompts, sampling_params):
        return [
            self.engine.generate(prompt=p, sampling_params=sampling_params)
            for p in prompts
        ]

    def shutdown(self):
        self.engine.shutdown()


def main():
    gpus_per_node = (TP_SIZE * PP_SIZE) // NUM_NODES

    print(f"Configuration: MODEL_PATH={MODEL_PATH}, TP={TP_SIZE}, PP={PP_SIZE}, NUM_NODES={NUM_NODES}")
    print(f"GPUs per node: {gpus_per_node}")

    # Reserve GPUs across nodes
    pg = placement_group(
        bundles=[{"CPU": 1, "GPU": gpus_per_node}] * NUM_NODES,
    )
    ray.get(pg.ready())
    print("Placement group ready.")

    # Start the engine actor on the first bundle
    engine = EngineActor.options(
        num_cpus=1,
        num_gpus=0,
        scheduling_strategy=PlacementGroupSchedulingStrategy(
            placement_group=pg, placement_group_bundle_index=0,
        ),
    ).remote(
        model_path=MODEL_PATH,
        tp_size=TP_SIZE,
        pp_size=PP_SIZE,
        nnodes=NUM_NODES,
        use_ray=True,
    )

    # Wait for the engine to be ready (model loaded)
    print("Loading model...")
    ray.get(engine.generate.remote(["warmup"], {"max_new_tokens": 1}))
    print("Engine ready.")

    # Batch generate
    prompts = [
        "The capital of France is",
        "Explain quantum computing in simple terms:",
        "Write a haiku about programming:",
        "What is 2 + 2?",
    ]

    t0 = time.time()
    results = ray.get(
        engine.generate.remote(prompts, {"max_new_tokens": 64, "temperature": 0.0})
    )
    print(f"Generated {len(results)} responses in {time.time() - t0:.2f}s\n")

    for prompt, result in zip(prompts, results):
        print(f"Prompt: {prompt}")
        print(f"Response: {result['text'][:200]}\n")

    # Cleanup
    ray.get(engine.shutdown.remote())
    ray.util.remove_placement_group(pg)


if __name__ == "__main__":
    main()
36 changes: 36 additions & 0 deletions sglang_inference/job.yaml
# Anyscale Job: SGLang Offline (Batch) Inference
#
# Configuration: TP=4, PP=2 across 2 nodes (4 GPUs per node)
#
# Submit (small model, fast):
# anyscale job submit -f job.yaml
#
# Submit (large model):
# anyscale job submit -f job.yaml --env MODEL_PATH=Qwen/Qwen3-30B-A3B-Instruct-2507

name: sglang-offline-inference

containerfile: ./Dockerfile

compute_config:
  head_node:
    instance_type: m5.2xlarge # CPU-only head
  worker_nodes:
    - instance_type: g5.12xlarge # 4x A10G
      min_nodes: 4
      max_nodes: 4

env_vars:
  MODEL_PATH: "Qwen/Qwen3-1.7B"
  TP_SIZE: "4"
  PP_SIZE: "2"
  NUM_NODES: "2"
  # Ray normally sets CUDA_VISIBLE_DEVICES for each worker process.
  # Disable this because SGLang assumes CUDA_VISIBLE_DEVICES lists all GPUs.
  RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES: "1"

working_dir: .

entrypoint: python driver_offline.py

max_retries: 0
29 changes: 29 additions & 0 deletions sglang_inference/query.py
"""Query the SGLang inference service."""

import os
import requests

SERVICE_URL = os.environ.get("SERVICE_URL")
SERVICE_TOKEN = os.environ.get("SERVICE_TOKEN")

if not SERVICE_URL or not SERVICE_TOKEN:
    print("Set SERVICE_URL and SERVICE_TOKEN from 'anyscale service deploy' output")
    raise SystemExit(1)

prompts = [
    "The capital of France is",
    "Explain quantum computing in one sentence:",
    "Write a haiku about programming:",
    "What is 2 + 2?",
    "The largest planet in our solar system is",
]

for prompt in prompts:
    response = requests.post(
        SERVICE_URL,
        headers={"Authorization": f"Bearer {SERVICE_TOKEN}"},
        json={"text": prompt, "sampling_params": {"max_new_tokens": 32}},
        timeout=120,
    )
    response.raise_for_status()
    print(f"{prompt}{response.json()['text']}\n")
80 changes: 80 additions & 0 deletions sglang_inference/serve.py
"""Ray Serve deployment for SGLang inference.

This deployment uses Ray Serve's placement_group_bundles to reserve GPUs
across multiple nodes for tensor-parallel inference with SGLang.

Based on the Ray Serve LLM + SGLang integration pattern from:
https://github.com/ray-project/ray/pull/58366
"""

import os
import signal

from fastapi import FastAPI
from ray import serve

# Configuration from environment (same defaults as driver_offline.py)
MODEL_PATH = os.environ.get("MODEL_PATH", "Qwen/Qwen3-1.7B")
TP_SIZE = int(os.environ.get("TP_SIZE", "4"))
PP_SIZE = int(os.environ.get("PP_SIZE", "2"))
NUM_NODES = int(os.environ.get("NUM_NODES", "2"))

gpus_per_node = (TP_SIZE * PP_SIZE) // NUM_NODES

app = FastAPI()


@serve.deployment(
    autoscaling_config={
        "min_replicas": 1,
        "max_replicas": 4,
        "target_ongoing_requests": 4,
    },
    ray_actor_options={
        "num_cpus": 1,
        "num_gpus": 0,
    },
    # Reserve resources across multiple nodes for tensor parallelism.
    # Each bundle reserves GPUs on one node.
    placement_group_bundles=[{"CPU": 1, "GPU": gpus_per_node}] * NUM_NODES,
)
@serve.ingress(app)
class SGLangDeployment:
    def __init__(self):
        # Import sglang inside the actor because it initializes CUDA and
        # cannot be imported on the CPU-only head node where the Serve
        # controller runs.
        from sglang import Engine

        # Monkey-patch signal.signal to avoid a "signal only works in main
        # thread" error. SGLang tries to register signal handlers for graceful
        # shutdown, but Ray Serve workers are not in the main thread.
        original_signal = signal.signal

        def noop_signal_handler(sig, action):
            return signal.SIG_DFL

        try:
            signal.signal = noop_signal_handler
            self.engine = Engine(
                model_path=MODEL_PATH,
                tp_size=TP_SIZE,
                pp_size=PP_SIZE,
                nnodes=NUM_NODES,
                use_ray=True,
            )
        finally:
            signal.signal = original_signal

    @app.post("/")
    async def generate(self, request: dict) -> dict:
        text = request.get("text", "")
        sampling_params = request.get("sampling_params", {"max_new_tokens": 64})
        result = await self.engine.async_generate(
            prompt=text,
            sampling_params=sampling_params,
        )
        return {"text": result["text"]}


app_deploy = SGLangDeployment.bind()