diff --git a/sglang_inference/Dockerfile b/sglang_inference/Dockerfile new file mode 100644 index 0000000..061792a --- /dev/null +++ b/sglang_inference/Dockerfile @@ -0,0 +1,39 @@ +FROM anyscale/ray:2.53.0-py312-cu129 + +# System dependencies for building SGLang extensions +RUN sudo apt-get update && \ + sudo apt-get install -y --no-install-recommends \ + build-essential \ + cmake \ + ninja-build \ + libnuma-dev \ + curl \ + && sudo rm -rf /var/lib/apt/lists/* + +# CUDA toolkit (nvcc) for compiling CUDA kernels +RUN curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb -o /tmp/cuda-keyring.deb && \ + sudo dpkg -i /tmp/cuda-keyring.deb && \ + rm /tmp/cuda-keyring.deb && \ + sudo apt-get update && \ + sudo apt-get install -y --no-install-recommends \ + cuda-nvcc-12-9 \ + cuda-cudart-dev-12-9 \ + cuda-crt-12-9 \ + && sudo rm -rf /var/lib/apt/lists/* && \ + sudo rm -rf /usr/local/cuda && \ + sudo ln -s /usr/local/cuda-12.9 /usr/local/cuda + +ENV PATH="/usr/local/cuda/bin:${PATH}" +ENV CUDA_HOME="/usr/local/cuda" +ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:${LD_LIBRARY_PATH}" + +# Python dependencies +RUN curl -LsSf https://astral.sh/uv/install.sh | sh +ENV PATH="/home/ray/.local/bin:${PATH}" + +# Install SGLang from a feature branch that supports Ray actor scheduling. +# Replace with a released version after this feature is merged into a future SGLang release. +RUN uv pip install --system \ + "sglang[all] @ git+https://github.com/xyuzh/sglang.git@feature/ray-actor-scheduler#subdirectory=python" + +WORKDIR /home/ray/default diff --git a/sglang_inference/README.md b/sglang_inference/README.md new file mode 100644 index 0000000..94affe0 --- /dev/null +++ b/sglang_inference/README.md @@ -0,0 +1,93 @@ +# Deploy SGLang Multi-Node Inference + +This example deploys [SGLang](https://github.com/sgl-project/sglang) for multi-node tensor-parallel inference using Ray on Anyscale. 
+ +## Install the Anyscale CLI + +```bash +pip install -U anyscale +anyscale login +``` + +## Clone the example + +```bash +git clone https://github.com/anyscale/examples.git +cd examples/sglang_inference +``` + +## Batch inference + +Run batch inference as an Anyscale job: + +```bash +anyscale job submit -f job.yaml +``` + +Or with the larger model: + +```bash +anyscale job submit -f job.yaml --env MODEL_PATH=Qwen/Qwen3-30B-A3B-Instruct-2507 +``` + +## Deploy as a service + +Deploy as an HTTP endpoint with Ray Serve: + +```bash +anyscale service deploy -f service.yaml +``` + +Or with the larger model: + +```bash +anyscale service deploy -f service.yaml --env MODEL_PATH=Qwen/Qwen3-30B-A3B-Instruct-2507 +``` + +Wait for the service to be ready: + +```bash +anyscale service wait --name sglang-inference --state RUNNING --timeout-s 900 +``` + +The `anyscale service deploy` command outputs a line that looks like: + +```text +curl -H "Authorization: Bearer <SERVICE_TOKEN>" <SERVICE_URL> +``` + +Set the environment variables from this output and query the model: + +```bash +export SERVICE_URL=<service-url> +export SERVICE_TOKEN=<service-token> + +pip install requests +python query.py +``` + +Shut down the service when done: + +```bash +anyscale service terminate --name sglang-inference +``` + +## Understanding the example + +- [serve.py](https://github.com/anyscale/examples/blob/main/sglang_inference/serve.py) uses Ray Serve's [`placement_group_bundles`](https://docs.ray.io/en/latest/serve/advanced-guides/replica-scheduling.html) to reserve GPUs across multiple nodes for tensor-parallel inference. +- [driver_offline.py](https://github.com/anyscale/examples/blob/main/sglang_inference/driver_offline.py) wraps SGLang in a Ray actor for batch inference. +- SGLang is imported inside the actor because it initializes CUDA and cannot be imported on CPU-only nodes. +- The default configuration uses TP=4, PP=2 across 2 nodes (8 GPUs per replica) on A10G GPUs. Other GPU types like L4, L40S, A100, and H100 would also work. 
+- The service autoscales from 1-4 replicas based on queue depth. See [AutoscalingConfig](https://docs.ray.io/en/latest/serve/api/doc/ray.serve.config.AutoscalingConfig.html) for tuning. +- The [Dockerfile](https://github.com/anyscale/examples/blob/main/sglang_inference/Dockerfile) installs CUDA toolkit and SGLang dependencies on top of the Ray base image. + +**Environment variables:** + +Override any variable at deploy/submit time with `--env`: + +| Variable | Default | Description | +|----------|---------|-------------| +| `MODEL_PATH` | `Qwen/Qwen3-1.7B` | HuggingFace model ID | +| `TP_SIZE` | `4` | Tensor parallelism (GPUs per pipeline stage) | +| `PP_SIZE` | `2` | Pipeline parallelism (number of stages) | +| `NUM_NODES` | `2` | Nodes per replica | diff --git a/sglang_inference/driver_offline.py b/sglang_inference/driver_offline.py new file mode 100644 index 0000000..361b8fa --- /dev/null +++ b/sglang_inference/driver_offline.py @@ -0,0 +1,105 @@ +""" +Offline (batch) inference with SGLang on Ray. + +Wraps sglang.Engine in a Ray actor for multi-node batch generation. +The driver (head node) needs no GPU — sglang is imported only inside the actor. + +Usage: + python driver_offline.py +""" + +import os +import time + +import ray +from ray.util.placement_group import placement_group +from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy + +# Configuration from environment (same as serve.py) +MODEL_PATH = os.environ.get("MODEL_PATH", "Qwen/Qwen3-1.7B") +TP_SIZE = int(os.environ.get("TP_SIZE", "4")) +PP_SIZE = int(os.environ.get("PP_SIZE", "2")) +NUM_NODES = int(os.environ.get("NUM_NODES", "2")) + + +@ray.remote +class EngineActor: + """Thin wrapper that creates an sglang.Engine inside a Ray actor. + + We import sglang inside the actor because it initializes CUDA and + cannot be imported on the CPU-only head node where the driver runs. 
+ """ + + def __init__(self, **kwargs): + from sglang import Engine + + self.engine = Engine(**kwargs) + + def generate(self, prompts, sampling_params): + return [ + self.engine.generate(prompt=p, sampling_params=sampling_params) + for p in prompts + ] + + def shutdown(self): + self.engine.shutdown() + + +def main(): + gpus_per_node = (TP_SIZE * PP_SIZE) // NUM_NODES + + print(f"Configuration: MODEL_PATH={MODEL_PATH}, TP={TP_SIZE}, PP={PP_SIZE}, NUM_NODES={NUM_NODES}") + print(f"GPUs per node: {gpus_per_node}") + + # Reserve GPUs across nodes + pg = placement_group( + bundles=[{"CPU": 1, "GPU": gpus_per_node}] * NUM_NODES, + ) + ray.get(pg.ready()) + print("Placement group ready.") + + # Start engine actor on the first bundle + engine = EngineActor.options( + num_cpus=1, + num_gpus=0, + scheduling_strategy=PlacementGroupSchedulingStrategy( + placement_group=pg, placement_group_bundle_index=0, + ), + ).remote( + model_path=MODEL_PATH, + tp_size=TP_SIZE, + pp_size=PP_SIZE, + nnodes=NUM_NODES, + use_ray=True, + ) + + # Wait for engine to be ready (model loaded) + print("Loading model...") + ray.get(engine.generate.remote(["warmup"], {"max_new_tokens": 1})) + print("Engine ready.") + + # Batch generate + prompts = [ + "The capital of France is", + "Explain quantum computing in simple terms:", + "Write a haiku about programming:", + "What is 2 + 2?", + ] + + t0 = time.time() + results = ray.get( + engine.generate.remote(prompts, {"max_new_tokens": 64, "temperature": 0.0}) + ) + print(f"Generated {len(results)} responses in {time.time() - t0:.2f}s\n") + + for prompt, result in zip(prompts, results): + print(f"Prompt: {prompt}") + print(f"Response: {result['text'][:200]}\n") + + # Cleanup + ray.get(engine.shutdown.remote()) + ray.util.remove_placement_group(pg) + + +if __name__ == "__main__": + main() diff --git a/sglang_inference/job.yaml b/sglang_inference/job.yaml new file mode 100644 index 0000000..c32a7ca --- /dev/null +++ b/sglang_inference/job.yaml @@ -0,0 +1,36 @@ 
+# Anyscale Job: SGLang Offline (Batch) Inference +# +# Configuration: TP=4, PP=2 across 2 nodes (4 GPUs per node) +# +# Submit (small model, fast): +# anyscale job submit -f job.yaml +# +# Submit (large model): +# anyscale job submit -f job.yaml --env MODEL_PATH=Qwen/Qwen3-30B-A3B-Instruct-2507 + +name: sglang-offline-inference + +containerfile: ./Dockerfile + +compute_config: + head_node: + instance_type: m5.2xlarge # CPU-only head + worker_nodes: + - instance_type: g5.12xlarge # 4x A10G + min_nodes: 4 + max_nodes: 4 + +env_vars: + MODEL_PATH: "Qwen/Qwen3-1.7B" + TP_SIZE: "4" + PP_SIZE: "2" + NUM_NODES: "2" + # Ray normally sets CUDA_VISIBLE_DEVICES for each worker process. + # Disable this because SGLang assumes CUDA_VISIBLE_DEVICES lists all GPUs. + RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES: "1" + +working_dir: . + +entrypoint: python driver_offline.py + +max_retries: 0 diff --git a/sglang_inference/query.py b/sglang_inference/query.py new file mode 100644 index 0000000..90af229 --- /dev/null +++ b/sglang_inference/query.py @@ -0,0 +1,29 @@ +"""Query the SGLang inference service.""" + +import os +import requests + +SERVICE_URL = os.environ.get("SERVICE_URL") +SERVICE_TOKEN = os.environ.get("SERVICE_TOKEN") + +if not SERVICE_URL or not SERVICE_TOKEN: + print("Set SERVICE_URL and SERVICE_TOKEN from 'anyscale service deploy' output") + raise SystemExit(1) + +prompts = [ + "The capital of France is", + "Explain quantum computing in one sentence:", + "Write a haiku about programming:", + "What is 2 + 2?", + "The largest planet in our solar system is", +] + +for prompt in prompts: + response = requests.post( + SERVICE_URL, + headers={"Authorization": f"Bearer {SERVICE_TOKEN}"}, + json={"text": prompt, "sampling_params": {"max_new_tokens": 32}}, + timeout=120, + ) + response.raise_for_status() + print(f"{prompt}{response.json()['text']}\n") diff --git a/sglang_inference/serve.py b/sglang_inference/serve.py new file mode 100644 index 0000000..0bc5533 --- /dev/null 
+++ b/sglang_inference/serve.py @@ -0,0 +1,80 @@ +"""Ray Serve deployment for SGLang inference. + +This deployment uses Ray Serve's placement_group_bundles to reserve GPUs +across multiple nodes for tensor-parallel inference with SGLang. + +Based on the Ray Serve LLM + SGLang integration pattern from: +https://github.com/ray-project/ray/pull/58366 +""" + +import os +import signal + +from fastapi import FastAPI +from ray import serve + +# Configuration from environment (same defaults as driver_offline.py) +MODEL_PATH = os.environ.get("MODEL_PATH", "Qwen/Qwen3-1.7B") +TP_SIZE = int(os.environ.get("TP_SIZE", "4")) +PP_SIZE = int(os.environ.get("PP_SIZE", "2")) +NUM_NODES = int(os.environ.get("NUM_NODES", "2")) + +gpus_per_node = (TP_SIZE * PP_SIZE) // NUM_NODES + +app = FastAPI() + + +@serve.deployment( + autoscaling_config={ + "min_replicas": 1, + "max_replicas": 4, + "target_ongoing_requests": 4, + }, + ray_actor_options={ + "num_cpus": 1, + "num_gpus": 0, + }, + # Reserve resources across multiple nodes for tensor parallelism. + # Each bundle reserves GPUs on one node. + placement_group_bundles=[{"CPU": 1, "GPU": gpus_per_node}] * NUM_NODES, +) +@serve.ingress(app) +class SGLangDeployment: + def __init__(self): + # Import sglang inside the actor because it initializes CUDA and + # cannot be imported on the CPU-only head node where the Serve + # controller runs. + from sglang import Engine + + # Monkey patch signal.signal to avoid "signal only works in main thread" + # error. SGLang tries to register signal handlers for graceful shutdown, + # but Ray Serve workers are not in the main thread. 
+ original_signal = signal.signal + + def noop_signal_handler(sig, action): + return signal.SIG_DFL + + try: + signal.signal = noop_signal_handler + self.engine = Engine( + model_path=MODEL_PATH, + tp_size=TP_SIZE, + pp_size=PP_SIZE, + nnodes=NUM_NODES, + use_ray=True, + ) + finally: + signal.signal = original_signal + + @app.post("/") + async def generate(self, request: dict) -> dict: + text = request.get("text", "") + sampling_params = request.get("sampling_params", {"max_new_tokens": 64}) + result = await self.engine.async_generate( + prompt=text, + sampling_params=sampling_params, + ) + return {"text": result["text"]} + + +app_deploy = SGLangDeployment.bind() diff --git a/sglang_inference/service.yaml b/sglang_inference/service.yaml new file mode 100644 index 0000000..c3473b5 --- /dev/null +++ b/sglang_inference/service.yaml @@ -0,0 +1,35 @@ +# Anyscale Service: SGLang Online Inference +# +# Configuration: TP=4, PP=2 across 2 nodes (4 GPUs per node) +# +# Deploy (small model, fast): +# anyscale service deploy -f service.yaml +# +# Deploy (large model): +# anyscale service deploy -f service.yaml --env MODEL_PATH=Qwen/Qwen3-30B-A3B-Instruct-2507 + +name: sglang-inference + +containerfile: ./Dockerfile + +compute_config: + head_node: + instance_type: m5.2xlarge # CPU-only head + worker_nodes: + - instance_type: g5.12xlarge # 4x A10G + min_nodes: 4 + max_nodes: 8 + +env_vars: + MODEL_PATH: "Qwen/Qwen3-1.7B" + TP_SIZE: "4" + PP_SIZE: "2" + NUM_NODES: "2" + # Ray normally sets CUDA_VISIBLE_DEVICES for each worker process. + # Disable this because SGLang assumes CUDA_VISIBLE_DEVICES lists all GPUs. + RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES: "1" + +working_dir: . + +applications: + - import_path: serve:app_deploy