diff --git a/deploy_deepseek_r1/Dockerfile b/deploy_deepseek_r1/Dockerfile
new file mode 100644
index 0000000..ea0023c
--- /dev/null
+++ b/deploy_deepseek_r1/Dockerfile
@@ -0,0 +1,8 @@
+FROM anyscale/ray:2.50.0-slim-py312-cu128
+
+# C compiler for Triton’s runtime build step (vLLM V1 engine)
+# https://github.com/vllm-project/vllm/issues/2997
+RUN sudo apt-get update && \
+    sudo apt-get install -y --no-install-recommends build-essential
+
+RUN pip install vllm==0.11.0
diff --git a/deploy_deepseek_r1/README.md b/deploy_deepseek_r1/README.md
new file mode 100644
index 0000000..db4036f
--- /dev/null
+++ b/deploy_deepseek_r1/README.md
@@ -0,0 +1,101 @@
+# Deploy DeepSeek R1
+
+This example uses Ray Serve along with vLLM to deploy a DeepSeek R1 model as an Anyscale service.
+
+## Install the Anyscale CLI
+
+```bash
+pip install -U anyscale
+anyscale login
+```
+
+## Deploy the service
+
+Clone the example from GitHub.
+
+```bash
+git clone https://github.com/anyscale/examples.git
+cd examples/deploy_deepseek_r1
+```
+
+Deploy the service.
+```bash
+anyscale service deploy -f service.yaml
+```
+
+## Understanding the example
+
+- The [application code](https://github.com/anyscale/examples/blob/main/deploy_deepseek_r1/serve_deepseek_r1.py) sets the required accelerator type with `accelerator_type="H100"`. To use a different accelerator, replace `"H100"` with the desired name. See the [list of supported accelerators](https://docs.ray.io/en/latest/ray-core/accelerator-types.html#accelerator-types) for available options.
+- Ray Serve automatically autoscales the number of model replicas between `min_replicas` and `max_replicas`, adapting the replica count by monitoring queue sizes. For more information on configuring autoscaling, see the [AutoscalingConfig documentation](https://docs.ray.io/en/latest/serve/api/doc/ray.serve.config.AutoscalingConfig.html).
+- This example uses vLLM, and the [Dockerfile](https://github.com/anyscale/examples/blob/main/deploy_deepseek_r1/Dockerfile) defines the service’s dependencies. When you run `anyscale service deploy`, the build process adds these dependencies on top of an Anyscale-provided base image.
+- To configure vLLM, modify the `engine_kwargs` dictionary, as sketched below. See the [Ray documentation for the `LLMConfig` object](https://docs.ray.io/en/latest/serve/api/doc/ray.serve.llm.LLMConfig.html#ray.serve.llm.LLMConfig).
+
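+For example, here is a minimal sketch of a modified `LLMConfig` that raises the autoscaling ceiling and the context window (the values are illustrative, not tuned recommendations):
+
+```python
+from ray.serve.llm import LLMConfig
+
+llm_config = LLMConfig(
+    model_loading_config=dict(
+        model_id="my-deepseek-r1",
+        model_source="deepseek-ai/DeepSeek-R1",
+    ),
+    accelerator_type="H100",
+    deployment_config=dict(
+        autoscaling_config=dict(
+            min_replicas=1,
+            max_replicas=4,  # Illustrative: allow scaling out under load.
+        )
+    ),
+    engine_kwargs=dict(
+        max_model_len=32768,  # Illustrative: a longer context window, passed to vLLM.
+        tensor_parallel_size=8,
+        pipeline_parallel_size=2,
+    ),
+)
+```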
+
+## Query the service
+
+The `anyscale service deploy` command outputs a line that looks like:
+```text
+curl -H "Authorization: Bearer "
+```
+
+From the output, you can extract the service token and base URL. Open [query.py](https://github.com/anyscale/examples/blob/main/deploy_deepseek_r1/query.py) and fill them into the appropriate fields.
+```python
+token = ""     # The service token.
+base_url = ""  # The service base URL.
+```
+
+Query the model:
+```bash
+pip install openai
+python query.py
+```
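+
+If you'd rather receive a single, non-streaming response, here is a minimal sketch (`"FAKE_KEY"` and the localhost URL apply to local deployments; otherwise substitute your service token and base URL):
+
+```python
+from urllib.parse import urljoin
+from openai import OpenAI
+
+token = "FAKE_KEY"                  # Or the token printed by "anyscale service deploy".
+base_url = "http://localhost:8000"  # Or the base URL of your deployed service.
+
+client = OpenAI(base_url=urljoin(base_url, "v1"), api_key=token)
+response = client.chat.completions.create(
+    model="my-deepseek-r1",
+    messages=[{"role": "user", "content": "What's the capital of France?"}],
+)
+print(response.choices[0].message.content)
+```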
The files in the directory will be +# automatically uploaded to the job environment in Anyscale. +working_dir: . + +# When empty, this uses the default Anyscale Cloud in your organization. +cloud: + +# Specify the Ray Serve app to deploy. +applications: +- import_path: serve_deepseek_r1:app \ No newline at end of file