From d433dd5fc84500c18499e57c2c04eeaf7746e2ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Mon, 9 Mar 2026 18:17:46 -0700 Subject: [PATCH 1/4] feat(04_dependencies): add mixed_worker example and GPU vs CPU packaging docs Add mixed_worker.py demonstrating numpy used by both GPU and CPU endpoints -- the key scenario where the dependency blacklist fix matters. Update README with GPU vs CPU packaging section explaining base image differences, build exclusions, and the runtime safety net. --- 01_getting_started/04_dependencies/README.md | 59 +++++++++++-- .../04_dependencies/mixed_worker.py | 82 +++++++++++++++++++ 2 files changed, 136 insertions(+), 5 deletions(-) create mode 100644 01_getting_started/04_dependencies/mixed_worker.py diff --git a/01_getting_started/04_dependencies/README.md b/01_getting_started/04_dependencies/README.md index 6f7a6fd..5f7d080 100644 --- a/01_getting_started/04_dependencies/README.md +++ b/01_getting_started/04_dependencies/README.md @@ -6,6 +6,8 @@ Learn how to manage Python packages and system dependencies in Flash workers. - **Python dependencies** - Installing packages with version constraints - **System dependencies** - Installing apt packages (ffmpeg, libgl1, etc.) +- **GPU vs CPU packaging** - How dependencies are resolved differently per runtime +- **Shared dependencies** - GPU and CPU endpoints using the same package (numpy) - **Version constraints** - Supported syntax for version pinning - **Dependency optimization** - Minimizing cold start time @@ -13,6 +15,14 @@ Learn how to manage Python packages and system dependencies in Flash workers. **Prerequisites**: Complete the [repository setup](../../README.md#quick-start) first (clone, `make dev`, set API key). +### Files + +| File | What it demonstrates | +|------|---------------------| +| `gpu_worker.py` | Python deps with version pins, system deps (ffmpeg, libgl1) | +| `cpu_worker.py` | Data science deps on CPU (numpy, pandas, scipy), zero-dep worker | +| `mixed_worker.py` | Same dependency (numpy) on both GPU and CPU endpoints | + ### Run This Example ```bash @@ -38,6 +48,26 @@ uv run flash login uv run flash run ``` +## GPU vs CPU Packaging + +GPU and CPU endpoints use different base Docker images, which affects how dependencies are resolved: + +| | GPU images (`runpod/pytorch:*`) | CPU images (`python:X.Y-slim`) | +|---|---|---| +| **Base image** | PyTorch + CUDA + numpy + triton | Python stdlib only | +| **Pre-installed** | torch, torchvision, torchaudio, numpy, triton | Nothing | +| **Build artifact** | Excludes torch ecosystem (too large for 500 MB tarball) | Includes everything declared in `dependencies` | + +**What this means for you:** + +- **GPU endpoints**: `torch`, `torchvision`, `torchaudio`, and `triton` are excluded from the build artifact because they already exist in the base image and would exceed the 500 MB tarball limit. All other dependencies (including `numpy`) are packaged normally. +- **CPU endpoints**: Every dependency must be in the build artifact. Nothing is pre-installed. +- **Mixed projects**: When GPU and CPU endpoints share a dependency like `numpy`, it ships in the tarball. The GPU image ignores the duplicate (its pre-installed copy takes precedence). + +See `mixed_worker.py` for a working example of GPU and CPU endpoints sharing `numpy`. + +**Safety net**: If a dependency is missing from the build artifact at runtime, the worker attempts to install it on-the-fly and logs a warning. This prevents crashes but adds to cold start time. Always declare your dependencies explicitly to avoid this penalty. + ## Dependency Types ### 1. Python Dependencies @@ -47,7 +77,7 @@ Specified in the `Endpoint` decorator: ```python @Endpoint( name="my-worker", - gpu=GpuGroup.ADA_24, + gpu=GpuType.NVIDIA_GEFORCE_RTX_4090, dependencies=[ "requests==2.32.3", # Exact version "Pillow>=10.0.0", # Minimum version @@ -70,7 +100,7 @@ Install apt packages: ```python @Endpoint( name="my-worker", - gpu=GpuGroup.AMPERE_16, + gpu=GpuType.NVIDIA_GEFORCE_RTX_4090, dependencies=["opencv-python"], system_dependencies=["ffmpeg", "libgl1", "graphviz"], ) @@ -251,7 +281,7 @@ python cpu_worker.py ```python @Endpoint( name="worker", - gpu=GpuGroup.ADA_24, + gpu=GpuType.NVIDIA_GEFORCE_RTX_4090, dependencies=[ "requests==2.32.3", # API calls "Pillow>=10.0.0", # Image processing @@ -335,17 +365,36 @@ numpy **Note:** Worker dependencies in the `Endpoint` decorator are deployed automatically. `requirements.txt` is for local development only. +## Build Exclusions + +Flash automatically excludes packages that are too large for the 500 MB build artifact limit. Currently excluded: `torch`, `torchvision`, `torchaudio`, `triton` (all CUDA-specific, pre-installed in GPU images). + +You can exclude additional large packages with `--exclude`: + +```bash +# Exclude tensorflow from the build artifact +flash build --exclude tensorflow +``` + +**Important:** Only exclude packages that are pre-installed in your target runtime. If you exclude a package that a CPU endpoint needs, the worker will attempt to install it on-the-fly at startup. This works but adds to cold start time and logs a warning: + +``` +WARNING - Package 'scipy' is not in the build artifact. Installing on-the-fly. +This adds to cold start time -- consider adding it to your dependencies list +to include it in the build artifact. +``` + ## Advanced: External Docker Images For complex dependencies, deploy a pre-built image: ```python -from runpod_flash import Endpoint, GpuGroup +from runpod_flash import Endpoint, GpuType vllm = Endpoint( name="vllm-service", image="vllm/vllm-openai:latest", - gpu=GpuGroup.ADA_24, + gpu=GpuType.NVIDIA_GEFORCE_RTX_4090, ) # call it as an API client diff --git a/01_getting_started/04_dependencies/mixed_worker.py b/01_getting_started/04_dependencies/mixed_worker.py new file mode 100644 index 0000000..1a91d0b --- /dev/null +++ b/01_getting_started/04_dependencies/mixed_worker.py @@ -0,0 +1,82 @@ +# GPU and CPU workers sharing a common dependency (numpy). +# Demonstrates that dependencies work correctly across both runtime environments: +# - GPU images (runpod/pytorch:*) have numpy pre-installed +# - CPU images (python-slim) install numpy from the build artifact +# +# run with: flash run +# test directly: python mixed_worker.py +from runpod_flash import CpuInstanceType, Endpoint, GpuType + + +@Endpoint( + name="01_04_deps_gpu_numpy", + gpu=GpuType.NVIDIA_GEFORCE_RTX_4090, + workers=(0, 3), + dependencies=["numpy"], +) +async def gpu_matrix_multiply(input_data: dict) -> dict: + """GPU worker using numpy for matrix operations. + + On GPU images, numpy is pre-installed in the base image. The build + artifact also includes it, so both paths work. The GPU image's + pre-installed copy takes precedence via Python's import resolution. + """ + import numpy as np + + size = input_data.get("size", 100) + a = np.random.rand(size, size) + b = np.random.rand(size, size) + result = np.dot(a, b) + + return { + "status": "success", + "worker_type": "GPU", + "matrix_size": size, + "result_shape": list(result.shape), + "result_trace": float(np.trace(result)), + "numpy_version": np.__version__, + } + + +@Endpoint( + name="01_04_deps_cpu_numpy", + cpu=CpuInstanceType.CPU3C_1_2, + workers=(0, 3), + dependencies=["numpy"], +) +async def cpu_statistics(input_data: dict) -> dict: + """CPU worker using numpy for statistical computations. + + On CPU images (python-slim), numpy is NOT pre-installed. The build + artifact must include it. Flash's build pipeline ships numpy in the + tarball for CPU endpoints. + """ + import numpy as np + + values = input_data.get("values", [1.0, 2.0, 3.0, 4.0, 5.0]) + arr = np.array(values) + + return { + "status": "success", + "worker_type": "CPU", + "count": len(values), + "mean": float(np.mean(arr)), + "std": float(np.std(arr)), + "median": float(np.median(arr)), + "numpy_version": np.__version__, + } + + +if __name__ == "__main__": + import asyncio + + async def test(): + print("\n=== Testing GPU numpy (matrix multiply) ===") + gpu_result = await gpu_matrix_multiply({"size": 50}) + print(f"Result: {gpu_result}\n") + + print("=== Testing CPU numpy (statistics) ===") + cpu_result = await cpu_statistics({"values": [10, 20, 30, 40, 50]}) + print(f"Result: {cpu_result}\n") + + asyncio.run(test()) From bd62c1738affdf1be2119bc7268d1bf63fee3423 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Mon, 9 Mar 2026 20:57:10 -0700 Subject: [PATCH 2/4] fix(review): address PR feedback for #42 - Clarify GPU worker docstring: numpy computations are CPU-bound despite GPU instance - Add input validation for size (clamped 1-10000) and values (list, max 100k elements) - Add note about GpuGroup vs GpuType enum inconsistency in README --- 01_getting_started/04_dependencies/README.md | 2 ++ .../04_dependencies/mixed_worker.py | 19 +++++++++++++------ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/01_getting_started/04_dependencies/README.md b/01_getting_started/04_dependencies/README.md index 5f7d080..cf9a4a8 100644 --- a/01_getting_started/04_dependencies/README.md +++ b/01_getting_started/04_dependencies/README.md @@ -23,6 +23,8 @@ Learn how to manage Python packages and system dependencies in Flash workers. | `cpu_worker.py` | Data science deps on CPU (numpy, pandas, scipy), zero-dep worker | | `mixed_worker.py` | Same dependency (numpy) on both GPU and CPU endpoints | +> **Note:** `gpu_worker.py` uses `GpuGroup` while newer snippets in this README use `GpuType`. Both enums are supported by the SDK; `GpuType` is recommended for new code. + ### Run This Example ```bash diff --git a/01_getting_started/04_dependencies/mixed_worker.py b/01_getting_started/04_dependencies/mixed_worker.py index 1a91d0b..4b15892 100644 --- a/01_getting_started/04_dependencies/mixed_worker.py +++ b/01_getting_started/04_dependencies/mixed_worker.py @@ -15,15 +15,16 @@ dependencies=["numpy"], ) async def gpu_matrix_multiply(input_data: dict) -> dict: - """GPU worker using numpy for matrix operations. + """GPU-instance worker running CPU-bound numpy matrix operations. - On GPU images, numpy is pre-installed in the base image. The build - artifact also includes it, so both paths work. The GPU image's - pre-installed copy takes precedence via Python's import resolution. + This endpoint runs on a GPU instance type, but uses standard numpy, + so all computations execute on the CPU. On GPU images, numpy is + pre-installed in the base image; the build artifact also includes + it, so both paths work, with the image's copy taking precedence. """ import numpy as np - size = input_data.get("size", 100) + size = min(max(int(input_data.get("size", 100)), 1), 10_000) a = np.random.rand(size, size) b = np.random.rand(size, size) result = np.dot(a, b) @@ -53,7 +54,13 @@ async def cpu_statistics(input_data: dict) -> dict: """ import numpy as np - values = input_data.get("values", [1.0, 2.0, 3.0, 4.0, 5.0]) + raw_values = input_data.get("values", [1.0, 2.0, 3.0, 4.0, 5.0]) + if not isinstance(raw_values, list) or len(raw_values) > 100_000: + return { + "status": "error", + "message": "values must be a list with at most 100000 elements", + } + values = raw_values arr = np.array(values) return { From 967f8bfeddc7950c3c0fc773d03eeda7f2039184 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Mon, 9 Mar 2026 21:15:54 -0700 Subject: [PATCH 3/4] fix(05_load_balancer): use typed param for gpu_lb compute endpoint Replace untyped `request: dict` with `numbers: list[float]` to fix division by zero on empty input and provide proper Swagger examples. Module-level Pydantic models cannot be used in LB endpoints because function bodies are serialized to remote workers. --- 03_advanced_workers/05_load_balancer/gpu_lb.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/03_advanced_workers/05_load_balancer/gpu_lb.py b/03_advanced_workers/05_load_balancer/gpu_lb.py index 574b029..30e107f 100644 --- a/03_advanced_workers/05_load_balancer/gpu_lb.py +++ b/03_advanced_workers/05_load_balancer/gpu_lb.py @@ -17,11 +17,11 @@ async def gpu_health() -> dict: @api.post("/compute") -async def compute_intensive(request: dict) -> dict: +async def compute_intensive(numbers: list[float]) -> dict: """Perform compute-intensive operation on GPU. Args: - request: Request dict with numbers to process + numbers: List of numbers to compute statistics on Returns: Computation results @@ -29,7 +29,11 @@ async def compute_intensive(request: dict) -> dict: import time from datetime import datetime, timezone - numbers = request.get("numbers", []) + if not numbers: + return { + "status": "error", + "message": "numbers list must not be empty", + } start_time = time.time() result = sum(x**2 for x in numbers) @@ -82,8 +86,7 @@ async def test(): print(f" {result}\n") print("2. Compute intensive:") - request_data = {"numbers": [1, 2, 3, 4, 5]} - result = await compute_intensive(request_data) + result = await compute_intensive([1, 2, 3, 4, 5]) print(f" Sum of squares: {result['sum_of_squares']}") print(f" Mean: {result['mean']}\n") From 67c51943f292685be7b16d744f2cdd92cfa626d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dean=20Qui=C3=B1anola?= Date: Tue, 10 Mar 2026 09:19:56 -0700 Subject: [PATCH 4/4] fix(05_load_balancer): fix gpu_lb endpoint typing and torch import - Use typed param (numbers: list[float]) instead of untyped dict for proper Swagger examples and input validation - Remove broad except Exception that swallowed ImportError as misleading device info; torch should be available on GPU worker images - Add dependencies=["torch"] for environments where torch needs explicit installation --- .../05_load_balancer/gpu_lb.py | 26 ++++++++----------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/03_advanced_workers/05_load_balancer/gpu_lb.py b/03_advanced_workers/05_load_balancer/gpu_lb.py index 30e107f..2637bef 100644 --- a/03_advanced_workers/05_load_balancer/gpu_lb.py +++ b/03_advanced_workers/05_load_balancer/gpu_lb.py @@ -7,6 +7,7 @@ name="03_05_load_balancer_gpu", gpu=GpuType.NVIDIA_GEFORCE_RTX_4090, workers=(1, 3), + dependencies=["torch"], ) @@ -58,21 +59,16 @@ async def compute_intensive(numbers: list[float]) -> dict: @api.get("/info") async def gpu_info() -> dict: """Get GPU availability information.""" - try: - import torch - - if torch.cuda.is_available(): - info = { - "available": True, - "device": torch.cuda.get_device_name(0), - "count": torch.cuda.device_count(), - } - else: - info = {"available": False, "device": "No GPU", "count": 0} - except Exception as e: - info = {"available": False, "device": str(e), "count": 0} - - return info + import torch + + if torch.cuda.is_available(): + return { + "available": True, + "device": torch.cuda.get_device_name(0), + "count": torch.cuda.device_count(), + } + + return {"available": False, "device": "No GPU", "count": 0} if __name__ == "__main__":