Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions 01_getting_started/01_hello_world/gpu_worker.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# gpu serverless worker -- detects available GPU hardware.
# run with: flash run
# test directly: python gpu_worker.py
# GPU serverless worker -- detects available GPU hardware.
# Run with: flash run
# Test directly: python gpu_worker.py
from runpod_flash import Endpoint, GpuGroup


Expand All @@ -11,7 +11,7 @@
idle_timeout=5,
)
async def gpu_hello(input_data: dict) -> dict:
"""GPU worker that returns GPU hardware info."""
"""Simple GPU worker that returns GPU hardware info."""
import platform
from datetime import datetime

Expand Down
5 changes: 5 additions & 0 deletions 02_ml_inference/02_text_generation/.env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# FLASH_HOST=localhost
# FLASH_PORT=8888
# LOG_LEVEL=INFO
# RUNPOD_API_KEY=your_api_key_here
# VLLM_MODEL=Qwen/Qwen2.5-3B-Instruct
40 changes: 40 additions & 0 deletions 02_ml_inference/02_text_generation/.flashignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Flash Build Ignore Patterns

# Python cache
__pycache__/
*.pyc

# Virtual environments
venv/
.venv/
env/

# IDE
.vscode/
.idea/

# Environment files
.env
.env.local

# Git
.git/
.gitignore

# Build artifacts
dist/
build/
*.egg-info/

# Flash resources
.runpod/

# Tests
tests/
test_*.py
*_test.py

# Documentation
docs/
*.md
!README.md
45 changes: 45 additions & 0 deletions 02_ml_inference/02_text_generation/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
env/
venv/
.venv/
ENV/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# IDEs
.vscode/
.idea/
*.swp
*.swo
*~

# Environment
.env
.env.local

# Flash
.runpod/
dist/

# OS
.DS_Store
Thumbs.db
.flash/
45 changes: 45 additions & 0 deletions 02_ml_inference/02_text_generation/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Text Generation with vLLM (Qwen)

Serve a chat-style text generation worker with vLLM on Runpod Flash GPUs.

## What You'll Learn

- Configuring a GPU worker with the `@Endpoint` decorator
- Serving a vLLM-backed model from a decorated worker class
- Caching model initialization to avoid repeated cold loads
- Calling a worker class method from an async helper function

## Quick Start

### Prerequisites

- Python 3.10+
- Runpod API key

### Setup

```bash
cd 02_ml_inference/02_text_generation
pip install -r requirements.txt
cp .env.example .env
# Add RUNPOD_API_KEY to .env
```

### Run

```bash
flash run
```

Server starts at http://localhost:8888
Visit http://localhost:8888/docs for interactive docs.

## Common Issues

- Cold-start latency on the first request is expected when the worker is scaled to zero.
- If you hit GPU memory errors, reduce `max_model_len` or use a larger GPU group.

## References

- https://docs.runpod.io/flash
- https://docs.vllm.ai
75 changes: 75 additions & 0 deletions 02_ml_inference/02_text_generation/gpu_worker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import asyncio
import logging

from runpod_flash import Endpoint, GpuGroup

logger = logging.getLogger(__name__)


@Endpoint(
    name="02_02_text_generation_gpu",
    gpu=GpuGroup.ADA_24,
    workers=(0, 3),
    idle_timeout=30,
    dependencies=["vllm"],
)
class MinimalVLLM:
    """Singleton vLLM worker that rewrites text in a pirate voice.

    A single vLLM engine instance is kept alive per worker process so that
    repeated requests reuse the loaded model instead of paying the cold-load
    cost each time.
    """

    _instance = None

    def __new__(cls):
        # Classic per-process singleton: reuse the one cached instance.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self) -> None:
        # BUG FIX: Python calls __init__ on the instance returned by __new__
        # on *every* MinimalVLLM() call, so without this guard the vLLM
        # engine would be reloaded per request — defeating the singleton's
        # purpose. Initialize exactly once per process.
        if getattr(self, "_initialized", False):
            return

        self.MODEL = "Qwen/Qwen2.5-3B-Instruct"
        self.SYSTEM_PROMPT = (
            "You are a pirate-style rewriter. Rewrite user text in fun pirate voice. "
            "Keep original meaning. Keep it concise. Output only the rewritten text."
        )

        # Heavy third-party imports are deferred to init time so the module
        # can be imported (e.g. for deployment scanning) without a GPU stack.
        import os
        from transformers import AutoTokenizer
        from vllm import LLM, SamplingParams

        os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

        self.llm = LLM(
            model=self.MODEL,  # Hugging Face model id loaded by vLLM.
            enforce_eager=True,  # Skip CUDA graph capture; faster cold start, usually lower throughput.
            gpu_memory_utilization=0.6,  # Reserve ~60% of GPU memory for model/kv cache.
            max_model_len=1024,  # Max context window handled per request.
        )
        self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL)
        self.sampling = SamplingParams(
            temperature=0.6,  # Moderate creativity while preserving meaning.
            top_p=0.9,  # Nucleus sampling to avoid low-probability tails.
            max_tokens=500,  # Cap response length.
        )
        self._initialized = True
        # Use the module-level logger instead of print for consistency.
        logger.info("vLLM initialized successfully")

    def piratize(self, text: str) -> str:
        """Rewrite *text* in pirate voice and return only the rewritten text."""
        messages = [
            {"role": "system", "content": self.SYSTEM_PROMPT},
            {"role": "user", "content": text},
        ]
        prompt = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,  # Return plain prompt text for vLLM.
            add_generation_prompt=True,  # Append assistant prefix so model continues as assistant.
        )
        out = self.llm.generate([prompt], self.sampling)
        return out[0].outputs[0].text.strip()

    async def generate_single(self, prompt: str) -> str:
        """Async entry point used by callers; delegates to piratize()."""
        return self.piratize(prompt)


async def piratize_text(text: str) -> str:
    """Rewrite *text* in pirate voice using the shared MinimalVLLM worker."""
    worker = MinimalVLLM()
    result = await worker.generate_single(text)
    return result


if __name__ == "__main__":
    # Quick local smoke test: piratize a sample sentence and print the result.
    demo_text = "Want to go get something to eat? man, I sure do enjoy sailing."
    rewritten = asyncio.run(piratize_text(demo_text))
    print(rewritten)
11 changes: 11 additions & 0 deletions 02_ml_inference/02_text_generation/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
[project]
name = "02_text_generation"
version = "0.1.0"
description = "Text generation with vLLM on Runpod Flash"
requires-python = ">=3.10"
dependencies = [
"runpod-flash",
"fastapi>=0.104.0",
"uvicorn>=0.24.0",
"pydantic>=2.0.0",
]
1 change: 1 addition & 0 deletions 02_ml_inference/02_text_generation/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
runpod-flash
Loading