Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions 01_getting_started/01_hello_world/gpu_worker.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# gpu serverless worker -- detects available GPU hardware.
# run with: flash run
# test directly: python gpu_worker.py
# GPU serverless worker -- detects available GPU hardware.
# Run with: flash run
# Test directly: python gpu_worker.py
from runpod_flash import Endpoint, GpuGroup


Expand All @@ -11,7 +11,7 @@
idle_timeout=5,
)
async def gpu_hello(input_data: dict) -> dict:
"""GPU worker that returns GPU hardware info."""
"""Simple GPU worker that returns GPU hardware info."""
import platform
from datetime import datetime

Expand Down
5 changes: 5 additions & 0 deletions 02_ml_inference/02_text_generation/.env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# FLASH_HOST=localhost
# FLASH_PORT=8888
# LOG_LEVEL=INFO
# RUNPOD_API_KEY=your_api_key_here
# VLLM_MODEL=Qwen/Qwen2.5-3B-Instruct
40 changes: 40 additions & 0 deletions 02_ml_inference/02_text_generation/.flashignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Flash Build Ignore Patterns

# Python cache
__pycache__/
*.pyc

# Virtual environments
venv/
.venv/
env/

# IDE
.vscode/
.idea/

# Environment files
.env
.env.local

# Git
.git/
.gitignore

# Build artifacts
dist/
build/
*.egg-info/

# Flash resources
.runpod/

# Tests
tests/
test_*.py
*_test.py

# Documentation
docs/
*.md
!README.md
45 changes: 45 additions & 0 deletions 02_ml_inference/02_text_generation/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
env/
venv/
.venv/
ENV/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# IDEs
.vscode/
.idea/
*.swp
*.swo
*~

# Environment
.env
.env.local

# Flash
.runpod/
dist/

# OS
.DS_Store
Thumbs.db
.flash/
45 changes: 45 additions & 0 deletions 02_ml_inference/02_text_generation/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Text Generation with vLLM (Qwen)

Serve a chat-style text generation worker with vLLM on Runpod Flash GPUs.

## What You'll Learn

- Configuring a GPU worker with the `@Endpoint` decorator
- Serving a vLLM-backed model from a decorated worker class
- Caching model initialization to avoid repeated cold loads
- Calling a worker class method from an async helper function

## Quick Start

### Prerequisites

- Python 3.10+
- Runpod API key

### Setup

```bash
cd 02_ml_inference/02_text_generation
pip install -r requirements.txt
cp .env.example .env
# Add RUNPOD_API_KEY to .env
```

### Run

```bash
flash run
```

Server starts at http://localhost:8888
Visit http://localhost:8888/docs for interactive docs.

## Common Issues

- Cold-start latency on the first request is expected when the worker is scaled to zero.
- If you hit GPU memory errors, reduce `max_model_len` or use a larger GPU group.

## References

- https://docs.runpod.io/flash
- https://docs.vllm.ai
75 changes: 75 additions & 0 deletions 02_ml_inference/02_text_generation/gpu_worker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import asyncio
import logging

from runpod_flash import Endpoint, GpuGroup

logger = logging.getLogger(__name__)


@Endpoint(
    name="02_02_text_generation_gpu",
    gpu=GpuGroup.ADA_24,
    workers=(0, 3),
    idle_timeout=30,
    dependencies=["vllm"],
)
class MinimalVLLM:
    """Singleton vLLM worker that rewrites text in a pirate voice.

    A single vLLM engine instance is kept alive per worker process so that
    repeated requests reuse the loaded model instead of paying the cold-load
    cost each time.
    """

    _instance = None

    def __new__(cls):
        # Classic per-process singleton: reuse the one cached instance.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self) -> None:
        # BUG FIX: Python calls __init__ on the instance returned by __new__
        # on *every* MinimalVLLM() call, so without this guard the vLLM
        # engine would be reloaded per request — defeating the singleton's
        # purpose. Initialize exactly once per process.
        if getattr(self, "_initialized", False):
            return

        self.MODEL = "Qwen/Qwen2.5-3B-Instruct"
        self.SYSTEM_PROMPT = (
            "You are a pirate-style rewriter. Rewrite user text in fun pirate voice. "
            "Keep original meaning. Keep it concise. Output only the rewritten text."
        )

        # Heavy third-party imports are deferred to init time so the module
        # can be imported (e.g. for deployment scanning) without a GPU stack.
        import os
        from transformers import AutoTokenizer
        from vllm import LLM, SamplingParams

        os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

        self.llm = LLM(
            model=self.MODEL,  # Hugging Face model id loaded by vLLM.
            enforce_eager=True,  # Skip CUDA graph capture; faster cold start, usually lower throughput.
            gpu_memory_utilization=0.6,  # Reserve ~60% of GPU memory for model/kv cache.
            max_model_len=1024,  # Max context window handled per request.
        )
        self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL)
        self.sampling = SamplingParams(
            temperature=0.6,  # Moderate creativity while preserving meaning.
            top_p=0.9,  # Nucleus sampling to avoid low-probability tails.
            max_tokens=500,  # Cap response length.
        )
        self._initialized = True
        # Use the module-level logger instead of print for consistency.
        logger.info("vLLM initialized successfully")

    def piratize(self, text: str) -> str:
        """Rewrite *text* in pirate voice and return only the rewritten text."""
        messages = [
            {"role": "system", "content": self.SYSTEM_PROMPT},
            {"role": "user", "content": text},
        ]
        prompt = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,  # Return plain prompt text for vLLM.
            add_generation_prompt=True,  # Append assistant prefix so model continues as assistant.
        )
        out = self.llm.generate([prompt], self.sampling)
        return out[0].outputs[0].text.strip()

    async def generate_single(self, prompt: str) -> str:
        """Async entry point used by callers; delegates to piratize()."""
        return self.piratize(prompt)


async def piratize_text(text: str) -> str:
    """Rewrite *text* in pirate voice using the shared MinimalVLLM worker."""
    worker = MinimalVLLM()
    result = await worker.generate_single(text)
    return result


if __name__ == "__main__":
    # Quick local smoke test: piratize a sample sentence and print the result.
    demo_text = "Want to go get something to eat? man, I sure do enjoy sailing."
    rewritten = asyncio.run(piratize_text(demo_text))
    print(rewritten)
11 changes: 11 additions & 0 deletions 02_ml_inference/02_text_generation/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
[project]
name = "02_text_generation"
version = "0.1.0"
description = "Text generation with vLLM on Runpod Flash"
requires-python = ">=3.10"
dependencies = [
"runpod-flash",
"fastapi>=0.104.0",
"uvicorn>=0.24.0",
"pydantic>=2.0.0",
]
1 change: 1 addition & 0 deletions 02_ml_inference/02_text_generation/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
runpod-flash
Loading