diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..11ad8d4 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,68 @@ +name: CI + +on: + push: + branches: [master, main] + pull_request: + +jobs: + shaders: + name: Compile SPIR-V shaders + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Install glslang + run: | + sudo apt-get update + sudo apt-get install -y glslang-tools + + # Compile every .comp to a scratch dir and fail if any shader does not + # compile. This guards the regression that the default `glslangValidator -V` + # (SPIR-V 1.0) could not build the subgroup-using quantized kernels, which + # need --target-env vulkan1.1 (SPIR-V 1.3). + - name: Compile all shaders + working-directory: csrc/shaders + run: | + set -euo pipefail + out="$(mktemp -d)" + fail=0 + for comp in *.comp; do + echo "Compiling $comp" + if ! glslangValidator --target-env vulkan1.1 -V "$comp" -o "$out/${comp%.comp}.spv"; then + echo "::error file=csrc/shaders/$comp::shader failed to compile" + fail=1 + fi + done + [ "$fail" -eq 0 ] || { echo "One or more shaders failed to compile"; exit 1; } + echo "Compiled $(ls "$out"/*.spv | wc -l) shaders." + + python-lint: + name: Python lint + syntax + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Install ruff + run: pip install ruff + + # py_compile catches syntax errors without needing the compiled _C + # extension (which requires a Vulkan GPU + Kompute + custom PyTorch and + # cannot run on a stock CI runner). + - name: Syntax check + run: | + python -m py_compile setup.py persistent_pipeline.py \ + torch_vulkan/__init__.py tests/*.py + + - name: Ruff + run: ruff check . + +# NOTE: The C++ extension build (CMake + Torch + Kompute) and the runtime test +# suite require a Vulkan-capable GPU and are not run here -- GitHub-hosted +# runners have no GPU. Build/test verification is done locally on the target +# hardware (AMD Radeon 890M / RADV). This workflow verifies what can be checked +# without a GPU: shader compilation and Python static checks. diff --git a/README.md b/README.md index b7b72a2..1afc88c 100644 --- a/README.md +++ b/README.md @@ -78,7 +78,16 @@ attention + KV cache, kv_store. ```bash # Requires a Vulkan 1.2+ driver (RADV recommended) and glslangValidator for shader build. -cd build && cmake .. && make -j$(nproc) + +# 1. (Re)compile the GLSL shaders to SPIR-V (pre-built .spv files are committed, +# so this is only needed if you edit a shader). Targets Vulkan 1.1 / SPIR-V 1.3 +# because the quantized kernels use subgroup ops. +bash csrc/shaders/compile.sh + +# 2. Build the C++ extension. +cmake -S . -B build && cmake --build build -j + +# 3. Install the Python package (build_ext also invokes CMake). pip install -e . python -c "import torch, torch_vulkan; print(torch_vulkan.device_name())" diff --git a/csrc/shaders/compile.sh b/csrc/shaders/compile.sh index e1dca44..c5c73dd 100755 --- a/csrc/shaders/compile.sh +++ b/csrc/shaders/compile.sh @@ -1,14 +1,30 @@ #!/bin/bash # Compile all GLSL compute shaders to SPIR-V # Requires: glslangValidator (from Vulkan SDK or `pacman -S glslang`) +# +# We target Vulkan 1.1, which produces SPIR-V 1.3. Several of the quantized +# matmul kernels (matmul_q*k*, matmul_gpuq*) use GL_KHR_shader_subgroup +# reductions, and those subgroup ops require SPIR-V >= 1.3. The default +# `glslangValidator -V` emits SPIR-V 1.0 and fails on them, so the explicit +# --target-env is required to compile the full shader set. +set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" cd "$SCRIPT_DIR" +fail=0 for comp in *.comp; do spv="${comp%.comp}.spv" echo "Compiling $comp -> $spv" - glslangValidator -V "$comp" -o "$spv" + if ! glslangValidator --target-env vulkan1.1 -V "$comp" -o "$spv"; then + echo " ERROR: failed to compile $comp" >&2 + fail=1 + fi done +if [ "$fail" -ne 0 ]; then + echo "One or more shaders failed to compile." >&2 + exit 1 +fi + echo "Done. $(ls *.spv | wc -l) shaders compiled." diff --git a/csrc/torch_vulkan.cpp b/csrc/torch_vulkan.cpp index 12f605f..c1bb559 100644 --- a/csrc/torch_vulkan.cpp +++ b/csrc/torch_vulkan.cpp @@ -62,6 +62,12 @@ C10_REGISTER_GUARD_IMPL(PrivateUse1, VulkanGuardImpl); // Python bindings void set_shader_dir(const std::string& path) { + // Set the shader dir on the Kompute-based VulkanContext (used by all the + // registered ops). The raw VulkanEngine (used only by the not-yet-wired + // mm_raw path) is constructed lazily and reads its directory from the + // TORCH_VULKAN_SHADER_DIR env var, which __init__.py points at this same + // directory -- so we deliberately do NOT touch VulkanEngine::instance() + // here, to avoid eagerly creating a second Vulkan device at import time. VulkanContext::instance().set_shader_dir(path); } diff --git a/csrc/vulkan_engine.cpp b/csrc/vulkan_engine.cpp index 9c5f7af..aa59b5c 100644 --- a/csrc/vulkan_engine.cpp +++ b/csrc/vulkan_engine.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include namespace torch_vulkan { @@ -12,7 +13,16 @@ VulkanEngine& VulkanEngine::instance() { } VulkanEngine::VulkanEngine() { - shaderDir_ = "/home/raz/projects/torch-vulkan/csrc/shaders/"; + // The shader directory is normally set at import time via + // setShaderDir() (driven by torch_vulkan._set_shader_dir in __init__.py). + // As a fallback for direct/standalone use, honour TORCH_VULKAN_SHADER_DIR + // so the path is never hardcoded to a developer's machine. + if (const char* env = std::getenv("TORCH_VULKAN_SHADER_DIR")) { + shaderDir_ = env; + if (!shaderDir_.empty() && shaderDir_.back() != '/') { + shaderDir_ += '/'; + } + } initVulkan(); } diff --git a/persistent_pipeline.py b/persistent_pipeline.py index 9a02a17..a953a67 100644 --- a/persistent_pipeline.py +++ b/persistent_pipeline.py @@ -8,8 +8,6 @@ """ import torch -import time -import numpy as np class PersistentLayerPipeline: diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..42666cb --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,13 @@ +[tool.ruff] +# torch-vulkan is a thin Python shim over a C++/Vulkan extension; keep linting +# focused on real problems (pyflakes + import sorting) rather than style churn. +line-length = 100 +target-version = "py310" + +[tool.ruff.lint] +select = ["E", "F", "I"] +# E402 (module-level import not at top of file) is intentionally allowed: the +# package must import `torch` first, then load the `_C` extension that registers +# the PrivateUse1 backend, and finally alias the module into `torch.vulkan` -- +# all of which happen mid-module by design. +ignore = ["E402"] diff --git a/setup.py b/setup.py index d22f767..781f1ea 100644 --- a/setup.py +++ b/setup.py @@ -8,9 +8,8 @@ import os import subprocess -import sys -from setuptools import setup, Extension +from setuptools import Extension, setup from setuptools.command.build_ext import build_ext diff --git a/tests/bench_layer.py b/tests/bench_layer.py index d541951..56a3852 100644 --- a/tests/bench_layer.py +++ b/tests/bench_layer.py @@ -4,14 +4,19 @@ Focuses on the hot ops: mm, add, gelu that repeat across layers. """ -import sys import os +import sys import time -sys.path.insert(0, "/home/raz/builds/pytorch-gfx1150") +# Allow pointing at a custom-built PyTorch without hardcoding a developer's +# path. Set TORCH_VULKAN_PYTORCH_PATH if needed. +_custom_torch = os.environ.get("TORCH_VULKAN_PYTORCH_PATH") +if _custom_torch: + sys.path.insert(0, _custom_torch) sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) import torch + import torch_vulkan @@ -23,12 +28,12 @@ def bench_mm_repeated(M, K, N, iters=20): b = b_cpu.to("vulkan") # Warmup (first call = cache miss) - c = torch.mm(a, b) + torch.mm(a, b) # Benchmark (subsequent calls should hit cache) t0 = time.perf_counter() for _ in range(iters): - c = torch.mm(a, b) + torch.mm(a, b) elapsed = (time.perf_counter() - t0) / iters print(f" mm [{M}x{K}] @ [{K}x{N}]: {elapsed*1000:.2f} ms/call") return elapsed @@ -41,11 +46,11 @@ def bench_add_repeated(N, iters=20): a = a_cpu.to("vulkan") b = b_cpu.to("vulkan") - c = torch.add(a, b) # warmup + torch.add(a, b) # warmup t0 = time.perf_counter() for _ in range(iters): - c = torch.add(a, b) + torch.add(a, b) elapsed = (time.perf_counter() - t0) / iters print(f" add [{N}]: {elapsed*1000:.2f} ms/call") return elapsed @@ -56,11 +61,11 @@ def bench_gelu_repeated(N, iters=20): a_cpu = torch.randn(N) a = a_cpu.to("vulkan") - c = torch.nn.functional.gelu(a) # warmup + torch.nn.functional.gelu(a) # warmup t0 = time.perf_counter() for _ in range(iters): - c = torch.nn.functional.gelu(a) + torch.nn.functional.gelu(a) elapsed = (time.perf_counter() - t0) / iters print(f" gelu [{N}]: {elapsed*1000:.2f} ms/call") return elapsed diff --git a/tests/test_algo_cache.py b/tests/test_algo_cache.py index 4b307f2..3764a5d 100644 --- a/tests/test_algo_cache.py +++ b/tests/test_algo_cache.py @@ -4,15 +4,19 @@ repeated dispatches with the same tensor buffers hit the cache. """ -import sys import os +import sys import time -# Use the custom-built PyTorch -sys.path.insert(0, "/home/raz/builds/pytorch-gfx1150") +# Allow pointing at a custom-built PyTorch (e.g. a local APU build) without +# hardcoding a developer's path. Set TORCH_VULKAN_PYTORCH_PATH if needed. +_custom_torch = os.environ.get("TORCH_VULKAN_PYTORCH_PATH") +if _custom_torch: + sys.path.insert(0, _custom_torch) sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) import torch + import torch_vulkan diff --git a/tests/test_mm.py b/tests/test_mm.py index 4135998..7638701 100644 --- a/tests/test_mm.py +++ b/tests/test_mm.py @@ -1,6 +1,7 @@ """Smoke test: matrix multiply through the Vulkan backend.""" import torch + import torch_vulkan @@ -35,11 +36,23 @@ def test_mm_small(): print("mm 2x2: PASS") -def test_cpu_fallback(): - # relu isn't implemented in Vulkan yet — should fall back to CPU - a = torch.randn(16, device="vulkan") +def test_relu(): + # relu IS wired to the Vulkan backend (relu.spv); verify both shape and + # values against the CPU reference. + a_cpu = torch.tensor([-2.0, -1.0, 0.0, 1.0, 2.0]) + a = a_cpu.to("vulkan") b = torch.relu(a) assert b.shape == a.shape + torch.testing.assert_close(b.cpu(), torch.relu(a_cpu), rtol=1e-3, atol=1e-3) + print("relu: PASS") + + +def test_unimplemented_op_falls_back_to_cpu(): + # An op with no Vulkan impl (here: sign) should route through the boxed + # CPU fallback registered for PrivateUse1 and still return a valid result. + a = torch.randn(16, device="vulkan") + b = torch.sign(a) + assert b.shape == a.shape print("CPU fallback: PASS") @@ -47,5 +60,6 @@ def test_cpu_fallback(): test_vulkan_available() test_mm_small() test_mm_square() - test_cpu_fallback() + test_relu() + test_unimplemented_op_falls_back_to_cpu() print("\nAll tests passed.") diff --git a/torch_vulkan/__init__.py b/torch_vulkan/__init__.py index 0d9ccee..ea3dc4f 100644 --- a/torch_vulkan/__init__.py +++ b/torch_vulkan/__init__.py @@ -9,17 +9,20 @@ print(torch_vulkan.is_available()) print(torch_vulkan.device_name()) - # Create tensors on Vulkan device - a = torch.randn(64, 64, device="vulkan") - b = torch.randn(64, 64, device="vulkan") - c = torch.mm(a, b) # runs matmul.spv on GPU - - # Or move existing tensors - x = torch.randn(128, 128) - x_vk = x.vulkan() + # Create on CPU, then move to the Vulkan device (the supported path): + a = torch.randn(64, 64).to("vulkan") + b = torch.randn(64, 64).to("vulkan") + c = torch.mm(a, b) # runs matmul_tiled.spv on GPU + c.cpu() # bring the result back + + # torch.empty(..., device="vulkan") and torch.tensor(data, device="vulkan") + # also work. The torch.randn(..., device="vulkan") constructor and the + # .vulkan() method are only partially wired -- see the README Limitations + # section. Prefer .to("vulkan"). """ import os + import torch # Load the C++ extension — this registers PrivateUse1 as "vulkan" @@ -33,13 +36,21 @@ for_tensor=True, for_module=True, for_storage=False ) except Exception as e: - import warnings; warnings.warn(f"generate_methods_for_privateuse1_backend failed, using .vulkan() shim: {e}") + import warnings + warnings.warn( + f"generate_methods_for_privateuse1_backend failed, using .vulkan() shim: {e}" + ) torch.Tensor.vulkan = lambda self, *a, **k: self.to("vulkan") -# Point the shader loader at our bundled .spv files +# Point the shader loader at our bundled .spv files. _shader_dir = os.path.join(os.path.dirname(__file__), "..", "csrc", "shaders") if os.path.isdir(_shader_dir): - _C._set_shader_dir(os.path.abspath(_shader_dir)) + _abs_shader_dir = os.path.abspath(_shader_dir) + _C._set_shader_dir(_abs_shader_dir) + # The raw VulkanEngine (used by the mm_raw path) is constructed lazily and + # reads its shader directory from this env var, so it resolves to the same + # bundled shaders instead of a hardcoded default. + os.environ.setdefault("TORCH_VULKAN_SHADER_DIR", _abs_shader_dir) def is_available() -> bool: @@ -81,4 +92,5 @@ def clear_algorithm_cache(): # Register as torch.vulkan so .to("vulkan") works # PyTorch's PrivateUse1 dispatch does "import torch." import sys + sys.modules['torch.vulkan'] = sys.modules[__name__]