diff --git a/qdp/qdp-core/build.rs b/qdp/qdp-core/build.rs
index 311ea139dd..b98f008f76 100644
--- a/qdp/qdp-core/build.rs
+++ b/qdp/qdp-core/build.rs
@@ -14,10 +14,18 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+use std::env;
+use std::process::Command;
+
 fn main() {
+    compile_protos();
+    configure_cuda_linkage();
+}
+
+fn compile_protos() {
     // Use vendored protoc to avoid missing protoc in CI/dev environments
     unsafe {
-        std::env::set_var("PROTOC", protoc_bin_vendored::protoc_bin_path().unwrap());
+        env::set_var("PROTOC", protoc_bin_vendored::protoc_bin_path().unwrap());
     }
 
     let mut config = prost_build::Config::new();
@@ -34,3 +42,54 @@ fn main() {
 
     println!("cargo:rerun-if-changed=proto/tensor.proto");
 }
+
+/// Detect the CUDA Runtime toolkit and emit the appropriate link directives.
+///
+/// `qdp-core` declares CUDA Runtime API extern symbols in `src/gpu/cuda_ffi.rs`
+/// (cudaHostAlloc, cudaMemGetInfo, cudaEventCreateWithFlags, ...). Those symbols
+/// must be resolved at link time, which requires `libcudart` from the CUDA
+/// Toolkit. Previously the only `-lcudart` directive lived in `qdp-kernels`'
+/// build script, and the `qdp_no_cuda` cfg it sets does not propagate
+/// cross-crate. The result was confusing linker errors on systems that have
+/// the NVIDIA driver but not the toolkit (e.g. PyTorch-only setups, where
+/// PyTorch ships its own bundled cudart inside the wheel).
+///
+/// This function:
+///   * emits `cargo:rustc-link-lib=cudart` and the appropriate
+///     `cargo:rustc-link-search` path when nvcc is found, and
+///   * emits `cargo:rustc-cfg=qdp_no_cuda` when it is not, gating the
+///     extern block in `cuda_ffi.rs` so the crate still links (with stubs
+///     that return a non-zero CUDA error at runtime).
+fn configure_cuda_linkage() {
+    println!("cargo::rustc-check-cfg=cfg(qdp_no_cuda)");
+    println!("cargo:rerun-if-env-changed=QDP_NO_CUDA");
+    println!("cargo:rerun-if-env-changed=CUDA_PATH");
+
+    let force_no_cuda = env::var("QDP_NO_CUDA")
+        .map(|v| v == "1" || v.eq_ignore_ascii_case("true") || v.eq_ignore_ascii_case("yes"))
+        .unwrap_or(false);
+
+    let has_cuda = !force_no_cuda
+        && Command::new("nvcc")
+            .arg("--version")
+            .output()
+            .is_ok();
+
+    if !has_cuda {
+        println!("cargo:rustc-cfg=qdp_no_cuda");
+        println!(
+            "cargo:warning=qdp-core: CUDA toolkit not found (nvcc not in PATH). \
+             Building with stub CUDA Runtime symbols; GPU functionality will be \
+             unavailable at runtime. Install the CUDA Toolkit to enable GPU support."
+        );
+        return;
+    }
+
+    let cuda_path = env::var("CUDA_PATH").unwrap_or_else(|_| "/usr/local/cuda".to_string());
+    println!("cargo:rustc-link-search=native={}/lib64", cuda_path);
+    println!("cargo:rustc-link-lib=cudart");
+
+    // On macOS, also check /usr/local/cuda/lib
+    #[cfg(target_os = "macos")]
+    println!("cargo:rustc-link-search=native={}/lib", cuda_path);
+}
diff --git a/qdp/qdp-core/src/gpu/cuda_ffi.rs b/qdp/qdp-core/src/gpu/cuda_ffi.rs
index 2ed60c311e..2a403c91ca 100644
--- a/qdp/qdp-core/src/gpu/cuda_ffi.rs
+++ b/qdp/qdp-core/src/gpu/cuda_ffi.rs
@@ -44,6 +44,19 @@ pub(crate) const CUDA_SUCCESS: i32 = 0;
 #[allow(dead_code)]
 pub(crate) const CUDA_ERROR_NOT_READY: i32 = 34;
 
+// CUDA Runtime API bindings.
+//
+// On systems with the CUDA Toolkit installed, `qdp-core`'s build script emits
+// `cargo:rustc-link-lib=cudart` and the extern block below is used unchanged.
+//
+// On systems without the toolkit (e.g. driver-only Linux hosts, or macOS),
+// the build script sets `qdp_no_cuda` and the stub block is used instead.
+// Stubs match each extern's signature and return a non-zero sentinel
+// (`QDP_CUDA_UNAVAILABLE`, mirroring qdp-kernels' "999" convention) so the
+// crate links cleanly but any actual GPU call surfaces as a runtime error
+// through the existing `if ret != 0 { Err(...) }` paths in the callers.
+
+#[cfg(not(qdp_no_cuda))]
 unsafe extern "C" {
     pub(crate) fn cudaHostAlloc(pHost: *mut *mut c_void, size: usize, flags: u32) -> i32;
     pub(crate) fn cudaFreeHost(ptr: *mut c_void) -> i32;
@@ -99,3 +112,111 @@ unsafe extern "C" {
     /// Reference: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html
     pub(crate) fn cudaEventElapsedTime(ms: *mut f32, start: *mut c_void, end: *mut c_void) -> i32;
 }
+
+// ---------------------------------------------------------------------------
+// Stub implementations when building without the CUDA toolkit (`qdp_no_cuda`).
+//
+// Wrapped in a private module so a single `#[allow(non_snake_case)]` covers
+// all stub names (the originals are camelCase to match the real CUDA Runtime
+// API; the `extern "C"` block above is exempt from this lint, but plain Rust
+// fns are not).
+// ---------------------------------------------------------------------------
+
+#[cfg(qdp_no_cuda)]
+#[allow(non_snake_case)]
+mod no_cuda_stubs {
+    use super::CudaPointerAttributes;
+    use std::ffi::c_void;
+
+    /// Sentinel error code returned by stub CUDA Runtime calls.
+    ///
+    /// Matches the "999" convention used by qdp-kernels' kernel-launcher stubs.
+    const QDP_CUDA_UNAVAILABLE: i32 = 999;
+
+    pub(crate) unsafe fn cudaHostAlloc(
+        _pHost: *mut *mut c_void,
+        _size: usize,
+        _flags: u32,
+    ) -> i32 {
+        QDP_CUDA_UNAVAILABLE
+    }
+
+    pub(crate) unsafe fn cudaFreeHost(_ptr: *mut c_void) -> i32 {
+        QDP_CUDA_UNAVAILABLE
+    }
+
+    #[allow(dead_code)]
+    pub(crate) unsafe fn cudaPointerGetAttributes(
+        _attributes: *mut CudaPointerAttributes,
+        _ptr: *const c_void,
+    ) -> i32 {
+        QDP_CUDA_UNAVAILABLE
+    }
+
+    pub(crate) unsafe fn cudaMemGetInfo(_free: *mut usize, _total: *mut usize) -> i32 {
+        QDP_CUDA_UNAVAILABLE
+    }
+
+    pub(crate) unsafe fn cudaMemcpyAsync(
+        _dst: *mut c_void,
+        _src: *const c_void,
+        _count: usize,
+        _kind: u32,
+        _stream: *mut c_void,
+    ) -> i32 {
+        QDP_CUDA_UNAVAILABLE
+    }
+
+    pub(crate) unsafe fn cudaEventCreateWithFlags(_event: *mut *mut c_void, _flags: u32) -> i32 {
+        QDP_CUDA_UNAVAILABLE
+    }
+
+    pub(crate) unsafe fn cudaEventRecord(_event: *mut c_void, _stream: *mut c_void) -> i32 {
+        QDP_CUDA_UNAVAILABLE
+    }
+
+    pub(crate) unsafe fn cudaEventDestroy(_event: *mut c_void) -> i32 {
+        QDP_CUDA_UNAVAILABLE
+    }
+
+    pub(crate) unsafe fn cudaStreamWaitEvent(
+        _stream: *mut c_void,
+        _event: *mut c_void,
+        _flags: u32,
+    ) -> i32 {
+        QDP_CUDA_UNAVAILABLE
+    }
+
+    pub(crate) unsafe fn cudaStreamSynchronize(_stream: *mut c_void) -> i32 {
+        QDP_CUDA_UNAVAILABLE
+    }
+
+    pub(crate) unsafe fn cudaMemsetAsync(
+        _devPtr: *mut c_void,
+        _value: i32,
+        _count: usize,
+        _stream: *mut c_void,
+    ) -> i32 {
+        QDP_CUDA_UNAVAILABLE
+    }
+
+    #[allow(dead_code)]
+    pub(crate) unsafe fn cudaEventQuery(_event: *mut c_void) -> i32 {
+        QDP_CUDA_UNAVAILABLE
+    }
+
+    pub(crate) unsafe fn cudaEventSynchronize(_event: *mut c_void) -> i32 {
+        QDP_CUDA_UNAVAILABLE
+    }
+
+    pub(crate) unsafe fn cudaEventElapsedTime(
+        _ms: *mut f32,
+        _start: *mut c_void,
+        _end: *mut c_void,
+    ) -> i32 {
+        QDP_CUDA_UNAVAILABLE
+    }
+}
+
+#[cfg(qdp_no_cuda)]
+pub(crate) use no_cuda_stubs::*;