diff --git a/qdp/qdp-core/build.rs b/qdp/qdp-core/build.rs index 311ea139dd..b98f008f76 100644 --- a/qdp/qdp-core/build.rs +++ b/qdp/qdp-core/build.rs @@ -14,10 +14,18 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::env; +use std::process::Command; + fn main() { + compile_protos(); + configure_cuda_linkage(); +} + +fn compile_protos() { // Use vendored protoc to avoid missing protoc in CI/dev environments unsafe { - std::env::set_var("PROTOC", protoc_bin_vendored::protoc_bin_path().unwrap()); + env::set_var("PROTOC", protoc_bin_vendored::protoc_bin_path().unwrap()); } let mut config = prost_build::Config::new(); @@ -34,3 +42,54 @@ fn main() { println!("cargo:rerun-if-changed=proto/tensor.proto"); } + +/// Detect the CUDA Runtime toolkit and emit the appropriate link directives. +/// +/// `qdp-core` declares CUDA Runtime API extern symbols in `src/gpu/cuda_ffi.rs` +/// (cudaHostAlloc, cudaMemGetInfo, cudaEventCreateWithFlags, ...). Those symbols +/// must be resolved at link time, which requires `libcudart` from the CUDA +/// Toolkit. Previously the only `-lcudart` directive lived in `qdp-kernels`' +/// build script, and the `qdp_no_cuda` cfg it sets does not propagate +/// cross-crate. The result was confusing linker errors on systems that have +/// the NVIDIA driver but not the toolkit (e.g. PyTorch-only setups, where +/// PyTorch ships its own bundled cudart inside the wheel). +/// +/// This function: +/// * emits `cargo:rustc-link-lib=cudart` and the appropriate +/// `cargo:rustc-link-search` path when nvcc is found, and +/// * emits `cargo:rustc-cfg=qdp_no_cuda` when it is not, gating the +/// extern block in `cuda_ffi.rs` so the crate still links (with stubs +/// that return a non-zero CUDA error at runtime). +fn configure_cuda_linkage() { + println!("cargo::rustc-check-cfg=cfg(qdp_no_cuda)"); + println!("cargo:rerun-if-env-changed=QDP_NO_CUDA"); + println!("cargo:rerun-if-env-changed=CUDA_PATH"); + + let force_no_cuda = env::var("QDP_NO_CUDA") + .map(|v| v == "1" || v.eq_ignore_ascii_case("true") || v.eq_ignore_ascii_case("yes")) + .unwrap_or(false); + + let has_cuda = !force_no_cuda + && Command::new("nvcc") + .arg("--version") + .output() + .is_ok(); + + if !has_cuda { + println!("cargo:rustc-cfg=qdp_no_cuda"); + println!( + "cargo:warning=qdp-core: CUDA toolkit not found (nvcc not in PATH). \ + Building with stub CUDA Runtime symbols; GPU functionality will be \ + unavailable at runtime. Install the CUDA Toolkit to enable GPU support." + ); + return; + } + + let cuda_path = env::var("CUDA_PATH").unwrap_or_else(|_| "/usr/local/cuda".to_string()); + println!("cargo:rustc-link-search=native={}/lib64", cuda_path); + println!("cargo:rustc-link-lib=cudart"); + + // On macOS, also check /usr/local/cuda/lib + #[cfg(target_os = "macos")] + println!("cargo:rustc-link-search=native={}/lib", cuda_path); +} diff --git a/qdp/qdp-core/src/gpu/cuda_ffi.rs b/qdp/qdp-core/src/gpu/cuda_ffi.rs index 2ed60c311e..2a403c91ca 100644 --- a/qdp/qdp-core/src/gpu/cuda_ffi.rs +++ b/qdp/qdp-core/src/gpu/cuda_ffi.rs @@ -44,6 +44,19 @@ pub(crate) const CUDA_SUCCESS: i32 = 0; #[allow(dead_code)] pub(crate) const CUDA_ERROR_NOT_READY: i32 = 34; +// CUDA Runtime API bindings. +// +// On systems with the CUDA Toolkit installed, `qdp-core`'s build script emits +// `cargo:rustc-link-lib=cudart` and the extern block below is used unchanged. +// +// On systems without the toolkit (e.g. driver-only Linux hosts, or macOS), +// the build script sets `qdp_no_cuda` and the stub block is used instead. +// Stubs match each extern's signature and return a non-zero sentinel +// (`QDP_CUDA_UNAVAILABLE`, mirroring qdp-kernels' "999" convention) so the +// crate links cleanly but any actual GPU call surfaces as a runtime error +// through the existing `if ret != 0 { Err(...) }` paths in the callers. + +#[cfg(not(qdp_no_cuda))] unsafe extern "C" { pub(crate) fn cudaHostAlloc(pHost: *mut *mut c_void, size: usize, flags: u32) -> i32; pub(crate) fn cudaFreeHost(ptr: *mut c_void) -> i32; @@ -99,3 +112,111 @@ unsafe extern "C" { /// Reference: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html pub(crate) fn cudaEventElapsedTime(ms: *mut f32, start: *mut c_void, end: *mut c_void) -> i32; } + +// --------------------------------------------------------------------------- +// Stub implementations when building without the CUDA toolkit (`qdp_no_cuda`). +// +// Wrapped in a private module so a single `#[allow(non_snake_case)]` covers +// all stub names (the originals are camelCase to match the real CUDA Runtime +// API; the `extern "C"` block above is exempt from this lint, but plain Rust +// fns are not). +// --------------------------------------------------------------------------- + +#[cfg(qdp_no_cuda)] +#[allow(non_snake_case)] +mod no_cuda_stubs { + use super::CudaPointerAttributes; + use std::ffi::c_void; + + /// Sentinel error code returned by stub CUDA Runtime calls. + /// + /// Matches the "999" convention used by qdp-kernels' kernel-launcher stubs. + const QDP_CUDA_UNAVAILABLE: i32 = 999; + + pub(crate) unsafe fn cudaHostAlloc( + _pHost: *mut *mut c_void, + _size: usize, + _flags: u32, + ) -> i32 { + QDP_CUDA_UNAVAILABLE + } + + pub(crate) unsafe fn cudaFreeHost(_ptr: *mut c_void) -> i32 { + QDP_CUDA_UNAVAILABLE + } + + #[allow(dead_code)] + pub(crate) unsafe fn cudaPointerGetAttributes( + _attributes: *mut CudaPointerAttributes, + _ptr: *const c_void, + ) -> i32 { + QDP_CUDA_UNAVAILABLE + } + + pub(crate) unsafe fn cudaMemGetInfo(_free: *mut usize, _total: *mut usize) -> i32 { + QDP_CUDA_UNAVAILABLE + } + + pub(crate) unsafe fn cudaMemcpyAsync( + _dst: *mut c_void, + _src: *const c_void, + _count: usize, + _kind: u32, + _stream: *mut c_void, + ) -> i32 { + QDP_CUDA_UNAVAILABLE + } + + pub(crate) unsafe fn cudaEventCreateWithFlags(_event: *mut *mut c_void, _flags: u32) -> i32 { + QDP_CUDA_UNAVAILABLE + } + + pub(crate) unsafe fn cudaEventRecord(_event: *mut c_void, _stream: *mut c_void) -> i32 { + QDP_CUDA_UNAVAILABLE + } + + pub(crate) unsafe fn cudaEventDestroy(_event: *mut c_void) -> i32 { + QDP_CUDA_UNAVAILABLE + } + + pub(crate) unsafe fn cudaStreamWaitEvent( + _stream: *mut c_void, + _event: *mut c_void, + _flags: u32, + ) -> i32 { + QDP_CUDA_UNAVAILABLE + } + + pub(crate) unsafe fn cudaStreamSynchronize(_stream: *mut c_void) -> i32 { + QDP_CUDA_UNAVAILABLE + } + + pub(crate) unsafe fn cudaMemsetAsync( + _devPtr: *mut c_void, + _value: i32, + _count: usize, + _stream: *mut c_void, + ) -> i32 { + QDP_CUDA_UNAVAILABLE + } + + #[allow(dead_code)] + pub(crate) unsafe fn cudaEventQuery(_event: *mut c_void) -> i32 { + QDP_CUDA_UNAVAILABLE + } + + pub(crate) unsafe fn cudaEventSynchronize(_event: *mut c_void) -> i32 { + QDP_CUDA_UNAVAILABLE + } + + pub(crate) unsafe fn cudaEventElapsedTime( + _ms: *mut f32, + _start: *mut c_void, + _end: *mut c_void, + ) -> i32 { + QDP_CUDA_UNAVAILABLE + } +} + +#[cfg(qdp_no_cuda)] +pub(crate) use no_cuda_stubs::*;