Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 17 additions & 27 deletions c/src/core/c_api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,20 @@
#include <raft/util/cudart_utils.hpp>
#include <rapids_logger/logger.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/device_memory_resource.hpp>
#include <rmm/mr/cuda_memory_resource.hpp>
#include <rmm/mr/managed_memory_resource.hpp>
#include <rmm/mr/owning_wrapper.hpp>
#include <rmm/mr/per_device_resource.hpp>
#include <rmm/mr/pool_memory_resource.hpp>
#include <rmm/mr/pinned_host_memory_resource.hpp>
#include <rmm/mr/pool_memory_resource.hpp>
#include <rmm/resource_ref.hpp>

#include "../core/exceptions.hpp"

#include <cuda/memory_resource>

#include <cstdint>
#include <memory>
#include <optional>
#include <thread>

extern "C" cuvsError_t cuvsResourcesCreate(cuvsResources_t* res)
Expand Down Expand Up @@ -132,60 +135,47 @@ extern "C" cuvsError_t cuvsRMMAlloc(cuvsResources_t res, void** ptr, size_t byte
{
return cuvs::core::translate_exceptions([=] {
auto res_ptr = reinterpret_cast<raft::resources*>(res);
auto mr = rmm::mr::get_current_device_resource();
*ptr = mr->allocate(raft::resource::get_cuda_stream(*res_ptr), bytes);
auto mr = rmm::mr::get_current_device_resource_ref();
*ptr = mr.allocate(raft::resource::get_cuda_stream(*res_ptr), bytes);
});
}

extern "C" cuvsError_t cuvsRMMFree(cuvsResources_t res, void* ptr, size_t bytes)
{
return cuvs::core::translate_exceptions([=] {
auto res_ptr = reinterpret_cast<raft::resources*>(res);
auto mr = rmm::mr::get_current_device_resource();
mr->deallocate(raft::resource::get_cuda_stream(*res_ptr), ptr, bytes);
auto mr = rmm::mr::get_current_device_resource_ref();
mr.deallocate(raft::resource::get_cuda_stream(*res_ptr), ptr, bytes);
});
}

thread_local std::shared_ptr<
rmm::mr::owning_wrapper<rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>,
rmm::mr::device_memory_resource>>
pool_mr;
thread_local cuda::mr::any_resource<cuda::mr::device_accessible> pool_upstream;
thread_local std::optional<rmm::mr::pool_memory_resource> pool_mr;

extern "C" cuvsError_t cuvsRMMPoolMemoryResourceEnable(int initial_pool_size_percent,
int max_pool_size_percent,
bool managed)
{
return cuvs::core::translate_exceptions([=] {
// Upstream memory resource needs to be a cuda_memory_resource
auto cuda_mr = rmm::mr::get_current_device_resource();
auto* cuda_mr_casted = dynamic_cast<rmm::mr::cuda_memory_resource*>(cuda_mr);
if (cuda_mr_casted == nullptr) {
throw std::runtime_error("Current memory resource is not a cuda_memory_resource");
}

auto initial_size = rmm::percent_of_free_device_memory(initial_pool_size_percent);
auto max_size = rmm::percent_of_free_device_memory(max_pool_size_percent);

auto mr = std::shared_ptr<rmm::mr::device_memory_resource>();
if (managed) {
mr = std::static_pointer_cast<rmm::mr::device_memory_resource>(
std::make_shared<rmm::mr::managed_memory_resource>());
pool_upstream = rmm::mr::managed_memory_resource{};
} else {
mr = std::static_pointer_cast<rmm::mr::device_memory_resource>(
std::make_shared<rmm::mr::cuda_memory_resource>());
pool_upstream = rmm::mr::cuda_memory_resource{};
}

pool_mr =
rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(mr, initial_size, max_size);
pool_mr.emplace(pool_upstream, initial_size, max_size);

rmm::mr::set_current_device_resource(pool_mr.get());
rmm::mr::set_current_device_resource_ref(*pool_mr);
});
}

extern "C" cuvsError_t cuvsRMMMemoryResourceReset()
{
return cuvs::core::translate_exceptions([=] {
rmm::mr::set_current_device_resource(rmm::mr::detail::initial_resource());
rmm::mr::reset_current_device_resource_ref();
pool_mr.reset();
});
}
Expand Down
75 changes: 29 additions & 46 deletions cpp/bench/ann/src/common/cuda_huge_page_resource.hpp
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
* SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/
#pragma once

#include <raft/core/error.hpp>
#include <raft/core/logger_macros.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/device_memory_resource.hpp>
#include <rmm/detail/aligned.hpp>
#include <rmm/resource_ref.hpp>

#include <cuda/memory_resource>

#include <sys/mman.h>

Expand All @@ -17,37 +19,25 @@

namespace raft::mr {
/**
* @brief `device_memory_resource` derived class that uses mmap to allocate memory.
* This class enables memory allocation using huge pages.
* @brief Memory resource that uses mmap to allocate memory with huge pages.
* It is assumed that the allocated memory is directly accessible on device. This currently only
* works on GH systems.
*
* TODO(tfeher): consider improving or removing this helper once we made progress with
* https://github.com/rapidsai/raft/issues/1819
*/
class cuda_huge_page_resource final : public rmm::mr::device_memory_resource {
class cuda_huge_page_resource {
public:
cuda_huge_page_resource() = default;
~cuda_huge_page_resource() override = default;
~cuda_huge_page_resource() = default;
cuda_huge_page_resource(cuda_huge_page_resource const&) = default;
cuda_huge_page_resource(cuda_huge_page_resource&&) = default;
auto operator=(cuda_huge_page_resource const&) -> cuda_huge_page_resource& = default;
auto operator=(cuda_huge_page_resource&&) -> cuda_huge_page_resource& = default;

private:
/**
* @brief Allocates memory of size at least `bytes` using cudaMalloc.
*
* The returned pointer has at least 256B alignment.
*
* @note Stream argument is ignored
*
* @throws `rmm::bad_alloc` if the requested allocation could not be fulfilled
*
* @param bytes The size, in bytes, of the allocation
* @return void* Pointer to the newly allocated memory
*/
auto do_allocate(std::size_t bytes, rmm::cuda_stream_view) -> void* override
void* allocate(cuda::stream_ref,
std::size_t bytes,
std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT)
{
void* addr{nullptr};
addr = mmap(nullptr, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
Expand All @@ -60,36 +50,29 @@ class cuda_huge_page_resource final : public rmm::mr::device_memory_resource {
return addr;
}

/**
* @brief Deallocate memory pointed to by \p p.
*
* @note Stream argument is ignored.
*
* @throws Nothing.
*
* @param p Pointer to be deallocated
*/
void do_deallocate(void* ptr, std::size_t size, rmm::cuda_stream_view) noexcept override
void deallocate(cuda::stream_ref,
void* ptr,
std::size_t size,
std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT) noexcept
{
if (munmap(ptr, size) == -1) { RAFT_LOG_ERROR("huge_page_resource::munmap failed"); }
}

/**
* @brief Compare this resource to another.
*
* Two cuda_huge_page_resources always compare equal, because they can each
* deallocate memory allocated by the other.
*
* @throws Nothing.
*
* @param other The other resource to compare to
* @return true If the two resources are equivalent
* @return false If the two resources are not equal
*/
[[nodiscard]] auto do_is_equal(device_memory_resource const& other) const noexcept
-> bool override
void* allocate_sync(std::size_t bytes, std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT)
{
return allocate(cuda::stream_ref{cudaStream_t{nullptr}}, bytes, alignment);
}

void deallocate_sync(void* ptr,
std::size_t bytes,
std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT) noexcept
{
return dynamic_cast<cuda_huge_page_resource const*>(&other) != nullptr;
deallocate(cuda::stream_ref{cudaStream_t{nullptr}}, ptr, bytes, alignment);
}

bool operator==(cuda_huge_page_resource const&) const noexcept { return true; }

friend void get_property(cuda_huge_page_resource const&, cuda::mr::device_accessible) noexcept {}
};
static_assert(cuda::mr::resource_with<cuda_huge_page_resource, cuda::mr::device_accessible>);
} // namespace raft::mr
32 changes: 13 additions & 19 deletions cpp/bench/ann/src/cuvs/cuvs_ann_bench_utils.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
* SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/
#pragma once
Expand All @@ -15,14 +15,16 @@
#include <raft/core/host_mdspan.hpp>
#include <raft/core/logger.hpp>
#include <raft/core/operators.hpp>
#include <raft/core/resource/device_memory_resource.hpp>
#include <raft/util/cudart_utils.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/mr/device_memory_resource.hpp>
#include <rmm/mr/failure_callback_resource_adaptor.hpp>
#include <rmm/mr/managed_memory_resource.hpp>
#include <rmm/mr/per_device_resource.hpp>
#include <rmm/mr/pool_memory_resource.hpp>
#include <rmm/resource_ref.hpp>

#include <memory>
#include <type_traits>
Expand Down Expand Up @@ -65,17 +67,16 @@ inline auto rmm_oom_callback(std::size_t bytes, void*) -> bool
*/
class shared_raft_resources {
public:
using pool_mr_type = rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>;
using mr_type = rmm::mr::failure_callback_resource_adaptor<pool_mr_type>;
using pool_mr_type = rmm::mr::pool_memory_resource;
using mr_type = rmm::mr::failure_callback_resource_adaptor<>;
using large_mr_type = rmm::mr::managed_memory_resource;

shared_raft_resources()
try
: orig_resource_{rmm::mr::get_current_device_resource()},
pool_resource_(orig_resource_, 1024 * 1024 * 1024ull),
resource_(&pool_resource_, rmm_oom_callback, nullptr),
: pool_resource_(rmm::mr::get_current_device_resource_ref(), 1024 * 1024 * 1024ull),
resource_(pool_resource_, rmm_oom_callback, nullptr),
large_mr_() {
rmm::mr::set_current_device_resource(&resource_);
orig_resource_ = rmm::mr::set_current_device_resource_ref(resource_);
} catch (const std::exception& e) {
auto cuda_status = cudaGetLastError();
size_t free = 0;
Expand All @@ -95,15 +96,12 @@ class shared_raft_resources {
shared_raft_resources(const shared_raft_resources& res) = delete;
auto operator=(const shared_raft_resources& other) -> shared_raft_resources& = delete;

~shared_raft_resources() noexcept { rmm::mr::set_current_device_resource(orig_resource_); }
~shared_raft_resources() noexcept { rmm::mr::set_current_device_resource_ref(orig_resource_); }

auto get_large_memory_resource() noexcept
{
return static_cast<rmm::mr::device_memory_resource*>(&large_mr_);
}
auto get_large_memory_resource() noexcept -> rmm::device_async_resource_ref { return large_mr_; }

private:
rmm::mr::device_memory_resource* orig_resource_;
cuda::mr::any_resource<cuda::mr::device_accessible> orig_resource_;
pool_mr_type pool_resource_;
mr_type resource_;
large_mr_type large_mr_;
Expand All @@ -129,12 +127,8 @@ class configured_raft_resources {
res_{std::make_unique<raft::device_resources>(
rmm::cuda_stream_view(get_stream_from_global_pool()))}
{
// set the large workspace resource to the raft handle, but without the deleter
// (this resource is managed by the shared_res).
raft::resource::set_large_workspace_resource(
*res_,
std::shared_ptr<rmm::mr::device_memory_resource>(shared_res_->get_large_memory_resource(),
raft::void_op{}));
*res_, raft::mr::device_resource{shared_res_->get_large_memory_resource()});
}

/** Default constructor creates all resources anew. */
Expand Down
6 changes: 3 additions & 3 deletions cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h
Original file line number Diff line number Diff line change
Expand Up @@ -166,9 +166,9 @@ class cuvs_cagra : public algo<T>, public algo_gpu {
inline rmm::device_async_resource_ref get_mr(AllocatorType mem_type)
{
switch (mem_type) {
case (AllocatorType::kHostPinned): return &mr_pinned_;
case (AllocatorType::kHostHugePage): return &mr_huge_page_;
default: return rmm::mr::get_current_device_resource();
case (AllocatorType::kHostPinned): return mr_pinned_;
case (AllocatorType::kHostHugePage): return mr_huge_page_;
default: return rmm::mr::get_current_device_resource_ref();
}
}
};
Expand Down
11 changes: 5 additions & 6 deletions cpp/internal/cuvs_internal/neighbors/naive_knn.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
* SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/

Expand All @@ -13,7 +13,8 @@
#include <raft/core/resource/cuda_stream.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/mr/device_memory_resource.hpp>
#include <rmm/mr/per_device_resource.hpp>
#include <rmm/resource_ref.hpp>

namespace cuvs::neighbors {

Expand Down Expand Up @@ -87,8 +88,7 @@ void naive_knn(raft::resources const& handle,
uint32_t k,
cuvs::distance::DistanceType type)
{
rmm::mr::device_memory_resource* mr = nullptr;
auto pool_guard = raft::get_pool_memory_resource(mr, 1024 * 1024);
rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource_ref();

auto stream = raft::resource::get_cuda_stream(handle);
dim3 block_dim(16, 32, 1);
Expand Down Expand Up @@ -116,8 +116,7 @@ void naive_knn(raft::resources const& handle,
static_cast<int>(k),
dist_topk + offset * k,
indices_topk + offset * k,
type != cuvs::distance::DistanceType::InnerProduct,
mr);
type != cuvs::distance::DistanceType::InnerProduct);
}
RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
}
Expand Down
12 changes: 6 additions & 6 deletions cpp/src/cluster/detail/kmeans_balanced.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -364,7 +364,7 @@ void compute_norm(const raft::resources& handle,
raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope("compute_norm");
auto stream = raft::resource::get_cuda_stream(handle);
rmm::device_uvector<MathT> mapped_dataset(
0, stream, mr.value_or(raft::resource::get_workspace_resource(handle)));
0, stream, mr.value_or(raft::resource::get_workspace_resource_ref(handle)));

const MathT* dataset_ptr = nullptr;

Expand Down Expand Up @@ -426,7 +426,7 @@ void predict(const raft::resources& handle,
auto stream = raft::resource::get_cuda_stream(handle);
raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope(
"predict(%zu, %u)", static_cast<size_t>(n_rows), n_clusters);
auto mem_res = mr.value_or(raft::resource::get_workspace_resource(handle));
auto mem_res = mr.value_or(raft::resource::get_workspace_resource_ref(handle));
auto [max_minibatch_size, _mem_per_row] =
calc_minibatch_size<MathT>(n_clusters, n_rows, dim, params.metric, std::is_same_v<T, MathT>);
rmm::device_uvector<MathT> cur_dataset(
Expand Down Expand Up @@ -1038,7 +1038,7 @@ void build_hierarchical(const raft::resources& handle,

// TODO: Remove the explicit managed memory- we shouldn't be creating this on the user's behalf.
rmm::mr::managed_memory_resource managed_memory;
rmm::device_async_resource_ref device_memory = raft::resource::get_workspace_resource(handle);
rmm::device_async_resource_ref device_memory = raft::resource::get_workspace_resource_ref(handle);
auto [max_minibatch_size, mem_per_row] =
calc_minibatch_size<MathT>(n_clusters, n_rows, dim, params.metric, std::is_same_v<T, MathT>);

Expand Down Expand Up @@ -1079,8 +1079,8 @@ void build_hierarchical(const raft::resources& handle,
CounterT;

// build coarse clusters (mesoclusters)
rmm::device_uvector<LabelT> mesocluster_labels_buf(n_rows, stream, &managed_memory);
rmm::device_uvector<CounterT> mesocluster_sizes_buf(n_mesoclusters, stream, &managed_memory);
rmm::device_uvector<LabelT> mesocluster_labels_buf(n_rows, stream, managed_memory);
rmm::device_uvector<CounterT> mesocluster_sizes_buf(n_mesoclusters, stream, managed_memory);
{
rmm::device_uvector<MathT> mesocluster_centers_buf(n_mesoclusters * dim, stream, device_memory);
build_clusters(handle,
Expand Down Expand Up @@ -1136,7 +1136,7 @@ void build_hierarchical(const raft::resources& handle,
fine_clusters_nums_max,
cluster_centers,
mapping_op,
&managed_memory,
managed_memory,
device_memory);
RAFT_EXPECTS(n_clusters_done == n_clusters, "Didn't process all clusters.");

Expand Down
Loading
Loading