diff --git a/.devcontainer/cuda12.9-conda/devcontainer.json b/.devcontainer/cuda12.9-conda/devcontainer.json index 448a2b5cc4..055a3ccba4 100644 --- a/.devcontainer/cuda12.9-conda/devcontainer.json +++ b/.devcontainer/cuda12.9-conda/devcontainer.json @@ -5,19 +5,19 @@ "args": { "CUDA": "12.9", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:26.04-cpp-mambaforge" + "BASE": "rapidsai/devcontainers:26.06-cpp-mambaforge" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-26.04-cuda12.9-conda", + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-26.06-cuda12.9-conda", "--ulimit", "nofile=500000" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:26.4": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:26.6": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.9-pip/devcontainer.json b/.devcontainer/cuda12.9-pip/devcontainer.json index e3cea30335..197760306a 100644 --- a/.devcontainer/cuda12.9-pip/devcontainer.json +++ b/.devcontainer/cuda12.9-pip/devcontainer.json @@ -5,26 +5,26 @@ "args": { "CUDA": "12.9", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:26.04-cpp-cuda12.9-ucx1.19.0-openmpi5.0.7" + "BASE": "rapidsai/devcontainers:26.06-cpp-cuda12.9-ucx1.19.0-openmpi5.0.7" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-26.04-cuda12.9-pip", + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-26.06-cuda12.9-pip", "--ulimit", "nofile=500000" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/cuda:26.4": { + "ghcr.io/rapidsai/devcontainers/features/cuda:26.6": { "version": "12.9", "installcuBLAS": true, "installcuSOLVER": true, "installcuRAND": true, "installcuSPARSE": true }, - 
"ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:26.4": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:26.6": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/ucx", diff --git a/.devcontainer/cuda13.1-conda/devcontainer.json b/.devcontainer/cuda13.1-conda/devcontainer.json index 729d936efd..c3f329f278 100644 --- a/.devcontainer/cuda13.1-conda/devcontainer.json +++ b/.devcontainer/cuda13.1-conda/devcontainer.json @@ -5,19 +5,19 @@ "args": { "CUDA": "13.1", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:26.04-cpp-mambaforge" + "BASE": "rapidsai/devcontainers:26.06-cpp-mambaforge" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-26.04-cuda13.1-conda", + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-26.06-cuda13.1-conda", "--ulimit", "nofile=500000" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:26.4": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:26.6": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda13.1-pip/devcontainer.json b/.devcontainer/cuda13.1-pip/devcontainer.json index 0f5ae085a3..e47bd1c3ff 100644 --- a/.devcontainer/cuda13.1-pip/devcontainer.json +++ b/.devcontainer/cuda13.1-pip/devcontainer.json @@ -5,26 +5,26 @@ "args": { "CUDA": "13.1", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:26.04-cpp-cuda13.1-ucx1.19.0-openmpi5.0.7" + "BASE": "rapidsai/devcontainers:26.06-cpp-cuda13.1-ucx1.19.0-openmpi5.0.7" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-26.04-cuda13.1-pip", + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-26.06-cuda13.1-pip", "--ulimit", "nofile=500000" ], "hostRequirements": {"gpu": "optional"}, "features": { - 
"ghcr.io/rapidsai/devcontainers/features/cuda:26.4": { + "ghcr.io/rapidsai/devcontainers/features/cuda:26.6": { "version": "13.1", "installcuBLAS": true, "installcuSOLVER": true, "installcuRAND": true, "installcuSPARSE": true }, - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:26.4": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:26.6": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/ucx", diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 7672a647c4..4bbdbda5ae 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -35,7 +35,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: rocky8-clib-standalone-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main strategy: fail-fast: false matrix: @@ -61,7 +61,7 @@ jobs: branch: ${{ inputs.branch }} arch: "${{matrix.arch}}" date: ${{ inputs.date }} - container_image: "rapidsai/ci-wheel:26.04-cuda${{ matrix.cuda_version }}-rockylinux8-py3.11" + container_image: "rapidsai/ci-wheel:26.06-cuda${{ matrix.cuda_version }}-rockylinux8-py3.11" node_type: "cpu16" requires_license_builder: true script: "ci/build_standalone_c.sh" @@ -71,7 +71,7 @@ jobs: rust-build: needs: cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main # Artifacts are not published from these jobs, so it's safe to run for multiple CUDA versions. 
# If these jobs start producing artifacts, the names will have to differentiate between CUDA versions. strategy: @@ -85,14 +85,14 @@ jobs: branch: ${{ inputs.branch }} arch: "amd64" date: ${{ inputs.date }} - container_image: "rapidsai/ci-conda:26.04-cuda${{ matrix.cuda_version }}-ubuntu24.04-py3.13" + container_image: "rapidsai/ci-conda:26.06-cuda${{ matrix.cuda_version }}-ubuntu24.04-py3.13" node_type: "gpu-l4-latest-1" script: "ci/build_rust.sh" sha: ${{ inputs.sha }} go-build: needs: cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main # Artifacts are not published from these jobs, so it's safe to run for multiple CUDA versions. # If these jobs start producing artifacts, the names will have to differentiate between CUDA versions. strategy: @@ -106,14 +106,14 @@ jobs: branch: ${{ inputs.branch }} arch: "amd64" date: ${{ inputs.date }} - container_image: "rapidsai/ci-conda:26.04-cuda${{ matrix.cuda_version }}-ubuntu24.04-py3.13" + container_image: "rapidsai/ci-conda:26.06-cuda${{ matrix.cuda_version }}-ubuntu24.04-py3.13" node_type: "gpu-l4-latest-1" script: "ci/build_go.sh" sha: ${{ inputs.sha }} java-build: needs: cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main # Artifacts are not published from these jobs, so it's safe to run for multiple CUDA versions. # If these jobs start producing artifacts, the names will have to differentiate between CUDA versions. 
strategy: @@ -127,7 +127,7 @@ jobs: branch: ${{ inputs.branch }} arch: "amd64" date: ${{ inputs.date }} - container_image: "rapidsai/ci-conda:26.04-cuda${{ matrix.cuda_version }}-ubuntu24.04-py3.13" + container_image: "rapidsai/ci-conda:26.06-cuda${{ matrix.cuda_version }}-ubuntu24.04-py3.13" script: "ci/build_java.sh" artifact-name: "cuvs-java-cuda${{ matrix.cuda_version }}" file_to_upload: "java/cuvs-java/target/" @@ -135,7 +135,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -147,7 +147,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -162,19 +162,19 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main with: arch: "amd64" branch: ${{ inputs.branch }} build_type: ${{ inputs.build_type || 'branch' }} - container_image: "rapidsai/ci-conda:26.04-latest" + container_image: "rapidsai/ci-conda:26.06-latest" date: ${{ inputs.date }} node_type: "gpu-l4-latest-1" script: "ci/build_docs.sh" sha: ${{ inputs.sha }} wheel-build-libcuvs: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -189,7 +189,7 @@ jobs: wheel-publish-libcuvs: needs: 
wheel-build-libcuvs secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -200,7 +200,7 @@ jobs: wheel-build-cuvs: needs: wheel-build-libcuvs secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -215,7 +215,7 @@ jobs: wheel-publish-cuvs: needs: wheel-build-cuvs secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@main with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index bbaa1ed778..9ab6c48e39 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -30,7 +30,7 @@ jobs: - devcontainer - telemetry-setup secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@main if: always() with: needs: ${{ toJSON(needs) }} @@ -63,7 +63,7 @@ jobs: changed-files: needs: telemetry-setup secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@main with: files_yaml: | build_docs: @@ -312,7 +312,7 @@ jobs: checks: needs: telemetry-setup secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@main with: enable_check_generated_files: false ignored_pr_jobs: "telemetry-summarize" @@ -322,7 +322,7 @@ jobs: 
conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@main with: build_type: pull-request node_type: cpu16 @@ -330,7 +330,7 @@ jobs: conda-cpp-tests: needs: [conda-cpp-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@main if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp with: build_type: pull-request @@ -338,14 +338,14 @@ jobs: conda-cpp-checks: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@main with: build_type: pull-request symbol_exclusions: (void (thrust::|cub::)) conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@main with: build_type: pull-request script: ci/build_python.sh @@ -354,7 +354,7 @@ jobs: conda-python-tests: needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@main if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python_conda with: build_type: pull-request @@ -362,7 +362,7 @@ jobs: rocky8-clib-standalone-build: needs: [checks] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main strategy: fail-fast: false matrix: @@ -376,7 +376,7 @@ jobs: build_type: pull-request arch: 
"${{matrix.arch}}" date: ${{ inputs.date }}_c - container_image: "rapidsai/ci-wheel:26.04-cuda${{ matrix.cuda_version }}-rockylinux8-py3.11" + container_image: "rapidsai/ci-wheel:26.06-cuda${{ matrix.cuda_version }}-rockylinux8-py3.11" node_type: "cpu16" requires_license_builder: true script: "ci/build_standalone_c.sh --build-tests" @@ -386,7 +386,7 @@ jobs: rocky8-clib-tests: needs: [rocky8-clib-standalone-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp strategy: fail-fast: false @@ -402,13 +402,13 @@ jobs: node_type: "gpu-l4-latest-1" arch: "${{matrix.arch}}" date: ${{ inputs.date }}_c - container_image: "rapidsai/ci-wheel:26.04-cuda${{ matrix.cuda_version }}-rockylinux8-py3.11" + container_image: "rapidsai/ci-wheel:26.06-cuda${{ matrix.cuda_version }}-rockylinux8-py3.11" script: "ci/test_standalone_c.sh libcuvs_c_${{ matrix.cuda_version }}_${{ matrix.arch }}.tar.gz" sha: ${{ inputs.sha }} conda-java-build-and-tests: needs: [conda-cpp-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_java || fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp # Artifacts are not published from these jobs, so it's safe to run for multiple CUDA versions. # If these jobs start producing artifacts, the names will have to differentiate between CUDA versions. 
@@ -422,14 +422,14 @@ jobs: build_type: pull-request node_type: "gpu-l4-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:26.04-cuda${{ matrix.cuda_version }}-ubuntu24.04-py3.13" + container_image: "rapidsai/ci-conda:26.06-cuda${{ matrix.cuda_version }}-ubuntu24.04-py3.13" script: "ci/test_java.sh" artifact-name: "cuvs-java-cuda${{ matrix.cuda_version }}" file_to_upload: "java/cuvs-java/target/" rust-build: needs: [conda-cpp-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_rust || fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp # Artifacts are not published from these jobs, so it's safe to run for multiple CUDA versions. # If these jobs start producing artifacts, the names will have to differentiate between CUDA versions. @@ -443,12 +443,12 @@ jobs: build_type: pull-request node_type: "gpu-l4-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:26.04-cuda${{ matrix.cuda_version }}-ubuntu24.04-py3.13" + container_image: "rapidsai/ci-conda:26.06-cuda${{ matrix.cuda_version }}-ubuntu24.04-py3.13" script: "ci/build_rust.sh" go-build: needs: [conda-cpp-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_go || fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp # Artifacts are not published from these jobs, so it's safe to run for multiple CUDA versions. # If these jobs start producing artifacts, the names will have to differentiate between CUDA versions. 
@@ -462,22 +462,22 @@ jobs: build_type: pull-request node_type: "gpu-l4-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:26.04-cuda${{ matrix.cuda_version }}-ubuntu24.04-py3.13" + container_image: "rapidsai/ci-conda:26.06-cuda${{ matrix.cuda_version }}-ubuntu24.04-py3.13" script: "ci/build_go.sh" docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main with: build_type: pull-request node_type: "gpu-l4-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:26.04-latest" + container_image: "rapidsai/ci-conda:26.06-latest" script: "ci/build_docs.sh" wheel-build-libcuvs: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main with: build_type: pull-request node_type: cpu16 @@ -489,7 +489,7 @@ jobs: wheel-build-cuvs: needs: wheel-build-libcuvs secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main with: build_type: pull-request node_type: cpu8 @@ -501,7 +501,7 @@ jobs: wheel-tests-cuvs: needs: [wheel-build-cuvs, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python_wheels with: build_type: pull-request @@ -509,7 +509,7 @@ jobs: devcontainer: secrets: inherit needs: telemetry-setup - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@main with: arch: '["amd64", "arm64"]' cuda: '["13.1"]' diff --git 
a/.github/workflows/publish-rust.yaml b/.github/workflows/publish-rust.yaml index b366c26db2..be8db7c089 100644 --- a/.github/workflows/publish-rust.yaml +++ b/.github/workflows/publish-rust.yaml @@ -16,7 +16,7 @@ jobs: cuda_version: - '12.9.1' container: - image: "rapidsai/ci-conda:26.04-cuda${{ matrix.cuda_version }}-ubuntu24.04-py3.13" + image: "rapidsai/ci-conda:26.06-cuda${{ matrix.cuda_version }}-ubuntu24.04-py3.13" steps: - uses: actions/checkout@v4 - name: Check if release build diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 9901bec7cb..617a7324bb 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -25,7 +25,7 @@ on: jobs: conda-cpp-checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@main with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -34,7 +34,7 @@ jobs: symbol_exclusions: (void (thrust::|cub::)) conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@main with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -43,7 +43,7 @@ jobs: sha: ${{ inputs.sha }} conda-python-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@main with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} @@ -52,7 +52,7 @@ jobs: sha: ${{ inputs.sha }} conda-java-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main # Artifacts are not published from these jobs, so it's safe to run for multiple CUDA versions. 
# If these jobs start producing artifacts, the names will have to differentiate between CUDA versions. strategy: @@ -68,11 +68,11 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-l4-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:26.04-cuda${{ matrix.cuda_version }}-ubuntu24.04-py3.13" + container_image: "rapidsai/ci-conda:26.06-cuda${{ matrix.cuda_version }}-ubuntu24.04-py3.13" script: "ci/test_java.sh" wheel-tests-cuvs: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main with: build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/trigger-breaking-change-alert.yaml b/.github/workflows/trigger-breaking-change-alert.yaml index 8e0a321687..c471e2a151 100644 --- a/.github/workflows/trigger-breaking-change-alert.yaml +++ b/.github/workflows/trigger-breaking-change-alert.yaml @@ -12,7 +12,7 @@ jobs: trigger-notifier: if: contains(github.event.pull_request.labels.*.name, 'breaking') secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@release/26.04 + uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@main with: sender_login: ${{ github.event.sender.login }} sender_avatar: ${{ github.event.sender.avatar_url }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 28546f8332..6c6ef4622e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -92,7 +92,7 @@ repos: files: rust/.* language: rust - repo: https://github.com/codespell-project/codespell - rev: v2.2.2 + rev: v2.4.1 hooks: - id: codespell additional_dependencies: [tomli] diff --git a/RAPIDS_BRANCH b/RAPIDS_BRANCH index d5ea6ced53..ba2906d066 100644 --- a/RAPIDS_BRANCH +++ b/RAPIDS_BRANCH @@ -1 +1 @@ -release/26.04 +main diff --git a/README.md b/README.md index ea6343a6a9..ccfa62a838 100755 --- a/README.md +++ b/README.md @@ -139,7 +139,7 @@ 
cuvsCagraIndexParamsDestroy(index_params); cuvsResourcesDestroy(res); ``` -For more code examples of the C APIs, including drop-in Cmake project templates, please refer to the [C examples](https://github.com/rapidsai/cuvs/tree/release/26.04/examples/c) +For more code examples of the C APIs, including drop-in Cmake project templates, please refer to the [C examples](https://github.com/rapidsai/cuvs/tree/main/examples/c) ### Rust API @@ -202,7 +202,7 @@ fn cagra_example() -> Result<()> { } ``` -For more code examples of the Rust APIs, including a drop-in project templates, please refer to the [Rust examples](https://github.com/rapidsai/cuvs/tree/release/26.04/examples/rust). +For more code examples of the Rust APIs, including a drop-in project templates, please refer to the [Rust examples](https://github.com/rapidsai/cuvs/tree/main/examples/rust). ## Contributing diff --git a/VERSION b/VERSION index 0bd0e8a95b..cdb610a24d 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -26.04.00 +26.06.00 diff --git a/c/src/core/c_api.cpp b/c/src/core/c_api.cpp index b407f40b3e..6c9da46da0 100644 --- a/c/src/core/c_api.cpp +++ b/c/src/core/c_api.cpp @@ -14,17 +14,20 @@ #include #include #include -#include +#include #include -#include #include -#include #include +#include +#include #include "../core/exceptions.hpp" +#include + #include #include +#include #include extern "C" cuvsError_t cuvsResourcesCreate(cuvsResources_t* res) @@ -132,8 +135,8 @@ extern "C" cuvsError_t cuvsRMMAlloc(cuvsResources_t res, void** ptr, size_t byte { return cuvs::core::translate_exceptions([=] { auto res_ptr = reinterpret_cast(res); - auto mr = rmm::mr::get_current_device_resource(); - *ptr = mr->allocate(raft::resource::get_cuda_stream(*res_ptr), bytes); + auto mr = rmm::mr::get_current_device_resource_ref(); + *ptr = mr.allocate(raft::resource::get_cuda_stream(*res_ptr), bytes); }); } @@ -141,51 +144,38 @@ extern "C" cuvsError_t cuvsRMMFree(cuvsResources_t res, void* ptr, size_t bytes) { return 
cuvs::core::translate_exceptions([=] { auto res_ptr = reinterpret_cast(res); - auto mr = rmm::mr::get_current_device_resource(); - mr->deallocate(raft::resource::get_cuda_stream(*res_ptr), ptr, bytes); + auto mr = rmm::mr::get_current_device_resource_ref(); + mr.deallocate(raft::resource::get_cuda_stream(*res_ptr), ptr, bytes); }); } -thread_local std::shared_ptr< - rmm::mr::owning_wrapper, - rmm::mr::device_memory_resource>> - pool_mr; +thread_local cuda::mr::any_resource pool_upstream; +thread_local std::optional pool_mr; extern "C" cuvsError_t cuvsRMMPoolMemoryResourceEnable(int initial_pool_size_percent, int max_pool_size_percent, bool managed) { return cuvs::core::translate_exceptions([=] { - // Upstream memory resource needs to be a cuda_memory_resource - auto cuda_mr = rmm::mr::get_current_device_resource(); - auto* cuda_mr_casted = dynamic_cast(cuda_mr); - if (cuda_mr_casted == nullptr) { - throw std::runtime_error("Current memory resource is not a cuda_memory_resource"); - } - auto initial_size = rmm::percent_of_free_device_memory(initial_pool_size_percent); auto max_size = rmm::percent_of_free_device_memory(max_pool_size_percent); - auto mr = std::shared_ptr(); if (managed) { - mr = std::static_pointer_cast( - std::make_shared()); + pool_upstream = rmm::mr::managed_memory_resource{}; } else { - mr = std::static_pointer_cast( - std::make_shared()); + pool_upstream = rmm::mr::cuda_memory_resource{}; } - pool_mr = - rmm::mr::make_owning_wrapper(mr, initial_size, max_size); + pool_mr.emplace(pool_upstream, initial_size, max_size); - rmm::mr::set_current_device_resource(pool_mr.get()); + rmm::mr::set_current_device_resource_ref(*pool_mr); }); } extern "C" cuvsError_t cuvsRMMMemoryResourceReset() { return cuvs::core::translate_exceptions([=] { - rmm::mr::set_current_device_resource(rmm::mr::detail::initial_resource()); + rmm::mr::reset_current_device_resource_ref(); pool_mr.reset(); }); } diff --git a/c/tests/CMakeLists.txt b/c/tests/CMakeLists.txt index 
343e70ef63..2d09490975 100644 --- a/c/tests/CMakeLists.txt +++ b/c/tests/CMakeLists.txt @@ -94,7 +94,7 @@ endif() ConfigureTest(NAME cuvs_c_headers PATH core/headers.c) ConfigureTest(NAME cuvs_c_test PATH core/c_api.c) -target_link_libraries(cuvs_c_test PRIVATE CUDA::cudart) +target_link_libraries(cuvs_c_test PRIVATE CUDA::cudart_static) ConfigureTest(NAME cuvs_c_neighbors_test PATH neighbors/c_api.c) # ################################################################################################## diff --git a/ci/build_go.sh b/ci/build_go.sh index af3ed10c88..80370048ff 100755 --- a/ci/build_go.sh +++ b/ci/build_go.sh @@ -31,7 +31,7 @@ set -eu rapids-print-env export CGO_CFLAGS="-I${CONDA_PREFIX}/include" -export CGO_LDFLAGS="-L${CONDA_PREFIX}/lib -lcudart -lcuvs -lcuvs_c" +export CGO_LDFLAGS="-L${CONDA_PREFIX}/lib -lcudart_static -ldl -lrt -lcuvs -lcuvs_c" export LD_LIBRARY_PATH="$CONDA_PREFIX/lib:$LD_LIBRARY_PATH" export CC=clang diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index 66213a213c..369a0cd2e4 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -11,7 +11,6 @@ dependencies: - clang-tools==20.1.4 - clang==20.1.4 - cmake>=3.30.4 -- cuda-cudart-dev - cuda-nvcc - cuda-nvtx-dev - cuda-profiler-api @@ -31,7 +30,8 @@ dependencies: - libcurand-dev - libcusolver-dev - libcusparse-dev -- librmm==26.4.*,>=0.0.0a0 +- libnvjitlink-dev +- librmm==26.6.*,>=0.0.0a0 - make - nccl>=2.19 - ninja @@ -39,7 +39,7 @@ dependencies: - numpydoc - openblas - pre-commit -- pylibraft==26.4.*,>=0.0.0a0 +- pylibraft==26.6.*,>=0.0.0a0 - pytest - pytest-cov - rapids-build-backend>=0.4.0,<0.5.0 diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index 52bcdfa730..8ee005ce49 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml 
@@ -11,7 +11,6 @@ dependencies: - clang-tools==20.1.4 - clang==20.1.4 - cmake>=3.30.4 -- cuda-cudart-dev - cuda-nvcc - cuda-nvtx-dev - cuda-profiler-api @@ -31,7 +30,8 @@ dependencies: - libcurand-dev - libcusolver-dev - libcusparse-dev -- librmm==26.4.*,>=0.0.0a0 +- libnvjitlink-dev +- librmm==26.6.*,>=0.0.0a0 - make - nccl>=2.19 - ninja @@ -39,7 +39,7 @@ dependencies: - numpydoc - openblas - pre-commit -- pylibraft==26.4.*,>=0.0.0a0 +- pylibraft==26.6.*,>=0.0.0a0 - pytest - pytest-cov - rapids-build-backend>=0.4.0,<0.5.0 diff --git a/conda/environments/all_cuda-131_arch-aarch64.yaml b/conda/environments/all_cuda-131_arch-aarch64.yaml index 5c348e8f34..94129ae55a 100644 --- a/conda/environments/all_cuda-131_arch-aarch64.yaml +++ b/conda/environments/all_cuda-131_arch-aarch64.yaml @@ -11,7 +11,6 @@ dependencies: - clang-tools==20.1.4 - clang==20.1.4 - cmake>=3.30.4 -- cuda-cudart-dev - cuda-nvcc - cuda-nvtx-dev - cuda-profiler-api @@ -32,7 +31,7 @@ dependencies: - libcusolver-dev - libcusparse-dev - libnvjitlink-dev -- librmm==26.4.*,>=0.0.0a0 +- librmm==26.6.*,>=0.0.0a0 - make - nccl>=2.19 - ninja @@ -40,7 +39,7 @@ dependencies: - numpydoc - openblas - pre-commit -- pylibraft==26.4.*,>=0.0.0a0 +- pylibraft==26.6.*,>=0.0.0a0 - pytest - pytest-cov - rapids-build-backend>=0.4.0,<0.5.0 diff --git a/conda/environments/all_cuda-131_arch-x86_64.yaml b/conda/environments/all_cuda-131_arch-x86_64.yaml index 5020088a7c..23150f3ba4 100644 --- a/conda/environments/all_cuda-131_arch-x86_64.yaml +++ b/conda/environments/all_cuda-131_arch-x86_64.yaml @@ -11,7 +11,6 @@ dependencies: - clang-tools==20.1.4 - clang==20.1.4 - cmake>=3.30.4 -- cuda-cudart-dev - cuda-nvcc - cuda-nvtx-dev - cuda-profiler-api @@ -32,7 +31,7 @@ dependencies: - libcusolver-dev - libcusparse-dev - libnvjitlink-dev -- librmm==26.4.*,>=0.0.0a0 +- librmm==26.6.*,>=0.0.0a0 - make - nccl>=2.19 - ninja @@ -40,7 +39,7 @@ dependencies: - numpydoc - openblas - pre-commit -- pylibraft==26.4.*,>=0.0.0a0 +- 
pylibraft==26.6.*,>=0.0.0a0 - pytest - pytest-cov - rapids-build-backend>=0.4.0,<0.5.0 diff --git a/conda/environments/bench_ann_cuda-129_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-129_arch-aarch64.yaml index f3cb4c38af..f09884d28c 100644 --- a/conda/environments/bench_ann_cuda-129_arch-aarch64.yaml +++ b/conda/environments/bench_ann_cuda-129_arch-aarch64.yaml @@ -10,14 +10,13 @@ dependencies: - clang==20.1.4 - click - cmake>=3.30.4 -- cuda-cudart-dev - cuda-nvcc - cuda-nvtx-dev - cuda-profiler-api - cuda-python>=12.9.2,<13.0 - cuda-version=12.9 - cupy>=13.6.0 -- cuvs==26.4.*,>=0.0.0a0 +- cuvs==26.6.*,>=0.0.0a0 - cxx-compiler - cython>=3.2.2 - dlpack>=0.8,<1.0 @@ -29,15 +28,16 @@ dependencies: - libcurand-dev - libcusolver-dev - libcusparse-dev -- libcuvs==26.4.*,>=0.0.0a0 -- librmm==26.4.*,>=0.0.0a0 +- libcuvs==26.6.*,>=0.0.0a0 +- libnvjitlink-dev +- librmm==26.6.*,>=0.0.0a0 - matplotlib-base>=3.9 - nccl>=2.19 - ninja - nlohmann_json>=3.12.0 - openblas - pandas -- pylibraft==26.4.*,>=0.0.0a0 +- pylibraft==26.6.*,>=0.0.0a0 - pyyaml - rapids-build-backend>=0.4.0,<0.5.0 - requests diff --git a/conda/environments/bench_ann_cuda-129_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-129_arch-x86_64.yaml index c44e2e58fb..b64caf30b3 100644 --- a/conda/environments/bench_ann_cuda-129_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-129_arch-x86_64.yaml @@ -10,14 +10,13 @@ dependencies: - clang==20.1.4 - click - cmake>=3.30.4 -- cuda-cudart-dev - cuda-nvcc - cuda-nvtx-dev - cuda-profiler-api - cuda-python>=12.9.2,<13.0 - cuda-version=12.9 - cupy>=13.6.0 -- cuvs==26.4.*,>=0.0.0a0 +- cuvs==26.6.*,>=0.0.0a0 - cxx-compiler - cython>=3.2.2 - dlpack>=0.8,<1.0 @@ -31,8 +30,9 @@ dependencies: - libcurand-dev - libcusolver-dev - libcusparse-dev -- libcuvs==26.4.*,>=0.0.0a0 -- librmm==26.4.*,>=0.0.0a0 +- libcuvs==26.6.*,>=0.0.0a0 +- libnvjitlink-dev +- librmm==26.6.*,>=0.0.0a0 - matplotlib-base>=3.9 - mkl-devel=2023 - nccl>=2.19 @@ -40,7 +40,7 @@ dependencies: 
- nlohmann_json>=3.12.0 - openblas - pandas -- pylibraft==26.4.*,>=0.0.0a0 +- pylibraft==26.6.*,>=0.0.0a0 - pyyaml - rapids-build-backend>=0.4.0,<0.5.0 - requests diff --git a/conda/environments/bench_ann_cuda-131_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-131_arch-aarch64.yaml index 2a7f1cd9ea..3758fab63b 100644 --- a/conda/environments/bench_ann_cuda-131_arch-aarch64.yaml +++ b/conda/environments/bench_ann_cuda-131_arch-aarch64.yaml @@ -10,14 +10,13 @@ dependencies: - clang==20.1.4 - click - cmake>=3.30.4 -- cuda-cudart-dev - cuda-nvcc - cuda-nvtx-dev - cuda-profiler-api - cuda-python>=13.0.1,<14.0 - cuda-version=13.1 - cupy>=13.6.0 -- cuvs==26.4.*,>=0.0.0a0 +- cuvs==26.6.*,>=0.0.0a0 - cxx-compiler - cython>=3.2.2 - dlpack>=0.8,<1.0 @@ -29,16 +28,16 @@ dependencies: - libcurand-dev - libcusolver-dev - libcusparse-dev -- libcuvs==26.4.*,>=0.0.0a0 +- libcuvs==26.6.*,>=0.0.0a0 - libnvjitlink-dev -- librmm==26.4.*,>=0.0.0a0 +- librmm==26.6.*,>=0.0.0a0 - matplotlib-base>=3.9 - nccl>=2.19 - ninja - nlohmann_json>=3.12.0 - openblas - pandas -- pylibraft==26.4.*,>=0.0.0a0 +- pylibraft==26.6.*,>=0.0.0a0 - pyyaml - rapids-build-backend>=0.4.0,<0.5.0 - requests diff --git a/conda/environments/bench_ann_cuda-131_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-131_arch-x86_64.yaml index 07fb692de1..5f93dbb946 100644 --- a/conda/environments/bench_ann_cuda-131_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-131_arch-x86_64.yaml @@ -10,14 +10,13 @@ dependencies: - clang==20.1.4 - click - cmake>=3.30.4 -- cuda-cudart-dev - cuda-nvcc - cuda-nvtx-dev - cuda-profiler-api - cuda-python>=13.0.1,<14.0 - cuda-version=13.1 - cupy>=13.6.0 -- cuvs==26.4.*,>=0.0.0a0 +- cuvs==26.6.*,>=0.0.0a0 - cxx-compiler - cython>=3.2.2 - dlpack>=0.8,<1.0 @@ -31,9 +30,9 @@ dependencies: - libcurand-dev - libcusolver-dev - libcusparse-dev -- libcuvs==26.4.*,>=0.0.0a0 +- libcuvs==26.6.*,>=0.0.0a0 - libnvjitlink-dev -- librmm==26.4.*,>=0.0.0a0 +- librmm==26.6.*,>=0.0.0a0 - 
matplotlib-base>=3.9 - mkl-devel=2023 - nccl>=2.19 @@ -41,7 +40,7 @@ dependencies: - nlohmann_json>=3.12.0 - openblas - pandas -- pylibraft==26.4.*,>=0.0.0a0 +- pylibraft==26.6.*,>=0.0.0a0 - pyyaml - rapids-build-backend>=0.4.0,<0.5.0 - requests diff --git a/conda/environments/go_cuda-129_arch-aarch64.yaml b/conda/environments/go_cuda-129_arch-aarch64.yaml index 55842e86b8..7eee810027 100644 --- a/conda/environments/go_cuda-129_arch-aarch64.yaml +++ b/conda/environments/go_cuda-129_arch-aarch64.yaml @@ -24,8 +24,9 @@ dependencies: - libcurand-dev - libcusolver-dev - libcusparse-dev -- libcuvs==26.4.*,>=0.0.0a0 -- libraft==26.4.*,>=0.0.0a0 +- libcuvs==26.6.*,>=0.0.0a0 +- libnvjitlink-dev +- libraft==26.6.*,>=0.0.0a0 - nccl>=2.19 - ninja - sysroot_linux-aarch64==2.28 diff --git a/conda/environments/go_cuda-129_arch-x86_64.yaml b/conda/environments/go_cuda-129_arch-x86_64.yaml index 2854de33b7..0c84899580 100644 --- a/conda/environments/go_cuda-129_arch-x86_64.yaml +++ b/conda/environments/go_cuda-129_arch-x86_64.yaml @@ -24,8 +24,9 @@ dependencies: - libcurand-dev - libcusolver-dev - libcusparse-dev -- libcuvs==26.4.*,>=0.0.0a0 -- libraft==26.4.*,>=0.0.0a0 +- libcuvs==26.6.*,>=0.0.0a0 +- libnvjitlink-dev +- libraft==26.6.*,>=0.0.0a0 - nccl>=2.19 - ninja - sysroot_linux-64==2.28 diff --git a/conda/environments/go_cuda-131_arch-aarch64.yaml b/conda/environments/go_cuda-131_arch-aarch64.yaml index 135f6a88cc..1b1f4a15b1 100644 --- a/conda/environments/go_cuda-131_arch-aarch64.yaml +++ b/conda/environments/go_cuda-131_arch-aarch64.yaml @@ -24,9 +24,9 @@ dependencies: - libcurand-dev - libcusolver-dev - libcusparse-dev -- libcuvs==26.4.*,>=0.0.0a0 +- libcuvs==26.6.*,>=0.0.0a0 - libnvjitlink-dev -- libraft==26.4.*,>=0.0.0a0 +- libraft==26.6.*,>=0.0.0a0 - nccl>=2.19 - ninja - sysroot_linux-aarch64==2.28 diff --git a/conda/environments/go_cuda-131_arch-x86_64.yaml b/conda/environments/go_cuda-131_arch-x86_64.yaml index df6a779331..1a658372a9 100644 --- 
a/conda/environments/go_cuda-131_arch-x86_64.yaml +++ b/conda/environments/go_cuda-131_arch-x86_64.yaml @@ -24,9 +24,9 @@ dependencies: - libcurand-dev - libcusolver-dev - libcusparse-dev -- libcuvs==26.4.*,>=0.0.0a0 +- libcuvs==26.6.*,>=0.0.0a0 - libnvjitlink-dev -- libraft==26.4.*,>=0.0.0a0 +- libraft==26.6.*,>=0.0.0a0 - nccl>=2.19 - ninja - sysroot_linux-64==2.28 diff --git a/conda/environments/rust_cuda-129_arch-aarch64.yaml b/conda/environments/rust_cuda-129_arch-aarch64.yaml index 0aa5a7ea6f..a9a01a4f48 100644 --- a/conda/environments/rust_cuda-129_arch-aarch64.yaml +++ b/conda/environments/rust_cuda-129_arch-aarch64.yaml @@ -21,8 +21,9 @@ dependencies: - libcurand-dev - libcusolver-dev - libcusparse-dev -- libcuvs==26.4.*,>=0.0.0a0 -- libraft==26.4.*,>=0.0.0a0 +- libcuvs==26.6.*,>=0.0.0a0 +- libnvjitlink-dev +- libraft==26.6.*,>=0.0.0a0 - make - nccl>=2.19 - ninja diff --git a/conda/environments/rust_cuda-129_arch-x86_64.yaml b/conda/environments/rust_cuda-129_arch-x86_64.yaml index b9dabfafa7..5e57d4378d 100644 --- a/conda/environments/rust_cuda-129_arch-x86_64.yaml +++ b/conda/environments/rust_cuda-129_arch-x86_64.yaml @@ -21,8 +21,9 @@ dependencies: - libcurand-dev - libcusolver-dev - libcusparse-dev -- libcuvs==26.4.*,>=0.0.0a0 -- libraft==26.4.*,>=0.0.0a0 +- libcuvs==26.6.*,>=0.0.0a0 +- libnvjitlink-dev +- libraft==26.6.*,>=0.0.0a0 - make - nccl>=2.19 - ninja diff --git a/conda/environments/rust_cuda-131_arch-aarch64.yaml b/conda/environments/rust_cuda-131_arch-aarch64.yaml index 062cbc8ea0..31df8d9fb0 100644 --- a/conda/environments/rust_cuda-131_arch-aarch64.yaml +++ b/conda/environments/rust_cuda-131_arch-aarch64.yaml @@ -21,9 +21,9 @@ dependencies: - libcurand-dev - libcusolver-dev - libcusparse-dev -- libcuvs==26.4.*,>=0.0.0a0 +- libcuvs==26.6.*,>=0.0.0a0 - libnvjitlink-dev -- libraft==26.4.*,>=0.0.0a0 +- libraft==26.6.*,>=0.0.0a0 - make - nccl>=2.19 - ninja diff --git a/conda/environments/rust_cuda-131_arch-x86_64.yaml 
b/conda/environments/rust_cuda-131_arch-x86_64.yaml index 2b96d4a64e..d4584b481d 100644 --- a/conda/environments/rust_cuda-131_arch-x86_64.yaml +++ b/conda/environments/rust_cuda-131_arch-x86_64.yaml @@ -21,9 +21,9 @@ dependencies: - libcurand-dev - libcusolver-dev - libcusparse-dev -- libcuvs==26.4.*,>=0.0.0a0 +- libcuvs==26.6.*,>=0.0.0a0 - libnvjitlink-dev -- libraft==26.4.*,>=0.0.0a0 +- libraft==26.6.*,>=0.0.0a0 - make - nccl>=2.19 - ninja diff --git a/conda/recipes/cuvs/recipe.yaml b/conda/recipes/cuvs/recipe.yaml index dcce17cb13..690a2cf1f9 100644 --- a/conda/recipes/cuvs/recipe.yaml +++ b/conda/recipes/cuvs/recipe.yaml @@ -91,7 +91,6 @@ requirements: - if: cuda_major == "12" then: cuda-python >=12.9.2,<13.0 else: cuda-python >=13.0.1,<14.0 - - cuda-cudart ignore_run_exports: by_name: - cuda-version diff --git a/conda/recipes/libcuvs/recipe.yaml b/conda/recipes/libcuvs/recipe.yaml index c598bb59ce..0e1b5451bc 100644 --- a/conda/recipes/libcuvs/recipe.yaml +++ b/conda/recipes/libcuvs/recipe.yaml @@ -72,9 +72,7 @@ cache: - ninja - ${{ stdlib("c") }} host: - - if: cuda_major == "13" - then: - - libnvjitlink-dev + - libnvjitlink-dev - librmm =${{ minor_version }} - libraft-headers =${{ minor_version }} - nccl ${{ nccl_version }} @@ -121,15 +119,12 @@ outputs: - libcurand-dev - libcusolver-dev - libcusparse-dev - - if: cuda_major == "13" - then: - - libnvjitlink-dev + - libnvjitlink-dev run: - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} - libraft-headers =${{ minor_version }} - librmm =${{ minor_version }} - nccl - - cuda-cudart - libcublas - libcurand - libcusolver @@ -183,16 +178,13 @@ outputs: - libcurand-dev - libcusolver-dev - libcusparse-dev - - if: cuda_major == "13" - then: - - libnvjitlink-dev + - libnvjitlink-dev run: - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} - ${{ pin_subpackage("libcuvs-headers", exact=True) }} - libraft-headers =${{ minor_version }} - librmm =${{ minor_version }} - nccl - - 
cuda-cudart - libcublas - libcurand - libcusolver @@ -244,16 +236,13 @@ outputs: - libcurand-dev - libcusolver-dev - libcusparse-dev - - if: cuda_major == "13" - then: - - libnvjitlink-dev + - libnvjitlink-dev run: - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} - ${{ pin_subpackage("libcuvs-headers", exact=True) }} - libraft-headers =${{ minor_version }} - librmm =${{ minor_version }} - nccl - - cuda-cudart - libcublas - libcurand - libcusolver @@ -310,7 +299,6 @@ outputs: - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} - libraft-headers =${{ minor_version }} - nccl - - cuda-cudart - libcublas - libcurand - libcusolver @@ -405,15 +393,12 @@ outputs: - libcurand-dev - libcusolver-dev - libcusparse-dev - - if: cuda_major == "13" - then: - - libnvjitlink-dev + - libnvjitlink-dev run: - ${{ pin_subpackage("libcuvs-headers", exact=True) }} - ${{ pin_subpackage("libcuvs", exact=True) }} - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} - nccl - - cuda-cudart - libcublas - libcurand - libcusolver @@ -482,7 +467,6 @@ outputs: - ${{ pin_subpackage("libcuvs-headers", exact=True) }} - ${{ pin_subpackage("libcuvs", exact=True) }} - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} - - cuda-cudart - libcublas - libcurand - libcusolver diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 8942adba3d..18d3d5a8c8 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -57,7 +57,6 @@ option(CUDA_ENABLE_KERNELINFO "Enable kernel resource usage info" OFF) option(CUDA_ENABLE_LINEINFO "Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler)" OFF ) -option(CUDA_STATIC_RUNTIME "Statically link the CUDA runtime" OFF) option(CUDA_STATIC_MATH_LIBRARIES "Statically link the CUDA math libraries" OFF) option(CUVS_STATIC_RAPIDS_LIBRARIES "Build and statically link RAPIDS libraries" OFF) option(CUDA_LOG_COMPILE_TIME "Write a log of compilation times to 
nvcc_compile_log.csv" OFF) @@ -111,12 +110,11 @@ message(VERBOSE "cuVS: Build CPU only components: ${BUILD_CPU_ONLY}") message(VERBOSE "cuVS: Build ANN benchmarks: ${BUILD_CUVS_BENCH}") message(VERBOSE "cuVS: Build only the shared library: ${CUVS_COMPILE_DYNAMIC_ONLY}") message(VERBOSE "cuVS: Enable detection of conda environment for dependencies: ${DETECT_CONDA_ENV}") -message(VERBOSE "cuVS: Disable depreaction warnings " ${DISABLE_DEPRECATION_WARNINGS}) +message(VERBOSE "cuVS: Disable deprecation warnings " ${DISABLE_DEPRECATION_WARNINGS}) message(VERBOSE "cuVS: Disable OpenMP: ${DISABLE_OPENMP}") message(VERBOSE "cuVS: Enable kernel resource usage info: ${CUDA_ENABLE_KERNELINFO}") message(VERBOSE "cuVS: Enable lineinfo in nvcc: ${CUDA_ENABLE_LINEINFO}") message(VERBOSE "cuVS: Enable nvtx markers: ${CUVS_NVTX}") -message(VERBOSE "cuVS: Statically link the CUDA runtime: ${CUDA_STATIC_RUNTIME}") message(VERBOSE "cuVS: Statically link the CUDA math libraries: ${CUDA_STATIC_MATH_LIBRARIES}") message(VERBOSE "cuVS: Build and statically link RAPIDS libraries: ${CUVS_STATIC_RAPIDS_LIBRARIES}") @@ -153,7 +151,7 @@ endif() if(NOT BUILD_CPU_ONLY) # CUDA runtime - rapids_cuda_init_runtime(USE_STATIC ${CUDA_STATIC_RUNTIME}) + rapids_cuda_init_runtime(USE_STATIC ON) # * find CUDAToolkit package # * determine GPU architectures # * enable the CMake CUDA language @@ -357,108 +355,90 @@ if(NOT BUILD_CPU_ONLY) ) endif() - set(JIT_LTO_TARGET_ARCHITECTURE "") - set(JIT_LTO_COMPILATION OFF) + set(JIT_LTO_TARGET_ARCHITECTURE "70-real") if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0) set(JIT_LTO_TARGET_ARCHITECTURE "75-real") - set(JIT_LTO_COMPILATION ON) endif() - if(JIT_LTO_COMPILATION) - # Generate interleaved scan kernel files at build time - include(cmake/modules/generate_jit_lto_kernels.cmake) + # Generate interleaved scan kernel files at build time + include(cmake/modules/generate_jit_lto_kernels.cmake) - add_library(jit_lto_kernel_usage_requirements INTERFACE) - 
target_include_directories( - jit_lto_kernel_usage_requirements - INTERFACE "${CMAKE_CURRENT_SOURCE_DIR}/include" "${CMAKE_CURRENT_SOURCE_DIR}/src" - "${CMAKE_CURRENT_SOURCE_DIR}/../c/include" - ) - target_compile_options( - jit_lto_kernel_usage_requirements INTERFACE "$<$:${CUVS_CXX_FLAGS}>" - "$<$:${CUVS_CUDA_FLAGS}>" - ) - target_compile_features(jit_lto_kernel_usage_requirements INTERFACE cuda_std_20) - target_link_libraries( - jit_lto_kernel_usage_requirements INTERFACE rmm::rmm raft::raft CCCL::CCCL - ) - - block(PROPAGATE interleaved_scan_files metric_files filter_files post_lambda_files) - set(CMAKE_CUDA_ARCHITECTURES ${JIT_LTO_TARGET_ARCHITECTURE}) - generate_jit_lto_kernels( - interleaved_scan_files - NAME_FORMAT - "interleaved_scan_capacity_@capacity@_veclen_@veclen@_@ascending_descending@_@compute_norm_name@_data_@type_abbrev@_acc_@acc_abbrev@_idx_@idx_abbrev@" - MATRIX_JSON_FILE - "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/interleaved_scan_matrix.json" - KERNEL_INPUT_FILE - "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/interleaved_scan_kernel.cu.in" - EMBEDDED_INPUT_FILE - "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/interleaved_scan_embedded.cpp.in" - OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/interleaved_scan" - KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements - ) - generate_jit_lto_kernels( - metric_files - NAME_FORMAT "metric_@metric_name@_veclen_@veclen@_data_@type_abbrev@_acc_@acc_abbrev@" - MATRIX_JSON_FILE - "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/metric_matrix.json" - KERNEL_INPUT_FILE - "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/metric_kernel.cu.in" - EMBEDDED_INPUT_FILE - "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/metric_embedded.cpp.in" - OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/metric" - KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements - 
) - generate_jit_lto_kernels( - filter_files - NAME_FORMAT "@filter_name@" - MATRIX_JSON_FILE - "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/filter_matrix.json" - KERNEL_INPUT_FILE - "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/filter_kernel.cu.in" - EMBEDDED_INPUT_FILE - "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/filter_embedded.cpp.in" - OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/filter" - KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements - ) - generate_jit_lto_kernels( - post_lambda_files - NAME_FORMAT "@post_lambda_name@" - MATRIX_JSON_FILE - "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/post_lambda_matrix.json" - KERNEL_INPUT_FILE - "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/post_lambda_kernel.cu.in" - EMBEDDED_INPUT_FILE - "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/post_lambda_embedded.cpp.in" - OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/post_lambda" - KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements - ) - endblock() + add_library(jit_lto_kernel_usage_requirements INTERFACE) + target_include_directories( + jit_lto_kernel_usage_requirements + INTERFACE "${CMAKE_CURRENT_SOURCE_DIR}/include" "${CMAKE_CURRENT_SOURCE_DIR}/src" + "${CMAKE_CURRENT_SOURCE_DIR}/../c/include" + ) + target_compile_options( + jit_lto_kernel_usage_requirements INTERFACE "$<$:${CUVS_CXX_FLAGS}>" + "$<$:${CUVS_CUDA_FLAGS}>" + ) + target_compile_features(jit_lto_kernel_usage_requirements INTERFACE cuda_std_20) + target_link_libraries(jit_lto_kernel_usage_requirements INTERFACE rmm::rmm raft::raft CCCL::CCCL) + + block(PROPAGATE interleaved_scan_files metric_files filter_files post_lambda_files) + set(CMAKE_CUDA_ARCHITECTURES ${JIT_LTO_TARGET_ARCHITECTURE}) + generate_jit_lto_kernels( + interleaved_scan_files + NAME_FORMAT + 
"interleaved_scan_capacity_@capacity@_veclen_@veclen@_@ascending_descending@_@compute_norm_name@_data_@type_abbrev@_acc_@acc_abbrev@_idx_@idx_abbrev@" + MATRIX_JSON_FILE + "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/interleaved_scan_matrix.json" + KERNEL_INPUT_FILE + "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/interleaved_scan_kernel.cu.in" + EMBEDDED_INPUT_FILE + "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/interleaved_scan_embedded.cpp.in" + OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/interleaved_scan" + KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements + ) + generate_jit_lto_kernels( + metric_files + NAME_FORMAT "metric_@metric_name@_veclen_@veclen@_data_@type_abbrev@_acc_@acc_abbrev@" + MATRIX_JSON_FILE + "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/metric_matrix.json" + KERNEL_INPUT_FILE + "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/metric_kernel.cu.in" + EMBEDDED_INPUT_FILE + "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/metric_embedded.cpp.in" + OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/metric" + KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements + ) + generate_jit_lto_kernels( + filter_files + NAME_FORMAT "@filter_name@" + MATRIX_JSON_FILE + "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/filter_matrix.json" + KERNEL_INPUT_FILE + "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/filter_kernel.cu.in" + EMBEDDED_INPUT_FILE + "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/filter_embedded.cpp.in" + OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/filter" + KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements + ) + generate_jit_lto_kernels( + post_lambda_files + NAME_FORMAT "@post_lambda_name@" + MATRIX_JSON_FILE + 
"${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/post_lambda_matrix.json" + KERNEL_INPUT_FILE + "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/post_lambda_kernel.cu.in" + EMBEDDED_INPUT_FILE + "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_flat/jit_lto_kernels/post_lambda_embedded.cpp.in" + OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/post_lambda" + KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements + ) + endblock() - add_library( - cuvs_jit_lto_kernels STATIC + set(jit_lto_files ${interleaved_scan_files} ${metric_files} ${filter_files} ${post_lambda_files} src/detail/jit_lto/AlgorithmLauncher.cpp src/detail/jit_lto/AlgorithmPlanner.cpp - src/detail/jit_lto/FragmentDatabase.cpp src/detail/jit_lto/FragmentEntry.cpp src/detail/jit_lto/nvjitlink_checker.cpp - ) - set_target_properties( - cuvs_jit_lto_kernels PROPERTIES POSITION_INDEPENDENT_CODE ON CXX_STANDARD 20 - ) - target_include_directories( - cuvs_jit_lto_kernels - PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/include" "${CMAKE_CURRENT_SOURCE_DIR}/src" - "${CMAKE_CURRENT_SOURCE_DIR}/../c/include" - ) - target_link_libraries(cuvs_jit_lto_kernels PRIVATE raft::raft) - add_library(cuvs::cuvs_jit_lto_kernels ALIAS cuvs_jit_lto_kernels) - endif() + ) add_library( cuvs_objs OBJECT @@ -672,6 +652,7 @@ if(NOT BUILD_CPU_ONLY) src/stats/silhouette_score.cu src/stats/trustworthiness_score.cu ${CUVS_MG_ALGOS} + ${jit_lto_files} ) set_target_properties( @@ -690,10 +671,8 @@ if(NOT BUILD_CPU_ONLY) ) target_compile_definitions( - cuvs_objs - PRIVATE $<$:CUVS_BUILD_CAGRA_HNSWLIB> - $<$:NVTX_ENABLED> - $<$:CUVS_ENABLE_JIT_LTO> + cuvs_objs PRIVATE $<$:CUVS_BUILD_CAGRA_HNSWLIB> + $<$:NVTX_ENABLED> ) target_link_libraries( @@ -768,28 +747,23 @@ if(NOT BUILD_CPU_ONLY) "$<$,$>:${CUVS_DEBUG_CUDA_FLAGS}>" ) target_compile_definitions( - cuvs - PUBLIC $<$:CUVS_BUILD_CAGRA_HNSWLIB> - $<$:NVTX_ENABLED> - $<$:CUVS_ENABLE_JIT_LTO> + cuvs PUBLIC $<$:CUVS_BUILD_CAGRA_HNSWLIB> + 
$<$:NVTX_ENABLED> ) target_link_libraries( cuvs - PUBLIC rmm::rmm - raft::raft + INTERFACE $ + PUBLIC raft::raft cuvs::cuvs_cpp_headers ${CUVS_CTK_MATH_DEPENDENCIES} $> $> $<$:CUDA::nvtx3> - PRIVATE - $ - $ - $ - $<$:CUDA::nvJitLink> - $<$:$> + PRIVATE rmm::rmm $ + $ $ CUDA::nvJitLink ) + set_property(TARGET cuvs PROPERTY NO_CUDART_DEP ON) # ensure CUDA symbols aren't relocated to the middle of the debug build binaries file( @@ -826,10 +800,8 @@ SECTIONS target_compile_options(cuvs_static PRIVATE "$<$:${CUVS_CXX_FLAGS}>") target_compile_definitions( - cuvs_static - PUBLIC $<$:CUVS_BUILD_CAGRA_HNSWLIB> - $<$:NVTX_ENABLED> - $<$:CUVS_ENABLE_JIT_LTO> + cuvs_static PUBLIC $<$:CUVS_BUILD_CAGRA_HNSWLIB> + $<$:NVTX_ENABLED> ) target_include_directories(cuvs_static INTERFACE "$") @@ -839,20 +811,20 @@ SECTIONS target_link_libraries( cuvs_static - PUBLIC rmm::rmm - raft::raft + INTERFACE $ + PUBLIC raft::raft cuvs::cuvs_cpp_headers ${CUVS_CTK_MATH_DEPENDENCIES} $ # needs to be public for DT_NEEDED $> # header only - PRIVATE - $ - $<$:CUDA::nvJitLink> - $<$:CUDA::nvtx3> - $ - $ - $<$:$> + PRIVATE rmm::rmm + $ + CUDA::nvJitLink + $<$:CUDA::nvtx3> + $ + $ ) + set_property(TARGET cuvs_static PROPERTY NO_CUDART_DEP ON) endif() # ################################################################################################ @@ -892,11 +864,9 @@ target_compile_definitions(cuvs::cuvs INTERFACE $<$:NVTX_ENAB include(GNUInstallDirs) include(CPack) - set(target_names cuvs cuvs_static cuvs_jit_lto_kernels cuvs_cpp_headers cuvs_c) - set(component_names cuvs_shared cuvs_static cuvs_static cuvs_cpp_headers c_api) - set(export_names cuvs-shared-exports cuvs-static-exports cuvs-static-exports - cuvs-cpp-headers-exports cuvs-c-exports - ) + set(target_names cuvs cuvs_static cuvs_cpp_headers cuvs_c) + set(component_names cuvs_shared cuvs_static cuvs_cpp_headers c_api) + set(export_names cuvs-shared-exports cuvs-static-exports cuvs-cpp-headers-exports cuvs-c-exports) foreach(target component 
export IN ZIP_LISTS target_names component_names export_names) if(TARGET ${target}) install( diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp index 22859e9ab8..a588b1e2a6 100644 --- a/cpp/bench/ann/src/common/benchmark.hpp +++ b/cpp/bench/ann/src/common/benchmark.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #pragma once @@ -351,15 +351,9 @@ void bench_search(::benchmark::State& state, // Each thread calculates recall on their partition of queries. // evaluate recall - if (dataset->max_k() >= k) { - const std::int32_t* gt = dataset->gt_set(); - const std::uint32_t* filter_bitset = dataset->filter_bitset(MemoryType::kHostMmap); - auto filter = [filter_bitset](std::int32_t i) -> bool { - if (filter_bitset == nullptr) { return true; } - auto word = filter_bitset[i >> 5]; - return word & (1 << (i & 31)); - }; - const std::uint32_t max_k = dataset->max_k(); + if (dataset->max_k() >= k && dataset->gt_maps().has_value()) { + // gt_maps[i] is a hash map of {id, neighbor_rank} for query i + const auto& gt_maps = dataset->gt_maps(); result_buf.transfer_data(MemoryType::kHost, current_algo_props->query_memory_type); auto* neighbors_host = reinterpret_cast(result_buf.data(MemoryType::kHost)); std::size_t rows = std::min(queries_processed, query_set_size); @@ -369,39 +363,49 @@ void bench_search(::benchmark::State& state, // We go through the groundtruth with same stride as the benchmark loop. 
size_t out_offset = 0; size_t batch_offset = (state.thread_index() * n_queries) % query_set_size; + // Avoid CPU oversubscription when parallelizing recall calculation loop + int num_recall_calculation_worker_threads = + std::thread::hardware_concurrency() / benchmark_n_threads - 1; // -1 for the main thread + // ensure non-negative number of workers (possible if hardware_concurrency() + // does not return an expected value) by clamping to 0 + if (num_recall_calculation_worker_threads < 0) { num_recall_calculation_worker_threads = 0; } while (out_offset < rows) { - for (std::size_t i = 0; i < n_queries; i++) { - size_t i_orig_idx = batch_offset + i; - size_t i_out_idx = out_offset + i; - if (i_out_idx < rows) { - /* NOTE: recall correctness & filtering - - In the loop below, we filter the ground truth values on-the-fly. - We need enough ground truth values to compute recall correctly though. - But the ground truth file only contains `max_k` values per row; if there are less valid - values than k among them, we overestimate the recall. Essentially, we compare the first - `filter_pass_count` values of the algorithm output, and this counter can be less than `k`. - In the extreme case of very high filtering rate, we may be bypassing entire rows of - results. However, this is still better than no recall estimate at all. 
- - TODO: consider generating the filtered ground truth on-the-fly - */ - uint32_t filter_pass_count = 0; - for (std::uint32_t l = 0; l < max_k && filter_pass_count < k; l++) { - auto exp_idx = gt[i_orig_idx * max_k + l]; - if (!filter(exp_idx)) { continue; } - filter_pass_count++; - for (std::uint32_t j = 0; j < k; j++) { - auto act_idx = static_cast(neighbors_host[i_out_idx * k + j]); - if (act_idx == exp_idx) { - match_count++; - break; - } - } + std::vector recall_calculation_workers; + recall_calculation_workers.reserve(num_recall_calculation_worker_threads); + std::vector local_match_count(num_recall_calculation_worker_threads + 1); + std::vector local_total_count(num_recall_calculation_worker_threads + 1); + int chunk_size = + n_queries / (num_recall_calculation_worker_threads + 1); // +1 for the main thread + int remainder = n_queries % (num_recall_calculation_worker_threads + 1); + auto recall_calculation = [&](int start, int end, int tid) -> void { + for (int i = start; i < end; ++i) { + size_t i_orig_idx = batch_offset + i; + size_t i_out_idx = out_offset + i; + if (i_out_idx < rows) { + auto* candidates = neighbors_host + i_out_idx * k; + auto [matching, total] = gt_maps->count_matches(i_orig_idx, candidates, k); + local_match_count[tid] += matching; + local_total_count[tid] += total; } - total_count += filter_pass_count; } + }; + // launch worker threads + int start = 0; + for (int tid = 0; tid < num_recall_calculation_worker_threads; tid++) { + int end = start + chunk_size; + if (tid < remainder) { ++end; } + recall_calculation_workers.emplace_back(recall_calculation, start, end, tid); + start = end; } + // main thread works on last chunk + recall_calculation(start, n_queries, num_recall_calculation_worker_threads); + // join all worker threads + for (auto& worker : recall_calculation_workers) { + worker.join(); + } + match_count += std::accumulate(local_match_count.begin(), local_match_count.end(), 0); + total_count += 
std::accumulate(local_total_count.begin(), local_total_count.end(), 0); + out_offset += n_queries; batch_offset = (batch_offset + queries_stride) % query_set_size; } diff --git a/cpp/bench/ann/src/common/cuda_huge_page_resource.hpp b/cpp/bench/ann/src/common/cuda_huge_page_resource.hpp index 8039187bde..d63a1eede5 100644 --- a/cpp/bench/ann/src/common/cuda_huge_page_resource.hpp +++ b/cpp/bench/ann/src/common/cuda_huge_page_resource.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #pragma once @@ -7,8 +7,10 @@ #include #include -#include -#include +#include +#include + +#include #include @@ -17,37 +19,25 @@ namespace raft::mr { /** - * @brief `device_memory_resource` derived class that uses mmap to allocate memory. - * This class enables memory allocation using huge pages. + * @brief Memory resource that uses mmap to allocate memory with huge pages. * It is assumed that the allocated memory is directly accessible on device. This currently only * works on GH systems. * * TODO(tfeher): consider improving or removing this helper once we made progress with * https://github.com/rapidsai/raft/issues/1819 */ -class cuda_huge_page_resource final : public rmm::mr::device_memory_resource { +class cuda_huge_page_resource { public: cuda_huge_page_resource() = default; - ~cuda_huge_page_resource() override = default; + ~cuda_huge_page_resource() = default; cuda_huge_page_resource(cuda_huge_page_resource const&) = default; cuda_huge_page_resource(cuda_huge_page_resource&&) = default; auto operator=(cuda_huge_page_resource const&) -> cuda_huge_page_resource& = default; auto operator=(cuda_huge_page_resource&&) -> cuda_huge_page_resource& = default; - private: - /** - * @brief Allocates memory of size at least `bytes` using cudaMalloc. - * - * The returned pointer has at least 256B alignment. 
- * - * @note Stream argument is ignored - * - * @throws `rmm::bad_alloc` if the requested allocation could not be fulfilled - * - * @param bytes The size, in bytes, of the allocation - * @return void* Pointer to the newly allocated memory - */ - auto do_allocate(std::size_t bytes, rmm::cuda_stream_view) -> void* override + void* allocate(cuda::stream_ref, + std::size_t bytes, + std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT) { void* addr{nullptr}; addr = mmap(nullptr, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); @@ -60,36 +50,29 @@ class cuda_huge_page_resource final : public rmm::mr::device_memory_resource { return addr; } - /** - * @brief Deallocate memory pointed to by \p p. - * - * @note Stream argument is ignored. - * - * @throws Nothing. - * - * @param p Pointer to be deallocated - */ - void do_deallocate(void* ptr, std::size_t size, rmm::cuda_stream_view) noexcept override + void deallocate(cuda::stream_ref, + void* ptr, + std::size_t size, + std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT) noexcept { if (munmap(ptr, size) == -1) { RAFT_LOG_ERROR("huge_page_resource::munmap failed"); } } - /** - * @brief Compare this resource to another. - * - * Two cuda_huge_page_resources always compare equal, because they can each - * deallocate memory allocated by the other. - * - * @throws Nothing. 
- * - * @param other The other resource to compare to - * @return true If the two resources are equivalent - * @return false If the two resources are not equal - */ - [[nodiscard]] auto do_is_equal(device_memory_resource const& other) const noexcept - -> bool override + void* allocate_sync(std::size_t bytes, std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT) + { + return allocate(cuda::stream_ref{cudaStream_t{nullptr}}, bytes, alignment); + } + + void deallocate_sync(void* ptr, + std::size_t bytes, + std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT) noexcept { - return dynamic_cast(&other) != nullptr; + deallocate(cuda::stream_ref{cudaStream_t{nullptr}}, ptr, bytes, alignment); } + + bool operator==(cuda_huge_page_resource const&) const noexcept { return true; } + + friend void get_property(cuda_huge_page_resource const&, cuda::mr::device_accessible) noexcept {} }; +static_assert(cuda::mr::resource_with); } // namespace raft::mr diff --git a/cpp/bench/ann/src/common/dataset.hpp b/cpp/bench/ann/src/common/dataset.hpp index b0aae76977..8ef70bc452 100644 --- a/cpp/bench/ann/src/common/dataset.hpp +++ b/cpp/bench/ann/src/common/dataset.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ #pragma once @@ -14,6 +14,7 @@ #include #include #include +#include namespace cuvs::bench { @@ -33,10 +34,112 @@ void generate_bernoulli(CarrierT* data, size_t words, double p) } }; +template +struct ground_truth_map { + using bitset_carrier_type = uint32_t; + static constexpr uint32_t kMaxQueriesForRecall = 10'000; + + explicit ground_truth_map(std::string file_name, + uint32_t n_queries, + std::optional>& filter_bitset) + : gt_maps_(n_queries) + { + // Eagerly iterate over and optionally filter the ground truth set to build gt_maps_ for up to + // kMaxQueriesForRecall queries + /* NOTE: recall correctness & filtering + + We generate the filtered ground truth values and build unordered_maps with them to + enable O(1) lookup. We need enough ground truth values to compute recall correctly + though. But the ground truth file only contains `max_k_` values per row; if there are + less valid values than k among them, we overestimate the recall. Essentially, we compare + the first `gt_maps_[query_idx].size()` values of the algorithm output, and this value can be + less than `k`. In the extreme case of very high filtering rate, we may be bypassing + entire rows of results. However, this is still better than no recall estimate at all. 
+ + */ + auto ground_truth_set = blob(file_name); + max_k_ = ground_truth_set.n_cols(); + auto filter = [&](T i) -> bool { + if (!filter_bitset.has_value()) { return true; } + // bitset is `32 = bitset_carrier_type * 8` times more dense than the data + // use bitwise arithmetic to get the `row_id` and correct bit pos in the `word` + auto word = filter_bitset->data(MemoryType::kHostMmap)[i >> 5]; + return word & (1 << (i & 31)); + }; + // Avoid CPU oversubscription when parallelizing recall calculation loop + int num_map_building_worker_threads = + std::thread::hardware_concurrency() - 1; // -1 for the main thread + // ensure non-negative number of workers (possible if hardware_concurrency() + // does not return an expected value) by clamping to 0 + if (num_map_building_worker_threads < 0) { num_map_building_worker_threads = 0; } + std::vector gt_map_building_workers; + gt_map_building_workers.reserve(num_map_building_worker_threads); + int chunk_size = n_queries / (num_map_building_worker_threads + 1); + int remainder = n_queries % (num_map_building_worker_threads + 1); + int stride = (n_queries - 1) / kMaxQueriesForRecall + 1; // round-up division + auto build_gt_map = [&](int start, int end, int tid) -> void { + for (int query_idx = start; query_idx < end; ++query_idx) { + if (query_idx % stride) continue; + for (std::uint32_t neighbor_rank = 0; neighbor_rank < max_k_; ++neighbor_rank) { + auto id = ground_truth_set.data()[query_idx * max_k_ + neighbor_rank]; + if (!filter(id)) { continue; } + if (gt_maps_[query_idx].count(id)) { + throw std::invalid_argument( + "Duplicate neighbor id found in ground truth set for query " + + std::to_string(query_idx)); + } + gt_maps_[query_idx][id] = neighbor_rank; + } + } + }; + // launch worker threads + int start = 0; + for (int tid = 0; tid < num_map_building_worker_threads; tid++) { + int end = start + chunk_size; + if (tid < remainder) { ++end; } + gt_map_building_workers.emplace_back(build_gt_map, start, end, tid); + 
start = end; + } + // main thread works on last chunk + build_gt_map(start, n_queries, num_map_building_worker_threads); + // join all worker threads + for (auto& worker : gt_map_building_workers) { + worker.join(); + } + } + + [[nodiscard]] auto max_k() const -> uint32_t { return max_k_; } + + template + [[nodiscard]] auto count_matches(size_t query_idx, const index_type* candidates, uint32_t k) const + -> std::pair + { + if (query_idx >= gt_maps_.size() || gt_maps_[query_idx].empty()) return {0, 0}; + + size_t matching = 0; + for (uint32_t i = 0; i < k; ++i) { + auto act_idx = candidates[i]; + if (gt_maps_[query_idx].count(act_idx) && + static_cast(gt_maps_[query_idx].at(act_idx)) < k) { + ++matching; + } + } + size_t total = std::min(gt_maps_[query_idx].size(), static_cast(k)); + return {matching, total}; + } + + private: + // Hash maps of {id, neighbor_rank} for up to kMaxQueriesForRecall queries in the ground truth set + // e.g. gt_maps_[i][j] = k means that for the i-th query in the ground truth set, the neighbor + // with idx j is the k-th nearest. Note that the nearest neighbor rank starts from 0. 
+ std::vector> gt_maps_; + uint32_t max_k_ = 0; // number of nearest neighbors in the ground truth +}; + template struct dataset { public: - using bitset_carrier_type = uint32_t; + using bitset_carrier_type = typename ground_truth_map::bitset_carrier_type; static inline constexpr size_t kBitsPerCarrierValue = sizeof(bitset_carrier_type) * 8; private: @@ -44,8 +147,8 @@ struct dataset { std::string distance_; blob base_set_; blob query_set_; - std::optional> ground_truth_set_; std::optional> filter_bitset_; + std::optional> ground_truth_map_; // Protects the lazy mutations of the blobs accessed by multiple threads mutable std::mutex mutex_; @@ -73,10 +176,7 @@ struct dataset { : name_{std::move(name)}, distance_{std::move(distance)}, base_set_{base_file, subset_first_row, subset_size}, - query_set_{query_file}, - ground_truth_set_{groundtruth_neighbors_file.has_value() - ? std::make_optional>(groundtruth_neighbors_file.value()) - : std::nullopt} + query_set_{query_file} { if (filtering_rate.has_value()) { // Generate a random bitset for filtering @@ -94,6 +194,11 @@ struct dataset { 1.0 - filtering_rate.value()); filter_bitset_.emplace(std::move(bitset_blob)); } + + if (groundtruth_neighbors_file.has_value()) { + ground_truth_map_.emplace(ground_truth_map{ + groundtruth_neighbors_file.value(), query_set_.n_rows(), filter_bitset_}); + } } [[nodiscard]] auto name() const -> std::string { return name_; } @@ -118,8 +223,7 @@ struct dataset { } [[nodiscard]] auto max_k() const -> uint32_t { - std::lock_guard lock(mutex_); - if (ground_truth_set_.has_value()) { return ground_truth_set_->n_cols(); } + if (ground_truth_map_.has_value()) { return ground_truth_map_->max_k(); } return 0; } [[nodiscard]] auto base_set_size() const -> size_t @@ -137,11 +241,9 @@ struct dataset { return r; } - [[nodiscard]] auto gt_set() const -> const IdxT* + [[nodiscard]] auto gt_maps() const -> const std::optional>& { - std::lock_guard lock(mutex_); - if (ground_truth_set_.has_value()) { 
return ground_truth_set_->data(); } - return nullptr; + return ground_truth_map_; } [[nodiscard]] auto query_set() const -> const DataT* diff --git a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_utils.h b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_utils.h index 83cb7303c8..c0966c0391 100644 --- a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_utils.h +++ b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_utils.h @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #pragma once @@ -15,14 +15,16 @@ #include #include #include +#include #include #include #include -#include #include #include +#include #include +#include #include #include @@ -65,17 +67,16 @@ inline auto rmm_oom_callback(std::size_t bytes, void*) -> bool */ class shared_raft_resources { public: - using pool_mr_type = rmm::mr::pool_memory_resource; - using mr_type = rmm::mr::failure_callback_resource_adaptor; + using pool_mr_type = rmm::mr::pool_memory_resource; + using mr_type = rmm::mr::failure_callback_resource_adaptor<>; using large_mr_type = rmm::mr::managed_memory_resource; shared_raft_resources() try - : orig_resource_{rmm::mr::get_current_device_resource()}, - pool_resource_(orig_resource_, 1024 * 1024 * 1024ull), - resource_(&pool_resource_, rmm_oom_callback, nullptr), + : pool_resource_(rmm::mr::get_current_device_resource_ref(), 1024 * 1024 * 1024ull), + resource_(pool_resource_, rmm_oom_callback, nullptr), large_mr_() { - rmm::mr::set_current_device_resource(&resource_); + orig_resource_ = rmm::mr::set_current_device_resource_ref(resource_); } catch (const std::exception& e) { auto cuda_status = cudaGetLastError(); size_t free = 0; @@ -95,15 +96,12 @@ class shared_raft_resources { shared_raft_resources(const shared_raft_resources& res) = delete; auto operator=(const shared_raft_resources& other) -> shared_raft_resources& = delete; - ~shared_raft_resources() noexcept { 
rmm::mr::set_current_device_resource(orig_resource_); } + ~shared_raft_resources() noexcept { rmm::mr::set_current_device_resource_ref(orig_resource_); } - auto get_large_memory_resource() noexcept - { - return static_cast(&large_mr_); - } + auto get_large_memory_resource() noexcept -> rmm::device_async_resource_ref { return large_mr_; } private: - rmm::mr::device_memory_resource* orig_resource_; + cuda::mr::any_resource orig_resource_; pool_mr_type pool_resource_; mr_type resource_; large_mr_type large_mr_; @@ -129,12 +127,8 @@ class configured_raft_resources { res_{std::make_unique( rmm::cuda_stream_view(get_stream_from_global_pool()))} { - // set the large workspace resource to the raft handle, but without the deleter - // (this resource is managed by the shared_res). raft::resource::set_large_workspace_resource( - *res_, - std::shared_ptr(shared_res_->get_large_memory_resource(), - raft::void_op{})); + *res_, raft::mr::device_resource{shared_res_->get_large_memory_resource()}); } /** Default constructor creates all resources anew. 
*/ diff --git a/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h b/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h index 34fea2f82a..98dd94c2e1 100644 --- a/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h +++ b/cpp/bench/ann/src/cuvs/cuvs_cagra_wrapper.h @@ -166,9 +166,9 @@ class cuvs_cagra : public algo, public algo_gpu { inline rmm::device_async_resource_ref get_mr(AllocatorType mem_type) { switch (mem_type) { - case (AllocatorType::kHostPinned): return &mr_pinned_; - case (AllocatorType::kHostHugePage): return &mr_huge_page_; - default: return rmm::mr::get_current_device_resource(); + case (AllocatorType::kHostPinned): return mr_pinned_; + case (AllocatorType::kHostHugePage): return mr_huge_page_; + default: return rmm::mr::get_current_device_resource_ref(); } } }; diff --git a/cpp/cmake/thirdparty/get_faiss.cmake b/cpp/cmake/thirdparty/get_faiss.cmake index 75a4473276..dfdc1a4d31 100644 --- a/cpp/cmake/thirdparty/get_faiss.cmake +++ b/cpp/cmake/thirdparty/get_faiss.cmake @@ -48,7 +48,7 @@ function(find_and_configure_faiss) "FAISS_ENABLE_CUVS ${PKG_ENABLE_GPU}" "FAISS_ENABLE_PYTHON OFF" "FAISS_OPT_LEVEL ${CUVS_FAISS_OPT_LEVEL}" - "FAISS_USE_CUDA_TOOLKIT_STATIC ${CUDA_STATIC_RUNTIME}" + "FAISS_USE_CUDA_TOOLKIT_STATIC ON" "BUILD_TESTING OFF" "CMAKE_MESSAGE_LOG_LEVEL VERBOSE" ) diff --git a/cpp/include/cuvs/cluster/agglomerative.hpp b/cpp/include/cuvs/cluster/agglomerative.hpp index 817826856a..050f0cecdc 100644 --- a/cpp/include/cuvs/cluster/agglomerative.hpp +++ b/cpp/include/cuvs/cluster/agglomerative.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2021-2026, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ @@ -91,7 +91,7 @@ class single_linkage_output { * @param[in] X dense input matrix in row-major layout * @param[out] dendrogram output dendrogram (size [n_rows - 1] * 2) * @param[out] labels output labels vector (size n_rows) - * @param[in] metric distance metrix to use when constructing connectivities graph + * @param[in] metric distance metric to use when constructing connectivities graph * @param[in] n_clusters number of clusters to assign data samples * @param[in] linkage strategy for constructing the linkage. PAIRWISE uses more memory but can be faster for diff --git a/cpp/include/cuvs/detail/jit_lto/AlgorithmPlanner.hpp b/cpp/include/cuvs/detail/jit_lto/AlgorithmPlanner.hpp index 138b6720aa..d463003b23 100644 --- a/cpp/include/cuvs/detail/jit_lto/AlgorithmPlanner.hpp +++ b/cpp/include/cuvs/detail/jit_lto/AlgorithmPlanner.hpp @@ -14,21 +14,22 @@ struct FragmentEntry; struct AlgorithmPlanner { - AlgorithmPlanner(std::string fragment_key, std::string entrypoint) - : fragment_key(std::move(fragment_key)), entrypoint(std::move(entrypoint)) - { - } + AlgorithmPlanner(std::string entrypoint) : entrypoint(std::move(entrypoint)) {} std::shared_ptr get_launcher(); - std::string fragment_key; std::string entrypoint; - std::vector device_functions; - std::vector fragments; + std::vector fragments; + + void add_fragment(const FragmentEntry& fragment); + + template + void add_fragment() + { + add_fragment(FragmentT{}); + } private: - void add_entrypoint(); - void add_device_functions(); - std::string get_device_functions_key() const; + std::string get_fragments_key() const; std::shared_ptr build(); }; diff --git a/cpp/include/cuvs/detail/jit_lto/FragmentDatabase.hpp b/cpp/include/cuvs/detail/jit_lto/FragmentDatabase.hpp deleted file mode 100644 index aeb170d861..0000000000 --- a/cpp/include/cuvs/detail/jit_lto/FragmentDatabase.hpp +++ /dev/null @@ -1,45 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA 
CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ - -#pragma once - -#include -#include -#include - -#include "FragmentEntry.hpp" -#include "MakeFragmentKey.hpp" - -class FragmentDatabase { - public: - FragmentDatabase(FragmentDatabase const&) = delete; - FragmentDatabase(FragmentDatabase&&) = delete; - - FragmentDatabase& operator=(FragmentDatabase&&) = delete; - FragmentDatabase& operator=(FragmentDatabase const&) = delete; - - FragmentEntry* get_fragment(std::string const& key); - - private: - FragmentDatabase(); - - bool make_cache_entry(std::string const& key); - - friend FragmentDatabase& fragment_database(); - - friend void registerFatbinFragment(std::string const& algo, - std::string const& params, - unsigned char const* blob, - std::size_t size); - - std::unordered_map> cache; -}; - -FragmentDatabase& fragment_database(); - -void registerFatbinFragment(std::string const& algo, - std::string const& params, - unsigned char const* blob, - std::size_t size); diff --git a/cpp/include/cuvs/detail/jit_lto/FragmentEntry.hpp b/cpp/include/cuvs/detail/jit_lto/FragmentEntry.hpp index a376068425..3e00c7d566 100644 --- a/cpp/include/cuvs/detail/jit_lto/FragmentEntry.hpp +++ b/cpp/include/cuvs/detail/jit_lto/FragmentEntry.hpp @@ -12,21 +12,27 @@ #include +#include "nvjitlink_checker.hpp" + struct FragmentEntry { - FragmentEntry(std::string const& key); + virtual bool add_to(nvJitLinkHandle& handle) const = 0; - bool operator==(const FragmentEntry& rhs) const { return compute_key == rhs.compute_key; } + virtual const char* get_key() const = 0; +}; - virtual bool add_to(nvJitLinkHandle& handle) const = 0; +struct FatbinFragmentEntry : FragmentEntry { + virtual const uint8_t* get_data() const = 0; + + virtual size_t get_length() const = 0; - std::string compute_key{}; + bool add_to(nvJitLinkHandle& handle) const override final; }; -struct FatbinFragmentEntry final : FragmentEntry { - FatbinFragmentEntry(std::string const& key, unsigned char const* view, std::size_t 
size); +template +struct StaticFatbinFragmentEntry : FatbinFragmentEntry { + const uint8_t* get_data() const override final { return FragmentT::data; } - virtual bool add_to(nvJitLinkHandle& handle) const; + size_t get_length() const override final { return FragmentT::length; } - std::size_t data_size = 0; - unsigned char const* data_view = nullptr; + const char* get_key() const override final { return typeid(FragmentT).name(); } }; diff --git a/cpp/include/cuvs/detail/jit_lto/RegisterKernelFragment.hpp b/cpp/include/cuvs/detail/jit_lto/RegisterKernelFragment.hpp deleted file mode 100644 index 5643be6523..0000000000 --- a/cpp/include/cuvs/detail/jit_lto/RegisterKernelFragment.hpp +++ /dev/null @@ -1,24 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ - -#pragma once - -#include "MakeFragmentKey.hpp" - -void registerFatbinFragment(std::string const& algo, - std::string const& params, - unsigned char const* blob, - std::size_t size); - -namespace { - -template -void registerAlgorithm(std::string algo, unsigned char const* blob, std::size_t size) -{ - auto key = make_fragment_key(); - registerFatbinFragment(algo, key, blob, size); -} - -} // namespace diff --git a/cpp/include/cuvs/detail/jit_lto/ivf_flat/interleaved_scan_fragments.hpp b/cpp/include/cuvs/detail/jit_lto/ivf_flat/interleaved_scan_fragments.hpp new file mode 100644 index 0000000000..dccb26db68 --- /dev/null +++ b/cpp/include/cuvs/detail/jit_lto/ivf_flat/interleaved_scan_fragments.hpp @@ -0,0 +1,52 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include + +namespace cuvs::neighbors::ivf_flat::detail { + +template +struct InterleavedScanFragmentEntry final + : StaticFatbinFragmentEntry> { + static const uint8_t* const data; + static const size_t length; +}; + +template +struct MetricFragmentEntry final + : StaticFatbinFragmentEntry> { + static const uint8_t* const data; + static const size_t length; +}; + +template +struct FilterFragmentEntry final + : StaticFatbinFragmentEntry> { + static const uint8_t* const data; + static const size_t length; +}; + +template +struct PostLambdaFragmentEntry final + : StaticFatbinFragmentEntry> { + static const uint8_t* const data; + static const size_t length; +}; + +} // namespace cuvs::neighbors::ivf_flat::detail diff --git a/cpp/include/cuvs/detail/jit_lto/ivf_flat/interleaved_scan_tags.hpp b/cpp/include/cuvs/detail/jit_lto/ivf_flat/interleaved_scan_tags.hpp index 8cb5b99a7e..a70e14df66 100644 --- a/cpp/include/cuvs/detail/jit_lto/ivf_flat/interleaved_scan_tags.hpp +++ b/cpp/include/cuvs/detail/jit_lto/ivf_flat/interleaved_scan_tags.hpp @@ -70,10 +70,7 @@ template struct tag_filter {}; // Tag types for distance metrics with full template info -template struct tag_metric_euclidean {}; - -template struct tag_metric_inner_product {}; // Tag types for post-processing diff --git a/cpp/src/detail/jit_lto/nvjitlink_checker.hpp b/cpp/include/cuvs/detail/jit_lto/nvjitlink_checker.hpp similarity index 100% rename from cpp/src/detail/jit_lto/nvjitlink_checker.hpp rename to cpp/include/cuvs/detail/jit_lto/nvjitlink_checker.hpp diff --git a/cpp/include/cuvs/neighbors/all_neighbors.hpp b/cpp/include/cuvs/neighbors/all_neighbors.hpp index 9f20a28134..70e066ef2f 100644 --- a/cpp/include/cuvs/neighbors/all_neighbors.hpp +++ b/cpp/include/cuvs/neighbors/all_neighbors.hpp @@ -111,7 +111,7 @@ struct all_neighbors_params { * all_neighbors::build(res, params, dataset, indices.view(), distances.view()); * @endcode * - * 
@param[in] handle raft::resources is an object mangaging resources + * @param[in] handle raft::resources is an object managing resources * @param[in] params an instance of all_neighbors::all_neighbors_params that are parameters * to build all-neighbors knn graph * @param[in] dataset raft::host_matrix_view input dataset expected to be located @@ -147,7 +147,7 @@ void build( * all_neighbors::build(res, params, dataset, indices.view(), distances.view()); * @endcode * - * @param[in] handle raft::resources is an object mangaging resources + * @param[in] handle raft::resources is an object managing resources * @param[in] params an instance of all_neighbors::all_neighbors_params that are parameters * to build all-neighbors knn graph * @param[in] dataset raft::device_matrix_view input dataset expected to be located diff --git a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp index 6dd19b6781..a7e1249677 100644 --- a/cpp/include/cuvs/neighbors/cagra.hpp +++ b/cpp/include/cuvs/neighbors/cagra.hpp @@ -897,6 +897,7 @@ struct index : cuvs::neighbors::index { * - L2 * - InnerProduct (currently only supported with IVF-PQ as the build algorithm) * - CosineExpanded + * - L1 (currently only supported with NN-Descent and Iterative Search as the build algorithm) * * Usage example: * @code{.cpp} @@ -935,6 +936,7 @@ auto build(raft::resources const& res, * - L2 * - InnerProduct (currently only supported with IVF-PQ as the build algorithm) * - CosineExpanded + * - L1 (currently only supported with NN-Descent and Iterative Search as the build algorithm) * * Usage example: * @code{.cpp} @@ -973,6 +975,7 @@ auto build(raft::resources const& res, * - L2 * - InnerProduct (currently only supported with IVF-PQ as the build algorithm) * - CosineExpanded (dataset norms are computed as float regardless of input data type) + * - L1 (currently only supported with NN-Descent and Iterative Search as the build algorithm) * * Usage example: * @code{.cpp} @@ -1010,6 +1013,7 @@ 
auto build(raft::resources const& res, * The following distance metrics are supported: * - L2 * - CosineExpanded (dataset norms are computed as float regardless of input data type) + * - L1 (currently only supported with NN-Descent and Iterative Search as the build algorithm) * * Usage example: * @code{.cpp} @@ -1047,6 +1051,9 @@ auto build(raft::resources const& res, * The following distance metrics are supported: * - L2 * - CosineExpanded (dataset norms are computed as float regardless of input data type) + * - L1 (currently only supported with NN-Descent and Iterative Search as the build algorithm) + * - BitwiseHamming (currently only supported with NN-Descent and Iterative Search as the build + * algorithm, and only for int8_t and uint8_t data types) * * Usage example: * @code{.cpp} @@ -1085,6 +1092,9 @@ auto build(raft::resources const& res, * - L2 * - InnerProduct (currently only supported with IVF-PQ as the build algorithm) * - CosineExpanded (dataset norms are computed as float regardless of input data type) + * - L1 (currently only supported with NN-Descent and Iterative Search as the build algorithm) + * - BitwiseHamming (currently only supported with NN-Descent and Iterative Search as the build + * algorithm, and only for int8_t and uint8_t data types) * * Usage example: * @code{.cpp} @@ -1123,6 +1133,9 @@ auto build(raft::resources const& res, * - L2 * - InnerProduct (currently only supported with IVF-PQ as the build algorithm) * - CosineExpanded (dataset norms are computed as float regardless of input data type) + * - L1 (currently only supported with NN-Descent and Iterative Search as the build algorithm) + * - BitwiseHamming (currently only supported with NN-Descent and Iterative Search as the build + * algorithm, and only for int8_t and uint8_t data types) * * Usage example: * @code{.cpp} @@ -1161,6 +1174,9 @@ auto build(raft::resources const& res, * - L2 * - InnerProduct (currently only supported with IVF-PQ as the build algorithm) * - 
CosineExpanded (dataset norms are computed as float regardless of input data type) + * - L1 (currently only supported with NN-Descent and Iterative Search as the build algorithm) + * - BitwiseHamming (currently only supported with NN-Descent and Iterative Search as the build + * algorithm, and only for int8_t and uint8_t data types) * * Usage example: * @code{.cpp} diff --git a/cpp/include/cuvs/neighbors/common.hpp b/cpp/include/cuvs/neighbors/common.hpp index 222cdc9cde..3909098398 100644 --- a/cpp/include/cuvs/neighbors/common.hpp +++ b/cpp/include/cuvs/neighbors/common.hpp @@ -224,7 +224,7 @@ template inline constexpr bool is_strided_dataset_v = is_strided_dataset::value; /** - * @brief Contstruct a strided matrix from any mdarray or mdspan. + * @brief Construct a strided matrix from any mdarray or mdspan. * * This function constructs a non-owning view if the input satisfied two conditions: * @@ -299,7 +299,7 @@ auto make_strided_dataset(const raft::resources& res, const SrcT& src, uint32_t } /** - * @brief Contstruct a strided matrix from any mdarray. + * @brief Construct a strided matrix from any mdarray. * * This function constructs an owning device matrix and copies the data. * When the data is copied, padding elements are filled with zeroes. @@ -370,7 +370,7 @@ auto make_strided_dataset( } /** - * @brief Contstruct a strided matrix from any mdarray or mdspan. + * @brief Construct a strided matrix from any mdarray or mdspan. * * A variant `make_strided_dataset` that allows specifying the byte alignment instead of the * explicit stride length. 
@@ -915,7 +915,7 @@ enum distribution_mode { /** Search mode when using a replicated index */ /// \ingroup mg_cpp_search_params enum replicated_search_mode { - /** Search queries are splited to maintain equal load on GPUs */ + /** Search queries are split to maintain equal load on GPUs */ LOAD_BALANCER, /** Each search query is processed by a single GPU in a round-robin fashion */ ROUND_ROBIN diff --git a/cpp/include/cuvs/neighbors/ivf_flat.hpp b/cpp/include/cuvs/neighbors/ivf_flat.hpp index 23c6dd4944..994b97ce6e 100644 --- a/cpp/include/cuvs/neighbors/ivf_flat.hpp +++ b/cpp/include/cuvs/neighbors/ivf_flat.hpp @@ -242,7 +242,7 @@ struct index : cuvs::neighbors::index { raft::device_vector_view inds_ptrs() const noexcept; /** - * Whether to use convervative memory allocation when extending the list (cluster) data + * Whether to use conservative memory allocation when extending the list (cluster) data * (see index_params.conservative_memory_allocation). */ bool conservative_memory_allocation() const noexcept; diff --git a/cpp/include/cuvs/neighbors/ivf_pq.hpp b/cpp/include/cuvs/neighbors/ivf_pq.hpp index 710a08cd0c..1fa7a3a7d6 100644 --- a/cpp/include/cuvs/neighbors/ivf_pq.hpp +++ b/cpp/include/cuvs/neighbors/ivf_pq.hpp @@ -567,7 +567,7 @@ class index : public index_iface, cuvs::neighbors::index { uint32_t n_lists() const noexcept; /** - * Whether to use convervative memory allocation when extending the list (cluster) data + * Whether to use conservative memory allocation when extending the list (cluster) data * (see index_params.conservative_memory_allocation). 
*/ bool conservative_memory_allocation() const noexcept override; diff --git a/cpp/include/cuvs/neighbors/nn_descent.hpp b/cpp/include/cuvs/neighbors/nn_descent.hpp index 44fbaed592..9ad548a628 100644 --- a/cpp/include/cuvs/neighbors/nn_descent.hpp +++ b/cpp/include/cuvs/neighbors/nn_descent.hpp @@ -101,7 +101,7 @@ struct index : cuvs::neighbors::index { * The type of the knn-graph is a dense raft::host_matrix and dimensions are * (n_rows, n_cols). * - * @param res raft::resources is an object mangaging resources + * @param res raft::resources is an object managing resources * @param n_rows number of rows in knn-graph * @param n_cols number of cols in knn-graph * @param return_distances whether to return distances @@ -132,7 +132,7 @@ struct index : cuvs::neighbors::index { * The type of the knn-graph is a dense raft::host_matrix and dimensions are * (n_rows, n_cols). * - * @param res raft::resources is an object mangaging resources + * @param res raft::resources is an object managing resources * @param graph_view raft::host_matrix_view for storing knn-graph * @param distances_view optional raft::device_matrix_view for storing * distances @@ -218,6 +218,7 @@ struct index : cuvs::neighbors::index { * - L2SqrtExpanded * - CosineExpanded * - InnerProduct + * - L1 * * Usage example: * @code{.cpp} @@ -231,7 +232,7 @@ struct index : cuvs::neighbors::index { * // dataset * @endcode * - * @param[in] res raft::resources is an object mangaging resources + * @param[in] res raft::resources is an object managing resources * @param[in] params an instance of nn_descent::index_params that are parameters * to run the nn-descent algorithm * @param[in] dataset raft::device_matrix_view input dataset expected to be located @@ -254,6 +255,7 @@ auto build(raft::resources const& res, * - L2SqrtExpanded * - CosineExpanded * - InnerProduct + * - L1 * * Usage example: * @code{.cpp} @@ -269,7 +271,7 @@ auto build(raft::resources const& res, * * @tparam T data-type of the input dataset * 
@tparam IdxT data-type for the output index - * @param res raft::resources is an object mangaging resources + * @param res raft::resources is an object managing resources * @param[in] params an instance of nn_descent::index_params that are parameters * to run the nn-descent algorithm * @param[in] dataset raft::host_matrix_view input dataset expected to be located @@ -292,6 +294,7 @@ auto build(raft::resources const& res, * - L2SqrtExpanded * - CosineExpanded * - InnerProduct + * - L1 * * Usage example: * @code{.cpp} @@ -305,7 +308,7 @@ auto build(raft::resources const& res, * // dataset * @endcode * - * @param[in] res raft::resources is an object mangaging resources + * @param[in] res raft::resources is an object managing resources * @param[in] params an instance of nn_descent::index_params that are parameters * to run the nn-descent algorithm * @param[in] dataset raft::device_matrix_view input dataset expected to be located @@ -328,6 +331,7 @@ auto build(raft::resources const& res, * - L2SqrtExpanded * - CosineExpanded * - InnerProduct + * - L1 * * Usage example: * @code{.cpp} @@ -343,7 +347,7 @@ auto build(raft::resources const& res, * * @tparam T data-type of the input dataset * @tparam IdxT data-type for the output index - * @param res raft::resources is an object mangaging resources + * @param res raft::resources is an object managing resources * @param[in] params an instance of nn_descent::index_params that are parameters * to run the nn-descent algorithm * @param[in] dataset raft::host_matrix_view input dataset expected to be located @@ -366,6 +370,7 @@ auto build(raft::resources const& res, * - L2SqrtExpanded * - CosineExpanded * - InnerProduct + * - L1 * - BitwiseHamming * * Usage example: @@ -380,7 +385,7 @@ auto build(raft::resources const& res, * // dataset * @endcode * - * @param[in] res raft::resources is an object mangaging resources + * @param[in] res raft::resources is an object managing resources * @param[in] params an instance of 
nn_descent::index_params that are parameters * to run the nn-descent algorithm * @param[in] dataset raft::device_matrix_view input dataset expected to be located @@ -403,6 +408,7 @@ auto build(raft::resources const& res, * - L2SqrtExpanded * - CosineExpanded * - InnerProduct + * - L1 * - BitwiseHamming * * Usage example: @@ -419,7 +425,7 @@ auto build(raft::resources const& res, * * @tparam T data-type of the input dataset * @tparam IdxT data-type for the output index - * @param res raft::resources is an object mangaging resources + * @param res raft::resources is an object managing resources * @param[in] params an instance of nn_descent::index_params that are parameters * to run the nn-descent algorithm * @param[in] dataset raft::host_matrix_view input dataset expected to be located @@ -442,6 +448,7 @@ auto build(raft::resources const& res, * - L2SqrtExpanded * - CosineExpanded * - InnerProduct + * - L1 * - BitwiseHamming * * Usage example: @@ -456,7 +463,7 @@ auto build(raft::resources const& res, * // dataset * @endcode * - * @param[in] res raft::resources is an object mangaging resources + * @param[in] res raft::resources is an object managing resources * @param[in] params an instance of nn_descent::index_params that are parameters * to run the nn-descent algorithm * @param[in] dataset raft::device_matrix_view input dataset expected to be located @@ -479,6 +486,7 @@ auto build(raft::resources const& res, * - L2SqrtExpanded * - CosineExpanded * - InnerProduct + * - L1 * - BitwiseHamming * * Usage example: @@ -495,7 +503,7 @@ auto build(raft::resources const& res, * * @tparam T data-type of the input dataset * @tparam IdxT data-type for the output index - * @param res raft::resources is an object mangaging resources + * @param res raft::resources is an object managing resources * @param[in] params an instance of nn_descent::index_params that are parameters * to run the nn-descent algorithm * @param[in] dataset raft::host_matrix_view input dataset expected to be 
located diff --git a/cpp/include/cuvs/neighbors/scann.hpp b/cpp/include/cuvs/neighbors/scann.hpp index df13307a73..56948c3a74 100644 --- a/cpp/include/cuvs/neighbors/scann.hpp +++ b/cpp/include/cuvs/neighbors/scann.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -266,7 +266,7 @@ struct index : cuvs::neighbors::index { * The bits of __nv_bfloat16 are stored here reinterpreted as int16_t * * int16_t is used for two reaosns: - * * OSS ScaNN expects int16_t, so the serialzed bf16_dataset_ can be consumed + * * OSS ScaNN expects int16_t, so the serialized bf16_dataset_ can be consumed * without any additional post-processing * * For AVQ, we need to find the next bfloat16 number that is larger/smaller than a * given float. This is equivalent to incrementing/decrementing the mantissa diff --git a/cpp/include/cuvs/neighbors/vamana.hpp b/cpp/include/cuvs/neighbors/vamana.hpp index 6c0fc94c1e..c3ba86d5b6 100644 --- a/cpp/include/cuvs/neighbors/vamana.hpp +++ b/cpp/include/cuvs/neighbors/vamana.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -42,7 +42,7 @@ struct codebook_params { /** * @brief Parameters used to build DiskANN index * - * `graph_degree`: Maximum degree of graph; correspods to the R parameter of + * `graph_degree`: Maximum degree of graph; corresponds to the R parameter of * Vamana algorithm in the literature. * `visited_size`: Maximum number of visited nodes per search during Vamana algorithm. * Loosely corresponds to the L parameter in the literature. 
diff --git a/cpp/internal/cuvs_internal/neighbors/naive_knn.cuh b/cpp/internal/cuvs_internal/neighbors/naive_knn.cuh index 6c7577065b..7bc37193a0 100644 --- a/cpp/internal/cuvs_internal/neighbors/naive_knn.cuh +++ b/cpp/internal/cuvs_internal/neighbors/naive_knn.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -13,7 +13,8 @@ #include #include #include -#include +#include +#include namespace cuvs::neighbors { @@ -87,8 +88,7 @@ void naive_knn(raft::resources const& handle, uint32_t k, cuvs::distance::DistanceType type) { - rmm::mr::device_memory_resource* mr = nullptr; - auto pool_guard = raft::get_pool_memory_resource(mr, 1024 * 1024); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource_ref(); auto stream = raft::resource::get_cuda_stream(handle); dim3 block_dim(16, 32, 1); @@ -116,8 +116,7 @@ void naive_knn(raft::resources const& handle, static_cast(k), dist_topk + offset * k, indices_topk + offset * k, - type != cuvs::distance::DistanceType::InnerProduct, - mr); + type != cuvs::distance::DistanceType::InnerProduct); } RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } diff --git a/cpp/src/cluster/detail/kmeans_balanced.cuh b/cpp/src/cluster/detail/kmeans_balanced.cuh index f5dc759725..aab18680ac 100644 --- a/cpp/src/cluster/detail/kmeans_balanced.cuh +++ b/cpp/src/cluster/detail/kmeans_balanced.cuh @@ -364,7 +364,7 @@ void compute_norm(const raft::resources& handle, raft::common::nvtx::range fun_scope("compute_norm"); auto stream = raft::resource::get_cuda_stream(handle); rmm::device_uvector mapped_dataset( - 0, stream, mr.value_or(raft::resource::get_workspace_resource(handle))); + 0, stream, mr.value_or(raft::resource::get_workspace_resource_ref(handle))); const MathT* dataset_ptr = nullptr; @@ -426,7 +426,7 @@ void predict(const raft::resources& handle, auto stream = 
raft::resource::get_cuda_stream(handle); raft::common::nvtx::range fun_scope( "predict(%zu, %u)", static_cast(n_rows), n_clusters); - auto mem_res = mr.value_or(raft::resource::get_workspace_resource(handle)); + auto mem_res = mr.value_or(raft::resource::get_workspace_resource_ref(handle)); auto [max_minibatch_size, _mem_per_row] = calc_minibatch_size(n_clusters, n_rows, dim, params.metric, std::is_same_v); rmm::device_uvector cur_dataset( @@ -1038,7 +1038,7 @@ void build_hierarchical(const raft::resources& handle, // TODO: Remove the explicit managed memory- we shouldn't be creating this on the user's behalf. rmm::mr::managed_memory_resource managed_memory; - rmm::device_async_resource_ref device_memory = raft::resource::get_workspace_resource(handle); + rmm::device_async_resource_ref device_memory = raft::resource::get_workspace_resource_ref(handle); auto [max_minibatch_size, mem_per_row] = calc_minibatch_size(n_clusters, n_rows, dim, params.metric, std::is_same_v); @@ -1079,8 +1079,8 @@ void build_hierarchical(const raft::resources& handle, CounterT; // build coarse clusters (mesoclusters) - rmm::device_uvector mesocluster_labels_buf(n_rows, stream, &managed_memory); - rmm::device_uvector mesocluster_sizes_buf(n_mesoclusters, stream, &managed_memory); + rmm::device_uvector mesocluster_labels_buf(n_rows, stream, managed_memory); + rmm::device_uvector mesocluster_sizes_buf(n_mesoclusters, stream, managed_memory); { rmm::device_uvector mesocluster_centers_buf(n_mesoclusters * dim, stream, device_memory); build_clusters(handle, @@ -1136,7 +1136,7 @@ void build_hierarchical(const raft::resources& handle, fine_clusters_nums_max, cluster_centers, mapping_op, - &managed_memory, + managed_memory, device_memory); RAFT_EXPECTS(n_clusters_done == n_clusters, "Didn't process all clusters."); diff --git a/cpp/src/cluster/detail/single_linkage.cuh b/cpp/src/cluster/detail/single_linkage.cuh index f8d4615c75..dc0a5dca75 100644 --- a/cpp/src/cluster/detail/single_linkage.cuh +++ 
b/cpp/src/cluster/detail/single_linkage.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2021-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -69,7 +69,7 @@ void build_mr_linkage( auto dists = raft::make_device_matrix(handle, m, min_samples); if (all_neighbors_p.metric != metric) { - RAFT_LOG_WARN("Setting all neighbors metric to given metrix for build_mr_linkage"); + RAFT_LOG_WARN("Setting all neighbors metric to given metric for build_mr_linkage"); all_neighbors_p.metric = metric; } cuvs::neighbors::all_neighbors::build( @@ -250,7 +250,7 @@ void build_dist_linkage(raft::resources const& handle, * @param[in] X dense input matrix in row-major layout * @param[in] m number of rows in X * @param[in] n number of columns in X - * @param[in] metric distance metrix to use when constructing connectivities graph + * @param[in] metric distance metric to use when constructing connectivities graph * @param[out] out struct containing output dendrogram and cluster assignments * @param[in] c a constant used when constructing connectivities from knn graph. 
Allows the indirect control diff --git a/cpp/src/cluster/kmeans_balanced.cuh b/cpp/src/cluster/kmeans_balanced.cuh index 0c0df03397..f3f52c2d8f 100644 --- a/cpp/src/cluster/kmeans_balanced.cuh +++ b/cpp/src/cluster/kmeans_balanced.cuh @@ -154,7 +154,7 @@ void predict(const raft::resources& handle, X.extent(0), labels.data_handle(), mapping_op, - raft::resource::get_workspace_resource(handle)); + raft::resource::get_workspace_resource_ref(handle)); } namespace helpers { @@ -305,7 +305,7 @@ void calc_centers_and_sizes(const raft::resources& handle, labels.data_handle(), reset_counters, mapping_op, - raft::resource::get_workspace_resource(handle)); + raft::resource::get_workspace_resource_ref(handle)); } } // namespace helpers diff --git a/cpp/src/cluster/kmeans_balanced_build_clusters_impl.cuh b/cpp/src/cluster/kmeans_balanced_build_clusters_impl.cuh index 6e60611df6..2bce856c6c 100644 --- a/cpp/src/cluster/kmeans_balanced_build_clusters_impl.cuh +++ b/cpp/src/cluster/kmeans_balanced_build_clusters_impl.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -68,7 +68,7 @@ void build_clusters(const raft::resources& handle, labels.data_handle(), cluster_sizes.data_handle(), mapping_op, - raft::resource::get_workspace_resource(handle), + raft::resource::get_workspace_resource_ref(handle), X_norm.has_value() ? X_norm.value().data_handle() : nullptr); } diff --git a/cpp/src/cluster/single_linkage.cuh b/cpp/src/cluster/single_linkage.cuh index 4dcb2b52c1..b1e9794081 100644 --- a/cpp/src/cluster/single_linkage.cuh +++ b/cpp/src/cluster/single_linkage.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2021-2023, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2021-2026, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ #pragma once @@ -29,7 +29,7 @@ namespace cuvs::cluster::agglomerative { * @param[in] X dense input matrix in row-major layout * @param[in] m number of rows in X * @param[in] n number of columns in X - * @param[in] metric distance metrix to use when constructing connectivities graph + * @param[in] metric distance metric to use when constructing connectivities graph * @param[out] out struct containing output dendrogram and cluster assignments * @param[in] c a constant used when constructing connectivities from knn graph. Allows the indirect control @@ -63,7 +63,7 @@ void single_linkage(raft::resources const& handle, * @param[in] X dense input matrix in row-major layout * @param[out] dendrogram output dendrogram (size [n_rows - 1] * 2) * @param[out] labels output labels vector (size n_rows) - * @param[in] metric distance metrix to use when constructing connectivities graph + * @param[in] metric distance metric to use when constructing connectivities graph * @param[in] n_clusters number of clusters to assign data samples * @param[in] c a constant used when constructing connectivities from knn graph. Allows the indirect control of k. 
The algorithm will set `k = log(n) + c` diff --git a/cpp/src/detail/jit_lto/AlgorithmPlanner.cpp b/cpp/src/detail/jit_lto/AlgorithmPlanner.cpp index dbb1f09c30..6622476687 100644 --- a/cpp/src/detail/jit_lto/AlgorithmPlanner.cpp +++ b/cpp/src/detail/jit_lto/AlgorithmPlanner.cpp @@ -3,8 +3,6 @@ * SPDX-License-Identifier: Apache-2.0 */ -#include "nvjitlink_checker.hpp" - #include #include #include @@ -14,7 +12,8 @@ #include #include -#include +#include +#include #include "cuda_runtime.h" #include "nvJitLink.h" @@ -22,25 +21,16 @@ #include #include -void AlgorithmPlanner::add_entrypoint() +void AlgorithmPlanner::add_fragment(const FragmentEntry& fragment) { - auto entrypoint_fragment = fragment_database().get_fragment(this->fragment_key); - this->fragments.push_back(entrypoint_fragment); -} - -void AlgorithmPlanner::add_device_functions() -{ - for (const auto& device_function_key : this->device_functions) { - auto device_function_fragment = fragment_database().get_fragment(device_function_key); - this->fragments.push_back(device_function_fragment); - } + fragments.push_back(&fragment); } -std::string AlgorithmPlanner::get_device_functions_key() const +std::string AlgorithmPlanner::get_fragments_key() const { std::string key = ""; - for (const auto& device_function : this->device_functions) { - key += device_function; + for (const auto* fragment : this->fragments) { + key += fragment->get_key(); } return key; } @@ -48,17 +38,15 @@ std::string AlgorithmPlanner::get_device_functions_key() const std::shared_ptr AlgorithmPlanner::get_launcher() { auto& launchers = get_cached_launchers(); - auto launch_key = this->fragment_key + this->get_device_functions_key(); + auto launch_key = this->get_fragments_key(); static std::mutex cache_mutex; std::lock_guard lock(cache_mutex); if (launchers.count(launch_key) == 0) { - add_entrypoint(); - add_device_functions(); std::string log_message = - "JIT compiling launcher for kernel: " + this->fragment_key + " and device functions: "; - 
for (const auto& device_function : this->device_functions) { - log_message += device_function + ","; + "JIT compiling launcher for kernel: " + this->entrypoint + " and device functions: "; + for (const auto* fragment : this->fragments) { + log_message += std::string{fragment->get_key()} + ","; } log_message.pop_back(); RAFT_LOG_INFO("%s", log_message.c_str()); diff --git a/cpp/src/detail/jit_lto/FragmentDatabase.cpp b/cpp/src/detail/jit_lto/FragmentDatabase.cpp deleted file mode 100644 index 02ea688a0d..0000000000 --- a/cpp/src/detail/jit_lto/FragmentDatabase.cpp +++ /dev/null @@ -1,47 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ - -#include -#include - -#include - -FragmentDatabase::FragmentDatabase() {} - -bool FragmentDatabase::make_cache_entry(std::string const& key) -{ - if (this->cache.count(key) == 0) { - this->cache[key] = std::unique_ptr{}; - return false; - } - return true; -} - -FragmentDatabase& fragment_database() -{ - static FragmentDatabase database; - return database; -} - -FragmentEntry* FragmentDatabase::get_fragment(std::string const& key) -{ - auto& db = fragment_database(); - auto val = db.cache.find(key); - RAFT_EXPECTS(val != db.cache.end(), "FragmentDatabase: Key not found: %s", key.c_str()); - return val->second.get(); -} - -void registerFatbinFragment(std::string const& algo, - std::string const& params, - unsigned char const* blob, - std::size_t size) -{ - auto& planner = fragment_database(); - std::string key = algo; - if (!params.empty()) { key += "_" + params; } - auto entry_exists = planner.make_cache_entry(key); - if (entry_exists) { return; } - planner.cache[key] = std::make_unique(key, blob, size); -} diff --git a/cpp/src/detail/jit_lto/FragmentEntry.cpp b/cpp/src/detail/jit_lto/FragmentEntry.cpp index af1fb90e58..bf0893c8a6 100644 --- a/cpp/src/detail/jit_lto/FragmentEntry.cpp +++ b/cpp/src/detail/jit_lto/FragmentEntry.cpp @@ -3,25 +3,11 @@ * 
SPDX-License-Identifier: Apache-2.0 */ -#include "nvjitlink_checker.hpp" - -#include - #include -FragmentEntry::FragmentEntry(std::string const& key) : compute_key(key) {} - -FatbinFragmentEntry::FatbinFragmentEntry(std::string const& key, - unsigned char const* view, - std::size_t size) - : FragmentEntry(key), data_size(size), data_view(view) -{ -} - bool FatbinFragmentEntry::add_to(nvJitLinkHandle& handle) const { - auto result = nvJitLinkAddData( - handle, NVJITLINK_INPUT_ANY, this->data_view, this->data_size, this->compute_key.c_str()); + auto result = nvJitLinkAddData(handle, NVJITLINK_INPUT_ANY, get_data(), get_length(), get_key()); check_nvjitlink_result(handle, result); return true; diff --git a/cpp/src/detail/jit_lto/nvjitlink_checker.cpp b/cpp/src/detail/jit_lto/nvjitlink_checker.cpp index 6f9ae988db..95f4725362 100644 --- a/cpp/src/detail/jit_lto/nvjitlink_checker.cpp +++ b/cpp/src/detail/jit_lto/nvjitlink_checker.cpp @@ -3,7 +3,7 @@ * SPDX-License-Identifier: Apache-2.0 */ -#include "nvjitlink_checker.hpp" +#include #include #include diff --git a/cpp/src/distance/detail/distance.cuh b/cpp/src/distance/detail/distance.cuh index 44b6ae5a63..ab06bc3ba1 100644 --- a/cpp/src/distance/detail/distance.cuh +++ b/cpp/src/distance/detail/distance.cuh @@ -772,12 +772,12 @@ void distance_impl(raft::resources const& handle, } /** - * @brief Evaluate pairwise distances with the user epilogue lamba allowed + * @brief Evaluate pairwise distances with the user epilogue lambda allowed * @tparam DistanceType which distance to evaluate * @tparam InType input argument type * @tparam AccType accumulation type * @tparam OutType output type - * @tparam FinalLambda user-defined epilogue lamba + * @tparam FinalLambda user-defined epilogue lambda * @tparam Index_ Index type * * @param x first set of points diff --git a/cpp/src/distance/detail/fused_distance_nn/custom_epilogue_with_broadcast.h b/cpp/src/distance/detail/fused_distance_nn/custom_epilogue_with_broadcast.h index 
b22826c8dd..57a66a6e61 100644 --- a/cpp/src/distance/detail/fused_distance_nn/custom_epilogue_with_broadcast.h +++ b/cpp/src/distance/detail/fused_distance_nn/custom_epilogue_with_broadcast.h @@ -1,7 +1,7 @@ // clang-format off /* * SPDX-FileCopyrightText: Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause */ // clang-format on @@ -502,7 +502,7 @@ class EpilogueWithBroadcastCustom : public EpilogueBase(raft::resource::get_workspace_resource(handle)); - auto stream = raft::resource::get_cuda_stream(handle); + rmm::device_async_resource_ref ws_mr = raft::resource::get_workspace_resource_ref(handle); + auto stream = raft::resource::get_cuda_stream(handle); // Acquire temporary buffers and initialize to zero: // 1) Adjacency matrix bitfield diff --git a/cpp/src/distance/detail/predicated_tile_iterator_normvec.h b/cpp/src/distance/detail/predicated_tile_iterator_normvec.h index cacf4e6065..8ba07ee9e9 100644 --- a/cpp/src/distance/detail/predicated_tile_iterator_normvec.h +++ b/cpp/src/distance/detail/predicated_tile_iterator_normvec.h @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2018-2023, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2018-2026, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ @@ -46,7 +46,7 @@ namespace threadblock { /// /// Satisfies: ReadableTileIterator | PredicatedTileIterator | ForwardTileIterator /// -template ::search( size_t buffer_size = num_queries * K * num_indices; auto main_stream = raft::resource::get_cuda_stream(handle); - auto tmp_res = raft::resource::get_workspace_resource(handle); + auto tmp_res = raft::resource::get_workspace_resource_ref(handle); rmm::device_uvector neighbors_buffer(buffer_size, main_stream, tmp_res); rmm::device_uvector distances_buffer(buffer_size, main_stream, tmp_res); diff --git a/cpp/src/neighbors/detail/ann_utils.cuh b/cpp/src/neighbors/detail/ann_utils.cuh index 82bd6e755a..a7872f87a0 100644 --- a/cpp/src/neighbors/detail/ann_utils.cuh +++ b/cpp/src/neighbors/detail/ann_utils.cuh @@ -572,13 +572,14 @@ struct batch_load_iterator { * @param mr a custom memory resource for the intermediate buffer, if applicable. * @param prefetch enable prefetch feature in order to achieve kernel/copy overlapping. 
*/ - batch_load_iterator(const T* source, - size_type n_rows, - size_type row_width, - size_type batch_size, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource(), - bool prefetch = false) + batch_load_iterator( + const T* source, + size_type n_rows, + size_type row_width, + size_type batch_size, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource_ref(), + bool prefetch = false) : cur_batch_(new batch(source, n_rows, row_width, batch_size, stream, mr, prefetch)), cur_pos_(0), cur_prefetch_pos_(0) diff --git a/cpp/src/neighbors/detail/cagra/add_nodes.cuh b/cpp/src/neighbors/detail/cagra/add_nodes.cuh index 8d6ac67d83..38975ad5bc 100644 --- a/cpp/src/neighbors/detail/cagra/add_nodes.cuh +++ b/cpp/src/neighbors/detail/cagra/add_nodes.cuh @@ -69,7 +69,7 @@ void add_node_core( params.itopk_size = std::max(base_degree * 2lu, 256lu); // Memory space for rank-based neighbor list - auto mr = raft::resource::get_workspace_resource(handle); + auto mr = raft::resource::get_workspace_resource_ref(handle); auto neighbor_indices = raft::make_device_mdarray( handle, mr, raft::make_extents(max_search_batch_size, base_degree)); diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh index f433a4513a..dd2042bd12 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh @@ -5,7 +5,7 @@ #pragma once #include "../../../core/nvtx.hpp" -#include "../../vpq_dataset.cuh" +#include "../../../preprocessing/quantize/vpq_build-ext.cuh" #include "graph_core.cuh" #include @@ -1696,8 +1696,8 @@ void build_knn_graph( // If the workspace is smaller than desired, put the I/O buffers into the large workspace. rmm::device_async_resource_ref workspace_mr = - use_large_workspace ? 
raft::resource::get_large_workspace_resource(res) - : raft::resource::get_workspace_resource(res); + use_large_workspace ? raft::resource::get_large_workspace_resource_ref(res) + : raft::resource::get_workspace_resource_ref(res); RAFT_LOG_DEBUG( "IVF-PQ search node_degree: %d, top_k: %d, gpu_top_k: %d, max_batch_size:: %d, n_probes: %u", @@ -2116,7 +2116,7 @@ auto iterative_build_graph( dev_query_view.extent(1), max_chunk_size, raft::resource::get_cuda_stream(res), - raft::resource::get_workspace_resource(res)); + raft::resource::get_workspace_resource_ref(res)); for (const auto& batch : query_batch) { auto batch_dev_query_view = raft::make_device_matrix_view( batch.data(), batch.size(), dev_query_view.extent(1)); @@ -2279,8 +2279,7 @@ index build( idx.update_dataset( res, // TODO: hardcoding codebook math to `half`, we can do runtime dispatching later - cuvs::neighbors::vpq_build( - res, *params.compression, dataset)); + cuvs::preprocessing::quantize::pq::vpq_build(res, *params.compression, dataset)); return idx; } diff --git a/cpp/src/neighbors/detail/cagra/factory.cuh b/cpp/src/neighbors/detail/cagra/factory.cuh index a767d16530..75f236a65d 100644 --- a/cpp/src/neighbors/detail/cagra/factory.cuh +++ b/cpp/src/neighbors/detail/cagra/factory.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ @@ -11,6 +11,9 @@ #include "search_plan.cuh" #include "search_single_cta.cuh" +#include +#include + #include namespace cuvs::neighbors::cagra::detail { diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh index d94e279829..d16759b2b0 100644 --- a/cpp/src/neighbors/detail/cagra/graph_core.cuh +++ b/cpp/src/neighbors/detail/cagra/graph_core.cuh @@ -530,7 +530,7 @@ void sort_knn_graph( const uint64_t input_graph_degree = knn_graph.extent(1); IdxT* const input_graph_ptr = knn_graph.data_handle(); - auto large_tmp_mr = raft::resource::get_large_workspace_resource(res); + auto large_tmp_mr = raft::resource::get_large_workspace_resource_ref(res); auto d_input_graph = raft::make_device_mdarray( res, large_tmp_mr, raft::make_extents(graph_size, input_graph_degree)); @@ -1048,7 +1048,7 @@ void mst_optimization(raft::resources const& res, msg += ", total_num_edges: " + std::to_string(total_outgoing_edges) + ", " + std::to_string(total_incoming_edges); if (num_alternate + num_failure > 0) { - msg += ", altenate: " + std::to_string(num_alternate); + msg += ", alternate: " + std::to_string(num_alternate); if (num_failure > 0) { msg += ", failure: " + std::to_string(num_failure); } } RAFT_LOG_DEBUG("%s", msg.c_str()); @@ -1156,7 +1156,7 @@ void optimize( { RAFT_LOG_DEBUG( "# Pruning kNN graph (size=%lu, degree=%lu)\n", knn_graph.extent(0), knn_graph.extent(1)); - auto large_tmp_mr = raft::resource::get_large_workspace_resource(res); + auto large_tmp_mr = raft::resource::get_large_workspace_resource_ref(res); RAFT_EXPECTS(knn_graph.extent(0) == new_graph.extent(0), "Each input array is expected to have the same number of rows"); diff --git a/cpp/src/neighbors/detail/cagra/search_plan.cuh b/cpp/src/neighbors/detail/cagra/search_plan.cuh index 9cc6aeb353..15ebbf2e83 100644 --- a/cpp/src/neighbors/detail/cagra/search_plan.cuh +++ b/cpp/src/neighbors/detail/cagra/search_plan.cuh @@ -57,7 
+57,7 @@ struct lightweight_uvector { if (new_size == size_) { return; } if (std::holds_alternative(res_)) { auto& h = std::get(res_); - res_ = rmm_res_type{raft::resource::get_workspace_resource(*h), + res_ = rmm_res_type{raft::resource::get_workspace_resource_ref(*h), raft::resource::get_cuda_stream(*h)}; } auto& [r, s] = std::get(res_); @@ -79,7 +79,7 @@ struct lightweight_uvector { if (new_size == size_) { return; } if (std::holds_alternative(res_)) { auto& h = std::get(res_); - res_ = rmm_res_type{raft::resource::get_workspace_resource(*h), stream}; + res_ = rmm_res_type{raft::resource::get_workspace_resource_ref(*h), stream}; } else { std::get(std::get(res_)) = stream; } diff --git a/cpp/src/neighbors/detail/cagra/utils.hpp b/cpp/src/neighbors/detail/cagra/utils.hpp index 75c51f4da7..8e802fc40e 100644 --- a/cpp/src/neighbors/detail/cagra/utils.hpp +++ b/cpp/src/neighbors/detail/cagra/utils.hpp @@ -183,7 +183,7 @@ class device_matrix_view_from_host { // live on stack and not returned to a user. // The user may opt to set this resource to managed memory to allow large allocations. 
device_mem_.emplace(raft::make_device_mdarray( - res, raft::resource::get_large_workspace_resource(res), host_view.extents())); + res, raft::resource::get_large_workspace_resource_ref(res), host_view.extents())); raft::copy(res, device_mem_->view(), host_view); device_ptr = device_mem_->data_handle(); } @@ -269,7 +269,7 @@ void copy_with_padding( raft::resources const& res, raft::device_matrix& dst, raft::mdspan, raft::row_major, data_accessor> src, - rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource_ref()) { size_t padded_dim = raft::round_up_safe(src.extent(1) * sizeof(T), 16) / sizeof(T); diff --git a/cpp/src/neighbors/detail/fused_l2_knn.cuh b/cpp/src/neighbors/detail/fused_l2_knn.cuh index 6d555c54e7..7dda9e1dab 100644 --- a/cpp/src/neighbors/detail/fused_l2_knn.cuh +++ b/cpp/src/neighbors/detail/fused_l2_knn.cuh @@ -362,7 +362,7 @@ __launch_bounds__(Policy::Nthreads, 2) RAFT_KERNEL fusedL2kNN(const DataT* x, heapArr[i]->warpKTop = tempKV.value; } - // total vals can atmost be 256, (32*8) + // total vals can at most be 256, (32*8) int numValsWarpTopK[Policy::AccRowsPerTh]; int anyWarpTopKs = 0; #pragma unroll diff --git a/cpp/src/neighbors/detail/hnsw.hpp b/cpp/src/neighbors/detail/hnsw.hpp index 9ef1d0470f..bdb3697809 100644 --- a/cpp/src/neighbors/detail/hnsw.hpp +++ b/cpp/src/neighbors/detail/hnsw.hpp @@ -36,7 +36,7 @@ namespace cuvs::neighbors::hnsw::detail { // This is needed as hnswlib hardcodes the distance type to float // or int32_t in certain places. However, we can solve uint8 or int8 -// natively with the pacth cuVS applies. We could potentially remove +// natively with the patch cuVS applies. We could potentially remove // all the hardcodes and propagate templates throughout hnswlib, but // as of now it's not needed. 
template diff --git a/cpp/src/neighbors/detail/nn_descent.cuh b/cpp/src/neighbors/detail/nn_descent.cuh index a1eb829569..2e50aefc15 100644 --- a/cpp/src/neighbors/detail/nn_descent.cuh +++ b/cpp/src/neighbors/detail/nn_descent.cuh @@ -122,6 +122,25 @@ constexpr __host__ __device__ __forceinline__ int skew_dim(int ndim) } } +template +struct dtype_traits; + +template <> +struct dtype_traits { + static constexpr int APAD = 4; + static constexpr int BPAD = 4; + static constexpr int TILE_COL_WIDTH = 32; + static __device__ __forceinline__ float to_float(float v) { return v; } +}; + +template <> +struct dtype_traits<__half> { + static constexpr int APAD = 8; + static constexpr int BPAD = 8; + static constexpr int TILE_COL_WIDTH = 64; + static __device__ __forceinline__ float to_float(__half v) { return __half2float(v); } +}; + template __device__ __forceinline__ ResultItem xor_swap(ResultItem x, int mask, int dir) { @@ -272,7 +291,8 @@ RAFT_KERNEL preprocess_data_kernel( for (int step = 0; step < raft::ceildiv(dim, raft::warp_size()); step++) { int idx = step * raft::warp_size() + threadIdx.x; if (idx < dim) { - if (metric == cuvs::distance::DistanceType::InnerProduct) { + if (metric == cuvs::distance::DistanceType::InnerProduct || + metric == cuvs::distance::DistanceType::L1) { output_data[list_id * dim + idx] = input_data[(size_t)blockIdx.x * dim + idx]; } else if (metric == cuvs::distance::DistanceType::CosineExpanded) { output_data[list_id * dim + idx] = @@ -518,7 +538,8 @@ __device__ __forceinline__ void calculate_metric(float* s_distances, for (int d = 0; d < data_dim; d++) { s_distances[i] += __popc(static_cast(data_n1[d] ^ data_n2[d]) & 0xff); } - } else { // L2Expanded or L2SqrtExpanded + } else if (metric == cuvs::distance::DistanceType::L2Expanded || + metric == cuvs::distance::DistanceType::L2SqrtExpanded) { s_distances[i] = l2_norms[row_neighbors[row_id]] + l2_norms[col_neighbors[col_id]] - 2.0 * s_distances[i]; // for fp32 vs fp16 precision differences 
resulting in negative distances when distance @@ -535,13 +556,29 @@ __device__ __forceinline__ void calculate_metric(float* s_distances, } } +struct DistAccumulator { + cuvs::distance::DistanceType metric; + __device__ __forceinline__ float operator()(float a, float b) const + { + if (metric == cuvs::distance::DistanceType::L1) { return raft::abs(a - b); } + // dot product: reused by IP, cosine, and L2 (postprocessed in calculate_metric) + return a * b; + } +}; + // launch_bounds here denote BLOCK_SIZE = 512 and MIN_BLOCKS_PER_SM = 4 // Per // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications, // MAX_RESIDENT_THREAD_PER_SM = BLOCK_SIZE * BLOCKS_PER_SM = 2048 // For architectures 750 and 860 (890), the values for MAX_RESIDENT_THREAD_PER_SM // is 1024 and 1536 respectively, which means the bounds don't work anymore -template , typename DistEpilogue_t> +// SIMT kernel: scalar element-wise distance computation. +// Used for fp32 data (all metrics) and fp16 data with L1 distance (which cannot use tensor cores). 
+template , + typename DistEpilogue_t> + requires(std::is_same_v || std::is_same_v) RAFT_KERNEL #ifdef __CUDA_ARCH__ // Use minBlocksPerMultiprocessor = 4 on specific arches @@ -552,32 +589,31 @@ __launch_bounds__(BLOCK_SIZE, 4) __launch_bounds__(BLOCK_SIZE) #endif #endif - local_join_kernel(const Index_t* graph_new, - const Index_t* rev_graph_new, - const int2* sizes_new, - const Index_t* graph_old, - const Index_t* rev_graph_old, - const int2* sizes_old, - const int width, - const float* data, - const int data_dim, - ID_t* graph, - DistData_t* dists, - int graph_width, - int* locks, - DistData_t* l2_norms, - cuvs::distance::DistanceType metric, - DistEpilogue_t dist_epilogue) + local_join_kernel_simt(const Index_t* graph_new, + const Index_t* rev_graph_new, + const int2* sizes_new, + const Index_t* graph_old, + const Index_t* rev_graph_old, + const int2* sizes_old, + const int width, + const Data_t* data, + const int data_dim, + ID_t* graph, + DistData_t* dists, + int graph_width, + int* locks, + DistData_t* l2_norms, + cuvs::distance::DistanceType metric, + DistEpilogue_t dist_epilogue) { #if (__CUDA_ARCH__ >= 700) - using namespace nvcuda; __shared__ int s_list[MAX_NUM_BI_SAMPLES * 2]; - constexpr int APAD = 4; - constexpr int BPAD = 4; - constexpr int TILE_COL_WIDTH = 32; - __shared__ float s_nv[MAX_NUM_BI_SAMPLES][TILE_COL_WIDTH + APAD]; - __shared__ float s_ov[MAX_NUM_BI_SAMPLES][TILE_COL_WIDTH + BPAD]; + constexpr int APAD = dtype_traits::APAD; + constexpr int BPAD = dtype_traits::BPAD; + constexpr int TILE_COL_WIDTH = dtype_traits::TILE_COL_WIDTH; + __shared__ Data_t s_nv[MAX_NUM_BI_SAMPLES][TILE_COL_WIDTH + APAD]; + __shared__ Data_t s_ov[MAX_NUM_BI_SAMPLES][TILE_COL_WIDTH + BPAD]; __shared__ float s_distances[MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES]; // s_distances: MAX_NUM_BI_SAMPLES x SKEWED_MAX_NUM_BI_SAMPLES, reuse the space of s_ov @@ -635,48 +671,49 @@ __launch_bounds__(BLOCK_SIZE) int lane_id = threadIdx.x % raft::warp_size(); constexpr int 
num_warps = BLOCK_SIZE / raft::warp_size(); - if (metric != cuvs::distance::DistanceType::BitwiseHamming) { - int tid = threadIdx.x; - for (int i = tid; i < MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES; i += blockDim.x) - s_distances[i] = 0.0f; + DistAccumulator dist_acc(metric); - __syncthreads(); + int tid = threadIdx.x; + for (int i = tid; i < MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES; i += blockDim.x) + s_distances[i] = 0.0f; - for (int step = 0; step < raft::ceildiv(data_dim, TILE_COL_WIDTH); step++) { - int num_load_elems = (step == raft::ceildiv(data_dim, TILE_COL_WIDTH) - 1) - ? data_dim - step * TILE_COL_WIDTH - : TILE_COL_WIDTH; + __syncthreads(); + + for (int step = 0; step < raft::ceildiv(data_dim, TILE_COL_WIDTH); step++) { + int num_load_elems = (step == raft::ceildiv(data_dim, TILE_COL_WIDTH) - 1) + ? data_dim - step * TILE_COL_WIDTH + : TILE_COL_WIDTH; #pragma unroll - for (int i = 0; i < MAX_NUM_BI_SAMPLES / num_warps; i++) { - int idx = i * num_warps + warp_id; - if (idx < list_new_size) { - size_t neighbor_id = new_neighbors[idx]; - size_t idx_in_data = neighbor_id * data_dim; - load_vec(s_nv[idx], - data + idx_in_data + step * TILE_COL_WIDTH, - num_load_elems, - TILE_COL_WIDTH, - lane_id); - } + for (int i = 0; i < MAX_NUM_BI_SAMPLES / num_warps; i++) { + int idx = i * num_warps + warp_id; + if (idx < list_new_size) { + size_t neighbor_id = new_neighbors[idx]; + size_t idx_in_data = neighbor_id * data_dim; + load_vec(s_nv[idx], + data + idx_in_data + step * TILE_COL_WIDTH, + num_load_elems, + TILE_COL_WIDTH, + lane_id); } - __syncthreads(); + } + __syncthreads(); - // this is much faster than a warp-collaborative multiplication because MAX_NUM_BI_SAMPLES is - // fixed and small (64) - for (int i = threadIdx.x; i < MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES; - i += blockDim.x) { - int tmp_row = i / SKEWED_MAX_NUM_BI_SAMPLES; - int tmp_col = i % SKEWED_MAX_NUM_BI_SAMPLES; - if (tmp_row < list_new_size && tmp_col < list_new_size) { - 
float acc = 0.0f; - for (int d = 0; d < num_load_elems; d++) { - acc += s_nv[tmp_row][d] * s_nv[tmp_col][d]; - } - s_distances[i] += acc; + // this is much faster than a warp-collaborative multiplication because MAX_NUM_BI_SAMPLES is + // fixed and small (64) + for (int i = threadIdx.x; i < MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES; i += blockDim.x) { + int tmp_row = i / SKEWED_MAX_NUM_BI_SAMPLES; + int tmp_col = i % SKEWED_MAX_NUM_BI_SAMPLES; + if (tmp_row < list_new_size && tmp_col < list_new_size) { + float acc = 0.0f; + for (int d = 0; d < num_load_elems; d++) { + float a = dtype_traits::to_float(s_nv[tmp_row][d]); + float b = dtype_traits::to_float(s_nv[tmp_col][d]); + acc += dist_acc(a, b); } + s_distances[i] += acc; } - __syncthreads(); } + __syncthreads(); } __syncthreads(); @@ -706,63 +743,61 @@ __launch_bounds__(BLOCK_SIZE) __syncthreads(); - if (metric != cuvs::distance::DistanceType::BitwiseHamming) { - int tid = threadIdx.x; - for (int i = tid; i < MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES; i += blockDim.x) - s_distances[i] = 0.0f; + for (int i = tid; i < MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES; i += blockDim.x) + s_distances[i] = 0.0f; - __syncthreads(); + __syncthreads(); - for (int step = 0; step < raft::ceildiv(data_dim, TILE_COL_WIDTH); step++) { - int num_load_elems = (step == raft::ceildiv(data_dim, TILE_COL_WIDTH) - 1) - ? data_dim - step * TILE_COL_WIDTH - : TILE_COL_WIDTH; - if (TILE_COL_WIDTH < data_dim) { -#pragma unroll - for (int i = 0; i < MAX_NUM_BI_SAMPLES / num_warps; i++) { - int idx = i * num_warps + warp_id; - if (idx < list_new_size) { - size_t neighbor_id = new_neighbors[idx]; - size_t idx_in_data = neighbor_id * data_dim; - load_vec(s_nv[idx], - data + idx_in_data + step * TILE_COL_WIDTH, - num_load_elems, - TILE_COL_WIDTH, - lane_id); - } - } - } + for (int step = 0; step < raft::ceildiv(data_dim, TILE_COL_WIDTH); step++) { + int num_load_elems = (step == raft::ceildiv(data_dim, TILE_COL_WIDTH) - 1) + ? 
data_dim - step * TILE_COL_WIDTH + : TILE_COL_WIDTH; + if (TILE_COL_WIDTH < data_dim) { #pragma unroll for (int i = 0; i < MAX_NUM_BI_SAMPLES / num_warps; i++) { int idx = i * num_warps + warp_id; - if (idx < list_old_size) { - size_t neighbor_id = old_neighbors[idx]; + if (idx < list_new_size) { + size_t neighbor_id = new_neighbors[idx]; size_t idx_in_data = neighbor_id * data_dim; - load_vec(s_ov[idx], + load_vec(s_nv[idx], data + idx_in_data + step * TILE_COL_WIDTH, num_load_elems, TILE_COL_WIDTH, lane_id); } } - __syncthreads(); + } +#pragma unroll + for (int i = 0; i < MAX_NUM_BI_SAMPLES / num_warps; i++) { + int idx = i * num_warps + warp_id; + if (idx < list_old_size) { + size_t neighbor_id = old_neighbors[idx]; + size_t idx_in_data = neighbor_id * data_dim; + load_vec(s_ov[idx], + data + idx_in_data + step * TILE_COL_WIDTH, + num_load_elems, + TILE_COL_WIDTH, + lane_id); + } + } + __syncthreads(); - // this is much faster than a warp-collaborative multiplication because MAX_NUM_BI_SAMPLES is - // fixed and small (64) - for (int i = threadIdx.x; i < MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES; - i += blockDim.x) { - int tmp_row = i / SKEWED_MAX_NUM_BI_SAMPLES; - int tmp_col = i % SKEWED_MAX_NUM_BI_SAMPLES; - if (tmp_row < list_new_size && tmp_col < list_old_size) { - float acc = 0.0f; - for (int d = 0; d < num_load_elems; d++) { - acc += s_nv[tmp_row][d] * s_ov[tmp_col][d]; - } - s_distances[i] += acc; + // this is much faster than a warp-collaborative multiplication because MAX_NUM_BI_SAMPLES is + // fixed and small (64) + for (int i = threadIdx.x; i < MAX_NUM_BI_SAMPLES * SKEWED_MAX_NUM_BI_SAMPLES; i += blockDim.x) { + int tmp_row = i / SKEWED_MAX_NUM_BI_SAMPLES; + int tmp_col = i % SKEWED_MAX_NUM_BI_SAMPLES; + if (tmp_row < list_new_size && tmp_col < list_old_size) { + float acc = 0.0f; + for (int d = 0; d < num_load_elems; d++) { + float a = dtype_traits::to_float(s_nv[tmp_row][d]); + float b = dtype_traits::to_float(s_ov[tmp_col][d]); + acc += 
dist_acc(a, b); } + s_distances[i] += acc; } - __syncthreads(); } + __syncthreads(); } __syncthreads(); @@ -820,22 +855,22 @@ __launch_bounds__(BLOCK_SIZE, 4) __launch_bounds__(BLOCK_SIZE) #endif #endif - local_join_kernel(const Index_t* graph_new, - const Index_t* rev_graph_new, - const int2* sizes_new, - const Index_t* graph_old, - const Index_t* rev_graph_old, - const int2* sizes_old, - const int width, - const __half* data, - const int data_dim, - ID_t* graph, - DistData_t* dists, - int graph_width, - int* locks, - DistData_t* l2_norms, - cuvs::distance::DistanceType metric, - DistEpilogue_t dist_epilogue) + local_join_kernel_wmma(const Index_t* graph_new, + const Index_t* rev_graph_new, + const int2* sizes_new, + const Index_t* graph_old, + const Index_t* rev_graph_old, + const int2* sizes_old, + const int width, + const __half* data, + const int data_dim, + ID_t* graph, + DistData_t* dists, + int graph_width, + int* locks, + DistData_t* l2_norms, + cuvs::distance::DistanceType metric, + DistEpilogue_t dist_epilogue) { #if (__CUDA_ARCH__ >= 700) using namespace nvcuda; @@ -1379,40 +1414,62 @@ template void GNND::local_join(cudaStream_t stream, DistEpilogue_t dist_epilogue) { raft::matrix::fill(res, dists_buffer_.view(), std::numeric_limits::max()); + + // Kernel dispatch logic: + // fp32 data -> SIMT (metric resolved at runtime inside the kernel) + // fp16 data + L1 distance -> SIMT (L1 needs element-wise ops, cannot use tensor cores) + // fp16 data + other metrics -> WMMA (tensor-core accelerated dot product) if (d_data_float_.has_value()) { - local_join_kernel<<>>(graph_.h_graph_new.data_handle(), - h_rev_graph_new_.data_handle(), - d_list_sizes_new_.data_handle(), - h_graph_old_.data_handle(), - h_rev_graph_old_.data_handle(), - d_list_sizes_old_.data_handle(), - NUM_SAMPLES, - d_data_float_.value().data_handle(), - ndim_, - graph_buffer_.data_handle(), - dists_buffer_.data_handle(), - DEGREE_ON_DEVICE, - d_locks_.data_handle(), - l2_norms_.data_handle(), - 
build_config_.metric, - dist_epilogue); + local_join_kernel_simt<<>>(graph_.h_graph_new.data_handle(), + h_rev_graph_new_.data_handle(), + d_list_sizes_new_.data_handle(), + h_graph_old_.data_handle(), + h_rev_graph_old_.data_handle(), + d_list_sizes_old_.data_handle(), + NUM_SAMPLES, + d_data_float_->data_handle(), + ndim_, + graph_buffer_.data_handle(), + dists_buffer_.data_handle(), + DEGREE_ON_DEVICE, + d_locks_.data_handle(), + l2_norms_.data_handle(), + build_config_.metric, + dist_epilogue); + } else if (build_config_.metric == cuvs::distance::DistanceType::L1) { + local_join_kernel_simt<<>>(graph_.h_graph_new.data_handle(), + h_rev_graph_new_.data_handle(), + d_list_sizes_new_.data_handle(), + h_graph_old_.data_handle(), + h_rev_graph_old_.data_handle(), + d_list_sizes_old_.data_handle(), + NUM_SAMPLES, + d_data_half_.value().data_handle(), + ndim_, + graph_buffer_.data_handle(), + dists_buffer_.data_handle(), + DEGREE_ON_DEVICE, + d_locks_.data_handle(), + l2_norms_.data_handle(), + build_config_.metric, + dist_epilogue); } else { - local_join_kernel<<>>(graph_.h_graph_new.data_handle(), - h_rev_graph_new_.data_handle(), - d_list_sizes_new_.data_handle(), - h_graph_old_.data_handle(), - h_rev_graph_old_.data_handle(), - d_list_sizes_old_.data_handle(), - NUM_SAMPLES, - d_data_half_.value().data_handle(), - ndim_, - graph_buffer_.data_handle(), - dists_buffer_.data_handle(), - DEGREE_ON_DEVICE, - d_locks_.data_handle(), - l2_norms_.data_handle(), - build_config_.metric, - dist_epilogue); + local_join_kernel_wmma<<>>(graph_.h_graph_new.data_handle(), + h_rev_graph_new_.data_handle(), + d_list_sizes_new_.data_handle(), + h_graph_old_.data_handle(), + h_rev_graph_old_.data_handle(), + d_list_sizes_old_.data_handle(), + NUM_SAMPLES, + d_data_half_.value().data_handle(), + ndim_, + graph_buffer_.data_handle(), + dists_buffer_.data_handle(), + DEGREE_ON_DEVICE, + d_locks_.data_handle(), + l2_norms_.data_handle(), + build_config_.metric, + dist_epilogue); } } @@ 
-1509,7 +1566,7 @@ void GNND::build(Data_t* data, std::thread update_and_sample_thread(update_and_sample, it); - RAFT_LOG_DEBUG("# GNND iteraton: %lu / %lu", it + 1, build_config_.max_iterations); + RAFT_LOG_DEBUG("# GNND iteration: %lu / %lu", it + 1, build_config_.max_iterations); // Reuse dists_buffer_ to save GPU memory. graph_buffer_ cannot be reused, because it // contains some information for local_join. diff --git a/cpp/src/neighbors/detail/nn_descent_gnnd.hpp b/cpp/src/neighbors/detail/nn_descent_gnnd.hpp index b0799505f4..a2639e4f43 100644 --- a/cpp/src/neighbors/detail/nn_descent_gnnd.hpp +++ b/cpp/src/neighbors/detail/nn_descent_gnnd.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -216,7 +216,7 @@ class GNND { int2* list_sizes, cudaStream_t stream = 0); - template + template void local_join(cudaStream_t stream = 0, DistEpilogue_t dist_epilogue = DistEpilogue_t{}); raft::resources const& res; @@ -264,10 +264,11 @@ inline BuildConfig get_build_config(raft::resources const& res, params.metric == cuvs::distance::DistanceType::L2SqrtExpanded || params.metric == cuvs::distance::DistanceType::CosineExpanded || params.metric == cuvs::distance::DistanceType::InnerProduct || - params.metric == cuvs::distance::DistanceType::BitwiseHamming; + params.metric == cuvs::distance::DistanceType::BitwiseHamming || + params.metric == cuvs::distance::DistanceType::L1; RAFT_EXPECTS(allowed_metrics, "The metric for NN Descent should be L2Expanded, L2SqrtExpanded, CosineExpanded, " - "InnerProduct or BitwiseHamming"); + "InnerProduct, BitwiseHamming or L1"); RAFT_EXPECTS( metric == params.metric, "The metrics set in nn_descent::index_params and nn_descent::index are inconsistent"); diff --git a/cpp/src/neighbors/detail/vamana/vamana_build.cuh b/cpp/src/neighbors/detail/vamana/vamana_build.cuh index 0798141c9c..107bc39ee5 
100644 --- a/cpp/src/neighbors/detail/vamana/vamana_build.cuh +++ b/cpp/src/neighbors/detail/vamana/vamana_build.cuh @@ -143,7 +143,7 @@ void batched_insert_vamana( auto query_ids = raft::make_device_vector(res, max_batchsize); auto query_list_ptr = raft::make_device_mdarray>( res, - raft::resource::get_large_workspace_resource(res), + raft::resource::get_large_workspace_resource_ref(res), raft::make_extents(max_batchsize + 1)); QueryCandidates* query_list = static_cast*>(query_list_ptr.data_handle()); @@ -151,14 +151,14 @@ void batched_insert_vamana( // Results of each batch of inserts during build - Memory is used by query_list structure auto visited_ids = raft::make_device_mdarray(res, - raft::resource::get_large_workspace_resource(res), + raft::resource::get_large_workspace_resource_ref(res), raft::make_extents(max_batchsize, visited_size)); auto visited_dists = raft::make_device_mdarray(res, - raft::resource::get_large_workspace_resource(res), + raft::resource::get_large_workspace_resource_ref(res), raft::make_extents(max_batchsize, visited_size)); - // Assign memory to query_list structures and initiailize + // Assign memory to query_list structures and initialize init_query_candidate_list<<<256, blockD, 0, stream>>>(query_list, visited_ids.data_handle(), visited_dists.data_handle(), @@ -167,14 +167,14 @@ void batched_insert_vamana( 1); auto topk_pq_mem = raft::make_device_mdarray>(res, - raft::resource::get_large_workspace_resource(res), + raft::resource::get_large_workspace_resource_ref(res), raft::make_extents(max_batchsize, visited_size)); int align_padding = raft::alignTo(dim, 16) - dim; auto s_coords_mem = raft::make_device_mdarray( res, - raft::resource::get_large_workspace_resource(res), + raft::resource::get_large_workspace_resource_ref(res), raft::make_extents(min(maxBlocks, max(max_batchsize, reverse_batch)), dim + align_padding)); @@ -318,7 +318,7 @@ void batched_insert_vamana( // compute prefix sums of query_list sizes - TODO parallelize prefix 
sums // auto d_total_edges = raft::make_device_mdarray( - // res, raft::resource::get_workspace_resource(res), raft::make_extents(1)); + // res, raft::resource::get_workspace_resource_ref(res), raft::make_extents(1)); rmm::device_scalar d_total_edges(stream); prefix_sums_sizes<<<1, 1, 0, stream>>>(query_list, step_size, d_total_edges.data()); RAFT_CUDA_TRY(cudaPeekAtLastError()); @@ -329,16 +329,16 @@ void batched_insert_vamana( auto edge_dist_pair = raft::make_device_mdarray>( res, - raft::resource::get_large_workspace_resource(res), + raft::resource::get_large_workspace_resource_ref(res), raft::make_extents(total_edges)); auto edge_dest = raft::make_device_mdarray(res, - raft::resource::get_large_workspace_resource(res), + raft::resource::get_large_workspace_resource_ref(res), raft::make_extents(total_edges)); auto edge_src = raft::make_device_mdarray(res, - raft::resource::get_large_workspace_resource(res), + raft::resource::get_large_workspace_resource_ref(res), raft::make_extents(total_edges)); // Create reverse edge list @@ -367,7 +367,7 @@ void batched_insert_vamana( auto temp_sort_storage = raft::make_device_mdarray( res, - raft::resource::get_large_workspace_resource(res), + raft::resource::get_large_workspace_resource_ref(res), raft::make_extents(temp_storage_bytes / sizeof(IdxT))); // Sort to group reverse edges by destination @@ -406,7 +406,7 @@ void batched_insert_vamana( auto temp_sort_storage = raft::make_device_mdarray( res, - raft::resource::get_large_workspace_resource(res), + raft::resource::get_large_workspace_resource_ref(res), raft::make_extents(temp_storage_bytes / sizeof(IdxT))); // Sort to group reverse edges by destination @@ -451,16 +451,16 @@ void batched_insert_vamana( // Allocate reverse QueryCandidate list based on number of unique destinations auto reverse_list_ptr = raft::make_device_mdarray>( res, - raft::resource::get_large_workspace_resource(res), + raft::resource::get_large_workspace_resource_ref(res), 
raft::make_extents(reverse_batch)); auto rev_ids = raft::make_device_mdarray(res, - raft::resource::get_large_workspace_resource(res), + raft::resource::get_large_workspace_resource_ref(res), raft::make_extents(reverse_batch, visited_size)); auto rev_dists = raft::make_device_mdarray(res, - raft::resource::get_large_workspace_resource(res), + raft::resource::get_large_workspace_resource_ref(res), raft::make_extents(reverse_batch, visited_size)); QueryCandidates* reverse_list = @@ -662,7 +662,7 @@ index build( dim, max_batch_size, raft::resource::get_cuda_stream(res), - raft::resource::get_workspace_resource(res))) { + raft::resource::get_workspace_resource_ref(res))) { // perform rotation auto dataset_rotated = raft::make_device_matrix(res, batch.size(), dim); if constexpr (std::is_same_v) { diff --git a/cpp/src/neighbors/detail/vpq_dataset.cuh b/cpp/src/neighbors/detail/vpq_dataset.cuh index e609100a76..14ca2c9470 100644 --- a/cpp/src/neighbors/detail/vpq_dataset.cuh +++ b/cpp/src/neighbors/detail/vpq_dataset.cuh @@ -467,61 +467,63 @@ void process_and_fill_codes( RAFT_FAIL("Invalid pq_bits (%u), the value must be within [4, 16]", pq_bits); } }(pq_bits); + bool need_copy_to_device = + cuvs::spatial::knn::detail::utils::check_pointer_residency(dataset.data_handle()) == + cuvs::spatial::knn::detail::utils::pointer_residency::host_only; + bool need_batching = n_rows > kReasonableMaxBatchSize; + auto launch_work = [&](auto& dataset_view, auto& labels_view, auto& codes_view) { + if (inline_vq_labels || (!vq_labels.empty() && !vq_centers.empty())) { + predict_vq(res, dataset_view, vq_centers, labels_view); + } + dim3 blocks( + raft::div_rounding_up_safe(dataset_view.extent(0), kBlockSize / threads_per_vec), 1, 1); + kernel<<>>(codes_view, + dataset_view, + pq_centers, + vq_centers, + raft::make_const_mdspan(labels_view), + rows_in_shared_memory, + pq_bits, + inline_vq_labels); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + }; + auto batch_labels = 
raft::make_device_vector(res, 0); + if (!need_batching && !need_copy_to_device) { + // No batching needed, launch the kernel directly + auto dataset_view = raft::make_device_matrix_view(dataset.data_handle(), n_rows, dim); + auto labels_view = raft::make_device_vector_view(nullptr, 0); + if (inline_vq_labels) { + batch_labels = raft::make_device_vector(res, dataset_view.extent(0)); + labels_view = batch_labels.view(); + } else if (!vq_labels.empty() && !vq_centers.empty()) { + labels_view = vq_labels; + } + launch_work(dataset_view, labels_view, codes); + return; + } + for (const auto& batch : cuvs::spatial::knn::detail::utils::batch_load_iterator( dataset.data_handle(), n_rows, dim, max_batch_size, stream, - rmm::mr::get_current_device_resource())) { + rmm::mr::get_current_device_resource_ref())) { auto batch_view = raft::make_device_matrix_view(batch.data(), ix_t(batch.size()), dim); - auto batch_labels = raft::make_device_vector(res, 0); auto batch_labels_view = raft::make_device_vector_view(nullptr, 0); if (inline_vq_labels) { batch_labels = raft::make_device_vector(res, batch.size()); batch_labels_view = batch_labels.view(); - predict_vq(res, batch_view, vq_centers, batch_labels_view); - } else { - if (!vq_labels.empty() && !vq_centers.empty()) { - batch_labels_view = raft::make_device_vector_view( - vq_labels.data_handle() + batch.offset(), batch.size()); - predict_vq(res, batch_view, vq_centers, batch_labels_view); - } + } else if (!vq_labels.empty() && !vq_centers.empty()) { + batch_labels_view = raft::make_device_vector_view( + vq_labels.data_handle() + batch.offset(), batch.size()); } - dim3 blocks(raft::div_rounding_up_safe(n_rows, kBlockSize / threads_per_vec), 1, 1); - kernel<<>>( - raft::make_device_matrix_view( - codes.data_handle() + batch.offset() * codes_rowlen, batch.size(), codes_rowlen), - batch_view, - pq_centers, - vq_centers, - raft::make_const_mdspan(batch_labels_view), - rows_in_shared_memory, - pq_bits, - inline_vq_labels); - 
RAFT_CUDA_TRY(cudaPeekAtLastError()); + auto batch_codes_view = raft::make_device_matrix_view( + codes.data_handle() + batch.offset() * codes_rowlen, batch.size(), codes_rowlen); + launch_work(batch_view, batch_labels_view, batch_codes_view); } } -template -auto vpq_convert_math_type(const raft::resources& res, vpq_dataset&& src) - -> vpq_dataset -{ - auto vq_code_book = raft::make_device_mdarray(res, src.vq_code_book.extents()); - auto pq_code_book = raft::make_device_mdarray(res, src.pq_code_book.extents()); - - raft::linalg::map(res, - vq_code_book.view(), - cuvs::spatial::knn::detail::utils::mapping{}, - raft::make_const_mdspan(src.vq_code_book.view())); - raft::linalg::map(res, - pq_code_book.view(), - cuvs::spatial::knn::detail::utils::mapping{}, - raft::make_const_mdspan(src.pq_code_book.view())); - return vpq_dataset{ - std::move(vq_code_book), std::move(pq_code_book), std::move(src.data)}; -} - // Helper for operations using vectorized loads of raft::TxN_t template struct vec_op : raft::TxN_t { @@ -858,14 +860,40 @@ void process_and_fill_codes_subspaces( } }(pq_bits); - ix_t max_batch_size = std::min(n_rows, kReasonableMaxBatchSize); - auto copy_stream = raft::resource::get_cuda_stream(res); // Using the main stream by default - bool enable_prefetch = false; - if (res.has_resource_factory(raft::resource::resource_type::CUDA_STREAM_POOL)) { - if (raft::resource::get_stream_pool_size(res) >= 1) { - enable_prefetch = true; - copy_stream = raft::resource::get_stream_from_stream_pool(res); + ix_t max_batch_size = std::min(n_rows, kReasonableMaxBatchSize); + auto copy_stream = raft::resource::get_cuda_stream(res); // Using the main stream by default + bool enable_prefetch_stream = false; + bool has_cuda_stream_pool_resource = + res.has_resource_factory(raft::resource::resource_type::CUDA_STREAM_POOL) && + raft::resource::get_stream_pool_size(res) >= 1; + bool need_copy_to_device = + 
cuvs::spatial::knn::detail::utils::check_pointer_residency(dataset.data_handle()) == + cuvs::spatial::knn::detail::utils::pointer_residency::host_only; + bool need_batching = n_rows > kReasonableMaxBatchSize; + auto launch_work = [&](auto& dataset_view, auto& labels_view, auto& codes_view) { + if (!vq_labels.empty() && !vq_centers.empty()) { + predict_vq(res, dataset_view, vq_centers, labels_view); } + dim3 blocks( + raft::div_rounding_up_safe(dataset_view.extent(0), kBlockSize / threads_per_vec), 1, 1); + kernel<<>>(codes_view, + dataset_view, + pq_centers, + vq_centers, + raft::make_const_mdspan(labels_view), + pq_bits, + shared_memory_size > 0); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + }; + if (!need_batching && !need_copy_to_device) { + // No batching and no copy to device needed, launch the kernel directly + auto dataset_view = raft::make_device_matrix_view(dataset.data_handle(), n_rows, dim); + launch_work(dataset_view, vq_labels, codes); + return; + } + if (has_cuda_stream_pool_resource && need_copy_to_device) { + enable_prefetch_stream = true; + copy_stream = raft::resource::get_stream_from_stream_pool(res); } auto vec_batches = cuvs::spatial::knn::detail::utils::batch_load_iterator( dataset.data_handle(), @@ -873,8 +901,8 @@ void process_and_fill_codes_subspaces( dim, max_batch_size, copy_stream, - raft::resource::get_workspace_resource(res), - enable_prefetch); + raft::resource::get_workspace_resource_ref(res), + enable_prefetch_stream); vec_batches.prefetch_next_batch(); for (const auto& batch : vec_batches) { auto batch_view = raft::make_device_matrix_view(batch.data(), ix_t(batch.size()), dim); @@ -882,54 +910,14 @@ void process_and_fill_codes_subspaces( if (!vq_labels.empty() && !vq_centers.empty()) { batch_labels = raft::make_device_vector_view( vq_labels.data_handle() + batch.offset(), batch.size()); - predict_vq(res, batch_view, vq_centers, batch_labels); } - dim3 blocks(raft::div_rounding_up_safe(batch.size(), kBlockSize / threads_per_vec), 1, 
1); - kernel<<>>( - raft::make_device_matrix_view( - codes.data_handle() + batch.offset() * codes_rowlen, batch.size(), codes_rowlen), - batch_view, - pq_centers, - vq_centers, - raft::make_const_mdspan(batch_labels), - pq_bits, - shared_memory_size > 0); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - vec_batches.prefetch_next_batch(); - raft::resource::sync_stream(res); + auto batch_codes_view = raft::make_device_matrix_view( + codes.data_handle() + batch.offset() * codes_rowlen, batch.size(), codes_rowlen); + launch_work(batch_view, batch_labels, batch_codes_view); + if (enable_prefetch_stream) { + vec_batches.prefetch_next_batch(); + raft::resource::sync_stream(res); + } } } - -template -auto vpq_build(const raft::resources& res, const vpq_params& params, const DatasetT& dataset) - -> vpq_dataset -{ - using label_t = uint32_t; - // Use a heuristic to impute missing parameters. - auto ps = fill_missing_params_heuristics(params, dataset); - - // Train codes - auto vq_code_book = train_vq(res, ps, dataset); - auto pq_code_book = - train_pq(res, ps, dataset, raft::make_const_mdspan(vq_code_book.view())); - - // Encode dataset - const IdxT n_rows = dataset.extent(0); - const IdxT codes_rowlen = sizeof(label_t) * (1 + raft::div_rounding_up_safe( - ps.pq_dim * ps.pq_bits, 8 * sizeof(label_t))); - - auto codes = raft::make_device_matrix(res, n_rows, codes_rowlen); - process_and_fill_codes(res, - ps, - dataset, - raft::make_const_mdspan(pq_code_book.view()), - raft::make_const_mdspan(vq_code_book.view()), - raft::make_device_vector_view(nullptr, 0), - codes.view(), - true); - - return vpq_dataset{ - std::move(vq_code_book), std::move(pq_code_book), std::move(codes)}; -} - } // namespace cuvs::neighbors::detail diff --git a/cpp/src/neighbors/ivf_common.cu b/cpp/src/neighbors/ivf_common.cu index 9fb7b59b0f..b87a14f7c3 100644 --- a/cpp/src/neighbors/ivf_common.cu +++ b/cpp/src/neighbors/ivf_common.cu @@ -79,7 +79,7 @@ void sort_cluster_sizes_descending(uint32_t* input, uint32_t* 
output, uint32_t n_lists, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* tmp_res) + rmm::device_async_resource_ref tmp_res) { int begin_bit = 0; int end_bit = sizeof(uint32_t) * 8; diff --git a/cpp/src/neighbors/ivf_common.cuh b/cpp/src/neighbors/ivf_common.cuh index 80aac970dd..e466a13fd7 100644 --- a/cpp/src/neighbors/ivf_common.cuh +++ b/cpp/src/neighbors/ivf_common.cuh @@ -20,7 +20,7 @@ void sort_cluster_sizes_descending(uint32_t* input, uint32_t* output, uint32_t n_lists, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* tmp_res); + rmm::device_async_resource_ref tmp_res); /** * Default value returned by `search` when the `n_probes` is too small and top-k is too large. @@ -256,7 +256,7 @@ template void recompute_internal_state(const raft::resources& res, Index& index) { auto stream = raft::resource::get_cuda_stream(res); - auto tmp_res = raft::resource::get_workspace_resource(res); + auto tmp_res = raft::resource::get_workspace_resource_ref(res); rmm::device_uvector sorted_sizes(index.n_lists(), stream, tmp_res); // Actualize the list pointers diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_build.cuh b/cpp/src/neighbors/ivf_flat/ivf_flat_build.cuh index 06862c083d..a0cf4d2740 100644 --- a/cpp/src/neighbors/ivf_flat/ivf_flat_build.cuh +++ b/cpp/src/neighbors/ivf_flat/ivf_flat_build.cuh @@ -179,8 +179,10 @@ void extend(raft::resources const& handle, RAFT_EXPECTS(new_indices != nullptr || index->size() == 0, "You must pass data indices when the index is non-empty."); - auto new_labels = raft::make_device_mdarray( - handle, raft::resource::get_large_workspace_resource(handle), raft::make_extents(n_rows)); + auto new_labels = + raft::make_device_mdarray(handle, + raft::resource::get_large_workspace_resource_ref(handle), + raft::make_extents(n_rows)); cuvs::cluster::kmeans::balanced_params kmeans_params; kmeans_params.metric = index->metric(); auto orig_centroids_view = @@ -205,7 +207,7 @@ void extend(raft::resources const& handle, 
index->dim(), max_batch_size, copy_stream, - raft::resource::get_workspace_resource(handle), + raft::resource::get_workspace_resource_ref(handle), enable_prefetch); vec_batches.prefetch_next_batch(); @@ -224,7 +226,7 @@ void extend(raft::resources const& handle, auto* list_sizes_ptr = index->list_sizes().data_handle(); auto old_list_sizes_dev = raft::make_device_mdarray( - handle, raft::resource::get_workspace_resource(handle), raft::make_extents(n_lists)); + handle, raft::resource::get_workspace_resource_ref(handle), raft::make_extents(n_lists)); raft::copy(handle, old_list_sizes_dev.view(), raft::make_device_vector_view(list_sizes_ptr, n_lists)); @@ -293,8 +295,12 @@ void extend(raft::resources const& handle, raft::make_device_vector_view(list_sizes_ptr, n_lists), raft::make_device_vector_view(old_list_sizes_dev.data_handle(), n_lists)); - utils::batch_load_iterator vec_indices( - new_indices, n_rows, 1, max_batch_size, stream, raft::resource::get_workspace_resource(handle)); + utils::batch_load_iterator vec_indices(new_indices, + n_rows, + 1, + max_batch_size, + stream, + raft::resource::get_workspace_resource_ref(handle)); vec_batches.reset(); vec_batches.prefetch_next_batch(); utils::batch_load_iterator idx_batch = vec_indices.begin(); @@ -409,7 +415,7 @@ inline auto build(raft::resources const& handle, 1, n_rows / std::max(params.kmeans_trainset_fraction * n_rows, index.n_lists())); auto n_rows_train = n_rows / trainset_ratio; rmm::device_uvector trainset( - n_rows_train * index.dim(), stream, raft::resource::get_large_workspace_resource(handle)); + n_rows_train * index.dim(), stream, raft::resource::get_large_workspace_resource_ref(handle)); // TODO: a proper sampling RAFT_CUDA_TRY(cudaMemcpy2DAsync(trainset.data(), sizeof(T) * index.dim(), @@ -469,7 +475,7 @@ inline void fill_refinement_index(raft::resources const& handle, "ivf_flat::fill_refinement_index(%zu, %u)", size_t(n_queries)); rmm::device_uvector new_labels( - n_queries * n_candidates, stream, 
raft::resource::get_workspace_resource(handle)); + n_queries * n_candidates, stream, raft::resource::get_workspace_resource_ref(handle)); auto new_labels_view = raft::make_device_vector_view(new_labels.data(), n_queries * n_candidates); raft::linalg::map_offset( diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_interleaved_scan.cuh b/cpp/src/neighbors/ivf_flat/ivf_flat_interleaved_scan.cuh deleted file mode 100644 index 9d91c95d70..0000000000 --- a/cpp/src/neighbors/ivf_flat/ivf_flat_interleaved_scan.cuh +++ /dev/null @@ -1,1339 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ - -#pragma once - -#include "../ivf_common.cuh" -#include "../sample_filter.cuh" -#include -#include - -#include "../detail/ann_utils.cuh" -#include -#include -#include -#include -#include // RAFT_CUDA_TRY -#include -#include -#include -#include - -#include - -namespace cuvs::neighbors::ivf_flat::detail { - -using namespace cuvs::spatial::knn::detail; // NOLINT - -constexpr int kThreadsPerBlock = 128; - -/** - * @brief Copy `n` elements per block from one place to another. 
- * - * @param[out] out target pointer (unique per block) - * @param[in] in source pointer - * @param n number of elements to copy - */ -template -__device__ inline void copy_vectorized(T* out, const T* in, uint32_t n) -{ - constexpr int VecElems = VecBytes / sizeof(T); // NOLINT - using align_bytes = raft::Pow2<(size_t)VecBytes>; - if constexpr (VecElems > 1) { - using align_elems = raft::Pow2; - if (!align_bytes::areSameAlignOffsets(out, in)) { - return copy_vectorized<(VecBytes >> 1), T>(out, in, n); - } - { // process unaligned head - uint32_t head = align_bytes::roundUp(in) - in; - if (head > 0) { - copy_vectorized(out, in, head); - n -= head; - in += head; - out += head; - } - } - { // process main part vectorized - using vec_t = typename raft::IOType::Type; - copy_vectorized( - reinterpret_cast(out), reinterpret_cast(in), align_elems::div(n)); - } - { // process unaligned tail - uint32_t tail = align_elems::mod(n); - if (tail > 0) { - n -= tail; - copy_vectorized(out + n, in + n, tail); - } - } - } - if constexpr (VecElems <= 1) { - for (int i = threadIdx.x; i < n; i += blockDim.x) { - out[i] = in[i]; - } - } -} - -/** - * @brief Load a part of a vector from the index and from query, compute the (part of the) distance - * between them, and aggregate it using the provided Lambda; one structure per thread, per query, - * and per index item. 
- * - * @tparam kUnroll elements per loop (normally, kUnroll = WarpSize / Veclen) - * @tparam Lambda computing the part of the distance for one dimension and aggregating it: - * void (AccT& acc, AccT x, AccT y) - * @tparam Veclen size of the vectorized load - * @tparam T type of the data in the query and the index - * @tparam AccT type of the accumulated value (an optimization for 8bit values to be loaded as 32bit - * values) - */ -template -struct loadAndComputeDist { - Lambda compute_dist; - AccT& dist; - AccT& norm_query; - AccT& norm_data; - - __device__ __forceinline__ - loadAndComputeDist(AccT& dist, Lambda op, AccT& norm_query, AccT& norm_data) - : dist(dist), compute_dist(op), norm_query(norm_query), norm_data(norm_data) - { - } - - /** - * Load parts of vectors from the index and query and accumulates the partial distance. - * This version assumes the query is stored in shared memory. - * Every thread here processes exactly kUnroll * Veclen elements independently of others. - */ - template - __device__ __forceinline__ void runLoadShmemCompute(const T* const& data, - const T* query_shared, - IdxT loadIndex, - IdxT shmemIndex) - { -#pragma unroll - for (int j = 0; j < kUnroll; ++j) { - T encV[Veclen]; - raft::ldg(encV, data + (loadIndex + j * kIndexGroupSize) * Veclen); - T queryRegs[Veclen]; - raft::lds(queryRegs, &query_shared[shmemIndex + j * Veclen]); -#pragma unroll - for (int k = 0; k < Veclen; ++k) { - compute_dist(dist, queryRegs[k], encV[k]); - if constexpr (ComputeNorm) { - norm_query += queryRegs[k] * queryRegs[k]; - norm_data += encV[k] * encV[k]; - } - } - } - } - - /** - * Load parts of vectors from the index and query and accumulates the partial distance. - * This version assumes the query is stored in the global memory and is different for every - * thread. One warp loads exactly WarpSize query elements at once and then reshuffles them into - * corresponding threads (`WarpSize / (kUnroll * Veclen)` elements per thread at once). 
- */ - template - __device__ __forceinline__ void runLoadShflAndCompute(const T*& data, - const T* query, - IdxT baseLoadIndex, - const int lane_id) - { - T queryReg = query[baseLoadIndex + lane_id]; - constexpr int stride = kUnroll * Veclen; - constexpr int totalIter = raft::WarpSize / stride; - constexpr int gmemStride = stride * kIndexGroupSize; -#pragma unroll - for (int i = 0; i < totalIter; ++i, data += gmemStride) { -#pragma unroll - for (int j = 0; j < kUnroll; ++j) { - T encV[Veclen]; - raft::ldg(encV, data + (lane_id + j * kIndexGroupSize) * Veclen); - const int d = (i * kUnroll + j) * Veclen; -#pragma unroll - for (int k = 0; k < Veclen; ++k) { - T q = raft::shfl(queryReg, d + k, raft::WarpSize); - compute_dist(dist, q, encV[k]); - if constexpr (ComputeNorm) { - norm_query += q * q; - norm_data += encV[k] * encV[k]; - } - } - } - } - } - - /** - * Load parts of vectors from the index and query and accumulates the partial distance. - * This version augments `runLoadShflAndCompute` when `dim` is not a multiple of `WarpSize`. - */ - __device__ __forceinline__ void runLoadShflAndComputeRemainder( - const T*& data, const T* query, const int lane_id, const int dim, const int dimBlocks) - { - const int loadDim = dimBlocks + lane_id; - T queryReg = loadDim < dim ? 
query[loadDim] : T{0}; - const int loadDataIdx = lane_id * Veclen; - for (int d = 0; d < dim - dimBlocks; d += Veclen, data += kIndexGroupSize * Veclen) { - T enc[Veclen]; - raft::ldg(enc, data + loadDataIdx); -#pragma unroll - for (int k = 0; k < Veclen; k++) { - T q = raft::shfl(queryReg, d + k, raft::WarpSize); - compute_dist(dist, q, enc[k]); - if constexpr (ComputeNorm) { - norm_query += q * q; - norm_data += enc[k] * enc[k]; - } - } - } - } -}; - -// This handles uint8_t 8, 16 Veclens -template -struct loadAndComputeDist { - Lambda compute_dist; - uint32_t& dist; - uint32_t& norm_query; - uint32_t& norm_data; - - __device__ __forceinline__ - loadAndComputeDist(uint32_t& dist, Lambda op, uint32_t& norm_query, uint32_t& norm_data) - : dist(dist), compute_dist(op), norm_query(norm_query), norm_data(norm_data) - { - } - - __device__ __forceinline__ void runLoadShmemCompute(const uint8_t* const& data, - const uint8_t* query_shared, - int loadIndex, - int shmemIndex) - { - constexpr int veclen_int = uint8_veclen / 4; // converting uint8_t veclens to int - loadIndex = loadIndex * veclen_int; -#pragma unroll - for (int j = 0; j < kUnroll; ++j) { - uint32_t encV[veclen_int]; - raft::ldg( - encV, - reinterpret_cast(data) + loadIndex + j * kIndexGroupSize * veclen_int); - uint32_t queryRegs[veclen_int]; - raft::lds(queryRegs, - reinterpret_cast(query_shared + shmemIndex) + j * veclen_int); -#pragma unroll - for (int k = 0; k < veclen_int; k++) { - compute_dist(dist, queryRegs[k], encV[k]); - if constexpr (ComputeNorm) { - norm_query = raft::dp4a(queryRegs[k], queryRegs[k], norm_query); - norm_data = raft::dp4a(encV[k], encV[k], norm_data); - } - } - } - } - __device__ __forceinline__ void runLoadShflAndCompute(const uint8_t*& data, - const uint8_t* query, - int baseLoadIndex, - const int lane_id) - { - constexpr int veclen_int = uint8_veclen / 4; // converting uint8_t veclens to int - uint32_t queryReg = - (lane_id < 8) ? 
reinterpret_cast(query + baseLoadIndex)[lane_id] : 0; - constexpr int stride = kUnroll * uint8_veclen; - -#pragma unroll - for (int i = 0; i < raft::WarpSize / stride; ++i, data += stride * kIndexGroupSize) { -#pragma unroll - for (int j = 0; j < kUnroll; ++j) { - uint32_t encV[veclen_int]; - raft::ldg( - encV, - reinterpret_cast(data) + (lane_id + j * kIndexGroupSize) * veclen_int); - const int d = (i * kUnroll + j) * veclen_int; -#pragma unroll - for (int k = 0; k < veclen_int; ++k) { - uint32_t q = raft::shfl(queryReg, d + k, raft::WarpSize); - compute_dist(dist, q, encV[k]); - if constexpr (ComputeNorm) { - norm_query = raft::dp4a(q, q, norm_query); - norm_data = raft::dp4a(encV[k], encV[k], norm_data); - } - } - } - } - } - - __device__ __forceinline__ void runLoadShflAndComputeRemainder(const uint8_t*& data, - const uint8_t* query, - const int lane_id, - const int dim, - const int dimBlocks) - { - constexpr int veclen_int = uint8_veclen / 4; - const int loadDim = dimBlocks + lane_id * 4; // Here 4 is for 1 - int - uint32_t queryReg = loadDim < dim ? 
reinterpret_cast(query + loadDim)[0] : 0; - for (int d = 0; d < dim - dimBlocks; - d += uint8_veclen, data += kIndexGroupSize * uint8_veclen) { - uint32_t enc[veclen_int]; - raft::ldg(enc, reinterpret_cast(data) + lane_id * veclen_int); -#pragma unroll - for (int k = 0; k < veclen_int; k++) { - uint32_t q = raft::shfl(queryReg, (d / 4) + k, raft::WarpSize); - compute_dist(dist, q, enc[k]); - if constexpr (ComputeNorm) { - norm_query = raft::dp4a(q, q, norm_query); - norm_data = raft::dp4a(enc[k], enc[k], norm_data); - } - } - } - } -}; - -// Keep this specialized uint8 Veclen = 4, because compiler is generating suboptimal code while -// using above common template of int2/int4 -template -struct loadAndComputeDist { - Lambda compute_dist; - uint32_t& dist; - uint32_t& norm_query; - uint32_t& norm_data; - - __device__ __forceinline__ - loadAndComputeDist(uint32_t& dist, Lambda op, uint32_t& norm_query, uint32_t& norm_data) - : dist(dist), compute_dist(op), norm_query(norm_query), norm_data(norm_data) - { - } - - __device__ __forceinline__ void runLoadShmemCompute(const uint8_t* const& data, - const uint8_t* query_shared, - int loadIndex, - int shmemIndex) - { -#pragma unroll - for (int j = 0; j < kUnroll; ++j) { - uint32_t encV = reinterpret_cast(data)[loadIndex + j * kIndexGroupSize]; - uint32_t queryRegs = reinterpret_cast(query_shared + shmemIndex)[j]; - compute_dist(dist, queryRegs, encV); - if constexpr (ComputeNorm) { - norm_query = raft::dp4a(queryRegs, queryRegs, norm_query); - norm_data = raft::dp4a(encV, encV, norm_data); - } - } - } - __device__ __forceinline__ void runLoadShflAndCompute(const uint8_t*& data, - const uint8_t* query, - int baseLoadIndex, - const int lane_id) - { - uint32_t queryReg = - (lane_id < 8) ? 
reinterpret_cast(query + baseLoadIndex)[lane_id] : 0; - constexpr int veclen = 4; - constexpr int stride = kUnroll * veclen; - -#pragma unroll - for (int i = 0; i < raft::WarpSize / stride; ++i, data += stride * kIndexGroupSize) { -#pragma unroll - for (int j = 0; j < kUnroll; ++j) { - uint32_t encV = reinterpret_cast(data)[lane_id + j * kIndexGroupSize]; - uint32_t q = raft::shfl(queryReg, i * kUnroll + j, raft::WarpSize); - compute_dist(dist, q, encV); - if constexpr (ComputeNorm) { - norm_query = raft::dp4a(q, q, norm_query); - norm_data = raft::dp4a(encV, encV, norm_data); - } - } - } - } - - __device__ __forceinline__ void runLoadShflAndComputeRemainder(const uint8_t*& data, - const uint8_t* query, - const int lane_id, - const int dim, - const int dimBlocks) - { - constexpr int veclen = 4; - const int loadDim = dimBlocks + lane_id; - uint32_t queryReg = loadDim < dim ? reinterpret_cast(query)[loadDim] : 0; - for (int d = 0; d < dim - dimBlocks; d += veclen, data += kIndexGroupSize * veclen) { - uint32_t enc = reinterpret_cast(data)[lane_id]; - uint32_t q = raft::shfl(queryReg, d / veclen, raft::WarpSize); - compute_dist(dist, q, enc); - if constexpr (ComputeNorm) { - norm_query = raft::dp4a(q, q, norm_query); - norm_data = raft::dp4a(enc, enc, norm_data); - } - } - } -}; - -template -struct loadAndComputeDist { - Lambda compute_dist; - uint32_t& dist; - uint32_t& norm_query; - uint32_t& norm_data; - - __device__ __forceinline__ - loadAndComputeDist(uint32_t& dist, Lambda op, uint32_t& norm_query, uint32_t& norm_data) - : dist(dist), compute_dist(op), norm_query(norm_query), norm_data(norm_data) - { - } - - __device__ __forceinline__ void runLoadShmemCompute(const uint8_t* const& data, - const uint8_t* query_shared, - int loadIndex, - int shmemIndex) - { -#pragma unroll - for (int j = 0; j < kUnroll; ++j) { - uint32_t encV = reinterpret_cast(data)[loadIndex + j * kIndexGroupSize]; - uint32_t queryRegs = reinterpret_cast(query_shared + shmemIndex)[j]; - 
compute_dist(dist, queryRegs, encV); - if constexpr (ComputeNorm) { - norm_query = raft::dp4a(queryRegs, queryRegs, norm_query); - norm_data = raft::dp4a(encV, encV, norm_data); - } - } - } - - __device__ __forceinline__ void runLoadShflAndCompute(const uint8_t*& data, - const uint8_t* query, - int baseLoadIndex, - const int lane_id) - { - uint32_t queryReg = - (lane_id < 16) ? reinterpret_cast(query + baseLoadIndex)[lane_id] : 0; - constexpr int veclen = 2; - constexpr int stride = kUnroll * veclen; - -#pragma unroll - for (int i = 0; i < raft::WarpSize / stride; ++i, data += stride * kIndexGroupSize) { -#pragma unroll - for (int j = 0; j < kUnroll; ++j) { - uint32_t encV = reinterpret_cast(data)[lane_id + j * kIndexGroupSize]; - uint32_t q = raft::shfl(queryReg, i * kUnroll + j, raft::WarpSize); - compute_dist(dist, q, encV); - if constexpr (ComputeNorm) { - norm_query = raft::dp4a(q, q, norm_query); - norm_data = raft::dp4a(encV, encV, norm_data); - } - } - } - } - - __device__ __forceinline__ void runLoadShflAndComputeRemainder(const uint8_t*& data, - const uint8_t* query, - const int lane_id, - const int dim, - const int dimBlocks) - { - constexpr int veclen = 2; - int loadDim = dimBlocks + lane_id * veclen; - uint32_t queryReg = loadDim < dim ? 
reinterpret_cast(query + loadDim)[0] : 0; - for (int d = 0; d < dim - dimBlocks; d += veclen, data += kIndexGroupSize * veclen) { - uint32_t enc = reinterpret_cast(data)[lane_id]; - uint32_t q = raft::shfl(queryReg, d / veclen, raft::WarpSize); - compute_dist(dist, q, enc); - if constexpr (ComputeNorm) { - norm_query = raft::dp4a(q, q, norm_query); - norm_data = raft::dp4a(enc, enc, norm_data); - } - } - } -}; - -template -struct loadAndComputeDist { - Lambda compute_dist; - uint32_t& dist; - uint32_t& norm_query; - uint32_t& norm_data; - - __device__ __forceinline__ - loadAndComputeDist(uint32_t& dist, Lambda op, uint32_t& norm_query, uint32_t& norm_data) - : dist(dist), compute_dist(op), norm_query(norm_query), norm_data(norm_data) - { - } - - __device__ __forceinline__ void runLoadShmemCompute(const uint8_t* const& data, - const uint8_t* query_shared, - int loadIndex, - int shmemIndex) - { -#pragma unroll - for (int j = 0; j < kUnroll; ++j) { - uint32_t encV = data[loadIndex + j * kIndexGroupSize]; - uint32_t queryRegs = query_shared[shmemIndex + j]; - compute_dist(dist, queryRegs, encV); - if constexpr (ComputeNorm) { - norm_query += queryRegs * queryRegs; - norm_data += encV * encV; - } - } - } - - __device__ __forceinline__ void runLoadShflAndCompute(const uint8_t*& data, - const uint8_t* query, - int baseLoadIndex, - const int lane_id) - { - uint32_t queryReg = query[baseLoadIndex + lane_id]; - constexpr int veclen = 1; - constexpr int stride = kUnroll * veclen; - -#pragma unroll - for (int i = 0; i < raft::WarpSize / stride; ++i, data += stride * kIndexGroupSize) { -#pragma unroll - for (int j = 0; j < kUnroll; ++j) { - uint32_t encV = data[lane_id + j * kIndexGroupSize]; - uint32_t q = raft::shfl(queryReg, i * kUnroll + j, raft::WarpSize); - compute_dist(dist, q, encV); - if constexpr (ComputeNorm) { - norm_query += q * q; - norm_data += encV * encV; - } - } - } - } - - __device__ __forceinline__ void runLoadShflAndComputeRemainder(const uint8_t*& data, - 
const uint8_t* query, - const int lane_id, - const int dim, - const int dimBlocks) - { - constexpr int veclen = 1; - int loadDim = dimBlocks + lane_id; - uint32_t queryReg = loadDim < dim ? query[loadDim] : 0; - for (int d = 0; d < dim - dimBlocks; d += veclen, data += kIndexGroupSize * veclen) { - uint32_t enc = data[lane_id]; - uint32_t q = raft::shfl(queryReg, d, raft::WarpSize); - compute_dist(dist, q, enc); - if constexpr (ComputeNorm) { - norm_query += q * q; - norm_data += enc * enc; - } - } - } -}; - -// This device function is for int8 veclens 4, 8 and 16 -template -struct loadAndComputeDist { - Lambda compute_dist; - int32_t& dist; - int32_t& norm_query; - int32_t& norm_data; - - __device__ __forceinline__ - loadAndComputeDist(int32_t& dist, Lambda op, int32_t& norm_query, int32_t& norm_data) - : dist(dist), compute_dist(op), norm_query(norm_query), norm_data(norm_data) - { - } - - __device__ __forceinline__ void runLoadShmemCompute(const int8_t* const& data, - const int8_t* query_shared, - int loadIndex, - int shmemIndex) - { - constexpr int veclen_int = int8_veclen / 4; // converting int8_t veclens to int - -#pragma unroll - for (int j = 0; j < kUnroll; ++j) { - int32_t encV[veclen_int]; - raft::ldg( - encV, - reinterpret_cast(data) + (loadIndex + j * kIndexGroupSize) * veclen_int); - int32_t queryRegs[veclen_int]; - raft::lds(queryRegs, - reinterpret_cast(query_shared + shmemIndex) + j * veclen_int); -#pragma unroll - for (int k = 0; k < veclen_int; k++) { - compute_dist(dist, queryRegs[k], encV[k]); - if constexpr (ComputeNorm) { - norm_query = raft::dp4a(queryRegs[k], queryRegs[k], norm_query); - norm_data = raft::dp4a(encV[k], encV[k], norm_data); - } - } - } - } - - __device__ __forceinline__ void runLoadShflAndCompute(const int8_t*& data, - const int8_t* query, - int baseLoadIndex, - const int lane_id) - { - constexpr int veclen_int = int8_veclen / 4; // converting int8_t veclens to int - - int32_t queryReg = - (lane_id < 8) ? 
reinterpret_cast(query + baseLoadIndex)[lane_id] : 0; - constexpr int stride = kUnroll * int8_veclen; - -#pragma unroll - for (int i = 0; i < raft::WarpSize / stride; ++i, data += stride * kIndexGroupSize) { -#pragma unroll - for (int j = 0; j < kUnroll; ++j) { - int32_t encV[veclen_int]; - raft::ldg( - encV, - reinterpret_cast(data) + (lane_id + j * kIndexGroupSize) * veclen_int); - const int d = (i * kUnroll + j) * veclen_int; -#pragma unroll - for (int k = 0; k < veclen_int; ++k) { - int32_t q = raft::shfl(queryReg, d + k, raft::WarpSize); - compute_dist(dist, q, encV[k]); - if constexpr (ComputeNorm) { - norm_query = raft::dp4a(q, q, norm_query); - norm_data = raft::dp4a(encV[k], encV[k], norm_data); - } - } - } - } - } - - __device__ __forceinline__ void runLoadShflAndComputeRemainder( - const int8_t*& data, const int8_t* query, const int lane_id, const int dim, const int dimBlocks) - { - constexpr int veclen_int = int8_veclen / 4; - const int loadDim = dimBlocks + lane_id * 4; // Here 4 is for 1 - int; - int32_t queryReg = loadDim < dim ? 
reinterpret_cast(query + loadDim)[0] : 0; - for (int d = 0; d < dim - dimBlocks; d += int8_veclen, data += kIndexGroupSize * int8_veclen) { - int32_t enc[veclen_int]; - raft::ldg(enc, reinterpret_cast(data) + lane_id * veclen_int); -#pragma unroll - for (int k = 0; k < veclen_int; k++) { - int32_t q = raft::shfl(queryReg, (d / 4) + k, raft::WarpSize); // Here 4 is for 1 - int; - compute_dist(dist, q, enc[k]); - if constexpr (ComputeNorm) { - norm_query = raft::dp4a(q, q, norm_query); - norm_data = raft::dp4a(enc[k], enc[k], norm_data); - } - } - } - } -}; - -template -struct loadAndComputeDist { - Lambda compute_dist; - int32_t& dist; - int32_t& norm_query; - int32_t& norm_data; - __device__ __forceinline__ - loadAndComputeDist(int32_t& dist, Lambda op, int32_t& norm_query, int32_t& norm_data) - : dist(dist), compute_dist(op), norm_query(norm_query), norm_data(norm_data) - { - } - __device__ __forceinline__ void runLoadShmemCompute(const int8_t* const& data, - const int8_t* query_shared, - int loadIndex, - int shmemIndex) - { -#pragma unroll - for (int j = 0; j < kUnroll; ++j) { - int32_t encV = reinterpret_cast(data)[loadIndex + j * kIndexGroupSize]; - int32_t queryRegs = reinterpret_cast(query_shared + shmemIndex)[j]; - compute_dist(dist, queryRegs, encV); - if constexpr (ComputeNorm) { - norm_query = raft::dp4a(queryRegs, queryRegs, norm_query); - norm_data = raft::dp4a(encV, encV, norm_data); - } - } - } - - __device__ __forceinline__ void runLoadShflAndCompute(const int8_t*& data, - const int8_t* query, - int baseLoadIndex, - const int lane_id) - { - int32_t queryReg = - (lane_id < 16) ? 
reinterpret_cast(query + baseLoadIndex)[lane_id] : 0; - constexpr int veclen = 2; - constexpr int stride = kUnroll * veclen; - -#pragma unroll - for (int i = 0; i < raft::WarpSize / stride; ++i, data += stride * kIndexGroupSize) { -#pragma unroll - for (int j = 0; j < kUnroll; ++j) { - int32_t encV = reinterpret_cast(data)[lane_id + j * kIndexGroupSize]; - int32_t q = raft::shfl(queryReg, i * kUnroll + j, raft::WarpSize); - compute_dist(dist, q, encV); - if constexpr (ComputeNorm) { - norm_query = raft::dp4a(queryReg, queryReg, norm_query); - norm_data = raft::dp4a(encV, encV, norm_data); - } - } - } - } - - __device__ __forceinline__ void runLoadShflAndComputeRemainder( - const int8_t*& data, const int8_t* query, const int lane_id, const int dim, const int dimBlocks) - { - constexpr int veclen = 2; - int loadDim = dimBlocks + lane_id * veclen; - int32_t queryReg = loadDim < dim ? reinterpret_cast(query + loadDim)[0] : 0; - for (int d = 0; d < dim - dimBlocks; d += veclen, data += kIndexGroupSize * veclen) { - int32_t enc = reinterpret_cast(data + lane_id * veclen)[0]; - int32_t q = raft::shfl(queryReg, d / veclen, raft::WarpSize); - compute_dist(dist, q, enc); - if constexpr (ComputeNorm) { - norm_query = raft::dp4a(q, q, norm_query); - norm_data = raft::dp4a(enc, enc, norm_data); - } - } - } -}; - -template -struct loadAndComputeDist { - Lambda compute_dist; - int32_t& dist; - int32_t& norm_query; - int32_t& norm_data; - __device__ __forceinline__ - loadAndComputeDist(int32_t& dist, Lambda op, int32_t& norm_query, int32_t& norm_data) - : dist(dist), compute_dist(op), norm_query(norm_query), norm_data(norm_data) - { - } - - __device__ __forceinline__ void runLoadShmemCompute(const int8_t* const& data, - const int8_t* query_shared, - int loadIndex, - int shmemIndex) - { -#pragma unroll - for (int j = 0; j < kUnroll; ++j) { - compute_dist(dist, query_shared[shmemIndex + j], data[loadIndex + j * kIndexGroupSize]); - if constexpr (ComputeNorm) { - norm_query += 
int32_t{query_shared[shmemIndex + j]} * int32_t{query_shared[shmemIndex + j]}; - norm_data += int32_t{data[loadIndex + j * kIndexGroupSize]} * - int32_t{data[loadIndex + j * kIndexGroupSize]}; - } - } - } - - __device__ __forceinline__ void runLoadShflAndCompute(const int8_t*& data, - const int8_t* query, - int baseLoadIndex, - const int lane_id) - { - constexpr int veclen = 1; - constexpr int stride = kUnroll * veclen; - int32_t queryReg = query[baseLoadIndex + lane_id]; - -#pragma unroll - for (int i = 0; i < raft::WarpSize / stride; ++i, data += stride * kIndexGroupSize) { -#pragma unroll - for (int j = 0; j < kUnroll; ++j) { - int32_t q = raft::shfl(queryReg, i * kUnroll + j, raft::WarpSize); - compute_dist(dist, q, data[lane_id + j * kIndexGroupSize]); - if constexpr (ComputeNorm) { - norm_query += q * q; - norm_data += data[lane_id + j * kIndexGroupSize] * data[lane_id + j * kIndexGroupSize]; - } - } - } - } - __device__ __forceinline__ void runLoadShflAndComputeRemainder( - const int8_t*& data, const int8_t* query, const int lane_id, const int dim, const int dimBlocks) - { - constexpr int veclen = 1; - const int loadDim = dimBlocks + lane_id; - int32_t queryReg = loadDim < dim ? 
query[loadDim] : 0; - for (int d = 0; d < dim - dimBlocks; d += veclen, data += kIndexGroupSize * veclen) { - int32_t q = raft::shfl(queryReg, d, raft::WarpSize); - compute_dist(dist, q, data[lane_id]); - if constexpr (ComputeNorm) { - norm_query += q * q; - norm_data += int32_t{data[lane_id]} * int32_t{data[lane_id]}; - } - } - } -}; - -// switch to dummy blocksort when Capacity is 0 this explicit dummy is chosen -// to support access to warpsort constants like ::queue_t::kDummy -template -struct flat_block_sort { - using type = raft::matrix::detail::select::warpsort::block_sort< - raft::matrix::detail::select::warpsort::warp_sort_filtered, - Capacity, - Ascending, - T, - IdxT>; -}; - -template -struct flat_block_sort<0, Ascending, T, IdxT> - : ivf::detail::dummy_block_sort_t { - using type = ivf::detail::dummy_block_sort_t; -}; - -template -using block_sort_t = typename flat_block_sort::type; - -/** - * Scan clusters for nearest neighbors of the query vectors. - * See `ivfflat_interleaved_scan` for more information. - * - * The clusters are stored in the interleaved index format described in ivf_flat_types.hpp. - * For each query vector, a set of clusters is probed: the distance to each vector in the cluster is - * calculated, and the top-k nearest neighbors are selected. - * - * @param compute_dist distance function - * @param query_smem_elems number of dimensions of the query vector to fit in a shared memory of a - * block; this number must be a multiple of `WarpSize * Veclen`. 
- * @param[in] query a pointer to all queries in a row-major contiguous format [gridDim.y, dim] - * @param[in] coarse_index a pointer to the cluster indices to search through [n_probes] - * @param[in] list_indices index.indices - * @param[in] list_data index.data - * @param[in] list_sizes index.list_sizes - * @param[in] list_offsets index.list_offsets - * @param n_probes - * @param k - * @param dim - * @param sample_filter - * @param[out] neighbors - * @param[out] distances - */ -template -RAFT_KERNEL __launch_bounds__(kThreadsPerBlock) - interleaved_scan_kernel(Lambda compute_dist, - PostLambda post_process, - const uint32_t query_smem_elems, - const T* query, - const uint32_t* coarse_index, - const T* const* list_data_ptrs, - const uint32_t* list_sizes, - const uint32_t queries_offset, - const uint32_t n_probes, - const uint32_t k, - const uint32_t max_samples, - const uint32_t* chunk_indices, - const uint32_t dim, - IvfSampleFilterT sample_filter, - uint32_t* neighbors, - float* distances) -{ - extern __shared__ __align__(256) uint8_t interleaved_scan_kernel_smem[]; - constexpr bool kManageLocalTopK = Capacity > 0; - // Using shared memory for the (part of the) query; - // This allows to save on global memory bandwidth when reading index and query - // data at the same time. - // Its size is `query_smem_elems`. 
- T* query_shared = reinterpret_cast(interleaved_scan_kernel_smem); - // Make the query input and output point to this block's shared query - { - const int query_id = blockIdx.y; - query += query_id * dim; - if constexpr (kManageLocalTopK) { - neighbors += query_id * k * gridDim.x + blockIdx.x * k; - distances += query_id * k * gridDim.x + blockIdx.x * k; - } else { - distances += query_id * uint64_t(max_samples); - } - chunk_indices += (n_probes * query_id); - coarse_index += query_id * n_probes; - } - - // Copy a part of the query into shared memory for faster processing - copy_vectorized(query_shared, query, std::min(dim, query_smem_elems)); - __syncthreads(); - - using local_topk_t = block_sort_t; - local_topk_t queue(k); - { - using align_warp = raft::Pow2; - const int lane_id = align_warp::mod(threadIdx.x); - - // How many full warps needed to compute the distance (without remainder) - const uint32_t full_warps_along_dim = align_warp::roundDown(dim); - - const uint32_t shm_assisted_dim = - (dim > query_smem_elems) ? query_smem_elems : full_warps_along_dim; - - // Every CUDA block scans one cluster at a time. 
- for (int probe_id = blockIdx.x; probe_id < n_probes; probe_id += gridDim.x) { - const uint32_t list_id = coarse_index[probe_id]; // The id of cluster(list) - - // The number of vectors in each cluster(list); [nlist] - const uint32_t list_length = list_sizes[list_id]; - - // The number of interleaved groups to be processed - const uint32_t num_groups = - align_warp::div(list_length + align_warp::Mask); // ceildiv by power of 2 - - uint32_t sample_offset = 0; - if (probe_id > 0) { sample_offset = chunk_indices[probe_id - 1]; } - assert(list_length == chunk_indices[probe_id] - sample_offset); - if constexpr (!kManageLocalTopK) { - // max_samples is zero/unused in the kManageLocalTopK mode - assert(sample_offset + list_length <= max_samples); - } - - constexpr int kUnroll = raft::WarpSize / Veclen; - constexpr uint32_t kNumWarps = kThreadsPerBlock / raft::WarpSize; - // Every warp reads WarpSize vectors and computes the distances to them. - // Then, the distances and corresponding ids are distributed among the threads, - // and each thread adds one (id, dist) pair to the filtering queue. 
- for (uint32_t group_id = align_warp::div(threadIdx.x); group_id < num_groups; - group_id += kNumWarps) { - AccT dist = 0; - AccT norm_query = 0; - AccT norm_dataset = 0; - // This is where this warp begins reading data (start position of an interleaved group) - const T* data = list_data_ptrs[list_id] + (group_id * kIndexGroupSize) * dim; - - // This is the vector a given lane/thread handles - const uint32_t vec_id = group_id * raft::WarpSize + lane_id; - const bool valid = - vec_id < list_length && sample_filter(queries_offset + blockIdx.y, list_id, vec_id); - - if (valid) { - // Process first shm_assisted_dim dimensions (always using shared memory) - loadAndComputeDist lc( - dist, compute_dist, norm_query, norm_dataset); - for (int pos = 0; pos < shm_assisted_dim; - pos += raft::WarpSize, data += kIndexGroupSize * raft::WarpSize) { - lc.runLoadShmemCompute(data, query_shared, lane_id, pos); - } - - if (dim > query_smem_elems) { - // The default path - using shfl ops - for dimensions beyond query_smem_elems - loadAndComputeDist lc( - dist, compute_dist, norm_query, norm_dataset); - for (int pos = shm_assisted_dim; pos < full_warps_along_dim; pos += raft::WarpSize) { - lc.runLoadShflAndCompute(data, query, pos, lane_id); - } - lc.runLoadShflAndComputeRemainder(data, query, lane_id, dim, full_warps_along_dim); - } else { - // when shm_assisted_dim == full_warps_along_dim < dim - loadAndComputeDist<1, decltype(compute_dist), Veclen, T, AccT, ComputeNorm> lc( - dist, compute_dist, norm_query, norm_dataset); - for (int pos = full_warps_along_dim; pos < dim; - pos += Veclen, data += kIndexGroupSize * Veclen) { - lc.runLoadShmemCompute(data, query_shared, lane_id, pos); - } - } - } - - // Enqueue one element per thread - float val = valid ? 
static_cast(dist) : local_topk_t::queue_t::kDummy; - - if constexpr (ComputeNorm) { - if (valid) - val = val / (raft::sqrt(static_cast(norm_query)) * - raft::sqrt(static_cast(norm_dataset))); - } - if constexpr (kManageLocalTopK) { - queue.add(val, sample_offset + vec_id); - } else { - if (vec_id < list_length) distances[sample_offset + vec_id] = val; - } - } - - // fill up unused slots for current query - if constexpr (!kManageLocalTopK) { - if (probe_id + 1 == n_probes) { - for (uint32_t i = threadIdx.x + sample_offset + list_length; i < max_samples; - i += blockDim.x) { - distances[i] = local_topk_t::queue_t::kDummy; - } - } - } - } - } - - // finalize and store selected neighbours - if constexpr (kManageLocalTopK) { - __syncthreads(); - queue.done(interleaved_scan_kernel_smem); - queue.store(distances, neighbors, post_process); - } -} - -/** - * Configure the gridDim.x to maximize GPU occupancy, but reduce the output size - */ -template -uint32_t configure_launch_x(uint32_t numQueries, uint32_t n_probes, int32_t sMemSize, T func) -{ - int dev_id; - RAFT_CUDA_TRY(cudaGetDevice(&dev_id)); - int num_sms; - RAFT_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id)); - int num_blocks_per_sm = 0; - RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &num_blocks_per_sm, func, kThreadsPerBlock, sMemSize)); - - size_t min_grid_size = num_sms * num_blocks_per_sm; - size_t min_grid_x = raft::ceildiv(min_grid_size, numQueries); - return min_grid_x > n_probes ? 
n_probes : static_cast(min_grid_x); -} - -template -void launch_kernel(Lambda lambda, - PostLambda post_process, - const index& index, - const T* queries, - const uint32_t* coarse_index, - const uint32_t num_queries, - const uint32_t queries_offset, - const uint32_t n_probes, - const uint32_t k, - const uint32_t max_samples, - const uint32_t* chunk_indices, - IvfSampleFilterT sample_filter, - uint32_t* neighbors, - float* distances, - uint32_t& grid_dim_x, - rmm::cuda_stream_view stream) -{ - RAFT_EXPECTS(Veclen == index.veclen(), - "Configured Veclen does not match the index interleaving pattern."); - constexpr auto kKernel = interleaved_scan_kernel; - const int max_query_smem = 16384; - int query_smem_elems = std::min(max_query_smem / sizeof(T), - raft::Pow2::roundUp(index.dim())); - int smem_size = query_smem_elems * sizeof(T); - - if constexpr (Capacity > 0) { - constexpr int kSubwarpSize = std::min(Capacity, raft::WarpSize); - auto block_merge_mem = - raft::matrix::detail::select::warpsort::calc_smem_size_for_block_wide( - kThreadsPerBlock / kSubwarpSize, k); - smem_size += std::max(smem_size, block_merge_mem); - } - - // power-of-two less than cuda limit (for better addr alignment) - constexpr uint32_t kMaxGridY = 32768; - - if (grid_dim_x == 0) { - grid_dim_x = configure_launch_x(std::min(kMaxGridY, num_queries), n_probes, smem_size, kKernel); - return; - } - - for (uint32_t query_offset = 0; query_offset < num_queries; query_offset += kMaxGridY) { - uint32_t grid_dim_y = std::min(kMaxGridY, num_queries - query_offset); - dim3 grid_dim(grid_dim_x, grid_dim_y, 1); - dim3 block_dim(kThreadsPerBlock); - RAFT_LOG_TRACE( - "Launching the ivf-flat interleaved_scan_kernel (%d, %d, 1) x (%d, 1, 1), n_probes = %d, " - "smem_size = %d", - grid_dim.x, - grid_dim.y, - block_dim.x, - n_probes, - smem_size); - kKernel<<>>(lambda, - post_process, - query_smem_elems, - queries, - coarse_index, - index.data_ptrs().data_handle(), - index.list_sizes().data_handle(), - 
queries_offset + query_offset, - n_probes, - k, - max_samples, - chunk_indices, - index.dim(), - sample_filter, - neighbors, - distances); - queries += grid_dim_y * index.dim(); - if constexpr (Capacity > 0) { - neighbors += grid_dim_y * grid_dim_x * k; - distances += grid_dim_y * grid_dim_x * k; - } else { - distances += grid_dim_y * max_samples; - } - chunk_indices += grid_dim_y * n_probes; - coarse_index += grid_dim_y * n_probes; - } -} - -template -struct euclidean_dist { - __device__ __forceinline__ void operator()(AccT& acc, AccT x, AccT y) - { - const auto diff = x - y; - acc += diff * diff; - } -}; - -template -struct euclidean_dist { - __device__ __forceinline__ void operator()(uint32_t& acc, uint32_t x, uint32_t y) - { - if constexpr (Veclen > 1) { - const auto diff = __vabsdiffu4(x, y); - acc = raft::dp4a(diff, diff, acc); - } else { - const auto diff = __usad(x, y, 0u); - acc += diff * diff; - } - } -}; - -template -struct euclidean_dist { - __device__ __forceinline__ void operator()(int32_t& acc, int32_t x, int32_t y) - { - if constexpr (Veclen > 1) { - // Note that we enforce here that the unsigned version of dp4a is used, because the difference - // between two int8 numbers can be greater than 127 and therefore represented as a negative - // number in int8. Casting from int8 to int32 would yield incorrect results, while casting - // from uint8 to uint32 is correct. - const auto diff = __vabsdiffs4(x, y); - acc = raft::dp4a(diff, diff, static_cast(acc)); - } else { - const auto diff = x - y; - acc += diff * diff; - } - } -}; - -template -struct inner_prod_dist { - __device__ __forceinline__ void operator()(AccT& acc, AccT x, AccT y) - { - if constexpr (Veclen > 1 && (std::is_same_v || std::is_same_v)) { - acc = raft::dp4a(x, y, acc); - } else { - acc += x * y; - } - } -}; - -/** Select the distance computation function and forward the rest of the arguments. */ -template -void launch_with_fixed_consts(cuvs::distance::DistanceType metric, Args&&... 
args) -{ - switch (metric) { - case cuvs::distance::DistanceType::L2Expanded: - case cuvs::distance::DistanceType::L2Unexpanded: - return launch_kernel, - raft::identity_op>({}, {}, std::forward(args)...); - case cuvs::distance::DistanceType::L2SqrtExpanded: - case cuvs::distance::DistanceType::L2SqrtUnexpanded: - return launch_kernel, - raft::sqrt_op>({}, {}, std::forward(args)...); - case cuvs::distance::DistanceType::InnerProduct: - return launch_kernel, - raft::identity_op>({}, {}, std::forward(args)...); - case cuvs::distance::DistanceType::CosineExpanded: - // NB: "Ascending" is reversed because the post-processing step is done after that sort - return launch_kernel>( - {}, - raft::compose_op(raft::add_const_op{1.0f}, raft::mul_const_op{-1.0f}), - std::forward(args)...); // NB: update the description of `knn::ivf_flat::build` when - // adding here a new metric. - default: RAFT_FAIL("The chosen distance metric is not supported (%d)", int(metric)); - } -} - -/** - * Lift the `capacity` and `veclen` parameters to the template level, - * forward the rest of the arguments unmodified to `launch_interleaved_scan_kernel`. - */ -template (1, 16 / sizeof(T))> -struct select_interleaved_scan_kernel { - /** - * Recursively reduce the `Capacity` and `Veclen` parameters until they match the - * corresponding runtime arguments. - * By default, this recursive process starts with maximum possible values of the - * two parameters and ends with both values equal to 1. - */ - template - static inline void run(int k_max, int veclen, bool select_min, Args&&... 
args) - { - if constexpr (Capacity > 0) { - if (k_max == 0 || k_max > Capacity) { - return select_interleaved_scan_kernel::run( - k_max, veclen, select_min, std::forward(args)...); - } - } - if constexpr (Capacity > 1) { - if (k_max * 2 <= Capacity) { - return select_interleaved_scan_kernel::run(k_max, - veclen, - select_min, - std::forward(args)...); - } - } - if constexpr (Veclen > 1) { - if (veclen % Veclen != 0) { - return select_interleaved_scan_kernel::run( - k_max, 1, select_min, std::forward(args)...); - } - } - // NB: this is the limitation of the warpsort structures that use a huge number of - // registers (used in the main kernel here). - RAFT_EXPECTS(Capacity == 0 || k_max == Capacity, - "Capacity must be either 0 or a power-of-two not bigger than the maximum " - "allowed size matrix::detail::select::warpsort::kMaxCapacity (%d).", - raft::matrix::detail::select::warpsort::kMaxCapacity); - RAFT_EXPECTS( - veclen == Veclen, - "Veclen must be power-of-two not bigger than the maximum allowed size for this data type."); - if (select_min) { - launch_with_fixed_consts( - std::forward(args)...); - } else { - launch_with_fixed_consts( - std::forward(args)...); - } - } -}; - -/** - * @brief Configure and launch an appropriate template instance of the interleaved scan kernel. - * - * @tparam T value type - * @tparam AccT accumulated type - * @tparam IdxT type of the indices - * - * @param index previously built ivf-flat index - * @param[in] queries device pointer to the query vectors [batch_size, dim] - * @param[in] coarse_query_results device pointer to the cluster (list) ids [batch_size, n_probes] - * @param n_queries batch size - * @param[in] queries_offset - * An offset of the current query batch. It is used for feeding sample_filter with the - * correct query index. - * @param metric type of the measured distance - * @param n_probes number of nearest clusters to query - * @param k number of nearest neighbors. 
- * NB: the maximum value of `k` is limited statically by `kMaxCapacity`. - * @param select_min whether to select nearest (true) or furthest (false) points w.r.t. the given - * metric. - * @param[out] neighbors device pointer to the result indices for each query and cluster - * [batch_size, grid_dim_x, k] - * @param[out] distances device pointer to the result distances for each query and cluster - * [batch_size, grid_dim_x, k] - * @param[inout] grid_dim_x number of blocks launched across all n_probes clusters; - * (one block processes one or more probes, hence: 1 <= grid_dim_x <= n_probes) - * @param stream - * @param sample_filter - * A filter that selects samples for a given query. Use an instance of none_sample_filter to - * provide a green light for every sample. - */ -template -void ivfflat_interleaved_scan(const index& index, - const T* queries, - const uint32_t* coarse_query_results, - const uint32_t n_queries, - const uint32_t queries_offset, - const cuvs::distance::DistanceType metric, - const uint32_t n_probes, - const uint32_t k, - const uint32_t max_samples, - const uint32_t* chunk_indices, - const bool select_min, - IvfSampleFilterT sample_filter, - uint32_t* neighbors, - float* distances, - uint32_t& grid_dim_x, - rmm::cuda_stream_view stream) -{ - const int capacity = raft::bound_by_power_of_two(k); - - auto filter_adapter = cuvs::neighbors::filtering::ivf_to_sample_filter( - index.inds_ptrs().data_handle(), sample_filter); - select_interleaved_scan_kernel::run(capacity, - index.veclen(), - select_min, - metric, - index, - queries, - coarse_query_results, - n_queries, - queries_offset, - n_probes, - k, - max_samples, - chunk_indices, - filter_adapter, - neighbors, - distances, - grid_dim_x, - stream); -} - -} // namespace cuvs::neighbors::ivf_flat::detail diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_interleaved_scan_explicit_inst.cuh b/cpp/src/neighbors/ivf_flat/ivf_flat_interleaved_scan_explicit_inst.cuh index 81833a63b1..25e7eda686 100644 --- 
a/cpp/src/neighbors/ivf_flat/ivf_flat_interleaved_scan_explicit_inst.cuh +++ b/cpp/src/neighbors/ivf_flat/ivf_flat_interleaved_scan_explicit_inst.cuh @@ -6,11 +6,7 @@ #pragma once #include "../detail/ann_utils.cuh" -#ifdef CUVS_ENABLE_JIT_LTO #include "ivf_flat_interleaved_scan_jit.cuh" -#else -#include "ivf_flat_interleaved_scan.cuh" -#endif #include #include #include diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_interleaved_scan_jit.cuh b/cpp/src/neighbors/ivf_flat/ivf_flat_interleaved_scan_jit.cuh index be8652dd59..27e00f9fee 100644 --- a/cpp/src/neighbors/ivf_flat/ivf_flat_interleaved_scan_jit.cuh +++ b/cpp/src/neighbors/ivf_flat/ivf_flat_interleaved_scan_jit.cuh @@ -66,36 +66,6 @@ constexpr auto get_filter_type_tag() } } -template -constexpr auto get_metric_name() -{ - if constexpr (std::is_same_v>) { - return "euclidean"; - } - if constexpr (std::is_same_v>) { - return "inner_prod"; - } -} - -template -constexpr auto get_filter_name() -{ - if constexpr (std::is_same_v>) { - return "filter_none"; - } - if constexpr (std::is_same_v>) { - return "filter_bitset"; - } -} - -template -constexpr auto get_post_lambda_name() -{ - if constexpr (std::is_same_v) { return "post_identity"; } - if constexpr (std::is_same_v) { return "post_sqrt"; } - if constexpr (std::is_same_v) { return "post_compose"; } -} - /** * Configure the gridDim.x to maximize GPU occupancy, but reduce the output size */ @@ -148,16 +118,17 @@ void launch_kernel(const index& index, RAFT_EXPECTS(Veclen == index.veclen(), "Configured Veclen does not match the index interleaving pattern."); + using DataTag = decltype(get_data_type_tag()); + using AccTag = decltype(get_acc_type_tag()); + using IdxTag = decltype(get_idx_type_tag()); + // Use tag types for the planner to avoid template bloat - auto kernel_planner = InterleavedScanPlanner()), - decltype(get_acc_type_tag()), - decltype(get_idx_type_tag())>( - Capacity, Veclen, Ascending, ComputeNorm); - kernel_planner.template 
add_metric_device_function()), - decltype(get_acc_type_tag())>( - get_metric_name(), Veclen); - kernel_planner.add_filter_device_function(get_filter_name()); - kernel_planner.add_post_lambda_device_function(get_post_lambda_name()); + InterleavedScanPlanner kernel_planner; + kernel_planner + .add_entrypoint(); + kernel_planner.add_metric_device_function(); + kernel_planner.add_filter_device_function(); + kernel_planner.add_post_lambda_device_function(); auto kernel_launcher = kernel_planner.get_launcher(); const int max_query_smem = 16384; @@ -250,7 +221,7 @@ void launch_with_fixed_consts(cuvs::distance::DistanceType metric, Args&&... arg AccT, IdxT, IvfSampleFilterTag, - tag_metric_euclidean, + tag_metric_euclidean, tag_post_identity>(std::forward(args)...); case cuvs::distance::DistanceType::L2SqrtExpanded: case cuvs::distance::DistanceType::L2SqrtUnexpanded: @@ -262,7 +233,7 @@ void launch_with_fixed_consts(cuvs::distance::DistanceType metric, Args&&... arg AccT, IdxT, IvfSampleFilterTag, - tag_metric_euclidean, + tag_metric_euclidean, tag_post_sqrt>(std::forward(args)...); case cuvs::distance::DistanceType::InnerProduct: return launch_kernel, + tag_metric_inner_product, tag_post_identity>(std::forward(args)...); case cuvs::distance::DistanceType::CosineExpanded: // NB: "Ascending" is reversed because the post-processing step is done after that sort @@ -285,7 +256,7 @@ void launch_with_fixed_consts(cuvs::distance::DistanceType metric, Args&&... arg AccT, IdxT, IvfSampleFilterTag, - tag_metric_inner_product, + tag_metric_inner_product, tag_post_compose>( std::forward(args)...); // NB: update the description of `knn::ivf_flat::build` when // adding here a new metric. 
diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_search.cuh b/cpp/src/neighbors/ivf_flat/ivf_flat_search.cuh index 3379e7b8dc..6712a1f7f2 100644 --- a/cpp/src/neighbors/ivf_flat/ivf_flat_search.cuh +++ b/cpp/src/neighbors/ivf_flat/ivf_flat_search.cuh @@ -348,19 +348,20 @@ inline void search_with_filtering(raft::resources const& handle, for (uint32_t offset_q = 0; offset_q < n_queries; offset_q += max_queries) { uint32_t queries_batch = raft::min(max_queries, n_queries - offset_q); - search_impl(handle, - index, - queries + offset_q * index.dim(), - queries_batch, - offset_q, - k, - n_probes, - max_samples, - cuvs::distance::is_min_close(index.metric()), - neighbors + offset_q * k, - distances + offset_q * k, - raft::resource::get_workspace_resource(handle), - sample_filter); + search_impl( + handle, + index, + queries + offset_q * index.dim(), + queries_batch, + offset_q, + k, + n_probes, + max_samples, + cuvs::distance::is_min_close(index.metric()), + neighbors + offset_q * k, + distances + offset_q * k, + raft::resource::get_workspace_resource_ref(handle), + sample_filter); } } diff --git a/cpp/src/neighbors/ivf_flat/jit_lto_kernels/filter_embedded.cpp.in b/cpp/src/neighbors/ivf_flat/jit_lto_kernels/filter_embedded.cpp.in index a5a7299b73..d06a023b5f 100644 --- a/cpp/src/neighbors/ivf_flat/jit_lto_kernels/filter_embedded.cpp.in +++ b/cpp/src/neighbors/ivf_flat/jit_lto_kernels/filter_embedded.cpp.in @@ -3,19 +3,18 @@ * SPDX-License-Identifier: Apache-2.0 */ -// This file is auto-generated. Do not edit manually. 
- -#include +#include +#include #include "@embedded_header_file@" -namespace { +namespace cuvs::neighbors::ivf_flat::detail { -__attribute__((__constructor__)) void register_kernel() -{ - registerAlgorithm( - "@filter_name@", - embedded_fatbin, - sizeof(embedded_fatbin)); -} +using _FragmentEntry = FilterFragmentEntry>; + +template <> +const uint8_t* const _FragmentEntry::data = embedded_fatbin; + +template <> +const size_t _FragmentEntry::length = sizeof(embedded_fatbin); } diff --git a/cpp/src/neighbors/ivf_flat/jit_lto_kernels/filter_kernel.cu.in b/cpp/src/neighbors/ivf_flat/jit_lto_kernels/filter_kernel.cu.in index a4c2f18f53..8973c1dd27 100644 --- a/cpp/src/neighbors/ivf_flat/jit_lto_kernels/filter_kernel.cu.in +++ b/cpp/src/neighbors/ivf_flat/jit_lto_kernels/filter_kernel.cu.in @@ -3,8 +3,6 @@ * SPDX-License-Identifier: Apache-2.0 */ -// This file is auto-generated. Do not edit manually. - #include <@header_file@> namespace cuvs::neighbors::ivf_flat::detail { diff --git a/cpp/src/neighbors/ivf_flat/jit_lto_kernels/interleaved_scan_embedded.cpp.in b/cpp/src/neighbors/ivf_flat/jit_lto_kernels/interleaved_scan_embedded.cpp.in index 1078d03b00..16c96a6fe3 100644 --- a/cpp/src/neighbors/ivf_flat/jit_lto_kernels/interleaved_scan_embedded.cpp.in +++ b/cpp/src/neighbors/ivf_flat/jit_lto_kernels/interleaved_scan_embedded.cpp.in @@ -3,16 +3,18 @@ * SPDX-License-Identifier: Apache-2.0 */ -// This file is auto-generated. Do not edit manually. 
- -#include +#include +#include #include "@embedded_header_file@" -namespace { +namespace cuvs::neighbors::ivf_flat::detail { -__attribute__((__constructor__)) void register_kernel() -{ - registerAlgorithm("@kernel_name@", embedded_fatbin, sizeof(embedded_fatbin)); -} +using _FragmentEntry = InterleavedScanFragmentEntry; + +template <> +const uint8_t* const _FragmentEntry::data = embedded_fatbin; + +template <> +const size_t _FragmentEntry::length = sizeof(embedded_fatbin); } diff --git a/cpp/src/neighbors/ivf_flat/jit_lto_kernels/interleaved_scan_kernel.cu.in b/cpp/src/neighbors/ivf_flat/jit_lto_kernels/interleaved_scan_kernel.cu.in index 41210bfdee..1cc89a24c3 100644 --- a/cpp/src/neighbors/ivf_flat/jit_lto_kernels/interleaved_scan_kernel.cu.in +++ b/cpp/src/neighbors/ivf_flat/jit_lto_kernels/interleaved_scan_kernel.cu.in @@ -3,8 +3,6 @@ * SPDX-License-Identifier: Apache-2.0 */ -// This file is auto-generated. Do not edit manually. - #include namespace cuvs::neighbors::ivf_flat::detail { diff --git a/cpp/src/neighbors/ivf_flat/jit_lto_kernels/interleaved_scan_planner.hpp b/cpp/src/neighbors/ivf_flat/jit_lto_kernels/interleaved_scan_planner.hpp index 07c2b4a6da..f8f3571bbb 100644 --- a/cpp/src/neighbors/ivf_flat/jit_lto_kernels/interleaved_scan_planner.hpp +++ b/cpp/src/neighbors/ivf_flat/jit_lto_kernels/interleaved_scan_planner.hpp @@ -6,43 +6,52 @@ #pragma once #include -#include #include +#include #include #include #include -template +namespace cuvs::neighbors::ivf_flat::detail { + struct InterleavedScanPlanner : AlgorithmPlanner { - InterleavedScanPlanner(int Capacity, int Veclen, bool Ascending, bool ComputeNorm) - : AlgorithmPlanner("interleaved_scan_capacity_" + std::to_string(Capacity) + "_veclen_" + - std::to_string(Veclen) + "_" + (Ascending ? "ascending" : "descending") + - "_" + (ComputeNorm ? 
"compute_norm" : "no_compute_norm") + "_data_" + - cuvs::neighbors::ivf_flat::detail::tag_abbrev::value + - "_acc_" + - cuvs::neighbors::ivf_flat::detail::tag_abbrev::value + - "_idx_" + cuvs::neighbors::ivf_flat::detail::tag_abbrev::value, - "interleaved_scan") + InterleavedScanPlanner() : AlgorithmPlanner("interleaved_scan") {} + + template + void add_entrypoint() { + this->add_fragment>(); } - template - void add_metric_device_function(std::string metric_name, int Veclen) + template + void add_metric_device_function() { - auto key = metric_name + "_veclen_" + std::to_string(Veclen); - auto params = make_fragment_key(); - this->device_functions.push_back(key + "_" + params); + this->add_fragment>(); } - void add_filter_device_function(std::string filter_name) + template + void add_filter_device_function() { - auto key = filter_name; - this->device_functions.push_back(key); + this->add_fragment>(); } - void add_post_lambda_device_function(std::string post_lambda_name) + template + void add_post_lambda_device_function() { - auto key = post_lambda_name; - this->device_functions.push_back(key); + this->add_fragment>(); } }; + +} // namespace cuvs::neighbors::ivf_flat::detail diff --git a/cpp/src/neighbors/ivf_flat/jit_lto_kernels/metric_embedded.cpp.in b/cpp/src/neighbors/ivf_flat/jit_lto_kernels/metric_embedded.cpp.in index b951476565..2a895d9917 100644 --- a/cpp/src/neighbors/ivf_flat/jit_lto_kernels/metric_embedded.cpp.in +++ b/cpp/src/neighbors/ivf_flat/jit_lto_kernels/metric_embedded.cpp.in @@ -3,23 +3,18 @@ * SPDX-License-Identifier: Apache-2.0 */ -// This file is auto-generated. Do not edit manually. 
- -#include +#include #include #include "@embedded_header_file@" -using namespace cuvs::neighbors::ivf_flat::detail; +namespace cuvs::neighbors::ivf_flat::detail { -namespace { +using _FragmentEntry = MetricFragmentEntry<@veclen@, tag_@type_abbrev@, tag_acc_@acc_abbrev@, tag_metric_@metric_name@>; -__attribute__((__constructor__)) void register_kernel() -{ - registerAlgorithm( - "@metric_name@_veclen_@veclen@", - embedded_fatbin, - sizeof(embedded_fatbin)); -} +template <> +const uint8_t* const _FragmentEntry::data = embedded_fatbin; + +template <> +const size_t _FragmentEntry::length = sizeof(embedded_fatbin); } diff --git a/cpp/src/neighbors/ivf_flat/jit_lto_kernels/metric_kernel.cu.in b/cpp/src/neighbors/ivf_flat/jit_lto_kernels/metric_kernel.cu.in index a67956db58..09dedc2bb2 100644 --- a/cpp/src/neighbors/ivf_flat/jit_lto_kernels/metric_kernel.cu.in +++ b/cpp/src/neighbors/ivf_flat/jit_lto_kernels/metric_kernel.cu.in @@ -3,8 +3,6 @@ * SPDX-License-Identifier: Apache-2.0 */ -// This file is auto-generated. Do not edit manually. 
- #include <@header_file@> namespace cuvs::neighbors::ivf_flat::detail { diff --git a/cpp/src/neighbors/ivf_flat/jit_lto_kernels/metric_matrix.json b/cpp/src/neighbors/ivf_flat/jit_lto_kernels/metric_matrix.json index 0629e837b6..55c71c1060 100644 --- a/cpp/src/neighbors/ivf_flat/jit_lto_kernels/metric_matrix.json +++ b/cpp/src/neighbors/ivf_flat/jit_lto_kernels/metric_matrix.json @@ -5,7 +5,7 @@ "header_file": "neighbors/ivf_flat/jit_lto_kernels/metric_euclidean.cuh" }, { - "metric_name": "inner_prod", + "metric_name": "inner_product", "header_file": "neighbors/ivf_flat/jit_lto_kernels/metric_inner_prod.cuh" } ], diff --git a/cpp/src/neighbors/ivf_flat/jit_lto_kernels/post_lambda_embedded.cpp.in b/cpp/src/neighbors/ivf_flat/jit_lto_kernels/post_lambda_embedded.cpp.in index a2e3f1ea03..172e741293 100644 --- a/cpp/src/neighbors/ivf_flat/jit_lto_kernels/post_lambda_embedded.cpp.in +++ b/cpp/src/neighbors/ivf_flat/jit_lto_kernels/post_lambda_embedded.cpp.in @@ -3,19 +3,18 @@ * SPDX-License-Identifier: Apache-2.0 */ -// This file is auto-generated. Do not edit manually. - -#include +#include +#include #include "@embedded_header_file@" -namespace { +namespace cuvs::neighbors::ivf_flat::detail { -__attribute__((__constructor__)) void register_kernel() -{ - registerAlgorithm( - "@post_lambda_name@", - embedded_fatbin, - sizeof(embedded_fatbin)); -} +using _FragmentEntry = PostLambdaFragmentEntry; + +template <> +const uint8_t* const _FragmentEntry::data = embedded_fatbin; + +template <> +const size_t _FragmentEntry::length = sizeof(embedded_fatbin); } diff --git a/cpp/src/neighbors/ivf_flat/jit_lto_kernels/post_lambda_kernel.cu.in b/cpp/src/neighbors/ivf_flat/jit_lto_kernels/post_lambda_kernel.cu.in index 363964dd42..99823843c6 100644 --- a/cpp/src/neighbors/ivf_flat/jit_lto_kernels/post_lambda_kernel.cu.in +++ b/cpp/src/neighbors/ivf_flat/jit_lto_kernels/post_lambda_kernel.cu.in @@ -3,8 +3,6 @@ * SPDX-License-Identifier: Apache-2.0 */ -// This file is auto-generated. 
Do not edit manually. - #include <@header_file@> namespace cuvs::neighbors::ivf_flat::detail { diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh index b2da2bb821..2b6eb22bac 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh @@ -82,7 +82,7 @@ void select_residuals(raft::resources const& handle, const float* center, // [dim] const T* dataset, // [.., dim] const IdxT* row_ids, // [n_rows] - rmm::mr::device_memory_resource* device_memory + rmm::device_async_resource_ref device_memory ) { @@ -338,7 +338,7 @@ void train_per_subset(raft::resources const& handle, uint32_t max_train_points_per_pq_code) { auto stream = raft::resource::get_cuda_stream(handle); - auto device_memory = raft::resource::get_workspace_resource(handle); + auto device_memory = raft::resource::get_workspace_resource_ref(handle); rmm::device_uvector pq_centers_tmp(impl->pq_centers().size(), stream, device_memory); // Subsampling the train set for codebook generation based on max_train_points_per_pq_code. @@ -420,7 +420,7 @@ void train_per_cluster(raft::resources const& handle, uint32_t max_train_points_per_pq_code) { auto stream = raft::resource::get_cuda_stream(handle); - auto device_memory = raft::resource::get_workspace_resource(handle); + auto device_memory = raft::resource::get_workspace_resource_ref(handle); // NB: Managed memory is used for small arrays accessed from both device and host. There's no // performance reasoning behind this, just avoiding the boilerplate of explicit copies. 
rmm::mr::managed_memory_resource managed_memory; @@ -600,7 +600,7 @@ void reconstruct_list_data(raft::resources const& res, auto tmp = raft::make_device_mdarray(res, - raft::resource::get_workspace_resource(res), + raft::resource::get_workspace_resource_ref(res), raft::make_extents(n_rows, index.rot_dim())); constexpr uint32_t kBlockSize = 256; @@ -627,7 +627,7 @@ void reconstruct_list_data(raft::resources const& res, float* out_float_ptr = nullptr; rmm::device_uvector out_float_buf( - 0, raft::resource::get_cuda_stream(res), raft::resource::get_workspace_resource(res)); + 0, raft::resource::get_cuda_stream(res), raft::resource::get_workspace_resource_ref(res)); if constexpr (std::is_same_v) { out_float_ptr = out_vectors.data_handle(); } else { @@ -710,7 +710,7 @@ void encode_list_data(raft::resources const& res, auto n_rows = new_vectors.extent(0); if (n_rows == 0) { return; } - auto mr = raft::resource::get_workspace_resource(res); + auto mr = raft::resource::get_workspace_resource_ref(res); auto new_vectors_residual = raft::make_device_mdarray( res, mr, raft::make_extents(n_rows, index->rot_dim())); @@ -1001,9 +1001,9 @@ void extend(raft::resources const& handle, std::is_same_v, "Unsupported data type"); - rmm::device_async_resource_ref device_memory = raft::resource::get_workspace_resource(handle); + rmm::device_async_resource_ref device_memory = raft::resource::get_workspace_resource_ref(handle); rmm::device_async_resource_ref large_memory = - raft::resource::get_large_workspace_resource(handle); + raft::resource::get_large_workspace_resource_ref(handle); // Try to allocate an index with the same parameters and the projected new size // (which can be slightly larger than index->size() + n_rows, due to padding for interleaved). 
@@ -1268,13 +1268,14 @@ auto build(raft::resources const& handle, size_t(n_rows) / std::max(params.kmeans_trainset_fraction * n_rows, impl->n_lists())); size_t n_rows_train = n_rows / trainset_ratio; - rmm::device_async_resource_ref device_memory = raft::resource::get_workspace_resource(handle); + rmm::device_async_resource_ref device_memory = + raft::resource::get_workspace_resource_ref(handle); // If the trainset is small enough to comfortably fit into device memory, put it there. // Otherwise, use the managed memory. constexpr size_t kTolerableRatio = 4; rmm::device_async_resource_ref big_memory_resource = - raft::resource::get_large_workspace_resource(handle); + raft::resource::get_large_workspace_resource_ref(handle); if (sizeof(float) * n_rows_train * impl->dim() * kTolerableRatio < raft::resource::get_workspace_free_bytes(handle)) { big_memory_resource = device_memory; diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_search.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_search.cuh index 2ff005ad16..bef0119e06 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_search.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_search.cuh @@ -69,7 +69,7 @@ void select_clusters(raft::resources const& handle, cuvs::distance::DistanceType metric, const T* queries, // [n_queries, dim] const float* cluster_centers, // [n_lists, dim_ext] - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { raft::common::nvtx::range fun_scope( "ivf_pq::search::select_clusters(n_probes = %u, n_queries = %u, n_lists = %u, dim = %u)", @@ -179,7 +179,7 @@ void select_clusters(raft::resources const& handle, cuvs::distance::DistanceType metric, const T* queries, // [n_queries, dim] const int8_t* cluster_centers, // [n_lists, dim_ext] - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { raft::common::nvtx::range fun_scope( "ivf_pq::search::select_clusters(n_probes = %u, n_queries = %u, n_lists = %u, dim = %u)", @@ -267,7 +267,7 @@ void select_clusters(raft::resources const& 
handle, cuvs::distance::DistanceType metric, const T* queries, // [n_queries, dim] const half* cluster_centers, // [n_lists, dim_ext] - rmm::mr::device_memory_resource* mr) + rmm::device_async_resource_ref mr) { raft::common::nvtx::range fun_scope( "ivf_pq::search::select_clusters(n_probes = %u, n_queries = %u, n_lists = %u, dim = %u)", @@ -440,7 +440,7 @@ void ivfpq_search_worker(raft::resources const& handle, topK, index.dim()); auto stream = raft::resource::get_cuda_stream(handle); - auto mr = raft::resource::get_workspace_resource(handle); + auto mr = raft::resource::get_workspace_resource_ref(handle); bool manage_local_topk = is_local_topk_feasible(topK, n_probes, n_queries); auto topk_len = manage_local_topk ? n_probes * topK : max_samples; @@ -484,7 +484,7 @@ void ivfpq_search_worker(raft::resources const& handle, if (coresidency > 1) { // Sorting index by cluster number (label). - // The goal is to incrase the L2 cache hit rate to read the vectors + // The goal is to increase the L2 cache hit rate to read the vectors // of a cluster by processing the cluster at the same time as much as // possible. index_list_sorted_buf.resize(n_queries_probes, stream); @@ -899,7 +899,7 @@ inline void search(raft::resources const& handle, max_samples = ms; } - auto mr = raft::resource::get_workspace_resource(handle); + auto mr = raft::resource::get_workspace_resource_ref(handle); // Maximum number of query vectors to search at the same time. // Number of queries in the outer loop, which includes query transform and coarse search. 
diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_transform.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_transform.cuh index ba92c53231..1d55692488 100644 --- a/cpp/src/neighbors/ivf_pq/ivf_pq_transform.cuh +++ b/cpp/src/neighbors/ivf_pq/ivf_pq_transform.cuh @@ -51,7 +51,7 @@ void transform_batch(raft::resources const& res, raft::device_matrix_view output_dataset) { IdxT n_rows = dataset.extent(0); - rmm::device_async_resource_ref mr = raft::resource::get_workspace_resource(res); + rmm::device_async_resource_ref mr = raft::resource::get_workspace_resource_ref(res); // Compute the labels for each vector cuvs::cluster::kmeans::balanced_params kmeans_params; @@ -115,7 +115,7 @@ void transform(raft::resources const& res, raft::common::nvtx::range fun_scope( "ivf_pq::transform(n_rows = %u, dim = %u)", n_rows, dataset.extent(1)); - rmm::device_async_resource_ref mr = raft::resource::get_workspace_resource(res); + rmm::device_async_resource_ref mr = raft::resource::get_workspace_resource_ref(res); // The cluster centers in the index are stored padded, which is not acceptable by // the kmeans_balanced::predict. Thus, we need the restructuring raft::copy. @@ -138,7 +138,7 @@ void transform(raft::resources const& res, } constexpr size_t max_batch_size = 65536; - rmm::device_async_resource_ref device_memory = raft::resource::get_workspace_resource(res); + rmm::device_async_resource_ref device_memory = raft::resource::get_workspace_resource_ref(res); utils::batch_load_iterator vec_batches(dataset.data_handle(), n_rows, diff --git a/cpp/src/neighbors/nn_descent.cuh b/cpp/src/neighbors/nn_descent.cuh index 9015a521c2..abe45f0a6f 100644 --- a/cpp/src/neighbors/nn_descent.cuh +++ b/cpp/src/neighbors/nn_descent.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ @@ -44,7 +44,7 @@ namespace cuvs::neighbors::nn_descent { * * @tparam T data-type of the input dataset * @tparam IdxT data-type for the output index - * @param[in] res raft::resources is an object mangaging resources + * @param[in] res raft::resources is an object managing resources * @param[in] params an instance of nn_descent::index_params that are parameters * to run the nn-descent algorithm * @param[in] dataset raft::device_matrix_view input dataset expected to be located @@ -85,7 +85,7 @@ auto build(raft::resources const& res, * * @tparam T data-type of the input dataset * @tparam IdxT data-type for the output index - * @param res raft::resources is an object mangaging resources + * @param res raft::resources is an object managing resources * @param[in] params an instance of nn_descent::index_params that are parameters * to run the nn-descent algorithm * @param[in] dataset raft::device_matrix_view input dataset expected to be located @@ -126,7 +126,7 @@ void build(raft::resources const& res, * * @tparam T data-type of the input dataset * @tparam IdxT data-type for the output index - * @param res raft::resources is an object mangaging resources + * @param res raft::resources is an object managing resources * @param[in] params an instance of nn_descent::index_params that are parameters * to run the nn-descent algorithm * @param[in] dataset raft::host_matrix_view input dataset expected to be located @@ -167,7 +167,7 @@ auto build(raft::resources const& res, * * @tparam T data-type of the input dataset * @tparam IdxT data-type for the output index - * @param[in] res raft::resources is an object mangaging resources + * @param[in] res raft::resources is an object managing resources * @param[in] params an instance of nn_descent::index_params that are parameters * to run the nn-descent algorithm * @param[in] dataset raft::host_matrix_view input dataset expected to be located diff --git 
a/cpp/src/neighbors/scann/detail/scann_avq.cuh b/cpp/src/neighbors/scann/detail/scann_avq.cuh index e7c1663f3e..51e9e12eb8 100644 --- a/cpp/src/neighbors/scann/detail/scann_avq.cuh +++ b/cpp/src/neighbors/scann/detail/scann_avq.cuh @@ -59,7 +59,7 @@ void compute_cluster_offsets(raft::resources const& dev_resources, { cudaStream_t stream = raft::resource::get_cuda_stream(dev_resources); rmm::device_async_resource_ref device_memory = - raft::resource::get_workspace_resource(dev_resources); + raft::resource::get_workspace_resource_ref(dev_resources); // Histrogram to compute cluster sizes int num_levels = cluster_sizes.extent(0) + 1; @@ -138,7 +138,7 @@ void sum_reduce_vector(raft::resources const& dev_resources, { cudaStream_t stream = raft::resource::get_cuda_stream(dev_resources); rmm::device_async_resource_ref device_memory = - raft::resource::get_workspace_resource(dev_resources); + raft::resource::get_workspace_resource_ref(dev_resources); size_t temp_storage_bytes = 0; @@ -166,7 +166,7 @@ void cholesky_solver(raft::resources const& dev_resources, cudaStream_t stream = raft::resource::get_cuda_stream(dev_resources); cusolverDnHandle_t cusolverH = raft::resource::get_cusolver_dn_handle(dev_resources); rmm::device_async_resource_ref device_memory = - raft::resource::get_workspace_resource(dev_resources); + raft::resource::get_workspace_resource_ref(dev_resources); // RAFT_CUSOLVER_TRY(cusolverDnSetStream(cusolverH, stream)); diff --git a/cpp/src/neighbors/scann/detail/scann_build.cuh b/cpp/src/neighbors/scann/detail/scann_build.cuh index 7805f622d3..bfe38209f7 100644 --- a/cpp/src/neighbors/scann/detail/scann_build.cuh +++ b/cpp/src/neighbors/scann/detail/scann_build.cuh @@ -8,6 +8,7 @@ #include "../../detail/ann_utils.cuh" #include #include +#include #include #include @@ -86,7 +87,7 @@ index build( raft::device_vector_view labels_view = idx.labels(); // setup batching for kmeans prediction + quantization - auto* device_memory = 
raft::resource::get_workspace_resource(res); + auto device_memory = raft::resource::get_workspace_resource_ref(res); constexpr size_t kReasonableMaxBatchSize = 65536; size_t max_batch_size = std::min(dataset.extent(0), kReasonableMaxBatchSize); @@ -159,47 +160,17 @@ index build( int dim_per_subspace = params.pq_dim; int num_clusters = 1 << params.pq_bits; - auto full_codebook = - raft::make_device_matrix(res, num_clusters * num_subspaces, dim_per_subspace); - - // Loop each subspace, training codebooks for each - for (int subspace = 0; subspace < num_subspaces; subspace++) { - int sub_dim_start = subspace * dim_per_subspace; - int sub_dim_end = (subspace + 1) * dim_per_subspace; - - auto sub_trainset = raft::make_device_matrix( - res, trainset_residuals.extent(0), (int64_t)dim_per_subspace); - raft::matrix::slice_coordinates avq_sub_coords( - 0, sub_dim_start, trainset_residuals.extent(0), sub_dim_end); - raft::matrix::slice( - res, raft::make_const_mdspan(trainset_residuals.view()), sub_trainset.view(), avq_sub_coords); - - // Set up quantization bits and params - cuvs::neighbors::vpq_params pq_params; - pq_params.pq_bits = params.pq_bits; - // For VPQ, pq_dim is the number of subspaces, not the dimension of the subspaces - pq_params.pq_dim = 1; - // We handle sampling/training set construction above, so use the full set in VPQ - pq_params.pq_kmeans_trainset_fraction = 1.0; - pq_params.kmeans_n_iters = params.pq_train_iters; - - // Create pq codebook for this subspace - auto sub_pq_codebook = - create_pq_codebook(res, raft::make_const_mdspan(sub_trainset.view()), pq_params); - - raft::copy( - res, - raft::make_device_vector_view( - full_codebook.data_handle() + (subspace * sub_pq_codebook.size()), sub_pq_codebook.size()), - raft::make_device_vector_view(sub_pq_codebook.data_handle(), - sub_pq_codebook.size())); - } - raft::resource::sync_stream(res); + cuvs::preprocessing::quantize::pq::params pq_build_params; + pq_build_params.pq_bits = params.pq_bits; + 
pq_build_params.pq_dim = num_subspaces; + pq_build_params.use_subspaces = true; + pq_build_params.use_vq = false; // We already computed residuals + pq_build_params.kmeans_n_iters = params.pq_train_iters; + pq_build_params.max_train_points_per_pq_code = pq_n_rows_train / num_clusters; + pq_build_params.pq_kmeans_type = cuvs::cluster::kmeans::kmeans_type::KMeansBalanced; - // Set up quantization bits and params - cuvs::neighbors::vpq_params pq_params; - pq_params.pq_bits = params.pq_bits; - pq_params.pq_dim = dataset.extent(1) / params.pq_dim; + auto pq_quantizer = cuvs::preprocessing::quantize::pq::build( + res, pq_build_params, raft::make_const_mdspan(trainset_residuals.view())); dataset_vec_batches.reset(); dataset_vec_batches.prefetch_next_batch(); @@ -230,9 +201,11 @@ index build( batch_soar_labels_view, params.soar_lambda); - // Compute and quantize residuals - auto avq_quant = quantize_residuals( - res, raft::make_const_mdspan(avq_residuals.view()), full_codebook.view(), pq_params); + // Compute and quantize residuals using the public PQ API + int64_t codes_dim = cuvs::preprocessing::quantize::pq::get_quantized_dim(pq_build_params); + auto avq_quant = raft::make_device_matrix(res, batch.size(), codes_dim); + cuvs::preprocessing::quantize::pq::transform( + res, pq_quantizer, raft::make_const_mdspan(avq_residuals.view()), avq_quant.view()); // Compute and quantize SOAR residuals auto soar_residuals = @@ -241,56 +214,64 @@ index build( raft::make_const_mdspan(centroids_view), raft::make_const_mdspan(batch_soar_labels_view)); - auto soar_quant = quantize_residuals( - res, raft::make_const_mdspan(soar_residuals.view()), full_codebook.view(), pq_params); + auto soar_quant = raft::make_device_matrix(res, batch.size(), codes_dim); + cuvs::preprocessing::quantize::pq::transform( + res, pq_quantizer, raft::make_const_mdspan(soar_residuals.view()), soar_quant.view()); + // Prefetch next batch + dataset_vec_batches.prefetch_next_batch(); // unpack codes - auto 
quantized_residuals = - raft::make_device_matrix(res, batch.size(), num_subspaces); - auto quantized_soar_residuals = - raft::make_device_matrix(res, batch.size(), num_subspaces); - - unpack_codes(res, - quantized_residuals.view(), - raft::make_const_mdspan(avq_quant.view()), - params.pq_bits, - num_subspaces); - unpack_codes(res, - quantized_soar_residuals.view(), - raft::make_const_mdspan(soar_quant.view()), - params.pq_bits, - num_subspaces); + if (pq_quantizer.params_quantizer.pq_bits == 8) { + // Copy unpacked codes to host + // TODO (rmaschal): these copies are blocking and not overlapped + raft::copy(idx.quantized_residuals().data_handle() + batch.offset() * num_subspaces, + avq_quant.data_handle(), + avq_quant.size(), + stream); + + raft::copy(idx.quantized_soar_residuals().data_handle() + batch.offset() * num_subspaces, + soar_quant.data_handle(), + soar_quant.size(), + stream); + } else { + auto quantized_residuals = + raft::make_device_matrix(res, batch.size(), num_subspaces); + auto quantized_soar_residuals = + raft::make_device_matrix(res, batch.size(), num_subspaces); + + unpack_codes(res, + quantized_residuals.view(), + raft::make_const_mdspan(avq_quant.view()), + params.pq_bits, + num_subspaces); + unpack_codes(res, + quantized_soar_residuals.view(), + raft::make_const_mdspan(soar_quant.view()), + params.pq_bits, + num_subspaces); + raft::copy(res, + raft::make_host_vector_view( + idx.quantized_residuals().data_handle() + batch.offset() * num_subspaces, + quantized_residuals.size()), + raft::make_device_vector_view(quantized_residuals.data_handle(), + quantized_residuals.size())); + + raft::copy(res, + raft::make_host_vector_view( + idx.quantized_soar_residuals().data_handle() + batch.offset() * num_subspaces, + quantized_soar_residuals.size()), + raft::make_device_vector_view( + quantized_soar_residuals.data_handle(), quantized_soar_residuals.size())); + } // quantize dataset to bfloat16, if enabled. 
Similar to SOAR, quantization // is performed in this loop to improve locality // TODO (rmaschal): Might be more efficient to do on CPU, to avoid DtoH copy - auto bf16_dataset = raft::make_device_matrix(res, batch_view.extent(0), dim); - if (params.reordering_bf16) { + auto bf16_dataset = + raft::make_device_matrix(res, batch_view.extent(0), dim); quantize_bfloat16( res, batch_view, bf16_dataset.view(), params.reordering_noise_shaping_threshold); - } - - // Prefetch next batch - dataset_vec_batches.prefetch_next_batch(); - - // Copy unpacked codes to host - // TODO (rmaschal): these copies are blocking and not overlapped - raft::copy(res, - raft::make_host_vector_view( - idx.quantized_residuals().data_handle() + batch.offset() * num_subspaces, - quantized_residuals.size()), - raft::make_device_vector_view(quantized_residuals.data_handle(), - quantized_residuals.size())); - - raft::copy(res, - raft::make_host_vector_view( - idx.quantized_soar_residuals().data_handle() + batch.offset() * num_subspaces, - quantized_soar_residuals.size()), - raft::make_device_vector_view(quantized_soar_residuals.data_handle(), - quantized_soar_residuals.size())); - - if (params.reordering_bf16) { raft::copy(res, raft::make_host_vector_view( idx.bf16_dataset().data_handle() + batch.offset() * dim, bf16_dataset.size()), @@ -305,7 +286,7 @@ index build( // Codebooks from VPQ have the shape [subspace idx, subspace dim, code] // This converts the codebook into matrix format for easy interoperability // with open-source ScaNN search - auto full_codebook_view = full_codebook.view(); + auto full_codebook_view = pq_quantizer.vpq_codebooks.pq_code_book.view(); raft::linalg::map_offset( res, diff --git a/cpp/src/neighbors/scann/detail/scann_quantize.cuh b/cpp/src/neighbors/scann/detail/scann_quantize.cuh index 16ef1f4295..95025f7f57 100644 --- a/cpp/src/neighbors/scann/detail/scann_quantize.cuh +++ b/cpp/src/neighbors/scann/detail/scann_quantize.cuh @@ -3,8 +3,6 @@ * SPDX-License-Identifier: 
Apache-2.0 */ -#include "../../detail/vpq_dataset.cuh" -#include "../../ivf_pq/ivf_pq_codepacking.cuh" #include #include #include @@ -21,125 +19,6 @@ namespace cuvs::neighbors::experimental::scann::detail { /** Fix the internal indexing type to avoid integer underflows/overflows */ using ix_t = int64_t; -template -__launch_bounds__(BlockSize) RAFT_KERNEL process_and_fill_codes_subspaces_kernel( - raft::device_matrix_view out_codes, - raft::device_matrix_view dataset, - raft::device_matrix_view vq_centers, - raft::device_vector_view vq_labels, - raft::device_matrix_view pq_centers) -{ - constexpr uint32_t kSubWarpSize = std::min(raft::WarpSize, 1u << PqBits); - using subwarp_align = raft::Pow2; - const IdxT row_ix = subwarp_align::div(IdxT{threadIdx.x} + IdxT{BlockSize} * IdxT{blockIdx.x}); - if (row_ix >= out_codes.extent(0)) { return; } - - const uint32_t pq_dim = raft::div_rounding_up_unsafe(vq_centers.extent(1), pq_centers.extent(1)); - - const uint32_t lane_id = raft::Pow2::mod(threadIdx.x); - const LabelT vq_label = vq_labels(row_ix); - - // write label - auto* out_label_ptr = reinterpret_cast(&out_codes(row_ix, 0)); - if (lane_id == 0) { *out_label_ptr = vq_label; } - - auto* out_codes_ptr = reinterpret_cast(out_label_ptr + 1); - cuvs::neighbors::ivf_pq::detail::bitfield_view_t code_view{out_codes_ptr}; - for (uint32_t j = 0; j < pq_dim; j++) { - // find PQ label - int subspace_offset = j * pq_centers.extent(1) * (1 << PqBits); - auto pq_subspace_view = raft::make_device_matrix_view( - pq_centers.data_handle() + subspace_offset, (uint32_t)(1 << PqBits), pq_centers.extent(1)); - auto pq_centers_smem = - raft::make_device_matrix_view(nullptr, 0, 0); - uint8_t code = cuvs::neighbors::detail::compute_code( - dataset, vq_centers, pq_centers_smem, pq_subspace_view, row_ix, j, vq_label); - // TODO: this writes in global memory one byte per warp, which is very slow. - // It's better to keep the codes in the shared memory or registers and dump them at once. 
- if (lane_id == 0) { code_view[j] = code; } - } -} - -template -auto process_and_fill_codes_subspaces( - const raft::resources& res, - const vpq_params& params, - const DatasetT& dataset, - raft::device_matrix_view vq_centers, - raft::device_matrix_view pq_centers) - -> raft::device_matrix -{ - using data_t = typename DatasetT::value_type; - using cdataset_t = vpq_dataset; - using label_t = uint32_t; - - const ix_t n_rows = dataset.extent(0); - const ix_t dim = dataset.extent(1); - const ix_t pq_dim = params.pq_dim; - const ix_t pq_bits = params.pq_bits; - const ix_t pq_n_centers = ix_t{1} << pq_bits; - // NB: codes must be aligned at least to sizeof(label_t) to be able to read labels. - const ix_t codes_rowlen = - sizeof(label_t) * (1 + raft::div_rounding_up_safe(pq_dim * pq_bits, 8 * sizeof(label_t))); - - auto codes = raft::make_device_matrix(res, n_rows, codes_rowlen); - - auto stream = raft::resource::get_cuda_stream(res); - - // TODO: with scaling workspace we could choose the batch size dynamically - constexpr ix_t kBlockSize = 256; - const ix_t threads_per_vec = std::min(raft::WarpSize, pq_n_centers); - dim3 threads(kBlockSize, 1, 1); - - auto kernel = [](uint32_t pq_bits) { - switch (pq_bits) { - case 4: - return process_and_fill_codes_subspaces_kernel; - case 8: - return process_and_fill_codes_subspaces_kernel; - default: RAFT_FAIL("Invalid pq_bits (%u), the value must be 4 or 8", pq_bits); - } - }(pq_bits); - - auto labels = raft::make_device_vector(res, dataset.extent(0)); - cuvs::neighbors::detail::predict_vq(res, dataset, vq_centers, labels.view()); - - dim3 blocks(raft::div_rounding_up_safe(n_rows, kBlockSize / threads_per_vec), 1, 1); - - kernel<<>>( - raft::make_device_matrix_view(codes.data_handle(), n_rows, codes_rowlen), - dataset, - vq_centers, - raft::make_const_mdspan(labels.view()), - pq_centers); - - RAFT_CUDA_TRY(cudaPeekAtLastError()); - - return codes; -} - -template -auto create_pq_codebook(raft::resources const& res, - 
raft::device_matrix_view residuals, - cuvs::neighbors::vpq_params ps) - -> raft::device_matrix -{ - // Create codebooks (vq initialized to 0s since we don't need here) - auto vq_code_book = - raft::make_device_matrix(res, 1, residuals.extent(1)); - raft::linalg::map_offset(res, vq_code_book.view(), [] __device__(size_t i) { return 0; }); - - auto pq_code_book = cuvs::neighbors::detail::train_pq( - res, ps, residuals, raft::make_const_mdspan(vq_code_book.view())); - - return pq_code_book; -} - /** * @brief Subtract cluster center coordinates from each dataset vector. * @@ -175,61 +54,19 @@ auto compute_residuals(raft::resources const& res, return residuals; } -/**} - * @brief Generate PQ codes for residual vectors using codebook - * - * For each subspace, minimize L2 norm between residual vectors and - * PQ centers to generate codes for residual vectors - * - * @tparam T - * @tparam IdxT - * @tparam LabelT - * @param res raft resources - * @param residuals the residual vectors we're quantizing, size [n_rows, dim] - * @param pq_codebook the codebook of PQ centers size [dim, 1 << pq_bits] - * @oaran ps parameters used with vpq_dataset for pq quantization - * @return device matrix with (packed) codes from vpq, size [n_rows, 1 +ceil((dim / pq_dim * - * pq_bits) /( 8 * sizeof(LabelT)))] - */ -template -auto quantize_residuals(raft::resources const& res, - raft::device_matrix_view residuals, - raft::device_matrix_view pq_codebook, - cuvs::neighbors::vpq_params ps) - -> raft::device_matrix -{ - auto dim = residuals.extent(1); - - // Using a single 0 vector for the vq_codebook, since we already have - // vq centers and computed residuals w.r.t those centers - auto vq_codebook = raft::make_device_matrix(res, 1, dim); - - raft::matrix::fill(res, vq_codebook.view(), T(0)); - - auto codes = process_and_fill_codes_subspaces( - res, ps, residuals, raft::make_const_mdspan(vq_codebook.view()), pq_codebook); - - return codes; -} - /** * @brief Unpack VPQ codes into 1-byte per code * 
- * VPQ gives codes in a "packed" form. The first 4 bytes give the code for - * vector quantization, and the remaining bytes the codes for subspace product - * quantization. In the case of 4 bit PQ, each byte stores codes for 2 subspaces - * in a packed form. + * VPQ gives codes in a "packed" form. In the case of 4 bit PQ, each byte stores + * codes for 2 subspaces in a packed form. * - * This function unpacks the codes by discarding the VQ code (which we don't need, - * since we use VPQ only for residual quantization) and (in the case of 4-bit PQ) - * unpackes the subspace codes into one byte each. This is for interoperability - * with open source ScaNN, which doesn't pack codes + * This function unpacks the subspace codes into one byte each. This is for + * interoperability with open source ScaNN, which doesn't pack codes * * @tparam IdxT * @param res raft resources * @param unpacked_codes_view matrix of unpacked codes, size [n_rows, dim / pq_dim] - * @param codes_view packed codes from vpq, size [n_rows, 1 +ceil((dim / pq_dim * pq_bits) /( 8 * - * sizeof(LabelT)))] + * @param codes_view packed codes from vpq, size [n_rows, ceil((dim / pq_dim * pq_bits) / 8)] * @param pq_bits number of bits used for PQ * @param num_subspaces the number of pq_subspaces (dim / pq_dim) */ @@ -245,7 +82,7 @@ void unpack_codes(raft::resources const& res, res, unpacked_codes_view, [codes_view, num_subspaces] __device__(size_t i) { int64_t row_idx = i / num_subspaces; int64_t subspace_idx = i % num_subspaces; - int64_t packed_subspace_idx = 4 + subspace_idx / 2; + int64_t packed_subspace_idx = subspace_idx / 2; uint8_t mask = subspace_idx % 2; uint8_t packed_labels = codes_view(row_idx, packed_subspace_idx); @@ -254,10 +91,6 @@ void unpack_codes(raft::resources const& res, return (mask)*first + (1 - mask) * second; }); - - } else { - raft::matrix::slice_coordinates coords(0, 4, codes_view.extent(0), 4 + num_subspaces); - raft::matrix::slice(res, raft::make_const_mdspan(codes_view), 
unpacked_codes_view, coords); } } @@ -382,7 +215,7 @@ __launch_bounds__(BlockSize) RAFT_KERNEL // The change in || r_parallel ||^2 can be written (residual_dot + residual_dot_delta) ^ 2 // the change in || r_perpendicular || ^2 can be written residual_norm_delta - // parallel_norm_delta Thus cost_delta = eta * (residual_dot + residual_dot_delta) ^2 + - // (residual_norm_delta - (residual_dot + residual_dot_delta)^2 Expanding and simplying, + // (residual_norm_delta - (residual_dot + residual_dot_delta)^2 Expanding and simplifying, // cost_delta = a + b * resdiaul_dot, where a and b are as below. Since only residual_dot is // unknown (because updates must be made synchronously) we can compute a and b in parallel // across threads in the warp and minimize computation in the update step of the coordinate diff --git a/cpp/src/neighbors/vpq_dataset.cuh b/cpp/src/neighbors/vpq_dataset.cuh deleted file mode 100644 index 34b011bb0b..0000000000 --- a/cpp/src/neighbors/vpq_dataset.cuh +++ /dev/null @@ -1,41 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ - -#pragma once - -#include "detail/vpq_dataset.cuh" -#include - -#include - -namespace cuvs::neighbors { - -/** - * @brief Compress a dataset for use in CAGRA-Q search in place of the original data. - * - * @tparam DatasetT a row-major mdspan or mdarray (device or host). - * @tparam MathT a type of the codebook elements and internal math ops. - * @tparam IdxT type of the indices in the source dataset - * - * @param[in] res - * @param[in] params VQ and PQ parameters for compressing the data - * @param[in] dataset a row-major mdspan or mdarray (device or host) [n_rows, dim]. 
- */ -template -auto vpq_build(const raft::resources& res, const vpq_params& params, const DatasetT& dataset) - -> vpq_dataset - -{ - if constexpr (std::is_same_v) { - return detail::vpq_convert_math_type( - res, detail::vpq_build(res, params, dataset)); - } else { - return detail::vpq_build(res, params, dataset); - } -} - -} // namespace cuvs::neighbors diff --git a/cpp/src/preprocessing/quantize/detail/binary.cuh b/cpp/src/preprocessing/quantize/detail/binary.cuh index 3c61aba562..ccff4999cf 100644 --- a/cpp/src/preprocessing/quantize/detail/binary.cuh +++ b/cpp/src/preprocessing/quantize/detail/binary.cuh @@ -145,7 +145,7 @@ void mean_f16_in_f32(raft::resources const& res, const size_t dataset_size, cudaStream_t cuda_stream) { - auto mr = raft::resource::get_workspace_resource(res); + auto mr = raft::resource::get_workspace_resource_ref(res); auto f32_result_vec = raft::make_device_mdarray(res, mr, raft::make_extents(dataset_dim)); raft::matrix::fill(res, f32_result_vec.view(), float(0)); @@ -211,7 +211,7 @@ auto train(raft::resources const& res, static_cast(dataset_dim)); raft::random::RngState rng(29837lu); - auto mr = raft::resource::get_workspace_resource(res); + auto mr = raft::resource::get_workspace_resource_ref(res); auto sampled_dataset_chunk = raft::make_device_mdarray( res, mr, raft::make_extents(num_samples, max_dim_chunk)); auto transposed_sampled_dataset_chunk = raft::make_device_mdarray( @@ -331,7 +331,7 @@ auto train(raft::resources const& res, raft::make_host_vector_view(host_threshold_vec.data(), (int64_t)dataset_dim)); } else { - auto mr = raft::resource::get_workspace_resource(res); + auto mr = raft::resource::get_workspace_resource_ref(res); auto casted_vec = raft::make_device_mdarray( res, mr, raft::make_extents(dataset_dim)); raft::copy(res, @@ -425,7 +425,7 @@ void transform(raft::resources const& res, raft::make_device_vector_view(quantizer.threshold.data_handle(), (int64_t)dataset_dim)); } else { - auto mr = 
raft::resource::get_workspace_resource(res); + auto mr = raft::resource::get_workspace_resource_ref(res); auto casted_vec = raft::make_device_mdarray( res, mr, raft::make_extents(dataset_dim)); raft::linalg::map(res, diff --git a/cpp/src/preprocessing/quantize/detail/pq.cuh b/cpp/src/preprocessing/quantize/detail/pq.cuh index db7cc8d5c1..b4d2ee57b7 100644 --- a/cpp/src/preprocessing/quantize/detail/pq.cuh +++ b/cpp/src/preprocessing/quantize/detail/pq.cuh @@ -90,7 +90,7 @@ auto train_pq_subspaces( auto trainset_ptr = !vq_centers.empty() ? pq_trainset.data_handle() : dataset.data_handle(); auto sub_labels = raft::make_device_vector(res, 0); auto pq_cluster_sizes = raft::make_device_vector(res, 0); - auto device_memory = raft::resource::get_workspace_resource(res); + auto device_memory = raft::resource::get_workspace_resource_ref(res); if (params.pq_kmeans_type == cuvs::cluster::kmeans::kmeans_type::KMeansBalanced) { sub_labels = raft::make_device_mdarray( res, device_memory, raft::make_extents(n_rows_train)); @@ -328,4 +328,67 @@ void inverse_transform( out, quant.params_quantizer.use_subspaces); } + +template +void vpq_convert_math_type(const raft::resources& res, + const cuvs::neighbors::vpq_dataset& src, + cuvs::neighbors::vpq_dataset& dst) +{ + raft::linalg::map(res, + dst.vq_code_book.view(), + cuvs::spatial::knn::detail::utils::mapping{}, + raft::make_const_mdspan(src.vq_code_book.view())); + raft::linalg::map(res, + dst.pq_code_book.view(), + cuvs::spatial::knn::detail::utils::mapping{}, + raft::make_const_mdspan(src.pq_code_book.view())); +} + +template +auto vpq_build(const raft::resources& res, + const cuvs::neighbors::vpq_params& params, + const DatasetT& dataset) -> cuvs::neighbors::vpq_dataset +{ + using label_t = uint32_t; + // Use a heuristic to impute missing parameters. 
+ auto ps = cuvs::neighbors::detail::fill_missing_params_heuristics(params, dataset); + + // Train codes + auto vq_code_book = cuvs::neighbors::detail::train_vq(res, ps, dataset); + auto pq_code_book = cuvs::neighbors::detail::train_pq( + res, ps, dataset, raft::make_const_mdspan(vq_code_book.view())); + + // Encode dataset + const IdxT n_rows = dataset.extent(0); + const IdxT codes_rowlen = sizeof(label_t) * (1 + raft::div_rounding_up_safe( + ps.pq_dim * ps.pq_bits, 8 * sizeof(label_t))); + + auto codes = raft::make_device_matrix(res, n_rows, codes_rowlen); + cuvs::neighbors::detail::process_and_fill_codes( + res, + ps, + dataset, + raft::make_const_mdspan(pq_code_book.view()), + raft::make_const_mdspan(vq_code_book.view()), + raft::make_device_vector_view(nullptr, 0), + codes.view(), + true); + + return cuvs::neighbors::vpq_dataset{ + std::move(vq_code_book), std::move(pq_code_book), std::move(codes)}; +} + +template +auto vpq_build_half(const raft::resources& res, + const cuvs::neighbors::vpq_params& params, + const DatasetT& dataset) -> cuvs::neighbors::vpq_dataset +{ + auto old_type = vpq_build(res, params, dataset); + auto new_type = cuvs::neighbors::vpq_dataset{ + raft::make_device_mdarray(res, old_type.vq_code_book.extents()), + raft::make_device_mdarray(res, old_type.pq_code_book.extents()), + std::move(old_type.data)}; + vpq_convert_math_type(res, old_type, new_type); + return new_type; +} } // namespace cuvs::preprocessing::quantize::pq::detail diff --git a/cpp/src/preprocessing/quantize/pq.cu b/cpp/src/preprocessing/quantize/pq.cu index 4a381c59ca..761474bdf8 100644 --- a/cpp/src/preprocessing/quantize/pq.cu +++ b/cpp/src/preprocessing/quantize/pq.cu @@ -52,4 +52,25 @@ CUVS_INST_QUANTIZATION(float, uint8_t); #undef CUVS_INST_QUANTIZATION +#define CUVS_INST_VPQ_BUILD(T) \ + auto vpq_build(const raft::resources& res, \ + const cuvs::neighbors::vpq_params& params, \ + const raft::host_matrix_view& dataset) \ + { \ + return detail::vpq_build_half(res, 
params, dataset); \ + } \ + auto vpq_build(const raft::resources& res, \ + const cuvs::neighbors::vpq_params& params, \ + const raft::device_matrix_view& dataset) \ + { \ + return detail::vpq_build_half(res, params, dataset); \ + } + +CUVS_INST_VPQ_BUILD(float); +CUVS_INST_VPQ_BUILD(half); +CUVS_INST_VPQ_BUILD(int8_t); +CUVS_INST_VPQ_BUILD(uint8_t); + +#undef CUVS_INST_VPQ_BUILD + } // namespace cuvs::preprocessing::quantize::pq diff --git a/cpp/src/preprocessing/quantize/vpq_build-ext.cuh b/cpp/src/preprocessing/quantize/vpq_build-ext.cuh new file mode 100644 index 0000000000..1745e53a33 --- /dev/null +++ b/cpp/src/preprocessing/quantize/vpq_build-ext.cuh @@ -0,0 +1,28 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once + +#include +#include + +namespace cuvs::preprocessing::quantize::pq { + +#define CUVS_INST_VPQ_BUILD(T) \ + cuvs::neighbors::vpq_dataset vpq_build( \ + const raft::resources& res, \ + const cuvs::neighbors::vpq_params& params, \ + const raft::host_matrix_view& dataset); \ + cuvs::neighbors::vpq_dataset vpq_build( \ + const raft::resources& res, \ + const cuvs::neighbors::vpq_params& params, \ + const raft::device_matrix_view& dataset); + +CUVS_INST_VPQ_BUILD(float); +CUVS_INST_VPQ_BUILD(half); +CUVS_INST_VPQ_BUILD(int8_t); +CUVS_INST_VPQ_BUILD(uint8_t); + +#undef CUVS_INST_VPQ_BUILD +} // namespace cuvs::preprocessing::quantize::pq diff --git a/cpp/tests/distance/gram.cu b/cpp/tests/distance/gram.cu index e410eefade..b32049b6d4 100644 --- a/cpp/tests/distance/gram.cu +++ b/cpp/tests/distance/gram.cu @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2019-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2019-2026, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ @@ -23,7 +23,7 @@ namespace cuvs::distance::kernels { struct GramMatrixInputs { int n1; // feature vectors in matrix 1 - int n2; // featuer vectors in matrix 2 + int n2; // feature vectors in matrix 2 int n_cols; // number of elements in a feature vector bool is_row_major; KernelParams kernel; diff --git a/cpp/tests/neighbors/ann_cagra.cuh b/cpp/tests/neighbors/ann_cagra.cuh index 748e2e86c3..a6704f892a 100644 --- a/cpp/tests/neighbors/ann_cagra.cuh +++ b/cpp/tests/neighbors/ann_cagra.cuh @@ -906,7 +906,7 @@ class AnnCagraFilterTest : public ::testing::TestWithParam { EXPECT_FALSE(unacceptable_node); double min_recall = ps.min_recall; - // TODO(mfoerster): re-enable uniquenes test + // TODO(mfoerster): re-enable uniqueness test EXPECT_TRUE(eval_neighbours(indices_naive, indices_Cagra, distances_naive, diff --git a/cpp/tests/neighbors/ann_ivf_pq.cuh b/cpp/tests/neighbors/ann_ivf_pq.cuh index 4ce9c96077..033f0af9c2 100644 --- a/cpp/tests/neighbors/ann_ivf_pq.cuh +++ b/cpp/tests/neighbors/ann_ivf_pq.cuh @@ -103,7 +103,7 @@ void compare_vectors_l2( auto dim = a.extent(1); rmm::mr::managed_memory_resource managed_memory; auto dist = - raft::make_device_mdarray(res, &managed_memory, raft::make_extents(n_rows)); + raft::make_device_mdarray(res, managed_memory, raft::make_extents(n_rows)); raft::linalg::map_offset(res, dist.view(), [a, b, dim] __device__(uint32_t i) { cuvs::spatial::knn::detail::utils::mapping f{}; double d = 0.0f; diff --git a/cpp/tests/neighbors/ann_nn_descent.cuh b/cpp/tests/neighbors/ann_nn_descent.cuh index 7db6a523e9..568cebbc23 100644 --- a/cpp/tests/neighbors/ann_nn_descent.cuh +++ b/cpp/tests/neighbors/ann_nn_descent.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ #pragma once @@ -471,7 +471,8 @@ const std::vector inputs = cuvs::distance::DistanceType::L2Expanded, cuvs::distance::DistanceType::L2SqrtExpanded, cuvs::distance::DistanceType::InnerProduct, - cuvs::distance::DistanceType::CosineExpanded}, + cuvs::distance::DistanceType::CosineExpanded, + cuvs::distance::DistanceType::L1}, {false, true}, {0.90}); diff --git a/cpp/tests/neighbors/ann_scann.cuh b/cpp/tests/neighbors/ann_scann.cuh index d8bc27fed7..eafddec9d2 100644 --- a/cpp/tests/neighbors/ann_scann.cuh +++ b/cpp/tests/neighbors/ann_scann.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ #pragma once @@ -8,7 +8,10 @@ #include "ann_utils.cuh" #include #include +#include +#include +#include #include #include #include @@ -123,6 +126,146 @@ class scann_test : public ::testing::TestWithParam { IdxT expected_bf16_size = ps.index_params.reordering_bf16 ? 
ps.dim * ps.num_db_vecs : 0; ASSERT_EQ(index.bf16_dataset().size(), expected_bf16_size); + check_code_validity(index, num_subspaces, num_pq_clusters); + check_reconstruction(index, num_subspaces); + } + + void check_code_validity(const index& idx, int num_subspaces, int num_pq_clusters) + { + auto quant_res_host = + raft::make_host_matrix(handle_, ps.num_db_vecs, num_subspaces); + auto quant_soar_host = + raft::make_host_matrix(handle_, ps.num_db_vecs, num_subspaces); + + raft::copy(quant_res_host.data_handle(), + idx.quantized_residuals().data_handle(), + idx.quantized_residuals().size(), + stream_); + raft::copy(quant_soar_host.data_handle(), + idx.quantized_soar_residuals().data_handle(), + idx.quantized_soar_residuals().size(), + stream_); + raft::resource::sync_stream(handle_); + + bool all_zeros = true; + auto n_vecs_to_check = std::min(ps.num_db_vecs, 50u); + for (IdxT i = 0; i < n_vecs_to_check * num_subspaces; i++) { + if (quant_res_host.data_handle()[i] != 0) { all_zeros = false; } + if (quant_soar_host.data_handle()[i] != 0) { all_zeros = false; } + // Check that unpacked codes are in valid range + if (ps.index_params.pq_bits == 4) { + ASSERT_LT(quant_res_host.data_handle()[i], num_pq_clusters) + << "AVQ quantized code out of range at position " << i; + ASSERT_LT(quant_soar_host.data_handle()[i], num_pq_clusters) + << "SOAR quantized code out of range at position " << i; + } + } + ASSERT_FALSE(all_zeros) << "Quantized output contains all zeros"; + } + + void check_reconstruction(const index& idx, int num_subspaces) + { + cuvs::preprocessing::quantize::pq::params pq_params; + pq_params.pq_bits = ps.index_params.pq_bits; + pq_params.pq_dim = num_subspaces; + pq_params.use_subspaces = true; + pq_params.use_vq = true; // SCANN uses centroids separately + + auto pq_codebook_copy = raft::make_device_matrix( + handle_, idx.pq_codebook().extent(0), idx.pq_codebook().extent(1)); + raft::copy(pq_codebook_copy.data_handle(), + idx.pq_codebook().data_handle(), + 
idx.pq_codebook().size(), + stream_); + + auto vq_codebook = raft::make_device_matrix( + handle_, idx.centers().extent(0), idx.centers().extent(1)); + raft::copy( + vq_codebook.data_handle(), idx.centers().data_handle(), idx.centers().size(), stream_); + auto empty_data = raft::make_device_matrix(handle_, 0, 0); + + cuvs::preprocessing::quantize::pq::quantizer quantizer{ + pq_params, + cuvs::neighbors::vpq_dataset{ + std::move(vq_codebook), std::move(pq_codebook_copy), std::move(empty_data)}}; + + auto quantized_residuals_device = + raft::make_device_matrix(handle_, ps.num_db_vecs, num_subspaces); + raft::copy(quantized_residuals_device.data_handle(), + idx.quantized_residuals().data_handle(), + idx.quantized_residuals().size(), + stream_); + + // Re-pack 4-bit codes. The 8-bit codes are already in the right format + auto codes_dim = cuvs::preprocessing::quantize::pq::get_quantized_dim(pq_params); + auto packed_codes = raft::make_device_matrix(handle_, ps.num_db_vecs, codes_dim); + + if (ps.index_params.pq_bits == 4) { + raft::linalg::map_offset( + handle_, + packed_codes.view(), + [qr_view = quantized_residuals_device.view(), num_subspaces, codes_dim] __device__( + size_t i) { + int64_t row_idx = i / codes_dim; + int64_t packed_idx = i % codes_dim; + int64_t code_idx = packed_idx * 2; + int64_t code_idx_next = code_idx + 1; + + uint8_t first_code = (code_idx < num_subspaces) ? qr_view(row_idx, code_idx) : 0; + uint8_t second_code = + (code_idx_next < num_subspaces) ? 
qr_view(row_idx, code_idx_next) : 0; + + return (first_code << 4) | (second_code & 0x0F); + }); + } else { + raft::copy(packed_codes.data_handle(), + quantized_residuals_device.data_handle(), + packed_codes.size(), + stream_); + } + + auto reconstructed_vectors = + raft::make_device_matrix(handle_, ps.num_db_vecs, ps.dim); + auto reconstructed_vectors_view = reconstructed_vectors.view(); + cuvs::preprocessing::quantize::pq::inverse_transform( + handle_, + quantizer, + raft::make_const_mdspan(packed_codes.view()), + reconstructed_vectors_view, + raft::make_const_mdspan(idx.labels())); + + // Compute L2 distances for reconstruction error + auto database_view = + raft::make_device_matrix_view(database.data(), ps.num_db_vecs, ps.dim); + auto distances = raft::make_device_vector(handle_, ps.num_db_vecs); + raft::linalg::map_offset( + handle_, + distances.view(), + [database_view, reconstructed_vectors_view, dim = ps.dim] __device__(IdxT i) { + float dist = 0.0f; + for (uint32_t j = 0; j < dim; j++) { + float diff = database_view(i, j) - reconstructed_vectors_view(i, j); + dist += diff * diff; + } + return sqrtf(dist / static_cast(dim)); + }); + + float max_allowed_error = 0.95f; + auto distances_host = raft::make_host_vector(handle_, ps.num_db_vecs); + raft::copy(distances_host.data_handle(), distances.data_handle(), ps.num_db_vecs, stream_); + raft::resource::sync_stream(handle_); + + float mean_error = 0.0f; + float max_error = 0.0f; + for (IdxT i = 0; i < ps.num_db_vecs; i++) { + mean_error += distances_host(i); + max_error = std::max(max_error, distances_host(i)); + } + mean_error /= static_cast(ps.num_db_vecs); + ASSERT_LT(mean_error, max_allowed_error) + << "Mean reconstruction error too large: " << mean_error; + ASSERT_LT(max_error, max_allowed_error * 1.5f) + << "Max reconstruction error too large: " << max_error; } void SetUp() override // NOLINT diff --git a/cpp/tests/neighbors/ann_utils.cuh b/cpp/tests/neighbors/ann_utils.cuh index 8a908c0187..cbc95d7bb7 
100644 --- a/cpp/tests/neighbors/ann_utils.cuh +++ b/cpp/tests/neighbors/ann_utils.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -16,7 +16,6 @@ #include #include -#include #include "naive_knn.cuh" diff --git a/cpp/tests/neighbors/naive_knn.cuh b/cpp/tests/neighbors/naive_knn.cuh index 2ed461360c..6f19fe1c06 100644 --- a/cpp/tests/neighbors/naive_knn.cuh +++ b/cpp/tests/neighbors/naive_knn.cuh @@ -13,7 +13,8 @@ #include #include #include -#include +#include +#include namespace cuvs::neighbors { @@ -98,7 +99,7 @@ void naive_knn(raft::resources const& handle, uint32_t k, cuvs::distance::DistanceType type) { - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(); + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource_ref(); auto stream = raft::resource::get_cuda_stream(handle); dim3 block_dim(16, 32, 1); @@ -126,8 +127,7 @@ void naive_knn(raft::resources const& handle, static_cast(k), dist_topk + offset * k, indices_topk + offset * k, - cuvs::distance::is_min_close(type), - mr); + cuvs::distance::is_min_close(type)); } RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } diff --git a/cpp/tests/preprocessing/product_quantization.cu b/cpp/tests/preprocessing/product_quantization.cu index d8a83e747a..f637f446af 100644 --- a/cpp/tests/preprocessing/product_quantization.cu +++ b/cpp/tests/preprocessing/product_quantization.cu @@ -53,7 +53,7 @@ void compare_vectors_l2(const raft::resources& res, auto dim = a.extent(1); rmm::mr::managed_memory_resource managed_memory; auto dist = - raft::make_device_mdarray(res, &managed_memory, raft::make_extents(n_rows)); + raft::make_device_mdarray(res, managed_memory, raft::make_extents(n_rows)); raft::linalg::map_offset(res, dist.view(), [a, b, dim] __device__(uint32_t i) { double d = 0.0f; for (uint32_t j = 0; j < dim; j++) { @@ 
-206,7 +206,7 @@ class ProductQuantizationTest : public ::testing::TestWithParam(n_samples_, n_encoded_cols); @@ -216,18 +216,13 @@ class ProductQuantizationTest : public ::testing::TestWithParam={whatever-cuvs-was-built-against}' at runtime, and mixing @@ -418,6 +410,11 @@ dependencies: packages: - *ctk_cu13 - *nvjitlink_cu13 + depends_on_cudart: + common: + - output_types: conda + packages: + - cuda-cudart-dev depends_on_cupy: common: - output_types: conda @@ -550,7 +547,7 @@ dependencies: - output_types: [conda, pyproject, requirements] packages: - click - - cuvs==26.4.*,>=0.0.0a0 + - cuvs==26.6.*,>=0.0.0a0 - pandas - pyyaml - requests @@ -577,17 +574,17 @@ dependencies: common: - output_types: conda packages: - - cuvs==26.4.*,>=0.0.0a0 + - cuvs==26.6.*,>=0.0.0a0 depends_on_cuvs_bench: common: - output_types: conda packages: - - cuvs-bench==26.4.*,>=0.0.0a0 + - cuvs-bench==26.6.*,>=0.0.0a0 depends_on_libcuvs: common: - output_types: conda packages: - - &libcuvs_unsuffixed libcuvs==26.4.*,>=0.0.0a0 + - &libcuvs_unsuffixed libcuvs==26.6.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -599,23 +596,23 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - libcuvs-cu12==26.4.*,>=0.0.0a0 + - libcuvs-cu12==26.6.*,>=0.0.0a0 - matrix: cuda: "13.*" cuda_suffixed: "true" packages: - - libcuvs-cu13==26.4.*,>=0.0.0a0 + - libcuvs-cu13==26.6.*,>=0.0.0a0 - {matrix: null, packages: [*libcuvs_unsuffixed]} depends_on_libcuvs_tests: common: - output_types: conda packages: - - libcuvs-tests==26.4.*,>=0.0.0a0 + - libcuvs-tests==26.6.*,>=0.0.0a0 depends_on_libraft: common: - output_types: conda packages: - - &libraft_unsuffixed libraft==26.4.*,>=0.0.0a0 + - &libraft_unsuffixed libraft==26.6.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -627,18 +624,18 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" 
packages: - - libraft-cu12==26.4.*,>=0.0.0a0 + - libraft-cu12==26.6.*,>=0.0.0a0 - matrix: cuda: "13.*" cuda_suffixed: "true" packages: - - libraft-cu13==26.4.*,>=0.0.0a0 + - libraft-cu13==26.6.*,>=0.0.0a0 - {matrix: null, packages: [*libraft_unsuffixed]} depends_on_librmm: common: - output_types: conda packages: - - &librmm_unsuffixed librmm==26.4.*,>=0.0.0a0 + - &librmm_unsuffixed librmm==26.6.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -650,18 +647,18 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - librmm-cu12==26.4.*,>=0.0.0a0 + - librmm-cu12==26.6.*,>=0.0.0a0 - matrix: cuda: "13.*" cuda_suffixed: "true" packages: - - librmm-cu13==26.4.*,>=0.0.0a0 + - librmm-cu13==26.6.*,>=0.0.0a0 - {matrix: null, packages: [*librmm_unsuffixed]} depends_on_pylibraft: common: - output_types: conda packages: - - &pylibraft_unsuffixed pylibraft==26.4.*,>=0.0.0a0 + - &pylibraft_unsuffixed pylibraft==26.6.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -673,12 +670,12 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - pylibraft-cu12==26.4.*,>=0.0.0a0 + - pylibraft-cu12==26.6.*,>=0.0.0a0 - matrix: cuda: "13.*" cuda_suffixed: "true" packages: - - pylibraft-cu13==26.4.*,>=0.0.0a0 + - pylibraft-cu13==26.6.*,>=0.0.0a0 - {matrix: null, packages: [*pylibraft_unsuffixed]} depends_on_nccl: common: diff --git a/docs/source/build.rst b/docs/source/build.rst index dcef10c96b..5e863e40f4 100644 --- a/docs/source/build.rst +++ b/docs/source/build.rst @@ -205,7 +205,7 @@ After building the C and C++ libraries, the Golang library can be built with the export CUDA_HOME="/usr/local/cuda" # or wherever your CUDA installation is. 
export CGO_CFLAGS="-I${CONDA_PREFIX}/include -I${CUDA_HOME}/include" - export CGO_LDFLAGS="-L${CONDA_PREFIX}/lib -lcudart -lcuvs -lcuvs_c" + export CGO_LDFLAGS="-L${CONDA_PREFIX}/lib -lcuvs -lcuvs_c" export LD_LIBRARY_PATH="$CONDA_PREFIX/lib:$LD_LIBRARY_PATH" export CC=clang @@ -259,11 +259,6 @@ cuVS has the following configurable cmake flags available: - OFF - Enable the `-lineinfo` option for nvcc - * - CUDA_STATIC_RUNTIME - - ON, OFF - - OFF - - Statically link the CUDA runtime - * - CUDA_STATIC_MATH_LIBRARIES - ON, OFF - OFF diff --git a/docs/source/cuvs_bench/index.rst b/docs/source/cuvs_bench/index.rst index 7caec33892..f6e4989ddf 100644 --- a/docs/source/cuvs_bench/index.rst +++ b/docs/source/cuvs_bench/index.rst @@ -88,7 +88,7 @@ The following command pulls the nightly container for Python version 3.13, CUDA .. code-block:: bash - docker pull rapidsai/cuvs-bench:26.04a-cuda12-py3.13 # substitute cuvs-bench for the exact desired container. + docker pull rapidsai/cuvs-bench:26.06a-cuda12-py3.13 # substitute cuvs-bench for the exact desired container. The CUDA and python versions can be changed for the supported values: - Supported CUDA versions: 12, 13 @@ -235,7 +235,7 @@ For GPU-enabled systems, the `DATA_FOLDER` variable should be a local folder whe export DATA_FOLDER=path/to/store/datasets/and/results docker run --gpus all --rm -it -u $(id -u) \ -v $DATA_FOLDER:/data/benchmarks \ - rapidsai/cuvs-bench:26.04-cuda12.9-py3.13 \ + rapidsai/cuvs-bench:26.06-cuda12.9-py3.13 \ "--dataset deep-image-96-angular" \ "--normalize" \ "--algorithms cuvs_cagra,cuvs_ivf_pq --batch-size 10 -k 10" \ @@ -248,7 +248,7 @@ Usage of the above command is as follows: * - Argument - Description - * - `rapidsai/cuvs-bench:26.04-cuda12.9-py3.13` + * - `rapidsai/cuvs-bench:26.06-cuda12.9-py3.13` - Image to use. See "Docker" section for links to lists of available tags. 
* - `"--dataset deep-image-96-angular"` @@ -295,7 +295,7 @@ All of the `cuvs-bench` images contain the Conda packages, so they can be used d --entrypoint /bin/bash \ --workdir /data/benchmarks \ -v $DATA_FOLDER:/data/benchmarks \ - rapidsai/cuvs-bench:26.04-cuda12.9-py3.13 + rapidsai/cuvs-bench:26.06-cuda12.9-py3.13 This will drop you into a command line in the container, with the `cuvs-bench` python package ready to use, as described in the [Running the benchmarks](#running-the-benchmarks) section above: diff --git a/docs/source/developer_guide.md b/docs/source/developer_guide.md index ef1988cbd0..c62e3fc986 100644 --- a/docs/source/developer_guide.md +++ b/docs/source/developer_guide.md @@ -187,7 +187,7 @@ RAFT relies on `clang-format` to enforce code style across all C++ and CUDA sour 1. Do not split empty functions/records/namespaces. 2. Two-space indentation everywhere, including the line continuations. 3. Disable reflowing of comments. - The reasons behind these deviations from the Google style guide are given in comments [here](https://github.com/rapidsai/cuvs/blob/release/26.04/cpp/.clang-format). + The reasons behind these deviations from the Google style guide are given in comments [here](https://github.com/rapidsai/cuvs/blob/main/cpp/.clang-format). [`doxygen`](https://doxygen.nl/) is used as documentation generator and also as a documentation linter. In order to run doxygen as a linter on C++/CUDA code, run @@ -205,7 +205,7 @@ you can run `codespell -i 3 -w .` from the repository root directory. This will bring up an interactive prompt to select which spelling fixes to apply. ### #include style -[include_checker.py](https://github.com/rapidsai/cuvs/blob/release/26.04/cpp/scripts/include_checker.py) is used to enforce the include style as follows: +[include_checker.py](https://github.com/rapidsai/cuvs/blob/main/cpp/scripts/include_checker.py) is used to enforce the include style as follows: 1. 
`#include "..."` should be used for referencing local files only. It is acceptable to be used for referencing files in a sub-folder/parent-folder of the same algorithm, but should never be used to include files in other algorithms or between algorithms and the primitives or other dependencies. 2. `#include <...>` should be used for referencing everything else diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst index 89be0f1e5b..68b4df9cf0 100644 --- a/docs/source/getting_started.rst +++ b/docs/source/getting_started.rst @@ -74,7 +74,7 @@ Please refer to the :doc:`guide on API interoperability ` Where to next? ============== -cuVS is free and open source software, licesed under Apache 2.0 Once you are familiar with and/or have used cuVS, you can access the developer community most easily through `Github `_. Please open Github issues for any bugs, questions or feature requests. +cuVS is free and open source software, licensed under Apache 2.0 Once you are familiar with and/or have used cuVS, you can access the developer community most easily through `Github `_. Please open Github issues for any bugs, questions or feature requests. Social media ------------ diff --git a/docs/source/neighbors/ivfflat.rst b/docs/source/neighbors/ivfflat.rst index 7154db0375..d4c8f03c18 100644 --- a/docs/source/neighbors/ivfflat.rst +++ b/docs/source/neighbors/ivfflat.rst @@ -19,7 +19,7 @@ in the index, and Filtering considerations ------------------------ -IVF methods only apply filters to the lists which are probed for each query point. As a result, the results of a filtered query will likely differ signficiantly from the results of a filtering applid to an exact method like brute-force. For example. imagine you have 3 IVF lists each containing 2 vectors and you perform a query against only the closest 2 lists but you filter out all but 1 element. 
If that remaining element happens to be in one of the lists which was not proved, it will not be considered at all in the search results. It's important to consider this when using any of the IVF methods in your applications. +IVF methods only apply filters to the lists which are probed for each query point. As a result, the results of a filtered query will likely differ significantly from the results of a filtering applied to an exact method like brute-force. For example, imagine you have 3 IVF lists each containing 2 vectors and you perform a query against only the closest 2 lists but you filter out all but 1 element. If that remaining element happens to be in one of the lists which was not probed, it will not be considered at all in the search results. It's important to consider this when using any of the IVF methods in your applications. Configuration parameters diff --git a/docs/source/tuning_guide.rst b/docs/source/tuning_guide.rst index 0f49802c86..031d9381b4 100644 --- a/docs/source/tuning_guide.rst +++ b/docs/source/tuning_guide.rst @@ -30,7 +30,7 @@ GPUs are naturally great at performing massively parallel tasks, especially when Steps to achieve automated tuning ================================= -More formally, an automated parameter tuning workflow with monte-carlo cross-validaton looks likes something like this: +More formally, an automated parameter tuning workflow with monte-carlo cross-validation looks something like this: #.
Ingest a large dataset into the vector database of your choice diff --git a/examples/c/CMakeLists.txt b/examples/c/CMakeLists.txt index feb7a03309..61092834c7 100644 --- a/examples/c/CMakeLists.txt +++ b/examples/c/CMakeLists.txt @@ -34,29 +34,29 @@ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wextra -Werror") add_executable(CAGRA_C_EXAMPLE src/cagra_c_example.c) target_include_directories(CAGRA_C_EXAMPLE PUBLIC "$") target_link_libraries( - CAGRA_C_EXAMPLE PRIVATE cuvs::c_api CUDA::cudart $ + CAGRA_C_EXAMPLE PRIVATE cuvs::c_api CUDA::cudart_static $ ) add_executable(L2_C_EXAMPLE src/L2_c_example.c) target_include_directories(L2_C_EXAMPLE PUBLIC "$") target_link_libraries( - L2_C_EXAMPLE PRIVATE cuvs::c_api CUDA::cudart $ + L2_C_EXAMPLE PRIVATE cuvs::c_api CUDA::cudart_static $ ) add_executable(IVF_FLAT_C_EXAMPLE src/ivf_flat_c_example.c) target_include_directories(IVF_FLAT_C_EXAMPLE PUBLIC "$") target_link_libraries( - IVF_FLAT_C_EXAMPLE PRIVATE cuvs::c_api CUDA::cudart $ + IVF_FLAT_C_EXAMPLE PRIVATE cuvs::c_api CUDA::cudart_static $ ) add_executable(IVF_PQ_C_EXAMPLE src/ivf_pq_c_example.c) target_include_directories(IVF_PQ_C_EXAMPLE PUBLIC "$") target_link_libraries( - IVF_PQ_C_EXAMPLE PRIVATE cuvs::c_api CUDA::cudart $ + IVF_PQ_C_EXAMPLE PRIVATE cuvs::c_api CUDA::cudart_static $ ) add_executable(BRUTEFORCE_C_EXAMPLE src/bruteforce_c_example.c) target_include_directories(BRUTEFORCE_C_EXAMPLE PUBLIC "$") target_link_libraries( - BRUTEFORCE_C_EXAMPLE PRIVATE cuvs::c_api CUDA::cudart $ + BRUTEFORCE_C_EXAMPLE PRIVATE cuvs::c_api CUDA::cudart_static $ ) diff --git a/examples/cpp/src/ivf_flat_example.cu b/examples/cpp/src/ivf_flat_example.cu index 404cd86e89..fec73d8eb7 100644 --- a/examples/cpp/src/ivf_flat_example.cu +++ b/examples/cpp/src/ivf_flat_example.cu @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ @@ -86,7 +86,7 @@ void ivf_flat_build_extend_search(raft::device_resources const& dev_resources, std::make_optional(raft::make_const_mdspan(data_indices.view())), index); - std::cout << "Index size after addin dataset vectors " << index.size() << std::endl; + std::cout << "Index size after adding dataset vectors " << index.size() << std::endl; // Set search parameters. ivf_flat::search_params search_params; diff --git a/examples/go/README.md b/examples/go/README.md index 007eae7582..819b05c700 100644 --- a/examples/go/README.md +++ b/examples/go/README.md @@ -17,14 +17,14 @@ You may prefer to use `mamba`, as it provides significant speedup over `conda`. 1. Set up the required environment variables: ```bash export CGO_CFLAGS="-I${CONDA_PREFIX}/include" -export CGO_LDFLAGS="-L${CONDA_PREFIX}/lib -lcudart -lcuvs -lcuvs_c" +export CGO_LDFLAGS="-L${CONDA_PREFIX}/lib -lcudart_static -ldl -lrt -lcuvs -lcuvs_c" export LD_LIBRARY_PATH="$CONDA_PREFIX/lib:$LD_LIBRARY_PATH" export CC=clang ``` 2. Install the Go module: ```bash -go get github.com/rapidsai/cuvs/go@v26.04.00 # 25.02.00 being your desired version, selected from https://github.com/rapidsai/cuvs/tags +go get github.com/rapidsai/cuvs/go@v26.06.00 # 26.06.00 being your desired version, selected from https://github.com/rapidsai/cuvs/tags ``` Then you can build your project with the usual `go build`.
diff --git a/go/dlpack.go b/go/dlpack.go index 6fe619fd35..fcb9632586 100644 --- a/go/dlpack.go +++ b/go/dlpack.go @@ -3,6 +3,7 @@ package cuvs // #include // #include // #include +// #include import "C" import ( diff --git a/java/benchmarks/pom.xml b/java/benchmarks/pom.xml index 6f62725a31..065f0e7283 100644 --- a/java/benchmarks/pom.xml +++ b/java/benchmarks/pom.xml @@ -1,6 +1,6 @@ @@ -10,7 +10,7 @@ com.nvidia.cuvs benchmarks - 26.04.0 + 26.06.0 jar cuvs-java-benchmarks @@ -30,7 +30,7 @@ com.nvidia.cuvs cuvs-java - 26.04.0 + 26.06.0 jar diff --git a/java/build.sh b/java/build.sh index 3949fcae01..486f2fc5b2 100755 --- a/java/build.sh +++ b/java/build.sh @@ -1,6 +1,6 @@ #!/bin/bash -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 set -e -u -o pipefail @@ -8,7 +8,7 @@ set -e -u -o pipefail ARGS="$*" NUMARGS=$# -VERSION="26.04.0" # Note: The version is updated automatically when ci/release/update-version.sh is invoked +VERSION="26.06.0" # Note: The version is updated automatically when ci/release/update-version.sh is invoked GROUP_ID="com.nvidia.cuvs" # Identify CUDA major version. 
diff --git a/java/cuvs-java/pom.xml b/java/cuvs-java/pom.xml index 0a3fe9da5a..c37baca07e 100644 --- a/java/cuvs-java/pom.xml +++ b/java/cuvs-java/pom.xml @@ -1,6 +1,6 @@ @@ -11,7 +11,7 @@ com.nvidia.cuvs cuvs-java - 26.04.0 + 26.06.0 cuvs-java This project provides Java bindings for cuVS, enabling approximate nearest neighbors search and clustering diff --git a/java/cuvs-java/src/test/java/com/nvidia/cuvs/CagraMultiThreadStabilityIT.java b/java/cuvs-java/src/test/java/com/nvidia/cuvs/CagraMultiThreadStabilityIT.java index 8d16f7b9ca..f1701ebb6f 100644 --- a/java/cuvs-java/src/test/java/com/nvidia/cuvs/CagraMultiThreadStabilityIT.java +++ b/java/cuvs-java/src/test/java/com/nvidia/cuvs/CagraMultiThreadStabilityIT.java @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ package com.nvidia.cuvs; @@ -177,7 +177,7 @@ private void testQueryingUsingMultipleThreads(ResourcesSupplier resourcesSupplie log.debug(" Successful queries: {} / {}", actualSuccessfulQueries, expectedTotalQueries); if (firstError.get() != null) { - fail("MultiThreaded stablity test failed:" + " " + firstError.get().getMessage()); + fail("MultiThreaded stability test failed:" + " " + firstError.get().getMessage()); } assertTrue("All threads should complete successfully", allCompleted); diff --git a/java/examples/README.md b/java/examples/README.md index b8fa398376..8561a9e73f 100644 --- a/java/examples/README.md +++ b/java/examples/README.md @@ -11,17 +11,17 @@ This maven project contains examples for CAGRA, HNSW, and Bruteforce algorithms. 
### CAGRA Example In the current directory do: ``` -mvn package && java --enable-native-access=ALL-UNNAMED -cp target/cuvs-java-examples-26.04.0.jar:$HOME/.m2/repository/com/nvidia/cuvs/cuvs-java/26.04.0/cuvs-java-26.04.0.jar com.nvidia.cuvs.examples.CagraExample +mvn package && java --enable-native-access=ALL-UNNAMED -cp target/cuvs-java-examples-26.06.0.jar:$HOME/.m2/repository/com/nvidia/cuvs/cuvs-java/26.06.0/cuvs-java-26.06.0.jar com.nvidia.cuvs.examples.CagraExample ``` ### HNSW Example In the current directory do: ``` -mvn package && java --enable-native-access=ALL-UNNAMED -cp target/cuvs-java-examples-26.04.0.jar:$HOME/.m2/repository/com/nvidia/cuvs/cuvs-java/26.04.0/cuvs-java-26.04.0.jar com.nvidia.cuvs.examples.HnswExample +mvn package && java --enable-native-access=ALL-UNNAMED -cp target/cuvs-java-examples-26.06.0.jar:$HOME/.m2/repository/com/nvidia/cuvs/cuvs-java/26.06.0/cuvs-java-26.06.0.jar com.nvidia.cuvs.examples.HnswExample ``` ### Bruteforce Example In the current directory do: ``` -mvn package && java --enable-native-access=ALL-UNNAMED -cp target/cuvs-java-examples-26.04.0.jar:$HOME/.m2/repository/com/nvidia/cuvs/cuvs-java/26.04.0/cuvs-java-26.04.0.jar com.nvidia.cuvs.examples.BruteForceExample +mvn package && java --enable-native-access=ALL-UNNAMED -cp target/cuvs-java-examples-26.06.0.jar:$HOME/.m2/repository/com/nvidia/cuvs/cuvs-java/26.06.0/cuvs-java-26.06.0.jar com.nvidia.cuvs.examples.BruteForceExample ``` diff --git a/java/examples/pom.xml b/java/examples/pom.xml index 800ad88c68..4095396ef2 100644 --- a/java/examples/pom.xml +++ b/java/examples/pom.xml @@ -1,5 +1,5 @@ @@ -10,7 +10,7 @@ SPDX-License-Identifier: Apache-2.0 com.nvidia.cuvs.examples cuvs-java-examples - 26.04.0 + 26.06.0 cuvs-java-examples @@ -23,7 +23,7 @@ SPDX-License-Identifier: Apache-2.0 com.nvidia.cuvs cuvs-java - 26.04.0 + 26.06.0 diff --git a/notebooks/tutorial_ivf_pq.ipynb b/notebooks/tutorial_ivf_pq.ipynb index bd1119c5e5..acd2482702 100644 --- 
a/notebooks/tutorial_ivf_pq.ipynb +++ b/notebooks/tutorial_ivf_pq.ipynb @@ -571,7 +571,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This figure represents the trade-offs one does by choosing different combintations of the internal search types (the bit sizes of the data types are shown as point labels).\n", + "This figure represents the trade-offs one does by choosing different combinations of the internal search types (the bit sizes of the data types are shown as point labels).\n", "Depending on the GPU and the selected dataset, you may see different pictures.\n", "With SIFT-128 (`pq_dim = 64`), reducing the `internal_distance_dtype` comes at a huge cost to recall,\n", "whereas `lut_dtype` doesn't cost too much while significantly improving QPS.\n", diff --git a/pyproject.toml b/pyproject.toml index 9a51f4fdb3..50755ed387 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 [tool.ruff] @@ -47,6 +47,6 @@ follow_imports = "skip" skip = "./.git,./.github,./cpp/build,.*egg-info.*,./.mypy_cache,.*_skbuild" # ignore short words, and typename parameters like OffsetT ignore-regex = "\\b(.{1,4}|[A-Z]\\w*T)\\b" -ignore-words-list = "inout,unparseable,numer" +ignore-words-list = "inout,unparseable,numer,rocess,thirdparty,couldn,subtile" builtin = "clear" quiet-level = 3 diff --git a/python/cuvs/pyproject.toml b/python/cuvs/pyproject.toml index 267130acb0..4f2ce2cbde 100644 --- a/python/cuvs/pyproject.toml +++ b/python/cuvs/pyproject.toml @@ -21,9 +21,9 @@ license = "Apache-2.0" requires-python = ">=3.11" dependencies = [ "cuda-python>=13.0.1,<14.0", - "libcuvs==26.4.*,>=0.0.0a0", + "libcuvs==26.6.*,>=0.0.0a0", "numpy>=1.23,<3.0", - "pylibraft==26.4.*,>=0.0.0a0", + "pylibraft==26.6.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ "Intended Audience :: Developers", @@ -109,9 +109,9 @@ requires = [ "cmake>=3.30.4", "cuda-python>=13.0.1,<14.0", "cython>=3.2.2", - "libcuvs==26.4.*,>=0.0.0a0", - "libraft==26.4.*,>=0.0.0a0", - "librmm==26.4.*,>=0.0.0a0", + "libcuvs==26.6.*,>=0.0.0a0", + "libraft==26.6.*,>=0.0.0a0", + "librmm==26.6.*,>=0.0.0a0", "ninja", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
build-backend = "scikit_build_core.build" diff --git a/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py b/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py index e0517e3855..1579e4ef1b 100644 --- a/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py +++ b/python/cuvs_bench/cuvs_bench/generate_groundtruth/__main__.py @@ -263,7 +263,7 @@ def main(): "--n_queries", type=int, default=10000, - help="Number of quries to generate (if no query file is given). " + help="Number of queries to generate (if no query file is given). " "Default: 10000.", ) diff --git a/python/cuvs_bench/cuvs_bench/plot/__main__.py b/python/cuvs_bench/cuvs_bench/plot/__main__.py index 843926853d..aca08505ea 100644 --- a/python/cuvs_bench/cuvs_bench/plot/__main__.py +++ b/python/cuvs_bench/cuvs_bench/plot/__main__.py @@ -6,7 +6,7 @@ # 1: https://github.com/erikbern/ann-benchmarks/blob/main/plot.py # 2: https://github.com/erikbern/ann-benchmarks/blob/main/ann_benchmarks/plotting/utils.py # noqa: E501 # 3: https://github.com/erikbern/ann-benchmarks/blob/main/ann_benchmarks/plotting/metrics.py # noqa: E501 -# License: https://github.com/rapidsai/cuvs/blob/release/26.04/thirdparty/LICENSES/LICENSE.ann-benchmark # noqa: E501 +# License: https://github.com/rapidsai/cuvs/blob/main/thirdparty/LICENSES/LICENSE.ann-benchmark # noqa: E501 import itertools import os diff --git a/python/cuvs_bench/pyproject.toml b/python/cuvs_bench/pyproject.toml index 446894fb6e..ed080c7b1f 100644 --- a/python/cuvs_bench/pyproject.toml +++ b/python/cuvs_bench/pyproject.toml @@ -20,7 +20,7 @@ license = "Apache-2.0" requires-python = ">=3.11" dependencies = [ "click", - "cuvs==26.4.*,>=0.0.0a0", + "cuvs==26.6.*,>=0.0.0a0", "matplotlib>=3.9", "pandas", "pyyaml", diff --git a/python/libcuvs/CMakeLists.txt b/python/libcuvs/CMakeLists.txt index 318e82a2b9..bac8373cf7 100644 --- a/python/libcuvs/CMakeLists.txt +++ b/python/libcuvs/CMakeLists.txt @@ -31,7 +31,6 @@ endif() unset(cuvs_FOUND) # --- CUDA 
--- # -set(CUDA_STATIC_RUNTIME ON) set(CUDA_STATIC_MATH_LIBRARIES OFF) # --- RAFT ---# diff --git a/python/libcuvs/pyproject.toml b/python/libcuvs/pyproject.toml index 052834cab0..19efa82647 100644 --- a/python/libcuvs/pyproject.toml +++ b/python/libcuvs/pyproject.toml @@ -20,8 +20,8 @@ license = "Apache-2.0" requires-python = ">=3.11" dependencies = [ "cuda-toolkit[cublas,curand,cusolver,cusparse]==13.*", - "libraft==26.4.*,>=0.0.0a0", - "librmm==26.4.*,>=0.0.0a0", + "libraft==26.6.*,>=0.0.0a0", + "librmm==26.6.*,>=0.0.0a0", "nvidia-nvjitlink>=13.0,<14", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ @@ -81,8 +81,8 @@ regex = "(?P<value>.*)" build-backend = "scikit_build_core.build" requires = [ "cmake>=3.30.4", - "libraft==26.4.*,>=0.0.0a0", - "librmm==26.4.*,>=0.0.0a0", + "libraft==26.6.*,>=0.0.0a0", + "librmm==26.6.*,>=0.0.0a0", "ninja", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
dependencies-file = "../../dependencies.yaml" diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 965c3ba718..96a909a9f8 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -6,7 +6,7 @@ members = [ resolver = "2" [workspace.package] -version = "26.4.0" +version = "26.6.0" edition = "2021" repository = "https://github.com/rapidsai/cuvs" homepage = "https://github.com/rapidsai/cuvs" diff --git a/rust/cuvs-sys/build.rs b/rust/cuvs-sys/build.rs index 99e811eeb9..cec80eb736 100644 --- a/rust/cuvs-sys/build.rs +++ b/rust/cuvs-sys/build.rs @@ -15,8 +15,11 @@ fn main() { "cargo:rustc-link-search=native={}/lib", cuvs_build.display() ); + if let Ok(conda_prefix) = env::var("CONDA_PREFIX") { + println!("cargo:rustc-link-search=native={}/lib", conda_prefix); + } println!("cargo:rustc-link-lib=dylib=cuvs_c"); - println!("cargo:rustc-link-lib=dylib=cudart"); + println!("cargo:rustc-link-lib=static=cudart_static"); // we need some extra flags both to link against cuvs, and also to run bindgen // specifically we need to: diff --git a/rust/cuvs/Cargo.toml b/rust/cuvs/Cargo.toml index 15bf8cf920..f90e1aeaf6 100644 --- a/rust/cuvs/Cargo.toml +++ b/rust/cuvs/Cargo.toml @@ -9,7 +9,7 @@ authors.workspace = true license.workspace = true [dependencies] -ffi = { package = "cuvs-sys", path = "../cuvs-sys", version = "26.4.0" } +ffi = { package = "cuvs-sys", path = "../cuvs-sys", version = "26.6.0" } ndarray = "0.15" [dev-dependencies]