From c5dcba0333baa4daf41292a8901044442c8d79c8 Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Wed, 18 Sep 2024 14:43:38 -0500
Subject: [PATCH 01/10] Use CI workflow branch 'branch-24.10' again [skip ci]
 (#2437)

---
 .github/workflows/build.yaml | 16 ++++++++--------
 .github/workflows/pr.yaml    | 26 +++++++++++++-------------
 .github/workflows/test.yaml  | 10 +++++-----
 3 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 037d7cc05f..2b0ae5099c 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -28,7 +28,7 @@ concurrency:
 jobs:
   cpp-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -37,7 +37,7 @@ jobs:
   python-build:
     needs: [cpp-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -46,7 +46,7 @@ jobs:
   upload-conda:
     needs: [cpp-build, python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -57,7 +57,7 @@ jobs:
     if: github.ref_type == 'branch'
     needs: python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
     with:
       arch: "amd64"
       branch: ${{ inputs.branch }}
@@ -69,7 +69,7 @@ jobs:
       sha: ${{ inputs.sha }}
   wheel-build-pylibraft:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -79,7 +79,7 @@ jobs:
   wheel-publish-pylibraft:
     needs: wheel-build-pylibraft
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -88,7 +88,7 @@ jobs:
       package-name: pylibraft
   wheel-build-raft-dask:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -98,7 +98,7 @@ jobs:
   wheel-publish-raft-dask:
     needs: wheel-build-raft-dask
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 26c6928f77..381ca6b378 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -25,29 +25,29 @@ jobs:
       - wheel-tests-raft-dask
       - devcontainer
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.10
   checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.10
     with:
       enable_check_generated_files: false
   conda-cpp-build:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10
     with:
       build_type: pull-request
       node_type: cpu16
   conda-cpp-tests:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10
     with:
       build_type: pull-request
   conda-cpp-checks:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.10
     with:
       build_type: pull-request
       enable_check_symbols: true
@@ -55,19 +55,19 @@ jobs:
   conda-python-build:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10
     with:
       build_type: pull-request
   conda-python-tests:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10
     with:
       build_type: pull-request
   docs-build:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
@@ -77,34 +77,34 @@ jobs:
   wheel-build-pylibraft:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
     with:
       build_type: pull-request
       script: ci/build_wheel_pylibraft.sh
   wheel-tests-pylibraft:
     needs: wheel-build-pylibraft
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
     with:
       build_type: pull-request
       script: ci/test_wheel_pylibraft.sh
   wheel-build-raft-dask:
     needs: wheel-tests-pylibraft
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
     with:
       build_type: pull-request
       script: "ci/build_wheel_raft_dask.sh"
   wheel-tests-raft-dask:
     needs: wheel-build-raft-dask
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
     with:
       build_type: pull-request
       script: ci/test_wheel_raft_dask.sh
   devcontainer:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.10
     with:
       arch: '["amd64"]'
       cuda: '["12.5"]'
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 90518c1132..ad0456d526 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -16,7 +16,7 @@ on:
 jobs:
   conda-cpp-checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.10
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -26,7 +26,7 @@ jobs:
       symbol_exclusions: raft_cutlass
   conda-cpp-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -34,7 +34,7 @@ jobs:
       sha: ${{ inputs.sha }}
   conda-python-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -42,7 +42,7 @@ jobs:
       sha: ${{ inputs.sha }}
   wheel-tests-pylibraft:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -51,7 +51,7 @@ jobs:
       script: ci/test_wheel_pylibraft.sh
   wheel-tests-raft-dask:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}

From cd02a8f2b8dc22ec2115e80e559a4ab98d402c6e Mon Sep 17 00:00:00 2001
From: Ray Douglass <3107146+raydouglass@users.noreply.github.com>
Date: Fri, 20 Sep 2024 13:05:44 -0400
Subject: [PATCH 02/10] Fix sed syntax [skip-ci] (#2441)

---
 ci/release/update-version.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index 075eb896f2..5bb98511cf 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -58,7 +58,7 @@ for FILE in dependencies.yaml conda/environments/*.yaml; do
     sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0/g" "${FILE}"
   done
   for DEP in "${UCXX_DEPENDENCIES[@]}"; do
-    sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_UCXX_SHORT_TAG_PEP440}.*/,>=0.0.0a0/g" "${FILE}"
+    sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_UCXX_SHORT_TAG_PEP440}.*,>=0.0.0a0/g" "${FILE}"
   done
 done
 for FILE in python/*/pyproject.toml; do

From f49567e1fb9f722f8864bc84880745b9b776eb61 Mon Sep 17 00:00:00 2001
From: Micka <mide@nvidia.com>
Date: Sat, 21 Sep 2024 01:09:00 +0200
Subject: [PATCH 03/10] Use runtime check of cudart version for eig (#2430)

Authors:
  - Micka (https://github.com/lowener)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2430
---
 cpp/include/raft/linalg/detail/eig.cuh | 31 ++++++++++++++------------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/cpp/include/raft/linalg/detail/eig.cuh b/cpp/include/raft/linalg/detail/eig.cuh
index ba7ed3dcdf..561187178c 100644
--- a/cpp/include/raft/linalg/detail/eig.cuh
+++ b/cpp/include/raft/linalg/detail/eig.cuh
@@ -95,16 +95,19 @@ void eigDC(raft::resources const& handle,
   return;
 #endif
 
-#if CUDART_VERSION <= 12040
-  // Use a new stream instead of `cudaStreamPerThread` to avoid cusolver bug # 4580093.
+  int cudart_version = 0;
+  RAFT_CUDA_TRY(cudaRuntimeGetVersion(&cudart_version));
+  cudaStream_t stream_new;
+  cudaEvent_t sync_event = resource::detail::get_cuda_stream_sync_event(handle);
   rmm::cuda_stream stream_new_wrapper;
-  cudaStream_t stream_new = stream_new_wrapper.value();
-  cudaEvent_t sync_event  = resource::detail::get_cuda_stream_sync_event(handle);
-  RAFT_CUDA_TRY(cudaEventRecord(sync_event, stream));
-  RAFT_CUDA_TRY(cudaStreamWaitEvent(stream_new, sync_event));
-#else
-  cudaStream_t stream_new = stream;
-#endif
+  if (cudart_version < 12050) {
+    // Use a new stream instead of `cudaStreamPerThread` to avoid cusolver bug # 4580093.
+    stream_new = stream_new_wrapper.value();
+    RAFT_CUDA_TRY(cudaEventRecord(sync_event, stream));
+    RAFT_CUDA_TRY(cudaStreamWaitEvent(stream_new, sync_event));
+  } else {
+    stream_new = stream;
+  }
   cusolverDnHandle_t cusolverH = resource::get_cusolver_dn_handle(handle);
 
   cusolverDnParams_t dn_params = nullptr;
@@ -152,11 +155,11 @@ void eigDC(raft::resources const& handle,
          "eig.cuh: eigensolver couldn't converge to a solution. "
          "This usually occurs when some of the features do not vary enough.");
 
-#if CUDART_VERSION <= 12040
-  // Synchronize the created stream with the original stream before return
-  RAFT_CUDA_TRY(cudaEventRecord(sync_event, stream_new));
-  RAFT_CUDA_TRY(cudaStreamWaitEvent(stream, sync_event));
-#endif
+  if (cudart_version < 12050) {
+    // Synchronize the created stream with the original stream before return
+    RAFT_CUDA_TRY(cudaEventRecord(sync_event, stream_new));
+    RAFT_CUDA_TRY(cudaStreamWaitEvent(stream, sync_event));
+  }
 }
 
 enum EigVecMemUsage { OVERWRITE_INPUT, COPY_INPUT };

From b79f15d2f229849bc02425b2e4ffd7bd3db89d4c Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Mon, 23 Sep 2024 16:09:01 -0500
Subject: [PATCH 04/10] Update fmt (to 11.0.2) and spdlog (to 1.14.1). (#2433)

* Update fmt (to 11.0.2) and spdlog (to 1.14.1).

* use rmm and ucxx CI artifacts

* try using librmm wheels

* try again to use rmm wheels

* rapids-get-pr-wheel-artifact was missing RAPIDS_PY_WHEEL_NAME

* ok you do not need to provide the Python slug yourself for rapids-get-pr-wheel-artifact

* constraints need 'file://' protocol

* try suppressing unreachable-code diagnostics from nvcc (this should be narrowed down / upstreamed before merging)

* fix checks

* Revert "try suppressing unreachable-code diagnostics from nvcc (this should be narrowed down / upstreamed before merging)"

This reverts commit 3ba2201d678cc8befdfcaa0e89630a95a00a78b2.

* copyright

* move rapids-cmake overrides [skip ci]

* kick off a build

* fix dependency graph

* devcontainer

* run all CI

* remove testing-only changes [skip ci]
---
 conda/recipes/libraft/conda_build_config.yaml         |  4 ++--
 .../raft-ann-bench-cpu/conda_build_config.yaml        |  4 ++--
 cpp/cmake/thirdparty/get_spdlog.cmake                 | 11 +----------
 3 files changed, 5 insertions(+), 14 deletions(-)

diff --git a/conda/recipes/libraft/conda_build_config.yaml b/conda/recipes/libraft/conda_build_config.yaml
index 00b133c821..5c0047fb9c 100644
--- a/conda/recipes/libraft/conda_build_config.yaml
+++ b/conda/recipes/libraft/conda_build_config.yaml
@@ -73,7 +73,7 @@ cuda11_cuda_profiler_api_run_version:
   - ">=11.4.240,<12"
 
 spdlog_version:
-  - ">=1.12.0,<1.13"
+  - ">=1.14.1,<1.15"
 
 fmt_version:
-  - ">=10.1.1,<11"
+  - ">=11.0.2,<12"
diff --git a/conda/recipes/raft-ann-bench-cpu/conda_build_config.yaml b/conda/recipes/raft-ann-bench-cpu/conda_build_config.yaml
index 70d1f0490e..ed6f708e14 100644
--- a/conda/recipes/raft-ann-bench-cpu/conda_build_config.yaml
+++ b/conda/recipes/raft-ann-bench-cpu/conda_build_config.yaml
@@ -23,7 +23,7 @@ nlohmann_json_version:
   - ">=3.11.2"
 
 spdlog_version:
-  - ">=1.12.0,<1.13"
+  - ">=1.14.1,<1.15"
 
 fmt_version:
-  - ">=10.1.1,<11"
+  - ">=11.0.2,<12"
diff --git a/cpp/cmake/thirdparty/get_spdlog.cmake b/cpp/cmake/thirdparty/get_spdlog.cmake
index 7be7804c7e..57e38c2638 100644
--- a/cpp/cmake/thirdparty/get_spdlog.cmake
+++ b/cpp/cmake/thirdparty/get_spdlog.cmake
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2021-2023, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -19,15 +19,6 @@ function(find_and_configure_spdlog)
     rapids_cpm_spdlog(FMT_OPTION "EXTERNAL_FMT_HO" INSTALL_EXPORT_SET rmm-exports)
     rapids_export_package(BUILD spdlog rmm-exports)
 
-    if(spdlog_ADDED)
-        rapids_export(
-                BUILD spdlog
-                EXPORT_SET spdlog
-                GLOBAL_TARGETS spdlog spdlog_header_only
-                NAMESPACE spdlog::)
-        include("${rapids-cmake-dir}/export/find_package_root.cmake")
-        rapids_export_find_package_root(BUILD spdlog [=[${CMAKE_CURRENT_LIST_DIR}]=] EXPORT_SET rmm-exports)
-    endif()
 endfunction()
 
 find_and_configure_spdlog()
\ No newline at end of file

From 878cefc923f25df07fc362126d1c269bdff30c0c Mon Sep 17 00:00:00 2001
From: Jake Awe <50372925+AyodeAwe@users.noreply.github.com>
Date: Tue, 24 Sep 2024 14:10:57 -0500
Subject: [PATCH 05/10] update update-version.sh to use packaging lib (#2447)

---
 ci/release/update-version.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index 5bb98511cf..032b88b4aa 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -25,8 +25,8 @@ NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR}
 NEXT_UCXX_SHORT_TAG="$(curl -sL https://version.gpuci.io/rapids/${NEXT_SHORT_TAG})"
 
 # Need to distutils-normalize the original version
-NEXT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_SHORT_TAG}'))")
-NEXT_UCXX_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_UCXX_SHORT_TAG}'))")
+NEXT_SHORT_TAG_PEP440=$(python -c "from packaging.version import Version; print(Version('${NEXT_SHORT_TAG}'))")
+NEXT_UCXX_SHORT_TAG_PEP440=$(python -c "from packaging.version import Version; print(Version('${NEXT_UCXX_SHORT_TAG}'))")
 
 echo "Preparing release $CURRENT_TAG => $NEXT_FULL_TAG"
 

From 17757b84c38dfa671abe5e8794fd2bd9bc636772 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Wed, 25 Sep 2024 13:49:36 -0500
Subject: [PATCH 06/10] Switch traceback to `native` (#2446)

In cudf we have observed a ~10% speed up of pytest suite execution by switching pytest traceback to `--native`:

```
currently:

102474 passed, 2117 skipped, 902 xfailed in 892.16s (0:14:52)

--tb=short:

102474 passed, 2117 skipped, 902 xfailed in 898.99s (0:14:58)

--tb=no:

102474 passed, 2117 skipped, 902 xfailed in 815.98s (0:13:35)

--tb=native:

102474 passed, 2117 skipped, 902 xfailed in 820.92s (0:13:40)
```

This PR makes a similar change to `raft` repo.

xref: https://github.com/rapidsai/cudf/pull/16851

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2446
---
 python/pylibraft/pylibraft/test/pytest.ini | 5 +++++
 python/raft-dask/pytest.ini                | 1 +
 python/raft-dask/raft_dask/test/pytest.ini | 5 +++++
 3 files changed, 11 insertions(+)
 create mode 100644 python/pylibraft/pylibraft/test/pytest.ini
 create mode 100644 python/raft-dask/raft_dask/test/pytest.ini

diff --git a/python/pylibraft/pylibraft/test/pytest.ini b/python/pylibraft/pylibraft/test/pytest.ini
new file mode 100644
index 0000000000..bf70c06f84
--- /dev/null
+++ b/python/pylibraft/pylibraft/test/pytest.ini
@@ -0,0 +1,5 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+[pytest]
+addopts = --tb=native
+
diff --git a/python/raft-dask/pytest.ini b/python/raft-dask/pytest.ini
index 2467e2089a..e09c2b173d 100644
--- a/python/raft-dask/pytest.ini
+++ b/python/raft-dask/pytest.ini
@@ -10,3 +10,4 @@ markers =
   nccl: marks a test as using NCCL
   ucx: marks a test as using UCX-Py
   ucxx: marks a test as using UCXX
+addopts = --tb=native
diff --git a/python/raft-dask/raft_dask/test/pytest.ini b/python/raft-dask/raft_dask/test/pytest.ini
new file mode 100644
index 0000000000..bf70c06f84
--- /dev/null
+++ b/python/raft-dask/raft_dask/test/pytest.ini
@@ -0,0 +1,5 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+[pytest]
+addopts = --tb=native
+

From 0f8b09720054e5cf7a65dddfa02a2f06e7ba8adb Mon Sep 17 00:00:00 2001
From: Divye Gala <divyegala@gmail.com>
Date: Wed, 25 Sep 2024 20:53:54 -0400
Subject: [PATCH 07/10] Disable NN Descent Batch tests temporarily (#2453)

Linked issue https://github.com/rapidsai/raft/issues/2450

Authors:
  - Divye Gala (https://github.com/divyegala)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2453
---
 cpp/test/CMakeLists.txt               |  4 +++-
 cpp/test/neighbors/ann_nn_descent.cuh | 18 ++++++++++--------
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index a497e6d3ba..5d504d2100 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -440,7 +440,9 @@ if(BUILD_TESTS)
     neighbors/ann_nn_descent/test_float_uint32_t.cu
     neighbors/ann_nn_descent/test_int8_t_uint32_t.cu
     neighbors/ann_nn_descent/test_uint8_t_uint32_t.cu
-    neighbors/ann_nn_descent/test_batch_float_uint32_t.cu
+    # TODO: Investigate why this test is failing Reference issue
+    # https://github.com/rapidsai/raft/issues/2450
+    # neighbors/ann_nn_descent/test_batch_float_uint32_t.cu
     LIB
     EXPLICIT_INSTANTIATE_ONLY
     GPUS
diff --git a/cpp/test/neighbors/ann_nn_descent.cuh b/cpp/test/neighbors/ann_nn_descent.cuh
index 2f9d4e252b..5070d83b15 100644
--- a/cpp/test/neighbors/ann_nn_descent.cuh
+++ b/cpp/test/neighbors/ann_nn_descent.cuh
@@ -318,13 +318,15 @@ const std::vector<AnnNNDescentInputs> inputs = raft::util::itertools::product<An
   {false, true},
   {0.90});
 
-const std::vector<AnnNNDescentBatchInputs> inputsBatch =
-  raft::util::itertools::product<AnnNNDescentBatchInputs>(
-    {std::make_pair(0.9, 3lu), std::make_pair(0.9, 2lu)},  // min_recall, n_clusters
-    {4000, 5000},                                          // n_rows
-    {192, 512},                                            // dim
-    {32, 64},                                              // graph_degree
-    {raft::distance::DistanceType::L2Expanded},
-    {false, true});
+// TODO: Investigate why this test is failing
+// Reference issue https://github.com/rapidsai/raft/issues/2450
+// const std::vector<AnnNNDescentBatchInputs> inputsBatch =
+//   raft::util::itertools::product<AnnNNDescentBatchInputs>(
+//     {std::make_pair(0.9, 3lu), std::make_pair(0.9, 2lu)},  // min_recall, n_clusters
+//     {4000, 5000},                                          // n_rows
+//     {192, 512},                                            // dim
+//     {32, 64},                                              // graph_degree
+//     {raft::distance::DistanceType::L2Expanded},
+//     {false, true});
 
 }  // namespace raft::neighbors::experimental::nn_descent

From 2e705fbf5bfa9af148871ba509ac55593da9a098 Mon Sep 17 00:00:00 2001
From: Paul Taylor <178183+trxcllnt@users.noreply.github.com>
Date: Wed, 25 Sep 2024 23:01:05 -0700
Subject: [PATCH 08/10] Add missing `cuda_suffixed: true` (#2440)

This change ensures `rmm-cuXX` is not in the package dependencies when building the pylibraft wheel with `-C "rapidsai.matrix-entry=cuda_suffixed=false`.

Authors:
  - Paul Taylor (https://github.com/trxcllnt)
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/raft/pull/2440
---
 dependencies.yaml | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/dependencies.yaml b/dependencies.yaml
index 8f5c69245f..91fb176b56 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -499,10 +499,14 @@ dependencies:
               - *cuda_python
       - output_types: [requirements, pyproject]
         matrices:
-          - matrix: {cuda: "12.*"}
+          - matrix:
+              cuda: "12.*"
+              cuda_suffixed: "true"
             packages:
               - *rmm_cu12
-          - matrix: {cuda: "11.*"}
+          - matrix:
+              cuda: "11.*"
+              cuda_suffixed: "true"
             packages:
               - *rmm_cu11
           - {matrix: null, packages: [*rmm_unsuffixed]}

From 0284b424367f07220de133715043a963b341485a Mon Sep 17 00:00:00 2001
From: rhdong <hrong@nvidia.com>
Date: Thu, 26 Sep 2024 07:42:29 -0700
Subject: [PATCH 09/10] [Feat] add `repeat`, `sparsity`,  `eval_n_elements`
 APIs to `bitset` (#2439)

- This PR is a part of the feature that applies the prefilter brute-force in Cagra.

Authors:
  - rhdong (https://github.com/rhdong)
  - Corey J. Nolet (https://github.com/cjnolet)

Approvers:
  - Micka (https://github.com/lowener)

URL: https://github.com/rapidsai/raft/pull/2439
---
 cpp/bench/prims/util/popc.cu          |   7 +-
 cpp/include/raft/core/bitset.cuh      | 107 +++++++++++++++++++++++-
 cpp/include/raft/core/bitset.hpp      |  76 +++++++++++++++++
 cpp/include/raft/util/detail/popc.cuh |   6 +-
 cpp/include/raft/util/popc.cuh        |   4 +-
 cpp/test/core/bitset.cu               | 114 +++++++++++++++++++++++---
 cpp/test/util/popc.cu                 |  22 +++--
 7 files changed, 311 insertions(+), 25 deletions(-)

diff --git a/cpp/bench/prims/util/popc.cu b/cpp/bench/prims/util/popc.cu
index 249dc13d1e..c6249fb2bd 100644
--- a/cpp/bench/prims/util/popc.cu
+++ b/cpp/bench/prims/util/popc.cu
@@ -89,10 +89,9 @@ struct popc_bench : public fixture {
       auto bits_view =
         raft::make_device_vector_view<const bits_t, index_t>(bits_d.data_handle(), bits_d.size());
 
-      index_t max_len   = params.n_rows * params.n_cols;
-      auto max_len_view = raft::make_host_scalar_view<index_t>(&max_len);
-      auto nnz_actual_view =
-        nnz_actual_d.view();  // raft::make_device_scalar_view<index_t>(nnz_actual_d.data_handle());
+      index_t max_len      = params.n_rows * params.n_cols;
+      auto max_len_view    = raft::make_host_scalar_view<const index_t, index_t>(&max_len);
+      auto nnz_actual_view = nnz_actual_d.view();
       raft::popc(this->handle, bits_view, max_len_view, nnz_actual_view);
     });
   }
diff --git a/cpp/include/raft/core/bitset.cuh b/cpp/include/raft/core/bitset.cuh
index b6e6128eca..d1bffdb81e 100644
--- a/cpp/include/raft/core/bitset.cuh
+++ b/cpp/include/raft/core/bitset.cuh
@@ -26,6 +26,8 @@
 #include <raft/util/device_atomics.cuh>
 #include <raft/util/popc.cuh>
 
+#include <rmm/device_scalar.hpp>
+
 #include <thrust/for_each.h>
 
 namespace raft::core {
@@ -60,6 +62,109 @@ _RAFT_DEVICE void bitset_view<bitset_t, index_t>::set(const index_t sample_index
   }
 }
 
+template <typename bitset_t, typename index_t>
+void bitset_view<bitset_t, index_t>::count(const raft::resources& res,
+                                           raft::device_scalar_view<index_t> count_gpu_scalar) const
+{
+  auto max_len = raft::make_host_scalar_view<const index_t, index_t>(&bitset_len_);
+  auto values  = raft::make_device_vector_view<const bitset_t, index_t>(bitset_ptr_, n_elements());
+  raft::popc(res, values, max_len, count_gpu_scalar);
+}
+
+template <typename bitset_t, typename index_t>
+RAFT_KERNEL bitset_repeat_kernel(const bitset_t* src,
+                                 bitset_t* output,
+                                 index_t src_bit_len,
+                                 index_t repeat_times)
+{
+  constexpr index_t bits_per_element = sizeof(bitset_t) * 8;
+  int output_idx                     = blockIdx.x * blockDim.x + threadIdx.x;
+
+  index_t total_bits  = src_bit_len * repeat_times;
+  index_t output_size = (total_bits + bits_per_element - 1) / bits_per_element;
+  index_t src_size    = (src_bit_len + bits_per_element - 1) / bits_per_element;
+
+  if (output_idx < output_size) {
+    bitset_t result     = 0;
+    index_t bit_written = 0;
+
+    index_t start_bit = output_idx * bits_per_element;
+
+    while (bit_written < bits_per_element && start_bit + bit_written < total_bits) {
+      index_t bit_idx      = (start_bit + bit_written) % src_bit_len;
+      index_t src_word_idx = bit_idx / bits_per_element;
+      index_t src_offset   = bit_idx % bits_per_element;
+
+      index_t remaining_bits = min(bits_per_element - bit_written, src_bit_len - bit_idx);
+
+      bitset_t src_value = (src[src_word_idx] >> src_offset);
+
+      if (src_offset + remaining_bits > bits_per_element) {
+        bitset_t next_value = src[(src_word_idx + 1) % src_size];
+        src_value |= (next_value << (bits_per_element - src_offset));
+      }
+      src_value &= ((bitset_t{1} << remaining_bits) - 1);
+      result |= (src_value << bit_written);
+      bit_written += remaining_bits;
+    }
+    output[output_idx] = result;
+  }
+}
+
+template <typename bitset_t, typename index_t>
+void bitset_repeat(raft::resources const& handle,
+                   const bitset_t* d_src,
+                   bitset_t* d_output,
+                   index_t src_bit_len,
+                   index_t repeat_times)
+{
+  if (src_bit_len == 0 || repeat_times == 0) return;
+  auto stream = resource::get_cuda_stream(handle);
+
+  constexpr index_t bits_per_element = sizeof(bitset_t) * 8;
+  const index_t total_bits           = src_bit_len * repeat_times;
+  const index_t output_size          = (total_bits + bits_per_element - 1) / bits_per_element;
+
+  int threadsPerBlock = 128;
+  int blocksPerGrid   = (output_size + threadsPerBlock - 1) / threadsPerBlock;
+  bitset_repeat_kernel<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
+    d_src, d_output, src_bit_len, repeat_times);
+
+  return;
+}
+
+template <typename bitset_t, typename index_t>
+void bitset_view<bitset_t, index_t>::repeat(const raft::resources& res,
+                                            index_t times,
+                                            bitset_t* output_device_ptr) const
+{
+  auto thrust_policy                 = raft::resource::get_thrust_policy(res);
+  constexpr index_t bits_per_element = sizeof(bitset_t) * 8;
+
+  if (bitset_len_ % bits_per_element == 0) {
+    index_t num_elements_to_copy = bitset_len_ / bits_per_element;
+
+    for (index_t i = 0; i < times; ++i) {
+      raft::copy(output_device_ptr + i * num_elements_to_copy,
+                 bitset_ptr_,
+                 num_elements_to_copy,
+                 raft::resource::get_cuda_stream(res));
+    }
+  } else {
+    bitset_repeat(res, bitset_ptr_, output_device_ptr, bitset_len_, times);
+  }
+}
+
+template <typename bitset_t, typename index_t>
+double bitset_view<bitset_t, index_t>::sparsity(const raft::resources& res) const
+{
+  index_t size_h = this->size();
+  if (0 == size_h) { return static_cast<double>(1.0); }
+  index_t count_h = this->count(res);
+
+  return static_cast<double>((1.0 * (size_h - count_h)) / (1.0 * size_h));
+}
+
 template <typename bitset_t, typename index_t>
 bitset<bitset_t, index_t>::bitset(const raft::resources& res,
                                   raft::device_vector_view<const index_t, index_t> mask_index,
@@ -155,7 +260,7 @@ template <typename bitset_t, typename index_t>
 void bitset<bitset_t, index_t>::count(const raft::resources& res,
                                       raft::device_scalar_view<index_t> count_gpu_scalar)
 {
-  auto max_len = raft::make_host_scalar_view<index_t>(&bitset_len_);
+  auto max_len = raft::make_host_scalar_view<const index_t, index_t>(&bitset_len_);
   auto values =
     raft::make_device_vector_view<const bitset_t, index_t>(bitset_.data(), n_elements());
   raft::popc(res, values, max_len, count_gpu_scalar);
diff --git a/cpp/include/raft/core/bitset.hpp b/cpp/include/raft/core/bitset.hpp
index 3608ee43fa..be828def87 100644
--- a/cpp/include/raft/core/bitset.hpp
+++ b/cpp/include/raft/core/bitset.hpp
@@ -22,6 +22,8 @@
 #include <raft/core/resources.hpp>
 #include <raft/util/integer_utils.hpp>
 
+#include <cmath>
+
 namespace raft::core {
 /**
  * @defgroup bitset Bitset
@@ -103,6 +105,80 @@ struct bitset_view {
   {
     return raft::make_device_vector_view<const bitset_t, index_t>(bitset_ptr_, n_elements());
   }
+  /**
+   * @brief Returns the number of bits set to true in count_gpu_scalar.
+   *
+   * @param[in] res RAFT resources
+   * @param[out] count_gpu_scalar Device scalar to store the count
+   */
+  void count(const raft::resources& res, raft::device_scalar_view<index_t> count_gpu_scalar) const;
+  /**
+   * @brief Returns the number of bits set to true.
+   *
+   * @param res RAFT resources
+   * @return index_t Number of bits set to true
+   */
+  auto count(const raft::resources& res) const -> index_t
+  {
+    auto count_gpu_scalar = raft::make_device_scalar<index_t>(res, 0.0);
+    count(res, count_gpu_scalar.view());
+    index_t count_cpu = 0;
+    raft::update_host(
+      &count_cpu, count_gpu_scalar.data_handle(), 1, resource::get_cuda_stream(res));
+    resource::sync_stream(res);
+    return count_cpu;
+  }
+
+  /**
+   * @brief Repeats the bitset data and copies it to the output device pointer.
+   *
+   * This function takes the original bitset data stored in the device memory
+   * and repeats it a specified number of times into a new location in the device memory.
+   * The bits are copied bit-by-bit to ensure that even if the number of bits (bitset_len_)
+   * is not a multiple of the bitset element size (e.g., 32 for uint32_t), the bits are
+   * tightly packed without any gaps between rows.
+   *
+   * @param res RAFT resources for managing CUDA streams and execution policies.
+   * @param times Number of times the bitset data should be repeated in the output.
+   * @param output_device_ptr Device pointer where the repeated bitset data will be stored.
+   *
+   * The caller must ensure that the output device pointer has enough memory allocated
+   * to hold `times * bitset_len` bits, where `bitset_len` is the number of bits in the original
+   * bitset. This function uses Thrust parallel algorithms to efficiently perform the operation on
+   * the GPU.
+   */
+  void repeat(const raft::resources& res, index_t times, bitset_t* output_device_ptr) const;
+
+  /**
+   * @brief Calculate the sparsity (fraction of 0s) of the bitset.
+   *
+   * This function computes the sparsity of the bitset, defined as the ratio of unset bits (0s)
+   * to the total number of bits in the set. If the total number of bits is zero, the function
+   * returns 1.0, indicating the set is fully sparse.
+   *
+   * @param res RAFT resources for managing CUDA streams and execution policies.
+   * @return double The sparsity of the bitset, i.e., the fraction of unset bits.
+   *
+   * This API will synchronize on the stream of `res`.
+   */
+  double sparsity(const raft::resources& res) const;
+
+  /**
+   * @brief Calculates the number of `bitset_t` elements required to store a bitset.
+   *
+   * This function computes the number of `bitset_t` elements needed to store a bitset, ensuring
+   * that all bits are accounted for. If the bitset length is not a multiple of the `bitset_t` size
+   * (in bits), the calculation rounds up to include the remaining bits in an additional `bitset_t`
+   * element.
+   *
+   * @param bitset_len The total length of the bitset in bits.
+   * @return size_t The number of `bitset_t` elements required to store the bitset.
+   */
+  static inline size_t eval_n_elements(size_t bitset_len)
+  {
+    const size_t bits_per_element = sizeof(bitset_t) * 8;
+    return (bitset_len + bits_per_element - 1) / bits_per_element;
+  }
 
  private:
   bitset_t* bitset_ptr_;
diff --git a/cpp/include/raft/util/detail/popc.cuh b/cpp/include/raft/util/detail/popc.cuh
index 20b4814216..f335be6fd0 100644
--- a/cpp/include/raft/util/detail/popc.cuh
+++ b/cpp/include/raft/util/detail/popc.cuh
@@ -36,12 +36,12 @@ namespace raft::detail {
  */
 template <typename value_t, typename index_t>
 void popc(const raft::resources& res,
-          device_vector_view<value_t, index_t> values,
-          raft::host_scalar_view<index_t> max_len,
+          device_vector_view<const value_t, index_t> values,
+          raft::host_scalar_view<const index_t, index_t> max_len,
           raft::device_scalar_view<index_t> counter)
 {
   auto values_size   = values.size();
-  auto values_matrix = raft::make_device_matrix_view<value_t, index_t, col_major>(
+  auto values_matrix = raft::make_device_matrix_view<const value_t, index_t, col_major>(
     values.data_handle(), values_size, 1);
   auto counter_vector = raft::make_device_vector_view<index_t, index_t>(counter.data_handle(), 1);
 
diff --git a/cpp/include/raft/util/popc.cuh b/cpp/include/raft/util/popc.cuh
index 153694e45e..d4bc01e274 100644
--- a/cpp/include/raft/util/popc.cuh
+++ b/cpp/include/raft/util/popc.cuh
@@ -31,8 +31,8 @@ namespace raft {
  */
 template <typename value_t, typename index_t>
 void popc(const raft::resources& res,
-          device_vector_view<value_t, index_t> values,
-          raft::host_scalar_view<index_t> max_len,
+          device_vector_view<const value_t, index_t> values,
+          raft::host_scalar_view<const index_t, index_t> max_len,
           raft::device_scalar_view<index_t> counter)
 {
   detail::popc(res, values, max_len, counter);
diff --git a/cpp/test/core/bitset.cu b/cpp/test/core/bitset.cu
index b799297e8c..ac601274c1 100644
--- a/cpp/test/core/bitset.cu
+++ b/cpp/test/core/bitset.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -32,12 +32,13 @@ struct test_spec_bitset {
   uint64_t bitset_len;
   uint64_t mask_len;
   uint64_t query_len;
+  uint64_t repeat_times;
 };
 
 auto operator<<(std::ostream& os, const test_spec_bitset& ss) -> std::ostream&
 {
   os << "bitset{bitset_len: " << ss.bitset_len << ", mask_len: " << ss.mask_len
-     << ", query_len: " << ss.query_len << "}";
+     << ", query_len: " << ss.query_len << ", repeat_times: " << ss.repeat_times << "}";
   return os;
 }
 
@@ -80,6 +81,48 @@ void flip_cpu_bitset(std::vector<bitset_t>& bitset)
   }
 }
 
+template <typename bitset_t>
+void repeat_cpu_bitset(std::vector<bitset_t>& input,
+                       size_t input_bits,
+                       size_t repeat,
+                       std::vector<bitset_t>& output)
+{
+  const size_t output_bits  = input_bits * repeat;
+  const size_t output_units = (output_bits + sizeof(bitset_t) * 8 - 1) / (sizeof(bitset_t) * 8);
+
+  std::memset(output.data(), 0, output_units * sizeof(bitset_t));
+
+  size_t output_bit_index = 0;
+
+  for (size_t r = 0; r < repeat; ++r) {
+    for (size_t i = 0; i < input_bits; ++i) {
+      size_t input_unit_index = i / (sizeof(bitset_t) * 8);
+      size_t input_bit_offset = i % (sizeof(bitset_t) * 8);
+      bool bit                = (input[input_unit_index] >> input_bit_offset) & 1;
+
+      size_t output_unit_index = output_bit_index / (sizeof(bitset_t) * 8);
+      size_t output_bit_offset = output_bit_index % (sizeof(bitset_t) * 8);
+
+      output[output_unit_index] |= (static_cast<bitset_t>(bit) << output_bit_offset);
+
+      ++output_bit_index;
+    }
+  }
+}
+
+template <typename bitset_t>
+double sparsity_cpu_bitset(std::vector<bitset_t>& data, size_t total_bits)
+{
+  size_t one_count = 0;
+  for (size_t i = 0; i < total_bits; ++i) {
+    size_t unit_index = i / (sizeof(bitset_t) * 8);
+    size_t bit_offset = i % (sizeof(bitset_t) * 8);
+    bool bit          = (data[unit_index] >> bit_offset) & 1;
+    if (bit == 1) { ++one_count; }
+  }
+  return static_cast<double>((total_bits - one_count) / (1.0 * total_bits));
+}
+
 template <typename bitset_t, typename index_t>
 class BitsetTest : public testing::TestWithParam<test_spec_bitset> {
  protected:
@@ -87,13 +130,19 @@ class BitsetTest : public testing::TestWithParam<test_spec_bitset> {
   const test_spec_bitset spec;
   std::vector<bitset_t> bitset_result;
   std::vector<bitset_t> bitset_ref;
+  std::vector<bitset_t> bitset_repeat_ref;
+  std::vector<bitset_t> bitset_repeat_result;
   raft::resources res;
 
  public:
   explicit BitsetTest()
     : spec(testing::TestWithParam<test_spec_bitset>::GetParam()),
       bitset_result(raft::ceildiv(spec.bitset_len, uint64_t(bitset_element_size))),
-      bitset_ref(raft::ceildiv(spec.bitset_len, uint64_t(bitset_element_size)))
+      bitset_ref(raft::ceildiv(spec.bitset_len, uint64_t(bitset_element_size))),
+      bitset_repeat_ref(
+        raft::ceildiv(spec.bitset_len * spec.repeat_times, uint64_t(bitset_element_size))),
+      bitset_repeat_result(
+        raft::ceildiv(spec.bitset_len * spec.repeat_times, uint64_t(bitset_element_size)))
   {
   }
 
@@ -145,6 +194,50 @@ class BitsetTest : public testing::TestWithParam<test_spec_bitset> {
     resource::sync_stream(res, stream);
     ASSERT_TRUE(hostVecMatch(bitset_ref, bitset_result, raft::Compare<bitset_t>()));
 
+    // test sparsity, repeat and eval_n_elements
+    {
+      auto my_bitset_view  = my_bitset.view();
+      auto sparsity_result = my_bitset_view.sparsity(res);
+      auto sparsity_ref    = sparsity_cpu_bitset(bitset_ref, size_t(spec.bitset_len));
+      ASSERT_EQ(sparsity_result, sparsity_ref);
+
+      auto eval_n_elements =
+        bitset_view<bitset_t, index_t>::eval_n_elements(spec.bitset_len * spec.repeat_times);
+      ASSERT_EQ(bitset_repeat_ref.size(), eval_n_elements);
+
+      auto repeat_device = raft::make_device_vector<bitset_t, index_t>(res, eval_n_elements);
+      RAFT_CUDA_TRY(cudaMemsetAsync(
+        repeat_device.data_handle(), 0, eval_n_elements * sizeof(bitset_t), stream));
+      repeat_cpu_bitset(
+        bitset_ref, size_t(spec.bitset_len), size_t(spec.repeat_times), bitset_repeat_ref);
+
+      my_bitset_view.repeat(res, index_t(spec.repeat_times), repeat_device.data_handle());
+
+      ASSERT_EQ(bitset_repeat_ref.size(), repeat_device.size());
+      update_host(
+        bitset_repeat_result.data(), repeat_device.data_handle(), repeat_device.size(), stream);
+      ASSERT_EQ(bitset_repeat_ref.size(), bitset_repeat_result.size());
+
+      index_t errors                        = 0;
+      static constexpr index_t len_per_item = sizeof(bitset_t) * 8;
+      bitset_t tail_len = (index_t(spec.bitset_len * spec.repeat_times) % len_per_item);
+      bitset_t tail_mask =
+        tail_len ? (bitset_t)((bitset_t{1} << tail_len) - bitset_t{1}) : ~bitset_t{0};
+      for (index_t i = 0; i < bitset_repeat_ref.size(); i++) {
+        if (i == bitset_repeat_ref.size() - 1) {
+          errors += (bitset_repeat_ref[i] & tail_mask) != (bitset_repeat_result[i] & tail_mask);
+        } else {
+          errors += (bitset_repeat_ref[i] != bitset_repeat_result[i]);
+        }
+      }
+      ASSERT_EQ(errors, 0);
+
+      // recheck the sparsity after repeat
+      sparsity_result =
+        sparsity_cpu_bitset(bitset_repeat_result, size_t(spec.bitset_len * spec.repeat_times));
+      ASSERT_EQ(sparsity_result, sparsity_ref);
+    }
+
     // Flip the bitset and re-test
     auto bitset_count = my_bitset.count(res);
     my_bitset.flip(res);
@@ -167,13 +260,14 @@ class BitsetTest : public testing::TestWithParam<test_spec_bitset> {
   }
 };
 
-auto inputs_bitset = ::testing::Values(test_spec_bitset{32, 5, 10},
-                                       test_spec_bitset{100, 30, 10},
-                                       test_spec_bitset{1024, 55, 100},
-                                       test_spec_bitset{10000, 1000, 1000},
-                                       test_spec_bitset{1 << 15, 1 << 3, 1 << 12},
-                                       test_spec_bitset{1 << 15, 1 << 24, 1 << 13},
-                                       test_spec_bitset{1 << 25, 1 << 23, 1 << 14});
+auto inputs_bitset = ::testing::Values(test_spec_bitset{32, 5, 10, 101},
+                                       test_spec_bitset{100, 30, 10, 13},
+                                       test_spec_bitset{1024, 55, 100, 1},
+                                       test_spec_bitset{10000, 1000, 1000, 100},
+                                       test_spec_bitset{1 << 15, 1 << 3, 1 << 12, 5},
+                                       test_spec_bitset{1 << 15, 1 << 24, 1 << 13, 3},
+                                       test_spec_bitset{1 << 25, 1 << 23, 1 << 14, 3},
+                                       test_spec_bitset{1 << 25, 1 << 23, 1 << 14, 21});
 
 using Uint16_32 = BitsetTest<uint16_t, uint32_t>;
 TEST_P(Uint16_32, Run) { run(); }
diff --git a/cpp/test/util/popc.cu b/cpp/test/util/popc.cu
index c08faacb07..28eaad2fcb 100644
--- a/cpp/test/util/popc.cu
+++ b/cpp/test/util/popc.cu
@@ -76,7 +76,7 @@ class PopcTest : public ::testing::TestWithParam<PopcInputs<index_t>> {
       index_t bit_position = index % (8 * sizeof(bits_t));
 
       if (((element >> bit_position) & 1) == 0) {
-        element |= (static_cast<index_t>(1) << bit_position);
+        element |= (static_cast<bits_t>(1) << bit_position);
         num_ones--;
       }
     }
@@ -101,7 +101,7 @@ class PopcTest : public ::testing::TestWithParam<PopcInputs<index_t>> {
       raft::make_device_vector_view<const bits_t, index_t>(bits_d.data(), bits_d.size());
 
     index_t max_len   = params.n_rows * params.n_cols;
-    auto max_len_view = raft::make_host_scalar_view<index_t>(&max_len);
+    auto max_len_view = raft::make_host_scalar_view<const index_t, index_t>(&max_len);
 
     index_t nnz_actual_h = 0;
     rmm::device_scalar<index_t> nnz_actual_d(0, stream);
@@ -123,8 +123,17 @@ class PopcTest : public ::testing::TestWithParam<PopcInputs<index_t>> {
   index_t nnz_expected;
 };
 
-using PopcTestI32 = PopcTest<int32_t>;
-TEST_P(PopcTestI32, Result) { Run(); }
+using PopcTestI32_U32 = PopcTest<int32_t, uint32_t>;
+TEST_P(PopcTestI32_U32, Result) { Run(); }
+
+using PopcTestI32_U64 = PopcTest<int32_t, uint64_t>;
+TEST_P(PopcTestI32_U64, Result) { Run(); }
+
+using PopcTestI32_U16 = PopcTest<int32_t, uint16_t>;
+TEST_P(PopcTestI32_U16, Result) { Run(); }
+
+using PopcTestI32_U8 = PopcTest<int32_t, uint8_t>;
+TEST_P(PopcTestI32_U8, Result) { Run(); }
 
 template <typename index_t>
 const std::vector<PopcInputs<index_t>> popc_inputs = {
@@ -154,6 +163,9 @@ const std::vector<PopcInputs<index_t>> popc_inputs = {
   {2, 33, 0.2},
 };
 
-INSTANTIATE_TEST_CASE_P(PopcTest, PopcTestI32, ::testing::ValuesIn(popc_inputs<int32_t>));
+INSTANTIATE_TEST_CASE_P(PopcTest, PopcTestI32_U32, ::testing::ValuesIn(popc_inputs<int32_t>));
+INSTANTIATE_TEST_CASE_P(PopcTest, PopcTestI32_U64, ::testing::ValuesIn(popc_inputs<int32_t>));
+INSTANTIATE_TEST_CASE_P(PopcTest, PopcTestI32_U16, ::testing::ValuesIn(popc_inputs<int32_t>));
+INSTANTIATE_TEST_CASE_P(PopcTest, PopcTestI32_U8, ::testing::ValuesIn(popc_inputs<int32_t>));
 
 }  // namespace raft

From f37c41c54fc64a4e3689e5a61851ba3821800fee Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Thu, 26 Sep 2024 09:45:11 -0500
Subject: [PATCH 10/10] bump NCCL floor to 2.18.1.1 (#2443)

Contributes to https://github.com/rapidsai/build-planning/issues/102

Some RAPIDS libraries are using `ncclCommSplit()`, which was introduced in `nccl==2.18.1.1`. This is part of a series of PRs across RAPIDS updating libraries' pins to `nccl>=2.18.1.1` to ensure they get a new-enough version that supports that.

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/raft/pull/2443
---
 conda/environments/all_cuda-118_arch-aarch64.yaml       | 2 +-
 conda/environments/all_cuda-118_arch-x86_64.yaml        | 2 +-
 conda/environments/all_cuda-125_arch-aarch64.yaml       | 2 +-
 conda/environments/all_cuda-125_arch-x86_64.yaml        | 2 +-
 conda/environments/bench_ann_cuda-118_arch-aarch64.yaml | 2 +-
 conda/environments/bench_ann_cuda-118_arch-x86_64.yaml  | 2 +-
 conda/environments/bench_ann_cuda-120_arch-aarch64.yaml | 2 +-
 conda/environments/bench_ann_cuda-120_arch-x86_64.yaml  | 2 +-
 conda/recipes/libraft/conda_build_config.yaml           | 2 +-
 conda/recipes/raft-ann-bench/conda_build_config.yaml    | 2 +-
 conda/recipes/raft-dask/conda_build_config.yaml         | 3 +++
 conda/recipes/raft-dask/meta.yaml                       | 4 ++--
 dependencies.yaml                                       | 2 +-
 13 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml
index 462874a7e7..de4eb7e690 100644
--- a/conda/environments/all_cuda-118_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-118_arch-aarch64.yaml
@@ -36,7 +36,7 @@ dependencies:
 - libcusparse-dev=11.7.5.86
 - libcusparse=11.7.5.86
 - libucxx==0.40.*,>=0.0.0a0
-- nccl>=2.9.9
+- nccl>=2.18.1.1
 - ninja
 - numba>=0.57
 - numpy>=1.23,<3.0a0
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index cfd974a6a8..26f4c1efaa 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -36,7 +36,7 @@ dependencies:
 - libcusparse-dev=11.7.5.86
 - libcusparse=11.7.5.86
 - libucxx==0.40.*,>=0.0.0a0
-- nccl>=2.9.9
+- nccl>=2.18.1.1
 - ninja
 - numba>=0.57
 - numpy>=1.23,<3.0a0
diff --git a/conda/environments/all_cuda-125_arch-aarch64.yaml b/conda/environments/all_cuda-125_arch-aarch64.yaml
index 82e391e9ae..692956502b 100644
--- a/conda/environments/all_cuda-125_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-125_arch-aarch64.yaml
@@ -33,7 +33,7 @@ dependencies:
 - libcusolver-dev
 - libcusparse-dev
 - libucxx==0.40.*,>=0.0.0a0
-- nccl>=2.9.9
+- nccl>=2.18.1.1
 - ninja
 - numba>=0.57
 - numpy>=1.23,<3.0a0
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index 0389427d13..133d42bfee 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -33,7 +33,7 @@ dependencies:
 - libcusolver-dev
 - libcusparse-dev
 - libucxx==0.40.*,>=0.0.0a0
-- nccl>=2.9.9
+- nccl>=2.18.1.1
 - ninja
 - numba>=0.57
 - numpy>=1.23,<3.0a0
diff --git a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
index eff1c56840..f99cedd627 100644
--- a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
+++ b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
@@ -32,7 +32,7 @@ dependencies:
 - libcusparse=11.7.5.86
 - libucxx==0.40.*,>=0.0.0a0
 - matplotlib
-- nccl>=2.9.9
+- nccl>=2.18.1.1
 - ninja
 - nlohmann_json>=3.11.2
 - nvcc_linux-aarch64=11.8
diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
index 87b19d2952..08aea32ab1 100644
--- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
@@ -32,7 +32,7 @@ dependencies:
 - libcusparse=11.7.5.86
 - libucxx==0.40.*,>=0.0.0a0
 - matplotlib
-- nccl>=2.9.9
+- nccl>=2.18.1.1
 - ninja
 - nlohmann_json>=3.11.2
 - nvcc_linux-64=11.8
diff --git a/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml
index ff3451c15c..572ad85ab5 100644
--- a/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml
+++ b/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml
@@ -29,7 +29,7 @@ dependencies:
 - libcusparse-dev
 - libucxx==0.40.*,>=0.0.0a0
 - matplotlib
-- nccl>=2.9.9
+- nccl>=2.18.1.1
 - ninja
 - nlohmann_json>=3.11.2
 - openblas
diff --git a/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml
index 085e099ae8..9fd23edb03 100644
--- a/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml
@@ -29,7 +29,7 @@ dependencies:
 - libcusparse-dev
 - libucxx==0.40.*,>=0.0.0a0
 - matplotlib
-- nccl>=2.9.9
+- nccl>=2.18.1.1
 - ninja
 - nlohmann_json>=3.11.2
 - openblas
diff --git a/conda/recipes/libraft/conda_build_config.yaml b/conda/recipes/libraft/conda_build_config.yaml
index 5c0047fb9c..6dc06648bd 100644
--- a/conda/recipes/libraft/conda_build_config.yaml
+++ b/conda/recipes/libraft/conda_build_config.yaml
@@ -20,7 +20,7 @@ cmake_version:
   - ">=3.26.4,!=3.30.0"
 
 nccl_version:
-  - ">=2.9.9"
+  - ">=2.18.1.1"
 
 glog_version:
   - ">=0.6.0"
diff --git a/conda/recipes/raft-ann-bench/conda_build_config.yaml b/conda/recipes/raft-ann-bench/conda_build_config.yaml
index db0083b583..bdb4e883ea 100644
--- a/conda/recipes/raft-ann-bench/conda_build_config.yaml
+++ b/conda/recipes/raft-ann-bench/conda_build_config.yaml
@@ -20,7 +20,7 @@ cmake_version:
   - ">=3.26.4,!=3.30.0"
 
 nccl_version:
-  - ">=2.9.9"
+  - ">=2.18.1.1"
 
 glog_version:
   - ">=0.6.0"
diff --git a/conda/recipes/raft-dask/conda_build_config.yaml b/conda/recipes/raft-dask/conda_build_config.yaml
index e6afed2890..58e8ec3c9e 100644
--- a/conda/recipes/raft-dask/conda_build_config.yaml
+++ b/conda/recipes/raft-dask/conda_build_config.yaml
@@ -24,3 +24,6 @@ ucxx_version:
 
 cmake_version:
   - ">=3.26.4,!=3.30.0"
+
+nccl_version:
+  - ">=2.18.1.1"
diff --git a/conda/recipes/raft-dask/meta.yaml b/conda/recipes/raft-dask/meta.yaml
index 74b26b5935..bc13d352b7 100644
--- a/conda/recipes/raft-dask/meta.yaml
+++ b/conda/recipes/raft-dask/meta.yaml
@@ -50,7 +50,7 @@ requirements:
     {% endif %}
     - cuda-version ={{ cuda_version }}
     - cython >=3.0.0
-    - nccl >=2.9.9
+    - nccl {{ nccl_version }}
     - pylibraft {{ version }}
     - python x.x
     - rmm ={{ minor_version }}
@@ -68,7 +68,7 @@ requirements:
     - dask-cuda ={{ minor_version }}
     - rapids-dask-dependency ={{ minor_version }}
     - joblib >=0.11
-    - nccl >=2.9.9
+    - nccl {{ nccl_version }}
     - pylibraft {{ version }}
     - python x.x
     - rmm ={{ minor_version }}
diff --git a/dependencies.yaml b/dependencies.yaml
index 91fb176b56..e833e8519a 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -170,7 +170,7 @@ dependencies:
         packages:
           - c-compiler
           - cxx-compiler
-          - nccl>=2.9.9
+          - nccl>=2.18.1.1
           - libucxx==0.40.*,>=0.0.0a0
     specific:
       - output_types: conda