diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 5bb98511cf..032b88b4aa 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -25,8 +25,8 @@ NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR} NEXT_UCXX_SHORT_TAG="$(curl -sL https://version.gpuci.io/rapids/${NEXT_SHORT_TAG})" # Need to distutils-normalize the original version -NEXT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_SHORT_TAG}'))") -NEXT_UCXX_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_UCXX_SHORT_TAG}'))") +NEXT_SHORT_TAG_PEP440=$(python -c "from packaging.version import Version; print(Version('${NEXT_SHORT_TAG}'))") +NEXT_UCXX_SHORT_TAG_PEP440=$(python -c "from packaging.version import Version; print(Version('${NEXT_UCXX_SHORT_TAG}'))") echo "Preparing release $CURRENT_TAG => $NEXT_FULL_TAG" diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml index 462874a7e7..de4eb7e690 100644 --- a/conda/environments/all_cuda-118_arch-aarch64.yaml +++ b/conda/environments/all_cuda-118_arch-aarch64.yaml @@ -36,7 +36,7 @@ dependencies: - libcusparse-dev=11.7.5.86 - libcusparse=11.7.5.86 - libucxx==0.40.*,>=0.0.0a0 -- nccl>=2.9.9 +- nccl>=2.18.1.1 - ninja - numba>=0.57 - numpy>=1.23,<3.0a0 diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index cfd974a6a8..26f4c1efaa 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -36,7 +36,7 @@ dependencies: - libcusparse-dev=11.7.5.86 - libcusparse=11.7.5.86 - libucxx==0.40.*,>=0.0.0a0 -- nccl>=2.9.9 +- nccl>=2.18.1.1 - ninja - numba>=0.57 - numpy>=1.23,<3.0a0 diff --git a/conda/environments/all_cuda-125_arch-aarch64.yaml b/conda/environments/all_cuda-125_arch-aarch64.yaml index 82e391e9ae..692956502b 100644 --- a/conda/environments/all_cuda-125_arch-aarch64.yaml +++ b/conda/environments/all_cuda-125_arch-aarch64.yaml @@ -33,7 +33,7 @@ dependencies: - libcusolver-dev - libcusparse-dev - libucxx==0.40.*,>=0.0.0a0 -- nccl>=2.9.9 +- nccl>=2.18.1.1 - ninja - numba>=0.57 - numpy>=1.23,<3.0a0 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 0389427d13..133d42bfee 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -33,7 +33,7 @@ dependencies: - libcusolver-dev - libcusparse-dev - libucxx==0.40.*,>=0.0.0a0 -- nccl>=2.9.9 +- nccl>=2.18.1.1 - ninja - numba>=0.57 - numpy>=1.23,<3.0a0 diff --git a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml index eff1c56840..f99cedd627 100644 --- a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml @@ -32,7 +32,7 @@ dependencies: - libcusparse=11.7.5.86 - libucxx==0.40.*,>=0.0.0a0 - matplotlib -- nccl>=2.9.9 +- nccl>=2.18.1.1 - ninja - nlohmann_json>=3.11.2 - nvcc_linux-aarch64=11.8 diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml index 87b19d2952..08aea32ab1 100644 --- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml @@ -32,7 +32,7 @@ dependencies: - libcusparse=11.7.5.86 - libucxx==0.40.*,>=0.0.0a0 - matplotlib -- nccl>=2.9.9 +- nccl>=2.18.1.1 - ninja - nlohmann_json>=3.11.2 - nvcc_linux-64=11.8 diff --git a/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml index ff3451c15c..572ad85ab5 100644 --- a/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml +++ b/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml @@ -29,7 +29,7 @@ dependencies: - libcusparse-dev - libucxx==0.40.*,>=0.0.0a0 - matplotlib -- nccl>=2.9.9 +- nccl>=2.18.1.1 - ninja - nlohmann_json>=3.11.2 - openblas diff --git a/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml index 085e099ae8..9fd23edb03 100644 --- a/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml @@ -29,7 +29,7 @@ dependencies: - libcusparse-dev - libucxx==0.40.*,>=0.0.0a0 - matplotlib -- nccl>=2.9.9 +- nccl>=2.18.1.1 - ninja - nlohmann_json>=3.11.2 - openblas diff --git a/conda/recipes/libraft/conda_build_config.yaml b/conda/recipes/libraft/conda_build_config.yaml index 5c0047fb9c..6dc06648bd 100644 --- a/conda/recipes/libraft/conda_build_config.yaml +++ b/conda/recipes/libraft/conda_build_config.yaml @@ -20,7 +20,7 @@ cmake_version: - ">=3.26.4,!=3.30.0" nccl_version: - - ">=2.9.9" + - ">=2.18.1.1" glog_version: - ">=0.6.0" diff --git a/conda/recipes/raft-ann-bench/conda_build_config.yaml b/conda/recipes/raft-ann-bench/conda_build_config.yaml index db0083b583..bdb4e883ea 100644 --- a/conda/recipes/raft-ann-bench/conda_build_config.yaml +++ b/conda/recipes/raft-ann-bench/conda_build_config.yaml @@ -20,7 +20,7 @@ cmake_version: - ">=3.26.4,!=3.30.0" nccl_version: - - ">=2.9.9" + - ">=2.18.1.1" glog_version: - ">=0.6.0" diff --git a/conda/recipes/raft-dask/conda_build_config.yaml b/conda/recipes/raft-dask/conda_build_config.yaml index e6afed2890..58e8ec3c9e 100644 --- a/conda/recipes/raft-dask/conda_build_config.yaml +++ b/conda/recipes/raft-dask/conda_build_config.yaml @@ -24,3 +24,6 @@ ucxx_version: cmake_version: - ">=3.26.4,!=3.30.0" + +nccl_version: + - ">=2.18.1.1" diff --git a/conda/recipes/raft-dask/meta.yaml b/conda/recipes/raft-dask/meta.yaml index 74b26b5935..bc13d352b7 100644 --- a/conda/recipes/raft-dask/meta.yaml +++ b/conda/recipes/raft-dask/meta.yaml @@ -50,7 +50,7 @@ requirements: {% endif %} - cuda-version ={{ cuda_version }} - cython >=3.0.0 - - nccl >=2.9.9 + - nccl {{ nccl_version }} - pylibraft {{ version }} - python x.x - rmm ={{ minor_version }} @@ -68,7 +68,7 @@ requirements: - dask-cuda ={{ minor_version }} - rapids-dask-dependency ={{ minor_version }} - joblib >=0.11 - - nccl >=2.9.9 + - nccl {{ nccl_version }} - pylibraft {{ version }} - python x.x - rmm ={{ minor_version }} diff --git a/cpp/bench/prims/util/popc.cu b/cpp/bench/prims/util/popc.cu index 249dc13d1e..c6249fb2bd 100644 --- a/cpp/bench/prims/util/popc.cu +++ b/cpp/bench/prims/util/popc.cu @@ -89,10 +89,9 @@ struct popc_bench : public fixture { auto bits_view = raft::make_device_vector_view(bits_d.data_handle(), bits_d.size()); - index_t max_len = params.n_rows * params.n_cols; - auto max_len_view = raft::make_host_scalar_view(&max_len); - auto nnz_actual_view = - nnz_actual_d.view(); // raft::make_device_scalar_view(nnz_actual_d.data_handle()); + index_t max_len = params.n_rows * params.n_cols; + auto max_len_view = raft::make_host_scalar_view(&max_len); + auto nnz_actual_view = nnz_actual_d.view(); raft::popc(this->handle, bits_view, max_len_view, nnz_actual_view); }); } diff --git a/cpp/include/raft/core/bitset.cuh b/cpp/include/raft/core/bitset.cuh index b6e6128eca..d1bffdb81e 100644 --- a/cpp/include/raft/core/bitset.cuh +++ b/cpp/include/raft/core/bitset.cuh @@ -26,6 +26,8 @@ #include #include +#include + #include namespace raft::core { @@ -60,6 +62,109 @@ _RAFT_DEVICE void bitset_view::set(const index_t sample_index } } +template +void bitset_view::count(const raft::resources& res, + raft::device_scalar_view count_gpu_scalar) const +{ + auto max_len = raft::make_host_scalar_view(&bitset_len_); + auto values = raft::make_device_vector_view(bitset_ptr_, n_elements()); + raft::popc(res, values, max_len, count_gpu_scalar); +} + +template +RAFT_KERNEL bitset_repeat_kernel(const bitset_t* src, + bitset_t* output, + index_t src_bit_len, + index_t repeat_times) +{ + constexpr index_t bits_per_element = sizeof(bitset_t) * 8; + int output_idx = blockIdx.x * blockDim.x + threadIdx.x; + + index_t total_bits = src_bit_len * repeat_times; + index_t output_size = (total_bits + bits_per_element - 1) / bits_per_element; + index_t src_size = (src_bit_len + bits_per_element - 1) / bits_per_element; + + if (output_idx < output_size) { + bitset_t result = 0; + index_t bit_written = 0; + + index_t start_bit = output_idx * bits_per_element; + + while (bit_written < bits_per_element && start_bit + bit_written < total_bits) { + index_t bit_idx = (start_bit + bit_written) % src_bit_len; + index_t src_word_idx = bit_idx / bits_per_element; + index_t src_offset = bit_idx % bits_per_element; + + index_t remaining_bits = min(bits_per_element - bit_written, src_bit_len - bit_idx); + + bitset_t src_value = (src[src_word_idx] >> src_offset); + + if (src_offset + remaining_bits > bits_per_element) { + bitset_t next_value = src[(src_word_idx + 1) % src_size]; + src_value |= (next_value << (bits_per_element - src_offset)); + } + src_value &= ((bitset_t{1} << remaining_bits) - 1); + result |= (src_value << bit_written); + bit_written += remaining_bits; + } + output[output_idx] = result; + } +} + +template +void bitset_repeat(raft::resources const& handle, + const bitset_t* d_src, + bitset_t* d_output, + index_t src_bit_len, + index_t repeat_times) +{ + if (src_bit_len == 0 || repeat_times == 0) return; + auto stream = resource::get_cuda_stream(handle); + + constexpr index_t bits_per_element = sizeof(bitset_t) * 8; + const index_t total_bits = src_bit_len * repeat_times; + const index_t output_size = (total_bits + bits_per_element - 1) / bits_per_element; + + int threadsPerBlock = 128; + int blocksPerGrid = (output_size + threadsPerBlock - 1) / threadsPerBlock; + bitset_repeat_kernel<<>>( + d_src, d_output, src_bit_len, repeat_times); + + return; +} + +template +void bitset_view::repeat(const raft::resources& res, + index_t times, + bitset_t* output_device_ptr) const +{ + auto thrust_policy = raft::resource::get_thrust_policy(res); + constexpr index_t bits_per_element = sizeof(bitset_t) * 8; + + if (bitset_len_ % bits_per_element == 0) { + index_t num_elements_to_copy = bitset_len_ / bits_per_element; + + for (index_t i = 0; i < times; ++i) { + raft::copy(output_device_ptr + i * num_elements_to_copy, + bitset_ptr_, + num_elements_to_copy, + raft::resource::get_cuda_stream(res)); + } + } else { + bitset_repeat(res, bitset_ptr_, output_device_ptr, bitset_len_, times); + } +} + +template +double bitset_view::sparsity(const raft::resources& res) const +{ + index_t size_h = this->size(); + if (0 == size_h) { return static_cast(1.0); } + index_t count_h = this->count(res); + + return static_cast((1.0 * (size_h - count_h)) / (1.0 * size_h)); +} + template bitset::bitset(const raft::resources& res, raft::device_vector_view mask_index, @@ -155,7 +260,7 @@ template void bitset::count(const raft::resources& res, raft::device_scalar_view count_gpu_scalar) { - auto max_len = raft::make_host_scalar_view(&bitset_len_); + auto max_len = raft::make_host_scalar_view(&bitset_len_); auto values = raft::make_device_vector_view(bitset_.data(), n_elements()); raft::popc(res, values, max_len, count_gpu_scalar); diff --git a/cpp/include/raft/core/bitset.hpp b/cpp/include/raft/core/bitset.hpp index 3608ee43fa..be828def87 100644 --- a/cpp/include/raft/core/bitset.hpp +++ b/cpp/include/raft/core/bitset.hpp @@ -22,6 +22,8 @@ #include #include +#include + namespace raft::core { /** * @defgroup bitset Bitset @@ -103,6 +105,80 @@ struct bitset_view { { return raft::make_device_vector_view(bitset_ptr_, n_elements()); } + /** + * @brief Returns the number of bits set to true in count_gpu_scalar. + * + * @param[in] res RAFT resources + * @param[out] count_gpu_scalar Device scalar to store the count + */ + void count(const raft::resources& res, raft::device_scalar_view count_gpu_scalar) const; + /** + * @brief Returns the number of bits set to true. + * + * @param res RAFT resources + * @return index_t Number of bits set to true + */ + auto count(const raft::resources& res) const -> index_t + { + auto count_gpu_scalar = raft::make_device_scalar(res, 0.0); + count(res, count_gpu_scalar.view()); + index_t count_cpu = 0; + raft::update_host( + &count_cpu, count_gpu_scalar.data_handle(), 1, resource::get_cuda_stream(res)); + resource::sync_stream(res); + return count_cpu; + } + + /** + * @brief Repeats the bitset data and copies it to the output device pointer. + * + * This function takes the original bitset data stored in the device memory + * and repeats it a specified number of times into a new location in the device memory. + * The bits are copied bit-by-bit to ensure that even if the number of bits (bitset_len_) + * is not a multiple of the bitset element size (e.g., 32 for uint32_t), the bits are + * tightly packed without any gaps between rows. + * + * @param res RAFT resources for managing CUDA streams and execution policies. + * @param times Number of times the bitset data should be repeated in the output. + * @param output_device_ptr Device pointer where the repeated bitset data will be stored. + * + * The caller must ensure that the output device pointer has enough memory allocated + * to hold `times * bitset_len` bits, where `bitset_len` is the number of bits in the original + * bitset. This function uses Thrust parallel algorithms to efficiently perform the operation on + * the GPU. + */ + void repeat(const raft::resources& res, index_t times, bitset_t* output_device_ptr) const; + + /** + * @brief Calculate the sparsity (fraction of 0s) of the bitset. + * + * This function computes the sparsity of the bitset, defined as the ratio of unset bits (0s) + * to the total number of bits in the set. If the total number of bits is zero, the function + * returns 1.0, indicating the set is fully sparse. + * + * @param res RAFT resources for managing CUDA streams and execution policies. + * @return double The sparsity of the bitset, i.e., the fraction of unset bits. + * + * This API will synchronize on the stream of `res`. + */ + double sparsity(const raft::resources& res) const; + + /** + * @brief Calculates the number of `bitset_t` elements required to store a bitset. + * + * This function computes the number of `bitset_t` elements needed to store a bitset, ensuring + * that all bits are accounted for. If the bitset length is not a multiple of the `bitset_t` size + * (in bits), the calculation rounds up to include the remaining bits in an additional `bitset_t` + * element. + * + * @param bitset_len The total length of the bitset in bits. + * @return size_t The number of `bitset_t` elements required to store the bitset. + */ + static inline size_t eval_n_elements(size_t bitset_len) + { + const size_t bits_per_element = sizeof(bitset_t) * 8; + return (bitset_len + bits_per_element - 1) / bits_per_element; + } private: bitset_t* bitset_ptr_; diff --git a/cpp/include/raft/util/detail/popc.cuh b/cpp/include/raft/util/detail/popc.cuh index 20b4814216..f335be6fd0 100644 --- a/cpp/include/raft/util/detail/popc.cuh +++ b/cpp/include/raft/util/detail/popc.cuh @@ -36,12 +36,12 @@ namespace raft::detail { */ template void popc(const raft::resources& res, - device_vector_view values, - raft::host_scalar_view max_len, + device_vector_view values, + raft::host_scalar_view max_len, raft::device_scalar_view counter) { auto values_size = values.size(); - auto values_matrix = raft::make_device_matrix_view( + auto values_matrix = raft::make_device_matrix_view( values.data_handle(), values_size, 1); auto counter_vector = raft::make_device_vector_view(counter.data_handle(), 1); diff --git a/cpp/include/raft/util/popc.cuh b/cpp/include/raft/util/popc.cuh index 153694e45e..d4bc01e274 100644 --- a/cpp/include/raft/util/popc.cuh +++ b/cpp/include/raft/util/popc.cuh @@ -31,8 +31,8 @@ namespace raft { */ template void popc(const raft::resources& res, - device_vector_view values, - raft::host_scalar_view max_len, + device_vector_view values, + raft::host_scalar_view max_len, raft::device_scalar_view counter) { detail::popc(res, values, max_len, counter); diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index a497e6d3ba..5d504d2100 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -440,7 +440,9 @@ if(BUILD_TESTS) neighbors/ann_nn_descent/test_float_uint32_t.cu neighbors/ann_nn_descent/test_int8_t_uint32_t.cu neighbors/ann_nn_descent/test_uint8_t_uint32_t.cu - neighbors/ann_nn_descent/test_batch_float_uint32_t.cu + # TODO: Investigate why this test is failing Reference issue + # https://github.com/rapidsai/raft/issues/2450 + # neighbors/ann_nn_descent/test_batch_float_uint32_t.cu LIB EXPLICIT_INSTANTIATE_ONLY GPUS diff --git a/cpp/test/core/bitset.cu b/cpp/test/core/bitset.cu index b799297e8c..ac601274c1 100644 --- a/cpp/test/core/bitset.cu +++ b/cpp/test/core/bitset.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -32,12 +32,13 @@ struct test_spec_bitset { uint64_t bitset_len; uint64_t mask_len; uint64_t query_len; + uint64_t repeat_times; }; auto operator<<(std::ostream& os, const test_spec_bitset& ss) -> std::ostream& { os << "bitset{bitset_len: " << ss.bitset_len << ", mask_len: " << ss.mask_len - << ", query_len: " << ss.query_len << "}"; + << ", query_len: " << ss.query_len << ", repeat_times: " << ss.repeat_times << "}"; return os; } @@ -80,6 +81,48 @@ void flip_cpu_bitset(std::vector& bitset) } } +template +void repeat_cpu_bitset(std::vector& input, + size_t input_bits, + size_t repeat, + std::vector& output) +{ + const size_t output_bits = input_bits * repeat; + const size_t output_units = (output_bits + sizeof(bitset_t) * 8 - 1) / (sizeof(bitset_t) * 8); + + std::memset(output.data(), 0, output_units * sizeof(bitset_t)); + + size_t output_bit_index = 0; + + for (size_t r = 0; r < repeat; ++r) { + for (size_t i = 0; i < input_bits; ++i) { + size_t input_unit_index = i / (sizeof(bitset_t) * 8); + size_t input_bit_offset = i % (sizeof(bitset_t) * 8); + bool bit = (input[input_unit_index] >> input_bit_offset) & 1; + + size_t output_unit_index = output_bit_index / (sizeof(bitset_t) * 8); + size_t output_bit_offset = output_bit_index % (sizeof(bitset_t) * 8); + + output[output_unit_index] |= (static_cast(bit) << output_bit_offset); + + ++output_bit_index; + } + } +} + +template +double sparsity_cpu_bitset(std::vector& data, size_t total_bits) +{ + size_t one_count = 0; + for (size_t i = 0; i < total_bits; ++i) { + size_t unit_index = i / (sizeof(bitset_t) * 8); + size_t bit_offset = i % (sizeof(bitset_t) * 8); + bool bit = (data[unit_index] >> bit_offset) & 1; + if (bit == 1) { ++one_count; } + } + return static_cast((total_bits - one_count) / (1.0 * total_bits)); +} + template class BitsetTest : public testing::TestWithParam { protected: @@ -87,13 +130,19 @@ class BitsetTest : public testing::TestWithParam { const test_spec_bitset spec; std::vector bitset_result; std::vector bitset_ref; + std::vector bitset_repeat_ref; + std::vector bitset_repeat_result; raft::resources res; public: explicit BitsetTest() : spec(testing::TestWithParam::GetParam()), bitset_result(raft::ceildiv(spec.bitset_len, uint64_t(bitset_element_size))), - bitset_ref(raft::ceildiv(spec.bitset_len, uint64_t(bitset_element_size))) + bitset_ref(raft::ceildiv(spec.bitset_len, uint64_t(bitset_element_size))), + bitset_repeat_ref( + raft::ceildiv(spec.bitset_len * spec.repeat_times, uint64_t(bitset_element_size))), + bitset_repeat_result( + raft::ceildiv(spec.bitset_len * spec.repeat_times, uint64_t(bitset_element_size))) { } @@ -145,6 +194,50 @@ class BitsetTest : public testing::TestWithParam { resource::sync_stream(res, stream); ASSERT_TRUE(hostVecMatch(bitset_ref, bitset_result, raft::Compare())); + // test sparsity, repeat and eval_n_elements + { + auto my_bitset_view = my_bitset.view(); + auto sparsity_result = my_bitset_view.sparsity(res); + auto sparsity_ref = sparsity_cpu_bitset(bitset_ref, size_t(spec.bitset_len)); + ASSERT_EQ(sparsity_result, sparsity_ref); + + auto eval_n_elements = + bitset_view::eval_n_elements(spec.bitset_len * spec.repeat_times); + ASSERT_EQ(bitset_repeat_ref.size(), eval_n_elements); + + auto repeat_device = raft::make_device_vector(res, eval_n_elements); + RAFT_CUDA_TRY(cudaMemsetAsync( + repeat_device.data_handle(), 0, eval_n_elements * sizeof(bitset_t), stream)); + repeat_cpu_bitset( + bitset_ref, size_t(spec.bitset_len), size_t(spec.repeat_times), bitset_repeat_ref); + + my_bitset_view.repeat(res, index_t(spec.repeat_times), repeat_device.data_handle()); + + ASSERT_EQ(bitset_repeat_ref.size(), repeat_device.size()); + update_host( + bitset_repeat_result.data(), repeat_device.data_handle(), repeat_device.size(), stream); + ASSERT_EQ(bitset_repeat_ref.size(), bitset_repeat_result.size()); + + index_t errors = 0; + static constexpr index_t len_per_item = sizeof(bitset_t) * 8; + bitset_t tail_len = (index_t(spec.bitset_len * spec.repeat_times) % len_per_item); + bitset_t tail_mask = + tail_len ? (bitset_t)((bitset_t{1} << tail_len) - bitset_t{1}) : ~bitset_t{0}; + for (index_t i = 0; i < bitset_repeat_ref.size(); i++) { + if (i == bitset_repeat_ref.size() - 1) { + errors += (bitset_repeat_ref[i] & tail_mask) != (bitset_repeat_result[i] & tail_mask); + } else { + errors += (bitset_repeat_ref[i] != bitset_repeat_result[i]); + } + } + ASSERT_EQ(errors, 0); + + // recheck the sparsity after repeat + sparsity_result = + sparsity_cpu_bitset(bitset_repeat_result, size_t(spec.bitset_len * spec.repeat_times)); + ASSERT_EQ(sparsity_result, sparsity_ref); + } + // Flip the bitset and re-test auto bitset_count = my_bitset.count(res); my_bitset.flip(res); @@ -167,13 +260,14 @@ class BitsetTest : public testing::TestWithParam { } }; -auto inputs_bitset = ::testing::Values(test_spec_bitset{32, 5, 10}, - test_spec_bitset{100, 30, 10}, - test_spec_bitset{1024, 55, 100}, - test_spec_bitset{10000, 1000, 1000}, - test_spec_bitset{1 << 15, 1 << 3, 1 << 12}, - test_spec_bitset{1 << 15, 1 << 24, 1 << 13}, - test_spec_bitset{1 << 25, 1 << 23, 1 << 14}); +auto inputs_bitset = ::testing::Values(test_spec_bitset{32, 5, 10, 101}, + test_spec_bitset{100, 30, 10, 13}, + test_spec_bitset{1024, 55, 100, 1}, + test_spec_bitset{10000, 1000, 1000, 100}, + test_spec_bitset{1 << 15, 1 << 3, 1 << 12, 5}, + test_spec_bitset{1 << 15, 1 << 24, 1 << 13, 3}, + test_spec_bitset{1 << 25, 1 << 23, 1 << 14, 3}, + test_spec_bitset{1 << 25, 1 << 23, 1 << 14, 21}); using Uint16_32 = BitsetTest; TEST_P(Uint16_32, Run) { run(); } diff --git a/cpp/test/neighbors/ann_nn_descent.cuh b/cpp/test/neighbors/ann_nn_descent.cuh index 2f9d4e252b..5070d83b15 100644 --- a/cpp/test/neighbors/ann_nn_descent.cuh +++ b/cpp/test/neighbors/ann_nn_descent.cuh @@ -318,13 +318,15 @@ const std::vector inputs = raft::util::itertools::product inputsBatch = - raft::util::itertools::product( - {std::make_pair(0.9, 3lu), std::make_pair(0.9, 2lu)}, // min_recall, n_clusters - {4000, 5000}, // n_rows - {192, 512}, // dim - {32, 64}, // graph_degree - {raft::distance::DistanceType::L2Expanded}, - {false, true}); +// TODO: Investigate why this test is failing +// Reference issue https://github.com/rapidsai/raft/issues/2450 +// const std::vector inputsBatch = +// raft::util::itertools::product( +// {std::make_pair(0.9, 3lu), std::make_pair(0.9, 2lu)}, // min_recall, n_clusters +// {4000, 5000}, // n_rows +// {192, 512}, // dim +// {32, 64}, // graph_degree +// {raft::distance::DistanceType::L2Expanded}, +// {false, true}); } // namespace raft::neighbors::experimental::nn_descent diff --git a/cpp/test/util/popc.cu b/cpp/test/util/popc.cu index c08faacb07..28eaad2fcb 100644 --- a/cpp/test/util/popc.cu +++ b/cpp/test/util/popc.cu @@ -76,7 +76,7 @@ class PopcTest : public ::testing::TestWithParam> { index_t bit_position = index % (8 * sizeof(bits_t)); if (((element >> bit_position) & 1) == 0) { - element |= (static_cast(1) << bit_position); + element |= (static_cast(1) << bit_position); num_ones--; } } @@ -101,7 +101,7 @@ class PopcTest : public ::testing::TestWithParam> { raft::make_device_vector_view(bits_d.data(), bits_d.size()); index_t max_len = params.n_rows * params.n_cols; - auto max_len_view = raft::make_host_scalar_view(&max_len); + auto max_len_view = raft::make_host_scalar_view(&max_len); index_t nnz_actual_h = 0; rmm::device_scalar nnz_actual_d(0, stream); @@ -123,8 +123,17 @@ class PopcTest : public ::testing::TestWithParam> { index_t nnz_expected; }; -using PopcTestI32 = PopcTest; -TEST_P(PopcTestI32, Result) { Run(); } +using PopcTestI32_U32 = PopcTest; +TEST_P(PopcTestI32_U32, Result) { Run(); } + +using PopcTestI32_U64 = PopcTest; +TEST_P(PopcTestI32_U64, Result) { Run(); } + +using PopcTestI32_U16 = PopcTest; +TEST_P(PopcTestI32_U16, Result) { Run(); } + +using PopcTestI32_U8 = PopcTest; +TEST_P(PopcTestI32_U8, Result) { Run(); } template const std::vector> popc_inputs = { @@ -154,6 +163,9 @@ const std::vector> popc_inputs = { {2, 33, 0.2}, }; -INSTANTIATE_TEST_CASE_P(PopcTest, PopcTestI32, ::testing::ValuesIn(popc_inputs)); +INSTANTIATE_TEST_CASE_P(PopcTest, PopcTestI32_U32, ::testing::ValuesIn(popc_inputs)); +INSTANTIATE_TEST_CASE_P(PopcTest, PopcTestI32_U64, ::testing::ValuesIn(popc_inputs)); +INSTANTIATE_TEST_CASE_P(PopcTest, PopcTestI32_U16, ::testing::ValuesIn(popc_inputs)); +INSTANTIATE_TEST_CASE_P(PopcTest, PopcTestI32_U8, ::testing::ValuesIn(popc_inputs)); } // namespace raft diff --git a/dependencies.yaml b/dependencies.yaml index 8f5c69245f..e833e8519a 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -170,7 +170,7 @@ dependencies: packages: - c-compiler - cxx-compiler - - nccl>=2.9.9 + - nccl>=2.18.1.1 - libucxx==0.40.*,>=0.0.0a0 specific: - output_types: conda @@ -499,10 +499,14 @@ dependencies: - *cuda_python - output_types: [requirements, pyproject] matrices: - - matrix: {cuda: "12.*"} + - matrix: + cuda: "12.*" + cuda_suffixed: "true" packages: - *rmm_cu12 - - matrix: {cuda: "11.*"} + - matrix: + cuda: "11.*" + cuda_suffixed: "true" packages: - *rmm_cu11 - {matrix: null, packages: [*rmm_unsuffixed]} diff --git a/python/pylibraft/pylibraft/test/pytest.ini b/python/pylibraft/pylibraft/test/pytest.ini new file mode 100644 index 0000000000..bf70c06f84 --- /dev/null +++ b/python/pylibraft/pylibraft/test/pytest.ini @@ -0,0 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +[pytest] +addopts = --tb=native + diff --git a/python/raft-dask/pytest.ini b/python/raft-dask/pytest.ini index 2467e2089a..e09c2b173d 100644 --- a/python/raft-dask/pytest.ini +++ b/python/raft-dask/pytest.ini @@ -10,3 +10,4 @@ markers = nccl: marks a test as using NCCL ucx: marks a test as using UCX-Py ucxx: marks a test as using UCXX +addopts = --tb=native diff --git a/python/raft-dask/raft_dask/test/pytest.ini b/python/raft-dask/raft_dask/test/pytest.ini new file mode 100644 index 0000000000..bf70c06f84 --- /dev/null +++ b/python/raft-dask/raft_dask/test/pytest.ini @@ -0,0 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +[pytest] +addopts = --tb=native +