Skip to content

Commit

Permalink
[Feat] add repeat, sparsity, eval_n_elements APIs to bitset (#…
Browse files Browse the repository at this point in the history
…2439)

- This PR is a part of the feature that applies the prefilter brute-force in Cagra.

Authors:
  - rhdong (https://github.com/rhdong)
  - Corey J. Nolet (https://github.com/cjnolet)

Approvers:
  - Micka (https://github.com/lowener)

URL: #2439
  • Loading branch information
rhdong authored Sep 26, 2024
1 parent 2e705fb commit 0284b42
Show file tree
Hide file tree
Showing 7 changed files with 311 additions and 25 deletions.
7 changes: 3 additions & 4 deletions cpp/bench/prims/util/popc.cu
Original file line number Diff line number Diff line change
Expand Up @@ -89,10 +89,9 @@ struct popc_bench : public fixture {
auto bits_view =
raft::make_device_vector_view<const bits_t, index_t>(bits_d.data_handle(), bits_d.size());

index_t max_len = params.n_rows * params.n_cols;
auto max_len_view = raft::make_host_scalar_view<index_t>(&max_len);
auto nnz_actual_view =
nnz_actual_d.view(); // raft::make_device_scalar_view<index_t>(nnz_actual_d.data_handle());
index_t max_len = params.n_rows * params.n_cols;
auto max_len_view = raft::make_host_scalar_view<const index_t, index_t>(&max_len);
auto nnz_actual_view = nnz_actual_d.view();
raft::popc(this->handle, bits_view, max_len_view, nnz_actual_view);
});
}
Expand Down
107 changes: 106 additions & 1 deletion cpp/include/raft/core/bitset.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
#include <raft/util/device_atomics.cuh>
#include <raft/util/popc.cuh>

#include <rmm/device_scalar.hpp>

#include <thrust/for_each.h>

namespace raft::core {
Expand Down Expand Up @@ -60,6 +62,109 @@ _RAFT_DEVICE void bitset_view<bitset_t, index_t>::set(const index_t sample_index
}
}

template <typename bitset_t, typename index_t>
void bitset_view<bitset_t, index_t>::count(const raft::resources& res,
raft::device_scalar_view<index_t> count_gpu_scalar) const
{
auto max_len = raft::make_host_scalar_view<const index_t, index_t>(&bitset_len_);
auto values = raft::make_device_vector_view<const bitset_t, index_t>(bitset_ptr_, n_elements());
raft::popc(res, values, max_len, count_gpu_scalar);
}

template <typename bitset_t, typename index_t>
RAFT_KERNEL bitset_repeat_kernel(const bitset_t* src,
bitset_t* output,
index_t src_bit_len,
index_t repeat_times)
{
constexpr index_t bits_per_element = sizeof(bitset_t) * 8;
int output_idx = blockIdx.x * blockDim.x + threadIdx.x;

index_t total_bits = src_bit_len * repeat_times;
index_t output_size = (total_bits + bits_per_element - 1) / bits_per_element;
index_t src_size = (src_bit_len + bits_per_element - 1) / bits_per_element;

if (output_idx < output_size) {
bitset_t result = 0;
index_t bit_written = 0;

index_t start_bit = output_idx * bits_per_element;

while (bit_written < bits_per_element && start_bit + bit_written < total_bits) {
index_t bit_idx = (start_bit + bit_written) % src_bit_len;
index_t src_word_idx = bit_idx / bits_per_element;
index_t src_offset = bit_idx % bits_per_element;

index_t remaining_bits = min(bits_per_element - bit_written, src_bit_len - bit_idx);

bitset_t src_value = (src[src_word_idx] >> src_offset);

if (src_offset + remaining_bits > bits_per_element) {
bitset_t next_value = src[(src_word_idx + 1) % src_size];
src_value |= (next_value << (bits_per_element - src_offset));
}
src_value &= ((bitset_t{1} << remaining_bits) - 1);
result |= (src_value << bit_written);
bit_written += remaining_bits;
}
output[output_idx] = result;
}
}

template <typename bitset_t, typename index_t>
void bitset_repeat(raft::resources const& handle,
const bitset_t* d_src,
bitset_t* d_output,
index_t src_bit_len,
index_t repeat_times)
{
if (src_bit_len == 0 || repeat_times == 0) return;
auto stream = resource::get_cuda_stream(handle);

constexpr index_t bits_per_element = sizeof(bitset_t) * 8;
const index_t total_bits = src_bit_len * repeat_times;
const index_t output_size = (total_bits + bits_per_element - 1) / bits_per_element;

int threadsPerBlock = 128;
int blocksPerGrid = (output_size + threadsPerBlock - 1) / threadsPerBlock;
bitset_repeat_kernel<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
d_src, d_output, src_bit_len, repeat_times);

return;
}

template <typename bitset_t, typename index_t>
void bitset_view<bitset_t, index_t>::repeat(const raft::resources& res,
index_t times,
bitset_t* output_device_ptr) const
{
auto thrust_policy = raft::resource::get_thrust_policy(res);
constexpr index_t bits_per_element = sizeof(bitset_t) * 8;

if (bitset_len_ % bits_per_element == 0) {
index_t num_elements_to_copy = bitset_len_ / bits_per_element;

for (index_t i = 0; i < times; ++i) {
raft::copy(output_device_ptr + i * num_elements_to_copy,
bitset_ptr_,
num_elements_to_copy,
raft::resource::get_cuda_stream(res));
}
} else {
bitset_repeat(res, bitset_ptr_, output_device_ptr, bitset_len_, times);
}
}

template <typename bitset_t, typename index_t>
double bitset_view<bitset_t, index_t>::sparsity(const raft::resources& res) const
{
index_t size_h = this->size();
if (0 == size_h) { return static_cast<double>(1.0); }
index_t count_h = this->count(res);

return static_cast<double>((1.0 * (size_h - count_h)) / (1.0 * size_h));
}

template <typename bitset_t, typename index_t>
bitset<bitset_t, index_t>::bitset(const raft::resources& res,
raft::device_vector_view<const index_t, index_t> mask_index,
Expand Down Expand Up @@ -155,7 +260,7 @@ template <typename bitset_t, typename index_t>
void bitset<bitset_t, index_t>::count(const raft::resources& res,
raft::device_scalar_view<index_t> count_gpu_scalar)
{
auto max_len = raft::make_host_scalar_view<index_t>(&bitset_len_);
auto max_len = raft::make_host_scalar_view<const index_t, index_t>(&bitset_len_);
auto values =
raft::make_device_vector_view<const bitset_t, index_t>(bitset_.data(), n_elements());
raft::popc(res, values, max_len, count_gpu_scalar);
Expand Down
76 changes: 76 additions & 0 deletions cpp/include/raft/core/bitset.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
#include <raft/core/resources.hpp>
#include <raft/util/integer_utils.hpp>

#include <cmath>

namespace raft::core {
/**
* @defgroup bitset Bitset
Expand Down Expand Up @@ -103,6 +105,80 @@ struct bitset_view {
{
return raft::make_device_vector_view<const bitset_t, index_t>(bitset_ptr_, n_elements());
}
/**
* @brief Returns the number of bits set to true in count_gpu_scalar.
*
* @param[in] res RAFT resources
* @param[out] count_gpu_scalar Device scalar to store the count
*/
void count(const raft::resources& res, raft::device_scalar_view<index_t> count_gpu_scalar) const;
/**
* @brief Returns the number of bits set to true.
*
* @param res RAFT resources
* @return index_t Number of bits set to true
*/
auto count(const raft::resources& res) const -> index_t
{
auto count_gpu_scalar = raft::make_device_scalar<index_t>(res, 0.0);
count(res, count_gpu_scalar.view());
index_t count_cpu = 0;
raft::update_host(
&count_cpu, count_gpu_scalar.data_handle(), 1, resource::get_cuda_stream(res));
resource::sync_stream(res);
return count_cpu;
}

/**
* @brief Repeats the bitset data and copies it to the output device pointer.
*
* This function takes the original bitset data stored in the device memory
* and repeats it a specified number of times into a new location in the device memory.
* The bits are copied bit-by-bit to ensure that even if the number of bits (bitset_len_)
* is not a multiple of the bitset element size (e.g., 32 for uint32_t), the bits are
* tightly packed without any gaps between rows.
*
* @param res RAFT resources for managing CUDA streams and execution policies.
* @param times Number of times the bitset data should be repeated in the output.
* @param output_device_ptr Device pointer where the repeated bitset data will be stored.
*
* The caller must ensure that the output device pointer has enough memory allocated
* to hold `times * bitset_len` bits, where `bitset_len` is the number of bits in the original
* bitset. This function uses Thrust parallel algorithms to efficiently perform the operation on
* the GPU.
*/
void repeat(const raft::resources& res, index_t times, bitset_t* output_device_ptr) const;

/**
* @brief Calculate the sparsity (fraction of 0s) of the bitset.
*
* This function computes the sparsity of the bitset, defined as the ratio of unset bits (0s)
* to the total number of bits in the set. If the total number of bits is zero, the function
* returns 1.0, indicating the set is fully sparse.
*
* @param res RAFT resources for managing CUDA streams and execution policies.
* @return double The sparsity of the bitset, i.e., the fraction of unset bits.
*
* This API will synchronize on the stream of `res`.
*/
double sparsity(const raft::resources& res) const;

/**
* @brief Calculates the number of `bitset_t` elements required to store a bitset.
*
* This function computes the number of `bitset_t` elements needed to store a bitset, ensuring
* that all bits are accounted for. If the bitset length is not a multiple of the `bitset_t` size
* (in bits), the calculation rounds up to include the remaining bits in an additional `bitset_t`
* element.
*
* @param bitset_len The total length of the bitset in bits.
* @return size_t The number of `bitset_t` elements required to store the bitset.
*/
static inline size_t eval_n_elements(size_t bitset_len)
{
const size_t bits_per_element = sizeof(bitset_t) * 8;
return (bitset_len + bits_per_element - 1) / bits_per_element;
}

private:
bitset_t* bitset_ptr_;
Expand Down
6 changes: 3 additions & 3 deletions cpp/include/raft/util/detail/popc.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,12 @@ namespace raft::detail {
*/
template <typename value_t, typename index_t>
void popc(const raft::resources& res,
device_vector_view<value_t, index_t> values,
raft::host_scalar_view<index_t> max_len,
device_vector_view<const value_t, index_t> values,
raft::host_scalar_view<const index_t, index_t> max_len,
raft::device_scalar_view<index_t> counter)
{
auto values_size = values.size();
auto values_matrix = raft::make_device_matrix_view<value_t, index_t, col_major>(
auto values_matrix = raft::make_device_matrix_view<const value_t, index_t, col_major>(
values.data_handle(), values_size, 1);
auto counter_vector = raft::make_device_vector_view<index_t, index_t>(counter.data_handle(), 1);

Expand Down
4 changes: 2 additions & 2 deletions cpp/include/raft/util/popc.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ namespace raft {
*/
template <typename value_t, typename index_t>
void popc(const raft::resources& res,
device_vector_view<value_t, index_t> values,
raft::host_scalar_view<index_t> max_len,
device_vector_view<const value_t, index_t> values,
raft::host_scalar_view<const index_t, index_t> max_len,
raft::device_scalar_view<index_t> counter)
{
detail::popc(res, values, max_len, counter);
Expand Down
Loading

0 comments on commit 0284b42

Please sign in to comment.