Scaling workspace resources #181

Merged
59 changes: 41 additions & 18 deletions cpp/src/neighbors/detail/cagra/cagra_build.cuh
@@ -34,6 +34,7 @@

// TODO: Fixme- this needs to be migrated
#include "../../ivf_pq/ivf_pq_build.cuh"
#include "../../ivf_pq/ivf_pq_search.cuh"
#include "../../nn_descent.cuh"

// TODO: This shouldn't be calling spatial/knn APIs
@@ -162,42 +163,64 @@ void build_knn_graph(
// search top (k + 1) neighbors
//

const auto top_k = node_degree + 1;
uint32_t gpu_top_k = node_degree * pq.refinement_rate;
gpu_top_k = std::min<IdxT>(std::max(gpu_top_k, top_k), dataset.extent(0));
const auto num_queries = dataset.extent(0);
const auto max_batch_size = 1024;
const auto top_k = node_degree + 1;
uint32_t gpu_top_k = node_degree * pq.refinement_rate;
gpu_top_k = std::min<IdxT>(std::max(gpu_top_k, top_k), dataset.extent(0));
const auto num_queries = dataset.extent(0);

// Use the same maximum batch size as the ivf_pq::search to avoid allocating more than needed.
using cuvs::neighbors::ivf_pq::detail::kMaxQueries;
// Heuristic: the build_knn_graph code should use only a fraction of the workspace memory; the
// rest should be used by the ivf_pq::search. Here we say that the workspace size should be a good
// multiple of what is required for the I/O batching below.
constexpr size_t kMinWorkspaceRatio = 5;
auto desired_workspace_size = kMaxQueries * kMinWorkspaceRatio *
(sizeof(DataT) * dataset.extent(1) // queries (dataset batch)
+ sizeof(float) * gpu_top_k // distances
+ sizeof(int64_t) * gpu_top_k // neighbors
+ sizeof(float) * top_k // refined_distances
+ sizeof(int64_t) * top_k // refined_neighbors
);

// If the workspace is smaller than desired, put the I/O buffers into the large workspace.
rmm::device_async_resource_ref workspace_mr =
desired_workspace_size <= raft::resource::get_workspace_free_bytes(res)
? raft::resource::get_workspace_resource(res)
: raft::resource::get_large_workspace_resource(res);

RAFT_LOG_DEBUG(
"IVF-PQ search node_degree: %d, top_k: %d, gpu_top_k: %d, max_batch_size:: %d, n_probes: %u",
node_degree,
top_k,
gpu_top_k,
max_batch_size,
kMaxQueries,
pq.search_params.n_probes);

auto distances = raft::make_device_matrix<float, int64_t>(res, max_batch_size, gpu_top_k);
auto neighbors = raft::make_device_matrix<int64_t, int64_t>(res, max_batch_size, gpu_top_k);
auto refined_distances = raft::make_device_matrix<float, int64_t>(res, max_batch_size, top_k);
auto refined_neighbors = raft::make_device_matrix<int64_t, int64_t>(res, max_batch_size, top_k);
auto neighbors_host = raft::make_host_matrix<int64_t, int64_t>(max_batch_size, gpu_top_k);
auto queries_host = raft::make_host_matrix<DataT, int64_t>(max_batch_size, dataset.extent(1));
auto refined_neighbors_host = raft::make_host_matrix<int64_t, int64_t>(max_batch_size, top_k);
auto refined_distances_host = raft::make_host_matrix<float, int64_t>(max_batch_size, top_k);
auto distances = raft::make_device_mdarray<float>(
res, workspace_mr, raft::make_extents<int64_t>(kMaxQueries, gpu_top_k));
auto neighbors = raft::make_device_mdarray<int64_t>(
res, workspace_mr, raft::make_extents<int64_t>(kMaxQueries, gpu_top_k));
auto refined_distances = raft::make_device_mdarray<float>(
res, workspace_mr, raft::make_extents<int64_t>(kMaxQueries, top_k));
auto refined_neighbors = raft::make_device_mdarray<int64_t>(
res, workspace_mr, raft::make_extents<int64_t>(kMaxQueries, top_k));
auto neighbors_host = raft::make_host_matrix<int64_t, int64_t>(kMaxQueries, gpu_top_k);
auto queries_host = raft::make_host_matrix<DataT, int64_t>(kMaxQueries, dataset.extent(1));
auto refined_neighbors_host = raft::make_host_matrix<int64_t, int64_t>(kMaxQueries, top_k);
auto refined_distances_host = raft::make_host_matrix<float, int64_t>(kMaxQueries, top_k);

// TODO(tfeher): batched search with multiple GPUs
std::size_t num_self_included = 0;
bool first = true;
const auto start_clock = std::chrono::system_clock::now();

rmm::device_async_resource_ref device_memory = raft::resource::get_workspace_resource(res);

cuvs::spatial::knn::detail::utils::batch_load_iterator<DataT> vec_batches(
dataset.data_handle(),
dataset.extent(0),
dataset.extent(1),
(int64_t)max_batch_size,
static_cast<int64_t>(kMaxQueries),
raft::resource::get_cuda_stream(res),
device_memory);
workspace_mr);

size_t next_report_offset = 0;
size_t d_report_offset = dataset.extent(0) / 100; // Report progress in 1% steps.
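
The hunk above replaces the fixed max_batch_size with the ivf_pq search batch size and adds a heuristic for where the batched I/O buffers should live. Below is a minimal, self-contained sketch of that pattern; the function name, the concrete kMaxQueries value (assumed 4096 here), and the local constants are illustrative, not the exact cuVS code.

#include <cstdint>
#include <raft/core/device_mdarray.hpp>
#include <raft/core/resource/device_memory_resource.hpp>
#include <raft/core/resources.hpp>
#include <rmm/resource_ref.hpp>

template <typename DataT>
void allocate_io_buffers(raft::resources const& res, int64_t dim, int64_t gpu_top_k, int64_t top_k)
{
  constexpr size_t kMaxQueries        = 4096;  // assumed; cuVS reuses ivf_pq::detail::kMaxQueries
  constexpr size_t kMinWorkspaceRatio = 5;     // leave most of the workspace to ivf_pq::search

  // Bytes needed per query row across all I/O buffers of the batching loop.
  const size_t per_query_bytes = sizeof(DataT) * dim          // queries (dataset batch)
                               + sizeof(float) * gpu_top_k    // distances
                               + sizeof(int64_t) * gpu_top_k  // neighbors
                               + sizeof(float) * top_k        // refined distances
                               + sizeof(int64_t) * top_k;     // refined neighbors
  const size_t desired_workspace_size = kMaxQueries * kMinWorkspaceRatio * per_query_bytes;

  // If the bounded workspace cannot hold a comfortable multiple of the I/O buffers,
  // spill them into the large workspace resource instead of competing with ivf_pq::search.
  rmm::device_async_resource_ref mr =
    desired_workspace_size <= raft::resource::get_workspace_free_bytes(res)
      ? raft::resource::get_workspace_resource(res)
      : raft::resource::get_large_workspace_resource(res);

  auto distances = raft::make_device_mdarray<float>(
    res, mr, raft::make_extents<int64_t>(kMaxQueries, gpu_top_k));
  auto neighbors = raft::make_device_mdarray<int64_t>(
    res, mr, raft::make_extents<int64_t>(kMaxQueries, gpu_top_k));
  // ... the batched ivf_pq search/refine loop would consume these buffers ...
  (void)distances;
  (void)neighbors;
}

The ratio exists because ivf_pq::search draws its own scratch from the same workspace, so the I/O buffers are only kept there when they occupy a small fraction of it.
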
30 changes: 18 additions & 12 deletions cpp/src/neighbors/detail/cagra/graph_core.cuh
@@ -41,8 +41,7 @@
#include <memory>
#include <random>

namespace cuvs::neighbors::cagra::detail {
namespace graph {
namespace cuvs::neighbors::cagra::detail::graph {

// unnamed namespace to avoid multiple definition error
namespace {
@@ -251,15 +250,19 @@ void sort_knn_graph(
const uint32_t input_graph_degree = knn_graph.extent(1);
IdxT* const input_graph_ptr = knn_graph.data_handle();

auto d_input_graph = raft::make_device_matrix<IdxT, int64_t>(res, graph_size, input_graph_degree);
auto large_tmp_mr = raft::resource::get_large_workspace_resource(res);

auto d_input_graph = raft::make_device_mdarray<IdxT>(
res, large_tmp_mr, raft::make_extents<int64_t>(graph_size, input_graph_degree));

//
// Sorting kNN graph
//
const double time_sort_start = cur_time();
RAFT_LOG_DEBUG("# Sorting kNN Graph on GPUs ");

auto d_dataset = raft::make_device_matrix<DataT, int64_t>(res, dataset_size, dataset_dim);
auto d_dataset = raft::make_device_mdarray<DataT>(
res, large_tmp_mr, raft::make_extents<int64_t>(dataset_size, dataset_dim));
raft::copy(d_dataset.data_handle(),
dataset_ptr,
dataset_size * dataset_dim,
@@ -332,6 +335,7 @@ void optimize(
{
RAFT_LOG_DEBUG(
"# Pruning kNN graph (size=%lu, degree=%lu)\n", knn_graph.extent(0), knn_graph.extent(1));
auto large_tmp_mr = raft::resource::get_large_workspace_resource(res);

RAFT_EXPECTS(knn_graph.extent(0) == new_graph.extent(0),
"Each input array is expected to have the same number of rows");
@@ -347,15 +351,16 @@
//
// Prune kNN graph
//
auto d_detour_count =
raft::make_device_matrix<uint8_t, int64_t>(res, graph_size, input_graph_degree);
auto d_detour_count = raft::make_device_mdarray<uint8_t>(
res, large_tmp_mr, raft::make_extents<int64_t>(graph_size, input_graph_degree));

RAFT_CUDA_TRY(cudaMemsetAsync(d_detour_count.data_handle(),
0xff,
graph_size * input_graph_degree * sizeof(uint8_t),
raft::resource::get_cuda_stream(res)));

auto d_num_no_detour_edges = raft::make_device_vector<uint32_t, int64_t>(res, graph_size);
auto d_num_no_detour_edges = raft::make_device_mdarray<uint32_t>(
res, large_tmp_mr, raft::make_extents<int64_t>(graph_size));
RAFT_CUDA_TRY(cudaMemsetAsync(d_num_no_detour_edges.data_handle(),
0x00,
graph_size * sizeof(uint32_t),
@@ -475,14 +480,16 @@ void optimize(
graph_size * output_graph_degree * sizeof(IdxT),
raft::resource::get_cuda_stream(res)));

auto d_rev_graph_count = raft::make_device_vector<uint32_t, int64_t>(res, graph_size);
auto d_rev_graph_count = raft::make_device_mdarray<uint32_t>(
res, large_tmp_mr, raft::make_extents<int64_t>(graph_size));
RAFT_CUDA_TRY(cudaMemsetAsync(d_rev_graph_count.data_handle(),
0x00,
graph_size * sizeof(uint32_t),
raft::resource::get_cuda_stream(res)));

auto dest_nodes = raft::make_host_vector<IdxT, int64_t>(graph_size);
auto d_dest_nodes = raft::make_device_vector<IdxT, int64_t>(res, graph_size);
auto dest_nodes = raft::make_host_vector<IdxT, int64_t>(graph_size);
auto d_dest_nodes =
raft::make_device_mdarray<IdxT>(res, large_tmp_mr, raft::make_extents<int64_t>(graph_size));

for (uint64_t k = 0; k < output_graph_degree; k++) {
#pragma omp parallel for
Expand Down Expand Up @@ -578,5 +585,4 @@ void optimize(
}
}

} // namespace graph
} // namespace cuvs::neighbors::cagra::detail
} // namespace cuvs::neighbors::cagra::detail::graph
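
Throughout graph_core.cuh the change follows one pattern: temporaries whose size scales with graph_size and the graph degree are now allocated through raft::make_device_mdarray with the large workspace resource passed explicitly, instead of raft::make_device_matrix/_vector on the default device resource. A condensed sketch of the before/after, with illustrative sizes and function name:

#include <cstdint>
#include <raft/core/device_mdarray.hpp>
#include <raft/core/resource/device_memory_resource.hpp>
#include <raft/core/resources.hpp>

template <typename IdxT>
void allocate_graph_temporaries(raft::resources const& res, int64_t graph_size, int64_t degree)
{
  auto large_tmp_mr = raft::resource::get_large_workspace_resource(res);

  // Before: auto d_detour_count =
  //           raft::make_device_matrix<uint8_t, int64_t>(res, graph_size, degree);
  auto d_detour_count = raft::make_device_mdarray<uint8_t>(
    res, large_tmp_mr, raft::make_extents<int64_t>(graph_size, degree));

  // Before: auto d_rev_graph_count = raft::make_device_vector<uint32_t, int64_t>(res, graph_size);
  auto d_rev_graph_count = raft::make_device_mdarray<uint32_t>(
    res, large_tmp_mr, raft::make_extents<int64_t>(graph_size));

  (void)d_detour_count;
  (void)d_rev_graph_count;
}

Because these buffers can easily exceed the bounded per-algorithm workspace, routing them through the large workspace resource lets a user back them with managed memory without enlarging the regular workspace.
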
7 changes: 5 additions & 2 deletions cpp/src/neighbors/detail/cagra/utils.hpp
@@ -186,8 +186,11 @@ class device_matrix_view_from_host {
device_ptr = reinterpret_cast<T*>(attr.devicePointer);
if (device_ptr == NULL) {
// allocate memory and copy over
device_mem_.emplace(
raft::make_device_matrix<T, IdxT>(res, host_view.extent(0), host_view.extent(1)));
// NB: We use the temporary "large" workspace resource here; this structure is supposed to
// live on stack and not returned to a user.
// The user may opt to set this resource to managed memory to allow large allocations.
device_mem_.emplace(raft::make_device_mdarray<T, IdxT>(
res, raft::resource::get_large_workspace_resource(res), host_view.extents()));
raft::copy(device_mem_->data_handle(),
host_view.data_handle(),
host_view.extent(0) * host_view.extent(1),
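
The comment added here states the intent: the device mirror is a short-lived, stack-scoped staging buffer, so it is drawn from the large workspace resource, which the user may back with managed memory when the dataset does not fit in device memory. A simplified sketch of the check-then-mirror pattern follows; run_kernel is a hypothetical consumer and the includes are approximate.

#include <cuda_runtime.h>
#include <raft/core/device_mdarray.hpp>
#include <raft/core/host_mdspan.hpp>
#include <raft/core/resource/cuda_stream.hpp>
#include <raft/core/resource/device_memory_resource.hpp>
#include <raft/core/resources.hpp>
#include <raft/util/cudart_utils.hpp>

template <typename T>
void consume_matrix(raft::resources const& res, raft::host_matrix_view<const T, int64_t> host_view)
{
  cudaPointerAttributes attr;
  RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, host_view.data_handle()));
  if (attr.devicePointer != nullptr) {
    // Already device-accessible (e.g. managed memory): use it directly, no copy needed.
    // run_kernel(res, static_cast<const T*>(attr.devicePointer), ...);  // hypothetical
    return;
  }
  // Host-only memory: stage a device mirror in the large workspace resource and copy.
  auto mirror = raft::make_device_mdarray<T>(
    res, raft::resource::get_large_workspace_resource(res),
    raft::make_extents<int64_t>(host_view.extent(0), host_view.extent(1)));
  raft::copy(mirror.data_handle(),
             host_view.data_handle(),
             host_view.extent(0) * host_view.extent(1),
             raft::resource::get_cuda_stream(res));
  // run_kernel(res, mirror.data_handle(), ...);  // hypothetical; mirror stays in scope here
}
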
13 changes: 9 additions & 4 deletions cpp/src/neighbors/ivf_flat/ivf_flat_build.cuh
@@ -31,6 +31,7 @@
#include <raft/core/mdarray.hpp>
#include <raft/core/operators.hpp>
#include <raft/core/resource/cuda_stream.hpp>
#include <raft/core/resource/device_memory_resource.hpp>
#include <raft/core/resources.hpp>
#include <raft/linalg/add.cuh>
#include <raft/linalg/map.cuh>
@@ -184,7 +185,8 @@ void extend(raft::resources const& handle,
RAFT_EXPECTS(new_indices != nullptr || index->size() == 0,
"You must pass data indices when the index is non-empty.");

auto new_labels = raft::make_device_vector<LabelT, IdxT>(handle, n_rows);
auto new_labels = raft::make_device_mdarray<LabelT>(
handle, raft::resource::get_large_workspace_resource(handle), raft::make_extents<IdxT>(n_rows));
cuvs::cluster::kmeans::balanced_params kmeans_params;
kmeans_params.metric = index->metric();
auto orig_centroids_view =
@@ -215,7 +217,8 @@
}

auto* list_sizes_ptr = index->list_sizes().data_handle();
auto old_list_sizes_dev = raft::make_device_vector<uint32_t, IdxT>(handle, n_lists);
auto old_list_sizes_dev = raft::make_device_mdarray<uint32_t>(
handle, raft::resource::get_workspace_resource(handle), raft::make_extents<IdxT>(n_lists));
raft::copy(old_list_sizes_dev.data_handle(), list_sizes_ptr, n_lists, stream);

// Calculate the centers and sizes on the new data, starting from the original values
@@ -371,7 +374,8 @@ inline auto build(raft::resources const& handle,
auto trainset_ratio = std::max<size_t>(
1, n_rows / std::max<size_t>(params.kmeans_trainset_fraction * n_rows, index.n_lists()));
auto n_rows_train = n_rows / trainset_ratio;
rmm::device_uvector<T> trainset(n_rows_train * index.dim(), stream);
rmm::device_uvector<T> trainset(
n_rows_train * index.dim(), stream, raft::resource::get_large_workspace_resource(handle));
// TODO: a proper sampling
RAFT_CUDA_TRY(cudaMemcpy2DAsync(trainset.data(),
sizeof(T) * index.dim(),
@@ -431,7 +435,8 @@ inline void fill_refinement_index(raft::resources const& handle,
common::nvtx::range<common::nvtx::domain::cuvs> fun_scope(
"ivf_flat::fill_refinement_index(%zu, %u)", size_t(n_queries));

rmm::device_uvector<LabelT> new_labels(n_queries * n_candidates, stream);
rmm::device_uvector<LabelT> new_labels(
n_queries * n_candidates, stream, raft::resource::get_workspace_resource(handle));
auto new_labels_view =
raft::make_device_vector_view<LabelT, IdxT>(new_labels.data(), n_queries * n_candidates);
raft::linalg::map_offset(
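
The allocation changes in this file show the intended split: scratch proportional to the dataset (the kmeans trainset, the per-row labels in extend) moves to the large workspace resource, while scratch bounded by the list or candidate counts (old_list_sizes, the refinement labels) stays in the regular workspace. A self-contained sketch of that split, with an illustrative function name and sizes:

#include <cstdint>
#include <raft/core/resource/cuda_stream.hpp>
#include <raft/core/resource/device_memory_resource.hpp>
#include <raft/core/resources.hpp>
#include <rmm/device_uvector.hpp>

template <typename T>
void allocate_build_scratch(raft::resources const& handle,
                            size_t n_rows_train,
                            uint32_t dim,
                            uint32_t n_lists)
{
  auto stream = raft::resource::get_cuda_stream(handle);

  // Potentially huge: scales with the sampled training subset of the dataset.
  rmm::device_uvector<T> trainset(
    n_rows_train * dim, stream, raft::resource::get_large_workspace_resource(handle));

  // Small and bounded by the number of IVF lists: the regular workspace is enough.
  rmm::device_uvector<uint32_t> old_list_sizes(
    n_lists, stream, raft::resource::get_workspace_resource(handle));

  (void)trainset;
  (void)old_list_sizes;
}
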
23 changes: 10 additions & 13 deletions cpp/src/neighbors/ivf_flat/ivf_flat_search.cuh
@@ -280,17 +280,15 @@ void search_impl(raft::resources const& handle,
template <typename T,
typename IdxT,
typename IvfSampleFilterT = cuvs::neighbors::filtering::none_ivf_sample_filter>
inline void search_with_filtering(
raft::resources const& handle,
const search_params& params,
const index<T, IdxT>& index,
const T* queries,
uint32_t n_queries,
uint32_t k,
IdxT* neighbors,
float* distances,
rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource(),
IvfSampleFilterT sample_filter = IvfSampleFilterT())
inline void search_with_filtering(raft::resources const& handle,
const search_params& params,
const index<T, IdxT>& index,
const T* queries,
uint32_t n_queries,
uint32_t k,
IdxT* neighbors,
float* distances,
IvfSampleFilterT sample_filter = IvfSampleFilterT())
{
common::nvtx::range<common::nvtx::domain::cuvs> fun_scope(
"ivf_flat::search(k = %u, n_queries = %u, dim = %zu)", k, n_queries, index.dim());
@@ -335,7 +333,7 @@ inline void search_with_filtering(
cuvs::distance::is_min_close(index.metric()),
neighbors + offset_q * k,
distances + offset_q * k,
mr,
raft::resource::get_workspace_resource(handle),
sample_filter);
}
}
@@ -367,7 +365,6 @@ void search_with_filtering(raft::resources const& handle,
static_cast<std::uint32_t>(neighbors.extent(1)),
neighbors.data_handle(),
distances.data_handle(),
raft::resource::get_workspace_resource(handle),
sample_filter);
}

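
The signature change above drops the caller-supplied mr argument from the detail-level search; per-batch scratch now always comes from the handle's workspace resource. A generic sketch of that refactoring pattern, with a hypothetical do_search_batch standing in for the real per-batch work and an illustrative scratch size:

#include <cstdint>
#include <raft/core/resource/cuda_stream.hpp>
#include <raft/core/resource/device_memory_resource.hpp>
#include <raft/core/resources.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/resource_ref.hpp>

template <typename T>
void search_batched(raft::resources const& handle, const T* queries, uint32_t n_queries, uint32_t k)
{
  auto stream = raft::resource::get_cuda_stream(handle);
  // Previously this function took
  //   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()
  // as a parameter; now the workspace resource is fetched from the handle instead.
  rmm::device_async_resource_ref mr = raft::resource::get_workspace_resource(handle);
  rmm::device_uvector<float> batch_scratch(static_cast<size_t>(n_queries) * k, stream, mr);
  // do_search_batch(handle, queries, n_queries, k, batch_scratch.data());  // hypothetical
}
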