Scaling workspace resources #181

Merged
59 changes: 41 additions & 18 deletions cpp/src/neighbors/detail/cagra/cagra_build.cuh
@@ -34,6 +34,7 @@

// TODO: Fixme- this needs to be migrated
#include "../../ivf_pq/ivf_pq_build.cuh"
#include "../../ivf_pq/ivf_pq_search.cuh"
#include "../../nn_descent.cuh"

// TODO: This shouldn't be calling spatial/knn APIs
@@ -162,42 +163,64 @@ void build_knn_graph(
// search top (k + 1) neighbors
//

const auto top_k = node_degree + 1;
uint32_t gpu_top_k = node_degree * pq.refinement_rate;
gpu_top_k = std::min<IdxT>(std::max(gpu_top_k, top_k), dataset.extent(0));
const auto num_queries = dataset.extent(0);
const auto max_batch_size = 1024;
const auto top_k = node_degree + 1;
uint32_t gpu_top_k = node_degree * pq.refinement_rate;
gpu_top_k = std::min<IdxT>(std::max(gpu_top_k, top_k), dataset.extent(0));
const auto num_queries = dataset.extent(0);

// Use the same maximum batch size as the ivf_pq::search to avoid allocating more than needed.
using cuvs::neighbors::ivf_pq::detail::kMaxQueries;
// Heuristic: the build_knn_graph code should use only a fraction of the workspace memory; the
// rest should be used by the ivf_pq::search. Here we say that the workspace size should be a good
// multiple of what is required for the I/O batching below.
constexpr size_t kMinWorkspaceRatio = 5;
auto desired_workspace_size = kMaxQueries * kMinWorkspaceRatio *
(sizeof(DataT) * dataset.extent(1) // queries (dataset batch)
+ sizeof(float) * gpu_top_k // distances
+ sizeof(int64_t) * gpu_top_k // neighbors
+ sizeof(float) * top_k // refined_distances
+ sizeof(int64_t) * top_k // refined_neighbors
);

// If the workspace is smaller than desired, put the I/O buffers into the large workspace.
rmm::device_async_resource_ref workspace_mr =
desired_workspace_size <= raft::resource::get_workspace_free_bytes(res)
? raft::resource::get_workspace_resource(res)
: raft::resource::get_large_workspace_resource(res);

RAFT_LOG_DEBUG(
"IVF-PQ search node_degree: %d, top_k: %d, gpu_top_k: %d, max_batch_size:: %d, n_probes: %u",
node_degree,
top_k,
gpu_top_k,
max_batch_size,
kMaxQueries,
pq.search_params.n_probes);

auto distances = raft::make_device_matrix<float, int64_t>(res, max_batch_size, gpu_top_k);
auto neighbors = raft::make_device_matrix<int64_t, int64_t>(res, max_batch_size, gpu_top_k);
auto refined_distances = raft::make_device_matrix<float, int64_t>(res, max_batch_size, top_k);
auto refined_neighbors = raft::make_device_matrix<int64_t, int64_t>(res, max_batch_size, top_k);
auto neighbors_host = raft::make_host_matrix<int64_t, int64_t>(max_batch_size, gpu_top_k);
auto queries_host = raft::make_host_matrix<DataT, int64_t>(max_batch_size, dataset.extent(1));
auto refined_neighbors_host = raft::make_host_matrix<int64_t, int64_t>(max_batch_size, top_k);
auto refined_distances_host = raft::make_host_matrix<float, int64_t>(max_batch_size, top_k);
auto distances = raft::make_device_mdarray<float>(
res, workspace_mr, raft::make_extents<int64_t>(kMaxQueries, gpu_top_k));
auto neighbors = raft::make_device_mdarray<int64_t>(
res, workspace_mr, raft::make_extents<int64_t>(kMaxQueries, gpu_top_k));
auto refined_distances = raft::make_device_mdarray<float>(
res, workspace_mr, raft::make_extents<int64_t>(kMaxQueries, top_k));
auto refined_neighbors = raft::make_device_mdarray<int64_t>(
res, workspace_mr, raft::make_extents<int64_t>(kMaxQueries, top_k));
auto neighbors_host = raft::make_host_matrix<int64_t, int64_t>(kMaxQueries, gpu_top_k);
auto queries_host = raft::make_host_matrix<DataT, int64_t>(kMaxQueries, dataset.extent(1));
auto refined_neighbors_host = raft::make_host_matrix<int64_t, int64_t>(kMaxQueries, top_k);
auto refined_distances_host = raft::make_host_matrix<float, int64_t>(kMaxQueries, top_k);

// TODO(tfeher): batched search with multiple GPUs
std::size_t num_self_included = 0;
bool first = true;
const auto start_clock = std::chrono::system_clock::now();

rmm::device_async_resource_ref device_memory = raft::resource::get_workspace_resource(res);

cuvs::spatial::knn::detail::utils::batch_load_iterator<DataT> vec_batches(
dataset.data_handle(),
dataset.extent(0),
dataset.extent(1),
(int64_t)max_batch_size,
static_cast<int64_t>(kMaxQueries),
raft::resource::get_cuda_stream(res),
device_memory);
workspace_mr);

size_t next_report_offset = 0;
size_t d_report_offset = dataset.extent(0) / 100; // Report progress in 1% steps.
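
The hunk above replaces the fixed max_batch_size with the ivf_pq search batch size and adds a heuristic for where the batched I/O buffers should live. Below is a minimal, self-contained sketch of that pattern; the function name, the concrete kMaxQueries value (assumed 4096 here), and the local constants are illustrative, not the exact cuVS code.

#include <cstdint>
#include <raft/core/device_mdarray.hpp>
#include <raft/core/resource/device_memory_resource.hpp>
#include <raft/core/resources.hpp>
#include <rmm/resource_ref.hpp>

template <typename DataT>
void allocate_io_buffers(raft::resources const& res, int64_t dim, int64_t gpu_top_k, int64_t top_k)
{
  constexpr size_t kMaxQueries        = 4096;  // assumed; cuVS reuses ivf_pq::detail::kMaxQueries
  constexpr size_t kMinWorkspaceRatio = 5;     // leave most of the workspace to ivf_pq::search

  // Bytes needed per query row across all I/O buffers of the batching loop.
  const size_t per_query_bytes = sizeof(DataT) * dim          // queries (dataset batch)
                               + sizeof(float) * gpu_top_k    // distances
                               + sizeof(int64_t) * gpu_top_k  // neighbors
                               + sizeof(float) * top_k        // refined distances
                               + sizeof(int64_t) * top_k;     // refined neighbors
  const size_t desired_workspace_size = kMaxQueries * kMinWorkspaceRatio * per_query_bytes;

  // If the bounded workspace cannot hold a comfortable multiple of the I/O buffers,
  // spill them into the large workspace resource instead of competing with ivf_pq::search.
  rmm::device_async_resource_ref mr =
    desired_workspace_size <= raft::resource::get_workspace_free_bytes(res)
      ? raft::resource::get_workspace_resource(res)
      : raft::resource::get_large_workspace_resource(res);

  auto distances = raft::make_device_mdarray<float>(
    res, mr, raft::make_extents<int64_t>(kMaxQueries, gpu_top_k));
  auto neighbors = raft::make_device_mdarray<int64_t>(
    res, mr, raft::make_extents<int64_t>(kMaxQueries, gpu_top_k));
  // ... the batched ivf_pq search/refine loop would consume these buffers ...
  (void)distances;
  (void)neighbors;
}

The ratio exists because ivf_pq::search draws its own scratch from the same workspace, so the I/O buffers are only kept there when they occupy a small fraction of it.
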
30 changes: 18 additions & 12 deletions cpp/src/neighbors/detail/cagra/graph_core.cuh
@@ -41,8 +41,7 @@
#include <memory>
#include <random>

namespace cuvs::neighbors::cagra::detail {
namespace graph {
namespace cuvs::neighbors::cagra::detail::graph {

// unnamed namespace to avoid multiple definition error
namespace {
@@ -251,15 +250,19 @@ void sort_knn_graph(
const uint32_t input_graph_degree = knn_graph.extent(1);
IdxT* const input_graph_ptr = knn_graph.data_handle();

auto d_input_graph = raft::make_device_matrix<IdxT, int64_t>(res, graph_size, input_graph_degree);
auto large_tmp_mr = raft::resource::get_large_workspace_resource(res);

auto d_input_graph = raft::make_device_mdarray<IdxT>(
res, large_tmp_mr, raft::make_extents<int64_t>(graph_size, input_graph_degree));

//
// Sorting kNN graph
//
const double time_sort_start = cur_time();
RAFT_LOG_DEBUG("# Sorting kNN Graph on GPUs ");

auto d_dataset = raft::make_device_matrix<DataT, int64_t>(res, dataset_size, dataset_dim);
auto d_dataset = raft::make_device_mdarray<DataT>(
res, large_tmp_mr, raft::make_extents<int64_t>(dataset_size, dataset_dim));
raft::copy(d_dataset.data_handle(),
dataset_ptr,
dataset_size * dataset_dim,
@@ -332,6 +335,7 @@ void optimize(
{
RAFT_LOG_DEBUG(
"# Pruning kNN graph (size=%lu, degree=%lu)\n", knn_graph.extent(0), knn_graph.extent(1));
auto large_tmp_mr = raft::resource::get_large_workspace_resource(res);

RAFT_EXPECTS(knn_graph.extent(0) == new_graph.extent(0),
"Each input array is expected to have the same number of rows");
@@ -347,15 +351,16 @@
//
// Prune kNN graph
//
auto d_detour_count =
raft::make_device_matrix<uint8_t, int64_t>(res, graph_size, input_graph_degree);
auto d_detour_count = raft::make_device_mdarray<uint8_t>(
res, large_tmp_mr, raft::make_extents<int64_t>(graph_size, input_graph_degree));

RAFT_CUDA_TRY(cudaMemsetAsync(d_detour_count.data_handle(),
0xff,
graph_size * input_graph_degree * sizeof(uint8_t),
raft::resource::get_cuda_stream(res)));

auto d_num_no_detour_edges = raft::make_device_vector<uint32_t, int64_t>(res, graph_size);
auto d_num_no_detour_edges = raft::make_device_mdarray<uint32_t>(
res, large_tmp_mr, raft::make_extents<int64_t>(graph_size));
RAFT_CUDA_TRY(cudaMemsetAsync(d_num_no_detour_edges.data_handle(),
0x00,
graph_size * sizeof(uint32_t),
@@ -475,14 +480,16 @@ void optimize(
graph_size * output_graph_degree * sizeof(IdxT),
raft::resource::get_cuda_stream(res)));

auto d_rev_graph_count = raft::make_device_vector<uint32_t, int64_t>(res, graph_size);
auto d_rev_graph_count = raft::make_device_mdarray<uint32_t>(
res, large_tmp_mr, raft::make_extents<int64_t>(graph_size));
RAFT_CUDA_TRY(cudaMemsetAsync(d_rev_graph_count.data_handle(),
0x00,
graph_size * sizeof(uint32_t),
raft::resource::get_cuda_stream(res)));

auto dest_nodes = raft::make_host_vector<IdxT, int64_t>(graph_size);
auto d_dest_nodes = raft::make_device_vector<IdxT, int64_t>(res, graph_size);
auto dest_nodes = raft::make_host_vector<IdxT, int64_t>(graph_size);
auto d_dest_nodes =
raft::make_device_mdarray<IdxT>(res, large_tmp_mr, raft::make_extents<int64_t>(graph_size));

for (uint64_t k = 0; k < output_graph_degree; k++) {
#pragma omp parallel for
Expand Down Expand Up @@ -578,5 +585,4 @@ void optimize(
}
}

} // namespace graph
} // namespace cuvs::neighbors::cagra::detail
} // namespace cuvs::neighbors::cagra::detail::graph
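
Throughout graph_core.cuh the change follows one pattern: temporaries whose size scales with graph_size and the graph degree are now allocated through raft::make_device_mdarray with the large workspace resource passed explicitly, instead of raft::make_device_matrix/_vector on the default device resource. A condensed sketch of the before/after, with illustrative sizes and function name:

#include <cstdint>
#include <raft/core/device_mdarray.hpp>
#include <raft/core/resource/device_memory_resource.hpp>
#include <raft/core/resources.hpp>

template <typename IdxT>
void allocate_graph_temporaries(raft::resources const& res, int64_t graph_size, int64_t degree)
{
  auto large_tmp_mr = raft::resource::get_large_workspace_resource(res);

  // Before: auto d_detour_count =
  //           raft::make_device_matrix<uint8_t, int64_t>(res, graph_size, degree);
  auto d_detour_count = raft::make_device_mdarray<uint8_t>(
    res, large_tmp_mr, raft::make_extents<int64_t>(graph_size, degree));

  // Before: auto d_rev_graph_count = raft::make_device_vector<uint32_t, int64_t>(res, graph_size);
  auto d_rev_graph_count = raft::make_device_mdarray<uint32_t>(
    res, large_tmp_mr, raft::make_extents<int64_t>(graph_size));

  (void)d_detour_count;
  (void)d_rev_graph_count;
}

Because these buffers can easily exceed the bounded per-algorithm workspace, routing them through the large workspace resource lets a user back them with managed memory without enlarging the regular workspace.
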
7 changes: 5 additions & 2 deletions cpp/src/neighbors/detail/cagra/utils.hpp
@@ -186,8 +186,11 @@ class device_matrix_view_from_host {
device_ptr = reinterpret_cast<T*>(attr.devicePointer);
if (device_ptr == NULL) {
// allocate memory and copy over
device_mem_.emplace(
raft::make_device_matrix<T, IdxT>(res, host_view.extent(0), host_view.extent(1)));
// NB: We use the temporary "large" workspace resource here; this structure is supposed to
// live on stack and not returned to a user.
// The user may opt to set this resource to managed memory to allow large allocations.
device_mem_.emplace(raft::make_device_mdarray<T, IdxT>(
res, raft::resource::get_large_workspace_resource(res), host_view.extents()));
raft::copy(device_mem_->data_handle(),
host_view.data_handle(),
host_view.extent(0) * host_view.extent(1),
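
The comment added here states the intent: the device mirror is a short-lived, stack-scoped staging buffer, so it is drawn from the large workspace resource, which the user may back with managed memory when the dataset does not fit in device memory. A simplified sketch of the check-then-mirror pattern follows; run_kernel is a hypothetical consumer and the includes are approximate.

#include <cuda_runtime.h>
#include <raft/core/device_mdarray.hpp>
#include <raft/core/host_mdspan.hpp>
#include <raft/core/resource/cuda_stream.hpp>
#include <raft/core/resource/device_memory_resource.hpp>
#include <raft/core/resources.hpp>
#include <raft/util/cudart_utils.hpp>

template <typename T>
void consume_matrix(raft::resources const& res, raft::host_matrix_view<const T, int64_t> host_view)
{
  cudaPointerAttributes attr;
  RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, host_view.data_handle()));
  if (attr.devicePointer != nullptr) {
    // Already device-accessible (e.g. managed memory): use it directly, no copy needed.
    // run_kernel(res, static_cast<const T*>(attr.devicePointer), ...);  // hypothetical
    return;
  }
  // Host-only memory: stage a device mirror in the large workspace resource and copy.
  auto mirror = raft::make_device_mdarray<T>(
    res, raft::resource::get_large_workspace_resource(res),
    raft::make_extents<int64_t>(host_view.extent(0), host_view.extent(1)));
  raft::copy(mirror.data_handle(),
             host_view.data_handle(),
             host_view.extent(0) * host_view.extent(1),
             raft::resource::get_cuda_stream(res));
  // run_kernel(res, mirror.data_handle(), ...);  // hypothetical; mirror stays in scope here
}
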
13 changes: 9 additions & 4 deletions cpp/src/neighbors/ivf_flat/ivf_flat_build.cuh
@@ -31,6 +31,7 @@
#include <raft/core/mdarray.hpp>
#include <raft/core/operators.hpp>
#include <raft/core/resource/cuda_stream.hpp>
#include <raft/core/resource/device_memory_resource.hpp>
#include <raft/core/resources.hpp>
#include <raft/linalg/add.cuh>
#include <raft/linalg/map.cuh>
@@ -184,7 +185,8 @@ void extend(raft::resources const& handle,
RAFT_EXPECTS(new_indices != nullptr || index->size() == 0,
"You must pass data indices when the index is non-empty.");

auto new_labels = raft::make_device_vector<LabelT, IdxT>(handle, n_rows);
auto new_labels = raft::make_device_mdarray<LabelT>(
handle, raft::resource::get_large_workspace_resource(handle), raft::make_extents<IdxT>(n_rows));
cuvs::cluster::kmeans::balanced_params kmeans_params;
kmeans_params.metric = index->metric();
auto orig_centroids_view =
@@ -215,7 +217,8 @@
}

auto* list_sizes_ptr = index->list_sizes().data_handle();
auto old_list_sizes_dev = raft::make_device_vector<uint32_t, IdxT>(handle, n_lists);
auto old_list_sizes_dev = raft::make_device_mdarray<uint32_t>(
handle, raft::resource::get_workspace_resource(handle), raft::make_extents<IdxT>(n_lists));
raft::copy(old_list_sizes_dev.data_handle(), list_sizes_ptr, n_lists, stream);

// Calculate the centers and sizes on the new data, starting from the original values
@@ -371,7 +374,8 @@ inline auto build(raft::resources const& handle,
auto trainset_ratio = std::max<size_t>(
1, n_rows / std::max<size_t>(params.kmeans_trainset_fraction * n_rows, index.n_lists()));
auto n_rows_train = n_rows / trainset_ratio;
rmm::device_uvector<T> trainset(n_rows_train * index.dim(), stream);
rmm::device_uvector<T> trainset(
n_rows_train * index.dim(), stream, raft::resource::get_large_workspace_resource(handle));
// TODO: a proper sampling
RAFT_CUDA_TRY(cudaMemcpy2DAsync(trainset.data(),
sizeof(T) * index.dim(),
@@ -431,7 +435,8 @@ inline void fill_refinement_index(raft::resources const& handle,
common::nvtx::range<common::nvtx::domain::cuvs> fun_scope(
"ivf_flat::fill_refinement_index(%zu, %u)", size_t(n_queries));

rmm::device_uvector<LabelT> new_labels(n_queries * n_candidates, stream);
rmm::device_uvector<LabelT> new_labels(
n_queries * n_candidates, stream, raft::resource::get_workspace_resource(handle));
auto new_labels_view =
raft::make_device_vector_view<LabelT, IdxT>(new_labels.data(), n_queries * n_candidates);
raft::linalg::map_offset(
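
The allocation changes in this file show the intended split: scratch proportional to the dataset (the kmeans trainset, the per-row labels in extend) moves to the large workspace resource, while scratch bounded by the list or candidate counts (old_list_sizes, the refinement labels) stays in the regular workspace. A self-contained sketch of that split, with an illustrative function name and sizes:

#include <cstdint>
#include <raft/core/resource/cuda_stream.hpp>
#include <raft/core/resource/device_memory_resource.hpp>
#include <raft/core/resources.hpp>
#include <rmm/device_uvector.hpp>

template <typename T>
void allocate_build_scratch(raft::resources const& handle,
                            size_t n_rows_train,
                            uint32_t dim,
                            uint32_t n_lists)
{
  auto stream = raft::resource::get_cuda_stream(handle);

  // Potentially huge: scales with the sampled training subset of the dataset.
  rmm::device_uvector<T> trainset(
    n_rows_train * dim, stream, raft::resource::get_large_workspace_resource(handle));

  // Small and bounded by the number of IVF lists: the regular workspace is enough.
  rmm::device_uvector<uint32_t> old_list_sizes(
    n_lists, stream, raft::resource::get_workspace_resource(handle));

  (void)trainset;
  (void)old_list_sizes;
}
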
23 changes: 10 additions & 13 deletions cpp/src/neighbors/ivf_flat/ivf_flat_search.cuh
@@ -280,17 +280,15 @@ void search_impl(raft::resources const& handle,
template <typename T,
typename IdxT,
typename IvfSampleFilterT = cuvs::neighbors::filtering::none_ivf_sample_filter>
inline void search_with_filtering(
raft::resources const& handle,
const search_params& params,
const index<T, IdxT>& index,
const T* queries,
uint32_t n_queries,
uint32_t k,
IdxT* neighbors,
float* distances,
rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource(),
IvfSampleFilterT sample_filter = IvfSampleFilterT())
inline void search_with_filtering(raft::resources const& handle,
const search_params& params,
const index<T, IdxT>& index,
const T* queries,
uint32_t n_queries,
uint32_t k,
IdxT* neighbors,
float* distances,
IvfSampleFilterT sample_filter = IvfSampleFilterT())
{
common::nvtx::range<common::nvtx::domain::cuvs> fun_scope(
"ivf_flat::search(k = %u, n_queries = %u, dim = %zu)", k, n_queries, index.dim());
@@ -335,7 +333,7 @@ inline void search_with_filtering(
cuvs::distance::is_min_close(index.metric()),
neighbors + offset_q * k,
distances + offset_q * k,
mr,
raft::resource::get_workspace_resource(handle),
sample_filter);
}
}
@@ -367,7 +365,6 @@ void search_with_filtering(raft::resources const& handle,
static_cast<std::uint32_t>(neighbors.extent(1)),
neighbors.data_handle(),
distances.data_handle(),
raft::resource::get_workspace_resource(handle),
sample_filter);
}

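
The signature change above drops the caller-supplied mr argument from the detail-level search; per-batch scratch now always comes from the handle's workspace resource. A generic sketch of that refactoring pattern, with a hypothetical do_search_batch standing in for the real per-batch work and an illustrative scratch size:

#include <cstdint>
#include <raft/core/resource/cuda_stream.hpp>
#include <raft/core/resource/device_memory_resource.hpp>
#include <raft/core/resources.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/resource_ref.hpp>

template <typename T>
void search_batched(raft::resources const& handle, const T* queries, uint32_t n_queries, uint32_t k)
{
  auto stream = raft::resource::get_cuda_stream(handle);
  // Previously this function took
  //   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()
  // as a parameter; now the workspace resource is fetched from the handle instead.
  rmm::device_async_resource_ref mr = raft::resource::get_workspace_resource(handle);
  rmm::device_uvector<float> batch_scratch(static_cast<size_t>(n_queries) * k, stream, mr);
  // do_search_batch(handle, queries, n_queries, k, batch_scratch.data());  // hypothetical
}
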