8000 Improve Vamana index build performance and recall by bkarsin · Pull Request #1032 · rapidsai/cuvs · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

Improve Vamana index build performance and recall #1032

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 14 commits into
base: branch-25.08
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion cpp/include/cuvs/neighbors/vamana.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ struct index_params : cuvs::neighbors::index_params {
* literature **/
uint32_t visited_size = 64;
/** Number of Vamana vector insertion iterations (each iteration inserts all vectors). */
uint32_t vamana_iters = 1;
float vamana_iters = 1.0;
/** Alpha for pruning parameter */
float alpha = 1.2;
/** Maximum fraction of dataset inserted per batch. *
Expand All @@ -72,6 +72,7 @@ struct index_params : cuvs::neighbors::index_params {
uint32_t queue_size = 127;
/** Max batchsize of reverse edge processing (reduces memory footprint) */
uint32_t reverse_batchsize = 1000000;

};

/**
Expand Down
40 changes: 30 additions & 10 deletions cpp/src/neighbors/detail/vamana/greedy_search.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,11 @@

#pragma once

#include <cub/cub.cuh>

#include "macros.cuh"
#include "priority_queue.cuh"
#include "vamana_structs.cuh"
#include <cub/cub.cuh>
#include <cuvs/neighbors/vamana.hpp>

#include <cuvs/distance/distance.hpp>
Expand Down Expand Up @@ -52,7 +53,7 @@ __forceinline__ __device__ void sort_visited(
}

__syncthreads();
BlockSortT(*sort_mem).Sort(tmp, CmpDist());
BlockSortT(*sort_mem).Sort(tmp, CmpDist<IdxT, accT>());
__syncthreads();

for (int i = 0; i < ELTS; i++) {
Expand All @@ -64,6 +65,27 @@ __forceinline__ __device__ void sort_visited(

namespace {

/* Kernel that sorts the candidate list of each query by distance.
 * One thread block handles one query at a time, striding over queries by
 * gridDim.x; the actual sort is dispatched by the SEARCH_SELECT_SORT macro,
 * which picks a cub::BlockMergeSort specialization sized for `topk`
 * (presumably operating on the dynamic shared buffer `smem` — the macro is
 * defined in macros.cuh; verify there).
 *
 * query_list_ptr - opaque pointer to an array of QueryCandidates<IdxT, accT>
 *                  (passed as void* and cast below)
 * num_queries    - number of entries in the query list
 * topk           - number of candidates per query to sort/select
 *
 * NOTE(review): template parameters T and Accessor are not referenced in the
 * body — possibly kept for launch-site symmetry with GreedySearchKernel;
 * confirm before removing. */
template <typename T,
typename accT,
typename IdxT = uint32_t,
typename Accessor = raft::host_device_accessor<std::experimental::default_accessor<T>,
raft::memory_type::host>>
__global__ void SortPairsKernel(void* query_list_ptr, int num_queries, int topk)
{
// Union exists only to give the dynamic shared buffer the alignment (and a
// lower bound on the size) required by the CUB block sort's temp storage.
union ShmemLayout {
typename cub::BlockMergeSort<DistPair<IdxT, accT>, 32, 1>::TempStorage sort_mem;
};
// Dynamically sized shared memory, aligned for the CUB temp storage above.
extern __shared__ __align__(alignof(ShmemLayout)) char smem[];

QueryCandidates<IdxT, accT>* query_list =
static_cast<QueryCandidates<IdxT, accT>*>(query_list_ptr);

// Block-stride loop: block b processes queries b, b+gridDim.x, b+2*gridDim.x, ...
for (int i = blockIdx.x; i < num_queries; i += gridDim.x) {
// Barrier before each sort so the shared sort buffer from the previous
// iteration is no longer in use by any thread of the block.
__syncthreads();
SEARCH_SELECT_SORT(topk);
}
}

/********************************************************************************************
GPU kernel to perform a batched GreedySearch on a graph. Since this is used for
Vamana construction, the entire visited list is kept and stored within the query_list.
Expand All @@ -87,7 +109,7 @@ __global__ void GreedySearchKernel(
int topk,
cuvs::distance::DistanceType metric,
int max_queue_size,
int sort_smem_size)
Node<accT>* topk_pq_mem)
{
int n = dataset.extent(0);
int dim = dataset.extent(1);
Expand All @@ -105,9 +127,7 @@ __global__ void GreedySearchKernel(

union ShmemLayout {
// All blocksort sizes have same alignment (16)
typename cub::BlockMergeSort<DistPair<IdxT, accT>, 32, 1>::TempStorage sort_mem;
T coords;
Node<accT> topk_pq;
int neighborhood_arr;
DistPair<IdxT, accT> candidate_queue;
};
Expand All @@ -117,13 +137,14 @@ __global__ void GreedySearchKernel(
// Dynamic shared memory used for blocksort, temp vector storage, and neighborhood list
extern __shared__ __align__(alignof(ShmemLayout)) char smem[];

size_t smem_offset = sort_smem_size; // temp sorting memory takes first chunk
size_t smem_offset = 0;

T* s_coords = reinterpret_cast<T*>(&smem[smem_offset]);
smem_offset += (dim + align_padding) * sizeof(T);

Node<accT>* topk_pq = reinterpret_cast<Node<accT>*>(&smem[smem_offset]);
smem_offset += topk * sizeof(Node<accT>);
Node<accT>* topk_pq = &topk_pq_mem[blockIdx.x * topk];
// Node<accT>* topk_pq = reinterpret_cast<Node<accT>*>(&smem[smem_offset]); // Used to test
// scenarios using more shared memory smem_offset += topk * sizeof(Node<accT>);

int* neighbor_array = reinterpret_cast<int*>(&smem[smem_offset]);
smem_offset += degree * sizeof(int);
Expand Down Expand Up @@ -245,6 +266,7 @@ __global__ void GreedySearchKernel(
if (neighbor_array[j] == raft::upper_bound<IdxT>())
atomicMin(&num_neighbors, (int)j); // warp-wide min to find the number of neighbors
}
// __syncthreads();

// computing distances between the query vector and neighbor vectors then enqueue in priority
// queue.
Expand Down Expand Up @@ -272,8 +294,6 @@ __global__ void GreedySearchKernel(

__syncthreads();
if (self_found) query_list[i].size--;

SEARCH_SELECT_SORT(topk);
}

return;
Expand Down
21 changes: 0 additions & 21 deletions cpp/src/neighbors/detail/vamana/macros.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -42,27 +42,6 @@ namespace cuvs::neighbors::vamana::detail {
COMPUTE_SMEM_SIZES(degree, visited_size, 256, 512); \
COMPUTE_SMEM_SIZES(degree, visited_size, 256, 1024);

/* Macros to call the CUB BlockSort primitives for supported sizes for ROBUST_PRUNE*/
#define PRUNE_CALL_SORT(degree, visited_list, DEG, CANDS) \
if (degree == DEG && visited_list <= CANDS && visited_list > CANDS / 2) { \
using BlockSortT = cub::BlockMergeSort<DistPair<IdxT, accT>, 32, (DEG + CANDS) / 32>; \
auto& sort_mem = reinterpret_cast<typename BlockSortT::TempStorage&>(smem); \
sort_edges_and_cands<accT, IdxT, DEG, CANDS>(new_nbh_list, &query_list[i], &sort_mem); \
}

#define PRUNE_SELECT_SORT(degree, visited_list) \
PRUNE_CALL_SORT(degree, visited_size, 32, 64); \
PRUNE_CALL_SORT(degree, visited_size, 32, 128); \
PRUNE_CALL_SORT(degree, visited_size, 32, 256); \
PRUNE_CALL_SORT(degree, visited_size, 32, 512); \
PRUNE_CALL_SORT(degree, visited_size, 64, 128); \
PRUNE_CALL_SORT(degree, visited_size, 64, 256); \
PRUNE_CALL_SORT(degree, visited_size, 64, 512); \
PRUNE_CALL_SORT(degree, visited_size, 128, 256); \
PRUNE_CALL_SORT(degree, visited_size, 128, 512); \
PRUNE_CALL_SORT(degree, visited_size, 256, 512); \
PRUNE_CALL_SORT(degree, visited_size, 256, 1024);

/* Macros to call the CUB BlockSort primitives for supported sizes for GREEDY SEARCH */
#define SEARCH_CALL_SORT(topk, CANDS) \
if (topk <= CANDS && topk > CANDS / 2) { \
Expand Down
Loading
0