8000 Improve Vamana index build performance and recall by bkarsin · Pull Request #1032 · rapidsai/cuvs · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

Improve Vamana index build performance and recall #1032

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 14 commits into
base: branch-25.08
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion cpp/include/cuvs/neighbors/vamana.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ struct index_params : cuvs::neighbors::index_params {
* literature **/
uint32_t visited_size = 64;
/** Number of Vamana vector insertion iterations (each iteration inserts all vectors). */
uint32_t vamana_iters = 1;
float vamana_iters = 1.0;
/** Alpha for pruning parameter */
float alpha = 1.2;
/** Maximum fraction of dataset inserted per batch. *
Expand All @@ -72,6 +72,7 @@ struct index_params : cuvs::neighbors::index_params {
uint32_t queue_size = 127;
/** Max batchsize of reverse edge processing (reduces memory footprint) */
uint32_t reverse_batchsize = 1000000;

};

/**
Expand Down
40 changes: 30 additions & 10 deletions cpp/src/neighbors/detail/vamana/greedy_search.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,11 @@

#pragma once

#include <cub/cub.cuh>

#include "macros.cuh"
#include "priority_queue.cuh"
#include "vamana_structs.cuh"
#include <cub/cub.cuh>
#include <cuvs/neighbors/vamana.hpp>

#include <cuvs/distance/distance.hpp>
Expand Down Expand Up @@ -52,7 +53,7 @@ __forceinline__ __device__ void sort_visited(
}

__syncthreads();
BlockSortT(*sort_mem).Sort(tmp, CmpDist());
BlockSortT(*sort_mem).Sort(tmp, CmpDist<IdxT, accT>());
__syncthreads();

for (int i = 0; i < ELTS; i++) {
Expand All @@ -64,6 +65,27 @@ __forceinline__ __device__ void sort_visited(

namespace {

/* Kernel that sorts the candidate list of each query by distance.
 * One thread block handles one query at a time, striding over queries by
 * gridDim.x; the actual sort is dispatched by the SEARCH_SELECT_SORT macro,
 * which picks a cub::BlockMergeSort specialization sized for `topk`
 * (presumably operating on the dynamic shared buffer `smem` — the macro is
 * defined in macros.cuh; verify there).
 *
 * query_list_ptr - opaque pointer to an array of QueryCandidates<IdxT, accT>
 *                  (passed as void* and cast below)
 * num_queries    - number of entries in the query list
 * topk           - number of candidates per query to sort/select
 *
 * NOTE(review): template parameters T and Accessor are not referenced in the
 * body — possibly kept for launch-site symmetry with GreedySearchKernel;
 * confirm before removing. */
template <typename T,
typename accT,
typename IdxT = uint32_t,
typename Accessor = raft::host_device_accessor<std::experimental::default_accessor<T>,
raft::memory_type::host>>
__global__ void SortPairsKernel(void* query_list_ptr, int num_queries, int topk)
{
// Union exists only to give the dynamic shared buffer the alignment (and a
// lower bound on the size) required by the CUB block sort's temp storage.
union ShmemLayout {
typename cub::BlockMergeSort<DistPair<IdxT, accT>, 32, 1>::TempStorage sort_mem;
};
// Dynamically sized shared memory, aligned for the CUB temp storage above.
extern __shared__ __align__(alignof(ShmemLayout)) char smem[];

QueryCandidates<IdxT, accT>* query_list =
static_cast<QueryCandidates<IdxT, accT>*>(query_list_ptr);

// Block-stride loop: block b processes queries b, b+gridDim.x, b+2*gridDim.x, ...
for (int i = blockIdx.x; i < num_queries; i += gridDim.x) {
// Barrier before each sort so the shared sort buffer from the previous
// iteration is no longer in use by any thread of the block.
__syncthreads();
SEARCH_SELECT_SORT(topk);
}
}

/********************************************************************************************
GPU kernel to perform a batched GreedySearch on a graph. Since this is used for
Vamana construction, the entire visited list is kept and stored within the query_list.
Expand All @@ -87,7 +109,7 @@ __global__ void GreedySearchKernel(
int topk,
cuvs::distance::DistanceType metric,
int max_queue_size,
int sort_smem_size)
Node<accT>* topk_pq_mem)
{
int n = dataset.extent(0);
int dim = dataset.extent(1);
Expand All @@ -105,9 +127,7 @@ __global__ void GreedySearchKernel(

union ShmemLayout {
// All blocksort sizes have same alignment (16)
typename cub::BlockMergeSort<DistPair<IdxT, accT>, 32, 1>::TempStorage sort_mem;
T coords;
Node<accT> topk_pq;
int neighborhood_arr;
DistPair<IdxT, accT> candidate_queue;
};
Expand All @@ -117,13 +137,14 @@ __global__ void GreedySearchKernel(
// Dynamic shared memory used for blocksort, temp vector storage, and neighborhood list
extern __shared__ __align__(alignof(ShmemLayout)) char smem[];

size_t smem_offset = sort_smem_size; // temp sorting memory takes first chunk
size_t smem_offset = 0;

T* s_coords = reinterpret_cast<T*>(&smem[smem_offset]);
smem_offset += (dim + align_padding) * sizeof(T);

Node<accT>* topk_pq = reinterpret_cast<Node<accT>*>(&smem[smem_offset]);
smem_offset += topk * sizeof(Node<accT>);
Node<accT>* topk_pq = &topk_pq_mem[blockIdx.x * topk];
// Node<accT>* topk_pq = reinterpret_cast<Node<accT>*>(&smem[smem_offset]); // Used to test
// scenarios using more shared memory smem_offset += topk * sizeof(Node<accT>);

int* neighbor_array = reinterpret_cast<int*>(&smem[smem_offset]);
smem_offset += degree * sizeof(int);
Expand Down Expand Up @@ -245,6 +266,7 @@ __global__ void GreedySearchKernel(
if (neighbor_array[j] == raft::upper_bound<IdxT>())
atomicMin(&num_neighbors, (int)j); // warp-wide min to find the number of neighbors
}
// __syncthreads();

// computing distances between the query vector and neighbor vectors then enqueue in priority
// queue.
Expand Down Expand Up @@ -272,8 +294,6 @@ __global__ void GreedySearchKernel(

__syncthreads();
if (self_found) query_list[i].size--;

SEARCH_SELECT_SORT(topk);
}

return;
Expand Down
21 changes: 0 additions & 21 deletions cpp/src/neighbors/detail/vamana/macros.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -42,27 +42,6 @@ namespace cuvs::neighbors::vamana::detail {
COMPUTE_SMEM_SIZES(degree, visited_size, 256, 512); \
COMPUTE_SMEM_SIZES(degree, visited_size, 256, 1024);

/* Macros to call the CUB BlockSort primitives for supported sizes for ROBUST_PRUNE*/
#define PRUNE_CALL_SORT(degree, visited_list, DEG, CANDS) \
if (degree == DEG && visited_list <= CANDS && visited_list > CANDS / 2) { \
using BlockSortT = cub::BlockMergeSort<DistPair<IdxT, accT>, 32, (DEG + CANDS) / 32>; \
auto& sort_mem = reinterpret_cast<typename BlockSortT::TempStorage&>(smem); \
sort_edges_and_cands<accT, IdxT, DEG, CANDS>(new_nbh_list, &query_list[i], &sort_mem); \
}

#define PRUNE_SELECT_SORT(degree, visited_list) \
PRUNE_CALL_SORT(degree, visited_size, 32, 64); \
PRUNE_CALL_SORT(degree, visited_size, 32, 128); \
PRUNE_CALL_SORT(degree, visited_size, 32, 256); \
PRUNE_CALL_SORT(degree, visited_size, 32, 512); \
PRUNE_CALL_SORT(degree, visited_size, 64, 128); \
PRUNE_CALL_SORT(degree, visited_size, 64, 256); \
PRUNE_CALL_SORT(degree, visited_size, 64, 512); \
PRUNE_CALL_SORT(degree, visited_size, 128, 256); \
PRUNE_CALL_SORT(degree, visited_size, 128, 512); \
PRUNE_CALL_SORT(degree, visited_size, 256, 512); \
PRUNE_CALL_SORT(degree, visited_size, 256, 1024);

/* Macros to call the CUB BlockSort primitives for supported sizes for GREEDY SEARCH */
#define SEARCH_CALL_SORT(topk, CANDS) \
if (topk <= CANDS && topk > CANDS / 2) { \
Expand Down
Loading
0