From 27280aa6ef59f1c37c41768861acd4c0e9be2451 Mon Sep 17 00:00:00 2001
From: Peter Bell
Date: Mon, 18 Oct 2021 16:24:04 +0100
Subject: [PATCH] Remove native_functions.yaml dependency from TensorTopK.cu

[ghstack-poisoned]
---
 aten/src/ATen/native/cuda/TensorTopK.cpp | 52 ++++++++++++++++++++++++
 aten/src/ATen/native/cuda/TensorTopK.cu  | 51 ++++------------------
 aten/src/ATen/native/cuda/TensorTopK.h   | 13 ++++++
 caffe2/CMakeLists.txt                    |  2 +-
 4 files changed, 74 insertions(+), 44 deletions(-)
 create mode 100644 aten/src/ATen/native/cuda/TensorTopK.cpp
 create mode 100644 aten/src/ATen/native/cuda/TensorTopK.h

diff --git a/aten/src/ATen/native/cuda/TensorTopK.cpp b/aten/src/ATen/native/cuda/TensorTopK.cpp
new file mode 100644
index 0000000000000..163c32fa2d8e8
--- /dev/null
+++ b/aten/src/ATen/native/cuda/TensorTopK.cpp
@@ -0,0 +1,52 @@
+#include <ATen/ATen.h>
+#include <ATen/NativeFunctions.h>
+#include <ATen/native/cuda/TensorTopK.h>
+#include <ATen/native/cuda/Sort.h>
+
+namespace at {
+namespace native {
+
+TORCH_IMPL_FUNC(topk_out_cuda)
+  (const Tensor& self,
+   int64_t k, int64_t dim, bool largest, bool sorted,
+   const Tensor& values,
+   const Tensor& indices) {
+  TensorArg topK_arg{values, "topK", 1}, indices_arg{indices, "indices", 2}, input_arg{self, "self", 3};
+  checkAllSameGPU(__func__, {topK_arg, indices_arg, input_arg});
+  dim = at::maybe_wrap_dim(dim, self);
+
+  // If k is 0 the result is an empty tensor, so we don't need to launch a kernel.
+  if (k == 0) {
+    return;
+  }
+
+  launch_gather_topk_kernel(self, k, dim, largest, sorted, values, indices);
+
+  // Sort the results if the user wants them sorted, since our
+  // selection routine does not ensure sorting
+  if (sorted && values.numel() > 1) {
+    if (should_use_small_sort(values, dim)) {
+      // This avoids any memory allocations and performs all sorting
+      // work inplace along the slice
+
+      sortKeyValueInplace(values, indices, dim, largest);
+    } else {
+      // Depend upon the backup sort that returns indices, which we
+      // can use in conjunction with gather to produce the original
+      // indices.
+      // This is not the most efficient implementation, especially since
+      // there are memory allocations performed here. If the user desires
+      // greater performance, they should torch.gather() the results
+      // themselves using the reported indices, providing previously
+      // allocated tensors to receive the results.
+
+      Tensor sortedIndices = at::empty_like(indices);
+      Tensor sortedValues = at::empty_like(values);
+      sort_out_cuda(values, dim, largest, sortedValues, sortedIndices);
+      indices.copy_(indices.gather(dim, sortedIndices));
+      values.copy_(sortedValues);
+    }
+  }
+}
+
+}} // namespace at::native

diff --git a/aten/src/ATen/native/cuda/TensorTopK.cu b/aten/src/ATen/native/cuda/TensorTopK.cu
index 278de0528f01d..b74bba788f99b 100644
--- a/aten/src/ATen/native/cuda/TensorTopK.cu
+++ b/aten/src/ATen/native/cuda/TensorTopK.cu
@@ -1,9 +1,11 @@
-#include <ATen/ATen.h>
+#define TORCH_ASSERT_NO_OPERATORS
+#include <ATen/native/cuda/TensorTopK.h>
+#include <ATen/core/TensorBase.h>
 #include <ATen/Dispatch.h>
+#include <ATen/ceil_div.h>
 #include <ATen/cuda/detail/TensorInfo.cuh>
 #include <ATen/cuda/detail/OffsetCalculator.cuh>
 #include <ATen/native/cuda/SortingCommon.cuh>
-#include <ATen/NativeFunctions.h>
 #include <ATen/native/cuda/SortingRadixSelect.cuh>
 #include <ATen/native/cuda/SortUtils.cuh>
 #include <ATen/cuda/CUDAContext.h>
@@ -161,26 +163,15 @@ __global__ void gatherTopK(at::cuda::detail::TensorInfo<T, IndexType> input,
 } // namespace
 
-TORCH_IMPL_FUNC(topk_out_cuda)
-  (const Tensor& self,
-   int64_t k, int64_t dim, bool largest, bool sorted,
-   const Tensor& values,
-   const Tensor& indices) {
-  TensorArg topK_arg{values, "topK", 1}, indices_arg{indices, "indices", 2}, input_arg{self, "self", 3};
-  checkAllSameGPU(__func__, {topK_arg, indices_arg, input_arg});
-  dim = at::maybe_wrap_dim(dim, self);
-
+void launch_gather_topk_kernel(
+    const TensorBase& self, int64_t k, int64_t dim, bool largest, bool sorted,
+    const TensorBase& values, const TensorBase& indices) {
   int numDims = self.dim();
   numDims = numDims == 0 ? 1 : numDims;
   TORCH_CHECK(numDims <= MAX_DIMS, "input tensor has too many dimensions");
 
   int64_t sliceSize = self.dim() == 0 ? 1 : self.size(dim);
 
-  Tensor input = self.contiguous();
-
-  // If k is 0 the result is an empty tensor, so we don't need to launch a kernel.
-  if (k == 0) {
-    return;
-  }
+  auto input = self.contiguous();
 
   // static_cast is required to ensure that the correct type (INDEX_T)
   // is provided to the kernel for the arguments.
@@ -297,32 +288,6 @@ TORCH_IMPL_FUNC(topk_out_cuda)
 #undef RUN_DIM
 #undef RUN_DIR
 #undef RUN_K
-
-  // Sort the results if the user wants them sorted, since our
-  // selection routine does not ensure sorting
-  if (sorted && values.numel() > 1) {
-    if (should_use_small_sort(values, dim)) {
-      // This avoids any memory allocations and performs all sorting
-      // work inplace along the slice
-
-      sortKeyValueInplace(values, indices, dim, largest);
-    } else {
-      // Depend upon the backup sort that returns indices, which we
-      // can use in conjunction with gather to produce the original
-      // indices.
-      // This is not the most efficient implementation, especially since
-      // there are memory allocations performed here. If the user desires
-      // greater performance, they should torch.gather() the results
-      // themselves using the reported indices, providing previously
-      // allocated tensors to receive the results.
-
-      Tensor sortedIndices = at::empty_like(indices);
-      Tensor sortedValues = at::empty_like(values);
-      sort_out_cuda(values, dim, largest, sortedValues, sortedIndices);
-      indices.copy_(indices.gather(dim, sortedIndices));
-      values.copy_(sortedValues);
-    }
-  }
 }
 
 } // at::native

diff --git a/aten/src/ATen/native/cuda/TensorTopK.h b/aten/src/ATen/native/cuda/TensorTopK.h
new file mode 100644
index 0000000000000..9334f29a46d34
--- /dev/null
+++ b/aten/src/ATen/native/cuda/TensorTopK.h
@@ -0,0 +1,13 @@
+#include <cstdint>
+
+namespace at {
+class TensorBase;
+}
+
+namespace at {
+namespace native {
+void launch_gather_topk_kernel(
+    const TensorBase& self,
+    int64_t k, int64_t dim, bool largest, bool sorted,
+    const TensorBase& values, const TensorBase& indices);
+}}

diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt
index a215db11c47cf..102d5a963a0f7 100644
--- a/caffe2/CMakeLists.txt
+++ b/caffe2/CMakeLists.txt
@@ -176,7 +176,7 @@ endif()
 if(BUILD_SPLIT_CUDA)
   # Splitting the source files that'll be in torch_cuda between torch_cuda_cu and torch_cuda_cpp
   foreach(tmp ${Caffe2_GPU_SRCS})
-    if("${tmp}" MATCHES "(.*aten.*\\.cu|.*(b|B)las.*|.*((s|S)olver|Register.*CUDA|Legacy|THC|TensorShapeCUDA|BatchLinearAlgebra|ReduceOps|Equal|Activation|ScanKernels|Sort).*\\.cpp)" AND NOT "${tmp}" MATCHES ".*(THC((CachingHost)?Allocator|General)).*")
+    if("${tmp}" MATCHES "(.*aten.*\\.cu|.*(b|B)las.*|.*((s|S)olver|Register.*CUDA|Legacy|THC|TensorShapeCUDA|BatchLinearAlgebra|ReduceOps|Equal|Activation|ScanKernels|Sort|TensorTopK).*\\.cpp)" AND NOT "${tmp}" MATCHES ".*(THC((CachingHost)?Allocator|General)).*")
       # Currently, torch_cuda_cu will have all the .cu files in aten, as well as some others that depend on those files
       list(APPEND Caffe2_GPU_SRCS_CU ${tmp})
     else()
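
The fallback branch this patch moves into TensorTopK.cpp (backup sort, then a gather through the sort permutation) can also be reproduced from user code, as the in-code comment suggests. Below is a minimal sketch of that dataflow using only public ATen calls rather than the internal sort_out_cuda entry point; it assumes a CUDA build and C++17, and the tensor shapes and the k/dim choices are illustrative, not taken from the patch:

#include <ATen/ATen.h>

int main() {
  // Illustrative input; any CUDA tensor works.
  at::Tensor x = at::randn({4, 1024}, at::kCUDA);

  // Ask topk for unsorted output, mirroring the state of values/indices
  // right after launch_gather_topk_kernel and before the fallback re-sort.
  auto [values, indices] = at::topk(x, /*k=*/8, /*dim=*/1,
                                    /*largest=*/true, /*sorted=*/false);

  // Backup sort: order the selected values, then pull the original input
  // indices through the sort permutation. This is the same dataflow as the
  // sort_out_cuda(...) plus indices.gather(...) branch in the patch.
  auto [sorted_values, perm] = at::sort(values, /*dim=*/1, /*descending=*/true);
  at::Tensor sorted_indices = at::gather(indices, /*dim=*/1, perm);
  return 0;
}

As the removed comment notes, doing this by hand lets a caller reuse preallocated output tensors instead of paying for the allocations the in-kernel fallback performs.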