From 27280aa6ef59f1c37c41768861acd4c0e9be2451 Mon Sep 17 00:00:00 2001
From: Peter Bell
Date: Mon, 18 Oct 2021 16:24:04 +0100
Subject: [PATCH] Remove native_functions.yaml dependency from TensorTopK.cu

[ghstack-poisoned]
---
 aten/src/ATen/native/cuda/TensorTopK.cpp | 52 ++++++++++++++++++++++++
 aten/src/ATen/native/cuda/TensorTopK.cu  | 51 ++++------------------
 aten/src/ATen/native/cuda/TensorTopK.h   | 13 ++++++
 caffe2/CMakeLists.txt                    |  2 +-
 4 files changed, 74 insertions(+), 44 deletions(-)
 create mode 100644 aten/src/ATen/native/cuda/TensorTopK.cpp
 create mode 100644 aten/src/ATen/native/cuda/TensorTopK.h

diff --git a/aten/src/ATen/native/cuda/TensorTopK.cpp b/aten/src/ATen/native/cuda/TensorTopK.cpp
new file mode 100644
index 0000000000000..163c32fa2d8e8
--- /dev/null
+++ b/aten/src/ATen/native/cuda/TensorTopK.cpp
@@ -0,0 +1,52 @@
+#include <ATen/ATen.h>
+#include <ATen/NativeFunctions.h>
+#include <ATen/native/cuda/TensorTopK.h>
+#include <ATen/native/cuda/Sort.h>
+
+namespace at {
+namespace native {
+
+TORCH_IMPL_FUNC(topk_out_cuda)
+  (const Tensor& self,
+   int64_t k, int64_t dim, bool largest, bool sorted,
+   const Tensor& values,
+   const Tensor& indices) {
+  TensorArg topK_arg{values, "topK", 1}, indices_arg{indices, "indices", 2}, input_arg{self, "self", 3};
+  checkAllSameGPU(__func__, {topK_arg, indices_arg, input_arg});
+  dim = at::maybe_wrap_dim(dim, self);
+
+  // If k is 0 the result is an empty tensor, so we don't need to launch a kernel.
+  if (k == 0) {
+    return;
+  }
+
+  launch_gather_topk_kernel(self, k, dim, largest, sorted, values, indices);
+
+  // Sort the results if the user wants them sorted, since our
+  // selection routine does not ensure sorting
+  if (sorted && values.numel() > 1) {
+    if (should_use_small_sort(values, dim)) {
+      // This avoids any memory allocations and performs all sorting
+      // work inplace along the slice
+
+      sortKeyValueInplace(values, indices, dim, largest);
+    } else {
+      // Depend upon the backup sort that returns indices, which we
+      // can use in conjunction with gather to produce the original
+      // indices.
+      // This is not the most efficient implementation, especially since
+      // there are memory allocations performed here. If the user desires
+      // greater performance, they should torch.gather() the results
+      // themselves using the reported indices, providing previously
+      // allocated tensors to receive the results.
+
+      Tensor sortedIndices = at::empty_like(indices);
+      Tensor sortedValues = at::empty_like(values);
+      sort_out_cuda(values, dim, largest, sortedValues, sortedIndices);
+      indices.copy_(indices.gather(dim, sortedIndices));
+      values.copy_(sortedValues);
+    }
+  }
+}
+
+}} // namespace at::native

diff --git a/aten/src/ATen/native/cuda/TensorTopK.cu b/aten/src/ATen/native/cuda/TensorTopK.cu
index 278de0528f01d..b74bba788f99b 100644
--- a/aten/src/ATen/native/cuda/TensorTopK.cu
+++ b/aten/src/ATen/native/cuda/TensorTopK.cu
@@ -1,9 +1,11 @@
-#include <ATen/ATen.h>
+#define TORCH_ASSERT_NO_OPERATORS
+#include <ATen/native/cuda/TensorTopK.h>
+#include <ATen/core/TensorBase.h>
 #include <ATen/Dispatch.h>
+#include <ATen/ceil_div.h>
 #include <ATen/cuda/detail/TensorInfo.cuh>
 #include <ATen/cuda/detail/OffsetCalculator.cuh>
 #include <ATen/native/cuda/SortingCommon.cuh>
-#include <ATen/NativeFunctions.h>
 #include <ATen/native/cuda/SortingRadixSelect.cuh>
 #include <ATen/native/cuda/SortUtils.cuh>
 #include <ATen/cuda/CUDAContext.h>
@@ -161,26 +163,15 @@ __global__ void gatherTopK(at::cuda::detail::TensorInfo<T, IndexType> input,
 } // namespace
 
-TORCH_IMPL_FUNC(topk_out_cuda)
-  (const Tensor& self,
-   int64_t k, int64_t dim, bool largest, bool sorted,
-   const Tensor& values,
-   const Tensor& indices) {
-  TensorArg topK_arg{values, "topK", 1}, indices_arg{indices, "indices", 2}, input_arg{self, "self", 3};
-  checkAllSameGPU(__func__, {topK_arg, indices_arg, input_arg});
-  dim = at::maybe_wrap_dim(dim, self);
-
+void launch_gather_topk_kernel(
+    const TensorBase& self, int64_t k, int64_t dim, bool largest, bool sorted,
+    const TensorBase& values, const TensorBase& indices) {
   int numDims = self.dim();
   numDims = numDims == 0 ? 1 : numDims;
   TORCH_CHECK(numDims <= MAX_DIMS, "input tensor has too many dimensions");
 
   int64_t sliceSize = self.dim() == 0 ? 1 : self.size(dim);
 
-  Tensor input = self.contiguous();
-
-  // If k is 0 the result is an empty tensor, so we don't need to launch a kernel.
-  if (k == 0) {
-    return;
-  }
+  auto input = self.contiguous();
 
   // static_cast is required to ensure that the correct type (INDEX_T)
   // is provided to the kernel for the arguments.
@@ -297,32 +288,6 @@ TORCH_IMPL_FUNC(topk_out_cuda)
 #undef RUN_DIM
 #undef RUN_DIR
 #undef RUN_K
-
-  // Sort the results if the user wants them sorted, since our
-  // selection routine does not ensure sorting
-  if (sorted && values.numel() > 1) {
-    if (should_use_small_sort(values, dim)) {
-      // This avoids any memory allocations and performs all sorting
-      // work inplace along the slice
-
-      sortKeyValueInplace(values, indices, dim, largest);
-    } else {
-      // Depend upon the backup sort that returns indices, which we
-      // can use in conjunction with gather to produce the original
-      // indices.
-      // This is not the most efficient implementation, especially since
-      // there are memory allocations performed here. If the user desires
-      // greater performance, they should torch.gather() the results
-      // themselves using the reported indices, providing previously
-      // allocated tensors to receive the results.
-
-      Tensor sortedIndices = at::empty_like(indices);
-      Tensor sortedValues = at::empty_like(values);
-      sort_out_cuda(values, dim, largest, sortedValues, sortedIndices);
-      indices.copy_(indices.gather(dim, sortedIndices));
-      values.copy_(sortedValues);
-    }
-  }
 }
 
 } // at::native

diff --git a/aten/src/ATen/native/cuda/TensorTopK.h b/aten/src/ATen/native/cuda/TensorTopK.h
new file mode 100644
index 0000000000000..9334f29a46d34
--- /dev/null
+++ b/aten/src/ATen/native/cuda/TensorTopK.h
@@ -0,0 +1,13 @@
+#include <cstdint>
+
+namespace at {
+class TensorBase;
+}
+
+namespace at {
+namespace native {
+void launch_gather_topk_kernel(
+    const TensorBase& self,
+    int64_t k, int64_t dim, bool largest, bool sorted,
+    const TensorBase& values, const TensorBase& indices);
+}}

diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt
index a215db11c47cf..102d5a963a0f7 100644
--- a/caffe2/CMakeLists.txt
+++ b/caffe2/CMakeLists.txt
@@ -176,7 +176,7 @@ endif()
 if(BUILD_SPLIT_CUDA)
   # Splitting the source files that'll be in torch_cuda between torch_cuda_cu and torch_cuda_cpp
   foreach(tmp ${Caffe2_GPU_SRCS})
-    if("${tmp}" MATCHES "(.*aten.*\\.cu|.*(b|B)las.*|.*((s|S)olver|Register.*CUDA|Legacy|THC|TensorShapeCUDA|BatchLinearAlgebra|ReduceOps|Equal|Activation|ScanKernels|Sort).*\\.cpp)" AND NOT "${tmp}" MATCHES ".*(THC((CachingHost)?Allocator|General)).*")
+    if("${tmp}" MATCHES "(.*aten.*\\.cu|.*(b|B)las.*|.*((s|S)olver|Register.*CUDA|Legacy|THC|TensorShapeCUDA|BatchLinearAlgebra|ReduceOps|Equal|Activation|ScanKernels|Sort|TensorTopK).*\\.cpp)" AND NOT "${tmp}" MATCHES ".*(THC((CachingHost)?Allocator|General)).*")
       # Currently, torch_cuda_cu will have all the .cu files in aten, as well as some others that depend on those files
       list(APPEND Caffe2_GPU_SRCS_CU ${tmp})
     else()
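
The fallback branch this patch moves into TensorTopK.cpp (backup sort, then a gather through the sort permutation) can also be reproduced from user code, as the in-code comment suggests. Below is a minimal sketch of that dataflow using only public ATen calls rather than the internal sort_out_cuda entry point; it assumes a CUDA build and C++17, and the tensor shapes and the k/dim choices are illustrative, not taken from the patch:

#include <ATen/ATen.h>

int main() {
  // Illustrative input; any CUDA tensor works.
  at::Tensor x = at::randn({4, 1024}, at::kCUDA);

  // Ask topk for unsorted output, mirroring the state of values/indices
  // right after launch_gather_topk_kernel and before the fallback re-sort.
  auto [values, indices] = at::topk(x, /*k=*/8, /*dim=*/1,
                                    /*largest=*/true, /*sorted=*/false);

  // Backup sort: order the selected values, then pull the original input
  // indices through the sort permutation. This is the same dataflow as the
  // sort_out_cuda(...) plus indices.gather(...) branch in the patch.
  auto [sorted_values, perm] = at::sort(values, /*dim=*/1, /*descending=*/true);
  at::Tensor sorted_indices = at::gather(indices, /*dim=*/1, perm);
  return 0;
}

As the removed comment notes, doing this by hand lets a caller reuse preallocated output tensors instead of paying for the allocations the in-kernel fallback performs.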