Remove native_functions.yaml dependency from TensorTopK.cu by peterbell10 · Pull Request #66794 · pytorch/pytorch · GitHub

Remove native_functions.yaml dependency from TensorTopK.cu #66794


Closed
peterbell10 wants to merge 11 commits
53 changes: 53 additions & 0 deletions aten/src/ATen/native/cuda/TensorTopK.cpp
@@ -0,0 +1,53 @@
#include <ATen/native/cuda/TensorTopK.h>
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#include <ATen/WrapDimUtils.h>
#include <ATen/native/cuda/Sort.h>

namespace at {
namespace native {

TORCH_IMPL_FUNC(topk_out_cuda)
(const Tensor& self,
 int64_t k, int64_t dim, bool largest, bool sorted,
 const Tensor& values,
 const Tensor& indices) {
  TensorArg topK_arg{values, "topK", 1}, indices_arg{indices, "indices", 2}, input_arg{self, "self", 3};
  checkAllSameGPU(__func__, {topK_arg, indices_arg, input_arg});
  dim = at::maybe_wrap_dim(dim, self);

  // If k is 0 the result is an empty tensor, so we don't need to launch a kernel.
  if (k == 0) {
    return;
  }

  launch_gather_topk_kernel(self, k, dim, largest, sorted, values, indices);

  // Sort the results if the user wants them sorted, since our
  // selection routine does not ensure sorting
  if (sorted && values.numel() > 1) {
    if (should_use_small_sort(values, dim)) {
      // This avoids any memory allocations and performs all sorting
      // work inplace along the slice

      sortKeyValueInplace(values, indices, dim, largest);
    } else {
      // Depend upon the backup sort that returns indices, which we
      // can use in conjunction with gather to produce the original
      // indices.
      // This is not the most efficient implementation, especially since
      // there are memory allocations performed here. If the user desires
      // greater performance, they should torch.gather() the results
      // themselves using the reported indices, providing previously
      // allocated tensors to receive the results.

      Tensor sortedIndices = at::empty_like(indices);
      Tensor sortedValues = at::empty_like(values);
      sort_out_cuda(values, dim, largest, sortedValues, sortedIndices);
      indices.copy_(indices.gather(dim, sortedIndices));
      values.copy_(sortedValues);
    }
  }
}

}} // namespace at::native
51 changes: 8 additions & 43 deletions aten/src/ATen/native/cuda/TensorTopK.cu
@@ -1,9 +1,11 @@
#include <ATen/ATen.h>
#define TORCH_ASSERT_NO_OPERATORS
#include <ATen/native/cuda/TensorTopK.h>
#include <ATen/core/TensorBase.h>
#include <ATen/ceil_div.h>
#include <ATen/Dispatch.h>
#include <ATen/cuda/detail/TensorInfo.cuh>
#include <ATen/cuda/detail/OffsetCalculator.cuh>
#include <ATen/cuda/ScanUtils.cuh>
#include <ATen/native/Resize.h>
#include <ATen/native/cuda/SortingCommon.cuh>
#include <ATen/native/cuda/SortingRadixSelect.cuh>
#include <ATen/native/cuda/SortUtils.cuh>
@@ -161,26 +163,15 @@ __global__ void gatherTopK(at::cuda::detail::TensorInfo<T, IndexType> input,

} // namespace

TORCH_IMPL_FUNC(topk_out_cuda)
(const Tensor& self,
int64_t k, int64_t dim, bool largest, bool sorted,
const Tensor& values,
const Tensor& indices) {
TensorArg topK_arg{values, "topK", 1}, indices_arg{indices, "indices", 2}, input_arg{self, "self", 3};
checkAllSameGPU(__func__, {topK_arg, indices_arg, input_arg});
dim = at::maybe_wrap_dim(dim, self);

void launch_gather_topk_kernel(
const TensorBase& self, int64_t k, int64_t dim, bool largest, bool sorted,
const TensorBase& values, const TensorBase& indices) {
int numDims = self.dim();
Collaborator:

out of curiosity, what is the criteria by which these four lines remain in the .cu but the if (k == 0) moves?

My understanding of your work is that you are prioritizing the following:

  1. pull out dependencies of Tensor and ATen.h from .cu files.
  2. less important but where possible, extract other unnecessary dependencies from .cu files
  3. take advantage of other opportunities to move code out of .cu files

Is that roughly the prioritization of your approach here? Under that, moving this block and the k == 0 check both fit under 3) as the lowest priority.

Reordering the k == 0 check does change the behavior since it now avoids the check about having too many dimensions. Is that OK? FWIW, I like the idea of being stringent on inputs rather than letting a loophole like this let the user get away with an invalid input.
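
For concreteness, a rough sketch of the behavior difference (illustrative only, not from this PR; it assumes the 25-dimension limit quoted below and a CUDA build):

#include <ATen/ATen.h>
#include <vector>

int main() {
  // A 26-d tensor exceeds the top-k kernel's dimension limit.
  auto t = at::ones(std::vector<int64_t>(26, 1), at::device(at::kCUDA));
  // Before this PR, the "too many dimensions" TORCH_CHECK in TensorTopK.cu ran
  // even for k == 0 and threw. After it, the k == 0 early return in
  // TensorTopK.cpp fires before launch_gather_topk_kernel is called, so this
  // returns empty values/indices without hitting that check.
  auto result = at::topk(t, /*k=*/0);
  return 0;
}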

Changing topic altogether, do you think that splitting code up this way causes any meaningful harm by creating cross-module optimization barriers?

Collaborator Author:

> out of curiosity, what is the criteria by which these four lines remain in the .cu but the if (k == 0) moves?

The return statement must go in the .cpp file function so we don't launch the sorting kernels. It would make sense to keep the MAX_DIMS checks together with it, but MAX_DIMS is defined in a .cuh header file so it needs nvcc:

constexpr int MAX_DIMS = 25;

> My understanding of your work is that you are prioritizing the following: [...]

This is mostly right, although .cu isn't actually in my criteria anywhere. I'm currently focusing on files that depend on native_functions.yaml, prioritized by their compile time (to maximize impact). It just so happens that cuda code is much slower to compile so the top of the list is all cuda files. Somewhat interestingly, GridSample.cpp was above GridSample.cu in compile time which is why that PR changes both.

The top of the list at the moment looks like this:

caffe2/CMakeFiles/torch_cuda_cu.dir/__/aten/src/ATen/native/cuda/DistributionBernoulli.cu.o
     1m 13s 344ms
caffe2/CMakeFiles/torch_cuda_cu.dir/__/aten/src/ATen/native/cuda/ForeachUnaryOp.cu.o
     1m 12s 425ms
caffe2/CMakeFiles/torch_cuda_cu.dir/__/aten/src/ATen/native/cuda/Distributions.cu.o
     1m 8s 582ms
caffe2/CMakeFiles/torch_cuda_cu.dir/__/aten/src/ATen/native/cuda/Indexing.cu.o
     1m 7s 595ms
caffe2/CMakeFiles/torch_cuda_cu.dir/__/aten/src/ATen/native/cuda/group_norm_kernel.cu.o
     0m 59s 931ms
caffe2/CMakeFiles/torch_cuda_cu.dir/__/aten/src/ATen/RegisterCUDA.cpp.o
     0m 55s 472ms

> FWIW, I like the idea of being stringent on inputs rather than letting a loophole like this let the user get away with an invalid input.

I wouldn't say that applies here since MAX_DIMS is an implementation limitation not an invalid input. If, for example, matmul allowed empty tensors to have a shape mismatch then I would agree.

> Changing topic altogether, do you think that splitting code up this way causes any meaningful harm by creating cross-module optimization barriers?

I don't think there's much the compiler can do here, but things like calling the same tensor method in both functions will have some impact (especially for virtual methods). However, generally speaking, the heavy lifting in these functions is done in the CUDA runtime to actually launch the kernel, so if there is any slowdown I expect it to be fairly minimal.
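
To make that concrete, a small sketch of the kind of repeated metadata query this refers to (hypothetical helper names; the real calls are spread across TensorTopK.cpp and TensorTopK.cu):

#include <ATen/core/TensorBase.h>
#include <c10/core/WrapDimMinimal.h>

// Host-side wrapper (the TensorTopK.cpp side) wraps the dim, reading self.dim().
int64_t wrap_dim_host(const at::TensorBase& self, int64_t dim) {
  return c10::maybe_wrap_dim(dim, self.dim());
}

// Kernel launcher (the TensorTopK.cu side) reads self.dim() again. With the two
// functions in separate translation units the compiler can no longer merge the
// queries, but this cost is tiny next to the actual kernel launch.
int64_t slice_size_device(const at::TensorBase& self, int64_t dim) {
  return self.dim() == 0 ? 1 : self.size(dim);
}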

numDims = numDims == 0 ? 1 : numDims;
TORCH_CHECK(numDims <= MAX_DIMS, "input tensor has too many dimensions");
int64_t sliceSize = self.dim() == 0 ? 1 : self.size(dim);

Tensor input = self.contiguous();

// If k is 0 the result is an empty tensor, so we don't need to launch a kernel.
if (k == 0) {
return;
}
auto input = self.contiguous();
// static_cast is required to ensure that the correct type (INDEX_T)
// is provided to the kernel for the arguments.

@@ -297,32 +288,6 @@ TORCH_IMPL_FUNC(topk_out_cuda)
#undef RUN_DIM
#undef RUN_DIR
#undef RUN_K

// Sort the results if the user wants them sorted, since our
// selection routine does not ensure sorting
if (sorted && values.numel() > 1) {
if (should_use_small_sort(values, dim)) {
// This avoids any memory allocations and performs all sorting
// work inplace along the slice

sortKeyValueInplace(values, indices, dim, largest);
} else {
// Depend upon the backup sort that returns indices, which we
// can use in conjunction with gather to produce the original
// indices.
// This is not the most efficient implementation, especially since
// there are memory allocations performed here. If the user desires
// greater performance, they should torch.gather() the results
// themselves using the reported indices, providing previously
// allocated tensors to receive the results.

Tensor sortedIndices = at::empty_like(indices);
Tensor sortedValues = at::empty_like(values);
sort_out_cuda(values, dim, largest, sortedValues, sortedIndices);
indices.copy_(indices.gather(dim, sortedIndices));
values.copy_(sortedValues);
}
}
}

} // at::native
14 changes: 14 additions & 0 deletions aten/src/ATen/native/cuda/TensorTopK.h
@@ -0,0 +1,14 @@
#pragma once
#include <cstdint>

namespace at {
class TensorBase;
}

namespace at {
namespace native {
void launch_gather_topk_kernel(
    const TensorBase& self,
    int64_t k, int64_t dim, bool largest, bool sorted,
    const TensorBase& values, const TensorBase& indices);
}}
2 changes: 1 addition & 1 deletion caffe2/CMakeLists.txt
@@ -176,7 +176,7 @@ endif()
if(BUILD_SPLIT_CUDA)
# Splitting the source files that'll be in torch_cuda between torch_cuda_cu and torch_cuda_cpp
foreach(tmp ${Caffe2_GPU_SRCS})
if("${tmp}" MATCHES "(.*aten.*\\.cu|.*(b|B)las.*|.*((s|S)olver|Register.*CUDA|Legacy|THC|TensorShapeCUDA|BatchLinearAlgebra|ReduceOps|Equal|Activation|ScanKernels|Sort).*\\.cpp)" AND NOT "${tmp}" MATCHES ".*(THC((CachingHost)?Allocator|General)).*")
if("${tmp}" MATCHES "(.*aten.*\\.cu|.*(b|B)las.*|.*((s|S)olver|Register.*CUDA|Legacy|THC|TensorShapeCUDA|BatchLinearAlgebra|ReduceOps|Equal|Activation|ScanKernels|Sort|TensorTopK).*\\.cpp)" AND NOT "${tmp}" MATCHES ".*(THC((CachingHost)?Allocator|General)).*")
# Currently, torch_cuda_cu will have all the .cu files in aten, as well as some others that depend on those files
list(APPEND Caffe2_GPU_SRCS_CU ${tmp})
else()