8000 [GPU] Update the cache info for padding resolver in cloneWithSimplifiedConfig. by hanhanW · Pull Request #20371 · iree-org/iree · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

[GPU] Update the cache info for padding resolver in cloneWithSimplifiedConfig. #20371

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
// RUN: --iree-hip-target=gfx90a --iree-hip-encoding-layout-resolver=none %s | FileCheck %s --check-prefix=NONE

// PAD: #hal.executable.target<"rocm"
// PAD-SAME: iree.encoding.resolver = #iree_gpu.gpu_pad_layout<cache_line_bytes = 128, cache_sets = 4>
// PAD-SAME: iree.encoding.resolver = #iree_gpu.gpu_pad_layout<>

// DATA-TILING: #hal.executable.target<"rocm"
// DATA-TILING-SAME: iree.encoding.resolver = #iree_gpu.gpu_encoding_layout<>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -322,8 +322,8 @@ def IREEGPU_GPUPadLayoutAttr : AttrDef<IREEGPU_Dialect, "GPUPadLayout"> {
let parameters = (ins
// Relevant target properties that will later allow us to decide the
// serialized pad layout.
"uint32_t":$cache_line_bytes,
"uint32_t":$cache_sets
OptionalParameter<"std::optional<uint32_t>">:$cache_line_bytes,
OptionalParameter<"std::optional<uint32_t>">:$cache_sets
);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -740,6 +740,14 @@ std::optional<TargetDetails> getAndroidProfileDetails(StringRef target) {
// Query functions
//===----------------------------------------------------------------------===//

std::optional<L1CacheInfo> getL1CacheInfo(TargetAttr target) {
// TODO(kuhar): Add L1 cache query for other HIP targets.
if (!target || !llvm::is_contained({"gfx90a", "gfx942"}, target.getArch())) {
return std::nullopt;
}
return L1CacheInfo{/*cacheLineBytes=*/128, /*cacheSets=*/4};
}

TargetAttr getMetalTargetDetails(MLIRContext *context) {
return createTargetAttr(*getAppleTargetDetails(), /*arch=*/"apple",
/*features=*/"spirv:v1.3,cap:Shader", context);
Expand Down Expand Up @@ -775,12 +783,10 @@ Attribute getHIPTargetEncodingLayoutAttr(TargetAttr target,
return IREE::GPU::GPUEncodingLayoutAttr::get(target.getContext(), {});
}

// GPUPadLayoutAttr is only enabled for CDNA2 and CDNA3 for the time being.
// TODO(kuhar): Enable for other HIP targets.
if (resolver == kPadEncodingLayoutResolverName &&
llvm::is_contained({"gfx90a", "gfx942"}, target.getArch())) {
return IREE::GPU::GPUPadLayoutAttr::get(
target.getContext(), /*cacheLineBytes=*/128, /*cacheSets=*/4);
if (resolver == kPadEncodingLayoutResolverName) {
return IREE::GPU::GPUPadLayoutAttr::get(target.getContext(),
/*cache_line_bytes=*/std::nullopt,
/*cache_sets=*/std::nullopt);
}
return nullptr;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,14 @@ constexpr char kNoEncodingLayoutResolverName[] = "none";
constexpr char kPadEncodingLayoutResolverName[] = "pad";
constexpr char kDataTilingEncodingLayoutResolverName[] = "data-tiling";

struct L1CacheInfo {
uint32_t cacheLineBytes;
uint32_t cacheSets;
};

// Returns the L1 cache information for the `target`.
std::optional<L1CacheInfo> getL1CacheInfo(TargetAttr target);

// Returns a TargetAttr to target Metal via SPIR-V CodeGen.
TargetAttr getMetalTargetDetails(MLIRContext *context);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ iree_compiler_cc_library(
"//compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR:IREECodegenDialect",
"//compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils",
"//compiler/src/iree/compiler/Codegen/Dialect/GPU/IR:IREEGPUDialect",
"//compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils:KnownTargets",
"//compiler/src/iree/compiler/Codegen/Utils",
"//compiler/src/iree/compiler/Dialect/Encoding/IR",
"@llvm-project//llvm:Support",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ iree_cc_library(
iree::compiler::Codegen::Dialect::Codegen::IR::IREECodegenDialect
iree::compiler::Codegen::Dialect::Codegen::Utils
iree::compiler::Codegen::Dialect::GPU::IR::IREEGPUDialect
iree::compiler::Codegen::Dialect::GPU::TargetUtils::KnownTargets
iree::compiler::Codegen::Utils
iree::compiler::Dialect::Encoding::IR
PUBLIC
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUDialect.h"
#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUOps.h"
#include "iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.h"
#include "iree/compiler/Codegen/ExternalInterfaces/Utils.h"
#include "iree/compiler/Codegen/Utils/GPUUtils.h"
#include "iree/compiler/Dialect/Encoding/IR/EncodingOps.h"
Expand Down Expand Up @@ -409,10 +410,15 @@ struct GPUPadEncodingLayoutResolverAttrInterface final
: Encoding::EncodingLayoutResolverAttrInterface::ExternalModel<
GPUPadEncodingLayoutResolverAttrInterface, GPUPadLayoutAttr> {
Attribute cloneWithSimplifiedConfig(Attribute attr,
DictionaryAttr /*config*/) const {
// This attribute is self-contained and does not need to look anything up
// from the target `config`.
return attr;
DictionaryAttr config) const {
MLIRContext *ctx = attr.getContext();
IREE::GPU::TargetAttr gpuTarget = getGPUTargetAttr(config);
std::optional<IREE::GPU::L1CacheInfo> cache =
IREE::GPU::getL1CacheInfo(gpuTarget);
if (!cache) {
return IREE::Encoding::IdentityEncodingAttr::get(ctx);
}
return GPUPadLayoutAttr::get(ctx, cache->cacheLineBytes, cache->cacheSets);
}

Attribute getLayout(Attribute attr, RankedTensorType type) const {
Expand All @@ -429,6 +435,10 @@ struct GPUPadEncodingLayoutResolverAttrInterface final
return noPaddingAttr;
}

if (!padLayoutAttr.getCacheLineBytes() || !padLayoutAttr.getCacheSets()) {
return noPaddingAttr;
}

const int64_t operandIndex = encodingAttr.getOperandIndex().getInt();
if (!llvm::is_contained({0, 1}, operandIndex)) {
// We only have to pad matmul operands.
Expand Down Expand Up @@ -481,7 +491,7 @@ struct GPUPadEncodingLayoutResolverAttrInterface final
}

const int64_t elementBits = type.getElementTypeBitWidth();
const int64_t cacheLineBytes = padLayoutAttr.getCacheLineBytes();
const int64_t cacheLineBytes = *padLayoutAttr.getCacheLineBytes();
if (elementBits % 8 != 0 || elementBits > cacheLineBytes) {
// We do not support unaligned element types.
return noPaddingAttr;
Expand All @@ -492,7 +502,7 @@ struct GPUPadEncodingLayoutResolverAttrInterface final
// cache line, but not a multiple of cache line * cache sets. This way the
// next 'row' will start at a different cache set.
const int64_t cacheSetSpanBytes =
padLayoutAttr.getCacheSets() * cacheLineBytes;
*padLayoutAttr.getCacheSets() * cacheLineBytes;
const int64_t dimSizeInBytes =
type.getDimSize(*padDimensionIndex) * (elementBits / 8);
if (dimSizeInBytes < cacheSetSpanBytes) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,19 @@ util.func public @gpu_with_encoding_layout(%d0: index, %d1: index) -> index {
#map2 = affine_map<(m, n, k) -> (m, n)>
#map3 = affine_map<(m, n, k) -> (n, k)>
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip",
iree.encoding.resolver = #iree_gpu.gpu_pad_layout<cache_line_bytes = 128, cache_sets = 4>, ukernels = "none"}>
iree.encoding.resolver = #iree_gpu.gpu_pad_layout<>,
iree.gpu.target = #iree_gpu.target<arch = "gfx942",
features = "",
wgp = <compute = fp32,
storage = b32,
subgroup = none,
dot = none,
mma = [<MFMA_F32_16x16x4_F32>],
subgroup_size_choices = [64],
max_workgroup_sizes = [1024, 1024, 1024],
max_thread_count_per_workgroup = 1024,
max_workgroup_memory_bytes = 65536,
max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>}>
#device_target_local_0_ = #hal.device.target<"local", {ordinal = 0 : index}, [#executable_target_rocm_hsaco_fb]> : !hal.device
#encodingA = #iree_encoding.encoding<operand_index = 0 : index, op_type = matmul, element_types = [f16, f16, f32], user_indexing_maps = [#map0, #map1, #map2]>
#encodingB = #iree_encoding.encoding<operand_index = 1 : index, op_type = matmul, element_types = [f16, f16, f32], user_indexing_maps = [#map0, #map1, #map2]>
Expand Down
Loading
0