iree-org · hanhanW · Apr 3, 2025 · Mar 25, 2025 · Mar 31, 2025 · Apr 2, 2025
@@ -11,7 +11,7 @@
 // RUN:   --iree-hip-target=gfx90a --iree-hip-encoding-layout-resolver=none %s | FileCheck %s --check-prefix=NONE
 
 // PAD:      #hal.executable.target<"rocm"
-// PAD-SAME:   iree.encoding.resolver = #iree_gpu.gpu_pad_layout<cache_line_bytes = 128, cache_sets = 4>
+// PAD-SAME:   iree.encoding.resolver = #iree_gpu.gpu_pad_layout<>
 
 // DATA-TILING:      #hal.executable.target<"rocm"
 // DATA-TILING-SAME:   iree.encoding.resolver = #iree_gpu.gpu_encoding_layout<>

@@ -322,8 +322,8 @@ def IREEGPU_GPUPadLayoutAttr : AttrDef<IREEGPU_Dialect, "GPUPadLayout"> {
   let parameters = (ins
     // Relevant target properties that will later allow us to decide the
     // serialized pad layout.
-    "uint32_t":$cache_line_bytes,
-    "uint32_t":$cache_sets
+    OptionalParameter<"std::optional<uint32_t>">:$cache_line_bytes,
+    OptionalParameter<"std::optional<uint32_t>">:$cache_sets
   );
 }
 

@@ -740,6 +740,14 @@ std::optional<TargetDetails> getAndroidProfileDetails(StringRef target) {
 // Query functions
 //===----------------------------------------------------------------------===//
 
+std::optional<L1CacheInfo> getL1CacheInfo(TargetAttr target) {
+  // TODO(kuhar): Add L1 cache query for other HIP targets.
+  if (!target || !llvm::is_contained({"gfx90a", "gfx942"}, target.getArch())) {
+    return std::nullopt;
+  }
+  return L1CacheInfo{/*cacheLineBytes=*/128, /*cacheSets=*/4};
+}
+
 TargetAttr getMetalTargetDetails(MLIRContext *context) {
   return createTargetAttr(*getAppleTargetDetails(), /*arch=*/"apple",
                           /*features=*/"spirv:v1.3,cap:Shader", context);
@@ -775,12 +783,10 @@ Attribute getHIPTargetEncodingLayoutAttr(TargetAttr target,
     return IREE::GPU::GPUEncodingLayoutAttr::get(target.getContext(), {});
   }
 
-  // GPUPadLayoutAttr is only enabled for CDNA2 and CDNA3 for the time being.
-  // TODO(kuhar): Enable for other HIP targets.
-  if (resolver == kPadEncodingLayoutResolverName &&
-      llvm::is_contained({"gfx90a", "gfx942"}, target.getArch())) {
-    return IREE::GPU::GPUPadLayoutAttr::get(
-        target.getContext(), /*cacheLineBytes=*/128, /*cacheSets=*/4);
+  if (resolver == kPadEncodingLayoutResolverName) {
+    return IREE::GPU::GPUPadLayoutAttr::get(target.getContext(),
+                                            /*cache_line_bytes=*/std::nullopt,
+                                            /*cache_sets=*/std::nullopt);
   }
   return nullptr;
 }

@@ -16,6 +16,14 @@ constexpr char kNoEncodingLayoutResolverName[] = "none";
 constexpr char kPadEncodingLayoutResolverName[] = "pad";
 constexpr char kDataTilingEncodingLayoutResolverName[] = "data-tiling";
 
+struct L1CacheInfo {
+  uint32_t cacheLineBytes;
+  uint32_t cacheSets;
+};
+
+// Returns the L1 cache information for the `target`.
+std::optional<L1CacheInfo> getL1CacheInfo(TargetAttr target);
+
 // Returns a TargetAttr to target Metal via SPIR-V CodeGen.
 TargetAttr getMetalTargetDetails(MLIRContext *context);
 

@@ -31,6 +31,7 @@ iree_compiler_cc_library(
         "//compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR:IREECodegenDialect",
         "//compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils",
         "//compiler/src/iree/compiler/Codegen/Dialect/GPU/IR:IREEGPUDialect",
+        "//compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils:KnownTargets",
         "//compiler/src/iree/compiler/Codegen/Utils",
         "//compiler/src/iree/compiler/Dialect/Encoding/IR",
         "@llvm-project//llvm:Support",

@@ -33,6 +33,7 @@ iree_cc_library(
     iree::compiler::Codegen::Dialect::Codegen::IR::IREECodegenDialect
     iree::compiler::Codegen::Dialect::Codegen::Utils
     iree::compiler::Codegen::Dialect::GPU::IR::IREEGPUDialect
+    iree::compiler::Codegen::Dialect::GPU::TargetUtils::KnownTargets
     iree::compiler::Codegen::Utils
     iree::compiler::Dialect::Encoding::IR
   PUBLIC

@@ -28,6 +28,7 @@
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUDialect.h"
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUOps.h"
+#include "iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.h"
 #include "iree/compiler/Codegen/ExternalInterfaces/Utils.h"
 #include "iree/compiler/Codegen/Utils/GPUUtils.h"
 #include "iree/compiler/Dialect/Encoding/IR/EncodingOps.h"
@@ -409,10 +410,15 @@ struct GPUPadEncodingLayoutResolverAttrInterface final
     : Encoding::EncodingLayoutResolverAttrInterface::ExternalModel<
           GPUPadEncodingLayoutResolverAttrInterface, GPUPadLayoutAttr> {
   Attribute cloneWithSimplifiedConfig(Attribute attr,
-                                      DictionaryAttr /*config*/) const {
-    // This attribute is self-contained and does not need to look anything up
-    // from the target `config`.
-    return attr;
+                                      DictionaryAttr config) const {
+    MLIRContext *ctx = attr.getContext();
+    IREE::GPU::TargetAttr gpuTarget = getGPUTargetAttr(config);
+    std::optional<IREE::GPU::L1CacheInfo> cache =
+        IREE::GPU::getL1CacheInfo(gpuTarget);
+    if (!cache) {
+      return IREE::Encoding::IdentityEncodingAttr::get(ctx);
+    }
+    return GPUPadLayoutAttr::get(ctx, cache->cacheLineBytes, cache->cacheSets);
   }
 
   Attribute getLayout(Attribute attr, RankedTensorType type) const {
@@ -429,6 +435,10 @@ struct GPUPadEncodingLayoutResolverAttrInterface final
       return noPaddingAttr;
     }
 
+    if (!padLayoutAttr.getCacheLineBytes() || !padLayoutAttr.getCacheSets()) {
+      return noPaddingAttr;
+    }
+
     const int64_t operandIndex = encodingAttr.getOperandIndex().getInt();
     if (!llvm::is_contained({0, 1}, operandIndex)) {
       // We only have to pad matmul operands.
@@ -481,7 +491,7 @@ struct GPUPadEncodingLayoutResolverAttrInterface final
     }
 
     const int64_t elementBits = type.getElementTypeBitWidth();
-    const int64_t cacheLineBytes = padLayoutAttr.getCacheLineBytes();
+    const int64_t cacheLineBytes = *padLayoutAttr.getCacheLineBytes();
     if (elementBits % 8 != 0 || elementBits > cacheLineBytes) {
       // We do not support unaligned element types.
       return noPaddingAttr;
@@ -492,7 +502,7 @@ struct GPUPadEncodingLayoutResolverAttrInterface final
     // cache line, but not a multiple of cache line * cache sets. This way the
     // next 'row' will start at a different cache set.
     const int64_t cacheSetSpanBytes =
-        padLayoutAttr.getCacheSets() * cacheLineBytes;
+        *padLayoutAttr.getCacheSets() * cacheLineBytes;
     const int64_t dimSizeInBytes =
         type.getDimSize(*padDimensionIndex) * (elementBits / 8);
     if (dimSizeInBytes < cacheSetSpanBytes) {

@@ -89,7 +89,19 @@ util.func public @gpu_with_encoding_layout(%d0: index, %d1: index) -> index {
 #map2 = affine_map<(m, n, k) -> (m, n)>
 #map3 = affine_map<(m, n, k) -> (n, k)>
 #executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip",
-  iree.encoding.resolver = #iree_gpu.gpu_pad_layout<cache_line_bytes = 128, cache_sets = 4>, ukernels = "none"}>
+  iree.encoding.resolver = #iree_gpu.gpu_pad_layout<>,
+  iree.gpu.target = #iree_gpu.target<arch = "gfx942",
+                                     features = "",
+                                     wgp = <compute = fp32,
+                                            storage =  b32,
+                                            subgroup =  none,
+                                            dot =  none,
+                                            mma = [<MFMA_F32_16x16x4_F32>],
+                                            subgroup_size_choices = [64],
+                                            max_workgroup_sizes = [1024, 1024, 1024],
+                                            max_thread_count_per_workgroup = 1024,
+                                            max_workgroup_memory_bytes = 65536,
+                                            max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>}>
 #device_target_local_0_ = #hal.device.target<"local", {ordinal = 0 : index}, [#executable_target_rocm_hsaco_fb]> : !hal.device
 #encodingA = #iree_encoding.encoding<operand_index = 0 : index, op_type = matmul, element_types = [f16, f16, f32], user_indexing_maps = [#map0, #map1, #map2]>
 #encodingB = #iree_encoding.encoding<operand_index = 1 : index, op_type = matmul, element_types = [f16, f16, f32], user_indexing_maps = [#map0, #map1, #map2]>