What happened?
The input IR for a weight-backward convolution with y = 1 and x = 3 is shown below:
module {
  func.func @conv_2d_bfloat16_weight_backward_16x24x16x96_nhwc_96x1x3x96_fhwc_nhwf_1x1s_0x1p_1x1d_1g_dispatch_0_conv_96x1x3x96x16x24x16_bf16xbf16xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = true>}>} {
    %cst = arith.constant 0.000000e+00 : bf16
    %cst_0 = arith.constant 0.000000e+00 : f32
    %c0 = arith.constant 0 : index
    %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") {iree_gpu.use_rocdl_buffer_instructions} : !iree_tensor_ext.dispatch.tensor<readonly:tensor<16x24x16x96xbf16>>
    %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") {iree_gpu.use_rocdl_buffer_instructions} : !iree_tensor_ext.dispatch.tensor<readonly:tensor<16x24x16x96xbf16>>
    %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) {iree_gpu.use_rocdl_buffer_instructions} : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<96x1x3x96xbf16>>
    %3 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [16, 24, 16, 96], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<16x24x16x96xbf16>> -> tensor<16x24x16x96xbf16>
    %4 = iree_tensor_ext.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [16, 24, 16, 96], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<16x24x16x96xbf16>> -> tensor<16x24x16x96xbf16>
    %padded = tensor.pad %3 low[0, 0, 1, 0] high[0, 0, 1, 0] {
    ^bb0(%arg0: index, %arg1: index, %arg2: index, %arg3: index):
      tensor.yield %cst : bf16
    } : tensor<16x24x16x96xbf16> to tensor<16x24x18x96xbf16>
    %5 = tensor.empty() : tensor<96x1x3x96xf32>
    %6 = linalg.fill ins(%cst_0 : f32) outs(%5 : tensor<96x1x3x96xf32>) -> tensor<96x1x3x96xf32>
    %7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d1 + d5, d2 + d6, d3)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d4, d5, d6, d0)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]} ins(%padded, %4 : tensor<16x24x18x96xbf16>, tensor<16x24x16x96xbf16>) outs(%6 : tensor<96x1x3x96xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_BF16>, promote_operands = [0, 1], reduction = [0, 0, 0, 0, 4], subgroup = [1, 1, 1, 1, 0], workgroup = [32, 1, 1, 32, 0]}>} {
    ^bb0(%in: bf16, %in_1: bf16, %out: f32):
      %10 = arith.extf %in : bf16 to f32
      %11 = arith.extf %in_1 : bf16 to f32
      %12 = arith.mulf %10, %11 : f32
      %13 = arith.addf %out, %12 : f32
      linalg.yield %13 : f32
    } -> tensor<96x1x3x96xf32>
    %8 = tensor.empty() : tensor<96x1x3x96xbf16>
    %9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%7 : tensor<96x1x3x96xf32>) outs(%8 : tensor<96x1x3x96xbf16>) {
    ^bb0(%in: f32, %out: bf16):
      %10 = arith.truncf %in : f32 to bf16
      linalg.yield %10 : bf16
    } -> tensor<96x1x3x96xbf16>
    iree_tensor_ext.dispatch.tensor.store %9, %2, offsets = [0, 0, 0, 0], sizes = [96, 1, 3, 96], strides = [1, 1, 1, 1] : tensor<96x1x3x96xbf16> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<96x1x3x96xbf16>>
    return
  }
}
Compilation fails at DecomposeIm2colPass with:
error: 'linalg.copy' op expected operand rank (4) to match the result rank of indexing_map #0 (1)
The error happens because the im2col op is now rank-reduced:
%7 = iree_linalg_ext.im2col strides = [1] dilations = [1] kernel_size = [16] m_offset = [0] * [1] k_offset = [0] * [1] batch_pos = [3] m_pos = [2] k_pos = [0, 1] input_k_perm = [0, 1, 2] ins(%padded : tensor<16x24x18x96xbf16>) outs(%6 : tensor<96x3x6144xbf16>) -> tensor<96x3x6144xbf16>
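For reference, the verifier rule behind the diagnostic can be seen in isolation: linalg.copy uses identity indexing maps, so its source and destination must have the same rank. The snippet below is a hypothetical stand-alone example (not IR produced by the pass) that should be rejected with the same kind of rank-mismatch error:
func.func @copy_rank_mismatch(%src: tensor<1x1x1x96xbf16>, %dest: tensor<96xbf16>) -> tensor<96xbf16> {
  // The source is rank 4 while the destination is rank 1, so the identity
  // indexing maps of linalg.copy cannot line up and verification fails.
  %0 = linalg.copy ins(%src : tensor<1x1x1x96xbf16>) outs(%dest : tensor<96xbf16>) -> tensor<96xbf16>
  return %0 : tensor<96xbf16>
}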
The implementation in #20633 reduces the rank of the extract_slice to 1 for this case; however, that does not work when there is a leading padding op, because the extract_slice and pad ops get swapped.
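To illustrate the rank reduction itself, here is a sketch only, with hypothetical offsets and tile sizes, not the exact IR emitted by the decomposition:
func.func @slice_sketch(%padded: tensor<16x24x18x96xbf16>, %b: index, %m: index, %k: index) -> (tensor<1x1x1x96xbf16>, tensor<96xbf16>) {
  // Slice without rank reduction: the three unit dimensions are kept, so
  // downstream ops built against the 4-D shape still match.
  %full = tensor.extract_slice %padded[%b, %m, %k, 0] [1, 1, 1, 96] [1, 1, 1, 1] : tensor<16x24x18x96xbf16> to tensor<1x1x1x96xbf16>
  // Rank-reduced slice (the form described above for #20633): the unit
  // dimensions are dropped, leaving a 1-D tensor. Once this slice is swapped
  // with the leading tensor.pad, the ranks of the slice and the linalg.copy
  // built around it no longer agree, which is the mismatch the verifier
  // reports.
  %reduced = tensor.extract_slice %padded[%b, %m, %k, 0] [1, 1, 1, 96] [1, 1, 1, 1] : tensor<16x24x18x96xbf16> to tensor<96xbf16>
  return %full, %reduced : tensor<1x1x1x96xbf16>, tensor<96xbf16>
}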