// IREE HAL executable for dispatch @main$async_dispatch_18, compiled for an AMD
// ROCm/HSACO GPU target (subgroup size choices [64], max workgroup size
// 1024x1024x1024 — see the #iree_gpu.target attribute below). The pipeline
// layout declares three storage-buffer bindings: 0 and 1 read-only, 2 writable,
// all Indirect. Two i32 push constants (ordinal 0/1) are cast to index and used
// as byte offsets for the binding(1) and binding(2) subspans respectively;
// binding(0) is at offset 0.
//
// Dataflow of the elementwise_broadcast_24x4608x64x2 kernel:
//  - %7:  f16 source from binding(1), loaded as tensor<4608x1x24x64x1x2xf16>.
//  - %12: first linalg.generic permutes %7's leading dims (indexing map
//         (d0..d5) -> (d2, d0, d1, d3, d4, d5)) and widens f16 -> f32
//         (arith.extf) into tensor<1x24x4608x64x1x2xf32>.
//  - %extracted_slice / %extracted_slice_0: the two halves of %12's trailing
//         size-2 dimension (index 0 and 1), each tensor<24x4608x64xf32>.
//  - %11 / %10: binding(0) read twice, at trailing offset 0 and 1, each
//         tensor<4608x64x2xf32>; in the second linalg.generic their maps drop
//         d0, so both are broadcast across the leading 24 dimension.
//  - %13: second linalg.generic computes, per element,
//         %11 * %extracted_slice + %10 * %extracted_slice_0, truncates
//         f32 -> f16, and the result tensor<24x4608x64x2xf16> is stored to
//         binding(2).
//
// NOTE(review): this paired-slice multiply-add against a per-position f32
// table broadcast over the head-like 24 dim resembles a rotary-embedding
// (complex-product) application — confirm against the producing model before
// relying on that interpretation.
//
// NOTE(review): several angle-bracket payloads appear stripped by text
// extraction (e.g. `!flow.dispatch.tensor>` is missing its element type,
// `#iree_gpu.target, , , , ,` is missing its wgp fields, and the
// `#hal.pipeline.layout` attributes are truncated). This dump will NOT
// re-parse as-is; recover the original IR from the compiler output rather
// than hand-repairing these attributes.
hal.executable public @main$async_dispatch_18 { hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target, , , , , ], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}>) { hal.executable.export public @main$async_dispatch_18_elementwise_broadcast_24x4608x64x2_f32xf32xf32xf32xf16 ordinal(0) layout(#hal.pipeline.layout, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>, #hal.interface.binding<0, 2>]} { ^bb0(%arg0: !hal.device): %x, %y, %z = flow.dispatch.workgroup_count_from_slice hal.return %x, %y, %z : index, index, index } builtin.module { func.func @main$async_dispatch_18_elementwise_broadcast_24x4608x64x2_f32xf32xf32xf32xf16() { %c0 = arith.constant 0 : index %0 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(0) : i32 %1 = hal.interface.constant.load layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) ordinal(1) : i32 %2 = arith.index_castui %0 : i32 to index %3 = arith.index_castui %1 : i32 to index %4 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %5 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%2) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> %6 = hal.interface.binding.subspan layout(, <1, storage_buffer, "ReadOnly|Indirect">, <2, 
storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(2) alignment(64) offset(%3) flags(Indirect) : !flow.dispatch.tensor> %7 = flow.dispatch.tensor.load %5, offsets = [0, 0, 0, 0, 0, 0], sizes = [4608, 1, 24, 64, 1, 2], strides = [1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4608x1x24x64x1x2xf16> %8 = tensor.empty() : tensor<24x4608x64x2xf16> %9 = tensor.empty() : tensor<1x24x4608x64x1x2xf32> %10 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0, 0, 1], sizes = [1, 1, 4608, 64, 2, 1], strides = [1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4608x64x2xf32> %11 = flow.dispatch.tensor.load %4, offsets = [0, 0, 0, 0, 0, 0], sizes = [1, 1, 4608, 64, 2, 1], strides = [1, 1, 1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<4608x64x2xf32> %12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d1, d3, d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%7 : tensor<4608x1x24x64x1x2xf16>) outs(%9 : tensor<1x24x4608x64x1x2xf32>) { ^bb0(%in: f16, %out: f32): %14 = arith.extf %in : f16 to f32 linalg.yield %14 : f32 } -> tensor<1x24x4608x64x1x2xf32> %extracted_slice = tensor.extract_slice %12[0, 0, 0, 0, 0, 0] [1, 24, 4608, 64, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x24x4608x64x1x2xf32> to tensor<24x4608x64xf32> %extracted_slice_0 = tensor.extract_slice %12[0, 0, 0, 0, 0, 1] [1, 24, 4608, 64, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x24x4608x64x1x2xf32> to tensor<24x4608x64xf32> %13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%11, %extracted_slice, %10, %extracted_slice_0 : tensor<4608x64x2xf32>, tensor<24x4608x64xf32>, 
tensor<4608x64x2xf32>, tensor<24x4608x64xf32>) outs(%8 : tensor<24x4608x64x2xf16>) { ^bb0(%in: f32, %in_1: f32, %in_2: f32, %in_3: f32, %out: f16): %14 = arith.mulf %in_2, %in_3 : f32 %15 = arith.mulf %in, %in_1 : f32 %16 = arith.addf %15, %14 : f32 %17 = arith.truncf %16 : f32 to f16 linalg.yield %17 : f16 } -> tensor<24x4608x64x2xf16> flow.dispatch.tensor.store %13, %6, offsets = [0, 0, 0, 0], sizes = [24, 4608, 64, 2], strides = [1, 1, 1, 1] : tensor<24x4608x64x2xf16> -> !flow.dispatch.tensor> return } } } }