Skip to content

Commit 26e1309

Browse files
authored
[AMDGPU][True16][CodeGen] true16 codegen pattern for v_pack_b32_f16 (#121988)
Add the true16 codegen pattern for v_pack_b32_f16: when UseRealTrue16Insts is set, PackB32Pat now selects V_PACK_B32_F16_t16_e64.
1 parent 305b25c commit 26e1309

File tree

7 files changed

+450
-84
lines changed

7 files changed

+450
-84
lines changed

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3390,6 +3390,9 @@ let SubtargetPredicate = isGFX9Plus in {
33903390
let True16Predicate = NotHasTrue16BitInsts in
33913391
def : PackB32Pat<V_PACK_B32_F16_e64>;
33923392

3393+
let True16Predicate = UseRealTrue16Insts in
3394+
def : PackB32Pat<V_PACK_B32_F16_t16_e64>;
3395+
33933396
let True16Predicate = UseFakeTrue16Insts in
33943397
def : PackB32Pat<V_PACK_B32_F16_fake16_e64>;
33953398
} // End SubtargetPredicate = isGFX9Plus

llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll

Lines changed: 2 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -160,14 +160,9 @@ define amdgpu_kernel void @ceil_v2f16(
160160
; GFX11-NEXT: s_waitcnt vmcnt(0)
161161
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
162162
; GFX11-NEXT: v_ceil_f16_e32 v0.l, v0.l
163-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
163+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
164164
; GFX11-NEXT: v_ceil_f16_e32 v0.h, v1.l
165-
; GFX11-NEXT: v_mov_b16_e32 v1.l, v0.l
166-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
167-
; GFX11-NEXT: v_mov_b16_e32 v0.l, v0.h
168-
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
169-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
170-
; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1
165+
; GFX11-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
171166
; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
172167
; GFX11-NEXT: s_endpgm
173168
;

llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll

Lines changed: 2 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -161,14 +161,9 @@ define amdgpu_kernel void @floor_v2f16(
161161
; GFX11-NEXT: s_waitcnt vmcnt(0)
162162
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
163163
; GFX11-NEXT: v_floor_f16_e32 v0.l, v0.l
164-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
164+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
165165
; GFX11-NEXT: v_floor_f16_e32 v0.h, v1.l
166-
; GFX11-NEXT: v_mov_b16_e32 v1.l, v0.l
167-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
168-
; GFX11-NEXT: v_mov_b16_e32 v0.l, v0.h
169-
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
170-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
171-
; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1
166+
; GFX11-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
172167
; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
173168
; GFX11-NEXT: s_endpgm
174169
;

llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll

Lines changed: 21 additions & 38 deletions
Original file line number | Diff line number | Diff line change
@@ -480,9 +480,8 @@ define <2 x half> @test_ldexp_v2f16_v2i32(<2 x half> %a, <2 x i32> %b) {
480480
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
481481
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v3.l, v2.l
482482
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v1.l
483-
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
484-
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
485-
; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
483+
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
484+
; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
486485
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
487486
;
488487
; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v2f16_v2i32:
@@ -610,9 +609,7 @@ define <2 x half> @test_ldexp_v2f16_v2i16(<2 x half> %a, <2 x i16> %b) {
610609
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v1.l
611610
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
612611
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v3.l, v2.l
613-
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
614-
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
615-
; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
612+
; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
616613
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
617614
;
618615
; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v2f16_v2i16:
@@ -737,15 +734,13 @@ define <3 x half> @test_ldexp_v3f16_v3i32(<3 x half> %a, <3 x i32> %b) {
737734
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
738735
; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v3, v3, s0, 0x7fff
739736
; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v2, v2, s0, 0x7fff
740-
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
737+
; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v4, v4, s0, 0x7fff
738+
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
741739
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v5.l, v3.l
742740
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l
743-
; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v2, v4, s0, 0x7fff
744741
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
745-
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.h
746-
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v2.l
747-
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
748-
; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v0, v3, v0, 0x5040100
742+
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v4.l
743+
; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
749744
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
750745
;
751746
; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v3f16_v3i32:
@@ -891,12 +886,9 @@ define <3 x half> @test_ldexp_v3f16_v3i16(<3 x half> %a, <3 x i16> %b) {
891886
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
892887
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l
893888
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v3.l
894-
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
889+
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
895890
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v5.l, v4.l
896-
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
897-
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
898-
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
899-
; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
891+
; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
900892
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
901893
;
902894
; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v3f16_v3i16:
@@ -1036,24 +1028,21 @@ define <4 x half> @test_ldexp_v4f16_v4i32(<4 x half> %a, <4 x i32> %b) {
10361028
; GFX11-SDAG-TRUE16: ; %bb.0:
10371029
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10381030
; GFX11-SDAG-TRUE16-NEXT: s_movk_i32 s0, 0x8000
1039-
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0
1031+
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1
10401032
; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v5, v5, s0, 0x7fff
10411033
; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v3, v3, s0, 0x7fff
1042-
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1
1034+
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v0
10431035
; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v2, v2, s0, 0x7fff
10441036
; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v4, v4, s0, 0x7fff
1037+
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.h, v6.l, v5.l
10451038
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1046-
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v6.l, v3.l
1047-
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.h, v7.l, v5.l
1048-
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1039+
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v7.l, v3.l
10491040
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l
1041+
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
10501042
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v4.l
1051-
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1052-
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
1053-
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h
1054-
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1055-
; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
1056-
; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
1043+
; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
1044+
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
1045+
; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v1, v1.l, v1.h
10571046
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
10581047
;
10591048
; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v4f16_v4i32:
@@ -1238,20 +1227,14 @@ define <4 x half> @test_ldexp_v4f16_v4i16(<4 x half> %a, <4 x i16> %b) {
12381227
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2
12391228
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0
12401229
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1
1230+
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v3.l
12411231
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l
1242-
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v1.l, v3.l
12431232
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1244-
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v6.l, v5.l
1233+
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v6.l, v5.l
12451234
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.h, v7.l, v4.l
1246-
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1247-
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
1248-
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.h
1249-
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1250-
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
1251-
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
12521235
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1253-
; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
1254-
; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v1, v1, v3, 0x5040100
1236+
; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
1237+
; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v1, v1.l, v1.h
12551238
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
12561239
;
12571240
; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v4f16_v4i16:

llvm/test/CodeGen/AMDGPU/sitofp.f16.ll

Lines changed: 7 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -237,14 +237,9 @@ define amdgpu_kernel void @sitofp_v2i16_to_v2f16(
237237
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
238238
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
239239
; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v0.l, v0.l
240-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
240+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
241241
; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v0.h, v1.l
242-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
243-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
244-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
245-
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
246-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
247-
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
242+
; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
248243
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
249244
; GFX11-TRUE16-NEXT: s_endpgm
250245
;
@@ -338,17 +333,13 @@ define amdgpu_kernel void @sitofp_v2i32_to_v2f16(
338333
; GFX11-TRUE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
339334
; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
340335
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
341-
; GFX11-TRUE16-NEXT: v_cvt_f32_i32_e32 v0, v0
342336
; GFX11-TRUE16-NEXT: v_cvt_f32_i32_e32 v1, v1
337+
; GFX11-TRUE16-NEXT: v_cvt_f32_i32_e32 v2, v0
343338
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
344-
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
345-
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1
346-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
347-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
348-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
349-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
350-
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
351-
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
339+
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v1
340+
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v2
341+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
342+
; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.h, v0.l
352343
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
353344
; GFX11-TRUE16-NEXT: s_endpgm
354345
;

llvm/test/CodeGen/AMDGPU/uitofp.f16.ll

Lines changed: 7 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -237,14 +237,9 @@ define amdgpu_kernel void @uitofp_v2i16_to_v2f16(
237237
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
238238
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
239239
; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v0.l, v0.l
240-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
240+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
241241
; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v0.h, v1.l
242-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
243-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
244-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
245-
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
246-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
247-
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
242+
; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
248243
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
249244
; GFX11-TRUE16-NEXT: s_endpgm
250245
;
@@ -338,17 +333,13 @@ define amdgpu_kernel void @uitofp_v2i32_to_v2f16(
338333
; GFX11-TRUE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
339334
; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
340335
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
341-
; GFX11-TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v0
342336
; GFX11-TRUE16-NEXT: v_cvt_f32_u32_e32 v1, v1
337+
; GFX11-TRUE16-NEXT: v_cvt_f32_u32_e32 v2, v0
343338
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
344-
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
345-
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1
346-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
347-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
348-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
349-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
350-
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
351-
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
339+
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v1
340+
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v2
341+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
342+
; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.h, v0.l
352343
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
353344
; GFX11-TRUE16-NEXT: s_endpgm
354345
;

0 commit comments

Comments
 (0)