@@ -9418,78 +9418,80 @@ define <8 x i16> @bitcast_v8bf16_to_v8i16(<8 x bfloat> %a, i32 %b) {
9418
9418
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1
9419
9419
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v0
9420
9420
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
9421
- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2 )
9421
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1 )
9422
9422
; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_add_f32 v4, 0x40c00000, v4
9423
- ; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
9424
- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
9425
9423
; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1
9424
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
9426
9425
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1
9427
- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
9428
- ; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v0, 16, 1
9429
- ; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
9430
- ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
9431
- ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v4
9432
- ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v4, 0x7fff
9433
- ; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v0, 0x7fff
9434
- ; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
9426
+ ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
9427
+ ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
9435
9428
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v5
9436
9429
; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
9437
- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
9438
- ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v8, v10, vcc_lo
9439
- ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
9430
+ ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v4, 0x7fff
9431
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9432
+ ; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v7, v8 :: v_dual_and_b32 v1, 0xffff0000, v1
9433
+ ; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v6, 16, v2
9434
+ ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9435
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
9436
+ ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
9437
+ ; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6
9438
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
9439
+ ; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v1, 16, 1
9440
+ ; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
9441
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
9442
+ ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v13, v1, 0x7fff
9440
9443
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v3
9441
9444
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
9442
- ; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
9443
- ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
9444
- ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo
9445
+ ; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
9446
+ ; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v2
9447
+ ; GFX11-TRUE16-NEXT: v_dual_add_f32 v8, 0x40c00000, v8 :: v_dual_add_f32 v3, 0x40c00000, v3
9448
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
9449
+ ; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1
9450
+ ; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0
9451
+ ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
9452
+ ; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v8, 16, 1
9453
+ ; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v3, 16, 1
9454
+ ; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v0, 0x7fff
9455
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
9456
+ ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc_lo
9445
9457
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
9446
- ; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_lshlrev_b32 v6, 16, v2
9447
- ; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v1, 16, 1
9448
- ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
9449
- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
9450
- ; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v5, v11, v12 :: v_dual_add_f32 v6, 0x40c00000, v6
9451
- ; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v3, 16, 1
9452
- ; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v3
9453
- ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v13, v1, 0x7fff
9454
- ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
9455
- ; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1
9456
- ; GFX11-TRUE16-NEXT: v_add3_u32 v12, v12, v3, 0x7fff
9457
- ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
9458
- ; GFX11-TRUE16-NEXT: v_dual_add_f32 v8, 0x40c00000, v8 :: v_dual_cndmask_b32 v1, v7, v9
9459
- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
9460
- ; GFX11-TRUE16-NEXT: v_add3_u32 v9, v10, v6, 0x7fff
9458
+ ; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1
9459
+ ; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v1
9460
+ ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v11, v12, vcc_lo
9461
+ ; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff
9461
9462
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v6
9462
- ; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
9463
- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
9464
- ; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v8, 16, 1
9465
- ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v8
9466
- ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
9463
+ ; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v2, 16, 1
9464
+ ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
9467
9465
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
9468
- ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
9469
- ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v2
9470
- ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
9471
- ; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v8, 0x7fff
9472
- ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
9473
- ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
9474
- ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v1, 16, v5
9475
- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
9476
- ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v4
9477
- ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v13, vcc_lo
9466
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
9467
+ ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v11, vcc_lo
9468
+ ; GFX11-TRUE16-NEXT: v_add3_u32 v9, v12, v2, 0x7fff
9469
+ ; GFX11-TRUE16-NEXT: v_add3_u32 v11, v13, v8, 0x7fff
9470
+ ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v8
9478
9471
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
9479
- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
9480
- ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
9481
- ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v10, v14, vcc_lo
9472
+ ; GFX11-TRUE16-NEXT: v_add3_u32 v13, v14, v3, 0x7fff
9473
+ ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v3
9474
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
9475
+ ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v11, v12, vcc_lo
9482
9476
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
9483
- ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7
9484
- ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v12, v15, vcc_lo
9485
- ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
9486
- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
9487
- ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
9488
- ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v11, vcc_lo
9489
- ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v3, 16, v7
9490
- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
9491
- ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6
9492
- ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v2, 16, v6
9477
+ ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v13, v14, vcc_lo
9478
+ ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
9479
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
9480
+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h
9481
+ ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v9, v15, vcc_lo
9482
+ ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
9483
+ ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v8
9484
+ ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v7, v10, vcc_lo
9485
+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
9486
+ ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v6
9487
+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.h
9488
+ ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v0, 16, v3
9489
+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.h
9490
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
9491
+ ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v1, 16, v2
9492
+ ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v7, 16, v4
9493
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
9494
+ ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v6, 16, v5
9493
9495
; GFX11-TRUE16-NEXT: .LBB47_2: ; %end
9494
9496
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
9495
9497
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
0 commit comments