Skip to content

Commit f04aa1f

Browse files
authored
[AMDGPU][CodeGen] Fold immediates in src1 operands of V_MAD/MAC/FMA/FMAC. (#68002)
1 parent 821dfc3 commit f04aa1f

18 files changed

+339
-356
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

+16-9
Original file line number | Diff line number | Diff line change
@@ -3250,9 +3250,12 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
32503250
MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
32513251

32523252
// Multiplied part is the constant: Use v_madmk_{f16, f32}.
3253-
// We should only expect these to be on src0 due to canonicalization.
3254-
if (Src0->isReg() && Src0->getReg() == Reg) {
3255-
if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
3253+
if ((Src0->isReg() && Src0->getReg() == Reg) ||
3254+
(Src1->isReg() && Src1->getReg() == Reg)) {
3255+
MachineOperand *RegSrc =
3256+
Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3257+
if (!RegSrc->isReg() ||
3258+
RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())))
32563259
return false;
32573260

32583261
if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
@@ -3266,18 +3269,22 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
32663269
if (pseudoToMCOpcode(NewOpc) == -1)
32673270
return false;
32683271

3269-
// We need to swap operands 0 and 1 since madmk constant is at operand 1.
3272+
// V_FMAMK_F16_t16 takes VGPR_32_Lo128 operands, so the rewrite
3273+
// would also require restricting their register classes. For now
3274+
// just bail out.
3275+
if (NewOpc == AMDGPU::V_FMAMK_F16_t16)
3276+
return false;
32703277

32713278
const int64_t Imm = ImmOp->getImm();
32723279

32733280
// FIXME: This would be a lot easier if we could return a new instruction
32743281
// instead of having to modify in place.
32753282

3276-
Register Src1Reg = Src1->getReg();
3277-
unsigned Src1SubReg = Src1->getSubReg();
3278-
Src0->setReg(Src1Reg);
3279-
Src0->setSubReg(Src1SubReg);
3280-
Src0->setIsKill(Src1->isKill());
3283+
Register SrcReg = RegSrc->getReg();
3284+
unsigned SrcSubReg = RegSrc->getSubReg();
3285+
Src0->setReg(SrcReg);
3286+
Src0->setSubReg(SrcSubReg);
3287+
Src0->setIsKill(RegSrc->isKill());
32813288

32823289
if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
32833290
Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||

llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll

+26-26
Original file line number | Diff line number | Diff line change
@@ -7149,7 +7149,7 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
71497149
; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
71507150
; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
71517151
; GFX6-NEXT: v_trunc_f32_e32 v1, v1
7152-
; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
7152+
; GFX6-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
71537153
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
71547154
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
71557155
; GFX6-NEXT: s_movk_i32 s8, 0x11f
@@ -7269,7 +7269,7 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
72697269
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
72707270
; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
72717271
; GFX9-NEXT: v_trunc_f32_e32 v1, v1
7272-
; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
7272+
; GFX9-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
72737273
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
72747274
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
72757275
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
@@ -7533,21 +7533,21 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, <
75337533
; GFX6-NEXT: v_madak_f32 v0, 0, v0, 0x457ff000
75347534
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
75357535
; GFX6-NEXT: s_movk_i32 s6, 0xf001
7536-
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
7537-
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
75387536
; GFX6-NEXT: s_movk_i32 s8, 0xfff
7537+
; GFX6-NEXT: s_mov_b32 s7, 0xf000
75397538
; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
75407539
; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
75417540
; GFX6-NEXT: v_trunc_f32_e32 v1, v1
7542-
; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
7541+
; GFX6-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
75437542
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
75447543
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
7545-
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
7546-
; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], 12
7547-
; GFX6-NEXT: s_mov_b32 s7, 0xf000
7544+
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
7545+
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
75487546
; GFX6-NEXT: v_mul_hi_u32 v2, v0, s6
75497547
; GFX6-NEXT: v_mul_lo_u32 v4, v1, s6
75507548
; GFX6-NEXT: v_mul_lo_u32 v3, v0, s6
7549+
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
7550+
; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], 12
75517551
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v0
75527552
; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4
75537553
; GFX6-NEXT: v_mul_hi_u32 v5, v0, v3
@@ -7647,7 +7647,7 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, <
76477647
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
76487648
; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
76497649
; GFX9-NEXT: v_trunc_f32_e32 v1, v1
7650-
; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
7650+
; GFX9-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
76517651
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
76527652
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
76537653
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -7834,7 +7834,7 @@ define amdgpu_kernel void @urem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
78347834
; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
78357835
; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
78367836
; GFX6-NEXT: v_trunc_f32_e32 v1, v1
7837-
; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
7837+
; GFX6-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
78387838
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
78397839
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
78407840
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
@@ -7954,7 +7954,7 @@ define amdgpu_kernel void @urem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
79547954
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
79557955
; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
79567956
; GFX9-NEXT: v_trunc_f32_e32 v1, v1
7957-
; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
7957+
; GFX9-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
79587958
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
79597959
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
79607960
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
@@ -8283,7 +8283,7 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
82838283
; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
82848284
; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
82858285
; GFX6-NEXT: v_trunc_f32_e32 v1, v1
8286-
; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
8286+
; GFX6-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
82878287
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
82888288
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
82898289
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
@@ -8399,7 +8399,7 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
83998399
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
84008400
; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
84018401
; GFX9-NEXT: v_trunc_f32_e32 v1, v1
8402-
; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
8402+
; GFX9-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
84038403
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
84048404
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
84058405
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
@@ -8589,14 +8589,14 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
85898589
; GFX6-NEXT: s_sub_u32 s4, 0, s10
85908590
; GFX6-NEXT: s_subb_u32 s5, 0, s11
85918591
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
8592-
; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
8592+
; GFX6-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
85938593
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
85948594
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
85958595
; GFX6-NEXT: s_ashr_i32 s12, s3, 31
85968596
; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
85978597
; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
85988598
; GFX6-NEXT: v_trunc_f32_e32 v1, v1
8599-
; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
8599+
; GFX6-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
86008600
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
86018601
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
86028602
; GFX6-NEXT: s_add_u32 s2, s2, s12
@@ -8724,13 +8724,13 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
87248724
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
87258725
; GFX9-NEXT: s_sub_u32 s0, 0, s8
87268726
; GFX9-NEXT: s_subb_u32 s1, 0, s9
8727-
; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
8727+
; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
87288728
; GFX9-NEXT: v_rcp_f32_e32 v1, v0
87298729
; GFX9-NEXT: v_mov_b32_e32 v0, 0
87308730
; GFX9-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1
87318731
; GFX9-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1
87328732
; GFX9-NEXT: v_trunc_f32_e32 v2, v2
8733-
; GFX9-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2
8733+
; GFX9-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1
87348734
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
87358735
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
87368736
; GFX9-NEXT: v_readfirstlane_b32 s10, v2
@@ -8944,14 +8944,14 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out,
89448944
; GFX6-NEXT: v_mov_b32_e32 v1, 0x4f800000
89458945
; GFX6-NEXT: v_mac_f32_e32 v0, 0, v1
89468946
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
8947-
; GFX6-NEXT: s_movk_i32 s6, 0xf001
89488947
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
89498948
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd
8949+
; GFX6-NEXT: s_movk_i32 s6, 0xf001
89508950
; GFX6-NEXT: s_mov_b32 s7, 0xf000
89518951
; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
89528952
; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
89538953
; GFX6-NEXT: v_trunc_f32_e32 v1, v1
8954-
; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
8954+
; GFX6-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
89558955
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
89568956
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
89578957
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
@@ -9073,7 +9073,7 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out,
90739073
; GFX9-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1
90749074
; GFX9-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1
90759075
; GFX9-NEXT: v_trunc_f32_e32 v2, v2
9076-
; GFX9-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2
9076+
; GFX9-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1
90779077
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
90789078
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
90799079
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -9789,7 +9789,7 @@ define amdgpu_kernel void @srem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
97899789
; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
97909790
; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
97919791
; GFX6-NEXT: v_trunc_f32_e32 v1, v1
9792-
; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
9792+
; GFX6-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
97939793
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
97949794
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
97959795
; GFX6-NEXT: s_mov_b32 s2, -1
@@ -9903,7 +9903,7 @@ define amdgpu_kernel void @srem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
99039903
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
99049904
; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
99059905
; GFX9-NEXT: v_trunc_f32_e32 v1, v1
9906-
; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
9906+
; GFX9-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
99079907
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
99089908
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
99099909
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
@@ -10093,14 +10093,14 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
1009310093
; GFX6-NEXT: s_sub_u32 s4, 0, s8
1009410094
; GFX6-NEXT: s_subb_u32 s5, 0, s9
1009510095
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
10096-
; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
10096+
; GFX6-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
1009710097
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
1009810098
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
1009910099
; GFX6-NEXT: s_ashr_i32 s10, s3, 31
1010010100
; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
1010110101
; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
1010210102
; GFX6-NEXT: v_trunc_f32_e32 v1, v1
10103-
; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
10103+
; GFX6-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
1010410104
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
1010510105
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
1010610106
; GFX6-NEXT: s_add_u32 s2, s2, s10
@@ -10226,13 +10226,13 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
1022610226
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1022710227
; GFX9-NEXT: s_sub_u32 s0, 0, s8
1022810228
; GFX9-NEXT: s_subb_u32 s1, 0, s9
10229-
; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
10229+
; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
1023010230
; GFX9-NEXT: v_rcp_f32_e32 v1, v0
1023110231
; GFX9-NEXT: v_mov_b32_e32 v0, 0
1023210232
; GFX9-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1
1023310233
; GFX9-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1
1023410234
; GFX9-NEXT: v_trunc_f32_e32 v2, v2
10235-
; GFX9-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2
10235+
; GFX9-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1
1023610236
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
1023710237
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
1023810238
; GFX9-NEXT: v_readfirstlane_b32 s2, v2

0 commit comments

Comments
 (0)