Skip to content

Commit a2263eb

Browse files
authored
AMDGPU: Handle gfx950 XDL-write-VGPR-VALU-Mem-Exp wait state change (llvm#126727)
1 parent 71478ec commit a2263eb

7 files changed

+389
-374
lines changed

llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2606,12 +2606,14 @@ static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
26062606
return NumPasses + 3;
26072607
}
26082608

2609-
static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
2610-
// 2 pass -> 5
2611-
// 4 pass -> 7
2612-
// 8 pass -> 11
2613-
// 16 pass -> 19
2614-
return NumPasses + 3;
2609+
static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses,
2610+
bool IsGFX950) {
2611+
// xdl def cycles | gfx940 | gfx950
2612+
// 2 pass | 5 5
2613+
// 4 pass | 7 8
2614+
// 8 pass | 11 12
2615+
// 16 pass | 19 20
2616+
return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
26152617
}
26162618

26172619
static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
@@ -2762,7 +2764,8 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
27622764
} else if (ST.hasGFX940Insts()) {
27632765
NeedWaitStates =
27642766
isXDL(ST, *MFMA)
2765-
? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(NumPasses)
2767+
? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(
2768+
NumPasses, ST.hasGFX950Insts())
27662769
: GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(
27672770
NumPasses);
27682771
} else {

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x
4949
; GCN-NEXT: v_mov_b32_e32 v9, s17
5050
; GCN-NEXT: v_mov_b32_e32 v10, s18
5151
; GCN-NEXT: v_mov_b32_e32 v11, s19
52-
; GCN-NEXT: s_nop 3
52+
; GCN-NEXT: s_nop 4
5353
; GCN-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
5454
; GCN-NEXT: s_waitcnt vmcnt(0)
5555
; GCN-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
@@ -122,7 +122,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__flags(<8 x bfloat> %arg0
122122
; GCN-NEXT: v_mov_b32_e32 v9, s17
123123
; GCN-NEXT: v_mov_b32_e32 v10, s18
124124
; GCN-NEXT: v_mov_b32_e32 v11, s19
125-
; GCN-NEXT: s_nop 3
125+
; GCN-NEXT: s_nop 4
126126
; GCN-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
127127
; GCN-NEXT: s_waitcnt vmcnt(0)
128128
; GCN-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
@@ -179,7 +179,7 @@ define <16 x float> @test_mfma_f32_32x32x16_bf16__mac(<8 x bfloat> %arg0, <8 x b
179179
; GCN-NEXT: s_nop 1
180180
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15]
181181
; GCN-NEXT: s_nop 7
182-
; GCN-NEXT: s_nop 2
182+
; GCN-NEXT: s_nop 3
183183
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
184184
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
185185
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@@ -224,7 +224,7 @@ define <16 x float> @test_mfma_f32_32x32x16_bf16__mac__flags(<8 x bfloat> %arg0,
224224
; GCN-NEXT: s_nop 1
225225
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
226226
; GCN-NEXT: s_nop 7
227-
; GCN-NEXT: s_nop 2
227+
; GCN-NEXT: s_nop 3
228228
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
229229
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
230230
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@@ -417,7 +417,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac(<8 x bfloat>
417417
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15]
418418
; GCN-NEXT: v_mov_b32_e32 v0, 0
419419
; GCN-NEXT: s_nop 7
420-
; GCN-NEXT: s_nop 1
420+
; GCN-NEXT: s_nop 2
421421
; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
422422
; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
423423
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
@@ -459,7 +459,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac_flags(<8 x bf
459459
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
460460
; GCN-NEXT: v_mov_b32_e32 v0, 0
461461
; GCN-NEXT: s_nop 7
462-
; GCN-NEXT: s_nop 1
462+
; GCN-NEXT: s_nop 2
463463
; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
464464
; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
465465
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16

0 commit comments

Comments
 (0)