Skip to content

Commit 93bcb84

Browse files
cdevadas, David Salinas
authored and
David Salinas
committed
[AMDGPU] Include WWM register spill into BB Prolog (llvm#111496)
With llvm#93526 we split the regalloc pipeline further to have a standalone allocation for wwm registers and per-lane VGPRs. Currently, the presence of the wwm-spill reloads inserted at the bb-top limits the ability of the isBasicBlockPrologue function, during the per-lane vgpr regalloc, to skip past the exec manipulation instruction, which ends up causing incorrect codegen. The wwm-spill inserted during the wwm-regalloc pipeline should also be included in the bb-prolog so that the per-lane vgpr regalloc pipeline can identify the appropriate insertion points for its spills and copies. Change-Id: Icb5596a4ca8204414d54b4b30b614b46927accc2
1 parent 06a4f5c commit 93bcb84

12 files changed

+368
-328
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8985,7 +8985,7 @@ bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI,
89858985

89868986
uint16_t Opcode = MI.getOpcode();
89878987
return IsNullOrVectorRegister &&
8988-
(isSGPRSpill(Opcode) ||
8988+
(isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode) ||
89898989
(!MI.isTerminator() && Opcode != AMDGPU::COPY &&
89908990
MI.modifiesRegister(AMDGPU::EXEC, &RI)));
89918991
}

llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,9 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
6868
; CHECK-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
6969
; CHECK-NEXT: s_mov_b32 exec_lo, s21
7070
; CHECK-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
71+
; CHECK-NEXT: s_or_saveexec_b32 s21, -1
72+
; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
73+
; CHECK-NEXT: s_mov_b32 exec_lo, s21
7174
; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
7275
; CHECK-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
7376
; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
@@ -84,10 +87,7 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
8487
; CHECK-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
8588
; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
8689
; CHECK-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
87-
; CHECK-NEXT: s_or_saveexec_b32 s21, -1
88-
; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
89-
; CHECK-NEXT: s_mov_b32 exec_lo, s21
90-
; CHECK-NEXT: s_waitcnt vmcnt(1)
90+
; CHECK-NEXT: s_waitcnt vmcnt(0)
9191
; CHECK-NEXT: v_readfirstlane_b32 s12, v7
9292
; CHECK-NEXT: v_readfirstlane_b32 s10, v6
9393
; CHECK-NEXT: v_readfirstlane_b32 s9, v5
@@ -104,7 +104,6 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
104104
; CHECK-NEXT: s_mov_b32 s17, s6
105105
; CHECK-NEXT: s_mov_b32 s18, s5
106106
; CHECK-NEXT: s_mov_b32 s19, s4
107-
; CHECK-NEXT: s_waitcnt vmcnt(0)
108107
; CHECK-NEXT: v_writelane_b32 v16, s12, 5
109108
; CHECK-NEXT: v_writelane_b32 v16, s13, 6
110109
; CHECK-NEXT: v_writelane_b32 v16, s14, 7
@@ -138,8 +137,6 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
138137
; CHECK-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
139138
; CHECK-NEXT: s_mov_b32 exec_lo, s21
140139
; CHECK-NEXT: ; %bb.2: ; in Loop: Header=BB0_1 Depth=1
141-
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
142-
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
143140
; CHECK-NEXT: s_or_saveexec_b32 s21, -1
144141
; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
145142
; CHECK-NEXT: s_mov_b32 exec_lo, s21
@@ -157,6 +154,9 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
157154
; CHECK-NEXT: v_readlane_b32 s17, v16, 1
158155
; CHECK-NEXT: v_readlane_b32 s18, v16, 2
159156
; CHECK-NEXT: v_readlane_b32 s19, v16, 3
157+
; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
158+
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
159+
; CHECK-NEXT: s_waitcnt vmcnt(0)
160160
; CHECK-NEXT: image_sample v0, v[0:1], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D
161161
; CHECK-NEXT: s_waitcnt vmcnt(0)
162162
; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill

llvm/test/CodeGen/AMDGPU/collapse-endcf.ll

Lines changed: 47 additions & 42 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,6 @@
4646

4747
; VMEM: [[ENDIF]]:
4848

49-
; Restore val
50-
; VGPR: buffer_load_dword [[RELOAD_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[VAL_OFFSET]] ; 4-byte Folded Reload
51-
5249
; Reload and restore exec mask
5350
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]]
5451
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]]
@@ -61,7 +58,7 @@
6158
; GCN: s_or_b64 exec, exec, s[[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]]
6259

6360
; Restore val
64-
; VMEM: buffer_load_dword [[RELOAD_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[VAL_OFFSET]] ; 4-byte Folded Reload
61+
; GCN: buffer_load_dword [[RELOAD_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[VAL_OFFSET]] ; 4-byte Folded Reload
6562

6663
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RELOAD_VAL]]
6764

@@ -123,7 +120,6 @@ endif:
123120
; GCN: buffer_store_dword v[[VAL_LOOP_RELOAD]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET:[0-9]+]] ; 4-byte Folded Spill
124121

125122
; GCN: [[END]]:
126-
; VGPR: buffer_load_dword v[[VAL_END:[0-9]+]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET]] ; 4-byte Folded Reload
127123
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]]
128124
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]]
129125

@@ -133,7 +129,7 @@ endif:
133129
; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 1
134130

135131
; GCN: s_or_b64 exec, exec, s[[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]]
136-
; VMEM: buffer_load_dword v[[VAL_END:[0-9]+]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET]] ; 4-byte Folded Reload
132+
; GCN: buffer_load_dword v[[VAL_END:[0-9]+]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET]] ; 4-byte Folded Reload
137133

138134
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[VAL_END]]
139135

@@ -193,7 +189,6 @@ end:
193189
; GCN-NEXT: s_branch [[ELSE:.LBB[0-9]+_[0-9]+]]
194190

195191
; GCN: [[FLOW]]: ; %Flow
196-
; VGPR: buffer_load_dword [[FLOW_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[FLOW_VAL_OFFSET:[0-9]+]] ; 4-byte Folded Reload
197192
; VGPR: buffer_load_dword [[SPILL_VGPR:v[0-9]+]], off, s[0:3], 0 ; 4-byte Folded Reload
198193
; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]]
199194
; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]]
@@ -205,7 +200,7 @@ end:
205200

206201
; GCN: s_or_saveexec_b64 s[[[FLOW_S_RELOAD_SAVEEXEC_LO_SAVEEXEC:[0-9]+]]:[[FLOW_S_RELOAD_SAVEEXEC_HI_SAVEEXEC:[0-9]+]]], s[[[FLOW_S_RELOAD_SAVEEXEC_LO]]:[[FLOW_S_RELOAD_SAVEEXEC_HI]]]
207202

208-
; VMEM: buffer_load_dword [[FLOW_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[FLOW_VAL_OFFSET:[0-9]+]] ; 4-byte Folded Reload
203+
; GCN: buffer_load_dword [[FLOW_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[FLOW_VAL_OFFSET:[0-9]+]] ; 4-byte Folded Reload
209204

210205
; Regular spill value restored after exec modification
211206
; Followed by spill
@@ -239,7 +234,6 @@ end:
239234
; GCN-NEXT: s_branch [[FLOW]]
240235

241236
; GCN: [[ENDIF]]:
242-
; VGPR: buffer_load_dword v[[RESULT:[0-9]+]], off, s[0:3], 0 offset:[[RESULT_OFFSET]] ; 4-byte Folded Reload
243237
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[FLOW_SAVEEXEC_LO_LANE]]
244238
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[FLOW_SAVEEXEC_HI_LANE]]
245239

@@ -251,7 +245,7 @@ end:
251245

252246
; GCN: s_or_b64 exec, exec, s[[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]]
253247

254-
; VMEM: buffer_load_dword v[[RESULT:[0-9]+]], off, s[0:3], 0 offset:[[RESULT_OFFSET]] ; 4-byte Folded Reload
248+
; GCN: buffer_load_dword v[[RESULT:[0-9]+]], off, s[0:3], 0 offset:[[RESULT_OFFSET]] ; 4-byte Folded Reload
255249

256250
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RESULT]]
257251
define amdgpu_kernel void @divergent_if_else_endif(ptr addrspace(1) %out) #0 {

0 commit comments

Comments
 (0)