Skip to content

Commit 4b9112e

Browse files
authored
[AMDGPU]Optimize SGPR spills (llvm#93668)
This PR is dependent on [llvm#93779](llvm#93779). Currently, each SGPR spill is lowered into its own distinct stack slot in the stack frame after the SGPR allocation phase. This patch therefore uses StackSlotColoring to share stack slots, where possible, between frame indices whose SGPR spills occur in non-interfering regions. StackSlotColoring is run immediately after SGPR register allocation to ensure that any further lowering happens on the optimally allocated stack slots, with certain flags indicating which analysis results are preserved for later use by the register allocation of other register classes.
1 parent 7dbc168 commit 4b9112e

7 files changed

+126
-102
lines changed

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1437,6 +1437,11 @@ bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
14371437
// since FastRegAlloc does the replacements itself.
14381438
addPass(createVirtRegRewriter(false));
14391439

1440+
// At this point, the sgpr-regalloc has been done and it is good to have the
1441+
// stack slot coloring to try to optimize the SGPR spill stack indices before
1442+
// attempting the custom SGPR spill lowering.
1443+
addPass(&StackSlotColoringID);
1444+
14401445
// Equivalent of PEI for SGPRs.
14411446
addPass(&SILowerSGPRSpillsID);
14421447
addPass(&SIPreAllocateWWMRegsID);

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1775,8 +1775,13 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index,
17751775

17761776
if (SpillToVGPR) {
17771777

1778-
assert(SB.NumSubRegs == VGPRSpills.size() &&
1779-
"Num of VGPR lanes should be equal to num of SGPRs spilled");
1778+
// Since stack slot coloring pass is trying to optimize SGPR spills,
1779+
// VGPR lanes (mapped from spill stack slot) may be shared for SGPR
1780+
// spills of different sizes. This accounts for number of VGPR lanes allotted
1781+
// equal to the largest SGPR being spilled in them.
1782+
assert(SB.NumSubRegs <= VGPRSpills.size() &&
1783+
"Num of SGPRs spilled should be less than or equal to num of "
1784+
"the VGPR lanes.");
17801785

17811786
for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
17821787
Register SubReg =

llvm/test/CodeGen/AMDGPU/llc-pipeline.ll

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -366,10 +366,12 @@
366366
; GCN-O1-NEXT: Machine Optimization Remark Emitter
367367
; GCN-O1-NEXT: Greedy Register Allocator
368368
; GCN-O1-NEXT: Virtual Register Rewriter
369+
; GCN-O1-NEXT: Stack Slot Coloring
369370
; GCN-O1-NEXT: SI lower SGPR spill instructions
370371
; GCN-O1-NEXT: Virtual Register Map
371372
; GCN-O1-NEXT: Live Register Matrix
372373
; GCN-O1-NEXT: SI Pre-allocate WWM Registers
374+
; GCN-O1-NEXT: Live Stack Slot Analysis
373375
; GCN-O1-NEXT: Greedy Register Allocator
374376
; GCN-O1-NEXT: SI Lower WWM Copies
375377
; GCN-O1-NEXT: GCN NSA Reassign
@@ -671,10 +673,12 @@
671673
; GCN-O1-OPTS-NEXT: Machine Optimization Remark Emitter
672674
; GCN-O1-OPTS-NEXT: Greedy Register Allocator
673675
; GCN-O1-OPTS-NEXT: Virtual Register Rewriter
676+
; GCN-O1-OPTS-NEXT: Stack Slot Coloring
674677
; GCN-O1-OPTS-NEXT: SI lower SGPR spill instructions
675678
; GCN-O1-OPTS-NEXT: Virtual Register Map
676679
; GCN-O1-OPTS-NEXT: Live Register Matrix
677680
; GCN-O1-OPTS-NEXT: SI Pre-allocate WWM Registers
681+
; GCN-O1-OPTS-NEXT: Live Stack Slot Analysis
678682
; GCN-O1-OPTS-NEXT: Greedy Register Allocator
679683
; GCN-O1-OPTS-NEXT: SI Lower WWM Copies
680684
; GCN-O1-OPTS-NEXT: GCN NSA Reassign
@@ -982,10 +986,12 @@
982986
; GCN-O2-NEXT: Machine Optimization Remark Emitter
983987
; GCN-O2-NEXT: Greedy Register Allocator
984988
; GCN-O2-NEXT: Virtual Register Rewriter
989+
; GCN-O2-NEXT: Stack Slot Coloring
985990
; GCN-O2-NEXT: SI lower SGPR spill instructions
986991
; GCN-O2-NEXT: Virtual Register Map
987992
; GCN-O2-NEXT: Live Register Matrix
988993
; GCN-O2-NEXT: SI Pre-allocate WWM Registers
994+
; GCN-O2-NEXT: Live Stack Slot Analysis
989995
; GCN-O2-NEXT: Greedy Register Allocator
990996
; GCN-O2-NEXT: SI Lower WWM Copies
991997
; GCN-O2-NEXT: GCN NSA Reassign
@@ -1305,10 +1311,12 @@
13051311
; GCN-O3-NEXT: Machine Optimization Remark Emitter
13061312
; GCN-O3-NEXT: Greedy Register Allocator
13071313
; GCN-O3-NEXT: Virtual Register Rewriter
1314+
; GCN-O3-NEXT: Stack Slot Coloring
13081315
; GCN-O3-NEXT: SI lower SGPR spill instructions
13091316
; GCN-O3-NEXT: Virtual Register Map
13101317
; GCN-O3-NEXT: Live Register Matrix
13111318
; GCN-O3-NEXT: SI Pre-allocate WWM Registers
1319+
; GCN-O3-NEXT: Live Stack Slot Analysis
13121320
; GCN-O3-NEXT: Greedy Register Allocator
13131321
; GCN-O3-NEXT: SI Lower WWM Copies
13141322
; GCN-O3-NEXT: GCN NSA Reassign

llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll

Lines changed: 36 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -221,15 +221,15 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
221221
; GFX906-NEXT: ; def s29
222222
; GFX906-NEXT: ;;#ASMEND
223223
; GFX906-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
224-
; GFX906-NEXT: v_writelane_b32 v40, s21, 24
225-
; GFX906-NEXT: v_writelane_b32 v40, s22, 25
226-
; GFX906-NEXT: v_writelane_b32 v40, s23, 26
227-
; GFX906-NEXT: v_writelane_b32 v40, s24, 27
228-
; GFX906-NEXT: v_writelane_b32 v40, s25, 28
229-
; GFX906-NEXT: v_writelane_b32 v40, s26, 29
230-
; GFX906-NEXT: v_writelane_b32 v40, s27, 30
231-
; GFX906-NEXT: v_writelane_b32 v40, s28, 31
232-
; GFX906-NEXT: v_writelane_b32 v40, s29, 32
224+
; GFX906-NEXT: v_writelane_b32 v40, s21, 12
225+
; GFX906-NEXT: v_writelane_b32 v40, s22, 13
226+
; GFX906-NEXT: v_writelane_b32 v40, s23, 14
227+
; GFX906-NEXT: v_writelane_b32 v40, s24, 15
228+
; GFX906-NEXT: v_writelane_b32 v40, s25, 16
229+
; GFX906-NEXT: v_writelane_b32 v40, s26, 17
230+
; GFX906-NEXT: v_writelane_b32 v40, s27, 18
231+
; GFX906-NEXT: v_writelane_b32 v40, s28, 19
232+
; GFX906-NEXT: v_writelane_b32 v40, s29, 20
233233
; GFX906-NEXT: v_readlane_b32 s4, v40, 10
234234
; GFX906-NEXT: v_readlane_b32 s6, v40, 0
235235
; GFX906-NEXT: v_readlane_b32 s8, v40, 8
@@ -249,39 +249,39 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
249249
; GFX906-NEXT: s_swappc_b64 s[30:31], s[16:17]
250250
; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1
251251
; GFX906-NEXT: s_mov_b64 exec, s[34:35]
252-
; GFX906-NEXT: v_readlane_b32 s21, v40, 24
252+
; GFX906-NEXT: v_readlane_b32 s21, v40, 12
253253
; GFX906-NEXT: ;;#ASMSTART
254254
; GFX906-NEXT: ; use s21
255255
; GFX906-NEXT: ;;#ASMEND
256-
; GFX906-NEXT: v_readlane_b32 s22, v40, 25
256+
; GFX906-NEXT: v_readlane_b32 s22, v40, 13
257257
; GFX906-NEXT: ;;#ASMSTART
258258
; GFX906-NEXT: ; use s22
259259
; GFX906-NEXT: ;;#ASMEND
260-
; GFX906-NEXT: v_readlane_b32 s23, v40, 26
260+
; GFX906-NEXT: v_readlane_b32 s23, v40, 14
261261
; GFX906-NEXT: ;;#ASMSTART
262262
; GFX906-NEXT: ; use s23
263263
; GFX906-NEXT: ;;#ASMEND
264-
; GFX906-NEXT: v_readlane_b32 s24, v40, 27
264+
; GFX906-NEXT: v_readlane_b32 s24, v40, 15
265265
; GFX906-NEXT: ;;#ASMSTART
266266
; GFX906-NEXT: ; use s24
267267
; GFX906-NEXT: ;;#ASMEND
268-
; GFX906-NEXT: v_readlane_b32 s25, v40, 28
268+
; GFX906-NEXT: v_readlane_b32 s25, v40, 16
269269
; GFX906-NEXT: ;;#ASMSTART
270270
; GFX906-NEXT: ; use s25
271271
; GFX906-NEXT: ;;#ASMEND
272-
; GFX906-NEXT: v_readlane_b32 s26, v40, 29
272+
; GFX906-NEXT: v_readlane_b32 s26, v40, 17
273273
; GFX906-NEXT: ;;#ASMSTART
274274
; GFX906-NEXT: ; use s26
275275
; GFX906-NEXT: ;;#ASMEND
276-
; GFX906-NEXT: v_readlane_b32 s27, v40, 30
276+
; GFX906-NEXT: v_readlane_b32 s27, v40, 18
277277
; GFX906-NEXT: ;;#ASMSTART
278278
; GFX906-NEXT: ; use s27
279279
; GFX906-NEXT: ;;#ASMEND
280-
; GFX906-NEXT: v_readlane_b32 s28, v40, 31
280+
; GFX906-NEXT: v_readlane_b32 s28, v40, 19
281281
; GFX906-NEXT: ;;#ASMSTART
282282
; GFX906-NEXT: ; use s28
283283
; GFX906-NEXT: ;;#ASMEND
284-
; GFX906-NEXT: v_readlane_b32 s29, v40, 32
284+
; GFX906-NEXT: v_readlane_b32 s29, v40, 20
285285
; GFX906-NEXT: ;;#ASMSTART
286286
; GFX906-NEXT: ; use s29
287287
; GFX906-NEXT: ;;#ASMEND
@@ -602,15 +602,15 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
602602
; GFX908-NEXT: ; def s29
603603
; GFX908-NEXT: ;;#ASMEND
604604
; GFX908-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
605-
; GFX908-NEXT: v_writelane_b32 v40, s21, 24
606-
; GFX908-NEXT: v_writelane_b32 v40, s22, 25
607-
; GFX908-NEXT: v_writelane_b32 v40, s23, 26
608-
; GFX908-NEXT: v_writelane_b32 v40, s24, 27
609-
; GFX908-NEXT: v_writelane_b32 v40, s25, 28
610-
; GFX908-NEXT: v_writelane_b32 v40, s26, 29
611-
; GFX908-NEXT: v_writelane_b32 v40, s27, 30
612-
; GFX908-NEXT: v_writelane_b32 v40, s28, 31
613-
; GFX908-NEXT: v_writelane_b32 v40, s29, 32
605+
; GFX908-NEXT: v_writelane_b32 v40, s21, 12
606+
; GFX908-NEXT: v_writelane_b32 v40, s22, 13
607+
; GFX908-NEXT: v_writelane_b32 v40, s23, 14
608+
; GFX908-NEXT: v_writelane_b32 v40, s24, 15
609+
; GFX908-NEXT: v_writelane_b32 v40, s25, 16
610+
; GFX908-NEXT: v_writelane_b32 v40, s26, 17
611+
; GFX908-NEXT: v_writelane_b32 v40, s27, 18
612+
; GFX908-NEXT: v_writelane_b32 v40, s28, 19
613+
; GFX908-NEXT: v_writelane_b32 v40, s29, 20
614614
; GFX908-NEXT: v_readlane_b32 s4, v40, 10
615615
; GFX908-NEXT: v_readlane_b32 s6, v40, 0
616616
; GFX908-NEXT: v_readlane_b32 s8, v40, 8
@@ -630,39 +630,39 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
630630
; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17]
631631
; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1
632632
; GFX908-NEXT: s_mov_b64 exec, s[34:35]
633-
; GFX908-NEXT: v_readlane_b32 s21, v40, 24
633+
; GFX908-NEXT: v_readlane_b32 s21, v40, 12
634634
; GFX908-NEXT: ;;#ASMSTART
635635
; GFX908-NEXT: ; use s21
636636
; GFX908-NEXT: ;;#ASMEND
637-
; GFX908-NEXT: v_readlane_b32 s22, v40, 25
637+
; GFX908-NEXT: v_readlane_b32 s22, v40, 13
638638
; GFX908-NEXT: ;;#ASMSTART
639639
; GFX908-NEXT: ; use s22
640640
; GFX908-NEXT: ;;#ASMEND
641-
; GFX908-NEXT: v_readlane_b32 s23, v40, 26
641+
; GFX908-NEXT: v_readlane_b32 s23, v40, 14
642642
; GFX908-NEXT: ;;#ASMSTART
643643
; GFX908-NEXT: ; use s23
644644
; GFX908-NEXT: ;;#ASMEND
645-
; GFX908-NEXT: v_readlane_b32 s24, v40, 27
645+
; GFX908-NEXT: v_readlane_b32 s24, v40, 15
646646
; GFX908-NEXT: ;;#ASMSTART
647647
; GFX908-NEXT: ; use s24
648648
; GFX908-NEXT: ;;#ASMEND
649-
; GFX908-NEXT: v_readlane_b32 s25, v40, 28
649+
; GFX908-NEXT: v_readlane_b32 s25, v40, 16
650650
; GFX908-NEXT: ;;#ASMSTART
651651
; GFX908-NEXT: ; use s25
652652
; GFX908-NEXT: ;;#ASMEND
653-
; GFX908-NEXT: v_readlane_b32 s26, v40, 29
653+
; GFX908-NEXT: v_readlane_b32 s26, v40, 17
654654
; GFX908-NEXT: ;;#ASMSTART
655655
; GFX908-NEXT: ; use s26
656656
; GFX908-NEXT: ;;#ASMEND
657-
; GFX908-NEXT: v_readlane_b32 s27, v40, 30
657+
; GFX908-NEXT: v_readlane_b32 s27, v40, 18
658658
; GFX908-NEXT: ;;#ASMSTART
659659
; GFX908-NEXT: ; use s27
660660
; GFX908-NEXT: ;;#ASMEND
661-
; GFX908-NEXT: v_readlane_b32 s28, v40, 31
661+
; GFX908-NEXT: v_readlane_b32 s28, v40, 19
662662
; GFX908-NEXT: ;;#ASMSTART
663663
; GFX908-NEXT: ; use s28
664664
; GFX908-NEXT: ;;#ASMEND
665-
; GFX908-NEXT: v_readlane_b32 s29, v40, 32
665+
; GFX908-NEXT: v_readlane_b32 s29, v40, 20
666666
; GFX908-NEXT: ;;#ASMSTART
667667
; GFX908-NEXT: ; use s29
668668
; GFX908-NEXT: ;;#ASMEND

llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,12 @@
1717

1818
; DEFAULT: Greedy Register Allocator
1919
; DEFAULT-NEXT: Virtual Register Rewriter
20+
; DEFAULT-NEXT: Stack Slot Coloring
2021
; DEFAULT-NEXT: SI lower SGPR spill instructions
2122
; DEFAULT-NEXT: Virtual Register Map
2223
; DEFAULT-NEXT: Live Register Matrix
2324
; DEFAULT-NEXT: SI Pre-allocate WWM Registers
25+
; DEFAULT-NEXT: Live Stack Slot Analysis
2426
; DEFAULT-NEXT: Greedy Register Allocator
2527
; DEFAULT-NEXT: SI Lower WWM Copies
2628
; DEFAULT-NEXT: GCN NSA Reassign
@@ -50,10 +52,12 @@
5052
; BASIC-DEFAULT-NEXT: Live Register Matrix
5153
; BASIC-DEFAULT-NEXT: Basic Register Allocator
5254
; BASIC-DEFAULT-NEXT: Virtual Register Rewriter
55+
; BASIC-DEFAULT-NEXT: Stack Slot Coloring
5356
; BASIC-DEFAULT-NEXT: SI lower SGPR spill instructions
5457
; BASIC-DEFAULT-NEXT: Virtual Register Map
5558
; BASIC-DEFAULT-NEXT: Live Register Matrix
5659
; BASIC-DEFAULT-NEXT: SI Pre-allocate WWM Registers
60+
; BASIC-DEFAULT-NEXT: Live Stack Slot Analysis
5761
; BASIC-DEFAULT-NEXT: Bundle Machine CFG Edges
5862
; BASIC-DEFAULT-NEXT: Spill Code Placement Analysis
5963
; BASIC-DEFAULT-NEXT: Lazy Machine Block Frequency Analysis
@@ -69,10 +73,12 @@
6973

7074
; DEFAULT-BASIC: Greedy Register Allocator
7175
; DEFAULT-BASIC-NEXT: Virtual Register Rewriter
76+
; DEFAULT-BASIC-NEXT: Stack Slot Coloring
7277
; DEFAULT-BASIC-NEXT: SI lower SGPR spill instructions
7378
; DEFAULT-BASIC-NEXT: Virtual Register Map
7479
; DEFAULT-BASIC-NEXT: Live Register Matrix
7580
; DEFAULT-BASIC-NEXT: SI Pre-allocate WWM Registers
81+
; DEFAULT-BASIC-NEXT: Live Stack Slot Analysis
7682
; DEFAULT-BASIC-NEXT: Basic Register Allocator
7783
; DEFAULT-BASIC-NEXT: SI Lower WWM Copies
7884
; DEFAULT-BASIC-NEXT: GCN NSA Reassign
@@ -90,10 +96,12 @@
9096
; BASIC-BASIC-NEXT: Live Register Matrix
9197
; BASIC-BASIC-NEXT: Basic Register Allocator
9298
; BASIC-BASIC-NEXT: Virtual Register Rewriter
99+
; BASIC-BASIC-NEXT: Stack Slot Coloring
93100
; BASIC-BASIC-NEXT: SI lower SGPR spill instructions
94101
; BASIC-BASIC-NEXT: Virtual Register Map
95102
; BASIC-BASIC-NEXT: Live Register Matrix
96103
; BASIC-BASIC-NEXT: SI Pre-allocate WWM Registers
104+
; BASIC-BASIC-NEXT: Live Stack Slot Analysis
97105
; BASIC-BASIC-NEXT: Basic Register Allocator
98106
; BASIC-BASIC-NEXT: SI Lower WWM Copies
99107
; BASIC-BASIC-NEXT: GCN NSA Reassign

0 commit comments

Comments
 (0)