Skip to content

Commit cd4287b

Browse files
authored
[AMDGPU] Convert PrologEpilogSGPRSpills from DenseMap to sorted vector (#90957)
In practice, PrologEpilogSGPRSpills never has more than 3 entries, so DenseMap is overkill. In addition, using a sorted vector means that iteration happens in register number order instead of DenseMap's hashed order, so it will not be affected by future patches that define new physical registers. This should reduce future test case churn.
1 parent 81003f2 commit cd4287b

File tree

5 files changed

+47
-33
lines changed

5 files changed

+47
-33
lines changed

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -522,13 +522,13 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
522522
// the serialization easier.
523523
ReservedRegSet WWMReservedRegs;
524524

525-
using PrologEpilogSGPRSpillsMap =
526-
DenseMap<Register, PrologEpilogSGPRSaveRestoreInfo>;
525+
using PrologEpilogSGPRSpill =
526+
std::pair<Register, PrologEpilogSGPRSaveRestoreInfo>;
527527
// To track the SGPR spill method used for a CSR SGPR register during
528528
// frame lowering. Even though the SGPR spills are handled during
529529
// SILowerSGPRSpills pass, some special handling is needed later during the
530530
// PrologEpilogInserter.
531-
PrologEpilogSGPRSpillsMap PrologEpilogSGPRSpills;
531+
SmallVector<PrologEpilogSGPRSpill, 3> PrologEpilogSGPRSpills;
532532

533533
// To save/restore EXEC MASK around WWM spills and copies.
534534
Register SGPRForEXECCopy;
@@ -596,7 +596,11 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
596596
const WWMSpillsMap &getWWMSpills() const { return WWMSpills; }
597597
const ReservedRegSet &getWWMReservedRegs() const { return WWMReservedRegs; }
598598

599-
const PrologEpilogSGPRSpillsMap &getPrologEpilogSGPRSpills() const {
599+
ArrayRef<PrologEpilogSGPRSpill> getPrologEpilogSGPRSpills() const {
600+
assert(
601+
is_sorted(PrologEpilogSGPRSpills, [](const auto &LHS, const auto &RHS) {
602+
return LHS.first < RHS.first;
603+
}));
600604
return PrologEpilogSGPRSpills;
601605
}
602606

@@ -606,18 +610,29 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
606610

607611
void addToPrologEpilogSGPRSpills(Register Reg,
608612
PrologEpilogSGPRSaveRestoreInfo SI) {
609-
PrologEpilogSGPRSpills.insert(std::make_pair(Reg, SI));
613+
assert(!hasPrologEpilogSGPRSpillEntry(Reg));
614+
615+
// Insert a new entry in the right place to keep the vector in sorted order.
616+
// This should be cheap since the vector is expected to be very short.
617+
PrologEpilogSGPRSpills.insert(
618+
upper_bound(
619+
PrologEpilogSGPRSpills, Reg,
620+
[](const auto &LHS, const auto &RHS) { return LHS < RHS.first; }),
621+
std::make_pair(Reg, SI));
610622
}
611623

612624
// Check if an entry was created for \p Reg in PrologEpilogSGPRSpills. Return true
613625
// on success and false otherwise.
614626
bool hasPrologEpilogSGPRSpillEntry(Register Reg) const {
615-
return PrologEpilogSGPRSpills.contains(Reg);
627+
auto I = find_if(PrologEpilogSGPRSpills,
628+
[&Reg](const auto &Spill) { return Spill.first == Reg; });
629+
return I != PrologEpilogSGPRSpills.end();
616630
}
617631

618632
// Get the scratch SGPR if allocated to save/restore \p Reg.
619633
Register getScratchSGPRCopyDstReg(Register Reg) const {
620-
auto I = PrologEpilogSGPRSpills.find(Reg);
634+
auto I = find_if(PrologEpilogSGPRSpills,
635+
[&Reg](const auto &Spill) { return Spill.first == Reg; });
621636
if (I != PrologEpilogSGPRSpills.end() &&
622637
I->second.getKind() == SGPRSaveKind::COPY_TO_SCRATCH_SGPR)
623638
return I->second.getReg();
@@ -646,7 +661,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
646661

647662
const PrologEpilogSGPRSaveRestoreInfo &
648663
getPrologEpilogSGPRSaveRestoreInfo(Register Reg) const {
649-
auto I = PrologEpilogSGPRSpills.find(Reg);
664+
auto I = find_if(PrologEpilogSGPRSpills,
665+
[&Reg](const auto &Spill) { return Spill.first == Reg; });
650666
assert(I != PrologEpilogSGPRSpills.end());
651667

652668
return I->second;

llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,11 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
3232
; GFX906-NEXT: v_writelane_b32 v2, s24, 5
3333
; GFX906-NEXT: s_mov_b64 s[26:27], s[10:11]
3434
; GFX906-NEXT: v_writelane_b32 v2, s26, 6
35-
; GFX906-NEXT: v_writelane_b32 v41, s34, 2
35+
; GFX906-NEXT: v_writelane_b32 v41, s16, 4
3636
; GFX906-NEXT: v_writelane_b32 v2, s27, 7
37-
; GFX906-NEXT: v_writelane_b32 v41, s35, 3
37+
; GFX906-NEXT: v_writelane_b32 v41, s34, 2
3838
; GFX906-NEXT: v_writelane_b32 v2, s8, 8
39-
; GFX906-NEXT: v_writelane_b32 v41, s16, 4
39+
; GFX906-NEXT: v_writelane_b32 v41, s35, 3
4040
; GFX906-NEXT: v_writelane_b32 v2, s9, 9
4141
; GFX906-NEXT: v_writelane_b32 v41, s30, 0
4242
; GFX906-NEXT: v_writelane_b32 v2, s4, 10
@@ -340,9 +340,9 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
340340
; GFX906-NEXT: v_readlane_b32 s31, v41, 1
341341
; GFX906-NEXT: v_readlane_b32 s30, v41, 0
342342
; GFX906-NEXT: ; kill: killed $vgpr40
343+
; GFX906-NEXT: v_readlane_b32 s4, v41, 4
343344
; GFX906-NEXT: v_readlane_b32 s34, v41, 2
344345
; GFX906-NEXT: v_readlane_b32 s35, v41, 3
345-
; GFX906-NEXT: v_readlane_b32 s4, v41, 4
346346
; GFX906-NEXT: s_waitcnt vmcnt(0)
347347
; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[30:33] offset:112
348348
; GFX906-NEXT: s_waitcnt vmcnt(0)
@@ -383,12 +383,12 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
383383
; GFX908-NEXT: s_mov_b64 exec, -1
384384
; GFX908-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:152 ; 4-byte Folded Spill
385385
; GFX908-NEXT: s_mov_b64 exec, s[18:19]
386+
; GFX908-NEXT: v_mov_b32_e32 v3, s16
387+
; GFX908-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:168 ; 4-byte Folded Spill
386388
; GFX908-NEXT: v_mov_b32_e32 v3, s34
387389
; GFX908-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:160 ; 4-byte Folded Spill
388390
; GFX908-NEXT: v_mov_b32_e32 v3, s35
389391
; GFX908-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:164 ; 4-byte Folded Spill
390-
; GFX908-NEXT: v_mov_b32_e32 v3, s16
391-
; GFX908-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:168 ; 4-byte Folded Spill
392392
; GFX908-NEXT: s_addk_i32 s32, 0x2c00
393393
; GFX908-NEXT: s_mov_b64 s[16:17], exec
394394
; GFX908-NEXT: s_mov_b64 exec, 1
@@ -753,16 +753,16 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
753753
; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:172
754754
; GFX908-NEXT: s_waitcnt vmcnt(0)
755755
; GFX908-NEXT: s_mov_b64 exec, s[4:5]
756-
; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:160 ; 4-byte Folded Reload
756+
; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:168 ; 4-byte Folded Reload
757757
; GFX908-NEXT: ; kill: killed $vgpr40
758758
; GFX908-NEXT: s_waitcnt vmcnt(0)
759+
; GFX908-NEXT: v_readfirstlane_b32 s4, v0
760+
; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:160 ; 4-byte Folded Reload
761+
; GFX908-NEXT: s_waitcnt vmcnt(0)
759762
; GFX908-NEXT: v_readfirstlane_b32 s34, v0
760763
; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:164 ; 4-byte Folded Reload
761764
; GFX908-NEXT: s_waitcnt vmcnt(0)
762765
; GFX908-NEXT: v_readfirstlane_b32 s35, v0
763-
; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:168 ; 4-byte Folded Reload
764-
; GFX908-NEXT: s_waitcnt vmcnt(0)
765-
; GFX908-NEXT: v_readfirstlane_b32 s4, v0
766766
; GFX908-NEXT: s_xor_saveexec_b64 s[6:7], -1
767767
; GFX908-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:148 ; 4-byte Folded Reload
768768
; GFX908-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:156 ; 4-byte Folded Reload

llvm/test/CodeGen/AMDGPU/stack-realign.ll

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -162,21 +162,21 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 {
162162
; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1
163163
; GCN-NEXT: buffer_store_dword [[VGPR_REG:v[0-9]+]], off, s[0:3], s33 offset:1028 ; 4-byte Folded Spill
164164
; GCN-NEXT: s_mov_b64 exec, s[18:19]
165+
; GCN-NEXT: v_writelane_b32 [[VGPR_REG]], [[FP_SCRATCH_COPY]], 2
165166
; GCN-NEXT: v_mov_b32_e32 v32, 0
166167
; GCN-DAG: v_writelane_b32 [[VGPR_REG]], s34, 3
167168
; GCN: s_mov_b32 s34, s32
168169
; GCN: buffer_store_dword v32, off, s[0:3], s33 offset:1024
169170
; GCN-NEXT: s_waitcnt vmcnt(0)
170171
; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34
171172
; GCN-DAG: s_add_i32 s32, s32, 0x30000
172-
; GCN: v_writelane_b32 [[VGPR_REG]], [[FP_SCRATCH_COPY]], 2
173173
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32
174174
; GCN: s_swappc_b64 s[30:31],
175175

176176
; GCN: v_readlane_b32 s31, [[VGPR_REG]], 1
177177
; GCN: v_readlane_b32 s30, [[VGPR_REG]], 0
178-
; GCN-NEXT: v_readlane_b32 s34, [[VGPR_REG]], 3
179178
; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[VGPR_REG]], 2
179+
; GCN-NEXT: v_readlane_b32 s34, [[VGPR_REG]], 3
180180
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
181181
; GCN-NEXT: buffer_load_dword [[VGPR_REG]], off, s[0:3], s33 offset:1028 ; 4-byte Folded Reload
182182
; GCN-NEXT: s_mov_b64 exec, s[6:7]
@@ -265,9 +265,9 @@ define void @no_free_regs_spill_bp_to_memory(<32 x i32> %a, i32 %b) #5 {
265265
; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33
266266
; GCN: s_xor_saveexec_b64 s[6:7], -1
267267
; GCN: buffer_store_dword v39, off, s[0:3], s33
268-
; GCN: v_mov_b32_e32 v0, s34
269-
; GCN: buffer_store_dword v0, off, s[0:3], s33
270268
; GCN: v_mov_b32_e32 v0, [[FP_SCRATCH_COPY]]
269+
; GCN: buffer_store_dword v0, off, s[0:3], s33
270+
; GCN: v_mov_b32_e32 v0, s34
271271
; GCN-DAG: buffer_store_dword v0, off, s[0:3], s33
272272
%local_val = alloca i32, align 128, addrspace(5)
273273
store volatile i32 %b, ptr addrspace(5) %local_val, align 128
@@ -304,13 +304,11 @@ define void @spill_bp_to_memory_scratch_reg_needed_mubuf_offset(<32 x i32> %a, i
304304
; GCN-NEXT: s_add_i32 s5, s33, 0x42100
305305
; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s5 ; 4-byte Folded Spill
306306
; GCN-NEXT: s_mov_b64 exec, s[6:7]
307-
; GCN-NEXT: v_mov_b32_e32 v0, s34
308-
; GCN-NOT: v_mov_b32_e32 v0, 0x108c
309-
; GCN-NEXT: s_add_i32 s5, s33, 0x42300
310-
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill
311307
; GCN-NEXT: v_mov_b32_e32 v0, [[FP_SCRATCH_COPY]]
312-
; GCN-NOT: v_mov_b32_e32 v0, 0x1088
313308
; GCN-NEXT: s_add_i32 s5, s33, 0x42200
309+
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill
310+
; GCN-NEXT: v_mov_b32_e32 v0, s34
311+
; GCN-NEXT: s_add_i32 s5, s33, 0x42300
314312
; GCN-NEXT: s_mov_b32 s34, s32
315313
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill
316314
%local_val = alloca i32, align 128, addrspace(5)

llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,9 @@ define void @vector_reg_liverange_split() #0 {
1818
; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
1919
; GFX90A-NEXT: buffer_store_dword a32, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
2020
; GFX90A-NEXT: s_mov_b64 exec, s[18:19]
21+
; GFX90A-NEXT: v_writelane_b32 v40, s16, 4
2122
; GFX90A-NEXT: v_writelane_b32 v40, s28, 2
2223
; GFX90A-NEXT: v_writelane_b32 v40, s29, 3
23-
; GFX90A-NEXT: v_writelane_b32 v40, s16, 4
2424
; GFX90A-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
2525
; GFX90A-NEXT: v_writelane_b32 v40, s30, 0
2626
; GFX90A-NEXT: s_addk_i32 s32, 0x400
@@ -48,9 +48,9 @@ define void @vector_reg_liverange_split() #0 {
4848
; GFX90A-NEXT: v_readlane_b32 s31, v40, 1
4949
; GFX90A-NEXT: v_readlane_b32 s30, v40, 0
5050
; GFX90A-NEXT: ; kill: killed $vgpr0
51+
; GFX90A-NEXT: v_readlane_b32 s4, v40, 4
5152
; GFX90A-NEXT: v_readlane_b32 s28, v40, 2
5253
; GFX90A-NEXT: v_readlane_b32 s29, v40, 3
53-
; GFX90A-NEXT: v_readlane_b32 s4, v40, 4
5454
; GFX90A-NEXT: s_xor_saveexec_b64 s[6:7], -1
5555
; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
5656
; GFX90A-NEXT: s_mov_b64 exec, -1

llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,9 @@ define void @test() #0 {
2424
; GCN-NEXT: s_mov_b64 exec, -1
2525
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
2626
; GCN-NEXT: s_mov_b64 exec, s[18:19]
27+
; GCN-NEXT: v_writelane_b32 v40, s16, 4
2728
; GCN-NEXT: v_writelane_b32 v40, s28, 2
2829
; GCN-NEXT: v_writelane_b32 v40, s29, 3
29-
; GCN-NEXT: v_writelane_b32 v40, s16, 4
3030
; GCN-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
3131
; GCN-NEXT: v_writelane_b32 v40, s30, 0
3232
; GCN-NEXT: s_addk_i32 s32, 0x800
@@ -55,9 +55,9 @@ define void @test() #0 {
5555
; GCN-NEXT: v_readlane_b32 s31, v40, 1
5656
; GCN-NEXT: v_readlane_b32 s30, v40, 0
5757
; GCN-NEXT: ; kill: killed $vgpr1
58+
; GCN-NEXT: v_readlane_b32 s4, v40, 4
5859
; GCN-NEXT: v_readlane_b32 s28, v40, 2
5960
; GCN-NEXT: v_readlane_b32 s29, v40, 3
60-
; GCN-NEXT: v_readlane_b32 s4, v40, 4
6161
; GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1
6262
; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
6363
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
@@ -79,9 +79,9 @@ define void @test() #0 {
7979
; GCN-O0-NEXT: s_mov_b64 exec, -1
8080
; GCN-O0-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
8181
; GCN-O0-NEXT: s_mov_b64 exec, s[18:19]
82+
; GCN-O0-NEXT: v_writelane_b32 v40, s16, 4
8283
; GCN-O0-NEXT: v_writelane_b32 v40, s28, 2
8384
; GCN-O0-NEXT: v_writelane_b32 v40, s29, 3
84-
; GCN-O0-NEXT: v_writelane_b32 v40, s16, 4
8585
; GCN-O0-NEXT: s_add_i32 s32, s32, 0x400
8686
; GCN-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
8787
; GCN-O0-NEXT: v_writelane_b32 v40, s30, 0
@@ -117,9 +117,9 @@ define void @test() #0 {
117117
; GCN-O0-NEXT: v_readlane_b32 s31, v40, 1
118118
; GCN-O0-NEXT: v_readlane_b32 s30, v40, 0
119119
; GCN-O0-NEXT: ; kill: killed $vgpr0
120+
; GCN-O0-NEXT: v_readlane_b32 s4, v40, 4
120121
; GCN-O0-NEXT: v_readlane_b32 s28, v40, 2
121122
; GCN-O0-NEXT: v_readlane_b32 s29, v40, 3
122-
; GCN-O0-NEXT: v_readlane_b32 s4, v40, 4
123123
; GCN-O0-NEXT: s_xor_saveexec_b64 s[6:7], -1
124124
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
125125
; GCN-O0-NEXT: s_mov_b64 exec, -1

0 commit comments

Comments
 (0)