Skip to content

Commit d4b8b72

Browse files
authored
[CodeGen][MachineLICM] Use RegUnits in HoistRegionPostRA (#94608)
Those BitVectors get expensive on targets like AMDGPU with thousands of registers, and RegAliasIterator is also expensive. We can move all liveness calculations to use RegUnits instead to speed it up for targets where RegAliasIterator is expensive, like AMDGPU. On targets where RegAliasIterator is cheap, this alternative can be a little more expensive, but I believe the tradeoff is worth it.
1 parent 546c816 commit d4b8b72

File tree

2 files changed

+100
-47
lines changed

2 files changed

+100
-47
lines changed

llvm/lib/CodeGen/MachineLICM.cpp

Lines changed: 98 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -223,8 +223,8 @@ namespace {
223223
void HoistPostRA(MachineInstr *MI, unsigned Def, MachineLoop *CurLoop,
224224
MachineBasicBlock *CurPreheader);
225225

226-
void ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
227-
BitVector &PhysRegClobbers, SmallSet<int, 32> &StoredFIs,
226+
void ProcessMI(MachineInstr *MI, BitVector &RUDefs, BitVector &RUClobbers,
227+
SmallSet<int, 32> &StoredFIs,
228228
SmallVectorImpl<CandidateInfo> &Candidates,
229229
MachineLoop *CurLoop);
230230

@@ -423,10 +423,47 @@ static bool InstructionStoresToFI(const MachineInstr *MI, int FI) {
423423
return false;
424424
}
425425

426+
static void applyBitsNotInRegMaskToRegUnitsMask(const TargetRegisterInfo &TRI,
427+
BitVector &RUs,
428+
const uint32_t *Mask) {
429+
// Iterate over the RegMask raw to avoid constructing a BitVector, which is
430+
// expensive as it implies dynamically allocating memory.
431+
//
432+
// We also work backwards.
433+
const unsigned NumRegs = TRI.getNumRegs();
434+
const unsigned MaskWords = (NumRegs + 31) / 32;
435+
for (unsigned K = 0; K < MaskWords; ++K) {
436+
// We want to set the bits that aren't in RegMask, so flip it.
437+
uint32_t Word = ~Mask[K];
438+
439+
// Iterate all set bits, starting from the right.
440+
while (Word) {
441+
const unsigned SetBitIdx = countr_zero(Word);
442+
443+
// The bits are numbered from the LSB in each word.
444+
const unsigned PhysReg = (K * 32) + SetBitIdx;
445+
446+
// Clear the bit at SetBitIdx. Doing it this way appears to generate less
447+
// instructions on x86. This works because negating a number will flip all
448+
// the bits after SetBitIdx. So (Word & -Word) == (1 << SetBitIdx), but
449+
// faster.
450+
Word ^= Word & -Word;
451+
452+
if (PhysReg == NumRegs)
453+
return;
454+
455+
if (PhysReg) {
456+
for (MCRegUnitIterator RUI(PhysReg, &TRI); RUI.isValid(); ++RUI)
457+
RUs.set(*RUI);
458+
}
459+
}
460+
}
461+
}
462+
426463
/// Examine the instruction for potentai LICM candidate. Also
427464
/// gather register def and frame object update information.
428-
void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
429-
BitVector &PhysRegClobbers,
465+
void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &RUDefs,
466+
BitVector &RUClobbers,
430467
SmallSet<int, 32> &StoredFIs,
431468
SmallVectorImpl<CandidateInfo> &Candidates,
432469
MachineLoop *CurLoop) {
@@ -448,7 +485,7 @@ void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
448485
// We can't hoist an instruction defining a physreg that is clobbered in
449486
// the loop.
450487
if (MO.isRegMask()) {
451-
PhysRegClobbers.setBitsNotInMask(MO.getRegMask());
488+
applyBitsNotInRegMaskToRegUnitsMask(*TRI, RUClobbers, MO.getRegMask());
452489
continue;
453490
}
454491

@@ -460,16 +497,22 @@ void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
460497
assert(Reg.isPhysical() && "Not expecting virtual register!");
461498

462499
if (!MO.isDef()) {
463-
if (Reg && (PhysRegDefs.test(Reg) || PhysRegClobbers.test(Reg)))
464-
// If it's using a non-loop-invariant register, then it's obviously not
465-
// safe to hoist.
466-
HasNonInvariantUse = true;
500+
if (!HasNonInvariantUse) {
501+
for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) {
502+
// If it's using a non-loop-invariant register, then it's obviously
503+
// not safe to hoist.
504+
if (RUDefs.test(*RUI) || RUClobbers.test(*RUI)) {
505+
HasNonInvariantUse = true;
506+
break;
507+
}
508+
}
509+
}
467510
continue;
468511
}
469512

470513
if (MO.isImplicit()) {
471-
for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
472-
PhysRegClobbers.set(*AI);
514+
for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI)
515+
RUClobbers.set(*RUI);
473516
if (!MO.isDead())
474517
// Non-dead implicit def? This cannot be hoisted.
475518
RuledOut = true;
@@ -488,19 +531,18 @@ void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
488531
// If we have already seen another instruction that defines the same
489532
// register, then this is not safe. Two defs is indicated by setting a
490533
// PhysRegClobbers bit.
491-
for (MCRegAliasIterator AS(Reg, TRI, true); AS.isValid(); ++AS) {
492-
if (PhysRegDefs.test(*AS))
493-
PhysRegClobbers.set(*AS);
534+
for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) {
535+
if (RUDefs.test(*RUI)) {
536+
RUClobbers.set(*RUI);
537+
RuledOut = true;
538+
} else if (RUClobbers.test(*RUI)) {
539+
// MI defined register is seen defined by another instruction in
540+
// the loop, it cannot be a LICM candidate.
541+
RuledOut = true;
542+
}
543+
544+
RUDefs.set(*RUI);
494545
}
495-
// Need a second loop because MCRegAliasIterator can visit the same
496-
// register twice.
497-
for (MCRegAliasIterator AS(Reg, TRI, true); AS.isValid(); ++AS)
498-
PhysRegDefs.set(*AS);
499-
500-
if (PhysRegClobbers.test(Reg))
501-
// MI defined register is seen defined by another instruction in
502-
// the loop, it cannot be a LICM candidate.
503-
RuledOut = true;
504546
}
505547

506548
// Only consider reloads for now and remats which do not have register
@@ -521,9 +563,9 @@ void MachineLICMBase::HoistRegionPostRA(MachineLoop *CurLoop,
521563
if (!Preheader)
522564
return;
523565

524-
unsigned NumRegs = TRI->getNumRegs();
525-
BitVector PhysRegDefs(NumRegs); // Regs defined once in the loop.
526-
BitVector PhysRegClobbers(NumRegs); // Regs defined more than once.
566+
unsigned NumRegUnits = TRI->getNumRegUnits();
567+
BitVector RUDefs(NumRegUnits); // RUs defined once in the loop.
568+
BitVector RUClobbers(NumRegUnits); // RUs defined more than once.
527569

528570
SmallVector<CandidateInfo, 32> Candidates;
529571
SmallSet<int, 32> StoredFIs;
@@ -540,22 +582,21 @@ void MachineLICMBase::HoistRegionPostRA(MachineLoop *CurLoop,
540582
// FIXME: That means a reload that're reused in successor block(s) will not
541583
// be LICM'ed.
542584
for (const auto &LI : BB->liveins()) {
543-
for (MCRegAliasIterator AI(LI.PhysReg, TRI, true); AI.isValid(); ++AI)
544-
PhysRegDefs.set(*AI);
585+
for (MCRegUnitIterator RUI(LI.PhysReg, TRI); RUI.isValid(); ++RUI)
586+
RUDefs.set(*RUI);
545587
}
546588

547589
// Funclet entry blocks will clobber all registers
548590
if (const uint32_t *Mask = BB->getBeginClobberMask(TRI))
549-
PhysRegClobbers.setBitsNotInMask(Mask);
591+
applyBitsNotInRegMaskToRegUnitsMask(*TRI, RUClobbers, Mask);
550592

551593
SpeculationState = SpeculateUnknown;
552594
for (MachineInstr &MI : *BB)
553-
ProcessMI(&MI, PhysRegDefs, PhysRegClobbers, StoredFIs, Candidates,
554-
CurLoop);
595+
ProcessMI(&MI, RUDefs, RUClobbers, StoredFIs, Candidates, CurLoop);
555596
}
556597

557598
// Gather the registers read / clobbered by the terminator.
558-
BitVector TermRegs(NumRegs);
599+
BitVector TermRUs(NumRegUnits);
559600
MachineBasicBlock::iterator TI = Preheader->getFirstTerminator();
560601
if (TI != Preheader->end()) {
561602
for (const MachineOperand &MO : TI->operands()) {
@@ -564,8 +605,8 @@ void MachineLICMBase::HoistRegionPostRA(MachineLoop *CurLoop,
564605
Register Reg = MO.getReg();
565606
if (!Reg)
566607
continue;
567-
for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
568-
TermRegs.set(*AI);
608+
for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI)
609+
TermRUs.set(*RUI);
569610
}
570611
}
571612

@@ -583,24 +624,36 @@ void MachineLICMBase::HoistRegionPostRA(MachineLoop *CurLoop,
583624
continue;
584625

585626
unsigned Def = Candidate.Def;
586-
if (!PhysRegClobbers.test(Def) && !TermRegs.test(Def)) {
587-
bool Safe = true;
588-
MachineInstr *MI = Candidate.MI;
589-
for (const MachineOperand &MO : MI->all_uses()) {
590-
if (!MO.getReg())
591-
continue;
592-
Register Reg = MO.getReg();
593-
if (PhysRegDefs.test(Reg) ||
594-
PhysRegClobbers.test(Reg)) {
627+
bool Safe = true;
628+
for (MCRegUnitIterator RUI(Def, TRI); RUI.isValid(); ++RUI) {
629+
if (RUClobbers.test(*RUI) || TermRUs.test(*RUI)) {
630+
Safe = false;
631+
break;
632+
}
633+
}
634+
635+
if (!Safe)
636+
continue;
637+
638+
MachineInstr *MI = Candidate.MI;
639+
for (const MachineOperand &MO : MI->all_uses()) {
640+
if (!MO.getReg())
641+
continue;
642+
for (MCRegUnitIterator RUI(MO.getReg(), TRI); RUI.isValid(); ++RUI) {
643+
if (RUDefs.test(*RUI) || RUClobbers.test(*RUI)) {
595644
// If it's using a non-loop-invariant register, then it's obviously
596645
// not safe to hoist.
597646
Safe = false;
598647
break;
599648
}
600649
}
601-
if (Safe)
602-
HoistPostRA(MI, Candidate.Def, CurLoop, CurPreheader);
650+
651+
if (!Safe)
652+
break;
603653
}
654+
655+
if (Safe)
656+
HoistPostRA(MI, Candidate.Def, CurLoop, CurPreheader);
604657
}
605658
}
606659

llvm/test/CodeGen/AMDGPU/indirect-call.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -886,12 +886,12 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
886886
; GCN-NEXT: v_writelane_b32 v40, s62, 30
887887
; GCN-NEXT: v_writelane_b32 v40, s63, 31
888888
; GCN-NEXT: s_mov_b64 s[6:7], exec
889-
; GCN-NEXT: s_movk_i32 s4, 0x7b
890889
; GCN-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
891890
; GCN-NEXT: v_readfirstlane_b32 s8, v0
892891
; GCN-NEXT: v_readfirstlane_b32 s9, v1
893892
; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
894893
; GCN-NEXT: s_and_saveexec_b64 s[10:11], vcc
894+
; GCN-NEXT: s_movk_i32 s4, 0x7b
895895
; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9]
896896
; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1
897897
; GCN-NEXT: s_xor_b64 exec, exec, s[10:11]
@@ -980,12 +980,12 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
980980
; GISEL-NEXT: v_writelane_b32 v40, s62, 30
981981
; GISEL-NEXT: v_writelane_b32 v40, s63, 31
982982
; GISEL-NEXT: s_mov_b64 s[6:7], exec
983-
; GISEL-NEXT: s_movk_i32 s4, 0x7b
984983
; GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
985984
; GISEL-NEXT: v_readfirstlane_b32 s8, v0
986985
; GISEL-NEXT: v_readfirstlane_b32 s9, v1
987986
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
988987
; GISEL-NEXT: s_and_saveexec_b64 s[10:11], vcc
988+
; GISEL-NEXT: s_movk_i32 s4, 0x7b
989989
; GISEL-NEXT: s_swappc_b64 s[30:31], s[8:9]
990990
; GISEL-NEXT: ; implicit-def: $vgpr0
991991
; GISEL-NEXT: s_xor_b64 exec, exec, s[10:11]

0 commit comments

Comments
 (0)