Skip to content

Commit 1728ab4

Browse files
AMDGPU/GlobalISel: Disable LCSSA pass
Disable the LCSSA pass when GlobalISel is enabled, in preparation for implementing temporal divergence lowering in AMDGPU divergence lowering. This breaks all cases where SGPR or i1 values are used outside of a cycle with a divergent exit. Regenerate the regression tests for AMDGPU divergence lowering with LCSSA disabled and switch them to the new reg bank select. Also add the regbanklegalize rules required for these tests to pass. Update IntrinsicLaneMaskAnalyzer to stop tracking LCSSA phis that are lane masks.
1 parent 57b4898 commit 1728ab4

26 files changed

+1896
-1840
lines changed

llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -91,25 +91,17 @@ void IntrinsicLaneMaskAnalyzer::initLaneMaskIntrinsics(MachineFunction &MF) {
9191
GIntrinsic *GI = dyn_cast<GIntrinsic>(&MI);
9292
if (GI && GI->is(Intrinsic::amdgcn_if_break)) {
9393
S32S64LaneMask.insert(MI.getOperand(3).getReg());
94-
findLCSSAPhi(MI.getOperand(0).getReg());
94+
S32S64LaneMask.insert(MI.getOperand(0).getReg());
9595
}
9696

9797
if (MI.getOpcode() == AMDGPU::SI_IF ||
9898
MI.getOpcode() == AMDGPU::SI_ELSE) {
99-
findLCSSAPhi(MI.getOperand(0).getReg());
99+
S32S64LaneMask.insert(MI.getOperand(0).getReg());
100100
}
101101
}
102102
}
103103
}
104104

105-
void IntrinsicLaneMaskAnalyzer::findLCSSAPhi(Register Reg) {
106-
S32S64LaneMask.insert(Reg);
107-
for (const MachineInstr &LCSSAPhi : MRI.use_instructions(Reg)) {
108-
if (LCSSAPhi.isPHI())
109-
S32S64LaneMask.insert(LCSSAPhi.getOperand(0).getReg());
110-
}
111-
}
112-
113105
static LLT getReadAnyLaneSplitTy(LLT Ty) {
114106
if (Ty.isVector()) {
115107
LLT ElTy = Ty.getElementType();

llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,6 @@ class IntrinsicLaneMaskAnalyzer {
4747

4848
private:
4949
void initLaneMaskIntrinsics(MachineFunction &MF);
50-
// This will not be needed when we turn off LCSSA for global-isel.
51-
void findLCSSAPhi(Register Reg);
5250
};
5351

5452
void buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc,

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -312,6 +312,12 @@ bool AMDGPURegBankLegalize::runOnMachineFunction(MachineFunction &MF) {
312312
}
313313

314314
// Opcodes that also support S1.
315+
if (Opc == G_FREEZE &&
316+
MRI.getType(MI->getOperand(0).getReg()) != LLT::scalar(1)) {
317+
RBLHelper.applyMappingTrivial(*MI);
318+
continue;
319+
}
320+
315321
if ((Opc == AMDGPU::G_CONSTANT || Opc == AMDGPU::G_FCONSTANT ||
316322
Opc == AMDGPU::G_IMPLICIT_DEF)) {
317323
Register Dst = MI->getOperand(0).getReg();

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,26 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
134134
switch (Mapping.LoweringMethod) {
135135
case DoNotLower:
136136
return;
137+
case VccExtToSel: {
138+
LLT Ty = MRI.getType(MI.getOperand(0).getReg());
139+
Register Src = MI.getOperand(1).getReg();
140+
unsigned Opc = MI.getOpcode();
141+
if (Ty == S32 || Ty == S16) {
142+
auto True = B.buildConstant({VgprRB, Ty}, Opc == G_SEXT ? -1 : 1);
143+
auto False = B.buildConstant({VgprRB, Ty}, 0);
144+
B.buildSelect(MI.getOperand(0).getReg(), Src, True, False);
145+
}
146+
if (Ty == S64) {
147+
auto True = B.buildConstant({VgprRB, S32}, Opc == G_SEXT ? -1 : 1);
148+
auto False = B.buildConstant({VgprRB, S32}, 0);
149+
auto Sel = B.buildSelect({VgprRB, S32}, Src, True, False);
150+
B.buildMergeValues(
151+
MI.getOperand(0).getReg(),
152+
{Sel.getReg(0), Opc == G_SEXT ? Sel.getReg(0) : False.getReg(0)});
153+
}
154+
MI.eraseFromParent();
155+
return;
156+
}
137157
case UniExtToSel: {
138158
LLT Ty = MRI.getType(MI.getOperand(0).getReg());
139159
auto True = B.buildConstant({SgprRB, Ty},
@@ -276,6 +296,8 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
276296
case Sgpr64:
277297
case Vgpr64:
278298
return LLT::scalar(64);
299+
case VgprP0:
300+
return LLT::pointer(0, 64);
279301
case SgprP1:
280302
case VgprP1:
281303
return LLT::pointer(1, 64);
@@ -383,6 +405,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
383405
return SgprRB;
384406
case Vgpr32:
385407
case Vgpr64:
408+
case VgprP0:
386409
case VgprP1:
387410
case VgprP3:
388411
case VgprP4:
@@ -425,6 +448,7 @@ void RegBankLegalizeHelper::applyMappingDst(
425448
case SgprV4S32:
426449
case Vgpr32:
427450
case Vgpr64:
451+
case VgprP0:
428452
case VgprP1:
429453
case VgprP3:
430454
case VgprP4:
@@ -555,6 +579,7 @@ void RegBankLegalizeHelper::applyMappingSrc(
555579
// vgpr scalars, pointers and vectors
556580
case Vgpr32:
557581
case Vgpr64:
582+
case VgprP0:
558583
case VgprP1:
559584
case VgprP3:
560585
case VgprP4:
@@ -653,7 +678,8 @@ void RegBankLegalizeHelper::applyMappingPHI(MachineInstr &MI) {
653678
// We accept all types that can fit in some register class.
654679
// Uniform G_PHIs have all sgpr registers.
655680
// Divergent G_PHIs have vgpr dst but inputs can be sgpr or vgpr.
656-
if (Ty == LLT::scalar(32) || Ty == LLT::pointer(4, 64)) {
681+
if (Ty == LLT::scalar(32) || Ty == LLT::pointer(1, 64) ||
682+
Ty == LLT::pointer(4, 64)) {
657683
return;
658684
}
659685

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
5050
return MRI.getType(Reg) == LLT::scalar(32);
5151
case S64:
5252
return MRI.getType(Reg) == LLT::scalar(64);
53+
case P0:
54+
return MRI.getType(Reg) == LLT::pointer(0, 64);
5355
case P1:
5456
return MRI.getType(Reg) == LLT::pointer(1, 64);
5557
case P3:
@@ -58,6 +60,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
5860
return MRI.getType(Reg) == LLT::pointer(4, 64);
5961
case P5:
6062
return MRI.getType(Reg) == LLT::pointer(5, 32);
63+
case V4S32:
64+
return MRI.getType(Reg) == LLT::fixed_vector(4, 32);
6165
case B32:
6266
return MRI.getType(Reg).getSizeInBits() == 32;
6367
case B64:
@@ -431,16 +435,20 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
431435
addRulesForGOpcs({G_XOR, G_OR, G_AND}, StandardB)
432436
.Any({{UniS1}, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32AExt}}})
433437
.Any({{DivS1}, {{Vcc}, {Vcc, Vcc}}})
438+
.Div(B32, {{VgprB32}, {VgprB32, VgprB32}})
439+
.Uni(B64, {{SgprB64}, {SgprB64, SgprB64}})
434440
.Div(B64, {{VgprB64}, {VgprB64, VgprB64}, SplitTo32});
435441

436442
addRulesForGOpcs({G_SHL}, Standard)
443+
.Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
437444
.Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}})
438445
.Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});
439446

440447
// Note: we only write S1 rules for G_IMPLICIT_DEF, G_CONSTANT, G_FCONSTANT
441448
// and G_FREEZE here, rest is trivially regbankselected earlier
442449
addRulesForGOpcs({G_CONSTANT})
443450
.Any({{UniS1, _}, {{Sgpr32Trunc}, {None}, UniCstExt}});
451+
addRulesForGOpcs({G_FREEZE}).Any({{DivS1}, {{Vcc}, {Vcc}}});
444452

445453
addRulesForGOpcs({G_ICMP})
446454
.Any({{UniS1, _, S32}, {{Sgpr32Trunc}, {None, Sgpr32, Sgpr32}}})
@@ -471,6 +479,7 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
471479

472480
addRulesForGOpcs({G_ZEXT, G_SEXT})
473481
.Any({{UniS32, S1}, {{Sgpr32}, {Sgpr32AExtBoolInReg}, UniExtToSel}})
482+
.Any({{DivS32, S1}, {{Vgpr32}, {Vcc}, VccExtToSel}})
474483
.Any({{UniS64, S32}, {{Sgpr64}, {Sgpr32}, Ext32To64}})
475484
.Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}, Ext32To64}});
476485

@@ -528,6 +537,7 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
528537
.Any({{DivB32, DivP1}, {{VgprB32}, {VgprP1}}})
529538
.Any({{{UniB256, UniP1}, isAlign4 && isUL}, {{SgprB256}, {SgprP1}}})
530539
.Any({{{UniB512, UniP1}, isAlign4 && isUL}, {{SgprB512}, {SgprP1}}})
540+
.Any({{{UniB32, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB32}, {SgprP1}}})
531541
.Any({{{UniB256, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB256}, {VgprP1}, SplitLoad}})
532542
.Any({{{UniB512, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB512}, {VgprP1}, SplitLoad}})
533543

@@ -556,15 +566,25 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
556566
// clang-format on
557567

558568
addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD}, Vector)
569+
.Div(S32, {{Vgpr32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}})
570+
.Uni(S32, {{UniInVgprS32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}})
559571
.Div(V4S32, {{VgprV4S32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}})
560572
.Uni(V4S32, {{UniInVgprV4S32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}});
561573

562574
addRulesForGOpcs({G_STORE})
575+
.Any({{S32, P0}, {{}, {Vgpr32, VgprP0}}})
563576
.Any({{S32, P1}, {{}, {Vgpr32, VgprP1}}})
564577
.Any({{S64, P1}, {{}, {Vgpr64, VgprP1}}})
565578
.Any({{V4S32, P1}, {{}, {VgprV4S32, VgprP1}}});
566579

567-
addRulesForGOpcs({G_PTR_ADD}).Any({{DivP1}, {{VgprP1}, {VgprP1, Vgpr64}}});
580+
addRulesForGOpcs({G_AMDGPU_BUFFER_STORE})
581+
.Any({{S32}, {{}, {Vgpr32, SgprV4S32, Vgpr32, Vgpr32, Sgpr32}}});
582+
583+
addRulesForGOpcs({G_PTR_ADD})
584+
.Any({{UniP1}, {{SgprP1}, {SgprP1, Sgpr64}}})
585+
.Any({{DivP1}, {{VgprP1}, {VgprP1, Vgpr64}}});
586+
587+
addRulesForGOpcs({G_INTTOPTR}).Any({{UniP4}, {{SgprP4}, {Sgpr64}}});
568588

569589
addRulesForGOpcs({G_ABS}, Standard).Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt}});
570590

@@ -585,10 +605,15 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
585605

586606
using namespace Intrinsic;
587607

608+
addRulesForIOpcs({amdgcn_s_getpc}).Any({{UniS64, _}, {{Sgpr64}, {None}}});
609+
588610
// This is "intrinsic lane mask" it was set to i32/i64 in llvm-ir.
589611
addRulesForIOpcs({amdgcn_end_cf}).Any({{_, S32}, {{}, {None, Sgpr32}}});
590612

591613
addRulesForIOpcs({amdgcn_if_break}, Standard)
592614
.Uni(S32, {{Sgpr32}, {IntrId, Vcc, Sgpr32}});
593615

616+
addRulesForIOpcs({amdgcn_mbcnt_lo, amdgcn_mbcnt_hi}, Standard)
617+
.Div(S32, {{}, {Vgpr32, None, Vgpr32, Vgpr32}});
618+
594619
} // end initialize rules

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ enum UniformityLLTOpPredicateID {
5050
DivS64,
5151

5252
// pointers
53+
P0,
5354
P1,
5455
P3,
5556
P4,
@@ -124,6 +125,7 @@ enum RegBankLLTMappingApplyID {
124125
// vgpr scalars, pointers, vectors and B-types
125126
Vgpr32,
126127
Vgpr64,
128+
VgprP0,
127129
VgprP1,
128130
VgprP3,
129131
VgprP4,
@@ -162,6 +164,7 @@ enum RegBankLLTMappingApplyID {
162164
// vgpr. Lower it to two S32 vgpr ANDs.
163165
enum LoweringMethodID {
164166
DoNotLower,
167+
VccExtToSel,
165168
UniExtToSel,
166169
VgprToVccCopy,
167170
SplitTo32,

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1366,7 +1366,8 @@ bool GCNPassConfig::addPreISel() {
13661366
// control flow modifications.
13671367
addPass(createAMDGPURewriteUndefForPHILegacyPass());
13681368

1369-
addPass(createLCSSAPass());
1369+
if (!getCGPassBuilderOption().EnableGlobalISelOption)
1370+
addPass(createLCSSAPass());
13701371

13711372
if (TM->getOptLevel() > CodeGenOptLevel::Less)
13721373
addPass(&AMDGPUPerfHintAnalysisLegacyID);
@@ -2062,7 +2063,8 @@ void AMDGPUCodeGenPassBuilder::addPreISel(AddIRPass &addPass) const {
20622063
// control flow modifications.
20632064
addPass(AMDGPURewriteUndefForPHIPass());
20642065

2065-
addPass(LCSSAPass());
2066+
if (!getCGPassBuilderOption().EnableGlobalISelOption)
2067+
addPass(LCSSAPass());
20662068

20672069
if (TM.getOptLevel() > CodeGenOptLevel::Less)
20682070
addPass(AMDGPUPerfHintAnalysisPass(TM));

0 commit comments

Comments
 (0)