Skip to content

Commit 11a9bd2

Browse files
AMDGPU/GlobalISel: Disable LCSSA pass
Disable the LCSSA pass in preparation for implementing temporal divergence lowering in AMDGPU divergence lowering. This breaks all cases where sgpr or i1 values are used outside of a cycle with a divergent exit. Regenerate the regression tests for AMDGPU divergence lowering with LCSSA disabled and switch them to the new reg bank select. Also add the regbanklegalize rules required for these tests to pass. Update IntrinsicLaneMaskAnalyzer to stop tracking LCSSA phis that are lane masks.
1 parent f1252f5 commit 11a9bd2

20 files changed

+2662
-1562
lines changed

llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -91,25 +91,17 @@ void IntrinsicLaneMaskAnalyzer::initLaneMaskIntrinsics(MachineFunction &MF) {
9191
GIntrinsic *GI = dyn_cast<GIntrinsic>(&MI);
9292
if (GI && GI->is(Intrinsic::amdgcn_if_break)) {
9393
S32S64LaneMask.insert(MI.getOperand(3).getReg());
94-
findLCSSAPhi(MI.getOperand(0).getReg());
94+
S32S64LaneMask.insert(MI.getOperand(0).getReg());
9595
}
9696

9797
if (MI.getOpcode() == AMDGPU::SI_IF ||
9898
MI.getOpcode() == AMDGPU::SI_ELSE) {
99-
findLCSSAPhi(MI.getOperand(0).getReg());
99+
S32S64LaneMask.insert(MI.getOperand(0).getReg());
100100
}
101101
}
102102
}
103103
}
104104

105-
void IntrinsicLaneMaskAnalyzer::findLCSSAPhi(Register Reg) {
106-
S32S64LaneMask.insert(Reg);
107-
for (const MachineInstr &LCSSAPhi : MRI.use_instructions(Reg)) {
108-
if (LCSSAPhi.isPHI())
109-
S32S64LaneMask.insert(LCSSAPhi.getOperand(0).getReg());
110-
}
111-
}
112-
113105
static LLT getReadAnyLaneSplitTy(LLT Ty) {
114106
if (Ty.isVector()) {
115107
LLT ElTy = Ty.getElementType();

llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,6 @@ class IntrinsicLaneMaskAnalyzer {
4747

4848
private:
4949
void initLaneMaskIntrinsics(MachineFunction &MF);
50-
// This will not be needed when we turn off LCSSA for global-isel.
51-
void findLCSSAPhi(Register Reg);
5250
};
5351

5452
void buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc,

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -312,6 +312,12 @@ bool AMDGPURegBankLegalize::runOnMachineFunction(MachineFunction &MF) {
312312
}
313313

314314
// Opcodes that also support S1.
315+
if (Opc == G_FREEZE &&
316+
MRI.getType(MI->getOperand(0).getReg()) != LLT::scalar(1)) {
317+
RBLHelper.applyMappingTrivial(*MI);
318+
continue;
319+
}
320+
315321
if ((Opc == AMDGPU::G_CONSTANT || Opc == AMDGPU::G_FCONSTANT ||
316322
Opc == AMDGPU::G_IMPLICIT_DEF)) {
317323
Register Dst = MI->getOperand(0).getReg();

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,26 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
134134
switch (Mapping.LoweringMethod) {
135135
case DoNotLower:
136136
return;
137+
case VccExtToSel: {
138+
LLT Ty = MRI.getType(MI.getOperand(0).getReg());
139+
Register Src = MI.getOperand(1).getReg();
140+
unsigned Opc = MI.getOpcode();
141+
if (Ty == S32 || Ty == S16) {
142+
auto True = B.buildConstant({VgprRB, Ty}, Opc == G_SEXT ? -1 : 1);
143+
auto False = B.buildConstant({VgprRB, Ty}, 0);
144+
B.buildSelect(MI.getOperand(0).getReg(), Src, True, False);
145+
}
146+
if (Ty == S64) {
147+
auto True = B.buildConstant({VgprRB, S32}, Opc == G_SEXT ? -1 : 1);
148+
auto False = B.buildConstant({VgprRB, S32}, 0);
149+
auto Sel = B.buildSelect({VgprRB, S32}, Src, True, False);
150+
B.buildMergeValues(
151+
MI.getOperand(0).getReg(),
152+
{Sel.getReg(0), Opc == G_SEXT ? Sel.getReg(0) : False.getReg(0)});
153+
}
154+
MI.eraseFromParent();
155+
return;
156+
}
137157
case UniExtToSel: {
138158
LLT Ty = MRI.getType(MI.getOperand(0).getReg());
139159
auto True = B.buildConstant({SgprRB, Ty},
@@ -276,6 +296,8 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
276296
case Sgpr64:
277297
case Vgpr64:
278298
return LLT::scalar(64);
299+
case VgprP0:
300+
return LLT::pointer(0, 64);
279301
case SgprP1:
280302
case VgprP1:
281303
return LLT::pointer(1, 64);
@@ -383,6 +405,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
383405
return SgprRB;
384406
case Vgpr32:
385407
case Vgpr64:
408+
case VgprP0:
386409
case VgprP1:
387410
case VgprP3:
388411
case VgprP4:
@@ -425,6 +448,7 @@ void RegBankLegalizeHelper::applyMappingDst(
425448
case SgprV4S32:
426449
case Vgpr32:
427450
case Vgpr64:
451+
case VgprP0:
428452
case VgprP1:
429453
case VgprP3:
430454
case VgprP4:
@@ -555,6 +579,7 @@ void RegBankLegalizeHelper::applyMappingSrc(
555579
// vgpr scalars, pointers and vectors
556580
case Vgpr32:
557581
case Vgpr64:
582+
case VgprP0:
558583
case VgprP1:
559584
case VgprP3:
560585
case VgprP4:
@@ -653,7 +678,8 @@ void RegBankLegalizeHelper::applyMappingPHI(MachineInstr &MI) {
653678
// We accept all types that can fit in some register class.
654679
// Uniform G_PHIs have all sgpr registers.
655680
// Divergent G_PHIs have vgpr dst but inputs can be sgpr or vgpr.
656-
if (Ty == LLT::scalar(32) || Ty == LLT::pointer(4, 64)) {
681+
if (Ty == LLT::scalar(32) || Ty == LLT::pointer(1, 64) ||
682+
Ty == LLT::pointer(4, 64)) {
657683
return;
658684
}
659685

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
5050
return MRI.getType(Reg) == LLT::scalar(32);
5151
case S64:
5252
return MRI.getType(Reg) == LLT::scalar(64);
53+
case P0:
54+
return MRI.getType(Reg) == LLT::pointer(0, 64);
5355
case P1:
5456
return MRI.getType(Reg) == LLT::pointer(1, 64);
5557
case P3:
@@ -58,6 +60,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
5860
return MRI.getType(Reg) == LLT::pointer(4, 64);
5961
case P5:
6062
return MRI.getType(Reg) == LLT::pointer(5, 32);
63+
case V4S32:
64+
return MRI.getType(Reg) == LLT::fixed_vector(4, 32);
6165
case B32:
6266
return MRI.getType(Reg).getSizeInBits() == 32;
6367
case B64:
@@ -78,6 +82,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
7882
return MRI.getType(Reg) == LLT::scalar(32) && MUI.isUniform(Reg);
7983
case UniS64:
8084
return MRI.getType(Reg) == LLT::scalar(64) && MUI.isUniform(Reg);
85+
case UniP0:
86+
return MRI.getType(Reg) == LLT::pointer(0, 64) && MUI.isUniform(Reg);
8187
case UniP1:
8288
return MRI.getType(Reg) == LLT::pointer(1, 64) && MUI.isUniform(Reg);
8389
case UniP3:
@@ -104,6 +110,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
104110
return MRI.getType(Reg) == LLT::scalar(32) && MUI.isDivergent(Reg);
105111
case DivS64:
106112
return MRI.getType(Reg) == LLT::scalar(64) && MUI.isDivergent(Reg);
113+
case DivP0:
114+
return MRI.getType(Reg) == LLT::pointer(0, 64) && MUI.isDivergent(Reg);
107115
case DivP1:
108116
return MRI.getType(Reg) == LLT::pointer(1, 64) && MUI.isDivergent(Reg);
109117
case DivP3:
@@ -315,13 +323,15 @@ RegBankLegalizeRules::getRulesForOpc(MachineInstr &MI) const {
315323
Opc == AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS) {
316324
unsigned IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
317325
if (!IRulesAlias.contains(IntrID)) {
326+
MI.dump();
318327
LLVM_DEBUG(dbgs() << "MI: "; MI.dump(););
319328
llvm_unreachable("No rules defined for intrinsic opcode");
320329
}
321330
return IRules.at(IRulesAlias.at(IntrID));
322331
}
323332

324333
if (!GRulesAlias.contains(Opc)) {
334+
MI.dump();
325335
LLVM_DEBUG(dbgs() << "MI: "; MI.dump(););
326336
llvm_unreachable("No rules defined for generic opcode");
327337
}
@@ -431,16 +441,21 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
431441
addRulesForGOpcs({G_XOR, G_OR, G_AND}, StandardB)
432442
.Any({{UniS1}, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32AExt}}})
433443
.Any({{DivS1}, {{Vcc}, {Vcc, Vcc}}})
444+
.Div(B32, {{VgprB32}, {VgprB32, VgprB32}})
445+
.Uni(B64, {{SgprB64}, {SgprB64, SgprB64}})
434446
.Div(B64, {{VgprB64}, {VgprB64, VgprB64}, SplitTo32});
435447

436448
addRulesForGOpcs({G_SHL}, Standard)
449+
.Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
437450
.Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}})
438451
.Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});
439452

440453
// Note: we only write S1 rules for G_IMPLICIT_DEF, G_CONSTANT, G_FCONSTANT
441454
// and G_FREEZE here, rest is trivially regbankselected earlier
455+
addRulesForGOpcs({G_IMPLICIT_DEF}).Any({{UniS1}, {{Sgpr32Trunc}, {}}});
442456
addRulesForGOpcs({G_CONSTANT})
443457
.Any({{UniS1, _}, {{Sgpr32Trunc}, {None}, UniCstExt}});
458+
addRulesForGOpcs({G_FREEZE}).Any({{DivS1}, {{Vcc}, {Vcc}}});
444459

445460
addRulesForGOpcs({G_ICMP})
446461
.Any({{UniS1, _, S32}, {{Sgpr32Trunc}, {None, Sgpr32, Sgpr32}}})
@@ -471,6 +486,7 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
471486

472487
addRulesForGOpcs({G_ZEXT, G_SEXT})
473488
.Any({{UniS32, S1}, {{Sgpr32}, {Sgpr32AExtBoolInReg}, UniExtToSel}})
489+
.Any({{DivS32, S1}, {{Vgpr32}, {Vcc}, VccExtToSel}})
474490
.Any({{UniS64, S32}, {{Sgpr64}, {Sgpr32}, Ext32To64}})
475491
.Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}, Ext32To64}});
476492

@@ -525,9 +541,12 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
525541

526542
// clang-format off
527543
addRulesForGOpcs({G_LOAD})
544+
.Any({{DivB32, DivP0}, {{VgprB32}, {VgprP0}}})
545+
528546
.Any({{DivB32, DivP1}, {{VgprB32}, {VgprP1}}})
529547
.Any({{{UniB256, UniP1}, isAlign4 && isUL}, {{SgprB256}, {SgprP1}}})
530548
.Any({{{UniB512, UniP1}, isAlign4 && isUL}, {{SgprB512}, {SgprP1}}})
549+
.Any({{{UniB32, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB32}, {SgprP1}}})
531550
.Any({{{UniB256, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB256}, {VgprP1}, SplitLoad}})
532551
.Any({{{UniB512, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB512}, {VgprP1}, SplitLoad}})
533552

@@ -556,15 +575,26 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
556575
// clang-format on
557576

558577
addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD}, Vector)
578+
.Div(S32, {{Vgpr32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}})
579+
.Uni(S32, {{UniInVgprS32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}})
559580
.Div(V4S32, {{VgprV4S32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}})
560581
.Uni(V4S32, {{UniInVgprV4S32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}});
561582

562583
addRulesForGOpcs({G_STORE})
584+
.Any({{S32, P0}, {{}, {Vgpr32, VgprP0}}})
563585
.Any({{S32, P1}, {{}, {Vgpr32, VgprP1}}})
564586
.Any({{S64, P1}, {{}, {Vgpr64, VgprP1}}})
565587
.Any({{V4S32, P1}, {{}, {VgprV4S32, VgprP1}}});
566588

567-
addRulesForGOpcs({G_PTR_ADD}).Any({{DivP1}, {{VgprP1}, {VgprP1, Vgpr64}}});
589+
addRulesForGOpcs({G_AMDGPU_BUFFER_STORE})
590+
.Any({{S32}, {{}, {Vgpr32, SgprV4S32, Vgpr32, Vgpr32, Sgpr32}}});
591+
592+
addRulesForGOpcs({G_PTR_ADD})
593+
.Any({{UniP1}, {{SgprP1}, {SgprP1, Sgpr64}}})
594+
.Any({{DivP1}, {{VgprP1}, {VgprP1, Vgpr64}}})
595+
.Any({{DivP0}, {{VgprP0}, {VgprP0, Vgpr64}}});
596+
597+
addRulesForGOpcs({G_INTTOPTR}).Any({{UniP4}, {{SgprP4}, {Sgpr64}}});
568598

569599
addRulesForGOpcs({G_ABS}, Standard).Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt}});
570600

@@ -580,15 +610,24 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
580610
.Any({{UniS32, S32}, {{UniInVgprS32}, {Vgpr32}}}, !hasSALUFloat);
581611

582612
addRulesForGOpcs({G_UITOFP})
613+
.Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}})
583614
.Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat)
584615
.Any({{UniS32, S32}, {{UniInVgprS32}, {Vgpr32}}}, !hasSALUFloat);
585616

586617
using namespace Intrinsic;
587618

619+
addRulesForIOpcs({amdgcn_s_getpc}).Any({{UniS64, _}, {{Sgpr64}, {None}}});
620+
588621
// This is an "intrinsic lane mask"; it was set to i32/i64 in LLVM IR.
589622
addRulesForIOpcs({amdgcn_end_cf}).Any({{_, S32}, {{}, {None, Sgpr32}}});
590623

591624
addRulesForIOpcs({amdgcn_if_break}, Standard)
592625
.Uni(S32, {{Sgpr32}, {IntrId, Vcc, Sgpr32}});
593626

627+
addRulesForIOpcs({amdgcn_mbcnt_lo, amdgcn_mbcnt_hi}, Standard)
628+
.Div(S32, {{}, {Vgpr32, None, Vgpr32, Vgpr32}});
629+
630+
addRulesForIOpcs({amdgcn_readfirstlane})
631+
.Any({{UniS32, _, DivS32}, {{}, {Sgpr32, None, Vgpr32}}});
632+
594633
} // end initialize rules

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,16 +50,19 @@ enum UniformityLLTOpPredicateID {
5050
DivS64,
5151

5252
// pointers
53+
P0,
5354
P1,
5455
P3,
5556
P4,
5657
P5,
5758

59+
UniP0,
5860
UniP1,
5961
UniP3,
6062
UniP4,
6163
UniP5,
6264

65+
DivP0,
6366
DivP1,
6467
DivP3,
6568
DivP4,
@@ -124,6 +127,7 @@ enum RegBankLLTMappingApplyID {
124127
// vgpr scalars, pointers, vectors and B-types
125128
Vgpr32,
126129
Vgpr64,
130+
VgprP0,
127131
VgprP1,
128132
VgprP3,
129133
VgprP4,
@@ -162,6 +166,7 @@ enum RegBankLLTMappingApplyID {
162166
// vgpr. Lower it to two S32 vgpr ANDs.
163167
enum LoweringMethodID {
164168
DoNotLower,
169+
VccExtToSel,
165170
UniExtToSel,
166171
VgprToVccCopy,
167172
SplitTo32,

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1382,7 +1382,11 @@ bool GCNPassConfig::addPreISel() {
13821382
// control flow modifications.
13831383
addPass(createAMDGPURewriteUndefForPHILegacyPass());
13841384

1385-
addPass(createLCSSAPass());
1385+
// SDAG requires LCSSA, GlobalISel does not. Disable LCSSA for -global-isel
1386+
// with -new-reg-bank-select and without any of the fallback options.
1387+
if (!getCGPassBuilderOption().EnableGlobalISelOption ||
1388+
!isGlobalISelAbortEnabled() || !NewRegBankSelect)
1389+
addPass(createLCSSAPass());
13861390

13871391
if (TM->getOptLevel() > CodeGenOptLevel::Less)
13881392
addPass(&AMDGPUPerfHintAnalysisLegacyID);
@@ -2086,7 +2090,9 @@ void AMDGPUCodeGenPassBuilder::addPreISel(AddIRPass &addPass) const {
20862090
// control flow modifications.
20872091
addPass(AMDGPURewriteUndefForPHIPass());
20882092

2089-
addPass(LCSSAPass());
2093+
if (!getCGPassBuilderOption().EnableGlobalISelOption ||
2094+
!isGlobalISelAbortEnabled() || !NewRegBankSelect)
2095+
addPass(LCSSAPass());
20902096

20912097
if (TM.getOptLevel() > CodeGenOptLevel::Less)
20922098
addPass(AMDGPUPerfHintAnalysisPass(TM));

0 commit comments

Comments
 (0)