Skip to content

Commit cd3d069

Browse files
AMDGPU/GlobalISel: Disable LCSSA pass
Disable the LCSSA pass in preparation for implementing temporal divergence lowering in AMDGPU divergence lowering. This breaks all cases where sgpr or i1 values are used outside of a cycle with a divergent exit. Regenerate the regression tests for AMDGPU divergence lowering with LCSSA disabled and switch them to the new reg bank select. Also add the regbanklegalize rules required for these tests to pass. Update IntrinsicLaneMaskAnalyzer to stop tracking LCSSA phis that are lane masks.
1 parent 77d3f8a commit cd3d069

20 files changed

+1692
-1514
lines changed

llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -91,25 +91,17 @@ void IntrinsicLaneMaskAnalyzer::initLaneMaskIntrinsics(MachineFunction &MF) {
9191
GIntrinsic *GI = dyn_cast<GIntrinsic>(&MI);
9292
if (GI && GI->is(Intrinsic::amdgcn_if_break)) {
9393
S32S64LaneMask.insert(MI.getOperand(3).getReg());
94-
findLCSSAPhi(MI.getOperand(0).getReg());
94+
S32S64LaneMask.insert(MI.getOperand(0).getReg());
9595
}
9696

9797
if (MI.getOpcode() == AMDGPU::SI_IF ||
9898
MI.getOpcode() == AMDGPU::SI_ELSE) {
99-
findLCSSAPhi(MI.getOperand(0).getReg());
99+
S32S64LaneMask.insert(MI.getOperand(0).getReg());
100100
}
101101
}
102102
}
103103
}
104104

105-
void IntrinsicLaneMaskAnalyzer::findLCSSAPhi(Register Reg) {
106-
S32S64LaneMask.insert(Reg);
107-
for (const MachineInstr &LCSSAPhi : MRI.use_instructions(Reg)) {
108-
if (LCSSAPhi.isPHI())
109-
S32S64LaneMask.insert(LCSSAPhi.getOperand(0).getReg());
110-
}
111-
}
112-
113105
static LLT getReadAnyLaneSplitTy(LLT Ty) {
114106
if (Ty.isVector()) {
115107
LLT ElTy = Ty.getElementType();

llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,6 @@ class IntrinsicLaneMaskAnalyzer {
4747

4848
private:
4949
void initLaneMaskIntrinsics(MachineFunction &MF);
50-
// This will not be needed when we turn off LCSSA for global-isel.
51-
void findLCSSAPhi(Register Reg);
5250
};
5351

5452
void buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc,

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -312,6 +312,12 @@ bool AMDGPURegBankLegalize::runOnMachineFunction(MachineFunction &MF) {
312312
}
313313

314314
// Opcodes that also support S1.
315+
if (Opc == G_FREEZE &&
316+
MRI.getType(MI->getOperand(0).getReg()) != LLT::scalar(1)) {
317+
RBLHelper.applyMappingTrivial(*MI);
318+
continue;
319+
}
320+
315321
if ((Opc == AMDGPU::G_CONSTANT || Opc == AMDGPU::G_FCONSTANT ||
316322
Opc == AMDGPU::G_IMPLICIT_DEF)) {
317323
Register Dst = MI->getOperand(0).getReg();

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,26 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
134134
switch (Mapping.LoweringMethod) {
135135
case DoNotLower:
136136
return;
137+
case VccExtToSel: {
138+
LLT Ty = MRI.getType(MI.getOperand(0).getReg());
139+
Register Src = MI.getOperand(1).getReg();
140+
unsigned Opc = MI.getOpcode();
141+
if (Ty == S32 || Ty == S16) {
142+
auto True = B.buildConstant({VgprRB, Ty}, Opc == G_SEXT ? -1 : 1);
143+
auto False = B.buildConstant({VgprRB, Ty}, 0);
144+
B.buildSelect(MI.getOperand(0).getReg(), Src, True, False);
145+
}
146+
if (Ty == S64) {
147+
auto True = B.buildConstant({VgprRB, S32}, Opc == G_SEXT ? -1 : 1);
148+
auto False = B.buildConstant({VgprRB, S32}, 0);
149+
auto Sel = B.buildSelect({VgprRB, S32}, Src, True, False);
150+
B.buildMergeValues(
151+
MI.getOperand(0).getReg(),
152+
{Sel.getReg(0), Opc == G_SEXT ? Sel.getReg(0) : False.getReg(0)});
153+
}
154+
MI.eraseFromParent();
155+
return;
156+
}
137157
case UniExtToSel: {
138158
LLT Ty = MRI.getType(MI.getOperand(0).getReg());
139159
auto True = B.buildConstant({SgprRB, Ty},
@@ -276,6 +296,8 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
276296
case Sgpr64:
277297
case Vgpr64:
278298
return LLT::scalar(64);
299+
case VgprP0:
300+
return LLT::pointer(0, 64);
279301
case SgprP1:
280302
case VgprP1:
281303
return LLT::pointer(1, 64);
@@ -383,6 +405,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
383405
return SgprRB;
384406
case Vgpr32:
385407
case Vgpr64:
408+
case VgprP0:
386409
case VgprP1:
387410
case VgprP3:
388411
case VgprP4:
@@ -425,6 +448,7 @@ void RegBankLegalizeHelper::applyMappingDst(
425448
case SgprV4S32:
426449
case Vgpr32:
427450
case Vgpr64:
451+
case VgprP0:
428452
case VgprP1:
429453
case VgprP3:
430454
case VgprP4:
@@ -555,6 +579,7 @@ void RegBankLegalizeHelper::applyMappingSrc(
555579
// vgpr scalars, pointers and vectors
556580
case Vgpr32:
557581
case Vgpr64:
582+
case VgprP0:
558583
case VgprP1:
559584
case VgprP3:
560585
case VgprP4:
@@ -653,7 +678,8 @@ void RegBankLegalizeHelper::applyMappingPHI(MachineInstr &MI) {
653678
// We accept all types that can fit in some register class.
654679
// Uniform G_PHIs have all sgpr registers.
655680
// Divergent G_PHIs have vgpr dst but inputs can be sgpr or vgpr.
656-
if (Ty == LLT::scalar(32) || Ty == LLT::pointer(4, 64)) {
681+
if (Ty == LLT::scalar(32) || Ty == LLT::pointer(1, 64) ||
682+
Ty == LLT::pointer(4, 64)) {
657683
return;
658684
}
659685

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
5050
return MRI.getType(Reg) == LLT::scalar(32);
5151
case S64:
5252
return MRI.getType(Reg) == LLT::scalar(64);
53+
case P0:
54+
return MRI.getType(Reg) == LLT::pointer(0, 64);
5355
case P1:
5456
return MRI.getType(Reg) == LLT::pointer(1, 64);
5557
case P3:
@@ -58,6 +60,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
5860
return MRI.getType(Reg) == LLT::pointer(4, 64);
5961
case P5:
6062
return MRI.getType(Reg) == LLT::pointer(5, 32);
63+
case V4S32:
64+
return MRI.getType(Reg) == LLT::fixed_vector(4, 32);
6165
case B32:
6266
return MRI.getType(Reg).getSizeInBits() == 32;
6367
case B64:
@@ -315,13 +319,15 @@ RegBankLegalizeRules::getRulesForOpc(MachineInstr &MI) const {
315319
Opc == AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS) {
316320
unsigned IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
317321
if (!IRulesAlias.contains(IntrID)) {
322+
MI.dump();
318323
LLVM_DEBUG(dbgs() << "MI: "; MI.dump(););
319324
llvm_unreachable("No rules defined for intrinsic opcode");
320325
}
321326
return IRules.at(IRulesAlias.at(IntrID));
322327
}
323328

324329
if (!GRulesAlias.contains(Opc)) {
330+
MI.dump();
325331
LLVM_DEBUG(dbgs() << "MI: "; MI.dump(););
326332
llvm_unreachable("No rules defined for generic opcode");
327333
}
@@ -431,16 +437,21 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
431437
addRulesForGOpcs({G_XOR, G_OR, G_AND}, StandardB)
432438
.Any({{UniS1}, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32AExt}}})
433439
.Any({{DivS1}, {{Vcc}, {Vcc, Vcc}}})
440+
.Div(B32, {{VgprB32}, {VgprB32, VgprB32}})
441+
.Uni(B64, {{SgprB64}, {SgprB64, SgprB64}})
434442
.Div(B64, {{VgprB64}, {VgprB64, VgprB64}, SplitTo32});
435443

436444
addRulesForGOpcs({G_SHL}, Standard)
445+
.Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
437446
.Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}})
438447
.Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});
439448

440449
// Note: we only write S1 rules for G_IMPLICIT_DEF, G_CONSTANT, G_FCONSTANT
441450
// and G_FREEZE here, rest is trivially regbankselected earlier
451+
addRulesForGOpcs({G_IMPLICIT_DEF}).Any({{UniS1}, {{Sgpr32Trunc}, {}}});
442452
addRulesForGOpcs({G_CONSTANT})
443453
.Any({{UniS1, _}, {{Sgpr32Trunc}, {None}, UniCstExt}});
454+
addRulesForGOpcs({G_FREEZE}).Any({{DivS1}, {{Vcc}, {Vcc}}});
444455

445456
addRulesForGOpcs({G_ICMP})
446457
.Any({{UniS1, _, S32}, {{Sgpr32Trunc}, {None, Sgpr32, Sgpr32}}})
@@ -471,6 +482,7 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
471482

472483
addRulesForGOpcs({G_ZEXT, G_SEXT})
473484
.Any({{UniS32, S1}, {{Sgpr32}, {Sgpr32AExtBoolInReg}, UniExtToSel}})
485+
.Any({{DivS32, S1}, {{Vgpr32}, {Vcc}, VccExtToSel}})
474486
.Any({{UniS64, S32}, {{Sgpr64}, {Sgpr32}, Ext32To64}})
475487
.Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}, Ext32To64}});
476488

@@ -528,6 +540,7 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
528540
.Any({{DivB32, DivP1}, {{VgprB32}, {VgprP1}}})
529541
.Any({{{UniB256, UniP1}, isAlign4 && isUL}, {{SgprB256}, {SgprP1}}})
530542
.Any({{{UniB512, UniP1}, isAlign4 && isUL}, {{SgprB512}, {SgprP1}}})
543+
.Any({{{UniB32, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB32}, {SgprP1}}})
531544
.Any({{{UniB256, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB256}, {VgprP1}, SplitLoad}})
532545
.Any({{{UniB512, UniP1}, !isAlign4 || !isUL}, {{UniInVgprB512}, {VgprP1}, SplitLoad}})
533546

@@ -556,15 +569,25 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
556569
// clang-format on
557570

558571
addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD}, Vector)
572+
.Div(S32, {{Vgpr32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}})
573+
.Uni(S32, {{UniInVgprS32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}})
559574
.Div(V4S32, {{VgprV4S32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}})
560575
.Uni(V4S32, {{UniInVgprV4S32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}});
561576

562577
addRulesForGOpcs({G_STORE})
578+
.Any({{S32, P0}, {{}, {Vgpr32, VgprP0}}})
563579
.Any({{S32, P1}, {{}, {Vgpr32, VgprP1}}})
564580
.Any({{S64, P1}, {{}, {Vgpr64, VgprP1}}})
565581
.Any({{V4S32, P1}, {{}, {VgprV4S32, VgprP1}}});
566582

567-
addRulesForGOpcs({G_PTR_ADD}).Any({{DivP1}, {{VgprP1}, {VgprP1, Vgpr64}}});
583+
addRulesForGOpcs({G_AMDGPU_BUFFER_STORE})
584+
.Any({{S32}, {{}, {Vgpr32, SgprV4S32, Vgpr32, Vgpr32, Sgpr32}}});
585+
586+
addRulesForGOpcs({G_PTR_ADD})
587+
.Any({{UniP1}, {{SgprP1}, {SgprP1, Sgpr64}}})
588+
.Any({{DivP1}, {{VgprP1}, {VgprP1, Vgpr64}}});
589+
590+
addRulesForGOpcs({G_INTTOPTR}).Any({{UniP4}, {{SgprP4}, {Sgpr64}}});
568591

569592
addRulesForGOpcs({G_ABS}, Standard).Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt}});
570593

@@ -585,10 +608,18 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
585608

586609
using namespace Intrinsic;
587610

611+
addRulesForIOpcs({amdgcn_s_getpc}).Any({{UniS64, _}, {{Sgpr64}, {None}}});
612+
588613
// This is "intrinsic lane mask" it was set to i32/i64 in llvm-ir.
589614
addRulesForIOpcs({amdgcn_end_cf}).Any({{_, S32}, {{}, {None, Sgpr32}}});
590615

591616
addRulesForIOpcs({amdgcn_if_break}, Standard)
592617
.Uni(S32, {{Sgpr32}, {IntrId, Vcc, Sgpr32}});
593618

619+
addRulesForIOpcs({amdgcn_mbcnt_lo, amdgcn_mbcnt_hi}, Standard)
620+
.Div(S32, {{}, {Vgpr32, None, Vgpr32, Vgpr32}});
621+
622+
addRulesForIOpcs({amdgcn_readfirstlane})
623+
.Any({{UniS32, _, DivS32}, {{}, {Sgpr32, None, Vgpr32}}});
624+
594625
} // end initialize rules

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ enum UniformityLLTOpPredicateID {
5050
DivS64,
5151

5252
// pointers
53+
P0,
5354
P1,
5455
P3,
5556
P4,
@@ -124,6 +125,7 @@ enum RegBankLLTMappingApplyID {
124125
// vgpr scalars, pointers, vectors and B-types
125126
Vgpr32,
126127
Vgpr64,
128+
VgprP0,
127129
VgprP1,
128130
VgprP3,
129131
VgprP4,
@@ -162,6 +164,7 @@ enum RegBankLLTMappingApplyID {
162164
// vgpr. Lower it to two S32 vgpr ANDs.
163165
enum LoweringMethodID {
164166
DoNotLower,
167+
VccExtToSel,
165168
UniExtToSel,
166169
VgprToVccCopy,
167170
SplitTo32,

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1374,7 +1374,11 @@ bool GCNPassConfig::addPreISel() {
13741374
// control flow modifications.
13751375
addPass(createAMDGPURewriteUndefForPHILegacyPass());
13761376

1377-
addPass(createLCSSAPass());
1377+
// SDAG requires LCSSA, GlobalISel does not. Disable LCSSA for -global-isel
1378+
// with -new-reg-bank-select and without any of the fallback options.
1379+
if (!getCGPassBuilderOption().EnableGlobalISelOption ||
1380+
!isGlobalISelAbortEnabled() || !NewRegBankSelect)
1381+
addPass(createLCSSAPass());
13781382

13791383
if (TM->getOptLevel() > CodeGenOptLevel::Less)
13801384
addPass(&AMDGPUPerfHintAnalysisLegacyID);
@@ -2072,7 +2076,9 @@ void AMDGPUCodeGenPassBuilder::addPreISel(AddIRPass &addPass) const {
20722076
// control flow modifications.
20732077
addPass(AMDGPURewriteUndefForPHIPass());
20742078

2075-
addPass(LCSSAPass());
2079+
if (!getCGPassBuilderOption().EnableGlobalISelOption ||
2080+
!isGlobalISelAbortEnabled() || !NewRegBankSelect)
2081+
addPass(LCSSAPass());
20762082

20772083
if (TM.getOptLevel() > CodeGenOptLevel::Less)
20782084
addPass(AMDGPUPerfHintAnalysisPass(TM));

0 commit comments

Comments
 (0)