Skip to content

Commit dc8b75b

Browse files
AMDGPU/GlobalISel: add RegBankLegalize rules for extends and trunc
Uniform S1: Truncs to uniform S1 and AnyExts from S1 are left as is, since they are meant to be combined away. Uniform S1 ZExt and SExt are lowered using select. Divergent S1: Trunc of VGPR to VCC is lowered as a compare. Extends of VCC are lowered using select. For the remaining types: S32 to S64 ZExt and SExt are lowered using merge values; AnyExt and Trunc are again left as is to be combined away. Notably, uniform S16 SExt and ZExt are not lowered to S32 and are left as is for instruction selection to deal with, because there are patterns that check for the S16 type.
1 parent 7902e9b commit dc8b75b

10 files changed

+360
-183
lines changed

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,13 @@ class AMDGPURegBankLegalizeCombiner {
213213
return;
214214
}
215215

216+
if (DstTy == S64 && TruncSrcTy == S32) {
217+
B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
218+
{TruncSrc, B.buildUndef({SgprRB, S32})});
219+
cleanUpAfterCombine(MI, Trunc);
220+
return;
221+
}
222+
216223
if (DstTy == S32 && TruncSrcTy == S16) {
217224
B.buildAnyExt(Dst, TruncSrc);
218225
cleanUpAfterCombine(MI, Trunc);

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp

Lines changed: 84 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,43 @@ void RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy,
133133
MI.eraseFromParent();
134134
}
135135

136+
// Lowers an extend instruction (G_SEXT, G_ZEXT or G_ANYEXT) into a select
// between two constants, using the instruction's source operand (operand 1)
// as the select condition, and then erases the original instruction.
//
// Supported destination types:
//  - S16 / S32: a single select of the destination type.
//  - S64: a 32-bit select for the low half, merged with a high half chosen
//    by opcode.
// Any other destination type hits llvm_unreachable.
void RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) {
137+
Register Dst = MI.getOperand(0).getReg();
138+
LLT Ty = MRI.getType(Dst);
139+
Register Src = MI.getOperand(1).getReg();
140+
unsigned Opc = MI.getOpcode();
141+
// Value produced when the condition is true: all ones for a sign extend,
// 1 for every other opcode (zero extend and any extend).
int TrueExtCst = Opc == G_SEXT ? -1 : 1;
142+
if (Ty == S32 || Ty == S16) {
143+
// Note: this path does not validate the opcode; only the S64 path does.
auto True = B.buildConstant({VgprRB, Ty}, TrueExtCst);
144+
auto False = B.buildConstant({VgprRB, Ty}, 0);
145+
B.buildSelect(Dst, Src, True, False);
146+
} else if (Ty == S64) {
147+
// Build the low 32 bits with a select, then pick the high 32 bits by opcode.
auto True = B.buildConstant({VgprRB_S32}, TrueExtCst);
148+
auto False = B.buildConstant({VgprRB_S32}, 0);
149+
auto Lo = B.buildSelect({VgprRB_S32}, Src, True, False);
150+
MachineInstrBuilder Hi;
151+
switch (Opc) {
152+
case G_SEXT:
153+
// Low half is already 0 or -1, so it doubles as the sign-extended high half.
Hi = Lo;
154+
break;
155+
case G_ZEXT:
156+
// High half of a zero extend is the zero constant built above.
Hi = False;
157+
break;
158+
case G_ANYEXT:
159+
// High half of an any extend is left undefined.
Hi = B.buildUndef({VgprRB_S32});
160+
break;
161+
default:
162+
llvm_unreachable("Opcode not supported");
163+
}
164+
165+
B.buildMergeValues(Dst, {Lo.getReg(0), Hi.getReg(0)});
166+
} else {
167+
llvm_unreachable("Type not supported");
168+
}
169+
170+
// The replacement sequence now defines Dst, so the original extend is removed.
MI.eraseFromParent();
171+
}
172+
136173
static bool isSignedBFE(MachineInstr &MI) {
137174
if (GIntrinsic *GI = dyn_cast<GIntrinsic>(&MI))
138175
return (GI->is(Intrinsic::amdgcn_sbfe));
@@ -256,26 +293,8 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
256293
switch (Mapping.LoweringMethod) {
257294
case DoNotLower:
258295
return;
259-
case VccExtToSel: {
260-
LLT Ty = MRI.getType(MI.getOperand(0).getReg());
261-
Register Src = MI.getOperand(1).getReg();
262-
unsigned Opc = MI.getOpcode();
263-
if (Ty == S32 || Ty == S16) {
264-
auto True = B.buildConstant({VgprRB, Ty}, Opc == G_SEXT ? -1 : 1);
265-
auto False = B.buildConstant({VgprRB, Ty}, 0);
266-
B.buildSelect(MI.getOperand(0).getReg(), Src, True, False);
267-
}
268-
if (Ty == S64) {
269-
auto True = B.buildConstant({VgprRB, S32}, Opc == G_SEXT ? -1 : 1);
270-
auto False = B.buildConstant({VgprRB, S32}, 0);
271-
auto Sel = B.buildSelect({VgprRB, S32}, Src, True, False);
272-
B.buildMergeValues(
273-
MI.getOperand(0).getReg(),
274-
{Sel.getReg(0), Opc == G_SEXT ? Sel.getReg(0) : False.getReg(0)});
275-
}
276-
MI.eraseFromParent();
277-
return;
278-
}
296+
case VccExtToSel:
297+
return lowerVccExtToSel(MI);
279298
case UniExtToSel: {
280299
LLT Ty = MRI.getType(MI.getOperand(0).getReg());
281300
auto True = B.buildConstant({SgprRB, Ty},
@@ -292,13 +311,23 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
292311
case Ext32To64: {
293312
const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
294313
MachineInstrBuilder Hi;
295-
296-
if (MI.getOpcode() == AMDGPU::G_ZEXT) {
314+
switch (MI.getOpcode()) {
315+
case AMDGPU::G_ZEXT: {
297316
Hi = B.buildConstant({RB, S32}, 0);
298-
} else {
317+
break;
318+
}
319+
case AMDGPU::G_SEXT: {
299320
// Replicate sign bit from 32-bit extended part.
300321
auto ShiftAmt = B.buildConstant({RB, S32}, 31);
301322
Hi = B.buildAShr({RB, S32}, MI.getOperand(1).getReg(), ShiftAmt);
323+
break;
324+
}
325+
case AMDGPU::G_ANYEXT: {
326+
Hi = B.buildUndef({RB, S32});
327+
break;
328+
}
329+
default:
330+
llvm_unreachable("Unsuported Opcode in Ext32To64");
302331
}
303332

304333
B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
@@ -321,7 +350,7 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
321350
// compares all bits in register.
322351
Register BoolSrc = MRI.createVirtualRegister({VgprRB, Ty});
323352
if (Ty == S64) {
324-
auto Src64 = B.buildUnmerge({VgprRB, Ty}, Src);
353+
auto Src64 = B.buildUnmerge(VgprRB_S32, Src);
325354
auto One = B.buildConstant(VgprRB_S32, 1);
326355
auto AndLo = B.buildAnd(VgprRB_S32, Src64.getReg(0), One);
327356
auto Zero = B.buildConstant(VgprRB_S32, 0);
@@ -409,8 +438,11 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
409438
case Sgpr32AExt:
410439
case Sgpr32AExtBoolInReg:
411440
case Sgpr32SExt:
441+
case Sgpr32ZExt:
412442
case UniInVgprS32:
413443
case Vgpr32:
444+
case Vgpr32SExt:
445+
case Vgpr32ZExt:
414446
return LLT::scalar(32);
415447
case Sgpr64:
416448
case Vgpr64:
@@ -521,6 +553,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
521553
case Sgpr32AExt:
522554
case Sgpr32AExtBoolInReg:
523555
case Sgpr32SExt:
556+
case Sgpr32ZExt:
524557
return SgprRB;
525558
case Vgpr16:
526559
case Vgpr32:
@@ -537,6 +570,8 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
537570
case VgprB128:
538571
case VgprB256:
539572
case VgprB512:
573+
case Vgpr32SExt:
574+
case Vgpr32ZExt:
540575
return VgprRB;
541576
default:
542577
return nullptr;
@@ -742,8 +777,8 @@ void RegBankLegalizeHelper::applyMappingSrc(
742777
assert(Ty.getSizeInBits() == 1);
743778
assert(RB == SgprRB);
744779
auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
745-
// Zext SgprS1 is not legal, this instruction is most of times meant to be
746-
// combined away in RB combiner, so do not make AND with 1.
780+
// Zext SgprS1 is not legal, make AND with 1 instead. This instruction is
781+
// most of times meant to be combined away in AMDGPURegBankCombiner.
747782
auto Cst1 = B.buildConstant(SgprRB_S32, 1);
748783
auto BoolInReg = B.buildAnd(SgprRB_S32, Aext, Cst1);
749784
Op.setReg(BoolInReg.getReg(0));
@@ -756,6 +791,29 @@ void RegBankLegalizeHelper::applyMappingSrc(
756791
Op.setReg(Sext.getReg(0));
757792
break;
758793
}
794+
case Sgpr32ZExt: {
795+
assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
796+
assert(RB == SgprRB);
797+
auto Zext = B.buildZExt({SgprRB, S32}, Reg);
798+
Op.setReg(Zext.getReg(0));
799+
break;
800+
}
801+
case Vgpr32SExt: {
802+
// Note this ext allows S1, and it is meant to be combined away.
803+
assert(Ty.getSizeInBits() < 32);
804+
assert(RB == VgprRB);
805+
auto Sext = B.buildSExt({VgprRB, S32}, Reg);
806+
Op.setReg(Sext.getReg(0));
807+
break;
808+
}
809+
case Vgpr32ZExt: {
810+
// Note this ext allows S1, and it is meant to be combined away.
811+
assert(Ty.getSizeInBits() < 32);
812+
assert(RB == VgprRB);
813+
auto Zext = B.buildZExt({VgprRB, S32}, Reg);
814+
Op.setReg(Zext.getReg(0));
815+
break;
816+
}
759817
default:
760818
llvm_unreachable("ID not supported");
761819
}

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ class RegBankLegalizeHelper {
110110
void lower(MachineInstr &MI, const RegBankLLTMapping &Mapping,
111111
SmallSet<Register, 4> &SgprWaterfallOperandRegs);
112112

113+
void lowerVccExtToSel(MachineInstr &MI);
113114
void lowerV_BFE(MachineInstr &MI);
114115
void lowerS_BFE(MachineInstr &MI);
115116
void lowerSplitTo32(MachineInstr &MI);

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp

Lines changed: 43 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -489,22 +489,61 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
489489
.Uni(B32, {{SgprB32}, {Sgpr32AExtBoolInReg, SgprB32, SgprB32}});
490490

491491
addRulesForGOpcs({G_ANYEXT})
492+
.Any({{UniS16, S1}, {{None}, {None}}}) // should be combined away
492493
.Any({{UniS32, S1}, {{None}, {None}}}) // should be combined away
493-
.Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}});
494+
.Any({{UniS64, S1}, {{None}, {None}}}) // should be combined away
495+
.Any({{DivS16, S1}, {{Vgpr16}, {Vcc}, VccExtToSel}})
496+
.Any({{DivS32, S1}, {{Vgpr32}, {Vcc}, VccExtToSel}})
497+
.Any({{DivS64, S1}, {{Vgpr64}, {Vcc}, VccExtToSel}})
498+
.Any({{UniS64, S32}, {{Sgpr64}, {Sgpr32}, Ext32To64}})
499+
.Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}, Ext32To64}})
500+
.Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}})
501+
.Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}});
494502

495503
// In global-isel G_TRUNC in-reg is treated as no-op, inst selected into COPY.
496504
// It is up to user to deal with truncated bits.
497505
addRulesForGOpcs({G_TRUNC})
506+
.Any({{UniS1, UniS16}, {{None}, {None}}}) // should be combined away
498507
.Any({{UniS1, UniS32}, {{None}, {None}}}) // should be combined away
508+
.Any({{UniS1, UniS64}, {{None}, {None}}}) // should be combined away
499509
.Any({{UniS16, S32}, {{Sgpr16}, {Sgpr32}}})
510+
.Any({{DivS16, S32}, {{Vgpr16}, {Vgpr32}}})
511+
.Any({{UniS32, S64}, {{Sgpr32}, {Sgpr64}}})
512+
.Any({{DivS32, S64}, {{Vgpr32}, {Vgpr64}}})
500513
// This is non-trivial. VgprToVccCopy is done using compare instruction.
501-
.Any({{DivS1, DivS32}, {{Vcc}, {Vgpr32}, VgprToVccCopy}});
514+
.Any({{DivS1, DivS16}, {{Vcc}, {Vgpr16}, VgprToVccCopy}})
515+
.Any({{DivS1, DivS32}, {{Vcc}, {Vgpr32}, VgprToVccCopy}})
516+
.Any({{DivS1, DivS64}, {{Vcc}, {Vgpr64}, VgprToVccCopy}});
502517

503-
addRulesForGOpcs({G_ZEXT, G_SEXT})
518+
addRulesForGOpcs({G_ZEXT})
519+
.Any({{UniS16, S1}, {{Sgpr32Trunc}, {Sgpr32AExtBoolInReg}, UniExtToSel}})
520+
.Any({{UniS32, S1}, {{Sgpr32}, {Sgpr32AExtBoolInReg}, UniExtToSel}})
521+
.Any({{UniS64, S1}, {{Sgpr64}, {Sgpr32AExtBoolInReg}, UniExtToSel}})
522+
.Any({{DivS16, S1}, {{Vgpr16}, {Vcc}, VccExtToSel}})
523+
.Any({{DivS32, S1}, {{Vgpr32}, {Vcc}, VccExtToSel}})
524+
.Any({{DivS64, S1}, {{Vgpr64}, {Vcc}, VccExtToSel}})
525+
.Any({{UniS64, S32}, {{Sgpr64}, {Sgpr32}, Ext32To64}})
526+
.Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}, Ext32To64}})
527+
// not extending S16 to S32 is questionable.
528+
.Any({{UniS64, S16}, {{Sgpr64}, {Sgpr32ZExt}, Ext32To64}})
529+
.Any({{DivS64, S16}, {{Vgpr64}, {Vgpr32ZExt}, Ext32To64}})
530+
.Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}})
531+
.Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}});
532+
533+
addRulesForGOpcs({G_SEXT})
534+
.Any({{UniS16, S1}, {{Sgpr32Trunc}, {Sgpr32AExtBoolInReg}, UniExtToSel}})
504535
.Any({{UniS32, S1}, {{Sgpr32}, {Sgpr32AExtBoolInReg}, UniExtToSel}})
536+
.Any({{UniS64, S1}, {{Sgpr64}, {Sgpr32AExtBoolInReg}, UniExtToSel}})
537+
.Any({{DivS16, S1}, {{Vgpr16}, {Vcc}, VccExtToSel}})
505538
.Any({{DivS32, S1}, {{Vgpr32}, {Vcc}, VccExtToSel}})
539+
.Any({{DivS64, S1}, {{Vgpr64}, {Vcc}, VccExtToSel}})
506540
.Any({{UniS64, S32}, {{Sgpr64}, {Sgpr32}, Ext32To64}})
507-
.Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}, Ext32To64}});
541+
.Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}, Ext32To64}})
542+
// not extending S16 to S32 is questionable.
543+
.Any({{UniS64, S16}, {{Sgpr64}, {Sgpr32SExt}, Ext32To64}})
544+
.Any({{DivS64, S16}, {{Vgpr64}, {Vgpr32SExt}, Ext32To64}})
545+
.Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}})
546+
.Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}});
508547

509548
bool hasUnalignedLoads = ST->getGeneration() >= AMDGPUSubtarget::GFX12;
510549
bool hasSMRDSmall = ST->hasScalarSubwordLoads();

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,9 @@ enum RegBankLLTMappingApplyID {
159159
Sgpr32AExt,
160160
Sgpr32AExtBoolInReg,
161161
Sgpr32SExt,
162+
Sgpr32ZExt,
163+
Vgpr32SExt,
164+
Vgpr32ZExt,
162165
};
163166

164167
// Instruction needs to be replaced with sequence of instructions. Lowering was

0 commit comments

Comments
 (0)