Skip to content

Commit fdca6f3

Browse files
AMDGPU/GlobalISel: add RegBankLegalize rules for extends and trunc
Uniform S1: Truncs to uniform S1 and AnyExts from S1 are left as is, since they are meant to be combined away. Uniform S1 ZExt and SExt are lowered using select. Divergent S1: Trunc of VGPR to VCC is lowered as a compare. Extends of VCC are lowered using select. For the remaining types: S32 to S64 ZExt and SExt are lowered using merge values; AnyExt and Trunc are again left as is to be combined away. Notably, uniform S16 SExt and ZExt are not lowered to S32 and are left as is for instruction selection to deal with, because there are patterns that check for the S16 type.
1 parent 4e74a7e commit fdca6f3

10 files changed

+358
-183
lines changed

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,13 @@ class AMDGPURegBankLegalizeCombiner {
216216
return;
217217
}
218218

219+
if (DstTy == S64 && TruncSrcTy == S32) {
220+
B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
221+
{TruncSrc, B.buildUndef({SgprRB, S32})});
222+
cleanUpAfterCombine(MI, Trunc);
223+
return;
224+
}
225+
219226
if (DstTy == S32 && TruncSrcTy == S16) {
220227
B.buildAnyExt(Dst, TruncSrc);
221228
cleanUpAfterCombine(MI, Trunc);

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp

Lines changed: 82 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,41 @@ void RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy,
131131
MI.eraseFromParent();
132132
}
133133

134+
// Lowers an extend instruction (G_SEXT / G_ZEXT / G_ANYEXT) into a G_SELECT
// that uses the extend's source operand as the select condition and picks
// between two constants built on VgprRB. The original instruction is erased.
//
// \param MI the extend instruction to replace; operand 0 is the destination
//           and operand 1 is the source.
//
// NOTE(review): when the destination type is none of S16, S32 or S64, no
// replacement is built but MI is still erased - confirm callers only reach
// this lowering with those destination types.
void RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) {
135+
Register Dst = MI.getOperand(0).getReg();
136+
LLT Ty = MRI.getType(Dst);
137+
Register Src = MI.getOperand(1).getReg();
138+
unsigned Opc = MI.getOpcode();
139+
// 16- and 32-bit destinations: a single select directly into Dst.
if (Ty == S32 || Ty == S16) {
140+
// "True" value is all ones (-1) for G_SEXT and 1 for every other opcode.
auto True = B.buildConstant({VgprRB, Ty}, Opc == G_SEXT ? -1 : 1);
141+
auto False = B.buildConstant({VgprRB, Ty}, 0);
142+
B.buildSelect(Dst, Src, True, False);
143+
}
144+
// 64-bit destination: select the low 32 bits, choose the high 32 bits per
// opcode, then merge both halves into Dst.
if (Ty == S64) {
145+
auto True = B.buildConstant({VgprRB, S32}, Opc == G_SEXT ? -1 : 1);
146+
auto False = B.buildConstant({VgprRB, S32}, 0);
147+
auto Lo = B.buildSelect({VgprRB, S32}, Src, True, False);
148+
MachineInstrBuilder Hi;
149+
// High half: reuse Lo for G_SEXT, the zero constant for G_ZEXT, and an
// undef value for G_ANYEXT.
switch (Opc) {
150+
case G_SEXT:
151+
Hi = Lo;
152+
break;
153+
case G_ZEXT:
154+
Hi = False;
155+
break;
156+
case G_ANYEXT:
157+
Hi = B.buildUndef({VgprRB_S32});
158+
break;
159+
default:
160+
llvm_unreachable("Opcode not supported");
161+
}
162+
163+
B.buildMergeValues(Dst, {Lo.getReg(0), Hi.getReg(0)});
164+
}
165+
MI.eraseFromParent();
166+
return;
167+
}
168+
134169
bool isSignedBFE(MachineInstr &MI) {
135170
unsigned Opc =
136171
isa<GIntrinsic>(MI) ? MI.getOperand(1).getIntrinsicID() : MI.getOpcode();
@@ -265,26 +300,8 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
265300
switch (Mapping.LoweringMethod) {
266301
case DoNotLower:
267302
return;
268-
case VccExtToSel: {
269-
LLT Ty = MRI.getType(MI.getOperand(0).getReg());
270-
Register Src = MI.getOperand(1).getReg();
271-
unsigned Opc = MI.getOpcode();
272-
if (Ty == S32 || Ty == S16) {
273-
auto True = B.buildConstant({VgprRB, Ty}, Opc == G_SEXT ? -1 : 1);
274-
auto False = B.buildConstant({VgprRB, Ty}, 0);
275-
B.buildSelect(MI.getOperand(0).getReg(), Src, True, False);
276-
}
277-
if (Ty == S64) {
278-
auto True = B.buildConstant({VgprRB, S32}, Opc == G_SEXT ? -1 : 1);
279-
auto False = B.buildConstant({VgprRB, S32}, 0);
280-
auto Sel = B.buildSelect({VgprRB, S32}, Src, True, False);
281-
B.buildMergeValues(
282-
MI.getOperand(0).getReg(),
283-
{Sel.getReg(0), Opc == G_SEXT ? Sel.getReg(0) : False.getReg(0)});
284-
}
285-
MI.eraseFromParent();
286-
return;
287-
}
303+
case VccExtToSel:
304+
return lowerVccExtToSel(MI);
288305
case UniExtToSel: {
289306
LLT Ty = MRI.getType(MI.getOperand(0).getReg());
290307
auto True = B.buildConstant({SgprRB, Ty},
@@ -301,13 +318,23 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
301318
case Ext32To64: {
302319
const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg());
303320
MachineInstrBuilder Hi;
304-
305-
if (MI.getOpcode() == AMDGPU::G_ZEXT) {
321+
switch (MI.getOpcode()) {
322+
case AMDGPU::G_ZEXT: {
306323
Hi = B.buildConstant({RB, S32}, 0);
307-
} else {
324+
break;
325+
}
326+
case AMDGPU::G_SEXT: {
308327
// Replicate sign bit from 32-bit extended part.
309328
auto ShiftAmt = B.buildConstant({RB, S32}, 31);
310329
Hi = B.buildAShr({RB, S32}, MI.getOperand(1).getReg(), ShiftAmt);
330+
break;
331+
}
332+
case AMDGPU::G_ANYEXT: {
333+
Hi = B.buildUndef({RB, S32});
334+
break;
335+
}
336+
default:
337+
llvm_unreachable("Unsuported Opcode in Ext32To64");
311338
}
312339

313340
B.buildMergeLikeInstr(MI.getOperand(0).getReg(),
@@ -330,7 +357,7 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
330357
// compares all bits in register.
331358
Register BoolSrc = MRI.createVirtualRegister({VgprRB, Ty});
332359
if (Ty == S64) {
333-
auto Src64 = B.buildUnmerge({VgprRB, Ty}, Src);
360+
auto Src64 = B.buildUnmerge(VgprRB_S32, Src);
334361
auto One = B.buildConstant(VgprRB_S32, 1);
335362
auto AndLo = B.buildAnd(VgprRB_S32, Src64.getReg(0), One);
336363
auto Zero = B.buildConstant(VgprRB_S32, 0);
@@ -418,8 +445,11 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
418445
case Sgpr32AExt:
419446
case Sgpr32AExtBoolInReg:
420447
case Sgpr32SExt:
448+
case Sgpr32ZExt:
421449
case UniInVgprS32:
422450
case Vgpr32:
451+
case Vgpr32SExt:
452+
case Vgpr32ZExt:
423453
return LLT::scalar(32);
424454
case Sgpr64:
425455
case Vgpr64:
@@ -530,6 +560,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
530560
case Sgpr32AExt:
531561
case Sgpr32AExtBoolInReg:
532562
case Sgpr32SExt:
563+
case Sgpr32ZExt:
533564
return SgprRB;
534565
case Vgpr16:
535566
case Vgpr32:
@@ -546,6 +577,8 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
546577
case VgprB128:
547578
case VgprB256:
548579
case VgprB512:
580+
case Vgpr32SExt:
581+
case Vgpr32ZExt:
549582
return VgprRB;
550583
default:
551584
return nullptr;
@@ -751,8 +784,8 @@ void RegBankLegalizeHelper::applyMappingSrc(
751784
assert(Ty.getSizeInBits() == 1);
752785
assert(RB == SgprRB);
753786
auto Aext = B.buildAnyExt(SgprRB_S32, Reg);
754-
// Zext SgprS1 is not legal, this instruction is most of times meant to be
755-
// combined away in RB combiner, so do not make AND with 1.
787+
// Zext SgprS1 is not legal, so make an AND with 1 instead. This instruction
788+
// is most of the time meant to be combined away in AMDGPURegBankCombiner.
756789
auto Cst1 = B.buildConstant(SgprRB_S32, 1);
757790
auto BoolInReg = B.buildAnd(SgprRB_S32, Aext, Cst1);
758791
Op.setReg(BoolInReg.getReg(0));
@@ -765,6 +798,29 @@ void RegBankLegalizeHelper::applyMappingSrc(
765798
Op.setReg(Sext.getReg(0));
766799
break;
767800
}
801+
case Sgpr32ZExt: {
802+
assert(1 < Ty.getSizeInBits() && Ty.getSizeInBits() < 32);
803+
assert(RB == SgprRB);
804+
auto Zext = B.buildZExt({SgprRB, S32}, Reg);
805+
Op.setReg(Zext.getReg(0));
806+
break;
807+
}
808+
case Vgpr32SExt: {
809+
// Note this ext allows S1, and it is meant to be combined away.
810+
assert(Ty.getSizeInBits() < 32);
811+
assert(RB == VgprRB);
812+
auto Sext = B.buildSExt({VgprRB, S32}, Reg);
813+
Op.setReg(Sext.getReg(0));
814+
break;
815+
}
816+
case Vgpr32ZExt: {
817+
// Note this ext allows S1, and it is meant to be combined away.
818+
assert(Ty.getSizeInBits() < 32);
819+
assert(RB == VgprRB);
820+
auto Zext = B.buildZExt({VgprRB, S32}, Reg);
821+
Op.setReg(Zext.getReg(0));
822+
break;
823+
}
768824
default:
769825
llvm_unreachable("ID not supported");
770826
}

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ class RegBankLegalizeHelper {
109109
void lower(MachineInstr &MI, const RegBankLLTMapping &Mapping,
110110
SmallSet<Register, 4> &SgprWaterfallOperandRegs);
111111

112+
void lowerVccExtToSel(MachineInstr &MI);
112113
void lowerDiv_BFE(MachineInstr &MI);
113114
void lowerUni_BFE(MachineInstr &MI);
114115
void lowerSplitTo32(MachineInstr &MI);

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp

Lines changed: 43 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -489,22 +489,61 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
489489
.Uni(B32, {{SgprB32}, {Sgpr32AExtBoolInReg, SgprB32, SgprB32}});
490490

491491
addRulesForGOpcs({G_ANYEXT})
492+
.Any({{UniS16, S1}, {{None}, {None}}}) // should be combined away
492493
.Any({{UniS32, S1}, {{None}, {None}}}) // should be combined away
493-
.Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}});
494+
.Any({{UniS64, S1}, {{None}, {None}}}) // should be combined away
495+
.Any({{{DivS16, S1}}, {{Vgpr16}, {Vcc}, VccExtToSel}})
496+
.Any({{{DivS32, S1}}, {{Vgpr32}, {Vcc}, VccExtToSel}})
497+
.Any({{{DivS64, S1}}, {{Vgpr64}, {Vcc}, VccExtToSel}})
498+
.Any({{UniS64, S32}, {{Sgpr64}, {Sgpr32}, Ext32To64}})
499+
.Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}, Ext32To64}})
500+
.Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}})
501+
.Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}});
494502

495503
// In global-isel G_TRUNC in-reg is treated as no-op, inst selected into COPY.
496504
// It is up to user to deal with truncated bits.
497505
addRulesForGOpcs({G_TRUNC})
506+
.Any({{UniS1, UniS16}, {{None}, {None}}}) // should be combined away
498507
.Any({{UniS1, UniS32}, {{None}, {None}}}) // should be combined away
508+
.Any({{UniS1, UniS64}, {{None}, {None}}}) // should be combined away
499509
.Any({{UniS16, S32}, {{Sgpr16}, {Sgpr32}}})
510+
.Any({{DivS16, S32}, {{Vgpr16}, {Vgpr32}}})
511+
.Any({{UniS32, S64}, {{Sgpr32}, {Sgpr64}}})
512+
.Any({{DivS32, S64}, {{Vgpr32}, {Vgpr64}}})
500513
// This is non-trivial. VgprToVccCopy is done using compare instruction.
501-
.Any({{DivS1, DivS32}, {{Vcc}, {Vgpr32}, VgprToVccCopy}});
514+
.Any({{DivS1, DivS16}, {{Vcc}, {Vgpr16}, VgprToVccCopy}})
515+
.Any({{DivS1, DivS32}, {{Vcc}, {Vgpr32}, VgprToVccCopy}})
516+
.Any({{DivS1, DivS64}, {{Vcc}, {Vgpr64}, VgprToVccCopy}});
502517

503-
addRulesForGOpcs({G_ZEXT, G_SEXT})
518+
addRulesForGOpcs({G_ZEXT})
519+
.Any({{UniS16, S1}, {{Sgpr32Trunc}, {Sgpr32AExtBoolInReg}, UniExtToSel}})
520+
.Any({{UniS32, S1}, {{Sgpr32}, {Sgpr32AExtBoolInReg}, UniExtToSel}})
521+
.Any({{UniS64, S1}, {{Sgpr64}, {Sgpr32AExtBoolInReg}, UniExtToSel}})
522+
.Any({{DivS16, S1}, {{Vgpr16}, {Vcc}, VccExtToSel}})
523+
.Any({{DivS32, S1}, {{Vgpr32}, {Vcc}, VccExtToSel}})
524+
.Any({{DivS64, S1}, {{Vgpr64}, {Vcc}, VccExtToSel}})
525+
.Any({{UniS64, S32}, {{Sgpr64}, {Sgpr32}, Ext32To64}})
526+
.Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}, Ext32To64}})
527+
// not extending S16 to S32 is questionable.
528+
.Any({{UniS64, S16}, {{Sgpr64}, {Sgpr32ZExt}, Ext32To64}})
529+
.Any({{DivS64, S16}, {{Vgpr64}, {Vgpr32ZExt}, Ext32To64}})
530+
.Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}})
531+
.Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}});
532+
533+
addRulesForGOpcs({G_SEXT})
534+
.Any({{UniS16, S1}, {{Sgpr32Trunc}, {Sgpr32AExtBoolInReg}, UniExtToSel}})
504535
.Any({{UniS32, S1}, {{Sgpr32}, {Sgpr32AExtBoolInReg}, UniExtToSel}})
536+
.Any({{UniS64, S1}, {{Sgpr64}, {Sgpr32AExtBoolInReg}, UniExtToSel}})
537+
.Any({{DivS16, S1}, {{Vgpr16}, {Vcc}, VccExtToSel}})
505538
.Any({{DivS32, S1}, {{Vgpr32}, {Vcc}, VccExtToSel}})
539+
.Any({{DivS64, S1}, {{Vgpr64}, {Vcc}, VccExtToSel}})
506540
.Any({{UniS64, S32}, {{Sgpr64}, {Sgpr32}, Ext32To64}})
507-
.Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}, Ext32To64}});
541+
.Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}, Ext32To64}})
542+
// not extending S16 to S32 is questionable.
543+
.Any({{UniS64, S16}, {{Sgpr64}, {Sgpr32SExt}, Ext32To64}})
544+
.Any({{DivS64, S16}, {{Vgpr64}, {Vgpr32SExt}, Ext32To64}})
545+
.Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}})
546+
.Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}});
508547

509548
bool hasUnalignedLoads = ST->getGeneration() >= AMDGPUSubtarget::GFX12;
510549
bool hasSMRDSmall = ST->hasScalarSubwordLoads();

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,9 @@ enum RegBankLLTMappingApplyID {
159159
Sgpr32AExt,
160160
Sgpr32AExtBoolInReg,
161161
Sgpr32SExt,
162+
Sgpr32ZExt,
163+
Vgpr32SExt,
164+
Vgpr32ZExt,
162165
};
163166

164167
// Instruction needs to be replaced with sequence of instructions. Lowering was

0 commit comments

Comments
 (0)