Skip to content

Commit 9f92e94

Browse files
AMDGPU/GlobalISel: add RegBankLegalize rules for bit shifts and sext-inreg
Uniform S16 shifts have to be extended to S32, using the appropriate extend, before lowering to an S32 instruction. Uniform packed V2S16 shifts are lowered to SGPR S32 instructions; the other option would be to use VALU packed V2S16 instructions and ReadAnyLane. For uniform S32 and S64, and for divergent S16, S32, S64 and V2S16, there are instructions available.
1 parent 6560c53 commit 9f92e94

13 files changed

+304
-151
lines changed

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -306,7 +306,7 @@ bool AMDGPURegBankLegalize::runOnMachineFunction(MachineFunction &MF) {
306306
// Opcodes that support pretty much all combinations of reg banks and LLTs
307307
// (except S1). There is no point in writing rules for them.
308308
if (Opc == AMDGPU::G_BUILD_VECTOR || Opc == AMDGPU::G_UNMERGE_VALUES ||
309-
Opc == AMDGPU::G_MERGE_VALUES) {
309+
Opc == AMDGPU::G_MERGE_VALUES || Opc == G_BITCAST) {
310310
RBLHelper.applyMappingTrivial(*MI);
311311
continue;
312312
}

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,28 @@ void RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy,
130130
MI.eraseFromParent();
131131
}
132132

133+
// Splits a packed 32-bit value into two SgprRB_S32 halves, zero-extending
// each 16-bit half.
//
// Reg is bitcast to SgprRB_S32; presumably it holds a V2S16 (the Unpack
// lowering in this change passes the operands of a uniform V2S16 G_LSHR) -
// confirm against other callers.
//
// Returns {Lo, Hi}:
//   Lo = packed & 0xffff   (low half, upper 16 bits cleared)
//   Hi = packed >> 16      (logical shift, high half moved to the low bits)
std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(Register Reg) {
134+
// Reinterpret the incoming register as a single 32-bit scalar.
auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
135+
auto Mask = B.buildConstant(SgprRB_S32, 0x0000ffff);
136+
// Mask off the upper 16 bits so only the low half remains.
auto Lo = B.buildAnd(SgprRB_S32, PackedS32, Mask);
137+
// Logical shift right by 16 brings the high half down to the low bits.
auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
138+
return {Lo.getReg(0), Hi.getReg(0)};
139+
}
140+
141+
// Splits a packed 32-bit value into two SgprRB_S32 halves, sign-extending
// each 16-bit half.
//
// Reg is bitcast to SgprRB_S32; presumably it holds a V2S16 (the Unpack
// lowering in this change passes the operands of a uniform V2S16 G_ASHR) -
// confirm against other callers.
//
// Returns {Lo, Hi}:
//   Lo = sign-extend-in-register of the low 16 bits
//   Hi = packed >> 16      (arithmetic shift, so the high half keeps its sign)
std::pair<Register, Register> RegBankLegalizeHelper::unpackSExt(Register Reg) {
142+
// Reinterpret the incoming register as a single 32-bit scalar.
auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
143+
// Sign-extend from bit 15 to fill the upper 16 bits of the low half.
auto Lo = B.buildSExtInReg(SgprRB_S32, PackedS32, 16);
144+
// Arithmetic shift right by 16 brings the high half down, replicating its
// sign bit into the upper bits.
auto Hi = B.buildAShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
145+
return {Lo.getReg(0), Hi.getReg(0)};
146+
}
147+
148+
// Splits a packed 32-bit value into two SgprRB_S32 halves without cleaning
// up the upper bits of the low half ("any-extend").
//
// Reg is bitcast to SgprRB_S32; presumably it holds a V2S16 (the Unpack
// lowering in this change passes the operands of a uniform V2S16 G_SHL) -
// confirm against other callers.
//
// Returns {Lo, Hi}:
//   Lo = the bitcast value itself; its upper 16 bits still contain the high
//        half, so only the low 16 bits of Lo are meaningful.
//   Hi = packed >> 16      (logical shift, high half moved to the low bits)
std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(Register Reg) {
149+
// Reinterpret the incoming register as a single 32-bit scalar.
auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
150+
// No masking or extension: the low half is used as-is.
auto Lo = PackedS32;
151+
// Logical shift right by 16 brings the high half down to the low bits.
auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
152+
return {Lo.getReg(0), Hi.getReg(0)};
153+
}
154+
133155
void RegBankLegalizeHelper::lower(MachineInstr &MI,
134156
const RegBankLLTMapping &Mapping,
135157
SmallSet<Register, 4> &WaterfallSgprs) {
@@ -259,6 +281,33 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
259281
MI.eraseFromParent();
260282
break;
261283
}
284+
case SExtInRegSplitTo32: {
285+
auto Op1 = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg());
286+
int Amt = MI.getOperand(2).getImm();
287+
Register Lo, Hi;
288+
// Hi|Lo: s sign bit, ?/x bits changed/not changed by sign-extend
289+
if (Amt <= 32) {
290+
auto Freeze = B.buildFreeze(VgprRB_S32, Op1.getReg(0));
291+
if (Amt == 32) {
292+
// Hi|Lo: ????????|sxxxxxxx -> ssssssss|sxxxxxxx
293+
Lo = Freeze.getReg(0);
294+
} else {
295+
// Hi|Lo: ????????|???sxxxx -> ssssssss|ssssxxxx
296+
Lo = B.buildSExtInReg(VgprRB_S32, Freeze, Amt).getReg(0);
297+
}
298+
299+
auto SignExtCst = B.buildConstant(SgprRB_S32, 31);
300+
Hi = B.buildAShr(VgprRB_S32, Lo, SignExtCst).getReg(0);
301+
} else {
302+
// Hi|Lo: ?????sxx|xxxxxxxx -> ssssssxx|xxxxxxxx
303+
Lo = Op1.getReg(0);
304+
Hi = B.buildSExtInReg(VgprRB_S32, Op1.getReg(1), Amt - 32).getReg(0);
305+
}
306+
307+
B.buildMergeLikeInstr(MI.getOperand(0).getReg(), {Lo, Hi});
308+
MI.eraseFromParent();
309+
break;
310+
}
262311
case Div_BFE: {
263312
Register Dst = MI.getOperand(0).getReg();
264313
assert(MRI.getType(Dst) == LLT::scalar(64));
@@ -356,6 +405,37 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
356405
MI.eraseFromParent();
357406
return;
358407
}
408+
case Unpack: {
409+
Register Lo, Hi;
410+
switch (MI.getOpcode()) {
411+
case AMDGPU::G_SHL: {
412+
auto [Val0, Val1] = unpackAExt(MI.getOperand(1).getReg());
413+
auto [Amt0, Amt1] = unpackAExt(MI.getOperand(2).getReg());
414+
Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
415+
Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
416+
break;
417+
}
418+
case AMDGPU::G_LSHR: {
419+
auto [Val0, Val1] = unpackZExt(MI.getOperand(1).getReg());
420+
auto [Amt0, Amt1] = unpackZExt(MI.getOperand(2).getReg());
421+
Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
422+
Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
423+
break;
424+
}
425+
case AMDGPU::G_ASHR: {
426+
auto [Val0, Val1] = unpackSExt(MI.getOperand(1).getReg());
427+
auto [Amt0, Amt1] = unpackSExt(MI.getOperand(2).getReg());
428+
Lo = B.buildAShr(SgprRB_S32, Val0, Amt0).getReg(0);
429+
Hi = B.buildAShr(SgprRB_S32, Val1, Amt1).getReg(0);
430+
break;
431+
}
432+
default:
433+
llvm_unreachable("Unpack lowering not implemented");
434+
}
435+
B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
436+
MI.eraseFromParent();
437+
return;
438+
}
359439
case SplitLoad: {
360440
LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
361441
unsigned Size = DstTy.getSizeInBits();
@@ -445,6 +525,13 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
445525
case SgprP5:
446526
case VgprP5:
447527
return LLT::pointer(5, 32);
528+
case SgprV2S16:
529+
case VgprV2S16:
530+
case UniInVgprV2S16:
531+
return LLT::fixed_vector(2, 16);
532+
case SgprV2S32:
533+
case VgprV2S32:
534+
return LLT::fixed_vector(2, 32);
448535
case SgprV4S32:
449536
case VgprV4S32:
450537
case UniInVgprV4S32:
@@ -518,6 +605,8 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
518605
case SgprP3:
519606
case SgprP4:
520607
case SgprP5:
608+
case SgprV2S16:
609+
case SgprV2S32:
521610
case SgprV4S32:
522611
case SgprB32:
523612
case SgprB64:
@@ -527,6 +616,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
527616
case SgprB512:
528617
case UniInVcc:
529618
case UniInVgprS32:
619+
case UniInVgprV2S16:
530620
case UniInVgprV4S32:
531621
case UniInVgprB32:
532622
case UniInVgprB64:
@@ -548,6 +638,8 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
548638
case VgprP3:
549639
case VgprP4:
550640
case VgprP5:
641+
case VgprV2S16:
642+
case VgprV2S32:
551643
case VgprV4S32:
552644
case VgprB32:
553645
case VgprB64:
@@ -585,6 +677,8 @@ void RegBankLegalizeHelper::applyMappingDst(
585677
case SgprP3:
586678
case SgprP4:
587679
case SgprP5:
680+
case SgprV2S16:
681+
case SgprV2S32:
588682
case SgprV4S32:
589683
case Vgpr16:
590684
case Vgpr32:
@@ -594,6 +688,8 @@ void RegBankLegalizeHelper::applyMappingDst(
594688
case VgprP3:
595689
case VgprP4:
596690
case VgprP5:
691+
case VgprV2S16:
692+
case VgprV2S32:
597693
case VgprV4S32: {
598694
assert(Ty == getTyFromID(MethodIDs[OpIdx]));
599695
assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
@@ -628,6 +724,7 @@ void RegBankLegalizeHelper::applyMappingDst(
628724
break;
629725
}
630726
case UniInVgprS32:
727+
case UniInVgprV2S16:
631728
case UniInVgprV4S32: {
632729
assert(Ty == getTyFromID(MethodIDs[OpIdx]));
633730
assert(RB == SgprRB);
@@ -701,6 +798,8 @@ void RegBankLegalizeHelper::applyMappingSrc(
701798
case SgprP3:
702799
case SgprP4:
703800
case SgprP5:
801+
case SgprV2S16:
802+
case SgprV2S32:
704803
case SgprV4S32: {
705804
assert(Ty == getTyFromID(MethodIDs[i]));
706805
assert(RB == getRegBankFromID(MethodIDs[i]));
@@ -726,6 +825,8 @@ void RegBankLegalizeHelper::applyMappingSrc(
726825
case VgprP3:
727826
case VgprP4:
728827
case VgprP5:
828+
case VgprV2S16:
829+
case VgprV2S32:
729830
case VgprV4S32: {
730831
assert(Ty == getTyFromID(MethodIDs[i]));
731832
if (RB != VgprRB) {

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,10 @@ class RegBankLegalizeHelper {
108108

109109
void lower(MachineInstr &MI, const RegBankLLTMapping &Mapping,
110110
SmallSet<Register, 4> &SgprWaterfallOperandRegs);
111+
112+
std::pair<Register, Register> unpackZExt(Register Reg);
113+
std::pair<Register, Register> unpackSExt(Register Reg);
114+
std::pair<Register, Register> unpackAExt(Register Reg);
111115
};
112116

113117
} // end namespace AMDGPU

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp

Lines changed: 41 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
6060
return MRI.getType(Reg) == LLT::pointer(4, 64);
6161
case P5:
6262
return MRI.getType(Reg) == LLT::pointer(5, 32);
63+
case V2S32:
64+
return MRI.getType(Reg) == LLT::fixed_vector(2, 32);
6365
case V4S32:
6466
return MRI.getType(Reg) == LLT::fixed_vector(4, 32);
6567
case B32:
@@ -92,6 +94,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
9294
return MRI.getType(Reg) == LLT::pointer(4, 64) && MUI.isUniform(Reg);
9395
case UniP5:
9496
return MRI.getType(Reg) == LLT::pointer(5, 32) && MUI.isUniform(Reg);
97+
case UniV2S16:
98+
return MRI.getType(Reg) == LLT::fixed_vector(2, 16) && MUI.isUniform(Reg);
9599
case UniB32:
96100
return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isUniform(Reg);
97101
case UniB64:
@@ -122,6 +126,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
122126
return MRI.getType(Reg) == LLT::pointer(4, 64) && MUI.isDivergent(Reg);
123127
case DivP5:
124128
return MRI.getType(Reg) == LLT::pointer(5, 32) && MUI.isDivergent(Reg);
129+
case DivV2S16:
130+
return MRI.getType(Reg) == LLT::fixed_vector(2, 16) && MUI.isDivergent(Reg);
125131
case DivB32:
126132
return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isDivergent(Reg);
127133
case DivB64:
@@ -434,7 +440,7 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
434440
MachineRegisterInfo &_MRI)
435441
: ST(&_ST), MRI(&_MRI) {
436442

437-
addRulesForGOpcs({G_ADD}, Standard)
443+
addRulesForGOpcs({G_ADD, G_SUB}, Standard)
438444
.Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
439445
.Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}});
440446

@@ -451,11 +457,36 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
451457
.Div(B64, {{VgprB64}, {VgprB64, VgprB64}, SplitTo32});
452458

453459
addRulesForGOpcs({G_SHL}, Standard)
460+
.Uni(S16, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32ZExt}})
461+
.Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
462+
.Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, Unpack})
463+
.Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
464+
.Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
465+
.Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}})
454466
.Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
467+
.Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});
468+
469+
addRulesForGOpcs({G_LSHR}, Standard)
470+
.Uni(S16, {{Sgpr32Trunc}, {Sgpr32ZExt, Sgpr32ZExt}})
471+
.Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
472+
.Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, Unpack})
473+
.Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
474+
.Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
455475
.Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}})
476+
.Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
456477
.Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});
457478

458-
addRulesForGOpcs({G_LSHR}, Standard).Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}});
479+
addRulesForGOpcs({G_ASHR}, Standard)
480+
.Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt, Sgpr32ZExt}})
481+
.Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
482+
.Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, Unpack})
483+
.Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
484+
.Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
485+
.Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}})
486+
.Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
487+
.Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});
488+
489+
addRulesForGOpcs({G_FRAME_INDEX}).Any({{UniP5, _}, {{SgprP5}, {None}}});
459490

460491
addRulesForGOpcs({G_UBFX, G_SBFX}, Standard)
461492
.Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32, Sgpr32}, Uni_BFE})
@@ -514,6 +545,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
514545
.Any({{DivS16, S32}, {{Vgpr16}, {Vgpr32}}})
515546
.Any({{UniS32, S64}, {{Sgpr32}, {Sgpr64}}})
516547
.Any({{DivS32, S64}, {{Vgpr32}, {Vgpr64}}})
548+
.Any({{UniV2S16, V2S32}, {{SgprV2S16}, {SgprV2S32}}})
549+
.Any({{DivV2S16, V2S32}, {{VgprV2S16}, {VgprV2S32}}})
517550
// This is non-trivial. VgprToVccCopy is done using compare instruction.
518551
.Any({{DivS1, DivS16}, {{Vcc}, {Vgpr16}, VgprToVccCopy}})
519552
.Any({{DivS1, DivS32}, {{Vcc}, {Vgpr32}, VgprToVccCopy}})
@@ -549,6 +582,12 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
549582
.Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}})
550583
.Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}});
551584

585+
addRulesForGOpcs({G_SEXT_INREG})
586+
.Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}})
587+
.Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}})
588+
.Any({{UniS64, S64}, {{Sgpr64}, {Sgpr64}}})
589+
.Any({{DivS64, S64}, {{Vgpr64}, {Vgpr64}, SExtInRegSplitTo32}});
590+
552591
bool hasUnalignedLoads = ST->getGeneration() >= AMDGPUSubtarget::GFX12;
553592
bool hasSMRDSmall = ST->hasScalarSubwordLoads();
554593

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,10 @@ enum UniformityLLTOpPredicateID {
7575
V3S32,
7676
V4S32,
7777

78+
UniV2S16,
79+
80+
DivV2S16,
81+
7882
// B types
7983
B32,
8084
B64,
@@ -117,7 +121,9 @@ enum RegBankLLTMappingApplyID {
117121
SgprP3,
118122
SgprP4,
119123
SgprP5,
124+
SgprV2S16,
120125
SgprV4S32,
126+
SgprV2S32,
121127
SgprB32,
122128
SgprB64,
123129
SgprB96,
@@ -134,6 +140,8 @@ enum RegBankLLTMappingApplyID {
134140
VgprP3,
135141
VgprP4,
136142
VgprP5,
143+
VgprV2S16,
144+
VgprV2S32,
137145
VgprB32,
138146
VgprB64,
139147
VgprB96,
@@ -145,6 +153,7 @@ enum RegBankLLTMappingApplyID {
145153
// Dst only modifiers: read-any-lane and truncs
146154
UniInVcc,
147155
UniInVgprS32,
156+
UniInVgprV2S16,
148157
UniInVgprV4S32,
149158
UniInVgprB32,
150159
UniInVgprB64,
@@ -173,13 +182,15 @@ enum LoweringMethodID {
173182
DoNotLower,
174183
VccExtToSel,
175184
UniExtToSel,
185+
SExtInRegSplitTo32,
176186
Uni_BFE,
177187
Div_BFE,
178188
VgprToVccCopy,
179189
SplitTo32,
180190
SplitTo32Sel,
181191
Ext32To64,
182192
UniCstExt,
193+
Unpack,
183194
SplitLoad,
184195
WidenLoad,
185196
};

llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
3-
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
4-
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
5-
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
6-
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
2+
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
3+
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
4+
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
5+
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
6+
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
77

88
define i8 @v_ashr_i8(i8 %value, i8 %amount) {
99
; GFX6-LABEL: v_ashr_i8:

0 commit comments

Comments
 (0)