Skip to content

Commit 2c7c01d

Browse files
AMDGPU/GlobalISel: add RegBankLegalize rules for select
A uniform S1 condition is any-extended to S32, and its high bits are cleared using AND with 1. A divergent S1 condition uses VCC. B32/B64 rules are used to cover scalar, vector, and pointer types. Divergent B64 is split into 32-bit parts.
1 parent 649adb1 commit 2c7c01d

File tree

5 files changed

+657
-1280
lines changed

5 files changed

+657
-1280
lines changed

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#include "llvm/CodeGen/MachineInstr.h"
2424
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
2525
#include "llvm/IR/IntrinsicsAMDGPU.h"
26+
#include "llvm/Support/AMDGPUAddrSpace.h"
2627

2728
#define DEBUG_TYPE "amdgpu-regbanklegalize"
2829

@@ -286,6 +287,25 @@ void RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) {
286287
MI.eraseFromParent();
287288
}
288289

290+
// Lowers a select-like instruction MI (operand 0 = result, operand 1 =
// condition, operands 2 and 3 = the two values) by splitting the value
// operands into two pieces, selecting each piece separately with the same
// condition, and merging the two results back into the original destination.
// The destination type must be V4S16, V2S32, S64, or a 64-bit pointer.
// MI is erased once the replacement instructions have been built.
void RegBankLegalizeHelper::lowerSplitTo32Sel(MachineInstr &MI) {
291+
Register Dst = MI.getOperand(0).getReg();
292+
LLT DstTy = MRI.getType(Dst);
293+
assert(DstTy == V4S16 || DstTy == V2S32 || DstTy == S64 ||
294+
(DstTy.isPointer() && DstTy.getSizeInBits() == 64));
295+
// Piece type: V4S16 is split into V2S16 pieces; every other accepted type is
// split into S32 pieces.
LLT Ty = (DstTy == V4S16 ? V2S16 : S32);
296+
// Unmerge both value operands into pieces of type Ty on the VgprRB bank.
auto Op2 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(2).getReg());
297+
auto Op3 = B.buildUnmerge({VgprRB, Ty}, MI.getOperand(3).getReg());
298+
Register Cond = MI.getOperand(1).getReg();
299+
// The original instruction's flags are forwarded to both new selects.
auto Flags = MI.getFlags();
300+
// Select between piece 0 of each operand, then piece 1, reusing Cond.
auto Lo =
301+
B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(0), Op3.getReg(0), Flags);
302+
auto Hi =
303+
B.buildSelect({VgprRB, Ty}, Cond, Op2.getReg(1), Op3.getReg(1), Flags);
304+
305+
// Recombine the two selected pieces into the original destination register.
B.buildMergeLikeInstr(Dst, {Lo, Hi});
306+
MI.eraseFromParent();
307+
}
308+
289309
void RegBankLegalizeHelper::lower(MachineInstr &MI,
290310
const RegBankLLTMapping &Mapping,
291311
SmallSet<Register, 4> &WaterfallSgprs) {
@@ -372,6 +392,8 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
372392
return lowerS_BFE(MI);
373393
case SplitTo32:
374394
return lowerSplitTo32(MI);
395+
case SplitTo32Sel:
396+
return lowerSplitTo32Sel(MI);
375397
case SplitLoad: {
376398
LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
377399
unsigned Size = DstTy.getSizeInBits();
@@ -485,7 +507,8 @@ LLT RegBankLegalizeHelper::getBTyFromID(RegBankLLTMappingApplyID ID, LLT Ty) {
485507
case UniInVgprB64:
486508
if (Ty == LLT::scalar(64) || Ty == LLT::fixed_vector(2, 32) ||
487509
Ty == LLT::fixed_vector(4, 16) || Ty == LLT::pointer(0, 64) ||
488-
Ty == LLT::pointer(1, 64) || Ty == LLT::pointer(4, 64))
510+
Ty == LLT::pointer(1, 64) || Ty == LLT::pointer(4, 64) ||
511+
(Ty.isPointer() && Ty.getAddressSpace() > AMDGPUAS::MAX_AMDGPU_ADDRESS))
489512
return Ty;
490513
return LLT();
491514
case SgprB96:

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ class RegBankLegalizeHelper {
114114
void lowerV_BFE(MachineInstr &MI);
115115
void lowerS_BFE(MachineInstr &MI);
116116
void lowerSplitTo32(MachineInstr &MI);
117+
void lowerSplitTo32Sel(MachineInstr &MI);
117118
};
118119

119120
} // end namespace AMDGPU

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,8 @@ UniformityLLTOpPredicateID LLTToBId(LLT Ty) {
198198
return B32;
199199
if (Ty == LLT::scalar(64) || Ty == LLT::fixed_vector(2, 32) ||
200200
Ty == LLT::fixed_vector(4, 16) || Ty == LLT::pointer(1, 64) ||
201-
Ty == LLT::pointer(4, 64))
201+
Ty == LLT::pointer(4, 64) ||
202+
(Ty.isPointer() && Ty.getAddressSpace() > AMDGPUAS::MAX_AMDGPU_ADDRESS))
202203
return B64;
203204
if (Ty == LLT::fixed_vector(3, 32))
204205
return B96;
@@ -485,8 +486,12 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
485486
addRulesForGOpcs({G_BR}).Any({{_}, {{}, {None}}});
486487

487488
addRulesForGOpcs({G_SELECT}, StandardB)
489+
.Any({{DivS16}, {{Vgpr16}, {Vcc, Vgpr16, Vgpr16}}})
490+
.Any({{UniS16}, {{Sgpr16}, {Sgpr32AExtBoolInReg, Sgpr16, Sgpr16}}})
488491
.Div(B32, {{VgprB32}, {Vcc, VgprB32, VgprB32}})
489-
.Uni(B32, {{SgprB32}, {Sgpr32AExtBoolInReg, SgprB32, SgprB32}});
492+
.Uni(B32, {{SgprB32}, {Sgpr32AExtBoolInReg, SgprB32, SgprB32}})
493+
.Div(B64, {{VgprB64}, {Vcc, VgprB64, VgprB64}, SplitTo32Sel})
494+
.Uni(B64, {{SgprB64}, {Sgpr32AExtBoolInReg, SgprB64, SgprB64}});
490495

491496
addRulesForGOpcs({G_ANYEXT})
492497
.Any({{UniS16, S1}, {{None}, {None}}}) // should be combined away

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,7 @@ enum LoweringMethodID {
177177
V_BFE,
178178
VgprToVccCopy,
179179
SplitTo32,
180+
SplitTo32Sel,
180181
Ext32To64,
181182
UniCstExt,
182183
SplitLoad,

0 commit comments

Comments
 (0)