AMDGPU/GlobalISel: add RegBankLegalize rules for bit shifts and sext-inreg #132385
base: users/petar-avramovic/select
Conversation
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu

Author: Petar Avramovic (petar-avramovic)

Changes: Uniform S16 shifts have to be extended to S32 using the appropriate extend before lowering to the S32 instruction. Uniform packed V2S16 are lowered to SGPR S32 instructions; the other option is to use VALU packed V2S16 and ReadAnyLane. For uniform S32 and S64, and divergent S16, S32, S64 and V2S16, there are instructions available.

Patch is 48.59 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/132385.diff

13 Files Affected:
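To make the rules concrete, here is a minimal hand-written LLVM IR sketch of the uniform shift shapes the new rules target. It is not copied from this patch's tests; the function names are made up, and the amdgpu_ps/inreg signature (which is what makes the operands uniform SGPRs) is modeled on the existing s_lshr_v2i16 test:

; Uniform i16 shift: inreg arguments of an amdgpu_ps function are SGPRs, so the
; new G_SHL rule any-extends the value and zero-extends the amount to s32,
; shifts in s32, and truncates the result back to s16.
define amdgpu_ps i32 @s_shl_i16_sketch(i16 inreg %value, i16 inreg %amount) {
  %result = shl i16 %value, %amount
  %zext = zext i16 %result to i32
  ret i32 %zext
}

; Uniform <2 x i16> shift: lowered with the Unpack method into two s32 shifts
; whose results are repacked (the s_pack_ll_b32_b16 sequence in the test diff below).
define amdgpu_ps i32 @s_lshr_v2i16_sketch(<2 x i16> inreg %value, <2 x i16> inreg %amount) {
  %result = lshr <2 x i16> %value, %amount
  %cast = bitcast <2 x i16> %result to i32
  ret i32 %cast
}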
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
index 44f1b5419abb9..4fd776bec9492 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp
@@ -23,6 +23,7 @@
#include "GCNSubtarget.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
@@ -306,7 +307,7 @@ bool AMDGPURegBankLegalize::runOnMachineFunction(MachineFunction &MF) {
// Opcodes that support pretty much all combinations of reg banks and LLTs
// (except S1). There is no point in writing rules for them.
if (Opc == AMDGPU::G_BUILD_VECTOR || Opc == AMDGPU::G_UNMERGE_VALUES ||
- Opc == AMDGPU::G_MERGE_VALUES) {
+ Opc == AMDGPU::G_MERGE_VALUES || Opc == G_BITCAST) {
RBLHelper.applyMappingTrivial(*MI);
continue;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
index 0f5f3545ac8eb..59f16315bbd72 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
@@ -14,13 +14,16 @@
#include "AMDGPURegBankLegalizeHelper.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
+#include "AMDGPURegBankLegalizeRules.h"
#include "AMDGPURegisterBankInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/Support/ErrorHandling.h"
#define DEBUG_TYPE "amdgpu-regbanklegalize"
@@ -130,6 +133,28 @@ void RegBankLegalizeHelper::widenLoad(MachineInstr &MI, LLT WideTy,
MI.eraseFromParent();
}
+std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(Register Reg) {
+ auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
+ auto Mask = B.buildConstant(SgprRB_S32, 0x0000ffff);
+ auto Lo = B.buildAnd(SgprRB_S32, PackedS32, Mask);
+ auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
+ return {Lo.getReg(0), Hi.getReg(0)};
+}
+
+std::pair<Register, Register> RegBankLegalizeHelper::unpackSExt(Register Reg) {
+ auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
+ auto Lo = B.buildSExtInReg(SgprRB_S32, PackedS32, 16);
+ auto Hi = B.buildAShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
+ return {Lo.getReg(0), Hi.getReg(0)};
+}
+
+std::pair<Register, Register> RegBankLegalizeHelper::unpackAExt(Register Reg) {
+ auto PackedS32 = B.buildBitcast(SgprRB_S32, Reg);
+ auto Lo = PackedS32;
+ auto Hi = B.buildLShr(SgprRB_S32, PackedS32, B.buildConstant(SgprRB_S32, 16));
+ return {Lo.getReg(0), Hi.getReg(0)};
+}
+
void RegBankLegalizeHelper::lower(MachineInstr &MI,
const RegBankLLTMapping &Mapping,
SmallSet<Register, 4> &WaterfallSgprs) {
@@ -259,6 +284,33 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
MI.eraseFromParent();
break;
}
+ case SExtInRegSplitTo32: {
+ auto Op1 = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg());
+ int Amt = MI.getOperand(2).getImm();
+ Register Lo, Hi;
+ // Hi|Lo: s sign bit, ?/x bits changed/not changed by sign-extend
+ if (Amt <= 32) {
+ auto Freeze = B.buildFreeze(VgprRB_S32, Op1.getReg(0));
+ if (Amt == 32) {
+ // Hi|Lo: ????????|sxxxxxxx -> ssssssss|sxxxxxxx
+ Lo = Freeze.getReg(0);
+ } else {
+ // Hi|Lo: ????????|???sxxxx -> ssssssss|ssssxxxx
+ Lo = B.buildSExtInReg(VgprRB_S32, Freeze, Amt).getReg(0);
+ }
+
+ auto SignExtCst = B.buildConstant(SgprRB_S32, 31);
+ Hi = B.buildAShr(VgprRB_S32, Lo, SignExtCst).getReg(0);
+ } else {
+ // Hi|Lo: ?????sxx|xxxxxxxx -> ssssssxx|xxxxxxxx
+ Lo = Op1.getReg(0);
+ Hi = B.buildSExtInReg(VgprRB_S32, Op1.getReg(1), Amt - 32).getReg(0);
+ }
+
+ B.buildMergeLikeInstr(MI.getOperand(0).getReg(), {Lo, Hi});
+ MI.eraseFromParent();
+ break;
+ }
case Div_BFE: {
Register Dst = MI.getOperand(0).getReg();
assert(MRI.getType(Dst) == LLT::scalar(64));
@@ -356,6 +408,37 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
MI.eraseFromParent();
return;
}
+ case Unpack: {
+ Register Lo, Hi;
+ switch (MI.getOpcode()) {
+ case AMDGPU::G_SHL: {
+ auto [Val0, Val1] = unpackAExt(MI.getOperand(1).getReg());
+ auto [Amt0, Amt1] = unpackAExt(MI.getOperand(2).getReg());
+ Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
+ Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
+ break;
+ }
+ case AMDGPU::G_LSHR: {
+ auto [Val0, Val1] = unpackZExt(MI.getOperand(1).getReg());
+ auto [Amt0, Amt1] = unpackZExt(MI.getOperand(2).getReg());
+ Lo = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val0, Amt0}).getReg(0);
+ Hi = B.buildInstr(MI.getOpcode(), {SgprRB_S32}, {Val1, Amt1}).getReg(0);
+ break;
+ }
+ case AMDGPU::G_ASHR: {
+ auto [Val0, Val1] = unpackSExt(MI.getOperand(1).getReg());
+ auto [Amt0, Amt1] = unpackSExt(MI.getOperand(2).getReg());
+ Lo = B.buildAShr(SgprRB_S32, Val0, Amt0).getReg(0);
+ Hi = B.buildAShr(SgprRB_S32, Val1, Amt1).getReg(0);
+ break;
+ }
+ default:
+ llvm_unreachable("Unpack lowering not implemented");
+ }
+ B.buildBuildVectorTrunc(MI.getOperand(0).getReg(), {Lo, Hi});
+ MI.eraseFromParent();
+ return;
+ }
case SplitLoad: {
LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
unsigned Size = DstTy.getSizeInBits();
@@ -445,6 +528,13 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
case SgprP5:
case VgprP5:
return LLT::pointer(5, 32);
+ case SgprV2S16:
+ case VgprV2S16:
+ case UniInVgprV2S16:
+ return LLT::fixed_vector(2, 16);
+ case SgprV2S32:
+ case VgprV2S32:
+ return LLT::fixed_vector(2, 32);
case SgprV4S32:
case VgprV4S32:
case UniInVgprV4S32:
@@ -518,6 +608,8 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
case SgprP3:
case SgprP4:
case SgprP5:
+ case SgprV2S16:
+ case SgprV2S32:
case SgprV4S32:
case SgprB32:
case SgprB64:
@@ -527,6 +619,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
case SgprB512:
case UniInVcc:
case UniInVgprS32:
+ case UniInVgprV2S16:
case UniInVgprV4S32:
case UniInVgprB32:
case UniInVgprB64:
@@ -548,6 +641,8 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
case VgprP3:
case VgprP4:
case VgprP5:
+ case VgprV2S16:
+ case VgprV2S32:
case VgprV4S32:
case VgprB32:
case VgprB64:
@@ -585,6 +680,8 @@ void RegBankLegalizeHelper::applyMappingDst(
case SgprP3:
case SgprP4:
case SgprP5:
+ case SgprV2S16:
+ case SgprV2S32:
case SgprV4S32:
case Vgpr16:
case Vgpr32:
@@ -594,6 +691,8 @@ void RegBankLegalizeHelper::applyMappingDst(
case VgprP3:
case VgprP4:
case VgprP5:
+ case VgprV2S16:
+ case VgprV2S32:
case VgprV4S32: {
assert(Ty == getTyFromID(MethodIDs[OpIdx]));
assert(RB == getRegBankFromID(MethodIDs[OpIdx]));
@@ -628,6 +727,7 @@ void RegBankLegalizeHelper::applyMappingDst(
break;
}
case UniInVgprS32:
+ case UniInVgprV2S16:
case UniInVgprV4S32: {
assert(Ty == getTyFromID(MethodIDs[OpIdx]));
assert(RB == SgprRB);
@@ -701,6 +801,8 @@ void RegBankLegalizeHelper::applyMappingSrc(
case SgprP3:
case SgprP4:
case SgprP5:
+ case SgprV2S16:
+ case SgprV2S32:
case SgprV4S32: {
assert(Ty == getTyFromID(MethodIDs[i]));
assert(RB == getRegBankFromID(MethodIDs[i]));
@@ -726,6 +828,8 @@ void RegBankLegalizeHelper::applyMappingSrc(
case VgprP3:
case VgprP4:
case VgprP5:
+ case VgprV2S16:
+ case VgprV2S32:
case VgprV4S32: {
assert(Ty == getTyFromID(MethodIDs[i]));
if (RB != VgprRB) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
index ae3ab86449dd5..e9aa97c2979d7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
@@ -12,6 +12,7 @@
#include "AMDGPURegBankLegalizeRules.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Register.h"
namespace llvm {
@@ -108,6 +109,10 @@ class RegBankLegalizeHelper {
void lower(MachineInstr &MI, const RegBankLLTMapping &Mapping,
SmallSet<Register, 4> &SgprWaterfallOperandRegs);
+
+ std::pair<Register, Register> unpackZExt(Register Reg);
+ std::pair<Register, Register> unpackSExt(Register Reg);
+ std::pair<Register, Register> unpackAExt(Register Reg);
};
} // end namespace AMDGPU
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index 96b0a7d634f7e..78149472b0b09 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -16,7 +16,9 @@
#include "AMDGPURegBankLegalizeRules.h"
#include "AMDGPUInstrInfo.h"
#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Support/AMDGPUAddrSpace.h"
@@ -60,6 +62,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
return MRI.getType(Reg) == LLT::pointer(4, 64);
case P5:
return MRI.getType(Reg) == LLT::pointer(5, 32);
+ case V2S32:
+ return MRI.getType(Reg) == LLT::fixed_vector(2, 32);
case V4S32:
return MRI.getType(Reg) == LLT::fixed_vector(4, 32);
case B32:
@@ -92,6 +96,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
return MRI.getType(Reg) == LLT::pointer(4, 64) && MUI.isUniform(Reg);
case UniP5:
return MRI.getType(Reg) == LLT::pointer(5, 32) && MUI.isUniform(Reg);
+ case UniV2S16:
+ return MRI.getType(Reg) == LLT::fixed_vector(2, 16) && MUI.isUniform(Reg);
case UniB32:
return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isUniform(Reg);
case UniB64:
@@ -122,6 +128,8 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
return MRI.getType(Reg) == LLT::pointer(4, 64) && MUI.isDivergent(Reg);
case DivP5:
return MRI.getType(Reg) == LLT::pointer(5, 32) && MUI.isDivergent(Reg);
+ case DivV2S16:
+ return MRI.getType(Reg) == LLT::fixed_vector(2, 16) && MUI.isDivergent(Reg);
case DivB32:
return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isDivergent(Reg);
case DivB64:
@@ -434,7 +442,7 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
MachineRegisterInfo &_MRI)
: ST(&_ST), MRI(&_MRI) {
- addRulesForGOpcs({G_ADD}, Standard)
+ addRulesForGOpcs({G_ADD, G_SUB}, Standard)
.Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
.Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}});
@@ -451,11 +459,36 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Div(B64, {{VgprB64}, {VgprB64, VgprB64}, SplitTo32});
addRulesForGOpcs({G_SHL}, Standard)
+ .Uni(S16, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32ZExt}})
+ .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
+ .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, Unpack})
+ .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
+ .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
+ .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}})
.Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
+ .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});
+
+ addRulesForGOpcs({G_LSHR}, Standard)
+ .Uni(S16, {{Sgpr32Trunc}, {Sgpr32ZExt, Sgpr32ZExt}})
+ .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
+ .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, Unpack})
+ .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
+ .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
.Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}})
+ .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
.Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});
- addRulesForGOpcs({G_LSHR}, Standard).Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}});
+ addRulesForGOpcs({G_ASHR}, Standard)
+ .Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt, Sgpr32ZExt}})
+ .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
+ .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, Unpack})
+ .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
+ .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
+ .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}})
+ .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
+ .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});
+
+ addRulesForGOpcs({G_FRAME_INDEX}).Any({{UniP5, _}, {{SgprP5}, {None}}});
addRulesForGOpcs({G_UBFX, G_SBFX}, Standard)
.Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32, Sgpr32}, Uni_BFE})
@@ -514,6 +547,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Any({{DivS16, S32}, {{Vgpr16}, {Vgpr32}}})
.Any({{UniS32, S64}, {{Sgpr32}, {Sgpr64}}})
.Any({{DivS32, S64}, {{Vgpr32}, {Vgpr64}}})
+ .Any({{UniV2S16, V2S32}, {{SgprV2S16}, {SgprV2S32}}})
+ .Any({{DivV2S16, V2S32}, {{VgprV2S16}, {VgprV2S32}}})
// This is non-trivial. VgprToVccCopy is done using compare instruction.
.Any({{DivS1, DivS16}, {{Vcc}, {Vgpr16}, VgprToVccCopy}})
.Any({{DivS1, DivS32}, {{Vcc}, {Vgpr32}, VgprToVccCopy}})
@@ -549,6 +584,12 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}})
.Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}});
+ addRulesForGOpcs({G_SEXT_INREG})
+ .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}})
+ .Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}})
+ .Any({{UniS64, S64}, {{Sgpr64}, {Sgpr64}}})
+ .Any({{DivS64, S64}, {{Vgpr64}, {Vgpr64}, SExtInRegSplitTo32}});
+
bool hasUnalignedLoads = ST->getGeneration() >= AMDGPUSubtarget::GFX12;
bool hasSMRDSmall = ST->hasScalarSubwordLoads();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
index 058e58c1a94ce..435323cbb9df4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
@@ -75,6 +75,10 @@ enum UniformityLLTOpPredicateID {
V3S32,
V4S32,
+ UniV2S16,
+
+ DivV2S16,
+
// B types
B32,
B64,
@@ -117,7 +121,9 @@ enum RegBankLLTMappingApplyID {
SgprP3,
SgprP4,
SgprP5,
+ SgprV2S16,
SgprV4S32,
+ SgprV2S32,
SgprB32,
SgprB64,
SgprB96,
@@ -134,6 +140,8 @@ enum RegBankLLTMappingApplyID {
VgprP3,
VgprP4,
VgprP5,
+ VgprV2S16,
+ VgprV2S32,
VgprB32,
VgprB64,
VgprB96,
@@ -145,6 +153,7 @@ enum RegBankLLTMappingApplyID {
// Dst only modifiers: read-any-lane and truncs
UniInVcc,
UniInVgprS32,
+ UniInVgprV2S16,
UniInVgprV4S32,
UniInVgprB32,
UniInVgprB64,
@@ -173,6 +182,7 @@ enum LoweringMethodID {
DoNotLower,
VccExtToSel,
UniExtToSel,
+ SExtInRegSplitTo32,
Uni_BFE,
Div_BFE,
VgprToVccCopy,
@@ -180,6 +190,7 @@ enum LoweringMethodID {
SplitTo32Sel,
Ext32To64,
UniCstExt,
+ Unpack,
SplitLoad,
WidenLoad,
};
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
index 493e8cef63890..c2c60136e8a0c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
define i8 @v_ashr_i8(i8 %value, i8 %amount) {
; GFX6-LABEL: v_ashr_i8:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
index 784611cf68dd2..ec4e023182808 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
define i8 @v_lshr_i8(i8 %value, i8 %amount) {
; GFX6-LABEL: v_lshr_i8:
@@ -794,22 +794,22 @@ define amdgpu_ps i32 @s_lshr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou
;
; GFX9-LABEL: s_lshr_v2i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_lshr_b32 s2, s0, 16
-; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX9-NEXT: s_and_b32 s2, s0, 0xffff
+; GFX9-NEXT: s_lshr_b32 s0, s0, 16
; GFX9-NEXT: s_lshr_b32 s3, s1, 16
-; GFX9-NEXT: s_lshr_b32 s0, s0, s1
-; GFX9-NEXT: s_lshr_b32 s1, s2, s3
-; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GFX9-NEXT: s_lshr_b32 s1, s2, s1
+; GFX9-NEXT: s_lshr_b32 s0, s0, s3
+; GFX9-NEXT: s_pack_ll_b32_b16 s0, s1, s0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_lshr_v2i16:
; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_lshr_b32 s2, s0, 16
-; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX10PLUS-NEXT: s_and_b32 s2, s0, 0xffff
+; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, 16
; GFX10PLUS-NEXT: s_lshr_b32 s3, s1, 16
-; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, s1
-; GFX10PLUS-NEXT: s_lshr_b32 s1, s2, s3
-; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GFX10PLUS-NEXT: s_lshr_b32 s1, s2, s1
+; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, s3
+; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s1, s0
; GFX10PLUS-NEXT: ; return t...
[truncated]
Force-pushed: 9f92e94 → c7a043e, 6d31707 → 58d5945, c7a043e → 0572a36, 58d5945 → 2bb763f, 0572a36 → 2fdf172, 2bb763f → ea57c82, 2fdf172 → 585448f, ea57c82 → da0dd13, 585448f → 183f6cc, 183f6cc → cb28f82, da0dd13 → 2c7c01d
ping. Btw, there are some changes in the .ll tests due to #131308: some instructions now come to regbankselect as s16 (they used to come as s32). This could probably be fixed with a combine in the post-reg-bank combine (s16 AND + ZEXT to s32 -> AND s32).
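For what it's worth, the combine hinted at in the parentheses would act on generic MIR (a G_AND of s16 feeding a G_ZEXT to s32), but the rewrite has the same shape at the LLVM IR level. A sketch under that assumption, not actual combiner code:

; Before the combine: the mask is applied in i16 and then zero-extended.
define i32 @zext_of_and_i16(i16 %a) {
  %m = and i16 %a, 255
  %z = zext i16 %m to i32
  ret i32 %z
}

; After the combine: the mask is applied directly in i32, since
; zext(and(x, C)) == and(zext(x), zext(C)).
define i32 @and_i32_of_zext(i16 %a) {
  %za = zext i16 %a to i32
  %z = and i32 %za, 255
  ret i32 %z
}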
@@ -310,7 +310,7 @@ bool AMDGPURegBankLegalize::runOnMachineFunction(MachineFunction &MF) {
   // Opcodes that support pretty much all combinations of reg banks and LLTs
   // (except S1). There is no point in writing rules for them.
   if (Opc == AMDGPU::G_BUILD_VECTOR || Opc == AMDGPU::G_UNMERGE_VALUES ||
-      Opc == AMDGPU::G_MERGE_VALUES) {
+      Opc == AMDGPU::G_MERGE_VALUES || Opc == G_BITCAST) {
Probably should consistently qualify the enum value. Also this will need to cover freeze eventually?
Yes, although freeze will have some complications for i1
@@ -171,6 +171,59 @@ void RegBankLegalizeHelper::lowerVccExtToSel(MachineInstr &MI) {
   MI.eraseFromParent();
 }

+std::pair<Register, Register> RegBankLegalizeHelper::unpackZExt(Register Reg) {
const?
; GFX8-LABEL: s_ashr_i8:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_and_b32 s1, s1, 0xff
; GFX8-NEXT: s_sext_i32_i8 s0, s0
; GFX8-NEXT: s_ashr_i32 s0, s0, s1
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_ashr_i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_and_b32 s1, s1, 0xff
; GFX9-NEXT: s_sext_i32_i8 s0, s0
; GFX9-NEXT: s_ashr_i32 s0, s0, s1
; GFX9-NEXT: ; return to shader part epilog
Regressions here
Not related to this patch. #131308 changed the input to be s16; old GlobalISel was extending s16 to s32, but the new regbankselect leaves s16 as is, since a lot of tablegen patterns rely on the s16 type check. The fix is a TODO somewhere in the post-reg-bank combine (s16 AND + ZEXT to s32 -> AND s32).
  return {Lo.getReg(0), Hi.getReg(0)};
}

void RegBankLegalizeHelper::lowerUnpack(MachineInstr &MI) {
Naming for all these functions could be better. This is just for lowering shifts?
Force-pushed: 2c7c01d → a2bb5ea, cb28f82 → 0bc8320
AMDGPU/GlobalISel: add RegBankLegalize rules for bit shifts and sext-inreg

Uniform S16 shifts have to be extended to S32 using the appropriate extend before lowering to the S32 instruction.
Uniform packed V2S16 are lowered to SGPR S32 instructions; the other option is to use VALU packed V2S16 and ReadAnyLane.
For uniform S32 and S64, and divergent S16, S32, S64 and V2S16, there are instructions available.
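A companion sketch for the sext-inreg half of the title, hand-written rather than taken from the patch (the function name and the exact IR that turns into G_SEXT_INREG are assumptions):

; %value is a plain (non-inreg) argument, i.e. a divergent VGPR; trunc+sext of
; an i64 is a common way G_SEXT_INREG %value, 8 is formed by the combiners.
; The new SExtInRegSplitTo32 lowering splits it into 32-bit halves: freeze and
; sign-extend-in-reg the low half, then ashr it by 31 to produce the high half
; (for amounts greater than 32, only the high half is sign-extended in place).
define i64 @v_sext_inreg_i64_8(i64 %value) {
  %trunc = trunc i64 %value to i8
  %ext = sext i8 %trunc to i64
  ret i64 %ext
}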