-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[X86][GlobalISel] Enable scalar versions of G_UITOFP and G_FPTOUI #100079
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
e9d7c68
20e9b7b
7d68049
679ed80
7e2987d
bf232e7
d016bf0
c846d0c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6931,6 +6931,77 @@ LegalizerHelper::lowerU64ToF32BitOps(MachineInstr &MI) { | |
return Legalized; | ||
} | ||
|
||
// Expand s32 = G_UITOFP s64 to an IEEE float representation using bit | ||
// operations and G_SITOFP | ||
LegalizerHelper::LegalizeResult | ||
LegalizerHelper::lowerU64ToF32WithSITOFP(MachineInstr &MI) { | ||
auto [Dst, Src] = MI.getFirst2Regs(); | ||
const LLT S64 = LLT::scalar(64); | ||
const LLT S32 = LLT::scalar(32); | ||
const LLT S1 = LLT::scalar(1); | ||
|
||
assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S32); | ||
|
||
// For i64 < INT_MAX we simply reuse SITOFP. | ||
// Otherwise, divide i64 by 2, round result by ORing with the lowest bit | ||
// saved before division, convert to float by SITOFP, multiply the result | ||
// by 2. | ||
auto One = MIRBuilder.buildConstant(S64, 1); | ||
auto Zero = MIRBuilder.buildConstant(S64, 0); | ||
// Result if Src < INT_MAX | ||
auto SmallResult = MIRBuilder.buildSITOFP(S32, Src); | ||
// Result if Src >= INT_MAX | ||
auto Halved = MIRBuilder.buildLShr(S64, Src, One); | ||
auto LowerBit = MIRBuilder.buildAnd(S64, Src, One); | ||
auto RoundedHalved = MIRBuilder.buildOr(S64, Halved, LowerBit); | ||
auto HalvedFP = MIRBuilder.buildSITOFP(S32, RoundedHalved); | ||
auto LargeResult = MIRBuilder.buildFAdd(S32, HalvedFP, HalvedFP); | ||
// Choose | ||
auto IsLarge = | ||
MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_SLT, S1, Src, Zero); | ||
Comment on lines
+7193
to
+7194
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. IsLarge = slt x, 0? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes because |
||
MIRBuilder.buildSelect(Dst, IsLarge, LargeResult, SmallResult); | ||
|
||
MI.eraseFromParent(); | ||
return Legalized; | ||
} | ||
|
||
// Expand s64 = G_UITOFP s64 using bit and float arithmetic operations to an | ||
// IEEE double representation. | ||
LegalizerHelper::LegalizeResult | ||
LegalizerHelper::lowerU64ToF64BitFloatOps(MachineInstr &MI) { | ||
auto [Dst, Src] = MI.getFirst2Regs(); | ||
const LLT S64 = LLT::scalar(64); | ||
const LLT S32 = LLT::scalar(32); | ||
|
||
assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); | ||
|
||
// We create double value from 32 bit parts with 32 exponent difference. | ||
// Note that + and - are float operations that adjust the implicit leading | ||
// one, the bases 2^52 and 2^84 are for illustrative purposes. | ||
// | ||
// X = 2^52 * 1.0...LowBits | ||
// Y = 2^84 * 1.0...HighBits | ||
// Scratch = 2^84 * 1.0...HighBits - 2^84 * 1.0 - 2^52 * 1.0 | ||
// = - 2^52 * 1.0...HighBits | ||
// Result = - 2^52 * 1.0...HighBits + 2^52 * 1.0...LowBits | ||
auto TwoP52 = MIRBuilder.buildConstant(S64, UINT64_C(0x4330000000000000)); | ||
auto TwoP84 = MIRBuilder.buildConstant(S64, UINT64_C(0x4530000000000000)); | ||
arsenm marked this conversation as resolved.
Show resolved
Hide resolved
|
||
auto TwoP52P84 = llvm::bit_cast<double>(UINT64_C(0x4530000000100000)); | ||
auto TwoP52P84FP = MIRBuilder.buildFConstant(S64, TwoP52P84); | ||
auto HalfWidth = MIRBuilder.buildConstant(S64, 32); | ||
|
||
auto LowBits = MIRBuilder.buildTrunc(S32, Src); | ||
LowBits = MIRBuilder.buildZExt(S64, LowBits); | ||
auto LowBitsFP = MIRBuilder.buildOr(S64, TwoP52, LowBits); | ||
auto HighBits = MIRBuilder.buildLShr(S64, Src, HalfWidth); | ||
auto HighBitsFP = MIRBuilder.buildOr(S64, TwoP84, HighBits); | ||
auto Scratch = MIRBuilder.buildFSub(S64, HighBitsFP, TwoP52P84FP); | ||
MIRBuilder.buildFAdd(Dst, Scratch, LowBitsFP); | ||
|
||
MI.eraseFromParent(); | ||
return Legalized; | ||
} | ||
|
||
LegalizerHelper::LegalizeResult LegalizerHelper::lowerUITOFP(MachineInstr &MI) { | ||
auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs(); | ||
|
||
|
@@ -6951,6 +7022,8 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerUITOFP(MachineInstr &MI) { | |
// has sitofp, does not have CTLZ, or can efficiently use f64 as an | ||
// intermediate type, this is probably worse. | ||
return lowerU64ToF32BitOps(MI); | ||
} else if (DstTy == LLT::scalar(64)) { | ||
e-kud marked this conversation as resolved.
Show resolved
Hide resolved
|
||
return lowerU64ToF64BitFloatOps(MI); | ||
} | ||
|
||
return UnableToLegalize; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -497,6 +497,49 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI, | |
.clampScalar(0, s32, sMaxScalar) | ||
.widenScalarToNextPow2(1); | ||
|
||
// For G_UITOFP and G_FPTOUI without AVX512, we have to custom legalize s16 | ||
// manually. Otherwise, in custom handler there is no way to understand | ||
// whether s32 is an original type and we need to promote it to s64 or s32 is | ||
// obtained after widening s16 and we shouldn't widen it to s64. | ||
// | ||
// For AVX512 we simply widen types as there is direct mapping from opcodes | ||
// to asm instructions. | ||
getActionDefinitionsBuilder(G_UITOFP) | ||
.legalIf([=](const LegalityQuery &Query) { | ||
return HasAVX512 && typeInSet(0, {s32, s64})(Query) && | ||
typeInSet(1, {s32, s64})(Query); | ||
}) | ||
.clampScalar(0, s32, HasSSE2 ? s64 : s32) | ||
.widenScalarToNextPow2(0) | ||
.customIf([=](const LegalityQuery &Query) { | ||
e-kud marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if (HasAVX512) | ||
return false; | ||
return ((HasSSE1 && typeIs(0, s32)(Query)) || | ||
(HasSSE2 && typeIs(0, s64)(Query))) && | ||
(scalarNarrowerThan(1, 32)(Query) || | ||
(Is64Bit && typeInSet(1, {s32, s64})(Query))); | ||
}) | ||
.clampScalar(1, s32, sMaxScalar) | ||
.widenScalarToNextPow2(1); | ||
|
||
getActionDefinitionsBuilder(G_FPTOUI) | ||
.legalIf([=](const LegalityQuery &Query) { | ||
return HasAVX512 && typeInSet(0, {s32, s64})(Query) && | ||
typeInSet(1, {s32, s64})(Query); | ||
}) | ||
.clampScalar(1, s32, HasSSE2 ? s64 : s32) | ||
.widenScalarToNextPow2(1) | ||
.customIf([=](const LegalityQuery &Query) { | ||
if (HasAVX512) | ||
return false; | ||
return ((HasSSE1 && typeIs(1, s32)(Query)) || | ||
(HasSSE2 && typeIs(1, s64)(Query))) && | ||
(scalarNarrowerThan(0, 32)(Query) || | ||
(Is64Bit && typeInSet(0, {s32, s64})(Query))); | ||
}) | ||
.clampScalar(0, s32, sMaxScalar) | ||
.widenScalarToNextPow2(0); | ||
|
||
// vector ops | ||
getActionDefinitionsBuilder(G_BUILD_VECTOR) | ||
.customIf([=](const LegalityQuery &Query) { | ||
|
@@ -589,6 +632,10 @@ bool X86LegalizerInfo::legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, | |
return false; | ||
case TargetOpcode::G_BUILD_VECTOR: | ||
return legalizeBuildVector(MI, MRI, Helper); | ||
case TargetOpcode::G_FPTOUI: | ||
return legalizeFPTOUI(MI, MRI, Helper); | ||
case TargetOpcode::G_UITOFP: | ||
return legalizeUITOFP(MI, MRI, Helper); | ||
} | ||
llvm_unreachable("expected switch to return"); | ||
} | ||
|
@@ -644,6 +691,77 @@ bool X86LegalizerInfo::legalizeBuildVector(MachineInstr &MI, | |
return true; | ||
} | ||
|
||
bool X86LegalizerInfo::legalizeFPTOUI(MachineInstr &MI, | ||
MachineRegisterInfo &MRI, | ||
LegalizerHelper &Helper) const { | ||
MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; | ||
auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs(); | ||
unsigned DstSizeInBits = DstTy.getScalarSizeInBits(); | ||
const LLT s32 = LLT::scalar(32); | ||
const LLT s64 = LLT::scalar(64); | ||
|
||
// Simply reuse FPTOSI when it is possible to widen the type | ||
if (DstSizeInBits <= 32) { | ||
auto Casted = MIRBuilder.buildFPTOSI(DstTy == s32 ? s64 : s32, Src); | ||
MIRBuilder.buildTrunc(Dst, Casted); | ||
MI.eraseFromParent(); | ||
return true; | ||
} | ||
|
||
if (DstTy == s64) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is another expansion only in terms of generic instructions that can go in LegalizerHelper There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think so, it relies on the fact that X86 version of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You can't really directly rely on known target instruction behavior and still use the regular G_* opcodes. Other optimizations could do something assuming the poison behavior Also add that as a comment somewhere? |
||
APInt TwoPExpInt = APInt::getSignMask(DstSizeInBits); | ||
APFloat TwoPExpFP(SrcTy == s32 ? APFloat::IEEEsingle() | ||
: APFloat::IEEEdouble(), | ||
APInt::getZero(SrcTy.getSizeInBits())); | ||
TwoPExpFP.convertFromAPInt(TwoPExpInt, /*IsSigned=*/false, | ||
APFloat::rmNearestTiesToEven); | ||
|
||
// For fp Src greater or equal to Threshold(2^Exp), we use FPTOSI on | ||
// (Src - 2^Exp) and add 2^Exp by setting highest bit in result to 1. | ||
// For fp Src smaller, (Src - 2^Exp) is zeroed by And, the final result | ||
// is FPTOSI on Src. | ||
auto Casted = MIRBuilder.buildFPTOSI(DstTy, Src); | ||
auto Threshold = MIRBuilder.buildFConstant(SrcTy, TwoPExpFP); | ||
auto FSub = MIRBuilder.buildFSub(SrcTy, Src, Threshold); | ||
auto ResLowBits = MIRBuilder.buildFPTOSI(DstTy, FSub); | ||
auto Shift = MIRBuilder.buildConstant(DstTy, DstSizeInBits - 1); | ||
auto ResHighBit = MIRBuilder.buildAShr(DstTy, Casted, Shift); | ||
auto And = MIRBuilder.buildAnd(DstTy, ResHighBit, ResLowBits); | ||
MIRBuilder.buildOr(Dst, And, Casted); | ||
MI.eraseFromParent(); | ||
return true; | ||
} | ||
return false; | ||
} | ||
|
||
bool X86LegalizerInfo::legalizeUITOFP(MachineInstr &MI, | ||
MachineRegisterInfo &MRI, | ||
LegalizerHelper &Helper) const { | ||
MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; | ||
auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs(); | ||
const LLT s32 = LLT::scalar(32); | ||
const LLT s64 = LLT::scalar(64); | ||
|
||
// Simply reuse SITOFP when it is possible to widen the type | ||
if (SrcTy.getSizeInBits() <= 32) { | ||
auto Ext = MIRBuilder.buildZExt(SrcTy == s32 ? s64 : s32, Src); | ||
MIRBuilder.buildSITOFP(Dst, Ext); | ||
MI.eraseFromParent(); | ||
return true; | ||
} | ||
|
||
if (SrcTy == s64 && DstTy == s32) | ||
return Helper.lowerU64ToF32WithSITOFP(MI) != | ||
LegalizerHelper::LegalizeResult::UnableToLegalize; | ||
|
||
if (SrcTy == s64 && DstTy == s64) | ||
// TODO: rewrite with vector shuffles when supported. | ||
return Helper.lowerU64ToF64BitFloatOps(MI) != | ||
LegalizerHelper::LegalizeResult::UnableToLegalize; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would expect the legalize rules to be written so that the default lower action hits these paths anyway? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh, that's great! If we can lower There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Unfortunately no, we still don't distinguish between smaller types promoted to |
||
|
||
return false; | ||
} | ||
|
||
bool X86LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, | ||
MachineInstr &MI) const { | ||
return true; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Incomplete comment?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Actually no but I added a few lines considering the comment above about
slt x, 0