-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[X86][GlobalISel] Enable scalar versions of G_UITOFP and G_FPTOUI #100079
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
e9d7c68
20e9b7b
7d68049
679ed80
7e2987d
bf232e7
d016bf0
c846d0c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -497,6 +497,53 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI, | |
.clampScalar(0, s32, sMaxScalar) | ||
.widenScalarToNextPow2(1); | ||
|
||
// For G_UITOFP and G_FPTOUI without AVX512, we have to custom legalize s16 | ||
// manually. Otherwise, in custom handler there is no way to understand | ||
// whether s32 is an original type and we need to promote it to s64 or s32 is | ||
// obtained after widening s16 and we shouldn't widen it to s64. | ||
// | ||
// For AVX512 we simply widen types as there is direct mapping from opcodes | ||
// to asm instructions. | ||
getActionDefinitionsBuilder(G_UITOFP) | ||
.legalIf([=](const LegalityQuery &Query) { | ||
return HasAVX512 && typeInSet(0, {s32, s64})(Query) && | ||
typeInSet(1, {s32, s64})(Query); | ||
}) | ||
.customIf([=](const LegalityQuery &Query) { | ||
if (HasAVX512) | ||
return false; | ||
return (HasSSE1 && | ||
(typePairInSet(0, 1, {{s32, s32}, {s32, s16}})(Query) || | ||
(Is64Bit && typePairInSet(0, 1, {{s32, s64}})(Query)))) || | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. For uitofp, SSE has to cheat: zero-extend i16->i32 / i32->i64 and then use sitofp, so we need Is64Bit for i32->f32 as well. We only get a real uitofp with AVX512. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Fixed. |
||
(HasSSE2 && | ||
(typePairInSet(0, 1, {{s64, s32}, {s64, s16}})(Query) || | ||
(Is64Bit && typePairInSet(0, 1, {{s64, s64}})(Query)))); | ||
}) | ||
.clampScalar(1, HasAVX512 ? s32 : s16, sMaxScalar) | ||
.widenScalarToNextPow2(1) | ||
.clampScalar(0, s32, HasSSE2 ? s64 : s32) | ||
.widenScalarToNextPow2(0); | ||
|
||
getActionDefinitionsBuilder(G_FPTOUI) | ||
.legalIf([=](const LegalityQuery &Query) { | ||
return HasAVX512 && typeInSet(0, {s32, s64})(Query) && | ||
typeInSet(1, {s32, s64})(Query); | ||
}) | ||
.customIf([=](const LegalityQuery &Query) { | ||
if (HasAVX512) | ||
return false; | ||
return (HasSSE1 && | ||
(typePairInSet(0, 1, {{s32, s32}, {s16, s32}})(Query) || | ||
(Is64Bit && typePairInSet(0, 1, {{s64, s32}})(Query)))) || | ||
(HasSSE2 && | ||
(typePairInSet(0, 1, {{s32, s64}, {s16, s64}})(Query) || | ||
(Is64Bit && typePairInSet(0, 1, {{s64, s64}})(Query)))); | ||
}) | ||
.clampScalar(1, s32, sMaxScalar) | ||
.widenScalarToNextPow2(1) | ||
.clampScalar(0, HasAVX512 ? s32 : s16, HasSSE2 ? s64 : s32) | ||
.widenScalarToNextPow2(0); | ||
|
||
// vector ops | ||
getActionDefinitionsBuilder(G_BUILD_VECTOR) | ||
.customIf([=](const LegalityQuery &Query) { | ||
|
@@ -589,6 +636,10 @@ bool X86LegalizerInfo::legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, | |
return false; | ||
case TargetOpcode::G_BUILD_VECTOR: | ||
return legalizeBuildVector(MI, MRI, Helper); | ||
case TargetOpcode::G_FPTOUI: | ||
return legalizeFPTOUI(MI, MRI, Helper); | ||
case TargetOpcode::G_UITOFP: | ||
return legalizeUITOFP(MI, MRI, Helper); | ||
} | ||
llvm_unreachable("expected switch to return"); | ||
} | ||
|
@@ -644,6 +695,112 @@ bool X86LegalizerInfo::legalizeBuildVector(MachineInstr &MI, | |
return true; | ||
} | ||
|
||
// Custom legalization for G_FPTOUI: rewrites MI in terms of G_FPTOSI plus | ||
// integer/FP fix-up instructions built through Helper.MIRBuilder. | ||
// Returns true when MI has been replaced and erased, false when the | ||
// destination type is not one of the cases handled below. MRI is not used | ||
// in this function. | ||
bool X86LegalizerInfo::legalizeFPTOUI(MachineInstr &MI, | ||
MachineRegisterInfo &MRI, | ||
LegalizerHelper &Helper) const { | ||
MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; | ||
auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs(); | ||
unsigned DstSizeInBits = DstTy.getScalarSizeInBits(); | ||
const LLT s32 = LLT::scalar(32); | ||
const LLT s64 = LLT::scalar(64); | ||
|
||
// Simply reuse FPTOSI when it is possible to widen the type | ||
// The signed conversion targets an integer twice as wide as Dst (s32 for a | ||
// 16-bit Dst, s64 for a 32-bit Dst). Every unsigned value of the narrow type | ||
// is a non-negative value of the wider signed type, so truncating the signed | ||
// result back to Dst gives the unsigned result. | ||
if (DstSizeInBits == 16 || DstSizeInBits == 32) { | ||
auto Casted = MIRBuilder.buildFPTOSI(LLT::scalar(DstSizeInBits * 2), Src); | ||
MIRBuilder.buildTrunc(Dst, Casted); | ||
MI.eraseFromParent(); | ||
return true; | ||
} | ||
if (DstTy == s64) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is another expansion only in terms of generic instructions that can go in LegalizerHelper There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think so, it relies on the fact that X86 version of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You can't really directly rely on known target instruction behavior and still use the regular G_* opcodes. Other optimizations could do something assuming the poison behavior Also add that as a comment somewhere? |
||
// TwoPExpInt is the sign mask of the destination width, i.e. | ||
// 2^(DstSizeInBits - 1). TwoPExpFP is that same value converted to the | ||
// source floating-point format (IEEE single when SrcTy is s32, IEEE double | ||
// otherwise); it starts as an all-zero bit pattern and is then overwritten | ||
// by convertFromAPInt. | ||
APInt TwoPExpInt = APInt::getSignMask(DstSizeInBits); | ||
APFloat TwoPExpFP(SrcTy == s32 ? APFloat::IEEEsingle() | ||
: APFloat::IEEEdouble(), | ||
APInt::getZero(SrcTy.getSizeInBits())); | ||
TwoPExpFP.convertFromAPInt(TwoPExpInt, /*IsSigned=*/false, | ||
APFloat::rmNearestTiesToEven); | ||
|
||
// For fp Src greater or equal to Threshold(2^Exp), we use FPTOSI on | ||
// (Src - 2^Exp) and add 2^Exp by setting highest bit in result to 1. | ||
// For fp Src smaller, (Src - 2^Exp) is zeroed by And, the final result | ||
// is FPTOSI on Src. | ||
// | ||
// The arithmetic shift by DstSizeInBits - 1 broadcasts the sign bit of | ||
// Casted into every bit, producing an all-ones or all-zero mask that | ||
// selects whether ResLowBits contributes to the final Or. | ||
// | ||
// NOTE(review): this sequence appears to depend on G_FPTOSI yielding a | ||
// value with the top bit set when Src is too large for a signed result. | ||
// That is target instruction behaviour rather than something the generic | ||
// opcode guarantees (see the review discussion) -- confirm before reusing | ||
// this expansion elsewhere. | ||
auto Casted = MIRBuilder.buildFPTOSI(DstTy, Src); | ||
auto Threshold = MIRBuilder.buildFConstant(SrcTy, TwoPExpFP); | ||
auto FSub = MIRBuilder.buildFSub(SrcTy, Src, Threshold); | ||
auto ResLowBits = MIRBuilder.buildFPTOSI(DstTy, FSub); | ||
auto Shift = MIRBuilder.buildConstant(DstTy, DstSizeInBits - 1); | ||
auto ResHighBit = MIRBuilder.buildAShr(DstTy, Casted, Shift); | ||
auto And = MIRBuilder.buildAnd(DstTy, ResHighBit, ResLowBits); | ||
MIRBuilder.buildOr(Dst, And, Casted); | ||
MI.eraseFromParent(); | ||
return true; | ||
} | ||
// Any other destination type is not handled by this custom lowering. | ||
return false; | ||
} | ||
|
||
// Custom legalization for G_UITOFP: rewrites MI using G_SITOFP and integer/FP | ||
// arithmetic built through Helper.MIRBuilder. | ||
// Handles s16/s32 sources (any destination), and s64 sources converted to | ||
// s32 or s64 results. Returns true when MI has been replaced and erased, | ||
// false for any other type combination. MRI is not used in this function. | ||
bool X86LegalizerInfo::legalizeUITOFP(MachineInstr &MI, | ||
MachineRegisterInfo &MRI, | ||
LegalizerHelper &Helper) const { | ||
MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; | ||
auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs(); | ||
const LLT s16 = LLT::scalar(16); | ||
const LLT s32 = LLT::scalar(32); | ||
const LLT s64 = LLT::scalar(64); | ||
|
||
// Simply reuse SITOFP when it is possible to widen the type | ||
// Zero-extending into a type twice as wide leaves the top bit clear, so the | ||
// widened value is non-negative and the signed conversion produces the same | ||
// result as an unsigned conversion of the original value. | ||
if (SrcTy == s16 || SrcTy == s32) { | ||
const LLT WidenTy = LLT::scalar(SrcTy.getScalarSizeInBits() * 2); | ||
auto Ext = MIRBuilder.buildZExt(WidenTy, Src); | ||
MIRBuilder.buildSITOFP(Dst, Ext); | ||
MI.eraseFromParent(); | ||
return true; | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is reinventing widenScalar on the source There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is not. Maybe I'm missing something but In other words, after There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
The order is whatever the rules you explicitly ordered do There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks! It helped to avoid this ugly Or have you meant that we need to use |
||
if (SrcTy == s64 && DstTy == s32) { | ||
e-kud marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// When Src is non-negative as a signed i64 (top bit clear) we simply reuse | ||
// SITOFP. | ||
// Otherwise, divide i64 by 2, round result by ORing with the lowest bit | ||
// saved before division, convert to float by SITOFP, multiply the result | ||
// by 2. | ||
// | ||
// Both candidates are built unconditionally; the final G_SELECT on | ||
// (Src <s 0) picks LargeResult when the top bit of Src is set and | ||
// SmallResult otherwise. The multiplication by 2 is emitted as | ||
// HalvedFP + HalvedFP. | ||
auto SmallResult = MIRBuilder.buildSITOFP(DstTy, Src); | ||
auto One = MIRBuilder.buildConstant(SrcTy, 1); | ||
auto Zero = MIRBuilder.buildConstant(SrcTy, 0); | ||
auto Halved = MIRBuilder.buildLShr(SrcTy, Src, One); | ||
auto LowerBit = MIRBuilder.buildAnd(SrcTy, Src, One); | ||
auto RoundedHalved = MIRBuilder.buildOr(SrcTy, Halved, LowerBit); | ||
auto HalvedFP = MIRBuilder.buildSITOFP(DstTy, RoundedHalved); | ||
auto LargeResult = MIRBuilder.buildFAdd(DstTy, HalvedFP, HalvedFP); | ||
auto IsLarge = MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_SLT, | ||
LLT::scalar(1), Src, Zero); | ||
MIRBuilder.buildSelect(Dst, IsLarge, LargeResult, SmallResult); | ||
MI.eraseFromParent(); | ||
return true; | ||
} | ||
if (SrcTy == s64 && DstTy == s64) { | ||
e-kud marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// TODO: rewrite on vector shuffles when supported. | ||
// We create doubles from 32 bit parts with 32 exponent difference. | ||
// | ||
// X = 2^52 * 1.0...LowBits | ||
// Y = 2^84 * 1.0...HighBits | ||
// Temp = 2^84 * 1.0...HighBits - 2^84 * 1.0 - 2^52 * 1.0 | ||
// = - 2^52 * 1.0...HighBits | ||
// Result = - 2^52 * 1.0...HighBits + 2^52 * 1.0...LowBits | ||
// | ||
// In plain arithmetic, with Lo/Hi the low/high 32 bits of Src: | ||
// X = 2^52 + Lo | ||
// Y = 2^84 + Hi * 2^32 | ||
// Temp = Y - (2^84 + 2^52) = Hi * 2^32 - 2^52 | ||
// Result = Temp + X = Hi * 2^32 + Lo | ||
// | ||
// Constants are IEEE double bit patterns: 0x4330000000000000 is 2^52, | ||
// 0x4530000000000000 is 2^84, and 0x4530000000100000 is 2^84 + 2^52. | ||
// The first two are built as s64 integer constants so the 32-bit halves | ||
// can be ORed into their mantissa bits; the s64 results of those ORs are | ||
// then used directly as operands of G_FSUB / G_FADD. | ||
auto TwoP52 = MIRBuilder.buildConstant(s64, UINT64_C(0x4330000000000000)); | ||
auto TwoP84 = MIRBuilder.buildConstant(s64, UINT64_C(0x4530000000000000)); | ||
auto TwoP52P84 = llvm::bit_cast<double>(UINT64_C(0x4530000000100000)); | ||
auto TwoP52P84FP = MIRBuilder.buildFConstant(s64, TwoP52P84); | ||
auto HalfWidth = MIRBuilder.buildConstant(s64, 32); | ||
|
||
// Trunc to s32 followed by ZExt back to s64 isolates the low 32 bits of | ||
// Src; the logical shift right by 32 isolates the high 32 bits. | ||
auto LowBits = MIRBuilder.buildTrunc(s32, Src); | ||
LowBits = MIRBuilder.buildZExt(s64, LowBits); | ||
auto LowBitsFP = MIRBuilder.buildOr(s64, TwoP52, LowBits); | ||
auto HighBits = MIRBuilder.buildLShr(s64, Src, HalfWidth); | ||
auto HighBitsFP = MIRBuilder.buildOr(s64, TwoP84, HighBits); | ||
auto Scratch = MIRBuilder.buildFSub(s64, HighBitsFP, TwoP52P84FP); | ||
MIRBuilder.buildFAdd(Dst, Scratch, LowBitsFP); | ||
MI.eraseFromParent(); | ||
return true; | ||
} | ||
// Any other source/destination combination is not handled here. | ||
return false; | ||
} | ||
|
||
bool X86LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, | ||
MachineInstr &MI) const { | ||
return true; | ||
|
Uh oh!
There was an error while loading. Please reload this page.