Skip to content

[X86][GlobalISel] Enable scalar versions of G_UITOFP and G_FPTOUI #100079

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Sep 25, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,8 @@ class LegalizerHelper {
LegalizeResult lowerRotate(MachineInstr &MI);

LegalizeResult lowerU64ToF32BitOps(MachineInstr &MI);
LegalizeResult lowerU64ToF32WithSITOFP(MachineInstr &MI);
LegalizeResult lowerU64ToF64BitFloatOps(MachineInstr &MI);
LegalizeResult lowerUITOFP(MachineInstr &MI);
LegalizeResult lowerSITOFP(MachineInstr &MI);
LegalizeResult lowerFPTOUI(MachineInstr &MI);
Expand Down
73 changes: 73 additions & 0 deletions llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6931,6 +6931,77 @@ LegalizerHelper::lowerU64ToF32BitOps(MachineInstr &MI) {
return Legalized;
}

// Expand s32 = G_UITOFP s64 to an IEEE float representation using bit
// operations and G_SITOFP
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerU64ToF32WithSITOFP(MachineInstr &MI) {
auto [Dst, Src] = MI.getFirst2Regs();
const LLT S64 = LLT::scalar(64);
const LLT S32 = LLT::scalar(32);
const LLT S1 = LLT::scalar(1);

assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S32);

// For i64 < INT_MAX we simply reuse SITOFP.
// Otherwise, divide i64 by 2, round result by ORing with the lowest bit
// saved before division, convert to float by SITOFP, multiply the result
// by 2.
auto One = MIRBuilder.buildConstant(S64, 1);
auto Zero = MIRBuilder.buildConstant(S64, 0);
// Result if Src < INT_MAX
auto SmallResult = MIRBuilder.buildSITOFP(S32, Src);
// Result if Src >= INT_MAX
auto Halved = MIRBuilder.buildLShr(S64, Src, One);
auto LowerBit = MIRBuilder.buildAnd(S64, Src, One);
auto RoundedHalved = MIRBuilder.buildOr(S64, Halved, LowerBit);
auto HalvedFP = MIRBuilder.buildSITOFP(S32, RoundedHalved);
auto LargeResult = MIRBuilder.buildFAdd(S32, HalvedFP, HalvedFP);
// Choose
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Incomplete comment?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually no but I added a few lines considering the comment above about slt x, 0

auto IsLarge =
MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_SLT, S1, Src, Zero);
Comment on lines +7193 to +7194
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IsLarge = slt x, 0?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes because x is unsigned. Or do we want to use ult and INT_MAX to make it more explicit?

MIRBuilder.buildSelect(Dst, IsLarge, LargeResult, SmallResult);

MI.eraseFromParent();
return Legalized;
}

// Expand s64 = G_UITOFP s64 using bit and float arithmetic operations to an
// IEEE double representation.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerU64ToF64BitFloatOps(MachineInstr &MI) {
auto [Dst, Src] = MI.getFirst2Regs();
const LLT S64 = LLT::scalar(64);
const LLT S32 = LLT::scalar(32);

assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

// We create double value from 32 bit parts with 32 exponent difference.
// Note that + and - are float operations that adjust the implicit leading
// one, the bases 2^52 and 2^84 are for illustrative purposes.
//
// X = 2^52 * 1.0...LowBits
// Y = 2^84 * 1.0...HighBits
// Scratch = 2^84 * 1.0...HighBits - 2^84 * 1.0 - 2^52 * 1.0
// = - 2^52 * 1.0...HighBits
// Result = - 2^52 * 1.0...HighBits + 2^52 * 1.0...LowBits
auto TwoP52 = MIRBuilder.buildConstant(S64, UINT64_C(0x4330000000000000));
auto TwoP84 = MIRBuilder.buildConstant(S64, UINT64_C(0x4530000000000000));
auto TwoP52P84 = llvm::bit_cast<double>(UINT64_C(0x4530000000100000));
auto TwoP52P84FP = MIRBuilder.buildFConstant(S64, TwoP52P84);
auto HalfWidth = MIRBuilder.buildConstant(S64, 32);

auto LowBits = MIRBuilder.buildTrunc(S32, Src);
LowBits = MIRBuilder.buildZExt(S64, LowBits);
auto LowBitsFP = MIRBuilder.buildOr(S64, TwoP52, LowBits);
auto HighBits = MIRBuilder.buildLShr(S64, Src, HalfWidth);
auto HighBitsFP = MIRBuilder.buildOr(S64, TwoP84, HighBits);
auto Scratch = MIRBuilder.buildFSub(S64, HighBitsFP, TwoP52P84FP);
MIRBuilder.buildFAdd(Dst, Scratch, LowBitsFP);

MI.eraseFromParent();
return Legalized;
}

LegalizerHelper::LegalizeResult LegalizerHelper::lowerUITOFP(MachineInstr &MI) {
auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();

Expand All @@ -6951,6 +7022,8 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerUITOFP(MachineInstr &MI) {
// has sitofp, does not have CTLZ, or can efficiently use f64 as an
// intermediate type, this is probably worse.
return lowerU64ToF32BitOps(MI);
} else if (DstTy == LLT::scalar(64)) {
return lowerU64ToF64BitFloatOps(MI);
}

return UnableToLegalize;
Expand Down
118 changes: 118 additions & 0 deletions llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -497,6 +497,49 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
.clampScalar(0, s32, sMaxScalar)
.widenScalarToNextPow2(1);

// For G_UITOFP and G_FPTOUI without AVX512, we have to custom legalize s16
// manually. Otherwise, in custom handler there is no way to understand
// whether s32 is an original type and we need to promote it to s64 or s32 is
// obtained after widening s16 and we shouldn't widen it to s64.
//
// For AVX512 we simply widen types as there is direct mapping from opcodes
// to asm instructions.
getActionDefinitionsBuilder(G_UITOFP)
.legalIf([=](const LegalityQuery &Query) {
return HasAVX512 && typeInSet(0, {s32, s64})(Query) &&
typeInSet(1, {s32, s64})(Query);
})
.clampScalar(0, s32, HasSSE2 ? s64 : s32)
.widenScalarToNextPow2(0)
.customIf([=](const LegalityQuery &Query) {
if (HasAVX512)
return false;
return ((HasSSE1 && typeIs(0, s32)(Query)) ||
(HasSSE2 && typeIs(0, s64)(Query))) &&
(scalarNarrowerThan(1, 32)(Query) ||
(Is64Bit && typeInSet(1, {s32, s64})(Query)));
})
.clampScalar(1, s32, sMaxScalar)
.widenScalarToNextPow2(1);

getActionDefinitionsBuilder(G_FPTOUI)
.legalIf([=](const LegalityQuery &Query) {
return HasAVX512 && typeInSet(0, {s32, s64})(Query) &&
typeInSet(1, {s32, s64})(Query);
})
.clampScalar(1, s32, HasSSE2 ? s64 : s32)
.widenScalarToNextPow2(1)
.customIf([=](const LegalityQuery &Query) {
if (HasAVX512)
return false;
return ((HasSSE1 && typeIs(1, s32)(Query)) ||
(HasSSE2 && typeIs(1, s64)(Query))) &&
(scalarNarrowerThan(0, 32)(Query) ||
(Is64Bit && typeInSet(0, {s32, s64})(Query)));
})
.clampScalar(0, s32, sMaxScalar)
.widenScalarToNextPow2(0);

// vector ops
getActionDefinitionsBuilder(G_BUILD_VECTOR)
.customIf([=](const LegalityQuery &Query) {
Expand Down Expand Up @@ -589,6 +632,10 @@ bool X86LegalizerInfo::legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI,
return false;
case TargetOpcode::G_BUILD_VECTOR:
return legalizeBuildVector(MI, MRI, Helper);
case TargetOpcode::G_FPTOUI:
return legalizeFPTOUI(MI, MRI, Helper);
case TargetOpcode::G_UITOFP:
return legalizeUITOFP(MI, MRI, Helper);
}
llvm_unreachable("expected switch to return");
}
Expand Down Expand Up @@ -644,6 +691,77 @@ bool X86LegalizerInfo::legalizeBuildVector(MachineInstr &MI,
return true;
}

bool X86LegalizerInfo::legalizeFPTOUI(MachineInstr &MI,
MachineRegisterInfo &MRI,
LegalizerHelper &Helper) const {
MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
unsigned DstSizeInBits = DstTy.getScalarSizeInBits();
const LLT s32 = LLT::scalar(32);
const LLT s64 = LLT::scalar(64);

// Simply reuse FPTOSI when it is possible to widen the type
if (DstSizeInBits <= 32) {
auto Casted = MIRBuilder.buildFPTOSI(DstTy == s32 ? s64 : s32, Src);
MIRBuilder.buildTrunc(Dst, Casted);
MI.eraseFromParent();
return true;
}

if (DstTy == s64) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is another expansion only in terms of generic instructions that can go in LegalizerHelper

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think so, it relies on the fact that X86 version of FPTOSI returns 80000000_00000000H in case of failed conversion.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can't really directly rely on known target instruction behavior and still use the regular G_* opcodes. Other optimizations could do something assuming the poison behavior

Also add that as a comment somewhere?

APInt TwoPExpInt = APInt::getSignMask(DstSizeInBits);
APFloat TwoPExpFP(SrcTy == s32 ? APFloat::IEEEsingle()
: APFloat::IEEEdouble(),
APInt::getZero(SrcTy.getSizeInBits()));
TwoPExpFP.convertFromAPInt(TwoPExpInt, /*IsSigned=*/false,
APFloat::rmNearestTiesToEven);

// For fp Src greater or equal to Threshold(2^Exp), we use FPTOSI on
// (Src - 2^Exp) and add 2^Exp by setting highest bit in result to 1.
// For fp Src smaller, (Src - 2^Exp) is zeroed by And, the final result
// is FPTOSI on Src.
auto Casted = MIRBuilder.buildFPTOSI(DstTy, Src);
auto Threshold = MIRBuilder.buildFConstant(SrcTy, TwoPExpFP);
auto FSub = MIRBuilder.buildFSub(SrcTy, Src, Threshold);
auto ResLowBits = MIRBuilder.buildFPTOSI(DstTy, FSub);
auto Shift = MIRBuilder.buildConstant(DstTy, DstSizeInBits - 1);
auto ResHighBit = MIRBuilder.buildAShr(DstTy, Casted, Shift);
auto And = MIRBuilder.buildAnd(DstTy, ResHighBit, ResLowBits);
MIRBuilder.buildOr(Dst, And, Casted);
MI.eraseFromParent();
return true;
}
return false;
}

bool X86LegalizerInfo::legalizeUITOFP(MachineInstr &MI,
MachineRegisterInfo &MRI,
LegalizerHelper &Helper) const {
MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
const LLT s32 = LLT::scalar(32);
const LLT s64 = LLT::scalar(64);

// Simply reuse SITOFP when it is possible to widen the type
if (SrcTy.getSizeInBits() <= 32) {
auto Ext = MIRBuilder.buildZExt(SrcTy == s32 ? s64 : s32, Src);
MIRBuilder.buildSITOFP(Dst, Ext);
MI.eraseFromParent();
return true;
}

if (SrcTy == s64 && DstTy == s32)
return Helper.lowerU64ToF32WithSITOFP(MI) !=
LegalizerHelper::LegalizeResult::UnableToLegalize;

if (SrcTy == s64 && DstTy == s64)
// TODO: rewrite with vector shuffles when supported.
return Helper.lowerU64ToF64BitFloatOps(MI) !=
LegalizerHelper::LegalizeResult::UnableToLegalize;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would expect the legalize rules to be written so that the default lower action hits these paths anyway?

Copy link
Contributor Author

@e-kud e-kud Jul 30, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, that's great! If we can lower s64 as a default lower action, we can finally use widenScalar before customIf and get rid of manual widening.
The problem here is that there is already the implementation for s64->s32 that is not efficient in terms of X86 as the more efficient is with SITOFP. However I can't find a target using lower for UITOFP. So I can replace the lowering that is more convenient for X86. But in future we need either a hook or some parameters passed to the Helper.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unfortunately no, we still don't distinguish between smaller types promoted to s32 and original s32 because after widening s8->s32 we start rule iteration over and have no clue whether we should widen type to s64 or not.


return false;
}

bool X86LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
MachineInstr &MI) const {
return true;
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Target/X86/GISel/X86LegalizerInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,12 @@ class X86LegalizerInfo : public LegalizerInfo {
private:
bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI,
LegalizerHelper &Helper) const;

bool legalizeFPTOUI(MachineInstr &MI, MachineRegisterInfo &MRI,
LegalizerHelper &Helper) const;

bool legalizeUITOFP(MachineInstr &MI, MachineRegisterInfo &MRI,
LegalizerHelper &Helper) const;
};
} // namespace llvm
#endif
10 changes: 6 additions & 4 deletions llvm/lib/Target/X86/GISel/X86RegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -323,18 +323,20 @@ X86RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
getInstrPartialMappingIdxs(MI, MRI, /* isFP= */ true, OpRegBankIdx);
break;
case TargetOpcode::G_SITOFP:
case TargetOpcode::G_FPTOSI: {
case TargetOpcode::G_FPTOSI:
case TargetOpcode::G_UITOFP:
case TargetOpcode::G_FPTOUI: {
// Some of the floating-point instructions have mixed GPR and FP
// operands: fine-tune the computed mapping.
auto &Op0 = MI.getOperand(0);
auto &Op1 = MI.getOperand(1);
const LLT Ty0 = MRI.getType(Op0.getReg());
const LLT Ty1 = MRI.getType(Op1.getReg());

bool FirstArgIsFP = Opc == TargetOpcode::G_SITOFP;
bool SecondArgIsFP = Opc == TargetOpcode::G_FPTOSI;
bool FirstArgIsFP =
Opc == TargetOpcode::G_SITOFP || Opc == TargetOpcode::G_UITOFP;
OpRegBankIdx[0] = getPartialMappingIdx(MI, Ty0, /* isFP= */ FirstArgIsFP);
OpRegBankIdx[1] = getPartialMappingIdx(MI, Ty1, /* isFP= */ SecondArgIsFP);
OpRegBankIdx[1] = getPartialMappingIdx(MI, Ty1, /* isFP= */ !FirstArgIsFP);
break;
}
case TargetOpcode::G_FCMP: {
Expand Down
Loading
Loading