Skip to content

[X86][GlobalISel] Enable scalar versions of G_UITOFP and G_FPTOUI #100079

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Sep 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
Original file line number Diff line number Diff line change
Expand Up @@ -394,6 +394,8 @@ class LegalizerHelper {
LegalizeResult lowerRotate(MachineInstr &MI);

LegalizeResult lowerU64ToF32BitOps(MachineInstr &MI);
LegalizeResult lowerU64ToF32WithSITOFP(MachineInstr &MI);
LegalizeResult lowerU64ToF64BitFloatOps(MachineInstr &MI);
LegalizeResult lowerUITOFP(MachineInstr &MI);
LegalizeResult lowerSITOFP(MachineInstr &MI);
LegalizeResult lowerFPTOUI(MachineInstr &MI);
Expand Down
86 changes: 80 additions & 6 deletions llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7163,6 +7163,78 @@ LegalizerHelper::lowerU64ToF32BitOps(MachineInstr &MI) {
return Legalized;
}

// Expand s32 = G_UITOFP s64 to an IEEE float representation using bit
// operations and G_SITOFP.
//
// Two candidate results are built unconditionally and a select chooses
// between them, so the expansion introduces no control flow. The source must
// be s64 and the destination s32 (asserted below). On completion MI is erased
// and Legalized is returned.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerU64ToF32WithSITOFP(MachineInstr &MI) {
auto [Dst, Src] = MI.getFirst2Regs();
const LLT S64 = LLT::scalar(64);
const LLT S32 = LLT::scalar(32);
const LLT S1 = LLT::scalar(1);

assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S32);

// If Src, read as an unsigned value, is <= INT64_MAX (sign bit clear), the
// signed conversion already gives the right answer, so we simply reuse SITOFP.
// Otherwise, divide Src by 2 with a logical shift, OR back the lowest bit
// saved before the division so that it still takes part in rounding, convert
// to float by SITOFP, and multiply the result by 2.
auto One = MIRBuilder.buildConstant(S64, 1);
auto Zero = MIRBuilder.buildConstant(S64, 0);
// Result if Src <= INT64_MAX
auto SmallResult = MIRBuilder.buildSITOFP(S32, Src);
// Result if Src > INT64_MAX
auto Halved = MIRBuilder.buildLShr(S64, Src, One);
auto LowerBit = MIRBuilder.buildAnd(S64, Src, One);
auto RoundedHalved = MIRBuilder.buildOr(S64, Halved, LowerBit);
auto HalvedFP = MIRBuilder.buildSITOFP(S32, RoundedHalved);
// The multiplication by 2 is emitted as HalvedFP + HalvedFP.
auto LargeResult = MIRBuilder.buildFAdd(S32, HalvedFP, HalvedFP);
// Src exceeds INT64_MAX as an unsigned value exactly when its sign bit is
// set, i.e. when it is signed-less-than zero; use that comparison to pick one
// of the two conversions.
auto IsLarge =
MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_SLT, S1, Src, Zero);
Comment on lines +7193 to +7194
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IsLarge = slt x, 0?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes because x is unsigned. Or do we want to use ult and INT_MAX to make it more explicit?

// LargeResult when the sign bit of Src is set, SmallResult otherwise.
MIRBuilder.buildSelect(Dst, IsLarge, LargeResult, SmallResult);

MI.eraseFromParent();
return Legalized;
}

// Lower s64 = G_UITOFP s64 into integer bit operations plus one floating-point
// subtract and one floating-point add, producing an IEEE double bit pattern.
// The original instruction is erased and Legalized is returned.
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerU64ToF64BitFloatOps(MachineInstr &MI) {
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  auto [Dst, Src] = MI.getFirst2Regs();

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  // Split Src into Hi (upper 32 bits) and Lo (lower 32 bits) and plant each
  // half in the low mantissa bits of a double with a fixed exponent:
  //
  //   LoAsFP = 2^52 + Lo            (bits 0x4330000000000000 | Lo)
  //   HiAsFP = 2^84 + Hi * 2^32     (bits 0x4530000000000000 | Hi)
  //
  // Subtracting the combined bias (2^84 + 2^52) from HiAsFP and then adding
  // LoAsFP cancels both biases:
  //
  //   (HiAsFP - (2^84 + 2^52)) + LoAsFP = Hi * 2^32 + Lo
  auto Exp52Bits =
      MIRBuilder.buildConstant(S64, UINT64_C(0x4330000000000000));
  auto Exp84Bits =
      MIRBuilder.buildConstant(S64, UINT64_C(0x4530000000000000));
  // 0x4530000000100000 is the bit pattern of the double 2^84 + 2^52.
  auto CombinedBias = MIRBuilder.buildFConstant(
      S64, llvm::bit_cast<double>(UINT64_C(0x4530000000100000)));
  auto ShiftAmount = MIRBuilder.buildConstant(S64, 32);

  // Lower half: truncate to 32 bits, widen back with zeros, attach exponent.
  auto LoTruncated = MIRBuilder.buildTrunc(S32, Src);
  auto LoExtended = MIRBuilder.buildZExt(S64, LoTruncated);
  auto LoAsFP = MIRBuilder.buildOr(S64, Exp52Bits, LoExtended);

  // Upper half: shift down by 32, attach exponent.
  auto HiShifted = MIRBuilder.buildLShr(S64, Src, ShiftAmount);
  auto HiAsFP = MIRBuilder.buildOr(S64, Exp84Bits, HiShifted);

  // Remove the biases and combine the halves directly into Dst.
  auto HiUnbiased = MIRBuilder.buildFSub(S64, HiAsFP, CombinedBias);
  MIRBuilder.buildFAdd(Dst, HiUnbiased, LoAsFP);

  MI.eraseFromParent();
  return Legalized;
}

LegalizerHelper::LegalizeResult LegalizerHelper::lowerUITOFP(MachineInstr &MI) {
auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();

Expand All @@ -7177,13 +7249,15 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerUITOFP(MachineInstr &MI) {
if (SrcTy != LLT::scalar(64))
return UnableToLegalize;

if (DstTy == LLT::scalar(32)) {
if (DstTy == LLT::scalar(32))
// TODO: SelectionDAG has several alternative expansions to port which may
// be more reasonble depending on the available instructions. If a target
// has sitofp, does not have CTLZ, or can efficiently use f64 as an
// intermediate type, this is probably worse.
return lowerU64ToF32BitOps(MI);
}
// be more reasonable depending on the available instructions. We also need
// a more advanced mechanism to choose an optimal version depending on
// target features such as sitofp or CTLZ availability.
return lowerU64ToF32WithSITOFP(MI);

if (DstTy == LLT::scalar(64))
return lowerU64ToF64BitFloatOps(MI);

return UnableToLegalize;
}
Expand Down
99 changes: 99 additions & 0 deletions llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -498,6 +498,62 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
.clampScalar(0, s32, sMaxScalar)
.widenScalarToNextPow2(1);

// For G_UITOFP and G_FPTOUI without AVX512, types <= s32 have to be custom
// legalized manually. Otherwise the custom handler has no way to tell whether
// an s32 is the original type, which still needs to be promoted to s64, or
// the result of an earlier widening, which must not be widened to s64 again.
//
// For AVX512 we simply widen types, as there is a direct mapping from opcodes
// to asm instructions.
getActionDefinitionsBuilder(G_UITOFP)
.legalIf([=](const LegalityQuery &Query) {
return HasAVX512 && typeInSet(0, {s32, s64})(Query) &&
typeInSet(1, {s32, s64})(Query);
})
.customIf([=](const LegalityQuery &Query) {
return !HasAVX512 &&
((HasSSE1 && typeIs(0, s32)(Query)) ||
(HasSSE2 && typeIs(0, s64)(Query))) &&
scalarNarrowerThan(1, Is64Bit ? 64 : 32)(Query);
})
.lowerIf([=](const LegalityQuery &Query) {
// Lower conversions from s64
return !HasAVX512 &&
((HasSSE1 && typeIs(0, s32)(Query)) ||
(HasSSE2 && typeIs(0, s64)(Query))) &&
(Is64Bit && typeIs(1, s64)(Query));
})
.clampScalar(0, s32, HasSSE2 ? s64 : s32)
.widenScalarToNextPow2(0)
.clampScalar(1, s32, sMaxScalar)
.widenScalarToNextPow2(1);

getActionDefinitionsBuilder(G_FPTOUI)
.legalIf([=](const LegalityQuery &Query) {
return HasAVX512 && typeInSet(0, {s32, s64})(Query) &&
typeInSet(1, {s32, s64})(Query);
})
.customIf([=](const LegalityQuery &Query) {
return !HasAVX512 &&
((HasSSE1 && typeIs(1, s32)(Query)) ||
(HasSSE2 && typeIs(1, s64)(Query))) &&
scalarNarrowerThan(0, Is64Bit ? 64 : 32)(Query);
})
// TODO: replace with customized legalization using
// specifics of cvttsd2si. The selection of this node requires
// a vector type. Either G_SCALAR_TO_VECTOR is needed or more advanced
// support of G_BUILD_VECTOR/G_INSERT_VECTOR_ELT is required beforehand.
.lowerIf([=](const LegalityQuery &Query) {
return !HasAVX512 &&
((HasSSE1 && typeIs(1, s32)(Query)) ||
(HasSSE2 && typeIs(1, s64)(Query))) &&
(Is64Bit && typeIs(0, s64)(Query));
})
.clampScalar(0, s32, sMaxScalar)
.widenScalarToNextPow2(0)
.clampScalar(1, s32, HasSSE2 ? s64 : s32)
.widenScalarToNextPow2(1);

// vector ops
getActionDefinitionsBuilder(G_BUILD_VECTOR)
.customIf([=](const LegalityQuery &Query) {
Expand Down Expand Up @@ -590,6 +646,10 @@ bool X86LegalizerInfo::legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI,
return false;
case TargetOpcode::G_BUILD_VECTOR:
return legalizeBuildVector(MI, MRI, Helper);
case TargetOpcode::G_FPTOUI:
return legalizeFPTOUI(MI, MRI, Helper);
case TargetOpcode::G_UITOFP:
return legalizeUITOFP(MI, MRI, Helper);
}
llvm_unreachable("expected switch to return");
}
Expand Down Expand Up @@ -645,6 +705,45 @@ bool X86LegalizerInfo::legalizeBuildVector(MachineInstr &MI,
return true;
}

// Custom legalization for G_FPTOUI. When the destination is at most 32 bits
// wide, the conversion is emitted as a G_FPTOSI into a wider integer type
// followed by a G_TRUNC into the original destination.
//
// Returns true if MI was replaced (and erased), false if the destination is
// wider than 32 bits and nothing was changed.
bool X86LegalizerInfo::legalizeFPTOUI(MachineInstr &MI,
                                      MachineRegisterInfo &MRI,
                                      LegalizerHelper &Helper) const {
  auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();

  // Only destinations that can be widened are handled here.
  if (DstTy.getScalarSizeInBits() > 32)
    return false;

  const LLT s32 = LLT::scalar(32);
  const LLT s64 = LLT::scalar(64);

  // An s32 destination goes through s64; anything narrower goes through s32.
  LLT WideTy = s32;
  if (DstTy == s32)
    WideTy = s64;

  MachineIRBuilder &Builder = Helper.MIRBuilder;
  auto SignedResult = Builder.buildFPTOSI(WideTy, Src);
  Builder.buildTrunc(Dst, SignedResult);
  MI.eraseFromParent();
  return true;
}

// Custom legalization for G_UITOFP. When the source is at most 32 bits wide,
// it is zero-extended to a wider integer type and converted with G_SITOFP
// directly into the original destination.
//
// Returns true if MI was replaced (and erased), false if the source is wider
// than 32 bits and nothing was changed.
bool X86LegalizerInfo::legalizeUITOFP(MachineInstr &MI,
                                      MachineRegisterInfo &MRI,
                                      LegalizerHelper &Helper) const {
  auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();

  // Only sources that can be widened are handled here.
  if (SrcTy.getSizeInBits() > 32)
    return false;

  const LLT s32 = LLT::scalar(32);
  const LLT s64 = LLT::scalar(64);

  // An s32 source is extended to s64; anything narrower is extended to s32.
  LLT WideTy = s32;
  if (SrcTy == s32)
    WideTy = s64;

  MachineIRBuilder &Builder = Helper.MIRBuilder;
  auto ZeroExtended = Builder.buildZExt(WideTy, Src);
  Builder.buildSITOFP(Dst, ZeroExtended);
  MI.eraseFromParent();
  return true;
}

bool X86LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
MachineInstr &MI) const {
return true;
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Target/X86/GISel/X86LegalizerInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,12 @@ class X86LegalizerInfo : public LegalizerInfo {
private:
bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI,
LegalizerHelper &Helper) const;

bool legalizeFPTOUI(MachineInstr &MI, MachineRegisterInfo &MRI,
LegalizerHelper &Helper) const;

bool legalizeUITOFP(MachineInstr &MI, MachineRegisterInfo &MRI,
LegalizerHelper &Helper) const;
};
} // namespace llvm
#endif
10 changes: 6 additions & 4 deletions llvm/lib/Target/X86/GISel/X86RegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -296,18 +296,20 @@ X86RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
getInstrPartialMappingIdxs(MI, MRI, /* isFP= */ true, OpRegBankIdx);
break;
case TargetOpcode::G_SITOFP:
case TargetOpcode::G_FPTOSI: {
case TargetOpcode::G_FPTOSI:
case TargetOpcode::G_UITOFP:
case TargetOpcode::G_FPTOUI: {
// Some of the floating-point instructions have mixed GPR and FP
// operands: fine-tune the computed mapping.
auto &Op0 = MI.getOperand(0);
auto &Op1 = MI.getOperand(1);
const LLT Ty0 = MRI.getType(Op0.getReg());
const LLT Ty1 = MRI.getType(Op1.getReg());

bool FirstArgIsFP = Opc == TargetOpcode::G_SITOFP;
bool SecondArgIsFP = Opc == TargetOpcode::G_FPTOSI;
bool FirstArgIsFP =
Opc == TargetOpcode::G_SITOFP || Opc == TargetOpcode::G_UITOFP;
OpRegBankIdx[0] = getPartialMappingIdx(MI, Ty0, /* isFP= */ FirstArgIsFP);
OpRegBankIdx[1] = getPartialMappingIdx(MI, Ty1, /* isFP= */ SecondArgIsFP);
OpRegBankIdx[1] = getPartialMappingIdx(MI, Ty1, /* isFP= */ !FirstArgIsFP);
break;
}
case TargetOpcode::G_FCMP: {
Expand Down
Loading
Loading