[X86][GlobalISel] Enable scalar versions of G_UITOFP and G_FPTOUI #100079
Conversation
Also add tests for G_SITOFP and G_FPTOSI
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-x86

Author: Evgenii Kudriashov (e-kud)

Changes

Also add tests for G_SITOFP and G_FPTOSI

Patch is 30.34 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/100079.diff

5 Files Affected:
diff --git a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp
index dd8ecf6ef7fc7..0f1d3343630d7 100644
--- a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp
+++ b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp
@@ -497,6 +497,53 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
.clampScalar(0, s32, sMaxScalar)
.widenScalarToNextPow2(1);
+ // For G_UITOFP and G_FPTOUI without AVX512, we have to custom legalize s16
+ // manually. Otherwise, in the custom handler there is no way to tell whether
+ // s32 is an original type that we need to promote to s64, or s32 was
+ // obtained by widening s16 and shouldn't be widened to s64.
+ //
+ // For AVX512 we simply widen types, as there is a direct mapping from
+ // opcodes to asm instructions.
+ getActionDefinitionsBuilder(G_UITOFP)
+ .legalIf([=](const LegalityQuery &Query) {
+ return HasAVX512 && typeInSet(0, {s32, s64})(Query) &&
+ typeInSet(1, {s32, s64})(Query);
+ })
+ .customIf([=](const LegalityQuery &Query) {
+ if (HasAVX512)
+ return false;
+ return (HasSSE1 &&
+ (typePairInSet(0, 1, {{s32, s32}, {s32, s16}})(Query) ||
+ (Is64Bit && typePairInSet(0, 1, {{s32, s64}})(Query)))) ||
+ (HasSSE2 &&
+ (typePairInSet(0, 1, {{s64, s32}, {s64, s16}})(Query) ||
+ (Is64Bit && typePairInSet(0, 1, {{s64, s64}})(Query))));
+ })
+ .clampScalar(1, HasAVX512 ? s32 : s16, sMaxScalar)
+ .widenScalarToNextPow2(1)
+ .clampScalar(0, s32, HasSSE2 ? s64 : s32)
+ .widenScalarToNextPow2(0);
+
+ getActionDefinitionsBuilder(G_FPTOUI)
+ .legalIf([=](const LegalityQuery &Query) {
+ return HasAVX512 && typeInSet(0, {s32, s64})(Query) &&
+ typeInSet(1, {s32, s64})(Query);
+ })
+ .customIf([=](const LegalityQuery &Query) {
+ if (HasAVX512)
+ return false;
+ return (HasSSE1 &&
+ (typePairInSet(0, 1, {{s32, s32}, {s16, s32}})(Query) ||
+ (Is64Bit && typePairInSet(0, 1, {{s64, s32}})(Query)))) ||
+ (HasSSE2 &&
+ (typePairInSet(0, 1, {{s32, s64}, {s16, s64}})(Query) ||
+ (Is64Bit && typePairInSet(0, 1, {{s64, s64}})(Query))));
+ })
+ .clampScalar(1, s32, sMaxScalar)
+ .widenScalarToNextPow2(1)
+ .clampScalar(0, HasAVX512 ? s32 : s16, HasSSE2 ? s64 : s32)
+ .widenScalarToNextPow2(0);
+
// vector ops
getActionDefinitionsBuilder(G_BUILD_VECTOR)
.customIf([=](const LegalityQuery &Query) {
@@ -589,6 +636,10 @@ bool X86LegalizerInfo::legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI,
return false;
case TargetOpcode::G_BUILD_VECTOR:
return legalizeBuildVector(MI, MRI, Helper);
+ case TargetOpcode::G_FPTOUI:
+ return legalizeFPTOUI(MI, MRI, Helper);
+ case TargetOpcode::G_UITOFP:
+ return legalizeUITOFP(MI, MRI, Helper);
}
llvm_unreachable("expected switch to return");
}
@@ -644,6 +695,112 @@ bool X86LegalizerInfo::legalizeBuildVector(MachineInstr &MI,
return true;
}
+bool X86LegalizerInfo::legalizeFPTOUI(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ LegalizerHelper &Helper) const {
+ MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
+ auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
+ unsigned DstSizeInBits = DstTy.getScalarSizeInBits();
+ const LLT s32 = LLT::scalar(32);
+ const LLT s64 = LLT::scalar(64);
+
+ // Simply reuse FPTOSI when it is possible to widen the type
+ if (DstSizeInBits == 16 || DstSizeInBits == 32) {
+ auto Casted = MIRBuilder.buildFPTOSI(LLT::scalar(DstSizeInBits * 2), Src);
+ MIRBuilder.buildTrunc(Dst, Casted);
+ MI.eraseFromParent();
+ return true;
+ }
+ if (DstTy == s64) {
+ APInt TwoPExpInt = APInt::getSignMask(DstSizeInBits);
+ APFloat TwoPExpFP(SrcTy == s32 ? APFloat::IEEEsingle()
+ : APFloat::IEEEdouble(),
+ APInt::getZero(SrcTy.getSizeInBits()));
+ TwoPExpFP.convertFromAPInt(TwoPExpInt, /*IsSigned=*/false,
+ APFloat::rmNearestTiesToEven);
+
+ // For fp Src greater or equal to Threshold(2^Exp), we use FPTOSI on
+ // (Src - 2^Exp) and add 2^Exp by setting highest bit in result to 1.
+ // For fp Src smaller, (Src - 2^Exp) is zeroed by And, the final result
+ // is FPTOSI on Src.
+ auto Casted = MIRBuilder.buildFPTOSI(DstTy, Src);
+ auto Threshold = MIRBuilder.buildFConstant(SrcTy, TwoPExpFP);
+ auto FSub = MIRBuilder.buildFSub(SrcTy, Src, Threshold);
+ auto ResLowBits = MIRBuilder.buildFPTOSI(DstTy, FSub);
+ auto Shift = MIRBuilder.buildConstant(DstTy, DstSizeInBits - 1);
+ auto ResHighBit = MIRBuilder.buildAShr(DstTy, Casted, Shift);
+ auto And = MIRBuilder.buildAnd(DstTy, ResHighBit, ResLowBits);
+ MIRBuilder.buildOr(Dst, And, Casted);
+ MI.eraseFromParent();
+ return true;
+ }
+ return false;
+}
+
+bool X86LegalizerInfo::legalizeUITOFP(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ LegalizerHelper &Helper) const {
+ MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
+ auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
+ const LLT s16 = LLT::scalar(16);
+ const LLT s32 = LLT::scalar(32);
+ const LLT s64 = LLT::scalar(64);
+
+ // Simply reuse SITOFP when it is possible to widen the type
+ if (SrcTy == s16 || SrcTy == s32) {
+ const LLT WidenTy = LLT::scalar(SrcTy.getScalarSizeInBits() * 2);
+ auto Ext = MIRBuilder.buildZExt(WidenTy, Src);
+ MIRBuilder.buildSITOFP(Dst, Ext);
+ MI.eraseFromParent();
+ return true;
+ }
+ if (SrcTy == s64 && DstTy == s32) {
+ // For i64 < INT64_MAX we simply reuse SITOFP.
+ // Otherwise, divide i64 by 2, round result by ORing with the lowest bit
+ // saved before division, convert to float by SITOFP, multiply the result
+ // by 2.
+ auto SmallResult = MIRBuilder.buildSITOFP(DstTy, Src);
+ auto One = MIRBuilder.buildConstant(SrcTy, 1);
+ auto Zero = MIRBuilder.buildConstant(SrcTy, 0);
+ auto Halved = MIRBuilder.buildLShr(SrcTy, Src, One);
+ auto LowerBit = MIRBuilder.buildAnd(SrcTy, Src, One);
+ auto RoundedHalved = MIRBuilder.buildOr(SrcTy, Halved, LowerBit);
+ auto HalvedFP = MIRBuilder.buildSITOFP(DstTy, RoundedHalved);
+ auto LargeResult = MIRBuilder.buildFAdd(DstTy, HalvedFP, HalvedFP);
+ auto IsLarge = MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_SLT,
+ LLT::scalar(1), Src, Zero);
+ MIRBuilder.buildSelect(Dst, IsLarge, LargeResult, SmallResult);
+ MI.eraseFromParent();
+ return true;
+ }
+ if (SrcTy == s64 && DstTy == s64) {
+ // TODO: rewrite on vector shuffles when supported.
+ // We create doubles from 32 bit parts with 32 exponent difference.
+ //
+ // X = 2^52 * 1.0...LowBits
+ // Y = 2^84 * 1.0...HighBits
+ // Temp = 2^84 * 1.0...HighBits - 2^84 * 1.0 - 2^52 * 1.0
+ // = - 2^52 * 1.0...HighBits
+ // Result = - 2^52 * 1.0...HighBits + 2^52 * 1.0...LowBits
+ auto TwoP52 = MIRBuilder.buildConstant(s64, UINT64_C(0x4330000000000000));
+ auto TwoP84 = MIRBuilder.buildConstant(s64, UINT64_C(0x4530000000000000));
+ auto TwoP52P84 = llvm::bit_cast<double>(UINT64_C(0x4530000000100000));
+ auto TwoP52P84FP = MIRBuilder.buildFConstant(s64, TwoP52P84);
+ auto HalfWidth = MIRBuilder.buildConstant(s64, 32);
+
+ auto LowBits = MIRBuilder.buildTrunc(s32, Src);
+ LowBits = MIRBuilder.buildZExt(s64, LowBits);
+ auto LowBitsFP = MIRBuilder.buildOr(s64, TwoP52, LowBits);
+ auto HighBits = MIRBuilder.buildLShr(s64, Src, HalfWidth);
+ auto HighBitsFP = MIRBuilder.buildOr(s64, TwoP84, HighBits);
+ auto Scratch = MIRBuilder.buildFSub(s64, HighBitsFP, TwoP52P84FP);
+ MIRBuilder.buildFAdd(Dst, Scratch, LowBitsFP);
+ MI.eraseFromParent();
+ return true;
+ }
+ return false;
+}
+
bool X86LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
MachineInstr &MI) const {
return true;
diff --git a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.h b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.h
index 229a58986903d..39bd9892e2f16 100644
--- a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.h
+++ b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.h
@@ -39,6 +39,12 @@ class X86LegalizerInfo : public LegalizerInfo {
private:
bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI,
LegalizerHelper &Helper) const;
+
+ bool legalizeFPTOUI(MachineInstr &MI, MachineRegisterInfo &MRI,
+ LegalizerHelper &Helper) const;
+
+ bool legalizeUITOFP(MachineInstr &MI, MachineRegisterInfo &MRI,
+ LegalizerHelper &Helper) const;
};
} // namespace llvm
#endif
diff --git a/llvm/lib/Target/X86/GISel/X86RegisterBankInfo.cpp b/llvm/lib/Target/X86/GISel/X86RegisterBankInfo.cpp
index 9e85424e76e62..43d52fc0e5c3b 100644
--- a/llvm/lib/Target/X86/GISel/X86RegisterBankInfo.cpp
+++ b/llvm/lib/Target/X86/GISel/X86RegisterBankInfo.cpp
@@ -323,7 +323,9 @@ X86RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
getInstrPartialMappingIdxs(MI, MRI, /* isFP= */ true, OpRegBankIdx);
break;
case TargetOpcode::G_SITOFP:
- case TargetOpcode::G_FPTOSI: {
+ case TargetOpcode::G_FPTOSI:
+ case TargetOpcode::G_UITOFP:
+ case TargetOpcode::G_FPTOUI: {
// Some of the floating-point instructions have mixed GPR and FP
// operands: fine-tune the computed mapping.
auto &Op0 = MI.getOperand(0);
@@ -331,10 +333,10 @@ X86RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
const LLT Ty0 = MRI.getType(Op0.getReg());
const LLT Ty1 = MRI.getType(Op1.getReg());
- bool FirstArgIsFP = Opc == TargetOpcode::G_SITOFP;
- bool SecondArgIsFP = Opc == TargetOpcode::G_FPTOSI;
+ bool FirstArgIsFP =
+ Opc == TargetOpcode::G_SITOFP || Opc == TargetOpcode::G_UITOFP;
OpRegBankIdx[0] = getPartialMappingIdx(MI, Ty0, /* isFP= */ FirstArgIsFP);
- OpRegBankIdx[1] = getPartialMappingIdx(MI, Ty1, /* isFP= */ SecondArgIsFP);
+ OpRegBankIdx[1] = getPartialMappingIdx(MI, Ty1, /* isFP= */ !FirstArgIsFP);
break;
}
case TargetOpcode::G_FCMP: {
diff --git a/llvm/test/CodeGen/X86/isel-fp-to-int.ll b/llvm/test/CodeGen/X86/isel-fp-to-int.ll
new file mode 100644
index 0000000000000..0dcca65e06d4a
--- /dev/null
+++ b/llvm/test/CodeGen/X86/isel-fp-to-int.ll
@@ -0,0 +1,323 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs | FileCheck %s --check-prefixes X64,SDAG-X64
+; RUN: llc < %s -global-isel -global-isel-abort=1 -mtriple=x86_64-- -verify-machineinstrs | FileCheck %s --check-prefixes X64,GISEL-X64
+; RUN: llc < %s -mattr=+avx512f -mtriple=x86_64-- -verify-machineinstrs | FileCheck %s --check-prefixes AVX512,SDAG-AVX512
+; RUN: llc < %s -global-isel -global-isel-abort=1 -mattr=+avx512f -mtriple=x86_64-- -verify-machineinstrs | FileCheck %s --check-prefixes AVX512,GISEL-AVX512
+
+define i64 @test_double_to_ui64(double %x) {
+; SDAG-X64-LABEL: test_double_to_ui64:
+; SDAG-X64: # %bb.0: # %entry
+; SDAG-X64-NEXT: cvttsd2si %xmm0, %rcx
+; SDAG-X64-NEXT: movq %rcx, %rdx
+; SDAG-X64-NEXT: sarq $63, %rdx
+; SDAG-X64-NEXT: subsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SDAG-X64-NEXT: cvttsd2si %xmm0, %rax
+; SDAG-X64-NEXT: andq %rdx, %rax
+; SDAG-X64-NEXT: orq %rcx, %rax
+; SDAG-X64-NEXT: retq
+;
+; GISEL-X64-LABEL: test_double_to_ui64:
+; GISEL-X64: # %bb.0: # %entry
+; GISEL-X64-NEXT: cvttsd2si %xmm0, %rcx
+; GISEL-X64-NEXT: movsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0]
+; GISEL-X64-NEXT: subsd %xmm1, %xmm0
+; GISEL-X64-NEXT: cvttsd2si %xmm0, %rdx
+; GISEL-X64-NEXT: movq %rcx, %rax
+; GISEL-X64-NEXT: sarq $63, %rax
+; GISEL-X64-NEXT: andq %rdx, %rax
+; GISEL-X64-NEXT: orq %rcx, %rax
+; GISEL-X64-NEXT: retq
+;
+; AVX512-LABEL: test_double_to_ui64:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vcvttsd2usi %xmm0, %rax
+; AVX512-NEXT: retq
+entry:
+ %conv = fptoui double %x to i64
+ ret i64 %conv
+}
+
+define i32 @test_double_to_ui32(double %x) {
+; X64-LABEL: test_double_to_ui32:
+; X64: # %bb.0: # %entry
+; X64-NEXT: cvttsd2si %xmm0, %rax
+; X64-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-NEXT: retq
+;
+; AVX512-LABEL: test_double_to_ui32:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vcvttsd2usi %xmm0, %eax
+; AVX512-NEXT: retq
+entry:
+ %conv = fptoui double %x to i32
+ ret i32 %conv
+}
+
+define zeroext i16 @test_double_to_ui16(double %x) {
+; X64-LABEL: test_double_to_ui16:
+; X64: # %bb.0: # %entry
+; X64-NEXT: cvttsd2si %xmm0, %eax
+; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: retq
+;
+; SDAG-AVX512-LABEL: test_double_to_ui16:
+; SDAG-AVX512: # %bb.0: # %entry
+; SDAG-AVX512-NEXT: vcvttsd2si %xmm0, %eax
+; SDAG-AVX512-NEXT: # kill: def $ax killed $ax killed $eax
+; SDAG-AVX512-NEXT: retq
+;
+; GISEL-AVX512-LABEL: test_double_to_ui16:
+; GISEL-AVX512: # %bb.0: # %entry
+; GISEL-AVX512-NEXT: vcvttsd2usi %xmm0, %eax
+; GISEL-AVX512-NEXT: # kill: def $ax killed $ax killed $eax
+; GISEL-AVX512-NEXT: retq
+entry:
+ %conv = fptoui double %x to i16
+ ret i16 %conv
+}
+
+define zeroext i8 @test_double_to_ui8(double %x) {
+; X64-LABEL: test_double_to_ui8:
+; X64: # %bb.0: # %entry
+; X64-NEXT: cvttsd2si %xmm0, %eax
+; X64-NEXT: # kill: def $al killed $al killed $eax
+; X64-NEXT: retq
+;
+; SDAG-AVX512-LABEL: test_double_to_ui8:
+; SDAG-AVX512: # %bb.0: # %entry
+; SDAG-AVX512-NEXT: vcvttsd2si %xmm0, %eax
+; SDAG-AVX512-NEXT: # kill: def $al killed $al killed $eax
+; SDAG-AVX512-NEXT: retq
+;
+; GISEL-AVX512-LABEL: test_double_to_ui8:
+; GISEL-AVX512: # %bb.0: # %entry
+; GISEL-AVX512-NEXT: vcvttsd2usi %xmm0, %eax
+; GISEL-AVX512-NEXT: # kill: def $al killed $al killed $eax
+; GISEL-AVX512-NEXT: retq
+entry:
+ %conv = fptoui double %x to i8
+ ret i8 %conv
+}
+
+define i64 @test_float_to_ui64(float %x) {
+; SDAG-X64-LABEL: test_float_to_ui64:
+; SDAG-X64: # %bb.0: # %entry
+; SDAG-X64-NEXT: cvttss2si %xmm0, %rcx
+; SDAG-X64-NEXT: movq %rcx, %rdx
+; SDAG-X64-NEXT: sarq $63, %rdx
+; SDAG-X64-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SDAG-X64-NEXT: cvttss2si %xmm0, %rax
+; SDAG-X64-NEXT: andq %rdx, %rax
+; SDAG-X64-NEXT: orq %rcx, %rax
+; SDAG-X64-NEXT: retq
+;
+; GISEL-X64-LABEL: test_float_to_ui64:
+; GISEL-X64: # %bb.0: # %entry
+; GISEL-X64-NEXT: cvttss2si %xmm0, %rcx
+; GISEL-X64-NEXT: movss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
+; GISEL-X64-NEXT: subss %xmm1, %xmm0
+; GISEL-X64-NEXT: cvttss2si %xmm0, %rdx
+; GISEL-X64-NEXT: movq %rcx, %rax
+; GISEL-X64-NEXT: sarq $63, %rax
+; GISEL-X64-NEXT: andq %rdx, %rax
+; GISEL-X64-NEXT: orq %rcx, %rax
+; GISEL-X64-NEXT: retq
+;
+; AVX512-LABEL: test_float_to_ui64:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vcvttss2usi %xmm0, %rax
+; AVX512-NEXT: retq
+entry:
+ %conv = fptoui float %x to i64
+ ret i64 %conv
+}
+
+define i32 @test_float_to_ui32(float %x) {
+; X64-LABEL: test_float_to_ui32:
+; X64: # %bb.0: # %entry
+; X64-NEXT: cvttss2si %xmm0, %rax
+; X64-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-NEXT: retq
+;
+; AVX512-LABEL: test_float_to_ui32:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vcvttss2usi %xmm0, %eax
+; AVX512-NEXT: retq
+entry:
+ %conv = fptoui float %x to i32
+ ret i32 %conv
+}
+
+define zeroext i16 @test_float_to_ui16(float %x) {
+; X64-LABEL: test_float_to_ui16:
+; X64: # %bb.0: # %entry
+; X64-NEXT: cvttss2si %xmm0, %eax
+; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: retq
+;
+; SDAG-AVX512-LABEL: test_float_to_ui16:
+; SDAG-AVX512: # %bb.0: # %entry
+; SDAG-AVX512-NEXT: vcvttss2si %xmm0, %eax
+; SDAG-AVX512-NEXT: # kill: def $ax killed $ax killed $eax
+; SDAG-AVX512-NEXT: retq
+;
+; GISEL-AVX512-LABEL: test_float_to_ui16:
+; GISEL-AVX512: # %bb.0: # %entry
+; GISEL-AVX512-NEXT: vcvttss2usi %xmm0, %eax
+; GISEL-AVX512-NEXT: # kill: def $ax killed $ax killed $eax
+; GISEL-AVX512-NEXT: retq
+entry:
+ %conv = fptoui float %x to i16
+ ret i16 %conv
+}
+
+define zeroext i8 @test_float_to_ui8(float %x) {
+; X64-LABEL: test_float_to_ui8:
+; X64: # %bb.0: # %entry
+; X64-NEXT: cvttss2si %xmm0, %eax
+; X64-NEXT: # kill: def $al killed $al killed $eax
+; X64-NEXT: retq
+;
+; SDAG-AVX512-LABEL: test_float_to_ui8:
+; SDAG-AVX512: # %bb.0: # %entry
+; SDAG-AVX512-NEXT: vcvttss2si %xmm0, %eax
+; SDAG-AVX512-NEXT: # kill: def $al killed $al killed $eax
+; SDAG-AVX512-NEXT: retq
+;
+; GISEL-AVX512-LABEL: test_float_to_ui8:
+; GISEL-AVX512: # %bb.0: # %entry
+; GISEL-AVX512-NEXT: vcvttss2usi %xmm0, %eax
+; GISEL-AVX512-NEXT: # kill: def $al killed $al killed $eax
+; GISEL-AVX512-NEXT: retq
+entry:
+ %conv = fptoui float %x to i8
+ ret i8 %conv
+}
+
+define i64 @test_double_to_si64(double %x) {
+; X64-LABEL: test_double_to_si64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: cvttsd2si %xmm0, %rax
+; X64-NEXT: retq
+;
+; AVX512-LABEL: test_double_to_si64:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vcvttsd2si %xmm0, %rax
+; AVX512-NEXT: retq
+entry:
+ %conv = fptosi double %x to i64
+ ret i64 %conv
+}
+
+define i32 @test_double_to_si32(double %x) {
+; X64-LABEL: test_double_to_si32:
+; X64: # %bb.0: # %entry
+; X64-NEXT: cvttsd2si %xmm0, %eax
+; X64-NEXT: retq
+;
+; AVX512-LABEL: test_double_to_si32:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vcvttsd2si %xmm0, %eax
+; AVX512-NEXT: retq
+entry:
+ %conv = fptosi double %x to i32
+ ret i32 %conv
+}
+
+define signext i16 @test_double_to_si16(double %x) {
+; X64-LABEL: test_double_to_si16:
+; X64: # %bb.0: # %entry
+; X64-NEXT: cvttsd2si %xmm0, %eax
+; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: retq
+;
+; AVX512-LABEL: test_double_to_si16:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vcvttsd2si %xmm0, %eax
+; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT: retq
+entry:
+ %conv = fptosi double %x to i16
+ ret i16 %conv
+}
+
+define signext i8 @test_double_to_si8(double %x) {
+; X64-LABEL: test_double_to_si8:
+; X64: # %bb.0: # %entry
+; X64-NEXT: cvttsd2si %xmm0, %eax
+; X64-NEXT: # kill: def $al killed $al killed $eax
+; X64-NEXT: retq
+;
+; AVX512-LABEL: test_double_to_si8:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vcvttsd2si %xmm0, %eax
+; AVX512-NEXT: # kill: def $al killed $al killed $eax
+; AVX512-NEXT: retq
+entry:
+ %conv = fptosi double %x to i8
+ ret i8 %conv
+}
+
+define i64 @test_float_to_si64(float %x) {
+; X64-LABEL: test_float_to_si64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: cvttss2si %xmm0, %rax
+; X64-NEXT: retq
+;
+; AVX512-LABEL: test_float_to_si64:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vcvttss2si %xmm0, %rax
+; AVX512-NEXT: retq
+entry:
+ %conv = fptosi float %x to i64
+ ret i64 %conv
+}
+
+define i32 @test_float_to_si32(float %x) {
+; X64-LABEL: test_float_to_si32:
+; X64: # %bb.0: # %entry
+; X64-NEXT: cvttss2si %xmm0, %eax
+; X64-NEXT: retq
+;
+; AVX512-LABEL: test_float_to_si32:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vcvttss2si %xmm0, %eax
+; AVX512-NEXT: retq
+entry:
+ %conv = fptosi float %x to i32
+ ret i32 %conv
+}
+
+define signext i16 @test_float_to_si16(float %x) {
+; X64-LABEL: test_float_to_si16:
+; X64: # %bb.0: # %entry
+; X64-NEXT: cvttss2si %xmm0, %eax
+; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: retq
+;
+; AVX512-LABEL: test_float_to_si16:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vcvttss2si %xmm0, %eax
+; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT: retq
+entry:
+ %conv = fptosi float %x to i16
+ ret i16 %co...
[truncated]
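Before the review threads, here is a small host-side C++ sketch (illustrative only, not part of the patch) of the 2^52 / 2^84 bit trick that legalizeUITOFP above uses for the s64 -> f64 case; the constants mirror the ones in the patch.

// Host-side sketch of the u64 -> f64 lowering used in legalizeUITOFP.
// Illustrative only; assumes C++20 for std::bit_cast.
#include <bit>
#include <cassert>
#include <cstdint>

double u64ToF64(uint64_t X) {
  // LowBitsFP  = 2^52 * 1.0...LowBits  = 2^52 + LowBits
  double LowBitsFP =
      std::bit_cast<double>(UINT64_C(0x4330000000000000) | (X & 0xffffffffu));
  // HighBitsFP = 2^84 * 1.0...HighBits = 2^84 + HighBits * 2^32
  double HighBitsFP =
      std::bit_cast<double>(UINT64_C(0x4530000000000000) | (X >> 32));
  // 2^84 + 2^52 as a double; subtracting it cancels the implicit leading bits.
  double TwoP52P84 = std::bit_cast<double>(UINT64_C(0x4530000000100000));
  return (HighBitsFP - TwoP52P84) + LowBitsFP; // correctly rounded u64 -> f64
}

int main() {
  assert(u64ToF64(0) == 0.0);
  assert(u64ToF64(UINT64_C(0x8000000000000000)) == 9223372036854775808.0);
  return 0;
}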
          return false;
        return (HasSSE1 &&
                (typePairInSet(0, 1, {{s32, s32}, {s32, s16}})(Query) ||
                 (Is64Bit && typePairInSet(0, 1, {{s32, s64}})(Query)))) ||
For uitofp, SSE has to cheat and zero-extend i16->i32 / i32->i64 and use sitofp - so we need Is64Bit for i32->f32 as well. We only get real uitofp with AVX512.
Fixed.
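To make that point concrete, here is a tiny host-side C++ illustration (plain C++, not LLVM API, and not part of the patch) of why a u32 source needs the 64-bit signed-convert path when there is no native unsigned convert.

// u32 -> f32 without a native unsigned convert: treating the bits as signed
// gives the wrong answer; zero-extending to 64 bits and converting signed
// gives the right one (hence the Is64Bit requirement).
#include <cassert>
#include <cstdint>

int main() {
  uint32_t X = 0xFFFFFFFFu;                  // 4294967295
  float Wrong = (float)(int32_t)X;           // signed convert of the raw bits: -1.0f
  float Right = (float)(int64_t)(uint64_t)X; // zero-extend to i64, then signed convert
  assert(Wrong == -1.0f);
  assert(Right == 4294967296.0f); // 2^32, the nearest float to 4294967295
  return 0;
}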
  // Simply reuse SITOFP when it is possible to widen the type
  if (SrcTy == s16 || SrcTy == s32) {
    const LLT WidenTy = LLT::scalar(SrcTy.getScalarSizeInBits() * 2);
    auto Ext = MIRBuilder.buildZExt(WidenTy, Src);
    MIRBuilder.buildSITOFP(Dst, Ext);
    MI.eraseFromParent();
    return true;
  }
This is reinventing widenScalar on the source
It is not. Maybe I'm missing something, but customIf works after widenScalar. Given %1(s32) = G_UITOFP %0(s16), it becomes %1(s32) = G_UITOFP %0(s32), and only then do we see it in customIf. In legalizeUITOFP we extend s32 to s64 because there it's the only way. What do we have at the end? We extend anything <s32 to s64, because widenScalar loses the information about the original type. The reinvention only allows extending s16 up to s32.

In other words, after widenScalar we don't know whether the original type was less than s32 or is s32. We need either a lookup through the instructions or to extend everything to s64 to be correct.
"Maybe I'm missing something but customIf works after widenScalar"

The order is whatever order you explicitly specified the rules in.
Thanks! It helped to avoid this ugly HasAVX512 ? s32 : s16. But still, I don't get how to avoid the reinvention. If we widen s32->s64 before custom lowering, we can't distinguish a widened s32->s64 from an original s64. The widened s32->s64 should be lowered through a plain SITOFP, while an original s64 can't be lowered without the bit logic.

Or did you mean that we need to use widenScalarSrc here, instead of creating the ZExt on our own?
    return true;
  }

  if (DstTy == s64) {
This is another expansion only in terms of generic instructions that can go in LegalizerHelper
I don't think so; it relies on the fact that the X86 version of FPTOSI returns 80000000_00000000H in case of a failed conversion.
You can't really directly rely on known target instruction behavior and still use the regular G_* opcodes. Other optimizations could do something assuming the poison behavior
Also add that as a comment somewhere?
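For reference, the expansion being discussed can be exercised with a small host-side sketch. It deliberately leans on the same x86 cvttsd2si out-of-range behavior (producing 0x8000000000000000) that this thread is about, so it is illustrative rather than portable C++, and the function name is just for the example.

// Sketch of the f64 -> u64 threshold expansion (mirrors legalizeFPTOUI).
// The (int64_t) casts stand in for FPTOSI/cvttsd2si; for X >= 2^63 the x86
// hardware returns the "integer indefinite" value 0x8000000000000000, which
// is exactly what the sign-propagating mask below relies on.
#include <cstdint>

uint64_t f64ToU64(double X) {
  const double Threshold = 0x1p63;               // 2^63
  int64_t Casted = (int64_t)X;                   // FPTOSI on X
  int64_t ResLowBits = (int64_t)(X - Threshold); // FPTOSI on (X - 2^63)
  int64_t ResHighBit = Casted >> 63;             // all-ones iff X >= 2^63
  return (uint64_t)((ResHighBit & ResLowBits) | Casted);
}
// f64ToU64(1.0)    == 1
// f64ToU64(0x1p63) == 0x8000000000000000 (2^63)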
  if (SrcTy == s64 && DstTy == s32)
    return Helper.lowerU64ToF32WithSITOFP(MI) !=
           LegalizerHelper::LegalizeResult::UnableToLegalize;

  if (SrcTy == s64 && DstTy == s64)
    // TODO: rewrite with vector shuffles when supported.
    return Helper.lowerU64ToF64BitFloatOps(MI) !=
           LegalizerHelper::LegalizeResult::UnableToLegalize;
I would expect the legalize rules to be written so that the default lower action hits these paths anyway?
Oh, that's great! If we can lower s64 as a default lower action, we can finally use widenScalar before customIf and get rid of the manual widening.

The problem here is that there is already an implementation for s64->s32 that is not efficient for X86, since the more efficient one uses SITOFP. However, I can't find a target using lower for UITOFP, so I can replace the lowering with the one that is more convenient for X86. But in the future we need either a hook or some parameters passed to the Helper.
Unfortunately no, we still don't distinguish between smaller types promoted to s32 and an original s32, because after widening s8->s32 we start rule iteration over and have no clue whether we should widen the type to s64 or not.
* New tests for i31 and i33
* Order doesn't matter much since after one action we start over
    auto IsLarge =
        MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_SLT, S1, Src, Zero);
IsLarge = slt x, 0?
Yes, because x is unsigned. Or do we want to use ult and INT_MAX to make it more explicit?
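A quick host-side check of that equivalence (reading INT_MAX above as the 64-bit signed maximum; this is illustrative C++, not part of the patch, and the helper names are made up for the example):

// For an unsigned 64-bit value, "doesn't fit the signed path" is exactly
// "top bit set", so icmp slt X, 0 and icmp ugt X, INT64_MAX agree.
#include <cassert>
#include <cstdint>

static bool isLargeSigned(uint64_t X)   { return (int64_t)X < 0; }          // ICMP_SLT X, 0
static bool isLargeUnsigned(uint64_t X) { return X > (uint64_t)INT64_MAX; } // the more explicit form

int main() {
  const uint64_t Vals[] = {0, 1, (uint64_t)INT64_MAX, (uint64_t)INT64_MAX + 1,
                           ~(uint64_t)0};
  for (uint64_t X : Vals)
    assert(isLargeSigned(X) == isLargeUnsigned(X));
  return 0;
}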
    auto RoundedHalved = MIRBuilder.buildOr(S64, Halved, LowerBit);
    auto HalvedFP = MIRBuilder.buildSITOFP(S32, RoundedHalved);
    auto LargeResult = MIRBuilder.buildFAdd(S32, HalvedFP, HalvedFP);
    // Choose
Incomplete comment?
Actually no, but I added a few lines considering the comment above about slt x, 0.
    return true;
  }

  if (DstTy == s64) {
You can't really directly rely on known target instruction behavior and still use the regular G_* opcodes. Other optimizations could do something assuming the poison behavior
Also add that as a comment somewhere?
Yes, this is true, thank you for noticing this! Updated version uses the lowering from the helper. The problem with … Probably I could've hacked it by …
I think this is fine as-is to make progress, but I think this needs to be cleaned up in a follow-up. I would suggest either emitting explicit clamping code and having an x86 combine match the instruction behavior, or introducing another G_* node with the clamping behavior and letting other targets lower that with the clamp code.
; GISEL-AVX512-NEXT:    # kill: def $al killed $al killed $eax
; GISEL-AVX512-NEXT:    retq
entry:
  %conv = fptoui float %x to i8
Looks like we're missing a fold to prefer fp_to_sint when it is legal.
Are there any reasons to prefer vcvttss2si over vcvttss2usi in the case of AVX512?
vcvttss2si on 128/256-bit vectors has a shorter encoding when it can use VEX instead of EVEX.
LGTM
@arsenm I'm not sure if a new opcode is a good idea. The x86 version clamps everything that is out of bounds to …
It is perfectly fine to define a new opcode with whatever clamping behavior exists. I would suggest it's easier to just pattern-match the explicit clamp code, though.