[X86][GlobalISel] Enable scalar versions of G_UITOFP and G_FPTOUI #100079

Merged: 8 commits into llvm:main, Sep 25, 2024

Conversation

e-kud (Contributor) commented Jul 23, 2024

Also add tests for G_SITOFP and G_FPTOSI

llvmbot (Member) commented Jul 23, 2024

@llvm/pr-subscribers-llvm-globalisel

@llvm/pr-subscribers-backend-x86

Author: Evgenii Kudriashov (e-kud)

Changes

Also add tests for G_SITOFP and G_FPTOSI


Patch is 30.34 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/100079.diff

5 Files Affected:

  • (modified) llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp (+157)
  • (modified) llvm/lib/Target/X86/GISel/X86LegalizerInfo.h (+6)
  • (modified) llvm/lib/Target/X86/GISel/X86RegisterBankInfo.cpp (+6-4)
  • (added) llvm/test/CodeGen/X86/isel-fp-to-int.ll (+323)
  • (added) llvm/test/CodeGen/X86/isel-int-to-fp.ll (+320)
diff --git a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp
index dd8ecf6ef7fc7..0f1d3343630d7 100644
--- a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp
+++ b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp
@@ -497,6 +497,53 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
       .clampScalar(0, s32, sMaxScalar)
       .widenScalarToNextPow2(1);
 
+  // For G_UITOFP and G_FPTOUI without AVX512, we have to custom legalize s16
+  // manually. Otherwise, in custom handler there is no way to understand
+  // whether s32 is an original type and we need to promote it to s64 or s32 is
+  // obtained after widening s16 and we shouldn't widen it to s64.
+  //
+  // For AVX512 we simply widen types as there is direct mapping from opcodes
+  // to asm instructions.
+  getActionDefinitionsBuilder(G_UITOFP)
+      .legalIf([=](const LegalityQuery &Query) {
+        return HasAVX512 && typeInSet(0, {s32, s64})(Query) &&
+               typeInSet(1, {s32, s64})(Query);
+      })
+      .customIf([=](const LegalityQuery &Query) {
+        if (HasAVX512)
+          return false;
+        return (HasSSE1 &&
+                (typePairInSet(0, 1, {{s32, s32}, {s32, s16}})(Query) ||
+                 (Is64Bit && typePairInSet(0, 1, {{s32, s64}})(Query)))) ||
+               (HasSSE2 &&
+                (typePairInSet(0, 1, {{s64, s32}, {s64, s16}})(Query) ||
+                 (Is64Bit && typePairInSet(0, 1, {{s64, s64}})(Query))));
+      })
+      .clampScalar(1, HasAVX512 ? s32 : s16, sMaxScalar)
+      .widenScalarToNextPow2(1)
+      .clampScalar(0, s32, HasSSE2 ? s64 : s32)
+      .widenScalarToNextPow2(0);
+
+  getActionDefinitionsBuilder(G_FPTOUI)
+      .legalIf([=](const LegalityQuery &Query) {
+        return HasAVX512 && typeInSet(0, {s32, s64})(Query) &&
+               typeInSet(1, {s32, s64})(Query);
+      })
+      .customIf([=](const LegalityQuery &Query) {
+        if (HasAVX512)
+          return false;
+        return (HasSSE1 &&
+                (typePairInSet(0, 1, {{s32, s32}, {s16, s32}})(Query) ||
+                 (Is64Bit && typePairInSet(0, 1, {{s64, s32}})(Query)))) ||
+               (HasSSE2 &&
+                (typePairInSet(0, 1, {{s32, s64}, {s16, s64}})(Query) ||
+                 (Is64Bit && typePairInSet(0, 1, {{s64, s64}})(Query))));
+      })
+      .clampScalar(1, s32, sMaxScalar)
+      .widenScalarToNextPow2(1)
+      .clampScalar(0, HasAVX512 ? s32 : s16, HasSSE2 ? s64 : s32)
+      .widenScalarToNextPow2(0);
+
   // vector ops
   getActionDefinitionsBuilder(G_BUILD_VECTOR)
       .customIf([=](const LegalityQuery &Query) {
@@ -589,6 +636,10 @@ bool X86LegalizerInfo::legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI,
     return false;
   case TargetOpcode::G_BUILD_VECTOR:
     return legalizeBuildVector(MI, MRI, Helper);
+  case TargetOpcode::G_FPTOUI:
+    return legalizeFPTOUI(MI, MRI, Helper);
+  case TargetOpcode::G_UITOFP:
+    return legalizeUITOFP(MI, MRI, Helper);
   }
   llvm_unreachable("expected switch to return");
 }
@@ -644,6 +695,112 @@ bool X86LegalizerInfo::legalizeBuildVector(MachineInstr &MI,
   return true;
 }
 
+bool X86LegalizerInfo::legalizeFPTOUI(MachineInstr &MI,
+                                      MachineRegisterInfo &MRI,
+                                      LegalizerHelper &Helper) const {
+  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
+  auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
+  unsigned DstSizeInBits = DstTy.getScalarSizeInBits();
+  const LLT s32 = LLT::scalar(32);
+  const LLT s64 = LLT::scalar(64);
+
+  // Simply reuse FPTOSI when it is possible to widen the type
+  if (DstSizeInBits == 16 || DstSizeInBits == 32) {
+    auto Casted = MIRBuilder.buildFPTOSI(LLT::scalar(DstSizeInBits * 2), Src);
+    MIRBuilder.buildTrunc(Dst, Casted);
+    MI.eraseFromParent();
+    return true;
+  }
+  if (DstTy == s64) {
+    APInt TwoPExpInt = APInt::getSignMask(DstSizeInBits);
+    APFloat TwoPExpFP(SrcTy == s32 ? APFloat::IEEEsingle()
+                                   : APFloat::IEEEdouble(),
+                      APInt::getZero(SrcTy.getSizeInBits()));
+    TwoPExpFP.convertFromAPInt(TwoPExpInt, /*IsSigned=*/false,
+                               APFloat::rmNearestTiesToEven);
+
+    // For fp Src greater or equal to Threshold(2^Exp), we use FPTOSI on
+    // (Src - 2^Exp) and add 2^Exp by setting highest bit in result to 1.
+    // For fp Src smaller, (Src - 2^Exp) is zeroed by And, the final result
+    // is FPTOSI on Src.
+    auto Casted = MIRBuilder.buildFPTOSI(DstTy, Src);
+    auto Threshold = MIRBuilder.buildFConstant(SrcTy, TwoPExpFP);
+    auto FSub = MIRBuilder.buildFSub(SrcTy, Src, Threshold);
+    auto ResLowBits = MIRBuilder.buildFPTOSI(DstTy, FSub);
+    auto Shift = MIRBuilder.buildConstant(DstTy, DstSizeInBits - 1);
+    auto ResHighBit = MIRBuilder.buildAShr(DstTy, Casted, Shift);
+    auto And = MIRBuilder.buildAnd(DstTy, ResHighBit, ResLowBits);
+    MIRBuilder.buildOr(Dst, And, Casted);
+    MI.eraseFromParent();
+    return true;
+  }
+  return false;
+}
+
+bool X86LegalizerInfo::legalizeUITOFP(MachineInstr &MI,
+                                      MachineRegisterInfo &MRI,
+                                      LegalizerHelper &Helper) const {
+  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
+  auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
+  const LLT s16 = LLT::scalar(16);
+  const LLT s32 = LLT::scalar(32);
+  const LLT s64 = LLT::scalar(64);
+
+  // Simply reuse SITOFP when it is possible to widen the type
+  if (SrcTy == s16 || SrcTy == s32) {
+    const LLT WidenTy = LLT::scalar(SrcTy.getScalarSizeInBits() * 2);
+    auto Ext = MIRBuilder.buildZExt(WidenTy, Src);
+    MIRBuilder.buildSITOFP(Dst, Ext);
+    MI.eraseFromParent();
+    return true;
+  }
+  if (SrcTy == s64 && DstTy == s32) {
+    // For i64 < INT_MAX we simply reuse SITOFP.
+    // Otherwise, divide i64 by 2, round result by ORing with the lowest bit
+    // saved before division, convert to float by SITOFP, multiply the result
+    // by 2.
+    auto SmallResult = MIRBuilder.buildSITOFP(DstTy, Src);
+    auto One = MIRBuilder.buildConstant(SrcTy, 1);
+    auto Zero = MIRBuilder.buildConstant(SrcTy, 0);
+    auto Halved = MIRBuilder.buildLShr(SrcTy, Src, One);
+    auto LowerBit = MIRBuilder.buildAnd(SrcTy, Src, One);
+    auto RoundedHalved = MIRBuilder.buildOr(SrcTy, Halved, LowerBit);
+    auto HalvedFP = MIRBuilder.buildSITOFP(DstTy, RoundedHalved);
+    auto LargeResult = MIRBuilder.buildFAdd(DstTy, HalvedFP, HalvedFP);
+    auto IsLarge = MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_SLT,
+                                        LLT::scalar(1), Src, Zero);
+    MIRBuilder.buildSelect(Dst, IsLarge, LargeResult, SmallResult);
+    MI.eraseFromParent();
+    return true;
+  }
+  if (SrcTy == s64 && DstTy == s64) {
+    // TODO: rewrite on vector shuffles when supported.
+    // We create doubles from 32 bit parts with 32 exponent difference.
+    //
+    // X = 2^52 * 1.0...LowBits
+    // Y = 2^84 * 1.0...HighBits
+    // Temp = 2^84 * 1.0...HighBits - 2^84 * 1.0 - 2^52 * 1.0
+    //      = - 2^52 * 1.0...HighBits
+    // Result = - 2^52 * 1.0...HighBits + 2^52 * 1.0...LowBits
+    auto TwoP52 = MIRBuilder.buildConstant(s64, UINT64_C(0x4330000000000000));
+    auto TwoP84 = MIRBuilder.buildConstant(s64, UINT64_C(0x4530000000000000));
+    auto TwoP52P84 = llvm::bit_cast<double>(UINT64_C(0x4530000000100000));
+    auto TwoP52P84FP = MIRBuilder.buildFConstant(s64, TwoP52P84);
+    auto HalfWidth = MIRBuilder.buildConstant(s64, 32);
+
+    auto LowBits = MIRBuilder.buildTrunc(s32, Src);
+    LowBits = MIRBuilder.buildZExt(s64, LowBits);
+    auto LowBitsFP = MIRBuilder.buildOr(s64, TwoP52, LowBits);
+    auto HighBits = MIRBuilder.buildLShr(s64, Src, HalfWidth);
+    auto HighBitsFP = MIRBuilder.buildOr(s64, TwoP84, HighBits);
+    auto Scratch = MIRBuilder.buildFSub(s64, HighBitsFP, TwoP52P84FP);
+    MIRBuilder.buildFAdd(Dst, Scratch, LowBitsFP);
+    MI.eraseFromParent();
+    return true;
+  }
+  return false;
+}
+
 bool X86LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                          MachineInstr &MI) const {
   return true;
diff --git a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.h b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.h
index 229a58986903d..39bd9892e2f16 100644
--- a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.h
+++ b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.h
@@ -39,6 +39,12 @@ class X86LegalizerInfo : public LegalizerInfo {
 private:
   bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI,
                            LegalizerHelper &Helper) const;
+
+  bool legalizeFPTOUI(MachineInstr &MI, MachineRegisterInfo &MRI,
+                      LegalizerHelper &Helper) const;
+
+  bool legalizeUITOFP(MachineInstr &MI, MachineRegisterInfo &MRI,
+                      LegalizerHelper &Helper) const;
 };
 } // namespace llvm
 #endif
diff --git a/llvm/lib/Target/X86/GISel/X86RegisterBankInfo.cpp b/llvm/lib/Target/X86/GISel/X86RegisterBankInfo.cpp
index 9e85424e76e62..43d52fc0e5c3b 100644
--- a/llvm/lib/Target/X86/GISel/X86RegisterBankInfo.cpp
+++ b/llvm/lib/Target/X86/GISel/X86RegisterBankInfo.cpp
@@ -323,7 +323,9 @@ X86RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     getInstrPartialMappingIdxs(MI, MRI, /* isFP= */ true, OpRegBankIdx);
     break;
   case TargetOpcode::G_SITOFP:
-  case TargetOpcode::G_FPTOSI: {
+  case TargetOpcode::G_FPTOSI:
+  case TargetOpcode::G_UITOFP:
+  case TargetOpcode::G_FPTOUI: {
     // Some of the floating-point instructions have mixed GPR and FP
     // operands: fine-tune the computed mapping.
     auto &Op0 = MI.getOperand(0);
@@ -331,10 +333,10 @@ X86RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     const LLT Ty0 = MRI.getType(Op0.getReg());
     const LLT Ty1 = MRI.getType(Op1.getReg());
 
-    bool FirstArgIsFP = Opc == TargetOpcode::G_SITOFP;
-    bool SecondArgIsFP = Opc == TargetOpcode::G_FPTOSI;
+    bool FirstArgIsFP =
+        Opc == TargetOpcode::G_SITOFP || Opc == TargetOpcode::G_UITOFP;
     OpRegBankIdx[0] = getPartialMappingIdx(MI, Ty0, /* isFP= */ FirstArgIsFP);
-    OpRegBankIdx[1] = getPartialMappingIdx(MI, Ty1, /* isFP= */ SecondArgIsFP);
+    OpRegBankIdx[1] = getPartialMappingIdx(MI, Ty1, /* isFP= */ !FirstArgIsFP);
     break;
   }
   case TargetOpcode::G_FCMP: {
diff --git a/llvm/test/CodeGen/X86/isel-fp-to-int.ll b/llvm/test/CodeGen/X86/isel-fp-to-int.ll
new file mode 100644
index 0000000000000..0dcca65e06d4a
--- /dev/null
+++ b/llvm/test/CodeGen/X86/isel-fp-to-int.ll
@@ -0,0 +1,323 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s                                   -mtriple=x86_64-- -verify-machineinstrs | FileCheck %s --check-prefixes X64,SDAG-X64
+; RUN: llc < %s -global-isel -global-isel-abort=1 -mtriple=x86_64-- -verify-machineinstrs | FileCheck %s --check-prefixes X64,GISEL-X64
+; RUN: llc < %s                                   -mattr=+avx512f -mtriple=x86_64-- -verify-machineinstrs | FileCheck %s --check-prefixes AVX512,SDAG-AVX512
+; RUN: llc < %s -global-isel -global-isel-abort=1 -mattr=+avx512f -mtriple=x86_64-- -verify-machineinstrs | FileCheck %s --check-prefixes AVX512,GISEL-AVX512
+
+define i64 @test_double_to_ui64(double %x) {
+; SDAG-X64-LABEL: test_double_to_ui64:
+; SDAG-X64:       # %bb.0: # %entry
+; SDAG-X64-NEXT:    cvttsd2si %xmm0, %rcx
+; SDAG-X64-NEXT:    movq %rcx, %rdx
+; SDAG-X64-NEXT:    sarq $63, %rdx
+; SDAG-X64-NEXT:    subsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SDAG-X64-NEXT:    cvttsd2si %xmm0, %rax
+; SDAG-X64-NEXT:    andq %rdx, %rax
+; SDAG-X64-NEXT:    orq %rcx, %rax
+; SDAG-X64-NEXT:    retq
+;
+; GISEL-X64-LABEL: test_double_to_ui64:
+; GISEL-X64:       # %bb.0: # %entry
+; GISEL-X64-NEXT:    cvttsd2si %xmm0, %rcx
+; GISEL-X64-NEXT:    movsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0]
+; GISEL-X64-NEXT:    subsd %xmm1, %xmm0
+; GISEL-X64-NEXT:    cvttsd2si %xmm0, %rdx
+; GISEL-X64-NEXT:    movq %rcx, %rax
+; GISEL-X64-NEXT:    sarq $63, %rax
+; GISEL-X64-NEXT:    andq %rdx, %rax
+; GISEL-X64-NEXT:    orq %rcx, %rax
+; GISEL-X64-NEXT:    retq
+;
+; AVX512-LABEL: test_double_to_ui64:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vcvttsd2usi %xmm0, %rax
+; AVX512-NEXT:    retq
+entry:
+  %conv = fptoui double %x to i64
+  ret i64 %conv
+}
+
+define i32 @test_double_to_ui32(double %x) {
+; X64-LABEL: test_double_to_ui32:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    cvttsd2si %xmm0, %rax
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
+; X64-NEXT:    retq
+;
+; AVX512-LABEL: test_double_to_ui32:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vcvttsd2usi %xmm0, %eax
+; AVX512-NEXT:    retq
+entry:
+  %conv = fptoui double %x to i32
+  ret i32 %conv
+}
+
+define zeroext i16 @test_double_to_ui16(double %x) {
+; X64-LABEL: test_double_to_ui16:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    cvttsd2si %xmm0, %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq
+;
+; SDAG-AVX512-LABEL: test_double_to_ui16:
+; SDAG-AVX512:       # %bb.0: # %entry
+; SDAG-AVX512-NEXT:    vcvttsd2si %xmm0, %eax
+; SDAG-AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; SDAG-AVX512-NEXT:    retq
+;
+; GISEL-AVX512-LABEL: test_double_to_ui16:
+; GISEL-AVX512:       # %bb.0: # %entry
+; GISEL-AVX512-NEXT:    vcvttsd2usi %xmm0, %eax
+; GISEL-AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; GISEL-AVX512-NEXT:    retq
+entry:
+  %conv = fptoui double %x to i16
+  ret i16 %conv
+}
+
+define zeroext i8 @test_double_to_ui8(double %x) {
+; X64-LABEL: test_double_to_ui8:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    cvttsd2si %xmm0, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+;
+; SDAG-AVX512-LABEL: test_double_to_ui8:
+; SDAG-AVX512:       # %bb.0: # %entry
+; SDAG-AVX512-NEXT:    vcvttsd2si %xmm0, %eax
+; SDAG-AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; SDAG-AVX512-NEXT:    retq
+;
+; GISEL-AVX512-LABEL: test_double_to_ui8:
+; GISEL-AVX512:       # %bb.0: # %entry
+; GISEL-AVX512-NEXT:    vcvttsd2usi %xmm0, %eax
+; GISEL-AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; GISEL-AVX512-NEXT:    retq
+entry:
+  %conv = fptoui double %x to i8
+  ret i8 %conv
+}
+
+define i64 @test_float_to_ui64(float %x) {
+; SDAG-X64-LABEL: test_float_to_ui64:
+; SDAG-X64:       # %bb.0: # %entry
+; SDAG-X64-NEXT:    cvttss2si %xmm0, %rcx
+; SDAG-X64-NEXT:    movq %rcx, %rdx
+; SDAG-X64-NEXT:    sarq $63, %rdx
+; SDAG-X64-NEXT:    subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SDAG-X64-NEXT:    cvttss2si %xmm0, %rax
+; SDAG-X64-NEXT:    andq %rdx, %rax
+; SDAG-X64-NEXT:    orq %rcx, %rax
+; SDAG-X64-NEXT:    retq
+;
+; GISEL-X64-LABEL: test_float_to_ui64:
+; GISEL-X64:       # %bb.0: # %entry
+; GISEL-X64-NEXT:    cvttss2si %xmm0, %rcx
+; GISEL-X64-NEXT:    movss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
+; GISEL-X64-NEXT:    subss %xmm1, %xmm0
+; GISEL-X64-NEXT:    cvttss2si %xmm0, %rdx
+; GISEL-X64-NEXT:    movq %rcx, %rax
+; GISEL-X64-NEXT:    sarq $63, %rax
+; GISEL-X64-NEXT:    andq %rdx, %rax
+; GISEL-X64-NEXT:    orq %rcx, %rax
+; GISEL-X64-NEXT:    retq
+;
+; AVX512-LABEL: test_float_to_ui64:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vcvttss2usi %xmm0, %rax
+; AVX512-NEXT:    retq
+entry:
+  %conv = fptoui float %x to i64
+  ret i64 %conv
+}
+
+define i32 @test_float_to_ui32(float %x) {
+; X64-LABEL: test_float_to_ui32:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    cvttss2si %xmm0, %rax
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
+; X64-NEXT:    retq
+;
+; AVX512-LABEL: test_float_to_ui32:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vcvttss2usi %xmm0, %eax
+; AVX512-NEXT:    retq
+entry:
+  %conv = fptoui float %x to i32
+  ret i32 %conv
+}
+
+define zeroext i16 @test_float_to_ui16(float %x) {
+; X64-LABEL: test_float_to_ui16:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    cvttss2si %xmm0, %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq
+;
+; SDAG-AVX512-LABEL: test_float_to_ui16:
+; SDAG-AVX512:       # %bb.0: # %entry
+; SDAG-AVX512-NEXT:    vcvttss2si %xmm0, %eax
+; SDAG-AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; SDAG-AVX512-NEXT:    retq
+;
+; GISEL-AVX512-LABEL: test_float_to_ui16:
+; GISEL-AVX512:       # %bb.0: # %entry
+; GISEL-AVX512-NEXT:    vcvttss2usi %xmm0, %eax
+; GISEL-AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; GISEL-AVX512-NEXT:    retq
+entry:
+  %conv = fptoui float %x to i16
+  ret i16 %conv
+}
+
+define zeroext i8 @test_float_to_ui8(float %x) {
+; X64-LABEL: test_float_to_ui8:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    cvttss2si %xmm0, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+;
+; SDAG-AVX512-LABEL: test_float_to_ui8:
+; SDAG-AVX512:       # %bb.0: # %entry
+; SDAG-AVX512-NEXT:    vcvttss2si %xmm0, %eax
+; SDAG-AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; SDAG-AVX512-NEXT:    retq
+;
+; GISEL-AVX512-LABEL: test_float_to_ui8:
+; GISEL-AVX512:       # %bb.0: # %entry
+; GISEL-AVX512-NEXT:    vcvttss2usi %xmm0, %eax
+; GISEL-AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; GISEL-AVX512-NEXT:    retq
+entry:
+  %conv = fptoui float %x to i8
+  ret i8 %conv
+}
+
+define i64 @test_double_to_si64(double %x) {
+; X64-LABEL: test_double_to_si64:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    cvttsd2si %xmm0, %rax
+; X64-NEXT:    retq
+;
+; AVX512-LABEL: test_double_to_si64:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vcvttsd2si %xmm0, %rax
+; AVX512-NEXT:    retq
+entry:
+  %conv = fptosi double %x to i64
+  ret i64 %conv
+}
+
+define i32 @test_double_to_si32(double %x) {
+; X64-LABEL: test_double_to_si32:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    cvttsd2si %xmm0, %eax
+; X64-NEXT:    retq
+;
+; AVX512-LABEL: test_double_to_si32:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vcvttsd2si %xmm0, %eax
+; AVX512-NEXT:    retq
+entry:
+  %conv = fptosi double %x to i32
+  ret i32 %conv
+}
+
+define signext i16 @test_double_to_si16(double %x) {
+; X64-LABEL: test_double_to_si16:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    cvttsd2si %xmm0, %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq
+;
+; AVX512-LABEL: test_double_to_si16:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vcvttsd2si %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    retq
+entry:
+  %conv = fptosi double %x to i16
+  ret i16 %conv
+}
+
+define signext i8 @test_double_to_si8(double %x) {
+; X64-LABEL: test_double_to_si8:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    cvttsd2si %xmm0, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+;
+; AVX512-LABEL: test_double_to_si8:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vcvttsd2si %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
+entry:
+  %conv = fptosi double %x to i8
+  ret i8 %conv
+}
+
+define i64 @test_float_to_si64(float %x) {
+; X64-LABEL: test_float_to_si64:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    cvttss2si %xmm0, %rax
+; X64-NEXT:    retq
+;
+; AVX512-LABEL: test_float_to_si64:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vcvttss2si %xmm0, %rax
+; AVX512-NEXT:    retq
+entry:
+  %conv = fptosi float %x to i64
+  ret i64 %conv
+}
+
+define i32 @test_float_to_si32(float %x) {
+; X64-LABEL: test_float_to_si32:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    cvttss2si %xmm0, %eax
+; X64-NEXT:    retq
+;
+; AVX512-LABEL: test_float_to_si32:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vcvttss2si %xmm0, %eax
+; AVX512-NEXT:    retq
+entry:
+  %conv = fptosi float %x to i32
+  ret i32 %conv
+}
+
+define signext i16 @test_float_to_si16(float %x) {
+; X64-LABEL: test_float_to_si16:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    cvttss2si %xmm0, %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq
+;
+; AVX512-LABEL: test_float_to_si16:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vcvttss2si %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    retq
+entry:
+  %conv = fptosi float %x to i16
+  ret i16 %co...
[truncated]

return false;
return (HasSSE1 &&
(typePairInSet(0, 1, {{s32, s32}, {s32, s16}})(Query) ||
(Is64Bit && typePairInSet(0, 1, {{s32, s64}})(Query)))) ||
Collaborator:

For uitofp, SSE has to cheat: zero-extend i16->i32 / i32->i64 and use sitofp, so we need Is64Bit for i32->f32 as well. We only get a real uitofp with AVX512.
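
For illustration, a minimal standalone C++ sketch of the trick described above (the function name is hypothetical; the patch builds the equivalent G_ZEXT + G_SITOFP sequence in MIR instead):

#include <cstdint>

// Without a scalar unsigned convert, a u32 source is zero-extended to 64 bits
// and converted with the signed instruction; this is exact because the
// zero-extended value is never negative. The 64-bit signed convert is what
// makes Is64Bit necessary even for i32->f32.
float u32_to_f32_via_signed(uint32_t X) {
  int64_t Wide = static_cast<int64_t>(X); // zero-extension, always >= 0
  return static_cast<float>(Wide);        // signed convert (e.g. cvtsi2ss)
}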

Contributor Author (e-kud):

Fixed.

Comment on lines 749 to 756
// Simply reuse SITOFP when it is possible to widen the type
if (SrcTy == s16 || SrcTy == s32) {
const LLT WidenTy = LLT::scalar(SrcTy.getScalarSizeInBits() * 2);
auto Ext = MIRBuilder.buildZExt(WidenTy, Src);
MIRBuilder.buildSITOFP(Dst, Ext);
MI.eraseFromParent();
return true;
}
Contributor:

This is reinventing widenScalar on the source

Contributor Author (e-kud), Jul 23, 2024:

It is not. Maybe I'm missing something, but customIf works after widenScalar. Given %1(s32) = G_UITOFP %0(s16), it becomes %1(s32) = G_UITOFP %0(s32), and only then do we see it in customIf. In legalizeUITOFP we extend s32 to s64 because at that point it is the only correct option. What do we end up with? We extend anything <s32 to s64, because widenScalar loses the information about the original type. The reinvention lets us extend s16 only up to s32.

In other words, after widenScalar we don't know whether the original type was smaller than s32 or was s32 to begin with. We need either a lookup through the instructions or to extend everything to s64 to be correct.

Contributor:

Maybe I'm missing something but customIf works after widenScalar

The order is whatever order you explicitly wrote the rules in.

Contributor Author (e-kud):

Thanks! It helped to avoid this ugly HasAVX512 ? s32 : s16. But I still don't see how to avoid the reinvention. If we widen s32->s64 before custom lowering, we can't distinguish a widened s32->s64 from an original s64: the widened s32 should be lowered through SITOFP, while an original s64 can't be lowered without the bit logic.

Or did you mean that we should use widenScalarSrc here instead of creating the ZExt on our own?

return true;
}

if (DstTy == s64) {
Contributor:

This is another expansion only in terms of generic instructions that can go in LegalizerHelper

Contributor Author (e-kud):

I don't think so; it relies on the fact that the x86 version of FPTOSI returns 80000000_00000000H in case of a failed conversion.
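
For reference, a portable C++ sketch of the expansion in question, written in a branchy form that does not depend on the out-of-range result (the branchless AND/OR form in the patch additionally leans on cvttsd2si returning 80000000_00000000H when the conversion fails):

#include <cstdint>

uint64_t f64_to_u64(double X) {
  const double TwoP63 = 9223372036854775808.0; // 2^63 as a double
  if (X < TwoP63)                              // fits in the signed range
    return static_cast<uint64_t>(static_cast<int64_t>(X));
  // X >= 2^63: convert (X - 2^63) on the signed path and set the top bit back.
  // The subtraction is exact: doubles in [2^63, 2^64) are multiples of 2^11,
  // so the difference is exactly representable.
  return static_cast<uint64_t>(static_cast<int64_t>(X - TwoP63)) |
         (UINT64_C(1) << 63);
}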

Contributor:

You can't really directly rely on known target instruction behavior and still use the regular G_* opcodes. Other optimizations could do something assuming the poison behavior

Also add that as a comment somewhere?

Comment on lines 753 to 760
if (SrcTy == s64 && DstTy == s32)
return Helper.lowerU64ToF32WithSITOFP(MI) !=
LegalizerHelper::LegalizeResult::UnableToLegalize;

if (SrcTy == s64 && DstTy == s64)
// TODO: rewrite with vector shuffles when supported.
return Helper.lowerU64ToF64BitFloatOps(MI) !=
LegalizerHelper::LegalizeResult::UnableToLegalize;
Contributor:

I would expect the legalize rules to be written so that the default lower action hits these paths anyway?

Contributor Author (e-kud), Jul 30, 2024:

Oh, that's great! If we can lower s64 as a default lower action, we can finally use widenScalar before customIf and get rid of the manual widening.
The problem here is that the existing s64->s32 lowering is not efficient for X86; the more efficient approach goes through SITOFP. However, I can't find any target that uses lower for UITOFP, so I can replace that lowering with the one that is more convenient for X86. But in the future we'll need either a hook or some parameters passed to the Helper.
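
For background, a minimal C++ sketch of the 2^52 / 2^84 bit trick used by the u64 -> f64 path (the same idea as the patch's original legalizeUITOFP code and the helper's lowerU64ToF64BitFloatOps), assuming IEEE-754 doubles and C++20's std::bit_cast:

#include <bit>
#include <cstdint>

double u64_to_f64(uint64_t X) {
  // Plant the 32-bit halves into the mantissas of 2^52 and 2^84.
  double LowBitsFP  = std::bit_cast<double>(UINT64_C(0x4330000000000000) |
                                            (X & 0xFFFFFFFFu));  // 2^52 + lo
  double HighBitsFP = std::bit_cast<double>(UINT64_C(0x4530000000000000) |
                                            (X >> 32));          // 2^84 + hi*2^32
  // 2^84 + 2^52: subtracting it cancels both planted exponent terms exactly,
  // leaving hi*2^32 - 2^52; the final add restores the low half with a single
  // correctly rounded operation.
  const double TwoP52P84 = std::bit_cast<double>(UINT64_C(0x4530000000100000));
  return (HighBitsFP - TwoP52P84) + LowBitsFP;
}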

Contributor Author (e-kud):

Unfortunately no, we still can't distinguish smaller types promoted to s32 from an original s32, because after widening s8->s32 the rule iteration starts over and we have no clue whether we should widen the type to s64 or not.

e-kud added 2 commits July 30, 2024 15:45
* New tests for i31 and i33
* Order doesn't matter much since after one action we start over
Comment on lines +7137 to +7138
auto IsLarge =
MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_SLT, S1, Src, Zero);
Contributor:

IsLarge = slt x, 0?

Contributor Author (e-kud):

Yes, because x is unsigned. Or do we want to use ult and INT_MAX to make it more explicit?
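
Spelling out the lowering under discussion, a minimal C++ sketch (illustration only; the MIR version lives in the helper's lowerU64ToF32WithSITOFP):

#include <cstdint>

float u64_to_f32(uint64_t X) {
  if (static_cast<int64_t>(X) >= 0)   // IsLarge is simply "slt X, 0"
    return static_cast<float>(static_cast<int64_t>(X));
  // Halve, keeping the dropped bit as a sticky bit (round to odd) so the
  // final doubling does not introduce a second rounding error.
  uint64_t RoundedHalved = (X >> 1) | (X & 1);
  float HalvedFP = static_cast<float>(static_cast<int64_t>(RoundedHalved));
  return HalvedFP + HalvedFP;         // multiply back by 2
}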

auto RoundedHalved = MIRBuilder.buildOr(S64, Halved, LowerBit);
auto HalvedFP = MIRBuilder.buildSITOFP(S32, RoundedHalved);
auto LargeResult = MIRBuilder.buildFAdd(S32, HalvedFP, HalvedFP);
// Choose
Contributor:

Incomplete comment?

Contributor Author (e-kud):

Actually no, but I added a few lines addressing the comment above about slt x, 0.

e-kud (Contributor Author) commented Sep 11, 2024

You can't really directly rely on known target instruction behavior and still use the regular G_* opcodes. Other optimizations could do something assuming the poison behavior

Yes, this is true, thank you for noticing this! The updated version uses the lowering from the helper. The problem with X86cvtts2Int is that there are patterns only for the vector form, in contrast to fp_to_sint. We currently struggle with this in GlobalISel.

Probably I could have hacked around it with GINodeEquiv<G_CVTTS2INT, fp_to_sint>, but I'm inclined to think that target-specific opcodes should be mapped to target-specific nodes.

arsenm (Contributor) commented Sep 11, 2024

Yes, this is true, thank you for noticing this! The updated version uses the lowering from the helper. The problem with X86cvtts2Int is that there are patterns only for the vector form, in contrast to fp_to_sint. We currently struggle with this in GlobalISel.

I think this is fine as-is to make progress, but I think this needs to be cleaned up in a follow up. I would suggest either emitting explicit clamping code and having an x86 combine match the instruction behavior, or introduce another G_* node with the clamping behavior and let other targets lower that with the clamp code

; GISEL-AVX512-NEXT: # kill: def $al killed $al killed $eax
; GISEL-AVX512-NEXT: retq
entry:
%conv = fptoui float %x to i8
Collaborator:

Looks like we're missing a fold to prefer fp_to_sint when it is legal.
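
A small sketch of why such a fold would be safe for narrow destinations (hypothetical helper, not LLVM code): every in-range u8/u16 result also fits in the positive half of i32, so the truncating signed convert gives the same value as the unsigned one.

#include <cstdint>

uint16_t f32_to_u16(float X) {
  // In-range results (0..65535) are identical on the signed path; inputs out
  // of range are poison for fptoui anyway, so either instruction may be used.
  return static_cast<uint16_t>(static_cast<int32_t>(X));
}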

Contributor Author (e-kud):

Are there any reasons to prefer vcvttss2si over vcvttss2usi in case of AVX512?

Collaborator:

vcvttss2si on 128/256-bit vectors has a shorter encoding when it can use VEX instead of EVEX.

RKSimon (Collaborator) left a comment:

LGTM

e-kud (Contributor Author) commented Sep 19, 2024

I think this is fine as-is to make progress, but I think this needs to be cleaned up in a follow up. I would suggest either emitting explicit clamping code and having an x86 combine match the instruction behavior, or introduce another G_* node with the clamping behavior and let other targets lower that with the clamp code

@arsenm I'm not sure a new opcode is a good idea. The x86 version clamps everything that is out of bounds to 80000000_00000000H, so to make it work we would need to introduce an opcode for generic clamping to a specified value or values.

arsenm (Contributor) commented Sep 20, 2024

@arsenm I'm not sure a new opcode is a good idea. The x86 version clamps everything that is out of bounds to 80000000_00000000H, so to make it work we would need to introduce an opcode for generic clamping to a specified value or values.

It is perfectly fine to define a new opcode with whatever clamping behavior exists. I would suggest it's easier to just pattern-match the explicit clamp code, though.

e-kud merged commit e9cb440 into llvm:main on Sep 25, 2024.
8 checks passed