Commit 36dd421

[X86][AVX10.2] Map vector saturated converts to public intrinsics (llvm#121483)
LLVM already supports the saturated convert ISA. With this patch we map the public LLVM saturation intrinsics onto that ISA, covering conversions from float and double to both signed and unsigned integers.
1 parent: ddba036

6 files changed, 380 insertions(+), 4 deletions(-)
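As a quick, hand-written illustration of what this mapping enables (not part of the commit; the function name @example_v8i32_v8f32 and the llc invocation in the comment are assumptions), the public saturation intrinsic on an already-legal vector type should now select a single AVX10.2 saturated convert:

; Sketch only — compile with something like:
;   llc -mtriple=x86_64-linux -mattr=+avx10.2-512
declare <8 x i32> @llvm.fptosi.sat.v8i32.v8f32(<8 x float>)

define <8 x i32> @example_v8i32_v8f32(<8 x float> %f) {
  ; Expected to lower to vcvttps2dqs %ymm0, %ymm0 via the new TableGen patterns.
  %x = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f32(<8 x float> %f)
  ret <8 x i32> %x
}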

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 64 additions & 2 deletions
@@ -341,8 +341,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     }
   }
   if (Subtarget.hasAVX10_2()) {
-    setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i32, Legal);
-    setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i32, Legal);
+    setOperationAction(ISD::FP_TO_UINT_SAT, MVT::v2i32, Custom);
+    setOperationAction(ISD::FP_TO_SINT_SAT, MVT::v2i32, Custom);
+    for (MVT VT : {MVT::i32, MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
+                   MVT::v4i64}) {
+      setOperationAction(ISD::FP_TO_UINT_SAT, VT, Legal);
+      setOperationAction(ISD::FP_TO_SINT_SAT, VT, Legal);
+    }
+    if (Subtarget.hasAVX10_2_512()) {
+      setOperationAction(ISD::FP_TO_UINT_SAT, MVT::v8i64, Legal);
+      setOperationAction(ISD::FP_TO_SINT_SAT, MVT::v8i64, Legal);
+    }
     if (Subtarget.is64Bit()) {
       setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Legal);
       setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Legal);
@@ -2656,6 +2665,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                        ISD::UINT_TO_FP,
                        ISD::STRICT_SINT_TO_FP,
                        ISD::STRICT_UINT_TO_FP,
+                       ISD::FP_TO_SINT_SAT,
+                       ISD::FP_TO_UINT_SAT,
                        ISD::SETCC,
                        ISD::MUL,
                        ISD::XOR,
@@ -33665,6 +33676,26 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     }
     return;
   }
+  case ISD::FP_TO_SINT_SAT:
+  case ISD::FP_TO_UINT_SAT: {
+    if (!Subtarget.hasAVX10_2())
+      return;
+
+    bool IsSigned = Opc == ISD::FP_TO_SINT_SAT;
+    EVT VT = N->getValueType(0);
+    SDValue Op = N->getOperand(0);
+    EVT OpVT = Op.getValueType();
+    SDValue Res;
+
+    if (VT == MVT::v2i32 && OpVT == MVT::v2f64) {
+      if (IsSigned)
+        Res = DAG.getNode(X86ISD::FP_TO_SINT_SAT, dl, MVT::v4i32, Op);
+      else
+        Res = DAG.getNode(X86ISD::FP_TO_UINT_SAT, dl, MVT::v4i32, Op);
+      Results.push_back(Res);
+    }
+    return;
+  }
   case ISD::FP_TO_SINT:
   case ISD::STRICT_FP_TO_SINT:
   case ISD::FP_TO_UINT:
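The v2i32 cases are marked Custom above; this ReplaceNodeResults hook rewrites them into the X86-specific saturated-convert node with a v4i32 result, which the VCVTTPD2DQS patterns can then match. A minimal IR sketch of a case that should take this path (the function name is invented for illustration):

declare <2 x i32> @llvm.fptosi.sat.v2i32.v2f64(<2 x double>)

define <2 x i32> @example_v2i32_v2f64(<2 x double> %f) {
  ; With AVX10.2 enabled this is expected to select vcvttpd2dqs %xmm0, %xmm0,
  ; with the converted values in the low two lanes of the v4i32 result.
  %x = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f64(<2 x double> %f)
  ret <2 x i32> %x
}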
@@ -34645,6 +34676,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(VPERMV3)
   NODE_NAME_CASE(VPERMI)
   NODE_NAME_CASE(VPTERNLOG)
+  NODE_NAME_CASE(FP_TO_SINT_SAT)
+  NODE_NAME_CASE(FP_TO_UINT_SAT)
   NODE_NAME_CASE(VFIXUPIMM)
   NODE_NAME_CASE(VFIXUPIMM_SAE)
   NODE_NAME_CASE(VFIXUPIMMS)
@@ -56202,6 +56235,33 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+// Custom handling for VCVTTPS2QQS/VCVTTPS2UQQS
+static SDValue combineFP_TO_xINT_SAT(SDNode *N, SelectionDAG &DAG,
+                                     const X86Subtarget &Subtarget) {
+  if (!Subtarget.hasAVX10_2())
+    return SDValue();
+
+  bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT_SAT;
+  EVT SrcVT = N->getOperand(0).getValueType();
+  EVT DstVT = N->getValueType(0);
+  SDLoc dl(N);
+
+  if (SrcVT == MVT::v2f32 && DstVT == MVT::v2i64) {
+    SDValue V2F32Value = DAG.getUNDEF(SrcVT);
+
+    // Concatenate the original v2f32 input and V2F32Value to create v4f32
+    SDValue NewSrc = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
+                                 N->getOperand(0), V2F32Value);
+
+    // Select the FP_TO_SINT_SAT/FP_TO_UINT_SAT node
+    if (IsSigned)
+      return DAG.getNode(X86ISD::FP_TO_SINT_SAT, dl, MVT::v2i64, NewSrc);
+
+    return DAG.getNode(X86ISD::FP_TO_UINT_SAT, dl, MVT::v2i64, NewSrc);
+  }
+  return SDValue();
+}
+
 static bool needCarryOrOverflowFlag(SDValue Flags) {
   assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
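The combine above pads a v2f32 source to v4f32 with undef upper lanes and emits the X86-specific node directly, so the 128-bit VCVTTPS2QQS/VCVTTPS2UQQS patterns can match. A minimal IR sketch of such a case (illustrative only; the function name is invented):

declare <2 x i64> @llvm.fptosi.sat.v2i64.v2f32(<2 x float>)

define <2 x i64> @example_v2i64_v2f32(<2 x float> %f) {
  ; Expected to select vcvttps2qqs %xmm0, %xmm0 once the source has been
  ; widened to v4f32 by combineFP_TO_xINT_SAT.
  %x = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f32(<2 x float> %f)
  ret <2 x i64> %x
}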

@@ -59315,6 +59375,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::INTRINSIC_WO_CHAIN: return combineINTRINSIC_WO_CHAIN(N, DAG, DCI);
   case ISD::INTRINSIC_W_CHAIN: return combineINTRINSIC_W_CHAIN(N, DAG, DCI);
   case ISD::INTRINSIC_VOID: return combineINTRINSIC_VOID(N, DAG, DCI);
+  case ISD::FP_TO_SINT_SAT:
+  case ISD::FP_TO_UINT_SAT: return combineFP_TO_xINT_SAT(N, DAG, Subtarget);
   // clang-format on
   }

llvm/lib/Target/X86/X86ISelLowering.h

Lines changed: 4 additions & 0 deletions
@@ -908,6 +908,10 @@ namespace llvm {
     // Load x87 FPU environment from memory.
     FLDENVm,
 
+    // Custom handling for FP_TO_xINT_SAT
+    FP_TO_SINT_SAT,
+    FP_TO_UINT_SAT,
+
     /// This instruction implements FP_TO_SINT with the
     /// integer destination in memory and a FP reg source. This corresponds
     /// to the X86::FIST*m instructions and the rounding mode change stuff. It

llvm/lib/Target/X86/X86InstrAVX10.td

Lines changed: 64 additions & 0 deletions
@@ -834,6 +834,70 @@ let Predicates = [HasAVX10_2] in {
 // patterns have been disabled with null_frag.
 // Patterns VCVTTPD2DQSZ128
 
+// VCVTTPD2DQS
+def : Pat<(v4i32(X86fp2sisat(v2f64 VR128X:$src))),
+          (VCVTTPD2DQSZ128rr VR128X:$src)>;
+def : Pat<(v4i32(fp_to_sint_sat(v4f64 VR256X:$src), i32)),
+          (VCVTTPD2DQSZ256rr VR256X:$src)>;
+def : Pat<(v8i32(fp_to_sint_sat(v8f64 VR512:$src), i32)),
+          (VCVTTPD2DQSZrr VR512:$src)>;
+
+// VCVTTPD2QQS
+def : Pat<(v2i64(fp_to_sint_sat(v2f64 VR128X:$src), i64)),
+          (VCVTTPD2QQSZ128rr VR128X:$src)>;
+def : Pat<(v4i64(fp_to_sint_sat(v4f64 VR256X:$src), i64)),
+          (VCVTTPD2QQSZ256rr VR256X:$src)>;
+def : Pat<(v8i64(fp_to_sint_sat(v8f64 VR512:$src), i64)),
+          (VCVTTPD2QQSZrr VR512:$src)>;
+
+// VCVTTPD2UDQS
+def : Pat<(v4i32(X86fp2uisat(v2f64 VR128X:$src))),
+          (VCVTTPD2UDQSZ128rr VR128X:$src)>;
+def : Pat<(v4i32(fp_to_uint_sat(v4f64 VR256X:$src), i32)),
+          (VCVTTPD2UDQSZ256rr VR256X:$src)>;
+def : Pat<(v8i32(fp_to_uint_sat(v8f64 VR512:$src), i32)),
+          (VCVTTPD2UDQSZrr VR512:$src)>;
+
+// VCVTTPD2UQQS
+def : Pat<(v2i64(fp_to_uint_sat(v2f64 VR128X:$src), i64)),
+          (VCVTTPD2UQQSZ128rr VR128X:$src)>;
+def : Pat<(v4i64(fp_to_uint_sat(v4f64 VR256X:$src), i64)),
+          (VCVTTPD2UQQSZ256rr VR256X:$src)>;
+def : Pat<(v8i64(fp_to_uint_sat(v8f64 VR512:$src), i64)),
+          (VCVTTPD2UQQSZrr VR512:$src)>;
+
+// VCVTTPS2DQS
+def : Pat<(v4i32(fp_to_sint_sat(v4f32 VR128X:$src), i32)),
+          (VCVTTPS2DQSZ128rr VR128X:$src)>;
+def : Pat<(v8i32(fp_to_sint_sat(v8f32 VR256X:$src), i32)),
+          (VCVTTPS2DQSZ256rr VR256X:$src)>;
+def : Pat<(v16i32(fp_to_sint_sat(v16f32 VR512:$src), i32)),
+          (VCVTTPS2DQSZrr VR512:$src)>;
+
+// VCVTTPS2QQS
+def : Pat<(v2i64(X86fp2sisat(v4f32 VR128X:$src))),
+          (VCVTTPS2QQSZ128rr VR128X:$src)>;
+def : Pat<(v4i64(fp_to_sint_sat(v4f32 VR128X:$src), i64)),
+          (VCVTTPS2QQSZ256rr VR128X:$src)>;
+def : Pat<(v8i64(fp_to_sint_sat(v8f32 VR256X:$src), i64)),
+          (VCVTTPS2QQSZrr VR256X:$src)>;
+
+// VCVTTPS2UDQS
+def : Pat<(v4i32(fp_to_uint_sat(v4f32 VR128X:$src), i32)),
+          (VCVTTPS2UDQSZ128rr VR128X:$src)>;
+def : Pat<(v8i32(fp_to_uint_sat(v8f32 VR256X:$src), i32)),
+          (VCVTTPS2UDQSZ256rr VR256X:$src)>;
+def : Pat<(v16i32(fp_to_uint_sat(v16f32 VR512:$src), i32)),
+          (VCVTTPS2UDQSZrr VR512:$src)>;
+
+// VCVTTPS2UQQS
+def : Pat<(v2i64(X86fp2uisat(v4f32 VR128X:$src))),
+          (VCVTTPS2UQQSZ128rr VR128X:$src)>;
+def : Pat<(v4i64(fp_to_uint_sat(v4f32 VR128X:$src), i64)),
+          (VCVTTPS2UQQSZ256rr VR128X:$src)>;
+def : Pat<(v8i64(fp_to_uint_sat(v8f32 VR256X:$src), i64)),
+          (VCVTTPS2UQQSZrr VR256X:$src)>;
+
 def : Pat<(v4i32 (X86cvttp2sis (v2f64 VR128X:$src))),
           (VCVTTPD2DQSZ128rr VR128X:$src)>;
 def : Pat<(v4i32 (X86cvttp2sis (loadv2f64 addr:$src))),
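For legal vector types the generic fp_to_sint_sat/fp_to_uint_sat nodes are matched directly by the patterns above. For example, a 256-bit double-to-quadword saturated convert (a sketch, not taken from the commit's tests; the function name is invented):

declare <4 x i64> @llvm.fptosi.sat.v4i64.v4f64(<4 x double>)

define <4 x i64> @example_v4i64_v4f64(<4 x double> %f) {
  ; Expected to match VCVTTPD2QQSZ256rr, i.e. vcvttpd2qqs %ymm0, %ymm0.
  %x = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f64(<4 x double> %f)
  ret <4 x i64> %x
}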

llvm/lib/Target/X86/X86InstrFragmentsSIMD.td

Lines changed: 7 additions & 0 deletions
@@ -390,6 +390,13 @@ def SDTFmaRound : SDTypeProfile<1, 4, [SDTCisSameAs<0,1>,
                                        SDTCisSameAs<1,2>, SDTCisSameAs<1,3>,
                                        SDTCisFP<0>, SDTCisVT<4, i32>]>;
 
+def SDTFPToxIntSatOp
+    : SDTypeProfile<1,
+                    1, [SDTCisVec<0>, SDTCisVec<1>, SDTCisInt<0>, SDTCisFP<1>]>;
+
+def X86fp2sisat : SDNode<"X86ISD::FP_TO_SINT_SAT", SDTFPToxIntSatOp>;
+def X86fp2uisat : SDNode<"X86ISD::FP_TO_UINT_SAT", SDTFPToxIntSatOp>;
+
 def X86PAlignr : SDNode<"X86ISD::PALIGNR",
                         SDTypeProfile<1, 3, [SDTCVecEltisVT<0, i8>,
                                              SDTCisSameAs<0,1>,
Lines changed: 85 additions & 0 deletions
@@ -0,0 +1,85 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-linux -mattr=+avx10.2-512 | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc < %s -mtriple=x86_64-linux -mattr=+avx10.2-512 | FileCheck %s --check-prefixes=CHECK,X64
+
+; VCVTTPD2DQS
+define <8 x i32> @test_signed_v8i32_v8f64(<8 x double> %f) nounwind {
+; CHECK-LABEL: test_signed_v8i32_v8f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttpd2dqs %zmm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f64(<8 x double> %f)
+  ret <8 x i32> %x
+}
+
+; VCVTTPD2QQS
+define <8 x i64> @test_signed_v8i64_v8f64(<8 x double> %f) nounwind {
+; CHECK-LABEL: test_signed_v8i64_v8f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttpd2qqs %zmm0, %zmm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f64(<8 x double> %f)
+  ret <8 x i64> %x
+}
+
+; VCVTTPD2UDQS
+define <8 x i32> @test_unsigned_v8i32_v8f64(<8 x double> %f) nounwind {
+; CHECK-LABEL: test_unsigned_v8i32_v8f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttpd2udqs %zmm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f64(<8 x double> %f)
+  ret <8 x i32> %x
+}
+
+; VCVTTPD2UQQS
+define <8 x i64> @test_unsigned_v8i64_v8f64(<8 x double> %f) nounwind {
+; CHECK-LABEL: test_unsigned_v8i64_v8f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttpd2uqqs %zmm0, %zmm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f64(<8 x double> %f)
+  ret <8 x i64> %x
+}
+
+; VCVTTPS2DQS
+define <16 x i32> @test_signed_v16i32_v16f32(<16 x float> %f) nounwind {
+; CHECK-LABEL: test_signed_v16i32_v16f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttps2dqs %zmm0, %zmm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f32(<16 x float> %f)
+  ret <16 x i32> %x
+}
+
+; VCVTTPS2UDQS
+define <16 x i32> @test_unsigned_v16i32_v16f32(<16 x float> %f) nounwind {
+; CHECK-LABEL: test_unsigned_v16i32_v16f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttps2udqs %zmm0, %zmm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f32(<16 x float> %f)
+  ret <16 x i32> %x
+}
+; VCVTTPS2QQS
+define <8 x i64> @test_signed_v8i64_v8f32(<8 x float> %f) nounwind {
+; CHECK-LABEL: test_signed_v8i64_v8f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttps2qqs %ymm0, %zmm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f32(<8 x float> %f)
+  ret <8 x i64> %x
+}
+
+; VCVTTPS2UQQS
+define <8 x i64> @test_unsigned_v8i64_v8f32(<8 x float> %f) nounwind {
+; CHECK-LABEL: test_unsigned_v8i64_v8f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttps2uqqs %ymm0, %zmm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %x = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f32(<8 x float> %f)
+  ret <8 x i64> %x
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; X64: {{.*}}
+; X86: {{.*}}
