Skip to content

Commit 451af63

Browse files
committed
[X86] Remove combineVectorTruncation and delay general vector trunc to lowering
Stop folding vector truncations to PACKSS/PACKUS patterns prematurely - another step towards Issue #63710. We still prematurely fold to PACKSS/PACKUS if there are sufficient signbits; that will be addressed in a later patch when we remove combineVectorSignBitsTruncation. This required ReplaceNodeResults to extend handling of sub-128-bit results to SSSE3 (or later) cases, which has allowed us to improve vXi32->vXi16 truncations to use PSHUFB. I also tweaked LowerTruncateVecPack to recognise widened truncation source operands so the upper elements remain UNDEF (otherwise truncateVectorWithPACK* will constant fold them to allzeros/allones values).
1 parent ddb46ab commit 451af63

File tree

5 files changed

+338
-236
lines changed

5 files changed

+338
-236
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 26 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -22933,6 +22933,27 @@ static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL,
2293322933
return SDValue();
2293422934
}
2293522935

22936+
// If the upper half of the source is undef, then attempt to split and
22937+
// only truncate the lower half.
22938+
if (DstVT.getSizeInBits() >= 128) {
22939+
SmallVector<SDValue> SubOps;
22940+
if (collectConcatOps(In.getNode(), SubOps, DAG)) {
22941+
ArrayRef<SDValue> LowerOps(SubOps.begin(), SubOps.end());
22942+
ArrayRef<SDValue> UpperOps(SubOps.begin(), SubOps.end());
22943+
LowerOps = LowerOps.drop_back(SubOps.size() / 2);
22944+
UpperOps = UpperOps.drop_front(SubOps.size() / 2);
22945+
if (all_of(UpperOps, [](SDValue Op) { return Op.isUndef(); })) {
22946+
MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
22947+
MVT SrcHalfVT = SrcVT.getHalfNumVectorElementsVT();
22948+
SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcHalfVT, LowerOps);
22949+
if (SDValue Res =
22950+
LowerTruncateVecPack(DstHalfVT, Lo, DL, Subtarget, DAG))
22951+
return widenSubVector(Res, false, Subtarget, DAG, DL,
22952+
DstVT.getSizeInBits());
22953+
}
22954+
}
22955+
}
22956+
2293622957
// SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
2293722958
// for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
2293822959
// truncate 2 x v4i32 to v8i16.
@@ -34615,12 +34636,12 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
3461534636
return;
3461634637
}
3461734638

34618-
// Pre-SSSE3 (or v4i64 -> v4i16) widen the truncation input vector to let
34619-
// LowerTRUNCATE handle this via type legalization.
34639+
// Attempt to widen the truncation input vector to let LowerTRUNCATE handle
34640+
// this via type legalization.
3462034641
if ((InEltVT == MVT::i16 || InEltVT == MVT::i32 || InEltVT == MVT::i64) &&
3462134642
(EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32) &&
34622-
(!Subtarget.hasSSSE3() || (InVT == MVT::v4i64 && VT == MVT::v4i16)) &&
34623-
!Subtarget.hasAVX()) {
34643+
(!Subtarget.hasSSSE3() || (InVT == MVT::v8i64 && VT == MVT::v8i8) ||
34644+
(InVT == MVT::v4i64 && VT == MVT::v4i16 && !Subtarget.hasAVX()))) {
3462434645
SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl,
3462534646
InEltVT.getSizeInBits() * WidenNumElts);
3462634647
Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, WidenVT, WidenIn));
@@ -53266,57 +53287,6 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
5326653287
return SDValue();
5326753288
}
5326853289

53269-
/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
53270-
/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
53271-
/// legalization the truncation will be translated into a BUILD_VECTOR with each
53272-
/// element that is extracted from a vector and then truncated, and it is
53273-
/// difficult to do this optimization based on them.
53274-
/// TODO: Remove this and just use LowerTruncateVecPack.
53275-
static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
53276-
const X86Subtarget &Subtarget) {
53277-
EVT OutVT = N->getValueType(0);
53278-
if (!OutVT.isVector())
53279-
return SDValue();
53280-
53281-
SDValue In = N->getOperand(0);
53282-
if (!In.getValueType().isSimple())
53283-
return SDValue();
53284-
53285-
EVT InVT = In.getValueType();
53286-
unsigned NumElems = OutVT.getVectorNumElements();
53287-
53288-
// AVX512 provides fast truncate ops.
53289-
if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
53290-
return SDValue();
53291-
53292-
EVT OutSVT = OutVT.getVectorElementType();
53293-
EVT InSVT = InVT.getVectorElementType();
53294-
if (!((InSVT == MVT::i16 || InSVT == MVT::i32 || InSVT == MVT::i64) &&
53295-
(OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
53296-
NumElems >= 8))
53297-
return SDValue();
53298-
53299-
// SSSE3's pshufb results in less instructions in the cases below.
53300-
if (Subtarget.hasSSSE3() && NumElems == 8) {
53301-
if (InSVT == MVT::i16)
53302-
return SDValue();
53303-
if (InSVT == MVT::i32 &&
53304-
(OutSVT == MVT::i8 || !Subtarget.hasSSE41() || Subtarget.hasInt256()))
53305-
return SDValue();
53306-
}
53307-
53308-
SDLoc DL(N);
53309-
// SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
53310-
// for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
53311-
// truncate 2 x v4i32 to v8i16.
53312-
if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
53313-
return truncateVectorWithPACKUS(OutVT, In, DL, Subtarget, DAG);
53314-
if (InSVT == MVT::i32)
53315-
return truncateVectorWithPACKSS(OutVT, In, DL, Subtarget, DAG);
53316-
53317-
return SDValue();
53318-
}
53319-
5332053290
/// This function transforms vector truncation of 'extended sign-bits' or
5332153291
/// 'extended zero-bits' values.
5332253292
/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
@@ -53664,7 +53634,7 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
5366453634
if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
5366553635
return V;
5366653636

53667-
return combineVectorTruncation(N, DAG, Subtarget);
53637+
return SDValue();
5366853638
}
5366953639

5367053640
static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,

llvm/test/CodeGen/X86/psubus.ll

Lines changed: 86 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -1798,50 +1798,92 @@ vector.ph:
17981798
}
17991799

18001800
define <16 x i16> @psubus_16i32_max(<16 x i16> %x, <16 x i32> %y) nounwind {
1801-
; SSE2OR3-LABEL: psubus_16i32_max:
1802-
; SSE2OR3: # %bb.0: # %vector.ph
1803-
; SSE2OR3-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648]
1804-
; SSE2OR3-NEXT: movdqa %xmm5, %xmm8
1805-
; SSE2OR3-NEXT: pxor %xmm7, %xmm8
1806-
; SSE2OR3-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183]
1807-
; SSE2OR3-NEXT: movdqa %xmm6, %xmm9
1808-
; SSE2OR3-NEXT: pcmpgtd %xmm8, %xmm9
1809-
; SSE2OR3-NEXT: pcmpeqd %xmm8, %xmm8
1810-
; SSE2OR3-NEXT: pand %xmm9, %xmm5
1811-
; SSE2OR3-NEXT: pxor %xmm8, %xmm9
1812-
; SSE2OR3-NEXT: por %xmm5, %xmm9
1813-
; SSE2OR3-NEXT: pslld $16, %xmm9
1814-
; SSE2OR3-NEXT: psrad $16, %xmm9
1815-
; SSE2OR3-NEXT: movdqa %xmm4, %xmm10
1816-
; SSE2OR3-NEXT: pxor %xmm7, %xmm10
1817-
; SSE2OR3-NEXT: movdqa %xmm6, %xmm5
1818-
; SSE2OR3-NEXT: pcmpgtd %xmm10, %xmm5
1819-
; SSE2OR3-NEXT: pand %xmm5, %xmm4
1820-
; SSE2OR3-NEXT: pxor %xmm8, %xmm5
1821-
; SSE2OR3-NEXT: por %xmm4, %xmm5
1822-
; SSE2OR3-NEXT: pslld $16, %xmm5
1823-
; SSE2OR3-NEXT: psrad $16, %xmm5
1824-
; SSE2OR3-NEXT: packssdw %xmm9, %xmm5
1825-
; SSE2OR3-NEXT: movdqa %xmm3, %xmm4
1826-
; SSE2OR3-NEXT: pxor %xmm7, %xmm4
1827-
; SSE2OR3-NEXT: movdqa %xmm6, %xmm9
1828-
; SSE2OR3-NEXT: pcmpgtd %xmm4, %xmm9
1829-
; SSE2OR3-NEXT: pand %xmm9, %xmm3
1830-
; SSE2OR3-NEXT: pxor %xmm8, %xmm9
1831-
; SSE2OR3-NEXT: por %xmm3, %xmm9
1832-
; SSE2OR3-NEXT: pslld $16, %xmm9
1833-
; SSE2OR3-NEXT: psrad $16, %xmm9
1834-
; SSE2OR3-NEXT: pxor %xmm2, %xmm7
1835-
; SSE2OR3-NEXT: pcmpgtd %xmm7, %xmm6
1836-
; SSE2OR3-NEXT: pxor %xmm6, %xmm8
1837-
; SSE2OR3-NEXT: pand %xmm2, %xmm6
1838-
; SSE2OR3-NEXT: por %xmm8, %xmm6
1839-
; SSE2OR3-NEXT: pslld $16, %xmm6
1840-
; SSE2OR3-NEXT: psrad $16, %xmm6
1841-
; SSE2OR3-NEXT: packssdw %xmm9, %xmm6
1842-
; SSE2OR3-NEXT: psubusw %xmm6, %xmm0
1843-
; SSE2OR3-NEXT: psubusw %xmm5, %xmm1
1844-
; SSE2OR3-NEXT: retq
1801+
; SSE2-LABEL: psubus_16i32_max:
1802+
; SSE2: # %bb.0: # %vector.ph
1803+
; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648]
1804+
; SSE2-NEXT: movdqa %xmm3, %xmm8
1805+
; SSE2-NEXT: pxor %xmm7, %xmm8
1806+
; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183]
1807+
; SSE2-NEXT: movdqa %xmm6, %xmm9
1808+
; SSE2-NEXT: pcmpgtd %xmm8, %xmm9
1809+
; SSE2-NEXT: pcmpeqd %xmm8, %xmm8
1810+
; SSE2-NEXT: pand %xmm9, %xmm3
1811+
; SSE2-NEXT: pxor %xmm8, %xmm9
1812+
; SSE2-NEXT: por %xmm3, %xmm9
1813+
; SSE2-NEXT: pslld $16, %xmm9
1814+
; SSE2-NEXT: psrad $16, %xmm9
1815+
; SSE2-NEXT: movdqa %xmm2, %xmm3
1816+
; SSE2-NEXT: pxor %xmm7, %xmm3
1817+
; SSE2-NEXT: movdqa %xmm6, %xmm10
1818+
; SSE2-NEXT: pcmpgtd %xmm3, %xmm10
1819+
; SSE2-NEXT: pand %xmm10, %xmm2
1820+
; SSE2-NEXT: pxor %xmm8, %xmm10
1821+
; SSE2-NEXT: por %xmm2, %xmm10
1822+
; SSE2-NEXT: pslld $16, %xmm10
1823+
; SSE2-NEXT: psrad $16, %xmm10
1824+
; SSE2-NEXT: packssdw %xmm9, %xmm10
1825+
; SSE2-NEXT: psubusw %xmm10, %xmm0
1826+
; SSE2-NEXT: movdqa %xmm5, %xmm2
1827+
; SSE2-NEXT: pxor %xmm7, %xmm2
1828+
; SSE2-NEXT: movdqa %xmm6, %xmm3
1829+
; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
1830+
; SSE2-NEXT: pand %xmm3, %xmm5
1831+
; SSE2-NEXT: pxor %xmm8, %xmm3
1832+
; SSE2-NEXT: por %xmm5, %xmm3
1833+
; SSE2-NEXT: pslld $16, %xmm3
1834+
; SSE2-NEXT: psrad $16, %xmm3
1835+
; SSE2-NEXT: pxor %xmm4, %xmm7
1836+
; SSE2-NEXT: pcmpgtd %xmm7, %xmm6
1837+
; SSE2-NEXT: pxor %xmm6, %xmm8
1838+
; SSE2-NEXT: pand %xmm4, %xmm6
1839+
; SSE2-NEXT: por %xmm8, %xmm6
1840+
; SSE2-NEXT: pslld $16, %xmm6
1841+
; SSE2-NEXT: psrad $16, %xmm6
1842+
; SSE2-NEXT: packssdw %xmm3, %xmm6
1843+
; SSE2-NEXT: psubusw %xmm6, %xmm1
1844+
; SSE2-NEXT: retq
1845+
;
1846+
; SSSE3-LABEL: psubus_16i32_max:
1847+
; SSSE3: # %bb.0: # %vector.ph
1848+
; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147483648,2147483648,2147483648,2147483648]
1849+
; SSSE3-NEXT: movdqa %xmm3, %xmm8
1850+
; SSSE3-NEXT: pxor %xmm7, %xmm8
1851+
; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183]
1852+
; SSSE3-NEXT: movdqa %xmm6, %xmm9
1853+
; SSSE3-NEXT: pcmpgtd %xmm8, %xmm9
1854+
; SSSE3-NEXT: pcmpeqd %xmm8, %xmm8
1855+
; SSSE3-NEXT: pand %xmm9, %xmm3
1856+
; SSSE3-NEXT: pxor %xmm8, %xmm9
1857+
; SSSE3-NEXT: por %xmm3, %xmm9
1858+
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1859+
; SSSE3-NEXT: pshufb %xmm3, %xmm9
1860+
; SSSE3-NEXT: movdqa %xmm2, %xmm10
1861+
; SSSE3-NEXT: pxor %xmm7, %xmm10
1862+
; SSSE3-NEXT: movdqa %xmm6, %xmm11
1863+
; SSSE3-NEXT: pcmpgtd %xmm10, %xmm11
1864+
; SSSE3-NEXT: pand %xmm11, %xmm2
1865+
; SSSE3-NEXT: pxor %xmm8, %xmm11
1866+
; SSSE3-NEXT: por %xmm2, %xmm11
1867+
; SSSE3-NEXT: pshufb %xmm3, %xmm11
1868+
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm11 = xmm11[0],xmm9[0]
1869+
; SSSE3-NEXT: psubusw %xmm11, %xmm0
1870+
; SSSE3-NEXT: movdqa %xmm5, %xmm2
1871+
; SSSE3-NEXT: pxor %xmm7, %xmm2
1872+
; SSSE3-NEXT: movdqa %xmm6, %xmm9
1873+
; SSSE3-NEXT: pcmpgtd %xmm2, %xmm9
1874+
; SSSE3-NEXT: pand %xmm9, %xmm5
1875+
; SSSE3-NEXT: pxor %xmm8, %xmm9
1876+
; SSSE3-NEXT: por %xmm5, %xmm9
1877+
; SSSE3-NEXT: pshufb %xmm3, %xmm9
1878+
; SSSE3-NEXT: pxor %xmm4, %xmm7
1879+
; SSSE3-NEXT: pcmpgtd %xmm7, %xmm6
1880+
; SSSE3-NEXT: pxor %xmm6, %xmm8
1881+
; SSSE3-NEXT: pand %xmm4, %xmm6
1882+
; SSSE3-NEXT: por %xmm8, %xmm6
1883+
; SSSE3-NEXT: pshufb %xmm3, %xmm6
1884+
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm9[0]
1885+
; SSSE3-NEXT: psubusw %xmm6, %xmm1
1886+
; SSSE3-NEXT: retq
18451887
;
18461888
; SSE41-LABEL: psubus_16i32_max:
18471889
; SSE41: # %bb.0: # %vector.ph

0 commit comments

Comments
 (0)