Skip to content

Commit 88e7b8b

Browse files
[SLP]Use TTI::getScalarizationOverhead where possible
It is better to use TTI::getScalarizationOverhead instead of TTI::getVectorInstrCost, so that the costs of buildvectors/extracts are calculated correctly. Reviewers: RKSimon. Reviewed By: RKSimon. Pull Request: #125725.
1 parent 13432e0 commit 88e7b8b

File tree

7 files changed

+152
-93
lines changed

7 files changed

+152
-93
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 46 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -10706,6 +10706,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
1070610706
});
1070710707
SmallPtrSet<Value *, 4> UniqueBases;
1070810708
unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
10709+
SmallDenseMap<Value *, APInt, 4> VectorOpsToExtracts;
1070910710
for (unsigned Part : seq<unsigned>(NumParts)) {
1071010711
unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
1071110712
ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
@@ -10756,10 +10757,18 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
1075610757
continue;
1075710758
}
1075810759
}
10759-
Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(),
10760-
CostKind, Idx);
10761-
}
10762-
}
10760+
APInt &DemandedElts =
10761+
VectorOpsToExtracts
10762+
.try_emplace(VecBase,
10763+
APInt::getZero(getNumElements(VecBase->getType())))
10764+
.first->getSecond();
10765+
DemandedElts.setBit(Idx);
10766+
}
10767+
}
10768+
for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
10769+
Cost -= TTI.getScalarizationOverhead(cast<VectorType>(Vec->getType()),
10770+
DemandedElts, /*Insert=*/false,
10771+
/*Extract=*/true, CostKind);
1076310772
// Check that gather of extractelements can be represented as just a
1076410773
// shuffle of a single/two vectors the scalars are extracted from.
1076510774
// Found the bunch of extractelement instructions that must be gathered
@@ -11283,24 +11292,27 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
1128311292
}
1128411293
case Instruction::ExtractValue:
1128511294
case Instruction::ExtractElement: {
11295+
APInt DemandedElts;
11296+
VectorType *SrcVecTy = nullptr;
1128611297
auto GetScalarCost = [&](unsigned Idx) {
1128711298
if (isa<PoisonValue>(UniqueValues[Idx]))
1128811299
return InstructionCost(TTI::TCC_Free);
1128911300

1129011301
auto *I = cast<Instruction>(UniqueValues[Idx]);
11291-
VectorType *SrcVecTy;
11292-
if (ShuffleOrOp == Instruction::ExtractElement) {
11293-
auto *EE = cast<ExtractElementInst>(I);
11294-
SrcVecTy = EE->getVectorOperandType();
11295-
} else {
11296-
auto *EV = cast<ExtractValueInst>(I);
11297-
Type *AggregateTy = EV->getAggregateOperand()->getType();
11298-
unsigned NumElts;
11299-
if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
11300-
NumElts = ATy->getNumElements();
11301-
else
11302-
NumElts = AggregateTy->getStructNumElements();
11303-
SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
11302+
if (!SrcVecTy) {
11303+
if (ShuffleOrOp == Instruction::ExtractElement) {
11304+
auto *EE = cast<ExtractElementInst>(I);
11305+
SrcVecTy = EE->getVectorOperandType();
11306+
} else {
11307+
auto *EV = cast<ExtractValueInst>(I);
11308+
Type *AggregateTy = EV->getAggregateOperand()->getType();
11309+
unsigned NumElts;
11310+
if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
11311+
NumElts = ATy->getNumElements();
11312+
else
11313+
NumElts = AggregateTy->getStructNumElements();
11314+
SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
11315+
}
1130411316
}
1130511317
if (I->hasOneUse()) {
1130611318
Instruction *Ext = I->user_back();
@@ -11317,10 +11329,18 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
1131711329
return Cost;
1131811330
}
1131911331
}
11320-
return TTI->getVectorInstrCost(Instruction::ExtractElement, SrcVecTy,
11321-
CostKind, *getExtractIndex(I));
11332+
if (DemandedElts.isZero())
11333+
DemandedElts = APInt::getZero(getNumElements(SrcVecTy));
11334+
DemandedElts.setBit(*getExtractIndex(I));
11335+
return InstructionCost(TTI::TCC_Free);
11336+
};
11337+
auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
11338+
return CommonCost - (DemandedElts.isZero()
11339+
? TTI::TCC_Free
11340+
: TTI.getScalarizationOverhead(
11341+
SrcVecTy, DemandedElts, /*Insert=*/false,
11342+
/*Extract=*/true, CostKind));
1132211343
};
11323-
auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; };
1132411344
return GetCostDiff(GetScalarCost, GetVectorCost);
1132511345
}
1132611346
case Instruction::InsertElement: {
@@ -13663,6 +13683,7 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
1366313683
// Check if the same elements are inserted several times and count them as
1366413684
// shuffle candidates.
1366513685
APInt ShuffledElements = APInt::getZero(VL.size());
13686+
APInt DemandedElements = APInt::getZero(VL.size());
1366613687
DenseMap<Value *, unsigned> UniqueElements;
1366713688
constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1366813689
InstructionCost Cost;
@@ -13673,9 +13694,7 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
1367313694
V = nullptr;
1367413695
}
1367513696
if (!ForPoisonSrc)
13676-
Cost +=
13677-
TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
13678-
I, Constant::getNullValue(VecTy), V);
13697+
DemandedElements.setBit(I);
1367913698
};
1368013699
SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
1368113700
for (unsigned I = 0, E = VL.size(); I < E; ++I) {
@@ -13698,6 +13717,10 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
1369813717
ShuffledElements.setBit(I);
1369913718
ShuffleMask[I] = Res.first->second;
1370013719
}
13720+
if (!DemandedElements.isZero())
13721+
Cost +=
13722+
TTI->getScalarizationOverhead(VecTy, DemandedElements, /*Insert=*/true,
13723+
/*Extract=*/false, CostKind, VL);
1370113724
if (ForPoisonSrc) {
1370213725
if (isa<FixedVectorType>(ScalarTy)) {
1370313726
assert(SLPReVec && "Only supported by REVEC.");

llvm/test/Transforms/SLPVectorizer/AArch64/div.ll

Lines changed: 7 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -607,35 +607,13 @@ define <2 x i32> @sdiv_v2i32_unknown_divisor(<2 x i32> %a, <2 x i32> %x, <2 x i3
607607

608608
; computes (a/const + x - y) * z
609609
define <2 x i32> @sdiv_v2i32_const_divisor(<2 x i32> %a, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
610-
; NO-SVE-LABEL: define <2 x i32> @sdiv_v2i32_const_divisor(
611-
; NO-SVE-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]], <2 x i32> [[Z:%.*]]) #[[ATTR0]] {
612-
; NO-SVE-NEXT: [[A0:%.*]] = extractelement <2 x i32> [[A]], i64 0
613-
; NO-SVE-NEXT: [[A1:%.*]] = extractelement <2 x i32> [[A]], i64 1
614-
; NO-SVE-NEXT: [[TMP1:%.*]] = sdiv i32 [[A0]], 2
615-
; NO-SVE-NEXT: [[TMP2:%.*]] = sdiv i32 [[A1]], 4
616-
; NO-SVE-NEXT: [[X0:%.*]] = extractelement <2 x i32> [[X]], i64 0
617-
; NO-SVE-NEXT: [[X1:%.*]] = extractelement <2 x i32> [[X]], i64 1
618-
; NO-SVE-NEXT: [[TMP3:%.*]] = add i32 [[TMP1]], [[X0]]
619-
; NO-SVE-NEXT: [[TMP4:%.*]] = add i32 [[TMP2]], [[X1]]
620-
; NO-SVE-NEXT: [[Y0:%.*]] = extractelement <2 x i32> [[Y]], i64 0
621-
; NO-SVE-NEXT: [[Y1:%.*]] = extractelement <2 x i32> [[Y]], i64 1
622-
; NO-SVE-NEXT: [[TMP5:%.*]] = sub i32 [[TMP3]], [[Y0]]
623-
; NO-SVE-NEXT: [[TMP6:%.*]] = sub i32 [[TMP4]], [[Y1]]
624-
; NO-SVE-NEXT: [[Z0:%.*]] = extractelement <2 x i32> [[Z]], i64 0
625-
; NO-SVE-NEXT: [[Z1:%.*]] = extractelement <2 x i32> [[Z]], i64 1
626-
; NO-SVE-NEXT: [[TMP7:%.*]] = mul i32 [[TMP5]], [[Z0]]
627-
; NO-SVE-NEXT: [[TMP8:%.*]] = mul i32 [[TMP6]], [[Z1]]
628-
; NO-SVE-NEXT: [[RES0:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0
629-
; NO-SVE-NEXT: [[RES1:%.*]] = insertelement <2 x i32> [[RES0]], i32 [[TMP8]], i32 1
630-
; NO-SVE-NEXT: ret <2 x i32> [[RES1]]
631-
;
632-
; SVE-LABEL: define <2 x i32> @sdiv_v2i32_const_divisor(
633-
; SVE-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]], <2 x i32> [[Z:%.*]]) #[[ATTR0]] {
634-
; SVE-NEXT: [[TMP1:%.*]] = sdiv <2 x i32> [[A]], <i32 2, i32 4>
635-
; SVE-NEXT: [[TMP2:%.*]] = add <2 x i32> [[TMP1]], [[X]]
636-
; SVE-NEXT: [[TMP3:%.*]] = sub <2 x i32> [[TMP2]], [[Y]]
637-
; SVE-NEXT: [[TMP4:%.*]] = mul <2 x i32> [[TMP3]], [[Z]]
638-
; SVE-NEXT: ret <2 x i32> [[TMP4]]
610+
; CHECK-LABEL: define <2 x i32> @sdiv_v2i32_const_divisor(
611+
; CHECK-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]], <2 x i32> [[Z:%.*]]) #[[ATTR0]] {
612+
; CHECK-NEXT: [[TMP1:%.*]] = sdiv <2 x i32> [[A]], <i32 2, i32 4>
613+
; CHECK-NEXT: [[TMP2:%.*]] = add <2 x i32> [[TMP1]], [[X]]
614+
; CHECK-NEXT: [[TMP3:%.*]] = sub <2 x i32> [[TMP2]], [[Y]]
615+
; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i32> [[TMP3]], [[Z]]
616+
; CHECK-NEXT: ret <2 x i32> [[TMP4]]
639617
;
640618
{
641619
%a0 = extractelement <2 x i32> %a, i64 0

llvm/test/Transforms/SLPVectorizer/X86/buildvector-with-reuses.ll

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,15 +10,18 @@ define <4 x double> @test(ptr %ia, ptr %ib, ptr %ic, ptr %id, ptr %ie, ptr %x) {
1010
; CHECK-NEXT: [[I4275:%.*]] = load double, ptr [[ID]], align 8
1111
; CHECK-NEXT: [[I4277:%.*]] = load double, ptr [[IE]], align 8
1212
; CHECK-NEXT: [[I4326:%.*]] = load <4 x double>, ptr [[X]], align 8
13-
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[I4326]], <4 x double> poison, <2 x i32> <i32 0, i32 poison>
14-
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[I4275]], i32 1
15-
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
16-
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x double> poison, double [[I4238]], i32 0
17-
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x double> [[TMP4]], double [[I4252]], i32 1
18-
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x double> [[TMP5]], double [[I4264]], i32 2
19-
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x double> [[TMP6]], double [[I4277]], i32 3
20-
; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <4 x double> [[TMP3]], [[TMP7]]
21-
; CHECK-NEXT: ret <4 x double> [[TMP8]]
13+
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[I4326]], <4 x double> poison, <2 x i32> zeroinitializer
14+
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[I4238]], i32 0
15+
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[I4252]], i32 1
16+
; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <2 x double> [[TMP1]], [[TMP3]]
17+
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP1]], double [[I4275]], i32 1
18+
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[I4264]], i32 0
19+
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[I4277]], i32 1
20+
; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <2 x double> [[TMP5]], [[TMP7]]
21+
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
22+
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
23+
; CHECK-NEXT: [[I44281:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
24+
; CHECK-NEXT: ret <4 x double> [[I44281]]
2225
;
2326
%i4238 = load double, ptr %ia, align 8
2427
%i4252 = load double, ptr %ib, align 8

llvm/test/Transforms/SLPVectorizer/X86/crash_clear_undefs.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16
99
; YAML-NEXT: Function: foo
1010
; YAML-NEXT: Args:
1111
; YAML-NEXT: - String: 'SLP vectorized with cost '
12-
; YAML-NEXT: - Cost: '-3'
12+
; YAML-NEXT: - Cost: '-4'
1313
; YAML-NEXT: - String: ' and with tree size '
1414
; YAML-NEXT: - TreeSize: '10'
1515
; YAML-NEXT: ...

llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll

Lines changed: 36 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -49,11 +49,24 @@ define i32 @reduce_and4(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <
4949
;
5050
; AVX512-LABEL: @reduce_and4(
5151
; AVX512-NEXT: entry:
52-
; AVX512-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
53-
; AVX512-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP0]])
54-
; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
55-
; AVX512-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP2]])
56-
; AVX512-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP1]], [[TMP3]]
52+
; AVX512-NEXT: [[VECEXT:%.*]] = extractelement <4 x i32> [[V1:%.*]], i64 0
53+
; AVX512-NEXT: [[VECEXT1:%.*]] = extractelement <4 x i32> [[V1]], i64 1
54+
; AVX512-NEXT: [[VECEXT2:%.*]] = extractelement <4 x i32> [[V1]], i64 2
55+
; AVX512-NEXT: [[VECEXT4:%.*]] = extractelement <4 x i32> [[V1]], i64 3
56+
; AVX512-NEXT: [[VECEXT7:%.*]] = extractelement <4 x i32> [[V2:%.*]], i64 0
57+
; AVX512-NEXT: [[VECEXT8:%.*]] = extractelement <4 x i32> [[V2]], i64 1
58+
; AVX512-NEXT: [[VECEXT10:%.*]] = extractelement <4 x i32> [[V2]], i64 2
59+
; AVX512-NEXT: [[VECEXT12:%.*]] = extractelement <4 x i32> [[V2]], i64 3
60+
; AVX512-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <16 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
61+
; AVX512-NEXT: [[TMP1:%.*]] = insertelement <16 x i32> [[TMP0]], i32 [[VECEXT8]], i32 8
62+
; AVX512-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> [[TMP1]], i32 [[VECEXT7]], i32 9
63+
; AVX512-NEXT: [[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 [[VECEXT10]], i32 10
64+
; AVX512-NEXT: [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 [[VECEXT12]], i32 11
65+
; AVX512-NEXT: [[TMP5:%.*]] = insertelement <16 x i32> [[TMP4]], i32 [[VECEXT1]], i32 12
66+
; AVX512-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> [[TMP5]], i32 [[VECEXT]], i32 13
67+
; AVX512-NEXT: [[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[VECEXT2]], i32 14
68+
; AVX512-NEXT: [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[VECEXT4]], i32 15
69+
; AVX512-NEXT: [[OP_RDX:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP8]])
5770
; AVX512-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]]
5871
; AVX512-NEXT: ret i32 [[OP_RDX1]]
5972
;
@@ -131,11 +144,24 @@ define i32 @reduce_and4_transpose(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i
131144
; AVX2-NEXT: ret i32 [[OP_RDX]]
132145
;
133146
; AVX512-LABEL: @reduce_and4_transpose(
134-
; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
135-
; AVX512-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]])
136-
; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
137-
; AVX512-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]])
138-
; AVX512-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP2]], [[TMP4]]
147+
; AVX512-NEXT: [[VECEXT:%.*]] = extractelement <4 x i32> [[V1:%.*]], i64 0
148+
; AVX512-NEXT: [[VECEXT1:%.*]] = extractelement <4 x i32> [[V2:%.*]], i64 0
149+
; AVX512-NEXT: [[VECEXT7:%.*]] = extractelement <4 x i32> [[V1]], i64 1
150+
; AVX512-NEXT: [[VECEXT8:%.*]] = extractelement <4 x i32> [[V2]], i64 1
151+
; AVX512-NEXT: [[VECEXT15:%.*]] = extractelement <4 x i32> [[V1]], i64 2
152+
; AVX512-NEXT: [[VECEXT16:%.*]] = extractelement <4 x i32> [[V2]], i64 2
153+
; AVX512-NEXT: [[VECEXT23:%.*]] = extractelement <4 x i32> [[V1]], i64 3
154+
; AVX512-NEXT: [[VECEXT24:%.*]] = extractelement <4 x i32> [[V2]], i64 3
155+
; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
156+
; AVX512-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> [[TMP1]], i32 [[VECEXT24]], i32 8
157+
; AVX512-NEXT: [[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 [[VECEXT16]], i32 9
158+
; AVX512-NEXT: [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 [[VECEXT8]], i32 10
159+
; AVX512-NEXT: [[TMP5:%.*]] = insertelement <16 x i32> [[TMP4]], i32 [[VECEXT1]], i32 11
160+
; AVX512-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> [[TMP5]], i32 [[VECEXT23]], i32 12
161+
; AVX512-NEXT: [[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[VECEXT15]], i32 13
162+
; AVX512-NEXT: [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[VECEXT7]], i32 14
163+
; AVX512-NEXT: [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[VECEXT]], i32 15
164+
; AVX512-NEXT: [[OP_RDX:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP9]])
139165
; AVX512-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]]
140166
; AVX512-NEXT: ret i32 [[OP_RDX1]]
141167
;

llvm/test/Transforms/SLPVectorizer/reduction-gather-non-scheduled-extracts.ll

Lines changed: 37 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,43 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
2-
; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=x86_64-sie-ps5 < %s | FileCheck %s %}
3-
; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %}
2+
; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=x86_64-sie-ps5 < %s | FileCheck %s --check-prefix=X86 %}
3+
; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefix=AARCH64 %}
44

55
define void @tes() {
6-
; CHECK-LABEL: define void @tes() {
7-
; CHECK-NEXT: entry:
8-
; CHECK-NEXT: [[TMP0:%.*]] = fcmp ole <2 x double> zeroinitializer, zeroinitializer
9-
; CHECK-NEXT: br label [[TMP1:%.*]]
10-
; CHECK: 1:
11-
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i1> zeroinitializer, <2 x i1> [[TMP0]], <4 x i32> <i32 0, i32 0, i32 0, i32 2>
12-
; CHECK-NEXT: [[TMP4:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP3]])
13-
; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 false, i1 [[TMP4]], i1 false
14-
; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 false, i1 [[OP_RDX]], i1 false
15-
; CHECK-NEXT: br i1 [[OP_RDX1]], label [[TMP6:%.*]], label [[TMP5:%.*]]
16-
; CHECK: 4:
17-
; CHECK-NEXT: ret void
18-
; CHECK: 5:
19-
; CHECK-NEXT: ret void
6+
; X86-LABEL: define void @tes() {
7+
; X86-NEXT: entry:
8+
; X86-NEXT: [[TMP0:%.*]] = fcmp ole <2 x double> zeroinitializer, zeroinitializer
9+
; X86-NEXT: br label [[TMP1:%.*]]
10+
; X86: 1:
11+
; X86-NEXT: [[TMP2:%.*]] = shufflevector <2 x i1> zeroinitializer, <2 x i1> [[TMP0]], <4 x i32> <i32 0, i32 0, i32 0, i32 2>
12+
; X86-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP2]])
13+
; X86-NEXT: [[OP_RDX:%.*]] = select i1 false, i1 [[TMP3]], i1 false
14+
; X86-NEXT: [[OP_RDX1:%.*]] = select i1 false, i1 [[OP_RDX]], i1 false
15+
; X86-NEXT: br i1 [[OP_RDX1]], label [[TMP4:%.*]], label [[TMP5:%.*]]
16+
; X86: 4:
17+
; X86-NEXT: ret void
18+
; X86: 5:
19+
; X86-NEXT: ret void
20+
;
21+
; AARCH64-LABEL: define void @tes() {
22+
; AARCH64-NEXT: entry:
23+
; AARCH64-NEXT: [[TMP0:%.*]] = extractelement <2 x i1> zeroinitializer, i64 0
24+
; AARCH64-NEXT: [[TMP1:%.*]] = extractelement <2 x i1> zeroinitializer, i64 0
25+
; AARCH64-NEXT: [[TMP2:%.*]] = fcmp ole <2 x double> zeroinitializer, zeroinitializer
26+
; AARCH64-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i64 0
27+
; AARCH64-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> zeroinitializer, i64 0
28+
; AARCH64-NEXT: br label [[TMP5:%.*]]
29+
; AARCH64: 5:
30+
; AARCH64-NEXT: [[TMP6:%.*]] = select i1 false, i1 false, i1 false
31+
; AARCH64-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i1 [[TMP0]], i1 false
32+
; AARCH64-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i1 [[TMP1]], i1 false
33+
; AARCH64-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i1 false, i1 false
34+
; AARCH64-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i1 [[TMP3]], i1 false
35+
; AARCH64-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i1 [[TMP4]], i1 false
36+
; AARCH64-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP13:%.*]]
37+
; AARCH64: 12:
38+
; AARCH64-NEXT: ret void
39+
; AARCH64: 13:
40+
; AARCH64-NEXT: ret void
2041
;
2142
entry:
2243
%0 = extractelement <2 x i1> zeroinitializer, i64 0

0 commit comments

Comments
 (0)