Skip to content

Commit aaaa2a3

Browse files
[SLP]Support vectorization of previously vectorized scalars in split nodes
This patch removes the restriction on revectorizing previously vectorized scalars in split nodes, and moves the cost-profitability check to avoid regressions. Reviewers: hiraditya, RKSimon. Reviewed By: RKSimon. Pull Request: #134286
1 parent 01a2922 commit aaaa2a3

File tree

4 files changed

+44
-46
lines changed

4 files changed

+44
-46
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 20 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -895,6 +895,13 @@ class InstructionsState {
895895
is_contained(AddSub, getAltOpcode());
896896
}
897897

898+
/// Checks if the main instruction is a compare (ICmp or FCmp) and the alternate instruction has the same opcode.
899+
bool isCmpOp() const {
900+
return (getOpcode() == Instruction::ICmp ||
901+
getOpcode() == Instruction::FCmp) &&
902+
getAltOpcode() == getOpcode();
903+
}
904+
898905
/// Checks if the current state is valid, i.e. has non-null MainOp and AltOp
899906
bool valid() const { return MainOp && AltOp; }
900907

@@ -9277,22 +9284,23 @@ bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL,
92779284
// as alternate ops.
92789285
if (NumParts >= VL.size())
92799286
return false;
9287+
constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
9288+
InstructionCost InsertCost = ::getShuffleCost(
9289+
*TTI, TTI::SK_InsertSubvector, VecTy, {}, Kind, Op1.size(), Op2VecTy);
9290+
FixedVectorType *SubVecTy =
9291+
getWidenedType(ScalarTy, std::max(Op1.size(), Op2.size()));
9292+
InstructionCost NewShuffleCost =
9293+
::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, SubVecTy, Mask, Kind);
9294+
if (!LocalState.isCmpOp() && NumParts <= 1 &&
9295+
(Mask.empty() || InsertCost >= NewShuffleCost))
9296+
return false;
92809297
if ((LocalState.getMainOp()->isBinaryOp() &&
92819298
LocalState.getAltOp()->isBinaryOp() &&
92829299
(LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
92839300
LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
92849301
(LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
92859302
(LocalState.getMainOp()->isUnaryOp() &&
92869303
LocalState.getAltOp()->isUnaryOp())) {
9287-
constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
9288-
InstructionCost InsertCost = ::getShuffleCost(
9289-
*TTI, TTI::SK_InsertSubvector, VecTy, {}, Kind, Op1.size(), Op2VecTy);
9290-
FixedVectorType *SubVecTy =
9291-
getWidenedType(ScalarTy, std::max(Op1.size(), Op2.size()));
9292-
InstructionCost NewShuffleCost =
9293-
::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, SubVecTy, Mask, Kind);
9294-
if (NumParts <= 1 && (Mask.empty() || InsertCost >= NewShuffleCost))
9295-
return false;
92969304
InstructionCost OriginalVecOpsCost =
92979305
TTI->getArithmeticInstrCost(Opcode0, VecTy, Kind) +
92989306
TTI->getArithmeticInstrCost(Opcode1, VecTy, Kind);
@@ -9429,18 +9437,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
94299437
if (!canBuildSplitNode(VL, LocalState, Op1, Op2, ReorderIndices))
94309438
return false;
94319439

9432-
// Any value is used in split node already - just gather.
9433-
if (any_of(VL, [&](Value *V) {
9434-
return ScalarsInSplitNodes.contains(V) || isVectorized(V);
9435-
})) {
9436-
if (TryToFindDuplicates(S)) {
9437-
auto Invalid = ScheduleBundle::invalid();
9438-
newTreeEntry(VL, Invalid /*not vectorized*/, S, UserTreeIdx,
9439-
ReuseShuffleIndices);
9440-
}
9441-
return true;
9442-
}
9443-
94449440
SmallVector<Value *> NewVL(VL.size());
94459441
copy(Op1, NewVL.begin());
94469442
copy(Op2, std::next(NewVL.begin(), Op1.size()));
@@ -9616,9 +9612,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
96169612
::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, {}, Kind) +
96179613
::getScalarizationOverhead(*TTI, ScalarTy, VecTy, Extracted,
96189614
/*Insert=*/false, /*Extract=*/true, Kind);
9619-
InstructionCost ScalarizeCostEstimate =
9620-
::getScalarizationOverhead(*TTI, ScalarTy, VecTy, Vectorized,
9621-
/*Insert=*/true, /*Extract=*/false, Kind);
9615+
InstructionCost ScalarizeCostEstimate = ::getScalarizationOverhead(
9616+
*TTI, ScalarTy, VecTy, Vectorized,
9617+
/*Insert=*/true, /*Extract=*/false, Kind, /*ForPoisonSrc=*/false);
96229618
PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate;
96239619
}
96249620
if (PreferScalarize) {

llvm/test/Transforms/SLPVectorizer/X86/splat-score-adjustment.ll

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,18 +7,22 @@ define i32 @a() {
77
; CHECK-NEXT: br label %[[BB1:.*]]
88
; CHECK: [[BB1]]:
99
; CHECK-NEXT: [[TMP2:%.*]] = phi <4 x i8> [ zeroinitializer, [[TMP0:%.*]] ], [ [[TMP6:%.*]], %[[BB1]] ]
10-
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
10+
; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x i8> [ zeroinitializer, [[TMP0]] ], [ [[TMP17:%.*]], %[[BB1]] ]
11+
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
1112
; CHECK-NEXT: [[TMP6]] = load <4 x i8>, ptr null, align 4
12-
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
13-
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
13+
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 1>
14+
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i8> [[TMP3]], <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
15+
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> [[TMP7]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
1416
; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i8> [[TMP6]], [[TMP8]]
15-
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
16-
; CHECK-NEXT: [[TMP18:%.*]] = call <8 x i8> @llvm.vector.insert.v8i8.v4i8(<8 x i8> [[TMP10]], <4 x i8> [[TMP6]], i64 4)
17-
; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <8 x i8> [[TMP5]], <8 x i8> [[TMP18]], <8 x i32> <i32 1, i32 2, i32 3, i32 12, i32 3, i32 12, i32 13, i32 14>
17+
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <8 x i32> <i32 poison, i32 0, i32 poison, i32 1, i32 poison, i32 2, i32 poison, i32 3>
18+
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <8 x i32> <i32 0, i32 poison, i32 1, i32 poison, i32 2, i32 poison, i32 3, i32 poison>
19+
; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <8 x i8> [[TMP10]], <8 x i8> [[TMP11]], <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
20+
; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
21+
; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <8 x i8> [[TMP13]], <8 x i8> [[TMP18]], <8 x i32> <i32 1, i32 3, i32 2, i32 9, i32 3, i32 11, i32 9, i32 13>
1822
; CHECK-NEXT: [[TMP22:%.*]] = xor <8 x i8> [[TMP18]], [[TMP21]]
1923
; CHECK-NEXT: [[TMP23:%.*]] = xor <8 x i8> [[TMP22]], [[TMP5]]
20-
; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x i8> [[TMP23]], <8 x i8> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
21-
; CHECK-NEXT: store <8 x i8> [[TMP13]], ptr null, align 4
24+
; CHECK-NEXT: store <8 x i8> [[TMP23]], ptr null, align 4
25+
; CHECK-NEXT: [[TMP17]] = shufflevector <4 x i8> [[TMP6]], <4 x i8> poison, <2 x i32> <i32 2, i32 3>
2226
; CHECK-NEXT: br label %[[BB1]]
2327
;
2428
br label %1

llvm/test/Transforms/SLPVectorizer/X86/split-node-last-inst-vectorized.ll

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,20 +6,19 @@ define void @test(ptr %0, <8 x i8> %1) {
66
; CHECK-SAME: ptr [[TMP0:%.*]], <8 x i8> [[TMP1:%.*]]) {
77
; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[TMP0]], align 2
88
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP0]], i64 13436
9+
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP0]], i64 13444
10+
; CHECK-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP8]], align 4
11+
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP0]], i64 13544
912
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP0]], i64 13536
10-
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP0]], i64 13437
1113
; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i8>, ptr [[TMP4]], align 4
12-
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 4, i32 5, i32 0, i32 7>
13-
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i8> [[TMP7]], i8 [[TMP3]], i32 1
14+
; CHECK-NEXT: [[TMP12:%.*]] = insertelement <8 x i8> poison, i8 [[TMP6]], i32 0
15+
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i8> [[TMP12]], i8 [[TMP3]], i32 1
1416
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i8> [[TMP9]], <8 x i8> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
15-
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i8> [[TMP8]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
16-
; CHECK-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> [[TMP11]], <8 x i8> [[TMP10]], i64 8)
17-
; CHECK-NEXT: [[TMP13:%.*]] = load <8 x i8>, ptr [[TMP6]], align 1
18-
; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <8 x i8> [[TMP13]], <8 x i8> poison, <8 x i32> <i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6>
19-
; CHECK-NEXT: [[TMP15:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[TMP7]], i64 0)
20-
; CHECK-NEXT: [[TMP16:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> [[TMP15]], <8 x i8> [[TMP14]], i64 8)
21-
; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i8> [[TMP16]], [[TMP12]]
22-
; CHECK-NEXT: store <16 x i8> [[TMP17]], ptr [[TMP5]], align 4
17+
; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i8> [[TMP10]], [[TMP7]]
18+
; CHECK-NEXT: store <8 x i8> [[TMP13]], ptr [[TMP11]], align 4
19+
; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 4, i32 5, i32 0, i32 7>
20+
; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i8> [[TMP7]], [[TMP14]]
21+
; CHECK-NEXT: store <8 x i8> [[TMP15]], ptr [[TMP5]], align 4
2322
; CHECK-NEXT: ret void
2423
;
2524
%3 = load i8, ptr %0, align 2

llvm/test/Transforms/SLPVectorizer/X86/split-node-no-reorder-copy.ll

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,8 @@ define i1 @test(ptr %0, ptr %1, <2 x float> %2, <2 x float> %3, <2 x float> %4)
1616
; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x float> [[TMP14]], float [[TMP9]], i32 7
1717
; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <8 x float> [[TMP13]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
1818
; CHECK-NEXT: [[TMP17:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v8f32(<16 x float> [[TMP16]], <8 x float> [[TMP15]], i64 8)
19-
; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <8 x float> [[TMP14]], <8 x float> [[TMP12]], <16 x i32> <i32 8, i32 9, i32 9, i32 9, i32 9, i32 9, i32 14, i32 14, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 poison>
20-
; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x float> [[TMP18]], float [[TMP9]], i32 15
21-
; CHECK-NEXT: [[TMP20:%.*]] = fmul <16 x float> [[TMP17]], [[TMP19]]
19+
; CHECK-NEXT: [[TMP18:%.*]] = call <16 x float> @llvm.vector.insert.v16f32.v8f32(<16 x float> [[TMP16]], <8 x float> [[TMP15]], i64 8)
20+
; CHECK-NEXT: [[TMP20:%.*]] = fmul <16 x float> [[TMP18]], [[TMP17]]
2221
; CHECK-NEXT: [[TMP21:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP20]])
2322
; CHECK-NEXT: [[TMP22:%.*]] = call float @foo(float [[TMP21]])
2423
; CHECK-NEXT: ret i1 false

0 commit comments

Comments
 (0)