Skip to content

Commit a861ed4

Browse files
authored
[VPlan] Add initial loop-invariant code motion transform. (#107894)
Add initial transform to move out loop-invariant recipes. This also helps to fix a divergence between legacy and VPlan-based cost model due to legacy using ScalarEvolution::isLoopInvariant in some cases. Fixes #107501. PR: #107894
1 parent 37e5319 commit a861ed4

37 files changed

+746
-574
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2378,7 +2378,8 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
23782378
AC->registerAssumption(II);
23792379

23802380
// End if-block.
2381-
bool IfPredicateInstr = RepRecipe->getParent()->getParent()->isReplicator();
2381+
VPRegionBlock *Parent = RepRecipe->getParent()->getParent();
2382+
bool IfPredicateInstr = Parent ? Parent->isReplicator() : false;
23822383
if (IfPredicateInstr)
23832384
PredicatedInstructions.push_back(Cloned);
23842385
}

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -971,6 +971,41 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
971971
return R.getVPSingleValue()->replaceAllUsesWith(A);
972972
}
973973

974+
/// Move loop-invariant recipes out of the vector loop region in \p Plan.
975+
static void licm(VPlan &Plan) {
976+
VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
977+
VPBasicBlock *Preheader =
978+
cast<VPBasicBlock>(LoopRegion->getSinglePredecessor());
979+
980+
// Return true if we do not know how to (mechanically) hoist a given recipe
981+
// out of a loop region. Does not address legality concerns such as aliasing
982+
// or speculation safety.
983+
auto CannotHoistRecipe = [](VPRecipeBase &R) {
984+
// Allocas cannot be hoisted.
985+
auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
986+
return RepR && RepR->getOpcode() == Instruction::Alloca;
987+
};
988+
989+
// Hoist any loop invariant recipes from the vector loop region to the
990+
// preheader. Perform a shallow traversal of the vector loop region, to
991+
// exclude recipes in replicate regions.
992+
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
993+
vp_depth_first_shallow(LoopRegion->getEntry()))) {
994+
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
995+
if (CannotHoistRecipe(R))
996+
continue;
997+
// TODO: Relax checks in the future, e.g. we could also hoist reads, if
998+
// their memory location is not modified in the vector loop.
999+
if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi() ||
1000+
any_of(R.operands(), [](VPValue *Op) {
1001+
return !Op->isDefinedOutsideLoopRegions();
1002+
}))
1003+
continue;
1004+
R.moveBefore(*Preheader, Preheader->end());
1005+
}
1006+
}
1007+
}
1008+
9741009
/// Try to simplify the recipes in \p Plan.
9751010
static void simplifyRecipes(VPlan &Plan) {
9761011
ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
@@ -1123,6 +1158,7 @@ void VPlanTransforms::optimize(VPlan &Plan) {
11231158
removeRedundantInductionCasts(Plan);
11241159

11251160
simplifyRecipes(Plan);
1161+
licm(Plan);
11261162
legalizeAndOptimizeInductions(Plan);
11271163
removeDeadRecipes(Plan);
11281164

llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll

Lines changed: 75 additions & 220 deletions
Large diffs are not rendered by default.

llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ define void @sdiv_feeding_gep(ptr %dst, i32 %x, i64 %M, i64 %conv6, i64 %N) {
2626
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
2727
; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
2828
; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4
29+
; CHECK-NEXT: [[TMP18:%.*]] = sdiv i64 [[M]], [[CONV6]]
30+
; CHECK-NEXT: [[TMP19:%.*]] = sdiv i64 [[M]], [[CONV6]]
2931
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
3032
; CHECK: [[VECTOR_BODY]]:
3133
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -35,8 +37,6 @@ define void @sdiv_feeding_gep(ptr %dst, i32 %x, i64 %M, i64 %conv6, i64 %N) {
3537
; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[TMP14]], 0
3638
; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 1
3739
; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], [[TMP16]]
38-
; CHECK-NEXT: [[TMP18:%.*]] = sdiv i64 [[M]], [[CONV6]]
39-
; CHECK-NEXT: [[TMP19:%.*]] = sdiv i64 [[M]], [[CONV6]]
4040
; CHECK-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP18]] to i32
4141
; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP19]] to i32
4242
; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP18]], [[CONV61]]

llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll

Lines changed: 28 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -17,22 +17,23 @@
1717

1818
; Check that the extractvalue operands are actually free in vector code.
1919

20-
; FORCED-LABEL: vector.body: ; preds = %vector.body, %vector.ph
21-
; FORCED-NEXT: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
22-
; FORCED-NEXT: %0 = add i32 %index, 0
23-
; FORCED-NEXT: %1 = extractvalue { i64, i64 } %sv, 0
24-
; FORCED-NEXT: %broadcast.splatinsert = insertelement <2 x i64> poison, i64 %1, i64 0
20+
; FORCED: [[E1:%.+]] = extractvalue { i64, i64 } %sv, 0
21+
; FORCED-NEXT: %broadcast.splatinsert = insertelement <2 x i64> poison, i64 [[E1]], i64 0
2522
; FORCED-NEXT: %broadcast.splat = shufflevector <2 x i64> %broadcast.splatinsert, <2 x i64> poison, <2 x i32> zeroinitializer
26-
; FORCED-NEXT: %2 = extractvalue { i64, i64 } %sv, 1
27-
; FORCED-NEXT: %broadcast.splatinsert1 = insertelement <2 x i64> poison, i64 %2, i64 0
23+
; FORCED-NEXT: [[E2:%.+]] = extractvalue { i64, i64 } %sv, 1
24+
; FORCED-NEXT: %broadcast.splatinsert1 = insertelement <2 x i64> poison, i64 [[E2]], i64 0
2825
; FORCED-NEXT: %broadcast.splat2 = shufflevector <2 x i64> %broadcast.splatinsert1, <2 x i64> poison, <2 x i32> zeroinitializer
29-
; FORCED-NEXT: %3 = getelementptr i64, ptr %dst, i32 %0
30-
; FORCED-NEXT: %4 = add <2 x i64> %broadcast.splat, %broadcast.splat2
31-
; FORCED-NEXT: %5 = getelementptr i64, ptr %3, i32 0
32-
; FORCED-NEXT: store <2 x i64> %4, ptr %5, align 4
26+
27+
; FORCED-LABEL: vector.body: ; preds = %vector.body, %vector.ph
28+
; FORCED-NEXT: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
29+
; FORCED-NEXT: [[IV_0:%.]] = add i32 %index, 0
30+
; FORCED-NEXT: [[GEP:%.+]] = getelementptr i64, ptr %dst, i32 [[IV_0]]
31+
; FORCED-NEXT: [[ADD:%.+]] = add <2 x i64> %broadcast.splat, %broadcast.splat2
32+
; FORCED-NEXT: [[GEP2:%.+]] = getelementptr i64, ptr [[GEP]], i32 0
33+
; FORCED-NEXT: store <2 x i64> [[ADD]], ptr [[GEP2]], align 4
3334
; FORCED-NEXT: %index.next = add nuw i32 %index, 2
34-
; FORCED-NEXT: %6 = icmp eq i32 %index.next, 1000
35-
; FORCED-NEXT: br i1 %6, label %middle.block, label %vector.body
35+
; FORCED-NEXT: [[C:%.+]] = icmp eq i32 %index.next, 1000
36+
; FORCED-NEXT: br i1 [[C]], label %middle.block, label %vector.body
3637

3738
define void @test1(ptr %dst, {i64, i64} %sv) {
3839
entry:
@@ -66,22 +67,23 @@ declare float @powf(float, float) readnone nounwind
6667

6768
; FORCED-LABEL: define void @test_getVectorCallCost
6869

69-
; FORCED-LABEL: vector.body: ; preds = %vector.body, %vector.ph
70-
; FORCED-NEXT: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
71-
; FORCED-NEXT: %0 = add i32 %index, 0
72-
; FORCED-NEXT: %1 = extractvalue { float, float } %sv, 0
73-
; FORCED-NEXT: %broadcast.splatinsert = insertelement <2 x float> poison, float %1, i64 0
70+
; FORCED: [[E1:%.+]] = extractvalue { float, float } %sv, 0
71+
; FORCED-NEXT: %broadcast.splatinsert = insertelement <2 x float> poison, float [[E1]], i64 0
7472
; FORCED-NEXT: %broadcast.splat = shufflevector <2 x float> %broadcast.splatinsert, <2 x float> poison, <2 x i32> zeroinitializer
75-
; FORCED-NEXT: %2 = extractvalue { float, float } %sv, 1
76-
; FORCED-NEXT: %broadcast.splatinsert1 = insertelement <2 x float> poison, float %2, i64 0
73+
; FORCED-NEXT: [[E2:%.+]] = extractvalue { float, float } %sv, 1
74+
; FORCED-NEXT: %broadcast.splatinsert1 = insertelement <2 x float> poison, float [[E2]], i64 0
7775
; FORCED-NEXT: %broadcast.splat2 = shufflevector <2 x float> %broadcast.splatinsert1, <2 x float> poison, <2 x i32> zeroinitializer
78-
; FORCED-NEXT: %3 = getelementptr float, ptr %dst, i32 %0
79-
; FORCED-NEXT: %4 = call <2 x float> @llvm.pow.v2f32(<2 x float> %broadcast.splat, <2 x float> %broadcast.splat2)
80-
; FORCED-NEXT: %5 = getelementptr float, ptr %3, i32 0
81-
; FORCED-NEXT: store <2 x float> %4, ptr %5, align 4
76+
77+
; FORCED-LABEL: vector.body: ; preds = %vector.body, %vector.ph
78+
; FORCED-NEXT: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
79+
; FORCED-NEXT: [[IV0:%.+]] = add i32 %index, 0
80+
; FORCED-NEXT: [[GEP1:%.+]] = getelementptr float, ptr %dst, i32 [[IV0]]
81+
; FORCED-NEXT: [[POW:%.+]] = call <2 x float> @llvm.pow.v2f32(<2 x float> %broadcast.splat, <2 x float> %broadcast.splat2)
82+
; FORCED-NEXT: [[GEP2:%.+]] = getelementptr float, ptr [[GEP1]], i32 0
83+
; FORCED-NEXT: store <2 x float> [[POW]], ptr [[GEP2]], align 4
8284
; FORCED-NEXT: %index.next = add nuw i32 %index, 2
83-
; FORCED-NEXT: %6 = icmp eq i32 %index.next, 1000
84-
; FORCED-NEXT: br i1 %6, label %middle.block, label %vector.body
85+
; FORCED-NEXT: [[C:%.+]] = icmp eq i32 %index.next, 1000
86+
; FORCED-NEXT: br i1 [[C]], label %middle.block, label %vector.body
8587

8688
define void @test_getVectorCallCost(ptr %dst, {float, float} %sv) {
8789
entry:

0 commit comments

Comments
 (0)