[VPlan] Add VPValue for VF, use it for VPWidenIntOrFpInductionRecipe. #95305

fhahn · 2024-06-12T20:39:14Z

Similar to VFxUF, also add a VF VPValue to VPlan and use it to get the runtime VF in VPWidenIntOrFpInductionRecipe. Code for VF is only generated if there are users of VF, to avoid unnecessary test changes.

Note: some tests still need updating, will do once we converge on a final version of the patch.

llvmbot · 2024-06-12T20:39:43Z

@llvm/pr-subscribers-backend-powerpc

@llvm/pr-subscribers-llvm-transforms

Author: Florian Hahn (fhahn)

Changes

Similar to VFxUF, also add a VF VPValue to VPlan and use it to get the runtime VF in VPWidenIntOrFpInductionRecipe. Code for VF is only generated if there are users of VF, to avoid unnecessary test changes.

Note: some tests still need updating, will do once we converge on a final version of the patch.

Patch is 32.69 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/95305.diff

12 Files Affected:

(modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+5-2)
(modified) llvm/lib/Transforms/Vectorize/VPlan.cpp (+16-2)
(modified) llvm/lib/Transforms/Vectorize/VPlan.h (+10-4)
(modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+2-2)
(modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+2-1)
(modified) llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_prefer_scalable.ll (+1-3)
(modified) llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll (+4-2)
(modified) llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll (+13-29)
(modified) llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll (+3-9)
(modified) llvm/test/Transforms/LoopVectorize/RISCV/pr87378-vpinstruction-or-drop-poison-generating-flags.ll (+1-3)
(modified) llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll (+2-6)
(modified) llvm/test/Transforms/LoopVectorize/scalable-inductions.ll (+4-6)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 7edf83a76faaa..07191235009ce 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8069,10 +8069,12 @@ createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc,
   VPValue *Step =
       vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
   if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
-    return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI);
+    return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, Plan.getVF(),
+                                             IndDesc, TruncI);
   }
   assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
-  return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc);
+  return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, Plan.getVF(),
+                                           IndDesc);
 }
 
 VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
@@ -8487,6 +8489,7 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
   VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
   Header->insert(CanonicalIVPHI, Header->begin());
 
+  VPBuilder PhBuilder(cast<VPBasicBlock>(TopRegion->getSinglePredecessor()));
   VPBuilder Builder(TopRegion->getExitingBasicBlock());
   // Add a VPInstruction to increment the scalar canonical IV by VF * UF.
   auto *CanonicalIVIncrement = Builder.createOverflowingOp(
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index f17be451e6846..5806a5141f9a2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -814,8 +814,21 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
 
   IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
   // FIXME: Model VF * UF computation completely in VPlan.
-  VFxUF.setUnderlyingValue(
-      createStepForVF(Builder, TripCountV->getType(), State.VF, State.UF));
+  Value *RuntimeVF = nullptr;
+  if (VF.getNumUsers()) {
+    RuntimeVF = createStepForVF(Builder, TripCountV->getType(), State.VF, 1);
+    VF.setUnderlyingValue(RuntimeVF);
+  }
+  if (RuntimeVF) {
+    VFxUF.setUnderlyingValue(
+        State.UF > 1 ? Builder.CreateMul(
+                           VF.getLiveInIRValue(),
+                           ConstantInt::get(TripCountV->getType(), State.UF))
+                     : VF.getLiveInIRValue());
+  } else {
+    VFxUF.setUnderlyingValue(
+        createStepForVF(Builder, TripCountV->getType(), State.VF, State.UF));
+  }
 
   // When vectorizing the epilogue loop, the canonical induction start value
   // needs to be changed from zero to the value after the main vector loop.
@@ -1067,6 +1080,7 @@ VPlan *VPlan::duplicate() {
   }
   Old2NewVPValues[&VectorTripCount] = &NewPlan->VectorTripCount;
   Old2NewVPValues[&VFxUF] = &NewPlan->VFxUF;
+  Old2NewVPValues[&VF] = &NewPlan->VF;
   if (BackedgeTakenCount) {
     NewPlan->BackedgeTakenCount = new VPValue();
     Old2NewVPValues[BackedgeTakenCount] = NewPlan->BackedgeTakenCount;
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 5bb88e4a57dc3..566a5244c5abc 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1705,25 +1705,27 @@ class VPWidenIntOrFpInductionRecipe : public VPHeaderPHIRecipe {
 
 public:
   VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step,
-                                const InductionDescriptor &IndDesc)
+                                VPValue *VF, const InductionDescriptor &IndDesc)
       : VPHeaderPHIRecipe(VPDef::VPWidenIntOrFpInductionSC, IV, Start), IV(IV),
         Trunc(nullptr), IndDesc(IndDesc) {
     addOperand(Step);
+    addOperand(VF);
   }
 
   VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step,
-                                const InductionDescriptor &IndDesc,
+                                VPValue *VF, const InductionDescriptor &IndDesc,
                                 TruncInst *Trunc)
       : VPHeaderPHIRecipe(VPDef::VPWidenIntOrFpInductionSC, Trunc, Start),
         IV(IV), Trunc(Trunc), IndDesc(IndDesc) {
     addOperand(Step);
+    addOperand(VF);
   }
 
   ~VPWidenIntOrFpInductionRecipe() override = default;
 
   VPWidenIntOrFpInductionRecipe *clone() override {
-    return new VPWidenIntOrFpInductionRecipe(IV, getStartValue(),
-                                             getStepValue(), IndDesc, Trunc);
+    return new VPWidenIntOrFpInductionRecipe(
+        IV, getStartValue(), getStepValue(), getOperand(2), IndDesc, Trunc);
   }
 
   VP_CLASSOF_IMPL(VPDef::VPWidenIntOrFpInductionSC)
@@ -3142,6 +3144,8 @@ class VPlan {
   /// Represents the vector trip count.
   VPValue VectorTripCount;
 
+  VPValue VF;
+
   /// Represents the loop-invariant VF * UF of the vector loop region.
   VPValue VFxUF;
 
@@ -3233,6 +3237,8 @@ class VPlan {
   /// Returns VF * UF of the vector loop region.
   VPValue &getVFxUF() { return VFxUF; }
 
+  VPValue *getVF() { return &VF; };
+
   void addVF(ElementCount VF) { VFs.insert(VF); }
 
   void setVF(ElementCount VF) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 7a482455473e4..7165d77c1033b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1217,11 +1217,11 @@ void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
   // Multiply the vectorization factor by the step using integer or
   // floating-point arithmetic as appropriate.
   Type *StepType = Step->getType();
-  Value *RuntimeVF;
+  Value *RuntimeVF = State.get(getOperand(2), {0, 0});
   if (Step->getType()->isFloatingPointTy())
     RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
   else
-    RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
+    RuntimeVF = Builder.CreateZExtOrTrunc(RuntimeVF, StepType);
   Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
 
   // Create a vector splat to use in the induction update.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 8ec67eb2f54bd..c2b8502af058a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -55,7 +55,8 @@ void VPlanTransforms::VPInstructionsToVPRecipes(
         VPValue *Start = Plan->getOrAddLiveIn(II->getStartValue());
         VPValue *Step =
             vputils::getOrCreateVPValueForSCEVExpr(*Plan, II->getStep(), SE);
-        NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, *II);
+        NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start, Step,
+                                                      Plan->getVF(), *II);
       } else {
         assert(isa<VPInstruction>(&Ingredient) &&
                "only VPInstructions expected here");
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_prefer_scalable.ll b/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_prefer_scalable.ll
index 59a1e108b92f0..0d1298b909976 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_prefer_scalable.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_prefer_scalable.ll
@@ -24,9 +24,7 @@ define void @foo() {
 ; CHECK-NEXT:    [[TMP5:%.*]] = add <vscale x 4 x i64> [[TMP4]], zeroinitializer
 ; CHECK-NEXT:    [[TMP6:%.*]] = mul <vscale x 4 x i64> [[TMP5]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP6]]
-; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
-; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 1, [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 1, [[TMP19]]
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP9]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
index 3217f508f0adc..5d131e9c94727 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll
@@ -13,7 +13,8 @@ define void @induction_i7(ptr %dst) #0 {
 ; CHECK:       vector.ph:
 ; CHECK:         %ind.end = trunc i64 %n.vec to i7
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-NEXT:    [[TMP40:%.*]] = mul i64 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP40]], 2
 ; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i8> @llvm.experimental.stepvector.nxv2i8()
 ; CHECK-NEXT:    [[TMP7:%.*]] = trunc <vscale x 2 x i8> [[TMP6]] to <vscale x 2 x i7>
 ; CHECK-NEXT:    [[TMP8:%.*]] = add <vscale x 2 x i7> [[TMP7]], zeroinitializer
@@ -73,7 +74,8 @@ define void @induction_i3_zext(ptr %dst) #0 {
 ; CHECK:       vector.ph:
 ; CHECK:         %ind.end = trunc i64 %n.vec to i3
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-NEXT:    [[TMP40:%.*]] = mul i64 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP40]], 2
 ; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i8> @llvm.experimental.stepvector.nxv2i8()
 ; CHECK-NEXT:    [[TMP7:%.*]] = trunc <vscale x 2 x i8> [[TMP6]] to <vscale x 2 x i3>
 ; CHECK-NEXT:    [[TMP8:%.*]] = add <vscale x 2 x i3> [[TMP7]], zeroinitializer
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
index 3a25ffe26cc03..501243672ea4f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
@@ -110,8 +110,7 @@ define void @test_array_load2_i16_store2(i32 %C, i32 %D) #1 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP3:%.*]] = shl <vscale x 4 x i64> [[TMP2]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3
+; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP0]], 3
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP5]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[C:%.*]], i64 0
@@ -201,8 +200,7 @@ define void @test_array_load2_store2_i16(i32 noundef %C, i32 noundef %D) #1 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP3:%.*]] = shl <vscale x 4 x i64> [[TMP2]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3
+; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP0]], 3
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP5]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[C:%.*]], i64 0
@@ -280,9 +278,7 @@ define i32 @test_struct_load6(ptr %S) #1 {
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2
-; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP4]], i64 0
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP1]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
@@ -385,8 +381,8 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n
 ; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = sub <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1023, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), [[TMP2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i32 [[TMP3]], -4
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
+; CHECK-NEXT:    [[DOTNEG:%.*]] = sub nsw i32 0, [[TMP3]]
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[DOTNEG]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -644,8 +640,7 @@ define void @load_gap_reverse(ptr noalias nocapture readonly %P1, ptr noalias no
 ; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = sub <vscale x 4 x i64> shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1023, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer), [[TMP2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP3]], -4
+; CHECK-NEXT:    [[DOTNEG:%.*]] = sub nsw i64 0, [[TMP1]]
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[DOTNEG]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[X:%.*]], i64 0
@@ -895,9 +890,7 @@ define void @PR27626_0(ptr %p, i32 %z, i64 %n) #1 {
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 2
 ; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
-; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP11:%.*]] = shl nuw nsw i64 [[TMP10]], 2
-; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP11]], i64 0
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP8]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[Z:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
@@ -982,9 +975,7 @@ define i32 @PR27626_1(ptr %p, i64 %n) #1 {
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 2
 ; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
-; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP11:%.*]] = shl nuw nsw i64 [[TMP10]], 2
-; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP11]], i64 0
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP8]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
@@ -1077,9 +1068,7 @@ define void @PR27626_2(ptr %p, i64 %n, i32 %z) #1 {
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 2
 ; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
-; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP11:%.*]] = shl nuw nsw i64 [[TMP10]], 2
-; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP11]], i64 0
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP8]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[Z:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
@@ -1167,9 +1156,7 @@ define i32 @PR27626_3(ptr %p, i64 %n, i32 %z) #1 {
 ; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 2
 ; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
-; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP11:%.*]] = shl nuw nsw i64 [[TMP10]], 2
-; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP11]], i64 0
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP8]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
@@ -1271,8 +1258,7 @@ define void @PR27626_4(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) #1 {
 ; CHECK-NEXT:    [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP9:%.*]] = shl <vscale x 4 x i64> [[TMP8]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP11:%.*]] = shl nuw nsw i64 [[TMP10]], 3
+; CHECK-NEXT:    [[TMP11:%.*]] = shl nuw nsw i64 [[TMP6]], 3
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP11]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[X:%.*]], i64 0
@@ -1368,8 +1354,7 @@ define void @PR27626_5(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) #1 {
 ; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP10:%.*]] = shl <vscale x 4 x i64> [[TMP9]], shufflevector (<vscale x 4 x i64> insertelement (<vscal...
[truncated]

david-arm

Overall this patch looks like a win by generally reducing the lines of emitted IR! I just have a few comments on the implementation and some observations on the tests.

david-arm · 2024-06-25T12:13:23Z

llvm/lib/Transforms/Vectorize/VPlan.cpp

+    VF.setUnderlyingValue(RuntimeVF);
+  }
+  if (RuntimeVF) {
+    VFxUF.setUnderlyingValue(


Given RuntimeVF is only non-null if VF.getNumUsers() != 0 wouldn't it be neater to simply fold this into the if (VF.getNumUsers()) { block? i.e.

if (VF.getNumUsers()) { RuntimeVF = createStepForVF(Builder, TripCountV->getType(), State.VF, 1); VF.setUnderlyingValue(RuntimeVF); VFxUF.setUnderlyingValue( State.UF > 1 ? Builder.CreateMul( VF.getLiveInIRValue(), ConstantInt::get(TripCountV->getType(), State.UF)) : VF.getLiveInIRValue()); } else { VFxUF.setUnderlyingValue( createStepForVF(Builder, TripCountV->getType(), State.VF, State.UF)); }

Adjusted, thanks!

david-arm · 2024-06-25T12:14:52Z

llvm/lib/Transforms/Vectorize/VPlan.cpp

@@ -1067,6 +1080,7 @@ VPlan *VPlan::duplicate() {
  }
  Old2NewVPValues[&VectorTripCount] = &NewPlan->VectorTripCount;
  Old2NewVPValues[&VFxUF] = &NewPlan->VFxUF;
+  Old2NewVPValues[&VF] = &NewPlan->VF;


I must be honest it's not obvious to me why we need a specialised version for the UF=1 case. Why can't we use VFxUF always given VFx1 is a valid use case? It seems to add a bit of extra complexity. I understand in a loop we may want to calculate both a runtime VF (for a second part or something) and a runtime VF x UF (for a canonical induction variable, etc), but if we've filled out Old2NewVPValues[&VFxUF] with both VFx1 and VFxUF we can just query accordingly, right?

but if we've filled out Old2NewVPValues[&VFxUF] with both VFx1 and VFxUF we can just query accordingly, right?

I am not sure what filling them out accordingly means here, perhaps using a pair instead of 2 separate fields?

There are different places that need both VF and VFxUF and to serve them we would need different values I think, but I might be missing something from your suggestion?

david-arm · 2024-06-25T12:23:14Z

llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions-unusual-types.ll

@@ -73,7 +74,8 @@ define void @induction_i3_zext(ptr %dst) #0 {
 ; CHECK:       vector.ph:
 ; CHECK:         %ind.end = trunc i64 %n.vec to i3
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-NEXT:    [[TMP40:%.*]] = mul i64 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP40]], 2


This is interesting. I guess I was expecting that since this patch only creates a runtime VF on demand that total lines of IR would only decrease, rather than increase which is happening here.

The reason this was showing up as a net increase was that the test case originally didn't check all instructions in vector.ph; now that it does, we save a vscale & mul but need a trunc, so it is neutral overall in this case.

david-arm · 2024-06-25T12:31:45Z

llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll

@@ -385,8 +381,8 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n
 ; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = sub <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1023, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), [[TMP2]]
-; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i32 [[TMP3]], -4
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32


In practice it probably doesn't matter since vscale is unlikely to ever be that large, but the code after is not the same as before. Suppose vscale = 0xFFFFFFFF, then previously we'd be doing

0xFFFFFFFF (-1) * -4 -> +4

whereas now it would be

0xFFFFFFFF << 2 -> 0x3FFFFFFFC
trunc(0x3FFFFFFFC) to i32 -> 0xFFFFFFFC
0 - 0xFFFFFFFC -> poison due to nsw?

Perhaps the previous code was simply incorrect?

IIUC the fact that VF * UF cannot wrap is a fundamental assumption, but I am not sure if that's spelled out anywhere (there's no upper bound of vscale at the LLVM IR level it seems).

Using the assumption, if vscale * 4 doesn't wrap in i32 (the original code), then it shouldn't wrap in a wider type and the truncate should return the same result

I think what you say makes sense. It's just I wasn't sure if the VF*UF not wrapping assumption was in the unsigned or signed sense. I don't think I explained it well in my original comment though to be fair. :) Another example is 0x3FFFFFFF * 4, which would wrap in a signed sense and imply poison in the original code. To be honest, I'm not sure what 'trunc nuw nsw' actually means in practice, for example what does it mean to truncate (i64 0xFFFFFFFC) -> (i32 0xFFFFFFFC)? You could argue that a positive value as an i64 has now wrapped to become a negative i32 value, so probably I don't understand how nsw for trunc works!

Anyway, I'm happy with your explanation, but perhaps it would be good to spell out in a comment somewhere (not necessarily this patch) that VF * UF is not expected to wrap in either an unsigned or signed sense.

IIUC trunc nuw means the truncated bits must be zero (the result is poison if any of them is non-zero); if it is nsw the top bit before and after truncation must be the same, so trunc nuw nsw would be poison if the operand is negative or negative after truncation.

Match all instructions in vector.ph in sve-inductions-unusual-types.ll. This should help to better show the impact of #95305.

Similar to VFxUF, also add a VF VPValue to VPlan and use it to get the runtime VF in VPWidenIntOrFpInductionRecipe. Code for VF is only generated if there are users of VF, to avoid unnecessary test changes. Note: some tests still need updating, will do once we converge on a final version of the patch.

ayalz · 2024-07-18T08:12:45Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

@@ -8667,6 +8669,7 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
  VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
  Header->insert(CanonicalIVPHI, Header->begin());

+  VPBuilder PhBuilder(cast<VPBasicBlock>(TopRegion->getSinglePredecessor()));


Suggested change

VPBuilder PhBuilder(cast<VPBasicBlock>(TopRegion->getSinglePredecessor()));

unused?

Removed, thanks!

ayalz · 2024-07-18T08:20:32Z

llvm/lib/Transforms/Vectorize/VPlan.cpp

+  Value *RuntimeVF = nullptr;
+  if (VF.getNumUsers()) {
+    RuntimeVF = createStepForVF(Builder, TripCountV->getType(), State.VF, 1);


Suggested change

Value *RuntimeVF = nullptr;

if (VF.getNumUsers()) {

RuntimeVF = createStepForVF(Builder, TripCountV->getType(), State.VF, 1);

if (VF.getNumUsers()) {

Value *RuntimeVF = getRuntimeVF(Builder, TripCountV->getType(), State.VF);

could setting VF and/or VFxUF be done by optimizeForVFAndUF() instead, i.e., as soon as they are fixed for a VPlan?

The term RuntimeVF (here and below) may be confusing, as it relates to the Static (Fixed or Scalable) VF, as indicated in the Type of vector Values, rather than the Dynamic EVL.

Adjusted, thanks!

could setting VF and/or VFxUF be done by optimizeForVFAndUF() instead, i.e., as soon as they are fixed for a VPlan?
Could do, but may not be ideal to rely on an optimization to set them?

could setting VF and/or VFxUF be done by optimizeForVFAndUF() instead, i.e., as soon as they are fixed for a VPlan?

Could do, but may not be ideal to rely on an optimization to set them?

How about renaming the optimizeForVFAndUF() VPlanTransform to setVFAndUF(), or setVF() and setUF() VPlanTransforms, where fixing a constant value triggers its folding optimizations?

Sounds good, can update to setVFAndUF as a start separately?

ayalz · 2024-07-18T08:26:21Z

llvm/lib/Transforms/Vectorize/VPlan.cpp

+        State.UF > 1 ? Builder.CreateMul(
+                           VF.getLiveInIRValue(),
+                           ConstantInt::get(TripCountV->getType(), State.UF))
+                     : VF.getLiveInIRValue());


VFxUF can always be set via createStepForVF(), regardless of UF=1, UF>1, VF having users or not. But we want to set it directly instead, using VF - when the latter is built with getRuntimeVF()? Maybe related to @david-arm's comment below about "... why we need a specialised version for the UF=1 case".

Suggested change

State.UF > 1 ? Builder.CreateMul(

VF.getLiveInIRValue(),

ConstantInt::get(TripCountV->getType(), State.UF))

: VF.getLiveInIRValue());

State.UF > 1 ? Builder.CreateMul(

RuntimeVF,

ConstantInt::get(TripCountV->getType(), State.UF))

: RuntimeVF);

But we want to set it directly instead, using VF - when the latter is built with getRuntimeVF()?
Yep, to reduce calls to @vscale.

Updated to use RuntimeVF variable, thanks!

ayalz · 2024-07-18T08:29:39Z

llvm/lib/Transforms/Vectorize/VPlan.cpp

@@ -928,8 +928,19 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,

  IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
  // FIXME: Model VF * UF computation completely in VPlan.


Would it be possible to handle this FIXME now that VF is assigned a VPValue?

Yes, would either require introducing a UF placeholder or introducing during explicit interleaving, either way probably better as follow-up?

ok, so FIXME remains until a symbolic UF VPValue placeholder is introduced, as follow-up, along with a VPInstruction multiplying it with VF (introduced here), possibly subject to constant folding?

ayalz · 2024-07-18T08:39:35Z

llvm/lib/Transforms/Vectorize/VPlan.h

@@ -3282,6 +3287,8 @@ class VPlan {
  /// Represents the vector trip count.
  VPValue VectorTripCount;

+  VPValue VF;


Suggested change

VPValue VF;

/// Represents the vectorization factor of the loop.

VPValue VF;

(belongs to the loop Region rather than VPlan itself)

Added thanks!

VF may also be referenced outside the loop region, so probably should be defined at the top-level and used by the region?

(This is a follow-up thought, hence originally in brackets)
Regions (and blocks in general) currently model the HCFG only, leaving the def-use graph of values to recipes. A loop region has a canonical IV recipe which typically uses VFxUF as the canonical step controlling the loop. The latter could be a Mul recipe whose operands provide VF and UF, and a loop region could provide getVF() and getUF() to ease their retrieval?

Agreed, I think ideally the loop region would be a user of both VF, UF (and possibly VF x UF).

ayalz · 2024-07-18T08:41:16Z

llvm/lib/Transforms/Vectorize/VPlan.h

@@ -3380,6 +3387,8 @@ class VPlan {
  /// Returns VF * UF of the vector loop region.
  VPValue &getVFxUF() { return VFxUF; }

+  VPValue *getVF() { return &VF; };


Suggested change

VPValue *getVF() { return &VF; };

/// Returns the VF of the vector loop region.

VPValue *getVF() { return &VF; };

Updated, thanks!

Summary: Match all instructions in vector.ph in sve-inductions-unusual-types.ll. This should help to better show the impact of #95305. Test Plan: Reviewers: Subscribers: Tasks: Tags: Differential Revision: https://phabricator.intern.facebook.com/D60251659

github-actions · 2024-08-15T11:52:24Z

✅ With the latest revision this PR passed the C/C++ code formatter.

fhahn · 2024-08-22T11:50:13Z

ping :)

fhahn · 2024-09-03T21:07:42Z

ping :)

fhahn · 2024-09-05T18:46:00Z

Adjusted the order of getVF as suggested by @ayalz in #95842

ayalz · 2024-09-05T23:35:38Z

llvm/lib/Transforms/Vectorize/VPlan.cpp

+    Value *RuntimeVF =
+        createStepForVF(Builder, TripCountV->getType(), State.VF, 1);


Suggested change

Value *RuntimeVF =

createStepForVF(Builder, TripCountV->getType(), State.VF, 1);

Value *RuntimeVF = getRuntimeVF(Builder, TripCountV->getType(), State.VF);

While we're here, getRuntimeVF() should be createRuntimeVF()?

Updated, planning to rename separately.

Updated, planning to rename separately.

Thanks! Also noted earlier that the "Runtime" part is inaccurate/confusing:

The term RuntimeVF (here and below) may be confusing, as it relates to the Static (Fixed or Scalable) VF, as indicated in the Type of vector Values, rather than the Dynamic EVL.

ayalz · 2024-09-05T23:40:24Z

llvm/lib/Transforms/Vectorize/VPlan.cpp

+            ? Builder.CreateMul(
+                  RuntimeVF, ConstantInt::get(TripCountV->getType(), State.UF))


Does this work for scalable VF or expected to handle only fixed VF?

It works for both scalable and fixed vectors: in the latter case the multiplies of constants will be folded; in the former we have a multiply instruction of RuntimeVF (which in turn is (vscale * VF)).

ok, just confirming we're doing an explicit Mul here, compared to createStepForVF() doing multiplyCoefficientBy() below. Perhaps related to (vscale * VF) * UF vs. vscale * (VF * UF)?

Yep, we are doing the explicit multiply here, while below VF * UF are folded.

ayalz · 2024-09-05T23:41:02Z

llvm/lib/Transforms/Vectorize/VPlan.cpp

  Old2NewVPValues[&VFxUF] = &NewPlan->VFxUF;
+  Old2NewVPValues[&VF] = &NewPlan->VF;


Suggested change

Old2NewVPValues[&VFxUF] = &NewPlan->VFxUF;

Old2NewVPValues[&VF] = &NewPlan->VF;

Old2NewVPValues[&VF] = &NewPlan->VF;

Old2NewVPValues[&VFxUF] = &NewPlan->VFxUF;

Updated, thanks!

ayalz · 2024-09-05T23:42:22Z

llvm/lib/Transforms/Vectorize/VPlan.h

-    return new VPWidenIntOrFpInductionRecipe(IV, getStartValue(),
-                                             getStepValue(), IndDesc, Trunc);
+    return new VPWidenIntOrFpInductionRecipe(
+        IV, getStartValue(), getStepValue(), getOperand(2), IndDesc, Trunc);


Suggested change

IV, getStartValue(), getStepValue(), getOperand(2), IndDesc, Trunc);

IV, getStartValue(), getStepValue(), getVFValue(), IndDesc, Trunc);

Done, thanks!

ayalz · 2024-09-05T23:45:01Z

llvm/lib/Transforms/Vectorize/VPlan.h

@@ -3408,6 +3416,9 @@ class VPlan {
  /// The vector trip count.
  VPValue &getVectorTripCount() { return VectorTripCount; }

+  /// Returns the VF of the vector loop region.


Why a pointer rather than a reference, as in VFxUF? Null is not used/returned.

Adjusted to return by reference, thanks!

ayalz · 2024-09-06T00:00:25Z

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

@@ -8189,10 +8189,12 @@ createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc,
  VPValue *Step =
      vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
  if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
-    return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI);
+    return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, Plan.getVF(),


VPWidenIntOrFpInductionRecipe could retrieve VF by looking up Plan.getVF() on demand rather than recording it as an operand, but the latter helps in checking if VF has users, i.e., if any VPWidenIntOrFpInductionRecipe exists?

Surely VF is needed to vectorize any loop, including ones free of VPWidenIntOrFpInductionRecipes. Does it need to be cached somehow, to prevent regeneration?

VPWidenIntOrFpInductionRecipe could retrieve VF by looking up Plan.getVF() on demand rather than recording it as an operand, but the latter helps in checking if VF has users, i.e., if any VPWidenIntOrFpInductionRecipe exists?

Exactly, this is used to check whether to generate it or not. VFxUF is similarly added as operand to the VPInstruction to increment the canonical IV.

Surely VF is needed to vectorize any loop, including ones free of VPWidenIntOrFpInductionRecipes. Does it need to be cached somehow, to prevent regeneration?
There are multiple other places that currently generate runtime VF on demand. Adding it as operand here and generating on-demand only is mostly to gradually convert all users.

We could create VF unconditionally, but then we would have to update all tests with scalable vectors to split up the VFxUF computation to ((vscale * VF) * UF) instead of (vscale * (VF * UF)), even if vscale * VF is only used in the multiply by UF.

To limit this we could try to fold it back as post-codegen cleanup. Or update all tests, happy to go either way (or leave as is in the current patch for now)

VPWidenIntOrFpInductionRecipe could retrieve VF by looking up Plan.getVF() on demand rather than recording it as an operand, but the latter helps in checking if VF has users, i.e., if any VPWidenIntOrFpInductionRecipe exists?

Exactly, this is used to check whether to generate it or not. VFxUF is similarly added as operand to the VPInstruction to increment the canonical IV.

Agree to model usage and dependence of values directly via explicit operands, rather than by retrieving them from plan (or region).

If/when VFxUF becomes a Mul VPInstruction which uses VF (and UF), will the check for no VF users change to check if VF is used only by this Mul?

Could this folding be done by a subsequent VPlan2VPlan pass? Would indeed be good to reduce amount of test changes...

ayalz · 2024-09-06T09:31:55Z

llvm/lib/Transforms/Vectorize/VPlan.cpp

+                  RuntimeVF, ConstantInt::get(TripCountV->getType(), State.UF))
+            : RuntimeVF);
+  } else {
+    VFxUF.setUnderlyingValue(


VFxUF is set regardless of having users or not. Be consistent?

I see earlier that

... Code for VF is only generated if there are users of VF, to avoid unnecessary test changes.
Code for VF must be generated regardless of direct users; the issue is its position and possible repetition?

Yes, VFxUF is used to increment the canonical induction, which is present in almost all cases, but could also be done only if users exist.

... Code for VF is only generated if there are users of VF, to avoid unnecessary test changes.
Code for VF must be generated regardless of direct users; the issue is its position and possible repetition?

There are cases where VF separately isn't used, only as part of VFxUF. If only VFxUF is used, the constant multiply of VF * UF is folded, hence always generating VF separately would lead to additional test changes. Some alternatives are mentioned in my latest comment above.

Yes, VFxUF is used to increment the canonical induction, which is present in almost all cases, but could also be done only if users exist.

Exceptional use-less cases are loops whose trip count is VFxUF - where optimizeForVFAndUF() discards the canonical IV's increment by VFxUF?

So one way to improve consistency, w/o changing too many tests, would be to (also) set VFxUF only if used? Although logically it should always be used - to set the vector trip count(?).

At the moment, it is always used (added assert in 1a5a1e9).

ayalz

Adding various comments, overall looks good to me, provided @david-arm agrees.

ayalz · 2024-09-08T15:28:44Z

llvm/lib/Transforms/Vectorize/VPlan.cpp

@@ -928,8 +928,19 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,

  IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
  // FIXME: Model VF * UF computation completely in VPlan.


ok, so FIXME remains until a symbolic UF VPValue placeholder is introduced, as follow-up, along with a VPInstruction multiplying it with VF (introduced here), possibly subject to constant folding?

ayalz · 2024-09-08T15:32:33Z

llvm/lib/Transforms/Vectorize/VPlan.cpp

+            ? Builder.CreateMul(
+                  RuntimeVF, ConstantInt::get(TripCountV->getType(), State.UF))


ok, just confirming we're doing an explicit Mul here, compared to createStepForVF() doing multiplyCoefficientBy() below. Perhaps related to (vscale * VF) * UF vs. vscale * (VF * UF)?

ayalz · 2024-09-08T15:34:56Z

llvm/lib/Transforms/Vectorize/VPlan.cpp

+                  RuntimeVF, ConstantInt::get(TripCountV->getType(), State.UF))
+            : RuntimeVF);
+  } else {
+    VFxUF.setUnderlyingValue(


Yes, VFxUF is used to increment the canonical induction, which is present in almost all cases, but could also be done only if users exist.

Exceptional use-less cases are loops whose trip count is VFxUF - where optimizeForVFAndUF() discards the canonical IV's increment by VFxUF?

So one way to improve consistency, w/o changing too many tests, would be to (also) set VFxUF only if used? Although logically it should always be used - to set the vector trip count(?).

ayalz · 2024-09-08T22:18:38Z

llvm/lib/Transforms/Vectorize/VPlan.h

@@ -3282,6 +3287,8 @@ class VPlan {
  /// Represents the vector trip count.
  VPValue VectorTripCount;

+  VPValue VF;


(This is a follow-up thought, hence originally in brackets)
Regions (and blocks in general) currently model the HCFG only, leaving the def-use graph of values to recipes. A loop region has a canonical IV recipe which typically uses VFxUF as the canonical step controlling the loop. The latter could be a Mul recipe whose operands provide VF and UF, and a loop region could provide getVF() and getUF() to ease their retrieval?

ayalz · 2024-09-08T22:29:48Z

llvm/lib/Transforms/Vectorize/VPlan.cpp

+    Value *RuntimeVF =
+        createStepForVF(Builder, TripCountV->getType(), State.VF, 1);


Updated, planning to rename separately.

Thanks! Also noted earlier that the "Runtime" part is inaccurate/confusing:

The term RuntimeVF (here and below) may be confusing, as it relates to the Static (Fixed or Scalable) VF, as indicated in the Type of vector Values, rather than the Dynamic EVL.

ayalz · 2024-09-08T23:01:14Z

llvm/lib/Transforms/Vectorize/VPlan.cpp

-  VFxUF.setUnderlyingValue(
-      createStepForVF(Builder, TripCountV->getType(), State.VF, State.UF));
+  if (VF.getNumUsers()) {
+    Value *RuntimeVF = getRuntimeVF(Builder, TripCountV->getType(), State.VF);


nit: worth assigning auto TripCountType = TripCountV->getType() at the outset, or perhaps a shorter name.

Done, thanks!

ayalz · 2024-09-08T23:06:08Z

llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll

@@ -320,6 +324,7 @@ exit:
 define void @pred_cfg1(i32 %k, i32 %j) {
 ; CHECK-LABEL: LV: Checking a loop in 'pred_cfg1'
 ; CHECK:      VPlan 'Initial VPlan for VF={2},UF>=1' {
+; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF


This one is defined w/o being used?

Added missing use, thanks!

ayalz · 2024-09-08T23:09:22Z

llvm/test/Transforms/LoopVectorize/vplan-printing.ll

@@ -599,7 +602,7 @@ define void @print_expand_scev(i64 %y, ptr %ptr) {
 ; CHECK-NEXT: Successor(s): middle.block
 ; CHECK-EMPTY:
 ; CHECK-NEXT: middle.block:
-; CHECK-NEXT:   EMIT vp<[[CMP:%.+]]> = icmp eq vp<[[TC]]>, vp<[[VEC_TC]]>
+; CHECK-NEXT:   EMIT vp<[[CMP:%.+]]> = icmp eq vp<[[TC]]>, vp<[[VTC]]>


VEC_TC >> VTC?

The test originally used VEC_TC defined by a VPlan of an earlier function, while it should match VTC from the match for this function, which was OK so far. Pushed 3403438 to consistently use VTC across the file, removing the diff here.

ayalz · 2024-09-08T23:13:01Z

llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll

 ; CHECK-VF4UF2: call i32 @llvm.vscale.i32()
 ; CHECK-VF4UF2: call i32 @llvm.vscale.i32()
-; CHECK-VF4UF2: call i32 @llvm.vscale.i32()
 ; CHECK-VF4UF2: %[[VSCALE1:.*]] = call i32 @llvm.vscale.i32()


Still some redundancies remain.

Yes, there are a number of places remaining that re-compute runtime VF, will update those in follow-ups to use the VF VPValue.

ayalz · 2024-09-08T23:14:15Z

llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll

@@ -368,6 +371,7 @@ exit:
 define void @sink_replicate_region_after_replicate_region(ptr %ptr, ptr noalias %dst.2, i32 %x, i8 %y) optsize {
 ; CHECK-LABEL: sink_replicate_region_after_replicate_region
 ; CHECK:      VPlan 'Initial VPlan for VF={2},UF>=1' {
+; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF


This def is w/o use?

Added missed use, thanks!

david-arm · 2024-09-09T07:59:05Z

Adding various comments, overall looks good to me, provided @david-arm agrees.

Yeah, once you're happy that your comments are addressed @ayalz, then I'm happy too. I like the cleanup in the tests and the reduced IR generated by the vectoriser. I do think it's probably worth documenting somewhere any assumptions the vectoriser makes with regard to calculation of VF * UF and not wrapping, but doesn't have to be in this patch!

Add assertion to ensure invariant discussed in #95305.

ayalz

LGTM, with minor nits, also note several earlier thoughts - replace optimizeForVFAndUF() by setVF() and setUF(), rename "RuntimeVF", and a few comments in tests.

ayalz · 2024-09-09T19:57:24Z

llvm/lib/Transforms/Vectorize/VPlan.cpp

+    VF.printAsOperand(O, SlotTracker);
+    O << " = VF";
+  }
+
  if (VFxUF.getNumUsers() > 0) {


Should this condition be an assert?

Yes, will adjust separately

ayalz · 2024-09-09T19:58:02Z

llvm/lib/Transforms/Vectorize/VPlan.cpp

@@ -1551,6 +1568,8 @@ void VPSlotTracker::assignName(const VPValue *V) {
 }

 void VPSlotTracker::assignNames(const VPlan &Plan) {
+  if (Plan.VF.getNumUsers() > 0)
+    assignName(&Plan.VF);
  if (Plan.VFxUF.getNumUsers() > 0)


Should this condition be an assert?

Yes, will adjust separately

ayalz · 2024-09-09T20:05:43Z

llvm/lib/Transforms/Vectorize/VPlan.cpp

@@ -935,8 +935,18 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
  IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
  // FIXME: Model VF * UF computation completely in VPlan.
  assert(VFxUF.getNumUsers() && "VFxUF expected to always have users");
-  VFxUF.setUnderlyingValue(
-      createStepForVF(Builder, TripCountV->getType(), State.VF, State.UF));
+  Type *TCTy = TripCountV->getType();


nit: can be defined earlier to also serve TCMO above.

Done, thanks!

The inconsistency surfaced in #95305. Split off to reduce the diff.

Refactors VPVectorPointerRecipe to use the VF VPValue to obtain the runtime VF, similar to llvm#95305. Since only reverse vector pointers require the runtime VF, the patch sets VPUnrollPart::PartOpIndex to 1 for vector pointers and 2 for reverse vector pointers. As a result, the generation of reverse vector pointers is moved into a separate recipe.

Refactors VPVectorPointerRecipe to use the VF VPValue to obtain the runtime VF, similar to #95305. Since only reverse vector pointers require the runtime VF, the patch sets VPUnrollPart::PartOpIndex to 1 for vector pointers and 2 for reverse vector pointers. As a result, the generation of reverse vector pointers is moved into a separate recipe.

Refactors VPVectorPointerRecipe to use the VF VPValue to obtain the runtime VF, similar to llvm#95305. Since only reverse vector pointers require the runtime VF, the patch sets VPUnrollPart::PartOpIndex to 1 for vector pointers and 2 for reverse vector pointers. As a result, the generation of reverse vector pointers is moved into a separate recipe.

llvmbot added vectorizers llvm:transforms labels Jun 12, 2024

fhahn mentioned this pull request Jun 12, 2024

[VPlan] Delay adding canonical IV increment. #82270

Open

fhahn requested review from ayalz, aniragil and david-arm June 25, 2024 11:21

david-arm reviewed Jun 25, 2024

View reviewed changes

fhahn added a commit that referenced this pull request Jul 16, 2024

[LV] Add missing check lines in vector.ph in tests.

4469a1e

Match all instructions in vector.ph in sve-inductions-unusual-types.ll. This should help to better show the impact of #95305.

fhahn added 2 commits July 16, 2024 14:35

!fixup merge conditions, add VPlan printing, use for FP

dbdd4f3

fhahn force-pushed the vplan-vf-vpvalue branch from eb31272 to dbdd4f3 Compare July 16, 2024 13:39

llvmbot added the backend:PowerPC label Jul 16, 2024

ayalz reviewed Jul 18, 2024

View reviewed changes

fhahn added 2 commits August 15, 2024 11:09

Merge remote-tracking branch 'origin/main' into vplan-vf-vpvalue

031e6b2

!fixup address latest comments, thanks!

907c19f

!fixup fix formatting

77c1e61

Merge branch 'main' into vplan-vf-vpvalue

7659683

fhahn mentioned this pull request Sep 5, 2024

[VPlan] Implement interleaving as VPlan-to-VPlan transform. #95842

Merged

fhahn added 2 commits September 5, 2024 19:34

Merge remote-tracking branch 'origin/main' into vplan-vf-vpvalue

4d495d0

!fixup move getVF before getVFxUF.

4c0467c

ayalz reviewed Sep 6, 2024

View reviewed changes

fhahn added 2 commits September 6, 2024 11:30

Merge remote-tracking branch 'origin/main' into vplan-vf-vpvalue

2eaf251

!fixup address latest comments, thanks!

9e95c10

ayalz reviewed Sep 8, 2024

View reviewed changes

Merge remote-tracking branch 'origin/main' into vplan-vf-vpvalue

4d5f2e5

fhahn added a commit that referenced this pull request Sep 9, 2024

[VPlan] Assert that VFxUF is always used.

1a5a1e9

Add assertion to ensure invariant discussed in #95305.

fhahn added 2 commits September 9, 2024 14:32

Merge remote-tracking branch 'origin/main' into vplan-vf-vpvalue

4ea3c6f

!fixup address latest comments, thanks!

ee8409c

ayalz approved these changes Sep 9, 2024

View reviewed changes

Merge remote-tracking branch 'origin/main' into vplan-vf-vpvalue

330dde5

fhahn added a commit that referenced this pull request Sep 9, 2024

[VPlan] Consistently use VTC for vector trip count in vplan-printing.ll.

3403438

The inconsistency surfaced in #95305. Split off to reduce the diff.

fhahn added 2 commits September 9, 2024 21:37

Merge remote-tracking branch 'origin/main' into vplan-vf-vpvalue

b8bcc2f

!fixup address latest comments, thanks

2b1bf7d

fhahn merged commit a794ee4 into llvm:main Sep 10, 2024
8 checks passed

fhahn deleted the vplan-vf-vpvalue branch September 10, 2024 09:41

arcbbb mentioned this pull request Oct 3, 2024

[LV][VPlan] Use VF VPValue in VPVectorPointerRecipe #110974

Merged

		@@ -928,8 +928,19 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,

		IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
		// FIXME: Model VF * UF computation completely in VPlan.

	VPValue VF;
	/// Represents the vectorization factor of the loop.
	VPValue VF;

	VPValue *getVF() { return &VF; };
	/// Returns the VF of the vector loop region.
	VPValue *getVF() { return &VF; };

		Value *RuntimeVF =
		createStepForVF(Builder, TripCountV->getType(), State.VF, 1);

	Value *RuntimeVF =
	createStepForVF(Builder, TripCountV->getType(), State.VF, 1);
	Value *RuntimeVF = getRuntimeVF(Builder, TripCountV->getType(), State.VF);

		? Builder.CreateMul(
		RuntimeVF, ConstantInt::get(TripCountV->getType(), State.UF))

		Old2NewVPValues[&VFxUF] = &NewPlan->VFxUF;
		Old2NewVPValues[&VF] = &NewPlan->VF;

	IV, getStartValue(), getStepValue(), getOperand(2), IndDesc, Trunc);
	IV, getStartValue(), getStepValue(), getVFValue(), IndDesc, Trunc);

[VPlan] Add VPValue for VF, use it for VPWidenIntOrFpInductionRecipe. #95305

[VPlan] Add VPValue for VF, use it for VPWidenIntOrFpInductionRecipe. #95305

Conversation

fhahn commented Jun 12, 2024

llvmbot commented Jun 12, 2024 • edited Loading

david-arm left a comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

github-actions bot commented Aug 15, 2024 • edited Loading

fhahn commented Aug 22, 2024

fhahn commented Sep 3, 2024

fhahn commented Sep 5, 2024

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

ayalz left a comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

david-arm commented Sep 9, 2024

ayalz left a comment

Choose a reason for hiding this comment

llvmbot commented Jun 12, 2024 •

edited

Loading

github-actions bot commented Aug 15, 2024 •

edited

Loading