Skip to content

[LoopVectorize][NFC] Rewrite tests to check output of vplan cost model #113697

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/NativeFormatting.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
Expand Down Expand Up @@ -7403,7 +7404,20 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,

// Now compute and add the VPlan-based cost.
Cost += Plan.cost(VF, CostCtx);
LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost << "\n");
#ifndef NDEBUG
unsigned EstimatedWidth = VF.getKnownMinValue();
if (VF.isScalable())
if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI))
EstimatedWidth *= *VScale;
Comment on lines +7409 to +7411
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would probably be convenient to have a simple helper for this, since we now do it in more and more places.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good suggestion! I've created #116247

LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
<< " (Estimated cost per lane: ");
if (Cost.isValid()) {
double CostPerLane = double(*Cost.getValue()) / EstimatedWidth;
LLVM_DEBUG(dbgs() << format("%.1f", CostPerLane));
} else /* No point dividing an invalid cost - it will still be invalid */
LLVM_DEBUG(dbgs() << "Invalid");
LLVM_DEBUG(dbgs() << ")\n");
#endif
return Cost;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ target triple = "aarch64--linux-gnu"
; %var4 a lower scalarization overhead.
;
; COST-LABEL: predicated_udiv_scalarized_operand
; COST: LV: Found an estimated cost of 5 for VF 2 For instruction: %var4 = udiv i64 %var2, %var3
; COST: Cost of 5 for VF 2: profitable to scalarize %var4 = udiv i64 %var2, %var3
;
;
define i64 @predicated_udiv_scalarized_operand(ptr %a, i64 %x) optsize {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,14 @@
; CM: LV: Found uniform instruction: %a = extractvalue { i64, i64 } %sv, 0
; CM: LV: Found uniform instruction: %b = extractvalue { i64, i64 } %sv, 1

; Ensure the extractvalue + add instructions are hoisted out
; CM: vector.ph:
; CM: CLONE ir<%a> = extractvalue ir<%sv>
; CM: CLONE ir<%b> = extractvalue ir<%sv>
; CM: WIDEN ir<%add> = add ir<%a>, ir<%b>
; CM: Successor(s): vector loop

; CM: LV: Scalar loop costs: 5.
; CM: LV: Found an estimated cost of 0 for VF 2 For instruction: %a = extractvalue { i64, i64 } %sv, 0
; CM-NEXT: LV: Found an estimated cost of 0 for VF 2 For instruction: %b = extractvalue { i64, i64 } %sv, 1

; Check that the extractvalue operands are actually free in vector code.

Expand Down Expand Up @@ -58,12 +63,14 @@ exit:
; Similar to the test case above, but checks getVectorCallCost as well.
declare float @powf(float, float) readnone nounwind

; CM: LV: Found uniform instruction: %a = extractvalue { float, float } %sv, 0
; CM: LV: Found uniform instruction: %b = extractvalue { float, float } %sv, 1
; Ensure the extractvalue + add instructions are hoisted out
; CM: vector.ph:
; CM: CLONE ir<%a> = extractvalue ir<%sv>
; CM: CLONE ir<%b> = extractvalue ir<%sv>
; CM: WIDEN ir<%add> = add ir<%a>, ir<%b>
; CM: Successor(s): vector loop

; CM: LV: Scalar loop costs: 14.
; CM: LV: Found an estimated cost of 0 for VF 2 For instruction: %a = extractvalue { float, float } %sv, 0
; CM-NEXT: LV: Found an estimated cost of 0 for VF 2 For instruction: %b = extractvalue { float, float } %sv, 1

; FORCED-LABEL: define void @test_getVectorCallCost

Expand Down
14 changes: 7 additions & 7 deletions llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@ target triple = "aarch64--linux-gnu"

; CHECK-COST-LABEL: sadd
; CHECK-COST: Found an estimated cost of 6 for VF 1 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
; CHECK-COST: Found an estimated cost of 4 for VF 2 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
; CHECK-COST: Found an estimated cost of 1 for VF 4 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
; CHECK-COST: Found an estimated cost of 1 for VF 8 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset)
; CHECK-COST: Cost of 4 for VF 2: WIDEN-INTRINSIC ir<%1> = call llvm.sadd.sat(ir<%0>, ir<%offset>)
; CHECK-COST: Cost of 1 for VF 4: WIDEN-INTRINSIC ir<%1> = call llvm.sadd.sat(ir<%0>, ir<%offset>)
; CHECK-COST: Cost of 1 for VF 8: WIDEN-INTRINSIC ir<%1> = call llvm.sadd.sat(ir<%0>, ir<%offset>)

define void @saddsat(ptr nocapture readonly %pSrc, i16 signext %offset, ptr nocapture noalias %pDst, i32 %blockSize) #0 {
; CHECK-LABEL: @saddsat(
Expand Down Expand Up @@ -95,10 +95,10 @@ while.end: ; preds = %while.body, %entry

; CHECK-COST-LABEL: umin
; CHECK-COST: Found an estimated cost of 2 for VF 1 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
; CHECK-COST: Found an estimated cost of 1 for VF 2 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
; CHECK-COST: Found an estimated cost of 1 for VF 4 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
; CHECK-COST: Found an estimated cost of 1 for VF 8 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
; CHECK-COST: Found an estimated cost of 1 for VF 16 For instruction: %1 = tail call i8 @llvm.umin.i8(i8 %0, i8 %offset)
; CHECK-COST: Cost of 1 for VF 2: WIDEN-INTRINSIC ir<%1> = call llvm.umin(ir<%0>, ir<%offset>)
; CHECK-COST: Cost of 1 for VF 4: WIDEN-INTRINSIC ir<%1> = call llvm.umin(ir<%0>, ir<%offset>)
; CHECK-COST: Cost of 1 for VF 8: WIDEN-INTRINSIC ir<%1> = call llvm.umin(ir<%0>, ir<%offset>)
; CHECK-COST: Cost of 1 for VF 16: WIDEN-INTRINSIC ir<%1> = call llvm.umin(ir<%0>, ir<%offset>)

define void @umin(ptr nocapture readonly %pSrc, i8 signext %offset, ptr nocapture noalias %pDst, i32 %blockSize) #0 {
; CHECK-LABEL: @umin(
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
target triple = "aarch64-unknown-linux-gnu"

; CHECK-COST: Checking a loop in 'fixed_width'
; CHECK-COST: Found an estimated cost of 10 for VF 2 For instruction: store i32 2, ptr %arrayidx1, align 4
; CHECK-COST: Found an estimated cost of 20 for VF 4 For instruction: store i32 2, ptr %arrayidx1, align 4
; CHECK-COST: Cost of 10 for VF 2: WIDEN store vp<%6>, ir<2>, vp<%5>
; CHECK-COST: Cost of 20 for VF 4: WIDEN store vp<%6>, ir<2>, vp<%5>
; CHECK-COST: Selecting VF: 1.

; We should decide this loop is not worth vectorising using fixed width vectors
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@ target triple = "aarch64"
; due to invalid cost decisions. The loop below has a low maximum trip count,
; so will be masked.

; COST: LV: Found an estimated cost of 3000000 for VF 2 For instruction: %0 = load
; COST: LV: Found an estimated cost of 3000000 for VF 4 For instruction: %0 = load
; COST: LV: Found an estimated cost of 3000000 for VF 8 For instruction: %0 = load
; COST: LV: Found an estimated cost of 3000000 for VF 16 For instruction: %0 = load
; COST: Cost of 3000000 for VF 2: REPLICATE ir<%0> = load
; COST: Cost of 3000000 for VF 4: REPLICATE ir<%0> = load
; COST: Cost of 3000000 for VF 8: REPLICATE ir<%0> = load
; COST: Cost of 3000000 for VF 16: REPLICATE ir<%0> = load
; COST: LV: Selecting VF: 1.

define i32 @test(ptr nocapture noundef readonly %pInVec, ptr nocapture noundef readonly %pInA1, ptr nocapture noundef readonly %pInA2, ptr nocapture noundef readonly %pInA3, ptr nocapture noundef readonly %pInA4, i32 noundef %numCols) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ target triple = "aarch64--linux-gnu"

; CHECK-LABEL: all_scalar
; CHECK: LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 2
; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %i.next = add nuw nsw i64 %i, 2
; CHECK: LV: Not considering vector loop of width 2 because it will not generate any vector instructions
;
define void @all_scalar(ptr %a, i64 %n) {
Expand All @@ -27,7 +26,6 @@ for.end:

; CHECK-LABEL: PR33193
; CHECK: LV: Found scalar instruction: %i.next = zext i32 %j.next to i64
; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction: %i.next = zext i32 %j.next to i64
; CHECK: LV: Not considering vector loop of width 8 because it will not generate any vector instructions
%struct.a = type { i32, i8 }
define void @PR33193(ptr %a, i64 %n) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ target triple = "aarch64-unknown-linux-gnu"
;; registers required for a <vscale x 4 x fp128> when trying to maximize
;; vector bandwidth with SVE.

; CHECK: LV: Found an estimated cost of Invalid for VF vscale x 2 For instruction: %load.ext = fpext double %load.in to fp128
; CHECK: Cost of Invalid for VF vscale x 2: WIDEN-CAST ir<%load.ext> = fpext ir<%load.in> to fp128

define void @load_ext_trunc_store(ptr readonly %in, ptr noalias %out, i64 %N) {
; CHECK-LABEL: define void @load_ext_trunc_store(
Expand Down
Original file line number Diff line number Diff line change
@@ -1,54 +1,59 @@
; REQUIRES: asserts
; RUN: opt -mtriple=aarch64 -mattr=+sve \
; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
; RUN: | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE4
; RUN: | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE16

; RUN: opt -mtriple=aarch64 -mattr=+sve -mcpu=generic \
; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
; RUN: | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE4
; RUN: | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE16

; RUN: opt -mtriple=aarch64 -mcpu=neoverse-v1 \
; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
; RUN: | FileCheck %s --check-prefixes=NEOVERSE-V1,VF-VSCALE4
; RUN: | FileCheck %s --check-prefixes=NEOVERSE-V1,VF-VSCALE16

; RUN: opt -mtriple=aarch64 -mcpu=neoverse-n2 \
; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
; RUN: | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE4
; RUN: | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE16

; RUN: opt -mtriple=aarch64 -mcpu=neoverse-n2 \
; RUN: opt -mtriple=aarch64 -mcpu=neoverse-v2 \
; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
; RUN: | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE4
; RUN: | FileCheck %s --check-prefixes=NEOVERSE-V2,VF-16

; GENERIC: Cost for VF vscale x 2: 11 (Estimated cost per lane: 2.8)
; GENERIC: Cost for VF vscale x 4: 11 (Estimated cost per lane: 1.4)
; GENERIC: LV: Selecting VF: vscale x 16

; GENERIC: LV: Vector loop of width vscale x 2 costs: 3 (assuming a minimum vscale of 2).
; GENERIC: LV: Vector loop of width vscale x 4 costs: 1 (assuming a minimum vscale of 2).
; NEOVERSE-V1: Cost for VF vscale x 2: 11 (Estimated cost per lane: 2.8)
; NEOVERSE-V1: Cost for VF vscale x 4: 11 (Estimated cost per lane: 1.4)
; NEOVERSE-V1: LV: Selecting VF: vscale x 16

; NEOVERSE-V1: LV: Vector loop of width vscale x 2 costs: 3 (assuming a minimum vscale of 2).
; NEOVERSE-V1: LV: Vector loop of width vscale x 4 costs: 1 (assuming a minimum vscale of 2).
; NEOVERSE-N2: Cost for VF vscale x 2: 11 (Estimated cost per lane: 5.5)
; NEOVERSE-N2: Cost for VF vscale x 4: 11 (Estimated cost per lane: 2.8)
; NEOVERSE-N2: LV: Selecting VF: vscale x 16

; NEOVERSE-N2: LV: Vector loop of width vscale x 2 costs: 6 (assuming a minimum vscale of 1).
; NEOVERSE-N2: LV: Vector loop of width vscale x 4 costs: 3 (assuming a minimum vscale of 1).
; NEOVERSE-V2: Cost for VF vscale x 2: 11 (Estimated cost per lane: 5.5)
; NEOVERSE-V2: Cost for VF vscale x 4: 11 (Estimated cost per lane: 2.8)
; NEOVERSE-V2: LV: Selecting VF: 16

; VF-4: <4 x i32>
; VF-VSCALE4: <16 x i32>
; VF-16: <16 x i8>
; VF-VSCALE16: <vscale x 16 x i8>
define void @test0(ptr %a, ptr %b, ptr %c) #0 {
entry:
br label %loop

loop:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
%arrayidx = getelementptr inbounds i32, ptr %c, i64 %iv
%0 = load i32, ptr %arrayidx, align 4
%arrayidx = getelementptr inbounds i8, ptr %c, i64 %iv
%0 = load i8, ptr %arrayidx, align 4
%arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %iv
%1 = load i8, ptr %arrayidx2, align 4
%zext = zext i8 %1 to i32
%add = add nsw i32 %zext, %0
%arrayidx5 = getelementptr inbounds i32, ptr %a, i64 %iv
store i32 %add, ptr %arrayidx5, align 4
%add = add nsw i8 %0, %1
%arrayidx5 = getelementptr inbounds i8, ptr %a, i64 %iv
store i8 %add, ptr %arrayidx5, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 1024
br i1 %exitcond.not, label %exit, label %loop

exit:
ret void
}

22 changes: 13 additions & 9 deletions llvm/test/Transforms/LoopVectorize/AArch64/select-costs.ll
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,15 @@ target triple = "arm64-apple-ios5.0.0"

define void @selects_1(ptr nocapture %dst, i32 %A, i32 %B, i32 %C, i32 %N) {
; CHECK: LV: Checking a loop in 'selects_1'
; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %cond = select i1 %cmp1, i32 10, i32 %and
; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %cond6 = select i1 %cmp2, i32 30, i32 %and
; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %cond11 = select i1 %cmp7, i32 %cond, i32 %cond6

; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction: %cond = select i1 %cmp1, i32 10, i32 %and
; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction: %cond6 = select i1 %cmp2, i32 30, i32 %and
; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction: %cond11 = select i1 %cmp7, i32 %cond, i32 %cond6
; CHECK: Cost of 1 for VF 2: WIDEN-SELECT ir<%cond> = select ir<%cmp1>, ir<10>, ir<%and>
; CHECK: Cost of 1 for VF 2: WIDEN-SELECT ir<%cond6> = select ir<%cmp2>, ir<30>, ir<%and>
; CHECK: Cost of 1 for VF 2: WIDEN-SELECT ir<%cond11> = select ir<%cmp7>, ir<%cond>, ir<%cond6>

; CHECK: Cost of 1 for VF 4: WIDEN-SELECT ir<%cond> = select ir<%cmp1>, ir<10>, ir<%and>
; CHECK: Cost of 1 for VF 4: WIDEN-SELECT ir<%cond6> = select ir<%cmp2>, ir<30>, ir<%and>
; CHECK: Cost of 1 for VF 4: WIDEN-SELECT ir<%cond11> = select ir<%cmp7>, ir<%cond>, ir<%cond6>

; CHECK: LV: Selecting VF: 4

entry:
Expand Down Expand Up @@ -48,9 +50,11 @@ for.cond.cleanup: ; preds = %for.cond.cleanup.lo

define i32 @multi_user_cmp(ptr readonly %a, i64 noundef %n) {
; CHECK: LV: Checking a loop in 'multi_user_cmp'
; CHECK: LV: Found an estimated cost of 4 for VF 16 For instruction: %cmp1 = fcmp olt float %load1, 0.000000e+00
; CHECK: LV: Found an estimated cost of 1 for VF 16 For instruction: %.any.0.off0 = select i1 %cmp1, i1 true, i1 %any.0.off09
; CHECK: LV: Found an estimated cost of 1 for VF 16 For instruction: %all.off = select i1 %cmp1, i1 %all.off.next, i1 false
; CHECK: Cost of 1 for VF 16:
; CHECK: any-of reduction %all.off = select i1 %cmp1, i1 %all.off.next, i1 false
; CHECK: Cost of 1 for VF 16:
; CHECK: any-of reduction %.any.0.off0 = select i1 %cmp1, i1 true, i1 %any.0.off09
; CHECK: Cost of 4 for VF 16: WIDEN ir<%cmp1> = fcmp olt ir<%load1>, ir<0.000000e+00>
; CHECK: LV: Selecting VF: 16.
entry:
br label %for.body
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@

target triple = "aarch64-unknown-linux-gnu"

; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %indvars.iv1294 = phi i7 [ %indvars.iv.next1295, %for.body ], [ 0, %entry ]
; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %addi7 = add i7 %indvars.iv1294, 0
; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %indvars.iv.next1295 = add i7 %indvars.iv1294, 1
; DEBUG: Cost of Invalid for VF vscale x 1: induction instruction %indvars.iv.next1295 = add i7 %indvars.iv1294, 1
; DEBUG: Cost of Invalid for VF vscale x 1: induction instruction %indvars.iv1294 = phi i7 [ %indvars.iv.next1295, %for.body ], [ 0, %entry ]
; DEBUG: Cost of Invalid for VF vscale x 1: WIDEN ir<%addi7> = add ir<%indvars.iv1294>, ir<0>

define void @induction_i7(ptr %dst) #0 {
; CHECK-LABEL: define void @induction_i7(
Expand Down Expand Up @@ -71,9 +71,9 @@ for.end: ; preds = %for.body
}


; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %indvars.iv1294 = phi i3 [ %indvars.iv.next1295, %for.body ], [ 0, %entry ]
; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %zexti3 = zext i3 %indvars.iv1294 to i64
; DEBUG: Found an estimated cost of Invalid for VF vscale x 1 For instruction: %indvars.iv.next1295 = add i3 %indvars.iv1294, 1
; DEBUG: Cost of Invalid for VF vscale x 1: induction instruction %indvars.iv.next1295 = add i3 %indvars.iv1294, 1
; DEBUG: Cost of Invalid for VF vscale x 1: induction instruction %indvars.iv1294 = phi i3 [ %indvars.iv.next1295, %for.body ], [ 0, %entry ]
; DEBUG: Cost of Invalid for VF vscale x 1: WIDEN-CAST ir<%zexti3> = zext ir<%indvars.iv1294> to i64

define void @induction_i3_zext(ptr %dst) #0 {
; CHECK-LABEL: define void @induction_i3_zext(
Expand Down
20 changes: 14 additions & 6 deletions llvm/test/Transforms/LoopVectorize/AArch64/sve-strict-fadd-cost.ll
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,15 @@

target triple="aarch64-unknown-linux-gnu"

; CHECK: Found an estimated cost of 4 for VF vscale x 2 For instruction: %add = fadd float %0, %sum.07
; CHECK: Found an estimated cost of 8 for VF vscale x 4 For instruction: %add = fadd float %0, %sum.07
; CHECK-CPU-NEOVERSE-N2: Found an estimated cost of 2 for VF vscale x 2 For instruction: %add = fadd float %0, %sum.07
; CHECK-CPU-NEOVERSE-N2: Found an estimated cost of 4 for VF vscale x 4 For instruction: %add = fadd float %0, %sum.07
; CHECK-LABEL: LV: Checking a loop in 'fadd_strict32'
; CHECK: Cost of 4 for VF vscale x 2:
; CHECK: in-loop reduction %add = fadd float %0, %sum.07
; CHECK: Cost of 8 for VF vscale x 4:
; CHECK: in-loop reduction %add = fadd float %0, %sum.07
; CHECK-CPU-NEOVERSE-N2: Cost of 2 for VF vscale x 2:
; CHECK-CPU-NEOVERSE-N2: in-loop reduction %add = fadd float %0, %sum.07
; CHECK-CPU-NEOVERSE-N2: Cost of 4 for VF vscale x 4:
; CHECK-CPU-NEOVERSE-N2: in-loop reduction %add = fadd float %0, %sum.07

define float @fadd_strict32(ptr noalias nocapture readonly %a, i64 %n) #0 {
entry:
Expand All @@ -31,8 +36,11 @@ for.end:
}


; CHECK: Found an estimated cost of 4 for VF vscale x 2 For instruction: %add = fadd double %0, %sum.07
; CHECK-CPU-NEOVERSE-N2: Found an estimated cost of 2 for VF vscale x 2 For instruction: %add = fadd double %0, %sum.07
; CHECK-LABEL: LV: Checking a loop in 'fadd_strict64'
; CHECK: Cost of 4 for VF vscale x 2:
; CHECK: in-loop reduction %add = fadd double %0, %sum.07
; CHECK-CPU-NEOVERSE-N2: Cost of 2 for VF vscale x 2:
; CHECK-CPU-NEOVERSE-N2: in-loop reduction %add = fadd double %0, %sum.07

define double @fadd_strict64(ptr noalias nocapture readonly %a, i64 %n) #0 {
entry:
Expand Down
Loading
Loading