Skip to content

[Tests][LV][AArch64] Pre-commit tests for changing loop interleaving count computation for loops that need to run scalar iterations #79640

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jan 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
; RUN: opt < %s -tiny-trip-count-interleave-threshold=16 -force-target-max-vector-interleave=8 -p loop-vectorize -S -pass-remarks=loop-vectorize -disable-output 2>&1 | FileCheck %s
; RUN: opt < %s -tiny-trip-count-interleave-threshold=16 -force-target-max-vector-interleave=8 -p loop-vectorize -S 2>&1 | FileCheck %s -check-prefix=CHECK-IR
; TODO: remove -tiny-trip-count-interleave-threshold once the interleave threshold is removed

target triple = "aarch64-linux-gnu"
Expand Down Expand Up @@ -125,6 +126,30 @@ for.end:
ret void
}

; This has the same profile-guided estimated trip count as loop_with_profile_tc_64 but since the
; resulting interleaved group in this case may access memory out-of-bounds, it requires a scalar
; epilogue iteration for correctness, making at most 63 iterations available for interleaving.
; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar
; remainder than IC 2
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
define void @loop_with_profile_tc_64_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
entry:
br label %for.body

for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%gep.src = getelementptr inbounds [3 x i8], ptr %p, i64 %i, i64 0
%l = load i8, ptr %gep.src, align 1
%gep.dst = getelementptr inbounds i8, ptr %q, i64 %i
store i8 %l, ptr %gep.dst, align 1
%i.next = add nuw nsw i64 %i, 1
%cond = icmp eq i64 %i.next, %n
br i1 %cond, label %for.end, label %for.body, !prof !4

for.end:
ret void
}

; For a loop with a profile-guided estimated TC of 100, when the auto-vectorizer chooses VF 16,
; it should choose conservatively IC 2 so that the vector loop runs twice at least
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
Expand Down Expand Up @@ -153,6 +178,15 @@ for.end:
; it should choose conservatively IC 4 so that the vector loop runs twice at least
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
define void @loop_with_profile_tc_128(ptr noalias %p, ptr noalias %q, i64 %n) {
; CHECK-IR-LABEL: define void @loop_with_profile_tc_128(
; CHECK-IR-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]], i64 [[N:%.*]]) {
; CHECK-IR-NEXT: iter.check:
; CHECK-IR-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
; CHECK-IR-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]], !prof [[PROF6:![0-9]+]]
; CHECK-IR: vector.main.loop.iter.check:
; CHECK-IR-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], 64
; CHECK-IR-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF6]]
;
entry:
br label %for.body

Expand All @@ -173,6 +207,40 @@ for.end:
ret void
}

; This has the same profile-guided estimated trip count as loop_with_profile_tc_128 but since
; the resulting interleaved group in this case may access memory out-of-bounds, it requires
; a scalar epilogue iteration for correctness, making at most 127 iterations available for
; interleaving.
; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar
; remainder than IC 4
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
define void @loop_with_profile_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
; CHECK-IR-LABEL: define void @loop_with_profile_tc_128_scalar_epilogue_reqd(
; CHECK-IR-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]], i64 [[N:%.*]]) {
; CHECK-IR-NEXT: iter.check:
; CHECK-IR-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[N]], 8
; CHECK-IR-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]], !prof [[PROF6]]
; CHECK-IR: vector.main.loop.iter.check:
; CHECK-IR-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ule i64 [[N]], 64
; CHECK-IR-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF6]]
;
entry:
br label %for.body

for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%gep.src = getelementptr inbounds [3 x i8], ptr %p, i64 %i, i64 0
%l = load i8, ptr %gep.src, align 1
%gep.dst = getelementptr inbounds i8, ptr %q, i64 %i
store i8 %l, ptr %gep.dst, align 1
%i.next = add nuw nsw i64 %i, 1
%cond = icmp eq i64 %i.next, %n
br i1 %cond, label %for.end, label %for.body, !prof !6

for.end:
ret void
}

; For a loop with a profile-guided estimated TC of 129, when the auto-vectorizer chooses VF 16,
; it should choose conservatively IC 4 so that the vector loop runs twice at least
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
; RUN: opt < %s -tiny-trip-count-interleave-threshold=16 -force-target-max-vector-interleave=8 -p loop-vectorize -S -pass-remarks=loop-vectorize -disable-output 2>&1 | FileCheck %s
; RUN: opt < %s -tiny-trip-count-interleave-threshold=16 -force-target-max-vector-interleave=8 -p loop-vectorize -S 2>&1 | FileCheck %s -check-prefix=CHECK-IR
; TODO: remove -tiny-trip-count-interleave-threshold once the interleave threshold is removed

target triple = "aarch64-linux-gnu"
Expand Down Expand Up @@ -29,6 +30,30 @@ for.end:
ret void
}

; This has the same trip count as loop_with_tc_32 but since the resulting interleaved group
; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for
; correctness, making at most 31 iterations available for interleaving.
; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar remainder
; than IC 2
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
define void @loop_with_tc_32_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q) {
entry:
br label %for.body

for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%gep.src = getelementptr inbounds [3 x i8], ptr %p, i64 %i, i64 0
%l = load i8, ptr %gep.src, align 1
%gep.dst = getelementptr inbounds i8, ptr %q, i64 %i
store i8 %l, ptr %gep.dst, align 1
%i.next = add nuw nsw i64 %i, 1
%cond = icmp eq i64 %i.next, 32
br i1 %cond, label %for.end, label %for.body

for.end:
ret void
}

; For this loop with known TC of 33, when the auto-vectorizer chooses VF 16, it should choose
; IC 2 since there is a small remainder loop TC that needs to run after the vector loop.
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
Expand Down Expand Up @@ -177,6 +202,10 @@ for.end:
; IC 8 since there is no remainder loop run needed after the vector loop runs
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
define void @loop_with_tc_128(ptr noalias %p, ptr noalias %q) {
; CHECK-IR-LABEL: define void @loop_with_tc_128(
; CHECK-IR-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]]) {
; CHECK-IR-NEXT: entry:
; CHECK-IR-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
entry:
br label %for.body

Expand All @@ -197,6 +226,35 @@ for.end:
ret void
}

; This has the same trip count as loop_with_tc_128 but since the resulting interleaved group
; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for
; correctness, making at most 127 iterations available for interleaving.
; TODO: Like loop_with_tc_128, the entry block should branch into the vector loop, instead of the scalar epilogue.
; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar
; remainder than IC 4
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
define void @loop_with_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q) {
; CHECK-IR-LABEL: define void @loop_with_tc_128_scalar_epilogue_reqd(
; CHECK-IR-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]]) {
; CHECK-IR-NEXT: entry:
; CHECK-IR-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
entry:
br label %for.body

for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%gep.src = getelementptr inbounds [3 x i8], ptr %p, i64 %i, i64 0
%l = load i8, ptr %gep.src, align 1
%gep.dst = getelementptr inbounds i8, ptr %q, i64 %i
store i8 %l, ptr %gep.dst, align 1
%i.next = add nuw nsw i64 %i, 1
%cond = icmp eq i64 %i.next, 128
br i1 %cond, label %for.end, label %for.body

for.end:
ret void
}

; For this loop with known TC of 129, when the auto-vectorizer chooses VF 16, it should choose
; IC 8 since there is a small remainder loop that needs to run after the vector loop
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
Expand Down