llvm · nilanjana87 · Jan 29, 2024 · Jan 25, 2024 · Jan 26, 2024 · Jan 26, 2024
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_estimated_tc.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -tiny-trip-count-interleave-threshold=16 -force-target-max-vector-interleave=8 -p loop-vectorize -S -pass-remarks=loop-vectorize -disable-output 2>&1 | FileCheck %s
+; RUN: opt < %s -tiny-trip-count-interleave-threshold=16 -force-target-max-vector-interleave=8 -p loop-vectorize -S 2>&1 | FileCheck %s -check-prefix=CHECK-IR
 ; TODO: remove -tiny-trip-count-interleave-threshold once the interleave threshold is removed
 
 target triple = "aarch64-linux-gnu"
@@ -125,6 +126,30 @@ for.end:
   ret void
 }
 
+; This has the same profile-guided estimated trip count as loop_with_profile_tc_64 but since the 
+; resulting interleaved group in this case may access memory out-of-bounds, it requires a scalar 
+; epilogue iteration for correctness, making at most 63 iterations available for interleaving.
+; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar 
+; remainder than IC 2
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
+define void @loop_with_profile_tc_64_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %gep.src = getelementptr inbounds [3 x i8], ptr %p, i64 %i, i64 0
+  %l = load i8, ptr %gep.src, align 1
+  %gep.dst = getelementptr inbounds i8, ptr %q, i64 %i
+  store i8 %l, ptr %gep.dst, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %for.body, !prof !4
+
+for.end:
+  ret void
+}
+
 ; For a loop with a profile-guided estimated TC of 100, when the auto-vectorizer chooses VF 16, 
 ; it should choose conservatively IC 2 so that the vector loop runs twice at least
 ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
@@ -153,6 +178,15 @@ for.end:
 ; it should choose conservatively IC 4 so that the vector loop runs twice at least
 ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
 define void @loop_with_profile_tc_128(ptr noalias %p, ptr noalias %q, i64 %n) {
+; CHECK-IR-LABEL: define void @loop_with_profile_tc_128(
+; CHECK-IR-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]], i64 [[N:%.*]]) {
+; CHECK-IR-NEXT:  iter.check:
+; CHECK-IR-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; CHECK-IR-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]], !prof [[PROF6:![0-9]+]]
+; CHECK-IR:       vector.main.loop.iter.check:
+; CHECK-IR-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], 64
+; CHECK-IR-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF6]]
+;
 entry:
   br label %for.body
 
@@ -173,6 +207,40 @@ for.end:
   ret void
 }
 
+; This has the same profile-guided estimated trip count as loop_with_profile_tc_128 but since 
+; the resulting interleaved group in this case may access memory out-of-bounds, it requires 
+; a scalar epilogue iteration for correctness, making at most 127 iterations available for 
+; interleaving.
+; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar 
+; remainder than IC 4
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
+define void @loop_with_profile_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
+; CHECK-IR-LABEL: define void @loop_with_profile_tc_128_scalar_epilogue_reqd(
+; CHECK-IR-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]], i64 [[N:%.*]]) {
+; CHECK-IR-NEXT:  iter.check:
+; CHECK-IR-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[N]], 8
+; CHECK-IR-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]], !prof [[PROF6]]
+; CHECK-IR:       vector.main.loop.iter.check:
+; CHECK-IR-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ule i64 [[N]], 64
+; CHECK-IR-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]], !prof [[PROF6]]
+;
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %gep.src = getelementptr inbounds [3 x i8], ptr %p, i64 %i, i64 0
+  %l = load i8, ptr %gep.src, align 1
+  %gep.dst = getelementptr inbounds i8, ptr %q, i64 %i
+  store i8 %l, ptr %gep.dst, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %for.body, !prof !6
+
+for.end:
+  ret void
+}
+
 ; For a loop with a profile-guided estimated TC of 129, when the auto-vectorizer chooses VF 16, 
 ; it should choose conservatively IC 4 so that the vector loop runs twice at least
 ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave_count_for_known_tc.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -tiny-trip-count-interleave-threshold=16 -force-target-max-vector-interleave=8 -p loop-vectorize -S -pass-remarks=loop-vectorize -disable-output 2>&1 | FileCheck %s
+; RUN: opt < %s -tiny-trip-count-interleave-threshold=16 -force-target-max-vector-interleave=8 -p loop-vectorize -S 2>&1 | FileCheck %s -check-prefix=CHECK-IR
 ; TODO: remove -tiny-trip-count-interleave-threshold once the interleave threshold is removed
 
 target triple = "aarch64-linux-gnu"
@@ -29,6 +30,30 @@ for.end:
   ret void
 }
 
+; This has the same trip count as loop_with_tc_32 but since the resulting interleaved group 
+; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for 
+; correctness, making at most 31 iterations available for interleaving.
+; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar remainder
+; than IC 2
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
+define void @loop_with_tc_32_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %gep.src = getelementptr inbounds [3 x i8], ptr %p, i64 %i, i64 0
+  %l = load i8, ptr %gep.src, align 1
+  %gep.dst = getelementptr inbounds i8, ptr %q, i64 %i
+  store i8 %l, ptr %gep.dst, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, 32
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
 ; For this loop with known TC of 33, when the auto-vectorizer chooses VF 16, it should choose
 ; IC 2 since there is a small remainder loop TC that needs to run after the vector loop.
 ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
@@ -177,6 +202,10 @@ for.end:
 ; IC 8 since there is no remainder loop run needed after the vector loop runs
 ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
 define void @loop_with_tc_128(ptr noalias %p, ptr noalias %q) {
+; CHECK-IR-LABEL: define void @loop_with_tc_128(
+; CHECK-IR-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]]) {
+; CHECK-IR-NEXT:  entry:
+; CHECK-IR-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 entry:
   br label %for.body
 
@@ -197,6 +226,35 @@ for.end:
   ret void
 }
 
+; This has the same trip count as loop_with_tc_128 but since the resulting interleaved group 
+; in this case may access memory out-of-bounds, it requires a scalar epilogue iteration for 
+; correctness, making at most 127 iterations available for interleaving.
+; TODO: Like loop_with_tc_128, the entry block should branch into the vector loop, instead of the scalar epilogue.
+; TODO: When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar 
+; remainder than IC 4
+; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
+define void @loop_with_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q) {
+; CHECK-IR-LABEL: define void @loop_with_tc_128_scalar_epilogue_reqd(
+; CHECK-IR-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]]) {
+; CHECK-IR-NEXT:  entry:
+; CHECK-IR-NEXT:    br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %gep.src = getelementptr inbounds [3 x i8], ptr %p, i64 %i, i64 0
+  %l = load i8, ptr %gep.src, align 1
+  %gep.dst = getelementptr inbounds i8, ptr %q, i64 %i
+  store i8 %l, ptr %gep.dst, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, 128
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
 ; For this loop with known TC of 129, when the auto-vectorizer chooses VF 16, it should choose
 ; IC 8 since there is a small remainder loop that needs to run after the vector loop
 ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)