Skip to content

Commit d3e4702

Browse files
author
Rin
authored
[AArch64] [LoopVectorize] Use either fixed-width or scalable VF when tail-folding (#67543)
Since the getMaximizedVFForTarget function is called twice, once for fixed-width and once for scalable vectors, it adds no value to always return a fixed-width VF when clamping to a constant trip count. Instead, when we are tail-folding, the clamped VF can be either fixed-width or scalable, matching the scalability of the maximum vector element count.
1 parent 46aac94 commit d3e4702

File tree

3 files changed

+64
-17
lines changed

3 files changed

+64
-17
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5124,7 +5124,9 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
51245124
LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
51255125
"exceeding the constant trip count: "
51265126
<< ClampedConstTripCount << "\n");
5127-
return ElementCount::getFixed(ClampedConstTripCount);
5127+
return ElementCount::get(
5128+
ClampedConstTripCount,
5129+
FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
51285130
}
51295131

51305132
TargetTransformInfo::RegisterKind RegKind =
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
; RUN: opt -S < %s -passes=loop-vectorize -mtriple aarch64-linux-gnu -mattr=+sve 2>&1 | FileCheck %s
2+
3+
define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val){
4+
; CHECK-LABEL: define void @clamped_tc_8
5+
; CHECK: call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> {{.*}}, ptr {{.*}}, i32 1, <vscale x 8 x i1> {{.*}})
6+
entry:
7+
br label %for.body
8+
9+
for.body: ; preds = %entry, %for.body
10+
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
11+
%p_out_tail.09 = phi ptr [ %dst, %entry ], [ %incdec.ptr, %for.body ]
12+
%0 = shl nuw nsw i64 %indvars.iv, 3
13+
%shr3 = lshr i64 %val, %0
14+
%conv4 = trunc i64 %shr3 to i8
15+
store i8 %conv4, ptr %p_out_tail.09, align 1
16+
%incdec.ptr = getelementptr inbounds i8, ptr %p_out_tail.09, i64 1
17+
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
18+
%exitcond.not = icmp eq i64 %indvars.iv.next, 8
19+
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
20+
21+
for.cond.cleanup: ; preds = %for.body
22+
ret void
23+
}

llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll

Lines changed: 38 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -6,21 +6,32 @@ define void @small_trip_count_min_vlen_128(ptr nocapture %a) nounwind vscale_ran
66
; CHECK-NEXT: entry:
77
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
88
; CHECK: vector.ph:
9+
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
10+
; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 4
11+
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
12+
; CHECK-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 4
13+
; CHECK-NEXT: [[TMP4:%.*]] = sub i32 [[TMP3]], 1
14+
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 4, [[TMP4]]
15+
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]]
16+
; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
917
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
1018
; CHECK: vector.body:
1119
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
12-
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
13-
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP0]]
14-
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
15-
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
16-
; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
17-
; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr [[TMP2]], align 4
18-
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
20+
; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 0
21+
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 [[TMP5]], i32 4)
22+
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP5]]
23+
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
24+
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP7]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
25+
; CHECK-NEXT: [[TMP8:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
26+
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP8]], ptr [[TMP7]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
27+
; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
28+
; CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 4
29+
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP10]]
1930
; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
2031
; CHECK: middle.block:
2132
; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
2233
; CHECK: scalar.ph:
23-
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
34+
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
2435
; CHECK-NEXT: br label [[LOOP:%.*]]
2536
; CHECK: loop:
2637
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -56,21 +67,32 @@ define void @small_trip_count_min_vlen_32(ptr nocapture %a) nounwind vscale_rang
5667
; CHECK-NEXT: entry:
5768
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
5869
; CHECK: vector.ph:
70+
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
71+
; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 4
72+
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
73+
; CHECK-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 4
74+
; CHECK-NEXT: [[TMP4:%.*]] = sub i32 [[TMP3]], 1
75+
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 4, [[TMP4]]
76+
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]]
77+
; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
5978
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
6079
; CHECK: vector.body:
6180
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
62-
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
63-
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP0]]
64-
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
65-
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
66-
; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
67-
; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr [[TMP2]], align 4
68-
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
81+
; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 0
82+
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 [[TMP5]], i32 4)
83+
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP5]]
84+
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
85+
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP7]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
86+
; CHECK-NEXT: [[TMP8:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
87+
; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP8]], ptr [[TMP7]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
88+
; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
89+
; CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 4
90+
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP10]]
6991
; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
7092
; CHECK: middle.block:
7193
; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
7294
; CHECK: scalar.ph:
73-
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
95+
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
7496
; CHECK-NEXT: br label [[LOOP:%.*]]
7597
; CHECK: loop:
7698
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]

0 commit comments

Comments
 (0)