Skip to content

Commit ffd79b3

Browse files
authored
[LoopUnroll] Consider simplified operands while retrieving TTI instruction cost (#70929)
Get a more precise instruction cost in LoopUnroll by taking into account that some of the instruction's operands can be simplified; e.g., the induction variable will be replaced by a constant after full unrolling.
1 parent f89fe08 commit ffd79b3

File tree

3 files changed

+59
-8
lines changed

3 files changed

+59
-8
lines changed

llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -450,7 +450,15 @@ static std::optional<EstimatedUnrollCost> analyzeLoopUnrollCost(
450450

451451
// First accumulate the cost of this instruction.
452452
if (!Cost.IsFree) {
453-
UnrolledCost += TTI.getInstructionCost(I, CostKind);
453+
// Consider simplified operands in instruction cost.
454+
SmallVector<Value *, 4> Operands;
455+
transform(I->operands(), std::back_inserter(Operands),
456+
[&](Value *Op) {
457+
if (auto Res = SimplifiedValues.lookup(Op))
458+
return Res;
459+
return Op;
460+
});
461+
UnrolledCost += TTI.getInstructionCost(I, Operands, CostKind);
454462
LLVM_DEBUG(dbgs() << "Adding cost of instruction (iteration "
455463
<< Iteration << "): ");
456464
LLVM_DEBUG(I->dump());

llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-cost-addrspacecast.ll

Lines changed: 15 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,6 @@
1-
; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii -passes=loop-unroll -unroll-threshold=49 -unroll-peel-count=0 -unroll-allow-partial=false -unroll-max-iteration-count-to-analyze=16 < %s | FileCheck %s
1+
; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii -passes=loop-unroll -unroll-threshold=57 -unroll-peel-count=0 -unroll-allow-partial=false -unroll-max-iteration-count-to-analyze=16 < %s | FileCheck %s
2+
3+
@indices = external global [16 x i32]
24

35
; CHECK-LABEL: @test_func_addrspacecast_cost_noop(
46
; CHECK-NOT: br i1
@@ -9,8 +11,10 @@ entry:
911
for.body:
1012
%indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
1113
%sum.02 = phi float [ %fmul, %for.body ], [ 0.0, %entry ]
12-
%arrayidx.in = getelementptr inbounds float, ptr addrspace(1) %in, i32 %indvars.iv
13-
%arrayidx.out = getelementptr inbounds float, ptr addrspace(1) %out, i32 %indvars.iv
14+
%idx.ptr = getelementptr inbounds [16 x i32], ptr @indices, i32 0, i32 %indvars.iv
15+
%index = load i32, ptr %idx.ptr
16+
%arrayidx.in = getelementptr inbounds float, ptr addrspace(1) %in, i32 %index
17+
%arrayidx.out = getelementptr inbounds float, ptr addrspace(1) %out, i32 %index
1418
%cast.in = addrspacecast ptr addrspace(1) %arrayidx.in to ptr
1519
%cast.out = addrspacecast ptr addrspace(1) %arrayidx.out to ptr
1620
%load = load float, ptr %cast.in
@@ -34,8 +38,10 @@ entry:
3438
for.body:
3539
%indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
3640
%sum.02 = phi float [ %fmul, %for.body ], [ 0.0, %entry ]
37-
%arrayidx.in = getelementptr inbounds float, ptr %in, i32 %indvars.iv
38-
%arrayidx.out = getelementptr inbounds float, ptr %out, i32 %indvars.iv
41+
%idx.ptr = getelementptr inbounds [16 x i32], ptr @indices, i32 0, i32 %indvars.iv
42+
%index = load i32, ptr %idx.ptr
43+
%arrayidx.in = getelementptr inbounds float, ptr %in, i32 %index
44+
%arrayidx.out = getelementptr inbounds float, ptr %out, i32 %index
3945
%cast.in = addrspacecast ptr %arrayidx.in to ptr addrspace(3)
4046
%cast.out = addrspacecast ptr %arrayidx.out to ptr addrspace(3)
4147
%load = load float, ptr addrspace(3) %cast.in
@@ -58,8 +64,10 @@ entry:
5864
for.body:
5965
%indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
6066
%sum.02 = phi float [ %fmul, %for.body ], [ 0.0, %entry ]
61-
%arrayidx.in = getelementptr inbounds float, ptr addrspace(3) %in, i32 %indvars.iv
62-
%arrayidx.out = getelementptr inbounds float, ptr addrspace(3) %out, i32 %indvars.iv
67+
%idx.ptr = getelementptr inbounds [16 x i32], ptr @indices, i32 0, i32 %indvars.iv
68+
%index = load i32, ptr %idx.ptr
69+
%arrayidx.in = getelementptr inbounds float, ptr addrspace(3) %in, i32 %index
70+
%arrayidx.out = getelementptr inbounds float, ptr addrspace(3) %out, i32 %index
6371
%cast.in = addrspacecast ptr addrspace(3) %arrayidx.in to ptr
6472
%cast.out = addrspacecast ptr addrspace(3) %arrayidx.out to ptr
6573
%load = load float, ptr %cast.in
Lines changed: 35 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,35 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
2+
; RUN: opt < %s -S -mtriple=riscv64 -passes=loop-unroll | FileCheck %s
3+
4+
; Function Attrs: optsize
5+
define void @foo(ptr %array, i32 %x) #0 {
6+
; CHECK-LABEL: define void @foo
7+
; CHECK-SAME: (ptr [[ARRAY:%.*]], i32 [[X:%.*]]) #[[ATTR0:[0-9]+]] {
8+
; CHECK-NEXT: entry:
9+
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
10+
; CHECK: for.body:
11+
; CHECK-NEXT: store i32 [[X]], ptr [[ARRAY]], align 4
12+
; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, ptr [[ARRAY]], i64 1
13+
; CHECK-NEXT: store i32 [[X]], ptr [[ARRAYIDX_1]], align 4
14+
; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, ptr [[ARRAY]], i64 2
15+
; CHECK-NEXT: store i32 [[X]], ptr [[ARRAYIDX_2]], align 4
16+
; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, ptr [[ARRAY]], i64 3
17+
; CHECK-NEXT: store i32 [[X]], ptr [[ARRAYIDX_3]], align 4
18+
; CHECK-NEXT: ret void
19+
;
20+
entry:
21+
br label %for.body
22+
23+
for.body:
24+
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
25+
%arrayidx = getelementptr inbounds i32, ptr %array, i64 %indvars.iv
26+
store i32 %x, ptr %arrayidx, align 4
27+
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
28+
%exitcond.not = icmp eq i64 %indvars.iv.next, 4
29+
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
30+
31+
for.cond.cleanup:
32+
ret void
33+
}
34+
35+
attributes #0 = { optsize }

0 commit comments

Comments
 (0)