Skip to content

Commit 6bcfdda

Browse files
Pierre-vh authored and David Salinas committed
(Reland) [AMDGPU][PromoteAlloca] Don't stop when an alloca is too big to promote (llvm#93466)
When I rewrote this, I made a mistake in the control flow. I thought we could just stop promoting if an alloca is too big to vectorize, but we can't. Other allocas in the list may be promotable and fit within the budget.

Fixes SWDEV-455343
Fixes SWDEV-464683

Change-Id: Iedeabc3ee1c91500da13503097d2029f2838c8ce
1 parent 93bcb84 commit 6bcfdda

File tree

2 files changed

+59
-14
lines changed

2 files changed

+59
-14
lines changed

llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

Lines changed: 18 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -333,22 +333,26 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
333333
bool Changed = false;
334334
for (AllocaInst *AI : Allocas) {
335335
const unsigned AllocaCost = DL->getTypeSizeInBits(AI->getAllocatedType());
336-
if (AllocaCost > VectorizationBudget) {
337-
LLVM_DEBUG(dbgs() << " Alloca too big for vectorization: " << *AI
338-
<< "\n");
339-
return Changed;
336+
// First, check if we have enough budget to vectorize this alloca.
337+
if (AllocaCost <= VectorizationBudget) {
338+
// If we do, attempt vectorization, otherwise, fall through and try
339+
// promoting to LDS instead.
340+
if (tryPromoteAllocaToVector(*AI)) {
341+
Changed = true;
342+
assert((VectorizationBudget - AllocaCost) < VectorizationBudget &&
343+
"Underflow!");
344+
VectorizationBudget -= AllocaCost;
345+
LLVM_DEBUG(dbgs() << " Remaining vectorization budget:"
346+
<< VectorizationBudget << "\n");
347+
continue;
348+
}
349+
} else {
350+
LLVM_DEBUG(dbgs() << "Alloca too big for vectorization (size:"
351+
<< AllocaCost << ", budget:" << VectorizationBudget
352+
<< "): " << *AI << "\n");
340353
}
341354

342-
if (tryPromoteAllocaToVector(*AI)) {
343-
Changed = true;
344-
assert((VectorizationBudget - AllocaCost) < VectorizationBudget &&
345-
"Underflow!");
346-
VectorizationBudget -= AllocaCost;
347-
LLVM_DEBUG(dbgs() << " Remaining vectorization budget:"
348-
<< VectorizationBudget << "\n");
349-
if (VectorizationBudget == 0)
350-
break;
351-
} else if (PromoteToLDS && tryPromoteAllocaToLDS(*AI, SufficientLDS))
355+
if (PromoteToLDS && tryPromoteAllocaToLDS(*AI, SufficientLDS))
352356
Changed = true;
353357
}
354358

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2+
; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -amdgpu-promote-alloca-to-vector-limit=128 -passes=amdgpu-promote-alloca-to-vector %s -o - | FileCheck %s
3+
4+
; Check that when we see an alloca that's too big to vectorize given the remaining budget,
5+
; we don't give up and we keep looking for other allocas to vectorize.
6+
7+
define amdgpu_kernel void @simple_users_scores() {
8+
; CHECK-LABEL: define amdgpu_kernel void @simple_users_scores(
9+
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
10+
; CHECK-NEXT: [[ENTRY:.*:]]
11+
; CHECK-NEXT: [[MANYUSERS:%.*]] = alloca [64 x i64], align 4, addrspace(5)
12+
; CHECK-NEXT: [[MANYUSERS_1:%.*]] = getelementptr i8, ptr addrspace(5) [[MANYUSERS]], i64 2
13+
; CHECK-NEXT: [[V0:%.*]] = load i8, ptr addrspace(5) [[MANYUSERS_1]], align 1
14+
; CHECK-NEXT: [[V0_EXT:%.*]] = zext i8 [[V0]] to i64
15+
; CHECK-NEXT: store i64 [[V0_EXT]], ptr addrspace(5) [[MANYUSERS_1]], align 8
16+
; CHECK-NEXT: [[MANYUSERS_2:%.*]] = getelementptr i8, ptr addrspace(5) [[MANYUSERS]], i64 1
17+
; CHECK-NEXT: [[V1:%.*]] = load i8, ptr addrspace(5) [[MANYUSERS_2]], align 1
18+
; CHECK-NEXT: [[V1_EXT:%.*]] = zext i8 [[V0]] to i64
19+
; CHECK-NEXT: store i64 [[V1_EXT]], ptr addrspace(5) [[MANYUSERS_2]], align 8
20+
; CHECK-NEXT: ret void
21+
;
22+
entry:
23+
; should get a score of 1
24+
%simpleuser = alloca [4 x i64], align 4, addrspace(5)
25+
; should get a score of 4 and be visited first.
26+
%manyusers = alloca [64 x i64], align 4, addrspace(5)
27+
28+
store i64 42, ptr addrspace(5) %simpleuser
29+
30+
%manyusers.1 = getelementptr i8, ptr addrspace(5) %manyusers, i64 2
31+
%v0 = load i8, ptr addrspace(5) %manyusers.1
32+
%v0.ext = zext i8 %v0 to i64
33+
store i64 %v0.ext, ptr addrspace(5) %manyusers.1
34+
35+
%manyusers.2 = getelementptr i8, ptr addrspace(5) %manyusers, i64 1
36+
%v1 = load i8, ptr addrspace(5) %manyusers.2
37+
%v1.ext = zext i8 %v0 to i64
38+
store i64 %v1.ext, ptr addrspace(5) %manyusers.2
39+
40+
ret void
41+
}

0 commit comments

Comments
 (0)