Skip to content

Commit 009368f

Browse files
authored
AMDGPU: Mark grid size loads with range metadata (#113019)
Only handles the code object v5 (and above) case, where the grid size is loaded through the implicit argument pointer.
1 parent 6168739 commit 009368f

File tree

3 files changed

+154
-4
lines changed

3 files changed

+154
-4
lines changed

llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include "llvm/IR/InstIterator.h"
2323
#include "llvm/IR/Instructions.h"
2424
#include "llvm/IR/IntrinsicsAMDGPU.h"
25+
#include "llvm/IR/MDBuilder.h"
2526
#include "llvm/IR/PatternMatch.h"
2627
#include "llvm/Pass.h"
2728

@@ -82,6 +83,20 @@ Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) {
8283

8384
} // end anonymous namespace
8485

86+
/// Attach !range metadata to \p Load, a load of one dimension of the grid
/// size, constraining it to the half-open interval [1, MaxNumGroups + 1).
///
/// Nothing is changed when:
///  - \p MaxNumGroups is 0 (presumably "no limit given" -- TODO confirm
///    against the attribute documentation),
///  - \p MaxNumGroups is UINT32_MAX, because the exclusive upper bound
///    MaxNumGroups + 1 would wrap around to 0 in 32 bits, or
///  - the loaded value is not an i32.
///
/// Any range metadata already present on the load is overwritten.
static void annotateGridSizeLoadWithRangeMD(LoadInst *Load,
                                            uint32_t MaxNumGroups) {
  const bool HasUsableBound =
      MaxNumGroups != 0 &&
      MaxNumGroups != std::numeric_limits<uint32_t>::max();
  if (!HasUsableBound || !Load->getType()->isIntegerTy(32))
    return;

  // TODO: If there is existing range metadata, preserve it if it is stricter.
  const APInt Lower(32, 1);
  const APInt UpperExclusive(32, MaxNumGroups + 1);
  LLVMContext &Ctx = Load->getContext();
  Load->setMetadata(LLVMContext::MD_range,
                    MDBuilder(Ctx).createRange(Lower, UpperExclusive));
}
99+
85100
static bool processUse(CallInst *CI, bool IsV5OrAbove) {
86101
Function *F = CI->getParent()->getParent();
87102

@@ -91,7 +106,11 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) {
91106
const bool HasUniformWorkGroupSize =
92107
F->getFnAttribute("uniform-work-group-size").getValueAsBool();
93108

94-
if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize)
109+
SmallVector<unsigned> MaxNumWorkgroups =
110+
AMDGPU::getIntegerVecAttribute(*F, "amdgpu-max-num-workgroups", 3);
111+
112+
if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize &&
113+
none_of(MaxNumWorkgroups, [](unsigned X) { return X != 0; }))
95114
return false;
96115

97116
Value *BlockCounts[3] = {nullptr, nullptr, nullptr};
@@ -132,16 +151,22 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) {
132151
if (IsV5OrAbove) { // Base is ImplicitArgPtr.
133152
switch (Offset) {
134153
case HIDDEN_BLOCK_COUNT_X:
135-
if (LoadSize == 4)
154+
if (LoadSize == 4) {
136155
BlockCounts[0] = Load;
156+
annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[0]);
157+
}
137158
break;
138159
case HIDDEN_BLOCK_COUNT_Y:
139-
if (LoadSize == 4)
160+
if (LoadSize == 4) {
140161
BlockCounts[1] = Load;
162+
annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[1]);
163+
}
141164
break;
142165
case HIDDEN_BLOCK_COUNT_Z:
143-
if (LoadSize == 4)
166+
if (LoadSize == 4) {
144167
BlockCounts[2] = Load;
168+
annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[2]);
169+
}
145170
break;
146171
case HIDDEN_GROUP_SIZE_X:
147172
if (LoadSize == 2)

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -366,6 +366,7 @@ const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Funct
366366
TM.getSubtarget<R600Subtarget>(F));
367367
}
368368

369+
// FIXME: This has no reason to be in subtarget
369370
SmallVector<unsigned>
370371
AMDGPUSubtarget::getMaxNumWorkGroups(const Function &F) const {
371372
return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-workgroups", 3,
Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5
2+
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-lower-kernel-attributes %s | FileCheck %s
3+
4+
; Loads an i32 directly from the implicit argument pointer (offset 0, the X
; grid size). Attribute group #0 sets "amdgpu-max-num-workgroups"="36,42,89",
; so the checks expect the load to gain !range !{i32 1, i32 37}.
define i32 @use_grid_size_x_max_num_workgroups() #0 {
5+
; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups(
6+
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
7+
; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
8+
; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4, !range [[RNG0:![0-9]+]]
9+
; CHECK-NEXT: ret i32 [[GRID_SIZE_X]]
10+
;
11+
%implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
12+
%grid.size.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4
13+
ret i32 %grid.size.x
14+
}
15+
16+
; Same X grid size load as above, but the input load already carries
; !range !0 (!{i32 0, i32 -1}). The checks expect that metadata to be replaced
; by the attribute-derived range !{i32 1, i32 37}.
define i32 @use_grid_size_x_max_num_workgroups_existing_nonzero_range() #0 {
17+
; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups_existing_nonzero_range(
18+
; CHECK-SAME: ) #[[ATTR0]] {
19+
; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
20+
; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4, !range [[RNG0]]
21+
; CHECK-NEXT: ret i32 [[GRID_SIZE_X]]
22+
;
23+
%implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
24+
%grid.size.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4, !range !0
25+
ret i32 %grid.size.x
26+
}
27+
28+
; Loads an i32 at byte offset 4 from the implicit argument pointer (the Y grid
; size). With "amdgpu-max-num-workgroups"="36,42,89" the checks expect
; !range !{i32 1, i32 43} on the load.
define i32 @use_grid_size_y_max_num_workgroups() #0 {
29+
; CHECK-LABEL: define i32 @use_grid_size_y_max_num_workgroups(
30+
; CHECK-SAME: ) #[[ATTR0]] {
31+
; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
32+
; CHECK-NEXT: [[GEP_GRID_SIZE_Y:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 4
33+
; CHECK-NEXT: [[GRID_SIZE_Y:%.*]] = load i32, ptr addrspace(4) [[GEP_GRID_SIZE_Y]], align 4, !range [[RNG1:![0-9]+]]
34+
; CHECK-NEXT: ret i32 [[GRID_SIZE_Y]]
35+
;
36+
%implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
37+
%gep.grid.size.y = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 4
38+
%grid.size.y = load i32, ptr addrspace(4) %gep.grid.size.y, align 4
39+
ret i32 %grid.size.y
40+
}
41+
42+
; Loads an i32 at byte offset 8 from the implicit argument pointer (the Z grid
; size). With "amdgpu-max-num-workgroups"="36,42,89" the checks expect
; !range !{i32 1, i32 90} on the load.
define i32 @use_grid_size_z_max_num_workgroups() #0 {
43+
; CHECK-LABEL: define i32 @use_grid_size_z_max_num_workgroups(
44+
; CHECK-SAME: ) #[[ATTR0]] {
45+
; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
46+
; CHECK-NEXT: [[GEP_GRID_SIZE_Z:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 8
47+
; CHECK-NEXT: [[GRID_SIZE_Z:%.*]] = load i32, ptr addrspace(4) [[GEP_GRID_SIZE_Z]], align 4, !range [[RNG2:![0-9]+]]
48+
; CHECK-NEXT: ret i32 [[GRID_SIZE_Z]]
49+
;
50+
%implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
51+
%gep.grid.size.z = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 8
52+
%grid.size.z = load i32, ptr addrspace(4) %gep.grid.size.z, align 4
53+
ret i32 %grid.size.z
54+
}
55+
56+
; Negative test: the X grid size slot is loaded as <2 x i16> instead of i32.
; The checks expect the load to be left without any !range metadata.
define <2 x i16> @use_grid_size_x_max_num_workgroups_load_wrong_type() #0 {
57+
; CHECK-LABEL: define <2 x i16> @use_grid_size_x_max_num_workgroups_load_wrong_type(
58+
; CHECK-SAME: ) #[[ATTR0]] {
59+
; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
60+
; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load <2 x i16>, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4
61+
; CHECK-NEXT: ret <2 x i16> [[GRID_SIZE_X]]
62+
;
63+
%implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
64+
%grid.size.x = load <2 x i16>, ptr addrspace(4) %implicitarg.ptr, align 4
65+
ret <2 x i16> %grid.size.x
66+
}
67+
68+
; Boundary test: attribute group #1 sets the X limit to 4294967294
; (UINT32_MAX - 1), the largest value that still yields a range. The checks
; expect !range !{i32 1, i32 -1} on the load.
define i32 @use_grid_size_x_max_num_workgroups_max_minus_1() #1 {
69+
; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups_max_minus_1(
70+
; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
71+
; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
72+
; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4, !range [[RNG3:![0-9]+]]
73+
; CHECK-NEXT: ret i32 [[GRID_SIZE_X]]
74+
;
75+
%implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
76+
%grid.size.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4
77+
ret i32 %grid.size.x
78+
}
79+
80+
; Negative test: attribute group #2 sets the X limit to 4294967295
; (UINT32_MAX). The checks expect the load to be left without !range metadata.
define i32 @use_grid_size_x_max_num_workgroups_max() #2 {
81+
; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups_max(
82+
; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
83+
; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
84+
; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4
85+
; CHECK-NEXT: ret i32 [[GRID_SIZE_X]]
86+
;
87+
%implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
88+
%grid.size.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4
89+
ret i32 %grid.size.x
90+
}
91+
92+
; Negative test: attribute group #3 sets the X limit to 0 (while Y and Z are
; non-zero). The checks expect the X load to be left without !range metadata.
define i32 @use_grid_size_x_max_num_workgroups_zero() #3 {
93+
; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups_zero(
94+
; CHECK-SAME: ) #[[ATTR3:[0-9]+]] {
95+
; CHECK-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
96+
; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG_PTR]], align 4
97+
; CHECK-NEXT: ret i32 [[GRID_SIZE_X]]
98+
;
99+
%implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
100+
%grid.size.x = load i32, ptr addrspace(4) %implicitarg.ptr, align 4
101+
ret i32 %grid.size.x
102+
}
103+
104+
declare noundef align 4 ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #3
105+
106+
attributes #0 = { "amdgpu-max-num-workgroups"="36,42,89" }
107+
attributes #1 = { "amdgpu-max-num-workgroups"="4294967294,42,89" }
108+
attributes #2 = { "amdgpu-max-num-workgroups"="4294967295,42,89" }
109+
attributes #3 = { "amdgpu-max-num-workgroups"="0,42,89" }
110+
111+
!0 = !{i32 0, i32 -1}
112+
113+
;.
114+
; CHECK: attributes #[[ATTR0]] = { "amdgpu-max-num-workgroups"="36,42,89" }
115+
; CHECK: attributes #[[ATTR1]] = { "amdgpu-max-num-workgroups"="4294967294,42,89" }
116+
; CHECK: attributes #[[ATTR2]] = { "amdgpu-max-num-workgroups"="4294967295,42,89" }
117+
; CHECK: attributes #[[ATTR3]] = { "amdgpu-max-num-workgroups"="0,42,89" }
118+
; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
119+
;.
120+
; CHECK: [[RNG0]] = !{i32 1, i32 37}
121+
; CHECK: [[RNG1]] = !{i32 1, i32 43}
122+
; CHECK: [[RNG2]] = !{i32 1, i32 90}
123+
; CHECK: [[RNG3]] = !{i32 1, i32 -1}
124+
;.

0 commit comments

Comments
 (0)