Skip to content

Commit 37da5a1

Browse files
authored
[NVPTX] Add ranges to intrinsic definitions, cleanup NVVMIntrRange (#138338)
Pull the global intrinsic ranges out of NVVMIntrRange and into the intrinsic table-gen definitions. Also improve range inference for cluster SReg intrinsics.
1 parent 0bd065d commit 37da5a1

File tree

6 files changed

+296
-100
lines changed

6 files changed

+296
-100
lines changed

llvm/include/llvm/IR/IntrinsicsNVVM.td

Lines changed: 80 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,19 @@ def llvm_shared_cluster_ptr_ty : LLVMQualPointerType<7>; // (shared_cluster)ptr
139139
// MISC
140140
//
141141

142+
defvar WARP_SIZE = 32;
143+
144+
// Note: the maximum grid size in the x-dimension is the lower value of 65535
145+
// on sm_20. We conservatively use the larger value here as it is required for
146+
// sm_30+ and also correct for sm_20.
147+
defvar MAX_GRID_SIZE_X = 0x7fffffff;
148+
defvar MAX_GRID_SIZE_Y = 0xffff;
149+
defvar MAX_GRID_SIZE_Z = 0xffff;
150+
151+
defvar MAX_BLOCK_SIZE_X = 1024;
152+
defvar MAX_BLOCK_SIZE_Y = 1024;
153+
defvar MAX_BLOCK_SIZE_Z = 64;
154+
142155
// Helper class that concatenates list elements with
143156
// a given separator 'sep' and returns the result.
144157
// Handles empty strings.
@@ -4747,26 +4760,35 @@ def int_nvvm_sust_p_3d_v4i32_trap
47474760

47484761
// Accessing special registers.
47494762

4750-
class PTXReadSRegIntrinsicNB_r32
4751-
: DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable, NoUndef<RetIndex>]>;
4752-
class PTXReadSRegIntrinsic_r32<string name>
4753-
: PTXReadSRegIntrinsicNB_r32, ClangBuiltin<"__nvvm_read_ptx_sreg_" # name>;
4763+
class PTXReadSRegIntrinsicNB_r32<list<IntrinsicProperty> properties = []>
4764+
: DefaultAttrsIntrinsic<[llvm_i32_ty], [],
4765+
!listconcat([IntrNoMem, IntrSpeculatable, NoUndef<RetIndex>], properties)>;
47544766

4755-
multiclass PTXReadSRegIntrinsic_v4i32<string regname> {
4767+
class PTXReadSRegIntrinsic_r32<string name,
4768+
list<IntrinsicProperty> properties = []>
4769+
: PTXReadSRegIntrinsicNB_r32<properties>,
4770+
ClangBuiltin<"__nvvm_read_ptx_sreg_" # name>;
4771+
4772+
multiclass PTXReadSRegIntrinsic_v4i32<string regname,
4773+
list<list<IntrinsicProperty>> properties = [[], [], [], []]> {
4774+
assert !eq(!size(properties), 4), "properties must be a list of 4 lists";
47564775
// FIXME: Do we need the 128-bit integer type version?
47574776
// def _r64 : Intrinsic<[llvm_i128_ty], [], [IntrNoMem, IntrSpeculatable]>;
47584777

47594778
// FIXME: Enable this once v4i32 support is enabled in back-end.
47604779
// def _v4i16 : Intrinsic<[llvm_v4i32_ty], [], [IntrNoMem, IntrSpeculatable]>;
4761-
foreach suffix = ["_x", "_y", "_z", "_w"] in
4762-
def suffix : PTXReadSRegIntrinsic_r32<regname # suffix>;
4780+
defvar suffixes = ["_x", "_y", "_z", "_w"];
4781+
foreach i = !range(suffixes) in
4782+
def suffixes[i] : PTXReadSRegIntrinsic_r32<regname # suffixes[i], properties[i]>;
47634783
}
47644784

47654785
// Same, but without automatic clang builtins. It will be used for
47664786
// registers that require a particular GPU or PTX version.
4767-
multiclass PTXReadSRegIntrinsicNB_v4i32 {
4768-
foreach suffix = ["_x", "_y", "_z", "_w"] in
4769-
def suffix : PTXReadSRegIntrinsicNB_r32;
4787+
multiclass PTXReadSRegIntrinsicNB_v4i32<list<list<IntrinsicProperty>> properties = [[], [], [], []]> {
4788+
assert !eq(!size(properties), 4), "properties must be a list of 4 lists";
4789+
defvar suffixes = ["_x", "_y", "_z", "_w"];
4790+
foreach i = !range(suffixes) in
4791+
def suffixes[i] : PTXReadSRegIntrinsicNB_r32<properties[i]>;
47704792
}
47714793

47724794
class PTXReadSRegIntrinsic_r64<string name>
@@ -4782,15 +4804,41 @@ class PTXReadNCSRegIntrinsic_r64<string name>
47824804
: Intrinsic<[llvm_i64_ty], [], [IntrInaccessibleMemOnly, IntrNoCallback, NoUndef<RetIndex>]>,
47834805
ClangBuiltin<"__nvvm_read_ptx_sreg_" # name>;
47844806

4785-
defm int_nvvm_read_ptx_sreg_tid : PTXReadSRegIntrinsic_v4i32<"tid">;
4786-
defm int_nvvm_read_ptx_sreg_ntid : PTXReadSRegIntrinsic_v4i32<"ntid">;
4807+
defm int_nvvm_read_ptx_sreg_tid
4808+
: PTXReadSRegIntrinsic_v4i32<"tid",
4809+
[[Range<RetIndex, 0, MAX_BLOCK_SIZE_X>],
4810+
[Range<RetIndex, 0, MAX_BLOCK_SIZE_Y>],
4811+
[Range<RetIndex, 0, MAX_BLOCK_SIZE_Z>],
4812+
[Range<RetIndex, 0, 1>]]>;
4813+
4814+
defm int_nvvm_read_ptx_sreg_ntid
4815+
: PTXReadSRegIntrinsic_v4i32<"ntid",
4816+
[[Range<RetIndex, 1, !add(MAX_BLOCK_SIZE_X, 1)>],
4817+
[Range<RetIndex, 1, !add(MAX_BLOCK_SIZE_Y, 1)>],
4818+
[Range<RetIndex, 1, !add(MAX_BLOCK_SIZE_Z, 1)>],
4819+
[Range<RetIndex, 0, 1>]]>;
4820+
4821+
def int_nvvm_read_ptx_sreg_laneid
4822+
: PTXReadSRegIntrinsic_r32<"laneid", [Range<RetIndex, 0, WARP_SIZE>]>;
47874823

4788-
def int_nvvm_read_ptx_sreg_laneid : PTXReadSRegIntrinsic_r32<"laneid">;
47894824
def int_nvvm_read_ptx_sreg_warpid : PTXReadSRegIntrinsic_r32<"warpid">;
47904825
def int_nvvm_read_ptx_sreg_nwarpid : PTXReadSRegIntrinsic_r32<"nwarpid">;
47914826

4792-
defm int_nvvm_read_ptx_sreg_ctaid : PTXReadSRegIntrinsic_v4i32<"ctaid">;
4793-
defm int_nvvm_read_ptx_sreg_nctaid : PTXReadSRegIntrinsic_v4i32<"nctaid">;
4827+
defvar MAX_GRID_ID_RANGE = [[Range<RetIndex, 0, MAX_GRID_SIZE_X>],
4828+
[Range<RetIndex, 0, MAX_GRID_SIZE_Y>],
4829+
[Range<RetIndex, 0, MAX_GRID_SIZE_Z>],
4830+
[Range<RetIndex, 0, 1>]];
4831+
4832+
defvar MAX_GRID_NID_RANGE = [[Range<RetIndex, 1, !add(MAX_GRID_SIZE_X, 1)>],
4833+
[Range<RetIndex, 1, !add(MAX_GRID_SIZE_Y, 1)>],
4834+
[Range<RetIndex, 1, !add(MAX_GRID_SIZE_Z, 1)>],
4835+
[Range<RetIndex, 0, 1>]];
4836+
4837+
defm int_nvvm_read_ptx_sreg_ctaid
4838+
: PTXReadSRegIntrinsic_v4i32<"ctaid", MAX_GRID_ID_RANGE>;
4839+
4840+
defm int_nvvm_read_ptx_sreg_nctaid
4841+
: PTXReadSRegIntrinsic_v4i32<"nctaid", MAX_GRID_NID_RANGE>;
47944842

47954843
def int_nvvm_read_ptx_sreg_smid : PTXReadSRegIntrinsic_r32<"smid">;
47964844
def int_nvvm_read_ptx_sreg_nsmid : PTXReadSRegIntrinsic_r32<"nsmid">;
@@ -4817,13 +4865,25 @@ def int_nvvm_read_ptx_sreg_pm1 : PTXReadNCSRegIntrinsic_r32<"pm1">;
48174865
def int_nvvm_read_ptx_sreg_pm2 : PTXReadNCSRegIntrinsic_r32<"pm2">;
48184866
def int_nvvm_read_ptx_sreg_pm3 : PTXReadNCSRegIntrinsic_r32<"pm3">;
48194867

4820-
def int_nvvm_read_ptx_sreg_warpsize : PTXReadSRegIntrinsic_r32<"warpsize">;
4868+
def int_nvvm_read_ptx_sreg_warpsize
4869+
: PTXReadSRegIntrinsic_r32<"warpsize",
4870+
[Range<RetIndex, WARP_SIZE, !add(WARP_SIZE, 1)>]>;
48214871

48224872
// sm90+, PTX7.8+
4823-
defm int_nvvm_read_ptx_sreg_clusterid : PTXReadSRegIntrinsicNB_v4i32;
4824-
defm int_nvvm_read_ptx_sreg_nclusterid : PTXReadSRegIntrinsicNB_v4i32;
4825-
defm int_nvvm_read_ptx_sreg_cluster_ctaid : PTXReadSRegIntrinsicNB_v4i32;
4826-
defm int_nvvm_read_ptx_sreg_cluster_nctaid : PTXReadSRegIntrinsicNB_v4i32;
4873+
4874+
// Note: Since clusters are subdivisions of the grid, we conservatively use the
4875+
// maximum grid size as an upper bound for the clusterid and cluster_ctaid. In
4876+
// practice, the clusterid will likely be much smaller. The CUDA programming
4877+
// guide recommends 8 as a maximum portable value and H100s support 16.
4878+
4879+
defm int_nvvm_read_ptx_sreg_clusterid
4880+
: PTXReadSRegIntrinsicNB_v4i32<MAX_GRID_ID_RANGE>;
4881+
defm int_nvvm_read_ptx_sreg_nclusterid
4882+
: PTXReadSRegIntrinsicNB_v4i32<MAX_GRID_NID_RANGE>;
4883+
defm int_nvvm_read_ptx_sreg_cluster_ctaid
4884+
: PTXReadSRegIntrinsicNB_v4i32<MAX_GRID_ID_RANGE>;
4885+
defm int_nvvm_read_ptx_sreg_cluster_nctaid
4886+
: PTXReadSRegIntrinsicNB_v4i32<MAX_GRID_NID_RANGE>;
48274887

48284888
def int_nvvm_read_ptx_sreg_cluster_ctarank : PTXReadSRegIntrinsicNB_r32;
48294889
def int_nvvm_read_ptx_sreg_cluster_nctarank : PTXReadSRegIntrinsicNB_r32;

llvm/lib/Target/NVPTX/NVPTXUtilities.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -330,6 +330,16 @@ std::optional<uint64_t> getOverallReqNTID(const Function &F) {
330330
return getVectorProduct(ReqNTID);
331331
}
332332

333+
std::optional<uint64_t> getOverallClusterRank(const Function &F) {
334+
// maxclusterrank and cluster_dim are mutually exclusive, so when getMaxClusterRank(F) yields a value it is returned as-is.
335+
if (const auto ClusterRank = getMaxClusterRank(F))
336+
return ClusterRank;
337+
338+
// Note: The semantics here are a bit strange. See getMaxNTID. Fallback: the result of getVectorProduct over getClusterDim(F) (presumably the product of the cluster_dim components; TODO confirm against getVectorProduct).
339+
const auto ClusterDim = getClusterDim(F);
340+
return getVectorProduct(ClusterDim);
341+
}
342+
333343
std::optional<unsigned> getMaxClusterRank(const Function &F) {
334344
return getFnAttrParsedInt(F, "nvvm.maxclusterrank");
335345
}

llvm/lib/Target/NVPTX/NVPTXUtilities.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ SmallVector<unsigned, 3> getClusterDim(const Function &);
5454

5555
std::optional<uint64_t> getOverallMaxNTID(const Function &);
5656
std::optional<uint64_t> getOverallReqNTID(const Function &);
57+
std::optional<uint64_t> getOverallClusterRank(const Function &);
5758

5859
std::optional<unsigned> getMaxClusterRank(const Function &);
5960
std::optional<unsigned> getMinCTASm(const Function &);

llvm/lib/Target/NVPTX/NVVMIntrRange.cpp

Lines changed: 64 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -58,87 +58,89 @@ static bool addRangeAttr(uint64_t Low, uint64_t High, IntrinsicInst *II) {
5858
}
5959

6060
static bool runNVVMIntrRange(Function &F) {
61-
struct {
62-
unsigned x, y, z;
63-
} MaxBlockSize, MaxGridSize;
61+
struct Vector3 {
62+
unsigned X, Y, Z;
63+
};
6464

65-
const unsigned MetadataNTID = getOverallReqNTID(F).value_or(
66-
getOverallMaxNTID(F).value_or(std::numeric_limits<unsigned>::max()));
65+
// All these annotations are only valid for kernel functions.
66+
if (!isKernelFunction(F))
67+
return false;
6768

68-
MaxBlockSize.x = std::min(1024u, MetadataNTID);
69-
MaxBlockSize.y = std::min(1024u, MetadataNTID);
70-
MaxBlockSize.z = std::min(64u, MetadataNTID);
69+
const auto OverallReqNTID = getOverallReqNTID(F);
70+
const auto OverallMaxNTID = getOverallMaxNTID(F);
71+
const auto OverallClusterRank = getOverallClusterRank(F);
7172

72-
MaxGridSize.x = 0x7fffffff;
73-
MaxGridSize.y = 0xffff;
74-
MaxGridSize.z = 0xffff;
73+
// If this function lacks any range information, do nothing.
74+
if (!(OverallReqNTID || OverallMaxNTID || OverallClusterRank))
75+
return false;
7576

76-
// Go through the calls in this function.
77-
bool Changed = false;
78-
for (Instruction &I : instructions(F)) {
79-
IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
80-
if (!II)
81-
continue;
77+
const unsigned FunctionNTID = OverallReqNTID.value_or(
78+
OverallMaxNTID.value_or(std::numeric_limits<unsigned>::max()));
8279

80+
const unsigned FunctionClusterRank =
81+
OverallClusterRank.value_or(std::numeric_limits<unsigned>::max());
82+
83+
const Vector3 MaxBlockSize{std::min(1024u, FunctionNTID),
84+
std::min(1024u, FunctionNTID),
85+
std::min(64u, FunctionNTID)};
86+
87+
// We conservatively use the maximum grid size as an upper bound for the
88+
// cluster rank.
89+
const Vector3 MaxClusterRank{std::min(0x7fffffffu, FunctionClusterRank),
90+
std::min(0xffffu, FunctionClusterRank),
91+
std::min(0xffffu, FunctionClusterRank)};
92+
93+
const auto ProccessIntrinsic = [&](IntrinsicInst *II) -> bool {
8394
switch (II->getIntrinsicID()) {
8495
// Index within block
8596
case Intrinsic::nvvm_read_ptx_sreg_tid_x:
86-
Changed |= addRangeAttr(0, MaxBlockSize.x, II);
87-
break;
97+
return addRangeAttr(0, MaxBlockSize.X, II);
8898
case Intrinsic::nvvm_read_ptx_sreg_tid_y:
89-
Changed |= addRangeAttr(0, MaxBlockSize.y, II);
90-
break;
99+
return addRangeAttr(0, MaxBlockSize.Y, II);
91100
case Intrinsic::nvvm_read_ptx_sreg_tid_z:
92-
Changed |= addRangeAttr(0, MaxBlockSize.z, II);
93-
break;
101+
return addRangeAttr(0, MaxBlockSize.Z, II);
94102

95103
// Block size
96104
case Intrinsic::nvvm_read_ptx_sreg_ntid_x:
97-
Changed |= addRangeAttr(1, MaxBlockSize.x + 1, II);
98-
break;
105+
return addRangeAttr(1, MaxBlockSize.X + 1, II);
99106
case Intrinsic::nvvm_read_ptx_sreg_ntid_y:
100-
Changed |= addRangeAttr(1, MaxBlockSize.y + 1, II);
101-
break;
107+
return addRangeAttr(1, MaxBlockSize.Y + 1, II);
102108
case Intrinsic::nvvm_read_ptx_sreg_ntid_z:
103-
Changed |= addRangeAttr(1, MaxBlockSize.z + 1, II);
104-
break;
105-
106-
// Index within grid
107-
case Intrinsic::nvvm_read_ptx_sreg_ctaid_x:
108-
Changed |= addRangeAttr(0, MaxGridSize.x, II);
109-
break;
110-
case Intrinsic::nvvm_read_ptx_sreg_ctaid_y:
111-
Changed |= addRangeAttr(0, MaxGridSize.y, II);
109+
return addRangeAttr(1, MaxBlockSize.Z + 1, II);
110+
111+
// Index within cluster and cluster size
112+
case Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_x:
113+
return addRangeAttr(0, MaxClusterRank.X, II);
114+
case Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_y:
115+
return addRangeAttr(0, MaxClusterRank.Y, II);
116+
case Intrinsic::nvvm_read_ptx_sreg_cluster_ctaid_z:
117+
return addRangeAttr(0, MaxClusterRank.Z, II);
118+
case Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_x:
119+
return addRangeAttr(1, MaxClusterRank.X + 1, II);
120+
case Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_y:
121+
return addRangeAttr(1, MaxClusterRank.Y + 1, II);
122+
case Intrinsic::nvvm_read_ptx_sreg_cluster_nctaid_z:
123+
return addRangeAttr(1, MaxClusterRank.Z + 1, II);
124+
125+
case Intrinsic::nvvm_read_ptx_sreg_cluster_ctarank:
126+
if (OverallClusterRank)
127+
return addRangeAttr(0, FunctionClusterRank, II);
112128
break;
113-
case Intrinsic::nvvm_read_ptx_sreg_ctaid_z:
114-
Changed |= addRangeAttr(0, MaxGridSize.z, II);
129+
case Intrinsic::nvvm_read_ptx_sreg_cluster_nctarank:
130+
if (OverallClusterRank)
131+
return addRangeAttr(1, FunctionClusterRank + 1, II);
115132
break;
116-
117-
// Grid size
118-
case Intrinsic::nvvm_read_ptx_sreg_nctaid_x:
119-
Changed |= addRangeAttr(1, MaxGridSize.x + 1, II);
120-
break;
121-
case Intrinsic::nvvm_read_ptx_sreg_nctaid_y:
122-
Changed |= addRangeAttr(1, MaxGridSize.y + 1, II);
123-
break;
124-
case Intrinsic::nvvm_read_ptx_sreg_nctaid_z:
125-
Changed |= addRangeAttr(1, MaxGridSize.z + 1, II);
126-
break;
127-
128-
// warp size is constant 32.
129-
case Intrinsic::nvvm_read_ptx_sreg_warpsize:
130-
Changed |= addRangeAttr(32, 32 + 1, II);
131-
break;
132-
133-
// Lane ID is [0..warpsize)
134-
case Intrinsic::nvvm_read_ptx_sreg_laneid:
135-
Changed |= addRangeAttr(0, 32, II);
136-
break;
137-
138133
default:
139-
break;
134+
return false;
140135
}
141-
}
136+
return false;
137+
};
138+
139+
// Go through the calls in this function.
140+
bool Changed = false;
141+
for (Instruction &I : instructions(F))
142+
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I))
143+
Changed |= ProccessIntrinsic(II);
142144

143145
return Changed;
144146
}

0 commit comments

Comments
 (0)