@@ -139,6 +139,19 @@ def llvm_shared_cluster_ptr_ty : LLVMQualPointerType<7>; // (shared_cluster)ptr
139
139
// MISC
140
140
//
141
141
142
// Limits shared by the special-register Range attributes defined later in
// this file.
// WARP_SIZE bounds the laneid range and pins the warpsize range.
+ defvar WARP_SIZE = 32;
143
+
144
+ // Note: on sm_20 the maximum grid size in the x-dimension is the lower value
145
+ // of 65535. We conservatively use the larger value here, as it is required
146
+ // for sm_30+ and is also correct for sm_20.
147
+ defvar MAX_GRID_SIZE_X = 0x7fffffff;
148
+ defvar MAX_GRID_SIZE_Y = 0xffff;
149
+ defvar MAX_GRID_SIZE_Z = 0xffff;
150
+
151
// Per-dimension bounds used by the tid and ntid ranges.
+ defvar MAX_BLOCK_SIZE_X = 1024;
152
+ defvar MAX_BLOCK_SIZE_Y = 1024;
153
+ defvar MAX_BLOCK_SIZE_Z = 64;
154
+
142
155
// Helper class that concatenates list elements with
143
156
// a given separator 'sep' and returns the result.
144
157
// Handles empty strings.
@@ -4747,26 +4760,35 @@ def int_nvvm_sust_p_3d_v4i32_trap
4747
4760
4748
4761
// Accessing special registers.
4749
4762
4750
- class PTXReadSRegIntrinsicNB_r32
4751
- : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable, NoUndef<RetIndex>]>;
4752
- class PTXReadSRegIntrinsic_r32<string name>
4753
- : PTXReadSRegIntrinsicNB_r32, ClangBuiltin<"__nvvm_read_ptx_sreg_" # name>;
4763
// i32-returning, argument-less special-register read without a Clang builtin.
// 'properties' is appended to the fixed attribute list
// [IntrNoMem, IntrSpeculatable, NoUndef<RetIndex>]; it defaults to empty, so
// uses that pass no argument keep the previous attribute set.
+ class PTXReadSRegIntrinsicNB_r32<list<IntrinsicProperty> properties = []>
4764
+ : DefaultAttrsIntrinsic<[llvm_i32_ty], [],
4765
+ !listconcat([IntrNoMem, IntrSpeculatable, NoUndef<RetIndex>], properties)>;
4754
4766
4755
- multiclass PTXReadSRegIntrinsic_v4i32<string regname> {
4767
// Same as PTXReadSRegIntrinsicNB_r32, plus a Clang builtin whose name is
// "__nvvm_read_ptx_sreg_" followed by 'name'. 'properties' is forwarded
// unchanged to the NB base class and defaults to empty.
+ class PTXReadSRegIntrinsic_r32<string name,
4768
+ list<IntrinsicProperty> properties = []>
4769
+ : PTXReadSRegIntrinsicNB_r32<properties>,
4770
+ ClangBuiltin<"__nvvm_read_ptx_sreg_" # name>;
4771
+
4772
// Defines four PTXReadSRegIntrinsic_r32 records with the suffixes _x, _y, _z
// and _w; each builtin name is 'regname' followed by the suffix.
// 'properties' must hold exactly four lists: properties[i] supplies the extra
// attributes for the i-th suffix in that order. The default adds none.
+ multiclass PTXReadSRegIntrinsic_v4i32<string regname,
4773
+ list<list<IntrinsicProperty>> properties = [[], [], [], []]> {
4774
+ assert !eq(!size(properties), 4), "properties must be a list of 4 lists";
4756
4775
// FIXME: Do we need the 128-bit integer type version?
4757
4776
// def _r64 : Intrinsic<[llvm_i128_ty], [], [IntrNoMem, IntrSpeculatable]>;
4758
4777
4759
4778
// FIXME: Enable this once v4i32 support is enabled in back-end.
4760
4779
// def _v4i16 : Intrinsic<[llvm_v4i32_ty], [], [IntrNoMem, IntrSpeculatable]>;
4761
- foreach suffix = ["_x", "_y", "_z", "_w"] in
4762
- def suffix : PTXReadSRegIntrinsic_r32<regname # suffix>;
4780
+ defvar suffixes = ["_x", "_y", "_z", "_w"];
4781
// Iterate by index so that suffixes[i] can be paired with properties[i].
+ foreach i = !range(suffixes) in
4782
+ def suffixes[i] : PTXReadSRegIntrinsic_r32<regname # suffixes[i], properties[i]>;
4763
4783
}
4764
4784
4765
4785
// Same, but without automatic clang builtins. It will be used for
4766
4786
// registers that require particular GPU or PTX version.
4767
- multiclass PTXReadSRegIntrinsicNB_v4i32 {
4768
- foreach suffix = ["_x", "_y", "_z", "_w"] in
4769
- def suffix : PTXReadSRegIntrinsicNB_r32;
4787
// Defines four PTXReadSRegIntrinsicNB_r32 records with the suffixes _x, _y, _z
// and _w. 'properties' must hold exactly four lists: properties[i] supplies
// the extra attributes for the i-th suffix. The default adds none.
+ multiclass PTXReadSRegIntrinsicNB_v4i32<list<list<IntrinsicProperty>> properties = [[], [], [], []]> {
4788
+ assert !eq(!size(properties), 4), "properties must be a list of 4 lists";
4789
+ defvar suffixes = ["_x", "_y", "_z", "_w"];
4790
// Iterate by index so that suffixes[i] can be paired with properties[i].
+ foreach i = !range(suffixes) in
4791
+ def suffixes[i] : PTXReadSRegIntrinsicNB_r32<properties[i]>;
4770
4792
}
4771
4793
4772
4794
class PTXReadSRegIntrinsic_r64<string name>
@@ -4782,15 +4804,41 @@ class PTXReadNCSRegIntrinsic_r64<string name>
4782
4804
: Intrinsic<[llvm_i64_ty], [], [IntrInaccessibleMemOnly, IntrNoCallback, NoUndef<RetIndex>]>,
4783
4805
ClangBuiltin<"__nvvm_read_ptx_sreg_" # name>;
4784
4806
4785
- defm int_nvvm_read_ptx_sreg_tid : PTXReadSRegIntrinsic_v4i32<"tid">;
4786
- defm int_nvvm_read_ptx_sreg_ntid : PTXReadSRegIntrinsic_v4i32<"ntid">;
4807
// tid: one Range per component, in _x, _y, _z, _w order, bounded above by the
// matching MAX_BLOCK_SIZE_* value.
// NOTE(review): the bounds are presumably half-open, [lower, upper), which
// would match the warpsize range [WARP_SIZE, WARP_SIZE + 1) used elsewhere in
// this file -- confirm against the Range definition.
+ defm int_nvvm_read_ptx_sreg_tid
4808
+ : PTXReadSRegIntrinsic_v4i32<"tid",
4809
+ [[Range<RetIndex, 0, MAX_BLOCK_SIZE_X>],
4810
+ [Range<RetIndex, 0, MAX_BLOCK_SIZE_Y>],
4811
+ [Range<RetIndex, 0, MAX_BLOCK_SIZE_Z>],
4812
+ [Range<RetIndex, 0, 1>]]>;
4813
+
4814
// ntid: the _x, _y and _z components start at 1 and use MAX_BLOCK_SIZE_* + 1
// as the upper bound; the _w component keeps the same 0..1 range as tid.
+ defm int_nvvm_read_ptx_sreg_ntid
4815
+ : PTXReadSRegIntrinsic_v4i32<"ntid",
4816
+ [[Range<RetIndex, 1, !add(MAX_BLOCK_SIZE_X, 1)>],
4817
+ [Range<RetIndex, 1, !add(MAX_BLOCK_SIZE_Y, 1)>],
4818
+ [Range<RetIndex, 1, !add(MAX_BLOCK_SIZE_Z, 1)>],
4819
+ [Range<RetIndex, 0, 1>]]>;
4820
+
4821
// laneid: ranged from 0 up to WARP_SIZE.
+ def int_nvvm_read_ptx_sreg_laneid
4822
+ : PTXReadSRegIntrinsic_r32<"laneid", [Range<RetIndex, 0, WARP_SIZE>]>;
4787
4823
4788
- def int_nvvm_read_ptx_sreg_laneid : PTXReadSRegIntrinsic_r32<"laneid">;
4789
4824
// warpid and nwarpid pass no extra properties, so no Range is attached.
def int_nvvm_read_ptx_sreg_warpid : PTXReadSRegIntrinsic_r32<"warpid">;
4790
4825
def int_nvvm_read_ptx_sreg_nwarpid : PTXReadSRegIntrinsic_r32<"nwarpid">;
4791
4826
4792
- defm int_nvvm_read_ptx_sreg_ctaid : PTXReadSRegIntrinsic_v4i32<"ctaid">;
4793
- defm int_nvvm_read_ptx_sreg_nctaid : PTXReadSRegIntrinsic_v4i32<"nctaid">;
4827
// Per-component (_x, _y, _z, _w) ranges bounded above by MAX_GRID_SIZE_*.
// Used by ctaid here and by clusterid and cluster_ctaid further down.
+ defvar MAX_GRID_ID_RANGE = [[Range<RetIndex, 0, MAX_GRID_SIZE_X>],
4828
+ [Range<RetIndex, 0, MAX_GRID_SIZE_Y>],
4829
+ [Range<RetIndex, 0, MAX_GRID_SIZE_Z>],
4830
+ [Range<RetIndex, 0, 1>]];
4831
+
4832
// Counterpart for the n* registers: the _x, _y and _z components start at 1
// and use MAX_GRID_SIZE_* + 1 as the upper bound. Used by nctaid here and by
// nclusterid and cluster_nctaid further down.
// NOTE(review): !add(MAX_GRID_SIZE_X, 1) evaluates to 0x80000000, which is
// past the signed 32-bit maximum while the intrinsic result is llvm_i32_ty --
// confirm the emitted range is interpreted as intended.
+ defvar MAX_GRID_NID_RANGE = [[Range<RetIndex, 1, !add(MAX_GRID_SIZE_X, 1)>],
4833
+ [Range<RetIndex, 1, !add(MAX_GRID_SIZE_Y, 1)>],
4834
+ [Range<RetIndex, 1, !add(MAX_GRID_SIZE_Z, 1)>],
4835
+ [Range<RetIndex, 0, 1>]];
4836
+
4837
+ defm int_nvvm_read_ptx_sreg_ctaid
4838
+ : PTXReadSRegIntrinsic_v4i32<"ctaid", MAX_GRID_ID_RANGE>;
4839
+
4840
+ defm int_nvvm_read_ptx_sreg_nctaid
4841
+ : PTXReadSRegIntrinsic_v4i32<"nctaid", MAX_GRID_NID_RANGE>;
4794
4842
4795
4843
// smid and nsmid pass no extra properties, so no Range is attached.
def int_nvvm_read_ptx_sreg_smid : PTXReadSRegIntrinsic_r32<"smid">;
4796
4844
def int_nvvm_read_ptx_sreg_nsmid : PTXReadSRegIntrinsic_r32<"nsmid">;
@@ -4817,13 +4865,25 @@ def int_nvvm_read_ptx_sreg_pm1 : PTXReadNCSRegIntrinsic_r32<"pm1">;
4817
4865
def int_nvvm_read_ptx_sreg_pm2 : PTXReadNCSRegIntrinsic_r32<"pm2">;
4818
4866
def int_nvvm_read_ptx_sreg_pm3 : PTXReadNCSRegIntrinsic_r32<"pm3">;
4819
4867
4820
- def int_nvvm_read_ptx_sreg_warpsize : PTXReadSRegIntrinsic_r32<"warpsize">;
4868
// warpsize: the range runs from WARP_SIZE to WARP_SIZE + 1.
// NOTE(review): if Range bounds are half-open, this pins the result to
// exactly WARP_SIZE -- confirm against the Range definition.
+ def int_nvvm_read_ptx_sreg_warpsize
4869
+ : PTXReadSRegIntrinsic_r32<"warpsize",
4870
+ [Range<RetIndex, WARP_SIZE, !add(WARP_SIZE, 1)>]>;
4821
4871
4822
4872
// sm90+, PTX7.8+
4823
- defm int_nvvm_read_ptx_sreg_clusterid : PTXReadSRegIntrinsicNB_v4i32;
4824
- defm int_nvvm_read_ptx_sreg_nclusterid : PTXReadSRegIntrinsicNB_v4i32;
4825
- defm int_nvvm_read_ptx_sreg_cluster_ctaid : PTXReadSRegIntrinsicNB_v4i32;
4826
- defm int_nvvm_read_ptx_sreg_cluster_nctaid : PTXReadSRegIntrinsicNB_v4i32;
4873
+
4874
+ // Note: Since clusters are subdivisions of the grid, we conservatively use the
4875
+ // maximum grid size as an upper bound for the clusterid and cluster_ctaid. In
4876
+ // practice, the clusterid will likely be much smaller. The CUDA programming
4877
+ // guide recommends 8 as a maximum portable value and H100s support 16.
4878
+
4879
// clusterid and cluster_ctaid reuse MAX_GRID_ID_RANGE; nclusterid and
// cluster_nctaid reuse MAX_GRID_NID_RANGE.
+ defm int_nvvm_read_ptx_sreg_clusterid
4880
+ : PTXReadSRegIntrinsicNB_v4i32<MAX_GRID_ID_RANGE>;
4881
+ defm int_nvvm_read_ptx_sreg_nclusterid
4882
+ : PTXReadSRegIntrinsicNB_v4i32<MAX_GRID_NID_RANGE>;
4883
+ defm int_nvvm_read_ptx_sreg_cluster_ctaid
4884
+ : PTXReadSRegIntrinsicNB_v4i32<MAX_GRID_ID_RANGE>;
4885
+ defm int_nvvm_read_ptx_sreg_cluster_nctaid
4886
+ : PTXReadSRegIntrinsicNB_v4i32<MAX_GRID_NID_RANGE>;
4827
4887
4828
4888
// cluster_ctarank and cluster_nctarank pass no extra properties, so no Range
// is attached.
def int_nvvm_read_ptx_sreg_cluster_ctarank : PTXReadSRegIntrinsicNB_r32;
4829
4889
def int_nvvm_read_ptx_sreg_cluster_nctarank : PTXReadSRegIntrinsicNB_r32;
0 commit comments