Skip to content

AMDGPU: Mark workitem ID intrinsics with range attribute #136196

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 0 additions & 25 deletions clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -171,16 +171,6 @@ static Value *emitFPIntBuiltin(CodeGenFunction &CGF,
return CGF.Builder.CreateCall(F, {Src0, Src1});
}

static Value *emitRangedBuiltin(CodeGenFunction &CGF, unsigned IntrinsicID,
int low, int high) {
Function *F = CGF.CGM.getIntrinsic(IntrinsicID, {});
llvm::CallInst *Call = CGF.Builder.CreateCall(F);
llvm::ConstantRange CR(APInt(32, low), APInt(32, high));
Call->addRangeRetAttr(CR);
Call->addRetAttr(llvm::Attribute::AttrKind::NoUndef);
return Call;
}

// For processing memory ordering and memory scope arguments of various
// amdgcn builtins.
// \p Order takes a C++11 comptabile memory-ordering specifier and converts
Expand Down Expand Up @@ -934,15 +924,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
Function *F = CGM.getIntrinsic(BuiltinWMMAOp, ArgTypes);
return Builder.CreateCall(F, Args);
}

// amdgcn workitem
case AMDGPU::BI__builtin_amdgcn_workitem_id_x:
return emitRangedBuiltin(*this, Intrinsic::amdgcn_workitem_id_x, 0, 1024);
case AMDGPU::BI__builtin_amdgcn_workitem_id_y:
return emitRangedBuiltin(*this, Intrinsic::amdgcn_workitem_id_y, 0, 1024);
case AMDGPU::BI__builtin_amdgcn_workitem_id_z:
return emitRangedBuiltin(*this, Intrinsic::amdgcn_workitem_id_z, 0, 1024);

// amdgcn workgroup size
case AMDGPU::BI__builtin_amdgcn_workgroup_size_x:
return EmitAMDGPUWorkGroupSize(*this, 0);
Expand All @@ -964,12 +945,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_r600_recipsqrt_ieeef:
return emitBuiltinWithOneOverloadedType<1>(*this, E,
Intrinsic::r600_recipsqrt_ieee);
case AMDGPU::BI__builtin_r600_read_tidig_x:
return emitRangedBuiltin(*this, Intrinsic::r600_read_tidig_x, 0, 1024);
case AMDGPU::BI__builtin_r600_read_tidig_y:
return emitRangedBuiltin(*this, Intrinsic::r600_read_tidig_y, 0, 1024);
case AMDGPU::BI__builtin_r600_read_tidig_z:
return emitRangedBuiltin(*this, Intrinsic::r600_read_tidig_z, 0, 1024);
case AMDGPU::BI__builtin_amdgcn_alignbit: {
llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
Expand Down
10 changes: 7 additions & 3 deletions clang/test/CodeGenOpenCL/builtins-amdgcn.cl
Original file line number Diff line number Diff line change
Expand Up @@ -605,9 +605,9 @@ void test_s_getreg(volatile global uint *out)
}

// CHECK-LABEL: @test_get_local_id(
// CHECK: tail call noundef range(i32 0, 1024){{.*}} i32 @llvm.amdgcn.workitem.id.x()
// CHECK: tail call noundef range(i32 0, 1024){{.*}} i32 @llvm.amdgcn.workitem.id.y()
// CHECK: tail call noundef range(i32 0, 1024){{.*}} i32 @llvm.amdgcn.workitem.id.z()
// CHECK: tail call{{.*}} i32 @llvm.amdgcn.workitem.id.x()
// CHECK: tail call{{.*}} i32 @llvm.amdgcn.workitem.id.y()
// CHECK: tail call{{.*}} i32 @llvm.amdgcn.workitem.id.z()
void test_get_local_id(int d, global int *out)
{
switch (d) {
Expand All @@ -618,6 +618,10 @@ void test_get_local_id(int d, global int *out)
}
}

// CHECK: declare noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
// CHECK: declare noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y()
// CHECK: declare noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.z()

// CHECK-LABEL: @test_get_workgroup_size(
// CHECK: {{.*}}call align 8 dereferenceable(256){{.*}} ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
// CHECK: getelementptr inbounds nuw i8, ptr addrspace(4) %{{.*}}, i64 12
Expand Down
9 changes: 6 additions & 3 deletions clang/test/CodeGenOpenCL/builtins-r600.cl
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,9 @@ void test_get_group_id(int d, global int *out)
}

// CHECK-LABEL: @test_get_local_id(
// CHECK: tail call noundef range(i32 0, 1024) i32 @llvm.r600.read.tidig.x()
// CHECK: tail call noundef range(i32 0, 1024) i32 @llvm.r600.read.tidig.y()
// CHECK: tail call noundef range(i32 0, 1024) i32 @llvm.r600.read.tidig.z()
// CHECK: tail call i32 @llvm.r600.read.tidig.x()
// CHECK: tail call i32 @llvm.r600.read.tidig.y()
// CHECK: tail call i32 @llvm.r600.read.tidig.z()
void test_get_local_id(int d, global int *out)
{
switch (d) {
Expand All @@ -52,3 +52,6 @@ void test_get_local_id(int d, global int *out)
}
}

// CHECK: declare noundef range(i32 0, 1024) i32 @llvm.r600.read.tidig.x()
// CHECK: declare noundef range(i32 0, 1024) i32 @llvm.r600.read.tidig.y()
// CHECK: declare noundef range(i32 0, 1024) i32 @llvm.r600.read.tidig.z()
6 changes: 3 additions & 3 deletions clang/test/Headers/gpuintrin.c
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,7 @@ __gpu_kernel void foo() {
// AMDGPU-NEXT: [[ENTRY:.*:]]
// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
// AMDGPU-NEXT: [[TMP0:%.*]] = call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
// AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
// AMDGPU-NEXT: ret i32 [[TMP0]]
//
//
Expand All @@ -300,7 +300,7 @@ __gpu_kernel void foo() {
// AMDGPU-NEXT: [[ENTRY:.*:]]
// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
// AMDGPU-NEXT: [[TMP0:%.*]] = call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y()
// AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
// AMDGPU-NEXT: ret i32 [[TMP0]]
//
//
Expand All @@ -309,7 +309,7 @@ __gpu_kernel void foo() {
// AMDGPU-NEXT: [[ENTRY:.*:]]
// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
// AMDGPU-NEXT: [[TMP0:%.*]] = call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.z()
// AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
// AMDGPU-NEXT: ret i32 [[TMP0]]
//
//
Expand Down
12 changes: 6 additions & 6 deletions clang/test/Headers/gpuintrin_lang.c
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ __device__ int foo() { return __gpu_thread_id_x(); }
// CUDA-LABEL: define dso_local i32 @foo(
// CUDA-SAME: ) #[[ATTR0:[0-9]+]] {
// CUDA-NEXT: [[ENTRY:.*:]]
// CUDA-NEXT: [[TMP0:%.*]] = call {{.*}}i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CUDA-NEXT: [[TMP0:%.*]] = call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CUDA-NEXT: ret i32 [[TMP0]]
//
// HIP-LABEL: define dso_local i32 @foo(
Expand All @@ -46,29 +46,29 @@ __device__ int foo() { return __gpu_thread_id_x(); }
// HIP-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
// HIP-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
// HIP-NEXT: [[RETVAL_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I]] to ptr
// HIP-NEXT: [[TMP0:%.*]] = call noundef {{.*}}i32 @llvm.amdgcn.workitem.id.x()
// HIP-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
// HIP-NEXT: ret i32 [[TMP0]]
//
// OPENCL-LABEL: define dso_local i32 @foo(
// OPENCL-SAME: ) #[[ATTR0:[0-9]+]] {
// OPENCL-NEXT: [[ENTRY:.*:]]
// OPENCL-NEXT: [[TMP0:%.*]] = call noundef {{.*}}i32 @llvm.amdgcn.workitem.id.x()
// OPENCL-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
// OPENCL-NEXT: ret i32 [[TMP0]]
//
// OPENMP-LABEL: define hidden i32 @foo(
// OPENMP-SAME: ) #[[ATTR0:[0-9]+]] {
// OPENMP-NEXT: [[ENTRY:.*:]]
// OPENMP-NEXT: [[TMP0:%.*]] = call noundef {{.*}}i32 @llvm.amdgcn.workitem.id.x()
// OPENMP-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
// OPENMP-NEXT: ret i32 [[TMP0]]
//
// C89-LABEL: define dso_local i32 @foo(
// C89-SAME: ) #[[ATTR2:[0-9]+]] {
// C89-SAME: ) #[[ATTR0:[0-9]+]] {
// C89-NEXT: [[ENTRY:.*:]]
// C89-NEXT: [[RETVAL_I:%.*]] = alloca i32, align 4, addrspace(5)
// C89-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
// C89-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
// C89-NEXT: [[RETVAL_ASCAST_I:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL_I]] to ptr
// C89-NEXT: [[TMP0:%.*]] = call noundef {{.*}}i32 @llvm.amdgcn.workitem.id.x()
// C89-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
// C89-NEXT: ret i32 [[TMP0]]
//
int foo() { return __gpu_thread_id_x(); }
Expand Down
47 changes: 31 additions & 16 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,16 @@ def local_ptr_ty : LLVMQualPointerType<3>;
// some preloaded register from a function that is known to not need it is a violation
// of the calling convention and also program-level UB. Outside of such IR-level UB,
// these preloaded registers are always set to a well-defined value and are thus `noundef`.
class AMDGPUReadPreloadRegisterIntrinsic
: DefaultAttrsIntrinsic<[llvm_i32_ty], [], [NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>;
class AMDGPUReadPreloadRegisterIntrinsic<
list<IntrinsicProperty> ExtraAttrs = []>
: DefaultAttrsIntrinsic<[llvm_i32_ty], [],
!listconcat([NoUndef<RetIndex>, IntrNoMem,
IntrSpeculatable],
ExtraAttrs)>;

class AMDGPUReadPreloadRegisterIntrinsicNamed<string name>
: DefaultAttrsIntrinsic<[llvm_i32_ty], [], [NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>, ClangBuiltin<name>;
class AMDGPUReadPreloadRegisterIntrinsicNamed<
string name, list<IntrinsicProperty> ExtraAttrs = []>
: AMDGPUReadPreloadRegisterIntrinsic<ExtraAttrs>, ClangBuiltin<name>;

// Used to tag image and resource intrinsics with information used to generate
// mem operands.
Expand All @@ -35,17 +40,22 @@ class AMDGPURsrcIntrinsic<int rsrcarg, bit isimage = false> {

let TargetPrefix = "r600" in {

multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz {
def _x : AMDGPUReadPreloadRegisterIntrinsic;
def _y : AMDGPUReadPreloadRegisterIntrinsic;
def _z : AMDGPUReadPreloadRegisterIntrinsic;
}
multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz<
list<IntrinsicProperty> ExtraAttrs = []> {
def _x : AMDGPUReadPreloadRegisterIntrinsic<ExtraAttrs>;
def _y : AMDGPUReadPreloadRegisterIntrinsic<ExtraAttrs>;
def _z : AMDGPUReadPreloadRegisterIntrinsic<ExtraAttrs>;
}

multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz_named<string prefix> {
def _x : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_x")>;
def _y : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_y")>;
def _z : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_z")>;
}
multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz_named<
string prefix, list<IntrinsicProperty> ExtraAttrs = []> {
def _x : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_x"),
ExtraAttrs>;
def _y : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_y"),
ExtraAttrs>;
def _z : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_z"),
ExtraAttrs>;
}

defm int_r600_read_global_size : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
<"__builtin_r600_read_global_size">;
Expand All @@ -55,7 +65,9 @@ defm int_r600_read_tgid : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
<"__builtin_r600_read_tgid">;

defm int_r600_read_local_size : AMDGPUReadPreloadRegisterIntrinsic_xyz;
defm int_r600_read_tidig : AMDGPUReadPreloadRegisterIntrinsic_xyz;
defm int_r600_read_tidig
: AMDGPUReadPreloadRegisterIntrinsic_xyz_named<
"__builtin_r600_read_tidig", [Range<RetIndex, 0, 1024>]>;

def int_r600_group_barrier : ClangBuiltin<"__builtin_r600_group_barrier">,
Intrinsic<[], [], [IntrConvergent, IntrWillReturn]>;
Expand Down Expand Up @@ -146,7 +158,10 @@ let TargetPrefix = "amdgcn" in {
// ABI Special Intrinsics
//===----------------------------------------------------------------------===//

defm int_amdgcn_workitem_id : AMDGPUReadPreloadRegisterIntrinsic_xyz;
defm int_amdgcn_workitem_id
: AMDGPUReadPreloadRegisterIntrinsic_xyz_named<
"__builtin_amdgcn_workitem_id", [Range<RetIndex, 0, 1024>]>;

defm int_amdgcn_workgroup_id : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
<"__builtin_amdgcn_workgroup_id">;

Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/amdgpu-inline.ll
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ entry:

; GCN: define amdgpu_kernel void @test_inliner(
; GCN-INL1: %c1 = tail call coldcc float @foo(
; GCN-INLDEF: %cmp.i = fcmp ogt float %tmp2, 0.000000e+00
; GCN-INLDEF: %cmp.i = fcmp ogt float %{{.+}}, 0.000000e+00
; GCN-MAXBBDEF: %div.i{{[0-9]*}} = fdiv float 1.000000e+00, %c
; GCN-MAXBBDEF: %div.i{{[0-9]*}} = fdiv float 2.000000e+00, %tmp1.i
; GCN-MAXBB1: call coldcc void @foo_private_ptr
Expand Down
33 changes: 14 additions & 19 deletions llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
Original file line number Diff line number Diff line change
Expand Up @@ -258,46 +258,41 @@ define amdgpu_kernel void @add_x_shl_max_offset() #1 {
define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_alt() #1 {
; CI-LABEL: add_x_shl_neg_to_sub_max_offset_alt:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_xor_b32_e32 v0, 0xffff, v0
; CI-NEXT: v_mul_i32_i24_e32 v0, -4, v0
; CI-NEXT: v_mov_b32_e32 v1, 13
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write_b8 v0, v1
; CI-NEXT: ds_write_b8 v0, v1 offset:65535
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_max_offset_alt:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_xor_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_mul_i32_i24_e32 v0, -4, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 13
; GFX9-NEXT: ds_write_b8 v0, v1
; GFX9-NEXT: ds_write_b8 v0, v1 offset:65535
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: add_x_shl_neg_to_sub_max_offset_alt:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mul_i32_i24_e32 v0, -4, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 13
; GFX10-NEXT: v_xor_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: ds_write_b8 v0, v1
; GFX10-NEXT: ds_write_b8 v0, v1 offset:65535
; GFX10-NEXT: s_endpgm
;
; GFX11-TRUE16-LABEL: add_x_shl_neg_to_sub_max_offset_alt:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-TRUE16-NEXT: v_xor_b32_e32 v1, 0xffff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mul_i32_i24_e32 v1, -4, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 13
; GFX11-TRUE16-NEXT: ds_store_b8 v1, v0
; GFX11-TRUE16-NEXT: ds_store_b8 v1, v0 offset:65535
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: add_x_shl_neg_to_sub_max_offset_alt:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-FAKE16-NEXT: v_xor_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: ds_store_b8 v0, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_mul_i32_i24_e32 v0, -4, v0
; GFX11-FAKE16-NEXT: ds_store_b8 v0, v1 offset:65535
; GFX11-FAKE16-NEXT: s_endpgm
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x()
%.neg = mul i32 %x.i, -4
Expand Down Expand Up @@ -447,9 +442,9 @@ define amdgpu_kernel void @add_x_shl_neg_to_sub_multi_use() #1 {
;
; GFX11-LABEL: add_x_shl_neg_to_sub_multi_use:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0, v0
; GFX11-NEXT: ds_store_b32 v0, v1 offset:123
; GFX11-NEXT: ds_store_b32 v0, v1 offset:456
Expand Down
Loading
Loading