AMDGPU: Try to add some more amdgpu-perf-hint tests #102644
Conversation
@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes

This test has hardly any test coverage, and no IR tests. Add a few more tests involving calls, and add some IR checks. This pass needs a lot of work to improve the test coverage, and to actually use the cost model instead of making up its own accounting scheme.

Patch is 22.61 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/102644.diff

1 file affected: llvm/test/CodeGen/AMDGPU/perfhint.ll
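The assertions in the patch below are autogenerated (note the header referencing utils/update_test_checks.py with UTC_ARGS --check-globals all --version 5). As a rough sketch of how such checks could be regenerated after editing the IR, assuming a typical in-tree checkout with a build directory at ./build; the directory layout and the --opt-binary flag are assumptions for illustration, not taken from this PR:

# Illustrative regeneration command; paths are assumptions for a typical
# in-tree build, adjust to your checkout and build directories.
cd llvm-project
python3 llvm/utils/update_test_checks.py \
  --opt-binary=build/bin/opt \
  --check-globals all --version 5 \
  llvm/test/CodeGen/AMDGPU/perfhint.ll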
diff --git a/llvm/test/CodeGen/AMDGPU/perfhint.ll b/llvm/test/CodeGen/AMDGPU/perfhint.ll
index 9cf7bd4edd20d..77e0f46a3d457 100644
--- a/llvm/test/CodeGen/AMDGPU/perfhint.ll
+++ b/llvm/test/CodeGen/AMDGPU/perfhint.ll
@@ -1,9 +1,27 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-perf-hint < %s | FileCheck -check-prefix=CHECK %s
; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}test_membound:
; GCN: MemoryBound: 1
; GCN: WaveLimiterHint : 1
define amdgpu_kernel void @test_membound(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1) {
+; CHECK-LABEL: define amdgpu_kernel void @test_membound(
+; CHECK-SAME: ptr addrspace(1) nocapture readonly [[ARG:%.*]], ptr addrspace(1) nocapture [[ARG1:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[BB:.*:]]
+; CHECK-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP3]], align 16
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG1]], i64 [[TMP2]]
+; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr addrspace(1) [[TMP5]], align 16
+; CHECK-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP2]], 1
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP7]], align 16
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG1]], i64 [[TMP6]]
+; CHECK-NEXT: store <4 x i32> [[TMP8]], ptr addrspace(1) [[TMP9]], align 16
+; CHECK-NEXT: ret void
+;
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp2 = zext i32 %tmp to i64
@@ -22,8 +40,64 @@ bb:
; GCN-LABEL: {{^}}test_membound_1:
; GCN: MemoryBound: 1
define amdgpu_kernel void @test_membound_1(ptr addrspace(1) nocapture readonly %ptr.0,
- ptr addrspace(1) nocapture %ptr.1,
- <2 x double> %arg.0, i32 %arg.1, <4 x double> %arg.2) {
+; CHECK-LABEL: define amdgpu_kernel void @test_membound_1(
+; CHECK-SAME: ptr addrspace(1) nocapture readonly [[PTR_0:%.*]], ptr addrspace(1) nocapture [[PTR_1:%.*]], <2 x double> [[ARG_0:%.*]], i32 [[ARG_1:%.*]], <4 x double> [[ARG_2:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: [[BB_ENTRY:.*:]]
+; CHECK-NEXT: [[ID_32:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[ID_0:%.*]] = zext i32 [[ID_32]] to i64
+; CHECK-NEXT: [[GEP_0:%.*]] = getelementptr inbounds <2 x double>, ptr addrspace(1) [[PTR_0]], i64 [[ID_0]]
+; CHECK-NEXT: [[LD_0:%.*]] = load <2 x double>, ptr addrspace(1) [[GEP_0]], align 16
+; CHECK-NEXT: [[ADD_0:%.*]] = fadd <2 x double> [[ARG_0]], [[LD_0]]
+; CHECK-NEXT: [[ID_1:%.*]] = add nuw nsw i64 [[ID_0]], 1
+; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds <2 x double>, ptr addrspace(1) [[PTR_0]], i64 [[ID_1]]
+; CHECK-NEXT: [[LD_1:%.*]] = load <2 x double>, ptr addrspace(1) [[GEP_1]], align 16
+; CHECK-NEXT: [[ADD_1:%.*]] = fadd <2 x double> [[ADD_0]], [[LD_1]]
+; CHECK-NEXT: [[ID_2:%.*]] = add nuw nsw i64 [[ID_0]], 2
+; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds <2 x double>, ptr addrspace(1) [[PTR_0]], i64 [[ID_2]]
+; CHECK-NEXT: [[LD_2:%.*]] = load <2 x double>, ptr addrspace(1) [[GEP_2]], align 16
+; CHECK-NEXT: [[ADD_2:%.*]] = fadd <2 x double> [[ADD_1]], [[LD_2]]
+; CHECK-NEXT: [[ID_3:%.*]] = add nuw nsw i64 [[ID_0]], 3
+; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds <2 x double>, ptr addrspace(1) [[PTR_0]], i64 [[ID_3]]
+; CHECK-NEXT: [[LD_3:%.*]] = load <2 x double>, ptr addrspace(1) [[GEP_3]], align 16
+; CHECK-NEXT: [[ADD_3:%.*]] = fadd <2 x double> [[ADD_2]], [[LD_3]]
+; CHECK-NEXT: [[ID_4:%.*]] = add nuw nsw i64 [[ID_0]], 4
+; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr inbounds <2 x double>, ptr addrspace(1) [[PTR_0]], i64 [[ID_4]]
+; CHECK-NEXT: [[LD_4:%.*]] = load <2 x double>, ptr addrspace(1) [[GEP_4]], align 16
+; CHECK-NEXT: [[ADD_4:%.*]] = fadd <2 x double> [[ADD_3]], [[LD_4]]
+; CHECK-NEXT: store <2 x double> [[ADD_4]], ptr addrspace(1) [[PTR_1]], align 16
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[ARG_1]], 0
+; CHECK-NEXT: br i1 [[COND]], label %[[BB_TRUE:.*]], label %[[BB_RET:.*]]
+; CHECK: [[BB_TRUE]]:
+; CHECK-NEXT: [[I0_ARG_0:%.*]] = extractelement <2 x double> [[ARG_0]], i32 0
+; CHECK-NEXT: [[I1_ARG_0:%.*]] = extractelement <2 x double> [[ARG_0]], i32 1
+; CHECK-NEXT: [[ADD_1_0:%.*]] = fadd double [[I0_ARG_0]], [[I1_ARG_0]]
+; CHECK-NEXT: [[I0_ARG_2:%.*]] = extractelement <4 x double> [[ARG_2]], i32 0
+; CHECK-NEXT: [[I1_ARG_2:%.*]] = extractelement <4 x double> [[ARG_2]], i32 1
+; CHECK-NEXT: [[ADD_1_1:%.*]] = fadd double [[I0_ARG_2]], [[I1_ARG_2]]
+; CHECK-NEXT: [[ADD_1_2:%.*]] = fadd double [[ADD_1_0]], [[ADD_1_1]]
+; CHECK-NEXT: [[I2_ARG_2:%.*]] = extractelement <4 x double> [[ARG_2]], i32 2
+; CHECK-NEXT: [[I3_ARG_2:%.*]] = extractelement <4 x double> [[ARG_2]], i32 3
+; CHECK-NEXT: [[ADD_1_3:%.*]] = fadd double [[I2_ARG_2]], [[I3_ARG_2]]
+; CHECK-NEXT: [[ADD_1_4:%.*]] = fadd double [[ADD_1_2]], [[ADD_1_3]]
+; CHECK-NEXT: [[I0_ADD_0:%.*]] = extractelement <2 x double> [[ADD_0]], i32 0
+; CHECK-NEXT: [[I1_ADD_0:%.*]] = extractelement <2 x double> [[ADD_0]], i32 1
+; CHECK-NEXT: [[ADD_1_5:%.*]] = fadd double [[I0_ADD_0]], [[I1_ADD_0]]
+; CHECK-NEXT: [[ADD_1_6:%.*]] = fadd double [[ADD_1_4]], [[ADD_1_5]]
+; CHECK-NEXT: [[I0_ADD_1:%.*]] = extractelement <2 x double> [[ADD_1]], i32 0
+; CHECK-NEXT: [[I1_ADD_1:%.*]] = extractelement <2 x double> [[ADD_1]], i32 1
+; CHECK-NEXT: [[ADD_1_7:%.*]] = fadd double [[I0_ADD_1]], [[I1_ADD_1]]
+; CHECK-NEXT: [[ADD_1_8:%.*]] = fadd double [[ADD_1_6]], [[ADD_1_7]]
+; CHECK-NEXT: [[I0_ADD_2:%.*]] = extractelement <2 x double> [[ADD_2]], i32 0
+; CHECK-NEXT: [[I1_ADD_2:%.*]] = extractelement <2 x double> [[ADD_2]], i32 1
+; CHECK-NEXT: [[ADD_1_9:%.*]] = fadd double [[I0_ADD_2]], [[I1_ADD_2]]
+; CHECK-NEXT: [[ADD_1_10:%.*]] = fadd double [[ADD_1_8]], [[ADD_1_9]]
+; CHECK-NEXT: store double [[ADD_1_8]], ptr addrspace(1) [[PTR_1]], align 8
+; CHECK-NEXT: br label %[[BB_RET]]
+; CHECK: [[BB_RET]]:
+; CHECK-NEXT: ret void
+;
+ ptr addrspace(1) nocapture %ptr.1,
+ <2 x double> %arg.0, i32 %arg.1, <4 x double> %arg.2) {
bb.entry:
%id.32 = tail call i32 @llvm.amdgcn.workitem.id.x()
%id.0 = zext i32 %id.32 to i64
@@ -91,6 +165,26 @@ bb.ret:
; GCN: MemoryBound: 0
; GCN: WaveLimiterHint : 1
define amdgpu_kernel void @test_large_stride(ptr addrspace(1) nocapture %arg) {
+; CHECK-LABEL: define amdgpu_kernel void @test_large_stride(
+; CHECK-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT: [[BB:.*:]]
+; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 4096
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP]], align 4
+; CHECK-NEXT: [[MUL1:%.*]] = mul i32 [[TMP1]], [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1
+; CHECK-NEXT: store i32 [[MUL1]], ptr addrspace(1) [[TMP2]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 8192
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[TMP3]], align 4
+; CHECK-NEXT: [[MUL4:%.*]] = mul i32 [[TMP4]], [[TMP4]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 2
+; CHECK-NEXT: store i32 [[MUL4]], ptr addrspace(1) [[TMP5]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 12288
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) [[TMP6]], align 4
+; CHECK-NEXT: [[MUL7:%.*]] = mul i32 [[TMP7]], [[TMP7]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 3
+; CHECK-NEXT: store i32 [[MUL7]], ptr addrspace(1) [[TMP8]], align 4
+; CHECK-NEXT: ret void
+;
bb:
%tmp = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 4096
%tmp1 = load i32, ptr addrspace(1) %tmp, align 4
@@ -114,6 +208,35 @@ bb:
; GCN: MemoryBound: 1
; GCN: WaveLimiterHint : 1
define amdgpu_kernel void @test_indirect(ptr addrspace(1) nocapture %arg) {
+; CHECK-LABEL: define amdgpu_kernel void @test_indirect(
+; CHECK-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT: [[BB:.*:]]
+; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 2
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 3
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 4
+; CHECK-NEXT: store i32 [[TMP8]], ptr addrspace(1) [[ARG]], align 4
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1
+; CHECK-NEXT: [[TMP10:%.*]] = sext i32 [[TMP9]] to i64
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(1) [[TMP11]], align 4
+; CHECK-NEXT: store i32 [[TMP12]], ptr addrspace(1) [[TMP]], align 4
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
+; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(1) [[TMP15]], align 4
+; CHECK-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[TMP1]], align 4
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
+; CHECK-NEXT: [[TMP18:%.*]] = sext i32 [[TMP17]] to i64
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 [[TMP18]]
+; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr addrspace(1) [[TMP19]], align 4
+; CHECK-NEXT: store i32 [[TMP20]], ptr addrspace(1) [[TMP2]], align 4
+; CHECK-NEXT: ret void
+;
bb:
%tmp = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 1
%tmp1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 2
@@ -146,6 +269,27 @@ bb:
; GCN: MemoryBound: 0
; GCN: WaveLimiterHint : 0
define amdgpu_kernel void @test_indirect_through_phi(ptr addrspace(1) %arg) {
+; CHECK-LABEL: define amdgpu_kernel void @test_indirect_through_phi(
+; CHECK-SAME: ptr addrspace(1) [[ARG:%.*]]) {
+; CHECK-NEXT: [[BB:.*]]:
+; CHECK-NEXT: [[LOAD:%.*]] = load float, ptr addrspace(1) [[ARG]], align 8
+; CHECK-NEXT: [[LOAD_F:%.*]] = bitcast float [[LOAD]] to i32
+; CHECK-NEXT: [[N:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: br label %[[BB1:.*]]
+; CHECK: [[BB1]]:
+; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[LOAD_F]], %[[BB]] ], [ [[AND2:%.*]], %[[BB1]] ]
+; CHECK-NEXT: [[IND:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[INC2:%.*]], %[[BB1]] ]
+; CHECK-NEXT: [[AND1:%.*]] = and i32 [[PHI]], [[N]]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[ARG]], i32 [[AND1]]
+; CHECK-NEXT: store float [[LOAD]], ptr addrspace(1) [[GEP]], align 4
+; CHECK-NEXT: [[INC1:%.*]] = add nsw i32 [[PHI]], 1310720
+; CHECK-NEXT: [[AND2]] = and i32 [[INC1]], [[N]]
+; CHECK-NEXT: [[INC2]] = add nuw nsw i32 [[IND]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[INC2]], 1024
+; CHECK-NEXT: br i1 [[CMP]], label %[[BB2:.*]], label %[[BB1]]
+; CHECK: [[BB2]]:
+; CHECK-NEXT: ret void
+;
bb:
%load = load float, ptr addrspace(1) %arg, align 8
%load.f = bitcast float %load to i32
@@ -168,4 +312,179 @@ bb2: ; preds = %bb1
ret void
}
+define void @test_membound_func(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1) {
+; CHECK-LABEL: define void @test_membound_func(
+; CHECK-SAME: ptr addrspace(1) nocapture readonly [[ARG:%.*]], ptr addrspace(1) nocapture [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[BB:.*:]]
+; CHECK-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP3]], align 16
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG1]], i64 [[TMP2]]
+; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr addrspace(1) [[TMP5]], align 16
+; CHECK-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP2]], 1
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP7]], align 16
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG1]], i64 [[TMP6]]
+; CHECK-NEXT: store <4 x i32> [[TMP8]], ptr addrspace(1) [[TMP9]], align 16
+; CHECK-NEXT: ret void
+;
+bb:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp2 = zext i32 %tmp to i64
+ %tmp3 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i64 %tmp2
+ %tmp4 = load <4 x i32>, ptr addrspace(1) %tmp3, align 16
+ %tmp5 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 %tmp2
+ store <4 x i32> %tmp4, ptr addrspace(1) %tmp5, align 16
+ %tmp6 = add nuw nsw i64 %tmp2, 1
+ %tmp7 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i64 %tmp6
+ %tmp8 = load <4 x i32>, ptr addrspace(1) %tmp7, align 16
+ %tmp9 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 %tmp6
+ store <4 x i32> %tmp8, ptr addrspace(1) %tmp9, align 16
+ ret void
+}
+
+; GCN-LABEL: {{^}}kernel_call_test_membound_func:
+; GCN: MemoryBound: 1
+; GCN: WaveLimiterHint : 1
+define amdgpu_kernel void @kernel_call_test_membound_func(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1) {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_call_test_membound_func(
+; CHECK-SAME: ptr addrspace(1) nocapture readonly [[ARG:%.*]], ptr addrspace(1) nocapture [[ARG1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @test_membound_func(ptr addrspace(1) nocapture readonly [[ARG]], ptr addrspace(1) nocapture [[ARG1]])
+; CHECK-NEXT: ret void
+;
+ call void @test_membound_func(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1)
+ ret void
+}
+
+; TODO: Probably should assume yes?
+; GCN-LABEL: {{^}}kernel_indirect_call:
+; GCN: MemoryBound: 0
+; GCN: WaveLimiterHint : 0
+define amdgpu_kernel void @kernel_indirect_call(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1, ptr %fptr) {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_indirect_call(
+; CHECK-SAME: ptr addrspace(1) nocapture readonly [[ARG:%.*]], ptr addrspace(1) nocapture [[ARG1:%.*]], ptr [[FPTR:%.*]]) {
+; CHECK-NEXT: call void [[FPTR]](ptr addrspace(1) nocapture readonly [[ARG]], ptr addrspace(1) nocapture [[ARG1]])
+; CHECK-NEXT: ret void
+;
+ call void %fptr(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1)
+ ret void
+}
+
+declare void @extern()
+
+define void @maybe_recursive_test_membound_func(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1) {
+; CHECK-LABEL: define void @maybe_recursive_test_membound_func(
+; CHECK-SAME: ptr addrspace(1) nocapture readonly [[ARG:%.*]], ptr addrspace(1) nocapture [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[BB:.*:]]
+; CHECK-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP3]], align 16
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG1]], i64 [[TMP2]]
+; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr addrspace(1) [[TMP5]], align 16
+; CHECK-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP2]], 1
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP7]], align 16
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG1]], i64 [[TMP6]]
+; CHECK-NEXT: store <4 x i32> [[TMP8]], ptr addrspace(1) [[TMP9]], align 16
+; CHECK-NEXT: call void @extern()
+; CHECK-NEXT: ret void
+;
+bb:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp2 = zext i32 %tmp to i64
+ %tmp3 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i64 %tmp2
+ %tmp4 = load <4 x i32>, ptr addrspace(1) %tmp3, align 16
+ %tmp5 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 %tmp2
+ store <4 x i32> %tmp4, ptr addrspace(1) %tmp5, align 16
+ %tmp6 = add nuw nsw i64 %tmp2, 1
+ %tmp7 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i64 %tmp6
+ %tmp8 = load <4 x i32>, ptr addrspace(1) %tmp7, align 16
+ %tmp9 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 %tmp6
+ store <4 x i32> %tmp8, ptr addrspace(1) %tmp9, align 16
+ call void @extern()
+ ret void
+}
+
+; GCN-LABEL: {{^}}kernel_call_maybe_recursive_test_membound_func:
+; GCN: MemoryBound: 1
+; GCN: WaveLimiterHint : 1
+define amdgpu_kernel void @kernel_call_maybe_recursive_test_membound_func(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1, ptr %fptr) {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_call_maybe_recursive_test_membound_func(
+; CHECK-SAME: ptr addrspace(1) nocapture readonly [[ARG:%.*]], ptr addrspace(1) nocapture [[ARG1:%.*]], ptr [[FPTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @maybe_recursive_test_membound_func(ptr addrspace(1) nocapture readonly [[ARG]], ptr addrspace(1) nocapture [[ARG1]])
+; CHECK-NEXT: ret void
+;
+ call void @maybe_recursive_test_membound_func(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1)
+ ret void
+}
+
+define void @mutually_recursive_test_membound_func_0(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1) {
+; CHECK-LABEL: define void @mutually_recursive_test_membound_func_0(
+; CHECK-SAME: ptr addrspace(1) nocapture readonly [[ARG:%.*]], ptr addrspace(1) nocapture [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP3]], align 16
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG1]], i64 [[TMP2]]
+; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr addrspace(1) [[TMP5]], align 16
+; CHECK-NEXT: call void @mutually_recursive_test_membound_func_0(ptr addrspace(1) nocapture readonly [[ARG]], ptr addrspace(1) nocapture [[ARG1]])
+; CHECK-NEXT: ret void
+;
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %tmp2 = zext i32 %tmp to i64
+ %tmp3 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i64 %tmp2
+ %tmp4 = load <4 x i32>, ptr addrspace(1) %tmp3, align 16
+ %tmp5 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 %tmp2
+ store <4 x i32> %tmp4, ptr addrspace(1) %tmp5, align 16
+ call void @mutually_recursive_test_membound_func_0(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1)
+ ret void
+}
+
+define void @mutually_recursive_test_membound_func_1(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1) {
+; CHECK-LABEL: define void @mutually_recursive_test_membound_...
[truncated]
Review comment on the added line "; TODO: Probably should assume yes?":
No: setting membound allows scheduling to be pessimized, so the safe assumption is that the function is not memory bound.
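For context on what this bit feeds, the perf-hint analysis records its verdict as function attributes, which is what the #[[ATTR0]]/#[[ATTR1]] groups in the generated checks refer to. A minimal illustrative IR sketch follows; the attribute spellings "amdgpu-memory-bound" and "amdgpu-wave-limiter" are assumptions about the pass's output, not copied from the truncated patch above:

; Illustrative only: one function assumed to be marked by the pass and one
; left unmarked. The attribute names below are assumptions, not from this PR.
define void @assumed_marked() #0 {
  ret void
}

define void @assumed_unmarked() {
  ret void
}

attributes #0 = { "amdgpu-memory-bound"="true" "amdgpu-wave-limiter"="true" }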
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/30/builds/3774
This test has hardly any test coverage, and no IR tests. Add a few
more tests involving calls, and add some IR checks. This pass needs
a lot of work to improve the test coverage, and to actually use
the cost model instead of making up its own accounting scheme.