AMDGPU: Try to add some more amdgpu-perf-hint tests #102644


Merged
merged 1 commit into main on Aug 11, 2024

Conversation

@arsenm (Contributor) commented Aug 9, 2024

This pass has hardly any test coverage, and no IR tests. Add a few
more tests involving calls, and add some IR checks. The pass still needs
a lot of work to improve its coverage, and to actually use
the cost model instead of inventing its own accounting scheme.
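For reference, the IR check lines in the patch below are autogenerated by the UTC script named in the test's NOTE line, and the new RUN line can be exercised by hand. A minimal sketch of both, assuming a local build in build/ (the build/bin paths are an assumption, not part of the patch):

  # Regenerate the CHECK lines (mirrors the test's UTC_ARGS).
  llvm/utils/update_test_checks.py --opt-binary=build/bin/opt \
    --check-globals all llvm/test/CodeGen/AMDGPU/perfhint.ll

  # Run the new IR-level RUN line directly.
  build/bin/opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-perf-hint \
    < llvm/test/CodeGen/AMDGPU/perfhint.ll \
    | build/bin/FileCheck -check-prefix=CHECK llvm/test/CodeGen/AMDGPU/perfhint.ll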

@llvmbot (Member) commented Aug 9, 2024

@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)


Patch is 22.61 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/102644.diff

1 file affected:

  • (modified) llvm/test/CodeGen/AMDGPU/perfhint.ll (+321-2)
diff --git a/llvm/test/CodeGen/AMDGPU/perfhint.ll b/llvm/test/CodeGen/AMDGPU/perfhint.ll
index 9cf7bd4edd20d..77e0f46a3d457 100644
--- a/llvm/test/CodeGen/AMDGPU/perfhint.ll
+++ b/llvm/test/CodeGen/AMDGPU/perfhint.ll
@@ -1,9 +1,27 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-perf-hint < %s | FileCheck -check-prefix=CHECK %s
 ; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}test_membound:
 ; GCN: MemoryBound: 1
 ; GCN: WaveLimiterHint : 1
 define amdgpu_kernel void @test_membound(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1) {
+; CHECK-LABEL: define amdgpu_kernel void @test_membound(
+; CHECK-SAME: ptr addrspace(1) nocapture readonly [[ARG:%.*]], ptr addrspace(1) nocapture [[ARG1:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP3]], align 16
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG1]], i64 [[TMP2]]
+; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr addrspace(1) [[TMP5]], align 16
+; CHECK-NEXT:    [[TMP6:%.*]] = add nuw nsw i64 [[TMP2]], 1
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP7]], align 16
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG1]], i64 [[TMP6]]
+; CHECK-NEXT:    store <4 x i32> [[TMP8]], ptr addrspace(1) [[TMP9]], align 16
+; CHECK-NEXT:    ret void
+;
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp2 = zext i32 %tmp to i64
@@ -22,8 +40,64 @@ bb:
 ; GCN-LABEL: {{^}}test_membound_1:
 ; GCN: MemoryBound: 1
 define amdgpu_kernel void @test_membound_1(ptr addrspace(1) nocapture readonly %ptr.0,
-                                           ptr addrspace(1) nocapture %ptr.1,
-                                           <2 x double> %arg.0, i32 %arg.1, <4 x double> %arg.2) {
+; CHECK-LABEL: define amdgpu_kernel void @test_membound_1(
+; CHECK-SAME: ptr addrspace(1) nocapture readonly [[PTR_0:%.*]], ptr addrspace(1) nocapture [[PTR_1:%.*]], <2 x double> [[ARG_0:%.*]], i32 [[ARG_1:%.*]], <4 x double> [[ARG_2:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:  [[BB_ENTRY:.*:]]
+; CHECK-NEXT:    [[ID_32:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[ID_0:%.*]] = zext i32 [[ID_32]] to i64
+; CHECK-NEXT:    [[GEP_0:%.*]] = getelementptr inbounds <2 x double>, ptr addrspace(1) [[PTR_0]], i64 [[ID_0]]
+; CHECK-NEXT:    [[LD_0:%.*]] = load <2 x double>, ptr addrspace(1) [[GEP_0]], align 16
+; CHECK-NEXT:    [[ADD_0:%.*]] = fadd <2 x double> [[ARG_0]], [[LD_0]]
+; CHECK-NEXT:    [[ID_1:%.*]] = add nuw nsw i64 [[ID_0]], 1
+; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds <2 x double>, ptr addrspace(1) [[PTR_0]], i64 [[ID_1]]
+; CHECK-NEXT:    [[LD_1:%.*]] = load <2 x double>, ptr addrspace(1) [[GEP_1]], align 16
+; CHECK-NEXT:    [[ADD_1:%.*]] = fadd <2 x double> [[ADD_0]], [[LD_1]]
+; CHECK-NEXT:    [[ID_2:%.*]] = add nuw nsw i64 [[ID_0]], 2
+; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr inbounds <2 x double>, ptr addrspace(1) [[PTR_0]], i64 [[ID_2]]
+; CHECK-NEXT:    [[LD_2:%.*]] = load <2 x double>, ptr addrspace(1) [[GEP_2]], align 16
+; CHECK-NEXT:    [[ADD_2:%.*]] = fadd <2 x double> [[ADD_1]], [[LD_2]]
+; CHECK-NEXT:    [[ID_3:%.*]] = add nuw nsw i64 [[ID_0]], 3
+; CHECK-NEXT:    [[GEP_3:%.*]] = getelementptr inbounds <2 x double>, ptr addrspace(1) [[PTR_0]], i64 [[ID_3]]
+; CHECK-NEXT:    [[LD_3:%.*]] = load <2 x double>, ptr addrspace(1) [[GEP_3]], align 16
+; CHECK-NEXT:    [[ADD_3:%.*]] = fadd <2 x double> [[ADD_2]], [[LD_3]]
+; CHECK-NEXT:    [[ID_4:%.*]] = add nuw nsw i64 [[ID_0]], 4
+; CHECK-NEXT:    [[GEP_4:%.*]] = getelementptr inbounds <2 x double>, ptr addrspace(1) [[PTR_0]], i64 [[ID_4]]
+; CHECK-NEXT:    [[LD_4:%.*]] = load <2 x double>, ptr addrspace(1) [[GEP_4]], align 16
+; CHECK-NEXT:    [[ADD_4:%.*]] = fadd <2 x double> [[ADD_3]], [[LD_4]]
+; CHECK-NEXT:    store <2 x double> [[ADD_4]], ptr addrspace(1) [[PTR_1]], align 16
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[ARG_1]], 0
+; CHECK-NEXT:    br i1 [[COND]], label %[[BB_TRUE:.*]], label %[[BB_RET:.*]]
+; CHECK:       [[BB_TRUE]]:
+; CHECK-NEXT:    [[I0_ARG_0:%.*]] = extractelement <2 x double> [[ARG_0]], i32 0
+; CHECK-NEXT:    [[I1_ARG_0:%.*]] = extractelement <2 x double> [[ARG_0]], i32 1
+; CHECK-NEXT:    [[ADD_1_0:%.*]] = fadd double [[I0_ARG_0]], [[I1_ARG_0]]
+; CHECK-NEXT:    [[I0_ARG_2:%.*]] = extractelement <4 x double> [[ARG_2]], i32 0
+; CHECK-NEXT:    [[I1_ARG_2:%.*]] = extractelement <4 x double> [[ARG_2]], i32 1
+; CHECK-NEXT:    [[ADD_1_1:%.*]] = fadd double [[I0_ARG_2]], [[I1_ARG_2]]
+; CHECK-NEXT:    [[ADD_1_2:%.*]] = fadd double [[ADD_1_0]], [[ADD_1_1]]
+; CHECK-NEXT:    [[I2_ARG_2:%.*]] = extractelement <4 x double> [[ARG_2]], i32 2
+; CHECK-NEXT:    [[I3_ARG_2:%.*]] = extractelement <4 x double> [[ARG_2]], i32 3
+; CHECK-NEXT:    [[ADD_1_3:%.*]] = fadd double [[I2_ARG_2]], [[I3_ARG_2]]
+; CHECK-NEXT:    [[ADD_1_4:%.*]] = fadd double [[ADD_1_2]], [[ADD_1_3]]
+; CHECK-NEXT:    [[I0_ADD_0:%.*]] = extractelement <2 x double> [[ADD_0]], i32 0
+; CHECK-NEXT:    [[I1_ADD_0:%.*]] = extractelement <2 x double> [[ADD_0]], i32 1
+; CHECK-NEXT:    [[ADD_1_5:%.*]] = fadd double [[I0_ADD_0]], [[I1_ADD_0]]
+; CHECK-NEXT:    [[ADD_1_6:%.*]] = fadd double [[ADD_1_4]], [[ADD_1_5]]
+; CHECK-NEXT:    [[I0_ADD_1:%.*]] = extractelement <2 x double> [[ADD_1]], i32 0
+; CHECK-NEXT:    [[I1_ADD_1:%.*]] = extractelement <2 x double> [[ADD_1]], i32 1
+; CHECK-NEXT:    [[ADD_1_7:%.*]] = fadd double [[I0_ADD_1]], [[I1_ADD_1]]
+; CHECK-NEXT:    [[ADD_1_8:%.*]] = fadd double [[ADD_1_6]], [[ADD_1_7]]
+; CHECK-NEXT:    [[I0_ADD_2:%.*]] = extractelement <2 x double> [[ADD_2]], i32 0
+; CHECK-NEXT:    [[I1_ADD_2:%.*]] = extractelement <2 x double> [[ADD_2]], i32 1
+; CHECK-NEXT:    [[ADD_1_9:%.*]] = fadd double [[I0_ADD_2]], [[I1_ADD_2]]
+; CHECK-NEXT:    [[ADD_1_10:%.*]] = fadd double [[ADD_1_8]], [[ADD_1_9]]
+; CHECK-NEXT:    store double [[ADD_1_8]], ptr addrspace(1) [[PTR_1]], align 8
+; CHECK-NEXT:    br label %[[BB_RET]]
+; CHECK:       [[BB_RET]]:
+; CHECK-NEXT:    ret void
+;
+  ptr addrspace(1) nocapture %ptr.1,
+  <2 x double> %arg.0, i32 %arg.1, <4 x double> %arg.2) {
 bb.entry:
   %id.32 = tail call i32 @llvm.amdgcn.workitem.id.x()
   %id.0 = zext i32 %id.32 to i64
@@ -91,6 +165,26 @@ bb.ret:
 ; GCN: MemoryBound: 0
 ; GCN: WaveLimiterHint : 1
 define amdgpu_kernel void @test_large_stride(ptr addrspace(1) nocapture %arg) {
+; CHECK-LABEL: define amdgpu_kernel void @test_large_stride(
+; CHECK-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    [[TMP:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 4096
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP]], align 4
+; CHECK-NEXT:    [[MUL1:%.*]] = mul i32 [[TMP1]], [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1
+; CHECK-NEXT:    store i32 [[MUL1]], ptr addrspace(1) [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 8192
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(1) [[TMP3]], align 4
+; CHECK-NEXT:    [[MUL4:%.*]] = mul i32 [[TMP4]], [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 2
+; CHECK-NEXT:    store i32 [[MUL4]], ptr addrspace(1) [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 12288
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr addrspace(1) [[TMP6]], align 4
+; CHECK-NEXT:    [[MUL7:%.*]] = mul i32 [[TMP7]], [[TMP7]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 3
+; CHECK-NEXT:    store i32 [[MUL7]], ptr addrspace(1) [[TMP8]], align 4
+; CHECK-NEXT:    ret void
+;
 bb:
   %tmp = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 4096
   %tmp1 = load i32, ptr addrspace(1) %tmp, align 4
@@ -114,6 +208,35 @@ bb:
 ; GCN: MemoryBound: 1
 ; GCN: WaveLimiterHint : 1
 define amdgpu_kernel void @test_indirect(ptr addrspace(1) nocapture %arg) {
+; CHECK-LABEL: define amdgpu_kernel void @test_indirect(
+; CHECK-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    [[TMP:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 2
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 3
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 4
+; CHECK-NEXT:    store i32 [[TMP8]], ptr addrspace(1) [[ARG]], align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1
+; CHECK-NEXT:    [[TMP10:%.*]] = sext i32 [[TMP9]] to i64
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr addrspace(1) [[TMP11]], align 4
+; CHECK-NEXT:    store i32 [[TMP12]], ptr addrspace(1) [[TMP]], align 4
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
+; CHECK-NEXT:    [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr addrspace(1) [[TMP15]], align 4
+; CHECK-NEXT:    store i32 [[TMP16]], ptr addrspace(1) [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
+; CHECK-NEXT:    [[TMP18:%.*]] = sext i32 [[TMP17]] to i64
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 [[TMP18]]
+; CHECK-NEXT:    [[TMP20:%.*]] = load i32, ptr addrspace(1) [[TMP19]], align 4
+; CHECK-NEXT:    store i32 [[TMP20]], ptr addrspace(1) [[TMP2]], align 4
+; CHECK-NEXT:    ret void
+;
 bb:
   %tmp = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 1
   %tmp1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 2
@@ -146,6 +269,27 @@ bb:
 ; GCN: MemoryBound: 0
 ; GCN: WaveLimiterHint : 0
 define amdgpu_kernel void @test_indirect_through_phi(ptr addrspace(1) %arg) {
+; CHECK-LABEL: define amdgpu_kernel void @test_indirect_through_phi(
+; CHECK-SAME: ptr addrspace(1) [[ARG:%.*]]) {
+; CHECK-NEXT:  [[BB:.*]]:
+; CHECK-NEXT:    [[LOAD:%.*]] = load float, ptr addrspace(1) [[ARG]], align 8
+; CHECK-NEXT:    [[LOAD_F:%.*]] = bitcast float [[LOAD]] to i32
+; CHECK-NEXT:    [[N:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    br label %[[BB1:.*]]
+; CHECK:       [[BB1]]:
+; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ [[LOAD_F]], %[[BB]] ], [ [[AND2:%.*]], %[[BB1]] ]
+; CHECK-NEXT:    [[IND:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[INC2:%.*]], %[[BB1]] ]
+; CHECK-NEXT:    [[AND1:%.*]] = and i32 [[PHI]], [[N]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[ARG]], i32 [[AND1]]
+; CHECK-NEXT:    store float [[LOAD]], ptr addrspace(1) [[GEP]], align 4
+; CHECK-NEXT:    [[INC1:%.*]] = add nsw i32 [[PHI]], 1310720
+; CHECK-NEXT:    [[AND2]] = and i32 [[INC1]], [[N]]
+; CHECK-NEXT:    [[INC2]] = add nuw nsw i32 [[IND]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[INC2]], 1024
+; CHECK-NEXT:    br i1 [[CMP]], label %[[BB2:.*]], label %[[BB1]]
+; CHECK:       [[BB2]]:
+; CHECK-NEXT:    ret void
+;
 bb:
   %load = load float, ptr addrspace(1) %arg, align 8
   %load.f = bitcast float %load to i32
@@ -168,4 +312,179 @@ bb2:                                              ; preds = %bb1
   ret void
 }
 
+define void @test_membound_func(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1) {
+; CHECK-LABEL: define void @test_membound_func(
+; CHECK-SAME: ptr addrspace(1) nocapture readonly [[ARG:%.*]], ptr addrspace(1) nocapture [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP3]], align 16
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG1]], i64 [[TMP2]]
+; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr addrspace(1) [[TMP5]], align 16
+; CHECK-NEXT:    [[TMP6:%.*]] = add nuw nsw i64 [[TMP2]], 1
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP7]], align 16
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG1]], i64 [[TMP6]]
+; CHECK-NEXT:    store <4 x i32> [[TMP8]], ptr addrspace(1) [[TMP9]], align 16
+; CHECK-NEXT:    ret void
+;
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp2 = zext i32 %tmp to i64
+  %tmp3 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i64 %tmp2
+  %tmp4 = load <4 x i32>, ptr addrspace(1) %tmp3, align 16
+  %tmp5 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 %tmp2
+  store <4 x i32> %tmp4, ptr addrspace(1) %tmp5, align 16
+  %tmp6 = add nuw nsw i64 %tmp2, 1
+  %tmp7 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i64 %tmp6
+  %tmp8 = load <4 x i32>, ptr addrspace(1) %tmp7, align 16
+  %tmp9 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 %tmp6
+  store <4 x i32> %tmp8, ptr addrspace(1) %tmp9, align 16
+  ret void
+}
+
+; GCN-LABEL: {{^}}kernel_call_test_membound_func:
+; GCN: MemoryBound: 1
+; GCN: WaveLimiterHint : 1
+define amdgpu_kernel void @kernel_call_test_membound_func(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1) {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_call_test_membound_func(
+; CHECK-SAME: ptr addrspace(1) nocapture readonly [[ARG:%.*]], ptr addrspace(1) nocapture [[ARG1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    call void @test_membound_func(ptr addrspace(1) nocapture readonly [[ARG]], ptr addrspace(1) nocapture [[ARG1]])
+; CHECK-NEXT:    ret void
+;
+  call void @test_membound_func(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1)
+  ret void
+}
+
+; TODO: Probably should assume yes?
+; GCN-LABEL: {{^}}kernel_indirect_call:
+; GCN: MemoryBound: 0
+; GCN: WaveLimiterHint : 0
+define amdgpu_kernel void @kernel_indirect_call(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1, ptr %fptr) {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_indirect_call(
+; CHECK-SAME: ptr addrspace(1) nocapture readonly [[ARG:%.*]], ptr addrspace(1) nocapture [[ARG1:%.*]], ptr [[FPTR:%.*]]) {
+; CHECK-NEXT:    call void [[FPTR]](ptr addrspace(1) nocapture readonly [[ARG]], ptr addrspace(1) nocapture [[ARG1]])
+; CHECK-NEXT:    ret void
+;
+  call void %fptr(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1)
+  ret void
+}
+
+declare void @extern()
+
+define void @maybe_recursive_test_membound_func(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1) {
+; CHECK-LABEL: define void @maybe_recursive_test_membound_func(
+; CHECK-SAME: ptr addrspace(1) nocapture readonly [[ARG:%.*]], ptr addrspace(1) nocapture [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP3]], align 16
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG1]], i64 [[TMP2]]
+; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr addrspace(1) [[TMP5]], align 16
+; CHECK-NEXT:    [[TMP6:%.*]] = add nuw nsw i64 [[TMP2]], 1
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP7]], align 16
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG1]], i64 [[TMP6]]
+; CHECK-NEXT:    store <4 x i32> [[TMP8]], ptr addrspace(1) [[TMP9]], align 16
+; CHECK-NEXT:    call void @extern()
+; CHECK-NEXT:    ret void
+;
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp2 = zext i32 %tmp to i64
+  %tmp3 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i64 %tmp2
+  %tmp4 = load <4 x i32>, ptr addrspace(1) %tmp3, align 16
+  %tmp5 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 %tmp2
+  store <4 x i32> %tmp4, ptr addrspace(1) %tmp5, align 16
+  %tmp6 = add nuw nsw i64 %tmp2, 1
+  %tmp7 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i64 %tmp6
+  %tmp8 = load <4 x i32>, ptr addrspace(1) %tmp7, align 16
+  %tmp9 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 %tmp6
+  store <4 x i32> %tmp8, ptr addrspace(1) %tmp9, align 16
+  call void @extern()
+  ret void
+}
+
+; GCN-LABEL: {{^}}kernel_call_maybe_recursive_test_membound_func:
+; GCN: MemoryBound: 1
+; GCN: WaveLimiterHint : 1
+define amdgpu_kernel void @kernel_call_maybe_recursive_test_membound_func(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1, ptr %fptr) {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_call_maybe_recursive_test_membound_func(
+; CHECK-SAME: ptr addrspace(1) nocapture readonly [[ARG:%.*]], ptr addrspace(1) nocapture [[ARG1:%.*]], ptr [[FPTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    call void @maybe_recursive_test_membound_func(ptr addrspace(1) nocapture readonly [[ARG]], ptr addrspace(1) nocapture [[ARG1]])
+; CHECK-NEXT:    ret void
+;
+  call void @maybe_recursive_test_membound_func(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1)
+  ret void
+}
+
+define void @mutually_recursive_test_membound_func_0(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1) {
+; CHECK-LABEL: define void @mutually_recursive_test_membound_func_0(
+; CHECK-SAME: ptr addrspace(1) nocapture readonly [[ARG:%.*]], ptr addrspace(1) nocapture [[ARG1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP3]], align 16
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG1]], i64 [[TMP2]]
+; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr addrspace(1) [[TMP5]], align 16
+; CHECK-NEXT:    call void @mutually_recursive_test_membound_func_0(ptr addrspace(1) nocapture readonly [[ARG]], ptr addrspace(1) nocapture [[ARG1]])
+; CHECK-NEXT:    ret void
+;
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp2 = zext i32 %tmp to i64
+  %tmp3 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i64 %tmp2
+  %tmp4 = load <4 x i32>, ptr addrspace(1) %tmp3, align 16
+  %tmp5 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 %tmp2
+  store <4 x i32> %tmp4, ptr addrspace(1) %tmp5, align 16
+  call void @mutually_recursive_test_membound_func_0(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1)
+  ret void
+}
+
+define void @mutually_recursive_test_membound_func_1(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1) {
+; CHECK-LABEL: define void @mutually_recursive_test_membound_...
[truncated]

@arsenm marked this pull request as ready for review on August 9, 2024 16:51
ret void
}

; TODO: Probably should assume yes?
A Collaborator commented on this TODO:
No: setting membound allows scheduling to be pessimised, so the safe assumption is that it is not memory bound.
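The bits under discussion are printed in llc's output, which is what the GCN check lines in the test match against. A quick way to inspect them locally, assuming llc from a local build is on PATH:

  # MemoryBound / WaveLimiterHint appear in the asm output the GCN checks scan.
  llc -mtriple=amdgcn llvm/test/CodeGen/AMDGPU/perfhint.ll -o - \
    | grep -E 'MemoryBound|WaveLimiterHint'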

@arsenm (Contributor, Author) commented Aug 11, 2024

Merge activity

  • Aug 11, 7:00 AM EDT: @arsenm started a stack merge that includes this pull request via Graphite.
  • Aug 11, 7:02 AM EDT: Graphite rebased this pull request as part of a merge.
  • Aug 11, 7:05 AM EDT: Graphite rebased this pull request as part of a merge.
  • Aug 11, 7:07 AM EDT: @arsenm merged this pull request with Graphite.

@arsenm force-pushed the users/arsenm/add-perfhint-tests branch from a9c57cd to 7feda95 on August 11, 2024 11:01
@arsenm force-pushed the users/arsenm/add-perfhint-tests branch from 7feda95 to 13dd6d2 on August 11, 2024 11:05
@arsenm merged commit 2b0a88f into main on Aug 11, 2024 (4 of 6 checks passed)
@arsenm deleted the users/arsenm/add-perfhint-tests branch on August 11, 2024 11:07
@llvm-ci (Collaborator) commented Aug 11, 2024

LLVM Buildbot has detected a new failure on builder openmp-offload-amdgpu-runtime running on omp-vega20-0 while building llvm at step 6 "test-openmp".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/30/builds/3774

Here is the relevant piece of the build log for reference:

Step 6 (test-openmp) failure: test (failure)
******************** TEST 'libomp :: tasking/issue-94260-2.c' FAILED ********************
Exit Code: -11

Command Output (stdout):
--
# RUN: at line 1
/home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./bin/clang -fopenmp   -I /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -I /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.src/openmp/runtime/test -L /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src  -fno-omit-frame-pointer -I /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.src/openmp/runtime/test/ompt /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.src/openmp/runtime/test/tasking/issue-94260-2.c -o /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/test/tasking/Output/issue-94260-2.c.tmp -lm -latomic && /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/test/tasking/Output/issue-94260-2.c.tmp
# executed command: /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./bin/clang -fopenmp -I /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -I /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.src/openmp/runtime/test -L /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -fno-omit-frame-pointer -I /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.src/openmp/runtime/test/ompt /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.src/openmp/runtime/test/tasking/issue-94260-2.c -o /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/test/tasking/Output/issue-94260-2.c.tmp -lm -latomic
# note: command had no output on stdout or stderr
# executed command: /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/test/tasking/Output/issue-94260-2.c.tmp
# note: command had no output on stdout or stderr
# error: command failed with exit status: -11

--

********************

