Skip to content

[llvm][AMDGPU] Fold llvm.amdgcn.wavefrontsize early #114481

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 22 commits into from
Nov 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
3ba88ce
Add pass to handle AMDGCN pseudo-intrinsics (abstract placeholders fo…
AlexVlx Oct 31, 2024
1376596
Merge branch 'main' of https://github.com/llvm/llvm-project into hand…
AlexVlx Oct 31, 2024
826c291
Implement review feedback.
AlexVlx Nov 1, 2024
ab6f5a2
Do not fold early for `generic` mcpu.
AlexVlx Nov 1, 2024
f8705fb
Fix formatting (again).
AlexVlx Nov 1, 2024
ed870a8
Merge branch 'main' of https://github.com/llvm/llvm-project into hand…
AlexVlx Nov 1, 2024
f5751a5
Merge branch 'main' of https://github.com/llvm/llvm-project into hand…
AlexVlx Nov 4, 2024
026ed00
Remove pass, fold in InstCombine.
AlexVlx Nov 4, 2024
195decc
Remove leftovers.
AlexVlx Nov 4, 2024
1a7abaf
Remove pass.
AlexVlx Nov 4, 2024
9aed76c
Fix formatting.
AlexVlx Nov 4, 2024
246c22f
Really fix formatting.
AlexVlx Nov 4, 2024
5a11720
Merge branch 'main' of https://github.com/llvm/llvm-project into hand…
AlexVlx Nov 6, 2024
7cf7558
Split tests.
AlexVlx Nov 6, 2024
6a77b8a
Merge branch 'main' of https://github.com/llvm/llvm-project into hand…
AlexVlx Nov 6, 2024
be414a8
Merge branch 'main' of https://github.com/llvm/llvm-project into hand…
AlexVlx Nov 7, 2024
dedc593
Merge branch 'main' of https://github.com/llvm/llvm-project into hand…
AlexVlx Nov 7, 2024
c634b4e
Merge branch 'main' of https://github.com/llvm/llvm-project into hand…
AlexVlx Nov 18, 2024
c7be46f
Merge branch 'handle_wavefrontsize_early' of https://github.com/AlexV…
AlexVlx Nov 18, 2024
ed9f19f
Tweak `generic` mcpu handling.
AlexVlx Nov 18, 2024
d30cb95
Merge branch 'main' of https://github.com/llvm/llvm-project into hand…
AlexVlx Nov 24, 2024
dcfe7be
Use `isWaveSizeKnown` instead of gnarly hack.
AlexVlx Nov 24, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions clang/test/CodeGenOpenCL/builtins-amdgcn.cl
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu tahiti -emit-llvm -o - %s | FileCheck -enable-var-scope --check-prefixes=CHECK-AMDGCN,CHECK %s
// RUN: %clang_cc1 -cl-std=CL2.0 -triple spirv64-amd-amdhsa -emit-llvm -o - %s | FileCheck -enable-var-scope --check-prefix=CHECK %s
// RUN: %clang_cc1 -cl-std=CL2.0 -triple spirv64-amd-amdhsa -emit-llvm -o - %s | FileCheck -enable-var-scope --check-prefixes=CHECK,CHECK-SPIRV %s


#pragma OPENCL EXTENSION cl_khr_fp64 : enable
Expand Down Expand Up @@ -866,7 +866,8 @@ void test_atomic_inc_dec(__attribute__((address_space(3))) uint *lptr, __attribu
// CHECK-LABEL: test_wavefrontsize(
unsigned test_wavefrontsize() {

// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wavefrontsize()
// CHECK-AMDGCN: ret i32 {{[0-9]+}}
// CHECK-SPIRV: {{.*}}call{{.*}} i32 @llvm.amdgcn.wavefrontsize()
return __builtin_amdgcn_wavefrontsize();
}

Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1024,6 +1024,12 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
}
break;
}
case Intrinsic::amdgcn_wavefrontsize: {
if (ST->isWaveSizeKnown())
return IC.replaceInstUsesWith(
II, ConstantInt::get(II.getType(), ST->getWavefrontSize()));
break;
}
case Intrinsic::amdgcn_wqm_vote: {
// wqm_vote is identity when the argument is constant.
if (!isa<Constant>(II.getArgOperand(0)))
Expand Down
29 changes: 1 addition & 28 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wavefrontsize.ll
Original file line number Diff line number Diff line change
Expand Up @@ -4,48 +4,28 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GCN,W32 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,W64 %s

; RUN: opt -O3 -S < %s | FileCheck -check-prefix=OPT %s
; RUN: opt -mtriple=amdgcn-- -O3 -S < %s | FileCheck -check-prefix=OPT %s
; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
; RUN: opt -mtriple=amdgcn-- -passes='default<O3>' -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
; RUN: opt -mtriple=amdgcn-- -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -O3 -S < %s | FileCheck -check-prefix=OPT %s
; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s
; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize32 -S < %s | FileCheck -check-prefix=OPT %s
; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -O3 -mattr=+wavefrontsize64 -S < %s | FileCheck -check-prefix=OPT %s

; GCN-LABEL: {{^}}fold_wavefrontsize:
; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize(

; W32: v_mov_b32_e32 [[V:v[0-9]+]], 32
; W64: v_mov_b32_e32 [[V:v[0-9]+]], 64
; GCN: store_{{dword|b32}} v{{.+}}, [[V]]

; OPT: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
; OPT: store i32 %tmp, ptr addrspace(1) %arg, align 4
; OPT-NEXT: ret void

define amdgpu_kernel void @fold_wavefrontsize(ptr addrspace(1) nocapture %arg) {

bb:
%tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
store i32 %tmp, ptr addrspace(1) %arg, align 4
ret void
}

; GCN-LABEL: {{^}}fold_and_optimize_wavefrontsize:
; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(

; W32: v_mov_b32_e32 [[V:v[0-9]+]], 1{{$}}
; W64: v_mov_b32_e32 [[V:v[0-9]+]], 2{{$}}
; GCN-NOT: cndmask
; GCN: store_{{dword|b32}} v{{.+}}, [[V]]

; OPT: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
; OPT: %tmp1 = icmp ugt i32 %tmp, 32
; OPT: %tmp2 = select i1 %tmp1, i32 2, i32 1
; OPT: store i32 %tmp2, ptr addrspace(1) %arg
; OPT-NEXT: ret void

define amdgpu_kernel void @fold_and_optimize_wavefrontsize(ptr addrspace(1) nocapture %arg) {
bb:
Expand All @@ -57,13 +37,6 @@ bb:
}

; GCN-LABEL: {{^}}fold_and_optimize_if_wavefrontsize:
; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(

; OPT: bb:
; OPT: %tmp = tail call i32 @llvm.amdgcn.wavefrontsize()
; OPT: %tmp1 = icmp ugt i32 %tmp, 32
; OPT: bb3:
; OPT-NEXT: ret void

define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(ptr addrspace(1) nocapture %arg) {
bb:
Expand Down
114 changes: 114 additions & 0 deletions llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wavefrontsize.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -mtriple=amdgcn-- -passes=instcombine -S < %s | FileCheck -check-prefix=OPT %s
; RUN: opt -mtriple=amdgcn-- -mattr=+wavefrontsize32 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W32 %s
; RUN: opt -mtriple=amdgcn-- -mattr=+wavefrontsize64 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s
; RUN: opt -mtriple=amdgcn-- -mcpu=tonga -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s
; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize32 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W32 %s
; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize64 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s
; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize32 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W32 %s
; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize64 -passes=instcombine -S < %s | FileCheck -check-prefix=OPT-W64 %s

; Direct use of the intrinsic: the result of llvm.amdgcn.wavefrontsize is
; stored straight to memory. In the run with neither -mcpu nor a wavefrontsize
; feature (plain OPT prefix) the call must be left in place; in the OPT-W32 and
; OPT-W64 runs instcombine is expected to replace it with the constant 32 or 64.
define amdgpu_kernel void @fold_wavefrontsize(ptr addrspace(1) nocapture %arg) {
; OPT-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
; OPT-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) {
; OPT-NEXT: [[BB:.*:]]
; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR1:[0-9]+]]
; OPT-NEXT: store i32 [[TMP]], ptr addrspace(1) [[ARG]], align 4
; OPT-NEXT: ret void
;
; OPT-W32-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
; OPT-W32-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
; OPT-W32-NEXT: [[BB:.*:]]
; OPT-W32-NEXT: store i32 32, ptr addrspace(1) [[ARG]], align 4
; OPT-W32-NEXT: ret void
;
; OPT-W64-LABEL: define amdgpu_kernel void @fold_wavefrontsize(
; OPT-W64-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
; OPT-W64-NEXT: [[BB:.*:]]
; OPT-W64-NEXT: store i32 64, ptr addrspace(1) [[ARG]], align 4
; OPT-W64-NEXT: ret void
;
bb:
%tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
store i32 %tmp, ptr addrspace(1) %arg, align 4
ret void
}

; The intrinsic feeds an `icmp ugt ..., 32` whose result selects between 2 and 1.
; When the wave size is known, the whole compare/select chain is expected to
; collapse to a single constant store (1 for the OPT-W32 runs, 2 for the OPT-W64
; runs). In the plain OPT run the call, compare and select must all survive.
define amdgpu_kernel void @fold_and_optimize_wavefrontsize(ptr addrspace(1) nocapture %arg) {
; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
; OPT-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) {
; OPT-NEXT: [[BB:.*:]]
; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR1]]
; OPT-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32
; OPT-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 2, i32 1
; OPT-NEXT: store i32 [[TMP2]], ptr addrspace(1) [[ARG]], align 4
; OPT-NEXT: ret void
;
; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
; OPT-W32-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] {
; OPT-W32-NEXT: [[BB:.*:]]
; OPT-W32-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4
; OPT-W32-NEXT: ret void
;
; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_wavefrontsize(
; OPT-W64-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] {
; OPT-W64-NEXT: [[BB:.*:]]
; OPT-W64-NEXT: store i32 2, ptr addrspace(1) [[ARG]], align 4
; OPT-W64-NEXT: ret void
;
bb:
%tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
%tmp1 = icmp ugt i32 %tmp, 32
%tmp2 = select i1 %tmp1, i32 2, i32 1
store i32 %tmp2, ptr addrspace(1) %arg
ret void
}

; The intrinsic feeds an `icmp ugt ..., 32` that controls a conditional branch
; guarding a store. When the wave size is known, the branch condition is
; expected to become a constant (false for the OPT-W32 runs, true for the
; OPT-W64 runs). The expected output keeps the block structure; in the OPT-W32
; case the guarded store in bb2 is expected to be gone. In the plain OPT run
; the call and the compare must survive unchanged.
define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(ptr addrspace(1) nocapture %arg) {
; OPT-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
; OPT-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) {
; OPT-NEXT: [[BB:.*:]]
; OPT-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.wavefrontsize() #[[ATTR1]]
; OPT-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[TMP]], 32
; OPT-NEXT: br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB3:.*]]
; OPT: [[BB2]]:
; OPT-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4
; OPT-NEXT: br label %[[BB3]]
; OPT: [[BB3]]:
; OPT-NEXT: ret void
;
; OPT-W32-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
; OPT-W32-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] {
; OPT-W32-NEXT: [[BB:.*:]]
; OPT-W32-NEXT: br i1 false, label %[[BB2:.*]], label %[[BB3:.*]]
; OPT-W32: [[BB2]]:
; OPT-W32-NEXT: br label %[[BB3]]
; OPT-W32: [[BB3]]:
; OPT-W32-NEXT: ret void
;
; OPT-W64-LABEL: define amdgpu_kernel void @fold_and_optimize_if_wavefrontsize(
; OPT-W64-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] {
; OPT-W64-NEXT: [[BB:.*:]]
; OPT-W64-NEXT: br i1 true, label %[[BB2:.*]], label %[[BB3:.*]]
; OPT-W64: [[BB2]]:
; OPT-W64-NEXT: store i32 1, ptr addrspace(1) [[ARG]], align 4
; OPT-W64-NEXT: br label %[[BB3]]
; OPT-W64: [[BB3]]:
; OPT-W64-NEXT: ret void
;
bb:
%tmp = tail call i32 @llvm.amdgcn.wavefrontsize() #0
%tmp1 = icmp ugt i32 %tmp, 32
br i1 %tmp1, label %bb2, label %bb3

bb2: ; preds = %bb
store i32 1, ptr addrspace(1) %arg, align 4
br label %bb3

bb3: ; preds = %bb2, %bb
ret void
}

; Declaration of the intrinsic under test; it takes no arguments and returns i32.
declare i32 @llvm.amdgcn.wavefrontsize() #0

; Attribute group shared by the declaration and every call site in this file.
attributes #0 = { nounwind readnone speculatable }