Skip to content

Commit 89226ec

Browse files
authored
[AMDGPU] Do not widen scalar loads on GFX12 (#78724)
GFX12 has subword scalar loads, so there is no need to widen them.
1 parent aac1d97 commit 89226ec

File tree

3 files changed

+149
-68
lines changed

3 files changed

+149
-68
lines changed

llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,11 @@
1313
//===----------------------------------------------------------------------===//
1414

1515
#include "AMDGPU.h"
16+
#include "AMDGPUTargetMachine.h"
1617
#include "llvm/Analysis/AssumptionCache.h"
1718
#include "llvm/Analysis/UniformityAnalysis.h"
1819
#include "llvm/Analysis/ValueTracking.h"
20+
#include "llvm/CodeGen/TargetPassConfig.h"
1921
#include "llvm/IR/IRBuilder.h"
2022
#include "llvm/IR/InstVisitor.h"
2123
#include "llvm/InitializePasses.h"
@@ -58,6 +60,7 @@ class AMDGPULateCodeGenPrepare
5860
}
5961

6062
void getAnalysisUsage(AnalysisUsage &AU) const override {
63+
AU.addRequired<TargetPassConfig>();
6164
AU.addRequired<AssumptionCacheTracker>();
6265
AU.addRequired<UniformityInfoWrapperPass>();
6366
AU.setPreservesAll();
@@ -90,7 +93,11 @@ bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
9093
if (skipFunction(F))
9194
return false;
9295

93-
// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
96+
const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
97+
const TargetMachine &TM = TPC.getTM<TargetMachine>();
98+
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
99+
if (ST.hasScalarSubwordLoads())
100+
return false;
94101

95102
AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
96103
UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
@@ -181,6 +188,7 @@ bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
181188

182189
INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
183190
"AMDGPU IR late optimizations", false, false)
191+
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
184192
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
185193
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
186194
INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE,

llvm/test/CodeGen/AMDGPU/amdgpu-late-codegenprepare.ll

Lines changed: 55 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,39 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2-
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-late-codegenprepare %s | FileCheck %s
2+
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-late-codegenprepare %s | FileCheck %s -check-prefix=GFX9
3+
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -amdgpu-late-codegenprepare %s | FileCheck %s -check-prefix=GFX12
34

45
; Make sure we don't crash when trying to create a bitcast between
56
; address spaces
67
define amdgpu_kernel void @constant_from_offset_cast_generic_null() {
7-
; CHECK-LABEL: @constant_from_offset_cast_generic_null(
8-
; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) getelementptr (i8, ptr addrspace(4) addrspacecast (ptr null to ptr addrspace(4)), i64 4), align 4
9-
; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 16
10-
; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
11-
; CHECK-NEXT: store i8 [[TMP3]], ptr addrspace(1) undef, align 1
12-
; CHECK-NEXT: ret void
8+
; GFX9-LABEL: @constant_from_offset_cast_generic_null(
9+
; GFX9-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) getelementptr (i8, ptr addrspace(4) addrspacecast (ptr null to ptr addrspace(4)), i64 4), align 4
10+
; GFX9-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 16
11+
; GFX9-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
12+
; GFX9-NEXT: store i8 [[TMP3]], ptr addrspace(1) undef, align 1
13+
; GFX9-NEXT: ret void
14+
;
15+
; GFX12-LABEL: @constant_from_offset_cast_generic_null(
16+
; GFX12-NEXT: [[LOAD:%.*]] = load i8, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) addrspacecast (ptr null to ptr addrspace(4)), i64 6), align 1
17+
; GFX12-NEXT: store i8 [[LOAD]], ptr addrspace(1) undef, align 1
18+
; GFX12-NEXT: ret void
1319
;
1420
%load = load i8, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) addrspacecast (ptr null to ptr addrspace(4)), i64 6), align 1
1521
store i8 %load, ptr addrspace(1) undef
1622
ret void
1723
}
1824

1925
define amdgpu_kernel void @constant_from_offset_cast_global_null() {
20-
; CHECK-LABEL: @constant_from_offset_cast_global_null(
21-
; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) getelementptr (i8, ptr addrspace(4) addrspacecast (ptr addrspace(1) null to ptr addrspace(4)), i64 4), align 4
22-
; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 16
23-
; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
24-
; CHECK-NEXT: store i8 [[TMP3]], ptr addrspace(1) undef, align 1
25-
; CHECK-NEXT: ret void
26+
; GFX9-LABEL: @constant_from_offset_cast_global_null(
27+
; GFX9-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) getelementptr (i8, ptr addrspace(4) addrspacecast (ptr addrspace(1) null to ptr addrspace(4)), i64 4), align 4
28+
; GFX9-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 16
29+
; GFX9-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
30+
; GFX9-NEXT: store i8 [[TMP3]], ptr addrspace(1) undef, align 1
31+
; GFX9-NEXT: ret void
32+
;
33+
; GFX12-LABEL: @constant_from_offset_cast_global_null(
34+
; GFX12-NEXT: [[LOAD:%.*]] = load i8, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) addrspacecast (ptr addrspace(1) null to ptr addrspace(4)), i64 6), align 1
35+
; GFX12-NEXT: store i8 [[LOAD]], ptr addrspace(1) undef, align 1
36+
; GFX12-NEXT: ret void
2637
;
2738
%load = load i8, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) addrspacecast (ptr addrspace(1) null to ptr addrspace(4)), i64 6), align 1
2839
store i8 %load, ptr addrspace(1) undef
@@ -32,36 +43,51 @@ define amdgpu_kernel void @constant_from_offset_cast_global_null() {
3243
@gv = unnamed_addr addrspace(1) global [64 x i8] undef, align 4
3344

3445
define amdgpu_kernel void @constant_from_offset_cast_global_gv() {
35-
; CHECK-LABEL: @constant_from_offset_cast_global_gv(
36-
; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) getelementptr (i8, ptr addrspace(4) addrspacecast (ptr addrspace(1) @gv to ptr addrspace(4)), i64 4), align 4
37-
; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 16
38-
; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
39-
; CHECK-NEXT: store i8 [[TMP3]], ptr addrspace(1) undef, align 1
40-
; CHECK-NEXT: ret void
46+
; GFX9-LABEL: @constant_from_offset_cast_global_gv(
47+
; GFX9-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) getelementptr (i8, ptr addrspace(4) addrspacecast (ptr addrspace(1) @gv to ptr addrspace(4)), i64 4), align 4
48+
; GFX9-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 16
49+
; GFX9-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
50+
; GFX9-NEXT: store i8 [[TMP3]], ptr addrspace(1) undef, align 1
51+
; GFX9-NEXT: ret void
52+
;
53+
; GFX12-LABEL: @constant_from_offset_cast_global_gv(
54+
; GFX12-NEXT: [[LOAD:%.*]] = load i8, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) addrspacecast (ptr addrspace(1) @gv to ptr addrspace(4)), i64 6), align 1
55+
; GFX12-NEXT: store i8 [[LOAD]], ptr addrspace(1) undef, align 1
56+
; GFX12-NEXT: ret void
4157
;
4258
%load = load i8, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) addrspacecast (ptr addrspace(1) @gv to ptr addrspace(4)), i64 6), align 1
4359
store i8 %load, ptr addrspace(1) undef
4460
ret void
4561
}
4662

4763
define amdgpu_kernel void @constant_from_offset_cast_generic_inttoptr() {
48-
; CHECK-LABEL: @constant_from_offset_cast_generic_inttoptr(
49-
; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) getelementptr (i8, ptr addrspace(4) addrspacecast (ptr inttoptr (i64 128 to ptr) to ptr addrspace(4)), i64 4), align 4
50-
; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 16
51-
; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
52-
; CHECK-NEXT: store i8 [[TMP3]], ptr addrspace(1) undef, align 1
53-
; CHECK-NEXT: ret void
64+
; GFX9-LABEL: @constant_from_offset_cast_generic_inttoptr(
65+
; GFX9-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) getelementptr (i8, ptr addrspace(4) addrspacecast (ptr inttoptr (i64 128 to ptr) to ptr addrspace(4)), i64 4), align 4
66+
; GFX9-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 16
67+
; GFX9-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
68+
; GFX9-NEXT: store i8 [[TMP3]], ptr addrspace(1) undef, align 1
69+
; GFX9-NEXT: ret void
70+
;
71+
; GFX12-LABEL: @constant_from_offset_cast_generic_inttoptr(
72+
; GFX12-NEXT: [[LOAD:%.*]] = load i8, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) addrspacecast (ptr inttoptr (i64 128 to ptr) to ptr addrspace(4)), i64 6), align 1
73+
; GFX12-NEXT: store i8 [[LOAD]], ptr addrspace(1) undef, align 1
74+
; GFX12-NEXT: ret void
5475
;
5576
%load = load i8, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) addrspacecast (ptr inttoptr (i64 128 to ptr) to ptr addrspace(4)), i64 6), align 1
5677
store i8 %load, ptr addrspace(1) undef
5778
ret void
5879
}
5980

6081
define amdgpu_kernel void @constant_from_inttoptr() {
61-
; CHECK-LABEL: @constant_from_inttoptr(
62-
; CHECK-NEXT: [[LOAD:%.*]] = load i8, ptr addrspace(4) inttoptr (i64 128 to ptr addrspace(4)), align 4
63-
; CHECK-NEXT: store i8 [[LOAD]], ptr addrspace(1) undef, align 1
64-
; CHECK-NEXT: ret void
82+
; GFX9-LABEL: @constant_from_inttoptr(
83+
; GFX9-NEXT: [[LOAD:%.*]] = load i8, ptr addrspace(4) inttoptr (i64 128 to ptr addrspace(4)), align 4
84+
; GFX9-NEXT: store i8 [[LOAD]], ptr addrspace(1) undef, align 1
85+
; GFX9-NEXT: ret void
86+
;
87+
; GFX12-LABEL: @constant_from_inttoptr(
88+
; GFX12-NEXT: [[LOAD:%.*]] = load i8, ptr addrspace(4) inttoptr (i64 128 to ptr addrspace(4)), align 1
89+
; GFX12-NEXT: store i8 [[LOAD]], ptr addrspace(1) undef, align 1
90+
; GFX12-NEXT: ret void
6591
;
6692
%load = load i8, ptr addrspace(4) inttoptr (i64 128 to ptr addrspace(4)), align 1
6793
store i8 %load, ptr addrspace(1) undef
Lines changed: 85 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
2+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s -check-prefix=GFX9
3+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck %s -check-prefix=GFX12
34

45
; We have an indirect call with a known set of callees, which are
56
; known to not need any special inputs. The ABI still needs to use the
@@ -8,35 +9,63 @@
89
; FIXME: Passing real values for workitem ID, and 0s that can be undef
910

1011
define amdgpu_kernel void @indirect_call_known_no_special_inputs() {
11-
; CHECK-LABEL: indirect_call_known_no_special_inputs:
12-
; CHECK: ; %bb.0: ; %bb
13-
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s4, s7
14-
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
15-
; CHECK-NEXT: s_add_u32 s0, s0, s7
16-
; CHECK-NEXT: s_addc_u32 s1, s1, 0
17-
; CHECK-NEXT: s_mov_b64 s[4:5], 0
18-
; CHECK-NEXT: s_load_dword s7, s[4:5], 0x0
19-
; CHECK-NEXT: s_getpc_b64 s[4:5]
20-
; CHECK-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4
21-
; CHECK-NEXT: s_addc_u32 s5, s5, wobble@gotpcrel32@hi+12
22-
; CHECK-NEXT: s_getpc_b64 s[8:9]
23-
; CHECK-NEXT: s_add_u32 s8, s8, snork@gotpcrel32@lo+4
24-
; CHECK-NEXT: s_addc_u32 s9, s9, snork@gotpcrel32@hi+12
25-
; CHECK-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0
26-
; CHECK-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x0
27-
; CHECK-NEXT: s_mov_b64 s[8:9], 0
28-
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
29-
; CHECK-NEXT: s_and_b32 s4, 1, s7
30-
; CHECK-NEXT: s_cmp_eq_u32 s4, 1
31-
; CHECK-NEXT: v_mov_b32_e32 v31, v0
32-
; CHECK-NEXT: s_cselect_b32 s5, s13, s11
33-
; CHECK-NEXT: s_cselect_b32 s4, s12, s10
34-
; CHECK-NEXT: s_mov_b32 s12, s6
35-
; CHECK-NEXT: v_mov_b32_e32 v1, 0
36-
; CHECK-NEXT: v_mov_b32_e32 v4, 0
37-
; CHECK-NEXT: s_mov_b32 s32, 0
38-
; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5]
39-
; CHECK-NEXT: s_endpgm
12+
; GFX9-LABEL: indirect_call_known_no_special_inputs:
13+
; GFX9: ; %bb.0: ; %bb
14+
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s4, s7
15+
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
16+
; GFX9-NEXT: s_add_u32 s0, s0, s7
17+
; GFX9-NEXT: s_addc_u32 s1, s1, 0
18+
; GFX9-NEXT: s_mov_b64 s[4:5], 0
19+
; GFX9-NEXT: s_load_dword s7, s[4:5], 0x0
20+
; GFX9-NEXT: s_getpc_b64 s[4:5]
21+
; GFX9-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4
22+
; GFX9-NEXT: s_addc_u32 s5, s5, wobble@gotpcrel32@hi+12
23+
; GFX9-NEXT: s_getpc_b64 s[8:9]
24+
; GFX9-NEXT: s_add_u32 s8, s8, snork@gotpcrel32@lo+4
25+
; GFX9-NEXT: s_addc_u32 s9, s9, snork@gotpcrel32@hi+12
26+
; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0
27+
; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x0
28+
; GFX9-NEXT: s_mov_b64 s[8:9], 0
29+
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
30+
; GFX9-NEXT: s_and_b32 s4, 1, s7
31+
; GFX9-NEXT: s_cmp_eq_u32 s4, 1
32+
; GFX9-NEXT: v_mov_b32_e32 v31, v0
33+
; GFX9-NEXT: s_cselect_b32 s5, s13, s11
34+
; GFX9-NEXT: s_cselect_b32 s4, s12, s10
35+
; GFX9-NEXT: s_mov_b32 s12, s6
36+
; GFX9-NEXT: v_mov_b32_e32 v1, 0
37+
; GFX9-NEXT: v_mov_b32_e32 v4, 0
38+
; GFX9-NEXT: s_mov_b32 s32, 0
39+
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
40+
; GFX9-NEXT: s_endpgm
41+
;
42+
; GFX12-LABEL: indirect_call_known_no_special_inputs:
43+
; GFX12: ; %bb.0: ; %bb
44+
; GFX12-NEXT: s_getpc_b64 s[4:5]
45+
; GFX12-NEXT: s_sext_i32_i16 s5, s5
46+
; GFX12-NEXT: s_add_co_u32 s4, s4, snork@gotpcrel32@lo+8
47+
; GFX12-NEXT: s_add_co_ci_u32 s5, s5, snork@gotpcrel32@hi+16
48+
; GFX12-NEXT: s_mov_b64 s[2:3], 0
49+
; GFX12-NEXT: s_getpc_b64 s[6:7]
50+
; GFX12-NEXT: s_sext_i32_i16 s7, s7
51+
; GFX12-NEXT: s_add_co_u32 s6, s6, wobble@gotpcrel32@lo+8
52+
; GFX12-NEXT: s_add_co_ci_u32 s7, s7, wobble@gotpcrel32@hi+16
53+
; GFX12-NEXT: s_load_u8 s1, s[2:3], 0x0
54+
; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
55+
; GFX12-NEXT: s_load_b64 s[4:5], s[6:7], 0x0
56+
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 0
57+
; GFX12-NEXT: v_mov_b32_e32 v31, v0
58+
; GFX12-NEXT: s_mov_b64 s[8:9], 0
59+
; GFX12-NEXT: s_mov_b32 s12, s0
60+
; GFX12-NEXT: s_mov_b32 s32, 0
61+
; GFX12-NEXT: s_wait_kmcnt 0x0
62+
; GFX12-NEXT: s_and_b32 s1, 1, s1
63+
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
64+
; GFX12-NEXT: s_cmp_eq_u32 s1, 1
65+
; GFX12-NEXT: s_cselect_b32 s3, s5, s3
66+
; GFX12-NEXT: s_cselect_b32 s2, s4, s2
67+
; GFX12-NEXT: s_swappc_b64 s[30:31], s[2:3]
68+
; GFX12-NEXT: s_endpgm
4069

4170
bb:
4271
%cond = load i1, ptr addrspace(4) null
@@ -46,19 +75,37 @@ bb:
4675
}
4776

4877
define void @wobble() {
49-
; CHECK-LABEL: wobble:
50-
; CHECK: ; %bb.0: ; %bb
51-
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
52-
; CHECK-NEXT: s_setpc_b64 s[30:31]
78+
; GFX9-LABEL: wobble:
79+
; GFX9: ; %bb.0: ; %bb
80+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
81+
; GFX9-NEXT: s_setpc_b64 s[30:31]
82+
;
83+
; GFX12-LABEL: wobble:
84+
; GFX12: ; %bb.0: ; %bb
85+
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
86+
; GFX12-NEXT: s_wait_expcnt 0x0
87+
; GFX12-NEXT: s_wait_samplecnt 0x0
88+
; GFX12-NEXT: s_wait_bvhcnt 0x0
89+
; GFX12-NEXT: s_wait_kmcnt 0x0
90+
; GFX12-NEXT: s_setpc_b64 s[30:31]
5391
bb:
5492
ret void
5593
}
5694

5795
define void @snork() {
58-
; CHECK-LABEL: snork:
59-
; CHECK: ; %bb.0: ; %bb
60-
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
61-
; CHECK-NEXT: s_setpc_b64 s[30:31]
96+
; GFX9-LABEL: snork:
97+
; GFX9: ; %bb.0: ; %bb
98+
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
99+
; GFX9-NEXT: s_setpc_b64 s[30:31]
100+
;
101+
; GFX12-LABEL: snork:
102+
; GFX12: ; %bb.0: ; %bb
103+
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
104+
; GFX12-NEXT: s_wait_expcnt 0x0
105+
; GFX12-NEXT: s_wait_samplecnt 0x0
106+
; GFX12-NEXT: s_wait_bvhcnt 0x0
107+
; GFX12-NEXT: s_wait_kmcnt 0x0
108+
; GFX12-NEXT: s_setpc_b64 s[30:31]
62109
bb:
63110
ret void
64111
}

0 commit comments

Comments
 (0)