Skip to content

Commit 6aea630

Browse files
authored
AMDGPU: Fix creating illegally typed readfirstlane in atomic optimizer (#128388)
We need to promote 8/16-bit cases to 32-bit. Unfortunately we are missing demanded bits optimizations on readfirstlane, so we end up emitting an and instruction on the input. I'm also surprised this pass isn't handling half or bfloat yet.
1 parent 9b52d9e commit 6aea630

File tree

3 files changed

+4426
-2
lines changed

3 files changed

+4426
-2
lines changed

llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -898,8 +898,15 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
898898

899899
// We need to broadcast the value who was the lowest active lane (the first
900900
// lane) to all other lanes in the wavefront.
901-
Value *BroadcastI = nullptr;
902-
BroadcastI = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readfirstlane, PHI);
901+
902+
Value *ReadlaneVal = PHI;
903+
if (TyBitWidth < 32)
904+
ReadlaneVal = B.CreateZExt(PHI, B.getInt32Ty());
905+
906+
Value *BroadcastI = B.CreateIntrinsic(
907+
ReadlaneVal->getType(), Intrinsic::amdgcn_readfirstlane, ReadlaneVal);
908+
if (TyBitWidth < 32)
909+
BroadcastI = B.CreateTrunc(BroadcastI, Ty);
903910

904911
// Now that we have the result of our single atomic operation, we need to
905912
// get our individual lane's slice into the result. We use the lane offset
Lines changed: 176 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,176 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2+
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -passes=amdgpu-atomic-optimizer %s | FileCheck %s
3+
4+
; i8 'or' is optimized. The CHECK lines show the broadcast of the atomic
; result is done legally: the i8 PHI is widened with zext to i32, passed to
; @llvm.amdgcn.readfirstlane.i32, and truncated back to i8 -- no illegally
; typed i8 readfirstlane is created.
define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace(1) %uniform.ptr, i8 %val) {
5+
; CHECK-LABEL: define amdgpu_kernel void @uniform_or_i8(
6+
; CHECK-SAME: ptr addrspace(1) [[RESULT:%.*]], ptr addrspace(1) [[UNIFORM_PTR:%.*]], i8 [[VAL:%.*]]) #[[ATTR0:[0-9]+]] {
7+
; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
8+
; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32
9+
; CHECK-NEXT: [[TMP3:%.*]] = lshr i64 [[TMP1]], 32
10+
; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
11+
; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
12+
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]])
13+
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 0
14+
; CHECK-NEXT: br i1 [[TMP7]], label %[[BB8:.*]], label %[[BB10:.*]]
15+
; CHECK: [[BB8]]:
16+
; CHECK-NEXT: [[TMP9:%.*]] = atomicrmw or ptr addrspace(1) [[UNIFORM_PTR]], i8 [[VAL]] monotonic, align 1
17+
; CHECK-NEXT: br label %[[BB10]]
18+
; CHECK: [[BB10]]:
19+
; CHECK-NEXT: [[TMP11:%.*]] = phi i8 [ poison, [[TMP0:%.*]] ], [ [[TMP9]], %[[BB8]] ]
20+
; CHECK-NEXT: [[TMP16:%.*]] = zext i8 [[TMP11]] to i32
21+
; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP16]])
22+
; CHECK-NEXT: [[TMP12:%.*]] = trunc i32 [[TMP17]] to i8
23+
; CHECK-NEXT: [[TMP13:%.*]] = trunc i32 [[TMP6]] to i8
24+
; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP7]], i8 0, i8 [[VAL]]
25+
; CHECK-NEXT: [[TMP15:%.*]] = or i8 [[TMP12]], [[TMP14]]
26+
; CHECK-NEXT: store i8 [[TMP15]], ptr addrspace(1) [[RESULT]], align 1
27+
; CHECK-NEXT: ret void
28+
;
29+
%rmw = atomicrmw or ptr addrspace(1) %uniform.ptr, i8 %val monotonic, align 1
30+
store i8 %rmw, ptr addrspace(1) %result
31+
ret void
32+
}
33+
34+
; i8 'add' is optimized (ctpop/mul prescan). As with 'or', the readfirstlane
; broadcast of the i8 result goes through zext-to-i32 / readfirstlane.i32 /
; trunc-to-i8 rather than an illegal i8 readfirstlane.
define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspace(1) %uniform.ptr, i8 %val) {
35+
; CHECK-LABEL: define amdgpu_kernel void @uniform_add_i8(
36+
; CHECK-SAME: ptr addrspace(1) [[RESULT:%.*]], ptr addrspace(1) [[UNIFORM_PTR:%.*]], i8 [[VAL:%.*]]) #[[ATTR0]] {
37+
; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
38+
; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32
39+
; CHECK-NEXT: [[TMP3:%.*]] = lshr i64 [[TMP1]], 32
40+
; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
41+
; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
42+
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]])
43+
; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP1]])
44+
; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i8
45+
; CHECK-NEXT: [[TMP9:%.*]] = mul i8 [[VAL]], [[TMP8]]
46+
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[TMP6]], 0
47+
; CHECK-NEXT: br i1 [[TMP10]], label %[[BB11:.*]], label %[[BB13:.*]]
48+
; CHECK: [[BB11]]:
49+
; CHECK-NEXT: [[TMP12:%.*]] = atomicrmw add ptr addrspace(1) [[UNIFORM_PTR]], i8 [[TMP9]] monotonic, align 1
50+
; CHECK-NEXT: br label %[[BB13]]
51+
; CHECK: [[BB13]]:
52+
; CHECK-NEXT: [[TMP14:%.*]] = phi i8 [ poison, [[TMP0:%.*]] ], [ [[TMP12]], %[[BB11]] ]
53+
; CHECK-NEXT: [[TMP19:%.*]] = zext i8 [[TMP14]] to i32
54+
; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP19]])
55+
; CHECK-NEXT: [[TMP15:%.*]] = trunc i32 [[TMP20]] to i8
56+
; CHECK-NEXT: [[TMP16:%.*]] = trunc i32 [[TMP6]] to i8
57+
; CHECK-NEXT: [[TMP17:%.*]] = mul i8 [[VAL]], [[TMP16]]
58+
; CHECK-NEXT: [[TMP18:%.*]] = add i8 [[TMP15]], [[TMP17]]
59+
; CHECK-NEXT: store i8 [[TMP18]], ptr addrspace(1) [[RESULT]], align 1
60+
; CHECK-NEXT: ret void
61+
;
62+
%rmw = atomicrmw add ptr addrspace(1) %uniform.ptr, i8 %val monotonic, align 1
63+
store i8 %rmw, ptr addrspace(1) %result
64+
ret void
65+
}
66+
67+
; i8 'xchg' is not transformed by the pass: the CHECK lines show the
; original atomicrmw surviving unchanged (no ballot/readfirstlane expansion).
define amdgpu_kernel void @uniform_xchg_i8(ptr addrspace(1) %result, ptr addrspace(1) %uniform.ptr, i8 %val) {
68+
; CHECK-LABEL: define amdgpu_kernel void @uniform_xchg_i8(
69+
; CHECK-SAME: ptr addrspace(1) [[RESULT:%.*]], ptr addrspace(1) [[UNIFORM_PTR:%.*]], i8 [[VAL:%.*]]) #[[ATTR0]] {
70+
; CHECK-NEXT: [[RMW:%.*]] = atomicrmw xchg ptr addrspace(1) [[UNIFORM_PTR]], i8 [[VAL]] monotonic, align 1
71+
; CHECK-NEXT: store i8 [[RMW]], ptr addrspace(1) [[RESULT]], align 1
72+
; CHECK-NEXT: ret void
73+
;
74+
%rmw = atomicrmw xchg ptr addrspace(1) %uniform.ptr, i8 %val monotonic, align 1
75+
store i8 %rmw, ptr addrspace(1) %result
76+
ret void
77+
}
78+
79+
; i16 'or': same shape as the i8 case -- the broadcast widens the i16 PHI
; with zext to i32 for @llvm.amdgcn.readfirstlane.i32 and truncates back,
; avoiding an illegal i16 readfirstlane.
define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspace(1) %uniform.ptr, i16 %val) {
80+
; CHECK-LABEL: define amdgpu_kernel void @uniform_or_i16(
81+
; CHECK-SAME: ptr addrspace(1) [[RESULT:%.*]], ptr addrspace(1) [[UNIFORM_PTR:%.*]], i16 [[VAL:%.*]]) #[[ATTR0]] {
82+
; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
83+
; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32
84+
; CHECK-NEXT: [[TMP3:%.*]] = lshr i64 [[TMP1]], 32
85+
; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
86+
; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
87+
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]])
88+
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 0
89+
; CHECK-NEXT: br i1 [[TMP7]], label %[[BB8:.*]], label %[[BB10:.*]]
90+
; CHECK: [[BB8]]:
91+
; CHECK-NEXT: [[TMP9:%.*]] = atomicrmw or ptr addrspace(1) [[UNIFORM_PTR]], i16 [[VAL]] monotonic, align 2
92+
; CHECK-NEXT: br label %[[BB10]]
93+
; CHECK: [[BB10]]:
94+
; CHECK-NEXT: [[TMP11:%.*]] = phi i16 [ poison, [[TMP0:%.*]] ], [ [[TMP9]], %[[BB8]] ]
95+
; CHECK-NEXT: [[TMP16:%.*]] = zext i16 [[TMP11]] to i32
96+
; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP16]])
97+
; CHECK-NEXT: [[TMP12:%.*]] = trunc i32 [[TMP17]] to i16
98+
; CHECK-NEXT: [[TMP13:%.*]] = trunc i32 [[TMP6]] to i16
99+
; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP7]], i16 0, i16 [[VAL]]
100+
; CHECK-NEXT: [[TMP15:%.*]] = or i16 [[TMP12]], [[TMP14]]
101+
; CHECK-NEXT: store i16 [[TMP15]], ptr addrspace(1) [[RESULT]], align 2
102+
; CHECK-NEXT: ret void
103+
;
104+
%rmw = atomicrmw or ptr addrspace(1) %uniform.ptr, i16 %val monotonic, align 2
105+
store i16 %rmw, ptr addrspace(1) %result
106+
ret void
107+
}
108+
109+
; i16 'add': ctpop/mul prescan plus the legal zext/readfirstlane.i32/trunc
; broadcast sequence for the sub-32-bit result.
define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspace(1) %uniform.ptr, i16 %val) {
110+
; CHECK-LABEL: define amdgpu_kernel void @uniform_add_i16(
111+
; CHECK-SAME: ptr addrspace(1) [[RESULT:%.*]], ptr addrspace(1) [[UNIFORM_PTR:%.*]], i16 [[VAL:%.*]]) #[[ATTR0]] {
112+
; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
113+
; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32
114+
; CHECK-NEXT: [[TMP3:%.*]] = lshr i64 [[TMP1]], 32
115+
; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
116+
; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
117+
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]])
118+
; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP1]])
119+
; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i16
120+
; CHECK-NEXT: [[TMP9:%.*]] = mul i16 [[VAL]], [[TMP8]]
121+
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[TMP6]], 0
122+
; CHECK-NEXT: br i1 [[TMP10]], label %[[BB11:.*]], label %[[BB13:.*]]
123+
; CHECK: [[BB11]]:
124+
; CHECK-NEXT: [[TMP12:%.*]] = atomicrmw add ptr addrspace(1) [[UNIFORM_PTR]], i16 [[TMP9]] monotonic, align 2
125+
; CHECK-NEXT: br label %[[BB13]]
126+
; CHECK: [[BB13]]:
127+
; CHECK-NEXT: [[TMP14:%.*]] = phi i16 [ poison, [[TMP0:%.*]] ], [ [[TMP12]], %[[BB11]] ]
128+
; CHECK-NEXT: [[TMP19:%.*]] = zext i16 [[TMP14]] to i32
129+
; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP19]])
130+
; CHECK-NEXT: [[TMP15:%.*]] = trunc i32 [[TMP20]] to i16
131+
; CHECK-NEXT: [[TMP16:%.*]] = trunc i32 [[TMP6]] to i16
132+
; CHECK-NEXT: [[TMP17:%.*]] = mul i16 [[VAL]], [[TMP16]]
133+
; CHECK-NEXT: [[TMP18:%.*]] = add i16 [[TMP15]], [[TMP17]]
134+
; CHECK-NEXT: store i16 [[TMP18]], ptr addrspace(1) [[RESULT]], align 2
135+
; CHECK-NEXT: ret void
136+
;
137+
%rmw = atomicrmw add ptr addrspace(1) %uniform.ptr, i16 %val monotonic, align 2
138+
store i16 %rmw, ptr addrspace(1) %result
139+
ret void
140+
}
141+
142+
; i16 'xchg' is not transformed by the pass: the atomicrmw survives
; unchanged, per the CHECK lines.
define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrspace(1) %uniform.ptr, i16 %val) {
143+
; CHECK-LABEL: define amdgpu_kernel void @uniform_xchg_i16(
144+
; CHECK-SAME: ptr addrspace(1) [[RESULT:%.*]], ptr addrspace(1) [[UNIFORM_PTR:%.*]], i16 [[VAL:%.*]]) #[[ATTR0]] {
145+
; CHECK-NEXT: [[RMW:%.*]] = atomicrmw xchg ptr addrspace(1) [[UNIFORM_PTR]], i16 [[VAL]] monotonic, align 2
146+
; CHECK-NEXT: store i16 [[RMW]], ptr addrspace(1) [[RESULT]], align 2
147+
; CHECK-NEXT: ret void
148+
;
149+
%rmw = atomicrmw xchg ptr addrspace(1) %uniform.ptr, i16 %val monotonic, align 2
150+
store i16 %rmw, ptr addrspace(1) %result
151+
ret void
152+
}
153+
154+
; half 'fadd' is left untouched by the optimizer (the CHECK lines show the
; original atomicrmw unchanged) -- the commit message notes the pass does
; not handle half yet.
define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrspace(1) %uniform.ptr, half %val) {
155+
; CHECK-LABEL: define amdgpu_kernel void @uniform_fadd_f16(
156+
; CHECK-SAME: ptr addrspace(1) [[RESULT:%.*]], ptr addrspace(1) [[UNIFORM_PTR:%.*]], half [[VAL:%.*]]) #[[ATTR0]] {
157+
; CHECK-NEXT: [[RMW:%.*]] = atomicrmw fadd ptr addrspace(1) [[UNIFORM_PTR]], half [[VAL]] monotonic, align 2
158+
; CHECK-NEXT: store half [[RMW]], ptr addrspace(1) [[RESULT]], align 2
159+
; CHECK-NEXT: ret void
160+
;
161+
%rmw = atomicrmw fadd ptr addrspace(1) %uniform.ptr, half %val monotonic, align 2
162+
store half %rmw, ptr addrspace(1) %result
163+
ret void
164+
}
165+
166+
; bfloat 'fadd' is likewise left untouched by the optimizer -- the commit
; message notes bfloat is not handled yet.
define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrspace(1) %uniform.ptr, bfloat %val) {
167+
; CHECK-LABEL: define amdgpu_kernel void @uniform_fadd_bf16(
168+
; CHECK-SAME: ptr addrspace(1) [[RESULT:%.*]], ptr addrspace(1) [[UNIFORM_PTR:%.*]], bfloat [[VAL:%.*]]) #[[ATTR0]] {
169+
; CHECK-NEXT: [[RMW:%.*]] = atomicrmw fadd ptr addrspace(1) [[UNIFORM_PTR]], bfloat [[VAL]] monotonic, align 2
170+
; CHECK-NEXT: store bfloat [[RMW]], ptr addrspace(1) [[RESULT]], align 2
171+
; CHECK-NEXT: ret void
172+
;
173+
%rmw = atomicrmw fadd ptr addrspace(1) %uniform.ptr, bfloat %val monotonic, align 2
174+
store bfloat %rmw, ptr addrspace(1) %result
175+
ret void
176+
}

0 commit comments

Comments
 (0)