Skip to content

Commit c74e223

Browse files
authored
AMDGPU: Simplify demanded bits on readlane/writelane index arguments (#117963)
The main goal is to fold away wave64 code when compiled for wave32. If a lane index is out of bounds, it is now clamped to its low bits, and the resulting operation may CSE with the operations on the low half of the wave.
1 parent 5fa59ed commit c74e223

File tree

3 files changed

+142
-52
lines changed

3 files changed

+142
-52
lines changed

llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -450,6 +450,37 @@ static bool isTriviallyUniform(const Use &U) {
450450
return false;
451451
}
452452

453+
/// Simplify a lane index operand (e.g. llvm.amdgcn.readlane src1).
454+
///
455+
/// The instruction only reads the low 5 bits for wave32, and 6 bits for wave64.
/// The number of demanded low bits is taken from ST->getWavefrontSizeLog2().
///
/// \param IC InstCombiner used to run SimplifyDemandedBits on the operand.
/// \param II The intrinsic call whose lane index operand is simplified.
/// \param LaneArgIdx Argument index of the lane index operand within \p II.
/// \returns true if IC.SimplifyDemandedBits reported a simplification, or if a
/// constant lane index was replaced by its value masked to the demanded low
/// bits; false otherwise.
456+
bool GCNTTIImpl::simplifyDemandedLaneMaskArg(InstCombiner &IC,
457+
IntrinsicInst &II,
458+
unsigned LaneArgIdx) const {
459+
// Only the low log2(wavefront size) bits of the 32-bit index are demanded.
unsigned MaskBits = ST->getWavefrontSizeLog2();
460+
APInt DemandedMask(32, maskTrailingOnes<unsigned>(MaskBits));
461+
462+
KnownBits Known(32);
463+
if (IC.SimplifyDemandedBits(&II, LaneArgIdx, DemandedMask, Known))
464+
return true;
465+
466+
// The manual fixup below only applies when every bit of the index is known.
if (!Known.isConstant())
467+
return false;
468+
469+
// Out of bounds indexes may appear in wave64 code compiled for wave32.
470+
// Unlike the DAG version, SimplifyDemandedBits does not change constants, so
471+
// manually fix it up.
472+
473+
Value *LaneArg = II.getArgOperand(LaneArgIdx);
474+
Constant *MaskedConst =
475+
ConstantInt::get(LaneArg->getType(), Known.getConstant() & DemandedMask);
476+
// Pointer comparison: rewrite the operand only when the masked constant is
// not already the current operand, and report a change only in that case.
if (MaskedConst != LaneArg) {
477+
II.getOperandUse(LaneArgIdx).set(MaskedConst);
478+
return true;
479+
}
480+
481+
return false;
482+
}
483+
453484
std::optional<Instruction *>
454485
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
455486
Intrinsic::ID IID = II.getIntrinsicID();
@@ -1092,7 +1123,17 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
10921123
const Use &Src = II.getArgOperandUse(0);
10931124
if (isTriviallyUniform(Src))
10941125
return IC.replaceInstUsesWith(II, Src.get());
1095-
break;
1126+
1127+
if (IID == Intrinsic::amdgcn_readlane &&
1128+
simplifyDemandedLaneMaskArg(IC, II, 1))
1129+
return &II;
1130+
1131+
return std::nullopt;
1132+
}
1133+
case Intrinsic::amdgcn_writelane: {
1134+
if (simplifyDemandedLaneMaskArg(IC, II, 1))
1135+
return &II;
1136+
return std::nullopt;
10961137
}
10971138
case Intrinsic::amdgcn_trig_preop: {
10981139
// The intrinsic is declared with name mangling, but currently the

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,10 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
220220

221221
bool canSimplifyLegacyMulToMul(const Instruction &I, const Value *Op0,
222222
const Value *Op1, InstCombiner &IC) const;
223+
224+
/// Simplify the lane index operand of \p II (the argument at index
/// \p LaneArgIdx), e.g. llvm.amdgcn.readlane src1, based on the low bits
/// demanded for the subtarget's wavefront size.
/// \returns true if the operand was simplified or replaced.
bool simplifyDemandedLaneMaskArg(InstCombiner &IC, IntrinsicInst &II,
225+
unsigned LaneArgIdx) const;
226+
223227
std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
224228
IntrinsicInst &II) const;
225229
std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(

llvm/test/Transforms/InstCombine/AMDGPU/lane-index-simplify-demanded-bits.ll

Lines changed: 96 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -18,30 +18,45 @@ define i32 @readlane_31(i32 %arg) #0 {
1818
}
1919

2020
define i32 @readlane_32(i32 %arg) #0 {
21-
; CHECK-LABEL: define i32 @readlane_32(
22-
; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] {
23-
; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 32)
24-
; CHECK-NEXT: ret i32 [[RES]]
21+
; WAVE64-LABEL: define i32 @readlane_32(
22+
; WAVE64-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] {
23+
; WAVE64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 32)
24+
; WAVE64-NEXT: ret i32 [[RES]]
25+
;
26+
; WAVE32-LABEL: define i32 @readlane_32(
27+
; WAVE32-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] {
28+
; WAVE32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 0)
29+
; WAVE32-NEXT: ret i32 [[RES]]
2530
;
2631
%res = call i32 @llvm.amdgcn.readlane.i32(i32 %arg, i32 32)
2732
ret i32 %res
2833
}
2934

3035
define i32 @readlane_33(i32 %arg) #0 {
31-
; CHECK-LABEL: define i32 @readlane_33(
32-
; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] {
33-
; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 33)
34-
; CHECK-NEXT: ret i32 [[RES]]
36+
; WAVE64-LABEL: define i32 @readlane_33(
37+
; WAVE64-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] {
38+
; WAVE64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 33)
39+
; WAVE64-NEXT: ret i32 [[RES]]
40+
;
41+
; WAVE32-LABEL: define i32 @readlane_33(
42+
; WAVE32-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] {
43+
; WAVE32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 1)
44+
; WAVE32-NEXT: ret i32 [[RES]]
3545
;
3646
%res = call i32 @llvm.amdgcn.readlane.i32(i32 %arg, i32 33)
3747
ret i32 %res
3848
}
3949

4050
define i32 @readlane_63(i32 %arg) #0 {
41-
; CHECK-LABEL: define i32 @readlane_63(
42-
; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] {
43-
; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 63)
44-
; CHECK-NEXT: ret i32 [[RES]]
51+
; WAVE64-LABEL: define i32 @readlane_63(
52+
; WAVE64-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] {
53+
; WAVE64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 63)
54+
; WAVE64-NEXT: ret i32 [[RES]]
55+
;
56+
; WAVE32-LABEL: define i32 @readlane_63(
57+
; WAVE32-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] {
58+
; WAVE32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 31)
59+
; WAVE32-NEXT: ret i32 [[RES]]
4560
;
4661
%res = call i32 @llvm.amdgcn.readlane.i32(i32 %arg, i32 63)
4762
ret i32 %res
@@ -50,19 +65,24 @@ define i32 @readlane_63(i32 %arg) #0 {
5065
define i32 @readlane_64(i32 %arg) #0 {
5166
; CHECK-LABEL: define i32 @readlane_64(
5267
; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] {
53-
; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 64)
68+
; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 0)
5469
; CHECK-NEXT: ret i32 [[RES]]
5570
;
5671
%res = call i32 @llvm.amdgcn.readlane.i32(i32 %arg, i32 64)
5772
ret i32 %res
5873
}
5974

6075
define i32 @readlane_and_31(i32 %arg, i32 %idx) #0 {
61-
; CHECK-LABEL: define i32 @readlane_and_31(
62-
; CHECK-SAME: i32 [[ARG:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] {
63-
; CHECK-NEXT: [[IDX_CLAMP:%.*]] = and i32 [[IDX]], 31
64-
; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[IDX_CLAMP]])
65-
; CHECK-NEXT: ret i32 [[RES]]
76+
; WAVE64-LABEL: define i32 @readlane_and_31(
77+
; WAVE64-SAME: i32 [[ARG:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] {
78+
; WAVE64-NEXT: [[IDX_CLAMP:%.*]] = and i32 [[IDX]], 31
79+
; WAVE64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[IDX_CLAMP]])
80+
; WAVE64-NEXT: ret i32 [[RES]]
81+
;
82+
; WAVE32-LABEL: define i32 @readlane_and_31(
83+
; WAVE32-SAME: i32 [[ARG:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] {
84+
; WAVE32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[IDX]])
85+
; WAVE32-NEXT: ret i32 [[RES]]
6686
;
6787
%idx.clamp = and i32 %idx, 31
6888
%res = call i32 @llvm.amdgcn.readlane.i32(i32 %arg, i32 %idx.clamp)
@@ -72,8 +92,7 @@ define i32 @readlane_and_31(i32 %arg, i32 %idx) #0 {
7292
define i32 @readlane_and_63(i32 %arg, i32 %idx) #0 {
7393
; CHECK-LABEL: define i32 @readlane_and_63(
7494
; CHECK-SAME: i32 [[ARG:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] {
75-
; CHECK-NEXT: [[IDX_CLAMP:%.*]] = and i32 [[IDX]], 63
76-
; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[IDX_CLAMP]])
95+
; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[ARG]], i32 [[IDX]])
7796
; CHECK-NEXT: ret i32 [[RES]]
7897
;
7998
%idx.clamp = and i32 %idx, 63
@@ -92,10 +111,15 @@ define i32 @readlane_poison(i32 %arg) #0 {
92111
}
93112

94113
define float @readlane_f32_63(float %arg) #0 {
95-
; CHECK-LABEL: define float @readlane_f32_63(
96-
; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR0]] {
97-
; CHECK-NEXT: [[RES:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[ARG]], i32 63)
98-
; CHECK-NEXT: ret float [[RES]]
114+
; WAVE64-LABEL: define float @readlane_f32_63(
115+
; WAVE64-SAME: float [[ARG:%.*]]) #[[ATTR0]] {
116+
; WAVE64-NEXT: [[RES:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[ARG]], i32 63)
117+
; WAVE64-NEXT: ret float [[RES]]
118+
;
119+
; WAVE32-LABEL: define float @readlane_f32_63(
120+
; WAVE32-SAME: float [[ARG:%.*]]) #[[ATTR0]] {
121+
; WAVE32-NEXT: [[RES:%.*]] = call float @llvm.amdgcn.readlane.f32(float [[ARG]], i32 31)
122+
; WAVE32-NEXT: ret float [[RES]]
99123
;
100124
%res = call float @llvm.amdgcn.readlane.f32(float %arg, i32 63)
101125
ret float %res
@@ -116,30 +140,45 @@ define i32 @writelane_31(i32 %arg0, i32 %arg1) #0 {
116140
}
117141

118142
define i32 @writelane_32(i32 %arg0, i32 %arg1) #0 {
119-
; CHECK-LABEL: define i32 @writelane_32(
120-
; CHECK-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] {
121-
; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 32, i32 [[ARG1]])
122-
; CHECK-NEXT: ret i32 [[RES]]
143+
; WAVE64-LABEL: define i32 @writelane_32(
144+
; WAVE64-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] {
145+
; WAVE64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 32, i32 [[ARG1]])
146+
; WAVE64-NEXT: ret i32 [[RES]]
147+
;
148+
; WAVE32-LABEL: define i32 @writelane_32(
149+
; WAVE32-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] {
150+
; WAVE32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 0, i32 [[ARG1]])
151+
; WAVE32-NEXT: ret i32 [[RES]]
123152
;
124153
%res = call i32 @llvm.amdgcn.writelane.i32(i32 %arg0, i32 32, i32 %arg1)
125154
ret i32 %res
126155
}
127156

128157
define i32 @writelane_33(i32 %arg0, i32 %arg1) #0 {
129-
; CHECK-LABEL: define i32 @writelane_33(
130-
; CHECK-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] {
131-
; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 33, i32 [[ARG1]])
132-
; CHECK-NEXT: ret i32 [[RES]]
158+
; WAVE64-LABEL: define i32 @writelane_33(
159+
; WAVE64-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] {
160+
; WAVE64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 33, i32 [[ARG1]])
161+
; WAVE64-NEXT: ret i32 [[RES]]
162+
;
163+
; WAVE32-LABEL: define i32 @writelane_33(
164+
; WAVE32-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] {
165+
; WAVE32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 1, i32 [[ARG1]])
166+
; WAVE32-NEXT: ret i32 [[RES]]
133167
;
134168
%res = call i32 @llvm.amdgcn.writelane.i32(i32 %arg0, i32 33, i32 %arg1)
135169
ret i32 %res
136170
}
137171

138172
define i32 @writelane_63(i32 %arg0, i32 %arg1) #0 {
139-
; CHECK-LABEL: define i32 @writelane_63(
140-
; CHECK-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] {
141-
; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 63, i32 [[ARG1]])
142-
; CHECK-NEXT: ret i32 [[RES]]
173+
; WAVE64-LABEL: define i32 @writelane_63(
174+
; WAVE64-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] {
175+
; WAVE64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 63, i32 [[ARG1]])
176+
; WAVE64-NEXT: ret i32 [[RES]]
177+
;
178+
; WAVE32-LABEL: define i32 @writelane_63(
179+
; WAVE32-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] {
180+
; WAVE32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 31, i32 [[ARG1]])
181+
; WAVE32-NEXT: ret i32 [[RES]]
143182
;
144183
%res = call i32 @llvm.amdgcn.writelane.i32(i32 %arg0, i32 63, i32 %arg1)
145184
ret i32 %res
@@ -148,19 +187,24 @@ define i32 @writelane_63(i32 %arg0, i32 %arg1) #0 {
148187
define i32 @writelane_64(i32 %arg0, i32 %arg1) #0 {
149188
; CHECK-LABEL: define i32 @writelane_64(
150189
; CHECK-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) #[[ATTR0]] {
151-
; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 64, i32 [[ARG1]])
190+
; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 0, i32 [[ARG1]])
152191
; CHECK-NEXT: ret i32 [[RES]]
153192
;
154193
%res = call i32 @llvm.amdgcn.writelane.i32(i32 %arg0, i32 64, i32 %arg1)
155194
ret i32 %res
156195
}
157196

158197
define i32 @writelane_and_31(i32 %arg0, i32 %arg1, i32 %idx) #0 {
159-
; CHECK-LABEL: define i32 @writelane_and_31(
160-
; CHECK-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] {
161-
; CHECK-NEXT: [[IDX_CLAMP:%.*]] = and i32 [[IDX]], 31
162-
; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 [[IDX_CLAMP]], i32 [[ARG1]])
163-
; CHECK-NEXT: ret i32 [[RES]]
198+
; WAVE64-LABEL: define i32 @writelane_and_31(
199+
; WAVE64-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] {
200+
; WAVE64-NEXT: [[IDX_CLAMP:%.*]] = and i32 [[IDX]], 31
201+
; WAVE64-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 [[IDX_CLAMP]], i32 [[ARG1]])
202+
; WAVE64-NEXT: ret i32 [[RES]]
203+
;
204+
; WAVE32-LABEL: define i32 @writelane_and_31(
205+
; WAVE32-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] {
206+
; WAVE32-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 [[IDX]], i32 [[ARG1]])
207+
; WAVE32-NEXT: ret i32 [[RES]]
164208
;
165209
%idx.clamp = and i32 %idx, 31
166210
%res = call i32 @llvm.amdgcn.writelane.i32(i32 %arg0, i32 %idx.clamp, i32 %arg1)
@@ -170,8 +214,7 @@ define i32 @writelane_and_31(i32 %arg0, i32 %arg1, i32 %idx) #0 {
170214
define i32 @writelane_and_63(i32 %arg0, i32 %arg1, i32 %idx) #0 {
171215
; CHECK-LABEL: define i32 @writelane_and_63(
172216
; CHECK-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 [[IDX:%.*]]) #[[ATTR0]] {
173-
; CHECK-NEXT: [[IDX_CLAMP:%.*]] = and i32 [[IDX]], 63
174-
; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 [[IDX_CLAMP]], i32 [[ARG1]])
217+
; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.amdgcn.writelane.i32(i32 [[ARG0]], i32 [[IDX]], i32 [[ARG1]])
175218
; CHECK-NEXT: ret i32 [[RES]]
176219
;
177220
%idx.clamp = and i32 %idx, 63
@@ -190,16 +233,18 @@ define i32 @writelane_poison(i32 %arg0, i32 %arg1) #0 {
190233
}
191234

192235
define float @writelane_f32_63(float %arg0, float %arg1) #0 {
193-
; CHECK-LABEL: define float @writelane_f32_63(
194-
; CHECK-SAME: float [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR0]] {
195-
; CHECK-NEXT: [[RES:%.*]] = call float @llvm.amdgcn.writelane.f32(float [[ARG0]], i32 63, float [[ARG1]])
196-
; CHECK-NEXT: ret float [[RES]]
236+
; WAVE64-LABEL: define float @writelane_f32_63(
237+
; WAVE64-SAME: float [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR0]] {
238+
; WAVE64-NEXT: [[RES:%.*]] = call float @llvm.amdgcn.writelane.f32(float [[ARG0]], i32 63, float [[ARG1]])
239+
; WAVE64-NEXT: ret float [[RES]]
240+
;
241+
; WAVE32-LABEL: define float @writelane_f32_63(
242+
; WAVE32-SAME: float [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR0]] {
243+
; WAVE32-NEXT: [[RES:%.*]] = call float @llvm.amdgcn.writelane.f32(float [[ARG0]], i32 31, float [[ARG1]])
244+
; WAVE32-NEXT: ret float [[RES]]
197245
;
198246
%res = call float @llvm.amdgcn.writelane.f32(float %arg0, i32 63, float %arg1)
199247
ret float %res
200248
}
201249

202250
attributes #0 = { nounwind }
203-
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
204-
; WAVE32: {{.*}}
205-
; WAVE64: {{.*}}

0 commit comments

Comments
 (0)