@@ -93,6 +93,326 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
  ret void
}

+define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) {
+; GCN-LABEL: set_inactive_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: v_mov_b32_e32 v1, 0x40400000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+  %tmp = call float @llvm.amdgcn.set.inactive.f32(float %in, float 3.0) #0
+  store float %tmp, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
+; GCN-LABEL: set_inactive_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s4, 0xcccccccd
+; GCN-NEXT: s_mov_b32 s5, 0x4010cccc
+; GCN-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, v2
+; GCN-NEXT: v_mov_b32_e32 v1, v3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+  %tmp = call double @llvm.amdgcn.set.inactive.f64(double %in, double 4.2) #0
+  store double %tmp, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) {
+; GCN-LABEL: set_inactive_v2i16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: v_mov_b32_e32 v1, 0x10001
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+  %tmp = call <2 x i16> @llvm.amdgcn.set.inactive.v2i16(<2 x i16> %in, <2 x i16> <i16 1, i16 1>) #0
+  store <2 x i16> %tmp, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
+; GCN-LABEL: set_inactive_v2f16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: v_mov_b32_e32 v1, 0x3c003c00
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+  %tmp = call <2 x half> @llvm.amdgcn.set.inactive.v2f16(<2 x half> %in, <2 x half> <half 1.0, half 1.0>) #0
+  store <2 x half> %tmp, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %in) {
+; GCN-LABEL: set_inactive_v2i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s4, 1
+; GCN-NEXT: s_mov_b32 s5, s4
+; GCN-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, v2
+; GCN-NEXT: v_mov_b32_e32 v1, v3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+  %tmp = call <2 x i32> @llvm.amdgcn.set.inactive.v2i32(<2 x i32> %in, <2 x i32> <i32 1, i32 1>) #0
+  store <2 x i32> %tmp, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
+; GCN-LABEL: set_inactive_v2f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s4, 1.0
+; GCN-NEXT: s_mov_b32 s5, s4
+; GCN-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, v2
+; GCN-NEXT: v_mov_b32_e32 v1, v3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+  %tmp = call <2 x float> @llvm.amdgcn.set.inactive.v2f32(<2 x float> %in, <2 x float> <float 1.0, float 1.0>) #0
+  store <2 x float> %tmp, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) {
+; GCN-LABEL: set_inactive_v2bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: v_mov_b32_e32 v1, 0x3f803f80
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+  %tmp = call <2 x bfloat> @llvm.amdgcn.set.inactive.v2bf16(<2 x bfloat> %in, <2 x bfloat> <bfloat 1.0, bfloat 1.0>) #0
+  store <2 x bfloat> %tmp, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %in) {
+; GCN-LABEL: set_inactive_v4i16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s4, 0x10001
+; GCN-NEXT: s_mov_b32 s5, s4
+; GCN-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, v2
+; GCN-NEXT: v_mov_b32_e32 v1, v3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+  %tmp = call <4 x i16> @llvm.amdgcn.set.inactive.v4i16(<4 x i16> %in, <4 x i16> <i16 1, i16 1, i16 1, i16 1>) #0
+  store <4 x i16> %tmp, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
+; GCN-LABEL: set_inactive_v4f16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s4, 0x3c003c00
+; GCN-NEXT: s_mov_b32 s5, s4
+; GCN-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, v2
+; GCN-NEXT: v_mov_b32_e32 v1, v3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+  %tmp = call <4 x half> @llvm.amdgcn.set.inactive.v4f16(<4 x half> %in, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>) #0
+  store <4 x half> %tmp, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in) {
+; GCN-LABEL: set_inactive_v4bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s4, 0x3f803f80
+; GCN-NEXT: s_mov_b32 s5, s4
+; GCN-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, v2
+; GCN-NEXT: v_mov_b32_e32 v1, v3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+  %tmp = call <4 x bfloat> @llvm.amdgcn.set.inactive.v4bf16(<4 x bfloat> %in, <4 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0>) #0
+  store <4 x bfloat> %tmp, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
+; GCN-LABEL: set_inactive_p0:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+  %tmp = call ptr @llvm.amdgcn.set.inactive.p0(ptr %in, ptr null) #0
+  store ptr %tmp, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) {
+; GCN-LABEL: set_inactive_p2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+  %tmp = call ptr addrspace(2) @llvm.amdgcn.set.inactive.p2(ptr addrspace(2) %in, ptr addrspace(2) null) #0
+  store ptr addrspace(2) %tmp, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) {
+; GCN-LABEL: set_inactive_p3:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+  %tmp = call ptr addrspace(3) @llvm.amdgcn.set.inactive.p3(ptr addrspace(3) %in, ptr addrspace(3) null) #0
+  store ptr addrspace(3) %tmp, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) {
+; GCN-LABEL: set_inactive_p5:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+  %tmp = call ptr addrspace(5) @llvm.amdgcn.set.inactive.p5(ptr addrspace(5) %in, ptr addrspace(5) null) #0
+  store ptr addrspace(5) %tmp, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) {
+; GCN-LABEL: set_inactive_p6:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_not_b64 exec, exec
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_endpgm
+  %tmp = call ptr addrspace(6) @llvm.amdgcn.set.inactive.p6(ptr addrspace(6) %in, ptr addrspace(6) null) #0
+  store ptr addrspace(6) %tmp, ptr addrspace(1) %out
+  ret void
+}
+
declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #0
declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64) #0
declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32)