@@ -178,6 +178,55 @@ exit:
178
178
ret void
179
179
}
180
180
181
+ define void @divergent_i1_xor_used_outside_loop_twice(float %val, float %pre.cond.val, ptr %addr, ptr %addr2) { ; i1 computed inside the loop feeds two selects in %exit
182
+ ; GFX10-LABEL: divergent_i1_xor_used_outside_loop_twice:
183
+ ; GFX10: ; %bb.0: ; %entry
184
+ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
185
+ ; GFX10-NEXT: s_mov_b32 s4, 0
186
+ ; GFX10-NEXT: v_cmp_lt_f32_e64 s5, 1.0, v1
187
+ ; GFX10-NEXT: v_mov_b32_e32 v1, s4
188
+ ; GFX10-NEXT: ; implicit-def: $sgpr6
189
+ ; GFX10-NEXT: .LBB3_1: ; %loop
190
+ ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
191
+ ; GFX10-NEXT: v_cvt_f32_u32_e32 v6, v1
192
+ ; GFX10-NEXT: s_xor_b32 s5, s5, -1
193
+ ; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v1
194
+ ; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v0
195
+ ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
196
+ ; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo
197
+ ; GFX10-NEXT: s_and_b32 s7, exec_lo, s5
198
+ ; GFX10-NEXT: s_or_b32 s6, s6, s7
199
+ ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
200
+ ; GFX10-NEXT: s_cbranch_execnz .LBB3_1
201
+ ; GFX10-NEXT: ; %bb.2: ; %exit
202
+ ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
203
+ ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s6
204
+ ; GFX10-NEXT: v_cndmask_b32_e64 v1, -1.0, 2.0, s6
205
+ ; GFX10-NEXT: flat_store_dword v[2:3], v0
206
+ ; GFX10-NEXT: flat_store_dword v[4:5], v1
207
+ ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
208
+ ; GFX10-NEXT: s_setpc_b64 s[30:31]
209
+ entry:
210
+ %pre.cond = fcmp ogt float %pre.cond.val, 1.0 ; initial value of the toggled boolean
211
+ br label %loop
212
+
213
+ loop:
214
+ %counter = phi i32 [ 0, %entry ], [ %counter.plus.1, %loop ]
215
+ %bool.counter = phi i1 [ %pre.cond, %entry ], [ %neg.bool.counter, %loop ] ; %pre.cond on entry, then the previous iteration's negation
216
+ %neg.bool.counter = xor i1 %bool.counter, true ; logical NOT; read after the loop by both selects
217
+ %f.counter = uitofp i32 %counter to float
218
+ %cond = fcmp ogt float %f.counter, %val
219
+ %counter.plus.1 = add i32 %counter, 1
220
+ br i1 %cond, label %exit, label %loop ; leave the loop once float(%counter) > %val
221
+
222
+ exit:
223
+ %select = select i1 %neg.bool.counter, float 1.000000e+00, float 0.000000e+00 ; first use outside the loop
224
+ store float %select, ptr %addr
225
+ %select2 = select i1 %neg.bool.counter, float 2.000000e+00, float -1.000000e+00 ; second use outside the loop
226
+ store float %select2, ptr %addr2
227
+ ret void
228
+ }
229
+
181
230
;void xor(int num_elts, int* a, int* addr) {
182
231
;for(int i=0; i<num_elts; ++i) {
183
232
; if(a[i]==0)
@@ -195,15 +244,15 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts,
195
244
; GFX10-NEXT: s_mov_b32 s5, 0
196
245
; GFX10-NEXT: s_mov_b32 s6, -1
197
246
; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo
198
- ; GFX10-NEXT: s_cbranch_execz .LBB3_6
247
+ ; GFX10-NEXT: s_cbranch_execz .LBB4_6
199
248
; GFX10-NEXT: ; %bb.1: ; %loop.start.preheader
200
249
; GFX10-NEXT: v_mov_b32_e32 v5, s5
201
250
; GFX10-NEXT: ; implicit-def: $sgpr6
202
251
; GFX10-NEXT: ; implicit-def: $sgpr7
203
252
; GFX10-NEXT: ; implicit-def: $sgpr8
204
- ; GFX10-NEXT: s_branch .LBB3_3
205
- ; GFX10-NEXT: .LBB3_2 : ; %Flow
206
- ; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1
253
+ ; GFX10-NEXT: s_branch .LBB4_3
254
+ ; GFX10-NEXT: .LBB4_2 : ; %Flow
255
+ ; GFX10-NEXT: ; in Loop: Header=BB4_3 Depth=1
207
256
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9
208
257
; GFX10-NEXT: s_xor_b32 s9, s8, -1
209
258
; GFX10-NEXT: s_and_b32 s10, exec_lo, s7
@@ -212,8 +261,8 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts,
212
261
; GFX10-NEXT: s_and_b32 s9, exec_lo, s9
213
262
; GFX10-NEXT: s_or_b32 s6, s6, s9
214
263
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
215
- ; GFX10-NEXT: s_cbranch_execz .LBB3_5
216
- ; GFX10-NEXT: .LBB3_3 : ; %loop.start
264
+ ; GFX10-NEXT: s_cbranch_execz .LBB4_5
265
+ ; GFX10-NEXT: .LBB4_3 : ; %loop.start
217
266
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
218
267
; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5
219
268
; GFX10-NEXT: s_andn2_b32 s8, s8, exec_lo
@@ -228,9 +277,9 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts,
228
277
; GFX10-NEXT: s_waitcnt vmcnt(0)
229
278
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
230
279
; GFX10-NEXT: s_and_saveexec_b32 s9, vcc_lo
231
- ; GFX10-NEXT: s_cbranch_execz .LBB3_2
280
+ ; GFX10-NEXT: s_cbranch_execz .LBB4_2
232
281
; GFX10-NEXT: ; %bb.4: ; %loop.cond
233
- ; GFX10-NEXT: ; in Loop: Header=BB3_3 Depth=1
282
+ ; GFX10-NEXT: ; in Loop: Header=BB4_3 Depth=1
234
283
; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v5
235
284
; GFX10-NEXT: v_cmp_lt_i32_e32 vcc_lo, v5, v0
236
285
; GFX10-NEXT: s_andn2_b32 s8, s8, exec_lo
@@ -240,20 +289,20 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts,
240
289
; GFX10-NEXT: s_and_b32 s11, exec_lo, vcc_lo
241
290
; GFX10-NEXT: s_or_b32 s8, s8, s10
242
291
; GFX10-NEXT: s_or_b32 s7, s7, s11
243
- ; GFX10-NEXT: s_branch .LBB3_2
244
- ; GFX10-NEXT: .LBB3_5 : ; %loop.exit.guard
292
+ ; GFX10-NEXT: s_branch .LBB4_2
293
+ ; GFX10-NEXT: .LBB4_5 : ; %loop.exit.guard
245
294
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
246
295
; GFX10-NEXT: s_andn2_b32 s5, -1, exec_lo
247
296
; GFX10-NEXT: s_and_b32 s6, exec_lo, s6
248
297
; GFX10-NEXT: s_or_b32 s6, s5, s6
249
- ; GFX10-NEXT: .LBB3_6 : ; %Flow1
298
+ ; GFX10-NEXT: .LBB4_6 : ; %Flow1
250
299
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
251
300
; GFX10-NEXT: s_and_saveexec_b32 s4, s6
252
- ; GFX10-NEXT: s_cbranch_execz .LBB3_8
301
+ ; GFX10-NEXT: s_cbranch_execz .LBB4_8
253
302
; GFX10-NEXT: ; %bb.7: ; %block.after.loop
254
303
; GFX10-NEXT: v_mov_b32_e32 v0, 5
255
304
; GFX10-NEXT: flat_store_dword v[3:4], v0
256
- ; GFX10-NEXT: .LBB3_8 : ; %exit
305
+ ; GFX10-NEXT: .LBB4_8 : ; %exit
257
306
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
258
307
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
259
308
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -299,53 +348,53 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace
299
348
; GFX10-NEXT: s_mov_b32 s5, 0
300
349
; GFX10-NEXT: ; implicit-def: $sgpr6
301
350
; GFX10-NEXT: v_mov_b32_e32 v5, s5
302
- ; GFX10-NEXT: s_branch .LBB4_2
303
- ; GFX10-NEXT: .LBB4_1 : ; %Flow
304
- ; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1
351
+ ; GFX10-NEXT: s_branch .LBB5_2
352
+ ; GFX10-NEXT: .LBB5_1 : ; %Flow
353
+ ; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1
305
354
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
306
355
; GFX10-NEXT: s_and_b32 s4, exec_lo, s7
307
356
; GFX10-NEXT: s_or_b32 s5, s4, s5
308
357
; GFX10-NEXT: s_andn2_b32 s4, s6, exec_lo
309
358
; GFX10-NEXT: s_and_b32 s6, exec_lo, vcc_lo
310
359
; GFX10-NEXT: s_or_b32 s6, s4, s6
311
360
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
312
- ; GFX10-NEXT: s_cbranch_execz .LBB4_6
313
- ; GFX10-NEXT: .LBB4_2 : ; %cond.block.0
361
+ ; GFX10-NEXT: s_cbranch_execz .LBB5_6
362
+ ; GFX10-NEXT: .LBB5_2 : ; %cond.block.0
314
363
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
315
364
; GFX10-NEXT: v_mov_b32_e32 v4, v5
316
365
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v4
317
366
; GFX10-NEXT: s_and_saveexec_b32 s7, vcc_lo
318
- ; GFX10-NEXT: s_cbranch_execz .LBB4_4
367
+ ; GFX10-NEXT: s_cbranch_execz .LBB5_4
319
368
; GFX10-NEXT: ; %bb.3: ; %if.block.0
320
- ; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1
369
+ ; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1
321
370
; GFX10-NEXT: v_ashrrev_i32_e32 v5, 31, v4
322
371
; GFX10-NEXT: v_lshlrev_b64 v[8:9], 2, v[4:5]
323
372
; GFX10-NEXT: v_add_co_u32 v8, s4, v2, v8
324
373
; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s4, v3, v9, s4
325
374
; GFX10-NEXT: global_store_dword v[8:9], v4, off
326
- ; GFX10-NEXT: .LBB4_4 : ; %loop.break.block
327
- ; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1
375
+ ; GFX10-NEXT: .LBB5_4 : ; %loop.break.block
376
+ ; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1
328
377
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
329
378
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s7
330
379
; GFX10-NEXT: v_cmp_ne_u32_e64 s4, v1, v4
331
380
; GFX10-NEXT: s_mov_b32 s7, -1
332
381
; GFX10-NEXT: ; implicit-def: $vgpr5
333
382
; GFX10-NEXT: s_and_saveexec_b32 s8, s4
334
- ; GFX10-NEXT: s_cbranch_execz .LBB4_1
383
+ ; GFX10-NEXT: s_cbranch_execz .LBB5_1
335
384
; GFX10-NEXT: ; %bb.5: ; %loop.cond
336
- ; GFX10-NEXT: ; in Loop: Header=BB4_2 Depth=1
385
+ ; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1
337
386
; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v4
338
387
; GFX10-NEXT: s_andn2_b32 s4, -1, exec_lo
339
388
; GFX10-NEXT: s_and_b32 s7, exec_lo, 0
340
389
; GFX10-NEXT: s_or_b32 s7, s4, s7
341
- ; GFX10-NEXT: s_branch .LBB4_1
342
- ; GFX10-NEXT: .LBB4_6 : ; %cond.block.1
390
+ ; GFX10-NEXT: s_branch .LBB5_1
391
+ ; GFX10-NEXT: .LBB5_6 : ; %cond.block.1
343
392
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
344
393
; GFX10-NEXT: s_and_saveexec_b32 s4, s6
345
- ; GFX10-NEXT: s_cbranch_execz .LBB4_8
394
+ ; GFX10-NEXT: s_cbranch_execz .LBB5_8
346
395
; GFX10-NEXT: ; %bb.7: ; %if.block.1
347
396
; GFX10-NEXT: global_store_dword v[6:7], v4, off
348
- ; GFX10-NEXT: .LBB4_8 : ; %exit
397
+ ; GFX10-NEXT: .LBB5_8 : ; %exit
349
398
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
350
399
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
351
400
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -410,9 +459,9 @@ define amdgpu_ps void @divergent_i1_freeze_used_outside_loop(i32 %n, ptr addrspa
410
459
; GFX10-NEXT: v_mov_b32_e32 v5, s0
411
460
; GFX10-NEXT: ; implicit-def: $sgpr1
412
461
; GFX10-NEXT: ; implicit-def: $sgpr2
413
- ; GFX10-NEXT: s_branch .LBB5_2
414
- ; GFX10-NEXT: .LBB5_1 : ; %loop.cond
415
- ; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1
462
+ ; GFX10-NEXT: s_branch .LBB6_2
463
+ ; GFX10-NEXT: .LBB6_1 : ; %loop.cond
464
+ ; GFX10-NEXT: ; in Loop: Header=BB6_2 Depth=1
416
465
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
417
466
; GFX10-NEXT: v_cmp_lt_i32_e32 vcc_lo, v5, v0
418
467
; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v5
@@ -423,16 +472,16 @@ define amdgpu_ps void @divergent_i1_freeze_used_outside_loop(i32 %n, ptr addrspa
423
472
; GFX10-NEXT: s_or_b32 s3, s3, s4
424
473
; GFX10-NEXT: s_or_b32 s1, s1, s4
425
474
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
426
- ; GFX10-NEXT: s_cbranch_execz .LBB5_4
427
- ; GFX10-NEXT: .LBB5_2 : ; %loop.start
475
+ ; GFX10-NEXT: s_cbranch_execz .LBB6_4
476
+ ; GFX10-NEXT: .LBB6_2 : ; %loop.start
428
477
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
429
478
; GFX10-NEXT: s_andn2_b32 s2, s2, exec_lo
430
479
; GFX10-NEXT: s_and_b32 s4, exec_lo, s3
431
480
; GFX10-NEXT: s_or_b32 s2, s2, s4
432
481
; GFX10-NEXT: s_and_saveexec_b32 s4, s3
433
- ; GFX10-NEXT: s_cbranch_execz .LBB5_1
482
+ ; GFX10-NEXT: s_cbranch_execz .LBB6_1
434
483
; GFX10-NEXT: ; %bb.3: ; %is.eq.zero
435
- ; GFX10-NEXT: ; in Loop: Header=BB5_2 Depth=1
484
+ ; GFX10-NEXT: ; in Loop: Header=BB6_2 Depth=1
436
485
; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5
437
486
; GFX10-NEXT: s_andn2_b32 s2, s2, exec_lo
438
487
; GFX10-NEXT: v_lshlrev_b64 v[6:7], 2, v[5:6]
@@ -444,8 +493,8 @@ define amdgpu_ps void @divergent_i1_freeze_used_outside_loop(i32 %n, ptr addrspa
444
493
; GFX10-NEXT: s_and_b32 s3, exec_lo, vcc_lo
445
494
; GFX10-NEXT: s_or_b32 s2, s2, s3
446
495
; GFX10-NEXT: ; implicit-def: $sgpr3
447
- ; GFX10-NEXT: s_branch .LBB5_1
448
- ; GFX10-NEXT: .LBB5_4 : ; %exit
496
+ ; GFX10-NEXT: s_branch .LBB6_1
497
+ ; GFX10-NEXT: .LBB6_4 : ; %exit
449
498
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0
450
499
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s1
451
500
; GFX10-NEXT: flat_store_dword v[3:4], v0
@@ -486,9 +535,9 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a
486
535
; GFX10-NEXT: ; implicit-def: $sgpr2
487
536
; GFX10-NEXT: ; implicit-def: $sgpr3
488
537
; GFX10-NEXT: v_mov_b32_e32 v6, s0
489
- ; GFX10-NEXT: s_branch .LBB6_2
490
- ; GFX10-NEXT: .LBB6_1 : ; %Flow
491
- ; GFX10-NEXT: ; in Loop: Header=BB6_2 Depth=1
538
+ ; GFX10-NEXT: s_branch .LBB7_2
539
+ ; GFX10-NEXT: .LBB7_1 : ; %Flow
540
+ ; GFX10-NEXT: ; in Loop: Header=BB7_2 Depth=1
492
541
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
493
542
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
494
543
; GFX10-NEXT: s_and_b32 s4, exec_lo, s2
@@ -497,8 +546,8 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a
497
546
; GFX10-NEXT: s_and_b32 s4, exec_lo, s3
498
547
; GFX10-NEXT: s_or_b32 s1, s1, s4
499
548
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
500
- ; GFX10-NEXT: s_cbranch_execz .LBB6_4
501
- ; GFX10-NEXT: .LBB6_2 : ; %A
549
+ ; GFX10-NEXT: s_cbranch_execz .LBB7_4
550
+ ; GFX10-NEXT: .LBB7_2 : ; %A
502
551
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
503
552
; GFX10-NEXT: v_ashrrev_i32_e32 v7, 31, v6
504
553
; GFX10-NEXT: s_andn2_b32 s3, s3, exec_lo
@@ -513,9 +562,9 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a
513
562
; GFX10-NEXT: s_waitcnt vmcnt(0)
514
563
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9
515
564
; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo
516
- ; GFX10-NEXT: s_cbranch_execz .LBB6_1
565
+ ; GFX10-NEXT: s_cbranch_execz .LBB7_1
517
566
; GFX10-NEXT: ; %bb.3: ; %loop.body
518
- ; GFX10-NEXT: ; in Loop: Header=BB6_2 Depth=1
567
+ ; GFX10-NEXT: ; in Loop: Header=BB7_2 Depth=1
519
568
; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v0, v7
520
569
; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v1, v8, vcc_lo
521
570
; GFX10-NEXT: v_add_nc_u32_e32 v10, 1, v6
@@ -531,16 +580,16 @@ define amdgpu_cs void @loop_with_1break(ptr addrspace(1) %x, ptr addrspace(1) %a
531
580
; GFX10-NEXT: s_waitcnt vmcnt(0)
532
581
; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v9
533
582
; GFX10-NEXT: global_store_dword v[7:8], v9, off
534
- ; GFX10-NEXT: s_branch .LBB6_1
535
- ; GFX10-NEXT: .LBB6_4 : ; %loop.exit.guard
583
+ ; GFX10-NEXT: s_branch .LBB7_1
584
+ ; GFX10-NEXT: .LBB7_4 : ; %loop.exit.guard
536
585
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0
537
586
; GFX10-NEXT: s_and_saveexec_b32 s0, s1
538
587
; GFX10-NEXT: s_xor_b32 s0, exec_lo, s0
539
- ; GFX10-NEXT: s_cbranch_execz .LBB6_6
588
+ ; GFX10-NEXT: s_cbranch_execz .LBB7_6
540
589
; GFX10-NEXT: ; %bb.5: ; %break.body
541
590
; GFX10-NEXT: v_mov_b32_e32 v0, 10
542
591
; GFX10-NEXT: global_store_dword v[4:5], v0, off
543
- ; GFX10-NEXT: .LBB6_6 : ; %exit
592
+ ; GFX10-NEXT: .LBB7_6 : ; %exit
544
593
; GFX10-NEXT: s_endpgm
545
594
entry:
546
595
br label %A
0 commit comments