Skip to content

Commit 50dd383

Browse files
committed
[MachineLICM] Handle Subloops
Following the discussion on https://reviews.llvm.org/D154205, make the MachineLICM pass handle subloops while visiting the outermost loop's blocks only once. Differential Revision: https://reviews.llvm.org/D154205
1 parent cce5324 commit 50dd383

File tree

7 files changed

+136
-116
lines changed

7 files changed

+136
-116
lines changed

llvm/lib/CodeGen/MachineLICM.cpp

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -778,8 +778,25 @@ void MachineLICMBase::HoistOutOfLoop(MachineDomTreeNode *HeaderN) {
778778
// Process the block
779779
SpeculationState = SpeculateUnknown;
780780
for (MachineInstr &MI : llvm::make_early_inc_range(*MBB)) {
781-
if (!Hoist(&MI, Preheader))
781+
if (!Hoist(&MI, Preheader)) {
782+
// We have failed to hoist MI to outmost loop's preheader. If MI is in
783+
// subloop, try to hoist it to subloop's preheader.
784+
MachineLoop *InnerMostLoop = MLI->getLoopFor(MI.getParent());
785+
MachineBasicBlock *InnerMostLoopPreheader =
786+
InnerMostLoop->getLoopPreheader();
787+
if (CurLoop != InnerMostLoop && InnerMostLoopPreheader) {
788+
std::swap(CurLoop, InnerMostLoop);
789+
std::swap(CurPreheader, InnerMostLoopPreheader);
790+
Hoist(&MI, CurPreheader);
791+
std::swap(CurLoop, InnerMostLoop);
792+
std::swap(CurPreheader, InnerMostLoopPreheader);
793+
}
794+
// When MI is hoisted to inner-most loop's preheader, we need to update
795+
// reg pressure because we have already visited inner-most loop's
796+
// preheader.
782797
UpdateRegPressure(&MI);
798+
}
799+
783800
// If we have hoisted an instruction that may store, it can only be a
784801
// constant store.
785802
}

llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -38,20 +38,20 @@ define void @foo(i32 noundef %limit, ptr %out, ptr %y) {
3838
; CHECK-NEXT: mov x16, x14
3939
; CHECK-NEXT: mov x17, x12
4040
; CHECK-NEXT: mov x18, x11
41+
; CHECK-NEXT: dup v0.8h, w15
4142
; CHECK-NEXT: .LBB0_6: // %vector.body
4243
; CHECK-NEXT: // Parent Loop BB0_3 Depth=1
4344
; CHECK-NEXT: // => This Inner Loop Header: Depth=2
44-
; CHECK-NEXT: ldp q0, q1, [x16, #-16]
45-
; CHECK-NEXT: dup v3.8h, w15
45+
; CHECK-NEXT: ldp q1, q2, [x16, #-16]
4646
; CHECK-NEXT: subs x18, x18, #16
4747
; CHECK-NEXT: add x16, x16, #32
48-
; CHECK-NEXT: ldp q4, q2, [x17, #-32]
49-
; CHECK-NEXT: smlal v4.4s, v3.4h, v0.4h
48+
; CHECK-NEXT: ldp q4, q3, [x17, #-32]
49+
; CHECK-NEXT: smlal v4.4s, v0.4h, v1.4h
5050
; CHECK-NEXT: ldp q6, q5, [x17]
51-
; CHECK-NEXT: smlal2 v2.4s, v3.8h, v0.8h
52-
; CHECK-NEXT: smlal v6.4s, v3.4h, v1.4h
53-
; CHECK-NEXT: stp q4, q2, [x17, #-32]
54-
; CHECK-NEXT: smlal2 v5.4s, v3.8h, v1.8h
51+
; CHECK-NEXT: smlal2 v3.4s, v0.8h, v1.8h
52+
; CHECK-NEXT: smlal v6.4s, v0.4h, v2.4h
53+
; CHECK-NEXT: stp q4, q3, [x17, #-32]
54+
; CHECK-NEXT: smlal2 v5.4s, v0.8h, v2.8h
5555
; CHECK-NEXT: stp q6, q5, [x17], #64
5656
; CHECK-NEXT: b.ne .LBB0_6
5757
; CHECK-NEXT: // %bb.7: // %middle.block

llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll

Lines changed: 24 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -557,11 +557,11 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
557557
; GFX908-NEXT: s_mul_hi_u32 s9, s0, s5
558558
; GFX908-NEXT: s_mul_i32 s0, s0, s5
559559
; GFX908-NEXT: s_add_i32 s1, s9, s1
560-
; GFX908-NEXT: s_lshl_b64 s[0:1], s[0:1], 5
560+
; GFX908-NEXT: s_lshl_b64 s[14:15], s[0:1], 5
561561
; GFX908-NEXT: s_branch .LBB3_2
562562
; GFX908-NEXT: .LBB3_1: ; %Flow20
563563
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
564-
; GFX908-NEXT: s_andn2_b64 vcc, exec, s[14:15]
564+
; GFX908-NEXT: s_andn2_b64 vcc, exec, s[0:1]
565565
; GFX908-NEXT: s_cbranch_vccz .LBB3_12
566566
; GFX908-NEXT: .LBB3_2: ; %bb9
567567
; GFX908-NEXT: ; =>This Loop Header: Depth=1
@@ -571,15 +571,17 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
571571
; GFX908-NEXT: ; %bb.3: ; %bb14
572572
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
573573
; GFX908-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
574+
; GFX908-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], -1
574575
; GFX908-NEXT: s_mov_b32 s9, s8
576+
; GFX908-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1]
575577
; GFX908-NEXT: v_mov_b32_e32 v4, s8
578+
; GFX908-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v6
576579
; GFX908-NEXT: v_mov_b32_e32 v8, s8
577580
; GFX908-NEXT: v_mov_b32_e32 v6, s8
578581
; GFX908-NEXT: v_mov_b32_e32 v5, s9
579582
; GFX908-NEXT: v_mov_b32_e32 v9, s9
580583
; GFX908-NEXT: v_mov_b32_e32 v7, s9
581-
; GFX908-NEXT: v_cmp_lt_i64_e64 s[14:15], s[6:7], 0
582-
; GFX908-NEXT: v_cmp_gt_i64_e64 s[16:17], s[6:7], -1
584+
; GFX908-NEXT: v_cmp_lt_i64_e64 s[16:17], s[6:7], 0
583585
; GFX908-NEXT: v_mov_b32_e32 v11, v5
584586
; GFX908-NEXT: s_mov_b64 s[20:21], s[10:11]
585587
; GFX908-NEXT: v_mov_b32_e32 v10, v4
@@ -599,9 +601,9 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
599601
; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2
600602
; GFX908-NEXT: v_add_co_u32_sdwa v2, vcc, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
601603
; GFX908-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
602-
; GFX908-NEXT: s_add_u32 s20, s20, s0
604+
; GFX908-NEXT: s_add_u32 s20, s20, s14
603605
; GFX908-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[2:3]
604-
; GFX908-NEXT: s_addc_u32 s21, s21, s1
606+
; GFX908-NEXT: s_addc_u32 s21, s21, s15
605607
; GFX908-NEXT: s_mov_b64 s[22:23], 0
606608
; GFX908-NEXT: s_andn2_b64 vcc, exec, s[24:25]
607609
; GFX908-NEXT: s_cbranch_vccz .LBB3_9
@@ -620,7 +622,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
620622
; GFX908-NEXT: s_waitcnt vmcnt(0)
621623
; GFX908-NEXT: ds_read_b64 v[12:13], v19
622624
; GFX908-NEXT: ds_read_b64 v[14:15], v0
623-
; GFX908-NEXT: s_andn2_b64 vcc, exec, s[16:17]
625+
; GFX908-NEXT: s_and_b64 vcc, exec, s[0:1]
624626
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
625627
; GFX908-NEXT: s_cbranch_vccnz .LBB3_7
626628
; GFX908-NEXT: ; %bb.6: ; %bb51
@@ -648,7 +650,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
648650
; GFX908-NEXT: s_mov_b64 s[22:23], -1
649651
; GFX908-NEXT: s_branch .LBB3_4
650652
; GFX908-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2
651-
; GFX908-NEXT: s_mov_b64 s[22:23], s[14:15]
653+
; GFX908-NEXT: s_mov_b64 s[22:23], s[16:17]
652654
; GFX908-NEXT: s_andn2_b64 vcc, exec, s[22:23]
653655
; GFX908-NEXT: s_cbranch_vccz .LBB3_4
654656
; GFX908-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1
@@ -663,7 +665,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
663665
; GFX908-NEXT: s_xor_b64 s[16:17], s[22:23], -1
664666
; GFX908-NEXT: .LBB3_10: ; %Flow19
665667
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
666-
; GFX908-NEXT: s_mov_b64 s[14:15], -1
668+
; GFX908-NEXT: s_mov_b64 s[0:1], -1
667669
; GFX908-NEXT: s_and_b64 vcc, exec, s[16:17]
668670
; GFX908-NEXT: s_cbranch_vccz .LBB3_1
669671
; GFX908-NEXT: ; %bb.11: ; %bb12
@@ -672,7 +674,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
672674
; GFX908-NEXT: s_addc_u32 s7, s7, 0
673675
; GFX908-NEXT: s_add_u32 s10, s10, s12
674676
; GFX908-NEXT: s_addc_u32 s11, s11, s13
675-
; GFX908-NEXT: s_mov_b64 s[14:15], 0
677+
; GFX908-NEXT: s_mov_b64 s[0:1], 0
676678
; GFX908-NEXT: s_branch .LBB3_1
677679
; GFX908-NEXT: .LBB3_12: ; %DummyReturnBlock
678680
; GFX908-NEXT: s_endpgm
@@ -722,11 +724,11 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
722724
; GFX90A-NEXT: s_mul_hi_u32 s9, s0, s5
723725
; GFX90A-NEXT: s_mul_i32 s0, s0, s5
724726
; GFX90A-NEXT: s_add_i32 s1, s9, s1
725-
; GFX90A-NEXT: s_lshl_b64 s[0:1], s[0:1], 5
727+
; GFX90A-NEXT: s_lshl_b64 s[14:15], s[0:1], 5
726728
; GFX90A-NEXT: s_branch .LBB3_2
727729
; GFX90A-NEXT: .LBB3_1: ; %Flow20
728730
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
729-
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[14:15]
731+
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[0:1]
730732
; GFX90A-NEXT: s_cbranch_vccz .LBB3_12
731733
; GFX90A-NEXT: .LBB3_2: ; %bb9
732734
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
@@ -736,12 +738,14 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
736738
; GFX90A-NEXT: ; %bb.3: ; %bb14
737739
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
738740
; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
741+
; GFX90A-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], -1
739742
; GFX90A-NEXT: s_mov_b32 s9, s8
743+
; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1]
740744
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[8:9], s[8:9] op_sel:[0,1]
745+
; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v8
741746
; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[8:9], s[8:9] op_sel:[0,1]
742747
; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
743-
; GFX90A-NEXT: v_cmp_lt_i64_e64 s[14:15], s[6:7], 0
744-
; GFX90A-NEXT: v_cmp_gt_i64_e64 s[16:17], s[6:7], -1
748+
; GFX90A-NEXT: v_cmp_lt_i64_e64 s[16:17], s[6:7], 0
745749
; GFX90A-NEXT: s_mov_b64 s[20:21], s[10:11]
746750
; GFX90A-NEXT: v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1]
747751
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -760,8 +764,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
760764
; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2
761765
; GFX90A-NEXT: v_add_co_u32_sdwa v4, vcc, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
762766
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
763-
; GFX90A-NEXT: s_add_u32 s20, s20, s0
764-
; GFX90A-NEXT: s_addc_u32 s21, s21, s1
767+
; GFX90A-NEXT: s_add_u32 s20, s20, s14
768+
; GFX90A-NEXT: s_addc_u32 s21, s21, s15
765769
; GFX90A-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[4:5]
766770
; GFX90A-NEXT: s_mov_b64 s[22:23], 0
767771
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[24:25]
@@ -781,7 +785,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
781785
; GFX90A-NEXT: s_waitcnt vmcnt(0)
782786
; GFX90A-NEXT: ds_read_b64 v[14:15], v19
783787
; GFX90A-NEXT: ds_read_b64 v[16:17], v0
784-
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[16:17]
788+
; GFX90A-NEXT: s_and_b64 vcc, exec, s[0:1]
785789
; GFX90A-NEXT: ; kill: killed $sgpr22 killed $sgpr23
786790
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
787791
; GFX90A-NEXT: s_cbranch_vccnz .LBB3_7
@@ -802,7 +806,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
802806
; GFX90A-NEXT: s_mov_b64 s[22:23], -1
803807
; GFX90A-NEXT: s_branch .LBB3_4
804808
; GFX90A-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2
805-
; GFX90A-NEXT: s_mov_b64 s[22:23], s[14:15]
809+
; GFX90A-NEXT: s_mov_b64 s[22:23], s[16:17]
806810
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[22:23]
807811
; GFX90A-NEXT: s_cbranch_vccz .LBB3_4
808812
; GFX90A-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1
@@ -817,7 +821,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
817821
; GFX90A-NEXT: s_xor_b64 s[16:17], s[22:23], -1
818822
; GFX90A-NEXT: .LBB3_10: ; %Flow19
819823
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
820-
; GFX90A-NEXT: s_mov_b64 s[14:15], -1
824+
; GFX90A-NEXT: s_mov_b64 s[0:1], -1
821825
; GFX90A-NEXT: s_and_b64 vcc, exec, s[16:17]
822826
; GFX90A-NEXT: s_cbranch_vccz .LBB3_1
823827
; GFX90A-NEXT: ; %bb.11: ; %bb12
@@ -826,7 +830,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
826830
; GFX90A-NEXT: s_addc_u32 s7, s7, 0
827831
; GFX90A-NEXT: s_add_u32 s10, s10, s12
828832
; GFX90A-NEXT: s_addc_u32 s11, s11, s13
829-
; GFX90A-NEXT: s_mov_b64 s[14:15], 0
833+
; GFX90A-NEXT: s_mov_b64 s[0:1], 0
830834
; GFX90A-NEXT: s_branch .LBB3_1
831835
; GFX90A-NEXT: .LBB3_12: ; %DummyReturnBlock
832836
; GFX90A-NEXT: s_endpgm

llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,11 @@
1-
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool llc
22
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
33

44
; GCN-LABEL: {{^}}negated_cond:
55
; GCN: .LBB0_1:
6-
; GCN: v_cmp_eq_u32_e64 [[CC:[^,]+]],
76
; GCN: .LBB0_3:
87
; GCN-NOT: v_cndmask_b32
98
; GCN-NOT: v_cmp
10-
; GCN: s_andn2_b64 vcc, exec, [[CC]]
119
; GCN: s_lshl_b32 s12, s12, 5
1210
; GCN: s_cbranch_vccz .LBB0_6
1311
define amdgpu_kernel void @negated_cond(ptr addrspace(1) %arg1) {
@@ -38,17 +36,9 @@ bb4:
3836

3937
; GCN-LABEL: {{^}}negated_cond_dominated_blocks:
4038
; GCN: s_cmp_lg_u32
41-
; GCN: s_cselect_b64 [[CC1:[^,]+]], -1, 0
42-
; GCN: s_branch [[BB1:.LBB[0-9]+_[0-9]+]]
43-
; GCN: [[BB0:.LBB[0-9]+_[0-9]+]]
4439
; GCN-NOT: v_cndmask_b32
4540
; GCN-NOT: v_cmp
46-
; GCN: [[BB1]]:
47-
; GCN: s_mov_b64 vcc, [[CC1]]
48-
; GCN: s_cbranch_vccz [[BB2:.LBB[0-9]+_[0-9]+]]
4941
; GCN: s_mov_b64 vcc, exec
50-
; GCN: s_cbranch_execnz [[BB0]]
51-
; GCN: [[BB2]]:
5242
define amdgpu_kernel void @negated_cond_dominated_blocks(ptr addrspace(1) %arg1) {
5343
bb:
5444
br label %bb2
@@ -81,3 +71,5 @@ bb7:
8171
%tmp8 = icmp eq i32 %tmp7, 32
8272
br i1 %tmp8, label %bb3, label %bb4
8373
}
74+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
75+
; GCN: {{.*}}

llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -178,8 +178,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
178178
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v46
179179
; GLOBALNESS1-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc
180180
; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0)
181-
; GLOBALNESS1-NEXT: v_cmp_nlt_f64_e64 s[64:65], 0, v[0:1]
182-
; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[66:67], 0, v2
181+
; GLOBALNESS1-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1]
182+
; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
183+
; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[64:65], 0, v2
184+
; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[66:67], 1, v0
183185
; GLOBALNESS1-NEXT: s_branch .LBB1_15
184186
; GLOBALNESS1-NEXT: .LBB1_13: ; %Flow16
185187
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2
@@ -207,7 +209,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
207209
; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_20
208210
; GLOBALNESS1-NEXT: ; %bb.19: ; %bb6.i.i
209211
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2
210-
; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[64:65]
212+
; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[66:67]
211213
; GLOBALNESS1-NEXT: .LBB1_20: ; %spam.exit.i
212214
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2
213215
; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[56:57]
@@ -236,7 +238,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
236238
; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41
237239
; GLOBALNESS1-NEXT: global_store_dwordx2 v[46:47], v[44:45], off
238240
; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[76:77]
239-
; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[66:67]
241+
; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[64:65]
240242
; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_13
241243
; GLOBALNESS1-NEXT: ; %bb.22: ; %bb62.i
242244
; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2
@@ -465,8 +467,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
465467
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v46
466468
; GLOBALNESS0-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc
467469
; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0)
468-
; GLOBALNESS0-NEXT: v_cmp_nlt_f64_e64 s[64:65], 0, v[0:1]
469-
; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[66:67], 0, v2
470+
; GLOBALNESS0-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1]
471+
; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
472+
; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[64:65], 0, v2
473+
; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[66:67], 1, v0
470474
; GLOBALNESS0-NEXT: s_branch .LBB1_15
471475
; GLOBALNESS0-NEXT: .LBB1_13: ; %Flow16
472476
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2
@@ -494,7 +498,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
494498
; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_20
495499
; GLOBALNESS0-NEXT: ; %bb.19: ; %bb6.i.i
496500
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2
497-
; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[64:65]
501+
; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[66:67]
498502
; GLOBALNESS0-NEXT: .LBB1_20: ; %spam.exit.i
499503
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2
500504
; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[56:57]
@@ -523,7 +527,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i
523527
; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41
524528
; GLOBALNESS0-NEXT: global_store_dwordx2 v[46:47], v[44:45], off
525529
; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[78:79]
526-
; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[66:67]
530+
; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[64:65]
527531
; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_13
528532
; GLOBALNESS0-NEXT: ; %bb.22: ; %bb62.i
529533
; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2

0 commit comments

Comments
 (0)