Skip to content

Commit 6c2eec5

Browse files
AMDGPU/GlobalISel: lane masks merging (#73337)
Basic implementation of lane mask merging for GlobalISel. Lane masks in GlobalISel are registers with an sgpr register class and S1 LLT - required by machine uniformity analysis. Implements the equivalent of lowerPhis from SILowerI1Copies.cpp in five patches: patch 1: #75340; patch 2: #75349; patch 3: #80003; patch 4: #78431; patch 5 is in this commit: "AMDGPU/GlobalISelDivergenceLowering: constrain incoming registers". Previously, in PHIs that represent lane masks, incoming registers taken as-is were not selected as lane masks. Such registers are not merged with another lane mask and most often only have S1 LLT. Implement constrainAsLaneMask by constraining incoming registers taken as-is with lane mask attributes, essentially transforming them into lane masks. This is the final step in having PHI instructions created in this pass be fully instruction-selected.
1 parent dbca8a4 commit 6c2eec5

7 files changed

+256
-232
lines changed

llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -177,7 +177,16 @@ void DivergenceLoweringHelper::buildMergeLaneMasks(
177177
B.buildInstr(OrOp, {DstReg}, {PrevMaskedReg, CurMaskedReg});
178178
}
179179

180-
void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) { return; }
180+
// GlobalISel has to constrain S1 incoming taken as-is with lane mask register
181+
// class. Insert a copy of Incoming.Reg to new lane mask inside Incoming.Block,
182+
// Incoming.Reg becomes that new lane mask.
183+
void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) {
184+
B.setInsertPt(*In.Block, In.Block->getFirstTerminator());
185+
186+
auto Copy = B.buildCopy(LLT::scalar(1), In.Reg);
187+
MRI->setRegClass(Copy.getReg(0), ST->getBoolRC());
188+
In.Reg = Copy.getReg(0);
189+
}
181190

182191
} // End anonymous namespace.
183192

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll

Lines changed: 10 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
2-
; RUN: llc -global-isel -amdgpu-global-isel-risky-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
2+
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
33

44
; Divergent phis that don't require lowering using lane mask merging
55

@@ -147,32 +147,28 @@ define void @divergent_i1_phi_used_inside_loop_bigger_loop_body(float %val, floa
147147
; GFX10-LABEL: divergent_i1_phi_used_inside_loop_bigger_loop_body:
148148
; GFX10: ; %bb.0: ; %entry
149149
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
150-
; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, 1.0, v1
151-
; GFX10-NEXT: s_mov_b32 s5, 0
150+
; GFX10-NEXT: s_mov_b32 s4, 0
151+
; GFX10-NEXT: v_cmp_lt_f32_e64 s5, 1.0, v1
152152
; GFX10-NEXT: v_mov_b32_e32 v1, 0x3e8
153-
; GFX10-NEXT: v_mov_b32_e32 v8, s5
153+
; GFX10-NEXT: v_mov_b32_e32 v8, s4
154154
; GFX10-NEXT: ; implicit-def: $sgpr6
155-
; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo
156155
; GFX10-NEXT: s_branch .LBB3_2
157156
; GFX10-NEXT: .LBB3_1: ; %loop_body
158157
; GFX10-NEXT: ; in Loop: Header=BB3_2 Depth=1
159158
; GFX10-NEXT: v_cvt_f32_u32_e32 v9, v8
160-
; GFX10-NEXT: s_xor_b32 s4, s4, -1
159+
; GFX10-NEXT: s_xor_b32 s5, s5, -1
161160
; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v8
162161
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v9, v0
163-
; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s4
164-
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
162+
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
165163
; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo
166-
; GFX10-NEXT: s_and_b32 s4, exec_lo, s4
167-
; GFX10-NEXT: s_or_b32 s6, s6, s4
168-
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
164+
; GFX10-NEXT: s_and_b32 s7, exec_lo, s5
165+
; GFX10-NEXT: s_or_b32 s6, s6, s7
166+
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
169167
; GFX10-NEXT: s_cbranch_execz .LBB3_6
170168
; GFX10-NEXT: .LBB3_2: ; %loop_start
171169
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
172-
; GFX10-NEXT: v_and_b32_e32 v9, 1, v9
173170
; GFX10-NEXT: v_cmp_ge_i32_e32 vcc_lo, 0x3e8, v8
174171
; GFX10-NEXT: s_mov_b32 s7, 1
175-
; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v9
176172
; GFX10-NEXT: s_cbranch_vccz .LBB3_4
177173
; GFX10-NEXT: ; %bb.3: ; %else
178174
; GFX10-NEXT: ; in Loop: Header=BB3_2 Depth=1
@@ -189,7 +185,7 @@ define void @divergent_i1_phi_used_inside_loop_bigger_loop_body(float %val, floa
189185
; GFX10-NEXT: flat_store_dword v[4:5], v1
190186
; GFX10-NEXT: s_branch .LBB3_1
191187
; GFX10-NEXT: .LBB3_6: ; %exit
192-
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
188+
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
193189
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s6
194190
; GFX10-NEXT: flat_store_dword v[2:3], v0
195191
; GFX10-NEXT: s_waitcnt lgkmcnt(0)

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir

Lines changed: 38 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -33,6 +33,7 @@ body: |
3333
; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[COPY2]](s32), [[C]]
3434
; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
3535
; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY3]](s32), [[C1]]
36+
; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP]](s1)
3637
; GFX10-NEXT: G_BRCOND [[ICMP1]](s1), %bb.2
3738
; GFX10-NEXT: G_BR %bb.1
3839
; GFX10-NEXT: {{ $}}
@@ -46,20 +47,22 @@ body: |
4647
; GFX10-NEXT: bb.2:
4748
; GFX10-NEXT: successors: %bb.4(0x80000000)
4849
; GFX10-NEXT: {{ $}}
49-
; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = G_PHI %14(s1), %bb.3, [[ICMP]](s1), %bb.0
50+
; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[COPY6]](s1), %bb.0, %20(s1), %bb.3
51+
; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
5052
; GFX10-NEXT: G_BR %bb.4
5153
; GFX10-NEXT: {{ $}}
5254
; GFX10-NEXT: bb.3:
5355
; GFX10-NEXT: successors: %bb.2(0x80000000)
5456
; GFX10-NEXT: {{ $}}
5557
; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
5658
; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY2]](s32), [[C3]]
59+
; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
5760
; GFX10-NEXT: G_BR %bb.2
5861
; GFX10-NEXT: {{ $}}
5962
; GFX10-NEXT: bb.4:
6063
; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
6164
; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
62-
; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI]](s1), [[C5]], [[C4]]
65+
; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY7]](s1), [[C5]], [[C4]]
6366
; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
6467
; GFX10-NEXT: S_ENDPGM 0
6568
bb.0:
@@ -126,9 +129,10 @@ body: |
126129
; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr0
127130
; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 6
128131
; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[COPY2]](s32), [[C]]
129-
; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP]](s1)
130132
; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
131133
; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY3]](s32), [[C1]]
134+
; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP]](s1)
135+
; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[COPY4]](s1)
132136
; GFX10-NEXT: G_BRCOND [[ICMP1]](s1), %bb.2
133137
; GFX10-NEXT: G_BR %bb.1
134138
; GFX10-NEXT: {{ $}}
@@ -137,17 +141,17 @@ body: |
137141
; GFX10-NEXT: {{ $}}
138142
; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
139143
; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY2]](s32), [[C2]]
140-
; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
141-
; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY4]](s1), $exec_lo, implicit-def $scc
142-
; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY5]](s1), implicit-def $scc
144+
; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
145+
; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY5]](s1), $exec_lo, implicit-def $scc
146+
; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY6]](s1), implicit-def $scc
143147
; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
144148
; GFX10-NEXT: {{ $}}
145149
; GFX10-NEXT: bb.2:
146-
; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[ICMP]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.1
147-
; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
150+
; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[COPY4]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.1
151+
; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
148152
; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
149153
; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
150-
; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY6]](s1), [[C4]], [[C3]]
154+
; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY7]](s1), [[C4]], [[C3]]
151155
; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
152156
; GFX10-NEXT: S_ENDPGM 0
153157
bb.0:
@@ -292,19 +296,21 @@ body: |
292296
; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
293297
; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
294298
; GFX10-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[COPY1]](s32), [[C1]]
299+
; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[FCMP]](s1)
295300
; GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
296301
; GFX10-NEXT: {{ $}}
297302
; GFX10-NEXT: bb.1:
298303
; GFX10-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
299304
; GFX10-NEXT: {{ $}}
300-
; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %39(s1), %bb.5
301-
; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %15(s32), %bb.5, [[C]](s32), %bb.0
302-
; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %17(s32), %bb.5
303-
; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32(s1) = G_PHI [[FCMP]](s1), %bb.0, %19(s1), %bb.5
304-
; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
305+
; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %42(s1), %bb.5
306+
; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[COPY8]](s1), %bb.0, %39(s1), %bb.5
307+
; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI %15(s32), %bb.5, [[C]](s32), %bb.0
308+
; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %17(s32), %bb.5
309+
; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
310+
; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1)
305311
; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
306312
; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1000
307-
; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sle), [[PHI2]](s32), [[C3]]
313+
; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sle), [[PHI3]](s32), [[C3]]
308314
; GFX10-NEXT: G_BRCOND [[ICMP]](s1), %bb.4
309315
; GFX10-NEXT: G_BR %bb.2
310316
; GFX10-NEXT: {{ $}}
@@ -336,26 +342,27 @@ body: |
336342
; GFX10-NEXT: successors: %bb.6(0x04000000), %bb.1(0x7c000000)
337343
; GFX10-NEXT: {{ $}}
338344
; GFX10-NEXT: [[C8:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
339-
; GFX10-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[PHI3]], [[C8]]
340-
; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[XOR1]](s1)
341-
; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI2]](s32)
345+
; GFX10-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[COPY10]], [[C8]]
346+
; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[XOR1]](s1)
347+
; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI3]](s32)
342348
; GFX10-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]]
343349
; GFX10-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
344-
; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C9]]
345-
; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI1]](s32)
346-
; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc
347-
; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY9]](s1), implicit-def $scc
350+
; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI3]], [[C9]]
351+
; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI2]](s32)
352+
; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[XOR1]](s1)
353+
; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY9]](s1), $exec_lo, implicit-def $scc
354+
; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY11]](s1), implicit-def $scc
348355
; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
349356
; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
350357
; GFX10-NEXT: G_BR %bb.6
351358
; GFX10-NEXT: {{ $}}
352359
; GFX10-NEXT: bb.6:
353360
; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.5
354-
; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
361+
; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
355362
; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI5]](s32)
356363
; GFX10-NEXT: [[C10:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
357364
; GFX10-NEXT: [[C11:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
358-
; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY10]](s1), [[C11]], [[C10]]
365+
; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY13]](s1), [[C11]], [[C10]]
359366
; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV]](p0) :: (store (s32))
360367
; GFX10-NEXT: SI_RETURN
361368
bb.0:
@@ -475,6 +482,7 @@ body: |
475482
; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s1) = G_TRUNC [[AND1]](s32)
476483
; GFX10-NEXT: [[C5:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
477484
; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[TRUNC1]], [[C5]]
485+
; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32(s1) = COPY [[C5]](s1)
478486
; GFX10-NEXT: G_BRCOND [[XOR]](s1), %bb.2
479487
; GFX10-NEXT: G_BR %bb.1
480488
; GFX10-NEXT: {{ $}}
@@ -487,9 +495,10 @@ body: |
487495
; GFX10-NEXT: bb.2:
488496
; GFX10-NEXT: successors: %bb.5(0x40000000), %bb.6(0x40000000)
489497
; GFX10-NEXT: {{ $}}
490-
; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %30(s32), %bb.4, [[DEF]](s32), %bb.0
491-
; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = G_PHI %32(s1), %bb.4, [[C5]](s1), %bb.0
492-
; GFX10-NEXT: G_BRCOND [[PHI1]](s1), %bb.5
498+
; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[COPY3]](s1), %bb.0, %58(s1), %bb.4
499+
; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %30(s32), %bb.4, [[DEF]](s32), %bb.0
500+
; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
501+
; GFX10-NEXT: G_BRCOND [[COPY4]](s1), %bb.5
493502
; GFX10-NEXT: G_BR %bb.6
494503
; GFX10-NEXT: {{ $}}
495504
; GFX10-NEXT: bb.3:
@@ -517,6 +526,7 @@ body: |
517526
; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[PHI5]](s32), [[AMDGPU_BUFFER_LOAD]]
518527
; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s1) = G_OR [[ICMP]], [[ICMP2]]
519528
; GFX10-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s1)
529+
; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[C10]](s1)
520530
; GFX10-NEXT: G_BR %bb.2
521531
; GFX10-NEXT: {{ $}}
522532
; GFX10-NEXT: bb.5:
@@ -527,7 +537,7 @@ body: |
527537
; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[C11]]
528538
; GFX10-NEXT: {{ $}}
529539
; GFX10-NEXT: bb.6:
530-
; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[PHI]](s32), %bb.2, [[OR2]](s32), %bb.5
540+
; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[PHI1]](s32), %bb.2, [[OR2]](s32), %bb.5
531541
; GFX10-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[LOAD]](<8 x s32>)
532542
; GFX10-NEXT: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[COPY2]], [[COPY1]]
533543
; GFX10-NEXT: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 2

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll

Lines changed: 16 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
2-
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -amdgpu-global-isel-risky-select -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
2+
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
33

44
; This file contains various tests that have divergent i1s used outside of
55
; the loop. These are lane masks in sgpr and need to have correct value in
@@ -137,28 +137,24 @@ define void @divergent_i1_xor_used_outside_loop(float %val, float %pre.cond.val,
137137
; GFX10-LABEL: divergent_i1_xor_used_outside_loop:
138138
; GFX10: ; %bb.0: ; %entry
139139
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
140-
; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, 1.0, v1
141-
; GFX10-NEXT: s_mov_b32 s5, 0
140+
; GFX10-NEXT: s_mov_b32 s4, 0
141+
; GFX10-NEXT: v_cmp_lt_f32_e64 s5, 1.0, v1
142+
; GFX10-NEXT: v_mov_b32_e32 v1, s4
142143
; GFX10-NEXT: ; implicit-def: $sgpr6
143-
; GFX10-NEXT: v_mov_b32_e32 v1, s5
144-
; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
145144
; GFX10-NEXT: .LBB2_1: ; %loop
146145
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
147-
; GFX10-NEXT: v_and_b32_e32 v4, 1, v4
148-
; GFX10-NEXT: v_cvt_f32_u32_e32 v5, v1
146+
; GFX10-NEXT: v_cvt_f32_u32_e32 v4, v1
147+
; GFX10-NEXT: s_xor_b32 s5, s5, -1
149148
; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v1
150-
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
151-
; GFX10-NEXT: v_cmp_gt_f32_e64 s4, v5, v0
152-
; GFX10-NEXT: s_xor_b32 s7, vcc_lo, -1
153-
; GFX10-NEXT: s_or_b32 s5, s4, s5
154-
; GFX10-NEXT: v_mov_b32_e32 v4, s7
155-
; GFX10-NEXT: s_andn2_b32 s4, s6, exec_lo
156-
; GFX10-NEXT: s_and_b32 s6, exec_lo, s7
157-
; GFX10-NEXT: s_or_b32 s6, s4, s6
158-
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
149+
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v0
150+
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
151+
; GFX10-NEXT: s_andn2_b32 s6, s6, exec_lo
152+
; GFX10-NEXT: s_and_b32 s7, exec_lo, s5
153+
; GFX10-NEXT: s_or_b32 s6, s6, s7
154+
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
159155
; GFX10-NEXT: s_cbranch_execnz .LBB2_1
160156
; GFX10-NEXT: ; %bb.2: ; %exit
161-
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
157+
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
162158
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s6
163159
; GFX10-NEXT: flat_store_dword v[2:3], v0
164160
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -197,7 +193,7 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts,
197193
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
198194
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
199195
; GFX10-NEXT: s_mov_b32 s5, 0
200-
; GFX10-NEXT: s_mov_b32 s6, 1
196+
; GFX10-NEXT: s_mov_b32 s6, -1
201197
; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo
202198
; GFX10-NEXT: s_cbranch_execz .LBB3_6
203199
; GFX10-NEXT: ; %bb.1: ; %loop.start.preheader
@@ -332,7 +328,7 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace
332328
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
333329
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s7
334330
; GFX10-NEXT: v_cmp_ne_u32_e64 s4, v1, v4
335-
; GFX10-NEXT: s_mov_b32 s7, 1
331+
; GFX10-NEXT: s_mov_b32 s7, -1
336332
; GFX10-NEXT: ; implicit-def: $vgpr5
337333
; GFX10-NEXT: s_and_saveexec_b32 s8, s4
338334
; GFX10-NEXT: s_cbranch_execz .LBB4_1
@@ -410,7 +406,7 @@ define amdgpu_ps void @divergent_i1_freeze_used_outside_loop(i32 %n, ptr addrspa
410406
; GFX10-LABEL: divergent_i1_freeze_used_outside_loop:
411407
; GFX10: ; %bb.0: ; %entry
412408
; GFX10-NEXT: s_mov_b32 s0, 0
413-
; GFX10-NEXT: s_mov_b32 s3, 1
409+
; GFX10-NEXT: s_mov_b32 s3, -1
414410
; GFX10-NEXT: v_mov_b32_e32 v5, s0
415411
; GFX10-NEXT: ; implicit-def: $sgpr1
416412
; GFX10-NEXT: ; implicit-def: $sgpr2

0 commit comments

Comments
 (0)