Skip to content

Commit c07e1e3

Browse files
AMDGPU/GlobalISel: Temporal divergence lowering (non i1) (#124298)
Record all uses outside cycle with divergent exit during propagateTemporalDivergence in Uniformity analysis. With this list of candidates for temporal divergence lowering, excluding known lane masks from control flow intrinsics, find sources from inside the cycle that are not i1 and uniform. Temporal divergence lowering (non i1): create copy(v_mov) to vgpr, with implicit exec (to stop other passes from moving this copy outside of the cycle) and use this vgpr outside of the cycle instead of original uniform source.
1 parent 553da96 commit c07e1e3

12 files changed

+176
-54
lines changed

llvm/include/llvm/ADT/GenericUniformityImpl.h

Lines changed: 42 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@
5151
#include "llvm/ADT/SmallPtrSet.h"
5252
#include "llvm/ADT/SparseBitVector.h"
5353
#include "llvm/ADT/StringExtras.h"
54+
#include "llvm/CodeGen/MachineInstr.h"
5455
#include "llvm/Support/raw_ostream.h"
5556

5657
#define DEBUG_TYPE "uniformity"
@@ -342,6 +343,9 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
342343
typename SyncDependenceAnalysisT::DivergenceDescriptor;
343344
using BlockLabelMapT = typename SyncDependenceAnalysisT::BlockLabelMap;
344345

346+
using TemporalDivergenceTuple =
347+
std::tuple<ConstValueRefT, InstructionT *, const CycleT *>;
348+
345349
GenericUniformityAnalysisImpl(const DominatorTreeT &DT, const CycleInfoT &CI,
346350
const TargetTransformInfo *TTI)
347351
: Context(CI.getSSAContext()), F(*Context.getFunction()), CI(CI),
@@ -396,6 +400,11 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
396400

397401
void print(raw_ostream &out) const;
398402

403+
SmallVector<TemporalDivergenceTuple, 8> TemporalDivergenceList;
404+
405+
void recordTemporalDivergence(ConstValueRefT, const InstructionT *,
406+
const CycleT *);
407+
399408
protected:
400409
/// \brief Value/block pair representing a single phi input.
401410
struct PhiInput {
@@ -1129,6 +1138,13 @@ void GenericUniformityAnalysisImpl<ContextT>::compute() {
11291138
}
11301139
}
11311140

1141+
template <typename ContextT>
1142+
void GenericUniformityAnalysisImpl<ContextT>::recordTemporalDivergence(
1143+
ConstValueRefT Val, const InstructionT *User, const CycleT *Cycle) {
1144+
TemporalDivergenceList.emplace_back(Val, const_cast<InstructionT *>(User),
1145+
Cycle);
1146+
}
1147+
11321148
template <typename ContextT>
11331149
bool GenericUniformityAnalysisImpl<ContextT>::isAlwaysUniform(
11341150
const InstructionT &Instr) const {
@@ -1146,6 +1162,12 @@ template <typename ContextT>
11461162
void GenericUniformityAnalysisImpl<ContextT>::print(raw_ostream &OS) const {
11471163
bool haveDivergentArgs = false;
11481164

1165+
// When we print a Value (LLVM IR instruction), we want to print an extra new line.
1166+
// In LLVM IR, the print function for Value does not print a new line at the end.
1167+
// In MIR, print for MachineInstr prints a new line at the end.
1168+
constexpr bool IsMIR = std::is_same<InstructionT, MachineInstr>::value;
1169+
std::string NewLine = IsMIR ? "" : "\n";
1170+
11491171
// Control flow instructions may be divergent even if their inputs are
11501172
// uniform. Thus, although exceedingly rare, it is possible to have a program
11511173
// with no divergent values but with divergent control structures.
@@ -1180,6 +1202,16 @@ void GenericUniformityAnalysisImpl<ContextT>::print(raw_ostream &OS) const {
11801202
}
11811203
}
11821204

1205+
if (!TemporalDivergenceList.empty()) {
1206+
OS << "\nTEMPORAL DIVERGENCE LIST:\n";
1207+
1208+
for (auto [Val, UseInst, Cycle] : TemporalDivergenceList) {
1209+
OS << "Value :" << Context.print(Val) << NewLine
1210+
<< "Used by :" << Context.print(UseInst) << NewLine
1211+
<< "Outside cycle :" << Cycle->print(Context) << "\n\n";
1212+
}
1213+
}
1214+
11831215
for (auto &block : F) {
11841216
OS << "\nBLOCK " << Context.print(&block) << '\n';
11851217

@@ -1191,7 +1223,7 @@ void GenericUniformityAnalysisImpl<ContextT>::print(raw_ostream &OS) const {
11911223
OS << " DIVERGENT: ";
11921224
else
11931225
OS << " ";
1194-
OS << Context.print(value) << '\n';
1226+
OS << Context.print(value) << NewLine;
11951227
}
11961228

11971229
OS << "TERMINATORS\n";
@@ -1203,13 +1235,21 @@ void GenericUniformityAnalysisImpl<ContextT>::print(raw_ostream &OS) const {
12031235
OS << " DIVERGENT: ";
12041236
else
12051237
OS << " ";
1206-
OS << Context.print(T) << '\n';
1238+
OS << Context.print(T) << NewLine;
12071239
}
12081240

12091241
OS << "END BLOCK\n";
12101242
}
12111243
}
12121244

1245+
template <typename ContextT>
1246+
iterator_range<
1247+
typename GenericUniformityInfo<ContextT>::TemporalDivergenceTuple *>
1248+
GenericUniformityInfo<ContextT>::getTemporalDivergenceList() const {
1249+
return make_range(DA->TemporalDivergenceList.begin(),
1250+
DA->TemporalDivergenceList.end());
1251+
}
1252+
12131253
template <typename ContextT>
12141254
bool GenericUniformityInfo<ContextT>::hasDivergence() const {
12151255
return DA->hasDivergence();

llvm/include/llvm/ADT/GenericUniformityInfo.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,9 @@ template <typename ContextT> class GenericUniformityInfo {
4040
using CycleInfoT = GenericCycleInfo<ContextT>;
4141
using CycleT = typename CycleInfoT::CycleT;
4242

43+
using TemporalDivergenceTuple =
44+
std::tuple<ConstValueRefT, InstructionT *, const CycleT *>;
45+
4346
GenericUniformityInfo(const DominatorTreeT &DT, const CycleInfoT &CI,
4447
const TargetTransformInfo *TTI = nullptr);
4548
GenericUniformityInfo() = default;
@@ -78,6 +81,8 @@ template <typename ContextT> class GenericUniformityInfo {
7881

7982
void print(raw_ostream &Out) const;
8083

84+
iterator_range<TemporalDivergenceTuple *> getTemporalDivergenceList() const;
85+
8186
private:
8287
using ImplT = GenericUniformityAnalysisImpl<ContextT>;
8388

llvm/lib/Analysis/UniformityAnalysis.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -79,13 +79,12 @@ template <>
7979
void llvm::GenericUniformityAnalysisImpl<
8080
SSAContext>::propagateTemporalDivergence(const Instruction &I,
8181
const Cycle &DefCycle) {
82-
if (isDivergent(I))
83-
return;
8482
for (auto *User : I.users()) {
8583
auto *UserInstr = cast<Instruction>(User);
8684
if (DefCycle.contains(UserInstr->getParent()))
8785
continue;
8886
markDivergent(*UserInstr);
87+
recordTemporalDivergence(&I, UserInstr, &DefCycle);
8988
}
9089
}
9190

llvm/lib/CodeGen/MachineUniformityAnalysis.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -117,12 +117,12 @@ void llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::
117117
if (!Op.getReg().isVirtual())
118118
continue;
119119
auto Reg = Op.getReg();
120-
if (isDivergent(Reg))
121-
continue;
122120
for (MachineInstr &UserInstr : RegInfo.use_instructions(Reg)) {
123121
if (DefCycle.contains(UserInstr.getParent()))
124122
continue;
125123
markDivergent(UserInstr);
124+
125+
recordTemporalDivergence(Reg, &UserInstr, &DefCycle);
126126
}
127127
}
128128
}
@@ -193,7 +193,7 @@ INITIALIZE_PASS_END(MachineUniformityAnalysisPass, "machine-uniformity",
193193

194194
void MachineUniformityAnalysisPass::getAnalysisUsage(AnalysisUsage &AU) const {
195195
AU.setPreservesAll();
196-
AU.addRequired<MachineCycleInfoWrapperPass>();
196+
AU.addRequiredTransitive<MachineCycleInfoWrapperPass>();
197197
AU.addRequired<MachineDominatorTreeWrapperPass>();
198198
MachineFunctionPass::getAnalysisUsage(AU);
199199
}

llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp

Lines changed: 51 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
//===----------------------------------------------------------------------===//
1717

1818
#include "AMDGPU.h"
19+
#include "AMDGPUGlobalISelUtils.h"
1920
#include "SILowerI1Copies.h"
2021
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
2122
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -78,6 +79,8 @@ class DivergenceLoweringHelper : public PhiLoweringHelper {
7879
Register DstReg, Register PrevReg,
7980
Register CurReg) override;
8081
void constrainAsLaneMask(Incoming &In) override;
82+
83+
bool lowerTemporalDivergence();
8184
};
8285

8386
DivergenceLoweringHelper::DivergenceLoweringHelper(
@@ -188,6 +191,43 @@ void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) {
188191
In.Reg = Copy.getReg(0);
189192
}
190193

194+
void replaceUsesOfRegInInstWith(Register Reg, MachineInstr *Inst,
195+
Register NewReg) {
196+
for (MachineOperand &Op : Inst->operands()) {
197+
if (Op.isReg() && Op.getReg() == Reg)
198+
Op.setReg(NewReg);
199+
}
200+
}
201+
202+
bool DivergenceLoweringHelper::lowerTemporalDivergence() {
203+
AMDGPU::IntrinsicLaneMaskAnalyzer ILMA(*MF);
204+
DenseMap<Register, Register> TDCache;
205+
206+
for (auto [Reg, UseInst, _] : MUI->getTemporalDivergenceList()) {
207+
if (MRI->getType(Reg) == LLT::scalar(1) || MUI->isDivergent(Reg) ||
208+
ILMA.isS32S64LaneMask(Reg))
209+
continue;
210+
211+
Register CachedTDCopy = TDCache.lookup(Reg);
212+
if (CachedTDCopy) {
213+
replaceUsesOfRegInInstWith(Reg, UseInst, CachedTDCopy);
214+
continue;
215+
}
216+
217+
MachineInstr *Inst = MRI->getVRegDef(Reg);
218+
MachineBasicBlock *MBB = Inst->getParent();
219+
B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(Inst->getIterator())));
220+
221+
Register VgprReg = MRI->createGenericVirtualRegister(MRI->getType(Reg));
222+
B.buildInstr(AMDGPU::COPY, {VgprReg}, {Reg})
223+
.addUse(ExecReg, RegState::Implicit);
224+
225+
replaceUsesOfRegInInstWith(Reg, UseInst, VgprReg);
226+
TDCache[Reg] = VgprReg;
227+
}
228+
return false;
229+
}
230+
191231
} // End anonymous namespace.
192232

193233
INITIALIZE_PASS_BEGIN(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
@@ -218,5 +258,15 @@ bool AMDGPUGlobalISelDivergenceLowering::runOnMachineFunction(
218258

219259
DivergenceLoweringHelper Helper(&MF, &DT, &PDT, &MUI);
220260

221-
return Helper.lowerPhis();
261+
bool Changed = false;
262+
// Temporal divergence lowering needs to inspect list of instructions used
263+
// outside cycle with divergent exit provided by uniformity analysis. Uniform
264+
// instructions from the list require lowering, no instruction is deleted.
265+
// Thus it needs to be run before lowerPhis that deletes phis that require
266+
// lowering and replaces them with new instructions.
267+
268+
// Non-i1 temporal divergence lowering.
269+
Changed |= Helper.lowerTemporalDivergence();
270+
Changed |= Helper.lowerPhis();
271+
return Changed;
222272
}

llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ class RegBankSelectHelper {
8383
MachineRegisterInfo &MRI;
8484
AMDGPU::IntrinsicLaneMaskAnalyzer &ILMA;
8585
const MachineUniformityInfo &MUI;
86+
const SIRegisterInfo &TRI;
8687
const RegisterBank *SgprRB;
8788
const RegisterBank *VgprRB;
8889
const RegisterBank *VccRB;
@@ -91,14 +92,29 @@ class RegBankSelectHelper {
9192
RegBankSelectHelper(MachineIRBuilder &B,
9293
AMDGPU::IntrinsicLaneMaskAnalyzer &ILMA,
9394
const MachineUniformityInfo &MUI,
94-
const RegisterBankInfo &RBI)
95-
: B(B), MRI(*B.getMRI()), ILMA(ILMA), MUI(MUI),
95+
const SIRegisterInfo &TRI, const RegisterBankInfo &RBI)
96+
: B(B), MRI(*B.getMRI()), ILMA(ILMA), MUI(MUI), TRI(TRI),
9697
SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
9798
VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
9899
VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}
99100

101+
// Temporal divergence copy: COPY to vgpr with implicit use of $exec inside of
102+
// the cycle
103+
// Note: uniformity analysis does not consider that registers with vgpr def
104+
// are divergent (you can have uniform value in vgpr).
105+
// - TODO: implicit use of $exec could be implemented as indicator that
106+
// instruction is divergent
107+
bool isTemporalDivergenceCopy(Register Reg) {
108+
MachineInstr *MI = MRI.getVRegDef(Reg);
109+
if (!MI->isCopy() || MI->getNumImplicitOperands() != 1)
110+
return false;
111+
112+
return MI->implicit_operands().begin()->getReg() == TRI.getExec();
113+
}
114+
100115
const RegisterBank *getRegBankToAssign(Register Reg) {
101-
if (MUI.isUniform(Reg) || ILMA.isS32S64LaneMask(Reg))
116+
if (!isTemporalDivergenceCopy(Reg) &&
117+
(MUI.isUniform(Reg) || ILMA.isS32S64LaneMask(Reg)))
102118
return SgprRB;
103119
if (MRI.getType(Reg) == LLT::scalar(1))
104120
return VccRB;
@@ -209,7 +225,8 @@ bool AMDGPURegBankSelect::runOnMachineFunction(MachineFunction &MF) {
209225
getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo();
210226
MachineRegisterInfo &MRI = *B.getMRI();
211227
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
212-
RegBankSelectHelper RBSHelper(B, ILMA, MUI, *ST.getRegBankInfo());
228+
RegBankSelectHelper RBSHelper(B, ILMA, MUI, *ST.getRegisterInfo(),
229+
*ST.getRegBankInfo());
213230
// Virtual registers at this point don't have register banks.
214231
// Virtual registers in def and use operands of already inst-selected
215232
// instruction have register class.

llvm/lib/Target/AMDGPU/SILowerI1Copies.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include "GCNSubtarget.h"
1616
#include "llvm/CodeGen/MachineBasicBlock.h"
1717
#include "llvm/CodeGen/MachinePostDominators.h"
18+
#include "llvm/CodeGen/MachineRegisterInfo.h"
1819
#include "llvm/CodeGen/MachineSSAUpdater.h"
1920

2021
namespace llvm {
@@ -72,6 +73,11 @@ class PhiLoweringHelper {
7273
LaneMaskRegAttrs = MRI->getVRegAttrs(LaneMask);
7374
}
7475

76+
void
77+
initializeLaneMaskRegisterAttributes(MachineRegisterInfo::VRegAttrs Attrs) {
78+
LaneMaskRegAttrs = Attrs;
79+
}
80+
7581
bool isLaneMaskReg(Register Reg) const {
7682
return TII->getRegisterInfo().isSGPRReg(*MRI, Reg) &&
7783
TII->getRegisterInfo().getRegSizeInBits(Reg, *MRI) ==

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -471,7 +471,7 @@ body: |
471471
; GFX10-NEXT: bb.2:
472472
; GFX10-NEXT: successors: %bb.5(0x40000000), %bb.6(0x40000000)
473473
; GFX10-NEXT: {{ $}}
474-
; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[COPY3]](s1), %bb.0, %56(s1), %bb.4
474+
; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[COPY3]](s1), %bb.0, %57(s1), %bb.4
475475
; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %29(s32), %bb.4, [[DEF]](s32), %bb.0
476476
; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
477477
; GFX10-NEXT: G_BRCOND [[COPY4]](s1), %bb.5
@@ -486,6 +486,7 @@ body: |
486486
; GFX10-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
487487
; GFX10-NEXT: [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:_(s32) = G_AMDGPU_BUFFER_LOAD [[UV]](<4 x s32>), [[C7]](s32), [[PHI2]], [[C7]], 0, 0, 0 :: (dereferenceable load (s32), align 1, addrspace 8)
488488
; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[AMDGPU_BUFFER_LOAD1]], [[PHI4]]
489+
; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[ADD]](s32), implicit $exec_lo
489490
; GFX10-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
490491
; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI3]], [[C8]]
491492
; GFX10-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
@@ -497,11 +498,11 @@ body: |
497498
; GFX10-NEXT: bb.4:
498499
; GFX10-NEXT: successors: %bb.2(0x80000000)
499500
; GFX10-NEXT: {{ $}}
500-
; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[AMDGPU_BUFFER_LOAD]]
501+
; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY5]](s32), [[AMDGPU_BUFFER_LOAD]]
501502
; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s1) = G_OR [[ICMP]], [[ICMP2]]
502503
; GFX10-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s1)
503504
; GFX10-NEXT: [[C10:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
504-
; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[C10]](s1)
505+
; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[C10]](s1)
505506
; GFX10-NEXT: G_BR %bb.2
506507
; GFX10-NEXT: {{ $}}
507508
; GFX10-NEXT: bb.5:

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -642,6 +642,7 @@ body: |
642642
; GFX10-NEXT: {{ $}}
643643
; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %11(s32), %bb.6, [[C]](s32), %bb.0
644644
; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %13(s32), %bb.6
645+
; GFX10-NEXT: [[COPY6:%[0-9]+]]:_(s32) = COPY [[PHI1]](s32), implicit $exec_lo
645646
; GFX10-NEXT: {{ $}}
646647
; GFX10-NEXT: bb.2:
647648
; GFX10-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000)
@@ -665,8 +666,8 @@ body: |
665666
; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
666667
; GFX10-NEXT: [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[COPY1]](s32), [[PHI1]]
667668
; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
668-
; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[C2]](s1)
669-
; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[COPY6]](s1)
669+
; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[C2]](s1)
670+
; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[COPY7]](s1)
670671
; GFX10-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec
671672
; GFX10-NEXT: G_BR %bb.5
672673
; GFX10-NEXT: {{ $}}
@@ -676,19 +677,19 @@ body: |
676677
; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
677678
; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C3]]
678679
; GFX10-NEXT: [[C4:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
679-
; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[C4]](s1)
680-
; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY7]](s1), $exec_lo, implicit-def $scc
681-
; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY8]](s1), implicit-def $scc
680+
; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[C4]](s1)
681+
; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc
682+
; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY9]](s1), implicit-def $scc
682683
; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
683684
; GFX10-NEXT: {{ $}}
684685
; GFX10-NEXT: bb.6:
685686
; GFX10-NEXT: successors: %bb.7(0x04000000), %bb.1(0x7c000000)
686687
; GFX10-NEXT: {{ $}}
687-
; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI [[COPY6]](s1), %bb.4, [[S_OR_B32_]](s1), %bb.5
688+
; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI [[COPY7]](s1), %bb.4, [[S_OR_B32_]](s1), %bb.5
688689
; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.5, [[DEF]](s32), %bb.4
689-
; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]](s1)
690+
; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]](s1)
690691
; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32)
691-
; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY9]](s1), [[PHI]](s32)
692+
; GFX10-NEXT: [[INT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[COPY10]](s1), [[PHI]](s32)
692693
; GFX10-NEXT: SI_LOOP [[INT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
693694
; GFX10-NEXT: G_BR %bb.7
694695
; GFX10-NEXT: {{ $}}
@@ -702,7 +703,7 @@ body: |
702703
; GFX10-NEXT: bb.8:
703704
; GFX10-NEXT: successors: %bb.9(0x80000000)
704705
; GFX10-NEXT: {{ $}}
705-
; GFX10-NEXT: G_STORE [[PHI1]](s32), [[MV1]](p1) :: (store (s32), addrspace 1)
706+
; GFX10-NEXT: G_STORE [[COPY6]](s32), [[MV1]](p1) :: (store (s32), addrspace 1)
706707
; GFX10-NEXT: {{ $}}
707708
; GFX10-NEXT: bb.9:
708709
; GFX10-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF2]](s32)

0 commit comments

Comments
 (0)