[AMDGPU] Implement moveToVALU for S_CSELECT_B64 #70352
Conversation
moveToVALU previously only handled S_CSELECT_B64 in the trivial case where it was semantically equivalent to a copy. Implement the general case using V_CNDMASK_B64_PSEUDO and implement post-RA expansion of V_CNDMASK_B64_PSEUDO with immediate as well as register operands.
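For a concrete picture, here is the transformation in MIR, mirroring the shape of the new fix-sgpr-copies.mir test further down (register names are illustrative, not taken from the patch):

  ; before moveToVALU: scalar select driven by SCC
  S_CMP_EQ_U32 %a, 0, implicit-def $scc
  %r:sreg_64 = S_CSELECT_B64 %x, 0, implicit $scc

  ; after moveToVALU: the compare now produces a lane mask and the
  ; select becomes the new pseudo (false operand first, then true)
  %cond:sreg_64_xexec = V_CMP_EQ_U32_e64 %a, 0, implicit $exec
  %r:vreg_64 = V_CNDMASK_B64_PSEUDO 0, %x, %cond, implicit $exec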
@llvm/pr-subscribers-backend-amdgpu

Author: Jay Foad (jayfoad)

Changes: moveToVALU previously only handled S_CSELECT_B64 in the trivial case where it was semantically equivalent to a copy. Implement the general case using V_CNDMASK_B64_PSEUDO and implement post-RA expansion of V_CNDMASK_B64_PSEUDO with immediate as well as register operands.

Full diff: https://github.com/llvm/llvm-project/pull/70352.diff

4 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 7d457edad0d5cdf..20bae0f27647a42 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4727,8 +4727,8 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
const SIRegisterInfo *TRI = ST.getRegisterInfo();
Register Dst = MI.getOperand(0).getReg();
- Register Src0 = MI.getOperand(1).getReg();
- Register Src1 = MI.getOperand(2).getReg();
+ const MachineOperand &Src0 = MI.getOperand(1);
+ const MachineOperand &Src1 = MI.getOperand(2);
const DebugLoc &DL = MI.getDebugLoc();
Register SrcCond = MI.getOperand(3).getReg();
@@ -4737,20 +4737,42 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
+ const TargetRegisterClass *Src0RC = Src0.isReg()
+ ? MRI.getRegClass(Src0.getReg())
+ : &AMDGPU::VReg_64RegClass;
+ const TargetRegisterClass *Src1RC = Src1.isReg()
+ ? MRI.getRegClass(Src1.getReg())
+ : &AMDGPU::VReg_64RegClass;
+
+ const TargetRegisterClass *Src0SubRC =
+ TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
+ const TargetRegisterClass *Src1SubRC =
+ TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
+
+ MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
+ MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
+
+ MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
+ MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
+
BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
.addReg(SrcCond);
BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
- .addImm(0)
- .addReg(Src0, 0, AMDGPU::sub0)
- .addImm(0)
- .addReg(Src1, 0, AMDGPU::sub0)
- .addReg(SrcCondCopy);
+ .addImm(0)
+ .add(Src0Sub0)
+ .addImm(0)
+ .add(Src1Sub0)
+ .addReg(SrcCondCopy);
BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
- .addImm(0)
- .addReg(Src0, 0, AMDGPU::sub1)
- .addImm(0)
- .addReg(Src1, 0, AMDGPU::sub1)
- .addReg(SrcCondCopy);
+ .addImm(0)
+ .add(Src0Sub1)
+ .addImm(0)
+ .add(Src1Sub1)
+ .addReg(SrcCondCopy);
BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
.addReg(DstLo)
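This hunk lets the custom inserter accept immediate as well as register sources: buildExtractSubRegOrImm returns either a sub-register operand or the matching 32-bit half of a 64-bit immediate, so both halves can be fed to the 32-bit selects uniformly. A sketch of the expanded form (illustrative names; the pseudo's first source is the false value):

  ; V_CNDMASK_B64_PSEUDO %f, %t, %cond expands into two half-width selects
  %lo:vgpr_32 = V_CNDMASK_B32_e64 0, %f.sub0, 0, %t.sub0, %cond, implicit $exec
  %hi:vgpr_32 = V_CNDMASK_B32_e64 0, %f.sub1, 0, %t.sub1, %cond, implicit $exec
  %dst:vreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1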
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 327f8988ac2f105..f1d1fd386d1b2cc 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -7223,27 +7223,27 @@ void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
MachineOperand &Src1 = Inst.getOperand(2);
MachineOperand &Cond = Inst.getOperand(3);
- Register SCCSource = Cond.getReg();
- bool IsSCC = (SCCSource == AMDGPU::SCC);
+ Register CondReg = Cond.getReg();
+ bool IsSCC = (CondReg == AMDGPU::SCC);
// If this is a trivial select where the condition is effectively not SCC
- // (SCCSource is a source of copy to SCC), then the select is semantically
- // equivalent to copying SCCSource. Hence, there is no need to create
+ // (CondReg is a source of copy to SCC), then the select is semantically
+ // equivalent to copying CondReg. Hence, there is no need to create
// V_CNDMASK, we can just use that and bail out.
if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
(Src1.getImm() == 0)) {
- MRI.replaceRegWith(Dest.getReg(), SCCSource);
+ MRI.replaceRegWith(Dest.getReg(), CondReg);
return;
}
- const TargetRegisterClass *TC =
- RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
-
- Register CopySCC = MRI.createVirtualRegister(TC);
-
+ Register NewCondReg = CondReg;
if (IsSCC) {
+ const TargetRegisterClass *TC =
+ RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
+ NewCondReg = MRI.createVirtualRegister(TC);
+
// Now look for the closest SCC def if it is a copy
- // replacing the SCCSource with the COPY source register
+ // replacing the CondReg with the COPY source register
bool CopyFound = false;
for (MachineInstr &CandI :
make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
@@ -7251,7 +7251,7 @@ void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) !=
-1) {
if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
- BuildMI(MBB, MII, DL, get(AMDGPU::COPY), CopySCC)
+ BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
.addReg(CandI.getOperand(1).getReg());
CopyFound = true;
}
@@ -7266,24 +7266,31 @@ void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
unsigned Opcode = (ST.getWavefrontSize() == 64) ? AMDGPU::S_CSELECT_B64
: AMDGPU::S_CSELECT_B32;
auto NewSelect =
- BuildMI(MBB, MII, DL, get(Opcode), CopySCC).addImm(-1).addImm(0);
+ BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
}
}
- Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-
- auto UpdatedInst =
- BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), ResultReg)
- .addImm(0)
- .add(Src1) // False
- .addImm(0)
- .add(Src0) // True
- .addReg(IsSCC ? CopySCC : SCCSource);
-
- MRI.replaceRegWith(Dest.getReg(), ResultReg);
- legalizeOperands(*UpdatedInst, MDT);
- addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
+ Register NewDestReg = MRI.createVirtualRegister(
+ RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
+ MachineInstr *NewInst;
+ if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
+ NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
+ .addImm(0)
+ .add(Src1) // False
+ .addImm(0)
+ .add(Src0) // True
+ .addReg(NewCondReg);
+ } else {
+ NewInst =
+ BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
+ .add(Src1) // False
+ .add(Src0) // True
+ .addReg(NewCondReg);
+ }
+ MRI.replaceRegWith(Dest.getReg(), NewDestReg);
+ legalizeOperands(*NewInst, MDT);
+ addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
}
void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
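One subtlety in lowerSelect: when the condition is SCC itself and the nearest SCC def is not a plain copy, the 1-bit condition has to be widened into a full wave mask before V_CNDMASK can consume it. The idea in isolation (wave64; a hand-written sketch, names illustrative):

  ; S_CSELECT with -1/0 turns the 1-bit SCC into an all-ones or
  ; all-zeros lane mask
  %mask:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc
  %r:vreg_64 = V_CNDMASK_B64_PSEUDO %f, %t, %mask, implicit $exec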
diff --git a/llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll b/llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll
new file mode 100644
index 000000000000000..ac196635b363a41
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -march=amdgcn -mcpu=gfx1030 < %s | FileCheck %s
+
+define amdgpu_cs <2 x i32> @f() {
+; CHECK-LABEL: f:
+; CHECK: ; %bb.0: ; %bb
+; CHECK-NEXT: s_mov_b32 s0, 0
+; CHECK-NEXT: s_mov_b32 s1, s0
+; CHECK-NEXT: s_mov_b32 s2, s0
+; CHECK-NEXT: s_mov_b32 s3, s0
+; CHECK-NEXT: s_mov_b32 s4, s0
+; CHECK-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+; CHECK-NEXT: s_mov_b32 s5, s0
+; CHECK-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, s[4:5], v[0:1]
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; CHECK-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0
+; CHECK-NEXT: ; return to shader part epilog
+bb:
+ %i = call <2 x i32> @llvm.amdgcn.raw.buffer.load.v2i32(<4 x i32> zeroinitializer, i32 0, i32 0, i32 0)
+ %i1 = bitcast <2 x i32> %i to i64
+ %i2 = insertelement <3 x i64> zeroinitializer, i64 %i1, i64 2
+ %i3 = icmp ne <3 x i64> %i2, zeroinitializer
+ %i4 = zext <3 x i1> %i3 to <3 x i64>
+ %i5 = bitcast <3 x i64> %i4 to <6 x i32>
+ %i6 = shufflevector <6 x i32> %i5, <6 x i32> zeroinitializer, <2 x i32> <i32 4, i32 5>
+ call void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32> %i6, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0)
+ ret <2 x i32> %i6
+}
+
+declare <2 x i32> @llvm.amdgcn.raw.buffer.load.v2i32(<4 x i32>, i32, i32, i32 immarg)
+declare void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies.mir
index 8d0a9899b5dbcc7..b1715bd9d332864 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies.mir
@@ -233,3 +233,22 @@ body: |
%0:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
%1:sreg_32 = COPY %0.sub0
...
+
+---
+# GCN-LABEL: name: s_cselect_b64
+# GCN: %0:vgpr_32 = IMPLICIT_DEF
+# GCN: %1:vreg_64 = IMPLICIT_DEF
+# GCN: %2:sreg_32 = IMPLICIT_DEF
+# GCN: %3:sreg_64 = IMPLICIT_DEF
+# GCN: %5:sreg_64_xexec = V_CMP_EQ_U32_e64 %0, 0, implicit $exec
+# GCN: %6:vreg_64 = V_CNDMASK_B64_PSEUDO 0, %1, %5, implicit $exec
+name: s_cselect_b64
+body: |
+ bb.0:
+ %0:vgpr_32 = IMPLICIT_DEF
+ %1:vreg_64 = IMPLICIT_DEF
+ %2:sreg_32 = COPY %0
+ %3:sreg_64 = COPY %1
+ S_CMP_EQ_U32 %2, 0, implicit-def $scc
+ %4:sreg_64 = S_CSELECT_B64 %3, 0, implicit $scc
+...
Some of this was previously implemented as part of https://reviews.llvm.org/D109159 "[amdgpu] Enable selection of ...".
Yes! The instruction selection parts of D109159 still need to be unreverted. But I don't know exactly what problem caused them to be reverted in the first place, so I would like to proceed in small steps.
Ping! |
LGTM
MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
    MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
    MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
Wow! I didn't know that existed. Seems useful.
This is mostly copied from the handling of S_ADD_U64_PSEUDO and other 64-bit pseudos that get split.
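For comparison, a sketch of how the same inserter splits S_ADD_U64_PSEUDO into scalar halves with the carry chained through SCC (illustrative names, not quoted from the source):

  %lo:sreg_32 = S_ADD_U32 %a.sub0, %b.sub0, implicit-def $scc
  %hi:sreg_32 = S_ADDC_U32 %a.sub1, %b.sub1, implicit-def $scc, implicit $scc
  %sum:sreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1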
Thanks - sounds good.