
Commit 1590cac

[AMDGPU] Implement moveToVALU for S_CSELECT_B64 (llvm#70352)
moveToVALU previously only handled S_CSELECT_B64 in the trivial case where it was semantically equivalent to a copy. Implement the general case using V_CNDMASK_B64_PSEUDO and implement post-RA expansion of V_CNDMASK_B64_PSEUDO with immediate as well as register operands.
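
A rough before/after sketch, adapted from the fix-sgpr-copies.mir test added below (register numbers are illustrative; %2 and %3 are SGPR copies of the VGPRs %0 and %1). Before moveToVALU:

    S_CMP_EQ_U32 %2:sreg_32, 0, implicit-def $scc
    %4:sreg_64 = S_CSELECT_B64 %3:sreg_64, 0, implicit $scc

After moveToVALU, the compare and select become VALU operations on the original VGPRs:

    %5:sreg_64_xexec = V_CMP_EQ_U32_e64 %0, 0, implicit $exec
    %6:vreg_64 = V_CNDMASK_B64_PSEUDO 0, %1, %5, implicit $exec

The V_CNDMASK_B64_PSEUDO is in turn expanded into per-half V_CNDMASK_B32_e64 instructions plus a REG_SEQUENCE (see the SIISelLowering.cpp change below).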
1 parent 07f0e75 commit 1590cac

File tree: 4 files changed, +120 -38 lines

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

+34 -12
@@ -4744,8 +4744,8 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
     const SIRegisterInfo *TRI = ST.getRegisterInfo();
 
     Register Dst = MI.getOperand(0).getReg();
-    Register Src0 = MI.getOperand(1).getReg();
-    Register Src1 = MI.getOperand(2).getReg();
+    const MachineOperand &Src0 = MI.getOperand(1);
+    const MachineOperand &Src1 = MI.getOperand(2);
     const DebugLoc &DL = MI.getDebugLoc();
     Register SrcCond = MI.getOperand(3).getReg();
 
@@ -4754,20 +4754,42 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
     const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
     Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
 
+    const TargetRegisterClass *Src0RC = Src0.isReg()
+                                            ? MRI.getRegClass(Src0.getReg())
+                                            : &AMDGPU::VReg_64RegClass;
+    const TargetRegisterClass *Src1RC = Src1.isReg()
+                                            ? MRI.getRegClass(Src1.getReg())
+                                            : &AMDGPU::VReg_64RegClass;
+
+    const TargetRegisterClass *Src0SubRC =
+        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
+    const TargetRegisterClass *Src1SubRC =
+        TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
+
+    MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
+        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
+    MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
+        MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
+
+    MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
+        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
+    MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
+        MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
+
     BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
         .addReg(SrcCond);
     BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
-      .addImm(0)
-      .addReg(Src0, 0, AMDGPU::sub0)
-      .addImm(0)
-      .addReg(Src1, 0, AMDGPU::sub0)
-      .addReg(SrcCondCopy);
+        .addImm(0)
+        .add(Src0Sub0)
+        .addImm(0)
+        .add(Src1Sub0)
+        .addReg(SrcCondCopy);
     BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
-      .addImm(0)
-      .addReg(Src0, 0, AMDGPU::sub1)
-      .addImm(0)
-      .addReg(Src1, 0, AMDGPU::sub1)
-      .addReg(SrcCondCopy);
+        .addImm(0)
+        .add(Src0Sub1)
+        .addImm(0)
+        .add(Src1Sub1)
+        .addReg(SrcCondCopy);
 
     BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
         .addReg(DstLo)

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

+33 -26
@@ -7265,35 +7265,35 @@ void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
   MachineOperand &Src1 = Inst.getOperand(2);
   MachineOperand &Cond = Inst.getOperand(3);
 
-  Register SCCSource = Cond.getReg();
-  bool IsSCC = (SCCSource == AMDGPU::SCC);
+  Register CondReg = Cond.getReg();
+  bool IsSCC = (CondReg == AMDGPU::SCC);
 
   // If this is a trivial select where the condition is effectively not SCC
-  // (SCCSource is a source of copy to SCC), then the select is semantically
-  // equivalent to copying SCCSource. Hence, there is no need to create
+  // (CondReg is a source of copy to SCC), then the select is semantically
+  // equivalent to copying CondReg. Hence, there is no need to create
   // V_CNDMASK, we can just use that and bail out.
   if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
       (Src1.getImm() == 0)) {
-    MRI.replaceRegWith(Dest.getReg(), SCCSource);
+    MRI.replaceRegWith(Dest.getReg(), CondReg);
     return;
   }
 
-  const TargetRegisterClass *TC =
-      RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
-
-  Register CopySCC = MRI.createVirtualRegister(TC);
-
+  Register NewCondReg = CondReg;
   if (IsSCC) {
+    const TargetRegisterClass *TC =
+        RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
+    NewCondReg = MRI.createVirtualRegister(TC);
+
     // Now look for the closest SCC def if it is a copy
-    // replacing the SCCSource with the COPY source register
+    // replacing the CondReg with the COPY source register
     bool CopyFound = false;
     for (MachineInstr &CandI :
          make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
                     Inst.getParent()->rend())) {
       if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) !=
           -1) {
         if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
-          BuildMI(MBB, MII, DL, get(AMDGPU::COPY), CopySCC)
+          BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
               .addReg(CandI.getOperand(1).getReg());
           CopyFound = true;
         }
@@ -7308,24 +7308,31 @@ void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
       unsigned Opcode = (ST.getWavefrontSize() == 64) ? AMDGPU::S_CSELECT_B64
                                                       : AMDGPU::S_CSELECT_B32;
       auto NewSelect =
-          BuildMI(MBB, MII, DL, get(Opcode), CopySCC).addImm(-1).addImm(0);
+          BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
       NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
     }
   }
 
-  Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-
-  auto UpdatedInst =
-      BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), ResultReg)
-          .addImm(0)
-          .add(Src1) // False
-          .addImm(0)
-          .add(Src0) // True
-          .addReg(IsSCC ? CopySCC : SCCSource);
-
-  MRI.replaceRegWith(Dest.getReg(), ResultReg);
-  legalizeOperands(*UpdatedInst, MDT);
-  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
+  Register NewDestReg = MRI.createVirtualRegister(
+      RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
+  MachineInstr *NewInst;
+  if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
+    NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
+                  .addImm(0)
+                  .add(Src1) // False
+                  .addImm(0)
+                  .add(Src0) // True
+                  .addReg(NewCondReg);
+  } else {
+    NewInst =
+        BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
+            .add(Src1) // False
+            .add(Src0) // True
+            .addReg(NewCondReg);
+  }
+  MRI.replaceRegWith(Dest.getReg(), NewDestReg);
+  legalizeOperands(*NewInst, MDT);
+  addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
 }
 
 void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
New test file (+34 -0)
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -march=amdgcn -mcpu=gfx1030 < %s | FileCheck %s
+
+define amdgpu_cs <2 x i32> @f() {
+; CHECK-LABEL: f:
+; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_mov_b32 s0, 0
+; CHECK-NEXT:    s_mov_b32 s1, s0
+; CHECK-NEXT:    s_mov_b32 s2, s0
+; CHECK-NEXT:    s_mov_b32 s3, s0
+; CHECK-NEXT:    s_mov_b32 s4, s0
+; CHECK-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+; CHECK-NEXT:    s_mov_b32 s5, s0
+; CHECK-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc_lo, s[4:5], v[0:1]
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; CHECK-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
+; CHECK-NEXT:    ; return to shader part epilog
+bb:
+  %i = call <2 x i32> @llvm.amdgcn.raw.buffer.load.v2i32(<4 x i32> zeroinitializer, i32 0, i32 0, i32 0)
+  %i1 = bitcast <2 x i32> %i to i64
+  %i2 = insertelement <3 x i64> zeroinitializer, i64 %i1, i64 2
+  %i3 = icmp ne <3 x i64> %i2, zeroinitializer
+  %i4 = zext <3 x i1> %i3 to <3 x i64>
+  %i5 = bitcast <3 x i64> %i4 to <6 x i32>
+  %i6 = shufflevector <6 x i32> %i5, <6 x i32> zeroinitializer, <2 x i32> <i32 4, i32 5>
+  call void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32> %i6, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0)
+  ret <2 x i32> %i6
+}
+
+declare <2 x i32> @llvm.amdgcn.raw.buffer.load.v2i32(<4 x i32>, i32, i32, i32 immarg)
+declare void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32 immarg)

llvm/test/CodeGen/AMDGPU/fix-sgpr-copies.mir

+19
@@ -233,3 +233,22 @@
     %0:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
     %1:sreg_32 = COPY %0.sub0
 ...
+
+---
+# GCN-LABEL: name: s_cselect_b64
+# GCN: %0:vgpr_32 = IMPLICIT_DEF
+# GCN: %1:vreg_64 = IMPLICIT_DEF
+# GCN: %2:sreg_32 = IMPLICIT_DEF
+# GCN: %3:sreg_64 = IMPLICIT_DEF
+# GCN: %5:sreg_64_xexec = V_CMP_EQ_U32_e64 %0, 0, implicit $exec
+# GCN: %6:vreg_64 = V_CNDMASK_B64_PSEUDO 0, %1, %5, implicit $exec
+name: s_cselect_b64
+body: |
+  bb.0:
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:vreg_64 = IMPLICIT_DEF
+    %2:sreg_32 = COPY %0
+    %3:sreg_64 = COPY %1
+    S_CMP_EQ_U32 %2, 0, implicit-def $scc
+    %4:sreg_64 = S_CSELECT_B64 %3, 0, implicit $scc
+...
