Skip to content

Commit f7e6f7b

Browse files
JanekvOJanek van Oirschot
authored and
Janek van Oirschot
committed
[AMDGPU] SelectionDAG support for vector type materialization of 0 to multiple sgpr64
1 parent 553da96 commit f7e6f7b

40 files changed

+1216
-1262
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -634,6 +634,61 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
634634
case ISD::BUILD_VECTOR: {
635635
EVT VT = N->getValueType(0);
636636
unsigned NumVectorElts = VT.getVectorNumElements();
637+
638+
auto IsSplatAllZeros = [this](SDNode *N) -> bool {
639+
if (ISD::isConstantSplatVectorAllZeros(N))
640+
return true;
641+
642+
// Types may have been legalized by splitting the 16-bit multi-element vector
643+
// into multiple BUILD_VECTORs. Peek through and see if it is all zeros
644+
// regardless of what the legalizer did. Assumes cases along the lines of:
645+
// v8i16 build_vector 0, 0, 0, 0, 0, 0, 0, 0
646+
// -> legalizer ->
647+
// t0 = v2i16 build_vector 0, 0
648+
// t1 = bitcast t0 to i32
649+
// v4i32 build_vector t1, t1, t1, t1
650+
if (CurDAG->isSplatValue(SDValue(N, 0))) {
651+
SDValue Op = peekThroughBitcasts(N->getOperand(0));
652+
EVT InnerVT = Op.getValueType();
653+
if (InnerVT.isVector() && Op.getOpcode() == ISD::BUILD_VECTOR &&
654+
InnerVT.getVectorNumElements() == 2)
655+
return ISD::isConstantSplatVectorAllZeros(Op.getNode());
656+
}
657+
return false;
658+
};
659+
if (IsSplatAllZeros(N)) {
660+
unsigned FixedBitSize = VT.getFixedSizeInBits();
661+
SDLoc DL(N);
662+
if (FixedBitSize == 64) {
663+
SDValue Set0 = {
664+
CurDAG->getMachineNode(AMDGPU::S_MOV_B64_IMM_PSEUDO, DL, MVT::i64,
665+
CurDAG->getTargetConstant(0, DL, MVT::i64)),
666+
0};
667+
CurDAG->SelectNodeTo(N, AMDGPU::COPY, VT, Set0);
668+
return;
669+
} else if (NumVectorElts <= 32 && (FixedBitSize % 64 == 0)) {
670+
SmallVector<SDValue, 32 * 2 + 1> Ops((FixedBitSize / 64) * 2 + 1);
671+
SDValue Set0 = {
672+
CurDAG->getMachineNode(AMDGPU::S_MOV_B64_IMM_PSEUDO, DL, MVT::i64,
673+
CurDAG->getTargetConstant(0, DL, MVT::i64)),
674+
0};
675+
unsigned RCID =
676+
SIRegisterInfo::getSGPRClassForBitWidth(FixedBitSize)->getID();
677+
Ops[0] = CurDAG->getTargetConstant(RCID, DL, MVT::i32);
678+
679+
for (unsigned i = 0, CurrentBitSize = FixedBitSize; CurrentBitSize != 0;
680+
++i, CurrentBitSize -= 64) {
681+
unsigned SubRegs =
682+
SIRegisterInfo::getSubRegFromChannel(i * 2, /*NumRegs=*/2);
683+
Ops[i * 2 + 1] = Set0;
684+
Ops[i * 2 + 2] = CurDAG->getTargetConstant(SubRegs, DL, MVT::i32);
685+
}
686+
687+
CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, VT, Ops);
688+
return;
689+
}
690+
}
691+
637692
if (VT.getScalarSizeInBits() == 16) {
638693
if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
639694
if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {

llvm/test/CodeGen/AMDGPU/adjust-writemask-cse.ll

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,14 @@
44
define float @test() {
55
; GFX10-LABEL: name: test
66
; GFX10: bb.0.bb:
7-
; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
8-
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_256 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3, [[S_MOV_B32_]], %subreg.sub4, [[S_MOV_B32_]], %subreg.sub5, [[S_MOV_B32_]], %subreg.sub6, [[S_MOV_B32_]], %subreg.sub7
9-
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
10-
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
11-
; GFX10-NEXT: [[IMAGE_LOAD_V2_V2_nsa_gfx10_:%[0-9]+]]:vreg_64 = IMAGE_LOAD_V2_V2_nsa_gfx10 [[COPY]], [[COPY1]], killed [[REG_SEQUENCE]], 3, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 8)
12-
; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[IMAGE_LOAD_V2_V2_nsa_gfx10_]].sub1
13-
; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[IMAGE_LOAD_V2_V2_nsa_gfx10_]].sub0
14-
; GFX10-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY2]], 0, killed [[COPY3]], 0, 0, implicit $mode, implicit $exec
15-
; GFX10-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
7+
; GFX10-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
8+
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_256 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[S_MOV_B64_]], %subreg.sub2_sub3, [[S_MOV_B64_]], %subreg.sub4_sub5, [[S_MOV_B64_]], %subreg.sub6_sub7
9+
; GFX10-NEXT: [[V_MOV_B32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0
10+
; GFX10-NEXT: [[IMAGE_LOAD_V2_V2_nsa_gfx10_:%[0-9]+]]:vreg_64 = IMAGE_LOAD_V2_V2_nsa_gfx10 [[V_MOV_B32_]], [[V_MOV_B32_]], killed [[REG_SEQUENCE]], 3, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 8)
11+
; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[IMAGE_LOAD_V2_V2_nsa_gfx10_]].sub1
12+
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[IMAGE_LOAD_V2_V2_nsa_gfx10_]].sub0
13+
; GFX10-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY]], 0, killed [[COPY1]], 0, 0, implicit $mode, implicit $exec
14+
; GFX10-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
1615
; GFX10-NEXT: $vgpr0 = COPY [[V_ADD_F32_e64_1]]
1716
; GFX10-NEXT: SI_RETURN implicit $vgpr0
1817
bb:

0 commit comments

Comments
 (0)