[AMDGPU] SelectionDAG support for vector type set 0 to multiple sgpr64 #128017
Conversation
@llvm/pr-subscribers-backend-amdgpu

Author: Janek van Oirschot (JanekvO)

Changes

Zeros seem to materialize into a sequence of s_mov_b32s, even when s_mov_b64 is possible. First step to solving a worse case when 0 should be materialized into vector-type vgprs, as it first materializes into sgprs before moving into vgprs.

Patch is 205.55 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/128017.diff

40 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 8e90754103ff1..c990df622175f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -634,6 +634,61 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
case ISD::BUILD_VECTOR: {
EVT VT = N->getValueType(0);
unsigned NumVectorElts = VT.getVectorNumElements();
+
+ auto IsSplatAllZeros = [this](SDNode *N) -> bool {
+ if (ISD::isConstantSplatVectorAllZeros(N))
+ return true;
+
+ // Types may have legalized by stripping the 16 bit multi-element vector
+ // into multiple BUILD_VECTORs. Peek through and see if it is all zeros
+ // regardless of what the legalizer did. Assumes cases along the lines of:
+ // v8i16 build_vector 0, 0, 0, 0, 0, 0, 0, 0
+ // -> legalizer ->
+ // t0 = v2i16 build_vector 0, 0
+ // t1 = bitcast t0 to i32
+ // v4i32 build_vector t1, t1, t1, t1
+ if (CurDAG->isSplatValue(SDValue(N, 0))) {
+ SDValue Op = peekThroughBitcasts(N->getOperand(0));
+ EVT InnerVT = Op.getValueType();
+ if (InnerVT.isVector() && Op.getOpcode() == ISD::BUILD_VECTOR &&
+ InnerVT.getVectorNumElements() == 2)
+ return ISD::isConstantSplatVectorAllZeros(Op.getNode());
+ }
+ return false;
+ };
+ if (IsSplatAllZeros(N)) {
+ unsigned FixedBitSize = VT.getFixedSizeInBits();
+ SDLoc DL(N);
+ if (FixedBitSize == 64) {
+ SDValue Set0 = {
+ CurDAG->getMachineNode(AMDGPU::S_MOV_B64_IMM_PSEUDO, DL, MVT::i64,
+ CurDAG->getTargetConstant(0, DL, MVT::i64)),
+ 0};
+ CurDAG->SelectNodeTo(N, AMDGPU::COPY, VT, Set0);
+ return;
+ } else if (NumVectorElts <= 32 && (FixedBitSize % 64 == 0)) {
+ SmallVector<SDValue, 32 * 2 + 1> Ops((FixedBitSize / 64) * 2 + 1);
+ SDValue Set0 = {
+ CurDAG->getMachineNode(AMDGPU::S_MOV_B64_IMM_PSEUDO, DL, MVT::i64,
+ CurDAG->getTargetConstant(0, DL, MVT::i64)),
+ 0};
+ unsigned RCID =
+ SIRegisterInfo::getSGPRClassForBitWidth(FixedBitSize)->getID();
+ Ops[0] = CurDAG->getTargetConstant(RCID, DL, MVT::i32);
+
+ for (unsigned i = 0, CurrentBitSize = FixedBitSize; CurrentBitSize != 0;
+ ++i, CurrentBitSize -= 64) {
+ unsigned SubRegs =
+ SIRegisterInfo::getSubRegFromChannel(i * 2, /*NumRegs=*/2);
+ Ops[i * 2 + 1] = Set0;
+ Ops[i * 2 + 2] = CurDAG->getTargetConstant(SubRegs, DL, MVT::i32);
+ }
+
+ CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, VT, Ops);
+ return;
+ }
+ }
+
if (VT.getScalarSizeInBits() == 16) {
if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index fa15e73bc31d5..3e7129f7bcd8f 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1035,6 +1035,9 @@ void SIFoldOperandsImpl::foldOperand(
getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
const DebugLoc &DL = UseMI->getDebugLoc();
MachineBasicBlock &MBB = *UseMI->getParent();
+ unsigned SubRegSize =
+ Defs.empty() ? 4
+ : TRI->getNumChannelsFromSubReg(Defs[0].second) * 4;
UseMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
for (unsigned I = UseMI->getNumOperands() - 1; I > 0; --I)
@@ -1043,17 +1046,29 @@ void SIFoldOperandsImpl::foldOperand(
MachineInstrBuilder B(*MBB.getParent(), UseMI);
DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
- for (unsigned I = 0; I < Size / 4; ++I) {
+ for (unsigned I = 0; I < Size / SubRegSize; ++I) {
MachineOperand *Def = Defs[I].first;
TargetInstrInfo::RegSubRegPair CopyToVGPR;
if (Def->isImm() &&
TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
int64_t Imm = Def->getImm();
- auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
- BuildMI(MBB, UseMI, DL,
- TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addImm(Imm);
- B.addReg(Tmp);
+ // Adding multiple immediates in case of 64 bit MOVs to agpr.
+ unsigned SubWordSize = SubRegSize / 4;
+ for (unsigned ImmIt = 0; ImmIt < SubWordSize; ++ImmIt) {
+ auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
+ unsigned SubRegs =
+ SIRegisterInfo::getSubRegFromChannel(I * SubWordSize + ImmIt);
+ BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64),
+ Tmp)
+ .addImm(Imm);
+ B.addReg(Tmp);
+ B.addImm(SubRegs);
+ }
+ // Avoid fallthrough that adds 'Defs[I].second' to B as we're
+ // technically pulling the SubReg apart from sgpr64 to
+ // agpr32:agpr32.
+ continue;
} else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
auto Src = getRegSubRegPair(*Def);
Def->setIsKill(false);
diff --git a/llvm/test/CodeGen/AMDGPU/adjust-writemask-cse.ll b/llvm/test/CodeGen/AMDGPU/adjust-writemask-cse.ll
index 23a7bb6ece488..1d0a9f9585123 100644
--- a/llvm/test/CodeGen/AMDGPU/adjust-writemask-cse.ll
+++ b/llvm/test/CodeGen/AMDGPU/adjust-writemask-cse.ll
@@ -4,15 +4,14 @@
define float @test() {
; GFX10-LABEL: name: test
; GFX10: bb.0.bb:
- ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_256 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3, [[S_MOV_B32_]], %subreg.sub4, [[S_MOV_B32_]], %subreg.sub5, [[S_MOV_B32_]], %subreg.sub6, [[S_MOV_B32_]], %subreg.sub7
- ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GFX10-NEXT: [[IMAGE_LOAD_V2_V2_nsa_gfx10_:%[0-9]+]]:vreg_64 = IMAGE_LOAD_V2_V2_nsa_gfx10 [[COPY]], [[COPY1]], killed [[REG_SEQUENCE]], 3, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 8)
- ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[IMAGE_LOAD_V2_V2_nsa_gfx10_]].sub1
- ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[IMAGE_LOAD_V2_V2_nsa_gfx10_]].sub0
- ; GFX10-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY2]], 0, killed [[COPY3]], 0, 0, implicit $mode, implicit $exec
- ; GFX10-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
+ ; GFX10-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
+ ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_256 = REG_SEQUENCE [[S_MOV_B64_]], %subreg.sub0_sub1, [[S_MOV_B64_]], %subreg.sub2_sub3, [[S_MOV_B64_]], %subreg.sub4_sub5, [[S_MOV_B64_]], %subreg.sub6_sub7
+ ; GFX10-NEXT: [[V_MOV_B32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0
+ ; GFX10-NEXT: [[IMAGE_LOAD_V2_V2_nsa_gfx10_:%[0-9]+]]:vreg_64 = IMAGE_LOAD_V2_V2_nsa_gfx10 [[V_MOV_B32_]], [[V_MOV_B32_]], killed [[REG_SEQUENCE]], 3, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 8)
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[IMAGE_LOAD_V2_V2_nsa_gfx10_]].sub1
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[IMAGE_LOAD_V2_V2_nsa_gfx10_]].sub0
+ ; GFX10-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY]], 0, killed [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX10-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
; GFX10-NEXT: $vgpr0 = COPY [[V_ADD_F32_e64_1]]
; GFX10-NEXT: SI_RETURN implicit $vgpr0
bb:
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
index 4ce46bbaf45ac..02b17e6229b15 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
@@ -515,48 +515,47 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX908-NEXT: global_load_ushort v16, v[0:1], off glc
; GFX908-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX908-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
-; GFX908-NEXT: s_load_dword s7, s[8:9], 0x18
-; GFX908-NEXT: s_mov_b32 s6, 0
-; GFX908-NEXT: s_mov_b32 s9, s6
+; GFX908-NEXT: s_mov_b32 s7, 0
+; GFX908-NEXT: s_load_dword s8, s[8:9], 0x18
+; GFX908-NEXT: v_mov_b32_e32 v19, 0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: v_cvt_f32_u32_e32 v0, s3
-; GFX908-NEXT: s_sub_i32 s8, 0, s3
-; GFX908-NEXT: v_cvt_f32_f16_e32 v17, s7
-; GFX908-NEXT: v_mov_b32_e32 v19, 0
-; GFX908-NEXT: v_rcp_iflag_f32_e32 v2, v0
+; GFX908-NEXT: s_sub_i32 s6, 0, s3
+; GFX908-NEXT: v_cvt_f32_f16_e32 v17, s8
+; GFX908-NEXT: v_rcp_iflag_f32_e32 v0, v0
+; GFX908-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX908-NEXT: v_cvt_u32_f32_e32 v2, v0
; GFX908-NEXT: v_mov_b32_e32 v0, 0
; GFX908-NEXT: v_mov_b32_e32 v1, 0
-; GFX908-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; GFX908-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX908-NEXT: v_readfirstlane_b32 s10, v2
-; GFX908-NEXT: s_mul_i32 s8, s8, s10
-; GFX908-NEXT: s_mul_hi_u32 s8, s10, s8
-; GFX908-NEXT: s_add_i32 s10, s10, s8
-; GFX908-NEXT: s_mul_hi_u32 s8, s2, s10
-; GFX908-NEXT: s_mul_i32 s10, s8, s3
-; GFX908-NEXT: s_sub_i32 s2, s2, s10
-; GFX908-NEXT: s_add_i32 s11, s8, 1
-; GFX908-NEXT: s_sub_i32 s10, s2, s3
+; GFX908-NEXT: v_readfirstlane_b32 s9, v2
+; GFX908-NEXT: s_mul_i32 s6, s6, s9
+; GFX908-NEXT: s_mul_hi_u32 s6, s9, s6
+; GFX908-NEXT: s_add_i32 s9, s9, s6
+; GFX908-NEXT: s_mul_hi_u32 s6, s2, s9
+; GFX908-NEXT: s_mul_i32 s9, s6, s3
+; GFX908-NEXT: s_sub_i32 s2, s2, s9
+; GFX908-NEXT: s_add_i32 s10, s6, 1
+; GFX908-NEXT: s_sub_i32 s9, s2, s3
; GFX908-NEXT: s_cmp_ge_u32 s2, s3
-; GFX908-NEXT: s_cselect_b32 s8, s11, s8
-; GFX908-NEXT: s_cselect_b32 s2, s10, s2
-; GFX908-NEXT: s_add_i32 s10, s8, 1
+; GFX908-NEXT: s_cselect_b32 s6, s10, s6
+; GFX908-NEXT: s_cselect_b32 s2, s9, s2
+; GFX908-NEXT: s_add_i32 s9, s6, 1
; GFX908-NEXT: s_cmp_ge_u32 s2, s3
-; GFX908-NEXT: s_cselect_b32 s8, s10, s8
-; GFX908-NEXT: s_lshr_b32 s7, s7, 16
-; GFX908-NEXT: v_cvt_f32_f16_e32 v18, s7
+; GFX908-NEXT: s_cselect_b32 s6, s9, s6
+; GFX908-NEXT: s_lshr_b32 s10, s8, 16
+; GFX908-NEXT: v_cvt_f32_f16_e32 v18, s10
+; GFX908-NEXT: s_lshl_b64 s[10:11], s[6:7], 5
; GFX908-NEXT: s_lshl_b64 s[2:3], s[0:1], 5
-; GFX908-NEXT: s_lshl_b64 s[12:13], s[8:9], 5
-; GFX908-NEXT: s_lshl_b64 s[10:11], s[4:5], 5
-; GFX908-NEXT: s_or_b32 s10, s10, 28
+; GFX908-NEXT: s_lshl_b64 s[8:9], s[4:5], 5
+; GFX908-NEXT: s_or_b32 s8, s8, 28
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_readfirstlane_b32 s7, v16
; GFX908-NEXT: s_and_b32 s7, 0xffff, s7
; GFX908-NEXT: s_mul_i32 s1, s1, s7
-; GFX908-NEXT: s_mul_hi_u32 s9, s0, s7
+; GFX908-NEXT: s_mul_hi_u32 s12, s0, s7
; GFX908-NEXT: s_mul_i32 s0, s0, s7
-; GFX908-NEXT: s_add_i32 s1, s9, s1
-; GFX908-NEXT: s_lshl_b64 s[14:15], s[0:1], 5
+; GFX908-NEXT: s_add_i32 s1, s12, s1
+; GFX908-NEXT: s_lshl_b64 s[12:13], s[0:1], 5
; GFX908-NEXT: s_branch .LBB3_2
; GFX908-NEXT: .LBB3_1: ; %Flow20
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
@@ -565,59 +564,58 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX908-NEXT: .LBB3_2: ; %bb9
; GFX908-NEXT: ; =>This Loop Header: Depth=1
; GFX908-NEXT: ; Child Loop BB3_5 Depth 2
-; GFX908-NEXT: s_mov_b64 s[16:17], -1
+; GFX908-NEXT: s_mov_b64 s[14:15], -1
; GFX908-NEXT: s_cbranch_scc0 .LBB3_10
; GFX908-NEXT: ; %bb.3: ; %bb14
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
; GFX908-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
; GFX908-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], -1
-; GFX908-NEXT: s_mov_b32 s7, s6
-; GFX908-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1]
-; GFX908-NEXT: v_mov_b32_e32 v4, s6
-; GFX908-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v6
-; GFX908-NEXT: v_mov_b32_e32 v6, s6
-; GFX908-NEXT: v_mov_b32_e32 v9, s7
-; GFX908-NEXT: v_mov_b32_e32 v5, s7
-; GFX908-NEXT: v_mov_b32_e32 v7, s7
-; GFX908-NEXT: v_mov_b32_e32 v8, s6
-; GFX908-NEXT: v_cmp_lt_i64_e64 s[16:17], s[4:5], 0
-; GFX908-NEXT: v_mov_b32_e32 v11, v5
-; GFX908-NEXT: s_mov_b64 s[18:19], s[10:11]
-; GFX908-NEXT: v_mov_b32_e32 v10, v4
+; GFX908-NEXT: v_cmp_lt_i64_e64 s[14:15], s[4:5], 0
+; GFX908-NEXT: v_mov_b32_e32 v4, 0
+; GFX908-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[0:1]
+; GFX908-NEXT: v_mov_b32_e32 v6, 0
+; GFX908-NEXT: v_mov_b32_e32 v8, 0
+; GFX908-NEXT: v_mov_b32_e32 v10, 0
+; GFX908-NEXT: v_mov_b32_e32 v5, 0
+; GFX908-NEXT: v_mov_b32_e32 v7, 0
+; GFX908-NEXT: v_mov_b32_e32 v9, 0
+; GFX908-NEXT: v_mov_b32_e32 v11, 0
+; GFX908-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v12
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_readfirstlane_b32 s7, v2
-; GFX908-NEXT: v_readfirstlane_b32 s9, v3
+; GFX908-NEXT: v_readfirstlane_b32 s16, v3
; GFX908-NEXT: s_add_u32 s7, s7, 1
-; GFX908-NEXT: s_addc_u32 s9, s9, 0
-; GFX908-NEXT: s_mul_hi_u32 s20, s2, s7
-; GFX908-NEXT: s_mul_i32 s9, s2, s9
-; GFX908-NEXT: s_mul_i32 s21, s3, s7
-; GFX908-NEXT: s_add_i32 s9, s20, s9
+; GFX908-NEXT: s_addc_u32 s16, s16, 0
+; GFX908-NEXT: s_mul_hi_u32 s17, s2, s7
+; GFX908-NEXT: s_mul_i32 s16, s2, s16
+; GFX908-NEXT: s_mul_i32 s18, s3, s7
+; GFX908-NEXT: s_add_i32 s16, s17, s16
; GFX908-NEXT: s_mul_i32 s7, s2, s7
-; GFX908-NEXT: s_add_i32 s9, s9, s21
+; GFX908-NEXT: s_add_i32 s22, s16, s18
+; GFX908-NEXT: s_mov_b64 s[16:17], s[8:9]
; GFX908-NEXT: s_branch .LBB3_5
; GFX908-NEXT: .LBB3_4: ; %bb58
; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2
; GFX908-NEXT: v_add_co_u32_sdwa v2, vcc, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX908-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX908-NEXT: s_add_u32 s18, s18, s14
-; GFX908-NEXT: v_cmp_lt_i64_e64 s[22:23], -1, v[2:3]
-; GFX908-NEXT: s_addc_u32 s19, s19, s15
-; GFX908-NEXT: s_mov_b64 s[20:21], 0
-; GFX908-NEXT: s_andn2_b64 vcc, exec, s[22:23]
+; GFX908-NEXT: s_add_u32 s16, s16, s12
+; GFX908-NEXT: v_cmp_lt_i64_e64 s[20:21], -1, v[2:3]
+; GFX908-NEXT: s_addc_u32 s17, s17, s13
+; GFX908-NEXT: s_mov_b64 s[18:19], 0
+; GFX908-NEXT: s_andn2_b64 vcc, exec, s[20:21]
; GFX908-NEXT: s_cbranch_vccz .LBB3_9
; GFX908-NEXT: .LBB3_5: ; %bb16
; GFX908-NEXT: ; Parent Loop BB3_2 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX908-NEXT: s_add_u32 s20, s18, s7
-; GFX908-NEXT: s_addc_u32 s21, s19, s9
-; GFX908-NEXT: global_load_dword v21, v19, s[20:21] offset:-12 glc
+; GFX908-NEXT: s_add_u32 s18, s16, s7
+; GFX908-NEXT: s_addc_u32 s19, s17, s22
+; GFX908-NEXT: global_load_dword v21, v19, s[18:19] offset:-12 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_load_dword v20, v19, s[20:21] offset:-8 glc
+; GFX908-NEXT: global_load_dword v20, v19, s[18:19] offset:-8 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_load_dword v12, v19, s[20:21] offset:-4 glc
+; GFX908-NEXT: global_load_dword v12, v19, s[18:19] offset:-4 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: global_load_dword v12, v19, s[20:21] glc
+; GFX908-NEXT: global_load_dword v12, v19, s[18:19] glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: ds_read_b64 v[12:13], v19
; GFX908-NEXT: ds_read_b64 v[14:15], v0
@@ -646,29 +644,29 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX908-NEXT: v_add_f32_e32 v9, v9, v15
; GFX908-NEXT: v_add_f32_e32 v10, v10, v12
; GFX908-NEXT: v_add_f32_e32 v11, v11, v13
-; GFX908-NEXT: s_mov_b64 s[20:21], -1
+; GFX908-NEXT: s_mov_b64 s[18:19], -1
; GFX908-NEXT: s_branch .LBB3_4
; GFX908-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2
-; GFX908-NEXT: s_mov_b64 s[20:21], s[16:17]
-; GFX908-NEXT: s_andn2_b64 vcc, exec, s[20:21]
+; GFX908-NEXT: s_mov_b64 s[18:19], s[14:15]
+; GFX908-NEXT: s_andn2_b64 vcc, exec, s[18:19]
; GFX908-NEXT: s_cbranch_vccz .LBB3_4
; GFX908-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1
; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3
-; GFX908-NEXT: ; implicit-def: $sgpr18_sgpr19
+; GFX908-NEXT: ; implicit-def: $sgpr16_sgpr17
; GFX908-NEXT: .LBB3_9: ; %loop.exit.guard
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
-; GFX908-NEXT: s_xor_b64 s[16:17], s[20:21], -1
+; GFX908-NEXT: s_xor_b64 s[14:15], s[18:19], -1
; GFX908-NEXT: .LBB3_10: ; %Flow19
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
; GFX908-NEXT: s_mov_b64 s[0:1], -1
-; GFX908-NEXT: s_and_b64 vcc, exec, s[16:17]
+; GFX908-NEXT: s_and_b64 vcc, exec, s[14:15]
; GFX908-NEXT: s_cbranch_vccz .LBB3_1
; GFX908-NEXT: ; %bb.11: ; %bb12
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
-; GFX908-NEXT: s_add_u32 s4, s4, s8
+; GFX908-NEXT: s_add_u32 s4, s4, s6
; GFX908-NEXT: s_addc_u32 s5, s5, 0
-; GFX908-NEXT: s_add_u32 s10, s10, s12
-; GFX908-NEXT: s_addc_u32 s11, s11, s13
+; GFX908-NEXT: s_add_u32 s8, s8, s10
+; GFX908-NEXT: s_addc_u32 s9, s9, s11
; GFX908-NEXT: s_mov_b64 s[0:1], 0
; GFX908-NEXT: s_branch .LBB3_1
; GFX908-NEXT: .LBB3_12: ; %DummyReturnBlock
@@ -679,47 +677,46 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX90A-NEXT: global_load_ushort v18, v[0:1], off glc
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
-; GFX90A-NEXT: s_load_dword s7, s[8:9], 0x18
-; GFX90A-NEXT: s_mov_b32 s6, 0
-; GFX90A-NEXT: s_mov_b32 s9, s6
+; GFX90A-NEXT: s_mov_b32 s7, 0
+; GFX90A-NEXT: s_load_dword s8, s[8:9], 0x18
+; GFX90A-NEXT: v_mov_b32_e32 v19, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3
-; GFX90A-NEXT: s_sub_i32 s8, 0, s3
-; GFX90A-NEXT: v_mov_b32_e32 v19, 0
-; GFX90A-NEXT: v_rcp_iflag_f32_e32 v2, v0
+; GFX90A-NEXT: s_sub_i32 s6, 0, s3
+; GFX90A-NEXT: v_cvt_f32_f16_e32 v2, s8
+; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0
+; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v0
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], 0, 0
-; GFX90A-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v2
-; GFX90A-NEXT: v_cvt_f32_f16_e32 v2, s7
-; GFX90A-NEXT: v_readfirstlane_b32 s10, v3
-; GFX90A-NEXT: s_mul_i32 s8, s8, s10
-; GFX90A-NEXT: s_mul_hi_u32 s8, s10, s8
-; GFX90A-NEXT: s_add_i32 s10, s10, s8
-; GFX90A-NEXT: s_mul_hi_u32 s8, s2, s10
-; GFX90A-NEXT: s_mul_i32 s10, s8, s3
-; GFX90A-NEXT: s_sub_i32 s2, s2, s10
-; GFX90A-NEXT: s_add_i32 s11, s8, 1
-; GFX90A-NEXT: s_sub_i32 s10, s2, s3
+; GFX90A-NEXT: v_readfirstlane_b32 s9, v3
+; GFX90A-NEXT: s_mul_i32 s6, s6, s9
+; GFX90A-NEXT: s_mul_hi_u32 s6, s9, s6
+; GFX90A-NEXT: s_add_i32 s9, s9, s6
+; GFX90A-NEXT: s_mul_hi_u32 s6, s2, s9
+; GFX90A-NEXT: s_mul_i32 s9, s6, s3
+; GFX90A-NEXT: s_sub_i32 s2, s2, s9
+; GFX90A-NEXT: s_add_i32 s10, s6, 1
+; GFX90A-NEXT: s_sub_i32 s9, s2, s3
; GFX90A-NEXT: s_cmp_ge_u32 s2, s3
-; GFX90A-NEXT: s_cselect_b32 s8, s11, s8
-; GFX90A-NEXT: s_cselect_b32 s2, s10, s2
-; GFX90A-NEXT: s_add_i32 s10, s8, 1
+; GFX90A-NEXT: s_cselect_b32 s6, s10, s6
+; GFX90A-NEXT: s_cselect_b32 s2, s9, s2
+; GFX90A-NEXT: s_add_i32 s9, s6, 1
; GFX90A-NEXT: s_cmp_ge_u32 s2, s3
-; GFX90A-NEXT: s_cselect_b32 s8, s10, s8
-; GFX9...
[truncated]
You can test this locally with the following command:

git diff -U0 --pickaxe-regex -S '([^a-zA-Z0-9#_-]undef[^a-zA-Z0-9_-]|UndefValue::get)' 553da9634dc4bae215e6c850d2de3186d09f9da5 f7e6f7b7f3bd16943a7874a717dbec2e04b5156f llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp llvm/test/CodeGen/AMDGPU/adjust-writemask-cse.ll llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll llvm/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll llvm/test/CodeGen/AMDGPU/cluster_stores.ll llvm/test/CodeGen/AMDGPU/collapse-endcf.ll llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll llvm/test/CodeGen/AMDGPU/fcanonicalize.ll llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-nondeterminism.ll llvm/test/CodeGen/AMDGPU/flat-scratch.ll llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll llvm/test/CodeGen/AMDGPU/hazard-recognizer-src-shared-base.ll llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll llvm/test/CodeGen/AMDGPU/imm.ll llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll llvm/test/CodeGen/AMDGPU/issue92561-restore-undef-scc-verifier-error.ll llvm/test/CodeGen/AMDGPU/issue98474-need-live-out-undef-subregister-def.ll llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll llvm/test/CodeGen/AMDGPU/masked-load-vectortypes.ll llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll llvm/test/CodeGen/AMDGPU/mfma-loop.ll llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll llvm/test/CodeGen/AMDGPU/scalar-float-sop2.ll llvm/test/CodeGen/AMDGPU/scheduler-rp-calc-one-successor-two-predecessors-bug.ll llvm/test/CodeGen/AMDGPU/set_kill_i1_for_floation_point_comparison.ll llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.ll llvm/test/CodeGen/AMDGPU/si-scheduler-exports.ll llvm/test/CodeGen/AMDGPU/smfmac_no_agprs.ll llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll llvm/test/CodeGen/AMDGPU/vopc_dpp.ll llvm/test/CodeGen/AMDGPU/waterfall_kills_scc.ll llvm/test/CodeGen/AMDGPU/wqm.ll

The following files introduce new uses of undef:
Undef is now deprecated and should only be used in the rare cases where no replacement is possible. For example, a load of uninitialized memory yields undef. In tests, avoid using undef; if an operand just needs some unimportant value, add a new argument to the function and use that instead.

For example, this is considered a bad practice:

define void @fn() {
  ...
  br i1 undef, ...
}

Please use the following instead:

define void @fn(i1 %cond) {
  ...
  br i1 %cond, ...
}

Please refer to the Undefined Behavior Manual for more information.
The set of peephole opt patches I'm working on improves some of these types of situations.
You also have a number of regressions in the VALU cases; something should be done about those (which may mean not doing this for VALU values). Also consider v_mov_b64 on gfx940.
if (ISD::isConstantSplatVectorAllZeros(N))
  return true;

// Types may have legalized by stripping the 16 bit multi-element vector
It would be better to adjust the legalizer to do this in the first place
As in, allow vector types as long as they splat 0?
Even if they don't splat 0
The splatness isn't significant, and if we're going to have 64-bit vector elements we should have made them legal throughout the DAG, not just during selection
@@ -1374,29 +1374,28 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out)
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
Regression
@@ -1585,29 +1584,28 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
Regression
Looking into it
Was going to be a follow-up patch to fold the sgpr64 set-0 cases into vgpr64 in SIFoldOperands (perhaps should be a peephole opt?)
I am speculating that there are several things going on, some of which will be improved by a set of patches I'm working on, which is taking significantly longer than I expected. Related to that, I was thinking about other ways we can improve the immediate folding. I've been working on enabling MachineCSE to work for subregister extracts, and on improving folds of copies through subregisters. In doing this, I see a number of places where we get better coalescing and can re-assemble the materialization of 64-bit inline immediates.

I've been thinking we should teach foldImmediate and/or SIFoldOperands to reconstruct 64-bit moves. E.g., foldImmediate can see that the use instruction is a reg_sequence with the same register used twice, and replace it with an s_mov_b64. It also currently skips any constants with multiple uses, but the correct heuristic is probably more refined (like only skipping multiple uses for non-inline-immediate constants).
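A rough sketch of the shape of that reg_sequence fold, purely for illustration (the helper name, where it would hook in, and the operand handling are my assumptions, not taken from the in-flight patches):

// Illustrative only: fold
//   %lo:sgpr_32 = S_MOV_B32 <imm>
//   %d:sreg_64  = REG_SEQUENCE %lo, %subreg.sub0, %lo, %subreg.sub1
// into
//   %d:sreg_64  = S_MOV_B64_IMM_PSEUDO <imm64>
#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"

using namespace llvm;

static bool foldRegSequenceOfSplatImm(MachineInstr &RegSeq, MachineInstr &DefMI,
                                      const SIInstrInfo &TII) {
  // Expect a two-element REG_SEQUENCE: dst, src0, subidx0, src1, subidx1.
  if (RegSeq.getOpcode() != AMDGPU::REG_SEQUENCE || RegSeq.getNumOperands() != 5)
    return false;
  if (DefMI.getOpcode() != AMDGPU::S_MOV_B32 || !DefMI.getOperand(1).isImm())
    return false;

  Register Imm32Reg = DefMI.getOperand(0).getReg();
  if (RegSeq.getOperand(1).getReg() != Imm32Reg ||
      RegSeq.getOperand(3).getReg() != Imm32Reg ||
      RegSeq.getOperand(2).getImm() != AMDGPU::sub0 ||
      RegSeq.getOperand(4).getImm() != AMDGPU::sub1)
    return false;

  // Both halves hold the same 32-bit value; build the combined 64-bit imm.
  uint64_t Lo = DefMI.getOperand(1).getImm() & 0xffffffffULL;
  int64_t Imm64 = static_cast<int64_t>(Lo | (Lo << 32));

  // Rewrite the REG_SEQUENCE in place into a 64-bit immediate move.
  RegSeq.setDesc(TII.get(AMDGPU::S_MOV_B64_IMM_PSEUDO));
  while (RegSeq.getNumOperands() > 1)
    RegSeq.removeOperand(RegSeq.getNumOperands() - 1);
  RegSeq.addOperand(MachineOperand::CreateImm(Imm64));
  return true;
}

Register-class constraints and dead-def cleanup are glossed over here; the point is only the pattern foldImmediate would look for: a reg_sequence whose two 32-bit sources are the same materialized immediate.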
This is exactly what would fix the aforementioned regressions. In the case prior to s_mov_b64 materialization, MachineCSE eliminates the duplicate s_mov_b32, but it doesn't have the logic for CSE'ing subregister-materialized immediates.
One of the reasons I tried to solve this in the SelectionDAG is that it has non-opaque and nicely described helpers for checking whether there's a splatted vector. Additionally, my plan was to add s_mov_b64 -> v_mov_b64 folding (for gfx942+) in SIFoldOperands, so I didn't have to fit both s_mov_b32 x2 -> s_mov_b64 and s_mov_b64 -> v_mov_b64 into the same pass. Anyhow, I'll check some of the mentioned alternatives, since this approach also requires a separate change for GlobalISel to do the same.
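For the s_mov_b64 -> v_mov_b64 direction, a minimal sketch of what such a fold could look like, assuming the subtarget gate is the hasMovB64() predicate and that rewriting the COPY in place is acceptable (both assumptions on my part, not the actual follow-up patch):

// Illustrative only: when a 64-bit SGPR immediate is only copied into a
// 64-bit VGPR and the subtarget has v_mov_b64, emit the VALU move directly.
#include "GCNSubtarget.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

using namespace llvm;

static bool foldSMovB64IntoVMovB64(MachineInstr &CopyMI, MachineRegisterInfo &MRI,
                                   const GCNSubtarget &ST, const SIInstrInfo &TII) {
  // Expect: %v:vreg_64 = COPY %s   with   %s:sreg_64 = S_MOV_B64_IMM_PSEUDO <imm>
  if (!ST.hasMovB64() || !CopyMI.isCopy())
    return false;

  Register Src = CopyMI.getOperand(1).getReg();
  if (!Src.isVirtual() || CopyMI.getOperand(1).getSubReg())
    return false;

  MachineInstr *Def = MRI.getUniqueVRegDef(Src);
  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B64_IMM_PSEUDO ||
      !Def->getOperand(1).isImm())
    return false;

  // Replace the COPY with a direct 64-bit VALU move of the same immediate.
  // Destination register-class checks and dead-def cleanup are omitted.
  CopyMI.setDesc(TII.get(AMDGPU::V_MOV_B64_PSEUDO));
  CopyMI.getOperand(1).ChangeToImmediate(Def->getOperand(1).getImm());
  return true;
}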
Rebase
I've been looking at resolving the regression but having a hard time finding a good MachineCSE alternative. Tried to convince the complex pattern to associate the added Additionally, I bailed out of moving this selection code into |
BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp)
    .addImm(Imm);
B.addReg(Tmp);
// Adding multiple immediates in case of 64 bit MOVs to agpr. |
This part is obsolete with #129463
I'll have to rebase wrt your changes and add a similar split-b64-for-agpr
Do any of these test changes include the example you are trying to fix? I have several patches in flight which largely address similar issues
I've added https://github.com/llvm/llvm-project/pull/128017/files#diff-a073dcad4b3da7b790117b5218e1e7d7bd24eb963ac0d54b601f7f6fb001e453
Additionally, my follow-up patch will fold
can you separately pre-commit the test
@@ -0,0 +1,272 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -O3 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck --check-prefix=GFX942 %s |
don't need -O3, probably
%0 = insertelement <2 x i1> poison, i1 %mask, i64 0
%1 = shufflevector <2 x i1> %0, <2 x i1> poison, <2 x i32> zeroinitializer
%result = tail call <2 x i32> @llvm.masked.load.v2i32.p1(ptr addrspace(1) %ptr, i32 2, <2 x i1> %1, <2 x i32> zeroinitializer)
ret <2 x i32> %result |
Use named values in tests
ret <8 x bfloat> %result
}

define <16 x i8> @masked_load_v16i8(ptr addrspace(1) inreg nocapture readonly %ptr, i1 %mask) { |
Can you add something to the name to indicate the value kinds? These are all uniform SGPR ptr + divergent mask
ret <16 x i8> %result
}

; Declare the masked load intrinsics |
Don't need the comment (don't actually need the declarations either)
I've put up #129703, and yesterday's PR #129464 has already helped with removing the intermediate
Any ideas for alternatives to implementing subreg awareness in MachineCSE, for the regression of an additional vgpr use in cases like
Rebase
Shouldn't be resolved in selection; move b64 immediate materialization (through v_mov_b64, if available) to amdgpu fold mechanism.
Zeros seem to materialize into a sequence of s_mov_b32s, even when s_mov_b64 is possible. First step to solving a worse case when 0 should be materialized into vector-type vgprs, as it first materializes into sgprs before moving into vgprs.