-
Notifications
You must be signed in to change notification settings - Fork 13.6k
WIP: RegisterCoalescer: Expand source register class to coalesce subreg inserts #130879
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
WIP: RegisterCoalescer: Expand source register class to coalesce subreg inserts #130879
Conversation
Warning This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite.
This stack of pull requests is managed by Graphite. Learn more about stacking. |
@llvm/pr-subscribers-backend-amdgpu
Author: Matt Arsenault (arsenm)
Changes: RegisterCoalescer: Expand source register class to coalesce subreg inserts. This fixes worse coalescing on AMDGPU in situations where VGPRs are copied into AGPRs. E.g. %0:vgpr_32 = COPY $vgpr0 ...
Patch is 28.27 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/130879.diff
6 Files Affected:
diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp
index 74606e66d4e4b..d6be8a3ea95d1 100644
--- a/llvm/lib/CodeGen/RegisterCoalescer.cpp
+++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -456,6 +456,100 @@ static bool isSplitEdge(const MachineBasicBlock *MBB) {
return true;
}
+static const TargetRegisterClass *
+getCommonSubClassWithSubReg(const TargetRegisterInfo &TRI,
+ const TargetRegisterClass *DstRC,
+ const TargetRegisterClass *SrcRC, unsigned DstSub,
+ const MachineFunction &MF) {
+ const TargetRegisterClass *ReferenceRC = TRI.getMatchingSuperRegClass(DstRC, SrcRC, DstSub);
+
+ if (ReferenceRC)
+ return ReferenceRC;
+
+
+ const TargetRegisterClass *NewSuperDstRC = nullptr;
+ for (auto SuperDstRCID : DstRC->superclasses()) {
+ const TargetRegisterClass *SuperDstRC = TRI.getRegClass(SuperDstRCID);
+ const TargetRegisterClass *WithSubRC = TRI.getSubClassWithSubReg(SuperDstRC, DstSub);
+ if (WithSubRC)
+ return TRI.getMatchingSuperRegClass(WithSubRC, SrcRC, DstSub);
+ }
+
+ // TODO: For a subregister insert, it may well be worth either reconstraining
+ // the source register based on the use constraints, or even introducing a
+ // copy.
+
+
+ LLVM_DEBUG(dbgs() << "NewSuperDstRC = " << (NewSuperDstRC ? TRI.getRegClassName(NewSuperDstRC) : "<null>") << '\n');
+ if (!NewSuperDstRC)
+ return nullptr;
+
+ const TargetRegisterClass *LargerSrcRC =
+ TRI.getLargestLegalSuperClass(SrcRC, MF);
+
+ return TRI.getMatchingSuperRegClass(NewSuperDstRC, SrcRC, DstSub);
+
+ //const TargetRegisterClass *ReferenceRC = TRI.getMatchingSuperRegClass(DstRC, SrcRC, DstSub);
+
+
+ const TargetRegisterClass *LargerDstRC =
+ TRI.getLargestLegalSuperClass(DstRC, MF);
+
+ const TargetRegisterClass *NewSuperRC =
+ TRI.getMatchingSuperRegClass(DstRC, LargerSrcRC, DstSub);
+
+ LLVM_DEBUG(
+ dbgs() << "ReferenceRC: " << (ReferenceRC ? TRI.getRegClassName(ReferenceRC) : "<null>") << '\n';
+ dbgs() << "LargerDstRC: " << TRI.getRegClassName(LargerDstRC) << '\n';
+ dbgs() << "LargerSrcRC: " << TRI.getRegClassName(LargerSrcRC) << '\n';);
+
+ if (NewSuperRC) {
+
+ {
+
+ unsigned PreA, PreB;
+ const TargetRegisterClass *CommSup = TRI.getCommonSuperRegClass(SrcRC, DstSub,
+ DstRC, DstSub,
+ PreA, PreB);
+
+ }
+
+ if (TRI.getSubClassWithSubReg(NewSuperRC, DstSub) == NewSuperRC) {
+ // Subregister supported in test class
+ }
+
+ if (false && TRI.getSubRegisterClass(NewSuperRC, DstSub) != SrcRC)
+ return nullptr;
+
+ const TargetRegisterClass *NewDstSubRC = TRI.getSubClassWithSubReg(NewSuperRC, DstSub);
+
+ const TargetRegisterClass *NewSubRC = TRI.getSubRegisterClass(NewSuperRC, DstSub);
+
+
+// if (TRI.getCommonSubClass(NewSubRC, SrcRC) == SrcRC)
+// return nullptr;
+ return TRI.getMatchingSuperRegClass(NewSuperRC, NewSubRC, DstSub);
+
+ //if (NewSubRC->hasSubClassEq(SrcRC))
+ //return nullptr;
+
+
+
+ //if (!NewDstSubRC || !NewDstSubRC->hasSubClassEq(SrcRC))
+ ///return nullptr;
+
+ //return TRI.getCommonSubClass(DstRC, NewDstSubRC);
+
+ return NewDstSubRC;
+
+ //const TargetRegisterClass *Rematch =
+ //TRI.getMatchingSuperRegClass(NewSuperRC, SrcRC, DstSub);
+ //return Rematch;
+ }
+
+ return nullptr;
+}
+
bool CoalescerPair::setRegisters(const MachineInstr *MI) {
SrcReg = DstReg = Register();
SrcIdx = DstIdx = 0;
@@ -477,7 +571,8 @@ bool CoalescerPair::setRegisters(const MachineInstr *MI) {
Flipped = true;
}
- const MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
+ const MachineFunction *MF = MI->getMF();
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
const TargetRegisterClass *SrcRC = MRI.getRegClass(Src);
if (Dst.isPhysical()) {
@@ -514,7 +609,37 @@ bool CoalescerPair::setRegisters(const MachineInstr *MI) {
} else if (DstSub) {
// SrcReg will be merged with a sub-register of DstReg.
SrcIdx = DstSub;
- NewRC = TRI.getMatchingSuperRegClass(DstRC, SrcRC, DstSub);
+
+ const TargetRegisterClass *OldRC = TRI.getMatchingSuperRegClass(DstRC, SrcRC, DstSub);
+
+ // TODO: Probably should be more aggressive than one use. This is mostly
+ // to avoid regressing some lane broadcast patterns.
+ if (false && !OldRC && !MRI.hasOneNonDBGUse(Src))
+ return false;
+
+ LLVM_DEBUG(
+ dbgs() << "SrcRC = " << TRI.getRegClassName(SrcRC) << '\n';
+ dbgs() << "DstRC = " << TRI.getRegClassName(DstRC) << '\n';
+ dbgs() << "DstSub = " << TRI.getSubRegIndexName(DstSub) << '\n';
+ dbgs() << "OldRC " << (OldRC ? TRI.getRegClassName(OldRC) : "<null>")
+ << '\n';);
+
+ NewRC = getCommonSubClassWithSubReg(TRI, DstRC, SrcRC, DstSub, *MF);
+
+ LLVM_DEBUG(dbgs() << "NewRC "
+ << (NewRC ? TRI.getRegClassName(NewRC) : "<null>")
+ << '\n';);
+
+
+ /*
+ const TargetRegisterClass *SRC =
+ TRI.getSubClassWithSubReg(NewRC, DstSub);
+
+ assert(SRC && "no class with subreg");
+ assert(SRC->hasSubClassEq(SrcRC));
+ */
+
+
} else if (SrcSub) {
// DstReg will be merged with a sub-register of SrcReg.
DstIdx = SrcSub;
@@ -2205,11 +2330,20 @@ bool RegisterCoalescer::joinCopy(
return false;
}
+
// Coalescing to a virtual register that is of a sub-register class of the
// other. Make sure the resulting register is set to the right register class.
if (CP.isCrossClass()) {
- ++numCrossRCs;
+
+ if (false && !MRI->constrainRegClass(CP.getDstReg(), CP.getNewRC())) {
+ //dbgs() <
+ }
+
MRI->setRegClass(CP.getDstReg(), CP.getNewRC());
+
+ //InflateRegs.push_back(CP.getSrcReg());
+ //InflateRegs.push_back(CP.getDstReg());
+ ++numCrossRCs;
}
// Removing sub-register copies can ease the register class constraints.
diff --git a/llvm/test/CodeGen/AMDGPU/coalescer-better-job-with-av-32.mir b/llvm/test/CodeGen/AMDGPU/coalescer-better-job-with-av-32.mir
new file mode 100644
index 0000000000000..e3cb988a04f97
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/coalescer-better-job-with-av-32.mir
@@ -0,0 +1,186 @@
+# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -start-before=register-coalescer -o - %s | FileCheck %s
+--- |
+ define void @coalescer_av_32_test() {
+ ; CHECK-LABEL: coalescer_av_32_test:
+ ; CHECK: ; %bb.0:
+ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+ ; CHECK-NEXT: v_mov_b32_e32 v16, s0
+ ; CHECK-NEXT: v_mov_b32_e32 v17, s1
+ ; CHECK-NEXT: v_mov_b32_e32 v18, s2
+ ; CHECK-NEXT: v_mov_b32_e32 v19, s3
+ ; CHECK-NEXT: v_mov_b32_e32 v20, s16
+ ; CHECK-NEXT: v_mov_b32_e32 v21, s17
+ ; CHECK-NEXT: v_mov_b32_e32 v22, s18
+ ; CHECK-NEXT: v_mov_b32_e32 v23, s19
+ ; CHECK-NEXT: v_accvgpr_write_b32 a0, s20
+ ; CHECK-NEXT: v_accvgpr_write_b32 a1, s21
+ ; CHECK-NEXT: v_accvgpr_write_b32 a2, s22
+ ; CHECK-NEXT: v_accvgpr_write_b32 a3, s23
+ ; CHECK-NEXT: v_accvgpr_write_b32 a4, s24
+ ; CHECK-NEXT: v_accvgpr_write_b32 a5, s25
+ ; CHECK-NEXT: v_accvgpr_write_b32 a6, s26
+ ; CHECK-NEXT: v_accvgpr_write_b32 a7, s27
+ ; CHECK-NEXT: v_accvgpr_write_b32 a8, s28
+ ; CHECK-NEXT: v_accvgpr_write_b32 a9, s29
+ ; CHECK-NEXT: v_accvgpr_write_b32 a10, v8
+ ; CHECK-NEXT: v_accvgpr_write_b32 a11, v9
+ ; CHECK-NEXT: v_accvgpr_write_b32 a12, v10
+ ; CHECK-NEXT: v_accvgpr_write_b32 a13, v11
+ ; CHECK-NEXT: v_accvgpr_write_b32 a14, v12
+ ; CHECK-NEXT: v_accvgpr_write_b32 a15, v13
+ ; CHECK-NEXT: s_nop 1
+ ; CHECK-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
+ ; CHECK-NEXT: s_nop 7
+ ; CHECK-NEXT: s_nop 7
+ ; CHECK-NEXT: s_nop 3
+ ; CHECK-NEXT: v_accvgpr_read_b32 v0, a0
+ ; CHECK-NEXT: v_accvgpr_read_b32 v1, a1
+ ; CHECK-NEXT: v_accvgpr_read_b32 v2, a2
+ ; CHECK-NEXT: v_accvgpr_read_b32 v3, a3
+ ; CHECK-NEXT: v_accvgpr_read_b32 v4, a4
+ ; CHECK-NEXT: v_accvgpr_read_b32 v5, a5
+ ; CHECK-NEXT: v_accvgpr_read_b32 v6, a6
+ ; CHECK-NEXT: v_accvgpr_read_b32 v7, a7
+ ; CHECK-NEXT: v_accvgpr_read_b32 v8, a8
+ ; CHECK-NEXT: v_accvgpr_read_b32 v9, a9
+ ; CHECK-NEXT: v_accvgpr_read_b32 v10, a10
+ ; CHECK-NEXT: v_accvgpr_read_b32 v11, a11
+ ; CHECK-NEXT: v_accvgpr_read_b32 v12, a12
+ ; CHECK-NEXT: v_accvgpr_read_b32 v13, a13
+ ; CHECK-NEXT: v_accvgpr_read_b32 v14, a14
+ ; CHECK-NEXT: v_accvgpr_read_b32 v15, a15
+ ; CHECK-NEXT: s_setpc_b64 s[30:31]
+ ret void
+ }
+
+...
+---
+name: coalescer_av_32_test
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15
+
+ %0:vgpr_32 = COPY killed $vgpr15
+ %1:vgpr_32 = COPY killed $vgpr14
+ %2:vgpr_32 = COPY killed $vgpr13
+ %3:vgpr_32 = COPY killed $vgpr12
+ %4:vgpr_32 = COPY killed $vgpr11
+ %5:vgpr_32 = COPY killed $vgpr10
+ %6:vgpr_32 = COPY killed $vgpr9
+ %7:vgpr_32 = COPY killed $vgpr8
+ %8:sgpr_32 = COPY killed $sgpr29
+ %9:sgpr_32 = COPY killed $sgpr28
+ %10:sgpr_32 = COPY killed $sgpr27
+ %11:sgpr_32 = COPY killed $sgpr26
+ %12:sgpr_32 = COPY killed $sgpr25
+ %13:sgpr_32 = COPY killed $sgpr24
+ %14:sgpr_32 = COPY killed $sgpr23
+ %15:sgpr_32 = COPY killed $sgpr22
+ %16:sgpr_32 = COPY killed $sgpr21
+ %17:sgpr_32 = COPY killed $sgpr20
+ %18:vgpr_32 = COPY killed $vgpr7
+ %19:vgpr_32 = COPY killed $vgpr6
+ %20:vgpr_32 = COPY killed $vgpr5
+ %21:vgpr_32 = COPY killed $vgpr4
+ %22:vgpr_32 = COPY killed $vgpr3
+ %23:vgpr_32 = COPY killed $vgpr2
+ %24:vgpr_32 = COPY killed $vgpr1
+ %25:vgpr_32 = COPY killed $vgpr0
+ %26:sgpr_32 = COPY killed $sgpr19
+ %27:sgpr_32 = COPY killed $sgpr18
+ %28:sgpr_32 = COPY killed $sgpr17
+ %29:sgpr_32 = COPY killed $sgpr16
+ %30:sgpr_32 = COPY killed $sgpr3
+ %31:sgpr_32 = COPY killed $sgpr2
+ %32:sgpr_32 = COPY killed $sgpr1
+ %33:sgpr_32 = COPY killed $sgpr0
+ %34:av_32 = COPY killed %17
+ %35:av_32 = COPY killed %16
+ %36:av_32 = COPY killed %15
+ %37:av_32 = COPY killed %14
+ %38:av_32 = COPY killed %13
+ %39:av_32 = COPY killed %12
+ %40:av_32 = COPY killed %11
+ %41:av_32 = COPY killed %10
+ %42:av_32 = COPY killed %9
+ %43:av_32 = COPY killed %8
+ undef %44.sub0:vreg_256_align2 = COPY killed %25
+ %44.sub1:vreg_256_align2 = COPY killed %24
+ %44.sub2:vreg_256_align2 = COPY killed %23
+ %44.sub3:vreg_256_align2 = COPY killed %22
+ %44.sub4:vreg_256_align2 = COPY killed %21
+ %44.sub5:vreg_256_align2 = COPY killed %20
+ %44.sub6:vreg_256_align2 = COPY killed %19
+ %44.sub7:vreg_256_align2 = COPY killed %18
+ %45:av_32 = COPY killed %33
+ %46:av_32 = COPY killed %32
+ %47:av_32 = COPY killed %31
+ %48:av_32 = COPY killed %30
+ %49:av_32 = COPY killed %29
+ %50:av_32 = COPY killed %28
+ %51:av_32 = COPY killed %27
+ %52:av_32 = COPY killed %26
+ undef %53.sub0:vreg_256_align2 = COPY killed %45
+ %53.sub1:vreg_256_align2 = COPY killed %46
+ %53.sub2:vreg_256_align2 = COPY killed %47
+ %53.sub3:vreg_256_align2 = COPY killed %48
+ %53.sub4:vreg_256_align2 = COPY killed %49
+ %53.sub5:vreg_256_align2 = COPY killed %50
+ %53.sub6:vreg_256_align2 = COPY killed %51
+ %53.sub7:vreg_256_align2 = COPY killed %52
+ undef %54.sub0:areg_512_align2 = COPY killed %34
+ %54.sub1:areg_512_align2 = COPY killed %35
+ %54.sub2:areg_512_align2 = COPY killed %36
+ %54.sub3:areg_512_align2 = COPY killed %37
+ %54.sub4:areg_512_align2 = COPY killed %38
+ %54.sub5:areg_512_align2 = COPY killed %39
+ %54.sub6:areg_512_align2 = COPY killed %40
+ %54.sub7:areg_512_align2 = COPY killed %41
+ %54.sub8:areg_512_align2 = COPY killed %42
+ %54.sub9:areg_512_align2 = COPY killed %43
+ %54.sub10:areg_512_align2 = COPY killed %7
+ %54.sub11:areg_512_align2 = COPY killed %6
+ %54.sub12:areg_512_align2 = COPY killed %5
+ %54.sub13:areg_512_align2 = COPY killed %4
+ %54.sub14:areg_512_align2 = COPY killed %3
+ %54.sub15:areg_512_align2 = COPY killed %2
+ %55:areg_512_align2 = COPY killed %54
+ %55:areg_512_align2 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f8_f8_mac_e64 killed %53, killed %44, %55, 0, 0, killed %1, killed %0, 0, 0, implicit $mode, implicit $exec
+ %56:av_32 = COPY %55.sub0
+ %57:av_32 = COPY %55.sub1
+ %58:av_32 = COPY %55.sub2
+ %59:av_32 = COPY %55.sub3
+ %60:av_32 = COPY %55.sub4
+ %61:av_32 = COPY %55.sub5
+ %62:av_32 = COPY %55.sub6
+ %63:av_32 = COPY %55.sub7
+ %64:av_32 = COPY %55.sub8
+ %65:av_32 = COPY %55.sub9
+ %66:av_32 = COPY %55.sub10
+ %67:av_32 = COPY %55.sub11
+ %68:av_32 = COPY %55.sub12
+ %69:av_32 = COPY %55.sub13
+ %70:av_32 = COPY %55.sub14
+ %71:av_32 = COPY killed %55.sub15
+ $vgpr0 = COPY killed %56
+ $vgpr1 = COPY killed %57
+ $vgpr2 = COPY killed %58
+ $vgpr3 = COPY killed %59
+ $vgpr4 = COPY killed %60
+ $vgpr5 = COPY killed %61
+ $vgpr6 = COPY killed %62
+ $vgpr7 = COPY killed %63
+ $vgpr8 = COPY killed %64
+ $vgpr9 = COPY killed %65
+ $vgpr10 = COPY killed %66
+ $vgpr11 = COPY killed %67
+ $vgpr12 = COPY killed %68
+ $vgpr13 = COPY killed %69
+ $vgpr14 = COPY killed %70
+ $vgpr15 = COPY killed %71
+ SI_RETURN implicit killed $vgpr0, implicit killed $vgpr1, implicit killed $vgpr2, implicit killed $vgpr3, implicit killed $vgpr4, implicit killed $vgpr5, implicit killed $vgpr6, implicit killed $vgpr7, implicit killed $vgpr8, implicit killed $vgpr9, implicit killed $vgpr10, implicit killed $vgpr11, implicit killed $vgpr12, implicit killed $vgpr13, implicit killed $vgpr14, implicit killed $vgpr15
+
+...
+
diff --git a/llvm/test/CodeGen/AMDGPU/coalescer-worse-job-with-vgpr-32-instead-of-av-32.mir b/llvm/test/CodeGen/AMDGPU/coalescer-worse-job-with-vgpr-32-instead-of-av-32.mir
new file mode 100644
index 0000000000000..8ff727e3c3890
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/coalescer-worse-job-with-vgpr-32-instead-of-av-32.mir
@@ -0,0 +1,200 @@
+# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -start-before=register-coalescer -o - %s | FileCheck %s
+--- |
+ define void @coalescer_av_32_test() {
+ ; CHECK-LABEL: coalescer_av_32_test:
+ ; CHECK: ; %bb.0:
+ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+ ; CHECK-NEXT: v_mov_b32_e32 v16, s0
+ ; CHECK-NEXT: v_mov_b32_e32 v17, s1
+ ; CHECK-NEXT: v_mov_b32_e32 v18, s2
+ ; CHECK-NEXT: v_mov_b32_e32 v19, s3
+ ; CHECK-NEXT: v_mov_b32_e32 v20, s16
+ ; CHECK-NEXT: v_mov_b32_e32 v21, s17
+ ; CHECK-NEXT: v_mov_b32_e32 v22, s18
+ ; CHECK-NEXT: v_mov_b32_e32 v23, s19
+ ; CHECK-NEXT: v_mov_b32_e32 v24, s20
+ ; CHECK-NEXT: v_mov_b32_e32 v25, s21
+ ; CHECK-NEXT: v_mov_b32_e32 v26, s22
+ ; CHECK-NEXT: v_mov_b32_e32 v27, s23
+ ; CHECK-NEXT: v_mov_b32_e32 v28, s24
+ ; CHECK-NEXT: v_mov_b32_e32 v29, s25
+ ; CHECK-NEXT: v_mov_b32_e32 v30, s26
+ ; CHECK-NEXT: v_mov_b32_e32 v31, s27
+ ; CHECK-NEXT: v_mov_b32_e32 v32, s28
+ ; CHECK-NEXT: v_mov_b32_e32 v33, s29
+ ; CHECK-NEXT: v_accvgpr_write_b32 a0, v24
+ ; CHECK-NEXT: v_accvgpr_write_b32 a1, v25
+ ; CHECK-NEXT: v_accvgpr_write_b32 a2, v26
+ ; CHECK-NEXT: v_accvgpr_write_b32 a3, v27
+ ; CHECK-NEXT: v_accvgpr_write_b32 a4, v28
+ ; CHECK-NEXT: v_accvgpr_write_b32 a5, v29
+ ; CHECK-NEXT: v_accvgpr_write_b32 a6, v30
+ ; CHECK-NEXT: v_accvgpr_write_b32 a7, v31
+ ; CHECK-NEXT: v_accvgpr_write_b32 a8, v32
+ ; CHECK-NEXT: v_accvgpr_write_b32 a9, v33
+ ; CHECK-NEXT: v_accvgpr_write_b32 a10, v8
+ ; CHECK-NEXT: v_accvgpr_write_b32 a11, v9
+ ; CHECK-NEXT: v_accvgpr_write_b32 a12, v10
+ ; CHECK-NEXT: v_accvgpr_write_b32 a13, v11
+ ; CHECK-NEXT: v_accvgpr_write_b32 a14, v12
+ ; CHECK-NEXT: v_accvgpr_write_b32 a15, v13
+ ; CHECK-NEXT: s_nop 1
+ ; CHECK-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
+ ; CHECK-NEXT: s_nop 7
+ ; CHECK-NEXT: s_nop 7
+ ; CHECK-NEXT: s_nop 3
+ ; CHECK-NEXT: v_accvgpr_read_b32 v0, a0
+ ; CHECK-NEXT: v_accvgpr_read_b32 v1, a1
+ ; CHECK-NEXT: v_accvgpr_read_b32 v2, a2
+ ; CHECK-NEXT: v_accvgpr_read_b32 v3, a3
+ ; CHECK-NEXT: v_accvgpr_read_b32 v4, a4
+ ; CHECK-NEXT: v_accvgpr_read_b32 v5, a5
+ ; CHECK-NEXT: v_accvgpr_read_b32 v6, a6
+ ; CHECK-NEXT: v_accvgpr_read_b32 v7, a7
+ ; CHECK-NEXT: v_accvgpr_read_b32 v8, a8
+ ; CHECK-NEXT: v_accvgpr_read_b32 v9, a9
+ ; CHECK-NEXT: v_accvgpr_read_b32 v10, a10
+ ; CHECK-NEXT: v_accvgpr_read_b32 v11, a11
+ ; CHECK-NEXT: v_accvgpr_read_b32 v12, a12
+ ; CHECK-NEXT: v_accvgpr_read_b32 v13, a13
+ ; CHECK-NEXT: v_accvgpr_read_b32 v14, a14
+ ; CHECK-NEXT: v_accvgpr_read_b32 v15, a15
+ ; CHECK-NEXT: s_setpc_b64 s[30:31]
+ ret void
+ }
+
+...
+---
+name: coalescer_av_32_test
+tracksRegLiveness: true
+noPhis: true
+isSSA: false
+noVRegs: false
+hasFakeUses: false
+registers:
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15
+
+ %0:vgpr_32 = COPY killed $vgpr15
+ %1:vgpr_32 = COPY killed $vgpr14
+ %2:vgpr_32 = COPY killed $vgpr13
+ %3:vgpr_32 = COPY killed $vgpr12
+ %4:vgpr_32 = COPY killed $vgpr11
+ %5:vgpr_32 = COPY killed $vgpr10
+ %6:vgpr_32 = COPY killed $vgpr9
+ %7:vgpr_32 = COPY killed $vgpr8
+ %8:sgpr_32 = COPY killed $sgpr29
+ %9:sgpr_32 = COPY killed $sgpr28
+ %10:sgpr_32 = COPY killed $sgpr27
+ %11:sgpr_32 = COPY killed $sgpr26
+ %12:sgpr_32 = COPY killed $sgpr25
+ %13:sgpr_32 = COPY killed $sgpr24
+ %14:sgpr_32 = COPY killed $sgpr23
+ %15:sgpr_32 = COPY killed $sgpr22
+ %16:sgpr_32 = COPY killed $sgpr21
+ %17:sgpr_32 = COPY killed $sgpr20
+ %18:vgpr_32 = COPY killed $vgpr7
+ %19:vgpr_32 = COPY killed $vgpr6
+ %20:vgpr_32 = COPY killed $vgpr5
+ %21:vgpr_32 = COPY killed $vgpr4
+ %22:vgpr_32 = COPY killed $vgpr3
+ %23:vgpr_32 = COPY killed $vgpr2
+ %24:vgpr_32 = COPY killed $vgpr1
+ %25:vgpr_32 = COPY killed $vgpr0
+ %26:sgpr_32 = COPY killed $sgpr19
+ %27:sgpr_32 = COPY killed $sgpr18
+ %28:sgpr_32 = COPY killed $sgpr17
+ %29:sgpr_32 = COPY killed $sgpr16
+ %30:sgpr_32 = COPY killed $sgpr3
+ %31:sgpr_32 = COPY killed $sgpr2
+ %32:sgpr_32 = COPY killed $sgpr1
+ %33:sgpr_32 = COPY killed $sgpr0
+ %34:vgpr_32 = COPY killed %17, implicit $exec
+ %35:vgpr_32 = COPY killed %16, implicit $exec
+ %36:vgpr_32 = COPY killed %15, implicit $exec
+ %37:vgpr_32 = COPY killed %14, implicit $exec
+ %38:vgpr_32 = COPY killed %13, implicit $exec
+ %39:vgpr_32 = COPY killed %12, implicit $exec
+ %40:vgpr_32 = COPY killed %11, implicit $exec
+ %41:vgpr_32 = COPY killed %10, implicit $exec
+ %42:vgpr_32 = COPY killed %9, implicit $exec
+ %43:vgpr_32 = COPY killed %8, implicit $exec
+ undef %44.sub0:vreg_256_align2 = COPY killed %25
+ %44.sub1:vreg_256_align2 = COPY killed %24
+ %44.sub2:vreg_256_align2 = COPY killed %23
+ %44.sub3:vreg_256_align2 = COPY killed %22
+ %44.sub4:vreg_256_align2 = COPY killed %21
+ %44.sub5:vreg_256_align2 = COPY killed %20
+ %44.sub6:vreg_256_align2 = COPY killed %19
+ %44.sub7:vreg_256_align2 = COPY killed ...
[truncated]
|
You can test this locally with the following command:
git-clang-format --diff adae90ee35ca6f60e99189dea95cb719b4969860 34a866b294a5e9a7a0fefae142468b5bc0caa19c --extensions cpp -- llvm/lib/CodeGen/RegisterCoalescer.cpp
View the diff from clang-format here.
diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp
index d6be8a3ea9..5261913a39 100644
--- a/llvm/lib/CodeGen/RegisterCoalescer.cpp
+++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -461,16 +461,17 @@ getCommonSubClassWithSubReg(const TargetRegisterInfo &TRI,
const TargetRegisterClass *DstRC,
const TargetRegisterClass *SrcRC, unsigned DstSub,
const MachineFunction &MF) {
- const TargetRegisterClass *ReferenceRC = TRI.getMatchingSuperRegClass(DstRC, SrcRC, DstSub);
+ const TargetRegisterClass *ReferenceRC =
+ TRI.getMatchingSuperRegClass(DstRC, SrcRC, DstSub);
if (ReferenceRC)
return ReferenceRC;
-
const TargetRegisterClass *NewSuperDstRC = nullptr;
for (auto SuperDstRCID : DstRC->superclasses()) {
const TargetRegisterClass *SuperDstRC = TRI.getRegClass(SuperDstRCID);
- const TargetRegisterClass *WithSubRC = TRI.getSubClassWithSubReg(SuperDstRC, DstSub);
+ const TargetRegisterClass *WithSubRC =
+ TRI.getSubClassWithSubReg(SuperDstRC, DstSub);
if (WithSubRC)
return TRI.getMatchingSuperRegClass(WithSubRC, SrcRC, DstSub);
}
@@ -479,8 +480,10 @@ getCommonSubClassWithSubReg(const TargetRegisterInfo &TRI,
// the source register based on the use constraints, or even introducing a
// copy.
-
- LLVM_DEBUG(dbgs() << "NewSuperDstRC = " << (NewSuperDstRC ? TRI.getRegClassName(NewSuperDstRC) : "<null>") << '\n');
+ LLVM_DEBUG(
+ dbgs() << "NewSuperDstRC = "
+ << (NewSuperDstRC ? TRI.getRegClassName(NewSuperDstRC) : "<null>")
+ << '\n');
if (!NewSuperDstRC)
return nullptr;
@@ -489,8 +492,8 @@ getCommonSubClassWithSubReg(const TargetRegisterInfo &TRI,
return TRI.getMatchingSuperRegClass(NewSuperDstRC, SrcRC, DstSub);
- //const TargetRegisterClass *ReferenceRC = TRI.getMatchingSuperRegClass(DstRC, SrcRC, DstSub);
-
+ // const TargetRegisterClass *ReferenceRC =
+ // TRI.getMatchingSuperRegClass(DstRC, SrcRC, DstSub);
const TargetRegisterClass *LargerDstRC =
TRI.getLargestLegalSuperClass(DstRC, MF);
@@ -499,7 +502,9 @@ getCommonSubClassWithSubReg(const TargetRegisterInfo &TRI,
TRI.getMatchingSuperRegClass(DstRC, LargerSrcRC, DstSub);
LLVM_DEBUG(
- dbgs() << "ReferenceRC: " << (ReferenceRC ? TRI.getRegClassName(ReferenceRC) : "<null>") << '\n';
+ dbgs() << "ReferenceRC: "
+ << (ReferenceRC ? TRI.getRegClassName(ReferenceRC) : "<null>")
+ << '\n';
dbgs() << "LargerDstRC: " << TRI.getRegClassName(LargerDstRC) << '\n';
dbgs() << "LargerSrcRC: " << TRI.getRegClassName(LargerSrcRC) << '\n';);
@@ -508,10 +513,8 @@ getCommonSubClassWithSubReg(const TargetRegisterInfo &TRI,
{
unsigned PreA, PreB;
- const TargetRegisterClass *CommSup = TRI.getCommonSuperRegClass(SrcRC, DstSub,
- DstRC, DstSub,
- PreA, PreB);
-
+ const TargetRegisterClass *CommSup =
+ TRI.getCommonSuperRegClass(SrcRC, DstSub, DstRC, DstSub, PreA, PreB);
}
if (TRI.getSubClassWithSubReg(NewSuperRC, DstSub) == NewSuperRC) {
@@ -521,30 +524,29 @@ getCommonSubClassWithSubReg(const TargetRegisterInfo &TRI,
if (false && TRI.getSubRegisterClass(NewSuperRC, DstSub) != SrcRC)
return nullptr;
- const TargetRegisterClass *NewDstSubRC = TRI.getSubClassWithSubReg(NewSuperRC, DstSub);
-
- const TargetRegisterClass *NewSubRC = TRI.getSubRegisterClass(NewSuperRC, DstSub);
+ const TargetRegisterClass *NewDstSubRC =
+ TRI.getSubClassWithSubReg(NewSuperRC, DstSub);
+ const TargetRegisterClass *NewSubRC =
+ TRI.getSubRegisterClass(NewSuperRC, DstSub);
-// if (TRI.getCommonSubClass(NewSubRC, SrcRC) == SrcRC)
-// return nullptr;
+ // if (TRI.getCommonSubClass(NewSubRC, SrcRC) == SrcRC)
+ // return nullptr;
return TRI.getMatchingSuperRegClass(NewSuperRC, NewSubRC, DstSub);
- //if (NewSubRC->hasSubClassEq(SrcRC))
- //return nullptr;
+ // if (NewSubRC->hasSubClassEq(SrcRC))
+ // return nullptr;
+ // if (!NewDstSubRC || !NewDstSubRC->hasSubClassEq(SrcRC))
+ /// return nullptr;
-
- //if (!NewDstSubRC || !NewDstSubRC->hasSubClassEq(SrcRC))
- ///return nullptr;
-
- //return TRI.getCommonSubClass(DstRC, NewDstSubRC);
+ // return TRI.getCommonSubClass(DstRC, NewDstSubRC);
return NewDstSubRC;
- //const TargetRegisterClass *Rematch =
- //TRI.getMatchingSuperRegClass(NewSuperRC, SrcRC, DstSub);
- //return Rematch;
+ // const TargetRegisterClass *Rematch =
+ // TRI.getMatchingSuperRegClass(NewSuperRC, SrcRC, DstSub);
+ // return Rematch;
}
return nullptr;
@@ -610,7 +612,8 @@ bool CoalescerPair::setRegisters(const MachineInstr *MI) {
// SrcReg will be merged with a sub-register of DstReg.
SrcIdx = DstSub;
- const TargetRegisterClass *OldRC = TRI.getMatchingSuperRegClass(DstRC, SrcRC, DstSub);
+ const TargetRegisterClass *OldRC =
+ TRI.getMatchingSuperRegClass(DstRC, SrcRC, DstSub);
// TODO: Probably should be more aggressive than one use. This is mostly
// to avoid regressing some lane broadcast patterns.
@@ -630,7 +633,6 @@ bool CoalescerPair::setRegisters(const MachineInstr *MI) {
<< (NewRC ? TRI.getRegClassName(NewRC) : "<null>")
<< '\n';);
-
/*
const TargetRegisterClass *SRC =
TRI.getSubClassWithSubReg(NewRC, DstSub);
@@ -639,7 +641,6 @@ bool CoalescerPair::setRegisters(const MachineInstr *MI) {
assert(SRC->hasSubClassEq(SrcRC));
*/
-
} else if (SrcSub) {
// DstReg will be merged with a sub-register of SrcReg.
DstIdx = SrcSub;
@@ -2330,19 +2331,18 @@ bool RegisterCoalescer::joinCopy(
return false;
}
-
// Coalescing to a virtual register that is of a sub-register class of the
// other. Make sure the resulting register is set to the right register class.
if (CP.isCrossClass()) {
if (false && !MRI->constrainRegClass(CP.getDstReg(), CP.getNewRC())) {
- //dbgs() <
+ // dbgs() <
}
MRI->setRegClass(CP.getDstReg(), CP.getNewRC());
- //InflateRegs.push_back(CP.getSrcReg());
- //InflateRegs.push_back(CP.getDstReg());
+ // InflateRegs.push_back(CP.getSrcReg());
+ // InflateRegs.push_back(CP.getDstReg());
++numCrossRCs;
}
|
RegisterCoalescer: Expand source register class to coalesce subreg inserts
This fixes worse coalescing on AMDGPU in situations where VGPRs are copied into AGPRs. E.g.
%0:vgpr_32 = COPY $vgpr0
%1:vgpr_32 = COPY $vgpr1
undef %2.sub0:areg_64 = COPY %0
%2.sub1:areg_64 = COPY %1
4aa5ce7
to
34a866b
Compare
Thanks for posting this. To be clear: you're not actively working on this, so I can inherit it for my related work in #130870? Or did you have plans for it? |
No, I've put this aside. Feel free to pick it up |
RegisterCoalescer: Expand source register class to coalesce subreg inserts
This fixes worse coalescing on AMDGPU in situations where VGPRs are copied
into AGPRs. E.g.
%0:vgpr_32 = COPY $vgpr0
%1:vgpr_32 = COPY $vgpr1
undef %2.sub0:areg_64 = COPY %0
%2.sub1:areg_64 = COPY %1