[RegisterCoalescer]: Try inflated RC for coalescing #130870
@llvm/pr-subscribers-llvm-regalloc @llvm/pr-subscribers-backend-powerpc @llvm/pr-subscribers-backend-amdgpu

Author: Jeffrey Byrnes (jrbyrnes)

Changes

Currently, we do register inflation for coalesced copies at the end of the pass. However, there is a benefit to doing inflation while coalescing, as inflation may allow for a common RC. This PR does inflation while coalescing if we are unable to find a common RC.

This caused some test churn in:

test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll

which unfortunately doesn't render well. After going through the diffs of both these test files, I didn't see any regressions, and saw some slight improvements. Most of the test churn is due to deviation between GISEL and SDAG: the order of v_accvgpr_write in GISEL was reversed.

For the motivating case see: test/CodeGen/AMDGPU/regcoalesce-avgpr.ll

Patch is 938.02 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/130870.diff

19 Files Affected:
diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp
index 74606e66d4e4b..1a99da6e0c153 100644
--- a/llvm/lib/CodeGen/RegisterCoalescer.cpp
+++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -477,8 +477,9 @@ bool CoalescerPair::setRegisters(const MachineInstr *MI) {
Flipped = true;
}
- const MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
- const TargetRegisterClass *SrcRC = MRI.getRegClass(Src);
+ MachineRegisterInfo *MRI =
+ const_cast<MachineRegisterInfo *>(&MI->getMF()->getRegInfo());
+ const TargetRegisterClass *SrcRC = MRI->getRegClass(Src);
if (Dst.isPhysical()) {
// Eliminate DstSub on a physreg.
@@ -499,7 +500,14 @@ bool CoalescerPair::setRegisters(const MachineInstr *MI) {
}
} else {
// Both registers are virtual.
- const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
+ const TargetRegisterClass *DstRC = MRI->getRegClass(Dst);
+
+ auto recomputeRegClasses = [&MRI](Register &Src, Register &Dst) {
+ bool Success = false;
+ Success = MRI->recomputeRegClass(Src);
+ Success |= MRI->recomputeRegClass(Dst);
+ return Success;
+ };
// Both registers have subreg indices.
if (SrcSub && DstSub) {
@@ -509,19 +517,42 @@ bool CoalescerPair::setRegisters(const MachineInstr *MI) {
NewRC = TRI.getCommonSuperRegClass(SrcRC, SrcSub, DstRC, DstSub, SrcIdx,
DstIdx);
- if (!NewRC)
- return false;
+ if (!NewRC) {
+ if (recomputeRegClasses(Src, Dst)) {
+ SrcRC = MRI->getRegClass(Src);
+ DstRC = MRI->getRegClass(Dst);
+ NewRC = TRI.getCommonSuperRegClass(SrcRC, SrcSub, DstRC, DstSub,
+ SrcIdx, DstIdx);
+ }
+ if (!NewRC)
+ return false;
+ }
} else if (DstSub) {
// SrcReg will be merged with a sub-register of DstReg.
SrcIdx = DstSub;
NewRC = TRI.getMatchingSuperRegClass(DstRC, SrcRC, DstSub);
+ if (!NewRC && recomputeRegClasses(Src, Dst)) {
+ SrcRC = MRI->getRegClass(Src);
+ DstRC = MRI->getRegClass(Dst);
+ NewRC = TRI.getMatchingSuperRegClass(SrcRC, DstRC, DstSub);
+ }
} else if (SrcSub) {
// DstReg will be merged with a sub-register of SrcReg.
DstIdx = SrcSub;
NewRC = TRI.getMatchingSuperRegClass(SrcRC, DstRC, SrcSub);
+ if (!NewRC && recomputeRegClasses(Src, Dst)) {
+ SrcRC = MRI->getRegClass(Src);
+ DstRC = MRI->getRegClass(Dst);
+ NewRC = TRI.getMatchingSuperRegClass(SrcRC, DstRC, SrcSub);
+ }
} else {
// This is a straight copy without sub-registers.
NewRC = TRI.getCommonSubClass(DstRC, SrcRC);
+ if (!NewRC && recomputeRegClasses(Src, Dst)) {
+ SrcRC = MRI->getRegClass(Src);
+ DstRC = MRI->getRegClass(Dst);
+ NewRC = TRI.getCommonSubClass(DstRC, SrcRC);
+ }
}
// The combined constraint may be impossible to satisfy.
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-csr.ll b/llvm/test/CodeGen/AMDGPU/agpr-csr.ll
index 73ccab64a1925..74e72d75cb63a 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-csr.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-csr.ll
@@ -1,96 +1,244 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX90A %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX908 %s
-; GCN-LABEL: {{^}}func_empty:
-; GCN-NOT: buffer_
-; GCN-NOT: v_accvgpr
-; GCN: s_setpc_b64
+
define void @func_empty() #0 {
+; GCN-LABEL: func_empty:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
ret void
}
-; GCN-LABEL: {{^}}func_areg_4:
-; GCN-NOT: buffer_
-; GCN-NOT: v_accvgpr
-; GCN: use agpr3
-; GCN-NOT: buffer_
-; GCN-NOT: v_accvgpr
-; GCN: s_setpc_b64
define void @func_areg_4() #0 {
+; GCN-LABEL: func_areg_4:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ; use agpr3
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: s_setpc_b64 s[30:31]
call void asm sideeffect "; use agpr3", "~{a3}" ()
ret void
}
-; GCN-LABEL: {{^}}func_areg_32:
-; GCN-NOT: buffer_
-; GCN-NOT: v_accvgpr
-; GCN: use agpr31
-; GCN-NOT: buffer_
-; GCN-NOT: v_accvgpr
-; GCN: s_setpc_b64
define void @func_areg_32() #0 {
+; GCN-LABEL: func_areg_32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ; use agpr31
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: s_setpc_b64 s[30:31]
call void asm sideeffect "; use agpr31", "~{a31}" ()
ret void
}
-; GCN-LABEL: {{^}}func_areg_33:
-; GCN-NOT: a32
-; GFX90A: v_accvgpr_read_b32 v0, a32 ; Reload Reuse
-; GCN-NOT: a32
-; GCN: use agpr32
-; GCN-NOT: a32
-; GFX90A: v_accvgpr_write_b32 a32, v0 ; Reload Reuse
-; GCN-NOT: a32
-; GCN: s_setpc_b64
define void @func_areg_33() #0 {
+; GFX90A-LABEL: func_areg_33:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_accvgpr_read_b32 v0, a32 ; Reload Reuse
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use agpr32
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_accvgpr_write_b32 a32, v0 ; Reload Reuse
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: func_areg_33:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ; use agpr32
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: s_setpc_b64 s[30:31]
call void asm sideeffect "; use agpr32", "~{a32}" ()
ret void
}
-; GCN-LABEL: {{^}}func_areg_64:
-; GFX908-NOT: buffer_
-; GCN-NOT: v_accvgpr
-; GFX90A: v_accvgpr_read_b32 v0, a63 ; Reload Reuse
-; GCN: use agpr63
-; GFX90A: v_accvgpr_write_b32 a63, v0 ; Reload Reuse
-; GCN-NOT: v_accvgpr
-; GCN: s_setpc_b64
define void @func_areg_64() #0 {
+; GFX90A-LABEL: func_areg_64:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_accvgpr_read_b32 v0, a63 ; Reload Reuse
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use agpr63
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_accvgpr_write_b32 a63, v0 ; Reload Reuse
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: func_areg_64:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ; use agpr63
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: s_setpc_b64 s[30:31]
call void asm sideeffect "; use agpr63", "~{a63}" ()
ret void
}
-; GCN-LABEL: {{^}}func_areg_31_63:
-; GFX908-NOT: buffer_
-; GFX908-NOT: v_accvgpr
-; GFX908-NOT: buffer
-; GFX90A: v_accvgpr_read_b32 v0, a63 ; Reload Reuse
-; GCN: use agpr31, agpr63
-; GFX90A: v_accvgpr_write_b32 a63, v0 ; Reload Reuse
-; GFX908-NOT: v_accvgpr
-; GFX908-NOT: buffer
-; GCN: s_setpc_b64
define void @func_areg_31_63() #0 {
+; GFX90A-LABEL: func_areg_31_63:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_accvgpr_read_b32 v0, a63 ; Reload Reuse
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use agpr31, agpr63
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_accvgpr_write_b32 a63, v0 ; Reload Reuse
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: func_areg_31_63:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ; use agpr31, agpr63
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: s_setpc_b64 s[30:31]
call void asm sideeffect "; use agpr31, agpr63", "~{a31},~{a63}" ()
ret void
}
declare void @func_unknown() #0
-; GCN-LABEL: {{^}}test_call_empty:
-; GCN-NOT: buffer_
-; GCN-NOT: v_accvgpr
-; GCN: def a[0:31]
-; GFX908-COUNT-8: v_accvgpr_read_b32
-; GFX90A-NOT: v_accvgpr
-; GCN-NOT: buffer_
-; GCN: s_swappc_b64
-; GCN-NOT: buffer_
-; GFX90A-NOT: v_accvgpr
-; GFX908-COUNT-8: global_store_dwordx4 v[{{[0-9:]+}}], v[{{[0-9:]+}}]
-; GFX90A-COUNT-8: global_store_dwordx4 v[{{[0-9:]+}}], a[{{[0-9:]+}}]
-; GCN: s_endpgm
define amdgpu_kernel void @test_call_empty() #0 {
+; GFX90A-LABEL: test_call_empty:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; GFX90A-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX90A-NEXT: s_mov_b32 s22, -1
+; GFX90A-NEXT: s_mov_b32 s23, 0xe00000
+; GFX90A-NEXT: s_add_u32 s20, s20, s11
+; GFX90A-NEXT: s_addc_u32 s21, s21, 0
+; GFX90A-NEXT: s_mov_b32 s12, s8
+; GFX90A-NEXT: s_add_u32 s8, s4, 36
+; GFX90A-NEXT: s_mov_b32 s13, s9
+; GFX90A-NEXT: s_addc_u32 s9, s5, 0
+; GFX90A-NEXT: s_getpc_b64 s[4:5]
+; GFX90A-NEXT: s_add_u32 s4, s4, func_empty@gotpcrel32@lo+4
+; GFX90A-NEXT: s_addc_u32 s5, s5, func_empty@gotpcrel32@hi+12
+; GFX90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX90A-NEXT: s_mov_b32 s14, s10
+; GFX90A-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX90A-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX90A-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX90A-NEXT: s_mov_b64 s[0:1], s[20:21]
+; GFX90A-NEXT: v_mov_b32_e32 v31, v0
+; GFX90A-NEXT: s_mov_b64 s[2:3], s[22:23]
+; GFX90A-NEXT: s_mov_b32 s32, 0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def a[0:31]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v[0:1], a[28:31], off
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: global_store_dwordx4 v[0:1], a[24:27], off
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: global_store_dwordx4 v[0:1], a[20:23], off
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: global_store_dwordx4 v[0:1], a[16:19], off
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: global_store_dwordx4 v[0:1], a[12:15], off
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: global_store_dwordx4 v[0:1], a[8:11], off
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: global_store_dwordx4 v[0:1], a[4:7], off
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: global_store_dwordx4 v[0:1], a[0:3], off
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_endpgm
+;
+; GFX908-LABEL: test_call_empty:
+; GFX908: ; %bb.0: ; %bb
+; GFX908-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; GFX908-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX908-NEXT: s_mov_b32 s22, -1
+; GFX908-NEXT: s_mov_b32 s23, 0xe00000
+; GFX908-NEXT: s_add_u32 s20, s20, s11
+; GFX908-NEXT: s_addc_u32 s21, s21, 0
+; GFX908-NEXT: s_mov_b32 s12, s8
+; GFX908-NEXT: s_add_u32 s8, s4, 36
+; GFX908-NEXT: s_mov_b32 s13, s9
+; GFX908-NEXT: s_addc_u32 s9, s5, 0
+; GFX908-NEXT: s_getpc_b64 s[4:5]
+; GFX908-NEXT: s_add_u32 s4, s4, func_empty@gotpcrel32@lo+4
+; GFX908-NEXT: s_addc_u32 s5, s5, func_empty@gotpcrel32@hi+12
+; GFX908-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX908-NEXT: s_mov_b32 s14, s10
+; GFX908-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX908-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX908-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX908-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX908-NEXT: s_mov_b64 s[0:1], s[20:21]
+; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX908-NEXT: s_mov_b64 s[2:3], s[22:23]
+; GFX908-NEXT: s_mov_b32 s32, 0
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ; def a[0:31]
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a31
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a23
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a19
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a11
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: s_endpgm
bb:
%reg = call <32 x float> asm sideeffect "; def $0", "=a"()
call void @func_empty()
@@ -98,21 +246,142 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_call_areg4:
-; GCN-NOT: buffer_
-; GCN-NOT: v_accvgpr
-; GFX908: def a[0:31]
-; GFX90A: def a[4:35]
-; GFX908-COUNT-8: v_accvgpr_read_b32
-; GFX90A-NOT: v_accvgpr
-; GCN-NOT: buffer_
-; GCN: s_swappc_b64
-; GCN-NOT: buffer_
-; GFX90A-NOT: v_accvgpr
-; GFX908-COUNT-8: global_store_dwordx4 v[{{[0-9:]+}}], v[{{[0-9:]+}}]
-; GFX90A-COUNT-8: global_store_dwordx4 v[{{[0-9:]+}}], a[{{[0-9:]+}}]
-; GCN: s_endpgm
define amdgpu_kernel void @test_call_areg4() #0 {
+; GFX90A-LABEL: test_call_areg4:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; GFX90A-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX90A-NEXT: s_mov_b32 s22, -1
+; GFX90A-NEXT: s_mov_b32 s23, 0xe00000
+; GFX90A-NEXT: s_add_u32 s20, s20, s11
+; GFX90A-NEXT: s_addc_u32 s21, s21, 0
+; GFX90A-NEXT: s_mov_b32 s12, s8
+; GFX90A-NEXT: s_add_u32 s8, s4, 36
+; GFX90A-NEXT: s_mov_b32 s13, s9
+; GFX90A-NEXT: s_addc_u32 s9, s5, 0
+; GFX90A-NEXT: s_getpc_b64 s[4:5]
+; GFX90A-NEXT: s_add_u32 s4, s4, func_areg_4@gotpcrel32@lo+4
+; GFX90A-NEXT: s_addc_u32 s5, s5, func_areg_4@gotpcrel32@hi+12
+; GFX90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX90A-NEXT: s_mov_b32 s14, s10
+; GFX90A-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX90A-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX90A-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX90A-NEXT: s_mov_b64 s[0:1], s[20:21]
+; GFX90A-NEXT: v_mov_b32_e32 v31, v0
+; GFX90A-NEXT: s_mov_b64 s[2:3], s[22:23]
+; GFX90A-NEXT: s_mov_b32 s32, 0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def a[4:35]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v[0:1], a[32:35], off
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: global_store_dwordx4 v[0:1], a[28:31], off
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: global_store_dwordx4 v[0:1], a[24:27], off
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: global_store_dwordx4 v[0:1], a[20:23], off
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: global_store_dwordx4 v[0:1], a[16:19], off
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: global_store_dwordx4 v[0:1], a[12:15], off
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: global_store_dwordx4 v[0:1], a[8:11], off
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: global_store_dwordx4 v[0:1], a[4:7], off
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_endpgm
+;
+; GFX908-LABEL: test_call_areg4:
+; GFX908: ; %bb.0: ; %bb
+; GFX908-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
+; GFX908-NEXT: s_mov_b32 s21, SCRATCH_RSRC_DWORD1
+; GFX908-NEXT: s_mov_b32 s22, -1
+; GFX908-NEXT: s_mov_b32 s23, 0xe00000
+; GFX908-NEXT: s_add_u32 s20, s20, s11
+; GFX908-NEXT: s_addc_u32 s21, s21, 0
+; GFX908-NEXT: s_mov_b32 s12, s8
+; GFX908-NEXT: s_add_u32 s8, s4, 36
+; GFX908-NEXT: s_mov_b32 s13, s9
+; GFX908-NEXT: s_addc_u32 s9, s5, 0
+; GFX908-NEXT: s_getpc_b64 s[4:5]
+; GFX908-NEXT: s_add_u32 s4, s4, func_areg_4@gotpcrel32@lo+4
+; GFX908-NEXT: s_addc_u32 s5, s5, func_areg_4@gotpcrel32@hi+12
+; GFX908-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX908-NEXT: s_mov_b32 s14, s10
+; GFX908-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX908-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX908-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX908-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX908-NEXT: s_mov_b64 s[0:1], s[20:21]
+; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX908-NEXT: s_mov_b64 s[2:3], s[22:23]
+; GFX908-NEXT: s_mov_b32 s32, 0
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ; def a[4:35]
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a32
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a33
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a34
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a35
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a31
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a23
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a19
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
+; GFX908...
[truncated]
; GREEDY942-NEXT: v_accvgpr_mov_b32 a0, a18
; GREEDY942-NEXT: v_accvgpr_mov_b32 a1, a19
; GREEDY942-NEXT: s_nop 1
; GREEDY942-NEXT: v_accvgpr_mov_b32 a2, a16
I looked at this in detail and I think it makes the most sense to live with it.
Register coalescing actually reduces the number of copies, and we end up needing fewer registers. However, before this PR, the extra copies caused a different register assignment, which enabled more copy propagation.
In order to resolve this, I think we would need RA heuristics that work to facilitate more copy propagation.
@@ -718,57 +750,65 @@ define amdgpu_kernel void @test_call_areg31_63() #0 {
; GFX908-NEXT: s_mov_b64 s[2:3], s[22:23]
; GFX908-NEXT: s_mov_b32 s32, 0
; GFX908-NEXT: ;;#ASMSTART
-; GFX908-NEXT: ; def a[0:31]
+; GFX908-NEXT: ; def a[64:95]
Still trying to understand what is going on here.
I was working on the same thing a few weeks ago, and I think we need more targeted handling. This is too aggressively trying to recompute the register classes. The reconstrain should happen somewhere in the caller, if and only if the coalescing is going to succeed. In terms of patch splitting, I also think we should take each of these cases one at a time. I think the subregister insert case is the most likely to be profitable piece, so I think we should start there.
However, in terms of the AGPR-or-VGPR to AV_* class case, it will soon only be relevant for gfx908. I'm actively working on avoiding using AGPRs until the actual allocation forces it. I was also wondering if instead we should just be using the AV_* classes for all values in the first place.
I'll try to polish my patch into something postable.
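For illustration, a minimal sketch of the caller-side shape being suggested here (hypothetical; joinCopyWithInflation is not a real function, and this is not the posted patch): leave setRegisters untouched, and retry with inflation only when the plain attempt fails.

// Hypothetical caller-side variant: try the cheap path first; only if no
// common class is found, inflate both virtual registers and retry once.
// recomputeRegClass only ever widens a vreg to a legal super-class, so the
// retry is harmless even when the second attempt also fails.
bool RegisterCoalescer::joinCopyWithInflation(MachineInstr *CopyMI,
                                              CoalescerPair &CP) {
  if (CP.setRegisters(CopyMI))
    return true;
  Register Dst = CopyMI->getOperand(0).getReg();
  Register Src = CopyMI->getOperand(1).getReg();
  bool Changed = false;
  if (Dst.isVirtual())
    Changed |= MRI->recomputeRegClass(Dst);
  if (Src.isVirtual())
    Changed |= MRI->recomputeRegClass(Src);
  return Changed && CP.setRegisters(CopyMI);
}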
; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908,GFX908_A %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A,GFX908_A,GFX942_A %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX942,GFX942_A %s
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 |
Switching tests to generated checks should be done separately
#130879 was what I had; it's full of junk and missing test updates.
I think if we were to switch to AV_*, we would need to coordinate a scheduler change as well (assuming the scheduler sees all these AV_*s and the switch isn't done afterwards), since the generic trackers don't handle AV_*s well. In general, the scheduler does a better job if it knows which registers will be assigned to AGPR vs. VGPR.
Force push f6fc64a to f523a93: rebase for precommit autogen checks.
Done
I looked into doing it one at a time, but various lit tests had regressions due to the partial implementation. If this patch is simply too large I will split it, but my concern is that by landing incrementally we may trigger a real regression and a revert.
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2
; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_96 = COPY [[COPY]]
; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY1]]
; CHECK-NEXT: [[COPY:%[0-9]+]].sub0:vreg_96 = COPY $vgpr0
I'm not sure why this wasn't picked up by update_mir_test_checks; I had to do it manually.
ping -- does this really need to be split up?
if (!NewRC) {
  auto SuperDstRC = getLargestLegalRegClass(Dst, MF, MRI);
  if (SuperDstRC != DstRC)
    NewRC = TRI.getMatchingSuperRegClass(SuperDstRC, SrcRC, DstSub);
}
I think we should handle each of these cases in a separate patch, starting with this subregister insert case
static const TargetRegisterClass *
getLargestLegalRegClass(Register Reg, const MachineFunction *MF,
                        const MachineRegisterInfo &MRI) {
This is mostly copy-paste from the MRI function; can you refactor it there to expose this?
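A rough sketch of what that refactor could look like (the helper name and placement are illustrative assumptions, not what the patch does): hoist the class-inflation computation out of MachineRegisterInfo::recomputeRegClass into a query that doesn't mutate the register, so both recomputeRegClass and the coalescer can share it.

// Hypothetical MachineRegisterInfo helper: compute the widest legal class
// Reg could be inflated to, given the constraints of all its non-debug
// uses, without changing the register. recomputeRegClass would then call
// this and commit the result via setRegClass.
const TargetRegisterClass *
MachineRegisterInfo::computeLargestLegalRegClass(Register Reg) const {
  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
  const TargetRegisterInfo *TRI = getTargetRegisterInfo();
  const TargetRegisterClass *OldRC = getRegClass(Reg);
  const TargetRegisterClass *NewRC =
      TRI->getLargestLegalSuperClass(OldRC, *MF);
  if (NewRC == OldRC)
    return OldRC; // No room to inflate.
  // Narrow NewRC by the operand-class constraint of every use of Reg.
  for (const MachineOperand &MO : reg_nodbg_operands(Reg)) {
    const MachineInstr *MI = MO.getParent();
    unsigned OpNo = &MO - &MI->getOperand(0);
    NewRC = MI->getRegClassConstraintEffect(OpNo, NewRC, TII, TRI);
    if (!NewRC)
      return OldRC; // Some use forbids any inflation.
  }
  return NewRC;
}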
Currently, we do register inflation for coalesced copies at the end of the pass. However, there is a benefit to doing inflation while coalescing, as inflation may allow for a common RC. This PR does inflation while coalescing if we are unable to find a common RC.
This caused some test churn in:
test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
Which unfortunately doesn't render well. After going through the diffs of both these tests files, I didn't see any regressions, and saw some slight improvements. Most of the test churn is due to deviation between GISEL and SDAG: the order of v_accvgpr_write in GISEL was reversed.
For the motivating case see:
test/CodeGen/AMDGPU/regcoalesce-avgpr.ll
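Distilled from the diff above, the heart of the change in CoalescerPair::setRegisters is this retry pattern, shown here for the straight-copy case (the patch applies the same pattern to the three sub-register cases as well):

// Straight copy without sub-registers: if no common subclass exists,
// inflate both vregs via recomputeRegClass and look for one again.
NewRC = TRI.getCommonSubClass(DstRC, SrcRC);
if (!NewRC && recomputeRegClasses(Src, Dst)) {
  SrcRC = MRI->getRegClass(Src);
  DstRC = MRI->getRegClass(Dst);
  NewRC = TRI.getCommonSubClass(DstRC, SrcRC);
}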