Skip to content

Commit 42bc4f6

Browse files
committed
Reland "[X86] X86LowerTileCopy: Find dead register to use to prevent save-reload of tile register (#83628)"
Fixes compile time regression in previous commit.
1 parent da213d7 commit 42bc4f6

File tree

2 files changed

+54
-29
lines changed

2 files changed

+54
-29
lines changed

llvm/lib/Target/X86/X86LowerTileCopy.cpp

Lines changed: 54 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include "X86InstrBuilder.h"
2121
#include "X86InstrInfo.h"
2222
#include "X86Subtarget.h"
23+
#include "llvm/CodeGen/LiveRegUnits.h"
2324
#include "llvm/CodeGen/MachineBasicBlock.h"
2425
#include "llvm/CodeGen/MachineFrameInfo.h"
2526
#include "llvm/CodeGen/MachineFunction.h"
@@ -72,10 +73,28 @@ FunctionPass *llvm::createX86LowerTileCopyPass() {
7273
bool X86LowerTileCopy::runOnMachineFunction(MachineFunction &MF) {
7374
const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
7475
const X86InstrInfo *TII = ST.getInstrInfo();
76+
const TargetRegisterInfo *TRI = ST.getRegisterInfo();
77+
BitVector GR64Regs =
78+
TRI->getAllocatableSet(MF, TRI->getRegClass(X86::GR64RegClassID));
79+
BitVector TILERegs =
80+
TRI->getAllocatableSet(MF, TRI->getRegClass(X86::TILERegClassID));
7581
bool Changed = false;
7682

7783
for (MachineBasicBlock &MBB : MF) {
78-
for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
84+
// There won't be a tile copy if no tile register is live in.
85+
bool HasTileCopy = false;
86+
for (const auto &LI : MBB.liveins()) {
87+
if (TILERegs.test(LI.PhysReg)) {
88+
HasTileCopy = true;
89+
break;
90+
}
91+
}
92+
if (!HasTileCopy)
93+
continue;
94+
LiveRegUnits UsedRegs(*TRI);
95+
UsedRegs.addLiveOuts(MBB);
96+
for (MachineInstr &MI : llvm::make_early_inc_range(reverse(MBB))) {
97+
UsedRegs.stepBackward(MI);
7998
if (!MI.isCopy())
8099
continue;
81100
MachineOperand &DstMO = MI.getOperand(0);
@@ -85,27 +104,41 @@ bool X86LowerTileCopy::runOnMachineFunction(MachineFunction &MF) {
85104
if (!X86::TILERegClass.contains(DstReg, SrcReg))
86105
continue;
87106

88-
const TargetRegisterInfo *TRI = ST.getRegisterInfo();
89107
// Allocate stack slot for tile register
90108
unsigned Size = TRI->getSpillSize(X86::TILERegClass);
91109
Align Alignment = TRI->getSpillAlign(X86::TILERegClass);
92110
int TileSS = MF.getFrameInfo().CreateSpillStackObject(Size, Alignment);
93-
// Allocate stack slot for stride register
94-
Size = TRI->getSpillSize(X86::GR64RegClass);
95-
Alignment = TRI->getSpillAlign(X86::GR64RegClass);
96-
int StrideSS = MF.getFrameInfo().CreateSpillStackObject(Size, Alignment);
97111

98-
// TODO: Pick a killed register to avoid save/reload. There is a problem
99-
// to get live interval in this stage.
100-
Register GR64Cand = X86::RAX;
112+
int StrideSS = 0;
113+
114+
// Pick a killed register to avoid a save/reload.
115+
Register GR64Cand = X86::NoRegister;
116+
for (auto RegT : GR64Regs.set_bits()) {
117+
if (UsedRegs.available(RegT)) {
118+
GR64Cand = RegT;
119+
break;
120+
}
121+
}
101122

102123
const DebugLoc &DL = MI.getDebugLoc();
103-
// mov %rax (%sp)
104-
BuildMI(MBB, MI, DL, TII->get(X86::IMPLICIT_DEF), GR64Cand);
105-
addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOV64mr)), StrideSS)
106-
.addReg(GR64Cand);
107-
// mov 64 %rax
108-
BuildMI(MBB, MI, DL, TII->get(X86::MOV64ri), GR64Cand).addImm(64);
124+
if (GR64Cand) {
125+
// mov 64 %reg
126+
BuildMI(MBB, MI, DL, TII->get(X86::MOV64ri), GR64Cand).addImm(64);
127+
} else {
128+
// No available register? Save RAX and reload it after use.
129+
130+
// Allocate stack slot for stride register
131+
Size = TRI->getSpillSize(X86::GR64RegClass);
132+
Alignment = TRI->getSpillAlign(X86::GR64RegClass);
133+
StrideSS = MF.getFrameInfo().CreateSpillStackObject(Size, Alignment);
134+
135+
// mov %reg (%sp)
136+
addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOV64mr)),
137+
StrideSS)
138+
.addReg(X86::RAX);
139+
// mov 64 %reg
140+
BuildMI(MBB, MI, DL, TII->get(X86::MOV64ri), X86::RAX).addImm(64);
141+
}
109142
// tilestored %tmm, (%sp, %idx)
110143
#define GET_EGPR_IF_ENABLED(OPC) (ST.hasEGPR() ? OPC##_EVEX : OPC)
111144
unsigned Opc = GET_EGPR_IF_ENABLED(X86::TILESTORED);
@@ -120,10 +153,12 @@ bool X86LowerTileCopy::runOnMachineFunction(MachineFunction &MF) {
120153
#undef GET_EGPR_IF_ENABLED
121154
NewMI = addFrameReference(BuildMI(MBB, MI, DL, TII->get(Opc), DstReg),
122155
TileSS);
123-
// restore %rax
124-
// mov (%sp) %rax
125-
addFrameReference(BuildMI(MBB, MI, DL, TII->get(X86::MOV64rm), GR64Cand),
126-
StrideSS);
156+
if (!GR64Cand) {
157+
// restore %rax
158+
// mov (%sp) %rax
159+
addFrameReference(
160+
BuildMI(MBB, MI, DL, TII->get(X86::MOV64rm), GR64Cand), StrideSS);
161+
}
127162
MI.eraseFromParent();
128163
Changed = true;
129164
}

llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -44,12 +44,8 @@ define dso_local void @test1(ptr%buf) nounwind {
4444
; CHECK-NEXT: tileloadd 3024(%rsp,%rax), %tmm3 # 1024-byte Folded Reload
4545
; CHECK-NEXT: tileloadd (%rbx,%r15), %tmm0
4646
; CHECK-NEXT: tileloadd (%rbx,%r15), %tmm1
47-
; CHECK-NEXT: # implicit-def: $rax
48-
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
49-
; CHECK-NEXT: movabsq $64, %rax
5047
; CHECK-NEXT: tilestored %tmm3, 1024(%rsp,%rax) # 1024-byte Folded Spill
5148
; CHECK-NEXT: tileloadd {{[-0-9]+}}(%r{{[sb]}}p), %tmm2 # 1024-byte Folded Reload
52-
; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
5349
; CHECK-NEXT: tdpbssd %tmm1, %tmm0, %tmm2
5450
; CHECK-NEXT: tilestored %tmm2, (%rbx,%r15)
5551
; CHECK-NEXT: incl %r14d
@@ -111,16 +107,10 @@ define dso_local void @test1(ptr%buf) nounwind {
111107
; EGPR-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x9c,0x04,0xd0,0x0b,0x00,0x00]
112108
; EGPR-NEXT: tileloadd (%rbx,%r15), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7b,0x4b,0x04,0x3b]
113109
; EGPR-NEXT: tileloadd (%rbx,%r15), %tmm1 # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7b,0x4b,0x0c,0x3b]
114-
; EGPR-NEXT: # implicit-def: $rax
115-
; EGPR-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
116-
; EGPR-NEXT: # encoding: [0x48,0x89,0x84,0x24,0xb8,0x03,0x00,0x00]
117-
; EGPR-NEXT: movabsq $64, %rax # encoding: [0x48,0xb8,0x40,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
118110
; EGPR-NEXT: tilestored %tmm3, 1024(%rsp,%rax) # 1024-byte Folded Spill
119111
; EGPR-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x9c,0x04,0x00,0x04,0x00,0x00]
120112
; EGPR-NEXT: tileloadd {{[-0-9]+}}(%r{{[sb]}}p), %tmm2 # 1024-byte Folded Reload
121113
; EGPR-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x94,0x24,0x00,0x04,0x00,0x00]
122-
; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
123-
; EGPR-NEXT: # encoding: [0x48,0x8b,0x84,0x24,0xb8,0x03,0x00,0x00]
124114
; EGPR-NEXT: tdpbssd %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x73,0x5e,0xd0]
125115
; EGPR-NEXT: tilestored %tmm2, (%rbx,%r15) # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7a,0x4b,0x14,0x3b]
126116
; EGPR-NEXT: incl %r14d # encoding: [0x41,0xff,0xc6]

0 commit comments

Comments
 (0)