[X86][AMX] Checking AMXProgModel in X86LowerTileCopy #94358
Conversation
This fixes the compile time regression after llvm#93692.
@llvm/pr-subscribers-backend-x86
Author: Phoebe Wang (phoebewang)
Changes
This fixes the compile time regression after #93692.
Full diff: https://github.com/llvm/llvm-project/pull/94358.diff
5 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 0bf3294af92a8..3933e82b718f2 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -5120,6 +5120,9 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
case Intrinsic::x86_tileloaddt164_internal: {
if (!Subtarget->hasAMXTILE())
break;
+ auto *MFI =
+ CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
+ MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
unsigned Opc = IntNo == Intrinsic::x86_tileloadd64_internal
? X86::PTILELOADDV
: X86::PTILELOADDT1V;
@@ -5201,6 +5204,9 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
break;
}
case Intrinsic::x86_tilestored64_internal: {
+ auto *MFI =
+ CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
+ MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
unsigned Opc = X86::PTILESTOREDV;
// _tile_stored_internal(row, col, buf, STRIDE, c)
SDValue Base = Node->getOperand(4);
@@ -5228,6 +5234,9 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
case Intrinsic::x86_tilestored64: {
if (!Subtarget->hasAMXTILE())
break;
+ auto *MFI =
+ CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
+ MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
unsigned Opc;
switch (IntNo) {
default: llvm_unreachable("Unexpected intrinsic!");
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 0e377dd53b742..5aa8b015d9d95 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -26776,7 +26776,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
case Intrinsic::swift_async_context_addr: {
SDLoc dl(Op);
auto &MF = DAG.getMachineFunction();
- auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ auto *X86FI = MF.getInfo<X86MachineFunctionInfo>();
if (X86::isExtendedSwiftAsyncFrameSupported(Subtarget, MF)) {
MF.getFrameInfo().setFrameAddressIsTaken(true);
X86FI->setHasSwiftAsyncContext(true);
@@ -36781,7 +36781,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
}
case TargetOpcode::PREALLOCATED_SETUP: {
assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
- auto MFI = MF->getInfo<X86MachineFunctionInfo>();
+ auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
MFI->setHasPreallocatedCall(true);
int64_t PreallocatedId = MI.getOperand(0).getImm();
size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
@@ -36798,7 +36798,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
int64_t PreallocatedId = MI.getOperand(1).getImm();
int64_t ArgIdx = MI.getOperand(2).getImm();
- auto MFI = MF->getInfo<X86MachineFunctionInfo>();
+ auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
<< ", arg offset " << ArgOffset << "\n");
@@ -36841,6 +36841,13 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
unsigned Imm = MI.getOperand(0).getImm();
BuildMI(*BB, MI, MIMD, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
MI.eraseFromParent(); // The pseudo is gone now.
+ auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
+ MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
+ return BB;
+ }
+ case X86::PTILEZEROV: {
+ auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
+ MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
return BB;
}
case X86::PTILELOADD:
diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td
index c47bee070e04f..99deacc811a17 100644
--- a/llvm/lib/Target/X86/X86InstrAMX.td
+++ b/llvm/lib/Target/X86/X86InstrAMX.td
@@ -74,7 +74,7 @@ let SchedRW = [WriteSystem] in {
GR16:$src2, opaquemem:$src3,
TILE:$src4), []>;
let isPseudo = true, isReMaterializable = 1, isAsCheapAsAMove = 1,
- canFoldAsLoad = 1 in
+ canFoldAsLoad = 1, usesCustomInserter = 1 in
def PTILEZEROV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, GR16:$src2),
[(set TILE:$dst, (int_x86_tilezero_internal
GR16:$src1, GR16:$src2))]>;
diff --git a/llvm/lib/Target/X86/X86LowerTileCopy.cpp b/llvm/lib/Target/X86/X86LowerTileCopy.cpp
index f27676a27e86c..613722b398f44 100644
--- a/llvm/lib/Target/X86/X86LowerTileCopy.cpp
+++ b/llvm/lib/Target/X86/X86LowerTileCopy.cpp
@@ -19,6 +19,7 @@
#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86InstrInfo.h"
+#include "X86MachineFunctionInfo.h"
#include "X86Subtarget.h"
#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
@@ -71,6 +72,10 @@ FunctionPass *llvm::createX86LowerTileCopyPass() {
}
bool X86LowerTileCopy::runOnMachineFunction(MachineFunction &MF) {
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+ if (FuncInfo->getAMXProgModel() != AMXProgModelEnum::ManagedRA)
+ return false;
+
const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
const X86InstrInfo *TII = ST.getInstrInfo();
const TargetRegisterInfo *TRI = ST.getRegisterInfo();
@@ -81,26 +86,8 @@ bool X86LowerTileCopy::runOnMachineFunction(MachineFunction &MF) {
bool Changed = false;
for (MachineBasicBlock &MBB : MF) {
- // There won't be a tile copy if neither tile register live in nor live out.
- bool HasTileCopy = false;
- for (const auto &LI : MBB.liveins()) {
- if (TILERegs.test(LI.PhysReg)) {
- HasTileCopy = true;
- break;
- }
- }
LiveRegUnits UsedRegs(*TRI);
UsedRegs.addLiveOuts(MBB);
- if (!HasTileCopy) {
- for (auto RegT : TILERegs.set_bits()) {
- if (UsedRegs.available(RegT)) {
- HasTileCopy = true;
- break;
- }
- }
- }
- if (!HasTileCopy)
- continue;
for (MachineInstr &MI : llvm::make_early_inc_range(reverse(MBB))) {
UsedRegs.stepBackward(MI);
if (!MI.isCopy())
diff --git a/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/llvm/lib/Target/X86/X86MachineFunctionInfo.h
index f6e853270e073..8aaa49945f9d4 100644
--- a/llvm/lib/Target/X86/X86MachineFunctionInfo.h
+++ b/llvm/lib/Target/X86/X86MachineFunctionInfo.h
@@ -21,6 +21,8 @@
namespace llvm {
+enum AMXProgModelEnum { None = 0, DirectReg = 1, ManagedRA = 2 };
+
/// X86MachineFunctionInfo - This class is derived from MachineFunction and
/// contains private X86 target-specific information for each MachineFunction.
class X86MachineFunctionInfo : public MachineFunctionInfo {
@@ -96,6 +98,9 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
/// used to address arguments in a function using a base pointer.
int SEHFramePtrSaveIndex = 0;
+ /// The AMX programing model used in the function.
+ AMXProgModelEnum AMXProgModel = AMXProgModelEnum::None;
+
/// True if this function has a subset of CSRs that is handled explicitly via
/// copies.
bool IsSplitCSR = false;
@@ -219,6 +224,13 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
int getSEHFramePtrSaveIndex() const { return SEHFramePtrSaveIndex; }
void setSEHFramePtrSaveIndex(int Index) { SEHFramePtrSaveIndex = Index; }
+ AMXProgModelEnum getAMXProgModel() const { return AMXProgModel; }
+ void setAMXProgModel(AMXProgModelEnum Model) {
+ assert((AMXProgModel == AMXProgModelEnum::None || AMXProgModel == Model) &&
+ "mixed model is not supported");
+ AMXProgModel = Model;
+ }
+
SmallVectorImpl<ForwardedRegister> &getForwardedMustTailRegParms() {
return ForwardedMustTailRegParms;
}
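For readers who, like the reviewer below, couldn't find the two modes documented: roughly, DirectReg covers code that names tile registers itself through the non-internal intrinsics, while ManagedRA covers the compiler-managed `__tile1024i` API, whose internal intrinsics lower to the register-allocated `*V` pseudos that X86LowerTileCopy operates on. A minimal sketch of the two styles, assuming the Clang AMX headers and `-mamx-tile` (the example functions are illustrative, not from the patch):

```c++
#include <immintrin.h>

char buf[1024];

// DirectReg: the programmer names physical tile registers (tmm0..tmm7)
// directly; these lower to TILEZERO/PTILESTORED-style instructions.
void direct_reg() {
  _tile_zero(0);            // tmm0 is chosen by the programmer
  _tile_stored(0, buf, 64); // store tmm0 with a 64-byte stride
}

// ManagedRA: tiles are values of type __tile1024i; the compiler picks the
// tile registers, so these lower to the *V pseudos (PTILEZEROV, ...).
void managed_ra() {
  __tile1024i t = {16, 64}; // 16 rows x 64 bytes per row
  __tile_zero(&t);
  __tile_stored(buf, 64, t);
}
```

Mixing the two styles in one function would now trip the new "mixed model is not supported" assert in setAMXProgModel.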
This makes sense to me conceptually, but I don't know enough about AMX to properly review this :)
I did my best to look at this, but I don't feel really qualified to review AMX. I like the idea (very similar to what I had in mind) and this generally looks good, but I have a few questions/points where I'm unsure.
- Maybe add a short comment explaining the two AMX modes? If there's some documentation elsewhere, I couldn't find it.
- For managed RA, the only sources of AMX values are zero, load and cast intrinsics. Whenever a load/zero(/store) occurs, the mode is set accordingly. Looks good. What about casts from vectors?
- For unmanaged RA, all (?) (non-internal) intrinsics should set the prog model. So there's zero, load, store, and the actual operations. Zero, load, store are handled, ok. But shouldn't any occurrence of the compute intrinsics also set the prog model? (Can they theoretically occur alone?)
Thanks @aengelke for looking at this! These are all good questions!
This has been tech debt for a long time, and since the previous authors are no longer working on this, we may not be able to complete it in a short time.
Some documentation is scattered across the Intel Intrinsics Guide, the Clang doxygen, technical papers, etc. But that is not closely related to this patch, and I haven't found a good place for it.
AMX casts are handled by X86LowerAMXType before ISel; they have all been transformed into loads/stores by this phase.
Yes, it could happen in theory, but using them without zero, load, or store can be seen as an invalid use scenario. We can save verbose …
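To illustrate that last answer, here is a hypothetical compute-only function (assuming the Clang AMX headers and `-mamx-int8`): the tiles it reads are never zeroed or loaded, so it is not a meaningful AMX program, and skipping the prog-model update for it is harmless.

```c++
#include <immintrin.h>

// A compute intrinsic with no preceding _tile_zero/_tile_loadd: tmm1/tmm2
// are read uninitialized, so this is the "invalid use scenario" described
// above; only zero/load/store need to set the program model.
void compute_only() {
  _tile_dpbssd(0, 1, 2); // tmm0 += tmm1 * tmm2, but nothing was loaded
}
```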
Thanks for the detailed answers! LGTM.
Follow-up of llvm#94358. Do the checks even before calling getRegisterInfo etc., because some of these are virtual function calls.
This allows tested passes to depend on the AMX model in the function info. Preparatory work to adopt #94358 for other AMX passes.
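As a sketch of how another AMX pass could adopt the same gate, per the note above about doing the check before the virtual calls (the pass name is hypothetical; this PR itself only changes X86LowerTileCopy):

```c++
#include "X86MachineFunctionInfo.h"
#include "X86Subtarget.h"
using namespace llvm;

bool X86SomeAMXPass::runOnMachineFunction(MachineFunction &MF) {
  // Query the function info first: it is a cheap, non-virtual lookup, so
  // functions without RA-managed AMX code bail out before paying for the
  // getInstrInfo()/getRegisterInfo() calls below.
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
  if (FuncInfo->getAMXProgModel() != AMXProgModelEnum::ManagedRA)
    return false;

  const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
  const X86InstrInfo *TII = ST.getInstrInfo();          // these virtual
  const TargetRegisterInfo *TRI = ST.getRegisterInfo(); // calls happen only now
  // ... actual pass body would go here ...
  (void)TII;
  (void)TRI;
  return false;
}
```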