[X86][AMX] Checking AMXProgModel in X86LowerTileCopy #94358

phoebewang · 2024-06-04T14:24:20Z

This fixes compile time regression after #93692.

This fixes compile time increment after llvm#93692.

llvmbot · 2024-06-04T14:24:54Z

@llvm/pr-subscribers-backend-x86

Author: Phoebe Wang (phoebewang)

Changes

This fixes compile time regression after #93692.

Full diff: https://github.com/llvm/llvm-project/pull/94358.diff

5 Files Affected:

(modified) llvm/lib/Target/X86/X86ISelDAGToDAG.cpp (+9)
(modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+10-3)
(modified) llvm/lib/Target/X86/X86InstrAMX.td (+1-1)
(modified) llvm/lib/Target/X86/X86LowerTileCopy.cpp (+5-18)
(modified) llvm/lib/Target/X86/X86MachineFunctionInfo.h (+12)

diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 0bf3294af92a8..3933e82b718f2 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -5120,6 +5120,9 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
     case Intrinsic::x86_tileloaddt164_internal: {
       if (!Subtarget->hasAMXTILE())
         break;
+      auto *MFI =
+          CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
+      MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
       unsigned Opc = IntNo == Intrinsic::x86_tileloadd64_internal
                          ? X86::PTILELOADDV
                          : X86::PTILELOADDT1V;
@@ -5201,6 +5204,9 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
       break;
     }
     case Intrinsic::x86_tilestored64_internal: {
+      auto *MFI =
+          CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
+      MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
       unsigned Opc = X86::PTILESTOREDV;
       // _tile_stored_internal(row, col, buf, STRIDE, c)
       SDValue Base = Node->getOperand(4);
@@ -5228,6 +5234,9 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
     case Intrinsic::x86_tilestored64: {
       if (!Subtarget->hasAMXTILE())
         break;
+      auto *MFI =
+          CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
+      MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
       unsigned Opc;
       switch (IntNo) {
       default: llvm_unreachable("Unexpected intrinsic!");
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 0e377dd53b742..5aa8b015d9d95 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -26776,7 +26776,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
     case Intrinsic::swift_async_context_addr: {
       SDLoc dl(Op);
       auto &MF = DAG.getMachineFunction();
-      auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
+      auto *X86FI = MF.getInfo<X86MachineFunctionInfo>();
       if (X86::isExtendedSwiftAsyncFrameSupported(Subtarget, MF)) {
         MF.getFrameInfo().setFrameAddressIsTaken(true);
         X86FI->setHasSwiftAsyncContext(true);
@@ -36781,7 +36781,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   }
   case TargetOpcode::PREALLOCATED_SETUP: {
     assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
-    auto MFI = MF->getInfo<X86MachineFunctionInfo>();
+    auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
     MFI->setHasPreallocatedCall(true);
     int64_t PreallocatedId = MI.getOperand(0).getImm();
     size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
@@ -36798,7 +36798,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
     int64_t PreallocatedId = MI.getOperand(1).getImm();
     int64_t ArgIdx = MI.getOperand(2).getImm();
-    auto MFI = MF->getInfo<X86MachineFunctionInfo>();
+    auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
     size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
     LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
                       << ", arg offset " << ArgOffset << "\n");
@@ -36841,6 +36841,13 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     unsigned Imm = MI.getOperand(0).getImm();
     BuildMI(*BB, MI, MIMD, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
     MI.eraseFromParent(); // The pseudo is gone now.
+    auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
+    MFI->setAMXProgModel(AMXProgModelEnum::DirectReg);
+    return BB;
+  }
+  case X86::PTILEZEROV: {
+    auto *MFI = MF->getInfo<X86MachineFunctionInfo>();
+    MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA);
     return BB;
   }
   case X86::PTILELOADD:
diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td
index c47bee070e04f..99deacc811a17 100644
--- a/llvm/lib/Target/X86/X86InstrAMX.td
+++ b/llvm/lib/Target/X86/X86InstrAMX.td
@@ -74,7 +74,7 @@ let SchedRW = [WriteSystem] in {
                                             GR16:$src2, opaquemem:$src3,
                                             TILE:$src4), []>;
     let isPseudo = true, isReMaterializable = 1, isAsCheapAsAMove = 1,
-        canFoldAsLoad = 1 in
+        canFoldAsLoad = 1, usesCustomInserter = 1 in
       def PTILEZEROV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, GR16:$src2),
                                 [(set TILE:$dst, (int_x86_tilezero_internal
                                   GR16:$src1, GR16:$src2))]>;
diff --git a/llvm/lib/Target/X86/X86LowerTileCopy.cpp b/llvm/lib/Target/X86/X86LowerTileCopy.cpp
index f27676a27e86c..613722b398f44 100644
--- a/llvm/lib/Target/X86/X86LowerTileCopy.cpp
+++ b/llvm/lib/Target/X86/X86LowerTileCopy.cpp
@@ -19,6 +19,7 @@
 #include "X86.h"
 #include "X86InstrBuilder.h"
 #include "X86InstrInfo.h"
+#include "X86MachineFunctionInfo.h"
 #include "X86Subtarget.h"
 #include "llvm/CodeGen/LiveRegUnits.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
@@ -71,6 +72,10 @@ FunctionPass *llvm::createX86LowerTileCopyPass() {
 }
 
 bool X86LowerTileCopy::runOnMachineFunction(MachineFunction &MF) {
+  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+  if (FuncInfo->getAMXProgModel() != AMXProgModelEnum::ManagedRA)
+    return false;
+
   const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
   const X86InstrInfo *TII = ST.getInstrInfo();
   const TargetRegisterInfo *TRI = ST.getRegisterInfo();
@@ -81,26 +86,8 @@ bool X86LowerTileCopy::runOnMachineFunction(MachineFunction &MF) {
   bool Changed = false;
 
   for (MachineBasicBlock &MBB : MF) {
-    // There won't be a tile copy if neither tile register live in nor live out.
-    bool HasTileCopy = false;
-    for (const auto &LI : MBB.liveins()) {
-      if (TILERegs.test(LI.PhysReg)) {
-        HasTileCopy = true;
-        break;
-      }
-    }
     LiveRegUnits UsedRegs(*TRI);
     UsedRegs.addLiveOuts(MBB);
-    if (!HasTileCopy) {
-      for (auto RegT : TILERegs.set_bits()) {
-        if (UsedRegs.available(RegT)) {
-          HasTileCopy = true;
-          break;
-        }
-      }
-    }
-    if (!HasTileCopy)
-      continue;
     for (MachineInstr &MI : llvm::make_early_inc_range(reverse(MBB))) {
       UsedRegs.stepBackward(MI);
       if (!MI.isCopy())
diff --git a/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/llvm/lib/Target/X86/X86MachineFunctionInfo.h
index f6e853270e073..8aaa49945f9d4 100644
--- a/llvm/lib/Target/X86/X86MachineFunctionInfo.h
+++ b/llvm/lib/Target/X86/X86MachineFunctionInfo.h
@@ -21,6 +21,8 @@
 
 namespace llvm {
 
+enum AMXProgModelEnum { None = 0, DirectReg = 1, ManagedRA = 2 };
+
 /// X86MachineFunctionInfo - This class is derived from MachineFunction and
 /// contains private X86 target-specific information for each MachineFunction.
 class X86MachineFunctionInfo : public MachineFunctionInfo {
@@ -96,6 +98,9 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
   /// used to address arguments in a function using a base pointer.
   int SEHFramePtrSaveIndex = 0;
 
+  /// The AMX programing model used in the function.
+  AMXProgModelEnum AMXProgModel = AMXProgModelEnum::None;
+
   /// True if this function has a subset of CSRs that is handled explicitly via
   /// copies.
   bool IsSplitCSR = false;
@@ -219,6 +224,13 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
   int getSEHFramePtrSaveIndex() const { return SEHFramePtrSaveIndex; }
   void setSEHFramePtrSaveIndex(int Index) { SEHFramePtrSaveIndex = Index; }
 
+  AMXProgModelEnum getAMXProgModel() const { return AMXProgModel; }
+  void setAMXProgModel(AMXProgModelEnum Model) {
+    assert((AMXProgModel == AMXProgModelEnum::None || AMXProgModel == Model) &&
+           "mixed model is not supported");
+    AMXProgModel = Model;
+  }
+
   SmallVectorImpl<ForwardedRegister> &getForwardedMustTailRegParms() {
     return ForwardedMustTailRegParms;
   }

nikic

This makes sense to me conceptually, but I don't know enough about AMX to properly review this :)

aengelke

I did my best to look at this, but I don't feel really qualified to review AMX. I like the idea (very similar to what I had in mind) and this generally looks good, but I have a few questions/points where I'm unsure.

Maybe add a short comment explaining the two AMX modes? If there's some documentation elsewhere, I couldn't find it.
For managed RA, the only sources of AMX values are zero, load and cast intrinsics. Whenever a load/zero(/store) occurs, the mode is set accordingly. Looks good. What about casts from vectors?
For unmanaged RA, all (?) (non-internal) intrinsics should set the prog model. So there's zero, load, store, and the actual operations. Zero, load, store are handled, ok. But shouldn't any occurrence of the compute intrinsics also set the prog model? (Can they theoretically occur alone?)

phoebewang · 2024-06-06T04:10:34Z

I did my best to look at this, but I don't feel really qualified to review AMX. I like the idea (very similar to what I had in mind) and this generally looks good, but I have a few questions/points where I'm unsure.

Thanks @aengelke for looking at this! All are good questions!

Maybe add a short comment explaining the two AMX modes? If there's some documentation elsewhere, I couldn't find it.

This is a tech debt for a long time. And since previous authors not working on this, we may not able to complete it in a short time.
A short introduction would be:

DirectReg model provides ablities to operate AMX instructions with immediate register number through intrinsics.
ManagedRA model provides managed register allocation as well as AMX configuration from the compiler. User still needs to use intrinsics.

Some documents scattered in Intel Intrinsic Guide, Clang doxygen and some technical paper etc.

But this is not quite related to this patch, and I don't find a good place for it.

For managed RA, the only sources of AMX values are zero, load and cast intrinsics. Whenever a load/zero(/store) occurs, the mode is set accordingly. Looks good. What about casts from vectors?

AMX casts were handled by X86LowerAMXType before ISel. They all have been transformed into load/store at this phase.

For unmanaged RA, all (?) (non-internal) intrinsics should set the prog model. So there's zero, load, store, and the actual operations. Zero, load, store are handled, ok. But shouldn't any occurrence of the compute intrinsics also set the prog model? (Can they theoretically occur alone?)

Yes, it should happen in theory, but using them without zero, load or strore can be seem as an invalid use scenario. We can save verbose setAMXProgModel by considering it is UB and not to handle it.

aengelke

Thanks for the detailed answers! LGTM.

Follow-up of llvm#94358. Do the checks even before calling getRegisterInfo etc., because some of these are virtual function calls.

This allows tested passes to depend on the AMX model in the function info. Preparatory work for to adopt #94358 for other AMX passes.

Follow-up of llvm#94358. Do the checks even before calling getRegisterInfo etc., because some of these are virtual function calls.

Follow-up of #94358. Do the checks even before calling getRegisterInfo etc., because some of these are virtual function calls.

This allows tested passes to depend on the AMX model in the function info. Preparatory work for to adopt llvm#94358 for other AMX passes.

[X86][AMX] Checking AMXProgModel in X86LowerTileCopy

31281cf

This fixes compile time increment after llvm#93692.

phoebewang requested review from nikic, RKSimon, aengelke and LuoYuanke June 4, 2024 14:24

llvmbot added the backend:X86 label Jun 4, 2024

phoebewang mentioned this pull request Jun 4, 2024

[X86][AMX] Check also AMX register live out for copy lowering #93692

Merged

nikic reviewed Jun 5, 2024

View reviewed changes

phoebewang requested a review from yubingex007-a11y June 5, 2024 13:00

aengelke reviewed Jun 5, 2024

View reviewed changes

aengelke approved these changes Jun 6, 2024

View reviewed changes

phoebewang merged commit e5c93ed into llvm:main Jun 6, 2024
9 checks passed

phoebewang deleted the AMX2 branch June 6, 2024 13:10

phoebewang mentioned this pull request Jun 6, 2024

[X86] Skip AMX type lowering when AMX is not used #92910

Merged

aengelke added a commit to aengelke/llvm-project that referenced this pull request Jun 10, 2024

[X86] Early exit MIR AMX passes when AMX is unused

b566068

Follow-up of llvm#94358. Do the checks even before calling getRegisterInfo etc., because some of these are virtual function calls.

This was referenced Jun 10, 2024

[X86] Add AMXProgModel to YAML serialization #94988

Merged

[X86] Early exit MIR AMX passes when AMX is unused #94989

Merged

aengelke added a commit that referenced this pull request Jun 11, 2024

[X86] Add AMXProgModel to YAML serialization (#94988)

214ff50

This allows tested passes to depend on the AMX model in the function info. Preparatory work for to adopt #94358 for other AMX passes.

aengelke added a commit to aengelke/llvm-project that referenced this pull request Jun 11, 2024

[X86] Early exit MIR AMX passes when AMX is unused

5b41574

Follow-up of llvm#94358. Do the checks even before calling getRegisterInfo etc., because some of these are virtual function calls.

aengelke added a commit that referenced this pull request Jun 12, 2024

[X86] Early exit MIR AMX passes when AMX is unused (#94989)

9a2c841

Follow-up of #94358. Do the checks even before calling getRegisterInfo etc., because some of these are virtual function calls.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[X86][AMX] Checking AMXProgModel in X86LowerTileCopy #94358

[X86][AMX] Checking AMXProgModel in X86LowerTileCopy #94358

phoebewang commented Jun 4, 2024

llvmbot commented Jun 4, 2024

nikic left a comment

aengelke left a comment •

edited

Loading

phoebewang commented Jun 6, 2024

aengelke left a comment

[X86][AMX] Checking AMXProgModel in X86LowerTileCopy #94358

[X86][AMX] Checking AMXProgModel in X86LowerTileCopy #94358

Conversation

phoebewang commented Jun 4, 2024

llvmbot commented Jun 4, 2024

nikic left a comment

Choose a reason for hiding this comment

aengelke left a comment • edited Loading

Choose a reason for hiding this comment

phoebewang commented Jun 6, 2024

aengelke left a comment

Choose a reason for hiding this comment

aengelke left a comment •

edited

Loading