[x86] Enable indirect tail calls with more arguments #137643


Status: Open · wants to merge 9 commits into main
Changes shown are from 5 of the 9 commits.
65 changes: 52 additions & 13 deletions llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -890,27 +890,50 @@ static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
LD->getExtensionType() != ISD::NON_EXTLOAD)
return false;

// If the load's outgoing chain has more than one use, we can't (currently)
// move the load since we'd most likely create a loop. TODO: Maybe it could
// work if moveBelowOrigChain() updated *all* the chain users.
if (!Callee.getValue(1).hasOneUse())
return false;

// Now let's find the callseq_start.
while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) {
if (!Chain.hasOneUse())
return false;
Chain = Chain.getOperand(0);
}

if (!Chain.getNumOperands())
return false;
// Since we are not checking for AA here, conservatively abort if the chain
// writes to memory. It's not safe to move the callee (a load) across a store.
if (isa<MemSDNode>(Chain.getNode()) &&
cast<MemSDNode>(Chain.getNode())->writeMem())
while (true) {
if (!Chain.getNumOperands())
return false;
// Since we are not checking for AA here, conservatively abort if the chain
// writes to memory. It's not safe to move the callee (a load) across a
// store.
if (isa<MemSDNode>(Chain.getNode()) &&
cast<MemSDNode>(Chain.getNode())->writeMem())
return false;
// Moving across inline asm is not safe: it could do anything.
if (Chain.getNode()->getOpcode() == ISD::INLINEASM ||
Chain.getNode()->getOpcode() == ISD::INLINEASM_BR)
return false;
Collaborator: Please just allow specific nodes, and forbid anything unknown. Trying to list out every possible relevant node is guaranteed to fall out of date at some point, even if you manage to come up with a complete list.

Author: Done.

if (Chain.getOperand(0).getNode() == Callee.getNode())
return true;
if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
Chain.getOperand(0).getValue(0).hasOneUse() &&
Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
Callee.getValue(1).hasOneUse())
return true;

// Look past CopyToRegs. We only walk one path, so the chain mustn't branch.
if (Chain.getOperand(0).getOpcode() == ISD::CopyToReg &&
Chain.getOperand(0).getValue(0).hasOneUse()) {
Chain = Chain.getOperand(0);
continue;
}

return false;
if (Chain.getOperand(0).getNode() == Callee.getNode())
return true;
if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
Callee.getValue(1).hasOneUse())
return true;
return false;
}
}

static bool isEndbrImm64(uint64_t Imm) {
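
Following up on the review thread above ("please just allow specific nodes"): a minimal sketch of what an allowlist-style check could look like. The helper name and the exact set of allowed opcodes are assumptions for illustration, not necessarily what the later commits in this PR implement.

// Hypothetical sketch: step only over chain nodes known to be safe to move
// the callee load across, and reject anything unrecognized, instead of
// enumerating known-unsafe nodes (stores, inline asm, ...).
static bool isSafeToSkipWhenMovingCalleeLoad(SDValue Chain) {
  switch (Chain.getOpcode()) {
  case ISD::CALLSEQ_START: // Call sequence bookkeeping; no memory writes.
  case ISD::CopyToReg:     // Copies of arguments into physical registers.
  case ISD::TokenFactor:   // Pure chain merge point.
  case ISD::LOAD:          // Another load; moving a load across it is fine.
    return true;
  default:
    return false;          // Unknown node: conservatively refuse to fold.
  }
}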
@@ -1353,6 +1376,22 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
(N->getOpcode() == X86ISD::TC_RETURN &&
(Subtarget->is64Bit() ||
!getTargetMachine().isPositionIndependent())))) {

if (N->getOpcode() == X86ISD::TC_RETURN) {
// There needs to be enough non-callee-saved GPRs available to compute
// the load address if folded into the tailcall. See how the
// X86tcret_6regs and X86tcret_1reg classes are used and defined.
unsigned NumRegs = 0;
for (unsigned I = 3, E = N->getNumOperands(); I != E; ++I) {
Collaborator: Is there a way to avoid the magic number 3?

Author: I didn't find one, but I'll try to reduce the number of copies of this code.

if (isa<RegisterSDNode>(N->getOperand(I)))
Collaborator: We can skip XMM / FP register operands, so I would do a GPR64 class check here before counting up.

Author: Done.

++NumRegs;
}
if (!Subtarget->is64Bit() && NumRegs > 1)
continue;
if (NumRegs > 6)
Collaborator: I think these values of 1 and 6 are informed by the SysV C calling conventions, and are incorrect for other calling conventions. You can probably construct a test case with custom conventions that use all available GPRs for parameters and starve the register allocator out.

I think this would fix a real issue on Windows x64, which by my reading only has 7 "volatile" GPRs:
https://learn.microsoft.com/en-us/cpp/build/x64-software-conventions?view=msvc-170#x64-register-usage

  • RAX: return
  • RCX, RDX, R8, R9: 4 param
  • R10, R11: scratch

Total: 7

If there's a way to pass something in R11, maybe via the nest parameter, this might form a tail call we can't register allocate. Or maybe there's some other convention.

I think good general fixes would be to look at the CSR mask from the target calling convention and count up the available GPRs, subtract the number of GPR register operands from that total, and check that we have at least two available GPRs (for base + index).

Alternatively we could only do this fold for C calling conventions where it's known to be safe, if we convince ourselves that it's impossible to use R10 or R11 with the MS ABI convention.

Author: You're right, Win64 has one less. I stole my code from the X86tcret_6regs fragment:

def X86tcret_6regs : PatFrag<(ops node:$ptr, node:$off),
                             (X86tcret node:$ptr, node:$off), [{
  // X86tcret args: (*chain, ptr, imm, regs..., glue)
  unsigned NumRegs = 0;
  for (unsigned i = 3, e = N->getNumOperands(); i != e; ++i)
    if (isa<RegisterSDNode>(N->getOperand(i)) && ++NumRegs > 6)
      return false;
  return true;
}]>;

which is what the folding pattern for TCRETURNmi64 uses:

// Don't fold loads into X86tcret requiring more than 6 regs.
// There wouldn't be enough scratch registers for base+index.
def : Pat<(X86tcret_6regs (load addr:$dst), timm:$off),
          (TCRETURNmi64 addr:$dst, timm:$off)>,
      Requires<[In64BitMode, NotUseIndirectThunkCalls]>;

So that seems wrong for Win64.

I think the source of truth here is the register class the folded instruction actually uses: ptr_rc_tailcall, which is defined by X86RegisterInfo::getGPRsForTailCall:

const TargetRegisterClass *
X86RegisterInfo::getGPRsForTailCall(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  if (IsWin64 || (F.getCallingConv() == CallingConv::Win64))
    return &X86::GR64_TCW64RegClass;
  else if (Is64Bit)
    return &X86::GR64_TCRegClass;
  bool hasHipeCC = (F.getCallingConv() == CallingConv::HiPE);
  if (hasHipeCC)
    return &X86::GR32RegClass;
  return &X86::GR32_TCRegClass;
}

That one seems to handle Win64 correctly, and also takes the calling convention into account in general.


So I think X86tcret_6regs should not hard-code 6, but check the ptr_rc_tailcall register class, and we should extract the code into a function that we can also use when moving the load.

And we should do the same for X86tcret_1reg, which is similar but has some differences:

def X86tcret_1reg : PatFrag<(ops node:$ptr, node:$off),
                            (X86tcret node:$ptr, node:$off), [{
  // X86tcret args: (*chain, ptr, imm, regs..., glue)
  unsigned NumRegs = 1;
  const SDValue& BasePtr = cast<LoadSDNode>(N->getOperand(1))->getBasePtr();
  if (isa<FrameIndexSDNode>(BasePtr))
    NumRegs = 3;
  else if (BasePtr->getNumOperands() && isa<GlobalAddressSDNode>(BasePtr->getOperand(0)))
    NumRegs = 3;
  for (unsigned i = 3, e = N->getNumOperands(); i != e; ++i)
    if (isa<RegisterSDNode>(N->getOperand(i)) && ( NumRegs-- == 0))
      return false;
  return true;
}]>;

It's checking whether the load uses a frame slot or a global, in which case it figures that doesn't use up any extra registers. I'm not 100% convinced that's true for the global case? And shouldn't we do the same check in X86tcret_6regs?
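
A rough sketch of the extracted helper proposed above, deriving the register budget from getGPRsForTailCall instead of hard-coding 6 or 1. The function name, signature, and threshold handling are assumptions for illustration, not code from this PR:

// Hypothetical helper for X86ISelDAGToDAG.cpp: count the GPR operands of the
// X86tcret node against the tail-call register class for this calling
// convention, keeping two registers free for the folded load's base + index.
static bool hasEnoughRegsForTailCallLoadFold(const X86Subtarget &Subtarget,
                                             const MachineFunction &MF,
                                             SDNode *TCRet) {
  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
  const TargetRegisterClass *RC = TRI->getGPRsForTailCall(MF);
  // X86tcret args: (*chain, ptr, imm, regs..., glue)
  unsigned NumRegOps = 0;
  for (unsigned I = 3, E = TCRet->getNumOperands(); I != E; ++I)
    if (auto *R = dyn_cast<RegisterSDNode>(TCRet->getOperand(I)))
      if (RC->contains(R->getReg()))
        ++NumRegOps;
  // Note: getNumRegs() counts every register in the class (e.g. RIP is in
  // GR64_TC), so a real implementation would likely need to filter those out.
  return RC->getNumRegs() >= NumRegOps + 2;
}

This would let the PreprocessISelDAG check and the X86tcret_6regs/X86tcret_1reg fragments share one definition of "enough registers", as suggested above.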

continue;
}

/// Also try moving call address load from outside callseq_start to just
/// before the call to allow it to be folded.
///
3 changes: 1 addition & 2 deletions llvm/test/CodeGen/X86/cfguard-checks.ll
@@ -210,8 +210,7 @@ entry:
; X64-LABEL: vmptr_thunk:
; X64: movq (%rcx), %rax
; X64-NEXT: movq 8(%rax), %rax
; X64-NEXT: movq __guard_dispatch_icall_fptr(%rip), %rdx
; X64-NEXT: rex64 jmpq *%rdx # TAILCALL
; X64-NEXT: rex64 jmpq *__guard_dispatch_icall_fptr(%rip) # TAILCALL
; X64-NOT: callq
}

26 changes: 26 additions & 0 deletions llvm/test/CodeGen/X86/fold-call-4.ll
@@ -0,0 +1,26 @@
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefixes=CHECK,LIN
; RUN: llc < %s -mtriple=x86_64-pc-windows-msvc | FileCheck %s --check-prefixes=CHECK,WIN

; The callee address computation should get folded into the call.
; CHECK-LABEL: f:
; CHECK-NOT: mov
; LIN: jmpq *(%rdi,%rsi,8)
; WIN: rex64 jmpq *(%rcx,%rdx,8)
define void @f(ptr %table, i64 %idx, i64 %aux1, i64 %aux2, i64 %aux3) {
entry:
%arrayidx = getelementptr inbounds ptr, ptr %table, i64 %idx
%funcptr = load ptr, ptr %arrayidx, align 8
tail call void %funcptr(ptr %table, i64 %idx, i64 %aux1, i64 %aux2, i64 %aux3)
ret void
}

; Check that we don't assert here. On Win64 this has a TokenFactor with
; multiple uses, which we can't currently fold.
define void @thunk(ptr %this, ...) {
entry:
%vtable = load ptr, ptr %this, align 8
%vfn = getelementptr inbounds nuw i8, ptr %vtable, i64 8
%0 = load ptr, ptr %vfn, align 8
musttail call void (ptr, ...) %0(ptr %this, ...)
ret void
}
12 changes: 12 additions & 0 deletions llvm/test/CodeGen/X86/fold-call.ll
@@ -24,3 +24,15 @@ entry:
tail call void %0()
ret void
}

; Don't fold the load+call if there's inline asm in between.
; CHECK: test3
; CHECK: mov{{.*}}
; CHECK: jmp{{.*}}
define void @test3(ptr nocapture %x) {
entry:
%0 = load ptr, ptr %x
call void asm sideeffect "", ""() ; It could do anything.
tail call void %0()
ret void
}