Commit 2278f5e
[AMDGPU] Hoist readlane/readfirstlane through unary/binary operands (#129037)
When a read(first)lane is used on a binary operator and the intrinsic is the only user of the operator, we can move the read(first)lane onto one operand if the other operand is uniform. Unfortunately, InstCombine doesn't let us access UniformityAnalysis, so we can't truly check uniformity; we have to make do with a basic check that only accepts constants or trivially uniform intrinsic calls. We can do the same for unary and cast operators.
1 parent d05854d commit 2278f5e
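As an illustration of the binary-operator case, here is a minimal hand-written before/after sketch in LLVM IR (not taken from the commit's tests; the function and value names are made up, and the constant 42 stands in for any trivially uniform operand):

declare i32 @llvm.amdgcn.readfirstlane.i32(i32)

; Input pattern: the readfirstlane is the only user of the add, and the
; add's other operand is a constant, i.e. trivially uniform.
define i32 @before(i32 %v) {
  %op = add i32 %v, 42
  %r = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %op)
  ret i32 %r
}

; Expected shape after the hoist: the lane read moves onto the possibly
; divergent operand and the add now operates on uniform values only.
define i32 @after(i32 %v) {
  %v.uni = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %v)
  %r = add i32 %v.uni, 42
  ret i32 %r
}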

File tree: 4 files changed, +1008 −25 lines


llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp

Lines changed: 98 additions & 25 deletions
@@ -18,6 +18,7 @@
 #include "AMDGPUTargetTransformInfo.h"
 #include "GCNSubtarget.h"
 #include "llvm/ADT/FloatingPointMode.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/Transforms/InstCombine/InstCombiner.h"
 #include <optional>
@@ -503,6 +504,98 @@ bool GCNTTIImpl::simplifyDemandedLaneMaskArg(InstCombiner &IC,
   return false;
 }
 
+static CallInst *rewriteCall(IRBuilderBase &B, CallInst &Old,
+                             Function &NewCallee, ArrayRef<Value *> Ops) {
+  SmallVector<OperandBundleDef, 2> OpBundles;
+  Old.getOperandBundlesAsDefs(OpBundles);
+
+  CallInst *NewCall = B.CreateCall(&NewCallee, Ops, OpBundles);
+  NewCall->takeName(&Old);
+  return NewCall;
+}
+
+Instruction *
+GCNTTIImpl::hoistLaneIntrinsicThroughOperand(InstCombiner &IC,
+                                             IntrinsicInst &II) const {
+  const auto IID = II.getIntrinsicID();
+  assert(IID == Intrinsic::amdgcn_readlane ||
+         IID == Intrinsic::amdgcn_readfirstlane ||
+         IID == Intrinsic::amdgcn_permlane64);
+
+  Instruction *OpInst = dyn_cast<Instruction>(II.getOperand(0));
+
+  // Only do this if both instructions are in the same block
+  // (so the exec mask won't change) and the readlane is the only user of its
+  // operand.
+  if (!OpInst || !OpInst->hasOneUser() || OpInst->getParent() != II.getParent())
+    return nullptr;
+
+  const bool IsReadLane = (IID == Intrinsic::amdgcn_readlane);
+
+  // If this is a readlane, check that the second operand is a constant, or is
+  // defined before OpInst so we know it's safe to move this intrinsic higher.
+  Value *LaneID = nullptr;
+  if (IsReadLane) {
+    LaneID = II.getOperand(1);
+
+    // readlane take an extra operand for the lane ID, so we must check if that
+    // LaneID value can be used at the point where we want to move the
+    // intrinsic.
+    if (auto *LaneIDInst = dyn_cast<Instruction>(LaneID)) {
+      if (!IC.getDominatorTree().dominates(LaneIDInst, OpInst))
+        return nullptr;
+    }
+  }
+
+  // Hoist the intrinsic (II) through OpInst.
+  //
+  // (II (OpInst x)) -> (OpInst (II x))
+  const auto DoIt = [&](unsigned OpIdx,
+                        Function *NewIntrinsic) -> Instruction * {
+    SmallVector<Value *, 2> Ops{OpInst->getOperand(OpIdx)};
+    if (IsReadLane)
+      Ops.push_back(LaneID);
+
+    // Rewrite the intrinsic call.
+    CallInst *NewII = rewriteCall(IC.Builder, II, *NewIntrinsic, Ops);
+
+    // Rewrite OpInst so it takes the result of the intrinsic now.
+    Instruction &NewOp = *OpInst->clone();
+    NewOp.setOperand(OpIdx, NewII);
+    return &NewOp;
+  };
+
+  // TODO(?): Should we do more with permlane64?
+  if (IID == Intrinsic::amdgcn_permlane64 && !isa<BitCastInst>(OpInst))
+    return nullptr;
+
+  if (isa<UnaryOperator>(OpInst))
+    return DoIt(0, II.getCalledFunction());
+
+  if (isa<CastInst>(OpInst)) {
+    Value *Src = OpInst->getOperand(0);
+    Type *SrcTy = Src->getType();
+    if (!isTypeLegal(SrcTy))
+      return nullptr;
+
+    Function *Remangled =
+        Intrinsic::getOrInsertDeclaration(II.getModule(), IID, {SrcTy});
+    return DoIt(0, Remangled);
+  }
+
+  // We can also hoist through binary operators if the other operand is uniform.
+  if (isa<BinaryOperator>(OpInst)) {
+    // FIXME: If we had access to UniformityInfo here we could just check
+    // if the operand is uniform.
+    if (isTriviallyUniform(OpInst->getOperandUse(0)))
+      return DoIt(1, II.getCalledFunction());
+    if (isTriviallyUniform(OpInst->getOperandUse(1)))
+      return DoIt(0, II.getCalledFunction());
+  }
+
+  return nullptr;
+}
+
 std::optional<Instruction *>
 GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
   Intrinsic::ID IID = II.getIntrinsicID();
@@ -1264,31 +1357,6 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
       simplifyDemandedLaneMaskArg(IC, II, 1))
     return &II;
 
-  // readfirstlane.ty0 (bitcast ty1 x to ty0) -> bitcast (readfirstlane.ty1)
-  if (auto *BC = dyn_cast<BitCastInst>(Src);
-      BC && BC->hasOneUse() && IID != Intrinsic::amdgcn_ds_bpermute) {
-    Value *BCSrc = BC->getOperand(0);
-
-    // TODO: Handle this for update_dpp, mov_ddp8, and all permlane variants.
-    if (isTypeLegal(BCSrc->getType())) {
-      Module *M = IC.Builder.GetInsertBlock()->getModule();
-      Function *Remangled =
-          Intrinsic::getOrInsertDeclaration(M, IID, {BCSrc->getType()});
-
-      // Make sure convergence tokens are preserved.
-      // TODO: CreateIntrinsic should allow directly copying bundles
-      SmallVector<OperandBundleDef, 2> OpBundles;
-      II.getOperandBundlesAsDefs(OpBundles);
-
-      SmallVector<Value *, 3> Args(II.args());
-      Args[0] = BCSrc;
-
-      CallInst *NewCall = IC.Builder.CreateCall(Remangled, Args, OpBundles);
-      NewCall->takeName(&II);
-      return new BitCastInst(NewCall, II.getType());
-    }
-  }
-
   // If the lane argument of bpermute is uniform, change it to readlane. This
   // generates better code and can enable further optimizations because
   // readlane is AlwaysUniform.
@@ -1305,6 +1373,11 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
     }
   }
 
+  if (IID != Intrinsic::amdgcn_ds_bpermute) {
+    if (Instruction *Res = hoistLaneIntrinsicThroughOperand(IC, II))
+      return Res;
+  }
+
   return std::nullopt;
 }
 case Intrinsic::amdgcn_writelane: {
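The bitcast-only fold removed above is subsumed by the generalized cast handling in hoistLaneIntrinsicThroughOperand, which remangles the intrinsic to the cast's source type. A hand-written sketch of the expected rewrite, assuming a float source bitcast to i32 (function and value names are illustrative, not from the commit's tests):

declare i32 @llvm.amdgcn.readfirstlane.i32(i32)
declare float @llvm.amdgcn.readfirstlane.f32(float)

; Input pattern: readfirstlane of a bitcast.
define i32 @before(float %x) {
  %bc = bitcast float %x to i32
  %r = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %bc)
  ret i32 %r
}

; Expected shape after the hoist: the intrinsic is remangled to the
; cast's source type and the bitcast now consumes its result.
define i32 @after(float %x) {
  %x.uni = call float @llvm.amdgcn.readfirstlane.f32(float %x)
  %r = bitcast float %x.uni to i32
  ret i32 %r
}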

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

Lines changed: 3 additions & 0 deletions
@@ -224,6 +224,9 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
                                     const APInt &DemandedElts,
                                     APInt &UndefElts) const;
 
+  Instruction *hoistLaneIntrinsicThroughOperand(InstCombiner &IC,
+                                                IntrinsicInst &II) const;
+
   std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
       InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
       APInt &UndefElts2, APInt &UndefElts3,
