llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
Lines changed: 36 additions & 44 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
Lines changed: 2 additions & 2 deletions
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.permlane64.ll b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.permlane64.ll
Lines changed: 153 additions & 0 deletions
@@ -482,8 +482,14 @@ bool GCNTTIImpl::simplifyDemandedLaneMaskArg(InstCombiner &IC,
   return false;
 }
 
-Instruction *GCNTTIImpl::hoistReadLaneThroughOperand(InstCombiner &IC,
-                                                     IntrinsicInst &II) const {
+Instruction *
+GCNTTIImpl::hoistLaneIntrinsicThroughOperand(InstCombiner &IC,
+                                             IntrinsicInst &II) const {
+  const auto IID = II.getIntrinsicID();
+  assert(IID == Intrinsic::amdgcn_readlane ||
+         IID == Intrinsic::amdgcn_readfirstlane ||
+         IID == Intrinsic::amdgcn_permlane64);
+
   Instruction *Op = dyn_cast<Instruction>(II.getOperand(0));
 
   // Only do this if both instructions are in the same block
@@ -492,7 +498,8 @@ Instruction *GCNTTIImpl::hoistReadLaneThroughOperand(InstCombiner &IC,
   if (!Op || !Op->hasOneUser() || Op->getParent() != II.getParent())
     return nullptr;
 
-  const bool IsReadLane = (II.getIntrinsicID() == Intrinsic::amdgcn_readlane);
+  const bool IsReadLane = (IID == Intrinsic::amdgcn_readlane);
+  const bool IsPermLane = (IID == Intrinsic::amdgcn_permlane64);
 
   // If this is a readlane, check that the second operand is a constant, or is
   // defined before Op so we know it's safe to move this intrinsic higher.
@@ -505,7 +512,8 @@ Instruction *GCNTTIImpl::hoistReadLaneThroughOperand(InstCombiner &IC,
       return nullptr;
   }
 
-  const auto DoIt = [&](unsigned OpIdx) -> Instruction * {
+  const auto DoIt = [&](unsigned OpIdx,
+                        Function *NewIntrinsic) -> Instruction * {
     SmallVector<Value *, 2> Ops{Op->getOperand(OpIdx)};
     if (IsReadLane)
       Ops.push_back(LaneID);
@@ -515,27 +523,40 @@ Instruction *GCNTTIImpl::hoistReadLaneThroughOperand(InstCombiner &IC,
     SmallVector<OperandBundleDef, 2> OpBundles;
     II.getOperandBundlesAsDefs(OpBundles);
 
-    CallInst *NewII =
-        IC.Builder.CreateCall(II.getCalledFunction(), Ops, OpBundles);
+    CallInst *NewII = IC.Builder.CreateCall(NewIntrinsic, Ops, OpBundles);
+    NewII->takeName(&II);
 
     Instruction &NewOp = *Op->clone();
     NewOp.setOperand(OpIdx, NewII);
     return &NewOp;
   };
 
-  // TODO: Are any operations more expensive on the SALU than VALU, and thus
-  //       need to be excluded here?
-
   if (isa<UnaryOperator>(Op))
-    return DoIt(0);
+    return DoIt(0, II.getCalledFunction());
+
+  if (isa<CastInst>(Op)) {
+    Value *Src = Op->getOperand(0);
+    Type *SrcTy = Src->getType();
+    if (!isTypeLegal(SrcTy))
+      return nullptr;
+
+    Function *Remangled =
+        Intrinsic::getOrInsertDeclaration(II.getModule(), IID, {SrcTy});
+    return DoIt(0, Remangled);
+  }
 
-  if (isa<BinaryOperator>(Op)) {
+  // Don't hoist through a binary operator for permlane64. It doesn't
+  // achieve anything and we'd need to repeat the call on every operand.
+  //
+  // We can do it for read(first)lane if other operands are already scalar
+  // because then we don't need to repeat the call.
+  if (!IsPermLane && isa<BinaryOperator>(Op)) {
     // FIXME: If we had access to UniformityInfo here we could just check
     // if the operand is uniform.
     if (isTriviallyUniform(Op->getOperandUse(0)))
-      return DoIt(1);
+      return DoIt(1, II.getCalledFunction());
     if (isTriviallyUniform(Op->getOperandUse(1)))
-      return DoIt(0);
+      return DoIt(0, II.getCalledFunction());
   }
 
   return nullptr;
@@ -1188,37 +1209,8 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
         simplifyDemandedLaneMaskArg(IC, II, 1))
       return &II;
 
-    // readfirstlane.ty0 (bitcast ty1 x to ty0) -> bitcast (readfirstlane.ty1)
-    if (auto *BC = dyn_cast<BitCastInst>(Src); BC && BC->hasOneUse()) {
-      Value *BCSrc = BC->getOperand(0);
-
-      // TODO: Handle this for update_dpp, mov_ddp8, and all permlane variants.
-      if (isTypeLegal(BCSrc->getType())) {
-        Module *M = IC.Builder.GetInsertBlock()->getModule();
-        Function *Remangled =
-            Intrinsic::getOrInsertDeclaration(M, IID, {BCSrc->getType()});
-
-        // Make sure convergence tokens are preserved.
-        // TODO: CreateIntrinsic should allow directly copying bundles
-        SmallVector<OperandBundleDef, 2> OpBundles;
-        II.getOperandBundlesAsDefs(OpBundles);
-
-        SmallVector<Value *, 3> Args(II.args());
-        Args[0] = BCSrc;
-
-        CallInst *NewCall = IC.Builder.CreateCall(Remangled, Args, OpBundles);
-        NewCall->takeName(&II);
-        return new BitCastInst(NewCall, II.getType());
-      }
-    }
-
-    // If the readfirstlane reads the result of an operation that exists
-    // both in the SALU and VALU, we may be able to hoist it higher in order
-    // to scalarize the expression.
-    if (IID != Intrinsic::amdgcn_permlane64) {
-      if (Instruction *Res = hoistReadLaneThroughOperand(IC, II))
-        return Res;
-    }
+    if (Instruction *Res = hoistLaneIntrinsicThroughOperand(IC, II))
+      return Res;
 
     return std::nullopt;
   }
 
@@ -224,8 +224,8 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
   bool simplifyDemandedLaneMaskArg(InstCombiner &IC, IntrinsicInst &II,
                                    unsigned LaneAgIdx) const;
 
-  Instruction *hoistReadLaneThroughOperand(InstCombiner &IC,
-                                           IntrinsicInst &II) const;
+  Instruction *hoistLaneIntrinsicThroughOperand(InstCombiner &IC,
+                                                IntrinsicInst &II) const;
 
   std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
                                                     IntrinsicInst &II) const;
 
@@ -0,0 +1,153 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1030 -passes=instcombine -S < %s | FileCheck %s
+
+; The readfirstlane version of this test covers all the interesting cases of the
+; shared logic. This testcase focuses on permlane64 specific pitfalls.
+
+; test unary
+
+define float @hoist_fneg_f32(float %arg) {
+; CHECK-LABEL: define float @hoist_fneg_f32(
+; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    [[RFL:%.*]] = call float @llvm.amdgcn.permlane64.f32(float [[ARG]])
+; CHECK-NEXT:    [[TMP0:%.*]] = fneg float [[RFL]]
+; CHECK-NEXT:    ret float [[TMP0]]
+;
+bb:
+  %val = fneg float %arg
+  %pl = call float @llvm.amdgcn.permlane64.f32(float %val)
+  ret float %pl
+}
+
+define double @hoist_fneg_f64(double %arg) {
+; CHECK-LABEL: define double @hoist_fneg_f64(
+; CHECK-SAME: double [[ARG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    [[RFL:%.*]] = call double @llvm.amdgcn.permlane64.f64(double [[ARG]])
+; CHECK-NEXT:    [[TMP0:%.*]] = fneg double [[RFL]]
+; CHECK-NEXT:    ret double [[TMP0]]
+;
+bb:
+  %val = fneg double %arg
+  %pl = call double @llvm.amdgcn.permlane64.f64(double %val)
+  ret double %pl
+}
+
+; test casts
+
+define i32 @hoist_trunc(i64 %arg) {
+; CHECK-LABEL: define i32 @hoist_trunc(
+; CHECK-SAME: i64 [[ARG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    [[RFL:%.*]] = call i64 @llvm.amdgcn.permlane64.i64(i64 [[ARG]])
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[RFL]] to i32
+; CHECK-NEXT:    ret i32 [[TMP0]]
+;
+bb:
+  %val = trunc i64 %arg to i32
+  %pl = call i32 @llvm.amdgcn.permlane64.i32(i32 %val)
+  ret i32 %pl
+}
+
+define i64 @hoist_zext(i32 %arg) {
+; CHECK-LABEL: define i64 @hoist_zext(
+; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    [[RFL:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[ARG]])
+; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[RFL]] to i64
+; CHECK-NEXT:    ret i64 [[TMP0]]
+;
+bb:
+  %val = zext i32 %arg to i64
+  %pl = call i64 @llvm.amdgcn.permlane64.i64(i64 %val)
+  ret i64 %pl
+}
+
+; test binary operators (i32 add, f32 fadd): permlane64 must not be hoisted through them
+
+define i32 @hoist_add_i32(i32 %arg) {
+; CHECK-LABEL: define i32 @hoist_add_i32(
+; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    [[VAL:%.*]] = add i32 [[ARG]], 16777215
+; CHECK-NEXT:    [[RFL:%.*]] = call i32 @llvm.amdgcn.permlane64.i32(i32 [[VAL]])
+; CHECK-NEXT:    ret i32 [[RFL]]
+;
+bb:
+  %val = add i32 %arg, 16777215
+  %pl = call i32 @llvm.amdgcn.permlane64.i32(i32 %val)
+  ret i32 %pl
+}
+
+define float @hoist_fadd_f32(float %arg) {
+; CHECK-LABEL: define float @hoist_fadd_f32(
+; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    [[VAL:%.*]] = fadd float [[ARG]], 1.280000e+02
+; CHECK-NEXT:    [[RFL:%.*]] = call float @llvm.amdgcn.permlane64.f32(float [[VAL]])
+; CHECK-NEXT:    ret float [[RFL]]
+;
+bb:
+  %val = fadd float %arg, 128.0
+  %pl = call float @llvm.amdgcn.permlane64.f32(float %val)
+  ret float %pl
+}
+
+; test cases where hoisting isn't possible
+
+define float @cross_block_hoisting(i1 %cond, float %arg) {
+; CHECK-LABEL: define float @cross_block_hoisting(
+; CHECK-SAME: i1 [[COND:%.*]], float [[ARG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[BB:.*]]:
+; CHECK-NEXT:    [[VAL:%.*]] = fneg float [[ARG]]
+; CHECK-NEXT:    br i1 [[COND]], label %[[THEN:.*]], label %[[END:.*]]
+; CHECK:       [[THEN]]:
+; CHECK-NEXT:    [[RFL:%.*]] = call float @llvm.amdgcn.permlane64.f32(float [[VAL]])
+; CHECK-NEXT:    br label %[[END]]
+; CHECK:       [[END]]:
+; CHECK-NEXT:    [[RES:%.*]] = phi float [ [[RFL]], %[[THEN]] ], [ [[VAL]], %[[BB]] ]
+; CHECK-NEXT:    ret float [[RES]]
+;
+bb:
+  %val = fneg float %arg
+  br i1 %cond, label %then, label %end
+
+then:
+  %pl = call float @llvm.amdgcn.permlane64.f32(float %val)
+  br label %end
+
+end:
+  %res = phi float [%pl, %then], [%val, %bb]
+  ret float %res
+}
+
+; test that convergence tokens are preserved
+
+define float @hoist_preserves_convergence_token(i1 %cond, float %arg) convergent {
+; CHECK-LABEL: define float @hoist_preserves_convergence_token(
+; CHECK-SAME: i1 [[COND:%.*]], float [[ARG:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:  [[BB:.*]]:
+; CHECK-NEXT:    [[ENTRY:%.*]] = call token @llvm.experimental.convergence.entry()
+; CHECK-NEXT:    br i1 [[COND]], label %[[THEN:.*]], label %[[END:.*]]
+; CHECK:       [[THEN]]:
+; CHECK-NEXT:    [[RFL:%.*]] = call float @llvm.amdgcn.permlane64.f32(float [[ARG]]) [ "convergencectrl"(token [[ENTRY]]) ]
+; CHECK-NEXT:    [[TMP0:%.*]] = fneg float [[RFL]]
+; CHECK-NEXT:    br label %[[END]]
+; CHECK:       [[END]]:
+; CHECK-NEXT:    [[RES:%.*]] = phi float [ [[TMP0]], %[[THEN]] ], [ [[ARG]], %[[BB]] ]
+; CHECK-NEXT:    ret float [[RES]]
+;
+bb:
+  %entry = call token @llvm.experimental.convergence.entry()
+  br i1 %cond, label %then, label %end
+
+then:
+  %val = fneg float %arg
+  %pl = call float @llvm.amdgcn.permlane64.f32(float %val) [ "convergencectrl"(token %entry)]
+  br label %end
+
+end:
+  %res = phi float [%pl, %then], [%arg, %bb]
+  ret float %res
+}