 #include "AMDGPUTargetTransformInfo.h"
 #include "GCNSubtarget.h"
 #include "llvm/ADT/FloatingPointMode.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/Transforms/InstCombine/InstCombiner.h"
 #include <optional>
@@ -503,6 +504,98 @@ bool GCNTTIImpl::simplifyDemandedLaneMaskArg(InstCombiner &IC,
   return false;
 }
 
+static CallInst *rewriteCall(IRBuilderBase &B, CallInst &Old,
+                             Function &NewCallee, ArrayRef<Value *> Ops) {
+  SmallVector<OperandBundleDef, 2> OpBundles;
+  Old.getOperandBundlesAsDefs(OpBundles);
+
+  CallInst *NewCall = B.CreateCall(&NewCallee, Ops, OpBundles);
+  NewCall->takeName(&Old);
+  return NewCall;
+}
+
+Instruction *
+GCNTTIImpl::hoistLaneIntrinsicThroughOperand(InstCombiner &IC,
+                                             IntrinsicInst &II) const {
+  const auto IID = II.getIntrinsicID();
+  assert(IID == Intrinsic::amdgcn_readlane ||
+         IID == Intrinsic::amdgcn_readfirstlane ||
+         IID == Intrinsic::amdgcn_permlane64);
+
+  Instruction *OpInst = dyn_cast<Instruction>(II.getOperand(0));
+
+  // Only do this if both instructions are in the same block
+  // (so the exec mask won't change) and the readlane is the only user of its
+  // operand.
+  if (!OpInst || !OpInst->hasOneUser() || OpInst->getParent() != II.getParent())
+    return nullptr;
+
+  const bool IsReadLane = (IID == Intrinsic::amdgcn_readlane);
+
+  // If this is a readlane, check that the second operand is a constant, or is
+  // defined before OpInst so we know it's safe to move this intrinsic higher.
+  Value *LaneID = nullptr;
+  if (IsReadLane) {
+    LaneID = II.getOperand(1);
+
+    // readlane takes an extra operand for the lane ID, so we must check that
+    // the LaneID value is available at the point where we want to move the
+    // intrinsic.
+    if (auto *LaneIDInst = dyn_cast<Instruction>(LaneID)) {
+      if (!IC.getDominatorTree().dominates(LaneIDInst, OpInst))
+        return nullptr;
+    }
+  }
+
+  // Hoist the intrinsic (II) through OpInst.
+  //
+  // (II (OpInst x)) -> (OpInst (II x))
+  const auto DoIt = [&](unsigned OpIdx,
+                        Function *NewIntrinsic) -> Instruction * {
+    SmallVector<Value *, 2> Ops{OpInst->getOperand(OpIdx)};
+    if (IsReadLane)
+      Ops.push_back(LaneID);
+
+    // Rewrite the intrinsic call.
+    CallInst *NewII = rewriteCall(IC.Builder, II, *NewIntrinsic, Ops);
+
+    // Rewrite OpInst so it takes the result of the intrinsic now.
+    Instruction &NewOp = *OpInst->clone();
+    NewOp.setOperand(OpIdx, NewII);
+    return &NewOp;
+  };
+
+  // TODO(?): Should we do more with permlane64?
+  if (IID == Intrinsic::amdgcn_permlane64 && !isa<BitCastInst>(OpInst))
+    return nullptr;
+
+  if (isa<UnaryOperator>(OpInst))
+    return DoIt(0, II.getCalledFunction());
+
+  if (isa<CastInst>(OpInst)) {
+    Value *Src = OpInst->getOperand(0);
+    Type *SrcTy = Src->getType();
+    if (!isTypeLegal(SrcTy))
+      return nullptr;
+
+    Function *Remangled =
+        Intrinsic::getOrInsertDeclaration(II.getModule(), IID, {SrcTy});
+    return DoIt(0, Remangled);
+  }
+
+  // We can also hoist through binary operators if the other operand is uniform.
+  if (isa<BinaryOperator>(OpInst)) {
+    // FIXME: If we had access to UniformityInfo here we could just check
+    // if the operand is uniform.
+    if (isTriviallyUniform(OpInst->getOperandUse(0)))
+      return DoIt(1, II.getCalledFunction());
+    if (isTriviallyUniform(OpInst->getOperandUse(1)))
+      return DoIt(0, II.getCalledFunction());
+  }
+
+  return nullptr;
+}
+
 std::optional<Instruction *>
 GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
   Intrinsic::ID IID = II.getIntrinsicID();
@@ -1264,31 +1357,6 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
         simplifyDemandedLaneMaskArg(IC, II, 1))
       return &II;
 
-    // readfirstlane.ty0 (bitcast ty1 x to ty0) -> bitcast (readfirstlane.ty1)
-    if (auto *BC = dyn_cast<BitCastInst>(Src);
-        BC && BC->hasOneUse() && IID != Intrinsic::amdgcn_ds_bpermute) {
-      Value *BCSrc = BC->getOperand(0);
-
-      // TODO: Handle this for update_dpp, mov_ddp8, and all permlane variants.
-      if (isTypeLegal(BCSrc->getType())) {
-        Module *M = IC.Builder.GetInsertBlock()->getModule();
-        Function *Remangled =
-            Intrinsic::getOrInsertDeclaration(M, IID, {BCSrc->getType()});
-
-        // Make sure convergence tokens are preserved.
-        // TODO: CreateIntrinsic should allow directly copying bundles
-        SmallVector<OperandBundleDef, 2> OpBundles;
-        II.getOperandBundlesAsDefs(OpBundles);
-
-        SmallVector<Value *, 3> Args(II.args());
-        Args[0] = BCSrc;
-
-        CallInst *NewCall = IC.Builder.CreateCall(Remangled, Args, OpBundles);
-        NewCall->takeName(&II);
-        return new BitCastInst(NewCall, II.getType());
-      }
-    }
-
     // If the lane argument of bpermute is uniform, change it to readlane. This
     // generates better code and can enable further optimizations because
     // readlane is AlwaysUniform.
@@ -1305,6 +1373,11 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
       }
     }
 
+    if (IID != Intrinsic::amdgcn_ds_bpermute) {
+      if (Instruction *Res = hoistLaneIntrinsicThroughOperand(IC, II))
+        return Res;
+    }
+
     return std::nullopt;
   }
   case Intrinsic::amdgcn_writelane: {
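
For illustration, a minimal sketch in LLVM IR of the rewrite performed by hoistLaneIntrinsicThroughOperand for the unary-operator case (hypothetical input, not taken from this patch's tests; value names are placeholders):

; Before: readfirstlane consumes the result of the fneg, which has a single
; user and lives in the same block as the intrinsic.
  %neg = fneg float %x
  %r = call float @llvm.amdgcn.readfirstlane.f32(float %neg)

; After the hoist (II (OpInst x)) -> (OpInst (II x)): the intrinsic reads %x
; directly and the cloned fneg consumes the now-uniform result.
  %r.lane = call float @llvm.amdgcn.readfirstlane.f32(float %x)
  %neg.new = fneg float %r.lane

The cast and binary-operator cases follow the same pattern: for casts the intrinsic is remangled to the source type (generalizing the removed bitcast-only fold), and for binary operators it is hoisted through the non-uniform operand when the other operand is trivially uniform.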