@@ -491,77 +491,80 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
491
491
// original instruction.
492
492
B.SetInsertPoint (&I);
493
493
494
- // Create a PHI node to get our new atomic result into the exit block.
495
- PHINode *const PHI = B.CreatePHI (Ty, 2 );
496
- PHI->addIncoming (UndefValue::get (Ty), EntryBB);
497
- PHI->addIncoming (NewI, SingleLaneTerminator->getParent ());
498
-
499
- // We need to broadcast the value who was the lowest active lane (the first
500
- // lane) to all other lanes in the wavefront. We use an intrinsic for this,
501
- // but have to handle 64-bit broadcasts with two calls to this intrinsic.
502
- Value *BroadcastI = nullptr ;
503
-
504
- if (TyBitWidth == 64 ) {
505
- Value *const ExtractLo = B.CreateTrunc (PHI, B.getInt32Ty ());
506
- Value *const ExtractHi =
507
- B.CreateTrunc (B.CreateLShr (PHI, B.getInt64 (32 )), B.getInt32Ty ());
508
- CallInst *const ReadFirstLaneLo =
509
- B.CreateIntrinsic (Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
510
- CallInst *const ReadFirstLaneHi =
511
- B.CreateIntrinsic (Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
512
- Value *const PartialInsert = B.CreateInsertElement (
513
- UndefValue::get (VecTy), ReadFirstLaneLo, B.getInt32 (0 ));
514
- Value *const Insert =
515
- B.CreateInsertElement (PartialInsert, ReadFirstLaneHi, B.getInt32 (1 ));
516
- BroadcastI = B.CreateBitCast (Insert, Ty);
517
- } else if (TyBitWidth == 32 ) {
518
-
519
- BroadcastI = B.CreateIntrinsic (Intrinsic::amdgcn_readfirstlane, {}, PHI);
520
- } else {
521
- llvm_unreachable (" Unhandled atomic bit width" );
522
- }
494
+ const bool NeedResult = !I.use_empty ();
495
+ if (NeedResult) {
496
+ // Create a PHI node to get our new atomic result into the exit block.
497
+ PHINode *const PHI = B.CreatePHI (Ty, 2 );
498
+ PHI->addIncoming (UndefValue::get (Ty), EntryBB);
499
+ PHI->addIncoming (NewI, SingleLaneTerminator->getParent ());
523
500
524
- // Now that we have the result of our single atomic operation, we need to
525
- // get our individual lane's slice into the result. We use the lane offset we
526
- // previously calculated combined with the atomic result value we got from the
527
- // first lane, to get our lane's index into the atomic result.
528
- Value *LaneOffset = nullptr ;
529
- if (ValDivergent) {
530
- LaneOffset = B.CreateIntrinsic (Intrinsic::amdgcn_wwm, Ty, ExclScan);
531
- } else {
532
- switch (Op) {
533
- default :
534
- llvm_unreachable (" Unhandled atomic op" );
535
- case AtomicRMWInst::Add:
536
- case AtomicRMWInst::Sub:
537
- LaneOffset = B.CreateMul (V, Mbcnt);
538
- break ;
539
- case AtomicRMWInst::And:
540
- case AtomicRMWInst::Or:
541
- case AtomicRMWInst::Max:
542
- case AtomicRMWInst::Min:
543
- case AtomicRMWInst::UMax:
544
- case AtomicRMWInst::UMin:
545
- LaneOffset = B.CreateSelect (Cond, Identity, V);
546
- break ;
547
- case AtomicRMWInst::Xor:
548
- LaneOffset = B.CreateMul (V, B.CreateAnd (Mbcnt, 1 ));
549
- break ;
501
+ // We need to broadcast the value from the lowest active lane (the first
502
+ // lane) to all other lanes in the wavefront. We use an intrinsic for this,
503
+ // but have to handle 64-bit broadcasts with two calls to this intrinsic.
504
+ Value *BroadcastI = nullptr ;
505
+
506
+ if (TyBitWidth == 64 ) {
507
+ Value *const ExtractLo = B.CreateTrunc (PHI, B.getInt32Ty ());
508
+ Value *const ExtractHi =
509
+ B.CreateTrunc (B.CreateLShr (PHI, B.getInt64 (32 )), B.getInt32Ty ());
510
+ CallInst *const ReadFirstLaneLo =
511
+ B.CreateIntrinsic (Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
512
+ CallInst *const ReadFirstLaneHi =
513
+ B.CreateIntrinsic (Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
514
+ Value *const PartialInsert = B.CreateInsertElement (
515
+ UndefValue::get (VecTy), ReadFirstLaneLo, B.getInt32 (0 ));
516
+ Value *const Insert =
517
+ B.CreateInsertElement (PartialInsert, ReadFirstLaneHi, B.getInt32 (1 ));
518
+ BroadcastI = B.CreateBitCast (Insert, Ty);
519
+ } else if (TyBitWidth == 32 ) {
520
+
521
+ BroadcastI = B.CreateIntrinsic (Intrinsic::amdgcn_readfirstlane, {}, PHI);
522
+ } else {
523
+ llvm_unreachable (" Unhandled atomic bit width" );
550
524
}
551
- }
552
- Value *const Result = buildNonAtomicBinOp (B, Op, BroadcastI, LaneOffset);
553
525
554
- if (IsPixelShader) {
555
- // Need a final PHI to reconverge to above the helper lane branch mask.
556
- B.SetInsertPoint (PixelExitBB->getFirstNonPHI ());
526
+ // Now that we have the result of our single atomic operation, we need to
527
+ // get our individual lane's slice into the result. We use the lane offset
528
+ // we previously calculated combined with the atomic result value we got
529
+ // from the first lane, to get our lane's index into the atomic result.
530
+ Value *LaneOffset = nullptr ;
531
+ if (ValDivergent) {
532
+ LaneOffset = B.CreateIntrinsic (Intrinsic::amdgcn_wwm, Ty, ExclScan);
533
+ } else {
534
+ switch (Op) {
535
+ default :
536
+ llvm_unreachable (" Unhandled atomic op" );
537
+ case AtomicRMWInst::Add:
538
+ case AtomicRMWInst::Sub:
539
+ LaneOffset = B.CreateMul (V, Mbcnt);
540
+ break ;
541
+ case AtomicRMWInst::And:
542
+ case AtomicRMWInst::Or:
543
+ case AtomicRMWInst::Max:
544
+ case AtomicRMWInst::Min:
545
+ case AtomicRMWInst::UMax:
546
+ case AtomicRMWInst::UMin:
547
+ LaneOffset = B.CreateSelect (Cond, Identity, V);
548
+ break ;
549
+ case AtomicRMWInst::Xor:
550
+ LaneOffset = B.CreateMul (V, B.CreateAnd (Mbcnt, 1 ));
551
+ break ;
552
+ }
553
+ }
554
+ Value *const Result = buildNonAtomicBinOp (B, Op, BroadcastI, LaneOffset);
557
555
558
- PHINode *const PHI = B.CreatePHI (Ty, 2 );
559
- PHI->addIncoming (UndefValue::get (Ty), PixelEntryBB);
560
- PHI->addIncoming (Result, I.getParent ());
561
- I.replaceAllUsesWith (PHI);
562
- } else {
563
- // Replace the original atomic instruction with the new one.
564
- I.replaceAllUsesWith (Result);
556
+ if (IsPixelShader) {
557
+ // Need a final PHI to reconverge to above the helper lane branch mask.
558
+ B.SetInsertPoint (PixelExitBB->getFirstNonPHI ());
559
+
560
+ PHINode *const PHI = B.CreatePHI (Ty, 2 );
561
+ PHI->addIncoming (UndefValue::get (Ty), PixelEntryBB);
562
+ PHI->addIncoming (Result, I.getParent ());
563
+ I.replaceAllUsesWith (PHI);
564
+ } else {
565
+ // Replace the original atomic instruction with the new one.
566
+ I.replaceAllUsesWith (Result);
567
+ }
565
568
}
566
569
567
570
// And delete the original.
0 commit comments