Skip to content

Commit 298500a

Browse files
committed
[AMDGPU] Save some work when an atomic op has no uses
Summary:
In the atomic optimizer, save doing a bunch of work and generating a bunch of dead IR in the fairly common case where the result of an atomic op (i.e. the value that was in memory before the atomic op was performed) is not used. NFC.

Reviewers: arsenm, dstuttard, tpr

Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, t-tye, hiraditya, jfb, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D64981

llvm-svn: 366667
1 parent 3d72a58 commit 298500a

File tree

1 file changed

+70
-67
lines changed

1 file changed

+70
-67
lines changed

llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp

Lines changed: 70 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -491,77 +491,80 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
491491
// original instruction.
492492
B.SetInsertPoint(&I);
493493

494-
// Create a PHI node to get our new atomic result into the exit block.
495-
PHINode *const PHI = B.CreatePHI(Ty, 2);
496-
PHI->addIncoming(UndefValue::get(Ty), EntryBB);
497-
PHI->addIncoming(NewI, SingleLaneTerminator->getParent());
498-
499-
// We need to broadcast the value who was the lowest active lane (the first
500-
// lane) to all other lanes in the wavefront. We use an intrinsic for this,
501-
// but have to handle 64-bit broadcasts with two calls to this intrinsic.
502-
Value *BroadcastI = nullptr;
503-
504-
if (TyBitWidth == 64) {
505-
Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty());
506-
Value *const ExtractHi =
507-
B.CreateTrunc(B.CreateLShr(PHI, B.getInt64(32)), B.getInt32Ty());
508-
CallInst *const ReadFirstLaneLo =
509-
B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
510-
CallInst *const ReadFirstLaneHi =
511-
B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
512-
Value *const PartialInsert = B.CreateInsertElement(
513-
UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
514-
Value *const Insert =
515-
B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
516-
BroadcastI = B.CreateBitCast(Insert, Ty);
517-
} else if (TyBitWidth == 32) {
518-
519-
BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
520-
} else {
521-
llvm_unreachable("Unhandled atomic bit width");
522-
}
494+
const bool NeedResult = !I.use_empty();
495+
if (NeedResult) {
496+
// Create a PHI node to get our new atomic result into the exit block.
497+
PHINode *const PHI = B.CreatePHI(Ty, 2);
498+
PHI->addIncoming(UndefValue::get(Ty), EntryBB);
499+
PHI->addIncoming(NewI, SingleLaneTerminator->getParent());
523500

524-
// Now that we have the result of our single atomic operation, we need to
525-
// get our individual lane's slice into the result. We use the lane offset we
526-
// previously calculated combined with the atomic result value we got from the
527-
// first lane, to get our lane's index into the atomic result.
528-
Value *LaneOffset = nullptr;
529-
if (ValDivergent) {
530-
LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, ExclScan);
531-
} else {
532-
switch (Op) {
533-
default:
534-
llvm_unreachable("Unhandled atomic op");
535-
case AtomicRMWInst::Add:
536-
case AtomicRMWInst::Sub:
537-
LaneOffset = B.CreateMul(V, Mbcnt);
538-
break;
539-
case AtomicRMWInst::And:
540-
case AtomicRMWInst::Or:
541-
case AtomicRMWInst::Max:
542-
case AtomicRMWInst::Min:
543-
case AtomicRMWInst::UMax:
544-
case AtomicRMWInst::UMin:
545-
LaneOffset = B.CreateSelect(Cond, Identity, V);
546-
break;
547-
case AtomicRMWInst::Xor:
548-
LaneOffset = B.CreateMul(V, B.CreateAnd(Mbcnt, 1));
549-
break;
501+
// We need to broadcast the value who was the lowest active lane (the first
502+
// lane) to all other lanes in the wavefront. We use an intrinsic for this,
503+
// but have to handle 64-bit broadcasts with two calls to this intrinsic.
504+
Value *BroadcastI = nullptr;
505+
506+
if (TyBitWidth == 64) {
507+
Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty());
508+
Value *const ExtractHi =
509+
B.CreateTrunc(B.CreateLShr(PHI, B.getInt64(32)), B.getInt32Ty());
510+
CallInst *const ReadFirstLaneLo =
511+
B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
512+
CallInst *const ReadFirstLaneHi =
513+
B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
514+
Value *const PartialInsert = B.CreateInsertElement(
515+
UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
516+
Value *const Insert =
517+
B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
518+
BroadcastI = B.CreateBitCast(Insert, Ty);
519+
} else if (TyBitWidth == 32) {
520+
521+
BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
522+
} else {
523+
llvm_unreachable("Unhandled atomic bit width");
550524
}
551-
}
552-
Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);
553525

554-
if (IsPixelShader) {
555-
// Need a final PHI to reconverge to above the helper lane branch mask.
556-
B.SetInsertPoint(PixelExitBB->getFirstNonPHI());
526+
// Now that we have the result of our single atomic operation, we need to
527+
// get our individual lane's slice into the result. We use the lane offset
528+
// we previously calculated combined with the atomic result value we got
529+
// from the first lane, to get our lane's index into the atomic result.
530+
Value *LaneOffset = nullptr;
531+
if (ValDivergent) {
532+
LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, ExclScan);
533+
} else {
534+
switch (Op) {
535+
default:
536+
llvm_unreachable("Unhandled atomic op");
537+
case AtomicRMWInst::Add:
538+
case AtomicRMWInst::Sub:
539+
LaneOffset = B.CreateMul(V, Mbcnt);
540+
break;
541+
case AtomicRMWInst::And:
542+
case AtomicRMWInst::Or:
543+
case AtomicRMWInst::Max:
544+
case AtomicRMWInst::Min:
545+
case AtomicRMWInst::UMax:
546+
case AtomicRMWInst::UMin:
547+
LaneOffset = B.CreateSelect(Cond, Identity, V);
548+
break;
549+
case AtomicRMWInst::Xor:
550+
LaneOffset = B.CreateMul(V, B.CreateAnd(Mbcnt, 1));
551+
break;
552+
}
553+
}
554+
Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);
557555

558-
PHINode *const PHI = B.CreatePHI(Ty, 2);
559-
PHI->addIncoming(UndefValue::get(Ty), PixelEntryBB);
560-
PHI->addIncoming(Result, I.getParent());
561-
I.replaceAllUsesWith(PHI);
562-
} else {
563-
// Replace the original atomic instruction with the new one.
564-
I.replaceAllUsesWith(Result);
556+
if (IsPixelShader) {
557+
// Need a final PHI to reconverge to above the helper lane branch mask.
558+
B.SetInsertPoint(PixelExitBB->getFirstNonPHI());
559+
560+
PHINode *const PHI = B.CreatePHI(Ty, 2);
561+
PHI->addIncoming(UndefValue::get(Ty), PixelEntryBB);
562+
PHI->addIncoming(Result, I.getParent());
563+
I.replaceAllUsesWith(PHI);
564+
} else {
565+
// Replace the original atomic instruction with the new one.
566+
I.replaceAllUsesWith(Result);
567+
}
565568
}
566569

567570
// And delete the original.

0 commit comments

Comments (0)