[NVPTX] Add Volta Load/Store Atomics (.relaxed, .acquire, .release) and Volatile (.mmio/.volatile) support #99709
Conversation
@llvm/pr-subscribers-backend-nvptx

Author: None (gonzalobg)

Changes

Followup to #98022 which broke support for LLVM `unordered` atomic ordering. This PR lowers `atomic unordered` to PTX volatile operations to preserve atomicity, and adds a bunch of tests for this ordering to the NVPTX backend (we had none).

Patch is 124.64 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/99709.diff

7 Files Affected:
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
index 380d878c1f532..a004d64c21cc6 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
@@ -227,9 +227,33 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
if (Modifier) {
const MCOperand &MO = MI->getOperand(OpNum);
int Imm = (int) MO.getImm();
- if (!strcmp(Modifier, "volatile")) {
- if (Imm)
+ if (!strcmp(Modifier, "sem")) {
+ switch (Imm) {
+ case NVPTX::PTXLdStInstCode::NotAtomic:
+ break;
+ case NVPTX::PTXLdStInstCode::Volatile:
O << ".volatile";
+ break;
+ case NVPTX::PTXLdStInstCode::Relaxed:
+ O << ".relaxed.sys";
+ break;
+ case NVPTX::PTXLdStInstCode::Acquire:
+ O << ".acquire.sys";
+ break;
+ case NVPTX::PTXLdStInstCode::Release:
+ O << ".release.sys";
+ break;
+ case NVPTX::PTXLdStInstCode::RelaxedMMIO:
+ O << ".mmio.relaxed.sys";
+ break;
+ default:
+ SmallString<256> Msg;
+ raw_svector_ostream OS(Msg);
+ OS << "NVPTX LdStCode Printer does not support \"" << Imm
+ << "\" sem modifier.";
+ report_fatal_error(OS.str());
+ break;
+ }
} else if (!strcmp(Modifier, "addsp")) {
switch (Imm) {
case NVPTX::PTXLdStInstCode::GLOBAL:
diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index b0cb24c63c3ce..3c7167b157025 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -107,6 +107,14 @@ enum LoadStore {
};
namespace PTXLdStInstCode {
+enum MemorySemantic {
+ NotAtomic = 0, // PTX calls these: "Weak"
+ Volatile = 1,
+ Relaxed = 2,
+ Acquire = 3,
+ Release = 4,
+ RelaxedMMIO = 5
+};
enum AddressSpace {
GENERIC = 0,
GLOBAL = 1,
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 11193c11ede3b..e6290649513fa 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -714,6 +714,175 @@ static unsigned int getCodeAddrSpace(MemSDNode *N) {
return NVPTX::PTXLdStInstCode::GENERIC;
}
+static unsigned int getCodeMemorySemantic(MemSDNode *N,
+ const NVPTXSubtarget *Subtarget) {
+ AtomicOrdering Ordering = N->getSuccessOrdering();
+ auto CodeAddrSpace = getCodeAddrSpace(N);
+
+ bool HasMemoryOrdering = Subtarget->hasMemoryOrdering();
+ bool HasRelaxedMMIO = Subtarget->hasRelaxedMMIO();
+
+ // TODO: lowering for SequentiallyConsistent Operations: for now, we error.
+ // TODO: lowering for AcquireRelease Operations: for now, we error.
+ //
+
+ // clang-format off
+
+ // Lowering for non-SequentiallyConsistent Operations
+ //
+ // | Atomic | Volatile | Statespace | PTX sm_60- | PTX sm_70+ |
+ // |---------|----------|--------------------|------------|------------------------------|
+ // | No | No | All | plain | .weak |
+ // | No | Yes | Generic,Shared, | .volatile | .volatile |
+ // | | | Global [0] | | |
+ // | No | Yes | Local,Const,Param | plain [1] | .weak [1] |
+ // | Unorder | Yes/No | Generic,Shared, | .volatile | .volatile |
+ // | | | Global [0] | | |
+ // | Relaxed | No | Generic,Shared, | .volatile | <atomic sem> |
+ // | | | Global [0] | | |
+ // | Other | No | Generic,Shared, | Error [2] | <atomic sem> |
+ // | | | Global [0] | | |
+ // | Yes | No | Local,Const,Param | plain [1] | .weak [1] |
+ // | Relaxed | Yes | Generic,Shared [0] | .volatile | .volatile |
+ // | Relaxed | Yes | Global [0] | .volatile | .mmio.relaxed.sys (PTX 8.2+) |
+ // | | | | | or .volatile (PTX 8.1-) |
+ // | Relaxed | Yes | Local,Const,Param | plain [1] | .weak [1] |
+ // | Other | Yes | Generic, Shared, | Error [2] | <atomic sem> [3] |
+ // | | | / Global [0] | | |
+
+ // clang-format on
+
+ // [0]: volatile and atomics are only supported on global or shared
+ // memory locations, accessed via generic/shared/global pointers.
+ // MMIO is only supported on global memory locations,
+ // accessed via generic/global pointers.
+ // TODO: Implement MMIO access via generic pointer to global.
+ // Currently implemented for global pointers only.
+
+ // [1]: Lowering volatile/atomic operations to non-volatile/non-atomic
+ // PTX instructions fails to preserve their C++ side-effects.
+ //
+ // Example (https://github.com/llvm/llvm-project/issues/62057):
+ //
+ // void example() {
+ // std::atomic<bool> True = true;
+ // while (True.load(std::memory_order_relaxed));
+ // }
+ //
+ // A C++ program that calls "example" is well-defined: the infinite loop
+ // performs an atomic operation. By lowering volatile/atomics to
+ // "weak" memory operations, we are transforming the above into:
+ //
+ // void undefined_behavior() {
+ // bool True = true;
+ // while (True);
+ // }
+ //
+ // which exhibits undefined behavior in both C++ and PTX.
+ //
+ // Calling "example" in CUDA C++ compiled for sm_60- exhibits undefined
+ // behavior due to lack of Independent Forward Progress. Lowering these
+ // to weak memory operations in sm_60- is therefore fine.
+ //
+ // TODO: lower atomic and volatile operations to memory locations
+ // in local, const, and param to two PTX instructions in sm_70+:
+ // - the "weak" memory instruction we are currently lowering to, and
+ // - some other instruction that preserves the side-effect, e.g.,
+ // a dead dummy volatile load.
+
+ if (CodeAddrSpace == NVPTX::PTXLdStInstCode::LOCAL ||
+ CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT ||
+ CodeAddrSpace == NVPTX::PTXLdStInstCode::PARAM) {
+ return NVPTX::PTXLdStInstCode::NotAtomic;
+ }
+
+ // [2]: Atomics with Ordering different than Unordered or Relaxed are not
+ // supported on sm_60 and older; this includes volatile atomics.
+ if (!(Ordering == AtomicOrdering::NotAtomic ||
+ Ordering == AtomicOrdering::Unordered ||
+ Ordering == AtomicOrdering::Monotonic) &&
+ !HasMemoryOrdering) {
+ SmallString<256> Msg;
+ raw_svector_ostream OS(Msg);
+ OS << "PTX does not support \"atomic\" for orderings different than"
+ "\"NotAtomic\" or \"Monotonic\" for sm_60 or older, but order is: \""
+ << toIRString(Ordering) << "\".";
+ report_fatal_error(OS.str());
+ }
+
+ // [3]: TODO: these should eventually use .mmio<.atomic sem>; for now we drop
+ // the volatile semantics and preserve the atomic ones.
+
+ // PTX volatile and PTX atomics are not available for statespace that differ
+ // from .generic, .global, or .shared. The behavior of PTX volatile and PTX
+ // atomics is undefined if the generic address does not refer to a .global or
+ // .shared memory location.
+ bool AddrGenericOrGlobalOrShared =
+ (CodeAddrSpace == NVPTX::PTXLdStInstCode::GENERIC ||
+ CodeAddrSpace == NVPTX::PTXLdStInstCode::GLOBAL ||
+ CodeAddrSpace == NVPTX::PTXLdStInstCode::SHARED);
+ bool UseRelaxedMMIO =
+ HasRelaxedMMIO && CodeAddrSpace == NVPTX::PTXLdStInstCode::GLOBAL;
+
+ switch (Ordering) {
+ case AtomicOrdering::NotAtomic:
+ return N->isVolatile() && AddrGenericOrGlobalOrShared
+ ? NVPTX::PTXLdStInstCode::Volatile
+ : NVPTX::PTXLdStInstCode::NotAtomic;
+ case AtomicOrdering::Unordered:
+ return AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Volatile
+ : NVPTX::PTXLdStInstCode::NotAtomic;
+ case AtomicOrdering::Monotonic:
+ if (N->isVolatile())
+ return UseRelaxedMMIO ? NVPTX::PTXLdStInstCode::RelaxedMMIO
+ : AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Volatile
+ : NVPTX::PTXLdStInstCode::NotAtomic;
+ else
+ return HasMemoryOrdering ? NVPTX::PTXLdStInstCode::Relaxed
+ : AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Volatile
+ : NVPTX::PTXLdStInstCode::NotAtomic;
+ case AtomicOrdering::Acquire:
+ if (!N->readMem()) {
+ SmallString<256> Msg;
+ raw_svector_ostream OS(Msg);
+ OS << "PTX only supports Acquire Ordering on reads: "
+ << N->getOperationName();
+ N->print(OS);
+ report_fatal_error(OS.str());
+ }
+ return AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Acquire
+ : NVPTX::PTXLdStInstCode::NotAtomic;
+ case AtomicOrdering::Release:
+ if (!N->writeMem()) {
+ SmallString<256> Msg;
+ raw_svector_ostream OS(Msg);
+ OS << "PTX only supports Release Ordering on writes: "
+ << N->getOperationName();
+ N->print(OS);
+ report_fatal_error(OS.str());
+ }
+ return AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Release
+ : NVPTX::PTXLdStInstCode::NotAtomic;
+ case AtomicOrdering::AcquireRelease: {
+ SmallString<256> Msg;
+ raw_svector_ostream OS(Msg);
+ OS << "PTX only supports AcquireRelease Ordering on read-modify-write: "
+ << N->getOperationName();
+ N->print(OS);
+ report_fatal_error(OS.str());
+ }
+ case AtomicOrdering::SequentiallyConsistent:
+ // TODO: support AcquireRelease and SequentiallyConsistent
+ SmallString<256> Msg;
+ raw_svector_ostream OS(Msg);
+ OS << "NVPTX backend does not support AtomicOrdering \""
+ << toIRString(Ordering) << "\" yet.";
+ report_fatal_error(OS.str());
+ }
+
+ llvm_unreachable("unexpected unhandled case");
+}
+
static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
unsigned CodeAddrSpace, MachineFunction *F) {
// We use ldg (i.e. ld.global.nc) for invariant loads from the global address
@@ -916,32 +1085,18 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
if (!LoadedVT.isSimple())
return false;
- AtomicOrdering Ordering = LD->getSuccessOrdering();
- // In order to lower atomic loads with stronger guarantees we would need to
- // use load.acquire or insert fences. However these features were only added
- // with PTX ISA 6.0 / sm_70.
- // TODO: Check if we can actually use the new instructions and implement them.
- if (isStrongerThanMonotonic(Ordering))
- return false;
-
// Address Space Setting
unsigned int CodeAddrSpace = getCodeAddrSpace(LD);
if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) {
return tryLDGLDU(N);
}
+ // Memory Semantic Setting
+ unsigned int CodeMemorySem = getCodeMemorySemantic(LD, Subtarget);
+
unsigned int PointerSize =
CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());
- // Volatile Setting
- // - .volatile is only available for .global and .shared
- // - .volatile has the same memory synchronization semantics as .relaxed.sys
- bool isVolatile = LD->isVolatile() || Ordering == AtomicOrdering::Monotonic;
- if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
- CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
- CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
- isVolatile = false;
-
// Type Setting: fromType + fromTypeWidth
//
// Sign : ISD::SEXTLOAD
@@ -982,9 +1137,13 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
if (!Opcode)
return false;
- SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
- getI32Imm(vecType, dl), getI32Imm(fromType, dl),
- getI32Imm(fromTypeWidth, dl), Addr, Chain };
+ SDValue Ops[] = {getI32Imm(CodeMemorySem, dl),
+ getI32Imm(CodeAddrSpace, dl),
+ getI32Imm(vecType, dl),
+ getI32Imm(fromType, dl),
+ getI32Imm(fromTypeWidth, dl),
+ Addr,
+ Chain};
NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
} else if (PointerSize == 64 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
: SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
@@ -993,9 +1152,14 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
if (!Opcode)
return false;
- SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
- getI32Imm(vecType, dl), getI32Imm(fromType, dl),
- getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
+ SDValue Ops[] = {getI32Imm(CodeMemorySem, dl),
+ getI32Imm(CodeAddrSpace, dl),
+ getI32Imm(vecType, dl),
+ getI32Imm(fromType, dl),
+ getI32Imm(fromTypeWidth, dl),
+ Base,
+ Offset,
+ Chain};
NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
} else if (PointerSize == 64 ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
: SelectADDRri(N1.getNode(), N1, Base, Offset)) {
@@ -1010,9 +1174,14 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
if (!Opcode)
return false;
- SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
- getI32Imm(vecType, dl), getI32Imm(fromType, dl),
- getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
+ SDValue Ops[] = {getI32Imm(CodeMemorySem, dl),
+ getI32Imm(CodeAddrSpace, dl),
+ getI32Imm(vecType, dl),
+ getI32Imm(fromType, dl),
+ getI32Imm(fromTypeWidth, dl),
+ Base,
+ Offset,
+ Chain};
NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
} else {
if (PointerSize == 64)
@@ -1026,9 +1195,13 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
if (!Opcode)
return false;
- SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
- getI32Imm(vecType, dl), getI32Imm(fromType, dl),
- getI32Imm(fromTypeWidth, dl), N1, Chain };
+ SDValue Ops[] = {getI32Imm(CodeMemorySem, dl),
+ getI32Imm(CodeAddrSpace, dl),
+ getI32Imm(vecType, dl),
+ getI32Imm(fromType, dl),
+ getI32Imm(fromTypeWidth, dl),
+ N1,
+ Chain};
NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
}
@@ -1065,13 +1238,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
unsigned int PointerSize =
CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());
- // Volatile Setting
- // - .volatile is only availalble for .global and .shared
- bool IsVolatile = MemSD->isVolatile();
- if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
- CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
- CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
- IsVolatile = false;
+ // Memory Semantic Setting
+ unsigned int CodeMemorySem = getCodeMemorySemantic(MemSD, Subtarget);
// Vector Setting
MVT SimpleVT = LoadedVT.getSimpleVT();
@@ -1138,9 +1306,13 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
}
if (!Opcode)
return false;
- SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
- getI32Imm(VecType, DL), getI32Imm(FromType, DL),
- getI32Imm(FromTypeWidth, DL), Addr, Chain };
+ SDValue Ops[] = {getI32Imm(CodeMemorySem, DL),
+ getI32Imm(CodeAddrSpace, DL),
+ getI32Imm(VecType, DL),
+ getI32Imm(FromType, DL),
+ getI32Imm(FromTypeWidth, DL),
+ Addr,
+ Chain};
LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
} else if (PointerSize == 64
? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
@@ -1163,9 +1335,14 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
}
if (!Opcode)
return false;
- SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
- getI32Imm(VecType, DL), getI32Imm(FromType, DL),
- getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
+ SDValue Ops[] = {getI32Imm(CodeMemorySem, DL),
+ getI32Imm(CodeAddrSpace, DL),
+ getI32Imm(VecType, DL),
+ getI32Imm(FromType, DL),
+ getI32Imm(FromTypeWidth, DL),
+ Base,
+ Offset,
+ Chain};
LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
} else if (PointerSize == 64
? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
@@ -1208,9 +1385,14 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
}
if (!Opcode)
return false;
- SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
- getI32Imm(VecType, DL), getI32Imm(FromType, DL),
- getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
+ SDValue Ops[] = {getI32Imm(CodeMemorySem, DL),
+ getI32Imm(CodeAddrSpace, DL),
+ getI32Imm(VecType, DL),
+ getI32Imm(FromType, DL),
+ getI32Imm(FromTypeWidth, DL),
+ Base,
+ Offset,
+ Chain};
LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
} else {
@@ -1253,9 +1435,13 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
}
if (!Opcode)
return false;
- SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
- getI32Imm(VecType, DL), getI32Imm(FromType, DL),
- getI32Imm(FromTypeWidth, DL), Op1, Chain };
+ SDValue Ops[] = {getI32Imm(CodeMemorySem, DL),
+ getI32Imm(CodeAddrSpace, DL),
+ getI32Imm(VecType, DL),
+ getI32Imm(FromType, DL),
+ getI32Imm(FromTypeWidth, DL),
+ Op1,
+ Chain};
LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
}
@@ -1698,27 +1884,13 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
if (!StoreVT.isSimple())
return false;
- AtomicOrdering Ordering = ST->getSuccessOrdering();
- // In order to lower atomic loads with stronger guarantees we would need to
- // use store.release or insert fences. However these features were only added
- // with PTX ISA 6.0 / sm_70.
- // TODO: Check if we can actually use the new instructions and implement them.
- if (isStrongerThanMonotonic(Ordering))
- return false;
-
// Address Space Setting
unsigned int CodeAddrSpace = getCodeAddrSpace(ST);
unsigned int PointerSize =
CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace());
- // Volatile Setting
- // - .volatile is only available for .global and .shared
- // - .volatile has the sa...
[truncated]
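To make the new mapping concrete, here is a minimal, hypothetical LLVM IR sample (an illustration, not one of the PR's test files), annotated with the PTX qualifier each load should select per the table in `getCodeMemorySemantic`, assuming an sm_70+ target with PTX 6.3+ and generic pointers that refer to global or shared memory:

```llvm
define i32 @orderings(ptr %p, ptr addrspace(1) %g) {
  ; Not atomic, not volatile: stays a weak load (no qualifier printed).
  %a = load i32, ptr %p, align 4
  ; Unordered atomic: selects .volatile to preserve atomicity.
  %b = load atomic i32, ptr %p unordered, align 4
  ; Monotonic (relaxed) atomic: .relaxed.sys on sm_70+;
  ; .volatile on sm_60 and older.
  %c = load atomic i32, ptr %p monotonic, align 4
  ; Acquire atomic: .acquire.sys on sm_70+; a fatal error on sm_60-.
  %d = load atomic i32, ptr %p acquire, align 4
  ; Volatile relaxed atomic on global: .mmio.relaxed.sys with PTX 8.2+,
  ; otherwise .volatile.
  %e = load atomic volatile i32, ptr addrspace(1) %g monotonic, align 4
  ret i32 %b
}
```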
Force-pushed from cfb61fb to 57be2e4, then from 57be2e4 to cbb14ca.
@Artem-B the last commit now adds support for `unordered`.
LGTM. Thank you for fixing this.
Thanks, please help merge :)
[NVPTX] Add Volta Load/Store Atomics (.relaxed, .acquire, .release) and Volatile (.mmio/.volatile) support (#99709)

Summary: Followup to #98022 which broke support for LLVM `unordered` atomic ordering. This PR lowers `atomic unordered` to PTX volatile operations to preserve atomicity, and adds a bunch of tests for this ordering to the NVPTX backend (we had none).

Differential Revision: https://phabricator.intern.facebook.com/D60250591
Followup to #98022 which broke support for LLVM `unordered` atomic ordering. This PR lowers `atomic unordered` to PTX volatile operations to preserve atomicity, and adds a bunch of tests for this ordering to the NVPTX backend (we had none).
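For completeness, a sketch of an ordering the backend still rejects after this change, matching the TODOs in `getCodeMemorySemantic` (hypothetical IR, assuming an sm_70 target):

```llvm
define i32 @seq_cst_load(ptr %p) {
  ; SequentiallyConsistent (and AcquireRelease) lowering is not implemented
  ; yet; getCodeMemorySemantic reports a fatal error for these orderings.
  %v = load atomic i32, ptr %p seq_cst, align 4
  ret i32 %v
}
```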