@@ -16062,26 +16062,21 @@ bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
16062
16062
SNaN, Depth);
16063
16063
}
16064
16064
16065
- #if 0
16066
- // FIXME: This should be checked before unsafe fp atomics are enabled
16067
- // Global FP atomic instructions have a hardcoded FP mode and do not support
16068
- // FP32 denormals, and only support v2f16 denormals.
16069
- static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
16065
+ // On older subtargets, global FP atomic instructions have a hardcoded FP mode
16066
+ // and do not support FP32 denormals, and only support v2f16/f64 denormals.
16067
+ static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW) {
16068
+ if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
16069
+ return true;
16070
+
16070
16071
const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
16071
- auto DenormMode = RMW->getParent()->getParent()->getDenormalMode(Flt);
16072
- if (&Flt == &APFloat::IEEEsingle())
16073
- return DenormMode == DenormalMode::getPreserveSign();
16074
- return DenormMode == DenormalMode::getIEEE();
16075
- }
16076
- #endif
16072
+ auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
16073
+ if (DenormMode == DenormalMode::getPreserveSign())
16074
+ return true;
16077
16075
16078
- // The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
16079
- // floating point atomic instructions. May generate more efficient code,
16080
- // but may not respect rounding and denormal modes, and may give incorrect
16081
- // results for certain memory destinations.
16082
- bool unsafeFPAtomicsDisabled(Function *F) {
16083
- return F->getFnAttribute("amdgpu-unsafe-fp-atomics").getValueAsString() !=
16084
- "true";
16076
+ // TODO: Remove this fallback on the amdgpu-unsafe-fp-atomics attribute.
16077
+ return RMW->getFunction()
16078
+ ->getFnAttribute("amdgpu-unsafe-fp-atomics")
16079
+ .getValueAsBool();
16085
16080
}
16086
16081
16087
16082
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
@@ -16210,82 +16205,85 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
16210
16205
return AtomicExpansionKind::CmpXChg;
16211
16206
}
16212
16207
16213
- if (!AMDGPU::isFlatGlobalAddrSpace(AS) &&
16214
- AS != AMDGPUAS::BUFFER_FAT_POINTER)
16215
- return AtomicExpansionKind::CmpXChg;
16216
-
16217
- if (Subtarget->hasGFX940Insts() && (Ty->isFloatTy() || Ty->isDoubleTy()))
16218
- return AtomicExpansionKind::None;
16219
-
16220
- if (AS == AMDGPUAS::FLAT_ADDRESS) {
16221
- // gfx940, gfx12
16222
- // FIXME: Needs to account for no fine-grained memory
16223
- if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty))
16224
- return AtomicExpansionKind::None;
16225
- } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
16226
- // gfx90a, gfx940, gfx12
16227
- // FIXME: Needs to account for no fine-grained memory
16228
- if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16229
- return AtomicExpansionKind::None;
16230
-
16231
- // gfx940, gfx12
16232
- // FIXME: Needs to account for no fine-grained memory
16233
- if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty))
16234
- return AtomicExpansionKind::None;
16235
- } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16236
- // gfx90a, gfx940, gfx12
16237
- // FIXME: Needs to account for no fine-grained memory
16238
- if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16239
- return AtomicExpansionKind::None;
16240
-
16241
- // While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for
16242
- // buffer. gfx12 does have the buffer version.
16243
- if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isBFloat2(Ty))
16244
- return AtomicExpansionKind::None;
16245
- }
16246
-
16247
- if (unsafeFPAtomicsDisabled(RMW->getFunction()))
16248
- return AtomicExpansionKind::CmpXChg;
16249
-
16250
- // Always expand system scope fp atomics.
16251
- if (HasSystemScope)
16208
+ // LDS atomics respect the denormal mode from the mode register.
16209
+ //
16210
+ // Traditionally f32 global/buffer memory atomics would unconditionally
16211
+ // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
16212
+ // flush.
16213
+ //
16214
+ // On targets with flat atomic fadd, denormals would flush depending on
16215
+ // whether the target address resides in LDS or global memory. We consider
16216
+ // this flat-maybe-flush as will-flush.
16217
+ if (Ty->isFloatTy() &&
16218
+ !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
16219
+ !atomicIgnoresDenormalModeOrFPModeIsFTZ(RMW))
16252
16220
return AtomicExpansionKind::CmpXChg;
16253
16221
16254
- // global and flat atomic fadd f64: gfx90a, gfx940.
16255
- if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
16256
- return ReportUnsafeHWInst(AtomicExpansionKind::None);
16222
+ // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
16223
+ // safe. The message phrasing also should be better.
16224
+ if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
16225
+ if (AS == AMDGPUAS::FLAT_ADDRESS) {
16226
+ // gfx940, gfx12
16227
+ if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty))
16228
+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
16229
+ } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
16230
+ // gfx90a, gfx940, gfx12
16231
+ if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16232
+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
16257
16233
16258
- if (AS != AMDGPUAS::FLAT_ADDRESS) {
16259
- if (Ty->isFloatTy()) {
16260
- // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+.
16261
- if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16234
+ // gfx940, gfx12
16235
+ if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty))
16262
16236
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16263
- // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
16264
- if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16237
+ } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16238
+ // gfx90a, gfx940, gfx12
16239
+ if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16265
16240
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16266
- } else {
16267
- // gfx908
16268
- if (RMW->use_empty() &&
16269
- Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() && isHalf2(Ty))
16241
+
16242
+ // While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for
16243
+ // buffer. gfx12 does have the buffer version.
16244
+ if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isBFloat2(Ty))
16270
16245
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16271
16246
}
16272
- }
16273
16247
16274
- // flat atomic fadd f32: gfx940, gfx11+.
16275
- if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16276
- if (Subtarget->hasFlatAtomicFaddF32Inst())
16248
+ // global and flat atomic fadd f64: gfx90a, gfx940.
16249
+ if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
16277
16250
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16278
16251
16279
- // If it is in flat address space, and the type is float, we will try to
16280
- // expand it, if the target supports global and lds atomic fadd. The
16281
- // reason we need that is, in the expansion, we emit the check of address
16282
- // space. If it is in global address space, we emit the global atomic
16283
- // fadd; if it is in shared address space, we emit the LDS atomic fadd.
16284
- if (Subtarget->hasLDSFPAtomicAddF32()) {
16285
- if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16286
- return AtomicExpansionKind::Expand;
16287
- if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16288
- return AtomicExpansionKind::Expand;
16252
+ if (AS != AMDGPUAS::FLAT_ADDRESS) {
16253
+ if (Ty->isFloatTy()) {
16254
+ // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940,
16255
+ // gfx11+.
16256
+ if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16257
+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
16258
+ // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
16259
+ if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16260
+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
16261
+ } else {
16262
+ // gfx908
16263
+ if (RMW->use_empty() &&
16264
+ Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
16265
+ isHalf2(Ty))
16266
+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
16267
+ }
16268
+ }
16269
+
16270
+ // flat atomic fadd f32: gfx940, gfx11+.
16271
+ if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16272
+ if (Subtarget->hasFlatAtomicFaddF32Inst())
16273
+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
16274
+
16275
+ // If it is in flat address space, and the type is float, we will try to
16276
+ // expand it, if the target supports global and lds atomic fadd. The
16277
+ // reason we need that is, in the expansion, we emit the check of
16278
+ // address space. If it is in global address space, we emit the global
16279
+ // atomic fadd; if it is in shared address space, we emit the LDS atomic
16280
+ // fadd.
16281
+ if (Subtarget->hasLDSFPAtomicAddF32()) {
16282
+ if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16283
+ return AtomicExpansionKind::Expand;
16284
+ if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16285
+ return AtomicExpansionKind::Expand;
16286
+ }
16289
16287
}
16290
16288
}
16291
16289
0 commit comments