Skip to content

[OpenMP][AMDGPU] Adapt dynamic callstack sizes to HIP behavior #74080

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 10 additions & 7 deletions llvm/lib/Target/AMDGPU/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -1043,7 +1043,8 @@ def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
FeatureWavefrontSize64, FeatureSMemTimeInst, FeatureMadMacF32Insts,
FeatureDsSrc2Insts, FeatureLDSBankCount32, FeatureMovrel,
FeatureTrigReducedRange, FeatureExtendedImageInsts, FeatureImageInsts,
FeatureGDS, FeatureGWS, FeatureDefaultComponentZero
FeatureGDS, FeatureGWS, FeatureDefaultComponentZero,
FeatureMaxWaveScratchSize13x256
]
>;

Expand All @@ -1054,7 +1055,8 @@ def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS",
FeatureCIInsts, FeatureMovrel, FeatureTrigReducedRange,
FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts,
FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureUnalignedBufferAccess,
FeatureImageInsts, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero
FeatureImageInsts, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero,
FeatureMaxWaveScratchSize13x256
]
>;

Expand All @@ -1070,7 +1072,7 @@ def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts,
FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureFastDenormalF32,
FeatureUnalignedBufferAccess, FeatureImageInsts, FeatureGDS, FeatureGWS,
FeatureDefaultComponentZero
FeatureDefaultComponentZero, FeatureMaxWaveScratchSize13x256
]
>;

Expand All @@ -1088,7 +1090,8 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9",
FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16,
FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureSupportsXNACK,
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess,
FeatureNegativeScratchOffsetBug, FeatureGWS, FeatureDefaultComponentZero
FeatureNegativeScratchOffsetBug, FeatureGWS, FeatureDefaultComponentZero,
FeatureMaxWaveScratchSize13x256
]
>;

Expand All @@ -1109,7 +1112,7 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10",
FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureG16,
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureImageInsts,
FeatureGDS, FeatureGWS, FeatureDefaultComponentZero,
FeatureMaxHardClauseLength63
FeatureMaxHardClauseLength63, FeatureMaxWaveScratchSize13x256
]
>;

Expand All @@ -1130,7 +1133,7 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11",
FeatureA16, FeatureFastDenormalF32, FeatureG16,
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureGDS,
FeatureGWS, FeatureDefaultComponentZero,
FeatureMaxHardClauseLength32
FeatureMaxHardClauseLength32, FeatureMaxWaveScratchSize15x64
]
>;

Expand All @@ -1151,7 +1154,7 @@ def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12",
FeatureA16, FeatureFastDenormalF32, FeatureG16,
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess,
FeatureTrue16BitInsts, FeatureDefaultComponentBroadcast,
FeatureMaxHardClauseLength32
FeatureMaxHardClauseLength32, FeatureMaxWaveScratchSize18x64
]
>;

Expand Down
15 changes: 15 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
Original file line number Diff line number Diff line change
Expand Up @@ -51,3 +51,18 @@ def FeaturePromoteAlloca : SubtargetFeature <"promote-alloca",
"Enable promote alloca pass"
>;

class SubtargetFeatureMaxWaveScratchSize <int Value, int FieldSize,
int Elements> :
SubtargetFeature<
"maxwavescratchsize"#FieldSize#"x"#Elements,
"MaxWaveScratchSize",
!cast<string>(Value),
"The dynamic callstack size in bytes"
>;

def FeatureMaxWaveScratchSize13x256 :
SubtargetFeatureMaxWaveScratchSize<8387584, 13, 256>;
def FeatureMaxWaveScratchSize15x64 :
SubtargetFeatureMaxWaveScratchSize<8388352, 15, 64>;
def FeatureMaxWaveScratchSize18x64 :
SubtargetFeatureMaxWaveScratchSize<67108608, 18, 64>;
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ class AMDGPUSubtarget {
unsigned MaxWavesPerEU = 10;
unsigned LocalMemorySize = 0;
unsigned AddressableLocalMemorySize = 0;
unsigned MaxWaveScratchSize = 0;
char WavefrontSizeLog2 = 0;

public:
Expand Down Expand Up @@ -234,6 +235,8 @@ class AMDGPUSubtarget {
return AddressableLocalMemorySize;
}

unsigned getMaxWaveScratchSize() const { return MaxWaveScratchSize; }

/// Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the
/// "CU" is the unit onto which workgroups are mapped. This takes WGP mode vs.
/// CU mode into account.
Expand Down
14 changes: 0 additions & 14 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -302,20 +302,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
return (Generation)Gen;
}

unsigned getMaxWaveScratchSize() const {
// See COMPUTE_TMPRING_SIZE.WAVESIZE.
if (getGeneration() >= GFX12) {
// 18-bit field in units of 64-dword.
return (64 * 4) * ((1 << 18) - 1);
}
if (getGeneration() == GFX11) {
// 15-bit field in units of 64-dword.
return (64 * 4) * ((1 << 15) - 1);
}
// 13-bit field in units of 256-dword.
return (256 * 4) * ((1 << 13) - 1);
}

/// Return the number of high bits known to be zero for a frame index.
unsigned getKnownHighZeroBitsForFrameIndex() const {
return llvm::countl_zero(getMaxWaveScratchSize()) + getWavefrontSizeLog2();
Expand Down
16 changes: 16 additions & 0 deletions llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -900,6 +900,22 @@ unsigned getAddressableLocalMemorySize(const MCSubtargetInfo *STI) {
return 0;
}

unsigned getMaxWaveScratchSize(const MCSubtargetInfo *STI) {
// See COMPUTE_TMPRING_SIZE.WAVESIZE.
if (STI->getFeatureBits().test(FeatureMaxWaveScratchSize18x64)) {
// 18-bit field in units of 64-dword.
return (64 * 4) * ((1 << 18) - 1);
}

if (STI->getFeatureBits().test(FeatureMaxWaveScratchSize15x64)) {
// 15-bit field in units of 64-dword.
return (64 * 4) * ((1 << 15) - 1);
}

// 13-bit field in units of 256-dword.
return (256 * 4) * ((1 << 13) - 1);
}

unsigned getEUsPerCU(const MCSubtargetInfo *STI) {
// "Per CU" really means "per whatever functional block the waves of a
// workgroup must share". For gfx10 in CU mode this is the CU, which contains
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,10 @@ unsigned getLocalMemorySize(const MCSubtargetInfo *STI);
/// \p STI.
unsigned getAddressableLocalMemorySize(const MCSubtargetInfo *STI);

/// \returns Maximum dynamic callstack size in bytes for given subtarget
/// \p STI.
unsigned getMaxWaveScratchSize(const MCSubtargetInfo *STI);

/// \returns Number of execution units per compute unit for given subtarget \p
/// STI.
unsigned getEUsPerCU(const MCSubtargetInfo *STI);
Expand Down
38 changes: 30 additions & 8 deletions openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -704,7 +704,7 @@ struct AMDGPUQueueTy {
/// signal and can define an optional input signal (nullptr if none).
Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
uint32_t NumThreads, uint64_t NumBlocks,
uint32_t GroupSize, uint64_t StackSize,
uint32_t GroupSize, uint32_t StackSize,
AMDGPUSignalTy *OutputSignal,
AMDGPUSignalTy *InputSignal) {
assert(OutputSignal && "Invalid kernel output signal");
Expand Down Expand Up @@ -743,7 +743,8 @@ struct AMDGPUQueueTy {
Packet->grid_size_y = 1;
Packet->grid_size_z = 1;
Packet->private_segment_size =
Kernel.usesDynamicStack() ? StackSize : Kernel.getPrivateSize();
Kernel.usesDynamicStack() ? std::max(Kernel.getPrivateSize(), StackSize)
: Kernel.getPrivateSize();
Packet->group_segment_size = GroupSize;
Packet->kernel_object = Kernel.getKernelObject();
Packet->kernarg_address = KernelArgs;
Expand Down Expand Up @@ -1212,7 +1213,7 @@ struct AMDGPUStreamTy {
/// the kernel args buffer to the specified memory manager.
Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
uint32_t NumThreads, uint64_t NumBlocks,
uint32_t GroupSize, uint64_t StackSize,
uint32_t GroupSize, uint32_t StackSize,
AMDGPUMemoryManagerTy &MemoryManager) {
if (Queue == nullptr)
return Plugin::error("Target queue was nullptr");
Expand Down Expand Up @@ -1975,6 +1976,13 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
.contains("xnack+"))
IsXnackEnabled = true;

// See: 'getMaxWaveScratchSize' in 'llvm/lib/Target/AMDGPU/GCNSubtarget.h'.
// See: e.g. 'FeatureMaxWaveScratchSize13x256' in
// 'llvm/lib/Target/AMDGPU/AMDGPUFeatures.td'
// ToDo: Relay MaxWaveScratchSize value here
// MaxThreadScratchSize = GCNSubtarget.getMaxWaveScratchSize() /
// WavefrontSize;

// detect if device is an APU.
if (auto Err = checkIfAPU())
return Err;
Expand Down Expand Up @@ -2708,7 +2716,17 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
return Plugin::success();
}
Error setDeviceStackSize(uint64_t Value) override {
StackSize = Value;
if (Value > MaxThreadScratchSize) {
// Cap device scratch size.
MESSAGE("Scratch memory size will be set to %d. Reason: Requested size "
"%ld would exceed available resources.",
MaxThreadScratchSize, Value);
StackSize = MaxThreadScratchSize;
} else {
// Apply device scratch size, since it is within limits.
StackSize = Value;
}

return Plugin::success();
}
Error getDeviceHeapSize(uint64_t &Value) override {
Expand Down Expand Up @@ -2896,9 +2914,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
/// The current size of the global device memory pool (managed by us).
uint64_t DeviceMemoryPoolSize = 1L << 29L /*512MB=*/;

/// The current size of the stack that will be used in cases where it could
/// not be statically determined.
uint64_t StackSize = 16 * 1024 /* 16 KB */;
/// Default: 1024, in conformity to hipLimitStackSize.
uint64_t StackSize = 1024 /* 1 KiB */;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

uint32_t


// The maximum scratch memory size per thread.
// See COMPUTE_TMPRING_SIZE.WAVESIZE (divided by threads per wave).
uint32_t MaxThreadScratchSize;

/// Is the plugin associated with an APU?
bool IsAPU = false;
Expand Down Expand Up @@ -3314,7 +3335,8 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,

// Push the kernel launch into the stream.
return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
GroupSize, StackSize, ArgsMemoryManager);
GroupSize, static_cast<uint32_t>(StackSize),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Change it to uint32_t everywhere instead. Let's assume we stop at 4GB stack size.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks -- Sure, sounds reasonable.

ArgsMemoryManager);
}

Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
Expand Down
80 changes: 80 additions & 0 deletions openmp/libomptarget/test/offloading/dynamic_callstack.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
#include <omp.h>
#include <stdio.h>

// RUN: %libomptarget-compile-amdgcn-amd-amdhsa -O2 -mcode-object-version=5

// RUN: env OMP_TARGET_OFFLOAD=MANDATORY \
// RUN: env LIBOMPTARGET_STACK_SIZE=4096 \
// RUN: %libomptarget-run-amdgcn-amd-amdhsa 2>&1 \
// RUN: | %fcheck-amdgcn-amd-amdhsa

// RUN: env OMP_TARGET_OFFLOAD=MANDATORY \
// RUN: env LIBOMPTARGET_STACK_SIZE=131073 \
// RUN: %libomptarget-run-amdgcn-amd-amdhsa 2>&1 \
// RUN: | %fcheck-amdgcn-amd-amdhsa -check-prefix=LIMIT_EXCEEDED

// TODO: Realize the following run in an acceptable manner.
// Unfortunately with insufficient scratch mem size programs will hang.
// Therefore, a timeout mechanism would help tremendously.
// Additionally, we need to allow empty output / unsuccessful execution.

// RUN?: env OMP_TARGET_OFFLOAD=MANDATORY \
// RUN?: env LIBOMPTARGET_STACK_SIZE=16 \
// RUN?: timeout 10 %libomptarget-run-amdgcn-amd-amdhsa 2>&1 \
// RUN?: | %fcheck-amdgcn-amd-amdhsa -check-prefix=LIMIT_INSUFFICIENT \
// RUN?: --allow-empty

// REQUIRES: amdgcn-amd-amdhsa

// Cause the compiler to set amdhsa_uses_dynamic_stack to '1' using recursion.
// That is: stack requirement for main's target region may not be calculated.

// This recursive function will eventually return 0.
int recursiveFunc(const int Recursions) {
if (Recursions < 1)
return 0;

int j[Recursions];
#pragma omp target private(j)
{ ; }

return recursiveFunc(Recursions - 1);
}

int main() {
int N = 256;
int a[N];
int b[N];
int i;

for (i = 0; i < N; i++)
a[i] = 0;

for (i = 0; i < N; i++)
b[i] = i;

#pragma omp target parallel for
{
for (int j = 0; j < N; j++)
a[j] = b[j] + recursiveFunc(j);
}

int rc = 0;
for (i = 0; i < N; i++)
if (a[i] != b[i]) {
rc++;
printf("Wrong value: a[%d]=%d\n", i, a[i]);
}

if (!rc)
printf("Success\n");

return rc;
}

/// CHECK: Success

/// LIMIT_EXCEEDED: Scratch memory size will be set to
/// LIMIT_EXCEEDED: Success

/// LIMIT_INSUFFICIENT-NOT: Success