-
Notifications
You must be signed in to change notification settings - Fork 13.8k
[OpenMP][AMDGPU] Adapt dynamic callstack sizes to HIP behavior #74080
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
mhalk
wants to merge
1
commit into
llvm:main
Choose a base branch
from
mhalk:feature/dyn_callstack_calc
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -704,7 +704,7 @@ struct AMDGPUQueueTy { | |
/// signal and can define an optional input signal (nullptr if none). | ||
Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs, | ||
uint32_t NumThreads, uint64_t NumBlocks, | ||
uint32_t GroupSize, uint64_t StackSize, | ||
uint32_t GroupSize, uint32_t StackSize, | ||
AMDGPUSignalTy *OutputSignal, | ||
AMDGPUSignalTy *InputSignal) { | ||
assert(OutputSignal && "Invalid kernel output signal"); | ||
|
@@ -743,7 +743,8 @@ struct AMDGPUQueueTy { | |
Packet->grid_size_y = 1; | ||
Packet->grid_size_z = 1; | ||
Packet->private_segment_size = | ||
Kernel.usesDynamicStack() ? StackSize : Kernel.getPrivateSize(); | ||
Kernel.usesDynamicStack() ? std::max(Kernel.getPrivateSize(), StackSize) | ||
: Kernel.getPrivateSize(); | ||
Packet->group_segment_size = GroupSize; | ||
Packet->kernel_object = Kernel.getKernelObject(); | ||
Packet->kernarg_address = KernelArgs; | ||
|
@@ -1212,7 +1213,7 @@ struct AMDGPUStreamTy { | |
/// the kernel args buffer to the specified memory manager. | ||
Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs, | ||
uint32_t NumThreads, uint64_t NumBlocks, | ||
uint32_t GroupSize, uint64_t StackSize, | ||
uint32_t GroupSize, uint32_t StackSize, | ||
AMDGPUMemoryManagerTy &MemoryManager) { | ||
if (Queue == nullptr) | ||
return Plugin::error("Target queue was nullptr"); | ||
|
@@ -1975,6 +1976,13 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { | |
.contains("xnack+")) | ||
IsXnackEnabled = true; | ||
|
||
// See: 'getMaxWaveScratchSize' in 'llvm/lib/Target/AMDGPU/GCNSubtarget.h'. | ||
// See: e.g. 'FeatureMaxWaveScratchSize13x256' in | ||
// 'llvm/lib/Target/AMDGPU/AMDGPUFeatures.td' | ||
// ToDo: Relay MaxWaveScratchSize value here | ||
// MaxThreadScratchSize = GCNSubtarget.getMaxWaveScratchSize() / | ||
// WavefrontSize; | ||
|
||
// detect if device is an APU. | ||
if (auto Err = checkIfAPU()) | ||
return Err; | ||
|
@@ -2708,7 +2716,17 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { | |
return Plugin::success(); | ||
} | ||
Error setDeviceStackSize(uint64_t Value) override { | ||
StackSize = Value; | ||
if (Value > MaxThreadScratchSize) { | ||
// Cap device scratch size. | ||
MESSAGE("Scratch memory size will be set to %d. Reason: Requested size " | ||
"%ld would exceed available resources.", | ||
MaxThreadScratchSize, Value); | ||
StackSize = MaxThreadScratchSize; | ||
} else { | ||
// Apply device scratch size, since it is within limits. | ||
StackSize = Value; | ||
} | ||
|
||
return Plugin::success(); | ||
} | ||
Error getDeviceHeapSize(uint64_t &Value) override { | ||
|
@@ -2896,9 +2914,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { | |
/// The current size of the global device memory pool (managed by us). | ||
uint64_t DeviceMemoryPoolSize = 1L << 29L /*512MB=*/; | ||
|
||
/// The current size of the stack that will be used in cases where it could | ||
/// not be statically determined. | ||
uint64_t StackSize = 16 * 1024 /* 16 KB */; | ||
/// Default: 1024, in conformity to hipLimitStackSize. | ||
uint64_t StackSize = 1024 /* 1 KiB */; | ||
|
||
// The maximum scratch memory size per thread. | ||
// See COMPUTE_TMPRING_SIZE.WAVESIZE (divided by threads per wave). | ||
uint32_t MaxThreadScratchSize; | ||
|
||
/// Is the plugin associated with an APU? | ||
bool IsAPU = false; | ||
|
@@ -3314,7 +3335,8 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice, | |
|
||
// Push the kernel launch into the stream. | ||
return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks, | ||
GroupSize, StackSize, ArgsMemoryManager); | ||
GroupSize, static_cast<uint32_t>(StackSize), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Change it to uint32_t everywhere instead. Let's assume we stop at 4GB stack size. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks -- Sure, sounds reasonable. |
||
ArgsMemoryManager); | ||
} | ||
|
||
Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice, | ||
|
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
#include <omp.h> | ||
#include <stdio.h> | ||
|
||
// RUN: %libomptarget-compile-amdgcn-amd-amdhsa -O2 -mcode-object-version=5 | ||
|
||
// RUN: env OMP_TARGET_OFFLOAD=MANDATORY \ | ||
// RUN: env LIBOMPTARGET_STACK_SIZE=4096 \ | ||
// RUN: %libomptarget-run-amdgcn-amd-amdhsa 2>&1 \ | ||
// RUN: | %fcheck-amdgcn-amd-amdhsa | ||
|
||
// RUN: env OMP_TARGET_OFFLOAD=MANDATORY \ | ||
// RUN: env LIBOMPTARGET_STACK_SIZE=131073 \ | ||
// RUN: %libomptarget-run-amdgcn-amd-amdhsa 2>&1 \ | ||
// RUN: | %fcheck-amdgcn-amd-amdhsa -check-prefix=LIMIT_EXCEEDED | ||
|
||
// TODO: Realize the following run in an acceptable manner. | ||
// Unfortunately with insufficient scratch mem size programs will hang. | ||
// Therefore, a timeout mechanism would help tremendously. | ||
// Additionally, we need to allow empty output / unsuccessful execution. | ||
|
||
// RUN?: env OMP_TARGET_OFFLOAD=MANDATORY \ | ||
// RUN?: env LIBOMPTARGET_STACK_SIZE=16 \ | ||
// RUN?: timeout 10 %libomptarget-run-amdgcn-amd-amdhsa 2>&1 \ | ||
// RUN?: | %fcheck-amdgcn-amd-amdhsa -check-prefix=LIMIT_INSUFFICIENT \ | ||
// RUN?: --allow-empty | ||
|
||
// REQUIRES: amdgcn-amd-amdhsa | ||
|
||
// Cause the compiler to set amdhsa_uses_dynamic_stack to '1' using recursion. | ||
// That is: stack requirement for main's target region may not be calculated. | ||
|
||
// This recursive function will eventually return 0. | ||
int recursiveFunc(const int Recursions) { | ||
if (Recursions < 1) | ||
return 0; | ||
|
||
int j[Recursions]; | ||
#pragma omp target private(j) | ||
{ ; } | ||
|
||
return recursiveFunc(Recursions - 1); | ||
} | ||
|
||
int main() { | ||
int N = 256; | ||
int a[N]; | ||
int b[N]; | ||
int i; | ||
|
||
for (i = 0; i < N; i++) | ||
a[i] = 0; | ||
|
||
for (i = 0; i < N; i++) | ||
b[i] = i; | ||
|
||
#pragma omp target parallel for | ||
{ | ||
for (int j = 0; j < N; j++) | ||
a[j] = b[j] + recursiveFunc(j); | ||
} | ||
|
||
int rc = 0; | ||
for (i = 0; i < N; i++) | ||
if (a[i] != b[i]) { | ||
rc++; | ||
printf("Wrong value: a[%d]=%d\n", i, a[i]); | ||
} | ||
|
||
if (!rc) | ||
printf("Success\n"); | ||
|
||
return rc; | ||
} | ||
|
||
/// CHECK: Success | ||
|
||
/// LIMIT_EXCEEDED: Scratch memory size will be set to | ||
/// LIMIT_EXCEEDED: Success | ||
|
||
/// LIMIT_INSUFFICIENT-NOT: Success |
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
uint32_t