-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[OpenMP][libomptarget][RFC] extend libomptarget with mechanism to execute fill memory on the target #73801
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
[OpenMP][libomptarget][RFC] extend libomptarget with mechanism to execute fill memory on the target #73801
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2357,6 +2357,14 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { | |
getAgent(), (uint64_t)Size); | ||
} | ||
|
||
/// Fill memory on the target device (aka memset) | ||
Error fillMemoryImpl(void *Ptr, int32_t Val, uint64_t NumValues, | ||
AsyncInfoWrapperTy &AsyncInfoWrapperTy) override { | ||
hsa_status_t Status = | ||
hsa_amd_memory_fill(const_cast<void *>(Ptr), Val, NumValues); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why |
||
return Plugin::check(Status, "Error in hsa_amd_memory_fill: %s"); | ||
} | ||
|
||
/// Initialize the async info for interoperability purposes. | ||
Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override { | ||
// TODO: Implement this function. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1453,6 +1453,14 @@ Error GenericDeviceTy::dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev, | |
return Err; | ||
} | ||
|
||
Error GenericDeviceTy::fillMemory(void *Ptr, int32_t Val, uint64_t NumValues, | ||
__tgt_async_info *AsyncInfo) { | ||
AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo); | ||
auto Err = fillMemoryImpl(Ptr, Val, NumValues, AsyncInfoWrapper); | ||
AsyncInfoWrapper.finalize(Err); | ||
return Err; | ||
} | ||
|
||
Error GenericDeviceTy::launchKernel(void *EntryPtr, void **ArgPtrs, | ||
ptrdiff_t *ArgOffsets, | ||
KernelArgsTy &KernelArgs, | ||
|
@@ -1895,6 +1903,29 @@ int32_t __tgt_rtl_data_exchange_async(int32_t SrcDeviceId, void *SrcPtr, | |
return OFFLOAD_SUCCESS; | ||
} | ||
|
||
int32_t __tgt_rtl_fill_memory(int32_t DevId, void *Ptr, int32_t ByteVal, | ||
int64_t NumBytes) { | ||
return __tgt_rtl_fill_memory_async(DevId, Ptr, ByteVal, NumBytes, | ||
/* AsyncInfoPtr */ nullptr); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think clang-format understands: |
||
} | ||
|
||
int32_t __tgt_rtl_fill_memory_async(int32_t DevId, void *Ptr, int32_t Val, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Try to be consistent, DeviceId is what we use elsewhere, I think, and Val vs ByteVal. |
||
int64_t NumValues, | ||
__tgt_async_info *AsyncInfo) { | ||
printf("--> in function %s\n", __FUNCTION__); | ||
printf("--> Dev: %d, Ptr: %p, Val: %d, NumValues: %ld\n", DevId, Ptr, Val, | ||
NumValues); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. leftover |
||
GenericDeviceTy &Device = Plugin::get().getDevice(DevId); | ||
auto Err = Device.fillMemory(Ptr, Val, NumValues, AsyncInfo); | ||
if (Err) { | ||
REPORT("Failure to fill memory on device (%d) at pointer " DPxMOD | ||
" with byte value %d and %" PRId64 " values: %s\n", | ||
DevId, DPxPTR(Ptr), Val, NumValues, toString(std::move(Err)).data()); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there really no better way to print the error here? |
||
return OFFLOAD_FAIL; | ||
} | ||
return OFFLOAD_SUCCESS; | ||
} | ||
|
||
int32_t __tgt_rtl_launch_kernel(int32_t DeviceId, void *TgtEntryPtr, | ||
void **TgtArgs, ptrdiff_t *TgtOffsets, | ||
KernelArgsTy *KernelArgs, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -775,6 +775,12 @@ struct GenericDeviceTy : public DeviceAllocatorTy { | |
void *DstPtr, int64_t Size, | ||
AsyncInfoWrapperTy &AsyncInfoWrapper) = 0; | ||
|
||
/// Fill memory on the target device (aka memset). | ||
Error fillMemory(void *Ptr, int32_t Val, uint64_t NumValues, | ||
__tgt_async_info *AsyncInfo); | ||
virtual Error fillMemoryImpl(void *Ptr, int32_t Val, uint64_t NumValue, | ||
AsyncInfoWrapperTy &AsyncInfo) = 0; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Consistency, NumValues vs NumValue, AsyncInfo is the |
||
|
||
/// Run the kernel associated with \p EntryPtr | ||
Error launchKernel(void *EntryPtr, void **ArgPtrs, ptrdiff_t *ArgOffsets, | ||
KernelArgsTy &KernelArgs, __tgt_async_info *AsyncInfo); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -746,6 +746,10 @@ struct CUDADeviceTy : public GenericDeviceTy { | |
void *DstPtr, int64_t Size, | ||
AsyncInfoWrapperTy &AsyncInfoWrapper) override; | ||
|
||
// Fill memory on the target device (aka memset) | ||
Error fillMemoryImpl(void *Ptr, int32_t Val, uint64_t NumValues, | ||
AsyncInfoWrapperTy &AsyncInfoWrapperTy) override; | ||
|
||
/// Initialize the async info for interoperability purposes. | ||
Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override { | ||
if (auto Err = setContext()) | ||
|
@@ -1387,6 +1391,14 @@ Error CUDADeviceTy::dataExchangeImpl(const void *SrcPtr, | |
return Plugin::check(Res, "Error in cuMemcpyDtoDAsync: %s"); | ||
} | ||
|
||
/// Fill memory on the target device (aka memset) | ||
Error CUDADeviceTy::fillMemoryImpl(void *Ptr, int32_t Val, uint64_t NumValues, | ||
AsyncInfoWrapperTy &AsyncInfoWrapperTy) { | ||
CUdeviceptr DevPtr = reinterpret_cast<CUdeviceptr>(Ptr); | ||
CUresult Res = cuMemsetD32(DevPtr, Val, static_cast<size_t>(NumValues)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why not the async version? |
||
return Plugin::check(Res, "Error in cuMemsetD32: %s"); | ||
} | ||
|
||
GenericPluginTy *Plugin::createPlugin() { return new CUDAPluginTy(); } | ||
|
||
GenericDeviceTy *Plugin::createDevice(int32_t DeviceId, int32_t NumDevices) { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,6 +12,7 @@ | |
|
||
#include <cassert> | ||
#include <cstddef> | ||
#include <cstring> | ||
#include <ffi.h> | ||
#include <string> | ||
#include <unordered_map> | ||
|
@@ -267,6 +268,14 @@ struct GenELF64DeviceTy : public GenericDeviceTy { | |
return Plugin::error("dataExchangeImpl not supported"); | ||
} | ||
|
||
/// Fill memory on the target device (aka memset). | ||
Error fillMemoryImpl(void *Ptr, int32_t Val, uint64_t NumValues, | ||
AsyncInfoWrapperTy &AsyncInfoWrapperTy) override { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Don't use the same name as the type, that leads to all sorts of confusion. |
||
(void)std::memset(Ptr, Val, | ||
static_cast<size_t>(NumValues) * sizeof(int32_t)); | ||
return Plugin::success(); | ||
} | ||
|
||
/// All functions are already synchronous. No need to do anything on this | ||
/// synchronization function. | ||
Error synchronizeImpl(__tgt_async_info &AsyncInfo) override { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -320,21 +320,31 @@ EXTERN void *omp_target_memset(void *Ptr, int ByteVal, size_t NumBytes, | |
// That will require the ability to execute a kernel from within | ||
// libomptarget.so (which we do not have at the moment). | ||
|
||
// This is a very slow path: create a filled array on the host and upload | ||
// it to the GPU device. | ||
int InitialDevice = omp_get_initial_device(); | ||
void *Shadow = omp_target_alloc(NumBytes, InitialDevice); | ||
if (Shadow) { | ||
(void)memset(Shadow, ByteVal, NumBytes); | ||
(void)omp_target_memcpy(Ptr, Shadow, NumBytes, 0, 0, DeviceNum, | ||
InitialDevice); | ||
(void)omp_target_free(Shadow, InitialDevice); | ||
if (NumBytes % sizeof(int32_t) == 0) { | ||
DeviceTy &Dev = *PM->Devices[DeviceNum]; | ||
AsyncInfoTy AsyncInfo(Dev); | ||
int32_t Val = | ||
ByteVal + (ByteVal << 8) + (ByteVal << 16) + (ByteVal << 24); | ||
uint64_t NumValues = NumBytes / sizeof(int32_t); | ||
int Rc = Dev.fillMemory(Ptr, Val, NumValues, AsyncInfo); | ||
printf("--> Rc=%d\n", Rc); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. leftover |
||
} else { | ||
// If the omp_target_alloc has failed, let's just not do anything. | ||
// omp_target_memset does not have any good way to fail, so we | ||
// simply avoid a catastrophic failure of the process for now. | ||
DP("omp_target_memset failed to fill memory due to error with " | ||
"omp_target_alloc"); | ||
// This is a very slow path: create a filled array on the host and upload | ||
// it to the GPU device. | ||
int InitialDevice = omp_get_initial_device(); | ||
void *Shadow = omp_target_alloc(NumBytes, InitialDevice); | ||
if (Shadow) { | ||
(void)memset(Shadow, ByteVal, NumBytes); | ||
(void)omp_target_memcpy(Ptr, Shadow, NumBytes, 0, 0, DeviceNum, | ||
InitialDevice); | ||
(void)omp_target_free(Shadow, InitialDevice); | ||
} else { | ||
// If the omp_target_alloc has failed, let's just not do anything. | ||
// omp_target_memset does not have any good way to fail, so we | ||
// simply avoid a catastrophic failure of the process for now. | ||
DP("omp_target_memset failed to fill memory due to error with " | ||
"omp_target_alloc"); | ||
} | ||
} | ||
} | ||
|
||
|
@@ -462,7 +472,7 @@ EXTERN int omp_target_memcpy_rect_async( | |
"src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", " | ||
"volume " DPxMOD ", element size %zu, num_dims %d\n", | ||
DstDevice, SrcDevice, DPxPTR(Dst), DPxPTR(Src), DPxPTR(DstOffsets), | ||
DPxPTR(SrcOffsets), DPxPTR(DstDimensions), DPxPTR(SrcDimensions), | ||
DPxPTR(SrcOffsets), DPxPTR(DstDimensimons), DPxPTR(SrcDimensions), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Typo |
||
DPxPTR(Volume), ElementSize, NumDims); | ||
|
||
// Need to check this first to not return OFFLOAD_FAIL instead | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -672,6 +672,16 @@ int32_t DeviceTy::dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr, | |
DstPtr, Size, AsyncInfo); | ||
} | ||
|
||
// Run a "fill memory" operation (aka "memset") on the target device | ||
int32_t DeviceTy::fillMemory(void *Ptr, int32_t Val, uint64_t NumValues, | ||
AsyncInfoTy &AsyncInfo) { | ||
if (!AsyncInfo || !RTL->fill_memory_async || !RTL->synchronize) { | ||
assert(RTL->fill_memory && "RTL->fill_memory is nullptr"); | ||
return RTL->fill_memory(RTLDeviceID, Ptr, Val, NumValues); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It seems the function is not optional, maybe we should follow this up by moving the default logic (shadow array) into the Plugin. At least we can think about it. |
||
} | ||
return RTL->fill_memory_async(RTLDeviceID, Ptr, Val, NumValues, AsyncInfo); | ||
} | ||
|
||
int32_t DeviceTy::notifyDataMapped(void *HstPtr, int64_t Size) { | ||
if (!RTL->data_notify_mapped) | ||
return OFFLOAD_SUCCESS; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
is there an async version?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I did not find one in the HSA headers.