Skip to content

[OpenMP][libomptarget][RFC] extend libomptarget with mechanism to execute fill memory on the target #73801

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions openmp/libomptarget/include/device.h
Original file line number Diff line number Diff line change
Expand Up @@ -528,6 +528,10 @@ struct DeviceTy {
int32_t dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr,
int64_t Size, AsyncInfoTy &AsyncInfo);

/// Fill memory on the target device (aka memset)
int32_t fillMemory(void *Ptr, int32_t Val, uint64_t NumValues,
AsyncInfoTy &AsyncInfo);

/// Notify the plugin about a new mapping starting at the host address
/// \p HstPtr and \p Size bytes.
int32_t notifyDataMapped(void *HstPtr, int64_t Size);
Expand Down
11 changes: 11 additions & 0 deletions openmp/libomptarget/include/omptargetplugin.h
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,17 @@ int32_t __tgt_rtl_data_exchange_async(int32_t SrcID, void *SrcPtr,
int32_t DesID, void *DstPtr, int64_t Size,
__tgt_async_info *AsyncInfo);

// Perform a memory fill operation on the target device (aka "memset") by
// calling a native driver operation. In case of success, return zero.
// Otherwise, return an error code.
int32_t __tgt_rtl_fill_memory(int32_t DevID, void *Ptr, int32_t ByteVal,
int64_t NumBytes);

// Asynchronous version of __tgt_rtl_fill_memory
int32_t __tgt_rtl_fill_memory_async(int32_t DevID, void *Ptr, int32_t Val,
int64_t NumValues,
__tgt_async_info *AsyncInfo);

// De-allocate the data referenced by target ptr on the device. In case of
// success, return zero. Otherwise, return an error code. Kind dictates what
// allocator to use (e.g. shared, host, device).
Expand Down
5 changes: 5 additions & 0 deletions openmp/libomptarget/include/rtl.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ struct RTLInfoTy {
typedef int32_t(data_exchange_ty)(int32_t, void *, int32_t, void *, int64_t);
typedef int32_t(data_exchange_async_ty)(int32_t, void *, int32_t, void *,
int64_t, __tgt_async_info *);
typedef int32_t(fill_memory_ty)(int32_t, void *, int32_t, uint64_t);
typedef int32_t(fill_memory_async_ty)(int32_t, void *, int32_t, uint64_t,
__tgt_async_info *);
typedef int32_t(data_delete_ty)(int32_t, void *, int32_t);
typedef int32_t(launch_kernel_ty)(int32_t, void *, void **, ptrdiff_t *,
const KernelArgsTy *, __tgt_async_info *);
Expand Down Expand Up @@ -101,6 +104,8 @@ struct RTLInfoTy {
data_retrieve_async_ty *data_retrieve_async = nullptr;
data_exchange_ty *data_exchange = nullptr;
data_exchange_async_ty *data_exchange_async = nullptr;
fill_memory_ty *fill_memory = nullptr;
fill_memory_async_ty *fill_memory_async = nullptr;
data_delete_ty *data_delete = nullptr;
launch_kernel_ty *launch_kernel = nullptr;
init_requires_ty *init_requires = nullptr;
Expand Down
8 changes: 8 additions & 0 deletions openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2357,6 +2357,14 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
getAgent(), (uint64_t)Size);
}

/// Fill memory on the target device (aka memset)
Error fillMemoryImpl(void *Ptr, int32_t Val, uint64_t NumValues,
AsyncInfoWrapperTy &AsyncInfoWrapperTy) override {
hsa_status_t Status =
hsa_amd_memory_fill(const_cast<void *>(Ptr), Val, NumValues);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is there an async version?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I did not find one in the HSA headers.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why const_cast<void*> here? I think the pointer can just be passed to the HSA function.

return Plugin::check(Status, "Error in hsa_amd_memory_fill: %s");
}

/// Initialize the async info for interoperability purposes.
Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
// TODO: Implement this function.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1453,6 +1453,14 @@ Error GenericDeviceTy::dataExchange(const void *SrcPtr, GenericDeviceTy &DstDev,
return Err;
}

Error GenericDeviceTy::fillMemory(void *Ptr, int32_t Val, uint64_t NumValues,
__tgt_async_info *AsyncInfo) {
AsyncInfoWrapperTy AsyncInfoWrapper(*this, AsyncInfo);
auto Err = fillMemoryImpl(Ptr, Val, NumValues, AsyncInfoWrapper);
AsyncInfoWrapper.finalize(Err);
return Err;
}

Error GenericDeviceTy::launchKernel(void *EntryPtr, void **ArgPtrs,
ptrdiff_t *ArgOffsets,
KernelArgsTy &KernelArgs,
Expand Down Expand Up @@ -1895,6 +1903,29 @@ int32_t __tgt_rtl_data_exchange_async(int32_t SrcDeviceId, void *SrcPtr,
return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_fill_memory(int32_t DevId, void *Ptr, int32_t ByteVal,
int64_t NumBytes) {
return __tgt_rtl_fill_memory_async(DevId, Ptr, ByteVal, NumBytes,
/* AsyncInfoPtr */ nullptr);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think clang-format understands: /*AsyncInfoPtr=*/ nullptr

}

int32_t __tgt_rtl_fill_memory_async(int32_t DevId, void *Ptr, int32_t Val,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Try to be consistent, DeviceId is what we use elsewhere, I think, and Val vs ByteVal.

int64_t NumValues,
__tgt_async_info *AsyncInfo) {
printf("--> in function %s\n", __FUNCTION__);
printf("--> Dev: %d, Ptr: %p, Val: %d, NumValues: %ld\n", DevId, Ptr, Val,
NumValues);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

leftover

GenericDeviceTy &Device = Plugin::get().getDevice(DevId);
auto Err = Device.fillMemory(Ptr, Val, NumValues, AsyncInfo);
if (Err) {
REPORT("Failure to fill memory on device (%d) at pointer " DPxMOD
" with byte value %d and %" PRId64 " values: %s\n",
DevId, DPxPTR(Ptr), Val, NumValues, toString(std::move(Err)).data());
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there really no better way to print the error here?

return OFFLOAD_FAIL;
}
return OFFLOAD_SUCCESS;
}

int32_t __tgt_rtl_launch_kernel(int32_t DeviceId, void *TgtEntryPtr,
void **TgtArgs, ptrdiff_t *TgtOffsets,
KernelArgsTy *KernelArgs,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -775,6 +775,12 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
void *DstPtr, int64_t Size,
AsyncInfoWrapperTy &AsyncInfoWrapper) = 0;

/// Fill memory on the target device (aka memset).
Error fillMemory(void *Ptr, int32_t Val, uint64_t NumValues,
__tgt_async_info *AsyncInfo);
virtual Error fillMemoryImpl(void *Ptr, int32_t Val, uint64_t NumValue,
AsyncInfoWrapperTy &AsyncInfo) = 0;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consistency, NumValues vs NumValue, AsyncInfo is the __tgt_async_info type, AsyncInfoWrapper is a AsyncInfoWrapperTy type.


/// Run the kernel associated with \p EntryPtr
Error launchKernel(void *EntryPtr, void **ArgPtrs, ptrdiff_t *ArgOffsets,
KernelArgsTy &KernelArgs, __tgt_async_info *AsyncInfo);
Expand Down
12 changes: 12 additions & 0 deletions openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -746,6 +746,10 @@ struct CUDADeviceTy : public GenericDeviceTy {
void *DstPtr, int64_t Size,
AsyncInfoWrapperTy &AsyncInfoWrapper) override;

// Fill memory on the target device (aka memset)
Error fillMemoryImpl(void *Ptr, int32_t Val, uint64_t NumValues,
AsyncInfoWrapperTy &AsyncInfoWrapperTy) override;

/// Initialize the async info for interoperability purposes.
Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
if (auto Err = setContext())
Expand Down Expand Up @@ -1387,6 +1391,14 @@ Error CUDADeviceTy::dataExchangeImpl(const void *SrcPtr,
return Plugin::check(Res, "Error in cuMemcpyDtoDAsync: %s");
}

/// Fill memory on the target device (aka memset)
Error CUDADeviceTy::fillMemoryImpl(void *Ptr, int32_t Val, uint64_t NumValues,
AsyncInfoWrapperTy &AsyncInfoWrapperTy) {
CUdeviceptr DevPtr = reinterpret_cast<CUdeviceptr>(Ptr);
CUresult Res = cuMemsetD32(DevPtr, Val, static_cast<size_t>(NumValues));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why not the async version?

return Plugin::check(Res, "Error in cuMemsetD32: %s");
}

GenericPluginTy *Plugin::createPlugin() { return new CUDAPluginTy(); }

GenericDeviceTy *Plugin::createDevice(int32_t DeviceId, int32_t NumDevices) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

#include <cassert>
#include <cstddef>
#include <cstring>
#include <ffi.h>
#include <string>
#include <unordered_map>
Expand Down Expand Up @@ -267,6 +268,14 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
return Plugin::error("dataExchangeImpl not supported");
}

/// Fill memory on the target device (aka memset).
Error fillMemoryImpl(void *Ptr, int32_t Val, uint64_t NumValues,
AsyncInfoWrapperTy &AsyncInfoWrapperTy) override {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't use the same name as the type, that leads to all sorts of confusion.

(void)std::memset(Ptr, Val,
static_cast<size_t>(NumValues) * sizeof(int32_t));
return Plugin::success();
}

/// All functions are already synchronous. No need to do anything on this
/// synchronization function.
Error synchronizeImpl(__tgt_async_info &AsyncInfo) override {
Expand Down
40 changes: 25 additions & 15 deletions openmp/libomptarget/src/api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -320,21 +320,31 @@ EXTERN void *omp_target_memset(void *Ptr, int ByteVal, size_t NumBytes,
// That will require the ability to execute a kernel from within
// libomptarget.so (which we do not have at the moment).

// This is a very slow path: create a filled array on the host and upload
// it to the GPU device.
int InitialDevice = omp_get_initial_device();
void *Shadow = omp_target_alloc(NumBytes, InitialDevice);
if (Shadow) {
(void)memset(Shadow, ByteVal, NumBytes);
(void)omp_target_memcpy(Ptr, Shadow, NumBytes, 0, 0, DeviceNum,
InitialDevice);
(void)omp_target_free(Shadow, InitialDevice);
if (NumBytes % sizeof(int32_t) == 0) {
DeviceTy &Dev = *PM->Devices[DeviceNum];
AsyncInfoTy AsyncInfo(Dev);
int32_t Val =
ByteVal + (ByteVal << 8) + (ByteVal << 16) + (ByteVal << 24);
uint64_t NumValues = NumBytes / sizeof(int32_t);
int Rc = Dev.fillMemory(Ptr, Val, NumValues, AsyncInfo);
printf("--> Rc=%d\n", Rc);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

leftover

} else {
// If the omp_target_alloc has failed, let's just not do anything.
// omp_target_memset does not have any good way to fail, so we
// simply avoid a catastrophic failure of the process for now.
DP("omp_target_memset failed to fill memory due to error with "
"omp_target_alloc");
// This is a very slow path: create a filled array on the host and upload
// it to the GPU device.
int InitialDevice = omp_get_initial_device();
void *Shadow = omp_target_alloc(NumBytes, InitialDevice);
if (Shadow) {
(void)memset(Shadow, ByteVal, NumBytes);
(void)omp_target_memcpy(Ptr, Shadow, NumBytes, 0, 0, DeviceNum,
InitialDevice);
(void)omp_target_free(Shadow, InitialDevice);
} else {
// If the omp_target_alloc has failed, let's just not do anything.
// omp_target_memset does not have any good way to fail, so we
// simply avoid a catastrophic failure of the process for now.
DP("omp_target_memset failed to fill memory due to error with "
"omp_target_alloc");
}
}
}

Expand Down Expand Up @@ -462,7 +472,7 @@ EXTERN int omp_target_memcpy_rect_async(
"src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", "
"volume " DPxMOD ", element size %zu, num_dims %d\n",
DstDevice, SrcDevice, DPxPTR(Dst), DPxPTR(Src), DPxPTR(DstOffsets),
DPxPTR(SrcOffsets), DPxPTR(DstDimensions), DPxPTR(SrcDimensions),
DPxPTR(SrcOffsets), DPxPTR(DstDimensimons), DPxPTR(SrcDimensions),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Typo

DPxPTR(Volume), ElementSize, NumDims);

// Need to check this first to not return OFFLOAD_FAIL instead
Expand Down
10 changes: 10 additions & 0 deletions openmp/libomptarget/src/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -672,6 +672,16 @@ int32_t DeviceTy::dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr,
DstPtr, Size, AsyncInfo);
}

// Run a "fill memory" operation (aka "memset") on the target device
int32_t DeviceTy::fillMemory(void *Ptr, int32_t Val, uint64_t NumValues,
AsyncInfoTy &AsyncInfo) {
if (!AsyncInfo || !RTL->fill_memory_async || !RTL->synchronize) {
assert(RTL->fill_memory && "RTL->fill_memory is nullptr");
return RTL->fill_memory(RTLDeviceID, Ptr, Val, NumValues);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems the function is not optional, maybe we should follow this up by moving the default logic (shadow array) into the Plugin. At least we can think about it.

}
return RTL->fill_memory_async(RTLDeviceID, Ptr, Val, NumValues, AsyncInfo);
}

int32_t DeviceTy::notifyDataMapped(void *HstPtr, int64_t Size) {
if (!RTL->data_notify_mapped)
return OFFLOAD_SUCCESS;
Expand Down
4 changes: 4 additions & 0 deletions openmp/libomptarget/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,10 @@ bool RTLsTy::attemptLoadRTL(const std::string &RTLName, RTLInfoTy &RTL) {
DynLibrary->getAddressOfSymbol("__tgt_rtl_data_exchange");
*((void **)&RTL.data_exchange_async) =
DynLibrary->getAddressOfSymbol("__tgt_rtl_data_exchange_async");
*((void **)&RTL.fill_memory) =
DynLibrary->getAddressOfSymbol("__tgt_rtl_fill_memory");
*((void **)&RTL.fill_memory_async) =
DynLibrary->getAddressOfSymbol("__tgt_rtl_fill_memory_async");
*((void **)&RTL.is_data_exchangable) =
DynLibrary->getAddressOfSymbol("__tgt_rtl_is_data_exchangable");
*((void **)&RTL.supports_empty_images) =
Expand Down