Skip to content

Commit 877ffa7

Browse files
[libomptarget] Build a minimal deviceRTL for amdgcn
Summary: [libomptarget] Build a minimal deviceRTL for amdgcn. The CMakeLists.txt file is functionally identical to the one used in the aomp fork. Whitespace changes were made based on nvptx/CMakeLists.txt, and the copyright notice was updated to match (Greg was the original author, so I would like his sign-off on that here). This change will build a small subset of the deviceRTL if an appropriate toolchain is available, e.g. a local install of rocm. Support.h is moved from nvptx as a dependency of debug.h. Reviewers: jdoerfert, ABataev, grokos, ronlieb, gregrodgers Reviewed By: jdoerfert Subscribers: jfb, Hahnfeld, jvesely, mgorny, openmp-commits Tags: #openmp Differential Revision: https://reviews.llvm.org/D70414
1 parent 269a9af commit 877ffa7

File tree

5 files changed

+183
-14
lines changed

5 files changed

+183
-14
lines changed

openmp/libomptarget/deviceRTLs/CMakeLists.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,9 @@
66
#
77
# ##===----------------------------------------------------------------------===##
88
#
9-
# Build a device RTL for each available machine available.
9+
# Build a device RTL for each available machine.
1010
#
1111
##===----------------------------------------------------------------------===##
1212

13+
add_subdirectory(amdgcn)
1314
add_subdirectory(nvptx)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
##===----------------------------------------------------------------------===##
2+
#
3+
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
# See https://llvm.org/LICENSE.txt for license information.
5+
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
#
7+
##===----------------------------------------------------------------------===##
8+
#
9+
# Build the AMDGCN Device RTL if the ROCM tools are available
10+
#
11+
##===----------------------------------------------------------------------===##
12+
13+
# Look for the LLVM package configuration of an AOMP/ROCm toolchain.
# NO_DEFAULT_PATH restricts the search to the locations listed under PATHS.
find_package(LLVM QUIET CONFIG
  PATHS
    $ENV{AOMP}
    $ENV{HOME}/rocm/aomp
    /opt/rocm/aomp
    /usr/lib/rocm/aomp
    ${LIBOMPTARGET_NVPTX_CUDA_COMPILER_DIR}
    ${LIBOMPTARGET_NVPTX_CUDA_LINKER_DIR}
    ${CMAKE_CXX_COMPILER_DIR}
  NO_DEFAULT_PATH)

# No LLVM package found: report it and leave this directory without defining
# any targets.
if (NOT LLVM_DIR)
  libomptarget_say("Not building AMDGCN device RTL: AOMP not found")
  return()
endif()

libomptarget_say("Found LLVM ${LLVM_PACKAGE_VERSION}. Configure: ${LLVM_DIR}/LLVMConfig.cmake")
30+
31+
# Choose the directory that holds the toolchain binaries used below
# (clang++, llvm-link, opt). Default to the bin directory of the LLVM build
# tree and switch to the install prefix when the LLVM package reports one.
set(AOMP_INSTALL_PREFIX ${LLVM_INSTALL_PREFIX})
set(AOMP_BINDIR ${LLVM_BUILD_BINARY_DIR}/bin)
if (AOMP_INSTALL_PREFIX)
  set(AOMP_BINDIR ${AOMP_INSTALL_PREFIX}/bin)
endif()

libomptarget_say("Building AMDGCN device RTL. LLVM_COMPILER_PATH=${AOMP_BINDIR}")

project(omptarget-amdgcn)

# Umbrella target for the AMDGCN device RTL; it is part of the default build.
add_custom_target(omptarget-amdgcn ALL)
44+
45+
# Optimization level; passed to the compiler as -O<level>.
set(optimization_level 2)

# Activate RTL message dumps if requested by the user.
if (LIBOMPTARGET_NVPTX_DEBUG)
  set(CUDA_DEBUG -DOMPTARGET_NVPTX_DEBUG=-1)
endif()

# The deviceRTLs directory, i.e. the parent of this directory; the sources
# shared between device RTLs live in its common/ subdirectory.
get_filename_component(devicertl_base_directory ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY)

# Sources compiled into the device RTL.
set(cuda_sources
  ${devicertl_base_directory}/common/src/cancel.cu
  ${devicertl_base_directory}/common/src/critical.cu)

# Headers the sources depend on; a change to any of them triggers a rebuild
# of every bitcode file.
set(h_files
  ${CMAKE_CURRENT_SOURCE_DIR}/src/amdgcn_interface.h
  ${CMAKE_CURRENT_SOURCE_DIR}/src/device_environment.h
  ${CMAKE_CURRENT_SOURCE_DIR}/src/target_impl.h
  ${devicertl_base_directory}/common/debug.h
  ${devicertl_base_directory}/common/state-queue.h
  ${devicertl_base_directory}/common/state-queuei.h
  ${devicertl_base_directory}/common/support.h)

# Where the final bitcode libraries are written: the archive output directory
# when one is configured, otherwise the current binary directory. This covers
# both in-tree and out-of-tree builds.
if (CMAKE_ARCHIVE_OUTPUT_DIRECTORY)
  set(OUTPUTDIR ${CMAKE_ARCHIVE_OUTPUT_DIRECTORY})
else()
  set(OUTPUTDIR ${CMAKE_CURRENT_BINARY_DIR})
endif()

# GPU architectures to build a library for. LIBOMPTARGET_AMDGCN_GFXLIST, when
# defined, replaces the built-in list.
if (DEFINED LIBOMPTARGET_AMDGCN_GFXLIST)
  set(mcpus ${LIBOMPTARGET_AMDGCN_GFXLIST})
else()
  set(mcpus gfx700 gfx701 gfx801 gfx803 gfx900)
endif()
82+
83+
# add_cuda_bc_library(<source>...)
#
# Compiles every given source file to LLVM bitcode for the GPU architecture
# named by the caller's `mcpu` variable, links the per-file bitcode into
# linkout.cuda.${mcpu}.bc and appends that file name to the caller's
# `bc_files` list.
#
# This is a macro, not a function, so that the append to `bc_files` happens
# directly in the caller's scope. The caller must set `mcpu` before the call.
macro(add_cuda_bc_library)
  # Device-only HIP compilation of a single translation unit.
  set(cu_cmd ${AOMP_BINDIR}/clang++
    -std=c++11
    -fcuda-rdc
    -fvisibility=default
    --cuda-device-only
    -Wno-unused-value
    -x hip
    -O${optimization_level}
    --cuda-gpu-arch=${mcpu}
    ${CUDA_DEBUG}
    -I${CMAKE_CURRENT_SOURCE_DIR}/src
    -I${devicertl_base_directory})

  set(bc1_files)

  # One bitcode file per source, named <source basename>.<mcpu>.bc.
  foreach(file ${ARGN})
    get_filename_component(fname ${file} NAME_WE)
    set(bc1_filename ${fname}.${mcpu}.bc)

    # VERBATIM makes argument escaping identical on every platform/generator.
    add_custom_command(
      OUTPUT ${bc1_filename}
      COMMAND ${cu_cmd} ${file} -o ${bc1_filename}
      DEPENDS ${file} ${h_files}
      VERBATIM)

    list(APPEND bc1_files ${bc1_filename})
  endforeach()

  # Merge the per-source bitcode files into one module for this architecture.
  add_custom_command(
    OUTPUT linkout.cuda.${mcpu}.bc
    COMMAND ${AOMP_BINDIR}/llvm-link ${bc1_files} -o linkout.cuda.${mcpu}.bc
    DEPENDS ${bc1_files}
    VERBATIM)

  list(APPEND bc_files linkout.cuda.${mcpu}.bc)
endmacro()
118+
119+
set(libname "omptarget-amdgcn")

# Build and install one bitcode library, lib${libname}-<mcpu>.bc, per requested
# GPU architecture.
foreach(mcpu ${mcpus})
  # add_cuda_bc_library() appends the linked bitcode for this architecture to
  # bc_files, so start from an empty list on every iteration.
  set(bc_files)
  add_cuda_bc_library(${cuda_sources})

  set(bc_libname lib${libname}-${mcpu}.bc)
  # Full path of the final library. The custom command must declare exactly
  # this path as its OUTPUT; declaring the bare file name would point at the
  # current binary directory, which differs from OUTPUTDIR whenever
  # CMAKE_ARCHIVE_OUTPUT_DIRECTORY is set, leaving the rule permanently stale.
  set(bc_libpath ${OUTPUTDIR}/${bc_libname})
  # Intermediate module produced by llvm-link and consumed by opt. Using a file
  # instead of a shell pipe keeps the rule independent of the shell in use and
  # lets it be marked VERBATIM.
  set(bc_linked ${CMAKE_CURRENT_BINARY_DIR}/lib${libname}-${mcpu}.linked.bc)

  add_custom_command(
    OUTPUT ${bc_libpath} ${bc_linked}
    # The output directory is not guaranteed to exist at build time.
    COMMAND ${CMAKE_COMMAND} -E make_directory ${OUTPUTDIR}
    COMMAND ${AOMP_BINDIR}/llvm-link ${bc_files} -o ${bc_linked}
    COMMAND ${AOMP_BINDIR}/opt --always-inline ${bc_linked} -o ${bc_libpath}
    DEPENDS ${bc_files}
    VERBATIM)

  add_custom_target(lib${libname}-${mcpu} ALL DEPENDS ${bc_libpath})

  install(FILES ${bc_libpath}
    DESTINATION "${OPENMP_INSTALL_LIBDIR}/libdevice"
  )
endforeach()
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
//===----- device_environment.h - OpenMP GPU device environment -*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
// Global device environment
10+
//
11+
//===----------------------------------------------------------------------===//
12+
13+
#ifndef _OMPTARGET_DEVICE_ENVIRONMENT_H_
14+
#define _OMPTARGET_DEVICE_ENVIRONMENT_H_
15+
16+
#include "target_impl.h"
17+
18+
struct omptarget_device_environmentTy {
19+
int32_t debug_level; // gets value of envvar LIBOMPTARGET_DEVICE_RTL_DEBUG
20+
// only useful for Debug build of deviceRTLs
21+
int32_t num_devices; // gets number of active offload devices
22+
int32_t device_num; // gets a value 0 to num_devices-1
23+
};
24+
25+
extern DEVICE omptarget_device_environmentTy omptarget_device_environment;
26+
27+
#endif

openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h

+18-13
Original file line numberDiff line numberDiff line change
@@ -72,8 +72,6 @@ EXTERN uint64_t __lanemask_lt();
7272
// thread's lane number in the warp
7373
EXTERN uint64_t __lanemask_gt();
7474

75-
EXTERN void llvm_amdgcn_s_barrier();
76-
7775
// CU id
7876
EXTERN unsigned __smid();
7977

@@ -101,25 +99,21 @@ INLINE uint32_t __kmpc_impl_smid() {
10199
return __smid();
102100
}
103101

104-
INLINE uint64_t __kmpc_impl_ffs(uint64_t x) { return __ffsll(x); }
102+
INLINE uint64_t __kmpc_impl_ffs(uint64_t x) { return __builtin_ffsl(x); }
105103

106-
INLINE uint64_t __kmpc_impl_popc(uint64_t x) { return __popcll(x); }
104+
INLINE uint64_t __kmpc_impl_popc(uint64_t x) { return __builtin_popcountl(x); }
107105

108106
INLINE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
109107
return __ballot64(1);
110108
}
111109

112-
INLINE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t Var,
113-
int32_t SrcLane) {
114-
return __shfl(Var, SrcLane, WARPSIZE);
115-
}
110+
EXTERN int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t Var,
111+
int32_t SrcLane);
116112

117-
INLINE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t Var,
118-
uint32_t Delta, int32_t Width) {
119-
return __shfl_down(Var, Delta, Width);
120-
}
113+
EXTERN int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t Var,
114+
uint32_t Delta, int32_t Width);
121115

122-
INLINE void __kmpc_impl_syncthreads() { llvm_amdgcn_s_barrier(); }
116+
INLINE void __kmpc_impl_syncthreads() { __builtin_amdgcn_s_barrier(); }
123117

124118
INLINE void __kmpc_impl_named_sync(int barrier, uint32_t num_threads) {
125119
// we have protected the master warp from releasing from its barrier
@@ -128,4 +122,15 @@ INLINE void __kmpc_impl_named_sync(int barrier, uint32_t num_threads) {
128122
__builtin_amdgcn_s_barrier();
129123
}
130124

125+
// DEVICE versions of part of libc
126+
extern "C" {
127+
DEVICE __attribute__((noreturn)) void
128+
__assertfail(const char *, const char *, unsigned, const char *, size_t);
129+
INLINE static void __assert_fail(const char *__message, const char *__file,
130+
unsigned int __line, const char *__function) {
131+
__assertfail(__message, __file, __line, __function, sizeof(char));
132+
}
133+
DEVICE int printf(const char *, ...);
134+
}
135+
131136
#endif

0 commit comments

Comments
 (0)