Skip to content

Commit 50ce7a0

Browse files
AntonRydahljdoerfert
authored andcommitted
Adding OpenMP Offloading Backend for C++ Parallel Algorithms Rebased
1 parent 560b72c commit 50ce7a0

31 files changed

+1555
-2
lines changed

.github/workflows/libcxx-build-and-test.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,7 @@ jobs:
146146
'generic-no-wide-characters',
147147
'generic-no-rtti',
148148
'generic-optimized-speed',
149+
'generic-pstl-openmp',
149150
'generic-static',
150151
'bootstrapping-build'
151152
]

libcxx/CMakeLists.txt

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -300,10 +300,11 @@ option(LIBCXX_HAS_EXTERNAL_THREAD_API
300300
This option may only be set to ON when LIBCXX_ENABLE_THREADS=ON." OFF)
301301

302302
if (LIBCXX_ENABLE_THREADS)
303-
set(LIBCXX_PSTL_BACKEND "std_thread" CACHE STRING "Which PSTL backend to use")
303+
set(LIBCXX_PSTL_BACKEND_DEFAULT "std_thread")
304304
else()
305-
set(LIBCXX_PSTL_BACKEND "serial" CACHE STRING "Which PSTL backend to use")
305+
set(LIBCXX_PSTL_BACKEND_DEFAULT "serial")
306306
endif()
307+
set(LIBCXX_PSTL_BACKEND "${LIBCXX_PSTL_BACKEND_DEFAULT}" CACHE STRING "Select the PSTL backend to use. Valid values are serial, std-thread, libdispatch, openmp. Default: ${LIBCXX_PSTL_BACKEND_DEFAULT}")
307308

308309
# Misc options ----------------------------------------------------------------
309310
# FIXME: Turn -pedantic back ON. It is currently off because it warns
@@ -552,6 +553,11 @@ function(cxx_add_basic_build_flags target)
552553
endif()
553554
endif()
554555
target_compile_options(${target} PUBLIC "${LIBCXX_ADDITIONAL_COMPILE_FLAGS}")
556+
557+
# If the PSTL backend depends on OpenMP, we must enable the OpenMP tool chain
558+
if (LIBCXX_PSTL_BACKEND STREQUAL "openmp")
559+
target_add_compile_flags_if_supported(${target} PUBLIC -fopenmp)
560+
endif()
555561
endfunction()
556562

557563
# Exception flags =============================================================
@@ -784,6 +790,8 @@ elseif(LIBCXX_PSTL_BACKEND STREQUAL "std_thread")
784790
config_define(1 _LIBCPP_PSTL_BACKEND_STD_THREAD)
785791
elseif(LIBCXX_PSTL_BACKEND STREQUAL "libdispatch")
786792
config_define(1 _LIBCPP_PSTL_BACKEND_LIBDISPATCH)
793+
elseif (LIBCXX_PSTL_BACKEND STREQUAL "openmp")
794+
config_define(1 _LIBCPP_PSTL_BACKEND_OPENMP)
787795
else()
788796
message(FATAL_ERROR "LIBCXX_PSTL_BACKEND is set to ${LIBCXX_PSTL_BACKEND}, which is not a valid backend.
789797
Valid backends are: serial, std_thread and libdispatch")
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
set(LIBCXX_PSTL_BACKEND openmp CACHE STRING "")

libcxx/docs/UserDocumentation.rst

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,107 @@ and as such, libc++ does not go out of its way to support them. The library may
329329
compiler extensions which would then be documented explicitly, but the basic expectation should be
330330
that no special support is provided for arbitrary compiler extensions.
331331

332+
Offloading C++ Parallel Algorithms to GPUs
333+
------------------------------------------
334+
335+
Experimental support for GPU offloading has been added to ``libc++``. The
336+
implementation uses OpenMP target offloading to leverage GPU compute resources.
337+
The OpenMP PSTL backend can target both NVIDIA and AMD GPUs.
338+
However, the implementation only supports contiguous iterators, such as
339+
iterators for ``std::vector`` or ``std::array``.
340+
To enable the OpenMP offloading backend it must be selected with
341+
``LIBCXX_PSTL_BACKEND=openmp`` when installing ``libc++``. Further, when
342+
compiling a program, the user must specify the command line options
343+
``-fopenmp -fexperimental-library``. To install LLVM with OpenMP offloading
344+
enabled, please read
345+
`the LLVM OpenMP FAQ. <https://openmp.llvm.org/SupportAndFAQ.html>`_
346+
You may also want to to visit
347+
`the OpenMP offloading command-line argument reference. <https://openmp.llvm.org/CommandLineArgumentReference.html#offload-command-line-arguments>`_
348+
349+
Example
350+
~~~~~~~
351+
352+
The following is an example of offloading vector addition to a GPU using our
353+
standard library extension. It implements the classical vector addition from
354+
BLAS that overwrites the vector ``y`` with ``y=a*x+y``. Thus ``y.begin()`` is
355+
both used as an input and an output iterator in this example.
356+
357+
.. code-block:: cpp
358+
359+
#include <algorithm>
360+
#include <execution>
361+
362+
template <typename T1, typename T2, typename T3>
363+
void axpy(const T1 a, const std::vector<T2> &x, std::vector<T3> &y) {
364+
std::transform(std::execution::par_unseq, x.begin(), x.end(), y.begin(),
365+
y.begin(), [=](T2 xi, T3 yi) { return a * xi + yi; });
366+
}
367+
368+
The execution policy ``std::execution::par_unseq`` states that the algorithm's
369+
execution may be parallelized, vectorized, and migrated across threads. This is
370+
the only execution mode that is safe to offload to GPUs, and for all other
371+
execution modes the algorithms will execute on the CPU.
372+
Special attention must be paid to the lambda captures when enabling GPU
373+
offloading. If the lambda captures by reference, the user must manually map the
374+
variables to the device. If capturing by reference, the above example could
375+
be implemented in the following way.
376+
377+
.. code-block:: cpp
378+
379+
template <typename T1, typename T2, typename T3>
380+
void axpy(const T1 a, const std::vector<T2> &x, std::vector<T3> &y) {
381+
#pragma omp target data map(to : a)
382+
std::transform(std::execution::par_unseq, x.begin(), x.end(), y.begin(),
383+
y.begin(), [&](T2 xi, T3 yi) { return a * xi + yi; });
384+
}
385+
386+
However, if unified shared memory, USM, is enabled, no additional data mapping
387+
is necessary when capturing y reference.
388+
389+
Compiling functions for GPUs with OpenMP
390+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
391+
392+
The C++ standard defines that all accesses to memory are inside a single address
393+
space. However, discrete GPU systems have distinct address spaces. A single
394+
address space can be emulated if your system supports unified shared memory.
395+
However, many discrete GPU systems do not, and in those cases it is important to
396+
pass device function pointers to the parallel algorithms. Below is an example of
397+
how the OpenMP ``declare target`` directive with the ``indirect`` clause can be
398+
used to mark that a function should be compiled for both host and device.
399+
400+
.. code-block:: cpp
401+
402+
// This function computes the squared difference of two floating points
403+
float squared(float a, float b) { return a * a - 2.0f * a * b + b * b; };
404+
405+
// Declare that the function must be compiled for both host and device
406+
#pragma omp declare target indirect to(squared)
407+
408+
int main() {
409+
std::vector<float> a(100, 1.0);
410+
std::vector<float> b(100, 1.25);
411+
412+
// Pass the host function pointer to the parallel algorithm and let OpenMP
413+
// translate it to the device function pointer internally
414+
float sum =
415+
std::transform_reduce(std::execution::par_unseq, a.begin(), a.end(),
416+
b.begin(), 0.0f, std::plus{}, squared);
417+
418+
// Validate that the result is approximately 6.25
419+
assert(std::abs(sum - 6.25f) < 1e-10);
420+
return 0;
421+
}
422+
423+
Without unified shared memory, the above example will not work if the host
424+
function pointer ``squared`` is passed to the parallel algorithm.
425+
426+
Important notes about exception handling
427+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
428+
429+
GPU architectures do not support exception handling and, for now,
430+
``-fno-exceptions`` is required to offload to the GPU. Parallel CPU fallback
431+
is available without restrictions.
432+
332433
Platform specific behavior
333434
==========================
334435

libcxx/docs/VendorDocumentation.rst

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,17 @@ General purpose options
264264
default assertion handler. If this is specified as a relative path, it
265265
is assumed to be relative to ``<monorepo>/libcxx``.
266266

267+
.. option:: LIBCXX_PSTL_BACKEND:STRING
268+
269+
**Default**:: ``"serial"``
270+
271+
**Values**:: ``serial``, ``std-thread``, ``libdispatch``, ``openmp``
272+
273+
Select the desired backend for C++ parallel algorithms. All four options can
274+
target multi-core CPU architectures, and ``openmp`` can additionally target
275+
GPU architectures. The ``openmp`` backend requires OpenMP version 4.5 or
276+
later (clang's default is sufficient).
277+
267278
ABI Specific Options
268279
--------------------
269280

libcxx/include/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -613,6 +613,7 @@ set(files
613613
__pstl/backend_fwd.h
614614
__pstl/backends/default.h
615615
__pstl/backends/libdispatch.h
616+
__pstl/backends/openmp.h
616617
__pstl/backends/serial.h
617618
__pstl/backends/std_thread.h
618619
__pstl/cpu_algos/any_of.h

libcxx/include/__config_site.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
#cmakedefine _LIBCPP_PSTL_BACKEND_SERIAL
3939
#cmakedefine _LIBCPP_PSTL_BACKEND_STD_THREAD
4040
#cmakedefine _LIBCPP_PSTL_BACKEND_LIBDISPATCH
41+
#cmakedefine _LIBCPP_PSTL_BACKEND_OPENMP
4142

4243
// Hardening.
4344
#cmakedefine _LIBCPP_HARDENING_MODE_DEFAULT @_LIBCPP_HARDENING_MODE_DEFAULT@

libcxx/include/__pstl/backend.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,10 @@ _LIBCPP_PUSH_MACROS
3030
# elif defined(_LIBCPP_PSTL_BACKEND_LIBDISPATCH)
3131
# include <__pstl/backends/default.h>
3232
# include <__pstl/backends/libdispatch.h>
33+
# elif defined(_LIBCPP_PSTL_BACKEND_OPENMP)
34+
# include <__pstl/backends/default.h>
35+
# include <__pstl/backends/openmp.h>
36+
# include <__pstl/backends/std_thread.h>
3337
# endif
3438

3539
#endif // _LIBCPP_STD_VER >= 17

libcxx/include/__pstl/backend_fwd.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ struct __backend_configuration;
4949

5050
struct __default_backend_tag;
5151
struct __libdispatch_backend_tag;
52+
struct __openmp_backend_tag;
5253
struct __serial_backend_tag;
5354
struct __std_thread_backend_tag;
5455

@@ -60,6 +61,9 @@ using __current_configuration _LIBCPP_NODEBUG =
6061
# elif defined(_LIBCPP_PSTL_BACKEND_LIBDISPATCH)
6162
using __current_configuration _LIBCPP_NODEBUG =
6263
__backend_configuration<__libdispatch_backend_tag, __default_backend_tag>;
64+
# elif defined(_LIBCPP_PSTL_BACKEND_OPENMP)
65+
using __current_configuration _LIBCPP_NODEBUG =
66+
__backend_configuration<__openmp_backend_tag, __std_thread_backend_tag, __default_backend_tag>;
6367
# else
6468

6569
// ...New vendors can add parallel backends here...

0 commit comments

Comments
 (0)