Adding OpenMP Offloading Backend for C++ Parallel Algorithms (Rebased #66968) #122180

Open · wants to merge 1 commit into main
1 change: 1 addition & 0 deletions .github/workflows/libcxx-build-and-test.yaml
@@ -146,6 +146,7 @@ jobs:
'generic-no-wide-characters',
'generic-no-rtti',
'generic-optimized-speed',
'generic-pstl-openmp',
'generic-static',
'bootstrapping-build'
]
12 changes: 10 additions & 2 deletions libcxx/CMakeLists.txt
@@ -300,10 +300,11 @@ option(LIBCXX_HAS_EXTERNAL_THREAD_API
This option may only be set to ON when LIBCXX_ENABLE_THREADS=ON." OFF)

if (LIBCXX_ENABLE_THREADS)
set(LIBCXX_PSTL_BACKEND "std_thread" CACHE STRING "Which PSTL backend to use")
set(LIBCXX_PSTL_BACKEND_DEFAULT "std_thread")
else()
set(LIBCXX_PSTL_BACKEND "serial" CACHE STRING "Which PSTL backend to use")
set(LIBCXX_PSTL_BACKEND_DEFAULT "serial")
endif()
set(LIBCXX_PSTL_BACKEND "${LIBCXX_PSTL_BACKEND_DEFAULT}" CACHE STRING "Select the PSTL backend to use. Valid values are serial, std_thread, libdispatch, openmp. Default: ${LIBCXX_PSTL_BACKEND_DEFAULT}")

# Misc options ----------------------------------------------------------------
# FIXME: Turn -pedantic back ON. It is currently off because it warns
@@ -552,6 +553,11 @@ function(cxx_add_basic_build_flags target)
endif()
endif()
target_compile_options(${target} PUBLIC "${LIBCXX_ADDITIONAL_COMPILE_FLAGS}")

# If the PSTL backend depends on OpenMP, we must enable the OpenMP toolchain
if (LIBCXX_PSTL_BACKEND STREQUAL "openmp")
target_add_compile_flags_if_supported(${target} PUBLIC -fopenmp)
endif()
endfunction()

# Exception flags =============================================================
@@ -784,6 +790,8 @@ elseif(LIBCXX_PSTL_BACKEND STREQUAL "std_thread")
config_define(1 _LIBCPP_PSTL_BACKEND_STD_THREAD)
elseif(LIBCXX_PSTL_BACKEND STREQUAL "libdispatch")
config_define(1 _LIBCPP_PSTL_BACKEND_LIBDISPATCH)
elseif (LIBCXX_PSTL_BACKEND STREQUAL "openmp")
config_define(1 _LIBCPP_PSTL_BACKEND_OPENMP)
else()
message(FATAL_ERROR "LIBCXX_PSTL_BACKEND is set to ${LIBCXX_PSTL_BACKEND}, which is not a valid backend.
Valid backends are: serial, std_thread, libdispatch and openmp")
1 change: 1 addition & 0 deletions libcxx/cmake/caches/Generic-pstl-openmp.cmake
@@ -0,0 +1 @@
set(LIBCXX_PSTL_BACKEND openmp CACHE STRING "")
101 changes: 101 additions & 0 deletions libcxx/docs/UserDocumentation.rst
@@ -329,6 +329,107 @@ and as such, libc++ does not go out of its way to support them. The library may
compiler extensions which would then be documented explicitly, but the basic expectation should be
that no special support is provided for arbitrary compiler extensions.

Offloading C++ Parallel Algorithms to GPUs
------------------------------------------

Experimental support for GPU offloading has been added to ``libc++``. The
implementation uses OpenMP target offloading to leverage GPU compute resources.
The OpenMP PSTL backend can target both NVIDIA and AMD GPUs.
However, the implementation only supports contiguous iterators, such as those
of ``std::vector`` or ``std::array``.
To enable the OpenMP offloading backend, select it with
``LIBCXX_PSTL_BACKEND=openmp`` when installing ``libc++``. Further, when
compiling a program, the user must pass the command-line options
``-fopenmp -fexperimental-library``. To install LLVM with OpenMP offloading
enabled, please read
`the LLVM OpenMP FAQ <https://openmp.llvm.org/SupportAndFAQ.html>`_.
You may also want to visit
`the OpenMP offloading command-line argument reference <https://openmp.llvm.org/CommandLineArgumentReference.html#offload-command-line-arguments>`_.

Example
~~~~~~~

The following is an example of offloading vector addition to a GPU using our
standard library extension. It implements the classical vector addition from
BLAS that overwrites the vector ``y`` with ``y=a*x+y``. Thus ``y.begin()`` is
used both as an input and an output iterator in this example.

.. code-block:: cpp

#include <algorithm>
#include <execution>
#include <vector>

template <typename T1, typename T2, typename T3>
void axpy(const T1 a, const std::vector<T2> &x, std::vector<T3> &y) {
std::transform(std::execution::par_unseq, x.begin(), x.end(), y.begin(),
y.begin(), [=](T2 xi, T3 yi) { return a * xi + yi; });
}
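
The following is a minimal sketch of a driver for the ``axpy`` function above,
assuming the template is in scope. The compile invocation in the comment is an
assumption based on the flags documented in this section; additional offloading
flags may be required for your system.

.. code-block:: cpp

// One plausible invocation (an assumption, not prescriptive):
//   clang++ -std=c++17 -stdlib=libc++ -fopenmp -fexperimental-library axpy.cpp
#include <algorithm>
#include <cassert>
#include <execution>
#include <vector>

int main() {
  std::vector<float> x(1000, 1.0f);
  std::vector<float> y(1000, 2.0f);
  axpy(0.5f, x, y);
  // Every element of y is now 0.5f * 1.0f + 2.0f == 2.5f, which is exactly
  // representable in float
  assert(y.front() == 2.5f && y.back() == 2.5f);
  return 0;
}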

The execution policy ``std::execution::par_unseq`` states that the algorithm's
execution may be parallelized, vectorized, and migrated across threads. This is
the only execution mode that is safe to offload to GPUs; for all other
execution modes the algorithms will execute on the CPU.
Special attention must be paid to lambda captures when enabling GPU
offloading. If the lambda captures by reference, the user must manually map the
captured variables to the device. With reference captures, the ``axpy`` example
above could be implemented in the following way.

.. code-block:: cpp

template <typename T1, typename T2, typename T3>
void axpy(const T1 a, const std::vector<T2> &x, std::vector<T3> &y) {
#pragma omp target data map(to : a)
std::transform(std::execution::par_unseq, x.begin(), x.end(), y.begin(),
y.begin(), [&](T2 xi, T3 yi) { return a * xi + yi; });
}

However, if unified shared memory (USM) is enabled, no additional data mapping
is necessary when capturing by reference.
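
As a sketch, assuming a compiler and device that honor the OpenMP 5.0
``requires unified_shared_memory`` directive, the reference-capturing version
then needs no explicit mapping:

.. code-block:: cpp

#include <algorithm>
#include <execution>
#include <vector>

// Assumption: the toolchain and GPU support unified shared memory
#pragma omp requires unified_shared_memory

template <typename T1, typename T2, typename T3>
void axpy(const T1 a, const std::vector<T2> &x, std::vector<T3> &y) {
  // Host and device share one address space, so the reference-captured a
  // needs no explicit target data map
  std::transform(std::execution::par_unseq, x.begin(), x.end(), y.begin(),
                 y.begin(), [&](T2 xi, T3 yi) { return a * xi + yi; });
}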

Compiling functions for GPUs with OpenMP
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The C++ standard assumes that all accesses to memory happen within a single
address space. However, discrete GPU systems have distinct address spaces. A
single address space can be emulated if your system supports unified shared
memory, but many discrete GPU systems do not; in those cases it is important to
pass device function pointers to the parallel algorithms. Below is an example of
how the OpenMP ``declare target`` directive with the ``indirect`` clause can be
used to mark that a function should be compiled for both host and device.

.. code-block:: cpp

#include <cassert>
#include <cmath>
#include <execution>
#include <functional>
#include <numeric>
#include <vector>

// This function computes the squared difference of two floating-point numbers
float squared(float a, float b) { return a * a - 2.0f * a * b + b * b; }

// Declare that the function must be compiled for both host and device
#pragma omp declare target indirect to(squared)

int main() {
std::vector<float> a(100, 1.0);
std::vector<float> b(100, 1.25);

// Pass the host function pointer to the parallel algorithm and let OpenMP
// translate it to the device function pointer internally
float sum =
std::transform_reduce(std::execution::par_unseq, a.begin(), a.end(),
b.begin(), 0.0f, std::plus{}, squared);

// Validate that the result is approximately 6.25
assert(std::abs(sum - 6.25f) < 1e-10);
return 0;
}

Without unified shared memory, the above example will not work if the host
function pointer ``squared`` is passed to the parallel algorithm.

Important notes about exception handling
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

GPU architectures do not support exception handling and, for now,
``-fno-exceptions`` is required to offload to the GPU. Parallel CPU fallback
is available without restrictions.

Platform specific behavior
==========================

11 changes: 11 additions & 0 deletions libcxx/docs/VendorDocumentation.rst
@@ -264,6 +264,17 @@ General purpose options
default assertion handler. If this is specified as a relative path, it
is assumed to be relative to ``<monorepo>/libcxx``.

.. option:: LIBCXX_PSTL_BACKEND:STRING

**Default**: ``"std_thread"`` when ``LIBCXX_ENABLE_THREADS`` is ``ON``, otherwise ``"serial"``

**Values**: ``serial``, ``std_thread``, ``libdispatch``, ``openmp``

Select the desired backend for C++ parallel algorithms. All four options can
target multi-core CPU architectures, and ``openmp`` can additionally target
GPU architectures. The ``openmp`` backend requires OpenMP version 4.5 or
later (Clang's default OpenMP version is sufficient).

ABI Specific Options
--------------------

1 change: 1 addition & 0 deletions libcxx/include/CMakeLists.txt
@@ -613,6 +613,7 @@ set(files
__pstl/backend_fwd.h
__pstl/backends/default.h
__pstl/backends/libdispatch.h
__pstl/backends/openmp.h
__pstl/backends/serial.h
__pstl/backends/std_thread.h
__pstl/cpu_algos/any_of.h
1 change: 1 addition & 0 deletions libcxx/include/__config_site.in
@@ -38,6 +38,7 @@
#cmakedefine _LIBCPP_PSTL_BACKEND_SERIAL
#cmakedefine _LIBCPP_PSTL_BACKEND_STD_THREAD
#cmakedefine _LIBCPP_PSTL_BACKEND_LIBDISPATCH
#cmakedefine _LIBCPP_PSTL_BACKEND_OPENMP

// Hardening.
#cmakedefine _LIBCPP_HARDENING_MODE_DEFAULT @_LIBCPP_HARDENING_MODE_DEFAULT@
4 changes: 4 additions & 0 deletions libcxx/include/__pstl/backend.h
@@ -30,6 +30,10 @@ _LIBCPP_PUSH_MACROS
# elif defined(_LIBCPP_PSTL_BACKEND_LIBDISPATCH)
# include <__pstl/backends/default.h>
# include <__pstl/backends/libdispatch.h>
# elif defined(_LIBCPP_PSTL_BACKEND_OPENMP)
# include <__pstl/backends/default.h>
# include <__pstl/backends/openmp.h>
# include <__pstl/backends/std_thread.h>
# endif

#endif // _LIBCPP_STD_VER >= 17
4 changes: 4 additions & 0 deletions libcxx/include/__pstl/backend_fwd.h
@@ -49,6 +49,7 @@ struct __backend_configuration;

struct __default_backend_tag;
struct __libdispatch_backend_tag;
struct __openmp_backend_tag;
struct __serial_backend_tag;
struct __std_thread_backend_tag;

@@ -60,6 +61,9 @@ using __current_configuration _LIBCPP_NODEBUG =
# elif defined(_LIBCPP_PSTL_BACKEND_LIBDISPATCH)
using __current_configuration _LIBCPP_NODEBUG =
__backend_configuration<__libdispatch_backend_tag, __default_backend_tag>;
# elif defined(_LIBCPP_PSTL_BACKEND_OPENMP)
using __current_configuration _LIBCPP_NODEBUG =
__backend_configuration<__openmp_backend_tag, __std_thread_backend_tag, __default_backend_tag>;
# else

// ...New vendors can add parallel backends here...