[SYCL] Add support for compressed BF16 device lib images in runtime (#18108)

uditagarwal97 · web-flow · commit a53dacf4899d · 2025-04-22T13:09:57.000-07:00
#16729 added support to embed BF16 device lib in executable using dynamic linking feature. However, it does not work with `--offload-compress`. This PR fixes that. See CMPLRLLVM-66723
diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp
@@ -1912,10 +1912,13 @@ void ProgramManager::addImage(sycl_device_binary RawImg,
   if (EntriesB == EntriesE && shouldSkipEmptyImage(RawImg))
     return;
 
-  std::unique_ptr<RTDeviceBinaryImage> Img;
-  bool IsBfloat16DeviceLib = false;
   uint32_t Bfloat16DeviceLibVersion = 0;
-  if (isDeviceImageCompressed(RawImg))
+  const bool IsBfloat16DeviceLib =
+      isBfloat16DeviceLibImage(RawImg, &Bfloat16DeviceLibVersion);
+  const bool IsDeviceImageCompressed = isDeviceImageCompressed(RawImg);
+
+  std::unique_ptr<RTDeviceBinaryImage> Img;
+  if (IsDeviceImageCompressed) {
 #ifndef SYCL_RT_ZSTD_NOT_AVAIABLE
     Img = std::make_unique<CompressedRTDeviceBinaryImage>(RawImg);
 #else
@@ -1924,11 +1927,8 @@ void ProgramManager::addImage(sycl_device_binary RawImg,
                           "SYCL RT was built without ZSTD support."
                           "Aborting. ");
 #endif
-  else {
-    IsBfloat16DeviceLib =
-        isBfloat16DeviceLibImage(RawImg, &Bfloat16DeviceLibVersion);
-    if (!IsBfloat16DeviceLib)
-      Img = std::make_unique<RTDeviceBinaryImage>(RawImg);
+  } else if (!IsBfloat16DeviceLib) {
+    Img = std::make_unique<RTDeviceBinaryImage>(RawImg);
   }
 
   // If an output image is requested, set it to the newly allocated image.
@@ -1966,21 +1966,29 @@ void ProgramManager::addImage(sycl_device_binary RawImg,
              "Invalid Bfloat16 Device Library Index.");
       if (m_Bfloat16DeviceLibImages[Bfloat16DeviceLibVersion].get())
         return;
-      size_t ImgSize =
-          static_cast<size_t>(RawImg->BinaryEnd - RawImg->BinaryStart);
-      std::unique_ptr<char[]> Data(new char[ImgSize]);
-      std::memcpy(Data.get(), RawImg->BinaryStart, ImgSize);
-      auto DynBfloat16DeviceLibImg =
-          std::make_unique<DynRTDeviceBinaryImage>(std::move(Data), ImgSize);
+
+      std::unique_ptr<RTDeviceBinaryImage> DevImg;
+      if (IsDeviceImageCompressed) {
+        // Decompress the image.
+        CheckAndDecompressImage(Img.get());
+        DevImg = std::move(Img);
+      } else {
+        size_t ImgSize =
+            static_cast<size_t>(RawImg->BinaryEnd - RawImg->BinaryStart);
+        std::unique_ptr<char[]> Data(new char[ImgSize]);
+        std::memcpy(Data.get(), RawImg->BinaryStart, ImgSize);
+        DevImg =
+            std::make_unique<DynRTDeviceBinaryImage>(std::move(Data), ImgSize);
+      }
+
+      // Register export symbols for bfloat16 device library image.
       auto ESPropSet = getExportedSymbolPS(RawImg);
-      sycl_device_binary_property ESProp;
-      for (ESProp = ESPropSet->PropertiesBegin;
+      for (auto ESProp = ESPropSet->PropertiesBegin;
            ESProp != ESPropSet->PropertiesEnd; ++ESProp) {
-        m_ExportedSymbolImages.insert(
-            {ESProp->Name, DynBfloat16DeviceLibImg.get()});
+        m_ExportedSymbolImages.insert({ESProp->Name, DevImg.get()});
       }
-      m_Bfloat16DeviceLibImages[Bfloat16DeviceLibVersion] =
-          std::move(DynBfloat16DeviceLibImg);
+      m_Bfloat16DeviceLibImages[Bfloat16DeviceLibVersion] = std::move(DevImg);
+
       return;
     }
   }
diff --git a/sycl/source/detail/program_manager/program_manager.hpp b/sycl/source/detail/program_manager/program_manager.hpp
@@ -541,7 +541,7 @@ class ProgramManager {
   // version and 2nd is for native version. These bfloat16 device library
   // images are provided by compiler long time ago, we expect no further
   // update, so keeping 1 copy should be OK.
-  std::array<DynRTDeviceBinaryImageUPtr, 2> m_Bfloat16DeviceLibImages;
+  std::array<RTDeviceBinaryImageUPtr, 2> m_Bfloat16DeviceLibImages;
 
   friend class ::ProgramManagerTest;
 };
diff --git a/sycl/test-e2e/DeviceLib/bfloat16_conversion_test.cpp b/sycl/test-e2e/DeviceLib/bfloat16_conversion_test.cpp
@@ -8,70 +8,10 @@
 
 // REQUIRES: linux
 // RUN: %{build} -DBUILD_LIB -fPIC -shared -o %T/lib%basename_t.so
-
 // RUN: %{build} -DBUILD_EXE -L%T -o %t1.out -l%basename_t -Wl,-rpath=%T
 // RUN: %{run} %t1.out
 
 // UNSUPPORTED: target-nvidia || target-amd
 // UNSUPPORTED-INTENDED: bfloat16 device library is not used on AMD and Nvidia.
 
-#include <sycl/detail/core.hpp>
-#include <sycl/ext/oneapi/bfloat16.hpp>
-
-using namespace sycl;
-
-constexpr access::mode sycl_read = access::mode::read;
-constexpr access::mode sycl_write = access::mode::write;
-
-using BFP = sycl::ext::oneapi::bfloat16;
-
-#ifdef BUILD_LIB
-void foo(queue &deviceQueue) {
-  BFP bf16_v;
-  float fp32_v = 16.5f;
-  {
-    buffer<float, 1> fp32_buffer{&fp32_v, 1};
-    buffer<BFP, 1> bf16_buffer{&bf16_v, 1};
-    deviceQueue
-        .submit([&](handler &cgh) {
-          auto fp32_acc = fp32_buffer.template get_access<sycl_read>(cgh);
-          auto bf16_acc = bf16_buffer.template get_access<sycl_write>(cgh);
-          cgh.single_task([=]() { bf16_acc[0] = BFP{fp32_acc[0]}; });
-        })
-        .wait();
-  }
-  std::cout << "In foo: " << bf16_v << std::endl;
-}
-#endif
-
-#ifdef BUILD_EXE
-void foo(queue &deviceQueue);
-#endif
-
-int main() {
-  BFP bf16_array[3];
-  float fp32_array[3] = {7.0f, 8.5f, 0.5f};
-
-  sycl::queue deviceQueue;
-  {
-    buffer<float, 1> fp32_buffer{fp32_array, 3};
-    buffer<BFP, 1> bf16_buffer{bf16_array, 3};
-    deviceQueue
-        .submit([&](handler &cgh) {
-          auto fp32_acc = fp32_buffer.template get_access<sycl_read>(cgh);
-          auto bf16_acc = bf16_buffer.template get_access<sycl_write>(cgh);
-          cgh.single_task([=]() {
-            bf16_acc[0] = BFP{fp32_acc[0]};
-            bf16_acc[1] = BFP{fp32_acc[1]};
-            bf16_acc[2] = BFP{fp32_acc[2]};
-          });
-        })
-        .wait();
-  }
-  std::cout << bf16_array[0] << " " << bf16_array[1] << " " << bf16_array[2]
-            << std::endl;
-#ifdef BUILD_EXE
-  foo(deviceQueue);
-#endif
-  return 0;
-}
+#include "bfloat16_conversion_test.hpp"
diff --git a/sycl/test-e2e/DeviceLib/bfloat16_conversion_test.hpp b/sycl/test-e2e/DeviceLib/bfloat16_conversion_test.hpp
@@ -0,0 +1,60 @@
+#include <sycl/detail/core.hpp>
+#include <sycl/ext/oneapi/bfloat16.hpp>
+
+using namespace sycl;
+
+constexpr access::mode sycl_read = access::mode::read;
+constexpr access::mode sycl_write = access::mode::write;
+
+using BFP = sycl::ext::oneapi::bfloat16;
+
+#ifdef BUILD_LIB
+void foo(queue &deviceQueue) {
+  BFP bf16_v;
+  float fp32_v = 16.5f;
+  {
+    buffer<float, 1> fp32_buffer{&fp32_v, 1};
+    buffer<BFP, 1> bf16_buffer{&bf16_v, 1};
+    deviceQueue
+        .submit([&](handler &cgh) {
+          auto fp32_acc = fp32_buffer.template get_access<sycl_read>(cgh);
+          auto bf16_acc = bf16_buffer.template get_access<sycl_write>(cgh);
+          cgh.single_task([=]() { bf16_acc[0] = BFP{fp32_acc[0]}; });
+        })
+        .wait();
+  }
+  std::cout << "In foo: " << bf16_v << std::endl;
+}
+#endif
+
+#ifdef BUILD_EXE
+void foo(queue &deviceQueue);
+#endif
+
+int main() {
+  BFP bf16_array[3];
+  float fp32_array[3] = {7.0f, 8.5f, 0.5f};
+
+  sycl::queue deviceQueue;
+  {
+    buffer<float, 1> fp32_buffer{fp32_array, 3};
+    buffer<BFP, 1> bf16_buffer{bf16_array, 3};
+    deviceQueue
+        .submit([&](handler &cgh) {
+          auto fp32_acc = fp32_buffer.template get_access<sycl_read>(cgh);
+          auto bf16_acc = bf16_buffer.template get_access<sycl_write>(cgh);
+          cgh.single_task([=]() {
+            bf16_acc[0] = BFP{fp32_acc[0]};
+            bf16_acc[1] = BFP{fp32_acc[1]};
+            bf16_acc[2] = BFP{fp32_acc[2]};
+          });
+        })
+        .wait();
+  }
+  std::cout << bf16_array[0] << " " << bf16_array[1] << " " << bf16_array[2]
+            << std::endl;
+#ifdef BUILD_EXE
+  foo(deviceQueue);
+#endif
+  return 0;
+}
diff --git a/sycl/test-e2e/DeviceLib/bfloat16_conversion_test_compress.cpp b/sycl/test-e2e/DeviceLib/bfloat16_conversion_test_compress.cpp
@@ -0,0 +1,19 @@
+//==-------------- bf1oat16 devicelib test for SYCL JIT --------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Check bfloat16 devicelib device image compression.
+
+// REQUIRES: linux, zstd
+// RUN: %{build} --offload-compress -DBUILD_LIB -fPIC -shared -o %T/lib%basename_t_compress.so
+// RUN: %{build} --offload-compress -DBUILD_EXE -L%T -o %t1.out -l%basename_t_compress -Wl,-rpath=%T
+// RUN: %{run} %t1.out
+
+// UNSUPPORTED: target-nvidia || target-amd
+// UNSUPPORTED-INTENDED: bfloat16 device library is not used on AMD and Nvidia.
+
+#include "bfloat16_conversion_test.hpp"