Skip to content

Commit a53dacf

Browse files
[SYCL] Add support for compressed BF16 device lib images in runtime (#18108)
#16729 added support to embed BF16 device lib in executable using dynamic linking feature. However, it does not work with `--offload-compress`. This PR fixes that. See CMPLRLLVM-66723
1 parent 3fa9c4a commit a53dacf

File tree

5 files changed

+109
-82
lines changed

5 files changed

+109
-82
lines changed

sycl/source/detail/program_manager/program_manager.cpp

+28-20
Original file line numberDiff line numberDiff line change
@@ -1912,10 +1912,13 @@ void ProgramManager::addImage(sycl_device_binary RawImg,
19121912
if (EntriesB == EntriesE && shouldSkipEmptyImage(RawImg))
19131913
return;
19141914

1915-
std::unique_ptr<RTDeviceBinaryImage> Img;
1916-
bool IsBfloat16DeviceLib = false;
19171915
uint32_t Bfloat16DeviceLibVersion = 0;
1918-
if (isDeviceImageCompressed(RawImg))
1916+
const bool IsBfloat16DeviceLib =
1917+
isBfloat16DeviceLibImage(RawImg, &Bfloat16DeviceLibVersion);
1918+
const bool IsDeviceImageCompressed = isDeviceImageCompressed(RawImg);
1919+
1920+
std::unique_ptr<RTDeviceBinaryImage> Img;
1921+
if (IsDeviceImageCompressed) {
19191922
#ifndef SYCL_RT_ZSTD_NOT_AVAIABLE
19201923
Img = std::make_unique<CompressedRTDeviceBinaryImage>(RawImg);
19211924
#else
@@ -1924,11 +1927,8 @@ void ProgramManager::addImage(sycl_device_binary RawImg,
19241927
"SYCL RT was built without ZSTD support."
19251928
"Aborting. ");
19261929
#endif
1927-
else {
1928-
IsBfloat16DeviceLib =
1929-
isBfloat16DeviceLibImage(RawImg, &Bfloat16DeviceLibVersion);
1930-
if (!IsBfloat16DeviceLib)
1931-
Img = std::make_unique<RTDeviceBinaryImage>(RawImg);
1930+
} else if (!IsBfloat16DeviceLib) {
1931+
Img = std::make_unique<RTDeviceBinaryImage>(RawImg);
19321932
}
19331933

19341934
// If an output image is requested, set it to the newly allocated image.
@@ -1966,21 +1966,29 @@ void ProgramManager::addImage(sycl_device_binary RawImg,
19661966
"Invalid Bfloat16 Device Library Index.");
19671967
if (m_Bfloat16DeviceLibImages[Bfloat16DeviceLibVersion].get())
19681968
return;
1969-
size_t ImgSize =
1970-
static_cast<size_t>(RawImg->BinaryEnd - RawImg->BinaryStart);
1971-
std::unique_ptr<char[]> Data(new char[ImgSize]);
1972-
std::memcpy(Data.get(), RawImg->BinaryStart, ImgSize);
1973-
auto DynBfloat16DeviceLibImg =
1974-
std::make_unique<DynRTDeviceBinaryImage>(std::move(Data), ImgSize);
1969+
1970+
std::unique_ptr<RTDeviceBinaryImage> DevImg;
1971+
if (IsDeviceImageCompressed) {
1972+
// Decompress the image.
1973+
CheckAndDecompressImage(Img.get());
1974+
DevImg = std::move(Img);
1975+
} else {
1976+
size_t ImgSize =
1977+
static_cast<size_t>(RawImg->BinaryEnd - RawImg->BinaryStart);
1978+
std::unique_ptr<char[]> Data(new char[ImgSize]);
1979+
std::memcpy(Data.get(), RawImg->BinaryStart, ImgSize);
1980+
DevImg =
1981+
std::make_unique<DynRTDeviceBinaryImage>(std::move(Data), ImgSize);
1982+
}
1983+
1984+
// Register export symbols for bfloat16 device library image.
19751985
auto ESPropSet = getExportedSymbolPS(RawImg);
1976-
sycl_device_binary_property ESProp;
1977-
for (ESProp = ESPropSet->PropertiesBegin;
1986+
for (auto ESProp = ESPropSet->PropertiesBegin;
19781987
ESProp != ESPropSet->PropertiesEnd; ++ESProp) {
1979-
m_ExportedSymbolImages.insert(
1980-
{ESProp->Name, DynBfloat16DeviceLibImg.get()});
1988+
m_ExportedSymbolImages.insert({ESProp->Name, DevImg.get()});
19811989
}
1982-
m_Bfloat16DeviceLibImages[Bfloat16DeviceLibVersion] =
1983-
std::move(DynBfloat16DeviceLibImg);
1990+
m_Bfloat16DeviceLibImages[Bfloat16DeviceLibVersion] = std::move(DevImg);
1991+
19841992
return;
19851993
}
19861994
}

sycl/source/detail/program_manager/program_manager.hpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -541,7 +541,7 @@ class ProgramManager {
541541
// version and 2nd is for native version. These bfloat16 device library
542542
// images are provided by compiler long time ago, we expect no further
543543
// update, so keeping 1 copy should be OK.
544-
std::array<DynRTDeviceBinaryImageUPtr, 2> m_Bfloat16DeviceLibImages;
544+
std::array<RTDeviceBinaryImageUPtr, 2> m_Bfloat16DeviceLibImages;
545545

546546
friend class ::ProgramManagerTest;
547547
};

sycl/test-e2e/DeviceLib/bfloat16_conversion_test.cpp

+1-61
Original file line numberDiff line numberDiff line change
@@ -8,70 +8,10 @@
88

99
// REQUIRES: linux
1010
// RUN: %{build} -DBUILD_LIB -fPIC -shared -o %T/lib%basename_t.so
11-
1211
// RUN: %{build} -DBUILD_EXE -L%T -o %t1.out -l%basename_t -Wl,-rpath=%T
1312
// RUN: %{run} %t1.out
1413

1514
// UNSUPPORTED: target-nvidia || target-amd
1615
// UNSUPPORTED-INTENDED: bfloat16 device library is not used on AMD and Nvidia.
1716

18-
#include <sycl/detail/core.hpp>
19-
#include <sycl/ext/oneapi/bfloat16.hpp>
20-
21-
using namespace sycl;
22-
23-
constexpr access::mode sycl_read = access::mode::read;
24-
constexpr access::mode sycl_write = access::mode::write;
25-
26-
using BFP = sycl::ext::oneapi::bfloat16;
27-
28-
#ifdef BUILD_LIB
29-
void foo(queue &deviceQueue) {
30-
BFP bf16_v;
31-
float fp32_v = 16.5f;
32-
{
33-
buffer<float, 1> fp32_buffer{&fp32_v, 1};
34-
buffer<BFP, 1> bf16_buffer{&bf16_v, 1};
35-
deviceQueue
36-
.submit([&](handler &cgh) {
37-
auto fp32_acc = fp32_buffer.template get_access<sycl_read>(cgh);
38-
auto bf16_acc = bf16_buffer.template get_access<sycl_write>(cgh);
39-
cgh.single_task([=]() { bf16_acc[0] = BFP{fp32_acc[0]}; });
40-
})
41-
.wait();
42-
}
43-
std::cout << "In foo: " << bf16_v << std::endl;
44-
}
45-
#endif
46-
47-
#ifdef BUILD_EXE
48-
void foo(queue &deviceQueue);
49-
#endif
50-
51-
int main() {
52-
BFP bf16_array[3];
53-
float fp32_array[3] = {7.0f, 8.5f, 0.5f};
54-
55-
sycl::queue deviceQueue;
56-
{
57-
buffer<float, 1> fp32_buffer{fp32_array, 3};
58-
buffer<BFP, 1> bf16_buffer{bf16_array, 3};
59-
deviceQueue
60-
.submit([&](handler &cgh) {
61-
auto fp32_acc = fp32_buffer.template get_access<sycl_read>(cgh);
62-
auto bf16_acc = bf16_buffer.template get_access<sycl_write>(cgh);
63-
cgh.single_task([=]() {
64-
bf16_acc[0] = BFP{fp32_acc[0]};
65-
bf16_acc[1] = BFP{fp32_acc[1]};
66-
bf16_acc[2] = BFP{fp32_acc[2]};
67-
});
68-
})
69-
.wait();
70-
}
71-
std::cout << bf16_array[0] << " " << bf16_array[1] << " " << bf16_array[2]
72-
<< std::endl;
73-
#ifdef BUILD_EXE
74-
foo(deviceQueue);
75-
#endif
76-
return 0;
77-
}
17+
#include "bfloat16_conversion_test.hpp"
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
#include <sycl/detail/core.hpp>
2+
#include <sycl/ext/oneapi/bfloat16.hpp>
3+
4+
using namespace sycl;
5+
6+
constexpr access::mode sycl_read = access::mode::read;
7+
constexpr access::mode sycl_write = access::mode::write;
8+
9+
using BFP = sycl::ext::oneapi::bfloat16;
10+
11+
#ifdef BUILD_LIB
12+
void foo(queue &deviceQueue) {
13+
BFP bf16_v;
14+
float fp32_v = 16.5f;
15+
{
16+
buffer<float, 1> fp32_buffer{&fp32_v, 1};
17+
buffer<BFP, 1> bf16_buffer{&bf16_v, 1};
18+
deviceQueue
19+
.submit([&](handler &cgh) {
20+
auto fp32_acc = fp32_buffer.template get_access<sycl_read>(cgh);
21+
auto bf16_acc = bf16_buffer.template get_access<sycl_write>(cgh);
22+
cgh.single_task([=]() { bf16_acc[0] = BFP{fp32_acc[0]}; });
23+
})
24+
.wait();
25+
}
26+
std::cout << "In foo: " << bf16_v << std::endl;
27+
}
28+
#endif
29+
30+
#ifdef BUILD_EXE
31+
void foo(queue &deviceQueue);
32+
#endif
33+
34+
int main() {
35+
BFP bf16_array[3];
36+
float fp32_array[3] = {7.0f, 8.5f, 0.5f};
37+
38+
sycl::queue deviceQueue;
39+
{
40+
buffer<float, 1> fp32_buffer{fp32_array, 3};
41+
buffer<BFP, 1> bf16_buffer{bf16_array, 3};
42+
deviceQueue
43+
.submit([&](handler &cgh) {
44+
auto fp32_acc = fp32_buffer.template get_access<sycl_read>(cgh);
45+
auto bf16_acc = bf16_buffer.template get_access<sycl_write>(cgh);
46+
cgh.single_task([=]() {
47+
bf16_acc[0] = BFP{fp32_acc[0]};
48+
bf16_acc[1] = BFP{fp32_acc[1]};
49+
bf16_acc[2] = BFP{fp32_acc[2]};
50+
});
51+
})
52+
.wait();
53+
}
54+
std::cout << bf16_array[0] << " " << bf16_array[1] << " " << bf16_array[2]
55+
<< std::endl;
56+
#ifdef BUILD_EXE
57+
foo(deviceQueue);
58+
#endif
59+
return 0;
60+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
//==-------------- bf1oat16 devicelib test for SYCL JIT --------------------==//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
// Check bfloat16 devicelib device image compression.
10+
11+
// REQUIRES: linux, zstd
12+
// RUN: %{build} --offload-compress -DBUILD_LIB -fPIC -shared -o %T/lib%basename_t_compress.so
13+
// RUN: %{build} --offload-compress -DBUILD_EXE -L%T -o %t1.out -l%basename_t_compress -Wl,-rpath=%T
14+
// RUN: %{run} %t1.out
15+
16+
// UNSUPPORTED: target-nvidia || target-amd
17+
// UNSUPPORTED-INTENDED: bfloat16 device library is not used on AMD and Nvidia.
18+
19+
#include "bfloat16_conversion_test.hpp"

0 commit comments

Comments
 (0)