Skip to content

Commit abc2fe3

Browse files
authored
[APFloat] Add support for f8E3M4 IEEE 754 type (#99698)
This PR adds `f8E3M4` type to APFloat.

`f8E3M4` type follows IEEE 754 convention

```c
f8E3M4 (IEEE 754)
- Exponent bias: 3
- Maximum stored exponent value: 6 (binary 110)
- Maximum unbiased exponent value: 6 - 3 = 3
- Minimum stored exponent value: 1 (binary 001)
- Minimum unbiased exponent value: 1 − 3 = −2
- Precision specifies the total number of bits used for the significand (mantissa), including implicit leading integer bit = 4 + 1 = 5
- Follows IEEE 754 conventions for representation of special values
- Has Positive and Negative zero
- Has Positive and Negative infinity
- Has NaNs

Additional details:
- Max exp (unbiased): 3
- Min exp (unbiased): -2
- Infinities (+/-): S.111.0000
- Zeros (+/-): S.000.0000
- NaNs: S.111.{0,1}⁴ except S.111.0000
- Max normal number: S.110.1111 = +/-2^(6-3) x (1 + 15/16) = +/-2^3 x 31 x 2^(-4) = +/-15.5
- Min normal number: S.001.0000 = +/-2^(1-3) x (1 + 0) = +/-2^(-2)
- Max subnormal number: S.000.1111 = +/-2^(-2) x 15/16 = +/-2^(-2) x 15 x 2^(-4) = +/-15 x 2^(-6)
- Min subnormal number: S.000.0001 = +/-2^(-2) x 1/16 = +/-2^(-2) x 2^(-4) = +/-2^(-6)
```

Related PRs:
- [PR-97179](#97179) [APFloat] Add support for f8E4M3 IEEE 754 type
1 parent fcd6bd5 commit abc2fe3

File tree

4 files changed

+108
-0
lines changed

4 files changed

+108
-0
lines changed

clang/lib/AST/MicrosoftMangle.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1015,6 +1015,7 @@ void MicrosoftCXXNameMangler::mangleFloat(llvm::APFloat Number) {
10151015
case APFloat::S_Float8E5M2FNUZ:
10161016
case APFloat::S_Float8E4M3FNUZ:
10171017
case APFloat::S_Float8E4M3B11FNUZ:
1018+
case APFloat::S_Float8E3M4:
10181019
case APFloat::S_FloatTF32:
10191020
case APFloat::S_Float6E3M2FN:
10201021
case APFloat::S_Float6E2M3FN:

llvm/include/llvm/ADT/APFloat.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,9 @@ struct APFloatBase {
188188
// This format's exponent bias is 11, instead of the 7 (2 ** (4 - 1) - 1)
189189
// that IEEE precedent would imply.
190190
S_Float8E4M3B11FNUZ,
191+
// 8-bit floating point number following IEEE-754 conventions with bit
192+
// layout S1E3M4.
193+
S_Float8E3M4,
191194
// Floating point number that occupies 32 bits or less of storage, providing
192195
// improved range compared to half (16-bit) formats, at (potentially)
193196
// greater throughput than single precision (32-bit) formats.
@@ -224,6 +227,7 @@ struct APFloatBase {
224227
static const fltSemantics &Float8E4M3FN() LLVM_READNONE;
225228
static const fltSemantics &Float8E4M3FNUZ() LLVM_READNONE;
226229
static const fltSemantics &Float8E4M3B11FNUZ() LLVM_READNONE;
230+
static const fltSemantics &Float8E3M4() LLVM_READNONE;
227231
static const fltSemantics &FloatTF32() LLVM_READNONE;
228232
static const fltSemantics &Float6E3M2FN() LLVM_READNONE;
229233
static const fltSemantics &Float6E2M3FN() LLVM_READNONE;
@@ -646,6 +650,7 @@ class IEEEFloat final : public APFloatBase {
646650
APInt convertFloat8E4M3FNAPFloatToAPInt() const;
647651
APInt convertFloat8E4M3FNUZAPFloatToAPInt() const;
648652
APInt convertFloat8E4M3B11FNUZAPFloatToAPInt() const;
653+
APInt convertFloat8E3M4APFloatToAPInt() const;
649654
APInt convertFloatTF32APFloatToAPInt() const;
650655
APInt convertFloat6E3M2FNAPFloatToAPInt() const;
651656
APInt convertFloat6E2M3FNAPFloatToAPInt() const;
@@ -665,6 +670,7 @@ class IEEEFloat final : public APFloatBase {
665670
void initFromFloat8E4M3FNAPInt(const APInt &api);
666671
void initFromFloat8E4M3FNUZAPInt(const APInt &api);
667672
void initFromFloat8E4M3B11FNUZAPInt(const APInt &api);
673+
void initFromFloat8E3M4APInt(const APInt &api);
668674
void initFromFloatTF32APInt(const APInt &api);
669675
void initFromFloat6E3M2FNAPInt(const APInt &api);
670676
void initFromFloat6E2M3FNAPInt(const APInt &api);

llvm/lib/Support/APFloat.cpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,7 @@ static constexpr fltSemantics semFloat8E4M3FNUZ = {
143143
7, -7, 4, 8, fltNonfiniteBehavior::NanOnly, fltNanEncoding::NegativeZero};
144144
static constexpr fltSemantics semFloat8E4M3B11FNUZ = {
145145
4, -10, 4, 8, fltNonfiniteBehavior::NanOnly, fltNanEncoding::NegativeZero};
146+
// Semantics descriptor for the 8-bit S1E3M4 format. Matching the commit
// description, the first three initializers line up with: maximum unbiased
// exponent 3, minimum unbiased exponent -2, and precision 5 (4 stored
// mantissa bits plus the implicit leading bit).
// NOTE(review): the trailing 8 is presumably the storage width in bits --
// confirm against the fltSemantics field order.
// NOTE(review): no fltNonfiniteBehavior / fltNanEncoding is passed here, so
// whatever defaults fltSemantics declares apply; these are expected to be the
// IEEE-754 behavior (infinities and NaNs) -- confirm.
static constexpr fltSemantics semFloat8E3M4 = {3, -2, 5, 8};
146147
static constexpr fltSemantics semFloatTF32 = {127, -126, 11, 19};
147148
static constexpr fltSemantics semFloat6E3M2FN = {
148149
4, -2, 3, 6, fltNonfiniteBehavior::FiniteOnly};
@@ -217,6 +218,8 @@ const llvm::fltSemantics &APFloatBase::EnumToSemantics(Semantics S) {
217218
return Float8E4M3FNUZ();
218219
case S_Float8E4M3B11FNUZ:
219220
return Float8E4M3B11FNUZ();
221+
case S_Float8E3M4:
222+
return Float8E3M4();
220223
case S_FloatTF32:
221224
return FloatTF32();
222225
case S_Float6E3M2FN:
@@ -257,6 +260,8 @@ APFloatBase::SemanticsToEnum(const llvm::fltSemantics &Sem) {
257260
return S_Float8E4M3FNUZ;
258261
else if (&Sem == &llvm::APFloat::Float8E4M3B11FNUZ())
259262
return S_Float8E4M3B11FNUZ;
263+
else if (&Sem == &llvm::APFloat::Float8E3M4())
264+
return S_Float8E3M4;
260265
else if (&Sem == &llvm::APFloat::FloatTF32())
261266
return S_FloatTF32;
262267
else if (&Sem == &llvm::APFloat::Float6E3M2FN())
@@ -287,6 +292,7 @@ const fltSemantics &APFloatBase::Float8E4M3FNUZ() { return semFloat8E4M3FNUZ; }
287292
const fltSemantics &APFloatBase::Float8E4M3B11FNUZ() {
288293
return semFloat8E4M3B11FNUZ;
289294
}
295+
// Returns a reference to the file-local semFloat8E3M4 semantics descriptor.
const fltSemantics &APFloatBase::Float8E3M4() { return semFloat8E3M4; }
290296
const fltSemantics &APFloatBase::FloatTF32() { return semFloatTF32; }
291297
const fltSemantics &APFloatBase::Float6E3M2FN() { return semFloat6E3M2FN; }
292298
const fltSemantics &APFloatBase::Float6E2M3FN() { return semFloat6E2M3FN; }
@@ -3643,6 +3649,11 @@ APInt IEEEFloat::convertFloat8E4M3B11FNUZAPFloatToAPInt() const {
36433649
return convertIEEEFloatToAPInt<semFloat8E4M3B11FNUZ>();
36443650
}
36453651

3652+
// Produces the APInt encoding of this value for the Float8E3M4 semantics by
// delegating to the generic convertIEEEFloatToAPInt<> template instantiated
// with semFloat8E3M4. Called from bitcastToAPInt() when the value's semantics
// are semFloat8E3M4.
APInt IEEEFloat::convertFloat8E3M4APFloatToAPInt() const {
3653+
// Sanity check: the value is expected to use exactly one part.
assert(partCount() == 1);
3654+
return convertIEEEFloatToAPInt<semFloat8E3M4>();
3655+
}
3656+
36463657
APInt IEEEFloat::convertFloatTF32APFloatToAPInt() const {
36473658
assert(partCount() == 1);
36483659
return convertIEEEFloatToAPInt<semFloatTF32>();
@@ -3704,6 +3715,9 @@ APInt IEEEFloat::bitcastToAPInt() const {
37043715
if (semantics == (const llvm::fltSemantics *)&semFloat8E4M3B11FNUZ)
37053716
return convertFloat8E4M3B11FNUZAPFloatToAPInt();
37063717

3718+
if (semantics == (const llvm::fltSemantics *)&semFloat8E3M4)
3719+
return convertFloat8E3M4APFloatToAPInt();
3720+
37073721
if (semantics == (const llvm::fltSemantics *)&semFloatTF32)
37083722
return convertFloatTF32APFloatToAPInt();
37093723

@@ -3932,6 +3946,10 @@ void IEEEFloat::initFromFloat8E4M3B11FNUZAPInt(const APInt &api) {
39323946
initFromIEEEAPInt<semFloat8E4M3B11FNUZ>(api);
39333947
}
39343948

3949+
// Initializes this value from the APInt `api` using the Float8E3M4 semantics,
// delegating to the generic initFromIEEEAPInt<> template instantiated with
// semFloat8E3M4. Dispatched to from initFromAPInt() when Sem is
// &semFloat8E3M4.
void IEEEFloat::initFromFloat8E3M4APInt(const APInt &api) {
3950+
initFromIEEEAPInt<semFloat8E3M4>(api);
3951+
}
3952+
39353953
void IEEEFloat::initFromFloatTF32APInt(const APInt &api) {
39363954
initFromIEEEAPInt<semFloatTF32>(api);
39373955
}
@@ -3977,6 +3995,8 @@ void IEEEFloat::initFromAPInt(const fltSemantics *Sem, const APInt &api) {
39773995
return initFromFloat8E4M3FNUZAPInt(api);
39783996
if (Sem == &semFloat8E4M3B11FNUZ)
39793997
return initFromFloat8E4M3B11FNUZAPInt(api);
3998+
if (Sem == &semFloat8E3M4)
3999+
return initFromFloat8E3M4APInt(api);
39804000
if (Sem == &semFloatTF32)
39814001
return initFromFloatTF32APInt(api);
39824002
if (Sem == &semFloat6E3M2FN)

llvm/unittests/ADT/APFloatTest.cpp

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2141,6 +2141,8 @@ TEST(APFloatTest, getZero) {
21412141
{&APFloat::Float8E4M3FNUZ(), true, false, {0, 0}, 1},
21422142
{&APFloat::Float8E4M3B11FNUZ(), false, false, {0, 0}, 1},
21432143
{&APFloat::Float8E4M3B11FNUZ(), true, false, {0, 0}, 1},
2144+
{&APFloat::Float8E3M4(), false, true, {0, 0}, 1},
2145+
{&APFloat::Float8E3M4(), true, true, {0x80ULL, 0}, 1},
21442146
{&APFloat::FloatTF32(), false, true, {0, 0}, 1},
21452147
{&APFloat::FloatTF32(), true, true, {0x40000ULL, 0}, 1},
21462148
{&APFloat::Float6E3M2FN(), false, true, {0, 0}, 1},
@@ -6636,6 +6638,45 @@ TEST(APFloatTest, Float8E4M3FNUZToDouble) {
66366638
EXPECT_TRUE(std::isnan(QNaN.convertToDouble()));
66376639
}
66386640

6641+
// Checks convertToDouble() on Float8E3M4 values: signed zeros, 1.0 and 2.0,
// the largest finite value, the smallest normalized value, the smallest
// denormal, both infinities, and a quiet NaN.
TEST(APFloatTest, Float8E3M4ToDouble) {
6642+
// Signed zeros: the result is wrapped back into an APFloat so the sign of
// zero can be inspected with isPosZero()/isNegZero().
APFloat PosZero = APFloat::getZero(APFloat::Float8E3M4(), false);
6643+
APFloat PosZeroToDouble(PosZero.convertToDouble());
6644+
EXPECT_TRUE(PosZeroToDouble.isPosZero());
6645+
APFloat NegZero = APFloat::getZero(APFloat::Float8E3M4(), true);
6646+
APFloat NegZeroToDouble(NegZero.convertToDouble());
6647+
EXPECT_TRUE(NegZeroToDouble.isNegZero());
6648+
6649+
// Values parsed from decimal strings.
APFloat One(APFloat::Float8E3M4(), "1.0");
6650+
EXPECT_EQ(1.0, One.convertToDouble());
6651+
APFloat Two(APFloat::Float8E3M4(), "2.0");
6652+
EXPECT_EQ(2.0, Two.convertToDouble());
6653+
// Largest finite magnitude is expected to be 15.5. The expected value is
// written as a float literal; 15.5 is exactly representable, so the
// comparison against the double result is still exact.
APFloat PosLargest = APFloat::getLargest(APFloat::Float8E3M4(), false);
6654+
EXPECT_EQ(15.5F, PosLargest.convertToDouble());
6655+
APFloat NegLargest = APFloat::getLargest(APFloat::Float8E3M4(), true);
6656+
EXPECT_EQ(-15.5F, NegLargest.convertToDouble());
6657+
// Smallest normalized magnitude is expected to be 2^-2 (hex literal 0x1.p-2).
APFloat PosSmallest =
6658+
APFloat::getSmallestNormalized(APFloat::Float8E3M4(), false);
6659+
EXPECT_EQ(0x1.p-2, PosSmallest.convertToDouble());
6660+
APFloat NegSmallest =
6661+
APFloat::getSmallestNormalized(APFloat::Float8E3M4(), true);
6662+
EXPECT_EQ(-0x1.p-2, NegSmallest.convertToDouble());
6663+
6664+
// Smallest magnitude overall: must be a denormal and equal to 2^-6.
APFloat PosSmallestDenorm =
6665+
APFloat::getSmallest(APFloat::Float8E3M4(), false);
6666+
EXPECT_TRUE(PosSmallestDenorm.isDenormal());
6667+
EXPECT_EQ(0x1.p-6, PosSmallestDenorm.convertToDouble());
6668+
APFloat NegSmallestDenorm = APFloat::getSmallest(APFloat::Float8E3M4(), true);
6669+
EXPECT_TRUE(NegSmallestDenorm.isDenormal());
6670+
EXPECT_EQ(-0x1.p-6, NegSmallestDenorm.convertToDouble());
6671+
6672+
// Non-finite values: infinities map to double infinities, NaN stays NaN.
APFloat PosInf = APFloat::getInf(APFloat::Float8E3M4());
6673+
EXPECT_EQ(std::numeric_limits<double>::infinity(), PosInf.convertToDouble());
6674+
APFloat NegInf = APFloat::getInf(APFloat::Float8E3M4(), true);
6675+
EXPECT_EQ(-std::numeric_limits<double>::infinity(), NegInf.convertToDouble());
6676+
APFloat QNaN = APFloat::getQNaN(APFloat::Float8E3M4());
6677+
EXPECT_TRUE(std::isnan(QNaN.convertToDouble()));
6678+
}
6679+
66396680
TEST(APFloatTest, FloatTF32ToDouble) {
66406681
APFloat One(APFloat::FloatTF32(), "1.0");
66416682
EXPECT_EQ(1.0, One.convertToDouble());
@@ -6944,6 +6985,46 @@ TEST(APFloatTest, Float8E4M3FNToFloat) {
69446985
EXPECT_TRUE(std::isnan(QNaN.convertToFloat()));
69456986
}
69466987

6988+
// Checks convertToFloat() on Float8E3M4 values: signed zeros, 1.0 and 2.0,
// the largest finite value, the smallest normalized value, the smallest
// denormal, both infinities, and a quiet NaN.
TEST(APFloatTest, Float8E3M4ToFloat) {
6989+
// Signed zeros: the result is wrapped back into an APFloat so the sign of
// zero can be inspected with isPosZero()/isNegZero().
APFloat PosZero = APFloat::getZero(APFloat::Float8E3M4(), false);
6990+
APFloat PosZeroToFloat(PosZero.convertToFloat());
6991+
EXPECT_TRUE(PosZeroToFloat.isPosZero());
6992+
APFloat NegZero = APFloat::getZero(APFloat::Float8E3M4(), true);
6993+
APFloat NegZeroToFloat(NegZero.convertToFloat());
6994+
EXPECT_TRUE(NegZeroToFloat.isNegZero());
6995+
6996+
// Values parsed from decimal strings.
APFloat One(APFloat::Float8E3M4(), "1.0");
6997+
EXPECT_EQ(1.0F, One.convertToFloat());
6998+
APFloat Two(APFloat::Float8E3M4(), "2.0");
6999+
EXPECT_EQ(2.0F, Two.convertToFloat());
7000+
7001+
// Largest finite magnitude is expected to be 15.5.
APFloat PosLargest = APFloat::getLargest(APFloat::Float8E3M4(), false);
7002+
EXPECT_EQ(15.5F, PosLargest.convertToFloat());
7003+
APFloat NegLargest = APFloat::getLargest(APFloat::Float8E3M4(), true);
7004+
EXPECT_EQ(-15.5F, NegLargest.convertToFloat());
7005+
// Smallest normalized magnitude is expected to be 2^-2. The hex literals
// below are doubles; powers of two are exactly representable, so comparing
// them against the float result is still exact.
APFloat PosSmallest =
7006+
APFloat::getSmallestNormalized(APFloat::Float8E3M4(), false);
7007+
EXPECT_EQ(0x1.p-2, PosSmallest.convertToFloat());
7008+
APFloat NegSmallest =
7009+
APFloat::getSmallestNormalized(APFloat::Float8E3M4(), true);
7010+
EXPECT_EQ(-0x1.p-2, NegSmallest.convertToFloat());
7011+
7012+
// Smallest magnitude overall: must be a denormal and equal to 2^-6.
APFloat PosSmallestDenorm =
7013+
APFloat::getSmallest(APFloat::Float8E3M4(), false);
7014+
EXPECT_TRUE(PosSmallestDenorm.isDenormal());
7015+
EXPECT_EQ(0x1.p-6, PosSmallestDenorm.convertToFloat());
7016+
APFloat NegSmallestDenorm = APFloat::getSmallest(APFloat::Float8E3M4(), true);
7017+
EXPECT_TRUE(NegSmallestDenorm.isDenormal());
7018+
EXPECT_EQ(-0x1.p-6, NegSmallestDenorm.convertToFloat());
7019+
7020+
// Non-finite values: infinities map to float infinities, NaN stays NaN.
APFloat PosInf = APFloat::getInf(APFloat::Float8E3M4());
7021+
EXPECT_EQ(std::numeric_limits<float>::infinity(), PosInf.convertToFloat());
7022+
APFloat NegInf = APFloat::getInf(APFloat::Float8E3M4(), true);
7023+
EXPECT_EQ(-std::numeric_limits<float>::infinity(), NegInf.convertToFloat());
7024+
APFloat QNaN = APFloat::getQNaN(APFloat::Float8E3M4());
7025+
EXPECT_TRUE(std::isnan(QNaN.convertToFloat()));
7026+
}
7027+
69477028
TEST(APFloatTest, FloatTF32ToFloat) {
69487029
APFloat PosZero = APFloat::getZero(APFloat::FloatTF32());
69497030
APFloat PosZeroToFloat(PosZero.convertToFloat());

0 commit comments

Comments
 (0)