Skip to content

[libclc] Reduce bithacking for INF/NAN values #129738

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions libclc/clc/include/clc/float/definitions.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
#define FLT_MIN 0x1.0p-126f
#define FLT_EPSILON 0x1.0p-23f
#define FLT_NAN __builtin_nanf("")
#define FLT_PINF (__builtin_inff())
#define FLT_NINF (-__builtin_inff())

#define FP_ILOGB0 (-2147483647 - 1)
#define FP_ILOGBNAN 2147483647
Expand Down Expand Up @@ -47,6 +49,8 @@
#define DBL_MIN 0x1.0p-1022
#define DBL_EPSILON 0x1.0p-52
#define DBL_NAN __builtin_nan("")
#define DBL_PINF (__builtin_inf())
#define DBL_NINF (-__builtin_inf())

#define M_E 0x1.5bf0a8b145769p+1
#define M_LOG2E 0x1.71547652b82fep+0
Expand Down Expand Up @@ -82,6 +86,8 @@
#define HALF_MIN 0x1.0p-14h
#define HALF_EPSILON 0x1.0p-10h
#define HALF_NAN __builtin_nanf16("")
#define HALF_PINF (__builtin_inff16())
#define HALF_NINF (-__builtin_inff16())

#define M_LOG2E_H 0x1.714p+0h

Expand Down
23 changes: 23 additions & 0 deletions libclc/clc/include/clc/math/gentype.inc
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include <clc/clcfunc.h>
#include <clc/clctypes.h>
#include <clc/float/definitions.h>
#include <clc/utils.h>

// Define some useful macros for type conversions.
Expand Down Expand Up @@ -45,6 +46,8 @@
#define __CLC_CONVERT_UINTN __CLC_XCONCAT(__clc_convert_, __CLC_UINTN)
#define __CLC_CONVERT_ULONGN __CLC_XCONCAT(__clc_convert_, __CLC_ULONGN)

#define __CLC_GENTYPE_INF __CLC_GENTYPE_PINF

// See definitions of __CLC_S_GENTYPE/__CLC_U_GENTYPE below, which depend on the
// specific size of floating-point type. These are the signed and unsigned
// integers of the same bitwidth and element count as the GENTYPE. They match
Expand All @@ -59,6 +62,9 @@
#define __CLC_SCALAR_GENTYPE float
#define __CLC_FPSIZE 32
#define __CLC_FP_LIT(x) x##F
#define __CLC_GENTYPE_PINF ((__CLC_GENTYPE)FLT_PINF)
#define __CLC_GENTYPE_NINF ((__CLC_GENTYPE)FLT_NINF)
#define __CLC_GENTYPE_NAN ((__CLC_GENTYPE)FLT_NAN)

#define __CLC_S_GENTYPE __CLC_XCONCAT(int, __CLC_VECSIZE)
#define __CLC_U_GENTYPE __CLC_XCONCAT(uint, __CLC_VECSIZE)
Expand Down Expand Up @@ -115,6 +121,9 @@

#undef __CLC_U_GENTYPE
#undef __CLC_S_GENTYPE
#undef __CLC_GENTYPE_NINF
#undef __CLC_GENTYPE_PINF
#undef __CLC_GENTYPE_NAN
#undef __CLC_FP_LIT
#undef __CLC_FPSIZE
#undef __CLC_SCALAR_GENTYPE
Expand All @@ -126,6 +135,9 @@
#define __CLC_SCALAR_GENTYPE double
#define __CLC_FPSIZE 64
#define __CLC_FP_LIT(x) (x)
#define __CLC_GENTYPE_PINF ((__CLC_GENTYPE)DBL_PINF)
#define __CLC_GENTYPE_NINF ((__CLC_GENTYPE)DBL_NINF)
#define __CLC_GENTYPE_NAN ((__CLC_GENTYPE)DBL_NAN)

#define __CLC_S_GENTYPE __CLC_XCONCAT(long, __CLC_VECSIZE)
#define __CLC_U_GENTYPE __CLC_XCONCAT(ulong, __CLC_VECSIZE)
Expand Down Expand Up @@ -182,6 +194,9 @@

#undef __CLC_U_GENTYPE
#undef __CLC_S_GENTYPE
#undef __CLC_GENTYPE_NINF
#undef __CLC_GENTYPE_PINF
#undef __CLC_GENTYPE_NAN
#undef __CLC_FP_LIT
#undef __CLC_FPSIZE
#undef __CLC_SCALAR_GENTYPE
Expand All @@ -195,6 +210,9 @@
#define __CLC_SCALAR_GENTYPE half
#define __CLC_FPSIZE 16
#define __CLC_FP_LIT(x) x##H
#define __CLC_GENTYPE_NAN ((__CLC_GENTYPE)HALF_NAN)
#define __CLC_GENTYPE_PINF ((__CLC_GENTYPE)HALF_PINF)
#define __CLC_GENTYPE_NINF ((__CLC_GENTYPE)HALF_NINF)

#define __CLC_S_GENTYPE __CLC_XCONCAT(short, __CLC_VECSIZE)
#define __CLC_U_GENTYPE __CLC_XCONCAT(ushort, __CLC_VECSIZE)
Expand Down Expand Up @@ -251,6 +269,9 @@

#undef __CLC_U_GENTYPE
#undef __CLC_S_GENTYPE
#undef __CLC_GENTYPE_NINF
#undef __CLC_GENTYPE_PINF
#undef __CLC_GENTYPE_NAN
#undef __CLC_FP_LIT
#undef __CLC_FPSIZE
#undef __CLC_SCALAR_GENTYPE
Expand Down Expand Up @@ -289,6 +310,8 @@
#undef __CLC_CONVERT_UINTN
#undef __CLC_CONVERT_ULONGN

#undef __CLC_GENTYPE_INF

#undef __CLC_ULONGN
#undef __CLC_UINTN
#undef __CLC_USHORTN
Expand Down
1 change: 1 addition & 0 deletions libclc/clc/lib/generic/math/clc_hypot.cl
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include <clc/math/clc_sqrt.h>
#include <clc/math/clc_subnormal_config.h>
#include <clc/math/math.h>
#include <clc/relational/clc_isinf.h>
#include <clc/relational/clc_isnan.h>
#include <clc/shared/clc_clamp.h>

Expand Down
11 changes: 3 additions & 8 deletions libclc/clc/lib/generic/math/clc_hypot.inc
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,7 @@ _CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_hypot(__CLC_GENTYPE x,
__CLC_GENTYPE retval = __clc_sqrt(__clc_mad(fx, fx, fy * fy)) * fx_exp;

retval = (ux > PINFBITPATT_SP32 || uy == 0) ? __CLC_AS_GENTYPE(ux) : retval;
retval = (ux == PINFBITPATT_SP32 || uy == PINFBITPATT_SP32)
? __CLC_AS_GENTYPE((__CLC_UINTN)PINFBITPATT_SP32)
: retval;
retval = __clc_isinf(x) || __clc_isinf(y) ? __CLC_GENTYPE_INF : retval;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Above can be rewritten with frexp and ldexp.

The output of the temporary result also implies one or the other were infinity, you can fold this to one isinf check on the maximum of fabs x and y (which is done as integer above)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

aux and auy can be rewritten to float typed fabs(x),y. ux and uy are then applying a umax, which can be replaced by fmax. The fmax result then has the implied infiniteness, and you can check if that is infinity down here.

The bithacking below that is frexp, and the scaling applied to fx and fy can be done with ldexp from the integer exponent obtained from the frexp

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right yeah, I see. Thank you for the explanation.

I'm still pondering the naming of the frexp and ldexp helpers you propose. It's not fully-fledged frexp or ldexp as far as I can tell. If it is, then our implementations of frexp and ldexp are far too complicated. Are we talking about a 'native' frexp and ldexp? A 'fast' one? That's just going on existing OpenCL terminology, which may or may not confuse things. Should they handle NaNs, Infs, subnormals?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AMDGPU instructions names call them frexp_exp and frexp_mant. It's just the two parts of the frexp, split into 2 instructions.

ldexp is just ldexp, there's nothing special to it

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can't tell from the documentation: what do frexp_exp and frexp_mant do with subnormal numbers? For exp, D0.i32 = exponent(S0.f32) - 127 + 1 implies it returns -126, which wouldn't be what frexp returns for the same input. So frexp relies on extra processing of frexp_exp and frexp_mant?

We would want consistent results across all libclc targets for a hypothetical __clc_frexp_exp and __clc_frexp_mant. We'd need to define the various edge cases and subnormal handling.

For targets without native support for either of these operations I worry that using these functions would be less optimal than doing bit manipulation/scaling/etc directly inline.

Might we not just end up in a scenario where AMDGPU in fact wants a custom implementation of __clc_hypot, rather than a generic solution for all targets involving frexp/ldexp?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

They behave correctly, not sure what documentation you are referring to. The lowering of llvm.frexp is just the two intrinsic calls

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was looking at V_FREXP_EXP_I32_F32 and V_FREXP_MANT_F32 in the RDNA 3.5 docs.

I was previously looking at the lowering of llvm.frexp in CodeGen tests and saw more than just those two instructions, but now I realise I was only seeing the GFX6 output. The lowering is indeed just those two instructions for other architectures.

My broader point is that I need to align the semantics of these instructions with what other targets would have to do for the equivalent operations. To match the AMDGPU behaviour like-for-like on other targets, such as separating out the two exp/mant operations, having subnormal support, etc., would be far more expensive than the bithacking it's currently doing in hypot to just shift out the mantissa. But if the AMDGPU version of "frexp_mant" does subnormal scaling and other targets don't, then that could cause bugs between platforms as now we can't guarantee the bits that come out of the frexp_mant and frexp_exp helpers.

That's why I suggested it might just be better to have AMDGPU override hypot directly, rather than rely on a "generic" hypot which calls into mant/exp helpers which AMDGPU specialize.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But if the AMDGPU version of "frexp_mant" does subnormal scaling and other targets don't, then that could cause bugs between platforms as now we can't guarantee the bits that come out of the frexp_mant and frexp_exp helpers.

If they don't handle denormals, it's an incorrect implementation of frexp or frexp_mant. The operation is not simply mask out the exponent bits. The clamping to 126 code above is dealing with the denormal cases. That can be folded into a software frexp_mant utility which happens to be one instruction for amdgpu

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

float hypot(float x, float y) {
    float a = __builtin_fabsf(x);
    float b = __builtin_fabsf(y);
    float t = __builtin_fmaxf(a, b);
    int e = __clc_frexp_exp(t) ;
    a = __builtin_ldexpf(a, -e);
    b = __builtin_ldexpf(b, -e);
    float ret = __builtin_ldexpf(__clc_sqrt(__clc_mad(a, a, b*b)), e);
    ret = __builtin_isinf(t) ? PINF_F32 : ret;
    return ret;
}

return retval;
}

Expand Down Expand Up @@ -86,13 +84,10 @@ _CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_hypot(__CLC_GENTYPE x,
r = c ? s : r;

// Check for NaN
c = __clc_isnan(x) || __clc_isnan(y);
r = c ? __CLC_AS_GENTYPE((__CLC_ULONGN)QNANBITPATT_DP64) : r;
r = __clc_isnan(x) || __clc_isnan(y) ? __CLC_GENTYPE_NAN : r;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Doesn't matter much, but can fold this condition into one __builtin_isunordered (or just let instcombine do it)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Above should also be rewritten with frexp and ldexp

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Doesn't matter much, but can fold this condition into one __builtin_isunordered (or just let instcombine do it)

Yeah I think I'll just let instcombine do it (which it does).

Above should also be rewritten with frexp and ldexp

Sorry, I'm not seeing which bit - could you explain? I note that libclc's frexp and ldexp are more involved than any of the bithacking here (possibly due to subnormal support?) so I'm not sure what the goal would be.


// If either is Inf, we must return Inf
c = x == __CLC_AS_GENTYPE((__CLC_ULONGN)PINFBITPATT_DP64) ||
y == __CLC_AS_GENTYPE((__CLC_ULONGN)PINFBITPATT_DP64);
r = c ? __CLC_AS_GENTYPE((__CLC_ULONGN)PINFBITPATT_DP64) : r;
r = __clc_isinf(x) || __clc_isinf(y) ? __CLC_GENTYPE_INF : r;

return r;
}
Expand Down
13 changes: 6 additions & 7 deletions libclc/clc/lib/generic/math/clc_log_base.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,6 @@ __clc_log(float x)
#endif

uint xi = __clc_as_uint(x);
uint ax = xi & EXSIGNBIT_SP32;

// Calculations for |x-1| < 2^-4
float r = x - 1.0f;
Expand Down Expand Up @@ -179,9 +178,9 @@ __clc_log(float x)
z = near1 ? znear1 : z;

// Corner cases
z = ax >= PINFBITPATT_SP32 ? x : z;
z = xi != ax ? __clc_as_float(QNANBITPATT_SP32) : z;
z = ax == 0 ? __clc_as_float(NINFBITPATT_SP32) : z;
z = __clc_isinf(x) ? FLT_PINF : z;
z = (__clc_isnan(x) || x < 0.0f) ? FLT_NAN : z;
z = x == 0.0f ? FLT_NINF : z;

return z;
}
Expand Down Expand Up @@ -311,9 +310,9 @@ __clc_log(double x)

double ret = is_near ? ret_near : ret_far;

ret = __clc_isinf(x) ? __clc_as_double(PINFBITPATT_DP64) : ret;
ret = (__clc_isnan(x) | (x < 0.0)) ? __clc_as_double(QNANBITPATT_DP64) : ret;
ret = x == 0.0 ? __clc_as_double(NINFBITPATT_DP64) : ret;
ret = __clc_isinf(x) ? DBL_PINF : ret;
ret = (__clc_isnan(x) || x < 0.0) ? DBL_NAN : ret;
ret = x == 0.0 ? DBL_NINF : ret;
return ret;
}

Expand Down