Skip to content

Commit 7f3b8ec

Browse files
committed
[libclc] Move fma to the CLC library
This builtin is a little more involved than others as targets deal with fma in various different ways. Fundamentally, the CLC __clc_fma builtin compiles to __builtin_elementwise_fma, which compiles to the @llvm.fma intrinsic. However, in the case of fp32 fma some targets call the __clc_sw_fma function, which provides a software implementation of the builtin. This in principle is controlled by the __CLC_HAVE_HW_FMA32 macro and may be a runtime decision, depending on how the target defines that macro. All targets build the CLC fma functions for all types. This is so that the CLC library can have a reliable internal implementation for its own purposes. For AMD/NVPTX targets there are no meaningful changes to the generated LLVM bitcode. Some blocks of code have moved around, which confounds llvm-diff. For the clspv and SPIR-V/Mesa targets, only fp32 fma is of interest. Its use in libclc is tightly controlled by checking __CLC_HAVE_HW_FMA32 first. This can either be a compile-time constant (1, for clspv) or a runtime function for SPIR-V/Mesa. The SPIR-V/Mesa target only provided fp32 fma in the OpenCL layer. It unconditionally mapped that to the __clc_sw_fma builtin, even though the generic version in theory had a runtime toggle through __CLC_HAVE_HW_FMA32 specifically for that target. Callers of fma, though, would end up using the ExtInst fma, *not* calling the _Z3fmafff function provided by libclc. This commit keeps this system in place in the OpenCL layer, by mapping fma to __clc_sw_fma. Where other builtins would previously call fma (i.e., result in the ExtInst), they now call __clc_fma. This function checks the __CLC_HAVE_HW_FMA32 runtime toggle, which selects between the slow version or the quick version. The quick version is the LLVM fma intrinsic which llvm-spirv translates to the ExtInst. The clspv target had its own software implementation of fp32 fma, which it called unconditionally - even though __CLC_HAVE_HW_FMA32 is 1 for that target. 
This is potentially just so its library ships a software version which it can fall back on. In the OpenCL layer, the target doesn't provide fp64 fma, and maps fp16 fma to fp32 mad. This commit keeps this system roughly in place: in the OpenCL layer it maps fp32 fma to __clc_sw_fma, and fp16 fma to mad. Where builtins would previously call into fma, they now call __clc_fma, which compiles to the LLVM intrinsic. If this goes through a translation to SPIR-V it will become the fma ExtInst, or the intrinsic could be replaced by the _Z3fmafff software implementation. The clspv and SPIR-V/Mesa targets could potentially be cleaned up later, depending on their needs.
1 parent d4144ca commit 7f3b8ec

File tree

25 files changed

+486
-443
lines changed

25 files changed

+486
-443
lines changed
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
#define MAXFLOAT 0x1.fffffep127f
2+
#define HUGE_VALF __builtin_huge_valf()
3+
#define INFINITY __builtin_inff()
4+
#define NAN __builtin_nanf("")
5+
6+
#define FLT_DIG 6
7+
#define FLT_MANT_DIG 24
8+
#define FLT_MAX_10_EXP +38
9+
#define FLT_MAX_EXP +128
10+
#define FLT_MIN_10_EXP -37
11+
#define FLT_MIN_EXP -125
12+
#define FLT_RADIX 2
13+
#define FLT_MAX MAXFLOAT
14+
#define FLT_MIN 0x1.0p-126f
15+
#define FLT_EPSILON 0x1.0p-23f
16+
17+
#define FP_ILOGB0 (-2147483647 - 1)
18+
#define FP_ILOGBNAN 2147483647
19+
20+
#define M_E_F 0x1.5bf0a8p+1f
21+
#define M_LOG2E_F 0x1.715476p+0f
22+
#define M_LOG10E_F 0x1.bcb7b2p-2f
23+
#define M_LN2_F 0x1.62e430p-1f
24+
#define M_LN10_F 0x1.26bb1cp+1f
25+
#define M_PI_F 0x1.921fb6p+1f
26+
#define M_PI_2_F 0x1.921fb6p+0f
27+
#define M_PI_4_F 0x1.921fb6p-1f
28+
#define M_1_PI_F 0x1.45f306p-2f
29+
#define M_2_PI_F 0x1.45f306p-1f
30+
#define M_2_SQRTPI_F 0x1.20dd76p+0f
31+
#define M_SQRT2_F 0x1.6a09e6p+0f
32+
#define M_SQRT1_2_F 0x1.6a09e6p-1f
33+
34+
#ifdef __CLC_INTERNAL
35+
#define M_LOG210_F 0x1.a934f0p+1f
36+
#endif
37+
38+
#ifdef cl_khr_fp64
39+
40+
#define HUGE_VAL __builtin_huge_val()
41+
42+
#define DBL_DIG 15
43+
#define DBL_MANT_DIG 53
44+
#define DBL_MAX_10_EXP +308
45+
#define DBL_MAX_EXP +1024
46+
#define DBL_MIN_10_EXP -307
47+
#define DBL_MIN_EXP -1021
48+
#define DBL_MAX 0x1.fffffffffffffp1023
49+
#define DBL_MIN 0x1.0p-1022
50+
#define DBL_EPSILON 0x1.0p-52
51+
52+
#define M_E 0x1.5bf0a8b145769p+1
53+
#define M_LOG2E 0x1.71547652b82fep+0
54+
#define M_LOG10E 0x1.bcb7b1526e50ep-2
55+
#define M_LN2 0x1.62e42fefa39efp-1
56+
#define M_LN10 0x1.26bb1bbb55516p+1
57+
#define M_PI 0x1.921fb54442d18p+1
58+
#define M_PI_2 0x1.921fb54442d18p+0
59+
#define M_PI_4 0x1.921fb54442d18p-1
60+
#define M_1_PI 0x1.45f306dc9c883p-2
61+
#define M_2_PI 0x1.45f306dc9c883p-1
62+
#define M_2_SQRTPI 0x1.20dd750429b6dp+0
63+
#define M_SQRT2 0x1.6a09e667f3bcdp+0
64+
#define M_SQRT1_2 0x1.6a09e667f3bcdp-1
65+
66+
#ifdef __CLC_INTERNAL
67+
#define M_LOG210 0x1.a934f0979a371p+1
68+
#endif
69+
70+
#endif
71+
72+
#ifdef cl_khr_fp16
73+
74+
#if __OPENCL_VERSION__ >= 120
75+
76+
#define HALF_DIG 3
77+
#define HALF_MANT_DIG 11
78+
#define HALF_MAX_10_EXP +4
79+
#define HALF_MAX_EXP +16
80+
#define HALF_MIN_10_EXP -4
81+
#define HALF_MIN_EXP -13
82+
83+
#define HALF_RADIX 2
84+
#define HALF_MAX 0x1.ffcp15h
85+
#define HALF_MIN 0x1.0p-14h
86+
#define HALF_EPSILON 0x1.0p-10h
87+
88+
#endif
89+
90+
#endif
Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,7 @@
11
#ifndef __CLC_INTEGER_CLC_ABS_H__
22
#define __CLC_INTEGER_CLC_ABS_H__
33

4-
#if defined(CLC_CLSPV) || defined(CLC_SPIRV)
5-
// clspv and spir-v targets provide their own OpenCL-compatible abs
6-
#define __clc_abs abs
7-
#else
8-
94
#define __CLC_BODY <clc/integer/clc_abs.inc>
105
#include <clc/integer/gentype.inc>
116

12-
#endif
13-
147
#endif // __CLC_INTEGER_CLC_ABS_H__
Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
1-
#define __CLC_FUNCTION __clc_fma
2-
#define __CLC_INTRINSIC "llvm.fma"
3-
#include "math/ternary_intrin.inc"
1+
#ifndef __CLC_INTERNAL_MATH_CLC_SW_FMA_H__
2+
#define __CLC_INTERNAL_MATH_CLC_SW_FMA_H__
43

5-
#define __FLOAT_ONLY
64
#define __CLC_FUNCTION __clc_sw_fma
75
#define __CLC_BODY <clc/shared/ternary_decl.inc>
6+
87
#include <clc/math/gentype.inc>
8+
99
#undef __CLC_BODY
1010
#undef __CLC_FUNCTION
11-
#undef __FLOAT_ONLY
11+
12+
#endif // __CLC_INTERNAL_MATH_CLC_SW_FMA_H__

libclc/clc/include/clc/math/clc_fma.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
#ifndef __CLC_MATH_CLC_FMA_H__
2+
#define __CLC_MATH_CLC_FMA_H__
3+
4+
#define __CLC_FUNCTION __clc_fma
5+
#define __CLC_BODY <clc/shared/ternary_decl.inc>
6+
7+
#include <clc/math/gentype.inc>
8+
9+
#undef __CLC_BODY
10+
#undef __CLC_FUNCTION
11+
12+
#endif // __CLC_MATH_CLC_FMA_H__

libclc/clc/include/clc/math/math.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,12 +39,12 @@
3939
#define PINF 0x200
4040

4141
#if (defined __AMDGCN__ || defined __R600__) && !defined __HAS_FMAF__
42-
#define HAVE_HW_FMA32() (0)
42+
#define __CLC_HAVE_HW_FMA32() (0)
4343
#elif defined(CLC_SPIRV)
4444
bool __attribute__((noinline)) __clc_runtime_has_hw_fma32(void);
45-
#define HAVE_HW_FMA32() __clc_runtime_has_hw_fma32()
45+
#define __CLC_HAVE_HW_FMA32() __clc_runtime_has_hw_fma32()
4646
#else
47-
#define HAVE_HW_FMA32() (1)
47+
#define __CLC_HAVE_HW_FMA32() (1)
4848
#endif
4949

5050
#define HAVE_BITALIGN() (0)

libclc/clc/include/clc/relational/clc_isinf.h

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,6 @@
11
#ifndef __CLC_RELATIONAL_CLC_ISINF_H__
22
#define __CLC_RELATIONAL_CLC_ISINF_H__
33

4-
#if defined(CLC_CLSPV) || defined(CLC_SPIRV)
5-
// clspv and spir-v targets provide their own OpenCL-compatible isinf
6-
#define __clc_isinf isinf
7-
#else
8-
94
#include <clc/clcfunc.h>
105
#include <clc/clctypes.h>
116

@@ -37,6 +32,4 @@ _CLC_VECTOR_ISINF_DECL(short, half)
3732
#undef _CLC_ISINF_DECL
3833
#undef _CLC_VECTOR_ISINF_DECL
3934

40-
#endif
41-
4235
#endif // __CLC_RELATIONAL_CLC_ISINF_H__
Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,10 @@
11
#ifndef __CLC_SHARED_CLC_MAX_H__
22
#define __CLC_SHARED_CLC_MAX_H__
33

4-
#if defined(CLC_CLSPV) || defined(CLC_SPIRV)
5-
// clspv and spir-v targets provide their own OpenCL-compatible max
6-
#define __clc_max max
7-
#else
8-
94
#define __CLC_BODY <clc/shared/clc_max.inc>
105
#include <clc/integer/gentype.inc>
116

127
#define __CLC_BODY <clc/shared/clc_max.inc>
138
#include <clc/math/gentype.inc>
149

15-
#endif
16-
1710
#endif // __CLC_SHARED_CLC_MAX_H__
Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,10 @@
11
#ifndef __CLC_SHARED_CLC_MIN_H__
22
#define __CLC_SHARED_CLC_MIN_H__
33

4-
#if defined(CLC_CLSPV) || defined(CLC_SPIRV)
5-
// clspv and spir-v targets provide their own OpenCL-compatible min
6-
#define __clc_min min
7-
#else
8-
94
#define __CLC_BODY <clc/shared/clc_min.inc>
105
#include <clc/integer/gentype.inc>
116

127
#define __CLC_BODY <clc/shared/clc_min.inc>
138
#include <clc/math/gentype.inc>
149

15-
#endif
16-
1710
#endif // __CLC_SHARED_CLC_MIN_H__

libclc/clc/lib/clspv/SOURCES

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
../generic/integer/clc_abs.cl
12
../generic/integer/clc_add_sat.cl
23
../generic/integer/clc_clz.cl
34
../generic/integer/clc_hadd.cl
@@ -14,10 +15,15 @@
1415
../generic/math/clc_copysign.cl
1516
../generic/math/clc_fabs.cl
1617
../generic/math/clc_floor.cl
18+
../generic/math/clc_fma.cl
1719
../generic/math/clc_mad.cl
1820
../generic/math/clc_nextafter.cl
1921
../generic/math/clc_rint.cl
2022
../generic/math/clc_trunc.cl
23+
../generic/relational/clc_isinf.cl
2124
../generic/relational/clc_isnan.cl
2225
../generic/relational/clc_select.cl
2326
../generic/shared/clc_clamp.cl
27+
../generic/shared/clc_max.cl
28+
../generic/shared/clc_min.cl
29+
math/clc_sw_fma.cl

0 commit comments

Comments
 (0)