Skip to content

Commit 9bd48ab

Browse files
committed
Break: New dispatch schema for Arm
This commit add new capability levels for Arm allowing us to differentiate f16, bf16. and i8-supporting generations of CPUs, becoming increasingly popular in the datacenter. This breaks compilation of Rust and Python bindings due to the "target specific options mismatch".
1 parent d7abdef commit 9bd48ab

File tree

9 files changed

+304
-91
lines changed

9 files changed

+304
-91
lines changed

c/lib.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,13 @@ SIMSIMD_METRIC_DECLARATION(js, f32, f32)
110110
SIMSIMD_METRIC_DECLARATION(js, f64, f64)
111111

112112
SIMSIMD_DYNAMIC int simsimd_uses_neon(void) { return (simsimd_capabilities() & simsimd_cap_neon_k) != 0; }
113+
SIMSIMD_DYNAMIC int simsimd_uses_neon_f16(void) { return (simsimd_capabilities() & simsimd_cap_neon_f16_k) != 0; }
114+
SIMSIMD_DYNAMIC int simsimd_uses_neon_bf16(void) { return (simsimd_capabilities() & simsimd_cap_neon_bf16_k) != 0; }
115+
SIMSIMD_DYNAMIC int simsimd_uses_neon_i8(void) { return (simsimd_capabilities() & simsimd_cap_neon_i8_k) != 0; }
113116
SIMSIMD_DYNAMIC int simsimd_uses_sve(void) { return (simsimd_capabilities() & simsimd_cap_sve_k) != 0; }
117+
SIMSIMD_DYNAMIC int simsimd_uses_sve_f16(void) { return (simsimd_capabilities() & simsimd_cap_sve_f16_k) != 0; }
118+
SIMSIMD_DYNAMIC int simsimd_uses_sve_bf16(void) { return (simsimd_capabilities() & simsimd_cap_sve_bf16_k) != 0; }
119+
SIMSIMD_DYNAMIC int simsimd_uses_sve_i8(void) { return (simsimd_capabilities() & simsimd_cap_sve_i8_k) != 0; }
114120
SIMSIMD_DYNAMIC int simsimd_uses_haswell(void) { return (simsimd_capabilities() & simsimd_cap_haswell_k) != 0; }
115121
SIMSIMD_DYNAMIC int simsimd_uses_skylake(void) { return (simsimd_capabilities() & simsimd_cap_skylake_k) != 0; }
116122
SIMSIMD_DYNAMIC int simsimd_uses_ice(void) { return (simsimd_capabilities() & simsimd_cap_ice_k) != 0; }

cpp/bench.cxx

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,7 +242,13 @@ int main(int argc, char** argv) {
242242
std::printf("\n");
243243
std::printf("Run-time settings:\n");
244244
std::printf("- Arm NEON support enabled: %s\n", flags[(runtime_caps & simsimd_cap_neon_k) != 0]);
245+
std::printf("- Arm NEON F16 support enabled: %s\n", flags[(runtime_caps & simsimd_cap_neon_f16_k) != 0]);
246+
std::printf("- Arm NEON BF16 support enabled: %s\n", flags[(runtime_caps & simsimd_cap_neon_bf16_k) != 0]);
247+
std::printf("- Arm NEON I8 support enabled: %s\n", flags[(runtime_caps & simsimd_cap_neon_i8_k) != 0]);
245248
std::printf("- Arm SVE support enabled: %s\n", flags[(runtime_caps & simsimd_cap_sve_k) != 0]);
249+
std::printf("- Arm SVE F16 support enabled: %s\n", flags[(runtime_caps & simsimd_cap_sve_f16_k) != 0]);
250+
std::printf("- Arm SVE BF16 support enabled: %s\n", flags[(runtime_caps & simsimd_cap_sve_bf16_k) != 0]);
251+
std::printf("- Arm SVE I8 support enabled: %s\n", flags[(runtime_caps & simsimd_cap_sve_i8_k) != 0]);
246252
std::printf("- x86 Haswell support enabled: %s\n", flags[(runtime_caps & simsimd_cap_haswell_k) != 0]);
247253
std::printf("- x86 Skylake support enabled: %s\n", flags[(runtime_caps & simsimd_cap_skylake_k) != 0]);
248254
std::printf("- x86 Ice Lake support enabled: %s\n", flags[(runtime_caps & simsimd_cap_ice_k) != 0]);

cpp/test.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,13 @@ void print_capabilities(void) {
3636
printf("\n");
3737
printf("Run-time settings:\n");
3838
printf("- Arm NEON support enabled: %s\n", flags[(runtime_caps & simsimd_cap_neon_k) != 0]);
39+
printf("- Arm NEON F16 support enabled: %s\n", flags[(runtime_caps & simsimd_cap_neon_f16_k) != 0]);
40+
printf("- Arm NEON BF16 support enabled: %s\n", flags[(runtime_caps & simsimd_cap_neon_bf16_k) != 0]);
41+
printf("- Arm NEON I8 support enabled: %s\n", flags[(runtime_caps & simsimd_cap_neon_i8_k) != 0]);
3942
printf("- Arm SVE support enabled: %s\n", flags[(runtime_caps & simsimd_cap_sve_k) != 0]);
43+
printf("- Arm SVE F16 support enabled: %s\n", flags[(runtime_caps & simsimd_cap_sve_f16_k) != 0]);
44+
printf("- Arm SVE BF16 support enabled: %s\n", flags[(runtime_caps & simsimd_cap_sve_bf16_k) != 0]);
45+
printf("- Arm SVE I8 support enabled: %s\n", flags[(runtime_caps & simsimd_cap_sve_i8_k) != 0]);
4046
printf("- x86 Haswell support enabled: %s\n", flags[(runtime_caps & simsimd_cap_haswell_k) != 0]);
4147
printf("- x86 Skylake support enabled: %s\n", flags[(runtime_caps & simsimd_cap_skylake_k) != 0]);
4248
printf("- x86 Ice Lake support enabled: %s\n", flags[(runtime_caps & simsimd_cap_ice_k) != 0]);

include/simsimd/simsimd.h

Lines changed: 215 additions & 60 deletions
Large diffs are not rendered by default.

include/simsimd/spatial.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -300,6 +300,7 @@ SIMSIMD_PUBLIC void simsimd_cos_f16_neon(simsimd_f16_t const* a, simsimd_f16_t c
300300

301301
SIMSIMD_PUBLIC void simsimd_cos_bf16_neon(simsimd_bf16_t const* a, simsimd_bf16_t const* b, simsimd_size_t n,
302302
simsimd_distance_t* result) {
303+
// TODO: Redo with BFMMLA - vbfmmlaq_f32
303304
float32x4_t ab_high_vec = vdupq_n_f32(0), ab_low_vec = vdupq_n_f32(0);
304305
float32x4_t a2_high_vec = vdupq_n_f32(0), a2_low_vec = vdupq_n_f32(0);
305306
float32x4_t b2_high_vec = vdupq_n_f32(0), b2_low_vec = vdupq_n_f32(0);
@@ -440,6 +441,7 @@ SIMSIMD_PUBLIC void simsimd_cos_i8_neon(simsimd_i8_t const* a, simsimd_i8_t cons
440441
// b2_vec = vaddq_s32(b2_vec, vaddq_s32(vmovl_s16(vget_high_s16(b2_part_vec)), //
441442
// vmovl_s16(vget_low_s16(b2_part_vec))));
442443
// }
444+
// TODO: Redo with MMLA: vmmlaq_s32
443445
for (; i + 16 <= n; i += 16) {
444446
int8x16_t a_vec = vld1q_s8(a + i);
445447
int8x16_t b_vec = vld1q_s8(b + i);

include/simsimd/types.h

Lines changed: 9 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -87,28 +87,6 @@
8787

8888
// Compiling for x86: SIMSIMD_TARGET_SKYLAKE, SIMSIMD_TARGET_ICE, SIMSIMD_TARGET_SAPPHIRE
8989
//
90-
// It's important to provide fine-grained controls over AVX512 families, as they are very fragmented:
91-
// - Intel Skylake servers: F, CD, VL, DQ, BW
92-
// - Intel Cascade Lake workstations: F, CD, VL, DQ, BW, VNNI
93-
// > In other words, it extends Skylake with VNNI support
94-
// - Intel Sunny Cove (Ice Lake) servers:
95-
// F, CD, VL, DQ, BW, VNNI, VPOPCNTDQ, IFMA, VBMI, VAES, GFNI, VBMI2, BITALG, VPCLMULQDQ
96-
// - AMD Zen4 (Genoa):
97-
// F, CD, VL, DQ, BW, VNNI, VPOPCNTDQ, IFMA, VBMI, VAES, GFNI, VBMI2, BITALG, VPCLMULQDQ, BF16
98-
// > In other words, it extends Sunny Cove with BF16 support
99-
// - Golden Cove (Sapphire Rapids): extends Zen4 and Sunny Cove with FP16 support
100-
//
101-
// Intel Palm Cove was an irrelevant intermediate release extending Skylake with IFMA and VBMI.
102-
// Intel Willow Cove was an irrelevant intermediate release extending Sunny Cove with VP2INTERSECT,
103-
// that aren't supported by any other CPU built to date... and those are only available in Tiger Lake laptops.
104-
// Intel Cooper Lake was the only intermediary platform, that supported BF16, but not FP16.
105-
// It's mostly used in 4-socket and 8-socket high-memory configurations.
106-
//
107-
// In practical terms, it makes sense to differentiate only 3 AVX512 generations:
108-
// 1. Skylake (pre 2019): supports single-precision dot-products.
109-
// 2. Ice Lake (2019-2021): advanced integer algorithms.
110-
// 3. Sapphire Rapids (2023+): advanced mixed-precision float processing.
111-
//
11290
// To list all available macros for x86, take a recent compiler, like GCC 12 and run:
11391
// gcc-12 -march=sapphirerapids -dM -E - < /dev/null | egrep "SSE|AVX" | sort
11492
// On Arm machines you may want to check for other flags:
@@ -199,14 +177,13 @@ typedef unsigned long long simsimd_u64_t;
199177
typedef simsimd_u64_t simsimd_size_t;
200178
typedef simsimd_f64_t simsimd_distance_t;
201179

202-
#if !defined(SIMSIMD_NATIVE_F16) || SIMSIMD_NATIVE_F16
203-
/**
204-
* @brief Half-precision floating-point type.
180+
/* @brief Half-precision floating-point type.
205181
*
206-
* - GCC or Clang on 64-bit ARM: `__fp16`, may require `-mfp16-format` option.
182+
* - GCC or Clang on 64-bit Arm: `__fp16`, may require `-mfp16-format` option.
207183
* - GCC or Clang on 64-bit x86: `_Float16`.
208184
* - Default: `unsigned short`.
209185
*/
186+
#if !defined(SIMSIMD_NATIVE_F16) || SIMSIMD_NATIVE_F16
210187
#if (defined(__GNUC__) || defined(__clang__)) && (defined(__ARM_ARCH) || defined(__aarch64__)) && \
211188
(defined(__ARM_FP16_FORMAT_IEEE))
212189
#if !defined(SIMSIMD_NATIVE_F16)
@@ -228,6 +205,12 @@ typedef _Float16 simsimd_f16_t;
228205
typedef unsigned short simsimd_f16_t;
229206
#endif
230207

208+
/* @brief Half-precision "brain" floating-point type.
209+
*
210+
* - GCC or Clang on 64-bit Arm: `__bf16`, may require `-mbf16-format` option.
211+
* - GCC or Clang on 64-bit x86: `bfloat16_t`.
212+
* - Default: `unsigned short`.
213+
*/
231214
#if !defined(SIMSIMD_NATIVE_BF16)
232215
#define SIMSIMD_NATIVE_BF16 1
233216
#endif

python/lib.c

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -181,10 +181,20 @@ static PyObject* api_enable_capability(PyObject* self, PyObject* args) {
181181

182182
if (same_string(cap_name, "neon")) {
183183
static_capabilities |= simsimd_cap_neon_k;
184+
} else if (same_string(cap_name, "neon_f16")) {
185+
static_capabilities |= simsimd_cap_neon_f16_k;
186+
} else if (same_string(cap_name, "neon_bf16")) {
187+
static_capabilities |= simsimd_cap_neon_bf16_k;
188+
} else if (same_string(cap_name, "neon_i8")) {
189+
static_capabilities |= simsimd_cap_neon_i8_k;
184190
} else if (same_string(cap_name, "sve")) {
185191
static_capabilities |= simsimd_cap_sve_k;
186-
} else if (same_string(cap_name, "sve2")) {
187-
static_capabilities |= simsimd_cap_sve2_k;
192+
} else if (same_string(cap_name, "sve_f16")) {
193+
static_capabilities |= simsimd_cap_sve_f16_k;
194+
} else if (same_string(cap_name, "sve_bf16")) {
195+
static_capabilities |= simsimd_cap_sve_bf16_k;
196+
} else if (same_string(cap_name, "sve_i8")) {
197+
static_capabilities |= simsimd_cap_sve_i8_k;
188198
} else if (same_string(cap_name, "haswell")) {
189199
static_capabilities |= simsimd_cap_haswell_k;
190200
} else if (same_string(cap_name, "skylake")) {
@@ -214,10 +224,20 @@ static PyObject* api_disable_capability(PyObject* self, PyObject* args) {
214224

215225
if (same_string(cap_name, "neon")) {
216226
static_capabilities &= ~simsimd_cap_neon_k;
227+
} else if (same_string(cap_name, "neon_f16")) {
228+
static_capabilities &= ~simsimd_cap_neon_f16_k;
229+
} else if (same_string(cap_name, "neon_bf16")) {
230+
static_capabilities &= ~simsimd_cap_neon_bf16_k;
231+
} else if (same_string(cap_name, "neon_i8")) {
232+
static_capabilities &= ~simsimd_cap_neon_i8_k;
217233
} else if (same_string(cap_name, "sve")) {
218234
static_capabilities &= ~simsimd_cap_sve_k;
219-
} else if (same_string(cap_name, "sve2")) {
220-
static_capabilities &= ~simsimd_cap_sve2_k;
235+
} else if (same_string(cap_name, "sve_f16")) {
236+
static_capabilities &= ~simsimd_cap_sve_f16_k;
237+
} else if (same_string(cap_name, "sve_bf16")) {
238+
static_capabilities &= ~simsimd_cap_sve_bf16_k;
239+
} else if (same_string(cap_name, "sve_i8")) {
240+
static_capabilities &= ~simsimd_cap_sve_i8_k;
221241
} else if (same_string(cap_name, "haswell")) {
222242
static_capabilities &= ~simsimd_cap_haswell_k;
223243
} else if (same_string(cap_name, "skylake")) {

python/test.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,8 +74,13 @@ def test_capabilities_list():
7474
"""Tests the visibility of hardware capabilities."""
7575
assert "serial" in simd.get_capabilities()
7676
assert "neon" in simd.get_capabilities()
77+
assert "neon_f16" in simd.get_capabilities()
78+
assert "neon_bf16" in simd.get_capabilities()
79+
assert "neon_i8" in simd.get_capabilities()
7780
assert "sve" in simd.get_capabilities()
78-
assert "sve2" in simd.get_capabilities()
81+
assert "sve_f16" in simd.get_capabilities()
82+
assert "sve_bf16" in simd.get_capabilities()
83+
assert "sve_i8" in simd.get_capabilities()
7984
assert "haswell" in simd.get_capabilities()
8085
assert "ice" in simd.get_capabilities()
8186
assert "skylake" in simd.get_capabilities()

rust/lib.rs

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,13 @@ extern "C" {
8787
fn simsimd_kl_f64(a: *const f64, b: *const f64, c: usize, d: *mut Distance);
8888

8989
fn simsimd_uses_neon() -> i32;
90+
fn simsimd_uses_neon_f16() -> i32;
91+
fn simsimd_uses_neon_bf16() -> i32;
92+
fn simsimd_uses_neon_i8() -> i32;
9093
fn simsimd_uses_sve() -> i32;
94+
fn simsimd_uses_sve_f16() -> i32;
95+
fn simsimd_uses_sve_bf16() -> i32;
96+
fn simsimd_uses_sve_i8() -> i32;
9197
fn simsimd_uses_haswell() -> i32;
9298
fn simsimd_uses_skylake() -> i32;
9399
fn simsimd_uses_ice() -> i32;
@@ -109,10 +115,34 @@ pub mod capabilties {
109115
unsafe { crate::simsimd_uses_neon() != 0 }
110116
}
111117

118+
pub fn uses_neon_f16() -> bool {
119+
unsafe { crate::simsimd_uses_neon_f16() != 0 }
120+
}
121+
122+
pub fn uses_neon_bf16() -> bool {
123+
unsafe { crate::simsimd_uses_neon_bf16() != 0 }
124+
}
125+
126+
pub fn uses_neon_i8() -> bool {
127+
unsafe { crate::simsimd_uses_neon_i8() != 0 }
128+
}
129+
112130
pub fn uses_sve() -> bool {
113131
unsafe { crate::simsimd_uses_sve() != 0 }
114132
}
115133

134+
pub fn uses_sve_f16() -> bool {
135+
unsafe { crate::simsimd_uses_sve_f16() != 0 }
136+
}
137+
138+
pub fn uses_sve_bf16() -> bool {
139+
unsafe { crate::simsimd_uses_sve_bf16() != 0 }
140+
}
141+
142+
pub fn uses_sve_i8() -> bool {
143+
unsafe { crate::simsimd_uses_sve_i8() != 0 }
144+
}
145+
116146
pub fn uses_haswell() -> bool {
117147
unsafe { crate::simsimd_uses_haswell() != 0 }
118148
}

0 commit comments

Comments
 (0)