Commit 90569e0

[Support] Add Arm NEON implementation for llvm::xxh3_64bits (#99634)
Compared to the generic scalar code, using Arm NEON instructions yields a ~11x speedup: 31 ms vs. 339.5 ms to hash 1 GiB of random data on the Apple M1.

This follows the upstream implementation closely, with some simplifications:

- Removed workarounds for suboptimal codegen on older GCC.
- Removed instruction reordering barriers, which seem to have a negligible impact according to my measurements.
- We do not support WebAssembly's mostly NEON-compatible API.
- There is no configurable mixing of SIMD and scalar code; according to the upstream comments, this is only relevant for smaller Cortex cores, which can dispatch relatively few NEON micro-ops per cycle.

This commit intends to use only standard ACLE intrinsics and data types, so it should build with all supported versions of GCC, Clang, and MSVC. The feature is enabled by default when targeting AArch64, but the `LLVM_XXH_USE_NEON=0` macro can be set to explicitly disable it.

XXH3 is used for ICF, string deduplication, and computing the UUID in ld64.lld; this commit yields a 1.77% +/- 0.59% reduction in the wall time of a `--threads=8` link of Chromium.framework.
1 parent: 1c798e0
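For orientation, here is a minimal caller of the API this patch accelerates. This is a sketch, not part of the commit: it assumes a program already linked against LLVMSupport, and only llvm::xxh3_64bits and its header path are taken from the tree.

#include "llvm/Support/xxhash.h"

#include <cstdio>
#include <vector>

int main() {
  std::vector<uint8_t> Data = {'h', 'e', 'l', 'l', 'o'};
  // On AArch64 this dispatches to the NEON path unless xxhash.cpp was
  // compiled with LLVM_XXH_USE_NEON=0; the resulting hash is identical
  // either way.
  uint64_t Hash = llvm::xxh3_64bits(Data);
  std::printf("%016llx\n", static_cast<unsigned long long>(Hash));
  return 0;
}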

File tree: 3 files changed (+203, -21 lines)

llvm/benchmarks/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
@@ -1,4 +1,5 @@
 set(LLVM_LINK_COMPONENTS
   Support)
 
-add_benchmark(DummyYAML DummyYAML.cpp)
+add_benchmark(DummyYAML DummyYAML.cpp PARTIAL_SOURCES_INTENDED)
+add_benchmark(xxhash xxhash.cpp PARTIAL_SOURCES_INTENDED)

llvm/benchmarks/xxhash.cpp

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
+#include "llvm/Support/xxhash.h"
+#include "benchmark/benchmark.h"
+
+#include <memory>
+
+static uint32_t xorshift(uint32_t State) {
+  State ^= State << 13;
+  State ^= State >> 17;
+  State ^= State << 5;
+  return State;
+}
+
+static void BM_xxh3_64bits(benchmark::State &State) {
+  std::unique_ptr<uint32_t[]> Data(new uint32_t[State.range(0) / 4]);
+
+  uint32_t Prev = 0xcafebabe;
+  for (int64_t I = 0; I < State.range(0) / 4; I++)
+    Data[I] = Prev = xorshift(Prev);
+
+  llvm::ArrayRef DataRef =
+      llvm::ArrayRef(reinterpret_cast<uint8_t *>(Data.get()), State.range(0));
+
+  for (auto _ : State)
+    llvm::xxh3_64bits(DataRef);
+}
+
+BENCHMARK(BM_xxh3_64bits)->Arg(32)->Arg(512)->Arg(64 * 1024)->Arg(1024 * 1024);
+
+BENCHMARK_MAIN();
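Outside of google-benchmark, the 1 GiB figure quoted in the commit message could be reproduced with a one-shot harness along these lines. This is a sketch; the constant buffer contents and the single unwarmed measurement are simplifications of what the benchmark above does.

#include "llvm/Support/xxhash.h"

#include <chrono>
#include <cstdio>
#include <vector>

int main() {
  // 1 GiB of arbitrary (here constant) bytes; the benchmark above fills the
  // buffer from a xorshift stream instead, but XXH3 throughput does not
  // depend on the data values.
  std::vector<uint8_t> Data(1ull << 30, 0xab);
  auto Start = std::chrono::steady_clock::now();
  uint64_t Hash = llvm::xxh3_64bits(Data);
  auto Stop = std::chrono::steady_clock::now();
  std::chrono::duration<double, std::milli> Elapsed = Stop - Start;
  std::printf("hash=%016llx time=%.1f ms\n",
              static_cast<unsigned long long>(Hash), Elapsed.count());
  return 0;
}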

llvm/lib/Support/xxhash.cpp

Lines changed: 172 additions & 20 deletions
@@ -47,6 +47,19 @@
 
 #include <stdlib.h>
 
+#if !defined(LLVM_XXH_USE_NEON)
+#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) &&     \
+    !defined(__ARM_BIG_ENDIAN)
+#define LLVM_XXH_USE_NEON 1
+#else
+#define LLVM_XXH_USE_NEON 0
+#endif
+#endif
+
+#if LLVM_XXH_USE_NEON
+#include <arm_neon.h>
+#endif
+
 using namespace llvm;
 using namespace support;
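Because the gate is an ordinary macro with a guarded default, a build can pin it from the command line; the invocation below is hypothetical and illustrative, not from the commit.

// Force the portable scalar path even on AArch64, e.g. when bisecting a
// suspected codegen issue:
//
//   clang++ -DLLVM_XXH_USE_NEON=0 -c llvm/lib/Support/xxhash.cpp
//
// Setting LLVM_XXH_USE_NEON=1 on a target without <arm_neon.h> will not
// compile, so the macro is an opt-out rather than a portable opt-in.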

@@ -323,6 +336,144 @@ static uint64_t XXH3_len_129to240_64b(const uint8_t *input, size_t len,
   return XXH3_avalanche(acc);
 }
 
+#if LLVM_XXH_USE_NEON
+
+#define XXH3_accumulate_512 XXH3_accumulate_512_neon
+#define XXH3_scrambleAcc XXH3_scrambleAcc_neon
+
+// NEON implementation based on commit a57f6cce2698049863af8c25787084ae0489d849
+// (July 2024), with the following removed:
+// - workaround for suboptimal codegen on older GCC
+// - compiler barriers against instruction reordering
+// - WebAssembly SIMD support
+// - configurable split between NEON and scalar lanes (benchmarking shows no
+//   penalty when fully doing SIMD on the Apple M1)
+
+#if defined(__GNUC__) || defined(__clang__)
+#define XXH_ALIASING __attribute__((__may_alias__))
+#else
+#define XXH_ALIASING /* nothing */
+#endif
+
+typedef uint64x2_t xxh_aliasing_uint64x2_t XXH_ALIASING;
+
+LLVM_ATTRIBUTE_ALWAYS_INLINE static uint64x2_t XXH_vld1q_u64(void const *ptr) {
+  return vreinterpretq_u64_u8(vld1q_u8((uint8_t const *)ptr));
+}
+
+LLVM_ATTRIBUTE_ALWAYS_INLINE
+static void XXH3_accumulate_512_neon(uint64_t *acc, const uint8_t *input,
+                                     const uint8_t *secret) {
+  xxh_aliasing_uint64x2_t *const xacc = (xxh_aliasing_uint64x2_t *)acc;
+
+#ifdef __clang__
+#pragma clang loop unroll(full)
+#endif
+  for (size_t i = 0; i < XXH_ACC_NB / 2; i += 2) {
+    /* data_vec = input[i]; */
+    uint64x2_t data_vec_1 = XXH_vld1q_u64(input + (i * 16));
+    uint64x2_t data_vec_2 = XXH_vld1q_u64(input + ((i + 1) * 16));
+
+    /* key_vec = secret[i]; */
+    uint64x2_t key_vec_1 = XXH_vld1q_u64(secret + (i * 16));
+    uint64x2_t key_vec_2 = XXH_vld1q_u64(secret + ((i + 1) * 16));
+
+    /* data_swap = swap(data_vec) */
+    uint64x2_t data_swap_1 = vextq_u64(data_vec_1, data_vec_1, 1);
+    uint64x2_t data_swap_2 = vextq_u64(data_vec_2, data_vec_2, 1);
+
+    /* data_key = data_vec ^ key_vec; */
+    uint64x2_t data_key_1 = veorq_u64(data_vec_1, key_vec_1);
+    uint64x2_t data_key_2 = veorq_u64(data_vec_2, key_vec_2);
+
+    /*
+     * If we reinterpret the 64x2 vectors as 32x4 vectors, we can use a
+     * de-interleave operation for 4 lanes in 1 step with `vuzpq_u32` to
+     * get one vector with the low 32 bits of each lane, and one vector
+     * with the high 32 bits of each lane.
+     *
+     * The intrinsic returns a double vector because the original ARMv7-a
+     * instruction modified both arguments in place. AArch64 and SIMD128 emit
+     * two instructions from this intrinsic.
+     *
+     * [ dk11L | dk11H | dk12L | dk12H ] -> [ dk11L | dk12L | dk21L | dk22L ]
+     * [ dk21L | dk21H | dk22L | dk22H ] -> [ dk11H | dk12H | dk21H | dk22H ]
+     */
+    uint32x4x2_t unzipped = vuzpq_u32(vreinterpretq_u32_u64(data_key_1),
+                                      vreinterpretq_u32_u64(data_key_2));
+
+    /* data_key_lo = data_key & 0xFFFFFFFF */
+    uint32x4_t data_key_lo = unzipped.val[0];
+    /* data_key_hi = data_key >> 32 */
+    uint32x4_t data_key_hi = unzipped.val[1];
+
+    /*
+     * Then, we can split the vectors horizontally and multiply which, as for
+     * most widening intrinsics, have a variant that works on both high half
+     * vectors for free on AArch64. A similar instruction is available on
+     * SIMD128.
+     *
+     * sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi
+     */
+    uint64x2_t sum_1 = vmlal_u32(data_swap_1, vget_low_u32(data_key_lo),
+                                 vget_low_u32(data_key_hi));
+    uint64x2_t sum_2 = vmlal_u32(data_swap_2, vget_high_u32(data_key_lo),
+                                 vget_high_u32(data_key_hi));
+
+    /* xacc[i] = acc_vec + sum; */
+    xacc[i] = vaddq_u64(xacc[i], sum_1);
+    xacc[i + 1] = vaddq_u64(xacc[i + 1], sum_2);
+  }
+}
+
+LLVM_ATTRIBUTE_ALWAYS_INLINE
+static void XXH3_scrambleAcc_neon(uint64_t *acc, const uint8_t *secret) {
+  xxh_aliasing_uint64x2_t *const xacc = (xxh_aliasing_uint64x2_t *)acc;
+
+  /* { prime32_1, prime32_1 } */
+  uint32x2_t const kPrimeLo = vdup_n_u32(PRIME32_1);
+  /* { 0, prime32_1, 0, prime32_1 } */
+  uint32x4_t const kPrimeHi =
+      vreinterpretq_u32_u64(vdupq_n_u64((uint64_t)PRIME32_1 << 32));
+
+  for (size_t i = 0; i < XXH_ACC_NB / 2; ++i) {
+    /* xacc[i] ^= (xacc[i] >> 47); */
+    uint64x2_t acc_vec = XXH_vld1q_u64(acc + (2 * i));
+    uint64x2_t shifted = vshrq_n_u64(acc_vec, 47);
+    uint64x2_t data_vec = veorq_u64(acc_vec, shifted);
+
+    /* xacc[i] ^= secret[i]; */
+    uint64x2_t key_vec = XXH_vld1q_u64(secret + (i * 16));
+    uint64x2_t data_key = veorq_u64(data_vec, key_vec);
+
+    /*
+     * xacc[i] *= XXH_PRIME32_1
+     *
+     * Expanded version with portable NEON intrinsics
+     *
+     *    lo(x) * lo(y) + (hi(x) * lo(y) << 32)
+     *
+     * prod_hi = hi(data_key) * lo(prime) << 32
+     *
+     * Since we only need 32 bits of this multiply a trick can be used,
+     * reinterpreting the vector as a uint32x4_t and multiplying by
+     * { 0, prime, 0, prime } to cancel out the unwanted bits and avoid the
+     * shift.
+     */
+    uint32x4_t prod_hi = vmulq_u32(vreinterpretq_u32_u64(data_key), kPrimeHi);
+
+    /* Extract low bits for vmlal_u32 */
+    uint32x2_t data_key_lo = vmovn_u64(data_key);
+
+    /* xacc[i] = prod_hi + lo(data_key) * XXH_PRIME32_1; */
+    xacc[i] = vmlal_u32(vreinterpretq_u64_u32(prod_hi), data_key_lo, kPrimeLo);
+  }
+}
+#else
+
+#define XXH3_accumulate_512 XXH3_accumulate_512_scalar
+#define XXH3_scrambleAcc XXH3_scrambleAcc_scalar
+
 LLVM_ATTRIBUTE_ALWAYS_INLINE
 static void XXH3_accumulate_512_scalar(uint64_t *acc, const uint8_t *input,
                                        const uint8_t *secret) {
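To check the NEON hunk above against the reference behavior: each stripe is 64 bytes (in this file XXH_STRIPE_LEN is 64 and XXH_ACC_NB is 8 lanes of uint64_t), processed two lanes per vector. A scalar model of one XXH3_accumulate_512 call, mirroring the XXH3_accumulate_512_scalar function kept in this file (read64le stands in for llvm::support::endian::read64le):

for (size_t i = 0; i < 8; ++i) {
  uint64_t data = read64le(input + 8 * i);
  uint64_t dkey = data ^ read64le(secret + 8 * i);
  // vextq_u64 swaps the two 64-bit lanes of each vector, so every lane
  // accumulates its neighbor's input word:
  acc[i ^ 1] += data;
  // vmlal_u32 is the widening 32x32->64 multiply-accumulate:
  acc[i] += (uint64_t)(uint32_t)dkey * (dkey >> 32);
}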
@@ -335,20 +486,23 @@ static void XXH3_accumulate_512_scalar(uint64_t *acc, const uint8_t *input,
 }
 
 LLVM_ATTRIBUTE_ALWAYS_INLINE
-static void XXH3_accumulate_scalar(uint64_t *acc, const uint8_t *input,
-                                   const uint8_t *secret, size_t nbStripes) {
-  for (size_t n = 0; n < nbStripes; ++n)
-    XXH3_accumulate_512_scalar(acc, input + n * XXH_STRIPE_LEN,
-                               secret + n * XXH_SECRET_CONSUME_RATE);
-}
-
-static void XXH3_scrambleAcc(uint64_t *acc, const uint8_t *secret) {
+static void XXH3_scrambleAcc_scalar(uint64_t *acc, const uint8_t *secret) {
   for (size_t i = 0; i < XXH_ACC_NB; ++i) {
     acc[i] ^= acc[i] >> 47;
     acc[i] ^= endian::read64le(secret + 8 * i);
     acc[i] *= PRIME32_1;
   }
 }
+#endif
+
+LLVM_ATTRIBUTE_ALWAYS_INLINE
+static void XXH3_accumulate(uint64_t *acc, const uint8_t *input,
+                            const uint8_t *secret, size_t nbStripes) {
+  for (size_t n = 0; n < nbStripes; ++n) {
+    XXH3_accumulate_512(acc, input + n * XXH_STRIPE_LEN,
+                        secret + n * XXH_SECRET_CONSUME_RATE);
+  }
+}
 
 static uint64_t XXH3_mix2Accs(const uint64_t *acc, const uint8_t *secret) {
   return XXH3_mul128_fold64(acc[0] ^ endian::read64le(secret),
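The kPrimeHi trick in XXH3_scrambleAcc_neon above follows from the usual split of a 64-bit by 32-bit product, truncated to 64 bits (with p = PRIME32_1):

// x * p  ==  (uint64_t)(uint32_t)x * p                   /* lo(x) * p       */
//          + ((uint64_t)(uint32_t)((x >> 32) * p) << 32) /* hi(x) * p << 32 */
//
// Only the low 32 bits of hi(x) * p survive the shift, so multiplying the
// accumulator reinterpreted as uint32x4_t by { 0, p, 0, p } (vmulq_u32)
// yields the second term directly, and vmlal_u32 adds in the first.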
@@ -375,21 +529,20 @@ static uint64_t XXH3_hashLong_64b(const uint8_t *input, size_t len,
       PRIME64_4, PRIME32_2, PRIME64_5, PRIME32_1,
   };
   for (size_t n = 0; n < nb_blocks; ++n) {
-    XXH3_accumulate_scalar(acc, input + n * block_len, secret,
-                           nbStripesPerBlock);
+    XXH3_accumulate(acc, input + n * block_len, secret, nbStripesPerBlock);
     XXH3_scrambleAcc(acc, secret + secretSize - XXH_STRIPE_LEN);
   }
 
   /* last partial block */
   const size_t nbStripes = (len - 1 - (block_len * nb_blocks)) / XXH_STRIPE_LEN;
   assert(nbStripes <= secretSize / XXH_SECRET_CONSUME_RATE);
-  XXH3_accumulate_scalar(acc, input + nb_blocks * block_len, secret, nbStripes);
+  XXH3_accumulate(acc, input + nb_blocks * block_len, secret, nbStripes);
 
   /* last stripe */
   constexpr size_t XXH_SECRET_LASTACC_START = 7;
-  XXH3_accumulate_512_scalar(acc, input + len - XXH_STRIPE_LEN,
-                             secret + secretSize - XXH_STRIPE_LEN -
-                                 XXH_SECRET_LASTACC_START);
+  XXH3_accumulate_512(acc, input + len - XXH_STRIPE_LEN,
+                      secret + secretSize - XXH_STRIPE_LEN -
+                          XXH_SECRET_LASTACC_START);
 
   /* converge into final hash */
   constexpr size_t XXH_SECRET_MERGEACCS_START = 11;
@@ -840,21 +993,20 @@ XXH3_hashLong_128b(const uint8_t *input, size_t len, const uint8_t *secret,
   };
 
   for (size_t n = 0; n < nb_blocks; ++n) {
-    XXH3_accumulate_scalar(acc, input + n * block_len, secret,
-                           nbStripesPerBlock);
+    XXH3_accumulate(acc, input + n * block_len, secret, nbStripesPerBlock);
     XXH3_scrambleAcc(acc, secret + secretSize - XXH_STRIPE_LEN);
   }
 
   /* last partial block */
   const size_t nbStripes = (len - 1 - (block_len * nb_blocks)) / XXH_STRIPE_LEN;
   assert(nbStripes <= secretSize / XXH_SECRET_CONSUME_RATE);
-  XXH3_accumulate_scalar(acc, input + nb_blocks * block_len, secret, nbStripes);
+  XXH3_accumulate(acc, input + nb_blocks * block_len, secret, nbStripes);
 
   /* last stripe */
   constexpr size_t XXH_SECRET_LASTACC_START = 7;
-  XXH3_accumulate_512_scalar(acc, input + len - XXH_STRIPE_LEN,
-                             secret + secretSize - XXH_STRIPE_LEN -
-                                 XXH_SECRET_LASTACC_START);
+  XXH3_accumulate_512(acc, input + len - XXH_STRIPE_LEN,
+                      secret + secretSize - XXH_STRIPE_LEN -
+                          XXH_SECRET_LASTACC_START);
 
   /* converge into final hash */
   static_assert(sizeof(acc) == 64);
