[Support] Add Arm NEON implementation for llvm::xxh3_64bits #99634

Merged
merged 3 commits on Jul 22, 2024
Changes from 2 commits
3 changes: 2 additions & 1 deletion llvm/benchmarks/CMakeLists.txt
@@ -1,4 +1,5 @@
set(LLVM_LINK_COMPONENTS
Support)

add_benchmark(DummyYAML DummyYAML.cpp)
add_benchmark(DummyYAML DummyYAML.cpp PARTIAL_SOURCES_INTENDED)
add_benchmark(xxhash xxhash.cpp PARTIAL_SOURCES_INTENDED)
Contributor

@BertalanD @kirillbobyrev this adds a ninja target named xxhash which just builds benchmarks/xxhash. Is that necessary or even useful? It caused a clash for us downstream when LLVM is included as a subproject of a larger project, because other code in that project also used xxhash as a ninja target name.

BertalanD (Member Author) Jul 23, 2024

Sorry, I didn't expect it to create a top-level Ninja target. I would be inclined to rename it to benchmarkLLVMxxhash or the like to reduce the chance of name collisions. But there's probably some CMake trick for this that I do not know of. Any ideas?

31 changes: 31 additions & 0 deletions llvm/benchmarks/xxhash.cpp
@@ -0,0 +1,31 @@
#include "llvm/Support/xxhash.h"
#include "benchmark/benchmark.h"

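// xorshift32: a cheap deterministic PRNG used below to fill the benchmark
// input with reproducible pseudo-random data.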
static uint32_t xorshift(uint32_t State) {
State ^= State << 13;
State ^= State >> 17;
State ^= State << 5;
return State;
}

static void BM_xxh3_64bits(benchmark::State &State) {
uint32_t *Data = new uint32_t[State.range(0) / 4];

uint32_t Prev = 0xcafebabe;
for (int64_t I = 0; I < State.range(0) / 4; I++) {
Data[I] = Prev = xorshift(Prev);
}

llvm::ArrayRef DataRef =
llvm::ArrayRef(reinterpret_cast<uint8_t *>(Data), State.range(0));

for (auto _ : State) {
llvm::xxh3_64bits(DataRef);
}

delete[] Data;
}

BENCHMARK(BM_xxh3_64bits)->Arg(32)->Arg(512)->Arg(64 * 1024)->Arg(1024 * 1024);

BENCHMARK_MAIN();
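
For context, the benchmark above times llvm::xxh3_64bits, the public entry point declared in llvm/Support/xxhash.h. A minimal, self-contained usage sketch of that API (the buffer size and fill value are arbitrary; this is not part of the patch):

#include "llvm/Support/xxhash.h"

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // Any contiguous byte buffer works; 64 KiB is one of the benchmarked sizes.
  std::vector<uint8_t> Buf(64 * 1024, 0xAB);
  // std::vector<uint8_t> converts implicitly to llvm::ArrayRef<uint8_t>.
  uint64_t Hash = llvm::xxh3_64bits(Buf);
  std::printf("xxh3_64bits = %016llx\n", static_cast<unsigned long long>(Hash));
  return 0;
}

In the benchmark itself the return value is ignored; only the cost of the call is measured, at input sizes of 32 B, 512 B, 64 KiB, and 1 MiB.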
192 changes: 172 additions & 20 deletions llvm/lib/Support/xxhash.cpp
@@ -47,6 +47,19 @@

#include <stdlib.h>

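/* NEON is enabled only for little-endian AArch64 targets; pre-defining
   LLVM_XXH_USE_NEON (to 0 or 1) overrides the automatic detection below. */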
#if !defined(LLVM_XXH_USE_NEON)
#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) && \
!defined(__ARM_BIG_ENDIAN)
#define LLVM_XXH_USE_NEON 1
#else
#define LLVM_XXH_USE_NEON 0
#endif
#endif

#if LLVM_XXH_USE_NEON
#include <arm_neon.h>
#endif

using namespace llvm;
using namespace support;

@@ -323,6 +336,144 @@ static uint64_t XXH3_len_129to240_64b(const uint8_t *input, size_t len,
return XXH3_avalanche(acc);
}

#if LLVM_XXH_USE_NEON

#define XXH3_accumulate_512 XXH3_accumulate_512_neon
#define XXH3_scrambleAcc XXH3_scrambleAcc_neon

// NEON implementation based on commit a57f6cce2698049863af8c25787084ae0489d849
// (July 2024), with the following removed:
// - workaround for suboptimal codegen on older GCC
// - compiler barriers against instruction reordering
// - WebAssembly SIMD support
// - configurable split between NEON and scalar lanes (benchmarking shows no
// penalty when fully doing SIMD on the Apple M1)

#if defined(__GNUC__) || defined(__clang__)
#define XXH_ALIASING __attribute__((__may_alias__))
#else
#define XXH_ALIASING /* nothing */
#endif

typedef uint64x2_t xxh_aliasing_uint64x2_t XXH_ALIASING;

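/* Unaligned 16-byte load: vld1q_u8 carries no alignment requirement, and the
 * bytes are then reinterpreted as two u64 lanes (safe here because the NEON
 * path is restricted to little-endian targets above). */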
LLVM_ATTRIBUTE_ALWAYS_INLINE static uint64x2_t XXH_vld1q_u64(void const *ptr) {
return vreinterpretq_u64_u8(vld1q_u8((uint8_t const *)ptr));
}

LLVM_ATTRIBUTE_ALWAYS_INLINE
static void XXH3_accumulate_512_neon(uint64_t *acc, const uint8_t *input,
const uint8_t *secret) {
xxh_aliasing_uint64x2_t *const xacc = (xxh_aliasing_uint64x2_t *)acc;

#ifdef __clang__
#pragma clang loop unroll(full)
#endif
for (size_t i = 0; i < XXH_ACC_NB / 2; i += 2) {
/* data_vec = input[i]; */
uint64x2_t data_vec_1 = XXH_vld1q_u64(input + (i * 16));
uint64x2_t data_vec_2 = XXH_vld1q_u64(input + ((i + 1) * 16));

/* key_vec = secret[i]; */
uint64x2_t key_vec_1 = XXH_vld1q_u64(secret + (i * 16));
uint64x2_t key_vec_2 = XXH_vld1q_u64(secret + ((i + 1) * 16));

/* data_swap = swap(data_vec) */
uint64x2_t data_swap_1 = vextq_u64(data_vec_1, data_vec_1, 1);
uint64x2_t data_swap_2 = vextq_u64(data_vec_2, data_vec_2, 1);

/* data_key = data_vec ^ key_vec; */
uint64x2_t data_key_1 = veorq_u64(data_vec_1, key_vec_1);
uint64x2_t data_key_2 = veorq_u64(data_vec_2, key_vec_2);

/*
* If we reinterpret the 64x2 vectors as 32x4 vectors, we can use a
* de-interleave operation for 4 lanes in 1 step with `vuzpq_u32` to
* get one vector with the low 32 bits of each lane, and one vector
* with the high 32 bits of each lane.
*
* The intrinsic returns a double vector because the original ARMv7-a
* instruction modified both arguments in place. AArch64 and SIMD128 emit
* two instructions from this intrinsic.
*
* [ dk11L | dk11H | dk12L | dk12H ] -> [ dk11L | dk12L | dk21L | dk22L ]
* [ dk21L | dk21H | dk22L | dk22H ] -> [ dk11H | dk12H | dk21H | dk22H ]
*/
uint32x4x2_t unzipped = vuzpq_u32(vreinterpretq_u32_u64(data_key_1),
vreinterpretq_u32_u64(data_key_2));

/* data_key_lo = data_key & 0xFFFFFFFF */
uint32x4_t data_key_lo = unzipped.val[0];
/* data_key_hi = data_key >> 32 */
uint32x4_t data_key_hi = unzipped.val[1];

/*
* Then, we can split the vectors horizontally and multiply which, as for
* most widening intrinsics, have a variant that works on both high half
* vectors for free on AArch64. A similar instruction is available on
* SIMD128.
*
* sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi
*/
uint64x2_t sum_1 = vmlal_u32(data_swap_1, vget_low_u32(data_key_lo),
vget_low_u32(data_key_hi));
uint64x2_t sum_2 = vmlal_u32(data_swap_2, vget_high_u32(data_key_lo),
vget_high_u32(data_key_hi));

/* xacc[i] = acc_vec + sum; */
xacc[i] = vaddq_u64(xacc[i], sum_1);
xacc[i + 1] = vaddq_u64(xacc[i + 1], sum_2);
}
}
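
For readers less familiar with NEON, the loop above computes the same per-lane update as the scalar path defined after the #else below; it simply handles four 64-bit lanes per iteration. A standalone scalar sketch of a single lane's step (hypothetical helper, little-endian host assumed, not part of the patch):

#include <cstdint>
#include <cstring>

// One XXH3 accumulate step for lane I of the 8-lane accumulator, written out
// in scalar form.
inline void accumulateLaneScalar(uint64_t Acc[8], const uint8_t *Input,
                                 const uint8_t *Secret, size_t I) {
  uint64_t DataVal, KeyVal;
  std::memcpy(&DataVal, Input + 8 * I, 8);  // unaligned native-endian load
  std::memcpy(&KeyVal, Secret + 8 * I, 8);
  uint64_t DataKey = DataVal ^ KeyVal;
  Acc[I ^ 1] += DataVal;                               // the "data_swap" term
  Acc[I] += (DataKey & 0xFFFFFFFF) * (DataKey >> 32);  // 32x32 -> 64 multiply
}
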

LLVM_ATTRIBUTE_ALWAYS_INLINE
static void XXH3_scrambleAcc_neon(uint64_t *acc, const uint8_t *secret) {
xxh_aliasing_uint64x2_t *const xacc = (xxh_aliasing_uint64x2_t *)acc;

/* { prime32_1, prime32_1 } */
uint32x2_t const kPrimeLo = vdup_n_u32(PRIME32_1);
/* { 0, prime32_1, 0, prime32_1 } */
uint32x4_t const kPrimeHi =
vreinterpretq_u32_u64(vdupq_n_u64((uint64_t)PRIME32_1 << 32));

for (size_t i = 0; i < XXH_ACC_NB / 2; ++i) {
/* xacc[i] ^= (xacc[i] >> 47); */
uint64x2_t acc_vec = XXH_vld1q_u64(acc + (2 * i));
uint64x2_t shifted = vshrq_n_u64(acc_vec, 47);
uint64x2_t data_vec = veorq_u64(acc_vec, shifted);

/* xacc[i] ^= secret[i]; */
uint64x2_t key_vec = XXH_vld1q_u64(secret + (i * 16));
uint64x2_t data_key = veorq_u64(data_vec, key_vec);

/*
* xacc[i] *= XXH_PRIME32_1
*
* Expanded version with portable NEON intrinsics
*
* lo(x) * lo(y) + (hi(x) * lo(y) << 32)
*
* prod_hi = hi(data_key) * lo(prime) << 32
*
* Since we only need 32 bits of this multiply a trick can be used,
* reinterpreting the vector as a uint32x4_t and multiplying by
* { 0, prime, 0, prime } to cancel out the unwanted bits and avoid the
* shift.
*/
uint32x4_t prod_hi = vmulq_u32(vreinterpretq_u32_u64(data_key), kPrimeHi);

/* Extract low bits for vmlal_u32 */
uint32x2_t data_key_lo = vmovn_u64(data_key);

/* xacc[i] = prod_hi + lo(data_key) * XXH_PRIME32_1; */
xacc[i] = vmlal_u32(vreinterpretq_u64_u32(prod_hi), data_key_lo, kPrimeLo);
}
}
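
The {0, prime, 0, prime} trick above works because, modulo 2^64, x * prime decomposes into lo(x) * prime plus the low 32 bits of hi(x) * prime shifted into the upper half. A small self-contained check of that identity (illustrative only, not part of the patch; 0x9E3779B1 is the value of PRIME32_1):

#include <cassert>
#include <cstdint>

// The scramble step's multiply decomposition: the wrapped 64-bit product
// x * p equals the widening low-half product plus the truncated high-half
// product placed in the upper 32 bits.
bool checkMulDecomposition(uint64_t X, uint32_t P) {
  uint64_t Lo = X & 0xFFFFFFFF;
  uint64_t Hi = X >> 32;
  uint64_t ProdHi = (Hi * P) << 32; // only the low 32 bits of Hi * P survive
  return ProdHi + Lo * P == X * P;  // both sides wrap modulo 2^64
}

int main() {
  const uint32_t Prime = 0x9E3779B1U;
  assert(checkMulDecomposition(0x0123456789ABCDEFULL, Prime));
  assert(checkMulDecomposition(~0ULL, Prime));
  return 0;
}
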
#else

#define XXH3_accumulate_512 XXH3_accumulate_512_scalar
#define XXH3_scrambleAcc XXH3_scrambleAcc_scalar

LLVM_ATTRIBUTE_ALWAYS_INLINE
static void XXH3_accumulate_512_scalar(uint64_t *acc, const uint8_t *input,
const uint8_t *secret) {
@@ -335,20 +486,23 @@ static void XXH3_accumulate_512_scalar(uint64_t *acc, const uint8_t *input,
}

LLVM_ATTRIBUTE_ALWAYS_INLINE
static void XXH3_accumulate_scalar(uint64_t *acc, const uint8_t *input,
const uint8_t *secret, size_t nbStripes) {
for (size_t n = 0; n < nbStripes; ++n)
XXH3_accumulate_512_scalar(acc, input + n * XXH_STRIPE_LEN,
secret + n * XXH_SECRET_CONSUME_RATE);
}

static void XXH3_scrambleAcc(uint64_t *acc, const uint8_t *secret) {
static void XXH3_scrambleAcc_scalar(uint64_t *acc, const uint8_t *secret) {
for (size_t i = 0; i < XXH_ACC_NB; ++i) {
acc[i] ^= acc[i] >> 47;
acc[i] ^= endian::read64le(secret + 8 * i);
acc[i] *= PRIME32_1;
}
}
#endif

LLVM_ATTRIBUTE_ALWAYS_INLINE
static void XXH3_accumulate(uint64_t *acc, const uint8_t *input,
const uint8_t *secret, size_t nbStripes) {
for (size_t n = 0; n < nbStripes; ++n) {
XXH3_accumulate_512(acc, input + n * XXH_STRIPE_LEN,
secret + n * XXH_SECRET_CONSUME_RATE);
}
}

static uint64_t XXH3_mix2Accs(const uint64_t *acc, const uint8_t *secret) {
return XXH3_mul128_fold64(acc[0] ^ endian::read64le(secret),
Expand All @@ -375,21 +529,20 @@ static uint64_t XXH3_hashLong_64b(const uint8_t *input, size_t len,
PRIME64_4, PRIME32_2, PRIME64_5, PRIME32_1,
};
for (size_t n = 0; n < nb_blocks; ++n) {
XXH3_accumulate_scalar(acc, input + n * block_len, secret,
nbStripesPerBlock);
XXH3_accumulate(acc, input + n * block_len, secret, nbStripesPerBlock);
XXH3_scrambleAcc(acc, secret + secretSize - XXH_STRIPE_LEN);
}

/* last partial block */
const size_t nbStripes = (len - 1 - (block_len * nb_blocks)) / XXH_STRIPE_LEN;
assert(nbStripes <= secretSize / XXH_SECRET_CONSUME_RATE);
XXH3_accumulate_scalar(acc, input + nb_blocks * block_len, secret, nbStripes);
XXH3_accumulate(acc, input + nb_blocks * block_len, secret, nbStripes);

/* last stripe */
constexpr size_t XXH_SECRET_LASTACC_START = 7;
XXH3_accumulate_512_scalar(acc, input + len - XXH_STRIPE_LEN,
secret + secretSize - XXH_STRIPE_LEN -
XXH_SECRET_LASTACC_START);
XXH3_accumulate_512(acc, input + len - XXH_STRIPE_LEN,
secret + secretSize - XXH_STRIPE_LEN -
XXH_SECRET_LASTACC_START);

/* converge into final hash */
constexpr size_t XXH_SECRET_MERGEACCS_START = 11;
@@ -840,21 +993,20 @@ XXH3_hashLong_128b(const uint8_t *input, size_t len, const uint8_t *secret,
};

for (size_t n = 0; n < nb_blocks; ++n) {
XXH3_accumulate_scalar(acc, input + n * block_len, secret,
nbStripesPerBlock);
XXH3_accumulate(acc, input + n * block_len, secret, nbStripesPerBlock);
XXH3_scrambleAcc(acc, secret + secretSize - XXH_STRIPE_LEN);
}

/* last partial block */
const size_t nbStripes = (len - 1 - (block_len * nb_blocks)) / XXH_STRIPE_LEN;
assert(nbStripes <= secretSize / XXH_SECRET_CONSUME_RATE);
XXH3_accumulate_scalar(acc, input + nb_blocks * block_len, secret, nbStripes);
XXH3_accumulate(acc, input + nb_blocks * block_len, secret, nbStripes);

/* last stripe */
constexpr size_t XXH_SECRET_LASTACC_START = 7;
XXH3_accumulate_512_scalar(acc, input + len - XXH_STRIPE_LEN,
secret + secretSize - XXH_STRIPE_LEN -
XXH_SECRET_LASTACC_START);
XXH3_accumulate_512(acc, input + len - XXH_STRIPE_LEN,
secret + secretSize - XXH_STRIPE_LEN -
XXH_SECRET_LASTACC_START);

/* converge into final hash */
static_assert(sizeof(acc) == 64);