[Support] Add Arm NEON implementation for llvm::xxh3_64bits
#99634
Conversation
Compared to the generic scalar code, using Arm NEON instructions yields a ~11x speedup: 31 vs 339.5 ms to hash 1 GiB of random data on the Apple M1.

This follows the upstream implementation closely, with some simplifications made:

- Removed workarounds for suboptimal codegen on older GCC
- Removed instruction reordering barriers, which seem to have a negligible impact according to my measurements
- We do not support WebAssembly's mostly NEON-compatible API
- There is no configurable mixing of SIMD and scalar code; according to the upstream comments, this is only relevant for smaller Cortex cores which can dispatch relatively few NEON micro-ops per cycle

This commit intends to use only standard ACLE intrinsics and datatypes, so it should build with all supported versions of GCC, Clang and MSVC. This feature is enabled by default when targeting AArch64, but the `LLVM_XXH_USE_NEON=0` macro can be set to explicitly disable it.

XXH3 is used for ICF, string deduplication and computing the UUID in ld64.lld; this commit results in a -1.77% +/- 0.59% speed improvement for a `--threads=8` link of Chromium.framework.
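For context (not part of the diff), the NEON/scalar choice is made inside `xxh3_64bits` itself, so call sites are unchanged; a minimal caller-side sketch, where `hashBuffer` is a hypothetical helper:

```cpp
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/xxhash.h"

#include <cstdint>
#include <vector>

// Hash an arbitrary byte buffer with XXH3-64. Whether the NEON or the scalar
// accumulate/scramble kernels run is decided when xxhash.cpp is compiled.
uint64_t hashBuffer(const std::vector<uint8_t> &Buf) {
  return llvm::xxh3_64bits(llvm::ArrayRef<uint8_t>(Buf.data(), Buf.size()));
}
```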
@llvm/pr-subscribers-llvm-support

Author: Daniel Bertalan (BertalanD)

Changes: same as the PR description above.

Full diff: https://github.com/llvm/llvm-project/pull/99634.diff

3 Files Affected:
diff --git a/llvm/benchmarks/CMakeLists.txt b/llvm/benchmarks/CMakeLists.txt
index 43f88f7257924..52d726451ada9 100644
--- a/llvm/benchmarks/CMakeLists.txt
+++ b/llvm/benchmarks/CMakeLists.txt
@@ -1,4 +1,5 @@
set(LLVM_LINK_COMPONENTS
Support)
-add_benchmark(DummyYAML DummyYAML.cpp)
+add_benchmark(DummyYAML DummyYAML.cpp PARTIAL_SOURCES_INTENDED)
+add_benchmark(xxhash xxhash.cpp PARTIAL_SOURCES_INTENDED)
diff --git a/llvm/benchmarks/xxhash.cpp b/llvm/benchmarks/xxhash.cpp
new file mode 100644
index 0000000000000..0c499b12ea67e
--- /dev/null
+++ b/llvm/benchmarks/xxhash.cpp
@@ -0,0 +1,36 @@
+#include "llvm/Support/xxhash.h"
+#include "benchmark/benchmark.h"
+
+static uint32_t xorshift(uint32_t State) {
+ State ^= State << 13;
+ State ^= State >> 17;
+ State ^= State << 5;
+ return State;
+}
+
+static void BM_xxh3_64bits(benchmark::State &State) {
+ uint32_t *Data = new uint32_t[State.range(0) / 4];
+
+ uint32_t Prev = 0xcafebabe;
+ for (int64_t I = 0; I < State.range(0) / 4; I++) {
+ Data[I] = Prev = xorshift(Prev);
+ }
+
+ llvm::ArrayRef DataRef =
+ llvm::ArrayRef(reinterpret_cast<uint8_t *>(Data), State.range(0));
+
+ for (auto _ : State) {
+ llvm::xxh3_64bits(DataRef);
+ }
+
+ delete[] Data;
+}
+
+BENCHMARK(BM_xxh3_64bits)
+ ->Arg(32)
+ ->Arg(512)
+ ->Arg(64 * 1024)
+ ->Arg(1024 * 1024);
+
+BENCHMARK_MAIN();
+
diff --git a/llvm/lib/Support/xxhash.cpp b/llvm/lib/Support/xxhash.cpp
index 607789b391381..cdb76d57e2c1d 100644
--- a/llvm/lib/Support/xxhash.cpp
+++ b/llvm/lib/Support/xxhash.cpp
@@ -47,6 +47,19 @@
#include <stdlib.h>
+#if !defined(LLVM_XXH_USE_NEON)
+#if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) && \
+ !defined(__ARM_BIG_ENDIAN)
+#define LLVM_XXH_USE_NEON 1
+#else
+#define LLVM_XXH_USE_NEON 0
+#endif
+#endif
+
+#if LLVM_XXH_USE_NEON
+#include <arm_neon.h>
+#endif
+
using namespace llvm;
using namespace support;
@@ -323,6 +336,144 @@ static uint64_t XXH3_len_129to240_64b(const uint8_t *input, size_t len,
return XXH3_avalanche(acc);
}
+#if LLVM_XXH_USE_NEON
+
+#define XXH3_accumulate_512 XXH3_accumulate_512_neon
+#define XXH3_scrambleAcc XXH3_scrambleAcc_neon
+
+// NEON implementation based on commit a57f6cce2698049863af8c25787084ae0489d849
+// (July 2024), with the following removed:
+// - workaround for suboptimal codegen on older GCC
+// - compiler barriers against instruction reordering
+// - WebAssembly SIMD support
+// - configurable split between NEON and scalar lanes (benchmarking shows no
+// penalty when fully doing SIMD on the Apple M1)
+
+#if defined(__GNUC__) || defined(__clang__)
+#define XXH_ALIASING __attribute__((__may_alias__))
+#else
+#define XXH_ALIASING /* nothing */
+#endif
+
+typedef uint64x2_t xxh_aliasing_uint64x2_t XXH_ALIASING;
+
+LLVM_ATTRIBUTE_ALWAYS_INLINE static uint64x2_t XXH_vld1q_u64(void const *ptr) {
+ return vreinterpretq_u64_u8(vld1q_u8((uint8_t const *)ptr));
+}
+
+LLVM_ATTRIBUTE_ALWAYS_INLINE
+static void XXH3_accumulate_512_neon(uint64_t *acc, const uint8_t *input,
+ const uint8_t *secret) {
+ xxh_aliasing_uint64x2_t *const xacc = (xxh_aliasing_uint64x2_t *)acc;
+
+#ifdef __clang__
+#pragma clang loop unroll(full)
+#endif
+ for (size_t i = 0; i < XXH_ACC_NB / 2; i += 2) {
+ /* data_vec = input[i]; */
+ uint64x2_t data_vec_1 = XXH_vld1q_u64(input + (i * 16));
+ uint64x2_t data_vec_2 = XXH_vld1q_u64(input + ((i + 1) * 16));
+
+ /* key_vec = secret[i]; */
+ uint64x2_t key_vec_1 = XXH_vld1q_u64(secret + (i * 16));
+ uint64x2_t key_vec_2 = XXH_vld1q_u64(secret + ((i + 1) * 16));
+
+ /* data_swap = swap(data_vec) */
+ uint64x2_t data_swap_1 = vextq_u64(data_vec_1, data_vec_1, 1);
+ uint64x2_t data_swap_2 = vextq_u64(data_vec_2, data_vec_2, 1);
+
+ /* data_key = data_vec ^ key_vec; */
+ uint64x2_t data_key_1 = veorq_u64(data_vec_1, key_vec_1);
+ uint64x2_t data_key_2 = veorq_u64(data_vec_2, key_vec_2);
+
+ /*
+ * If we reinterpret the 64x2 vectors as 32x4 vectors, we can use a
+ * de-interleave operation for 4 lanes in 1 step with `vuzpq_u32` to
+ * get one vector with the low 32 bits of each lane, and one vector
+ * with the high 32 bits of each lane.
+ *
+ * The intrinsic returns a double vector because the original ARMv7-a
+ * instruction modified both arguments in place. AArch64 and SIMD128 emit
+ * two instructions from this intrinsic.
+ *
+ * [ dk11L | dk11H | dk12L | dk12H ] -> [ dk11L | dk12L | dk21L | dk22L ]
+ * [ dk21L | dk21H | dk22L | dk22H ] -> [ dk11H | dk12H | dk21H | dk22H ]
+ */
+ uint32x4x2_t unzipped = vuzpq_u32(vreinterpretq_u32_u64(data_key_1),
+ vreinterpretq_u32_u64(data_key_2));
+
+ /* data_key_lo = data_key & 0xFFFFFFFF */
+ uint32x4_t data_key_lo = unzipped.val[0];
+ /* data_key_hi = data_key >> 32 */
+ uint32x4_t data_key_hi = unzipped.val[1];
+
+ /*
+ * Then, we can split the vectors horizontally and multiply which, as for
+ * most widening intrinsics, have a variant that works on both high half
+ * vectors for free on AArch64. A similar instruction is available on
+ * SIMD128.
+ *
+ * sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi
+ */
+ uint64x2_t sum_1 = vmlal_u32(data_swap_1, vget_low_u32(data_key_lo),
+ vget_low_u32(data_key_hi));
+ uint64x2_t sum_2 = vmlal_u32(data_swap_2, vget_high_u32(data_key_lo),
+ vget_high_u32(data_key_hi));
+
+ /* xacc[i] = acc_vec + sum; */
+ xacc[i] = vaddq_u64(xacc[i], sum_1);
+ xacc[i + 1] = vaddq_u64(xacc[i + 1], sum_2);
+ }
+}
+
+LLVM_ATTRIBUTE_ALWAYS_INLINE
+static void XXH3_scrambleAcc_neon(uint64_t *acc, const uint8_t *secret) {
+ xxh_aliasing_uint64x2_t *const xacc = (xxh_aliasing_uint64x2_t *)acc;
+
+ /* { prime32_1, prime32_1 } */
+ uint32x2_t const kPrimeLo = vdup_n_u32(PRIME32_1);
+ /* { 0, prime32_1, 0, prime32_1 } */
+ uint32x4_t const kPrimeHi =
+ vreinterpretq_u32_u64(vdupq_n_u64((uint64_t)PRIME32_1 << 32));
+
+ for (size_t i = 0; i < XXH_ACC_NB / 2; ++i) {
+ /* xacc[i] ^= (xacc[i] >> 47); */
+ uint64x2_t acc_vec = XXH_vld1q_u64(acc + (2 * i));
+ uint64x2_t shifted = vshrq_n_u64(acc_vec, 47);
+ uint64x2_t data_vec = veorq_u64(acc_vec, shifted);
+
+ /* xacc[i] ^= secret[i]; */
+ uint64x2_t key_vec = XXH_vld1q_u64(secret + (i * 16));
+ uint64x2_t data_key = veorq_u64(data_vec, key_vec);
+
+ /*
+ * xacc[i] *= XXH_PRIME32_1
+ *
+ * Expanded version with portable NEON intrinsics
+ *
+ * lo(x) * lo(y) + (hi(x) * lo(y) << 32)
+ *
+ * prod_hi = hi(data_key) * lo(prime) << 32
+ *
+ * Since we only need 32 bits of this multiply a trick can be used,
+ * reinterpreting the vector as a uint32x4_t and multiplying by
+ * { 0, prime, 0, prime } to cancel out the unwanted bits and avoid the
+ * shift.
+ */
+ uint32x4_t prod_hi = vmulq_u32(vreinterpretq_u32_u64(data_key), kPrimeHi);
+
+ /* Extract low bits for vmlal_u32 */
+ uint32x2_t data_key_lo = vmovn_u64(data_key);
+
+ /* xacc[i] = prod_hi + lo(data_key) * XXH_PRIME32_1; */
+ xacc[i] = vmlal_u32(vreinterpretq_u64_u32(prod_hi), data_key_lo, kPrimeLo);
+ }
+}
+#else
+
+#define XXH3_accumulate_512 XXH3_accumulate_512_scalar
+#define XXH3_scrambleAcc XXH3_scrambleAcc_scalar
+
LLVM_ATTRIBUTE_ALWAYS_INLINE
static void XXH3_accumulate_512_scalar(uint64_t *acc, const uint8_t *input,
const uint8_t *secret) {
@@ -335,20 +486,23 @@ static void XXH3_accumulate_512_scalar(uint64_t *acc, const uint8_t *input,
}
LLVM_ATTRIBUTE_ALWAYS_INLINE
-static void XXH3_accumulate_scalar(uint64_t *acc, const uint8_t *input,
- const uint8_t *secret, size_t nbStripes) {
- for (size_t n = 0; n < nbStripes; ++n)
- XXH3_accumulate_512_scalar(acc, input + n * XXH_STRIPE_LEN,
- secret + n * XXH_SECRET_CONSUME_RATE);
-}
-
-static void XXH3_scrambleAcc(uint64_t *acc, const uint8_t *secret) {
+static void XXH3_scrambleAcc_scalar(uint64_t *acc, const uint8_t *secret) {
for (size_t i = 0; i < XXH_ACC_NB; ++i) {
acc[i] ^= acc[i] >> 47;
acc[i] ^= endian::read64le(secret + 8 * i);
acc[i] *= PRIME32_1;
}
}
+#endif
+
+LLVM_ATTRIBUTE_ALWAYS_INLINE
+static void XXH3_accumulate(uint64_t *acc, const uint8_t *input,
+ const uint8_t *secret, size_t nbStripes) {
+ for (size_t n = 0; n < nbStripes; ++n) {
+ XXH3_accumulate_512(acc, input + n * XXH_STRIPE_LEN,
+ secret + n * XXH_SECRET_CONSUME_RATE);
+ }
+}
static uint64_t XXH3_mix2Accs(const uint64_t *acc, const uint8_t *secret) {
return XXH3_mul128_fold64(acc[0] ^ endian::read64le(secret),
@@ -375,21 +529,20 @@ static uint64_t XXH3_hashLong_64b(const uint8_t *input, size_t len,
PRIME64_4, PRIME32_2, PRIME64_5, PRIME32_1,
};
for (size_t n = 0; n < nb_blocks; ++n) {
- XXH3_accumulate_scalar(acc, input + n * block_len, secret,
- nbStripesPerBlock);
+ XXH3_accumulate(acc, input + n * block_len, secret, nbStripesPerBlock);
XXH3_scrambleAcc(acc, secret + secretSize - XXH_STRIPE_LEN);
}
/* last partial block */
const size_t nbStripes = (len - 1 - (block_len * nb_blocks)) / XXH_STRIPE_LEN;
assert(nbStripes <= secretSize / XXH_SECRET_CONSUME_RATE);
- XXH3_accumulate_scalar(acc, input + nb_blocks * block_len, secret, nbStripes);
+ XXH3_accumulate(acc, input + nb_blocks * block_len, secret, nbStripes);
/* last stripe */
constexpr size_t XXH_SECRET_LASTACC_START = 7;
- XXH3_accumulate_512_scalar(acc, input + len - XXH_STRIPE_LEN,
- secret + secretSize - XXH_STRIPE_LEN -
- XXH_SECRET_LASTACC_START);
+ XXH3_accumulate_512(acc, input + len - XXH_STRIPE_LEN,
+ secret + secretSize - XXH_STRIPE_LEN -
+ XXH_SECRET_LASTACC_START);
/* converge into final hash */
constexpr size_t XXH_SECRET_MERGEACCS_START = 11;
@@ -840,21 +993,20 @@ XXH3_hashLong_128b(const uint8_t *input, size_t len, const uint8_t *secret,
};
for (size_t n = 0; n < nb_blocks; ++n) {
- XXH3_accumulate_scalar(acc, input + n * block_len, secret,
- nbStripesPerBlock);
+ XXH3_accumulate(acc, input + n * block_len, secret, nbStripesPerBlock);
XXH3_scrambleAcc(acc, secret + secretSize - XXH_STRIPE_LEN);
}
/* last partial block */
const size_t nbStripes = (len - 1 - (block_len * nb_blocks)) / XXH_STRIPE_LEN;
assert(nbStripes <= secretSize / XXH_SECRET_CONSUME_RATE);
- XXH3_accumulate_scalar(acc, input + nb_blocks * block_len, secret, nbStripes);
+ XXH3_accumulate(acc, input + nb_blocks * block_len, secret, nbStripes);
/* last stripe */
constexpr size_t XXH_SECRET_LASTACC_START = 7;
- XXH3_accumulate_512_scalar(acc, input + len - XXH_STRIPE_LEN,
- secret + secretSize - XXH_STRIPE_LEN -
- XXH_SECRET_LASTACC_START);
+ XXH3_accumulate_512(acc, input + len - XXH_STRIPE_LEN,
+ secret + secretSize - XXH_STRIPE_LEN -
+ XXH_SECRET_LASTACC_START);
/* converge into final hash */
static_assert(sizeof(acc) == 64);
✅ With the latest revision this PR passed the C/C++ code formatter.
Nice!
@@ -1,4 +1,5 @@
set(LLVM_LINK_COMPONENTS
  Support)
-add_benchmark(DummyYAML DummyYAML.cpp)
+add_benchmark(DummyYAML DummyYAML.cpp PARTIAL_SOURCES_INTENDED)
+add_benchmark(xxhash xxhash.cpp PARTIAL_SOURCES_INTENDED)
@BertalanD @kirillbobyrev this adds a ninja target `xxhash` which just builds `benchmarks/xxhash`. Is that necessary or even useful? It caused a clash for us downstream when LLVM is included as a subproject into a larger project, because some other code also used `xxhash` as a ninja target name.
Sorry, I didn't expect it to create a top-level Ninja target. I would be inclined to rename it to `benchmarkLLVMxxhash` or the like to reduce the chance of name collisions. But there's probably some CMake trick for this that I do not know of. Any ideas?
Downstream users reported that the `xxhash` target name conflicts with some of their existing build targets. Rename it to `bench-xxhash`. See llvm#99634 (comment)
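A sketch of what that follow-up presumably amounts to in llvm/benchmarks/CMakeLists.txt (only the target name changes; `bench-xxhash` is taken from the commit message above):

```cmake
# Rename the benchmark target so it no longer claims the top-level `xxhash`
# ninja target name when LLVM is embedded in a larger build.
add_benchmark(bench-xxhash xxhash.cpp PARTIAL_SOURCES_INTENDED)
```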