 #include <stdlib.h>

+ #if !defined(LLVM_XXH_USE_NEON)
+ #if (defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) &&     \
+     !defined(__ARM_BIG_ENDIAN)
+ #define LLVM_XXH_USE_NEON 1
+ #else
+ #define LLVM_XXH_USE_NEON 0
+ #endif
+ #endif
+
+ #if LLVM_XXH_USE_NEON
+ #include <arm_neon.h>
+ #endif
+
 using namespace llvm;
 using namespace support;

@@ -323,6 +336,144 @@ static uint64_t XXH3_len_129to240_64b(const uint8_t *input, size_t len,
   return XXH3_avalanche(acc);
 }
+ #if LLVM_XXH_USE_NEON
+
+ #define XXH3_accumulate_512 XXH3_accumulate_512_neon
+ #define XXH3_scrambleAcc XXH3_scrambleAcc_neon
+
+ // NEON implementation based on commit a57f6cce2698049863af8c25787084ae0489d849
+ // (July 2024), with the following removed:
+ // - workaround for suboptimal codegen on older GCC
+ // - compiler barriers against instruction reordering
+ // - WebAssembly SIMD support
+ // - configurable split between NEON and scalar lanes (benchmarking shows no
+ //   penalty when fully doing SIMD on the Apple M1)
+
+ #if defined(__GNUC__) || defined(__clang__)
+ #define XXH_ALIASING __attribute__((__may_alias__))
+ #else
+ #define XXH_ALIASING /* nothing */
+ #endif
+
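+ // Aliasing-safe vector type: lets the uint64_t accumulator array be read and
+ // written through NEON vectors without violating strict aliasing.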
+ typedef uint64x2_t xxh_aliasing_uint64x2_t XXH_ALIASING;
+
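+ // Unaligned 128-bit load: read 16 bytes with vld1q_u8 and reinterpret them as
+ // two 64-bit lanes.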
+ LLVM_ATTRIBUTE_ALWAYS_INLINE static uint64x2_t XXH_vld1q_u64(void const *ptr) {
+   return vreinterpretq_u64_u8(vld1q_u8((uint8_t const *)ptr));
+ }
+
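+ // Process one 64-byte stripe: each loop iteration consumes 32 bytes of input
+ // and updates two 128-bit accumulator vectors (four 64-bit lanes).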
+ LLVM_ATTRIBUTE_ALWAYS_INLINE
+ static void XXH3_accumulate_512_neon(uint64_t *acc, const uint8_t *input,
+                                      const uint8_t *secret) {
+   xxh_aliasing_uint64x2_t *const xacc = (xxh_aliasing_uint64x2_t *)acc;
+
+ #ifdef __clang__
+ #pragma clang loop unroll(full)
+ #endif
+   for (size_t i = 0; i < XXH_ACC_NB / 2; i += 2) {
+     /* data_vec = input[i]; */
+     uint64x2_t data_vec_1 = XXH_vld1q_u64(input + (i * 16));
+     uint64x2_t data_vec_2 = XXH_vld1q_u64(input + ((i + 1) * 16));
+
+     /* key_vec = secret[i]; */
+     uint64x2_t key_vec_1 = XXH_vld1q_u64(secret + (i * 16));
+     uint64x2_t key_vec_2 = XXH_vld1q_u64(secret + ((i + 1) * 16));
+
+     /* data_swap = swap(data_vec) */
+     uint64x2_t data_swap_1 = vextq_u64(data_vec_1, data_vec_1, 1);
+     uint64x2_t data_swap_2 = vextq_u64(data_vec_2, data_vec_2, 1);
+
+     /* data_key = data_vec ^ key_vec; */
+     uint64x2_t data_key_1 = veorq_u64(data_vec_1, key_vec_1);
+     uint64x2_t data_key_2 = veorq_u64(data_vec_2, key_vec_2);
+
+     /*
+      * If we reinterpret the 64x2 vectors as 32x4 vectors, we can use a
+      * de-interleave operation for 4 lanes in 1 step with `vuzpq_u32` to
+      * get one vector with the low 32 bits of each lane, and one vector
+      * with the high 32 bits of each lane.
+      *
+      * The intrinsic returns a double vector because the original ARMv7-a
+      * instruction modified both arguments in place. AArch64 and SIMD128 emit
+      * two instructions from this intrinsic.
+      *
+      * [ dk11L | dk11H | dk12L | dk12H ] -> [ dk11L | dk12L | dk21L | dk22L ]
+      * [ dk21L | dk21H | dk22L | dk22H ] -> [ dk11H | dk12H | dk21H | dk22H ]
+      */
+     uint32x4x2_t unzipped = vuzpq_u32(vreinterpretq_u32_u64(data_key_1),
+                                       vreinterpretq_u32_u64(data_key_2));
+
+     /* data_key_lo = data_key & 0xFFFFFFFF */
+     uint32x4_t data_key_lo = unzipped.val[0];
+     /* data_key_hi = data_key >> 32 */
+     uint32x4_t data_key_hi = unzipped.val[1];
+
+     /*
+      * Then, we can split the vectors horizontally and multiply which, as for
+      * most widening intrinsics, have a variant that works on both high half
+      * vectors for free on AArch64. A similar instruction is available on
+      * SIMD128.
+      *
+      * sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi
+      */
+     uint64x2_t sum_1 = vmlal_u32(data_swap_1, vget_low_u32(data_key_lo),
+                                  vget_low_u32(data_key_hi));
+     uint64x2_t sum_2 = vmlal_u32(data_swap_2, vget_high_u32(data_key_lo),
+                                  vget_high_u32(data_key_hi));
+
+     /* xacc[i] = acc_vec + sum; */
+     xacc[i] = vaddq_u64(xacc[i], sum_1);
+     xacc[i + 1] = vaddq_u64(xacc[i + 1], sum_2);
+   }
+ }
+
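+ // NEON version of the scramble step:
+ //   xacc[i] = (xacc[i] ^ (xacc[i] >> 47) ^ secret[i]) * PRIME32_1,
+ // matching the scalar implementation below.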
+ LLVM_ATTRIBUTE_ALWAYS_INLINE
+ static void XXH3_scrambleAcc_neon(uint64_t *acc, const uint8_t *secret) {
+   xxh_aliasing_uint64x2_t *const xacc = (xxh_aliasing_uint64x2_t *)acc;
+
+   /* { prime32_1, prime32_1 } */
+   uint32x2_t const kPrimeLo = vdup_n_u32(PRIME32_1);
+   /* { 0, prime32_1, 0, prime32_1 } */
+   uint32x4_t const kPrimeHi =
+       vreinterpretq_u32_u64(vdupq_n_u64((uint64_t)PRIME32_1 << 32));
+
+   for (size_t i = 0; i < XXH_ACC_NB / 2; ++i) {
+     /* xacc[i] ^= (xacc[i] >> 47); */
+     uint64x2_t acc_vec = XXH_vld1q_u64(acc + (2 * i));
+     uint64x2_t shifted = vshrq_n_u64(acc_vec, 47);
+     uint64x2_t data_vec = veorq_u64(acc_vec, shifted);
+
+     /* xacc[i] ^= secret[i]; */
+     uint64x2_t key_vec = XXH_vld1q_u64(secret + (i * 16));
+     uint64x2_t data_key = veorq_u64(data_vec, key_vec);
+
+     /*
+      * xacc[i] *= XXH_PRIME32_1
+      *
+      * Expanded version with portable NEON intrinsics
+      *
+      *   lo(x) * lo(y) + (hi(x) * lo(y) << 32)
+      *
+      * prod_hi = hi(data_key) * lo(prime) << 32
+      *
+      * Since we only need 32 bits of this multiply a trick can be used,
+      * reinterpreting the vector as a uint32x4_t and multiplying by
+      * { 0, prime, 0, prime } to cancel out the unwanted bits and avoid the
+      * shift.
+      */
+     uint32x4_t prod_hi = vmulq_u32(vreinterpretq_u32_u64(data_key), kPrimeHi);
+
+     /* Extract low bits for vmlal_u32 */
+     uint32x2_t data_key_lo = vmovn_u64(data_key);
+
+     /* xacc[i] = prod_hi + lo(data_key) * XXH_PRIME32_1; */
+     xacc[i] = vmlal_u32(vreinterpretq_u64_u32(prod_hi), data_key_lo, kPrimeLo);
+   }
+ }
+ #else
+
+ #define XXH3_accumulate_512 XXH3_accumulate_512_scalar
+ #define XXH3_scrambleAcc XXH3_scrambleAcc_scalar
+
 LLVM_ATTRIBUTE_ALWAYS_INLINE
 static void XXH3_accumulate_512_scalar(uint64_t *acc, const uint8_t *input,
                                        const uint8_t *secret) {
@@ -335,20 +486,23 @@ static void XXH3_accumulate_512_scalar(uint64_t *acc, const uint8_t *input,
 }

 LLVM_ATTRIBUTE_ALWAYS_INLINE
- static void XXH3_accumulate_scalar(uint64_t *acc, const uint8_t *input,
-                                    const uint8_t *secret, size_t nbStripes) {
-   for (size_t n = 0; n < nbStripes; ++n)
-     XXH3_accumulate_512_scalar(acc, input + n * XXH_STRIPE_LEN,
-                                secret + n * XXH_SECRET_CONSUME_RATE);
- }
-
- static void XXH3_scrambleAcc(uint64_t *acc, const uint8_t *secret) {
+ static void XXH3_scrambleAcc_scalar(uint64_t *acc, const uint8_t *secret) {
   for (size_t i = 0; i < XXH_ACC_NB; ++i) {
     acc[i] ^= acc[i] >> 47;
     acc[i] ^= endian::read64le(secret + 8 * i);
     acc[i] *= PRIME32_1;
   }
 }
+ #endif
+
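+ // Dispatcher over whole stripes: applies XXH3_accumulate_512 (NEON or scalar)
+ // to nbStripes consecutive 64-byte stripes, advancing the secret by
+ // XXH_SECRET_CONSUME_RATE bytes per stripe.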
+ LLVM_ATTRIBUTE_ALWAYS_INLINE
+ static void XXH3_accumulate(uint64_t *acc, const uint8_t *input,
+                             const uint8_t *secret, size_t nbStripes) {
+   for (size_t n = 0; n < nbStripes; ++n) {
+     XXH3_accumulate_512(acc, input + n * XXH_STRIPE_LEN,
+                         secret + n * XXH_SECRET_CONSUME_RATE);
+   }
+ }

 static uint64_t XXH3_mix2Accs(const uint64_t *acc, const uint8_t *secret) {
   return XXH3_mul128_fold64(acc[0] ^ endian::read64le(secret),
@@ -375,21 +529,20 @@ static uint64_t XXH3_hashLong_64b(const uint8_t *input, size_t len,
       PRIME64_4, PRIME32_2, PRIME64_5, PRIME32_1,
   };
   for (size_t n = 0; n < nb_blocks; ++n) {
-     XXH3_accumulate_scalar(acc, input + n * block_len, secret,
-                            nbStripesPerBlock);
+     XXH3_accumulate(acc, input + n * block_len, secret, nbStripesPerBlock);
     XXH3_scrambleAcc(acc, secret + secretSize - XXH_STRIPE_LEN);
   }

   /* last partial block */
   const size_t nbStripes = (len - 1 - (block_len * nb_blocks)) / XXH_STRIPE_LEN;
   assert(nbStripes <= secretSize / XXH_SECRET_CONSUME_RATE);
-   XXH3_accumulate_scalar(acc, input + nb_blocks * block_len, secret, nbStripes);
+   XXH3_accumulate(acc, input + nb_blocks * block_len, secret, nbStripes);

   /* last stripe */
   constexpr size_t XXH_SECRET_LASTACC_START = 7;
-   XXH3_accumulate_512_scalar(acc, input + len - XXH_STRIPE_LEN,
-                              secret + secretSize - XXH_STRIPE_LEN -
-                              XXH_SECRET_LASTACC_START);
+   XXH3_accumulate_512(acc, input + len - XXH_STRIPE_LEN,
+                       secret + secretSize - XXH_STRIPE_LEN -
+                       XXH_SECRET_LASTACC_START);

   /* converge into final hash */
   constexpr size_t XXH_SECRET_MERGEACCS_START = 11;
@@ -840,21 +993,20 @@ XXH3_hashLong_128b(const uint8_t *input, size_t len, const uint8_t *secret,
   };

   for (size_t n = 0; n < nb_blocks; ++n) {
-     XXH3_accumulate_scalar(acc, input + n * block_len, secret,
-                            nbStripesPerBlock);
+     XXH3_accumulate(acc, input + n * block_len, secret, nbStripesPerBlock);
     XXH3_scrambleAcc(acc, secret + secretSize - XXH_STRIPE_LEN);
   }

   /* last partial block */
   const size_t nbStripes = (len - 1 - (block_len * nb_blocks)) / XXH_STRIPE_LEN;
   assert(nbStripes <= secretSize / XXH_SECRET_CONSUME_RATE);
-   XXH3_accumulate_scalar(acc, input + nb_blocks * block_len, secret, nbStripes);
+   XXH3_accumulate(acc, input + nb_blocks * block_len, secret, nbStripes);

   /* last stripe */
   constexpr size_t XXH_SECRET_LASTACC_START = 7;
-   XXH3_accumulate_512_scalar(acc, input + len - XXH_STRIPE_LEN,
-                              secret + secretSize - XXH_STRIPE_LEN -
-                              XXH_SECRET_LASTACC_START);
+   XXH3_accumulate_512(acc, input + len - XXH_STRIPE_LEN,
+                       secret + secretSize - XXH_STRIPE_LEN -
+                       XXH_SECRET_LASTACC_START);

   /* converge into final hash */
   static_assert(sizeof(acc) == 64);