
[libc] Fix buggy AVX2 / AVX512 memcmp #77081


Merged — 7 commits merged into llvm:main on Jan 11, 2024

Conversation

@gchatelet (Contributor) commented on Jan 5, 2024

Fixes #77080.

@llvmbot added the libc label on Jan 5, 2024

@llvmbot (Member) commented on Jan 5, 2024

@llvm/pr-subscribers-libc

Author: Guillaume Chatelet (gchatelet)

Changes

Fixes #77080.


Full diff: https://github.com/llvm/llvm-project/pull/77081.diff

2 Files Affected:

  • (modified) libc/src/string/memory_utils/op_x86.h (+27-6)
  • (modified) libc/test/src/string/memcmp_test.cpp (+7)
diff --git a/libc/src/string/memory_utils/op_x86.h b/libc/src/string/memory_utils/op_x86.h
index 1a20659c178cd1..23e6b897997e90 100644
--- a/libc/src/string/memory_utils/op_x86.h
+++ b/libc/src/string/memory_utils/op_x86.h
@@ -129,7 +129,8 @@ LIBC_INLINE __m128i bytewise_reverse(__m128i value) {
                                               8, 9, 10, 11, 12, 13, 14, 15));
 }
 LIBC_INLINE uint16_t big_endian_cmp_mask(__m128i max, __m128i value) {
-  return static_cast<uint16_t>(_mm_movemask_epi8(bytewise_reverse(_mm_cmpeq_epi8(max, value))));
+  return static_cast<uint16_t>(
+      _mm_movemask_epi8(bytewise_reverse(_mm_cmpeq_epi8(max, value))));
 }
 template <> LIBC_INLINE bool eq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
   const auto a = load<__m128i>(p1, offset);
@@ -181,11 +182,31 @@ LIBC_INLINE __m256i bytewise_max(__m256i a, __m256i b) {
   return _mm256_max_epu8(a, b);
 }
 LIBC_INLINE __m256i bytewise_reverse(__m256i value) {
-  return _mm256_shuffle_epi8(value,
-                             _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,         //
-                                             8, 9, 10, 11, 12, 13, 14, 15,   //
-                                             16, 17, 18, 19, 20, 21, 22, 23, //
-                                             24, 25, 26, 27, 28, 29, 30, 31));
+  const __m256i indices = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,         //
+                                          8, 9, 10, 11, 12, 13, 14, 15,   //
+                                          16, 17, 18, 19, 20, 21, 22, 23, //
+                                          24, 25, 26, 27, 28, 29, 30, 31);
+#if defined(__AVX512VBMI__) && defined(__AVX512VL__)
+  // AVX512 allows full __m256i byte permutation.
+  // ymm = ymm[31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,
+  //           15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
+  return _mm256_permutexvar_epi8(indices, value);
+#else
+  // We can't byte-reverse __m256i in a single instruction with AVX2.
+  // '_mm256_shuffle_epi8' can only shuffle within each xmm lane,
+  // leading to:
+  // ymm = ymm[15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+  //           31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16]
+  const __m256i tmp = _mm256_shuffle_epi8(value, indices);
+  // Then we shuffle across lanes using 64-bit words.
+  // ymm = ymm[2,3,0,1]
+  // Leading to a fully reversed vector:
+  // ymm = ymm[31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,
+  //           15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
+  // The immediate encodes the 64-bit word indices  :    1, 0, 3, 2.
+  // Each index is encoded with 2 bits              : 0b01'00'11'10.
+  return _mm256_permute4x64_epi64(tmp, 0b01'00'11'10);
+#endif
 }
 LIBC_INLINE uint32_t big_endian_cmp_mask(__m256i max, __m256i value) {
   return _mm256_movemask_epi8(bytewise_reverse(_mm256_cmpeq_epi8(max, value)));
diff --git a/libc/test/src/string/memcmp_test.cpp b/libc/test/src/string/memcmp_test.cpp
index 03a0ac1c0ba655..a69257704a64a2 100644
--- a/libc/test/src/string/memcmp_test.cpp
+++ b/libc/test/src/string/memcmp_test.cpp
@@ -37,6 +37,13 @@ TEST(LlvmLibcMemcmpTest, LhsAfterRhsLexically) {
   EXPECT_GT(LIBC_NAMESPACE::memcmp(lhs, rhs, 2), 0);
 }
 
+TEST(LlvmLibcMemcmpTest, Issue77080) {
+  // https://github.com/llvm/llvm-project/issues/77080
+  constexpr char lhs[35] = "1.069cd68bbe76eb2143a3284d27ebe220";
+  constexpr char rhs[35] = "1.0500185b5d966a544e2d0fa40701b0f3";
+  EXPECT_GT(LIBC_NAMESPACE::memcmp(lhs, rhs, 34), 0);
+}
+
 // Adapt CheckMemcmp signature to memcmp.
 static inline int Adaptor(cpp::span<char> p1, cpp::span<char> p2, size_t size) {
   return LIBC_NAMESPACE::memcmp(p1.begin(), p2.begin(), size);
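
The two-step AVX2 reverse in op_x86.h above can be sanity-checked in isolation. Below is a minimal standalone harness (hypothetical, not part of the patch; file and function names are illustrative, and it assumes an AVX2-capable host plus a C++14 compiler for the 0b01'00'11'10 literal):

```cpp
// Build: clang++ -O2 -mavx2 reverse_check.cpp && ./a.out
#include <immintrin.h>
#include <cstdint>
#include <cstdio>

// Mirrors the #else path of bytewise_reverse(__m256i) from the diff above.
static __m256i bytewise_reverse_avx2(__m256i value) {
  const __m256i indices = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,         //
                                          8, 9, 10, 11, 12, 13, 14, 15,   //
                                          16, 17, 18, 19, 20, 21, 22, 23, //
                                          24, 25, 26, 27, 28, 29, 30, 31);
  // Step 1: reverse bytes within each 128-bit lane.
  const __m256i tmp = _mm256_shuffle_epi8(value, indices);
  // Step 2: swap the two lanes, treated as 64-bit words (ymm = ymm[2,3,0,1]).
  return _mm256_permute4x64_epi64(tmp, 0b01'00'11'10);
}

int main() {
  uint8_t in[32], out[32];
  for (int i = 0; i < 32; ++i)
    in[i] = static_cast<uint8_t>(i);
  _mm256_storeu_si256(
      reinterpret_cast<__m256i *>(out),
      bytewise_reverse_avx2(
          _mm256_loadu_si256(reinterpret_cast<const __m256i *>(in))));
  // The output must be the full 32-byte reversal, not two 16-byte halves.
  for (int i = 0; i < 32; ++i)
    if (out[i] != in[31 - i]) {
      std::printf("mismatch at byte %d\n", i);
      return 1;
    }
  std::printf("fully reversed OK\n");
  return 0;
}
```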

@gchatelet gchatelet requested a review from legrosbuffle January 5, 2024 11:02
@michaelrj-google (Contributor) left a comment

LGTM with nit

// 15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
// The immediate encodes the 64 bit word indices : 1, 0, 3, 2.
// Each index is encoded with 2 bits : 0b01'00'11'10.
return _mm256_permute4x64_epi64(tmp, 0b01'00'11'10);

Consider doing this after calling _mm256_movemask_epi8. Then a rorx (1 cycle) will be sufficient. E.g.:

static inline uint32_t SwapWords(uint32_t x) {
  return (x << 16) | (x >> 16); // compiles to a single rotate (rorx)
}

int cmp_neq(__m256i a, __m256i b) {
  __m256i vmax = _mm256_max_epu8(a, b);
  __m256i a_le_b = _mm256_cmpeq_epi8(vmax, b); // 0xFF where a[i] <= b[i]
  __m256i a_ge_b = _mm256_cmpeq_epi8(vmax, a); // 0xFF where a[i] >= b[i]
  // Reverses bytes within each 128-bit lane only.
  const __m256i indices = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,         //
                                          8, 9, 10, 11, 12, 13, 14, 15,   //
                                          0, 1, 2, 3, 4, 5, 6, 7,         //
                                          8, 9, 10, 11, 12, 13, 14, 15);
  // SwapWords fixes up the lane order after the lane-local byte reverse.
  uint32_t le =
      SwapWords(_mm256_movemask_epi8(_mm256_shuffle_epi8(a_le_b, indices)));
  uint32_t ge =
      SwapWords(_mm256_movemask_epi8(_mm256_shuffle_epi8(a_ge_b, indices)));
  return le < ge ? 5 : -5;
}

https://godbolt.org/z/6bhjE35j3

| Version | llvm-mca | skylake latency | znver3 latency | skylake latency without mask loading | znver3 latency without mask loading |
| --- | --- | --- | --- | --- | --- |
| With bug | link | 13 | 13 | 8 | 7 |
| #else version above | link | 17 | 19 | 12 | 13 |
| #if version above | link | 15 | 17 | 10 | 10 |
| shuffle-movemask-rorx version | link | 14 | 14 | 9 | 8 |

I have tested the shuffle-movemask-rorx version: https://godbolt.org/z/as5Kqjof5. The tests fail without the rorx fix: https://godbolt.org/z/ccndGYM6z.

What do you think?

@nafi3000 left a comment

FWIW, I believe __m512i bytewise_reverse(__m512i value) is also buggy at the moment!
https://godbolt.org/z/e8sfvP7h4

From the Intel intrinsics guide (https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shuffle_epi8&ig_expand=6009):

index[5:0] := b[i+3:i] + (j & 0x30)

It only considers the 4 lower bits of each byte of b and then adds bits 4 and 5 of j, which is basically the 128-bit lane index. So it is implemented the same way as _mm256_shuffle_epi8. :(
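
To make the lane-local behavior concrete, here is a minimal repro sketch (a hypothetical harness along the lines of the godbolt link above; the file name is illustrative, and it assumes an AVX512BW-capable host):

```cpp
// Build: clang++ -O2 -mavx512bw shuffle_repro.cpp && ./a.out
#include <immintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
  uint8_t src[64], idx[64], dst[64];
  for (int i = 0; i < 64; ++i) {
    src[i] = static_cast<uint8_t>(i);
    idx[i] = static_cast<uint8_t>(63 - i); // naive full-width reverse indices
  }
  const __m512i v = _mm512_loadu_si512(src);
  const __m512i ix = _mm512_loadu_si512(idx);
  _mm512_storeu_si512(dst, _mm512_shuffle_epi8(v, ix));
  // Per the pseudocode above, only b[i+3:i] is kept and the destination
  // byte's own 128-bit lane is added back via (j & 0x30), so the result is
  // four lane-local reversals: dst begins 15 14 13 ... instead of 63 62 61 ...
  for (int i = 0; i < 8; ++i)
    std::printf("%u ", static_cast<unsigned>(dst[i]));
  std::printf("\n");
  return 0;
}
```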

@legrosbuffle (Contributor) left a comment

We should probably rerun the benchmarks to get some visibility on the impact on performance.

48, 49, 50, 51, 52, 53, 54, 55, //
56, 57, 58, 59, 60, 61, 62, 63);
// Then we compute the mask for equal bytes.
return _mm512_cmpeq_epi8_mask(_mm512_permutexvar_epi8(indices, max), //
@gchatelet (Contributor, Author) commented

It appears that this code is not well optimized by the compiler, which uses GPRs instead of vector registers: #77459

According to llvm-mca, the vector code is 21 cycles whereas the GPR version is 25.
I'll benchmark both of them and report.

@gchatelet (Contributor, Author) commented

I've run a couple of benchmarks. All numbers are throughput in GB/s; higher is better. They were run on a Sapphire Rapids machine.

  • pre-nafi base is revision aa28875, just before the buggy version. It represents the previous implementation.
  • buggy is 124efca.
  • GPR is this patch at commit 2ca4747.
  • vector is this patch at commit 2ca4747 with the avx512vbmi version disabled, that is, with these lines removed.
| Benchmark | AVX2 pre-nafi base | AVX2 buggy | AVX2 vector | AVX512 pre-nafi base | AVX512 buggy | AVX512 GPR | AVX512 vector |
| --- | --- | --- | --- | --- | --- | --- | --- |
| BM_Memcmp/0/0 | 2.91849 | 3.11338 | 3.1697 | 2.96577 | 3.57009 | 3.11467 | 3.5289 |
| BM_Memcmp/1/0 | 8.54724 | 8.72347 | 8.91954 | 8.26899 | 9.91166 | 9.12617 | 10.4736 |
| BM_Memcmp/2/0 | 5.57227 | 5.88153 | 5.86237 | 5.63377 | 6.52597 | 5.84524 | 6.46245 |
| BM_Memcmp/3/0 | 4.65409 | 5.31556 | 5.24833 | 4.93053 | 5.87956 | 5.23889 | 5.92499 |
| BM_Memcmp/4/0 | 2.41787 | 2.52415 | 2.50199 | 2.34756 | 2.69541 | 2.62342 | 2.61858 |
| BM_Memcmp/5/0 | 4.95331 | 5.26778 | 5.52597 | 4.73212 | 5.80927 | 5.84406 | 5.89062 |
| BM_Memcmp/6/0 | 8.26034 | 9.09897 | 8.7936 | 8.34453 | 9.72203 | 9.48509 | 9.68149 |
| BM_Memcmp/7/0 | 5.76333 | 5.97761 | 5.91712 | 5.50937 | 6.40736 | 6.36177 | 6.57352 |
| BM_Memcmp/8/0 | 3.12097 | 3.301 | 3.24115 | 3.1782 | 3.6211 | 3.358 | 3.49582 |
| BM_Memcmp/9/0 | 40.1616 | 39.7599 | 39.6047 | 62.2187 | 64.8115 | 62.472 | 63.1284 |

Same table as percentage of base:

| Benchmark | AVX2 pre-nafi base | AVX2 buggy | AVX2 vector | AVX512 pre-nafi base | AVX512 buggy | AVX512 GPR | AVX512 vector |
| --- | --- | --- | --- | --- | --- | --- | --- |
| BM_Memcmp/0/0 | - | 107% | 109% | - | 120% | 105% | 119% |
| BM_Memcmp/1/0 | - | 102% | 104% | - | 120% | 110% | 127% |
| BM_Memcmp/2/0 | - | 106% | 105% | - | 116% | 104% | 115% |
| BM_Memcmp/3/0 | - | 114% | 113% | - | 119% | 106% | 120% |
| BM_Memcmp/4/0 | - | 104% | 103% | - | 115% | 112% | 112% |
| BM_Memcmp/5/0 | - | 106% | 112% | - | 123% | 123% | 124% |
| BM_Memcmp/6/0 | - | 110% | 106% | - | 117% | 114% | 116% |
| BM_Memcmp/7/0 | - | 104% | 103% | - | 116% | 115% | 119% |
| BM_Memcmp/8/0 | - | 106% | 104% | - | 114% | 106% | 110% |
| BM_Memcmp/9/0 | - | 99% | 99% | - | 104% | 100% | 101% |

This is a clear indication that the avx512vbmi version using GPRs is not good. I'll disable it for now, and we can re-enable it when clang bumps to a version with better codegen.

@gchatelet gchatelet changed the title [libc] Fix buggy AVX2 memcmp [libc] Fix buggy AVX2 / AVX512 memcmp Jan 9, 2024
@gchatelet gchatelet merged commit 9ca6e5b into llvm:main Jan 11, 2024
justinfargnoli pushed a commit to justinfargnoli/llvm-project that referenced this pull request Jan 28, 2024

Successfully merging this pull request may close issue #77080: memcmp bug for x86 with AVX2 / AVX512