[libc] Fix buggy AVX2 / AVX512 memcmp
#77081
Conversation
Fixes #77080.
@llvm/pr-subscribers-libc
Author: Guillaume Chatelet (gchatelet)
Changes: Fixes #77080.
Full diff: https://github.com/llvm/llvm-project/pull/77081.diff
2 Files Affected:
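For context, the strings from issue #77080 first differ at byte index 3 ('6' vs '5'), so a correct memcmp must return a positive value for this lhs/rhs order. A minimal sanity check against the system memcmp (editor's sketch, not part of the patch):
// Editor's sketch: checks the expected sign of the comparison for the
// inputs from issue #77080 using the system memcmp.
#include <cassert>
#include <cstring>

int main() {
  constexpr char lhs[35] = "1.069cd68bbe76eb2143a3284d27ebe220";
  constexpr char rhs[35] = "1.0500185b5d966a544e2d0fa40701b0f3";
  // The first mismatch is at index 3 ('6' > '5'), so the result must be > 0.
  assert(std::memcmp(lhs, rhs, 34) > 0);
  return 0;
}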
diff --git a/libc/src/string/memory_utils/op_x86.h b/libc/src/string/memory_utils/op_x86.h
index 1a20659c178cd1..23e6b897997e90 100644
--- a/libc/src/string/memory_utils/op_x86.h
+++ b/libc/src/string/memory_utils/op_x86.h
@@ -129,7 +129,8 @@ LIBC_INLINE __m128i bytewise_reverse(__m128i value) {
8, 9, 10, 11, 12, 13, 14, 15));
}
LIBC_INLINE uint16_t big_endian_cmp_mask(__m128i max, __m128i value) {
- return static_cast<uint16_t>(_mm_movemask_epi8(bytewise_reverse(_mm_cmpeq_epi8(max, value))));
+ return static_cast<uint16_t>(
+ _mm_movemask_epi8(bytewise_reverse(_mm_cmpeq_epi8(max, value))));
}
template <> LIBC_INLINE bool eq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
const auto a = load<__m128i>(p1, offset);
@@ -181,11 +182,31 @@ LIBC_INLINE __m256i bytewise_max(__m256i a, __m256i b) {
return _mm256_max_epu8(a, b);
}
LIBC_INLINE __m256i bytewise_reverse(__m256i value) {
- return _mm256_shuffle_epi8(value,
- _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, //
- 8, 9, 10, 11, 12, 13, 14, 15, //
- 16, 17, 18, 19, 20, 21, 22, 23, //
- 24, 25, 26, 27, 28, 29, 30, 31));
+ const __m256i indices = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, //
+ 8, 9, 10, 11, 12, 13, 14, 15, //
+ 16, 17, 18, 19, 20, 21, 22, 23, //
+ 24, 25, 26, 27, 28, 29, 30, 31);
+#if defined(__AVX512VBMI__) && defined(__AVX512VL__)
+ // AVX512 allows full __m256i byte permutation.
+ // ymm = ymm[31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,
+ // 15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
+ return _mm256_permutexvar_epi8(indices, value);
+#else
+ // We can't byte-reverse __m256i in a single instruction with AVX2.
+ // '_mm256_shuffle_epi8' can only shuffle within each xmm lane
+ // leading to:
+ // ymm = ymm[15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ // 31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16]
+ const __m256i tmp = _mm256_shuffle_epi8(value, indices);
+ // Then we shuffle across lanes using 64-bit values.
+ // ymm = ymm[2,3,0,1]
+ // Leading to a fully reversed vector
+ // ymm = ymm[31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,
+ // 15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
+ // The immediate encodes the 64 bit word indices : 1, 0, 3, 2.
+ // Each index is encoded with 2 bits : 0b01'00'11'10.
+ return _mm256_permute4x64_epi64(tmp, 0b01'00'11'10);
+#endif
}
LIBC_INLINE uint32_t big_endian_cmp_mask(__m256i max, __m256i value) {
return _mm256_movemask_epi8(bytewise_reverse(_mm256_cmpeq_epi8(max, value)));
diff --git a/libc/test/src/string/memcmp_test.cpp b/libc/test/src/string/memcmp_test.cpp
index 03a0ac1c0ba655..a69257704a64a2 100644
--- a/libc/test/src/string/memcmp_test.cpp
+++ b/libc/test/src/string/memcmp_test.cpp
@@ -37,6 +37,13 @@ TEST(LlvmLibcMemcmpTest, LhsAfterRhsLexically) {
EXPECT_GT(LIBC_NAMESPACE::memcmp(lhs, rhs, 2), 0);
}
+TEST(LlvmLibcMemcmpTest, Issue77080) {
+ // https://github.com/llvm/llvm-project/issues/77080
+ constexpr char lhs[35] = "1.069cd68bbe76eb2143a3284d27ebe220";
+ constexpr char rhs[35] = "1.0500185b5d966a544e2d0fa40701b0f3";
+ EXPECT_GT(LIBC_NAMESPACE::memcmp(lhs, rhs, 34), 0);
+}
+
// Adapt CheckMemcmp signature to memcmp.
static inline int Adaptor(cpp::span<char> p1, cpp::span<char> p2, size_t size) {
return LIBC_NAMESPACE::memcmp(p1.begin(), p2.begin(), size);
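For reference, a standalone sketch of the two-step AVX2 reverse described in the #else branch above (editor's illustration, not part of the patch); compiled with -mavx2 it prints the bytes 31 down to 0:
// Editor's sketch: _mm256_shuffle_epi8 only reverses bytes within each
// 128-bit lane, so a cross-lane 64-bit permute is needed to finish the
// full 32-byte reverse.
#include <immintrin.h>
#include <cstdio>

int main() {
  alignas(32) unsigned char bytes[32];
  for (int i = 0; i < 32; ++i)
    bytes[i] = static_cast<unsigned char>(i);
  const __m256i value =
      _mm256_load_si256(reinterpret_cast<const __m256i *>(bytes));
  const __m256i indices = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,        //
                                          8, 9, 10, 11, 12, 13, 14, 15,  //
                                          16, 17, 18, 19, 20, 21, 22, 23, //
                                          24, 25, 26, 27, 28, 29, 30, 31);
  // Step 1: per-lane byte reverse, giving [15..0, 31..16].
  const __m256i per_lane = _mm256_shuffle_epi8(value, indices);
  // Step 2: swap the two 128-bit lanes (64-bit word order 1, 0, 3, 2).
  const __m256i reversed = _mm256_permute4x64_epi64(per_lane, 0b01'00'11'10);
  alignas(32) unsigned char out[32];
  _mm256_store_si256(reinterpret_cast<__m256i *>(out), reversed);
  for (int i = 0; i < 32; ++i)
    std::printf("%d ", out[i]); // 31 30 29 ... 1 0
  return 0;
}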
LGTM with nit
// 15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
// The immediate encodes the 64 bit word indices : 1, 0, 3, 2.
// Each index is encoded with 2 bits : 0b01'00'11'10.
return _mm256_permute4x64_epi64(tmp, 0b01'00'11'10);
Consider doing this after calling _mm256_movemask_epi8. Then a rorx (1 cycle) will be sufficient. E.g.
#include <immintrin.h>
#include <stdint.h>

// Rotate the 32-bit mask by 16 to swap its two lane halves.
static inline uint32_t SwapWords(uint32_t x) {
  return (x << 16) | (x >> 16);
}

int cmp_neq(__m256i a, __m256i b) {
  __m256i vmax = _mm256_max_epu8(a, b);
  __m256i a_le_b = _mm256_cmpeq_epi8(vmax, b);
  __m256i a_ge_b = _mm256_cmpeq_epi8(vmax, a);
  // Byte-reverse within each 128-bit lane only; the cross-lane swap is done
  // on the scalar masks via SwapWords.
  const __m256i indices = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,       //
                                          8, 9, 10, 11, 12, 13, 14, 15, //
                                          0, 1, 2, 3, 4, 5, 6, 7,       //
                                          8, 9, 10, 11, 12, 13, 14, 15);
  uint32_t le =
      SwapWords(_mm256_movemask_epi8(_mm256_shuffle_epi8(a_le_b, indices)));
  uint32_t ge =
      SwapWords(_mm256_movemask_epi8(_mm256_shuffle_epi8(a_ge_b, indices)));
  return le < ge ? 5 : -5;
}
https://godbolt.org/z/6bhjE35j3
| Version | llvm-mca | skylake latency | znver3 latency | skylake latency without mask loading | znver3 latency without mask loading |
|---|---|---|---|---|---|
| With bug | link | 13 | 13 | 8 | 7 |
| #else version above | link | 17 | 19 | 12 | 13 |
| #if version above | link | 15 | 17 | 10 | 10 |
| shuffle-movemask-rorx version | link | 14 | 14 | 9 | 8 |
I have tested the shuffle-movemask-rorx version: https://godbolt.org/z/as5Kqjof5. The tests fail without the rorx fix: https://godbolt.org/z/ccndGYM6z.
What do you think?
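For illustration, here is how this suggestion could be folded into the __m256i big_endian_cmp_mask helper from op_x86.h (editor's sketch reusing the existing LIBC_INLINE convention and immintrin.h environment; not the code that was committed):
LIBC_INLINE uint32_t big_endian_cmp_mask(__m256i max, __m256i value) {
  // Reverse bytes within each 128-bit lane only; the cross-lane swap is
  // deferred to the scalar mask below.
  const __m256i indices = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,       //
                                          8, 9, 10, 11, 12, 13, 14, 15, //
                                          0, 1, 2, 3, 4, 5, 6, 7,       //
                                          8, 9, 10, 11, 12, 13, 14, 15);
  const __m256i eq = _mm256_cmpeq_epi8(max, value);
  const uint32_t mask = static_cast<uint32_t>(
      _mm256_movemask_epi8(_mm256_shuffle_epi8(eq, indices)));
  // Rotate by 16 to swap the two lane halves of the 32-bit mask; compilers
  // typically lower this to a single rorx.
  return (mask << 16) | (mask >> 16);
}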
FWIW, I believe __m512i bytewise_reverse(__m512i value) is also buggy at the moment!
https://godbolt.org/z/e8sfvP7h4
https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_shuffle_epi8&ig_expand=6009
index[5:0] := b[i+3:i] + (j & 0x30)
_mm512_shuffle_epi8 only considers the 4 lower bits of each byte of b and then adds bits 4 and 5 from j, which is basically the 128-bit lane index. So it is implemented the same way as _mm256_shuffle_epi8. :(
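When AVX512VBMI is available, the full 64-byte reverse can instead be done with a single cross-lane byte permute. A hedged sketch of what a correct __m512i bytewise_reverse could look like (editor's illustration assuming the op_x86.h environment, i.e. immintrin.h and LIBC_INLINE; not the existing code):
// Editor's sketch: _mm512_permutexvar_epi8 (AVX512VBMI) permutes bytes across
// 128-bit lanes, unlike _mm512_shuffle_epi8.
LIBC_INLINE __m512i bytewise_reverse(__m512i value) {
  const __m512i indices = _mm512_set_epi8(
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,           //
      16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, //
      32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, //
      48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
  // The first argument is the index vector, the second the data to permute.
  return _mm512_permutexvar_epi8(indices, value);
}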
We should probably rerun the benchmarks to get some visibility on the impact on performance.
48, 49, 50, 51, 52, 53, 54, 55, //
56, 57, 58, 59, 60, 61, 62, 63);
// Then we compute the mask for equal bytes.
return _mm512_cmpeq_epi8_mask(_mm512_permutexvar_epi8(indices, max), //
It appears that this code is not well optimized by the compiler, which uses GPRs instead of vector registers: #77459.
The vector code is 21 cycles whereas the GPR version is 25 according to llvm-mca.
I'll benchmark both of them and report.
I've run a couple of benchmarks. All numbers are throughput in GB/s; higher is better. They were run on a Sapphire Rapids machine.
Same table as percentage of base.
This is a clear indication that the