@@ -129,7 +129,8 @@ LIBC_INLINE __m128i bytewise_reverse(__m128i value) {
                                               8, 9, 10, 11, 12, 13, 14, 15));
 }
 LIBC_INLINE uint16_t big_endian_cmp_mask(__m128i max, __m128i value) {
-  return static_cast<uint16_t>(_mm_movemask_epi8(bytewise_reverse(_mm_cmpeq_epi8(max, value))));
+  return static_cast<uint16_t>(
+      _mm_movemask_epi8(bytewise_reverse(_mm_cmpeq_epi8(max, value))));
 }
 template <> LIBC_INLINE bool eq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
   const auto a = load<__m128i>(p1, offset);
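
Note (illustrative, not part of the patch): `big_endian_cmp_mask` builds an equality mask whose most significant bit corresponds to the first byte in memory, so the ordering of two blocks can be decided by comparing two scalars. The sketch below replays that idea at byte granularity on 4-byte blocks; `be_eq_mask` and `cmp_blocks_sketch` are invented names, not libc helpers.

#include <cassert>
#include <cstdint>

// Hypothetical helper mirroring big_endian_cmp_mask at byte granularity:
// bit 3 corresponds to the first byte in memory, bit 0 to the last one.
static uint32_t be_eq_mask(const unsigned char *max, const unsigned char *v) {
  uint32_t mask = 0;
  for (int i = 0; i < 4; ++i)
    mask |= uint32_t(max[i] == v[i]) << (3 - i);
  return mask;
}

// Orders two 4-byte blocks the way memcmp would, using only the two masks.
static int cmp_blocks_sketch(const unsigned char *a, const unsigned char *b) {
  unsigned char vmax[4];
  for (int i = 0; i < 4; ++i)
    vmax[i] = a[i] > b[i] ? a[i] : b[i]; // scalar 'bytewise_max'
  const uint32_t le = be_eq_mask(vmax, b); // set where b[i] >= a[i]
  const uint32_t ge = be_eq_mask(vmax, a); // set where a[i] >= b[i]
  return ge == le ? 0 : (ge > le ? 1 : -1);
}

int main() {
  const unsigned char x[4] = {1, 2, 3, 4};
  const unsigned char y[4] = {1, 2, 9, 0};
  assert(cmp_blocks_sketch(x, y) < 0); // first difference: 3 < 9
  return 0;
}
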
@@ -180,15 +181,41 @@ template <> LIBC_INLINE uint32_t neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
 LIBC_INLINE __m256i bytewise_max(__m256i a, __m256i b) {
   return _mm256_max_epu8(a, b);
 }
-LIBC_INLINE __m256i bytewise_reverse(__m256i value) {
-  return _mm256_shuffle_epi8(value,
-                             _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,         //
-                                             8, 9, 10, 11, 12, 13, 14, 15,   //
-                                             16, 17, 18, 19, 20, 21, 22, 23, //
-                                             24, 25, 26, 27, 28, 29, 30, 31));
-}
 LIBC_INLINE uint32_t big_endian_cmp_mask(__m256i max, __m256i value) {
-  return _mm256_movemask_epi8(bytewise_reverse(_mm256_cmpeq_epi8(max, value)));
+  // Bytewise comparison of 'max' and 'value'.
+  const __m256i little_endian_byte_mask = _mm256_cmpeq_epi8(max, value);
+  // Because x86 is little endian, bytes in the vector must be reversed before
+  // using movemask.
+#if defined(__AVX512VBMI__) && defined(__AVX512VL__)
+  // When AVX512VBMI is available we can completely reverse the vector through
+  // VPERMB __m256i _mm256_permutexvar_epi8( __m256i idx, __m256i a);
+  const __m256i big_endian_byte_mask =
+      _mm256_permutexvar_epi8(_mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,         //
+                                              8, 9, 10, 11, 12, 13, 14, 15,   //
+                                              16, 17, 18, 19, 20, 21, 22, 23, //
+                                              24, 25, 26, 27, 28, 29, 30, 31),
+                              little_endian_byte_mask);
+  // And turn the byte vector mask into a 'uint32_t' for direct scalar
+  // comparison.
+  return _mm256_movemask_epi8(big_endian_byte_mask);
+#else
+  // We can't byte-reverse '__m256i' in a single instruction with AVX2.
+  // '_mm256_shuffle_epi8' can only shuffle within each 16-byte lane,
+  // leading to:
+  // ymm = ymm[15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+  //           31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16]
+  // So we first shuffle each 16-byte lane, leading to a half-reversed vector mask.
+  const __m256i half_reversed = _mm256_shuffle_epi8(
+      little_endian_byte_mask, _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,       //
+                                               8, 9, 10, 11, 12, 13, 14, 15, //
+                                               0, 1, 2, 3, 4, 5, 6, 7,       //
+                                               8, 9, 10, 11, 12, 13, 14, 15));
+  // Then we turn the vector into a 'uint32_t'.
+  const uint32_t half_reversed_scalar = _mm256_movemask_epi8(half_reversed);
+  // And swap the lower and upper parts. This is optimized into a single `rorx`
+  // instruction.
+  return (half_reversed_scalar << 16) | (half_reversed_scalar >> 16);
+#endif
 }
 template <>
 LIBC_INLINE MemcmpReturnType cmp_neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
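
Note (illustrative, not part of the patch): at the movemask level, the in-lane byte shuffle above amounts to reversing the bits of each 16-bit half of the mask, and the final `(x << 16) | (x >> 16)` rotation completes a full 32-bit reversal. The standalone check below verifies that equivalence on plain scalars; every name in it is invented for the sketch.

#include <cassert>
#include <cstdint>

// Reference: reverse all 32 bits (what a full 32-byte vector reversal does to
// the movemask).
static uint32_t reverse_bits32(uint32_t x) {
  uint32_t r = 0;
  for (int i = 0; i < 32; ++i)
    r |= ((x >> i) & 1u) << (31 - i);
  return r;
}

// Reverse the 16 bits of one half (what the in-lane byte shuffle does to the
// corresponding half of the movemask).
static uint32_t reverse_bits16(uint32_t x) {
  uint32_t r = 0;
  for (int i = 0; i < 16; ++i)
    r |= ((x >> i) & 1u) << (15 - i);
  return r;
}

int main() {
  const uint32_t mask = 0x00F0F00Fu; // arbitrary little-endian movemask
  const uint32_t half_reversed =
      (reverse_bits16(mask >> 16) << 16) | reverse_bits16(mask & 0xFFFFu);
  // Rotating by 16 (a single 'rorx' on x86) finishes the full reversal.
  const uint32_t rotated = (half_reversed << 16) | (half_reversed >> 16);
  assert(rotated == reverse_bits32(mask));
  return 0;
}
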
@@ -198,7 +225,7 @@ LIBC_INLINE MemcmpReturnType cmp_neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
   const auto le = big_endian_cmp_mask(vmax, b);
   const auto ge = big_endian_cmp_mask(vmax, a);
   static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint32_t>);
-  return cmp_uint32_t(ge, le);
+  return cmp_neq_uint64_t(ge, le);
 }
 #endif // __AVX2__
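
Note (illustrative, not part of the patch): the hunk above routes the two 32-bit masks into `cmp_neq_uint64_t`. The snippet below is only a hedged guess at the idea behind such a helper, not its real implementation (which lives elsewhere in memory_utils): with big-endian equality masks taken against the bytewise max, the larger mask belongs to the block that wins at the first differing byte.

#include <cstdint>

// Hypothetical stand-in for the 64-bit scalar comparison the masks feed into;
// only meaningful when the two blocks are known to differ.
static int cmp_masks_sketch(uint64_t ge, uint64_t le) {
  // 'ge' is set where p1's byte equals the bytewise max (p1[i] >= p2[i]),
  // 'le' where p2's byte does. Bits are big-endian, so the first differing
  // byte decides the unsigned comparison, exactly like memcmp.
  return ge < le ? -1 : (ge > le ? 1 : 0);
}

int main() { return cmp_masks_sketch(0b1011, 0b1111) < 0 ? 0 : 1; }
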
@@ -210,19 +237,48 @@ template <> struct cmp_is_expensive<__m512i> : cpp::true_type {};
 LIBC_INLINE __m512i bytewise_max(__m512i a, __m512i b) {
   return _mm512_max_epu8(a, b);
 }
-LIBC_INLINE __m512i bytewise_reverse(__m512i value) {
-  return _mm512_shuffle_epi8(value,
-                             _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,         //
-                                             8, 9, 10, 11, 12, 13, 14, 15,   //
-                                             16, 17, 18, 19, 20, 21, 22, 23, //
-                                             24, 25, 26, 27, 28, 29, 30, 31, //
-                                             32, 33, 34, 35, 36, 37, 38, 39, //
-                                             40, 41, 42, 43, 44, 45, 46, 47, //
-                                             48, 49, 50, 51, 52, 53, 54, 55, //
-                                             56, 57, 58, 59, 60, 61, 62, 63));
-}
 LIBC_INLINE uint64_t big_endian_cmp_mask(__m512i max, __m512i value) {
-  return _mm512_cmpeq_epi8_mask(bytewise_reverse(max), bytewise_reverse(value));
+  // The AVX512VBMI version is disabled due to bad codegen.
+  // https://github.com/llvm/llvm-project/issues/77459
+  // https://github.com/llvm/llvm-project/pull/77081
+  // TODO: Re-enable once the minimum clang version includes the fix.
+#if false && defined(__AVX512VBMI__)
+  // When AVX512VBMI is available we can completely reverse the vector through
+  // VPERMB __m512i _mm512_permutexvar_epi8( __m512i idx, __m512i a);
+  const auto indices = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,          //
+                                       8, 9, 10, 11, 12, 13, 14, 15,    //
+                                       16, 17, 18, 19, 20, 21, 22, 23,  //
+                                       24, 25, 26, 27, 28, 29, 30, 31,  //
+                                       32, 33, 34, 35, 36, 37, 38, 39,  //
+                                       40, 41, 42, 43, 44, 45, 46, 47,  //
+                                       48, 49, 50, 51, 52, 53, 54, 55,  //
+                                       56, 57, 58, 59, 60, 61, 62, 63);
+  // Then we compute the mask for equal bytes.
+  return _mm512_cmpeq_epi8_mask(_mm512_permutexvar_epi8(indices, max), //
+                                _mm512_permutexvar_epi8(indices, value));
+#else
+  // We can't byte-reverse '__m512i' in a single instruction with __AVX512BW__.
+  // '_mm512_shuffle_epi8' can only shuffle within each 16-byte lane.
+  // So we only reverse groups of 8 bytes; these groups are necessarily within
+  // a 16-byte lane.
+  // zmm = | 16 bytes  | 16 bytes  | 16 bytes  | 16 bytes  |
+  // zmm = | <8> | <8> | <8> | <8> | <8> | <8> | <8> | <8> |
+  const __m512i indices = _mm512_set_epi8(56, 57, 58, 59, 60, 61, 62, 63, //
+                                          48, 49, 50, 51, 52, 53, 54, 55, //
+                                          40, 41, 42, 43, 44, 45, 46, 47, //
+                                          32, 33, 34, 35, 36, 37, 38, 39, //
+                                          24, 25, 26, 27, 28, 29, 30, 31, //
+                                          16, 17, 18, 19, 20, 21, 22, 23, //
+                                          8, 9, 10, 11, 12, 13, 14, 15,   //
+                                          0, 1, 2, 3, 4, 5, 6, 7);
+  // Then we compute the mask for equal bytes. In this mask the bits of each
+  // byte are already reversed, but the bytes themselves still need to be
+  // reversed; this is done with a bswap instruction.
+  return __builtin_bswap64(
+      _mm512_cmpeq_epi8_mask(_mm512_shuffle_epi8(max, indices), //
+                             _mm512_shuffle_epi8(value, indices)));
+
+#endif
 }
 template <> LIBC_INLINE bool eq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
   const auto a = load<__m512i>(p1, offset);
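
Note (illustrative, not part of the patch, assumes GCC/Clang for `__builtin_bswap64`): after comparing the group-reversed vectors, each byte of the 64-bit mask already has its bits in reversed order, and the trailing `bswap` reverses the bytes themselves. The standalone check below verifies that the two steps compose into a full 64-bit reversal; all names are invented for the sketch.

#include <cassert>
#include <cstdint>

// Reverse the bit order inside each byte of a 64-bit mask (what comparing the
// group-reversed vectors yields, relative to a straight compare).
static uint64_t reverse_bits_in_each_byte(uint64_t x) {
  uint64_t r = 0;
  for (int byte = 0; byte < 8; ++byte)
    for (int bit = 0; bit < 8; ++bit)
      if (x & (uint64_t(1) << (8 * byte + bit)))
        r |= uint64_t(1) << (8 * byte + (7 - bit));
  return r;
}

// Reference: reverse all 64 bits (a full 64-byte vector reversal at mask level).
static uint64_t reverse_bits64(uint64_t x) {
  uint64_t r = 0;
  for (int i = 0; i < 64; ++i)
    if (x & (uint64_t(1) << i))
      r |= uint64_t(1) << (63 - i);
  return r;
}

int main() {
  const uint64_t mask = 0x0123456789ABCDEFull; // arbitrary equality mask
  // bswap64 swaps the bytes, completing the reversal started inside each byte.
  assert(__builtin_bswap64(reverse_bits_in_each_byte(mask)) ==
         reverse_bits64(mask));
  return 0;
}
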