@@ -131,7 +131,7 @@ unsafe fn foo_avx2() {
131
131
#[cfg(target_arch = "x86_64")]
132
132
use std::arch::x86_64::_mm256_add_epi64;
133
133
134
- _mm256_add_epi64(...);
134
+ unsafe { _mm256_add_epi64(...); }
135
135
}
136
136
```
137
137
@@ -287,47 +287,49 @@ unsafe fn hex_encode_sse41(mut src: &[u8], dst: &mut [u8]) {
287
287
#[cfg(target_arch = "x86_64")]
288
288
use std::arch::x86_64::*;
289
289
290
- let ascii_zero = _mm_set1_epi8(b'0' as i8);
291
- let nines = _mm_set1_epi8(9);
292
- let ascii_a = _mm_set1_epi8((b'a' - 9 - 1) as i8);
293
- let and4bits = _mm_set1_epi8(0xf);
294
-
295
- let mut i = 0_isize;
296
- while src.len() >= 16 {
297
- let invec = _mm_loadu_si128(src.as_ptr() as *const _);
298
-
299
- let masked1 = _mm_and_si128(invec, and4bits);
300
- let masked2 = _mm_and_si128(_mm_srli_epi64(invec, 4), and4bits);
301
-
302
- // return 0xff corresponding to the elements > 9, or 0x00 otherwise
303
- let cmpmask1 = _mm_cmpgt_epi8(masked1, nines);
304
- let cmpmask2 = _mm_cmpgt_epi8(masked2, nines);
305
-
306
- // add '0' or the offset depending on the masks
307
- let masked1 = _mm_add_epi8(
308
- masked1,
309
- _mm_blendv_epi8(ascii_zero, ascii_a, cmpmask1),
310
- );
311
- let masked2 = _mm_add_epi8(
312
- masked2,
313
- _mm_blendv_epi8(ascii_zero, ascii_a, cmpmask2),
314
- );
315
-
316
- // interleave masked1 and masked2 bytes
317
- let res1 = _mm_unpacklo_epi8(masked2, masked1);
318
- let res2 = _mm_unpackhi_epi8(masked2, masked1);
319
-
320
- _mm_storeu_si128(dst.as_mut_ptr().offset(i * 2) as *mut _, res1);
321
- _mm_storeu_si128(
322
- dst.as_mut_ptr().offset(i * 2 + 16) as *mut _,
323
- res2,
324
- );
325
- src = &src[16..];
326
- i += 16;
327
- }
290
+ unsafe {
291
+ let ascii_zero = _mm_set1_epi8(b'0' as i8);
292
+ let nines = _mm_set1_epi8(9);
293
+ let ascii_a = _mm_set1_epi8((b'a' - 9 - 1) as i8);
294
+ let and4bits = _mm_set1_epi8(0xf);
295
+
296
+ let mut i = 0_isize;
297
+ while src.len() >= 16 {
298
+ let invec = _mm_loadu_si128(src.as_ptr() as *const _);
299
+
300
+ let masked1 = _mm_and_si128(invec, and4bits);
301
+ let masked2 = _mm_and_si128(_mm_srli_epi64(invec, 4), and4bits);
302
+
303
+ // return 0xff corresponding to the elements > 9, or 0x00 otherwise
304
+ let cmpmask1 = _mm_cmpgt_epi8(masked1, nines);
305
+ let cmpmask2 = _mm_cmpgt_epi8(masked2, nines);
306
+
307
+ // add '0' or the offset depending on the masks
308
+ let masked1 = _mm_add_epi8(
309
+ masked1,
310
+ _mm_blendv_epi8(ascii_zero, ascii_a, cmpmask1),
311
+ );
312
+ let masked2 = _mm_add_epi8(
313
+ masked2,
314
+ _mm_blendv_epi8(ascii_zero, ascii_a, cmpmask2),
315
+ );
316
+
317
+ // interleave masked1 and masked2 bytes
318
+ let res1 = _mm_unpacklo_epi8(masked2, masked1);
319
+ let res2 = _mm_unpackhi_epi8(masked2, masked1);
320
+
321
+ _mm_storeu_si128(dst.as_mut_ptr().offset(i * 2) as *mut _, res1);
322
+ _mm_storeu_si128(
323
+ dst.as_mut_ptr().offset(i * 2 + 16) as *mut _,
324
+ res2,
325
+ );
326
+ src = &src[16..];
327
+ i += 16;
328
+ }
328
329
329
- let i = i as usize;
330
- hex_encode_fallback(src, &mut dst[i * 2..]);
330
+ let i = i as usize;
331
+ hex_encode_fallback(src, &mut dst[i * 2..]);
332
+ }
331
333
}
332
334
333
335
fn hex_encode_fallback(src: &[u8], dst: &mut [u8]) {
0 commit comments