
Commit 2b97174

Auto merge of #41764 - scottmcm:faster-reverse, r=brson
Make [u8]::reverse() 5x faster

Since LLVM doesn't vectorize the loop for us, do unaligned reads of a larger type and use LLVM's bswap intrinsic to do the reversing of the actual bytes. cfg!-restricted to x86 and x86_64, as I assume it wouldn't help on things like ARMv5.

Also makes [u16]::reverse() a more modest 1.5x faster by loading/storing u32 and swapping the u16s with ROT16.

Thank you ptr::*_unaligned for making this easy :)

Benchmark results (from my i5-2500K):

```text
# Before
test slice::reverse_u8  ... bench: 273,836 ns/iter (+/- 15,592) =  3829 MB/s
test slice::reverse_u16 ... bench: 139,793 ns/iter (+/- 17,748) =  7500 MB/s
test slice::reverse_u32 ... bench:  74,997 ns/iter (+/-  5,130) = 13981 MB/s
test slice::reverse_u64 ... bench:  47,452 ns/iter (+/-  2,213) = 22097 MB/s

# After
test slice::reverse_u8  ... bench:  52,170 ns/iter (+/-  3,962) = 20099 MB/s
test slice::reverse_u16 ... bench:  93,330 ns/iter (+/-  4,412) = 11235 MB/s
test slice::reverse_u32 ... bench:  74,731 ns/iter (+/-  1,425) = 14031 MB/s
test slice::reverse_u64 ... bench:  47,556 ns/iter (+/-  3,025) = 22049 MB/s
```

If you're curious about the assembly, instead of doing this

```
movzx eax, byte ptr [rdi]
movzx ecx, byte ptr [rsi]
mov byte ptr [rdi], cl
mov byte ptr [rsi], al
```

it does this

```
mov rax, qword ptr [rdx]
mov rbx, qword ptr [r11 + rcx - 8]
bswap rbx
mov qword ptr [rdx], rbx
bswap rax
mov qword ptr [r11 + rcx - 8], rax
```
2 parents: 58b33ad + da91361
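
For readers skimming the diffs below, here is a minimal standalone sketch of the idea for `&mut [u8]` (an illustration of the approach, not the library code itself, which lives in src/libcore/slice/mod.rs further down): swap a pointer-sized chunk from each end, byte-swapping each chunk so the bytes inside it are also reversed, then fall back to plain element swaps for whatever is left in the middle.

```rust
// Sketch only: reverse a byte slice by swapping usize-sized chunks from both
// ends, byte-swapping (bswap) each chunk, then finishing element-by-element.
fn reverse_bytes(v: &mut [u8]) {
    let ln = v.len();
    let chunk = std::mem::size_of::<usize>();
    let mut i = 0;

    // Both the front chunk [i, i + chunk) and the mirrored back chunk
    // [ln - i - chunk, ln - i) must fit in their respective halves.
    while i + chunk - 1 < ln / 2 {
        unsafe {
            let pa = v.as_mut_ptr().add(i);
            let pb = v.as_mut_ptr().add(ln - i - chunk);
            let va = std::ptr::read_unaligned(pa as *const usize);
            let vb = std::ptr::read_unaligned(pb as *const usize);
            std::ptr::write_unaligned(pa as *mut usize, vb.swap_bytes());
            std::ptr::write_unaligned(pb as *mut usize, va.swap_bytes());
        }
        i += chunk;
    }

    // Whatever the chunked loop didn't cover is handled by ordinary swaps.
    while i < ln / 2 {
        v.swap(i, ln - i - 1);
        i += 1;
    }
}

fn main() {
    let mut v: Vec<u8> = (0..=20).collect();
    let mut expected = v.clone();
    expected.reverse();
    reverse_bytes(&mut v);
    assert_eq!(v, expected);
    println!("{:?}", v);
}
```

`swap_bytes()` is what lowers to LLVM's `bswap` intrinsic, which is where the speedup comes from.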

File tree: 4 files changed, +86 −0 lines


src/libcollections/benches/lib.rs (+2)

```diff
@@ -10,7 +10,9 @@
 
 #![deny(warnings)]
 
+#![feature(i128_type)]
 #![feature(rand)]
+#![feature(repr_simd)]
 #![feature(sort_unstable)]
 #![feature(test)]
 
```
src/libcollections/benches/slice.rs (+25)

```diff
@@ -290,3 +290,28 @@ sort!(sort_unstable, sort_unstable_large_random, gen_random, 10000);
 sort!(sort_unstable, sort_unstable_large_big_random, gen_big_random, 10000);
 sort!(sort_unstable, sort_unstable_large_strings, gen_strings, 10000);
 sort_expensive!(sort_unstable_by, sort_unstable_large_random_expensive, gen_random, 10000);
+
+macro_rules! reverse {
+    ($name:ident, $ty:ty, $f:expr) => {
+        #[bench]
+        fn $name(b: &mut Bencher) {
+            // odd length and offset by 1 to be as unaligned as possible
+            let n = 0xFFFFF;
+            let mut v: Vec<_> =
+                (0..1+(n / mem::size_of::<$ty>() as u64))
+                .map($f)
+                .collect();
+            b.iter(|| black_box(&mut v[1..]).reverse());
+            b.bytes = n;
+        }
+    }
+}
+
+reverse!(reverse_u8, u8, |x| x as u8);
+reverse!(reverse_u16, u16, |x| x as u16);
+reverse!(reverse_u8x3, [u8;3], |x| [x as u8, (x>>8) as u8, (x>>16) as u8]);
+reverse!(reverse_u32, u32, |x| x as u32);
+reverse!(reverse_u64, u64, |x| x as u64);
+reverse!(reverse_u128, u128, |x| x as u128);
+#[repr(simd)] struct F64x4(f64, f64, f64, f64);
+reverse!(reverse_simd_f64x4, F64x4, |x| { let x = x as f64; F64x4(x,x,x,x) });
```
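
For reference, an invocation like `reverse!(reverse_u16, u16, |x| x as u16)` expands to roughly the following benchmark (a hand-expanded sketch, assuming `Bencher`, `black_box`, and `mem` are in scope as elsewhere in this bench file; it is not actual macro-expansion output):

```rust
#[bench]
fn reverse_u16(b: &mut Bencher) {
    // odd length and offset by 1 to be as unaligned as possible
    let n = 0xFFFFF;
    let mut v: Vec<u16> = (0..1 + (n / mem::size_of::<u16>() as u64))
        .map(|x| x as u16)
        .collect();
    b.iter(|| black_box(&mut v[1..]).reverse());
    b.bytes = n; // lets the harness report throughput in MB/s
}
```

Benchmarking `&mut v[1..]` rather than the whole vector deliberately starts the slice off the allocation's natural alignment, so the unaligned-load path is what gets measured rather than a conveniently aligned best case.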

src/libcollections/tests/slice.rs (+10)

```diff
@@ -379,6 +379,16 @@ fn test_reverse() {
     let mut v3 = Vec::<i32>::new();
     v3.reverse();
     assert!(v3.is_empty());
+
+    // check the 1-byte-types path
+    let mut v = (-50..51i8).collect::<Vec<_>>();
+    v.reverse();
+    assert_eq!(v, (-50..51i8).rev().collect::<Vec<_>>());
+
+    // check the 2-byte-types path
+    let mut v = (-50..51i16).collect::<Vec<_>>();
+    v.reverse();
+    assert_eq!(v, (-50..51i16).rev().collect::<Vec<_>>());
 }
 
 #[test]
```
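
Both new checks use ranges of 101 elements, so on targets that take the new fast path the chunked loop, the element-wise tail, and the untouched middle element are all exercised in one go. A broader sanity check against the naive definition (hypothetical, not part of this commit) might sweep every small length to catch any off-by-one at the chunk boundary:

```rust
// Hypothetical exhaustive check over small lengths; not part of the commit.
fn main() {
    for len in 0..64u8 {
        let mut v: Vec<u8> = (0..len).collect();
        let expected: Vec<u8> = (0..len).rev().collect();
        v.reverse();
        assert_eq!(v, expected, "reverse() broken for length {}", len);
    }
    println!("all lengths 0..64 ok");
}
```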

src/libcore/slice/mod.rs (+49)

```diff
@@ -539,6 +539,55 @@ impl<T> SliceExt for [T] {
     fn reverse(&mut self) {
         let mut i: usize = 0;
         let ln = self.len();
+
+        // For very small types, all the individual reads in the normal
+        // path perform poorly. We can do better, given efficient unaligned
+        // load/store, by loading a larger chunk and reversing a register.
+
+        // Ideally LLVM would do this for us, as it knows better than we do
+        // whether unaligned reads are efficient (since that changes between
+        // different ARM versions, for example) and what the best chunk size
+        // would be. Unfortunately, as of LLVM 4.0 (2017-05) it only unrolls
+        // the loop, so we need to do this ourselves. (Hypothesis: reverse
+        // is troublesome because the sides can be aligned differently --
+        // will be, when the length is odd -- so there's no way of emitting
+        // pre- and postludes to use fully-aligned SIMD in the middle.)
+
+        let fast_unaligned =
+            cfg!(any(target_arch = "x86", target_arch = "x86_64"));
+
+        if fast_unaligned && mem::size_of::<T>() == 1 {
+            // Use the llvm.bswap intrinsic to reverse u8s in a usize
+            let chunk = mem::size_of::<usize>();
+            while i + chunk - 1 < ln / 2 {
+                unsafe {
+                    let pa: *mut T = self.get_unchecked_mut(i);
+                    let pb: *mut T = self.get_unchecked_mut(ln - i - chunk);
+                    let va = ptr::read_unaligned(pa as *mut usize);
+                    let vb = ptr::read_unaligned(pb as *mut usize);
+                    ptr::write_unaligned(pa as *mut usize, vb.swap_bytes());
+                    ptr::write_unaligned(pb as *mut usize, va.swap_bytes());
+                }
+                i += chunk;
+            }
+        }
+
+        if fast_unaligned && mem::size_of::<T>() == 2 {
+            // Use rotate-by-16 to reverse u16s in a u32
+            let chunk = mem::size_of::<u32>() / 2;
+            while i + chunk - 1 < ln / 2 {
+                unsafe {
+                    let pa: *mut T = self.get_unchecked_mut(i);
+                    let pb: *mut T = self.get_unchecked_mut(ln - i - chunk);
+                    let va = ptr::read_unaligned(pa as *mut u32);
+                    let vb = ptr::read_unaligned(pb as *mut u32);
+                    ptr::write_unaligned(pa as *mut u32, vb.rotate_left(16));
+                    ptr::write_unaligned(pb as *mut u32, va.rotate_left(16));
+                }
+                i += chunk;
+            }
+        }
+
         while i < ln / 2 {
             // Unsafe swap to avoid the bounds check in safe swap.
             unsafe {
```
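
The loop bound `i + chunk - 1 < ln / 2` is what keeps each unaligned chunk inside its own half of the slice, so the front and back chunks never overlap, and the pre-existing element-wise loop below can resume from the same `i` to finish the middle. A small standalone check of that invariant (hypothetical, not part of the commit):

```rust
// Hypothetical check that the chunked loop's bound keeps both chunks in
// their own halves for every length; mirrors the bound in the diff above.
fn main() {
    let chunk = std::mem::size_of::<usize>(); // 8 on x86_64
    for ln in 0..10_000usize {
        let mut i = 0;
        while i + chunk - 1 < ln / 2 {
            // front chunk [i, i + chunk) ends at or before the midpoint...
            assert!(i + chunk <= ln / 2);
            // ...and the mirrored back chunk [ln - i - chunk, ln - i) starts
            // at or after it, so the two never overlap.
            assert!(ln - i - chunk >= ln / 2);
            i += chunk;
        }
        // the element-wise loop picks up from this same i
        assert!(i <= ln / 2);
    }
    println!("chunk bounds hold for all lengths below 10_000");
}
```

For example, with `ln = 19` and `chunk = 8`, the chunked loop runs once (swapping bytes 0..8 with bytes 11..19, each byte-swapped), exits with `i = 8`, and the element-wise loop then swaps indices 8 and 10, leaving the middle byte at index 9 in place.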
