Commit bad7e05

Re-use TimSort analysis for small slices
Add a new loop-based mini merge sort for small sizes. This extends allocation-free sorting of random inputs to lengths up to 32, for types that qualify.
1 parent 05a6b9c commit bad7e05
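
In rough terms, the new path works like this: if the slice qualifies and is at most 32 elements long, each chunk of 8 is sorted with the fixed-size stable sort, the sorted prefix is then merged with the next chunk using a small on-stack scratch buffer (so no heap allocation is needed), and any leftover tail is folded in by insertion sort. A minimal, safe sketch of that shape (not the library code; it assumes `T: Ord + Copy`, and the name `small_merge_sort` is made up for illustration):

    fn small_merge_sort<T: Ord + Copy>(v: &mut [T]) {
        const CHUNK: usize = 8;
        let len = v.len();
        debug_assert!(len <= 32);

        // Sort each full chunk of 8 on its own (stand-in for the diff's sort8_stable).
        let full_chunks = len / CHUNK;
        for chunk in v.chunks_exact_mut(CHUNK) {
            chunk.sort();
        }

        // Merge the sorted prefix v[..i] with the next sorted chunk v[i..i + CHUNK].
        // The right-hand run is always exactly 8 elements, so an 8-element on-stack
        // buffer is enough scratch space.
        let mut i = CHUNK;
        let mut remaining = full_chunks;
        while remaining > 1 {
            let buf: [T; CHUNK] = v[i..i + CHUNK].try_into().unwrap();
            let (mut l, mut r, mut dst) = (i, CHUNK, i + CHUNK);
            // Merge from the back; on ties take from `buf` so the merge stays stable.
            while r > 0 {
                dst -= 1;
                if l > 0 && v[l - 1] > buf[r - 1] {
                    v[dst] = v[l - 1];
                    l -= 1;
                } else {
                    v[dst] = buf[r - 1];
                    r -= 1;
                }
            }
            i += CHUNK;
            remaining -= 1;
        }

        // Insert any leftover tail elements (len % 8) into the sorted prefix.
        for j in (full_chunks * CHUNK)..len {
            let mut k = j;
            while k > 0 && v[k] < v[k - 1] {
                v.swap(k, k - 1);
                k -= 1;
            }
        }
    }

The actual change reuses the existing sort8_stable, merge, insertion_sort_shift_left and insertion_sort_shift_right helpers, keeps the caller-supplied is_less comparator, and gates the whole path on qualifies_for_branchless_sort::<T>(), as shown in the diff below.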

1 file changed, +81 -99 lines changed
library/alloc/src/slice.rs

@@ -822,90 +822,6 @@ where
     merge_sort(v, &mut is_less);
 }

-// Sort a small number of elements as fast as possible, without allocations.
-#[cfg(not(no_global_oom_handling))]
-fn stable_sort_small<T, F>(v: &mut [T], is_less: &mut F)
-where
-    F: FnMut(&T, &T) -> bool,
-{
-    let len = v.len();
-
-    // This implementation is really not fit for anything beyond that, and the call is probably a
-    // bug.
-    debug_assert!(len <= 40);
-
-    if len < 2 {
-        return;
-    }
-
-    // It's not clear that using custom code for specific sizes is worth it here.
-    // So we go with the simpler code.
-    let offset = if len <= 6 || !qualifies_for_branchless_sort::<T>() {
-        1
-    } else {
-        // Once a certain threshold is reached, it becomes worth it to analyze the input and do
-        // branchless swapping for the first 5 elements.
-
-        // SAFETY: We just checked that len >= 5
-        unsafe {
-            let arr_ptr = v.as_mut_ptr();
-
-            let should_swap_0_1 = is_less(&*arr_ptr.add(1), &*arr_ptr.add(0));
-            let should_swap_1_2 = is_less(&*arr_ptr.add(2), &*arr_ptr.add(1));
-            let should_swap_2_3 = is_less(&*arr_ptr.add(3), &*arr_ptr.add(2));
-            let should_swap_3_4 = is_less(&*arr_ptr.add(4), &*arr_ptr.add(3));
-
-            let swap_count = should_swap_0_1 as usize
-                + should_swap_1_2 as usize
-                + should_swap_2_3 as usize
-                + should_swap_3_4 as usize;
-
-            if swap_count == 0 {
-                // Potentially already sorted. No need to swap, we know the first 5 elements are
-                // already in the right order.
-                5
-            } else if swap_count == 4 {
-                // Potentially reversed.
-                let mut rev_i = 4;
-                while rev_i < (len - 1) {
-                    if !is_less(&*arr_ptr.add(rev_i + 1), &*arr_ptr.add(rev_i)) {
-                        break;
-                    }
-                    rev_i += 1;
-                }
-                rev_i += 1;
-                v[..rev_i].reverse();
-                insertion_sort_shift_left(v, rev_i, is_less);
-                return;
-            } else {
-                // Potentially random pattern.
-                branchless_swap(arr_ptr.add(0), arr_ptr.add(1), should_swap_0_1);
-                branchless_swap(arr_ptr.add(2), arr_ptr.add(3), should_swap_2_3);
-
-                if len >= 12 {
-                    // This aims to find a good balance between generating more code, which is bad
-                    // for cold loops and improving hot code while not increasing mean comparison
-                    // count too much.
-                    sort8_stable(&mut v[4..12], is_less);
-                    insertion_sort_shift_left(&mut v[4..], 8, is_less);
-                    insertion_sort_shift_right(v, 4, is_less);
-                    return;
-                } else {
-                    // Complete the sort network for the first 4 elements.
-                    swap_next_if_less(arr_ptr.add(1), is_less);
-                    swap_next_if_less(arr_ptr.add(2), is_less);
-                    swap_next_if_less(arr_ptr.add(0), is_less);
-                    swap_next_if_less(arr_ptr.add(1), is_less);
-
-                    4
-                }
-            }
-        }
-    };
-
-    insertion_sort_shift_left(v, offset, is_less);
-}
-
 #[cfg(not(no_global_oom_handling))]
 fn merge_sort<T, F>(v: &mut [T], is_less: &mut F)
 where
@@ -918,12 +834,7 @@ where

     let len = v.len();

-    // Slices of up to this length get sorted using insertion sort.
-    const MAX_NO_ALLOC_SIZE: usize = 20;
-
-    // Short arrays get sorted in-place via insertion sort to avoid allocations.
-    if len <= MAX_NO_ALLOC_SIZE {
-        stable_sort_small(v, is_less);
+    if len < 2 {
         return;
     }

@@ -963,6 +874,11 @@ where
             // return without allocating.
             return;
         } else if buf_ptr.is_null() {
+            // Short arrays get sorted in-place via insertion sort to avoid allocations.
+            if sort_small_stable(v, start, is_less) {
+                return;
+            }
+
             // Allocate a buffer to use as scratch memory. We keep the length 0 so we can keep in it
             // shallow copies of the contents of `v` without risking the dtors running on copies if
             // `is_less` panics. When merging two sorted runs, this buffer holds a copy of the
@@ -1016,11 +932,7 @@ where
             || (n >= 3 && runs[n - 3].len <= runs[n - 2].len + runs[n - 1].len)
             || (n >= 4 && runs[n - 4].len <= runs[n - 3].len + runs[n - 2].len))
     {
-        if n >= 3 && runs[n - 3].len < runs[n - 1].len {
-            Some(n - 3)
-        } else {
-            Some(n - 2)
-        }
+        if n >= 3 && runs[n - 3].len < runs[n - 1].len { Some(n - 3) } else { Some(n - 2) }
     } else {
         None
     }
@@ -1033,6 +945,67 @@ where
     }
 }

+/// Check whether `v` applies for small sort optimization.
+/// `v[start..]` is assumed already sorted.
+#[cfg(not(no_global_oom_handling))]
+fn sort_small_stable<T, F>(v: &mut [T], start: usize, is_less: &mut F) -> bool
+where
+    F: FnMut(&T, &T) -> bool,
+{
+    let len = v.len();
+
+    if qualifies_for_branchless_sort::<T>() {
+        // Testing showed that even though this incurs more comparisons, up to size 32 (4 * 8),
+        // avoiding the allocation and sticking with simple code is worth it. Going further eg. 40
+        // is still worth it for u64 or even types with more expensive comparisons, but risks
+        // incurring just too many comparisons than doing the regular TimSort.
+        const MAX_NO_ALLOC_SIZE: usize = 32;
+        if len <= MAX_NO_ALLOC_SIZE {
+            if len < 8 {
+                insertion_sort_shift_right(v, start, is_less);
+                return true;
+            }
+
+            let mut merge_count = 0;
+            for chunk in v.chunks_exact_mut(8) {
+                // SAFETY: chunks_exact_mut promised to give us slices of len 8.
+                unsafe {
+                    sort8_stable(chunk, is_less);
+                }
+                merge_count += 1;
+            }
+
+            let mut swap = mem::MaybeUninit::<[T; 8]>::uninit();
+            let swap_ptr = swap.as_mut_ptr() as *mut T;
+
+            let mut i = 8;
+            while merge_count > 1 {
+                // SAFETY: We know the smaller side will be of size 8 because mid is 8. And both
+                // sides are non empty because of merge_count, and the right side will always be of
+                // size 8 and the left size of 8 or greater. Thus the smaller side will always be
+                // exactly 8 long, the size of swap.
+                unsafe {
+                    merge(&mut v[0..(i + 8)], i, swap_ptr, is_less);
+                }
+                i += 8;
+                merge_count -= 1;
+            }
+
+            insertion_sort_shift_left(v, i, is_less);
+
+            return true;
+        }
+    } else {
+        const MAX_NO_ALLOC_SIZE: usize = 20;
+        if len <= MAX_NO_ALLOC_SIZE {
+            insertion_sort_shift_right(v, start, is_less);
+            return true;
+        }
+    }
+
+    false
+}
+
 /// Takes a range as denoted by start and end, that is already sorted and extends it if necessary
 /// with sorts optimized for smaller ranges such as insertion sort.
 #[cfg(not(no_global_oom_handling))]
@@ -1042,8 +1015,7 @@ where
 {
     debug_assert!(end > start);

-    // Testing showed that using MAX_INSERTION here yields the best performance for many types, but
-    // incurs more total comparisons. A balance between least comparisons and best performance, as
+    // This value is a balance between least comparisons and best performance, as
     // influenced by for example cache locality.
     const MIN_INSERTION_RUN: usize = 10;

@@ -1115,6 +1087,7 @@ impl<T> Drop for InsertionHole<T> {

 /// Inserts `v[v.len() - 1]` into pre-sorted sequence `v[..v.len() - 1]` so that whole `v[..]`
 /// becomes sorted.
+#[cfg(not(no_global_oom_handling))]
 unsafe fn insert_tail<T, F>(v: &mut [T], is_less: &mut F)
 where
     F: FnMut(&T, &T) -> bool,
@@ -1167,11 +1140,12 @@ where
     }
 }

-/// Sort v assuming v[..offset] is already sorted.
+/// Sort `v` assuming `v[..offset]` is already sorted.
 ///
 /// Never inline this function to avoid code bloat. It still optimizes nicely and has practically no
 /// performance impact. Even improving performance in some cases.
 #[inline(never)]
+#[cfg(not(no_global_oom_handling))]
 fn insertion_sort_shift_left<T, F>(v: &mut [T], offset: usize, is_less: &mut F)
 where
     F: FnMut(&T, &T) -> bool,
@@ -1195,11 +1169,12 @@ where
     }
 }

-/// Sort v assuming v[offset..] is already sorted.
+/// Sort `v` assuming `v[offset..]` is already sorted.
 ///
 /// Never inline this function to avoid code bloat. It still optimizes nicely and has practically no
 /// performance impact. Even improving performance in some cases.
 #[inline(never)]
+#[cfg(not(no_global_oom_handling))]
 fn insertion_sort_shift_right<T, F>(v: &mut [T], offset: usize, is_less: &mut F)
 where
     F: FnMut(&T, &T) -> bool,
@@ -1227,6 +1202,7 @@ where
 /// Inserts `v[0]` into pre-sorted sequence `v[1..]` so that whole `v[..]` becomes sorted.
 ///
 /// This is the integral subroutine of insertion sort.
+#[cfg(not(no_global_oom_handling))]
 unsafe fn insert_head<T, F>(v: &mut [T], is_less: &mut F)
 where
     F: FnMut(&T, &T) -> bool,
@@ -1287,6 +1263,10 @@ where
 ///
 /// The two slices must be non-empty and `mid` must be in bounds. Buffer `buf` must be long enough
 /// to hold a copy of the shorter slice. Also, `T` must not be a zero-sized type.
+///
+/// Never inline this function to avoid code bloat. It still optimizes nicely and has practically no
+/// performance impact.
+#[inline(never)]
 #[cfg(not(no_global_oom_handling))]
 unsafe fn merge<T, F>(v: &mut [T], mid: usize, buf: *mut T, is_less: &mut F)
 where
@@ -1506,6 +1486,7 @@ where
 /// Never inline this function to avoid code bloat. It still optimizes nicely and has practically no
 /// performance impact.
 #[inline(never)]
+#[cfg(not(no_global_oom_handling))]
 unsafe fn sort8_stable<T, F>(v: &mut [T], is_less: &mut F)
 where
     F: FnMut(&T, &T) -> bool,
@@ -1559,6 +1540,7 @@ where
     }
 }

+#[cfg(not(no_global_oom_handling))]
 unsafe fn sort24_stable<T, F>(v: &mut [T], is_less: &mut F)
 where
     F: FnMut(&T, &T) -> bool,
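
As a side note for readers without the rest of the file at hand: the diff documents insertion_sort_shift_left as sorting `v` under the assumption that `v[..offset]` is already sorted, and insertion_sort_shift_right as the mirror image with `v[offset..]` assumed sorted. A rough, safe illustration of the shift-left contract (not the library's implementation, which takes the caller's `is_less` comparator rather than relying on `Ord`):

    fn insertion_sort_shift_left_sketch<T: Ord>(v: &mut [T], offset: usize) {
        debug_assert!(offset >= 1 && offset <= v.len());
        // Insert each element of the unsorted suffix into the sorted prefix,
        // moving it left one swap at a time until it is in position.
        for i in offset..v.len() {
            let mut j = i;
            while j > 0 && v[j] < v[j - 1] {
                v.swap(j, j - 1);
                j -= 1;
            }
        }
    }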
