@@ -822,90 +822,6 @@ where
822
822
merge_sort ( v, & mut is_less) ;
823
823
}
824
824
825
- // Sorts a small slice (at most 40 elements, see the debug assertion) in place, without allocations.
826
- #[ cfg( not( no_global_oom_handling) ) ]
827
- fn stable_sort_small < T , F > ( v : & mut [ T ] , is_less : & mut F )
828
- where
829
- F : FnMut ( & T , & T ) -> bool ,
830
- {
831
- let len = v. len ( ) ;
832
-
833
- // This implementation is not fit for slices longer than 40 elements; calling it with one is
834
- // probably a bug.
835
- debug_assert ! ( len <= 40 ) ;
836
-
837
- if len < 2 {
838
- return ;
839
- }
840
-
841
- // It's not clear that using custom code for specific sizes is worth it here,
842
- // so we go with the simpler code. `offset` is the length of the already sorted prefix.
843
- let offset = if len <= 6 || !qualifies_for_branchless_sort :: < T > ( ) {
844
- 1
845
- } else {
846
- // Once a certain threshold is reached, it becomes worth it to analyze the first 5 elements
847
- // of the input and apply branchless swaps to the leading elements.
848
-
849
- // SAFETY: This branch requires len > 6, so indices 0..=4 are in bounds; the scan below only reads `rev_i + 1 < len`.
850
- unsafe {
851
- let arr_ptr = v. as_mut_ptr ( ) ;
852
-
853
- let should_swap_0_1 = is_less ( & * arr_ptr. add ( 1 ) , & * arr_ptr. add ( 0 ) ) ;
854
- let should_swap_1_2 = is_less ( & * arr_ptr. add ( 2 ) , & * arr_ptr. add ( 1 ) ) ;
855
- let should_swap_2_3 = is_less ( & * arr_ptr. add ( 3 ) , & * arr_ptr. add ( 2 ) ) ;
856
- let should_swap_3_4 = is_less ( & * arr_ptr. add ( 4 ) , & * arr_ptr. add ( 3 ) ) ;
857
-
858
- let swap_count = should_swap_0_1 as usize
859
- + should_swap_1_2 as usize
860
- + should_swap_2_3 as usize
861
- + should_swap_3_4 as usize ;
862
-
863
- if swap_count == 0 {
864
- // Potentially already sorted. No need to swap, we know the first 5 elements are
865
- // already in the right order.
866
- 5
867
- } else if swap_count == 4 {
868
- // Potentially reversed: the first 5 elements are strictly descending, so extend that run and reverse it.
869
- let mut rev_i = 4 ;
870
- while rev_i < ( len - 1 ) {
871
- if !is_less ( & * arr_ptr. add ( rev_i + 1 ) , & * arr_ptr. add ( rev_i) ) {
872
- break ;
873
- }
874
- rev_i += 1 ;
875
- }
876
- rev_i += 1 ; // Turn the index of the last descending element into the run length.
877
- v[ ..rev_i] . reverse ( ) ;
878
- insertion_sort_shift_left ( v, rev_i, is_less) ;
879
- return ;
880
- } else {
881
- // Potentially random pattern.
882
- branchless_swap ( arr_ptr. add ( 0 ) , arr_ptr. add ( 1 ) , should_swap_0_1) ;
883
- branchless_swap ( arr_ptr. add ( 2 ) , arr_ptr. add ( 3 ) , should_swap_2_3) ;
884
-
885
- if len >= 12 {
886
- // This aims to find a good balance between generating more code, which is bad
887
- // for cold loops and improving hot code while not increasing mean comparison
888
- // count too much.
889
- sort8_stable ( & mut v[ 4 ..12 ] , is_less) ;
890
- insertion_sort_shift_left ( & mut v[ 4 ..] , 8 , is_less) ; // Extend the sorted run `v[4..12]` over the rest of the tail.
891
- insertion_sort_shift_right ( v, 4 , is_less) ; // Insert the 4 leading elements into the sorted tail `v[4..]`.
892
- return ;
893
- } else {
894
- // Complete the sort network for the first 4 elements.
895
- swap_next_if_less ( arr_ptr. add ( 1 ) , is_less) ;
896
- swap_next_if_less ( arr_ptr. add ( 2 ) , is_less) ;
897
- swap_next_if_less ( arr_ptr. add ( 0 ) , is_less) ;
898
- swap_next_if_less ( arr_ptr. add ( 1 ) , is_less) ;
899
-
900
- 4
901
- }
902
- }
903
- }
904
- } ;
905
-
906
- insertion_sort_shift_left ( v, offset, is_less) ;
907
- }
908
-
909
825
#[ cfg( not( no_global_oom_handling) ) ]
910
826
fn merge_sort < T , F > ( v : & mut [ T ] , is_less : & mut F )
911
827
where
@@ -918,12 +834,7 @@ where
918
834
919
835
let len = v. len ( ) ;
920
836
921
- // Slices of up to this length get sorted using insertion sort.
922
- const MAX_NO_ALLOC_SIZE : usize = 20 ;
923
-
924
- // Short arrays get sorted in-place via insertion sort to avoid allocations.
925
- if len <= MAX_NO_ALLOC_SIZE {
926
- stable_sort_small ( v, is_less) ;
837
+ if len < 2 {
927
838
return ;
928
839
}
929
840
@@ -963,6 +874,11 @@ where
963
874
// return without allocating.
964
875
return ;
965
876
} else if buf_ptr. is_null ( ) {
877
+ // Short arrays get sorted in-place via insertion sort to avoid allocations.
878
+ if sort_small_stable ( v, start, is_less) {
879
+ return ;
880
+ }
881
+
966
882
// Allocate a buffer to use as scratch memory. We keep the length 0 so we can keep in it
967
883
// shallow copies of the contents of `v` without risking the dtors running on copies if
968
884
// `is_less` panics. When merging two sorted runs, this buffer holds a copy of the
@@ -1016,11 +932,7 @@ where
1016
932
|| ( n >= 3 && runs[ n - 3 ] . len <= runs[ n - 2 ] . len + runs[ n - 1 ] . len )
1017
933
|| ( n >= 4 && runs[ n - 4 ] . len <= runs[ n - 3 ] . len + runs[ n - 2 ] . len ) )
1018
934
{
1019
- if n >= 3 && runs[ n - 3 ] . len < runs[ n - 1 ] . len {
1020
- Some ( n - 3 )
1021
- } else {
1022
- Some ( n - 2 )
1023
- }
935
+ if n >= 3 && runs[ n - 3 ] . len < runs[ n - 1 ] . len { Some ( n - 3 ) } else { Some ( n - 2 ) }
1024
936
} else {
1025
937
None
1026
938
}
@@ -1033,6 +945,67 @@ where
1033
945
}
1034
946
}
1035
947
948
+ /// Sorts `v` in place without allocating if it qualifies for the small-sort optimization; returns `true` if it did and `false` (leaving `v` untouched) otherwise.
949
+ /// `v[start..]` is assumed already sorted; only the `insertion_sort_shift_right` paths make use of `start`.
950
+ #[ cfg( not( no_global_oom_handling) ) ]
951
+ fn sort_small_stable < T , F > ( v : & mut [ T ] , start : usize , is_less : & mut F ) -> bool
952
+ where
953
+ F : FnMut ( & T , & T ) -> bool ,
954
+ {
955
+ let len = v. len ( ) ;
956
+
957
+ if qualifies_for_branchless_sort :: < T > ( ) {
958
+ // Testing showed that even though this incurs more comparisons, up to size 32 (4 * 8),
959
+ // avoiding the allocation and sticking with simple code is worth it. Going further, e.g. to 40,
960
+ // is still worth it for u64 or even types with more expensive comparisons, but risks
961
+ // incurring more comparisons than the regular TimSort would need.
962
+ const MAX_NO_ALLOC_SIZE : usize = 32 ;
963
+ if len <= MAX_NO_ALLOC_SIZE {
964
+ if len < 8 {
965
+ insertion_sort_shift_right ( v, start, is_less) ;
966
+ return true ;
967
+ }
968
+
969
+ let mut merge_count = 0 ; // Number of sorted 8-element chunks.
970
+ for chunk in v. chunks_exact_mut ( 8 ) {
971
+ // SAFETY: `chunks_exact_mut(8)` only yields slices of length exactly 8.
972
+ unsafe {
973
+ sort8_stable ( chunk, is_less) ;
974
+ }
975
+ merge_count += 1 ;
976
+ }
977
+
978
+ let mut swap = mem:: MaybeUninit :: < [ T ; 8 ] > :: uninit ( ) ;
979
+ let swap_ptr = swap. as_mut_ptr ( ) as * mut T ;
980
+
981
+ let mut i = 8 ;
982
+ while merge_count > 1 {
983
+ // SAFETY: `mid` is `i` (a multiple of 8, at least 8) and the slice ends at `i + 8`, so the
984
+ // left side holds `i >= 8` elements and the right side exactly 8; both are non-empty and the
985
+ // shorter one is exactly 8 long, the size of `swap`. NOTE(review): `merge` also requires a
986
+ // non-zero-sized `T` — presumably ensured by `qualifies_for_branchless_sort`; confirm.
987
+ unsafe {
988
+ merge ( & mut v[ 0 ..( i + 8 ) ] , i, swap_ptr, is_less) ;
989
+ }
990
+ i += 8 ;
991
+ merge_count -= 1 ;
992
+ }
993
+
994
+ insertion_sort_shift_left ( v, i, is_less) ; // Insert the `len % 8` leftover elements into the sorted prefix `v[..i]`.
995
+
996
+ return true ;
997
+ }
998
+ } else {
999
+ const MAX_NO_ALLOC_SIZE : usize = 20 ; // Non-qualifying types use plain insertion sort up to this length.
1000
+ if len <= MAX_NO_ALLOC_SIZE {
1001
+ insertion_sort_shift_right ( v, start, is_less) ;
1002
+ return true ;
1003
+ }
1004
+ }
1005
+
1006
+ false // Too long for the no-allocation path; `v` has not been modified.
1007
+ }
1008
+
1036
1009
/// Takes a range as denoted by start and end, that is already sorted and extends it if necessary
1037
1010
/// with sorts optimized for smaller ranges such as insertion sort.
1038
1011
#[ cfg( not( no_global_oom_handling) ) ]
@@ -1042,8 +1015,7 @@ where
1042
1015
{
1043
1016
debug_assert ! ( end > start) ;
1044
1017
1045
- // Testing showed that using MAX_INSERTION here yields the best performance for many types, but
1046
- // incurs more total comparisons. A balance between least comparisons and best performance, as
1018
+ // This value is a balance between least comparisons and best performance, as
1047
1019
// influenced by for example cache locality.
1048
1020
const MIN_INSERTION_RUN : usize = 10 ;
1049
1021
@@ -1115,6 +1087,7 @@ impl<T> Drop for InsertionHole<T> {
1115
1087
1116
1088
/// Inserts `v[v.len() - 1]` into pre-sorted sequence `v[..v.len() - 1]` so that whole `v[..]`
1117
1089
/// becomes sorted.
1090
+ #[ cfg( not( no_global_oom_handling) ) ]
1118
1091
unsafe fn insert_tail < T , F > ( v : & mut [ T ] , is_less : & mut F )
1119
1092
where
1120
1093
F : FnMut ( & T , & T ) -> bool ,
@@ -1167,11 +1140,12 @@ where
1167
1140
}
1168
1141
}
1169
1142
1170
- /// Sort v assuming v[..offset] is already sorted.
1143
+ /// Sort `v` assuming `v[..offset]` is already sorted.
1171
1144
///
1172
1145
/// Never inline this function to avoid code bloat. It still optimizes nicely and has practically no
1173
1146
/// performance impact. Even improving performance in some cases.
1174
1147
#[ inline( never) ]
1148
+ #[ cfg( not( no_global_oom_handling) ) ]
1175
1149
fn insertion_sort_shift_left < T , F > ( v : & mut [ T ] , offset : usize , is_less : & mut F )
1176
1150
where
1177
1151
F : FnMut ( & T , & T ) -> bool ,
@@ -1195,11 +1169,12 @@ where
1195
1169
}
1196
1170
}
1197
1171
1198
- /// Sort v assuming v[offset..] is already sorted.
1172
+ /// Sort `v` assuming `v[offset..]` is already sorted.
1199
1173
///
1200
1174
/// Never inline this function to avoid code bloat. It still optimizes nicely and has practically no
1201
1175
/// performance impact. Even improving performance in some cases.
1202
1176
#[ inline( never) ]
1177
+ #[ cfg( not( no_global_oom_handling) ) ]
1203
1178
fn insertion_sort_shift_right < T , F > ( v : & mut [ T ] , offset : usize , is_less : & mut F )
1204
1179
where
1205
1180
F : FnMut ( & T , & T ) -> bool ,
@@ -1227,6 +1202,7 @@ where
1227
1202
/// Inserts `v[0]` into pre-sorted sequence `v[1..]` so that whole `v[..]` becomes sorted.
1228
1203
///
1229
1204
/// This is the integral subroutine of insertion sort.
1205
+ #[ cfg( not( no_global_oom_handling) ) ]
1230
1206
unsafe fn insert_head < T , F > ( v : & mut [ T ] , is_less : & mut F )
1231
1207
where
1232
1208
F : FnMut ( & T , & T ) -> bool ,
@@ -1287,6 +1263,10 @@ where
1287
1263
///
1288
1264
/// The two slices must be non-empty and `mid` must be in bounds. Buffer `buf` must be long enough
1289
1265
/// to hold a copy of the shorter slice. Also, `T` must not be a zero-sized type.
1266
+ ///
1267
+ /// Never inline this function to avoid code bloat. It still optimizes nicely and has practically no
1268
+ /// performance impact.
1269
+ #[ inline( never) ]
1290
1270
#[ cfg( not( no_global_oom_handling) ) ]
1291
1271
unsafe fn merge < T , F > ( v : & mut [ T ] , mid : usize , buf : * mut T , is_less : & mut F )
1292
1272
where
@@ -1506,6 +1486,7 @@ where
1506
1486
/// Never inline this function to avoid code bloat. It still optimizes nicely and has practically no
1507
1487
/// performance impact.
1508
1488
#[ inline( never) ]
1489
+ #[ cfg( not( no_global_oom_handling) ) ]
1509
1490
unsafe fn sort8_stable < T , F > ( v : & mut [ T ] , is_less : & mut F )
1510
1491
where
1511
1492
F : FnMut ( & T , & T ) -> bool ,
@@ -1559,6 +1540,7 @@ where
1559
1540
}
1560
1541
}
1561
1542
1543
+ #[ cfg( not( no_global_oom_handling) ) ]
1562
1544
unsafe fn sort24_stable < T , F > ( v : & mut [ T ] , is_less : & mut F )
1563
1545
where
1564
1546
F : FnMut ( & T , & T ) -> bool ,
0 commit comments