
Add optimize_for_size variants for stable and unstable sort as well as select_nth_unstable #129587

Merged · 9 commits · Sep 24, 2024
11 changes: 10 additions & 1 deletion library/core/src/slice/sort/select.rs
@@ -7,6 +7,7 @@
//! better performance than one would get using heapsort as fallback.

use crate::mem::{self, SizedTypeProperties};
#[cfg(not(feature = "optimize_for_size"))]
use crate::slice::sort::shared::pivot::choose_pivot;
use crate::slice::sort::shared::smallsort::insertion_sort_shift_left;
use crate::slice::sort::unstable::quicksort::partition;
@@ -40,7 +41,13 @@ where
let min_idx = min_index(v, &mut is_less).unwrap();
v.swap(min_idx, index);
} else {
partition_at_index_loop(v, index, None, &mut is_less);
cfg_if! {
if #[cfg(feature = "optimize_for_size")] {
median_of_medians(v, &mut is_less, index);
} else {
partition_at_index_loop(v, index, None, &mut is_less);
}
}
}

let (left, right) = v.split_at_mut(index);
@@ -53,6 +60,7 @@
// most once, it doesn't make sense to use something more sophisticated than insertion-sort.
const INSERTION_SORT_THRESHOLD: usize = 16;

#[cfg(not(feature = "optimize_for_size"))]
fn partition_at_index_loop<'a, T, F>(
mut v: &'a mut [T],
mut index: usize,
@@ -169,6 +177,7 @@ fn median_of_medians<T, F: FnMut(&T, &T) -> bool>(mut v: &mut [T], is_less: &mut
if v.len() >= 2 {
insertion_sort_shift_left(v, 1, is_less);
}

return;
}

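For orientation, this file backs the public `slice::select_nth_unstable` API; a small usage example of that stable API (values chosen arbitrarily):

```rust
fn main() {
    let mut v = [5, 3, 9, 1, 7];
    // Place the element of rank 2 (0-indexed) at index 2. With the
    // `optimize_for_size` feature enabled, the diff above routes this call
    // through `median_of_medians` instead of `partition_at_index_loop`.
    let (lesser, nth, greater) = v.select_nth_unstable(2);
    assert_eq!(*nth, 5);
    assert!(lesser.iter().all(|&x| x <= 5));
    assert!(greater.iter().all(|&x| x >= 5));
}
```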
2 changes: 2 additions & 0 deletions library/core/src/slice/sort/shared/mod.rs
@@ -1,3 +1,5 @@
#![cfg_attr(feature = "optimize_for_size", allow(dead_code))]

use crate::marker::Freeze;

pub(crate) mod pivot;
7 changes: 6 additions & 1 deletion library/core/src/slice/sort/shared/smallsort.rs
@@ -378,7 +378,12 @@ where

/// Swap two values in the slice pointed to by `v_base` at the position `a_pos` and `b_pos` if the
/// value at position `b_pos` is less than the one at position `a_pos`.
pub unsafe fn swap_if_less<T, F>(v_base: *mut T, a_pos: usize, b_pos: usize, is_less: &mut F)
///
/// Purposefully not marked `#[inline]`, despite us wanting it to be inlined for integer-like
/// types. `is_less` could be a huge function and we want to give the compiler the option not
/// to inline it. For the same reasons, and because this function is very perf-critical, it
/// should live in the same module as the functions that use it.
unsafe fn swap_if_less<T, F>(v_base: *mut T, a_pos: usize, b_pos: usize, is_less: &mut F)
where
F: FnMut(&T, &T) -> bool,
{
65 changes: 51 additions & 14 deletions library/core/src/slice/sort/stable/mod.rs
@@ -1,15 +1,24 @@
//! This module contains the entry points for `slice::sort`.

#[cfg(not(feature = "optimize_for_size"))]
use crate::cmp;
use crate::intrinsics;
use crate::mem::{self, MaybeUninit, SizedTypeProperties};
#[cfg(not(feature = "optimize_for_size"))]
use crate::slice::sort::shared::smallsort::{
insertion_sort_shift_left, StableSmallSortTypeImpl, SMALL_SORT_GENERAL_SCRATCH_LEN,
};
use crate::{cmp, intrinsics};

pub(crate) mod drift;
pub(crate) mod merge;

#[cfg(not(feature = "optimize_for_size"))]
pub(crate) mod drift;
#[cfg(not(feature = "optimize_for_size"))]
pub(crate) mod quicksort;

#[cfg(feature = "optimize_for_size")]
Member:

This likely needs to be #[cfg(any(feature = "optimize_for_size", target_pointer_width = "16"))], see #130818

Contributor Author:

Good catch.
pub(crate) mod tiny;
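A minimal sketch of the adjustment the review thread above is asking for (assuming the follow-up in #130818 keeps the `tiny` module as declared here):

```rust
// Suggested form from the review: also pull in the size-optimized module on
// 16-bit targets, not only when the `optimize_for_size` feature is enabled.
#[cfg(any(feature = "optimize_for_size", target_pointer_width = "16"))]
pub(crate) mod tiny;
```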

/// Stable sort called driftsort by Orson Peters and Lukas Bergdoll.
/// Design document:
/// <https://github.com/Voultapher/sort-research-rs/blob/main/writeup/driftsort_introduction/text.md>
@@ -30,25 +39,53 @@ pub fn sort<T, F: FnMut(&T, &T) -> bool, BufT: BufGuard<T>>(v: &mut [T], is_less
return;
}

// More advanced sorting methods than insertion sort are faster if called in
// a hot loop for small inputs, but for general-purpose code the small
// binary size of insertion sort is more important. The instruction cache in
// modern processors is very valuable, and for a single sort call in general
// purpose code any gains from an advanced method are cancelled by i-cache
// misses during the sort, and thrashing the i-cache for surrounding code.
const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
insertion_sort_shift_left(v, 1, is_less);
return;
}
cfg_if! {
if #[cfg(any(feature = "optimize_for_size", target_pointer_width = "16"))] {
let alloc_len = len / 2;

cfg_if! {
if #[cfg(target_pointer_width = "16")] {
let heap_buf = BufT::with_capacity(alloc_len);
let scratch = heap_buf.as_uninit_slice_mut();
} else {
// For small inputs 4KiB of stack storage suffices, which allows us to avoid
// calling the (de-)allocator. Benchmarks showed this was quite beneficial.
let mut stack_buf = AlignedStorage::<T, 4096>::new();
let stack_scratch = stack_buf.as_uninit_slice_mut();
let mut heap_buf;
let scratch = if stack_scratch.len() >= alloc_len {
stack_scratch
} else {
heap_buf = BufT::with_capacity(alloc_len);
heap_buf.as_uninit_slice_mut()
};
}
}

driftsort_main::<T, F, BufT>(v, is_less);
tiny::mergesort(v, scratch, is_less);
} else {
// More advanced sorting methods than insertion sort are faster if called in
// a hot loop for small inputs, but for general-purpose code the small
// binary size of insertion sort is more important. The instruction cache in
// modern processors is very valuable, and for a single sort call in general
// purpose code any gains from an advanced method are cancelled by i-cache
// misses during the sort, and thrashing the i-cache for surrounding code.
const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
insertion_sort_shift_left(v, 1, is_less);
return;
}

driftsort_main::<T, F, BufT>(v, is_less);
}
}
}

/// See [`sort`]
///
/// Deliberately don't inline the main sorting routine entrypoint to ensure the
/// inlined insertion sort i-cache footprint remains minimal.
#[cfg(not(feature = "optimize_for_size"))]
#[inline(never)]
fn driftsort_main<T, F: FnMut(&T, &T) -> bool, BufT: BufGuard<T>>(v: &mut [T], is_less: &mut F) {
// By allocating n elements of memory we can ensure the entire input can
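The 4 KiB stack scratch used above comes from a helper that is not part of this diff; a rough, hypothetical sketch of what such a stack-backed scratch buffer can look like (the real `AlignedStorage` in the sort module is sized in bytes and may differ in layout and API):

```rust
use core::mem::MaybeUninit;

// Hypothetical stand-in for `AlignedStorage<T, 4096>`, sized in elements for
// simplicity. It only hands out uninitialized storage; callers never read
// from it before writing.
struct StackScratch<T, const LEN: usize> {
    buf: [MaybeUninit<T>; LEN],
}

impl<T, const LEN: usize> StackScratch<T, LEN> {
    fn new() -> Self {
        // Creating an array of uninitialized `MaybeUninit<T>` is always valid.
        Self { buf: [const { MaybeUninit::uninit() }; LEN] }
    }

    fn as_uninit_slice_mut(&mut self) -> &mut [MaybeUninit<T>] {
        &mut self.buf
    }
}
```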
41 changes: 41 additions & 0 deletions library/core/src/slice/sort/stable/tiny.rs
@@ -0,0 +1,41 @@
//! Binary-size optimized mergesort inspired by https://github.com/Voultapher/tiny-sort-rs.

use crate::mem::MaybeUninit;
use crate::ptr;
use crate::slice::sort::stable::merge;

/// Tiny recursive top-down merge sort optimized for binary size. It has no adaptiveness whatsoever,
/// no run detection, etc.
#[inline(always)]
pub fn mergesort<T, F: FnMut(&T, &T) -> bool>(
v: &mut [T],
scratch: &mut [MaybeUninit<T>],
is_less: &mut F,
) {
let len = v.len();

if len > 2 {
let mid = len / 2;

// SAFETY: mid is in-bounds.
unsafe {
// Sort the left half recursively.
mergesort(v.get_unchecked_mut(..mid), scratch, is_less);
// Sort the right half recursively.
mergesort(v.get_unchecked_mut(mid..), scratch, is_less);
}

merge::merge(v, scratch, mid, is_less);
} else if len == 2 {
// SAFETY: We checked the len, the pointers we create are valid and don't overlap.
unsafe {
let v_base = v.as_mut_ptr();
let v_a = v_base;
let v_b = v_base.add(1);

if is_less(&*v_b, &*v_a) {
ptr::swap_nonoverlapping(v_a, v_b, 1);
}
}
}
}
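`tiny::mergesort` leans on `merge::merge`, which is not shown in this diff; a simplified, hypothetical stand-in for that step, restricted to `T: Copy` to sidestep drop handling (the real merge works for any `T`):

```rust
use core::mem::MaybeUninit;

// Merge the sorted halves `v[..mid]` and `v[mid..]` using `scratch` as
// temporary storage for the left run. Illustrative only; not the in-tree code.
fn merge_simple<T: Copy, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    scratch: &mut [MaybeUninit<T>],
    mid: usize,
    is_less: &mut F,
) {
    assert!(mid <= v.len() && scratch.len() >= mid);
    // Stash the left run in scratch so its slots in `v` can be overwritten.
    for (dst, src) in scratch[..mid].iter_mut().zip(&v[..mid]) {
        dst.write(*src);
    }
    let (mut i, mut j, mut k) = (0, mid, 0);
    while i < mid && j < v.len() {
        // SAFETY: `scratch[..mid]` was initialized above and `i < mid`.
        let left = unsafe { scratch[i].assume_init() };
        if is_less(&v[j], &left) {
            v[k] = v[j];
            j += 1;
        } else {
            v[k] = left;
            i += 1;
        }
        k += 1;
    }
    while i < mid {
        // SAFETY: as above.
        v[k] = unsafe { scratch[i].assume_init() };
        i += 1;
        k += 1;
    }
    // Any remaining right-run elements are already in their final positions.
}
```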
40 changes: 19 additions & 21 deletions library/core/src/slice/sort/unstable/heapsort.rs
@@ -1,46 +1,46 @@
//! This module contains a branchless heapsort as fallback for unstable quicksort.

use crate::{intrinsics, ptr};
use crate::{cmp, intrinsics, ptr};

/// Sorts `v` using heapsort, which guarantees *O*(*n* \* log(*n*)) worst-case.
///
/// Never inline this, it sits in the main hot-loop in `recurse` and is meant as an unlikely algorithmic
/// fallback.
///
/// SAFETY: The caller has to guarantee that `v.len()` >= 2.
#[inline(never)]
pub(crate) unsafe fn heapsort<T, F>(v: &mut [T], is_less: &mut F)
pub(crate) fn heapsort<T, F>(v: &mut [T], is_less: &mut F)
where
F: FnMut(&T, &T) -> bool,
{
// SAFETY: See function safety.
unsafe {
intrinsics::assume(v.len() >= 2);

// Build the heap in linear time.
for i in (0..v.len() / 2).rev() {
sift_down(v, i, is_less);
}
let len = v.len();

// Pop maximal elements from the heap.
for i in (1..v.len()).rev() {
for i in (0..len + len / 2).rev() {
let sift_idx = if i >= len {
i - len
} else {
v.swap(0, i);
sift_down(&mut v[..i], 0, is_less);
0
};

// SAFETY: The above calculation ensures that `sift_idx` is either 0 or
// `(len..(len + (len / 2))) - len`, which simplifies to `0..(len / 2)`.
// This guarantees the required `sift_idx <= len`.
unsafe {
sift_down(&mut v[..cmp::min(i, len)], sift_idx, is_less);
}
}
}

// This binary heap respects the invariant `parent >= child`.
//
// SAFETY: The caller has to guarantee that node < `v.len()`.
#[inline(never)]
// SAFETY: The caller has to guarantee that `node <= v.len()`.
#[inline(always)]
unsafe fn sift_down<T, F>(v: &mut [T], mut node: usize, is_less: &mut F)
where
F: FnMut(&T, &T) -> bool,
{
// SAFETY: See function safety.
unsafe {
intrinsics::assume(node < v.len());
intrinsics::assume(node <= v.len());
}

let len = v.len();
@@ -69,9 +69,7 @@ where
break;
}

// Swap `node` with the greater child, move one step down, and continue sifting. This
// could be ptr::swap_nonoverlapping but that adds a significant amount of binary-size.
ptr::swap(v_base.add(node), v_base.add(child));
ptr::swap_nonoverlapping(v_base.add(node), v_base.add(child), 1);
}

node = child;
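To make the fused build/pop loop above easier to follow, here is a standalone sketch of the same idea using safe indexing and `T: Ord` instead of an `is_less` closure (illustrative only, not the in-tree code):

```rust
// For `i >= len` the iteration is a heap-build step on node `i - len`;
// for `i < len` it pops the current maximum into its final slot `i`.
fn heapsort_sketch<T: Ord>(v: &mut [T]) {
    let len = v.len();
    for i in (0..len + len / 2).rev() {
        let (end, sift_idx) = if i >= len {
            (len, i - len)
        } else {
            v.swap(0, i);
            (i, 0)
        };
        sift_down_sketch(&mut v[..end], sift_idx);
    }
}

fn sift_down_sketch<T: Ord>(v: &mut [T], mut node: usize) {
    loop {
        let mut child = 2 * node + 1;
        if child >= v.len() {
            break;
        }
        // Pick the greater of the two children, if a right child exists.
        if child + 1 < v.len() && v[child] < v[child + 1] {
            child += 1;
        }
        if v[node] >= v[child] {
            break;
        }
        v.swap(node, child);
        node = child;
    }
}
```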
33 changes: 21 additions & 12 deletions library/core/src/slice/sort/unstable/mod.rs
@@ -2,7 +2,9 @@

use crate::intrinsics;
use crate::mem::SizedTypeProperties;
#[cfg(not(feature = "optimize_for_size"))]
use crate::slice::sort::shared::find_existing_run;
#[cfg(not(feature = "optimize_for_size"))]
use crate::slice::sort::shared::smallsort::insertion_sort_shift_left;

pub(crate) mod heapsort;
@@ -28,25 +30,32 @@ pub fn sort<T, F: FnMut(&T, &T) -> bool>(v: &mut [T], is_less: &mut F) {
return;
}

// More advanced sorting methods than insertion sort are faster if called in
// a hot loop for small inputs, but for general-purpose code the small
// binary size of insertion sort is more important. The instruction cache in
// modern processors is very valuable, and for a single sort call in general
// purpose code any gains from an advanced method are cancelled by i-cache
// misses during the sort, and thrashing the i-cache for surrounding code.
const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
insertion_sort_shift_left(v, 1, is_less);
return;
}
cfg_if! {
if #[cfg(any(feature = "optimize_for_size", target_pointer_width = "16"))] {
heapsort::heapsort(v, is_less);
} else {
// More advanced sorting methods than insertion sort are faster if called in
// a hot loop for small inputs, but for general-purpose code the small
// binary size of insertion sort is more important. The instruction cache in
// modern processors is very valuable, and for a single sort call in general
// purpose code any gains from an advanced method are cancelled by i-cache
// misses during the sort, and thrashing the i-cache for surrounding code.
const MAX_LEN_ALWAYS_INSERTION_SORT: usize = 20;
if intrinsics::likely(len <= MAX_LEN_ALWAYS_INSERTION_SORT) {
insertion_sort_shift_left(v, 1, is_less);
return;
}

ipnsort(v, is_less);
ipnsort(v, is_less);
}
}
}

/// See [`sort`]
///
/// Deliberately don't inline the main sorting routine entrypoint to ensure the
/// inlined insertion sort i-cache footprint remains minimal.
#[cfg(not(feature = "optimize_for_size"))]
#[inline(never)]
fn ipnsort<T, F>(v: &mut [T], is_less: &mut F)
where
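For completeness, the public entry point whose behavior this gates is `slice::sort_unstable`; under `optimize_for_size` (or on 16-bit targets) it now goes straight to the heapsort shown earlier instead of ipnsort:

```rust
fn main() {
    let mut v = [5i32, 3, 9, 1, 7];
    // Same observable result on every configuration; only the algorithm
    // (and the binary size / speed trade-off) behind it changes.
    v.sort_unstable();
    assert_eq!(v, [1, 3, 5, 7, 9]);
}
```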