From 9ad948a391244fd8beeb7534d6e0f59f21969964 Mon Sep 17 00:00:00 2001 From: Benjamin Desef Date: Sat, 1 Jun 2024 20:20:28 +0200 Subject: [PATCH 1/9] Add QuadSort and BlitSort --- src/BlitSort.jl | 468 +++++++++++++++++++ src/QuadSort.jl | 951 +++++++++++++++++++++++++++++++++++++++ src/SortingAlgorithms.jl | 37 ++ 3 files changed, 1456 insertions(+) create mode 100644 src/BlitSort.jl create mode 100644 src/QuadSort.jl diff --git a/src/BlitSort.jl b/src/BlitSort.jl new file mode 100644 index 0000000..2724458 --- /dev/null +++ b/src/BlitSort.jl @@ -0,0 +1,468 @@ +export BlitSort + +struct BlitSortAlg <: Algorithm end + +const BlitSort = maybe_optimize(BlitSortAlg()) + +const BLIT_AUX = 512 # set to 0 for sqrt(n) cache size +const BLIT_OUT = 96 # should be smaller or equal to BLIT_AUX + +function blit_analyze!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, nmemb::UInt, cmp) + @inbounds begin + half1 = nmemb ÷ 2 + quad1 = half1 ÷ 2 + quad2 = half1 - quad1 + half2 = nmemb - half1 + quad3 = half2 ÷ 2 + quad4 = half2 - quad3 + + pta = array_index + ptb = array_index + asInt(quad1) + ptc = array_index + asInt(half1) + ptd = array_index + asInt(half1 + quad3) + + astreaks = bstreaks = cstreaks = dstreaks = zero(UInt) + abalance = bbalance = cbalance = dbalance = zero(UInt) + + cnt = nmemb + while cnt > 132 + asum::UInt8 = bsum::UInt8 = csum::UInt8 = dsum::UInt8 = 0 + for _ in 32:-1:1 + asum += cmp(array[pta], array[pta+1]) > 0; pta += 1 + bsum += cmp(array[ptb], array[ptb+1]) > 0; ptb += 1 + csum += cmp(array[ptc], array[ptc+1]) > 0; ptc += 1 + dsum += cmp(array[ptd], array[ptd+1]) > 0; ptd += 1 + end + abalance += asum; astreaks += asum = (asum == 0) | (asum == 32) + bbalance += bsum; bstreaks += bsum = (bsum == 0) | (bsum == 32) + cbalance += csum; cstreaks += csum = (csum == 0) | (csum == 32) + dbalance += dsum; dstreaks += dsum = (dsum == 0) | (dsum == 32) + + if cnt > 516 && asum + bsum + csum + dsum == 0 + abalance += 48; pta += 96 + bbalance += 48; ptb += 96 + cbalance += 48; ptc += 96 + dbalance += 48; ptd += 96 + cnt -= 384 + end + + cnt -= 128 + end + + for _ in cnt:-4:UInt(8) + abalance += cmp(array[pta], array[pta+1]) > 0; pta += 1 + bbalance += cmp(array[ptb], array[ptb+1]) > 0; ptb += 1 + cbalance += cmp(array[ptc], array[ptc+1]) > 0; ptc += 1 + dbalance += cmp(array[ptd], array[ptd+1]) > 0; ptd += 1 + end + + quad1 < quad2 && (bbalance += cmp(array[ptb], array[ptb+1]) > 0; ptb += 1) + quad1 < quad3 && (cbalance += cmp(array[ptc], array[ptc+1]) > 0; ptc += 1) + quad1 < quad4 && (dbalance += cmp(array[ptd], array[ptd+1]) > 0; ptd += 1) + + cnt = abalance + bbalance + cbalance + dbalance + + cnt == 0 && cmp(array[pta], array[pta+1]) ≤ 0 && + cmp(array[ptb], array[ptb+1]) ≤ 0 && cmp(array[ptc], array[ptc+1]) ≤ 0 && return + + abool = quad1 - abalance == 1 + bbool = quad2 - bbalance == 1 + cbool = quad3 - cbalance == 1 + dbool = quad4 - dbalance == 1 + + if abool | bbool | cbool | dbool + span1 = (abool && bbool) * (cmp(array[pta], array[pta+1]) > 0) + span2 = (bbool && cbool) * (cmp(array[ptb], array[ptb+1]) > 0) + span3 = (cbool && dbool) * (cmp(array[ptc], array[ptc+1]) > 0) + + tmp = span1 | span2 * 2 | span3 * 4 + if tmp == 1 + quad_reversal!(array, array_index, ptb) + abalance = bbalance = 0 + elseif tmp == 2 + quad_reversal!(array, pta + 1, ptc) + bbalance = cbalance = 0 + elseif tmp == 3 + quad_reversal!(array, array_index, ptc) + abalance = bbalance = cbalance = 0 + elseif tmp == 4 + quad_reversal!(array, ptb + 1, ptd) + cbalance = dbalance = 0 + elseif tmp == 5 + quad_reversal!(array, array_index, ptb) + quad_reversal!(array, ptb + 1, ptd) + abalance = bbalance = cbalance = dbalance = 0 + elseif tmp == 6 + quad_reversal!(array, pta + 1, ptd) + bbalance = cbalance = dbalance = 0 + elseif tmp == 7 + quad_reversal!(array, array_index, ptd) + return + end + + abool && !iszero(abalance) && (quad_reversal!(array, array_index, pta); abalance = 0) + bbool && !iszero(bbalance) && (quad_reversal!(array, pta + 1, ptb); bbalance = 0) + cbool && !iszero(cbalance) && (quad_reversal!(array, ptb + 1, ptc); cbalance = 0) + dbool && !iszero(dbalance) && (quad_reversal!(array, ptc + 1, ptd); dbalance = 0) + end + + cnt = nmemb ÷ 256 # more than 50% ordered + abool = astreaks > cnt + bbool = bstreaks > cnt + cbool = cstreaks > cnt + dbool = dstreaks > cnt + + tmp = abool + 2bbool + 4cbool + 8dbool + if tmp == 0 + blit_partition!(array, array_index, swap, swap_index, swap_size, nmemb, cmp) + return + elseif tmp == 1 + iszero(abalance) || quadsort_swap!(array, array_index, swap, swap_index, swap_size, quad1, cmp) + blit_partition!(array, pta + 1, swap, swap_index, swap_size, quad2 + half2, cmp) + elseif tmp == 2 + blit_partition!(array, array_index, swap, swap_index, swap_size, quad1, cmp) + iszero(bbalance) || quadsort_swap!(array, pta + 1, swap, swap_index, swap_size, quad2, cmp) + blit_partition!(array, ptb + 1, swap, swap_index, swap_size, half2, cmp) + elseif tmp == 3 + iszero(abalance) || quadsort_swap!(array, array_index, swap, swap_index, swap_size, quad1, cmp) + iszero(bbalance) || quadsort_swap!(array, pta + 1, swap, swap_index, swap_size, quad2, cmp) + blit_partition!(array, ptb + 1, swap, swap_index, swap_size, half2, cmp) + elseif tmp == 4 + blit_partition!(array, array_index, swap, swap_index, swap_size, half1, cmp) + iszero(cbalance) || quadsort_swap!(array, ptb + 1, swap, swap_index, swap_size, quad3, cmp) + blit_partition!(array, ptc + 1, swap, swap_index, swap_size, quad4, cmp) + elseif tmp == 8 + blit_partition!(array, array_index, swap, swap_index, swap_size, half1 + quad3, cmp) + iszero(dbalance) || quadsort_swap!(array, ptc + 1, swap, swap_index, swap_size, quad4, cmp) + elseif tmp == 9 + iszero(abalance) || quadsort_swap!(array, array_index, swap, swap_index, swap_size, quad1, cmp) + blit_partition!(array, pta + 1, swap, swap_index, swap_size, quad2 + quad3, cmp) + iszero(dbalance) || quadsort_swap!(array, ptc + 1, swap, swap_index, swap_size, quad4, cmp) + elseif tmp == 12 + blit_partition!(array, array_index, swap, swap_index, swap_size, half1, cmp) + iszero(cbalance) || quadsort_swap!(array, ptb + 1, swap, swap_index, swap_size, quad3, cmp) + iszero(dbalance) || quadsort_swap!(array, ptc + 1, swap, swap_index, swap_size, quad4, cmp) + else + if abool + iszero(abalance) || quadsort_swap!(array, array_index, swap, swap_index, swap_size, quad1, cmp) + else + blit_partition!(array, array_index, swap, swap_index, swap_size, quad1, cmp) + end + if bbool + iszero(bbalance) || quadsort_swap!(array, pta + 1, swap, swap_index, swap_size, quad2, cmp) + else + blit_partition!(array, pta + 1, swap, swap_index, swap_size, quad2, cmp) + end + if cbool + iszero(cbalance) || quadsort_swap!(array, ptb + 1, swap, swap_index, swap_size, quad3, cmp) + else + blit_partition!(array, ptb + 1, swap, swap_index, swap_size, quad3, cmp) + end + if dbool + iszero(dbalance) || quadsort_swap!(array, ptc + 1, swap, swap_index, swap_size, quad4, cmp) + else + blit_partition!(array, ptc + 1, swap, swap_index, swap_size, quad4, cmp) + end + end + + if cmp(array[pta], array[pta+1]) ≤ 0 + if cmp(array[ptc], array[ptc+1]) ≤ 0 + cmp(array[ptb], array[ptb+1]) ≤ 0 && return + else + rotate_merge_block!(array, array_index + asInt(half1), swap, swap_index, swap_size, quad3, quad4, cmp) + end + else + rotate_merge_block!(array, array_index, swap, swap_index, swap_size, quad1, quad2, cmp) + cmp(array[ptc], array[ptc+1]) > 0 && + rotate_merge_block!(array, array_index + asInt(half1), swap, swap_index, swap_size, quad3, quad4, cmp) + end + rotate_merge_block!(array, array_index, swap, swap_index, swap_size, half1, half2, cmp) + end +end + +# The next 4 functions are used for pivot selection + +function blit_binary_median(arraya, pta::Int, arrayb, ptb::Int, len::UInt, cmp) + @inbounds begin + while !iszero(len ÷= 2) + leni = asInt(len) + if cmp(arraya[pta+leni], arrayb[ptb+leni]) ≤ 0 + pta += leni + else + ptb += leni + end + end + aa = arraya[pta] + bb = arrayb[ptb] + return ifelse(cmp(aa, bb) > 0, aa, bb) + end +end + +function blit_trim_four!(array, pta::Int, cmp) + @inbounds begin + x = cmp(array[pta], array[pta+1]) > 0 + array[pta], array[pta+1] = array[pta+x], array[pta+!x] + pta += 2 + + x = cmp(array[pta], array[pta+1]) > 0 + array[pta], array[pta+1] = array[pta+x], array[pta+!x] + pta -= 2 + + x = 2(cmp(array[pta], array[pta+2]) ≤ 0) + array[pta+2] = array[pta+x] + pta += 1 + x = 2(cmp(array[pta], array[pta+2]) > 0) + array[pta] = array[pta+x] + end +end + +function blit_median_of_nine!(array, array_index::Int, swap, swap_index::Int, nmemb::UInt, cmp) + @inbounds begin + z = asInt(nmemb ÷ 9) + + pta = array_index + + for x in swap_index:swap_index+8 + swap[x] = array[pta] + pta += z + end + + blit_trim_four!(swap, swap_index, cmp) + blit_trim_four!(swap, swap_index + 4, cmp) + + swap[swap_index] = swap[swap_index+5] + swap[swap_index+3] = swap[swap_index+8] + + blit_trim_four!(swap, swap_index, cmp) + + swap[swap_index] = swap[swap_index+6] + + x = cmp(swap[swap_index], swap[swap_index+1]) > 0 + y = cmp(swap[swap_index], swap[swap_index+2]) > 0 + z = cmp(swap[swap_index+1], swap[swap_index+2]) > 0 + + return swap[swap_index + (x == y) + (y ⊻ z)] + end +end + +function blit_median_of_cbrt!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, nmemb::UInt, cmp) + @inbounds begin + cbrt = UInt(32) # TODO: figure out how to write this more efficiently using bsr + while nmemb > cbrt * cbrt * cbrt && cbrt < swap_size + cbrt *= 2 + end + + div = asInt(nmemb ÷ cbrt) + + pta = array_index + pts = swap_index + + for cnt in 0:Core.bitcast(Int, cbrt)-1 + swap[pts+cnt] = array[pta] + pta += div + end + pta = pts + ptb = pts + asInt(cbrt ÷ 2) + + for cnt in cbrt÷8:-1:1 + blit_trim_four!(swap, pta, cmp) + blit_trim_four!(swap, ptb, cmp) + + swap[pta] = swap[ptb+1] + swap[pta+3] = swap[ptb+2] + + pta += 4 + ptb += 4 + end + cbrt ÷= 4; + + quadsort_swap!(swap, pts, swap, pts + 2asInt(cbrt), cbrt, cbrt, cmp) + quadsort_swap!(swap, pts + asInt(cbrt), swap, pts + 2asInt(cbrt), cbrt, cbrt, cmp) + + return cmp(swap[pts+2asInt(cbrt)-1], swap[pts]) ≤ 0, + blit_binary_median(swap, pts, swap, pts + asInt(cbrt), cbrt, cmp) + end +end + +# As per suggestion by Marshall Lochbaum to improve generic data handling +function blit_reverse_partition!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, piv, nmemb::UInt, cmp) + @inbounds begin + if nmemb > swap_size + h = nmemb ÷ 2; + + l = blit_reverse_partition!(array, array_index, swap, swap_index, swap_size, piv, h, cmp) + r = blit_reverse_partition!(array, array_index + asInt(h), swap, swap_index, swap_size, piv, nmemb - h, cmp) + + trinity_rotation!(array, array_index + asInt(l), swap, swap_index, swap_size, h - l + r, h - l) + + return l + r + end + + ptx = array_index + pta = array_index + pts = swap_index + + for _ in nmemb÷4:-1:1 + @unroll 4 begin + if cmp(piv, array[ptx]) > 0 + array[pta] = array[ptx] + pta += 1 + else + swap[pts] = array[ptx] + pts += 1 + end + ptx += 1 + end + end + + for _ in nmemb%4:-1:1 + if cmp(piv, array[ptx]) > 0 + array[pta] = array[ptx] + pta += 1 + else + swap[pts] = array[ptx] + pts += 1 + end + ptx += 1 + end + m = pta - array_index + + _unsafe_copyto!(array, pta, swap, swap_index, nmemb - m) + + return asUInt(m) + end +end + +function blit_default_partition!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, piv, nmemb::UInt, cmp) + @inbounds begin + if nmemb > swap_size + h = nmemb ÷ 2 + + l = blit_default_partition!(array, array_index, swap, swap_index, swap_size, piv, h, cmp) + r = blit_default_partition!(array, array_index + asInt(h), swap, swap_index, swap_size, piv, nmemb - h, cmp) + + trinity_rotation!(array, array_index + asInt(l), swap, swap_index, swap_size, h - l + r, h - l) + + return l + r + end + + ptx = array_index + pta = array_index + pts = swap_index + + for _ in nmemb÷4:-1:one(UInt) + @unroll 4 begin + if cmp(array[ptx], piv) ≤ 0 + array[pta] = array[ptx] + pta += 1 + else + swap[pts] = array[ptx] + pts += 1 + end + ptx += 1 + end + end + + for _ in nmemb%4:-1:one(UInt) + if cmp(array[ptx], piv) ≤ 0 + array[pta] = array[ptx] + pta += 1 + else + swap[pts] = array[ptx] + pts += 1 + end + ptx += 1 + end + m = pta - array_index + + _unsafe_copyto!(array, pta, swap, swap_index, nmemb - m) + + return asUInt(m) + end +end + +function blit_partition!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, nmemb::UInt, cmp) + @inbounds begin + a_size = zero(UInt) + local max + + while true + if nmemb ≤ 2048 + piv = blit_median_of_nine!(array, array_index, swap, swap_index, nmemb, cmp) + else + generic, piv = blit_median_of_cbrt!(array, array_index, swap, swap_index, swap_size, nmemb, cmp) + + if generic + quadsort_swap!(array, array_index, swap, swap_index, swap_size, nmemb, cmp) + return + end + end + + if !iszero(a_size) && cmp(max, piv) ≤ 0 + a_size = blit_reverse_partition!(array, array_index, swap, swap_index, swap_size, piv, nmemb, cmp) + s_size = nmemb - a_size + + if s_size ≤ a_size÷16 || a_size <= BLIT_OUT + quadsort_swap!(array, array_index, swap, swap_index, swap_size, a_size, cmp) + return + end + nmemb = a_size + a_size = zero(UInt) + continue + end + + a_size = blit_default_partition!(array, array_index, swap, swap_index, swap_size, piv, nmemb, cmp) + s_size = nmemb - a_size + + if a_size ≤ s_size÷16 || s_size ≤ BLIT_OUT + if iszero(s_size) + a_size = blit_reverse_partition!(array, array_index, swap, swap_index, swap_size, piv, a_size, cmp) + s_size = nmemb - a_size + + if s_size ≤ a_size÷16 || a_size ≤ BLIT_OUT + return quadsort_swap!(array, array_index, swap, swap_index, swap_size, a_size, cmp) + end + nmemb = a_size + a_size = zero(UInt) + continue + end + quadsort_swap!(array, array_index + asInt(a_size), swap, swap_index, swap_size, s_size, cmp) + else + blit_partition!(array, array_index + asInt(a_size), swap, swap_index, swap_size, s_size, cmp) + end + + if s_size ≤ a_size÷16 || a_size ≤ BLIT_OUT + quadsort_swap!(array, array_index, swap, swap_index, swap_size, a_size, cmp) + return + end + nmemb = a_size + max = piv + end + end +end + +function sort!(array::AbstractVector, lo::Int, hi::Int, ::BlitSortAlg, o::Ordering) + len = UInt(hi - lo +1) + if len ≤ 132 + sort!(array, lo, hi, QuadSortAlg(), o) + else + cmp = let o=o; (x, y) -> lt(o, y, x) end + + if !iszero(BLIT_AUX) + swap_size = UInt(BLIT_AUX) + else + swap_size = one(UInt) << 19 + + while len ÷ swap_size < swap_size ÷ 128 + swap_size ÷= 4 + end + end + @with_stackvec swap Int(swap_size) eltype(array) begin + blit_analyze!(array, lo, swap, firstindex(swap), swap_size, len, cmp) + end + end + return array +end + +blitsort_swap!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, nmemb::UInt, cmp) = + (nmemb ≤ 132 ? quadsort_swap! : blit_analyze!)(array, array_index, swap, swap_index, swap_size, nmemb, cmp) \ No newline at end of file diff --git a/src/QuadSort.jl b/src/QuadSort.jl new file mode 100644 index 0000000..9c841e5 --- /dev/null +++ b/src/QuadSort.jl @@ -0,0 +1,951 @@ +export QuadSort + +struct QuadSortAlg <: Algorithm end + +const QuadSort = maybe_optimize(QuadSortAlg()) + +# We remove all the branchless stuff. It actually doesn't do any good (we cannot do the clang-branchless optimization without +# introducing a lot of references, which makes everything much slower; and if we use the gcc-branchless workaround, the +# benchmarked performance is slightly worse). +# While branchless avoids branch mispredictions, we still have dependency chains; and Julia might sometimes decide to remove +# the branch anyway, but mimicking it with ifelse makes it harder for the compiler to do any optimization. +macro head_branchless_merge(arrayd, ptd, arrayl, ptl, arrayr, ptr, cmp) + esc(quote + let pl=$arrayl[$ptl], pr=$arrayr[$ptr] + $arrayd[$ptd] = if $cmp(pl, pr) ≤ 0 + $ptl += 1 + pl + else + $ptr += 1 + pr + end + end + $ptd += 1 + end) +end + +macro tail_branchless_merge(arrayd, tpd, arrayl, tpl, arrayr, tpr, cmp) + esc(quote + let pl=$arrayl[$tpl], pr=$arrayr[$tpr] + $arrayd[$tpd] = if $cmp(pl, pr) > 0 + $tpl -= 1 + pl + else + $tpr -= 1 + pr + end + end + $tpd -= 1 + end) +end + +macro getcmp(array, indexp1, fn, indexp2, cmp) + esc(:(let a1=$array[$indexp1], a2=$array[$indexp2] + ifelse($fn($cmp(a1, a2), 0), a1, a2) # here we keep it branchless, giving a minimal advantage over ternary + end)) +end + +@inline function parity_merge_two!(array, array_index::Int, swap, swap_index::Int, cmp) + @inbounds begin + ptl = array_index; ptr = array_index + 2; pts = swap_index + @head_branchless_merge(swap, pts, array, ptl, array, ptr, cmp) + swap[pts] = @getcmp(array, ptl, ≤, ptr, cmp) + ptl = array_index + 1; ptr = array_index + 3; pts = swap_index + 3 + @tail_branchless_merge(swap, pts, array, ptl, array, ptr, cmp) + swap[pts] = @getcmp(array, ptl, >, ptr, cmp) + nothing + end +end + +@inline function parity_merge_four!(array, array_index::Int, swap, swap_index::Int, cmp) + @inbounds begin + ptl = array_index; ptr = array_index + 4; pts = swap_index + @head_branchless_merge(swap, pts, array, ptl, array, ptr, cmp) + @head_branchless_merge(swap, pts, array, ptl, array, ptr, cmp) + @head_branchless_merge(swap, pts, array, ptl, array, ptr, cmp) + swap[pts] = @getcmp(array, ptl, ≤, ptr, cmp) + ptl = array_index + 3; ptr = array_index + 7; pts = swap_index + 7 + @tail_branchless_merge(swap, pts, array, ptl, array, ptr, cmp) + @tail_branchless_merge(swap, pts, array, ptl, array, ptr, cmp) + @tail_branchless_merge(swap, pts, array, ptl, array, ptr, cmp) + swap[pts] = @getcmp(array, ptl, >, ptr, cmp) + nothing + end +end + +@inline function swap_branchless!(array, array_index::Int, cmp, + x::Bool=@inbounds(cmp(array[array_index], array[array_index+1]) > 0)) + y = !x + @inbounds array[array_index], array[array_index+1] = array[array_index+x], array[array_index+y] + nothing +end + +# the next seven functions are used for sorting 0 to 31 elements +function tiny_sort!(array, array_index::Int, nmemb::UInt, cmp) + @inbounds if nmemb == 4 + # This is really just @inline quad_swap_four!(array, array_index, cmp) + # However, for some weird reason, the profiler reports lots of GC and dynamic dispatch going on in either tiny_sort! or + # quad_swap_four! unless we copy the code verbatim. Very strange and cannot be verified in the assembly code; however, + # by inlining manually, it actually runs faster. + swap_branchless!(array, array_index, cmp) + swap_branchless!(array, array_index +2, cmp) + array_index += 1 + + a0 = array[array_index] + a1 = array[array_index+1] + if cmp(a0, a1) > 0 + array[array_index] = a1 + array[array_index+1] = a0 + array_index -= 1 + + swap_branchless!(array, array_index, cmp) + swap_branchless!(array, array_index +2, cmp) + swap_branchless!(array, array_index +1, cmp) + end + elseif nmemb == 3 + swap_branchless!(array, array_index, cmp) + swap_branchless!(array, array_index +1, cmp) + swap_branchless!(array, array_index, cmp) + elseif nmemb == 2 + swap_branchless!(array, array_index, cmp) + end +end + +# this function requires a minimum offset of 2 to work properly +function twice_unguarded_insert!(array, array_index::Int, offset::UInt, nmemb::UInt, cmp) + @inbounds for i in asInt(offset):asInt(nmemb)-1 + end_ = array_index + i + pta = end_ + cmp(array[pta -= 1], array[end_]) ≤ 0 && continue + key = array[end_] + if cmp(array[array_index+1], key) > 0 + top = i -1 + while true + array[end_] = array[pta]; pta -= 1; end_ -= 1 + iszero(top -= 1) && break + end + array[end_] = key + end_ -= 1 + else + while true + array[end_] = array[pta]; pta -= 1; end_ -= 1 + array[end_] = array[pta]; pta -= 1; end_ -= 1 + cmp(array[pta], key) > 0 || break + end + array[end_] = array[end_+1] + array[end_+1] = key + end + swap_branchless!(array, end_, cmp) + end +end + +function quad_swap_four!(array, array_index::Int, cmp) + @inbounds begin + swap_branchless!(array, array_index, cmp) + swap_branchless!(array, array_index +2, cmp) + array_index += 1 + + a0 = array[array_index] + a1 = array[array_index+1] + if cmp(a0, a1) > 0 + array[array_index] = a1 + array[array_index+1] = a0 + array_index -= 1 + + swap_branchless!(array, array_index, cmp) + swap_branchless!(array, array_index +2, cmp) + swap_branchless!(array, array_index +1, cmp) + end + nothing + end +end + +function parity_swap_eight!(array, array_index::Int, swap, swap_index::Int, cmp) + @inbounds begin + swap_branchless!(array, array_index, cmp) + swap_branchless!(array, array_index +2, cmp) + swap_branchless!(array, array_index +4, cmp) + swap_branchless!(array, array_index +6, cmp) + + cmp(array[array_index+1], array[array_index+2]) ≤ 0 && + cmp(array[array_index+3], array[array_index+4]) ≤ 0 && + cmp(array[array_index+5], array[array_index+6]) ≤ 0 && return + parity_merge_two!(array, array_index, swap, swap_index, cmp) + parity_merge_two!(array, array_index +4, swap, swap_index +4, cmp) + parity_merge_four!(swap, swap_index, array, array_index, cmp) + end +end + +# left must be equal or one smaller than right +function parity_merge!(dest, dest_index::Int, from, from_index::Int, left::UInt, right::UInt, cmp) + @inbounds begin + ptl = from_index + ptr = from_index + asInt(left) + ptd = dest_index + tpl = ptr -1 + tpr = tpl + asInt(right) + tpd = dest_index + asInt(left + right) -1 + left < right && @head_branchless_merge(dest, ptd, from, ptl, from, ptr, cmp) + @head_branchless_merge(dest, ptd, from, ptl, from, ptr, cmp) + while !iszero(left -= 1) + @head_branchless_merge(dest, ptd, from, ptl, from, ptr, cmp) + @tail_branchless_merge(dest, tpd, from, tpl, from, tpr, cmp) + end + dest[tpd] = @getcmp(from, tpl, >, tpr, cmp) + nothing + end +end + +function parity_swap_sixteen!(array, array_index::Int, swap, swap_index::Int, cmp) + @inbounds begin + quad_swap_four!(array, array_index, cmp) + quad_swap_four!(array, array_index +4, cmp) + quad_swap_four!(array, array_index +8, cmp) + quad_swap_four!(array, array_index +12, cmp) + cmp(array[array_index+3], array[array_index+4]) ≤ 0 && + cmp(array[array_index+7], array[array_index+8]) ≤ 0 && + cmp(array[array_index+11], array[array_index+12]) ≤ 0 && return + parity_merge_four!(array, array_index, swap, swap_index, cmp) + parity_merge_four!(array, array_index +8, swap, swap_index +8, cmp) + parity_merge!(array, array_index, swap, swap_index, UInt(8), UInt(8), cmp) + end +end + +function tail_swap!(array, array_index::Int, swap, swap_index::Int, nmemb::UInt, cmp) + @inbounds begin + if nmemb < 5 + tiny_sort!(array, array_index, nmemb, cmp) + return + end + if nmemb < 8 + quad_swap_four!(array, array_index, cmp) + twice_unguarded_insert!(array, array_index, UInt(4), nmemb, cmp) + return + end + if nmemb < 12 + parity_swap_eight!(array, array_index, swap, swap_index, cmp) + twice_unguarded_insert!(array, array_index, UInt(8), nmemb, cmp) + return + end + if 16 ≤ nmemb < 24 + parity_swap_sixteen!(array, array_index, swap, swap_index, cmp) + twice_unguarded_insert!(array, array_index, UInt(16), nmemb, cmp) + return + end + half1 = asInt(nmemb ÷ 2) + quad1 = half1 ÷ 2 + quad2 = half1 - quad1 + + half2 = asInt(nmemb - half1) + quad3 = half2 ÷ 2 + quad4 = half2 - quad3 + + pta = array_index + tail_swap!(array, pta, swap, swap_index, asUInt(quad1), cmp); pta += quad1 + tail_swap!(array, pta, swap, swap_index, asUInt(quad2), cmp); pta += quad2 + tail_swap!(array, pta, swap, swap_index, asUInt(quad3), cmp); pta += quad3 + tail_swap!(array, pta, swap, swap_index, asUInt(quad4), cmp) + + cmp(array[array_index+quad1-1], array[array_index+quad1]) ≤ 0 && + cmp(array[array_index+half1-1], array[array_index+half1]) ≤ 0 && + cmp(array[pta-1], array[pta]) ≤ 0 && return + + parity_merge!(swap, swap_index, array, array_index, asUInt(quad1), asUInt(quad2), cmp) + parity_merge!(swap, swap_index + half1, array, array_index + half1, asUInt(quad3), asUInt(quad4), cmp) + parity_merge!(array, array_index, swap, swap_index, asUInt(half1), asUInt(half2), cmp) + end +end + +# the next three functions create sorted blocks of 32 elements +function quad_reversal!(array, pta::Int, ptz::Int) + @inbounds begin + loop = (ptz - pta) ÷ 2 + + ptb = pta + loop + pty = ptz - loop + if iseven(loop) + array[ptb], array[pty] = array[pty], array[ptb] + ptb -= 1; pty += 1 + loop -= 1 + end + loop ÷= 2 + while true + array[pta], array[ptz] = array[ptz], array[pta] + pta += 1; ptz -= 1 + array[ptb], array[pty] = array[pty], array[ptb] + ptb -= 1; pty += 1 + + iszero(loop) && break + loop -= 1 + end + nothing + end +end + +function quad_swap_merge!(array, array_index::Int, swap, swap_index::Int, cmp) + parity_merge_two!(array, array_index, swap, swap_index, cmp) + parity_merge_two!(array, array_index +4, swap, swap_index +4, cmp) + parity_merge_four!(swap, swap_index, array, array_index, cmp) +end + +function quad_swap!(array, array_index::Int, nmemb::UInt, cmp) + @inbounds(@with_stackvec swap 32 eltype(array) begin + pta = array_index + count = nmemb ÷ 8 + while !iszero(count) + count -= 1 + v1 = cmp(array[pta], array[pta+1]) > 0 + v2 = cmp(array[pta+2], array[pta+3]) > 0 + v3 = cmp(array[pta+4], array[pta+5]) > 0 + v4 = cmp(array[pta+6], array[pta+7]) > 0 + + tmp = v1 + 2v2 + 4v3 + 8v4 + if tmp == 0 + cmp(array[pta+1], array[pta+2]) ≤ 0 && + cmp(array[pta+3], array[pta+4]) ≤ 0 && + cmp(array[pta+5], array[pta+6]) ≤ 0 && + @goto ordered + quad_swap_merge!(array, pta, swap, firstindex(swap), cmp) + else + if tmp == 15 && cmp(array[pta+1], array[pta+2]) > 0 && + cmp(array[pta+3], array[pta+4]) > 0 && + cmp(array[pta+5], array[pta+6]) > 0 + pts = pta + @goto reversed + end + + @label not_ordered + swap_branchless!(array, pta, cmp, v1) + swap_branchless!(array, pta +2, cmp, v2) + swap_branchless!(array, pta +4, cmp, v3) + swap_branchless!(array, pta +6, cmp, v4) + + quad_swap_merge!(array, pta, swap, firstindex(swap), cmp) + end + + pta += 8 + continue + + @label ordered + pta += 8 + if !iszero(count) + count -= 1 + if (v1 = cmp(array[pta], array[pta+1]) > 0) | (v2 = cmp(array[pta+2], array[pta+3]) > 0) | + (v3 = cmp(array[pta+4], array[pta+5]) > 0) | (v4 = cmp(array[pta+6], array[pta+7]) > 0) + if v1 + v2 + v3 + v4 == 4 && cmp(array[pta+1], array[pta+2]) > 0 && + cmp(array[pta+3], array[pta+4]) > 0 && cmp(array[pta+5], array[pta+6]) > 0 + pts = pta + @goto reversed + end + @goto not_ordered + end + cmp(array[pta+1], array[pta+2]) ≤ 0 && cmp(array[pta+3], array[pta+4]) ≤ 0 && + cmp(array[pta+5], array[pta+6]) ≤ 0 && @goto ordered + quad_swap_merge!(array, pta, swap, firstindex(swap), cmp) + pta += 8 + continue + else + count -= 1 + end + break + + @label reversed + pta += 8 + if !iszero(count) + count -= 1 + if !((v1 = cmp(array[pta], array[pta+1]) ≤ 0) | (v2 = cmp(array[pta+2], array[pta+3]) ≤ 0) | + (v3 = cmp(array[pta+4], array[pta+5]) ≤ 0) | (v4 = cmp(array[pta+6], array[pta+7]) ≤ 0)) + cmp(array[pta-1], array[pta]) > 0 && cmp(array[pta+1], array[pta+2]) > 0 && + cmp(array[pta+3], array[pta+4]) > 0 && cmp(array[pta+5], array[pta+6]) > 0 && + @goto reversed + end + quad_reversal!(array, pts, pta -1) + v1 + v2 + v3 + v4 == 4 && cmp(array[pta+1], array[pta+2]) ≤ 0 && + cmp(array[pta+3], array[pta+4]) ≤ 0 && cmp(array[pta+5], array[pta+6]) ≤ 0 && + @goto ordered + if v1 + v2 + v3 + v4 == 0 && cmp(array[pta+1], array[pta+2]) > 0 && + cmp(array[pta+3], array[pta+4]) > 0 && cmp(array[pta+5], array[pta+6]) > 0 + pts = pta + @goto reversed + end + + swap_branchless!(array, pta, cmp, !v1) + swap_branchless!(array, pta +2, cmp, !v2) + swap_branchless!(array, pta +4, cmp, !v3) + swap_branchless!(array, pta +6, cmp, !v4) + + if cmp(array[pta+1], array[pta+2]) > 0 || + cmp(array[pta+3], array[pta+4]) > 0 || + cmp(array[pta+5], array[pta+6]) > 0 + quad_swap_merge!(array, pta, swap, firstindex(swap), cmp) + end + pta += 8 + continue + else + count -= 1 + end + + nmembrem = asInt(nmemb % 8) # subtracting -1 should give something negative + doit = true + for δ in nmembrem-1:-1:0 + if cmp(array[pta+δ-1], array[pta+δ]) ≤ 0 + doit = false + break + end + end + if doit + quad_reversal!(array, pts, pta + nmembrem -1) + pts == array_index && return true + @goto reverse_end + end + quad_reversal!(array, pts, pta -1) + break + end + tail_swap!(array, pta, swap, firstindex(swap), nmemb % 8, cmp) + + @label reverse_end + + pta = array_index + count = nmemb ÷ 32 + while !iszero(count) + count -= 1 + cmp(array[pta+7], array[pta+8]) ≤ 0 && + cmp(array[pta+15], array[pta+16]) ≤ 0 && + cmp(array[pta+23], array[pta+24]) ≤ 0 && continue + parity_merge!(swap, firstindex(swap), array, pta, UInt(8), UInt(8), cmp) + parity_merge!(swap, firstindex(swap) +16, array, pta +16, UInt(8), UInt(8), cmp) + parity_merge!(array, pta, swap, firstindex(swap), UInt(16), UInt(16), cmp) + + pta += 32 + end + nmemb % 32 > 8 && tail_merge!(array, pta, swap, firstindex(swap), UInt(32), nmemb % 32, UInt(8), cmp) + return false + end) +end + +# quad merge support routines +function cross_merge!(dest, dest_index::Int, from, from_index::Int, left::UInt, right::UInt, cmp) + @inbounds begin + ptl = from_index + ptr = from_index + asInt(left) + tpl = ptr -1 + tpr = tpl + asInt(right) + + if left +1 ≥ right && right +1 ≥ left && left ≥ 32 + if cmp(from[ptl+15], from[ptr]) > 0 && + cmp(from[ptl], from[ptr+15]) ≤ 0 && + cmp(from[tpl], from[tpr-15]) > 0 && + cmp(from[tpl-15], from[tpr]) ≤ 0 + parity_merge!(dest, dest_index, from, from_index, left, right, cmp) + return + end + end + ptd = dest_index + tpd = dest_index + asInt(left + right) -1 + + while tpl - ptl > 8 && tpr - ptr > 8 + @label ptl8_ptr + if cmp(from[ptl+7], from[ptr]) ≤ 0 + _unsafe_copyto!(dest, ptd, from, ptl, 8); ptd += 8; ptl += 8 + tpl - ptl > 8 && @goto ptl8_ptr + break + end + @label ptl_ptr8 + if cmp(from[ptl], from[ptr+7]) > 0 + _unsafe_copyto!(dest, ptd, from, ptr, 8); ptd += 8; ptr += 8 + tpr - ptr > 8 && @goto ptl_ptr8 + break + end + @label tpl_tpr8 + if cmp(from[tpl], from[tpr-7]) ≤ 0 + tpd -= 8; tpr -= 8; _unsafe_copyto!(dest, tpd +1, from, tpr +1, 8) + tpr - ptr > 8 && @goto tpl_tpr8 + break + end + @label tpl8_tpr + if cmp(from[tpl-7], from[tpr]) > 0 + tpd -= 8; tpl -= 8; _unsafe_copyto!(dest, tpd +1, from, tpl +1, 8) + tpl - ptl > 8 && @goto tpl8_tpr + end + loop = 8 + while true + @head_branchless_merge(dest, ptd, from, ptl, from, ptr, cmp) + @tail_branchless_merge(dest, tpd, from, tpl, from, tpr, cmp) + iszero(loop -= 1) && break + end + end + + if cmp(from[tpl], from[tpr]) ≤ 0 + while ptl ≤ tpl + @head_branchless_merge(dest, ptd, from, ptl, from, ptr, cmp) + end + while ptr ≤ tpr + dest[ptd] = from[ptr]; ptr += 1; ptd += 1 + end + else + while ptr ≤ tpr + @head_branchless_merge(dest, ptd, from, ptl, from, ptr, cmp) + end + while ptl ≤ tpl + dest[ptd] = from[ptl]; ptl += 1; ptd += 1 + end + end + nothing + end +end + +# main memory: [A][B][C][D] +# swap memory: [A B] step 1 +# swap memory: [A B][C D] step 2 +# main memory: [A B C D] step 3 +function quad_merge_block!(array, array_index::Int, swap, swap_index::Int, block::UInt, cmp) + @inbounds begin + block_x_2 = 2block + + pt1 = array_index + asInt(block) + pt2 = pt1 + asInt(block) + pt3 = pt2 + asInt(block) + + tmp = (cmp(array[pt1-1], array[pt1]) ≤ 0) | 2(cmp(array[pt3-1], array[pt3]) ≤ 0) + if tmp == 0 + cross_merge!(swap, swap_index, array, array_index, block, block, cmp) + cross_merge!(swap, swap_index + asInt(block_x_2), array, pt2, block, block, cmp) + elseif tmp == 1 + _unsafe_copyto!(swap, swap_index, array, array_index, block_x_2) + cross_merge!(swap, swap_index + asInt(block_x_2), array, pt2, block, block, cmp) + elseif tmp == 2 + cross_merge!(swap, swap_index, array, array_index, block, block, cmp) + _unsafe_copyto!(swap, swap_index + asInt(block_x_2), array, pt2, block_x_2) + elseif tmp == 3 + cmp(array[pt2-1], array[pt2]) ≤ 0 && return + _unsafe_copyto!(swap, swap_index, array, array_index, 2block_x_2) + end + cross_merge!(array, array_index, swap, swap_index, block_x_2, block_x_2, cmp) + end +end + +function quad_merge!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, nmemb::UInt, block::UInt, cmp) + pte = array_index + asInt(nmemb) + block *= 4 + while block ≤ nmemb && block ≤ swap_size + pta = array_index + while true + quad_merge_block!(array, pta, swap, swap_index, block ÷ 4, cmp) + pta += asInt(block) + pta + asInt(block) ≤ pte || break + end + tail_merge!(array, pta, swap, swap_index, swap_size, asUInt(pte - pta), block ÷ 4, cmp) + block *= 4 + end + tail_merge!(array, array_index, swap, swap_index, swap_size, nmemb, block ÷ 4, cmp) + return block ÷ 2 +end + +function partial_forward_merge!(array, array_index::Int, swap, swap_index::Int, nmemb::UInt, block::UInt, cmp) + @inbounds begin + nmemb == block && return + + ptr = array_index + asInt(block) + tpr = array_index + asInt(nmemb) -1 + cmp(array[ptr-1], array[ptr]) ≤ 0 && return + + _unsafe_copyto!(swap, swap_index, array, array_index, block) + ptl = swap_index + tpl = swap_index + asInt(block) -1 + while ptl < tpl -1 && ptr < tpr -1 + if cmp(swap[ptl], array[ptr+1]) > 0 + array[array_index] = array[ptr] + array[array_index+1] = array[ptr+1] + ptr += 2 + array_index += 2 + elseif cmp(swap[ptl+1], array[ptr]) ≤ 0 + array[array_index] = swap[ptl] + array[array_index+1] = swap[ptl+1] + ptl += 2 + array_index += 2 + else + x = cmp(swap[ptl], array[ptr]) ≤ 0 + y = !x + array[ptr+x] = array[ptr] + ptr += 1 + array[array_index+y] = swap[ptl] + ptl += 1 + array_index += 2 + @head_branchless_merge(array, array_index, swap, ptl, array, ptr, cmp) + end + end + while ptl ≤ tpl && ptr ≤ tpr + @head_branchless_merge(array, array_index, swap, ptl, array, ptr, cmp) + end + while ptl ≤ tpl + array[array_index] = swap[ptl] + ptl += 1 + array_index += 1 + end + nothing + end +end + +function partial_backward_merge!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, nmemb::UInt, block::UInt, + cmp) + @inbounds begin + nmemb == block && return + + tpl = array_index + asInt(block) -1 + tpa = array_index + asInt(nmemb) -1 + cmp(array[tpl], array[tpl+1]) ≤ 0 && return + + right = nmemb - block + if nmemb ≤ swap_size && right ≥ 64 + cross_merge!(swap, swap_index, array, array_index, block, right, cmp) + _unsafe_copyto!(array, array_index, swap, swap_index, nmemb) + return + end + + _unsafe_copyto!(swap, swap_index, array, array_index + asInt(block), right) + tpr = swap_index + asInt(right) -1 + while tpl > array_index +16 && tpr > swap_index +16 + @label tpl_tpr16 + if cmp(array[tpl], swap[tpr-15]) ≤ 0 + loop = 16 + while true + array[tpa] = swap[tpr] + tpr -= 1 + tpa -= 1 + iszero(loop -= 1) && break + end + tpr > swap_index + 16 && @goto tpl_tpr16 + break + end + @label tpl16_tpr + if cmp(array[tpl-15], swap[tpr]) > 0 + loop = 16 + while true + array[tpa] = array[tpl] + tpl -= 1 + tpa -= 1 + iszero(loop -= 1) && break + end + tpl > array_index +16 && @goto tpl16_tpr + break + end + loop = 8 + while true + if cmp(array[tpl], swap[tpr-1]) ≤ 0 + array[tpa] = swap[tpr] + array[tpa-1] = swap[tpr-1] + tpr -= 2 + tpa -= 2 + elseif cmp(array[tpl-1], swap[tpr]) > 0 + array[tpa] = array[tpl] + array[tpa-1] = array[tpl-1] + tpl -= 2 + tpa -= 2 + else + x = cmp(array[tpl], swap[tpr]) ≤ 0 + y = !x + tpa -= 1 + array[tpa+x] = swap[tpr] + tpr -= 1 + array[tpa+y] = array[tpl] + tpl -= 1 + tpa -= 1 + @tail_branchless_merge(array, tpa, array, tpl, swap, tpr, cmp) + end + iszero(loop -= 1) && break + end + end + while tpr > swap_index +1 && tpl > array_index +1 + if cmp(array[tpl], swap[tpr-1]) ≤ 0 + array[tpa] = swap[tpr] + array[tpa-1] = swap[tpr-1] + tpr -= 2 + tpa -= 2 + elseif cmp(array[tpl-1], swap[tpr]) > 0 + array[tpa] = array[tpl] + array[tpa-1] = array[tpl-1] + tpl -= 2 + tpa -= 2 + else + x = cmp(array[tpl], swap[tpr]) ≤ 0 + y = !x + tpa -= 1 + array[tpa+x] = swap[tpr] + tpr -= 1 + array[tpa+y] = array[tpl] + tpl -= 1 + tpa -= 1 + @tail_branchless_merge(array, tpa, array, tpl, swap, tpr, cmp) + end + end + while tpr ≥ swap_index && tpl ≥ array_index + @tail_branchless_merge(array, tpa, array, tpl, swap, tpr, cmp) + end + while tpr ≥ swap_index + array[tpa] = swap[tpr] + tpr -= 1 + tpa -= 1 + end + nothing + end +end + +function tail_merge!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, nmemb::UInt, block::UInt, cmp) + pte = array_index + asInt(nmemb) + while block < nmemb && block ≤ swap_size + pta = array_index + while pta + asInt(block) < pte + if pta + 2asInt(block) < pte + partial_backward_merge!(array, pta, swap, swap_index, swap_size, 2block, block, cmp) + pta += 2asInt(block) + continue + end + partial_backward_merge!(array, pta, swap, swap_index, swap_size, asUInt(pte - pta), block, cmp) + break + end + block *= 2 + end + nothing +end + +# the next four functions provide in-place rotate merge support +function trinity_rotation!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, nmemb::UInt, left::UInt) + @inbounds begin + right = nmemb - left + if swap_size > 65536 + swap_size = UInt(65536) + end + if left < right + if left ≤ swap_size + _unsafe_copyto!(swap, swap_index, array, array_index, left) + _unsafe_copyto!(array, array_index, array, array_index + asInt(left), right) + _unsafe_copyto!(array, array_index + asInt(right), swap, swap_index, left) + else + pta = array_index + ptb = pta + asInt(left) + bridge = right - left + if bridge ≤ swap_size && bridge > 3 + ptc = pta + asInt(right) + ptd = ptc + asInt(left) + _unsafe_copyto!(swap, swap_index, array, ptb, bridge) + for _ in 1:left + array[ptc -= 1] = array[ptd -= 1] + array[ptd] = array[ptb -= 1] + end + _unsafe_copyto!(array, pta, swap, swap_index, bridge) + else + ptc = ptb + ptd = ptc + asInt(right) + for _ in 1:left÷2 + ptb -= 1 + ptd -= 1 + array[pta], array[ptb], array[ptc], array[ptd] = array[ptc], array[pta], array[ptd], array[ptb] + pta += 1 + ptc += 1 + end + for _ in 1:(ptd - ptc)÷2 + ptd -= 1 + array[pta], array[ptc], array[ptd] = array[ptc], array[ptd], array[pta] + ptc += 1 + pta += 1 + end + for _ in 1:(ptd - pta)÷2 + ptd -= 1 + array[pta], array[ptd] = array[ptd], array[pta] + pta += 1 + end + end + end + elseif right < left + if right ≤ swap_size + _unsafe_copyto!(swap, swap_index, array, array_index + asInt(left), right) + _unsafe_copyto!(array, array_index + asInt(right), array, array_index, left) + _unsafe_copyto!(array, array_index, swap, swap_index, right) + else + pta = array_index + ptb = pta + asInt(left) + bridge = left - right + if bridge ≤ swap_size && bridge > 3 + ptc = pta + asInt(right) + ptd = ptc + asInt(left) + _unsafe_copyto!(swap, swap_index, array, ptc, bridge) + for _ in 1:right + array[ptc] = array[pta] + array[pta] = array[ptb] + ptc += 1 + pta += 1 + ptb += 1 + end + _unsafe_copyto!(array, ptd - asInt(bridge), swap, swap_index, bridge) + else + ptc = ptb + ptd = ptc + asInt(right) + for _ in 1:right÷2 + ptb -= 1 + ptd -= 1 + array[pta], array[ptb], array[ptc], array[ptd] = array[ptc], array[pta], array[ptd], array[ptb] + pta += 1 + ptc += 1 + end + for _ in 1:(ptb - pta)÷2 + ptb -= 1 + ptd -= 1 + array[pta], array[ptb], array[ptd] = array[ptd], array[pta], array[ptb] + pta += 1 + end + for _ in 1:(ptd - pta)÷2 + ptd -= 1 + array[pta], array[ptd] = array[ptd], array[pta] + pta += 1 + end + end + end + else + pta = array_index + ptb = pta + asInt(left) + for _ in 1:left + array[pta], array[ptb] = array[ptb], array[pta] + pta += 1 + ptb += 1 + end + end + nothing + end +end + +function monobound_binary_first!(array, array_index::Int, value, top::UInt, cmp) + @inbounds begin + end_ = array_index + asInt(top) + while top > 1 + mid = asInt(top ÷ 2) + if cmp(value, array[end_-mid]) ≤ 0 + end_ -= mid + end + top -= mid + end + if cmp(value, array[end_-1]) ≤ 0 + end_ -= 1 + end + return asUInt(end_ - array_index) + end +end + +function rotate_merge_block!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, lblock::UInt, right::UInt, cmp) + @inbounds begin + cmp(array[array_index+asInt(lblock)-1], array[array_index+asInt(lblock)]) ≤ 0 && return + + rblock = lblock ÷ 2 + rblocki = asInt(rblock) + lblock -= rblock + lblocki = asInt(lblock) + + left = monobound_binary_first!(array, array_index + lblocki + rblocki, array[array_index+lblocki], right, cmp) + + right -= left + + # [ lblock ] [ rblock ] [ left ] [ right ] + + if !iszero(left) + if lblock + left ≤ swap_size + _unsafe_copyto!(swap, swap_index, array, array_index, lblock) + _unsafe_copyto!(swap, swap_index + lblocki, array, array_index + lblocki + rblocki, left) + _unsafe_copyto!(array, array_index + lblocki + asInt(left), array, array_index + lblocki, rblock) + + cross_merge!(array, array_index, swap, swap_index, lblock, left, cmp) + else + trinity_rotation!(array, array_index + lblocki, swap, swap_index, swap_size, rblock + left, rblock) + unbalanced = (2left < lblock) | (2lblock < left) + if unbalanced && left ≤ swap_size + partial_backward_merge!(array, array_index, swap, swap_index, swap_size, lblock + left, lblock, cmp) + elseif unbalanced && lblock ≤ swap_size + partial_forward_merge!(array, array_index, swap, swap_index, lblock + left, lblock, cmp) + else + rotate_merge_block!(array, array_index, swap, swap_index, swap_size, lblock, left, cmp) + end + end + end + + if !iszero(right) + unbalanced = (2right < rblock) | (2rblock < right) + if unbalanced && right ≤ swap_size || right + rblock ≤ swap_size + partial_backward_merge!(array, array_index + lblocki + asInt(left), swap, swap_index, swap_size, + rblock + right, rblock, cmp) + elseif unbalanced && rblock ≤ swap_size + partial_forward_merge!(array, array_index + lblocki + asInt(left), swap, swap_index, rblock + right, rblock, + cmp) + else + rotate_merge_block!(array, array_index + lblocki + asInt(left), swap, swap_index, swap_size, rblock, right, + cmp) + end + end + nothing + end +end + +function rotate_merge!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, nmemb::UInt, block::UInt, cmp) + if nmemb ≤ 2block && nmemb - block ≤ swap_size # unsigned subtraction, ensures nmemb ≥ block + partial_backward_merge!(array, array_index, swap, swap_index, swap_size, nmemb, block, cmp) + return + end + + pte = array_index + asInt(nmemb) + while block < nmemb + pta = array_index + while pta + asInt(block) < pte + if pta + 2asInt(block) < pte + rotate_merge_block!(array, pta, swap, swap_index, swap_size, block, block, cmp) + pta += 2asInt(block) + continue + end + rotate_merge_block!(array, pta, swap, swap_index, swap_size, block, pte - pta - block, cmp) + break + end + block *= 2 + end +end + +function sort!(array::AbstractVector, lo::Int, hi::Int, ::QuadSortAlg, o::Ordering) + len = UInt(hi - lo +1) + cmp = let o=o; (x, y) -> lt(o, y, x) end + + if len < 32 + @with_stackvec(swap, 32, eltype(array), + # we use a fixed size to make sure it is on the stack + tail_swap!(array, lo, swap, firstindex(swap), len, cmp) + ) + elseif !quad_swap!(array, lo, len, cmp) + swap_size = len + if swap_size ≤ 512 + @with_stackvec swapstack Int(swap_size) eltype(array) begin + block = quad_merge!(array, lo, swapstack, firstindex(swapstack), swap_size, len, UInt(32), cmp) + rotate_merge!(array, lo, swapstack, firstindex(swapstack), swap_size, len, block, cmp) + end + else + local swap + try + swap = Vector{eltype(array)}(undef, swap_size) + catch e + if e isa OutOfMemoryError + @with_stackvec stack 512 eltype(array) begin + tail_merge!(array, lo, stack, firstindex(stack), UInt(32), len, UInt(32), cmp) + rotate_merge!(array, lo, stack, firstindex(stack), UInt(32), len, UInt(64), cmp) + end + return + end + rethrow() + end + block = quad_merge!(array, lo, swap, firstindex(swap), swap_size, len, UInt(32), cmp) + rotate_merge!(array, lo, swap, firstindex(swap), swap_size, len, block, cmp) + end + end + return array +end + +function quadsort_swap!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, nmemb::UInt, cmp) + if nmemb ≤ 96 + tail_swap!(array, array_index, swap, swap_index, nmemb, cmp) + elseif !quad_swap!(array, array_index, nmemb, cmp) + block = quad_merge!(array, array_index, swap, swap_index, swap_size, nmemb, UInt(32), cmp) + rotate_merge!(array, array_index, swap, swap_index, swap_size, nmemb, block, cmp) + end +end \ No newline at end of file diff --git a/src/SortingAlgorithms.jl b/src/SortingAlgorithms.jl index f432e2c..6a79c37 100644 --- a/src/SortingAlgorithms.jl +++ b/src/SortingAlgorithms.jl @@ -932,4 +932,41 @@ function sort!(v::AbstractVector, lo::Integer, hi::Integer, ::PagedMergeSortAlg, pagedmergesort!(v, lo, hi, o, scratch, pageLocations) return v end + +# QuadSort and BlitSort +mutable struct StackSpace{N,T} + data::NTuple{N,T} + + StackSpace{N,T}() where {T,N} = new{N,T}() +end + +macro with_stackvec(name, len, eltype, body) + quote + local stackvec = StackSpace{$(esc(len)),$(esc(eltype))}() + $(esc(name)) = unsafe_wrap(Array, Ptr{$(esc(eltype))}(pointer_from_objref(stackvec)), $(esc(len))) + GC.@preserve stackvec begin + $(esc(body)) + if !isbitstype($(esc(eltype))) + for i in eachindex($(esc(name))) + Base._unsetindex!($(esc(name)), i) + end + end + end + end +end + +macro unroll(n, expr) + result = Expr(:block) + for _ in 1:n + push!(result.args, expr) + end + esc(result) +end + +asUInt(i::Int) = Core.bitcast(UInt, i) +asInt(i::UInt) = Core.bitcast(Int, i) + +include("./QuadSort.jl") +include("./BlitSort.jl") + end # module From f272d35e2f50bb7ce5192ffdc87fa332e10fb252 Mon Sep 17 00:00:00 2001 From: Benjamin Desef Date: Sat, 1 Jun 2024 20:21:05 +0200 Subject: [PATCH 2/9] Add to tests --- test/runtests.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index aab5738..1dee5b0 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -3,7 +3,7 @@ using Test using StatsBase using Random -stable_algorithms = [TimSort, RadixSort, PagedMergeSort] +stable_algorithms = [TimSort, RadixSort, PagedMergeSort, QuadSort, BlitSort] unstable_algorithms = [HeapSort, CombSort] a = rand(1:10000, 1000) From 3632725c2eafcb38b1ff0aaf3927bb15f7f40a8c Mon Sep 17 00:00:00 2001 From: Benjamin Desef Date: Sat, 1 Jun 2024 21:12:00 +0200 Subject: [PATCH 3/9] Revert back to copyto!, one of the operations will need memmove-behavior --- src/BlitSort.jl | 4 ++-- src/QuadSort.jl | 46 +++++++++++++++++++++++----------------------- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/src/BlitSort.jl b/src/BlitSort.jl index 2724458..e2964e5 100644 --- a/src/BlitSort.jl +++ b/src/BlitSort.jl @@ -327,7 +327,7 @@ function blit_reverse_partition!(array, array_index::Int, swap, swap_index::Int, end m = pta - array_index - _unsafe_copyto!(array, pta, swap, swap_index, nmemb - m) + copyto!(array, pta, swap, swap_index, nmemb - m) return asUInt(m) end @@ -375,7 +375,7 @@ function blit_default_partition!(array, array_index::Int, swap, swap_index::Int, end m = pta - array_index - _unsafe_copyto!(array, pta, swap, swap_index, nmemb - m) + copyto!(array, pta, swap, swap_index, nmemb - m) return asUInt(m) end diff --git a/src/QuadSort.jl b/src/QuadSort.jl index 9c841e5..1046a84 100644 --- a/src/QuadSort.jl +++ b/src/QuadSort.jl @@ -446,25 +446,25 @@ function cross_merge!(dest, dest_index::Int, from, from_index::Int, left::UInt, while tpl - ptl > 8 && tpr - ptr > 8 @label ptl8_ptr if cmp(from[ptl+7], from[ptr]) ≤ 0 - _unsafe_copyto!(dest, ptd, from, ptl, 8); ptd += 8; ptl += 8 + copyto!(dest, ptd, from, ptl, 8); ptd += 8; ptl += 8 tpl - ptl > 8 && @goto ptl8_ptr break end @label ptl_ptr8 if cmp(from[ptl], from[ptr+7]) > 0 - _unsafe_copyto!(dest, ptd, from, ptr, 8); ptd += 8; ptr += 8 + copyto!(dest, ptd, from, ptr, 8); ptd += 8; ptr += 8 tpr - ptr > 8 && @goto ptl_ptr8 break end @label tpl_tpr8 if cmp(from[tpl], from[tpr-7]) ≤ 0 - tpd -= 8; tpr -= 8; _unsafe_copyto!(dest, tpd +1, from, tpr +1, 8) + tpd -= 8; tpr -= 8; copyto!(dest, tpd +1, from, tpr +1, 8) tpr - ptr > 8 && @goto tpl_tpr8 break end @label tpl8_tpr if cmp(from[tpl-7], from[tpr]) > 0 - tpd -= 8; tpl -= 8; _unsafe_copyto!(dest, tpd +1, from, tpl +1, 8) + tpd -= 8; tpl -= 8; copyto!(dest, tpd +1, from, tpl +1, 8) tpl - ptl > 8 && @goto tpl8_tpr end loop = 8 @@ -511,14 +511,14 @@ function quad_merge_block!(array, array_index::Int, swap, swap_index::Int, block cross_merge!(swap, swap_index, array, array_index, block, block, cmp) cross_merge!(swap, swap_index + asInt(block_x_2), array, pt2, block, block, cmp) elseif tmp == 1 - _unsafe_copyto!(swap, swap_index, array, array_index, block_x_2) + copyto!(swap, swap_index, array, array_index, block_x_2) cross_merge!(swap, swap_index + asInt(block_x_2), array, pt2, block, block, cmp) elseif tmp == 2 cross_merge!(swap, swap_index, array, array_index, block, block, cmp) - _unsafe_copyto!(swap, swap_index + asInt(block_x_2), array, pt2, block_x_2) + copyto!(swap, swap_index + asInt(block_x_2), array, pt2, block_x_2) elseif tmp == 3 cmp(array[pt2-1], array[pt2]) ≤ 0 && return - _unsafe_copyto!(swap, swap_index, array, array_index, 2block_x_2) + copyto!(swap, swap_index, array, array_index, 2block_x_2) end cross_merge!(array, array_index, swap, swap_index, block_x_2, block_x_2, cmp) end @@ -549,7 +549,7 @@ function partial_forward_merge!(array, array_index::Int, swap, swap_index::Int, tpr = array_index + asInt(nmemb) -1 cmp(array[ptr-1], array[ptr]) ≤ 0 && return - _unsafe_copyto!(swap, swap_index, array, array_index, block) + copyto!(swap, swap_index, array, array_index, block) ptl = swap_index tpl = swap_index + asInt(block) -1 while ptl < tpl -1 && ptr < tpr -1 @@ -598,11 +598,11 @@ function partial_backward_merge!(array, array_index::Int, swap, swap_index::Int, right = nmemb - block if nmemb ≤ swap_size && right ≥ 64 cross_merge!(swap, swap_index, array, array_index, block, right, cmp) - _unsafe_copyto!(array, array_index, swap, swap_index, nmemb) + copyto!(array, array_index, swap, swap_index, nmemb) return end - _unsafe_copyto!(swap, swap_index, array, array_index + asInt(block), right) + copyto!(swap, swap_index, array, array_index + asInt(block), right) tpr = swap_index + asInt(right) -1 while tpl > array_index +16 && tpr > swap_index +16 @label tpl_tpr16 @@ -717,9 +717,9 @@ function trinity_rotation!(array, array_index::Int, swap, swap_index::Int, swap_ end if left < right if left ≤ swap_size - _unsafe_copyto!(swap, swap_index, array, array_index, left) - _unsafe_copyto!(array, array_index, array, array_index + asInt(left), right) - _unsafe_copyto!(array, array_index + asInt(right), swap, swap_index, left) + copyto!(swap, swap_index, array, array_index, left) + copyto!(array, array_index, array, array_index + asInt(left), right) + copyto!(array, array_index + asInt(right), swap, swap_index, left) else pta = array_index ptb = pta + asInt(left) @@ -727,12 +727,12 @@ function trinity_rotation!(array, array_index::Int, swap, swap_index::Int, swap_ if bridge ≤ swap_size && bridge > 3 ptc = pta + asInt(right) ptd = ptc + asInt(left) - _unsafe_copyto!(swap, swap_index, array, ptb, bridge) + copyto!(swap, swap_index, array, ptb, bridge) for _ in 1:left array[ptc -= 1] = array[ptd -= 1] array[ptd] = array[ptb -= 1] end - _unsafe_copyto!(array, pta, swap, swap_index, bridge) + copyto!(array, pta, swap, swap_index, bridge) else ptc = ptb ptd = ptc + asInt(right) @@ -758,9 +758,9 @@ function trinity_rotation!(array, array_index::Int, swap, swap_index::Int, swap_ end elseif right < left if right ≤ swap_size - _unsafe_copyto!(swap, swap_index, array, array_index + asInt(left), right) - _unsafe_copyto!(array, array_index + asInt(right), array, array_index, left) - _unsafe_copyto!(array, array_index, swap, swap_index, right) + copyto!(swap, swap_index, array, array_index + asInt(left), right) + copyto!(array, array_index + asInt(right), array, array_index, left) + copyto!(array, array_index, swap, swap_index, right) else pta = array_index ptb = pta + asInt(left) @@ -768,7 +768,7 @@ function trinity_rotation!(array, array_index::Int, swap, swap_index::Int, swap_ if bridge ≤ swap_size && bridge > 3 ptc = pta + asInt(right) ptd = ptc + asInt(left) - _unsafe_copyto!(swap, swap_index, array, ptc, bridge) + copyto!(swap, swap_index, array, ptc, bridge) for _ in 1:right array[ptc] = array[pta] array[pta] = array[ptb] @@ -776,7 +776,7 @@ function trinity_rotation!(array, array_index::Int, swap, swap_index::Int, swap_ pta += 1 ptb += 1 end - _unsafe_copyto!(array, ptd - asInt(bridge), swap, swap_index, bridge) + copyto!(array, ptd - asInt(bridge), swap, swap_index, bridge) else ptc = ptb ptd = ptc + asInt(right) @@ -847,9 +847,9 @@ function rotate_merge_block!(array, array_index::Int, swap, swap_index::Int, swa if !iszero(left) if lblock + left ≤ swap_size - _unsafe_copyto!(swap, swap_index, array, array_index, lblock) - _unsafe_copyto!(swap, swap_index + lblocki, array, array_index + lblocki + rblocki, left) - _unsafe_copyto!(array, array_index + lblocki + asInt(left), array, array_index + lblocki, rblock) + copyto!(swap, swap_index, array, array_index, lblock) + copyto!(swap, swap_index + lblocki, array, array_index + lblocki + rblocki, left) + copyto!(array, array_index + lblocki + asInt(left), array, array_index + lblocki, rblock) cross_merge!(array, array_index, swap, swap_index, lblock, left, cmp) else From 1b774065ef4677985ff614db697c55ee979089ad Mon Sep 17 00:00:00 2001 From: Benjamin Desef Date: Sat, 1 Jun 2024 21:31:17 +0200 Subject: [PATCH 4/9] Go back to unsafe, but safe --- src/BlitSort.jl | 4 ++-- src/QuadSort.jl | 44 ++++++++++++++++++++-------------------- src/SortingAlgorithms.jl | 11 ++++++++++ 3 files changed, 35 insertions(+), 24 deletions(-) diff --git a/src/BlitSort.jl b/src/BlitSort.jl index e2964e5..2724458 100644 --- a/src/BlitSort.jl +++ b/src/BlitSort.jl @@ -327,7 +327,7 @@ function blit_reverse_partition!(array, array_index::Int, swap, swap_index::Int, end m = pta - array_index - copyto!(array, pta, swap, swap_index, nmemb - m) + _unsafe_copyto!(array, pta, swap, swap_index, nmemb - m) return asUInt(m) end @@ -375,7 +375,7 @@ function blit_default_partition!(array, array_index::Int, swap, swap_index::Int, end m = pta - array_index - copyto!(array, pta, swap, swap_index, nmemb - m) + _unsafe_copyto!(array, pta, swap, swap_index, nmemb - m) return asUInt(m) end diff --git a/src/QuadSort.jl b/src/QuadSort.jl index 1046a84..3ea8fdd 100644 --- a/src/QuadSort.jl +++ b/src/QuadSort.jl @@ -446,25 +446,25 @@ function cross_merge!(dest, dest_index::Int, from, from_index::Int, left::UInt, while tpl - ptl > 8 && tpr - ptr > 8 @label ptl8_ptr if cmp(from[ptl+7], from[ptr]) ≤ 0 - copyto!(dest, ptd, from, ptl, 8); ptd += 8; ptl += 8 + _unsafe_copyto!(dest, ptd, from, ptl, 8); ptd += 8; ptl += 8 tpl - ptl > 8 && @goto ptl8_ptr break end @label ptl_ptr8 if cmp(from[ptl], from[ptr+7]) > 0 - copyto!(dest, ptd, from, ptr, 8); ptd += 8; ptr += 8 + _unsafe_copyto!(dest, ptd, from, ptr, 8); ptd += 8; ptr += 8 tpr - ptr > 8 && @goto ptl_ptr8 break end @label tpl_tpr8 if cmp(from[tpl], from[tpr-7]) ≤ 0 - tpd -= 8; tpr -= 8; copyto!(dest, tpd +1, from, tpr +1, 8) + tpd -= 8; tpr -= 8; _unsafe_copyto!(dest, tpd +1, from, tpr +1, 8) tpr - ptr > 8 && @goto tpl_tpr8 break end @label tpl8_tpr if cmp(from[tpl-7], from[tpr]) > 0 - tpd -= 8; tpl -= 8; copyto!(dest, tpd +1, from, tpl +1, 8) + tpd -= 8; tpl -= 8; _unsafe_copyto!(dest, tpd +1, from, tpl +1, 8) tpl - ptl > 8 && @goto tpl8_tpr end loop = 8 @@ -515,10 +515,10 @@ function quad_merge_block!(array, array_index::Int, swap, swap_index::Int, block cross_merge!(swap, swap_index + asInt(block_x_2), array, pt2, block, block, cmp) elseif tmp == 2 cross_merge!(swap, swap_index, array, array_index, block, block, cmp) - copyto!(swap, swap_index + asInt(block_x_2), array, pt2, block_x_2) + _unsafe_copyto!(swap, swap_index + asInt(block_x_2), array, pt2, block_x_2) elseif tmp == 3 cmp(array[pt2-1], array[pt2]) ≤ 0 && return - copyto!(swap, swap_index, array, array_index, 2block_x_2) + _unsafe_copyto!(swap, swap_index, array, array_index, 2block_x_2) end cross_merge!(array, array_index, swap, swap_index, block_x_2, block_x_2, cmp) end @@ -549,7 +549,7 @@ function partial_forward_merge!(array, array_index::Int, swap, swap_index::Int, tpr = array_index + asInt(nmemb) -1 cmp(array[ptr-1], array[ptr]) ≤ 0 && return - copyto!(swap, swap_index, array, array_index, block) + _unsafe_copyto!(swap, swap_index, array, array_index, block) ptl = swap_index tpl = swap_index + asInt(block) -1 while ptl < tpl -1 && ptr < tpr -1 @@ -598,11 +598,11 @@ function partial_backward_merge!(array, array_index::Int, swap, swap_index::Int, right = nmemb - block if nmemb ≤ swap_size && right ≥ 64 cross_merge!(swap, swap_index, array, array_index, block, right, cmp) - copyto!(array, array_index, swap, swap_index, nmemb) + _unsafe_copyto!(array, array_index, swap, swap_index, nmemb) return end - copyto!(swap, swap_index, array, array_index + asInt(block), right) + _unsafe_copyto!(swap, swap_index, array, array_index + asInt(block), right) tpr = swap_index + asInt(right) -1 while tpl > array_index +16 && tpr > swap_index +16 @label tpl_tpr16 @@ -717,9 +717,9 @@ function trinity_rotation!(array, array_index::Int, swap, swap_index::Int, swap_ end if left < right if left ≤ swap_size - copyto!(swap, swap_index, array, array_index, left) - copyto!(array, array_index, array, array_index + asInt(left), right) - copyto!(array, array_index + asInt(right), swap, swap_index, left) + _unsafe_copyto!(swap, swap_index, array, array_index, left) + _unsafe_copyto!(array, array_index, array, array_index + asInt(left), right) + _unsafe_copyto!(array, array_index + asInt(right), swap, swap_index, left) else pta = array_index ptb = pta + asInt(left) @@ -727,12 +727,12 @@ function trinity_rotation!(array, array_index::Int, swap, swap_index::Int, swap_ if bridge ≤ swap_size && bridge > 3 ptc = pta + asInt(right) ptd = ptc + asInt(left) - copyto!(swap, swap_index, array, ptb, bridge) + _unsafe_copyto!(swap, swap_index, array, ptb, bridge) for _ in 1:left array[ptc -= 1] = array[ptd -= 1] array[ptd] = array[ptb -= 1] end - copyto!(array, pta, swap, swap_index, bridge) + _unsafe_copyto!(array, pta, swap, swap_index, bridge) else ptc = ptb ptd = ptc + asInt(right) @@ -758,9 +758,9 @@ function trinity_rotation!(array, array_index::Int, swap, swap_index::Int, swap_ end elseif right < left if right ≤ swap_size - copyto!(swap, swap_index, array, array_index + asInt(left), right) - copyto!(array, array_index + asInt(right), array, array_index, left) - copyto!(array, array_index, swap, swap_index, right) + _unsafe_copyto!(swap, swap_index, array, array_index + asInt(left), right) + _unsafe_copyto_backwards!(array, array_index + asInt(right), array, array_index, left) + _unsafe_copyto!(array, array_index, swap, swap_index, right) else pta = array_index ptb = pta + asInt(left) @@ -768,7 +768,7 @@ function trinity_rotation!(array, array_index::Int, swap, swap_index::Int, swap_ if bridge ≤ swap_size && bridge > 3 ptc = pta + asInt(right) ptd = ptc + asInt(left) - copyto!(swap, swap_index, array, ptc, bridge) + _unsafe_copyto!(swap, swap_index, array, ptc, bridge) for _ in 1:right array[ptc] = array[pta] array[pta] = array[ptb] @@ -776,7 +776,7 @@ function trinity_rotation!(array, array_index::Int, swap, swap_index::Int, swap_ pta += 1 ptb += 1 end - copyto!(array, ptd - asInt(bridge), swap, swap_index, bridge) + _unsafe_copyto!(array, ptd - asInt(bridge), swap, swap_index, bridge) else ptc = ptb ptd = ptc + asInt(right) @@ -847,9 +847,9 @@ function rotate_merge_block!(array, array_index::Int, swap, swap_index::Int, swa if !iszero(left) if lblock + left ≤ swap_size - copyto!(swap, swap_index, array, array_index, lblock) - copyto!(swap, swap_index + lblocki, array, array_index + lblocki + rblocki, left) - copyto!(array, array_index + lblocki + asInt(left), array, array_index + lblocki, rblock) + _unsafe_copyto!(swap, swap_index, array, array_index, lblock) + _unsafe_copyto!(swap, swap_index + lblocki, array, array_index + lblocki + rblocki, left) + _unsafe_copyto_backwards!(array, array_index + lblocki + asInt(left), array, array_index + lblocki, rblock) cross_merge!(array, array_index, swap, swap_index, lblock, left, cmp) else diff --git a/src/SortingAlgorithms.jl b/src/SortingAlgorithms.jl index 6a79c37..782d69e 100644 --- a/src/SortingAlgorithms.jl +++ b/src/SortingAlgorithms.jl @@ -701,6 +701,17 @@ function _unsafe_copyto!(dest::Array, doffs, src::Array, soffs, n) unsafe_copyto!(dest, doffs, src, soffs, n) end +function _unsafe_copyto_backwards!(dest, doffs, src, soffs, n) + @inbounds for i in n-1:-1:0 + dest[doffs + i] = src[soffs + i] + end + dest +end + +function _unsafe_copyto_backwards!(dest::Array, doffs, src::Array, soffs, n) + unsafe_copyto!(dest, doffs, src, soffs, n) +end + # merge v[lo:m] and v[m+1:hi] ([A;B]) using scratch[1:1+hi-lo] # This is faster than merge! but requires twice as much auxiliary memory. function twoended_merge!(v::AbstractVector{T}, lo::Integer, m::Integer, hi::Integer, o::Ordering, scratch::AbstractVector{T}) where T From 77421437977a179da04f031c184206d545088c01 Mon Sep 17 00:00:00 2001 From: Lilith Orion Hafner Date: Sat, 1 Jun 2024 16:08:29 -0500 Subject: [PATCH 5/9] force specialization on cmp --- src/QuadSort.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/QuadSort.jl b/src/QuadSort.jl index 3ea8fdd..678c74c 100644 --- a/src/QuadSort.jl +++ b/src/QuadSort.jl @@ -282,7 +282,7 @@ function quad_reversal!(array, pta::Int, ptz::Int) end end -function quad_swap_merge!(array, array_index::Int, swap, swap_index::Int, cmp) +function quad_swap_merge!(array, array_index::Int, swap, swap_index::Int, cmp::T) where T parity_merge_two!(array, array_index, swap, swap_index, cmp) parity_merge_two!(array, array_index +4, swap, swap_index +4, cmp) parity_merge_four!(swap, swap_index, array, array_index, cmp) @@ -524,7 +524,7 @@ function quad_merge_block!(array, array_index::Int, swap, swap_index::Int, block end end -function quad_merge!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, nmemb::UInt, block::UInt, cmp) +function quad_merge!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, nmemb::UInt, block::UInt, cmp::T) where T pte = array_index + asInt(nmemb) block *= 4 while block ≤ nmemb && block ≤ swap_size @@ -690,7 +690,7 @@ function partial_backward_merge!(array, array_index::Int, swap, swap_index::Int, end end -function tail_merge!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, nmemb::UInt, block::UInt, cmp) +function tail_merge!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, nmemb::UInt, block::UInt, cmp::T) where T pte = array_index + asInt(nmemb) while block < nmemb && block ≤ swap_size pta = array_index @@ -882,7 +882,7 @@ function rotate_merge_block!(array, array_index::Int, swap, swap_index::Int, swa end end -function rotate_merge!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, nmemb::UInt, block::UInt, cmp) +function rotate_merge!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, nmemb::UInt, block::UInt, cmp::T) where T if nmemb ≤ 2block && nmemb - block ≤ swap_size # unsigned subtraction, ensures nmemb ≥ block partial_backward_merge!(array, array_index, swap, swap_index, swap_size, nmemb, block, cmp) return From 674388d52af8d5984fc0a1eae8a5136ceb809eef Mon Sep 17 00:00:00 2001 From: Benjamin Desef Date: Sun, 2 Jun 2024 20:52:05 +0200 Subject: [PATCH 6/9] Force specialization everywhere --- src/BlitSort.jl | 18 +++++++++--------- src/QuadSort.jl | 48 ++++++++++++++++++++++++------------------------ 2 files changed, 33 insertions(+), 33 deletions(-) diff --git a/src/BlitSort.jl b/src/BlitSort.jl index 2724458..c44b59b 100644 --- a/src/BlitSort.jl +++ b/src/BlitSort.jl @@ -7,7 +7,7 @@ const BlitSort = maybe_optimize(BlitSortAlg()) const BLIT_AUX = 512 # set to 0 for sqrt(n) cache size const BLIT_OUT = 96 # should be smaller or equal to BLIT_AUX -function blit_analyze!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, nmemb::UInt, cmp) +function blit_analyze!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, nmemb::UInt, cmp::F) where {F} @inbounds begin half1 = nmemb ÷ 2 quad1 = half1 ÷ 2 @@ -182,7 +182,7 @@ end # The next 4 functions are used for pivot selection -function blit_binary_median(arraya, pta::Int, arrayb, ptb::Int, len::UInt, cmp) +function blit_binary_median(arraya, pta::Int, arrayb, ptb::Int, len::UInt, cmp::F) where {F} @inbounds begin while !iszero(len ÷= 2) leni = asInt(len) @@ -198,7 +198,7 @@ function blit_binary_median(arraya, pta::Int, arrayb, ptb::Int, len::UInt, cmp) end end -function blit_trim_four!(array, pta::Int, cmp) +function blit_trim_four!(array, pta::Int, cmp::F) where {F} @inbounds begin x = cmp(array[pta], array[pta+1]) > 0 array[pta], array[pta+1] = array[pta+x], array[pta+!x] @@ -216,7 +216,7 @@ function blit_trim_four!(array, pta::Int, cmp) end end -function blit_median_of_nine!(array, array_index::Int, swap, swap_index::Int, nmemb::UInt, cmp) +function blit_median_of_nine!(array, array_index::Int, swap, swap_index::Int, nmemb::UInt, cmp::F) where {F} @inbounds begin z = asInt(nmemb ÷ 9) @@ -245,7 +245,7 @@ function blit_median_of_nine!(array, array_index::Int, swap, swap_index::Int, nm end end -function blit_median_of_cbrt!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, nmemb::UInt, cmp) +function blit_median_of_cbrt!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, nmemb::UInt, cmp::F) where {F} @inbounds begin cbrt = UInt(32) # TODO: figure out how to write this more efficiently using bsr while nmemb > cbrt * cbrt * cbrt && cbrt < swap_size @@ -285,7 +285,7 @@ function blit_median_of_cbrt!(array, array_index::Int, swap, swap_index::Int, sw end # As per suggestion by Marshall Lochbaum to improve generic data handling -function blit_reverse_partition!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, piv, nmemb::UInt, cmp) +function blit_reverse_partition!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, piv, nmemb::UInt, cmp::F) where {F} @inbounds begin if nmemb > swap_size h = nmemb ÷ 2; @@ -333,7 +333,7 @@ function blit_reverse_partition!(array, array_index::Int, swap, swap_index::Int, end end -function blit_default_partition!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, piv, nmemb::UInt, cmp) +function blit_default_partition!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, piv, nmemb::UInt, cmp::F) where {F} @inbounds begin if nmemb > swap_size h = nmemb ÷ 2 @@ -381,7 +381,7 @@ function blit_default_partition!(array, array_index::Int, swap, swap_index::Int, end end -function blit_partition!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, nmemb::UInt, cmp) +function blit_partition!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, nmemb::UInt, cmp::F) where {F} @inbounds begin a_size = zero(UInt) local max @@ -464,5 +464,5 @@ function sort!(array::AbstractVector, lo::Int, hi::Int, ::BlitSortAlg, o::Orderi return array end -blitsort_swap!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, nmemb::UInt, cmp) = +blitsort_swap!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, nmemb::UInt, cmp::F) where {F} = (nmemb ≤ 132 ? quadsort_swap! : blit_analyze!)(array, array_index, swap, swap_index, swap_size, nmemb, cmp) \ No newline at end of file diff --git a/src/QuadSort.jl b/src/QuadSort.jl index 3ea8fdd..68782b9 100644 --- a/src/QuadSort.jl +++ b/src/QuadSort.jl @@ -45,7 +45,7 @@ macro getcmp(array, indexp1, fn, indexp2, cmp) end)) end -@inline function parity_merge_two!(array, array_index::Int, swap, swap_index::Int, cmp) +@inline function parity_merge_two!(array, array_index::Int, swap, swap_index::Int, cmp::F) where {F} @inbounds begin ptl = array_index; ptr = array_index + 2; pts = swap_index @head_branchless_merge(swap, pts, array, ptl, array, ptr, cmp) @@ -57,7 +57,7 @@ end end end -@inline function parity_merge_four!(array, array_index::Int, swap, swap_index::Int, cmp) +@inline function parity_merge_four!(array, array_index::Int, swap, swap_index::Int, cmp::F) where {F} @inbounds begin ptl = array_index; ptr = array_index + 4; pts = swap_index @head_branchless_merge(swap, pts, array, ptl, array, ptr, cmp) @@ -73,15 +73,15 @@ end end end -@inline function swap_branchless!(array, array_index::Int, cmp, - x::Bool=@inbounds(cmp(array[array_index], array[array_index+1]) > 0)) +@inline function swap_branchless!(array, array_index::Int, cmp::F, + x::Bool=@inbounds(cmp(array[array_index], array[array_index+1]) > 0)) where {F} y = !x @inbounds array[array_index], array[array_index+1] = array[array_index+x], array[array_index+y] nothing end # the next seven functions are used for sorting 0 to 31 elements -function tiny_sort!(array, array_index::Int, nmemb::UInt, cmp) +function tiny_sort!(array, array_index::Int, nmemb::UInt, cmp::F) where {F} @inbounds if nmemb == 4 # This is really just @inline quad_swap_four!(array, array_index, cmp) # However, for some weird reason, the profiler reports lots of GC and dynamic dispatch going on in either tiny_sort! or @@ -112,7 +112,7 @@ function tiny_sort!(array, array_index::Int, nmemb::UInt, cmp) end # this function requires a minimum offset of 2 to work properly -function twice_unguarded_insert!(array, array_index::Int, offset::UInt, nmemb::UInt, cmp) +function twice_unguarded_insert!(array, array_index::Int, offset::UInt, nmemb::UInt, cmp::F) where {F} @inbounds for i in asInt(offset):asInt(nmemb)-1 end_ = array_index + i pta = end_ @@ -139,7 +139,7 @@ function twice_unguarded_insert!(array, array_index::Int, offset::UInt, nmemb::U end end -function quad_swap_four!(array, array_index::Int, cmp) +function quad_swap_four!(array, array_index::Int, cmp::F) where {F} @inbounds begin swap_branchless!(array, array_index, cmp) swap_branchless!(array, array_index +2, cmp) @@ -160,7 +160,7 @@ function quad_swap_four!(array, array_index::Int, cmp) end end -function parity_swap_eight!(array, array_index::Int, swap, swap_index::Int, cmp) +function parity_swap_eight!(array, array_index::Int, swap, swap_index::Int, cmp::F) where {F} @inbounds begin swap_branchless!(array, array_index, cmp) swap_branchless!(array, array_index +2, cmp) @@ -177,7 +177,7 @@ function parity_swap_eight!(array, array_index::Int, swap, swap_index::Int, cmp) end # left must be equal or one smaller than right -function parity_merge!(dest, dest_index::Int, from, from_index::Int, left::UInt, right::UInt, cmp) +function parity_merge!(dest, dest_index::Int, from, from_index::Int, left::UInt, right::UInt, cmp::F) where {F} @inbounds begin ptl = from_index ptr = from_index + asInt(left) @@ -196,7 +196,7 @@ function parity_merge!(dest, dest_index::Int, from, from_index::Int, left::UInt, end end -function parity_swap_sixteen!(array, array_index::Int, swap, swap_index::Int, cmp) +function parity_swap_sixteen!(array, array_index::Int, swap, swap_index::Int, cmp::F) where {F} @inbounds begin quad_swap_four!(array, array_index, cmp) quad_swap_four!(array, array_index +4, cmp) @@ -211,7 +211,7 @@ function parity_swap_sixteen!(array, array_index::Int, swap, swap_index::Int, cm end end -function tail_swap!(array, array_index::Int, swap, swap_index::Int, nmemb::UInt, cmp) +function tail_swap!(array, array_index::Int, swap, swap_index::Int, nmemb::UInt, cmp::F) where {F} @inbounds begin if nmemb < 5 tiny_sort!(array, array_index, nmemb, cmp) @@ -282,13 +282,13 @@ function quad_reversal!(array, pta::Int, ptz::Int) end end -function quad_swap_merge!(array, array_index::Int, swap, swap_index::Int, cmp) +function quad_swap_merge!(array, array_index::Int, swap, swap_index::Int, cmp::F) where {F} parity_merge_two!(array, array_index, swap, swap_index, cmp) parity_merge_two!(array, array_index +4, swap, swap_index +4, cmp) parity_merge_four!(swap, swap_index, array, array_index, cmp) end -function quad_swap!(array, array_index::Int, nmemb::UInt, cmp) +function quad_swap!(array, array_index::Int, nmemb::UInt, cmp::F) where {F} @inbounds(@with_stackvec swap 32 eltype(array) begin pta = array_index count = nmemb ÷ 8 @@ -424,7 +424,7 @@ function quad_swap!(array, array_index::Int, nmemb::UInt, cmp) end # quad merge support routines -function cross_merge!(dest, dest_index::Int, from, from_index::Int, left::UInt, right::UInt, cmp) +function cross_merge!(dest, dest_index::Int, from, from_index::Int, left::UInt, right::UInt, cmp::F) where {F} @inbounds begin ptl = from_index ptr = from_index + asInt(left) @@ -498,7 +498,7 @@ end # swap memory: [A B] step 1 # swap memory: [A B][C D] step 2 # main memory: [A B C D] step 3 -function quad_merge_block!(array, array_index::Int, swap, swap_index::Int, block::UInt, cmp) +function quad_merge_block!(array, array_index::Int, swap, swap_index::Int, block::UInt, cmp::F) where {F} @inbounds begin block_x_2 = 2block @@ -511,7 +511,7 @@ function quad_merge_block!(array, array_index::Int, swap, swap_index::Int, block cross_merge!(swap, swap_index, array, array_index, block, block, cmp) cross_merge!(swap, swap_index + asInt(block_x_2), array, pt2, block, block, cmp) elseif tmp == 1 - copyto!(swap, swap_index, array, array_index, block_x_2) + _unsafe_copyto!(swap, swap_index, array, array_index, block_x_2) cross_merge!(swap, swap_index + asInt(block_x_2), array, pt2, block, block, cmp) elseif tmp == 2 cross_merge!(swap, swap_index, array, array_index, block, block, cmp) @@ -524,7 +524,7 @@ function quad_merge_block!(array, array_index::Int, swap, swap_index::Int, block end end -function quad_merge!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, nmemb::UInt, block::UInt, cmp) +function quad_merge!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, nmemb::UInt, block::UInt, cmp::F) where {F} pte = array_index + asInt(nmemb) block *= 4 while block ≤ nmemb && block ≤ swap_size @@ -541,7 +541,7 @@ function quad_merge!(array, array_index::Int, swap, swap_index::Int, swap_size:: return block ÷ 2 end -function partial_forward_merge!(array, array_index::Int, swap, swap_index::Int, nmemb::UInt, block::UInt, cmp) +function partial_forward_merge!(array, array_index::Int, swap, swap_index::Int, nmemb::UInt, block::UInt, cmp::F) where {F} @inbounds begin nmemb == block && return @@ -587,7 +587,7 @@ function partial_forward_merge!(array, array_index::Int, swap, swap_index::Int, end function partial_backward_merge!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, nmemb::UInt, block::UInt, - cmp) + cmp::F) where {F} @inbounds begin nmemb == block && return @@ -690,7 +690,7 @@ function partial_backward_merge!(array, array_index::Int, swap, swap_index::Int, end end -function tail_merge!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, nmemb::UInt, block::UInt, cmp) +function tail_merge!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, nmemb::UInt, block::UInt, cmp::F) where {F} pte = array_index + asInt(nmemb) while block < nmemb && block ≤ swap_size pta = array_index @@ -813,7 +813,7 @@ function trinity_rotation!(array, array_index::Int, swap, swap_index::Int, swap_ end end -function monobound_binary_first!(array, array_index::Int, value, top::UInt, cmp) +function monobound_binary_first!(array, array_index::Int, value, top::UInt, cmp::F) where {F} @inbounds begin end_ = array_index + asInt(top) while top > 1 @@ -830,7 +830,7 @@ function monobound_binary_first!(array, array_index::Int, value, top::UInt, cmp) end end -function rotate_merge_block!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, lblock::UInt, right::UInt, cmp) +function rotate_merge_block!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, lblock::UInt, right::UInt, cmp::F) where {F} @inbounds begin cmp(array[array_index+asInt(lblock)-1], array[array_index+asInt(lblock)]) ≤ 0 && return @@ -882,7 +882,7 @@ function rotate_merge_block!(array, array_index::Int, swap, swap_index::Int, swa end end -function rotate_merge!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, nmemb::UInt, block::UInt, cmp) +function rotate_merge!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, nmemb::UInt, block::UInt, cmp::F) where {F} if nmemb ≤ 2block && nmemb - block ≤ swap_size # unsigned subtraction, ensures nmemb ≥ block partial_backward_merge!(array, array_index, swap, swap_index, swap_size, nmemb, block, cmp) return @@ -941,7 +941,7 @@ function sort!(array::AbstractVector, lo::Int, hi::Int, ::QuadSortAlg, o::Orderi return array end -function quadsort_swap!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, nmemb::UInt, cmp) +function quadsort_swap!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, nmemb::UInt, cmp::F) where {F} if nmemb ≤ 96 tail_swap!(array, array_index, swap, swap_index, nmemb, cmp) elseif !quad_swap!(array, array_index, nmemb, cmp) From 1907e5b5d0611b31b44ad28bd81ce8aefc1c1f47 Mon Sep 17 00:00:00 2001 From: Benjamin Desef Date: Sun, 2 Jun 2024 20:52:19 +0200 Subject: [PATCH 7/9] Fix type instability --- src/BlitSort.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/BlitSort.jl b/src/BlitSort.jl index c44b59b..23c5f14 100644 --- a/src/BlitSort.jl +++ b/src/BlitSort.jl @@ -22,7 +22,7 @@ function blit_analyze!(array, array_index::Int, swap, swap_index::Int, swap_size ptd = array_index + asInt(half1 + quad3) astreaks = bstreaks = cstreaks = dstreaks = zero(UInt) - abalance = bbalance = cbalance = dbalance = zero(UInt) + abalance::UInt = bbalance::UInt = cbalance::UInt = dbalance::UInt = zero(UInt) cnt = nmemb while cnt > 132 From feae5cbb0b60687d5a38d6dd8ef58799241c0c8b Mon Sep 17 00:00:00 2001 From: Benjamin Desef Date: Sun, 2 Jun 2024 20:52:39 +0200 Subject: [PATCH 8/9] Tell about non-throwing exception --- src/BlitSort.jl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/BlitSort.jl b/src/BlitSort.jl index 23c5f14..639a534 100644 --- a/src/BlitSort.jl +++ b/src/BlitSort.jl @@ -245,7 +245,8 @@ function blit_median_of_nine!(array, array_index::Int, swap, swap_index::Int, nm end end -function blit_median_of_cbrt!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, nmemb::UInt, cmp::F) where {F} +Base.@assume_effects :nothrow function blit_median_of_cbrt!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, + nmemb::UInt, cmp::F) where {F} @inbounds begin cbrt = UInt(32) # TODO: figure out how to write this more efficiently using bsr while nmemb > cbrt * cbrt * cbrt && cbrt < swap_size @@ -381,7 +382,8 @@ function blit_default_partition!(array, array_index::Int, swap, swap_index::Int, end end -function blit_partition!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, nmemb::UInt, cmp::F) where {F} +Base.@assume_effects :nothrow function blit_partition!(array, array_index::Int, swap, swap_index::Int, swap_size::UInt, + nmemb::UInt, cmp::F) where {F} @inbounds begin a_size = zero(UInt) local max From 954a5d99e878960458332f7fc80058517f3c3a3b Mon Sep 17 00:00:00 2001 From: Benjamin Desef Date: Sun, 2 Jun 2024 20:53:03 +0200 Subject: [PATCH 9/9] Wrong name for non-mutating function --- src/QuadSort.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/QuadSort.jl b/src/QuadSort.jl index 68782b9..58804c9 100644 --- a/src/QuadSort.jl +++ b/src/QuadSort.jl @@ -813,7 +813,7 @@ function trinity_rotation!(array, array_index::Int, swap, swap_index::Int, swap_ end end -function monobound_binary_first!(array, array_index::Int, value, top::UInt, cmp::F) where {F} +function monobound_binary_first(array, array_index::Int, value, top::UInt, cmp::F) where {F} @inbounds begin end_ = array_index + asInt(top) while top > 1 @@ -839,7 +839,7 @@ function rotate_merge_block!(array, array_index::Int, swap, swap_index::Int, swa lblock -= rblock lblocki = asInt(lblock) - left = monobound_binary_first!(array, array_index + lblocki + rblocki, array[array_index+lblocki], right, cmp) + left = monobound_binary_first(array, array_index + lblocki + rblocki, array[array_index+lblocki], right, cmp) right -= left