Skip to content

Commit 4d1e0b2

Browse files
strings: add eachsplit for iterative splitting
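This commit moves the existing splitting implementation into an iterator named `eachsplit` and changes the definition of `split(...)` to `collect(eachsplit(...))`, plus handling for a few edge cases (filtering out empty substrings when `keepempty=false` and choosing the element type of the collected result).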
1 parent b0b4a48 commit 4d1e0b2

File tree

2 files changed

+76
-38
lines changed

2 files changed

+76
-38
lines changed

base/exports.jl

+1
Original file line numberDiff line numberDiff line change
@@ -572,6 +572,7 @@ export
572572
codeunits,
573573
digits,
574574
digits!,
575+
eachsplit,
575576
escape_string,
576577
hex2bytes,
577578
hex2bytes!,

base/strings/util.jl

+75-38
Original file line numberDiff line numberDiff line change
@@ -367,6 +367,78 @@ function rpad(
367367
r == 0 ? string(s, p^q) : string(s, p^q, first(p, r))
368368
end
369369

"""
    eachsplit(str::AbstractString, dlm; limit::Integer=0)
    eachsplit(str::AbstractString; limit::Integer=0)

Split `str` on occurrences of the delimiter(s) `dlm` and return an iterator over the
substrings. `dlm` can be any of the formats allowed by [`findnext`](@ref)'s first argument
(i.e. as a string, regular expression or a function), or as a single character or collection
of characters.

If `dlm` is omitted, it defaults to [`isspace`](@ref).

The iterator will return a maximum of `limit` results if the keyword argument is supplied.
The default of `limit=0` implies no maximum.

Each element produced by the iterator is a `SubString` of `str`.

See also [`split`](@ref).

# Examples
```jldoctest
julia> a = "Ma.rch"
"Ma.rch"

julia> collect(eachsplit(a, "."))
2-element Vector{SubString}:
 "Ma"
 "rch"
```
"""
function eachsplit end
398+
399+
# Iterator returned by `eachsplit`: walks `str` and yields the pieces found between
# matches of `splitter` (see the `iterate` method for `SplitIterator`).
#
# Forcing specialization on `splitter` improves performance (roughly 30% decrease in runtime)
# and prevents a major invalidation risk (1550 MethodInstances)
struct SplitIterator{S<:AbstractString,F}
    # the string being split
    str::S
    # delimiter matcher; passed as the first argument to `findnext`
    splitter::F
    # `iterate` stops looking for delimiters once its counter reaches `limit - 1`;
    # with `limit == 0` that never happens, i.e. there is no maximum
    limit::Int
end
406+
407+
# Every `SplitIterator` reports the (abstract) `SubString` type for its elements,
# regardless of the type of the string being split.
function eltype(::Type{<:SplitIterator})
    return SubString
end
408+
409+
# The number of pieces is only known after scanning the string, so no length is advertised.
function IteratorSize(::Type{<:SplitIterator})
    return SizeUnknown()
end
410+
411+
# Advance a `SplitIterator` by one piece. The iteration state is the tuple `(i, k, n)`:
# i: the starting index of the substring to be extracted
# k: the starting index of the next substring to be extracted; the search for the next
#    delimiter resumes from here
# n: counter compared against `iter.limit - 1`; it is incremented only when a non-empty
#    substring is returned, and once more for the final remainder
#
# Returns `(substring, state)` or `nothing` when the iterator is exhausted.
#
# NOTE(review): because empty substrings do not advance `n`, the iterator appears able to
# yield more than `limit` items when empty pieces occur before the limit is reached, e.g.
# `",a,b"` split on `','` with `limit=2` traces to `""`, `"a"`, `"b"`. Confirm whether
# this is intended for callers that keep empty pieces.
function iterate(iter::SplitIterator, (i, k, n)=(firstindex(iter.str), firstindex(iter.str), 0))
    # The final remainder (last line below) stores `ncodeunits(iter.str) + 2` as `i`,
    # which makes this guard end the iteration on the following call.
    i - 1 > ncodeunits(iter.str)::Int && return nothing
    r = findnext(iter.splitter, iter.str, k)::Union{Nothing,Int,UnitRange{Int}}
    # Keep scanning while a delimiter was found inside the string and the limit still
    # leaves room for a piece other than the remainder.
    while r !== nothing && n != iter.limit - 1 && first(r) <= lastindex(iter.str)
        # j: where the delimiter match starts; k: the index just past the match
        j, k = first(r), nextind(iter.str, last(r))::Int
        # `k <= j` means the match consumed no characters (e.g. an empty range); step one
        # character past `j` so the next search makes progress.
        k_ = ifelse(k <= j, nextind(iter.str, j), k)
        if i < k
            # The piece runs from `i` up to the character before the delimiter; it is
            # empty when `i == j`.
            substr = @inbounds SubString(iter.str, i, prevind(iter.str, j)::Int)
            return (substr, (max(i, k), k_, n + Int(i < j)))
        end
        # The match ends before the current piece starts (`i >= k`): emit nothing for it
        # and search again from `k_`.
        k = k_
        r = findnext(iter.splitter, iter.str, k)::Union{Nothing,Int,UnitRange{Int}}
    end
    # No usable delimiter remains (or the limit was reached): return everything from `i`
    # to the end, together with a state that terminates the next call.
    @inbounds SubString(iter.str, i), (ncodeunits(iter.str) + 2, k, n + 1)
end
429+
430+
# Generic method: wrap the string, the delimiter matcher and the limit in a
# `SplitIterator`; nothing is scanned until the result is iterated.
function eachsplit(str::T, splitter; limit::Integer=0) where {T<:AbstractString}
    return SplitIterator(str, splitter, limit)
end
432+
433+
# A tuple, vector or set of characters splits on any one of its members: the collection
# is turned into a membership predicate and forwarded to the generic method.
function eachsplit(
    str::T,
    splitter::Union{Tuple{Vararg{AbstractChar}},AbstractVector{<:AbstractChar},Set{<:AbstractChar}};
    limit::Integer=0
) where {T<:AbstractString}
    return eachsplit(str, in(splitter); limit=limit)
end
435+
436+
# A single character delimiter is turned into an equality predicate and forwarded to
# the generic method.
function eachsplit(str::T, splitter::AbstractChar; limit::Integer=0) where {T<:AbstractString}
    return eachsplit(str, isequal(splitter); limit=limit)
end
438+
439+
# a bit oddball, but standard behavior in Perl, Ruby & Python:
# when no delimiter is given, split on whitespace as decided by `isspace`.
function eachsplit(str::AbstractString; limit::Integer=0)
    return eachsplit(str, isspace; limit=limit)
end
441+
370442
"""
371443
split(str::AbstractString, dlm; limit::Integer=0, keepempty::Bool=true)
372444
split(str::AbstractString; limit::Integer=0, keepempty::Bool=false)
@@ -396,46 +468,11 @@ julia> split(a, ".")
396468
"rch"
397469
```
398470
"""
399-
function split end
400-
401471
function split(str::T, splitter;
402472
limit::Integer=0, keepempty::Bool=true) where {T<:AbstractString}
403-
_split(str, splitter, limit, keepempty, T <: SubString ? T[] : SubString{T}[])
404-
end
405-
function split(str::T, splitter::Union{Tuple{Vararg{AbstractChar}},AbstractVector{<:AbstractChar},Set{<:AbstractChar}};
406-
limit::Integer=0, keepempty::Bool=true) where {T<:AbstractString}
407-
_split(str, in(splitter), limit, keepempty, T <: SubString ? T[] : SubString{T}[])
408-
end
409-
function split(str::T, splitter::AbstractChar;
410-
limit::Integer=0, keepempty::Bool=true) where {T<:AbstractString}
411-
_split(str, isequal(splitter), limit, keepempty, T <: SubString ? T[] : SubString{T}[])
412-
end
413-
414-
function _split(str::AbstractString, splitter::F, limit::Integer, keepempty::Bool, strs::Vector) where F
415-
# Forcing specialization on `splitter` improves performance (roughly 30% decrease in runtime)
416-
# and prevents a major invalidation risk (1550 MethodInstances)
417-
i = 1 # firstindex(str)
418-
n = lastindex(str)::Int
419-
r = findfirst(splitter,str)::Union{Nothing,Int,UnitRange{Int}}
420-
if r !== nothing
421-
j, k = first(r), nextind(str,last(r))::Int
422-
while 0 < j <= n && length(strs) != limit-1
423-
if i < k
424-
if keepempty || i < j
425-
push!(strs, @inbounds SubString(str,i,prevind(str,j)::Int))
426-
end
427-
i = k
428-
end
429-
(k <= j) && (k = nextind(str,j)::Int)
430-
r = findnext(splitter,str,k)::Union{Nothing,Int,UnitRange{Int}}
431-
r === nothing && break
432-
j, k = first(r), nextind(str,last(r))::Int
433-
end
434-
end
435-
if keepempty || i <= ncodeunits(str)::Int
436-
push!(strs, @inbounds SubString(str,i))
437-
end
438-
return strs
473+
itr = eachsplit(str, splitter; limit)
474+
keepempty || (itr = Iterators.filter(!isempty, itr))
475+
collect(T <: SubString ? T : SubString{T}, itr)
439476
end
440477

441478
# a bit oddball, but standard behavior in Perl, Ruby & Python:

0 commit comments

Comments
 (0)