From fc569292b99937219b67f35f317be9b690212e51 Mon Sep 17 00:00:00 2001
From: Gio Cielo
Date: Mon, 20 Feb 2017 06:55:28 -0800
Subject: [PATCH] Add an iterator implementation for String splitting

---
 base/exports.jl      |   1 +
 base/strings/util.jl | 104 +++++++++++++++++++++++++++++++++++++++++++
 test/strings/util.jl |  41 +++++++++++++++++
 3 files changed, 146 insertions(+)

diff --git a/base/exports.jl b/base/exports.jl
index 51bfbf6632ef1..252ca188799a3 100644
--- a/base/exports.jl
+++ b/base/exports.jl
@@ -753,6 +753,7 @@ export
     digits!,
     dump,
     eachmatch,
+    eachsplit,
     endswith,
     escape_string,
     graphemes,
diff --git a/base/strings/util.jl b/base/strings/util.jl
index d38bacbc759b2..fd9ff6e0b6819 100644
--- a/base/strings/util.jl
+++ b/base/strings/util.jl
@@ -248,6 +248,110 @@ julia> rpad("March",20)
 rpad(s, n::Integer, p=" ") = rpad(string(s),n,string(p))
 cpad(s, n::Integer, p=" ") = rpad(lpad(s,div(n+strwidth(s),2),p),n,p)
 
+# Lazy counterpart of `split`: produces the substrings one at a time (see `eachsplit`).
+immutable SplitIterator{S<:AbstractString,F}
+    str::S            # string being split
+    splitter::F       # anything `search` accepts as its second argument
+    limit::Int        # maximum number of substrings, 0 meaning no maximum
+    keep_empty::Bool  # whether empty fields are produced
+end
+
+iteratorsize{T<:SplitIterator}(::Type{T}) = SizeUnknown()
+iteratoreltype{T<:SplitIterator}(::Type{T}) = HasEltype()
+eltype{S<:AbstractString,F}(::Type{SplitIterator{S,F}}) = SubString{S}
+eltype{T<:AbstractString,F}(::Type{SplitIterator{SubString{T},F}}) = SubString{T}
+
+# The substring that `next` will return is located ahead of time, so `done` never has
+# to search; this matters when empty fields are skipped.
+immutable SplitIteratorState
+    a::Int         # first index of the pending substring
+    b::Int         # last index of the pending substring
+    i::Int         # index at which the field after the pending one starts
+    j::Int         # start of the next delimiter match, 0 if there is none
+    k::Int         # index just past that delimiter match
+    count::Int     # substrings located so far, pending one included
+    pending::Bool  # false once the iterator is exhausted
+    tail::Bool     # true if the pending substring is the last one
+end
+
+"""
+    eachsplit(s::AbstractString, [chars]; limit::Integer=0, keep::Bool=true)
+
+Return an iterator of substrings by splitting the given string on occurrences of the given
+character delimiters, which may be specified in any of the formats allowed by `search`'s
+second argument (i.e. a single character, collection of characters, string, or regular
+expression). If `chars` is omitted, it defaults to the set of all space characters, and
+`keep` is taken to be `false`. The two keyword arguments are optional: they are a
+maximum size for the result and a flag determining whether empty fields should be kept in
+the result.
+
+This method is typically slower than `split`, but it does not preemptively allocate an
+array.
+
+```jldoctest
+julia> a = "Ma.rch"
+"Ma.rch"
+
+julia> collect(eachsplit(a,"."))
+2-element Array{SubString{String},1}:
+ "Ma"
+ "rch"
+```
+"""
+function eachsplit(str::AbstractString, splitter; limit::Integer=0, keep::Bool=true)
+    _eachsplit(str, splitter, limit, keep)
+end
+
+eachsplit(str::AbstractString) = _eachsplit(str, _default_delims, 0, false)
+
+# An empty-string splitter needs no special case: `search` reports zero-width matches
+# and the iterator then yields one substring per character, exactly like `split`.
+function _eachsplit(str::AbstractString, splitter, limit::Integer, keep_empty::Bool)
+    SplitIterator(str, splitter, Int(limit), keep_empty)
+end
+
+# Locate the next substring, scanning from field start `i` with the upcoming delimiter
+# match starting at `j` (0 if none) and ending just before `k`; `count` substrings
+# precede it. This is the loop of `_split`, suspended each time a substring is found.
+function _splitadvance(iter::SplitIterator, i::Int, j::Int, k::Int, count::Int)
+    str = iter.str
+    n = endof(str)
+    # With a limit, the last permitted substring is the unsplit remainder, so stop
+    # consuming delimiters once `limit - 1` substrings have been located.
+    while 0 < j <= n && count != iter.limit - 1
+        a, b = i, prevind(str, j)
+        found = i < k && (iter.keep_empty || i < j)
+        i < k && (i = k)
+        # A zero-width match must not be found again at the same position
+        k <= j && (k = nextind(str, j))
+        r = search(str, iter.splitter, k)
+        j, k = first(r), nextind(str, last(r))
+        found && return SplitIteratorState(a, b, i, j, k, count + 1, true, false)
+    end
+    # Whatever follows the last consumed delimiter forms the final substring
+    if iter.keep_empty || !done(str, i)
+        return SplitIteratorState(i, n, i, j, k, count + 1, true, true)
+    end
+    return SplitIteratorState(i, n, i, j, k, count, false, true)
+end
+
+function start(iter::SplitIterator)
+    i = start(iter.str)
+    r = search(iter.str, iter.splitter, i)
+    _splitadvance(iter, i, first(r), nextind(iter.str, last(r)), 0)
+end
+
+done(iter::SplitIterator, state::SplitIteratorState) = !state.pending
+
+function next(iter::SplitIterator, state::SplitIteratorState)
+    result = SubString(iter.str, state.a, state.b)
+    if state.tail
+        # Nothing can follow the final substring
+        return result, SplitIteratorState(0, 0, 0, 0, 0, state.count, false, true)
+    end
+    result, _splitadvance(iter, state.i, state.j, state.k, state.count)
+end
+
 # splitter can be a Char, Vector{Char}, AbstractString, Regex, ...
 # any splitter that provides search(s::AbstractString, splitter)
 split{T<:SubString}(str::T, splitter; limit::Integer=0, keep::Bool=true) = _split(str, splitter, limit, keep, T[])
diff --git a/test/strings/util.jl b/test/strings/util.jl
index 21072a524bf4c..ce72f495db0fd 100644
--- a/test/strings/util.jl
+++ b/test/strings/util.jl
@@ -77,6 +77,24 @@ end
 #@test isequal(rsplit("a b c"), ["a","b","c"])
 #@test isequal(rsplit("a b \t c\n"), ["a","b","c"])
 
+@test isequal(collect(eachsplit("foo,bar,baz", 'x')), ["foo,bar,baz"])
+@test isequal(collect(eachsplit("foo,bar,baz", ',')), ["foo","bar","baz"])
+@test isequal(collect(eachsplit("foo,bar,baz", ",")), ["foo","bar","baz"])
+@test isequal(collect(eachsplit("foo,bar,baz", r",")), ["foo","bar","baz"])
+@test isequal(collect(eachsplit("foo,bar,baz", ','; limit=0)), ["foo","bar","baz"])
+@test isequal(collect(eachsplit("foo,bar,baz", ','; limit=1)), ["foo,bar,baz"])
+@test isequal(collect(eachsplit("foo,bar,baz", ','; limit=2)), ["foo","bar,baz"])
+@test isequal(collect(eachsplit("foo,bar,baz", ','; limit=3)), ["foo","bar","baz"])
+@test isequal(collect(eachsplit("foo,bar", "o,b")), ["fo","ar"])
+
+@test isequal(collect(eachsplit("", ',')), [""])
+@test isequal(collect(eachsplit(",", ',')), ["",""])
+@test isequal(collect(eachsplit(",,", ',')), ["","",""])
+@test isequal(collect(eachsplit(",,", ','; limit=2)), ["",","])
+@test isequal(collect(eachsplit("", ',' ; keep=false)), [])
+@test isequal(collect(eachsplit(",", ',' ; keep=false)), [])
+@test isequal(collect(eachsplit(",,", ','; keep=false)), [])
+
 let str = "a.:.ba..:..cba.:.:.dcba.:."
 @test isequal(split(str, ".:."), ["a","ba.",".cba",":.dcba",""])
 @test isequal(split(str, ".:."; keep=false), ["a","ba.",".cba",":.dcba"])
@@ -93,6 +111,14 @@ let str = "a.:.ba..:..cba.:.:.dcba.:."
 @test isequal(rsplit(str, ".:."; limit=4), ["a.:.ba.", ".cba.:", "dcba", ""])
 @test isequal(rsplit(str, ".:."; limit=5), ["a", "ba.", ".cba.:", "dcba", ""])
 @test isequal(rsplit(str, ".:."; limit=6), ["a", "ba.", ".cba.:", "dcba", ""])
+
+@test isequal(collect(eachsplit(str, ".:.")), ["a","ba.",".cba",":.dcba",""])
+@test isequal(collect(eachsplit(str, ".:."; keep=false)), ["a","ba.",".cba",":.dcba"])
+@test isequal(collect(eachsplit(str, ".:."; limit=2)), ["a","ba..:..cba.:.:.dcba.:."])
+@test isequal(collect(eachsplit(str, ".:."; limit=3)), ["a","ba.",".cba.:.:.dcba.:."])
+@test isequal(collect(eachsplit(str, ".:."; limit=4)), ["a","ba.",".cba",":.dcba.:."])
+@test isequal(collect(eachsplit(str, ".:."; limit=5)), ["a","ba.",".cba",":.dcba",""])
+@test isequal(collect(eachsplit(str, ".:."; limit=6)), ["a","ba.",".cba",":.dcba",""])
 end
 
 # zero-width splits
@@ -113,6 +139,21 @@ end
 @test isequal(split("abcd", r"d+"), ["abc",""])
 @test isequal(split("abcd", r"[ad]?"), ["","b","c",""])
 
+@test isequal(collect(eachsplit("", "")), [""])
+@test isequal(collect(eachsplit("", r"")), [""])
+@test isequal(collect(eachsplit("abc", "")), ["a","b","c"])
+@test isequal(collect(eachsplit("abc", r"")), ["a","b","c"])
+@test isequal(collect(eachsplit("abcd", r"b?")), ["a","c","d"])
+@test isequal(collect(eachsplit("abcd", r"b*")), ["a","c","d"])
+@test isequal(collect(eachsplit("abcd", r"b+")), ["a","cd"])
+@test isequal(collect(eachsplit("abcd", r"b?c?")), ["a","d"])
+@test isequal(collect(eachsplit("abcd", r"[bc]?")), ["a","","d"])
+@test isequal(collect(eachsplit("abcd", r"a*")), ["","b","c","d"])
+@test isequal(collect(eachsplit("abcd", r"a+")), ["","bcd"])
+@test isequal(collect(eachsplit("abcd", r"d*")), ["a","b","c",""])
+@test isequal(collect(eachsplit("abcd", r"d+")), ["abc",""])
+@test isequal(collect(eachsplit("abcd", r"[ad]?")), ["","b","c",""])
+
 # replace
 @test replace("\u2202", '*', '\0') == "\u2202"
 