From fc569292b99937219b67f35f317be9b690212e51 Mon Sep 17 00:00:00 2001
From: Gio Cielo <gccielo@gmail.com>
Date: Mon, 20 Feb 2017 06:55:28 -0800
Subject: [PATCH] Add an iterator implementation for String splitting

---
 base/exports.jl      |   1 +
 base/strings/util.jl | 117 +++++++++++++++++++++++++++++++++++++++++++
 test/strings/util.jl |  41 +++++++++++++++
 3 files changed, 159 insertions(+)

diff --git a/base/exports.jl b/base/exports.jl
index 51bfbf6632ef1..252ca188799a3 100644
--- a/base/exports.jl
+++ b/base/exports.jl
@@ -753,6 +753,7 @@ export
     digits!,
     dump,
     eachmatch,
+    eachsplit,
     endswith,
     escape_string,
     graphemes,
diff --git a/base/strings/util.jl b/base/strings/util.jl
index d38bacbc759b2..fd9ff6e0b6819 100644
--- a/base/strings/util.jl
+++ b/base/strings/util.jl
@@ -248,6 +248,123 @@ julia> rpad("March",20)
 rpad(s, n::Integer, p=" ") = rpad(string(s),n,string(p))
 cpad(s, n::Integer, p=" ") = rpad(lpad(s,div(n+strwidth(s),2),p),n,p)
 
+immutable SplitIterator
+    str::AbstractString  # string being split
+    splitter             # delimiter spec, passed as the second argument to `search`
+    limit::Integer       # maximum number of substrings (no maximum when <= 0)
+    keep_empty::Bool     # whether empty substrings are returned
+end
+
+iteratorsize(::Type{SplitIterator}) = SizeUnknown()
+iteratoreltype(::Type{SplitIterator}) = HasEltype()
+eltype(::Type{SplitIterator}) = SubString  # on the type, so eltype(SplitIterator) works too
+
+type SplitIteratorState
+    i::Int  # index where the next substring to return begins
+    j::Int  # position of the delimiter match bounding that substring (set by start/next)
+    k::Int  # index following that delimiter match
+    n::Int  # endof(iter.str)
+    s::Int  # number of substrings returned so far
+end
+
+"""
+    eachsplit(s::AbstractString, [chars]; limit::Integer=0, keep::Bool=true)
+
+Return an iterator of substrings by splitting the given string on occurrences of the given
+character delimiters, which may be specified in any of the formats allowed by `search`'s
+second argument (i.e. a single character, collection of characters, string, or regular
+expression). If `chars` is omitted, it defaults to the set of all space characters, and
+`keep` is taken to be `false`. The two keyword arguments are optional: `limit` is the
+maximum number of substrings to return (`0`, the default, means no maximum) and `keep`
+determines whether empty fields are kept in the result.
+
+Iterating is typically slower than calling `split`, but no result array is allocated up
+front: the substrings are produced one at a time.
+
+```jldoctest
+julia> a = "Ma.rch"
+"Ma.rch"
+
+julia> collect(eachsplit(a,"."))
+2-element Array{SubString{String},1}:
+ "Ma"
+ "rch"
+```
+"""
+function eachsplit(str::AbstractString, splitter; limit::Integer=0, keep::Bool=true)
+    _eachsplit(str, splitter, limit, keep)
+end
+
+eachsplit(str::AbstractString) = _eachsplit(str, _default_delims, 0, false)  # drop empty fields
+
+function _eachsplit(str::AbstractString, splitter, limit::Integer, keep_empty::Bool)
+    # Always return a SplitIterator so the result type does not depend on the value of
+    # `splitter`; an empty splitter is handled by the zero-width logic in `_seekfield!`.
+    SplitIterator(str, splitter, limit, keep_empty)
+end
+
+# Iteration follows the same scan as `_split`, one field at a time, so `limit`,
+# `keep_empty`, trailing empty fields and zero-width matches behave as they do in
+# `split`. The fields of the (mutable) state are used as follows:
+#   i - index where the field that will be returned next begins
+#   j - index where the delimiter match closing that field begins; 0 when the field is
+#       the remaining tail of the string; -1 when the iterator is exhausted
+#   k - index from which the search for the following delimiter resumes
+#   n - endof(iter.str)
+#   s - number of fields returned so far (compared against `limit`)
+
+# Advance `state` to the next field to return. On entry `state.j` and `state.k` hold the
+# result of the latest `search`; on exit `state.j` is encoded as described above.
+function _seekfield!(iter::SplitIterator, state::SplitIteratorState)
+    str = iter.str
+    while 0 < state.j <= state.n && state.s != iter.limit - 1
+        if state.i < state.k
+            # The match ends past `i`, so it closes the field `i:prevind(str, j)`
+            (iter.keep_empty || state.i < state.j) && return state
+            state.i = state.k
+        end
+        # Step past a zero-width match so that the scan always makes progress
+        (state.k <= state.j) && (state.k = nextind(str, state.j))
+        r = search(str, iter.splitter, state.k)
+        state.j, state.k = first(r), nextind(str, last(r))
+    end
+    # No further delimiter, or `limit - 1` fields were already returned: the rest of the
+    # string is the last field, unless it is empty and empty fields are dropped
+    state.j = (iter.keep_empty || !done(str, state.i)) ? 0 : -1
+    state
+end
+
+# Run the first search and position a fresh state on the first field; the state is
+# already exhausted when there is nothing to return
+function start(iter::SplitIterator)
+    i = start(iter.str)
+    r = search(iter.str, iter.splitter, i)
+    state = SplitIteratorState(i, first(r), nextind(iter.str, last(r)), endof(iter.str), 0)
+    _seekfield!(iter, state)
+end
+
+# `_seekfield!` marks exhaustion by setting `state.j` to -1
+done(iter::SplitIterator, state::SplitIteratorState) = state.j == -1
+
+# Return the field the state is positioned on, then advance the (mutated) state to the
+# following field
+function next(iter::SplitIterator, state::SplitIteratorState)
+    str = iter.str
+    state.s += 1
+    if state.j == 0
+        # Last field: everything from `i` to the end of the string
+        state.j = -1
+        return SubString(str, state.i), state
+    end
+    result = SubString(str, state.i, prevind(str, state.j))
+    # Resume after the delimiter that closed this field, as `_split` does after a push!
+    state.i = state.k
+    (state.k <= state.j) && (state.k = nextind(str, state.j))
+    r = search(str, iter.splitter, state.k)
+    state.j, state.k = first(r), nextind(str, last(r))
+    result, _seekfield!(iter, state)
+end
+
 # splitter can be a Char, Vector{Char}, AbstractString, Regex, ...
 # any splitter that provides search(s::AbstractString, splitter)
 split{T<:SubString}(str::T, splitter; limit::Integer=0, keep::Bool=true) = _split(str, splitter, limit, keep, T[])
diff --git a/test/strings/util.jl b/test/strings/util.jl
index 21072a524bf4c..ce72f495db0fd 100644
--- a/test/strings/util.jl
+++ b/test/strings/util.jl
@@ -77,6 +77,24 @@ end
 #@test isequal(rsplit("a b c"), ["a","b","c"])
 #@test isequal(rsplit("a  b \t c\n"), ["a","b","c"])
 
+@test isequal(collect(eachsplit("foo,bar,baz", 'x')), ["foo,bar,baz"])
+@test isequal(collect(eachsplit("foo,bar,baz", ',')), ["foo","bar","baz"])
+@test isequal(collect(eachsplit("foo,bar,baz", ",")), ["foo","bar","baz"])
+@test isequal(collect(eachsplit("foo,bar,baz", r",")), ["foo","bar","baz"])
+@test isequal(collect(eachsplit("foo,bar,baz", ','; limit=0)), ["foo","bar","baz"])
+@test isequal(collect(eachsplit("foo,bar,baz", ','; limit=1)), ["foo,bar,baz"])  # unsplit
+@test isequal(collect(eachsplit("foo,bar,baz", ','; limit=2)), ["foo","bar,baz"])
+@test isequal(collect(eachsplit("foo,bar,baz", ','; limit=3)), ["foo","bar","baz"])
+@test isequal(collect(eachsplit("foo,bar", "o,b")), ["fo","ar"])
+
+@test isequal(collect(eachsplit("", ',')), [""])
+@test isequal(collect(eachsplit(",", ',')), ["",""])
+@test isequal(collect(eachsplit(",,", ',')), ["","",""])
+@test isequal(collect(eachsplit(",,", ','; limit=2)), ["",","])  # left to right, as `split`
+@test isequal(collect(eachsplit("", ','  ; keep=false)), [])
+@test isequal(collect(eachsplit(",", ',' ; keep=false)), [])
+@test isequal(collect(eachsplit(",,", ','; keep=false)), [])
+
 let str = "a.:.ba..:..cba.:.:.dcba.:."
 @test isequal(split(str, ".:."), ["a","ba.",".cba",":.dcba",""])
 @test isequal(split(str, ".:."; keep=false), ["a","ba.",".cba",":.dcba"])
@@ -93,6 +111,14 @@ let str = "a.:.ba..:..cba.:.:.dcba.:."
 @test isequal(rsplit(str, ".:."; limit=4), ["a.:.ba.", ".cba.:", "dcba", ""])
 @test isequal(rsplit(str, ".:."; limit=5), ["a", "ba.", ".cba.:", "dcba", ""])
 @test isequal(rsplit(str, ".:."; limit=6), ["a", "ba.", ".cba.:", "dcba", ""])
+
+# eachsplit scans left to right, so it must agree with `split` (not `rsplit`) above
+@test isequal(collect(eachsplit(str, ".:.")), ["a","ba.",".cba",":.dcba",""])
+@test isequal(collect(eachsplit(str, ".:."; keep=false)), ["a","ba.",".cba",":.dcba"])
+@test isequal(collect(eachsplit(str, ".:."; limit=2)), ["a", "ba..:..cba.:.:.dcba.:."])
+@test isequal(collect(eachsplit(str, ".:."; limit=3)), ["a", "ba.", ".cba.:.:.dcba.:."])
+@test isequal(collect(eachsplit(str, ".:."; limit=4)), ["a", "ba.", ".cba", ":.dcba.:."])
+@test isequal(collect(eachsplit(str, ".:."; limit=5)), ["a", "ba.", ".cba", ":.dcba", ""])
 end
 
 # zero-width splits
@@ -113,6 +139,21 @@ end
 @test isequal(split("abcd", r"d+"), ["abc",""])
 @test isequal(split("abcd", r"[ad]?"), ["","b","c",""])
 
+@test isequal(collect(eachsplit("", "")), [""])  # eachsplit counterparts of the split tests
+@test isequal(collect(eachsplit("", r"")), [""])
+@test isequal(collect(eachsplit("abc", "")), ["a","b","c"])
+@test isequal(collect(eachsplit("abc", r"")), ["a","b","c"])
+@test isequal(collect(eachsplit("abcd", r"b?")), ["a","c","d"])
+@test isequal(collect(eachsplit("abcd", r"b*")), ["a","c","d"])
+@test isequal(collect(eachsplit("abcd", r"b+")), ["a","cd"])
+@test isequal(collect(eachsplit("abcd", r"b?c?")), ["a","d"])
+@test isequal(collect(eachsplit("abcd", r"[bc]?")), ["a","","d"])
+@test isequal(collect(eachsplit("abcd", r"a*")), ["","b","c","d"])
+@test isequal(collect(eachsplit("abcd", r"a+")), ["","bcd"])
+@test isequal(collect(eachsplit("abcd", r"d*")), ["a","b","c",""])
+@test isequal(collect(eachsplit("abcd", r"d+")), ["abc",""])
+@test isequal(collect(eachsplit("abcd", r"[ad]?")), ["","b","c",""])
+
 # replace
 @test replace("\u2202", '*', '\0') == "\u2202"