JuliaLang · StefanKarpinski · Aug 9, 2013 · Aug 9, 2013 · Aug 9, 2013 · Aug 9, 2013
diff --git a/base/regex.jl b/base/regex.jl
@@ -59,8 +59,8 @@ end
 # or maybe it's better to just fail since that would be quite slow
 
 immutable RegexMatch
-    match::ByteString
-    captures::Vector{Union(Nothing,ByteString)}
+    match::SubString{UTF8String}
+    captures::Vector{Union(Nothing,SubString{UTF8String})}
     offset::Int
     offsets::Vector{Int}
 end
@@ -85,26 +85,67 @@ end
 ismatch(r::Regex, s::String) =
     PCRE.exec(r.regex, C_NULL, bytestring(s), 0, r.options & PCRE.EXECUTE_MASK, false)
 
-function match(re::Regex, str::ByteString, idx::Integer, add_opts::Uint32)
+function match(re::Regex, str::UTF8String, idx::Integer, add_opts::Uint32=uint32(0),
+               extra::Ptr{Void}=C_NULL)
     opts = re.options & PCRE.EXECUTE_MASK | add_opts
-    m, n = PCRE.exec(re.regex, C_NULL, str, idx-1, opts, true)
+    m, n = PCRE.exec(re.regex, extra, str, idx-1, opts, true)
     if isempty(m); return nothing; end
-    mat = str[m[1]+1:m[2]]
-    cap = Union(Nothing,ByteString)[
-            m[2i+1] < 0 ? nothing : str[m[2i+1]+1:m[2i+2]] for i=1:n ]
+    mat = SubString(str, m[1]+1, m[2])
+    cap = Union(Nothing,SubString{UTF8String})[
+            m[2i+1] < 0 ? nothing : SubString(str, m[2i+1]+1, m[2i+2]) for i=1:n ]
     off = Int[ m[2i+1]::Int32+1 for i=1:n ]
     RegexMatch(mat, cap, m[1]+1, off)
 end
-match(re::Regex, str::ByteString, idx::Integer) = match(re, str, idx, uint32(0))
+
+match(re::Regex, str::ByteString, idx::Integer, add_opts::Uint32=uint32(0)) =
+    match(re, utf8(str), idx, add_opts)
+
 match(r::Regex, s::String) = match(r, s, start(s))
 match(r::Regex, s::String, i::Integer) =
     error("regex matching is only available for bytestrings; use bytestring(s) to convert")
 
-function matchall(re::Regex, str::ByteString, overlap::Bool)
-    [eachmatch(re, str, overlap)...]
+function matchall(re::Regex, str::UTF8String, overlap::Bool=false)
+    extra = PCRE.study(re.regex, PCRE.STUDY_JIT_COMPILE)
+    n = length(str.data)
+    matches = SubString{UTF8String}[]
+    offset = int32(0)
+    opts = re.options & PCRE.EXECUTE_MASK
+    opts_nonempty = opts | PCRE.ANCHORED | PCRE.NOTEMPTY_ATSTART
+    prevempty = false
+    ovec = Array(Int32, 3)
+    while true
+        result = ccall((:pcre_exec, :libpcre), Int32,
+                       (Ptr{Void}, Ptr{Void}, Ptr{Uint8}, Int32,
+                       Int32, Int32, Ptr{Int32}, Int32),
+                       re.regex, extra, str, n,
+                       offset, prevempty ? opts_nonempty : opts, ovec, 3)
+
+        if result < 0
+            # the anchored non-empty retry after an empty match failed: skip one character
+            if prevempty && offset < n
+                offset = int32(nextind(str, offset + 1) - 1)
+                prevempty = false
+                continue
+            else
+                break
+            end
+        end
+
+        push!(matches, SubString(str, ovec[1]+1, ovec[2]))
+        # ovec[1]/ovec[2] are the 0-based start/end byte offsets of the match just found
+        prevempty = ovec[1] == ovec[2]
+        if overlap && !prevempty
+            offset = int32(nextind(str, ovec[1] + 1) - 1)
+        else
+            offset = ovec[2]
+        end
+    end
+    PCRE.free_study(extra)
+    matches
 end
 
-matchall(re::Regex, str::ByteString) = matchall(re, str, false)
+matchall(re::Regex, str::ByteString, overlap::Bool=false) =
+    matchall(re, utf8(str), overlap)
 
 function search(str::ByteString, re::Regex, idx::Integer)
     if idx > nextind(str,endof(str))
@@ -120,54 +161,61 @@ search(s::String, r::Regex) = search(s,r,start(s))
 
 immutable RegexMatchIterator
     regex::Regex
-    string::ByteString
+    string::UTF8String
     overlap::Bool
+    extra::Ptr{Void}
 
-    function RegexMatchIterator(regex::Regex, string::String, ovr::Bool)
-        new(regex, string, ovr)
+    function RegexMatchIterator(regex::Regex, string::String, ovr::Bool=false)
+        extra = PCRE.study(regex.regex, PCRE.STUDY_JIT_COMPILE)
+        new(regex, string, ovr, extra)
     end
-    RegexMatchIterator(regex::Regex, string::String) = RegexMatchIterator(regex, string, false)
 end
 
 eltype(itr::RegexMatchIterator) = RegexMatch
-start(itr::RegexMatchIterator) = match(itr.regex, itr.string, 1)
+start(itr::RegexMatchIterator) = match(itr.regex, itr.string, 1, uint32(0), itr.extra)
 done(itr::RegexMatchIterator, prev_match) = (prev_match == nothing)
 
 # Assumes prev_match is not nothing
 function next(itr::RegexMatchIterator, prev_match)
-    m = prev_match
-    str = itr.string
+    prevempty = isempty(prev_match.match)
+
+    if prevempty
+        offset = prev_match.offset
+    elseif itr.overlap
+        offset = nextind(itr.string, prev_match.offset)
+    else
+        # endof(match) is the index of the match's last character, not its byte
+        # length, so step past that whole (possibly multi-byte) character.
+        offset = nextind(itr.string, prev_match.offset + endof(prev_match.match) - 1)
+    end
 
+    opts_nonempty = uint32(PCRE.ANCHORED | PCRE.NOTEMPTY_ATSTART)
     while true
-      opts = uint32(0)
-      if m != nothing
-          idx = itr.overlap ? next(str, m.offset)[2] : m.offset + length(m.match.data)
-
-          if length(m.match) == 0
-              if m.offset == length(str.data) + 1
-                  break
-              end
-              opts = opts | PCRE.ANCHORED | PCRE.NOTEMPTY_ATSTART
-          end
-      end
-
-      m = match(itr.regex, str, idx, opts)
-      if m == nothing
-          if opts == 0
-              break
-          end
-          idx = next(str, idx)[2]
-          continue
-      end
-
-      return (prev_match, m)
+        mat = match(itr.regex, itr.string, offset,
+                    prevempty ? opts_nonempty : uint32(0), itr.extra)
+
+        if mat === nothing
+            if prevempty && offset <= length(itr.string.data)
+                offset = nextind(itr.string, offset)
+                prevempty = false
+                continue
+            else
+                break
+            end
+        else
+            return (prev_match, mat)
+        end
     end
 
+    PCRE.free_study(itr.extra)
     (prev_match, nothing)
 end
 
-eachmatch(re::Regex, str::String, ovr::Bool) = RegexMatchIterator(re,str,ovr)
-eachmatch(re::Regex, str::String)            = RegexMatchIterator(re,str)
+# Return an iterator over the matches of `re` in `str`. The default `ovr=false`
+# already provides the two-argument form, so no separate method is defined for it.
+function eachmatch(re::Regex, str::String, ovr::Bool=false)
+    RegexMatchIterator(re,str,ovr)
+end
 
 # miscellaneous methods that depend on Regex being defined
 

diff --git a/base/string.jl b/base/string.jl
@@ -388,9 +388,14 @@ immutable SubString{T<:String} <: String
         if i > endof(s) || j<i
             return new(s, i, 0)
         else
-            if !isvalid(s,i) || !isvalid(s,j)
+            if !isvalid(s,i)
                 error("invalid SubString indexes")
             end
+
+            while !isvalid(s,j) && j > i
+                j -= 1
+            end
+
             o = i-1
             new(s, o, max(0, j-o))
         end
@@ -417,6 +422,8 @@ end
 
 getindex(s::SubString, i::Int) = getindex(s.string, i+s.offset)
 
+isempty(s::SubString) = s.endof == 0
+
 endof(s::SubString) = s.endof
 # TODO: length(s::SubString) = ??
 # default implementation will work but it's slow

diff --git a/base/version.jl b/base/version.jl
@@ -75,7 +75,7 @@ function split_idents(s::String)
     idents = split(s, '.')
     ntuple(length(idents)) do i
         ident = idents[i]
-        ismatch(r"^\d+$", ident) ? parseint(ident) : ident
+        ismatch(r"^\d+$", ident) ? parseint(ident) : bytestring(ident)
     end
 end
 

diff --git a/test/regex.jl b/test/regex.jl
@@ -1,7 +1,19 @@
-matches(args...) = map(m->m.match,matchall(args...))
 
-@test matches(r"a?b?", "asbd") == ["a","","b","",""]
-@test matches(r"\w+", "hello", true) == ["hello","ello","llo","lo","o"]
-@test matches(r".\s", "x \u2200 x \u2203 y") == ["x ", "∀ ", "x ", "∃ "]
-@test matches(r"(\w+)(\s*)", "The dark side of the moon") ==
-	  ["The ", "dark ", "side ", "of ", "the ", "moon"]
+function collect_eachmatch(re, str, overlap=false)
+    [m.match for m in collect(eachmatch(re, str, overlap))]
+end
+
+for f in [matchall, collect_eachmatch]
+    @test f(r"a?b?", "asbd") == ["a","","b","",""]
+    @test f(r"a?b?", "asbd", true) == ["a","","b","",""]
+    @test f(r"\w+", "hello", true) == ["hello","ello","llo","lo","o"]
+    @test f(r".\s", "x \u2200 x \u2203 y") == ["x ", "∀ ", "x ", "∃ "]
+    @test f(r"(\w+)(\s*)", "The dark side of the moon") ==
+          ["The ", "dark ", "side ", "of ", "the ", "moon"]
+    @test f(r"", "") == [""]
+    @test f(r"", "", true) == [""]
+    @test f(r"aa", "aaaa") == ["aa", "aa"]
+    @test f(r"aa", "aaaa", true) == ["aa", "aa", "aa"]
+    @test f(r"", "aaa") == ["", "", "", ""]
+    @test f(r"", "aaa", true) == ["", "", "", ""]
+end