From a33e80df30b8ffb4aba1a75db63c4792e5bac004 Mon Sep 17 00:00:00 2001 From: Daniel Jones Date: Fri, 9 Aug 2013 01:01:36 -0700 Subject: [PATCH 1/3] A faster matchall function that returns an array of strings. --- base/regex.jl | 42 ++++++++++++++++++++++++++++++++++++++---- test/regex.jl | 14 +++++++++----- 2 files changed, 47 insertions(+), 9 deletions(-) diff --git a/base/regex.jl b/base/regex.jl index f4a4a1acb6171..3c4feeca52e4f 100644 --- a/base/regex.jl +++ b/base/regex.jl @@ -100,11 +100,45 @@ match(r::Regex, s::String) = match(r, s, start(s)) match(r::Regex, s::String, i::Integer) = error("regex matching is only available for bytestrings; use bytestring(s) to convert") -function matchall(re::Regex, str::ByteString, overlap::Bool) - [eachmatch(re, str, overlap)...] -end +function matchall(re::Regex, str::ByteString, overlap::Bool=false) + extra = PCRE.study(re.regex, PCRE.STUDY_JIT_COMPILE) + n = length(str.data) + matches = SubString[] + offset = int32(0) + opts = re.options & PCRE.EXECUTE_MASK + opts_nonempty = opts | PCRE.ANCHORED | PCRE.NOTEMPTY_ATSTART + prevempty = false + ovec = Array(Int32, 3) + while true + result = ccall((:pcre_exec, :libpcre), Int32, + (Ptr{Void}, Ptr{Void}, Ptr{Uint8}, Int32, + Int32, Int32, Ptr{Int32}, Int32), + re.regex, extra, str, n, + offset, prevempty ? 
opts_nonempty : opts, ovec, 3) + + if result < 0 + if prevempty && offset < n + offset = int32(next(str, offset + 1)[2] - 1) + prevempty = false + continue + else + break + end + end -matchall(re::Regex, str::ByteString) = matchall(re, str, false) + push!(matches, SubString(str, ovec[1]+1, ovec[2])) + prevempty = offset == ovec[2] + if overlap + if !prevempty + offset = int32(next(str, offset + 1)[2] - 1) + end + else + offset = ovec[2] + end + end + PCRE.free_study(extra) + matches +end function search(str::ByteString, re::Regex, idx::Integer) if idx > nextind(str,endof(str)) diff --git a/test/regex.jl b/test/regex.jl index 048fc6e0b783b..1881c1d275e26 100644 --- a/test/regex.jl +++ b/test/regex.jl @@ -1,7 +1,11 @@ -matches(args...) = map(m->m.match,matchall(args...)) -@test matches(r"a?b?", "asbd") == ["a","","b","",""] -@test matches(r"\w+", "hello", true) == ["hello","ello","llo","lo","o"] -@test matches(r".\s", "x \u2200 x \u2203 y") == ["x ", "∀ ", "x ", "∃ "] -@test matches(r"(\w+)(\s*)", "The dark side of the moon") == +@test matchall(r"a?b?", "asbd") == ["a","","b","",""] +@test matchall(r"a?b?", "asbd", true) == ["a","","b","",""] +@test matchall(r"\w+", "hello", true) == ["hello","ello","llo","lo","o"] +@test matchall(r".\s", "x \u2200 x \u2203 y") == ["x ", "∀ ", "x ", "∃ "] +@test matchall(r"(\w+)(\s*)", "The dark side of the moon") == ["The ", "dark ", "side ", "of ", "the ", "moon"] + +@test matchall(r"aa", "aaaa", true) == ["aa", "aa", "aa"] +@test matchall(r"", "aaa") == ["", "", "", ""] +@test matchall(r"", "aaa", true) == ["", "", "", ""] From 892188be3a01a53226a9b47526bc03d5572c3141 Mon Sep 17 00:00:00 2001 From: Daniel Jones Date: Fri, 9 Aug 2013 11:42:48 -0700 Subject: [PATCH 2/3] Make match return SubStrings. Improvements to eachmatch. 
--- base/regex.jl | 87 ++++++++++++++++++++++++++----------------------- base/string.jl | 9 ++++- base/version.jl | 2 +- test/regex.jl | 26 ++++++++++----- 4 files changed, 73 insertions(+), 51 deletions(-) diff --git a/base/regex.jl b/base/regex.jl index 3c4feeca52e4f..fc1449747202d 100644 --- a/base/regex.jl +++ b/base/regex.jl @@ -59,8 +59,8 @@ end # or maybe it's better to just fail since that would be quite slow immutable RegexMatch - match::ByteString - captures::Vector{Union(Nothing,ByteString)} + match::SubString + captures::Vector{Union(Nothing,SubString)} offset::Int offsets::Vector{Int} end @@ -85,17 +85,17 @@ end ismatch(r::Regex, s::String) = PCRE.exec(r.regex, C_NULL, bytestring(s), 0, r.options & PCRE.EXECUTE_MASK, false) -function match(re::Regex, str::ByteString, idx::Integer, add_opts::Uint32) +function match(re::Regex, str::ByteString, idx::Integer, add_opts::Uint32=uint32(0), + extra::Ptr{Void}=C_NULL) opts = re.options & PCRE.EXECUTE_MASK | add_opts - m, n = PCRE.exec(re.regex, C_NULL, str, idx-1, opts, true) + m, n = PCRE.exec(re.regex, extra, str, idx-1, opts, true) if isempty(m); return nothing; end - mat = str[m[1]+1:m[2]] - cap = Union(Nothing,ByteString)[ - m[2i+1] < 0 ? nothing : str[m[2i+1]+1:m[2i+2]] for i=1:n ] + mat = SubString(str, m[1]+1, m[2]) + cap = Union(Nothing,SubString)[ + m[2i+1] < 0 ? 
nothing : SubString(str, m[2i+1]+1, m[2i+2]) for i=1:n ] off = Int[ m[2i+1]::Int32+1 for i=1:n ] RegexMatch(mat, cap, m[1]+1, off) end -match(re::Regex, str::ByteString, idx::Integer) = match(re, str, idx, uint32(0)) match(r::Regex, s::String) = match(r, s, start(s)) match(r::Regex, s::String, i::Integer) = error("regex matching is only available for bytestrings; use bytestring(s) to convert") @@ -118,7 +118,7 @@ function matchall(re::Regex, str::ByteString, overlap::Bool=false) if result < 0 if prevempty && offset < n - offset = int32(next(str, offset + 1)[2] - 1) + offset = int32(nextind(str, offset + 1) - 1) prevempty = false continue else @@ -130,7 +130,7 @@ function matchall(re::Regex, str::ByteString, overlap::Bool=false) prevempty = offset == ovec[2] if overlap if !prevempty - offset = int32(next(str, offset + 1)[2] - 1) + offset = int32(nextind(str, offset + 1) - 1) end else offset = ovec[2] @@ -156,52 +156,59 @@ immutable RegexMatchIterator regex::Regex string::ByteString overlap::Bool + extra::Ptr{Void} - function RegexMatchIterator(regex::Regex, string::String, ovr::Bool) - new(regex, string, ovr) + function RegexMatchIterator(regex::Regex, string::String, ovr::Bool=false) + extra = PCRE.study(regex.regex, PCRE.STUDY_JIT_COMPILE) + new(regex, string, ovr, extra) end - RegexMatchIterator(regex::Regex, string::String) = RegexMatchIterator(regex, string, false) end eltype(itr::RegexMatchIterator) = RegexMatch -start(itr::RegexMatchIterator) = match(itr.regex, itr.string, 1) +start(itr::RegexMatchIterator) = match(itr.regex, itr.string, 1, uint32(0), itr.extra) done(itr::RegexMatchIterator, prev_match) = (prev_match == nothing) # Assumes prev_match is not nothing function next(itr::RegexMatchIterator, prev_match) - m = prev_match - str = itr.string + prevempty = isempty(prev_match.match) + + if itr.overlap + if !prevempty + offset = nextind(itr.string, prev_match.offset) + else + offset = prev_match.offset + end + else + offset = prev_match.offset + 
endof(prev_match.match) + end + opts_nonempty = uint32(PCRE.ANCHORED | PCRE.NOTEMPTY_ATSTART) while true - opts = uint32(0) - if m != nothing - idx = itr.overlap ? next(str, m.offset)[2] : m.offset + length(m.match.data) - - if length(m.match) == 0 - if m.offset == length(str.data) + 1 - break - end - opts = opts | PCRE.ANCHORED | PCRE.NOTEMPTY_ATSTART - end - end - - m = match(itr.regex, str, idx, opts) - if m == nothing - if opts == 0 - break - end - idx = next(str, idx)[2] - continue - end - - return (prev_match, m) + mat = match(itr.regex, itr.string, offset, + prevempty ? opts_nonempty : uint32(0), itr.extra) + + if mat === nothing + if prevempty && offset <= length(itr.string.data) + offset = nextind(itr.string, offset) + prevempty = false + continue + else + break + end + else + return (prev_match, mat) + end end + PCRE.free_study(itr.extra) (prev_match, nothing) end -eachmatch(re::Regex, str::String, ovr::Bool) = RegexMatchIterator(re,str,ovr) -eachmatch(re::Regex, str::String) = RegexMatchIterator(re,str) +function eachmatch(re::Regex, str::String, ovr::Bool=false) + RegexMatchIterator(re,str,ovr) +end + +eachmatch(re::Regex, str::String) = RegexMatchIterator(re,str) # miscellaneous methods that depend on Regex being defined diff --git a/base/string.jl b/base/string.jl index 81e33336d9fef..931b789f785ba 100644 --- a/base/string.jl +++ b/base/string.jl @@ -388,9 +388,14 @@ immutable SubString{T<:String} <: String if i > endof(s) || j i + j -= 1 + end + o = i-1 new(s, o, max(0, j-o)) end @@ -417,6 +422,8 @@ end getindex(s::SubString, i::Int) = getindex(s.string, i+s.offset) +isempty(s::SubString) = s.endof == 0 + endof(s::SubString) = s.endof # TODO: length(s::SubString) = ?? 
# default implementation will work but it's slow diff --git a/base/version.jl b/base/version.jl index 71c162a6d31eb..0460a03b5fb60 100644 --- a/base/version.jl +++ b/base/version.jl @@ -75,7 +75,7 @@ function split_idents(s::String) idents = split(s, '.') ntuple(length(idents)) do i ident = idents[i] - ismatch(r"^\d+$", ident) ? parseint(ident) : ident + ismatch(r"^\d+$", ident) ? parseint(ident) : bytestring(ident) end end diff --git a/test/regex.jl b/test/regex.jl index 1881c1d275e26..30c7bd940c9b0 100644 --- a/test/regex.jl +++ b/test/regex.jl @@ -1,11 +1,19 @@ -@test matchall(r"a?b?", "asbd") == ["a","","b","",""] -@test matchall(r"a?b?", "asbd", true) == ["a","","b","",""] -@test matchall(r"\w+", "hello", true) == ["hello","ello","llo","lo","o"] -@test matchall(r".\s", "x \u2200 x \u2203 y") == ["x ", "∀ ", "x ", "∃ "] -@test matchall(r"(\w+)(\s*)", "The dark side of the moon") == - ["The ", "dark ", "side ", "of ", "the ", "moon"] +function collect_eachmatch(re, str, overlap=false) + [m.match for m in collect(eachmatch(re, str, overlap))] +end -@test matchall(r"aa", "aaaa", true) == ["aa", "aa", "aa"] -@test matchall(r"", "aaa") == ["", "", "", ""] -@test matchall(r"", "aaa", true) == ["", "", "", ""] +for f in [matchall, collect_eachmatch] + @test f(r"a?b?", "asbd") == ["a","","b","",""] + @test f(r"a?b?", "asbd", true) == ["a","","b","",""] + @test f(r"\w+", "hello", true) == ["hello","ello","llo","lo","o"] + @test f(r".\s", "x \u2200 x \u2203 y") == ["x ", "∀ ", "x ", "∃ "] + @test f(r"(\w+)(\s*)", "The dark side of the moon") == + ["The ", "dark ", "side ", "of ", "the ", "moon"] + @test f(r"", "") == [""] + @test f(r"", "", true) == [""] + @test f(r"aa", "aaaa") == ["aa", "aa"] + @test f(r"aa", "aaaa", true) == ["aa", "aa", "aa"] + @test f(r"", "aaa") == ["", "", "", ""] + @test f(r"", "aaa", true) == ["", "", "", ""] +end From c5d4b8791b40973b6efacd516b36cbe5d8d05f82 Mon Sep 17 00:00:00 2001 From: Daniel Jones Date: Fri, 9 Aug 2013 12:59:10 -0700 
Subject: [PATCH 3/3] Use utf8 strings specifically in regex. --- base/regex.jl | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/base/regex.jl b/base/regex.jl index fc1449747202d..5e0bf320289c6 100644 --- a/base/regex.jl +++ b/base/regex.jl @@ -59,8 +59,8 @@ end # or maybe it's better to just fail since that would be quite slow immutable RegexMatch - match::SubString - captures::Vector{Union(Nothing,SubString)} + match::SubString{UTF8String} + captures::Vector{Union(Nothing,SubString{UTF8String})} offset::Int offsets::Vector{Int} end @@ -85,25 +85,29 @@ end ismatch(r::Regex, s::String) = PCRE.exec(r.regex, C_NULL, bytestring(s), 0, r.options & PCRE.EXECUTE_MASK, false) -function match(re::Regex, str::ByteString, idx::Integer, add_opts::Uint32=uint32(0), +function match(re::Regex, str::UTF8String, idx::Integer, add_opts::Uint32=uint32(0), extra::Ptr{Void}=C_NULL) opts = re.options & PCRE.EXECUTE_MASK | add_opts m, n = PCRE.exec(re.regex, extra, str, idx-1, opts, true) if isempty(m); return nothing; end mat = SubString(str, m[1]+1, m[2]) - cap = Union(Nothing,SubString)[ + cap = Union(Nothing,SubString{UTF8String})[ m[2i+1] < 0 ? 
nothing : SubString(str, m[2i+1]+1, m[2i+2]) for i=1:n ] off = Int[ m[2i+1]::Int32+1 for i=1:n ] RegexMatch(mat, cap, m[1]+1, off) end + +match(re::Regex, str::ByteString, idx::Integer, add_opts::Uint32=uint32(0)) = + match(re, utf8(str), idx, add_opts) + match(r::Regex, s::String) = match(r, s, start(s)) match(r::Regex, s::String, i::Integer) = error("regex matching is only available for bytestrings; use bytestring(s) to convert") -function matchall(re::Regex, str::ByteString, overlap::Bool=false) +function matchall(re::Regex, str::UTF8String, overlap::Bool=false) extra = PCRE.study(re.regex, PCRE.STUDY_JIT_COMPILE) n = length(str.data) - matches = SubString[] + matches = SubString{UTF8String}[] offset = int32(0) opts = re.options & PCRE.EXECUTE_MASK opts_nonempty = opts | PCRE.ANCHORED | PCRE.NOTEMPTY_ATSTART @@ -140,6 +144,9 @@ function matchall(re::Regex, str::ByteString, overlap::Bool=false) matches end +matchall(re::Regex, str::ByteString, overlap::Bool=false) = + matchall(re, utf8(str), overlap) + function search(str::ByteString, re::Regex, idx::Integer) if idx > nextind(str,endof(str)) throw(BoundsError()) @@ -154,7 +161,7 @@ search(s::String, r::Regex) = search(s,r,start(s)) immutable RegexMatchIterator regex::Regex - string::ByteString + string::UTF8String overlap::Bool extra::Ptr{Void}