Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

RFC: Regex improvements #4002

Merged
merged 3 commits into from
Aug 9, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
132 changes: 90 additions & 42 deletions base/regex.jl
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,8 @@ end
# or maybe it's better to just fail since that would be quite slow

# Result of a successful regex match.
#
#   match    -- the matched text, as a SubString of the searched string
#   captures -- one entry per capture group; `nothing` for a group whose
#               reported start offset is negative
#   offset   -- 1-based index in the searched string where the match starts
#   offsets  -- 1-based start index of each capture group
#
# The text fields are SubStrings (views), not copies of the matched bytes.
immutable RegexMatch
    match::SubString{UTF8String}
    captures::Vector{Union(Nothing,SubString{UTF8String})}
    offset::Int
    offsets::Vector{Int}
end
Expand All @@ -85,26 +85,67 @@ end
# Run `r` against `s` from byte offset 0 and return what PCRE.exec reports.
# Only the execution-time bits of the regex options are forwarded, and the
# string is converted with bytestring() before it is handed to PCRE.
function ismatch(r::Regex, s::String)
    exec_opts = r.options & PCRE.EXECUTE_MASK
    PCRE.exec(r.regex, C_NULL, bytestring(s), 0, exec_opts, false)
end

# Search `str` for `re`, starting at the 1-based index `idx`.
#
#   add_opts -- extra PCRE execution options OR-ed into the regex's own
#   extra    -- optional PCRE study data passed through to PCRE.exec
#
# Returns `nothing` when PCRE.exec reports no offsets, otherwise a RegexMatch
# whose text fields are SubStrings of `str`.
function match(re::Regex, str::UTF8String, idx::Integer, add_opts::Uint32=uint32(0),
               extra::Ptr{Void}=C_NULL)
    opts = (re.options & PCRE.EXECUTE_MASK) | add_opts
    # PCRE offsets are 0-based, hence idx-1 going in and +1 coming out.
    m, n = PCRE.exec(re.regex, extra, str, idx-1, opts, true)
    if isempty(m); return nothing; end
    mat = SubString(str, m[1]+1, m[2])
    # m holds (start, end) pairs: the whole match first, then one per group.
    cap = Union(Nothing,SubString{UTF8String})[
            m[2i+1] < 0 ? nothing : SubString(str, m[2i+1]+1, m[2i+2]) for i=1:n ]
    off = Int[ m[2i+1]::Int32+1 for i=1:n ]
    RegexMatch(mat, cap, m[1]+1, off)
end
# Any other ByteString is converted with utf8() and forwarded to the
# UTF8String method.  The default for `add_opts` also provides the
# three-argument form, so no separate definition is needed for it.
match(re::Regex, str::ByteString, idx::Integer, add_opts::Uint32=uint32(0)) =
    match(re, utf8(str), idx, add_opts)

# Two-argument form: begin matching at the first index of `s`.
function match(r::Regex, s::String)
    match(r, s, start(s))
end
# Fallback for string types without a more specific method: always raises,
# telling the caller to convert the string first.
function match(r::Regex, s::String, i::Integer)
    error("regex matching is only available for bytestrings; use bytestring(s) to convert")
end

# Return every match of `re` in `str` as a vector of SubStrings of `str`.
#
# With overlap=false the search resumes where the previous match ended; with
# overlap=true it resumes one character after the previous match *started*.
# After an empty match the same position is retried once with
# ANCHORED|NOTEMPTY_ATSTART, and if that fails the search moves on by one
# character, so an empty match is never reported twice.
function matchall(re::Regex, str::UTF8String, overlap::Bool=false)
    extra = PCRE.study(re.regex, PCRE.STUDY_JIT_COMPILE)
    n = length(str.data)
    matches = SubString{UTF8String}[]
    offset = int32(0)   # 0-based byte offset at which the next search starts
    opts = re.options & PCRE.EXECUTE_MASK
    opts_nonempty = opts | PCRE.ANCHORED | PCRE.NOTEMPTY_ATSTART
    prevempty = false
    ovec = Array(Int32, 3)   # ovec[1], ovec[2]: 0-based start / end of the match
    while true
        result = ccall((:pcre_exec, :libpcre), Int32,
                       (Ptr{Void}, Ptr{Void}, Ptr{Uint8}, Int32,
                        Int32, Int32, Ptr{Int32}, Int32),
                       re.regex, extra, str, n,
                       offset, prevempty ? opts_nonempty : opts, ovec, 3)

        if result < 0
            if prevempty && offset < n
                # The non-empty retry failed: step over one character and
                # resume ordinary searching.
                offset = int32(nextind(str, offset + 1) - 1)
                prevempty = false
                continue
            else
                break
            end
        end

        push!(matches, SubString(str, ovec[1]+1, ovec[2]))
        # Judge emptiness from the match itself.  Comparing against the search
        # offset misses an empty match found further along, which would then
        # be found (and pushed) a second time.
        prevempty = ovec[1] == ovec[2]
        if overlap && !prevempty
            # Advance from where the match began, not from where the search
            # began; otherwise a match starting past `offset` is found again.
            offset = int32(nextind(str, ovec[1] + 1) - 1)
        else
            offset = ovec[2]
        end
    end
    PCRE.free_study(extra)
    matches
end

# Any other ByteString is converted with utf8() and forwarded to the
# UTF8String method.  The default for `overlap` also provides the
# two-argument form.
matchall(re::Regex, str::ByteString, overlap::Bool=false) =
    matchall(re, utf8(str), overlap)

function search(str::ByteString, re::Regex, idx::Integer)
if idx > nextind(str,endof(str))
Expand All @@ -120,54 +161,61 @@ search(s::String, r::Regex) = search(s,r,start(s))

# Iterator over the successive matches of `regex` in `string`.
#
#   overlap -- whether matches may overlap
#   extra   -- PCRE study data obtained in the constructor and handed to
#              every match call made during iteration
#
# next() frees `extra` once the iteration has run to completion.
# NOTE(review): it looks like `extra` is never freed when iteration stops
# early or when start() finds no match at all, and iterating a finished
# iterator again would pass the freed pointer to PCRE -- confirm.
immutable RegexMatchIterator
    regex::Regex
    string::UTF8String
    overlap::Bool
    extra::Ptr{Void}

    # `ovr` defaults to false, which also covers the two-argument call.
    function RegexMatchIterator(regex::Regex, string::String, ovr::Bool=false)
        extra = PCRE.study(regex.regex, PCRE.STUDY_JIT_COMPILE)
        new(regex, string, ovr, extra)
    end
end

# Iteration protocol.  The state is the match that will be yielded next, or
# `nothing` once there are no more matches.
eltype(itr::RegexMatchIterator) = RegexMatch
# The first search starts at index 1 and reuses the iterator's study data.
start(itr::RegexMatchIterator) = match(itr.regex, itr.string, 1, uint32(0), itr.extra)
# Identity comparison: the state is either a RegexMatch or exactly `nothing`.
done(itr::RegexMatchIterator, prev_match) = (prev_match === nothing)

# Advance the iteration.  `prev_match` (the state) is returned as the current
# element; the following match, or `nothing` when there is none, becomes the
# new state.  The study data is freed when the input is exhausted.
# Assumes prev_match is not nothing
function next(itr::RegexMatchIterator, prev_match)
    prevempty = isempty(prev_match.match)

    if prevempty
        # Stay put: the first retry below is anchored here and must be
        # non-empty.
        offset = prev_match.offset
    elseif itr.overlap
        # Overlapping matches resume one character after the previous start.
        offset = nextind(itr.string, prev_match.offset)
    else
        # Resume at the first index past the previous match.  endof(match) is
        # the index of the match's last character, not its length in bytes,
        # so adding it to the start would land inside a trailing multi-byte
        # character; step on from that last character instead.
        lastchar = prev_match.offset + endof(prev_match.match) - 1
        offset = nextind(itr.string, lastchar)
    end

    opts_nonempty = uint32(PCRE.ANCHORED | PCRE.NOTEMPTY_ATSTART)
    while true
        mat = match(itr.regex, itr.string, offset,
                    prevempty ? opts_nonempty : uint32(0), itr.extra)

        if mat !== nothing
            return (prev_match, mat)
        end

        if prevempty && offset <= length(itr.string.data)
            # The non-empty retry failed: move on one character and search
            # normally from there.
            offset = nextind(itr.string, offset)
            prevempty = false
        else
            break
        end
    end

    PCRE.free_study(itr.extra)
    (prev_match, nothing)
end

# Return an iterator over the matches of `re` in `str`.  `ovr` selects
# overlapping matches and defaults to false, which also provides the
# two-argument form.
function eachmatch(re::Regex, str::String, ovr::Bool=false)
    RegexMatchIterator(re,str,ovr)
end

# miscellaneous methods that depend on Regex being defined

Expand Down
9 changes: 8 additions & 1 deletion base/string.jl
Original file line number Diff line number Diff line change
Expand Up @@ -388,9 +388,14 @@ immutable SubString{T<:String} <: String
if i > endof(s) || j<i
return new(s, i, 0)
else
if !isvalid(s,i) || !isvalid(s,j)
if !isvalid(s,i)
error("invalid SubString indexes")
end

while !isvalid(s,j) && j > i
j -= 1
end

o = i-1
new(s, o, max(0, j-o))
end
Expand All @@ -417,6 +422,8 @@ end

# Index into the parent string, shifted by the substring's offset.
getindex(s::SubString, i::Int) = s.string[i + s.offset]

# A substring is empty exactly when its stored end index is zero.
function isempty(s::SubString)
    s.endof == 0
end

# Last valid index of the substring, as stored at construction.
function endof(s::SubString)
    s.endof
end
# TODO: length(s::SubString) = ??
# default implementation will work but it's slow
Expand Down
2 changes: 1 addition & 1 deletion base/version.jl
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ function split_idents(s::String)
idents = split(s, '.')
ntuple(length(idents)) do i
ident = idents[i]
ismatch(r"^\d+$", ident) ? parseint(ident) : ident
ismatch(r"^\d+$", ident) ? parseint(ident) : bytestring(ident)
end
end

Expand Down
24 changes: 18 additions & 6 deletions test/regex.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,19 @@
# Gather the matched text of every match produced by eachmatch, so that
# eachmatch can be checked against the same expectations as matchall.
# The iterator is collected first and the comprehension runs over the
# resulting array.
function collect_eachmatch(re, str, overlap=false)
    [m.match for m in collect(eachmatch(re, str, overlap))]
end

# matchall and eachmatch must agree, so every expectation is checked
# against both.
for f in [matchall, collect_eachmatch]
    # optional pieces: empty matches between and after the non-empty ones
    @test f(r"a?b?", "asbd") == ["a","","b","",""]
    @test f(r"a?b?", "asbd", true) == ["a","","b","",""]

    # the empty pattern
    @test f(r"", "") == [""]
    @test f(r"", "", true) == [""]
    @test f(r"", "aaa") == ["", "", "", ""]
    @test f(r"", "aaa", true) == ["", "", "", ""]

    # overlapping versus non-overlapping
    @test f(r"aa", "aaaa") == ["aa", "aa"]
    @test f(r"aa", "aaaa", true) == ["aa", "aa", "aa"]
    @test f(r"\w+", "hello", true) == ["hello","ello","llo","lo","o"]

    # multi-byte characters and capture groups
    @test f(r".\s", "x \u2200 x \u2203 y") == ["x ", "∀ ", "x ", "∃ "]
    @test f(r"(\w+)(\s*)", "The dark side of the moon") ==
        ["The ", "dark ", "side ", "of ", "the ", "moon"]
end