Skip to content

Commit b8ee561

Browse files
Merge pull request JuliaLang#24708 from JuliaLang/sk/revstring
remove `RevString`; efficient generic `reverseind`
2 parents e4cf911 + 5167f17 commit b8ee561

17 files changed

+156
-165
lines changed

NEWS.md

+15
Original file line numberDiff line numberDiff line change
@@ -308,6 +308,12 @@ This section lists changes that do not have deprecation warnings.
308308
`AbstractArray` types that specialized broadcasting using the old internal API will
309309
need to switch to the new API. ([#20740])
310310

311+
* The `RevString` type has been removed from the language; `reverse(::String)` returns
312+
a `String` with code points (or fragments thereof) in reverse order. In general,
313+
`reverse(s)` should return a string of the same type and encoding as `s` with code
314+
points in reverse order; any string type that overrides `reverse` to return a different
315+
type of string must also override `reverseind` to compute reversed indices correctly.
316+
311317
Library improvements
312318
--------------------
313319

@@ -409,6 +415,15 @@ Library improvements
409415
* The `keys` of an `Associative` are now an `AbstractSet`. `Base.KeyIterator{<:Associative}`
410416
has been changed to `KeySet{K, <:Associative{K}} <: AbstractSet{K}` ([#24580]).
411417

418+
* New function `ncodeunits(s::AbstractString)` gives the number of code units in a string.
419+
The generic definition is constant time but calls `endof(s)`, which may be inefficient.
420+
Therefore custom string types may want to define direct `ncodeunits` methods.
421+
422+
* `reverseind(s::AbstractString, i::Integer)` now has an efficient generic fallback, so
423+
custom string types do not need to provide their own efficient definitions. The generic
424+
definition relies on `ncodeunits`, however, so for optimal performance you may need to
425+
define a custom method for that function.
426+
412427
Compiler/Runtime improvements
413428
-----------------------------
414429

base/exports.jl

+1-1
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,6 @@ export
8888
Rational,
8989
Regex,
9090
RegexMatch,
91-
RevString,
9291
RoundFromZero,
9392
RoundDown,
9493
RoundingMode,
@@ -756,6 +755,7 @@ export
756755
lstrip,
757756
match,
758757
matchall,
758+
ncodeunits,
759759
ndigits,
760760
nextind,
761761
normalize_string,

base/precompile.jl

-3
Original file line numberDiff line numberDiff line change
@@ -578,9 +578,6 @@ precompile(Tuple{typeof(Base.LineEdit.complete_line), Base.LineEdit.PromptState,
578578
precompile(Tuple{typeof(Base.LineEdit.input_string_newlines_aftercursor), Base.LineEdit.PromptState})
579579
precompile(Tuple{typeof(Base.LineEdit.complete_line), Base.REPL.REPLCompletionProvider, Base.LineEdit.PromptState})
580580
precompile(Tuple{getfield(Base, Symbol("#kw##parse")), Array{Any, 1}, typeof(Base.parse), String})
581-
precompile(Tuple{typeof(Base.isvalid), Base.RevString{String}, Int64})
582-
precompile(Tuple{typeof(Base.nextind), Base.RevString{String}, Int64})
583-
precompile(Tuple{typeof(Base.search), Base.RevString{String}, Array{Char, 1}, Int64})
584581
precompile(Tuple{typeof(Base.rsearch), String, Array{Char, 1}, Int64})
585582
precompile(Tuple{getfield(Base.REPLCompletions, Symbol("#kw##find_start_brace")), Array{Any, 1}, typeof(Base.REPLCompletions.find_start_brace), String})
586583
precompile(Tuple{typeof(Core.Inference.isbits), Tuple{Void, Void, Void}})

base/repl/REPLCompletions.jl

+8-5
Original file line numberDiff line numberDiff line change
@@ -225,7 +225,7 @@ end
225225
# closed start brace from the end of the string.
226226
function find_start_brace(s::AbstractString; c_start='(', c_end=')')
227227
braces = 0
228-
r = RevString(s)
228+
r = reverse(s)
229229
i = start(r)
230230
in_single_quotes = false
231231
in_double_quotes = false
@@ -245,18 +245,21 @@ function find_start_brace(s::AbstractString; c_start='(', c_end=')')
245245
in_back_ticks = true
246246
end
247247
else
248-
if !in_back_ticks && !in_double_quotes && c == '\'' && !done(r, i) && next(r, i)[1]!='\\'
248+
if !in_back_ticks && !in_double_quotes &&
249+
c == '\'' && !done(r, i) && next(r, i)[1] != '\\'
249250
in_single_quotes = !in_single_quotes
250-
elseif !in_back_ticks && !in_single_quotes && c == '"' && !done(r, i) && next(r, i)[1]!='\\'
251+
elseif !in_back_ticks && !in_single_quotes &&
252+
c == '"' && !done(r, i) && next(r, i)[1] != '\\'
251253
in_double_quotes = !in_double_quotes
252-
elseif !in_single_quotes && !in_double_quotes && c == '`' && !done(r, i) && next(r, i)[1]!='\\'
254+
elseif !in_single_quotes && !in_double_quotes &&
255+
c == '`' && !done(r, i) && next(r, i)[1] != '\\'
253256
in_back_ticks = !in_back_ticks
254257
end
255258
end
256259
braces == 1 && break
257260
end
258261
braces != 1 && return 0:-1, -1
259-
method_name_end = reverseind(r, i)
262+
method_name_end = reverseind(s, i)
260263
startind = nextind(s, rsearch(s, non_identifier_chars, method_name_end))
261264
return (startind:endof(s), method_name_end)
262265
end

base/shell.jl

+1-1
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ function shell_parse(str::AbstractString, interpolate::Bool=true;
1414
special::AbstractString="")
1515
s = lstrip(str)
1616
# strips the end but respects the space when the string ends with "\\ "
17-
r = RevString(s)
17+
r = reverse(s)
1818
i = start(r)
1919
c_old = nothing
2020
while !done(r,i)

base/strings/basic.jl

+17-7
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,10 @@ julia> 'j' * "ulia"
6969

7070
one(::Union{T,Type{T}}) where {T<:AbstractString} = convert(T, "")
7171

72+
# generic number of code units; implementations generally know how long a string
73+
# is though and should override this with a more efficient method
74+
ncodeunits(s::AbstractString) = nextind(s, endof(s)) - 1
75+
7276
"""
7377
length(s::AbstractString)
7478
@@ -233,11 +237,11 @@ end
233237
## Generic indexing functions ##
234238

235239
"""
236-
thisind(str::AbstractString, i::Integer)
240+
thisind(s::AbstractString, i::Integer)
237241
238-
Get the largest valid string index at or before `i`.
239-
Returns `0` if there is no valid string index at or before `i`.
240-
Returns `endof(str)` if `i≥endof(str)`.
242+
If `i` is the index into a character in `s` then `thisind` returns the index of the
243+
start of that character. If `i < start(s)` then it returns `start(s) - 1`.
244+
If `i > ncodeunits(s)` then it returns `ncodeunits(s) + 1`.
241245
242246
# Examples
243247
```jldoctest
@@ -253,15 +257,21 @@ julia> thisind("αβγdef", 3)
253257
julia> thisind("αβγdef", 4)
254258
3
255259
256-
julia> thisind("αβγdef", 20)
260+
julia> thisind("αβγdef", 9)
257261
9
262+
263+
julia> thisind("αβγdef", 10)
264+
10
265+
266+
julia> thisind("αβγdef", 20)
267+
10
258268
"""
259269
function thisind(s::AbstractString, i::Integer)
260270
j = Int(i)
261271
isvalid(s, j) && return j
262272
j < start(s) && return 0
263-
e = endof(s)
264-
j >= endof(s) && return e
273+
n = ncodeunits(s)
274+
j > n && return n + 1
265275
prevind(s, j)
266276
end
267277

base/strings/search.jl

+26-26
Original file line numberDiff line numberDiff line change
@@ -194,12 +194,6 @@ end
194194
search(s::AbstractString, t::AbstractString, i::Integer=start(s)) = _search(s, t, i)
195195
search(s::ByteArray, t::ByteArray, i::Integer=start(s)) = _search(s, t, i)
196196

197-
function rsearch(s::AbstractString, c::Chars)
198-
j = search(RevString(s), c)
199-
j == 0 && return 0
200-
endof(s)-j+1
201-
end
202-
203197
"""
204198
rsearch(s::AbstractString, chars::Chars, [start::Integer])
205199
@@ -212,44 +206,50 @@ julia> rsearch("aaabbb","b")
212206
6:6
213207
```
214208
"""
215-
function rsearch(s::AbstractString, c::Chars, i::Integer)
216-
e = endof(s)
217-
j = search(RevString(s), c, e-i+1)
218-
j == 0 && return 0
219-
e-j+1
209+
function rsearch(s::AbstractString, c::Chars, i::Integer=start(s))
210+
if i < 1
211+
return i == 0 ? 0 : throw(BoundsError(s, i))
212+
end
213+
n = ncodeunits(s)
214+
if i > n
215+
return i == n+1 ? 0 : throw(BoundsError(s, i))
216+
end
217+
# r[reverseind(r,i)] == reverse(r)[i] == s[i]
218+
# s[reverseind(s,j)] == reverse(s)[j] == r[j]
219+
r = reverse(s)
220+
j = search(r, c, reverseind(r, i))
221+
j == 0 ? 0 : reverseind(s, j)
220222
end
221223

222224
function _rsearchindex(s, t, i)
223225
if isempty(t)
224-
return 1 <= i <= nextind(s,endof(s)) ? i :
226+
return 1 <= i <= nextind(s, endof(s)) ? i :
225227
throw(BoundsError(s, i))
226228
end
227-
t = RevString(t)
228-
rs = RevString(s)
229+
t = reverse(t)
230+
rs = reverse(s)
229231
l = endof(s)
230-
t1, j2 = next(t,start(t))
232+
t1, j2 = next(t, start(t))
231233
while true
232-
i = rsearch(s,t1,i)
233-
if i == 0 return 0 end
234-
c, ii = next(rs,l-i+1)
234+
i = rsearch(s, t1, i)
235+
i == 0 && return 0
236+
c, ii = next(rs, reverseind(rs, i))
235237
j = j2; k = ii
236238
matched = true
237-
while !done(t,j)
238-
if done(rs,k)
239+
while !done(t, j)
240+
if done(rs, k)
239241
matched = false
240242
break
241243
end
242-
c, k = next(rs,k)
243-
d, j = next(t,j)
244+
c, k = next(rs, k)
245+
d, j = next(t, j)
244246
if c != d
245247
matched = false
246248
break
247249
end
248250
end
249-
if matched
250-
return nextind(s,l-k+1)
251-
end
252-
i = l-ii+1
251+
matched && return nextind(s, reverseind(s, k))
252+
i = reverseind(s, ii)
253253
end
254254
end
255255

base/strings/string.jl

+16-42
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,20 @@ codeunit(s::AbstractString, i::Integer)
8787
@gc_preserve s unsafe_load(pointer(s, i))
8888
end
8989

90+
"""
91+
ncodeunits(s::AbstractString)
92+
93+
The number of code units in a string. For example, for UTF-8-like data such as
94+
the default `String` type, the number of code units is the number of bytes in
95+
the string, a.k.a. `sizeof(s)`. For a UTF-16 encoded string type, however, the
96+
code unit is `UInt16` so the number of code units is the number of `UInt16`
97+
words in the representation of the string. The expression `codeunit(s, i)` is
98+
valid and safe for precisely the range of `i` values `1:ncodeunits(s)`.
99+
100+
See also: [`codeunit`](@ref).
101+
"""
102+
ncodeunits(s::String) = sizeof(s)
103+
90104
write(io::IO, s::String) =
91105
@gc_preserve s unsafe_write(io, pointer(s), reinterpret(UInt, sizeof(s)))
92106

@@ -109,8 +123,8 @@ end
109123
function thisind(s::String, i::Integer)
110124
j = Int(i)
111125
j < 1 && return 0
112-
e = endof(s)
113-
j >= e && return e
126+
n = ncodeunits(s)
127+
j > n && return n + 1
114128
@inbounds while j > 0 && is_valid_continuation(codeunit(s,j))
115129
j -= 1
116130
end
@@ -281,14 +295,6 @@ function first_utf8_byte(ch::Char)
281295
return b
282296
end
283297

284-
function reverseind(s::String, i::Integer)
285-
j = sizeof(s) + 1 - i
286-
@inbounds while is_valid_continuation(codeunit(s, j))
287-
j -= 1
288-
end
289-
return j
290-
end
291-
292298
## overload methods for efficiency ##
293299

294300
isvalid(s::String, i::Integer) =
@@ -463,38 +469,6 @@ function string(a::Union{String,Char}...)
463469
return out
464470
end
465471

466-
function reverse(s::String)
467-
dat = Vector{UInt8}(s)
468-
n = length(dat)
469-
n <= 1 && return s
470-
buf = StringVector(n)
471-
out = n
472-
pos = 1
473-
@inbounds while out > 0
474-
ch = dat[pos]
475-
if ch > 0xdf
476-
if ch < 0xf0
477-
(out -= 3) < 0 && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
478-
buf[out + 1], buf[out + 2], buf[out + 3] = ch, dat[pos + 1], dat[pos + 2]
479-
pos += 3
480-
else
481-
(out -= 4) < 0 && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
482-
buf[out+1], buf[out+2], buf[out+3], buf[out+4] = ch, dat[pos+1], dat[pos+2], dat[pos+3]
483-
pos += 4
484-
end
485-
elseif ch > 0x7f
486-
(out -= 2) < 0 && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
487-
buf[out + 1], buf[out + 2] = ch, dat[pos + 1]
488-
pos += 2
489-
else
490-
buf[out] = ch
491-
out -= 1
492-
pos += 1
493-
end
494-
end
495-
String(buf)
496-
end
497-
498472
function repeat(s::String, r::Integer)
499473
r < 0 && throw(ArgumentError("can't repeat a string $r times"))
500474
n = sizeof(s)

base/strings/strings.jl

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# This file is a part of Julia. License is MIT: https://julialang.org/license
22

33
include("strings/errors.jl")
4-
include("strings/types.jl")
4+
include("strings/substring.jl")
55
include("strings/basic.jl")
66
include("strings/search.jl")
77
include("strings/util.jl")

0 commit comments

Comments
 (0)