Skip to content

Commit ecc0bf0

Browse files
remove RevString; efficient generic reverseind
These seem unrelated, but they're actually linked: * If you reverse generic strings by wrapping them in `RevString` then then this generic `reverseind` is incorrect. * In order to have a correct generic `reverseind` one needs to assume that `reverse(s)` returns a string of the same type and encoding as `s` with code points in reverse order; one also needs to assume that the code units encoding each character remain the same when reversed. This is a valid assumption for UTF-8, UTF-16 and (trivially) UTF-32. Reverse string search functions are pretty messed up by this and I've fixed them well enough to work but they may be quite inefficient for long strings now. I'm not going to spend too much time on this since there's other work going on to generalize and unify searching APIs. Close #22611 Close #24613 See also: #10593 #23612 #24103
1 parent c2feee7 commit ecc0bf0

14 files changed

+96
-146
lines changed

NEWS.md

+11
Original file line numberDiff line numberDiff line change
@@ -295,6 +295,12 @@ This section lists changes that do not have deprecation warnings.
295295
`AbstractArray` types that specialized broadcasting using the old internal API will
296296
need to switch to the new API. ([#20740])
297297

298+
* The `RevString` type has been removed from the language; `reverse(::String)` returns
299+
a `String` with code points (or fragments thereof) in reverse order. In general,
300+
`reverse(s)` should return a string of the same type and encoding as `s` with code
301+
points in reverse order; any string type overrides `reverse` to return a different
302+
type of string must also override `reverseind` to compute reversed indices correctly.
303+
298304
Library improvements
299305
--------------------
300306

@@ -397,6 +403,11 @@ Library improvements
397403
The generic definition is constant time but calls `endof(s)` which may be inefficient.
398404
Therefore custom string types may want to define direct `ncodeunits` methods.
399405

406+
* `reverseind(s::AbstractString, i::Integer)` now has an efficient generic fallback, so
407+
custom string types do not need to provide their own efficient defintions. The generic
408+
definition relies on `ncodeunits` however, so for optimal performance you may need to
409+
define a custom method for that function.
410+
400411
Compiler/Runtime improvements
401412
-----------------------------
402413

base/exports.jl

-1
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,6 @@ export
8888
Rational,
8989
Regex,
9090
RegexMatch,
91-
RevString,
9291
RoundFromZero,
9392
RoundDown,
9493
RoundingMode,

base/precompile.jl

-3
Original file line numberDiff line numberDiff line change
@@ -578,9 +578,6 @@ precompile(Tuple{typeof(Base.LineEdit.complete_line), Base.LineEdit.PromptState,
578578
precompile(Tuple{typeof(Base.LineEdit.input_string_newlines_aftercursor), Base.LineEdit.PromptState})
579579
precompile(Tuple{typeof(Base.LineEdit.complete_line), Base.REPL.REPLCompletionProvider, Base.LineEdit.PromptState})
580580
precompile(Tuple{getfield(Base, Symbol("#kw##parse")), Array{Any, 1}, typeof(Base.parse), String})
581-
precompile(Tuple{typeof(Base.isvalid), Base.RevString{String}, Int64})
582-
precompile(Tuple{typeof(Base.nextind), Base.RevString{String}, Int64})
583-
precompile(Tuple{typeof(Base.search), Base.RevString{String}, Array{Char, 1}, Int64})
584581
precompile(Tuple{typeof(Base.rsearch), String, Array{Char, 1}, Int64})
585582
precompile(Tuple{getfield(Base.REPLCompletions, Symbol("#kw##find_start_brace")), Array{Any, 1}, typeof(Base.REPLCompletions.find_start_brace), String})
586583
precompile(Tuple{typeof(Core.Inference.isbits), Tuple{Void, Void, Void}})

base/repl/REPLCompletions.jl

+2-2
Original file line numberDiff line numberDiff line change
@@ -225,7 +225,7 @@ end
225225
# closed start brace from the end of the string.
226226
function find_start_brace(s::AbstractString; c_start='(', c_end=')')
227227
braces = 0
228-
r = RevString(s)
228+
r = reverse(s)
229229
i = start(r)
230230
in_single_quotes = false
231231
in_double_quotes = false
@@ -259,7 +259,7 @@ function find_start_brace(s::AbstractString; c_start='(', c_end=')')
259259
braces == 1 && break
260260
end
261261
braces != 1 && return 0:-1, -1
262-
method_name_end = reverseind(r, i)
262+
method_name_end = reverseind(s, i)
263263
startind = nextind(s, rsearch(s, non_identifier_chars, method_name_end))
264264
return (startind:endof(s), method_name_end)
265265
end

base/shell.jl

+1-1
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ function shell_parse(str::AbstractString, interpolate::Bool=true;
1414
special::AbstractString="")
1515
s = lstrip(str)
1616
# strips the end but respects the space when the string ends with "\\ "
17-
r = RevString(s)
17+
r = reverse(s)
1818
i = start(r)
1919
c_old = nothing
2020
while !done(r,i)

base/strings/search.jl

+26-26
Original file line numberDiff line numberDiff line change
@@ -194,12 +194,6 @@ end
194194
search(s::AbstractString, t::AbstractString, i::Integer=start(s)) = _search(s, t, i)
195195
search(s::ByteArray, t::ByteArray, i::Integer=start(s)) = _search(s, t, i)
196196

197-
function rsearch(s::AbstractString, c::Chars)
198-
j = search(RevString(s), c)
199-
j == 0 && return 0
200-
endof(s)-j+1
201-
end
202-
203197
"""
204198
rsearch(s::AbstractString, chars::Chars, [start::Integer])
205199
@@ -212,44 +206,50 @@ julia> rsearch("aaabbb","b")
212206
6:6
213207
```
214208
"""
215-
function rsearch(s::AbstractString, c::Chars, i::Integer)
216-
e = endof(s)
217-
j = search(RevString(s), c, e-i+1)
218-
j == 0 && return 0
219-
e-j+1
209+
function rsearch(s::AbstractString, c::Chars, i::Integer=start(s))
210+
if i < 1
211+
return i == 0 ? 0 : throw(BoundsError(s, i))
212+
end
213+
n = ncodeunits(s)
214+
if i > n
215+
return i == n+1 ? 0 : throw(BoundsError(s, i))
216+
end
217+
# r[reverseind(r,i)] == reverse(r)[i] == s[i]
218+
# s[reverseind(s,j)] == reverse(s)[j] == r[j]
219+
r = reverse(s)
220+
j = search(r, c, reverseind(r, i))
221+
j == 0 ? 0 : reverseind(s, j)
220222
end
221223

222224
function _rsearchindex(s, t, i)
223225
if isempty(t)
224-
return 1 <= i <= nextind(s,endof(s)) ? i :
226+
return 1 <= i <= nextind(s, endof(s)) ? i :
225227
throw(BoundsError(s, i))
226228
end
227-
t = RevString(t)
228-
rs = RevString(s)
229+
t = reverse(t)
230+
rs = reverse(s)
229231
l = endof(s)
230-
t1, j2 = next(t,start(t))
232+
t1, j2 = next(t, start(t))
231233
while true
232-
i = rsearch(s,t1,i)
233-
if i == 0 return 0 end
234-
c, ii = next(rs,l-i+1)
234+
i = rsearch(s, t1, i)
235+
i == 0 && return 0
236+
c, ii = next(rs, reverseind(rs, i))
235237
j = j2; k = ii
236238
matched = true
237-
while !done(t,j)
238-
if done(rs,k)
239+
while !done(t, j)
240+
if done(rs, k)
239241
matched = false
240242
break
241243
end
242-
c, k = next(rs,k)
243-
d, j = next(t,j)
244+
c, k = next(rs, k)
245+
d, j = next(t, j)
244246
if c != d
245247
matched = false
246248
break
247249
end
248250
end
249-
if matched
250-
return nextind(s,l-k+1)
251-
end
252-
i = l-ii+1
251+
matched && return nextind(s, reverseind(s, k))
252+
i = reverseind(s, ii)
253253
end
254254
end
255255

base/strings/string.jl

-40
Original file line numberDiff line numberDiff line change
@@ -295,14 +295,6 @@ function first_utf8_byte(ch::Char)
295295
return b
296296
end
297297

298-
function reverseind(s::String, i::Integer)
299-
j = sizeof(s) + 1 - i
300-
@inbounds while is_valid_continuation(codeunit(s, j))
301-
j -= 1
302-
end
303-
return j
304-
end
305-
306298
## overload methods for efficiency ##
307299

308300
isvalid(s::String, i::Integer) =
@@ -477,38 +469,6 @@ function string(a::Union{String,Char}...)
477469
return out
478470
end
479471

480-
function reverse(s::String)
481-
dat = Vector{UInt8}(s)
482-
n = length(dat)
483-
n <= 1 && return s
484-
buf = StringVector(n)
485-
out = n
486-
pos = 1
487-
@inbounds while out > 0
488-
ch = dat[pos]
489-
if ch > 0xdf
490-
if ch < 0xf0
491-
(out -= 3) < 0 && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
492-
buf[out + 1], buf[out + 2], buf[out + 3] = ch, dat[pos + 1], dat[pos + 2]
493-
pos += 3
494-
else
495-
(out -= 4) < 0 && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
496-
buf[out+1], buf[out+2], buf[out+3], buf[out+4] = ch, dat[pos+1], dat[pos+2], dat[pos+3]
497-
pos += 4
498-
end
499-
elseif ch > 0x7f
500-
(out -= 2) < 0 && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
501-
buf[out + 1], buf[out + 2] = ch, dat[pos + 1]
502-
pos += 2
503-
else
504-
buf[out] = ch
505-
out -= 1
506-
pos += 1
507-
end
508-
end
509-
String(buf)
510-
end
511-
512472
function repeat(s::String, r::Integer)
513473
r < 0 && throw(ArgumentError("can't repeat a string $r times"))
514474
n = sizeof(s)

base/strings/strings.jl

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# This file is a part of Julia. License is MIT: https://julialang.org/license
22

33
include("strings/errors.jl")
4-
include("strings/types.jl")
4+
include("strings/substring.jl")
55
include("strings/basic.jl")
66
include("strings/search.jl")
77
include("strings/util.jl")

base/strings/types.jl base/strings/substring.jl

+21-34
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,5 @@
11
# This file is a part of Julia. License is MIT: https://julialang.org/license
22

3-
# SubString and RevString types
4-
5-
## substrings reference original strings ##
6-
73
"""
84
SubString(s::AbstractString, i::Integer, j::Integer=endof(s))
95
SubString(s::AbstractString, r::UnitRange{<:Integer})
@@ -51,6 +47,9 @@ end
5147
SubString(s::AbstractString) = SubString(s, 1, endof(s))
5248
SubString{T}(s::T) where {T<:AbstractString} = SubString{T}(s, 1, endof(s))
5349

50+
convert(::Type{SubString{S}}, s::AbstractString) where {S<:AbstractString} =
51+
SubString(convert(S, s))
52+
5453
String(p::SubString{String}) =
5554
unsafe_string(pointer(p.string, p.offset+1), nextind(p, p.endof)-1)
5655

@@ -123,32 +122,18 @@ function unsafe_convert(::Type{Ptr{R}}, s::SubString{String}) where R<:Union{Int
123122
convert(Ptr{R}, pointer(s.string)) + s.offset
124123
end
125124

126-
## reversed strings without data movement ##
127-
128-
struct RevString{T<:AbstractString} <: AbstractString
129-
string::T
130-
end
131-
132-
endof(s::RevString) = endof(s.string)
133-
length(s::RevString) = length(s.string)
134-
sizeof(s::RevString) = sizeof(s.string)
135-
136-
function next(s::RevString, i::Int)
137-
n = endof(s); j = n-i+1
138-
(s.string[j], n-prevind(s.string,j)+1)
139-
end
140-
141125
"""
142126
reverse(s::AbstractString) -> AbstractString
143127
144-
Reverses a string.
145-
146-
Technically, this function reverses the codepoints in a string, and its
128+
Reverses a string. Technically, this function reverses the codepoints in a string and its
147129
main utility is for reversed-order string processing, especially for reversed
148-
regular-expression searches. See also [`reverseind`](@ref) to convert indices
149-
in `s` to indices in `reverse(s)` and vice-versa, and [`graphemes`](@ref)
150-
to operate on user-visible "characters" (graphemes) rather than codepoints.
151-
See also [`Iterators.reverse`](@ref) for reverse-order iteration without making a copy.
130+
regular-expression searches. See also [`reverseind`](@ref) to convert indices in `s` to
131+
indices in `reverse(s)` and vice-versa, and [`graphemes`](@ref) to operate on user-visible
132+
"characters" (graphemes) rather than codepoints. See also [`Iterators.reverse`](@ref) for
133+
reverse-order iteration without making a copy. Custom string types must implement the
134+
`reverse` function themselves and should typically return a string with the same type
135+
and encoding. If they return a string with a different encoding, they must also override
136+
`reverseind` for that string type to satisfy `s[reverseind(s,i)] == reverse(s)[i]`.
152137
153138
# Examples
154139
```jldoctest
@@ -162,10 +147,15 @@ julia> join(reverse(collect(graphemes("ax̂e")))) # reverses graphemes
162147
"ex̂a"
163148
```
164149
"""
165-
reverse(s::AbstractString) = RevString(s)
166-
reverse(s::RevString) = s.string
167-
168-
## reverse an index i so that reverse(s)[i] == s[reverseind(s,i)]
150+
function reverse(s::Union{String,SubString{String}})::String
151+
sprint() do io
152+
i, j = start(s), endof(s)
153+
while i j
154+
c, j = s[j], prevind(s, j)
155+
write(io, c)
156+
end
157+
end
158+
end
169159

170160
"""
171161
reverseind(v, i)
@@ -185,10 +175,7 @@ julia> for i in 1:length(r)
185175
Julia
186176
```
187177
"""
188-
reverseind(s::AbstractString, i) = chr2ind(s, length(s) + 1 - ind2chr(reverse(s), i))
189-
reverseind(s::RevString, i::Integer) = endof(s) - i + 1
190-
reverseind(s::SubString{String}, i::Integer) =
191-
reverseind(s.string, nextind(s.string, endof(s.string))-s.offset-s.endof+i-1) - s.offset
178+
reverseind(s::AbstractString, i::Integer) = thisind(s, ncodeunits(s)-i+1)
192179

193180
"""
194181
repeat(s::AbstractString, r::Integer)

base/strings/util.jl

+7-8
Original file line numberDiff line numberDiff line change
@@ -190,16 +190,15 @@ julia> rstrip(a)
190190
```
191191
"""
192192
function rstrip(s::AbstractString, chars::Chars=_default_delims)
193-
r = RevString(s)
194-
i = start(r)
195-
while !done(r,i)
196-
c, j = next(r,i)
197-
if !(c in chars)
198-
return SubString(s, 1, endof(s)-i+1)
199-
end
193+
a = start(s)
194+
i = endof(s)
195+
while a  i
196+
c = s[i]
197+
j = prevind(s, i)
198+
c in chars || return SubString(s, 1:i)
200199
i = j
201200
end
202-
SubString(s, 1, 0)
201+
SubString(s, a, a-1)
203202
end
204203

205204
"""

stdlib/Test/src/Test.jl

+3
Original file line numberDiff line numberDiff line change
@@ -1384,6 +1384,9 @@ struct GenericString <: AbstractString
13841384
end
13851385
Base.endof(s::GenericString) = endof(s.string)
13861386
Base.next(s::GenericString, i::Int) = next(s.string, i)
1387+
Base.reverse(s::GenericString) = GenericString(reverse(s.string))
1388+
Base.reverse(s::SubString{GenericString}) =
1389+
GenericString(typeof(s.string)(reverse(String(s))))
13871390

13881391
"""
13891392
The `GenericSet` can be used to test generic set APIs that program to

test/replcompletions.jl

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
using Base.REPLCompletions
44

55
let ex = quote
6-
module CompletionFoo
6+
module CompletionFoo
77
mutable struct Test_y
88
yy
99
end

0 commit comments

Comments
 (0)