Skip to content

Commit 54f1a06

Browse files
stevengj authored and LilithHafner committed
add Unicode.isequal_normalized function (JuliaLang#42493)
This adds a function `isequal_normalized` to the Unicode stdlib to check whether two strings are canonically equivalent (optionally casefolding and/or stripping combining marks). Previously, the only way to do this was to call `Unicode.normalize` on the two strings, to construct normalized versions, but this seemed a bit wasteful — the new `isequal_normalized` function calls lower-level functions in utf8proc to accomplish the same task while only allocating 4-codepoint (16-byte) temporary arrays. It seems to be about 2x faster than calling `normalize` in the expensive case where the strings are equivalent, and is potentially much faster for inequivalent strings for which the loop can break early. (If we could stack-allocate small arrays it might get faster.) (In the future, we might also want to add `Unicode.isless_normalized` and `Unicode.cmp_normalized` functions for comparing Unicode strings, but `isequal_normalized` seemed like a good start.)
1 parent 8d93b07 commit 54f1a06

File tree

4 files changed

+87
-1
lines changed

4 files changed

+87
-1
lines changed

NEWS.md

+3
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,9 @@ Standard library changes
118118
* The standard log levels `BelowMinLevel`, `Debug`, `Info`, `Warn`, `Error`,
119119
and `AboveMaxLevel` are now exported from the Logging stdlib ([#40980]).
120120

121+
#### Unicode
122+
* Added function `isequal_normalized` to check for Unicode equivalence without
123+
explicitly constructing normalized strings ([#42493]).
121124

122125
Deprecated or removed
123126
---------------------

stdlib/Unicode/docs/src/index.md

+1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
```@docs
44
Unicode.isassigned
5+
Unicode.isequal_normalized
56
Unicode.normalize
67
Unicode.graphemes
78
```

stdlib/Unicode/src/Unicode.jl

+71-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
module Unicode
44

5-
export graphemes
5+
export graphemes, isequal_normalized
66

77
"""
88
Unicode.normalize(s::AbstractString; keywords...)
@@ -89,4 +89,74 @@ letter combined with an accent mark is a single grapheme.)
8989
"""
9090
graphemes(s::AbstractString) = Base.Unicode.GraphemeIterator{typeof(s)}(s)
9191

92+
using Base.Unicode: utf8proc_error, UTF8PROC_DECOMPOSE, UTF8PROC_CASEFOLD, UTF8PROC_STRIPMARK
93+
94+
# Decompose a single `codepoint` into the buffer `dest` by calling utf8proc's
# `utf8proc_decompose_char` with the given `options` bit flags.
#
# Returns the (non-negative) count reported by utf8proc. The caller compares this
# count against `length(dest)` and retries with a larger buffer when it is bigger.
# A negative return code is forwarded to `utf8proc_error`.
function _decompose_char!(codepoint::Union{Integer,Char}, dest::Vector{UInt32}, options::Integer)
    capacity = length(dest)
    # The final argument (`last_boundclass` slot) is passed as NULL.
    count = ccall(:utf8proc_decompose_char, Int,
                  (UInt32, Ptr{UInt32}, Int, Cint, Ptr{Cint}),
                  codepoint, dest, capacity, options, C_NULL)
    if count < 0
        utf8proc_error(count)
    end
    return count
end
99+
100+
"""
    isequal_normalized(s1::AbstractString, s2::AbstractString; casefold=false, stripmark=false)

Return whether `s1` and `s2` are canonically equivalent Unicode strings. If `casefold=true`,
ignores case (performs Unicode case-folding); if `stripmark=true`, strips diacritical marks
and other combining characters.

The comparison is performed lazily, one character at a time, without constructing
normalized copies of the strings, and stops at the first difference.

# Examples

For example, the string `"noël"` can be constructed in two canonically equivalent ways
in Unicode, depending on whether `"ë"` is formed from a single codepoint U+00EB or
from the ASCII character `'e'` followed by the U+0308 combining-diaeresis character.

```jldoctest
julia> s1 = "no\\u00EBl"
"noël"

julia> s2 = "noe\\u0308l"
"noël"

julia> s1 == s2
false

julia> isequal_normalized(s1, s2)
true

julia> isequal_normalized(s1, "noel", stripmark=true)
true

julia> isequal_normalized(s2, "noel", stripmark=true)
true

julia> isequal_normalized(s1, "NOËL", casefold=true)
true
```
"""
function isequal_normalized(s1::AbstractString, s2::AbstractString; casefold::Bool=false, stripmark::Bool=false)
    # Decompose codepoint `c` into buffer `d` (growing `d` if it is too small) and
    # advance the iteration of `s` past `state`. Returns the tuple
    # (first index to read in `d`, number of codepoints in `d`, next `iterate` result).
    function decompose_next_char!(c, state, d, options, s)
        n = _decompose_char!(c, d, options)
        if n > length(d) # may be possible in future Unicode versions?
            n = _decompose_char!(c, resize!(d, n), options)
        end
        return 1, n, iterate(s, state)
    end
    options = UTF8PROC_DECOMPOSE
    casefold && (options |= UTF8PROC_CASEFOLD)
    stripmark && (options |= UTF8PROC_STRIPMARK)
    i1, i2 = iterate(s1), iterate(s2)
    d1, d2 = Vector{UInt32}(undef, 4), Vector{UInt32}(undef, 4) # codepoint buffers
    n1 = n2 = 0 # lengths of codepoint buffers
    j1 = j2 = 1 # indices in d1, d2
    # NOTE(review): decomposed codepoints are compared in the order they are
    # encountered; this function performs no reordering of combining marks.
    # Confirm whether strings that differ only in mark order should compare equal.
    while true
        # Refill each buffer in a loop rather than once: a character can decompose
        # to zero codepoints (e.g. a combining mark when `stripmark=true`), in which
        # case the buffer is still empty and holds stale data from the previous
        # character, so we must keep reading until we get output or hit the end.
        while j1 > n1 && i1 !== nothing
            j1, n1, i1 = decompose_next_char!(UInt32(i1[1]), i1[2], d1, options, s1)
        end
        while j2 > n2 && i2 !== nothing
            j2, n2, i2 = decompose_next_char!(UInt32(i2[1]), i2[2], d2, options, s2)
        end
        # A side is exhausted only if its buffer is still empty after refilling.
        done1, done2 = j1 > n1, j2 > n2
        (done1 || done2) && return done1 && done2
        d1[j1] == d2[j2] || return false
        j1 += 1; j2 += 1
    end
end
161+
92162
end

stdlib/Unicode/test/runtests.jl

+12
Original file line numberDiff line numberDiff line change
@@ -417,3 +417,15 @@ end
417417
@test prod(["*" for i in 1:3]) == "***"
418418
@test prod(["*" for i in 1:0]) == ""
419419
end
420+
421+
@testset "Unicode equivalence" begin
    # "noël" written with the single precomposed codepoint U+00EB
    composed = "no\u00EBl"

    # precomposed vs. 'e' + U+0308; extra trailing content breaks equivalence
    @test isequal_normalized(composed, "noe\u0308l")
    @test !isequal_normalized(composed, "noe\u0308l ")

    # empty-string edge cases
    @test isequal_normalized("", "")
    @test !isequal_normalized("", " ")

    # case only matters unless casefold is requested
    @test !isequal_normalized(composed, "NOËL")
    @test isequal_normalized(composed, "NOËL"; casefold=true)

    # diacritics only matter unless stripmark is requested
    @test !isequal_normalized(composed, "noel")
    @test isequal_normalized(composed, "noel"; stripmark=true)

    # both options combined
    @test isequal_normalized(composed, "NOEL"; stripmark=true, casefold=true)
end

0 commit comments

Comments
 (0)