Skip to content

Commit 7601943

Browse files
committed
support chartransform
1 parent b6900d6 commit 7601943

File tree

2 files changed

+9
-4
lines changed

2 files changed

+9
-4
lines changed

stdlib/Unicode/src/Unicode.jl

+8-4
Original file line numberDiff line numberDiff line change
@@ -152,12 +152,16 @@ function _decompose_char!(codepoint::Union{Integer,Char}, dest::Vector{UInt32},
152152
end
153153

154154
"""
155-
isequal_normalized(s1::AbstractString, s2::AbstractString; casefold=false, stripmark=false)
155+
isequal_normalized(s1::AbstractString, s2::AbstractString; casefold=false, stripmark=false, chartransform=identity)
156156
157157
Return whether `s1` and `s2` are canonically equivalent Unicode strings. If `casefold=true`,
158158
ignores case (performs Unicode case-folding); if `stripmark=true`, strips diacritical marks
159159
and other combining characters.
160160
161+
As with [`Unicode.normalize`](@ref), you can also pass an arbitrary
162+
function via the `chartransform` keyword (mapping `Integer` codepoints to codepoints)
163+
to perform custom normalizations, such as [`Unicode.julia_chartransform`](@ref).
164+
161165
# Examples
162166
163167
For example, the string `"noël"` can be constructed in two canonically equivalent ways
@@ -184,7 +188,7 @@ julia> isequal_normalized(s1, "NOËL", casefold=true)
184188
true
185189
```
186190
"""
187-
function isequal_normalized(s1::AbstractString, s2::AbstractString; casefold::Bool=false, stripmark::Bool=false)
191+
function isequal_normalized(s1::AbstractString, s2::AbstractString; casefold::Bool=false, stripmark::Bool=false, chartransform=identity)
188192
function decompose_next_char!(c, state, d, options, s)
189193
n = _decompose_char!(c, d, options)
190194
if n > length(d) # may be possible in future Unicode versions?
@@ -202,11 +206,11 @@ function isequal_normalized(s1::AbstractString, s2::AbstractString; casefold::Bo
202206
while true
203207
if j1 > n1
204208
i1 === nothing && return i2 === nothing && j2 > n2
205-
j1, n1, i1 = decompose_next_char!(UInt32(i1[1]), i1[2], d1, options, s1)
209+
j1, n1, i1 = decompose_next_char!(chartransform(UInt32(i1[1])), i1[2], d1, options, s1)
206210
end
207211
if j2 > n2
208212
i2 === nothing && return false
209-
j2, n2, i2 = decompose_next_char!(UInt32(i2[1]), i2[2], d2, options, s2)
213+
j2, n2, i2 = decompose_next_char!(chartransform(UInt32(i2[1])), i2[2], d2, options, s2)
210214
end
211215
d1[j1] == d2[j2] || return false
212216
j1 += 1; j2 += 1

stdlib/Unicode/test/runtests.jl

+1
Original file line numberDiff line numberDiff line change
@@ -433,4 +433,5 @@ end
433433
@test !isequal_normalized("no\u00EBl", "noel")
434434
@test isequal_normalized("no\u00EBl", "noel", stripmark=true)
435435
@test isequal_normalized("no\u00EBl", "NOEL", stripmark=true, casefold=true)
436+
@test isequal_normalized("\u00B5\u0302m", "\u03BC\u0302m", chartransform=julia_chartransform)
436437
end

0 commit comments

Comments
 (0)