|
2 | 2 |
|
3 | 3 | module Unicode
|
4 | 4 |
|
5 |
| -export graphemes |
| 5 | +export graphemes, isequal_normalized |
6 | 6 |
|
7 | 7 | """
|
8 | 8 | Unicode.normalize(s::AbstractString; keywords...)
|
@@ -89,4 +89,74 @@ letter combined with an accent mark is a single grapheme.)
|
89 | 89 | """
|
function graphemes(s::AbstractString)
    # the iterator type is parameterized on the concrete type of `s`
    return Base.Unicode.GraphemeIterator{typeof(s)}(s)
end
|
91 | 91 |
|
| 92 | +using Base.Unicode: utf8proc_error, UTF8PROC_DECOMPOSE, UTF8PROC_CASEFOLD, UTF8PROC_STRIPMARK |
| 93 | + |
# Decompose `codepoint` with utf8proc according to `options`, writing the resulting
# codepoints to the start of `dest`.  Returns the length of the decomposition; when that
# exceeds `length(dest)` the caller is expected to enlarge `dest` and call again.
# A negative utf8proc status code is handed to `utf8proc_error`.
function _decompose_char!(codepoint::Union{Integer,Char}, dest::Vector{UInt32}, options::Integer)
    ret = @ccall utf8proc_decompose_char(codepoint::UInt32, dest::Ptr{UInt32}, length(dest)::Int, options::Cint, C_NULL::Ptr{Cint})::Int
    ret < 0 && utf8proc_error(ret)
    return ret
end

# Canonical combining class of codepoint `uc` (zero for "starter" codepoints).
function _combining_class(uc::UInt32)
    # no codepoint below U+0300 has a nonzero combining class, so skip the lookup
    (0x00000300 <= uc <= 0x0010ffff) || return 0x0000
    # utf8proc_property_t starts with two 16-bit fields: category, combining_class
    prop = ccall(:utf8proc_get_property, Ptr{UInt16}, (Int32,), uc % Int32)
    return unsafe_load(prop, 2)
end

# Put `d[1:n]` into canonical order: every maximal run of codepoints with nonzero
# combining class is stably sorted by class; starters never move.
function _canonical_sort!(d::Vector{UInt32}, n::Int)
    lo = 1
    while lo <= n
        if iszero(_combining_class(d[lo]))
            lo += 1
            continue
        end
        hi = lo
        while hi < n && !iszero(_combining_class(d[hi + 1]))
            hi += 1
        end
        # MergeSort is stable, so marks of equal class keep their relative order
        hi > lo && sort!(view(d, lo:hi); by=_combining_class, alg=MergeSort)
        lo = hi + 1
    end
    return d
end

# Drop the `consumed` leading codepoints of `d`, then decompose further characters of `s`
# (starting from the `iterate` result `state`) until a prefix of `d` can no longer be
# affected by canonical reordering.  Returns `(ready, state)`: `d[1:ready]` is in final
# canonical order and `state` is the updated iteration state.  `ready == 0` only when
# `s` is exhausted and nothing is left to compare.
function _next_codepoints!(d::Vector{UInt32}, scratch::Vector{UInt32}, consumed::Int,
                           state, s::AbstractString, options::Integer)
    deleteat!(d, 1:consumed)
    ready = 0
    while true
        if state === nothing
            ready = length(d) # end of input: everything still pending is final
            break
        end
        c = UInt32(state[1])
        state = iterate(s, state[2])
        k = _decompose_char!(c, scratch, options)
        if k > length(scratch) # may be possible in future Unicode versions?
            k = _decompose_char!(c, resize!(scratch, k), options)
        end
        pending = length(d)
        for i in 1:k # k == 0 when the character was stripped (e.g. stripmark=true)
            push!(d, scratch[i])
        end
        # Combining marks are only reordered within a run that follows a starter, so
        # everything in front of the last starter seen so far is final.  Only the newly
        # appended codepoints need to be scanned: older pending codepoints contain no
        # starter except possibly d[1].
        laststarter = 0
        for i in length(d):-1:(pending + 1)
            if iszero(_combining_class(d[i]))
                laststarter = i
                break
            end
        end
        if laststarter > 1
            ready = laststarter - 1
            break
        end
    end
    _canonical_sort!(d, ready)
    return ready, state
end

"""
    isequal_normalized(s1::AbstractString, s2::AbstractString; casefold=false, stripmark=false)

Return whether `s1` and `s2` are canonically equivalent Unicode strings.   If `casefold=true`,
ignores case (performs Unicode case-folding); if `stripmark=true`, strips diacritical marks
and other combining characters.

The strings are compared codepoint by codepoint after canonical decomposition and
canonical reordering of combining marks; no normalized copies of `s1` and `s2` are built.

# Examples

For example, the string `"noël"` can be constructed in two canonically equivalent ways
in Unicode, depending on whether `"ë"` is formed from a single codepoint U+00EB or
from the ASCII character `'e'` followed by the U+0308 combining-diaeresis character.

```jldoctest
julia> s1 = "no\u00EBl"
"noël"

julia> s2 = "noe\u0308l"
"noël"

julia> s1 == s2
false

julia> isequal_normalized(s1, s2)
true

julia> isequal_normalized(s1, "noel", stripmark=true)
true

julia> isequal_normalized(s2, "noel", stripmark=true)
true

julia> isequal_normalized(s1, "NOËL", casefold=true)
true
```
"""
function isequal_normalized(s1::AbstractString, s2::AbstractString; casefold::Bool=false, stripmark::Bool=false)
    options = UTF8PROC_DECOMPOSE
    casefold && (options |= UTF8PROC_CASEFOLD)
    stripmark && (options |= UTF8PROC_STRIPMARK)
    i1, i2 = iterate(s1), iterate(s2)
    d1, d2 = UInt32[], UInt32[] # decomposed, canonically ordered codepoints
    scratch = Vector{UInt32}(undef, 4) # decomposition of a single character
    n1 = n2 = 0 # number of comparable codepoints in d1, d2
    j1 = j2 = 1 # indices in d1, d2
    while true
        if j1 > n1
            n1, i1 = _next_codepoints!(d1, scratch, n1, i1, s1, options)
            j1 = 1
        end
        if j2 > n2
            n2, i2 = _next_codepoints!(d2, scratch, n2, i2, s2, options)
            j2 = 1
        end
        # a refill yields nothing only once its string is exhausted, so the strings
        # are equivalent exactly when both run out at the same time
        (n1 == 0 || n2 == 0) && return n1 == n2
        d1[j1] == d2[j2] || return false
        j1 += 1; j2 += 1
    end
end
| 161 | + |
92 | 162 | end
|
0 commit comments