Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add Unicode.julia_chartransform Julia-parser normalization #42561

Merged
merged 16 commits into from
Oct 18, 2021
Merged
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,11 @@ Standard library changes
#### Unicode
* Added function `isequal_normalized` to check for Unicode equivalence without
explicitly constructing normalized strings ([#42493]).
* The `Unicode.normalize` function now accepts a `chartransform` keyword that can
be used to supply custom character mappings, and a `Unicode.julia_chartransform`
function is provided to reproduce the mapping used in identifier normalization
by the Julia parser ([#42561]).


Deprecated or removed
---------------------
Expand Down
42 changes: 33 additions & 9 deletions base/strings/unicode.jl
Original file line number Diff line number Diff line change
Expand Up @@ -145,20 +145,43 @@ const UTF8PROC_STRIPMARK = (1<<13)

utf8proc_error(result) = error(unsafe_string(ccall(:utf8proc_errmsg, Cstring, (Cssize_t,), result)))

function utf8proc_map(str::Union{String,SubString{String}}, options::Integer)
nwords = ccall(:utf8proc_decompose, Int, (Ptr{UInt8}, Int, Ptr{UInt8}, Int, Cint),
str, sizeof(str), C_NULL, 0, options)
nwords < 0 && utf8proc_error(nwords)
# Static (non-closure) wrapper around the user-supplied callback: it applies
# `callback` to `codepoint` and converts whatever comes back to a `UInt32`.
function utf8proc_custom_func(codepoint::UInt32, callback::Any)
    mapped = callback(codepoint)
    return UInt32(mapped)::UInt32
end

# Method for `chartransform === identity`: no per-character callback is needed, so the
# plain `utf8proc_decompose` C routine is called directly. A negative result from the
# C call is passed to `utf8proc_error`; otherwise the result is returned unchanged.
function utf8proc_decompose(str, options, buffer, nwords, chartransform::typeof(identity))
    status = ccall(:utf8proc_decompose, Int,
                   (Ptr{UInt8}, Int, Ptr{UInt8}, Int, Cint),
                   str, sizeof(str), buffer, nwords, options)
    if status < 0
        utf8proc_error(status)
    end
    return status
end
# Method for an arbitrary `chartransform`: calls the `utf8proc_decompose_custom` C
# routine, giving it a C-callable pointer to `utf8proc_custom_func` together with
# `chartransform` itself as the accompanying callback argument (passed as `Ref{T}`).
# A negative result from the C call is passed to `utf8proc_error`; otherwise the
# result is returned unchanged.
function utf8proc_decompose(str, options, buffer, nwords, chartransform::T) where T
    callback_ptr = @cfunction(utf8proc_custom_func, UInt32, (UInt32, Ref{T}))
    status = ccall(:utf8proc_decompose_custom, Int,
                   (Ptr{UInt8}, Int, Ptr{UInt8}, Int, Cint, Ptr{Cvoid}, Ref{T}),
                   str, sizeof(str), buffer, nwords, options, callback_ptr, chartransform)
    if status < 0
        utf8proc_error(status)
    end
    return status
end

function utf8proc_map(str::Union{String,SubString{String}}, options::Integer, chartransform=identity)
nwords = utf8proc_decompose(str, options, C_NULL, 0, chartransform)
buffer = Base.StringVector(nwords*4)
nwords = ccall(:utf8proc_decompose, Int, (Ptr{UInt8}, Int, Ptr{UInt8}, Int, Cint),
str, sizeof(str), buffer, nwords, options)
nwords < 0 && utf8proc_error(nwords)
nwords = utf8proc_decompose(str, options, buffer, nwords, chartransform)
nbytes = ccall(:utf8proc_reencode, Int, (Ptr{UInt8}, Int, Cint), buffer, nwords, options)
nbytes < 0 && utf8proc_error(nbytes)
return String(resize!(buffer, nbytes))
end

utf8proc_map(s::AbstractString, flags::Integer) = utf8proc_map(String(s), flags)
# Codepoint => replacement-codepoint table copied from julia_charmap.h; it is looked up
# by julia_chartransform in the Unicode stdlib. Keep both tables in sync.
const _julia_charmap = Dict{UInt32,UInt32}(
    0x025B => 0x03B5, # latin small letter open e -> greek small letter epsilon
    0x00B5 => 0x03BC, # micro sign -> greek small letter mu
    0x00B7 => 0x22C5, # middle dot -> dot operator
    0x0387 => 0x22C5, # greek ano teleia -> dot operator
    0x2212 => 0x002D, # minus sign -> hyphen-minus
)

utf8proc_map(s::AbstractString, flags::Integer, chartransform=identity) = utf8proc_map(String(s), flags, chartransform)

# Documented in Unicode module
function normalize(
Expand All @@ -176,6 +199,7 @@ function normalize(
casefold::Bool=false,
lump::Bool=false,
stripmark::Bool=false,
chartransform=identity,
)
flags = 0
stable && (flags = flags | UTF8PROC_STABLE)
Expand All @@ -198,7 +222,7 @@ function normalize(
casefold && (flags = flags | UTF8PROC_CASEFOLD)
lump && (flags = flags | UTF8PROC_LUMP)
stripmark && (flags = flags | UTF8PROC_STRIPMARK)
utf8proc_map(s, flags)
utf8proc_map(s, flags, chartransform)
end

function normalize(s::AbstractString, nf::Symbol)
Expand Down
5 changes: 4 additions & 1 deletion src/flisp/julia_charmap.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
/* Array of {original codepoint, replacement codepoint} normalizations
to perform on Julia identifiers, to canonicalize characters that
are both easily confused and easily inputted by accident. */
are both easily confused and easily inputted by accident.

Important: when this table is updated, also update the corresponding table
in base/strings/unicode.jl */
static const uint32_t charmap[][2] = {
{ 0x025B, 0x03B5 }, // latin small letter open e -> greek small letter epsilon
{ 0x00B5, 0x03BC }, // micro sign -> greek small letter mu
Expand Down
1 change: 1 addition & 0 deletions stdlib/Unicode/docs/src/index.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Unicode

```@docs
Unicode.julia_chartransform
Unicode.isassigned
Unicode.isequal_normalized
Unicode.normalize
Expand Down
66 changes: 62 additions & 4 deletions stdlib/Unicode/src/Unicode.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,50 @@ module Unicode

export graphemes, isequal_normalized

"""
Unicode.julia_chartransform(c::Union{Char,Integer})
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thoughts about naming this something like parsertransform?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BTW, do you have an opinion on my last comment above? #42561 (comment)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the callback function is good

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't have strong feelings on the name…


Map the Unicode character (`Char`) or codepoint (`Integer`) `c` to the corresponding
"equivalent" character or codepoint, respectively, according to the custom equivalence
used within the Julia parser (in addition to NFC normalization).

For example, `'µ'` (U+00B5 micro) is treated as equivalent to `'μ'` (U+03BC mu) by
Julia's parser, so `julia_chartransform` performs this transformation while leaving
other characters unchanged:
```jldoctest
julia> Unicode.julia_chartransform('\u00B5')
'μ': Unicode U+03BC (category Ll: Letter, lowercase)

julia> Unicode.julia_chartransform('x')
'x': ASCII/Unicode U+0078 (category Ll: Letter, lowercase)
```

`julia_chartransform` is mainly useful for passing to the [`Unicode.normalize`](@ref)
function in order to mimic the normalization used by the Julia parser:
```jl
julia> s = "\u00B5o\u0308"
"µö"

julia> s2 = Unicode.normalize(s, compose=true, stable=true, chartransform=Unicode.julia_chartransform)
"μö"

julia> collect(s2)
2-element Vector{Char}:
'μ': Unicode U+03BC (category Ll: Letter, lowercase)
'ö': Unicode U+00F6 (category Ll: Letter, lowercase)

julia> s2 == string(Meta.parse(s))
true
```

!!! compat "Julia 1.8"
This function was introduced in Julia 1.8.
"""
function julia_chartransform end

# Core method: look the codepoint up in Base's charmap table; codepoints that are
# not in the table are returned unchanged.
function julia_chartransform(codepoint::UInt32)
    charmap = Base.Unicode._julia_charmap
    return get(charmap, codepoint, codepoint)
end

# Any other integer is first converted to `UInt32`, then mapped by the core method.
function julia_chartransform(codepoint::Integer)
    return julia_chartransform(UInt32(codepoint))
end

# Characters are mapped through their `UInt32` codepoint and converted back to `Char`.
function julia_chartransform(char::Char)
    mapped = julia_chartransform(UInt32(char))
    return Char(mapped)
end

"""
Unicode.normalize(s::AbstractString; keywords...)
Unicode.normalize(s::AbstractString, normalform::Symbol)
Expand Down Expand Up @@ -42,6 +86,13 @@ options (which all default to `false` except for `compose`) are specified:
* `rejectna=true`: throw an error if unassigned code points are found
* `stable=true`: enforce Unicode versioning stability (never introduce characters missing from earlier Unicode versions)

You can also use the `chartransform` keyword (which defaults to `identity`) to pass an arbitrary
*function* mapping `Integer` codepoints to codepoints, which is called on each
character in `s` as it is processed, in order to perform arbitrary additional normalizations.
For example, by passing `chartransform=Unicode.julia_chartransform`, you can apply a few Julia-specific
character normalizations that are performed by Julia when parsing identifiers (in addition to
NFC normalization: `compose=true, stable=true`).

For example, NFKC corresponds to the options `compose=true, compat=true, stable=true`.

# Examples
Expand All @@ -58,6 +109,9 @@ julia> Unicode.normalize("JuLiA", casefold=true)
julia> Unicode.normalize("JúLiA", stripmark=true)
"JuLiA"
```

!!! compat "Julia 1.8"
The `chartransform` keyword argument requires Julia 1.8.
"""
function normalize end
normalize(s::AbstractString, nf::Symbol) = Base.Unicode.normalize(s, nf)
Expand Down Expand Up @@ -98,12 +152,16 @@ function _decompose_char!(codepoint::Union{Integer,Char}, dest::Vector{UInt32},
end

"""
isequal_normalized(s1::AbstractString, s2::AbstractString; casefold=false, stripmark=false)
isequal_normalized(s1::AbstractString, s2::AbstractString; casefold=false, stripmark=false, chartransform=identity)

Return whether `s1` and `s2` are canonically equivalent Unicode strings. If `casefold=true`,
ignores case (performs Unicode case-folding); if `stripmark=true`, strips diacritical marks
and other combining characters.

As with [`Unicode.normalize`](@ref), you can also pass an arbitrary
function via the `chartransform` keyword (mapping `Integer` codepoints to codepoints)
to perform custom normalizations, such as [`Unicode.julia_chartransform`](@ref).

# Examples

For example, the string `"noël"` can be constructed in two canonically equivalent ways
Expand All @@ -130,7 +188,7 @@ julia> isequal_normalized(s1, "NOËL", casefold=true)
true
```
"""
function isequal_normalized(s1::AbstractString, s2::AbstractString; casefold::Bool=false, stripmark::Bool=false)
function isequal_normalized(s1::AbstractString, s2::AbstractString; casefold::Bool=false, stripmark::Bool=false, chartransform=identity)
function decompose_next_char!(c, state, d, options, s)
n = _decompose_char!(c, d, options)
if n > length(d) # may be possible in future Unicode versions?
Expand All @@ -148,11 +206,11 @@ function isequal_normalized(s1::AbstractString, s2::AbstractString; casefold::Bo
while true
if j1 > n1
i1 === nothing && return i2 === nothing && j2 > n2
j1, n1, i1 = decompose_next_char!(UInt32(i1[1]), i1[2], d1, options, s1)
j1, n1, i1 = decompose_next_char!(chartransform(UInt32(i1[1])), i1[2], d1, options, s1)
end
if j2 > n2
i2 === nothing && return false
j2, n2, i2 = decompose_next_char!(UInt32(i2[1]), i2[2], d2, options, s2)
j2, n2, i2 = decompose_next_char!(chartransform(UInt32(i2[1])), i2[2], d2, options, s2)
end
d1[j1] == d2[j2] || return false
j1 += 1; j2 += 1
Expand Down
8 changes: 7 additions & 1 deletion stdlib/Unicode/test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

using Test
using Unicode
using Unicode: normalize, isassigned
using Unicode: normalize, isassigned, julia_chartransform

@testset "string normalization" begin
# normalize (Unicode normalization etc.):
Expand All @@ -25,6 +25,11 @@ using Unicode: normalize, isassigned
@test normalize("\t\r", stripcc=true) == " "
@test normalize("\t\r", stripcc=true, newline2ls=true) == " \u2028"
@test normalize("\u0072\u0307\u0323", :NFC) == "\u1E5B\u0307" #26917

# julia_chartransform identifier normalization
@test normalize("julia\u025B\u00B5\u00B7\u0387\u2212", chartransform=julia_chartransform) ==
"julia\u03B5\u03BC\u22C5\u22C5\u002D"
@test julia_chartransform('\u00B5') === '\u03BC'
end

@testset "unicode sa#15" begin
Expand Down Expand Up @@ -428,4 +433,5 @@ end
@test !isequal_normalized("no\u00EBl", "noel")
@test isequal_normalized("no\u00EBl", "noel", stripmark=true)
@test isequal_normalized("no\u00EBl", "NOEL", stripmark=true, casefold=true)
@test isequal_normalized("\u00B5\u0302m", "\u03BC\u0302m", chartransform=julia_chartransform)
end