From c5216e001510ac77eee65d3385faeacfe2231a24 Mon Sep 17 00:00:00 2001 From: Solomon Rutzky Date: Mon, 16 Sep 2019 23:16:28 -0400 Subject: [PATCH 1/6] Correct code point format in Base/Char/show function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two minor changes (both on line 307) to conform to the Unicode Standard. Unicode code points currently display with: 1. Lowercase letters, a - f, when present 2. A leading 0 for 5-digit code point values (i.e. 10000 - fffff) However, the Unicode Standard specifies that when using the "U+" notation, you should use: 1. Uppercase letters 2. Leading zeros only when the code point would have fewer than four digits (i.e. 0000 - 0FFF) For reference, the Unicode Standard (two versions to show consistency over time) * [(Version 12.1, 2019) Appendix A: Notational Conventions ⇒ Code Points](http://www.unicode.org/versions/Unicode12.0.0/appA.pdf) * [(Version 4.0.0, 2003) Preface: Notational Conventions ⇒ Code Points](http://www.unicode.org/versions/Unicode4.0.0/Preface.pdf) states: > In running text, an individual Unicode code point is expressed as U+n, where n is four to six hexadecimal digits, using the digits 0–9 and uppercase letters A–F (for 10 through 15, respectively). Leading zeros are omitted, unless the code point would have fewer than four hexadecimal digits—for example, U+0001, U+0012, U+0123, U+1234, U+12345, U+102345. --- base/char.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/char.jl b/base/char.jl index a983dcae4b81c..4207b9d12f910 100644 --- a/base/char.jl +++ b/base/char.jl @@ -304,7 +304,7 @@ function show(io::IO, ::MIME"text/plain", c::T) where {T<:AbstractChar} else u = codepoint(c) end - h = string(u, base = 16, pad = u ≤ 0xffff ? 4 : 6) + h = uppercase(string(u, base = 16, pad = 4)) print(io, (isascii(c) ? 
"ASCII/" : ""), "Unicode U+", h) else print(io, ": Malformed UTF-8") From 96d668d92cb1bb20558597398d1d021fb859aa86 Mon Sep 17 00:00:00 2001 From: Solomon Rutzky Date: Tue, 17 Sep 2019 17:38:32 -0400 Subject: [PATCH 2/6] Add tests for U+ syntax formatting --- test/char.jl | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/test/char.jl b/test/char.jl index 3a2c54b76d3bb..9043c1a65b877 100644 --- a/test/char.jl +++ b/test/char.jl @@ -290,3 +290,12 @@ end @testset "broadcasting of Char" begin @test identity.('a') == 'a' end + +@testset "code point format of U+ syntax (PR 33291)" begin + @test repr("text/plain", '\n') == "'\\n': ASCII/Unicode U+000A (category Cc: Other, control)" + @test repr("text/plain", '/') == "'/': ASCII/Unicode U+002F (category Po: Punctuation, other)" + @test repr("text/plain", '\u10e') == "'Ď': Unicode U+010E (category Lu: Letter, uppercase)" + @test repr("text/plain", '\u3a2c') == "'㨬': Unicode U+3A2C (category Lo: Letter, other)" + @test repr("text/plain", '\U001f428') == "'🐨': Unicode U+1F428 (category So: Symbol, other)" + @test repr("text/plain", '\U010f321') == "'\\U10f321': Unicode U+10F321 (category Co: Other, private use)" +end From 8a6280e5162bc34d201d327ba7883ab3815a8451 Mon Sep 17 00:00:00 2001 From: Solomon Rutzky Date: Tue, 17 Sep 2019 23:44:28 -0400 Subject: [PATCH 3/6] Update code point format to match change in show() function --- base/strings/basic.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/strings/basic.jl b/base/strings/basic.jl index 98709d6fb470e..8ac496cc9e6ce 100644 --- a/base/strings/basic.jl +++ b/base/strings/basic.jl @@ -107,7 +107,7 @@ julia> isvalid(str, 1) true julia> str[1] -'α': Unicode U+03b1 (category Ll: Letter, lowercase) +'α': Unicode U+03B1 (category Ll: Letter, lowercase) julia> isvalid(str, 2) false From f364e43d6094e3441ea972034ccc7d2f9d537a07 Mon Sep 17 00:00:00 2001 From: Solomon Rutzky Date: Wed, 18 Sep 2019 00:29:18 -0400 Subject: [PATCH 4/6] Update code point 
format to match change in show() function --- base/iostream.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/base/iostream.jl b/base/iostream.jl index 70b08d84f7aa9..33a88095dba3b 100644 --- a/base/iostream.jl +++ b/base/iostream.jl @@ -100,7 +100,7 @@ julia> io = IOBuffer("JuliaLang is a GitHub organization."); julia> seek(io, 5); julia> read(io, Char) -'L': ASCII/Unicode U+004c (category Lu: Letter, uppercase) +'L': ASCII/Unicode U+004C (category Lu: Letter, uppercase) ``` """ function seek(s::IOStream, n::Integer) @@ -122,12 +122,12 @@ julia> io = IOBuffer("JuliaLang is a GitHub organization."); julia> seek(io, 5); julia> read(io, Char) -'L': ASCII/Unicode U+004c (category Lu: Letter, uppercase) +'L': ASCII/Unicode U+004C (category Lu: Letter, uppercase) julia> seekstart(io); julia> read(io, Char) -'J': ASCII/Unicode U+004a (category Lu: Letter, uppercase) +'J': ASCII/Unicode U+004A (category Lu: Letter, uppercase) ``` """ seekstart(s::IO) = seek(s,0) From 0452e63c307e645582589e50ba8aae806229f80d Mon Sep 17 00:00:00 2001 From: Solomon Rutzky Date: Wed, 18 Sep 2019 00:50:15 -0400 Subject: [PATCH 5/6] Update code point format to match change in show() function --- doc/src/manual/strings.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/doc/src/manual/strings.md b/doc/src/manual/strings.md index e3b5935618ca4..bc9c7218f3151 100644 --- a/doc/src/manual/strings.md +++ b/doc/src/manual/strings.md @@ -88,8 +88,8 @@ julia> isvalid(Char, 0x110000) false ``` -As of this writing, the valid Unicode code points are `U+00` through `U+d7ff` and `U+e000` through -`U+10ffff`. These have not all been assigned intelligible meanings yet, nor are they necessarily +As of this writing, the valid Unicode code points are `U+0000` through `U+D7FF` and `U+E000` through +`U+10FFFF`. 
These have not all been assigned intelligible meanings yet, nor are they necessarily interpretable by applications, but all of these values are considered to be valid Unicode characters. You can input any Unicode character in single quotes using `\u` followed by up to four hexadecimal @@ -107,7 +107,7 @@ julia> '\u2200' '∀': Unicode U+2200 (category Sm: Symbol, math) julia> '\U10ffff' -'\U10ffff': Unicode U+10ffff (category Cn: Other, not assigned) +'\U10ffff': Unicode U+10FFFF (category Cn: Other, not assigned) ``` Julia uses your system's locale and language settings to determine which characters can be printed @@ -173,10 +173,10 @@ julia> str[1] 'H': ASCII/Unicode U+0048 (category Lu: Letter, uppercase) julia> str[6] -',': ASCII/Unicode U+002c (category Po: Punctuation, other) +',': ASCII/Unicode U+002C (category Po: Punctuation, other) julia> str[end] -'\n': ASCII/Unicode U+000a (category Cc: Other, control) +'\n': ASCII/Unicode U+000A (category Cc: Other, control) ``` Many Julia objects, including strings, can be indexed with integers. 
The index of the first @@ -192,7 +192,7 @@ a normal value: ```jldoctest helloworldstring julia> str[end-1] -'.': ASCII/Unicode U+002e (category Po: Punctuation, other) +'.': ASCII/Unicode U+002E (category Po: Punctuation, other) julia> str[end÷2] ' ': ASCII/Unicode U+0020 (category Zs: Separator, space) @@ -223,7 +223,7 @@ Notice that the expressions `str[k]` and `str[k:k]` do not give the same result: ```jldoctest helloworldstring julia> str[6] -',': ASCII/Unicode U+002c (category Po: Punctuation, other) +',': ASCII/Unicode U+002C (category Po: Punctuation, other) julia> str[6:6] "," @@ -416,7 +416,7 @@ julia> foreach(display, s) '\xc0\xa0': [overlong] ASCII/Unicode U+0020 (category Zs: Separator, space) '\xe2\x88': Malformed UTF-8 (category Ma: Malformed, bad data) '\xe2': Malformed UTF-8 (category Ma: Malformed, bad data) -'|': ASCII/Unicode U+007c (category Sm: Symbol, math) +'|': ASCII/Unicode U+007C (category Sm: Symbol, math) julia> isvalid.(collect(s)) 4-element BitArray{1}: @@ -429,7 +429,7 @@ julia> s2 = "\xf7\xbf\xbf\xbf" "\U1fffff" julia> foreach(display, s2) -'\U1fffff': Unicode U+1fffff (category In: Invalid, too high) +'\U1fffff': Unicode U+1FFFFF (category In: Invalid, too high) ``` We can see that the first two code units in the string `s` form an overlong encoding of From 249202861f8248698ee3253a1c4333c9070a41a6 Mon Sep 17 00:00:00 2001 From: Solomon Rutzky Date: Wed, 18 Sep 2019 01:19:15 -0400 Subject: [PATCH 6/6] Update code point format to match change in show() function --- base/io.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/io.jl b/base/io.jl index d339b773da42b..12130f8ff81dd 100644 --- a/base/io.jl +++ b/base/io.jl @@ -135,7 +135,7 @@ Read the entirety of `io`, as a `String`. 
julia> io = IOBuffer("JuliaLang is a GitHub organization"); julia> read(io, Char) -'J': ASCII/Unicode U+004a (category Lu: Letter, uppercase) +'J': ASCII/Unicode U+004A (category Lu: Letter, uppercase) julia> io = IOBuffer("JuliaLang is a GitHub organization");