Skip to content

Commit e6488e3

Browse files
committed
WIP: Unicode character properties
1 parent 99cf12c commit e6488e3

15 files changed

+540
-382
lines changed

base/exports.jl

+5
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ export
2222
Serializer,
2323
Docs,
2424
Markdown,
25+
Unicode,
2526

2627
# Types
2728
AbstractChannel,
@@ -40,6 +41,8 @@ export
4041
CartesianIndex,
4142
CartesianRange,
4243
Channel,
44+
CharCategory,
45+
CharCategoryCode,
4346
Cmd,
4447
Colon,
4548
Complex,
@@ -116,6 +119,7 @@ export
116119
SymTridiagonal,
117120
Timer,
118121
Tridiagonal,
122+
UnicodeProperty,
119123
UnitRange,
120124
UpperTriangular,
121125
UTF16String,
@@ -818,6 +822,7 @@ export
818822
bits,
819823
bytes2hex,
820824
bytestring,
825+
charprop,
821826
charwidth,
822827
chomp,
823828
chop,

base/io.jl

+2-2
Original file line numberDiff line numberDiff line change
@@ -176,15 +176,15 @@ function read(s::IO, ::Type{Char})
176176
end
177177

178178
# mimic utf8.next function
179-
trailing = Base.utf8_trailing[ch+1]
179+
trailing = Unicode.utf8_trailing[ch+1]
180180
c::UInt32 = 0
181181
for j = 1:trailing
182182
c += ch
183183
c <<= 6
184184
ch = read(s, UInt8)
185185
end
186186
c += ch
187-
c -= Base.utf8_offset[trailing+1]
187+
c -= Unicode.utf8_offset[trailing+1]
188188
Char(c)
189189
end
190190

base/unicode.jl

+7-1
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,16 @@
11
# This file is a part of Julia. License is MIT: http://julialang.org/license
22

3+
module Unicode
4+
import Base: string, convert, write, length, endof, next, reverseind, lastidx, reverse, isvalid,
5+
sizeof, unsafe_convert, map, getindex, search, rsearch, pointer, containsnul,
6+
lowercase, uppercase, eltype
37
include("unicode/UnicodeError.jl")
48
include("unicode/types.jl")
59
include("unicode/checkstring.jl")
610
include("unicode/utf8.jl")
711
include("unicode/utf16.jl")
812
include("unicode/utf32.jl")
13+
include("unicode/properties.jl")
914
include("unicode/utf8proc.jl")
10-
importall .UTF8proc
15+
end
16+
importall .Unicode

base/unicode/UnicodeError.jl

+24-21
Original file line numberDiff line numberDiff line change
@@ -2,30 +2,33 @@
22

33
## Error messages for Unicode / UTF support
44

5-
const UTF_ERR_SHORT = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>>) missing one or more continuation bytes)"
6-
const UTF_ERR_CONT = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> is not a continuation byte)"
7-
const UTF_ERR_LONG = "invalid UTF-8 sequence, overlong encoding starting at index <<1>> (0x<<2>>)"
8-
const UTF_ERR_NOT_LEAD = "not a leading Unicode surrogate code unit at index <<1>> (0x<<2>>)"
9-
const UTF_ERR_NOT_TRAIL = "not a trailing Unicode surrogate code unit at index <<1>> (0x<<2>>)"
10-
const UTF_ERR_NOT_SURROGATE = "not a valid Unicode surrogate code unit at index <<1>> (0x<<2>>"
11-
const UTF_ERR_MISSING_SURROGATE = "missing trailing Unicode surrogate code unit after index <<1>> (0x<<2>>)"
12-
const UTF_ERR_INVALID = "invalid Unicode character starting at index <<1>> (0x<<2>> > 0x10ffff)"
13-
const UTF_ERR_SURROGATE = "surrogate encoding not allowed in UTF-8 or UTF-32, at index <<1>> (0x<<2>>)"
14-
const UTF_ERR_NULL_16_TERMINATE = "UTF16String data must be NULL-terminated"
15-
const UTF_ERR_NULL_32_TERMINATE = "UTF32String data must be NULL-terminated"
16-
const UTF_ERR_ODD_BYTES_16 = "UTF16String can't have odd number of bytes <<1>>"
17-
const UTF_ERR_ODD_BYTES_32 = "UTF32String must have multiple of 4 bytes <<1>>"
18-
const UTF_ERR_INVALID_CHAR = "invalid Unicode character (0x<<2>> > 0x10ffff)"
19-
const UTF_ERR_INVALID_8 = "invalid UTF-8 data"
20-
const UTF_ERR_INVALID_16 = "invalid UTF-16 data"
21-
const UTF_ERR_INVALID_INDEX = "invalid character index"
22-
const UTF_ERR_MAP_CHAR = "map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead"
5+
const ERR_SHORT = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>>) missing one or more continuation bytes)"
6+
const ERR_CONT = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> is not a continuation byte)"
7+
const ERR_LONG = "invalid UTF-8 sequence, overlong encoding starting at index <<1>> (0x<<2>>)"
8+
const ERR_NOT_LEAD = "not a leading Unicode surrogate code unit at index <<1>> (0x<<2>>)"
9+
const ERR_NOT_TRAIL = "not a trailing Unicode surrogate code unit at index <<1>> (0x<<2>>)"
10+
const ERR_NOT_SURROGATE = "not a valid Unicode surrogate code unit at index <<1>> (0x<<2>>"
11+
const ERR_MISSING_SURROGATE = "missing trailing Unicode surrogate code unit after index <<1>> (0x<<2>>)"
12+
const ERR_INVALID = "invalid Unicode character starting at index <<1>> (0x<<2>> > 0x10ffff)"
13+
const ERR_SURROGATE = "surrogate encoding not allowed in UTF-8 or UTF-32, at index <<1>> (0x<<2>>)"
14+
const ERR_NULL_16_TERMINATE = "UTF16String data must be NULL-terminated"
15+
const ERR_NULL_32_TERMINATE = "UTF32String data must be NULL-terminated"
16+
const ERR_ODD_BYTES_16 = "UTF16String can't have odd number of bytes <<1>>"
17+
const ERR_ODD_BYTES_32 = "UTF32String must have multiple of 4 bytes <<1>>"
18+
const ERR_INVALID_CHAR = "invalid Unicode character (0x<<2>> > 0x10ffff)"
19+
const ERR_INVALID_8 = "invalid UTF-8 data"
20+
const ERR_INVALID_16 = "invalid UTF-16 data"
21+
const ERR_INVALID_INDEX = "invalid character index"
22+
const ERR_MAP_CHAR = "map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead"
23+
24+
export UnicodeError
2325

2426
type UnicodeError <: Exception
25-
errmsg::AbstractString ##< A UTF_ERR_ message
27+
errmsg::AbstractString ##< A Unicode.ERR_ message
2628
errpos::Int32 ##< Position of invalid character
2729
errchr::UInt32 ##< Invalid character
2830
end
2931

30-
show(io::IO, exc::UnicodeError) = print(io, replace(replace(string("UnicodeError: ",exc.errmsg),
31-
"<<1>>",string(exc.errpos)),"<<2>>",hex(exc.errchr)))
32+
Base.show(io::IO, exc::UnicodeError) =
33+
print(io, replace(replace(string("UnicodeError: ",exc.errmsg),
34+
"<<1>>",string(exc.errpos)),"<<2>>",hex(exc.errchr)))

base/unicode/checkstring.jl

+22-20
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,13 @@ const UTF_SURROGATE = 32 ##< surrogate pairs present
2020
## Get a UTF-8 continuation byte, give error if invalid, return updated character value
2121
@inline function get_continuation(ch::UInt32, byt::UInt8, pos)
2222
if !is_valid_continuation(byt)
23-
throw(UnicodeError(UTF_ERR_CONT, pos, byt))
23+
throw(UnicodeError(ERR_CONT, pos, byt))
2424
end
2525
(ch << 6) | (byt & 0x3f)
2626
end
2727

28+
export unsafe_checkstring, checkstring
29+
2830
"""
2931
Validates and calculates the number of characters in a UTF-8, UTF-16, or UTF-32 encoded vector/string
3032
@@ -73,7 +75,7 @@ function unsafe_checkstring(dat::Vector{UInt8},
7375
# Check UTF-8 encoding
7476
if ch < 0xe0
7577
# 2-byte UTF-8 sequence (i.e. characters 0x80-0x7ff)
76-
(pos > endpos) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
78+
(pos > endpos) && throw(UnicodeError(ERR_SHORT, pos, ch))
7779
byt, pos = next(dat, pos)
7880
ch = get_continuation(ch & 0x3f, byt, pos)
7981
if ch > 0x7f
@@ -84,28 +86,28 @@ function unsafe_checkstring(dat::Vector{UInt8},
8486
elseif (ch == 0) && accept_long_null
8587
flags |= UTF_LONG
8688
else
87-
throw(UnicodeError(UTF_ERR_LONG, pos, ch))
89+
throw(UnicodeError(ERR_LONG, pos, ch))
8890
end
8991
elseif ch < 0xf0
9092
# 3-byte UTF-8 sequence (i.e. characters 0x800-0xffff)
91-
(pos + 1 > endpos) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
93+
(pos + 1 > endpos) && throw(UnicodeError(ERR_SHORT, pos, ch))
9294
byt, pos = next(dat, pos)
9395
ch = get_continuation(ch & 0x0f, byt, pos)
9496
byt, pos = next(dat, pos)
9597
ch = get_continuation(ch, byt, pos)
9698
# check for surrogate pairs and make sure they are correctly formed
9799
if is_surrogate_codeunit(ch)
98-
!is_surrogate_lead(ch) && throw(UnicodeError(UTF_ERR_NOT_LEAD, pos-2, ch))
100+
!is_surrogate_lead(ch) && throw(UnicodeError(ERR_NOT_LEAD, pos-2, ch))
99101
# next character *must* be a trailing surrogate character
100-
(pos + 2 > endpos) && throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, pos-2, ch))
102+
(pos + 2 > endpos) && throw(UnicodeError(ERR_MISSING_SURROGATE, pos-2, ch))
101103
byt, pos = next(dat, pos)
102-
(byt != 0xed) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, pos, byt))
104+
(byt != 0xed) && throw(UnicodeError(ERR_NOT_TRAIL, pos, byt))
103105
byt, pos = next(dat, pos)
104106
surr = get_continuation(0x0000d, byt, pos)
105107
byt, pos = next(dat, pos)
106108
surr = get_continuation(surr, byt, pos)
107-
!is_surrogate_trail(surr) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, pos-2, surr))
108-
!accept_surrogates && throw(UnicodeError(UTF_ERR_SURROGATE, pos-2, surr))
109+
!is_surrogate_trail(surr) && throw(UnicodeError(ERR_NOT_TRAIL, pos-2, surr))
110+
!accept_surrogates && throw(UnicodeError(ERR_SURROGATE, pos-2, surr))
109111
flags |= UTF_SURROGATE
110112
num4byte += 1
111113
elseif ch > 0x07ff
@@ -114,23 +116,23 @@ function unsafe_checkstring(dat::Vector{UInt8},
114116
flags |= UTF_LONG
115117
num2byte += 1
116118
else
117-
throw(UnicodeError(UTF_ERR_LONG, pos-2, ch))
119+
throw(UnicodeError(ERR_LONG, pos-2, ch))
118120
end
119121
elseif ch < 0xf5
120122
# 4-byte UTF-8 sequence (i.e. characters > 0xffff)
121-
(pos + 2 > endpos) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
123+
(pos + 2 > endpos) && throw(UnicodeError(ERR_SHORT, pos, ch))
122124
byt, pos = next(dat, pos)
123125
ch = get_continuation(ch & 0x07, byt, pos)
124126
byt, pos = next(dat, pos)
125127
ch = get_continuation(ch, byt, pos)
126128
byt, pos = next(dat, pos)
127129
ch = get_continuation(ch, byt, pos)
128130
if ch > 0x10ffff
129-
throw(UnicodeError(UTF_ERR_INVALID, pos-3, ch))
131+
throw(UnicodeError(ERR_INVALID, pos-3, ch))
130132
elseif ch > 0xffff
131133
num4byte += 1
132134
elseif is_surrogate_codeunit(ch)
133-
throw(UnicodeError(UTF_ERR_SURROGATE, pos-3, ch))
135+
throw(UnicodeError(ERR_SURROGATE, pos-3, ch))
134136
elseif accept_long_char
135137
# This is an overly long encoded character
136138
flags |= UTF_LONG
@@ -140,10 +142,10 @@ function unsafe_checkstring(dat::Vector{UInt8},
140142
num2byte += 1
141143
end
142144
else
143-
throw(UnicodeError(UTF_ERR_LONG, pos-2, ch))
145+
throw(UnicodeError(ERR_LONG, pos-2, ch))
144146
end
145147
else
146-
throw(UnicodeError(UTF_ERR_INVALID, pos, ch))
148+
throw(UnicodeError(ERR_INVALID, pos, ch))
147149
end
148150
end
149151
end
@@ -174,22 +176,22 @@ function unsafe_checkstring{T <: Union{Vector{UInt16}, Vector{UInt32}, AbstractS
174176
num2byte += 1
175177
flags |= UTF_UNICODE2
176178
elseif ch > 0x0ffff
177-
(ch > 0x10ffff) && throw(UnicodeError(UTF_ERR_INVALID, pos, ch))
179+
(ch > 0x10ffff) && throw(UnicodeError(ERR_INVALID, pos, ch))
178180
num4byte += 1
179181
elseif !is_surrogate_codeunit(ch)
180182
num3byte += 1
181183
elseif is_surrogate_lead(ch)
182-
pos > endpos && throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, pos, ch))
184+
pos > endpos && throw(UnicodeError(ERR_MISSING_SURROGATE, pos, ch))
183185
# next character *must* be a trailing surrogate character
184186
ch, pos = next(dat, pos)
185-
!is_surrogate_trail(ch) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, pos, ch))
187+
!is_surrogate_trail(ch) && throw(UnicodeError(ERR_NOT_TRAIL, pos, ch))
186188
num4byte += 1
187189
if T != Vector{UInt16}
188-
!accept_surrogates && throw(UnicodeError(UTF_ERR_SURROGATE, pos, ch))
190+
!accept_surrogates && throw(UnicodeError(ERR_SURROGATE, pos, ch))
189191
flags |= UTF_SURROGATE
190192
end
191193
else
192-
throw(UnicodeError(UTF_ERR_NOT_LEAD, pos, ch))
194+
throw(UnicodeError(ERR_NOT_LEAD, pos, ch))
193195
end
194196
end
195197
end

0 commit comments

Comments
 (0)