Skip to content

Commit 21bbaef

Browse files
committed
Reorganize UTF handling files
1 parent fc0364b commit 21bbaef

File tree

7 files changed

+461
-440
lines changed

7 files changed

+461
-440
lines changed

base/sysimg.jl

+6-1
Original file line numberDiff line numberDiff line change
@@ -84,10 +84,15 @@ include("iterator.jl")
8484
include("osutils.jl")
8585

8686
# strings & printing
87+
include("utferror.jl")
88+
include("utftype.jl")
89+
include("utfcheck.jl")
8790
include("char.jl")
8891
include("ascii.jl")
8992
include("utf8.jl")
90-
include("utf.jl")
93+
include("utf16.jl")
94+
include("utf32.jl")
95+
include("utfconvert.jl")
9196
include("iobuffer.jl")
9297
include("string.jl")
9398
include("utf8proc.jl")

base/utf16.jl

+72
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
# This file is a part of Julia. License is MIT: http://julialang.org/license
2+
3+
# Number of characters in `s`. The last element of `s.data` is excluded from
# the scan, and every remaining code unit that is not a trailing surrogate
# counts as the start of one character.
function length(s::UTF16String)
    units = s.data
    nunits = length(units) - 1
    nchars = 0
    for idx = 1:nunits
        @inbounds if !is_surrogate_trail(units[idx])
            nchars += 1
        end
    end
    return nchars
end
13+
14+
# Index of the code unit at which the last character of `s` begins.
# Returns 0 when `s.data` holds a single element only.
function endof(s::UTF16String)
    units = s.data
    fin = length(units) - 1
    if fin == 0
        return fin
    end
    # When the final code unit is a surrogate, the character starts one unit earlier.
    if is_surrogate_codeunit(units[fin])
        return fin - 1
    end
    return fin
end
20+
21+
# Combine a lead and a trail surrogate into the code point they encode.
# Subtracting 0xd7f7 from the lead removes the 0xd800 tag and, once shifted,
# also pre-compensates for the 0xdc00 tag carried by `trail` and adds the
# 0x10000 offset, so a plain addition finishes the job.
function get_supplementary(lead::Unsigned, trail::Unsigned)
    high = UInt32(lead - 0xd7f7) << 10
    return high + trail
end
22+
23+
# Iteration step: decode the character starting at code unit `i` and return it
# together with the index of the following character.
# Malformed surrogate sequences are reported through utf_errfunc.
function next(s::UTF16String, i::Int)
    unit = s.data[i]
    if !is_surrogate_codeunit(unit)
        return (Char(unit), i+1)
    end
    # A surrogate needs a partner; the last element of s.data is the terminating \0
    if i >= (length(s.data)-1)
        utf_errfunc(UTF_ERR_MISSING_SURROGATE, i, UInt32(unit))
    end
    if !is_surrogate_lead(unit)
        utf_errfunc(UTF_ERR_NOT_LEAD, i, unit)
    end
    partner = s.data[i+1]
    if !is_surrogate_trail(partner)
        utf_errfunc(UTF_ERR_NOT_TRAIL, i, unit)
    end
    return (Char(get_supplementary(unit, partner)), i+2)
end
33+
34+
# Map index `i` to the mirrored position `length(s.data) - i`, stepping back
# one more code unit when that position holds a trailing surrogate.
function reverseind(s::UTF16String, i::Integer)
    units = s.data
    j = length(units) - i
    if is_surrogate_trail(units[j])
        return j - 1
    end
    return j
end
38+
39+
# Index of the last code unit of `s`; s.data includes the NULL terminator,
# which is not counted.
function lastidx(s::UTF16String)
    return length(s.data) - 1
end
40+
41+
# Return a new UTF16String holding the characters of `s` in reverse order.
# Code units are copied back to front; surrogate pairs are kept in
# lead-then-trail order so that each pair still encodes the same character.
function reverse(s::UTF16String)
    d = s.data
    out = similar(d)
    out[end] = 0 # NULL termination
    n = length(d)
    @inbounds for i = 1:n-1
        ch = d[n-i]
        # `i > 1` guards the swap: if the data ends in an unpaired lead surrogate
        # there is no out[i-1], and under @inbounds touching out[0] would be an
        # unchecked out-of-bounds access.
        if i > 1 && is_surrogate_lead(ch)
            # The trail unit of this pair was stored in out[i-1] on the previous
            # iteration; swap so the lead comes first again.
            out[i], out[i-1] = out[i-1], ch
        else
            out[i] = ch
        end
    end
    UTF16String(out)
end
56+
57+
# Size of `s` in bytes: the size of s.data minus one UInt16.
function sizeof(s::UTF16String)
    return sizeof(s.data) - sizeof(UInt16)
end
58+
59+
# Return true when `data` contains no unpaired surrogates, i.e. every lead
# surrogate is immediately followed by a trail surrogate and no trail
# surrogate appears on its own.
function isvalid(::Type{UTF16String}, data::AbstractArray{UInt16})
    total = length(data) # this may include NULL termination; that's okay
    idx = 1
    @inbounds while idx < total
        if !is_surrogate_codeunit(data[idx])
            idx += 1
        elseif is_surrogate_lead(data[idx]) && is_surrogate_trail(data[idx+1])
            idx += 2
        else
            # surrogate without a proper partner
            return false
        end
    end
    # Either everything was consumed, or one unit is left and must not be a surrogate
    return idx > total || !is_surrogate_codeunit(data[idx])
end

base/utf32.jl

+33
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# This file is a part of Julia. License is MIT: http://julialang.org/license
2+
3+
# UTF-32 basic functions
4+
# Iteration step: element `i` of s.data and the index that follows it.
function next(s::UTF32String, i::Int)
    return (s.data[i], i+1)
end
5+
# Last valid index of `s`: one less than the number of elements in s.data.
function endof(s::UTF32String)
    return length(s.data) - 1
end
6+
# Number of characters in `s`: one less than the number of elements in s.data.
function length(s::UTF32String)
    return length(s.data) - 1
end
7+
8+
# Return a new UTF32String with the characters of `s` reversed. Only positions
# 1:length(s) of the copied data are reversed, so the final element stays last.
function reverse(s::UTF32String)
    buf = copy(s.data)
    reverse!(buf, 1, length(s))
    return UTF32String(buf)
end
9+
10+
# Size of `s` in bytes: the size of s.data minus one Char.
function sizeof(s::UTF32String)
    return sizeof(s.data) - sizeof(Char)
end
11+
12+
# Return true when every element of `str`, converted to UInt32, is accepted by
# isvalid(Char, ...); stop at the first element that is not.
function isvalid(::Type{UTF32String}, str::Union(Vector{Char}, Vector{UInt32}))
    idx = 1
    total = length(str)
    while idx <= total
        @inbounds if !isvalid(Char, UInt32(str[idx]))
            return false
        end
        idx += 1
    end
    return true
end
18+
# A vector of Char is checked with the UTF32String validity rules.
function isvalid(str::Vector{Char})
    return isvalid(UTF32String, str)
end
19+
20+
# Apply `f` to every character of `s` and collect the results in a new
# UTF32String. `f` must return a Char for each input; anything else raises
# an ArgumentError.
function map(f, s::UTF32String)
    src = s.data
    dst = similar(src)
    dst[end] = 0

    nchars = length(src) - 1
    @inbounds for k = 1:nchars
        mapped = f(src[k])
        isa(mapped, Char) ||
            throw(ArgumentError("map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead"))
        dst[k] = (mapped::Char)
    end
    return UTF32String(dst)
end

base/utfcheck.jl

+255
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,255 @@
1+
# This file is a part of Julia. License is MIT: http://julialang.org/license
2+
3+
# Functions to check validity of UTF-8, UTF-16, and UTF-32 encoded strings,
4+
# and also to return information necessary to convert to other encodings
5+
6+
# True when `c` is a lead (high) surrogate, i.e. lies in 0xd800-0xdbff.
# The test compares the bits above the low ten against 0xd800 >> 10. Shifting
# (rather than masking with a fixed-width constant) keeps every high bit of
# `c` in the comparison, so the result is correct for any unsigned width.
is_surrogate_lead(c::Unsigned) = ((c >> 10) == 0x36)
7+
# True when `c` is a trail (low) surrogate, i.e. lies in 0xdc00-0xdfff.
# The test compares the bits above the low ten against 0xdc00 >> 10. Shifting
# (rather than masking with a fixed-width constant) keeps every high bit of
# `c` in the comparison, so the result is correct for any unsigned width.
is_surrogate_trail(c::Unsigned) = ((c >> 10) == 0x37)
8+
# True when `c` is any surrogate code unit, i.e. lies in 0xd800-0xdfff.
# The test compares the bits above the low eleven against 0xd800 >> 11.
# Shifting (rather than masking with a fixed-width constant) keeps every high
# bit of `c` in the comparison, so the result is correct for any unsigned width.
is_surrogate_codeunit(c::Unsigned) = ((c >> 11) == 0x1b)
9+
# True when bits 7 and 6 of `c` are `10`, the tag carried by a UTF-8
# continuation byte.
function is_valid_continuation(c)
    return 0x80 == (c & 0xc0)
end
10+
11+
# Option bits accepted by the check_string_* functions (combine with |)

const UTF_NO_LONG_NULL  = 1 << 0    # reject 0xc0 0x80 as an encoding of '\0'
const UTF_NO_SURROGATES = 1 << 1    # reject surrogate pairs in UTF-8/UTF-32
const UTF_ACCEPT_LONG   = 1 << 2    # allow long encodings (other than long null in UTF-8)

# Result bits reported in the `flags` value returned by the check_string_* functions

const UTF_LONG      = 1 << 0        # long encodings are present
const UTF_LATIN1    = 1 << 1        # characters in range 0x80-0xFF present
const UTF_UNICODE2  = 1 << 2        # characters in range 0x100-0x7ff present
const UTF_UNICODE3  = 1 << 3        # characters in range 0x800-0xd7ff, 0xe000-0xffff
const UTF_UNICODE4  = 1 << 4        # non-BMP characters present
const UTF_SURROGATE = 1 << 5        # surrogate pairs present
23+
24+
# Fold one UTF-8 continuation byte into the partially decoded character `ch`.
# A byte rejected by is_valid_continuation is reported through utf_errfunc with
# UTF_ERR_CONT and the position `pos`. The result is `ch` shifted left by six
# bits with the low six bits of `byt` appended; `pos` itself is not modified.
@inline function get_continuation(ch::UInt32, byt::UInt8, pos)
    is_valid_continuation(byt) || utf_errfunc(UTF_ERR_CONT, pos, byt)
    payload = byt & 0x3f
    return (ch << 6) | payload
end
29+
30+
# check_string_utf8(dat, options=0)
#
# Validate a UTF-8 encoded vector of UInt8 and gather the statistics needed to
# convert it to another encoding.
#
#   dat      Vector{UInt8} holding the encoded bytes
#   options  bitwise OR of UTF_NO_LONG_NULL, UTF_NO_SURROGATES, UTF_ACCEPT_LONG
#
# Returns (total characters, flags, 4-byte, 3-byte, 2-byte), where `flags` is a
# combination of UTF_LONG, UTF_LATIN1, UTF_UNICODE2, UTF_UNICODE3, UTF_UNICODE4
# and UTF_SURROGATE. A surrogate pair (two 3-byte sequences) is counted as one
# character in the 4-byte total.
#
# Invalid input is reported through utf_errfunc (defined in another file; the
# code below relies on it not returning -- TODO confirm).
function check_string_utf8(dat::Vector{UInt8}, options::Integer=0)
    local byt::UInt8, ch::UInt32, surr::UInt32
    flags::UInt = 0
    totalchar = num2byte = num3byte = num4byte = 0
    pos = 0
    len = sizeof(dat)
    @inbounds while pos < len
        ch = dat[pos += 1]
        totalchar += 1
        if ch > 0x7f
            # Check UTF-8 encoding
            if ch < 0xc0
                # 0x80-0xbf are continuation bytes and can never start a sequence;
                # without this check they would be decoded as 2-byte lead bytes
                utf_errfunc(UTF_ERR_INVALID, pos, ch)
            elseif ch < 0xe0
                # 2-byte UTF-8 sequence (i.e. characters 0x80-0x7ff)
                (pos == len) && utf_errfunc(UTF_ERR_SHORT, pos, ch)
                ch = get_continuation(ch & 0x3f, dat[pos += 1], pos)
                if ch > 0x7f
                    num2byte += 1
                    flags |= (ch > 0xff) ? UTF_UNICODE2 : UTF_LATIN1
                elseif (options & UTF_ACCEPT_LONG) != 0
                    flags |= UTF_LONG
                elseif (ch == 0) && ((options & UTF_NO_LONG_NULL) == 0)
                    flags |= UTF_LONG
                else
                    utf_errfunc(UTF_ERR_LONG, pos, ch)
                end
            elseif ch < 0xf0
                # 3-byte UTF-8 sequence (i.e. characters 0x800-0xffff)
                (pos + 2 > len) && utf_errfunc(UTF_ERR_SHORT, pos, ch)
                ch = get_continuation(ch & 0x0f, dat[pos += 1], pos)
                ch = get_continuation(ch, dat[pos += 1], pos)
                # check for surrogate pairs, make sure correct
                if is_surrogate_codeunit(ch)
                    !is_surrogate_lead(ch) && utf_errfunc(UTF_ERR_NOT_LEAD, pos-2, ch)
                    # next character *must* be a trailing surrogate character
                    (pos + 3 > len) && utf_errfunc(UTF_ERR_MISSING_SURROGATE, pos-2, ch)
                    byt = dat[pos += 1]
                    # every surrogate is encoded with the lead byte 0xed
                    (byt != 0xed) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos, byt)
                    surr = get_continuation(0x0000d, dat[pos += 1], pos)
                    surr = get_continuation(surr, dat[pos += 1], pos)
                    !is_surrogate_trail(surr) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos-2, surr)
                    (options & UTF_NO_SURROGATES) != 0 && utf_errfunc(UTF_ERR_SURROGATE, pos-2, surr)
                    flags |= UTF_SURROGATE
                    num4byte += 1
                elseif ch > 0x07ff
                    num3byte += 1
                elseif (options & UTF_ACCEPT_LONG) != 0
                    # This is an overly long encoded character; count it by the
                    # size it really needs, as the 2-byte and 4-byte branches do
                    flags |= UTF_LONG
                    if ch > 0x7f
                        num2byte += 1
                    end
                else
                    utf_errfunc(UTF_ERR_LONG, pos-2, ch)
                end
            elseif ch < 0xf5
                # 4-byte UTF-8 sequence (i.e. characters > 0xffff)
                (pos + 3 > len) && utf_errfunc(UTF_ERR_SHORT, pos, ch)
                ch = get_continuation(ch & 0x07, dat[pos += 1], pos)
                ch = get_continuation(ch, dat[pos += 1], pos)
                ch = get_continuation(ch, dat[pos += 1], pos)
                # `pos` is now the last byte of the sequence, so it started at pos-3
                if ch > 0x10ffff
                    utf_errfunc(UTF_ERR_INVALID, pos-3, ch)
                elseif ch > 0xffff
                    num4byte += 1
                elseif is_surrogate_codeunit(ch)
                    utf_errfunc(UTF_ERR_SURROGATE, pos-3, ch)
                elseif (options & UTF_ACCEPT_LONG) != 0
                    # This is an overly long encoded character
                    flags |= UTF_LONG
                    if ch > 0x7ff
                        num3byte += 1
                    elseif ch > 0x7f
                        num2byte += 1
                    end
                else
                    utf_errfunc(UTF_ERR_LONG, pos-3, ch)
                end
            else
                # 0xf5-0xff never appear in UTF-8
                utf_errfunc(UTF_ERR_INVALID, pos, ch)
            end
        end
    end
    num3byte != 0 && (flags |= UTF_UNICODE3)
    num4byte != 0 && (flags |= UTF_UNICODE4)
    return totalchar, flags, num4byte, num3byte, num2byte
end
124+
125+
# check_string_utf16(dat, len)
#
# Validate the first `len` code units of a UTF-16 encoded vector and gather the
# statistics needed to convert it to another encoding.
#
#   dat  Vector{UInt16} holding the code units
#   len  number of code units of `dat` to examine
#
# Returns (total characters, flags, 4-byte, 3-byte, 2-byte). A surrogate pair
# is counted as one character in the 4-byte total. Unpaired or misordered
# surrogates are reported through utf_errfunc.
function check_string_utf16(dat::Vector{UInt16}, len::Int)
    local unit::UInt32
    flags::UInt = 0
    nchars = n2 = n3 = n4 = 0
    idx = 0
    @inbounds while idx < len
        idx += 1
        unit = dat[idx]
        nchars += 1
        # ASCII needs no further bookkeeping
        unit > 0x7f || continue
        if unit < 0x800
            n2 += 1
            flags |= (unit < 0x100) ? UTF_LATIN1 : UTF_UNICODE2
        elseif !is_surrogate_codeunit(unit)
            n3 += 1
        elseif !is_surrogate_lead(unit)
            # a trailing surrogate without a preceding lead
            utf_errfunc(UTF_ERR_NOT_LEAD, idx, unit)
        else
            idx == len && utf_errfunc(UTF_ERR_MISSING_SURROGATE, idx, unit)
            # next character *must* be a trailing surrogate character
            idx += 1
            unit = dat[idx]
            is_surrogate_trail(unit) || utf_errfunc(UTF_ERR_NOT_TRAIL, idx, unit)
            n4 += 1
        end
    end
    n3 == 0 || (flags |= UTF_UNICODE3)
    n4 == 0 || (flags |= UTF_UNICODE4)
    return nchars, flags, n4, n3, n2
end
168+
169+
# check_string_utf32(dat, len, options=0)
#
# Validate the first `len` elements of a UTF-32 encoded vector and gather the
# statistics needed to convert it to another encoding.
#
#   dat      Vector{UInt32} holding the code points
#   len      number of elements of `dat` to examine
#   options  option bits; only UTF_NO_SURROGATES is consulted here
#
# Returns (total characters, flags, 4-byte, 3-byte, 2-byte). A lead surrogate
# followed by a trail surrogate is counted as one character in the 4-byte
# total and sets UTF_SURROGATE. Invalid input is reported through utf_errfunc.
function check_string_utf32(dat::Vector{UInt32}, len::Int, options::Integer=0)
    local cp::UInt32
    flags::UInt = 0
    nchars = n2 = n3 = n4 = 0
    idx = 0
    @inbounds while idx < len
        idx += 1
        cp = dat[idx]
        nchars += 1
        # ASCII needs no further bookkeeping
        cp > 0x7f || continue
        if cp < 0x800
            n2 += 1
            flags |= (cp < 0x100) ? UTF_LATIN1 : UTF_UNICODE2
        elseif cp > 0xffff
            cp > 0x10ffff && utf_errfunc(UTF_ERR_INVALID, idx, cp)
            n4 += 1
        elseif !is_surrogate_codeunit(cp)
            n3 += 1
        elseif !is_surrogate_lead(cp)
            # a trailing surrogate without a preceding lead
            utf_errfunc(UTF_ERR_NOT_LEAD, idx, cp)
        else
            idx == len && utf_errfunc(UTF_ERR_MISSING_SURROGATE, idx, cp)
            # next character *must* be a trailing surrogate character
            idx += 1
            cp = dat[idx]
            is_surrogate_trail(cp) || utf_errfunc(UTF_ERR_NOT_TRAIL, idx, cp)
            n4 += 1
            (options & UTF_NO_SURROGATES) == 0 || utf_errfunc(UTF_ERR_SURROGATE, idx, cp)
            flags |= UTF_SURROGATE
        end
    end
    n3 == 0 || (flags |= UTF_UNICODE3)
    n4 == 0 || (flags |= UTF_UNICODE4)
    return nchars, flags, n4, n3, n2
end
217+
218+
# check_string_abs(str, options=0)
#
# Validate the characters of an arbitrary AbstractString, read through the
# start/next iteration protocol, and gather the statistics needed to convert
# it to another encoding.
#
#   str      string to examine
#   options  option bits; only UTF_NO_SURROGATES is consulted here
#
# Returns (total characters, flags, 4-byte, 3-byte, 2-byte). A lead surrogate
# followed by a trail surrogate is counted as one character in the 4-byte
# total and sets UTF_SURROGATE. Invalid input is reported through utf_errfunc.
function check_string_abs(str::AbstractString, options::Integer=0)
    local ch::UInt32
    flags::UInt = 0
    totalchar = num2byte = num3byte = num4byte = 0
    pos = start(str)
    len = endof(str)
    # `pos` is the index of the next character to read (not of the one just
    # read), so the character starting at index `len` must still be processed
    @inbounds while pos <= len
        ch, pos = next(str, pos)
        totalchar += 1
        if ch > 0x7f
            if ch < 0x100
                num2byte += 1
                flags |= UTF_LATIN1
            elseif ch < 0x800
                num2byte += 1
                flags |= UTF_UNICODE2
            elseif ch > 0xffff
                (ch > 0x10ffff) && utf_errfunc(UTF_ERR_INVALID, pos, ch)
                num4byte += 1
            elseif !is_surrogate_codeunit(ch)
                num3byte += 1
            elseif is_surrogate_lead(ch)
                # the trail is missing only when nothing is left to read,
                # i.e. when the next index lies beyond the last character
                pos > len && utf_errfunc(UTF_ERR_MISSING_SURROGATE, pos, ch)
                # next character *must* be a trailing surrogate character
                ch, pos = next(str, pos)
                !is_surrogate_trail(ch) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos, ch)
                num4byte += 1
                (options & UTF_NO_SURROGATES) != 0 && utf_errfunc(UTF_ERR_SURROGATE, pos, ch)
                flags |= UTF_SURROGATE
            else
                utf_errfunc(UTF_ERR_NOT_LEAD, pos, ch)
            end
        end
    end
    num3byte != 0 && (flags |= UTF_UNICODE3)
    num4byte != 0 && (flags |= UTF_UNICODE4)
    return totalchar, flags, num4byte, num3byte, num2byte
end

0 commit comments

Comments
 (0)