@@ -20,11 +20,13 @@ const UTF_SURROGATE = 32 ##< surrogate pairs present
20
20
# # Fold one UTF-8 continuation byte into a partially decoded character value.
# # Throws a UnicodeError (constructed from the continuation error code, `pos` and `byt`)
# # when `is_valid_continuation(byt)` is false; otherwise returns `ch` shifted left by
# # 6 bits, OR'ed with the low 6 bits of `byt`.
21
21
@inline function get_continuation (ch:: UInt32 , byt:: UInt8 , pos)
22
22
if ! is_valid_continuation (byt)
23
- throw (UnicodeError (UTF_ERR_CONT , pos, byt))
23
+ throw (UnicodeError (ERR_CONT , pos, byt))
24
24
end
25
25
# make room for 6 more bits in ch, then merge in the low 6 bits (0x3f mask) of byt
(ch << 6 ) | (byt & 0x3f )
26
26
end
27
27
28
+ export unsafe_checkstring, checkstring
29
+
28
30
"""
29
31
Validates and calculates the number of characters in a UTF-8, UTF-16 or UTF-32 encoded vector/string
30
32
@@ -73,7 +75,7 @@ function unsafe_checkstring(dat::Vector{UInt8},
73
75
# Check UTF-8 encoding
74
76
if ch < 0xe0
75
77
# 2-byte UTF-8 sequence (i.e. characters 0x80-0x7ff)
76
- (pos > endpos) && throw (UnicodeError (UTF_ERR_SHORT , pos, ch))
78
+ (pos > endpos) && throw (UnicodeError (ERR_SHORT , pos, ch))
77
79
byt, pos = next (dat, pos)
78
80
ch = get_continuation (ch & 0x3f , byt, pos)
79
81
if ch > 0x7f
@@ -84,28 +86,28 @@ function unsafe_checkstring(dat::Vector{UInt8},
84
86
elseif (ch == 0 ) && accept_long_null
85
87
flags |= UTF_LONG
86
88
else
87
- throw (UnicodeError (UTF_ERR_LONG , pos, ch))
89
+ throw (UnicodeError (ERR_LONG , pos, ch))
88
90
end
89
91
elseif ch < 0xf0
90
92
# 3-byte UTF-8 sequence (i.e. characters 0x800-0xffff)
91
- (pos + 1 > endpos) && throw (UnicodeError (UTF_ERR_SHORT , pos, ch))
93
+ (pos + 1 > endpos) && throw (UnicodeError (ERR_SHORT , pos, ch))
92
94
byt, pos = next (dat, pos)
93
95
ch = get_continuation (ch & 0x0f , byt, pos)
94
96
byt, pos = next (dat, pos)
95
97
ch = get_continuation (ch, byt, pos)
96
98
# check for surrogate pairs, make sure correct
97
99
if is_surrogate_codeunit (ch)
98
- ! is_surrogate_lead (ch) && throw (UnicodeError (UTF_ERR_NOT_LEAD , pos- 2 , ch))
100
+ ! is_surrogate_lead (ch) && throw (UnicodeError (ERR_NOT_LEAD , pos- 2 , ch))
99
101
# next character *must* be a trailing surrogate character
100
- (pos + 2 > endpos) && throw (UnicodeError (UTF_ERR_MISSING_SURROGATE , pos- 2 , ch))
102
+ (pos + 2 > endpos) && throw (UnicodeError (ERR_MISSING_SURROGATE , pos- 2 , ch))
101
103
byt, pos = next (dat, pos)
102
- (byt != 0xed ) && throw (UnicodeError (UTF_ERR_NOT_TRAIL , pos, byt))
104
+ (byt != 0xed ) && throw (UnicodeError (ERR_NOT_TRAIL , pos, byt))
103
105
byt, pos = next (dat, pos)
104
106
surr = get_continuation (0x0000d , byt, pos)
105
107
byt, pos = next (dat, pos)
106
108
surr = get_continuation (surr, byt, pos)
107
- ! is_surrogate_trail (surr) && throw (UnicodeError (UTF_ERR_NOT_TRAIL , pos- 2 , surr))
108
- ! accept_surrogates && throw (UnicodeError (UTF_ERR_SURROGATE , pos- 2 , surr))
109
+ ! is_surrogate_trail (surr) && throw (UnicodeError (ERR_NOT_TRAIL , pos- 2 , surr))
110
+ ! accept_surrogates && throw (UnicodeError (ERR_SURROGATE , pos- 2 , surr))
109
111
flags |= UTF_SURROGATE
110
112
num4byte += 1
111
113
elseif ch > 0x07ff
@@ -114,23 +116,23 @@ function unsafe_checkstring(dat::Vector{UInt8},
114
116
flags |= UTF_LONG
115
117
num2byte += 1
116
118
else
117
- throw (UnicodeError (UTF_ERR_LONG , pos- 2 , ch))
119
+ throw (UnicodeError (ERR_LONG , pos- 2 , ch))
118
120
end
119
121
elseif ch < 0xf5
120
122
# 4-byte UTF-8 sequence (i.e. characters > 0xffff)
121
- (pos + 2 > endpos) && throw (UnicodeError (UTF_ERR_SHORT , pos, ch))
123
+ (pos + 2 > endpos) && throw (UnicodeError (ERR_SHORT , pos, ch))
122
124
byt, pos = next (dat, pos)
123
125
ch = get_continuation (ch & 0x07 , byt, pos)
124
126
byt, pos = next (dat, pos)
125
127
ch = get_continuation (ch, byt, pos)
126
128
byt, pos = next (dat, pos)
127
129
ch = get_continuation (ch, byt, pos)
128
130
if ch > 0x10ffff
129
- throw (UnicodeError (UTF_ERR_INVALID , pos- 3 , ch))
131
+ throw (UnicodeError (ERR_INVALID , pos- 3 , ch))
130
132
elseif ch > 0xffff
131
133
num4byte += 1
132
134
elseif is_surrogate_codeunit (ch)
133
- throw (UnicodeError (UTF_ERR_SURROGATE , pos- 3 , ch))
135
+ throw (UnicodeError (ERR_SURROGATE , pos- 3 , ch))
134
136
elseif accept_long_char
135
137
# This is an overly long encoded character
136
138
flags |= UTF_LONG
@@ -140,10 +142,10 @@ function unsafe_checkstring(dat::Vector{UInt8},
140
142
num2byte += 1
141
143
end
142
144
else
143
- throw (UnicodeError (UTF_ERR_LONG , pos- 2 , ch))
145
+ throw (UnicodeError (ERR_LONG , pos- 2 , ch))
144
146
end
145
147
else
146
- throw (UnicodeError (UTF_ERR_INVALID , pos, ch))
148
+ throw (UnicodeError (ERR_INVALID , pos, ch))
147
149
end
148
150
end
149
151
end
@@ -174,22 +176,22 @@ function unsafe_checkstring{T <: Union{Vector{UInt16}, Vector{UInt32}, AbstractS
174
176
num2byte += 1
175
177
flags |= UTF_UNICODE2
176
178
elseif ch > 0x0ffff
177
- (ch > 0x10ffff ) && throw (UnicodeError (UTF_ERR_INVALID , pos, ch))
179
+ (ch > 0x10ffff ) && throw (UnicodeError (ERR_INVALID , pos, ch))
178
180
num4byte += 1
179
181
elseif ! is_surrogate_codeunit (ch)
180
182
num3byte += 1
181
183
elseif is_surrogate_lead (ch)
182
- pos > endpos && throw (UnicodeError (UTF_ERR_MISSING_SURROGATE , pos, ch))
184
+ pos > endpos && throw (UnicodeError (ERR_MISSING_SURROGATE , pos, ch))
183
185
# next character *must* be a trailing surrogate character
184
186
ch, pos = next (dat, pos)
185
- ! is_surrogate_trail (ch) && throw (UnicodeError (UTF_ERR_NOT_TRAIL , pos, ch))
187
+ ! is_surrogate_trail (ch) && throw (UnicodeError (ERR_NOT_TRAIL , pos, ch))
186
188
num4byte += 1
187
189
if T != Vector{UInt16}
188
- ! accept_surrogates && throw (UnicodeError (UTF_ERR_SURROGATE , pos, ch))
190
+ ! accept_surrogates && throw (UnicodeError (ERR_SURROGATE , pos, ch))
189
191
flags |= UTF_SURROGATE
190
192
end
191
193
else
192
- throw (UnicodeError (UTF_ERR_NOT_LEAD , pos, ch))
194
+ throw (UnicodeError (ERR_NOT_LEAD , pos, ch))
193
195
end
194
196
end
195
197
end
0 commit comments