Skip to content

Commit 47cebce

Browse files
wip: implement utf8_to_utf16 helper function
1 parent 71ac402 commit 47cebce

File tree

1 file changed

+37
-1
lines changed

1 file changed

+37
-1
lines changed

base/c.jl

+37-1
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,43 @@ function utf8_to_utf16(src::Vector{UInt8})
140140
end
141141

142142
"""
    utf16_to_utf8(src::Vector{UInt16})

Convert the UTF-16 code units in `src` to a freshly allocated `Vector{UInt8}`
of UTF-8 bytes.

A high surrogate (`0xd800`-`0xdbff`) immediately followed by a low surrogate
(`0xdc00`-`0xdfff`) is combined into one 4-byte UTF-8 sequence.  Any other
code unit is encoded on its own as 1, 2 or 3 bytes.  Unpaired surrogates are
not rejected: each one is emitted as the 3-byte encoding of its own 16-bit
value.  An empty `src` yields an empty result.
"""
function utf16_to_utf8(src::Vector{UInt16})
    dst = UInt8[]
    n = length(src)
    # n is only a lower bound on the output size (one byte per ASCII unit)
    sizehint!(dst, n)
    i = 1
    while i <= n
        a = src[i]
        i += 1
        if a < 0x80 # ASCII
            push!(dst, a % UInt8)
        elseif a < 0x800 # 2-byte UTF-8
            push!(dst, 0xc0 | ((a >> 6) % UInt8),
                       0x80 | ((a % UInt8) & 0x3f))
        elseif (a & 0xfc00) == 0xd800 && i <= n && (src[i] & 0xfc00) == 0xdc00
            # 2-unit UTF-16 sequence => 4-byte UTF-8
            b = src[i]
            i += 1
            # 0xd800 + 0x2840 wraps around to 0x0040 in UInt16 arithmetic, so
            # afterwards `a` holds the code point shifted right by 10 bits
            # (the +0x40 accounts for the 0x10000 supplementary-plane offset).
            a += 0x2840
            push!(dst, 0xf0 | ((a >> 8) % UInt8),
                       0x80 | ((a % UInt8) >> 2),
                       # low 2 bits of `a` followed by the top 4 payload bits of `b`
                       0x80 | (((a % UInt8) << 4) & 0x30) | (((b >> 6) % UInt8) & 0x0f),
                       0x80 | ((b % UInt8) & 0x3f))
        else
            # 1-unit high UTF-16 or unpaired surrogate;
            # either way, encode as 3-byte UTF-8 code point.  A unit following
            # an unpaired high surrogate has not been consumed and is handled
            # by the next iteration.
            push!(dst, 0xe0 | ((a >> 12) % UInt8),
                       0x80 | (((a >> 6) % UInt8) & 0x3f),
                       0x80 | ((a % UInt8) & 0x3f))
        end
    end
    return dst
end
145181

146182
# deferring (or un-deferring) ctrl-c handler for external C code that

0 commit comments

Comments
 (0)