Skip to content

Commit 47e7a05

Browse files
committed
Add some utf16 routines for OS API interop.
1 parent 1430675 commit 47e7a05

File tree

1 file changed

+128
-0
lines changed

1 file changed

+128
-0
lines changed

src/libcore/str.rs

+128
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,10 @@ export
8181

8282
// Misc
8383
is_utf8,
84+
is_utf16,
85+
to_utf16,
86+
from_utf16,
87+
utf16_chars,
8488
count_chars, count_bytes,
8589
utf8_char_width,
8690
char_range_at,
@@ -1060,6 +1064,83 @@ fn is_utf8(v: [u8]) -> bool {
10601064
ret true;
10611065
}
10621066

1067+
1068+
fn is_utf16(v: [u16]) -> bool {
1069+
let len = v.len();
1070+
let i = 0u;
1071+
while (i < len) {
1072+
let u = v[i];
1073+
1074+
if u <= 0xD7FF_u16 || u >= 0xE000_u16 {
1075+
i += 1u;
1076+
1077+
} else {
1078+
if i+1u < len { ret false; }
1079+
let u2 = v[i+1u];
1080+
if u < 0xD7FF_u16 || u > 0xDBFF_u16 { ret false; }
1081+
if u2 < 0xDC00_u16 || u2 > 0xDFFF_u16 { ret false; }
1082+
i += 2u;
1083+
}
1084+
}
1085+
ret true;
1086+
}
1087+
1088+
1089+
/*
Function: to_utf16

Encodes the characters of a string as a vector of UTF-16 code units.

Characters that fit in 16 bits are emitted as a single unit; all others
are split into a lead/trail surrogate pair.

Failure:

Fails (by assertion) if a character lies in the surrogate range
0xD800-0xDFFF or above 0x10FFFF.
*/
fn to_utf16(s: str) -> [u16] {
    let units = [];
    chars_iter(s) {|c|
        // Work on the numeric code point rather than on the char itself.
        let code = c as u32;

        if code > 0xFFFF_u32 {
            // Supplementary planes: 20 bits of offset from 0x10000, split
            // into two 10-bit halves carried by a surrogate pair.
            assert code >= 0x1_0000_u32 && code <= 0x10_FFFF_u32;
            let offset = code - 0x1_0000_u32;
            let lead = 0xD800_u16 | ((offset >> 10) as u16);
            let trail = 0xDC00_u16 | ((offset as u16) & 0x3FF_u16);
            units += [lead, trail]
        } else {
            // Basic Multilingual Plane: one unit, which must not itself
            // be a surrogate.
            assert code <= 0xD7FF_u32 || code >= 0xE000_u32;
            units += [code as u16]
        }
    }
    ret units;
}
1110+
1111+
fn utf16_chars(v: [u16], f: fn(char)) {
1112+
let len = v.len();
1113+
let i = 0u;
1114+
while (i < len) {
1115+
let u = v[i];
1116+
1117+
if u <= 0xD7FF_u16 || u >= 0xE000_u16 {
1118+
f(u as char);
1119+
i += 1u;
1120+
1121+
} else {
1122+
let u2 = v[i+1u];
1123+
assert u >= 0xD800_u16 && u <= 0xDBFF_u16;
1124+
assert u2 >= 0xDC00_u16 && u2 <= 0xDFFF_u16;
1125+
let c = (u - 0xD800_u16) as char;
1126+
c = c << 10;
1127+
c |= (u2 - 0xDC00_u16) as char;
1128+
c |= 0x1_0000_u32 as char;
1129+
f(c);
1130+
i += 2u;
1131+
}
1132+
}
1133+
}
1134+
1135+
1136+
/*
Function: from_utf16

Builds a string from a vector of UTF-16 code units by decoding it with
`utf16_chars` and appending each resulting character.

Failure:

Fails whenever `utf16_chars` fails on `v`.
*/
fn from_utf16(v: [u16]) -> str {
    let out = "";
    // NOTE(review): presumably pre-sizes the string for at least one
    // byte per code unit -- confirm against `reserve`.
    reserve(out, v.len());
    utf16_chars(v) {|c|
        push_char(out, c);
    }
    ret out;
}
1142+
1143+
10631144
/*
10641145
Function: count_chars
10651146
@@ -2223,4 +2304,51 @@ mod tests {
22232304
assert ['ศ','ไ','ท','ย','中','华','V','i','ệ','t',' ','N','a','m']
22242305
== chars(ss);
22252306
}
2307+
2308+
#[test]
2309+
fn test_utf16() {
2310+
let pairs =
2311+
[("𐍅𐌿𐌻𐍆𐌹𐌻𐌰\n",
2312+
[0xd800_u16, 0xdf45_u16, 0xd800_u16, 0xdf3f_u16,
2313+
0xd800_u16, 0xdf3b_u16, 0xd800_u16, 0xdf46_u16,
2314+
0xd800_u16, 0xdf39_u16, 0xd800_u16, 0xdf3b_u16,
2315+
0xd800_u16, 0xdf30_u16, 0x000a_u16]),
2316+
2317+
("𐐒𐑉𐐮𐑀𐐲𐑋 𐐏𐐲𐑍\n",
2318+
[0xd801_u16, 0xdc12_u16, 0xd801_u16,
2319+
0xdc49_u16, 0xd801_u16, 0xdc2e_u16, 0xd801_u16,
2320+
0xdc40_u16, 0xd801_u16, 0xdc32_u16, 0xd801_u16,
2321+
0xdc4b_u16, 0x0020_u16, 0xd801_u16, 0xdc0f_u16,
2322+
0xd801_u16, 0xdc32_u16, 0xd801_u16, 0xdc4d_u16,
2323+
0x000a_u16]),
2324+
2325+
("𐌀𐌖𐌋𐌄𐌑𐌉·𐌌𐌄𐌕𐌄𐌋𐌉𐌑\n",
2326+
[0xd800_u16, 0xdf00_u16, 0xd800_u16, 0xdf16_u16,
2327+
0xd800_u16, 0xdf0b_u16, 0xd800_u16, 0xdf04_u16,
2328+
0xd800_u16, 0xdf11_u16, 0xd800_u16, 0xdf09_u16,
2329+
0x00b7_u16, 0xd800_u16, 0xdf0c_u16, 0xd800_u16,
2330+
0xdf04_u16, 0xd800_u16, 0xdf15_u16, 0xd800_u16,
2331+
0xdf04_u16, 0xd800_u16, 0xdf0b_u16, 0xd800_u16,
2332+
0xdf09_u16, 0xd800_u16, 0xdf11_u16, 0x000a_u16 ]),
2333+
2334+
("𐒋𐒘𐒈𐒑𐒛𐒒 𐒕𐒓 𐒈𐒚𐒍 𐒏𐒜𐒒𐒖𐒆 𐒕𐒆\n",
2335+
[0xd801_u16, 0xdc8b_u16, 0xd801_u16, 0xdc98_u16,
2336+
0xd801_u16, 0xdc88_u16, 0xd801_u16, 0xdc91_u16,
2337+
0xd801_u16, 0xdc9b_u16, 0xd801_u16, 0xdc92_u16,
2338+
0x0020_u16, 0xd801_u16, 0xdc95_u16, 0xd801_u16,
2339+
0xdc93_u16, 0x0020_u16, 0xd801_u16, 0xdc88_u16,
2340+
0xd801_u16, 0xdc9a_u16, 0xd801_u16, 0xdc8d_u16,
2341+
0x0020_u16, 0xd801_u16, 0xdc8f_u16, 0xd801_u16,
2342+
0xdc9c_u16, 0xd801_u16, 0xdc92_u16, 0xd801_u16,
2343+
0xdc96_u16, 0xd801_u16, 0xdc86_u16, 0x0020_u16,
2344+
0xd801_u16, 0xdc95_u16, 0xd801_u16, 0xdc86_u16,
2345+
0x000a_u16 ]) ];
2346+
2347+
for (s, u) in pairs {
2348+
assert to_utf16(s) == u;
2349+
assert from_utf16(u) == s;
2350+
assert from_utf16(to_utf16(s)) == s;
2351+
assert to_utf16(from_utf16(u)) == u;
2352+
}
2353+
}
22262354
}

0 commit comments

Comments
 (0)