@@ -81,6 +81,10 @@ export
81
81
82
82
// Misc
83
83
is_utf8,
84
+ is_utf16,
85
+ to_utf16,
86
+ from_utf16,
87
+ utf16_chars,
84
88
count_chars, count_bytes,
85
89
utf8_char_width,
86
90
char_range_at,
@@ -1060,6 +1064,83 @@ fn is_utf8(v: [u8]) -> bool {
1060
1064
ret true;
1061
1065
}
1062
1066
1067
+
1068
+ fn is_utf16 ( v : [ u16 ] ) -> bool {
1069
+ let len = v. len ( ) ;
1070
+ let i = 0 u;
1071
+ while ( i < len) {
1072
+ let u = v[ i] ;
1073
+
1074
+ if u <= 0xD7FF_u16 || u >= 0xE000_u16 {
1075
+ i += 1 u;
1076
+
1077
+ } else {
1078
+ if i+1 u < len { ret false ; }
1079
+ let u2 = v[ i+1 u] ;
1080
+ if u < 0xD7FF_u16 || u > 0xDBFF_u16 { ret false ; }
1081
+ if u2 < 0xDC00_u16 || u2 > 0xDFFF_u16 { ret false ; }
1082
+ i += 2 u;
1083
+ }
1084
+ }
1085
+ ret true;
1086
+ }
1087
+
1088
+
1089
+ fn to_utf16 ( s : str ) -> [ u16 ] {
1090
+ let u = [ ] ;
1091
+ chars_iter ( s) { |cch|
1092
+ // Arithmetic with u32 literals is easier on the eyes than chars.
1093
+ let ch = cch as u32 ;
1094
+
1095
+ if ( ch & 0xFFFF_u32 ) == ch {
1096
+ // The BMP falls through (assuming non-surrogate, as it should)
1097
+ assert ch <= 0xD7FF_u32 || ch >= 0xE000_u32 ;
1098
+ u += [ ch as u16 ]
1099
+ } else {
1100
+ // Supplementary planes break into surrogates.
1101
+ assert ch >= 0x1_0000_u32 && ch <= 0x10_FFFF_u32 ;
1102
+ ch -= 0x1_0000_u32 ;
1103
+ let w1 = 0xD800_u16 | ( ( ch >> 10 ) as u16 ) ;
1104
+ let w2 = 0xDC00_u16 | ( ( ch as u16 ) & 0x3FF_u16 ) ;
1105
+ u += [ w1, w2]
1106
+ }
1107
+ }
1108
+ ret u;
1109
+ }
1110
+
1111
+ fn utf16_chars ( v : [ u16 ] , f : fn ( char ) ) {
1112
+ let len = v. len ( ) ;
1113
+ let i = 0 u;
1114
+ while ( i < len) {
1115
+ let u = v[ i] ;
1116
+
1117
+ if u <= 0xD7FF_u16 || u >= 0xE000_u16 {
1118
+ f ( u as char ) ;
1119
+ i += 1 u;
1120
+
1121
+ } else {
1122
+ let u2 = v[ i+1 u] ;
1123
+ assert u >= 0xD800_u16 && u <= 0xDBFF_u16 ;
1124
+ assert u2 >= 0xDC00_u16 && u2 <= 0xDFFF_u16 ;
1125
+ let c = ( u - 0xD800_u16 ) as char ;
1126
+ c = c << 10 ;
1127
+ c |= ( u2 - 0xDC00_u16 ) as char ;
1128
+ c |= 0x1_0000_u32 as char ;
1129
+ f ( c) ;
1130
+ i += 2 u;
1131
+ }
1132
+ }
1133
+ }
1134
+
1135
+
1136
+ fn from_utf16 ( v : [ u16 ] ) -> str {
1137
+ let buf = "" ;
1138
+ reserve ( buf, v. len ( ) ) ;
1139
+ utf16_chars ( v) { |ch| push_char ( buf, ch) ; }
1140
+ ret buf;
1141
+ }
1142
+
1143
+
1063
1144
/*
1064
1145
Function: count_chars
1065
1146
@@ -2223,4 +2304,51 @@ mod tests {
2223
2304
assert [ 'ศ' , 'ไ' , 'ท' , 'ย' , '中' , '华' , 'V' , 'i' , 'ệ' , 't' , ' ' , 'N' , 'a' , 'm' ]
2224
2305
== chars ( ss) ;
2225
2306
}
2307
+
2308
+ #[ test]
2309
+ fn test_utf16 ( ) {
2310
+ let pairs =
2311
+ [ ( "𐍅𐌿𐌻𐍆𐌹𐌻𐌰\n " ,
2312
+ [ 0xd800_u16 , 0xdf45_u16 , 0xd800_u16 , 0xdf3f_u16 ,
2313
+ 0xd800_u16 , 0xdf3b_u16 , 0xd800_u16 , 0xdf46_u16 ,
2314
+ 0xd800_u16 , 0xdf39_u16 , 0xd800_u16 , 0xdf3b_u16 ,
2315
+ 0xd800_u16 , 0xdf30_u16 , 0x000a_u16 ] ) ,
2316
+
2317
+ ( "𐐒𐑉𐐮𐑀𐐲𐑋 𐐏𐐲𐑍\n " ,
2318
+ [ 0xd801_u16 , 0xdc12_u16 , 0xd801_u16 ,
2319
+ 0xdc49_u16 , 0xd801_u16 , 0xdc2e_u16 , 0xd801_u16 ,
2320
+ 0xdc40_u16 , 0xd801_u16 , 0xdc32_u16 , 0xd801_u16 ,
2321
+ 0xdc4b_u16 , 0x0020_u16 , 0xd801_u16 , 0xdc0f_u16 ,
2322
+ 0xd801_u16 , 0xdc32_u16 , 0xd801_u16 , 0xdc4d_u16 ,
2323
+ 0x000a_u16 ] ) ,
2324
+
2325
+ ( "𐌀𐌖𐌋𐌄𐌑𐌉·𐌌𐌄𐌕𐌄𐌋𐌉𐌑\n " ,
2326
+ [ 0xd800_u16 , 0xdf00_u16 , 0xd800_u16 , 0xdf16_u16 ,
2327
+ 0xd800_u16 , 0xdf0b_u16 , 0xd800_u16 , 0xdf04_u16 ,
2328
+ 0xd800_u16 , 0xdf11_u16 , 0xd800_u16 , 0xdf09_u16 ,
2329
+ 0x00b7_u16 , 0xd800_u16 , 0xdf0c_u16 , 0xd800_u16 ,
2330
+ 0xdf04_u16 , 0xd800_u16 , 0xdf15_u16 , 0xd800_u16 ,
2331
+ 0xdf04_u16 , 0xd800_u16 , 0xdf0b_u16 , 0xd800_u16 ,
2332
+ 0xdf09_u16 , 0xd800_u16 , 0xdf11_u16 , 0x000a_u16 ] ) ,
2333
+
2334
+ ( "𐒋𐒘𐒈𐒑𐒛𐒒 𐒕𐒓 𐒈𐒚𐒍 𐒏𐒜𐒒𐒖𐒆 𐒕𐒆\n " ,
2335
+ [ 0xd801_u16 , 0xdc8b_u16 , 0xd801_u16 , 0xdc98_u16 ,
2336
+ 0xd801_u16 , 0xdc88_u16 , 0xd801_u16 , 0xdc91_u16 ,
2337
+ 0xd801_u16 , 0xdc9b_u16 , 0xd801_u16 , 0xdc92_u16 ,
2338
+ 0x0020_u16 , 0xd801_u16 , 0xdc95_u16 , 0xd801_u16 ,
2339
+ 0xdc93_u16 , 0x0020_u16 , 0xd801_u16 , 0xdc88_u16 ,
2340
+ 0xd801_u16 , 0xdc9a_u16 , 0xd801_u16 , 0xdc8d_u16 ,
2341
+ 0x0020_u16 , 0xd801_u16 , 0xdc8f_u16 , 0xd801_u16 ,
2342
+ 0xdc9c_u16 , 0xd801_u16 , 0xdc92_u16 , 0xd801_u16 ,
2343
+ 0xdc96_u16 , 0xd801_u16 , 0xdc86_u16 , 0x0020_u16 ,
2344
+ 0xd801_u16 , 0xdc95_u16 , 0xd801_u16 , 0xdc86_u16 ,
2345
+ 0x000a_u16 ] ) ] ;
2346
+
2347
+ for ( s, u) in pairs {
2348
+ assert to_utf16 ( s) == u;
2349
+ assert from_utf16 ( u) == s;
2350
+ assert from_utf16 ( to_utf16 ( s) ) == s;
2351
+ assert to_utf16 ( from_utf16 ( u) ) == u;
2352
+ }
2353
+ }
2226
2354
}
0 commit comments