test: update wpt encoding

watilde · danielleadams · commit 4acc2732f903 · 2021-01-12T07:10:28.000-05:00
Refs: web-platform-tests/wpt#26385 PR-URL: #36659 Reviewed-By: Michaël Zasso <targos@protonmail.com> Reviewed-By: Rich Trott <rtrott@gmail.com>
diff --git a/test/fixtures/wpt/README.md b/test/fixtures/wpt/README.md
@@ -11,7 +11,7 @@ See [test/wpt](../../wpt/README.md) for information on how these tests are run.
 Last update:
 
 - console: https://github.com/web-platform-tests/wpt/tree/3b1f72e99a/console
-- encoding: https://github.com/web-platform-tests/wpt/tree/1821fb5f77/encoding
+- encoding: https://github.com/web-platform-tests/wpt/tree/3c9820d1cc/encoding
 - url: https://github.com/web-platform-tests/wpt/tree/1783c9bccf/url
 - resources: https://github.com/web-platform-tests/wpt/tree/001e50de41/resources
 - interfaces: https://github.com/web-platform-tests/wpt/tree/8719553b2d/interfaces
diff --git a/test/fixtures/wpt/encoding/legacy-mb-schinese/gb18030/gb18030-decoder.html b/test/fixtures/wpt/encoding/legacy-mb-schinese/gb18030/gb18030-decoder.html
@@ -0,0 +1,55 @@
+<!doctype html>
+<script src=/resources/testharness.js></script>
+<script src=/resources/testharnessreport.js></script>
+<script src=resources/ranges.js></script>
+<script>
+ const decode = (input, output, desc) => {
+   test(function() {
+     for (encoding of ["gb18030", "gbk"])
+       assert_equals(new TextDecoder(encoding).decode(new Uint8Array(input)), output)
+   }, "gb18030 decoder: " + desc)
+ }
+
+ decode([115], "s", "ASCII");
+ decode([0x80], "\u20AC", "euro");
+ decode([0xFF], "\uFFFD", "initial byte out of accepted ranges");
+ decode([0x81], "\uFFFD", "end of queue, gb18030 first not 0");
+ decode([0x81, 0x28], "\ufffd(", "two bytes 0x81 0x28");
+ decode([0x81, 0x40], "\u4E02", "two bytes 0x81 0x40");
+ decode([0x81, 0x7E], "\u4E8A", "two bytes 0x81 0x7e");
+ decode([0x81, 0x7F], "\ufffd\u007f", "two bytes 0x81 0x7f");
+ decode([0x81, 0x80], "\u4E90", "two bytes 0x81 0x80");
+ decode([0x81, 0xFE], "\u4FA2", "two bytes 0x81 0xFE");
+ decode([0x81, 0xFF], "\ufffd", "two bytes 0x81 0xFF");
+ decode([0xFE, 0x40], "\uFA0C", "two bytes 0xFE 0x40");
+ decode([0xFE, 0xFE], "\uE4C5", "two bytes 0xFE 0xFE");
+ decode([0xFE, 0xFF], "\ufffd", "two bytes 0xFE 0xFF");
+ decode([0x81, 0x30], "\ufffd", "two bytes 0x81 0x30");
+ decode([0x81, 0x30, 0xFE], "\ufffd", "three bytes 0x81 0x30 0xFE");
+ decode([0x81, 0x30, 0xFF], "\ufffd0\ufffd", "three bytes 0x81 0x30 0xFF");
+ decode([0x81, 0x30, 0xFE, 0x29], "\ufffd0\ufffd)", "four bytes 0x81 0x30 0xFE 0x29");
+ decode([0xFE, 0x39, 0xFE, 0x39], "\ufffd", "four bytes 0xFE 0x39 0xFE 0x39");
+ decode([0x81, 0x35, 0xF4, 0x36], "\u1E3E", "pointer 7458");
+ decode([0x81, 0x35, 0xF4, 0x37], "\ue7c7", "pointer 7457");
+ decode([0x81, 0x35, 0xF4, 0x38], "\u1E40", "pointer 7459");
+ decode([0x84, 0x31, 0xA4, 0x39], "\uffff", "pointer 39419");
+ decode([0x84, 0x31, 0xA5, 0x30], "\ufffd", "pointer 39420");
+ decode([0x8F, 0x39, 0xFE, 0x39], "\ufffd", "pointer 189999");
+ decode([0x90, 0x30, 0x81, 0x30], "\u{10000}", "pointer 189000");
+ decode([0xE3, 0x32, 0x9A, 0x35], "\u{10FFFF}", "pointer 1237575");
+ decode([0xE3, 0x32, 0x9A, 0x36], "\ufffd", "pointer 1237576");
+ decode([0x83, 0x36, 0xC8, 0x30], "\uE7C8", "legacy ICU special case 1");
+ decode([0xA1, 0xAD], "\u2026", "legacy ICU special case 2");
+ decode([0xA1, 0xAB], "\uFF5E", "legacy ICU special case 3");
+
+ let i = 0;
+ for (const range of ranges) {
+   const pointer = range[0];
+   decode([
+     Math.floor(pointer / 12600) + 0x81,
+     Math.floor((pointer % 12600) / 1260) + 0x30,
+     Math.floor((pointer % 1260) / 10) + 0x81,
+     pointer % 10 + 0x30
+   ], range[1], "range " + i++);
+ }
+</script>
diff --git a/test/fixtures/wpt/encoding/legacy-mb-schinese/gb18030/gb18030-encoder.html b/test/fixtures/wpt/encoding/legacy-mb-schinese/gb18030/gb18030-encoder.html
@@ -0,0 +1,48 @@
+<!doctype html>
+<meta charset=gb18030>
+<script src=/resources/testharness.js></script>
+<script src=/resources/testharnessreport.js></script>
+<script src=resources/ranges.js></script>
+<script>
+ const encode = (input, output, desc) => {
+   test(function() {
+     const a = document.createElement("a"); // <a> uses document encoding for URL's query
+     a.href = "https://example.com/?" + input;
+     assert_equals(a.search.substr(1), output); // remove leading "?"
+   }, "gb18030 encoder: " + desc);
+ }
+
+ encode("s", "s", "very basic");
+ encode("\u20AC", "%A2%E3", "Euro");
+ encode("\u4E02", "%81@", "character");
+ encode("\uE4C6", "%A1@", "PUA");
+ encode("\uE4C5", "%FE%FE", "PUA #2");
+ encode("\uE5E5", "%26%2358853%3B", "PUA #3");
+ encode("\ud83d\udca9", "%949%DA3", "poo");
+ encode("\uE7C7", "%815%F47", "Ranges pointer special case");
+ encode("\uE7C8", "%836%C80", "legacy ICU special case 1");
+ encode("\u2026", "%A1%AD", "legacy ICU special case 2");
+ encode("\uFF5E", "%A1%AB", "legacy ICU special case 3");
+
+ const upperCaseNibble = x => {
+   return Math.floor(x).toString(16).toUpperCase();
+ }
+
+ const encodePointer = pointer => {
+   const firstByte = Math.floor(pointer / 12600) + 0x81;
+   const thirdByte = Math.floor((pointer % 1260) / 10) + 0x81;
+   return "%"
+     + upperCaseNibble(firstByte / 16)
+     + upperCaseNibble(firstByte % 16)
+     + String.fromCharCode(Math.floor((pointer % 12600) / 1260) + 0x30)
+     + "%"
+     + upperCaseNibble(thirdByte / 16)
+     + upperCaseNibble(thirdByte % 16)
+     + String.fromCharCode(pointer % 10 + 0x30);
+ }
+
+ let i = 0;
+ for (const range of ranges) {
+   encode(range[1], encodePointer(range[0]), "range " + i++);
+ }
+</script>
diff --git a/test/fixtures/wpt/encoding/legacy-mb-schinese/gb18030/resources/ranges.js b/test/fixtures/wpt/encoding/legacy-mb-schinese/gb18030/resources/ranges.js
@@ -0,0 +1,210 @@
+// Based on https://encoding.spec.whatwg.org/index-gb18030-ranges.txt; each entry is [pointer, expected code point as a string].
+const ranges = [
+	[0, "\u0080"],
+	[36, "\u00A5"],
+	[38, "\u00A9"],
+	[45, "\u00B2"],
+	[50, "\u00B8"],
+	[81, "\u00D8"],
+	[89, "\u00E2"],
+	[95, "\u00EB"],
+	[96, "\u00EE"],
+	[100, "\u00F4"],
+	[103, "\u00F8"],
+	[104, "\u00FB"],
+	[105, "\u00FD"],
+	[109, "\u0102"],
+	[126, "\u0114"],
+	[133, "\u011C"],
+	[148, "\u012C"],
+	[172, "\u0145"],
+	[175, "\u0149"],
+	[179, "\u014E"],
+	[208, "\u016C"],
+	[306, "\u01CF"],
+	[307, "\u01D1"],
+	[308, "\u01D3"],
+	[309, "\u01D5"],
+	[310, "\u01D7"],
+	[311, "\u01D9"],
+	[312, "\u01DB"],
+	[313, "\u01DD"],
+	[341, "\u01FA"],
+	[428, "\u0252"],
+	[443, "\u0262"],
+	[544, "\u02C8"],
+	[545, "\u02CC"],
+	[558, "\u02DA"],
+	[741, "\u03A2"],
+	[742, "\u03AA"],
+	[749, "\u03C2"],
+	[750, "\u03CA"],
+	[805, "\u0402"],
+	[819, "\u0450"],
+	[820, "\u0452"],
+	[7922, "\u2011"],
+	[7924, "\u2017"],
+	[7925, "\u201A"],
+	[7927, "\u201E"],
+	[7934, "\u2027"],
+	[7943, "\u2031"],
+	[7944, "\u2034"],
+	[7945, "\u2036"],
+	[7950, "\u203C"],
+	[8062, "\u20AD"],
+	[8148, "\u2104"],
+	[8149, "\u2106"],
+	[8152, "\u210A"],
+	[8164, "\u2117"],
+	[8174, "\u2122"],
+	[8236, "\u216C"],
+	[8240, "\u217A"],
+	[8262, "\u2194"],
+	[8264, "\u219A"],
+	[8374, "\u2209"],
+	[8380, "\u2210"],
+	[8381, "\u2212"],
+	[8384, "\u2216"],
+	[8388, "\u221B"],
+	[8390, "\u2221"],
+	[8392, "\u2224"],
+	[8393, "\u2226"],
+	[8394, "\u222C"],
+	[8396, "\u222F"],
+	[8401, "\u2238"],
+	[8406, "\u223E"],
+	[8416, "\u2249"],
+	[8419, "\u224D"],
+	[8424, "\u2253"],
+	[8437, "\u2262"],
+	[8439, "\u2268"],
+	[8445, "\u2270"],
+	[8482, "\u2296"],
+	[8485, "\u229A"],
+	[8496, "\u22A6"],
+	[8521, "\u22C0"],
+	[8603, "\u2313"],
+	[8936, "\u246A"],
+	[8946, "\u249C"],
+	[9046, "\u254C"],
+	[9050, "\u2574"],
+	[9063, "\u2590"],
+	[9066, "\u2596"],
+	[9076, "\u25A2"],
+	[9092, "\u25B4"],
+	[9100, "\u25BE"],
+	[9108, "\u25C8"],
+	[9111, "\u25CC"],
+	[9113, "\u25D0"],
+	[9131, "\u25E6"],
+	[9162, "\u2607"],
+	[9164, "\u260A"],
+	[9218, "\u2641"],
+	[9219, "\u2643"],
+	[11329, "\u2E82"],
+	[11331, "\u2E85"],
+	[11334, "\u2E89"],
+	[11336, "\u2E8D"],
+	[11346, "\u2E98"],
+	[11361, "\u2EA8"],
+	[11363, "\u2EAB"],
+	[11366, "\u2EAF"],
+	[11370, "\u2EB4"],
+	[11372, "\u2EB8"],
+	[11375, "\u2EBC"],
+	[11389, "\u2ECB"],
+	[11682, "\u2FFC"],
+	[11686, "\u3004"],
+	[11687, "\u3018"],
+	[11692, "\u301F"],
+	[11694, "\u302A"],
+	[11714, "\u303F"],
+	[11716, "\u3094"],
+	[11723, "\u309F"],
+	[11725, "\u30F7"],
+	[11730, "\u30FF"],
+	[11736, "\u312A"],
+	[11982, "\u322A"],
+	[11989, "\u3232"],
+	[12102, "\u32A4"],
+	[12336, "\u3390"],
+	[12348, "\u339F"],
+	[12350, "\u33A2"],
+	[12384, "\u33C5"],
+	[12393, "\u33CF"],
+	[12395, "\u33D3"],
+	[12397, "\u33D6"],
+	[12510, "\u3448"],
+	[12553, "\u3474"],
+	[12851, "\u359F"],
+	[12962, "\u360F"],
+	[12973, "\u361B"],
+	[13738, "\u3919"],
+	[13823, "\u396F"],
+	[13919, "\u39D1"],
+	[13933, "\u39E0"],
+	[14080, "\u3A74"],
+	[14298, "\u3B4F"],
+	[14585, "\u3C6F"],
+	[14698, "\u3CE1"],
+	[15583, "\u4057"],
+	[15847, "\u4160"],
+	[16318, "\u4338"],
+	[16434, "\u43AD"],
+	[16438, "\u43B2"],
+	[16481, "\u43DE"],
+	[16729, "\u44D7"],
+	[17102, "\u464D"],
+	[17122, "\u4662"],
+	[17315, "\u4724"],
+	[17320, "\u472A"],
+	[17402, "\u477D"],
+	[17418, "\u478E"],
+	[17859, "\u4948"],
+	[17909, "\u497B"],
+	[17911, "\u497E"],
+	[17915, "\u4984"],
+	[17916, "\u4987"],
+	[17936, "\u499C"],
+	[17939, "\u49A0"],
+	[17961, "\u49B8"],
+	[18664, "\u4C78"],
+	[18703, "\u4CA4"],
+	[18814, "\u4D1A"],
+	[18962, "\u4DAF"],
+	[19043, "\u9FA6"],
+	[33469, "\uE76C"],
+	[33470, "\uE7C8"],
+	[33471, "\uE7E7"],
+	[33484, "\uE815"],
+	[33485, "\uE819"],
+	[33490, "\uE81F"],
+	[33497, "\uE827"],
+	[33501, "\uE82D"],
+	[33505, "\uE833"],
+	[33513, "\uE83C"],
+	[33520, "\uE844"],
+	[33536, "\uE856"],
+	[33550, "\uE865"],
+	[37845, "\uF92D"],
+	[37921, "\uF97A"],
+	[37948, "\uF996"],
+	[38029, "\uF9E8"],
+	[38038, "\uF9F2"],
+	[38064, "\uFA10"],
+	[38065, "\uFA12"],
+	[38066, "\uFA15"],
+	[38069, "\uFA19"],
+	[38075, "\uFA22"],
+	[38076, "\uFA25"],
+	[38078, "\uFA2A"],
+	[39108, "\uFE32"],
+	[39109, "\uFE45"],
+	[39113, "\uFE53"],
+	[39114, "\uFE58"],
+	[39115, "\uFE67"],
+	[39116, "\uFE6C"],
+	[39265, "\uFF5F"],
+	[39394, "\uFFE6"],
+	[189000, "\u{10000}"]
+];
diff --git a/test/fixtures/wpt/encoding/legacy-mb-schinese/gbk/gbk-decoder.html b/test/fixtures/wpt/encoding/legacy-mb-schinese/gbk/gbk-decoder.html
@@ -0,0 +1,33 @@
+<!doctype html>
+<script src=/resources/testharness.js></script>
+<script src=/resources/testharnessreport.js></script>
+<script>
+// Pointers exercised by the loop further down in this script; gbkPointers[i]
+// is paired with codePoints[i], so both arrays must keep the same length and order.
+const gbkPointers = [
+    6432, 7533, 7536, 7672, 7673, 7674, 7675, 7676, 7677, 7678, 7679, 7680, 7681, 7682, 7683, 7684,
+    23766, 23770, 23771, 23772, 23773, 23774, 23776, 23777, 23778, 23779, 23780, 23781, 23782, 23784, 23785, 23786,
+    23787, 23790, 23791, 23792, 23793, 23796, 23797, 23798, 23799, 23800, 23801, 23802, 23803, 23805, 23806, 23807,
+    23808, 23809, 23810, 23811, 23813, 23814, 23815, 23816, 23817, 23818, 23819, 23820, 23821, 23822, 23823, 23824,
+    23825, 23826, 23827, 23828, 23831, 23832, 23833, 23834, 23835, 23836, 23837, 23838, 23839, 23840, 23841, 23842,
+    23843, 23844
+];
+// Expected value of charCodeAt(0) on the decoded string for the pointer at the
+// same index in gbkPointers.
+const codePoints = [
+    0x20ac, 0x1e3f, 0x01f9, 0x303e, 0x2ff0, 0x2ff1, 0x2ff2, 0x2ff3, 0x2ff4, 0x2ff5, 0x2ff6, 0x2ff7, 0x2ff8, 0x2ff9, 0x2ffa, 0x2ffb,
+    0x2e81, 0x2e84, 0x3473, 0x3447, 0x2e88, 0x2e8b, 0x359e, 0x361a, 0x360e, 0x2e8c, 0x2e97, 0x396e, 0x3918, 0x39cf, 0x39df, 0x3a73,
+    0x39d0, 0x3b4e, 0x3c6e, 0x3ce0, 0x2ea7, 0x2eaa, 0x4056, 0x415f, 0x2eae, 0x4337, 0x2eb3, 0x2eb6, 0x2eb7, 0x43b1, 0x43ac, 0x2ebb,
+    0x43dd, 0x44d6, 0x4661, 0x464c, 0x4723, 0x4729, 0x477c, 0x478d, 0x2eca, 0x4947, 0x497a, 0x497d, 0x4982, 0x4983, 0x4985, 0x4986,
+    0x499f, 0x499b, 0x49b7, 0x49b6, 0x4ca3, 0x4c9f, 0x4ca0, 0x4ca1, 0x4c77, 0x4ca2, 0x4d13, 0x4d14, 0x4d15, 0x4d16, 0x4d17, 0x4d18,
+    0x4d19, 0x4dae
+];
+
+for (let i = 0; i < gbkPointers.length; i++) {
+    const pointer = gbkPointers[i];
+    test(function() {
+        const lead = pointer / 190 + 0x81;
+        const trail = pointer % 190;
+        const offset = trail < 0x3F ? 0x40 : 0x41;
+        const encoded = [lead, trail + offset];
+        const decoded = new TextDecoder("GBK").decode(new Uint8Array(encoded)).charCodeAt(0);
+        assert_equals(decoded, codePoints[i]);
+    }, "gbk pointer: " + pointer)
+}
+</script>
diff --git a/test/fixtures/wpt/encoding/legacy-mb-schinese/gbk/gbk-encoder.html b/test/fixtures/wpt/encoding/legacy-mb-schinese/gbk/gbk-encoder.html
@@ -0,0 +1,26 @@
+<!doctype html>
+<meta charset=gbk> <!-- if the server overrides this, it is stupid, as this is a testsuite -->
+<script src=/resources/testharness.js></script>
+<script src=/resources/testharnessreport.js></script>
+<script>
+ function encode(input, output, desc) {
+   test(function() {
+     const a = document.createElement("a") // <a> uses document encoding for URL's query
+     a.href = "https://example.com/?" + input;
+     assert_equals(a.search.substr(1), output); // remove leading "?"
+   }, "gbk encoder: " + desc);
+ }
+
+ encode("s", "s", "very basic");
+ encode("\u20AC", "%80", "Euro");
+ encode("\u4E02", "%81@", "character");
+ encode("\uE4C6", "%A1@", "PUA");
+ encode("\uE4C5", "%FE%FE", "PUA #2");
+ encode("\ud83d\udca9", "%26%23128169%3B", "poo");
+ encode("\uE7C8", "%26%2359336%3B", "legacy ICU special case 1");
+ encode("\u2026", "%A1%AD", "legacy ICU special case 2");
+ encode("\uFF5E", "%A1%AB", "legacy ICU special case 3");
+ encode("\u00A5", "%26%23165%3B", "legacy WebKit case 1");
+ encode("\u22EF", "%26%238943%3B", "legacy WebKit case 2");
+ encode("\u301C", "%26%2312316%3B", "legacy WebKit case 3");
+</script>
diff --git a/test/fixtures/wpt/versions.json b/test/fixtures/wpt/versions.json
@@ -4,7 +4,7 @@
     "path": "console"
   },
   "encoding": {
-    "commit": "1821fb5f77723b5361058c6a8ed0b71f9d2d6b8d",
+    "commit": "3c9820d1cc5d9d2627c26ef1268b6d54a35adf22",
     "path": "encoding"
   },
   "url": {