Skip to content

Commit 06a617a

Browse files
zimbabaoTimothyGu
authored andcommitted
url: update IDNA error conditions
This commit contains three separate changes: - Always return a string from ToUnicode no matter if an error occurred. - Disable CheckHyphens boolean flag. This flag will soon be enabled in the URL Standard, but is implemented manually by selectively ignoring certain errors. - Enable CheckBidi boolean flag per URL Standard update. This allows domain names with hyphens at 3 and 4th position, as well as those with leading and trailing hyphens. They are technically invalid, but seen in the wild. Tests are updated and simplified accordingly. PR-URL: #12966 Fixes: #12965 Refs: whatwg/url#309 Reviewed-By: Daijiro Wachi <[email protected]> Reviewed-By: Timothy Gu <[email protected]>
1 parent 841bb4c commit 06a617a

6 files changed

+59
-51
lines changed

src/node_i18n.cc

+22-16
Original file line numberDiff line numberDiff line change
@@ -436,11 +436,9 @@ bool InitializeICUDirectory(const std::string& path) {
436436

437437
int32_t ToUnicode(MaybeStackBuffer<char>* buf,
438438
const char* input,
439-
size_t length,
440-
bool lenient) {
439+
size_t length) {
441440
UErrorCode status = U_ZERO_ERROR;
442-
uint32_t options = UIDNA_DEFAULT;
443-
options |= UIDNA_NONTRANSITIONAL_TO_UNICODE;
441+
uint32_t options = UIDNA_NONTRANSITIONAL_TO_UNICODE;
444442
UIDNA* uidna = uidna_openUTS46(options, &status);
445443
if (U_FAILURE(status))
446444
return -1;
@@ -462,14 +460,10 @@ int32_t ToUnicode(MaybeStackBuffer<char>* buf,
462460
&status);
463461
}
464462

465-
// UTS #46's ToUnicode operation applies no validation of domain name length
466-
// (nor a flag requesting it to do so, like VerifyDnsLength for ToASCII). For
467-
// that reason, unlike ToASCII below, ICU4C correctly accepts long domain
468-
// names. However, ICU4C still sets the EMPTY_LABEL error in contrary to UTS
469-
// #46. Therefore, explicitly filters out that error here.
470-
info.errors &= ~UIDNA_ERROR_EMPTY_LABEL;
463+
// info.errors is ignored as UTS #46 ToUnicode always produces a Unicode
464+
// string, regardless of whether an error occurred.
471465

472-
if (U_FAILURE(status) || (!lenient && info.errors != 0)) {
466+
if (U_FAILURE(status)) {
473467
len = -1;
474468
buf->SetLength(0);
475469
} else {
@@ -485,8 +479,7 @@ int32_t ToASCII(MaybeStackBuffer<char>* buf,
485479
size_t length,
486480
bool lenient) {
487481
UErrorCode status = U_ZERO_ERROR;
488-
uint32_t options = UIDNA_DEFAULT;
489-
options |= UIDNA_NONTRANSITIONAL_TO_ASCII;
482+
uint32_t options = UIDNA_NONTRANSITIONAL_TO_ASCII | UIDNA_CHECK_BIDI;
490483
UIDNA* uidna = uidna_openUTS46(options, &status);
491484
if (U_FAILURE(status))
492485
return -1;
@@ -518,6 +511,21 @@ int32_t ToASCII(MaybeStackBuffer<char>* buf,
518511
info.errors &= ~UIDNA_ERROR_LABEL_TOO_LONG;
519512
info.errors &= ~UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
520513

514+
// These error conditions are mandated unconditionally by UTS #46 version
515+
// 9.0.0 (rev. 17), but were found to be incompatible with actual domain
516+
// names in the wild. As such, in the current UTS #46 draft (rev. 18) these
517+
// checks are made optional depending on the CheckHyphens flag, which will be
518+
// disabled in WHATWG URL's "domain to ASCII" algorithm soon.
519+
// Refs:
520+
// - https://github.com/whatwg/url/issues/53
521+
// - https://github.com/whatwg/url/pull/309
522+
// - http://www.unicode.org/review/pri317/
523+
// - http://www.unicode.org/reports/tr46/tr46-18.html
524+
// - https://www.icann.org/news/announcement-2000-01-07-en
525+
info.errors &= ~UIDNA_ERROR_HYPHEN_3_4;
526+
info.errors &= ~UIDNA_ERROR_LEADING_HYPHEN;
527+
info.errors &= ~UIDNA_ERROR_TRAILING_HYPHEN;
528+
521529
if (U_FAILURE(status) || (!lenient && info.errors != 0)) {
522530
len = -1;
523531
buf->SetLength(0);
@@ -534,11 +542,9 @@ static void ToUnicode(const FunctionCallbackInfo<Value>& args) {
534542
CHECK_GE(args.Length(), 1);
535543
CHECK(args[0]->IsString());
536544
Utf8Value val(env->isolate(), args[0]);
537-
// optional arg
538-
bool lenient = args[1]->BooleanValue(env->context()).FromJust();
539545

540546
MaybeStackBuffer<char> buf;
541-
int32_t len = ToUnicode(&buf, *val, val.length(), lenient);
547+
int32_t len = ToUnicode(&buf, *val, val.length());
542548

543549
if (len < 0) {
544550
return env->ThrowError("Cannot convert name to Unicode");

src/node_i18n.h

+1-2
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,7 @@ int32_t ToASCII(MaybeStackBuffer<char>* buf,
4343
bool lenient = false);
4444
int32_t ToUnicode(MaybeStackBuffer<char>* buf,
4545
const char* input,
46-
size_t length,
47-
bool lenient = false);
46+
size_t length);
4847

4948
} // namespace i18n
5049
} // namespace node

test/fixtures/url-idna.js

+22-11
Original file line numberDiff line numberDiff line change
@@ -191,22 +191,33 @@ module.exports = {
191191
{
192192
ascii: `${`${'a'.repeat(64)}.`.repeat(4)}com`,
193193
unicode: `${`${'a'.repeat(64)}.`.repeat(4)}com`
194-
}
195-
],
196-
invalid: [
197-
// invalid character
194+
},
195+
// URLs with hyphen
196+
{
197+
ascii: 'r4---sn-a5mlrn7s.gevideo.com',
198+
unicode: 'r4---sn-a5mlrn7s.gevideo.com'
199+
},
200+
{
201+
ascii: '-sn-a5mlrn7s.gevideo.com',
202+
unicode: '-sn-a5mlrn7s.gevideo.com'
203+
},
198204
{
199-
url: '\ufffd.com',
200-
mode: 'ascii'
205+
ascii: 'sn-a5mlrn7s-.gevideo.com',
206+
unicode: 'sn-a5mlrn7s-.gevideo.com'
201207
},
202208
{
203-
url: '\ufffd.com',
204-
mode: 'unicode'
209+
ascii: '-sn-a5mlrn7s-.gevideo.com',
210+
unicode: '-sn-a5mlrn7s-.gevideo.com'
205211
},
206-
// invalid Punycode
207212
{
208-
url: 'xn---abc.com',
209-
mode: 'unicode'
213+
ascii: '-sn--a5mlrn7s-.gevideo.com',
214+
unicode: '-sn--a5mlrn7s-.gevideo.com'
210215
}
216+
],
217+
invalid: [
218+
// invalid character
219+
'\ufffd.com',
220+
// invalid bi-directional character
221+
'تشادرlatin.icom.museum'
211222
]
212223
}

test/parallel/test-icu-punycode.js

+8-14
Original file line numberDiff line numberDiff line change
@@ -23,19 +23,13 @@ const tests = require('../fixtures/url-idna.js');
2323
}
2424

2525
{
26-
const errorRe = {
27-
ascii: /^Error: Cannot convert name to ASCII$/,
28-
unicode: /^Error: Cannot convert name to Unicode$/
29-
};
30-
const convertFunc = {
31-
ascii: icu.toASCII,
32-
unicode: icu.toUnicode
33-
};
34-
35-
for (const [i, { url, mode }] of tests.invalid.entries()) {
36-
assert.throws(() => convertFunc[mode](url), errorRe[mode],
37-
`Invalid case ${i + 1}`);
38-
assert.doesNotThrow(() => convertFunc[mode](url, true),
39-
`Invalid case ${i + 1} in lenient mode`);
26+
for (const [i, url] of tests.invalid.entries()) {
27+
assert.throws(() => icu.toASCII(url),
28+
/^Error: Cannot convert name to ASCII$/,
29+
`ToASCII invalid case ${i + 1}`);
30+
assert.doesNotThrow(() => icu.toASCII(url, true),
31+
`ToASCII invalid case ${i + 1} in lenient mode`);
32+
assert.doesNotThrow(() => icu.toUnicode(url),
33+
`ToUnicode invalid case ${i + 1}`);
4034
}
4135
}

test/parallel/test-url-domain-ascii-unicode.js

+2-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,8 @@ const domainToASCII = url.domainToASCII;
88
const domainToUnicode = url.domainToUnicode;
99

1010
const domainWithASCII = [
11-
['ıídيٴ', 'xn--d-iga7ro0q9f'],
11+
['ıíd', 'xn--d-iga7r'],
12+
['يٴ', 'xn--mhb8f'],
1213
['www.ϧƽəʐ.com', 'www.xn--cja62apfr6c.com'],
1314
['новини.com', 'xn--b1amarcd.com'],
1415
['名がドメイン.com', 'xn--v8jxj3d1dzdz08w.com'],

test/parallel/test-whatwg-url-domainto.js

+4-7
Original file line numberDiff line numberDiff line change
@@ -35,11 +35,8 @@ const tests = require('../fixtures/url-idna.js');
3535
}
3636

3737
{
38-
const convertFunc = {
39-
ascii: domainToASCII,
40-
unicode: domainToUnicode
41-
};
42-
43-
for (const [i, { url, mode }] of tests.invalid.entries())
44-
assert.strictEqual(convertFunc[mode](url), '', `Invalid case ${i + 1}`);
38+
for (const [i, url] of tests.invalid.entries()) {
39+
assert.strictEqual(domainToASCII(url), '', `Invalid case ${i + 1}`);
40+
assert.strictEqual(domainToUnicode(url), '', `Invalid case ${i + 1}`);
41+
}
4542
}

0 commit comments

Comments
 (0)