Skip to content

Commit b610a4d

Browse files
committed
url: enforce valid UTF-8 in WHATWG parser
This commit implements the Web IDL USVString conversion, which mandates all unpaired Unicode surrogates be turned into U+FFFD REPLACEMENT CHARACTER. It also disallows Symbols to be used as USVString per spec. Certain functions call into C++ methods in the binding that use the Utf8Value class to access string arguments. Utf8Value already does the normalization using V8's String::Write, so in those cases, instead of doing the full USVString normalization, only a symbol check is done (`'' + val`, which uses ES's ToString, versus `String()` which has special provisions for symbols). PR-URL: #11436 Reviewed-By: Ben Noordhuis <[email protected]> Reviewed-By: James M Snell <[email protected]>
1 parent a7f7724 commit b610a4d

13 files changed

+509
-44
lines changed

lib/internal/url.js

+65-33
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,18 @@ const IteratorPrototype = Object.getPrototypeOf(
2323
Object.getPrototypeOf([][Symbol.iterator]())
2424
);
2525

26+
// Finds the first unpaired UTF-16 surrogate in a string. Two cases:
//  1. a trailing surrogate that is not preceded by a leading surrogate; the
//     preceding code unit (or '' at the start of the string) is captured in
//     group 1, so the match begins one code unit *before* the lone surrogate;
//  2. a leading surrogate that is not followed by a trailing surrogate; here
//     group 1 does not participate and the match begins at the lone surrogate.
const unpairedSurrogateRe =
    /([^\uD800-\uDBFF]|^)[\uDC00-\uDFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])/;

/**
 * Converts `val` to a string in which every unpaired surrogate has been
 * replaced by U+FFFD REPLACEMENT CHARACTER (Web IDL USVString conversion).
 *
 * @param {*} val Value to convert. It is stringified with `'' + val`, so a
 *   Symbol throws a TypeError instead of being converted.
 * @returns {string} The stringified value, unchanged when it contains no
 *   unpaired surrogates.
 */
function toUSVString(val) {
  const str = '' + val;
  // As of V8 5.5, `str.search()` (and `unpairedSurrogateRe[@@search]()`) are
  // slower than `unpairedSurrogateRe.exec()`.
  const match = unpairedSurrogateRe.exec(str);
  if (!match)
    return str;
  // The native scan must begin exactly at the first unpaired surrogate. When
  // group 1 captured a preceding code unit, that unit may be the trailing half
  // of a *valid* pair (e.g. '\uD800\uDC00\uDC00'); starting the scan on it
  // would make the binding treat it as a lone trailing surrogate and corrupt
  // the pair. Skip over the captured prefix instead.
  const prefix = match[1];
  const start = match.index + (prefix === undefined ? 0 : prefix.length);
  return binding.toUSVString(str, start);
}
37+
2638
class OpaqueOrigin {
2739
toString() {
2840
return 'null';
@@ -104,7 +116,6 @@ function onParseComplete(flags, protocol, username, password,
104116

105117
// Reused by URL constructor and URL#href setter.
106118
function parse(url, input, base) {
107-
input = String(input);
108119
const base_context = base ? base[context] : undefined;
109120
url[context] = new StorageObject();
110121
binding.parse(input.trim(), -1,
@@ -206,8 +217,10 @@ function onParseHashComplete(flags, protocol, username, password,
206217

207218
class URL {
208219
constructor(input, base) {
220+
// toUSVString is not needed.
221+
input = '' + input;
209222
if (base !== undefined && !(base instanceof URL))
210-
base = new URL(String(base));
223+
base = new URL(base);
211224
parse(this, input, base);
212225
}
213226

@@ -315,6 +328,8 @@ Object.defineProperties(URL.prototype, {
315328
return this[kFormat]({});
316329
},
317330
set(input) {
331+
// toUSVString is not needed.
332+
input = '' + input;
318333
parse(this, input);
319334
}
320335
},
@@ -332,7 +347,8 @@ Object.defineProperties(URL.prototype, {
332347
return this[context].scheme;
333348
},
334349
set(scheme) {
335-
scheme = String(scheme);
350+
// toUSVString is not needed.
351+
scheme = '' + scheme;
336352
if (scheme.length === 0)
337353
return;
338354
binding.parse(scheme, binding.kSchemeStart, null, this[context],
@@ -346,7 +362,8 @@ Object.defineProperties(URL.prototype, {
346362
return this[context].username || '';
347363
},
348364
set(username) {
349-
username = String(username);
365+
// toUSVString is not needed.
366+
username = '' + username;
350367
if (!this.hostname)
351368
return;
352369
const ctx = this[context];
@@ -366,7 +383,8 @@ Object.defineProperties(URL.prototype, {
366383
return this[context].password || '';
367384
},
368385
set(password) {
369-
password = String(password);
386+
// toUSVString is not needed.
387+
password = '' + password;
370388
if (!this.hostname)
371389
return;
372390
const ctx = this[context];
@@ -391,7 +409,8 @@ Object.defineProperties(URL.prototype, {
391409
},
392410
set(host) {
393411
const ctx = this[context];
394-
host = String(host);
412+
// toUSVString is not needed.
413+
host = '' + host;
395414
if (this[cannotBeBase] ||
396415
(this[special] && host.length === 0)) {
397416
// Cannot set the host if cannot-be-base is set or
@@ -415,7 +434,8 @@ Object.defineProperties(URL.prototype, {
415434
},
416435
set(host) {
417436
const ctx = this[context];
418-
host = String(host);
437+
// toUSVString is not needed.
438+
host = '' + host;
419439
if (this[cannotBeBase] ||
420440
(this[special] && host.length === 0)) {
421441
// Cannot set the host if cannot-be-base is set or
@@ -439,11 +459,12 @@ Object.defineProperties(URL.prototype, {
439459
return port === undefined ? '' : String(port);
440460
},
441461
set(port) {
462+
// toUSVString is not needed.
463+
port = '' + port;
442464
const ctx = this[context];
443465
if (!ctx.host || this[cannotBeBase] ||
444466
this.protocol === 'file:')
445467
return;
446-
port = String(port);
447468
if (port === '') {
448469
ctx.port = undefined;
449470
return;
@@ -462,9 +483,11 @@ Object.defineProperties(URL.prototype, {
462483
return ctx.path !== undefined ? `/${ctx.path.join('/')}` : '';
463484
},
464485
set(path) {
486+
// toUSVString is not needed.
487+
path = '' + path;
465488
if (this[cannotBeBase])
466489
return;
467-
binding.parse(String(path), binding.kPathStart, null, this[context],
490+
binding.parse(path, binding.kPathStart, null, this[context],
468491
onParsePathComplete.bind(this));
469492
}
470493
},
@@ -477,7 +500,7 @@ Object.defineProperties(URL.prototype, {
477500
},
478501
set(search) {
479502
const ctx = this[context];
480-
search = String(search);
503+
search = toUSVString(search);
481504
if (!search) {
482505
ctx.query = null;
483506
ctx.flags &= ~binding.URL_FLAGS_HAS_QUERY;
@@ -509,7 +532,8 @@ Object.defineProperties(URL.prototype, {
509532
},
510533
set(hash) {
511534
const ctx = this[context];
512-
hash = String(hash);
535+
// toUSVString is not needed.
536+
hash = '' + hash;
513537
if (this.protocol === 'javascript:')
514538
return;
515539
if (!hash) {
@@ -652,19 +676,22 @@ class URLSearchParams {
652676
if (pair.length !== 2) {
653677
throw new TypeError('Each query pair must be a name/value tuple');
654678
}
655-
this[searchParams].push(String(pair[0]), String(pair[1]));
679+
const key = toUSVString(pair[0]);
680+
const value = toUSVString(pair[1]);
681+
this[searchParams].push(key, value);
656682
}
657683
} else {
658684
// record<USVString, USVString>
659685
this[searchParams] = [];
660-
for (const key of Object.keys(init)) {
661-
const value = String(init[key]);
686+
for (var key of Object.keys(init)) {
687+
key = toUSVString(key);
688+
const value = toUSVString(init[key]);
662689
this[searchParams].push(key, value);
663690
}
664691
}
665692
} else {
666693
// USVString
667-
init = String(init);
694+
init = toUSVString(init);
668695
if (init[0] === '?') init = init.slice(1);
669696
initSearchParams(this, init);
670697
}
@@ -743,8 +770,8 @@ defineIDLClass(URLSearchParams.prototype, 'URLSearchParams', {
743770
throw new TypeError('"name" and "value" arguments must be specified');
744771
}
745772

746-
name = String(name);
747-
value = String(value);
773+
name = toUSVString(name);
774+
value = toUSVString(value);
748775
this[searchParams].push(name, value);
749776
update(this[context], this);
750777
},
@@ -758,7 +785,7 @@ defineIDLClass(URLSearchParams.prototype, 'URLSearchParams', {
758785
}
759786

760787
const list = this[searchParams];
761-
name = String(name);
788+
name = toUSVString(name);
762789
for (var i = 0; i < list.length;) {
763790
const cur = list[i];
764791
if (cur === name) {
@@ -779,7 +806,7 @@ defineIDLClass(URLSearchParams.prototype, 'URLSearchParams', {
779806
}
780807

781808
const list = this[searchParams];
782-
name = String(name);
809+
name = toUSVString(name);
783810
for (var i = 0; i < list.length; i += 2) {
784811
if (list[i] === name) {
785812
return list[i + 1];
@@ -798,7 +825,7 @@ defineIDLClass(URLSearchParams.prototype, 'URLSearchParams', {
798825

799826
const list = this[searchParams];
800827
const values = [];
801-
name = String(name);
828+
name = toUSVString(name);
802829
for (var i = 0; i < list.length; i += 2) {
803830
if (list[i] === name) {
804831
values.push(list[i + 1]);
@@ -816,7 +843,7 @@ defineIDLClass(URLSearchParams.prototype, 'URLSearchParams', {
816843
}
817844

818845
const list = this[searchParams];
819-
name = String(name);
846+
name = toUSVString(name);
820847
for (var i = 0; i < list.length; i += 2) {
821848
if (list[i] === name) {
822849
return true;
@@ -834,8 +861,8 @@ defineIDLClass(URLSearchParams.prototype, 'URLSearchParams', {
834861
}
835862

836863
const list = this[searchParams];
837-
name = String(name);
838-
value = String(value);
864+
name = toUSVString(name);
865+
value = toUSVString(value);
839866

840867
// If there are any name-value pairs whose name is `name`, in `list`, set
841868
// the value of the first such name-value pair to `value` and remove the
@@ -1098,11 +1125,13 @@ function originFor(url, base) {
10981125
}
10991126

11001127
function domainToASCII(domain) {
1101-
return binding.domainToASCII(String(domain));
1128+
// toUSVString is not needed.
1129+
return binding.domainToASCII('' + domain);
11021130
}
11031131

11041132
function domainToUnicode(domain) {
1105-
return binding.domainToUnicode(String(domain));
1133+
// toUSVString is not needed.
1134+
return binding.domainToUnicode('' + domain);
11061135
}
11071136

11081137
// Utility function that converts a URL object into an ordinary
@@ -1188,11 +1217,14 @@ function getPathFromURL(path) {
11881217
return isWindows ? getPathFromURLWin32(path) : getPathFromURLPosix(path);
11891218
}
11901219

1191-
exports.getPathFromURL = getPathFromURL;
1192-
exports.URL = URL;
1193-
exports.URLSearchParams = URLSearchParams;
1194-
exports.domainToASCII = domainToASCII;
1195-
exports.domainToUnicode = domainToUnicode;
1196-
exports.urlToOptions = urlToOptions;
1197-
exports.formatSymbol = kFormat;
1198-
exports.searchParamsSymbol = searchParams;
1220+
module.exports = {
1221+
toUSVString,
1222+
getPathFromURL,
1223+
URL,
1224+
URLSearchParams,
1225+
domainToASCII,
1226+
domainToUnicode,
1227+
urlToOptions,
1228+
formatSymbol: kFormat,
1229+
searchParamsSymbol: searchParams
1230+
};

src/node_url.cc

+53
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020
#include <unicode/utf.h>
2121
#endif
2222

23+
#define UNICODE_REPLACEMENT_CHARACTER 0xFFFD
24+
2325
namespace node {
2426

2527
using v8::Array;
@@ -143,6 +145,21 @@ namespace url {
143145
}
144146
#endif
145147

148+
// Reports whether the UTF-16 code unit `c` lies in the low/trailing
// surrogate range U+DC00..U+DFFF.
static inline bool IsUnicodeTrail(uint16_t c) {
  return c >= 0xDC00 && c <= 0xDFFF;
}
152+
153+
// Reports whether the UTF-16 code unit `c` is any surrogate, leading or
// trailing, i.e. lies in U+D800..U+DFFF.
static inline bool IsUnicodeSurrogate(uint16_t c) {
  return c >= 0xD800 && c <= 0xDFFF;
}
157+
158+
// Given a code unit `c` already known to be a surrogate, reports whether it
// is the low/trailing half. Only bit 10 is inspected, so the answer is
// meaningful solely for values in the surrogate range.
static inline bool IsUnicodeSurrogateTrail(uint16_t c) {
  return ((c >> 10) & 0x1) == 1;
}
162+
146163
static url_host_type ParseIPv6Host(url_host* host,
147164
const char* input,
148165
size_t length) {
@@ -1351,6 +1368,41 @@ namespace url {
13511368
v8::NewStringType::kNormal).ToLocalChecked());
13521369
}
13531370

1371+
static void ToUSVString(const FunctionCallbackInfo<Value>& args) {
1372+
Environment* env = Environment::GetCurrent(args);
1373+
CHECK_GE(args.Length(), 2);
1374+
CHECK(args[0]->IsString());
1375+
CHECK(args[1]->IsNumber());
1376+
1377+
TwoByteValue value(env->isolate(), args[0]);
1378+
const size_t n = value.length();
1379+
1380+
const int64_t start = args[1]->IntegerValue(env->context()).FromJust();
1381+
CHECK_GE(start, 0);
1382+
1383+
for (size_t i = start; i < n; i++) {
1384+
uint16_t c = value[i];
1385+
if (!IsUnicodeSurrogate(c)) {
1386+
continue;
1387+
} else if (IsUnicodeSurrogateTrail(c) || i == n - 1) {
1388+
value[i] = UNICODE_REPLACEMENT_CHARACTER;
1389+
} else {
1390+
uint16_t d = value[i + 1];
1391+
if (IsUnicodeTrail(d)) {
1392+
i++;
1393+
} else {
1394+
value[i] = UNICODE_REPLACEMENT_CHARACTER;
1395+
}
1396+
}
1397+
}
1398+
1399+
args.GetReturnValue().Set(
1400+
String::NewFromTwoByte(env->isolate(),
1401+
*value,
1402+
v8::NewStringType::kNormal,
1403+
n).ToLocalChecked());
1404+
}
1405+
13541406
static void DomainToASCII(const FunctionCallbackInfo<Value>& args) {
13551407
Environment* env = Environment::GetCurrent(args);
13561408
CHECK_GE(args.Length(), 1);
@@ -1398,6 +1450,7 @@ namespace url {
13981450
Environment* env = Environment::GetCurrent(context);
13991451
env->SetMethod(target, "parse", Parse);
14001452
env->SetMethod(target, "encodeAuth", EncodeAuthSet);
1453+
env->SetMethod(target, "toUSVString", ToUSVString);
14011454
env->SetMethod(target, "domainToASCII", DomainToASCII);
14021455
env->SetMethod(target, "domainToUnicode", DomainToUnicode);
14031456

0 commit comments

Comments
 (0)