Skip to content

Commit 7c9ca0f

Browse files
TimothyGu authored and evanlucas committed on May 1, 2017
url: enforce valid UTF-8 in WHATWG parser
This commit implements the Web IDL USVString conversion, which mandates that all unpaired Unicode surrogates be turned into U+FFFD REPLACEMENT CHARACTER. It also disallows Symbols from being used as USVString, per spec.

Certain functions call into C++ methods in the binding that use the Utf8Value class to access string arguments. Utf8Value already does the normalization using V8's String::Write, so in those cases, instead of doing the full USVString normalization, only a symbol check is done (`'' + val`, which uses ES's ToString, versus `String()`, which has special provisions for symbols).

PR-URL: #12507
Reviewed-By: James M Snell <[email protected]>
1 parent e48e00b commit 7c9ca0f

13 files changed

+509
-44
lines changed
 

‎lib/internal/url.js

+65-33
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,18 @@ const IteratorPrototype = Object.getPrototypeOf(
2323
Object.getPrototypeOf([][Symbol.iterator]())
2424
);
2525

26+
// Matches the first unpaired UTF-16 surrogate in a string. Two cases:
//   1. a low surrogate (U+DC00-U+DFFF) at the very start of the string or
//      preceded by a code unit that is not a high surrogate;
//   2. a high surrogate (U+D800-U+DBFF) not followed by a low surrogate.
// In case 1 with a preceding code unit, the match (and thus `match.index`)
// starts at that preceding unit, one position before the lone surrogate.
const unpairedSurrogateRe =
27+
/([^\uD800-\uDBFF]|^)[\uDC00-\uDFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])/;
28+
// Converts `val` to a string in which no unpaired surrogate remains.
// `val` is stringified with `'' + val`; if the result has no unpaired
// surrogate it is returned unchanged, otherwise the work is delegated to
// `binding.toUSVString()`, which is told where the regex match began.
function toUSVString(val) {
29+
// `'' + val` rather than `String(val)`: the latter has special provisions
// for Symbols, which are not supposed to be accepted here.
const str = '' + val;
30+
// As of V8 5.5, `str.search()` (and `unpairedSurrogateRe[@@search]()`) are
31+
// slower than `unpairedSurrogateRe.exec()`.
32+
const match = unpairedSurrogateRe.exec(str);
33+
// Fast path: nothing to replace.
if (!match)
34+
return str;
35+
// `match.index` is at or one code unit before the first lone surrogate, so
// the binding can start scanning there.
return binding.toUSVString(str, match.index);
36+
}
37+
2638
class OpaqueOrigin {
2739
toString() {
2840
return 'null';
@@ -108,7 +120,6 @@ function onParseError(flags, input) {
108120

109121
// Reused by URL constructor and URL#href setter.
110122
function parse(url, input, base) {
111-
input = String(input);
112123
const base_context = base ? base[context] : undefined;
113124
url[context] = new StorageObject();
114125
binding.parse(input.trim(), -1,
@@ -203,8 +214,10 @@ function onParseHashComplete(flags, protocol, username, password,
203214

204215
class URL {
205216
constructor(input, base) {
217+
// toUSVString is not needed.
218+
input = '' + input;
206219
if (base !== undefined && !(base instanceof URL))
207-
base = new URL(String(base));
220+
base = new URL(base);
208221
parse(this, input, base);
209222
}
210223

@@ -312,6 +325,8 @@ Object.defineProperties(URL.prototype, {
312325
return this[kFormat]({});
313326
},
314327
set(input) {
328+
// toUSVString is not needed.
329+
input = '' + input;
315330
parse(this, input);
316331
}
317332
},
@@ -329,7 +344,8 @@ Object.defineProperties(URL.prototype, {
329344
return this[context].scheme;
330345
},
331346
set(scheme) {
332-
scheme = String(scheme);
347+
// toUSVString is not needed.
348+
scheme = '' + scheme;
333349
if (scheme.length === 0)
334350
return;
335351
binding.parse(scheme, binding.kSchemeStart, null, this[context],
@@ -343,7 +359,8 @@ Object.defineProperties(URL.prototype, {
343359
return this[context].username || '';
344360
},
345361
set(username) {
346-
username = String(username);
362+
// toUSVString is not needed.
363+
username = '' + username;
347364
if (!this.hostname)
348365
return;
349366
const ctx = this[context];
@@ -363,7 +380,8 @@ Object.defineProperties(URL.prototype, {
363380
return this[context].password || '';
364381
},
365382
set(password) {
366-
password = String(password);
383+
// toUSVString is not needed.
384+
password = '' + password;
367385
if (!this.hostname)
368386
return;
369387
const ctx = this[context];
@@ -388,7 +406,8 @@ Object.defineProperties(URL.prototype, {
388406
},
389407
set(host) {
390408
const ctx = this[context];
391-
host = String(host);
409+
// toUSVString is not needed.
410+
host = '' + host;
392411
if (this[cannotBeBase] ||
393412
(this[special] && host.length === 0)) {
394413
// Cannot set the host if cannot-be-base is set or
@@ -412,7 +431,8 @@ Object.defineProperties(URL.prototype, {
412431
},
413432
set(host) {
414433
const ctx = this[context];
415-
host = String(host);
434+
// toUSVString is not needed.
435+
host = '' + host;
416436
if (this[cannotBeBase] ||
417437
(this[special] && host.length === 0)) {
418438
// Cannot set the host if cannot-be-base is set or
@@ -436,11 +456,12 @@ Object.defineProperties(URL.prototype, {
436456
return port === undefined ? '' : String(port);
437457
},
438458
set(port) {
459+
// toUSVString is not needed.
460+
port = '' + port;
439461
const ctx = this[context];
440462
if (!ctx.host || this[cannotBeBase] ||
441463
this.protocol === 'file:')
442464
return;
443-
port = String(port);
444465
if (port === '') {
445466
ctx.port = undefined;
446467
return;
@@ -459,9 +480,11 @@ Object.defineProperties(URL.prototype, {
459480
return ctx.path !== undefined ? `/${ctx.path.join('/')}` : '';
460481
},
461482
set(path) {
483+
// toUSVString is not needed.
484+
path = '' + path;
462485
if (this[cannotBeBase])
463486
return;
464-
binding.parse(String(path), binding.kPathStart, null, this[context],
487+
binding.parse(path, binding.kPathStart, null, this[context],
465488
onParsePathComplete.bind(this));
466489
}
467490
},
@@ -474,7 +497,7 @@ Object.defineProperties(URL.prototype, {
474497
},
475498
set(search) {
476499
const ctx = this[context];
477-
search = String(search);
500+
search = toUSVString(search);
478501
if (!search) {
479502
ctx.query = null;
480503
ctx.flags &= ~binding.URL_FLAGS_HAS_QUERY;
@@ -506,7 +529,8 @@ Object.defineProperties(URL.prototype, {
506529
},
507530
set(hash) {
508531
const ctx = this[context];
509-
hash = String(hash);
532+
// toUSVString is not needed.
533+
hash = '' + hash;
510534
if (this.protocol === 'javascript:')
511535
return;
512536
if (!hash) {
@@ -649,19 +673,22 @@ class URLSearchParams {
649673
if (pair.length !== 2) {
650674
throw new TypeError('Each query pair must be a name/value tuple');
651675
}
652-
this[searchParams].push(String(pair[0]), String(pair[1]));
676+
const key = toUSVString(pair[0]);
677+
const value = toUSVString(pair[1]);
678+
this[searchParams].push(key, value);
653679
}
654680
} else {
655681
// record<USVString, USVString>
656682
this[searchParams] = [];
657-
for (const key of Object.keys(init)) {
658-
const value = String(init[key]);
683+
for (var key of Object.keys(init)) {
684+
key = toUSVString(key);
685+
const value = toUSVString(init[key]);
659686
this[searchParams].push(key, value);
660687
}
661688
}
662689
} else {
663690
// USVString
664-
init = String(init);
691+
init = toUSVString(init);
665692
if (init[0] === '?') init = init.slice(1);
666693
initSearchParams(this, init);
667694
}
@@ -740,8 +767,8 @@ defineIDLClass(URLSearchParams.prototype, 'URLSearchParams', {
740767
throw new TypeError('"name" and "value" arguments must be specified');
741768
}
742769

743-
name = String(name);
744-
value = String(value);
770+
name = toUSVString(name);
771+
value = toUSVString(value);
745772
this[searchParams].push(name, value);
746773
update(this[context], this);
747774
},
@@ -755,7 +782,7 @@ defineIDLClass(URLSearchParams.prototype, 'URLSearchParams', {
755782
}
756783

757784
const list = this[searchParams];
758-
name = String(name);
785+
name = toUSVString(name);
759786
for (var i = 0; i < list.length;) {
760787
const cur = list[i];
761788
if (cur === name) {
@@ -776,7 +803,7 @@ defineIDLClass(URLSearchParams.prototype, 'URLSearchParams', {
776803
}
777804

778805
const list = this[searchParams];
779-
name = String(name);
806+
name = toUSVString(name);
780807
for (var i = 0; i < list.length; i += 2) {
781808
if (list[i] === name) {
782809
return list[i + 1];
@@ -795,7 +822,7 @@ defineIDLClass(URLSearchParams.prototype, 'URLSearchParams', {
795822

796823
const list = this[searchParams];
797824
const values = [];
798-
name = String(name);
825+
name = toUSVString(name);
799826
for (var i = 0; i < list.length; i += 2) {
800827
if (list[i] === name) {
801828
values.push(list[i + 1]);
@@ -813,7 +840,7 @@ defineIDLClass(URLSearchParams.prototype, 'URLSearchParams', {
813840
}
814841

815842
const list = this[searchParams];
816-
name = String(name);
843+
name = toUSVString(name);
817844
for (var i = 0; i < list.length; i += 2) {
818845
if (list[i] === name) {
819846
return true;
@@ -831,8 +858,8 @@ defineIDLClass(URLSearchParams.prototype, 'URLSearchParams', {
831858
}
832859

833860
const list = this[searchParams];
834-
name = String(name);
835-
value = String(value);
861+
name = toUSVString(name);
862+
value = toUSVString(value);
836863

837864
// If there are any name-value pairs whose name is `name`, in `list`, set
838865
// the value of the first such name-value pair to `value` and remove the
@@ -1094,11 +1121,13 @@ function originFor(url, base) {
10941121
}
10951122

10961123
function domainToASCII(domain) {
1097-
return binding.domainToASCII(String(domain));
1124+
// toUSVString is not needed.
1125+
return binding.domainToASCII('' + domain);
10981126
}
10991127

11001128
function domainToUnicode(domain) {
1101-
return binding.domainToUnicode(String(domain));
1129+
// toUSVString is not needed.
1130+
return binding.domainToUnicode('' + domain);
11021131
}
11031132

11041133
// Utility function that converts a URL object into an ordinary
@@ -1184,11 +1213,14 @@ function getPathFromURL(path) {
11841213
return isWindows ? getPathFromURLWin32(path) : getPathFromURLPosix(path);
11851214
}
11861215

1187-
exports.getPathFromURL = getPathFromURL;
1188-
exports.URL = URL;
1189-
exports.URLSearchParams = URLSearchParams;
1190-
exports.domainToASCII = domainToASCII;
1191-
exports.domainToUnicode = domainToUnicode;
1192-
exports.urlToOptions = urlToOptions;
1193-
exports.formatSymbol = kFormat;
1194-
exports.searchParamsSymbol = searchParams;
1216+
// Names exported by this module. `formatSymbol` and `searchParamsSymbol`
// are the export names for the local `kFormat` and `searchParams` bindings.
module.exports = {
1217+
toUSVString,
1218+
getPathFromURL,
1219+
URL,
1220+
URLSearchParams,
1221+
domainToASCII,
1222+
domainToUnicode,
1223+
urlToOptions,
1224+
formatSymbol: kFormat,
1225+
searchParamsSymbol: searchParams
1226+
};

‎src/node_url.cc

+53
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,8 @@
2020
#include <unicode/utf.h>
2121
#endif
2222

23+
#define UNICODE_REPLACEMENT_CHARACTER 0xFFFD
24+
2325
namespace node {
2426

2527
using v8::Array;
@@ -104,6 +106,21 @@ namespace url {
104106
}
105107
#endif
106108

109+
// Returns true if the UTF-16 code unit `c` is a low/trailing surrogate,
// i.e. its top six bits are 110111 (range 0xDC00-0xDFFF).
// Unlike IsUnicodeSurrogateTrail(), this is valid for any code unit.
110+
static inline bool IsUnicodeTrail(uint16_t c) {
111+
return (c & 0xFC00) == 0xDC00;
112+
}
113+
114+
// Returns true if the UTF-16 code unit `c` is a surrogate of either kind,
// i.e. its top five bits are 11011 (range 0xD800-0xDFFF).
115+
static inline bool IsUnicodeSurrogate(uint16_t c) {
116+
return (c & 0xF800) == 0xD800;
117+
}
118+
119+
// Returns true if the UTF-16 surrogate `c` is a low/trailing one.
// Only bit 0x400 is tested, so the result is meaningful only when `c` is
// already known to be a surrogate (see IsUnicodeSurrogate()).
120+
static inline bool IsUnicodeSurrogateTrail(uint16_t c) {
121+
return (c & 0x400) != 0;
122+
}
123+
107124
static url_host_type ParseIPv6Host(url_host* host,
108125
const char* input,
109126
size_t length) {
@@ -1356,6 +1373,41 @@ namespace url {
13561373
v8::NewStringType::kNormal).ToLocalChecked());
13571374
}
13581375

1376+
// Binding function: replaces every unpaired surrogate in a string with
// U+FFFD (UNICODE_REPLACEMENT_CHARACTER) and returns the result.
//   args[0]: the input string.
//   args[1]: non-negative index at which to start scanning; code units
//            before it are copied through unchanged.
// The returned string has the same length (in UTF-16 code units) as the
// input. Aborts via CHECK if the arguments do not have the expected types.
static void ToUSVString(const FunctionCallbackInfo<Value>& args) {
1377+
Environment* env = Environment::GetCurrent(args);
1378+
CHECK_GE(args.Length(), 2);
1379+
CHECK(args[0]->IsString());
1380+
CHECK(args[1]->IsNumber());
1381+
1382+
// `value` is modified in place below and then used to build the result.
TwoByteValue value(env->isolate(), args[0]);
1383+
const size_t n = value.length();
1384+
1385+
const int64_t start = args[1]->IntegerValue(env->context()).FromJust();
1386+
// Guard the signed-to-size_t conversion in the loop initializer.
CHECK_GE(start, 0);
1387+
1388+
for (size_t i = start; i < n; i++) {
1389+
uint16_t c = value[i];
1390+
if (!IsUnicodeSurrogate(c)) {
1391+
continue;
1392+
// A trail surrogate reached here has no preceding lead (valid pairs are
// skipped as a unit below); a surrogate in the last position has nothing
// after it to pair with. `i == n - 1` cannot underflow because i < n.
} else if (IsUnicodeSurrogateTrail(c) || i == n - 1) {
1393+
value[i] = UNICODE_REPLACEMENT_CHARACTER;
1394+
} else {
1395+
// `c` is a lead surrogate and i + 1 < n, so reading the next unit is safe.
uint16_t d = value[i + 1];
1396+
if (IsUnicodeTrail(d)) {
1397+
// Valid lead/trail pair: step over the trail so it is not re-examined.
i++;
1398+
} else {
1399+
// Lead surrogate not followed by a trail: replace it. The following
// unit is examined on the next iteration.
value[i] = UNICODE_REPLACEMENT_CHARACTER;
1400+
}
1401+
}
1402+
}
1403+
1404+
args.GetReturnValue().Set(
1405+
String::NewFromTwoByte(env->isolate(),
1406+
*value,
1407+
v8::NewStringType::kNormal,
1408+
n).ToLocalChecked());
1409+
}
1410+
13591411
static void DomainToASCII(const FunctionCallbackInfo<Value>& args) {
13601412
Environment* env = Environment::GetCurrent(args);
13611413
CHECK_GE(args.Length(), 1);
@@ -1403,6 +1455,7 @@ namespace url {
14031455
Environment* env = Environment::GetCurrent(context);
14041456
env->SetMethod(target, "parse", Parse);
14051457
env->SetMethod(target, "encodeAuth", EncodeAuthSet);
1458+
env->SetMethod(target, "toUSVString", ToUSVString);
14061459
env->SetMethod(target, "domainToASCII", DomainToASCII);
14071460
env->SetMethod(target, "domainToUnicode", DomainToUnicode);
14081461

0 commit comments

Comments
 (0)
Please sign in to comment.