Skip to content
This repository was archived by the owner on Apr 22, 2023. It is now read-only.

Commit 90802d6

Browse files
committed Apr 20, 2011
Close #954 URL parsing/formatting corrections
1. Allow single quotes in URLs, but escape them. 2. Add comments about which RFCs we're following for guidance. 3. Handle any invalid character in the hostname portion. 4. Lowercase the protocol and hostname portions, since they are case-insensitive.
·
1 parent d3d35ec commit 90802d6

File tree

2 files changed

+171
-19
lines changed

2 files changed

+171
-19
lines changed
 

‎lib/url.js

+65-7
Original file line numberDiff line numberDiff line change
@@ -24,25 +24,40 @@ exports.resolve = urlResolve;
2424
exports.resolveObject = urlResolveObject;
2525
exports.format = urlFormat;
2626

27+
// Reference: RFC 3986, RFC 1808, RFC 2396
28+
2729
// define these here so at least they only have to be
2830
// compiled once on the first module load.
29-
var protocolPattern = /^([a-z0-9]+:)/,
31+
var protocolPattern = /^([a-z0-9]+:)/i,
3032
portPattern = /:[0-9]+$/,
31-
delims = ['<', '>', '"', '\'', '`', /\s/],
33+
// RFC 2396: characters reserved for delimiting URLs.
34+
delims = ['<', '>', '"', '`', ' ', '\r', '\n', '\t'],
35+
// RFC 2396: characters not allowed for various reasons.
3236
unwise = ['{', '}', '|', '\\', '^', '~', '[', ']', '`'].concat(delims),
33-
nonHostChars = ['/', '?', ';', '#'].concat(unwise),
37+
// Allowed by RFCs, but a cause of XSS attacks. Always escape these.
38+
autoEscape = ['\''],
39+
// Characters that are never ever allowed in a hostname.
40+
// Note that any invalid chars are also handled, but these
41+
// are the ones that are *expected* to be seen, so we fast-path
42+
// them.
43+
nonHostChars = ['%', '/', '?', ';', '#']
44+
.concat(unwise).concat(autoEscape),
3445
hostnameMaxLen = 255,
35-
hostnamePartPattern = /^[a-z0-9][a-z0-9A-Z-]{0,62}$/,
46+
hostnamePartPattern = /^[a-zA-Z0-9][a-z0-9A-Z-]{0,62}$/,
47+
hostnamePartStart = /^([a-zA-Z0-9][a-z0-9A-Z-]{0,62})(.*)$/,
48+
// protocols that can allow "unsafe" and "unwise" chars.
3649
unsafeProtocol = {
3750
'javascript': true,
3851
'javascript:': true
3952
},
53+
// protocols that never have a hostname.
4054
hostlessProtocol = {
4155
'javascript': true,
4256
'javascript:': true,
4357
'file': true,
4458
'file:': true
4559
},
60+
// protocols that always have a path component.
4661
pathedProtocol = {
4762
'http': true,
4863
'https': true,
@@ -54,6 +69,7 @@ var protocolPattern = /^([a-z0-9]+:)/,
5469
'gopher:': true,
5570
'file:': true
5671
},
72+
// protocols that always contain a // bit.
5773
slashedProtocol = {
5874
'http': true,
5975
'https': true,
@@ -74,10 +90,19 @@ function urlParse(url, parseQueryString, slashesDenoteHost) {
7490
var out = {},
7591
rest = url;
7692

93+
// cut off any delimiters.
94+
// This is to support parsing input like "<http://foo.com>"
95+
for (var i = 0, l = rest.length; i < l; i++) {
96+
if (delims.indexOf(rest.charAt(i)) === -1) break;
97+
}
98+
if (i !== 0) rest = rest.substr(i);
99+
100+
77101
var proto = protocolPattern.exec(rest);
78102
if (proto) {
79103
proto = proto[0];
80-
out.protocol = proto;
104+
var lowerProto = proto.toLowerCase();
105+
out.protocol = lowerProto;
81106
rest = rest.substr(proto.length);
82107
}
83108

@@ -119,6 +144,7 @@ function urlParse(url, parseQueryString, slashesDenoteHost) {
119144
var key = keys[i];
120145
out[key] = p[key];
121146
}
147+
122148
// we've indicated that there is a hostname,
123149
// so even if it's empty, it has to be present.
124150
out.hostname = out.hostname || '';
@@ -130,17 +156,49 @@ function urlParse(url, parseQueryString, slashesDenoteHost) {
130156
var hostparts = out.hostname.split(/\./);
131157
for (var i = 0, l = hostparts.length; i < l; i++) {
132158
var part = hostparts[i];
159+
if (!part) continue;
133160
if (!part.match(hostnamePartPattern)) {
134-
out.hostname = '';
161+
var validParts = hostparts.slice(0, i);
162+
var notHost = hostparts.slice(i + 1);
163+
var bit = part.match(hostnamePartStart);
164+
if (bit) {
165+
validParts.push(bit[1]);
166+
notHost.unshift(bit[2]);
167+
}
168+
if (notHost.length) {
169+
rest = '/' + notHost.join('.') + rest
170+
}
171+
out.hostname = validParts.join('.');
135172
break;
136173
}
137174
}
138175
}
176+
// hostnames are always lower case.
177+
out.hostname = out.hostname.toLowerCase();
178+
179+
out.host = ((out.auth) ? out.auth + '@' : '') +
180+
(out.hostname || '') +
181+
((out.port) ? ':' + out.port : '');
182+
out.href += out.host;
139183
}
140184

141185
// now rest is set to the post-host stuff.
142186
// chop off any delim chars.
143-
if (!unsafeProtocol[proto]) {
187+
if (!unsafeProtocol[lowerProto]) {
188+
189+
// First, make 100% sure that any "autoEscape" chars get
190+
// escaped, even if encodeURIComponent doesn't think they
191+
// need to be.
192+
for (var i = 0, l = autoEscape.length; i < l; i++) {
193+
var ae = autoEscape[i];
194+
var esc = encodeURIComponent(ae);
195+
if (esc === ae) {
196+
esc = escape(ae);
197+
}
198+
rest = rest.split(ae).join(esc);
199+
}
200+
201+
// Now make sure that delims never appear in a url.
144202
var chop = rest.length;
145203
for (var i = 0, l = delims.length; i < l; i++) {
146204
var c = rest.indexOf(delims[i]);

‎test/simple/test-url.js

+106-12
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,99 @@ var parseTests = {
3232
'href': '//some_path',
3333
'pathname': '//some_path'
3434
},
35+
'HTTP://www.example.com/' : {
36+
'href': 'http://www.example.com/',
37+
'protocol': 'http:',
38+
'host': 'www.example.com',
39+
'hostname': 'www.example.com',
40+
'pathname': '/'
41+
},
42+
'http://www.ExAmPlE.com/' : {
43+
'href': 'http://www.example.com/',
44+
'protocol': 'http:',
45+
'host': 'www.example.com',
46+
'hostname': 'www.example.com',
47+
'pathname': '/'
48+
49+
},
50+
'http://user:pw@www.ExAmPlE.com/' : {
51+
'href': 'http://user:pw@www.example.com/',
52+
'protocol': 'http:',
53+
'auth': 'user:pw',
54+
'host': 'user:pw@www.example.com',
55+
'hostname': 'www.example.com',
56+
'pathname': '/'
57+
58+
},
59+
'http://USER:PW@www.ExAmPlE.com/' : {
60+
'href': 'http://USER:PW@www.example.com/',
61+
'protocol': 'http:',
62+
'auth': 'USER:PW',
63+
'host': 'USER:PW@www.example.com',
64+
'hostname': 'www.example.com',
65+
'pathname': '/'
66+
},
67+
'http://x.com/path?that\'s#all, folks' : {
68+
'href': 'http://x.com/path?that%27s#all,',
69+
'protocol': 'http:',
70+
'host': 'x.com',
71+
'hostname': 'x.com',
72+
'search': '?that%27s',
73+
'query': 'that%27s',
74+
'pathname': '/path',
75+
'hash': '#all,'
76+
},
77+
'HTTP://X.COM/Y' : {
78+
'href': 'http://x.com/Y',
79+
'protocol': 'http:',
80+
'host': 'x.com',
81+
'hostname': 'x.com',
82+
'pathname': '/Y',
83+
},
84+
// an unexpected invalid char in the hostname.
85+
'HtTp://x.y.cOm*a/b/c?d=e#f g<h>i' : {
86+
'href': 'http://x.y.com/*a/b/c?d=e#f',
87+
'protocol': 'http:',
88+
'host': 'x.y.com',
89+
'hostname': 'x.y.com',
90+
'pathname': '/*a/b/c',
91+
'search': '?d=e',
92+
'query': 'd=e',
93+
'hash': '#f'
94+
},
95+
// make sure that we don't accidentally lowercase the path parts.
96+
'HtTp://x.y.cOm*A/b/c?d=e#f g<h>i' : {
97+
'href': 'http://x.y.com/*A/b/c?d=e#f',
98+
'protocol': 'http:',
99+
'host': 'x.y.com',
100+
'hostname': 'x.y.com',
101+
'pathname': '/*A/b/c',
102+
'search': '?d=e',
103+
'query': 'd=e',
104+
'hash': '#f'
105+
},
106+
'http://x...y...#p': {
107+
'href': 'http://x...y.../#p',
108+
'protocol': 'http:',
109+
'host': 'x...y...',
110+
'hostname': 'x...y...',
111+
'hash': '#p',
112+
'pathname': '/'
113+
},
114+
'http://x/p/"quoted"': {
115+
'href': 'http://x/p/',
116+
'protocol':'http:',
117+
'host': 'x',
118+
'hostname': 'x',
119+
'pathname': '/p/'
120+
},
121+
'<http://goo.corn/bread> Is a URL!': {
122+
'href': 'http://goo.corn/bread',
123+
'protocol': 'http:',
124+
'host': 'goo.corn',
125+
'hostname': 'goo.corn',
126+
'pathname': '/bread'
127+
},
35128
'http://www.narwhaljs.org/blog/categories?id=news' : {
36129
'href': 'http://www.narwhaljs.org/blog/categories?id=news',
37130
'protocol': 'http:',
@@ -58,17 +151,18 @@ var parseTests = {
58151
'query': '??&hl=en&src=api&x=2&y=2&z=3&s=',
59152
'pathname': '/vt/lyrs=m@114'
60153
},
61-
'http://user:pass@mt0.google.com/vt/lyrs=m@114???&hl=en&src=api&x=2&y=2&z=3&s=' : {
62-
'href': 'http://user:pass@mt0.google.com/vt/lyrs=m@114???' +
63-
'&hl=en&src=api&x=2&y=2&z=3&s=',
64-
'protocol': 'http:',
65-
'host': 'user:pass@mt0.google.com',
66-
'auth': 'user:pass',
67-
'hostname': 'mt0.google.com',
68-
'search': '???&hl=en&src=api&x=2&y=2&z=3&s=',
69-
'query': '??&hl=en&src=api&x=2&y=2&z=3&s=',
70-
'pathname': '/vt/lyrs=m@114'
71-
},
154+
'http://user:pass@mt0.google.com/vt/lyrs=m@114???&hl=en&src=api&x=2&y=2&z=3&s=':
155+
{
156+
'href': 'http://user:pass@mt0.google.com/vt/lyrs=m@114???' +
157+
'&hl=en&src=api&x=2&y=2&z=3&s=',
158+
'protocol': 'http:',
159+
'host': 'user:pass@mt0.google.com',
160+
'auth': 'user:pass',
161+
'hostname': 'mt0.google.com',
162+
'search': '???&hl=en&src=api&x=2&y=2&z=3&s=',
163+
'query': '??&hl=en&src=api&x=2&y=2&z=3&s=',
164+
'pathname': '/vt/lyrs=m@114'
165+
},
72166
'file:///etc/passwd' : {
73167
'href': 'file:///etc/passwd',
74168
'protocol': 'file:',
@@ -154,7 +248,7 @@ for (var u in parseTests) {
154248
'parse(' + u + ').' + i + ' == ' + e + '\nactual: ' + a);
155249
}
156250

157-
var expected = u,
251+
var expected = parseTests[u].href,
158252
actual = url.format(parseTests[u]);
159253

160254
assert.equal(expected, actual,

0 commit comments

Comments
 (0)
This repository has been archived.