@@ -24,25 +24,40 @@ exports.resolve = urlResolve;
24
24
exports . resolveObject = urlResolveObject ;
25
25
exports . format = urlFormat ;
26
26
27
+ // Reference: RFC 3986, RFC 1808, RFC 2396
28
+
27
29
// define these here so at least they only have to be
28
30
// compiled once on the first module load.
29
- var protocolPattern = / ^ ( [ a - z 0 - 9 ] + : ) / ,
31
+ var protocolPattern = / ^ ( [ a - z 0 - 9 ] + : ) / i ,
30
32
portPattern = / : [ 0 - 9 ] + $ / ,
31
- delims = [ '<' , '>' , '"' , '\'' , '`' , / \s / ] ,
33
+ // RFC 2396: characters reserved for delimiting URLs.
34
+ delims = [ '<' , '>' , '"' , '`' , ' ' , '\r' , '\n' , '\t' ] ,
35
+ // RFC 2396: characters not allowed for various reasons.
32
36
unwise = [ '{' , '}' , '|' , '\\' , '^' , '~' , '[' , ']' , '`' ] . concat ( delims ) ,
33
- nonHostChars = [ '/' , '?' , ';' , '#' ] . concat ( unwise ) ,
37
+ // Allowed by RFCs, but a cause of XSS attacks. Always escape these.
38
+ autoEscape = [ '\'' ] ,
39
+ // Characters that are never ever allowed in a hostname.
40
+ // Note that any invalid chars are also handled, but these
41
+ // are the ones that are *expected* to be seen, so we fast-path
42
+ // them.
43
+ nonHostChars = [ '%' , '/' , '?' , ';' , '#' ]
44
+ . concat ( unwise ) . concat ( autoEscape ) ,
34
45
hostnameMaxLen = 255 ,
35
- hostnamePartPattern = / ^ [ a - z 0 - 9 ] [ a - z 0 - 9 A - Z - ] { 0 , 62 } $ / ,
46
+ hostnamePartPattern = / ^ [ a - z A - Z 0 - 9 ] [ a - z 0 - 9 A - Z - ] { 0 , 62 } $ / ,
47
+ hostnamePartStart = / ^ ( [ a - z A - Z 0 - 9 ] [ a - z 0 - 9 A - Z - ] { 0 , 62 } ) ( .* ) $ / ,
48
+ // protocols that can allow "unsafe" and "unwise" chars.
36
49
unsafeProtocol = {
37
50
'javascript' : true ,
38
51
'javascript:' : true
39
52
} ,
53
+ // protocols that never have a hostname.
40
54
hostlessProtocol = {
41
55
'javascript' : true ,
42
56
'javascript:' : true ,
43
57
'file' : true ,
44
58
'file:' : true
45
59
} ,
60
+ // protocols that always have a path component.
46
61
pathedProtocol = {
47
62
'http' : true ,
48
63
'https' : true ,
@@ -54,6 +69,7 @@ var protocolPattern = /^([a-z0-9]+:)/,
54
69
'gopher:' : true ,
55
70
'file:' : true
56
71
} ,
72
+ // protocols that always contain a // bit.
57
73
slashedProtocol = {
58
74
'http' : true ,
59
75
'https' : true ,
@@ -74,10 +90,19 @@ function urlParse(url, parseQueryString, slashesDenoteHost) {
74
90
var out = { } ,
75
91
rest = url ;
76
92
93
+ // cut off any delimiters.
94
+ // This is to support parsing stuff like "<http://foo.com>"
95
+ for ( var i = 0 , l = rest . length ; i < l ; i ++ ) {
96
+ if ( delims . indexOf ( rest . charAt ( i ) ) === - 1 ) break ;
97
+ }
98
+ if ( i !== 0 ) rest = rest . substr ( i ) ;
99
+
100
+
77
101
var proto = protocolPattern . exec ( rest ) ;
78
102
if ( proto ) {
79
103
proto = proto [ 0 ] ;
80
- out . protocol = proto ;
104
+ var lowerProto = proto . toLowerCase ( ) ;
105
+ out . protocol = lowerProto ;
81
106
rest = rest . substr ( proto . length ) ;
82
107
}
83
108
@@ -119,6 +144,7 @@ function urlParse(url, parseQueryString, slashesDenoteHost) {
119
144
var key = keys [ i ] ;
120
145
out [ key ] = p [ key ] ;
121
146
}
147
+
122
148
// we've indicated that there is a hostname,
123
149
// so even if it's empty, it has to be present.
124
150
out . hostname = out . hostname || '' ;
@@ -130,17 +156,49 @@ function urlParse(url, parseQueryString, slashesDenoteHost) {
130
156
var hostparts = out . hostname . split ( / \. / ) ;
131
157
for ( var i = 0 , l = hostparts . length ; i < l ; i ++ ) {
132
158
var part = hostparts [ i ] ;
159
+ if ( ! part ) continue ;
133
160
if ( ! part . match ( hostnamePartPattern ) ) {
134
- out . hostname = '' ;
161
+ var validParts = hostparts . slice ( 0 , i ) ;
162
+ var notHost = hostparts . slice ( i + 1 ) ;
163
+ var bit = part . match ( hostnamePartStart ) ;
164
+ if ( bit ) {
165
+ validParts . push ( bit [ 1 ] ) ;
166
+ notHost . unshift ( bit [ 2 ] ) ;
167
+ }
168
+ if ( notHost . length ) {
169
+ rest = '/' + notHost . join ( '.' ) + rest
170
+ }
171
+ out . hostname = validParts . join ( '.' ) ;
135
172
break ;
136
173
}
137
174
}
138
175
}
176
+ // hostnames are always lower case.
177
+ out . hostname = out . hostname . toLowerCase ( ) ;
178
+
179
+ out . host = ( ( out . auth ) ? out . auth + '@' : '' ) +
180
+ ( out . hostname || '' ) +
181
+ ( ( out . port ) ? ':' + out . port : '' ) ;
182
+ out . href += out . host ;
139
183
}
140
184
141
185
// now rest is set to the post-host stuff.
142
186
// chop off any delim chars.
143
- if ( ! unsafeProtocol [ proto ] ) {
187
+ if ( ! unsafeProtocol [ lowerProto ] ) {
188
+
189
+ // First, make 100% sure that any "autoEscape" chars get
190
+ // escaped, even if encodeURIComponent doesn't think they
191
+ // need to be.
192
+ for ( var i = 0 , l = autoEscape . length ; i < l ; i ++ ) {
193
+ var ae = autoEscape [ i ] ;
194
+ var esc = encodeURIComponent ( ae ) ;
195
+ if ( esc === ae ) {
196
+ esc = escape ( ae ) ;
197
+ }
198
+ rest = rest . split ( ae ) . join ( esc ) ;
199
+ }
200
+
201
+ // Now make sure that delims never appear in a url.
144
202
var chop = rest . length ;
145
203
for ( var i = 0 , l = delims . length ; i < l ; i ++ ) {
146
204
var c = rest . indexOf ( delims [ i ] ) ;
0 commit comments