url: fix C0 control and whitespace handling

TimothyGu · TimothyGu · commit 841bb4c61f07 · 2017-05-19T23:19:57.000-07:00
PR-URL: #12846 Fixes: #12825 Refs: web-platform-tests/wpt#5792 Reviewed-By: James M Snell <jasnell@gmail.com> Reviewed-By: Joyee Cheung <joyeec9h3@gmail.com>
diff --git a/src/node_url.cc b/src/node_url.cc
@@ -133,6 +133,9 @@ enum url_error_cb_args {
 // https://infra.spec.whatwg.org/#ascii-tab-or-newline
 CHAR_TEST(8, IsASCIITabOrNewline, (ch == '\t' || ch == '\n' || ch == '\r'))
 
+// https://infra.spec.whatwg.org/#c0-control-or-space
+CHAR_TEST(8, IsC0ControlOrSpace, (ch >= '\0' && ch <= ' '))
+
 // https://infra.spec.whatwg.org/#ascii-digit
 CHAR_TEST(8, IsASCIIDigit, (ch >= '0' && ch <= '9'))
 
@@ -1134,15 +1137,45 @@ static inline void ShortenUrlPath(struct url_data* url) {
 }
 
 void URL::Parse(const char* input,
-                const size_t len,
+                size_t len,
                 enum url_parse_state state_override,
                 struct url_data* url,
+                bool has_url,
                 const struct url_data* base,
                 bool has_base) {
+  const char* p = input;
+  const char* end = input + len;
+
+  if (!has_url) {
+    for (const char* ptr = p; ptr < end; ptr++) {
+      if (IsC0ControlOrSpace(*ptr))
+        p++;
+      else
+        break;
+    }
+    for (const char* ptr = end - 1; ptr >= p; ptr--) {
+      if (IsC0ControlOrSpace(*ptr))
+        end--;
+      else
+        break;
+    }
+    len = end - p;
+  }
+
+  std::string whitespace_stripped;
+  whitespace_stripped.reserve(len);
+  for (const char* ptr = p; ptr < end; ptr++)
+    if (!IsASCIITabOrNewline(*ptr))
+      whitespace_stripped += *ptr;
+
+  input = whitespace_stripped.c_str();
+  len = whitespace_stripped.size();
+  p = input;
+  end = input + len;
+
   bool atflag = false;
   bool sbflag = false;
   bool uflag = false;
-  int wskip = 0;
 
   std::string buffer;
   url->scheme.reserve(len);
@@ -1159,9 +1192,6 @@ void URL::Parse(const char* input,
   enum url_parse_state state = has_state_override ? state_override :
                                                     kSchemeStart;
 
-  const char* p = input;
-  const char* end = input + len;
-
   if (state < kSchemeStart || state > kFragment) {
     url->flags |= URL_FLAGS_INVALID_PARSE_STATE;
     return;
@@ -1171,18 +1201,6 @@ void URL::Parse(const char* input,
     const char ch = p < end ? p[0] : kEOL;
     const size_t remaining = end == p ? 0 : (end - p - 1);
 
-    if (IsASCIITabOrNewline(ch)) {
-      if (state == kAuthority) {
-        // It's necessary to keep track of how much whitespace
-        // is being ignored when in kAuthority state because of
-        // how the buffer is managed. TODO: See if there's a better
-        // way
-        wskip++;
-      }
-      p++;
-      continue;
-    }
-
     bool special = (url->flags & URL_FLAGS_SPECIAL);
     bool cannot_be_base;
     const bool special_back_slash = (special && ch == '\\');
@@ -1500,7 +1518,7 @@ void URL::Parse(const char* input,
             url->flags |= URL_FLAGS_FAILED;
             return;
           }
-          p -= buffer.size() + 1 + wskip;
+          p -= buffer.size() + 1;
           buffer.clear();
           state = kHost;
         } else {
@@ -1892,16 +1910,17 @@ static void Parse(Environment* env,
   HandleScope handle_scope(isolate);
   Context::Scope context_scope(context);
 
+  const bool has_context = context_obj->IsObject();
   const bool has_base = base_obj->IsObject();
 
   struct url_data base;
   struct url_data url;
-  if (context_obj->IsObject())
+  if (has_context)
     HarvestContext(env, &url, context_obj.As<Object>());
   if (has_base)
     HarvestBase(env, &base, base_obj.As<Object>());
 
-  URL::Parse(input, len, state_override, &url, &base, has_base);
+  URL::Parse(input, len, state_override, &url, has_context, &base, has_base);
   if ((url.flags & URL_FLAGS_INVALID_PARSE_STATE) ||
       ((state_override != kUnknownState) &&
        (url.flags & URL_FLAGS_TERMINATED)))
diff --git a/src/node_url.h b/src/node_url.h
@@ -81,30 +81,35 @@ struct url_data {
 class URL {
  public:
   static void Parse(const char* input,
-                    const size_t len,
+                    size_t len,
                     enum url_parse_state state_override,
                     struct url_data* url,
+                    bool has_url,
                     const struct url_data* base,
                     bool has_base);
 
   URL(const char* input, const size_t len) {
-    Parse(input, len, kUnknownState, &context_, nullptr, false);
+    Parse(input, len, kUnknownState, &context_, false, nullptr, false);
   }
 
   URL(const char* input, const size_t len, const URL* base) {
     if (base != nullptr)
-      Parse(input, len, kUnknownState, &context_, &(base->context_), true);
+      Parse(input, len, kUnknownState,
+            &context_, false,
+            &(base->context_), true);
     else
-      Parse(input, len, kUnknownState, &context_, nullptr, false);
+      Parse(input, len, kUnknownState, &context_, false, nullptr, false);
   }
 
   URL(const char* input, const size_t len,
       const char* base, const size_t baselen) {
     if (base != nullptr && baselen > 0) {
       URL _base(base, baselen);
-      Parse(input, len, kUnknownState, &context_, &(_base.context_), true);
+      Parse(input, len, kUnknownState,
+            &context_, false,
+            &(_base.context_), true);
     } else {
-      Parse(input, len, kUnknownState, &context_, nullptr, false);
+      Parse(input, len, kUnknownState, &context_, false, nullptr, false);
     }
   }
 
diff --git a/test/fixtures/url-tests.js b/test/fixtures/url-tests.js
@@ -1,7 +1,7 @@
 'use strict';
 
 /* WPT Refs:
-   https://github.com/w3c/web-platform-tests/blob/28541bb/url/urltestdata.json
+   https://github.com/w3c/web-platform-tests/blob/0f26c418a5/url/urltestdata.json
    License: http://www.w3.org/Consortium/Legal/2008/04-testsuite-copyright.html
 */
 module.exports =
@@ -3566,6 +3566,22 @@ module.exports =
     "search": "",
     "hash": ""
   },
+  "Leading and trailing C0 control or space",
+  {
+    "input": "\u0000\u001b\u0004\u0012 http://example.com/\u001f \u000d ",
+    "base": "about:blank",
+    "href": "http://example.com/",
+    "origin": "http://example.com",
+    "protocol": "http:",
+    "username": "",
+    "password": "",
+    "host": "example.com",
+    "hostname": "example.com",
+    "port": "",
+    "pathname": "/",
+    "search": "",
+    "hash": ""
+  },
   "Ideographic full stop (full-width period for Chinese, etc.) should be treated as a dot. U+3002 is mapped to U+002E (dot)",
   {
     "input": "http://www.foo。bar.com",
@@ -5487,6 +5503,20 @@ module.exports =
     "search": "",
     "hash": ""
   },
+  {
+    "input": "C|\n/",
+    "base": "file://host/dir/file",
+    "href": "file:///C:/",
+    "protocol": "file:",
+    "username": "",
+    "password": "",
+    "host": "",
+    "hostname": "",
+    "port": "",
+    "pathname": "/C:/",
+    "search": "",
+    "hash": ""
+  },
   {
     "input": "C|\\",
     "base": "file://host/dir/file",