Skip to content

Commit 023856c

Browse files
Rollup merge of rust-lang#87580 - ChrisDenton:win-arg-parse-2008, r=m-ou-se
Update Windows Argument Parsing

Fixes rust-lang#44650

The Windows command line is passed to applications [as a single string](https://docs.microsoft.com/en-us/archive/blogs/larryosterman/the-windows-command-line-is-just-a-string) which the application then parses to get a list of arguments. The standard rules (as used by C/C++) for parsing the command line have slightly changed over the years, most recently in 2008, which added new escaping rules.

This PR implements the new rules as [described on MSDN](https://docs.microsoft.com/en-us/cpp/cpp/main-function-command-line-args?view=msvc-160#parsing-c-command-line-arguments) and [further detailed here](https://daviddeley.com/autohotkey/parameters/parameters.htm#WIN). It has been tested against the behaviour of C++ by calling a C++ program that outputs its raw command line and the contents of `argv`. See [my repo](https://github.com/ChrisDenton/winarg/tree/std) if anyone wants to reproduce my work.

For an overview of how this PR changes argument parsing behavior and why we feel it is warranted, see rust-lang#87580 (comment). For some examples, see: rust-lang#87580 (comment).
2 parents 7eb6f19 + e26dda5 commit 023856c

File tree

4 files changed

+201
-123
lines changed

4 files changed

+201
-123
lines changed

library/std/src/lib.rs

+1
Original file line numberDiff line numberDiff line change
@@ -253,6 +253,7 @@
253253
#![feature(const_ip)]
254254
#![feature(const_ipv4)]
255255
#![feature(const_ipv6)]
256+
#![feature(const_option)]
256257
#![feature(const_raw_ptr_deref)]
257258
#![feature(const_socketaddr)]
258259
#![feature(const_trait_impl)]

library/std/src/sys/windows/args.rs

+152-105
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,30 @@
1-
#![allow(dead_code)] // runtime init functions not used during testing
1+
//! The Windows command line is just a string
2+
//! <https://docs.microsoft.com/en-us/archive/blogs/larryosterman/the-windows-command-line-is-just-a-string>
3+
//!
4+
//! This module implements the parsing necessary to turn that string into a list of arguments.
25
36
#[cfg(test)]
47
mod tests;
58

69
use crate::ffi::OsString;
710
use crate::fmt;
11+
use crate::marker::PhantomData;
12+
use crate::num::NonZeroU16;
813
use crate::os::windows::prelude::*;
914
use crate::path::PathBuf;
10-
use crate::slice;
15+
use crate::ptr::NonNull;
1116
use crate::sys::c;
1217
use crate::sys::windows::os::current_exe;
1318
use crate::vec;
1419

1520
use core::iter;
1621

1722
pub fn args() -> Args {
23+
// SAFETY: `GetCommandLineW` returns a pointer to a null terminated UTF-16
24+
// string so it's safe for `WStrUnits` to use.
1825
unsafe {
1926
let lp_cmd_line = c::GetCommandLineW();
20-
let parsed_args_list = parse_lp_cmd_line(lp_cmd_line as *const u16, || {
27+
let parsed_args_list = parse_lp_cmd_line(WStrUnits::new(lp_cmd_line), || {
2128
current_exe().map(PathBuf::into_os_string).unwrap_or_else(|_| OsString::new())
2229
});
2330

@@ -28,129 +35,120 @@ pub fn args() -> Args {
2835
/// Implements the Windows command-line argument parsing algorithm.
2936
///
3037
/// Microsoft's documentation for the Windows CLI argument format can be found at
31-
/// <https://docs.microsoft.com/en-us/previous-versions//17w5ykft(v=vs.85)>.
38+
/// <https://docs.microsoft.com/en-us/cpp/cpp/main-function-command-line-args?view=msvc-160#parsing-c-command-line-arguments>
3239
///
33-
/// Windows includes a function to do this in shell32.dll,
34-
/// but linking with that DLL causes the process to be registered as a GUI application.
40+
/// A more in-depth explanation is here:
41+
/// <https://daviddeley.com/autohotkey/parameters/parameters.htm#WIN>
42+
///
43+
/// Windows includes a function to do command line parsing in shell32.dll.
44+
/// However, this is not used for two reasons:
45+
///
46+
/// 1. Linking with that DLL causes the process to be registered as a GUI application.
3547
/// GUI applications add a bunch of overhead, even if no windows are drawn. See
3648
/// <https://randomascii.wordpress.com/2018/12/03/a-not-called-function-can-cause-a-5x-slowdown/>.
3749
///
38-
/// This function was tested for equivalence to the shell32.dll implementation in
39-
/// Windows 10 Pro v1803, using an exhaustive test suite available at
40-
/// <https://gist.github.com/notriddle/dde431930c392e428055b2dc22e638f5> or
41-
/// <https://paste.gg/p/anonymous/47d6ed5f5bd549168b1c69c799825223>.
42-
unsafe fn parse_lp_cmd_line<F: Fn() -> OsString>(
43-
lp_cmd_line: *const u16,
50+
/// 2. It does not follow the modern C/C++ argv rules outlined in the first two links above.
51+
///
52+
/// This function was tested for equivalence to the C/C++ parsing rules using an
53+
/// extensive test suite available at
54+
/// <https://github.com/ChrisDenton/winarg/tree/std>.
55+
fn parse_lp_cmd_line<'a, F: Fn() -> OsString>(
56+
lp_cmd_line: Option<WStrUnits<'a>>,
4457
exe_name: F,
4558
) -> Vec<OsString> {
46-
const BACKSLASH: u16 = '\\' as u16;
47-
const QUOTE: u16 = '"' as u16;
48-
const TAB: u16 = '\t' as u16;
49-
const SPACE: u16 = ' ' as u16;
59+
const BACKSLASH: NonZeroU16 = NonZeroU16::new(b'\\' as u16).unwrap();
60+
const QUOTE: NonZeroU16 = NonZeroU16::new(b'"' as u16).unwrap();
61+
const TAB: NonZeroU16 = NonZeroU16::new(b'\t' as u16).unwrap();
62+
const SPACE: NonZeroU16 = NonZeroU16::new(b' ' as u16).unwrap();
63+
5064
let mut ret_val = Vec::new();
51-
if lp_cmd_line.is_null() || *lp_cmd_line == 0 {
65+
// If the cmd line pointer is null or it points to an empty string then
66+
// return the name of the executable as argv[0].
67+
if lp_cmd_line.as_ref().and_then(|cmd| cmd.peek()).is_none() {
5268
ret_val.push(exe_name());
5369
return ret_val;
5470
}
55-
let mut cmd_line = {
56-
let mut end = 0;
57-
while *lp_cmd_line.offset(end) != 0 {
58-
end += 1;
59-
}
60-
slice::from_raw_parts(lp_cmd_line, end as usize)
61-
};
71+
let mut code_units = lp_cmd_line.unwrap();
72+
6273
// The executable name at the beginning is special.
63-
cmd_line = match cmd_line[0] {
64-
// The executable name ends at the next quote mark,
65-
// no matter what.
66-
QUOTE => {
67-
let args = {
68-
let mut cut = cmd_line[1..].splitn(2, |&c| c == QUOTE);
69-
if let Some(exe) = cut.next() {
70-
ret_val.push(OsString::from_wide(exe));
71-
}
72-
cut.next()
73-
};
74-
if let Some(args) = args {
75-
args
76-
} else {
77-
return ret_val;
78-
}
79-
}
80-
// Implement quirk: when they say whitespace here,
81-
// they include the entire ASCII control plane:
82-
// "However, if lpCmdLine starts with any amount of whitespace, CommandLineToArgvW
83-
// will consider the first argument to be an empty string. Excess whitespace at the
84-
// end of lpCmdLine is ignored."
85-
0..=SPACE => {
86-
ret_val.push(OsString::new());
87-
&cmd_line[1..]
88-
}
89-
// The executable name ends at the next whitespace,
90-
// no matter what.
91-
_ => {
92-
let args = {
93-
let mut cut = cmd_line.splitn(2, |&c| c > 0 && c <= SPACE);
94-
if let Some(exe) = cut.next() {
95-
ret_val.push(OsString::from_wide(exe));
96-
}
97-
cut.next()
98-
};
99-
if let Some(args) = args {
100-
args
101-
} else {
102-
return ret_val;
103-
}
74+
let mut in_quotes = false;
75+
let mut cur = Vec::new();
76+
for w in &mut code_units {
77+
match w {
78+
// A quote mark always toggles `in_quotes` no matter what because
79+
// there are no escape characters when parsing the executable name.
80+
QUOTE => in_quotes = !in_quotes,
81+
// If not `in_quotes` then whitespace ends argv[0].
82+
SPACE | TAB if !in_quotes => break,
83+
// In all other cases the code unit is taken literally.
84+
_ => cur.push(w.get()),
10485
}
105-
};
86+
}
87+
// Skip whitespace.
88+
code_units.advance_while(|w| w == SPACE || w == TAB);
89+
ret_val.push(OsString::from_wide(&cur));
90+
91+
// Parse the arguments according to these rules:
92+
// * All code units are taken literally except space, tab, quote and backslash.
93+
// * When not `in_quotes`, space and tab separate arguments. Consecutive spaces and tabs are
94+
// treated as a single separator.
95+
// * A space or tab `in_quotes` is taken literally.
96+
// * A quote toggles `in_quotes` mode unless it's escaped. An escaped quote is taken literally.
97+
// * A quote can be escaped if preceded by an odd number of backslashes.
98+
// * If any number of backslashes is immediately followed by a quote then the number of
99+
// backslashes is halved (rounding down).
100+
// * Backslashes not followed by a quote are all taken literally.
101+
// * If `in_quotes` then a quote can also be escaped using another quote
102+
// (i.e. two consecutive quotes become one literal quote).
106103
let mut cur = Vec::new();
107104
let mut in_quotes = false;
108-
let mut was_in_quotes = false;
109-
let mut backslash_count: usize = 0;
110-
for &c in cmd_line {
111-
match c {
112-
// backslash
113-
BACKSLASH => {
114-
backslash_count += 1;
115-
was_in_quotes = false;
105+
while let Some(w) = code_units.next() {
106+
match w {
107+
// If not `in_quotes`, a space or tab ends the argument.
108+
SPACE | TAB if !in_quotes => {
109+
ret_val.push(OsString::from_wide(&cur[..]));
110+
cur.truncate(0);
111+
112+
// Skip whitespace.
113+
code_units.advance_while(|w| w == SPACE || w == TAB);
116114
}
117-
QUOTE if backslash_count % 2 == 0 => {
118-
cur.extend(iter::repeat(b'\\' as u16).take(backslash_count / 2));
119-
backslash_count = 0;
120-
if was_in_quotes {
121-
cur.push('"' as u16);
122-
was_in_quotes = false;
115+
// Backslashes can escape quotes or backslashes but only if consecutive backslashes are followed by a quote.
116+
BACKSLASH => {
117+
let backslash_count = code_units.advance_while(|w| w == BACKSLASH) + 1;
118+
if code_units.peek() == Some(QUOTE) {
119+
cur.extend(iter::repeat(BACKSLASH.get()).take(backslash_count / 2));
120+
// The quote is escaped if there are an odd number of backslashes.
121+
if backslash_count % 2 == 1 {
122+
code_units.next();
123+
cur.push(QUOTE.get());
124+
}
123125
} else {
124-
was_in_quotes = in_quotes;
125-
in_quotes = !in_quotes;
126+
// If there is no quote on the end then there is no escaping.
127+
cur.extend(iter::repeat(BACKSLASH.get()).take(backslash_count));
126128
}
127129
}
128-
QUOTE if backslash_count % 2 != 0 => {
129-
cur.extend(iter::repeat(b'\\' as u16).take(backslash_count / 2));
130-
backslash_count = 0;
131-
was_in_quotes = false;
132-
cur.push(b'"' as u16);
133-
}
134-
SPACE | TAB if !in_quotes => {
135-
cur.extend(iter::repeat(b'\\' as u16).take(backslash_count));
136-
if !cur.is_empty() || was_in_quotes {
137-
ret_val.push(OsString::from_wide(&cur[..]));
138-
cur.truncate(0);
130+
// If `in_quotes` and not backslash escaped (see above) then a quote either
131+
// unsets `in_quote` or is escaped by another quote.
132+
QUOTE if in_quotes => match code_units.peek() {
133+
// Two consecutive quotes when `in_quotes` produces one literal quote.
134+
Some(QUOTE) => {
135+
cur.push(QUOTE.get());
136+
code_units.next();
139137
}
140-
backslash_count = 0;
141-
was_in_quotes = false;
142-
}
143-
_ => {
144-
cur.extend(iter::repeat(b'\\' as u16).take(backslash_count));
145-
backslash_count = 0;
146-
was_in_quotes = false;
147-
cur.push(c);
148-
}
138+
// Otherwise set `in_quotes`.
139+
Some(_) => in_quotes = false,
140+
// The end of the command line.
141+
// Push `cur` even if empty, which we do by breaking while `in_quotes` is still set.
142+
None => break,
143+
},
144+
// If not `in_quotes` and not BACKSLASH escaped (see above) then a quote sets `in_quote`.
145+
QUOTE => in_quotes = true,
146+
// Everything else is always taken literally.
147+
_ => cur.push(w.get()),
149148
}
150149
}
151-
cur.extend(iter::repeat(b'\\' as u16).take(backslash_count));
152-
// include empty quoted strings at the end of the arguments list
153-
if !cur.is_empty() || was_in_quotes || in_quotes {
150+
// Push the final argument, if any.
151+
if !cur.is_empty() || in_quotes {
154152
ret_val.push(OsString::from_wide(&cur[..]));
155153
}
156154
ret_val
@@ -187,3 +185,52 @@ impl ExactSizeIterator for Args {
187185
self.parsed_args_list.len()
188186
}
189187
}
188+
189+
/// A safe iterator over a LPWSTR
190+
/// (aka a pointer to a series of UTF-16 code units terminated by a NULL).
191+
struct WStrUnits<'a> {
192+
// The pointer must never be null...
193+
lpwstr: NonNull<u16>,
194+
// ...and the memory it points to must be valid for this lifetime.
195+
lifetime: PhantomData<&'a [u16]>,
196+
}
197+
impl WStrUnits<'_> {
198+
/// Create the iterator. Returns `None` if `lpwstr` is null.
199+
///
200+
/// SAFETY: `lpwstr` must point to a null-terminated wide string that lives
201+
/// at least as long as the lifetime of this struct.
202+
unsafe fn new(lpwstr: *const u16) -> Option<Self> {
203+
Some(Self { lpwstr: NonNull::new(lpwstr as _)?, lifetime: PhantomData })
204+
}
205+
fn peek(&self) -> Option<NonZeroU16> {
206+
// SAFETY: It's always safe to read the current item because we don't
207+
// ever move out of the array's bounds.
208+
unsafe { NonZeroU16::new(*self.lpwstr.as_ptr()) }
209+
}
210+
/// Advance the iterator while `predicate` returns true.
211+
/// Returns the number of items it advanced by.
212+
fn advance_while<P: FnMut(NonZeroU16) -> bool>(&mut self, mut predicate: P) -> usize {
213+
let mut counter = 0;
214+
while let Some(w) = self.peek() {
215+
if !predicate(w) {
216+
break;
217+
}
218+
counter += 1;
219+
self.next();
220+
}
221+
counter
222+
}
223+
}
224+
impl Iterator for WStrUnits<'_> {
225+
// This can never return zero as that marks the end of the string.
226+
type Item = NonZeroU16;
227+
fn next(&mut self) -> Option<NonZeroU16> {
228+
// SAFETY: If NULL is reached we immediately return.
229+
// Therefore it's safe to advance the pointer after that.
230+
unsafe {
231+
let next = self.peek()?;
232+
self.lpwstr = NonNull::new_unchecked(self.lpwstr.as_ptr().add(1));
233+
Some(next)
234+
}
235+
}
236+
}

0 commit comments

Comments
 (0)