1
- #![ allow( dead_code) ] // runtime init functions not used during testing
1
+ //! The Windows command line is just a string
2
+ //! <https://docs.microsoft.com/en-us/archive/blogs/larryosterman/the-windows-command-line-is-just-a-string>
3
+ //!
4
+ //! This module implements the parsing necessary to turn that string into a list of arguments.
2
5
3
6
#[ cfg( test) ]
4
7
mod tests;
5
8
6
9
use crate :: ffi:: OsString ;
7
10
use crate :: fmt;
11
+ use crate :: marker:: PhantomData ;
12
+ use crate :: num:: NonZeroU16 ;
8
13
use crate :: os:: windows:: prelude:: * ;
9
14
use crate :: path:: PathBuf ;
10
- use crate :: slice ;
15
+ use crate :: ptr :: NonNull ;
11
16
use crate :: sys:: c;
12
17
use crate :: sys:: windows:: os:: current_exe;
13
18
use crate :: vec;
14
19
15
20
use core:: iter;
16
21
17
22
pub fn args ( ) -> Args {
23
+ // SAFETY: `GetCommandLineW` returns a pointer to a null terminated UTF-16
24
+ // string so it's safe for `WStrUnits` to use.
18
25
unsafe {
19
26
let lp_cmd_line = c:: GetCommandLineW ( ) ;
20
- let parsed_args_list = parse_lp_cmd_line ( lp_cmd_line as * const u16 , || {
27
+ let parsed_args_list = parse_lp_cmd_line ( WStrUnits :: new ( lp_cmd_line) , || {
21
28
current_exe ( ) . map ( PathBuf :: into_os_string) . unwrap_or_else ( |_| OsString :: new ( ) )
22
29
} ) ;
23
30
@@ -28,129 +35,120 @@ pub fn args() -> Args {
28
35
/// Implements the Windows command-line argument parsing algorithm.
29
36
///
30
37
/// Microsoft's documentation for the Windows CLI argument format can be found at
31
- /// <https://docs.microsoft.com/en-us/previous-versions//17w5ykft(v=vs.85)>.
38
+ /// <https://docs.microsoft.com/en-us/cpp/cpp/main-function-command-line-args?view=msvc-160#parsing-c-command-line-arguments>
32
39
///
33
- /// Windows includes a function to do this in shell32.dll,
34
- /// but linking with that DLL causes the process to be registered as a GUI application.
40
+ /// A more in-depth explanation is here:
41
+ /// <https://daviddeley.com/autohotkey/parameters/parameters.htm#WIN>
42
+ ///
43
+ /// Windows includes a function to do command line parsing in shell32.dll.
44
+ /// However, this is not used for two reasons:
45
+ ///
46
+ /// 1. Linking with that DLL causes the process to be registered as a GUI application.
35
47
/// GUI applications add a bunch of overhead, even if no windows are drawn. See
36
48
/// <https://randomascii.wordpress.com/2018/12/03/a-not-called-function-can-cause-a-5x-slowdown/>.
37
49
///
38
- /// This function was tested for equivalence to the shell32.dll implementation in
39
- /// Windows 10 Pro v1803, using an exhaustive test suite available at
40
- /// <https://gist.github.com/notriddle/dde431930c392e428055b2dc22e638f5> or
41
- /// <https://paste.gg/p/anonymous/47d6ed5f5bd549168b1c69c799825223>.
42
- unsafe fn parse_lp_cmd_line < F : Fn ( ) -> OsString > (
43
- lp_cmd_line : * const u16 ,
50
+ /// 2. It does not follow the modern C/C++ argv rules outlined in the first two links above.
51
+ ///
52
+ /// This function was tested for equivalence to the C/C++ parsing rules using an
53
+ /// extensive test suite available at
54
+ /// <https://github.com/ChrisDenton/winarg/tree/std>.
55
+ fn parse_lp_cmd_line < ' a , F : Fn ( ) -> OsString > (
56
+ lp_cmd_line : Option < WStrUnits < ' a > > ,
44
57
exe_name : F ,
45
58
) -> Vec < OsString > {
46
- const BACKSLASH : u16 = '\\' as u16 ;
47
- const QUOTE : u16 = '"' as u16 ;
48
- const TAB : u16 = '\t' as u16 ;
49
- const SPACE : u16 = ' ' as u16 ;
59
+ const BACKSLASH : NonZeroU16 = NonZeroU16 :: new ( b'\\' as u16 ) . unwrap ( ) ;
60
+ const QUOTE : NonZeroU16 = NonZeroU16 :: new ( b'"' as u16 ) . unwrap ( ) ;
61
+ const TAB : NonZeroU16 = NonZeroU16 :: new ( b'\t' as u16 ) . unwrap ( ) ;
62
+ const SPACE : NonZeroU16 = NonZeroU16 :: new ( b' ' as u16 ) . unwrap ( ) ;
63
+
50
64
let mut ret_val = Vec :: new ( ) ;
51
- if lp_cmd_line. is_null ( ) || * lp_cmd_line == 0 {
65
+ // If the cmd line pointer is null or it points to an empty string then
66
+ // return the name of the executable as argv[0].
67
+ if lp_cmd_line. as_ref ( ) . and_then ( |cmd| cmd. peek ( ) ) . is_none ( ) {
52
68
ret_val. push ( exe_name ( ) ) ;
53
69
return ret_val;
54
70
}
55
- let mut cmd_line = {
56
- let mut end = 0 ;
57
- while * lp_cmd_line. offset ( end) != 0 {
58
- end += 1 ;
59
- }
60
- slice:: from_raw_parts ( lp_cmd_line, end as usize )
61
- } ;
71
+ let mut code_units = lp_cmd_line. unwrap ( ) ;
72
+
62
73
// The executable name at the beginning is special.
63
- cmd_line = match cmd_line[ 0 ] {
64
- // The executable name ends at the next quote mark,
65
- // no matter what.
66
- QUOTE => {
67
- let args = {
68
- let mut cut = cmd_line[ 1 ..] . splitn ( 2 , |& c| c == QUOTE ) ;
69
- if let Some ( exe) = cut. next ( ) {
70
- ret_val. push ( OsString :: from_wide ( exe) ) ;
71
- }
72
- cut. next ( )
73
- } ;
74
- if let Some ( args) = args {
75
- args
76
- } else {
77
- return ret_val;
78
- }
79
- }
80
- // Implement quirk: when they say whitespace here,
81
- // they include the entire ASCII control plane:
82
- // "However, if lpCmdLine starts with any amount of whitespace, CommandLineToArgvW
83
- // will consider the first argument to be an empty string. Excess whitespace at the
84
- // end of lpCmdLine is ignored."
85
- 0 ..=SPACE => {
86
- ret_val. push ( OsString :: new ( ) ) ;
87
- & cmd_line[ 1 ..]
88
- }
89
- // The executable name ends at the next whitespace,
90
- // no matter what.
91
- _ => {
92
- let args = {
93
- let mut cut = cmd_line. splitn ( 2 , |& c| c > 0 && c <= SPACE ) ;
94
- if let Some ( exe) = cut. next ( ) {
95
- ret_val. push ( OsString :: from_wide ( exe) ) ;
96
- }
97
- cut. next ( )
98
- } ;
99
- if let Some ( args) = args {
100
- args
101
- } else {
102
- return ret_val;
103
- }
74
+ let mut in_quotes = false ;
75
+ let mut cur = Vec :: new ( ) ;
76
+ for w in & mut code_units {
77
+ match w {
78
+ // A quote mark always toggles `in_quotes` no matter what because
79
+ // there are no escape characters when parsing the executable name.
80
+ QUOTE => in_quotes = !in_quotes,
81
+ // If not `in_quotes` then whitespace ends argv[0].
82
+ SPACE | TAB if !in_quotes => break ,
83
+ // In all other cases the code unit is taken literally.
84
+ _ => cur. push ( w. get ( ) ) ,
104
85
}
105
- } ;
86
+ }
87
+ // Skip whitespace.
88
+ code_units. advance_while ( |w| w == SPACE || w == TAB ) ;
89
+ ret_val. push ( OsString :: from_wide ( & cur) ) ;
90
+
91
+ // Parse the arguments according to these rules:
92
+ // * All code units are taken literally except space, tab, quote and backslash.
93
+ // * When not `in_quotes`, space and tab separate arguments. Consecutive spaces and tabs are
94
+ // treated as a single separator.
95
+ // * A space or tab `in_quotes` is taken literally.
96
+ // * A quote toggles `in_quotes` mode unless it's escaped. An escaped quote is taken literally.
97
+ // * A quote can be escaped if preceded by an odd number of backslashes.
98
+ // * If any number of backslashes is immediately followed by a quote then the number of
99
+ // backslashes is halved (rounding down).
100
+ // * Backslashes not followed by a quote are all taken literally.
101
+ // * If `in_quotes` then a quote can also be escaped using another quote
102
+ // (i.e. two consecutive quotes become one literal quote).
106
103
let mut cur = Vec :: new ( ) ;
107
104
let mut in_quotes = false ;
108
- let mut was_in_quotes = false ;
109
- let mut backslash_count: usize = 0 ;
110
- for & c in cmd_line {
111
- match c {
112
- // backslash
113
- BACKSLASH => {
114
- backslash_count += 1 ;
115
- was_in_quotes = false ;
105
+ while let Some ( w) = code_units. next ( ) {
106
+ match w {
107
+ // If not `in_quotes`, a space or tab ends the argument.
108
+ SPACE | TAB if !in_quotes => {
109
+ ret_val. push ( OsString :: from_wide ( & cur[ ..] ) ) ;
110
+ cur. truncate ( 0 ) ;
111
+
112
+ // Skip whitespace.
113
+ code_units. advance_while ( |w| w == SPACE || w == TAB ) ;
116
114
}
117
- QUOTE if backslash_count % 2 == 0 => {
118
- cur. extend ( iter:: repeat ( b'\\' as u16 ) . take ( backslash_count / 2 ) ) ;
119
- backslash_count = 0 ;
120
- if was_in_quotes {
121
- cur. push ( '"' as u16 ) ;
122
- was_in_quotes = false ;
115
+ // Backslashes can escape quotes or backslashes but only if consecutive backslashes are followed by a quote.
116
+ BACKSLASH => {
117
+ let backslash_count = code_units. advance_while ( |w| w == BACKSLASH ) + 1 ;
118
+ if code_units. peek ( ) == Some ( QUOTE ) {
119
+ cur. extend ( iter:: repeat ( BACKSLASH . get ( ) ) . take ( backslash_count / 2 ) ) ;
120
+ // The quote is escaped if there are an odd number of backslashes.
121
+ if backslash_count % 2 == 1 {
122
+ code_units. next ( ) ;
123
+ cur. push ( QUOTE . get ( ) ) ;
124
+ }
123
125
} else {
124
- was_in_quotes = in_quotes ;
125
- in_quotes = !in_quotes ;
126
+ // If there is no quote on the end then there is no escaping.
127
+ cur . extend ( iter :: repeat ( BACKSLASH . get ( ) ) . take ( backslash_count ) ) ;
126
128
}
127
129
}
128
- QUOTE if backslash_count % 2 != 0 => {
129
- cur. extend ( iter:: repeat ( b'\\' as u16 ) . take ( backslash_count / 2 ) ) ;
130
- backslash_count = 0 ;
131
- was_in_quotes = false ;
132
- cur. push ( b'"' as u16 ) ;
133
- }
134
- SPACE | TAB if !in_quotes => {
135
- cur. extend ( iter:: repeat ( b'\\' as u16 ) . take ( backslash_count) ) ;
136
- if !cur. is_empty ( ) || was_in_quotes {
137
- ret_val. push ( OsString :: from_wide ( & cur[ ..] ) ) ;
138
- cur. truncate ( 0 ) ;
130
+ // If `in_quotes` and not backslash escaped (see above) then a quote either
131
+ // unsets `in_quotes` or is escaped by another quote.
132
+ QUOTE if in_quotes => match code_units. peek ( ) {
133
+ // Two consecutive quotes when `in_quotes` produces one literal quote.
134
+ Some ( QUOTE ) => {
135
+ cur. push ( QUOTE . get ( ) ) ;
136
+ code_units. next ( ) ;
139
137
}
140
- backslash_count = 0 ;
141
- was_in_quotes = false ;
142
- }
143
- _ => {
144
- cur. extend ( iter:: repeat ( b'\\' as u16 ) . take ( backslash_count) ) ;
145
- backslash_count = 0 ;
146
- was_in_quotes = false ;
147
- cur. push ( c) ;
148
- }
138
+ // Otherwise unset `in_quotes`.
139
+ Some ( _) => in_quotes = false ,
140
+ // The end of the command line.
141
+ // Push `cur` even if empty, which we do by breaking while `in_quotes` is still set.
142
+ None => break ,
143
+ } ,
144
+ // If not `in_quotes` and not BACKSLASH escaped (see above) then a quote sets `in_quotes`.
145
+ QUOTE => in_quotes = true ,
146
+ // Everything else is always taken literally.
147
+ _ => cur. push ( w. get ( ) ) ,
149
148
}
150
149
}
151
- cur. extend ( iter:: repeat ( b'\\' as u16 ) . take ( backslash_count) ) ;
152
- // include empty quoted strings at the end of the arguments list
153
- if !cur. is_empty ( ) || was_in_quotes || in_quotes {
150
+ // Push the final argument, if any.
151
+ if !cur. is_empty ( ) || in_quotes {
154
152
ret_val. push ( OsString :: from_wide ( & cur[ ..] ) ) ;
155
153
}
156
154
ret_val
@@ -187,3 +185,52 @@ impl ExactSizeIterator for Args {
187
185
self . parsed_args_list . len ( )
188
186
}
189
187
}
188
+
189
/// A safe iterator over a LPWSTR
/// (aka a pointer to a series of UTF-16 code units terminated by a NULL).
///
/// Iteration yields every code unit up to, but not including, the NUL
/// terminator. Once the terminator is reached the cursor stops moving, so
/// every later call to `next` or `peek` returns `None`.
struct WStrUnits<'a> {
    // The pointer must never be null...
    lpwstr: NonNull<u16>,
    // ...and the memory it points to must be valid for this lifetime.
    lifetime: PhantomData<&'a [u16]>,
}

impl WStrUnits<'_> {
    /// Create the iterator. Returns `None` if `lpwstr` is null.
    ///
    /// # Safety
    ///
    /// `lpwstr` must point to a null-terminated wide string that lives
    /// at least as long as the lifetime of this struct.
    unsafe fn new(lpwstr: *const u16) -> Option<Self> {
        // The pointer is only ever read through; the cast to `*mut` exists
        // solely because `NonNull` wraps a mutable raw pointer.
        Some(Self { lpwstr: NonNull::new(lpwstr as *mut u16)?, lifetime: PhantomData })
    }

    /// Returns the code unit at the current position without advancing,
    /// or `None` if the cursor is on the NUL terminator.
    fn peek(&self) -> Option<NonZeroU16> {
        // SAFETY: It's always safe to read the current item because we don't
        // ever move out of the array's bounds: the cursor starts inside the
        // string (guaranteed by the caller of `new`) and `next` never steps
        // past the terminator.
        unsafe { NonZeroU16::new(*self.lpwstr.as_ptr()) }
    }

    /// Advance the iterator while `predicate` returns true.
    /// Returns the number of items it advanced by.
    ///
    /// The first unit for which `predicate` returns false is left unconsumed.
    fn advance_while<P: FnMut(NonZeroU16) -> bool>(&mut self, mut predicate: P) -> usize {
        let mut counter = 0;
        while let Some(w) = self.peek() {
            if !predicate(w) {
                break;
            }
            counter += 1;
            self.next();
        }
        counter
    }
}

impl Iterator for WStrUnits<'_> {
    // This can never return zero as that marks the end of the string.
    type Item = NonZeroU16;

    fn next(&mut self) -> Option<NonZeroU16> {
        // Return early on the terminator so the cursor never moves past it.
        let next = self.peek()?;
        // SAFETY: `peek` returned a non-zero unit, so the terminator has not
        // been reached yet. Advancing by one therefore lands on another unit
        // of the same string (at worst the terminator itself), which is in
        // bounds and non-null.
        self.lpwstr = unsafe { NonNull::new_unchecked(self.lpwstr.as_ptr().add(1)) };
        Some(next)
    }
}
0 commit comments