@@ -7,6 +7,8 @@ use std::{
7
7
io,
8
8
path:: { Path , PathBuf } ,
9
9
} ;
10
+ use std:: borrow:: Cow ;
11
+ use bstr:: ByteSlice ;
10
12
11
13
pub mod git;
12
14
@@ -171,12 +173,72 @@ pub fn copy_dir_sanitized(
171
173
src_path. display( )
172
174
) ) ;
173
175
} else if ft. is_file ( ) {
174
- std:: fs:: copy ( entry. path ( ) , & dest_path) ?;
176
+ // only obviously non-text files get a pass
177
+ if is_binary_file_extension ( & dest_path) {
178
+ std:: fs:: copy ( & src_path, & dest_path) ?;
179
+ } else {
180
+ let input = std:: fs:: read ( & src_path) ?;
181
+ let output = escape_tricky_unicode ( & input) ;
182
+ if output != input {
183
+ changes. push ( format ! ( "Escaped potentially confusing UTF-8 in '{}'" , src_path. display( ) ) ) ;
184
+ }
185
+ std:: fs:: write ( & dest_path, output) ?;
186
+ }
175
187
} else {
176
188
assert ! ( ft. is_dir( ) ) ;
177
189
let _ = std:: fs:: create_dir ( & dest_path) ;
178
- copy_dir_sanitized ( & entry . path ( ) , & dest_path, changes) ?;
190
+ copy_dir_sanitized ( & src_path , & dest_path, changes) ?;
179
191
}
180
192
}
181
193
Ok ( ( ) )
182
194
}
195
+
196
/// Reports whether `path` ends in an extension from a fixed list of
/// well-known binary formats (archives, images, executables).
///
/// The extension is compared without regard to letter case. A path with no
/// extension, or whose extension is not valid UTF-8, yields `false`.
fn is_binary_file_extension(path: &Path) -> bool {
    const BINARY_EXTENSIONS: [&str; 11] = [
        "bin", "zip", "gz", "xz", "bz2", "jpg", "jpeg", "png", "gif", "exe", "dll",
    ];

    match path.extension().and_then(|ext| ext.to_str()) {
        Some(ext) => BINARY_EXTENSIONS
            .iter()
            .any(|known| ext.eq_ignore_ascii_case(known)),
        None => false,
    }
}
201
+
202
/// Escapes Unicode bidirectional control characters in a byte buffer that may
/// or may not be valid UTF-8.
///
/// Every run of valid UTF-8 is passed through [`escape_tricky_unicode_str`];
/// bytes that are not valid UTF-8 are copied to the output unchanged, so
/// binary content survives untouched.
///
/// Returns the input borrowed when it is pure ASCII (nothing can need
/// escaping), otherwise a freshly built buffer.
fn escape_tricky_unicode(input: &[u8]) -> Cow<'_, [u8]> {
    // ASCII cannot contain any of the multi-byte control characters.
    if input.is_ascii() {
        return Cow::Borrowed(input);
    }

    let mut output = Vec::with_capacity(input.len());
    for chunk in input.utf8_chunks() {
        output.extend_from_slice(escape_tricky_unicode_str(chunk.valid()).as_bytes());
        // Invalid sequences are preserved byte-for-byte.
        output.extend_from_slice(chunk.invalid());
    }
    Cow::Owned(output)
}

/// Replaces each Unicode bidirectional control character in `input` with a
/// visible `\u{xxxx}` escape (lowercase hex, at least four digits).
///
/// The characters covered are U+202A..=U+202E and U+2066..=U+2069, the set
/// named in the advisory linked below. All other characters are kept as-is.
///
/// Returns the input borrowed when it contains none of those characters.
fn escape_tricky_unicode_str(input: &str) -> Cow<'_, str> {
    // https://blog.rust-lang.org/2021/11/01/cve-2021-42574.html
    fn is_bidi_control(ch: char) -> bool {
        matches!(ch, '\u{202A}'..='\u{202E}' | '\u{2066}'..='\u{2069}')
    }

    // Nothing to escape: hand the original back without allocating.
    if input.is_ascii() || !input.contains(is_bidi_control) {
        return Cow::Borrowed(input);
    }

    use std::fmt::Write;
    let mut out = String::with_capacity(input.len());
    for ch in input.chars() {
        if is_bidi_control(ch) {
            // Writing into a `String` cannot fail, so the result is ignored.
            let _ = write!(out, "\\u{{{:04x}}}", u32::from(ch));
        } else {
            out.push(ch);
        }
    }
    Cow::Owned(out)
}

#[test]
fn escapes_unicode_bidi() {
    // Bidi controls are escaped; NUL and U+FFFF are left alone.
    let bidi_test = "\u{202A}\u{202B}\u{202C}\u{202D}\u{202E} | \u{2066} | \x00\u{2067} | \u{2068}\u{FFFF} | \u{2069}";
    assert_eq!(
        "\\u{202a}\\u{202b}\\u{202c}\\u{202d}\\u{202e} | \\u{2066} | \u{0}\\u{2067} | \\u{2068}\u{ffff} | \\u{2069}".as_bytes(),
        &*escape_tricky_unicode(bidi_test.as_bytes()),
    );

    // Non-UTF-8 bytes must pass through unchanged.
    let binary_test = &b"ABC\0\0\0\x11\xff\xc0\xfa\xda"[..];
    assert_eq!(binary_test, &*escape_tricky_unicode(binary_test));
}
0 commit comments