17
17
mod cursor;
18
18
pub mod unescape;
19
19
20
+ #[ cfg( test) ]
21
+ mod tests;
22
+
20
23
use self :: LiteralKind :: * ;
21
24
use self :: TokenKind :: * ;
22
25
use crate :: cursor:: { Cursor , EOF_CHAR } ;
26
+ use std:: convert:: TryInto ;
23
27
24
28
/// Parsed token.
25
29
/// It doesn't contain information about data that has been parsed,
@@ -132,9 +136,80 @@ pub enum LiteralKind {
132
136
/// "b"abc"", "b"abc"
133
137
ByteStr { terminated : bool } ,
134
138
/// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a"
135
- RawStr { n_hashes : usize , started : bool , terminated : bool } ,
139
+ RawStr ( UnvalidatedRawStr ) ,
136
140
/// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a"
137
- RawByteStr { n_hashes : usize , started : bool , terminated : bool } ,
141
+ RawByteStr ( UnvalidatedRawStr ) ,
142
+ }
143
+
144
+ /// Represents something that looks like a raw string, but may have some
145
+ /// problems. Use `.validate()` to convert it into either a `ValidatedRawStr`
146
+ /// or a `LexRawStrError` describing what is wrong with it.
147
+ #[ derive( Clone , Copy , Debug , PartialEq , Eq , PartialOrd , Ord ) ]
148
+ pub struct UnvalidatedRawStr {
149
+ /// Whether the prefix (`r###"`) is valid, i.e. the leading `#`s were directly followed by `"`
150
+ valid_start : bool ,
151
+ /// The number of leading `#`
152
+ n_start_hashes : usize ,
153
+ /// The number of trailing `#` (the best candidate seen if never terminated). `n_end_hashes` <= `n_start_hashes`
154
+ n_end_hashes : usize ,
155
+ /// The offset starting at `r` or `br` where the user may have intended to end the string.
156
+ /// Currently, it marks the `"` followed by the most `#`s; `None` if the string terminated or no `"#` was seen.
157
+ possible_terminator_offset : Option < usize > ,
158
+ }
159
+
160
+ /// Error produced validating a raw string. Represents cases like:
160
+ /// - `r##~"abcde"##`: `LexRawStrError::InvalidStarter`
161
+ /// - `r###"abcde"##`: `LexRawStrError::NoTerminator { expected: 3, found: 2, possible_terminator_offset: Some(11) }`
162
+ /// - Too many `#`s (>65535, the count must fit in a `u16`): `LexRawStrError::TooManyDelimiters`
163
+ #[ derive( Clone , Copy , Debug , PartialEq , Eq , PartialOrd , Ord ) ]
164
+ pub enum LexRawStrError {
165
+ /// Non `#` characters exist between `r` and `"` eg. `r#~"..`
166
+ InvalidStarter ,
167
+ /// The string was never terminated. `expected` is the number of opening `#`s and `found` the number of closing `#`s seen.
168
+ /// `possible_terminator_offset` is the number of characters after `r` or `br` where they may have intended to terminate it.
169
+ NoTerminator { expected : usize , found : usize , possible_terminator_offset : Option < usize > } ,
170
+ /// More than 65535 `#`s exist.
171
+ TooManyDelimiters ,
172
+ }
174
+
175
+ /// Raw String that contains a valid prefix (`#*"`) and postfix (`"#*`) where
175
+ /// there are a matching number of `#` characters in both. Note that this will
176
+ /// not consume extra trailing `#` characters: `r###"abcde"####` is lexed as a
177
+ /// `ValidatedRawStr { n_hashes: 3 }` followed by a `#` token. Read the count with `num_hashes()`.
178
+ #[ derive( Debug , Eq , PartialEq , Copy , Clone ) ]
179
+ pub struct ValidatedRawStr {
180
+ n_hashes : u16 ,
181
+ }
182
+
183
+ impl ValidatedRawStr {
184
+ pub fn num_hashes ( & self ) -> u16 {
185
+ self . n_hashes
186
+ }
187
+ }
189
+
190
+ impl UnvalidatedRawStr {
191
+ pub fn validate ( self ) -> Result < ValidatedRawStr , LexRawStrError > {
192
+ if !self . valid_start {
193
+ return Err ( LexRawStrError :: InvalidStarter ) ;
194
+ }
195
+
196
+ // Only up to 65535 `#`s are allowed in raw strings
197
+ let n_start_safe: u16 =
198
+ self . n_start_hashes . try_into ( ) . map_err ( |_| LexRawStrError :: TooManyDelimiters ) ?;
199
+
200
+ if self . n_start_hashes > self . n_end_hashes {
201
+ Err ( LexRawStrError :: NoTerminator {
202
+ expected : self . n_start_hashes ,
203
+ found : self . n_end_hashes ,
204
+ possible_terminator_offset : self . possible_terminator_offset ,
205
+ } )
206
+ } else {
207
+ // Since the lexer should never produce a literal with n_end > n_start, if n_start <= n_end,
208
+ // they must be equal.
209
+ debug_assert_eq ! ( self . n_start_hashes, self . n_end_hashes) ;
210
+ Ok ( ValidatedRawStr { n_hashes : n_start_safe } )
211
+ }
212
+ }
138
213
}
139
214
140
215
/// Base of numeric literal encoding according to its prefix.
@@ -209,7 +284,7 @@ pub fn is_whitespace(c: char) -> bool {
209
284
// Dedicated whitespace characters from Unicode
210
285
| '\u{2028}' // LINE SEPARATOR
211
286
| '\u{2029}' // PARAGRAPH SEPARATOR
212
- => true ,
287
+ => true ,
213
288
_ => false ,
214
289
}
215
290
}
@@ -258,12 +333,12 @@ impl Cursor<'_> {
258
333
'r' => match ( self . first ( ) , self . second ( ) ) {
259
334
( '#' , c1) if is_id_start ( c1) => self . raw_ident ( ) ,
260
335
( '#' , _) | ( '"' , _) => {
261
- let ( n_hashes , started , terminated ) = self . raw_double_quoted_string ( ) ;
336
+ let raw_str_i = self . raw_double_quoted_string ( 1 ) ;
262
337
let suffix_start = self . len_consumed ( ) ;
263
- if terminated {
338
+ if raw_str_i . n_end_hashes == raw_str_i . n_start_hashes {
264
339
self . eat_literal_suffix ( ) ;
265
340
}
266
- let kind = RawStr { n_hashes , started , terminated } ;
341
+ let kind = RawStr ( raw_str_i ) ;
267
342
Literal { kind, suffix_start }
268
343
}
269
344
_ => self . ident ( ) ,
@@ -293,12 +368,14 @@ impl Cursor<'_> {
293
368
}
294
369
( 'r' , '"' ) | ( 'r' , '#' ) => {
295
370
self . bump ( ) ;
296
- let ( n_hashes , started , terminated ) = self . raw_double_quoted_string ( ) ;
371
+ let raw_str_i = self . raw_double_quoted_string ( 2 ) ;
297
372
let suffix_start = self . len_consumed ( ) ;
373
+ let terminated = raw_str_i. n_start_hashes == raw_str_i. n_end_hashes ;
298
374
if terminated {
299
375
self . eat_literal_suffix ( ) ;
300
376
}
301
- let kind = RawByteStr { n_hashes, started, terminated } ;
377
+
378
+ let kind = RawByteStr ( raw_str_i) ;
302
379
Literal { kind, suffix_start }
303
380
}
304
381
_ => self . ident ( ) ,
@@ -594,37 +671,49 @@ impl Cursor<'_> {
594
671
false
595
672
}
596
673
597
- /// Eats the double-quoted string and returns a tuple of
598
- /// (amount of the '#' symbols, raw string started, raw string terminated)
599
- fn raw_double_quoted_string ( & mut self ) -> ( usize , bool , bool ) {
674
+ /// Eats the double-quoted string and returns an `UnvalidatedRawStr`.
675
+ fn raw_double_quoted_string ( & mut self , prefix_len : usize ) -> UnvalidatedRawStr {
600
676
debug_assert ! ( self . prev( ) == 'r' ) ;
601
- let mut started: bool = false ;
602
- let mut finished: bool = false ;
677
+ let mut valid_start: bool = false ;
678
+ let start_pos = self . len_consumed ( ) ;
679
+ let ( mut possible_terminator_offset, mut max_hashes) = ( None , 0 ) ;
603
680
604
681
// Count opening '#' symbols.
605
- let n_hashes = self . eat_while ( |c| c == '#' ) ;
682
+ let n_start_hashes = self . eat_while ( |c| c == '#' ) ;
606
683
607
684
// Check that string is started.
608
685
match self . bump ( ) {
609
- Some ( '"' ) => started = true ,
610
- _ => return ( n_hashes, started, finished) ,
686
+ Some ( '"' ) => valid_start = true ,
687
+ _ => {
688
+ return UnvalidatedRawStr {
689
+ valid_start,
690
+ n_start_hashes,
691
+ n_end_hashes : 0 ,
692
+ possible_terminator_offset,
693
+ } ;
694
+ }
611
695
}
612
696
613
697
// Skip the string contents and on each '#' character met, check if this is
614
698
// a raw string termination.
615
- while !finished {
699
+ loop {
616
700
self . eat_while ( |c| c != '"' ) ;
617
701
618
702
if self . is_eof ( ) {
619
- return ( n_hashes, started, finished) ;
703
+ return UnvalidatedRawStr {
704
+ valid_start,
705
+ n_start_hashes,
706
+ n_end_hashes : max_hashes,
707
+ possible_terminator_offset,
708
+ } ;
620
709
}
621
710
622
711
// Eat closing double quote.
623
712
self . bump ( ) ;
624
713
625
714
// Check that amount of closing '#' symbols
626
715
// is equal to the amount of opening ones.
627
- let mut hashes_left = n_hashes ;
716
+ let mut hashes_left = n_start_hashes ;
628
717
let is_closing_hash = |c| {
629
718
if c == '#' && hashes_left != 0 {
630
719
hashes_left -= 1 ;
@@ -633,10 +722,23 @@ impl Cursor<'_> {
633
722
false
634
723
}
635
724
} ;
636
- finished = self . eat_while ( is_closing_hash) == n_hashes;
725
+ let n_end_hashes = self . eat_while ( is_closing_hash) ;
726
+
727
+ if n_end_hashes == n_start_hashes {
728
+ return UnvalidatedRawStr {
729
+ valid_start,
730
+ n_start_hashes,
731
+ n_end_hashes,
732
+ possible_terminator_offset : None ,
733
+ } ;
734
+ } else if n_end_hashes > max_hashes {
735
+ // Keep track of possible terminators to give a hint about where there might be
736
+ // a missing terminator
737
+ possible_terminator_offset =
738
+ Some ( self . len_consumed ( ) - start_pos - n_end_hashes + prefix_len) ;
739
+ max_hashes = n_end_hashes;
740
+ }
637
741
}
638
-
639
- ( n_hashes, started, finished)
640
742
}
641
743
642
744
fn eat_decimal_digits ( & mut self ) -> bool {
0 commit comments