17
17
mod cursor;
18
18
pub mod unescape;
19
19
20
+ #[ cfg( test) ]
21
+ mod tests;
22
+
20
23
use self :: LiteralKind :: * ;
21
24
use self :: TokenKind :: * ;
22
25
use crate :: cursor:: { Cursor , EOF_CHAR } ;
26
+ use std:: convert:: TryInto ;
23
27
24
28
/// Parsed token.
25
29
/// It doesn't contain information about data that has been parsed,
@@ -132,9 +136,65 @@ pub enum LiteralKind {
132
136
/// "b"abc"", "b"abc"
133
137
ByteStr { terminated : bool } ,
134
138
/// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a"
135
- RawStr { n_hashes : usize , started : bool , terminated : bool } ,
139
+ RawStr ( UnvalidatedRawStr ) ,
136
140
/// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a"
137
- RawByteStr { n_hashes : usize , started : bool , terminated : bool } ,
141
+ RawByteStr ( UnvalidatedRawStr ) ,
142
+ }
143
+
144
/// A raw string literal exactly as the lexer scanned it, before any
/// well-formedness checks have been applied.
///
/// Use [`UnvalidatedRawStr::validate`] to turn it into a [`ValidatedRawStr`]
/// or a [`LexRawStrError`] describing what is wrong with it.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub struct UnvalidatedRawStr {
    /// Whether the opening `#`s were immediately followed by `"`.
    valid_start: bool,
    /// Number of `#` characters preceding the opening `"`.
    n_start_hashes: usize,
    /// Number of `#` characters counted after a closing `"`: equal to
    /// `n_start_hashes` when the literal was terminated, otherwise the longest
    /// run seen after any `"` in the body (0 if there was none).
    n_end_hashes: usize,
    /// For an unterminated literal, the offset of the `#` run following the most
    /// promising closing `"`, counted from the start of the literal including
    /// its `r`/`br` prefix. `None` if the literal was terminated or no
    /// candidate was seen.
    possible_terminator_offset: Option<usize>,
}
151
+
152
/// The ways in which a scanned raw string literal can be malformed.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum LexRawStrError {
    /// Non-`#` characters between `r` and `"`, e.g. `r#~"..`.
    InvalidStarter,
    /// The string was never terminated.
    ///
    /// `expected` is the number of opening `#`s and `found` is the number of
    /// closing `#`s that were actually seen. `possible_terminator_offset` is the
    /// best guess of where the author may have intended to terminate it.
    NoTerminator { expected: usize, found: usize, possible_terminator_offset: Option<usize> },
    /// More than 65535 `#` signs: the delimiter count has to fit in a `u16`.
    TooManyDelimiters,
}
162
+
163
/// A raw string literal that has passed validation: it started correctly, was
/// terminated, and its delimiter count fits in a `u16`.
#[derive(Debug, Eq, PartialEq, Copy, Clone)]
pub struct ValidatedRawStr {
    /// Number of `#` characters on each side of the literal.
    n_hashes: u16,
}

impl ValidatedRawStr {
    /// Returns the number of `#` characters delimiting the raw string.
    pub fn num_hashes(&self) -> u16 {
        self.n_hashes
    }
}
173
+
174
+ impl UnvalidatedRawStr {
175
+ pub fn started ( & self ) -> bool {
176
+ self . valid_start
177
+ }
178
+
179
+ pub fn validate ( self ) -> Result < ValidatedRawStr , LexRawStrError > {
180
+ if !self . valid_start {
181
+ return Err ( LexRawStrError :: InvalidStarter ) ;
182
+ }
183
+
184
+ let n_start_safe: u16 =
185
+ self . n_start_hashes . try_into ( ) . map_err ( |_| LexRawStrError :: TooManyDelimiters ) ?;
186
+ match ( self . n_start_hashes , self . n_end_hashes ) {
187
+ ( n_start, n_end) if n_start > n_end => Err ( LexRawStrError :: NoTerminator {
188
+ expected : n_start,
189
+ found : self . n_end_hashes ,
190
+ possible_terminator_offset : self . possible_terminator_offset ,
191
+ } ) ,
192
+ ( n_start, n_end) => {
193
+ debug_assert_eq ! ( n_start, n_end) ;
194
+ Ok ( ValidatedRawStr { n_hashes : n_start_safe } )
195
+ }
196
+ }
197
+ }
138
198
}
139
199
140
200
/// Base of numeric literal encoding according to its prefix.
@@ -209,7 +269,7 @@ pub fn is_whitespace(c: char) -> bool {
209
269
// Dedicated whitespace characters from Unicode
210
270
| '\u{2028}' // LINE SEPARATOR
211
271
| '\u{2029}' // PARAGRAPH SEPARATOR
212
- => true ,
272
+ => true ,
213
273
_ => false ,
214
274
}
215
275
}
@@ -258,12 +318,12 @@ impl Cursor<'_> {
258
318
'r' => match ( self . first ( ) , self . second ( ) ) {
259
319
( '#' , c1) if is_id_start ( c1) => self . raw_ident ( ) ,
260
320
( '#' , _) | ( '"' , _) => {
261
- let ( n_hashes , started , terminated ) = self . raw_double_quoted_string ( ) ;
321
+ let raw_str_i = self . raw_double_quoted_string ( 1 ) ;
262
322
let suffix_start = self . len_consumed ( ) ;
263
- if terminated {
323
+ if raw_str_i . n_end_hashes == raw_str_i . n_start_hashes {
264
324
self . eat_literal_suffix ( ) ;
265
325
}
266
- let kind = RawStr { n_hashes , started , terminated } ;
326
+ let kind = RawStr ( raw_str_i ) ;
267
327
Literal { kind, suffix_start }
268
328
}
269
329
_ => self . ident ( ) ,
@@ -293,12 +353,14 @@ impl Cursor<'_> {
293
353
}
294
354
( 'r' , '"' ) | ( 'r' , '#' ) => {
295
355
self . bump ( ) ;
296
- let ( n_hashes , started , terminated ) = self . raw_double_quoted_string ( ) ;
356
+ let raw_str_i = self . raw_double_quoted_string ( 2 ) ;
297
357
let suffix_start = self . len_consumed ( ) ;
358
+ let terminated = raw_str_i. n_start_hashes == raw_str_i. n_end_hashes ;
298
359
if terminated {
299
360
self . eat_literal_suffix ( ) ;
300
361
}
301
- let kind = RawByteStr { n_hashes, started, terminated } ;
362
+
363
+ let kind = RawByteStr ( raw_str_i) ;
302
364
Literal { kind, suffix_start }
303
365
}
304
366
_ => self . ident ( ) ,
@@ -594,37 +656,49 @@ impl Cursor<'_> {
594
656
false
595
657
}
596
658
597
- /// Eats the double-quoted string and returns a tuple of
598
- /// (amount of the '#' symbols, raw string started, raw string terminated)
599
- fn raw_double_quoted_string ( & mut self ) -> ( usize , bool , bool ) {
659
+ /// Eats the double-quoted string and returns an `UnvalidatedRawStr`.
660
+ fn raw_double_quoted_string ( & mut self , prefix_len : usize ) -> UnvalidatedRawStr {
600
661
debug_assert ! ( self . prev( ) == 'r' ) ;
601
- let mut started: bool = false ;
602
- let mut finished: bool = false ;
662
+ let mut valid_start: bool = false ;
663
+ let start_pos = self . len_consumed ( ) ;
664
+ let ( mut possible_terminator_offset, mut max_hashes) = ( None , 0 ) ;
603
665
604
666
// Count opening '#' symbols.
605
- let n_hashes = self . eat_while ( |c| c == '#' ) ;
667
+ let n_start_hashes = self . eat_while ( |c| c == '#' ) ;
606
668
607
669
// Check that string is started.
608
670
match self . bump ( ) {
609
- Some ( '"' ) => started = true ,
610
- _ => return ( n_hashes, started, finished) ,
671
+ Some ( '"' ) => valid_start = true ,
672
+ _ => {
673
+ return UnvalidatedRawStr {
674
+ valid_start,
675
+ n_start_hashes,
676
+ n_end_hashes : 0 ,
677
+ possible_terminator_offset,
678
+ } ;
679
+ }
611
680
}
612
681
613
682
// Skip the string contents and on each '#' character met, check if this is
614
683
// a raw string termination.
615
- while !finished {
684
+ loop {
616
685
self . eat_while ( |c| c != '"' ) ;
617
686
618
687
if self . is_eof ( ) {
619
- return ( n_hashes, started, finished) ;
688
+ return UnvalidatedRawStr {
689
+ valid_start,
690
+ n_start_hashes,
691
+ n_end_hashes : max_hashes,
692
+ possible_terminator_offset,
693
+ } ;
620
694
}
621
695
622
696
// Eat closing double quote.
623
697
self . bump ( ) ;
624
698
625
699
// Check that amount of closing '#' symbols
626
700
// is equal to the amount of opening ones.
627
- let mut hashes_left = n_hashes ;
701
+ let mut hashes_left = n_start_hashes ;
628
702
let is_closing_hash = |c| {
629
703
if c == '#' && hashes_left != 0 {
630
704
hashes_left -= 1 ;
@@ -633,10 +707,23 @@ impl Cursor<'_> {
633
707
false
634
708
}
635
709
} ;
636
- finished = self . eat_while ( is_closing_hash) == n_hashes;
710
+ let n_end_hashes = self . eat_while ( is_closing_hash) ;
711
+
712
+ if n_end_hashes == n_start_hashes {
713
+ return UnvalidatedRawStr {
714
+ valid_start,
715
+ n_start_hashes,
716
+ n_end_hashes,
717
+ possible_terminator_offset : None ,
718
+ } ;
719
+ } else if n_end_hashes > 0 && n_end_hashes > max_hashes {
720
+ // Keep track of possible terminators to give a hint about where there might be
721
+ // a missing terminator
722
+ possible_terminator_offset =
723
+ Some ( self . len_consumed ( ) - start_pos - n_end_hashes + prefix_len) ;
724
+ max_hashes = n_end_hashes;
725
+ }
637
726
}
638
-
639
- ( n_hashes, started, finished)
640
727
}
641
728
642
729
fn eat_decimal_digits ( & mut self ) -> bool {
0 commit comments