
Commit 629e97a

Improve error messages for raw strings (#60762)
This diff improves error messages around raw strings in a few ways:
- Catch extra trailing `#` in the parser. This can't be handled in the lexer because we could be in a macro that actually expects another `#` (see test)
- Refactor & unify error handling in the lexer between ByteStrings and RawByteStrings
- Detect potentially intended terminators (the longest sequence matching `"#*` is suggested)
1 parent 840a576 commit 629e97a
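
For context, the snippet below (not part of the diff) shows a well-formed raw string next to the two failure modes the commit message describes; the broken forms are kept in comments so the example compiles:

fn main() {
    // Well-formed: the closing quote is followed by the same number of `#`s
    // as the opening delimiter, so inner quotes need no escaping.
    let ok = r##"a raw string with "quotes" and a stray # inside"##;
    println!("{}", ok);

    // The mistakes this commit is about (kept in comments so this compiles):
    //   r##"abc"#    <- unterminated: only one closing `#` where two are expected;
    //                   the lexer now remembers the longest `"#...` run it saw
    //                   and suggests it as the likely intended terminator.
    //   r#"abc"##    <- extra trailing `#`: caught in the parser, because inside
    //                   a macro the extra `#` could be a legitimate next token.
}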

File tree

10 files changed, +344 -63 lines changed

src/librustc_lexer/src/cursor.rs

+1 -1
@@ -41,7 +41,7 @@ impl<'a> Cursor<'a> {
     /// If requested position doesn't exist, `EOF_CHAR` is returned.
     /// However, getting `EOF_CHAR` doesn't always mean actual end of file,
     /// it should be checked with `is_eof` method.
-    fn nth_char(&self, n: usize) -> char {
+    pub(crate) fn nth_char(&self, n: usize) -> char {
         self.chars().nth(n).unwrap_or(EOF_CHAR)
     }

src/librustc_lexer/src/lib.rs

+109 -22
@@ -17,9 +17,13 @@
 mod cursor;
 pub mod unescape;
 
+#[cfg(test)]
+mod tests;
+
 use self::LiteralKind::*;
 use self::TokenKind::*;
 use crate::cursor::{Cursor, EOF_CHAR};
+use std::convert::TryInto;
 
 /// Parsed token.
 /// It doesn't contain information about data that has been parsed,
@@ -132,9 +136,65 @@ pub enum LiteralKind {
     /// "b"abc"", "b"abc"
     ByteStr { terminated: bool },
     /// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a"
-    RawStr { n_hashes: usize, started: bool, terminated: bool },
+    RawStr(UnvalidatedRawStr),
     /// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a"
-    RawByteStr { n_hashes: usize, started: bool, terminated: bool },
+    RawByteStr(UnvalidatedRawStr),
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+pub struct UnvalidatedRawStr {
+    valid_start: bool,
+    n_start_hashes: usize,
+    n_end_hashes: usize,
+    possible_terminator_offset: Option<usize>,
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+pub enum LexRawStrError {
+    /// Non `#` characters between `r` and `"`, e.g. `r#~"..`
+    InvalidStarter,
+    /// The string was never terminated. `possible_terminator_offset` is the best guess of where they
+    /// may have intended to terminate it.
+    NoTerminator { expected: usize, found: usize, possible_terminator_offset: Option<usize> },
+    /// More than 65535 `#` signs
+    TooManyDelimiters,
+}
+
+#[derive(Debug, Eq, PartialEq, Copy, Clone)]
+pub struct ValidatedRawStr {
+    n_hashes: u16,
+}
+
+impl ValidatedRawStr {
+    pub fn num_hashes(&self) -> u16 {
+        self.n_hashes
+    }
+}
+
+impl UnvalidatedRawStr {
+    pub fn started(&self) -> bool {
+        self.valid_start
+    }
+
+    pub fn validate(self) -> Result<ValidatedRawStr, LexRawStrError> {
+        if !self.valid_start {
+            return Err(LexRawStrError::InvalidStarter);
+        }
+
+        let n_start_safe: u16 =
+            self.n_start_hashes.try_into().map_err(|_| LexRawStrError::TooManyDelimiters)?;
+        match (self.n_start_hashes, self.n_end_hashes) {
+            (n_start, n_end) if n_start > n_end => Err(LexRawStrError::NoTerminator {
+                expected: n_start,
+                found: self.n_end_hashes,
+                possible_terminator_offset: self.possible_terminator_offset,
+            }),
+            (n_start, n_end) => {
+                debug_assert_eq!(n_start, n_end);
+                Ok(ValidatedRawStr { n_hashes: n_start_safe })
+            }
+        }
+    }
 }
 
 /// Base of numeric literal encoding according to its prefix.
@@ -209,7 +269,7 @@ pub fn is_whitespace(c: char) -> bool {
         // Dedicated whitespace characters from Unicode
         | '\u{2028}' // LINE SEPARATOR
         | '\u{2029}' // PARAGRAPH SEPARATOR
-        => true,
+            => true,
         _ => false,
     }
 }
@@ -258,12 +318,12 @@ impl Cursor<'_> {
            'r' => match (self.first(), self.second()) {
                ('#', c1) if is_id_start(c1) => self.raw_ident(),
                ('#', _) | ('"', _) => {
-                    let (n_hashes, started, terminated) = self.raw_double_quoted_string();
+                    let raw_str_i = self.raw_double_quoted_string(1);
                    let suffix_start = self.len_consumed();
-                    if terminated {
+                    if raw_str_i.n_end_hashes == raw_str_i.n_start_hashes {
                        self.eat_literal_suffix();
                    }
-                    let kind = RawStr { n_hashes, started, terminated };
+                    let kind = RawStr(raw_str_i);
                    Literal { kind, suffix_start }
                }
                _ => self.ident(),
@@ -293,12 +353,14 @@ impl Cursor<'_> {
                }
                ('r', '"') | ('r', '#') => {
                    self.bump();
-                    let (n_hashes, started, terminated) = self.raw_double_quoted_string();
+                    let raw_str_i = self.raw_double_quoted_string(2);
                    let suffix_start = self.len_consumed();
+                    let terminated = raw_str_i.n_start_hashes == raw_str_i.n_end_hashes;
                    if terminated {
                        self.eat_literal_suffix();
                    }
-                    let kind = RawByteStr { n_hashes, started, terminated };
+
+                    let kind = RawByteStr(raw_str_i);
                    Literal { kind, suffix_start }
                }
                _ => self.ident(),
@@ -594,37 +656,49 @@ impl Cursor<'_> {
        false
    }
 
-    /// Eats the double-quoted string and returns a tuple of
-    /// (amount of the '#' symbols, raw string started, raw string terminated)
-    fn raw_double_quoted_string(&mut self) -> (usize, bool, bool) {
+    /// Eats the double-quoted string and returns an `UnvalidatedRawStr`.
+    fn raw_double_quoted_string(&mut self, prefix_len: usize) -> UnvalidatedRawStr {
        debug_assert!(self.prev() == 'r');
-        let mut started: bool = false;
-        let mut finished: bool = false;
+        let mut valid_start: bool = false;
+        let start_pos = self.len_consumed();
+        let (mut possible_terminator_offset, mut max_hashes) = (None, 0);
 
        // Count opening '#' symbols.
-        let n_hashes = self.eat_while(|c| c == '#');
+        let n_start_hashes = self.eat_while(|c| c == '#');
 
        // Check that string is started.
        match self.bump() {
-            Some('"') => started = true,
-            _ => return (n_hashes, started, finished),
+            Some('"') => valid_start = true,
+            _ => {
+                return UnvalidatedRawStr {
+                    valid_start,
+                    n_start_hashes,
+                    n_end_hashes: 0,
+                    possible_terminator_offset,
+                };
+            }
        }
 
        // Skip the string contents and on each '#' character met, check if this is
        // a raw string termination.
-        while !finished {
+        loop {
            self.eat_while(|c| c != '"');
 
            if self.is_eof() {
-                return (n_hashes, started, finished);
+                return UnvalidatedRawStr {
+                    valid_start,
+                    n_start_hashes,
+                    n_end_hashes: max_hashes,
+                    possible_terminator_offset,
+                };
            }
 
            // Eat closing double quote.
            self.bump();
 
            // Check that amount of closing '#' symbols
            // is equal to the amount of opening ones.
-            let mut hashes_left = n_hashes;
+            let mut hashes_left = n_start_hashes;
            let is_closing_hash = |c| {
                if c == '#' && hashes_left != 0 {
                    hashes_left -= 1;
@@ -633,10 +707,23 @@ impl Cursor<'_> {
                    false
                }
            };
-            finished = self.eat_while(is_closing_hash) == n_hashes;
+            let n_end_hashes = self.eat_while(is_closing_hash);
+
+            if n_end_hashes == n_start_hashes {
+                return UnvalidatedRawStr {
+                    valid_start,
+                    n_start_hashes,
+                    n_end_hashes,
+                    possible_terminator_offset: None,
+                };
+            } else if n_end_hashes > 0 && n_end_hashes > max_hashes {
+                // Keep track of possible terminators to give a hint about where there might be
+                // a missing terminator
+                possible_terminator_offset =
+                    Some(self.len_consumed() - start_pos - n_end_hashes + prefix_len);
+                max_hashes = n_end_hashes;
+            }
        }
-
-        (n_hashes, started, finished)
    }
 
    fn eat_decimal_digits(&mut self) -> bool {
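
To make the new error surface concrete, here is a minimal, hypothetical sketch (not code from this commit) of how a caller such as the parser could render the `validate()` result as a message. It assumes the `UnvalidatedRawStr`, `ValidatedRawStr`, and `LexRawStrError` items introduced above; the `rustc_lexer` import path is an assumption about consuming the crate from outside.

use rustc_lexer::{LexRawStrError, UnvalidatedRawStr};

// Hypothetical helper: maps the lexer's verdict on a raw string onto the
// three error cases distinguished by `validate()`.
fn describe_raw_str(raw: UnvalidatedRawStr) -> String {
    match raw.validate() {
        Ok(valid) => format!("valid raw string with {} `#` pair(s)", valid.num_hashes()),
        Err(LexRawStrError::InvalidStarter) => {
            "found invalid character after `r`; expected `#` or `\"`".to_string()
        }
        Err(LexRawStrError::NoTerminator { expected, found, possible_terminator_offset }) => {
            let mut msg = format!(
                "unterminated raw string: expected {} trailing `#` symbols, found {}",
                expected, found
            );
            if let Some(offset) = possible_terminator_offset {
                // The offset points at the longest `"#...` run seen, i.e. the
                // place where the terminator was most likely intended.
                msg.push_str(&format!(" (possible terminator at offset {})", offset));
            }
            msg
        }
        Err(LexRawStrError::TooManyDelimiters) => {
            "too many `#` symbols in the raw string delimiter".to_string()
        }
    }
}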

src/librustc_lexer/src/tests.rs

+119
@@ -0,0 +1,119 @@
+#[cfg(test)]
+mod tests {
+    use crate::*;
+
+    fn check_raw_str(
+        s: &str,
+        expected: UnvalidatedRawStr,
+        validated: Result<ValidatedRawStr, LexRawStrError>,
+    ) {
+        let mut cursor = Cursor::new(s);
+        let tok = cursor.raw_double_quoted_string(0);
+        assert_eq!(tok, expected);
+        assert_eq!(tok.validate(), validated);
+    }
+
+    #[test]
+    fn test_naked_raw_str() {
+        check_raw_str(
+            r#""abc""#,
+            UnvalidatedRawStr {
+                n_start_hashes: 0,
+                n_end_hashes: 0,
+                valid_start: true,
+                possible_terminator_offset: None,
+            },
+            Ok(ValidatedRawStr { n_hashes: 0 }),
+        );
+    }
+
+    #[test]
+    fn test_raw_no_start() {
+        check_raw_str(
+            r##""abc"#"##,
+            UnvalidatedRawStr {
+                n_start_hashes: 0,
+                n_end_hashes: 0,
+                valid_start: true,
+                possible_terminator_offset: None,
+            },
+            Ok(ValidatedRawStr { n_hashes: 0 }),
+        );
+    }
+
+    #[test]
+    fn test_too_many_terminators() {
+        // this error is handled in the parser later
+        check_raw_str(
+            r###"#"abc"##"###,
+            UnvalidatedRawStr {
+                n_start_hashes: 1,
+                n_end_hashes: 1,
+                valid_start: true,
+                possible_terminator_offset: None,
+            },
+            Ok(ValidatedRawStr { n_hashes: 1 }),
+        );
+    }
+
+    #[test]
+    fn test_unterminated() {
+        check_raw_str(
+            r#"#"abc"#,
+            UnvalidatedRawStr {
+                n_start_hashes: 1,
+                n_end_hashes: 0,
+                valid_start: true,
+                possible_terminator_offset: None,
+            },
+            Err(LexRawStrError::NoTerminator {
+                expected: 1,
+                found: 0,
+                possible_terminator_offset: None,
+            }),
+        );
+        check_raw_str(
+            r###"##"abc"#"###,
+            UnvalidatedRawStr {
+                n_start_hashes: 2,
+                n_end_hashes: 1,
+                valid_start: true,
+                possible_terminator_offset: Some(7),
+            },
+            Err(LexRawStrError::NoTerminator {
+                expected: 2,
+                found: 1,
+                possible_terminator_offset: Some(7),
+            }),
+        );
+        // We're looking for "# not just any #
+        check_raw_str(
+            r###"##"abc#"###,
+            UnvalidatedRawStr {
+                n_start_hashes: 2,
+                n_end_hashes: 0,
+                valid_start: true,
+                possible_terminator_offset: None,
+            },
+            Err(LexRawStrError::NoTerminator {
+                expected: 2,
+                found: 0,
+                possible_terminator_offset: None,
+            }),
+        )
+    }
+
+    #[test]
+    fn test_invalid_start() {
+        check_raw_str(
+            r##"#~"abc"#"##,
+            UnvalidatedRawStr {
+                n_start_hashes: 1,
+                n_end_hashes: 0,
+                valid_start: false,
+                possible_terminator_offset: None,
+            },
+            Err(LexRawStrError::InvalidStarter),
+        );
+    }
+}
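
Beyond the unit tests above, the new API can be driven end to end roughly as sketched below. This is not part of the commit: it assumes the crate is consumed under the name `rustc_lexer` and that its existing `first_token` entry point is available, which is an assumption about the surrounding crate rather than something shown in this diff.

use rustc_lexer::{first_token, LexRawStrError, LiteralKind, TokenKind};

fn main() {
    // `r##"abc"#` written inside an outer raw string: two opening hashes but
    // only one closing hash, i.e. an unterminated raw string literal.
    let src = r####"r##"abc"#"####;

    let token = first_token(src);
    if let TokenKind::Literal { kind: LiteralKind::RawStr(unvalidated), .. } = token.kind {
        match unvalidated.validate() {
            Ok(valid) => println!("terminated, {} `#` pair(s)", valid.num_hashes()),
            Err(LexRawStrError::NoTerminator { expected, found, possible_terminator_offset }) => {
                println!(
                    "unterminated: expected {} `#`s, found {}, best guess at {:?}",
                    expected, found, possible_terminator_offset
                );
            }
            Err(other) => println!("other raw string error: {:?}", other),
        }
    }
}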
