Commit c739465

Rollup merge of rust-lang#70522 - rcoh:60762-raw-string-errors, r=petrochenkov
Improve error messages for raw strings (rust-lang#60762)

This diff improves error messages around raw strings in a few ways:

- Catch extra trailing `#` in the parser. This can't be handled in the lexer because we could be in a macro that actually expects another `#` (see test).
- Refactor & unify error handling in the lexer between ByteStrings and RawByteStrings.
- Detect potentially intended terminators (the longest sequence of `"#*` is suggested).

Fixes rust-lang#60762

cc @estebank who reviewed the original (abandoned) PR for the same ticket.

r? @Centril
2 parents 84a4633 + 55a5eea commit c739465
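For orientation (this illustration is not part of the commit): a raw string is delimited by `r`, N `#`s and a quote on each side, and the malformed shapes targeted by the new diagnostics look roughly like the commented examples below.

fn main() {
    // A raw string literal is `r`, N `#`s, `"`, the body, `"`, then N `#`s again.
    // A `"##` inside the body does not terminate a literal opened with three `#`s.
    let ok = r###"a "## inside the body does not end this literal"###;
    println!("{}", ok);

    // Shapes this PR reports better (kept as comments so this file compiles):
    //   r###"abcde"##     too few closing `#`s   -> missing terminator, with a hint at `"##`
    //   r##~"abcde"##     a non-`#` before `"`   -> invalid raw-string start
    //   r###"abcde"####   one extra trailing `#` -> now caught in the parser
}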

22 files changed: +385 -74 lines

src/librustc_lexer/src/lib.rs

+124 -22
@@ -17,9 +17,13 @@
 mod cursor;
 pub mod unescape;

+#[cfg(test)]
+mod tests;
+
 use self::LiteralKind::*;
 use self::TokenKind::*;
 use crate::cursor::{Cursor, EOF_CHAR};
+use std::convert::TryInto;

 /// Parsed token.
 /// It doesn't contain information about data that has been parsed,
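The new `std::convert::TryInto` import exists only for the hash-count narrowing done in `validate()` further down. As a standalone sketch of that conversion pattern (not code from this commit):

use std::convert::TryInto;

// usize -> u16 narrowing: fails once the value exceeds u16::MAX (65535), which is
// exactly how `validate()` turns an oversized delimiter count into an error.
fn narrow(n_hashes: usize) -> Option<u16> {
    n_hashes.try_into().ok()
}

fn main() {
    assert_eq!(narrow(3), Some(3));
    assert_eq!(narrow(70_000), None);
}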
@@ -132,9 +136,80 @@ pub enum LiteralKind {
     /// "b"abc"", "b"abc"
     ByteStr { terminated: bool },
     /// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a"
-    RawStr { n_hashes: usize, started: bool, terminated: bool },
+    RawStr(UnvalidatedRawStr),
     /// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a"
-    RawByteStr { n_hashes: usize, started: bool, terminated: bool },
+    RawByteStr(UnvalidatedRawStr),
+}
+
+/// Represents something that looks like a raw string, but may have some
+/// problems. Use `.validate()` to convert it into something
+/// usable.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+pub struct UnvalidatedRawStr {
+    /// The prefix (`r###"`) is valid
+    valid_start: bool,
+    /// The number of leading `#`
+    n_start_hashes: usize,
+    /// The number of trailing `#`. `n_end_hashes` <= `n_start_hashes`
+    n_end_hashes: usize,
+    /// The offset starting at `r` or `br` where the user may have intended to end the string.
+    /// Currently, it is the longest sequence of pattern `"#+"`.
+    possible_terminator_offset: Option<usize>,
+}
+
+/// Error produced validating a raw string. Represents cases like:
+/// - `r##~"abcde"##`: `LexRawStrError::InvalidStarter`
+/// - `r###"abcde"##`: `LexRawStrError::NoTerminator { expected: 3, found: 2, possible_terminator_offset: Some(11) }`
+/// - Too many `#`s (>65535): `TooManyDelimiters`
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+pub enum LexRawStrError {
+    /// Non-`#` characters exist between `r` and `"`, e.g. `r#~"..`
+    InvalidStarter,
+    /// The string was never terminated. `possible_terminator_offset` is the number of characters after `r` or `br` where they
+    /// may have intended to terminate it.
+    NoTerminator { expected: usize, found: usize, possible_terminator_offset: Option<usize> },
+    /// More than 65535 `#`s exist.
+    TooManyDelimiters,
+}
+
+/// Raw String that contains a valid prefix (`#+"`) and postfix (`"#+`) where
+/// there are a matching number of `#` characters in both. Note that this will
+/// not consume extra trailing `#` characters: `r###"abcde"####` is lexed as a
+/// `ValidatedRawStr { n_hashes: 3 }` followed by a `#` token.
+#[derive(Debug, Eq, PartialEq, Copy, Clone)]
+pub struct ValidatedRawStr {
+    n_hashes: u16,
+}
+
+impl ValidatedRawStr {
+    pub fn num_hashes(&self) -> u16 {
+        self.n_hashes
+    }
+}
+
+impl UnvalidatedRawStr {
+    pub fn validate(self) -> Result<ValidatedRawStr, LexRawStrError> {
+        if !self.valid_start {
+            return Err(LexRawStrError::InvalidStarter);
+        }
+
+        // Only up to 65535 `#`s are allowed in raw strings
+        let n_start_safe: u16 =
+            self.n_start_hashes.try_into().map_err(|_| LexRawStrError::TooManyDelimiters)?;
+
+        if self.n_start_hashes > self.n_end_hashes {
+            Err(LexRawStrError::NoTerminator {
+                expected: self.n_start_hashes,
+                found: self.n_end_hashes,
+                possible_terminator_offset: self.possible_terminator_offset,
+            })
+        } else {
+            // Since the lexer should never produce a literal with n_end > n_start, if n_start <= n_end,
+            // they must be equal.
+            debug_assert_eq!(self.n_start_hashes, self.n_end_hashes);
+            Ok(ValidatedRawStr { n_hashes: n_start_safe })
+        }
+    }
 }

 /// Base of numeric literal encoding according to its prefix.
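To sketch how `UnvalidatedRawStr::validate` behaves on the doc-comment example `r###"abcde"##` above, here is a hypothetical unit test in the style of the new `tests` module further down (it is not part of the commit; constructing the struct directly only works inside `librustc_lexer`, where the private fields are visible):

#[test]
fn validate_reports_missing_terminator() {
    // Three opening `#`s but only two closing ones: `validate()` refuses the
    // literal and passes the recorded terminator hint through unchanged.
    let unterminated = UnvalidatedRawStr {
        valid_start: true,
        n_start_hashes: 3,
        n_end_hashes: 2,
        possible_terminator_offset: Some(11),
    };
    assert_eq!(
        unterminated.validate(),
        Err(LexRawStrError::NoTerminator {
            expected: 3,
            found: 2,
            possible_terminator_offset: Some(11),
        })
    );
}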
@@ -209,7 +284,7 @@ pub fn is_whitespace(c: char) -> bool {
         // Dedicated whitespace characters from Unicode
         | '\u{2028}' // LINE SEPARATOR
         | '\u{2029}' // PARAGRAPH SEPARATOR
-        => true,
+        => true,
         _ => false,
     }
 }
@@ -258,12 +333,12 @@ impl Cursor<'_> {
             'r' => match (self.first(), self.second()) {
                 ('#', c1) if is_id_start(c1) => self.raw_ident(),
                 ('#', _) | ('"', _) => {
-                    let (n_hashes, started, terminated) = self.raw_double_quoted_string();
+                    let raw_str_i = self.raw_double_quoted_string(1);
                     let suffix_start = self.len_consumed();
-                    if terminated {
+                    if raw_str_i.n_end_hashes == raw_str_i.n_start_hashes {
                         self.eat_literal_suffix();
                     }
-                    let kind = RawStr { n_hashes, started, terminated };
+                    let kind = RawStr(raw_str_i);
                     Literal { kind, suffix_start }
                 }
                 _ => self.ident(),
@@ -293,12 +368,14 @@ impl Cursor<'_> {
                 }
                 ('r', '"') | ('r', '#') => {
                     self.bump();
-                    let (n_hashes, started, terminated) = self.raw_double_quoted_string();
+                    let raw_str_i = self.raw_double_quoted_string(2);
                     let suffix_start = self.len_consumed();
+                    let terminated = raw_str_i.n_start_hashes == raw_str_i.n_end_hashes;
                     if terminated {
                         self.eat_literal_suffix();
                     }
-                    let kind = RawByteStr { n_hashes, started, terminated };
+
+                    let kind = RawByteStr(raw_str_i);
                     Literal { kind, suffix_start }
                 }
                 _ => self.ident(),
@@ -594,37 +671,49 @@ impl Cursor<'_> {
         false
     }

-    /// Eats the double-quoted string and returns a tuple of
-    /// (amount of the '#' symbols, raw string started, raw string terminated)
-    fn raw_double_quoted_string(&mut self) -> (usize, bool, bool) {
+    /// Eats the double-quoted string and returns an `UnvalidatedRawStr`.
+    fn raw_double_quoted_string(&mut self, prefix_len: usize) -> UnvalidatedRawStr {
         debug_assert!(self.prev() == 'r');
-        let mut started: bool = false;
-        let mut finished: bool = false;
+        let mut valid_start: bool = false;
+        let start_pos = self.len_consumed();
+        let (mut possible_terminator_offset, mut max_hashes) = (None, 0);

         // Count opening '#' symbols.
-        let n_hashes = self.eat_while(|c| c == '#');
+        let n_start_hashes = self.eat_while(|c| c == '#');

         // Check that string is started.
         match self.bump() {
-            Some('"') => started = true,
-            _ => return (n_hashes, started, finished),
+            Some('"') => valid_start = true,
+            _ => {
+                return UnvalidatedRawStr {
+                    valid_start,
+                    n_start_hashes,
+                    n_end_hashes: 0,
+                    possible_terminator_offset,
+                };
+            }
         }

         // Skip the string contents and on each '#' character met, check if this is
         // a raw string termination.
-        while !finished {
+        loop {
             self.eat_while(|c| c != '"');

             if self.is_eof() {
-                return (n_hashes, started, finished);
+                return UnvalidatedRawStr {
+                    valid_start,
+                    n_start_hashes,
+                    n_end_hashes: max_hashes,
+                    possible_terminator_offset,
+                };
             }

             // Eat closing double quote.
             self.bump();

             // Check that amount of closing '#' symbols
             // is equal to the amount of opening ones.
-            let mut hashes_left = n_hashes;
+            let mut hashes_left = n_start_hashes;
             let is_closing_hash = |c| {
                 if c == '#' && hashes_left != 0 {
                     hashes_left -= 1;
@@ -633,10 +722,23 @@ impl Cursor<'_> {
                     false
                 }
             };
-            finished = self.eat_while(is_closing_hash) == n_hashes;
+            let n_end_hashes = self.eat_while(is_closing_hash);
+
+            if n_end_hashes == n_start_hashes {
+                return UnvalidatedRawStr {
+                    valid_start,
+                    n_start_hashes,
+                    n_end_hashes,
+                    possible_terminator_offset: None,
+                };
+            } else if n_end_hashes > max_hashes {
+                // Keep track of possible terminators to give a hint about where there might be
+                // a missing terminator
+                possible_terminator_offset =
+                    Some(self.len_consumed() - start_pos - n_end_hashes + prefix_len);
+                max_hashes = n_end_hashes;
+            }
         }
-
-        (n_hashes, started, finished)
     }

     fn eat_decimal_digits(&mut self) -> bool {
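To make the `possible_terminator_offset` arithmetic concrete, here is a hand trace (not part of the commit) of the unterminated input `r##"abc"#`, the same case the new `test_unterminated` below expects to yield `Some(7)`:

// Characters, indexed from the leading `r`:
//   index: 0 1 2 3 4 5 6 7 8
//   char:  r # # " a b c " #
//
// In the unit tests the cursor has only consumed `r` (so start_pos = 1) and
// prefix_len is passed as 0. The literal opens with two `#`s, but the best
// candidate terminator, the `"#` starting at index 7, closes only one of them:
fn main() {
    let (len_consumed, start_pos, n_end_hashes, prefix_len) = (9usize, 1, 1, 0);
    let possible_terminator_offset = len_consumed - start_pos - n_end_hashes + prefix_len;
    assert_eq!(possible_terminator_offset, 7); // matches Some(7) in test_unterminated
}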

src/librustc_lexer/src/tests.rs

+121
@@ -0,0 +1,121 @@
#[cfg(test)]
mod tests {
    use crate::*;

    fn check_raw_str(
        s: &str,
        expected: UnvalidatedRawStr,
        validated: Result<ValidatedRawStr, LexRawStrError>,
    ) {
        let s = &format!("r{}", s);
        let mut cursor = Cursor::new(s);
        cursor.bump();
        let tok = cursor.raw_double_quoted_string(0);
        assert_eq!(tok, expected);
        assert_eq!(tok.validate(), validated);
    }

    #[test]
    fn test_naked_raw_str() {
        check_raw_str(
            r#""abc""#,
            UnvalidatedRawStr {
                n_start_hashes: 0,
                n_end_hashes: 0,
                valid_start: true,
                possible_terminator_offset: None,
            },
            Ok(ValidatedRawStr { n_hashes: 0 }),
        );
    }

    #[test]
    fn test_raw_no_start() {
        check_raw_str(
            r##""abc"#"##,
            UnvalidatedRawStr {
                n_start_hashes: 0,
                n_end_hashes: 0,
                valid_start: true,
                possible_terminator_offset: None,
            },
            Ok(ValidatedRawStr { n_hashes: 0 }),
        );
    }

    #[test]
    fn test_too_many_terminators() {
        // this error is handled in the parser later
        check_raw_str(
            r###"#"abc"##"###,
            UnvalidatedRawStr {
                n_start_hashes: 1,
                n_end_hashes: 1,
                valid_start: true,
                possible_terminator_offset: None,
            },
            Ok(ValidatedRawStr { n_hashes: 1 }),
        );
    }

    #[test]
    fn test_unterminated() {
        check_raw_str(
            r#"#"abc"#,
            UnvalidatedRawStr {
                n_start_hashes: 1,
                n_end_hashes: 0,
                valid_start: true,
                possible_terminator_offset: None,
            },
            Err(LexRawStrError::NoTerminator {
                expected: 1,
                found: 0,
                possible_terminator_offset: None,
            }),
        );
        check_raw_str(
            r###"##"abc"#"###,
            UnvalidatedRawStr {
                n_start_hashes: 2,
                n_end_hashes: 1,
                valid_start: true,
                possible_terminator_offset: Some(7),
            },
            Err(LexRawStrError::NoTerminator {
                expected: 2,
                found: 1,
                possible_terminator_offset: Some(7),
            }),
        );
        // We're looking for "# not just any #
        check_raw_str(
            r###"##"abc#"###,
            UnvalidatedRawStr {
                n_start_hashes: 2,
                n_end_hashes: 0,
                valid_start: true,
                possible_terminator_offset: None,
            },
            Err(LexRawStrError::NoTerminator {
                expected: 2,
                found: 0,
                possible_terminator_offset: None,
            }),
        )
    }

    #[test]
    fn test_invalid_start() {
        check_raw_str(
            r##"#~"abc"#"##,
            UnvalidatedRawStr {
                n_start_hashes: 1,
                n_end_hashes: 0,
                valid_start: false,
                possible_terminator_offset: None,
            },
            Err(LexRawStrError::InvalidStarter),
        );
    }
}
