Skip to content

Commit 4e932e6

Browse files
committed
Fix a bug in capture with match
When performing "EndText" matching, it is necessary to check whether the current position matches the input text length. However, when capturing a submatch using the matching result of DFA, "EndText" matching wasn't actually performed correctly because the input text sliced. By applying this patch we specify the match end position by the argument "end", not using slice when performing capture with the matching result of DFA. Fixes rust-lang#557
1 parent 9b951a6 commit 4e932e6

File tree

4 files changed

+41
-44
lines changed

4 files changed

+41
-44
lines changed

src/backtrack.rs

+4-3
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
9898
slots: &'s mut [Slot],
9999
input: I,
100100
start: usize,
101+
end: usize,
101102
) -> bool {
102103
let mut cache = cache.borrow_mut();
103104
let cache = &mut cache.backtrack;
@@ -109,7 +110,7 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
109110
slots: slots,
110111
m: cache,
111112
};
112-
b.exec_(start)
113+
b.exec_(start, end)
113114
}
114115

115116
/// Clears the cache such that the backtracking engine can be executed
@@ -147,7 +148,7 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
147148

148149
/// Start backtracking at the given position in the input, but also look
149150
/// for literal prefixes.
150-
fn exec_(&mut self, mut at: InputAt) -> bool {
151+
fn exec_(&mut self, mut at: InputAt, end: usize) -> bool {
151152
self.clear();
152153
// If this is an anchored regex at the beginning of the input, then
153154
// we're either already done or we only need to try backtracking once.
@@ -170,7 +171,7 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
170171
if matched && self.prog.matches.len() == 1 {
171172
return true;
172173
}
173-
if at.is_end() {
174+
if at.pos() == end {
174175
break;
175176
}
176177
at = self.input.at(at.next_pos());

src/exec.rs

+27-39
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010

1111
use std::cell::RefCell;
1212
use std::collections::HashMap;
13-
use std::cmp;
1413
use std::sync::Arc;
1514

1615
use thread_local::CachedThreadLocal;
@@ -557,7 +556,7 @@ impl<'c> RegularExpression for ExecNoSync<'c> {
557556
match self.ro.match_type {
558557
MatchType::Literal(ty) => {
559558
self.find_literals(ty, text, start).and_then(|(s, e)| {
560-
self.captures_nfa_with_match(slots, text, s, e)
559+
self.captures_nfa_type(MatchNfaType::Auto, slots, text, s, e)
561560
})
562561
}
563562
MatchType::Dfa => {
@@ -566,7 +565,7 @@ impl<'c> RegularExpression for ExecNoSync<'c> {
566565
} else {
567566
match self.find_dfa_forward(text, start) {
568567
dfa::Result::Match((s, e)) => {
569-
self.captures_nfa_with_match(slots, text, s, e)
568+
self.captures_nfa_type(MatchNfaType::Auto, slots, text, s, e)
570569
}
571570
dfa::Result::NoMatch(_) => None,
572571
dfa::Result::Quit => self.captures_nfa(slots, text, start),
@@ -576,7 +575,7 @@ impl<'c> RegularExpression for ExecNoSync<'c> {
576575
MatchType::DfaAnchoredReverse => {
577576
match self.find_dfa_anchored_reverse(text, start) {
578577
dfa::Result::Match((s, e)) => {
579-
self.captures_nfa_with_match(slots, text, s, e)
578+
self.captures_nfa_type(MatchNfaType::Auto, slots, text, s, e)
580579
}
581580
dfa::Result::NoMatch(_) => None,
582581
dfa::Result::Quit => self.captures_nfa(slots, text, start),
@@ -585,14 +584,14 @@ impl<'c> RegularExpression for ExecNoSync<'c> {
585584
MatchType::DfaSuffix => {
586585
match self.find_dfa_reverse_suffix(text, start) {
587586
dfa::Result::Match((s, e)) => {
588-
self.captures_nfa_with_match(slots, text, s, e)
587+
self.captures_nfa_type(MatchNfaType::Auto, slots, text, s, e)
589588
}
590589
dfa::Result::NoMatch(_) => None,
591590
dfa::Result::Quit => self.captures_nfa(slots, text, start),
592591
}
593592
}
594593
MatchType::Nfa(ty) => {
595-
self.captures_nfa_type(ty, slots, text, start)
594+
self.captures_nfa_type(ty, slots, text, start, text.len())
596595
}
597596
MatchType::Nothing => None,
598597
MatchType::DfaMany => {
@@ -830,7 +829,7 @@ impl<'c> ExecNoSync<'c> {
830829
text: &[u8],
831830
start: usize,
832831
) -> bool {
833-
self.exec_nfa(ty, &mut [false], &mut [], true, text, start)
832+
self.exec_nfa(ty, &mut [false], &mut [], true, text, start, text.len())
834833
}
835834

836835
/// Finds the shortest match using an NFA.
@@ -846,7 +845,7 @@ impl<'c> ExecNoSync<'c> {
846845
start: usize,
847846
) -> Option<usize> {
848847
let mut slots = [None, None];
849-
if self.exec_nfa(ty, &mut [false], &mut slots, true, text, start) {
848+
if self.exec_nfa(ty, &mut [false], &mut slots, true, text, start, text.len()) {
850849
slots[1]
851850
} else {
852851
None
@@ -861,7 +860,7 @@ impl<'c> ExecNoSync<'c> {
861860
start: usize,
862861
) -> Option<(usize, usize)> {
863862
let mut slots = [None, None];
864-
if self.exec_nfa(ty, &mut [false], &mut slots, false, text, start) {
863+
if self.exec_nfa(ty, &mut [false], &mut slots, false, text, start, text.len()) {
865864
match (slots[0], slots[1]) {
866865
(Some(s), Some(e)) => Some((s, e)),
867866
_ => None,
@@ -871,26 +870,6 @@ impl<'c> ExecNoSync<'c> {
871870
}
872871
}
873872

874-
/// Like find_nfa, but fills in captures and restricts the search space
875-
/// using previously found match information.
876-
///
877-
/// `slots` should have length equal to `2 * nfa.captures.len()`.
878-
fn captures_nfa_with_match(
879-
&self,
880-
slots: &mut [Slot],
881-
text: &[u8],
882-
match_start: usize,
883-
match_end: usize,
884-
) -> Option<(usize, usize)> {
885-
// We can't use match_end directly, because we may need to examine one
886-
// "character" after the end of a match for lookahead operators. We
887-
// need to move two characters beyond the end, since some look-around
888-
// operations may falsely assume a premature end of text otherwise.
889-
let e = cmp::min(
890-
next_utf8(text, next_utf8(text, match_end)), text.len());
891-
self.captures_nfa(slots, &text[..e], match_start)
892-
}
893-
894873
/// Like find_nfa, but fills in captures.
895874
///
896875
/// `slots` should have length equal to `2 * nfa.captures.len()`.
@@ -900,7 +879,7 @@ impl<'c> ExecNoSync<'c> {
900879
text: &[u8],
901880
start: usize,
902881
) -> Option<(usize, usize)> {
903-
self.captures_nfa_type(MatchNfaType::Auto, slots, text, start)
882+
self.captures_nfa_type(MatchNfaType::Auto, slots, text, start, text.len())
904883
}
905884

906885
/// Like captures_nfa, but allows specification of type of NFA engine.
@@ -910,8 +889,9 @@ impl<'c> ExecNoSync<'c> {
910889
slots: &mut [Slot],
911890
text: &[u8],
912891
start: usize,
892+
end: usize,
913893
) -> Option<(usize, usize)> {
914-
if self.exec_nfa(ty, &mut [false], slots, false, text, start) {
894+
if self.exec_nfa(ty, &mut [false], slots, false, text, start, end) {
915895
match (slots[0], slots[1]) {
916896
(Some(s), Some(e)) => Some((s, e)),
917897
_ => None,
@@ -929,6 +909,7 @@ impl<'c> ExecNoSync<'c> {
929909
quit_after_match: bool,
930910
text: &[u8],
931911
start: usize,
912+
end: usize,
932913
) -> bool {
933914
use self::MatchNfaType::*;
934915
if let Auto = ty {
@@ -940,10 +921,10 @@ impl<'c> ExecNoSync<'c> {
940921
}
941922
match ty {
942923
Auto => unreachable!(),
943-
Backtrack => self.exec_backtrack(matches, slots, text, start),
924+
Backtrack => self.exec_backtrack(matches, slots, text, start, end),
944925
PikeVM => {
945926
self.exec_pikevm(
946-
matches, slots, quit_after_match, text, start)
927+
matches, slots, quit_after_match, text, start, end)
947928
}
948929
}
949930
}
@@ -956,6 +937,7 @@ impl<'c> ExecNoSync<'c> {
956937
quit_after_match: bool,
957938
text: &[u8],
958939
start: usize,
940+
end: usize,
959941
) -> bool {
960942
if self.ro.nfa.uses_bytes() {
961943
pikevm::Fsm::exec(
@@ -965,7 +947,8 @@ impl<'c> ExecNoSync<'c> {
965947
slots,
966948
quit_after_match,
967949
ByteInput::new(text, self.ro.nfa.only_utf8),
968-
start)
950+
start,
951+
end)
969952
} else {
970953
pikevm::Fsm::exec(
971954
&self.ro.nfa,
@@ -974,7 +957,8 @@ impl<'c> ExecNoSync<'c> {
974957
slots,
975958
quit_after_match,
976959
CharInput::new(text),
977-
start)
960+
start,
961+
end)
978962
}
979963
}
980964

@@ -985,6 +969,7 @@ impl<'c> ExecNoSync<'c> {
985969
slots: &mut [Slot],
986970
text: &[u8],
987971
start: usize,
972+
end: usize,
988973
) -> bool {
989974
if self.ro.nfa.uses_bytes() {
990975
backtrack::Bounded::exec(
@@ -993,15 +978,17 @@ impl<'c> ExecNoSync<'c> {
993978
matches,
994979
slots,
995980
ByteInput::new(text, self.ro.nfa.only_utf8),
996-
start)
981+
start,
982+
end)
997983
} else {
998984
backtrack::Bounded::exec(
999985
&self.ro.nfa,
1000986
self.cache,
1001987
matches,
1002988
slots,
1003989
CharInput::new(text),
1004-
start)
990+
start,
991+
end)
1005992
}
1006993
}
1007994

@@ -1045,11 +1032,12 @@ impl<'c> ExecNoSync<'c> {
10451032
&mut [],
10461033
false,
10471034
text,
1048-
start)
1035+
start,
1036+
text.len())
10491037
}
10501038
}
10511039
}
1052-
Nfa(ty) => self.exec_nfa(ty, matches, &mut [], false, text, start),
1040+
Nfa(ty) => self.exec_nfa(ty, matches, &mut [], false, text, start, text.len()),
10531041
Nothing => false,
10541042
}
10551043
}

src/pikevm.rs

+4-1
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ impl<'r, I: Input> Fsm<'r, I> {
107107
quit_after_match: bool,
108108
input: I,
109109
start: usize,
110+
end: usize,
110111
) -> bool {
111112
let mut cache = cache.borrow_mut();
112113
let cache = &mut cache.pikevm;
@@ -124,6 +125,7 @@ impl<'r, I: Input> Fsm<'r, I> {
124125
slots,
125126
quit_after_match,
126127
at,
128+
end,
127129
)
128130
}
129131

@@ -135,6 +137,7 @@ impl<'r, I: Input> Fsm<'r, I> {
135137
slots: &mut [Slot],
136138
quit_after_match: bool,
137139
mut at: InputAt,
140+
end: usize,
138141
) -> bool {
139142
let mut matched = false;
140143
let mut all_matched = false;
@@ -212,7 +215,7 @@ impl<'r, I: Input> Fsm<'r, I> {
212215
}
213216
}
214217
}
215-
if at.is_end() {
218+
if at.pos() == end {
216219
break;
217220
}
218221
at = at_next;

tests/regression.rs

+6-1
Original file line numberDiff line numberDiff line change
@@ -88,8 +88,13 @@ ismatch!(reverse_suffix2, r"\d\d\d000", "153.230000\n", true);
8888
matiter!(reverse_suffix3, r"\d\d\d000", "153.230000\n", (4, 10));
8989

9090
// See: https://github.com/rust-lang/regex/issues/334
91-
mat!(captures_after_dfa_premature_end, r"a(b*(X|$))?", "abcbX",
91+
// See: https://github.com/rust-lang/regex/issues/557
92+
mat!(captures_after_dfa_premature_end1, r"a(b*(X|$))?", "abcbX",
9293
Some((0, 1)), None, None);
94+
mat!(captures_after_dfa_premature_end2, r"a(bc*(X|$))?", "abcbX",
95+
Some((0, 1)), None, None);
96+
mat!(captures_after_dfa_premature_end3, r"(aa$)?", "aaz",
97+
Some((0, 0)));
9398

9499
// See: https://github.com/rust-lang/regex/issues/437
95100
ismatch!(

0 commit comments

Comments
 (0)