Skip to content

Commit 50d991e

Browse files
committed
It's a sad day: remove reverse suffix literal optimization.
TL;DR: As implemented, its worst case time complexity was quadratic. Given the regex `.*z.*abcd` and the haystack `abcdabcdabcdabcd...`, the reverse suffix literal optimization would find the first occurrence of `abcd`, then try to match `.*z.*` in reverse. When the match fails, it starts searching for `abcd` again after the last occurrence. On the second match (immediately after the first match), it has to run the DFA backwards all over again. This is repeated for every match of `abcd`. In essence, it will have run for quadratic time before reporting that the regex cannot match. There is hope. If we can know that the "reverse" search, in this case, `.*z.*` *cannot* match the suffix literal `abcd`, then we can cap the haystack that is searched in reverse so that each byte is visited by the DFA at most once. In particular, we can cause the DFA to only search as far as the previous match of the suffix literal. This is somewhat close to a generalization of Boyer-Moore in the sense that we're utilizing information of mismatches and the nature of the needle to limit how much work search has to do.
1 parent fdbe843 commit 50d991e

File tree

2 files changed

+8
-138
lines changed

2 files changed

+8
-138
lines changed

src/exec.rs

+2-137
Original file line numberDiff line numberDiff line change
@@ -326,36 +326,6 @@ impl<'c> RegularExpression for ExecNoSync<'c> {
326326
}
327327
}
328328
}
329-
MatchType::DfaSuffix => {
330-
let lcs = self.ro.suffixes.lcs();
331-
let mut end = start;
332-
while end <= text.len() {
333-
end = end + match lcs.find(&text[end..]) {
334-
None => return None,
335-
Some(e) => e + lcs.len(),
336-
};
337-
338-
// Search in reverse from the end of the suffix match.
339-
match dfa::Fsm::reverse(
340-
&self.ro.dfa_reverse,
341-
&self.cache,
342-
true,
343-
&text[start..end],
344-
end - start,
345-
) {
346-
// We don't care about the "start" location since we're
347-
// trying to return the first ending position that
348-
// satisfies the regex. The end of the suffix is it!
349-
dfa::Result::Match(_) => return Some(end),
350-
dfa::Result::NoMatch => continue,
351-
dfa::Result::Quit => {
352-
return self.shortest_match_nfa(
353-
MatchNfaType::Auto, text, start);
354-
}
355-
}
356-
}
357-
None
358-
}
359329
MatchType::Nfa(ty) => self.shortest_match_nfa(ty, text, start),
360330
MatchType::Nothing => None,
361331
}
@@ -375,7 +345,7 @@ impl<'c> RegularExpression for ExecNoSync<'c> {
375345
// filling in captures[1], but a RegexSet has no captures. In other
376346
// words, a RegexSet can't (currently) use shortest_match. ---AG
377347
match self.ro.match_type {
378-
Literal(_) | Dfa | DfaMany | DfaSuffix | Nothing => {
348+
Literal(_) | Dfa | DfaMany | Nothing => {
379349
self.shortest_match_at(text, start).is_some()
380350
}
381351
Nfa(ty) => {
@@ -404,15 +374,6 @@ impl<'c> RegularExpression for ExecNoSync<'c> {
404374
}
405375
}
406376
}
407-
MatchType::DfaSuffix => {
408-
match self.find_dfa_reverse_suffix(text, start) {
409-
dfa::Result::Match((s, e)) => Some((s, e)),
410-
dfa::Result::NoMatch => None,
411-
dfa::Result::Quit => {
412-
self.find_nfa(MatchNfaType::Auto, text, start)
413-
}
414-
}
415-
}
416377
MatchType::Nfa(ty) => self.find_nfa(ty, text, start),
417378
MatchType::Nothing => None,
418379
MatchType::DfaMany => {
@@ -457,19 +418,6 @@ impl<'c> RegularExpression for ExecNoSync<'c> {
457418
}
458419
}
459420
}
460-
MatchType::DfaSuffix => {
461-
match self.find_dfa_reverse_suffix(text, start) {
462-
dfa::Result::Match((s, _)) => {
463-
self.captures_nfa(
464-
MatchNfaType::Auto, slots, text, s)
465-
}
466-
dfa::Result::NoMatch => None,
467-
dfa::Result::Quit => {
468-
self.captures_nfa(
469-
MatchNfaType::Auto, slots, text, start)
470-
}
471-
}
472-
}
473421
MatchType::Nfa(ty) => {
474422
self.captures_nfa(ty, slots, text, start)
475423
}
@@ -505,7 +453,7 @@ impl<'c> ExecNoSync<'c> {
505453
matches[0] = self.exec_literals(ty, text, start).is_some();
506454
matches[0]
507455
}
508-
Dfa | DfaMany | DfaSuffix => {
456+
Dfa | DfaMany => {
509457
match dfa::Fsm::forward_many(
510458
&self.ro.dfa,
511459
&self.cache,
@@ -583,60 +531,6 @@ impl<'c> ExecNoSync<'c> {
583531
}
584532
}
585533

586-
/// Finds the leftmost-first match (start and end) using only the DFA
587-
/// by scanning for suffix literals.
588-
///
589-
/// If the result returned indicates that the DFA quit, then another
590-
/// matching engine should be used.
591-
#[inline(always)] // reduces constant overhead
592-
fn find_dfa_reverse_suffix(
593-
&self,
594-
text: &[u8],
595-
start: usize,
596-
) -> dfa::Result<(usize, usize)> {
597-
use dfa::Result::*;
598-
599-
let lcs = self.ro.suffixes.lcs();
600-
debug_assert!(lcs.len() >= 1);
601-
let mut end = start;
602-
while end <= text.len() {
603-
end = end + match lcs.find(&text[end..]) {
604-
None => return NoMatch,
605-
Some(end) => end + lcs.len(),
606-
};
607-
let match_start = match dfa::Fsm::reverse(
608-
&self.ro.dfa_reverse,
609-
&self.cache,
610-
false,
611-
&text[start..end],
612-
end - start,
613-
) {
614-
NoMatch => continue,
615-
Quit => return Quit,
616-
Match(s) => start + s,
617-
};
618-
// At this point, we've found a match. The only way to quit now
619-
// without a match is if the DFA gives up (seems unlikely).
620-
//
621-
// Now run the DFA forwards to find the proper end of the match.
622-
// (The suffix literal match can only indicate the earliest
623-
// possible end location, which may appear before the end of the
624-
// leftmost-first match.)
625-
return match dfa::Fsm::forward(
626-
&self.ro.dfa,
627-
&self.cache,
628-
false,
629-
text,
630-
match_start,
631-
) {
632-
NoMatch => panic!("BUG: reverse match implies forward match"),
633-
Quit => Quit,
634-
Match(e) => Match((match_start, e)),
635-
};
636-
}
637-
NoMatch
638-
}
639-
640534
/// Like find, but executes an NFA engine.
641535
fn find_nfa(
642536
&self,
@@ -908,39 +802,12 @@ impl ExecReadOnly {
908802
if self.res.len() >= 2 {
909803
return DfaMany;
910804
}
911-
// If there's a longish suffix literal, then it might be faster
912-
// to look for that first.
913-
if self.should_suffix_scan() {
914-
return DfaSuffix;
915-
}
916805
// Fall back to your garden variety forward searching lazy DFA.
917806
return Dfa;
918807
}
919808
// We're so totally hosed.
920809
Nfa(MatchNfaType::Auto)
921810
}
922-
923-
/// Returns true if the program is amenable to suffix scanning.
924-
///
925-
/// When this is true, as a heuristic, we assume it is OK to quickly scan
926-
/// for suffix literals and then do a *reverse* DFA match from any matches
927-
/// produced by the literal scan. (And then followed by a forward DFA
928-
/// search, since the previously found suffix literal maybe not actually be
929-
/// the end of a match.)
930-
///
931-
/// This is a bit of a specialized optimization, but can result in pretty
932-
/// big performance wins if 1) there are no prefix literals and 2) the
933-
/// suffix literals are pretty rare in the text. (1) is obviously easy to
934-
/// account for but (2) is harder. As a proxy, we assume that longer
935-
/// strings are generally rarer, so we only enable this optimization when
936-
/// we have a meaty suffix.
937-
fn should_suffix_scan(&self) -> bool {
938-
if self.suffixes.is_empty() {
939-
return false;
940-
}
941-
let lcs_len = self.suffixes.lcs().char_len();
942-
lcs_len >= 3 && lcs_len > self.dfa.prefixes.lcp().char_len()
943-
}
944811
}
945812

946813
#[derive(Clone, Copy, Debug)]
@@ -950,8 +817,6 @@ enum MatchType {
950817
Literal(MatchLiteralType),
951818
/// A normal DFA search.
952819
Dfa,
953-
/// A reverse DFA search with suffix literal scanning.
954-
DfaSuffix,
955820
/// Use the DFA on two or more regular expressions.
956821
DfaMany,
957822
/// An NFA variant.

tests/suffix_reverse.rs

+6-1
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,15 @@
88
// option. This file may not be copied, modified, or distributed
99
// except according to those terms.
1010

11+
// N.B. These tests were specifically for the reverse suffix literal
12+
// optimization. That optimization has since been removed because its worst
13+
// case time complexity is quadratic. There is hope for bringing it back, so
14+
// we leave these tests here.
15+
1116
mat!(t01, r".*abcd", r"abcd", Some((0, 4)));
1217
mat!(t02, r".*(?:abcd)+", r"abcd", Some((0, 4)));
1318
mat!(t03, r".*(?:abcd)+", r"abcdabcd", Some((0, 8)));
1419
mat!(t04, r".*(?:abcd)+", r"abcdxabcd", Some((0, 9)));
1520
mat!(t05, r".*x(?:abcd)+", r"abcdxabcd", Some((0, 9)));
1621
mat!(t06, r"[^abcd]*x(?:abcd)+", r"abcdxabcd", Some((4, 9)));
17-
// mat!(t05, r".*(?:abcd)+", r"abcdabcd", Some((0, 4)));
22+
mat!(t07, r".*(?:abcd)+", r"abcdabcd", Some((0, 8)));

0 commit comments

Comments
 (0)