Skip to content

Commit f8ebdbb

Browse files
committed
exec: add Aho-Corasick optimization
Finally, if a regex is just `foo|bar|baz|...|quux`, we will now use plain old Aho-Corasick. The reason we weren't doing this before is that Aho-Corasick didn't support proper leftmost-first match semantics. But since aho-corasick 0.7, it does, so we can now use it as a drop-in replacement. This basically fixes a pretty bad performance bug in a really common case, but it is otherwise really hacky. First of all, this only happens when a regex is literally `foo|bar|...|baz`. Even something like `foo|b(a)r|...|baz` will prevent this optimization from happening, which is a little silly. Second of all, this optimization only kicks in after we've compiled the full pattern, which adds quite a bit of overhead. Fixing this isn't trivial, since we may need the compiled program to resolve capturing groups. The way to do this is probably to specialize compilation for certain types of expressions. Maybe. Anyway, we hack this in for now, and punt on further improvements until we can really re-think how this should all work.
1 parent bd5f2b4 commit f8ebdbb

File tree

2 files changed

+106
-0
lines changed

2 files changed

+106
-0
lines changed

src/exec.rs

+96
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ use std::collections::HashMap;
1313
use std::cmp;
1414
use std::sync::Arc;
1515

16+
use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
1617
use thread_local::CachedThreadLocal;
1718
use syntax::ParserBuilder;
1819
use syntax::hir::Hir;
@@ -86,6 +87,16 @@ struct ExecReadOnly {
8687
/// Prefix literals are stored on the `Program`, since they are used inside
8788
/// the matching engines.
8889
suffixes: LiteralSearcher,
90+
/// An Aho-Corasick automaton with leftmost-first match semantics.
91+
///
92+
/// This is only set when the entire regex is a simple unanchored
93+
/// alternation of literals. We could probably use it in more circumstances,
94+
/// but this is already hacky enough in this architecture.
95+
///
96+
/// N.B. We use u32 as a state ID representation under the assumption that
97+
/// if we were to exhaust the ID space, we probably would have long
98+
/// surpassed the compilation size limit.
99+
ac: Option<AhoCorasick<u32>>,
89100
/// match_type encodes as much upfront knowledge about how we're going to
90101
/// execute a search as possible.
91102
match_type: MatchType,
@@ -287,6 +298,7 @@ impl ExecBuilder {
287298
dfa: Program::new(),
288299
dfa_reverse: Program::new(),
289300
suffixes: LiteralSearcher::empty(),
301+
ac: None,
290302
match_type: MatchType::Nothing,
291303
});
292304
return Ok(Exec { ro: ro, cache: CachedThreadLocal::new() });
@@ -319,12 +331,32 @@ impl ExecBuilder {
319331
dfa.dfa_size_limit = self.options.dfa_size_limit;
320332
dfa_reverse.dfa_size_limit = self.options.dfa_size_limit;
321333

334+
let mut ac = None;
335+
if parsed.exprs.len() == 1 {
336+
if let Some(lits) = alternation_literals(&parsed.exprs[0]) {
337+
// If we have a small number of literals, then let Teddy
338+
// handle things (see literal/mod.rs).
339+
if lits.len() > 32 {
340+
let fsm = AhoCorasickBuilder::new()
341+
.match_kind(MatchKind::LeftmostFirst)
342+
.auto_configure(&lits)
343+
// We always want this to reduce size, regardless of
344+
// what auto-configure does.
345+
.byte_classes(true)
346+
.build_with_size::<u32, _, _>(&lits)
347+
.expect("AC automaton too big");
348+
ac = Some(fsm);
349+
}
350+
}
351+
}
352+
322353
let mut ro = ExecReadOnly {
323354
res: self.options.pats,
324355
nfa: nfa,
325356
dfa: dfa,
326357
dfa_reverse: dfa_reverse,
327358
suffixes: LiteralSearcher::suffixes(suffixes),
359+
ac: ac,
328360
match_type: MatchType::Nothing,
329361
};
330362
ro.match_type = ro.choose_match_type(self.match_type);
@@ -633,6 +665,11 @@ impl<'c> ExecNoSync<'c> {
633665
lits.find_end(&text[start..])
634666
.map(|(s, e)| (start + s, start + e))
635667
}
668+
AhoCorasick => {
669+
self.ro.ac.as_ref().unwrap()
670+
.find(&text[start..])
671+
.map(|m| (start + m.start(), start + m.end()))
672+
}
636673
}
637674
}
638675

@@ -1163,6 +1200,9 @@ impl ExecReadOnly {
11631200
// aren't anchored. We would then only search for all of them when at
11641201
// the beginning of the input and use the subset in all other cases.
11651202
if self.res.len() == 1 {
1203+
if self.ac.is_some() {
1204+
return Literal(MatchLiteralType::AhoCorasick);
1205+
}
11661206
if self.nfa.prefixes.complete() {
11671207
return if self.nfa.is_anchored_start {
11681208
Literal(MatchLiteralType::AnchoredStart)
@@ -1254,6 +1294,9 @@ enum MatchLiteralType {
12541294
AnchoredStart,
12551295
/// Match literals only at the end of text.
12561296
AnchoredEnd,
1297+
/// Use an Aho-Corasick automaton. This requires `ac` to be Some on
1298+
/// ExecReadOnly.
1299+
AhoCorasick,
12571300
}
12581301

12591302
#[derive(Clone, Copy, Debug)]
@@ -1295,6 +1338,59 @@ impl ProgramCacheInner {
12951338
}
12961339
}
12971340

1341+
/// Checks whether the given HIR is a simple alternation of literals and, if
1342+
/// so, returns the bytes of each alternate in order of appearance.
///
/// Returns None when `is_alternation_literal` is false for `expr`, or when
/// `expr` is not itself an `Alternation` (e.g. it is a single literal).
/// Unicode literals are returned as their UTF-8 encoding; byte literals are
/// returned verbatim.
///
/// This panics (via `unreachable!`) if an alternate is anything other than a
/// literal or a concatenation of literals, which `is_alternation_literal` is
/// assumed to rule out.
1343+
fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> {
1344+
use syntax::hir::{HirKind, Literal};
1345+
1346+
// This is pretty hacky, but basically, if `is_alternation_literal` is
1347+
// true, then we can make several assumptions about the structure of our
1348+
// HIR. This is what justifies the `unreachable!` statements below.
1349+
//
1350+
// This code should be refactored once we overhaul this crate's
1351+
// optimization pipeline, because this is a terribly inflexible way to go
1352+
// about things.
1353+
1354+
if !expr.is_alternation_literal() {
1355+
return None;
1356+
}
1357+
let alts = match *expr.kind() {
1358+
HirKind::Alternation(ref alts) => alts,
1359+
_ => return None, // one literal isn't worth it
1360+
};
1361+
1362+
// Appends the byte representation of a single HIR literal to `dst`.
let extendlit = |lit: &Literal, dst: &mut Vec<u8>| {
1363+
match *lit {
1364+
Literal::Unicode(c) => {
1365+
// A `char` occupies at most 4 bytes when encoded as UTF-8.
let mut buf = [0; 4];
1366+
dst.extend_from_slice(c.encode_utf8(&mut buf).as_bytes());
1367+
}
1368+
Literal::Byte(b) => {
1369+
dst.push(b);
1370+
}
1371+
}
1372+
};
1373+
1374+
// Flatten each alternate (a lone literal, or a concatenation of literals)
// into one byte string, preserving the order of the alternation.
let mut lits = vec![];
1375+
for alt in alts {
1376+
let mut lit = vec![];
1377+
match *alt.kind() {
1378+
HirKind::Literal(ref x) => extendlit(x, &mut lit),
1379+
HirKind::Concat(ref exprs) => {
1380+
for e in exprs {
1381+
match *e.kind() {
1382+
HirKind::Literal(ref x) => extendlit(x, &mut lit),
1383+
_ => unreachable!("expected literal, got {:?}", e),
1384+
}
1385+
}
1386+
}
1387+
_ => unreachable!("expected literal or concat, got {:?}", alt),
1388+
}
1389+
lits.push(lit);
1390+
}
1391+
Some(lits)
1392+
}
1393+
12981394
#[cfg(test)]
12991395
mod test {
13001396
#[test]

tests/regression.rs

+10
Original file line numberDiff line numberDiff line change
@@ -114,3 +114,13 @@ ismatch!(
114114
\u{10}\u{11}\u{12}\u{13}\u{14}\u{15}\u{16}\u{17}\
115115
\u{18}\u{19}\u{1a}\u{1b}\u{1c}\u{1d}\u{1e}\u{1f}",
116116
true);
117+
118+
// Tests that our Aho-Corasick optimization works correctly. It only
119+
// kicks in when we have >32 literals.
//
// The pattern below has 54 alternates. `samwise` is listed before `sam`
// (and before the single letter `s`), so the expected match on "samwise"
// is the whole haystack, (0, 7), rather than one of the shorter alternates
// that also match at offset 0.
120+
mat!(
121+
ahocorasick1,
122+
"samwise|sam|a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|\
123+
A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z",
124+
"samwise",
125+
Some((0, 7))
126+
);

0 commit comments

Comments
 (0)