Skip to content

upgrade to aho-corasick 0.7 #566

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Mar 30, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ members = [

[dependencies]
# For very fast prefix literal matching.
aho-corasick = "0.6.7"
aho-corasick = "0.7.1"
# For skipping along search text quickly when a leading byte is known.
memchr = "2.0.2"
# For managing regex caches quickly across multiple threads.
Expand Down
4 changes: 1 addition & 3 deletions ci/script.sh
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,7 @@ ci/test-regex-capi
# very long time. Also, check that we can build the regex-debug tool.
if [ "$TRAVIS_RUST_VERSION" = "nightly" ]; then
cargo build --verbose --manifest-path regex-debug/Cargo.toml
for x in rust rust-bytes pcre1 onig; do
(cd bench && ./run $x --no-run --verbose)
done
(cd bench && ./run rust --no-run --verbose)

# Test minimal versions.
cargo +nightly generate-lockfile -Z minimal-versions
Expand Down
55 changes: 55 additions & 0 deletions regex-syntax/src/hir/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,8 @@ impl Hir {
info.set_any_anchored_start(false);
info.set_any_anchored_end(false);
info.set_match_empty(true);
info.set_literal(true);
info.set_alternation_literal(true);
Hir {
kind: HirKind::Empty,
info: info,
Expand All @@ -253,6 +255,8 @@ impl Hir {
info.set_any_anchored_start(false);
info.set_any_anchored_end(false);
info.set_match_empty(false);
info.set_literal(true);
info.set_alternation_literal(true);
Hir {
kind: HirKind::Literal(lit),
info: info,
Expand All @@ -271,6 +275,8 @@ impl Hir {
info.set_any_anchored_start(false);
info.set_any_anchored_end(false);
info.set_match_empty(false);
info.set_literal(false);
info.set_alternation_literal(false);
Hir {
kind: HirKind::Class(class),
info: info,
Expand All @@ -289,6 +295,8 @@ impl Hir {
info.set_any_anchored_start(false);
info.set_any_anchored_end(false);
info.set_match_empty(true);
info.set_literal(false);
info.set_alternation_literal(false);
if let Anchor::StartText = anchor {
info.set_anchored_start(true);
info.set_line_anchored_start(true);
Expand Down Expand Up @@ -322,6 +330,8 @@ impl Hir {
info.set_line_anchored_end(false);
info.set_any_anchored_start(false);
info.set_any_anchored_end(false);
info.set_literal(false);
info.set_alternation_literal(false);
// A negated word boundary matches the empty string, but a normal
// word boundary does not!
info.set_match_empty(word_boundary.is_negated());
Expand Down Expand Up @@ -357,6 +367,8 @@ impl Hir {
info.set_any_anchored_start(rep.hir.is_any_anchored_start());
info.set_any_anchored_end(rep.hir.is_any_anchored_end());
info.set_match_empty(rep.is_match_empty() || rep.hir.is_match_empty());
info.set_literal(false);
info.set_alternation_literal(false);
Hir {
kind: HirKind::Repetition(rep),
info: info,
Expand All @@ -375,6 +387,8 @@ impl Hir {
info.set_any_anchored_start(group.hir.is_any_anchored_start());
info.set_any_anchored_end(group.hir.is_any_anchored_end());
info.set_match_empty(group.hir.is_match_empty());
info.set_literal(false);
info.set_alternation_literal(false);
Hir {
kind: HirKind::Group(group),
info: info,
Expand All @@ -395,6 +409,8 @@ impl Hir {
info.set_any_anchored_start(false);
info.set_any_anchored_end(false);
info.set_match_empty(true);
info.set_literal(true);
info.set_alternation_literal(true);

// Some attributes require analyzing all sub-expressions.
for e in &exprs {
Expand All @@ -416,6 +432,14 @@ impl Hir {

let x = info.is_match_empty() && e.is_match_empty();
info.set_match_empty(x);

let x = info.is_literal() && e.is_literal();
info.set_literal(x);

let x =
info.is_alternation_literal()
&& e.is_alternation_literal();
info.set_alternation_literal(x);
}
// Anchored attributes require something slightly more
// sophisticated. Normally, WLOG, to determine whether an
Expand Down Expand Up @@ -488,6 +512,8 @@ impl Hir {
info.set_any_anchored_start(false);
info.set_any_anchored_end(false);
info.set_match_empty(false);
info.set_literal(false);
info.set_alternation_literal(true);

// Some attributes require analyzing all sub-expressions.
for e in &exprs {
Expand Down Expand Up @@ -523,6 +549,11 @@ impl Hir {

let x = info.is_match_empty() || e.is_match_empty();
info.set_match_empty(x);

let x =
info.is_alternation_literal()
&& e.is_literal();
info.set_alternation_literal(x);
}
Hir {
kind: HirKind::Alternation(exprs),
Expand Down Expand Up @@ -655,6 +686,28 @@ impl Hir {
pub fn is_match_empty(&self) -> bool {
self.info.is_match_empty()
}

/// Return true if and only if this HIR is a simple literal. This is only
/// true when this HIR expression is the empty expression, is itself a
/// `Literal`, or is a concatenation of only `Literal`s.
///
/// For example, `f` and `foo` are literals, but `f+`, `(foo)` and `foo()`
/// are not (even though they contain sub-expressions that are literals).
pub fn is_literal(&self) -> bool {
self.info.is_literal()
}

/// Return true if and only if this HIR is either a simple literal or an
/// alternation of simple literals. This is only true when this HIR
/// expression is a simple literal (see `is_literal`) or an alternation in
/// which every branch is a simple literal.
///
/// For example, `f`, `foo`, `a|b|c`, and `foo|bar|baz` are alternation
/// literals, but `f+`, `(foo)` and `foo()` are not (even though they
/// contain sub-expressions that are literals).
pub fn is_alternation_literal(&self) -> bool {
self.info.is_alternation_literal()
}
}

impl HirKind {
Expand Down Expand Up @@ -1415,6 +1468,8 @@ impl HirInfo {
define_bool!(6, is_any_anchored_start, set_any_anchored_start);
define_bool!(7, is_any_anchored_end, set_any_anchored_end);
define_bool!(8, is_match_empty, set_match_empty);
define_bool!(9, is_literal, set_literal);
define_bool!(10, is_alternation_literal, set_alternation_literal);
}

#[cfg(test)]
Expand Down
45 changes: 45 additions & 0 deletions regex-syntax/src/hir/translate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2589,4 +2589,49 @@ mod tests {
assert!(!t(r"\b").is_match_empty());
assert!(!t(r"(?-u)\b").is_match_empty());
}

#[test]
fn analysis_is_literal() {
// Exercises `Hir::is_literal` on translated patterns. `t` is a helper
// defined elsewhere in this test module; presumably it parses and
// translates the pattern into an `Hir` (confirm against its definition).

// Positive examples: the empty pattern and plain runs of characters,
// including when a flag such as `(?m)` is set.
assert!(t(r"").is_literal());
assert!(t(r"a").is_literal());
assert!(t(r"ab").is_literal());
assert!(t(r"abc").is_literal());
assert!(t(r"(?m)abc").is_literal());

// Negative examples: anchors, alternations, groups, repetitions and
// character classes are never simple literals, even when they contain
// or are adjacent to literal sub-expressions.
assert!(!t(r"^").is_literal());
assert!(!t(r"a|b").is_literal());
assert!(!t(r"(a)").is_literal());
assert!(!t(r"a+").is_literal());
assert!(!t(r"foo(a)").is_literal());
assert!(!t(r"(a)foo").is_literal());
assert!(!t(r"[a]").is_literal());
}

#[test]
fn analysis_is_alternation_literal() {
// Exercises `Hir::is_alternation_literal` on translated patterns. `t` is
// a helper defined elsewhere in this test module; presumably it parses
// and translates the pattern into an `Hir` (confirm against its
// definition).

// Positive examples: everything that is a simple literal, plus
// alternations whose branches are all simple literals.
assert!(t(r"").is_alternation_literal());
assert!(t(r"a").is_alternation_literal());
assert!(t(r"ab").is_alternation_literal());
assert!(t(r"abc").is_alternation_literal());
assert!(t(r"(?m)abc").is_alternation_literal());
assert!(t(r"a|b").is_alternation_literal());
assert!(t(r"a|b|c").is_alternation_literal());
assert!(t(r"foo|bar").is_alternation_literal());
assert!(t(r"foo|bar|baz").is_alternation_literal());

// Negative examples: anchors, groups, repetitions and classes, as well
// as alternations in which any branch is a class or a group.
assert!(!t(r"^").is_alternation_literal());
assert!(!t(r"(a)").is_alternation_literal());
assert!(!t(r"a+").is_alternation_literal());
assert!(!t(r"foo(a)").is_alternation_literal());
assert!(!t(r"(a)foo").is_alternation_literal());
assert!(!t(r"[a]").is_alternation_literal());
assert!(!t(r"[a]|b").is_alternation_literal());
assert!(!t(r"a|[b]").is_alternation_literal());
assert!(!t(r"(a)|b").is_alternation_literal());
assert!(!t(r"a|(b)").is_alternation_literal());
}
}
96 changes: 96 additions & 0 deletions src/exec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ use std::collections::HashMap;
use std::cmp;
use std::sync::Arc;

use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
use thread_local::CachedThreadLocal;
use syntax::ParserBuilder;
use syntax::hir::Hir;
Expand Down Expand Up @@ -86,6 +87,16 @@ struct ExecReadOnly {
/// Prefix literals are stored on the `Program`, since they are used inside
/// the matching engines.
suffixes: LiteralSearcher,
/// An Aho-Corasick automaton with leftmost-first match semantics.
///
/// This is only set when the entire regex is a simple unanchored
/// alternation of literals. We could probably use it in more circumstances,
/// but this is already hacky enough in this architecture.
///
/// N.B. We use u32 as a state ID representation under the assumption that
/// if we were to exhaust the ID space, we probably would have long
/// surpassed the compilation size limit.
ac: Option<AhoCorasick<u32>>,
/// match_type encodes as much upfront knowledge about how we're going to
/// execute a search as possible.
match_type: MatchType,
Expand Down Expand Up @@ -287,6 +298,7 @@ impl ExecBuilder {
dfa: Program::new(),
dfa_reverse: Program::new(),
suffixes: LiteralSearcher::empty(),
ac: None,
match_type: MatchType::Nothing,
});
return Ok(Exec { ro: ro, cache: CachedThreadLocal::new() });
Expand Down Expand Up @@ -319,12 +331,32 @@ impl ExecBuilder {
dfa.dfa_size_limit = self.options.dfa_size_limit;
dfa_reverse.dfa_size_limit = self.options.dfa_size_limit;

let mut ac = None;
if parsed.exprs.len() == 1 {
if let Some(lits) = alternation_literals(&parsed.exprs[0]) {
// If we have a small number of literals, then let Teddy
// handle things (see literal/mod.rs).
if lits.len() > 32 {
let fsm = AhoCorasickBuilder::new()
.match_kind(MatchKind::LeftmostFirst)
.auto_configure(&lits)
// We always want this to reduce size, regardless of
// what auto-configure does.
.byte_classes(true)
.build_with_size::<u32, _, _>(&lits)
.expect("AC automaton too big");
ac = Some(fsm);
}
}
}

let mut ro = ExecReadOnly {
res: self.options.pats,
nfa: nfa,
dfa: dfa,
dfa_reverse: dfa_reverse,
suffixes: LiteralSearcher::suffixes(suffixes),
ac: ac,
match_type: MatchType::Nothing,
};
ro.match_type = ro.choose_match_type(self.match_type);
Expand Down Expand Up @@ -633,6 +665,11 @@ impl<'c> ExecNoSync<'c> {
lits.find_end(&text[start..])
.map(|(s, e)| (start + s, start + e))
}
AhoCorasick => {
self.ro.ac.as_ref().unwrap()
.find(&text[start..])
.map(|m| (start + m.start(), start + m.end()))
}
}
}

Expand Down Expand Up @@ -1163,6 +1200,9 @@ impl ExecReadOnly {
// aren't anchored. We would then only search for all of them when at
// the beginning of the input and use the subset in all other cases.
if self.res.len() == 1 {
if self.ac.is_some() {
return Literal(MatchLiteralType::AhoCorasick);
}
if self.nfa.prefixes.complete() {
return if self.nfa.is_anchored_start {
Literal(MatchLiteralType::AnchoredStart)
Expand Down Expand Up @@ -1254,6 +1294,9 @@ enum MatchLiteralType {
AnchoredStart,
/// Match literals only at the end of text.
AnchoredEnd,
/// Use an Aho-Corasick automaton. This requires `ac` to be Some on
/// ExecReadOnly.
AhoCorasick,
}

#[derive(Clone, Copy, Debug)]
Expand Down Expand Up @@ -1295,6 +1338,59 @@ impl ProgramCacheInner {
}
}

/// Checks whether the given HIR is a simple alternation of literals and, if
/// so, returns those literals as byte strings.
///
/// Returns `Some(lits)` with one entry per alternation branch, in branch
/// order, where Unicode literals are encoded as UTF-8. Returns `None` when
/// the expression is not an alternation literal, or when it is not an
/// `Alternation` at all (a single literal isn't worth special handling).
///
/// An empty branch (as in `a|`) yields an empty byte string.
fn alternation_literals(expr: &Hir) -> Option<Vec<Vec<u8>>> {
    use syntax::hir::{HirKind, Literal};

    // This is pretty hacky, but basically, if `is_alternation_literal` is
    // true, then we can make several assumptions about the structure of our
    // HIR. This is what justifies the `unreachable!` statements below.
    //
    // This code should be refactored once we overhaul this crate's
    // optimization pipeline, because this is a terribly inflexible way to go
    // about things.

    if !expr.is_alternation_literal() {
        return None;
    }
    let alts = match *expr.kind() {
        HirKind::Alternation(ref alts) => alts,
        _ => return None, // one literal isn't worth it
    };

    // Appends the byte representation of a single literal to `dst`.
    let extendlit = |lit: &Literal, dst: &mut Vec<u8>| {
        match *lit {
            Literal::Unicode(c) => {
                let mut buf = [0; 4];
                dst.extend_from_slice(c.encode_utf8(&mut buf).as_bytes());
            }
            Literal::Byte(b) => {
                dst.push(b);
            }
        }
    };

    let mut lits = Vec::with_capacity(alts.len());
    for alt in alts {
        let mut lit = vec![];
        match *alt.kind() {
            // The empty expression is flagged as a literal, so an
            // alternation such as `a|` passes the `is_alternation_literal`
            // check above. It contributes no bytes rather than being
            // treated as impossible.
            HirKind::Empty => {}
            HirKind::Literal(ref x) => extendlit(x, &mut lit),
            HirKind::Concat(ref exprs) => {
                for e in exprs {
                    match *e.kind() {
                        // Same reasoning as above: an empty sub-expression
                        // counts as a literal and adds nothing.
                        HirKind::Empty => {}
                        HirKind::Literal(ref x) => extendlit(x, &mut lit),
                        _ => unreachable!("expected literal, got {:?}", e),
                    }
                }
            }
            _ => unreachable!("expected literal or concat, got {:?}", alt),
        }
        lits.push(lit);
    }
    Some(lits)
}

#[cfg(test)]
mod test {
#[test]
Expand Down
Loading