@@ -13,6 +13,7 @@ use std::collections::HashMap;
13
13
use std:: cmp;
14
14
use std:: sync:: Arc ;
15
15
16
+ use aho_corasick:: { AhoCorasick , AhoCorasickBuilder , MatchKind } ;
16
17
use thread_local:: CachedThreadLocal ;
17
18
use syntax:: ParserBuilder ;
18
19
use syntax:: hir:: Hir ;
@@ -86,6 +87,16 @@ struct ExecReadOnly {
86
87
/// Prefix literals are stored on the `Program`, since they are used inside
87
88
/// the matching engines.
88
89
suffixes : LiteralSearcher ,
90
+ /// An Aho-Corasick automaton with leftmost-first match semantics.
91
+ ///
92
+ /// This is only set when the entire regex is a simple unanchored
93
+ /// alternation of literals. We could probably use it in more circumstances,
94
+ /// but this is already hacky enough in this architecture.
95
+ ///
96
+ /// N.B. We use u32 as a state ID representation under the assumption that
97
+ /// if we were to exhaust the ID space, we probably would have long
98
+ /// surpassed the compilation size limit.
99
+ ac : Option < AhoCorasick < u32 > > ,
89
100
/// match_type encodes as much upfront knowledge about how we're going to
90
101
/// execute a search as possible.
91
102
match_type : MatchType ,
@@ -287,6 +298,7 @@ impl ExecBuilder {
287
298
dfa : Program :: new ( ) ,
288
299
dfa_reverse : Program :: new ( ) ,
289
300
suffixes : LiteralSearcher :: empty ( ) ,
301
+ ac : None ,
290
302
match_type : MatchType :: Nothing ,
291
303
} ) ;
292
304
return Ok ( Exec { ro : ro, cache : CachedThreadLocal :: new ( ) } ) ;
@@ -319,12 +331,32 @@ impl ExecBuilder {
319
331
dfa. dfa_size_limit = self . options . dfa_size_limit ;
320
332
dfa_reverse. dfa_size_limit = self . options . dfa_size_limit ;
321
333
334
+ let mut ac = None ;
335
+ if parsed. exprs . len ( ) == 1 {
336
+ if let Some ( lits) = alternation_literals ( & parsed. exprs [ 0 ] ) {
337
+ // If we have a small number of literals, then let Teddy
338
+ // handle things (see literal/mod.rs).
339
+ if lits. len ( ) > 32 {
340
+ let fsm = AhoCorasickBuilder :: new ( )
341
+ . match_kind ( MatchKind :: LeftmostFirst )
342
+ . auto_configure ( & lits)
343
+ // We always want this to reduce size, regardless of
344
+ // what auto-configure does.
345
+ . byte_classes ( true )
346
+ . build_with_size :: < u32 , _ , _ > ( & lits)
347
+ . expect ( "AC automaton too big" ) ;
348
+ ac = Some ( fsm) ;
349
+ }
350
+ }
351
+ }
352
+
322
353
let mut ro = ExecReadOnly {
323
354
res : self . options . pats ,
324
355
nfa : nfa,
325
356
dfa : dfa,
326
357
dfa_reverse : dfa_reverse,
327
358
suffixes : LiteralSearcher :: suffixes ( suffixes) ,
359
+ ac : ac,
328
360
match_type : MatchType :: Nothing ,
329
361
} ;
330
362
ro. match_type = ro. choose_match_type ( self . match_type ) ;
@@ -633,6 +665,11 @@ impl<'c> ExecNoSync<'c> {
633
665
lits. find_end ( & text[ start..] )
634
666
. map ( |( s, e) | ( start + s, start + e) )
635
667
}
668
+ AhoCorasick => {
669
+ self . ro . ac . as_ref ( ) . unwrap ( )
670
+ . find ( & text[ start..] )
671
+ . map ( |m| ( start + m. start ( ) , start + m. end ( ) ) )
672
+ }
636
673
}
637
674
}
638
675
@@ -1163,6 +1200,9 @@ impl ExecReadOnly {
1163
1200
// aren't anchored. We would then only search for all of them when at
1164
1201
// the beginning of the input and use the subset in all other cases.
1165
1202
if self . res . len ( ) == 1 {
1203
+ if self . ac . is_some ( ) {
1204
+ return Literal ( MatchLiteralType :: AhoCorasick ) ;
1205
+ }
1166
1206
if self . nfa . prefixes . complete ( ) {
1167
1207
return if self . nfa . is_anchored_start {
1168
1208
Literal ( MatchLiteralType :: AnchoredStart )
@@ -1254,6 +1294,9 @@ enum MatchLiteralType {
1254
1294
AnchoredStart ,
1255
1295
/// Match literals only at the end of text.
1256
1296
AnchoredEnd ,
1297
+ /// Use an Aho-Corasick automaton. This requires `ac` to be Some on
1298
+ /// ExecReadOnly.
1299
+ AhoCorasick ,
1257
1300
}
1258
1301
1259
1302
#[ derive( Clone , Copy , Debug ) ]
@@ -1295,6 +1338,59 @@ impl ProgramCacheInner {
1295
1338
}
1296
1339
}
1297
1340
1341
+ /// Alternation literals checks if the given HIR is a simple alternation of
1342
+ /// literals, and if so, returns them. Otherwise, this returns None.
1343
+ fn alternation_literals ( expr : & Hir ) -> Option < Vec < Vec < u8 > > > {
1344
+ use syntax:: hir:: { HirKind , Literal } ;
1345
+
1346
+ // This is pretty hacky, but basically, if `is_alternation_literal` is
1347
+ // true, then we can make several assumptions about the structure of our
1348
+ // HIR. This is what justifies the `unreachable!` statements below.
1349
+ //
1350
+ // This code should be refactored once we overhaul this crate's
1351
+ // optimization pipeline, because this is a terribly inflexible way to go
1352
+ // about things.
1353
+
1354
+ if !expr. is_alternation_literal ( ) {
1355
+ return None ;
1356
+ }
1357
+ let alts = match * expr. kind ( ) {
1358
+ HirKind :: Alternation ( ref alts) => alts,
1359
+ _ => return None , // one literal isn't worth it
1360
+ } ;
1361
+
1362
+ let extendlit = |lit : & Literal , dst : & mut Vec < u8 > | {
1363
+ match * lit {
1364
+ Literal :: Unicode ( c) => {
1365
+ let mut buf = [ 0 ; 4 ] ;
1366
+ dst. extend_from_slice ( c. encode_utf8 ( & mut buf) . as_bytes ( ) ) ;
1367
+ }
1368
+ Literal :: Byte ( b) => {
1369
+ dst. push ( b) ;
1370
+ }
1371
+ }
1372
+ } ;
1373
+
1374
+ let mut lits = vec ! [ ] ;
1375
+ for alt in alts {
1376
+ let mut lit = vec ! [ ] ;
1377
+ match * alt. kind ( ) {
1378
+ HirKind :: Literal ( ref x) => extendlit ( x, & mut lit) ,
1379
+ HirKind :: Concat ( ref exprs) => {
1380
+ for e in exprs {
1381
+ match * e. kind ( ) {
1382
+ HirKind :: Literal ( ref x) => extendlit ( x, & mut lit) ,
1383
+ _ => unreachable ! ( "expected literal, got {:?}" , e) ,
1384
+ }
1385
+ }
1386
+ }
1387
+ _ => unreachable ! ( "expected literal or concat, got {:?}" , alt) ,
1388
+ }
1389
+ lits. push ( lit) ;
1390
+ }
1391
+ Some ( lits)
1392
+ }
1393
+
1298
1394
#[ cfg( test) ]
1299
1395
mod test {
1300
1396
#[ test]
0 commit comments