@@ -152,6 +152,9 @@ struct CacheInner {
152
152
/// The total number of times this cache has been flushed by the DFA
153
153
/// because of space constraints.
154
154
flush_count : u64 ,
155
+ /// The total heap size of the DFA's cache. We use this to determine when
156
+ /// we should flush the cache.
157
+ size : usize ,
155
158
}
156
159
157
160
/// The transition table.
@@ -420,18 +423,32 @@ impl Cache {
420
423
pub fn new ( prog : & Program ) -> Self {
421
424
// We add 1 to account for the special EOF byte.
422
425
let num_byte_classes = ( prog. byte_classes [ 255 ] as usize + 1 ) + 1 ;
423
- Cache {
426
+ let starts = vec ! [ STATE_UNKNOWN ; 256 ] ;
427
+ let mut cache = Cache {
424
428
inner : CacheInner {
425
429
compiled : HashMap :: new ( ) ,
426
430
trans : Transitions :: new ( num_byte_classes) ,
427
431
states : vec ! [ ] ,
428
- start_states : vec ! [ STATE_UNKNOWN ; 256 ] ,
432
+ start_states : starts ,
429
433
stack : vec ! [ ] ,
430
434
flush_count : 0 ,
435
+ size : 0 ,
431
436
} ,
432
437
qcur : SparseSet :: new ( prog. insts . len ( ) ) ,
433
438
qnext : SparseSet :: new ( prog. insts . len ( ) ) ,
434
- }
439
+ } ;
440
+ cache. inner . reset_size ( ) ;
441
+ cache
442
+ }
443
+ }
444
+
445
+ impl CacheInner {
446
+ /// Resets the cache size to account for fixed costs, such as the program
447
+ /// and stack sizes.
448
+ fn reset_size ( & mut self ) {
449
+ self . size =
450
+ ( self . start_states . len ( ) * mem:: size_of :: < StatePtr > ( ) )
451
+ + ( self . stack . len ( ) * mem:: size_of :: < InstPtr > ( ) ) ;
435
452
}
436
453
}
437
454
@@ -1151,7 +1168,9 @@ impl<'a> Fsm<'a> {
1151
1168
}
1152
1169
// If the cache has gotten too big, wipe it.
1153
1170
if self . approximate_size ( ) > self . prog . dfa_size_limit {
1171
+ println ! ( "clearing cache (size: {:?})" , self . approximate_size( ) ) ;
1154
1172
if !self . clear_cache_and_save ( current_state) {
1173
+ println ! ( "giving up" ) ;
1155
1174
// Ooops. DFA is giving up.
1156
1175
return None ;
1157
1176
}
@@ -1280,6 +1299,7 @@ impl<'a> Fsm<'a> {
1280
1299
} else {
1281
1300
None
1282
1301
} ;
1302
+ self . cache . reset_size ( ) ;
1283
1303
self . cache . trans . clear ( ) ;
1284
1304
self . cache . states . clear ( ) ;
1285
1305
self . cache . compiled . clear ( ) ;
@@ -1454,6 +1474,11 @@ impl<'a> Fsm<'a> {
1454
1474
}
1455
1475
// Finally, put our actual state on to our heap of states and index it
1456
1476
// so we can find it later.
1477
+ self . cache . size +=
1478
+ self . cache . trans . state_heap_size ( )
1479
+ + ( 2 * state. data . len ( ) )
1480
+ + ( 2 * mem:: size_of :: < State > ( ) )
1481
+ + mem:: size_of :: < StatePtr > ( ) ;
1457
1482
self . cache . states . push ( state. clone ( ) ) ;
1458
1483
self . cache . compiled . insert ( state, si) ;
1459
1484
// Transition table and set of states and map should all be in sync.
@@ -1536,51 +1561,8 @@ impl<'a> Fsm<'a> {
1536
1561
/// be wiped. Namely, it is possible that for certain regexes on certain
1537
1562
/// inputs, a new state could be created for every byte of input. (This is
1538
1563
/// bad for memory use, so we bound it with a cache.)
1539
- ///
1540
- /// The approximation is guaranteed to be done in constant time (and
1541
- /// indeed, this requirement is why it's approximate).
1542
1564
fn approximate_size ( & self ) -> usize {
1543
- use std:: mem:: size_of as size;
1544
- // Estimate that there are about 16 instructions per state consuming
1545
- // 20 = 4 + (15 * 1) bytes of space (1 byte because of delta encoding).
1546
- const STATE_HEAP : usize = 20 + 1 ; // one extra byte for flags
1547
- let compiled =
1548
- ( self . cache . compiled . len ( ) * ( size :: < State > ( ) + STATE_HEAP ) )
1549
- + ( self . cache . compiled . len ( ) * size :: < StatePtr > ( ) ) ;
1550
- let states =
1551
- self . cache . states . len ( )
1552
- * ( size :: < State > ( )
1553
- + STATE_HEAP
1554
- + ( self . num_byte_classes ( ) * size :: < StatePtr > ( ) ) ) ;
1555
- let start_states = self . cache . start_states . len ( ) * size :: < StatePtr > ( ) ;
1556
- self . prog . approximate_size ( ) + compiled + states + start_states
1557
- }
1558
-
1559
- /// Returns the actual heap space of the DFA. This visits every state in
1560
- /// the DFA.
1561
- #[ allow( dead_code) ] // useful for debugging
1562
- fn actual_size ( & self ) -> usize {
1563
- let mut compiled = 0 ;
1564
- for k in self . cache . compiled . keys ( ) {
1565
- compiled += mem:: size_of :: < State > ( ) ;
1566
- compiled += mem:: size_of :: < StatePtr > ( ) ;
1567
- compiled += k. data . len ( ) * mem:: size_of :: < u8 > ( ) ;
1568
- }
1569
- let mut states = 0 ;
1570
- for s in & self . cache . states {
1571
- states += mem:: size_of :: < State > ( ) ;
1572
- states += s. data . len ( ) * mem:: size_of :: < u8 > ( ) ;
1573
- }
1574
- compiled
1575
- + states
1576
- + ( self . cache . trans . num_states ( ) *
1577
- mem:: size_of :: < StatePtr > ( ) *
1578
- self . num_byte_classes ( ) )
1579
- + ( self . cache . start_states . len ( ) * mem:: size_of :: < StatePtr > ( ) )
1580
- + ( self . cache . stack . len ( ) * mem:: size_of :: < InstPtr > ( ) )
1581
- + mem:: size_of :: < Fsm > ( )
1582
- + mem:: size_of :: < CacheInner > ( )
1583
- + self . prog . approximate_size ( ) // OK, not actual, but close enough
1565
+ self . cache . size + self . prog . approximate_size ( )
1584
1566
}
1585
1567
}
1586
1568
@@ -1628,6 +1610,11 @@ impl Transitions {
1628
1610
self . table [ si as usize + cls]
1629
1611
}
1630
1612
1613
+ /// The heap size, in bytes, of a single state in the transition table.
1614
+ fn state_heap_size ( & self ) -> usize {
1615
+ self . num_byte_classes * mem:: size_of :: < StatePtr > ( )
1616
+ }
1617
+
1631
1618
/// Like `next`, but uses unchecked access and is therefore unsafe.
1632
1619
unsafe fn next_unchecked ( & self , si : StatePtr , cls : usize ) -> StatePtr {
1633
1620
debug_assert ! ( ( si as usize ) < self . table. len( ) ) ;
0 commit comments