Skip to content

Commit 11447f0

Browse files
authored
Merge pull request #270 from BurntSushi/fixes
fix a few bugs
2 parents d95d021 + 16931b0 commit 11447f0

File tree

7 files changed

+96
-54
lines changed

7 files changed

+96
-54
lines changed

.travis.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@ rust:
77
sudo: false
88
script:
99
- cargo build --verbose
10-
- cargo build --verbose --manifest-path=regex-debug/Cargo.toml
1110
- if [ "$TRAVIS_RUST_VERSION" = "nightly" ]; then
11+
cargo build --verbose --manifest-path=regex-debug/Cargo.toml;
1212
RUSTFLAGS="-C target-feature=+ssse3" cargo test --verbose --features 'simd-accel pattern';
1313
else
1414
travis_wait cargo test --verbose;

regex-syntax/src/lib.rs

+30
Original file line numberDiff line numberDiff line change
@@ -528,6 +528,21 @@ impl Expr {
528528
}
529529
}
530530

531+
/// Returns true if and only if the expression has at least one matchable
532+
/// sub-expression that must match the beginning of text.
533+
pub fn has_anchored_start(&self) -> bool {
534+
match *self {
535+
Repeat { ref e, r, .. } => {
536+
!r.matches_empty() && e.has_anchored_start()
537+
}
538+
Group { ref e, .. } => e.has_anchored_start(),
539+
Concat(ref es) => es[0].has_anchored_start(),
540+
Alternate(ref es) => es.iter().any(|e| e.has_anchored_start()),
541+
StartText => true,
542+
_ => false,
543+
}
544+
}
545+
531546
/// Returns true if and only if the expression is required to match at the
532547
/// end of the text.
533548
pub fn is_anchored_end(&self) -> bool {
@@ -543,6 +558,21 @@ impl Expr {
543558
}
544559
}
545560

561+
/// Returns true if and only if the expression has at least one matchable
562+
/// sub-expression that must match the beginning of text.
563+
pub fn has_anchored_end(&self) -> bool {
564+
match *self {
565+
Repeat { ref e, r, .. } => {
566+
!r.matches_empty() && e.has_anchored_end()
567+
}
568+
Group { ref e, .. } => e.has_anchored_end(),
569+
Concat(ref es) => es[es.len() - 1].has_anchored_end(),
570+
Alternate(ref es) => es.iter().any(|e| e.has_anchored_end()),
571+
EndText => true,
572+
_ => false,
573+
}
574+
}
575+
546576
/// Returns true if and only if the expression contains sub-expressions
547577
/// that can match arbitrary bytes.
548578
pub fn has_bytes(&self) -> bool {

src/compile.rs

+6
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,9 @@ impl Compiler {
143143
// matching engine itself.
144144
let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
145145
self.compiled.is_anchored_start = expr.is_anchored_start();
146+
self.compiled.has_anchored_start = expr.has_anchored_start();
146147
self.compiled.is_anchored_end = expr.is_anchored_end();
148+
self.compiled.has_anchored_end = expr.has_anchored_end();
147149
if self.compiled.needs_dotstar() {
148150
dotstar_patch = try!(self.c_dotstar());
149151
self.compiled.start = dotstar_patch.entry;
@@ -171,6 +173,10 @@ impl Compiler {
171173
exprs.iter().all(|e| e.is_anchored_start());
172174
self.compiled.is_anchored_end =
173175
exprs.iter().all(|e| e.is_anchored_end());
176+
self.compiled.has_anchored_start =
177+
exprs.iter().any(|e| e.has_anchored_start());
178+
self.compiled.has_anchored_end =
179+
exprs.iter().any(|e| e.has_anchored_end());
174180
let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
175181
if self.compiled.needs_dotstar() {
176182
dotstar_patch = try!(self.c_dotstar());

src/dfa.rs

+34-47
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,9 @@ struct CacheInner {
152152
/// The total number of times this cache has been flushed by the DFA
153153
/// because of space constraints.
154154
flush_count: u64,
155+
/// The total heap size of the DFA's cache. We use this to determine when
156+
/// we should flush the cache.
157+
size: usize,
155158
}
156159

157160
/// The transition table.
@@ -420,18 +423,32 @@ impl Cache {
420423
pub fn new(prog: &Program) -> Self {
421424
// We add 1 to account for the special EOF byte.
422425
let num_byte_classes = (prog.byte_classes[255] as usize + 1) + 1;
423-
Cache {
426+
let starts = vec![STATE_UNKNOWN; 256];
427+
let mut cache = Cache {
424428
inner: CacheInner {
425429
compiled: HashMap::new(),
426430
trans: Transitions::new(num_byte_classes),
427431
states: vec![],
428-
start_states: vec![STATE_UNKNOWN; 256],
432+
start_states: starts,
429433
stack: vec![],
430434
flush_count: 0,
435+
size: 0,
431436
},
432437
qcur: SparseSet::new(prog.insts.len()),
433438
qnext: SparseSet::new(prog.insts.len()),
434-
}
439+
};
440+
cache.inner.reset_size();
441+
cache
442+
}
443+
}
444+
445+
impl CacheInner {
446+
/// Resets the cache size to account for fixed costs, such as the program
447+
/// and stack sizes.
448+
fn reset_size(&mut self) {
449+
self.size =
450+
(self.start_states.len() * mem::size_of::<StatePtr>())
451+
+ (self.stack.len() * mem::size_of::<InstPtr>());
435452
}
436453
}
437454

@@ -1151,7 +1168,9 @@ impl<'a> Fsm<'a> {
11511168
}
11521169
// If the cache has gotten too big, wipe it.
11531170
if self.approximate_size() > self.prog.dfa_size_limit {
1171+
println!("clearing cache (size: {:?})", self.approximate_size());
11541172
if !self.clear_cache_and_save(current_state) {
1173+
println!("giving up");
11551174
// Ooops. DFA is giving up.
11561175
return None;
11571176
}
@@ -1280,6 +1299,7 @@ impl<'a> Fsm<'a> {
12801299
} else {
12811300
None
12821301
};
1302+
self.cache.reset_size();
12831303
self.cache.trans.clear();
12841304
self.cache.states.clear();
12851305
self.cache.compiled.clear();
@@ -1454,6 +1474,11 @@ impl<'a> Fsm<'a> {
14541474
}
14551475
// Finally, put our actual state on to our heap of states and index it
14561476
// so we can find it later.
1477+
self.cache.size +=
1478+
self.cache.trans.state_heap_size()
1479+
+ (2 * state.data.len())
1480+
+ (2 * mem::size_of::<State>())
1481+
+ mem::size_of::<StatePtr>();
14571482
self.cache.states.push(state.clone());
14581483
self.cache.compiled.insert(state, si);
14591484
// Transition table and set of states and map should all be in sync.
@@ -1536,51 +1561,8 @@ impl<'a> Fsm<'a> {
15361561
/// be wiped. Namely, it is possible that for certain regexes on certain
15371562
/// inputs, a new state could be created for every byte of input. (This is
15381563
/// bad for memory use, so we bound it with a cache.)
1539-
///
1540-
/// The approximation is guaranteed to be done in constant time (and
1541-
/// indeed, this requirement is why it's approximate).
15421564
fn approximate_size(&self) -> usize {
1543-
use std::mem::size_of as size;
1544-
// Estimate that there are about 16 instructions per state consuming
1545-
// 20 = 4 + (15 * 1) bytes of space (1 byte because of delta encoding).
1546-
const STATE_HEAP: usize = 20 + 1; // one extra byte for flags
1547-
let compiled =
1548-
(self.cache.compiled.len() * (size::<State>() + STATE_HEAP))
1549-
+ (self.cache.compiled.len() * size::<StatePtr>());
1550-
let states =
1551-
self.cache.states.len()
1552-
* (size::<State>()
1553-
+ STATE_HEAP
1554-
+ (self.num_byte_classes() * size::<StatePtr>()));
1555-
let start_states = self.cache.start_states.len() * size::<StatePtr>();
1556-
self.prog.approximate_size() + compiled + states + start_states
1557-
}
1558-
1559-
/// Returns the actual heap space of the DFA. This visits every state in
1560-
/// the DFA.
1561-
#[allow(dead_code)] // useful for debugging
1562-
fn actual_size(&self) -> usize {
1563-
let mut compiled = 0;
1564-
for k in self.cache.compiled.keys() {
1565-
compiled += mem::size_of::<State>();
1566-
compiled += mem::size_of::<StatePtr>();
1567-
compiled += k.data.len() * mem::size_of::<u8>();
1568-
}
1569-
let mut states = 0;
1570-
for s in &self.cache.states {
1571-
states += mem::size_of::<State>();
1572-
states += s.data.len() * mem::size_of::<u8>();
1573-
}
1574-
compiled
1575-
+ states
1576-
+ (self.cache.trans.num_states() *
1577-
mem::size_of::<StatePtr>() *
1578-
self.num_byte_classes())
1579-
+ (self.cache.start_states.len() * mem::size_of::<StatePtr>())
1580-
+ (self.cache.stack.len() * mem::size_of::<InstPtr>())
1581-
+ mem::size_of::<Fsm>()
1582-
+ mem::size_of::<CacheInner>()
1583-
+ self.prog.approximate_size() // OK, not actual, but close enough
1565+
self.cache.size + self.prog.approximate_size()
15841566
}
15851567
}
15861568

@@ -1628,6 +1610,11 @@ impl Transitions {
16281610
self.table[si as usize + cls]
16291611
}
16301612

1613+
/// The heap size, in bytes, of a single state in the transition table.
1614+
fn state_heap_size(&self) -> usize {
1615+
self.num_byte_classes * mem::size_of::<StatePtr>()
1616+
}
1617+
16311618
/// Like `next`, but uses unchecked access and is therefore unsafe.
16321619
unsafe fn next_unchecked(&self, si: StatePtr, cls: usize) -> StatePtr {
16331620
debug_assert!((si as usize) < self.table.len());

src/exec.rs

+10-6
Original file line numberDiff line numberDiff line change
@@ -840,11 +840,7 @@ impl<'c> ExecNoSync<'c> {
840840
) -> Option<(usize, usize)> {
841841
// We can't use match_end directly, because we may need to examine
842842
// one "character" after the end of a match for lookahead operators.
843-
let e = if self.ro.nfa.uses_bytes() {
844-
cmp::min(match_end + 1, text.len())
845-
} else {
846-
cmp::min(next_utf8(text, match_end), text.len())
847-
};
843+
let e = cmp::min(next_utf8(text, match_end), text.len());
848844
self.captures_nfa(slots, &text[..e], match_start)
849845
}
850846

@@ -1113,7 +1109,15 @@ impl ExecReadOnly {
11131109
// If our set of prefixes is complete, then we can use it to find
11141110
// a match in lieu of a regex engine. This doesn't quite work well in
11151111
// the presence of multiple regexes, so only do it when there's one.
1116-
if self.res.len() == 1 {
1112+
//
1113+
// TODO(burntsushi): Also, don't try to match literals if the regex is
1114+
// partially anchored. We could technically do it, but we'd need to
1115+
// create two sets of literals: all of them and then the subset that
1116+
// aren't anchored. We would then only search for all of them when at
1117+
// the beginning of the input and use the subset in all other cases.
1118+
if self.res.len() == 1
1119+
&& !self.nfa.has_anchored_start
1120+
&& !self.nfa.has_anchored_end {
11171121
if self.nfa.prefixes.complete() {
11181122
return if self.nfa.is_anchored_start {
11191123
Literal(MatchLiteralType::AnchoredStart)

src/prog.rs

+8
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,12 @@ pub struct Program {
5252
pub is_anchored_start: bool,
5353
/// Whether the regex must match at the end of the input.
5454
pub is_anchored_end: bool,
55+
/// Whether the regex has at least one matchable sub-expression that must
56+
/// match from the start of the input.
57+
pub has_anchored_start: bool,
58+
/// Whether the regex has at least one matchable sub-expression that must
59+
/// match at the end of the input.
60+
pub has_anchored_end: bool,
5561
/// Whether this program contains a Unicode word boundary instruction.
5662
pub has_unicode_word_boundary: bool,
5763
/// A possibly empty machine for very quickly matching prefix literals.
@@ -91,6 +97,8 @@ impl Program {
9197
is_reverse: false,
9298
is_anchored_start: false,
9399
is_anchored_end: false,
100+
has_anchored_start: false,
101+
has_anchored_end: false,
94102
has_unicode_word_boundary: false,
95103
prefixes: LiteralSearcher::empty(),
96104
dfa_size_limit: 2 * (1<<20),

tests/regression.rs

+7
Original file line numberDiff line numberDiff line change
@@ -57,3 +57,10 @@ split!(split_on_word_boundary, r"\b", r"Should this (work?)",
5757
t!(" ("), t!("work"), t!("?)")]);
5858
matiter!(word_boundary_dfa, r"\b", "a b c",
5959
(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5));
60+
61+
// See: https://github.com/rust-lang-nursery/regex/issues/268
62+
matiter!(partial_anchor, u!(r"^a|b"), "ba", (0, 1));
63+
64+
// See: https://github.com/rust-lang-nursery/regex/issues/264
65+
mat!(ascii_boundary_no_capture, u!(r"(?-u)\B"), "\u{28f3e}", Some((0, 0)));
66+
mat!(ascii_boundary_capture, u!(r"(?-u)(\B)"), "\u{28f3e}", Some((0, 0)));

0 commit comments

Comments
 (0)