Skip to content

Commit 3f408e5

Browse files
committed
Merge pull request #217 from rust-lang-nursery/fix-capture-perf
Add known upper limit to capture search.
2 parents 4471212 + 49e8df5 commit 3f408e5

File tree

2 files changed

+40
-15
lines changed

2 files changed

+40
-15
lines changed

src/exec.rs

+21-15
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
use std::cell::RefCell;
1212
use std::collections::HashMap;
13+
use std::cmp;
1314
use std::sync::Arc;
1415

1516
use thread_local::CachedThreadLocal;
@@ -27,6 +28,7 @@ use re_bytes;
2728
use re_trait::{RegularExpression, Slot};
2829
use re_unicode;
2930
use set;
31+
use utf8::next_utf8;
3032

3133
/// Exec manages the execution of a regular expression.
3234
///
@@ -253,17 +255,7 @@ impl<'c> RegularExpression for ExecNoSyncStr<'c> {
253255
fn slots_len(&self) -> usize { self.0.slots_len() }
254256

255257
fn next_after_empty(&self, text: &str, i: usize) -> usize {
256-
let b = text.as_bytes()[i];
257-
let inc = if b <= 0x7F {
258-
1
259-
} else if b <= 0b110_11111 {
260-
2
261-
} else if b <= 0b1110_1111 {
262-
3
263-
} else {
264-
4
265-
};
266-
i + inc
258+
next_utf8(text.as_bytes(), i)
267259
}
268260

269261
#[inline(always)] // reduces constant overhead
@@ -433,15 +425,29 @@ impl<'c> RegularExpression for ExecNoSync<'c> {
433425
}
434426
match self.ro.match_type {
435427
MatchType::Literal(ty) => {
436-
self.exec_literals(ty, text, start).and_then(|(s, _)| {
437-
self.captures_nfa(MatchNfaType::Auto, slots, text, s)
428+
self.exec_literals(ty, text, start).and_then(|(s, e)| {
429+
// We need the +1 here to account for lookahead
430+
// operators.
431+
let e = if self.ro.nfa.uses_bytes() {
432+
cmp::min(e + 1, text.len())
433+
} else {
434+
cmp::min(next_utf8(text, e), text.len())
435+
};
436+
self.captures_nfa(MatchNfaType::Auto, slots, &text[..e], s)
438437
})
439438
}
440439
MatchType::Dfa => {
441440
match self.find_dfa_forward(text, start) {
442-
dfa::Result::Match((s, _)) => {
441+
dfa::Result::Match((s, e)) => {
442+
// We need the +1 here to account for lookahead
443+
// operators.
444+
let e = if self.ro.nfa.uses_bytes() {
445+
cmp::min(e + 1, text.len())
446+
} else {
447+
cmp::min(next_utf8(text, e), text.len())
448+
};
443449
self.captures_nfa(
444-
MatchNfaType::Auto, slots, text, s)
450+
MatchNfaType::Auto, slots, &text[..e], s)
445451
}
446452
dfa::Result::NoMatch => None,
447453
dfa::Result::Quit => {

src/utf8.rs

+19
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,25 @@ const TAG_TWO: u8 = 0b1100_0000;
1919
const TAG_THREE: u8 = 0b1110_0000;
2020
const TAG_FOUR: u8 = 0b1111_0000;
2121

22+
/// Computes the index immediately following the UTF-8 sequence that begins
/// at `text[i]`.
///
/// The sequence width is derived solely from the byte at position `i`, which
/// is treated as a lead byte; the bytes after it are never inspected. The
/// returned index can therefore be larger than `text.len()`, so callers that
/// slice with it must clamp it themselves.
///
/// If `i` is out of bounds for `text`, `i + 1` is returned.
pub fn next_utf8(text: &[u8], i: usize) -> usize {
    let width = match text.get(i) {
        // Nothing to decode at `i`: advance by a single position.
        None => 1,
        // 0x00..=0x7F: one-byte (ASCII) sequence.
        Some(&lead) if lead < 0x80 => 1,
        // Up to 0xDF: counted as a two-byte sequence.
        Some(&lead) if lead < 0xE0 => 2,
        // Up to 0xEF: counted as a three-byte sequence.
        Some(&lead) if lead < 0xF0 => 3,
        // Everything else is counted as a four-byte sequence.
        Some(_) => 4,
    };
    i + width
}
40+
2241
/// Encode the given Unicode character to `dst` as a single UTF-8 sequence.
2342
///
2443
/// If `dst` is not long enough, then `None` is returned. Otherwise, the number

0 commit comments

Comments
 (0)