Skip to content

Commit b7280ca

Browse files
committed
teddy: port teddy searcher to AVX2
This commit adds a copy of the Teddy searcher that works on AVX2. We don't attempt to share any code between the SSSE3 and AVX2 implementations just yet; instead, we copy & paste the SSSE3 version and tweak parts of it to work on 32 bytes at a time instead of 16. (Some parts were trickier than others. For example, @jneem figured out how to nearly compensate for the lack of a real 256-bit bytewise PALIGNR instruction, and we borrow that approach here.) Overall, AVX2 provides a nice bump in performance.
1 parent 35b73f5 commit b7280ca

File tree

7 files changed

+734
-12
lines changed

7 files changed

+734
-12
lines changed

build.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ fn main() {
2121
if env::var_os("CARGO_CFG_REGEX_DISABLE_AUTO_OPTIMIZATIONS").is_none() {
2222
if version.contains("nightly") {
2323
println!("cargo:rustc-cfg=regex_runtime_teddy_ssse3");
24+
println!("cargo:rustc-cfg=regex_runtime_teddy_avx2");
2425
}
2526
}
2627
}

src/literal/mod.rs

Lines changed: 41 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,10 @@ use memchr::{memchr, memchr2, memchr3};
1616
use syntax::hir::literal::{Literal, Literals};
1717

1818
use freqs::BYTE_FREQUENCIES;
19-
use self::teddy_ssse3::Teddy;
19+
use self::teddy_avx2::{Teddy as TeddyAVX2};
20+
use self::teddy_ssse3::{Teddy as TeddySSSE3};
2021

22+
mod teddy_avx2;
2123
mod teddy_ssse3;
2224

2325
/// A prefix extracted from a compiled regular expression.
@@ -47,7 +49,10 @@ enum Matcher {
4749
AC(FullAcAutomaton<Literal>),
4850
/// A simd accelerated multiple string matcher. Used only for a small
4951
/// number of small literals.
50-
Teddy128(Teddy),
52+
TeddySSSE3(TeddySSSE3),
53+
/// A simd accelerated multiple string matcher. Used only for a small
54+
/// number of small literals. This uses 256-bit vectors.
55+
TeddyAVX2(TeddyAVX2),
5156
}
5257

5358
impl LiteralSearcher {
@@ -98,7 +103,8 @@ impl LiteralSearcher {
98103
FreqyPacked(ref s) => s.find(haystack).map(|i| (i, i + s.len())),
99104
BoyerMoore(ref s) => s.find(haystack).map(|i| (i, i + s.len())),
100105
AC(ref aut) => aut.find(haystack).next().map(|m| (m.start, m.end)),
101-
Teddy128(ref ted) => ted.find(haystack).map(|m| (m.start, m.end)),
106+
TeddySSSE3(ref t) => t.find(haystack).map(|m| (m.start, m.end)),
107+
TeddyAVX2(ref t) => t.find(haystack).map(|m| (m.start, m.end)),
102108
}
103109
}
104110

@@ -136,8 +142,11 @@ impl LiteralSearcher {
136142
Matcher::FreqyPacked(ref s) => LiteralIter::Single(&s.pat),
137143
Matcher::BoyerMoore(ref s) => LiteralIter::Single(&s.pattern),
138144
Matcher::AC(ref ac) => LiteralIter::AC(ac.patterns()),
139-
Matcher::Teddy128(ref ted) => {
140-
LiteralIter::Teddy128(ted.patterns())
145+
Matcher::TeddySSSE3(ref ted) => {
146+
LiteralIter::TeddySSSE3(ted.patterns())
147+
}
148+
Matcher::TeddyAVX2(ref ted) => {
149+
LiteralIter::TeddyAVX2(ted.patterns())
141150
}
142151
}
143152
}
@@ -166,7 +175,8 @@ impl LiteralSearcher {
166175
FreqyPacked(_) => 1,
167176
BoyerMoore(_) => 1,
168177
AC(ref aut) => aut.len(),
169-
Teddy128(ref ted) => ted.len(),
178+
TeddySSSE3(ref ted) => ted.len(),
179+
TeddyAVX2(ref ted) => ted.len(),
170180
}
171181
}
172182

@@ -179,7 +189,8 @@ impl LiteralSearcher {
179189
FreqyPacked(ref single) => single.approximate_size(),
180190
BoyerMoore(ref single) => single.approximate_size(),
181191
AC(ref aut) => aut.heap_bytes(),
182-
Teddy128(ref ted) => ted.approximate_size(),
192+
TeddySSSE3(ref ted) => ted.approximate_size(),
193+
TeddyAVX2(ref ted) => ted.approximate_size(),
183194
}
184195
}
185196
}
@@ -220,7 +231,15 @@ impl Matcher {
220231
}
221232
}
222233
let is_aho_corasick_fast = sset.dense.len() == 1 && sset.all_ascii;
223-
if Teddy::available() && !is_aho_corasick_fast {
234+
if TeddyAVX2::available() && !is_aho_corasick_fast {
235+
const MAX_TEDDY_LITERALS: usize = 32;
236+
if lits.literals().len() <= MAX_TEDDY_LITERALS {
237+
if let Some(ted) = TeddyAVX2::new(lits) {
238+
return Matcher::TeddyAVX2(ted);
239+
}
240+
}
241+
}
242+
if TeddySSSE3::available() && !is_aho_corasick_fast {
224243
// Only try Teddy if Aho-Corasick can't use memchr on an ASCII
225244
// byte. Also, in its current form, Teddy doesn't scale well to
226245
// lots of literals.
@@ -232,8 +251,8 @@ impl Matcher {
232251
// negating the benefit of memchr.
233252
const MAX_TEDDY_LITERALS: usize = 32;
234253
if lits.literals().len() <= MAX_TEDDY_LITERALS {
235-
if let Some(ted) = Teddy::new(lits) {
236-
return Matcher::Teddy128(ted);
254+
if let Some(ted) = TeddySSSE3::new(lits) {
255+
return Matcher::TeddySSSE3(ted);
237256
}
238257
}
239258
// Fallthrough to ol' reliable Aho-Corasick...
@@ -248,7 +267,8 @@ pub enum LiteralIter<'a> {
248267
Bytes(&'a [u8]),
249268
Single(&'a [u8]),
250269
AC(&'a [Literal]),
251-
Teddy128(&'a [Vec<u8>]),
270+
TeddySSSE3(&'a [Vec<u8>]),
271+
TeddyAVX2(&'a [Vec<u8>]),
252272
}
253273

254274
impl<'a> Iterator for LiteralIter<'a> {
@@ -284,7 +304,16 @@ impl<'a> Iterator for LiteralIter<'a> {
284304
Some(&**next)
285305
}
286306
}
287-
LiteralIter::Teddy128(ref mut lits) => {
307+
LiteralIter::TeddySSSE3(ref mut lits) => {
308+
if lits.is_empty() {
309+
None
310+
} else {
311+
let next = &lits[0];
312+
*lits = &lits[1..];
313+
Some(&**next)
314+
}
315+
}
316+
LiteralIter::TeddyAVX2(ref mut lits) => {
288317
if lits.is_empty() {
289318
None
290319
} else {

src/literal/teddy_avx2/fallback.rs

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
use syntax::hir::literal::Literals;
2+
3+
/// Placeholder `Teddy` type defined in the `teddy_avx2` fallback file.
///
/// It wraps a private unit value, and the only constructor in this file,
/// `Teddy::new`, always returns `None`.
// NOTE(review): presumably compiled in place of the real AVX2 searcher when
// that implementation is not enabled -- confirm against the module's cfg wiring.
#[derive(Debug, Clone)]
4+
pub struct Teddy(());
5+
6+
/// Match record returned (wrapped in an `Option`) by `Teddy::find`.
///
/// The fallback `find` in this file never constructs one; it always
/// returns `None`.
#[derive(Debug, Clone)]
7+
pub struct Match {
8+
/// NOTE(review): presumably the index of the pattern that matched -- confirm.
pub pat: usize,
9+
/// First component of the `(start, end)` pair that the caller in
/// `src/literal/mod.rs` builds from a match.
/// NOTE(review): presumably a byte offset into the haystack -- confirm.
pub start: usize,
10+
/// Second component of the `(start, end)` pair that the caller in
/// `src/literal/mod.rs` builds from a match.
/// NOTE(review): presumably an exclusive end offset -- confirm.
pub end: usize,
11+
}
12+
13+
/// Fallback method set for `Teddy`: every method returns a fixed constant
/// and ignores its arguments.
impl Teddy {
14+
/// Always returns `false`. The matcher-selection code in
/// `src/literal/mod.rs` tests `available()` before it attempts `new`.
pub fn available() -> bool { false }
15+
/// Always returns `None`; the supplied literals are ignored.
pub fn new(_pats: &Literals) -> Option<Teddy> { None }
16+
/// Always returns an empty slice.
pub fn patterns(&self) -> &[Vec<u8>] { &[] }
17+
/// Always returns `0`.
pub fn len(&self) -> usize { 0 }
18+
/// Always returns `0`.
pub fn approximate_size(&self) -> usize { 0 }
19+
/// Always returns `None`; the haystack is ignored.
pub fn find(&self, _haystack: &[u8]) -> Option<Match> { None }
20+
}

0 commit comments

Comments
 (0)