Skip to content

Commit f9cd75c

Browse files
mkrupcale authored and BurntSushi committed
bench: add C++'s std::regex
This commit adds a new `re-stdcpp` feature to the benchmark runner that enables benchmarking C++'s standard library regex implementation.
1 parent 17764ff commit f9cd75c

File tree

11 files changed

+265
-3
lines changed

11 files changed

+265
-3
lines changed

bench/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ bench = false
4646
re-pcre1 = ["libpcre-sys"]
4747
re-pcre2 = []
4848
re-onig = ["onig"]
49+
re-stdcpp = []
4950
re-re2 = []
5051
re-dphobos = []
5152
re-dphobos-dmd = ["re-dphobos"]

bench/build.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,13 @@ fn main() {
1818
if env::var("CARGO_FEATURE_RE_PCRE2").is_ok() {
1919
pkg_config::probe_library("libpcre2-8").unwrap();
2020
}
21+
if env::var("CARGO_FEATURE_RE_STDCPP").is_ok() {
22+
// stdcpp is a C++ library, so we need to compile our shim layer.
23+
cc::Build::new()
24+
.cpp(true)
25+
.file("src/ffi/stdcpp.cpp")
26+
.compile("libcstdcpp.a");
27+
}
2128
if env::var("CARGO_FEATURE_RE_RE2").is_ok() {
2229
// RE2 is a C++ library, so we need to compile our shim layer.
2330
cc::Build::new()

bench/compile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,5 @@
22

33
exec cargo build \
44
--release \
5-
--features 're-re2 re-onig re-pcre1 re-pcre2 re-rust re-rust-bytes re-tcl re-dphobos-dmd re-dphobos-ldc' \
5+
--features 're-stdcpp re-re2 re-onig re-pcre1 re-pcre2 re-rust re-rust-bytes re-tcl re-dphobos-dmd re-dphobos-ldc' \
66
"$@"

bench/run

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#!/bin/bash
22

33
usage() {
4-
echo "Usage: $(basename $0) [dphobos-dmd | dphobos-ldc | dphobos-dmd-ct | dphobos-ldc-ct | rust | rust-bytes | pcre1 | pcre2 | re2 | onig | tcl ]" >&2
4+
echo "Usage: $(basename $0) [dphobos-dmd | dphobos-ldc | dphobos-dmd-ct | dphobos-ldc-ct | rust | rust-bytes | pcre1 | pcre2 | stdcpp | re2 | onig | tcl ]" >&2
55
exit 1
66
}
77

@@ -30,6 +30,9 @@ case $which in
3030
rust-bytes)
3131
exec cargo bench --bench bench --features re-rust-bytes "$@"
3232
;;
33+
stdcpp)
34+
exec cargo bench --bench bench --features re-stdcpp "$@"
35+
;;
3336
re2)
3437
exec cargo bench --bench bench --features re-re2 "$@"
3538
;;

bench/src/bench.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ pub use ffi::onig::Regex;
3535
pub use ffi::pcre1::Regex;
3636
#[cfg(feature = "re-pcre2")]
3737
pub use ffi::pcre2::Regex;
38+
#[cfg(feature = "re-stdcpp")]
39+
pub use ffi::stdcpp::Regex;
3840
#[cfg(feature = "re-re2")]
3941
pub use ffi::re2::Regex;
4042
#[cfg(feature = "re-dphobos")]
@@ -90,6 +92,7 @@ macro_rules! text {
9092
feature = "re-onig",
9193
feature = "re-pcre1",
9294
feature = "re-pcre2",
95+
feature = "re-stdcpp",
9396
feature = "re-re2",
9497
feature = "re-dphobos",
9598
feature = "re-rust",
@@ -107,6 +110,7 @@ type Text = Vec<u8>;
107110
feature = "re-onig",
108111
feature = "re-pcre1",
109112
feature = "re-pcre2",
113+
feature = "re-stdcpp",
110114
feature = "re-re2",
111115
feature = "re-dphobos",
112116
feature = "re-rust",

bench/src/ffi/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ pub mod onig;
2020
pub mod pcre1;
2121
#[cfg(feature = "re-pcre2")]
2222
pub mod pcre2;
23+
#[cfg(feature = "re-stdcpp")]
24+
pub mod stdcpp;
2325
#[cfg(feature = "re-re2")]
2426
pub mod re2;
2527
#[cfg(feature = "re-tcl")]

bench/src/ffi/stdcpp.cpp

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
#include <regex>
2+
3+
extern "C" {
    // Opaque handle handed to the Rust side. A non-null handle always points
    // at a heap-allocated std::regex created by stdcpp_regexp_new.
    typedef void stdcpp_regexp;

    // Borrowed byte string: `len` bytes starting at `text`. It is not
    // required to be NUL-terminated.
    typedef struct stdcpp_string {
        const char *text;
        int len;
    } stdcpp_string;

    // Picks the search flags for a search that resumes at `startpos`.
    //
    // When the search does not start at the beginning of the haystack, the
    // character before the start is valid and must be visible to `^` and
    // `\b`; match_prev_avail tells std::regex exactly that. Without it every
    // resumed search would treat its start as the beginning of the input.
    static std::regex_constants::match_flag_type
    stdcpp_search_flags(int startpos) {
        return startpos > 0 ? std::regex_constants::match_prev_avail
                            : std::regex_constants::match_default;
    }

    // Returns true when [startpos, endpos) is a well-formed range in `text`.
    static bool stdcpp_valid_span(stdcpp_string text, int startpos,
                                  int endpos) {
        return text.text != nullptr
            && 0 <= startpos
            && startpos <= endpos
            && endpos <= text.len;
    }

    // Compiles `pat` and returns an owning handle, to be released with
    // stdcpp_regexp_free.
    //
    // Returns nullptr when the pattern is rejected or allocation fails. A
    // C++ exception must never unwind through an extern "C" frame into the
    // caller, so everything is caught here.
    stdcpp_regexp* stdcpp_regexp_new(stdcpp_string pat) {
        if (pat.text == nullptr || pat.len < 0) {
            return nullptr;
        }
        try {
            // ECMAScript is the grammar std::regex falls back to when none is
            // named; spelling it out keeps the flag set well-formed.
            return static_cast<stdcpp_regexp*>(
                new std::regex(pat.text,
                               static_cast<std::size_t>(pat.len),
                               std::regex::ECMAScript | std::regex::optimize));
        } catch (...) {
            return nullptr;
        }
    }

    // Releases a handle returned by stdcpp_regexp_new. A null handle is
    // accepted and ignored.
    void stdcpp_regexp_free(stdcpp_regexp *re) {
        delete static_cast<std::regex*>(re);
    }

    // Reports whether the regex matches anywhere in
    // text[startpos, endpos). Returns false for a null handle, an invalid
    // range, or when the search itself fails with an exception.
    bool stdcpp_regexp_match(stdcpp_regexp *re, stdcpp_string text,
                             int startpos, int endpos) {
        if (re == nullptr || !stdcpp_valid_span(text, startpos, endpos)) {
            return false;
        }
        // Borrow the compiled regex; copying it on every call is wasted work.
        const std::regex &cpp_re = *static_cast<const std::regex*>(re);
        try {
            return std::regex_search(text.text + startpos,
                                     text.text + endpos,
                                     cpp_re,
                                     stdcpp_search_flags(startpos));
        } catch (...) {
            return false;
        }
    }

    // Finds the leftmost match in text[startpos, endpos).
    //
    // On success, writes the match bounds to *match_start / *match_end as
    // offsets from the start of `text` (not from `startpos`) and returns
    // true. On failure the out-parameters are left untouched and false is
    // returned. A null handle, null out-parameter, invalid range, or an
    // exception raised by the search all count as failure.
    bool stdcpp_regexp_find(stdcpp_regexp *re, stdcpp_string text,
                            int startpos, int endpos,
                            int *match_start, int *match_end) {
        if (re == nullptr || match_start == nullptr || match_end == nullptr
            || !stdcpp_valid_span(text, startpos, endpos)) {
            return false;
        }
        const std::regex &cpp_re = *static_cast<const std::regex*>(re);
        std::cmatch result;
        try {
            if (!std::regex_search(text.text + startpos,
                                   text.text + endpos,
                                   result,
                                   cpp_re,
                                   stdcpp_search_flags(startpos))) {
                return false;
            }
        } catch (...) {
            return false;
        }
        *match_start = static_cast<int>(result[0].first - text.text);
        *match_end = static_cast<int>(result[0].second - text.text);
        return true;
    }
}

bench/src/ffi/stdcpp.rs

Lines changed: 163 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,163 @@
1+
// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
2+
// file at the top-level directory of this distribution and at
3+
// http://rust-lang.org/COPYRIGHT.
4+
//
5+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
// option. This file may not be copied, modified, or distributed
9+
// except according to those terms.
10+
11+
#![allow(non_camel_case_types)]
12+
13+
use libc::{c_uchar, c_int, c_void};
14+
15+
/// Regex wraps a std::regex regular expression.
16+
///
17+
/// It cannot be used safely from multiple threads simultaneously.
18+
pub struct Regex {
19+
re: *mut stdcpp_regexp,
20+
}
21+
22+
unsafe impl Send for Regex {}
23+
24+
impl Drop for Regex {
25+
fn drop(&mut self) {
26+
unsafe { stdcpp_regexp_free(self.re); }
27+
}
28+
}
29+
30+
#[derive(Debug)]
31+
pub struct Error(());
32+
33+
impl Regex {
34+
pub fn new(pattern: &str) -> Result<Regex, Error> {
35+
unsafe { Ok(Regex { re: stdcpp_regexp_new(pattern.into()) }) }
36+
}
37+
38+
pub fn is_match(&self, text: &str) -> bool {
39+
unsafe {
40+
stdcpp_regexp_match(self.re, text.into(), 0, text.len() as c_int)
41+
}
42+
}
43+
44+
pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> FindMatches<'r, 't> {
45+
FindMatches {
46+
re: self,
47+
text: text,
48+
last_end: 0,
49+
last_match: None,
50+
}
51+
}
52+
53+
fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> {
54+
let (mut s, mut e): (c_int, c_int) = (0, 0);
55+
let matched = unsafe {
56+
stdcpp_regexp_find(
57+
self.re,
58+
text.into(),
59+
start as c_int,
60+
text.len() as c_int,
61+
&mut s,
62+
&mut e,
63+
)
64+
};
65+
if matched {
66+
Some((s as usize, e as usize))
67+
} else {
68+
None
69+
}
70+
}
71+
}
72+
73+
pub struct FindMatches<'r, 't> {
74+
re: &'r Regex,
75+
text: &'t str,
76+
last_end: usize,
77+
last_match: Option<usize>,
78+
}
79+
80+
// This implementation is identical to the one Rust uses, since both Rust's
81+
// regex engine and std::regex handle empty matches in the same way.
82+
impl<'r, 't> Iterator for FindMatches<'r, 't> {
83+
type Item = (usize, usize);
84+
85+
fn next(&mut self) -> Option<(usize, usize)> {
86+
fn next_after_empty(text: &str, i: usize) -> usize {
87+
let b = match text.as_bytes().get(i) {
88+
None => return text.len() + 1,
89+
Some(&b) => b,
90+
};
91+
let inc = if b <= 0x7F {
92+
1
93+
} else if b <= 0b110_11111 {
94+
2
95+
} else if b <= 0b1110_1111 {
96+
3
97+
} else {
98+
4
99+
};
100+
i + inc
101+
}
102+
103+
if self.last_end > self.text.len() {
104+
return None;
105+
}
106+
let (s, e) = match self.re.find_at(self.text, self.last_end) {
107+
None => return None,
108+
Some((s, e)) => (s, e),
109+
};
110+
assert!(s >= self.last_end);
111+
if s == e {
112+
// This is an empty match. To ensure we make progress, start
113+
// the next search at the smallest possible starting position
114+
// of the next match following this one.
115+
self.last_end = next_after_empty(&self.text, e);
116+
// Don't accept empty matches immediately following a match.
117+
// Just move on to the next match.
118+
if Some(e) == self.last_match {
119+
return self.next();
120+
}
121+
} else {
122+
self.last_end = e;
123+
}
124+
self.last_match = Some(self.last_end);
125+
Some((s, e))
126+
}
127+
}
128+
129+
// stdcpp FFI is below. Note that this uses a hand-rolled C API that is defined
130+
// in stdcpp.cpp.
131+
132+
type stdcpp_regexp = c_void;
133+
134+
#[repr(C)]
135+
struct stdcpp_string {
136+
text: *const c_uchar,
137+
len: c_int,
138+
}
139+
140+
impl<'a> From<&'a str> for stdcpp_string {
141+
fn from(s: &'a str) -> stdcpp_string {
142+
stdcpp_string { text: s.as_ptr(), len: s.len() as c_int }
143+
}
144+
}
145+
146+
extern {
147+
fn stdcpp_regexp_new(pat: stdcpp_string) -> *mut stdcpp_regexp;
148+
fn stdcpp_regexp_free(re: *mut stdcpp_regexp);
149+
fn stdcpp_regexp_match(
150+
re: *mut stdcpp_regexp,
151+
text: stdcpp_string,
152+
startpos: c_int,
153+
endpos: c_int,
154+
) -> bool;
155+
fn stdcpp_regexp_find(
156+
re: *mut stdcpp_regexp,
157+
text: stdcpp_string,
158+
startpos: c_int,
159+
endpos: c_int,
160+
match_start: *mut c_int,
161+
match_end: *mut c_int,
162+
) -> bool;
163+
}

bench/src/main.rs

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ Since this tool includes compilation of the <pattern>, sufficiently large
4545
haystacks should be used to amortize the cost of compilation. (e.g., >1MB.)
4646
4747
Usage:
48-
regex-run-one [options] [onig | pcre1 | pcre2 | re2 | rust | rust-bytes | tcl] <file> <pattern>
48+
regex-run-one [options] [onig | pcre1 | pcre2 | stdcpp | re2 | rust | rust-bytes | tcl] <file> <pattern>
4949
regex-run-one [options] (-h | --help)
5050
5151
Options:
@@ -59,6 +59,7 @@ struct Args {
5959
cmd_onig: bool,
6060
cmd_pcre1: bool,
6161
cmd_pcre2: bool,
62+
cmd_stdcpp: bool,
6263
cmd_re2: bool,
6364
cmd_rust: bool,
6465
cmd_rust_bytes: bool,
@@ -87,6 +88,8 @@ impl Args {
8788
count_pcre1(pat, haystack)
8889
} else if self.cmd_pcre2 {
8990
count_pcre2(pat, haystack)
91+
} else if self.cmd_stdcpp {
92+
count_stdcpp(pat, haystack)
9093
} else if self.cmd_re2 {
9194
count_re2(pat, haystack)
9295
} else if self.cmd_rust {
@@ -132,6 +135,13 @@ fn count_pcre2(pat: &str, haystack: &str) -> usize {
132135
Regex::new(pat).unwrap().find_iter(haystack).count()
133136
}
134137

138+
nada!("re-stdcpp", count_stdcpp);
139+
/// Counts the matches of `pat` in `haystack` using the C++ `std::regex`
/// backend. Panics if the pattern fails to compile.
#[cfg(feature = "re-stdcpp")]
fn count_stdcpp(pat: &str, haystack: &str) -> usize {
    use ffi::stdcpp::Regex;

    let re = Regex::new(pat).unwrap();
    let matches = re.find_iter(haystack);
    matches.count()
}
144+
135145
nada!("re-re2", count_re2);
136146
#[cfg(feature = "re-re2")]
137147
fn count_re2(pat: &str, haystack: &str) -> usize {

bench/src/misc.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ use {Regex, Text};
1919
#[cfg(not(feature = "re-onig"))]
2020
#[cfg(not(feature = "re-pcre1"))]
2121
#[cfg(not(feature = "re-pcre2"))]
22+
#[cfg(not(feature = "re-stdcpp"))]
2223
#[cfg(not(feature = "re-dphobos-dmd-ct"))]
2324
#[cfg(not(feature = "re-dphobos-ldc-ct"))]
2425
bench_match!(no_exponential, {
@@ -45,6 +46,7 @@ bench_match!(match_class_in_range, "[ac]", {
4546
});
4647

4748
#[cfg(not(feature = "re-rust-bytes"))]
49+
#[cfg(not(feature = "re-stdcpp"))]
4850
#[cfg(not(feature = "re-tcl"))]
4951
bench_match!(match_class_unicode, r"\p{L}", {
5052
format!("{}a", repeat("☃5☃5").take(20).collect::<String>())

0 commit comments

Comments
 (0)