Skip to content

Add support for C++ std::regex to benchmarks #459

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions bench/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ bench = false
re-pcre1 = ["libpcre-sys"]
re-pcre2 = []
re-onig = ["onig"]
re-stdcpp = []
libcxx = []
re-boost = []
re-re2 = []
re-dphobos = []
re-dphobos-dmd = ["re-dphobos"]
Expand Down
26 changes: 26 additions & 0 deletions bench/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,32 @@ fn main() {
if env::var("CARGO_FEATURE_RE_PCRE2").is_ok() {
pkg_config::probe_library("libpcre2-8").unwrap();
}
if env::var("CARGO_FEATURE_RE_STDCPP").is_ok() {
// stdcpp is a C++ library, so we need to compile our shim layer.
if !env::var("CARGO_FEATURE_LIBCXX").is_ok() {
// use default stdlib
cc::Build::new()
.cpp(true)
.file("src/ffi/stdcpp.cpp")
.compile("libcstdcpp.a");
} else {
// use libc++ stdlib
cc::Build::new()
.cpp(true)
.file("src/ffi/stdcpp.cpp")
.cpp_link_stdlib("c++")
.compile("libcstdcpp.a");
}
}
if env::var("CARGO_FEATURE_RE_BOOST").is_ok() {
// Boost.Regex is a C++ library, so we need to compile our shim layer.
cc::Build::new()
.cpp(true)
.file("src/ffi/stdcpp.cpp")
.define("USE_BOOST", None)
.compile("libcboost.a");
println!("cargo:rustc-link-lib=boost_regex");
}
if env::var("CARGO_FEATURE_RE_RE2").is_ok() {
// RE2 is a C++ library, so we need to compile our shim layer.
cc::Build::new()
Expand Down
2 changes: 1 addition & 1 deletion bench/compile
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@

exec cargo build \
--release \
--features 're-re2 re-onig re-pcre1 re-pcre2 re-rust re-rust-bytes re-tcl re-dphobos-dmd re-dphobos-ldc' \
--features 're-stdcpp re-boost re-re2 re-onig re-pcre1 re-pcre2 re-rust re-rust-bytes re-tcl re-dphobos-dmd re-dphobos-ldc' \
"$@"
11 changes: 10 additions & 1 deletion bench/run
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/bin/bash

usage() {
echo "Usage: $(basename $0) [dphobos-dmd | dphobos-ldc | dphobos-dmd-ct | dphobos-ldc-ct | rust | rust-bytes | pcre1 | pcre2 | re2 | onig | tcl ]" >&2
echo "Usage: $(basename $0) [dphobos-dmd | dphobos-ldc | dphobos-dmd-ct | dphobos-ldc-ct | rust | rust-bytes | pcre1 | pcre2 | stdcpp | stdcpp-libcxx | boost | re2 | onig | tcl ]" >&2
exit 1
}

Expand Down Expand Up @@ -30,6 +30,15 @@ case $which in
rust-bytes)
exec cargo bench --bench bench --features re-rust-bytes "$@"
;;
stdcpp)
exec cargo bench --bench bench --features re-stdcpp "$@"
;;
stdcpp-libcxx)
exec cargo bench --bench bench --features 're-stdcpp libcxx' "$@"
;;
boost)
exec cargo bench --bench bench --features re-boost "$@"
;;
re2)
exec cargo bench --bench bench --features re-re2 "$@"
;;
Expand Down
10 changes: 9 additions & 1 deletion bench/src/bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,17 @@ extern crate regex;
extern crate regex_syntax;
extern crate test;


#[cfg(feature = "re-onig")]
pub use ffi::onig::Regex;
#[cfg(feature = "re-pcre1")]
pub use ffi::pcre1::Regex;
#[cfg(feature = "re-pcre2")]
pub use ffi::pcre2::Regex;
#[cfg(any(
feature = "re-stdcpp",
feature = "re-boost",
))]
pub use ffi::stdcpp::Regex;
#[cfg(feature = "re-re2")]
pub use ffi::re2::Regex;
#[cfg(feature = "re-dphobos")]
Expand Down Expand Up @@ -90,6 +94,8 @@ macro_rules! text {
feature = "re-onig",
feature = "re-pcre1",
feature = "re-pcre2",
feature = "re-stdcpp",
feature = "re-boost",
feature = "re-re2",
feature = "re-dphobos",
feature = "re-rust",
Expand All @@ -107,6 +113,8 @@ type Text = Vec<u8>;
feature = "re-onig",
feature = "re-pcre1",
feature = "re-pcre2",
feature = "re-stdcpp",
feature = "re-boost",
feature = "re-re2",
feature = "re-dphobos",
feature = "re-rust",
Expand Down
5 changes: 5 additions & 0 deletions bench/src/ffi/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@ pub mod onig;
pub mod pcre1;
#[cfg(feature = "re-pcre2")]
pub mod pcre2;
#[cfg(any(
feature = "re-stdcpp",
feature = "re-boost",
))]
pub mod stdcpp;
#[cfg(feature = "re-re2")]
pub mod re2;
#[cfg(feature = "re-tcl")]
Expand Down
53 changes: 53 additions & 0 deletions bench/src/ffi/stdcpp.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#ifdef USE_BOOST
#include <boost/regex.hpp>
#else
#include <regex>
#endif

extern "C" {

// Pick the regex implementation at compile time: Boost.Regex when the shim is
// built with USE_BOOST defined, otherwise the standard library's <regex>.
// Everything below refers to the chosen library only through `regex_ns`.
#ifdef USE_BOOST
namespace regex_ns = boost;
#else
namespace regex_ns = std;
#endif

// Opaque handle type exposed through the C API. The functions below cast it
// to and from `regex_ns::regex*`.
typedef void stdcpp_regexp;

// A borrowed, length-delimited byte string passed by value across the C API.
// `text` points at `len` bytes; the functions below never rely on a NUL
// terminator and never take ownership of the buffer.
typedef struct stdcpp_string {
const char *text;
int len;
} stdcpp_string;

// Compiles the pattern `pat` (`pat.len` bytes starting at `pat.text`) with the
// `optimize` flag and returns the heap-allocated regex as an opaque handle.
// Release the handle with stdcpp_regexp_free.
//
// Returns a null pointer if the regex could not be constructed. The regex
// constructor reports invalid patterns by throwing, and letting a C++
// exception unwind out of an extern "C" function into a non-C++ caller is
// undefined behaviour, so every exception is caught here and turned into a
// null result that the caller must check.
stdcpp_regexp* stdcpp_regexp_new(stdcpp_string pat) {
    try {
        return reinterpret_cast<stdcpp_regexp*>(
            new regex_ns::regex(pat.text, pat.len, regex_ns::regex::optimize));
    } catch (...) {
        return nullptr;
    }
}

// Destroys a regex previously returned by stdcpp_regexp_new. The handle must
// not be used again afterwards.
void stdcpp_regexp_free(stdcpp_regexp *re) {
    regex_ns::regex *compiled = reinterpret_cast<regex_ns::regex*>(re);
    delete compiled;
}

// Reports whether `re` matches anywhere inside the byte range
// [text.text + startpos, text.text + endpos).
//
// No bounds checking is performed: the two offsets are added to `text.text`
// as given, so the caller is responsible for passing offsets that lie inside
// the buffer with startpos <= endpos.
bool stdcpp_regexp_match(stdcpp_regexp *re, stdcpp_string text,
                         int startpos, int endpos) {
    // Use the compiled regex in place. regex_search only needs a const
    // reference, so copying the regex object on every call is pure overhead.
    const regex_ns::regex &cpp_re =
        *reinterpret_cast<const regex_ns::regex*>(re);
    // When the search does not begin at the start of the buffer, tell the
    // engine that the preceding character exists. Otherwise `^` and `\b`
    // would treat `startpos` as the beginning of the text.
    const regex_ns::regex_constants::match_flag_type flags =
        startpos > 0 ? regex_ns::regex_constants::match_prev_avail
                     : regex_ns::regex_constants::match_default;
    return regex_ns::regex_search(text.text + startpos, text.text + endpos,
                                  cpp_re, flags);
}

// Searches the byte range [text.text + startpos, text.text + endpos) for the
// first match of `re`.
//
// On success returns true and stores the match bounds in `*match_start` and
// `*match_end` as byte offsets measured from `text.text` (not from
// `startpos`). On failure returns false and leaves both outputs untouched.
//
// No bounds checking is performed: the caller is responsible for passing
// offsets that lie inside the buffer with startpos <= endpos.
bool stdcpp_regexp_find(stdcpp_regexp *re, stdcpp_string text,
                        int startpos, int endpos,
                        int *match_start, int *match_end) {
    // Use the compiled regex in place. regex_search only needs a const
    // reference, so copying the regex object on every call is pure overhead.
    const regex_ns::regex &cpp_re =
        *reinterpret_cast<const regex_ns::regex*>(re);
    // When resuming a scan in the middle of the buffer, tell the engine that
    // the preceding character exists. Otherwise `^` and `\b` would match at
    // every resume offset as if it were the beginning of the text.
    const regex_ns::regex_constants::match_flag_type flags =
        startpos > 0 ? regex_ns::regex_constants::match_prev_avail
                     : regex_ns::regex_constants::match_default;
    regex_ns::cmatch result;
    const bool matched = regex_ns::regex_search(
        text.text + startpos, text.text + endpos, result, cpp_re, flags);
    if (matched) {
        *match_start = static_cast<int>(result[0].first - text.text);
        *match_end = static_cast<int>(result[0].second - text.text);
    }
    return matched;
}
}
163 changes: 163 additions & 0 deletions bench/src/ffi/stdcpp.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

#![allow(non_camel_case_types)]

use libc::{c_uchar, c_int, c_void};

/// Regex wraps a regular expression compiled by the C++ shim in stdcpp.cpp
/// (`std::regex`, or `boost::regex` when the shim is built with `USE_BOOST`).
///
/// It cannot be used safely from multiple threads simultaneously.
pub struct Regex {
// Opaque handle produced by `stdcpp_regexp_new`; released by the `Drop` impl
// through `stdcpp_regexp_free`.
re: *mut stdcpp_regexp,
}

// NOTE(review): this asserts that moving a `Regex` to another thread is
// sound. That presumably holds because the handle is an owned heap object
// with no visible thread affinity -- confirm against the C++ regex library
// the shim is built with. `Sync` is deliberately not implemented.
unsafe impl Send for Regex {}

impl Drop for Regex {
    /// Releases the C++ regex object owned by this wrapper.
    fn drop(&mut self) {
        let handle = self.re;
        // The handle is owned exclusively by this value, so it is handed back
        // to the shim exactly once, here.
        unsafe {
            stdcpp_regexp_free(handle);
        }
    }
}

/// Error type returned by `Regex::new`.
///
/// It is an opaque unit wrapper and carries no detail about what went wrong.
#[derive(Debug)]
pub struct Error(());

impl Regex {
pub fn new(pattern: &str) -> Result<Regex, Error> {
unsafe { Ok(Regex { re: stdcpp_regexp_new(pattern.into()) }) }
}

pub fn is_match(&self, text: &str) -> bool {
unsafe {
stdcpp_regexp_match(self.re, text.into(), 0, text.len() as c_int)
}
}

pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> FindMatches<'r, 't> {
FindMatches {
re: self,
text: text,
last_end: 0,
last_match: None,
}
}

fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> {
let (mut s, mut e): (c_int, c_int) = (0, 0);
let matched = unsafe {
stdcpp_regexp_find(
self.re,
text.into(),
start as c_int,
text.len() as c_int,
&mut s,
&mut e,
)
};
if matched {
Some((s as usize, e as usize))
} else {
None
}
}
}

/// Iterator over the `(start, end)` byte offsets of successive matches of a
/// `Regex` in a piece of text. Created by `Regex::find_iter`.
pub struct FindMatches<'r, 't> {
// The regex being searched with.
re: &'r Regex,
// The text being searched.
text: &'t str,
// Byte offset at which the next search starts.
last_end: usize,
// Bookkeeping from the previously yielded match, used by `next` to reject an
// empty match that directly follows a match.
last_match: Option<usize>,
}

// This implementation mirrors the one Rust's regex crate uses, so that empty
// matches are counted the same way for both engines.
impl<'r, 't> Iterator for FindMatches<'r, 't> {
    type Item = (usize, usize);

    /// Returns the byte offsets `(start, end)` of the next match, or `None`
    /// once the text is exhausted.
    ///
    /// An empty match is skipped when it sits exactly at the end of the
    /// previously yielded match; every other empty match is reported.
    ///
    /// # Panics
    ///
    /// Panics if the engine reports a match starting before the offset the
    /// search was started from.
    fn next(&mut self) -> Option<(usize, usize)> {
        // Smallest offset at which a match following an empty match at `i`
        // can start: `i` plus the width of the UTF-8 sequence whose lead byte
        // is at `i`, or one past the end of the text when `i` is at the end.
        fn next_after_empty(text: &str, i: usize) -> usize {
            let b = match text.as_bytes().get(i) {
                None => return text.len() + 1,
                Some(&b) => b,
            };
            let inc = if b <= 0x7F {
                1
            } else if b <= 0b110_11111 {
                2
            } else if b <= 0b1110_1111 {
                3
            } else {
                4
            };
            i + inc
        }

        loop {
            if self.last_end > self.text.len() {
                return None;
            }
            let (s, e) = match self.re.find_at(self.text, self.last_end) {
                None => return None,
                Some(m) => m,
            };
            assert!(s >= self.last_end);
            if s == e {
                // This is an empty match. To ensure we make progress, start
                // the next search at the smallest possible starting position
                // of the next match following this one.
                self.last_end = next_after_empty(self.text, e);
                // Don't accept empty matches immediately following a match.
                // Just move on to the next match.
                if Some(e) == self.last_match {
                    continue;
                }
            } else {
                self.last_end = e;
            }
            // Record where this match ended, not where the next search
            // starts. After an empty match `last_end` has already been
            // advanced, and remembering that advanced offset would wrongly
            // suppress a legitimate empty match at the next position.
            self.last_match = Some(e);
            return Some((s, e));
        }
    }
}

// stdcpp FFI is below. Note that this uses a hand-rolled C API that is defined
// in stdcpp.cpp.

// Opaque handle to the C++ regex object; mirrors `typedef void stdcpp_regexp`
// in stdcpp.cpp and is only ever used behind a raw pointer.
type stdcpp_regexp = c_void;

/// A borrowed, length-delimited byte string passed by value to the C++ shim.
///
/// The fields mirror the `stdcpp_string` struct declared in stdcpp.cpp. The
/// struct does not own the bytes `text` points at.
#[repr(C)]
struct stdcpp_string {
    text: *const c_uchar,
    len: c_int,
}

impl<'a> From<&'a str> for stdcpp_string {
    /// Borrows `s` as a pointer/length pair for the shim.
    ///
    /// # Panics
    ///
    /// Panics if `s` is longer than `c_int::max_value()` bytes. The length
    /// field is a C `int`, so a longer string would otherwise be silently
    /// truncated and the C++ side would be handed a wrong or negative length.
    fn from(s: &'a str) -> stdcpp_string {
        assert!(
            s.len() <= c_int::max_value() as usize,
            "string of {} bytes is too long for the stdcpp FFI",
            s.len()
        );
        stdcpp_string { text: s.as_ptr(), len: s.len() as c_int }
    }
}

// Declarations of the C API implemented in stdcpp.cpp. All offsets are byte
// offsets relative to the start of `text`.
extern {
// Compiles `pat` and returns an owned handle to the resulting regex.
fn stdcpp_regexp_new(pat: stdcpp_string) -> *mut stdcpp_regexp;
// Destroys a handle returned by `stdcpp_regexp_new`.
fn stdcpp_regexp_free(re: *mut stdcpp_regexp);
// Returns true if `re` matches anywhere within `text[startpos..endpos]`.
fn stdcpp_regexp_match(
re: *mut stdcpp_regexp,
text: stdcpp_string,
startpos: c_int,
endpos: c_int,
) -> bool;
// Searches `text[startpos..endpos]` for the first match. On success returns
// true and writes the match bounds to `match_start` / `match_end`; on failure
// returns false and leaves both untouched.
fn stdcpp_regexp_find(
re: *mut stdcpp_regexp,
text: stdcpp_string,
startpos: c_int,
endpos: c_int,
match_start: *mut c_int,
match_end: *mut c_int,
) -> bool;
}
19 changes: 18 additions & 1 deletion bench/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ Since this tool includes compilation of the <pattern>, sufficiently large
haystacks should be used to amortize the cost of compilation. (e.g., >1MB.)

Usage:
regex-run-one [options] [onig | pcre1 | pcre2 | re2 | rust | rust-bytes | tcl] <file> <pattern>
regex-run-one [options] [onig | pcre1 | pcre2 | stdcpp | re2 | rust | rust-bytes | tcl] <file> <pattern>
regex-run-one [options] (-h | --help)

Options:
Expand All @@ -59,6 +59,7 @@ struct Args {
cmd_onig: bool,
cmd_pcre1: bool,
cmd_pcre2: bool,
cmd_stdcpp: bool,
cmd_re2: bool,
cmd_rust: bool,
cmd_rust_bytes: bool,
Expand Down Expand Up @@ -87,6 +88,8 @@ impl Args {
count_pcre1(pat, haystack)
} else if self.cmd_pcre2 {
count_pcre2(pat, haystack)
} else if self.cmd_stdcpp {
count_stdcpp(pat, haystack)
} else if self.cmd_re2 {
count_re2(pat, haystack)
} else if self.cmd_rust {
Expand Down Expand Up @@ -132,6 +135,20 @@ fn count_pcre2(pat: &str, haystack: &str) -> usize {
Regex::new(pat).unwrap().find_iter(haystack).count()
}

// Fallback used when neither C++ regex backend is compiled in.
#[cfg(not(any(
    feature = "re-stdcpp",
    feature = "re-boost",
)))]
nada!("re-stdcpp", count_stdcpp);
/// Counts the matches of `pat` in `haystack` using the C++ regex shim.
///
/// Panics if the pattern fails to compile.
#[cfg(any(
    feature = "re-stdcpp",
    feature = "re-boost",
))]
fn count_stdcpp(pat: &str, haystack: &str) -> usize {
    use ffi::stdcpp::Regex;
    let re = Regex::new(pat).unwrap();
    let matches = re.find_iter(haystack);
    matches.count()
}

nada!("re-re2", count_re2);
#[cfg(feature = "re-re2")]
fn count_re2(pat: &str, haystack: &str) -> usize {
Expand Down
Loading