Skip to content

Use variable width integer encoding for compression #229

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
May 7, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions bench/src/bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ pub use ffi::tcl::Regex;
#[cfg(not(feature = "re-rust-bytes"))]
#[cfg(not(feature = "re-rust-plugin"))]
macro_rules! regex {
($re:expr) => { ::Regex::new($re).unwrap() }
($re:expr) => { ::Regex::new(&$re.to_owned()).unwrap() }
}

#[cfg(feature = "re-rust-bytes")]
Expand All @@ -72,7 +72,7 @@ macro_rules! regex {
// Always enable the Unicode flag for byte based regexes.
// Really, this should have been enabled by default. *sigh*
use regex::bytes::RegexBuilder;
RegexBuilder::new($re).unicode(true).compile().unwrap()
RegexBuilder::new(&$re.to_owned()).unicode(true).compile().unwrap()
}}
}

Expand Down
7 changes: 2 additions & 5 deletions bench/src/misc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,19 +16,16 @@ use test::Bencher;

use {Regex, Text};

/*
#[cfg(not(feature = "re-onig"))]
#[cfg(not(feature = "re-pcre1"))]
#[cfg(not(feature = "re-pcre2"))]
#[cfg(not(feature = "re-rust-plugin"))]
bench_match!(no_exponential, {
let re = format!(
format!(
"{}{}",
repeat("a?").take(100).collect::<String>(),
repeat("a").take(100).collect::<String>());
regex!(&re)
repeat("a").take(100).collect::<String>())
}, repeat("a").take(100).collect());
*/

bench_match!(literal, r"y", {
format!("{}y", repeat("x").take(50).collect::<String>())
Expand Down
4 changes: 4 additions & 0 deletions bench/src/sherlock.rs
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,10 @@ sherlock!(words, r"\w+", 109222); // hmm, why does RE2 diverge here?
// optimizations.
sherlock!(before_holmes, r"\w+\s+Holmes", 319);

// Find complete words before Holmes. Both of the `\w`s defeat any prefix
// and suffix optimizations.
sherlock!(before_after_holmes, r"\w+\s+Holmes\s+\w+", 137);

// Find Holmes co-occurring with Watson in a particular window of characters.
// This uses Aho-Corasick for the Holmes|Watson prefix, but the lazy DFA for
// the rest.
Expand Down
134 changes: 91 additions & 43 deletions src/dfa.rs
Original file line number Diff line number Diff line change
Expand Up @@ -275,25 +275,12 @@ struct State{
/// `u32` here for the DFA to save on space.
type InstPtr = u32;

// Used to construct new states.
/// Adds ip to data using delta encoding with respect to prev.
///
/// After completion, `data` will contain `ip` and `prev` will be set to `ip`.
fn push_inst_ptr(data: &mut Vec<u8>, prev: &mut InstPtr, ip: InstPtr) {
let delta = (ip as i32) - (*prev as i32);
if delta.abs() <= 127 {
data.push(delta as u8);
*prev = ip;
return;
}
let delta = delta as u32;
// Write 4 bytes in little-endian format.
let a = (delta & (0xFF << 0 * 8)) >> 0 * 8;
let b = (delta & (0xFF << 1 * 8)) >> 1 * 8;
let c = (delta & (0xFF << 2 * 8)) >> 2 * 8;
let d = (delta & (0xFF << 3 * 8)) >> 3 * 8;
data.push(128);
data.push(a as u8);
data.push(b as u8);
data.push(c as u8);
data.push(d as u8);
write_vari32(data, delta);
*prev = ip;
}

Expand All @@ -306,31 +293,20 @@ impl <'a>Iterator for InstPtrs<'a> {
type Item = usize;

fn next(&mut self) -> Option<usize> {
let x = match self.data.get(0){
Some(&x) => x,
None => return None,
};
let delta = if x == 128 {
//Read 4 bytes in little-endian format.
let a = self.data[1] as u32;
let b = self.data[2] as u32;
let c = self.data[3] as u32;
let d = self.data[4] as u32;
self.data = &self.data[5..];
(a << 0 * 8 | b << 1 * 8 | c << 2 * 8 | d << 3 * 8) as i32 as isize
} else {
self.data = &self.data[1..];
x as i8 as isize
};
let base = self.base as isize + delta;
if self.data.is_empty() {
return None;
}
let (delta, nread) = read_vari32(self.data);
let base = self.base as i32 + delta;
debug_assert!(base >= 0);
debug_assert!(nread > 0);
self.data = &self.data[nread..];
self.base = base as usize;
Some(self.base)
}
}

impl State {

fn flags(&self) -> StateFlags {
StateFlags(self.data[0])
}
Expand Down Expand Up @@ -1566,14 +1542,15 @@ impl<'a> Fsm<'a> {
fn approximate_size(&self) -> usize {
use std::mem::size_of as size;
// Estimate that there are about 16 instructions per state consuming
// 64 = 16 * 4 bytes of space.
// 20 = 4 + (15 * 1) bytes of space (1 byte because of delta encoding).
const STATE_HEAP: usize = 20 + 1; // one extra byte for flags
let compiled =
(self.cache.compiled.len() * (size::<State>() + 64))
(self.cache.compiled.len() * (size::<State>() + STATE_HEAP))
+ (self.cache.compiled.len() * size::<StatePtr>());
let states =
self.cache.states.len()
* (size::<State>()
+ 64
+ STATE_HEAP
+ (self.num_byte_classes() * size::<StatePtr>()));
let start_states = self.cache.start_states.len() * size::<StatePtr>();
self.prog.approximate_size() + compiled + states + start_states
Expand Down Expand Up @@ -1802,11 +1779,56 @@ fn show_state_ptr(si: StatePtr) -> String {
s
}

/// Appends the signed integer `n` to `data` as a zigzag-mapped varint.
///
/// The value is first mapped to an unsigned integer so that small
/// magnitudes (positive or negative) yield small numbers: a non-negative
/// `n` becomes `2n`, a negative `n` becomes the bitwise complement of `2n`.
/// The result is then written with `write_varu32`.
///
/// Format: https://developers.google.com/protocol-buffers/docs/encoding#varints
fn write_vari32(data: &mut Vec<u8>, n: i32) {
    let doubled = (n as u32) << 1;
    // Complementing moves the sign into the lowest bit for negative inputs.
    let zigzag = if n < 0 { !doubled } else { doubled };
    write_varu32(data, zigzag)
}

/// Reads a zigzag-mapped signed varint from the front of `data`.
///
/// Returns the decoded value together with the number of bytes consumed,
/// exactly as reported by `read_varu32`. The lowest bit of the unsigned
/// value carries the sign; the remaining bits carry the magnitude.
///
/// Format: https://developers.google.com/protocol-buffers/docs/encoding#varints
fn read_vari32(data: &[u8]) -> (i32, usize) {
    let (un, nread) = read_varu32(data);
    let half = (un >> 1) as i32;
    // An odd unsigned value marks a negative number: undo the complement.
    let n = if un & 1 == 0 { half } else { !half };
    (n, nread)
}

/// Appends `n` to `data` as an unsigned varint.
///
/// The value is emitted seven bits at a time, least significant group
/// first. Every byte except the last has its high bit set to signal that
/// more bytes follow.
///
/// Format: https://developers.google.com/protocol-buffers/docs/encoding#varints
fn write_varu32(data: &mut Vec<u8>, mut n: u32) {
    loop {
        let group = (n & 0b0111_1111) as u8;
        n >>= 7;
        if n == 0 {
            // Final group: high bit clear terminates the varint.
            data.push(group);
            return;
        }
        data.push(group | 0b1000_0000);
    }
}

/// Reads an unsigned varint from the front of `data`.
///
/// Bytes are consumed until one with a clear high bit is found; each byte
/// contributes its low seven bits, least significant group first.
///
/// Returns `(value, bytes_read)`. `(0, 0)` is returned when no complete
/// varint can be decoded: `data` is empty, ends before a terminating byte,
/// or needs more than the five bytes a `u32` can occupy. Bytes following
/// the terminating byte are ignored.
///
/// Format: https://developers.google.com/protocol-buffers/docs/encoding#varints
fn read_varu32(data: &[u8]) -> (u32, usize) {
    let mut n: u32 = 0;
    let mut shift: u32 = 0;
    for (i, &b) in data.iter().enumerate() {
        // A u32 spans at most five 7-bit groups (shifts 0..=28). Anything
        // longer is malformed; bail out instead of shifting by >= 32 bits,
        // which panics in overflow-checked builds.
        if shift >= 32 {
            return (0, 0);
        }
        if b < 0b1000_0000 {
            return (n | ((b as u32) << shift), i + 1);
        }
        n |= ((b as u32) & 0b0111_1111) << shift;
        shift += 7;
    }
    (0, 0)
}

#[cfg(test)]
mod tests {
use quickcheck::quickcheck;
extern crate rand;

use quickcheck::{QuickCheck, StdGen, quickcheck};
use super::{
StateFlags, State, push_inst_ptr,
write_varu32, read_varu32, write_vari32, read_vari32,
};

#[test]
Expand All @@ -1818,10 +1840,36 @@ mod tests {
push_inst_ptr(&mut data, &mut prev, ip);
}
let state = State { data: data.into_boxed_slice() };
state.inst_ptrs().zip(ips.iter()).all(|(x, &y)| x == y as usize)
&&
state.flags() == StateFlags(flags)

let expected: Vec<usize> =
ips.into_iter().map(|ip| ip as usize).collect();
let got: Vec<usize> = state.inst_ptrs().collect();
expected == got && state.flags() == StateFlags(flags)
}
QuickCheck::new()
.gen(StdGen::new(self::rand::thread_rng(), 70_000))
.quickcheck(p as fn(Vec<u32>, u8) -> bool);
}

#[test]
fn prop_read_write_u32() {
    // Round-trip property: decoding what was just encoded yields the same
    // value and consumes every byte that was written.
    fn p(n: u32) -> bool {
        let mut encoded = Vec::new();
        write_varu32(&mut encoded, n);
        let (decoded, consumed) = read_varu32(&encoded);
        decoded == n && consumed == encoded.len()
    }
    quickcheck(p as fn(u32) -> bool);
}

#[test]
fn prop_read_write_i32() {
fn p(n: i32) -> bool {
let mut buf = vec![];
write_vari32(&mut buf, n);
let (got, nread) = read_vari32(&buf);
nread == buf.len() && got == n
}
quickcheck(p as fn(Vec<u32>, u8) -> bool)
quickcheck(p as fn(i32) -> bool);
}
}
1 change: 0 additions & 1 deletion tests/suffix_reverse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,3 @@ mat!(t03, r".*(?:abcd)+", r"abcdabcd", Some((0, 8)));
mat!(t04, r".*(?:abcd)+", r"abcdxabcd", Some((0, 9)));
mat!(t05, r".*x(?:abcd)+", r"abcdxabcd", Some((0, 9)));
mat!(t06, r"[^abcd]*x(?:abcd)+", r"abcdxabcd", Some((4, 9)));
// mat!(t05, r".*(?:abcd)+", r"abcdabcd", Some((0, 4)));