Skip to content

Commit c5d9304

Browse files
BurntSushi, SeanRBurton
authored and committed
Use variable width integer encoding for compression.
This slightly modifies the implementation in rust-lang#226 to use variable-width integers. This saves a bit more space for regexes with huge alternations whose instruction-pointer deltas exceed 127 (typically from Unicode character classes). This also adjusts `approximate_size`, which is the actual gatekeeper that decides whether the DFA has filled its cache. The approximation is now reduced slightly to account for the space savings. The variable-width integer encoding used is the one from Protocol Buffers, documented here: https://developers.google.com/protocol-buffers/docs/encoding#varints
1 parent 14a8989 commit c5d9304

File tree

1 file changed

+91
-43
lines changed

1 file changed

+91
-43
lines changed

src/dfa.rs

+91-43
Original file line number | Diff line number | Diff line change
@@ -275,25 +275,12 @@ struct State{
275275
/// `u32` here for the DFA to save on space.
276276
type InstPtr = u32;
277277

278-
// Used to construct new states.
278+
/// Adds ip to data using delta encoding with respect to prev.
279+
///
280+
/// After completion, `data` will contain `ip` and `prev` will be set to `ip`.
279281
fn push_inst_ptr(data: &mut Vec<u8>, prev: &mut InstPtr, ip: InstPtr) {
280282
let delta = (ip as i32) - (*prev as i32);
281-
if delta.abs() <= 127 {
282-
data.push(delta as u8);
283-
*prev = ip;
284-
return;
285-
}
286-
let delta = delta as u32;
287-
// Write 4 bytes in little-endian format.
288-
let a = (delta & (0xFF << 0 * 8)) >> 0 * 8;
289-
let b = (delta & (0xFF << 1 * 8)) >> 1 * 8;
290-
let c = (delta & (0xFF << 2 * 8)) >> 2 * 8;
291-
let d = (delta & (0xFF << 3 * 8)) >> 3 * 8;
292-
data.push(128);
293-
data.push(a as u8);
294-
data.push(b as u8);
295-
data.push(c as u8);
296-
data.push(d as u8);
283+
write_vari32(data, delta);
297284
*prev = ip;
298285
}
299286

@@ -306,31 +293,20 @@ impl <'a>Iterator for InstPtrs<'a> {
306293
type Item = usize;
307294

308295
fn next(&mut self) -> Option<usize> {
309-
let x = match self.data.get(0){
310-
Some(&x) => x,
311-
None => return None,
312-
};
313-
let delta = if x == 128 {
314-
//Read 4 bytes in little-endian format.
315-
let a = self.data[1] as u32;
316-
let b = self.data[2] as u32;
317-
let c = self.data[3] as u32;
318-
let d = self.data[4] as u32;
319-
self.data = &self.data[5..];
320-
(a << 0 * 8 | b << 1 * 8 | c << 2 * 8 | d << 3 * 8) as i32 as isize
321-
} else {
322-
self.data = &self.data[1..];
323-
x as i8 as isize
324-
};
325-
let base = self.base as isize + delta;
296+
if self.data.is_empty() {
297+
return None;
298+
}
299+
let (delta, nread) = read_vari32(self.data);
300+
let base = self.base as i32 + delta;
326301
debug_assert!(base >= 0);
302+
debug_assert!(nread > 0);
303+
self.data = &self.data[nread..];
327304
self.base = base as usize;
328305
Some(self.base)
329306
}
330307
}
331308

332309
impl State {
333-
334310
fn flags(&self) -> StateFlags {
335311
StateFlags(self.data[0])
336312
}
@@ -1566,14 +1542,15 @@ impl<'a> Fsm<'a> {
15661542
fn approximate_size(&self) -> usize {
15671543
use std::mem::size_of as size;
15681544
// Estimate that there are about 16 instructions per state consuming
1569-
// 64 = 16 * 4 bytes of space.
1545+
// 20 = 4 + (15 * 1) bytes of space (1 byte because of delta encoding).
1546+
const STATE_HEAP: usize = 20 + 1; // one extra byte for flags
15701547
let compiled =
1571-
(self.cache.compiled.len() * (size::<State>() + 64))
1548+
(self.cache.compiled.len() * (size::<State>() + STATE_HEAP))
15721549
+ (self.cache.compiled.len() * size::<StatePtr>());
15731550
let states =
15741551
self.cache.states.len()
15751552
* (size::<State>()
1576-
+ 64
1553+
+ STATE_HEAP
15771554
+ (self.num_byte_classes() * size::<StatePtr>()));
15781555
let start_states = self.cache.start_states.len() * size::<StatePtr>();
15791556
self.prog.approximate_size() + compiled + states + start_states
@@ -1802,11 +1779,56 @@ fn show_state_ptr(si: StatePtr) -> String {
18021779
s
18031780
}
18041781

1782+
/// Appends `n` to `data` as a signed variable-width integer.
///
/// The sign is folded into the lowest bit before encoding (non-negative `n`
/// becomes `2n`, negative `n` becomes `-2n - 1`), so values of small
/// magnitude take few bytes regardless of sign. The folded value is then
/// written with `write_varu32`.
///
/// https://developers.google.com/protocol-buffers/docs/encoding#varints
fn write_vari32(data: &mut Vec<u8>, n: i32) {
    let mut un = (n as u32) << 1;
    if n < 0 {
        un = !un;
    }
    write_varu32(data, un)
}

/// Reads a signed variable-width integer from the front of `data`.
///
/// Returns the decoded value and the number of bytes consumed. This is the
/// inverse of `write_vari32`. If the underlying unsigned read fails, the
/// result is `(0, 0)`.
///
/// https://developers.google.com/protocol-buffers/docs/encoding#varints
fn read_vari32(data: &[u8]) -> (i32, usize) {
    let (un, i) = read_varu32(data);
    // The lowest bit carries the sign; the remaining bits carry the magnitude.
    let mut n = (un >> 1) as i32;
    if un & 1 != 0 {
        n = !n;
    }
    (n, i)
}

/// Appends `n` to `data` as an unsigned variable-width integer.
///
/// The value is emitted seven bits at a time, least significant group first.
/// Every byte except the last has its high bit set to signal continuation.
///
/// https://developers.google.com/protocol-buffers/docs/encoding#varints
fn write_varu32(data: &mut Vec<u8>, mut n: u32) {
    while n >= 0b1000_0000 {
        data.push((n as u8) | 0b1000_0000);
        n >>= 7;
    }
    data.push(n as u8);
}

/// Reads an unsigned variable-width integer from the front of `data`.
///
/// Returns the decoded value and the number of bytes consumed. Returns
/// `(0, 0)` when no complete value can be decoded: `data` is empty, ends in
/// the middle of a value, or has no terminating byte within the first five
/// bytes.
///
/// https://developers.google.com/protocol-buffers/docs/encoding#varints
fn read_varu32(data: &[u8]) -> (u32, usize) {
    // A `u32` needs at most ceil(32 / 7) = 5 groups of seven bits. Stopping
    // there keeps `shift` at or below 28, so a malformed run of continuation
    // bytes can never shift by 32 or more (a panic in debug builds and a
    // silently wrong result in release builds).
    const MAX_VARU32_BYTES: usize = 5;

    let mut n: u32 = 0;
    let mut shift: u32 = 0;
    for (i, &b) in data.iter().take(MAX_VARU32_BYTES).enumerate() {
        if b < 0b1000_0000 {
            // High bit clear: this is the final byte of the value.
            return (n | ((b as u32) << shift), i + 1);
        }
        n |= ((b as u32) & 0b0111_1111) << shift;
        shift += 7;
    }
    (0, 0)
}
1823+
18051824
#[cfg(test)]
18061825
mod tests {
1807-
use quickcheck::quickcheck;
1826+
extern crate rand;
1827+
1828+
use quickcheck::{QuickCheck, StdGen, quickcheck};
18081829
use super::{
18091830
StateFlags, State, push_inst_ptr,
1831+
write_varu32, read_varu32, write_vari32, read_vari32,
18101832
};
18111833

18121834
#[test]
@@ -1818,10 +1840,36 @@ mod tests {
18181840
push_inst_ptr(&mut data, &mut prev, ip);
18191841
}
18201842
let state = State { data: data.into_boxed_slice() };
1821-
state.inst_ptrs().zip(ips.iter()).all(|(x, &y)| x == y as usize)
1822-
&&
1823-
state.flags() == StateFlags(flags)
1843+
1844+
let expected: Vec<usize> =
1845+
ips.into_iter().map(|ip| ip as usize).collect();
1846+
let got: Vec<usize> = state.inst_ptrs().collect();
1847+
expected == got && state.flags() == StateFlags(flags)
1848+
}
1849+
QuickCheck::new()
1850+
.gen(StdGen::new(self::rand::thread_rng(), 70_000))
1851+
.quickcheck(p as fn(Vec<u32>, u8) -> bool);
1852+
}
1853+
1854+
#[test]
fn prop_read_write_u32() {
    // Property: decoding the bytes produced by `write_varu32` yields the
    // original value and consumes exactly the bytes that were written.
    fn p(n: u32) -> bool {
        let mut encoded = Vec::new();
        write_varu32(&mut encoded, n);
        let (decoded, consumed) = read_varu32(&encoded);
        decoded == n && consumed == encoded.len()
    }
    quickcheck(p as fn(u32) -> bool);
}
1864+
1865+
#[test]
1866+
fn prop_read_write_i32() {
1867+
fn p(n: i32) -> bool {
1868+
let mut buf = vec![];
1869+
write_vari32(&mut buf, n);
1870+
let (got, nread) = read_vari32(&buf);
1871+
nread == buf.len() && got == n
18241872
}
1825-
quickcheck(p as fn(Vec<u32>, u8) -> bool)
1873+
quickcheck(p as fn(i32) -> bool);
18261874
}
18271875
}

0 commit comments

Comments
 (0)