
Commit 18a3db6

auto merge of #18357 : TeXitoi/rust/simplify-reverse-complement, r=alexcrichton
Simpler, safer, and shorter, in the same spirit as the current version, and with the same performance. @mahkoh please review; I don't think I changed anything performance-related.
2 parents 77f44d4 + 7017fb0 commit 18a3db6

1 file changed: +109 -179 lines
src/test/bench/shootout-reverse-complement.rs

@@ -38,185 +38,93 @@
 // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 // OF THE POSSIBILITY OF SUCH DAMAGE.
 
-// ignore-android doesn't terminate?
+// ignore-android see #10393 #13206
 
-#![feature(slicing_syntax, asm, if_let, tuple_indexing)]
+#![feature(slicing_syntax, unboxed_closures, overloaded_calls)]
 
 extern crate libc;
 
 use std::io::stdio::{stdin_raw, stdout_raw};
-use std::sync::{Future};
 use std::num::{div_rem};
 use std::ptr::{copy_memory};
 use std::io::{IoResult, EndOfFile};
-use std::slice::raw::{mut_buf_as_slice};
 
-use shared_memory::{SharedMemory};
-
-mod tables {
-    use std::sync::{Once, ONCE_INIT};
-
-    /// Lookup tables.
-    static mut CPL16: [u16, ..1 << 16] = [0, ..1 << 16];
-    static mut CPL8: [u8, ..1 << 8] = [0, ..1 << 8];
-
-    /// Generates the tables.
-    pub fn get() -> Tables {
-        /// To make sure we initialize the tables only once.
-        static INIT: Once = ONCE_INIT;
-        INIT.doit(|| {
-            unsafe {
-                for i in range(0, 1 << 8) {
-                    CPL8[i] = match i as u8 {
-                        b'A' | b'a' => b'T',
-                        b'C' | b'c' => b'G',
-                        b'G' | b'g' => b'C',
-                        b'T' | b't' => b'A',
-                        b'U' | b'u' => b'A',
-                        b'M' | b'm' => b'K',
-                        b'R' | b'r' => b'Y',
-                        b'W' | b'w' => b'W',
-                        b'S' | b's' => b'S',
-                        b'Y' | b'y' => b'R',
-                        b'K' | b'k' => b'M',
-                        b'V' | b'v' => b'B',
-                        b'H' | b'h' => b'D',
-                        b'D' | b'd' => b'H',
-                        b'B' | b'b' => b'V',
-                        b'N' | b'n' => b'N',
-                        i => i,
-                    };
-                }
-
-                for (i, v) in CPL16.iter_mut().enumerate() {
-                    *v = *CPL8.unsafe_get(i & 255) as u16 << 8 |
-                         *CPL8.unsafe_get(i >> 8) as u16;
-                }
-            }
-        });
-        Tables { _dummy: () }
-    }
-
-    /// Accessor for the static arrays.
-    ///
-    /// To make sure that the tables can't be accessed without having been initialized.
-    pub struct Tables {
-        _dummy: ()
-    }
-
-    impl Tables {
-        /// Retreives the complement for `i`.
-        pub fn cpl8(self, i: u8) -> u8 {
-            // Not really unsafe.
-            unsafe { CPL8[i as uint] }
-        }
-
-        /// Retreives the complement for `i`.
-        pub fn cpl16(self, i: u16) -> u16 {
-            unsafe { CPL16[i as uint] }
-        }
-    }
+struct Tables {
+    table8: [u8, ..1 << 8],
+    table16: [u16, ..1 << 16]
 }
 
-mod shared_memory {
-    use std::sync::{Arc};
-    use std::mem::{transmute};
-    use std::raw::{Slice};
-
-    /// Structure for sharing disjoint parts of a vector mutably across tasks.
-    pub struct SharedMemory {
-        ptr: Arc<Vec<u8>>,
-        start: uint,
-        len: uint,
-    }
-
-    impl SharedMemory {
-        pub fn new(ptr: Vec<u8>) -> SharedMemory {
-            let len = ptr.len();
-            SharedMemory {
-                ptr: Arc::new(ptr),
-                start: 0,
-                len: len,
-            }
+impl Tables {
+    fn new() -> Tables {
+        let mut table8 = [0, ..1 << 8];
+        for (i, v) in table8.iter_mut().enumerate() {
+            *v = Tables::computed_cpl8(i as u8);
         }
-
-        pub fn as_mut_slice(&mut self) -> &mut [u8] {
-            unsafe {
-                transmute(Slice {
-                    data: self.ptr.as_ptr().offset(self.start as int) as *const u8,
-                    len: self.len,
-                })
-            }
+        let mut table16 = [0, ..1 << 16];
+        for (i, v) in table16.iter_mut().enumerate() {
+            *v = table8[i & 255] as u16 << 8 |
+                 table8[i >> 8] as u16;
         }
+        Tables { table8: table8, table16: table16 }
+    }
 
-        pub fn len(&self) -> uint {
-            self.len
+    fn computed_cpl8(c: u8) -> u8 {
+        match c {
+            b'A' | b'a' => b'T',
+            b'C' | b'c' => b'G',
+            b'G' | b'g' => b'C',
+            b'T' | b't' => b'A',
+            b'U' | b'u' => b'A',
+            b'M' | b'm' => b'K',
+            b'R' | b'r' => b'Y',
+            b'W' | b'w' => b'W',
+            b'S' | b's' => b'S',
+            b'Y' | b'y' => b'R',
+            b'K' | b'k' => b'M',
+            b'V' | b'v' => b'B',
+            b'H' | b'h' => b'D',
+            b'D' | b'd' => b'H',
+            b'B' | b'b' => b'V',
+            b'N' | b'n' => b'N',
+            i => i,
         }
+    }
 
-        pub fn split_at(self, mid: uint) -> (SharedMemory, SharedMemory) {
-            assert!(mid <= self.len);
-            let left = SharedMemory {
-                ptr: self.ptr.clone(),
-                start: self.start,
-                len: mid,
-            };
-            let right = SharedMemory {
-                ptr: self.ptr,
-                start: self.start + mid,
-                len: self.len - mid,
-            };
-            (left, right)
-        }
+    /// Retreives the complement for `i`.
+    fn cpl8(&self, i: u8) -> u8 {
+        self.table8[i as uint]
+    }
 
-        /// Resets the object so that it covers the whole range of the contained vector.
-        ///
-        /// You must not call this method if `self` is not the only reference to the
-        /// shared memory.
-        ///
-        /// FIXME: If `Arc` had a method to check if the reference is unique, then we
-        /// wouldn't need the `unsafe` here.
-        ///
-        /// FIXME: If `Arc` had a method to unwrap the contained value, then we could
-        /// simply unwrap here.
-        pub unsafe fn reset(self) -> SharedMemory {
-            let len = self.ptr.len();
-            SharedMemory {
-                ptr: self.ptr,
-                start: 0,
-                len: len,
-            }
-        }
+    /// Retreives the complement for `i`.
+    fn cpl16(&self, i: u16) -> u16 {
+        self.table16[i as uint]
     }
 }
 
-
 /// Reads all remaining bytes from the stream.
 fn read_to_end<R: Reader>(r: &mut R) -> IoResult<Vec<u8>> {
+    // As reading the input stream in memory is a bottleneck, we tune
+    // Reader::read_to_end() with a fast growing policy to limit
+    // recopies. If MREMAP_RETAIN is implemented in the linux kernel
+    // and jemalloc use it, this trick will become useless.
     const CHUNK: uint = 64 * 1024;
 
-    let mut vec = Vec::with_capacity(1024 * 1024);
+    let mut vec = Vec::with_capacity(CHUNK);
     loop {
+        // workaround: very fast growing
         if vec.capacity() - vec.len() < CHUNK {
             let cap = vec.capacity();
             let mult = if cap < 256 * 1024 * 1024 {
-                // FIXME (mahkoh): Temporary workaround for jemalloc on linux. Replace
-                // this by 2x once the jemalloc preformance issue has been fixed.
                 16
             } else {
                 2
             };
             vec.reserve_exact(mult * cap);
         }
-        unsafe {
-            let ptr = vec.as_mut_ptr().offset(vec.len() as int);
-            match mut_buf_as_slice(ptr, CHUNK, |s| r.read(s)) {
-                Ok(n) => {
-                    let len = vec.len();
-                    vec.set_len(len + n);
-                },
-                Err(ref e) if e.kind == EndOfFile => break,
-                Err(e) => return Err(e),
-            }
+        match r.push_at_least(1, CHUNK, &mut vec) {
+            Ok(_) => {}
+            Err(ref e) if e.kind == EndOfFile => break,
+            Err(e) => return Err(e)
         }
     }
     Ok(vec)
@@ -225,11 +133,8 @@ fn read_to_end<R: Reader>(r: &mut R) -> IoResult<Vec<u8>> {
 /// Finds the first position at which `b` occurs in `s`.
 fn memchr(h: &[u8], n: u8) -> Option<uint> {
     use libc::{c_void, c_int, size_t};
-    extern {
-        fn memchr(h: *const c_void, n: c_int, s: size_t) -> *mut c_void;
-    }
     let res = unsafe {
-        memchr(h.as_ptr() as *const c_void, n as c_int, h.len() as size_t)
+        libc::memchr(h.as_ptr() as *const c_void, n as c_int, h.len() as size_t)
     };
     if res.is_null() {
         None
@@ -238,13 +143,36 @@ fn memchr(h: &[u8], n: u8) -> Option<uint> {
     }
 }
 
+/// A mutable iterator over DNA sequences
+struct MutDnaSeqs<'a> { s: &'a mut [u8] }
+fn mut_dna_seqs<'a>(s: &'a mut [u8]) -> MutDnaSeqs<'a> {
+    MutDnaSeqs { s: s }
+}
+impl<'a> Iterator<&'a mut [u8]> for MutDnaSeqs<'a> {
+    fn next(&mut self) -> Option<&'a mut [u8]> {
+        let tmp = std::mem::replace(&mut self.s, &mut []);
+        let tmp = match memchr(tmp, b'\n') {
+            Some(i) => tmp.slice_from_mut(i + 1),
+            None => return None,
+        };
+        let (seq, tmp) = match memchr(tmp, b'>') {
+            Some(i) => tmp.split_at_mut(i),
+            None => {
+                let len = tmp.len();
+                tmp.split_at_mut(len)
+            }
+        };
+        self.s = tmp;
+        Some(seq)
+    }
+}
+
 /// Length of a normal line without the terminating \n.
 const LINE_LEN: uint = 60;
 
 /// Compute the reverse complement.
-fn reverse_complement(mut view: SharedMemory, tables: tables::Tables) {
-    // Drop the last newline
-    let seq = view.as_mut_slice().init_mut();
+fn reverse_complement(seq: &mut [u8], tables: &Tables) {
+    let seq = seq.init_mut();// Drop the last newline
     let len = seq.len();
     let off = LINE_LEN - len % (LINE_LEN + 1);
     let mut i = LINE_LEN;
@@ -290,34 +218,36 @@ fn reverse_complement(mut view: SharedMemory, tables: tables::Tables) {
     }
 }
 
-fn main() {
-    let mut data = SharedMemory::new(read_to_end(&mut stdin_raw()).unwrap());
-    let tables = tables::get();
-
-    let mut futures = vec!();
-    loop {
-        let (_, mut tmp_data) = match memchr(data.as_mut_slice(), b'\n') {
-            Some(i) => data.split_at(i + 1),
-            _ => break,
-        };
-        let (view, tmp_data) = match memchr(tmp_data.as_mut_slice(), b'>') {
-            Some(i) => tmp_data.split_at(i),
-            None => {
-                let len = tmp_data.len();
-                tmp_data.split_at(len)
-            },
-        };
-        futures.push(Future::spawn(proc() reverse_complement(view, tables)));
-        data = tmp_data;
-    }
-
-    for f in futures.iter_mut() {
-        f.get();
+/// Executes a closure in parallel over the given iterator over mutable slice.
+/// The closure `f` is run in parallel with an element of `iter`.
+fn parallel<'a, I, T, F>(mut iter: I, f: F)
+        where T: Send + Sync,
+              I: Iterator<&'a mut [T]>,
+              F: Fn(&'a mut [T]) + Sync {
+    use std::mem;
+    use std::raw::Repr;
+
+    let (tx, rx) = channel();
+    for chunk in iter {
+        let tx = tx.clone();
+
+        // Need to convert `f` and `chunk` to something that can cross the task
+        // boundary.
+        let f = &f as *const F as *const uint;
+        let raw = chunk.repr();
+        spawn(proc() {
+            let f = f as *const F;
+            unsafe { (*f)(mem::transmute(raw)) }
+            drop(tx)
+        });
     }
+    drop(tx);
+    for () in rx.iter() {}
+}
 
-    // Not actually unsafe. If Arc had a way to check uniqueness then we could do that in
-    // `reset` and it would tell us that, yes, it is unique at this point.
-    data = unsafe { data.reset() };
-
+fn main() {
+    let mut data = read_to_end(&mut stdin_raw()).unwrap();
+    let tables = &Tables::new();
+    parallel(mut_dna_seqs(data[mut]), |&: seq| reverse_complement(seq, tables));
     stdout_raw().write(data.as_mut_slice()).unwrap();
 }
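A note on the new `parallel()` helper above: the `*const F` cast and the `Repr`/`transmute` round trip exist only because `spawn(proc() ...)` cannot borrow from the parent task, so the borrowed closure and each mutable chunk are smuggled across the task boundary as raw values, and the cloned `Sender` is used purely to wait for every task to exit. As a rough, hypothetical analogue (not part of this commit; present-day Rust syntax using `std::thread::scope`, with names invented for illustration), the same pattern can be written without `unsafe`:

use std::thread;

/// Run `f` on each mutable chunk on its own thread; the scope guarantees
/// every spawned thread has finished before `parallel` returns.
fn parallel<'a, T, F>(chunks: impl Iterator<Item = &'a mut [T]>, f: F)
where
    T: Send + 'a,
    F: Fn(&mut [T]) + Sync,
{
    let f = &f; // shared by reference across threads; F: Sync makes &F Send
    thread::scope(|s| {
        for chunk in chunks {
            // `move` transfers the &mut chunk and a copy of the &F into the thread.
            s.spawn(move || f(chunk));
        }
    }); // all spawned threads are joined here
}

fn main() {
    // Toy stand-in for the per-sequence work done by reverse_complement().
    let mut data = *b"abcdef";
    parallel(data.chunks_mut(2), |chunk| chunk.reverse());
    assert_eq!(&data, b"badcfe");
}

The scope provides the guarantee the committed code has to assert manually with `unsafe`: no spawned task outlives the buffer it mutates.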
