Skip to content

Commit 1868a26

Browse files
committed
auto merge of #17989 : alexcrichton/rust/spectralnorm, r=thestinger
This improves the spectralnorm shootout benchmark through a few vectors after looking at the leading C implementation:

* The simd-based f64x2 is now used to parallelize a few computations
* RWLock usage has been removed. A custom `parallel` function was added as a form of stack-based fork-join parallelism. I found that the contention on the locks was high as well as hindering other optimizations. This does, however, introduce one `unsafe` block into the benchmarks, which previously had none.

In terms of timings, the before and after numbers are:

```
$ time ./shootout-spectralnorm-before
./shootout-spectralnorm-before 2.07s user 0.71s system 324% cpu 0.857 total

$ time ./shootout-spectralnorm-before 5500
./shootout-spectralnorm-before 5500 11.88s user 1.13s system 459% cpu 2.830 total

$ time ./shootout-spectralnorm-after
./shootout-spectralnorm-after 0.58s user 0.01s system 280% cpu 0.210 total

$ time ./shootout-spectralnorm-after 5500
./shootout-spectralnorm-after 5500 3.55s user 0.01s system 455% cpu 0.783 total
```
2 parents 9d5fa7a + f7b5470 commit 1868a26

File tree

1 file changed

+68
-79
lines changed

1 file changed

+68
-79
lines changed

src/test/bench/shootout-spectralnorm.rs

+68-79
Original file line numberDiff line numberDiff line change
@@ -41,105 +41,94 @@
4141
// no-pretty-expanded FIXME #15189
4242

4343
#![allow(non_snake_case)]
44+
#![feature(unboxed_closures, overloaded_calls)]
4445

45-
use std::from_str::FromStr;
46-
use std::iter::count;
47-
use std::cmp::min;
46+
use std::iter::AdditiveIterator;
47+
use std::mem;
4848
use std::os;
49-
use std::sync::{Arc, RWLock};
49+
use std::raw::Repr;
50+
use std::simd::f64x2;
5051

51-
fn A(i: uint, j: uint) -> f64 {
52-
((i + j) * (i + j + 1) / 2 + i + 1) as f64
52+
fn main() {
53+
let args = os::args();
54+
let answer = spectralnorm(if os::getenv("RUST_BENCH").is_some() {
55+
5500
56+
} else if args.len() < 2 {
57+
2000
58+
} else {
59+
from_str(args[1].as_slice()).unwrap()
60+
});
61+
println!("{:.9f}", answer);
5362
}
5463

55-
fn dot(v: &[f64], u: &[f64]) -> f64 {
56-
let mut sum = 0.0;
57-
for (&v_i, &u_i) in v.iter().zip(u.iter()) {
58-
sum += v_i * u_i;
64+
fn spectralnorm(n: uint) -> f64 {
65+
assert!(n % 2 == 0, "only even lengths are accepted");
66+
let mut u = Vec::from_elem(n, 1.0);
67+
let mut v = Vec::from_elem(n, 1.0);
68+
let mut tmp = Vec::from_elem(n, 1.0);
69+
for _ in range(0u, 10) {
70+
mult_AtAv(u.as_slice(), v.as_mut_slice(), tmp.as_mut_slice());
71+
mult_AtAv(v.as_slice(), u.as_mut_slice(), tmp.as_mut_slice());
5972
}
60-
sum
73+
(dot(u.as_slice(), v.as_slice()) / dot(v.as_slice(), v.as_slice())).sqrt()
6174
}
6275

63-
fn mult(v: Arc<RWLock<Vec<f64>>>, out: Arc<RWLock<Vec<f64>>>,
64-
f: fn(&Vec<f64>, uint) -> f64) {
65-
// We launch in different tasks the work to be done. To finish
66-
// this function, we need to wait for the completion of every
67-
// tasks. To do that, we give to each tasks a wait_chan that we
68-
// drop at the end of the work. At the end of this function, we
69-
// wait until the channel hang up.
70-
let (tx, rx) = channel();
71-
72-
let len = out.read().len();
73-
let chunk = len / 20 + 1;
74-
for chk in count(0, chunk) {
75-
if chk >= len {break;}
76-
let tx = tx.clone();
77-
let v = v.clone();
78-
let out = out.clone();
79-
spawn(proc() {
80-
for i in range(chk, min(len, chk + chunk)) {
81-
let val = f(&*v.read(), i);
82-
*out.write().get_mut(i) = val;
83-
}
84-
drop(tx)
85-
});
86-
}
87-
88-
// wait until the channel hang up (every task finished)
89-
drop(tx);
90-
for () in rx.iter() {}
76+
fn mult_AtAv(v: &[f64], out: &mut [f64], tmp: &mut [f64]) {
77+
mult_Av(v, tmp);
78+
mult_Atv(tmp, out);
9179
}
9280

93-
fn mult_Av_impl(v: &Vec<f64> , i: uint) -> f64 {
94-
let mut sum = 0.;
95-
for (j, &v_j) in v.iter().enumerate() {
96-
sum += v_j / A(i, j);
97-
}
98-
sum
81+
fn mult_Av(v: &[f64], out: &mut [f64]) {
82+
parallel(out, |&: start, out| mult(v, out, start, |i, j| A(i, j)));
9983
}
10084

101-
fn mult_Av(v: Arc<RWLock<Vec<f64>>>, out: Arc<RWLock<Vec<f64>>>) {
102-
mult(v, out, mult_Av_impl);
85+
fn mult_Atv(v: &[f64], out: &mut [f64]) {
86+
parallel(out, |&: start, out| mult(v, out, start, |i, j| A(j, i)));
10387
}
10488

105-
fn mult_Atv_impl(v: &Vec<f64> , i: uint) -> f64 {
106-
let mut sum = 0.;
107-
for (j, &v_j) in v.iter().enumerate() {
108-
sum += v_j / A(j, i);
89+
fn mult(v: &[f64], out: &mut [f64], start: uint, a: |uint, uint| -> f64) {
90+
for (i, slot) in out.iter_mut().enumerate().map(|(i, s)| (i + start, s)) {
91+
let mut sum = f64x2(0.0, 0.0);
92+
for (j, chunk) in v.chunks(2).enumerate().map(|(j, s)| (2 * j, s)) {
93+
let top = f64x2(chunk[0], chunk[1]);
94+
let bot = f64x2(a(i, j), a(i, j + 1));
95+
sum += top / bot;
96+
}
97+
let f64x2(a, b) = sum;
98+
*slot = a + b;
10999
}
110-
sum
111100
}
112101

113-
fn mult_Atv(v: Arc<RWLock<Vec<f64>>>, out: Arc<RWLock<Vec<f64>>>) {
114-
mult(v, out, mult_Atv_impl);
102+
fn A(i: uint, j: uint) -> f64 {
103+
((i + j) * (i + j + 1) / 2 + i + 1) as f64
115104
}
116105

117-
fn mult_AtAv(v: Arc<RWLock<Vec<f64>>>, out: Arc<RWLock<Vec<f64>>>,
118-
tmp: Arc<RWLock<Vec<f64>>>) {
119-
mult_Av(v, tmp.clone());
120-
mult_Atv(tmp, out);
106+
fn dot(v: &[f64], u: &[f64]) -> f64 {
107+
v.iter().zip(u.iter()).map(|(a, b)| *a * *b).sum()
121108
}
122109

123-
fn main() {
124-
let args = os::args();
125-
let args = args.as_slice();
126-
let n = if os::getenv("RUST_BENCH").is_some() {
127-
5500
128-
} else if args.len() < 2 {
129-
2000
130-
} else {
131-
FromStr::from_str(args[1].as_slice()).unwrap()
132-
};
133-
let u = Arc::new(RWLock::new(Vec::from_elem(n, 1f64)));
134-
let v = Arc::new(RWLock::new(Vec::from_elem(n, 1f64)));
135-
let tmp = Arc::new(RWLock::new(Vec::from_elem(n, 1f64)));
136-
for _ in range(0u8, 10) {
137-
mult_AtAv(u.clone(), v.clone(), tmp.clone());
138-
mult_AtAv(v.clone(), u.clone(), tmp.clone());
139-
}
110+
// Executes a closure in parallel over the given mutable slice. The closure `f`
111+
// is run in parallel and yielded the starting index within `v` as well as a
112+
// sub-slice of `v`.
113+
fn parallel<'a, T, F>(v: &'a mut [T], f: F)
114+
where T: Send + Sync,
115+
F: Fn(uint, &'a mut [T]) + Sync {
116+
let (tx, rx) = channel();
117+
let size = v.len() / os::num_cpus() + 1;
140118

141-
let u = u.read();
142-
let v = v.read();
143-
println!("{:.9f}", (dot(u.as_slice(), v.as_slice()) /
144-
dot(v.as_slice(), v.as_slice())).sqrt());
119+
for (i, chunk) in v.chunks_mut(size).enumerate() {
120+
let tx = tx.clone();
121+
122+
// Need to convert `f` and `chunk` to something that can cross the task
123+
// boundary.
124+
let f = &f as *const _ as *const uint;
125+
let raw = chunk.repr();
126+
spawn(proc() {
127+
let f = f as *const F;
128+
unsafe { (*f)(i * size, mem::transmute(raw)) }
129+
drop(tx)
130+
});
131+
}
132+
drop(tx);
133+
for () in rx.iter() {}
145134
}

0 commit comments

Comments
 (0)