Merge #11

bors[bot] · smarnach · bors[bot] · commit ad60b608d952 · 2018-10-03T20:44:35.000Z
11: Optimise the GCD implementations. r=cuviper a=smarnach

This change avoids using `swap()` in the GCD implementation, which results in a speedup of almost a factor of two on my system.  Here are a few benchmark numbers for `u32` values, where `t_old` is the time per function call for the current implementation, and `t_new` is the time of the implementation introduced in this change.
```
| m             | n             | t_old (ns) | t_new (ns) |
+---------------+---------------+------------+------------+
| 2_971_215_073 | 1_836_311_903 |         76 |         40 |
|       253_241 |       489_997 |         26 |         12 |
| 4_183_928_743 |     1_234_567 |         63 |         31 |
| 3_221_225_469 |             3 |         93 |         47 |
|   0xffff_ffff |   0x8000_0000 |         98 |         55 |
```
I'm aware that having tested this on a single system for a single type doesn't tell us too much, but before trying to perform a more thorough benchmark, I'd like to know whether you are interested in this kind of optimisation in principle, and what kind of benchmark results you would like to see to inlcude it.

Co-authored-by: Sven Marnach &lt;sven@marnach.net&gt;
diff --git a/benches/gcd.rs b/benches/gcd.rs
@@ -0,0 +1,176 @@
+//! Benchmark comparing the current GCD implemtation against an older one.
+
+#![feature(test)]
+
+extern crate num_integer;
+extern crate num_traits;
+extern crate test;
+
+use num_integer::Integer;
+use num_traits::{AsPrimitive, Bounded, Signed};
+use test::{black_box, Bencher};
+
+trait GcdOld: Integer {
+    fn gcd_old(&self, other: &Self) -> Self;
+}
+
+macro_rules! impl_gcd_old_for_isize {
+    ($T:ty) => {
+        impl GcdOld for $T {
+            /// Calculates the Greatest Common Divisor (GCD) of the number and
+            /// `other`. The result is always positive.
+            #[inline]
+            fn gcd_old(&self, other: &Self) -> Self {
+                // Use Stein's algorithm
+                let mut m = *self;
+                let mut n = *other;
+                if m == 0 || n == 0 {
+                    return (m | n).abs();
+                }
+
+                // find common factors of 2
+                let shift = (m | n).trailing_zeros();
+
+                // The algorithm needs positive numbers, but the minimum value
+                // can't be represented as a positive one.
+                // It's also a power of two, so the gcd can be
+                // calculated by bitshifting in that case
+
+                // Assuming two's complement, the number created by the shift
+                // is positive for all numbers except gcd = abs(min value)
+                // The call to .abs() causes a panic in debug mode
+                if m == Self::min_value() || n == Self::min_value() {
+                    return (1 << shift).abs();
+                }
+
+                // guaranteed to be positive now, rest like unsigned algorithm
+                m = m.abs();
+                n = n.abs();
+
+                // divide n and m by 2 until odd
+                // m inside loop
+                n >>= n.trailing_zeros();
+
+                while m != 0 {
+                    m >>= m.trailing_zeros();
+                    if n > m {
+                        std::mem::swap(&mut n, &mut m)
+                    }
+                    m -= n;
+                }
+
+                n << shift
+            }
+        }
+    };
+}
+
+impl_gcd_old_for_isize!(i8);
+impl_gcd_old_for_isize!(i16);
+impl_gcd_old_for_isize!(i32);
+impl_gcd_old_for_isize!(i64);
+impl_gcd_old_for_isize!(isize);
+impl_gcd_old_for_isize!(i128);
+
+macro_rules! impl_gcd_old_for_usize {
+    ($T:ty) => {
+        impl GcdOld for $T {
+            /// Calculates the Greatest Common Divisor (GCD) of the number and
+            /// `other`. The result is always positive.
+            #[inline]
+            fn gcd_old(&self, other: &Self) -> Self {
+                // Use Stein's algorithm
+                let mut m = *self;
+                let mut n = *other;
+                if m == 0 || n == 0 {
+                    return m | n;
+                }
+
+                // find common factors of 2
+                let shift = (m | n).trailing_zeros();
+
+                // divide n and m by 2 until odd
+                // m inside loop
+                n >>= n.trailing_zeros();
+
+                while m != 0 {
+                    m >>= m.trailing_zeros();
+                    if n > m {
+                        std::mem::swap(&mut n, &mut m)
+                    }
+                    m -= n;
+                }
+
+                n << shift
+            }
+        }
+    };
+}
+
+impl_gcd_old_for_usize!(u8);
+impl_gcd_old_for_usize!(u16);
+impl_gcd_old_for_usize!(u32);
+impl_gcd_old_for_usize!(u64);
+impl_gcd_old_for_usize!(usize);
+impl_gcd_old_for_usize!(u128);
+
+/// Return an iterator that yields all Fibonacci numbers fitting into a u128.
+fn fibonacci() -> impl Iterator<Item=u128> {
+    (0..185).scan((0, 1), |&mut (ref mut a, ref mut b), _| {
+        let tmp = *a;
+        *a = *b;
+        *b += tmp;
+        Some(*b)
+    })
+}
+
+fn run_bench<T: Integer + Bounded + Copy + 'static>(b: &mut Bencher, gcd: fn(&T, &T) -> T)
+where
+    T: AsPrimitive<u128>,
+    u128: AsPrimitive<T>,
+{
+    let max_value: u128 = T::max_value().as_();
+    let pairs: Vec<(T, T)> = fibonacci()
+        .collect::<Vec<_>>()
+        .windows(2)
+        .filter(|&pair| pair[0] <= max_value && pair[1] <= max_value)
+        .map(|pair| (pair[0].as_(), pair[1].as_()))
+        .collect();
+    b.iter(|| {
+        for &(ref m, ref n) in &pairs {
+            black_box(gcd(m, n));
+        }
+    });
+}
+
+macro_rules! bench_gcd {
+    ($T:ident) => {
+        mod $T {
+            use crate::{run_bench, GcdOld};
+            use num_integer::Integer;
+            use test::Bencher;
+
+            #[bench]
+            fn bench_gcd(b: &mut Bencher) {
+                run_bench(b, $T::gcd);
+            }
+
+            #[bench]
+            fn bench_gcd_old(b: &mut Bencher) {
+                run_bench(b, $T::gcd_old);
+            }
+        }
+    };
+}
+
+bench_gcd!(u8);
+bench_gcd!(u16);
+bench_gcd!(u32);
+bench_gcd!(u64);
+bench_gcd!(u128);
+
+bench_gcd!(i8);
+bench_gcd!(i16);
+bench_gcd!(i32);
+bench_gcd!(i64);
+bench_gcd!(i128);
diff --git a/src/lib.rs b/src/lib.rs
@@ -23,7 +23,6 @@ extern crate std;
 extern crate num_traits as traits;
 
 use core::ops::Add;
-use core::mem;
 
 use traits::{Num, Signed};
 
@@ -275,16 +274,19 @@ macro_rules! impl_integer_for_isize {
                 n = n.abs();
 
                 // divide n and m by 2 until odd
-                // m inside loop
+                m >>= m.trailing_zeros();
                 n >>= n.trailing_zeros();
 
-                while m != 0 {
-                    m >>= m.trailing_zeros();
-                    if n > m { mem::swap(&mut n, &mut m) }
-                    m -= n;
+                while m != n {
+                    if m > n {
+                        m -= n;
+                        m >>= m.trailing_zeros();
+                    } else {
+                        n -= m;
+                        n >>= n.trailing_zeros();
+                    }
                 }
-
-                n << shift
+                m << shift
             }
 
             /// Calculates the Lowest Common Multiple (LCM) of the number and
@@ -537,16 +539,19 @@ macro_rules! impl_integer_for_usize {
                 let shift = (m | n).trailing_zeros();
 
                 // divide n and m by 2 until odd
-                // m inside loop
+                m >>= m.trailing_zeros();
                 n >>= n.trailing_zeros();
 
-                while m != 0 {
-                    m >>= m.trailing_zeros();
-                    if n > m { mem::swap(&mut n, &mut m) }
-                    m -= n;
+                while m != n {
+                    if m > n {
+                        m -= n;
+                        m >>= m.trailing_zeros();
+                    } else {
+                        n -= m;
+                        n >>= n.trailing_zeros();
+                    }
                 }
-
-                n << shift
+                m << shift
             }
 
             /// Calculates the Lowest Common Multiple (LCM) of the number and `other`.