Here's Cachegrind data for clap-rs from rustc-benchmarks on a "base incremental" "check" build (i.e. the first incremental build):
```
--------------------------------------------------------------------------------
            Ir
--------------------------------------------------------------------------------
39,792,587,805  PROGRAM TOTALS
 6,008,282,218  librustc_data_structures/sip128.rs:rustc_data_structures::sip128::SipHasher128::short_write
 1,655,965,059  libcore/num/mod.rs:rustc_data_structures::sip128::SipHasher128::short_write
   319,188,771  libcore/cmp.rs:rustc_data_structures::sip128::SipHasher128::short_write
```
That's over 20% of all instructions attributed to `short_write` (roughly 7.98 billion of the 39.79 billion total).
Here are the annotations for the hot pieces of code in librustc_data_structures/sip128.rs.
```
            .  macro_rules! compress {
            .      ($state:expr) => ({
  967,427,173          compress!($state.v0, $state.v1, $state.v2, $state.v3)
            .      });
            .      ($v0:expr, $v1:expr, $v2:expr, $v3:expr) =>
            .      ({
            .          $v0 = $v0.wrapping_add($v1); $v1 = $v1.rotate_left(13); $v1 ^= $v0;
            .          $v0 = $v0.rotate_left(32);
            .          $v2 = $v2.wrapping_add($v3); $v3 = $v3.rotate_left(16); $v3 ^= $v2;
            .          $v0 = $v0.wrapping_add($v3); $v3 = $v3.rotate_left(21); $v3 ^= $v0;
            .          $v2 = $v2.wrapping_add($v1); $v1 = $v1.rotate_left(17); $v1 ^= $v2;
   79,051,764          $v2 = $v2.rotate_left(32);
            .      });
            .  }
            .
            .  /// Load an integer of the desired type from a byte stream, in LE order. Uses
            .  /// `copy_nonoverlapping` to let the compiler generate the most efficient way
            .  /// to load it from a possibly unaligned address.
            .  ///
            .  /// Unsafe because: unchecked indexing at i..i+size_of(int_ty)
            .  macro_rules! load_int_le {
            .      ($buf:expr, $i:expr, $int_ty:ident) =>
            .      ({
            .          debug_assert!($i + mem::size_of::<$int_ty>() <= $buf.len());
            .          let mut data = 0 as $int_ty;
   13,166,995          ptr::copy_nonoverlapping($buf.get_unchecked($i),
            .                                   &mut data as *mut _ as *mut u8,
            .                                   mem::size_of::<$int_ty>());
            .          data.to_le()
            .      });
            .  }
```
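To spell out what `load_int_le!` computes: it's an unaligned little-endian load. Here's a safe sketch of the u32 case in present-day Rust, using `from_le_bytes`; it's semantically equivalent, the macro just skips the bounds check via `get_unchecked` and `copy_nonoverlapping`:

```rust
use std::convert::TryInto;

// Safe sketch of load_int_le!(buf, i, u32); the real macro avoids the
// bounds check by using get_unchecked + copy_nonoverlapping.
fn load_u32_le(buf: &[u8], i: usize) -> u32 {
    let bytes: [u8; 4] = buf[i..i + 4].try_into().expect("need 4 bytes");
    u32::from_le_bytes(bytes)
}

fn main() {
    let buf = [0x78, 0x56, 0x34, 0x12, 0xff];
    assert_eq!(load_u32_le(&buf, 0), 0x1234_5678);
}
```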
```
            .  /// Load an u64 using up to 7 bytes of a byte slice.
            .  ///
            .  /// Unsafe because: unchecked indexing at start..start+len
            .  #[inline]
            .  unsafe fn u8to64_le(buf: &[u8], start: usize, len: usize) -> u64 {
            .      debug_assert!(len < 8);
            .      let mut i = 0; // current byte index (from LSB) in the output u64
            .      let mut out = 0;
  345,951,546      if i + 3 < len {
   80,328,075          out = load_int_le!(buf, start + i, u32) as u64;
            .          i += 4;
            .      }
  747,722,073      if i + 1 < len {
  480,400,389          out |= (load_int_le!(buf, start + i, u16) as u64) << (i * 8);
   87,344,805          i += 2
            .      }
  345,951,546      if i < len {
  211,374,738          out |= (*buf.get_unchecked(start + i) as u64) << (i * 8);
            .          i += 1;
            .      }
            .      debug_assert_eq!(i, len);
            .      out
            .  }
```
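For reference, `u8to64_le` is just a little-endian pack of up to 7 bytes into the low end of a u64; the branchy version above tries to do it in at most three loads instead of a byte-at-a-time loop. A naive model, for illustration only:

```rust
// Naive model of u8to64_le: pack `len` bytes (len < 8) little-endian
// into the low bytes of a u64. Same result, but one load per byte
// instead of at most three.
fn u8to64_le_model(buf: &[u8], start: usize, len: usize) -> u64 {
    debug_assert!(len < 8);
    let mut out = 0u64;
    for j in 0..len {
        out |= (buf[start + j] as u64) << (8 * j);
    }
    out
}

fn main() {
    let buf = [0x01, 0x02, 0x03];
    assert_eq!(u8to64_le_model(&buf, 0, 3), 0x0003_0201);
}
```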
and
```
            .      #[inline]
            .      fn short_write(&mut self, msg: &[u8]) {
            .          debug_assert!(msg.len() <= 8);
            .          let length = msg.len();
  212,792,515          self.length += length;
            .
  319,188,774          let needed = 8 - self.ntail;
            .          let fill = cmp::min(length, needed);
  212,792,515          if fill == 8 {
   38,954,670              self.tail = unsafe { load_int_le!(msg, 0, u64) };
            .          } else {
  560,468,202              self.tail |= unsafe { u8to64_le(msg, 0, fill) } << (8 * self.ntail);
  186,822,734              if length < needed {
   55,081,556                  self.ntail += length;
            .                  return;
            .              }
            .          }
   78,855,480          self.state.v3 ^= self.tail;
            .          Sip24Rounds::c_rounds(&mut self.state);
  157,710,960          self.state.v0 ^= self.tail;
            .
            .          // Buffered tail is now flushed, process new input.
  157,710,958          self.ntail = length - needed;
   78,855,480          self.tail = unsafe { u8to64_le(msg, needed, self.ntail) };
  212,792,514      }
```
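To make the control flow easier to follow, here's a simplified standalone model of the buffering `short_write` does (a sketch, not a patch; the `compress` callback stands in for folding the completed word into the SipHash state via `c_rounds`):

```rust
// Simplified model of short_write's buffering: keep up to 8 pending
// bytes in (tail, ntail); top the buffer up on each write, compress the
// word if it filled, then re-buffer the leftover bytes.
struct TailBuf {
    tail: u64,    // up to 8 pending input bytes, packed little-endian
    ntail: usize, // how many of those bytes are valid
}

impl TailBuf {
    fn push(&mut self, msg: &[u8], mut compress: impl FnMut(u64)) {
        debug_assert!(msg.len() <= 8);
        let needed = 8 - self.ntail;
        if msg.len() < needed {
            // Not enough to complete a word: just buffer and return.
            for (j, &b) in msg.iter().enumerate() {
                self.tail |= (b as u64) << (8 * (self.ntail + j));
            }
            self.ntail += msg.len();
            return;
        }
        // Complete the word, compress it, then buffer the leftovers.
        for (j, &b) in msg[..needed].iter().enumerate() {
            self.tail |= (b as u64) << (8 * (self.ntail + j));
        }
        compress(self.tail);
        self.tail = 0;
        self.ntail = msg.len() - needed;
        for (j, &b) in msg[needed..].iter().enumerate() {
            self.tail |= (b as u64) << (8 * j);
        }
    }
}

fn main() {
    let mut buf = TailBuf { tail: 0, ntail: 0 };
    let mut words = Vec::new();
    buf.push(b"abcd", |w| words.push(w));   // buffered, nothing compressed
    buf.push(b"efghij", |w| words.push(w)); // completes one word, 2 bytes left over
    assert_eq!(words.len(), 1);
    assert_eq!(buf.ntail, 2);
}
```

The bookkeeping (updating the length, computing `needed`/`fill`, shifting partial words into the tail) runs on every call even for tiny inputs, which is presumably where a lot of the counts on those lines come from.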
And from libcore/num/mod.rs:
```
            .      #[inline]
            .      pub fn rotate_left(self, n: u32) -> Self {
            .          // Protect against undefined behaviour for over-long bit shifts
            .          let n = n % $BITS;
1,187,178,937          (self << n) | (self >> (($BITS - n) % $BITS))
            .      }
```
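The double `% $BITS` there is the usual way to keep the right shift in range (for n == 0, shifting a u64 right by 64 would overflow). A standalone sketch of the same pattern for u64, which agrees with the built-in `rotate_left`:

```rust
// Sketch of the rotate expansion for u64 specifically: the second
// `% 64` keeps the right shift in range when n == 0.
fn rotl64(x: u64, n: u32) -> u64 {
    let n = n % 64;
    (x << n) | (x >> ((64 - n) % 64))
}

fn main() {
    let x = 0x0123_4567_89ab_cdefu64;
    assert_eq!(rotl64(x, 0), x);
    assert_eq!(rotl64(x, 13), x.rotate_left(13));
}
```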
I stared at this for a while but wasn't able to come up with any notable improvements.
Hashing is the hottest part of most incremental check builds. If we can't speed up this code, perhaps we could use a different hasher, or find a way to hash less data.
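On the "different hasher" option, purely as an illustration of the trait surface involved (not a proposal): anything implementing `std::hash::Hasher` could in principle slot into a hashing layer, though a realistic replacement here would still need to produce a wide result the way SipHasher128's 128-bit output does. A toy FNV-1a, for shape only:

```rust
use std::hash::Hasher;

// Toy FNV-1a, shown only to illustrate the Hasher trait surface a
// replacement would have to cover; not a suggestion that FNV is an
// adequate substitute for a 128-bit SipHash here.
struct Fnv1a(u64);

impl Hasher for Fnv1a {
    fn write(&mut self, bytes: &[u8]) {
        for &b in bytes {
            self.0 ^= u64::from(b);
            self.0 = self.0.wrapping_mul(0x0000_0100_0000_01b3);
        }
    }
    fn finish(&self) -> u64 {
        self.0
    }
}

fn main() {
    let mut h = Fnv1a(0xcbf2_9ce4_8422_2325);
    h.write(b"incremental fingerprint bytes");
    println!("fnv1a = {:#018x}", h.finish());
}
```

The harder question is hash quality: the incremental fingerprints need very low collision rates, which is presumably why a 128-bit SipHash-2-4 is used in the first place.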