Here's Cachegrind data for clap-rs from rustc-benchmarks on a "base incremental" "check" build (i.e. the first incremental build):
```
--------------------------------------------------------------------------------
            Ir
--------------------------------------------------------------------------------
39,792,587,805  PROGRAM TOTALS
 6,008,282,218  librustc_data_structures/sip128.rs:rustc_data_structures::sip128::SipHasher128::short_write
 1,655,965,059  libcore/num/mod.rs:rustc_data_structures::sip128::SipHasher128::short_write
   319,188,771  libcore/cmp.rs:rustc_data_structures::sip128::SipHasher128::short_write
```
That's over 20% of all instructions attributed to `short_write` (roughly 7.98 billion of the 39.79 billion total).
Here are the annotations for the hot pieces of code in librustc_data_structures/sip128.rs.
```
            .  macro_rules! compress {
            .      ($state:expr) => ({
  967,427,173          compress!($state.v0, $state.v1, $state.v2, $state.v3)
            .      });
            .      ($v0:expr, $v1:expr, $v2:expr, $v3:expr) =>
            .      ({
            .          $v0 = $v0.wrapping_add($v1); $v1 = $v1.rotate_left(13); $v1 ^= $v0;
            .          $v0 = $v0.rotate_left(32);
            .          $v2 = $v2.wrapping_add($v3); $v3 = $v3.rotate_left(16); $v3 ^= $v2;
            .          $v0 = $v0.wrapping_add($v3); $v3 = $v3.rotate_left(21); $v3 ^= $v0;
            .          $v2 = $v2.wrapping_add($v1); $v1 = $v1.rotate_left(17); $v1 ^= $v2;
   79,051,764          $v2 = $v2.rotate_left(32);
            .      });
            .  }
            .
            .  /// Load an integer of the desired type from a byte stream, in LE order. Uses
            .  /// `copy_nonoverlapping` to let the compiler generate the most efficient way
            .  /// to load it from a possibly unaligned address.
            .  ///
            .  /// Unsafe because: unchecked indexing at i..i+size_of(int_ty)
            .  macro_rules! load_int_le {
            .      ($buf:expr, $i:expr, $int_ty:ident) =>
            .      ({
            .          debug_assert!($i + mem::size_of::<$int_ty>() <= $buf.len());
            .          let mut data = 0 as $int_ty;
   13,166,995          ptr::copy_nonoverlapping($buf.get_unchecked($i),
            .                                   &mut data as *mut _ as *mut u8,
            .                                   mem::size_of::<$int_ty>());
            .          data.to_le()
            .      });
            .  }
```
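To spell out what `load_int_le!` computes: it's an unaligned little-endian load. Here's a safe sketch of the u32 case in present-day Rust, using `from_le_bytes`; it's semantically equivalent, the macro just skips the bounds check via `get_unchecked` and `copy_nonoverlapping`:

```rust
use std::convert::TryInto;

// Safe sketch of load_int_le!(buf, i, u32); the real macro avoids the
// bounds check by using get_unchecked + copy_nonoverlapping.
fn load_u32_le(buf: &[u8], i: usize) -> u32 {
    let bytes: [u8; 4] = buf[i..i + 4].try_into().expect("need 4 bytes");
    u32::from_le_bytes(bytes)
}

fn main() {
    let buf = [0x78, 0x56, 0x34, 0x12, 0xff];
    assert_eq!(load_u32_le(&buf, 0), 0x1234_5678);
}
```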
```
            .  /// Load an u64 using up to 7 bytes of a byte slice.
            .  ///
            .  /// Unsafe because: unchecked indexing at start..start+len
            .  #[inline]
            .  unsafe fn u8to64_le(buf: &[u8], start: usize, len: usize) -> u64 {
            .      debug_assert!(len < 8);
            .      let mut i = 0; // current byte index (from LSB) in the output u64
            .      let mut out = 0;
  345,951,546      if i + 3 < len {
   80,328,075          out = load_int_le!(buf, start + i, u32) as u64;
            .          i += 4;
            .      }
  747,722,073      if i + 1 < len {
  480,400,389          out |= (load_int_le!(buf, start + i, u16) as u64) << (i * 8);
   87,344,805          i += 2
            .      }
  345,951,546      if i < len {
  211,374,738          out |= (*buf.get_unchecked(start + i) as u64) << (i * 8);
            .          i += 1;
            .      }
            .      debug_assert_eq!(i, len);
            .      out
            .  }
```
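For reference, `u8to64_le` is just a little-endian pack of up to 7 bytes into the low end of a u64; the branchy version above tries to do it in at most three loads instead of a byte-at-a-time loop. A naive model, for illustration only:

```rust
// Naive model of u8to64_le: pack `len` bytes (len < 8) little-endian
// into the low bytes of a u64. Same result, but one load per byte
// instead of at most three.
fn u8to64_le_model(buf: &[u8], start: usize, len: usize) -> u64 {
    debug_assert!(len < 8);
    let mut out = 0u64;
    for j in 0..len {
        out |= (buf[start + j] as u64) << (8 * j);
    }
    out
}

fn main() {
    let buf = [0x01, 0x02, 0x03];
    assert_eq!(u8to64_le_model(&buf, 0, 3), 0x0003_0201);
}
```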
and
```
            .      #[inline]
            .      fn short_write(&mut self, msg: &[u8]) {
            .          debug_assert!(msg.len() <= 8);
            .          let length = msg.len();
  212,792,515          self.length += length;
            .
  319,188,774          let needed = 8 - self.ntail;
            .          let fill = cmp::min(length, needed);
  212,792,515          if fill == 8 {
   38,954,670              self.tail = unsafe { load_int_le!(msg, 0, u64) };
            .          } else {
  560,468,202              self.tail |= unsafe { u8to64_le(msg, 0, fill) } << (8 * self.ntail);
  186,822,734              if length < needed {
   55,081,556                  self.ntail += length;
            .                  return;
            .              }
            .          }
   78,855,480          self.state.v3 ^= self.tail;
            .          Sip24Rounds::c_rounds(&mut self.state);
  157,710,960          self.state.v0 ^= self.tail;
            .
            .          // Buffered tail is now flushed, process new input.
  157,710,958          self.ntail = length - needed;
   78,855,480          self.tail = unsafe { u8to64_le(msg, needed, self.ntail) };
  212,792,514      }
```
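To make the control flow easier to follow, here's a simplified standalone model of the buffering `short_write` does (a sketch, not a patch; the `compress` callback stands in for folding the completed word into the SipHash state via `c_rounds`):

```rust
// Simplified model of short_write's buffering: keep up to 8 pending
// bytes in (tail, ntail); top the buffer up on each write, compress the
// word if it filled, then re-buffer the leftover bytes.
struct TailBuf {
    tail: u64,    // up to 8 pending input bytes, packed little-endian
    ntail: usize, // how many of those bytes are valid
}

impl TailBuf {
    fn push(&mut self, msg: &[u8], mut compress: impl FnMut(u64)) {
        debug_assert!(msg.len() <= 8);
        let needed = 8 - self.ntail;
        if msg.len() < needed {
            // Not enough to complete a word: just buffer and return.
            for (j, &b) in msg.iter().enumerate() {
                self.tail |= (b as u64) << (8 * (self.ntail + j));
            }
            self.ntail += msg.len();
            return;
        }
        // Complete the word, compress it, then buffer the leftovers.
        for (j, &b) in msg[..needed].iter().enumerate() {
            self.tail |= (b as u64) << (8 * (self.ntail + j));
        }
        compress(self.tail);
        self.tail = 0;
        self.ntail = msg.len() - needed;
        for (j, &b) in msg[needed..].iter().enumerate() {
            self.tail |= (b as u64) << (8 * j);
        }
    }
}

fn main() {
    let mut buf = TailBuf { tail: 0, ntail: 0 };
    let mut words = Vec::new();
    buf.push(b"abcd", |w| words.push(w));   // buffered, nothing compressed
    buf.push(b"efghij", |w| words.push(w)); // completes one word, 2 bytes left over
    assert_eq!(words.len(), 1);
    assert_eq!(buf.ntail, 2);
}
```

The bookkeeping (updating the length, computing `needed`/`fill`, shifting partial words into the tail) runs on every call even for tiny inputs, which is presumably where a lot of the counts on those lines come from.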
And from libcore/num/mod.rs:
```
            .      #[inline]
            .      pub fn rotate_left(self, n: u32) -> Self {
            .          // Protect against undefined behaviour for over-long bit shifts
            .          let n = n % $BITS;
1,187,178,937          (self << n) | (self >> (($BITS - n) % $BITS))
            .      }
```
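The double `% $BITS` there is the usual way to keep the right shift in range (for n == 0, shifting a u64 right by 64 would overflow). A standalone sketch of the same pattern for u64, which agrees with the built-in `rotate_left`:

```rust
// Sketch of the rotate expansion for u64 specifically: the second
// `% 64` keeps the right shift in range when n == 0.
fn rotl64(x: u64, n: u32) -> u64 {
    let n = n % 64;
    (x << n) | (x >> ((64 - n) % 64))
}

fn main() {
    let x = 0x0123_4567_89ab_cdefu64;
    assert_eq!(rotl64(x, 0), x);
    assert_eq!(rotl64(x, 13), x.rotate_left(13));
}
```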
I stared at this for a while but wasn't able to come up with any notable improvements.
Hashing is the hottest part of most incremental check builds. If we can't speed up this code, perhaps we could use a different hasher, or find a way to hash less data.
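On the "different hasher" option, purely as an illustration of the trait surface involved (not a proposal): anything implementing `std::hash::Hasher` could in principle slot into a hashing layer, though a realistic replacement here would still need to produce a wide result the way SipHasher128's 128-bit output does. A toy FNV-1a, for shape only:

```rust
use std::hash::Hasher;

// Toy FNV-1a, shown only to illustrate the Hasher trait surface a
// replacement would have to cover; not a suggestion that FNV is an
// adequate substitute for a 128-bit SipHash here.
struct Fnv1a(u64);

impl Hasher for Fnv1a {
    fn write(&mut self, bytes: &[u8]) {
        for &b in bytes {
            self.0 ^= u64::from(b);
            self.0 = self.0.wrapping_mul(0x0000_0100_0000_01b3);
        }
    }
    fn finish(&self) -> u64 {
        self.0
    }
}

fn main() {
    let mut h = Fnv1a(0xcbf2_9ce4_8422_2325);
    h.write(b"incremental fingerprint bytes");
    println!("fnv1a = {:#018x}", h.finish());
}
```

The harder question is hash quality: the incremental fingerprints need very low collision rates, which is presumably why a 128-bit SipHash-2-4 is used in the first place.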