SipHasher takes up lots of time in incremental builds #51054

Closed
Description

@nnethercote

Here's Cachegrind data for clap-rs from rustc-benchmarks on a "base incremental" "check" build (i.e. the first incremental build):

--------------------------------------------------------------------------------
            Ir
--------------------------------------------------------------------------------
39,792,587,805  PROGRAM TOTALS

6,008,282,218  librustc_data_structures/sip128.rs:rustc_data_structures::sip128::SipHasher128::short_write
1,655,965,059  libcore/num/mod.rs:rustc_data_structures::sip128::SipHasher128::short_write
  319,188,771  libcore/cmp.rs:rustc_data_structures::sip128::SipHasher128::short_write

Those three lines sum to roughly 8.0 billion instructions, just over 20% of the total, all under short_write.

Here are the annotations for the hot pieces of code in librustc_data_structures/sip128.rs.

          .  macro_rules! compress {
          .      ($state:expr) => ({
967,427,173          compress!($state.v0, $state.v1, $state.v2, $state.v3)
          .      });
          .      ($v0:expr, $v1:expr, $v2:expr, $v3:expr) =>
          .      ({
          .          $v0 = $v0.wrapping_add($v1); $v1 = $v1.rotate_left(13); $v1 ^= $v0;
          .          $v0 = $v0.rotate_left(32);
          .          $v2 = $v2.wrapping_add($v3); $v3 = $v3.rotate_left(16); $v3 ^= $v2;
          .          $v0 = $v0.wrapping_add($v3); $v3 = $v3.rotate_left(21); $v3 ^= $v0;
          .          $v2 = $v2.wrapping_add($v1); $v1 = $v1.rotate_left(17); $v1 ^= $v2;
 79,051,764          $v2 = $v2.rotate_left(32);
          .      });
          .  }
          .
          .  /// Load an integer of the desired type from a byte stream, in LE order. Uses
          .  /// `copy_nonoverlapping` to let the compiler generate the most efficient way
          .  /// to load it from a possibly unaligned address.
          .  ///
          .  /// Unsafe because: unchecked indexing at i..i+size_of(int_ty)
          .  macro_rules! load_int_le {
          .      ($buf:expr, $i:expr, $int_ty:ident) =>
          .      ({
          .         debug_assert!($i + mem::size_of::<$int_ty>() <= $buf.len());
          .         let mut data = 0 as $int_ty;
 13,166,995         ptr::copy_nonoverlapping($buf.get_unchecked($i),
          .                                  &mut data as *mut _ as *mut u8,
          .                                  mem::size_of::<$int_ty>());
          .         data.to_le()
          .      });
          .  }
          .
          .  /// Load an u64 using up to 7 bytes of a byte slice.
          .  ///
          .  /// Unsafe because: unchecked indexing at start..start+len
          .  #[inline]
          .  unsafe fn u8to64_le(buf: &[u8], start: usize, len: usize) -> u64 {
          .      debug_assert!(len < 8);
          .      let mut i = 0; // current byte index (from LSB) in the output u64
          .      let mut out = 0;
345,951,546      if i + 3 < len {
 80,328,075          out = load_int_le!(buf, start + i, u32) as u64;
          .          i += 4;
          .      }
747,722,073      if i + 1 < len {
480,400,389          out |= (load_int_le!(buf, start + i, u16) as u64) << (i * 8);
 87,344,805          i += 2
          .      }
345,951,546      if i < len {
211,374,738          out |= (*buf.get_unchecked(start + i) as u64) << (i * 8);
          .          i += 1;
          .      }
          .      debug_assert_eq!(i, len);
          .      out
          .  }
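As a reference point, here is a safe sketch of what u8to64_le computes, written with from_le_bytes (available on newer Rust). This is purely illustrative and keeps the bounds checks that the real code avoids via get_unchecked, so it isn't a drop-in performance fix:

/// Illustrative safe equivalent of `u8to64_le`: pack up to 7 trailing bytes
/// of a message into the low bytes of a u64, little-endian.
fn u8to64_le_safe(buf: &[u8], start: usize, len: usize) -> u64 {
    debug_assert!(len < 8);
    let mut bytes = [0u8; 8];
    // Copy the `len` input bytes into the low end of an 8-byte buffer...
    bytes[..len].copy_from_slice(&buf[start..start + len]);
    // ...and interpret the whole buffer as a little-endian u64.
    u64::from_le_bytes(bytes)
}

fn main() {
    let buf = [0x01, 0x02, 0x03, 0x04, 0x05];
    // Bytes land from the least-significant end: 0x0000000504030201.
    assert_eq!(u8to64_le_safe(&buf, 0, 5), 0x0000_0005_0403_0201);
}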

and

          .      #[inline]
          .      fn short_write(&mut self, msg: &[u8]) {
          .          debug_assert!(msg.len() <= 8);
          .          let length = msg.len();
212,792,515          self.length += length;
          .
319,188,774          let needed = 8 - self.ntail;
          .          let fill = cmp::min(length, needed);
212,792,515          if fill == 8 {
 38,954,670              self.tail = unsafe { load_int_le!(msg, 0, u64) };
          .          } else {
560,468,202              self.tail |= unsafe { u8to64_le(msg, 0, fill) } << (8 * self.ntail);
186,822,734              if length < needed {
 55,081,556                  self.ntail += length;
          .                  return;
          .              }
          .          }
 78,855,480          self.state.v3 ^= self.tail;
          .          Sip24Rounds::c_rounds(&mut self.state);
157,710,960          self.state.v0 ^= self.tail;
          .
          .          // Buffered tail is now flushed, process new input.
157,710,958          self.ntail = length - needed;
 78,855,480          self.tail = unsafe { u8to64_le(msg, needed, self.ntail) };
212,792,514      }
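To illustrate why short_write is called so often: hashing for incremental compilation walks data structures and emits one small fixed-size write per primitive field, so the per-call overhead above is paid constantly. Here's a toy sketch using std's Hasher and derive(Hash) rather than rustc's StableHasher/HashStable machinery (the Span-like struct is made up for illustration), but the access pattern is analogous:

use std::hash::{Hash, Hasher};

/// Toy hasher that only counts write calls and bytes.
/// (Illustration only; rustc's StableHasher wraps SipHasher128, not this.)
#[derive(Default)]
struct CountingHasher {
    writes: usize,
    bytes: usize,
}

impl Hasher for CountingHasher {
    fn finish(&self) -> u64 {
        0
    }
    fn write(&mut self, msg: &[u8]) {
        self.writes += 1;
        self.bytes += msg.len();
    }
}

// Hypothetical struct standing in for the kinds of small values that get hashed.
#[derive(Hash)]
struct SpanLike {
    lo: u32,
    hi: u32,
    ctxt: u32,
}

fn main() {
    let mut h = CountingHasher::default();
    SpanLike { lo: 1, hi: 2, ctxt: 3 }.hash(&mut h);
    // With the default write_u32, each u32 field arrives as its own 4-byte
    // write: three short writes, not one 12-byte write.
    assert_eq!((h.writes, h.bytes), (3, 12));
}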

And from libcore/num/mod.rs:

            .          #[inline]
            .          pub fn rotate_left(self, n: u32) -> Self {
            .              // Protect against undefined behaviour for over-long bit shifts
            .              let n = n % $BITS;
1,187,178,937              (self << n) | (self >> (($BITS - n) % $BITS))
            .          }
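As a sanity check on the % $BITS trick (without it, n == 0 would produce a right shift by the full bit width, which overflows), here's a small sketch verifying that the two-shift formulation matches u64::rotate_left for every in-range amount. Whether the backend turns this pattern into a single rotate instruction is a separate question that I haven't verified here:

/// The shift-based rotation used above, spelled out for u64. The `% 64` on
/// the right shift avoids a shift by 64 when n == 0.
fn rotate_left_manual(x: u64, n: u32) -> u64 {
    let n = n % 64;
    (x << n) | (x >> ((64 - n) % 64))
}

fn main() {
    let x = 0x0123_4567_89ab_cdefu64;
    // Check the identity against the standard library's rotate_left.
    for n in 0..64 {
        assert_eq!(rotate_left_manual(x, n), x.rotate_left(n));
    }
}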

I stared at this for a while but wasn't able to come up with any notable improvements.

Hashing is the hottest part of most incremental check builds. If we can't speed up this code, perhaps we could use a different hasher, or find a way to hash less data (a rough comparison sketch follows).
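On the "different hasher" idea: anything we swap in still has to sit behind the same Hasher-style write interface, so trying alternatives is mechanically easy; the constraint is that incremental fingerprints need a wide (128-bit here), deterministic hash. Purely as a sketch of how one might compare candidates (FNV-1a is just a placeholder for "some cheaper hasher", not a proposal, and timings from an unoptimized build would be meaningless):

use std::hash::Hasher;
use std::time::Instant;

/// Minimal FNV-1a (64-bit), used only as a stand-in for a cheaper hasher;
/// it lacks the 128-bit output that rustc's fingerprints rely on.
struct Fnv1a(u64);

impl Hasher for Fnv1a {
    fn finish(&self) -> u64 {
        self.0
    }
    fn write(&mut self, msg: &[u8]) {
        for &b in msg {
            self.0 ^= b as u64;
            self.0 = self.0.wrapping_mul(0x100_0000_01b3);
        }
    }
}

fn main() {
    // Mimic the access pattern seen above: many small 4-byte writes.
    let words: Vec<u32> = (0..1_000_000).collect();

    // std's DefaultHasher (a SipHash variant, exact algorithm unspecified).
    let mut sip = std::collections::hash_map::DefaultHasher::new();
    let t = Instant::now();
    for &w in &words {
        sip.write_u32(w);
    }
    println!("DefaultHasher: {:?} ({})", t.elapsed(), sip.finish());

    let mut fnv = Fnv1a(0xcbf2_9ce4_8422_2325);
    let t = Instant::now();
    for &w in &words {
        fnv.write_u32(w);
    }
    println!("FNV-1a:        {:?} ({})", t.elapsed(), fnv.finish());
}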

CC @michaelwoerister

    Labels

    A-incr-comp (Area: Incremental compilation)
    C-enhancement (Category: An issue proposing an enhancement or a PR with one)
    I-compiletime (Issue: Problems and improvements with respect to compile times)
    T-compiler (Relevant to the compiler team, which will review and decide on the PR/issue)
    WG-compiler-performance (Working group: Compiler Performance)
