Skip to content

Commit 0ff6c12

Browse files
committed
auto merge of #11280 : c-a/rust/inline_byteswap, r=brson
After writing some benchmarks for ebml::reader::vuint_at() I noticed that LLVM doesn't seem to inline the from_be32 function, even though it only does a call to the bswap32 intrinsic in the x86_64 case. Marking the functions with #[inline(always)] fixes that and seems to me a reasonable thing to do.

I got the following measurements in my vuint_at() benchmarks:

- Before
test ebml::bench::vuint_at_A_aligned ... bench: 1075 ns/iter (+/- 58)
test ebml::bench::vuint_at_A_unaligned ... bench: 1073 ns/iter (+/- 5)
test ebml::bench::vuint_at_D_aligned ... bench: 1150 ns/iter (+/- 5)
test ebml::bench::vuint_at_D_unaligned ... bench: 1151 ns/iter (+/- 6)

- Inline from_be32
test ebml::bench::vuint_at_A_aligned ... bench: 769 ns/iter (+/- 9)
test ebml::bench::vuint_at_A_unaligned ... bench: 795 ns/iter (+/- 6)
test ebml::bench::vuint_at_D_aligned ... bench: 758 ns/iter (+/- 8)
test ebml::bench::vuint_at_D_unaligned ... bench: 759 ns/iter (+/- 8)

- Using vuint_at_slow()
test ebml::bench::vuint_at_A_aligned ... bench: 646 ns/iter (+/- 7)
test ebml::bench::vuint_at_A_unaligned ... bench: 645 ns/iter (+/- 3)
test ebml::bench::vuint_at_D_aligned ... bench: 907 ns/iter (+/- 4)
test ebml::bench::vuint_at_D_unaligned ... bench: 1085 ns/iter (+/- 16)

As expected, inlining from_be32() gave a considerable speedup. I also checked how the "slow" version fared against the optimized version and noticed that it's actually a bit faster for small A class integers (using only two bytes) but slower for big D class integers (using four bytes).
2 parents 8bfd2a8 + a82f32b commit 0ff6c12

File tree

2 files changed

+111
-27
lines changed

2 files changed

+111
-27
lines changed

src/libextra/ebml.rs

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -960,3 +960,87 @@ mod tests {
960960
test_v(Some(3));
961961
}
962962
}
963+
964+
#[cfg(test)]
965+
mod bench {
966+
use ebml::reader;
967+
use test::BenchHarness;
968+
969+
/// Benchmarks `reader::vuint_at` on a 400-byte buffer, decoding at offsets
/// 0, 4, 8, ... (100 calls per benchmark iteration).
///
/// The buffer holds `0x80` at every even index and the low byte of the index
/// at every odd index, so each decode starts on a `0x80` byte.
/// NOTE(review): "aligned" refers to the offsets being multiples of 4; the
/// alignment of the buffer itself is not established here -- confirm.
#[bench]
970+
pub fn vuint_at_A_aligned(bh: &mut BenchHarness) {
971+
use std::vec;
972+
let data = vec::from_fn(4*100, |i| {
973+
match (i % 2) {
974+
0 => 0x80u8,
975+
_ => i as u8,
976+
}
977+
});
978+
// Accumulates the decoded values; never read again in this function
// (presumably it only exists to give the decode results a consumer).
let mut sum = 0u;
979+
bh.iter(|| {
980+
let mut i = 0;
981+
while (i < data.len()) {
982+
sum += reader::vuint_at(data, i).val;
983+
// Always advance by 4 bytes, regardless of what was decoded.
i += 4;
984+
}
985+
});
986+
}
987+
988+
/// Benchmarks `reader::vuint_at` on a 401-byte buffer, decoding at offsets
/// 1, 5, 9, ... (100 calls per benchmark iteration).
///
/// The buffer holds `0x80` at every odd index and the low byte of the index
/// at every even index, so each decode starts on a `0x80` byte located one
/// byte past a multiple of 4. The extra byte of length (`4*100+1`) leaves
/// four bytes available from the last start offset, 397.
#[bench]
989+
pub fn vuint_at_A_unaligned(bh: &mut BenchHarness) {
990+
use std::vec;
991+
let data = vec::from_fn(4*100+1, |i| {
992+
match (i % 2) {
993+
1 => 0x80u8,
994+
_ => i as u8
995+
}
996+
});
997+
// Accumulates the decoded values; never read again in this function
// (presumably it only exists to give the decode results a consumer).
let mut sum = 0u;
998+
bh.iter(|| {
999+
// Start at offset 1 so every decode begins off a 4-byte boundary
// relative to the start of the buffer.
let mut i = 1;
1000+
while (i < data.len()) {
1001+
sum += reader::vuint_at(data, i).val;
1002+
i += 4;
1003+
}
1004+
});
1005+
}
1006+
1007+
/// Benchmarks `reader::vuint_at` on a 400-byte buffer, decoding at offsets
/// 0, 4, 8, ... (100 calls per benchmark iteration).
///
/// The buffer is a repetition of the 4-byte group `0x10, 0, 0, <low byte of
/// index>`, so each decode starts on a `0x10` byte.
/// NOTE(review): the commit description calls these "D class" integers using
/// four bytes; the encoding itself is defined by `vuint_at`, not shown here.
#[bench]
1008+
pub fn vuint_at_D_aligned(bh: &mut BenchHarness) {
1009+
use std::vec;
1010+
let data = vec::from_fn(4*100, |i| {
1011+
match (i % 4) {
1012+
0 => 0x10u8,
1013+
3 => i as u8,
1014+
_ => 0u8
1015+
}
1016+
});
1017+
// Accumulates the decoded values; never read again in this function
// (presumably it only exists to give the decode results a consumer).
let mut sum = 0u;
1018+
bh.iter(|| {
1019+
let mut i = 0;
1020+
while (i < data.len()) {
1021+
sum += reader::vuint_at(data, i).val;
1022+
// Step over one whole 4-byte group per decode.
i += 4;
1023+
}
1024+
});
1025+
}
1026+
1027+
/// Benchmarks `reader::vuint_at` on a 401-byte buffer, decoding at offsets
/// 1, 5, 9, ... (100 calls per benchmark iteration).
///
/// Index 0 holds a filler byte; from index 1 on, the buffer repeats the
/// 4-byte group `0x10, 0, 0, <low byte of index>`, so each decode starts on a
/// `0x10` byte located one byte past a multiple of 4. The extra byte of
/// length (`4*100+1`) keeps the last group (offsets 397..=400) in bounds.
#[bench]
1028+
pub fn vuint_at_D_unaligned(bh: &mut BenchHarness) {
1029+
use std::vec;
1030+
let data = vec::from_fn(4*100+1, |i| {
1031+
match (i % 4) {
1032+
1 => 0x10u8,
1033+
0 => i as u8,
1034+
_ => 0u8
1035+
}
1036+
});
1037+
// Accumulates the decoded values; never read again in this function
// (presumably it only exists to give the decode results a consumer).
let mut sum = 0u;
1038+
bh.iter(|| {
1039+
// Start at offset 1 so every decode begins off a 4-byte boundary
// relative to the start of the buffer.
let mut i = 1;
1040+
while (i < data.len()) {
1041+
sum += reader::vuint_at(data, i).val;
1042+
i += 4;
1043+
}
1044+
});
1045+
}
1046+
}

src/libstd/unstable/intrinsics.rs

Lines changed: 27 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -486,33 +486,33 @@ extern "rust-intrinsic" {
486486
pub fn u64_mul_with_overflow(x: u64, y: u64) -> (u64, bool);
487487
}
488488

489-
// Removed side of the diff: the endianness conversion helpers as they were
// before this commit, without any inlining attribute. Each helper has two
// `cfg(target_endian)` variants -- one returns `x` unchanged, the other calls
// the matching `bswap16`/`bswap32`/`bswap64` intrinsic.
#[cfg(target_endian = "little")] pub fn to_le16(x: i16) -> i16 { x }
490-
#[cfg(target_endian = "big")] pub fn to_le16(x: i16) -> i16 { unsafe { bswap16(x) } }
491-
#[cfg(target_endian = "little")] pub fn to_le32(x: i32) -> i32 { x }
492-
#[cfg(target_endian = "big")] pub fn to_le32(x: i32) -> i32 { unsafe { bswap32(x) } }
493-
#[cfg(target_endian = "little")] pub fn to_le64(x: i64) -> i64 { x }
494-
#[cfg(target_endian = "big")] pub fn to_le64(x: i64) -> i64 { unsafe { bswap64(x) } }
495-
496-
#[cfg(target_endian = "little")] pub fn to_be16(x: i16) -> i16 { unsafe { bswap16(x) } }
497-
#[cfg(target_endian = "big")] pub fn to_be16(x: i16) -> i16 { x }
498-
#[cfg(target_endian = "little")] pub fn to_be32(x: i32) -> i32 { unsafe { bswap32(x) } }
499-
#[cfg(target_endian = "big")] pub fn to_be32(x: i32) -> i32 { x }
500-
#[cfg(target_endian = "little")] pub fn to_be64(x: i64) -> i64 { unsafe { bswap64(x) } }
501-
#[cfg(target_endian = "big")] pub fn to_be64(x: i64) -> i64 { x }
502-
503-
#[cfg(target_endian = "little")] pub fn from_le16(x: i16) -> i16 { x }
504-
#[cfg(target_endian = "big")] pub fn from_le16(x: i16) -> i16 { unsafe { bswap16(x) } }
505-
#[cfg(target_endian = "little")] pub fn from_le32(x: i32) -> i32 { x }
506-
#[cfg(target_endian = "big")] pub fn from_le32(x: i32) -> i32 { unsafe { bswap32(x) } }
507-
#[cfg(target_endian = "little")] pub fn from_le64(x: i64) -> i64 { x }
508-
#[cfg(target_endian = "big")] pub fn from_le64(x: i64) -> i64 { unsafe { bswap64(x) } }
509-
510-
#[cfg(target_endian = "little")] pub fn from_be16(x: i16) -> i16 { unsafe { bswap16(x) } }
511-
#[cfg(target_endian = "big")] pub fn from_be16(x: i16) -> i16 { x }
512-
#[cfg(target_endian = "little")] pub fn from_be32(x: i32) -> i32 { unsafe { bswap32(x) } }
513-
#[cfg(target_endian = "big")] pub fn from_be32(x: i32) -> i32 { x }
514-
#[cfg(target_endian = "little")] pub fn from_be64(x: i64) -> i64 { unsafe { bswap64(x) } }
515-
#[cfg(target_endian = "big")] pub fn from_be64(x: i64) -> i64 { x }
489+
// Endianness conversion helpers for 16-, 32- and 64-bit signed integers.
// Every helper is defined twice, selected by `cfg(target_endian)`: when the
// target's byte order already matches the named one the value is returned
// unchanged, otherwise the matching `bswap16`/`bswap32`/`bswap64` intrinsic
// swaps the bytes. All variants carry `#[inline]` so that callers can have
// these one-expression bodies inlined.

// to_le*: identity on little-endian targets, byte swap on big-endian targets.
#[cfg(target_endian = "little")] #[inline] pub fn to_le16(x: i16) -> i16 { x }
490+
#[cfg(target_endian = "big")] #[inline] pub fn to_le16(x: i16) -> i16 { unsafe { bswap16(x) } }
491+
#[cfg(target_endian = "little")] #[inline] pub fn to_le32(x: i32) -> i32 { x }
492+
#[cfg(target_endian = "big")] #[inline] pub fn to_le32(x: i32) -> i32 { unsafe { bswap32(x) } }
493+
#[cfg(target_endian = "little")] #[inline] pub fn to_le64(x: i64) -> i64 { x }
494+
#[cfg(target_endian = "big")] #[inline] pub fn to_le64(x: i64) -> i64 { unsafe { bswap64(x) } }
495+
496+
// to_be*: byte swap on little-endian targets, identity on big-endian targets.
#[cfg(target_endian = "little")] #[inline] pub fn to_be16(x: i16) -> i16 { unsafe { bswap16(x) } }
497+
#[cfg(target_endian = "big")] #[inline] pub fn to_be16(x: i16) -> i16 { x }
498+
#[cfg(target_endian = "little")] #[inline] pub fn to_be32(x: i32) -> i32 { unsafe { bswap32(x) } }
499+
#[cfg(target_endian = "big")] #[inline] pub fn to_be32(x: i32) -> i32 { x }
500+
#[cfg(target_endian = "little")] #[inline] pub fn to_be64(x: i64) -> i64 { unsafe { bswap64(x) } }
501+
#[cfg(target_endian = "big")] #[inline] pub fn to_be64(x: i64) -> i64 { x }
502+
503+
// from_le*: identity on little-endian targets, byte swap on big-endian targets.
#[cfg(target_endian = "little")] #[inline] pub fn from_le16(x: i16) -> i16 { x }
504+
#[cfg(target_endian = "big")] #[inline] pub fn from_le16(x: i16) -> i16 { unsafe { bswap16(x) } }
505+
#[cfg(target_endian = "little")] #[inline] pub fn from_le32(x: i32) -> i32 { x }
506+
#[cfg(target_endian = "big")] #[inline] pub fn from_le32(x: i32) -> i32 { unsafe { bswap32(x) } }
507+
#[cfg(target_endian = "little")] #[inline] pub fn from_le64(x: i64) -> i64 { x }
508+
#[cfg(target_endian = "big")] #[inline] pub fn from_le64(x: i64) -> i64 { unsafe { bswap64(x) } }
509+
510+
// from_be*: byte swap on little-endian targets, identity on big-endian targets.
#[cfg(target_endian = "little")] #[inline] pub fn from_be16(x: i16) -> i16 { unsafe { bswap16(x) } }
511+
#[cfg(target_endian = "big")] #[inline] pub fn from_be16(x: i16) -> i16 { x }
512+
#[cfg(target_endian = "little")] #[inline] pub fn from_be32(x: i32) -> i32 { unsafe { bswap32(x) } }
513+
#[cfg(target_endian = "big")] #[inline] pub fn from_be32(x: i32) -> i32 { x }
514+
#[cfg(target_endian = "little")] #[inline] pub fn from_be64(x: i64) -> i64 { unsafe { bswap64(x) } }
515+
#[cfg(target_endian = "big")] #[inline] pub fn from_be64(x: i64) -> i64 { x }
516516

517517
/// `TypeId` represents a globally unique identifier for a type
518518
#[lang="type_id"] // This needs to be kept in lockstep with the code in trans/intrinsic.rs and

0 commit comments

Comments
 (0)