Skip to content

Commit 87fec0f

Browse files
hdevalence authored and alexcrichton committed
avx2: add _mm256_shuffle_epi32 reusing _mm_shuffle_epi32 code (#156)
1 parent f08e9c2 commit 87fec0f

File tree

1 file changed

+92
-1
lines changed

1 file changed

+92
-1
lines changed

src/x86/avx2.rs

Lines changed: 92 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1057,7 +1057,98 @@ pub unsafe fn _mm256_shuffle_epi8(a: u8x32, b: u8x32) -> u8x32 {
10571057
pshufb(a, b)
10581058
}
10591059

1060-
// TODO _mm256_shuffle_epi32 (__m256i a, const int imm8)
1060+
/// Shuffle 32-bit integers in 128-bit lanes of `a` using the control in `imm8`.
1061+
///
1062+
/// ```rust
1063+
/// # #![feature(cfg_target_feature)]
1064+
/// # #![feature(target_feature)]
1065+
/// #
1066+
/// # #[macro_use] extern crate stdsimd;
1067+
/// #
1068+
/// # fn main() {
1069+
/// # if cfg_feature_enabled!("avx2") {
1070+
/// # #[target_feature = "+avx2"]
1071+
/// # fn worker() {
1072+
/// use stdsimd::simd::i32x8;
1073+
/// use stdsimd::vendor::_mm256_shuffle_epi32;
1074+
///
1075+
/// let a = i32x8::new(0, 1, 2, 3, 4, 5, 6, 7);
1076+
///
1077+
/// let shuffle1 = 0b00_11_10_01;
1078+
/// let shuffle2 = 0b01_00_10_11;
1079+
///
1080+
/// let c1: i32x8; let c2: i32x8;
1081+
/// unsafe {
1082+
/// c1 = _mm256_shuffle_epi32(a, shuffle1);
1083+
/// c2 = _mm256_shuffle_epi32(a, shuffle2);
1084+
/// }
1085+
///
1086+
/// let expected1 = i32x8::new(1, 2, 3, 0, 5, 6, 7, 4);
1087+
/// let expected2 = i32x8::new(3, 2, 0, 1, 7, 6, 4, 5);
1088+
///
1089+
/// assert_eq!(c1, expected1);
1090+
/// assert_eq!(c2, expected2);
1091+
/// # }
1092+
/// # worker();
1093+
/// # }
1094+
/// # }
1095+
/// ```
1096+
#[inline(always)]
1097+
#[target_feature = "+avx2"]
1098+
#[cfg_attr(test, assert_instr(vpshufd, imm8 = 9))]
1099+
pub unsafe fn _mm256_shuffle_epi32(a: i32x8, imm8: i32) -> i32x8 {
1100+
// simd_shuffleX requires that its selector parameter be made up of
1101+
// constant values, but we can't enforce that here. In spirit, we need
1102+
// to write a `match` on all possible values of a byte, and for each value,
1103+
// hard-code the correct `simd_shuffleX` call using only constants. We
1104+
// then hope for LLVM to do the rest.
1105+
//
1106+
// Of course, that's... awful. So we try to use macros to do it for us.
1107+
let imm8 = (imm8 & 0xFF) as u8;
1108+
1109+
macro_rules! shuffle_done {
1110+
($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
1111+
simd_shuffle8(a, a, [$x01, $x23, $x45, $x67, 4+$x01, 4+$x23, 4+$x45, 4+$x67])
1112+
}
1113+
}
1114+
macro_rules! shuffle_x67 {
1115+
($x01:expr, $x23:expr, $x45:expr) => {
1116+
match (imm8 >> 6) & 0b11 {
1117+
0b00 => shuffle_done!($x01, $x23, $x45, 0),
1118+
0b01 => shuffle_done!($x01, $x23, $x45, 1),
1119+
0b10 => shuffle_done!($x01, $x23, $x45, 2),
1120+
_ => shuffle_done!($x01, $x23, $x45, 3),
1121+
}
1122+
}
1123+
}
1124+
macro_rules! shuffle_x45 {
1125+
($x01:expr, $x23:expr) => {
1126+
match (imm8 >> 4) & 0b11 {
1127+
0b00 => shuffle_x67!($x01, $x23, 0),
1128+
0b01 => shuffle_x67!($x01, $x23, 1),
1129+
0b10 => shuffle_x67!($x01, $x23, 2),
1130+
_ => shuffle_x67!($x01, $x23, 3),
1131+
}
1132+
}
1133+
}
1134+
macro_rules! shuffle_x23 {
1135+
($x01:expr) => {
1136+
match (imm8 >> 2) & 0b11 {
1137+
0b00 => shuffle_x45!($x01, 0),
1138+
0b01 => shuffle_x45!($x01, 1),
1139+
0b10 => shuffle_x45!($x01, 2),
1140+
_ => shuffle_x45!($x01, 3),
1141+
}
1142+
}
1143+
}
1144+
match imm8 & 0b11 {
1145+
0b00 => shuffle_x23!(0),
1146+
0b01 => shuffle_x23!(1),
1147+
0b10 => shuffle_x23!(2),
1148+
_ => shuffle_x23!(3),
1149+
}
1150+
}
1151+
10611152
// TODO _mm256_shufflehi_epi16 (__m256i a, const int imm8)
10621153
// TODO _mm256_shufflelo_epi16 (__m256i a, const int imm8)
10631154

0 commit comments

Comments
 (0)