
Commit 5c57be9

Tony Sifkarovski authored, gnzlbg committed

[avx2] add shuffle, insert/extract i128, permute* (rust-lang#210)

* [x86][avx2] add _mm256_shuffle{hi,lo}_epi16
* [x86][avx2] add _mm256_{insert,extract}i128_si256
* [x86][avx2] add remaining permute intrinsics
1 parent 259d479 commit 5c57be9

1 file changed: +295 -9 lines changed

coresimd/src/x86/i586/avx2.rs (295 additions & 9 deletions)
@@ -24,7 +24,7 @@ use simd_llvm::{simd_shuffle16, simd_shuffle32};
 
 use v256::*;
 use v128::*;
-use x86::__m256i;
+use x86::{__m128i, __m256i};
 
 #[cfg(test)]
 use stdsimd_test::assert_instr;
@@ -643,7 +643,20 @@ pub unsafe fn _mm256_cvtepu8_epi64(a: u8x16) -> i64x4 {
     simd_cast::<::v32::u8x4, _>(simd_shuffle4(a, a, [0, 1, 2, 3]))
 }
 
-// TODO _m128i _mm256_extracti128_si256
+/// Extract 128 bits (of integer data) from `a` selected with `imm8`.
+#[inline(always)]
+#[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vextractf128, imm8 = 1))]
+pub unsafe fn _mm256_extracti128_si256(a: __m256i, imm8: i32) -> __m128i {
+    use x86::i586::avx::_mm256_undefined_si256;
+    let imm8 = (imm8 & 0xFF) as u8;
+    let b = i64x4::from(_mm256_undefined_si256());
+    let dst: i64x2 = match imm8 & 0b01 {
+        0 => simd_shuffle2(i64x4::from(a), b, [0, 1]),
+        _ => simd_shuffle2(i64x4::from(a), b, [2, 3]),
+    };
+    __m128i::from(dst)
+}
 
 /// Horizontally add adjacent pairs of 16-bit integers in `a` and `b`.
 #[inline(always)]
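Note (an aside, not part of the diff): only bit 0 of `imm8` matters here — it selects which 128-bit half of `a` is returned, and the `simd_shuffle2` indices 0-1/2-3 name the low/high pair of `i64` lanes of `a` (the undefined second operand is never selected). A minimal scalar sketch of the same selection; `extracti128` is an illustrative name, not a crate API:

// Scalar model of _mm256_extracti128_si256: a 256-bit vector is four i64
// lanes; bit 0 of imm8 picks which 128-bit (two-lane) half is kept.
fn extracti128(a: [i64; 4], imm8: i32) -> [i64; 2] {
    if imm8 & 0b01 == 0 {
        [a[0], a[1]] // low half
    } else {
        [a[2], a[3]] // high half
    }
}

fn main() {
    assert_eq!(extracti128([1, 2, 3, 4], 0b00), [1, 2]);
    assert_eq!(extracti128([1, 2, 3, 4], 0b01), [3, 4]); // matches the test below
}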
@@ -1191,7 +1204,23 @@ pub unsafe fn _mm256_mask_i64gather_pd(
     constify_imm8!(scale, call)
 }
 
-// TODO _mm256_inserti128_si256
+/// Copy `a` to `dst`, then insert 128 bits (of integer data) from `b` at the
+/// location specified by `imm8`.
+#[inline(always)]
+#[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vinsertf128, imm8 = 1))]
+pub unsafe fn _mm256_inserti128_si256(
+    a: __m256i, b: __m128i, imm8: i32
+) -> __m256i {
+    use x86::i586::avx::_mm256_castsi128_si256;
+    let imm8 = (imm8 & 0b01) as u8;
+    let b = i64x4::from(_mm256_castsi128_si256(b));
+    let dst: i64x4 = match imm8 & 0b01 {
+        0 => simd_shuffle4(i64x4::from(a), b, [4, 5, 2, 3]),
+        _ => simd_shuffle4(i64x4::from(a), b, [0, 1, 4, 5]),
+    };
+    __m256i::from(dst)
+}
 
 /// Multiply packed signed 16-bit integers in `a` and `b`, producing
 /// intermediate signed 32-bit integers. Horizontally add adjacent pairs
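Note (an aside, not part of the diff): in `simd_shuffle4`, indices 0-3 address lanes of the first operand and 4-7 address lanes of the second, so `[4, 5, 2, 3]` replaces the low half of `a` with `b` and `[0, 1, 4, 5]` replaces the high half. The same semantics as a scalar sketch; `inserti128` is an illustrative name, not a crate API:

// Scalar model of _mm256_inserti128_si256: copy `a`, then overwrite the
// 128-bit half selected by bit 0 of imm8 with the two lanes of `b`.
fn inserti128(a: [i64; 4], b: [i64; 2], imm8: i32) -> [i64; 4] {
    let mut dst = a;
    if imm8 & 0b01 == 0 {
        dst[0] = b[0]; // replace low half
        dst[1] = b[1];
    } else {
        dst[2] = b[0]; // replace high half
        dst[3] = b[1];
    }
    dst
}

fn main() {
    assert_eq!(inserti128([1, 2, 3, 4], [7, 8], 0b01), [1, 2, 7, 8]);
}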
@@ -1616,9 +1645,80 @@ pub unsafe fn _mm256_permute4x64_epi64(a: i64x4, imm8: i32) -> i64x4 {
     }
 }
 
-// TODO _mm256_permute2x128_si256 (__m256i a, __m256i b, const int imm8)
-// TODO _mm256_permute4x64_pd (__m256d a, const int imm8)
-// TODO _mm256_permutevar8x32_ps (__m256 a, __m256i idx)
+/// Shuffle 128-bits of integer data selected by `imm8` from `a` and `b`.
+#[inline(always)]
+#[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vperm2f128, imm8 = 9))]
+pub unsafe fn _mm256_permute2x128_si256(
+    a: __m256i, b: __m256i, imm8: i32
+) -> __m256i {
+    macro_rules! call {
+        ($imm8:expr) => {
+            __m256i::from(vperm2i128(i64x4::from(a), i64x4::from(b), $imm8))
+        }
+    }
+    constify_imm8!(imm8, call)
+}
+
+/// Shuffle 64-bit floating-point elements in `a` across lanes using the
+/// control in `imm8`.
+#[inline(always)]
+#[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpermpd, imm8 = 1))]
+pub unsafe fn _mm256_permute4x64_pd(a: f64x4, imm8: i32) -> f64x4 {
+    use x86::i586::avx::_mm256_undefined_pd;
+    let imm8 = (imm8 & 0xFF) as u8;
+    macro_rules! shuffle_done {
+        ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
+            simd_shuffle4(a, _mm256_undefined_pd(), [$x01, $x23, $x45, $x67])
+        }
+    }
+    macro_rules! shuffle_x67 {
+        ($x01:expr, $x23:expr, $x45:expr) => {
+            match (imm8 >> 6) & 0b11 {
+                0b00 => shuffle_done!($x01, $x23, $x45, 0),
+                0b01 => shuffle_done!($x01, $x23, $x45, 1),
+                0b10 => shuffle_done!($x01, $x23, $x45, 2),
+                _ => shuffle_done!($x01, $x23, $x45, 3),
+            }
+        }
+    }
+    macro_rules! shuffle_x45 {
+        ($x01:expr, $x23:expr) => {
+            match (imm8 >> 4) & 0b11 {
+                0b00 => shuffle_x67!($x01, $x23, 0),
+                0b01 => shuffle_x67!($x01, $x23, 1),
+                0b10 => shuffle_x67!($x01, $x23, 2),
+                _ => shuffle_x67!($x01, $x23, 3),
+            }
+        }
+    }
+    macro_rules! shuffle_x23 {
+        ($x01:expr) => {
+            match (imm8 >> 2) & 0b11 {
+                0b00 => shuffle_x45!($x01, 0),
+                0b01 => shuffle_x45!($x01, 1),
+                0b10 => shuffle_x45!($x01, 2),
+                _ => shuffle_x45!($x01, 3),
+            }
+        }
+    }
+    match imm8 & 0b11 {
+        0b00 => shuffle_x23!(0),
+        0b01 => shuffle_x23!(1),
+        0b10 => shuffle_x23!(2),
+        _ => shuffle_x23!(3),
+    }
+}
+
+/// Shuffle eight 32-bit floating-point elements in `a` across lanes using
+/// the corresponding 32-bit integer index in `idx`.
+#[inline(always)]
+#[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpermps))]
+pub unsafe fn _mm256_permutevar8x32_ps(a: f32x8, idx: i32x8) -> f32x8 {
+    permps(a, idx)
+}
 
 /// Compute the absolute differences of packed unsigned 8-bit integers in `a`
 /// and `b`, then horizontally sum each consecutive 8 differences to
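Note (an aside, not part of the diff): the nested `shuffle_*` macros are not redundant — `simd_shuffle4` requires compile-time constant indices, but `imm8` arrives as a runtime `i32`, so the chained `match`es expand all 4^4 = 256 constant index combinations and dispatch to the right one at runtime. What each arm computes is simple; a scalar sketch (`permute4x64` and `permutevar8x32` are illustrative names, not crate APIs):

// Scalar model of _mm256_permute4x64_pd: each 2-bit field of imm8 names
// the source lane for one destination lane.
fn permute4x64(a: [f64; 4], imm8: u8) -> [f64; 4] {
    let mut dst = [0.0; 4];
    for i in 0..4 {
        let sel = ((imm8 >> (2 * i)) & 0b11) as usize;
        dst[i] = a[sel];
    }
    dst
}

// Scalar model of _mm256_permutevar8x32_ps: each result lane picks the
// source lane named by the low three bits of the corresponding index.
fn permutevar8x32(a: [f32; 8], idx: [i32; 8]) -> [f32; 8] {
    let mut dst = [0.0f32; 8];
    for i in 0..8 {
        dst[i] = a[(idx[i] & 0b111) as usize];
    }
    dst
}

fn main() {
    // 0b00_01_00_11: lane 0 <- a[3], lane 1 <- a[0], lane 2 <- a[1],
    // lane 3 <- a[0], matching the _mm256_permute4x64_pd test below.
    assert_eq!(permute4x64([1., 2., 3., 4.], 0b00_01_00_11), [4., 1., 2., 1.]);
    let a = [1., 2., 3., 4., 5., 6., 7., 8.];
    let idx = [5, 0, 5, 1, 7, 6, 3, 4];
    assert_eq!(permutevar8x32(a, idx), [6., 1., 6., 2., 8., 7., 4., 5.]);
}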
@@ -1760,8 +1860,115 @@ pub unsafe fn _mm256_shuffle_epi32(a: i32x8, imm8: i32) -> i32x8 {
     }
 }
 
-// TODO _mm256_shufflehi_epi16 (__m256i a, const int imm8)
-// TODO _mm256_shufflelo_epi16 (__m256i a, const int imm8)
+/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of `a` using
+/// the control in `imm8`. The low 64 bits of 128-bit lanes of `a` are copied
+/// to the output.
+#[inline(always)]
+#[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpshufhw, imm8 = 9))]
+pub unsafe fn _mm256_shufflehi_epi16(a: i16x16, imm8: i32) -> i16x16 {
+    let imm8 = (imm8 & 0xFF) as u8;
+    macro_rules! shuffle_done {
+        ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
+            #[cfg_attr(rustfmt, rustfmt_skip)]
+            simd_shuffle16(a, a, [
+                0, 1, 2, 3, 4+$x01, 4+$x23, 4+$x45, 4+$x67,
+                8, 9, 10, 11, 12+$x01, 12+$x23, 12+$x45, 12+$x67
+            ]);
+        }
+    }
+    macro_rules! shuffle_x67 {
+        ($x01:expr, $x23:expr, $x45:expr) => {
+            match (imm8 >> 6) & 0b11 {
+                0b00 => shuffle_done!($x01, $x23, $x45, 0),
+                0b01 => shuffle_done!($x01, $x23, $x45, 1),
+                0b10 => shuffle_done!($x01, $x23, $x45, 2),
+                _ => shuffle_done!($x01, $x23, $x45, 3),
+            }
+        }
+    }
+    macro_rules! shuffle_x45 {
+        ($x01:expr, $x23:expr) => {
+            match (imm8 >> 4) & 0b11 {
+                0b00 => shuffle_x67!($x01, $x23, 0),
+                0b01 => shuffle_x67!($x01, $x23, 1),
+                0b10 => shuffle_x67!($x01, $x23, 2),
+                _ => shuffle_x67!($x01, $x23, 3),
+            }
+        }
+    }
+    macro_rules! shuffle_x23 {
+        ($x01:expr) => {
+            match (imm8 >> 2) & 0b11 {
+                0b00 => shuffle_x45!($x01, 0),
+                0b01 => shuffle_x45!($x01, 1),
+                0b10 => shuffle_x45!($x01, 2),
+                _ => shuffle_x45!($x01, 3),
+            }
+        }
+    }
+    match imm8 & 0b11 {
+        0b00 => shuffle_x23!(0),
+        0b01 => shuffle_x23!(1),
+        0b10 => shuffle_x23!(2),
+        _ => shuffle_x23!(3),
+    }
+}
+
+/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of `a` using
+/// the control in `imm8`. The high 64 bits of 128-bit lanes of `a` are copied
+/// to the output.
+#[inline(always)]
+#[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpshuflw, imm8 = 9))]
+pub unsafe fn _mm256_shufflelo_epi16(a: i16x16, imm8: i32) -> i16x16 {
+    let imm8 = (imm8 & 0xFF) as u8;
+    macro_rules! shuffle_done {
+        ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
+            #[cfg_attr(rustfmt, rustfmt_skip)]
+            simd_shuffle16(a, a, [
+                0+$x01, 0+$x23, 0+$x45, 0+$x67, 4, 5, 6, 7,
+                8+$x01, 8+$x23, 8+$x45, 8+$x67, 12, 13, 14, 15,
+            ]);
+        }
+    }
+    macro_rules! shuffle_x67 {
+        ($x01:expr, $x23:expr, $x45:expr) => {
+            match (imm8 >> 6) & 0b11 {
+                0b00 => shuffle_done!($x01, $x23, $x45, 0),
+                0b01 => shuffle_done!($x01, $x23, $x45, 1),
+                0b10 => shuffle_done!($x01, $x23, $x45, 2),
+                _ => shuffle_done!($x01, $x23, $x45, 3),
+            }
+        }
+    }
+    macro_rules! shuffle_x45 {
+        ($x01:expr, $x23:expr) => {
+            match (imm8 >> 4) & 0b11 {
+                0b00 => shuffle_x67!($x01, $x23, 0),
+                0b01 => shuffle_x67!($x01, $x23, 1),
+                0b10 => shuffle_x67!($x01, $x23, 2),
+                _ => shuffle_x67!($x01, $x23, 3),
+            }
+        }
+    }
+    macro_rules! shuffle_x23 {
+        ($x01:expr) => {
+            match (imm8 >> 2) & 0b11 {
+                0b00 => shuffle_x45!($x01, 0),
+                0b01 => shuffle_x45!($x01, 1),
+                0b10 => shuffle_x45!($x01, 2),
+                _ => shuffle_x45!($x01, 3),
+            }
+        }
+    }
+    match imm8 & 0b11 {
+        0b00 => shuffle_x23!(0),
+        0b01 => shuffle_x23!(1),
+        0b10 => shuffle_x23!(2),
+        _ => shuffle_x23!(3),
+    }
+}
 
 /// Negate packed 16-bit integers in `a` when the corresponding signed
 /// 16-bit integer in `b` is negative, and return the results.
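Note (an aside, not part of the diff): the shufflehi/shufflelo ladders reuse the constify pattern from `_mm256_permute4x64_pd` above. Within each 128-bit lane, one 64-bit quartet of `i16`s passes through unchanged while the other is permuted among itself by the four 2-bit fields of `imm8`. A scalar sketch of the hi variant under those assumptions (`shufflehi` is an illustrative name; the lo variant is symmetric, permuting positions 0-3 instead of 4-7):

// Scalar model of _mm256_shufflehi_epi16: per 128-bit lane (eight i16s),
// positions 0..4 are copied and positions 4..8 are shuffled by imm8.
fn shufflehi(a: [i16; 16], imm8: u8) -> [i16; 16] {
    let mut dst = a;
    for lane in 0..2 {
        let base = lane * 8;
        for i in 0..4 {
            let sel = ((imm8 >> (2 * i)) & 0b11) as usize;
            dst[base + 4 + i] = a[base + 4 + sel];
        }
    }
    dst
}

fn main() {
    let a = [0, 1, 2, 3, 11, 22, 33, 44, 4, 5, 6, 7, 55, 66, 77, 88];
    let e = [0, 1, 2, 3, 44, 22, 22, 11, 4, 5, 6, 7, 88, 66, 66, 55];
    // Same operands and control word as the _mm256_shufflehi_epi16 test below.
    assert_eq!(shufflehi(a, 0b00_01_01_11), e);
}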
@@ -2626,6 +2833,10 @@ extern "C" {
     fn pshufb(a: u8x32, b: u8x32) -> u8x32;
     #[link_name = "llvm.x86.avx2.permd"]
     fn permd(a: u32x8, b: u32x8) -> u32x8;
+    #[link_name = "llvm.x86.avx2.permps"]
+    fn permps(a: f32x8, b: i32x8) -> f32x8;
+    #[link_name = "llvm.x86.avx2.vperm2i128"]
+    fn vperm2i128(a: i64x4, b: i64x4, imm8: i8) -> i64x4;
     #[link_name = "llvm.x86.avx2.gather.d.d"]
     fn pgatherdd(
         src: i32x4, slice: *const i8, offsets: i32x4, mask: i32x4, scale: i8
@@ -2700,7 +2911,7 @@ mod tests {
     use v256::*;
     use v128::*;
     use x86::i586::avx2;
-    use x86::__m256i;
+    use x86::{__m128i, __m256i};
     use std;
 
     #[simd_test = "avx2"]
@@ -3306,6 +3517,14 @@ mod tests {
         assert_eq!(r, avx2::_mm256_cvtepu8_epi64(a));
     }
 
+    #[simd_test = "avx2"]
+    unsafe fn _mm256_extracti128_si256() {
+        let a = __m256i::from(i64x4::new(1, 2, 3, 4));
+        let r = avx2::_mm256_extracti128_si256(a, 0b01);
+        let e = __m128i::from(i64x2::new(3, 4));
+        assert_eq!(r, e);
+    }
+
     #[simd_test = "avx2"]
     unsafe fn _mm256_hadd_epi16() {
         let a = i16x16::splat(2);
@@ -3370,6 +3589,15 @@ mod tests {
         assert_eq!(r, e);
     }
 
+    #[simd_test = "avx2"]
+    unsafe fn _mm256_inserti128_si256() {
+        let a = __m256i::from(i64x4::new(1, 2, 3, 4));
+        let b = __m128i::from(i64x2::new(7, 8));
+        let r = avx2::_mm256_inserti128_si256(a, b, 0b01);
+        let e = i64x4::new(1, 2, 7, 8);
+        assert_eq!(r, __m256i::from(e));
+    }
+
     #[simd_test = "avx2"]
     unsafe fn _mm256_maddubs_epi16() {
         let a = u8x32::splat(2);
@@ -3704,6 +3932,38 @@ mod tests {
         assert_eq!(r, e);
     }
 
+    #[simd_test = "avx2"]
+    unsafe fn _mm256_shufflehi_epi16() {
+        #[cfg_attr(rustfmt, rustfmt_skip)]
+        let a = i16x16::new(
+            0, 1, 2, 3, 11, 22, 33, 44,
+            4, 5, 6, 7, 55, 66, 77, 88,
+        );
+        #[cfg_attr(rustfmt, rustfmt_skip)]
+        let e = i16x16::new(
+            0, 1, 2, 3, 44, 22, 22, 11,
+            4, 5, 6, 7, 88, 66, 66, 55,
+        );
+        let r = avx2::_mm256_shufflehi_epi16(a, 0b00_01_01_11);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx2"]
+    unsafe fn _mm256_shufflelo_epi16() {
+        #[cfg_attr(rustfmt, rustfmt_skip)]
+        let a = i16x16::new(
+            11, 22, 33, 44, 0, 1, 2, 3,
+            55, 66, 77, 88, 4, 5, 6, 7,
+        );
+        #[cfg_attr(rustfmt, rustfmt_skip)]
+        let e = i16x16::new(
+            44, 22, 22, 11, 0, 1, 2, 3,
+            88, 66, 66, 55, 4, 5, 6, 7,
+        );
+        let r = avx2::_mm256_shufflelo_epi16(a, 0b00_01_01_11);
+        assert_eq!(r, e);
+    }
+
     #[simd_test = "avx2"]
     unsafe fn _mm256_sign_epi16() {
         let a = i16x16::splat(2);
@@ -4119,6 +4379,32 @@ mod tests {
         assert_eq!(r, expected);
     }
 
+    #[simd_test = "avx2"]
+    unsafe fn _mm256_permute2x128_si256() {
+        let a = __m256i::from(i64x4::new(100, 200, 500, 600));
+        let b = __m256i::from(i64x4::new(300, 400, 700, 800));
+        let r = avx2::_mm256_permute2x128_si256(a, b, 0b00_01_00_11);
+        let e = i64x4::new(700, 800, 500, 600);
+        assert_eq!(i64x4::from(r), e);
+    }
+
+    #[simd_test = "avx2"]
+    unsafe fn _mm256_permute4x64_pd() {
+        let a = f64x4::new(1., 2., 3., 4.);
+        let r = avx2::_mm256_permute4x64_pd(a, 0b00_01_00_11);
+        let e = f64x4::new(4., 1., 2., 1.);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx2"]
+    unsafe fn _mm256_permutevar8x32_ps() {
+        let a = f32x8::new(1., 2., 3., 4., 5., 6., 7., 8.);
+        let b = i32x8::new(5, 0, 5, 1, 7, 6, 3, 4);
+        let r = avx2::_mm256_permutevar8x32_ps(a, b);
+        let e = f32x8::new(6., 1., 6., 2., 8., 7., 4., 5.);
+        assert_eq!(r, e);
+    }
+
     #[simd_test = "avx2"]
     unsafe fn _mm_i32gather_epi32() {
         let mut arr = [0i32; 128];
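Note (an aside, not part of the diff): the control word for `_mm256_permute2x128_si256` decodes as follows — bits [1:0] of `imm8` select the result's low 128-bit half and bits [5:4] the high half, where selectors 0/1 name the low/high halves of `a` and 2/3 the low/high halves of `b`. A scalar sketch of that decoding (`permute2x128` is an illustrative name; the instruction's zeroing bits 3 and 7 are deliberately ignored here):

// Scalar model of the vperm2i128 selection: each nibble of imm8 chooses
// one of the four 128-bit halves of the (a, b) pair.
fn permute2x128(a: [i64; 4], b: [i64; 4], imm8: u8) -> [i64; 4] {
    let half = |sel: u8| -> [i64; 2] {
        match sel & 0b11 {
            0 => [a[0], a[1]], // low half of a
            1 => [a[2], a[3]], // high half of a
            2 => [b[0], b[1]], // low half of b
            _ => [b[2], b[3]], // high half of b
        }
    };
    let lo = half(imm8);      // bits [1:0]
    let hi = half(imm8 >> 4); // bits [5:4]
    [lo[0], lo[1], hi[0], hi[1]]
}

fn main() {
    let a = [100, 200, 500, 600];
    let b = [300, 400, 700, 800];
    // 0b00_01_00_11: low <- high half of b (3), high <- high half of a (1),
    // matching the _mm256_permute2x128_si256 test above.
    assert_eq!(permute2x128(a, b, 0b00_01_00_11), [700, 800, 500, 600]);
}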
