@@ -24,7 +24,7 @@ use simd_llvm::{simd_shuffle16, simd_shuffle32};
 
 use v256::*;
 use v128::*;
-use x86::__m256i;
+use x86::{__m128i, __m256i};
 
 #[cfg(test)]
 use stdsimd_test::assert_instr;
@@ -643,7 +643,20 @@ pub unsafe fn _mm256_cvtepu8_epi64(a: u8x16) -> i64x4 {
     simd_cast::<::v32::u8x4, _>(simd_shuffle4(a, a, [0, 1, 2, 3]))
 }
 
-// TODO _m128i _mm256_extracti128_si256
+/// Extract 128 bits (of integer data) from `a` selected with `imm8`.
+#[inline(always)]
+#[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vextractf128, imm8 = 1))]
+pub unsafe fn _mm256_extracti128_si256(a: __m256i, imm8: i32) -> __m128i {
+    use x86::i586::avx::_mm256_undefined_si256;
+    let imm8 = (imm8 & 0xFF) as u8;
+    let b = i64x4::from(_mm256_undefined_si256());
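+    // `simd_shuffle2` indices address the concatenation of both inputs:
+    // 0-3 are the `i64` lanes of `a`, 4-7 the lanes of `b`. `[0, 1]` thus
+    // selects the low 128-bit half of `a`, `[2, 3]` the high half, and the
+    // undefined `b` is never actually read.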
+    let dst: i64x2 = match imm8 & 0b01 {
+        0 => simd_shuffle2(i64x4::from(a), b, [0, 1]),
+        _ => simd_shuffle2(i64x4::from(a), b, [2, 3]),
+    };
+    __m128i::from(dst)
+}
 
 /// Horizontally add adjacent pairs of 16-bit integers in `a` and `b`.
 #[inline(always)]
@@ -1191,7 +1204,23 @@ pub unsafe fn _mm256_mask_i64gather_pd(
     constify_imm8!(scale, call)
 }
 
-// TODO _mm256_inserti128_si256
+/// Copy `a` to `dst`, then insert 128 bits (of integer data) from `b` at the
+/// location specified by `imm8`.
+#[inline(always)]
+#[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vinsertf128, imm8 = 1))]
+pub unsafe fn _mm256_inserti128_si256(
+    a: __m256i, b: __m128i, imm8: i32
+) -> __m256i {
+    use x86::i586::avx::_mm256_castsi128_si256;
+    let imm8 = (imm8 & 0b01) as u8;
+    let b = i64x4::from(_mm256_castsi128_si256(b));
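+    // After the cast widens `b` to 256 bits, shuffle indices 4-7 address its
+    // lanes, so `[4, 5, 2, 3]` replaces the low half of `a` and
+    // `[0, 1, 4, 5]` replaces the high half.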
+    let dst: i64x4 = match imm8 & 0b01 {
+        0 => simd_shuffle4(i64x4::from(a), b, [4, 5, 2, 3]),
+        _ => simd_shuffle4(i64x4::from(a), b, [0, 1, 4, 5]),
+    };
+    __m256i::from(dst)
+}
 
 /// Multiply packed signed 16-bit integers in `a` and `b`, producing
 /// intermediate signed 32-bit integers. Horizontally add adjacent pairs
@@ -1616,9 +1645,80 @@ pub unsafe fn _mm256_permute4x64_epi64(a: i64x4, imm8: i32) -> i64x4 {
     }
 }
 
-// TODO _mm256_permute2x128_si256 (__m256i a, __m256i b, const int imm8)
-// TODO _mm256_permute4x64_pd (__m256d a, const int imm8)
-// TODO _mm256_permutevar8x32_ps (__m256 a, __m256i idx)
+/// Shuffle 128 bits of integer data selected by `imm8` from `a` and `b`.
+#[inline(always)]
+#[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vperm2f128, imm8 = 9))]
+pub unsafe fn _mm256_permute2x128_si256(
+    a: __m256i, b: __m256i, imm8: i32
+) -> __m256i {
+    macro_rules! call {
+        ($imm8:expr) => {
+            __m256i::from(vperm2i128(i64x4::from(a), i64x4::from(b), $imm8))
+        }
+    }
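+    // `vperm2i128` needs its control byte as a compile-time constant, but
+    // `imm8` here is a runtime `i32`; `constify_imm8!` bridges the gap by
+    // matching on the value and invoking `call!` with a literal in every arm.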
+    constify_imm8!(imm8, call)
+}
+
+/// Shuffle 64-bit floating-point elements in `a` across lanes using the
+/// control in `imm8`.
+#[inline(always)]
+#[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpermpd, imm8 = 1))]
+pub unsafe fn _mm256_permute4x64_pd(a: f64x4, imm8: i32) -> f64x4 {
+    use x86::i586::avx::_mm256_undefined_pd;
+    let imm8 = (imm8 & 0xFF) as u8;
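+    // `simd_shuffle4` requires const lane indices, so the macros below decode
+    // `imm8` two bits at a time (bits 0-1 choose output lane 0, bits 2-3
+    // lane 1, and so on) until all four indices are literals.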
+    macro_rules! shuffle_done {
+        ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
+            simd_shuffle4(a, _mm256_undefined_pd(), [$x01, $x23, $x45, $x67])
+        }
+    }
+    macro_rules! shuffle_x67 {
+        ($x01:expr, $x23:expr, $x45:expr) => {
+            match (imm8 >> 6) & 0b11 {
+                0b00 => shuffle_done!($x01, $x23, $x45, 0),
+                0b01 => shuffle_done!($x01, $x23, $x45, 1),
+                0b10 => shuffle_done!($x01, $x23, $x45, 2),
+                _ => shuffle_done!($x01, $x23, $x45, 3),
+            }
+        }
+    }
+    macro_rules! shuffle_x45 {
+        ($x01:expr, $x23:expr) => {
+            match (imm8 >> 4) & 0b11 {
+                0b00 => shuffle_x67!($x01, $x23, 0),
+                0b01 => shuffle_x67!($x01, $x23, 1),
+                0b10 => shuffle_x67!($x01, $x23, 2),
+                _ => shuffle_x67!($x01, $x23, 3),
+            }
+        }
+    }
+    macro_rules! shuffle_x23 {
+        ($x01:expr) => {
+            match (imm8 >> 2) & 0b11 {
+                0b00 => shuffle_x45!($x01, 0),
+                0b01 => shuffle_x45!($x01, 1),
+                0b10 => shuffle_x45!($x01, 2),
+                _ => shuffle_x45!($x01, 3),
+            }
+        }
+    }
+    match imm8 & 0b11 {
+        0b00 => shuffle_x23!(0),
+        0b01 => shuffle_x23!(1),
+        0b10 => shuffle_x23!(2),
+        _ => shuffle_x23!(3),
+    }
+}
+
+/// Shuffle eight 32-bit floating-point elements in `a` across lanes using
+/// the corresponding 32-bit integer index in `idx`.
+#[inline(always)]
+#[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpermps))]
+pub unsafe fn _mm256_permutevar8x32_ps(a: f32x8, idx: i32x8) -> f32x8 {
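+    // The lane indices arrive in a runtime vector rather than an immediate,
+    // so no constification is needed; this forwards straight to the LLVM
+    // intrinsic.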
+    permps(a, idx)
+}
 
 /// Compute the absolute differences of packed unsigned 8-bit integers in `a`
 /// and `b`, then horizontally sum each consecutive 8 differences to
@@ -1760,8 +1860,115 @@ pub unsafe fn _mm256_shuffle_epi32(a: i32x8, imm8: i32) -> i32x8 {
     }
 }
 
-// TODO _mm256_shufflehi_epi16 (__m256i a, const int imm8)
-// TODO _mm256_shufflelo_epi16 (__m256i a, const int imm8)
+/// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of `a` using
+/// the control in `imm8`. The low 64 bits of 128-bit lanes of `a` are copied
+/// to the output.
+#[inline(always)]
+#[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpshufhw, imm8 = 9))]
+pub unsafe fn _mm256_shufflehi_epi16(a: i16x16, imm8: i32) -> i16x16 {
+    let imm8 = (imm8 & 0xFF) as u8;
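+    // Within each 128-bit lane, words 0-3 (and 8-11) pass through unchanged,
+    // while words 4-7 (and 12-15) become `4 + x` / `12 + x`, each `x` being a
+    // two-bit field decoded from `imm8` by the macro cascade below.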
+    macro_rules! shuffle_done {
+        ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
+            #[cfg_attr(rustfmt, rustfmt_skip)]
+            simd_shuffle16(a, a, [
+                0, 1, 2, 3, 4+$x01, 4+$x23, 4+$x45, 4+$x67,
+                8, 9, 10, 11, 12+$x01, 12+$x23, 12+$x45, 12+$x67
+            ]);
+        }
+    }
+    macro_rules! shuffle_x67 {
+        ($x01:expr, $x23:expr, $x45:expr) => {
+            match (imm8 >> 6) & 0b11 {
+                0b00 => shuffle_done!($x01, $x23, $x45, 0),
+                0b01 => shuffle_done!($x01, $x23, $x45, 1),
+                0b10 => shuffle_done!($x01, $x23, $x45, 2),
+                _ => shuffle_done!($x01, $x23, $x45, 3),
+            }
+        }
+    }
+    macro_rules! shuffle_x45 {
+        ($x01:expr, $x23:expr) => {
+            match (imm8 >> 4) & 0b11 {
+                0b00 => shuffle_x67!($x01, $x23, 0),
+                0b01 => shuffle_x67!($x01, $x23, 1),
+                0b10 => shuffle_x67!($x01, $x23, 2),
+                _ => shuffle_x67!($x01, $x23, 3),
+            }
+        }
+    }
+    macro_rules! shuffle_x23 {
+        ($x01:expr) => {
+            match (imm8 >> 2) & 0b11 {
+                0b00 => shuffle_x45!($x01, 0),
+                0b01 => shuffle_x45!($x01, 1),
+                0b10 => shuffle_x45!($x01, 2),
+                _ => shuffle_x45!($x01, 3),
+            }
+        }
+    }
+    match imm8 & 0b11 {
+        0b00 => shuffle_x23!(0),
+        0b01 => shuffle_x23!(1),
+        0b10 => shuffle_x23!(2),
+        _ => shuffle_x23!(3),
+    }
+}
+
+/// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of `a` using
+/// the control in `imm8`. The high 64 bits of 128-bit lanes of `a` are copied
+/// to the output.
+#[inline(always)]
+#[target_feature = "+avx2"]
+#[cfg_attr(test, assert_instr(vpshuflw, imm8 = 9))]
+pub unsafe fn _mm256_shufflelo_epi16(a: i16x16, imm8: i32) -> i16x16 {
+    let imm8 = (imm8 & 0xFF) as u8;
+    macro_rules! shuffle_done {
+        ($x01:expr, $x23:expr, $x45:expr, $x67:expr) => {
+            #[cfg_attr(rustfmt, rustfmt_skip)]
+            simd_shuffle16(a, a, [
+                0+$x01, 0+$x23, 0+$x45, 0+$x67, 4, 5, 6, 7,
+                8+$x01, 8+$x23, 8+$x45, 8+$x67, 12, 13, 14, 15,
+            ]);
+        }
+    }
+    macro_rules! shuffle_x67 {
+        ($x01:expr, $x23:expr, $x45:expr) => {
+            match (imm8 >> 6) & 0b11 {
+                0b00 => shuffle_done!($x01, $x23, $x45, 0),
+                0b01 => shuffle_done!($x01, $x23, $x45, 1),
+                0b10 => shuffle_done!($x01, $x23, $x45, 2),
+                _ => shuffle_done!($x01, $x23, $x45, 3),
+            }
+        }
+    }
+    macro_rules! shuffle_x45 {
+        ($x01:expr, $x23:expr) => {
+            match (imm8 >> 4) & 0b11 {
+                0b00 => shuffle_x67!($x01, $x23, 0),
+                0b01 => shuffle_x67!($x01, $x23, 1),
+                0b10 => shuffle_x67!($x01, $x23, 2),
+                _ => shuffle_x67!($x01, $x23, 3),
+            }
+        }
+    }
+    macro_rules! shuffle_x23 {
+        ($x01:expr) => {
+            match (imm8 >> 2) & 0b11 {
+                0b00 => shuffle_x45!($x01, 0),
+                0b01 => shuffle_x45!($x01, 1),
+                0b10 => shuffle_x45!($x01, 2),
+                _ => shuffle_x45!($x01, 3),
+            }
+        }
+    }
+    match imm8 & 0b11 {
+        0b00 => shuffle_x23!(0),
+        0b01 => shuffle_x23!(1),
+        0b10 => shuffle_x23!(2),
+        _ => shuffle_x23!(3),
+    }
+}
 
 /// Negate packed 16-bit integers in `a` when the corresponding signed
 /// 16-bit integer in `b` is negative, and return the results.
@@ -2626,6 +2833,10 @@ extern "C" {
     fn pshufb(a: u8x32, b: u8x32) -> u8x32;
     #[link_name = "llvm.x86.avx2.permd"]
     fn permd(a: u32x8, b: u32x8) -> u32x8;
+    #[link_name = "llvm.x86.avx2.permps"]
+    fn permps(a: f32x8, b: i32x8) -> f32x8;
+    #[link_name = "llvm.x86.avx2.vperm2i128"]
+    fn vperm2i128(a: i64x4, b: i64x4, imm8: i8) -> i64x4;
     #[link_name = "llvm.x86.avx2.gather.d.d"]
     fn pgatherdd(
         src: i32x4, slice: *const i8, offsets: i32x4, mask: i32x4, scale: i8
@@ -2700,7 +2911,7 @@ mod tests {
     use v256::*;
     use v128::*;
     use x86::i586::avx2;
-    use x86::__m256i;
+    use x86::{__m128i, __m256i};
     use std;
 
     #[simd_test = "avx2"]
@@ -3306,6 +3517,14 @@ mod tests {
         assert_eq!(r, avx2::_mm256_cvtepu8_epi64(a));
     }
 
+    #[simd_test = "avx2"]
+    unsafe fn _mm256_extracti128_si256() {
+        let a = __m256i::from(i64x4::new(1, 2, 3, 4));
+        let r = avx2::_mm256_extracti128_si256(a, 0b01);
+        let e = __m128i::from(i64x2::new(3, 4));
+        assert_eq!(r, e);
+    }
+
     #[simd_test = "avx2"]
     unsafe fn _mm256_hadd_epi16() {
         let a = i16x16::splat(2);
@@ -3370,6 +3589,15 @@ mod tests {
         assert_eq!(r, e);
     }
 
+    #[simd_test = "avx2"]
+    unsafe fn _mm256_inserti128_si256() {
+        let a = __m256i::from(i64x4::new(1, 2, 3, 4));
+        let b = __m128i::from(i64x2::new(7, 8));
+        let r = avx2::_mm256_inserti128_si256(a, b, 0b01);
+        let e = i64x4::new(1, 2, 7, 8);
+        assert_eq!(r, __m256i::from(e));
+    }
+
     #[simd_test = "avx2"]
     unsafe fn _mm256_maddubs_epi16() {
         let a = u8x32::splat(2);
@@ -3704,6 +3932,38 @@ mod tests {
         assert_eq!(r, e);
     }
 
+    #[simd_test = "avx2"]
+    unsafe fn _mm256_shufflehi_epi16() {
+        #[cfg_attr(rustfmt, rustfmt_skip)]
+        let a = i16x16::new(
+            0, 1, 2, 3, 11, 22, 33, 44,
+            4, 5, 6, 7, 55, 66, 77, 88,
+        );
+        #[cfg_attr(rustfmt, rustfmt_skip)]
+        let e = i16x16::new(
+            0, 1, 2, 3, 44, 22, 22, 11,
+            4, 5, 6, 7, 88, 66, 66, 55,
+        );
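+        // 0b00_01_01_11 decodes, from the low bits up, to lane picks
+        // [3, 1, 1, 0], so each high quad is rearranged to
+        // [44, 22, 22, 11] and [88, 66, 66, 55].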
+        let r = avx2::_mm256_shufflehi_epi16(a, 0b00_01_01_11);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx2"]
+    unsafe fn _mm256_shufflelo_epi16() {
+        #[cfg_attr(rustfmt, rustfmt_skip)]
+        let a = i16x16::new(
+            11, 22, 33, 44, 0, 1, 2, 3,
+            55, 66, 77, 88, 4, 5, 6, 7,
+        );
+        #[cfg_attr(rustfmt, rustfmt_skip)]
+        let e = i16x16::new(
+            44, 22, 22, 11, 0, 1, 2, 3,
+            88, 66, 66, 55, 4, 5, 6, 7,
+        );
+        let r = avx2::_mm256_shufflelo_epi16(a, 0b00_01_01_11);
+        assert_eq!(r, e);
+    }
+
     #[simd_test = "avx2"]
     unsafe fn _mm256_sign_epi16() {
         let a = i16x16::splat(2);
@@ -4119,6 +4379,32 @@ mod tests {
         assert_eq!(r, expected);
     }
 
+    #[simd_test = "avx2"]
+    unsafe fn _mm256_permute2x128_si256() {
+        let a = __m256i::from(i64x4::new(100, 200, 500, 600));
+        let b = __m256i::from(i64x4::new(300, 400, 700, 800));
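+        // Each selector nibble of imm8 picks a 128-bit source half
+        // (0/1 = low/high of `a`, 2/3 = low/high of `b`): 0b00_01_00_11
+        // takes the high half of `b` then the high half of `a`.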
+        let r = avx2::_mm256_permute2x128_si256(a, b, 0b00_01_00_11);
+        let e = i64x4::new(700, 800, 500, 600);
+        assert_eq!(i64x4::from(r), e);
+    }
+
+    #[simd_test = "avx2"]
+    unsafe fn _mm256_permute4x64_pd() {
+        let a = f64x4::new(1., 2., 3., 4.);
+        let r = avx2::_mm256_permute4x64_pd(a, 0b00_01_00_11);
+        let e = f64x4::new(4., 1., 2., 1.);
+        assert_eq!(r, e);
+    }
+
+    #[simd_test = "avx2"]
+    unsafe fn _mm256_permutevar8x32_ps() {
+        let a = f32x8::new(1., 2., 3., 4., 5., 6., 7., 8.);
+        let b = i32x8::new(5, 0, 5, 1, 7, 6, 3, 4);
+        let r = avx2::_mm256_permutevar8x32_ps(a, b);
+        let e = f32x8::new(6., 1., 6., 2., 8., 7., 4., 5.);
+        assert_eq!(r, e);
+    }
+
     #[simd_test = "avx2"]
     unsafe fn _mm_i32gather_epi32() {
         let mut arr = [0i32; 128];