|
| 1 | +//! Advanced Vector Extensions (AVX) |
| 2 | +//! |
| 3 | +//! The references are: |
| 4 | +//! |
| 5 | +//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2: |
| 6 | +//! Instruction Set Reference, A-Z][intel64_ref]. - [AMD64 Architecture |
| 7 | +//! Programmer's Manual, Volume 3: General-Purpose and System |
| 8 | +//! Instructions][amd64_ref]. |
| 9 | +//! |
| 10 | +//! [Wikipedia][wiki] provides a quick overview of the instructions available. |
| 11 | +//! |
| 12 | +//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf |
| 13 | +//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf |
| 14 | +//! [wiki]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions |
| 15 | +
|
1 | 16 | use std::mem;
|
2 | 17 | use std::ptr;
|
3 | 18 |
|
@@ -113,7 +128,7 @@ pub unsafe fn _mm256_shuffle_pd(a: f64x4, b: f64x4, imm8: i32) -> f64x4 {
|
113 | 128 | }
|
114 | 129 | }
|
115 | 130 | }
|
116 |
| - match (imm8 >> 0) & 0x1 { |
| 131 | + match imm8 & 0x1 { |
117 | 132 | 0 => shuffle1!(0),
|
118 | 133 | _ => shuffle1!(1),
|
119 | 134 | }
|
@@ -161,7 +176,7 @@ pub unsafe fn _mm256_shuffle_ps(a: f32x8, b: f32x8, imm8: i32) -> f32x8 {
|
161 | 176 | }
|
162 | 177 | }
|
163 | 178 | }
|
164 |
| - match (imm8 >> 0) & 0x3 { |
| 179 | + match imm8 & 0x3 { |
165 | 180 | 0 => shuffle1!(0, 4),
|
166 | 181 | 1 => shuffle1!(1, 5),
|
167 | 182 | 2 => shuffle1!(2, 6),
|
@@ -594,69 +609,69 @@ pub unsafe fn _mm256_xor_ps(a: f32x8, b: f32x8) -> f32x8 {
|
594 | 609 | mem::transmute(a ^ b)
|
595 | 610 | }
|
596 | 611 |
|
597 |
| -// Equal (ordered, non-signaling) |
| 612 | +/// Equal (ordered, non-signaling) |
598 | 613 | pub const _CMP_EQ_OQ: u8 = 0x00;
|
599 |
| -// Less-than (ordered, signaling) |
| 614 | +/// Less-than (ordered, signaling) |
600 | 615 | pub const _CMP_LT_OS: u8 = 0x01;
|
601 |
| -// Less-than-or-equal (ordered, signaling) |
| 616 | +/// Less-than-or-equal (ordered, signaling) |
602 | 617 | pub const _CMP_LE_OS: u8 = 0x02;
|
603 |
| -// Unordered (non-signaling) |
| 618 | +/// Unordered (non-signaling) |
604 | 619 | pub const _CMP_UNORD_Q: u8 = 0x03;
|
605 |
| -// Not-equal (unordered, non-signaling) |
| 620 | +/// Not-equal (unordered, non-signaling) |
606 | 621 | pub const _CMP_NEQ_UQ: u8 = 0x04;
|
607 |
| -// Not-less-than (unordered, signaling) |
| 622 | +/// Not-less-than (unordered, signaling) |
608 | 623 | pub const _CMP_NLT_US: u8 = 0x05;
|
609 |
| -// Not-less-than-or-equal (unordered, signaling) |
| 624 | +/// Not-less-than-or-equal (unordered, signaling) |
610 | 625 | pub const _CMP_NLE_US: u8 = 0x06;
|
611 |
| -// Ordered (non-signaling) |
| 626 | +/// Ordered (non-signaling) |
612 | 627 | pub const _CMP_ORD_Q: u8 = 0x07;
|
613 |
| -// Equal (unordered, non-signaling) |
| 628 | +/// Equal (unordered, non-signaling) |
614 | 629 | pub const _CMP_EQ_UQ: u8 = 0x08;
|
615 |
| -// Not-greater-than-or-equal (unordered, signaling) |
| 630 | +/// Not-greater-than-or-equal (unordered, signaling) |
616 | 631 | pub const _CMP_NGE_US: u8 = 0x09;
|
617 |
| -// Not-greater-than (unordered, signaling) |
| 632 | +/// Not-greater-than (unordered, signaling) |
618 | 633 | pub const _CMP_NGT_US: u8 = 0x0a;
|
619 |
| -// False (ordered, non-signaling) |
| 634 | +/// False (ordered, non-signaling) |
620 | 635 | pub const _CMP_FALSE_OQ: u8 = 0x0b;
|
621 |
| -// Not-equal (ordered, non-signaling) |
| 636 | +/// Not-equal (ordered, non-signaling) |
622 | 637 | pub const _CMP_NEQ_OQ: u8 = 0x0c;
|
623 |
| -// Greater-than-or-equal (ordered, signaling) |
| 638 | +/// Greater-than-or-equal (ordered, signaling) |
624 | 639 | pub const _CMP_GE_OS: u8 = 0x0d;
|
625 |
| -// Greater-than (ordered, signaling) |
| 640 | +/// Greater-than (ordered, signaling) |
626 | 641 | pub const _CMP_GT_OS: u8 = 0x0e;
|
627 |
| -// True (unordered, non-signaling) |
| 642 | +/// True (unordered, non-signaling) |
628 | 643 | pub const _CMP_TRUE_UQ: u8 = 0x0f;
|
629 |
| -// Equal (ordered, signaling) |
| 644 | +/// Equal (ordered, signaling) |
630 | 645 | pub const _CMP_EQ_OS: u8 = 0x10;
|
631 |
| -// Less-than (ordered, non-signaling) |
| 646 | +/// Less-than (ordered, non-signaling) |
632 | 647 | pub const _CMP_LT_OQ: u8 = 0x11;
|
633 |
| -// Less-than-or-equal (ordered, non-signaling) |
| 648 | +/// Less-than-or-equal (ordered, non-signaling) |
634 | 649 | pub const _CMP_LE_OQ: u8 = 0x12;
|
635 |
| -// Unordered (signaling) |
| 650 | +/// Unordered (signaling) |
636 | 651 | pub const _CMP_UNORD_S: u8 = 0x13;
|
637 |
| -// Not-equal (unordered, signaling) |
| 652 | +/// Not-equal (unordered, signaling) |
638 | 653 | pub const _CMP_NEQ_US: u8 = 0x14;
|
639 |
| -// Not-less-than (unordered, non-signaling) |
| 654 | +/// Not-less-than (unordered, non-signaling) |
640 | 655 | pub const _CMP_NLT_UQ: u8 = 0x15;
|
641 |
| -// Not-less-than-or-equal (unordered, non-signaling) |
| 656 | +/// Not-less-than-or-equal (unordered, non-signaling) |
642 | 657 | pub const _CMP_NLE_UQ: u8 = 0x16;
|
643 |
| -// Ordered (signaling) |
| 658 | +/// Ordered (signaling) |
644 | 659 | pub const _CMP_ORD_S: u8 = 0x17;
|
645 |
| -// Equal (unordered, signaling) |
| 660 | +/// Equal (unordered, signaling) |
646 | 661 | pub const _CMP_EQ_US: u8 = 0x18;
|
647 |
| -// Not-greater-than-or-equal (unordered, non-signaling) |
| 662 | +/// Not-greater-than-or-equal (unordered, non-signaling) |
648 | 663 | pub const _CMP_NGE_UQ: u8 = 0x19;
|
649 |
| -// Not-greater-than (unordered, non-signaling) |
| 664 | +/// Not-greater-than (unordered, non-signaling) |
650 | 665 | pub const _CMP_NGT_UQ: u8 = 0x1a;
|
651 |
| -// False (ordered, signaling) |
| 666 | +/// False (ordered, signaling) |
652 | 667 | pub const _CMP_FALSE_OS: u8 = 0x1b;
|
653 |
| -// Not-equal (ordered, signaling) |
| 668 | +/// Not-equal (ordered, signaling) |
654 | 669 | pub const _CMP_NEQ_OS: u8 = 0x1c;
|
655 |
| -// Greater-than-or-equal (ordered, non-signaling) |
| 670 | +/// Greater-than-or-equal (ordered, non-signaling) |
656 | 671 | pub const _CMP_GE_OQ: u8 = 0x1d;
|
657 |
| -// Greater-than (ordered, non-signaling) |
| 672 | +/// Greater-than (ordered, non-signaling) |
658 | 673 | pub const _CMP_GT_OQ: u8 = 0x1e;
|
659 |
| -// True (unordered, signaling) |
| 674 | +/// True (unordered, signaling) |
660 | 675 | pub const _CMP_TRUE_US: u8 = 0x1f;
|
661 | 676 |
|
662 | 677 | /// Compare packed double-precision (64-bit) floating-point
|
@@ -920,13 +935,10 @@ pub unsafe fn _mm_permutevar_ps(a: f32x4, b: i32x4) -> f32x4 {
|
920 | 935 | #[cfg_attr(test, assert_instr(vpermilps, imm8 = 9))]
|
921 | 936 | pub unsafe fn _mm256_permute_ps(a: f32x8, imm8: i32) -> f32x8 {
|
922 | 937 | let imm8 = (imm8 & 0xFF) as u8;
|
923 |
| - const fn add4(x: u32) -> u32 { |
924 |
| - x + 4 |
925 |
| - } |
926 | 938 | macro_rules! shuffle4 {
|
927 | 939 | ($a:expr, $b:expr, $c:expr, $d:expr) => {
|
928 | 940 | simd_shuffle8(a, _mm256_undefined_ps(), [
|
929 |
| - $a, $b, $c, $d, add4($a), add4($b), add4($c), add4($d) |
| 941 | + $a, $b, $c, $d, $a + 4, $b + 4, $c + 4, $d + 4 |
930 | 942 | ])
|
931 | 943 | }
|
932 | 944 | }
|
@@ -960,7 +972,7 @@ pub unsafe fn _mm256_permute_ps(a: f32x8, imm8: i32) -> f32x8 {
|
960 | 972 | }
|
961 | 973 | }
|
962 | 974 | }
|
963 |
| - match (imm8 >> 0) & 0b11 { |
| 975 | + match imm8 & 0b11 { |
964 | 976 | 0b00 => shuffle1!(0),
|
965 | 977 | 0b01 => shuffle1!(1),
|
966 | 978 | 0b10 => shuffle1!(2),
|
@@ -1014,14 +1026,16 @@ pub unsafe fn _mm_permute_ps(a: f32x4, imm8: i32) -> f32x4 {
|
1014 | 1026 | }
|
1015 | 1027 | }
|
1016 | 1028 | }
|
1017 |
| - match (imm8 >> 0) & 0b11 { |
| 1029 | + match imm8 & 0b11 { |
1018 | 1030 | 0b00 => shuffle1!(0),
|
1019 | 1031 | 0b01 => shuffle1!(1),
|
1020 | 1032 | 0b10 => shuffle1!(2),
|
1021 | 1033 | _ => shuffle1!(3),
|
1022 | 1034 | }
|
1023 | 1035 | }
|
1024 | 1036 |
|
| 1037 | +/// Shuffle double-precision (64-bit) floating-point elements in `a` |
| 1038 | +/// within 256-bit lanes using the control in `b`. |
1025 | 1039 | #[inline(always)]
|
1026 | 1040 | #[target_feature = "+avx"]
|
1027 | 1041 | #[cfg_attr(test, assert_instr(vpermilpd))]
|
@@ -1074,7 +1088,7 @@ pub unsafe fn _mm256_permute_pd(a: f64x4, imm8: i32) -> f64x4 {
|
1074 | 1088 | }
|
1075 | 1089 | }
|
1076 | 1090 | }
|
1077 |
| - match (imm8 >> 0) & 0x1 { |
| 1091 | + match imm8 & 0x1 { |
1078 | 1092 | 0 => shuffle1!(0),
|
1079 | 1093 | _ => shuffle1!(1),
|
1080 | 1094 | }
|
@@ -1102,7 +1116,7 @@ pub unsafe fn _mm_permute_pd(a: f64x2, imm8: i32) -> f64x2 {
|
1102 | 1116 | }
|
1103 | 1117 | }
|
1104 | 1118 | }
|
1105 |
| - match (imm8 >> 0) & 0x1 { |
| 1119 | + match imm8 & 0x1 { |
1106 | 1120 | 0 => shuffle1!(0),
|
1107 | 1121 | _ => shuffle1!(1),
|
1108 | 1122 | }
|
@@ -2750,8 +2764,7 @@ mod tests {
|
2750 | 2764 | let a = f32x8::new(4., 9., 16., 25., 4., 9., 16., 25.);
|
2751 | 2765 | let b = f32x8::new(4., 3., 2., 5., 8., 9., 64., 50.);
|
2752 | 2766 | let r = avx::_mm256_dp_ps(a, b, 0xFF);
|
2753 |
| - let e = |
2754 |
| - f32x8::new(200., 200., 200., 200., 2387., 2387., 2387., 2387.); |
| 2767 | + let e = f32x8::new(200., 200., 200., 200., 2387., 2387., 2387., 2387.); |
2755 | 2768 | assert_eq!(r, e);
|
2756 | 2769 | }
|
2757 | 2770 |
|
|
0 commit comments