|
| 1 | +//! Advanced Vector Extensions (AVX) |
| 2 | +//! |
| 3 | +//! The references are: |
| 4 | +//! |
| 5 | +//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2: Instruction Set Reference, A-Z](http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf). |
| 6 | +//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and System Instructions](http://support.amd.com/TechDocs/24594.pdf). |
| 7 | +//! |
| 8 | +//! [Wikipedia](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions) provides a quick overview of the instructions available. |
| 9 | +
|
1 | 10 | use std::mem;
|
2 | 11 | use std::ptr;
|
3 | 12 |
|
@@ -494,69 +503,69 @@ pub unsafe fn _mm256_xor_ps(a: f32x8, b: f32x8) -> f32x8 {
|
494 | 503 | mem::transmute(a ^ b)
|
495 | 504 | }
|
496 | 505 |
|
497 |
| -// Equal (ordered, non-signaling) |
| 506 | +/// Equal (ordered, non-signaling) |
498 | 507 | pub const _CMP_EQ_OQ: u8 = 0x00;
|
499 |
| -// Less-than (ordered, signaling) |
| 508 | +/// Less-than (ordered, signaling) |
500 | 509 | pub const _CMP_LT_OS: u8 = 0x01;
|
501 |
| -// Less-than-or-equal (ordered, signaling) |
| 510 | +/// Less-than-or-equal (ordered, signaling) |
502 | 511 | pub const _CMP_LE_OS: u8 = 0x02;
|
503 |
| -// Unordered (non-signaling) |
| 512 | +/// Unordered (non-signaling) |
504 | 513 | pub const _CMP_UNORD_Q: u8 = 0x03;
|
505 |
| -// Not-equal (unordered, non-signaling) |
| 514 | +/// Not-equal (unordered, non-signaling) |
506 | 515 | pub const _CMP_NEQ_UQ: u8 = 0x04;
|
507 |
| -// Not-less-than (unordered, signaling) |
| 516 | +/// Not-less-than (unordered, signaling) |
508 | 517 | pub const _CMP_NLT_US: u8 = 0x05;
|
509 |
| -// Not-less-than-or-equal (unordered, signaling) |
| 518 | +/// Not-less-than-or-equal (unordered, signaling) |
510 | 519 | pub const _CMP_NLE_US: u8 = 0x06;
|
511 |
| -// Ordered (non-signaling) |
| 520 | +/// Ordered (non-signaling) |
512 | 521 | pub const _CMP_ORD_Q: u8 = 0x07;
|
513 |
| -// Equal (unordered, non-signaling) |
| 522 | +/// Equal (unordered, non-signaling) |
514 | 523 | pub const _CMP_EQ_UQ: u8 = 0x08;
|
515 |
| -// Not-greater-than-or-equal (unordered, signaling) |
| 524 | +/// Not-greater-than-or-equal (unordered, signaling) |
516 | 525 | pub const _CMP_NGE_US: u8 = 0x09;
|
517 |
| -// Not-greater-than (unordered, signaling) |
| 526 | +/// Not-greater-than (unordered, signaling) |
518 | 527 | pub const _CMP_NGT_US: u8 = 0x0a;
|
519 |
| -// False (ordered, non-signaling) |
| 528 | +/// False (ordered, non-signaling) |
520 | 529 | pub const _CMP_FALSE_OQ: u8 = 0x0b;
|
521 |
| -// Not-equal (ordered, non-signaling) |
| 530 | +/// Not-equal (ordered, non-signaling) |
522 | 531 | pub const _CMP_NEQ_OQ: u8 = 0x0c;
|
523 |
| -// Greater-than-or-equal (ordered, signaling) |
| 532 | +/// Greater-than-or-equal (ordered, signaling) |
524 | 533 | pub const _CMP_GE_OS: u8 = 0x0d;
|
525 |
| -// Greater-than (ordered, signaling) |
| 534 | +/// Greater-than (ordered, signaling) |
526 | 535 | pub const _CMP_GT_OS: u8 = 0x0e;
|
527 |
| -// True (unordered, non-signaling) |
| 536 | +/// True (unordered, non-signaling) |
528 | 537 | pub const _CMP_TRUE_UQ: u8 = 0x0f;
|
529 |
| -// Equal (ordered, signaling) |
| 538 | +/// Equal (ordered, signaling) |
530 | 539 | pub const _CMP_EQ_OS: u8 = 0x10;
|
531 |
| -// Less-than (ordered, non-signaling) |
| 540 | +/// Less-than (ordered, non-signaling) |
532 | 541 | pub const _CMP_LT_OQ: u8 = 0x11;
|
533 |
| -// Less-than-or-equal (ordered, non-signaling) |
| 542 | +/// Less-than-or-equal (ordered, non-signaling) |
534 | 543 | pub const _CMP_LE_OQ: u8 = 0x12;
|
535 |
| -// Unordered (signaling) |
| 544 | +/// Unordered (signaling) |
536 | 545 | pub const _CMP_UNORD_S: u8 = 0x13;
|
537 |
| -// Not-equal (unordered, signaling) |
| 546 | +/// Not-equal (unordered, signaling) |
538 | 547 | pub const _CMP_NEQ_US: u8 = 0x14;
|
539 |
| -// Not-less-than (unordered, non-signaling) |
| 548 | +/// Not-less-than (unordered, non-signaling) |
540 | 549 | pub const _CMP_NLT_UQ: u8 = 0x15;
|
541 |
| -// Not-less-than-or-equal (unordered, non-signaling) |
| 550 | +/// Not-less-than-or-equal (unordered, non-signaling) |
542 | 551 | pub const _CMP_NLE_UQ: u8 = 0x16;
|
543 |
| -// Ordered (signaling) |
| 552 | +/// Ordered (signaling) |
544 | 553 | pub const _CMP_ORD_S: u8 = 0x17;
|
545 |
| -// Equal (unordered, signaling) |
| 554 | +/// Equal (unordered, signaling) |
546 | 555 | pub const _CMP_EQ_US: u8 = 0x18;
|
547 |
| -// Not-greater-than-or-equal (unordered, non-signaling) |
| 556 | +/// Not-greater-than-or-equal (unordered, non-signaling) |
548 | 557 | pub const _CMP_NGE_UQ: u8 = 0x19;
|
549 |
| -// Not-greater-than (unordered, non-signaling) |
| 558 | +/// Not-greater-than (unordered, non-signaling) |
550 | 559 | pub const _CMP_NGT_UQ: u8 = 0x1a;
|
551 |
| -// False (ordered, signaling) |
| 560 | +/// False (ordered, signaling) |
552 | 561 | pub const _CMP_FALSE_OS: u8 = 0x1b;
|
553 |
| -// Not-equal (ordered, signaling) |
| 562 | +/// Not-equal (ordered, signaling) |
554 | 563 | pub const _CMP_NEQ_OS: u8 = 0x1c;
|
555 |
| -// Greater-than-or-equal (ordered, non-signaling) |
| 564 | +/// Greater-than-or-equal (ordered, non-signaling) |
556 | 565 | pub const _CMP_GE_OQ: u8 = 0x1d;
|
557 |
| -// Greater-than (ordered, non-signaling) |
| 566 | +/// Greater-than (ordered, non-signaling) |
558 | 567 | pub const _CMP_GT_OQ: u8 = 0x1e;
|
559 |
| -// True (unordered, signaling) |
| 568 | +/// True (unordered, signaling) |
560 | 569 | pub const _CMP_TRUE_US: u8 = 0x1f;
|
561 | 570 |
|
562 | 571 | /// Compare packed double-precision (64-bit) floating-point
|
@@ -826,7 +835,7 @@ pub unsafe fn _mm256_permute_ps(a: f32x8, imm8: i32) -> f32x8 {
|
826 | 835 | macro_rules! shuffle4 {
|
827 | 836 | ($a:expr, $b:expr, $c:expr, $d:expr) => {
|
828 | 837 | simd_shuffle8(a, _mm256_undefined_ps(), [
|
829 |
| - $a, $b, $c, $d, add4($a), add4($b), add4($c), add4($d) |
| 838 | + $a, $b, $c, $d, $a + 4, $b + 4, $c + 4, $d + 4 |
830 | 839 | ])
|
831 | 840 | }
|
832 | 841 | }
|
@@ -922,6 +931,8 @@ pub unsafe fn _mm_permute_ps(a: f32x4, imm8: i32) -> f32x4 {
|
922 | 931 | }
|
923 | 932 | }
|
924 | 933 |
|
| 934 | +/// Shuffle double-precision (64-bit) floating-point elements in `a` |
| 935 | +/// within 128-bit lanes using the control in `b`. |
925 | 936 | #[inline(always)]
|
926 | 937 | #[target_feature = "+avx"]
|
927 | 938 | #[cfg_attr(test, assert_instr(vpermilpd))]
|
|
0 commit comments