|
| 1 | +//! Advanced Vector Extensions (AVX) |
| 2 | +//! |
| 3 | +//! The references are: |
| 4 | +//! |
| 5 | +//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2: Instruction Set Reference, A-Z](http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf). |
| 6 | +//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and System Instructions](http://support.amd.com/TechDocs/24594.pdf). |
| 7 | +//! |
| 8 | +//! [Wikipedia](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions) provides a quick overview of the instructions available. |
| 9 | +
|
1 | 10 | use std::mem;
|
2 | 11 |
|
3 | 12 | #[cfg(test)]
|
@@ -484,69 +493,69 @@ pub unsafe fn _mm256_xor_ps(a: f32x8, b: f32x8) -> f32x8 {
|
484 | 493 | mem::transmute(a ^ b)
|
485 | 494 | }
|
486 | 495 |
|
487 |
| -// Equal (ordered, non-signaling) |
| 496 | +/// Equal (ordered, non-signaling) |
488 | 497 | pub const _CMP_EQ_OQ: u8 = 0x00;
|
489 |
| -// Less-than (ordered, signaling) |
| 498 | +/// Less-than (ordered, signaling) |
490 | 499 | pub const _CMP_LT_OS: u8 = 0x01;
|
491 |
| -// Less-than-or-equal (ordered, signaling) |
| 500 | +/// Less-than-or-equal (ordered, signaling) |
492 | 501 | pub const _CMP_LE_OS: u8 = 0x02;
|
493 |
| -// Unordered (non-signaling) |
| 502 | +/// Unordered (non-signaling) |
494 | 503 | pub const _CMP_UNORD_Q: u8 = 0x03;
|
495 |
| -// Not-equal (unordered, non-signaling) |
| 504 | +/// Not-equal (unordered, non-signaling) |
496 | 505 | pub const _CMP_NEQ_UQ: u8 = 0x04;
|
497 |
| -// Not-less-than (unordered, signaling) |
| 506 | +/// Not-less-than (unordered, signaling) |
498 | 507 | pub const _CMP_NLT_US: u8 = 0x05;
|
499 |
| -// Not-less-than-or-equal (unordered, signaling) |
| 508 | +/// Not-less-than-or-equal (unordered, signaling) |
500 | 509 | pub const _CMP_NLE_US: u8 = 0x06;
|
501 |
| -// Ordered (non-signaling) |
| 510 | +/// Ordered (non-signaling) |
502 | 511 | pub const _CMP_ORD_Q: u8 = 0x07;
|
503 |
| -// Equal (unordered, non-signaling) |
| 512 | +/// Equal (unordered, non-signaling) |
504 | 513 | pub const _CMP_EQ_UQ: u8 = 0x08;
|
505 |
| -// Not-greater-than-or-equal (unordered, signaling) |
| 514 | +/// Not-greater-than-or-equal (unordered, signaling) |
506 | 515 | pub const _CMP_NGE_US: u8 = 0x09;
|
507 |
| -// Not-greater-than (unordered, signaling) |
| 516 | +/// Not-greater-than (unordered, signaling) |
508 | 517 | pub const _CMP_NGT_US: u8 = 0x0a;
|
509 |
| -// False (ordered, non-signaling) |
| 518 | +/// False (ordered, non-signaling) |
510 | 519 | pub const _CMP_FALSE_OQ: u8 = 0x0b;
|
511 |
| -// Not-equal (ordered, non-signaling) |
| 520 | +/// Not-equal (ordered, non-signaling) |
512 | 521 | pub const _CMP_NEQ_OQ: u8 = 0x0c;
|
513 |
| -// Greater-than-or-equal (ordered, signaling) |
| 522 | +/// Greater-than-or-equal (ordered, signaling) |
514 | 523 | pub const _CMP_GE_OS: u8 = 0x0d;
|
515 |
| -// Greater-than (ordered, signaling) |
| 524 | +/// Greater-than (ordered, signaling) |
516 | 525 | pub const _CMP_GT_OS: u8 = 0x0e;
|
517 |
| -// True (unordered, non-signaling) |
| 526 | +/// True (unordered, non-signaling) |
518 | 527 | pub const _CMP_TRUE_UQ: u8 = 0x0f;
|
519 |
| -// Equal (ordered, signaling) |
| 528 | +/// Equal (ordered, signaling) |
520 | 529 | pub const _CMP_EQ_OS: u8 = 0x10;
|
521 |
| -// Less-than (ordered, non-signaling) |
| 530 | +/// Less-than (ordered, non-signaling) |
522 | 531 | pub const _CMP_LT_OQ: u8 = 0x11;
|
523 |
| -// Less-than-or-equal (ordered, non-signaling) |
| 532 | +/// Less-than-or-equal (ordered, non-signaling) |
524 | 533 | pub const _CMP_LE_OQ: u8 = 0x12;
|
525 |
| -// Unordered (signaling) |
| 534 | +/// Unordered (signaling) |
526 | 535 | pub const _CMP_UNORD_S: u8 = 0x13;
|
527 |
| -// Not-equal (unordered, signaling) |
| 536 | +/// Not-equal (unordered, signaling) |
528 | 537 | pub const _CMP_NEQ_US: u8 = 0x14;
|
529 |
| -// Not-less-than (unordered, non-signaling) |
| 538 | +/// Not-less-than (unordered, non-signaling) |
530 | 539 | pub const _CMP_NLT_UQ: u8 = 0x15;
|
531 |
| -// Not-less-than-or-equal (unordered, non-signaling) |
| 540 | +/// Not-less-than-or-equal (unordered, non-signaling) |
532 | 541 | pub const _CMP_NLE_UQ: u8 = 0x16;
|
533 |
| -// Ordered (signaling) |
| 542 | +/// Ordered (signaling) |
534 | 543 | pub const _CMP_ORD_S: u8 = 0x17;
|
535 |
| -// Equal (unordered, signaling) |
| 544 | +/// Equal (unordered, signaling) |
536 | 545 | pub const _CMP_EQ_US: u8 = 0x18;
|
537 |
| -// Not-greater-than-or-equal (unordered, non-signaling) |
| 546 | +/// Not-greater-than-or-equal (unordered, non-signaling) |
538 | 547 | pub const _CMP_NGE_UQ: u8 = 0x19;
|
539 |
| -// Not-greater-than (unordered, non-signaling) |
| 548 | +/// Not-greater-than (unordered, non-signaling) |
540 | 549 | pub const _CMP_NGT_UQ: u8 = 0x1a;
|
541 |
| -// False (ordered, signaling) |
| 550 | +/// False (ordered, signaling) |
542 | 551 | pub const _CMP_FALSE_OS: u8 = 0x1b;
|
543 |
| -// Not-equal (ordered, signaling) |
| 552 | +/// Not-equal (ordered, signaling) |
544 | 553 | pub const _CMP_NEQ_OS: u8 = 0x1c;
|
545 |
| -// Greater-than-or-equal (ordered, non-signaling) |
| 554 | +/// Greater-than-or-equal (ordered, non-signaling) |
546 | 555 | pub const _CMP_GE_OQ: u8 = 0x1d;
|
547 |
| -// Greater-than (ordered, non-signaling) |
| 556 | +/// Greater-than (ordered, non-signaling) |
548 | 557 | pub const _CMP_GT_OQ: u8 = 0x1e;
|
549 |
| -// True (unordered, signaling) |
| 558 | +/// True (unordered, signaling) |
550 | 559 | pub const _CMP_TRUE_US: u8 = 0x1f;
|
551 | 560 |
|
552 | 561 | /// Compare packed double-precision (64-bit) floating-point
|
@@ -806,12 +815,11 @@ pub unsafe fn _mm_permutevar_ps(a: f32x4, b: i32x4) -> f32x4 {
|
806 | 815 | #[target_feature = "+avx"]
|
807 | 816 | #[cfg_attr(test, assert_instr(vpermilps, imm8 = 9))]
|
808 | 817 | pub unsafe fn _mm256_permute_ps(a: f32x8, imm8: i32) -> f32x8 {
|
809 |
| - const fn add4(x: u32) -> u32 { x + 4 } |
810 | 818 | let imm8 = (imm8 & 0xFF) as u8;
|
811 | 819 | macro_rules! shuffle4 {
|
812 | 820 | ($a:expr, $b:expr, $c:expr, $d:expr) => {
|
813 | 821 | simd_shuffle8(a, _mm256_undefined_ps(), [
|
814 |
| - $a, $b, $c, $d, add4($a), add4($b), add4($c), add4($d) |
| 822 | + $a, $b, $c, $d, $a + 4, $b + 4, $c + 4, $d + 4 |
815 | 823 | ])
|
816 | 824 | }
|
817 | 825 | }
|
@@ -907,6 +915,8 @@ pub unsafe fn _mm_permute_ps(a: f32x4, imm8: i32) -> f32x4 {
|
907 | 915 | }
|
908 | 916 | }
|
909 | 917 |
|
| 918 | +/// Shuffle double-precision (64-bit) floating-point elements in `a` |
| 919 | +/// within 128-bit lanes using the control in `b`. |
910 | 920 | #[inline(always)]
|
911 | 921 | #[target_feature = "+avx"]
|
912 | 922 | #[cfg_attr(test, assert_instr(vpermilpd))]
|
|
0 commit comments