
Commit 59271e0

marxin authored and Amanieu committed
Fix and document _mmX_alignr_epiX family of intrinsics
1 parent 7ef7a4a commit 59271e0

File tree: 3 files changed, +36 -27 lines changed

crates/core_arch/src/x86/avx2.rs

Lines changed: 1 addition & 1 deletion
@@ -166,7 +166,7 @@ pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m
     static_assert_uimm_bits!(IMM8, 8);
     // If palignr is shifting the pair of vectors more than the size of two
     // lanes, emit zero.
-    if IMM8 > 32 {
+    if IMM8 >= 32 {
         return _mm256_setzero_si256();
     }
     // If palignr is shifting the pair of input vectors more than one lane,
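
The boundary change from `> 32` to `>= 32` matters because palignr builds, per 128-bit lane, a 32-byte temporary from the two 16-byte inputs; a shift of exactly 32 bytes already moves every input byte out of the result, so it must yield zero rather than fall through to the shuffle path. A minimal scalar sketch of one lane (the helper name is illustrative, not part of the crate) shows the boundary:

fn alignr_lane(a: [u8; 16], b: [u8; 16], shift: usize) -> [u8; 16] {
    // 32-byte temporary: `b` in the low 16 bytes, `a` in the high 16 bytes.
    let mut temp = [0u8; 32];
    temp[..16].copy_from_slice(&b);
    temp[16..].copy_from_slice(&a);
    // Bytes shifted in from beyond the temporary are zero, so a shift of 32
    // or more clears the whole result -- hence `IMM8 >= 32`, not `IMM8 > 32`.
    let mut dst = [0u8; 16];
    for i in 0..16 {
        dst[i] = if i + shift < 32 { temp[i + shift] } else { 0 };
    }
    dst
}

fn main() {
    let (a, b) = ([1u8; 16], [2u8; 16]);
    assert_eq!(alignr_lane(a, b, 32), [0u8; 16]); // the case the old `> 32` check missed
    assert_eq!(alignr_lane(a, b, 16), a);         // shifting by exactly one lane yields `a`
}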

crates/core_arch/src/x86/avx512bw.rs

Lines changed: 10 additions & 2 deletions
@@ -4,6 +4,8 @@ use crate::{
     ptr,
 };
 
+use core::hint::unreachable_unchecked;
+
 #[cfg(test)]
 use stdarch_test::assert_instr;
 
@@ -10850,6 +10852,8 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
 }
 
 /// Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by imm8 bytes, and store the low 16 bytes in dst.
+/// Unlike [`_mm_alignr_epi8`], [`_mm256_alignr_epi8`] functions, where the entire input vectors are concatenated to the temporary result,
+/// this concatenation happens in 4 steps, where each step builds 32-byte temporary result.
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_alignr_epi8&expand=263)
 #[inline]
@@ -10860,7 +10864,7 @@ pub unsafe fn _mm512_bsrli_epi128<const IMM8: i32>(a: __m512i) -> __m512i {
 pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m512i {
     // If palignr is shifting the pair of vectors more than the size of two
     // lanes, emit zero.
-    if IMM8 > 32 {
+    if IMM8 >= 32 {
         return _mm512_setzero_si512();
     }
     // If palignr is shifting the pair of input vectors more than one lane,
@@ -10873,6 +10877,10 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
     let a = a.as_i8x64();
     let b = b.as_i8x64();
 
+    if IMM8 == 16 {
+        return transmute(a);
+    }
+
     let r: i8x64 = match IMM8 % 16 {
         0 => simd_shuffle!(
             b,
@@ -11031,7 +11039,7 @@ pub unsafe fn _mm512_alignr_epi8<const IMM8: i32>(a: __m512i, b: __m512i) -> __m
                 121, 122, 123, 124, 125, 126,
             ],
         ),
-        _ => b,
+        _ => unreachable_unchecked(),
     };
     transmute(r)
 }
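
The added doc sentence and the new `IMM8 == 16` early return both come from the per-lane model of `vpalignr`: the 512-bit form never moves bytes across 128-bit lanes, it applies the same 16-byte alignr to each of the four lanes independently. A scalar sketch of that model (illustrative helper, not the crate's implementation), reusing the per-lane behaviour shown above:

fn alignr_epi8_512(a: [u8; 64], b: [u8; 64], shift: usize) -> [u8; 64] {
    let mut dst = [0u8; 64];
    // Four independent 128-bit lanes; no byte ever crosses a lane boundary.
    for lane in 0..4 {
        let base = lane * 16;
        // 32-byte temporary for this lane only: `b`-lane low, `a`-lane high.
        let mut temp = [0u8; 32];
        temp[..16].copy_from_slice(&b[base..base + 16]);
        temp[16..].copy_from_slice(&a[base..base + 16]);
        for i in 0..16 {
            dst[base + i] = if i + shift < 32 { temp[i + shift] } else { 0 };
        }
    }
    dst
}

fn main() {
    let (a, b) = ([0xAAu8; 64], [0xBBu8; 64]);
    assert_eq!(alignr_epi8_512(a, b, 16), a);         // new fast path: IMM8 == 16 is exactly `a`
    assert_eq!(alignr_epi8_512(a, b, 32), [0u8; 64]); // IMM8 >= 32 is all zeros
}

With `IMM8 >= 32` and `IMM8 == 16` handled up front, every value `IMM8 % 16` can take has an explicit match arm, so the catch-all needed only for exhaustiveness becomes `unreachable_unchecked()` instead of silently returning `b`.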

crates/core_arch/src/x86/avx512f.rs

Lines changed: 25 additions & 24 deletions
@@ -6,6 +6,7 @@ use crate::{
     mem, ptr,
 };
 
+use core::hint::unreachable_unchecked;
 #[cfg(test)]
 use stdarch_test::assert_instr;
 
@@ -26202,6 +26203,8 @@ pub unsafe fn _mm_mask_blend_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d
 
 /// Concatenate a and b into a 128-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 64 bytes (16 elements) in dst.
 ///
+/// <div class="warning">Only lowest <strong>4 bits</strong> are used from the mask (shift at maximum by 60 bytes)!</div>
+///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_alignr_epi32&expand=245)
 #[inline]
 #[target_feature(enable = "avx512f")]
@@ -26269,7 +26272,8 @@ pub unsafe fn _mm512_alignr_epi32<const IMM8: i32>(a: __m512i, b: __m512i) -> __
         12 => simd_shuffle!(a, b, [28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
         13 => simd_shuffle!(a, b, [29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]),
         14 => simd_shuffle!(a, b, [30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]),
-        _ => simd_shuffle!(a, b, [31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
+        15 => simd_shuffle!(a, b, [31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
+        _ => unreachable_unchecked(),
     };
     transmute(r)
 }
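
The new warning spells out the element-granularity behaviour of `valignd`: the shift count is the immediate reduced modulo the element count, so for the 512-bit form only the low 4 bits are meaningful (a shift of at most 15 elements, i.e. 60 bytes). A scalar sketch of that rotation (illustrative helper, not crate code):

fn alignr_epi32_512(a: [i32; 16], b: [i32; 16], imm8: u32) -> [i32; 16] {
    // Only the low 4 bits of the immediate contribute to the shift count.
    let shift = (imm8 % 16) as usize;
    // 32-element temporary: `b` in the low half, `a` in the high half.
    let mut temp = [0i32; 32];
    temp[..16].copy_from_slice(&b);
    temp[16..].copy_from_slice(&a);
    core::array::from_fn(|i| temp[i + shift])
}

fn main() {
    let a: [i32; 16] = core::array::from_fn(|i| 100 + i as i32);
    let b: [i32; 16] = core::array::from_fn(|i| i as i32);
    assert_eq!(alignr_epi32_512(a, b, 0), b);       // shift 0 is plain `b`
    assert_eq!(alignr_epi32_512(a, b, 16), b);      // 16 wraps back to 0
    assert_eq!(alignr_epi32_512(a, b, 1)[15], 100); // element 15 comes from a[0]
}

Because the reduced count can only be 0 through 15 and each of those values now has an explicit arm, the leftover `_` arm is genuinely unreachable; marking it `unreachable_unchecked()` documents that instead of silently duplicating the arm for 15.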
@@ -26313,6 +26317,8 @@ pub unsafe fn _mm512_maskz_alignr_epi32<const IMM8: i32>(
 
 /// Concatenate a and b into a 64-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 32 bytes (8 elements) in dst.
 ///
+/// <div class="warning">Only lowest <strong>3 bits</strong> are used from the mask (shift at maximum by 28 bytes)!</div>
+///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_alignr_epi32&expand=242)
 #[inline]
 #[target_feature(enable = "avx512f,avx512vl")]
@@ -26323,7 +26329,7 @@ pub unsafe fn _mm256_alignr_epi32<const IMM8: i32>(a: __m256i, b: __m256i) -> __
     static_assert_uimm_bits!(IMM8, 8);
     let a = a.as_i32x8();
     let b = b.as_i32x8();
-    let imm8: i32 = IMM8 % 16;
+    let imm8: i32 = IMM8 % 8;
     let r: i32x8 = match imm8 {
         0 => simd_shuffle!(a, b, [8, 9, 10, 11, 12, 13, 14, 15]),
         1 => simd_shuffle!(a, b, [9, 10, 11, 12, 13, 14, 15, 0]),
@@ -26333,14 +26339,7 @@ pub unsafe fn _mm256_alignr_epi32<const IMM8: i32>(a: __m256i, b: __m256i) -> __
         5 => simd_shuffle!(a, b, [13, 14, 15, 0, 1, 2, 3, 4]),
         6 => simd_shuffle!(a, b, [14, 15, 0, 1, 2, 3, 4, 5]),
         7 => simd_shuffle!(a, b, [15, 0, 1, 2, 3, 4, 5, 6]),
-        8 => simd_shuffle!(a, b, [0, 1, 2, 3, 4, 5, 6, 7]),
-        9 => simd_shuffle!(a, b, [1, 2, 3, 4, 5, 6, 7, 8]),
-        10 => simd_shuffle!(a, b, [2, 3, 4, 5, 6, 7, 8, 9]),
-        11 => simd_shuffle!(a, b, [3, 4, 5, 6, 7, 8, 9, 10]),
-        12 => simd_shuffle!(a, b, [4, 5, 6, 7, 8, 9, 10, 11]),
-        13 => simd_shuffle!(a, b, [5, 6, 7, 8, 9, 10, 11, 12]),
-        14 => simd_shuffle!(a, b, [6, 7, 8, 9, 10, 11, 12, 13]),
-        _ => simd_shuffle!(a, b, [7, 8, 9, 10, 11, 12, 13, 14]),
+        _ => unreachable_unchecked(),
     };
     transmute(r)
 }
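
Unlike the 512-bit hunk above, this one fixes a real wrong-result bug rather than dead code: an 8-element vector has to reduce the immediate modulo 8, but the old code reduced modulo 16 and kept shuffle arms for counts 8 through 15, so for example IMM8 = 8 returned `a` instead of wrapping around to `b`. A small scalar sketch of the corrected behaviour (hypothetical helper name):

fn alignr_epi32_256(a: [i32; 8], b: [i32; 8], imm8: u32) -> [i32; 8] {
    // The fixed reduction: modulo the element count (8), not modulo 16.
    let shift = (imm8 % 8) as usize;
    let mut temp = [0i32; 16];
    temp[..8].copy_from_slice(&b);
    temp[8..].copy_from_slice(&a);
    core::array::from_fn(|i| temp[i + shift])
}

fn main() {
    let a: [i32; 8] = [10, 11, 12, 13, 14, 15, 16, 17];
    let b: [i32; 8] = [0, 1, 2, 3, 4, 5, 6, 7];
    assert_eq!(alignr_epi32_256(a, b, 8), b); // wraps to 0; the old `% 16` returned `a` here
    assert_eq!(alignr_epi32_256(a, b, 3), [3, 4, 5, 6, 7, 10, 11, 12]);
}

The same pattern (reduce by the element count, keep only the arms that can actually be hit, make the rest `unreachable_unchecked()`) is applied to the remaining epi32 and epi64 variants below.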
@@ -26384,6 +26383,8 @@ pub unsafe fn _mm256_maskz_alignr_epi32<const IMM8: i32>(
 
 /// Concatenate a and b into a 32-byte immediate result, shift the result right by imm8 32-bit elements, and store the low 16 bytes (4 elements) in dst.
 ///
+/// <div class="warning">Only lowest <strong>2 bits</strong> are used from the mask (shift at maximum by 12 bytes)!</div>
+///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi32&expand=239)
 #[inline]
 #[target_feature(enable = "avx512f,avx512vl")]
@@ -26394,16 +26395,13 @@ pub unsafe fn _mm_alignr_epi32<const IMM8: i32>(a: __m128i, b: __m128i) -> __m12
     static_assert_uimm_bits!(IMM8, 8);
     let a = a.as_i32x4();
     let b = b.as_i32x4();
-    let imm8: i32 = IMM8 % 8;
+    let imm8: i32 = IMM8 % 4;
     let r: i32x4 = match imm8 {
         0 => simd_shuffle!(a, b, [4, 5, 6, 7]),
         1 => simd_shuffle!(a, b, [5, 6, 7, 0]),
         2 => simd_shuffle!(a, b, [6, 7, 0, 1]),
         3 => simd_shuffle!(a, b, [7, 0, 1, 2]),
-        4 => simd_shuffle!(a, b, [0, 1, 2, 3]),
-        5 => simd_shuffle!(a, b, [1, 2, 3, 0]),
-        6 => simd_shuffle!(a, b, [2, 3, 0, 1]),
-        _ => simd_shuffle!(a, b, [3, 0, 1, 2]),
+        _ => unreachable_unchecked(),
     };
     transmute(r)
 }
@@ -26447,6 +26445,8 @@ pub unsafe fn _mm_maskz_alignr_epi32<const IMM8: i32>(
 
 /// Concatenate a and b into a 128-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 64 bytes (8 elements) in dst.
 ///
+/// <div class="warning">Only lowest <strong>3 bits</strong> are used from the mask (shift at maximum by 56 bytes)!</div>
+///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_alignr_epi64&expand=254)
 #[inline]
 #[target_feature(enable = "avx512f")]
@@ -26464,7 +26464,8 @@ pub unsafe fn _mm512_alignr_epi64<const IMM8: i32>(a: __m512i, b: __m512i) -> __
         4 => simd_shuffle!(a, b, [12, 13, 14, 15, 0, 1, 2, 3]),
         5 => simd_shuffle!(a, b, [13, 14, 15, 0, 1, 2, 3, 4]),
         6 => simd_shuffle!(a, b, [14, 15, 0, 1, 2, 3, 4, 5]),
-        _ => simd_shuffle!(a, b, [15, 0, 1, 2, 3, 4, 5, 6]),
+        7 => simd_shuffle!(a, b, [15, 0, 1, 2, 3, 4, 5, 6]),
+        _ => unreachable_unchecked(),
     };
     transmute(r)
 }
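
For completeness, a usage sketch of the fixed 512-bit qword variant. It assumes an x86_64 target and a toolchain on which the AVX-512 intrinsics are available (on older compilers they sit behind a nightly feature gate); the `demo` scaffolding and the chosen constants are illustrative:

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f")]
unsafe fn demo() {
    use std::arch::x86_64::*;
    // a = [10, 11, ..., 17], b = [0, 1, ..., 7] (element 0 is listed last in set_epi64).
    let a = _mm512_set_epi64(17, 16, 15, 14, 13, 12, 11, 10);
    let b = _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0);
    // Shift the 16-element concatenation a:b right by 3 qwords: the low five
    // results come from b[3..8], the high three from a[0..3].
    let r = _mm512_alignr_epi64::<3>(a, b);
    let mut out = [0i64; 8];
    _mm512_storeu_epi64(out.as_mut_ptr(), r);
    assert_eq!(out, [3, 4, 5, 6, 7, 10, 11, 12]);
}

#[cfg(target_arch = "x86_64")]
fn main() {
    if is_x86_feature_detected!("avx512f") {
        unsafe { demo() }
    } else {
        println!("AVX-512F not detected, skipping");
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn main() {}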
@@ -26508,6 +26509,8 @@ pub unsafe fn _mm512_maskz_alignr_epi64<const IMM8: i32>(
 
 /// Concatenate a and b into a 64-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 32 bytes (4 elements) in dst.
 ///
+/// <div class="warning">Only lowest <strong>2 bits</strong> are used from the mask (shift at maximum by 24 bytes)!</div>
+///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_alignr_epi64&expand=251)
 #[inline]
 #[target_feature(enable = "avx512f,avx512vl")]
@@ -26516,16 +26519,13 @@ pub unsafe fn _mm512_maskz_alignr_epi64<const IMM8: i32>(
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn _mm256_alignr_epi64<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
     static_assert_uimm_bits!(IMM8, 8);
-    let imm8: i32 = IMM8 % 8;
+    let imm8: i32 = IMM8 % 4;
     let r: i64x4 = match imm8 {
         0 => simd_shuffle!(a, b, [4, 5, 6, 7]),
         1 => simd_shuffle!(a, b, [5, 6, 7, 0]),
         2 => simd_shuffle!(a, b, [6, 7, 0, 1]),
         3 => simd_shuffle!(a, b, [7, 0, 1, 2]),
-        4 => simd_shuffle!(a, b, [0, 1, 2, 3]),
-        5 => simd_shuffle!(a, b, [1, 2, 3, 4]),
-        6 => simd_shuffle!(a, b, [2, 3, 4, 5]),
-        _ => simd_shuffle!(a, b, [3, 4, 5, 6]),
+        _ => unreachable_unchecked(),
     };
     transmute(r)
 }
@@ -26569,6 +26569,8 @@ pub unsafe fn _mm256_maskz_alignr_epi64<const IMM8: i32>(
 
 /// Concatenate a and b into a 32-byte immediate result, shift the result right by imm8 64-bit elements, and store the low 16 bytes (2 elements) in dst.
 ///
+/// <div class="warning">Only lowest <strong>bit</strong> is used from the mask (shift at maximum by 8 bytes)!</div>
+///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi64&expand=248)
 #[inline]
 #[target_feature(enable = "avx512f,avx512vl")]
@@ -26577,12 +26579,11 @@ pub unsafe fn _mm256_maskz_alignr_epi64<const IMM8: i32>(
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn _mm_alignr_epi64<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
     static_assert_uimm_bits!(IMM8, 8);
-    let imm8: i32 = IMM8 % 4;
+    let imm8: i32 = IMM8 % 2;
    let r: i64x2 = match imm8 {
         0 => simd_shuffle!(a, b, [2, 3]),
         1 => simd_shuffle!(a, b, [3, 0]),
-        2 => simd_shuffle!(a, b, [0, 1]),
-        _ => simd_shuffle!(a, b, [1, 2]),
+        _ => unreachable_unchecked(),
     };
     transmute(r)
 }
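
The 128-bit qword case is the smallest instance of the same fix: with only two elements, just the lowest bit of the immediate is meaningful, so IMM8 = 2 must behave like IMM8 = 0 (plain `b`) instead of returning `a` as the old `% 4` code did. A minimal sketch (hypothetical helper name):

fn alignr_epi64_128(a: [i64; 2], b: [i64; 2], imm8: u32) -> [i64; 2] {
    match imm8 % 2 {
        0 => b,             // shift by 0 qwords: plain `b`
        _ => [b[1], a[0]],  // shift by 1 qword
    }
}

fn main() {
    let (a, b) = ([10, 11], [0, 1]);
    assert_eq!(alignr_epi64_128(a, b, 2), b);       // wraps to 0; old code returned `a`
    assert_eq!(alignr_epi64_128(a, b, 1), [1, 10]);
}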
