Skip to content

Commit 561922d

Browse files
hdevalencegnzlbg
authored andcommitted
Add AVX512VL variants of IFMA instructions.
1 parent 3f52beb commit 561922d

File tree

1 file changed

+112
-0
lines changed

1 file changed

+112
-0
lines changed

crates/core_arch/src/x86/avx512ifma.rs

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,62 @@ pub unsafe fn _mm512_madd52lo_epu64(a: __m512i, b: __m512i, c: __m512i) -> __m51
3131
vpmadd52luq_512(a, b, c)
3232
}
3333

34+
/// Multiply packed unsigned 52-bit integers in each 64-bit element of
35+
/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
36+
/// unsigned integer from the intermediate result with the
37+
/// corresponding unsigned 64-bit integer in `a`, and store the
38+
/// results in `dst`.
39+
///
40+
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=vpmadd52&avx512techs=AVX512IFMA52,AVX512VL&expand=3485)
41+
#[inline]
42+
#[target_feature(enable = "avx512ifma,avx512vl")]
43+
#[cfg_attr(test, assert_instr(vpmadd52huq))]
44+
pub unsafe fn _mm256_madd52hi_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
45+
vpmadd52huq_256(a, b, c)
46+
}
47+
48+
/// Multiply packed unsigned 52-bit integers in each 64-bit element of
49+
/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
50+
/// unsigned integer from the intermediate result with the
51+
/// corresponding unsigned 64-bit integer in `a`, and store the
52+
/// results in `dst`.
53+
///
54+
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=vpmadd52&avx512techs=AVX512IFMA52,AVX512VL&expand=3494)
55+
#[inline]
56+
#[target_feature(enable = "avx512ifma,avx512vl")]
57+
#[cfg_attr(test, assert_instr(vpmadd52huq))]
58+
pub unsafe fn _mm256_madd52lo_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
59+
vpmadd52luq_256(a, b, c)
60+
}
61+
62+
/// Multiply packed unsigned 52-bit integers in each 64-bit element of
63+
/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
64+
/// unsigned integer from the intermediate result with the
65+
/// corresponding unsigned 64-bit integer in `a`, and store the
66+
/// results in `dst`.
67+
///
68+
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=3488,3482&text=vpmadd52&avx512techs=AVX512IFMA52,AVX512VL)
69+
#[inline]
70+
#[target_feature(enable = "avx512ifma,avx512vl")]
71+
#[cfg_attr(test, assert_instr(vpmadd52huq))]
72+
pub unsafe fn _mm_madd52hi_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
73+
vpmadd52huq_128(a, b, c)
74+
}
75+
76+
/// Multiply packed unsigned 52-bit integers in each 64-bit element of
77+
/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
78+
/// unsigned integer from the intermediate result with the
79+
/// corresponding unsigned 64-bit integer in `a`, and store the
80+
/// results in `dst`.
81+
///
82+
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=3488,3491&text=vpmadd52&avx512techs=AVX512IFMA52,AVX512VL)
83+
#[inline]
84+
#[target_feature(enable = "avx512ifma,avx512vl")]
85+
#[cfg_attr(test, assert_instr(vpmadd52huq))]
86+
pub unsafe fn _mm_madd52lo_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
87+
vpmadd52luq_128(a, b, c)
88+
}
89+
3490
#[allow(improper_ctypes)]
3591
extern "C" {
3692
#[link_name = "llvm.x86.avx512.vpmadd52l.uq.128"]
@@ -81,4 +137,60 @@ mod tests {
81137

82138
assert_eq_m512i(a, expected);
83139
}
140+
141+
#[simd_test(enable = "avx512ifma,avx512vl")]
142+
unsafe fn test_mm256_madd52hi_epu64() {
143+
let mut a = _mm256_set1_epi64x(10 << 40);
144+
let b = _mm256_set1_epi64x((11 << 40) + 4);
145+
let c = _mm256_set1_epi64x((12 << 40) + 3);
146+
147+
a = _mm256_madd52hi_epu64(a, b, c);
148+
149+
// (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
150+
let expected = _mm256_set1_epi64x(11030549757952);
151+
152+
assert_eq_m256i(a, expected);
153+
}
154+
155+
#[simd_test(enable = "avx512ifma,avx512vl")]
156+
unsafe fn test_mm256_madd52lo_epu64() {
157+
let mut a = _mm256_set1_epi64x(10 << 40);
158+
let b = _mm256_set1_epi64x((11 << 40) + 4);
159+
let c = _mm256_set1_epi64x((12 << 40) + 3);
160+
161+
a = _mm256_madd52lo_epu64(a, b, c);
162+
163+
// (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
164+
let expected = _mm256_set1_epi64x(100055558127628);
165+
166+
assert_eq_m256i(a, expected);
167+
}
168+
169+
#[simd_test(enable = "avx512ifma,avx512vl")]
170+
unsafe fn test_mm_madd52hi_epu64() {
171+
let mut a = _mm_set1_epi64x(10 << 40);
172+
let b = _mm_set1_epi64x((11 << 40) + 4);
173+
let c = _mm_set1_epi64x((12 << 40) + 3);
174+
175+
a = _mm_madd52hi_epu64(a, b, c);
176+
177+
// (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
178+
let expected = _mm_set1_epi64x(11030549757952);
179+
180+
assert_eq_m128i(a, expected);
181+
}
182+
183+
#[simd_test(enable = "avx512ifma,avx512vl")]
184+
unsafe fn test_mm_madd52lo_epu64() {
185+
let mut a = _mm_set1_epi64x(10 << 40);
186+
let b = _mm_set1_epi64x((11 << 40) + 4);
187+
let c = _mm_set1_epi64x((12 << 40) + 3);
188+
189+
a = _mm_madd52hi_epu64(a, b, c);
190+
191+
// (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
192+
let expected = _mm_set1_epi64x(11030549757952);
193+
194+
assert_eq_m128i(a, expected);
195+
}
84196
}

0 commit comments

Comments
 (0)