Skip to content

Commit b6bedbc

Browse files
RalfJungAmanieu
authored andcommitted
non-temporal stores: use inline assembly
1 parent 92464f1 commit b6bedbc

File tree

6 files changed

+59
-18
lines changed

6 files changed

+59
-18
lines changed

crates/core_arch/src/x86/avx.rs

+15-3
Original file line numberDiff line numberDiff line change
@@ -1718,7 +1718,11 @@ pub unsafe fn _mm256_lddqu_si256(mem_addr: *const __m256i) -> __m256i {
17181718
#[cfg_attr(test, assert_instr(vmovntps))] // FIXME vmovntdq
17191719
#[stable(feature = "simd_x86", since = "1.27.0")]
17201720
pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) {
1721-
intrinsics::nontemporal_store(mem_addr, a);
1721+
crate::arch::asm!(
1722+
"vmovntps [{mem_addr}], {a}",
1723+
mem_addr = in(reg) mem_addr,
1724+
a = in(ymm_reg) a,
1725+
);
17221726
}
17231727

17241728
/// Moves double-precision values from a 256-bit vector of `[4 x double]`
@@ -1741,7 +1745,11 @@ pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) {
17411745
#[stable(feature = "simd_x86", since = "1.27.0")]
17421746
#[allow(clippy::cast_ptr_alignment)]
17431747
pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) {
1744-
intrinsics::nontemporal_store(mem_addr as *mut __m256d, a);
1748+
crate::arch::asm!(
1749+
"vmovntps [{mem_addr}], {a}",
1750+
mem_addr = in(reg) mem_addr,
1751+
a = in(ymm_reg) a,
1752+
);
17451753
}
17461754

17471755
/// Moves single-precision floating point values from a 256-bit vector
@@ -1765,7 +1773,11 @@ pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) {
17651773
#[stable(feature = "simd_x86", since = "1.27.0")]
17661774
#[allow(clippy::cast_ptr_alignment)]
17671775
pub unsafe fn _mm256_stream_ps(mem_addr: *mut f32, a: __m256) {
1768-
intrinsics::nontemporal_store(mem_addr as *mut __m256, a);
1776+
crate::arch::asm!(
1777+
"vmovntps [{mem_addr}], {a}",
1778+
mem_addr = in(reg) mem_addr,
1779+
a = in(ymm_reg) a,
1780+
);
17691781
}
17701782

17711783
/// Computes the approximate reciprocal of packed single-precision (32-bit)

crates/core_arch/src/x86/avx512f.rs

+15-3
Original file line numberDiff line numberDiff line change
@@ -28014,7 +28014,11 @@ pub unsafe fn _mm_mask_testn_epi64_mask(k: __mmask8, a: __m128i, b: __m128i) ->
2801428014
#[cfg_attr(test, assert_instr(vmovntps))]
2801528015
#[allow(clippy::cast_ptr_alignment)]
2801628016
pub unsafe fn _mm512_stream_ps(mem_addr: *mut f32, a: __m512) {
28017-
intrinsics::nontemporal_store(mem_addr as *mut __m512, a);
28017+
crate::arch::asm!(
28018+
"vmovntps [{mem_addr}], {a}",
28019+
mem_addr = in(reg) mem_addr,
28020+
a = in(zmm_reg) a,
28021+
);
2801828022
}
2801928023

2802028024
/// Store 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
@@ -28035,7 +28039,11 @@ pub unsafe fn _mm512_stream_ps(mem_addr: *mut f32, a: __m512) {
2803528039
#[cfg_attr(test, assert_instr(vmovntps))] //should be vmovntpd
2803628040
#[allow(clippy::cast_ptr_alignment)]
2803728041
pub unsafe fn _mm512_stream_pd(mem_addr: *mut f64, a: __m512d) {
28038-
intrinsics::nontemporal_store(mem_addr as *mut __m512d, a);
28042+
crate::arch::asm!(
28043+
"vmovntps [{mem_addr}], {a}",
28044+
mem_addr = in(reg) mem_addr,
28045+
a = in(zmm_reg) a,
28046+
);
2803928047
}
2804028048

2804128049
/// Store 512-bits of integer data from a into memory using a non-temporal memory hint. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
@@ -28056,7 +28064,11 @@ pub unsafe fn _mm512_stream_pd(mem_addr: *mut f64, a: __m512d) {
2805628064
#[cfg_attr(test, assert_instr(vmovntps))] //should be vmovntdq
2805728065
#[allow(clippy::cast_ptr_alignment)]
2805828066
pub unsafe fn _mm512_stream_si512(mem_addr: *mut i64, a: __m512i) {
28059-
intrinsics::nontemporal_store(mem_addr as *mut __m512i, a);
28067+
crate::arch::asm!(
28068+
"vmovntps [{mem_addr}], {a}",
28069+
mem_addr = in(reg) mem_addr,
28070+
a = in(zmm_reg) a,
28071+
);
2806028072
}
2806128073

2806228074
/// Sets packed 32-bit integers in `dst` with the supplied values.

crates/core_arch/src/x86/mod.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
33
#[allow(unused_imports)]
44
use crate::marker::Sized;
5-
use crate::{intrinsics, mem::transmute};
5+
use crate::mem::transmute;
66

77
#[macro_use]
88
mod macros;

crates/core_arch/src/x86/sse.rs

+5-1
Original file line numberDiff line numberDiff line change
@@ -2002,7 +2002,11 @@ extern "C" {
20022002
#[stable(feature = "simd_x86", since = "1.27.0")]
20032003
#[allow(clippy::cast_ptr_alignment)]
20042004
pub unsafe fn _mm_stream_ps(mem_addr: *mut f32, a: __m128) {
2005-
intrinsics::nontemporal_store(mem_addr as *mut __m128, a);
2005+
crate::arch::asm!(
2006+
"movntps [{mem_addr}], {a}",
2007+
mem_addr = in(reg) mem_addr,
2008+
a = in(xmm_reg) a,
2009+
);
20062010
}
20072011

20082012
#[cfg(test)]

crates/core_arch/src/x86/sse2.rs

+17-5
Original file line numberDiff line numberDiff line change
@@ -1327,11 +1327,15 @@ pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) {
13271327
///
13281328
/// See [`_mm_sfence`] for details.
13291329
#[inline]
1330-
#[target_feature(enable = "sse2")]
1330+
#[target_feature(enable = "sse,sse2")]
13311331
#[cfg_attr(test, assert_instr(movntps))] // FIXME movntdq
13321332
#[stable(feature = "simd_x86", since = "1.27.0")]
13331333
pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) {
1334-
intrinsics::nontemporal_store(mem_addr, a);
1334+
crate::arch::asm!(
1335+
"movntps [{mem_addr}], {a}",
1336+
mem_addr = in(reg) mem_addr,
1337+
a = in(xmm_reg) a,
1338+
);
13351339
}
13361340

13371341
/// Stores a 32-bit integer value in the specified memory location.
@@ -1353,7 +1357,11 @@ pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) {
13531357
#[cfg_attr(test, assert_instr(movnti))]
13541358
#[stable(feature = "simd_x86", since = "1.27.0")]
13551359
pub unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32) {
1356-
intrinsics::nontemporal_store(mem_addr, a);
1360+
crate::arch::asm!(
1361+
"movnti [{mem_addr}], {a:e}", // `:e` for 32bit value
1362+
mem_addr = in(reg) mem_addr,
1363+
a = in(reg) a,
1364+
);
13571365
}
13581366

13591367
/// Returns a vector where the low element is extracted from `a` and its upper
@@ -2543,12 +2551,16 @@ pub unsafe fn _mm_loadl_pd(a: __m128d, mem_addr: *const f64) -> __m128d {
25432551
///
25442552
/// See [`_mm_sfence`] for details.
25452553
#[inline]
2546-
#[target_feature(enable = "sse2")]
2554+
#[target_feature(enable = "sse,sse2")]
25472555
#[cfg_attr(test, assert_instr(movntps))] // FIXME movntpd
25482556
#[stable(feature = "simd_x86", since = "1.27.0")]
25492557
#[allow(clippy::cast_ptr_alignment)]
25502558
pub unsafe fn _mm_stream_pd(mem_addr: *mut f64, a: __m128d) {
2551-
intrinsics::nontemporal_store(mem_addr as *mut __m128d, a);
2559+
crate::arch::asm!(
2560+
"movntps [{mem_addr}], {a}",
2561+
mem_addr = in(reg) mem_addr,
2562+
a = in(xmm_reg) a,
2563+
);
25522564
}
25532565

25542566
/// Stores the lower 64 bits of a 128-bit vector of `[2 x double]` to a

crates/core_arch/src/x86_64/sse2.rs

+6-5
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,6 @@
11
//! `x86_64`'s Streaming SIMD Extensions 2 (SSE2)
22
3-
use crate::{
4-
core_arch::x86::*,
5-
intrinsics::{self, simd::*},
6-
};
3+
use crate::{core_arch::x86::*, intrinsics::simd::*};
74

85
#[cfg(test)]
96
use stdarch_test::assert_instr;
@@ -81,7 +78,11 @@ pub unsafe fn _mm_cvttsd_si64x(a: __m128d) -> i64 {
8178
#[cfg_attr(test, assert_instr(movnti))]
8279
#[stable(feature = "simd_x86", since = "1.27.0")]
8380
pub unsafe fn _mm_stream_si64(mem_addr: *mut i64, a: i64) {
84-
intrinsics::nontemporal_store(mem_addr, a);
81+
crate::arch::asm!(
82+
"movnti [{mem_addr}], {a}",
83+
mem_addr = in(reg) mem_addr,
84+
a = in(reg) a,
85+
);
8586
}
8687

8788
/// Returns a vector whose lowest element is `a` and all higher elements are

0 commit comments

Comments
 (0)