
x86_64 memcmp may recurse infinitely without SSE #470

Closed
@Demindiro

Description


While looking at the assembly of my kernel (which has SSE disabled), I noticed that memcmp recurses infinitely if a string of 32 bytes or longer is passed:

...
ffff800000029d84:       48 83 fa 20             cmp    rdx,0x20
ffff800000029d88:       0f 82 ae 00 00 00       jb     ffff800000029e3c <compiler_builtins::mem::memcmp+0xcc>
ffff800000029d8e:       48 89 54 24 10          mov    QWORD PTR [rsp+0x10],rdx
ffff800000029d93:       49 89 d6                mov    r14,rdx
ffff800000029d96:       49 c1 ee 05             shr    r14,0x5
ffff800000029d9a:       49 c1 e6 05             shl    r14,0x5
ffff800000029d9e:       4a 8d 04 33             lea    rax,[rbx+r14*1]
ffff800000029da2:       48 89 44 24 08          mov    QWORD PTR [rsp+0x8],rax
ffff800000029da7:       31 ed                   xor    ebp,ebp
ffff800000029da9:       4c 8d 64 24 18          lea    r12,[rsp+0x18]
ffff800000029dae:       4c 8d 6c 24 38          lea    r13,[rsp+0x38]
ffff800000029db3:       66 2e 0f 1f 84 00 00    nop    WORD PTR cs:[rax+rax*1+0x0]
ffff800000029dba:       00 00 00 
ffff800000029dbd:       0f 1f 00                nop    DWORD PTR [rax]
        tmp.assume_init()
ffff800000029dc0:       48 8b 44 2b 18          mov    rax,QWORD PTR [rbx+rbp*1+0x18]
ffff800000029dc5:       48 89 44 24 30          mov    QWORD PTR [rsp+0x30],rax
ffff800000029dca:       48 8b 44 2b 10          mov    rax,QWORD PTR [rbx+rbp*1+0x10]
ffff800000029dcf:       48 89 44 24 28          mov    QWORD PTR [rsp+0x28],rax
ffff800000029dd4:       48 8b 04 2b             mov    rax,QWORD PTR [rbx+rbp*1]
ffff800000029dd8:       48 8b 4c 2b 08          mov    rcx,QWORD PTR [rbx+rbp*1+0x8]
ffff800000029ddd:       48 89 4c 24 20          mov    QWORD PTR [rsp+0x20],rcx
ffff800000029de2:       48 89 44 24 18          mov    QWORD PTR [rsp+0x18],rax
ffff800000029de7:       49 8b 44 2f 18          mov    rax,QWORD PTR [r15+rbp*1+0x18]
ffff800000029dec:       48 89 44 24 50          mov    QWORD PTR [rsp+0x50],rax
ffff800000029df1:       49 8b 44 2f 10          mov    rax,QWORD PTR [r15+rbp*1+0x10]
ffff800000029df6:       48 89 44 24 48          mov    QWORD PTR [rsp+0x48],rax
ffff800000029dfb:       49 8b 04 2f             mov    rax,QWORD PTR [r15+rbp*1]
ffff800000029dff:       49 8b 4c 2f 08          mov    rcx,QWORD PTR [r15+rbp*1+0x8]
ffff800000029e04:       48 89 4c 24 40          mov    QWORD PTR [rsp+0x40],rcx
ffff800000029e09:       48 89 44 24 38          mov    QWORD PTR [rsp+0x38],rax
ffff800000029e0e:       ba 20 00 00 00          mov    edx,0x20
ffff800000029e13:       4c 89 e7                mov    rdi,r12
ffff800000029e16:       4c 89 ee                mov    rsi,r13
ffff800000029e19:       e8 72 fe ff ff          call   ffff800000029c90 <memcmp>
...

This seems to be caused by the comparison of [u128; 2] values, which internally calls core::intrinsics::raw_eq. The documentation for that intrinsic states:

Above some backend-decided threshold this will emit calls to memcmp, like slice equality does, instead of causing massive code size.
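
The relevant pattern looks roughly like the following (a minimal standalone sketch, assuming the cmp helper reads 32-byte chunks with read_unaligned and compares them with !=; the names here are illustrative, not the exact compare_bytes source):

use core::mem::{size_of, size_of_val};

// Sketch of the 32-byte comparison step. Per the docs quoted above,
// `!=` on [u128; 2] lowers to core::intrinsics::raw_eq, which the
// backend may expand into a call to memcmp. Inside memcmp itself that
// call recurses, and without SSE there is no inline expansion to
// avoid it.
unsafe fn chunks_eq(mut a: *const [u128; 2], mut b: *const [u128; 2], n: usize) -> bool {
    for _ in 0..n / size_of::<[u128; 2]>() {
        // SAFETY: the caller guarantees n readable bytes behind a and b.
        if unsafe { a.read_unaligned() != b.read_unaligned() } {
            return false;
        }
        a = unsafe { a.add(1) };
        b = unsafe { b.add(1) };
    }
    true
}

fn main() {
    let x = [[1u128, 2]; 4];
    let y = [[1u128, 2]; 4];
    assert!(unsafe { chunks_eq(x.as_ptr(), y.as_ptr(), size_of_val(&x)) });
}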

Replacing it with (u128, u128) avoids emitting a call to memcmp, though it severely impacts performance (~2x slower; a sketch contrasting the two shapes follows these results):

[u128; 2]
test memcmp_rust_1048576              ... bench:      24,756 ns/iter (+/- 238) = 42356 MB/s
test memcmp_rust_16                   ... bench:           3 ns/iter (+/- 0) = 5333 MB/s
test memcmp_rust_32                   ... bench:           4 ns/iter (+/- 0) = 8000 MB/s
test memcmp_rust_4096                 ... bench:         111 ns/iter (+/- 0) = 36900 MB/s
test memcmp_rust_64                   ... bench:           5 ns/iter (+/- 0) = 12800 MB/s
test memcmp_rust_8                    ... bench:           3 ns/iter (+/- 0) = 2666 MB/s
test memcmp_rust_unaligned_1048575    ... bench:      27,997 ns/iter (+/- 11,037) = 37453 MB/s
test memcmp_rust_unaligned_15         ... bench:           3 ns/iter (+/- 1) = 5333 MB/s
test memcmp_rust_unaligned_31         ... bench:           4 ns/iter (+/- 0) = 8000 MB/s
test memcmp_rust_unaligned_4095       ... bench:         101 ns/iter (+/- 2) = 40554 MB/s
test memcmp_rust_unaligned_63         ... bench:           5 ns/iter (+/- 0) = 12800 MB/s
test memcmp_rust_unaligned_7          ... bench:           3 ns/iter (+/- 0) = 2666 MB/s
(u128, u128)
test memcmp_rust_1048576              ... bench:      51,088 ns/iter (+/- 782) = 20524 MB/s
test memcmp_rust_16                   ... bench:           4 ns/iter (+/- 0) = 4000 MB/s
test memcmp_rust_32                   ... bench:           5 ns/iter (+/- 0) = 6400 MB/s
test memcmp_rust_4096                 ... bench:         170 ns/iter (+/- 3) = 24094 MB/s
test memcmp_rust_64                   ... bench:           6 ns/iter (+/- 0) = 10666 MB/s
test memcmp_rust_8                    ... bench:           3 ns/iter (+/- 0) = 2666 MB/s
test memcmp_rust_unaligned_1048575    ... bench:      56,451 ns/iter (+/- 832) = 18574 MB/s
test memcmp_rust_unaligned_15         ... bench:           4 ns/iter (+/- 0) = 4000 MB/s
test memcmp_rust_unaligned_31         ... bench:           4 ns/iter (+/- 0) = 8000 MB/s
test memcmp_rust_unaligned_4095       ... bench:         198 ns/iter (+/- 7) = 20686 MB/s
test memcmp_rust_unaligned_63         ... bench:           6 ns/iter (+/- 0) = 10666 MB/s
test memcmp_rust_unaligned_7          ... bench:           4 ns/iter (+/- 0) = 2000 MB/s
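
For reference, the difference between the two shapes can be seen in a small standalone sketch (illustrative only, not the actual compare_bytes source):

fn eq_array(a: &[u128; 2], b: &[u128; 2]) -> bool {
    // Array equality may lower to core::intrinsics::raw_eq and, above
    // the backend's threshold, to a call to memcmp.
    a == b
}

fn eq_tuple(a: &(u128, u128), b: &(u128, u128)) -> bool {
    // Tuple equality is derived field by field: two plain 128-bit
    // compares, which did not emit a memcmp call in the benchmarks
    // above -- at the cost of the measured slowdown.
    a == b
}

fn main() {
    assert!(eq_array(&[1, 2], &[1, 2]));
    assert!(eq_tuple(&(1, 2), &(1, 2)));
}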

Leaving out c32() entirely has a lesser but still significant performance impact (~1.3x slower), though comparisons of smaller strings seem to benefit:

No c32()
test memcmp_rust_1048576              ... bench:      31,405 ns/iter (+/- 426) = 33388 MB/s
test memcmp_rust_16                   ... bench:           3 ns/iter (+/- 0) = 5333 MB/s
test memcmp_rust_32                   ... bench:           3 ns/iter (+/- 0) = 10666 MB/s
test memcmp_rust_4096                 ... bench:         147 ns/iter (+/- 49) = 27863 MB/s
test memcmp_rust_64                   ... bench:           5 ns/iter (+/- 0) = 12800 MB/s
test memcmp_rust_8                    ... bench:           2 ns/iter (+/- 0) = 4000 MB/s
test memcmp_rust_unaligned_1048575    ... bench:      31,616 ns/iter (+/- 2,652) = 33165 MB/s
test memcmp_rust_unaligned_15         ... bench:           3 ns/iter (+/- 0) = 5333 MB/s
test memcmp_rust_unaligned_31         ... bench:           3 ns/iter (+/- 0) = 10666 MB/s
test memcmp_rust_unaligned_4095       ... bench:         139 ns/iter (+/- 35) = 29467 MB/s
test memcmp_rust_unaligned_63         ... bench:           4 ns/iter (+/- 0) = 16000 MB/s
test memcmp_rust_unaligned_7          ... bench:           2 ns/iter (+/- 0) = 4000 MB/s

Given that targets with SSE(2) do not seem to generate a call to memcmp even in debug mode, it may be worth keeping c32() when this feature is present. I don't know whether it is worth the gamble, though.

diff --git a/src/mem/x86_64.rs b/src/mem/x86_64.rs
index 4d2f6e5..e295e89 100644
--- a/src/mem/x86_64.rs
+++ b/src/mem/x86_64.rs
@@ -143,5 +143,9 @@ pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 {
     let c8 = |a: *const u64, b, n| cmp(a, b, n, c4);
     let c16 = |a: *const u128, b, n| cmp(a, b, n, c8);
     let c32 = |a: *const [u128; 2], b, n| cmp(a, b, n, c16);
-    c32(a.cast(), b.cast(), n)
+    if cfg!(target_feature = "sse2") {
+        c32(a.cast(), b.cast(), n)
+    } else {
+        c16(a.cast(), b.cast(), n)
+    }
 }
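
Since cfg!(target_feature = "sse2") expands to a compile-time constant, the untaken branch should be optimized out entirely, so a no-SSE build never reaches the c32 path and its [u128; 2] comparison at all.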
