Skip to content

Arc::new duplicates stack memory #111603

Closed
@marc0246

Description

@marc0246

Running the following through Godbolt, using rustc 1.71.0-nightly (cba1407 2023-05-10) with -C opt-level=3:

use std::sync::Arc;

/// Minimal reproducer: builds a 1000-element `u64` array filled with `x` as a
/// local value and moves it into a freshly allocated `Arc`.
pub fn test1(x: u64) -> Arc<[u64]> {
    // 1000 elements * 8 bytes each = 8000 bytes for this local.
    let array = [x; 1000];
    // `Arc::new` yields `Arc<[u64; 1000]>`; it is unsize-coerced to the
    // declared return type `Arc<[u64]>`.
    Arc::new(array)
}

I expected rustc to create a single 8 KB stack allocation and one memcpy from it to the heap allocation. Instead, it creates two identical stack allocations and two memcpys to go along with them:

example::test1:
        push    rbx
        sub     rsp, 4096
        mov     qword ptr [rsp], 0
        sub     rsp, 4096
        mov     qword ptr [rsp], 0
        sub     rsp, 4096
        mov     qword ptr [rsp], 0
        sub     rsp, 3712
        movq    xmm0, rdi
        pshufd  xmm0, xmm0, 68
        mov     eax, 18
.LBB0_1:
        movdqu  xmmword ptr [rsp + 8*rax - 144], xmm0
        movdqu  xmmword ptr [rsp + 8*rax - 128], xmm0
        movdqu  xmmword ptr [rsp + 8*rax - 112], xmm0
        movdqu  xmmword ptr [rsp + 8*rax - 96], xmm0
        movdqu  xmmword ptr [rsp + 8*rax - 80], xmm0
        movdqu  xmmword ptr [rsp + 8*rax - 64], xmm0
        movdqu  xmmword ptr [rsp + 8*rax - 48], xmm0
        movdqu  xmmword ptr [rsp + 8*rax - 32], xmm0
        movdqu  xmmword ptr [rsp + 8*rax - 16], xmm0
        movdqu  xmmword ptr [rsp + 8*rax], xmm0
        add     rax, 20
        cmp     rax, 1018
        jne     .LBB0_1
        lea     rdi, [rsp + 8000]
        mov     rsi, rsp
        mov     edx, 8000
        call    qword ptr [rip + memcpy@GOTPCREL]
        mov     edi, 8016
        mov     esi, 8
        call    qword ptr [rip + __rust_alloc@GOTPCREL]
        test    rax, rax
        je      .LBB0_3
        mov     rbx, rax
        mov     qword ptr [rax], 1
        mov     qword ptr [rax + 8], 1
        mov     rdi, rax
        add     rdi, 16
        lea     rsi, [rsp + 8000]
        mov     edx, 8000
        call    qword ptr [rip + memcpy@GOTPCREL]
        mov     edx, 1000
        mov     rax, rbx
        add     rsp, 16000
        pop     rbx
        ret
.LBB0_3:
        mov     edi, 8
        mov     esi, 8016
        call    qword ptr [rip + alloc::alloc::handle_alloc_error@GOTPCREL]
        ud2

To narrow the problem down further, I eliminated the stack allocations entirely:

#![feature(get_mut_unchecked, new_uninit)]

use std::sync::Arc;

/// Variant of the reproducer that writes the elements directly into the
/// `Arc`'s slice, without first building a local array.
pub fn test2(x: u64) -> Arc<[u64]> {
    // Create an `Arc` holding a slice of 1000 not-yet-initialized elements.
    let mut arc = Arc::new_uninit_slice(1000);
    // SAFETY: `arc` was created on the previous line and has not been cloned
    // or downgraded, so no other `Arc`/`Weak` pointer can observe the writes.
    for elem in unsafe { Arc::get_mut_unchecked(&mut arc) } {
        elem.write(x);
    }
    // SAFETY: the loop above wrote every element of the slice.
    unsafe { arc.assume_init() }
}

The code duplication still persists, but it is now limited to the arcinner_layout_for_value_layout call, which is made twice with identical arguments:

example::test2:
        push    r15
        push    r14
        push    r12
        push    rbx
        push    rax
        mov     rbx, rdi
        mov     r12, qword ptr [rip + alloc::sync::arcinner_layout_for_value_layout@GOTPCREL]
        mov     edi, 8
        mov     esi, 8000
        call    r12
        mov     r14, rax
        mov     r15, rdx
        mov     edi, 8
        mov     esi, 8000
        call    r12
        test    rdx, rdx
        je      .LBB0_2
        mov     rdi, rdx
        mov     rsi, rax
        call    qword ptr [rip + __rust_alloc@GOTPCREL]
.LBB0_2:
        test    rax, rax
        je      .LBB0_6
        mov     qword ptr [rax], 1
        mov     qword ptr [rax + 8], 1
        movq    xmm0, rbx
        pshufd  xmm0, xmm0, 68
        mov     ecx, 20
.LBB0_4:
        movdqu  xmmword ptr [rax + 8*rcx - 144], xmm0
        movdqu  xmmword ptr [rax + 8*rcx - 128], xmm0
        movdqu  xmmword ptr [rax + 8*rcx - 112], xmm0
        movdqu  xmmword ptr [rax + 8*rcx - 96], xmm0
        movdqu  xmmword ptr [rax + 8*rcx - 80], xmm0
        movdqu  xmmword ptr [rax + 8*rcx - 64], xmm0
        movdqu  xmmword ptr [rax + 8*rcx - 48], xmm0
        movdqu  xmmword ptr [rax + 8*rcx - 32], xmm0
        movdqu  xmmword ptr [rax + 8*rcx - 16], xmm0
        movdqu  xmmword ptr [rax + 8*rcx], xmm0
        add     rcx, 20
        cmp     rcx, 1020
        jne     .LBB0_4
        mov     edx, 1000
        add     rsp, 8
        pop     rbx
        pop     r12
        pop     r14
        pop     r15
        ret
.LBB0_6:
        mov     rdi, r14
        mov     rsi, r15
        call    qword ptr [rip + alloc::alloc::handle_alloc_error@GOTPCREL]
        ud2

Metadata

Metadata

Assignees

No one assigned

    Labels

    A-LLVM — Area: Code generation parts specific to LLVM. Both correctness bugs and optimization-related issues.
    A-codegen — Area: Code generation
    C-bug — Category: This is a bug.
    I-heavy — Issue: Problems and improvements with respect to binary size of generated code.
    I-slow — Issue: Problems and improvements with respect to performance of generated code.

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions