Closed
Description
Running the following through Godbolt, using rustc 1.71.0-nightly (cba1407 2023-05-10) with -C opt-level=3:
use std::sync::Arc;
/// Reproduction case: fills a 1000-element `u64` array with `x` on the stack
/// and moves it into a freshly allocated `Arc`.
///
/// `Arc::new(array)` yields an `Arc<[u64; 1000]>`, which is unsized to the
/// declared return type `Arc<[u64]>`.
pub fn test1(x: u64) -> Arc<[u64]> {
// 1000 elements * 8 bytes = 8000 bytes held in a local before the move.
let array = [x; 1000];
// Moving `array` into the `Arc` is where the copy to the heap happens.
Arc::new(array)
}
I expected rustc to create an 8 KB stack allocation and a memcpy from it to the heap allocation. Instead, it creates two identical stack allocations and two memcpys to go along with them:
# Compiler output for `test1`. Annotations mark the two 8000-byte stack
# buffers and the two memcpy calls that the report is about.
example::test1:
# rbx is saved because it later holds the heap pointer across the memcpy call.
push rbx
# Reserve 3 * 4096 + 3712 = 16000 bytes of stack (two 8000-byte buffers),
# storing to [rsp] after each 4096-byte step (presumably stack probing).
sub rsp, 4096
mov qword ptr [rsp], 0
sub rsp, 4096
mov qword ptr [rsp], 0
sub rsp, 4096
mov qword ptr [rsp], 0
sub rsp, 3712
# Broadcast the argument in rdi into both 64-bit lanes of xmm0
# (shuffle mask 68 = 0b01000100 selects dwords 0,1,0,1).
movq xmm0, rdi
pshufd xmm0, xmm0, 68
# Fill loop: rax runs 18, 38, ..., 998 (50 iterations); each iteration does
# ten 16-byte stores, so 8000 bytes are written to [rsp, rsp + 8000).
# At rax = 18 the lowest address is rsp + 8*18 - 144 = rsp.
mov eax, 18
.LBB0_1:
movdqu xmmword ptr [rsp + 8*rax - 144], xmm0
movdqu xmmword ptr [rsp + 8*rax - 128], xmm0
movdqu xmmword ptr [rsp + 8*rax - 112], xmm0
movdqu xmmword ptr [rsp + 8*rax - 96], xmm0
movdqu xmmword ptr [rsp + 8*rax - 80], xmm0
movdqu xmmword ptr [rsp + 8*rax - 64], xmm0
movdqu xmmword ptr [rsp + 8*rax - 48], xmm0
movdqu xmmword ptr [rsp + 8*rax - 32], xmm0
movdqu xmmword ptr [rsp + 8*rax - 16], xmm0
movdqu xmmword ptr [rsp + 8*rax], xmm0
add rax, 20
cmp rax, 1018
jne .LBB0_1
# First memcpy: 8000 bytes from the first stack buffer (rsp) to the second
# stack buffer (rsp + 8000). This is the stack-to-stack copy.
lea rdi, [rsp + 8000]
mov rsi, rsp
mov edx, 8000
call qword ptr [rip + memcpy@GOTPCREL]
# Heap allocation with arguments 8016 and 8 (presumably size and alignment);
# 8016 = 8000 bytes of data + the 16 bytes written below.
mov edi, 8016
mov esi, 8
call qword ptr [rip + __rust_alloc@GOTPCREL]
# A null result branches to the allocation-failure path.
test rax, rax
je .LBB0_3
mov rbx, rax
# Store 1 into the first two 8-byte words of the allocation
# (presumably the Arc strong and weak counts).
mov qword ptr [rax], 1
mov qword ptr [rax + 8], 1
# Second memcpy: 8000 bytes from the second stack buffer (rsp + 8000) to
# the heap allocation at offset 16.
mov rdi, rax
add rdi, 16
lea rsi, [rsp + 8000]
mov edx, 8000
call qword ptr [rip + memcpy@GOTPCREL]
# Return the heap pointer in rax and 1000 in rdx
# (presumably the slice length of the returned fat pointer).
mov edx, 1000
mov rax, rbx
# Release the 16000 bytes reserved above and restore rbx.
add rsp, 16000
pop rbx
ret
# Allocation-failure path: handle_alloc_error is called with 8 and 8016;
# the trailing ud2 indicates the call is not expected to return.
.LBB0_3:
mov edi, 8
mov esi, 8016
call qword ptr [rip + alloc::alloc::handle_alloc_error@GOTPCREL]
ud2
Trying to narrow the problem down further, I eliminated the stack allocations entirely:
#![feature(get_mut_unchecked, new_uninit)]
use std::sync::Arc;
/// Reproduction case without any stack buffer: allocates an uninitialized
/// 1000-element `Arc` slice, writes `x` into every element in place, and
/// returns it as an initialized `Arc<[u64]>`.
pub fn test2(x: u64) -> Arc<[u64]> {
let mut arc = Arc::new_uninit_slice(1000);
// SAFETY: `arc` was created on the previous line and has not been cloned or
// shared, so no other `Arc` or `Weak` can observe the mutable access.
for elem in unsafe { Arc::get_mut_unchecked(&mut arc) } {
elem.write(x);
}
// SAFETY: the loop above wrote a value into every element of the slice.
unsafe { arc.assume_init() }
}
The code duplication persists, but is now limited to the arcinner_layout_for_value_layout call:
# Compiler output for `test2`. No stack buffer or memcpy remains, but
# arcinner_layout_for_value_layout is called twice with identical arguments.
example::test2:
# Save the registers used below; they are restored in reverse order before
# ret. The extra `push rax` is undone by `add rsp, 8` (presumably alignment).
push r15
push r14
push r12
push rbx
push rax
# Keep the argument x in rbx across the calls.
mov rbx, rdi
# Load the callee address once; both calls go through r12.
mov r12, qword ptr [rip + alloc::sync::arcinner_layout_for_value_layout@GOTPCREL]
# First call with arguments (8, 8000). Its result (rax, rdx) is kept in
# r14/r15 and is only read on the allocation-failure path at .LBB0_6.
mov edi, 8
mov esi, 8000
call r12
mov r14, rax
mov r15, rdx
# Second call with the same arguments (8, 8000). This result feeds the
# allocation below.
mov edi, 8
mov esi, 8000
call r12
# If the returned rdx is zero, skip the allocator call
# (presumably the zero-size case).
test rdx, rdx
je .LBB0_2
mov rdi, rdx
mov rsi, rax
call qword ptr [rip + __rust_alloc@GOTPCREL]
.LBB0_2:
# A null pointer branches to the allocation-failure path.
test rax, rax
je .LBB0_6
# Store 1 into the first two 8-byte words of the allocation
# (presumably the Arc strong and weak counts).
mov qword ptr [rax], 1
mov qword ptr [rax + 8], 1
# Broadcast x into both 64-bit lanes of xmm0.
movq xmm0, rbx
pshufd xmm0, xmm0, 68
# Fill loop: rcx runs 20, 40, ..., 1000 (50 iterations); each iteration does
# ten 16-byte stores, so 8000 bytes are written directly into the heap
# allocation. At rcx = 20 the lowest address is rax + 8*20 - 144 = rax + 16,
# just past the two words written above.
mov ecx, 20
.LBB0_4:
movdqu xmmword ptr [rax + 8*rcx - 144], xmm0
movdqu xmmword ptr [rax + 8*rcx - 128], xmm0
movdqu xmmword ptr [rax + 8*rcx - 112], xmm0
movdqu xmmword ptr [rax + 8*rcx - 96], xmm0
movdqu xmmword ptr [rax + 8*rcx - 80], xmm0
movdqu xmmword ptr [rax + 8*rcx - 64], xmm0
movdqu xmmword ptr [rax + 8*rcx - 48], xmm0
movdqu xmmword ptr [rax + 8*rcx - 32], xmm0
movdqu xmmword ptr [rax + 8*rcx - 16], xmm0
movdqu xmmword ptr [rax + 8*rcx], xmm0
add rcx, 20
cmp rcx, 1020
jne .LBB0_4
# rax still holds the allocation pointer; rdx is set to 1000
# (presumably the slice length of the returned fat pointer).
mov edx, 1000
add rsp, 8
pop rbx
pop r12
pop r14
pop r15
ret
# Allocation-failure path: handle_alloc_error receives the result of the
# FIRST layout call (r14, r15); ud2 indicates the call should not return.
.LBB0_6:
mov rdi, r14
mov rsi, r15
call qword ptr [rip + alloc::alloc::handle_alloc_error@GOTPCREL]
ud2
Metadata
Metadata
Assignees
Labels
Area: Code generation parts specific to LLVM. Both correctness bugs and optimization-related issues.
Area: Code generation
Category: This is a bug.
Issue: Problems and improvements with respect to binary size of generated code.
Issue: Problems and improvements with respect to performance of generated code.