Skip to content

MaybeUninit::assume_init optimizes poorly #74267

Open
@dtolnay

Description

@dtolnay

In #74254 we observed that returning expr.assume_init() from a function unexpectedly prevents the return value from being constructed in place up front.

https://rust.godbolt.org/z/hr77qM

// `b` below calls `mem::uninitialized`, which is deprecated; silence the lint
// so the comparison compiles without warnings.
#![allow(deprecated)]

use std::mem::{self, MaybeUninit};
use std::ptr;

// Element type of the arrays returned by `a`, `b` and `c`.
type T = String;
// Length of the arrays returned by `a`, `b` and `c`.
const N: usize = 2;

// fast
//
// Baseline: builds the array purely through the `Default` implementation of
// `[T; N]`, with no `unsafe` code involved.
pub fn a() -> [T; N] {
    <[T; N] as Default>::default()
}

// fast
//
// Fills the array element by element, starting from storage obtained with
// the deprecated `mem::uninitialized`. This is the comparison point for `c`,
// which performs the same steps on top of `MaybeUninit`.
pub fn b() -> [T; N] {
    unsafe {
        // ignore the UB for now
        // `array` already has the final type `[T; N]`; there is no wrapper
        // to unwrap before returning.
        let mut array: [T; N] = mem::uninitialized();
        for slot in &mut array {
            // `ptr::write` stores the new value without reading or dropping
            // whatever was in the slot before.
            ptr::write(slot, T::default());
        }
        // Returned as-is, no `assume_init` call.
        array
    }
}

// slow
//
// Same element-by-element fill as `b`, but the storage is a
// `MaybeUninit<[T; N]>` and the result is produced by `assume_init()`.
pub fn c() -> [T; N] {
    // Uninitialized storage for the whole array, wrapped in `MaybeUninit`.
    let mut array: MaybeUninit<[T; N]> = MaybeUninit::uninit();
    unsafe {
        // ignore the UB for now
        // ordinarily would cast to &mut [MaybeUninit<T>; N]
        // but here we try to minimize difference from `b`
        let slots = &mut *array.as_mut_ptr();
        for slot in slots {
            // `ptr::write` stores the new value without reading or dropping
            // whatever was in the slot before.
            ptr::write(slot, T::default());
        }
        // Returning the result of `assume_init()` is the pattern this
        // report is about.
        array.assume_init()
    }
}

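Notice that in the slow function the return value is constructed exactly the same way as in both of the fast functions (6 instructions), except in the wrong place: it is built in a stack temporary 48 bytes below the incoming stack pointer (addressed as [rsp] after `sub rsp, 48`) and then relocated to [rdi] 😢 (12 instructions).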

# Output for `a`: every store goes directly through rdi; no stack space is used.
example::a:
        # rax receives a copy of rdi.
        mov     rax, rdi
        # qword 1 at +0, then 16 zero bytes at +8 (xmm0 is zeroed by the vxorps).
        mov     qword ptr [rdi], 1
        vxorps  xmm0, xmm0, xmm0
        vmovups xmmword ptr [rdi + 8], xmm0
        # Same pattern again at +24 / +32 (presumably the second array element).
        mov     qword ptr [rdi + 24], 1
        vmovups xmmword ptr [rdi + 32], xmm0
        ret

# Output for `b`: instruction-for-instruction the same as `example::a`;
# every store goes directly through rdi and no stack space is used.
example::b:
        # rax receives a copy of rdi.
        mov     rax, rdi
        # qword 1 at +0, then 16 zero bytes at +8 (xmm0 is zeroed by the vxorps).
        mov     qword ptr [rdi], 1
        vxorps  xmm0, xmm0, xmm0
        vmovups xmmword ptr [rdi + 8], xmm0
        # Same pattern again at +24 / +32 (presumably the second array element).
        mov     qword ptr [rdi + 24], 1
        vmovups xmmword ptr [rdi + 32], xmm0
        ret

# Output for `c`: the same values are first stored into a 48-byte stack area
# and only afterwards copied through rdi.
example::c:
        # Reserve 48 bytes of stack for the temporary.
        sub     rsp, 48
        # rax receives a copy of rdi.
        mov     rax, rdi
        # Same store pattern as in `example::a` / `example::b`, but addressed
        # through rsp instead of rdi.
        mov     qword ptr [rsp], 1
        vxorps  xmm0, xmm0, xmm0
        vmovups xmmword ptr [rsp + 8], xmm0
        mov     qword ptr [rsp + 24], 1
        vmovups xmmword ptr [rsp + 32], xmm0
        # Copy the temporary from the stack to [rdi], piece by piece.
        mov     rcx, qword ptr [rsp]
        mov     qword ptr [rdi], rcx
        vmovups xmm0, xmmword ptr [rsp + 8]
        vmovups xmmword ptr [rdi + 8], xmm0
        mov     rcx, qword ptr [rsp + 24]
        mov     qword ptr [rdi + 24], rcx
        # NOTE(review): the next two qword copies (+16 and +24) cover bytes the
        # instructions above have already copied -- confirm against the
        # compiler output linked in the report.
        mov     rcx, qword ptr [rsp + 16]
        mov     qword ptr [rdi + 16], rcx
        mov     rcx, qword ptr [rsp + 24]
        mov     qword ptr [rdi + 24], rcx
        vmovups xmm0, xmmword ptr [rsp + 32]
        vmovups xmmword ptr [rdi + 32], xmm0
        # Release the stack temporary.
        add     rsp, 48
        ret

Metadata

Metadata

Assignees

No one assigned

    Labels

    A-codegen — Area: Code generation
    C-optimization — Category: An issue highlighting optimization opportunities or PRs implementing such
    E-needs-test — Call for participation: An issue has been fixed and does not reproduce, but no test has been added.
    I-slow — Issue: Problems and improvements with respect to performance of generated code.
    T-compiler — Relevant to the compiler team, which will review and decide on the PR/issue.

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions