Skip to content

Inlining leaves extra assembly #141144

Open
Open
@RoDmitry

Description

@RoDmitry

I have found a weird issue with inlining. I have expected these two functions to have the same assembly, but the second one has extra code:

search_by_char_opt:
        xor     eax, eax
        cmp     edx, dword ptr [rdi]
        sete    al
        xor     edx, edx
        ret

search:
        mov     esi, dword ptr [rdi]
        cmp     esi, edx
        seta    al
        sbb     al, 0
        movsx   rcx, al
        xor     eax, eax
        cmp     edx, esi
        sete    al
        mov     rdx, rcx
        ret

Code to reproduce: godbolt.org (nightly, -C opt-level=3)

Code

use core::cmp::Ordering::{self, Equal, Greater, Less};

#[inline(always)]
fn compare(v: char, ch: char) -> Ordering {
    if ch < v {
        Greater
    } else if ch > v {
        Less
    } else {
        Equal
    }
}

#[unsafe(no_mangle)]
#[inline(never)]
pub fn search_by_char_opt(chars: &[char], ch: char) -> Option<usize> {
    let base = 0usize;
    let res = *unsafe { chars.get_unchecked(base) };
    let cmp = compare(res, ch);

    if cmp == Equal {
        Some(base)
    } else {
        None
    }
}

#[inline(always)]
fn search_by_char(chars: &[char],  ch: char) -> Result<usize, usize> {
    let base = 0usize;
    let res = *unsafe { chars.get_unchecked(base) };
    let cmp = compare(res, ch);

    // bad
    if cmp == Equal {
        Ok(base)
    } else {
        let result = base + cmp as usize; // also `(cmp == Less)`
        Err(result)
    }

    // good
    // match cmp {
    //     Equal => Ok(base),
    //     _ => {
    //         let result = base + cmp as usize;
    //         Err(result)
    //     }
    // }

    // bad
    // match cmp {
    //     Equal => Ok(base),
    //     _ => {
    //         let result = base + (cmp == Less) as usize;
    //         Err(result)
    //     }
    // }

    // good
    // match cmp {
    //     Equal => Ok(base),
    //     v => {
    //         let result = base + (v == Less) as usize;
    //         Err(result)
    //     }
    // }
}

#[unsafe(no_mangle)]
#[inline(never)]
pub fn search(chars: &[char], ch: char) -> Option<usize> {
    match search_by_char(chars, ch) {
        Ok(v) => Some(v),
        _ => None,
    }
}

Considering different examples (if or match), especially (cmp == Less), my guess is that the second usage of the cmp variable duplicates the assembly of fn compare.

Update

Example with generics (closure), not affected by -C debuginfo=0 or -C opt-level=1: godbolt.org

Code

use core::cmp::Ordering::{self, Equal, Greater, Less};

#[inline(always)]
fn compare(v: char, ch: char) -> Ordering {
    if ch < v {
        Greater
    } else if ch > v {
        Less
    } else {
        Equal
    }
}

#[inline(always)]
pub fn search_by_opt<F>(chars: &[char], f: F) -> Option<usize>
    where
        F: Fn(char) -> Ordering
{
    let base = 0usize;
    let res = *unsafe { chars.get_unchecked(base) };
    let cmp = f(res);

    if cmp == Equal {
        Some(base)
    } else {
        None
    }
}

#[unsafe(no_mangle)]
#[inline(never)]
pub fn search_opt(chars: &[char], ch: char) -> Option<usize> {
    search_by_opt(chars, |v| compare(v, ch))
}

#[inline(always)]
fn search_by<F>(chars: &[char], f: F) -> Result<usize, usize>
    where
        F: Fn(char) -> Ordering
{
    let base = 0usize;
    let res = *unsafe { chars.get_unchecked(base) };
    let cmp = f(res);

    if cmp == Equal {
        Ok(base)
    } else {
        let result = base + cmp as usize;
        Err(result)
    }
}

#[unsafe(no_mangle)]
#[inline(never)]
pub fn search(chars: &[char], ch: char) -> Option<usize> {
    match search_by(chars, |v| compare(v, ch)) {
    // this helps, but only in this exact example
    // match search_by(chars, #[inline(always)] |v| compare(v, ch)) {
        Ok(v) => Some(v),
        _ => None,
    }
}

Full project with this issue

Here even #[inline(always)] for the closure does not help: https://github.com/RoDmitry/alphabet_detector/tree/141144

File alphabet_detector/src/lang/ucd/script.rs starting from the line 453.

Two identical functions, only difference is a generic Fn:

cargo asm alphabet_detector::lang::ucd::script::UcdScript::find

Asm

alphabet_detector::lang::ucd::script::UcdScript::find:
 cmp     edi, 43739
 jae     .LBB2_2
 xor     eax, eax
 jmp     .LBB2_3
.LBB2_2:
 mov     eax, 403
 mov     edx, 403
 cmp     edi, 43744
 jb      .LBB2_23
.LBB2_3:
 lea     rdx, [rax, +, 201]
 lea     rsi, [rdx, +, 2*rdx]
 lea     rcx, [rip, +, .Lanon.6381ac5fc63e354c7ed6bf76902decf4.389]
 cmp     edi, dword, ptr, [rcx, +, 4*rsi]
 jb      .LBB2_5
 lea     rsi, [rcx, +, 4*rsi]
 mov     rax, rdx
 cmp     edi, dword, ptr, [rsi, +, 4]
 jbe     .LBB2_23
.LBB2_5:
 lea     rdx, [rax, +, 101]
 lea     rsi, [rdx, +, 2*rdx]
 cmp     edi, dword, ptr, [rcx, +, 4*rsi]
 jb      .LBB2_7
 lea     rsi, [rcx, +, 4*rsi]
 mov     rax, rdx
 cmp     edi, dword, ptr, [rsi, +, 4]
 jbe     .LBB2_23
.LBB2_7:
 lea     rdx, [rax, +, 50]
 lea     rsi, [rdx, +, 2*rdx]
 cmp     edi, dword, ptr, [rcx, +, 4*rsi]
 jb      .LBB2_9
 lea     rsi, [rcx, +, 4*rsi]
 mov     rax, rdx
 cmp     edi, dword, ptr, [rsi, +, 4]
 jbe     .LBB2_23
.LBB2_9:
 lea     rdx, [rax, +, 25]
 lea     rsi, [rdx, +, 2*rdx]
 cmp     edi, dword, ptr, [rcx, +, 4*rsi]
 jb      .LBB2_11
 lea     rsi, [rcx, +, 4*rsi]
 mov     rax, rdx
 cmp     edi, dword, ptr, [rsi, +, 4]
 jbe     .LBB2_23
.LBB2_11:
 lea     rdx, [rax, +, 13]
 lea     rsi, [rdx, +, 2*rdx]
 cmp     edi, dword, ptr, [rcx, +, 4*rsi]
 jb      .LBB2_13
 lea     rsi, [rcx, +, 4*rsi]
 mov     rax, rdx
 cmp     edi, dword, ptr, [rsi, +, 4]
 jbe     .LBB2_23
.LBB2_13:
 lea     rdx, [rax, +, 6]
 lea     rsi, [rdx, +, 2*rdx]
 cmp     edi, dword, ptr, [rcx, +, 4*rsi]
 jb      .LBB2_15
 lea     rsi, [rcx, +, 4*rsi]
 mov     rax, rdx
 cmp     edi, dword, ptr, [rsi, +, 4]
 jbe     .LBB2_23
.LBB2_15:
 lea     rdx, [rax, +, 3]
 lea     rsi, [rdx, +, 2*rdx]
 cmp     edi, dword, ptr, [rcx, +, 4*rsi]
 jb      .LBB2_17
 lea     rsi, [rcx, +, 4*rsi]
 mov     rax, rdx
 cmp     edi, dword, ptr, [rsi, +, 4]
 jbe     .LBB2_23
.LBB2_17:
 lea     rdx, [rax, +, 2]
 lea     rsi, [rdx, +, 2*rdx]
 cmp     edi, dword, ptr, [rcx, +, 4*rsi]
 jb      .LBB2_19
 lea     rsi, [rcx, +, 4*rsi]
 mov     rax, rdx
 cmp     edi, dword, ptr, [rsi, +, 4]
 jbe     .LBB2_23
.LBB2_19:
 lea     rsi, [rax, +, 1]
 lea     r8, [rsi, +, 2*rsi]
 mov     rdx, rax
 cmp     edi, dword, ptr, [rcx, +, 4*r8]
 jb      .LBB2_21
 lea     rax, [rcx, +, 4*r8]
 mov     rdx, rsi
 cmp     edi, dword, ptr, [rax, +, 4]
 jbe     .LBB2_23
.LBB2_21:
 lea     rsi, [rdx, +, 2*rdx]
 mov     al, 24
 cmp     edi, dword, ptr, [rcx, +, 4*rsi]
 jb      .LBB2_24
 lea     rcx, [rcx, +, 4*rsi]
 cmp     edi, dword, ptr, [rcx, +, 4]
 ja      .LBB2_24
.LBB2_23:
 lea     rax, [rdx, +, 2*rdx]
 lea     rcx, [rip, +, .Lanon.6381ac5fc63e354c7ed6bf76902decf4.389]
 movzx   eax, byte, ptr, [rcx, +, 4*rax, +, 8]
.LBB2_24:
 ret

cargo asm alphabet_detector::lang::ucd::script::UcdScript::find_broken

Asm

alphabet_detector::lang::ucd::script::UcdScript::find_broken:
 xor     eax, eax
 cmp     edi, 43744
 adc     al, -1
 cmp     edi, 43739
 movzx   eax, al
 mov     ecx, 1
 cmovae  ecx, eax
 mov     eax, 403
 test    cl, cl
 je      .LBB3_34
 movzx   ecx, cl
 cmp     ecx, 1
 jne     .LBB3_3
 xor     eax, eax
.LBB3_3:
 lea     rdx, [rax, +, 201]
 lea     rsi, [rdx, +, 2*rdx]
 lea     rcx, [rip, +, .Lanon.6381ac5fc63e354c7ed6bf76902decf4.389]
 xor     r8d, r8d
 cmp     dword, ptr, [rcx, +, 4*rsi, +, 4], edi
 sbb     r8d, r8d
 cmp     edi, dword, ptr, [rcx, +, 4*rsi]
 mov     esi, 1
 cmovae  esi, r8d
 test    sil, sil
 je      .LBB3_33
 movzx   esi, sil
 cmp     esi, 255
 je      .LBB3_6
 mov     rdx, rax
.LBB3_6:
 lea     rax, [rdx, +, 101]
 lea     rsi, [rax, +, 2*rax]
 xor     r8d, r8d
 cmp     dword, ptr, [rcx, +, 4*rsi, +, 4], edi
 sbb     r8d, r8d
 cmp     edi, dword, ptr, [rcx, +, 4*rsi]
 mov     esi, 1
 cmovae  esi, r8d
 test    sil, sil
 je      .LBB3_34
 movzx   esi, sil
 cmp     esi, 255
 je      .LBB3_9
 mov     rax, rdx
.LBB3_9:
 lea     rdx, [rax, +, 50]
 lea     rsi, [rdx, +, 2*rdx]
 xor     r8d, r8d
 cmp     dword, ptr, [rcx, +, 4*rsi, +, 4], edi
 sbb     r8d, r8d
 cmp     edi, dword, ptr, [rcx, +, 4*rsi]
 mov     esi, 1
 cmovae  esi, r8d
 test    sil, sil
 je      .LBB3_33
 movzx   esi, sil
 cmp     esi, 255
 je      .LBB3_12
 mov     rdx, rax
.LBB3_12:
 lea     rax, [rdx, +, 25]
 lea     rsi, [rax, +, 2*rax]
 xor     r8d, r8d
 cmp     dword, ptr, [rcx, +, 4*rsi, +, 4], edi
 sbb     r8d, r8d
 cmp     edi, dword, ptr, [rcx, +, 4*rsi]
 mov     esi, 1
 cmovae  esi, r8d
 test    sil, sil
 je      .LBB3_34
 movzx   esi, sil
 cmp     esi, 255
 je      .LBB3_15
 mov     rax, rdx
.LBB3_15:
 lea     rdx, [rax, +, 13]
 lea     rsi, [rdx, +, 2*rdx]
 xor     r8d, r8d
 cmp     dword, ptr, [rcx, +, 4*rsi, +, 4], edi
 sbb     r8d, r8d
 cmp     edi, dword, ptr, [rcx, +, 4*rsi]
 mov     esi, 1
 cmovae  esi, r8d
 test    sil, sil
 je      .LBB3_33
 movzx   esi, sil
 cmp     esi, 255
 je      .LBB3_18
 mov     rdx, rax
.LBB3_18:
 lea     rax, [rdx, +, 6]
 lea     rsi, [rax, +, 2*rax]
 xor     r8d, r8d
 cmp     dword, ptr, [rcx, +, 4*rsi, +, 4], edi
 sbb     r8d, r8d
 cmp     edi, dword, ptr, [rcx, +, 4*rsi]
 mov     esi, 1
 cmovae  esi, r8d
 test    sil, sil
 je      .LBB3_34
 movzx   esi, sil
 cmp     esi, 255
 je      .LBB3_21
 mov     rax, rdx
.LBB3_21:
 lea     rdx, [rax, +, 3]
 lea     rsi, [rdx, +, 2*rdx]
 xor     r8d, r8d
 cmp     dword, ptr, [rcx, +, 4*rsi, +, 4], edi
 sbb     r8d, r8d
 cmp     edi, dword, ptr, [rcx, +, 4*rsi]
 mov     esi, 1
 cmovae  esi, r8d
 test    sil, sil
 je      .LBB3_33
 movzx   esi, sil
 cmp     esi, 255
 je      .LBB3_24
 mov     rdx, rax
.LBB3_24:
 lea     rax, [rdx, +, 2]
 lea     rsi, [rax, +, 2*rax]
 xor     r8d, r8d
 cmp     dword, ptr, [rcx, +, 4*rsi, +, 4], edi
 sbb     r8d, r8d
 cmp     edi, dword, ptr, [rcx, +, 4*rsi]
 mov     esi, 1
 cmovae  esi, r8d
 test    sil, sil
 je      .LBB3_34
 movzx   esi, sil
 cmp     esi, 255
 je      .LBB3_27
 mov     rax, rdx
.LBB3_27:
 lea     rdx, [rax, +, 1]
 lea     rsi, [rdx, +, 2*rdx]
 mov     r8d, dword, ptr, [rcx, +, 4*rsi]
 mov     esi, dword, ptr, [rcx, +, 4*rsi, +, 4]
 xor     r10d, r10d
 cmp     esi, edi
 sbb     r10d, r10d
 cmp     edi, r8d
 mov     r9d, 1
 cmovae  r9d, r10d
 test    r9b, r9b
 je      .LBB3_33
 movzx   r9d, r9b
 cmp     r9d, 255
 je      .LBB3_30
 lea     rdx, [rax, +, 2*rax]
 mov     r8d, dword, ptr, [rcx, +, 4*rdx]
 mov     esi, dword, ptr, [rcx, +, 4*rdx, +, 4]
 mov     rdx, rax
.LBB3_30:
 cmp     edi, r8d
 setae   cl
 cmp     edi, esi
 seta    r9b
 mov     al, 24
 cmp     edi, r8d
 jb      .LBB3_35
 cmp     edi, esi
 ja      .LBB3_35
 and     cl, r9b
 movzx   eax, cl
 add     rdx, rax
.LBB3_33:
 mov     rax, rdx
.LBB3_34:
 lea     rax, [rax, +, 2*rax]
 lea     rcx, [rip, +, .Lanon.6381ac5fc63e354c7ed6bf76902decf4.389]
 movzx   eax, byte, ptr, [rcx, +, 4*rax, +, 8]
.LBB3_35:
 ret

But if I replace

base = match cmp {
        Less => mid,
        Equal => {
            unsafe { hint::assert_unchecked(mid < self.len()) };
            return Ok(mid);
        }
        Greater => base,
    };

To base = hint::select_unpredictable(cmp == Greater, base, mid);, their assembly start to look closer to identical.

rustc --version --verbose:

rustc 1.89.0-nightly (d97326eab 2025-05-15)
binary: rustc
commit-hash: d97326eabfc3b2c33abcb08d6bc117aefa697cb7
commit-date: 2025-05-15
host: x86_64-unknown-linux-gnu
release: 1.89.0-nightly
LLVM version: 20.1.4
RUSTFLAGS: "-C target-cpu=native" (Intel 9700K)

Metadata

Metadata

Assignees

No one assigned

    Labels

    A-LLVMArea: Code generation parts specific to LLVM. Both correctness bugs and optimization-related issues.A-codegenArea: Code generationC-optimizationCategory: An issue highlighting optimization opportunities or PRs implementing suchI-slowIssue: Problems and improvements with respect to performance of generated code.T-compilerRelevant to the compiler team, which will review and decide on the PR/issue.

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions