Description
I have found a weird issue with inlining. I have expected these two functions to have the same assembly, but the second one has extra code:
search_by_char_opt:
xor eax, eax
cmp edx, dword ptr [rdi]
sete al
xor edx, edx
ret
search:
mov esi, dword ptr [rdi]
cmp esi, edx
seta al
sbb al, 0
movsx rcx, al
xor eax, eax
cmp edx, esi
sete al
mov rdx, rcx
ret
Code to reproduce: godbolt.org (nightly, -C opt-level=3
)
Code
use core::cmp::Ordering::{self, Equal, Greater, Less};
#[inline(always)]
fn compare(v: char, ch: char) -> Ordering {
if ch < v {
Greater
} else if ch > v {
Less
} else {
Equal
}
}
#[unsafe(no_mangle)]
#[inline(never)]
pub fn search_by_char_opt(chars: &[char], ch: char) -> Option<usize> {
let base = 0usize;
let res = *unsafe { chars.get_unchecked(base) };
let cmp = compare(res, ch);
if cmp == Equal {
Some(base)
} else {
None
}
}
#[inline(always)]
fn search_by_char(chars: &[char], ch: char) -> Result<usize, usize> {
let base = 0usize;
let res = *unsafe { chars.get_unchecked(base) };
let cmp = compare(res, ch);
// bad
if cmp == Equal {
Ok(base)
} else {
let result = base + cmp as usize; // also `(cmp == Less)`
Err(result)
}
// good
// match cmp {
// Equal => Ok(base),
// _ => {
// let result = base + cmp as usize;
// Err(result)
// }
// }
// bad
// match cmp {
// Equal => Ok(base),
// _ => {
// let result = base + (cmp == Less) as usize;
// Err(result)
// }
// }
// good
// match cmp {
// Equal => Ok(base),
// v => {
// let result = base + (v == Less) as usize;
// Err(result)
// }
// }
}
#[unsafe(no_mangle)]
#[inline(never)]
pub fn search(chars: &[char], ch: char) -> Option<usize> {
match search_by_char(chars, ch) {
Ok(v) => Some(v),
_ => None,
}
}
Considering different examples (if
or match
), especially (cmp == Less)
, my guess is that the second usage of the cmp
variable duplicates the assembly of fn compare
.
Update
Example with generics (closure), not affected by -C debuginfo=0
or -C opt-level=1
: godbolt.org
Code
use core::cmp::Ordering::{self, Equal, Greater, Less};
#[inline(always)]
fn compare(v: char, ch: char) -> Ordering {
if ch < v {
Greater
} else if ch > v {
Less
} else {
Equal
}
}
#[inline(always)]
pub fn search_by_opt<F>(chars: &[char], f: F) -> Option<usize>
where
F: Fn(char) -> Ordering
{
let base = 0usize;
let res = *unsafe { chars.get_unchecked(base) };
let cmp = f(res);
if cmp == Equal {
Some(base)
} else {
None
}
}
#[unsafe(no_mangle)]
#[inline(never)]
pub fn search_opt(chars: &[char], ch: char) -> Option<usize> {
search_by_opt(chars, |v| compare(v, ch))
}
#[inline(always)]
fn search_by<F>(chars: &[char], f: F) -> Result<usize, usize>
where
F: Fn(char) -> Ordering
{
let base = 0usize;
let res = *unsafe { chars.get_unchecked(base) };
let cmp = f(res);
if cmp == Equal {
Ok(base)
} else {
let result = base + cmp as usize;
Err(result)
}
}
#[unsafe(no_mangle)]
#[inline(never)]
pub fn search(chars: &[char], ch: char) -> Option<usize> {
match search_by(chars, |v| compare(v, ch)) {
// this helps, but only in this exact example
// match search_by(chars, #[inline(always)] |v| compare(v, ch)) {
Ok(v) => Some(v),
_ => None,
}
}
Full project with this issue
Here even #[inline(always)]
for the closure does not help: https://github.com/RoDmitry/alphabet_detector/tree/141144
File alphabet_detector/src/lang/ucd/script.rs
starting from the line 453.
Two identical functions, only difference is a generic Fn
:
cargo asm alphabet_detector::lang::ucd::script::UcdScript::find
Asm
alphabet_detector::lang::ucd::script::UcdScript::find:
cmp edi, 43739
jae .LBB2_2
xor eax, eax
jmp .LBB2_3
.LBB2_2:
mov eax, 403
mov edx, 403
cmp edi, 43744
jb .LBB2_23
.LBB2_3:
lea rdx, [rax, +, 201]
lea rsi, [rdx, +, 2*rdx]
lea rcx, [rip, +, .Lanon.6381ac5fc63e354c7ed6bf76902decf4.389]
cmp edi, dword, ptr, [rcx, +, 4*rsi]
jb .LBB2_5
lea rsi, [rcx, +, 4*rsi]
mov rax, rdx
cmp edi, dword, ptr, [rsi, +, 4]
jbe .LBB2_23
.LBB2_5:
lea rdx, [rax, +, 101]
lea rsi, [rdx, +, 2*rdx]
cmp edi, dword, ptr, [rcx, +, 4*rsi]
jb .LBB2_7
lea rsi, [rcx, +, 4*rsi]
mov rax, rdx
cmp edi, dword, ptr, [rsi, +, 4]
jbe .LBB2_23
.LBB2_7:
lea rdx, [rax, +, 50]
lea rsi, [rdx, +, 2*rdx]
cmp edi, dword, ptr, [rcx, +, 4*rsi]
jb .LBB2_9
lea rsi, [rcx, +, 4*rsi]
mov rax, rdx
cmp edi, dword, ptr, [rsi, +, 4]
jbe .LBB2_23
.LBB2_9:
lea rdx, [rax, +, 25]
lea rsi, [rdx, +, 2*rdx]
cmp edi, dword, ptr, [rcx, +, 4*rsi]
jb .LBB2_11
lea rsi, [rcx, +, 4*rsi]
mov rax, rdx
cmp edi, dword, ptr, [rsi, +, 4]
jbe .LBB2_23
.LBB2_11:
lea rdx, [rax, +, 13]
lea rsi, [rdx, +, 2*rdx]
cmp edi, dword, ptr, [rcx, +, 4*rsi]
jb .LBB2_13
lea rsi, [rcx, +, 4*rsi]
mov rax, rdx
cmp edi, dword, ptr, [rsi, +, 4]
jbe .LBB2_23
.LBB2_13:
lea rdx, [rax, +, 6]
lea rsi, [rdx, +, 2*rdx]
cmp edi, dword, ptr, [rcx, +, 4*rsi]
jb .LBB2_15
lea rsi, [rcx, +, 4*rsi]
mov rax, rdx
cmp edi, dword, ptr, [rsi, +, 4]
jbe .LBB2_23
.LBB2_15:
lea rdx, [rax, +, 3]
lea rsi, [rdx, +, 2*rdx]
cmp edi, dword, ptr, [rcx, +, 4*rsi]
jb .LBB2_17
lea rsi, [rcx, +, 4*rsi]
mov rax, rdx
cmp edi, dword, ptr, [rsi, +, 4]
jbe .LBB2_23
.LBB2_17:
lea rdx, [rax, +, 2]
lea rsi, [rdx, +, 2*rdx]
cmp edi, dword, ptr, [rcx, +, 4*rsi]
jb .LBB2_19
lea rsi, [rcx, +, 4*rsi]
mov rax, rdx
cmp edi, dword, ptr, [rsi, +, 4]
jbe .LBB2_23
.LBB2_19:
lea rsi, [rax, +, 1]
lea r8, [rsi, +, 2*rsi]
mov rdx, rax
cmp edi, dword, ptr, [rcx, +, 4*r8]
jb .LBB2_21
lea rax, [rcx, +, 4*r8]
mov rdx, rsi
cmp edi, dword, ptr, [rax, +, 4]
jbe .LBB2_23
.LBB2_21:
lea rsi, [rdx, +, 2*rdx]
mov al, 24
cmp edi, dword, ptr, [rcx, +, 4*rsi]
jb .LBB2_24
lea rcx, [rcx, +, 4*rsi]
cmp edi, dword, ptr, [rcx, +, 4]
ja .LBB2_24
.LBB2_23:
lea rax, [rdx, +, 2*rdx]
lea rcx, [rip, +, .Lanon.6381ac5fc63e354c7ed6bf76902decf4.389]
movzx eax, byte, ptr, [rcx, +, 4*rax, +, 8]
.LBB2_24:
ret
cargo asm alphabet_detector::lang::ucd::script::UcdScript::find_broken
Asm
alphabet_detector::lang::ucd::script::UcdScript::find_broken:
xor eax, eax
cmp edi, 43744
adc al, -1
cmp edi, 43739
movzx eax, al
mov ecx, 1
cmovae ecx, eax
mov eax, 403
test cl, cl
je .LBB3_34
movzx ecx, cl
cmp ecx, 1
jne .LBB3_3
xor eax, eax
.LBB3_3:
lea rdx, [rax, +, 201]
lea rsi, [rdx, +, 2*rdx]
lea rcx, [rip, +, .Lanon.6381ac5fc63e354c7ed6bf76902decf4.389]
xor r8d, r8d
cmp dword, ptr, [rcx, +, 4*rsi, +, 4], edi
sbb r8d, r8d
cmp edi, dword, ptr, [rcx, +, 4*rsi]
mov esi, 1
cmovae esi, r8d
test sil, sil
je .LBB3_33
movzx esi, sil
cmp esi, 255
je .LBB3_6
mov rdx, rax
.LBB3_6:
lea rax, [rdx, +, 101]
lea rsi, [rax, +, 2*rax]
xor r8d, r8d
cmp dword, ptr, [rcx, +, 4*rsi, +, 4], edi
sbb r8d, r8d
cmp edi, dword, ptr, [rcx, +, 4*rsi]
mov esi, 1
cmovae esi, r8d
test sil, sil
je .LBB3_34
movzx esi, sil
cmp esi, 255
je .LBB3_9
mov rax, rdx
.LBB3_9:
lea rdx, [rax, +, 50]
lea rsi, [rdx, +, 2*rdx]
xor r8d, r8d
cmp dword, ptr, [rcx, +, 4*rsi, +, 4], edi
sbb r8d, r8d
cmp edi, dword, ptr, [rcx, +, 4*rsi]
mov esi, 1
cmovae esi, r8d
test sil, sil
je .LBB3_33
movzx esi, sil
cmp esi, 255
je .LBB3_12
mov rdx, rax
.LBB3_12:
lea rax, [rdx, +, 25]
lea rsi, [rax, +, 2*rax]
xor r8d, r8d
cmp dword, ptr, [rcx, +, 4*rsi, +, 4], edi
sbb r8d, r8d
cmp edi, dword, ptr, [rcx, +, 4*rsi]
mov esi, 1
cmovae esi, r8d
test sil, sil
je .LBB3_34
movzx esi, sil
cmp esi, 255
je .LBB3_15
mov rax, rdx
.LBB3_15:
lea rdx, [rax, +, 13]
lea rsi, [rdx, +, 2*rdx]
xor r8d, r8d
cmp dword, ptr, [rcx, +, 4*rsi, +, 4], edi
sbb r8d, r8d
cmp edi, dword, ptr, [rcx, +, 4*rsi]
mov esi, 1
cmovae esi, r8d
test sil, sil
je .LBB3_33
movzx esi, sil
cmp esi, 255
je .LBB3_18
mov rdx, rax
.LBB3_18:
lea rax, [rdx, +, 6]
lea rsi, [rax, +, 2*rax]
xor r8d, r8d
cmp dword, ptr, [rcx, +, 4*rsi, +, 4], edi
sbb r8d, r8d
cmp edi, dword, ptr, [rcx, +, 4*rsi]
mov esi, 1
cmovae esi, r8d
test sil, sil
je .LBB3_34
movzx esi, sil
cmp esi, 255
je .LBB3_21
mov rax, rdx
.LBB3_21:
lea rdx, [rax, +, 3]
lea rsi, [rdx, +, 2*rdx]
xor r8d, r8d
cmp dword, ptr, [rcx, +, 4*rsi, +, 4], edi
sbb r8d, r8d
cmp edi, dword, ptr, [rcx, +, 4*rsi]
mov esi, 1
cmovae esi, r8d
test sil, sil
je .LBB3_33
movzx esi, sil
cmp esi, 255
je .LBB3_24
mov rdx, rax
.LBB3_24:
lea rax, [rdx, +, 2]
lea rsi, [rax, +, 2*rax]
xor r8d, r8d
cmp dword, ptr, [rcx, +, 4*rsi, +, 4], edi
sbb r8d, r8d
cmp edi, dword, ptr, [rcx, +, 4*rsi]
mov esi, 1
cmovae esi, r8d
test sil, sil
je .LBB3_34
movzx esi, sil
cmp esi, 255
je .LBB3_27
mov rax, rdx
.LBB3_27:
lea rdx, [rax, +, 1]
lea rsi, [rdx, +, 2*rdx]
mov r8d, dword, ptr, [rcx, +, 4*rsi]
mov esi, dword, ptr, [rcx, +, 4*rsi, +, 4]
xor r10d, r10d
cmp esi, edi
sbb r10d, r10d
cmp edi, r8d
mov r9d, 1
cmovae r9d, r10d
test r9b, r9b
je .LBB3_33
movzx r9d, r9b
cmp r9d, 255
je .LBB3_30
lea rdx, [rax, +, 2*rax]
mov r8d, dword, ptr, [rcx, +, 4*rdx]
mov esi, dword, ptr, [rcx, +, 4*rdx, +, 4]
mov rdx, rax
.LBB3_30:
cmp edi, r8d
setae cl
cmp edi, esi
seta r9b
mov al, 24
cmp edi, r8d
jb .LBB3_35
cmp edi, esi
ja .LBB3_35
and cl, r9b
movzx eax, cl
add rdx, rax
.LBB3_33:
mov rax, rdx
.LBB3_34:
lea rax, [rax, +, 2*rax]
lea rcx, [rip, +, .Lanon.6381ac5fc63e354c7ed6bf76902decf4.389]
movzx eax, byte, ptr, [rcx, +, 4*rax, +, 8]
.LBB3_35:
ret
But if I replace
base = match cmp {
Less => mid,
Equal => {
unsafe { hint::assert_unchecked(mid < self.len()) };
return Ok(mid);
}
Greater => base,
};
To base = hint::select_unpredictable(cmp == Greater, base, mid);
, their assembly start to look closer to identical.
rustc --version --verbose
:
rustc 1.89.0-nightly (d97326eab 2025-05-15)
binary: rustc
commit-hash: d97326eabfc3b2c33abcb08d6bc117aefa697cb7
commit-date: 2025-05-15
host: x86_64-unknown-linux-gnu
release: 1.89.0-nightly
LLVM version: 20.1.4
RUSTFLAGS: "-C target-cpu=native" (Intel 9700K)