Skip to content

Bad performing code from #[deriving(Eq)] for enums #13623

Closed
@dotdash

Description

@dotdash

#[deriving(Eq)] generates nested matches for enums, like this:

#[deriving(Eq)]
pub enum Foo { A1, A2, A3, }
#[automatically_derived]
impl ::std::cmp::Eq for Foo {
    #[inline]
    fn eq(&self, __arg_0: &Foo) -> ::bool {
        match *self {
            A1 => match *__arg_0 { A1 => true, _ => false },
            A2 => match *__arg_0 { A2 => true, _ => false },
            A3 => match *__arg_0 { A3 => true, _ => false }
        }
    }
    #[inline]
    fn ne(&self, __arg_0: &Foo) -> ::bool {
        match *self {
            A1 => match *__arg_0 { A1 => false, _ => true },
            A2 => match *__arg_0 { A2 => false, _ => true },
            A3 => match *__arg_0 { A3 => false, _ => true }
        }
    }
}

Although we emit range constraints for the loads, this doesn't get optimized to
a simple comparison but this:

define i8 @_ZN18Foo...std..cmp..Eq2eq20h9a2945fb10f6320blaa4v0.0E(i8* nocapture readonly, i8* nocapture readonly) unnamed_addr #0 {
entry-block:
  %2 = load i8* %0, align 1, !range !0
  switch i8 %2, label %match_else [
    i8 0, label %match_case
    i8 1, label %match_case3
  ]

match_else:                                       ; preds = %entry-block
  %3 = load i8* %1, align 1, !range !0
  %cond20 = icmp eq i8 %3, 2
  %. = zext i1 %cond20 to i8
  br label %join18

match_case:                                       ; preds = %entry-block
  %4 = load i8* %1, align 1, !range !0
  %cond19 = icmp eq i8 %4, 0
  %.23 = zext i1 %cond19 to i8
  br label %join18

match_case3:                                      ; preds = %entry-block
  %5 = load i8* %1, align 1, !range !0
  %cond = icmp eq i8 %5, 1
  %.24 = zext i1 %cond to i8
  br label %join18

join18:                                           ; preds = %match_case3, %match_case, %match_else
  %__make_return_pointer.0 = phi i8 [ %., %match_else ], [ %.23, %match_case ], [ %.24, %match_case3 ]
  ret i8 %__make_return_pointer.0
}

Which gets a pretty straight-forward translation to assembly:

    .globl  _ZN18Foo...std..cmp..Eq2eq20h9a2945fb10f6320blaa4v0.0E
    .align  16, 0x90
    .type   _ZN18Foo...std..cmp..Eq2eq20h9a2945fb10f6320blaa4v0.0E,@function
_ZN18Foo...std..cmp..Eq2eq20h9a2945fb10f6320blaa4v0.0E:
    .cfi_startproc
    cmpq    %fs:112, %rsp
    ja  .LBB0_2
    movabsq $0, %r10
    movabsq $0, %r11
    callq   __morestack
    retq
.LBB0_2:
    movb    (%rdi), %al
    testb   %al, %al
    jne .LBB0_3
    cmpb    $0, (%rsi)
    sete    %al
    retq
.LBB0_3:
    movzbl  %al, %eax
    cmpl    $1, %eax
    jne .LBB0_4
    movzbl  (%rsi), %eax
    cmpl    $1, %eax
    sete    %al
    retq
.LBB0_4:
    movzbl  (%rsi), %eax
    cmpl    $2, %eax
    sete    %al
    retq
.Ltmp0:
    .size   _ZN18Foo...std..cmp..Eq2eq20h9a2945fb10f6320blaa4v0.0E, .Ltmp0-_ZN18Foo...std..cmp..Eq2eq20h9a2945fb10f6320blaa4v0.0E
    .cfi_endproc

For C-style enums, we could maybe special case the deriving code generate something like this instead:

fn eq(&self, rhs: &Foo) -> bool {
    *self as u64 == *rhs as u64
}

But since the discriminant isn't exposed in the language (and is only virtual in case of e.g. Option<&int>), I don't see how to solve this for ADTs in general.

Exposing the discriminant through an intrinsic and doing an early equality check helps LLVM to generate better code, but it's still not optimal.

pub fn test1(a: &Foo, b: &Foo) -> bool {
    let d1 = unsafe { std::intrinsics::get_disr(a) };
    let d2 = unsafe { std::intrinsics::get_disr(b) };

    if d1 != d2 {
        return false;
    }

    match a {
        &A1 => match b { &A1 => true, _ => false },
        &A2 => match b { &A2 => true, _ => false },
        &A3 => match b { &A3 => true, _ => false },
    }
}

Gives:

define zeroext i1 @_ZN5test120h51356ea9d1622fa9haa4v0.0E(i8* nocapture readonly, i8* nocapture readonly) unnamed_addr #0 {
entry-block:
  %2 = load i8* %0, align 1, !range !0
  %3 = load i8* %1, align 1, !range !0
  %4 = icmp eq i8 %2, %3
  br i1 %4, label %next-block, label %return

next-block:                                       ; preds = %entry-block
  %5 = icmp ne i8 %2, 3
  ret i1 %5

return:                                           ; preds = %entry-block
  ret i1 false
}

And:

    .globl  _ZN5test120h51356ea9d1622fa9haa4v0.0E
    .align  16, 0x90
    .type   _ZN5test120h51356ea9d1622fa9haa4v0.0E,@function
_ZN5test120h51356ea9d1622fa9haa4v0.0E:
    .cfi_startproc
    cmpq    %fs:112, %rsp
    ja  .LBB0_2
    movabsq $0, %r10
    movabsq $0, %r11
    callq   __morestack
    retq
.LBB0_2:
    movzbl  (%rdi), %eax
    movzbl  (%rsi), %ecx
    cmpl    %ecx, %eax
    jne .LBB0_4
    movzbl  %al, %eax
    cmpl    $3, %eax
    setne   %al
    retq
.LBB0_4:
    xorl    %eax, %eax
    retq
.Ltmp0:
    .size   _ZN5test120h51356ea9d1622fa9haa4v0.0E, .Ltmp0-_ZN5test120h51356ea9d1622fa9haa4v0.0E
    .cfi_endproc

But I don't think exposing the discriminant is such a great idea, and having to do the extra check is weird and not quite natural. So I wonder if there's a better way to handle this.


Fun fact: Adding another variant to the enum (so we have pub enum Foo { A1, A2, A3, A4 }) makes LLVM generate better code for some reason:

define zeroext i1 @_ZN5test120ha73e87eb2055d3a4iaa4v0.0E(i8* nocapture readonly, i8* nocapture readonly) unnamed_addr #0 {
entry-block:
  %2 = load i8* %0, align 1, !range !0
  %3 = load i8* %1, align 1, !range !0
  %4 = icmp eq i8 %2, %3
  br i1 %4, label %select.end, label %select.mid

select.mid:                                       ; preds = %entry-block
  br label %select.end

select.end:                                       ; preds = %entry-block, %select.mid
  %. = phi i1 [ true, %entry-block ], [ false, %select.mid ]
  ret i1 %.
}

And adding another round of opt -O2 (LLVM seems to stop optimizing too early here) gives:

define zeroext i1 @_ZN5test120ha73e87eb2055d3a4iaa4v0.0E(i8* nocapture readonly, i8* nocapture readonly) unnamed_addr #0 {
entry-block:
  %2 = load i8* %0, align 1, !range !0
  %3 = load i8* %1, align 1, !range !0
  %4 = icmp eq i8 %2, %3
  ret i1 %4
}

EDIT: Forgot to mention that the code with the intrinsic is based upon my branch that uses i1 for bools. Just to avoid any confusion...

Metadata

Metadata

Assignees

No one assigned

    Labels

    A-codegenArea: Code generationI-compiletimeIssue: Problems and improvements with respect to compile times.I-slowIssue: Problems and improvements with respect to performance of generated code.

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions