Bad performing code from `#[deriving(Eq)]` for enums

`#[deriving(Eq)]` generates nested matches for enums, like this:

``` rust
#[deriving(Eq)]
pub enum Foo { A1, A2, A3, }
#[automatically_derived]
impl ::std::cmp::Eq for Foo {
    #[inline]
    fn eq(&self, __arg_0: &Foo) -> ::bool {
        match *self {
            A1 => match *__arg_0 { A1 => true, _ => false },
            A2 => match *__arg_0 { A2 => true, _ => false },
            A3 => match *__arg_0 { A3 => true, _ => false }
        }
    }
    #[inline]
    fn ne(&self, __arg_0: &Foo) -> ::bool {
        match *self {
            A1 => match *__arg_0 { A1 => false, _ => true },
            A2 => match *__arg_0 { A2 => false, _ => true },
            A3 => match *__arg_0 { A3 => false, _ => true }
        }
    }
}
```

Although we emit range constraints for the loads, this doesn't get optimized to
a simple comparison but this:

``` llvm
define i8 @_ZN18Foo...std..cmp..Eq2eq20h9a2945fb10f6320blaa4v0.0E(i8* nocapture readonly, i8* nocapture readonly) unnamed_addr #0 {
entry-block:
  %2 = load i8* %0, align 1, !range !0
  switch i8 %2, label %match_else [
    i8 0, label %match_case
    i8 1, label %match_case3
  ]

match_else:                                       ; preds = %entry-block
  %3 = load i8* %1, align 1, !range !0
  %cond20 = icmp eq i8 %3, 2
  %. = zext i1 %cond20 to i8
  br label %join18

match_case:                                       ; preds = %entry-block
  %4 = load i8* %1, align 1, !range !0
  %cond19 = icmp eq i8 %4, 0
  %.23 = zext i1 %cond19 to i8
  br label %join18

match_case3:                                      ; preds = %entry-block
  %5 = load i8* %1, align 1, !range !0
  %cond = icmp eq i8 %5, 1
  %.24 = zext i1 %cond to i8
  br label %join18

join18:                                           ; preds = %match_case3, %match_case, %match_else
  %__make_return_pointer.0 = phi i8 [ %., %match_else ], [ %.23, %match_case ], [ %.24, %match_case3 ]
  ret i8 %__make_return_pointer.0
}
```

Which gets a pretty straight-forward translation to assembly:

``` asm
    .globl  _ZN18Foo...std..cmp..Eq2eq20h9a2945fb10f6320blaa4v0.0E
    .align  16, 0x90
    .type   _ZN18Foo...std..cmp..Eq2eq20h9a2945fb10f6320blaa4v0.0E,@function
_ZN18Foo...std..cmp..Eq2eq20h9a2945fb10f6320blaa4v0.0E:
    .cfi_startproc
    cmpq    %fs:112, %rsp
    ja  .LBB0_2
    movabsq $0, %r10
    movabsq $0, %r11
    callq   __morestack
    retq
.LBB0_2:
    movb    (%rdi), %al
    testb   %al, %al
    jne .LBB0_3
    cmpb    $0, (%rsi)
    sete    %al
    retq
.LBB0_3:
    movzbl  %al, %eax
    cmpl    $1, %eax
    jne .LBB0_4
    movzbl  (%rsi), %eax
    cmpl    $1, %eax
    sete    %al
    retq
.LBB0_4:
    movzbl  (%rsi), %eax
    cmpl    $2, %eax
    sete    %al
    retq
.Ltmp0:
    .size   _ZN18Foo...std..cmp..Eq2eq20h9a2945fb10f6320blaa4v0.0E, .Ltmp0-_ZN18Foo...std..cmp..Eq2eq20h9a2945fb10f6320blaa4v0.0E
    .cfi_endproc
```

For C-style enums, we could maybe special case the deriving code generate something like this instead:

``` rust
fn eq(&self, rhs: &Foo) -> bool {
    *self as u64 == *rhs as u64
}
```

But since the discriminant isn't exposed in the language (and is only virtual in case of e.g. Option<&int>), I don't see how to solve this for ADTs in general.

Exposing the discriminant through an intrinsic and doing an early equality check helps LLVM to generate better code, but it's still not optimal.

``` rust
pub fn test1(a: &Foo, b: &Foo) -> bool {
    let d1 = unsafe { std::intrinsics::get_disr(a) };
    let d2 = unsafe { std::intrinsics::get_disr(b) };

    if d1 != d2 {
        return false;
    }

    match a {
        &A1 => match b { &A1 => true, _ => false },
        &A2 => match b { &A2 => true, _ => false },
        &A3 => match b { &A3 => true, _ => false },
    }
}
```

Gives:

``` llvm
define zeroext i1 @_ZN5test120h51356ea9d1622fa9haa4v0.0E(i8* nocapture readonly, i8* nocapture readonly) unnamed_addr #0 {
entry-block:
  %2 = load i8* %0, align 1, !range !0
  %3 = load i8* %1, align 1, !range !0
  %4 = icmp eq i8 %2, %3
  br i1 %4, label %next-block, label %return

next-block:                                       ; preds = %entry-block
  %5 = icmp ne i8 %2, 3
  ret i1 %5

return:                                           ; preds = %entry-block
  ret i1 false
}
```

And:

``` asm
    .globl  _ZN5test120h51356ea9d1622fa9haa4v0.0E
    .align  16, 0x90
    .type   _ZN5test120h51356ea9d1622fa9haa4v0.0E,@function
_ZN5test120h51356ea9d1622fa9haa4v0.0E:
    .cfi_startproc
    cmpq    %fs:112, %rsp
    ja  .LBB0_2
    movabsq $0, %r10
    movabsq $0, %r11
    callq   __morestack
    retq
.LBB0_2:
    movzbl  (%rdi), %eax
    movzbl  (%rsi), %ecx
    cmpl    %ecx, %eax
    jne .LBB0_4
    movzbl  %al, %eax
    cmpl    $3, %eax
    setne   %al
    retq
.LBB0_4:
    xorl    %eax, %eax
    retq
.Ltmp0:
    .size   _ZN5test120h51356ea9d1622fa9haa4v0.0E, .Ltmp0-_ZN5test120h51356ea9d1622fa9haa4v0.0E
    .cfi_endproc
```

But I don't think exposing the discriminant is such a great idea, and having to do the extra check is weird and not quite natural. So I wonder if there's a better way to handle this.

---

Fun fact: Adding another variant to the enum (so we have `pub enum Foo { A1, A2, A3, A4 }`) makes LLVM generate better code for some reason:

``` llvm
define zeroext i1 @_ZN5test120ha73e87eb2055d3a4iaa4v0.0E(i8* nocapture readonly, i8* nocapture readonly) unnamed_addr #0 {
entry-block:
  %2 = load i8* %0, align 1, !range !0
  %3 = load i8* %1, align 1, !range !0
  %4 = icmp eq i8 %2, %3
  br i1 %4, label %select.end, label %select.mid

select.mid:                                       ; preds = %entry-block
  br label %select.end

select.end:                                       ; preds = %entry-block, %select.mid
  %. = phi i1 [ true, %entry-block ], [ false, %select.mid ]
  ret i1 %.
}
```

And adding another round of `opt -O2` (LLVM seems to stop optimizing too early here) gives:

``` llvm
define zeroext i1 @_ZN5test120ha73e87eb2055d3a4iaa4v0.0E(i8* nocapture readonly, i8* nocapture readonly) unnamed_addr #0 {
entry-block:
  %2 = load i8* %0, align 1, !range !0
  %3 = load i8* %1, align 1, !range !0
  %4 = icmp eq i8 %2, %3
  ret i1 %4
}
```

EDIT: Forgot to mention that the code with the intrinsic is based upon my branch that uses `i1` for bools. Just to avoid any confusion...


Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Bad performing code from `#[deriving(Eq)]` for enums #13623

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Bad performing code from #[deriving(Eq)] for enums #13623

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions

Bad performing code from `#[deriving(Eq)]` for enums #13623