Skip to content

Performance regression of array::IntoIter vs slice::Iter #115339

Closed
@DaniPopes

Description

@DaniPopes

Code

I tried this code:

use core::iter;

/// Bitwise-ORs every byte of `b` into the matching byte of `a` and returns `a`.
///
/// `b` is handed to `iter::zip` **by value**, so the right-hand side is walked
/// with the owning array iterator (`array::IntoIter`), yielding `u8` items.
/// This is the variant reported as producing byte-at-a-time code.
pub fn array(mut a: [u8; 32], b: [u8; 32]) -> [u8; 32] {
    // Left side borrows `a` mutably (`&mut u8` items); right side consumes `b`.
    iter::zip(&mut a, b).for_each(|(a, b)| *a |= b);
    a
}

/// Bitwise-ORs every byte of `b` into the matching byte of `a` and returns `a`.
///
/// Unlike `array`, `b` is handed to `iter::zip` **by reference**, so the
/// right-hand side is walked with the borrowing iterator (`slice::Iter`),
/// yielding `&u8` items that are dereferenced in the closure.
/// This is the baseline variant that compiles to a single vector OR.
pub fn slice(mut a: [u8; 32], b: [u8; 32]) -> [u8; 32] {
    // Left side borrows `a` mutably (`&mut u8` items); right side borrows `b`.
    iter::zip(&mut a, &b).for_each(|(a, b)| *a |= *b);
    a
}

I expected to see this happen: both functions produce the same codegen, or at least perform similarly.

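Instead, this happened: `array` has far worse codegen. It is not vectorized and ORs one byte at a time, whereas `slice` compiles to a single 32-byte vector OR (see the assembly below).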

godbolt

Assembly

example::array:
        vmovdqu xmm1, xmmword ptr [rdx]
        vmovdqu xmm0, xmmword ptr [rdx + 16]
        mov     rax, rdi
        vmovd   ecx, xmm1
        or      byte ptr [rsi], cl
        vpextrb ecx, xmm1, 1
        or      byte ptr [rsi + 1], cl
        vpextrb ecx, xmm1, 2
        or      byte ptr [rsi + 2], cl
        vpextrb ecx, xmm1, 3
        or      byte ptr [rsi + 3], cl
        vpextrb ecx, xmm1, 4
        or      byte ptr [rsi + 4], cl
        vpextrb ecx, xmm1, 5
        or      byte ptr [rsi + 5], cl
        vpextrb ecx, xmm1, 6
        or      byte ptr [rsi + 6], cl
        vpextrb ecx, xmm1, 7
        or      byte ptr [rsi + 7], cl
        vpextrb ecx, xmm1, 8
        or      byte ptr [rsi + 8], cl
        vpextrb ecx, xmm1, 9
        or      byte ptr [rsi + 9], cl
        vpextrb ecx, xmm1, 10
        or      byte ptr [rsi + 10], cl
        vpextrb ecx, xmm1, 11
        or      byte ptr [rsi + 11], cl
        vpextrb ecx, xmm1, 12
        or      byte ptr [rsi + 12], cl
        vpextrb ecx, xmm1, 13
        or      byte ptr [rsi + 13], cl
        vpextrb ecx, xmm1, 14
        or      byte ptr [rsi + 14], cl
        vpextrb ecx, xmm1, 15
        or      byte ptr [rsi + 15], cl
        vmovd   ecx, xmm0
        or      byte ptr [rsi + 16], cl
        vpextrb ecx, xmm0, 1
        or      byte ptr [rsi + 17], cl
        vpextrb ecx, xmm0, 2
        or      byte ptr [rsi + 18], cl
        vpextrb ecx, xmm0, 3
        or      byte ptr [rsi + 19], cl
        vpextrb ecx, xmm0, 4
        or      byte ptr [rsi + 20], cl
        vpextrb ecx, xmm0, 5
        or      byte ptr [rsi + 21], cl
        vpextrb ecx, xmm0, 6
        or      byte ptr [rsi + 22], cl
        vpextrb ecx, xmm0, 7
        or      byte ptr [rsi + 23], cl
        vpextrb ecx, xmm0, 8
        or      byte ptr [rsi + 24], cl
        vpextrb ecx, xmm0, 9
        or      byte ptr [rsi + 25], cl
        vpextrb ecx, xmm0, 10
        or      byte ptr [rsi + 26], cl
        vpextrb ecx, xmm0, 11
        or      byte ptr [rsi + 27], cl
        vpextrb ecx, xmm0, 12
        or      byte ptr [rsi + 28], cl
        vpextrb ecx, xmm0, 13
        or      byte ptr [rsi + 29], cl
        vpextrb ecx, xmm0, 14
        or      byte ptr [rsi + 30], cl
        vpextrb ecx, xmm0, 15
        or      byte ptr [rsi + 31], cl
        vmovups ymm0, ymmword ptr [rsi]
        vmovups ymmword ptr [rdi], ymm0
        vzeroupper
        ret

example::slice:
        vmovups ymm0, ymmword ptr [rsi]
        mov     rax, rdi
        vorps   ymm0, ymm0, ymmword ptr [rdx]
        vmovups ymmword ptr [rsi], ymm0
        vmovups ymmword ptr [rdi], ymm0
        vzeroupper
        ret

Version it worked on

It most recently worked on: 1.64

Version with regression

1.65 through the current nightly (1.74.0-nightly; see the `rustc --version` output below).

Regressed in nightly-2022-08-13, possibly caused by the LLVM 15 upgrade (#99464):

fetching (via local git) commits from 20ffea6938b5839c390252e07940b99e3b6a889a to f22819bcce4abaff7d1246a56eec493418f9f4ee

found 8 bors merge commits in the specified range
  commit[0] 2022-08-11: Auto merge of #100416 - Dylan-DPC:rollup-m344lh1, r=Dylan-DPC
  commit[1] 2022-08-11: Auto merge of #100426 - matthiaskrgr:rollup-0ks4dou, r=matthiaskrgr
  commit[2] 2022-08-12: Auto merge of #100419 - flip1995:clippyup, r=Manishearth
  commit[3] 2022-08-12: Auto merge of #99464 - nikic:llvm-15, r=cuviper
  commit[4] 2022-08-12: Auto merge of #100435 - ehuss:update-cargo, r=ehuss
  commit[5] 2022-08-12: Auto merge of #99624 - vincenzopalazzo:macros/unix_error, r=Amanieu
  commit[6] 2022-08-12: Auto merge of #100328 - davidtwco:perf-implications, r=nnethercote
  commit[7] 2022-08-12: Auto merge of #100456 - Dylan-DPC:rollup-fn17z9f, r=Dylan-DPC

rustc --version --verbose:

rustc 1.74.0-nightly (8550f15e1 2023-08-27)
binary: rustc
commit-hash: 8550f15e148407159af401e02b1d9259762b3496
commit-date: 2023-08-27
host: x86_64-unknown-linux-gnu
release: 1.74.0-nightly
LLVM version: 17.0.0

Backtrace

Backtrace

N/A

@rustbot modify labels: +regression-from-stable-to-stable -regression-untriaged

Metadata

Metadata

Assignees

Labels

A-LLVM — Area: Code generation parts specific to LLVM. Both correctness bugs and optimization-related issues.
C-bug — Category: This is a bug.
C-optimization — Category: An issue highlighting optimization opportunities or PRs implementing such
I-slow — Issue: Problems and improvements with respect to performance of generated code.
P-medium — Medium priority
T-compiler — Relevant to the compiler team, which will review and decide on the PR/issue.
llvm-fixed-upstream — Issue expected to be fixed by the next major LLVM upgrade, or backported fixes
regression-from-stable-to-stable — Performance or correctness regression from one stable version to another.

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions