Skip to content

Big performance problem with closed intervals looping #45222

Open
@leonardo-m

Description

@leonardo-m

In my code I've essentially stopped using loops with ... (intervals closed on the right, recently written with the syntax ..=) because they give performance problems. This is a simple example that shows the problem:

#![feature(inclusive_range_syntax)]
#![allow(private_no_mangle_fns)]

#[inline(never)]
#[no_mangle]
fn foo1(n: u64) -> u64 {
    let mut count = 0;
    for _ in 0 .. n {
        for j in (0 .. n + 1).rev() {
            count += j;
        }
    }
    count
}

#[inline(never)]
#[no_mangle]
fn foo2(n: u64) -> u64 {
    let mut count = 0;
    for _ in 0 .. n {
        for j in (0 ..= n).rev() {
            count += j;
        }
    }
    count
}

fn main() {
    let n: u64 = std::env::args().nth(1).unwrap().parse().unwrap();
    let what: u32 = std::env::args().nth(2).unwrap().parse().unwrap();

    match what {
        1 => println!("{}", foo1(n)),
        2 => println!("{}", foo2(n)),
        _ => panic!(),
    }
}

Compiled with the last Nightly:

rustc 1.22.0-nightly (d6d711dd8 2017-10-10)
binary: rustc
commit-hash: d6d711dd8f7ad5885294b8e1f0009a23dc1f8b1f
commit-date: 2017-10-10
host: x86_64-pc-windows-gnu
release: 1.22.0-nightly
LLVM version: 4.0

Compiled with:
rustc -O test.rs

Running it calling foo1 takes 0.02 seconds:

...>elaps test 100000 1
500005000000000

Running it calling foo2 takes about 13.65 seconds:


...>elaps test 100000 2
500005000000000
The asm I am seeing using "--emit asm"
foo1:
	testq	%rcx, %rcx
	je	.LBB5_1
	movq	%rcx, %r8
	imulq	%r8, %r8
	leaq	-1(%rcx), %rdx
	movq	%rcx, %rax
	mulq	%rdx
	shldq	$63, %rax, %rdx
	subq	%rdx, %r8
	cmpq	$3, %rcx
	jbe	.LBB5_3
	movq	%rcx, %rdx
	andq	$-4, %rdx
	je	.LBB5_3
	movd	%r8, %xmm0
	pshufd	$68, %xmm0, %xmm2
	leaq	-4(%rdx), %r9
	movl	%r9d, %eax
	shrl	$2, %eax
	incl	%eax
	andq	$3, %rax
	je	.LBB5_8
	cmpq	$-1, %rcx
	pxor	%xmm0, %xmm0
	pxor	%xmm3, %xmm3
	je	.LBB5_11
	movdqa	%xmm2, %xmm3
.LBB5_11:
	negq	%rax
	xorl	%r10d, %r10d
	pxor	%xmm1, %xmm1
	.p2align	4, 0x90
.LBB5_12:
	paddq	%xmm3, %xmm0
	paddq	%xmm3, %xmm1
	addq	$4, %r10
	incq	%rax
	jne	.LBB5_12
	jmp	.LBB5_13
.LBB5_3:
	xorl	%eax, %eax
	xorl	%edx, %edx
.LBB5_4:
	xorl	%r9d, %r9d
	cmpq	$-1, %rcx
	cmoveq	%r9, %r8
	.p2align	4, 0x90
.LBB5_5:
	incq	%rdx
	addq	%r8, %rax
	cmpq	%rcx, %rdx
	jb	.LBB5_5
.LBB5_19:
	retq
.LBB5_1:
	xorl	%eax, %eax
	retq
.LBB5_8:
	xorl	%r10d, %r10d
	pxor	%xmm0, %xmm0
	pxor	%xmm1, %xmm1
.LBB5_13:
	cmpq	$12, %r9
	jb	.LBB5_18
	cmpq	$-1, %rcx
	pxor	%xmm3, %xmm3
	je	.LBB5_16
	movdqa	%xmm2, %xmm3
.LBB5_16:
	movq	%rdx, %rax
	subq	%r10, %rax
	.p2align	4, 0x90
.LBB5_17:
	paddq	%xmm3, %xmm0
	paddq	%xmm3, %xmm1
	paddq	%xmm3, %xmm0
	paddq	%xmm3, %xmm1
	paddq	%xmm3, %xmm0
	paddq	%xmm3, %xmm1
	paddq	%xmm3, %xmm0
	paddq	%xmm3, %xmm1
	addq	$-16, %rax
	jne	.LBB5_17
.LBB5_18:
	paddq	%xmm1, %xmm0
	pshufd	$78, %xmm0, %xmm1
	paddq	%xmm0, %xmm1
	movd	%xmm1, %rax
	cmpq	%rcx, %rdx
	jne	.LBB5_4
	jmp	.LBB5_19



foo2:
	pushq	%rsi
	pushq	%rdi
	pushq	%rbx
	testq	%rcx, %rcx
	je	.LBB6_1
	testb	$1, %cl
	jne	.LBB6_4
	xorl	%eax, %eax
	xorl	%r8d, %r8d
	cmpq	$1, %rcx
	jne	.LBB6_11
	jmp	.LBB6_23
.LBB6_1:
	xorl	%eax, %eax
	jmp	.LBB6_23
.LBB6_4:
	xorl	%r8d, %r8d
	movq	$-1, %r9
	xorl	%r10d, %r10d
	movq	%rcx, %r11
	xorl	%eax, %eax
	jmp	.LBB6_5
	.p2align	4, 0x90
.LBB6_8:
	addq	%r11, %rax
	movq	%rdi, %r10
	movq	%rdx, %r11
.LBB6_5:
	cmpq	%r11, %r10
	movl	$1, %esi
	cmovbq	%r9, %rsi
	cmoveq	%r8, %rsi
	testq	%rsi, %rsi
	movl	$1, %edi
	movl	$0, %edx
	je	.LBB6_8
	cmpq	$-1, %rsi
	jne	.LBB6_9
	leaq	-1(%r11), %rdx
	movq	%r10, %rdi
	jmp	.LBB6_8
.LBB6_9:
	movl	$1, %r8d
	cmpq	$1, %rcx
	je	.LBB6_23
.LBB6_11:
	xorl	%r9d, %r9d
	movq	$-1, %r10
	.p2align	4, 0x90
.LBB6_12:
	xorl	%r11d, %r11d
	movq	%rcx, %rdx
	jmp	.LBB6_13
	.p2align	4, 0x90
.LBB6_16:
	addq	%rdx, %rax
	movq	%rbx, %r11
	movq	%rsi, %rdx
.LBB6_13:
	cmpq	%rdx, %r11
	movl	$1, %edi
	cmovbq	%r10, %rdi
	cmoveq	%r9, %rdi
	testq	%rdi, %rdi
	movl	$1, %ebx
	movl	$0, %esi
	je	.LBB6_16
	cmpq	$-1, %rdi
	jne	.LBB6_17
	leaq	-1(%rdx), %rsi
	movq	%r11, %rbx
	jmp	.LBB6_16
	.p2align	4, 0x90
.LBB6_17:
	addq	$2, %r8
	xorl	%r11d, %r11d
	movq	%rcx, %rdx
	jmp	.LBB6_18
	.p2align	4, 0x90
.LBB6_21:
	addq	%rdx, %rax
	movq	%rbx, %r11
	movq	%rsi, %rdx
.LBB6_18:
	cmpq	%rdx, %r11
	movl	$1, %edi
	cmovbq	%r10, %rdi
	cmoveq	%r9, %rdi
	testq	%rdi, %rdi
	movl	$1, %ebx
	movl	$0, %esi
	je	.LBB6_21
	cmpq	$-1, %rdi
	jne	.LBB6_22
	leaq	-1(%rdx), %rsi
	movq	%r11, %rbx
	jmp	.LBB6_21
	.p2align	4, 0x90
.LBB6_22:
	cmpq	%rcx, %r8
	jb	.LBB6_12
.LBB6_23:
	popq	%rbx
	popq	%rdi
	popq	%rsi
	retq

Metadata

Metadata

Assignees

No one assigned

    Labels

    C-enhancementCategory: An issue proposing an enhancement or a PR with one.C-optimizationCategory: An issue highlighting optimization opportunities or PRs implementing suchI-slowIssue: Problems and improvements with respect to performance of generated code.T-compilerRelevant to the compiler team, which will review and decide on the PR/issue.

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions