Skip to content

Fast algorithm for u128 (and i128) divided by small constant #54867

Open
@leonardo-m

Description

@leonardo-m

This is an enhancement request. While u16, u32 and u64 numbers get divided by a small constant divisor using a fast algorithm, the same isn't true for u128 numbers:

fn digits_sum0(mut n: u16) -> u16 {
    let mut total = 0;
    while n != 0 {
        total += n % 10;
        n /= 10;
    }
    total
}

fn digits_sum1(mut n: u32) -> u32 {
    let mut total = 0;
    while n != 0 {
        total += n % 10;
        n /= 10;
    }
    total
}

fn digits_sum2(mut n: u64) -> u64 {
    let mut total = 0;
    while n != 0 {
        total += n % 10;
        n /= 10;
    }
    total
}

fn digits_sum3(mut n: u128) -> u128 {
    let mut total = 0;
    while n != 0 {
        total += n % 10;
        n /= 10;
    }
    total
}

Generate asm (with -O):

digits_sum0:
        xor     eax, eax
        test    di, di
        je      .LBB0_2
.LBB0_1:
        movzx   ecx, di
        imul    edx, ecx, 52429
        shr     edx, 19
        lea     esi, [rdx + rdx]
        lea     esi, [rsi + 4*rsi]
        sub     edi, esi
        add     eax, edi
        mov     edi, edx
        cmp     ecx, 10
        jae     .LBB0_1
.LBB0_2:
        ret
        

digits_sum1:
        xor     eax, eax
        test    edi, edi
        je      .LBB0_3
        mov     r8d, 3435973837
.LBB0_2:
        mov     edx, edi
        imul    rdx, r8
        shr     rdx, 35
        lea     esi, [rdx + rdx]
        lea     esi, [rsi + 4*rsi]
        mov     ecx, edi
        sub     ecx, esi
        add     eax, ecx
        cmp     edi, 10
        mov     edi, edx
        jae     .LBB0_2
.LBB0_3:
        ret


digits_sum2:
        xor     ecx, ecx
        test    rdi, rdi
        je      .LBB1_3
        movabs  r8, -3689348814741910323
.LBB1_2:
        mov     rax, rdi
        mul     r8
        shr     rdx, 3
        lea     rax, [rdx + rdx]
        lea     rax, [rax + 4*rax]
        mov     rsi, rdi
        sub     rsi, rax
        add     rcx, rsi
        cmp     rdi, 10
        mov     rdi, rdx
        jae     .LBB1_2
.LBB1_3:
        mov     rax, rcx
        ret


digits_sum3:
        push    r15
        push    r14
        push    r13
        push    r12
        push    rbx
        mov     rax, rdi
        or      rax, rsi
        je      .LBB2_1
        mov     rbx, rsi
        mov     r15, rdi
        xor     r14d, r14d
        mov     r13d, 10
        xor     r12d, r12d
.LBB2_4:
        mov     edx, 10
        xor     ecx, ecx
        mov     rdi, r15
        mov     rsi, rbx
        call    __udivti3@PLT
        mov     rcx, rax
        mov     rsi, rdx
        mul     r13
        lea     rdi, [rsi + 4*rsi]
        lea     rdx, [rdx + 2*rdi]
        mov     rdi, r15
        sub     rdi, rax
        mov     rax, rbx
        sbb     rax, rdx
        add     r14, rdi
        adc     r12, rax
        cmp     r15, 10
        sbb     rbx, 0
        mov     r15, rcx
        mov     rbx, rsi
        jae     .LBB2_4
        jmp     .LBB2_2
.LBB2_1:
        xor     r14d, r14d
        xor     r12d, r12d
.LBB2_2:
        mov     rax, r14
        mov     rdx, r12
        pop     rbx
        pop     r12
        pop     r13
        pop     r14
        pop     r15
        ret

The faster algorithm is short enough and it could be added to rustc:

http://ridiculousfish.com/blog/posts/labor-of-division-episode-i.html
http://ridiculousfish.com/blog/posts/labor-of-division-episode-ii.html
http://libdivide.com/
http://ridiculousfish.com/blog/posts/labor-of-division-episode-iii.html

Metadata

Metadata

Assignees

No one assigned

    Labels

    A-LLVMArea: Code generation parts specific to LLVM. Both correctness bugs and optimization-related issues.T-compilerRelevant to the compiler team, which will review and decide on the PR/issue.

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions