Skip to content

String::to_lowercase does not get vectorized well contrary to code comments #123712

Closed
@jhorstmann

Description

@jhorstmann

I'm looking into the performance of `to_lowercase` / `to_uppercase` on mostly ASCII strings, using a small microbenchmark added to `library/alloc/benches/string.rs`.

// Microbenchmark for `to_lowercase` on a short, mixed-case string.
// The input below contains only ASCII characters (letters, spaces and
// punctuation).
#[bench]
fn bench_to_lowercase(b: &mut Bencher) {
    // The trailing `\` is a string-literal line continuation: the newline and
    // the leading whitespace of the next line are not part of the string, so
    // `s` is a single line of text.
    let s = "Hello there, the quick brown fox jumped over the lazy dog! \
              Lorem ipsum dolor sit amet, consectetur. ";
    // Each iteration lowercases `s`; the closure returns the resulting value
    // to `b.iter` rather than discarding it with a trailing `;`.
    b.iter(|| s.to_lowercase())
}

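Using Linux `perf` tooling I see that the hot part of the code is the following large loop, which despite heavy use of SSE2 instructions only seems to process 16 bytes per iteration (one 16-byte load and one 16-byte store; the index advances by 0x10, and the `add r15,0x20` only feeds the loop-bound check).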

       │ d0:┌─→mov        r9,QWORD PTR [r14+r15*1]
       │    │  movdqu     xmm3,XMMWORD PTR [r14+r15*1]
  0,12 │    │  pshufd     xmm12,xmm3,0xee
  2,56 │    │  movq       rdx,xmm12
       │    │  mov        rsi,rdx
       │    │  or         rsi,r9
  0,59 │    │  test       rsi,rcx
       │    │↓ jne        319
  2,10 │    │  mov        rsi,r9
       │    │  mov        rdi,r9
       │    │  mov        r8,r9
       │    │  mov        r10,r9
  2,23 │    │  shr        r9d,0x8
       │    │  movd       xmm12,r9d
       │    │  shr        r10,0x20
  0,12 │    │  pshufd     xmm13,xmm3,0x44
  1,05 │    │  movdqa     xmm14,xmm3
       │    │  psrlq      xmm14,0x10
       │    │  psrlq      xmm13,0x18
  0,12 │    │  movsd      xmm13,xmm14
  1,75 │    │  movd       xmm14,r10d
       │    │  shr        r8,0x28
       │    │  punpcklqdq xmm3,xmm12
       │    │  movd       xmm12,r8d
  1,52 │    │  andpd      xmm13,xmm0
       │    │  pand       xmm3,xmm0
       │    │  packuswb   xmm3,xmm13
  0,35 │    │  pshufd     xmm13,xmm14,0x50
  1,05 │    │  movdqa     xmm14,XMMWORD PTR [rip+0x403c7]
       │    │  pandn      xmm14,xmm13
       │    │  psllq      xmm12,0x28
       │    │  movdqa     xmm13,XMMWORD PTR [rip+0x403c3]
  2,94 │    │  pandn      xmm13,xmm12
       │    │  shr        rdi,0x30
       │    │  por        xmm13,xmm14
  0,35 │    │  movd       xmm12,edi
  2,22 │    │  shr        rsi,0x38
       │    │  packuswb   xmm3,xmm1
       │    │  packuswb   xmm3,xmm1
  0,47 │    │  por        xmm13,xmm3
  4,21 │    │  psllq      xmm12,0x30
       │    │  movdqa     xmm3,xmm4
       │    │  pandn      xmm3,xmm12
  0,35 │    │  movd       xmm12,esi
  2,47 │    │  mov        esi,edx
       │    │  shr        esi,0x8
       │    │  pand       xmm13,xmm4
  0,35 │    │  por        xmm3,xmm13
  2,10 │    │  pand       xmm3,xmm5
       │    │  psllq      xmm12,0x38
       │    │  movdqa     xmm13,xmm5
       │    │  pandn      xmm13,xmm12
  2,34 │    │  por        xmm13,xmm3
       │    │  movd       xmm3,edx
       │    │  pshufd     xmm3,xmm3,0x44
  0,53 │    │  movdqa     xmm12,xmm6
  2,47 │    │  pandn      xmm12,xmm3
       │    │  movd       xmm3,esi
       │    │  mov        esi,edx
  0,23 │    │  shr        esi,0x10
  2,64 │    │  pand       xmm13,xmm6
       │    │  por        xmm12,xmm13
       │    │  pslldq     xmm3,0x9
  0,12 │    │  movdqa     xmm13,xmm7
  2,45 │    │  pandn      xmm13,xmm3
       │    │  movd       xmm3,esi
       │    │  mov        esi,edx
  0,51 │    │  shr        esi,0x18
  2,60 │    │  pand       xmm12,xmm7
       │    │  por        xmm13,xmm12
       │    │  pslldq     xmm3,0xa
       │    │  movdqa     xmm12,xmm8
  1,76 │    │  pandn      xmm12,xmm3
       │    │  movd       xmm3,esi
       │    │  mov        rsi,rdx
  0,47 │    │  shr        rsi,0x20
  2,34 │    │  pand       xmm13,xmm8
       │    │  por        xmm12,xmm13
       │    │  pslldq     xmm3,0xb
  0,23 │    │  movdqa     xmm13,xmm9
  1,99 │    │  pandn      xmm13,xmm3
       │    │  movd       xmm3,esi
       │    │  mov        rsi,rdx
  0,35 │    │  shr        rsi,0x28
  2,97 │    │  pand       xmm12,xmm9
       │    │  por        xmm13,xmm12
       │    │  pshufd     xmm3,xmm3,0x0
  0,12 │    │  movdqa     xmm12,xmm10
  2,11 │    │  pandn      xmm12,xmm3
       │    │  movd       xmm3,esi
       │    │  shr        rdx,0x30
       │    │  pand       xmm13,xmm10
  1,87 │    │  por        xmm12,xmm13
       │    │  pand       xmm12,xmm11
       │    │  pslldq     xmm3,0xd
  0,23 │    │  movdqa     xmm13,xmm11
  2,23 │    │  pandn      xmm13,xmm3
       │    │  por        xmm13,xmm12
       │    │  pand       xmm13,XMMWORD PTR [rip+0x40320]
  0,12 │    │  movd       xmm3,edx
  2,80 │    │  pslldq     xmm3,0xe
       │    │  por        xmm3,xmm13
       │    │  pand       xmm3,XMMWORD PTR [rip+0x4031a]
  0,23 │    │  movzx      edx,BYTE PTR [r14+r15*1+0xf]
  3,31 │    │  movd       xmm12,edx
       │    │  pslldq     xmm12,0xf
       │    │  por        xmm12,xmm3
  0,12 │    │  movdqa     xmm3,xmm12
  2,92 │    │  paddb      xmm3,XMMWORD PTR [rip+0x31d97]        # 1009a0 <anon.cf73386a2f5127d166baeac25be116f0.63.llvm.16014458289627072720+0x459>
       │    │  movdqa     xmm13,xmm3
       │    │  pminub     xmm13,xmm15
  0,47 │    │  pcmpeqb    xmm13,xmm3
  1,53 │    │  pand       xmm13,xmm2
  0,36 │    │  por        xmm13,xmm12
       │    │  movdqu     XMMWORD PTR [rax+r15*1],xmm13
  0,23 │    │  lea        rdx,[r15+0x10]
  2,34 │    │  add        r15,0x20
  0,12 │    │  cmp        r15,rbx
       │    │  mov        r15,rdx
  1,64 │    └──jbe        d0             

I don't see an easy way to improve the autovectorization of this code, but it should be relatively easy to explicitly vectorize it using portable_simd, and I would like to prepare such a PR if there are no objections. As far as I know, portable_simd is already in use inside core, for example by #103779.

Metadata

Metadata

Assignees

No one assigned

    Labels

    C-optimization — Category: An issue highlighting optimization opportunities or PRs implementing such
    T-libs — Relevant to the library team, which will review and decide on the PR/issue.

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions