Closed
Description
I'm looking into the performance of `to_lowercase` / `to_uppercase` on mostly ASCII strings, using a small microbenchmark added to `library/alloc/benches/string.rs`.
#[bench]
fn bench_to_lowercase(b: &mut Bencher) {
let s = "Hello there, the quick brown fox jumped over the lazy dog! \
Lorem ipsum dolor sit amet, consectetur. ";
b.iter(|| s.to_lowercase())
}
Using Linux `perf` tooling, I see that the hot part of the code is the following large loop, which, despite heavy use of SSE2 instructions, only seems to process 32 bytes per iteration.
│ d0:┌─→mov r9,QWORD PTR [r14+r15*1] ▒
│ │ movdqu xmm3,XMMWORD PTR [r14+r15*1] ▒
0,12 │ │ pshufd xmm12,xmm3,0xee ▒
2,56 │ │ movq rdx,xmm12 ▒
│ │ mov rsi,rdx ▒
│ │ or rsi,r9 ▒
0,59 │ │ test rsi,rcx ▒
│ │↓ jne 319 ▒
2,10 │ │ mov rsi,r9 ▒
│ │ mov rdi,r9 ▒
│ │ mov r8,r9 ▒
│ │ mov r10,r9 ▒
2,23 │ │ shr r9d,0x8 ▒
│ │ movd xmm12,r9d ▒
│ │ shr r10,0x20 ◆
0,12 │ │ pshufd xmm13,xmm3,0x44 ▒
1,05 │ │ movdqa xmm14,xmm3 ▒
│ │ psrlq xmm14,0x10 ▒
│ │ psrlq xmm13,0x18 ▒
0,12 │ │ movsd xmm13,xmm14 ▒
1,75 │ │ movd xmm14,r10d ▒
│ │ shr r8,0x28 ▒
│ │ punpcklqdq xmm3,xmm12 ▒
│ │ movd xmm12,r8d ▒
1,52 │ │ andpd xmm13,xmm0 ▒
│ │ pand xmm3,xmm0 ▒
│ │ packuswb xmm3,xmm13 ▒
0,35 │ │ pshufd xmm13,xmm14,0x50 ▒
1,05 │ │ movdqa xmm14,XMMWORD PTR [rip+0x403c7] ▒
│ │ pandn xmm14,xmm13 ▒
│ │ psllq xmm12,0x28 ▒
│ │ movdqa xmm13,XMMWORD PTR [rip+0x403c3] ▒
2,94 │ │ pandn xmm13,xmm12 ▒
│ │ shr rdi,0x30 ▒
│ │ por xmm13,xmm14 ▒
0,35 │ │ movd xmm12,edi ▒
2,22 │ │ shr rsi,0x38 ▒
│ │ packuswb xmm3,xmm1 ▒
│ │ packuswb xmm3,xmm1 ▒
0,47 │ │ por xmm13,xmm3 ▒
4,21 │ │ psllq xmm12,0x30 ▒
│ │ movdqa xmm3,xmm4 ▒
│ │ pandn xmm3,xmm12 ▒
0,35 │ │ movd xmm12,esi ▒
2,47 │ │ mov esi,edx ▒
│ │ shr esi,0x8 ▒
│ │ pand xmm13,xmm4 ▒
0,35 │ │ por xmm3,xmm13 ▒
2,10 │ │ pand xmm3,xmm5 ▒
│ │ psllq xmm12,0x38 ▒
│ │ movdqa xmm13,xmm5 ▒
│ │ pandn xmm13,xmm12 ▒
2,34 │ │ por xmm13,xmm3 ▒
│ │ movd xmm3,edx ▒
│ │ pshufd xmm3,xmm3,0x44 ▒
0,53 │ │ movdqa xmm12,xmm6 ▒
2,47 │ │ pandn xmm12,xmm3 ▒
│ │ movd xmm3,esi ▒
│ │ mov esi,edx ▒
0,23 │ │ shr esi,0x10 ▒
2,64 │ │ pand xmm13,xmm6 ▒
│ │ por xmm12,xmm13 ▒
│ │ pslldq xmm3,0x9 ▒
0,12 │ │ movdqa xmm13,xmm7 ▒
2,45 │ │ pandn xmm13,xmm3 ▒
│ │ movd xmm3,esi ▒
│ │ mov esi,edx ▒
0,51 │ │ shr esi,0x18 ▒
2,60 │ │ pand xmm12,xmm7 ▒
│ │ por xmm13,xmm12 ▒
│ │ pslldq xmm3,0xa ▒
│ │ movdqa xmm12,xmm8 ▒
1,76 │ │ pandn xmm12,xmm3 ▒
│ │ movd xmm3,esi ▒
│ │ mov rsi,rdx ▒
0,47 │ │ shr rsi,0x20 ▒
2,34 │ │ pand xmm13,xmm8 ▒
│ │ por xmm12,xmm13 ▒
│ │ pslldq xmm3,0xb ▒
0,23 │ │ movdqa xmm13,xmm9 ▒
1,99 │ │ pandn xmm13,xmm3 ▒
│ │ movd xmm3,esi ▒
│ │ mov rsi,rdx ▒
0,35 │ │ shr rsi,0x28 ▒
2,97 │ │ pand xmm12,xmm9 ▒
│ │ por xmm13,xmm12 ▒
│ │ pshufd xmm3,xmm3,0x0 ▒
0,12 │ │ movdqa xmm12,xmm10 ▒
2,11 │ │ pandn xmm12,xmm3 ▒
│ │ movd xmm3,esi ▒
│ │ shr rdx,0x30 ▒
│ │ pand xmm13,xmm10 ▒
1,87 │ │ por xmm12,xmm13 ▒
│ │ pand xmm12,xmm11 ▒
│ │ pslldq xmm3,0xd ▒
0,23 │ │ movdqa xmm13,xmm11 ▒
2,23 │ │ pandn xmm13,xmm3 ▒
│ │ por xmm13,xmm12 ▒
│ │ pand xmm13,XMMWORD PTR [rip+0x40320] ▒
0,12 │ │ movd xmm3,edx ▒
2,80 │ │ pslldq xmm3,0xe ▒
│ │ por xmm3,xmm13 ▒
│ │ pand xmm3,XMMWORD PTR [rip+0x4031a] ▒
0,23 │ │ movzx edx,BYTE PTR [r14+r15*1+0xf] ▒
3,31 │ │ movd xmm12,edx ▒
│ │ pslldq xmm12,0xf ▒
│ │ por xmm12,xmm3 ▒
0,12 │ │ movdqa xmm3,xmm12 ▒
2,92 │ │ paddb xmm3,XMMWORD PTR [rip+0x31d97] # 1009a0 <anon.cf73386a2f5127d166baeac25be116f0.63.llvm.16014458289627072720+0x459> ▒
│ │ movdqa xmm13,xmm3 ▒
│ │ pminub xmm13,xmm15 ▒
0,47 │ │ pcmpeqb xmm13,xmm3 ▒
1,53 │ │ pand xmm13,xmm2 ▒
0,36 │ │ por xmm13,xmm12 ▒
│ │ movdqu XMMWORD PTR [rax+r15*1],xmm13 ▒
0,23 │ │ lea rdx,[r15+0x10] ▒
2,34 │ │ add r15,0x20 ▒
0,12 │ │ cmp r15,rbx ▒
│ │ mov r15,rdx ▒
1,64 │ └──jbe d0
I don't see an easy way to improve the autovectorization of this code, but it should be relatively easy to explicitly vectorize it using `portable_simd`, and I would like to prepare such a PR if there are no objections. As far as I know, `portable_simd` is already in use inside `core`, for example by #103779.