Skip to content

Commit f20b180

Browse files
klauspost authored and bradfitz committed
compress/flate: eliminate most common bounds checks
This uses the SSA compiler to eliminate various unneeded bounds checks in loops and various lookups.

This fixes the low hanging fruit, without any major code changes.

name                        old time/op   new time/op   delta
EncodeDigitsHuffman1e4-8    49.9µs ± 1%   48.1µs ± 1%   -3.74%  (p=0.000 n=10+9)
EncodeDigitsHuffman1e5-8    476µs ± 1%    458µs ± 1%    -3.58%  (p=0.000 n=10+10)
EncodeDigitsHuffman1e6-8    4.80ms ± 2%   4.56ms ± 1%   -5.07%  (p=0.000 n=10+9)
EncodeDigitsSpeed1e4-8      305µs ± 3%    290µs ± 2%    -5.03%  (p=0.000 n=10+9)
EncodeDigitsSpeed1e5-8      3.67ms ± 2%   3.49ms ± 2%   -4.78%  (p=0.000 n=9+10)
EncodeDigitsSpeed1e6-8      38.3ms ± 2%   35.8ms ± 1%   -6.58%  (p=0.000 n=9+10)
EncodeDigitsDefault1e4-8    361µs ± 2%    346µs ± 3%    -4.12%  (p=0.000 n=10+9)
EncodeDigitsDefault1e5-8    5.24ms ± 2%   4.96ms ± 3%   -5.38%  (p=0.000 n=10+10)
EncodeDigitsDefault1e6-8    56.5ms ± 3%   52.2ms ± 2%   -7.68%  (p=0.000 n=10+10)
EncodeDigitsCompress1e4-8   362µs ± 2%    343µs ± 1%    -5.20%  (p=0.000 n=10+9)
EncodeDigitsCompress1e5-8   5.26ms ± 3%   4.98ms ± 2%   -5.48%  (p=0.000 n=10+10)
EncodeDigitsCompress1e6-8   56.0ms ± 4%   52.1ms ± 1%   -7.01%  (p=0.000 n=10+10)
EncodeTwainHuffman1e4-8     70.9µs ± 3%   64.7µs ± 1%   -8.68%  (p=0.000 n=10+9)
EncodeTwainHuffman1e5-8     556µs ± 2%    524µs ± 2%    -5.84%  (p=0.000 n=10+10)
EncodeTwainHuffman1e6-8     5.54ms ± 3%   5.22ms ± 2%   -5.70%  (p=0.000 n=10+10)
EncodeTwainSpeed1e4-8       294µs ± 3%    284µs ± 1%    -3.71%  (p=0.000 n=10+10)
EncodeTwainSpeed1e5-8       2.59ms ± 2%   2.48ms ± 1%   -4.14%  (p=0.000 n=10+9)
EncodeTwainSpeed1e6-8       25.6ms ± 1%   24.3ms ± 1%   -5.28%  (p=0.000 n=9+10)
EncodeTwainDefault1e4-8     419µs ± 2%    396µs ± 1%    -5.59%  (p=0.000 n=10+9)
EncodeTwainDefault1e5-8     6.23ms ± 4%   5.75ms ± 1%   -7.83%  (p=0.000 n=10+9)
EncodeTwainDefault1e6-8     66.2ms ± 2%   61.4ms ± 1%   -7.22%  (p=0.000 n=10+10)
EncodeTwainCompress1e4-8    426µs ± 1%    405µs ± 1%    -4.97%  (p=0.000 n=9+10)
EncodeTwainCompress1e5-8    6.80ms ± 1%   6.32ms ± 1%   -6.97%  (p=0.000 n=9+10)
EncodeTwainCompress1e6-8    74.6ms ± 3%   68.7ms ± 1%   -7.90%  (p=0.000 n=10+9)

name                        old speed       new speed       delta
EncodeDigitsHuffman1e4-8    200MB/s ± 1%    208MB/s ± 1%    +3.88%  (p=0.000 n=10+9)
EncodeDigitsHuffman1e5-8    210MB/s ± 1%    218MB/s ± 1%    +3.71%  (p=0.000 n=10+10)
EncodeDigitsHuffman1e6-8    208MB/s ± 2%    219MB/s ± 1%    +5.32%  (p=0.000 n=10+9)
EncodeDigitsSpeed1e4-8      32.8MB/s ± 3%   34.5MB/s ± 2%   +5.29%  (p=0.000 n=10+9)
EncodeDigitsSpeed1e5-8      27.2MB/s ± 2%   28.6MB/s ± 2%   +5.29%  (p=0.000 n=10+10)
EncodeDigitsSpeed1e6-8      26.1MB/s ± 2%   27.9MB/s ± 1%   +7.02%  (p=0.000 n=9+10)
EncodeDigitsDefault1e4-8    27.7MB/s ± 2%   28.9MB/s ± 3%   +4.30%  (p=0.000 n=10+9)
EncodeDigitsDefault1e5-8    19.1MB/s ± 2%   20.2MB/s ± 3%   +5.69%  (p=0.000 n=10+10)
EncodeDigitsDefault1e6-8    17.7MB/s ± 3%   19.2MB/s ± 2%   +8.31%  (p=0.000 n=10+10)
EncodeDigitsCompress1e4-8   27.6MB/s ± 2%   29.1MB/s ± 1%   +5.47%  (p=0.000 n=10+9)
EncodeDigitsCompress1e5-8   19.0MB/s ± 3%   20.1MB/s ± 2%   +5.78%  (p=0.000 n=10+10)
EncodeDigitsCompress1e6-8   17.9MB/s ± 4%   19.2MB/s ± 1%   +7.50%  (p=0.000 n=10+10)
EncodeTwainHuffman1e4-8     141MB/s ± 3%    154MB/s ± 1%    +9.46%  (p=0.000 n=10+9)
EncodeTwainHuffman1e5-8     180MB/s ± 2%    191MB/s ± 2%    +6.19%  (p=0.000 n=10+10)
EncodeTwainHuffman1e6-8     181MB/s ± 3%    192MB/s ± 2%    +6.02%  (p=0.000 n=10+10)
EncodeTwainSpeed1e4-8       34.0MB/s ± 3%   35.3MB/s ± 1%   +3.84%  (p=0.000 n=10+10)
EncodeTwainSpeed1e5-8       38.7MB/s ± 2%   40.3MB/s ± 1%   +4.30%  (p=0.000 n=10+9)
EncodeTwainSpeed1e6-8       39.1MB/s ± 1%   41.2MB/s ± 1%   +5.57%  (p=0.000 n=9+10)
EncodeTwainDefault1e4-8     23.9MB/s ± 2%   25.3MB/s ± 1%   +5.91%  (p=0.000 n=10+9)
EncodeTwainDefault1e5-8     16.0MB/s ± 4%   17.4MB/s ± 1%   +8.47%  (p=0.000 n=10+9)
EncodeTwainDefault1e6-8     15.1MB/s ± 2%   16.3MB/s ± 1%   +7.76%  (p=0.000 n=10+10)
EncodeTwainCompress1e4-8    23.5MB/s ± 1%   24.7MB/s ± 1%   +5.24%  (p=0.000 n=9+10)
EncodeTwainCompress1e5-8    14.7MB/s ± 1%   15.8MB/s ± 1%   +7.50%  (p=0.000 n=9+10)
EncodeTwainCompress1e6-8    13.4MB/s ± 3%   14.6MB/s ± 1%   +8.57%  (p=0.000 n=10+9)

Change-Id: I5c7e84c2f9ea4d38a2115995705eebb93387e22f
Reviewed-on: https://go-review.googlesource.com/21759
Reviewed-by: Brad Fitzpatrick <[email protected]>
Run-TryBot: Brad Fitzpatrick <[email protected]>
TryBot-Result: Gobot Gobot <[email protected]>
1 parent 012557b commit f20b180

File tree

3 files changed

+48
-48
lines changed

3 files changed

+48
-48
lines changed

src/compress/flate/deflate.go

+17-20
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,8 @@ type compressor struct {
7373
// hashPrev[hashHead[hashValue] & windowMask] contains the previous index
7474
// with the same hash value.
7575
chainHead int
76-
hashHead []uint32
77-
hashPrev []uint32
76+
hashHead [hashSize]uint32
77+
hashPrev [windowSize]uint32
7878
hashOffset int
7979

8080
// input window: unprocessed data is window[index:windowEnd]
@@ -188,12 +188,13 @@ func (d *compressor) fillWindow(b []byte) {
188188
var newH uint32
189189
for i, val := range dst {
190190
di := i + index
191-
newH = val & hashMask
191+
newH = val
192+
hh := &d.hashHead[newH&hashMask]
192193
// Get previous value with the same hash.
193194
// Our chain should point to the previous value.
194-
d.hashPrev[di&windowMask] = d.hashHead[newH]
195+
d.hashPrev[di&windowMask] = *hh
195196
// Set the head of the hash chain to us.
196-
d.hashHead[newH] = uint32(di + d.hashOffset)
197+
*hh = uint32(di + d.hashOffset)
197198
}
198199
d.hash = newH
199200
}
@@ -293,6 +294,7 @@ func bulkHash4(b []byte, dst []uint32) {
293294
// bytes in size.
294295
func matchLen(a, b []byte, max int) int {
295296
a = a[:max]
297+
b = b[:len(a)]
296298
for i, av := range a {
297299
if b[i] != av {
298300
return i
@@ -302,8 +304,6 @@ func matchLen(a, b []byte, max int) int {
302304
}
303305

304306
func (d *compressor) initDeflate() {
305-
d.hashHead = make([]uint32, hashSize)
306-
d.hashPrev = make([]uint32, windowSize)
307307
d.window = make([]byte, 2*windowSize)
308308
d.hashOffset = 1
309309
d.tokens = make([]token, 0, maxFlateBlockTokens+1)
@@ -358,9 +358,10 @@ Loop:
358358
if d.index < d.maxInsertIndex {
359359
// Update the hash
360360
d.hash = hash4(d.window[d.index : d.index+minMatchLength])
361-
d.chainHead = int(d.hashHead[d.hash])
361+
hh := &d.hashHead[d.hash&hashMask]
362+
d.chainHead = int(*hh)
362363
d.hashPrev[d.index&windowMask] = uint32(d.chainHead)
363-
d.hashHead[d.hash] = uint32(d.index + d.hashOffset)
364+
*hh = uint32(d.index + d.hashOffset)
364365
}
365366
prevLength := d.length
366367
prevOffset := d.offset
@@ -404,9 +405,10 @@ Loop:
404405
d.hash = hash4(d.window[d.index : d.index+minMatchLength])
405406
// Get previous value with the same hash.
406407
// Our chain should point to the previous value.
407-
d.hashPrev[d.index&windowMask] = d.hashHead[d.hash]
408+
hh := &d.hashHead[d.hash&hashMask]
409+
d.hashPrev[d.index&windowMask] = *hh
408410
// Set the head of the hash chain to us.
409-
d.hashHead[d.hash] = uint32(d.index + d.hashOffset)
411+
*hh = uint32(d.index + d.hashOffset)
410412
}
411413
}
412414
if d.fastSkipHashing == skipNever {
@@ -531,9 +533,6 @@ func (d *compressor) init(w io.Writer, level int) (err error) {
531533
return nil
532534
}
533535

534-
// hzeroes is used for zeroing the hash slice.
535-
var hzeroes [256]uint32
536-
537536
func (d *compressor) reset(w io.Writer) {
538537
d.w.reset(w)
539538
d.sync = false
@@ -543,15 +542,13 @@ func (d *compressor) reset(w io.Writer) {
543542
d.windowEnd = 0
544543
default:
545544
d.chainHead = -1
546-
for s := d.hashHead; len(s) > 0; {
547-
n := copy(s, hzeroes[:])
548-
s = s[n:]
545+
for i := range d.hashHead {
546+
d.hashHead[i] = 0
549547
}
550-
for s := d.hashPrev; len(s) > 0; s = s[len(hzeroes):] {
551-
copy(s, hzeroes[:])
548+
for i := range d.hashPrev {
549+
d.hashPrev[i] = 0
552550
}
553551
d.hashOffset = 1
554-
555552
d.index, d.windowEnd = 0, 0
556553
d.blockStart, d.byteAvailable = 0, false
557554
d.tokens = d.tokens[:0]

src/compress/flate/huffman_bit_writer.go

+29-26
Original file line numberDiff line numberDiff line change
@@ -84,11 +84,11 @@ type huffmanBitWriter struct {
8484
bits uint64
8585
nbits uint
8686
bytes [bufferSize]byte
87+
codegenFreq [codegenCodeCount]int32
8788
nbytes int
8889
literalFreq []int32
8990
offsetFreq []int32
9091
codegen []uint8
91-
codegenFreq []int32
9292
literalEncoding *huffmanEncoder
9393
offsetEncoding *huffmanEncoder
9494
codegenEncoding *huffmanEncoder
@@ -101,7 +101,6 @@ func newHuffmanBitWriter(w io.Writer) *huffmanBitWriter {
101101
literalFreq: make([]int32, maxNumLit),
102102
offsetFreq: make([]int32, offsetCodeCount),
103103
codegen: make([]uint8, maxNumLit+offsetCodeCount+1),
104-
codegenFreq: make([]int32, codegenCodeCount),
105104
literalEncoding: newHuffmanEncoder(maxNumLit),
106105
codegenEncoding: newHuffmanEncoder(codegenCodeCount),
107106
offsetEncoding: newHuffmanEncoder(offsetCodeCount),
@@ -143,12 +142,13 @@ func (w *huffmanBitWriter) writeBits(b int32, nb uint) {
143142
w.bits >>= 48
144143
w.nbits -= 48
145144
n := w.nbytes
146-
w.bytes[n+0] = byte(bits)
147-
w.bytes[n+1] = byte(bits >> 8)
148-
w.bytes[n+2] = byte(bits >> 16)
149-
w.bytes[n+3] = byte(bits >> 24)
150-
w.bytes[n+4] = byte(bits >> 32)
151-
w.bytes[n+5] = byte(bits >> 40)
145+
bytes := w.bytes[n : n+6]
146+
bytes[0] = byte(bits)
147+
bytes[1] = byte(bits >> 8)
148+
bytes[2] = byte(bits >> 16)
149+
bytes[3] = byte(bits >> 24)
150+
bytes[4] = byte(bits >> 32)
151+
bytes[5] = byte(bits >> 40)
152152
n += 6
153153
if n >= bufferFlushSize {
154154
_, w.err = w.w.Write(w.bytes[:n])
@@ -293,12 +293,13 @@ func (w *huffmanBitWriter) writeCode(c hcode) {
293293
w.bits >>= 48
294294
w.nbits -= 48
295295
n := w.nbytes
296-
w.bytes[n+0] = byte(bits)
297-
w.bytes[n+1] = byte(bits >> 8)
298-
w.bytes[n+2] = byte(bits >> 16)
299-
w.bytes[n+3] = byte(bits >> 24)
300-
w.bytes[n+4] = byte(bits >> 32)
301-
w.bytes[n+5] = byte(bits >> 40)
296+
bytes := w.bytes[n : n+6]
297+
bytes[0] = byte(bits)
298+
bytes[1] = byte(bits >> 8)
299+
bytes[2] = byte(bits >> 16)
300+
bytes[3] = byte(bits >> 24)
301+
bytes[4] = byte(bits >> 32)
302+
bytes[5] = byte(bits >> 40)
302303
n += 6
303304
if n >= bufferFlushSize {
304305
_, w.err = w.w.Write(w.bytes[:n])
@@ -428,13 +429,13 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) {
428429
// Generate codegen and codegenFrequencies, which indicates how to encode
429430
// the literalEncoding and the offsetEncoding.
430431
w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding)
431-
w.codegenEncoding.generate(w.codegenFreq, 7)
432+
w.codegenEncoding.generate(w.codegenFreq[:], 7)
432433
numCodegens = len(w.codegenFreq)
433434
for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 {
434435
numCodegens--
435436
}
436437
dynamicHeader := int64(3+5+5+4+(3*numCodegens)) +
437-
w.codegenEncoding.bitLength(w.codegenFreq) +
438+
w.codegenEncoding.bitLength(w.codegenFreq[:]) +
438439
int64(extraBits) +
439440
int64(w.codegenFreq[16]*2) +
440441
int64(w.codegenFreq[17]*3) +
@@ -482,7 +483,7 @@ func (w *huffmanBitWriter) writeBlockDynamic(tokens []token, eof bool, input []b
482483
// Generate codegen and codegenFrequencies, which indicates how to encode
483484
// the literalEncoding and the offsetEncoding.
484485
w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding)
485-
w.codegenEncoding.generate(w.codegenFreq, 7)
486+
w.codegenEncoding.generate(w.codegenFreq[:], 7)
486487
numCodegens := len(w.codegenFreq)
487488
for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 {
488489
numCodegens--
@@ -609,13 +610,13 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte) {
609610
// Generate codegen and codegenFrequencies, which indicates how to encode
610611
// the literalEncoding and the offsetEncoding.
611612
w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, huffOffset)
612-
w.codegenEncoding.generate(w.codegenFreq, 7)
613+
w.codegenEncoding.generate(w.codegenFreq[:], 7)
613614
numCodegens = len(w.codegenFreq)
614615
for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 {
615616
numCodegens--
616617
}
617618
headerSize := int64(3+5+5+4+(3*numCodegens)) +
618-
w.codegenEncoding.bitLength(w.codegenFreq) +
619+
w.codegenEncoding.bitLength(w.codegenFreq[:]) +
619620
int64(w.codegenFreq[16]*2) +
620621
int64(w.codegenFreq[17]*3) +
621622
int64(w.codegenFreq[18]*7)
@@ -639,7 +640,7 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte) {
639640

640641
// Huffman.
641642
w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof)
642-
encoding := w.literalEncoding.codes
643+
encoding := w.literalEncoding.codes[:257]
643644
n := w.nbytes
644645
for _, t := range input {
645646
// Bitwriting inlined, ~30% speedup
@@ -653,12 +654,13 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte) {
653654
bits := w.bits
654655
w.bits >>= 48
655656
w.nbits -= 48
656-
w.bytes[n+0] = byte(bits)
657-
w.bytes[n+1] = byte(bits >> 8)
658-
w.bytes[n+2] = byte(bits >> 16)
659-
w.bytes[n+3] = byte(bits >> 24)
660-
w.bytes[n+4] = byte(bits >> 32)
661-
w.bytes[n+5] = byte(bits >> 40)
657+
bytes := w.bytes[n : n+6]
658+
bytes[0] = byte(bits)
659+
bytes[1] = byte(bits >> 8)
660+
bytes[2] = byte(bits >> 16)
661+
bytes[3] = byte(bits >> 24)
662+
bytes[4] = byte(bits >> 32)
663+
bytes[5] = byte(bits >> 40)
662664
n += 6
663665
if n < bufferFlushSize {
664666
continue
@@ -677,6 +679,7 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte) {
677679
//
678680
// len(h) must be >= 256, and h's elements must be all zeroes.
679681
func histogram(b []byte, h []int32) {
682+
h = h[:256]
680683
for _, t := range b {
681684
h[t]++
682685
}

src/compress/flate/huffman_code.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -96,8 +96,8 @@ func generateFixedLiteralEncoding() *huffmanEncoder {
9696
func generateFixedOffsetEncoding() *huffmanEncoder {
9797
h := newHuffmanEncoder(30)
9898
codes := h.codes
99-
for ch := uint16(0); ch < 30; ch++ {
100-
codes[ch] = hcode{code: reverseBits(ch, 5), len: 5}
99+
for ch := range codes {
100+
codes[ch] = hcode{code: reverseBits(uint16(ch), 5), len: 5}
101101
}
102102
return h
103103
}

0 commit comments

Comments
 (0)