@@ -764,13 +764,15 @@ def quantize_array_q8_0(arr):
764
764
assert arr .dtype == np .float32 , f'Bad array type { arr .dtype } '
765
765
n_blocks = arr .size // QK8_0
766
766
blocks = arr .reshape ((n_blocks , QK8_0 ))
767
- return np .fromiter (map ( quantize_block_q8_0 , blocks ), count = n_blocks , dtype = BLOCK_Q8_0 )
767
+ return np .fromiter (quantize_blocks_q8_0 ( blocks ), count = n_blocks , dtype = BLOCK_Q8_0 )
768
768
769
def quantize_block_q8_0(blk, zero = np.float32(0), one = np.float32(1), onetwentyseven = np.float32(127), zero_chunk = (np.int8(0),) * QK8_0):
    """Quantize a single block into a ``(scale, quants)`` pair.

    The scale is the block's largest absolute value divided by
    ``onetwentyseven``, returned as ``np.float16``. The quants are the block
    multiplied by the reciprocal of the scale and rounded. When the scale is
    zero, the pre-built ``zero_chunk`` tuple is returned instead of dividing.

    The keyword defaults are constants evaluated once at definition time;
    callers are expected to pass only ``blk``.
    """
    scale = abs(blk).max() / onetwentyseven
    half_scale = np.float16(scale)
    if scale != zero:
        inverse = one / scale
        return (half_scale, (blk * inverse).round())
    # Zero scale: skip the division entirely and hand back the shared zeros.
    return (half_scale, zero_chunk)
769
+ # Much faster implementation of block quantization contributed by @Cebtenzzre
770
def quantize_blocks_q8_0(blocks):
    """Quantize every row of ``blocks`` into a ``(scale, quants)`` pair.

    Each row is treated as one block. The scale ``d`` is the row's largest
    absolute value divided by 127, and the quants are the row divided by
    ``d`` and rounded. A row whose scale is zero yields all-zero quants.

    Args:
        blocks: 2-D NumPy array with one block per row.

    Yields:
        One ``(d, qs)`` tuple per row, where ``d`` is the scale converted to
        ``np.float16`` and ``qs`` is the row of rounded quants (still a
        floating-point array; no integer cast happens here).
    """
    d = abs(blocks).max(axis=1) / np.float32(127)
    # A row with d == 0 is all zeros, so the division below is 0 / 0. NumPy
    # classifies that as an "invalid value" error, not "divide by zero", so
    # both categories are silenced here; the resulting NaNs are overwritten
    # with zeros immediately afterwards.
    with np.errstate(divide='ignore', invalid='ignore'):
        qs = (blocks / d[:, None]).round()
    qs[d == 0] = 0
    yield from zip(np.float16(d), qs)
774
776
775
777
776
778
class OutputFile :
@@ -892,7 +894,7 @@ def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyM
892
894
elapsed = time .time () - start
893
895
size = ' x ' .join (f"{ dim :6d} " for dim in lazy_tensor .shape )
894
896
padi = len (str (len (model )))
895
- print (f"[{ i + 1 :{padi }d} /{ len (model )} ] Writing tensor { name :38s} | size { size :16} | type { lazy_tensor .data_type .name :6 } | T+{ int (elapsed ):4} " )
897
+ print (f"[{ i + 1 :{padi }d} /{ len (model )} ] Writing tensor { name :38s} | size { size :16} | type { lazy_tensor .data_type .name :4 } | T+{ int (elapsed ):4} " )
896
898
of .gguf .write_tensor_data (ndarray )
897
899
898
900
of .close ()
0 commit comments