Commit 75bb712

fix
1 parent e808d19 commit 75bb712

File tree

9 files changed: +3247 -70 lines changed


paddlenlp/quantization/hadamard_utils.py

Lines changed: 26 additions & 35 deletions
@@ -34,42 +34,27 @@ def matmul_hadU(X):
 
 
 def random_hadamard_matrix(size, dtype, quantization_config):
-    if not quantization_config.hadamard_is_block:
+    if quantization_config.hadamard_block_size < 0:
         A = paddle.randint(low=0, high=2, shape=[size, size]).astype("float32") * 2 - 1
         Q, _ = paddle.linalg.qr(A)
         return Q.astype(dtype), 1
     else:
-        if quantization_config.hadamard_block_size != -1:
-            assert size % quantization_config.hadamard_block_size == 0, "Please choose a correct block_size"
-            num_blocks = size // quantization_config.hadamard_block_size
-            Q = paddle.diag(paddle.ones((quantization_config.hadamard_block_size,), dtype="float32"))
-            block = matmul_hadU(Q)
-            return block, quantization_config.hadamard_block_size
-        else:
-            num_blocks = size
-            while not (num_blocks % 2):
-                num_blocks = num_blocks // 2
-            block_size = size // num_blocks
-            Q = paddle.diag(paddle.ones((block_size,), dtype="float32"))
-            block = matmul_hadU(Q)
-            large_matrix = paddle.zeros([size, size])
-
-            for i in range(num_blocks):
-                start_row = i * block_size
-                start_col = i * block_size
-                large_matrix[start_row : start_row + block_size, start_col : start_col + block_size] = block
-            return large_matrix.cast(dtype), block_size
-
-
-def hadamard_matmul(input, side, hadamard_maxtrix, block_size):
+        assert size % quantization_config.hadamard_block_size == 0, "Please choose a correct block_size"
+        Q = paddle.diag(paddle.ones((quantization_config.hadamard_block_size,), dtype="float32"))
+        block = matmul_hadU(Q)
+        print("random_hadamard_matrix", block, quantization_config.hadamard_block_size)
+        return block, quantization_config.hadamard_block_size
+
+
+def hadamard_matmul(input, side, hadamard_matrix, block_size):
     # left -> H.T@input right -> input@H
     origin_shape = input.shape
     input = input.reshape([-1, origin_shape[-1]])
     if side == "left":
         # H.T@input -> (input.T@H).T
         input = input.transpose([1, 0])
     block_num = input.shape[-1] // block_size
-    output = input.reshape([-1, block_num, block_size]) @ hadamard_maxtrix
+    output = input.reshape([-1, block_num, block_size]) @ hadamard_matrix
     output = output.reshape([-1, block_num * block_size])
     if side == "left":
         output = output.transpose([1, 0])
@@ -81,17 +66,23 @@ def hadamard_matmul(input, side, hadamard_maxtrix, block_size):
 def apply_hadamard_matmul(x, side, quantization_config=None, dequant=False):
     if getattr(infohub, "hadamard") is None:
         setattr(infohub, "hadamard", {})
-    if side == "left":
-        x_shape = x.shape[0]
+
+    if quantization_config.hadamard_block_size < 0:
+        if side == "left":
+            block_size = x.shape[0]
+        else:
+            block_size = x.shape[-1]
     else:
-        x_shape = x.shape[-1]
-    if x_shape in infohub.hadamard:
-        hadamard_maxtrix, block_size = infohub.hadamard[x_shape]
+        block_size = quantization_config.hadamard_block_size
+
+    if block_size in infohub.hadamard:
+        hadamard_matrix, hadamard_scale = infohub.hadamard[block_size]
     else:
-        hadamard_matrix, block_size = random_hadamard_matrix(x_shape, x.dtype, quantization_config)
-        infohub.hadamard[x_shape] = (hadamard_matrix, block_size)
-    if block_size > 1:
-        target_x = hadamard_matmul(x, side, hadamard_maxtrix, block_size)
+        hadamard_matrix, hadamard_scale = random_hadamard_matrix(block_size, x.dtype, quantization_config)
+        infohub.hadamard[block_size] = (hadamard_matrix, hadamard_scale)
+
+    if hadamard_scale > 1:
+        target_x = hadamard_matmul(x, side, hadamard_matrix, block_size)
     else:
         if dequant:
             hadamard_matrix = hadamard_matrix.T
@@ -100,4 +91,4 @@ def apply_hadamard_matmul(x, side, quantization_config=None, dequant=False):
         else:
             target_x = hadamard_matrix.T @ x
 
-    return target_x, block_size
+    return target_x, hadamard_scale
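Note: the reshape trick in hadamard_matmul applies one small block matrix to every block_size-wide slice of the last dimension, which is equivalent to multiplying by the block-diagonal matrix that the deleted branch used to build explicitly. A minimal standalone sketch of that equivalence (not part of the commit; H_block, x and the shapes are illustrative):

import paddle

block_size, num_blocks = 4, 3
size = block_size * num_blocks

H_block = paddle.randn([block_size, block_size])  # any square block works for the equivalence check
x = paddle.randn([2, size])                       # e.g. activations whose last dim is a multiple of block_size

# Blockwise multiply via reshape, as hadamard_matmul does for side == "right".
out_blockwise = (x.reshape([-1, num_blocks, block_size]) @ H_block).reshape([-1, size])

# Same result with an explicit block-diagonal matrix (the old code path).
H_full = paddle.zeros([size, size])
for i in range(num_blocks):
    s = i * block_size
    H_full[s : s + block_size, s : s + block_size] = H_block

print(paddle.allclose(out_blockwise, x @ H_full))  # True, up to float tolerance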

paddlenlp/quantization/qat_utils.py

Lines changed: 9 additions & 6 deletions
@@ -38,11 +38,12 @@ def quantize(
     group=None,
 ):
     if apply_hadamard:
-        target_x, block_size = apply_hadamard_matmul(x, side, quantization_config)
+        target_x, hadamard_scale = apply_hadamard_matmul(x, side, quantization_config)
     else:
         target_x = x
-        block_size = 1
+        hadamard_scale = 1
     qmin, qmax = QMAX_QMIN_MAPPING[weight_quantize_algo + "_" + tensor_type]
+    print("apply_hadamard", apply_hadamard, qmin, qmax, tensor_type, hadamard_scale)
     if tensor_type == "activation":
         if act_scale is not None:
             if training:
@@ -51,7 +52,8 @@ def quantize(
                 if state > quantization_config.apply_online_actscale_step:
                     scale = act_scale
             else:
-                scale = act_scale
+                # scale = act_scale
+                scale = paddle.max(paddle.abs(target_x)) / qmax
         else:
             scale = paddle.max(paddle.abs(target_x)) / qmax
         if weight_quantize_algo in ["a8w8linear", "a8w4linear"]:
@@ -66,7 +68,7 @@ def quantize(
                 paddle.distributed.all_reduce(scale, op=paddle.distributed.ReduceOp.MAX, group=group, sync_op=True)
             quant_x = paddle.clip((target_x / scale).round(), qmin, qmax).astype("int8").T
             scale.stop_gradient = True
-            scale = scale.squeeze(0) / block_size
+            scale = scale.squeeze(0) / hadamard_scale
         else:
             raise NotImplementedError(f"Unknown {weight_quantize_algo}.")
     else:
@@ -79,8 +81,8 @@ def dequantize(quant_x, scale, tensor_type, weight_quantize_algo, apply_hadamard
     if weight_quantize_algo in ["a8w8linear", "a8w4linear"]:
         x = quant_x.T.astype(scale.dtype)
         if apply_hadamard:
-            x, block_size = apply_hadamard_matmul(x, side, dequant=True)
-            x *= scale / block_size
+            x, hadamard_scale = apply_hadamard_matmul(x, side, dequant=True)
+            x *= scale / hadamard_scale
         else:
             x *= scale
     else:
@@ -112,6 +114,7 @@ def int8_forward(
     )
 
     out = paddle.matmul(quant_x, quant_w.T).astype(scale_w.dtype) * (scale_x * scale_w)
+    # out = paddle.matmul(x, quant_w.T.astype("bfloat16")*scale_w)
     if bias is not None:
         out += bias
     return out
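Note: the activation and weight paths above reduce to symmetric absmax quantization: take scale = max(|x|) / qmax, round-and-clip into int8, and fold the Hadamard normalization into the stored scale. A rough standalone sketch of that roundtrip (illustrative only; qmax = 127 and the helper name are assumptions, not this module's API):

import paddle

def absmax_int8_roundtrip(x, hadamard_scale=1.0, qmax=127, qmin=-127):
    # Per-tensor symmetric scale, as in the online fallback branch above.
    scale = paddle.max(paddle.abs(x)) / qmax
    quant_x = paddle.clip((x / scale).round(), qmin, qmax).astype("int8")
    # The Hadamard normalization is folded into the stored scale.
    return quant_x, scale / hadamard_scale

x = paddle.randn([8, 16], dtype="float32")
q, s = absmax_int8_roundtrip(x)
x_rec = q.astype("float32") * s
print(float(paddle.max(paddle.abs(x - x_rec))))  # small quantization error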

paddlenlp/quantization/quantization_config.py

Lines changed: 1 addition & 3 deletions
@@ -60,8 +60,7 @@ def __init__(
         ignore_modules=None,
         group_size=-1,
         apply_hadamard=False,
-        hadamard_is_block=True,
-        hadamard_block_size=-1,
+        hadamard_block_size=32,
         quant_input_grad=False,
         apply_online_actscale_step=200,
         scale_epsilon=0,
@@ -135,7 +134,6 @@ def __init__(
         self.quant_input_grad = quant_input_grad
         self.apply_online_actscale_step = apply_online_actscale_step
         self.scale_epsilon = scale_epsilon
-        self.hadamard_is_block = hadamard_is_block
         self.moving_rate = moving_rate
         self.hadamard_block_size = hadamard_block_size
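Note: after this change the only Hadamard knob left is hadamard_block_size. A negative value falls back to a full-size random orthogonal rotation in random_hadamard_matrix, and any other value selects the block-Hadamard path (default 32). A hedged usage sketch, assuming the class defined in this file is QuantizationConfig and the remaining constructor arguments keep their defaults:

from paddlenlp.quantization.quantization_config import QuantizationConfig

# Block Hadamard with 32x32 blocks (the new default).
cfg_block = QuantizationConfig(apply_hadamard=True, hadamard_block_size=32)

# A negative block size falls back to a full-size random orthogonal matrix.
cfg_full = QuantizationConfig(apply_hadamard=True, hadamard_block_size=-1)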

paddlenlp/quantization/quantization_linear.py

Lines changed: 4 additions & 1 deletion
@@ -471,6 +471,7 @@ def forward(self, x):
             )
         else:
             input_parallel = x
+        print("input_parallel", input_parallel.shape, self.quant_weight.shape, self.quant_scale.shape)
 
         output_parallel = quant_weight_linear(
             x=input_parallel,
@@ -490,11 +491,13 @@ def forward(self, x):
         )
         if self.training:
             self.state += 1
-
+        print("output_parallel", output_parallel.shape)
+        print(self.gather_output, self.is_mp, self.gather_output and self.is_mp)
         if self.gather_output and self.is_mp:
             output = mp_ops._c_concat(output_parallel, group=self.model_parallel_group)
         else:
             output = output_parallel
+        print("output", output.shape)
         return output
 
 
paddlenlp/transformers/conversion_utils.py

Lines changed: 3 additions & 4 deletions
@@ -62,7 +62,7 @@
 
 def add_quant_mapping(name_action_mappings, quantization_config):
     mapping_keys = list(name_action_mappings.keys())
-    pattern = r"(?:^|\.)layers(\.[a-zA-Z0-9_]+)+\.weight$"
+    pattern = r"^(?:.*\.)?layers(\.[a-zA-Z0-9_]+)*\.weight$"
     for key in mapping_keys:
         if re.match(pattern, key):
             quant_key = key.replace("weight", "quant_weight")
@@ -1233,16 +1233,15 @@ def get_tensor_parallel_convert_actions(
         base_model_prefix=None,
     ):
         name_action_mappings = cls._get_tensor_parallel_mappings(config, is_split=is_split)
-        if config.quantization_config.is_weight_quantize():
-            name_action_mappings = add_quant_mapping(name_action_mappings, config.quantization_config)
-
         state_keys_map = cls._resolve_prefix_keys(
             name_action_mappings.keys(), loaded_state_dict_keys, ignore_error, base_model_prefix=base_model_prefix
        )
         for k, v in state_keys_map.items():
             if k not in name_action_mappings:
                 continue
             name_action_mappings[v] = name_action_mappings.pop(k)
+        if config.quantization_config.is_weight_quantize():
+            name_action_mappings = add_quant_mapping(name_action_mappings, config.quantization_config)
         return name_action_mappings
 
     @classmethod
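Note: the relaxed pattern now also matches keys that carry a model prefix before layers (re.match anchors at the start of the string, so the old (?:^|\.) alternative never fired for such keys), and with * instead of + it tolerates no extra component between layers and weight. A quick comparison on made-up key names:

import re

old = r"(?:^|\.)layers(\.[a-zA-Z0-9_]+)+\.weight$"
new = r"^(?:.*\.)?layers(\.[a-zA-Z0-9_]+)*\.weight$"

for key in ["layers.0.self_attn.q_proj.weight", "llama.layers.0.mlp.gate_proj.weight"]:
    print(key, bool(re.match(old, key)), bool(re.match(new, key)))
# layers.0.self_attn.q_proj.weight True True
# llama.layers.0.mlp.gate_proj.weight False True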

paddlenlp/transformers/llama/modeling.py

Lines changed: 1 addition & 1 deletion
@@ -933,7 +933,6 @@ def forward(
     ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]:
         """Input shape: Batch x Time x Channel"""
         # [bs, seq_len, num_head * head_dim] -> [seq_len / n, bs, num_head * head_dim] (n is model parallelism)
-
         if self.fuse_attention_qkv:
             mix_layer = self.qkv_proj(hidden_states)
             # NOTE for GQA attention fusion (compatible with MHA and MQA):
@@ -987,6 +986,7 @@ def forward(
             query_states = paddle.reshape_(query_states, [0, 0, self.num_heads, self.head_dim])
         else:
             query_states = self.q_proj(hidden_states)
+
             key_states = self.k_proj(hidden_states)
             value_states = self.v_proj(hidden_states)
 
paddlenlp/transformers/model_utils.py

Lines changed: 22 additions & 17 deletions
@@ -400,23 +400,29 @@ def _load_part_state_dict(
                 continue
 
             py_safe_slice_ = f.get_slice(key)
-            if quantization_linear_list is not None:
-                if key.split(".weight")[0] in quantization_linear_list:
-                    weight = paddle.Tensor.__call__(py_safe_slice_[:], zero_copy=True)
-                    key_name = key.split(".weight")[0]
-                    quant_key_name = key_name + ".quant_weight"
-                    quant_state_dict = convert_to_weight_quantize_state_dict(
-                        state_dict={key_name: weight},
-                        name=key_name,
-                        quantization_config=quantization_config,
-                        dtype=dtype,
-                        weight_quantize_algo=parse_weight_quantize_algo(quantization_config, quant_key_name),
+            if quantization_linear_list is not None and key.split(".weight")[0] in quantization_linear_list:
+                weight = paddle.Tensor.__call__(py_safe_slice_[:], zero_copy=True)
+                key_name = key.split(".weight")[0]
+                quant_key_name = key_name + ".quant_weight"
+                quant_scale_name = key_name + ".quant_scale"
+                quant_state_dict = convert_to_weight_quantize_state_dict(
+                    state_dict={key: weight},
+                    name=key_name,
+                    quantization_config=quantization_config,
+                    dtype=dtype,
+                    weight_quantize_algo=parse_weight_quantize_algo(quantization_config, quant_key_name),
+                )
+                if quant_key_name in tensor_parallel_split_mapping:
+                    quant_state_dict[quant_key_name] = tensor_parallel_split_mapping[quant_key_name](
+                        quant_state_dict[quant_key_name]
                     )
-                    if quant_key_name in tensor_parallel_split_mapping:
-                        quant_state_dict[quant_key_name] = tensor_parallel_split_mapping[quant_key_name](
-                            quant_state_dict[quant_key_name]
+                if quant_scale_name in tensor_parallel_split_mapping:
+                    quant_state_dict[quant_scale_name] = tensor_parallel_split_mapping[quant_scale_name](
+                        quant_state_dict[quant_scale_name]
                     )
-                    part_state_dict.update(quant_state_dict)
+                for key in list(quant_state_dict.keys()):
+                    quant_state_dict[key] = paddle.Tensor.__call__(quant_state_dict[key], zero_copy=True)
+                part_state_dict.update(quant_state_dict)
             else:
                 if key in tensor_parallel_split_mapping:
                     weight = tensor_parallel_split_mapping[key](py_safe_slice_)
@@ -518,8 +524,6 @@ def load_state_dict(
     for k in list(state_dict.keys()):
         if "quant" not in k:
             state_dict[k] = paddle.Tensor.__call__(state_dict.pop(k), zero_copy=True)
-        else:
-            print("aaaaaaa", k)
 
     if len(scale_dict) != 0:
         if ckpt_quant_stage == "O0":
@@ -1001,6 +1005,7 @@ def _load_state_dict_into_meta_model(
 
         if old_param is not None:
             param = param.astype(dtype=old_param.dtype)
+        print("meta", param_name, param.shape)
         with paddle.no_grad():
             model_state_dict[param_name].get_tensor()._share_data_with(param.value().get_tensor())
             param.value().get_tensor()._clear()
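Note: the renamed keys follow the derivation used above: the .weight suffix is stripped and the quantized tensors are stored under .quant_weight and .quant_scale, which is also how they are looked up in tensor_parallel_split_mapping. A small illustration (the checkpoint key below is a made-up example):

key = "llama.layers.0.self_attn.q_proj.weight"  # hypothetical checkpoint key
key_name = key.split(".weight")[0]
quant_key_name = key_name + ".quant_weight"
quant_scale_name = key_name + ".quant_scale"
print(quant_key_name)    # llama.layers.0.self_attn.q_proj.quant_weight
print(quant_scale_name)  # llama.layers.0.self_attn.q_proj.quant_scale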

0 commit comments
