
Support fusion moe #10507


Merged: 5 commits into PaddlePaddle:dsv3_dev on May 22, 2025
Conversation

risemeup1 (Contributor)

Before submitting

  • Lint code. If there are lint issues, please format the code first.
# Install and register `pre-commit` in the project folder
pip install pre-commit && pre-commit install

# Process previous code files separately
pre-commit run --file XXXX.py
  • Add test cases into the tests folder. If there are codecov issues, please add test cases first.

PR types

PR changes

Description


paddle-bot (bot) commented Apr 27, 2025

Thanks for your contribution!

risemeup1 changed the base branch from develop to dsv3_dev on April 27, 2025 02:27
A-nnonymous left a comment

LGTM in tokens_unzip_and_zip.cu

             out=None,
             accumulate=False,
             use_split_accumulator=True,
             is_a_1d_scaled=is_a_1d_scaled,
             is_b_1d_scaled=is_b_1d_scaled,
         )
     else:
-        y = paddle.zeros([x_fp8.shape[0], w_fp8.shape[0]], paddle.bfloat16)
+        y = paddle.zeros([x_fp8.shape[0], w_fp8.shape[0]], paddle.float32)
Contributor:

Suggested change (parameterize the return dtype instead of hard-coding float32 in the empty-input fallback):

def kitchen_fp8_gemm(x_fp8, x_scale, w_fp8, w_scale, is_a_1d_scaled, is_b_1d_scaled, rtn_dtype=paddle.bfloat16):
    if numpy.prod(x_fp8.shape) != 0 and numpy.prod(w_fp8.shape) != 0:
        y = kitchen.ops.fp8_gemm_blockwise(
            a=x_fp8,
            a_decode_scale=x_scale,
            b=w_fp8,
            b_decode_scale=w_scale,
            out_dtype=rtn_dtype,
            out=None,
            accumulate=False,
            use_split_accumulator=True,
            is_a_1d_scaled=is_a_1d_scaled,
            is_b_1d_scaled=is_b_1d_scaled,
        )
    else:
        y = paddle.zeros([x_fp8.shape[0], w_fp8.shape[0]], rtn_dtype)
    return y
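For illustration, a hypothetical call site under this suggestion (the tensor variables are placeholders, not taken from the PR):

# With the suggested signature, the empty-input fallback returns zeros in the
# same dtype as the requested GEMM output, so both branches stay consistent.
y_bf16 = kitchen_fp8_gemm(x_fp8, x_scale, w_fp8, w_scale,
                          is_a_1d_scaled=True, is_b_1d_scaled=True)  # default bfloat16
y_fp32 = kitchen_fp8_gemm(x_fp8, x_scale, w_fp8, w_scale,
                          is_a_1d_scaled=True, is_b_1d_scaled=True,
                          rtn_dtype=paddle.float32)                  # e.g. when accumulating in float32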

o1 = paddle.stack([out_0, out_1, out_2, out_3])

return o1
o1 = paddle.zeros([expert_w_count, x_fp8.shape[1], w1_t_quant.shape[1]], dtype="bfloat16")
Contributor: For the dtype, it is recommended to use the form x_bf16.dtype instead of a hard-coded string.
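A minimal sketch of what the reviewer is asking for; x_bf16 here stands for whichever bf16 tensor already carries the desired dtype (the exact variable name is an assumption):

# Derive the dtype from an existing bf16 tensor rather than hard-coding the string "bfloat16".
o1 = paddle.zeros([expert_w_count, x_fp8.shape[1], w1_t_quant.shape[1]], dtype=x_bf16.dtype)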

else:
    _, seq_len, H1 = o2_quant.shape
    _, H2, _ = w2_quant.shape
    o3 = paddle.zeros([expert_w_count, o2_quant.shape[1], w2_quant.shape[1]], dtype=paddle.bfloat16)
Contributor: Same as above.

)
unzipped_grad_fp8 = unzipped_grad_fp8.reshape([len(expert_w2), -1, unzipped_grad_fp8.shape[-1]])
unzipped_grad_scale = unzipped_grad_scale.reshape([len(expert_w2), -1, unzipped_grad_scale.shape[-1]])
do2_s = paddle.zeros([len(expert_w2), unzipped_grad_fp8.shape[1], bw_w2_quant.shape[1]], dtype="bfloat16")
Contributor: Same as above.


# ===== do1 = swiglu_grad(o1, None, do2) =====
def bwd_swiglu(self, o1, do2):
do1, _ = paddle._C_ops.swiglu_grad(self.o1, None, do2)
def bwd_swiglu(self, o1, do2, tokens_per_expert):
Contributor: Remove the unused parameter.
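A minimal sketch of the requested cleanup, assuming tokens_per_expert is unused in the body and that the method simply returns the gradient (the return is inferred, not shown in the diff):

# Drop the unused tokens_per_expert argument from the backward swiglu helper.
def bwd_swiglu(self, o1, do2):
    do1, _ = paddle._C_ops.swiglu_grad(self.o1, None, do2)
    return do1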

input_x_fp8 = input_x_fp8.reshape([group_num, H1, -1])
input_x_scale = input_x_scale.reshape([group_num, H1, -1])
input_x_fp8 = paddle.split(input_x_fp8, num_or_sections=group_num, axis=0)
input_x_scale = paddle.split(input_x_scale, num_or_sections=group_num, axis=-1)
Contributor: axis?


# transpose do1 and quant do1
H2 = do1.shape[-1]
do1 = do1.reshape([group_num, -1, H2]).transpose([0, 2, 1]).contiguous().reshape([group_num * H2, -1])
do1_fp8, do1_scale = kitchen_quant(
do1, backend=kitchen.ops.Backend.CUBLAS, is_1d_scaled=True, return_transpose=False
)
do1_fp8 = do1_fp8.reshape([group_num, H2, -1])
# do1_scale = do1_scale.T.contiguous().reshape([group_num, H2, -1])
do1_fp8 = paddle.split(do1_fp8, num_or_sections=group_num, axis=0)
do1_scale = paddle.split(do1_scale, num_or_sections=group_num, axis=-1)
Contributor: axis?

risemeup1 (author): These tensors are consumed later by the kitchen GEMM, whose backend is CUBLAS. do1_fp8 has shape [8192, 8448] and do1_scale has shape [66, 8192], so one split uses axis=0 and the other uses axis=-1.
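A minimal sketch of the shape bookkeeping described above; group_num = 8 and H2 = 1024 are assumed for illustration (8 * 1024 = 8192), with 8448 tokens per group and 66 scale rows (possibly 8448 / 128 if the 1-D quantization uses 128-element blocks):

# do1_fp8 is [group_num * H2, tokens] = [8192, 8448]; reshaping exposes the group axis.
do1_fp8 = do1_fp8.reshape([group_num, H2, -1])                            # [8, 1024, 8448]
do1_fp8 = paddle.split(do1_fp8, num_or_sections=group_num, axis=0)        # 8 x [1, 1024, 8448]

# do1_scale stays 2-D as [scale_blocks, group_num * H2] = [66, 8192], so the
# per-group slices live along the last axis and must be split on axis=-1.
do1_scale = paddle.split(do1_scale, num_or_sections=group_num, axis=-1)   # 8 x [66, 1024]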


@paddle.no_grad()
def forward(
    self, expert_out, zipped_expertwise_rowmap, routemap_topk, unzipped_probs, total_zipped_tokens, num_experts
):
    self.expert_out = expert_out
Contributor: Delete expert_out and unzipped_probs.

risemeup1 (author): done

def bwd_dowm_input(self, expert_w2, unzipped_grad, tokens_per_expert, expected_m):
    # recompute o2
    o2 = self.fwd_swiglu(self.o1)
    o2_s = (o2 * self.unzipped_probs).cast(paddle.bfloat16)
Collaborator: Should we add a set of experiments here that do not cast back to bf16? This input is fed to the quant step.

risemeup1 (author): The very first version did not cast back to bf16; without the cast, the loss diff is larger than it is now.
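A minimal sketch of the two variants under discussion, using the names from the snippet above; the only difference is whether the probability-weighted activation is cast back to bf16 before quantization:

# Variant kept in the PR: weight by routing probabilities, cast back to bf16, then quantize.
o2_s = (o2 * self.unzipped_probs).cast(paddle.bfloat16)
o2_fp8, o2_scale = kitchen_quant(
    o2_s, backend=kitchen.ops.Backend.CUBLAS, is_1d_scaled=True, return_transpose=False
)

# Earlier variant (no cast, reportedly gives a larger loss diff):
# o2_s = o2 * self.unzipped_probs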


input_x_fp8, input_x_scale = kitchen_quant(
input_x, backend=kitchen.ops.Backend.CUBLAS, is_1d_scaled=True, return_transpose=False
)
input_x_fp8 = input_x_fp8.reshape([group_num, H1, -1])
input_x_scale = input_x_scale.reshape([group_num, H1, -1])
input_x_fp8 = paddle.split(input_x_fp8, num_or_sections=group_num, axis=0)
Collaborator: For the split along axis 0, it seems this could just be a reshape, letting the downstream op consume it through the stride mechanism, which would improve performance:
input_x_fp8 = input_x_fp8.reshape([group_num, -1, input_x_fp8.shape[-1]])
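A minimal sketch of the suggestion, assuming the downstream per-expert GEMM can consume a slice of the 3-D view directly, so Paddle's stride mechanism avoids the copies that paddle.split would materialize:

# Keep one contiguous 3-D tensor and index per-expert slices instead of splitting on axis 0.
input_x_fp8 = input_x_fp8.reshape([group_num, -1, input_x_fp8.shape[-1]])
for g in range(group_num):
    x_g = input_x_fp8[g]  # strided view of expert g's block, no extra copy kernel
    # ... pass x_g (with the matching scale slice) to the per-expert FP8 GEMM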

risemeup1 (author): OK.


# transpose do1 and quant do1
H2 = do1.shape[-1]
do1 = do1.reshape([group_num, -1, H2]).transpose([0, 2, 1]).contiguous().reshape([group_num * H2, -1])
do1_fp8, do1_scale = kitchen_quant(
do1, backend=kitchen.ops.Backend.CUBLAS, is_1d_scaled=True, return_transpose=False
)
do1_fp8 = do1_fp8.reshape([group_num, H2, -1])
# do1_scale = do1_scale.T.contiguous().reshape([group_num, H2, -1])
do1_fp8 = paddle.split(do1_fp8, num_or_sections=group_num, axis=0)
Collaborator: Same as above.

risemeup1 (author): OK.

phlrain merged commit 67b21ae into PaddlePaddle:dsv3_dev on May 22, 2025
2 of 5 checks passed