 
 import torch
 from torch.nn.parameter import Parameter
-
+from vllm.logger import init_logger
+from vllm.platforms import current_platform
+from vllm._custom_ops import (cutlass_scaled_fp4_mm,
+                              cutlass_scaled_mm_supports_fp4, scaled_fp4_quant)
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme)
 from vllm.model_executor.layers.quantization.utils.nvfp4_emulation_utils import (  # noqa: E501
     dequantize_to_dtype, ref_nvfp4_quant)
 from vllm.model_executor.parameter import (GroupQuantScaleParameter,
                                            ModelWeightParameter,
                                            PerTensorScaleParameter)
 
+logger = init_logger(__name__)
+
 __all__ = ["CompressedTensorsW4A4Fp4"]
 
 
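+# Feature probe for the fast path: reports whether the CUTLASS FP4 GEMM is
+# usable for the current device's compute capability (always False off CUDA).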
+def cutlass_fp4_supported() -> bool:
+    if not current_platform.is_cuda():
+        return False
+    capability_tuple = current_platform.get_device_capability()
+    capability = -1 if capability_tuple is None else capability_tuple.to_int()
+    return cutlass_scaled_mm_supports_fp4(capability)
+
 class CompressedTensorsW4A4Fp4(CompressedTensorsScheme):
 
     def __init__(self):
         self.group_size = 16
+        self.cutlass_nvfp4_supported = cutlass_fp4_supported()
+        if not self.cutlass_nvfp4_supported:
+            logger.warning("Current platform does not support cutlass NVFP4."
+                           " Running emulations.")
 
     @classmethod
     def get_min_capability(cls) -> int:
@@ -101,37 +117,60 @@ def process_weights_after_loading(self, layer) -> None:
             layer.weight_global_scale.max().to(torch.float32),
             requires_grad=False)
 
+
+
         swizzled_weight_scale = self.swizzle_blockscale(layer.weight_scale)
         layer.weight_scale_swizzled = Parameter(swizzled_weight_scale,
                                                 requires_grad=False)
 
+        # Required by the cutlass kernel: it expects a plain Parameter input,
+        # not a ModelWeightParameter
+        layer.weight = Parameter(layer.weight_packed.data, requires_grad=False)
+
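+        # alpha folds the input and weight per-tensor global scales into one
+        # scalar; apply_weights passes 1 / alpha to the cutlass FP4 GEMM.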
+        if self.cutlass_nvfp4_supported:
+            layer.alpha = Parameter(layer.input_global_scale * layer.weight_global_scale,
+                                    requires_grad=False)
+
     def apply_weights(self,
                       layer: torch.nn.Module,
                       x: torch.Tensor,
                       bias: Optional[torch.Tensor] = None) -> torch.Tensor:
 
-        x_m, x_k = x.shape
+        if not self.cutlass_nvfp4_supported:
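+            # Emulation fallback: fake-quantize the activations to NVFP4,
+            # dequantize both operands to output_dtype, then run a dense matmul.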
+            x_m, x_k = x.shape
+            output_dtype = x.dtype
+
+            # quantize input to (FP4 and interleaved block scale)
+            x_global_scale = layer.input_global_scale
+            x_fp4, x_blockscale = ref_nvfp4_quant(x, x_global_scale,
+                                                  self.group_size)
+
+            # dequantize input
+            x_fp4 = x_fp4.reshape(x_m, x_k // self.group_size, self.group_size)
+            x_blockscale = x_blockscale.unsqueeze(-1) / x_global_scale
+            x_dq = (x_fp4 * x_blockscale).reshape(x_m, x_k).to(output_dtype)
+            del x_fp4, x_blockscale
+
+            # dequantize weight
+            w_fp4 = layer.weight.data.view(torch.uint8)
+            w_blockscale = layer.weight_scale_swizzled.data
+            w_global_scale = layer.weight_global_scale
+            w_dq = dequantize_to_dtype(w_fp4, w_blockscale, w_global_scale,
+                                       output_dtype, x.device, self.group_size)
+
+            # matmul
+            out = torch.matmul(x_dq, w_dq.t())
+            del w_dq, x_dq
+            return out
+
         output_dtype = x.dtype
+        output_shape = [x.shape[0], layer.weight.shape[0]]
+
+        # quantize BF16 or FP16 to (FP4 and interleaved block scale)
+        x_fp4, x_blockscale = scaled_fp4_quant(x, layer.input_global_scale)
 
-        # quantize input to (FP4 and interleaved block scale)
-        x_global_scale = layer.input_global_scale
-        x_fp4, x_blockscale = ref_nvfp4_quant(x, x_global_scale,
-                                              self.group_size)
-
-        # dequantize input
-        x_fp4 = x_fp4.reshape(x_m, x_k // self.group_size, self.group_size)
-        x_blockscale = x_blockscale.unsqueeze(-1) / x_global_scale
-        x_dq = (x_fp4 * x_blockscale).reshape(x_m, x_k).to(output_dtype)
-        del x_fp4, x_blockscale
-
-        # dequantize weight
-        w_fp4 = layer.weight_packed.data.view(torch.uint8)
-        w_blockscale = layer.weight_scale_swizzled.data
-        w_global_scale = layer.weight_global_scale
-        w_dq = dequantize_to_dtype(w_fp4, w_blockscale, w_global_scale,
-                                   output_dtype, x.device, self.group_size)
-
-        # matmul
-        out = torch.matmul(x_dq, w_dq.t())
-        del w_dq, x_dq
-        return out
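+        # GEMM directly on the packed FP4 weight with its swizzled block
+        # scales; 1 / alpha (precomputed in process_weights_after_loading)
+        # undoes the input and weight global scales.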
+        out = cutlass_scaled_fp4_mm(x_fp4, layer.weight, x_blockscale,
+                                    layer.weight_scale_swizzled, 1 / layer.alpha,
+                                    output_dtype)
+        if bias is not None:
+            out = out + bias
+        return out.view(*output_shape)