Commit 6980284

fix conditions; add test models
Signed-off-by: Dipika <[email protected]>
1 parent ae335f7 commit 6980284

2 files changed (+28, -17 lines)

tests/quantization/test_compressed_tensors.py

Lines changed: 12 additions & 7 deletions
@@ -13,9 +13,10 @@
 from tests.models.utils import check_logprobs_close
 from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
     CompressedTensors24, CompressedTensorsLinearMethod,
-    CompressedTensorsW4A16Fp4, CompressedTensorsW4A16Sparse24,
-    CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8,
-    CompressedTensorsW8A16Fp8, CompressedTensorsWNA16)
+    CompressedTensorsW4A4Fp4, CompressedTensorsW4A16Fp4,
+    CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8Fp8,
+    CompressedTensorsW8A8Int8, CompressedTensorsW8A16Fp8,
+    CompressedTensorsWNA16)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     sparse_cutlass_supported)
 from vllm.platforms import current_platform
@@ -650,9 +651,13 @@ def check_model(model):
         assert output


-def test_compressed_tensors_nvfp4a16(vllm_runner):
-    # run weight only example
-    model = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP4"
+# TODO: update model configs with next ct release
+@pytest.mark.parametrize("args", [
+    ("nm-testing/TinyLlama-1.1B-Chat-v1.0-FP4", CompressedTensorsW4A16Fp4),
+    ("nm-testing/TinyLlama-1.1B-Chat-v1.0-NVFP4A4", CompressedTensorsW4A4Fp4)
+])
+def test_compressed_tensors_nvfp4(vllm_runner, args):
+    model, scheme = args
     with vllm_runner(model, enforce_eager=True) as llm:

         def check_model(model):
@@ -661,7 +666,7 @@ def check_model(model):
             qkv_proj = layer.self_attn.qkv_proj
             assert isinstance(qkv_proj.quant_method,
                               CompressedTensorsLinearMethod)
-            assert isinstance(qkv_proj.scheme, CompressedTensorsW4A16Fp4)
+            assert isinstance(qkv_proj.scheme, scheme)
             assert qkv_proj.scheme.group_size == 16

         llm.apply_model(check_model)
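
For readers unfamiliar with the idiom, the parametrization above makes pytest generate one test case per (model, scheme) tuple, so the weight-only FP4 checkpoint and the NVFP4 weight-plus-activation checkpoint exercise the same assertions; both cases can be selected with, for example, pytest tests/quantization/test_compressed_tensors.py -k test_compressed_tensors_nvfp4. A minimal, self-contained sketch of the pattern (toy names only, not part of the vLLM test suite):

# Toy illustration of @pytest.mark.parametrize: one test case is generated
# per tuple in CASES; the names here are hypothetical.
import pytest

CASES = [
    ("checkpoint-a", "SchemeA"),
    ("checkpoint-b", "SchemeB"),
]

@pytest.mark.parametrize("args", CASES)
def test_scheme_selection(args):
    model, scheme = args
    # Each generated case receives its own (model, scheme) pair.
    assert isinstance(model, str)
    assert isinstance(scheme, str)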

vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py

Lines changed: 16 additions & 10 deletions
@@ -219,8 +219,9 @@ def _check_scheme_supported(self,

     def _is_fp4a4_nvfp4(self, weight_quant: BaseModel, input_quant: BaseModel):

-        is_weight_act_quant = (weight_quant is not None
-                               and input_quant is not None)
+        if weight_quant is None or input_quant is None:
+            return False
+
         is_group_quant = (
             weight_quant.strategy == QuantizationStrategy.GROUP.value)
         is_symmetric = weight_quant.symmetric and input_quant.symmetric
@@ -231,13 +232,18 @@ def _is_fp4a4_nvfp4(self, weight_quant: BaseModel, input_quant: BaseModel):
                         and input_quant.type == QuantizationType.FLOAT.value)
         is_4_bits = weight_quant.num_bits == 4 and input_quant.num_bits == 4

-        return (is_weight_act_quant and is_group_quant and is_float_type
-                and is_4_bits and is_group_size_16 and is_symmetric)
+        return (is_group_quant and is_float_type and is_4_bits
+                and is_group_size_16 and is_symmetric)

     def _is_fp4a16_nvfp4(self, weight_quant: BaseModel,
                          input_quant: BaseModel):

-        is_weight_only = weight_quant is not None and input_quant is None
+        if weight_quant is None:
+            return False
+
+        if input_quant is not None:
+            return False
+
         is_group_quant = (
             weight_quant.strategy == QuantizationStrategy.GROUP.value)
         is_symmetric = weight_quant.symmetric
@@ -246,8 +252,8 @@ def _is_fp4a16_nvfp4(self, weight_quant: BaseModel,
         is_float_type = weight_quant.type == QuantizationType.FLOAT
         is_4_bits = weight_quant.num_bits == 4

-        return (is_weight_only and is_group_quant and is_float_type
-                and is_4_bits and is_group_size_16 and is_symmetric)
+        return (is_group_quant and is_float_type and is_4_bits
+                and is_group_size_16 and is_symmetric)

     def _is_static_tensor_w8a8(self, weight_quant: BaseModel,
                                input_quant: BaseModel) -> bool:
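
Taken together, the two rewritten predicates replace the eagerly computed is_weight_act_quant / is_weight_only flags with early returns. In the pre-fix form every sub-condition was evaluated before the final "and", so a missing weight_quant or input_quant config was dereferenced anyway (e.g. weight_quant.strategy on None) instead of the method simply returning False. A minimal, runnable sketch of the guard-clause pattern, using stand-in objects rather than the real compressed-tensors BaseModel configs:

# Stand-in configs for illustration only; the real checks live in
# CompressedTensorsConfig._is_fp4a4_nvfp4 / _is_fp4a16_nvfp4.
from types import SimpleNamespace


def is_fp4a4_like(weight_quant, input_quant):
    # Guard clauses mirror the fixed condition: bail out before touching
    # attributes on a missing config.
    if weight_quant is None or input_quant is None:
        return False
    is_group_quant = weight_quant.strategy == "group"
    is_4_bits = weight_quant.num_bits == 4 and input_quant.num_bits == 4
    return is_group_quant and is_4_bits


weights = SimpleNamespace(strategy="group", num_bits=4)
acts = SimpleNamespace(strategy="group", num_bits=4)
assert is_fp4a4_like(weights, acts)
assert not is_fp4a4_like(weights, None)  # the pre-fix version would have
                                         # evaluated input_quant.num_bits here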
@@ -351,9 +357,6 @@ def _get_scheme_from_parts(
         if self._is_fp4a16_nvfp4(weight_quant, input_quant):
             return CompressedTensorsW4A16Fp4()

-        if self._is_fp4a4_nvfp4(weight_quant, input_quant):
-            return CompressedTensorsW4A4Fp4()
-
         if self._is_wNa16_group_channel(weight_quant, input_quant):
             if (self.quant_format == CompressionFormat.marlin_24.value
                     and weight_quant.num_bits in W4A16SPARSE24_SUPPORTED_BITS):
@@ -372,6 +375,9 @@ def _get_scheme_from_parts(
                     actorder=weight_quant.actorder)

         if is_activation_quantization_format(self.quant_format):
+            if self._is_fp4a4_nvfp4(weight_quant, input_quant):
+                return CompressedTensorsW4A4Fp4()
+
             if self._is_fp8_w8a8(weight_quant, input_quant):
                 is_fp8_w8a8_supported = self._check_scheme_supported(
                     CompressedTensorsW8A8Fp8.get_min_capability(), error=False)
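
The last two hunks move rather than change the W4A4 dispatch: _is_fp4a4_nvfp4 is now consulted inside the is_activation_quantization_format branch, so the activation-quantized NVFP4 scheme is only returned for checkpoints whose compression format actually carries activation quantization, while the weight-only W4A16 FP4 check keeps its earlier, format-independent position. A compressed sketch of the resulting ordering, with hypothetical boolean arguments standing in for the real predicate calls in _get_scheme_from_parts:

# Simplified dispatch order after this commit; the booleans stand in for the
# self._is_*() predicate calls and the format check shown in the hunks above.
def pick_scheme(is_fp4a16: bool, is_activation_format: bool,
                is_fp4a4: bool) -> str:
    if is_fp4a16:
        return "W4A16Fp4"  # weight-only FP4, matched before the format check
    if is_activation_format:
        if is_fp4a4:
            return "W4A4Fp4"  # NVFP4 weight + activation, gated on the format
        # ... the FP8/INT8 W8A8 checks continue here in the real method ...
    return "other"


assert pick_scheme(True, False, False) == "W4A16Fp4"
assert pick_scheme(False, True, True) == "W4A4Fp4"
assert pick_scheme(False, False, True) == "other"  # FP4A4-shaped config, but
                                                    # not an activation-quant format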
