[bitsandbytes] allow directly CUDA placements of pipelines loaded with bnb components #9840
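
For context, a minimal usage sketch of what this change enables, assuming a CUDA machine, `bitsandbytes` installed, `transformers` > 4.44.0, and this branch of `diffusers`; the checkpoint id and prompt below are illustrative assumptions that mirror the SD3 tests added in this PR, not part of the diff:

```python
# Hedged sketch: direct CUDA placement of a pipeline with bnb-quantized components.
# Assumption: checkpoint id mirrors the SD3 tests in this PR; adjust to your model.
import torch
from diffusers import BitsAndBytesConfig, DiffusionPipeline, SD3Transformer2DModel

model_id = "stabilityai/stable-diffusion-3-medium-diffusers"  # assumed checkpoint

nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)
transformer_4bit = SD3Transformer2DModel.from_pretrained(
    model_id,
    subfolder="transformer",
    quantization_config=nf4_config,
    torch_dtype=torch.float16,
)

# With this PR, a pipeline holding bnb-quantized components can be moved to CUDA directly.
pipe = DiffusionPipeline.from_pretrained(
    model_id,
    transformer=transformer_4bit,
    torch_dtype=torch.float16,
).to("cuda")

image = pipe("a photo of a cat", num_inference_steps=2).images[0]
```

The diff below adds a `pipeline_has_bnb` check and per-module guards in `DiffusionPipeline.to()` so that this direct placement is handled correctly.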

Merged: 40 commits, Dec 4, 2024
Changes from 4 commits

Commits (40)
35b4cf2
allow device placement when using bnb quantization.
sayakpaul Nov 1, 2024
ec4d422
warning.
sayakpaul Nov 2, 2024
2afa9b0
tests
sayakpaul Nov 2, 2024
3679ebd
Merge branch 'main' into allow-device-placement-bnb
sayakpaul Nov 2, 2024
79633ee
fixes
sayakpaul Nov 5, 2024
876cd13
Merge branch 'main' into allow-device-placement-bnb
sayakpaul Nov 5, 2024
a28c702
Merge branch 'main' into allow-device-placement-bnb
sayakpaul Nov 5, 2024
ad1584d
Merge branch 'main' into allow-device-placement-bnb
sayakpaul Nov 5, 2024
34d0925
Merge branch 'main' into allow-device-placement-bnb
sayakpaul Nov 7, 2024
d713c41
Merge branch 'main' into allow-device-placement-bnb
sayakpaul Nov 11, 2024
e9ef6ea
Merge branch 'main' into allow-device-placement-bnb
sayakpaul Nov 15, 2024
6ce560e
Merge branch 'main' into allow-device-placement-bnb
sayakpaul Nov 16, 2024
329b32e
docs.
sayakpaul Nov 16, 2024
2f6b07d
Merge branch 'main' into allow-device-placement-bnb
sayakpaul Nov 18, 2024
fdeb500
Merge branch 'main' into allow-device-placement-bnb
sayakpaul Nov 19, 2024
53bc502
require accelerate version.
sayakpaul Nov 19, 2024
f81b71e
remove print.
sayakpaul Nov 19, 2024
8e1b6f5
revert to()
sayakpaul Nov 21, 2024
e3e3a96
tests
sayakpaul Nov 21, 2024
9e9561b
Merge branch 'main' into allow-device-placement-bnb
sayakpaul Nov 21, 2024
2ddcbf1
Merge branch 'main' into allow-device-placement-bnb
sayakpaul Nov 24, 2024
5130cc3
Merge branch 'main' into allow-device-placement-bnb
sayakpaul Nov 26, 2024
e76f93a
Merge branch 'main' into allow-device-placement-bnb
sayakpaul Nov 29, 2024
1963b5c
Merge branch 'main' into allow-device-placement-bnb
sayakpaul Dec 2, 2024
a799ba8
Merge branch 'main' into allow-device-placement-bnb
sayakpaul Dec 2, 2024
7d47364
fixes
sayakpaul Dec 2, 2024
ebfec45
Merge branch 'main' into allow-device-placement-bnb
sayakpaul Dec 3, 2024
1fe8a79
fix: missing AutoencoderKL lora adapter (#9807)
beniz Dec 3, 2024
f05d81d
fixes
sayakpaul Dec 3, 2024
6e17cad
Merge branch 'main' into allow-device-placement-bnb
sayakpaul Dec 4, 2024
ea09eb2
fix condition test
sayakpaul Dec 4, 2024
1779093
updates
sayakpaul Dec 4, 2024
6ff53e3
Merge branch 'main' into allow-device-placement-bnb
sayakpaul Dec 4, 2024
7b73dc2
updates
sayakpaul Dec 4, 2024
729acea
remove is_offloaded.
sayakpaul Dec 4, 2024
3d3aab4
fixes
sayakpaul Dec 4, 2024
c033816
Merge branch 'main' into allow-device-placement-bnb
sayakpaul Dec 4, 2024
b5cffab
Merge branch 'main' into allow-device-placement-bnb
sayakpaul Dec 4, 2024
662868b
better
sayakpaul Dec 4, 2024
3fc15fe
empty
sayakpaul Dec 4, 2024
26 changes: 21 additions & 5 deletions src/diffusers/pipelines/pipeline_utils.py
@@ -55,6 +55,7 @@
     is_accelerate_version,
     is_torch_npu_available,
     is_torch_version,
+    is_transformers_available,
     is_transformers_version,
     logging,
     numpy_to_pil,
@@ -66,6 +67,8 @@
 if is_torch_npu_available():
     import torch_npu  # noqa: F401
 
+if is_transformers_available():
+    from transformers import PreTrainedModel
 
 from .pipeline_loading_utils import (
     ALL_IMPORTABLE_CLASSES,
@@ -410,10 +413,14 @@ def module_is_offloaded(module):
         pipeline_is_sequentially_offloaded = any(
             module_is_sequentially_offloaded(module) for _, module in self.components.items()
         )
+        pipeline_has_bnb = any(
+            (_check_bnb_status(module)[1] or _check_bnb_status(module)[-1]) for _, module in self.components.items()
+        )
         if pipeline_is_sequentially_offloaded and device and torch.device(device).type == "cuda":
-            raise ValueError(
-                "It seems like you have activated sequential model offloading by calling `enable_sequential_cpu_offload`, but are now attempting to move the pipeline to GPU. This is not compatible with offloading. Please, move your pipeline `.to('cpu')` or consider removing the move altogether if you use sequential offloading."
-            )
+            if not pipeline_has_bnb:
+                raise ValueError(
+                    "It seems like you have activated sequential model offloading by calling `enable_sequential_cpu_offload`, but are now attempting to move the pipeline to GPU. This is not compatible with offloading. Please, move your pipeline `.to('cpu')` or consider removing the move altogether if you use sequential offloading."
+                )
 
         is_pipeline_device_mapped = self.hf_device_map is not None and len(self.hf_device_map) > 1
         if is_pipeline_device_mapped:
@@ -448,8 +455,17 @@ def module_is_offloaded(module):
 
             # This can happen for `transformer` models. CPU placement was added in
             # https://github.com/huggingface/transformers/pull/33122. So, we guard this accordingly.
-            if is_loaded_in_4bit_bnb and device is not None and is_transformers_version(">", "4.44.0"):
-                module.to(device=device)
+            if is_loaded_in_4bit_bnb and device is not None:
+                if is_transformers_available() and isinstance(module, PreTrainedModel):
+                    if is_transformers_version(">", "4.44.0"):
+                        module.to(device=device)
+                    else:
+                        logger.warning(
+                            f"{module.__class__.__name__} could not be placed on {device}. Module is still on {module.device}. Please update your `transformers` installation to the latest."
+                        )
+                # For `diffusers` it should not be a problem.
+                else:
+                    module.to(device=device)
             elif not is_loaded_in_4bit_bnb and not is_loaded_in_8bit_bnb:
                 module.to(device, dtype)
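
For readers who do not want to trace the branching above, here is a self-contained sketch of the placement guard it implements; the helper name and boolean arguments are simplified stand-ins for illustration, not actual `diffusers` APIs:

```python
# Hedged, simplified sketch of the 4-bit placement guard added above.
# The function name and flag arguments are illustrative stand-ins, not diffusers helpers.
import logging

logger = logging.getLogger(__name__)


def maybe_move_module(module, device, *, is_4bit_bnb, is_8bit_bnb,
                      is_transformers_model, transformers_gt_4_44):
    """Mirror the branching in DiffusionPipeline.to() for bnb-quantized modules."""
    if is_4bit_bnb and device is not None:
        if is_transformers_model:
            if transformers_gt_4_44:
                # transformers > 4.44.0 supports moving 4-bit models across devices.
                module.to(device=device)
            else:
                # Older transformers: leave the module where it is and warn.
                logger.warning(
                    "%s could not be placed on %s; please update `transformers`.",
                    module.__class__.__name__, device,
                )
        else:
            # 4-bit diffusers-native models can be moved directly.
            module.to(device=device)
    elif not is_4bit_bnb and not is_8bit_bnb:
        # Non-quantized modules keep the original behavior; 8-bit bnb modules are left as-is.
        module.to(device)
```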
37 changes: 37 additions & 0 deletions tests/quantization/bnb/test_4bit.py
@@ -47,6 +47,7 @@ def get_some_linear_layer(model):
 
 
 if is_transformers_available():
+    from transformers import BitsAndBytesConfig as BnbConfig
     from transformers import T5EncoderModel
 
 if is_torch_available():
@@ -484,6 +485,42 @@ def test_moving_to_cpu_throws_warning(self):
 
         assert "Pipelines loaded with `dtype=torch.float16`" in cap_logger.out
 
+    def test_pipeline_cuda_placement_works_with_nf4(self):
+        transformer_nf4_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_compute_dtype=torch.float16,
+        )
+        transformer_4bit = SD3Transformer2DModel.from_pretrained(
+            self.model_name,
+            subfolder="transformer",
+            quantization_config=transformer_nf4_config,
+            torch_dtype=torch.float16,
+        )
+        text_encoder_3_nf4_config = BnbConfig(
+            load_in_4bit=True,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_compute_dtype=torch.float16,
+        )
+        text_encoder_3_4bit = T5EncoderModel.from_pretrained(
+            self.model_name,
+            subfolder="text_encoder_3",
+            quantization_config=text_encoder_3_nf4_config,
+            torch_dtype=torch.float16,
+        )
+        # CUDA device placement works.
+        pipeline_4bit = DiffusionPipeline.from_pretrained(
+            self.model_name,
+            transformer=transformer_4bit,
+            text_encoder_3=text_encoder_3_4bit,
+            torch_dtype=torch.float16,
+        ).to("cuda")
+
+        # Check if inference works.
+        _ = pipeline_4bit("table", max_sequence_length=20, num_inference_steps=2)
+
+        del pipeline_4bit
+
 
 @require_transformers_version_greater("4.44.0")
 class SlowBnb4BitFluxTests(Base4bitTests):
29 changes: 29 additions & 0 deletions tests/quantization/bnb/test_mixed_int8.py
@@ -44,6 +44,7 @@ def get_some_linear_layer(model):
 
 
 if is_transformers_available():
+    from transformers import BitsAndBytesConfig as BnbConfig
     from transformers import T5EncoderModel
 
 if is_torch_available():
@@ -432,6 +433,34 @@ def test_generate_quality_dequantize(self):
             output_type="np",
         ).images
 
+    def test_pipeline_cuda_placement_works_with_mixed_int8(self):
+        transformer_8bit_config = BitsAndBytesConfig(load_in_8bit=True)
+        transformer_8bit = SD3Transformer2DModel.from_pretrained(
+            self.model_name,
+            subfolder="transformer",
+            quantization_config=transformer_8bit_config,
+            torch_dtype=torch.float16,
+        )
+        text_encoder_3_8bit_config = BnbConfig(load_in_8bit=True)
+        text_encoder_3_8bit = T5EncoderModel.from_pretrained(
+            self.model_name,
+            subfolder="text_encoder_3",
+            quantization_config=text_encoder_3_8bit_config,
+            torch_dtype=torch.float16,
+        )
+        # CUDA device placement works.
+        pipeline_8bit = DiffusionPipeline.from_pretrained(
+            self.model_name,
+            transformer=transformer_8bit,
+            text_encoder_3=text_encoder_3_8bit,
+            torch_dtype=torch.float16,
+        ).to("cuda")
+
+        # Check if inference works.
+        _ = pipeline_8bit("table", max_sequence_length=20, num_inference_steps=2)
+
+        del pipeline_8bit
+
 
 @require_transformers_version_greater("4.44.0")
 class SlowBnb8bitFluxTests(Base8bitTests):