
fix compatibility issue between PAG and IP-adapter #8379


Merged (2 commits) on Jun 5, 2024
Changes from 1 commit
4 changes: 1 addition & 3 deletions src/diffusers/loaders/unet.py
@@ -928,9 +928,7 @@ def _convert_ip_adapter_attn_to_diffusers(self, state_dicts, low_cpu_mem_usage=F
hidden_size = self.config.block_out_channels[block_id]

if cross_attention_dim is None or "motion_modules" in name:
attn_processor_class = (
AttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else AttnProcessor
)
attn_processor_class = self.attn_processors[name].__class__
attn_procs[name] = attn_processor_class()

else:
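For context, the old code reset these no-cross-attention layers to `AttnProcessor2_0` (or `AttnProcessor`) whenever an IP-Adapter was loaded, which discarded any custom processors PAG had installed there; the new line re-instantiates whatever class is already in place. A minimal sketch of that pattern, assuming the standard diffusers `attn_processors` / `set_attn_processor` API rather than the PR's exact code:

```python
# Illustrative sketch only, not the library code: keep the processor class that
# is already installed on self-attention layers (e.g. one set by enable_pag)
# instead of resetting it to the default when IP-Adapter weights are converted.

def preserve_existing_self_attn_processors(unet):
    """Re-instantiate each self-attention processor with its current class."""
    attn_procs = {}
    for name, processor in unet.attn_processors.items():
        if name.endswith("attn1.processor"):          # self-attention layers
            attn_procs[name] = processor.__class__()  # keep the installed class
        else:
            attn_procs[name] = processor              # leave cross-attention untouched
    unet.set_attn_processor(attn_procs)
```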
5 changes: 3 additions & 2 deletions src/diffusers/pipelines/pag_utils.py
@@ -45,7 +45,7 @@ def enable_pag(
self._pag_applied_layers = pag_applied_layers
self._pag_applied_layers_index = pag_applied_layers_index
self._pag_cfg = pag_cfg

self._is_pag_enabled = True
self._set_pag_attn_processor()

def _get_self_attn_layers(self):
@@ -180,6 +180,7 @@ def disable_pag(self):
self._pag_applied_layers = None
self._pag_applied_layers_index = None
self._pag_cfg = None
self._is_pag_enabled = False

@property
def pag_adaptive_scaling(self):
@@ -191,4 +192,4 @@ def do_pag_adaptive_scaling(self):

@property
def do_perturbed_attention_guidance(self):
return hasattr(self, "_pag_scale") and self._pag_scale is not None and self._pag_scale > 0
return self._is_pag_enabled
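The pag_utils.py changes switch `do_perturbed_attention_guidance` from inferring state out of `_pag_scale` to reading an explicit flag that `enable_pag()` sets and `disable_pag()` clears. A minimal sketch of that state-tracking pattern (illustrative, not the library code):

```python
# Sketch of the explicit-flag pattern introduced here: PAG on/off is recorded
# when enable_pag()/disable_pag() is called, not derived from _pag_scale.

class PAGMixinSketch:
    _is_pag_enabled = False

    def enable_pag(self, pag_scale: float = 3.0):
        self._pag_scale = pag_scale
        self._is_pag_enabled = True   # flipped on when PAG is enabled

    def disable_pag(self):
        self._pag_scale = None
        self._is_pag_enabled = False  # and off when it is disabled

    @property
    def do_perturbed_attention_guidance(self) -> bool:
        # The property just reads the flag; the attribute is always defined
        # and a scale of 0 no longer silently disables PAG.
        return self._is_pag_enabled
```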
@@ -1172,6 +1172,10 @@ def __call__(
self.do_classifier_free_guidance,
)

# expand the image embeddings if we are using perturbed-attention guidance
for i in range(len(image_embeds)):
image_embeds[i] = image_embeds[i].repeat(prompt_embeds.shape[0] // latents.shape[0], 1, 1)
@asomoza (Member) commented on Jun 3, 2024:
This throws an error with the PLUS versions of the IP-Adapters; each image_embeds is a 4D tensor.

The contributor (PR author) replied:
Thank you for finding the error! I found its cause and, thanks to this, came up with a more elegant design. I will upload the revised code with the results soon!
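The failure the reviewer describes follows from how `torch.Tensor.repeat()` works: it needs one repeat factor per tensor dimension, so the three-argument `repeat(n, 1, 1)` in the added lines raises an error as soon as an IP-Adapter PLUS model returns 4D per-adapter embeddings. A small sketch of the failure mode and a dimension-agnostic alternative (shapes are only illustrative, and this is not the PR's final fix):

```python
import torch

n = 3  # e.g. batch expanded 3x for CFG + PAG
embeds_base = torch.randn(2, 4, 1280)      # standard IP-Adapter: 3D embeds
embeds_plus = torch.randn(2, 1, 16, 1280)  # PLUS variants: 4D embeds

embeds_base.repeat(n, 1, 1)                # works for 3D embeds
# embeds_plus.repeat(n, 1, 1)              # RuntimeError: too few repeat dims for a 4D tensor

# A dimension-agnostic expansion repeats only along the batch axis:
expanded = embeds_plus.repeat(n, *([1] * (embeds_plus.ndim - 1)))
```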


# 8. Denoising loop
num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)

@@ -1205,7 +1209,7 @@
if self.interrupt:
continue

# expand the latents if we are doing classifier free guidance
# expand the latents if we are doing classifier free guidance, perturbed-attention guidance, or both
latent_model_input = torch.cat([latents] * (prompt_embeds.shape[0] // latents.shape[0]))

latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
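The ratio `prompt_embeds.shape[0] // latents.shape[0]` acts as the guidance multiplier: one branch with no guidance, two with either classifier-free guidance or PAG, three with both. A short sketch of that batch-expansion logic as inferred from the diff (assumed semantics, with illustrative embedding shapes):

```python
import torch

def guidance_multiplier(do_cfg: bool, do_pag: bool) -> int:
    # 1 conditional branch, +1 for the unconditional (CFG) branch,
    # +1 for the perturbed-attention (PAG) branch
    return 1 + int(do_cfg) + int(do_pag)

batch, channels, h, w = 2, 4, 64, 64
latents = torch.randn(batch, channels, h, w)

for do_cfg, do_pag in [(False, False), (True, False), (False, True), (True, True)]:
    m = guidance_multiplier(do_cfg, do_pag)
    prompt_embeds = torch.randn(batch * m, 77, 2048)  # one copy of the batch per branch
    latent_model_input = torch.cat([latents] * (prompt_embeds.shape[0] // latents.shape[0]))
    assert latent_model_input.shape[0] == batch * m
```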