@@ -17,6 +17,7 @@
 
 import torch
 from k_diffusion.external import CompVisDenoiser, CompVisVDenoiser
+from k_diffusion.sampling import get_sigmas_karras
 
 from ...loaders import TextualInversionLoaderMixin
 from ...pipelines import DiffusionPipeline
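For reference, the new import pulls in the Karras et al. (2022) noise schedule. Below is a minimal standalone sketch of the schedule that `get_sigmas_karras` produces under its default `rho=7.0`; the reimplementation is illustrative, not the k-diffusion library source.

import torch

def karras_schedule_sketch(n: int, sigma_min: float, sigma_max: float, rho: float = 7.0) -> torch.Tensor:
    # n sigmas spaced uniformly in sigma ** (1 / rho), descending from sigma_max to sigma_min
    ramp = torch.linspace(0, 1, n)
    min_inv_rho = sigma_min ** (1 / rho)
    max_inv_rho = sigma_max ** (1 / rho)
    sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
    # the k-diffusion samplers expect a trailing zero at the end of the schedule
    return torch.cat([sigmas, sigmas.new_zeros(1)])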
@@ -409,6 +410,7 @@ def __call__(
         return_dict: bool = True,
         callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
         callback_steps: int = 1,
+        use_karras_sigmas: Optional[bool] = False,
     ):
         r"""
         Function invoked when calling the pipeline for generation.
@@ -465,7 +467,10 @@ def __call__(
             callback_steps (`int`, *optional*, defaults to 1):
                 The frequency at which the `callback` function will be called. If not specified, the callback will be
                 called at every step.
-
+            use_karras_sigmas (`bool`, *optional*, defaults to `False`):
+                Whether to use Karras sigmas. For example, specifying `sample_dpmpp_2m` to `set_scheduler` is
+                equivalent to `DPM++ 2M` in stable-diffusion-webui; setting this option to `True` on top of that
+                makes it `DPM++ 2M Karras`.
         Returns:
             [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
             [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
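As a usage illustration of the docstring above, here is a hedged sketch of how the flag would be passed when calling the pipeline; the checkpoint id and prompt are placeholders, and it assumes a standard `StableDiffusionKDiffusionPipeline` setup with the `set_scheduler` call the docstring refers to.

import torch
from diffusers import StableDiffusionKDiffusionPipeline

pipe = StableDiffusionKDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")
pipe.set_scheduler("sample_dpmpp_2m")  # roughly "DPM++ 2M" in stable-diffusion-webui terms

# Setting use_karras_sigmas=True corresponds to "DPM++ 2M Karras"
image = pipe("a photograph of an astronaut riding a horse", use_karras_sigmas=True).images[0]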
@@ -503,10 +508,18 @@ def __call__(
 
         # 4. Prepare timesteps
         self.scheduler.set_timesteps(num_inference_steps, device=prompt_embeds.device)
-        sigmas = self.scheduler.sigmas
+
+        # 5. Prepare sigmas
+        if use_karras_sigmas:
+            sigma_min: float = self.k_diffusion_model.sigmas[0].item()
+            sigma_max: float = self.k_diffusion_model.sigmas[-1].item()
+            sigmas = get_sigmas_karras(n=num_inference_steps, sigma_min=sigma_min, sigma_max=sigma_max)
+            sigmas = sigmas.to(device)
+        else:
+            sigmas = self.scheduler.sigmas
         sigmas = sigmas.to(prompt_embeds.dtype)
 
-        # 5. Prepare latent variables
+        # 6. Prepare latent variables
         num_channels_latents = self.unet.in_channels
         latents = self.prepare_latents(
             batch_size * num_images_per_prompt,
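A note on the `sigmas[0]` / `sigmas[-1]` indexing above: the CompVis-style sigma table kept by the k-diffusion wrapper is derived from a decreasing `alphas_cumprod` and is therefore increasing in `t`, so index 0 holds the smallest sigma and index -1 the largest. A minimal sketch under that assumption (toy numbers, not pipeline code):

import torch

alphas_cumprod = torch.linspace(0.999, 0.001, 1000)      # toy schedule, decreasing in t
sigmas = ((1 - alphas_cumprod) / alphas_cumprod) ** 0.5   # CompVis-style sigmas, increasing in t
assert sigmas[0] < sigmas[-1]                             # [0] is sigma_min, [-1] is sigma_max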
@@ -522,7 +535,7 @@ def __call__(
         self.k_diffusion_model.sigmas = self.k_diffusion_model.sigmas.to(latents.device)
         self.k_diffusion_model.log_sigmas = self.k_diffusion_model.log_sigmas.to(latents.device)
 
-        # 6. Define model function
+        # 7. Define model function
         def model_fn(x, t):
             latent_model_input = torch.cat([x] * 2)
             t = torch.cat([t] * 2)
@@ -533,16 +546,16 @@ def model_fn(x, t):
             noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
             return noise_pred
 
-        # 7. Run k-diffusion solver
+        # 8. Run k-diffusion solver
         latents = self.sampler(model_fn, latents, sigmas)
 
-        # 8. Post-processing
+        # 9. Post-processing
         image = self.decode_latents(latents)
 
-        # 9. Run safety checker
+        # 10. Run safety checker
         image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
 
-        # 10. Convert to PIL
+        # 11. Convert to PIL
         if output_type == "pil":
             image = self.numpy_to_pil(image)
 