Commit 63a61bd

Merge branch 'main' into feat/ci-benchmarking

2 parents b7eb3fb + 79a7ab9 commit 63a61bd

File tree

78 files changed: +4751 −627 lines


.github/workflows/pr_test_fetcher.yml

Lines changed: 1 addition & 7 deletions
@@ -1,12 +1,6 @@
 name: Fast tests for PRs - Test Fetcher
 
-on:
-  pull_request:
-    branches:
-      - main
-  push:
-    branches:
-      - ci-*
+on: workflow_dispatch
 
 env:
   DIFFUSERS_IS_CI: yes

.github/workflows/pr_tests.yml

Lines changed: 1 addition & 0 deletions
@@ -113,6 +113,7 @@ jobs:
       - name: Run example PyTorch CPU tests
         if: ${{ matrix.config.framework == 'pytorch_examples' }}
         run: |
+          python -m pip install peft
           python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
             --make-reports=tests_${{ matrix.config.report }} \
             examples

docs/source/en/_toctree.yml

Lines changed: 4 additions & 0 deletions
@@ -264,6 +264,10 @@
       title: ControlNet
     - local: api/pipelines/controlnet_sdxl
       title: ControlNet with Stable Diffusion XL
+    - local: api/pipelines/controlnetxs
+      title: ControlNet-XS
+    - local: api/pipelines/controlnetxs_sdxl
+      title: ControlNet-XS with Stable Diffusion XL
     - local: api/pipelines/cycle_diffusion
       title: Cycle Diffusion
     - local: api/pipelines/dance_diffusion

docs/source/en/api/attnprocessor.md

Lines changed: 3 additions & 0 deletions
@@ -20,6 +20,9 @@ An attention processor is a class for applying different types of attention mech
 ## AttnProcessor2_0
 [[autodoc]] models.attention_processor.AttnProcessor2_0
 
+## FusedAttnProcessor2_0
+[[autodoc]] models.attention_processor.FusedAttnProcessor2_0
+
 ## LoRAAttnProcessor
 [[autodoc]] models.attention_processor.LoRAAttnProcessor
 

docs/source/en/api/pipelines/controlnetxs.md

Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# ControlNet-XS
+
+ControlNet-XS was introduced in [ControlNet-XS](https://vislearn.github.io/ControlNet-XS/) by Denis Zavadski and Carsten Rother. It is based on the observation that the control model in the [original ControlNet](https://huggingface.co/papers/2302.05543) can be made much smaller and still produce good results.
+
+Like the original ControlNet model, you can provide an additional control image to condition and control Stable Diffusion generation. For example, if you provide a depth map, the ControlNet model generates an image that'll preserve the spatial information from the depth map. It is a more flexible and accurate way to control the image generation process.
+
+ControlNet-XS generates images with comparable quality to a regular ControlNet, but it is 20-25% faster ([see benchmark](https://github.com/UmerHA/controlnet-xs-benchmark/blob/main/Speed%20Benchmark.ipynb) with StableDiffusion-XL) and uses ~45% less memory.
+
+Here's the overview from the [project page](https://vislearn.github.io/ControlNet-XS/):
+
+*With increasing computing capabilities, current model architectures appear to follow the trend of simply upscaling all components without validating the necessity for doing so. In this project we investigate the size and architectural design of ControlNet [Zhang et al., 2023] for controlling the image generation process with stable diffusion-based models. We show that a new architecture with as little as 1% of the parameters of the base model achieves state-of-the art results, considerably better than ControlNet in terms of FID score. Hence we call it ControlNet-XS. We provide the code for controlling StableDiffusion-XL [Podell et al., 2023] (Model B, 48M Parameters) and StableDiffusion 2.1 [Rombach et al. 2022] (Model B, 14M Parameters), all under openrail license.*
+
+This model was contributed by [UmerHA](https://twitter.com/UmerHAdil). ❤️
+
+<Tip>
+
+Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines.
+
+</Tip>
+
+## StableDiffusionControlNetXSPipeline
+[[autodoc]] StableDiffusionControlNetXSPipeline
+    - all
+    - __call__
+
+## StableDiffusionPipelineOutput
+[[autodoc]] pipelines.stable_diffusion.StableDiffusionPipelineOutput
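To give a feel for the API documented in the new page, here is a minimal canny-conditioned text-to-image sketch. The checkpoint name is a placeholder, and loading the control model through a `ControlNetXSModel` class is an assumption based on how regular ControlNets are wired into their pipelines; check the `StableDiffusionControlNetXSPipeline` reference for the exact signature:

```py
import cv2
import numpy as np
import torch
from PIL import Image
from diffusers import ControlNetXSModel, StableDiffusionControlNetXSPipeline
from diffusers.utils import load_image

# Build a canny edge map to use as the control image.
image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png"
)
edges = cv2.Canny(np.array(image), 100, 200)
canny_image = Image.fromarray(np.stack([edges] * 3, axis=-1))

# Placeholder repo id -- substitute a real ControlNet-XS canny checkpoint.
controlnet = ControlNetXSModel.from_pretrained("your-namespace/controlnet-xs-sd2.1-canny", torch_dtype=torch.float16)
pipe = StableDiffusionControlNetXSPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1", controlnet=controlnet, torch_dtype=torch.float16
).to("cuda")

image = pipe("aerial view of a futuristic city", image=canny_image, num_inference_steps=30).images[0]
image.save("controlnetxs_canny.png")
```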

docs/source/en/api/pipelines/controlnetxs_sdxl.md

Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# ControlNet-XS with Stable Diffusion XL
+
+ControlNet-XS was introduced in [ControlNet-XS](https://vislearn.github.io/ControlNet-XS/) by Denis Zavadski and Carsten Rother. It is based on the observation that the control model in the [original ControlNet](https://huggingface.co/papers/2302.05543) can be made much smaller and still produce good results.
+
+Like the original ControlNet model, you can provide an additional control image to condition and control Stable Diffusion generation. For example, if you provide a depth map, the ControlNet model generates an image that'll preserve the spatial information from the depth map. It is a more flexible and accurate way to control the image generation process.
+
+ControlNet-XS generates images with comparable quality to a regular ControlNet, but it is 20-25% faster ([see benchmark](https://github.com/UmerHA/controlnet-xs-benchmark/blob/main/Speed%20Benchmark.ipynb)) and uses ~45% less memory.
+
+Here's the overview from the [project page](https://vislearn.github.io/ControlNet-XS/):
+
+*With increasing computing capabilities, current model architectures appear to follow the trend of simply upscaling all components without validating the necessity for doing so. In this project we investigate the size and architectural design of ControlNet [Zhang et al., 2023] for controlling the image generation process with stable diffusion-based models. We show that a new architecture with as little as 1% of the parameters of the base model achieves state-of-the art results, considerably better than ControlNet in terms of FID score. Hence we call it ControlNet-XS. We provide the code for controlling StableDiffusion-XL [Podell et al., 2023] (Model B, 48M Parameters) and StableDiffusion 2.1 [Rombach et al. 2022] (Model B, 14M Parameters), all under openrail license.*
+
+This model was contributed by [UmerHA](https://twitter.com/UmerHAdil). ❤️
+
+<Tip warning={true}>
+
+🧪 Many of the SDXL ControlNet checkpoints are experimental, and there is a lot of room for improvement. Feel free to open an [Issue](https://github.com/huggingface/diffusers/issues/new/choose) and leave us feedback on how we can improve!
+
+</Tip>
+
+<Tip>
+
+Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines.
+
+</Tip>
+
+## StableDiffusionXLControlNetXSPipeline
+[[autodoc]] StableDiffusionXLControlNetXSPipeline
+    - all
+    - __call__
+
+## StableDiffusionPipelineOutput
+[[autodoc]] pipelines.stable_diffusion.StableDiffusionPipelineOutput
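As a rough sketch, usage mirrors the Stable Diffusion ControlNet-XS pipeline above, swapping in the SDXL base model and an SDXL ControlNet-XS checkpoint. The repo id below is a placeholder and the `ControlNetXSModel` loading path is again an assumption; consult the `StableDiffusionXLControlNetXSPipeline` reference for the exact API:

```py
import cv2
import numpy as np
import torch
from PIL import Image
from diffusers import ControlNetXSModel, StableDiffusionXLControlNetXSPipeline
from diffusers.utils import load_image

# Prepare a canny edge map as the control image.
image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png"
)
canny_image = Image.fromarray(np.stack([cv2.Canny(np.array(image), 100, 200)] * 3, axis=-1))

# Placeholder repo id -- substitute a real SDXL ControlNet-XS canny checkpoint.
controlnet = ControlNetXSModel.from_pretrained("your-namespace/controlnet-xs-sdxl-canny", torch_dtype=torch.float16)
pipe = StableDiffusionXLControlNetXSPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, torch_dtype=torch.float16
).to("cuda")

image = pipe("aerial view of a futuristic city", image=canny_image, num_inference_steps=30).images[0]
image.save("controlnetxs_sdxl_canny.png")
```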

docs/source/en/api/pipelines/overview.md

Lines changed: 3 additions & 0 deletions
@@ -40,6 +40,8 @@ The table below lists all the pipelines currently available in 🤗 Diffusers an
 | [Consistency Models](consistency_models) | unconditional image generation |
 | [ControlNet](controlnet) | text2image, image2image, inpainting |
 | [ControlNet with Stable Diffusion XL](controlnet_sdxl) | text2image |
+| [ControlNet-XS](controlnetxs) | text2image |
+| [ControlNet-XS with Stable Diffusion XL](controlnetxs_sdxl) | text2image |
 | [Cycle Diffusion](cycle_diffusion) | image2image |
 | [Dance Diffusion](dance_diffusion) | unconditional audio generation |
 | [DDIM](ddim) | unconditional image generation |
@@ -71,6 +73,7 @@ The table below lists all the pipelines currently available in 🤗 Diffusers an
 | [Stable Diffusion](stable_diffusion/overview) | text2image, image2image, depth2image, inpainting, image variation, latent upscaler, super-resolution |
 | [Stable Diffusion Model Editing](model_editing) | model editing |
 | [Stable Diffusion XL](stable_diffusion/stable_diffusion_xl) | text2image, image2image, inpainting |
+| [Stable Diffusion XL Turbo](stable_diffusion/sdxl_turbo) | text2image, image2image, inpainting |
 | [Stable unCLIP](stable_unclip) | text2image, image variation |
 | [Stochastic Karras VE](stochastic_karras_ve) | unconditional image generation |
 | [T2I-Adapter](stable_diffusion/adapter) | text2image |

docs/source/en/api/pipelines/stable_diffusion/sdxl_turbo.md

Lines changed: 2 additions & 20 deletions
@@ -20,34 +20,16 @@ The abstract from the paper is:
 
 ## Tips
 
-- SDXL Turbo uses the exact same architecture as [SDXL](./stable_diffusion_xl).
+- SDXL Turbo uses the exact same architecture as [SDXL](./stable_diffusion_xl), which means it also has the same API. Please refer to the [SDXL](./stable_diffusion_xl) API reference for more details.
 - SDXL Turbo should disable guidance scale by setting `guidance_scale=0.0`
 - SDXL Turbo should use `timestep_spacing='trailing'` for the scheduler and use between 1 and 4 steps.
 - SDXL Turbo has been trained to generate images of size 512x512.
 - SDXL Turbo is open-access, but not open-source meaning that one might have to buy a model license in order to use it for commercial applications. Make sure to read the [official model card](https://huggingface.co/stabilityai/sdxl-turbo) to learn more.
 
 <Tip>
 
-To learn how to use SDXL Turbo for various tasks, how to optimize performance, and other usage examples, take a look at the [Stable Diffusion XL](../../../using-diffusers/sdxl_turbo) guide.
+To learn how to use SDXL Turbo for various tasks, how to optimize performance, and other usage examples, take a look at the [SDXL Turbo](../../../using-diffusers/sdxl_turbo) guide.
 
 Check out the [Stability AI](https://huggingface.co/stabilityai) Hub organization for the official base and refiner model checkpoints!
 
 </Tip>
-
-## StableDiffusionXLPipeline
-
-[[autodoc]] StableDiffusionXLPipeline
-    - all
-    - __call__
-
-## StableDiffusionXLImg2ImgPipeline
-
-[[autodoc]] StableDiffusionXLImg2ImgPipeline
-    - all
-    - __call__
-
-## StableDiffusionXLInpaintPipeline
-
-[[autodoc]] StableDiffusionXLInpaintPipeline
-    - all
-    - __call__
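Put together, those tips make generation a very short call. A minimal sketch, assuming the `stabilityai/sdxl-turbo` checkpoint ships a scheduler already configured with `timestep_spacing='trailing'` (verify against the model card if in doubt):

```py
import torch
from diffusers import AutoPipelineForText2Image

pipe = AutoPipelineForText2Image.from_pretrained(
    "stabilityai/sdxl-turbo", torch_dtype=torch.float16, variant="fp16"
).to("cuda")

# 1-4 steps, guidance disabled, 512x512 output per the tips above.
image = pipe(
    "A cinematic shot of a baby racoon wearing an intricate Italian priest robe.",
    num_inference_steps=1,
    guidance_scale=0.0,
    height=512,
    width=512,
).images[0]
image.save("sdxl_turbo.png")
```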

docs/source/en/using-diffusers/push_to_hub.md

Lines changed: 1 addition & 7 deletions
@@ -174,10 +174,4 @@ Set `private=True` in the [`~diffusers.utils.PushToHubMixin.push_to_hub`] functi
 controlnet.push_to_hub("my-controlnet-model-private", private=True)
 ```
 
-Private repositories are only visible to you, and other users won't be able to clone the repository and your repository won't appear in search results. Even if a user has the URL to your private repository, they'll receive a `404 - Sorry, we can't find the page you are looking for.`
-
-To load a model, scheduler, or pipeline from private or gated repositories, set `use_auth_token=True`:
-
-```py
-model = ControlNetModel.from_pretrained("your-namespace/my-controlnet-model-private", use_auth_token=True)
-```
+Private repositories are only visible to you, and other users won't be able to clone the repository and your repository won't appear in search results. Even if a user has the URL to your private repository, they'll receive a `404 - Sorry, we can't find the page you are looking for`. You must be [logged in](https://huggingface.co/docs/huggingface_hub/quick-start#login) to load a model from a private repository.
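A quick sketch of what "logged in" means in code, reusing the private repo name from the removed example (replace the token placeholder with your own access token; `huggingface-cli login` in a shell works as well):

```py
from huggingface_hub import login
from diffusers import ControlNetModel

# Authenticate once per environment so from_pretrained can reach the private repo.
login(token="hf_...")  # placeholder token

model = ControlNetModel.from_pretrained("your-namespace/my-controlnet-model-private")
```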

examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py

Lines changed: 47 additions & 20 deletions
@@ -133,7 +133,7 @@ def save_model_card(
         diffusers_imports_pivotal = """from huggingface_hub import hf_hub_download
 from safetensors.torch import load_file
         """
-        diffusers_example_pivotal = f"""embedding_path = hf_hub_download(repo_id="{repo_id}", filename="embeddings.safetensors", repo_type="model")
+        diffusers_example_pivotal = f"""embedding_path = hf_hub_download(repo_id='{repo_id}', filename="embeddings.safetensors", repo_type="model")
 state_dict = load_file(embedding_path)
 pipeline.load_textual_inversion(state_dict["clip_l"], token=["<s0>", "<s1>"], text_encoder=pipe.text_encoder, tokenizer=pipe.tokenizer)
 pipeline.load_textual_inversion(state_dict["clip_g"], token=["<s0>", "<s1>"], text_encoder=pipe.text_encoder_2, tokenizer=pipe.tokenizer_2)
@@ -145,8 +145,7 @@ def save_model_card(
 to trigger concept `{key}` → use `{tokens}` in your prompt \n
 """
 
-    yaml = f"""
----
+    yaml = f"""---
 tags:
 - stable-diffusion-xl
 - stable-diffusion-xl-diffusers
@@ -159,7 +158,7 @@ def save_model_card(
 instance_prompt: {instance_prompt}
 license: openrail++
 ---
-    """
+"""
 
     model_card = f"""
 # SDXL LoRA DreamBooth - {repo_id}
@@ -170,14 +169,6 @@ def save_model_card(
 
 ### These are {repo_id} LoRA adaption weights for {base_model}.
 
-The weights were trained using [DreamBooth](https://dreambooth.github.io/).
-
-LoRA for the text encoder was enabled: {train_text_encoder}.
-
-Pivotal tuning was enabled: {train_text_encoder_ti}.
-
-Special VAE used for training: {vae_path}.
-
 ## Trigger words
 
 {trigger_str}
@@ -196,11 +187,24 @@ def save_model_card(
 
 For more details, including weighting, merging and fusing LoRAs, check the [documentation on loading LoRAs in diffusers](https://huggingface.co/docs/diffusers/main/en/using-diffusers/loading_adapters)
 
-## Download model (use it with UIs such as AUTO1111, Comfy, SD.Next, Invoke)
+## Download model
+
+### Use it with UIs such as AUTOMATIC1111, Comfy UI, SD.Next, Invoke
+
+- Download the LoRA *.safetensors [here](/{repo_id}/blob/main/pytorch_lora_weights.safetensors). Rename it and place it on your Lora folder.
+- Download the text embeddings *.safetensors [here](/{repo_id}/blob/main/embeddings.safetensors). Rename it and place it on it on your embeddings folder.
+
+All [Files & versions](/{repo_id}/tree/main).
 
-Weights for this model are available in Safetensors format.
+## Details
 
-[Download]({repo_id}/tree/main) them in the Files & versions tab.
+The weights were trained using [🧨 diffusers Advanced Dreambooth Training Script](https://github.com/huggingface/diffusers/blob/main/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py).
+
+LoRA for the text encoder was enabled. {train_text_encoder}.
+
+Pivotal tuning was enabled: {train_text_encoder_ti}.
+
+Special VAE used for training: {vae_path}.
 
 """
     with open(os.path.join(repo_folder, "README.md"), "w") as f:
@@ -667,6 +671,12 @@ def parse_args(input_args=None):
         default=4,
         help=("The dimension of the LoRA update matrices."),
     )
+    parser.add_argument(
+        "--cache_latents",
+        action="store_true",
+        default=False,
+        help="Cache the VAE latents",
+    )
 
     if input_args is not None:
         args = parser.parse_args(input_args)
@@ -1170,6 +1180,7 @@ def main(args):
         revision=args.revision,
         variant=args.variant,
     )
+    vae_scaling_factor = vae.config.scaling_factor
     unet = UNet2DConditionModel.from_pretrained(
         args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
     )
@@ -1600,6 +1611,20 @@ def compute_text_embeddings(prompt, text_encoders, tokenizers):
             args.validation_prompt = args.validation_prompt.replace(token_abs, "".join(token_replacement))
         print("validation prompt:", args.validation_prompt)
 
+    if args.cache_latents:
+        latents_cache = []
+        for batch in tqdm(train_dataloader, desc="Caching latents"):
+            with torch.no_grad():
+                batch["pixel_values"] = batch["pixel_values"].to(
+                    accelerator.device, non_blocking=True, dtype=torch.float32
+                )
+                latents_cache.append(vae.encode(batch["pixel_values"]).latent_dist)
+
+        if args.validation_prompt is None:
+            del vae
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+
     # Scheduler and math around the number of training steps.
     overrode_max_train_steps = False
     num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
@@ -1715,9 +1740,7 @@ def compute_text_embeddings(prompt, text_encoders, tokenizers):
         unet.train()
         for step, batch in enumerate(train_dataloader):
             with accelerator.accumulate(unet):
-                pixel_values = batch["pixel_values"].to(dtype=vae.dtype)
                 prompts = batch["prompts"]
-                # print(prompts)
                 # encode batch prompts when custom prompts are provided for each image -
                 if train_dataset.custom_instance_prompts:
                     if freeze_text_encoder:
@@ -1729,9 +1752,13 @@ def compute_text_embeddings(prompt, text_encoders, tokenizers):
                         tokens_one = tokenize_prompt(tokenizer_one, prompts, add_special_tokens)
                         tokens_two = tokenize_prompt(tokenizer_two, prompts, add_special_tokens)
 
-                # Convert images to latent space
-                model_input = vae.encode(pixel_values).latent_dist.sample()
-                model_input = model_input * vae.config.scaling_factor
+                if args.cache_latents:
+                    model_input = latents_cache[step].sample()
+                else:
+                    pixel_values = batch["pixel_values"].to(dtype=vae.dtype)
+                    model_input = vae.encode(pixel_values).latent_dist.sample()
+
+                model_input = model_input * vae_scaling_factor
                 if args.pretrained_vae_model_name_or_path is None:
                     model_input = model_input.to(weight_dtype)
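The `--cache_latents` path added above encodes every training image through the VAE once before training and reuses the cached latent distribution at each step, so the VAE encoder (and, when no validation prompt is set, the VAE itself) can be dropped from the training loop. A standalone sketch of the same idea, with a dummy dataloader standing in for the script's real one:

```py
import torch
from diffusers import AutoencoderKL

device = "cuda" if torch.cuda.is_available() else "cpu"
vae = AutoencoderKL.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", subfolder="vae").to(device)
vae.requires_grad_(False)

# Stand-in for a real dataloader yielding normalized pixel batches in [-1, 1].
train_dataloader = [{"pixel_values": torch.randn(1, 3, 1024, 1024)}]

# One pass over the data: store the latent distributions instead of raw pixels.
latents_cache = []
with torch.no_grad():
    for batch in train_dataloader:
        pixel_values = batch["pixel_values"].to(device, dtype=torch.float32)
        latents_cache.append(vae.encode(pixel_values).latent_dist)

# Training loop: sample from the cached distribution and apply the scaling factor,
# so the VAE encoder never has to run during training.
scaling_factor = vae.config.scaling_factor
for step, dist in enumerate(latents_cache):
    model_input = dist.sample() * scaling_factor
    # ... add noise, run the UNet, compute the loss, etc.
```

Caching the distribution rather than a single sample keeps a little stochasticity: each epoch still draws a fresh latent sample from the stored posterior.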

examples/community/README.md

Lines changed: 0 additions & 4 deletions
@@ -512,7 +512,6 @@ device = torch.device('cpu' if not has_cuda else 'cuda')
 pipe = DiffusionPipeline.from_pretrained(
     "CompVis/stable-diffusion-v1-4",
     safety_checker=None,
-    use_auth_token=True,
     custom_pipeline="imagic_stable_diffusion",
     scheduler = DDIMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, set_alpha_to_one=False)
 ).to(device)
@@ -552,7 +551,6 @@ device = th.device('cpu' if not has_cuda else 'cuda')
 
 pipe = DiffusionPipeline.from_pretrained(
     "CompVis/stable-diffusion-v1-4",
-    use_auth_token=True,
     custom_pipeline="seed_resize_stable_diffusion"
 ).to(device)
 
@@ -588,7 +586,6 @@ generator = th.Generator("cuda").manual_seed(0)
 
 pipe = DiffusionPipeline.from_pretrained(
     "CompVis/stable-diffusion-v1-4",
-    use_auth_token=True,
     custom_pipeline="/home/mark/open_source/diffusers/examples/community/"
 ).to(device)
 
@@ -607,7 +604,6 @@ image.save('./seed_resize/seed_resize_{w}_{h}_image.png'.format(w=width, h=heigh
 
 pipe_compare = DiffusionPipeline.from_pretrained(
     "CompVis/stable-diffusion-v1-4",
-    use_auth_token=True,
     custom_pipeline="/home/mark/open_source/diffusers/examples/community/"
 ).to(device)
 