
Commit 39a7628

Merge branch 'main' into animatediff-img2video
2 parents fdbb68f + 318556b commit 39a7628

File tree: 80 files changed (+8384 −6274 lines)


README.md

Lines changed: 2 additions & 2 deletions

@@ -77,7 +77,7 @@ Please refer to the [How to use Stable Diffusion in Apple Silicon](https://huggi
 
 ## Quickstart
 
-Generating outputs is super easy with 🤗 Diffusers. To generate an image from text, use the `from_pretrained` method to load any pretrained diffusion model (browse the [Hub](https://huggingface.co/models?library=diffusers&sort=downloads) for 16000+ checkpoints):
+Generating outputs is super easy with 🤗 Diffusers. To generate an image from text, use the `from_pretrained` method to load any pretrained diffusion model (browse the [Hub](https://huggingface.co/models?library=diffusers&sort=downloads) for 19000+ checkpoints):
 
 ```python
 from diffusers import DiffusionPipeline

@@ -219,7 +219,7 @@ Also, say 👋 in our public Discord channel <a href="https://discord.gg/G7tWnz9
 - https://github.com/deep-floyd/IF
 - https://github.com/bentoml/BentoML
 - https://github.com/bmaltais/kohya_ss
-- +7000 other amazing GitHub repositories 💪
+- +8000 other amazing GitHub repositories 💪
 
 Thank you for using us ❤️.
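For reference, the `from_pretrained` quickstart mentioned in the changed README line works roughly like the following minimal sketch; the checkpoint name and GPU usage are illustrative, not part of this diff:

```python
import torch
from diffusers import DiffusionPipeline

# Load any pretrained text-to-image pipeline from the Hub (checkpoint name is illustrative).
pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
pipeline.to("cuda")

image = pipeline("An image of a squirrel in Picasso style").images[0]
image.save("squirrel.png")
```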

docs/source/en/api/models/unet-motion.md

Lines changed: 1 addition & 1 deletion

@@ -22,4 +22,4 @@ The abstract from the paper is:
 [[autodoc]] UNetMotionModel
 
 ## UNet3DConditionOutput
-[[autodoc]] models.unet_3d_condition.UNet3DConditionOutput
+[[autodoc]] models.unets.unet_3d_condition.UNet3DConditionOutput

docs/source/en/api/models/unet.md

Lines changed: 1 addition & 1 deletion

@@ -22,4 +22,4 @@ The abstract from the paper is:
 [[autodoc]] UNet1DModel
 
 ## UNet1DOutput
-[[autodoc]] models.unet_1d.UNet1DOutput
+[[autodoc]] models.unets.unet_1d.UNet1DOutput

docs/source/en/api/models/unet2d-cond.md

Lines changed: 3 additions & 3 deletions

@@ -22,10 +22,10 @@ The abstract from the paper is:
 [[autodoc]] UNet2DConditionModel
 
 ## UNet2DConditionOutput
-[[autodoc]] models.unet_2d_condition.UNet2DConditionOutput
+[[autodoc]] models.unets.unet_2d_condition.UNet2DConditionOutput
 
 ## FlaxUNet2DConditionModel
-[[autodoc]] models.unet_2d_condition_flax.FlaxUNet2DConditionModel
+[[autodoc]] models.unets.unet_2d_condition_flax.FlaxUNet2DConditionModel
 
 ## FlaxUNet2DConditionOutput
-[[autodoc]] models.unet_2d_condition_flax.FlaxUNet2DConditionOutput
+[[autodoc]] models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput

docs/source/en/api/models/unet2d.md

Lines changed: 1 addition & 1 deletion

@@ -22,4 +22,4 @@ The abstract from the paper is:
 [[autodoc]] UNet2DModel
 
 ## UNet2DOutput
-[[autodoc]] models.unet_2d.UNet2DOutput
+[[autodoc]] models.unets.unet_2d.UNet2DOutput

docs/source/en/api/models/unet3d-cond.md

Lines changed: 1 addition & 1 deletion

@@ -22,4 +22,4 @@ The abstract from the paper is:
 [[autodoc]] UNet3DConditionModel
 
 ## UNet3DConditionOutput
-[[autodoc]] models.unet_3d_condition.UNet3DConditionOutput
+[[autodoc]] models.unets.unet_3d_condition.UNet3DConditionOutput

examples/community/pipeline_animatediff_controlnet.py

Lines changed: 1 addition & 1 deletion

@@ -26,7 +26,7 @@
 from diffusers.loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
 from diffusers.models import AutoencoderKL, ControlNetModel, UNet2DConditionModel, UNetMotionModel
 from diffusers.models.lora import adjust_lora_scale_text_encoder
-from diffusers.models.unet_motion_model import MotionAdapter
+from diffusers.models.unets.unet_motion_model import MotionAdapter
 from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline
 from diffusers.schedulers import (

examples/community/stable_diffusion_controlnet_reference.py

Lines changed: 1 addition & 1 deletion

@@ -8,7 +8,7 @@
 from diffusers import StableDiffusionControlNetPipeline
 from diffusers.models import ControlNetModel
 from diffusers.models.attention import BasicTransformerBlock
-from diffusers.models.unet_2d_blocks import CrossAttnDownBlock2D, CrossAttnUpBlock2D, DownBlock2D, UpBlock2D
+from diffusers.models.unets.unet_2d_blocks import CrossAttnDownBlock2D, CrossAttnUpBlock2D, DownBlock2D, UpBlock2D
 from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel
 from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
 from diffusers.utils import logging

examples/community/stable_diffusion_reference.py

Lines changed: 1 addition & 1 deletion

@@ -7,7 +7,7 @@
 
 from diffusers import StableDiffusionPipeline
 from diffusers.models.attention import BasicTransformerBlock
-from diffusers.models.unet_2d_blocks import CrossAttnDownBlock2D, CrossAttnUpBlock2D, DownBlock2D, UpBlock2D
+from diffusers.models.unets.unet_2d_blocks import CrossAttnDownBlock2D, CrossAttnUpBlock2D, DownBlock2D, UpBlock2D
 from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
 from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import rescale_noise_cfg
 from diffusers.utils import PIL_INTERPOLATION, logging

examples/community/stable_diffusion_xl_reference.py

Lines changed: 1 addition & 1 deletion

@@ -8,7 +8,7 @@
 
 from diffusers import StableDiffusionXLPipeline
 from diffusers.models.attention import BasicTransformerBlock
-from diffusers.models.unet_2d_blocks import (
+from diffusers.models.unets.unet_2d_blocks import (
     CrossAttnDownBlock2D,
     CrossAttnUpBlock2D,
     DownBlock2D,

examples/research_projects/controlnetxs/controlnetxs.py

Lines changed: 2 additions & 2 deletions

@@ -26,7 +26,7 @@
 from diffusers.models.autoencoders import AutoencoderKL
 from diffusers.models.lora import LoRACompatibleConv
 from diffusers.models.modeling_utils import ModelMixin
-from diffusers.models.unet_2d_blocks import (
+from diffusers.models.unets.unet_2d_blocks import (
     CrossAttnDownBlock2D,
     CrossAttnUpBlock2D,
     DownBlock2D,

@@ -36,7 +36,7 @@
     UpBlock2D,
     Upsample2D,
 )
-from diffusers.models.unet_2d_condition import UNet2DConditionModel
+from diffusers.models.unets.unet_2d_condition import UNet2DConditionModel
 from diffusers.utils import BaseOutput, logging
 
 
examples/research_projects/diffusion_dpo/train_diffusion_dpo_sdxl.py

Lines changed: 4 additions & 5 deletions

@@ -740,6 +740,10 @@ def preprocess_train(examples):
         # Resize.
         combined_im = train_resize(combined_im)
 
+        # Flipping.
+        if not args.no_flip and random.random() < 0.5:
+            combined_im = train_flip(combined_im)
+
         # Cropping.
         if not args.random_crop:
             y1 = max(0, int(round((combined_im.shape[1] - args.resolution) / 2.0)))

@@ -749,11 +753,6 @@ def preprocess_train(examples):
             y1, x1, h, w = train_crop.get_params(combined_im, (args.resolution, args.resolution))
             combined_im = crop(combined_im, y1, x1, h, w)
 
-        # Flipping.
-        if random.random() < 0.5:
-            x1 = combined_im.shape[2] - x1
-            combined_im = train_flip(combined_im)
-
         crop_top_left = (y1, x1)
         crop_top_lefts.append(crop_top_left)
         combined_im = normalize(combined_im)
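The intent of this change is to apply the horizontal flip before the crop coordinates are computed, so the recorded `(y1, x1)` describes the image that is actually cropped; the removed code flipped after cropping and mirrored `x1` by hand. A minimal standalone sketch of the corrected ordering, assuming torchvision transforms as in the script (the helper name and standalone structure are illustrative, not the script's exact code):

```python
import random

import torch
from torchvision import transforms
from torchvision.transforms.functional import crop

resolution = 1024
train_resize = transforms.Resize(resolution)
train_crop = transforms.RandomCrop(resolution)
train_flip = transforms.RandomHorizontalFlip(p=1.0)


def preprocess(combined_im: torch.Tensor, no_flip: bool = False, random_crop: bool = True):
    combined_im = train_resize(combined_im)

    # Flip before cropping, so the crop coordinates recorded below refer to the
    # image that is actually fed to the model.
    if not no_flip and random.random() < 0.5:
        combined_im = train_flip(combined_im)

    if random_crop:
        y1, x1, h, w = train_crop.get_params(combined_im, (resolution, resolution))
    else:
        y1 = max(0, int(round((combined_im.shape[1] - resolution) / 2.0)))
        x1 = max(0, int(round((combined_im.shape[2] - resolution) / 2.0)))
        h = w = resolution
    combined_im = crop(combined_im, y1, x1, h, w)
    return combined_im, (y1, x1)
```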

examples/text_to_image/README_sdxl.md

Lines changed: 60 additions & 0 deletions

@@ -183,6 +183,66 @@ The above command will also run inference as fine-tuning progresses and log the
 
 * SDXL's VAE is known to suffer from numerical instability issues. This is why we also expose a CLI argument namely `--pretrained_vae_model_name_or_path` that lets you specify the location of a better VAE (such as [this one](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix)).
 
+
+### Using DeepSpeed
+Using DeepSpeed, you can reduce GPU memory consumption and train on GPUs with less memory. DeepSpeed can offload model parameters to CPU memory, or distribute parameters, gradients, and optimizer states across multiple GPUs. This allows larger models to be trained on the same hardware.
+
+First, use the `accelerate config` command to select DeepSpeed, or set up DeepSpeed manually in an accelerate config file.
+
+Here is an example config file for DeepSpeed. For a more detailed explanation of the options, refer to this [link](https://huggingface.co/docs/accelerate/usage_guides/deepspeed).
+```yaml
+compute_environment: LOCAL_MACHINE
+debug: true
+deepspeed_config:
+  gradient_accumulation_steps: 1
+  gradient_clipping: 1.0
+  offload_optimizer_device: none
+  offload_param_device: none
+  zero3_init_flag: false
+  zero_stage: 2
+distributed_type: DEEPSPEED
+downcast_bf16: 'no'
+machine_rank: 0
+main_training_function: main
+mixed_precision: fp16
+num_machines: 1
+num_processes: 1
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
+```
+Save this configuration as an `accelerate_config.yaml` file and pass its path via the `ACCELERATE_CONFIG_FILE` variable. This lets you train your SDXL LoRA model with DeepSpeed; other Stable Diffusion models can be trained the same way.
+
+```shell
+export MODEL_NAME="stabilityai/stable-diffusion-xl-base-1.0"
+export VAE_NAME="madebyollin/sdxl-vae-fp16-fix"
+export DATASET_NAME="lambdalabs/pokemon-blip-captions"
+export ACCELERATE_CONFIG_FILE="your accelerate_config.yaml"
+
+accelerate launch --config_file $ACCELERATE_CONFIG_FILE train_text_to_image_lora_sdxl.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --pretrained_vae_model_name_or_path=$VAE_NAME \
+  --dataset_name=$DATASET_NAME --caption_column="text" \
+  --resolution=1024 \
+  --train_batch_size=1 \
+  --num_train_epochs=2 \
+  --checkpointing_steps=2 \
+  --learning_rate=1e-04 \
+  --lr_scheduler="constant" \
+  --lr_warmup_steps=0 \
+  --mixed_precision="fp16" \
+  --max_train_steps=20 \
+  --validation_epochs=20 \
+  --seed=1234 \
+  --output_dir="sd-pokemon-model-lora-sdxl" \
+  --validation_prompt="cute dragon creature"
+
+```
+
+
 ### Finetuning the text encoder and UNet
 
 The script also allows you to finetune the `text_encoder` along with the `unet`.

examples/text_to_image/train_text_to_image_lora_sdxl.py

Lines changed: 5 additions & 4 deletions

@@ -652,21 +652,22 @@ def save_model_hook(models, weights, output_dir):
         text_encoder_two_lora_layers_to_save = None
 
         for model in models:
-            if isinstance(model, type(unwrap_model(unet))):
+            if isinstance(unwrap_model(model), type(unwrap_model(unet))):
                 unet_lora_layers_to_save = convert_state_dict_to_diffusers(get_peft_model_state_dict(model))
-            elif isinstance(model, type(unwrap_model(text_encoder_one))):
+            elif isinstance(unwrap_model(model), type(unwrap_model(text_encoder_one))):
                 text_encoder_one_lora_layers_to_save = convert_state_dict_to_diffusers(
                     get_peft_model_state_dict(model)
                 )
-            elif isinstance(model, type(unwrap_model(text_encoder_two))):
+            elif isinstance(unwrap_model(model), type(unwrap_model(text_encoder_two))):
                 text_encoder_two_lora_layers_to_save = convert_state_dict_to_diffusers(
                     get_peft_model_state_dict(model)
                 )
             else:
                 raise ValueError(f"unexpected save model: {model.__class__}")
 
             # make sure to pop weight so that corresponding model is not saved again
-            weights.pop()
+            if weights:
+                weights.pop()
 
         StableDiffusionXLPipeline.save_lora_weights(
             output_dir,
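For context: when training runs under wrappers such as DeepSpeed or `torch.compile`, the objects handed to this save hook may not be the raw modules, so the type checks need an unwrapped model, and `weights` can be empty (hence the `if weights:` guard). A rough, hypothetical sketch of the kind of unwrapping helper these scripts rely on, not the commit's exact implementation:

```python
from accelerate import Accelerator

accelerator = Accelerator()


def unwrap_model(model):
    # Strip Accelerate wrappers (DDP, DeepSpeed engine, ...) to get the bare nn.Module.
    model = accelerator.unwrap_model(model)
    # Strip the torch.compile wrapper, if any, so isinstance checks see the original class.
    return getattr(model, "_orig_mod", model)
```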

scripts/convert_amused.py

Lines changed: 1 addition & 1 deletion

@@ -10,7 +10,7 @@
 
 from diffusers import VQModel
 from diffusers.models.attention_processor import AttnProcessor
-from diffusers.models.uvit_2d import UVit2DModel
+from diffusers.models.unets.uvit_2d import UVit2DModel
 from diffusers.pipelines.amused.pipeline_amused import AmusedPipeline
 from diffusers.schedulers import AmusedScheduler
 

scripts/convert_consistency_decoder.py

Lines changed: 1 addition & 1 deletion

@@ -14,7 +14,7 @@
 from diffusers import AutoencoderKL, ConsistencyDecoderVAE, DiffusionPipeline, StableDiffusionPipeline, UNet2DModel
 from diffusers.models.autoencoders.vae import Encoder
 from diffusers.models.embeddings import TimestepEmbedding
-from diffusers.models.unet_2d_blocks import ResnetDownsampleBlock2D, ResnetUpsampleBlock2D, UNetMidBlock2D
+from diffusers.models.unets.unet_2d_blocks import ResnetDownsampleBlock2D, ResnetUpsampleBlock2D, UNetMidBlock2D
 
 
 args = ArgumentParser()

src/diffusers/__init__.py

Lines changed: 4 additions & 2 deletions

@@ -153,6 +153,7 @@
         "LCMScheduler",
         "PNDMScheduler",
         "RePaintScheduler",
+        "SASolverScheduler",
         "SchedulerMixin",
         "ScoreSdeVeScheduler",
         "UnCLIPScheduler",

@@ -382,7 +383,7 @@
 else:
     _import_structure["models.controlnet_flax"] = ["FlaxControlNetModel"]
     _import_structure["models.modeling_flax_utils"] = ["FlaxModelMixin"]
-    _import_structure["models.unet_2d_condition_flax"] = ["FlaxUNet2DConditionModel"]
+    _import_structure["models.unets.unet_2d_condition_flax"] = ["FlaxUNet2DConditionModel"]
     _import_structure["models.vae_flax"] = ["FlaxAutoencoderKL"]
     _import_structure["pipelines"].extend(["FlaxDiffusionPipeline"])
     _import_structure["schedulers"].extend(

@@ -531,6 +532,7 @@
         LCMScheduler,
         PNDMScheduler,
         RePaintScheduler,
+        SASolverScheduler,
         SchedulerMixin,
         ScoreSdeVeScheduler,
         UnCLIPScheduler,

@@ -711,7 +713,7 @@
     else:
         from .models.controlnet_flax import FlaxControlNetModel
         from .models.modeling_flax_utils import FlaxModelMixin
-        from .models.unet_2d_condition_flax import FlaxUNet2DConditionModel
+        from .models.unets.unet_2d_condition_flax import FlaxUNet2DConditionModel
         from .models.vae_flax import FlaxAutoencoderKL
         from .pipelines import FlaxDiffusionPipeline
         from .schedulers import (
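With the new `SASolverScheduler` export, the scheduler becomes importable from the top-level `diffusers` namespace. A minimal sketch of swapping it into an existing pipeline, assuming a diffusers build that includes this commit (the checkpoint name is illustrative):

```python
import torch
from diffusers import DiffusionPipeline, SASolverScheduler

# Checkpoint name is illustrative; any pipeline with a compatible scheduler config works.
pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
pipe.scheduler = SASolverScheduler.from_config(pipe.scheduler.config)
pipe.to("cuda")

image = pipe("a photo of an astronaut riding a horse").images[0]
```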

src/diffusers/experimental/rl/value_guided_sampling.py

Lines changed: 1 addition & 1 deletion

@@ -16,7 +16,7 @@
 import torch
 import tqdm
 
-from ...models.unet_1d import UNet1DModel
+from ...models.unets.unet_1d import UNet1DModel
 from ...pipelines import DiffusionPipeline
 from ...utils.dummy_pt_objects import DDPMScheduler
 from ...utils.torch_utils import randn_tensor

src/diffusers/models/__init__.py

Lines changed: 21 additions & 18 deletions

@@ -39,19 +39,19 @@
     _import_structure["t5_film_transformer"] = ["T5FilmDecoder"]
     _import_structure["transformer_2d"] = ["Transformer2DModel"]
     _import_structure["transformer_temporal"] = ["TransformerTemporalModel"]
-    _import_structure["unet_1d"] = ["UNet1DModel"]
-    _import_structure["unet_2d"] = ["UNet2DModel"]
-    _import_structure["unet_2d_condition"] = ["UNet2DConditionModel"]
-    _import_structure["unet_3d_condition"] = ["UNet3DConditionModel"]
-    _import_structure["unet_kandinsky3"] = ["Kandinsky3UNet"]
-    _import_structure["unet_motion_model"] = ["MotionAdapter", "UNetMotionModel"]
-    _import_structure["unet_spatio_temporal_condition"] = ["UNetSpatioTemporalConditionModel"]
-    _import_structure["uvit_2d"] = ["UVit2DModel"]
+    _import_structure["unets.unet_1d"] = ["UNet1DModel"]
+    _import_structure["unets.unet_2d"] = ["UNet2DModel"]
+    _import_structure["unets.unet_2d_condition"] = ["UNet2DConditionModel"]
+    _import_structure["unets.unet_3d_condition"] = ["UNet3DConditionModel"]
+    _import_structure["unets.unet_kandinsky3"] = ["Kandinsky3UNet"]
+    _import_structure["unets.unet_motion_model"] = ["MotionAdapter", "UNetMotionModel"]
+    _import_structure["unets.unet_spatio_temporal_condition"] = ["UNetSpatioTemporalConditionModel"]
+    _import_structure["unets.uvit_2d"] = ["UVit2DModel"]
     _import_structure["vq_model"] = ["VQModel"]
 
 if is_flax_available():
     _import_structure["controlnet_flax"] = ["FlaxControlNetModel"]
-    _import_structure["unet_2d_condition_flax"] = ["FlaxUNet2DConditionModel"]
+    _import_structure["unets.unet_2d_condition_flax"] = ["FlaxUNet2DConditionModel"]
     _import_structure["vae_flax"] = ["FlaxAutoencoderKL"]
 
 

@@ -73,19 +73,22 @@
     from .t5_film_transformer import T5FilmDecoder
     from .transformer_2d import Transformer2DModel
     from .transformer_temporal import TransformerTemporalModel
-    from .unet_1d import UNet1DModel
-    from .unet_2d import UNet2DModel
-    from .unet_2d_condition import UNet2DConditionModel
-    from .unet_3d_condition import UNet3DConditionModel
-    from .unet_kandinsky3 import Kandinsky3UNet
-    from .unet_motion_model import MotionAdapter, UNetMotionModel
-    from .unet_spatio_temporal_condition import UNetSpatioTemporalConditionModel
-    from .uvit_2d import UVit2DModel
+    from .unets import (
+        Kandinsky3UNet,
+        MotionAdapter,
+        UNet1DModel,
+        UNet2DConditionModel,
+        UNet2DModel,
+        UNet3DConditionModel,
+        UNetMotionModel,
+        UNetSpatioTemporalConditionModel,
+        UVit2DModel,
+    )
     from .vq_model import VQModel
 
 if is_flax_available():
     from .controlnet_flax import FlaxControlNetModel
-    from .unet_2d_condition_flax import FlaxUNet2DConditionModel
+    from .unets import FlaxUNet2DConditionModel
     from .vae_flax import FlaxAutoencoderKL
 
 else:
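Taken together with the docs and example updates above, the practical effect of this reorganization is that the UNet implementations now live under the `unets` subpackage while the public exports stay where they were. A small sketch of what keeps working and what moved, assuming a diffusers version containing this commit:

```python
# Top-level and models-level public imports are unchanged by the move.
from diffusers import UNet2DConditionModel
from diffusers.models import UNetMotionModel

# Internal module paths now live under diffusers.models.unets, e.g.:
from diffusers.models.unets.unet_motion_model import MotionAdapter  # new location
# from diffusers.models.unet_motion_model import MotionAdapter      # old location replaced in this diff
```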

src/diffusers/models/autoencoders/autoencoder_kl.py

Lines changed: 5 additions & 5 deletions

@@ -157,7 +157,7 @@ def disable_slicing(self):
         self.use_slicing = False
 
     @property
-    # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
     def attn_processors(self) -> Dict[str, AttentionProcessor]:
         r"""
         Returns:

@@ -181,7 +181,7 @@ def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors:
 
         return processors
 
-    # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
     def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
         r"""
         Sets the attention processor to use to compute attention.

@@ -216,7 +216,7 @@ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
         for name, module in self.named_children():
             fn_recursive_attn_processor(name, module, processor)
 
-    # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
     def set_default_attn_processor(self):
         """
         Disables custom attention processors and sets the default attention implementation.

@@ -448,7 +448,7 @@ def forward(
 
         return DecoderOutput(sample=dec)
 
-    # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections
    def fuse_qkv_projections(self):
         """
         Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,

@@ -472,7 +472,7 @@ def fuse_qkv_projections(self):
             if isinstance(module, Attention):
                 module.fuse_projections(fuse=True)
 
-    # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
     def unfuse_qkv_projections(self):
         """Disables the fused QKV projection if enabled.
 