Skip to content

Commit a1beedb

Browse files
authored
Merge branch 'main' into animatediff-controlnet-ipadapter-bugfix
2 parents 0aefecb + 405a1fa commit a1beedb

27 files changed

+36
-32
lines changed

docs/source/en/using-diffusers/svd.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ This guide will show you how to use SVD to generate short videos from images.
2121
Before you begin, make sure you have the following libraries installed:
2222

2323
```py
24-
!pip install -q -U diffusers transformers accelerate
24+
!pip install -q -U diffusers transformers accelerate
2525
```
2626

2727
There are two variants of this model, [SVD](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid) and [SVD-XT](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt). The SVD checkpoint is trained to generate 14 frames and the SVD-XT checkpoint is further finetuned to generate 25 frames.
@@ -86,7 +86,7 @@ Video generation is very memory intensive because you're essentially generating
8686
+ frames = pipe(image, decode_chunk_size=2, generator=generator, num_frames=25).frames[0]
8787
```
8888

89-
Using all these tricks togethere should lower the memory requirement to less than 8GB VRAM.
89+
Using all these tricks together should lower the memory requirement to less than 8GB VRAM.
9090

9191
## Micro-conditioning
9292

examples/community/unclip_text_interpolation.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ class UnCLIPTextInterpolationPipeline(DiffusionPipeline):
4848
Tokenizer of class
4949
[CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
5050
prior ([`PriorTransformer`]):
51-
The canonincal unCLIP prior to approximate the image embedding from the text embedding.
51+
The canonical unCLIP prior to approximate the image embedding from the text embedding.
5252
text_proj ([`UnCLIPTextProjModel`]):
5353
Utility class to prepare and combine the embeddings before they are passed to the decoder.
5454
decoder ([`UNet2DConditionModel`]):

src/diffusers/models/unets/unet_3d_condition.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,8 @@ class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
9191
cross_attention_dim (`int`, *optional*, defaults to 1024): The dimension of the cross attention features.
9292
attention_head_dim (`int`, *optional*, defaults to 64): The dimension of the attention heads.
9393
num_attention_heads (`int`, *optional*): The number of attention heads.
94+
time_cond_proj_dim (`int`, *optional*, defaults to `None`):
95+
The dimension of the `cond_proj` layer in the timestep embedding.
9496
"""
9597

9698
_supports_gradient_checkpointing = False
@@ -123,6 +125,7 @@ def __init__(
123125
cross_attention_dim: int = 1024,
124126
attention_head_dim: Union[int, Tuple[int]] = 64,
125127
num_attention_heads: Optional[Union[int, Tuple[int]]] = None,
128+
time_cond_proj_dim: Optional[int] = None,
126129
):
127130
super().__init__()
128131

@@ -174,6 +177,7 @@ def __init__(
174177
timestep_input_dim,
175178
time_embed_dim,
176179
act_fn=act_fn,
180+
cond_proj_dim=time_cond_proj_dim,
177181
)
178182

179183
self.transformer_in = TransformerTemporalModel(

src/diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ class KandinskyCombinedPipeline(DiffusionPipeline):
129129
movq ([`VQModel`]):
130130
MoVQ Decoder to generate the image from the latents.
131131
prior_prior ([`PriorTransformer`]):
132-
The canonincal unCLIP prior to approximate the image embedding from the text embedding.
132+
The canonical unCLIP prior to approximate the image embedding from the text embedding.
133133
prior_image_encoder ([`CLIPVisionModelWithProjection`]):
134134
Frozen image-encoder.
135135
prior_text_encoder ([`CLIPTextModelWithProjection`]):
@@ -346,7 +346,7 @@ class KandinskyImg2ImgCombinedPipeline(DiffusionPipeline):
346346
movq ([`VQModel`]):
347347
MoVQ Decoder to generate the image from the latents.
348348
prior_prior ([`PriorTransformer`]):
349-
The canonincal unCLIP prior to approximate the image embedding from the text embedding.
349+
The canonical unCLIP prior to approximate the image embedding from the text embedding.
350350
prior_image_encoder ([`CLIPVisionModelWithProjection`]):
351351
Frozen image-encoder.
352352
prior_text_encoder ([`CLIPTextModelWithProjection`]):
@@ -586,7 +586,7 @@ class KandinskyInpaintCombinedPipeline(DiffusionPipeline):
586586
movq ([`VQModel`]):
587587
MoVQ Decoder to generate the image from the latents.
588588
prior_prior ([`PriorTransformer`]):
589-
The canonincal unCLIP prior to approximate the image embedding from the text embedding.
589+
The canonical unCLIP prior to approximate the image embedding from the text embedding.
590590
prior_image_encoder ([`CLIPVisionModelWithProjection`]):
591591
Frozen image-encoder.
592592
prior_text_encoder ([`CLIPTextModelWithProjection`]):

src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,7 @@ class KandinskyPriorPipeline(DiffusionPipeline):
134134
135135
Args:
136136
prior ([`PriorTransformer`]):
137-
The canonincal unCLIP prior to approximate the image embedding from the text embedding.
137+
The canonical unCLIP prior to approximate the image embedding from the text embedding.
138138
image_encoder ([`CLIPVisionModelWithProjection`]):
139139
Frozen image-encoder.
140140
text_encoder ([`CLIPTextModelWithProjection`]):

src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ class KandinskyV22CombinedPipeline(DiffusionPipeline):
119119
movq ([`VQModel`]):
120120
MoVQ Decoder to generate the image from the latents.
121121
prior_prior ([`PriorTransformer`]):
122-
The canonincal unCLIP prior to approximate the image embedding from the text embedding.
122+
The canonical unCLIP prior to approximate the image embedding from the text embedding.
123123
prior_image_encoder ([`CLIPVisionModelWithProjection`]):
124124
Frozen image-encoder.
125125
prior_text_encoder ([`CLIPTextModelWithProjection`]):
@@ -346,7 +346,7 @@ class KandinskyV22Img2ImgCombinedPipeline(DiffusionPipeline):
346346
movq ([`VQModel`]):
347347
MoVQ Decoder to generate the image from the latents.
348348
prior_prior ([`PriorTransformer`]):
349-
The canonincal unCLIP prior to approximate the image embedding from the text embedding.
349+
The canonical unCLIP prior to approximate the image embedding from the text embedding.
350350
prior_image_encoder ([`CLIPVisionModelWithProjection`]):
351351
Frozen image-encoder.
352352
prior_text_encoder ([`CLIPTextModelWithProjection`]):
@@ -594,7 +594,7 @@ class KandinskyV22InpaintCombinedPipeline(DiffusionPipeline):
594594
movq ([`VQModel`]):
595595
MoVQ Decoder to generate the image from the latents.
596596
prior_prior ([`PriorTransformer`]):
597-
The canonincal unCLIP prior to approximate the image embedding from the text embedding.
597+
The canonical unCLIP prior to approximate the image embedding from the text embedding.
598598
prior_image_encoder ([`CLIPVisionModelWithProjection`]):
599599
Frozen image-encoder.
600600
prior_text_encoder ([`CLIPTextModelWithProjection`]):

src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ class KandinskyV22PriorPipeline(DiffusionPipeline):
9090
9191
Args:
9292
prior ([`PriorTransformer`]):
93-
The canonincal unCLIP prior to approximate the image embedding from the text embedding.
93+
The canonical unCLIP prior to approximate the image embedding from the text embedding.
9494
image_encoder ([`CLIPVisionModelWithProjection`]):
9595
Frozen image-encoder.
9696
text_encoder ([`CLIPTextModelWithProjection`]):

src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ class KandinskyV22PriorEmb2EmbPipeline(DiffusionPipeline):
108108
109109
Args:
110110
prior ([`PriorTransformer`]):
111-
The canonincal unCLIP prior to approximate the image embedding from the text embedding.
111+
The canonical unCLIP prior to approximate the image embedding from the text embedding.
112112
image_encoder ([`CLIPVisionModelWithProjection`]):
113113
Frozen image-encoder.
114114
text_encoder ([`CLIPTextModelWithProjection`]):

src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ class ShapEImg2ImgPipeline(DiffusionPipeline):
8686
8787
Args:
8888
prior ([`PriorTransformer`]):
89-
The canonincal unCLIP prior to approximate the image embedding from the text embedding.
89+
The canonical unCLIP prior to approximate the image embedding from the text embedding.
9090
image_encoder ([`~transformers.CLIPVisionModel`]):
9191
Frozen image-encoder.
9292
image_processor ([`~transformers.CLIPImageProcessor`]):

src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -700,8 +700,8 @@ def __call__(
700700
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
701701
>>> init_image = Image.open(requests.get(url, stream=True).raw)
702702
>>> prompt = "two tigers"
703-
>>> n_propmt = "bad, deformed, ugly, bad anotomy"
704-
>>> image = pipe(prompt=prompt, image=init_image, negative_prompt=n_propmt, strength=0.7).images[0]
703+
>>> n_prompt = "bad, deformed, ugly, bad anatomy"
704+
>>> image = pipe(prompt=prompt, image=init_image, negative_prompt=n_prompt, strength=0.7).images[0]
705705
```
706706
707707
Returns:

src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,7 @@ def __call__(
194194
A higher guidance scale value encourages the model to generate images closely linked to the text
195195
`prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
196196
image_guidance_scale (`float`, *optional*, defaults to 1.5):
197-
Push the generated image towards the inital `image`. Image guidance scale is enabled by setting
197+
Push the generated image towards the initial `image`. Image guidance scale is enabled by setting
198198
`image_guidance_scale > 1`. Higher image guidance scale encourages generated images that are closely
199199
linked to the source `image`, usually at the expense of lower image quality. This pipeline requires a
200200
value of at least `1`.

src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ class StableUnCLIPPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInver
7676
prior_text_encoder ([`CLIPTextModelWithProjection`]):
7777
Frozen [`CLIPTextModelWithProjection`] text-encoder.
7878
prior ([`PriorTransformer`]):
79-
The canonincal unCLIP prior to approximate the image embedding from the text embedding.
79+
The canonical unCLIP prior to approximate the image embedding from the text embedding.
8080
prior_scheduler ([`KarrasDiffusionSchedulers`]):
8181
Scheduler used in the prior denoising process.
8282
image_normalizer ([`StableUnCLIPImageNormalizer`]):

src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -659,7 +659,7 @@ def __call__(
659659
1`. Higher guidance scale encourages the model to generate images that are closely linked to the text `prompt`,
660660
usually at the expense of lower image quality.
661661
image_guidance_scale (`float`, *optional*, defaults to 1.5):
662-
Image guidance scale is to push the generated image towards the inital image `image`. Image guidance
662+
Image guidance scale pushes the generated image towards the initial image `image`. Image guidance
663663
scale is enabled by setting `image_guidance_scale > 1`. Higher image guidance scale encourages the model to
664664
generate images that are closely linked to the source image `image`, usually at the expense of lower
665665
image quality. This pipeline requires a value of at least `1`.

src/diffusers/schedulers/scheduling_consistency_models.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -438,7 +438,7 @@ def add_noise(
438438
# add_noise is called after first denoising step (for inpainting)
439439
step_indices = [self.step_index] * timesteps.shape[0]
440440
else:
441-
# add noise is called bevore first denoising step to create inital latent(img2img)
441+
# add noise is called before first denoising step to create initial latent(img2img)
442442
step_indices = [self.begin_index] * timesteps.shape[0]
443443

444444
sigma = sigmas[step_indices].flatten()

src/diffusers/schedulers/scheduling_deis_multistep.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -775,7 +775,7 @@ def add_noise(
775775
# add_noise is called after first denoising step (for inpainting)
776776
step_indices = [self.step_index] * timesteps.shape[0]
777777
else:
778-
# add noise is called bevore first denoising step to create inital latent(img2img)
778+
# add noise is called before first denoising step to create initial latent(img2img)
779779
step_indices = [self.begin_index] * timesteps.shape[0]
780780

781781
sigma = sigmas[step_indices].flatten()

src/diffusers/schedulers/scheduling_dpmsolver_multistep.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1018,7 +1018,7 @@ def add_noise(
10181018
# add_noise is called after first denoising step (for inpainting)
10191019
step_indices = [self.step_index] * timesteps.shape[0]
10201020
else:
1021-
# add noise is called bevore first denoising step to create inital latent(img2img)
1021+
# add noise is called before first denoising step to create initial latent(img2img)
10221022
step_indices = [self.begin_index] * timesteps.shape[0]
10231023

10241024
sigma = sigmas[step_indices].flatten()

src/diffusers/schedulers/scheduling_dpmsolver_sde.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -547,7 +547,7 @@ def add_noise(
547547
# add_noise is called after first denoising step (for inpainting)
548548
step_indices = [self.step_index] * timesteps.shape[0]
549549
else:
550-
# add noise is called bevore first denoising step to create inital latent(img2img)
550+
# add noise is called before first denoising step to create initial latent(img2img)
551551
step_indices = [self.begin_index] * timesteps.shape[0]
552552

553553
sigma = sigmas[step_indices].flatten()

src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -968,7 +968,7 @@ def add_noise(
968968
# add_noise is called after first denoising step (for inpainting)
969969
step_indices = [self.step_index] * timesteps.shape[0]
970970
else:
971-
# add noise is called bevore first denoising step to create inital latent(img2img)
971+
# add noise is called before first denoising step to create initial latent(img2img)
972972
step_indices = [self.begin_index] * timesteps.shape[0]
973973

974974
sigma = sigmas[step_indices].flatten()

src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -673,7 +673,7 @@ def add_noise(
673673
# add_noise is called after first denoising step (for inpainting)
674674
step_indices = [self.step_index] * timesteps.shape[0]
675675
else:
676-
# add noise is called bevore first denoising step to create inital latent(img2img)
676+
# add noise is called before first denoising step to create initial latent(img2img)
677677
step_indices = [self.begin_index] * timesteps.shape[0]
678678

679679
sigma = sigmas[step_indices].flatten()

src/diffusers/schedulers/scheduling_edm_euler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -371,7 +371,7 @@ def add_noise(
371371
# add_noise is called after first denoising step (for inpainting)
372372
step_indices = [self.step_index] * timesteps.shape[0]
373373
else:
374-
# add noise is called bevore first denoising step to create inital latent(img2img)
374+
# add noise is called before first denoising step to create initial latent(img2img)
375375
step_indices = [self.begin_index] * timesteps.shape[0]
376376

377377
sigma = sigmas[step_indices].flatten()

src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -471,7 +471,7 @@ def add_noise(
471471
# add_noise is called after first denoising step (for inpainting)
472472
step_indices = [self.step_index] * timesteps.shape[0]
473473
else:
474-
# add noise is called bevore first denoising step to create inital latent(img2img)
474+
# add noise is called before first denoising step to create initial latent(img2img)
475475
step_indices = [self.begin_index] * timesteps.shape[0]
476476

477477
sigma = sigmas[step_indices].flatten()

src/diffusers/schedulers/scheduling_euler_discrete.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -566,7 +566,7 @@ def add_noise(
566566
# add_noise is called after first denoising step (for inpainting)
567567
step_indices = [self.step_index] * timesteps.shape[0]
568568
else:
569-
# add noise is called bevore first denoising step to create inital latent(img2img)
569+
# add noise is called before first denoising step to create initial latent(img2img)
570570
step_indices = [self.begin_index] * timesteps.shape[0]
571571

572572
sigma = sigmas[step_indices].flatten()

src/diffusers/schedulers/scheduling_heun_discrete.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -472,7 +472,7 @@ def add_noise(
472472
# add_noise is called after first denoising step (for inpainting)
473473
step_indices = [self.step_index] * timesteps.shape[0]
474474
else:
475-
# add noise is called bevore first denoising step to create inital latent(img2img)
475+
# add noise is called before first denoising step to create initial latent(img2img)
476476
step_indices = [self.begin_index] * timesteps.shape[0]
477477

478478
sigma = sigmas[step_indices].flatten()

src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -498,7 +498,7 @@ def add_noise(
498498
# add_noise is called after first denoising step (for inpainting)
499499
step_indices = [self.step_index] * timesteps.shape[0]
500500
else:
501-
# add noise is called bevore first denoising step to create inital latent(img2img)
501+
# add noise is called before first denoising step to create initial latent(img2img)
502502
step_indices = [self.begin_index] * timesteps.shape[0]
503503

504504
sigma = sigmas[step_indices].flatten()

src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -473,7 +473,7 @@ def add_noise(
473473
# add_noise is called after first denoising step (for inpainting)
474474
step_indices = [self.step_index] * timesteps.shape[0]
475475
else:
476-
# add noise is called bevore first denoising step to create inital latent(img2img)
476+
# add noise is called before first denoising step to create initial latent(img2img)
477477
step_indices = [self.begin_index] * timesteps.shape[0]
478478

479479
sigma = sigmas[step_indices].flatten()

src/diffusers/schedulers/scheduling_lms_discrete.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -465,7 +465,7 @@ def add_noise(
465465
# add_noise is called after first denoising step (for inpainting)
466466
step_indices = [self.step_index] * timesteps.shape[0]
467467
else:
468-
# add noise is called bevore first denoising step to create inital latent(img2img)
468+
# add noise is called before first denoising step to create initial latent(img2img)
469469
step_indices = [self.begin_index] * timesteps.shape[0]
470470

471471
sigma = sigmas[step_indices].flatten()

src/diffusers/schedulers/scheduling_unipc_multistep.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -869,7 +869,7 @@ def add_noise(
869869
# add_noise is called after first denoising step (for inpainting)
870870
step_indices = [self.step_index] * timesteps.shape[0]
871871
else:
872-
# add noise is called bevore first denoising step to create inital latent(img2img)
872+
# add noise is called before first denoising step to create initial latent(img2img)
873873
step_indices = [self.begin_index] * timesteps.shape[0]
874874

875875
sigma = sigmas[step_indices].flatten()

0 commit comments

Comments
 (0)