Skip to content

Commit a326d61

Browse files
authored
Fix configuring VAE from single file mixin (#6950)
* update
1 parent e7696e2 commit a326d61

File tree

2 files changed

+28
-3
lines changed

2 files changed

+28
-3
lines changed

src/diffusers/loaders/autoencoder.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,9 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
3838
- A link to the `.ckpt` file (for example
3939
`"https://huggingface.co/<repo_id>/blob/main/<path_to_file>.ckpt"`) on the Hub.
4040
- A path to a *file* containing all pipeline weights.
41+
config_file (`str`, *optional*):
42+
Filepath to the configuration YAML file associated with the model. If not provided, it defaults to:
43+
https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml
4144
torch_dtype (`str` or `torch.dtype`, *optional*):
4245
Override the default `torch.dtype` and load the model with another dtype. If `"auto"` is passed, the
4346
dtype is automatically derived from the model's weights.
@@ -65,6 +68,13 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
6568
image_size (`int`, *optional*, defaults to 512):
6669
The image size the model was trained on. Use 512 for all Stable Diffusion v1 models and the Stable
6770
Diffusion v2 base model. Use 768 for Stable Diffusion v2.
71+
scaling_factor (`float`, *optional*, defaults to 0.18215):
72+
The component-wise standard deviation of the trained latent space computed using the first batch of the
73+
training set. This is used to scale the latent space to have unit variance when training the diffusion
74+
model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the
75+
diffusion model. When decoding, the latents are scaled back to the original scale with the formula:
76+
`z = 1 / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution
77+
Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper.
6878
use_safetensors (`bool`, *optional*, defaults to `None`):
6979
If set to `None`, the safetensors weights are downloaded if they're available **and** if the
7080
safetensors library is installed. If set to `True`, the model is forcibly loaded from safetensors
@@ -92,6 +102,7 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
92102
"""
93103

94104
original_config_file = kwargs.pop("original_config_file", None)
105+
config_file = kwargs.pop("config_file", None)
95106
resume_download = kwargs.pop("resume_download", False)
96107
force_download = kwargs.pop("force_download", False)
97108
proxies = kwargs.pop("proxies", None)
@@ -103,6 +114,13 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
103114
use_safetensors = kwargs.pop("use_safetensors", True)
104115

105116
class_name = cls.__name__
117+
118+
if (config_file is not None) and (original_config_file is not None):
119+
raise ValueError(
120+
"You cannot pass both `config_file` and `original_config_file` to `from_single_file`. Please use only one of these arguments."
121+
)
122+
123+
original_config_file = original_config_file or config_file
106124
original_config, checkpoint = fetch_ldm_config_and_checkpoint(
107125
pretrained_model_link_or_path=pretrained_model_link_or_path,
108126
class_name=class_name,
@@ -118,7 +136,10 @@ def from_single_file(cls, pretrained_model_link_or_path, **kwargs):
118136
)
119137

120138
image_size = kwargs.pop("image_size", None)
121-
component = create_diffusers_vae_model_from_ldm(class_name, original_config, checkpoint, image_size=image_size)
139+
scaling_factor = kwargs.pop("scaling_factor", None)
140+
component = create_diffusers_vae_model_from_ldm(
141+
class_name, original_config, checkpoint, image_size=image_size, scaling_factor=scaling_factor
142+
)
122143
vae = component["vae"]
123144
if torch_dtype is not None:
124145
vae = vae.to(torch_dtype)

src/diffusers/loaders/single_file_utils.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,7 @@
175175
}
176176

177177
LDM_VAE_KEY = "first_stage_model."
178+
LDM_VAE_DEFAULT_SCALING_FACTOR = 0.18215
178179
LDM_UNET_KEY = "model.diffusion_model."
179180
LDM_CONTROLNET_KEY = "control_model."
180181
LDM_CLIP_PREFIX_TO_REMOVE = ["cond_stage_model.transformer.", "conditioner.embedders.0.transformer."]
@@ -518,7 +519,10 @@ def create_vae_diffusers_config(original_config, image_size, scaling_factor=None
518519
Creates a config for the diffusers based on the config of the LDM model.
519520
"""
520521
vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"]
521-
scaling_factor = scaling_factor or original_config["model"]["params"]["scale_factor"]
522+
if scaling_factor is None and "scale_factor" in original_config["model"]["params"]:
523+
scaling_factor = original_config["model"]["params"]["scale_factor"]
524+
elif scaling_factor is None:
525+
scaling_factor = LDM_VAE_DEFAULT_SCALING_FACTOR
522526

523527
block_out_channels = [vae_params["ch"] * mult for mult in vae_params["ch_mult"]]
524528
down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels)
@@ -1173,7 +1177,7 @@ def create_diffusers_unet_model_from_ldm(
11731177

11741178

11751179
def create_diffusers_vae_model_from_ldm(
1176-
pipeline_class_name, original_config, checkpoint, image_size=None, scaling_factor=0.18125
1180+
pipeline_class_name, original_config, checkpoint, image_size=None, scaling_factor=None
11771181
):
11781182
# import here to avoid circular imports
11791183
from ..models import AutoencoderKL

0 commit comments

Comments
 (0)