import re
import torch
+ import yaml
from transformers import (
    CLIPProcessor,
    CLIPTextModel,
...
    textenc_conversion_map,
    textenc_pattern,
)
- from diffusers.utils import is_omegaconf_available
- from diffusers.utils.import_utils import BACKENDS_MAPPING


def convert_open_clip_checkpoint(checkpoint):
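Why the import swap above matters: OmegaConf configs support dotted attribute access, while yaml.safe_load returns plain nested dicts, so every config lookup below moves from attributes to subscripting. A minimal sketch of the two access styles (the YAML snippet is hypothetical, not taken from this PR):

import yaml

config = yaml.safe_load("model:\n  params:\n    in_channels: 4\n")
assert config["model"]["params"]["in_channels"] == 4  # plain dict subscripting
# the OmegaConf equivalent would have been attribute-style:
#   OmegaConf.create(...).model.params.in_channels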
@@ -370,64 +369,64 @@ def convert_gligen_unet_checkpoint(checkpoint, config, path=None, extract_ema=Fa


def create_vae_config(original_config, image_size: int):
-    vae_params = original_config.autoencoder.params.ddconfig
-    _ = original_config.autoencoder.params.embed_dim
+    vae_params = original_config["autoencoder"]["params"]["ddconfig"]
+    _ = original_config["autoencoder"]["params"]["embed_dim"]

-    block_out_channels = [vae_params.ch * mult for mult in vae_params.ch_mult]
+    block_out_channels = [vae_params["ch"] * mult for mult in vae_params["ch_mult"]]
    down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels)
    up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels)

    config = {
        "sample_size": image_size,
-        "in_channels": vae_params.in_channels,
-        "out_channels": vae_params.out_ch,
+        "in_channels": vae_params["in_channels"],
+        "out_channels": vae_params["out_ch"],
        "down_block_types": tuple(down_block_types),
        "up_block_types": tuple(up_block_types),
        "block_out_channels": tuple(block_out_channels),
-        "latent_channels": vae_params.z_channels,
-        "layers_per_block": vae_params.num_res_blocks,
+        "latent_channels": vae_params["z_channels"],
+        "layers_per_block": vae_params["num_res_blocks"],
    }

    return config
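For orientation, a hypothetical ddconfig of the shape these checkpoints carry (values are illustrative, mirroring the common SD-style VAE) and what create_vae_config derives from it:

vae_params = {"ch": 128, "ch_mult": [1, 2, 4, 4], "in_channels": 3,
              "out_ch": 3, "z_channels": 4, "num_res_blocks": 2}
block_out_channels = [vae_params["ch"] * m for m in vae_params["ch_mult"]]  # [128, 256, 512, 512]
# -> four DownEncoderBlock2D / UpDecoderBlock2D stages, latent_channels = 4, layers_per_block = 2
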
def create_unet_config(original_config, image_size: int, attention_type):
-    unet_params = original_config.model.params
-    vae_params = original_config.autoencoder.params.ddconfig
+    unet_params = original_config["model"]["params"]
+    vae_params = original_config["autoencoder"]["params"]["ddconfig"]

-    block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult]
+    block_out_channels = [unet_params["model_channels"] * mult for mult in unet_params["channel_mult"]]

    down_block_types = []
    resolution = 1
    for i in range(len(block_out_channels)):
-        block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D"
+        block_type = "CrossAttnDownBlock2D" if resolution in unet_params["attention_resolutions"] else "DownBlock2D"
        down_block_types.append(block_type)
        if i != len(block_out_channels) - 1:
            resolution *= 2

    up_block_types = []
    for i in range(len(block_out_channels)):
-        block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D"
+        block_type = "CrossAttnUpBlock2D" if resolution in unet_params["attention_resolutions"] else "UpBlock2D"
        up_block_types.append(block_type)
        resolution //= 2
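    # Worked trace of the two loops above, assuming the usual SD-style values
    # attention_resolutions = [4, 2, 1] and four channel multipliers (illustrative only):
    #   down pass checks resolution 1, 2, 4, 8 in turn ->
    #     [CrossAttnDownBlock2D, CrossAttnDownBlock2D, CrossAttnDownBlock2D, DownBlock2D]
    #   up pass starts from the final resolution (8) and halves after each block ->
    #     [UpBlock2D, CrossAttnUpBlock2D, CrossAttnUpBlock2D, CrossAttnUpBlock2D]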
-    vae_scale_factor = 2 ** (len(vae_params.ch_mult) - 1)
+    vae_scale_factor = 2 ** (len(vae_params["ch_mult"]) - 1)

-    head_dim = unet_params.num_heads if "num_heads" in unet_params else None
+    head_dim = unet_params["num_heads"] if "num_heads" in unet_params else None
    use_linear_projection = (
-        unet_params.use_linear_in_transformer if "use_linear_in_transformer" in unet_params else False
+        unet_params["use_linear_in_transformer"] if "use_linear_in_transformer" in unet_params else False
    )
    if use_linear_projection:
        if head_dim is None:
            head_dim = [5, 10, 20, 20]

    config = {
        "sample_size": image_size // vae_scale_factor,
-        "in_channels": unet_params.in_channels,
+        "in_channels": unet_params["in_channels"],
        "down_block_types": tuple(down_block_types),
        "block_out_channels": tuple(block_out_channels),
-        "layers_per_block": unet_params.num_res_blocks,
-        "cross_attention_dim": unet_params.context_dim,
+        "layers_per_block": unet_params["num_res_blocks"],
+        "cross_attention_dim": unet_params["context_dim"],
        "attention_head_dim": head_dim,
        "use_linear_projection": use_linear_projection,
        "attention_type": attention_type,
@@ -445,11 +444,6 @@ def convert_gligen_to_diffusers(
    num_in_channels: int = None,
    device: str = None,
):
-    if not is_omegaconf_available():
-        raise ValueError(BACKENDS_MAPPING["omegaconf"][1])
-
-    from omegaconf import OmegaConf
-
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    checkpoint = torch.load(checkpoint_path, map_location=device)
@@ -461,14 +455,14 @@ def convert_gligen_to_diffusers(
    else:
        print("global_step key not found in model")

-    original_config = OmegaConf.load(original_config_file)
+    original_config = yaml.safe_load(original_config_file)

    if num_in_channels is not None:
        original_config["model"]["params"]["in_channels"] = num_in_channels

-    num_train_timesteps = original_config.diffusion.params.timesteps
-    beta_start = original_config.diffusion.params.linear_start
-    beta_end = original_config.diffusion.params.linear_end
+    num_train_timesteps = original_config["diffusion"]["params"]["timesteps"]
+    beta_start = original_config["diffusion"]["params"]["linear_start"]
+    beta_end = original_config["diffusion"]["params"]["linear_end"]

    scheduler = DDIMScheduler(
        beta_end=beta_end,
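One caveat worth noting on the new yaml.safe_load call: it parses YAML from a string or an open file object, not from a filesystem path. If original_config_file holds a path (as the name suggests), the file would need to be opened first:

with open(original_config_file) as f:
    original_config = yaml.safe_load(f)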