@@ -19,7 +19,7 @@
 import PIL
 import regex as re
 import torch
-from transformers import AutoTokenizer, CLIPImageProcessor, CLIPVisionModelWithProjection, UMT5EncoderModel
+from transformers import AutoTokenizer, CLIPImageProcessor, CLIPVisionModel, UMT5EncoderModel
 
 from ...callbacks import MultiPipelineCallbacks, PipelineCallback
 from ...image_processor import PipelineImageInput
@@ -49,11 +49,11 @@
         >>> import numpy as np
         >>> from diffusers import AutoencoderKLWan, WanImageToVideoPipeline
         >>> from diffusers.utils import export_to_video, load_image
-        >>> from transformers import CLIPVisionModelWithProjection
+        >>> from transformers import CLIPVisionModel
 
         >>> # Available models: Wan-AI/Wan2.1-I2V-14B-480P-Diffusers, Wan-AI/Wan2.1-I2V-14B-720P-Diffusers
         >>> model_id = "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers"
-        >>> image_encoder = CLIPVisionModelWithProjection.from_pretrained(
+        >>> image_encoder = CLIPVisionModel.from_pretrained(
         ...     model_id, subfolder="image_encoder", torch_dtype=torch.float32
         ... )
         >>> vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
@@ -171,7 +171,7 @@ def __init__(
         self,
         tokenizer: AutoTokenizer,
         text_encoder: UMT5EncoderModel,
-        image_encoder: CLIPVisionModelWithProjection,
+        image_encoder: CLIPVisionModel,
         image_processor: CLIPImageProcessor,
         transformer: WanTransformer3DModel,
         vae: AutoencoderKLWan,
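For context on the type swap, here is a minimal sketch of what the two transformers classes return, presumably the reason the pipeline can use the plain vision model (the diff itself does not state the motivation). The checkpoint `openai/clip-vit-base-patch32` is a small public CLIP model chosen only for illustration; it is not the Wan encoder. `CLIPVisionModel` exposes the transformer's hidden states directly, while `CLIPVisionModelWithProjection` adds a learned projection head that maps the pooled output to a single `image_embeds` vector.

```python
# Sketch: compare the outputs of the two transformers classes.
import torch
from transformers import CLIPVisionModel, CLIPVisionModelWithProjection

ckpt = "openai/clip-vit-base-patch32"  # small public checkpoint, illustration only

plain = CLIPVisionModel.from_pretrained(ckpt)
projected = CLIPVisionModelWithProjection.from_pretrained(ckpt)

pixels = torch.randn(1, 3, 224, 224)  # dummy image batch

with torch.no_grad():
    out_plain = plain(pixel_values=pixels, output_hidden_states=True)
    out_proj = projected(pixel_values=pixels)

# CLIPVisionModel returns the transformer's states directly...
print(out_plain.last_hidden_state.shape)  # (1, 50, 768): per-patch hidden states
print(len(out_plain.hidden_states))       # one entry per layer, plus embeddings

# ...while CLIPVisionModelWithProjection adds a projection head on top.
print(out_proj.image_embeds.shape)        # (1, 512): single projected vector
```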
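And a minimal end-to-end sketch of assembling the pipeline with the updated encoder type, extending the docstring excerpt in the diff. The input image URL, prompt, and output settings are illustrative assumptions, not part of this change; only the `model_id`, subfolders, and dtypes come from the docstring above.

```python
import torch
from diffusers import AutoencoderKLWan, WanImageToVideoPipeline
from diffusers.utils import export_to_video, load_image
from transformers import CLIPVisionModel

model_id = "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers"

# Per this change, the image encoder is a plain CLIPVisionModel.
# float32 for the encoder and VAE follows the docstring excerpt above.
image_encoder = CLIPVisionModel.from_pretrained(
    model_id, subfolder="image_encoder", torch_dtype=torch.float32
)
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)

# Standard diffusers pattern: preloaded components override the repo's defaults.
pipe = WanImageToVideoPipeline.from_pretrained(
    model_id, vae=vae, image_encoder=image_encoder, torch_dtype=torch.bfloat16
)
pipe.to("cuda")

# Illustrative input, resized to the 480P model's 832x480 resolution.
image = load_image("https://example.com/first_frame.jpg")  # hypothetical URL
image = image.resize((832, 480))

frames = pipe(
    image=image,
    prompt="A cat walks through tall grass, cinematic lighting",  # illustrative prompt
    height=480,
    width=832,
    num_frames=81,
).frames[0]
export_to_video(frames, "output.mp4", fps=16)
```

Note that only the component's Python type changes here; the checkpoint layout (`subfolder="image_encoder"`) and the rest of the loading flow stay exactly as in the previous docstring.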