import math


+# Split the temporal axis into overlapping [t_start, t_end) windows.
+def get_views(video_length, window_size=16, stride=4):
+    num_blocks_time = (video_length - window_size) // stride + 1
+    views = []
+    for i in range(num_blocks_time):
+        t_start = int(i * stride)
+        t_end = t_start + window_size
+        views.append((t_start, t_end))
+    return views
+
+
+# Symmetric (triangular) weight ramp used to blend the overlapping windows.
+def generate_weight_sequence(n):
+    if n % 2 == 0:
+        max_weight = n // 2
+        weight_sequence = list(range(1, max_weight + 1, 1)) + list(range(max_weight, 0, -1))
+    else:
+        max_weight = (n + 1) // 2
+        weight_sequence = list(range(1, max_weight, 1)) + [max_weight] + list(range(max_weight - 1, 0, -1))
+    return weight_sequence
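+# Example, with the defaults above:
+#   get_views(24)                -> [(0, 16), (4, 20), (8, 24)]
+#   generate_weight_sequence(16) -> [1, 2, 3, 4, 5, 6, 7, 8, 8, 7, 6, 5, 4, 3, 2, 1]
+#   generate_weight_sequence(5)  -> [1, 2, 3, 2, 1]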
+
+
def zero_module(module):
    # Zero out the parameters of a module and return it.
    for p in module.parameters():
@@ -46,6 +66,16 @@ def get_motion_module(
    else:
        raise ValueError

+def get_window_motion_module(
+    in_channels,
+    motion_module_type: str,
+    motion_module_kwargs: dict
+):
+    if motion_module_type == "Vanilla":
+        return VanillaTemporalModule(in_channels=in_channels, local_window=True, **motion_module_kwargs,)
+    else:
+        raise ValueError
+

class VanillaTemporalModule(nn.Module):
    def __init__(
@@ -59,6 +89,7 @@ def __init__(
        temporal_position_encoding_max_len = 24,
        temporal_attention_dim_div = 1,
        zero_initialize = True,
+        **kwargs,
    ):
        super().__init__()

@@ -71,6 +102,7 @@ def __init__(
            cross_frame_attention_mode=cross_frame_attention_mode,
            temporal_position_encoding=temporal_position_encoding,
            temporal_position_encoding_max_len=temporal_position_encoding_max_len,
+            **kwargs,
        )

        if zero_initialize:
@@ -103,6 +135,7 @@ def __init__(
        cross_frame_attention_mode = None,
        temporal_position_encoding = False,
        temporal_position_encoding_max_len = 24,
+        **kwargs,
    ):
        super().__init__()

@@ -127,6 +160,7 @@ def __init__(
                    cross_frame_attention_mode=cross_frame_attention_mode,
                    temporal_position_encoding=temporal_position_encoding,
                    temporal_position_encoding_max_len=temporal_position_encoding_max_len,
+                    **kwargs,
                )
                for d in range(num_layers)
            ]
@@ -176,6 +210,7 @@ def __init__(
        cross_frame_attention_mode = None,
        temporal_position_encoding = False,
        temporal_position_encoding_max_len = 24,
+        local_window = False,
    ):
        super().__init__()

@@ -208,15 +243,52 @@ def __init__(
        self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn)
        self.ff_norm = nn.LayerNorm(dim)

+        self.local_window = local_window
+

    def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, video_length=None):
-        for attention_block, norm in zip(self.attention_blocks, self.norms):
-            norm_hidden_states = norm(hidden_states)
-            hidden_states = attention_block(
-                norm_hidden_states,
-                encoder_hidden_states=encoder_hidden_states if attention_block.is_cross_attention else None,
-                video_length=video_length,
-            ) + hidden_states
+
+        if not self.local_window:
+            # Original behavior: temporal attention over the full clip at once.
+            for attention_block, norm in zip(self.attention_blocks, self.norms):
+                norm_hidden_states = norm(hidden_states)
+                hidden_states = attention_block(
+                    norm_hidden_states,
+                    encoder_hidden_states=encoder_hidden_states if attention_block.is_cross_attention else None,
+                    video_length=video_length,
+                ) + hidden_states
+        else:
+            # Local-window path: run the attention blocks on overlapping temporal
+            # windows and blend the per-window outputs with the triangular weights.
+            views = get_views(video_length)
+            hidden_states = rearrange(hidden_states, "(b f) d c -> b f d c", f=video_length)
+            count = torch.zeros_like(hidden_states)
+            value = torch.zeros_like(hidden_states)
+            for t_start, t_end in views:
+                weight_sequence = generate_weight_sequence(t_end - t_start)
+                weight_tensor = torch.ones_like(count[:, t_start:t_end])
+                weight_tensor = weight_tensor * torch.Tensor(weight_sequence).to(hidden_states.device).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
+
+                sub_hidden_states = rearrange(hidden_states[:, t_start:t_end], "b f d c -> (b f) d c")
+                for attention_block, norm in zip(self.attention_blocks, self.norms):
+                    norm_hidden_states = norm(sub_hidden_states)
+                    sub_hidden_states = attention_block(
+                        norm_hidden_states,
+                        encoder_hidden_states=encoder_hidden_states if attention_block.is_cross_attention else None,
+                        video_length=t_end - t_start,
+                    ) + sub_hidden_states
+                sub_hidden_states = rearrange(sub_hidden_states, "(b f) d c -> b f d c", f=t_end - t_start)
+
+                value[:, t_start:t_end] += sub_hidden_states * weight_tensor
+                count[:, t_start:t_end] += weight_tensor
+
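+            # Each frame is normalized by the total weight it received, so frames
+            # covered by several windows become a weighted average of those windows
+            # (e.g. with video_length=24, frame 10 lies in windows (0, 16), (4, 20)
+            # and (8, 24) with weights 6, 7 and 3). The torch.where guard only
+            # matters for frames that no window covers.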
+            hidden_states = torch.where(count > 0, value / count, value)
+            hidden_states = rearrange(hidden_states, "b f d c -> (b f) d c")
+

        hidden_states = self.ff(self.ff_norm(hidden_states)) + hidden_states