Commit 363127b

Fix typo and format BasicTransformerBlock attributes (huggingface#2953)
* ⚙️ chore(train_controlnet): fix typo in logger message
* ⚙️ chore(models): refactor module order so it matches the calling order. When printing the BasicTransformerBlock to stdout, the attributes should be shown in the order they are called. Previously the "3. Feed Forward" comment also made no sense: it should have been close to self.ff, but it sat next to self.norm3 instead.
* correct many tests
* remove bogus file
* make style
* correct more tests
* finish tests
* fix one more
* make style
* make unclip deterministic
* ⚙️ chore(models/attention): reorganize comments in BasicTransformerBlock class

---------

Co-authored-by: Patrick von Platen <[email protected]>
1 parent 6fc3f72 commit 363127b
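The reordering matters for inspection because `torch.nn.Module` registers child modules in the order they are assigned in `__init__`, and `print(module)` lists them in that registration order. A minimal, hypothetical sketch of the idea (plain `torch.nn` layers standing in for the diffusers classes; the class names below are made up for illustration):

```python
import torch.nn as nn


# Two toy blocks with the same children, assigned in a different order.
class NormFirst(nn.Module):
    def __init__(self, dim: int = 8):
        super().__init__()
        self.norm1 = nn.LayerNorm(dim)    # defined first, called first
        self.attn1 = nn.Linear(dim, dim)  # stand-in for an attention layer


class NormLast(nn.Module):
    def __init__(self, dim: int = 8):
        super().__init__()
        self.attn1 = nn.Linear(dim, dim)
        self.norm1 = nn.LayerNorm(dim)    # defined last, even though it runs first


# print() lists children in registration order, so only NormFirst reads
# in the same order as the forward pass would execute them.
print(NormFirst())
print(NormLast())
```

The commit applies the same idea to BasicTransformerBlock: each norm is defined immediately before the module it feeds, so the printed block reads in calling order.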

File tree

1 file changed (+20, -23 lines)

models/attention.py

Lines changed: 20 additions & 23 deletions
@@ -224,7 +224,14 @@ def __init__(
                 f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
             )
 
+        # Define 3 blocks. Each block has its own normalization layer.
         # 1. Self-Attn
+        if self.use_ada_layer_norm:
+            self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
+        elif self.use_ada_layer_norm_zero:
+            self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
+        else:
+            self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
         self.attn1 = Attention(
             query_dim=dim,
             heads=num_attention_heads,
@@ -235,10 +242,16 @@ def __init__(
             upcast_attention=upcast_attention,
         )
 
-        self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn, final_dropout=final_dropout)
-
         # 2. Cross-Attn
         if cross_attention_dim is not None or double_self_attention:
+            # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
+            # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
+            # the second cross attention block.
+            self.norm2 = (
+                AdaLayerNorm(dim, num_embeds_ada_norm)
+                if self.use_ada_layer_norm
+                else nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
+            )
             self.attn2 = Attention(
                 query_dim=dim,
                 cross_attention_dim=cross_attention_dim if not double_self_attention else None,
@@ -248,30 +261,13 @@ def __init__(
                 bias=attention_bias,
                 upcast_attention=upcast_attention,
             )  # is self-attn if encoder_hidden_states is none
-        else:
-            self.attn2 = None
-
-        if self.use_ada_layer_norm:
-            self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
-        elif self.use_ada_layer_norm_zero:
-            self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
-        else:
-            self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
-
-        if cross_attention_dim is not None or double_self_attention:
-            # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
-            # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
-            # the second cross attention block.
-            self.norm2 = (
-                AdaLayerNorm(dim, num_embeds_ada_norm)
-                if self.use_ada_layer_norm
-                else nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
-            )
         else:
             self.norm2 = None
+            self.attn2 = None
 
         # 3. Feed-forward
         self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
+        self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn, final_dropout=final_dropout)
 
     def forward(
         self,
@@ -283,6 +279,8 @@ def forward(
         cross_attention_kwargs=None,
         class_labels=None,
     ):
+        # Notice that normalization is always applied before the real computation in the following blocks.
+        # 1. Self-Attention
         if self.use_ada_layer_norm:
             norm_hidden_states = self.norm1(hidden_states, timestep)
         elif self.use_ada_layer_norm_zero:
@@ -292,7 +290,6 @@ def forward(
         else:
             norm_hidden_states = self.norm1(hidden_states)
 
-        # 1. Self-Attention
         cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
         attn_output = self.attn1(
             norm_hidden_states,
@@ -304,14 +301,14 @@ def forward(
             attn_output = gate_msa.unsqueeze(1) * attn_output
         hidden_states = attn_output + hidden_states
 
+        # 2. Cross-Attention
         if self.attn2 is not None:
             norm_hidden_states = (
                 self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states)
             )
             # TODO (Birch-San): Here we should prepare the encoder_attention mask correctly
             # prepare attention mask here
 
-            # 2. Cross-Attention
             attn_output = self.attn2(
                 norm_hidden_states,
                 encoder_hidden_states=encoder_hidden_states,
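For orientation, the structure the diff converges on is the usual pre-norm layout: each of the three sub-blocks normalizes its input, runs its computation, and adds a residual. A simplified, hypothetical sketch with plain `torch.nn` layers standing in for diffusers' `Attention`, `AdaLayerNorm`/`AdaLayerNormZero`, and `FeedForward` (the ada-norm branches and attention masks are omitted):

```python
import torch
import torch.nn as nn


class PreNormBlockSketch(nn.Module):
    def __init__(self, dim: int = 32, heads: int = 4):
        super().__init__()
        # 1. Self-Attn
        self.norm1 = nn.LayerNorm(dim)
        self.attn1 = nn.MultiheadAttention(dim, heads, batch_first=True)
        # 2. Cross-Attn
        self.norm2 = nn.LayerNorm(dim)
        self.attn2 = nn.MultiheadAttention(dim, heads, batch_first=True)
        # 3. Feed-forward
        self.norm3 = nn.LayerNorm(dim)
        self.ff = nn.Sequential(nn.Linear(dim, 4 * dim), nn.GELU(), nn.Linear(4 * dim, dim))

    def forward(self, hidden_states, encoder_hidden_states):
        # 1. Self-Attention: normalize, attend over the sequence itself, add residual.
        norm_hidden_states = self.norm1(hidden_states)
        attn_output, _ = self.attn1(norm_hidden_states, norm_hidden_states, norm_hidden_states)
        hidden_states = attn_output + hidden_states

        # 2. Cross-Attention: normalize, attend over the encoder states, add residual.
        norm_hidden_states = self.norm2(hidden_states)
        attn_output, _ = self.attn2(norm_hidden_states, encoder_hidden_states, encoder_hidden_states)
        hidden_states = attn_output + hidden_states

        # 3. Feed-forward: normalize, project, add residual.
        norm_hidden_states = self.norm3(hidden_states)
        hidden_states = self.ff(norm_hidden_states) + hidden_states
        return hidden_states


# Toy usage: shapes are (batch, sequence, dim).
block = PreNormBlockSketch()
out = block(torch.randn(2, 16, 32), torch.randn(2, 8, 32))
print(out.shape)  # torch.Size([2, 16, 32])
```

With the modules defined in this order, the "1. Self-Attn", "2. Cross-Attn", and "3. Feed-forward" comments in `__init__` line up with the matching steps in `forward`, which is what the reorganized comments in the diff express.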
