
Commit ffc686b

Change the injection method of conditions on lynxnet (#225)
1 parent 8dd53f7 commit ffc686b

3 files changed (+19, -10 lines)

configs/templates/config_acoustic.yaml

Lines changed: 1 addition & 1 deletion
@@ -63,7 +63,7 @@ backbone_args:
   num_channels: 1024
   num_layers: 6
   kernel_size: 31
-  dropout_rate: 0.0
+  dropout_rate: 0.1
 #backbone_type: 'wavenet'
 #backbone_args:
 #  num_channels: 512
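Note: the dropout_rate value above only matters if it reaches the dropout argument of the LYNXNet modules changed below; that mapping is assumed here, not shown in this commit. A minimal sketch of the selection pattern that the _dropout branch in LYNXConvModule appears to use:

# Sketch only; assumes backbone_args' dropout_rate is passed through as the module's
# 'dropout' kwarg, and that the if/else around _dropout mirrors the hunk shown later.
# make_dropout is a hypothetical helper for illustration.
import torch.nn as nn

def make_dropout(p: float) -> nn.Module:
    # p > 0 enables real dropout; p == 0 (the old default) degenerates to a no-op
    return nn.Dropout(p) if p > 0 else nn.Identity()

print(make_dropout(0.1))  # Dropout(p=0.1) with the new template value
print(make_dropout(0.0))  # Identity() with the old value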

configs/templates/config_variance.yaml

Lines changed: 2 additions & 0 deletions
@@ -87,6 +87,7 @@ pitch_prediction_args:
 # backbone_args:
 # num_layers: 6
 # num_channels: 512
+# dropout_rate: 0.1
 
 variances_prediction_args:
   total_repeat_bins: 48
@@ -99,6 +100,7 @@ variances_prediction_args:
 # backbone_args:
 # num_layers: 6
 # num_channels: 384
+# dropout_rate: 0.1
 
 lambda_dur_loss: 1.0
 lambda_pitch_loss: 1.0

modules/backbones/lynxnet.py

Lines changed: 16 additions & 9 deletions
@@ -10,6 +10,12 @@
 from utils.hparams import hparams
 
 
+class Conv1d(torch.nn.Conv1d):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        nn.init.kaiming_normal_(self.weight)
+
+
 class SwiGLU(nn.Module):
     # Swish-Applies the gated linear unit function.
     def __init__(self, dim=-1):
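For reference, the Conv1d subclass defined in this hunk is a drop-in replacement for torch.nn.Conv1d that re-initializes its weight with Kaiming-normal initialization right after construction. A small usage sketch (the channel sizes are arbitrary examples):

# Usage sketch of the subclass added above; channel sizes are arbitrary.
import torch
import torch.nn as nn

class Conv1d(torch.nn.Conv1d):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        nn.init.kaiming_normal_(self.weight)  # He-normal init replaces the default

proj = Conv1d(8, 16, 1)            # same call signature as nn.Conv1d
y = proj(torch.randn(2, 8, 100))   # (batch, channels, time) -> (2, 16, 100)
print(y.shape)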
@@ -39,7 +45,7 @@ def calc_same_padding(kernel_size):
         pad = kernel_size // 2
         return pad, pad - (kernel_size + 1) % 2
 
-    def __init__(self, dim, expansion_factor, kernel_size=31, activation='PReLU', dropout=0.):
+    def __init__(self, dim, expansion_factor, kernel_size=31, activation='PReLU', dropout=0.1):
         super().__init__()
         inner_dim = dim * expansion_factor
         activation_classes = {
@@ -57,7 +63,7 @@ def __init__(self, dim, expansion_factor, kernel_size=31, activation='PReLU', dr
         else:
             _dropout = nn.Identity()
         self.net = nn.Sequential(
-            nn.LayerNorm(dim),
+            nn.LayerNorm(dim, eps=1e-6),
             Transpose((1, 2)),
             nn.Conv1d(dim, inner_dim * 2, 1),
             SwiGLU(dim=1),
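The surrounding Sequential doubles the channel count with the pointwise Conv1d and then gates it back down through SwiGLU(dim=1). The SwiGLU body is not part of this diff; a hedged sketch, assuming it chunks the input in two along the given dim and gates one half with SiLU (Swish):

# Sketch under an assumption about SwiGLU's behavior (its body is not shown in this hunk):
# split the doubled channels in half along dim and gate one half with SiLU.
import torch
import torch.nn.functional as F

def swiglu(x: torch.Tensor, dim: int = 1) -> torch.Tensor:
    a, b = x.chunk(2, dim=dim)
    return a * F.silu(b)

x = torch.randn(2, 1024, 100)    # e.g. output of nn.Conv1d(dim, inner_dim * 2, 1)
print(swiglu(x, dim=1).shape)    # channel dim halved: torch.Size([2, 512, 100])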
@@ -73,16 +79,17 @@ def forward(self, x):
 
 
 class LYNXNetResidualLayer(nn.Module):
-    def __init__(self, dim_cond, dim, expansion_factor, kernel_size=31, activation='PReLU', dropout=0.):
+    def __init__(self, dim_cond, dim, expansion_factor, kernel_size=31, activation='PReLU', dropout=0.1):
         super().__init__()
         self.diffusion_projection = nn.Conv1d(dim, dim, 1)
         self.conditioner_projection = nn.Conv1d(dim_cond, dim, 1)
         self.convmodule = LYNXConvModule(dim=dim, expansion_factor=expansion_factor, kernel_size=kernel_size,
                                          activation=activation, dropout=dropout)
 
     def forward(self, x, conditioner, diffusion_step):
+        x = x + self.conditioner_projection(conditioner)
         res_x = x.transpose(1, 2)
-        x = x + self.diffusion_projection(diffusion_step) + self.conditioner_projection(conditioner)
+        x = x + self.diffusion_projection(diffusion_step)
         x = x.transpose(1, 2)
         x = self.convmodule(x)  # (#batch, dim, length)
         x = x + res_x
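This hunk is the injection-method change named in the commit title: the conditioner is now projected and added to x before res_x is captured, so the condition is carried by both the convolution branch and the residual/skip path, while the diffusion-step embedding is still added only after the skip copy is taken. A minimal sketch of the reordering (shapes assumed (batch, dim, time); cond_proj and step_proj stand for the already-projected tensors):

# Sketch of the reordering only, not the full residual layer.
import torch

def forward_old(x, cond_proj, step_proj, convmodule):
    res_x = x.transpose(1, 2)           # skip copy taken BEFORE conditioning
    x = x + step_proj + cond_proj
    x = convmodule(x.transpose(1, 2))
    return x + res_x

def forward_new(x, cond_proj, step_proj, convmodule):
    x = x + cond_proj                   # condition injected first ...
    res_x = x.transpose(1, 2)           # ... so the skip copy already carries it
    x = x + step_proj
    x = convmodule(x.transpose(1, 2))
    return x + res_x

# Toy check with an identity stand-in for the conv module:
B, dim, T = 2, 4, 8
x, c, s = torch.randn(B, dim, T), torch.randn(B, dim, T), torch.randn(B, dim, 1)
print(forward_new(x, c, s, convmodule=lambda t: t).shape)  # torch.Size([2, 8, 4])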
@@ -93,7 +100,7 @@ def forward(self, x, conditioner, diffusion_step):
 
 class LYNXNet(nn.Module):
     def __init__(self, in_dims, n_feats, *, num_layers=6, num_channels=512, expansion_factor=2, kernel_size=31,
-                 activation='PReLU', dropout=0.):
+                 activation='PReLU', dropout=0.1):
         """
         LYNXNet(Linear Gated Depthwise Separable Convolution Network)
         TIPS:You can control the style of the generated results by modifying the 'activation',
@@ -104,7 +111,7 @@ def __init__(self, in_dims, n_feats, *, num_layers=6, num_channels=512, expansio
         super().__init__()
         self.in_dims = in_dims
         self.n_feats = n_feats
-        self.input_projection = nn.Conv1d(in_dims * n_feats, num_channels, 1)
+        self.input_projection = Conv1d(in_dims * n_feats, num_channels, 1)
         self.diffusion_embedding = nn.Sequential(
             SinusoidalPosEmb(num_channels),
             nn.Linear(num_channels, num_channels * 4),
@@ -124,8 +131,8 @@ def __init__(self, in_dims, n_feats, *, num_layers=6, num_channels=512, expansio
                 for i in range(num_layers)
             ]
         )
-        self.norm = nn.LayerNorm(num_channels)
-        self.output_projection = nn.Conv1d(num_channels, in_dims * n_feats, kernel_size=1)
+        self.norm = nn.LayerNorm(num_channels, eps=1e-6)
+        self.output_projection = Conv1d(num_channels, in_dims * n_feats, kernel_size=1)
         nn.init.zeros_(self.output_projection.weight)
 
     def forward(self, spec, diffusion_step, cond):
@@ -142,7 +149,7 @@ def forward(self, spec, diffusion_step, cond):
         x = spec.flatten(start_dim=1, end_dim=2)  # [B, F x M, T]
 
         x = self.input_projection(x)  # x [B, residual_channel, T]
-        x = F.gelu(x)
+        # x = F.gelu(x)
 
         diffusion_step = self.diffusion_embedding(diffusion_step).unsqueeze(-1)

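Two smaller consequences of the remaining hunks, with a hedged sanity check: the GELU after the input projection is now commented out, so the Kaiming-initialized input projection feeds the residual layers directly; and although the output projection is also switched to the Kaiming-initialized Conv1d, the nn.init.zeros_ call that follows still zeroes its weight, so the output projection remains effectively zero-initialized.

# Sanity-check sketch; assumes construction order exactly as in the hunks above
# (Kaiming init inside Conv1d.__init__, then zeros_ applied afterwards).
import torch
import torch.nn as nn

class Conv1d(torch.nn.Conv1d):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        nn.init.kaiming_normal_(self.weight)

output_projection = Conv1d(512, 128, kernel_size=1)    # example channel sizes
nn.init.zeros_(output_projection.weight)               # runs after the Kaiming init
print(torch.count_nonzero(output_projection.weight))   # tensor(0)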