@@ -441,20 +441,20 @@ def forward_step(self, input, hidden):
# :alt:
#
#
-# Bahdanau attention, also known as additive attention, is a commonly used
-# attention mechanism in sequence-to-sequence models, particularly in neural
-# machine translation tasks. It was introduced by Bahdanau et al. in their
-# paper titled `Neural Machine Translation by Jointly Learning to Align and Translate <https://arxiv.org/pdf/1409.0473.pdf>`__.
-# This attention mechanism employs a learned alignment model to compute attention
-# scores between the encoder and decoder hidden states. It utilizes a feed-forward
+# Bahdanau attention, also known as additive attention, is a commonly used
+# attention mechanism in sequence-to-sequence models, particularly in neural
+# machine translation tasks. It was introduced by Bahdanau et al. in their
+# paper titled `Neural Machine Translation by Jointly Learning to Align and Translate <https://arxiv.org/pdf/1409.0473.pdf>`__.
+# This attention mechanism employs a learned alignment model to compute attention
+# scores between the encoder and decoder hidden states. It utilizes a feed-forward
# neural network to calculate alignment scores.
#
-# However, there are alternative attention mechanisms available, such as Luong attention,
-# which computes attention scores by taking the dot product between the decoder hidden
-# state and the encoder hidden states. It does not involve the non-linear transformation
+# However, there are alternative attention mechanisms available, such as Luong attention,
+# which computes attention scores by taking the dot product between the decoder hidden
+# state and the encoder hidden states. It does not involve the non-linear transformation
# used in Bahdanau attention.
#
-# In this tutorial, we will be using Bahdanau attention. However, it would be a valuable
+# In this tutorial, we will be using Bahdanau attention. However, it would be a valuable
# exercise to explore modifying the attention mechanism to use Luong attention.

class BahdanauAttention(nn.Module):
@@ -467,7 +467,7 @@ def __init__(self, hidden_size):
    def forward(self, query, keys):
        scores = self.Va(torch.tanh(self.Wa(query) + self.Ua(keys)))
        scores = scores.squeeze(2).unsqueeze(1)
-
+
        weights = F.softmax(scores, dim=-1)
        context = torch.bmm(weights, keys)
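As a point of comparison for the exercise suggested above: Luong-style (dot-product) attention scores each encoder state with a plain dot product against the decoder hidden state instead of the learned feed-forward alignment used in ``BahdanauAttention``. The sketch below is illustrative only and is not part of the tutorial or of this commit; the class name ``LuongDotAttention`` is made up, and it assumes ``query`` has shape ``(batch, 1, hidden_size)`` and ``keys`` has shape ``(batch, seq_len, hidden_size)``, i.e. the same shapes the Bahdanau module receives.

import torch
import torch.nn as nn
import torch.nn.functional as F

class LuongDotAttention(nn.Module):
    # Parameter-free dot-product scoring; only sensible when the encoder and
    # decoder hidden sizes match, as they do in this tutorial.
    def forward(self, query, keys):
        # (batch, 1, hidden) x (batch, hidden, seq_len) -> (batch, 1, seq_len)
        scores = torch.bmm(query, keys.transpose(1, 2))
        # Normalize the scores over the source positions
        weights = F.softmax(scores, dim=-1)
        # Weighted sum of encoder states -> (batch, 1, hidden)
        context = torch.bmm(weights, keys)
        return context, weights

If the exercise is attempted, something along these lines could stand in for the Bahdanau module inside the decoder, since it consumes the same ``query``/``keys`` inputs and produces the same kind of context vector and attention weights computed above.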
@@ -605,9 +605,9 @@ def get_dataloader(batch_size):
# ``teacher_forcing_ratio`` up to use more of it.
#

-def train_epoch(dataloader, encoder, decoder, encoder_optimizer,
+def train_epoch(dataloader, encoder, decoder, encoder_optimizer,
                decoder_optimizer, criterion):
-
+
    total_loss = 0
    for data in dataloader:
        input_tensor, target_tensor = data
@@ -617,7 +617,7 @@ def train_epoch(dataloader, encoder, decoder, encoder_optimizer,

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)
-
+
        loss = criterion(
            decoder_outputs.view(-1, decoder_outputs.size(-1)),
            target_tensor.view(-1)
@@ -628,7 +628,7 @@ def train_epoch(dataloader, encoder, decoder, encoder_optimizer,
        decoder_optimizer.step()

        total_loss += loss.item()
-
+
    return total_loss / len(dataloader)

@@ -671,7 +671,7 @@ def train(train_dataloader, encoder, decoder, n_epochs, learning_rate=0.001,
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every
-
+
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()
@@ -680,7 +680,7 @@ def train(train_dataloader, encoder, decoder, n_epochs, learning_rate=0.001,
        loss = train_epoch(train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss
-
+
        if epoch % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
@@ -691,7 +691,7 @@ def train(train_dataloader, encoder, decoder, n_epochs, learning_rate=0.001,
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0
-
+
    showPlot(plot_losses)

######################################################################
@@ -736,7 +736,7 @@ def evaluate(encoder, decoder, sentence, input_lang, output_lang):

        _, topi = decoder_outputs.topk(1)
        decoded_ids = topi.squeeze()
-
+
        decoded_words = []
        for idx in decoded_ids:
            if idx.item() == EOS_token:
@@ -793,7 +793,9 @@ def evaluateRandomly(encoder, decoder, n=10):

######################################################################
#
-
+# Set dropout layers to ``eval`` mode
+encoder.eval()
+decoder.eval()
evaluateRandomly(encoder, decoder)

@@ -807,7 +809,7 @@ def evaluateRandomly(encoder, decoder, n=10):
# at each time step.
#
# You could simply run ``plt.matshow(attentions)`` to see attention output
-# displayed as a matrix. For a better viewing experience we will do the
+# displayed as a matrix. For a better viewing experience we will do the
# extra work of adding axes and labels:
#
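The tutorial's full plotting helper lies outside this hunk. As a rough illustration of the labeled version described above, the idea is roughly the sketch below; the helper name ``show_attention`` and its ``input_words``/``output_words`` arguments are placeholders, not the tutorial's exact code.

import matplotlib.pyplot as plt

def show_attention(input_words, output_words, attentions):
    # attentions: 2-D array with one row per output token and one column per input token
    fig, ax = plt.subplots()
    cax = ax.matshow(attentions, cmap='bone')
    fig.colorbar(cax)

    # One tick per cell, labelled with the actual tokens
    ax.set_xticks(range(len(input_words)))
    ax.set_xticklabels(input_words, rotation=90)
    ax.set_yticks(range(len(output_words)))
    ax.set_yticklabels(output_words)

    plt.show()

After running the evaluation step on a sentence, passing the tokenized input, the decoded output words, and the attention matrix to a helper like this would produce the kind of labeled plot described above.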