From 291dfbb9f02cb9cca653e8d586c749af2240e0e0 Mon Sep 17 00:00:00 2001
From: Andrew Hoblitzell
Date: Tue, 7 Nov 2023 19:35:26 -0500
Subject: [PATCH 1/4] generate_square_subsequent_mask

---
 beginner_source/transformer_tutorial.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/beginner_source/transformer_tutorial.py b/beginner_source/transformer_tutorial.py
index d7ebee959e5..6cd39cc064b 100644
--- a/beginner_source/transformer_tutorial.py
+++ b/beginner_source/transformer_tutorial.py
@@ -91,11 +91,12 @@ def forward(self, src: Tensor, src_mask: Tensor = None) -> Tensor:
         """
         src = self.embedding(src) * math.sqrt(self.d_model)
         src = self.pos_encoder(src)
+        if src_mask is None:
+            src_mask = nn.Transformer.generate_square_subsequent_mask(len(src)).to(device)
         output = self.transformer_encoder(src, src_mask)
         output = self.linear(output)
         return output
 
-
 ######################################################################
 # ``PositionalEncoding`` module injects some information about the
 # relative or absolute position of the tokens in the sequence. The

From 73ef5d04e161d936830030f31bd4c9d095775797 Mon Sep 17 00:00:00 2001
From: Andrew Hoblitzell
Date: Tue, 7 Nov 2023 19:38:27 -0500
Subject: [PATCH 2/4] dropped a space

---
 beginner_source/transformer_tutorial.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/beginner_source/transformer_tutorial.py b/beginner_source/transformer_tutorial.py
index 6cd39cc064b..77ef6f571b6 100644
--- a/beginner_source/transformer_tutorial.py
+++ b/beginner_source/transformer_tutorial.py
@@ -97,6 +97,7 @@ def forward(self, src: Tensor, src_mask: Tensor = None) -> Tensor:
         output = self.linear(output)
         return output
 
+
 ######################################################################
 # ``PositionalEncoding`` module injects some information about the
 # relative or absolute position of the tokens in the sequence. The

From 29496ca0702fdf0242fbda9dcb6cb4f881647296 Mon Sep 17 00:00:00 2001
From: Andrew Hoblitzell
Date: Thu, 9 Nov 2023 09:08:02 -0500
Subject: [PATCH 3/4] Update transformer_tutorial.py with description about the need for masking

---
 beginner_source/transformer_tutorial.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/beginner_source/transformer_tutorial.py b/beginner_source/transformer_tutorial.py
index 77ef6f571b6..ce7dfbde56e 100644
--- a/beginner_source/transformer_tutorial.py
+++ b/beginner_source/transformer_tutorial.py
@@ -41,8 +41,10 @@
 # Along with the input sequence, a square attention mask is required because the
 # self-attention layers in ``nn.TransformerDecoder`` are only allowed to attend
 # the earlier positions in the sequence. For the language modeling task, any
-# tokens on the future positions should be masked. To produce a probability
-# distribution over output words, the output of the ``nn.TransformerEncoder``
+# tokens on the future positions should be masked. This masking, combined with
+# the fact that the output embeddings are offset by one position, ensures that
+# the predictions for position i can depend only on the known outputs at positions less than i.
+# To produce a probability distribution over output words, the output of the ``nn.TransformerEncoder``
 # model is passed through a linear layer to output unnormalized logits.
 # The log-softmax function isn't applied here due to the later use of
 # `CrossEntropyLoss <https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html>`__,
@@ -92,6 +94,9 @@ def forward(self, src: Tensor, src_mask: Tensor = None) -> Tensor:
         src = self.embedding(src) * math.sqrt(self.d_model)
         src = self.pos_encoder(src)
         if src_mask is None:
+            # Generate a square causal mask for the sequence. The masked
+            # positions are filled with float('-inf'); unmasked positions
+            # are filled with float(0.0).
             src_mask = nn.Transformer.generate_square_subsequent_mask(len(src)).to(device)
         output = self.transformer_encoder(src, src_mask)
         output = self.linear(output)

From b5fcf990942dcf836499775fd47de88d40c5ac6b Mon Sep 17 00:00:00 2001
From: Andrew Hoblitzell
Date: Thu, 9 Nov 2023 19:09:05 -0500
Subject: [PATCH 4/4] Update transformer_tutorial.py

---
 beginner_source/transformer_tutorial.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/beginner_source/transformer_tutorial.py b/beginner_source/transformer_tutorial.py
index ce7dfbde56e..437f6345241 100644
--- a/beginner_source/transformer_tutorial.py
+++ b/beginner_source/transformer_tutorial.py
@@ -29,7 +29,7 @@
 
 ######################################################################
 # In this tutorial, we train a ``nn.TransformerEncoder`` model on a
-# language modeling task. Please note that this tutorial does not cover
+# causal language modeling task. Please note that this tutorial does not cover
 # the training of `nn.TransformerDecoder <https://pytorch.org/docs/stable/generated/torch.nn.TransformerDecoder.html>`__, as depicted in
 # the right half of the diagram above. The language modeling task is to assign a
 # probability for the likelihood of a given word (or a sequence of words)
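
For context on the default added in PATCH 1/4 and documented in PATCH 3/4, here is a minimal standalone sketch of the mask that nn.Transformer.generate_square_subsequent_mask produces and how it is consumed as an additive attention mask. The snippet is not part of the patch series; seq_len, d_model=16, and nhead=2 are made-up example values, and the printed tensor is only illustrative of the -inf/0.0 pattern.

import torch
import torch.nn as nn

seq_len = 5  # hypothetical example length; the patched forward() uses len(src)

# (seq_len, seq_len) additive mask: 0.0 on and below the diagonal,
# float('-inf') above it, so position i can only attend to positions <= i.
src_mask = nn.Transformer.generate_square_subsequent_mask(seq_len)
print(src_mask)
# tensor([[0., -inf, -inf, -inf, -inf],
#         [0., 0., -inf, -inf, -inf],
#         [0., 0., 0., -inf, -inf],
#         [0., 0., 0., 0., -inf],
#         [0., 0., 0., 0., 0.]])

# The mask is added to the attention scores inside each encoder layer,
# mirroring `self.transformer_encoder(src, src_mask)` in the patched forward().
encoder_layer = nn.TransformerEncoderLayer(d_model=16, nhead=2)  # toy sizes
encoder = nn.TransformerEncoder(encoder_layer, num_layers=1)
x = torch.rand(seq_len, 1, 16)  # (seq_len, batch, d_model); batch_first=False by default
out = encoder(x, mask=src_mask)
print(out.shape)  # torch.Size([5, 1, 16])

Generating the mask inside forward() only when the caller passes src_mask=None keeps call sites that supply their own mask unchanged while making the causal mask the default.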