Commit 24ff01c

Merge branch 'main' into finetuning_scheduler
2 parents dc2d25f + b3762a2 commit 24ff01c

28 files changed: +54 -47 lines changed

.actions/helpers.py

Lines changed: 1 addition & 1 deletion
@@ -79,7 +79,7 @@
 #
 # ### Great thanks from the entire Pytorch Lightning Team for your interest !
 #
-# ![Pytorch Lightning](https://github.com/PyTorchLightning/pytorch-lightning/blob/master/docs/source/_static/images/logo.png){height="60px" width="240px"}
+# [![Pytorch Lightning](https://raw.githubusercontent.com/PyTorchLightning/pytorch-lightning/master/docs/source/_static/images/logo.png){height="60px" width="240px"}](https://pytorchlightning.ai)
 
 """
 TEMPLATE_CARD_ITEM = """

.azure-pipelines/ipynb-publish.yml

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@ jobs:
 # - For 60 minutes on Microsoft-hosted agents with a private project or private repository
 timeoutInMinutes: 0
 
-pool: gridai-spot-pool
+pool: azure-gpus-persist
 # this need to have installed docker in the base image...
 container:
   # base ML image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04

.azure-pipelines/ipynb-tests.yml

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@ jobs:
 # how much time to give 'run always even if cancelled tasks' before stopping them
 cancelTimeoutInMinutes: 2
 
-pool: gridai-spot-pool
+pool: azure-gpus-spot
 # this need to have installed docker in the base image...
 container:
   # base ML image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04

.github/workflows/ci_docs.yml

Lines changed: 2 additions & 2 deletions
@@ -1,4 +1,4 @@
-name: test Docs
+name: validate Docs
 
 on:  # Trigger the workflow on push or pull request
   pull_request: {}

@@ -71,7 +71,7 @@ jobs:
     working-directory: ./docs
     run: |
       # First run the same pipeline as Read-The-Docs
-      make html --debug --jobs $(nproc) SPHINXOPTS="-W --keep-going"
+      make html --debug --jobs $(nproc) SPHINXOPTS="-W --keep-going" -b linkcheck
 
 - name: Upload built docs
   uses: actions/upload-artifact@v2

course_UvA-DL/01-introduction-to-pytorch/.meta.yml

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 title: "Tutorial 1: Introduction to PyTorch"
 author: Phillip Lippe
 created: 2021-08-27
-updated: 2021-08-27
+updated: 2021-11-29
 license: CC BY-SA
 description: |
   This tutorial will give a short introduction to PyTorch basics, and get you setup for writing your own neural networks.

course_UvA-DL/01-introduction-to-pytorch/Introduction_to_PyTorch.py

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@
 # The following notebook is meant to give a short introduction to PyTorch basics, and get you setup for writing your own neural networks.
 # PyTorch is an open source machine learning framework that allows you to write your own neural networks and optimize them efficiently.
 # However, PyTorch is not the only framework of its kind.
-# Alternatives to PyTorch include [TensorFlow](https://www.tensorflow.org/), [JAX](https://github.com/google/jax#quickstart-colab-in-the-cloud) and [Caffe](http://caffe.berkeleyvision.org/).
+# Alternatives to PyTorch include [TensorFlow](https://www.tensorflow.org/), [JAX](https://github.com/google/jax) and [Caffe](http://caffe.berkeleyvision.org/).
 # We choose to teach PyTorch at the University of Amsterdam because it is well established, has a huge developer community (originally developed by Facebook), is very flexible and especially used in research.
 # Many current papers publish their code in PyTorch, and thus it is good to be familiar with PyTorch as well.
 # Meanwhile, TensorFlow (developed by Google) is usually known for being a production-grade deep learning library.

course_UvA-DL/03-initialization-and-optimization/.meta.yml

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 title: "Tutorial 3: Initialization and Optimization"
 author: Phillip Lippe
 created: 2021-08-27
-updated: 2021-08-27
+updated: 2021-11-29
 license: CC BY-SA
 tags:
   - Image

course_UvA-DL/03-initialization-and-optimization/Initialization_and_Optimization.py

Lines changed: 2 additions & 2 deletions
@@ -323,7 +323,7 @@ def visualize_activations(model, color="C0", print_variance=False):
 # %% [markdown]
 # ## Initialization
 #
-# Before starting our discussion about initialization, it should be noted that there exist many very good blog posts about the topic of neural network initialization (for example [deeplearning.ai](https://www.deeplearning.ai/ai-notes/initialization/), or a more [math-focused blog post](https://pouannes.github.io/blog/initialization/#mjx-eqn-eqfwd_K)).
+# Before starting our discussion about initialization, it should be noted that there exist many very good blog posts about the topic of neural network initialization (for example [deeplearning.ai](https://www.deeplearning.ai/ai-notes/initialization/), or a more [math-focused blog post](https://pouannes.github.io/blog/initialization)).
 # In case something remains unclear after this tutorial, we recommend skimming through these blog posts as well.
 #
 # When initializing a neural network, there are a few properties we would like to have.

@@ -457,7 +457,7 @@ def equal_var_init(model):
 # Besides the variance of the activations, another variance we would like to stabilize is the one of the gradients.
 # This ensures a stable optimization for deep networks.
 # It turns out that we can do the same calculation as above starting from $\Delta x=W\Delta y$, and come to the conclusion that we should initialize our layers with $1/d_y$ where $d_y$ is the number of output neurons.
-# You can do the calculation as a practice, or check a thorough explanation in [this blog post](https://pouannes.github.io/blog/initialization/#mjx-eqn-eqfwd_K).
+# You can do the calculation as a practice, or check a thorough explanation in [this blog post](https://pouannes.github.io/blog/initialization).
 # As a compromise between both constraints, [Glorot and Bengio (2010)](http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf?hc_location=ufi) proposed to use the harmonic mean of both values.
 # This leads us to the well-known Xavier initialization:
 #
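
To make the harmonic-mean compromise mentioned in the changed passage concrete, here is a short standalone sketch (illustrative only, not part of this commit): Xavier initialization draws weights with variance 2/(d_x + d_y), the harmonic mean of the forward constraint 1/d_x and the gradient constraint 1/d_y. The same formula is what `torch.nn.init.xavier_normal_` implements.

import torch

d_x, d_y = 512, 256                  # fan-in and fan-out of a single linear layer
var = 2.0 / (d_x + d_y)              # Xavier variance: harmonic mean of 1/d_x and 1/d_y

W = torch.randn(d_y, d_x) * var ** 0.5
x = torch.randn(10_000, d_x)         # unit-variance inputs
y = x @ W.T

# Forward activation variance is 2 * d_x / (d_x + d_y) (about 1.33 here); the backward
# gradient variance would analogously be 2 * d_y / (d_x + d_y) (about 0.67), i.e. the
# compromise between keeping activations and gradients at unit variance.
print(y.var().item())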

course_UvA-DL/04-inception-resnet-densenet/.meta.yaml

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 title: "Tutorial 4: Inception, ResNet and DenseNet"
 author: Phillip Lippe
 created: 2021-08-27
-updated: 2021-08-27
+updated: 2021-11-29
 license: CC BY-SA
 tags:
   - Image

course_UvA-DL/04-inception-resnet-densenet/Inception_ResNet_DenseNet.py

Lines changed: 3 additions & 3 deletions
@@ -208,7 +208,7 @@
 # 5. Test loop (`test_step`) which is the same as validation, only on a test set.
 #
 # Therefore, we don't abstract the PyTorch code, but rather organize it and define some default operations that are commonly used.
-# If you need to change something else in your training/validation/test loop, there are many possible functions you can overwrite (see the [docs](https://pytorch-lightning.readthedocs.io/en/stable/lightning_module.html) for details).
+# If you need to change something else in your training/validation/test loop, there are many possible functions you can overwrite (see the [docs](https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html) for details).
 #
 # Now we can look at an example of how a Lightning Module for training a CNN looks like:

@@ -322,7 +322,7 @@ def create_model(model_name, model_hparams):
 # Besides the Lightning module, the second most important module in PyTorch Lightning is the `Trainer`.
 # The trainer is responsible to execute the training steps defined in the Lightning module and completes the framework.
 # Similar to the Lightning module, you can override any key part that you don't want to be automated, but the default settings are often the best practice to do.
-# For a full overview, see the [documentation](https://pytorch-lightning.readthedocs.io/en/stable/trainer.html).
+# For a full overview, see the [documentation](https://pytorch-lightning.readthedocs.io/en/stable/common/trainer.html).
 # The most important functions we use below are:
 #
 # * `trainer.fit`: Takes as input a lightning module, a training dataset, and an (optional) validation dataset.

@@ -764,7 +764,7 @@ def forward(self, x):
 #
 # The three groups operate on the resolutions $32\times32$, $16\times16$ and $8\times8$ respectively.
 # The blocks in orange denote ResNet blocks with downsampling.
-# The same notation is used by many other implementations such as in the [torchvision library](https://pytorch.org/docs/stable/_modules/torchvision/models/resnet.html#resnet18) from PyTorch.
+# The same notation is used by many other implementations such as in the [torchvision library](https://pytorch.org/vision/0.11/models.html#torchvision.models.resnet18) from PyTorch.
 # Thus, our code looks as follows:
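
The second hunk above sits next to the notebook's description of the `Trainer` and `trainer.fit`. As a quick standalone illustration of the API being linked (not part of this commit; the tiny module and random data below are placeholders for the notebook's CIFAR10 setup):

import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
import pytorch_lightning as pl

class TinyModule(pl.LightningModule):
    """Minimal LightningModule: one linear layer and an MSE training loss."""

    def __init__(self):
        super().__init__()
        self.layer = nn.Linear(32, 1)

    def training_step(self, batch, batch_idx):
        x, y = batch
        return nn.functional.mse_loss(self.layer(x), y)

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.01)

# random tensors standing in for a real dataset
ds = TensorDataset(torch.randn(64, 32), torch.randn(64, 1))
trainer = pl.Trainer(max_epochs=1)                      # see the Trainer docs linked above for more flags
trainer.fit(TinyModule(), DataLoader(ds, batch_size=16))  # trainer.fit(module, train_dataloader)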

course_UvA-DL/05-transformers-and-MH-attention/.meta.yml

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 title: "Tutorial 5: Transformers and Multi-Head Attention"
 author: Phillip Lippe
 created: 2021-06-30
-updated: 2021-06-30
+updated: 2021-11-29
 license: CC BY-SA
 build: 0
 tags:

course_UvA-DL/05-transformers-and-MH-attention/Transformers_MHAttention.py

Lines changed: 2 additions & 3 deletions
@@ -669,10 +669,9 @@ def forward(self, x):
 # Improved optimizers like [RAdam](https://arxiv.org/abs/1908.03265) have been shown to overcome this issue,
 # not requiring warm-up for training Transformers.
 # Secondly, the iteratively applied Layer Normalization across layers can lead to very high gradients during
-# the first iterations, which can be solved by using
-# [Pre-Layer Normalization](https://proceedings.icml.cc/static/paper_files/icml/2020/328-Paper.pdf)
+# the first iterations, which can be solved by using Pre-Layer Normalization
 # (similar to Pre-Activation ResNet), or replacing Layer Normalization by other techniques
-# ([Adaptive Normalization](https://proceedings.icml.cc/static/paper_files/icml/2020/328-Paper.pdf),
+# (Adaptive Normalization,
 # [Power Normalization](https://arxiv.org/abs/2003.07845)).
 #
 # Nevertheless, many applications and papers still use the original Transformer architecture with Adam,

course_UvA-DL/06-graph-neural-networks/.meta.yml

Lines changed: 2 additions & 2 deletions
@@ -1,7 +1,7 @@
 title: "Tutorial 6: Basics of Graph Neural Networks"
 author: Phillip Lippe
 created: 2021-06-07
-updated: 2021-06-16
+updated: 2021-12-04
 license: CC BY-SA
 build: 0
 tags:

@@ -22,7 +22,7 @@ requirements:
   - torch-sparse
   - torch-cluster
   - torch-spline-conv
-  - torch-geometric==1.7.2
+  - torch-geometric==2.0.2
 pip__find-link:
   # - https://pytorch-geometric.com/whl/torch-1.8.0+cu101.html
   - https://pytorch-geometric.com/whl/torch-%(TORCH_MAJOR_DOT_MINOR)s.0+%(DEVICE)s.html

course_UvA-DL/06-graph-neural-networks/GNN_overview.py

Lines changed: 1 addition & 1 deletion
@@ -805,7 +805,7 @@ def print_results(result_dict):
 # Torch geometric uses a different, more efficient approach: we can view the $N$ graphs in a batch as a single large graph with concatenated node and edge list.
 # As there is no edge between the $N$ graphs, running GNN layers on the large graph gives us the same output as running the GNN on each graph separately.
 # Visually, this batching strategy is visualized below (figure credit - PyTorch Geometric team,
-# [tutorial here](https://colab.research.google.com/drive/1I8a0DfQ3fI7Njc62__mVXUlcAleUclnb?usp=sharing#scrollTo=2owRWKcuoALo)).
+# [tutorial here](https://colab.research.google.com/drive/1I8a0DfQ3fI7Njc62__mVXUlcAleUclnb)).
 #
 # <center width="100%"><img src="torch_geometric_stacking_graphs.png" width="600px"></center>
 #
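
The hunk above touches the notebook's explanation of how PyTorch Geometric batches graphs: the N graphs become one large disconnected graph with concatenated node and edge lists. A small standalone sketch of that behaviour (not part of the commit; assumes the torch-geometric==2.0.2 pin from the .meta.yml above):

import torch
from torch_geometric.data import Batch, Data

# two tiny graphs with 3 and 2 nodes respectively
g1 = Data(x=torch.randn(3, 4), edge_index=torch.tensor([[0, 1], [1, 2]]))
g2 = Data(x=torch.randn(2, 4), edge_index=torch.tensor([[0], [1]]))

batch = Batch.from_data_list([g1, g2])
print(batch.x.shape)     # torch.Size([5, 4]) - node features simply concatenated
print(batch.edge_index)  # g2's node indices are shifted by 3, so no edges cross graphs
print(batch.batch)       # tensor([0, 0, 0, 1, 1]) - maps each node back to its graph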

flash_tutorials/electricity_forecasting/.meta.yml

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@ author: Ethan Harris ([email protected])
 created: 2021-11-23
 updated: 2021-11-23
 license: CC BY-SA
-build: 2
+build: 3
 tags:
   - Tabular
   - Forecasting

lightning_examples/augmentation_kornia/.meta.yml

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@ author: PL/Kornia team
 created: 2021-06-11
 updated: 2021-06-16
 license: CC BY-SA
-build: 2
+build: 3
 tags:
   - Image
 description: |

lightning_examples/basic-gan/.meta.yaml

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@ author: PL team
 created: 2020-12-21
 updated: 2021-06-16
 license: CC BY-SA
-build: 3
+build: 4
 tags:
   - Image
 description: |

lightning_examples/cifar10-baseline/.meta.yml

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@ author: PL team
 created: 2020-12-21
 updated: 2021-06-16
 license: CC BY-SA
-build: 1
+build: 2
 tags:
   - Image
 description: >

lightning_examples/datamodules/.meta.yml

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@ author: PL team
 created: 2020-12-21
 updated: 2021-06-07
 license: CC BY-SA
-build: 1
+build: 2
 description: This notebook will walk you through how to start using Datamodules. With
   the release of `pytorch-lightning` version 0.9.0, we have included a new class called
   `LightningDataModule` to help you decouple data related hooks from your `LightningModule`.

lightning_examples/datamodules/datamodules.py

Lines changed: 1 addition & 1 deletion
@@ -9,9 +9,9 @@
 import torch
 import torch.nn.functional as F
 from pytorch_lightning import LightningDataModule, LightningModule, Trainer
-from pytorch_lightning.metrics.functional import accuracy
 from torch import nn
 from torch.utils.data import DataLoader, random_split
+from torchmetrics.functional import accuracy
 from torchvision import transforms
 
 # Note - you must have torchvision installed for this example
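
The hunk above only relocates the `accuracy` import: the metric now comes from torchmetrics instead of the removed `pytorch_lightning.metrics` package. A tiny check of the relocated import (illustrative only; it assumes a torchmetrics release contemporary with this commit, roughly 0.6 — newer releases additionally expect a `task="multiclass"` argument):

import torch
from torchmetrics.functional import accuracy

preds = torch.tensor([0, 2, 1, 3])
target = torch.tensor([0, 1, 2, 3])
print(accuracy(preds, target))  # tensor(0.5000) - two of four predictions are correct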

lightning_examples/mnist-hello-world/.meta.yml

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@ author: PL team
 created: 2020-12-21
 updated: 2021-06-16
 license: CC BY-SA
-build: 1
+build: 2
 tags:
   - Image
 description: In this notebook, we'll go over the basics of lightning by preparing

lightning_examples/mnist-hello-world/hello-world.py

Lines changed: 3 additions & 3 deletions
@@ -79,17 +79,17 @@ def configure_optimizers(self):
 #
 # ### Note what the following built-in functions are doing:
 #
-# 1. [prepare_data()](https://pytorch-lightning.readthedocs.io/en/latest/api/pytorch_lightning.core.lightning.html#pytorch_lightning.core.lightning.LightningModule.prepare_data) 💾
+# 1. [prepare_data()](https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#prepare-data) 💾
 #    - This is where we can download the dataset. We point to our desired dataset and ask torchvision's `MNIST` dataset class to download if the dataset isn't found there.
 #    - **Note we do not make any state assignments in this function** (i.e. `self.something = ...`)
 #
-# 2. [setup(stage)](https://pytorch-lightning.readthedocs.io/en/latest/common/lightning-module.html#setup) ⚙️
+# 2. [setup(stage)](https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#setup) ⚙️
 #    - Loads in data from file and prepares PyTorch tensor datasets for each split (train, val, test).
 #    - Setup expects a 'stage' arg which is used to separate logic for 'fit' and 'test'.
 #    - If you don't mind loading all your datasets at once, you can set up a condition to allow for both 'fit' related setup and 'test' related setup to run whenever `None` is passed to `stage` (or ignore it altogether and exclude any conditionals).
 #    - **Note this runs across all GPUs and it *is* safe to make state assignments here**
 #
-# 3. [x_dataloader()](https://pytorch-lightning.readthedocs.io/en/latest/common/lightning-module.html#data-hooks) ♻️
+# 3. [x_dataloader()](https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.core.hooks.html) ♻️
 #    - `train_dataloader()`, `val_dataloader()`, and `test_dataloader()` all return PyTorch `DataLoader` instances that are created by wrapping their respective datasets that we prepared in `setup()`
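
The three hooks whose documentation links were fixed above can be summarized in a compact sketch. This is illustrative only, not part of the commit: the notebook defines these hooks on its own MNIST module with more options, and names such as `self.mnist_train` are placeholders.

import pytorch_lightning as pl
from torch.utils.data import DataLoader, random_split
from torchvision import transforms
from torchvision.datasets import MNIST

class LitMNISTData(pl.LightningModule):
    """Only the three data hooks discussed above; the real module also defines
    the network and the training/validation/test steps."""

    def __init__(self, data_dir: str = "./", batch_size: int = 32):
        super().__init__()
        self.data_dir, self.batch_size = data_dir, batch_size

    def prepare_data(self):
        # download only - no state assignments here
        MNIST(self.data_dir, train=True, download=True)
        MNIST(self.data_dir, train=False, download=True)

    def setup(self, stage=None):
        # runs on every process/GPU - safe to assign state here
        if stage in ("fit", None):
            full = MNIST(self.data_dir, train=True, transform=transforms.ToTensor())
            self.mnist_train, self.mnist_val = random_split(full, [55000, 5000])
        if stage in ("test", None):
            self.mnist_test = MNIST(self.data_dir, train=False, transform=transforms.ToTensor())

    def train_dataloader(self):
        return DataLoader(self.mnist_train, batch_size=self.batch_size)

    def val_dataloader(self):
        return DataLoader(self.mnist_val, batch_size=self.batch_size)

    def test_dataloader(self):
        return DataLoader(self.mnist_test, batch_size=self.batch_size)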

lightning_examples/mnist-tpu-training/.meta.yml

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@ author: PL team
 created: 2020-12-21
 updated: 2021-06-25
 license: CC BY-SA
-build: 0
+build: 1
 tags:
   - Image
 description: In this notebook, we'll train a model on TPUs. Updating one Trainer flag is all you need for that.

lightning_examples/reinforce-learning-DQN/.meta.yml

Lines changed: 4 additions & 1 deletion
@@ -1,7 +1,7 @@
 title: How to train a Deep Q Network
 author: PL team
 created: 2021-01-31
-updated: 2021-06-17
+updated: 2021-12-03
 license: CC BY-SA
 build: 1
 tags:

@@ -13,6 +13,9 @@ description: |
   2. Handle unsupervised learning by using an IterableDataset where the dataset itself is constantly updated during training
   3. Each training step carries has the agent taking an action in the environment and storing the experience in the IterableDataset
 requirements:
+  - torchvision<=0.10
+  - torchaudio<=0.10
+  - torchtext<=0.10
   - gym
 accelerator:
   - CPU

lightning_examples/reinforce-learning-DQN/dqn.py

Lines changed: 12 additions & 8 deletions
@@ -1,7 +1,7 @@
 # %%
 import os
 from collections import OrderedDict, deque, namedtuple
-from typing import List, Tuple
+from typing import Iterator, List, Tuple
 
 import gym
 import numpy as np

@@ -99,7 +99,7 @@ def __init__(self, buffer: ReplayBuffer, sample_size: int = 200) -> None:
         self.buffer = buffer
         self.sample_size = sample_size
 
-    def __iter__(self) -> Tuple:
+    def __iter__(self) -> Iterator[Tuple]:
         states, actions, rewards, dones, new_states = self.buffer.sample(self.sample_size)
         for i in range(len(dones)):
             yield states[i], actions[i], rewards[i], dones[i], new_states[i]

@@ -247,7 +247,7 @@ def populate(self, steps: int = 1000) -> None:
         Args:
             steps: number of random steps to populate the buffer with
         """
-        for i in range(steps):
+        for _ in range(steps):
             self.agent.play_step(self.net, epsilon=1.0)
 
     def forward(self, x: Tensor) -> Tensor:

@@ -273,7 +273,7 @@ def dqn_mse_loss(self, batch: Tuple[Tensor, Tensor]) -> Tensor:
         """
         states, actions, rewards, dones, next_states = batch
 
-        state_action_values = self.net(states).gather(1, actions.unsqueeze(-1)).squeeze(-1)
+        state_action_values = self.net(states).gather(1, actions.long().unsqueeze(-1)).squeeze(-1)
 
         with torch.no_grad():
             next_state_values = self.target_net(next_states).max(1)[0]

@@ -284,6 +284,11 @@ def dqn_mse_loss(self, batch: Tuple[Tensor, Tensor]) -> Tensor:
 
         return nn.MSELoss()(state_action_values, expected_state_action_values)
 
+    def get_epsilon(self, start: int, end: int, frames: int) -> float:
+        if self.global_step > frames:
+            return end
+        return start - (self.global_step / frames) * (start - end)
+
     def training_step(self, batch: Tuple[Tensor, Tensor], nb_batch) -> OrderedDict:
         """Carries out a single step through the environment to update the replay buffer. Then calculates loss
         based on the minibatch recieved.

@@ -296,14 +301,13 @@ def training_step(self, batch: Tuple[Tensor, Tensor], nb_batch) -> OrderedDict:
             Training loss and log metrics
         """
         device = self.get_device(batch)
-        epsilon = max(
-            self.hparams.eps_end,
-            self.hparams.eps_start - self.global_step + 1 / self.hparams.eps_last_frame,
-        )
+        epsilon = self.get_epsilon(self.hparams.eps_start, self.hparams.eps_end, self.hparams.eps_last_frame)
+        self.log("epsilon", epsilon)
 
         # step through environment with agent
         reward, done = self.agent.play_step(self.net, epsilon, device)
         self.episode_reward += reward
+        self.log("episode reward", self.episode_reward)
 
         # calculates training loss
         loss = self.dqn_mse_loss(batch)
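
The new `get_epsilon` method above replaces the previous, incorrectly parenthesized expression `eps_start - global_step + 1 / eps_last_frame` with a proper linear schedule. A standalone sketch of its behaviour (the hyperparameter values below are illustrative, not necessarily the notebook's defaults):

def get_epsilon(global_step: int, start: float, end: float, frames: int) -> float:
    # same linear schedule as the new LightningModule method, made standalone
    if global_step > frames:
        return end
    return start - (global_step / frames) * (start - end)

# epsilon falls linearly from `start` to `end` over `frames` steps, then stays flat
for step in (0, 250, 500, 1000, 2000):
    print(step, get_epsilon(step, start=1.0, end=0.01, frames=1000))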

lightning_examples/text-transformers/.meta.yml

Lines changed: 3 additions & 2 deletions
@@ -1,9 +1,9 @@
 title: Finetune Transformers Models with PyTorch Lightning
 author: PL team
 created: 2021-01-31
-updated: 2021-06-21
+updated: 2021-12-03
 license: CC BY-SA
-build: 1
+build: 2
 tags:
   - Text
 description: |

@@ -15,6 +15,7 @@ requirements:
   - datasets
   - scipy
   - scikit-learn
+  - torchtext>=0.9
 accelerator:
   - CPU
   - GPU

lightning_examples/text-transformers/text-transformers.py

Lines changed: 3 additions & 3 deletions
@@ -220,7 +220,7 @@ def setup(self, stage=None) -> None:
         if stage != "fit":
             return
         # Get dataloader by calling it - train_dataloader() is called after setup() by default
-        train_loader = self.train_dataloader()
+        train_loader = self.trainer.datamodule.train_dataloader()
 
         # Calculate total steps
         tb_size = self.hparams.train_batch_size * max(1, self.trainer.gpus)

@@ -274,7 +274,7 @@ def configure_optimizers(self):
 )
 
 trainer = Trainer(max_epochs=1, gpus=AVAIL_GPUS)
-trainer.fit(model, dm)
+trainer.fit(model, datamodule=dm)
 
 # %% [markdown]
 # ### MRPC

@@ -298,7 +298,7 @@ def configure_optimizers(self):
 )
 
 trainer = Trainer(max_epochs=3, gpus=AVAIL_GPUS)
-trainer.fit(model, dm)
+trainer.fit(model, datamodule=dm)
 
 # %% [markdown]
 # ### MNLI
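
The `setup()` change above matters because this notebook keeps its data in a separate `LightningDataModule`: once `trainer.fit(model, datamodule=dm)` attaches it, the module reaches the dataloaders through `self.trainer.datamodule` rather than defining `train_dataloader()` itself. A minimal sketch of that pattern (illustrative only; the class name, the attribute, and the step count below are placeholders, not the notebook's exact code):

import pytorch_lightning as pl

class TransformerFinetuner(pl.LightningModule):
    def setup(self, stage=None):
        if stage != "fit":
            return
        # the datamodule passed to trainer.fit(..., datamodule=dm) is reachable here
        train_loader = self.trainer.datamodule.train_dataloader()
        # e.g. derive a total step count for a learning-rate scheduler
        self.total_steps = len(train_loader) * self.trainer.max_epochs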
