diff --git a/docs/source/conf.py b/docs/source/conf.py
index 30eab910e..8e90952d1 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -181,6 +181,11 @@
# (source start file, name, description, authors, manual section).
man_pages = [(master_doc, project, project + " Documentation", [author], 1)]
+# -- Options for linkcheck builder ----------------------------------------------
+# regex pattern 0: allow linking to a specific selection state in
+# tensorboard.dev links while continuing to validate the base experiment link
+linkcheck_anchors_ignore = ["scalars.*&runSelectionState.*"]
+
# -- Options for Texinfo output ----------------------------------------------
# Grouping the document tree into Texinfo files. List of tuples
diff --git a/lightning_examples/finetuning-scheduler/.meta.yml b/lightning_examples/finetuning-scheduler/.meta.yml
new file mode 100644
index 000000000..3cae7999e
--- /dev/null
+++ b/lightning_examples/finetuning-scheduler/.meta.yml
@@ -0,0 +1,19 @@
+title: Finetuning Scheduler
+author: "[Dan Dale](https://github.com/speediedan)"
+created: 2021-11-29
+updated: 2022-05-10
+license: CC BY-SA
+build: 3
+tags:
+ - finetuning
+description: |
+ This notebook introduces the [Finetuning Scheduler](https://finetuning-scheduler.readthedocs.io/en/stable/index.html) extension
+  and demonstrates how to use it to finetune a small foundational model on the
+ [RTE](https://huggingface.co/datasets/viewer/?dataset=super_glue&config=rte) task of
+ [SuperGLUE](https://super.gluebenchmark.com/) with iterative early-stopping defined according to a user-specified
+ schedule. It uses Hugging Face's ``datasets`` and ``transformers`` libraries to retrieve the relevant benchmark data
+ and foundational model weights. The required dependencies are installed via the finetuning-scheduler ``[examples]`` extra.
+requirements:
+ - finetuning-scheduler[examples]
+accelerator:
+ - GPU
diff --git a/lightning_examples/finetuning-scheduler/RteBoolqModule_ft_schedule_deberta_base.yaml b/lightning_examples/finetuning-scheduler/RteBoolqModule_ft_schedule_deberta_base.yaml
new file mode 100644
index 000000000..62bdbae28
--- /dev/null
+++ b/lightning_examples/finetuning-scheduler/RteBoolqModule_ft_schedule_deberta_base.yaml
@@ -0,0 +1,18 @@
+
+0:
+ params:
+ - model.classifier.bias
+ - model.classifier.weight
+ - model.pooler.dense.bias
+ - model.pooler.dense.weight
+ - model.deberta.encoder.LayerNorm.bias
+ - model.deberta.encoder.LayerNorm.weight
+ - model.deberta.encoder.rel_embeddings.weight
+ - model.deberta.encoder.layer.{0,11}.(output|attention|intermediate).*
+1:
+ params:
+ - model.deberta.embeddings.LayerNorm.bias
+ - model.deberta.embeddings.LayerNorm.weight
+2:
+ params:
+ - model.deberta.embeddings.word_embeddings.weight
diff --git a/lightning_examples/finetuning-scheduler/emphasized_yaml.png b/lightning_examples/finetuning-scheduler/emphasized_yaml.png
new file mode 100644
index 000000000..492be1d40
Binary files /dev/null and b/lightning_examples/finetuning-scheduler/emphasized_yaml.png differ
diff --git a/lightning_examples/finetuning-scheduler/finetuning-scheduler.py b/lightning_examples/finetuning-scheduler/finetuning-scheduler.py
new file mode 100644
index 000000000..e276fe677
--- /dev/null
+++ b/lightning_examples/finetuning-scheduler/finetuning-scheduler.py
@@ -0,0 +1,711 @@
+# %% [markdown]
+# ## Scheduled Finetuning with the Finetuning Scheduler Extension
+#
+# {height="58px" width="401px"}
+#
+# The [Finetuning Scheduler](https://finetuning-scheduler.readthedocs.io/en/stable/index.html) extension accelerates and enhances model experimentation with flexible finetuning schedules.
+#
+# Training with the extension is simple and confers a host of benefits:
+#
+# - it dramatically increases finetuning flexibility
+# - it expedites and facilitates exploration of model tuning dynamics
+# - it enables marginal performance improvements of finetuned models
+#
+# Setup is straightforward: just install from PyPI! Since this notebook-based example requires a few additional packages (e.g.
+# ``transformers``, ``sentencepiece``), we installed the ``finetuning-scheduler`` package with the ``[examples]`` extra above.
+# Once the ``finetuning-scheduler`` package is installed, the [FinetuningScheduler](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html#finetuning_scheduler.fts.FinetuningScheduler) callback is available for use with PyTorch Lightning.
+# For additional installation options, please see the Finetuning Scheduler [README](https://github.com/speediedan/finetuning-scheduler/blob/main/README.md).
+#
+#
+#
+#
+#
+# Fundamentally, [Finetuning Scheduler](https://finetuning-scheduler.readthedocs.io/en/stable/index.html) enables
+# scheduled, multi-phase, finetuning of foundational models. Gradual unfreezing (i.e. thawing) can help maximize
+# foundational model knowledge retention while allowing (typically upper layers of) the model to
+# optimally adapt to new tasks during transfer learning [1, 2, 3](#f1).
+#
+#
+#
+# The [FinetuningScheduler](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html#finetuning_scheduler.fts.FinetuningScheduler) callback orchestrates the gradual unfreezing
+# of models via a finetuning schedule that is either implicitly generated (the default) or explicitly provided by the user
+# (more computationally efficient). Finetuning phase transitions are driven by
+# [FTSEarlyStopping](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts_supporters.html#finetuning_scheduler.fts_supporters.FTSEarlyStopping)
+# criteria (a multi-phase extension of ``EarlyStopping`` packaged with FinetuningScheduler), user-specified epoch transitions or a composition of the two (the default mode).
+# A [FinetuningScheduler](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html#finetuning_scheduler.fts.FinetuningScheduler) training session completes when the
+# final phase of the schedule has its stopping criteria met. See
+# the [early stopping documentation](https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.callbacks.EarlyStopping.html) for more details on that callback's configuration.
+#
+# {height="272px" width="376px"}
+
+# %% [markdown]
+#
+# ## Basic Usage
+#
+#
+#
+# If no finetuning schedule is provided by the user, [FinetuningScheduler](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html#finetuning_scheduler.fts.FinetuningScheduler) will generate a
+# [default schedule](#The-Default-Finetuning-Schedule) and proceed to finetune according to the generated schedule,
+# using default [FTSEarlyStopping](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts_supporters.html#finetuning_scheduler.fts_supporters.FTSEarlyStopping) and [FTSCheckpoint](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts_supporters.html#finetuning_scheduler.fts_supporters.FTSCheckpoint) callbacks with ``monitor=val_loss``.
+#
+#
+#
+# ```python
+# from pytorch_lightning import Trainer
+# from finetuning_scheduler import FinetuningScheduler
+# trainer = Trainer(callbacks=[FinetuningScheduler()])
+# ```
+
+# %% [markdown]
+# ## The Default Finetuning Schedule
+#
+# Schedule definition is facilitated via the [gen_ft_schedule](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts_supporters.html#finetuning_scheduler.fts_supporters.SchedulingMixin.gen_ft_schedule) method, which dumps a default finetuning schedule (by default using a naive, 2-parameters-per-level heuristic) that can be adjusted as
+# desired by the user and/or subsequently passed to the callback. Using the default/implicitly generated schedule will likely be less computationally efficient than a user-defined finetuning schedule, but it is useful for exploring a model's finetuning behavior and can serve as a good baseline for subsequent explicit schedule refinement.
+# While the current version of [FinetuningScheduler](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html#finetuning_scheduler.fts.FinetuningScheduler) only supports single optimizer and (optional) lr_scheduler configurations, per-phase maximum learning rates can be set as demonstrated in the next section.
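+#
+# As a rough illustration of the heuristic (a hypothetical excerpt, not actual generated output), the first two
+# phases of an implicitly generated schedule for the DeBERTa-v3 model used later in this notebook might look like:
+#
+# ```python
+# # hypothetical excerpt of a default, implicitly generated schedule, written out as YAML
+# default_schedule_excerpt = """
+# 0:
+#   params:
+#     - model.classifier.bias
+#     - model.classifier.weight
+# 1:
+#   params:
+#     - model.pooler.dense.bias
+#     - model.pooler.dense.weight
+# """
+# ```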
+
+# %% [markdown]
+# ## Specifying a Finetuning Schedule
+#
+# To specify a finetuning schedule, it's convenient to first generate the default schedule and then alter the thawed/unfrozen parameter groups associated with each finetuning phase as desired. Finetuning phases are zero-indexed and executed in ascending order.
+#
+# 1. First, generate the default schedule to ``Trainer.log_dir``. It will be named after your
+# ``LightningModule`` subclass with the suffix ``_ft_schedule.yaml``.
+#
+# ```python
+# from pytorch_lightning import Trainer
+# from finetuning_scheduler import FinetuningScheduler
+# trainer = Trainer(callbacks=[FinetuningScheduler(gen_ft_sched_only=True)])
+# ```
+#
+# 2. Alter the schedule as desired.
+#
+# {height="327px" width="800px"}
+#
+# 3. Once the finetuning schedule has been altered as desired, pass it to
+# [FinetuningScheduler](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html#finetuning_scheduler.fts.FinetuningScheduler) to commence scheduled training:
+#
+# ```python
+# from pytorch_lightning import Trainer
+# from finetuning_scheduler import FinetuningScheduler
+#
+# trainer = Trainer(callbacks=[FinetuningScheduler(ft_schedule="/path/to/my/schedule/my_schedule.yaml")])
+# ```
+
+# %% [markdown]
+# ## Early-Stopping and Epoch-Driven Phase Transition Criteria
+#
+#
+# By default, [FTSEarlyStopping](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts_supporters.html#finetuning_scheduler.fts_supporters.FTSEarlyStopping) and epoch-driven
+# transition criteria are composed. If a ``max_transition_epoch`` is specified for a given phase, the next finetuning phase will begin at that epoch unless [FTSEarlyStopping](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts_supporters.html#finetuning_scheduler.fts_supporters.FTSEarlyStopping) criteria are met first.
+# If [FinetuningScheduler.epoch_transitions_only](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html#finetuning_scheduler.fts.FinetuningScheduler.params.epoch_transitions_only) is ``True``, [FTSEarlyStopping](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts_supporters.html#finetuning_scheduler.fts_supporters.FTSEarlyStopping) will not be used
+# and transitions will be exclusively epoch-driven.
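+#
+# For example (a minimal sketch; the schedule path is hypothetical and its phases are assumed to define
+# ``max_transition_epoch`` values as described above):
+#
+# ```python
+# from pytorch_lightning import Trainer
+# from finetuning_scheduler import FinetuningScheduler
+#
+# # transitions will be driven solely by each phase's ``max_transition_epoch``
+# trainer = Trainer(callbacks=[FinetuningScheduler(ft_schedule="epoch_capped_schedule.yaml", epoch_transitions_only=True)])
+# ```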
+#
+#
+#
+#
+# **Tip:** Regular expressions can be convenient for specifying more complex schedules. Also, a per-phase base maximum lr can be specified, as highlighted below:
+#
+# {height="380px" width="800px"}
+#
+#
+#
+#
+#
+# The end-to-end example in this notebook ([Scheduled Finetuning For SuperGLUE](#superglue)) uses [FinetuningScheduler](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html#finetuning_scheduler.fts.FinetuningScheduler) in explicit mode to finetune a small foundational model on the [RTE](https://huggingface.co/datasets/viewer/?dataset=super_glue&config=rte) task of [SuperGLUE](https://super.gluebenchmark.com/).
+# Please see the [official Finetuning Scheduler documentation](https://finetuning-scheduler.readthedocs.io/en/stable/index.html) if you are interested in a similar [CLI-based example](https://finetuning-scheduler.readthedocs.io/en/stable/index.html#scheduled-finetuning-superglue) using the LightningCLI.
+
+# %% [markdown]
+# ## Resuming Scheduled Finetuning Training Sessions
+#
+# Resumption of scheduled finetuning training is identical to the continuation of
+# [other training sessions](https://pytorch-lightning.readthedocs.io/en/stable/common/trainer.html) with the caveat that the provided checkpoint must have been saved by a [FinetuningScheduler](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html#finetuning_scheduler.fts.FinetuningScheduler) session.
+# [FinetuningScheduler](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html#finetuning_scheduler.fts.FinetuningScheduler) uses [FTSCheckpoint](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts_supporters.html#finetuning_scheduler.fts_supporters.FTSCheckpoint) (an extension of ``ModelCheckpoint``) to maintain schedule state with special metadata.
+#
+#
+# ```python
+# from pytorch_lightning import Trainer
+# from finetuning_scheduler import FinetuningScheduler
+# trainer = Trainer(callbacks=[FinetuningScheduler()])
+# trainer.fit(..., ckpt_path="some/path/to/my_checkpoint.ckpt")
+# ```
+#
+# Training will resume at the depth/level of the provided checkpoint according to the specified schedule. Schedules can be altered between training sessions but schedule compatibility is left to the user for maximal flexibility. If executing a user-defined schedule, typically the same schedule should be provided for the original and resumed training sessions.
+#
+# By default ([FinetuningScheduler.restore_best](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html?highlight=restore_best#finetuning_scheduler.fts.FinetuningScheduler.params.restore_best) is ``True``), [FinetuningScheduler](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html#finetuning_scheduler.fts.FinetuningScheduler) will attempt to restore the best available checkpoint before finetuning depth transitions.
+#
+# ```python
+# trainer = Trainer(callbacks=[FinetuningScheduler()])
+# trainer.fit(..., ckpt_path="some/path/to/my_kth_best_checkpoint.ckpt")
+# ```
+#
+# Note that, similar to the behavior of [ModelCheckpoint](https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.callbacks.ModelCheckpoint.html) (specifically [this PR](https://github.com/PyTorchLightning/pytorch-lightning/pull/12045)),
+# when resuming training with a different [FTSCheckpoint](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts_supporters.html#finetuning_scheduler.fts_supporters.FTSCheckpoint) ``dirpath`` from the provided
+# checkpoint, the new training session's checkpoint state will be re-initialized at the resumption depth with the provided checkpoint being set as the best checkpoint.
+
+# %% [markdown]
+#
+#
+# **Note:** Currently, [FinetuningScheduler](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html#finetuning_scheduler.fts.FinetuningScheduler) supports the following strategy types (a usage sketch follows this list):
+#
+# - ``DP``
+# - ``DDP``
+# - ``DDP_SPAWN``
+# - ``DDP_SHARDED``
+# - ``DDP_SHARDED_SPAWN``
+#
+#
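+# For example (a minimal sketch; the accelerator/device settings shown are illustrative only):
+#
+# ```python
+# from pytorch_lightning import Trainer
+# from finetuning_scheduler import FinetuningScheduler
+#
+# # scheduled finetuning with one of the supported strategies (here DDP) across 2 GPUs
+# trainer = Trainer(strategy="ddp", accelerator="gpu", devices=2, callbacks=[FinetuningScheduler()])
+# ```
+#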
+
+# %% [markdown]
+#
+#
+# ## Scheduled Finetuning For SuperGLUE
+#
+# The following example demonstrates the use of [FinetuningScheduler](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html#finetuning_scheduler.fts.FinetuningScheduler) to finetune a small foundational model on the [RTE](https://huggingface.co/datasets/viewer/?dataset=super_glue&config=rte) task of [SuperGLUE](https://super.gluebenchmark.com/). Iterative early-stopping will be applied according to a user-specified schedule.
+#
+
+# %%
+import os
+import warnings
+from datetime import datetime
+from importlib import import_module
+from typing import Any, Dict, List, Optional
+
+import datasets
+
+import sentencepiece as sp # noqa: F401 # isort: split
+import pytorch_lightning as pl
+import torch
+from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
+from pytorch_lightning.loggers.tensorboard import TensorBoardLogger
+from pytorch_lightning.utilities import rank_zero_warn
+from pytorch_lightning.utilities.cli import _Registry
+from pytorch_lightning.utilities.exceptions import MisconfigurationException
+from torch.optim.adamw import AdamW
+from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
+from torch.utils.data import DataLoader
+from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer
+from transformers import logging as transformers_logging
+from transformers.tokenization_utils_base import BatchEncoding
+
+# %%
+# a helper function (and mock registry) to prepare code to work with a user module registry
+MOCK_REGISTRY = _Registry()
+
+
+def mock_register_module(key: str, require_fqn: bool = False) -> None:
+ if key.lower() == "finetuningscheduler":
+ mod = import_module("finetuning_scheduler")
+ MOCK_REGISTRY.register_classes(mod, pl.callbacks.Callback)
+ else:
+ raise MisconfigurationException(f"user module key '{key}' not found")
+ registered_list = []
+ # make registered class available by unqualified class name by default
+ if not require_fqn:
+ for n, c in MOCK_REGISTRY.items():
+ globals()[f"{n}"] = c
+ registered_list = ", ".join([n for n in MOCK_REGISTRY.names])
+ else:
+ registered_list = ", ".join([c.__module__ + "." + c.__name__ for c in MOCK_REGISTRY.classes])
+ print(f"Imported and registered the following callbacks: {registered_list}")
+
+
+# %%
+# Load the `FinetuningScheduler` PyTorch Lightning extension module we want to use. This will import all necessary callbacks.
+mock_register_module("finetuningscheduler")
+# set notebook-level variables
+TASK_NUM_LABELS = {"boolq": 2, "rte": 2}
+DEFAULT_TASK = "rte"
+
+transformers_logging.set_verbosity_error()
+# ignore warnings related to the tokenizers_parallelism/DataLoader parallelism trade-off and
+# expected logging behavior
+for warnf in [".*does not have many workers*", ".*The number of training samples.*"]:
+ warnings.filterwarnings("ignore", warnf)
+
+
+# %%
+class RteBoolqDataModule(pl.LightningDataModule):
+ """A ``LightningDataModule`` designed for both the RTE or BoolQ SuperGLUE Hugging Face datasets."""
+
+ TASK_TEXT_FIELD_MAP = {"rte": ("premise", "hypothesis"), "boolq": ("question", "passage")}
+ LOADER_COLUMNS = (
+ "datasets_idx",
+ "input_ids",
+ "token_type_ids",
+ "attention_mask",
+ "start_positions",
+ "end_positions",
+ "labels",
+ )
+
+ def __init__(
+ self,
+ model_name_or_path: str,
+ task_name: str = DEFAULT_TASK,
+ max_seq_length: int = 128,
+ train_batch_size: int = 16,
+ eval_batch_size: int = 16,
+ tokenizers_parallelism: bool = True,
+ **dataloader_kwargs: Any,
+ ):
+ r"""Initialize the ``LightningDataModule`` designed for both the RTE or BoolQ SuperGLUE Hugging Face
+ datasets.
+
+ Args:
+ model_name_or_path (str):
+ Can be either:
+ - A string, the ``model id`` of a pretrained model hosted inside a model repo on huggingface.co.
+ Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under
+ a user or organization name, like ``dbmdz/bert-base-german-cased``.
+ - A path to a ``directory`` containing model weights saved using
+ :meth:`~transformers.PreTrainedModel.save_pretrained`, e.g., ``./my_model_directory/``.
+ task_name (str, optional): Name of the SuperGLUE task to execute. This module supports 'rte' or 'boolq'.
+ Defaults to DEFAULT_TASK which is 'rte'.
+ max_seq_length (int, optional): Length to which we will pad sequences or truncate input. Defaults to 128.
+ train_batch_size (int, optional): Training batch size. Defaults to 16.
+ eval_batch_size (int, optional): Batch size to use for validation and testing splits. Defaults to 16.
+ tokenizers_parallelism (bool, optional): Whether to use parallelism in the tokenizer. Defaults to True.
+ \**dataloader_kwargs: Arguments passed when initializing the dataloader
+ """
+ super().__init__()
+ task_name = task_name if task_name in TASK_NUM_LABELS.keys() else DEFAULT_TASK
+ self.text_fields = self.TASK_TEXT_FIELD_MAP[task_name]
+ self.dataloader_kwargs = {
+ "num_workers": dataloader_kwargs.get("num_workers", 0),
+ "pin_memory": dataloader_kwargs.get("pin_memory", False),
+ }
+ self.save_hyperparameters()
+ os.environ["TOKENIZERS_PARALLELISM"] = "true" if self.hparams.tokenizers_parallelism else "false"
+ self.tokenizer = AutoTokenizer.from_pretrained(
+ self.hparams.model_name_or_path, use_fast=True, local_files_only=False
+ )
+
+ def prepare_data(self):
+ """Load the SuperGLUE dataset."""
+ # N.B. PL calls prepare_data from a single process (rank 0) so do not use it to assign
+ # state (e.g. self.x=y)
+ datasets.load_dataset("super_glue", self.hparams.task_name)
+
+ def setup(self, stage):
+ """Setup our dataset splits for training/validation."""
+ self.dataset = datasets.load_dataset("super_glue", self.hparams.task_name)
+ for split in self.dataset.keys():
+ self.dataset[split] = self.dataset[split].map(
+ self._convert_to_features, batched=True, remove_columns=["label"]
+ )
+ self.columns = [c for c in self.dataset[split].column_names if c in self.LOADER_COLUMNS]
+ self.dataset[split].set_format(type="torch", columns=self.columns)
+
+ self.eval_splits = [x for x in self.dataset.keys() if "validation" in x]
+
+ def train_dataloader(self):
+ return DataLoader(self.dataset["train"], batch_size=self.hparams.train_batch_size, **self.dataloader_kwargs)
+
+ def val_dataloader(self):
+ return DataLoader(self.dataset["validation"], batch_size=self.hparams.eval_batch_size, **self.dataloader_kwargs)
+
+ def _convert_to_features(self, example_batch: datasets.arrow_dataset.Batch) -> BatchEncoding:
+ """Convert raw text examples to a :class:`~transformers.tokenization_utils_base.BatchEncoding` container
+ (derived from python dict) of features that includes helpful methods for translating between word/character
+ space and token space.
+
+ Args:
+ example_batch ([type]): The set of examples to convert to token space.
+
+ Returns:
+ ``BatchEncoding``: A batch of encoded examples (note default tokenizer batch_size=1000)
+ """
+ text_pairs = list(zip(example_batch[self.text_fields[0]], example_batch[self.text_fields[1]]))
+ # Tokenize the text/text pairs
+ features = self.tokenizer.batch_encode_plus(
+ text_pairs, max_length=self.hparams.max_seq_length, padding="longest", truncation=True
+ )
+ # Rename label to labels to make it easier to pass to model forward
+ features["labels"] = example_batch["label"]
+ return features
+
+
+# %%
+class RteBoolqModule(pl.LightningModule):
+ """A ``LightningModule`` that can be used to finetune a foundational model on either the RTE or BoolQ SuperGLUE
+ tasks using Hugging Face implementations of a given model and the `SuperGLUE Hugging Face dataset."""
+
+ def __init__(
+ self,
+ model_name_or_path: str,
+ optimizer_init: Dict[str, Any],
+ lr_scheduler_init: Dict[str, Any],
+ model_cfg: Optional[Dict[str, Any]] = None,
+ task_name: str = DEFAULT_TASK,
+ experiment_tag: str = "default",
+ ):
+ """
+ Args:
+ model_name_or_path (str): Path to pretrained model or identifier from https://huggingface.co/models
+ optimizer_init (Dict[str, Any]): The desired optimizer configuration.
+ lr_scheduler_init (Dict[str, Any]): The desired learning rate scheduler config
+ model_cfg (Optional[Dict[str, Any]], optional): Defines overrides of the default model config. Defaults to
+ ``None``.
+ task_name (str, optional): The SuperGLUE task to execute, one of ``'rte'``, ``'boolq'``. Defaults to "rte".
+ experiment_tag (str, optional): The tag to use for the experiment and tensorboard logs. Defaults to
+ "default".
+ """
+ super().__init__()
+ if task_name not in TASK_NUM_LABELS.keys():
+ rank_zero_warn(f"Invalid task_name {task_name!r}. Proceeding with the default task: {DEFAULT_TASK!r}")
+ task_name = DEFAULT_TASK
+ self.num_labels = TASK_NUM_LABELS[task_name]
+ self.model_cfg = model_cfg or {}
+ conf = AutoConfig.from_pretrained(model_name_or_path, num_labels=self.num_labels, local_files_only=False)
+ self.model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, config=conf)
+ self.model.config.update(self.model_cfg) # apply model config overrides
+ self.init_hparams = {
+ "optimizer_init": optimizer_init,
+ "lr_scheduler_init": lr_scheduler_init,
+ "model_config": self.model.config,
+ "model_name_or_path": model_name_or_path,
+ "task_name": task_name,
+ "experiment_id": f"{datetime.now().strftime('%Y%m%d_%H%M%S')}_{experiment_tag}",
+ }
+ self.save_hyperparameters(self.init_hparams)
+ self.metric = datasets.load_metric(
+ "super_glue", self.hparams.task_name, experiment_id=self.hparams.experiment_id
+ )
+ self.no_decay = ["bias", "LayerNorm.weight"]
+
+ @property
+    def finetuningscheduler_callback(self) -> Optional[FinetuningScheduler]: # type: ignore # noqa
+ fts = [c for c in self.trainer.callbacks if isinstance(c, FinetuningScheduler)] # type: ignore # noqa
+ return fts[0] if fts else None
+
+ def forward(self, **inputs):
+ return self.model(**inputs)
+
+ def training_step(self, batch, batch_idx):
+ outputs = self(**batch)
+ loss = outputs[0]
+ self.log("train_loss", loss)
+ return loss
+
+ def training_epoch_end(self, outputs: List[Any]) -> None:
+ if self.finetuningscheduler_callback:
+ self.log("finetuning_schedule_depth", float(self.finetuningscheduler_callback.curr_depth))
+
+ def validation_step(self, batch, batch_idx, dataloader_idx=0):
+ outputs = self(**batch)
+ val_loss, logits = outputs[:2]
+        if self.num_labels > 1:
+ preds = torch.argmax(logits, axis=1)
+ elif self.num_labels == 1:
+ preds = logits.squeeze()
+ labels = batch["labels"]
+ self.log("val_loss", val_loss, prog_bar=True)
+ metric_dict = self.metric.compute(predictions=preds, references=labels)
+ self.log_dict(metric_dict, prog_bar=True)
+
+ def _init_param_groups(self) -> List[Dict]:
+ """Initialize the parameter groups. Used to ensure weight_decay is not applied to our specified bias
+ parameters when we initialize the optimizer.
+
+ Returns:
+ List[Dict]: A list of parameter group dictionaries.
+ """
+ return [
+ {
+ "params": [
+ p
+ for n, p in self.model.named_parameters()
+ if not any(nd in n for nd in self.no_decay) and p.requires_grad
+ ],
+ "weight_decay": self.hparams.optimizer_init["weight_decay"],
+ },
+ {
+ "params": [
+ p
+ for n, p in self.model.named_parameters()
+ if any(nd in n for nd in self.no_decay) and p.requires_grad
+ ],
+ "weight_decay": 0.0,
+ },
+ ]
+
+ def configure_optimizers(self):
+ # the phase 0 parameters will have been set to require gradients during setup
+        # you can initialize the optimizer with a simple requires_grad filter as is often done,
+        # but in this case we pass a list of parameter groups to ensure weight_decay is
+        # not applied to the bias and LayerNorm parameters (for completeness, in this case it won't
+        # make much performance difference)
+ optimizer = AdamW(params=self._init_param_groups(), **self.hparams.optimizer_init)
+ scheduler = {
+ "scheduler": CosineAnnealingWarmRestarts(optimizer, **self.hparams.lr_scheduler_init),
+ "interval": "epoch",
+ }
+ return [optimizer], [scheduler]
+
+
+# %% [markdown]
+# ### Our Training Sessions
+#
+# We'll be comparing three different finetuning training configurations. Every configuration in this example depends
+# upon a shared set of defaults, only differing in their respective finetuning schedules.
+#
+# | Experiment Tag | Training Scenario Description |
+# |:-----------------:| ---------------------------------------------------------------------- |
+# | ``fts_explicit`` | Training with a finetuning schedule explicitly provided by the user |
+# | ``nofts_baseline``| A baseline finetuning training session (without scheduled finetuning) |
+# | ``fts_implicit`` | Training with an implicitly generated finetuning schedule (the default)|
+#
+# Let's begin by configuring the ``fts_explicit`` scenario. We'll subsequently run the other two scenarios for
+# comparison.
+
+# %%
+# Let's create a finetuning schedule for our model and run an explicitly scheduled finetuning training scenario with it
+# Please see the [FinetuningScheduler documentation](https://finetuning-scheduler.readthedocs.io/en/stable/index.html) for a full description of the schedule format
+
+
+ft_schedule_yaml = """
+0:
+ params:
+ - model.classifier.bias
+ - model.classifier.weight
+ - model.pooler.dense.bias
+ - model.pooler.dense.weight
+ - model.deberta.encoder.LayerNorm.bias
+ - model.deberta.encoder.LayerNorm.weight
+ - model.deberta.encoder.rel_embeddings.weight
+ - model.deberta.encoder.layer.{0,11}.(output|attention|intermediate).*
+1:
+ params:
+ - model.deberta.embeddings.LayerNorm.bias
+ - model.deberta.embeddings.LayerNorm.weight
+2:
+ params:
+ - model.deberta.embeddings.word_embeddings.weight
+"""
+ft_schedule_name = "RteBoolqModule_ft_schedule_deberta_base.yaml"
+# Let's write the schedule to a file so we can simulate loading an explicitly defined finetuning
+# schedule.
+with open(ft_schedule_name, "w") as f:
+ f.write(ft_schedule_yaml)
+
+# %%
+datasets.logging.disable_progress_bar()
+pl.seed_everything(42)
+dm = RteBoolqDataModule(model_name_or_path="microsoft/deberta-v3-base", tokenizers_parallelism=True)
+
+# %% [markdown]
+# ### Optimizer Configuration
+#
+#
+#
+# Though other optimizers can arguably yield some marginal advantage contingent on the context,
+# the Adam optimizer (and the [AdamW version](https://pytorch.org/docs/stable/_modules/torch/optim/adamw.html#AdamW) which
+# implements decoupled weight decay) remains robust to hyperparameter choices and is commonly used for finetuning
+# foundational language models. See [(Sivaprasad et al., 2020)](#f2) and [(Mosbach, Andriushchenko & Klakow, 2020)](#f3) for theoretical and systematic empirical justifications of Adam and its use in finetuning
+# large transformer-based language models. The values used here have some justification
+# in the referenced literature but have been largely determined empirically; while a good
+# starting point, they could be further tuned.
+#
+#
+
+# %%
+optimizer_init = {"weight_decay": 1e-05, "eps": 1e-07, "lr": 1e-05}
+
+# %% [markdown]
+# ### LR Scheduler Configuration
+#
+#
+#
+# The [CosineAnnealingWarmRestarts scheduler](https://pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.CosineAnnealingWarmRestarts.html?highlight=cosineannealingwarm#torch.optim.lr_scheduler.CosineAnnealingWarmRestarts) nicely fits with our iterative finetuning since it does not depend upon a global max_epoch
+# value. The importance of initial warmup is reduced due to the innate warmup effect of Adam bias correction [[5]](#f3)
+# and the gradual thawing we are performing. Note that commonly used LR schedulers that depend on providing
+# max_iterations/epochs (e.g. the
+# [CosineWarmupScheduler](https://github.com/PyTorchLightning/lightning-tutorials/blob/0c325829101d5a6ebf32ed99bbf5b09badf04a59/course_UvA-DL/05-transformers-and-MH-attention/Transformers_MHAttention.py#L688)
+# used in other pytorch-lightning tutorials) also work with FinetuningScheduler. Though the LR scheduler is theoretically
+# justified [(Loshchilov & Hutter, 2016)](#f4), the particular values provided here are primarily empirically driven.
+#
+#
+
+
+# %%
+lr_scheduler_init = {"T_0": 1, "T_mult": 2, "eta_min": 1e-07}
+
+# %%
+# Load our lightning module...
+lightning_module_kwargs = {
+ "model_name_or_path": "microsoft/deberta-v3-base",
+ "optimizer_init": optimizer_init,
+ "lr_scheduler_init": lr_scheduler_init,
+}
+model = RteBoolqModule(**lightning_module_kwargs, experiment_tag="fts_explicit")
+
+# %% [markdown]
+# ### Callback Configuration
+#
+# The only callback required to invoke scheduled finetuning is the [FinetuningScheduler](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html#finetuning_scheduler.fts.FinetuningScheduler) callback itself.
+# Default versions of [FTSCheckpoint](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts_supporters.html#finetuning_scheduler.fts_supporters.FTSCheckpoint) and [FTSEarlyStopping](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts_supporters.html#finetuning_scheduler.fts_supporters.FTSEarlyStopping)
+# (unless ``epoch_transitions_only`` is specified) will be included ([as discussed above](#basic_usage)) if not provided
+# in the callbacks list. For demonstration purposes, we include example configurations of all three callbacks below.
+
+# %%
+# let's save our callback configurations for the explicit scenario since we'll be reusing the same
+# configurations for the implicit and nofts_baseline scenarios (except the config for the
+# FinetuningScheduler callback itself of course in the case of nofts_baseline)
+earlystopping_kwargs = {"monitor": "val_loss", "min_delta": 0.001, "patience": 2}
+checkpoint_kwargs = {"monitor": "val_loss", "save_top_k": 1}
+fts_kwargs = {"max_depth": 1}
+callbacks = [
+ FinetuningScheduler(ft_schedule=ft_schedule_name, **fts_kwargs), # type: ignore # noqa
+ FTSEarlyStopping(**earlystopping_kwargs), # type: ignore # noqa
+ FTSCheckpoint(**checkpoint_kwargs), # type: ignore # noqa
+]
+
+# %%
+logger = TensorBoardLogger("lightning_logs", name="fts_explicit")
+# optionally start tensorboard and monitor progress graphically while viewing multi-phase finetuning specific training
+# logs in the cell output below by uncommenting the next 2 lines
+# # %load_ext tensorboard
+# # %tensorboard --logdir lightning_logs
+# disable progress bar by default to focus on multi-phase training logs. Set to True to re-enable if desired
+enable_progress_bar = False
+
+# %%
+
+
+def train() -> None:
+ trainer = pl.Trainer(
+ enable_progress_bar=enable_progress_bar,
+ max_epochs=100,
+ precision=16,
+ accelerator="auto",
+ devices=1 if torch.cuda.is_available() else None,
+ callbacks=callbacks,
+ logger=logger,
+ )
+ trainer.fit(model, datamodule=dm)
+
+
+print(
+ "Note given the computation associated w/ the multiple phases of finetuning demonstrated, this notebook is best used with an accelerator"
+)
+train()
+
+# %% [markdown]
+# ### Running the Baseline and Implicit Finetuning Scenarios
+#
+# Let's now compare our ``nofts_baseline`` and ``fts_implicit`` scenarios with the ``fts_explicit`` one we just ran.
+#
+# We'll need to update our callbacks list, using the core PL ``EarlyStopping`` and ``ModelCheckpoint`` callbacks for the
+# ``nofts_baseline`` (which operate identically to their FTS analogs apart from the recursive training support).
+# For both core PyTorch Lightning and user-registered callbacks, we can define our callbacks using a dictionary as we do
+# with the LightningCLI. This allows us to avoid managing imports and support more complex configuration separated from
+# code.
+#
+# Note that we'll be using identical callback configurations to the ``fts_explicit`` scenario. Keeping [max_depth](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html?highlight=max_depth#finetuning_scheduler.fts.FinetuningScheduler.params.max_depth) set to 1 for
+# the implicit schedule will limit finetuning to just the last 4 parameters of the model, which is only a small fraction
+# of the parameters you'd want to tune for maximum performance. Since the implicit schedule is quite computationally
+# intensive and most useful for exploring model behavior, leaving [max_depth](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html?highlight=max_depth#finetuning_scheduler.fts.FinetuningScheduler.params.max_depth) at 1 allows us to demo implicit mode
+# behavior while keeping the computational cost and runtime of this notebook reasonable. To review how a full implicit
+# mode run compares to the ``nofts_baseline`` and ``fts_explicit`` scenarios, please see the following
+# [tensorboard experiment summary](https://tensorboard.dev/experiment/n7U8XhrzRbmvVzC4SQSpWw/).
+
+
+# %%
+nofts_callbacks = [EarlyStopping(**earlystopping_kwargs), ModelCheckpoint(**checkpoint_kwargs)]
+fts_implicit_callbacks = [
+ FinetuningScheduler(**fts_kwargs), # type: ignore # noqa
+ FTSEarlyStopping(**earlystopping_kwargs), # type: ignore # noqa
+ FTSCheckpoint(**checkpoint_kwargs), # type: ignore # noqa
+]
+scenario_callbacks = {"nofts_baseline": nofts_callbacks, "fts_implicit": fts_implicit_callbacks}
+
+# %%
+for scenario_name, scenario_cbs in scenario_callbacks.items():
+    model = RteBoolqModule(**lightning_module_kwargs, experiment_tag=scenario_name)
+    logger = TensorBoardLogger("lightning_logs", name=scenario_name)
+    callbacks = scenario_cbs
+ print(f"Beginning training the '{scenario_name}' scenario")
+ train()
+
+# %% [markdown]
+# ### Reviewing the Training Results
+#
+# See the [tensorboard experiment summaries](https://tensorboard.dev/experiment/n7U8XhrzRbmvVzC4SQSpWw/) to get a sense
+# of the relative computational and performance tradeoffs associated with these [FinetuningScheduler](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html#finetuning_scheduler.fts.FinetuningScheduler) configurations.
+# The summary compares a full ``fts_implicit`` execution to ``fts_explicit`` and ``nofts_baseline`` scenarios using DDP
+# training with 2 GPUs. The full logs/schedules for all three scenarios are available
+# [here](https://drive.google.com/file/d/1LrUcisRLHeJgh_BDOOD_GUBPp5iHAkoR/view?usp=sharing) and the checkpoints
+# produced in the scenarios [here](https://drive.google.com/file/d/1t7myBgcqcZ9ax_IT9QVk-vFH_l_o5UXB/view?usp=sharing)
+# (caution, ~3.5GB).
+#
+# [{height="315px" width="492px"}](https://tensorboard.dev/experiment/n7U8XhrzRbmvVzC4SQSpWw/#scalars&_smoothingWeight=0&runSelectionState=eyJmdHNfZXhwbGljaXQiOnRydWUsIm5vZnRzX2Jhc2VsaW5lIjpmYWxzZSwiZnRzX2ltcGxpY2l0IjpmYWxzZX0%3D)
+# [{height="316px" width="505px"}](https://tensorboard.dev/experiment/n7U8XhrzRbmvVzC4SQSpWw/#scalars&_smoothingWeight=0&runSelectionState=eyJmdHNfZXhwbGljaXQiOmZhbHNlLCJub2Z0c19iYXNlbGluZSI6dHJ1ZSwiZnRzX2ltcGxpY2l0IjpmYWxzZX0%3D)
+#
+# Note that the results generated by this notebook (which uses DP and a single GPU) may vary by roughly 1% from the
+# linked tensorboard summaries.
+#
+# [FinetuningScheduler](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html#finetuning_scheduler.fts.FinetuningScheduler) expands the space of possible finetuning schedules and the composition of more sophisticated schedules can
+# yield marginal finetuning performance gains. That said, it should be emphasized that the primary utility of [FinetuningScheduler](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html#finetuning_scheduler.fts.FinetuningScheduler) is to grant
+# greater finetuning flexibility for model exploration in research. For example, glancing at DeBERTa-v3's implicit training
+# run, a critical tuning transition point is immediately apparent:
+#
+# [{height="272px" width="494px"}](https://tensorboard.dev/experiment/n7U8XhrzRbmvVzC4SQSpWw/#scalars&_smoothingWeight=0&runSelectionState=eyJmdHNfZXhwbGljaXQiOmZhbHNlLCJub2Z0c19iYXNlbGluZSI6ZmFsc2UsImZ0c19pbXBsaWNpdCI6dHJ1ZX0%3D)
+#
+# Our `val_loss` begins a precipitous decline at step 3119, which corresponds to phase 17 in the schedule. Referring to our
+# schedule, in phase 17 we begin tuning the attention parameters of our 10th encoder layer (of 11). Interesting!
+# Though beyond the scope of this tutorial, it might be worth investigating these dynamics further and
+# [FinetuningScheduler](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html#finetuning_scheduler.fts.FinetuningScheduler) allows one to do just that quite easily.
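+#
+# For instance (a minimal sketch only; capping at depth 17 and reusing the earlier callback kwargs are
+# illustrative assumptions, not a prescribed workflow):
+#
+# ```python
+# # probe the observed transition by capping an implicit-mode run at the depth of interest
+# probe_callbacks = [
+#     FinetuningScheduler(max_depth=17),
+#     FTSEarlyStopping(**earlystopping_kwargs),
+#     FTSCheckpoint(**checkpoint_kwargs),
+# ]
+# ```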
+#
+# %% [markdown]
+#
+# Note that though this example is intended to capture a common usage scenario, substantial variation is expected
+# among use cases and models.
+# In summary, [FinetuningScheduler](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html#finetuning_scheduler.fts.FinetuningScheduler) provides increased finetuning flexibility that can be useful in a variety of
+# contexts from exploring model tuning behavior to maximizing performance.
+# %% [markdown]
+# ## Footnotes
+#
+#
+# 1. [Howard, J., & Ruder, S. (2018)](https://arxiv.org/pdf/1801.06146.pdf). Fine-tuned Language
+#    Models for Text Classification. ArXiv, abs/1801.06146. [↩](#a1)
+#
+# 2. [Chronopoulou, A., Baziotis, C., & Potamianos, A. (2019)](https://arxiv.org/pdf/1902.10547.pdf).
+#    An embarrassingly simple approach for transfer learning from pretrained language models. arXiv
+#    preprint arXiv:1902.10547. [↩](#a1)
+#
+# 3. [Peters, M. E., Ruder, S., & Smith, N. A. (2019)](https://arxiv.org/pdf/1903.05987.pdf). To tune or not to
+#    tune? Adapting pretrained representations to diverse tasks. arXiv preprint arXiv:1903.05987. [↩](#a1)
+#
+# 4. [Sivaprasad, P. T., Mai, F., Vogels, T., Jaggi, M., & Fleuret, F. (2020)](https://arxiv.org/pdf/1910.11758.pdf).
+#    Optimizer benchmarking needs to account for hyperparameter tuning. In International Conference on Machine Learning
+#    (pp. 9036-9045). PMLR. [↩](#a2)
+#
+# 5. [Mosbach, M., Andriushchenko, M., & Klakow, D. (2020)](https://arxiv.org/pdf/2006.04884.pdf). On the stability of
+#    fine-tuning BERT: Misconceptions, explanations, and strong baselines. arXiv preprint arXiv:2006.04884. [↩](#a2)
+#
+# 6. [Loshchilov, I., & Hutter, F. (2016)](https://arxiv.org/pdf/1608.03983.pdf). SGDR: Stochastic gradient descent with
+#    warm restarts. arXiv preprint arXiv:1608.03983. [↩](#a3)
+#
+
+# %% [markdown]
+#
diff --git a/lightning_examples/finetuning-scheduler/fts_explicit_accuracy.png b/lightning_examples/finetuning-scheduler/fts_explicit_accuracy.png
new file mode 100644
index 000000000..b5d8f554a
Binary files /dev/null and b/lightning_examples/finetuning-scheduler/fts_explicit_accuracy.png differ
diff --git a/lightning_examples/finetuning-scheduler/fts_explicit_loss_anim.gif b/lightning_examples/finetuning-scheduler/fts_explicit_loss_anim.gif
new file mode 100644
index 000000000..7451f65c0
Binary files /dev/null and b/lightning_examples/finetuning-scheduler/fts_explicit_loss_anim.gif differ
diff --git a/lightning_examples/finetuning-scheduler/implicit_training_transition.png b/lightning_examples/finetuning-scheduler/implicit_training_transition.png
new file mode 100644
index 000000000..6854dbfd9
Binary files /dev/null and b/lightning_examples/finetuning-scheduler/implicit_training_transition.png differ
diff --git a/lightning_examples/finetuning-scheduler/logo_fts.png b/lightning_examples/finetuning-scheduler/logo_fts.png
new file mode 100644
index 000000000..00599a54d
Binary files /dev/null and b/lightning_examples/finetuning-scheduler/logo_fts.png differ
diff --git a/lightning_examples/finetuning-scheduler/nofts_baseline_accuracy.png b/lightning_examples/finetuning-scheduler/nofts_baseline_accuracy.png
new file mode 100644
index 000000000..b78f8c675
Binary files /dev/null and b/lightning_examples/finetuning-scheduler/nofts_baseline_accuracy.png differ
diff --git a/lightning_examples/finetuning-scheduler/side_by_side_yaml.png b/lightning_examples/finetuning-scheduler/side_by_side_yaml.png
new file mode 100644
index 000000000..3a32a1f3f
Binary files /dev/null and b/lightning_examples/finetuning-scheduler/side_by_side_yaml.png differ