diff --git a/docs/source/conf.py b/docs/source/conf.py index 57fa832fb..15b8807ad 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -94,11 +94,14 @@ def get_by_name(string: str): """ Import by name and return imported module/function/class - Args: - string (str): module/function/class to import, e.g. 'pandas.read_csv' will return read_csv function as - defined by pandas - - Returns: + Parameters + ---------- + string (str): + module/function/class to import, e.g. 'pandas.read_csv' + will return read_csv function as defined by pandas + + Returns + ------- imported object """ class_name = string.split(".")[-1] diff --git a/examples/ar.py b/examples/ar.py index 4422302af..0d677b22c 100644 --- a/examples/ar.py +++ b/examples/ar.py @@ -98,7 +98,7 @@ # deepar.hparams.log_val_interval = -1 # trainer.limit_train_batches = 1.0 # res = Tuner(trainer).lr_find( -# deepar, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader, min_lr=1e-5, max_lr=1e2 +# deepar, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader, min_lr=1e-5, max_lr=1e2 # noqa: E501 # ) # print(f"suggested learning rate: {res.suggestion()}") diff --git a/examples/nbeats.py b/examples/nbeats.py index ce2f636c6..a450d8533 100644 --- a/examples/nbeats.py +++ b/examples/nbeats.py @@ -87,7 +87,7 @@ # trainer.limit_train_batches = 1.0 # # run learning rate finder # res = Tuner(trainer).lr_find( -# net, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader, min_lr=1e-5, max_lr=1e2 +# net, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader, min_lr=1e-5, max_lr=1e2 # noqa: E501 # ) # print(f"suggested learning rate: {res.suggestion()}") # fig = res.plot(show=True, suggest=True) diff --git a/examples/stallion.py b/examples/stallion.py index 0066c3e22..9f5a5d5fa 100644 --- a/examples/stallion.py +++ b/examples/stallion.py @@ -34,7 +34,7 @@ data["avg_volume_by_agency"] = data.groupby( ["time_idx", "agency"], observed=True ).volume.transform("mean") -# data = data[lambda x: (x.sku == data.iloc[0]["sku"]) & (x.agency == data.iloc[0]["agency"])] +# data = data[lambda x: (x.sku == data.iloc[0]["sku"]) & (x.agency == data.iloc[0]["agency"])] # noqa: E501 special_days = [ "easter_day", "good_friday", @@ -151,7 +151,7 @@ # trainer.limit_train_batches = 1.0 # # run learning rate finder # res = Tuner(trainer).lr_find( -# tft, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader, min_lr=1e-5, max_lr=1e2 +# tft, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader, min_lr=1e-5, max_lr=1e2 # noqa: E501 # ) # print(f"suggested learning rate: {res.suggestion()}") # fig = res.plot(show=True, suggest=True) diff --git a/pyproject.toml b/pyproject.toml index 0f5ef235a..2e5154717 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -136,7 +136,9 @@ exclude = [ ".venv/", ".git/", ".history/", + "docs/source/tutorials/", ] +target-version = "py39" [tool.ruff.lint] select = ["E", "F", "W", "C4", "S"] @@ -161,6 +163,11 @@ known-first-party = ["pytorch_forecasting"] combine-as-imports = true force-sort-within-sections = true +[tool.ruff.lint.per-file-ignores] +"pytorch_forecasting/data/timeseries.py" = [ + "E501", # Line too long being fixed in #1746 To be removed after merging +] + [tool.black] line-length = 88 include = '\.pyi?$' @@ -188,3 +195,6 @@ exclude = ''' [tool.nbqa.mutate] ruff = 1 black = 1 + +[tool.nbqa.exclude] +ruff = "docs/source/tutorials/" # ToDo: Remove this when fixing notebooks diff --git a/pytorch_forecasting/data/__init__.py 
b/pytorch_forecasting/data/__init__.py index ef60114b8..301c8394d 100644 --- a/pytorch_forecasting/data/__init__.py +++ b/pytorch_forecasting/data/__init__.py @@ -1,8 +1,8 @@ """ Datasets, etc. for timeseries data. -Handling timeseries data is not trivial. It requires special treatment. This sub-package provides the necessary tools -to abstracts the necessary work. +Handling timeseries data is not trivial. It requires special treatment. +This sub-package provides the necessary tools to abstract the necessary work. """ from pytorch_forecasting.data.encoders import ( diff --git a/pytorch_forecasting/data/encoders.py b/pytorch_forecasting/data/encoders.py index 562ad66ed..6869153a6 100644 --- a/pytorch_forecasting/data/encoders.py +++ b/pytorch_forecasting/data/encoders.py @@ -181,12 +181,16 @@ def get_transform( ) -> Dict[str, Callable]: """Return transformation functions. - Args: - transformation (Union[str, Dict[str, Callable]]): name of transformation or - dictionary with transformation information. + Parameters + ---------- + transformation: Union[str, Dict[str, Callable]] + name of transformation or dictionary with transformation information. - Returns: - Dict[str, Callable]: dictionary with transformation functions (forward, reverse, inverse and inverse_torch) + Returns + ------- + Dict[str, Callable] + dictionary with transformation functions + (forward, reverse, inverse and inverse_torch) """ if isinstance(transformation, str): transform = cls.TRANSFORMATIONS[transformation] @@ -204,8 +208,15 @@ def preprocess( Uses ``transform`` attribute to determine how to apply transform. - Returns: - Union[np.ndarray, torch.Tensor]: return rescaled series with type depending on input type + Parameters + ---------- + y: Union[pd.Series, pd.DataFrame, np.ndarray, torch.Tensor] + input data + + Returns + ------- + Union[np.ndarray, torch.Tensor] + return rescaled series with type depending on input type """ if self.transformation is None: return y @@ -229,8 +240,15 @@ def inverse_preprocess( Uses ``transform`` attribute to determine how to apply inverse transform. - Returns: - Union[np.ndarray, torch.Tensor]: return rescaled series with type depending on input type + Parameters + ---------- + y: Union[pd.Series, np.ndarray, torch.Tensor] + input data + + Returns + ------- + Union[np.ndarray, torch.Tensor] + return rescaled series with type depending on input type """ if self.transformation is None: pass @@ -249,15 +267,18 @@ class NaNLabelEncoder( ): """ Labelencoder that can optionally always encode nan and unknown classes (in transform) as class ``0`` - """ + """ # noqa: E501 def __init__(self, add_nan: bool = False, warn: bool = True): """ init NaNLabelEncoder - Args: - add_nan: if to force encoding of nan at 0 - warn: if to warn if additional nans are added because items are unknown + Parameters + ---------- + add_nan + whether to force encoding of nan at 0 + warn + whether to warn if additional nans are added because items are unknown """ self.add_nan = add_nan self.warn = warn @@ -267,12 +288,17 @@ def fit_transform(self, y: pd.Series, overwrite: bool = False) -> np.ndarray: """ Fit and transform data. - Args: - y (pd.Series): input data - overwrite (bool): if to overwrite current mappings or if to add to it. + Parameters + ---------- + y: pd.Series + input data + overwrite: bool + whether to overwrite current mappings or to add to them.
- Returns: - np.ndarray: encoded data + Returns + ------- + np.ndarray + encoded data """ self.fit(y, overwrite=overwrite) return self.transform(y) @@ -280,14 +306,18 @@ def fit_transform(self, y: pd.Series, overwrite: bool = False) -> np.ndarray: @staticmethod def is_numeric(y: pd.Series) -> bool: """ - Determine if series is numeric or not. Will also return True if series is a categorical type with - underlying integers. + Determine if series is numeric or not. Will also return True + if series is a categorical type with underlying integers. - Args: - y (pd.Series): series for which to carry out assessment + Parameters + ---------- + y: pd.Series + series for which to carry out assessment - Returns: - bool: True if series is numeric + Returns + ------- + bool + True if series is numeric """ return y.dtype.kind in "bcif" or ( isinstance(y.dtype, pd.CategoricalDtype) @@ -298,12 +328,16 @@ def fit(self, y: pd.Series, overwrite: bool = False): """ Fit transformer - Args: - y (pd.Series): input data to fit on - overwrite (bool): if to overwrite current mappings or if to add to it. + Parameters + ---------- + y: pd.Series + input data to fit on + overwrite: bool + whether to overwrite current mappings or if to add to it. - Returns: - NaNLabelEncoder: self + Returns + ------- + NaNLabelEncoder: self """ if not overwrite and hasattr(self, "classes_"): offset = len(self.classes_) @@ -341,23 +375,33 @@ def transform( """ Encode iterable with integers. - Args: - y (Iterable): iterable to encode - return_norm: only exists for compatability with other encoders - returns a tuple if true. - target_scale: only exists for compatability with other encoders - has no effect. - ignore_na (bool): if to ignore na values and map them to zeros - (this is different to `add_nan=True` option which maps ONLY NAs to zeros - while this options maps the first class and NAs to zeros) - - Returns: - Union[torch.Tensor, np.ndarray]: returns encoded data as torch tensor or numpy array depending on input type + Parameters + ---------- + y: Iterable + iterable to encode + return_norm + only exists for compatability with other encoders - returns a tuple if true. + target_scale + only exists for compatability with other encoders - has no effect. + ignore_na: bool + if to ignore na values and map them to zeros + (this is different to `add_nan=True` option which maps ONLY NAs to zeros + while this options maps the first class and NAs to zeros) + + Returns + ------- + Union[torch.Tensor, np.ndarray] + returns encoded data as torch tensor or numpy array depending on input type """ if self.add_nan: if self.warn: cond = np.array([item not in self.classes_ for item in y]) if cond.any(): warnings.warn( - f"Found {np.unique(np.asarray(y)[cond]).size} unknown classes which were set to NaN", + ( + f"Found {np.unique(np.asarray(y)[cond]).size} " + "unknown classes which were set to NaN" + ), UserWarning, ) @@ -372,7 +416,8 @@ def transform( encoded = [self.classes_[v] for v in y] except KeyError as e: raise KeyError( - f"Unknown category '{e.args[0]}' encountered. Set `add_nan=True` to allow unknown categories" + f"Unknown category '{e.args[0]}' encountered. " + "Set `add_nan=True` to allow unknown categories" ) if isinstance(y, torch.Tensor): @@ -389,14 +434,20 @@ def inverse_transform(self, y: Union[torch.Tensor, np.ndarray]) -> np.ndarray: """ Decode data, i.e. transform from integers to labels. 
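A minimal usage sketch of the label encoder documented in the hunks above (illustrative only; it assumes the constructor and method signatures shown in this diff, and the sample series is made up):

import pandas as pd

from pytorch_forecasting.data.encoders import NaNLabelEncoder

# add_nan=True reserves class 0 for NaN and unknown categories
encoder = NaNLabelEncoder(add_nan=True, warn=True)
encoded = encoder.fit_transform(pd.Series(["a", "b", "b"]))  # np.ndarray of integer codes

# a category not seen during fit is mapped to 0, with a warning
codes = encoder.transform(pd.Series(["a", "c"]))

# decode integer codes back to labels; unseen codes raise a KeyError
labels = encoder.inverse_transform(encoded)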
- Args: - y (Union[torch.Tensor, np.ndarray]): encoded data + Parameters + ---------- + y: Union[torch.Tensor, np.ndarray] + encoded data - Raises: - KeyError: if unknown elements should be decoded + Raises + ------ + KeyError + if unknown elements should be decoded - Returns: - np.ndarray: decoded data + Returns + ------- + np.ndarray + decoded data """ if y.max() >= len(self.classes_vector_): raise KeyError("New unknown values detected") @@ -405,17 +456,23 @@ def inverse_transform(self, y: Union[torch.Tensor, np.ndarray]) -> np.ndarray: decoded = self.classes_vector_[y] return decoded - def __call__(self, data: Dict[str, torch.Tensor]) -> torch.Tensor: + def __call__(self, data: dict[str, torch.Tensor]) -> torch.Tensor: """ Extract prediction from network output. Does not map back to input categories as this would require a numpy tensor without grad-abilities. - Args: - data (Dict[str, torch.Tensor]): Dictionary with entries - * prediction: data to de-scale + Parameters + ---------- + data: dict[str, torch.Tensor] + Dictionary with entries + + * prediction: data to de-scale + * target_scale: center and scale of data - Returns: - torch.Tensor: prediction + Returns + ------- + torch.Tensor + prediction """ return data["prediction"] @@ -425,8 +482,10 @@ def get_parameters(self, groups=None, group_names=None) -> np.ndarray: All parameters are unused - exists for compatability. - Returns: - np.ndarray: zero array. + Returns + ------- + np.ndarray + zero array. """ return np.zeros(2, dtype=np.float64) @@ -446,30 +505,42 @@ def __init__( method_kwargs: Optional[Dict[str, Any]] = None, ): """ - Args: - method (str, optional): method to rescale series. Either "identity", "standard" (standard scaling) - or "robust" (scale using quantiles 0.25-0.75). Defaults to "standard". - method_kwargs (Dict[str, Any], optional): Dictionary of method specific arguments as listed below - * "robust" method: "upper", "lower", "center" quantiles defaulting to 0.75, 0.25 and 0.5 - center (bool, optional): If to center the output to zero. Defaults to True. - transformation (Union[str, Dict[str, Callable]] optional): Transform values before - applying normalizer. Available options are - - * None (default): No transformation of values - * log: Estimate in log-space leading to a multiplicative model - * log1p: Estimate in log-space but add 1 to values before transforming for stability - (e.g. if many small values <<1 are present). - Note, that inverse transform is still only `torch.exp()` and not `torch.expm1()`. - * logit: Apply logit transformation on values that are between 0 and 1 - * count: Apply softplus to output (inverse transformation) and x + 1 to input (transformation) - * softplus: Apply softplus to output (inverse transformation) and inverse softplus to input - (transformation) - * relu: Apply max(0, x) to output - * Dict[str, Callable] of PyTorch functions that transforms and inversely transforms values. - ``forward`` and ``reverse`` entries are required. ``inverse`` transformation is optional and - should be defined if ``reverse`` is not the inverse of the forward transformation. ``inverse_torch`` - can be defined to provide a torch distribution transform for inverse transformations. - """ + Parameters + ---------- + method: str, optional, default="standard" + method to rescale series. Either "identity", "standard" (standard scaling) + or "robust" (scale using quantiles 0.25-0.75). Defaults to "standard". 
+ method_kwargs: Dict[str, Any], optional, default=None + Dictionary of method specific arguments as listed below + + - "robust" method: "upper", "lower", "center" quantiles defaulting to 0.75, 0.25 and 0.5 + + center: bool, optional, default=True + If to center the output to zero. Defaults to True. + transformation: Union[str, Dict[str, Callable]], optional, default=None + Transform values before applying normalizer. Available options are + + - None (default): No transformation of values + - log: Estimate in log-space leading to a multiplicative model + - log1p: Estimate in log-space but add 1 to values before transforming for stability + + (e.g. if many small values <<1 are present). + Note that inverse transform is still only `torch.exp()` and + not `torch.expm1()`. + + - logit: Apply logit transformation on values that are between 0 and 1 + - count: Apply softplus to output (inverse transformation) and x + 1 to input (transformation) + - softplus: Apply softplus to output (inverse transformation) and inverse softplus to input (transformation) + - relu: Apply max(0, x) to output + - Dict[str, Callable] of PyTorch functions that transforms and inversely transforms values. + + ``forward`` and ``reverse`` entries are required. ``inverse`` + transformation is optional and + should be defined if ``reverse`` is not the + inverse of the forward transformation. ``inverse_torch`` + can be defined to provide a torch distribution transform for + inverse transformations. + """ # noqa: E501 self.method = method assert method in [ "standard", @@ -487,8 +558,10 @@ def get_parameters(self, *args, **kwargs) -> torch.Tensor: """ Returns parameters that were used for encoding. - Returns: - torch.Tensor: First element is center of data and second is scale + Returns + ------- + torch.Tensor + First element is center of data and second is scale """ return torch.stack( [torch.as_tensor(self.center_), torch.as_tensor(self.scale_)], dim=-1 @@ -498,11 +571,14 @@ def fit(self, y: Union[pd.Series, np.ndarray, torch.Tensor]): """ Fit transformer, i.e.
determine center and scale of data - Args: - y (Union[pd.Series, np.ndarray, torch.Tensor]): input data + Parameters + ---------- + y: Union[pd.Series, np.ndarray, torch.Tensor] + input data - Returns: - TorchNormalizer: self + Returns + ------- + TorchNormalizer: self """ y = self.preprocess(y) self._set_parameters(y_center=y, y_scale=y) @@ -516,9 +592,12 @@ def _set_parameters( """ Calculate parameters for scale and center based on input timeseries - Args: - y_center (Union[pd.Series, np.ndarray, torch.Tensor]): timeseries for calculating center - y_scale (Union[pd.Series, np.ndarray, torch.Tensor]): timeseries for calculating scale + Parameters + ---------- + y_center: Union[pd.Series, np.ndarray, torch.Tensor] + timeseries for calculating center + y_scale: Union[pd.Series, np.ndarray, torch.Tensor] + timeseries for calculating scale """ if isinstance(y_center, torch.Tensor): eps = torch.finfo(y_center.dtype).eps @@ -529,8 +608,10 @@ def _set_parameters( self.center_ = torch.zeros(y_center.size()[:-1]) self.scale_ = torch.ones(y_scale.size()[:-1]) elif isinstance(y_center, (np.ndarray, pd.Series, pd.DataFrame)): - # numpy default type is numpy.float64 while torch default type is torch.float32 (if not changed) - # therefore, we first generate torch tensors (with torch default type) and then + # numpy default type is numpy.float64 while torch default + # type is torch.float32 (if not changed) + # therefore, we first generate torch tensors + # (with torch default type) and then # convert them to numpy arrays self.center_ = torch.zeros(y_center.shape[:-1]).numpy() self.scale_ = torch.ones(y_scale.shape[:-1]).numpy() @@ -548,7 +629,8 @@ def _set_parameters( else: self.center_ = np.mean(y_center) self.scale_ = np.std(y_scale) + eps - # correct numpy scalar dtype promotion, e.g. fix type from `np.float32(0.0) + 1e-8` gives `np.float64(1e-8)` + # correct numpy scalar dtype promotion, e.g. fix type from + # `np.float32(0.0) + 1e-8` gives `np.float64(1e-8)` if isinstance(self.scale_, np.ndarray): self.scale_ = self.scale_.astype(y_scale.dtype) @@ -606,15 +688,20 @@ def transform( """ Rescale data. - Args: - y (Union[pd.Series, np.ndarray, torch.Tensor]): input data - return_norm (bool, optional): [description]. Defaults to False. - target_scale (torch.Tensor): target scale to use instead of fitted center and scale - - Returns: - Union[Tuple[Union[np.ndarray, torch.Tensor], np.ndarray], Union[np.ndarray, torch.Tensor]]: rescaled - data with type depending on input type. returns second element if ``return_norm=True`` - """ + Parameters + ---------- + y: Union[pd.Series, np.ndarray, torch.Tensor] + input data + return_norm: bool, optional, default=False + [description]. Defaults to False. + target_scale: torch.Tensor, optional, default=None + target scale to use instead of fitted center and scale + + Returns + ------- + Union[Tuple[Union[np.ndarray, torch.Tensor],np.ndarray],Union[np.ndarray, torch.Tensor]] + rescaled data with type depending on input type. returns second element if ``return_norm=True`` + """ # noqa: E501 y = self.preprocess(y) # get center and scale if target_scale is None: @@ -643,11 +730,15 @@ def inverse_transform(self, y: torch.Tensor) -> torch.Tensor: """ Inverse scale. 
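A short sketch of the normalizer API whose docstrings are rewritten above (illustrative; it assumes the signatures shown in this diff and uses random data):

import torch

from pytorch_forecasting.data.encoders import TorchNormalizer

# "log1p" is one of the string transformation options listed above; a custom dict
# such as {"forward": torch.log1p, "reverse": torch.expm1} is accepted as well
normalizer = TorchNormalizer(method="robust", transformation="log1p")

y = torch.rand(64) * 10
normalizer.fit(y)
scaled, norm = normalizer.transform(y, return_norm=True)  # second element holds center and scale
print(normalizer.get_parameters())                        # tensor([center, scale])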
- Args: - y (torch.Tensor): scaled data + Parameters + ---------- + y: torch.Tensor + scaled data - Returns: - torch.Tensor: de-scaled data + Returns + ------- + torch.Tensor + de-scaled data """ return self(dict(prediction=y, target_scale=self.get_parameters().unsqueeze(0))) @@ -655,13 +746,18 @@ def __call__(self, data: Dict[str, torch.Tensor]) -> torch.Tensor: """ Inverse transformation but with network output as input. - Args: - data (Dict[str, torch.Tensor]): Dictionary with entries - * prediction: data to de-scale - * target_scale: center and scale of data + Parameters + ---------- + data: Dict[str, torch.Tensor] + Dictionary with entries + + * prediction: data to de-scale + * target_scale: center and scale of data - Returns: - torch.Tensor: de-scaled data + Returns + ------- + torch.Tensor + de-scaled data """ # ensure output dtype matches input dtype dtype = data["prediction"].dtype @@ -703,33 +799,42 @@ def __init__( """ Initialize - Args: - method (str, optional): method to rescale series. Either "identity", "standard" (standard scaling) - or "robust" (scale using quantiles 0.25-0.75). Defaults to "standard". - method_kwargs (Dict[str, Any], optional): Dictionary of method specific arguments as listed below + Parameters + ---------- + method: str, optional, default="standard" + method to rescale series. Either "identity", "standard" (standard scaling) + or "robust" (scale using quantiles 0.25-0.75). Defaults to "standard". + method_kwargs: Dict[str, Any], optional, default=None + Dictionary of method specific arguments as listed below + * "robust" method: "upper", "lower", "center" quantiles defaulting to 0.75, 0.25 and 0.5 - center (bool, optional): If to center the output to zero. Defaults to True. - max_length(Union[int, List[int]], optional): Maximum length to take into account for calculating - parameters. If tuple, first length is maximum length for calculating center and second is maximum - length for calculating scale. Defaults to entire length of time series. - transformation (Union[str, Tuple[Callable, Callable]] optional): Transform values before - applying normalizer. Available options are + + center: bool, optional, default=True + If to center the output to zero. Defaults to True. + max_length: Union[int, List[int]], optional + Maximum length to take into account for calculating parameters. + If tuple, first length is maximum length for calculating center and second is maximum + length for calculating scale. Defaults to entire length of time series. + transformation: Union[str, Tuple[Callable, Callable]] optional: + Transform values before applying normalizer. Available options are * None (default): No transformation of values * log: Estimate in log-space leading to a multiplicative model * log1p: Estimate in log-space but add 1 to values before transforming for stability + (e.g. if many small values <<1 are present). Note, that inverse transform is still only `torch.exp()` and not `torch.expm1()`. + * logit: Apply logit transformation on values that are between 0 and 1 * count: Apply softplus to output (inverse transformation) and x + 1 to input (transformation) - * softplus: Apply softplus to output (inverse transformation) and inverse softplus to input - (transformation) + * softplus: Apply softplus to output (inverse transformation) and inverse softplus to input (transformation) * relu: Apply max(0, x) to output * Dict[str, Callable] of PyTorch functions that transforms and inversely transforms values. + ``forward`` and ``reverse`` entries are required. 
``inverse`` transformation is optional and should be defined if ``reverse`` is not the inverse of the forward transformation. ``inverse_torch`` can be defined to provide a torch distribution transform for inverse transformations. - """ + """ # noqa: E501 method_kwargs = deepcopy(method_kwargs) if method_kwargs is not None else {} super().__init__( method=method, @@ -743,11 +848,14 @@ def fit(self, y: Union[pd.Series, np.ndarray, torch.Tensor]): """ Fit transformer, i.e. determine center and scale of data - Args: - y (Union[pd.Series, np.ndarray, torch.Tensor]): input data + Parameters + ---------- + y: Union[pd.Series, np.ndarray, torch.Tensor] + input data - Returns: - TorchNormalizer: self + Returns + ------- + TorchNormalizer: self """ # reduce size of time series - take only max length if self.max_length is None: @@ -784,12 +892,17 @@ def _slice( """ Slice pandas data frames, numpy arrays and tensors. - Args: - x (Union[pd.Series, np.ndarray, torch.Tensor]): object to slice - s (slice): slice, e.g. ``slice(None, -5)``` + Parameters + ---------- + x: Union[pd.Series, np.ndarray, torch.Tensor] + object to slice + s: slice + slice, e.g. ``slice(None, -5)``` - Returns: - Union[pd.Series, np.ndarray, torch.Tensor]: sliced object + Returns + ------- + Union[pd.Series, np.ndarray, torch.Tensor] + sliced object """ if isinstance(x, (pd.DataFrame, pd.Series)): @@ -802,8 +915,8 @@ class GroupNormalizer(TorchNormalizer): """ Normalizer that scales by groups. - For each group a scaler is fitted and applied. This scaler can be used as target normalizer or - also to normalize any other variable. + For each group a scaler is fitted and applied. This scaler can be used + as target normalizer or also to normalize any other variable. """ def __init__( @@ -818,23 +931,33 @@ def __init__( """ Group normalizer to normalize a given entry by groups. Can be used as target normalizer. - Args: - method (str, optional): method to rescale series. Either "standard" (standard scaling) or "robust" - (scale using quantiles 0.25-0.75). Defaults to "standard". - method_kwargs (Dict[str, Any], optional): Dictionary of method specific arguments as listed below + Parameters + ---------- + method: str, optional, default="standard" + method to rescale series. Either "standard" (standard scaling) or "robust" + (scale using quantiles 0.25-0.75). Defaults to "standard". + method_kwargs: Dict[str, Any], optional, default=None + Dictionary of method specific arguments as listed below + * "robust" method: "upper", "lower", "center" quantiles defaulting to 0.75, 0.25 and 0.5 - groups (List[str], optional): Group names to normalize by. Defaults to []. - center (bool, optional): If to center the output to zero. Defaults to True. - scale_by_group (bool, optional): If to scale the output by group, i.e. norm is calculated as - ``(group1_norm * group2_norm * ...) ^ (1 / n_groups)``. Defaults to False. - transformation (Union[str, Tuple[Callable, Callable]] optional): Transform values before - applying normalizer. Available options are + + groups: List[str], optional, default=[] + Group names to normalize by. Defaults to []. + center: bool, optional, default=True + If to center the output to zero. Defaults to True. + scale_by_group: bool, optional + If to scale the output by group, i.e. norm is calculated as + ``(group1_norm * group2_norm * ...) ^ (1 / n_groups)``. Defaults to False. + transformation: Union[str, Tuple[Callable, Callable]] optional, default=None): + Transform values before applying normalizer. 
Available options are * None (default): No transformation of values * log: Estimate in log-space leading to a multiplicative model * log1p: Estimate in log-space but add 1 to values before transforming for stability + (e.g. if many small values <<1 are present). Note, that inverse transform is still only `torch.exp()` and not `torch.expm1()`. + * logit: Apply logit transformation on values that are between 0 and 1 * count: Apply softplus to output (inverse transformation) and x + 1 to input (transformation) @@ -846,7 +969,7 @@ def __init__( should be defined if ``reverse`` is not the inverse of the forward transformation. ``inverse_torch`` can be defined to provide a torch distribution transform for inverse transformations. - """ + """ # noqa: E501 self.groups = groups self._groups = list(groups) if groups is not None else [] self.scale_by_group = scale_by_group @@ -862,12 +985,16 @@ def fit(self, y: pd.Series, X: pd.DataFrame): """ Determine scales for each group - Args: - y (pd.Series): input data - X (pd.DataFrame): dataframe with columns for each group defined in ``groups`` parameter. + Parameters + ---------- + y: pd.Series + input data + X: pd.DataFrame + dataframe with columns for each group defined in ``groups`` parameter. - Returns: - self + Returns + ------- + self """ y = self.preprocess(y) eps = np.finfo(np.float16).eps @@ -1014,8 +1141,10 @@ def names(self) -> List[str]: """ Names of determined scales. - Returns: - List[str]: list of names + Returns + ------- + List[str] + list of names """ return ["center", "scale"] @@ -1025,21 +1154,26 @@ def fit_transform( """ Fit normalizer and scale input data. - Args: - y (pd.Series): data to scale - X (pd.DataFrame): dataframe with ``groups`` columns - return_norm (bool, optional): If to return . Defaults to False. + Parameters + ---------- + y: pd.Series + data to scale + X: pd.DataFrame + dataframe with ``groups`` columns + return_norm: bool, optional, default=False + If to return . Defaults to False. - Returns: - Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]: Scaled data, if ``return_norm=True``, returns also scales - as second element + Returns + ------- + Union[np.ndarray, Tuple[np.ndarray, np.ndarray]] + Scaled data, if ``return_norm=True``, returns also scales as second element """ return self.fit(y, X).transform(y, X, return_norm=return_norm) def inverse_transform(self, y: pd.Series, X: pd.DataFrame): """ Rescaling data to original scale - not implemented - call class with target scale instead. - """ + """ # noqa: E501 raise NotImplementedError() def transform( @@ -1052,15 +1186,21 @@ def transform( """ Scale input data. - Args: - y (pd.Series): data to scale - X (pd.DataFrame): dataframe with ``groups`` columns - return_norm (bool, optional): If to return . Defaults to False. - target_scale (torch.Tensor): target scale to use instead of fitted center and scale - - Returns: - Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]: Scaled data, if ``return_norm=True``, returns also scales - as second element + Parameters + ---------- + y: pd.Series + data to scale + X: pd.DataFrame + dataframe with ``groups`` columns + return_norm: bool, optional, default=False + If to return . Defaults to False. 
+ target_scale: torch.Tensor + target scale to use instead of fitted center and scale + + Returns + ------- + Union[np.ndarray, Tuple[np.ndarray, np.ndarray]] + Scaled data, if ``return_norm=True``, returns also scales as second element """ # # check if arguments are wrong way round if isinstance(y, pd.DataFrame) and not isinstance(X, pd.DataFrame): @@ -1076,13 +1216,18 @@ def get_parameters( """ Get fitted scaling parameters for a given group. - Args: - groups (Union[torch.Tensor, list, tuple]): group ids for which to get parameters - group_names (List[str], optional): Names of groups corresponding to positions - in ``groups``. Defaults to None, i.e. the instance attribute ``groups``. + Parameters + ---------- + groups: Union[torch.Tensor, list, tuple] + group ids for which to get parameters + group_names: List[str], optional, default=None + Names of groups corresponding to positions in ``groups``. + Defaults to None, i.e. the instance attribute ``groups``. - Returns: - np.ndarray: parameters used for scaling + Returns + ------- + np.ndarray + parameters used for scaling """ if isinstance(groups, torch.Tensor): groups = groups.tolist() @@ -1121,11 +1266,16 @@ def get_norm(self, X: pd.DataFrame) -> pd.DataFrame: """ Get scaling parameters for multiple groups. - Args: - X (pd.DataFrame): dataframe with ``groups`` columns + Parameters + ---------- + X: pd.DataFrame + dataframe with ``groups`` columns - Returns: - pd.DataFrame: dataframe with scaling parameterswhere each row corresponds to the input dataframe + Returns + ------- + pd.DataFrame + dataframe with scaling parameterswhere each row corresponds + to the input dataframe """ if len(self._groups) == 0: norm = np.asarray([self.norm_["center"], self.norm_["scale"]]).reshape( @@ -1166,8 +1316,10 @@ class MultiNormalizer(TorchNormalizer): def __init__(self, normalizers: List[TorchNormalizer]): """ - Args: - normalizers (List[TorchNormalizer]): list of normalizers to apply to targets + Parameters + ---------- + normalizers: List[TorchNormalizer] + list of normalizers to apply to targets """ self.normalizers = normalizers @@ -1177,11 +1329,14 @@ def fit( """ Fit transformer, i.e. determine center and scale of data - Args: - y (Union[pd.Series, np.ndarray, torch.Tensor]): input data + Parameters + ---------- + y: Union[pd.Series, np.ndarray, torch.Tensor] + input data - Returns: - MultiNormalizer: self + Returns + ------- + MultiNormalizer: self """ if isinstance(y, pd.DataFrame): y = y.to_numpy() @@ -1199,8 +1354,10 @@ def __getitem__(self, idx: int): """ Return normalizer. - Args: - idx (int): metric index + Parameters + ---------- + idx: int + metric index """ return self.normalizers[idx] @@ -1229,17 +1386,23 @@ def transform( """ Scale input data. - Args: - y (Union[pd.DataFrame, np.ndarray, torch.Tensor]): data to scale - X (pd.DataFrame): dataframe with ``groups`` columns. Only necessary if :py:class:`~GroupNormalizer` - is among normalizers - return_norm (bool, optional): If to return . Defaults to False. - target_scale (List[torch.Tensor]): target scale to use instead of fitted center and scale - - Returns: - Union[List[Tuple[Union[np.ndarray, torch.Tensor], np.ndarray]], List[Union[np.ndarray, torch.Tensor]]]: + Parameters + ---------- + y: Union[pd.DataFrame, np.ndarray, torch.Tensor] + data to scale + X: pd.DataFrame + dataframe with ``groups`` columns. Only necessary + if :py:class:`~GroupNormalizer` is among normalizers + return_norm: bool, optional + If to return . Defaults to False. 
+ target_scale: List[torch.Tensor] + target scale to use instead of fitted center and scale + + Returns + ------- + Union[List[Tuple[Union[np.ndarray, torch.Tensor], np.ndarray]], List[Union[np.ndarray, torch.Tensor]]] List of scaled data, if ``return_norm=True``, returns also scales as second element - """ + """ # noqa: E501 if isinstance(y, pd.DataFrame): y = y.to_numpy().transpose() @@ -1270,13 +1433,18 @@ def __call__( """ Inverse transformation but with network output as input. - Args: - data (Dict[str, Union[List[torch.Tensor], torch.Tensor]]): Dictionary with entries - * prediction: list of data to de-scale - * target_scale: list of center and scale of data + Parameters + ---------- + data: Dict[str, Union[List[torch.Tensor], torch.Tensor]] + Dictionary with entries + + * prediction: list of data to de-scale + * target_scale: list of center and scale of data - Returns: - List[torch.Tensor]: list of de-scaled data + Returns + ------- + List[torch.Tensor] + list of de-scaled data """ denormalized = [ normalizer( @@ -1293,8 +1461,10 @@ def get_parameters(self, *args, **kwargs) -> List[torch.Tensor]: """ Returns parameters that were used for encoding. - Returns: - List[torch.Tensor]: First element is center of data and second is scale + Returns + ------- + List[torch.Tensor] + First element is center of data and second is scale """ return [ normalizer.get_parameters(*args, **kwargs) @@ -1305,16 +1475,21 @@ def __getattr__(self, name: str): """ Return dynamically attributes. - Return attributes if defined in this class. If not, create dynamically attributes based on - attributes of underlying normalizers that are lists. Create functions if necessary. - Arguments to functions are distributed to the functions if they are lists and their length - matches the number of normalizers. Otherwise, they are directly passed to each callable of the - normalizers. + Return attributes if defined in this class. If not, + create dynamically attributes based on attributes of underlying normalizers + that are lists. Create functions if necessary. Arguments to functions are + distributed to the functions if they are lists and their length matches the + number of normalizers. Otherwise, they are directly passed to each callable of + the normalizers. - Args: - name (str): name of attribute + Parameters + ---------- + name: str + name of attribute - Returns: + Returns + ------- + ret attributes of this class or list of attributes of underlying class """ try: @@ -1327,7 +1502,8 @@ def __getattr__(self, name: str): n = len(self.normalizers) def func(*args, **kwargs): - # if arg/kwarg is list and of length normalizers, then apply each part to a normalizer. + # if arg/kwarg is list and of length normalizers, + # then apply each part to a normalizer.
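A usage sketch for the group-wise and multi-target normalizers documented above (illustrative; the column names are made up and the signatures are taken from this diff):

import numpy as np
import pandas as pd

from pytorch_forecasting.data.encoders import GroupNormalizer, MultiNormalizer, TorchNormalizer

df = pd.DataFrame(
    {
        "agency": np.repeat(["a", "b"], 50),
        "volume": np.random.default_rng(0).gamma(2.0, 2.0, size=100),
    }
)

# one scaler per group; typically passed as ``target_normalizer`` to a TimeSeriesDataSet
group_normalizer = GroupNormalizer(groups=["agency"])
scaled, scales = group_normalizer.fit_transform(df["volume"], df[["agency"]], return_norm=True)
center_scale = group_normalizer.get_norm(df[["agency"]])  # fitted center/scale per input row

# one normalizer per target for multi-target models
multi_normalizer = MultiNormalizer([GroupNormalizer(groups=["agency"]), TorchNormalizer()])
first_target_normalizer = multi_normalizer[0]  # __getitem__ returns the normalizer per target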
# otherwise pass it directly to all normalizers results = [] for idx, norm in enumerate(self.normalizers): diff --git a/pytorch_forecasting/data/examples.py b/pytorch_forecasting/data/examples.py index 0a9a01e51..1adab65d6 100644 --- a/pytorch_forecasting/data/examples.py +++ b/pytorch_forecasting/data/examples.py @@ -28,7 +28,7 @@ def _get_data_by_filename(fname: str) -> Path: # check if file exists - download if necessary if not full_fname.exists(): url = BASE_URL + fname - urlretrieve(url, full_fname) + urlretrieve(url, full_fname) # noqa: S310 return full_fname @@ -82,7 +82,7 @@ def generate_ar_data( Returns: pd.DataFrame: data - """ + """ # noqa: E501 # sample parameters np.random.seed(seed) linear_trends = np.random.normal(size=n_series)[:, None] / timesteps diff --git a/pytorch_forecasting/data/samplers.py b/pytorch_forecasting/data/samplers.py index 99961b54e..27add6a31 100644 --- a/pytorch_forecasting/data/samplers.py +++ b/pytorch_forecasting/data/samplers.py @@ -1,6 +1,6 @@ """ Samplers for sampling time series from the :py:class:`~pytorch_forecasting.data.timeseries.TimeSeriesDataSet` -""" +""" # noqa: E501 import warnings @@ -16,7 +16,7 @@ class GroupedSampler(Sampler): This means that the items from the different groups are always sampled together. This is an abstract class. Implement the :py:meth:`~get_groups` method which creates groups to be sampled from. - """ + """ # noqa: E501 def __init__( self, @@ -36,7 +36,7 @@ def __init__( batch_size (int, optional): Number of samples in a mini-batch. This is rather the maximum number of samples. Because mini-batches are grouped by prediction time, chances are that there are multiple where batch size will be smaller than the maximum. Defaults to 64. - """ + """ # noqa: E501 # Since collections.abc.Iterable does not check for `__getitem__`, which # is one way for an object to be an iterable, we don't do an `isinstance` # check here. @@ -71,7 +71,7 @@ def get_groups(self, sampler: Sampler): Returns: dict-like: dictionary-like object with data_source.index as values and group names as keys - """ + """ # noqa: E501 raise NotImplementedError() def construct_batch_groups(self, groups): @@ -94,7 +94,8 @@ def construct_batch_groups(self, groups): warns.append(name) if len(warns) > 0: warnings.warn( - f"Less than {self.batch_size} samples available for {len(warns)} prediction times. " + f"Less than {self.batch_size} samples available for " + f"{len(warns)} prediction times. " f"Use batch size smaller than {self.batch_size}. " f"First 10 prediction times with small batch sizes: {warns[:10]}" ) @@ -134,7 +135,7 @@ class TimeSynchronizedBatchSampler(GroupedSampler): Time-synchornisation means that the time index of the first decoder samples are aligned across the batch. This sampler does not support missing values in the dataset. 
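The synthetic-data helper and batch sampler touched above can be exercised roughly as follows (a sketch; the ``to_dataloader`` shortcut mentioned in the comment is an assumption about the package's usual API, not part of this diff):

from pytorch_forecasting.data.examples import generate_ar_data

# autoregressive toy data as returned by the helper documented above
data = generate_ar_data(seasonality=10.0, timesteps=400, n_series=100, seed=42)
print(data.head())

# TimeSynchronizedBatchSampler is usually not built by hand; a TimeSeriesDataSet
# can select it via, e.g.,
# training.to_dataloader(train=True, batch_size=64, batch_sampler="synchronized")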
- """ + """ # noqa: E501 def get_groups(self, sampler: Sampler): data_source = sampler.data_source diff --git a/pytorch_forecasting/metrics/_mqf2_utils.py b/pytorch_forecasting/metrics/_mqf2_utils.py index 9627b8a81..47ccb2517 100644 --- a/pytorch_forecasting/metrics/_mqf2_utils.py +++ b/pytorch_forecasting/metrics/_mqf2_utils.py @@ -432,7 +432,7 @@ def rsample(self, sample_shape: torch.Size = torch.Size()) -> torch.Tensor: layout=self.hidden_state.layout, ).clamp( min=1e-4, max=1 - 1e-4 - ) # prevent numerical issues by preventing to sample beyond 0.1% and 99.9% percentiles + ) # prevent numerical issues by preventing to sample beyond 0.1% and 99.9% percentiles # noqa: E501 samples = ( self.quantile(alpha, hidden_state_repeat) diff --git a/pytorch_forecasting/metrics/base_metrics.py b/pytorch_forecasting/metrics/base_metrics.py index 97d0a1278..c9952ef96 100644 --- a/pytorch_forecasting/metrics/base_metrics.py +++ b/pytorch_forecasting/metrics/base_metrics.py @@ -22,7 +22,7 @@ class Metric(LightningMetric): for details of how to implement a new metric Other metrics should inherit from this base class - """ + """ # noqa: E501 full_state_update = False higher_is_better = False @@ -42,7 +42,7 @@ def __init__( name (str): metric name. Defaults to class name. quantiles (List[float], optional): quantiles for probability range. Defaults to None. reduction (str, optional): Reduction, "none", "mean" or "sqrt-mean". Defaults to "mean". - """ + """ # noqa: E501 self.quantiles = quantiles self.reduction = reduction if name is None: @@ -84,7 +84,7 @@ def rescale_parameters( Returns: torch.Tensor: parameters in real/not normalized space - """ + """ # noqa: E501 return encoder(dict(prediction=parameters, target_scale=target_scale)) def to_prediction(self, y_pred: torch.Tensor) -> torch.Tensor: @@ -120,7 +120,7 @@ def to_quantiles( Returns: torch.Tensor: prediction quantiles - """ + """ # noqa: E501 if quantiles is None: quantiles = self.quantiles @@ -166,14 +166,14 @@ class TorchMetricWrapper(Metric): Wrap a torchmetric to work with PyTorch Forecasting. Does not support weighting of errors and only supports metrics for point predictions. - """ + """ # noqa: E501 def __init__(self, torchmetric: LightningMetric, reduction: str = None, **kwargs): """ Args: torchmetric (LightningMetric): Torchmetric to wrap. reduction (str, optional): use reduction with torchmetric directly. Defaults to None. - """ + """ # noqa: E501 super().__init__(**kwargs) if reduction is not None: raise ValueError("use reduction with torchmetric directly") @@ -231,7 +231,8 @@ def update( self.torchmetric.update(y_pred_flattened, target_flattened, **kwargs) def forward(self, y_pred, target, **kwargs): - # need this explicitly to avoid backpropagation errors because of sketchy caching + # need this explicitly to avoid backpropagation + # errors because of sketchy caching y_pred_flattened, target_flattened = self._convert(y_pred, target) return self.torchmetric.forward(y_pred_flattened, target_flattened, **kwargs) @@ -276,7 +277,7 @@ def __init__(self, metrics: List[LightningMetric], weights: List[float] = None): Args: metrics (List[LightningMetric], optional): list of metrics to combine. weights (List[float], optional): list of weights / multipliers for weights. Defaults to 1.0 for all metrics. 
- """ + """ # noqa: E501 assert len(metrics) > 0, "at least one metric has to be specified" if weights is None: weights = [1.0 for _ in metrics] @@ -477,7 +478,7 @@ def __getattr__(self, name: str): Returns: attributes of this class or list of attributes of underlying class - """ + """ # noqa: E501 try: return super().__getattr__(name) except AttributeError as e: @@ -488,7 +489,8 @@ def __getattr__(self, name: str): n = len(self.metrics) def func(*args, **kwargs): - # if arg/kwarg is list and of length metric, then apply each part to a metric. otherwise + # if arg/kwarg is list and of length metric, + # then apply each part to a metric. otherwise # pass it directly to all metrics results = [] for idx, m in enumerate(self.metrics): @@ -535,7 +537,7 @@ class CompositeMetric(LightningMetric): .. code-block:: python composite_metric = SMAPE() + 0.4 * MAE() - """ + """ # noqa: E501 full_state_update = False higher_is_better = False @@ -550,7 +552,7 @@ def __init__( Args: metrics (List[LightningMetric], optional): list of metrics to combine. Defaults to None. weights (List[float], optional): list of weights / multipliers for weights. Defaults to 1.0 for all metrics. - """ + """ # noqa: E501 self.metrics = metrics self.weights = weights @@ -913,7 +915,7 @@ def mask_losses( Returns: torch.Tensor: masked losses - """ + """ # noqa: E501 if reduction is None: reduction = self.reduction if losses.ndim > 0: @@ -946,7 +948,7 @@ def reduce_loss( Returns: torch.Tensor: reduced loss - """ + """ # noqa: E501 if reduction is None: reduction = self.reduction if reduction == "none": @@ -962,9 +964,10 @@ def reduce_loss( "Loss should not be nan - i.e. something went wrong " "in calculating the loss (e.g. log of a negative number)" ) - assert torch.isfinite( - loss - ), "Loss should not be infinite - i.e. something went wrong (e.g. input is not in log space)" + assert torch.isfinite(loss), ( + "Loss should not be infinite - i.e." + " something went wrong (e.g. input is not in log space)" + ) return loss @@ -983,7 +986,7 @@ class DistributionLoss(MultiHorizonMetric): distribution_arguments (List[str]): list of parameter names for the distribution Further, implement the methods :py:meth:`~map_x_to_distribution` and :py:meth:`~rescale_parameters`. - """ + """ # noqa: E501 distribution_class: distributions.Distribution distribution_arguments: List[str] @@ -1002,7 +1005,7 @@ def __init__( quantiles (List[float], optional): quantiles for probability range. Defaults to [0.02, 0.1, 0.25, 0.5, 0.75, 0.9, 0.98]. reduction (str, optional): Reduction, "none", "mean" or "sqrt-mean". Defaults to "mean". 
- """ + """ # noqa: E501 if quantiles is None: quantiles = [0.02, 0.1, 0.25, 0.5, 0.75, 0.9, 0.98] super().__init__(name=name, quantiles=quantiles, reduction=reduction) @@ -1017,7 +1020,7 @@ def map_x_to_distribution(self, x: torch.Tensor) -> distributions.Distribution: Returns: distributions.Distribution: torch probability distribution as defined in the class attribute ``distribution_class`` - """ + """ # noqa: E501 raise NotImplementedError("implement this method") def loss(self, y_pred: torch.Tensor, y_actual: torch.Tensor) -> torch.Tensor: @@ -1061,7 +1064,7 @@ def sample(self, y_pred, n_samples: int) -> torch.Tensor: Returns: torch.Tensor: tensor with samples (shape batch_size x n_timesteps x n_samples) - """ + """ # noqa: E501 dist = self.map_x_to_distribution(y_pred) samples = dist.sample((n_samples,)) if samples.ndim == 3: @@ -1084,7 +1087,7 @@ def to_quantiles( Returns: torch.Tensor: prediction quantiles (last dimension) - """ + """ # noqa: E501 if quantiles is None: quantiles = self.quantiles try: @@ -1106,7 +1109,7 @@ class MultivariateDistributionLoss(DistributionLoss): Class should be inherited for all multivariate distribution losses, i.e. if a batch of values is predicted in one go and the batch dimension is not independent, but the time dimension still remains independent. - """ + """ # noqa: E501 def sample(self, y_pred, n_samples: int) -> torch.Tensor: """ @@ -1118,11 +1121,11 @@ def sample(self, y_pred, n_samples: int) -> torch.Tensor: Returns: torch.Tensor: tensor with samples (shape batch_size x n_timesteps x n_samples) - """ + """ # noqa: E501 dist = self.map_x_to_distribution(y_pred) samples = dist.sample((n_samples,)).permute( 2, 1, 0 - ) # returned as (n_samples, n_timesteps, batch_size), so reshape to (batch_size, n_timesteps, n_samples) + ) # returned as (n_samples, n_timesteps, batch_size), so reshape to (batch_size, n_timesteps, n_samples) # noqa: E501 return samples def loss(self, y_pred: torch.Tensor, y_actual: torch.Tensor) -> torch.Tensor: diff --git a/pytorch_forecasting/metrics/distributions.py b/pytorch_forecasting/metrics/distributions.py index fc93f56e9..b79a9c83e 100644 --- a/pytorch_forecasting/metrics/distributions.py +++ b/pytorch_forecasting/metrics/distributions.py @@ -1,4 +1,4 @@ -"""Metrics that allow the parametric forecast of parameters of uni- and multivariate distributions.""" +"""Metrics that allow the parametric forecast of parameters of uni- and multivariate distributions.""" # noqa: E501 from typing import List, Optional @@ -87,7 +87,7 @@ def __init__( rank (int): rank of low-rank approximation for covariance matrix. Defaults to 10. sigma_init (float, optional): default value for diagonal covariance. Defaults to 1.0. sigma_minimum (float, optional): minimum value for diagonal covariance. Defaults to 1e-3. 
- """ + """ # noqa: E501 if quantiles is None: quantiles = [0.02, 0.1, 0.25, 0.5, 0.75, 0.9, 0.98] super().__init__(name=name, quantiles=quantiles, reduction=reduction) @@ -106,9 +106,10 @@ def __init__( self._cov_factor_scale: float = np.sqrt(self.rank) def map_x_to_distribution(self, x: torch.Tensor) -> distributions.Normal: - assert ( - x.device.type != "mps" - ), "MPS accelerator has a bug https://github.com/pytorch/pytorch/issues/98074, use cpu or gpu" + assert x.device.type != "mps", ( + "MPS accelerator has a bug" + " https://github.com/pytorch/pytorch/issues/98074, use cpu or gpu" + ) x = x.permute(1, 0, 2) distr = self.distribution_class( loc=x[..., 2], @@ -182,9 +183,10 @@ def rescale_parameters( target_scale: torch.Tensor, encoder: BaseEstimator, ) -> torch.Tensor: - assert ( - not encoder.center - ), "NegativeBinomialDistributionLoss is not compatible with `center=True` normalization" + assert not encoder.center, ( + "NegativeBinomialDistributionLoss is not" + " compatible with `center=True` normalization" + ) assert encoder.transformation not in [ "logit", "log", @@ -216,7 +218,7 @@ def to_prediction(self, y_pred: torch.Tensor) -> torch.Tensor: Returns: torch.Tensor: mean prediction - """ + """ # noqa: E501 return y_pred[..., 0] @@ -243,7 +245,10 @@ def rescale_parameters( assert isinstance(encoder.transformation, str) and encoder.transformation in [ "log", "log1p", - ], f"Log distribution requires log scaling but found `transformation={encoder.transform}`" + ], ( + "Log distribution requires log scaling but found" + f" `transformation={encoder.transform}`" + ) assert encoder.transformation not in [ "logit" @@ -306,16 +311,19 @@ def rescale_parameters( scaled_mean = encoder( dict(prediction=parameters[..., 0], target_scale=target_scale) ) - # need to first transform target scale standard deviation in logit space to real space - # we assume a normal distribution in logit space (we used a logit transform and a standard scaler) - # and know that the variance of the beta distribution is limited by `scaled_mean * (1 - scaled_mean)` + # need to first transform target scale standard deviation in + # logit space to real space + # we assume a normal distribution in logit space + # (we used a logit transform and a standard scaler) + # and know that the variance of the beta distribution is + # limited by `scaled_mean * (1 - scaled_mean)` scaled_mean = ( scaled_mean * (1 - 2 * self.eps) + self.eps ) # ensure that mean is not exactly 0 or 1 mean_derivative = scaled_mean * (1 - scaled_mean) # we can approximate variance as - # torch.pow(torch.tanh(target_scale[..., 1].unsqueeze(1) * torch.sqrt(mean_derivative)), 2) * mean_derivative + # torch.pow(torch.tanh(target_scale[..., 1].unsqueeze(1) * torch.sqrt(mean_derivative)), 2) * mean_derivative # noqa: E501 # shape is (positive) parameter * mean_derivative / var shape_scaler = ( torch.pow( @@ -365,7 +373,7 @@ def __init__( icnn_hidden_size (int, optional): hidden size of distribution estimating network. Defaults to 20. icnn_num_layers (int, optional): number of hidden layers in distribution estimating network. Defaults to 2. estimate_logdet (bool, optional): if to estimate log determinant. Defaults to False. 
- """ + """ # noqa: E501 if quantiles is None: quantiles = [0.02, 0.1, 0.25, 0.5, 0.75, 0.9, 0.98] super().__init__(quantiles=quantiles) @@ -485,7 +493,7 @@ def to_quantiles( Returns: torch.Tensor: prediction quantiles (last dimension) - """ + """ # noqa: E501 if quantiles is None: quantiles = self.quantiles distribution = self.map_x_to_distribution(y_pred) @@ -562,7 +570,7 @@ def __init__( input_size (int, optional): input size per prediction length. Defaults to 16. hidden_size (int, optional): hidden size per prediction length. Defaults to 64. n_loss_samples (int, optional): number of quantiles to sample to calculate loss. - """ + """ # noqa: E501 if quantiles is None: quantiles = [0.02, 0.1, 0.25, 0.5, 0.75, 0.9, 0.98] super().__init__(quantiles=quantiles) @@ -574,7 +582,8 @@ def __init__( def sample(self, y_pred, n_samples: int) -> torch.Tensor: eps = 1e-3 - # for a couple of random quantiles (excl. 0 and 1 as they would lead to infinities) + # for a couple of random quantiles + # (excl. 0 and 1 as they would lead to infinities) quantiles = torch.rand(size=(n_samples,), device=y_pred.device).clamp( eps, 1 - eps ) @@ -594,7 +603,8 @@ def loss(self, y_pred: torch.Tensor, y_actual: torch.Tensor) -> torch.Tensor: torch.Tensor: metric value on which backpropagation can be applied """ eps = 1e-3 - # for a couple of random quantiles (excl. 0 and 1 as they would lead to infinities) + # for a couple of random quantiles + # (excl. 0 and 1 as they would lead to infinities) quantiles = torch.rand(size=(self.n_loss_samples,), device=y_pred.device).clamp( eps, 1 - eps ) @@ -623,7 +633,8 @@ def to_prediction(self, y_pred: torch.Tensor, n_samples: int = 100) -> torch.Ten if n_samples is None: return self.to_quantiles(y_pred, quantiles=[0.5]).squeeze(-1) else: - # for a couple of random quantiles (excl. 0 and 1 as they would lead to infinities) make prediction + # for a couple of random quantiles + # (excl. 0 and 1 as they would lead to infinities) make prediction return self.sample(y_pred, n_samples=n_samples).mean(-1) def to_quantiles( @@ -639,7 +650,7 @@ def to_quantiles( Returns: torch.Tensor: prediction quantiles (last dimension) - """ + """ # noqa: E501 if quantiles is None: quantiles = self.quantiles quantiles = torch.as_tensor(quantiles, device=y_pred.device) diff --git a/pytorch_forecasting/metrics/point.py b/pytorch_forecasting/metrics/point.py index a2d930fc0..45d4b659d 100644 --- a/pytorch_forecasting/metrics/point.py +++ b/pytorch_forecasting/metrics/point.py @@ -30,7 +30,7 @@ class PoissonLoss(MultiHorizonMetric): Note that in this example, the data is log1p-transformed before normalized but not re-transformed. The PoissonLoss applies this "exp"-re-transformation on the network output after it has been de-normalized. The result is the model prediction. - """ + """ # noqa: E501 def loss( self, y_pred: Dict[str, torch.Tensor], target: torch.Tensor @@ -145,7 +145,7 @@ def to_quantiles( Returns: torch.Tensor: prediction quantiles - """ + """ # noqa: E501 return y_pred @@ -170,7 +170,7 @@ class MASE(MultiHorizonMetric): Defined as ``(y_pred - target).abs() / all_targets[:, :-1] - all_targets[:, 1:]).mean(1)``. 
``all_targets`` are here the concatenated encoder and decoder targets - """ + """ # noqa: E501 def update( self, @@ -191,7 +191,7 @@ def update( Returns: torch.Tensor: loss as a single number for backpropagation - """ + """ # noqa: E501 # unpack weight if isinstance(target, (list, tuple)): weight = target[1] @@ -306,7 +306,7 @@ class TweedieLoss(MultiHorizonMetric): Note that in this example, the data is log1p-transformed before normalized but not re-transformed. The TweedieLoss applies this "exp"-re-transformation on the network output after it has been de-normalized. The result is the model prediction. - """ + """ # noqa: E501 def __init__(self, reduction="mean", p: float = 1.5, **kwargs): """ diff --git a/pytorch_forecasting/metrics/quantile.py b/pytorch_forecasting/metrics/quantile.py index ad9f9921c..2bdb75715 100644 --- a/pytorch_forecasting/metrics/quantile.py +++ b/pytorch_forecasting/metrics/quantile.py @@ -12,7 +12,7 @@ class QuantileLoss(MultiHorizonMetric): Quantile loss, i.e. a quantile of ``q=0.5`` will give half of the mean absolute error as it is calculated as Defined as ``max(q * (y-y_pred), (1-q) * (y_pred-y))`` - """ + """ # noqa: E501 def __init__( self, diff --git a/pytorch_forecasting/models/base_model.py b/pytorch_forecasting/models/base_model.py index 238c87816..918846405 100644 --- a/pytorch_forecasting/models/base_model.py +++ b/pytorch_forecasting/models/base_model.py @@ -1,6 +1,6 @@ """ Timeseries models share a number of common characteristics. This module implements these in a common base class. -""" +""" # noqa: E501 from collections import namedtuple from copy import deepcopy @@ -119,7 +119,8 @@ def _torch_cat_na(x: List[torch.Tensor]) -> torch.Tensor: else: # make list instead but warn warnings.warn( - f"Not all dimensions are equal for tensors shapes. Example tensor {x[0].shape}. " + "Not all dimensions are equal for tensors shapes." + f" Example tensor {x[0].shape}. " "Returning list instead of torch.Tensor.", UserWarning, ) @@ -146,7 +147,7 @@ def _concatenate_output( Returns: Dict[str, Union[torch.Tensor, np.ndarray, List[Union[torch.Tensor, int, bool, str]]]]: concatenated output - """ + """ # noqa: E501 output_cat = {} for name in output[0].keys(): v0 = output[0][name] @@ -208,7 +209,7 @@ class Prediction(PredictTuple, OutputMixIn): class PredictCallback(BasePredictionWriter): - """Internally used callback to capture predictions and optionally write them to disk.""" + """Internally used callback to capture predictions and optionally write them to disk.""" # noqa: E501 # see base class predict function for documentation of parameters def __init__( @@ -265,7 +266,10 @@ def on_predict_batch_end( out = out[self.mode[1]] else: raise ValueError( - f"If a tuple is specified, the first element must be 'raw' - got {self.mode[0]} instead" + ( + "If a tuple is specified, the first element must be 'raw' - got" + f" {self.mode[0]} instead" + ) ) elif self.mode == "prediction": out = pl_module.to_prediction(out, **self.mode_kwargs) @@ -461,7 +465,7 @@ def forward(self, x): prediction = self.transform_output(prediction=normalized_prediction, target_scale=x["target_scale"]) return self.to_network_output(prediction=prediction) - """ + """ # noqa: E501 CHECKPOINT_HYPER_PARAMS_SPECIAL_KEY = "__special_save__" @@ -517,7 +521,7 @@ def __init__( a `lr` argument (optionally also `weight_decay`). Defaults to `"ranger" `_, if pytorch_optimizer is installed, otherwise "adam". 
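For reference, the quantile (pinball) loss defined in the ``QuantileLoss`` hunk above can be checked by hand:

import torch

def pinball(y: torch.Tensor, y_pred: torch.Tensor, q: float) -> torch.Tensor:
    # max(q * (y - y_pred), (1 - q) * (y_pred - y)) as in the docstring above
    return torch.maximum(q * (y - y_pred), (1 - q) * (y_pred - y))

y, y_pred = torch.tensor([10.0]), torch.tensor([8.0])
print(pinball(y, y_pred, q=0.5))  # tensor([1.]) -> half of the absolute error of 2
print(pinball(y, y_pred, q=0.9))  # tensor([1.8000]) -> under-prediction is penalised more strongly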
- """ + """ # noqa: E501 if monotone_constaints is None: monotone_constaints = {} super().__init__() @@ -533,9 +537,10 @@ def __init__( warnings.warn( "In pytorch-forecasting models, from version 1.2.0, " "the default optimizer will be 'adam', in order to " - "minimize the number of dependencies in default parameter settings. " - "Users who wish to ensure their code continues using 'ranger' as optimizer " - "should ensure that pytorch_optimizer is installed, and set the optimizer " + "minimize the number of dependencies in default" + " parameter settings. Users who wish to ensure their" + " code continues using 'ranger' as optimizer should ensure" + " that pytorch_optimizer is installed, and set the optimizer " "parameter explicitly to 'ranger'.", stacklevel=2, ) @@ -548,9 +553,10 @@ def __init__( "otherwise it defaults to 'ranger' from pytorch_optimizer. " "From version 1.2.0, the default optimizer will be 'adam' " "regardless of whether pytorch_optimizer is installed, in order to " - "minimize the number of dependencies in default parameter settings. " - "Users who wish to ensure their code continues using 'ranger' as optimizer " - "should ensure that pytorch_optimizer is installed, and set the optimizer " + "minimize the number of dependencies in default parameter" + " settings. Users who wish to ensure their code continues" + " using 'ranger' as optimizer should ensure that pytorch_optimizer" + " is installed, and set the optimizer " "parameter explicitly to 'ranger'.", stacklevel=2, ) @@ -629,7 +635,7 @@ def current_stage(self) -> str: """ Available inside lightning loops. :return: current trainer stage. One of ["train", "val", "test", "predict", "sanity_check"] - """ + """ # noqa: E501 return STAGE_STATES.get(self.trainer.state.stage, None) @property @@ -663,14 +669,14 @@ def transform_output( Returns: torch.Tensor: rescaled prediction - """ + """ # noqa: E501 if loss is None: loss = self.loss if isinstance(loss, MultiLoss): out = loss.rescale_parameters( prediction, target_scale=target_scale, - encoder=self.output_transformer.normalizers, # need to use normalizer per encoder + encoder=self.output_transformer.normalizers, # need to use normalizer per encoder # noqa: E501 ) else: out = loss.rescale_parameters( @@ -804,7 +810,7 @@ def create_log( Returns: Dict[str, Any]: log dictionary to be returned by training and validation steps - """ + """ # noqa: E501 prediction_kwargs = ( {} if prediction_kwargs is None else deepcopy(prediction_kwargs) @@ -850,7 +856,7 @@ def step( Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]: tuple where the first entry is a dictionary to which additional logging results can be added for consumption in the ``on_epoch_end`` hook and the second entry is the model's output. 
- """ + """ # noqa: E501 # pack y sequence if different encoder lengths exist if (x["decoder_lengths"] < x["decoder_lengths"].max()).any(): if isinstance(y[0], (list, tuple)): @@ -918,7 +924,8 @@ def step( # add additionl loss if gradient points in wrong direction gradient = gradient[..., indices] * monotonicity[None, None] monotinicity_loss = gradient.clamp_max(0).mean() - # multiply monotinicity loss by large number to ensure relevance and take to the power of 2 + # multiply monotinicity loss by large number + # to ensure relevance and take to the power of 2 # for smoothness of loss function monotinicity_loss = 10 * torch.pow(monotinicity_loss, 2) if not self.predicting: @@ -977,7 +984,7 @@ def log_metrics( y (torch.Tensor): y as passed to the loss function by the dataloader out (Dict[str, torch.Tensor]): output of the network prediction_kwargs (Dict[str, Any]): parameters for ``to_prediction()`` of the loss metric. - """ + """ # noqa: E501 # logging losses - for each target if prediction_kwargs is None: prediction_kwargs = {} @@ -1060,7 +1067,7 @@ def forward(self, x: # The conversion to a named tuple can be directly achieved with the `to_network_output` function. return self.to_network_output(prediction=prediction) - """ + """ # noqa: E501 raise NotImplementedError() def on_epoch_end(self, outputs): @@ -1182,7 +1189,7 @@ def plot_prediction( Returns: matplotlib figure - """ + """ # noqa: E501 if quantiles_kwargs is None: quantiles_kwargs = {} if prediction_kwargs is None: @@ -1369,7 +1376,7 @@ def configure_optimizers(self): Returns: Tuple[List]: first entry is list of optimizers and second is list of schedulers - """ + """ # noqa: E501 ptopt_in_env = "pytorch_optimizer" in _get_installed_packages() # either set a schedule of lrs or find it dynamically if self.hparams.optimizer_params is None: @@ -1410,7 +1417,8 @@ def configure_optimizers(self): if not ptopt_in_env: raise ImportError( "optimizer 'ranger' requires pytorch_optimizer in the evironment. " - "Please install pytorch_optimizer with `pip install pytorch_optimizer`." + "Please install pytorch_optimizer with" + "`pip install pytorch_optimizer`." 
) from pytorch_optimizer import Ranger21 @@ -1430,7 +1438,8 @@ def configure_optimizers(self): ), ) else: - # if finding not limiting train batches, set iterations to dataloader length + # if finding not limiting train batches, + # set iterations to dataloader length optimizer_params.setdefault( "num_iterations", self.trainer.num_training_batches ) @@ -1476,7 +1485,10 @@ def configure_optimizers(self): ) else: raise ValueError( - f"Optimizer of self.hparams.optimizer={self.hparams.optimizer} unknown" + ( + f"Optimizer of self.hparams.optimizer={self.hparams.optimizer}" + " unknown" + ) ) else: raise ValueError( @@ -1527,7 +1539,7 @@ def from_dataset(cls, dataset: TimeSeriesDataSet, **kwargs) -> LightningModule: Returns: BaseModel: Model that can be trained - """ + """ # noqa: E501 if "output_transformer" not in kwargs: kwargs["output_transformer"] = dataset.target_normalizer if "dataset_parameters" not in kwargs: @@ -1691,7 +1703,7 @@ def predict( Returns: Prediction: if one of the ```return`` arguments is present, prediction tuple with fields ``prediction``, ``x``, ``y``, ``index`` and ``decoder_lengths`` - """ + """ # noqa: E501 # convert to dataloader if isinstance(data, pd.DataFrame): data = TimeSeriesDataSet.from_parameters( @@ -1731,9 +1743,10 @@ def predict( ) trainer_kwargs.setdefault("enable_progress_bar", False) trainer_kwargs.setdefault("inference_mode", False) - assert ( - "fast_dev_run" not in trainer_kwargs - ), "fast_dev_run should be passed as argument to predict and not in trainer_kwargs" + assert "fast_dev_run" not in trainer_kwargs, ( + "fast_dev_run should be passed as" + " argument to predict and not in trainer_kwargs" + ) log_level_lighting = logging.getLogger("lightning").getEffectiveLevel() log_level_pytorch_lightning = logging.getLogger( "pytorch_lightning" @@ -1781,7 +1794,7 @@ def predict_dependency( Returns: Union[np.ndarray, torch.Tensor, pd.Series, pd.DataFrame]: output - """ + """ # noqa: E501 values = np.asarray(values) if isinstance(data, pd.DataFrame): # convert to dataframe data = TimeSeriesDataSet.from_parameters( @@ -1884,7 +1897,7 @@ class BaseModelWithCovariates(BaseModel): categorical_groups (Dict[str, List[str]]): dictionary of categorical variables that are grouped together and can also take multiple values simultaneously (e.g. holiday during octoberfest). 
They should be implemented as bag of embeddings - """ + """ # noqa: E501 @property def target_positions(self) -> torch.LongTensor: @@ -1973,7 +1986,7 @@ def from_dataset( Returns: LightningModule - """ + """ # noqa: E501 # assert fixed encoder and decoder length for the moment if allowed_encoder_known_variable_names is None: allowed_encoder_known_variable_names = ( @@ -2039,7 +2052,7 @@ def extract_features( Returns: torch.Tensor: tensor with selected variables - """ + """ # noqa: E501 # select period if period == "encoder": x_cat = x["encoder_cat"] @@ -2091,7 +2104,7 @@ def calculate_prediction_actual_by_variable( Returns: dictionary that can be used to plot averages with :py:meth:`~plot_prediction_actual_by_variable` - """ + """ # noqa: E501 support = {} # histogram # averages averages_actual = {} @@ -2218,7 +2231,7 @@ def plot_prediction_actual_by_variable( Returns: Union[Dict[str, plt.Figure], plt.Figure]: matplotlib figure - """ + """ # noqa: E501 _check_matplotlib("plot_prediction_actual_by_variable") from matplotlib import pyplot as plt @@ -2278,7 +2291,8 @@ def plot_prediction_actual_by_variable( else: scaler = self.dataset_parameters["scalers"][name] x = np.linspace(-data["std"], data["std"], bins) - # reversing normalization for group normalizer is not possible without sample level information + # reversing normalization for group normalizer + # is not possible without sample level information if not isinstance(scaler, (GroupNormalizer, EncoderNormalizer)): x = scaler.inverse_transform(x.reshape(-1, 1)).reshape(-1) ax.set_xlabel(f"Normalized {name}") @@ -2353,7 +2367,7 @@ class AutoRegressiveBaseModel(BaseModel): Lags can be useful to indicate seasonality to the models. If you know the seasonalit(ies) of your data, add at least the target variables with the corresponding lags to improve performance. Defaults to no lags, i.e. an empty dictionary. - """ + """ # noqa: E501 @classmethod def from_dataset( @@ -2370,7 +2384,7 @@ def from_dataset( Returns: LightningModule - """ + """ # noqa: E501 kwargs.setdefault("target", dataset.target) # check that lags for targets are the same lags = { @@ -2411,7 +2425,7 @@ def output_to_prediction( Returns: Tuple[Union[List[torch.Tensor], torch.Tensor], torch.Tensor]: tuple of rescaled prediction and normalized prediction (e.g. 
for input into next auto-regressive step) - """ + """ # noqa: E501 single_prediction = to_list(normalized_prediction_parameters)[0].ndim == 2 if single_prediction: # add time dimension as it is expected normalized_prediction_parameters = apply_to_list( @@ -2574,7 +2588,7 @@ def decode_one(idx, lagged_targets, hidden_state): # predictions are already rescaled return output - """ + """ # noqa: E501 # make predictions which are fed into next step output = [] current_target = first_target @@ -2597,7 +2611,8 @@ def decode_one(idx, lagged_targets, hidden_state): ) # save normalized output for lagged targets normalized_output.append(current_target) - # set output to unnormalized samples, append each target as n_batch_samples x n_random_samples + # set output to unnormalized samples, append each target as + # n_batch_samples x n_random_samples output.append(prediction) if isinstance(self.hparams.target, str): @@ -2653,7 +2668,7 @@ def plot_prediction( Returns: matplotlib figure - """ + """ # noqa: E501 prediction_kwargs = ( {} if prediction_kwargs is None else deepcopy(prediction_kwargs) @@ -2685,10 +2700,11 @@ def lagged_target_positions(self) -> Dict[int, torch.LongTensor]: Returns: Dict[int, torch.LongTensor]: dictionary mapping integer lags to tensor of variable positions. - """ + """ # noqa: E501 raise Exception( "lagged targets can only be used with class inheriting " - "from AutoRegressiveBaseModelWithCovariates but not from AutoRegressiveBaseModel" + "from AutoRegressiveBaseModelWithCovariates but not" + " from AutoRegressiveBaseModel" ) @@ -2723,7 +2739,7 @@ class AutoRegressiveBaseModelWithCovariates( categorical_groups (Dict[str, List[str]]): dictionary of categorical variables that are grouped together and can also take multiple values simultaneously (e.g. holiday during octoberfest). They should be implemented as bag of embeddings - """ + """ # noqa: E501 @property def lagged_target_positions(self) -> Dict[int, torch.LongTensor]: @@ -2732,7 +2748,7 @@ def lagged_target_positions(self) -> Dict[int, torch.LongTensor]: Returns: Dict[int, torch.LongTensor]: dictionary mapping integer lags to tensor of variable positions. - """ + """ # noqa: E501 # todo: expand for categorical targets if len(self.hparams.target_lags) == 0: return {} diff --git a/pytorch_forecasting/models/deepar/_deepar.py b/pytorch_forecasting/models/deepar/_deepar.py index c8346367b..773445a44 100644 --- a/pytorch_forecasting/models/deepar/_deepar.py +++ b/pytorch_forecasting/models/deepar/_deepar.py @@ -2,7 +2,7 @@ `DeepAR: Probabilistic forecasting with autoregressive recurrent networks `_ which is the one of the most popular forecasting algorithms and is often used as a baseline -""" +""" # noqa: E501 from copy import deepcopy from typing import Any, Dict, List, Literal, Optional, Tuple, Union @@ -108,7 +108,7 @@ def __init__( Defaults to :py:class:`~pytorch_forecasting.metrics.NormalDistributionLoss`. logging_metrics (nn.ModuleList, optional): Metrics to log during training. Defaults to nn.ModuleList([SMAPE(), MAE(), RMSE(), MAPE(), MASE()]). 
- """ + """ # noqa: E501 if loss is None: loss = NormalDistributionLoss() if logging_metrics is None: @@ -158,9 +158,10 @@ def __init__( lagged_target_names = [l for lags in target_lags.values() for l in lags] assert set(self.encoder_variables) - set(to_list(target)) - set( lagged_target_names - ) == set(self.decoder_variables) - set( - lagged_target_names - ), "Encoder and decoder variables have to be the same apart from target variable" + ) == set(self.decoder_variables) - set(lagged_target_names), ( + "Encoder and decoder variables have to be" + " the same apart from target variable" + ) for targeti in to_list(target): assert ( targeti in time_varying_reals_encoder @@ -213,7 +214,7 @@ def from_dataset( Returns: DeepAR network - """ + """ # noqa: E501 new_kwargs = {} if dataset.multi_target: new_kwargs.setdefault( @@ -227,7 +228,7 @@ def from_dataset( not isinstance(normalizer, NaNLabelEncoder) for normalizer in dataset.target_normalizer ) - ), "target(s) should be continuous - categorical targets are not supported" # todo: remove this restriction + ), "target(s) should be continuous - categorical targets are not supported" # todo: remove this restriction # noqa: E501 if isinstance(new_kwargs.get("loss", None), MultivariateDistributionLoss): assert ( dataset.min_prediction_length == dataset.max_prediction_length @@ -248,7 +249,8 @@ def construct_input_vector( Create input vector into RNN network Args: - one_off_target: tensor to insert into first position of target. If None (default), remove first time step. + one_off_target: tensor to insert into first position of target. + If None (default), remove first time step. """ # create input vector if len(self.categoricals) > 0: @@ -361,7 +363,8 @@ def decode_one( n_samples=n_samples, ) # reshape predictions for n_samples: - # from n_samples * batch_size x time steps to batch_size x time steps x n_samples + # from n_samples * batch_size x time steps + # to batch_size x time steps x n_samples output = apply_to_list( output, lambda x: x.reshape(-1, n_samples, input_vector.size(1)).permute( @@ -462,7 +465,7 @@ def predict( Returns: Prediction: if one of the ```return`` arguments is present, prediction tuple with fields ``prediction``, ``x``, ``y``, ``index`` and ``decoder_lengths`` - """ + """ # noqa: E501 if isinstance(mode, str): if mode in ["prediction", "quantiles"]: if mode_kwargs is None: diff --git a/pytorch_forecasting/models/mlp/_decodermlp.py b/pytorch_forecasting/models/mlp/_decodermlp.py index 4f9dfa5d3..6692cc360 100644 --- a/pytorch_forecasting/models/mlp/_decodermlp.py +++ b/pytorch_forecasting/models/mlp/_decodermlp.py @@ -85,7 +85,7 @@ def __init__( Defaults to QuantileLoss. logging_metrics (nn.ModuleList, optional): Metrics to log during training. Defaults to nn.ModuleList([SMAPE(), MAE(), RMSE(), MAPE(), MASE()]). - """ + """ # noqa: E501 if loss is None: loss = QuantileLoss() if logging_metrics is None: diff --git a/pytorch_forecasting/models/nbeats/_nbeats.py b/pytorch_forecasting/models/nbeats/_nbeats.py index 0c0d172ba..e3a289a12 100644 --- a/pytorch_forecasting/models/nbeats/_nbeats.py +++ b/pytorch_forecasting/models/nbeats/_nbeats.py @@ -90,7 +90,7 @@ def __init__( logging_metrics (nn.ModuleList[MultiHorizonMetric]): list of metrics that are logged during training. Defaults to nn.ModuleList([SMAPE(), MAE(), RMSE(), MAPE(), MASE()]) **kwargs: additional arguments to :py:class:`~BaseModel`. 
- """ + """ # noqa: E501 if expansion_coefficient_lengths is None: expansion_coefficient_lengths = [3, 7] if sharing is None: @@ -198,7 +198,7 @@ def forward(self, x: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: # update backcast and forecast backcast = ( backcast - backcast_block - ) # do not use backcast -= backcast_block as this signifies an inline operation + ) # do not use backcast -= backcast_block as this signifies an inline operation # noqa : E501 forecast = forecast + forecast_block return self.to_network_output( @@ -231,7 +231,7 @@ def from_dataset(cls, dataset: TimeSeriesDataSet, **kwargs): Returns: NBeats - """ + """ # noqa: E501 new_kwargs = { "prediction_length": dataset.max_prediction_length, "context_length": dataset.max_encoder_length, @@ -245,13 +245,15 @@ def from_dataset(cls, dataset: TimeSeriesDataSet, **kwargs): assert not isinstance( dataset.target_normalizer, NaNLabelEncoder ), "only regression tasks are supported - target must not be categorical" - assert ( - dataset.min_encoder_length == dataset.max_encoder_length - ), "only fixed encoder length is allowed, but min_encoder_length != max_encoder_length" + assert dataset.min_encoder_length == dataset.max_encoder_length, ( + "only fixed encoder length is allowed," + " but min_encoder_length != max_encoder_length" + ) - assert ( - dataset.max_prediction_length == dataset.min_prediction_length - ), "only fixed prediction length is allowed, but max_prediction_length != min_prediction_length" + assert dataset.max_prediction_length == dataset.min_prediction_length, ( + "only fixed prediction length is allowed," + " but max_prediction_length != min_prediction_length" + ) assert ( dataset.randomize_length is None @@ -265,7 +267,10 @@ def from_dataset(cls, dataset: TimeSeriesDataSet, **kwargs): and len(dataset.reals) == 1 and len(dataset._time_varying_unknown_reals) == 1 and dataset._time_varying_unknown_reals[0] == dataset.target - ), "The only variable as input should be the target which is part of time_varying_unknown_reals" + ), ( + "The only variable as input should be the" + " target which is part of time_varying_unknown_reals" + ) # initialize class return super().from_dataset(dataset, **new_kwargs) @@ -361,7 +366,7 @@ def plot_interpretation( Returns: plt.Figure: matplotlib figure - """ + """ # noqa: E501 _check_matplotlib("plot_interpretation") import matplotlib.pyplot as plt diff --git a/pytorch_forecasting/models/nhits/_nhits.py b/pytorch_forecasting/models/nhits/_nhits.py index 641447c3f..b71dd53af 100644 --- a/pytorch_forecasting/models/nhits/_nhits.py +++ b/pytorch_forecasting/models/nhits/_nhits.py @@ -148,7 +148,7 @@ def __init__( logging_metrics (nn.ModuleList[MultiHorizonMetric]): list of metrics that are logged during training. Defaults to nn.ModuleList([SMAPE(), MAE(), RMSE(), MAPE(), MASE()]) **kwargs: additional arguments to :py:class:`~BaseModel`. 
- """ + """ # noqa: E501 if static_categoricals is None: static_categoricals = [] if static_reals is None: @@ -389,8 +389,8 @@ def forward(self, x: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: backcast=self.transform_output( backcast, target_scale=x["target_scale"], loss=MultiHorizonMetric() ), # (n_outputs x) n_samples x n_timesteps x 1 - block_backcasts=block_backcasts, # n_blocks x (n_outputs x) n_samples x n_timesteps x 1 - block_forecasts=block_forecasts, # n_blocks x (n_outputs x) n_samples x n_timesteps x output_size + block_backcasts=block_backcasts, # n_blocks x (n_outputs x) n_samples x n_timesteps x 1 # noqa: E501 + block_forecasts=block_forecasts, # n_blocks x (n_outputs x) n_samples x n_timesteps x output_size # noqa: E501 ) @classmethod @@ -404,18 +404,20 @@ def from_dataset(cls, dataset: TimeSeriesDataSet, **kwargs): Returns: NBeats - """ + """ # noqa: E501 # validate arguments assert not isinstance( dataset.target_normalizer, NaNLabelEncoder ), "only regression tasks are supported - target must not be categorical" - assert ( - dataset.min_encoder_length == dataset.max_encoder_length - ), "only fixed encoder length is allowed, but min_encoder_length != max_encoder_length" + assert dataset.min_encoder_length == dataset.max_encoder_length, ( + "only fixed encoder length is allowed," + " but min_encoder_length != max_encoder_length" + ) - assert ( - dataset.max_prediction_length == dataset.min_prediction_length - ), "only fixed prediction length is allowed, but max_prediction_length != min_prediction_length" + assert dataset.max_prediction_length == dataset.min_prediction_length, ( + "only fixed prediction length is allowed," + " but max_prediction_length != min_prediction_length" + ) assert ( dataset.randomize_length is None @@ -436,9 +438,10 @@ def from_dataset(cls, dataset: TimeSeriesDataSet, **kwargs): assert (new_kwargs.get("backcast_loss_ratio", 0) == 0) | ( isinstance(new_kwargs["output_size"], int) and new_kwargs["output_size"] == 1 - ) or all( - o == 1 for o in new_kwargs["output_size"] - ), "output sizes can only be of size 1, i.e. point forecasts if backcast_loss_ratio > 0" + ) or all(o == 1 for o in new_kwargs["output_size"]), ( + "output sizes can only be of size 1, i.e." 
+ " point forecasts if backcast_loss_ratio > 0" + ) # initialize class return super().from_dataset(dataset, **new_kwargs) @@ -517,7 +520,7 @@ def plot_interpretation( Returns: plt.Figure: matplotlib figure - """ + """ # noqa: E501 _check_matplotlib("plot_interpretation") from matplotlib import pyplot as plt diff --git a/pytorch_forecasting/models/nhits/sub_modules.py b/pytorch_forecasting/models/nhits/sub_modules.py index bc882c459..7856665e4 100644 --- a/pytorch_forecasting/models/nhits/sub_modules.py +++ b/pytorch_forecasting/models/nhits/sub_modules.py @@ -85,7 +85,7 @@ def init_weights(module, initialization): elif initialization == "glorot_normal": torch.nn.init.xavier_normal_(module.weight) elif initialization == "lecun_normal": - pass # torch.nn.init.normal_(module.weight, 0.0, std=1/np.sqrt(module.weight.numel())) + pass # torch.nn.init.normal_(module.weight, 0.0, std=1/np.sqrt(module.weight.numel())) # noqa: E501 else: assert 1 < 0, f"Initialization {initialization} not found" @@ -188,7 +188,8 @@ def __init__( ] layers = hidden_layers + output_layer - # static_size is computed with data, static_hidden_size is provided by user, if 0 no statics are used + # static_size is computed with data, static_hidden_size is provided by user, + # if 0 no statics are used if (self.static_size > 0) and (self.static_hidden_size > 0): self.static_encoder = StaticFeaturesEncoder( in_features=static_size, out_features=static_hidden_size @@ -387,7 +388,7 @@ def forward( decoder_x_t, x_s, ): - residuals = encoder_y # .flip(dims=(1,)) # todo: check if flip is required or should be rather replaced by scatter + residuals = encoder_y # .flip(dims=(1,)) # todo: check if flip is required or should be rather replaced by scatter # noqa: E501 # encoder_x_t = encoder_x_t.flip(dims=(-1,)) # encoder_mask = encoder_mask.flip(dims=(-1,)) encoder_mask = encoder_mask.unsqueeze(-1) diff --git a/pytorch_forecasting/models/nn/embeddings.py b/pytorch_forecasting/models/nn/embeddings.py index a5ecbf93d..9f9651d06 100644 --- a/pytorch_forecasting/models/nn/embeddings.py +++ b/pytorch_forecasting/models/nn/embeddings.py @@ -75,7 +75,7 @@ def __init__( embedding vector. Defaults to empty list. max_embedding_size (int, optional): if embedding size defined by ``embedding_sizes`` is larger than ``max_embedding_size``, it will be constrained. Defaults to None. - """ + """ # noqa: E501 if categorical_groups is None: categorical_groups = {} if embedding_paddings is None: @@ -94,7 +94,10 @@ def __init__( ), "categorical_groups must be in embedding_sizes." assert not any( name in embedding_sizes for name in categorical_group_variables - ), "group variables in categorical_groups must not be in embedding_sizes." + ), ( + "group variables in categorical_groups" + " must not be in embedding_sizes." + ) assert all( name in x_categoricals for name in categorical_group_variables ), "group variables in categorical_groups must be in x_categoricals." @@ -103,13 +106,13 @@ def __init__( for name in embedding_sizes if name not in categorical_group_variables ), ( - "all variables in embedding_sizes must be in x_categoricals - but only if" - "not already in categorical_groups." + "all variables in embedding_sizes must be in x_categoricals - " + "but only if not already in categorical_groups." ) else: assert ( x_categoricals is None and len(categorical_groups) == 0 - ), "If embedding_sizes is not a dictionary, categorical_groups and x_categoricals must be empty." 
+ ), "If embedding_sizes is not a dictionary, categorical_groups and x_categoricals must be empty." # noqa: E501 # number embeddings based on order embedding_sizes = { str(name): size for name, size in enumerate(embedding_sizes) @@ -193,7 +196,7 @@ def forward(self, x: torch.Tensor) -> Dict[str, torch.Tensor]: of shape batch x (optional) time x embedding_size if ``embedding_size`` is given as dictionary. Otherwise, returns the embedding of shape batch x (optional) time x sum(embedding_sizes). Query attribute ``output_size`` to get the size of the output(s). - """ + """ # noqa: E501 input_vectors = {} for name, emb in self.embeddings.items(): if name in self.categorical_groups: diff --git a/pytorch_forecasting/models/nn/rnn.py b/pytorch_forecasting/models/nn/rnn.py index c96725d5f..36a67e8fc 100644 --- a/pytorch_forecasting/models/nn/rnn.py +++ b/pytorch_forecasting/models/nn/rnn.py @@ -36,7 +36,7 @@ def handle_no_encoding( Returns: HiddenState: hidden state with propagated initial hidden state where appropriate - """ + """ # noqa: E501 pass @abstractmethod @@ -92,7 +92,7 @@ def forward( Returns: Tuple[Union[rnn.PackedSequence, torch.Tensor], HiddenState]: output and hidden state. Output is packed sequence if input has been a packed sequence. - """ + """ # noqa: E501 if isinstance(x, rnn.PackedSequence) or lengths is None: assert ( lengths is None @@ -133,7 +133,8 @@ def forward( ), hx=hx, ) - # replace hidden cell with initial input if encoder_length is zero to determine correct initial state + # replace hidden cell with initial input if encoder_length + # is zero to determine correct initial state if min_length == 0: no_encoding = (lengths == 0)[ None, :, None diff --git a/pytorch_forecasting/models/rnn/_rnn.py b/pytorch_forecasting/models/rnn/_rnn.py index a3f026292..e5d5f0bb5 100644 --- a/pytorch_forecasting/models/rnn/_rnn.py +++ b/pytorch_forecasting/models/rnn/_rnn.py @@ -89,7 +89,7 @@ def __init__( loss (MultiHorizonMetric, optional): loss: loss function taking prediction and targets. logging_metrics (nn.ModuleList, optional): Metrics to log during training. Defaults to nn.ModuleList([SMAPE(), MAE(), RMSE(), MAPE(), MASE()]). 
- """ + """ # noqa : E501 if static_categoricals is None: static_categoricals = [] if static_reals is None: @@ -134,9 +134,10 @@ def __init__( lagged_target_names = [l for lags in target_lags.values() for l in lags] assert set(self.encoder_variables) - set(to_list(target)) - set( lagged_target_names - ) == set(self.decoder_variables) - set( - lagged_target_names - ), "Encoder and decoder variables have to be the same apart from target variable" + ) == set(self.decoder_variables) - set(lagged_target_names), ( + "Encoder and decoder variables have to" + " be the same apart from target variable" + ) for targeti in to_list(target): assert ( targeti in time_varying_reals_encoder @@ -196,7 +197,7 @@ def from_dataset( Returns: Recurrent network - """ + """ # noqa: E501 new_kwargs = copy(kwargs) new_kwargs.update( cls.deduce_default_output_parameters( @@ -209,7 +210,7 @@ def from_dataset( not isinstance(normalizer, NaNLabelEncoder) for normalizer in dataset.target_normalizer ) - ), "target(s) should be continuous - categorical targets are not supported" # todo: remove this restriction + ), "target(s) should be continuous - categorical targets are not supported" # todo: remove this restriction # noqa: E501 return super().from_dataset( dataset, allowed_encoder_known_variable_names=allowed_encoder_known_variable_names, @@ -227,7 +228,7 @@ def construct_input_vector( Args: one_off_target: tensor to insert into first position of target. If None (default), remove first time step. - """ + """ # noqa : E501 # create input vector if len(self.categoricals) > 0: embeddings = self.embeddings(x_cat) diff --git a/pytorch_forecasting/models/temporal_fusion_transformer/_tft.py b/pytorch_forecasting/models/temporal_fusion_transformer/_tft.py index 75f4d552b..6ab878ecb 100644 --- a/pytorch_forecasting/models/temporal_fusion_transformer/_tft.py +++ b/pytorch_forecasting/models/temporal_fusion_transformer/_tft.py @@ -1,6 +1,6 @@ """ The temporal fusion transformer is a powerful predictive model for forecasting timeseries -""" +""" # noqa: E501 from copy import copy from typing import Dict, List, Optional, Tuple, Union @@ -146,7 +146,7 @@ def __init__( logging_metrics (nn.ModuleList[LightningMetric]): list of metrics that are logged during training. Defaults to nn.ModuleList([SMAPE(), MAE(), RMSE(), MAPE()]). **kwargs: additional arguments to :py:class:`~BaseModel`. 
- """ + """ # noqa: E501 if monotone_constaints is None: monotone_constaints = {} if embedding_labels is None: @@ -366,11 +366,13 @@ def __init__( self.hparams.hidden_size, dropout=self.hparams.dropout ) self.post_lstm_gate_decoder = self.post_lstm_gate_encoder - # self.post_lstm_gate_decoder = GatedLinearUnit(self.hparams.hidden_size, dropout=self.hparams.dropout) + # self.post_lstm_gate_decoder = GatedLinearUnit( + # self.hparams.hidden_size, dropout=self.hparams.dropout) self.post_lstm_add_norm_encoder = AddNorm( self.hparams.hidden_size, trainable_add=False ) - # self.post_lstm_add_norm_decoder = AddNorm(self.hparams.hidden_size, trainable_add=True) + # self.post_lstm_add_norm_decoder = AddNorm( + # self.hparams.hidden_size, trainable_add=True) self.post_lstm_add_norm_decoder = self.post_lstm_add_norm_encoder # static enrichment and processing past LSTM @@ -432,7 +434,7 @@ def from_dataset( Returns: TemporalFusionTransformer - """ + """ # noqa: E501 # add maximum encoder length # update defaults new_kwargs = copy(kwargs) @@ -473,12 +475,14 @@ def get_attention_mask( .expand(encoder_lengths.size(0), -1, -1) ) else: - # there is value in attending to future forecasts if they are made with knowledge currently - # available - # one possibility is here to use a second attention layer for future attention (assuming different effects - # matter in the future than the past) - # or alternatively using the same layer but allowing forward attention - i.e. only - # masking out non-available data and self + # there is value in attending to future forecasts if + # they are made with knowledge currently available + # one possibility is here to use a second attention layer + # for future attention + # (assuming different effects matter in the future than the past) + # or alternatively using the same layer but + # allowing forward attention - i.e. 
only + # masking out non-available data and self decoder_mask = ( create_mask(decoder_length, decoder_lengths) .unsqueeze(1) @@ -630,7 +634,8 @@ def forward(self, x: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: output = self.pos_wise_ff(attn_output) - # skip connection over temporal fusion decoder (not LSTM decoder despite the LSTM output contains + # skip connection over temporal fusion decoder (not LSTM decoder + # despite the LSTM output contains # a skip from the variable selection network) output = self.pre_output_gate_norm(output, lstm_output[:, max_encoder_length:]) if self.n_targets > 1: # if to use multi-target architecture @@ -664,7 +669,7 @@ def _log_interpretation(self, out): interpretation = self.interpret_output( detach(out), reduction="sum", - attention_prediction_horizon=0, # attention only for first prediction horizon + attention_prediction_horizon=0, # attention only for first prediction horizon # noqa: E501 ) return interpretation @@ -692,7 +697,7 @@ def interpret_output( Returns: interpretations that can be plotted with ``plot_interpretation()`` - """ + """ # noqa: E501 # take attention and concatenate if a list to proper attention object batch_size = len(out["decoder_attention"]) if isinstance(out["decoder_attention"], (list, tuple)): @@ -778,7 +783,8 @@ def interpret_output( out["decoder_lengths"], min=1, max=out["decoder_variables"].size(1) ) - # mask where decoder and encoder where not applied when averaging variable selection weights + # mask where decoder and encoder where not applied + # when averaging variable selection weights encoder_variables = out["encoder_variables"].squeeze(-2).clone() encode_mask = create_mask(encoder_variables.size(1), out["encoder_lengths"]) encoder_variables = encoder_variables.masked_fill( @@ -800,7 +806,8 @@ def interpret_output( # static variables need no masking static_variables = out["static_variables"].squeeze(1) # attention is batch x time x heads x time_to_attend - # average over heads + only keep prediction attention and attention on observed timesteps + # average over heads + only keep prediction attention and + # attention on observed timesteps attention = masked_op( attention[ :, @@ -961,7 +968,8 @@ def log_interpretation(self, outputs): """ # extract interpretations interpretation = { - # use padded_stack because decoder length histogram can be of different length + # use padded_stack because decoder + # length histogram can be of different length name: padded_stack( [x["interpretation"][name].detach() for x in outputs], side="right", @@ -969,7 +977,8 @@ def log_interpretation(self, outputs): ).sum(0) for name in outputs[0]["interpretation"].keys() } - # normalize attention with length histogram squared to account for: 1. zeros in attention and + # normalize attention with length histogram squared to account for: + # 1. zeros in attention and # 2. 
higher attention due to less values attention_occurances = ( interpretation["encoder_length_histogram"][1:].flip(0).float().cumsum(0) diff --git a/pytorch_forecasting/models/temporal_fusion_transformer/sub_modules.py b/pytorch_forecasting/models/temporal_fusion_transformer/sub_modules.py index 7ad55b11d..ec636ffa5 100644 --- a/pytorch_forecasting/models/temporal_fusion_transformer/sub_modules.py +++ b/pytorch_forecasting/models/temporal_fusion_transformer/sub_modules.py @@ -285,7 +285,7 @@ def __init__( ): """ Calculate weights for ``num_inputs`` variables which are each of size ``input_size`` - """ + """ # noqa: E501 super().__init__() self.hidden_size = hidden_size diff --git a/pytorch_forecasting/models/temporal_fusion_transformer/tuning.py b/pytorch_forecasting/models/temporal_fusion_transformer/tuning.py index ddd7c94f7..2a24b215b 100644 --- a/pytorch_forecasting/models/temporal_fusion_transformer/tuning.py +++ b/pytorch_forecasting/models/temporal_fusion_transformer/tuning.py @@ -86,7 +86,7 @@ def optimize_hyperparameters( Returns: optuna.Study: optuna study results - """ + """ # noqa: E501 pkgs = _get_installed_packages() if "optuna" not in pkgs or "statsmodels" not in pkgs: @@ -125,11 +125,12 @@ class PyTorchLightningPruningCallbackAdjusted( loss = kwargs.get( "loss", QuantileLoss() - ) # need a deepcopy of loss as it will otherwise propagate from one trial to the next + ) # need a deepcopy of loss as it will otherwise propagate from one trial to the next # noqa: E501 # create objective function def objective(trial: optuna.Trial) -> float: - # Filenames for each trial must be made unique in order to access each checkpoint. + # Filenames for each trial must be made unique + # in order to access each checkpoint. checkpoint_callback = ModelCheckpoint( dirpath=os.path.join(model_path, "trial_{}".format(trial.number)), filename="{epoch}", diff --git a/pytorch_forecasting/models/tide/_tide.py b/pytorch_forecasting/models/tide/_tide.py index 101d4a4b3..2d45c74d5 100644 --- a/pytorch_forecasting/models/tide/_tide.py +++ b/pytorch_forecasting/models/tide/_tide.py @@ -98,7 +98,7 @@ def __init__( **kwargs Allows optional arguments to configure pytorch_lightning.Module, pytorch_lightning.Trainer, and pytorch-forecasting's :class:BaseModelWithCovariates.
- """ + """ # noqa: E501 if static_categoricals is None: static_categoricals = [] @@ -127,7 +127,8 @@ def __init__( if logging_metrics is None: logging_metrics = nn.ModuleList([SMAPE(), MAE(), RMSE(), MAPE(), MASE()]) - # loss and logging_metrics are ignored as they are modules and stored before calling save_hyperparameters + # loss and logging_metrics are ignored as they are modules + # and stored before calling save_hyperparameters self.save_hyperparameters(ignore=["loss", "logging_metrics"]) super().__init__(logging_metrics=logging_metrics, **kwargs) self.output_dim = len(self.target_names) @@ -207,20 +208,22 @@ def from_dataset(cls, dataset: TimeSeriesDataSet, **kwargs): Returns: TiDE - """ + """ # noqa: E501 # validate arguments assert not isinstance( dataset.target_normalizer, NaNLabelEncoder ), "only regression tasks are supported - target must not be categorical" - assert ( - dataset.min_encoder_length == dataset.max_encoder_length - ), "only fixed encoder length is allowed, but min_encoder_length != max_encoder_length" + assert dataset.min_encoder_length == dataset.max_encoder_length, ( + "only fixed encoder length is allowed," + " but min_encoder_length != max_encoder_length" + ) - assert ( - dataset.max_prediction_length == dataset.min_prediction_length - ), "only fixed prediction length is allowed, but max_prediction_length != min_prediction_length" + assert dataset.max_prediction_length == dataset.min_prediction_length, ( + "only fixed prediction length is allowed," + " but max_prediction_length != min_prediction_length" + ) assert ( dataset.randomize_length is None @@ -258,7 +261,8 @@ def forward(self, x: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: encoder_features = self.extract_features(x, self.embeddings, period="encoder") if self.encoder_covariate_size > 0: - # encoder_features = self.extract_features(x, self.embeddings, period="encoder") + # encoder_features = self.extract_features( + # x, self.embeddings, period="encoder") encoder_x_t = torch.concat( [ encoder_features[name] @@ -295,8 +299,10 @@ def forward(self, x: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: prediction = self.model(x_in) if self.output_dim > 1: # for multivariate targets - # adjust prefictions dimensions according to format required for consequent processes - # from (batch size, seq len, output_dim) to (output_dim, batch size, seq len) + # adjust prefictions dimensions according + # to format required for consequent processes + # from (batch size, seq len, output_dim) to + # (output_dim, batch size, seq len) prediction = prediction.permute(2, 0, 1) prediction = [i.clone().detach().requires_grad_(True) for i in prediction] diff --git a/pytorch_forecasting/models/tide/sub_modules.py b/pytorch_forecasting/models/tide/sub_modules.py index 75097cd97..ccc5096c5 100644 --- a/pytorch_forecasting/models/tide/sub_modules.py +++ b/pytorch_forecasting/models/tide/sub_modules.py @@ -76,7 +76,8 @@ def __init__( Parameters ---------- input_dim - The total number of input features, including the target and optional covariates. + The total number of input features, including the target + and optional covariates. output_dim The number of output features in the target. future_cov_dim @@ -96,7 +97,8 @@ def __init__( temporal_width_future The dimensionality of the embedding space for future covariates. temporal_hidden_size_future - The size of the hidden layers in the Residual Block projecting future covariates. + The size of the hidden layers in the Residual Block projecting + future covariates. 
use_layer_norm Indicates whether to apply layer normalization in the Residual Blocks. dropout @@ -105,13 +107,15 @@ def __init__( Inputs ------ x - A tuple of Tensors (x_past, x_future, x_static) where x_past represents the input/past sequence, + A tuple of Tensors (x_past, x_future, x_static) + where x_past represents the input/past sequence, and x_future represents the output/future sequence. The input dimensions are (batch_size, time_steps, components). Outputs ------- y - A Tensor with dimensions (batch_size, output_chunk_length, output_dim) representing the model's output. + A Tensor with dimensions (batch_size, output_chunk_length, output_dim) + representing the model's output. """ super().__init__() @@ -130,7 +134,8 @@ def __init__( self.temporal_width_future = temporal_width_future self.temporal_hidden_size_future = temporal_hidden_size_future or hidden_size - # future covariates handling: either feature projection, raw features, or no features + # future covariates handling: either feature projection, + # raw features, or no features self.future_cov_projection = None if future_cov_dim > 0 and self.temporal_width_future: # residual block for future covariates feature projection @@ -224,8 +229,10 @@ def forward( Parameters ---------- x_in - comes as tuple (x_past, x_future, x_static) where x_past is the input/past chunk and x_future - is the output/future chunk. Input dimensions are (batch_size, time_steps, components) + comes as tuple (x_past, x_future, x_static) + where x_past is the input/past chunk and x_future + is the output/future chunk. Input dimensions are + (batch_size, time_steps, components) Returns ------- torch.Tensor @@ -240,7 +247,8 @@ def forward( x_lookback = x[:, :, : self.output_dim] # future covariates: feature projection or raw features - # historical future covariates need to be extracted from x and stacked with part of future covariates + # historical future covariates need to be extracted from x and + # stacked with part of future covariates if self.future_cov_dim > 0: x_dynamic_future_covariates = torch.cat( [ @@ -292,8 +300,8 @@ def forward( temporal_decoded = self.temporal_decoder(temporal_decoder_input) # pass x_lookback through self.lookback_skip but swap the last two dimensions - # this is needed because the skip connection is applied across the input time steps - # and not across the output time steps + # this is needed because the skip connection is applied across + # the input time steps and not across the output time steps skip = self.lookback_skip(x_lookback.transpose(1, 2)).transpose(1, 2) # add skip connection diff --git a/pytorch_forecasting/utils/_dependencies.py b/pytorch_forecasting/utils/_dependencies.py index e8db215b4..22cc908ec 100644 --- a/pytorch_forecasting/utils/_dependencies.py +++ b/pytorch_forecasting/utils/_dependencies.py @@ -58,7 +58,10 @@ def _check_matplotlib(ref="This feature", raise_error=True): if raise_error and "matplotlib" not in pkgs: raise ImportError( - f"{ref} requires matplotlib. Please install matplotlib with `pip install matplotlib`." + ( + f"{ref} requires matplotlib." + " Please install matplotlib with `pip install matplotlib`."
+ ) ) return "matplotlib" in pkgs diff --git a/pytorch_forecasting/utils/_utils.py b/pytorch_forecasting/utils/_utils.py index 27044372d..af93006cf 100644 --- a/pytorch_forecasting/utils/_utils.py +++ b/pytorch_forecasting/utils/_utils.py @@ -59,7 +59,8 @@ def groupby_apply( return_histogram: if to return histogram on top Returns: - tensor of size ``bins`` with aggregated values and optionally with counts of values + tensor of size ``bins`` with aggregated values + and optionally with counts of values """ if reduction == "mean": reduce = torch.mean @@ -94,7 +95,7 @@ def profile( profile_fname (str): path where to save profile (`.txt` file will be saved with line profile) filter (str, optional): filter name (e.g. module name) to filter profile. Defaults to "". period (float, optional): frequency of calling profiler in seconds. Defaults to 0.0001. - """ + """ # noqa: E501 import vmprof from vmprof.show import LinesPrinter @@ -243,7 +244,7 @@ def unpack_sequence( Returns: Tuple[torch.Tensor, torch.Tensor]: tuple of unpacked sequence and length of samples - """ + """ # noqa: E501 if isinstance(sequence, rnn.PackedSequence): sequence, lengths = rnn.pad_packed_sequence(sequence, batch_first=True) # batch sizes reside on the CPU by default -> we need to bring them to GPU @@ -267,7 +268,7 @@ def concat_sequences( Returns: Union[torch.Tensor, rnn.PackedSequence]: concatenated sequence - """ + """ # noqa: E501 if isinstance(sequences[0], rnn.PackedSequence): return rnn.pack_sequence(sequences, enforce_sorted=False) elif isinstance(sequences[0], torch.Tensor): @@ -298,7 +299,7 @@ def padded_stack( Returns: torch.Tensor: stacked tensor - """ + """ # noqa: E501 full_size = max([x.size(-1) for x in tensors]) def make_padding(pad): @@ -370,8 +371,8 @@ def apply_to_list(obj: Union[List[Any], Any], func: Callable) -> Union[List[Any] func (Callable): function to apply Returns: - Union[List[Any], Any]: list of objects or object depending on function output and - if input ``obj`` is of type list/tuple + Union[List[Any], Any]: list of objects or object depending on function output + and if input ``obj`` is of type list/tuple """ if isinstance(obj, (list, tuple)) and not isinstance(obj, rnn.PackedSequence): return [func(o) for o in obj] @@ -412,7 +413,7 @@ def iget(self, idx: Union[int, slice]): class TupleOutputMixIn: - """MixIn to give output a namedtuple-like access capabilities with ``to_network_output() function``.""" + """MixIn to give output a namedtuple-like access capabilities with ``to_network_output() function``.""" # noqa: E501 def to_network_output(self, **results): """ @@ -459,7 +460,7 @@ def move_to_device( Returns: x on targeted device - """ + """ # noqa: E501 if isinstance(device, str): if device == "mps": if hasattr(torch.backends, device): @@ -531,7 +532,7 @@ def masked_op( Returns: torch.Tensor: tensor with averaged out dimension - """ + """ # noqa: E501 if mask is None: mask = ~torch.isnan(tensor) masked = tensor.masked_fill(~mask, 0.0) @@ -560,7 +561,7 @@ def repr_class( Returns: str - """ + """ # noqa: E501 if extra_attributes is None: extra_attributes = {} # get attributes diff --git a/tests/test_data/test_encoders.py b/tests/test_data/test_encoders.py index b88204113..3a1d6e23a 100644 --- a/tests/test_data/test_encoders.py +++ b/tests/test_data/test_encoders.py @@ -176,7 +176,7 @@ def test_TorchNormalizer_dtype_consistency(): """ - Ensures that even for float64 `target_scale`, the transformation will not change the prediction dtype.
- Ensure that target_scale will be of type float32 if method is 'identity' - """ + """ # noqa: E501 parameters = torch.tensor([[[366.4587]]]) target_scale = torch.tensor([[427875.7500, 80367.4766]], dtype=torch.float64) assert ( diff --git a/tests/test_data/test_timeseries.py b/tests/test_data/test_timeseries.py index ff9d3e25d..faa5ca424 100644 --- a/tests/test_data/test_timeseries.py +++ b/tests/test_data/test_timeseries.py @@ -248,7 +248,8 @@ def test_from_dataset_equivalence(test_data): test_data[lambda x: x.time_idx > x.time_idx.min() + 2], predict=True, ) - # ensure validation1 and validation2 datasets are exactly the same despite different data inputs + # ensure validation1 and validation2 datasets are exactly + # the same despite different data inputs for v1, v2 in zip( iter(validation1.to_dataloader(train=False)), iter(validation2.to_dataloader(train=False)), @@ -564,7 +565,7 @@ def distance_to_weights(dist): != indices["sequence_id"].iloc[sub_group_idx] ] # filter duplicate timeseries - # indices = indices.sort_values("sequence_length").drop_duplicates("sequence_id", keep="last") + # indices = indices.sort_values("sequence_length").drop_duplicates("sequence_id", keep="last") # noqa: E501 # calculate distances for corresponding groups group_distances = torch.cdist( diff --git a/tests/test_models/test_deepar.py b/tests/test_models/test_deepar.py index 2204bb59b..6b3e1f0cc 100644 --- a/tests/test_models/test_deepar.py +++ b/tests/test_models/test_deepar.py @@ -234,4 +234,4 @@ def test_pickle(dataloaders_with_covariates, loss): loss=loss, ) pkl = pickle.dumps(model) - pickle.loads(pkl) + pickle.loads(pkl) # noqa: S301 diff --git a/tests/test_models/test_mlp.py b/tests/test_models/test_mlp.py index 71f62632d..54e4a6191 100644 --- a/tests/test_models/test_mlp.py +++ b/tests/test_models/test_mlp.py @@ -141,4 +141,4 @@ def model(dataloaders_with_covariates): def test_pickle(model): pkl = pickle.dumps(model) - pickle.loads(pkl) + pickle.loads(pkl) # noqa: S301 diff --git a/tests/test_models/test_nbeats.py b/tests/test_models/test_nbeats.py index 604891394..c3379fbf1 100644 --- a/tests/test_models/test_nbeats.py +++ b/tests/test_models/test_nbeats.py @@ -86,7 +86,7 @@ def model(dataloaders_fixed_window_without_covariates): def test_pickle(model): pkl = pickle.dumps(model) - pickle.loads(pkl) + pickle.loads(pkl) # noqa: S301 @pytest.mark.skipif( diff --git a/tests/test_models/test_nhits.py b/tests/test_models/test_nhits.py index 959026f51..a79e7a93f 100644 --- a/tests/test_models/test_nhits.py +++ b/tests/test_models/test_nhits.py @@ -59,7 +59,8 @@ def _integration(dataloader, tmp_path, trainer_kwargs=None, **kwargs): train_dataloaders=train_dataloader, val_dataloaders=val_dataloader, ) - # todo: testing somehow disables grad computation even though it is explicitly turned on + # todo: testing somehow disables grad computation even though + # it is explicitly turned on # loss is calculated as "grad" for MQF2 if not isinstance(net.loss, MQF2DistributionLoss): test_outputs = trainer.test(net, dataloaders=test_dataloader) @@ -153,7 +154,7 @@ def model(dataloaders_with_covariates): def test_pickle(model): pkl = pickle.dumps(model) - pickle.loads(pkl) + pickle.loads(pkl) # noqa: S301 @pytest.mark.skipif( diff --git a/tests/test_models/test_rnn_model.py b/tests/test_models/test_rnn_model.py index bae7fee20..69a9b558e 100644 --- a/tests/test_models/test_rnn_model.py +++ b/tests/test_models/test_rnn_model.py @@ -145,4 +145,4 @@ def model(dataloaders_with_covariates): def test_pickle(model): pkl =
pickle.dumps(model) - pickle.loads(pkl) + pickle.loads(pkl) # noqa: S301 diff --git a/tests/test_models/test_temporal_fusion_transformer.py b/tests/test_models/test_temporal_fusion_transformer.py index ae250d6d8..ea1468f8a 100644 --- a/tests/test_models/test_temporal_fusion_transformer.py +++ b/tests/test_models/test_temporal_fusion_transformer.py @@ -159,7 +159,7 @@ def _integration(dataloader, tmp_path, loss=None, trainer_kwargs=None, **kwargs) if isinstance(normalizer, NaNLabelEncoder) else QuantileLoss() ) - for normalizer in train_dataloader.dataset.target_normalizer.normalizers + for normalizer in train_dataloader.dataset.target_normalizer.normalizers # noqa: E501 ] ) else: @@ -184,7 +184,8 @@ def _integration(dataloader, tmp_path, loss=None, trainer_kwargs=None, **kwargs) train_dataloaders=train_dataloader, val_dataloaders=val_dataloader, ) - # todo: testing somehow disables grad computation even though it is explicitly turned on - + # todo: testing somehow disables grad computation + # even though it is explicitly turned on - # loss is calculated as "grad" for MQF2 if not isinstance(net.loss, MQF2DistributionLoss): test_outputs = trainer.test(net, dataloaders=test_dataloader) @@ -312,7 +313,7 @@ def test_distribution(dataloaders_with_covariates, tmp_path, strategy): def test_pickle(model): pkl = pickle.dumps(model) - pickle.loads(pkl) + pickle.loads(pkl) # noqa: S301 @pytest.mark.parametrize(