Skip to content

Commit 8c3e0de

Browse files
committed — commit message: "updates"
1 parent ad58a55 commit 8c3e0de

File tree

4 files changed

+14
-6
lines changed

4 files changed

+14
-6
lines changed

pytorch_lightning/callbacks/batch_size_finder.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -110,11 +110,6 @@ def scale_batch_size(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule
110110
f"Field {self.batch_arg_name} not found in both `model` and `model.hparams`"
111111
)
112112

113-
if not lightning_hasattr(pl_module, self.batch_arg_name):
114-
raise MisconfigurationException(
115-
f"Field {self.batch_arg_name} not found in both `model` and `model.hparams`"
116-
)
117-
118113
if (
119114
hasattr(pl_module, self.batch_arg_name)
120115
and hasattr(pl_module, "hparams")
@@ -126,6 +121,7 @@ def scale_batch_size(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule
126121
" If this is not the intended behavior, please remove either one."
127122
)
128123

124+
# TODO: check if this can be enabled (#4040)
129125
if not trainer._data_connector._train_dataloader_source.is_module():
130126
raise MisconfigurationException(
131127
"The batch scaling feature cannot be used with dataloaders passed directly to `.fit()`."
@@ -329,6 +325,7 @@ def _reset_params(self, trainer: "pl.Trainer") -> None:
329325
trainer.limit_predict_batches = self.steps_per_trial
330326

331327
def _restore_params(self, trainer: "pl.Trainer") -> None:
328+
# TODO: There are more states that needs to be reset (#4512 and #4870)
332329
from pytorch_lightning.trainer.states import TrainerFn
333330

334331
trainer.logger = self._dumped_params["logger"]
@@ -350,6 +347,7 @@ def _restore_params(self, trainer: "pl.Trainer") -> None:
350347
trainer.limit_predict_batches = self._dumped_params["limit_eval_batches"]
351348

352349
loop.load_state_dict(deepcopy(self._dumped_params["loop_state_dict"]))
350+
loop.restarting = False
353351
if "loop_verbose" in self._dumped_params:
354352
loop.verbose = self._dumped_params["loop_verbose"]
355353

pytorch_lightning/loops/base.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323
from pytorch_lightning.trainer.progress import BaseProgress
2424
from pytorch_lightning.utilities.enums import _FaultTolerantMode
2525
from pytorch_lightning.utilities.exceptions import MisconfigurationException
26-
from pytorch_lightning.utilities.imports import _fault_tolerant_training
2726

2827
T = TypeVar("T") # the output type of `run`
2928

pytorch_lightning/trainer/connectors/checkpoint_connector.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,12 +225,16 @@ def restore_loops(self) -> None:
225225
if state_dict is not None and self.trainer.state.fn != TrainerFn.TUNING:
226226
if self.trainer.state.fn == TrainerFn.FITTING:
227227
self.trainer.fit_loop.load_state_dict(state_dict["fit_loop"])
228+
self.trainer.fit_loop.restarting = True
228229
elif self.trainer.state.fn == TrainerFn.VALIDATING:
229230
self.trainer.validate_loop.load_state_dict(state_dict["validate_loop"])
231+
self.trainer.validate_loop.restarting = True
230232
elif self.trainer.state.fn == TrainerFn.TESTING:
231233
self.trainer.test_loop.load_state_dict(state_dict["test_loop"])
234+
self.trainer.test_loop.restarting = True
232235
elif self.trainer.state.fn == TrainerFn.PREDICTING:
233236
self.trainer.predict_loop.load_state_dict(state_dict["predict_loop"])
237+
self.trainer.predict_loop.restarting = True
234238

235239
if self.trainer.state.fn != TrainerFn.FITTING:
236240
return

tests/loops/test_loops.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -283,6 +283,7 @@ def on_load_checkpoint(self, state_dict: Dict) -> None:
283283
state_dict["loop_child.state_dict"]["a"] = 3
284284
# check restarting after `load_state_dict`
285285
loop_parent.load_state_dict(state_dict)
286+
loop_parent.restarting = True
286287
assert loop_parent.restarting
287288

288289
loop_parent.run()
@@ -306,6 +307,7 @@ def on_load_checkpoint(self, state_dict: Dict) -> None:
306307
loop_child = Simple(2)
307308
loop_parent.loop_child = loop_child
308309
loop_parent.load_state_dict(state_dict)
310+
loop_parent.restarting = True
309311
assert loop_parent.progress.increment == 1
310312
assert loop_parent.loop_child.progress.increment == 1
311313

@@ -359,6 +361,7 @@ def val_dataloader(self):
359361
assert checkpoint["epoch_loop.val_loop.dataloader_progress"] == expected
360362

361363
trainer.fit_loop.load_state_dict(checkpoint)
364+
trainer.fit_loop.restarting = True
362365

363366
# `nbe_`: non-breaking epoch, as in, no exception will be raised. `be_`: breaking epoch
364367
# the fit-validation total batch progress is reset per epoch so it's not counted for the total value.
@@ -548,6 +551,7 @@ def configure_optimizers_multiple(self):
548551
assert checkpoint["loops"]["fit_loop"] == expected
549552

550553
trainer.fit_loop.load_state_dict(checkpoint["loops"]["fit_loop"])
554+
trainer.fit_loop.restarting = True
551555
state_dict = trainer.fit_loop.state_dict()
552556

553557
# need to remove these elements for comparison; comparing with `fit_loop.state_dict()` would require the
@@ -557,6 +561,7 @@ def configure_optimizers_multiple(self):
557561
assert state_dict == checkpoint["loops"]["fit_loop"]
558562

559563
trainer.fit_loop.load_state_dict(checkpoint["loops"]["fit_loop"])
564+
trainer.fit_loop.restarting = True
560565
# test resetting manually, we expect all `ready` counters to be reset to `completed`
561566
trainer.fit_loop.reset()
562567
trainer.fit_loop.epoch_loop.reset()
@@ -753,6 +758,7 @@ def test_fit_loop_reset(tmpdir):
753758

754759
# we load exactly what was saved - no reset yet
755760
fit_loop.load_state_dict(mid_epoch_ckpt["loops"]["fit_loop"])
761+
fit_loop.restarting = True
756762
# resetting from a mid-of-epoch checkpoint SHOULD NOT reset the current counters to 0
757763
fit_loop.reset()
758764
epoch_loop.reset()
@@ -785,6 +791,7 @@ def test_fit_loop_reset(tmpdir):
785791

786792
# we load exactly what was saved - no reset yet
787793
fit_loop.load_state_dict(end_of_epoch_ckpt["loops"]["fit_loop"])
794+
fit_loop.restarting = True
788795
# resetting from a end-of-epoch checkpoint SHOULD reset the current counters to 0
789796
fit_loop.reset()
790797
epoch_loop.reset()

0 commit comments

Comments (0)