
Commit c4492ad

Merge pull request #8174 from PyTorchLightning/bugfix/8159_log_gpu_memory_on_step
[bugfix] Resolve memory not logged when missing metrics
1 parent 2a372e3 commit c4492ad

File tree (3 files changed: +38 −5 lines)

- CHANGELOG.md
- pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
- tests/trainer/logging_/test_train_loop_logging.py


CHANGELOG.md

Lines changed: 4 additions & 0 deletions
@@ -317,6 +317,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Fixed a bug where an infinite recursion would be triggered when using the `BaseFinetuning` callback on a model that contains a `ModuleDict` ([#8170](https://github.com/PyTorchLightning/pytorch-lightning/pull/8170))
 
+
+- Fixed `log_gpu_memory` metrics not being added to `logging` when nothing else is logged ([#8174](https://github.com/PyTorchLightning/pytorch-lightning/pull/8174))
+
+
 ## [1.3.7] - 2021-06-22
 
 - Fixed a bug where skipping an optimizer while using amp causes amp to trigger an assertion error ([#7975](https://github.com/PyTorchLightning/pytorch-lightning/pull/7975))

pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py

Lines changed: 16 additions & 5 deletions
@@ -38,6 +38,7 @@ def __init__(self, trainer: 'pl.Trainer', log_gpu_memory: Optional[str] = None)
         self._progress_bar_metrics: Dict[str, float] = {}
         self._logged_metrics: Dict[str, _METRIC] = {}
         self._callback_metrics: Dict[str, _METRIC] = {}
+        self._gpus_metrics: Dict[str, str] = {}
         self._epoch_end_reached = False
         self._current_fx: Optional[str] = None
         self._batch_idx: Optional[int] = None

@@ -94,11 +95,6 @@ def log_metrics(self, metrics: Dict[str, _METRIC], step: Optional[int] = None) -
         if self.trainer.logger is None or not metrics:
             return
 
-        # add gpu memory
-        if self.trainer._device_type == DeviceType.GPU and self.log_gpu_memory:
-            mem_map = memory.get_memory_profile(self.log_gpu_memory)
-            metrics.update(mem_map)
-
         # turn all tensors to scalars
         scalar_metrics = metrics_to_scalars(metrics)
 

@@ -213,6 +209,8 @@ def update_train_step_metrics(self) -> None:
         if self.trainer.fit_loop.should_accumulate() and self.trainer.lightning_module.automatic_optimization:
             return
 
+        self._log_gpus_metrics()
+
         # when metrics should be logged
         assert not self._epoch_end_reached
         if self.should_update_logs or self.trainer.fast_dev_run:

@@ -226,6 +224,12 @@ def update_train_epoch_metrics(self) -> None:
         # reset result collection for next epoch
         self.trainer._results.reset(metrics=True)
 
+    def _log_gpus_metrics(self):
+        for key, mem in self.gpus_metrics.items():
+            gpu_id = int(key.split('/')[0].split(':')[1])
+            if gpu_id in self.trainer.accelerator_connector.parallel_device_ids:
+                self.trainer.lightning_module.log(key, mem, prog_bar=False, logger=True, on_step=True, on_epoch=False)
+
     """
     Utilities and properties
     """

@@ -276,6 +280,13 @@ def metrics(self) -> Dict[MetricSource, Dict[str, _METRIC]]:
         on_step = not self._epoch_end_reached
         return self.trainer._results.metrics(on_step)
 
+    @property
+    def gpus_metrics(self) -> Dict[str, str]:
+        if self.trainer._device_type == DeviceType.GPU and self.log_gpu_memory:
+            mem_map = memory.get_memory_profile(self.log_gpu_memory)
+            self._gpus_metrics.update(mem_map)
+        return self._gpus_metrics
+
     @property
     def callback_metrics(self) -> Dict[str, _METRIC]:
         if self.trainer._results:
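The removed block shows why the bug occurred: GPU memory was only appended to `metrics` after the early `return` in `log_metrics`, so on a step where nothing else was logged the method returned before the memory map was ever added. The new `gpus_metrics` property and `_log_gpus_metrics` helper route the stats through `update_train_step_metrics` instead. Below is a minimal standalone sketch, not part of the patch, of how the helper reduces a memory key to the integer GPU index it checks against `parallel_device_ids`; the sample key format follows the one asserted in the new test.

# Hypothetical stand-alone illustration of the key parsing in _log_gpus_metrics;
# the key format mirrors the assertion in the new test below.
def parse_gpu_id(key: str) -> int:
    # 'gpu_id: 1/memory.used (MB)' -> 'gpu_id: 1' -> ' 1' -> 1
    return int(key.split('/')[0].split(':')[1])

assert parse_gpu_id('gpu_id: 1/memory.used (MB)') == 1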

tests/trainer/logging_/test_train_loop_logging.py

Lines changed: 18 additions & 0 deletions
@@ -712,3 +712,21 @@ def training_step(self, *args):
     model = TestModel()
     with pytest.raises(MisconfigurationException, match=r'reduce_fx={min,max,mean,sum}\)` are currently supported'):
         trainer.fit(model)
+
+
+@RunIf(min_gpus=2)
+def test_log_gpu_memory_without_logging_on_step(tmpdir):
+
+    model = BoringModel()
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        max_epochs=1,
+        limit_train_batches=1,
+        limit_val_batches=0,
+        log_gpu_memory='all',
+        log_every_n_steps=1,
+        gpus=[1]
+    )
+    trainer.fit(model)
+
+    assert 'gpu_id: 1/memory.used (MB)' in trainer.logged_metrics
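For context, a hedged usage sketch mirroring the new test: with this fix, enabling `log_gpu_memory` on the Trainer logs per-GPU memory stats on every training step even when the LightningModule logs nothing itself. The `BoringModel` import path is an assumption based on Lightning's own test helpers; adjust it to your environment.

# Usage sketch (assumptions noted in comments); requires at least one visible CUDA GPU.
from pytorch_lightning import Trainer
from tests.helpers import BoringModel  # import path assumed from PL's test suite

trainer = Trainer(
    max_epochs=1,
    limit_train_batches=1,
    limit_val_batches=0,
    log_gpu_memory='all',   # log memory stats for all visible GPUs
    log_every_n_steps=1,
    gpus=1,
)
trainer.fit(BoringModel())
# After fitting, per-GPU memory appears in trainer.logged_metrics,
# e.g. under a key like 'gpu_id: 0/memory.used (MB)'.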
