Commit 32cfe96

SkafteNicki authored and awaelchli committed
Fix logging with log_gpu_memory='min_max' (#9013)
1 parent 6c0b504 commit 32cfe96
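
For context, `log_gpu_memory` is the Trainer argument this commit fixes: `"all"` logs one memory metric per device, while `"min_max"` logs only the smallest and largest usage across devices. A minimal sketch of the option that was broken, assuming the 1.4-era Trainer API shown in the diffs below:

```python
from pytorch_lightning import Trainer

# "min_max" reduces per-GPU memory stats to two metrics,
# logged as `min_gpu_mem` / `max_gpu_mem` (see the test below).
trainer = Trainer(gpus=1, log_gpu_memory="min_max")
```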

File tree

3 files changed: +18 -7 lines changed


CHANGELOG.md

Lines changed: 2 additions & 0 deletions
```diff
@@ -37,6 +37,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed a bug in the binary search mode of auto batch size scaling where exception was thrown if the first trainer run resulted in OOM ([#8954](https://github.com/PyTorchLightning/pytorch-lightning/pull/8954))
 
 
+- Fixed a bug causing logging with `log_gpu_memory='min_max'` not working ([#9013](https://github.com/PyTorchLightning/pytorch-lightning/pull/9013))
+
 ## [1.4.0] - 2021-07-27
 
 ### Added
```

pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py

Lines changed: 8 additions & 3 deletions
```diff
@@ -226,9 +226,14 @@ def update_train_epoch_metrics(self) -> None:
 
     def _log_gpus_metrics(self):
         for key, mem in self.gpus_metrics.items():
-            gpu_id = int(key.split("/")[0].split(":")[1])
-            if gpu_id in self.trainer.accelerator_connector.parallel_device_ids:
-                self.trainer.lightning_module.log(key, mem, prog_bar=False, logger=True, on_step=True, on_epoch=False)
+            if self.log_gpu_memory == "min_max":
+                self.trainer.lightning_module.log(key, mem, prog_bar=False, logger=True)
+            else:
+                gpu_id = int(key.split("/")[0].split(":")[1])
+                if gpu_id in self.trainer.accelerator_connector.parallel_device_ids:
+                    self.trainer.lightning_module.log(
+                        key, mem, prog_bar=False, logger=True, on_step=True, on_epoch=False
+                    )
 
     """
     Utilities and properties
```
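
The new branch matters because `min_max` metrics are keyed `min_gpu_mem`/`max_gpu_mem` (see the test below) rather than `gpu_id: <n>/...`, so the old per-device parsing had nothing to split on. A standalone sketch of that failure mode (my reading of the removed lines, not Lightning code):

```python
# "all" mode key: the device id parses out cleanly.
key = "gpu_id: 1/memory.used (MB)"
print(int(key.split("/")[0].split(":")[1]))  # -> 1

# "min_max" mode key: splitting on ":" yields a single element,
# so indexing [1] raises IndexError and the metric was never logged.
key = "min_gpu_mem"
key.split("/")[0].split(":")[1]  # IndexError: list index out of range
```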

tests/trainer/logging_/test_train_loop_logging.py

Lines changed: 8 additions & 4 deletions
```diff
@@ -646,18 +646,22 @@ def training_step(self, batch, batch_idx):
 
 
 @RunIf(min_gpus=2)
-def test_log_gpu_memory_without_logging_on_step(tmpdir):
+@pytest.mark.parametrize("log_gpu_memory", ["all", "min_max"])
+def test_log_gpu_memory_without_logging_on_step(tmpdir, log_gpu_memory):
 
     model = BoringModel()
     trainer = Trainer(
         default_root_dir=tmpdir,
         max_epochs=1,
         limit_train_batches=1,
         limit_val_batches=0,
-        log_gpu_memory="all",
+        log_gpu_memory=log_gpu_memory,
         log_every_n_steps=1,
         gpus=[1],
     )
     trainer.fit(model)
-
-    assert "gpu_id: 1/memory.used (MB)" in trainer.logged_metrics
+    if log_gpu_memory == "min_max":
+        assert "min_gpu_mem" in trainer.logged_metrics
+        assert "max_gpu_mem" in trainer.logged_metrics
+    else:
+        assert "gpu_id: 1/memory.used (MB)" in trainer.logged_metrics
```
