[bugfix] Resolve memory not logged when missing metrics #8174
Changes from all commits: 4549007, 3fc2d94, ead9f88, 65d4af8, c6e40e8, 319ad41
@@ -38,6 +38,7 @@ def __init__(self, trainer: 'pl.Trainer', log_gpu_memory: Optional[str] = None)
         self._progress_bar_metrics: Dict[str, float] = {}
         self._logged_metrics: Dict[str, _METRIC] = {}
         self._callback_metrics: Dict[str, _METRIC] = {}
+        self._gpus_metrics: Dict[str, str] = {}
         self._epoch_end_reached = False
         self._current_fx: Optional[str] = None
         self._batch_idx: Optional[int] = None
@@ -94,11 +95,6 @@ def log_metrics(self, metrics: Dict[str, _METRIC], step: Optional[int] = None) -
         if self.trainer.logger is None or not metrics:
             return

-        # add gpu memory
-        if self.trainer._device_type == DeviceType.GPU and self.log_gpu_memory:
-            mem_map = memory.get_memory_profile(self.log_gpu_memory)
-            metrics.update(mem_map)
-
         # turn all tensors to scalars
         scalar_metrics = metrics_to_scalars(metrics)
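For context on the bug being fixed, here is a minimal, hypothetical sketch (not the actual Lightning code) of why appending GPU memory inside `log_metrics` could drop it: the early return fires whenever a step produces no user metrics, before GPU memory is ever merged in.

```python
# Hypothetical stand-in for the old vs. new control flow; names and signatures
# are illustrative only and do not appear in the real connector.

def log_metrics_pre_fix(metrics: dict, logger, gpu_mem: dict) -> None:
    if logger is None or not metrics:
        return  # a step with no user metrics never reaches the GPU-memory update
    metrics.update(gpu_mem)  # GPU memory only rode along with user metrics
    logger.log_metrics(metrics)

def log_metrics_post_fix(metrics: dict, logger, gpu_mem: dict) -> None:
    # After this PR, GPU memory is logged on its own path (_log_gpus_metrics),
    # so log_metrics no longer needs to touch it at all.
    if logger is None or not metrics:
        return
    logger.log_metrics(metrics)
```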
@@ -213,6 +209,8 @@ def update_train_step_metrics(self) -> None:
         if self.trainer.fit_loop.should_accumulate() and self.trainer.lightning_module.automatic_optimization:
             return

+        self._log_gpus_metrics()
+
         # when metrics should be logged
         assert not self._epoch_end_reached
         if self.should_update_logs or self.trainer.fast_dev_run:
@@ -226,6 +224,12 @@ def update_train_epoch_metrics(self) -> None:
         # reset result collection for next epoch
         self.trainer._results.reset(metrics=True)

+    def _log_gpus_metrics(self):
+        for key, mem in self.gpus_metrics.items():
+            gpu_id = int(key.split('/')[0].split(':')[1])
+            if gpu_id in self.trainer.accelerator_connector.parallel_device_ids:
+                self.trainer.lightning_module.log(key, mem, prog_bar=False, logger=True, on_step=True, on_epoch=False)
+
     """
     Utilities and properties
     """

Reviewer comment on the new `_log_gpus_metrics` method: since we're already in the trainer, why do we have to log through the lightning module's …
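To make the key parsing in `_log_gpus_metrics` concrete, here is a small sketch; the key shape `'gpu_id: <n>/...'` is an assumption inferred from the split logic above, not something shown in this diff.

```python
# Assumed key shape (e.g. what memory.get_memory_profile might produce);
# the exact strings are an assumption for illustration only.
sample_gpus_metrics = {
    "gpu_id: 0/memory.used (MB)": "1024",
    "gpu_id: 1/memory.used (MB)": "2048",
}

for key, mem in sample_gpus_metrics.items():
    # "gpu_id: 0/memory.used (MB)" -> "gpu_id: 0" -> " 0" -> 0
    gpu_id = int(key.split("/")[0].split(":")[1])
    print(gpu_id, mem)  # logged only if gpu_id is among the trainer's parallel device ids
```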
@@ -276,6 +280,13 @@ def metrics(self) -> Dict[MetricSource, Dict[str, _METRIC]]:
         on_step = not self._epoch_end_reached
         return self.trainer._results.metrics(on_step)

+    @property
+    def gpus_metrics(self) -> Dict[str, str]:
+        if self.trainer._device_type == DeviceType.GPU and self.log_gpu_memory:
+            mem_map = memory.get_memory_profile(self.log_gpu_memory)
+            self._gpus_metrics.update(mem_map)
+        return self._gpus_metrics
+
     @property
     def callback_metrics(self) -> Dict[str, _METRIC]:
         if self.trainer._results:

Reviewer comment on lines +283 to +289: Why did this PR need to add … This means that GPU metrics are now duplicated in this dictionary and in logged metrics. Also, it only gets filled when …
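For reference, a usage sketch of the path this property serves; the `log_gpu_memory` Trainer flag and the `'all'` value are assumptions about the surrounding API, taken from the option name rather than from this diff.

```python
# Usage sketch (assumed API surface; requires a CUDA-capable machine):
# with log_gpu_memory enabled and GPU devices, gpus_metrics refreshes itself
# from memory.get_memory_profile(...) and _log_gpus_metrics() logs it on each
# training step via update_train_step_metrics().
import pytorch_lightning as pl

trainer = pl.Trainer(gpus=1, log_gpu_memory="all")
# trainer.fit(model)  # during fit, GPU memory now gets logged even when a
#                     # step produces no other metrics (the original bug).
```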